diff --git a/checkpoint-5700/config.json b/checkpoint-5700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..61287e82fc75eb390f7e1b53237e3ac30a059076 --- /dev/null +++ b/checkpoint-5700/config.json @@ -0,0 +1,253 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700", + "architectures": [ + "LlavaLlamaModel" + ], + "drop_path_rate": 0.0, + "hidden_size": 2560, + "image_aspect_ratio": "resize", + "interpolate_mode": "linear", + "llm_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/llm", + "add_cross_attention": false, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2560, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6912, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 4096, + "min_length": 0, + "model_max_length": 4096, + "model_type": "llama", + "no_repeat_ngram_size": 0, + "num_attention_heads": 20, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": false, + "vocab_size": 32000 + }, + "mm_hidden_size": 1152, + "mm_projector_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/mm_projector", + "add_cross_attention": false, + "architectures": [ + "MultimodalProjector" + ], + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "mm_projector_lr": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "cls_patch", + "mm_vision_select_layer": -2, + "model_dtype": "torch.bfloat16", + "model_type": "llava_llama", + "num_video_frames": 8, + "resume_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/checkpoint-4400", + "s2": false, + "s2_max_split_size": 336, + "s2_scales": "336,672,1008", + "transformers_version": "4.36.2", + "tune_language_model": true, + "tune_mm_projector": true, + "tune_vision_tower": true, + "vision_resolution": -1, + "vision_tower_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/vision_tower", + "add_cross_attention": false, + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 384, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + } +} diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfefff93f913fccdc03d45b2a1956478301d11bf --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9311d2617bd70f02b2e58a96896628b009bdfc18dc5843832fc6ade1f8f94e52 +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8148f59c8c9747fca2b3d388e448e66bc0ddeb6d --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880004869a68e6740002d4e11ebc2b8fd03df6b0a3520a9d4b867069698842ef +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..892699d5381e88db553a32d042927d93276e2bd2 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fea6ea669d5c3d871b03b591d62c98f2d30dade650342af0647aa638737c43 +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ecfdc1e0f94278aaaf93628ded6e62c6f35dc1f --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48014609f428c0ab27e926f64af2d1097ea422b97415b55f92b0acdf1403f8d1 +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74b663ea8b9c43a130d8c3f53e8a261f968dcf92 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee53618476995c0d313688682d84dd2c129bbd46be6ff6d1697c145fc26d13fc +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44b92bf9a1b2baf75c0b39e37a2308b737a9bca6 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33d366a065d835c43860172b3b1375059590b684da2834a7262a6d4169b307dd +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..041edbc3f169ce01ad58648a9893303887bfe9b2 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1386f17c460c0e5a73c2f3598111f791dac979d19835723538c00c05315c23bb +size 2361117185 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54300ab7486449c8fca44726208983b526124ba8 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d039f7c549dde93d147fb7d176c6027ce3c9aa1900f5d3532eab044aecc5da93 +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..282392702611e5867ab80f2e4507117b005eb62e --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2ff0141298fbf4e50a71095b2aa1b59f156e24ea0b0a02ff74da71ccbb0f6bd +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b428c98da043f33e8905427ecc4360699e587c14 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8817fdffd6a436cbb8b9c38bd27d7c3d77ff2e040bf3e9c7a10d45a692e4f8e +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f851047f506a1dcce06ae73cdbbf6d33b4fa2dd --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8578ca8d6e7376c21c7cc544b23fab9d1a5a6fef06b97f26eee66f89e81753 +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c032484b8ebd8b51cccd3bdc8b78d23c7334957 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770206d20ec6de01f37bebd3b7798a1f3176f1f7533e4aeeff3f2bc8b330fb5b +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eab638434162be202c360ab88e2366c7daea5d5e --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d498f78398338cb1f91667091a29a438662ffc3116c974478613497813cd7d9a +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60c0485fc03949437525a22846c27de2e225bb33 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f35d2cd657cadbf5ff7292804cfae9b29fc39840047ffd8d6f1f3a58de9a5e +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c8f4ec165c0c73a79f317be99f9e91fd9797617 --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2436a7ffb744415218a1389779471378b17f859c5fc25f128091493412365f2 +size 2361117175 diff --git a/checkpoint-5700/global_step5700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b61ec3b1dc59363249fa3fc74be1ab928babfc5b --- /dev/null +++ b/checkpoint-5700/global_step5700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a0e5b47137acf84340755763fb1c3448547049e9e49acd8588b248b376552c +size 2361117175 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca5658b2dfe60f9c8d1938d2524a0c388a422032 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a00cadb0b5ebca5817a41047588fd67ab95af5b58f7588ba236ca493771056 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_10_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_10_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a0afb856285c9894e6ceca59ab1d1b0abcb1777 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_10_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b8a70d42e46afc03605960f1e600adb98d0a0ed33d4fb1739db99c2d6a5d2d +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_11_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_11_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3625f49fe7c83afc64107ca1108604ca972bd548 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_11_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0466ef22c00a053f70e11da100db232ac0ea3f09300c92b33104ea261eebaa98 +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_12_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_12_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de63e11932e48005754ccc4470e0776b6769ff8f --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_12_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242707438d8c74fab1720852bda659feeb5b909563fde6cb8d445a08ed5eb638 +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_13_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_13_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f48436ae7d976257a0c548d6fc2c709df7d949b4 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_13_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8597dea4e03957b3a46f7354bb1928d7edf8e07387e652133edd06048d5c80fe +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_14_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_14_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee721656e565c4fb2359198740cf932fac8d7e1c --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_14_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b6a1b44831c883e6e9f3a5103902089e5d3b220212252944adaa1fafd73ba28 +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_15_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_15_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceafdd9e0df79ad8fae18abae6a7dc11edb7d09b --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_15_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027bac1675fb9a6d286a5828eaacc83c97e7d64bc111841c3878fef3eea0f973 +size 414735 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..729929b06b040cc7debb91903bb1d5a09dce119a --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51276c45453caaff1ee2e19e11360cfe73bb21c33afcebface1448242c3d47fd +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16a588b8c5925eb7e403c91e321c79394df034e1 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb7a021b44b4ca40c11dbd0989be868c8248ceeeda139786f23de31f9bc72d9 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d503889a86a57fc3ec0aaeb858f16a770a7e9d0 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40bb21ddc57b8e6e98931e9d8c53cca423b0daab5ebf7f6649feab9e66e04a3e +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d8157eafcb7c2c918aa0488d2c4b3ec34d51d35 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38e27944f297980937ddf45ffb72ddf8999c1e3805793fbf11622cbcb0fb5265 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..882ac45c47cafce91bfb1caa9116e1170b5b7b48 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60397c30fef97f921655a12cc56d66eb93483eea0a693235a5b5febb232fd58 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e23b00ea9389285b8c59b9e0b3020e18a03bd135 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ab4e0f2003e99360211ec5c5c1d3ed6df9b6bd68a78bd56574dfc33dc7a105 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..144935a4bac42ac69558c10b248ad669ac5bc753 --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d832275dec4e55d4d1c3d2b6e3777a3deb6ae2da56f0add1e340c7c5fe7c06aa +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b54145537ebbff7aa2425715996a67047794f93b --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1df0ae85fd8c65b7439264269f87890da0064b390e5a041624cd6698d1c2b0 +size 413988 diff --git a/checkpoint-5700/global_step5700/zero_pp_rank_9_mp_rank_00_model_states.pt b/checkpoint-5700/global_step5700/zero_pp_rank_9_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..971f3a910029a90f5ee542c20d153cfe3e0be36a --- /dev/null +++ b/checkpoint-5700/global_step5700/zero_pp_rank_9_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fac9a997a36984d79c4d1406a9a2c552df48bcec36feaabeb51e2034544ba16 +size 413988 diff --git a/checkpoint-5700/latest b/checkpoint-5700/latest new file mode 100644 index 0000000000000000000000000000000000000000..c9e65f0c3be0eee7f8e0b514981e0dbb1ab5e575 --- /dev/null +++ b/checkpoint-5700/latest @@ -0,0 +1 @@ +global_step5700 \ No newline at end of file diff --git a/checkpoint-5700/llm/config.json b/checkpoint-5700/llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d1b2aa73fe4d19ffdfbb304b41410ce00a0cd43e --- /dev/null +++ b/checkpoint-5700/llm/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/llm", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 4096, + "model_max_length": 4096, + "model_type": "llama", + "num_attention_heads": 20, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/checkpoint-5700/llm/generation_config.json b/checkpoint-5700/llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f --- /dev/null +++ b/checkpoint-5700/llm/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.36.2" +} diff --git a/checkpoint-5700/llm/model-00001-of-00002.safetensors b/checkpoint-5700/llm/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6eeba45aeba99a6d2b50448acffe5e6e761cdf2d --- /dev/null +++ b/checkpoint-5700/llm/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e89de72228dfce9b23443e6e6b637449a5edc8458d9a50219c36868777aadeab +size 4974521464 diff --git a/checkpoint-5700/llm/model-00002-of-00002.safetensors b/checkpoint-5700/llm/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17d492c2c20bbe8149ad77a952d40edde61a425f --- /dev/null +++ b/checkpoint-5700/llm/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52f926825f685dc2cb04a3f7373251e782c22604903aab76d9d404a0fc735156 +size 428632856 diff --git a/checkpoint-5700/llm/model.safetensors.index.json b/checkpoint-5700/llm/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b173c9ac8194749df58c92051618c0ff74c4c20 --- /dev/null +++ b/checkpoint-5700/llm/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 5403120640 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-5700/llm/special_tokens_map.json b/checkpoint-5700/llm/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-5700/llm/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5700/llm/tokenizer.model b/checkpoint-5700/llm/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3b7eab905db502ae7629c8a3c1f8412a3178c4c2 --- /dev/null +++ b/checkpoint-5700/llm/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aedb3582ecda9fa99ee9242c17a9658f6744db083ee6ebdc8fb14857f84d220 +size 499723 diff --git a/checkpoint-5700/llm/tokenizer_config.json b/checkpoint-5700/llm/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..47ab96cd62cc374653a0ea0fb77f9457e0f53481 --- /dev/null +++ b/checkpoint-5700/llm/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 4096, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-5700/mm_projector/config.json b/checkpoint-5700/mm_projector/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b4b6f549aa9a5cd34b5e7b46308ec34ddb6cc93c --- /dev/null +++ b/checkpoint-5700/mm_projector/config.json @@ -0,0 +1,10 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/mm_projector", + "architectures": [ + "MultimodalProjector" + ], + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/checkpoint-5700/mm_projector/model.safetensors b/checkpoint-5700/mm_projector/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1ed07f9cb0b32a9378d1fc096b283d5edf4f5d3e --- /dev/null +++ b/checkpoint-5700/mm_projector/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e9f5fbc766b02ba90e2c4438f2ec3611203de735b5fc0181293373a16bbdc6 +size 36729360 diff --git a/checkpoint-5700/rng_state_0.pth b/checkpoint-5700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ffbe110967270adadf02d4c0806efa62c8a077a2 --- /dev/null +++ b/checkpoint-5700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb36302a1081a8c59d56674f62bccb88a7c877017ebb66e7c51af5c9d06e7d2b +size 21687 diff --git a/checkpoint-5700/rng_state_1.pth b/checkpoint-5700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..385cdccc7d520b2b67f866733bfbd8a12aacf60c --- /dev/null +++ b/checkpoint-5700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ca60de645a499045d7c93d6eb173f293d3d67d337b6fdd3fe01ad5a30cc775 +size 21687 diff --git a/checkpoint-5700/rng_state_10.pth b/checkpoint-5700/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..47b681ba9322ab9a98c47027af20642aff654767 --- /dev/null +++ b/checkpoint-5700/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d48557c846f5e5500fb2aba9e20b61de81220c516ab253080cea5f984074e261 +size 21698 diff --git a/checkpoint-5700/rng_state_11.pth b/checkpoint-5700/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..94b6adecc378f29e5c222b52cac15c5b4230bea1 --- /dev/null +++ b/checkpoint-5700/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a7fdbb913e0b5fa430ff46058fa78c50594c5bb263c5efaf244613954975ba5 +size 21698 diff --git a/checkpoint-5700/rng_state_12.pth b/checkpoint-5700/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..9875388b9c01fa7237c8300f8a6f8526f5afb6a1 --- /dev/null +++ b/checkpoint-5700/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9da0ae60bf129f9674296069d31595a62fdfa856b43dc4f0a7587384726fe0c +size 21698 diff --git a/checkpoint-5700/rng_state_13.pth b/checkpoint-5700/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3008749383d7e3094e7dc7698a01bc463dec3ef --- /dev/null +++ b/checkpoint-5700/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d945b4e792d8b71bdb459f42c6631c11172a7609e35f8c7a0faf35cd4d3454f +size 21698 diff --git a/checkpoint-5700/rng_state_14.pth b/checkpoint-5700/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d55eefe37fe6066cac2fe083858e320d883d202 --- /dev/null +++ b/checkpoint-5700/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33c88d139a3a2ea2d75a89d5fc20dd99758fa98f7c3bde6fce72f02b8003dfee +size 21698 diff --git a/checkpoint-5700/rng_state_15.pth b/checkpoint-5700/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..b84e4b4cd9696e0f258a12b61ae4729a48672862 --- /dev/null +++ b/checkpoint-5700/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44eabf91d950f368513f3657556c24c707c729bf3d887770def640afc50fda5f +size 21698 diff --git a/checkpoint-5700/rng_state_2.pth b/checkpoint-5700/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0e0becd8489e4eea5a214a6ede69e70ed79e2c1 --- /dev/null +++ b/checkpoint-5700/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b262d010ac0dec15711f640c7a1a4c16ee3c52e6baaa64b7a402605ef700017b +size 21687 diff --git a/checkpoint-5700/rng_state_3.pth b/checkpoint-5700/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..23b79d588deeb884e79cb145306be7421a1956a7 --- /dev/null +++ b/checkpoint-5700/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df87ff5ca3fc8bc34619c02a9c770190d063cb03983e56a9c08521fc9fa598cc +size 21687 diff --git a/checkpoint-5700/rng_state_4.pth b/checkpoint-5700/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..03f6d839bac9eba42afa93c5816ed4e02a6cc971 --- /dev/null +++ b/checkpoint-5700/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91cac33b7da752a1410acd20f19d18d5a3966e1fdb0dc62183f7c541c752ebc0 +size 21687 diff --git a/checkpoint-5700/rng_state_5.pth b/checkpoint-5700/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..80a2dca564dee8fa10ce1ec803bcb1318213cd1f --- /dev/null +++ b/checkpoint-5700/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8baf2f543e0c757fbac3e93597c5767735b9aef31ab0663c8c885d0f1ebc04c8 +size 21687 diff --git a/checkpoint-5700/rng_state_6.pth b/checkpoint-5700/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc8e5c6c9f26158a0a727a9697f617ac80a30f85 --- /dev/null +++ b/checkpoint-5700/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:563b4ba8016ddf56fcfac3d9c2bff462c447c314e1c9113e89b7284b44b47783 +size 21687 diff --git a/checkpoint-5700/rng_state_7.pth b/checkpoint-5700/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f931689fc5990482a874553d752f313c556eb0fd --- /dev/null +++ b/checkpoint-5700/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f434ca99a3b2698f4e3d1893c80e924754c28bb3532792066ffecbc666fd4da +size 21687 diff --git a/checkpoint-5700/rng_state_8.pth b/checkpoint-5700/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..b70620de0f85068885d98ac95a42fdba47014d7b --- /dev/null +++ b/checkpoint-5700/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37888ada1d4256c042af889df83bdd140d04ff0f450717087d05b0fe3b1d1986 +size 21687 diff --git a/checkpoint-5700/rng_state_9.pth b/checkpoint-5700/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..b696110e0de8d5d38bb9ca65c5be36726a9db5ba --- /dev/null +++ b/checkpoint-5700/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e740a083da26c121e4df76074a469ada178c9eccd83e612030acc746e23f082 +size 21687 diff --git a/checkpoint-5700/scheduler.pt b/checkpoint-5700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a0fdc280af4236f9cdfdb01b5f318af1ecfc2c5 --- /dev/null +++ b/checkpoint-5700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:289aa37df66d2f16fa3e961daba12364bda8ec51a804a17e4f041df71e42df9a +size 627 diff --git a/checkpoint-5700/trainer_state.json b/checkpoint-5700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b370c435f029eab1e1fcf030398bd12cb7182e --- /dev/null +++ b/checkpoint-5700/trainer_state.json @@ -0,0 +1,34221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9874404504114335, + "eval_steps": 500, + "global_step": 5700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.1494252873563219e-07, + "loss": 0.8138, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.7974, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.8003, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.8151, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 5.747126436781609e-07, + "loss": 0.8152, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 6.896551724137931e-07, + "loss": 0.7962, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 8.045977011494253e-07, + "loss": 0.7987, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 9.195402298850575e-07, + "loss": 0.7904, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.8011, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.7733, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.2643678160919542e-06, + "loss": 0.7698, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.779, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 1.4942528735632185e-06, + "loss": 0.7727, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.7267, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.724137931034483e-06, + "loss": 0.7231, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 1.839080459770115e-06, + "loss": 0.727, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 1.9540229885057475e-06, + "loss": 0.728, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.6981, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 2.1839080459770117e-06, + "loss": 0.6842, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.6777, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.6727, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.6708, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 2.6436781609195404e-06, + "loss": 0.6726, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.665, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 2.8735632183908046e-06, + "loss": 0.6452, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 2.988505747126437e-06, + "loss": 0.6388, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 3.103448275862069e-06, + "loss": 0.6562, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 3.2183908045977012e-06, + "loss": 0.6516, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6408, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 3.448275862068966e-06, + "loss": 0.6408, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.563218390804598e-06, + "loss": 0.6422, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 3.67816091954023e-06, + "loss": 0.6256, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 3.793103448275862e-06, + "loss": 0.6399, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 3.908045977011495e-06, + "loss": 0.6133, + "step": 34 + }, + { + "epoch": 0.01, + "learning_rate": 4.022988505747127e-06, + "loss": 0.6151, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 4.137931034482759e-06, + "loss": 0.6182, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 4.252873563218391e-06, + "loss": 0.6266, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 4.367816091954023e-06, + "loss": 0.6073, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 4.482758620689656e-06, + "loss": 0.6164, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.6078, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.71264367816092e-06, + "loss": 0.6099, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.6025, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 4.942528735632184e-06, + "loss": 0.6016, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 5.057471264367817e-06, + "loss": 0.6129, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 5.172413793103449e-06, + "loss": 0.6055, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 5.287356321839081e-06, + "loss": 0.5933, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 5.402298850574713e-06, + "loss": 0.6011, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 5.517241379310345e-06, + "loss": 0.5938, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 5.6321839080459775e-06, + "loss": 0.5975, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 5.747126436781609e-06, + "loss": 0.5881, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 5.862068965517242e-06, + "loss": 0.5889, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 5.977011494252874e-06, + "loss": 0.5824, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 6.091954022988507e-06, + "loss": 0.5805, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 6.206896551724138e-06, + "loss": 0.5791, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 6.321839080459771e-06, + "loss": 0.5897, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.5856, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 6.551724137931035e-06, + "loss": 0.5927, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5984, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 6.781609195402299e-06, + "loss": 0.5773, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 6.896551724137932e-06, + "loss": 0.5679, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 7.011494252873564e-06, + "loss": 0.5864, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 7.126436781609196e-06, + "loss": 0.568, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 7.241379310344828e-06, + "loss": 0.5995, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 7.35632183908046e-06, + "loss": 0.5634, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 7.4712643678160925e-06, + "loss": 0.5783, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 7.586206896551724e-06, + "loss": 0.572, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 7.701149425287356e-06, + "loss": 0.5772, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 7.81609195402299e-06, + "loss": 0.5611, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 7.93103448275862e-06, + "loss": 0.5901, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 8.045977011494253e-06, + "loss": 0.5774, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 8.160919540229886e-06, + "loss": 0.5845, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 8.275862068965518e-06, + "loss": 0.5646, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 8.390804597701149e-06, + "loss": 0.5755, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 8.505747126436782e-06, + "loss": 0.5638, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 8.620689655172414e-06, + "loss": 0.5663, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 8.735632183908047e-06, + "loss": 0.5741, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 8.85057471264368e-06, + "loss": 0.5721, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 8.965517241379312e-06, + "loss": 0.5506, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 9.080459770114942e-06, + "loss": 0.5658, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 9.195402298850575e-06, + "loss": 0.5664, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 9.310344827586207e-06, + "loss": 0.5681, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 9.42528735632184e-06, + "loss": 0.5679, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 9.54022988505747e-06, + "loss": 0.571, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 9.655172413793105e-06, + "loss": 0.5493, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 9.770114942528738e-06, + "loss": 0.5607, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 9.885057471264368e-06, + "loss": 0.5613, + "step": 86 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "loss": 0.5624, + "step": 87 + }, + { + "epoch": 0.02, + "learning_rate": 1.0114942528735633e-05, + "loss": 0.5735, + "step": 88 + }, + { + "epoch": 0.02, + "learning_rate": 1.0229885057471264e-05, + "loss": 0.5745, + "step": 89 + }, + { + "epoch": 0.02, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.5663, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 1.0459770114942529e-05, + "loss": 0.5482, + "step": 91 + }, + { + "epoch": 0.02, + "learning_rate": 1.0574712643678162e-05, + "loss": 0.5711, + "step": 92 + }, + { + "epoch": 0.02, + "learning_rate": 1.0689655172413792e-05, + "loss": 0.56, + "step": 93 + }, + { + "epoch": 0.02, + "learning_rate": 1.0804597701149427e-05, + "loss": 0.5514, + "step": 94 + }, + { + "epoch": 0.02, + "learning_rate": 1.091954022988506e-05, + "loss": 0.5783, + "step": 95 + }, + { + "epoch": 0.02, + "learning_rate": 1.103448275862069e-05, + "loss": 0.5573, + "step": 96 + }, + { + "epoch": 0.02, + "learning_rate": 1.1149425287356324e-05, + "loss": 0.5631, + "step": 97 + }, + { + "epoch": 0.02, + "learning_rate": 1.1264367816091955e-05, + "loss": 0.5467, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 1.1379310344827587e-05, + "loss": 0.5686, + "step": 99 + }, + { + "epoch": 0.02, + "learning_rate": 1.1494252873563218e-05, + "loss": 0.5516, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.1609195402298852e-05, + "loss": 0.5591, + "step": 101 + }, + { + "epoch": 0.02, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.5489, + "step": 102 + }, + { + "epoch": 0.02, + "learning_rate": 1.1839080459770116e-05, + "loss": 0.5568, + "step": 103 + }, + { + "epoch": 0.02, + "learning_rate": 1.1954022988505748e-05, + "loss": 0.5457, + "step": 104 + }, + { + "epoch": 0.02, + "learning_rate": 1.206896551724138e-05, + "loss": 0.5429, + "step": 105 + }, + { + "epoch": 0.02, + "learning_rate": 1.2183908045977013e-05, + "loss": 0.5531, + "step": 106 + }, + { + "epoch": 0.02, + "learning_rate": 1.2298850574712644e-05, + "loss": 0.5406, + "step": 107 + }, + { + "epoch": 0.02, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.5414, + "step": 108 + }, + { + "epoch": 0.02, + "learning_rate": 1.2528735632183907e-05, + "loss": 0.5448, + "step": 109 + }, + { + "epoch": 0.02, + "learning_rate": 1.2643678160919542e-05, + "loss": 0.548, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.2758620689655174e-05, + "loss": 0.5415, + "step": 111 + }, + { + "epoch": 0.02, + "learning_rate": 1.2873563218390805e-05, + "loss": 0.5541, + "step": 112 + }, + { + "epoch": 0.02, + "learning_rate": 1.298850574712644e-05, + "loss": 0.5606, + "step": 113 + }, + { + "epoch": 0.02, + "learning_rate": 1.310344827586207e-05, + "loss": 0.542, + "step": 114 + }, + { + "epoch": 0.02, + "learning_rate": 1.3218390804597702e-05, + "loss": 0.5431, + "step": 115 + }, + { + "epoch": 0.02, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.5307, + "step": 116 + }, + { + "epoch": 0.02, + "learning_rate": 1.3448275862068967e-05, + "loss": 0.549, + "step": 117 + }, + { + "epoch": 0.02, + "learning_rate": 1.3563218390804598e-05, + "loss": 0.5463, + "step": 118 + }, + { + "epoch": 0.02, + "learning_rate": 1.367816091954023e-05, + "loss": 0.5432, + "step": 119 + }, + { + "epoch": 0.02, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.5369, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 1.3908045977011496e-05, + "loss": 0.5385, + "step": 121 + }, + { + "epoch": 0.02, + "learning_rate": 1.4022988505747128e-05, + "loss": 0.5413, + "step": 122 + }, + { + "epoch": 0.02, + "learning_rate": 1.4137931034482759e-05, + "loss": 0.5432, + "step": 123 + }, + { + "epoch": 0.02, + "learning_rate": 1.4252873563218392e-05, + "loss": 0.5262, + "step": 124 + }, + { + "epoch": 0.02, + "learning_rate": 1.4367816091954022e-05, + "loss": 0.5489, + "step": 125 + }, + { + "epoch": 0.02, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.5398, + "step": 126 + }, + { + "epoch": 0.02, + "learning_rate": 1.459770114942529e-05, + "loss": 0.5472, + "step": 127 + }, + { + "epoch": 0.02, + "learning_rate": 1.471264367816092e-05, + "loss": 0.5424, + "step": 128 + }, + { + "epoch": 0.02, + "learning_rate": 1.4827586206896554e-05, + "loss": 0.5462, + "step": 129 + }, + { + "epoch": 0.02, + "learning_rate": 1.4942528735632185e-05, + "loss": 0.5408, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 1.5057471264367817e-05, + "loss": 0.5521, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.5362, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 1.528735632183908e-05, + "loss": 0.5475, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 1.540229885057471e-05, + "loss": 0.5244, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 1.5517241379310346e-05, + "loss": 0.5356, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 1.563218390804598e-05, + "loss": 0.5392, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 1.574712643678161e-05, + "loss": 0.5487, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 1.586206896551724e-05, + "loss": 0.5275, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 1.5977011494252876e-05, + "loss": 0.5268, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 1.6091954022988507e-05, + "loss": 0.5286, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.6206896551724137e-05, + "loss": 0.5404, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 1.632183908045977e-05, + "loss": 0.5323, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 1.6436781609195406e-05, + "loss": 0.5345, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.527, + "step": 144 + }, + { + "epoch": 0.03, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.5248, + "step": 145 + }, + { + "epoch": 0.03, + "learning_rate": 1.6781609195402298e-05, + "loss": 0.5292, + "step": 146 + }, + { + "epoch": 0.03, + "learning_rate": 1.6896551724137932e-05, + "loss": 0.5429, + "step": 147 + }, + { + "epoch": 0.03, + "learning_rate": 1.7011494252873563e-05, + "loss": 0.5322, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 1.7126436781609197e-05, + "loss": 0.5303, + "step": 149 + }, + { + "epoch": 0.03, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.5264, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.7356321839080462e-05, + "loss": 0.5303, + "step": 151 + }, + { + "epoch": 0.03, + "learning_rate": 1.7471264367816093e-05, + "loss": 0.5355, + "step": 152 + }, + { + "epoch": 0.03, + "learning_rate": 1.7586206896551724e-05, + "loss": 0.5251, + "step": 153 + }, + { + "epoch": 0.03, + "learning_rate": 1.770114942528736e-05, + "loss": 0.5325, + "step": 154 + }, + { + "epoch": 0.03, + "learning_rate": 1.781609195402299e-05, + "loss": 0.5274, + "step": 155 + }, + { + "epoch": 0.03, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.5273, + "step": 156 + }, + { + "epoch": 0.03, + "learning_rate": 1.8045977011494254e-05, + "loss": 0.5355, + "step": 157 + }, + { + "epoch": 0.03, + "learning_rate": 1.8160919540229885e-05, + "loss": 0.5371, + "step": 158 + }, + { + "epoch": 0.03, + "learning_rate": 1.827586206896552e-05, + "loss": 0.5453, + "step": 159 + }, + { + "epoch": 0.03, + "learning_rate": 1.839080459770115e-05, + "loss": 0.5329, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.8505747126436784e-05, + "loss": 0.5241, + "step": 161 + }, + { + "epoch": 0.03, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.5136, + "step": 162 + }, + { + "epoch": 0.03, + "learning_rate": 1.873563218390805e-05, + "loss": 0.5355, + "step": 163 + }, + { + "epoch": 0.03, + "learning_rate": 1.885057471264368e-05, + "loss": 0.5372, + "step": 164 + }, + { + "epoch": 0.03, + "learning_rate": 1.896551724137931e-05, + "loss": 0.5187, + "step": 165 + }, + { + "epoch": 0.03, + "learning_rate": 1.908045977011494e-05, + "loss": 0.5368, + "step": 166 + }, + { + "epoch": 0.03, + "learning_rate": 1.9195402298850576e-05, + "loss": 0.5113, + "step": 167 + }, + { + "epoch": 0.03, + "learning_rate": 1.931034482758621e-05, + "loss": 0.5277, + "step": 168 + }, + { + "epoch": 0.03, + "learning_rate": 1.942528735632184e-05, + "loss": 0.5277, + "step": 169 + }, + { + "epoch": 0.03, + "learning_rate": 1.9540229885057475e-05, + "loss": 0.5299, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 1.9655172413793106e-05, + "loss": 0.5241, + "step": 171 + }, + { + "epoch": 0.03, + "learning_rate": 1.9770114942528737e-05, + "loss": 0.5478, + "step": 172 + }, + { + "epoch": 0.03, + "learning_rate": 1.9885057471264367e-05, + "loss": 0.5343, + "step": 173 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.5179, + "step": 174 + }, + { + "epoch": 0.03, + "learning_rate": 1.99999984252778e-05, + "loss": 0.5239, + "step": 175 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999993701111697e-05, + "loss": 0.5314, + "step": 176 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999985827503177e-05, + "loss": 0.535, + "step": 177 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999974804454722e-05, + "loss": 0.5273, + "step": 178 + }, + { + "epoch": 0.03, + "learning_rate": 1.99999606319698e-05, + "loss": 0.5348, + "step": 179 + }, + { + "epoch": 0.03, + "learning_rate": 1.999994331005288e-05, + "loss": 0.516, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999922838709414e-05, + "loss": 0.523, + "step": 181 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999899217945845e-05, + "loss": 0.5273, + "step": 182 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999872447769624e-05, + "loss": 0.526, + "step": 183 + }, + { + "epoch": 0.03, + "learning_rate": 1.999984252818917e-05, + "loss": 0.5228, + "step": 184 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999809459213914e-05, + "loss": 0.5195, + "step": 185 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999773240854266e-05, + "loss": 0.5289, + "step": 186 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999733873121638e-05, + "loss": 0.5073, + "step": 187 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999691356028422e-05, + "loss": 0.5311, + "step": 188 + }, + { + "epoch": 0.03, + "learning_rate": 1.999964568958801e-05, + "loss": 0.5266, + "step": 189 + }, + { + "epoch": 0.03, + "learning_rate": 1.999959687381479e-05, + "loss": 0.5216, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.999954490872413e-05, + "loss": 0.5132, + "step": 191 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999489794332404e-05, + "loss": 0.5388, + "step": 192 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999431530656958e-05, + "loss": 0.532, + "step": 193 + }, + { + "epoch": 0.03, + "learning_rate": 1.999937011771615e-05, + "loss": 0.5262, + "step": 194 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999305555529324e-05, + "loss": 0.5378, + "step": 195 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999237844116807e-05, + "loss": 0.5188, + "step": 196 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999166983499923e-05, + "loss": 0.5267, + "step": 197 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999092973701e-05, + "loss": 0.519, + "step": 198 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999015814743337e-05, + "loss": 0.5109, + "step": 199 + }, + { + "epoch": 0.03, + "learning_rate": 1.999893550665124e-05, + "loss": 0.5264, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998852049449998e-05, + "loss": 0.5141, + "step": 201 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998765443165896e-05, + "loss": 0.5398, + "step": 202 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998675687826214e-05, + "loss": 0.517, + "step": 203 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998582783459214e-05, + "loss": 0.5293, + "step": 204 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998486730094157e-05, + "loss": 0.5163, + "step": 205 + }, + { + "epoch": 0.04, + "learning_rate": 1.99983875277613e-05, + "loss": 0.5251, + "step": 206 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998285176491878e-05, + "loss": 0.5183, + "step": 207 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998179676318133e-05, + "loss": 0.528, + "step": 208 + }, + { + "epoch": 0.04, + "learning_rate": 1.999807102727329e-05, + "loss": 0.5123, + "step": 209 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997959229391567e-05, + "loss": 0.5251, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997844282708173e-05, + "loss": 0.5262, + "step": 211 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997726187259307e-05, + "loss": 0.5303, + "step": 212 + }, + { + "epoch": 0.04, + "learning_rate": 1.999760494308217e-05, + "loss": 0.5351, + "step": 213 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997480550214942e-05, + "loss": 0.5298, + "step": 214 + }, + { + "epoch": 0.04, + "learning_rate": 1.99973530086968e-05, + "loss": 0.5313, + "step": 215 + }, + { + "epoch": 0.04, + "learning_rate": 1.999722231856791e-05, + "loss": 0.5232, + "step": 216 + }, + { + "epoch": 0.04, + "learning_rate": 1.999708847986944e-05, + "loss": 0.5183, + "step": 217 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996951492643538e-05, + "loss": 0.5284, + "step": 218 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996811356933346e-05, + "loss": 0.5213, + "step": 219 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996668072783e-05, + "loss": 0.5176, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996521640237624e-05, + "loss": 0.5141, + "step": 221 + }, + { + "epoch": 0.04, + "learning_rate": 1.999637205934334e-05, + "loss": 0.5304, + "step": 222 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996219330147255e-05, + "loss": 0.5064, + "step": 223 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996063452697472e-05, + "loss": 0.5068, + "step": 224 + }, + { + "epoch": 0.04, + "learning_rate": 1.999590442704308e-05, + "loss": 0.5203, + "step": 225 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995742253234168e-05, + "loss": 0.5301, + "step": 226 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995576931321812e-05, + "loss": 0.513, + "step": 227 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995408461358074e-05, + "loss": 0.522, + "step": 228 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995236843396018e-05, + "loss": 0.5117, + "step": 229 + }, + { + "epoch": 0.04, + "learning_rate": 1.999506207748969e-05, + "loss": 0.5231, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 1.999488416369414e-05, + "loss": 0.5127, + "step": 231 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994703102065385e-05, + "loss": 0.521, + "step": 232 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994518892660463e-05, + "loss": 0.5242, + "step": 233 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994331535537385e-05, + "loss": 0.5205, + "step": 234 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994141030755158e-05, + "loss": 0.5007, + "step": 235 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993947378373782e-05, + "loss": 0.5293, + "step": 236 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993750578454248e-05, + "loss": 0.51, + "step": 237 + }, + { + "epoch": 0.04, + "learning_rate": 1.999355063105853e-05, + "loss": 0.5178, + "step": 238 + }, + { + "epoch": 0.04, + "learning_rate": 1.999334753624961e-05, + "loss": 0.5296, + "step": 239 + }, + { + "epoch": 0.04, + "learning_rate": 1.999314129409144e-05, + "loss": 0.5249, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 1.999293190464899e-05, + "loss": 0.5024, + "step": 241 + }, + { + "epoch": 0.04, + "learning_rate": 1.999271936798819e-05, + "loss": 0.5026, + "step": 242 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992503684175986e-05, + "loss": 0.5192, + "step": 243 + }, + { + "epoch": 0.04, + "learning_rate": 1.999228485328031e-05, + "loss": 0.5099, + "step": 244 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992062875370073e-05, + "loss": 0.5017, + "step": 245 + }, + { + "epoch": 0.04, + "learning_rate": 1.999183775051519e-05, + "loss": 0.5169, + "step": 246 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991609478786564e-05, + "loss": 0.5197, + "step": 247 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991378060256084e-05, + "loss": 0.5364, + "step": 248 + }, + { + "epoch": 0.04, + "learning_rate": 1.999114349499664e-05, + "loss": 0.503, + "step": 249 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990905783082098e-05, + "loss": 0.5201, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 1.999066492458733e-05, + "loss": 0.5104, + "step": 251 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990420919588196e-05, + "loss": 0.5157, + "step": 252 + }, + { + "epoch": 0.04, + "learning_rate": 1.999017376816154e-05, + "loss": 0.5122, + "step": 253 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989923470385198e-05, + "loss": 0.5206, + "step": 254 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989670026338002e-05, + "loss": 0.5213, + "step": 255 + }, + { + "epoch": 0.04, + "learning_rate": 1.998941343609978e-05, + "loss": 0.5101, + "step": 256 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989153699751332e-05, + "loss": 0.5015, + "step": 257 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988890817374472e-05, + "loss": 0.515, + "step": 258 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988624789051983e-05, + "loss": 0.5218, + "step": 259 + }, + { + "epoch": 0.05, + "learning_rate": 1.9988355614867654e-05, + "loss": 0.5197, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 1.998808329490626e-05, + "loss": 0.5071, + "step": 261 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987807829253568e-05, + "loss": 0.5263, + "step": 262 + }, + { + "epoch": 0.05, + "learning_rate": 1.998752921799633e-05, + "loss": 0.5148, + "step": 263 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987247461222297e-05, + "loss": 0.5179, + "step": 264 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986962559020203e-05, + "loss": 0.5303, + "step": 265 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986674511479783e-05, + "loss": 0.5189, + "step": 266 + }, + { + "epoch": 0.05, + "learning_rate": 1.998638331869175e-05, + "loss": 0.5192, + "step": 267 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986088980747817e-05, + "loss": 0.5199, + "step": 268 + }, + { + "epoch": 0.05, + "learning_rate": 1.998579149774068e-05, + "loss": 0.5067, + "step": 269 + }, + { + "epoch": 0.05, + "learning_rate": 1.998549086976403e-05, + "loss": 0.5262, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985187096912554e-05, + "loss": 0.5066, + "step": 271 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984880179281917e-05, + "loss": 0.5427, + "step": 272 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984570116968785e-05, + "loss": 0.5037, + "step": 273 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984256910070807e-05, + "loss": 0.5228, + "step": 274 + }, + { + "epoch": 0.05, + "learning_rate": 1.998394055868663e-05, + "loss": 0.5075, + "step": 275 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983621062915886e-05, + "loss": 0.5133, + "step": 276 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983298422859197e-05, + "loss": 0.4988, + "step": 277 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982972638618175e-05, + "loss": 0.5129, + "step": 278 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982643710295428e-05, + "loss": 0.5012, + "step": 279 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982311637994547e-05, + "loss": 0.5289, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981976421820118e-05, + "loss": 0.5061, + "step": 281 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981638061877714e-05, + "loss": 0.5192, + "step": 282 + }, + { + "epoch": 0.05, + "learning_rate": 1.99812965582739e-05, + "loss": 0.5029, + "step": 283 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980951911116234e-05, + "loss": 0.5233, + "step": 284 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980604120513257e-05, + "loss": 0.5109, + "step": 285 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980253186574505e-05, + "loss": 0.5173, + "step": 286 + }, + { + "epoch": 0.05, + "learning_rate": 1.99798991094105e-05, + "loss": 0.4996, + "step": 287 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979541889132758e-05, + "loss": 0.518, + "step": 288 + }, + { + "epoch": 0.05, + "learning_rate": 1.997918152585379e-05, + "loss": 0.5019, + "step": 289 + }, + { + "epoch": 0.05, + "learning_rate": 1.997881801968708e-05, + "loss": 0.5075, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978451370747122e-05, + "loss": 0.5123, + "step": 291 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978081579149378e-05, + "loss": 0.5249, + "step": 292 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977708645010323e-05, + "loss": 0.5011, + "step": 293 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977332568447406e-05, + "loss": 0.4963, + "step": 294 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976953349579073e-05, + "loss": 0.5083, + "step": 295 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976570988524752e-05, + "loss": 0.5123, + "step": 296 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976185485404867e-05, + "loss": 0.5139, + "step": 297 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975796840340837e-05, + "loss": 0.5021, + "step": 298 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975405053455052e-05, + "loss": 0.5138, + "step": 299 + }, + { + "epoch": 0.05, + "learning_rate": 1.997501012487091e-05, + "loss": 0.5121, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 1.9974612054712792e-05, + "loss": 0.4994, + "step": 301 + }, + { + "epoch": 0.05, + "learning_rate": 1.9974210843106065e-05, + "loss": 0.5099, + "step": 302 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973806490177094e-05, + "loss": 0.5195, + "step": 303 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973398996053218e-05, + "loss": 0.5147, + "step": 304 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972988360862782e-05, + "loss": 0.5116, + "step": 305 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972574584735112e-05, + "loss": 0.5153, + "step": 306 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972157667800522e-05, + "loss": 0.5056, + "step": 307 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971737610190326e-05, + "loss": 0.5172, + "step": 308 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971314412036807e-05, + "loss": 0.5146, + "step": 309 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970888073473257e-05, + "loss": 0.5089, + "step": 310 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970458594633947e-05, + "loss": 0.5159, + "step": 311 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970025975654137e-05, + "loss": 0.5144, + "step": 312 + }, + { + "epoch": 0.05, + "learning_rate": 1.996959021667008e-05, + "loss": 0.516, + "step": 313 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969151317819014e-05, + "loss": 0.5195, + "step": 314 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968709279239172e-05, + "loss": 0.5033, + "step": 315 + }, + { + "epoch": 0.05, + "learning_rate": 1.996826410106977e-05, + "loss": 0.5096, + "step": 316 + }, + { + "epoch": 0.05, + "learning_rate": 1.996781578345101e-05, + "loss": 0.5041, + "step": 317 + }, + { + "epoch": 0.06, + "learning_rate": 1.996736432652409e-05, + "loss": 0.5246, + "step": 318 + }, + { + "epoch": 0.06, + "learning_rate": 1.9966909730431196e-05, + "loss": 0.5088, + "step": 319 + }, + { + "epoch": 0.06, + "learning_rate": 1.99664519953155e-05, + "loss": 0.5369, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965991121321156e-05, + "loss": 0.5081, + "step": 321 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965527108593325e-05, + "loss": 0.5189, + "step": 322 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965059957278135e-05, + "loss": 0.5071, + "step": 323 + }, + { + "epoch": 0.06, + "learning_rate": 1.9964589667522724e-05, + "loss": 0.5225, + "step": 324 + }, + { + "epoch": 0.06, + "learning_rate": 1.99641162394752e-05, + "loss": 0.5079, + "step": 325 + }, + { + "epoch": 0.06, + "learning_rate": 1.996363967328466e-05, + "loss": 0.5081, + "step": 326 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963159969101207e-05, + "loss": 0.5068, + "step": 327 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962677127075916e-05, + "loss": 0.5092, + "step": 328 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962191147360855e-05, + "loss": 0.4977, + "step": 329 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961702030109088e-05, + "loss": 0.5134, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 1.996120977547465e-05, + "loss": 0.5065, + "step": 331 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960714383612577e-05, + "loss": 0.5119, + "step": 332 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960215854678894e-05, + "loss": 0.5076, + "step": 333 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959714188830603e-05, + "loss": 0.4984, + "step": 334 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959209386225707e-05, + "loss": 0.5095, + "step": 335 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958701447023188e-05, + "loss": 0.505, + "step": 336 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958190371383016e-05, + "loss": 0.5217, + "step": 337 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957676159466154e-05, + "loss": 0.4897, + "step": 338 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957158811434552e-05, + "loss": 0.5075, + "step": 339 + }, + { + "epoch": 0.06, + "learning_rate": 1.995663832745115e-05, + "loss": 0.5113, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956114707679858e-05, + "loss": 0.5193, + "step": 341 + }, + { + "epoch": 0.06, + "learning_rate": 1.99555879522856e-05, + "loss": 0.5017, + "step": 342 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955058061434266e-05, + "loss": 0.5158, + "step": 343 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954525035292748e-05, + "loss": 0.5055, + "step": 344 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953988874028917e-05, + "loss": 0.5102, + "step": 345 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953449577811635e-05, + "loss": 0.4962, + "step": 346 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952907146810748e-05, + "loss": 0.5254, + "step": 347 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952361581197097e-05, + "loss": 0.5107, + "step": 348 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951812881142497e-05, + "loss": 0.5073, + "step": 349 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951261046819766e-05, + "loss": 0.5069, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 1.9950706078402696e-05, + "loss": 0.4961, + "step": 351 + }, + { + "epoch": 0.06, + "learning_rate": 1.9950147976066073e-05, + "loss": 0.4943, + "step": 352 + }, + { + "epoch": 0.06, + "learning_rate": 1.994958673998567e-05, + "loss": 0.5141, + "step": 353 + }, + { + "epoch": 0.06, + "learning_rate": 1.994902237033824e-05, + "loss": 0.5022, + "step": 354 + }, + { + "epoch": 0.06, + "learning_rate": 1.994845486730153e-05, + "loss": 0.5025, + "step": 355 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947884231054276e-05, + "loss": 0.5108, + "step": 356 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947310461776195e-05, + "loss": 0.5178, + "step": 357 + }, + { + "epoch": 0.06, + "learning_rate": 1.9946733559647987e-05, + "loss": 0.5009, + "step": 358 + }, + { + "epoch": 0.06, + "learning_rate": 1.9946153524851352e-05, + "loss": 0.5157, + "step": 359 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945570357568967e-05, + "loss": 0.5154, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 1.994498405798449e-05, + "loss": 0.527, + "step": 361 + }, + { + "epoch": 0.06, + "learning_rate": 1.994439462628258e-05, + "loss": 0.5034, + "step": 362 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943802062648877e-05, + "loss": 0.5117, + "step": 363 + }, + { + "epoch": 0.06, + "learning_rate": 1.994320636727e-05, + "loss": 0.5125, + "step": 364 + }, + { + "epoch": 0.06, + "learning_rate": 1.994260754033356e-05, + "loss": 0.5067, + "step": 365 + }, + { + "epoch": 0.06, + "learning_rate": 1.994200558202816e-05, + "loss": 0.496, + "step": 366 + }, + { + "epoch": 0.06, + "learning_rate": 1.9941400492543376e-05, + "loss": 0.508, + "step": 367 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940792272069783e-05, + "loss": 0.5069, + "step": 368 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940180920798934e-05, + "loss": 0.5098, + "step": 369 + }, + { + "epoch": 0.06, + "learning_rate": 1.993956643892337e-05, + "loss": 0.5125, + "step": 370 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938948826636625e-05, + "loss": 0.501, + "step": 371 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938328084133206e-05, + "loss": 0.512, + "step": 372 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937704211608615e-05, + "loss": 0.5049, + "step": 373 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937077209259333e-05, + "loss": 0.5034, + "step": 374 + }, + { + "epoch": 0.06, + "learning_rate": 1.993644707728284e-05, + "loss": 0.5152, + "step": 375 + }, + { + "epoch": 0.07, + "learning_rate": 1.993581381587758e-05, + "loss": 0.5045, + "step": 376 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935177425243007e-05, + "loss": 0.5107, + "step": 377 + }, + { + "epoch": 0.07, + "learning_rate": 1.993453790557954e-05, + "loss": 0.5137, + "step": 378 + }, + { + "epoch": 0.07, + "learning_rate": 1.99338952570886e-05, + "loss": 0.5065, + "step": 379 + }, + { + "epoch": 0.07, + "learning_rate": 1.993324947997258e-05, + "loss": 0.5046, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932600574434864e-05, + "loss": 0.5151, + "step": 381 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931948540679822e-05, + "loss": 0.5004, + "step": 382 + }, + { + "epoch": 0.07, + "learning_rate": 1.993129337891281e-05, + "loss": 0.5128, + "step": 383 + }, + { + "epoch": 0.07, + "learning_rate": 1.9930635089340168e-05, + "loss": 0.5013, + "step": 384 + }, + { + "epoch": 0.07, + "learning_rate": 1.992997367216922e-05, + "loss": 0.5062, + "step": 385 + }, + { + "epoch": 0.07, + "learning_rate": 1.992930912760827e-05, + "loss": 0.5075, + "step": 386 + }, + { + "epoch": 0.07, + "learning_rate": 1.992864145586662e-05, + "loss": 0.5185, + "step": 387 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927970657154548e-05, + "loss": 0.486, + "step": 388 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927296731683317e-05, + "loss": 0.5152, + "step": 389 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926619679665175e-05, + "loss": 0.5066, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925939501313358e-05, + "loss": 0.5078, + "step": 391 + }, + { + "epoch": 0.07, + "learning_rate": 1.992525619684208e-05, + "loss": 0.5135, + "step": 392 + }, + { + "epoch": 0.07, + "learning_rate": 1.9924569766466552e-05, + "loss": 0.5143, + "step": 393 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923880210402956e-05, + "loss": 0.503, + "step": 394 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923187528868463e-05, + "loss": 0.5251, + "step": 395 + }, + { + "epoch": 0.07, + "learning_rate": 1.9922491722081235e-05, + "loss": 0.5049, + "step": 396 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921792790260402e-05, + "loss": 0.5091, + "step": 397 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921090733626102e-05, + "loss": 0.4971, + "step": 398 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920385552399434e-05, + "loss": 0.5029, + "step": 399 + }, + { + "epoch": 0.07, + "learning_rate": 1.9919677246802492e-05, + "loss": 0.5008, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918965817058357e-05, + "loss": 0.4976, + "step": 401 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918251263391086e-05, + "loss": 0.5129, + "step": 402 + }, + { + "epoch": 0.07, + "learning_rate": 1.9917533586025725e-05, + "loss": 0.5045, + "step": 403 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916812785188305e-05, + "loss": 0.5052, + "step": 404 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916088861105835e-05, + "loss": 0.5116, + "step": 405 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915361814006312e-05, + "loss": 0.5115, + "step": 406 + }, + { + "epoch": 0.07, + "learning_rate": 1.9914631644118712e-05, + "loss": 0.5029, + "step": 407 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913898351673006e-05, + "loss": 0.504, + "step": 408 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913161936900135e-05, + "loss": 0.5115, + "step": 409 + }, + { + "epoch": 0.07, + "learning_rate": 1.9912422400032027e-05, + "loss": 0.4922, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 1.99116797413016e-05, + "loss": 0.5161, + "step": 411 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910933960942747e-05, + "loss": 0.5055, + "step": 412 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910185059190344e-05, + "loss": 0.5047, + "step": 413 + }, + { + "epoch": 0.07, + "learning_rate": 1.990943303628026e-05, + "loss": 0.5034, + "step": 414 + }, + { + "epoch": 0.07, + "learning_rate": 1.990867789244934e-05, + "loss": 0.5254, + "step": 415 + }, + { + "epoch": 0.07, + "learning_rate": 1.990791962793541e-05, + "loss": 0.4984, + "step": 416 + }, + { + "epoch": 0.07, + "learning_rate": 1.990715824297728e-05, + "loss": 0.5208, + "step": 417 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906393737814748e-05, + "loss": 0.5087, + "step": 418 + }, + { + "epoch": 0.07, + "learning_rate": 1.990562611268858e-05, + "loss": 0.4982, + "step": 419 + }, + { + "epoch": 0.07, + "learning_rate": 1.990485536784055e-05, + "loss": 0.4952, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 1.9904081503513395e-05, + "loss": 0.5156, + "step": 421 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903304519950833e-05, + "loss": 0.5065, + "step": 422 + }, + { + "epoch": 0.07, + "learning_rate": 1.990252441739758e-05, + "loss": 0.5126, + "step": 423 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901741196099313e-05, + "loss": 0.4999, + "step": 424 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900954856302715e-05, + "loss": 0.5116, + "step": 425 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900165398255434e-05, + "loss": 0.4924, + "step": 426 + }, + { + "epoch": 0.07, + "learning_rate": 1.9899372822206105e-05, + "loss": 0.5066, + "step": 427 + }, + { + "epoch": 0.07, + "learning_rate": 1.9898577128404343e-05, + "loss": 0.5091, + "step": 428 + }, + { + "epoch": 0.07, + "learning_rate": 1.9897778317100754e-05, + "loss": 0.4911, + "step": 429 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896976388546915e-05, + "loss": 0.4944, + "step": 430 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896171342995392e-05, + "loss": 0.5091, + "step": 431 + }, + { + "epoch": 0.07, + "learning_rate": 1.989536318069973e-05, + "loss": 0.5162, + "step": 432 + }, + { + "epoch": 0.08, + "learning_rate": 1.9894551901914445e-05, + "loss": 0.5103, + "step": 433 + }, + { + "epoch": 0.08, + "learning_rate": 1.989373750689506e-05, + "loss": 0.4951, + "step": 434 + }, + { + "epoch": 0.08, + "learning_rate": 1.9892919995898052e-05, + "loss": 0.5102, + "step": 435 + }, + { + "epoch": 0.08, + "learning_rate": 1.98920993691809e-05, + "loss": 0.51, + "step": 436 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891275627002043e-05, + "loss": 0.5001, + "step": 437 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890448769620932e-05, + "loss": 0.4947, + "step": 438 + }, + { + "epoch": 0.08, + "learning_rate": 1.988961879729797e-05, + "loss": 0.5059, + "step": 439 + }, + { + "epoch": 0.08, + "learning_rate": 1.9888785710294552e-05, + "loss": 0.5001, + "step": 440 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887949508873058e-05, + "loss": 0.5009, + "step": 441 + }, + { + "epoch": 0.08, + "learning_rate": 1.988711019329684e-05, + "loss": 0.4887, + "step": 442 + }, + { + "epoch": 0.08, + "learning_rate": 1.9886267763830245e-05, + "loss": 0.5005, + "step": 443 + }, + { + "epoch": 0.08, + "learning_rate": 1.9885422220738583e-05, + "loss": 0.4987, + "step": 444 + }, + { + "epoch": 0.08, + "learning_rate": 1.9884573564288154e-05, + "loss": 0.4964, + "step": 445 + }, + { + "epoch": 0.08, + "learning_rate": 1.9883721794746242e-05, + "loss": 0.4959, + "step": 446 + }, + { + "epoch": 0.08, + "learning_rate": 1.9882866912381105e-05, + "loss": 0.5038, + "step": 447 + }, + { + "epoch": 0.08, + "learning_rate": 1.988200891746198e-05, + "loss": 0.5196, + "step": 448 + }, + { + "epoch": 0.08, + "learning_rate": 1.9881147810259094e-05, + "loss": 0.5063, + "step": 449 + }, + { + "epoch": 0.08, + "learning_rate": 1.988028359104364e-05, + "loss": 0.51, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879416260087808e-05, + "loss": 0.5078, + "step": 451 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878545817664752e-05, + "loss": 0.498, + "step": 452 + }, + { + "epoch": 0.08, + "learning_rate": 1.9877672264048618e-05, + "loss": 0.5075, + "step": 453 + }, + { + "epoch": 0.08, + "learning_rate": 1.9876795599514523e-05, + "loss": 0.5062, + "step": 454 + }, + { + "epoch": 0.08, + "learning_rate": 1.987591582433857e-05, + "loss": 0.5133, + "step": 455 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875032938797837e-05, + "loss": 0.4992, + "step": 456 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874146943170386e-05, + "loss": 0.4929, + "step": 457 + }, + { + "epoch": 0.08, + "learning_rate": 1.9873257837735257e-05, + "loss": 0.5224, + "step": 458 + }, + { + "epoch": 0.08, + "learning_rate": 1.9872365622772464e-05, + "loss": 0.5048, + "step": 459 + }, + { + "epoch": 0.08, + "learning_rate": 1.987147029856301e-05, + "loss": 0.4979, + "step": 460 + }, + { + "epoch": 0.08, + "learning_rate": 1.9870571865388873e-05, + "loss": 0.5262, + "step": 461 + }, + { + "epoch": 0.08, + "learning_rate": 1.9869670323533005e-05, + "loss": 0.4916, + "step": 462 + }, + { + "epoch": 0.08, + "learning_rate": 1.9868765673279347e-05, + "loss": 0.4974, + "step": 463 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867857914912808e-05, + "loss": 0.495, + "step": 464 + }, + { + "epoch": 0.08, + "learning_rate": 1.9866947048719285e-05, + "loss": 0.5099, + "step": 465 + }, + { + "epoch": 0.08, + "learning_rate": 1.986603307498565e-05, + "loss": 0.4879, + "step": 466 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865115993999755e-05, + "loss": 0.5051, + "step": 467 + }, + { + "epoch": 0.08, + "learning_rate": 1.9864195806050425e-05, + "loss": 0.4841, + "step": 468 + }, + { + "epoch": 0.08, + "learning_rate": 1.9863272511427475e-05, + "loss": 0.5163, + "step": 469 + }, + { + "epoch": 0.08, + "learning_rate": 1.9862346110421682e-05, + "loss": 0.4991, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 1.986141660332482e-05, + "loss": 0.508, + "step": 471 + }, + { + "epoch": 0.08, + "learning_rate": 1.986048399042963e-05, + "loss": 0.501, + "step": 472 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859548272029828e-05, + "loss": 0.4963, + "step": 473 + }, + { + "epoch": 0.08, + "learning_rate": 1.9858609448420118e-05, + "loss": 0.4996, + "step": 474 + }, + { + "epoch": 0.08, + "learning_rate": 1.9857667519896176e-05, + "loss": 0.4953, + "step": 475 + }, + { + "epoch": 0.08, + "learning_rate": 1.985672248675466e-05, + "loss": 0.4918, + "step": 476 + }, + { + "epoch": 0.08, + "learning_rate": 1.98557743492932e-05, + "loss": 0.5062, + "step": 477 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854823107810402e-05, + "loss": 0.4982, + "step": 478 + }, + { + "epoch": 0.08, + "learning_rate": 1.9853868762605865e-05, + "loss": 0.4986, + "step": 479 + }, + { + "epoch": 0.08, + "learning_rate": 1.9852911313980146e-05, + "loss": 0.4995, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 1.9851950762234794e-05, + "loss": 0.4999, + "step": 481 + }, + { + "epoch": 0.08, + "learning_rate": 1.9850987107672322e-05, + "loss": 0.4976, + "step": 482 + }, + { + "epoch": 0.08, + "learning_rate": 1.9850020350596237e-05, + "loss": 0.5057, + "step": 483 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849050491311005e-05, + "loss": 0.5108, + "step": 484 + }, + { + "epoch": 0.08, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.5035, + "step": 485 + }, + { + "epoch": 0.08, + "learning_rate": 1.9847101467335895e-05, + "loss": 0.4953, + "step": 486 + }, + { + "epoch": 0.08, + "learning_rate": 1.9846122303259855e-05, + "loss": 0.5039, + "step": 487 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845140038202338e-05, + "loss": 0.4933, + "step": 488 + }, + { + "epoch": 0.08, + "learning_rate": 1.9844154672472707e-05, + "loss": 0.4985, + "step": 489 + }, + { + "epoch": 0.08, + "learning_rate": 1.9843166206381296e-05, + "loss": 0.5092, + "step": 490 + }, + { + "epoch": 0.09, + "learning_rate": 1.9842174640239415e-05, + "loss": 0.5042, + "step": 491 + }, + { + "epoch": 0.09, + "learning_rate": 1.984117997435935e-05, + "loss": 0.5104, + "step": 492 + }, + { + "epoch": 0.09, + "learning_rate": 1.9840182209054377e-05, + "loss": 0.4955, + "step": 493 + }, + { + "epoch": 0.09, + "learning_rate": 1.9839181344638722e-05, + "loss": 0.508, + "step": 494 + }, + { + "epoch": 0.09, + "learning_rate": 1.9838177381427613e-05, + "loss": 0.5043, + "step": 495 + }, + { + "epoch": 0.09, + "learning_rate": 1.9837170319737236e-05, + "loss": 0.5014, + "step": 496 + }, + { + "epoch": 0.09, + "learning_rate": 1.9836160159884762e-05, + "loss": 0.4948, + "step": 497 + }, + { + "epoch": 0.09, + "learning_rate": 1.9835146902188336e-05, + "loss": 0.5076, + "step": 498 + }, + { + "epoch": 0.09, + "learning_rate": 1.9834130546967073e-05, + "loss": 0.5062, + "step": 499 + }, + { + "epoch": 0.09, + "learning_rate": 1.983311109454108e-05, + "loss": 0.4908, + "step": 500 + }, + { + "epoch": 0.09, + "learning_rate": 1.983208854523141e-05, + "loss": 0.499, + "step": 501 + }, + { + "epoch": 0.09, + "learning_rate": 1.983106289936013e-05, + "loss": 0.4919, + "step": 502 + }, + { + "epoch": 0.09, + "learning_rate": 1.9830034157250245e-05, + "loss": 0.5054, + "step": 503 + }, + { + "epoch": 0.09, + "learning_rate": 1.9829002319225754e-05, + "loss": 0.4974, + "step": 504 + }, + { + "epoch": 0.09, + "learning_rate": 1.9827967385611638e-05, + "loss": 0.4992, + "step": 505 + }, + { + "epoch": 0.09, + "learning_rate": 1.9826929356733836e-05, + "loss": 0.5083, + "step": 506 + }, + { + "epoch": 0.09, + "learning_rate": 1.9825888232919268e-05, + "loss": 0.5134, + "step": 507 + }, + { + "epoch": 0.09, + "learning_rate": 1.9824844014495835e-05, + "loss": 0.5082, + "step": 508 + }, + { + "epoch": 0.09, + "learning_rate": 1.9823796701792405e-05, + "loss": 0.4943, + "step": 509 + }, + { + "epoch": 0.09, + "learning_rate": 1.9822746295138827e-05, + "loss": 0.5056, + "step": 510 + }, + { + "epoch": 0.09, + "learning_rate": 1.9821692794865918e-05, + "loss": 0.495, + "step": 511 + }, + { + "epoch": 0.09, + "learning_rate": 1.982063620130547e-05, + "loss": 0.4864, + "step": 512 + }, + { + "epoch": 0.09, + "learning_rate": 1.9819576514790254e-05, + "loss": 0.4939, + "step": 513 + }, + { + "epoch": 0.09, + "learning_rate": 1.9818513735654012e-05, + "loss": 0.5137, + "step": 514 + }, + { + "epoch": 0.09, + "learning_rate": 1.981744786423146e-05, + "loss": 0.5134, + "step": 515 + }, + { + "epoch": 0.09, + "learning_rate": 1.9816378900858288e-05, + "loss": 0.4942, + "step": 516 + }, + { + "epoch": 0.09, + "learning_rate": 1.9815306845871163e-05, + "loss": 0.4944, + "step": 517 + }, + { + "epoch": 0.09, + "learning_rate": 1.981423169960772e-05, + "loss": 0.5076, + "step": 518 + }, + { + "epoch": 0.09, + "learning_rate": 1.981315346240657e-05, + "loss": 0.4909, + "step": 519 + }, + { + "epoch": 0.09, + "learning_rate": 1.981207213460729e-05, + "loss": 0.4992, + "step": 520 + }, + { + "epoch": 0.09, + "learning_rate": 1.9810987716550458e-05, + "loss": 0.4944, + "step": 521 + }, + { + "epoch": 0.09, + "learning_rate": 1.9809900208577586e-05, + "loss": 0.4966, + "step": 522 + }, + { + "epoch": 0.09, + "learning_rate": 1.980880961103119e-05, + "loss": 0.4817, + "step": 523 + }, + { + "epoch": 0.09, + "learning_rate": 1.9807715924254743e-05, + "loss": 0.4879, + "step": 524 + }, + { + "epoch": 0.09, + "learning_rate": 1.98066191485927e-05, + "loss": 0.5011, + "step": 525 + }, + { + "epoch": 0.09, + "learning_rate": 1.980551928439048e-05, + "loss": 0.5055, + "step": 526 + }, + { + "epoch": 0.09, + "learning_rate": 1.980441633199448e-05, + "loss": 0.4915, + "step": 527 + }, + { + "epoch": 0.09, + "learning_rate": 1.980331029175207e-05, + "loss": 0.5072, + "step": 528 + }, + { + "epoch": 0.09, + "learning_rate": 1.9802201164011587e-05, + "loss": 0.4988, + "step": 529 + }, + { + "epoch": 0.09, + "learning_rate": 1.980108894912235e-05, + "loss": 0.4962, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 1.979997364743464e-05, + "loss": 0.4929, + "step": 531 + }, + { + "epoch": 0.09, + "learning_rate": 1.979885525929972e-05, + "loss": 0.5002, + "step": 532 + }, + { + "epoch": 0.09, + "learning_rate": 1.979773378506982e-05, + "loss": 0.4916, + "step": 533 + }, + { + "epoch": 0.09, + "learning_rate": 1.9796609225098136e-05, + "loss": 0.5075, + "step": 534 + }, + { + "epoch": 0.09, + "learning_rate": 1.9795481579738848e-05, + "loss": 0.5014, + "step": 535 + }, + { + "epoch": 0.09, + "learning_rate": 1.97943508493471e-05, + "loss": 0.5107, + "step": 536 + }, + { + "epoch": 0.09, + "learning_rate": 1.979321703427901e-05, + "loss": 0.5007, + "step": 537 + }, + { + "epoch": 0.09, + "learning_rate": 1.9792080134891662e-05, + "loss": 0.5064, + "step": 538 + }, + { + "epoch": 0.09, + "learning_rate": 1.9790940151543122e-05, + "loss": 0.5029, + "step": 539 + }, + { + "epoch": 0.09, + "learning_rate": 1.9789797084592418e-05, + "loss": 0.5052, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 1.9788650934399553e-05, + "loss": 0.4913, + "step": 541 + }, + { + "epoch": 0.09, + "learning_rate": 1.9787501701325505e-05, + "loss": 0.502, + "step": 542 + }, + { + "epoch": 0.09, + "learning_rate": 1.9786349385732212e-05, + "loss": 0.4988, + "step": 543 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785193987982593e-05, + "loss": 0.5049, + "step": 544 + }, + { + "epoch": 0.09, + "learning_rate": 1.9784035508440534e-05, + "loss": 0.4889, + "step": 545 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782873947470888e-05, + "loss": 0.499, + "step": 546 + }, + { + "epoch": 0.09, + "learning_rate": 1.9781709305439486e-05, + "loss": 0.5083, + "step": 547 + }, + { + "epoch": 0.09, + "learning_rate": 1.9780541582713128e-05, + "loss": 0.5017, + "step": 548 + }, + { + "epoch": 0.1, + "learning_rate": 1.9779370779659578e-05, + "loss": 0.4985, + "step": 549 + }, + { + "epoch": 0.1, + "learning_rate": 1.9778196896647572e-05, + "loss": 0.4972, + "step": 550 + }, + { + "epoch": 0.1, + "learning_rate": 1.977701993404682e-05, + "loss": 0.5008, + "step": 551 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775839892228004e-05, + "loss": 0.5087, + "step": 552 + }, + { + "epoch": 0.1, + "learning_rate": 1.9774656771562764e-05, + "loss": 0.4935, + "step": 553 + }, + { + "epoch": 0.1, + "learning_rate": 1.977347057242372e-05, + "loss": 0.5055, + "step": 554 + }, + { + "epoch": 0.1, + "learning_rate": 1.9772281295184465e-05, + "loss": 0.5013, + "step": 555 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771088940219546e-05, + "loss": 0.5, + "step": 556 + }, + { + "epoch": 0.1, + "learning_rate": 1.97698935079045e-05, + "loss": 0.4929, + "step": 557 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768694998615805e-05, + "loss": 0.5258, + "step": 558 + }, + { + "epoch": 0.1, + "learning_rate": 1.976749341273094e-05, + "loss": 0.5014, + "step": 559 + }, + { + "epoch": 0.1, + "learning_rate": 1.9766288750628327e-05, + "loss": 0.4991, + "step": 560 + }, + { + "epoch": 0.1, + "learning_rate": 1.976508101268738e-05, + "loss": 0.4859, + "step": 561 + }, + { + "epoch": 0.1, + "learning_rate": 1.976387019928846e-05, + "loss": 0.4919, + "step": 562 + }, + { + "epoch": 0.1, + "learning_rate": 1.97626563108129e-05, + "loss": 0.496, + "step": 563 + }, + { + "epoch": 0.1, + "learning_rate": 1.9761439347643027e-05, + "loss": 0.5063, + "step": 564 + }, + { + "epoch": 0.1, + "learning_rate": 1.97602193101621e-05, + "loss": 0.4906, + "step": 565 + }, + { + "epoch": 0.1, + "learning_rate": 1.9758996198754364e-05, + "loss": 0.4971, + "step": 566 + }, + { + "epoch": 0.1, + "learning_rate": 1.975777001380504e-05, + "loss": 0.5023, + "step": 567 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756540755700308e-05, + "loss": 0.4956, + "step": 568 + }, + { + "epoch": 0.1, + "learning_rate": 1.9755308424827303e-05, + "loss": 0.5023, + "step": 569 + }, + { + "epoch": 0.1, + "learning_rate": 1.9754073021574153e-05, + "loss": 0.4913, + "step": 570 + }, + { + "epoch": 0.1, + "learning_rate": 1.9752834546329944e-05, + "loss": 0.4865, + "step": 571 + }, + { + "epoch": 0.1, + "learning_rate": 1.9751592999484713e-05, + "loss": 0.5035, + "step": 572 + }, + { + "epoch": 0.1, + "learning_rate": 1.9750348381429484e-05, + "loss": 0.4981, + "step": 573 + }, + { + "epoch": 0.1, + "learning_rate": 1.974910069255625e-05, + "loss": 0.5027, + "step": 574 + }, + { + "epoch": 0.1, + "learning_rate": 1.9747849933257955e-05, + "loss": 0.4961, + "step": 575 + }, + { + "epoch": 0.1, + "learning_rate": 1.9746596103928524e-05, + "loss": 0.5046, + "step": 576 + }, + { + "epoch": 0.1, + "learning_rate": 1.974533920496284e-05, + "loss": 0.4902, + "step": 577 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744079236756756e-05, + "loss": 0.5076, + "step": 578 + }, + { + "epoch": 0.1, + "learning_rate": 1.9742816199707096e-05, + "loss": 0.483, + "step": 579 + }, + { + "epoch": 0.1, + "learning_rate": 1.9741550094211647e-05, + "loss": 0.5074, + "step": 580 + }, + { + "epoch": 0.1, + "learning_rate": 1.9740280920669153e-05, + "loss": 0.4936, + "step": 581 + }, + { + "epoch": 0.1, + "learning_rate": 1.973900867947934e-05, + "loss": 0.4946, + "step": 582 + }, + { + "epoch": 0.1, + "learning_rate": 1.9737733371042894e-05, + "loss": 0.4804, + "step": 583 + }, + { + "epoch": 0.1, + "learning_rate": 1.9736454995761468e-05, + "loss": 0.4976, + "step": 584 + }, + { + "epoch": 0.1, + "learning_rate": 1.973517355403767e-05, + "loss": 0.502, + "step": 585 + }, + { + "epoch": 0.1, + "learning_rate": 1.9733889046275095e-05, + "loss": 0.5016, + "step": 586 + }, + { + "epoch": 0.1, + "learning_rate": 1.9732601472878282e-05, + "loss": 0.5068, + "step": 587 + }, + { + "epoch": 0.1, + "learning_rate": 1.9731310834252747e-05, + "loss": 0.5192, + "step": 588 + }, + { + "epoch": 0.1, + "learning_rate": 1.9730017130804976e-05, + "loss": 0.4961, + "step": 589 + }, + { + "epoch": 0.1, + "learning_rate": 1.9728720362942404e-05, + "loss": 0.5111, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 1.9727420531073447e-05, + "loss": 0.5004, + "step": 591 + }, + { + "epoch": 0.1, + "learning_rate": 1.972611763560748e-05, + "loss": 0.4893, + "step": 592 + }, + { + "epoch": 0.1, + "learning_rate": 1.972481167695484e-05, + "loss": 0.4979, + "step": 593 + }, + { + "epoch": 0.1, + "learning_rate": 1.9723502655526832e-05, + "loss": 0.4872, + "step": 594 + }, + { + "epoch": 0.1, + "learning_rate": 1.9722190571735725e-05, + "loss": 0.4964, + "step": 595 + }, + { + "epoch": 0.1, + "learning_rate": 1.9720875425994758e-05, + "loss": 0.5013, + "step": 596 + }, + { + "epoch": 0.1, + "learning_rate": 1.9719557218718116e-05, + "loss": 0.4933, + "step": 597 + }, + { + "epoch": 0.1, + "learning_rate": 1.9718235950320978e-05, + "loss": 0.5033, + "step": 598 + }, + { + "epoch": 0.1, + "learning_rate": 1.9716911621219453e-05, + "loss": 0.5092, + "step": 599 + }, + { + "epoch": 0.1, + "learning_rate": 1.9715584231830642e-05, + "loss": 0.4879, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 1.9714253782572598e-05, + "loss": 0.5023, + "step": 601 + }, + { + "epoch": 0.1, + "learning_rate": 1.9712920273864333e-05, + "loss": 0.4989, + "step": 602 + }, + { + "epoch": 0.1, + "learning_rate": 1.9711583706125835e-05, + "loss": 0.4996, + "step": 603 + }, + { + "epoch": 0.1, + "learning_rate": 1.9710244079778042e-05, + "loss": 0.502, + "step": 604 + }, + { + "epoch": 0.1, + "learning_rate": 1.970890139524287e-05, + "loss": 0.5031, + "step": 605 + }, + { + "epoch": 0.1, + "learning_rate": 1.970755565294318e-05, + "loss": 0.507, + "step": 606 + }, + { + "epoch": 0.11, + "learning_rate": 1.9706206853302815e-05, + "loss": 0.4954, + "step": 607 + }, + { + "epoch": 0.11, + "learning_rate": 1.9704854996746565e-05, + "loss": 0.4885, + "step": 608 + }, + { + "epoch": 0.11, + "learning_rate": 1.9703500083700196e-05, + "loss": 0.4874, + "step": 609 + }, + { + "epoch": 0.11, + "learning_rate": 1.9702142114590426e-05, + "loss": 0.5074, + "step": 610 + }, + { + "epoch": 0.11, + "learning_rate": 1.970078108984494e-05, + "loss": 0.4931, + "step": 611 + }, + { + "epoch": 0.11, + "learning_rate": 1.969941700989239e-05, + "loss": 0.4933, + "step": 612 + }, + { + "epoch": 0.11, + "learning_rate": 1.9698049875162377e-05, + "loss": 0.4932, + "step": 613 + }, + { + "epoch": 0.11, + "learning_rate": 1.969667968608548e-05, + "loss": 0.5098, + "step": 614 + }, + { + "epoch": 0.11, + "learning_rate": 1.969530644309323e-05, + "loss": 0.4902, + "step": 615 + }, + { + "epoch": 0.11, + "learning_rate": 1.969393014661812e-05, + "loss": 0.5083, + "step": 616 + }, + { + "epoch": 0.11, + "learning_rate": 1.969255079709361e-05, + "loss": 0.4884, + "step": 617 + }, + { + "epoch": 0.11, + "learning_rate": 1.9691168394954117e-05, + "loss": 0.488, + "step": 618 + }, + { + "epoch": 0.11, + "learning_rate": 1.968978294063502e-05, + "loss": 0.5094, + "step": 619 + }, + { + "epoch": 0.11, + "learning_rate": 1.9688394434572666e-05, + "loss": 0.5111, + "step": 620 + }, + { + "epoch": 0.11, + "learning_rate": 1.9687002877204347e-05, + "loss": 0.4851, + "step": 621 + }, + { + "epoch": 0.11, + "learning_rate": 1.968560826896833e-05, + "loss": 0.5144, + "step": 622 + }, + { + "epoch": 0.11, + "learning_rate": 1.9684210610303848e-05, + "loss": 0.5036, + "step": 623 + }, + { + "epoch": 0.11, + "learning_rate": 1.9682809901651074e-05, + "loss": 0.5031, + "step": 624 + }, + { + "epoch": 0.11, + "learning_rate": 1.968140614345116e-05, + "loss": 0.5042, + "step": 625 + }, + { + "epoch": 0.11, + "learning_rate": 1.967999933614621e-05, + "loss": 0.4961, + "step": 626 + }, + { + "epoch": 0.11, + "learning_rate": 1.967858948017929e-05, + "loss": 0.4822, + "step": 627 + }, + { + "epoch": 0.11, + "learning_rate": 1.9677176575994425e-05, + "loss": 0.5064, + "step": 628 + }, + { + "epoch": 0.11, + "learning_rate": 1.9675760624036605e-05, + "loss": 0.5004, + "step": 629 + }, + { + "epoch": 0.11, + "learning_rate": 1.967434162475177e-05, + "loss": 0.4946, + "step": 630 + }, + { + "epoch": 0.11, + "learning_rate": 1.9672919578586832e-05, + "loss": 0.4925, + "step": 631 + }, + { + "epoch": 0.11, + "learning_rate": 1.9671494485989656e-05, + "loss": 0.4967, + "step": 632 + }, + { + "epoch": 0.11, + "learning_rate": 1.9670066347409063e-05, + "loss": 0.4867, + "step": 633 + }, + { + "epoch": 0.11, + "learning_rate": 1.966863516329484e-05, + "loss": 0.5025, + "step": 634 + }, + { + "epoch": 0.11, + "learning_rate": 1.966720093409773e-05, + "loss": 0.4966, + "step": 635 + }, + { + "epoch": 0.11, + "learning_rate": 1.9665763660269436e-05, + "loss": 0.4956, + "step": 636 + }, + { + "epoch": 0.11, + "learning_rate": 1.9664323342262623e-05, + "loss": 0.4999, + "step": 637 + }, + { + "epoch": 0.11, + "learning_rate": 1.96628799805309e-05, + "loss": 0.4919, + "step": 638 + }, + { + "epoch": 0.11, + "learning_rate": 1.966143357552886e-05, + "loss": 0.4875, + "step": 639 + }, + { + "epoch": 0.11, + "learning_rate": 1.9659984127712027e-05, + "loss": 0.4946, + "step": 640 + }, + { + "epoch": 0.11, + "learning_rate": 1.9658531637536905e-05, + "loss": 0.4844, + "step": 641 + }, + { + "epoch": 0.11, + "learning_rate": 1.9657076105460945e-05, + "loss": 0.4926, + "step": 642 + }, + { + "epoch": 0.11, + "learning_rate": 1.965561753194256e-05, + "loss": 0.4923, + "step": 643 + }, + { + "epoch": 0.11, + "learning_rate": 1.965415591744112e-05, + "loss": 0.4929, + "step": 644 + }, + { + "epoch": 0.11, + "learning_rate": 1.965269126241695e-05, + "loss": 0.4983, + "step": 645 + }, + { + "epoch": 0.11, + "learning_rate": 1.9651223567331333e-05, + "loss": 0.4932, + "step": 646 + }, + { + "epoch": 0.11, + "learning_rate": 1.964975283264652e-05, + "loss": 0.4953, + "step": 647 + }, + { + "epoch": 0.11, + "learning_rate": 1.9648279058825702e-05, + "loss": 0.493, + "step": 648 + }, + { + "epoch": 0.11, + "learning_rate": 1.964680224633304e-05, + "loss": 0.4916, + "step": 649 + }, + { + "epoch": 0.11, + "learning_rate": 1.9645322395633647e-05, + "loss": 0.4984, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 1.964383950719359e-05, + "loss": 0.4904, + "step": 651 + }, + { + "epoch": 0.11, + "learning_rate": 1.9642353581479904e-05, + "loss": 0.5009, + "step": 652 + }, + { + "epoch": 0.11, + "learning_rate": 1.964086461896057e-05, + "loss": 0.501, + "step": 653 + }, + { + "epoch": 0.11, + "learning_rate": 1.9639372620104527e-05, + "loss": 0.503, + "step": 654 + }, + { + "epoch": 0.11, + "learning_rate": 1.9637877585381672e-05, + "loss": 0.504, + "step": 655 + }, + { + "epoch": 0.11, + "learning_rate": 1.9636379515262857e-05, + "loss": 0.4844, + "step": 656 + }, + { + "epoch": 0.11, + "learning_rate": 1.9634878410219893e-05, + "loss": 0.4932, + "step": 657 + }, + { + "epoch": 0.11, + "learning_rate": 1.9633374270725546e-05, + "loss": 0.5057, + "step": 658 + }, + { + "epoch": 0.11, + "learning_rate": 1.963186709725353e-05, + "loss": 0.5004, + "step": 659 + }, + { + "epoch": 0.11, + "learning_rate": 1.9630356890278527e-05, + "loss": 0.4987, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 1.9628843650276167e-05, + "loss": 0.486, + "step": 661 + }, + { + "epoch": 0.11, + "learning_rate": 1.9627327377723035e-05, + "loss": 0.4997, + "step": 662 + }, + { + "epoch": 0.11, + "learning_rate": 1.9625808073096676e-05, + "loss": 0.5092, + "step": 663 + }, + { + "epoch": 0.12, + "learning_rate": 1.962428573687558e-05, + "loss": 0.4952, + "step": 664 + }, + { + "epoch": 0.12, + "learning_rate": 1.9622760369539206e-05, + "loss": 0.5028, + "step": 665 + }, + { + "epoch": 0.12, + "learning_rate": 1.9621231971567955e-05, + "loss": 0.5023, + "step": 666 + }, + { + "epoch": 0.12, + "learning_rate": 1.9619700543443187e-05, + "loss": 0.4897, + "step": 667 + }, + { + "epoch": 0.12, + "learning_rate": 1.961816608564722e-05, + "loss": 0.4956, + "step": 668 + }, + { + "epoch": 0.12, + "learning_rate": 1.9616628598663322e-05, + "loss": 0.5053, + "step": 669 + }, + { + "epoch": 0.12, + "learning_rate": 1.9615088082975715e-05, + "loss": 0.4903, + "step": 670 + }, + { + "epoch": 0.12, + "learning_rate": 1.9613544539069577e-05, + "loss": 0.4986, + "step": 671 + }, + { + "epoch": 0.12, + "learning_rate": 1.9611997967431037e-05, + "loss": 0.4863, + "step": 672 + }, + { + "epoch": 0.12, + "learning_rate": 1.9610448368547182e-05, + "loss": 0.4977, + "step": 673 + }, + { + "epoch": 0.12, + "learning_rate": 1.9608895742906046e-05, + "loss": 0.4801, + "step": 674 + }, + { + "epoch": 0.12, + "learning_rate": 1.960734009099662e-05, + "loss": 0.4961, + "step": 675 + }, + { + "epoch": 0.12, + "learning_rate": 1.9605781413308852e-05, + "loss": 0.4988, + "step": 676 + }, + { + "epoch": 0.12, + "learning_rate": 1.9604219710333637e-05, + "loss": 0.4955, + "step": 677 + }, + { + "epoch": 0.12, + "learning_rate": 1.9602654982562822e-05, + "loss": 0.4939, + "step": 678 + }, + { + "epoch": 0.12, + "learning_rate": 1.960108723048921e-05, + "loss": 0.5024, + "step": 679 + }, + { + "epoch": 0.12, + "learning_rate": 1.959951645460656e-05, + "loss": 0.4843, + "step": 680 + }, + { + "epoch": 0.12, + "learning_rate": 1.9597942655409574e-05, + "loss": 0.4985, + "step": 681 + }, + { + "epoch": 0.12, + "learning_rate": 1.9596365833393913e-05, + "loss": 0.4966, + "step": 682 + }, + { + "epoch": 0.12, + "learning_rate": 1.959478598905619e-05, + "loss": 0.511, + "step": 683 + }, + { + "epoch": 0.12, + "learning_rate": 1.9593203122893966e-05, + "loss": 0.4879, + "step": 684 + }, + { + "epoch": 0.12, + "learning_rate": 1.959161723540576e-05, + "loss": 0.4948, + "step": 685 + }, + { + "epoch": 0.12, + "learning_rate": 1.959002832709103e-05, + "loss": 0.4896, + "step": 686 + }, + { + "epoch": 0.12, + "learning_rate": 1.9588436398450206e-05, + "loss": 0.505, + "step": 687 + }, + { + "epoch": 0.12, + "learning_rate": 1.9586841449984643e-05, + "loss": 0.4988, + "step": 688 + }, + { + "epoch": 0.12, + "learning_rate": 1.958524348219667e-05, + "loss": 0.5032, + "step": 689 + }, + { + "epoch": 0.12, + "learning_rate": 1.958364249558956e-05, + "loss": 0.4961, + "step": 690 + }, + { + "epoch": 0.12, + "learning_rate": 1.9582038490667532e-05, + "loss": 0.4944, + "step": 691 + }, + { + "epoch": 0.12, + "learning_rate": 1.9580431467935753e-05, + "loss": 0.4867, + "step": 692 + }, + { + "epoch": 0.12, + "learning_rate": 1.957882142790035e-05, + "loss": 0.5004, + "step": 693 + }, + { + "epoch": 0.12, + "learning_rate": 1.95772083710684e-05, + "loss": 0.5042, + "step": 694 + }, + { + "epoch": 0.12, + "learning_rate": 1.9575592297947926e-05, + "loss": 0.4973, + "step": 695 + }, + { + "epoch": 0.12, + "learning_rate": 1.9573973209047893e-05, + "loss": 0.5008, + "step": 696 + }, + { + "epoch": 0.12, + "learning_rate": 1.9572351104878232e-05, + "loss": 0.4941, + "step": 697 + }, + { + "epoch": 0.12, + "learning_rate": 1.957072598594981e-05, + "loss": 0.4974, + "step": 698 + }, + { + "epoch": 0.12, + "learning_rate": 1.9569097852774456e-05, + "loss": 0.4998, + "step": 699 + }, + { + "epoch": 0.12, + "learning_rate": 1.9567466705864934e-05, + "loss": 0.4911, + "step": 700 + }, + { + "epoch": 0.12, + "learning_rate": 1.9565832545734972e-05, + "loss": 0.4998, + "step": 701 + }, + { + "epoch": 0.12, + "learning_rate": 1.9564195372899233e-05, + "loss": 0.4952, + "step": 702 + }, + { + "epoch": 0.12, + "learning_rate": 1.956255518787334e-05, + "loss": 0.5178, + "step": 703 + }, + { + "epoch": 0.12, + "learning_rate": 1.9560911991173856e-05, + "loss": 0.491, + "step": 704 + }, + { + "epoch": 0.12, + "learning_rate": 1.9559265783318304e-05, + "loss": 0.503, + "step": 705 + }, + { + "epoch": 0.12, + "learning_rate": 1.9557616564825138e-05, + "loss": 0.5068, + "step": 706 + }, + { + "epoch": 0.12, + "learning_rate": 1.955596433621378e-05, + "loss": 0.4874, + "step": 707 + }, + { + "epoch": 0.12, + "learning_rate": 1.9554309098004583e-05, + "loss": 0.5021, + "step": 708 + }, + { + "epoch": 0.12, + "learning_rate": 1.955265085071886e-05, + "loss": 0.504, + "step": 709 + }, + { + "epoch": 0.12, + "learning_rate": 1.9550989594878862e-05, + "loss": 0.4918, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 1.9549325331007795e-05, + "loss": 0.5056, + "step": 711 + }, + { + "epoch": 0.12, + "learning_rate": 1.954765805962981e-05, + "loss": 0.4862, + "step": 712 + }, + { + "epoch": 0.12, + "learning_rate": 1.9545987781270007e-05, + "loss": 0.5218, + "step": 713 + }, + { + "epoch": 0.12, + "learning_rate": 1.9544314496454423e-05, + "loss": 0.4962, + "step": 714 + }, + { + "epoch": 0.12, + "learning_rate": 1.9542638205710058e-05, + "loss": 0.5025, + "step": 715 + }, + { + "epoch": 0.12, + "learning_rate": 1.9540958909564846e-05, + "loss": 0.4878, + "step": 716 + }, + { + "epoch": 0.12, + "learning_rate": 1.9539276608547676e-05, + "loss": 0.4929, + "step": 717 + }, + { + "epoch": 0.12, + "learning_rate": 1.9537591303188375e-05, + "loss": 0.4846, + "step": 718 + }, + { + "epoch": 0.12, + "learning_rate": 1.953590299401772e-05, + "loss": 0.4969, + "step": 719 + }, + { + "epoch": 0.12, + "learning_rate": 1.953421168156744e-05, + "loss": 0.4985, + "step": 720 + }, + { + "epoch": 0.12, + "learning_rate": 1.9532517366370203e-05, + "loss": 0.4978, + "step": 721 + }, + { + "epoch": 0.13, + "learning_rate": 1.9530820048959616e-05, + "loss": 0.4921, + "step": 722 + }, + { + "epoch": 0.13, + "learning_rate": 1.9529119729870253e-05, + "loss": 0.5012, + "step": 723 + }, + { + "epoch": 0.13, + "learning_rate": 1.952741640963761e-05, + "loss": 0.4827, + "step": 724 + }, + { + "epoch": 0.13, + "learning_rate": 1.9525710088798142e-05, + "loss": 0.5087, + "step": 725 + }, + { + "epoch": 0.13, + "learning_rate": 1.9524000767889243e-05, + "loss": 0.4864, + "step": 726 + }, + { + "epoch": 0.13, + "learning_rate": 1.952228844744926e-05, + "loss": 0.4965, + "step": 727 + }, + { + "epoch": 0.13, + "learning_rate": 1.9520573128017467e-05, + "loss": 0.4973, + "step": 728 + }, + { + "epoch": 0.13, + "learning_rate": 1.951885481013411e-05, + "loss": 0.4982, + "step": 729 + }, + { + "epoch": 0.13, + "learning_rate": 1.951713349434035e-05, + "loss": 0.4904, + "step": 730 + }, + { + "epoch": 0.13, + "learning_rate": 1.9515409181178315e-05, + "loss": 0.4877, + "step": 731 + }, + { + "epoch": 0.13, + "learning_rate": 1.9513681871191063e-05, + "loss": 0.4991, + "step": 732 + }, + { + "epoch": 0.13, + "learning_rate": 1.95119515649226e-05, + "loss": 0.492, + "step": 733 + }, + { + "epoch": 0.13, + "learning_rate": 1.9510218262917883e-05, + "loss": 0.4919, + "step": 734 + }, + { + "epoch": 0.13, + "learning_rate": 1.9508481965722798e-05, + "loss": 0.4863, + "step": 735 + }, + { + "epoch": 0.13, + "learning_rate": 1.9506742673884186e-05, + "loss": 0.5002, + "step": 736 + }, + { + "epoch": 0.13, + "learning_rate": 1.9505000387949825e-05, + "loss": 0.4839, + "step": 737 + }, + { + "epoch": 0.13, + "learning_rate": 1.950325510846844e-05, + "loss": 0.4935, + "step": 738 + }, + { + "epoch": 0.13, + "learning_rate": 1.95015068359897e-05, + "loss": 0.4995, + "step": 739 + }, + { + "epoch": 0.13, + "learning_rate": 1.949975557106421e-05, + "loss": 0.4957, + "step": 740 + }, + { + "epoch": 0.13, + "learning_rate": 1.949800131424352e-05, + "loss": 0.4914, + "step": 741 + }, + { + "epoch": 0.13, + "learning_rate": 1.9496244066080122e-05, + "loss": 0.4897, + "step": 742 + }, + { + "epoch": 0.13, + "learning_rate": 1.949448382712746e-05, + "loss": 0.501, + "step": 743 + }, + { + "epoch": 0.13, + "learning_rate": 1.9492720597939902e-05, + "loss": 0.4924, + "step": 744 + }, + { + "epoch": 0.13, + "learning_rate": 1.9490954379072775e-05, + "loss": 0.4974, + "step": 745 + }, + { + "epoch": 0.13, + "learning_rate": 1.9489185171082334e-05, + "loss": 0.4968, + "step": 746 + }, + { + "epoch": 0.13, + "learning_rate": 1.9487412974525784e-05, + "loss": 0.4888, + "step": 747 + }, + { + "epoch": 0.13, + "learning_rate": 1.948563778996127e-05, + "loss": 0.4854, + "step": 748 + }, + { + "epoch": 0.13, + "learning_rate": 1.948385961794787e-05, + "loss": 0.5014, + "step": 749 + }, + { + "epoch": 0.13, + "learning_rate": 1.9482078459045617e-05, + "loss": 0.4971, + "step": 750 + }, + { + "epoch": 0.13, + "learning_rate": 1.9480294313815472e-05, + "loss": 0.4993, + "step": 751 + }, + { + "epoch": 0.13, + "learning_rate": 1.9478507182819345e-05, + "loss": 0.4901, + "step": 752 + }, + { + "epoch": 0.13, + "learning_rate": 1.9476717066620082e-05, + "loss": 0.5011, + "step": 753 + }, + { + "epoch": 0.13, + "learning_rate": 1.947492396578147e-05, + "loss": 0.5017, + "step": 754 + }, + { + "epoch": 0.13, + "learning_rate": 1.9473127880868233e-05, + "loss": 0.5016, + "step": 755 + }, + { + "epoch": 0.13, + "learning_rate": 1.9471328812446045e-05, + "loss": 0.4976, + "step": 756 + }, + { + "epoch": 0.13, + "learning_rate": 1.9469526761081504e-05, + "loss": 0.4896, + "step": 757 + }, + { + "epoch": 0.13, + "learning_rate": 1.946772172734216e-05, + "loss": 0.4892, + "step": 758 + }, + { + "epoch": 0.13, + "learning_rate": 1.9465913711796502e-05, + "loss": 0.5003, + "step": 759 + }, + { + "epoch": 0.13, + "learning_rate": 1.946410271501395e-05, + "loss": 0.4832, + "step": 760 + }, + { + "epoch": 0.13, + "learning_rate": 1.946228873756487e-05, + "loss": 0.5087, + "step": 761 + }, + { + "epoch": 0.13, + "learning_rate": 1.946047178002056e-05, + "loss": 0.4891, + "step": 762 + }, + { + "epoch": 0.13, + "learning_rate": 1.9458651842953264e-05, + "loss": 0.5023, + "step": 763 + }, + { + "epoch": 0.13, + "learning_rate": 1.945682892693616e-05, + "loss": 0.4886, + "step": 764 + }, + { + "epoch": 0.13, + "learning_rate": 1.9455003032543366e-05, + "loss": 0.5015, + "step": 765 + }, + { + "epoch": 0.13, + "learning_rate": 1.9453174160349938e-05, + "loss": 0.4845, + "step": 766 + }, + { + "epoch": 0.13, + "learning_rate": 1.9451342310931866e-05, + "loss": 0.4935, + "step": 767 + }, + { + "epoch": 0.13, + "learning_rate": 1.9449507484866084e-05, + "loss": 0.4867, + "step": 768 + }, + { + "epoch": 0.13, + "learning_rate": 1.944766968273046e-05, + "loss": 0.4937, + "step": 769 + }, + { + "epoch": 0.13, + "learning_rate": 1.9445828905103797e-05, + "loss": 0.4944, + "step": 770 + }, + { + "epoch": 0.13, + "learning_rate": 1.944398515256584e-05, + "loss": 0.4985, + "step": 771 + }, + { + "epoch": 0.13, + "learning_rate": 1.944213842569727e-05, + "loss": 0.4973, + "step": 772 + }, + { + "epoch": 0.13, + "learning_rate": 1.94402887250797e-05, + "loss": 0.4921, + "step": 773 + }, + { + "epoch": 0.13, + "learning_rate": 1.943843605129568e-05, + "loss": 0.493, + "step": 774 + }, + { + "epoch": 0.13, + "learning_rate": 1.943658040492871e-05, + "loss": 0.4867, + "step": 775 + }, + { + "epoch": 0.13, + "learning_rate": 1.9434721786563204e-05, + "loss": 0.4912, + "step": 776 + }, + { + "epoch": 0.13, + "learning_rate": 1.9432860196784533e-05, + "loss": 0.4909, + "step": 777 + }, + { + "epoch": 0.13, + "learning_rate": 1.9430995636178986e-05, + "loss": 0.4876, + "step": 778 + }, + { + "epoch": 0.13, + "learning_rate": 1.9429128105333802e-05, + "loss": 0.4934, + "step": 779 + }, + { + "epoch": 0.14, + "learning_rate": 1.9427257604837146e-05, + "loss": 0.4827, + "step": 780 + }, + { + "epoch": 0.14, + "learning_rate": 1.9425384135278126e-05, + "loss": 0.5015, + "step": 781 + }, + { + "epoch": 0.14, + "learning_rate": 1.942350769724678e-05, + "loss": 0.4936, + "step": 782 + }, + { + "epoch": 0.14, + "learning_rate": 1.9421628291334072e-05, + "loss": 0.4904, + "step": 783 + }, + { + "epoch": 0.14, + "learning_rate": 1.941974591813192e-05, + "loss": 0.4869, + "step": 784 + }, + { + "epoch": 0.14, + "learning_rate": 1.941786057823317e-05, + "loss": 0.4918, + "step": 785 + }, + { + "epoch": 0.14, + "learning_rate": 1.941597227223159e-05, + "loss": 0.5043, + "step": 786 + }, + { + "epoch": 0.14, + "learning_rate": 1.9414081000721898e-05, + "loss": 0.5034, + "step": 787 + }, + { + "epoch": 0.14, + "learning_rate": 1.9412186764299738e-05, + "loss": 0.4977, + "step": 788 + }, + { + "epoch": 0.14, + "learning_rate": 1.9410289563561685e-05, + "loss": 0.5031, + "step": 789 + }, + { + "epoch": 0.14, + "learning_rate": 1.9408389399105257e-05, + "loss": 0.4781, + "step": 790 + }, + { + "epoch": 0.14, + "learning_rate": 1.9406486271528896e-05, + "loss": 0.5038, + "step": 791 + }, + { + "epoch": 0.14, + "learning_rate": 1.940458018143199e-05, + "loss": 0.4921, + "step": 792 + }, + { + "epoch": 0.14, + "learning_rate": 1.9402671129414844e-05, + "loss": 0.4901, + "step": 793 + }, + { + "epoch": 0.14, + "learning_rate": 1.9400759116078703e-05, + "loss": 0.4883, + "step": 794 + }, + { + "epoch": 0.14, + "learning_rate": 1.9398844142025746e-05, + "loss": 0.4904, + "step": 795 + }, + { + "epoch": 0.14, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4899, + "step": 796 + }, + { + "epoch": 0.14, + "learning_rate": 1.9395005314182765e-05, + "loss": 0.5099, + "step": 797 + }, + { + "epoch": 0.14, + "learning_rate": 1.9393081461601752e-05, + "loss": 0.4862, + "step": 798 + }, + { + "epoch": 0.14, + "learning_rate": 1.939115465072196e-05, + "loss": 0.4944, + "step": 799 + }, + { + "epoch": 0.14, + "learning_rate": 1.938922488215023e-05, + "loss": 0.4799, + "step": 800 + }, + { + "epoch": 0.14, + "learning_rate": 1.9387292156494326e-05, + "loss": 0.4979, + "step": 801 + }, + { + "epoch": 0.14, + "learning_rate": 1.938535647436295e-05, + "loss": 0.4922, + "step": 802 + }, + { + "epoch": 0.14, + "learning_rate": 1.9383417836365734e-05, + "loss": 0.4974, + "step": 803 + }, + { + "epoch": 0.14, + "learning_rate": 1.9381476243113243e-05, + "loss": 0.4915, + "step": 804 + }, + { + "epoch": 0.14, + "learning_rate": 1.937953169521697e-05, + "loss": 0.4953, + "step": 805 + }, + { + "epoch": 0.14, + "learning_rate": 1.937758419328934e-05, + "loss": 0.4721, + "step": 806 + }, + { + "epoch": 0.14, + "learning_rate": 1.937563373794371e-05, + "loss": 0.4937, + "step": 807 + }, + { + "epoch": 0.14, + "learning_rate": 1.937368032979436e-05, + "loss": 0.4752, + "step": 808 + }, + { + "epoch": 0.14, + "learning_rate": 1.9371723969456512e-05, + "loss": 0.4992, + "step": 809 + }, + { + "epoch": 0.14, + "learning_rate": 1.9369764657546307e-05, + "loss": 0.5051, + "step": 810 + }, + { + "epoch": 0.14, + "learning_rate": 1.9367802394680816e-05, + "loss": 0.4984, + "step": 811 + }, + { + "epoch": 0.14, + "learning_rate": 1.9365837181478044e-05, + "loss": 0.4727, + "step": 812 + }, + { + "epoch": 0.14, + "learning_rate": 1.9363869018556928e-05, + "loss": 0.4883, + "step": 813 + }, + { + "epoch": 0.14, + "learning_rate": 1.936189790653733e-05, + "loss": 0.4926, + "step": 814 + }, + { + "epoch": 0.14, + "learning_rate": 1.9359923846040035e-05, + "loss": 0.4914, + "step": 815 + }, + { + "epoch": 0.14, + "learning_rate": 1.935794683768677e-05, + "loss": 0.5043, + "step": 816 + }, + { + "epoch": 0.14, + "learning_rate": 1.935596688210018e-05, + "loss": 0.4854, + "step": 817 + }, + { + "epoch": 0.14, + "learning_rate": 1.9353983979903836e-05, + "loss": 0.4882, + "step": 818 + }, + { + "epoch": 0.14, + "learning_rate": 1.9351998131722244e-05, + "loss": 0.4945, + "step": 819 + }, + { + "epoch": 0.14, + "learning_rate": 1.9350009338180842e-05, + "loss": 0.492, + "step": 820 + }, + { + "epoch": 0.14, + "learning_rate": 1.9348017599905984e-05, + "loss": 0.493, + "step": 821 + }, + { + "epoch": 0.14, + "learning_rate": 1.9346022917524958e-05, + "loss": 0.503, + "step": 822 + }, + { + "epoch": 0.14, + "learning_rate": 1.9344025291665978e-05, + "loss": 0.4995, + "step": 823 + }, + { + "epoch": 0.14, + "learning_rate": 1.9342024722958187e-05, + "loss": 0.497, + "step": 824 + }, + { + "epoch": 0.14, + "learning_rate": 1.9340021212031647e-05, + "loss": 0.4962, + "step": 825 + }, + { + "epoch": 0.14, + "learning_rate": 1.933801475951736e-05, + "loss": 0.4983, + "step": 826 + }, + { + "epoch": 0.14, + "learning_rate": 1.9336005366047246e-05, + "loss": 0.4809, + "step": 827 + }, + { + "epoch": 0.14, + "learning_rate": 1.933399303225415e-05, + "loss": 0.5052, + "step": 828 + }, + { + "epoch": 0.14, + "learning_rate": 1.933197775877184e-05, + "loss": 0.4871, + "step": 829 + }, + { + "epoch": 0.14, + "learning_rate": 1.9329959546235028e-05, + "loss": 0.4884, + "step": 830 + }, + { + "epoch": 0.14, + "learning_rate": 1.9327938395279325e-05, + "loss": 0.4829, + "step": 831 + }, + { + "epoch": 0.14, + "learning_rate": 1.9325914306541294e-05, + "loss": 0.5091, + "step": 832 + }, + { + "epoch": 0.14, + "learning_rate": 1.93238872806584e-05, + "loss": 0.4914, + "step": 833 + }, + { + "epoch": 0.14, + "learning_rate": 1.932185731826905e-05, + "loss": 0.4964, + "step": 834 + }, + { + "epoch": 0.14, + "learning_rate": 1.9319824420012566e-05, + "loss": 0.4823, + "step": 835 + }, + { + "epoch": 0.14, + "learning_rate": 1.93177885865292e-05, + "loss": 0.501, + "step": 836 + }, + { + "epoch": 0.14, + "learning_rate": 1.9315749818460127e-05, + "loss": 0.4884, + "step": 837 + }, + { + "epoch": 0.15, + "learning_rate": 1.9313708116447446e-05, + "loss": 0.5083, + "step": 838 + }, + { + "epoch": 0.15, + "learning_rate": 1.9311663481134174e-05, + "loss": 0.5016, + "step": 839 + }, + { + "epoch": 0.15, + "learning_rate": 1.9309615913164262e-05, + "loss": 0.5033, + "step": 840 + }, + { + "epoch": 0.15, + "learning_rate": 1.9307565413182582e-05, + "loss": 0.4805, + "step": 841 + }, + { + "epoch": 0.15, + "learning_rate": 1.9305511981834927e-05, + "loss": 0.4876, + "step": 842 + }, + { + "epoch": 0.15, + "learning_rate": 1.9303455619768006e-05, + "loss": 0.4936, + "step": 843 + }, + { + "epoch": 0.15, + "learning_rate": 1.930139632762947e-05, + "loss": 0.5003, + "step": 844 + }, + { + "epoch": 0.15, + "learning_rate": 1.9299334106067874e-05, + "loss": 0.492, + "step": 845 + }, + { + "epoch": 0.15, + "learning_rate": 1.9297268955732707e-05, + "loss": 0.4804, + "step": 846 + }, + { + "epoch": 0.15, + "learning_rate": 1.929520087727438e-05, + "loss": 0.4867, + "step": 847 + }, + { + "epoch": 0.15, + "learning_rate": 1.9293129871344215e-05, + "loss": 0.5015, + "step": 848 + }, + { + "epoch": 0.15, + "learning_rate": 1.9291055938594464e-05, + "loss": 0.4911, + "step": 849 + }, + { + "epoch": 0.15, + "learning_rate": 1.9288979079678306e-05, + "loss": 0.4982, + "step": 850 + }, + { + "epoch": 0.15, + "learning_rate": 1.928689929524983e-05, + "loss": 0.4947, + "step": 851 + }, + { + "epoch": 0.15, + "learning_rate": 1.928481658596406e-05, + "loss": 0.4926, + "step": 852 + }, + { + "epoch": 0.15, + "learning_rate": 1.9282730952476928e-05, + "loss": 0.4768, + "step": 853 + }, + { + "epoch": 0.15, + "learning_rate": 1.9280642395445298e-05, + "loss": 0.4899, + "step": 854 + }, + { + "epoch": 0.15, + "learning_rate": 1.9278550915526947e-05, + "loss": 0.4818, + "step": 855 + }, + { + "epoch": 0.15, + "learning_rate": 1.927645651338057e-05, + "loss": 0.4917, + "step": 856 + }, + { + "epoch": 0.15, + "learning_rate": 1.9274359189665792e-05, + "loss": 0.4928, + "step": 857 + }, + { + "epoch": 0.15, + "learning_rate": 1.9272258945043154e-05, + "loss": 0.4905, + "step": 858 + }, + { + "epoch": 0.15, + "learning_rate": 1.9270155780174113e-05, + "loss": 0.4839, + "step": 859 + }, + { + "epoch": 0.15, + "learning_rate": 1.9268049695721055e-05, + "loss": 0.4801, + "step": 860 + }, + { + "epoch": 0.15, + "learning_rate": 1.9265940692347276e-05, + "loss": 0.4792, + "step": 861 + }, + { + "epoch": 0.15, + "learning_rate": 1.9263828770716993e-05, + "loss": 0.4887, + "step": 862 + }, + { + "epoch": 0.15, + "learning_rate": 1.9261713931495344e-05, + "loss": 0.4897, + "step": 863 + }, + { + "epoch": 0.15, + "learning_rate": 1.925959617534839e-05, + "loss": 0.4902, + "step": 864 + }, + { + "epoch": 0.15, + "learning_rate": 1.92574755029431e-05, + "loss": 0.4804, + "step": 865 + }, + { + "epoch": 0.15, + "learning_rate": 1.925535191494738e-05, + "loss": 0.5123, + "step": 866 + }, + { + "epoch": 0.15, + "learning_rate": 1.9253225412030028e-05, + "loss": 0.4983, + "step": 867 + }, + { + "epoch": 0.15, + "learning_rate": 1.9251095994860782e-05, + "loss": 0.5005, + "step": 868 + }, + { + "epoch": 0.15, + "learning_rate": 1.924896366411029e-05, + "loss": 0.4775, + "step": 869 + }, + { + "epoch": 0.15, + "learning_rate": 1.9246828420450113e-05, + "loss": 0.4955, + "step": 870 + }, + { + "epoch": 0.15, + "learning_rate": 1.9244690264552745e-05, + "loss": 0.4846, + "step": 871 + }, + { + "epoch": 0.15, + "learning_rate": 1.924254919709157e-05, + "loss": 0.4919, + "step": 872 + }, + { + "epoch": 0.15, + "learning_rate": 1.924040521874092e-05, + "loss": 0.4945, + "step": 873 + }, + { + "epoch": 0.15, + "learning_rate": 1.923825833017602e-05, + "loss": 0.4845, + "step": 874 + }, + { + "epoch": 0.15, + "learning_rate": 1.9236108532073025e-05, + "loss": 0.4783, + "step": 875 + }, + { + "epoch": 0.15, + "learning_rate": 1.9233955825109e-05, + "loss": 0.4993, + "step": 876 + }, + { + "epoch": 0.15, + "learning_rate": 1.9231800209961932e-05, + "loss": 0.4976, + "step": 877 + }, + { + "epoch": 0.15, + "learning_rate": 1.9229641687310714e-05, + "loss": 0.4995, + "step": 878 + }, + { + "epoch": 0.15, + "learning_rate": 1.9227480257835163e-05, + "loss": 0.4897, + "step": 879 + }, + { + "epoch": 0.15, + "learning_rate": 1.922531592221601e-05, + "loss": 0.4904, + "step": 880 + }, + { + "epoch": 0.15, + "learning_rate": 1.92231486811349e-05, + "loss": 0.4959, + "step": 881 + }, + { + "epoch": 0.15, + "learning_rate": 1.9220978535274398e-05, + "loss": 0.4898, + "step": 882 + }, + { + "epoch": 0.15, + "learning_rate": 1.9218805485317973e-05, + "loss": 0.4901, + "step": 883 + }, + { + "epoch": 0.15, + "learning_rate": 1.9216629531950014e-05, + "loss": 0.4828, + "step": 884 + }, + { + "epoch": 0.15, + "learning_rate": 1.9214450675855832e-05, + "loss": 0.4791, + "step": 885 + }, + { + "epoch": 0.15, + "learning_rate": 1.9212268917721643e-05, + "loss": 0.4885, + "step": 886 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210084258234576e-05, + "loss": 0.4964, + "step": 887 + }, + { + "epoch": 0.15, + "learning_rate": 1.920789669808268e-05, + "loss": 0.5053, + "step": 888 + }, + { + "epoch": 0.15, + "learning_rate": 1.9205706237954914e-05, + "loss": 0.491, + "step": 889 + }, + { + "epoch": 0.15, + "learning_rate": 1.9203512878541156e-05, + "loss": 0.4959, + "step": 890 + }, + { + "epoch": 0.15, + "learning_rate": 1.9201316620532186e-05, + "loss": 0.4881, + "step": 891 + }, + { + "epoch": 0.15, + "learning_rate": 1.919911746461971e-05, + "loss": 0.4935, + "step": 892 + }, + { + "epoch": 0.15, + "learning_rate": 1.919691541149633e-05, + "loss": 0.4887, + "step": 893 + }, + { + "epoch": 0.15, + "learning_rate": 1.919471046185558e-05, + "loss": 0.4885, + "step": 894 + }, + { + "epoch": 0.16, + "learning_rate": 1.919250261639189e-05, + "loss": 0.4915, + "step": 895 + }, + { + "epoch": 0.16, + "learning_rate": 1.9190291875800616e-05, + "loss": 0.4944, + "step": 896 + }, + { + "epoch": 0.16, + "learning_rate": 1.918807824077801e-05, + "loss": 0.4916, + "step": 897 + }, + { + "epoch": 0.16, + "learning_rate": 1.918586171202125e-05, + "loss": 0.4934, + "step": 898 + }, + { + "epoch": 0.16, + "learning_rate": 1.9183642290228415e-05, + "loss": 0.4836, + "step": 899 + }, + { + "epoch": 0.16, + "learning_rate": 1.9181419976098503e-05, + "loss": 0.5045, + "step": 900 + }, + { + "epoch": 0.16, + "learning_rate": 1.9179194770331418e-05, + "loss": 0.4787, + "step": 901 + }, + { + "epoch": 0.16, + "learning_rate": 1.917696667362798e-05, + "loss": 0.4937, + "step": 902 + }, + { + "epoch": 0.16, + "learning_rate": 1.917473568668991e-05, + "loss": 0.4817, + "step": 903 + }, + { + "epoch": 0.16, + "learning_rate": 1.9172501810219844e-05, + "loss": 0.4836, + "step": 904 + }, + { + "epoch": 0.16, + "learning_rate": 1.9170265044921338e-05, + "loss": 0.472, + "step": 905 + }, + { + "epoch": 0.16, + "learning_rate": 1.9168025391498837e-05, + "loss": 0.5059, + "step": 906 + }, + { + "epoch": 0.16, + "learning_rate": 1.9165782850657716e-05, + "loss": 0.4813, + "step": 907 + }, + { + "epoch": 0.16, + "learning_rate": 1.916353742310425e-05, + "loss": 0.496, + "step": 908 + }, + { + "epoch": 0.16, + "learning_rate": 1.916128910954562e-05, + "loss": 0.4805, + "step": 909 + }, + { + "epoch": 0.16, + "learning_rate": 1.9159037910689925e-05, + "loss": 0.4943, + "step": 910 + }, + { + "epoch": 0.16, + "learning_rate": 1.915678382724616e-05, + "loss": 0.4927, + "step": 911 + }, + { + "epoch": 0.16, + "learning_rate": 1.9154526859924242e-05, + "loss": 0.4999, + "step": 912 + }, + { + "epoch": 0.16, + "learning_rate": 1.915226700943499e-05, + "loss": 0.487, + "step": 913 + }, + { + "epoch": 0.16, + "learning_rate": 1.915000427649013e-05, + "loss": 0.4971, + "step": 914 + }, + { + "epoch": 0.16, + "learning_rate": 1.9147738661802295e-05, + "loss": 0.4923, + "step": 915 + }, + { + "epoch": 0.16, + "learning_rate": 1.9145470166085034e-05, + "loss": 0.4784, + "step": 916 + }, + { + "epoch": 0.16, + "learning_rate": 1.9143198790052788e-05, + "loss": 0.4825, + "step": 917 + }, + { + "epoch": 0.16, + "learning_rate": 1.9140924534420924e-05, + "loss": 0.4898, + "step": 918 + }, + { + "epoch": 0.16, + "learning_rate": 1.91386473999057e-05, + "loss": 0.4846, + "step": 919 + }, + { + "epoch": 0.16, + "learning_rate": 1.9136367387224288e-05, + "loss": 0.4944, + "step": 920 + }, + { + "epoch": 0.16, + "learning_rate": 1.9134084497094766e-05, + "loss": 0.4891, + "step": 921 + }, + { + "epoch": 0.16, + "learning_rate": 1.9131798730236116e-05, + "loss": 0.4883, + "step": 922 + }, + { + "epoch": 0.16, + "learning_rate": 1.9129510087368234e-05, + "loss": 0.488, + "step": 923 + }, + { + "epoch": 0.16, + "learning_rate": 1.9127218569211905e-05, + "loss": 0.4985, + "step": 924 + }, + { + "epoch": 0.16, + "learning_rate": 1.9124924176488838e-05, + "loss": 0.4915, + "step": 925 + }, + { + "epoch": 0.16, + "learning_rate": 1.9122626909921637e-05, + "loss": 0.4934, + "step": 926 + }, + { + "epoch": 0.16, + "learning_rate": 1.912032677023381e-05, + "loss": 0.4974, + "step": 927 + }, + { + "epoch": 0.16, + "learning_rate": 1.9118023758149777e-05, + "loss": 0.4957, + "step": 928 + }, + { + "epoch": 0.16, + "learning_rate": 1.9115717874394856e-05, + "loss": 0.4833, + "step": 929 + }, + { + "epoch": 0.16, + "learning_rate": 1.9113409119695276e-05, + "loss": 0.4893, + "step": 930 + }, + { + "epoch": 0.16, + "learning_rate": 1.9111097494778164e-05, + "loss": 0.4852, + "step": 931 + }, + { + "epoch": 0.16, + "learning_rate": 1.9108783000371555e-05, + "loss": 0.488, + "step": 932 + }, + { + "epoch": 0.16, + "learning_rate": 1.910646563720439e-05, + "loss": 0.4887, + "step": 933 + }, + { + "epoch": 0.16, + "learning_rate": 1.9104145406006495e-05, + "loss": 0.4851, + "step": 934 + }, + { + "epoch": 0.16, + "learning_rate": 1.9101822307508628e-05, + "loss": 0.4894, + "step": 935 + }, + { + "epoch": 0.16, + "learning_rate": 1.9099496342442432e-05, + "loss": 0.4945, + "step": 936 + }, + { + "epoch": 0.16, + "learning_rate": 1.9097167511540453e-05, + "loss": 0.4975, + "step": 937 + }, + { + "epoch": 0.16, + "learning_rate": 1.909483581553615e-05, + "loss": 0.4941, + "step": 938 + }, + { + "epoch": 0.16, + "learning_rate": 1.9092501255163874e-05, + "loss": 0.4986, + "step": 939 + }, + { + "epoch": 0.16, + "learning_rate": 1.9090163831158883e-05, + "loss": 0.4856, + "step": 940 + }, + { + "epoch": 0.16, + "learning_rate": 1.9087823544257334e-05, + "loss": 0.4816, + "step": 941 + }, + { + "epoch": 0.16, + "learning_rate": 1.9085480395196287e-05, + "loss": 0.5001, + "step": 942 + }, + { + "epoch": 0.16, + "learning_rate": 1.9083134384713708e-05, + "loss": 0.4829, + "step": 943 + }, + { + "epoch": 0.16, + "learning_rate": 1.9080785513548454e-05, + "loss": 0.5031, + "step": 944 + }, + { + "epoch": 0.16, + "learning_rate": 1.9078433782440292e-05, + "loss": 0.4799, + "step": 945 + }, + { + "epoch": 0.16, + "learning_rate": 1.9076079192129886e-05, + "loss": 0.5045, + "step": 946 + }, + { + "epoch": 0.16, + "learning_rate": 1.9073721743358805e-05, + "loss": 0.4859, + "step": 947 + }, + { + "epoch": 0.16, + "learning_rate": 1.907136143686951e-05, + "loss": 0.4885, + "step": 948 + }, + { + "epoch": 0.16, + "learning_rate": 1.9068998273405364e-05, + "loss": 0.5062, + "step": 949 + }, + { + "epoch": 0.16, + "learning_rate": 1.9066632253710636e-05, + "loss": 0.4889, + "step": 950 + }, + { + "epoch": 0.16, + "learning_rate": 1.9064263378530495e-05, + "loss": 0.4893, + "step": 951 + }, + { + "epoch": 0.16, + "learning_rate": 1.9061891648610997e-05, + "loss": 0.492, + "step": 952 + }, + { + "epoch": 0.17, + "learning_rate": 1.905951706469911e-05, + "loss": 0.4924, + "step": 953 + }, + { + "epoch": 0.17, + "learning_rate": 1.9057139627542693e-05, + "loss": 0.5003, + "step": 954 + }, + { + "epoch": 0.17, + "learning_rate": 1.905475933789051e-05, + "loss": 0.4777, + "step": 955 + }, + { + "epoch": 0.17, + "learning_rate": 1.9052376196492218e-05, + "loss": 0.4971, + "step": 956 + }, + { + "epoch": 0.17, + "learning_rate": 1.904999020409837e-05, + "loss": 0.4724, + "step": 957 + }, + { + "epoch": 0.17, + "learning_rate": 1.904760136146043e-05, + "loss": 0.4922, + "step": 958 + }, + { + "epoch": 0.17, + "learning_rate": 1.9045209669330747e-05, + "loss": 0.4811, + "step": 959 + }, + { + "epoch": 0.17, + "learning_rate": 1.904281512846257e-05, + "loss": 0.4858, + "step": 960 + }, + { + "epoch": 0.17, + "learning_rate": 1.904041773961004e-05, + "loss": 0.4897, + "step": 961 + }, + { + "epoch": 0.17, + "learning_rate": 1.9038017503528215e-05, + "loss": 0.4933, + "step": 962 + }, + { + "epoch": 0.17, + "learning_rate": 1.9035614420973026e-05, + "loss": 0.4866, + "step": 963 + }, + { + "epoch": 0.17, + "learning_rate": 1.9033208492701316e-05, + "loss": 0.4981, + "step": 964 + }, + { + "epoch": 0.17, + "learning_rate": 1.903079971947081e-05, + "loss": 0.4943, + "step": 965 + }, + { + "epoch": 0.17, + "learning_rate": 1.902838810204015e-05, + "loss": 0.4872, + "step": 966 + }, + { + "epoch": 0.17, + "learning_rate": 1.9025973641168854e-05, + "loss": 0.4959, + "step": 967 + }, + { + "epoch": 0.17, + "learning_rate": 1.9023556337617343e-05, + "loss": 0.4882, + "step": 968 + }, + { + "epoch": 0.17, + "learning_rate": 1.9021136192146936e-05, + "loss": 0.4891, + "step": 969 + }, + { + "epoch": 0.17, + "learning_rate": 1.901871320551984e-05, + "loss": 0.4902, + "step": 970 + }, + { + "epoch": 0.17, + "learning_rate": 1.9016287378499167e-05, + "loss": 0.4823, + "step": 971 + }, + { + "epoch": 0.17, + "learning_rate": 1.9013858711848914e-05, + "loss": 0.4887, + "step": 972 + }, + { + "epoch": 0.17, + "learning_rate": 1.9011427206333976e-05, + "loss": 0.4975, + "step": 973 + }, + { + "epoch": 0.17, + "learning_rate": 1.9008992862720145e-05, + "loss": 0.4885, + "step": 974 + }, + { + "epoch": 0.17, + "learning_rate": 1.90065556817741e-05, + "loss": 0.4819, + "step": 975 + }, + { + "epoch": 0.17, + "learning_rate": 1.900411566426342e-05, + "loss": 0.4903, + "step": 976 + }, + { + "epoch": 0.17, + "learning_rate": 1.9001672810956575e-05, + "loss": 0.4949, + "step": 977 + }, + { + "epoch": 0.17, + "learning_rate": 1.899922712262293e-05, + "loss": 0.4802, + "step": 978 + }, + { + "epoch": 0.17, + "learning_rate": 1.8996778600032736e-05, + "loss": 0.488, + "step": 979 + }, + { + "epoch": 0.17, + "learning_rate": 1.8994327243957143e-05, + "loss": 0.482, + "step": 980 + }, + { + "epoch": 0.17, + "learning_rate": 1.8991873055168194e-05, + "loss": 0.4865, + "step": 981 + }, + { + "epoch": 0.17, + "learning_rate": 1.8989416034438823e-05, + "loss": 0.4803, + "step": 982 + }, + { + "epoch": 0.17, + "learning_rate": 1.8986956182542853e-05, + "loss": 0.502, + "step": 983 + }, + { + "epoch": 0.17, + "learning_rate": 1.8984493500255e-05, + "loss": 0.4831, + "step": 984 + }, + { + "epoch": 0.17, + "learning_rate": 1.8982027988350877e-05, + "loss": 0.4882, + "step": 985 + }, + { + "epoch": 0.17, + "learning_rate": 1.8979559647606973e-05, + "loss": 0.4845, + "step": 986 + }, + { + "epoch": 0.17, + "learning_rate": 1.8977088478800687e-05, + "loss": 0.4836, + "step": 987 + }, + { + "epoch": 0.17, + "learning_rate": 1.89746144827103e-05, + "loss": 0.4937, + "step": 988 + }, + { + "epoch": 0.17, + "learning_rate": 1.8972137660114977e-05, + "loss": 0.5059, + "step": 989 + }, + { + "epoch": 0.17, + "learning_rate": 1.8969658011794785e-05, + "loss": 0.4897, + "step": 990 + }, + { + "epoch": 0.17, + "learning_rate": 1.8967175538530675e-05, + "loss": 0.4968, + "step": 991 + }, + { + "epoch": 0.17, + "learning_rate": 1.8964690241104484e-05, + "loss": 0.5004, + "step": 992 + }, + { + "epoch": 0.17, + "learning_rate": 1.8962202120298948e-05, + "loss": 0.4861, + "step": 993 + }, + { + "epoch": 0.17, + "learning_rate": 1.8959711176897682e-05, + "loss": 0.4945, + "step": 994 + }, + { + "epoch": 0.17, + "learning_rate": 1.8957217411685197e-05, + "loss": 0.4937, + "step": 995 + }, + { + "epoch": 0.17, + "learning_rate": 1.8954720825446893e-05, + "loss": 0.4894, + "step": 996 + }, + { + "epoch": 0.17, + "learning_rate": 1.895222141896905e-05, + "loss": 0.4847, + "step": 997 + }, + { + "epoch": 0.17, + "learning_rate": 1.8949719193038847e-05, + "loss": 0.4885, + "step": 998 + }, + { + "epoch": 0.17, + "learning_rate": 1.8947214148444346e-05, + "loss": 0.4816, + "step": 999 + }, + { + "epoch": 0.17, + "learning_rate": 1.8944706285974496e-05, + "loss": 0.4887, + "step": 1000 + }, + { + "epoch": 0.17, + "learning_rate": 1.8942195606419133e-05, + "loss": 0.492, + "step": 1001 + }, + { + "epoch": 0.17, + "learning_rate": 1.8939682110568982e-05, + "loss": 0.493, + "step": 1002 + }, + { + "epoch": 0.17, + "learning_rate": 1.8937165799215657e-05, + "loss": 0.5016, + "step": 1003 + }, + { + "epoch": 0.17, + "learning_rate": 1.8934646673151655e-05, + "loss": 0.4833, + "step": 1004 + }, + { + "epoch": 0.17, + "learning_rate": 1.8932124733170357e-05, + "loss": 0.5154, + "step": 1005 + }, + { + "epoch": 0.17, + "learning_rate": 1.8929599980066034e-05, + "loss": 0.4956, + "step": 1006 + }, + { + "epoch": 0.17, + "learning_rate": 1.892707241463385e-05, + "loss": 0.5025, + "step": 1007 + }, + { + "epoch": 0.17, + "learning_rate": 1.8924542037669845e-05, + "loss": 0.4741, + "step": 1008 + }, + { + "epoch": 0.17, + "learning_rate": 1.8922008849970947e-05, + "loss": 0.49, + "step": 1009 + }, + { + "epoch": 0.17, + "learning_rate": 1.8919472852334964e-05, + "loss": 0.4839, + "step": 1010 + }, + { + "epoch": 0.18, + "learning_rate": 1.8916934045560603e-05, + "loss": 0.4931, + "step": 1011 + }, + { + "epoch": 0.18, + "learning_rate": 1.891439243044744e-05, + "loss": 0.4864, + "step": 1012 + }, + { + "epoch": 0.18, + "learning_rate": 1.8911848007795944e-05, + "loss": 0.497, + "step": 1013 + }, + { + "epoch": 0.18, + "learning_rate": 1.890930077840747e-05, + "loss": 0.4826, + "step": 1014 + }, + { + "epoch": 0.18, + "learning_rate": 1.890675074308425e-05, + "loss": 0.4936, + "step": 1015 + }, + { + "epoch": 0.18, + "learning_rate": 1.8904197902629408e-05, + "loss": 0.4765, + "step": 1016 + }, + { + "epoch": 0.18, + "learning_rate": 1.8901642257846943e-05, + "loss": 0.4886, + "step": 1017 + }, + { + "epoch": 0.18, + "learning_rate": 1.889908380954174e-05, + "loss": 0.4853, + "step": 1018 + }, + { + "epoch": 0.18, + "learning_rate": 1.8896522558519574e-05, + "loss": 0.5038, + "step": 1019 + }, + { + "epoch": 0.18, + "learning_rate": 1.8893958505587093e-05, + "loss": 0.4756, + "step": 1020 + }, + { + "epoch": 0.18, + "learning_rate": 1.8891391651551826e-05, + "loss": 0.4978, + "step": 1021 + }, + { + "epoch": 0.18, + "learning_rate": 1.88888219972222e-05, + "loss": 0.4758, + "step": 1022 + }, + { + "epoch": 0.18, + "learning_rate": 1.8886249543407505e-05, + "loss": 0.5042, + "step": 1023 + }, + { + "epoch": 0.18, + "learning_rate": 1.8883674290917927e-05, + "loss": 0.4856, + "step": 1024 + }, + { + "epoch": 0.18, + "learning_rate": 1.8881096240564523e-05, + "loss": 0.4897, + "step": 1025 + }, + { + "epoch": 0.18, + "learning_rate": 1.8878515393159236e-05, + "loss": 0.4896, + "step": 1026 + }, + { + "epoch": 0.18, + "learning_rate": 1.8875931749514893e-05, + "loss": 0.4969, + "step": 1027 + }, + { + "epoch": 0.18, + "learning_rate": 1.8873345310445193e-05, + "loss": 0.4821, + "step": 1028 + }, + { + "epoch": 0.18, + "learning_rate": 1.8870756076764728e-05, + "loss": 0.4879, + "step": 1029 + }, + { + "epoch": 0.18, + "learning_rate": 1.8868164049288954e-05, + "loss": 0.4839, + "step": 1030 + }, + { + "epoch": 0.18, + "learning_rate": 1.886556922883422e-05, + "loss": 0.5094, + "step": 1031 + }, + { + "epoch": 0.18, + "learning_rate": 1.8862971616217753e-05, + "loss": 0.487, + "step": 1032 + }, + { + "epoch": 0.18, + "learning_rate": 1.8860371212257648e-05, + "loss": 0.485, + "step": 1033 + }, + { + "epoch": 0.18, + "learning_rate": 1.88577680177729e-05, + "loss": 0.4799, + "step": 1034 + }, + { + "epoch": 0.18, + "learning_rate": 1.885516203358336e-05, + "loss": 0.4906, + "step": 1035 + }, + { + "epoch": 0.18, + "learning_rate": 1.8852553260509775e-05, + "loss": 0.4869, + "step": 1036 + }, + { + "epoch": 0.18, + "learning_rate": 1.884994169937376e-05, + "loss": 0.4998, + "step": 1037 + }, + { + "epoch": 0.18, + "learning_rate": 1.8847327350997814e-05, + "loss": 0.4949, + "step": 1038 + }, + { + "epoch": 0.18, + "learning_rate": 1.8844710216205306e-05, + "loss": 0.4873, + "step": 1039 + }, + { + "epoch": 0.18, + "learning_rate": 1.8842090295820497e-05, + "loss": 0.4882, + "step": 1040 + }, + { + "epoch": 0.18, + "learning_rate": 1.8839467590668507e-05, + "loss": 0.4983, + "step": 1041 + }, + { + "epoch": 0.18, + "learning_rate": 1.883684210157535e-05, + "loss": 0.4784, + "step": 1042 + }, + { + "epoch": 0.18, + "learning_rate": 1.8834213829367908e-05, + "loss": 0.4906, + "step": 1043 + }, + { + "epoch": 0.18, + "learning_rate": 1.8831582774873935e-05, + "loss": 0.4817, + "step": 1044 + }, + { + "epoch": 0.18, + "learning_rate": 1.8828948938922073e-05, + "loss": 0.4962, + "step": 1045 + }, + { + "epoch": 0.18, + "learning_rate": 1.882631232234183e-05, + "loss": 0.4835, + "step": 1046 + }, + { + "epoch": 0.18, + "learning_rate": 1.8823672925963598e-05, + "loss": 0.5017, + "step": 1047 + }, + { + "epoch": 0.18, + "learning_rate": 1.8821030750618633e-05, + "loss": 0.4919, + "step": 1048 + }, + { + "epoch": 0.18, + "learning_rate": 1.8818385797139083e-05, + "loss": 0.4901, + "step": 1049 + }, + { + "epoch": 0.18, + "learning_rate": 1.8815738066357954e-05, + "loss": 0.4795, + "step": 1050 + }, + { + "epoch": 0.18, + "learning_rate": 1.8813087559109137e-05, + "loss": 0.4837, + "step": 1051 + }, + { + "epoch": 0.18, + "learning_rate": 1.8810434276227397e-05, + "loss": 0.4824, + "step": 1052 + }, + { + "epoch": 0.18, + "learning_rate": 1.8807778218548364e-05, + "loss": 0.4876, + "step": 1053 + }, + { + "epoch": 0.18, + "learning_rate": 1.8805119386908556e-05, + "loss": 0.4757, + "step": 1054 + }, + { + "epoch": 0.18, + "learning_rate": 1.8802457782145352e-05, + "loss": 0.4832, + "step": 1055 + }, + { + "epoch": 0.18, + "learning_rate": 1.879979340509701e-05, + "loss": 0.4731, + "step": 1056 + }, + { + "epoch": 0.18, + "learning_rate": 1.8797126256602666e-05, + "loss": 0.5019, + "step": 1057 + }, + { + "epoch": 0.18, + "learning_rate": 1.8794456337502318e-05, + "loss": 0.4823, + "step": 1058 + }, + { + "epoch": 0.18, + "learning_rate": 1.8791783648636844e-05, + "loss": 0.494, + "step": 1059 + }, + { + "epoch": 0.18, + "learning_rate": 1.878910819084799e-05, + "loss": 0.4827, + "step": 1060 + }, + { + "epoch": 0.18, + "learning_rate": 1.878642996497838e-05, + "loss": 0.4991, + "step": 1061 + }, + { + "epoch": 0.18, + "learning_rate": 1.8783748971871508e-05, + "loss": 0.4858, + "step": 1062 + }, + { + "epoch": 0.18, + "learning_rate": 1.8781065212371732e-05, + "loss": 0.5024, + "step": 1063 + }, + { + "epoch": 0.18, + "learning_rate": 1.877837868732429e-05, + "loss": 0.492, + "step": 1064 + }, + { + "epoch": 0.18, + "learning_rate": 1.877568939757529e-05, + "loss": 0.5046, + "step": 1065 + }, + { + "epoch": 0.18, + "learning_rate": 1.8772997343971708e-05, + "loss": 0.4811, + "step": 1066 + }, + { + "epoch": 0.18, + "learning_rate": 1.877030252736139e-05, + "loss": 0.4876, + "step": 1067 + }, + { + "epoch": 0.19, + "learning_rate": 1.8767604948593052e-05, + "loss": 0.4762, + "step": 1068 + }, + { + "epoch": 0.19, + "learning_rate": 1.8764904608516287e-05, + "loss": 0.4956, + "step": 1069 + }, + { + "epoch": 0.19, + "learning_rate": 1.8762201507981546e-05, + "loss": 0.4916, + "step": 1070 + }, + { + "epoch": 0.19, + "learning_rate": 1.8759495647840158e-05, + "loss": 0.4886, + "step": 1071 + }, + { + "epoch": 0.19, + "learning_rate": 1.875678702894432e-05, + "loss": 0.5012, + "step": 1072 + }, + { + "epoch": 0.19, + "learning_rate": 1.8754075652147094e-05, + "loss": 0.4869, + "step": 1073 + }, + { + "epoch": 0.19, + "learning_rate": 1.8751361518302413e-05, + "loss": 0.4958, + "step": 1074 + }, + { + "epoch": 0.19, + "learning_rate": 1.8748644628265085e-05, + "loss": 0.4942, + "step": 1075 + }, + { + "epoch": 0.19, + "learning_rate": 1.874592498289077e-05, + "loss": 0.4724, + "step": 1076 + }, + { + "epoch": 0.19, + "learning_rate": 1.874320258303601e-05, + "loss": 0.4914, + "step": 1077 + }, + { + "epoch": 0.19, + "learning_rate": 1.8740477429558205e-05, + "loss": 0.4907, + "step": 1078 + }, + { + "epoch": 0.19, + "learning_rate": 1.8737749523315636e-05, + "loss": 0.4783, + "step": 1079 + }, + { + "epoch": 0.19, + "learning_rate": 1.8735018865167433e-05, + "loss": 0.4902, + "step": 1080 + }, + { + "epoch": 0.19, + "learning_rate": 1.873228545597361e-05, + "loss": 0.4966, + "step": 1081 + }, + { + "epoch": 0.19, + "learning_rate": 1.872954929659503e-05, + "loss": 0.4806, + "step": 1082 + }, + { + "epoch": 0.19, + "learning_rate": 1.8726810387893438e-05, + "loss": 0.4954, + "step": 1083 + }, + { + "epoch": 0.19, + "learning_rate": 1.8724068730731436e-05, + "loss": 0.4813, + "step": 1084 + }, + { + "epoch": 0.19, + "learning_rate": 1.872132432597249e-05, + "loss": 0.4887, + "step": 1085 + }, + { + "epoch": 0.19, + "learning_rate": 1.8718577174480938e-05, + "loss": 0.4834, + "step": 1086 + }, + { + "epoch": 0.19, + "learning_rate": 1.8715827277121982e-05, + "loss": 0.4913, + "step": 1087 + }, + { + "epoch": 0.19, + "learning_rate": 1.8713074634761687e-05, + "loss": 0.4868, + "step": 1088 + }, + { + "epoch": 0.19, + "learning_rate": 1.8710319248266978e-05, + "loss": 0.4912, + "step": 1089 + }, + { + "epoch": 0.19, + "learning_rate": 1.8707561118505656e-05, + "loss": 0.4802, + "step": 1090 + }, + { + "epoch": 0.19, + "learning_rate": 1.8704800246346367e-05, + "loss": 0.4976, + "step": 1091 + }, + { + "epoch": 0.19, + "learning_rate": 1.8702036632658646e-05, + "loss": 0.4892, + "step": 1092 + }, + { + "epoch": 0.19, + "learning_rate": 1.869927027831287e-05, + "loss": 0.4902, + "step": 1093 + }, + { + "epoch": 0.19, + "learning_rate": 1.8696501184180283e-05, + "loss": 0.4751, + "step": 1094 + }, + { + "epoch": 0.19, + "learning_rate": 1.8693729351133005e-05, + "loss": 0.4891, + "step": 1095 + }, + { + "epoch": 0.19, + "learning_rate": 1.8690954780044004e-05, + "loss": 0.4905, + "step": 1096 + }, + { + "epoch": 0.19, + "learning_rate": 1.8688177471787118e-05, + "loss": 0.4763, + "step": 1097 + }, + { + "epoch": 0.19, + "learning_rate": 1.8685397427237043e-05, + "loss": 0.4752, + "step": 1098 + }, + { + "epoch": 0.19, + "learning_rate": 1.868261464726934e-05, + "loss": 0.4936, + "step": 1099 + }, + { + "epoch": 0.19, + "learning_rate": 1.8679829132760427e-05, + "loss": 0.487, + "step": 1100 + }, + { + "epoch": 0.19, + "learning_rate": 1.867704088458759e-05, + "loss": 0.4899, + "step": 1101 + }, + { + "epoch": 0.19, + "learning_rate": 1.867424990362897e-05, + "loss": 0.5005, + "step": 1102 + }, + { + "epoch": 0.19, + "learning_rate": 1.8671456190763572e-05, + "loss": 0.5067, + "step": 1103 + }, + { + "epoch": 0.19, + "learning_rate": 1.866865974687126e-05, + "loss": 0.4883, + "step": 1104 + }, + { + "epoch": 0.19, + "learning_rate": 1.866586057283276e-05, + "loss": 0.4883, + "step": 1105 + }, + { + "epoch": 0.19, + "learning_rate": 1.8663058669529654e-05, + "loss": 0.482, + "step": 1106 + }, + { + "epoch": 0.19, + "learning_rate": 1.866025403784439e-05, + "loss": 0.5132, + "step": 1107 + }, + { + "epoch": 0.19, + "learning_rate": 1.8657446678660264e-05, + "loss": 0.4922, + "step": 1108 + }, + { + "epoch": 0.19, + "learning_rate": 1.865463659286144e-05, + "loss": 0.4843, + "step": 1109 + }, + { + "epoch": 0.19, + "learning_rate": 1.8651823781332948e-05, + "loss": 0.4855, + "step": 1110 + }, + { + "epoch": 0.19, + "learning_rate": 1.8649008244960657e-05, + "loss": 0.4881, + "step": 1111 + }, + { + "epoch": 0.19, + "learning_rate": 1.8646189984631306e-05, + "loss": 0.4867, + "step": 1112 + }, + { + "epoch": 0.19, + "learning_rate": 1.8643369001232498e-05, + "loss": 0.4921, + "step": 1113 + }, + { + "epoch": 0.19, + "learning_rate": 1.864054529565267e-05, + "loss": 0.4784, + "step": 1114 + }, + { + "epoch": 0.19, + "learning_rate": 1.8637718868781154e-05, + "loss": 0.4994, + "step": 1115 + }, + { + "epoch": 0.19, + "learning_rate": 1.86348897215081e-05, + "loss": 0.4842, + "step": 1116 + }, + { + "epoch": 0.19, + "learning_rate": 1.863205785472454e-05, + "loss": 0.4868, + "step": 1117 + }, + { + "epoch": 0.19, + "learning_rate": 1.8629223269322353e-05, + "loss": 0.4885, + "step": 1118 + }, + { + "epoch": 0.19, + "learning_rate": 1.8626385966194275e-05, + "loss": 0.488, + "step": 1119 + }, + { + "epoch": 0.19, + "learning_rate": 1.86235459462339e-05, + "loss": 0.4855, + "step": 1120 + }, + { + "epoch": 0.19, + "learning_rate": 1.862070321033568e-05, + "loss": 0.4894, + "step": 1121 + }, + { + "epoch": 0.19, + "learning_rate": 1.8617857759394913e-05, + "loss": 0.478, + "step": 1122 + }, + { + "epoch": 0.19, + "learning_rate": 1.8615009594307757e-05, + "loss": 0.4825, + "step": 1123 + }, + { + "epoch": 0.19, + "learning_rate": 1.861215871597123e-05, + "loss": 0.4743, + "step": 1124 + }, + { + "epoch": 0.19, + "learning_rate": 1.8609305125283202e-05, + "loss": 0.4972, + "step": 1125 + }, + { + "epoch": 0.2, + "learning_rate": 1.860644882314239e-05, + "loss": 0.4808, + "step": 1126 + }, + { + "epoch": 0.2, + "learning_rate": 1.8603589810448377e-05, + "loss": 0.4905, + "step": 1127 + }, + { + "epoch": 0.2, + "learning_rate": 1.8600728088101587e-05, + "loss": 0.4849, + "step": 1128 + }, + { + "epoch": 0.2, + "learning_rate": 1.8597863657003303e-05, + "loss": 0.4904, + "step": 1129 + }, + { + "epoch": 0.2, + "learning_rate": 1.859499651805567e-05, + "loss": 0.4808, + "step": 1130 + }, + { + "epoch": 0.2, + "learning_rate": 1.859212667216167e-05, + "loss": 0.4904, + "step": 1131 + }, + { + "epoch": 0.2, + "learning_rate": 1.8589254120225145e-05, + "loss": 0.4728, + "step": 1132 + }, + { + "epoch": 0.2, + "learning_rate": 1.858637886315079e-05, + "loss": 0.4987, + "step": 1133 + }, + { + "epoch": 0.2, + "learning_rate": 1.8583500901844157e-05, + "loss": 0.4776, + "step": 1134 + }, + { + "epoch": 0.2, + "learning_rate": 1.858062023721164e-05, + "loss": 0.4862, + "step": 1135 + }, + { + "epoch": 0.2, + "learning_rate": 1.8577736870160482e-05, + "loss": 0.488, + "step": 1136 + }, + { + "epoch": 0.2, + "learning_rate": 1.857485080159879e-05, + "loss": 0.4913, + "step": 1137 + }, + { + "epoch": 0.2, + "learning_rate": 1.857196203243552e-05, + "loss": 0.4903, + "step": 1138 + }, + { + "epoch": 0.2, + "learning_rate": 1.8569070563580466e-05, + "loss": 0.4746, + "step": 1139 + }, + { + "epoch": 0.2, + "learning_rate": 1.8566176395944277e-05, + "loss": 0.4835, + "step": 1140 + }, + { + "epoch": 0.2, + "learning_rate": 1.8563279530438464e-05, + "loss": 0.4722, + "step": 1141 + }, + { + "epoch": 0.2, + "learning_rate": 1.8560379967975376e-05, + "loss": 0.4798, + "step": 1142 + }, + { + "epoch": 0.2, + "learning_rate": 1.8557477709468214e-05, + "loss": 0.4928, + "step": 1143 + }, + { + "epoch": 0.2, + "learning_rate": 1.8554572755831026e-05, + "loss": 0.4819, + "step": 1144 + }, + { + "epoch": 0.2, + "learning_rate": 1.8551665107978708e-05, + "loss": 0.4879, + "step": 1145 + }, + { + "epoch": 0.2, + "learning_rate": 1.8548754766827016e-05, + "loss": 0.4904, + "step": 1146 + }, + { + "epoch": 0.2, + "learning_rate": 1.8545841733292543e-05, + "loss": 0.4838, + "step": 1147 + }, + { + "epoch": 0.2, + "learning_rate": 1.8542926008292726e-05, + "loss": 0.4819, + "step": 1148 + }, + { + "epoch": 0.2, + "learning_rate": 1.8540007592745865e-05, + "loss": 0.4732, + "step": 1149 + }, + { + "epoch": 0.2, + "learning_rate": 1.8537086487571095e-05, + "loss": 0.4843, + "step": 1150 + }, + { + "epoch": 0.2, + "learning_rate": 1.85341626936884e-05, + "loss": 0.4837, + "step": 1151 + }, + { + "epoch": 0.2, + "learning_rate": 1.8531236212018616e-05, + "loss": 0.4972, + "step": 1152 + }, + { + "epoch": 0.2, + "learning_rate": 1.8528307043483425e-05, + "loss": 0.4893, + "step": 1153 + }, + { + "epoch": 0.2, + "learning_rate": 1.8525375189005345e-05, + "loss": 0.4974, + "step": 1154 + }, + { + "epoch": 0.2, + "learning_rate": 1.852244064950775e-05, + "loss": 0.4873, + "step": 1155 + }, + { + "epoch": 0.2, + "learning_rate": 1.8519503425914857e-05, + "loss": 0.5025, + "step": 1156 + }, + { + "epoch": 0.2, + "learning_rate": 1.851656351915173e-05, + "loss": 0.4822, + "step": 1157 + }, + { + "epoch": 0.2, + "learning_rate": 1.851362093014427e-05, + "loss": 0.4985, + "step": 1158 + }, + { + "epoch": 0.2, + "learning_rate": 1.851067565981924e-05, + "loss": 0.4862, + "step": 1159 + }, + { + "epoch": 0.2, + "learning_rate": 1.850772770910423e-05, + "loss": 0.4824, + "step": 1160 + }, + { + "epoch": 0.2, + "learning_rate": 1.850477707892768e-05, + "loss": 0.471, + "step": 1161 + }, + { + "epoch": 0.2, + "learning_rate": 1.8501823770218873e-05, + "loss": 0.4918, + "step": 1162 + }, + { + "epoch": 0.2, + "learning_rate": 1.8498867783907942e-05, + "loss": 0.499, + "step": 1163 + }, + { + "epoch": 0.2, + "learning_rate": 1.8495909120925857e-05, + "loss": 0.5001, + "step": 1164 + }, + { + "epoch": 0.2, + "learning_rate": 1.849294778220443e-05, + "loss": 0.4884, + "step": 1165 + }, + { + "epoch": 0.2, + "learning_rate": 1.8489983768676322e-05, + "loss": 0.4888, + "step": 1166 + }, + { + "epoch": 0.2, + "learning_rate": 1.8487017081275028e-05, + "loss": 0.4697, + "step": 1167 + }, + { + "epoch": 0.2, + "learning_rate": 1.8484047720934898e-05, + "loss": 0.4897, + "step": 1168 + }, + { + "epoch": 0.2, + "learning_rate": 1.8481075688591104e-05, + "loss": 0.4859, + "step": 1169 + }, + { + "epoch": 0.2, + "learning_rate": 1.8478100985179676e-05, + "loss": 0.4838, + "step": 1170 + }, + { + "epoch": 0.2, + "learning_rate": 1.8475123611637485e-05, + "loss": 0.4881, + "step": 1171 + }, + { + "epoch": 0.2, + "learning_rate": 1.8472143568902235e-05, + "loss": 0.4959, + "step": 1172 + }, + { + "epoch": 0.2, + "learning_rate": 1.846916085791247e-05, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.2, + "learning_rate": 1.8466175479607583e-05, + "loss": 0.4869, + "step": 1174 + }, + { + "epoch": 0.2, + "learning_rate": 1.84631874349278e-05, + "loss": 0.4968, + "step": 1175 + }, + { + "epoch": 0.2, + "learning_rate": 1.8460196724814193e-05, + "loss": 0.4881, + "step": 1176 + }, + { + "epoch": 0.2, + "learning_rate": 1.8457203350208664e-05, + "loss": 0.4959, + "step": 1177 + }, + { + "epoch": 0.2, + "learning_rate": 1.845420731205396e-05, + "loss": 0.4905, + "step": 1178 + }, + { + "epoch": 0.2, + "learning_rate": 1.8451208611293672e-05, + "loss": 0.4918, + "step": 1179 + }, + { + "epoch": 0.2, + "learning_rate": 1.844820724887222e-05, + "loss": 0.4899, + "step": 1180 + }, + { + "epoch": 0.2, + "learning_rate": 1.8445203225734866e-05, + "loss": 0.4855, + "step": 1181 + }, + { + "epoch": 0.2, + "learning_rate": 1.8442196542827712e-05, + "loss": 0.4858, + "step": 1182 + }, + { + "epoch": 0.2, + "learning_rate": 1.8439187201097696e-05, + "loss": 0.4913, + "step": 1183 + }, + { + "epoch": 0.21, + "learning_rate": 1.8436175201492594e-05, + "loss": 0.4818, + "step": 1184 + }, + { + "epoch": 0.21, + "learning_rate": 1.8433160544961017e-05, + "loss": 0.4775, + "step": 1185 + }, + { + "epoch": 0.21, + "learning_rate": 1.8430143232452413e-05, + "loss": 0.4777, + "step": 1186 + }, + { + "epoch": 0.21, + "learning_rate": 1.8427123264917074e-05, + "loss": 0.4813, + "step": 1187 + }, + { + "epoch": 0.21, + "learning_rate": 1.8424100643306113e-05, + "loss": 0.4798, + "step": 1188 + }, + { + "epoch": 0.21, + "learning_rate": 1.8421075368571493e-05, + "loss": 0.4814, + "step": 1189 + }, + { + "epoch": 0.21, + "learning_rate": 1.8418047441666012e-05, + "loss": 0.4834, + "step": 1190 + }, + { + "epoch": 0.21, + "learning_rate": 1.8415016863543286e-05, + "loss": 0.4783, + "step": 1191 + }, + { + "epoch": 0.21, + "learning_rate": 1.8411983635157792e-05, + "loss": 0.4856, + "step": 1192 + }, + { + "epoch": 0.21, + "learning_rate": 1.8408947757464825e-05, + "loss": 0.4851, + "step": 1193 + }, + { + "epoch": 0.21, + "learning_rate": 1.840590923142051e-05, + "loss": 0.484, + "step": 1194 + }, + { + "epoch": 0.21, + "learning_rate": 1.8402868057981823e-05, + "loss": 0.4843, + "step": 1195 + }, + { + "epoch": 0.21, + "learning_rate": 1.839982423810656e-05, + "loss": 0.4877, + "step": 1196 + }, + { + "epoch": 0.21, + "learning_rate": 1.8396777772753355e-05, + "loss": 0.488, + "step": 1197 + }, + { + "epoch": 0.21, + "learning_rate": 1.839372866288168e-05, + "loss": 0.4934, + "step": 1198 + }, + { + "epoch": 0.21, + "learning_rate": 1.839067690945183e-05, + "loss": 0.4813, + "step": 1199 + }, + { + "epoch": 0.21, + "learning_rate": 1.8387622513424942e-05, + "loss": 0.4897, + "step": 1200 + }, + { + "epoch": 0.21, + "learning_rate": 1.8384565475762978e-05, + "loss": 0.4739, + "step": 1201 + }, + { + "epoch": 0.21, + "learning_rate": 1.838150579742873e-05, + "loss": 0.5024, + "step": 1202 + }, + { + "epoch": 0.21, + "learning_rate": 1.837844347938584e-05, + "loss": 0.4796, + "step": 1203 + }, + { + "epoch": 0.21, + "learning_rate": 1.8375378522598756e-05, + "loss": 0.4995, + "step": 1204 + }, + { + "epoch": 0.21, + "learning_rate": 1.8372310928032774e-05, + "loss": 0.4788, + "step": 1205 + }, + { + "epoch": 0.21, + "learning_rate": 1.8369240696654017e-05, + "loss": 0.4866, + "step": 1206 + }, + { + "epoch": 0.21, + "learning_rate": 1.8366167829429434e-05, + "loss": 0.4749, + "step": 1207 + }, + { + "epoch": 0.21, + "learning_rate": 1.836309232732681e-05, + "loss": 0.4948, + "step": 1208 + }, + { + "epoch": 0.21, + "learning_rate": 1.836001419131476e-05, + "loss": 0.4762, + "step": 1209 + }, + { + "epoch": 0.21, + "learning_rate": 1.835693342236272e-05, + "loss": 0.5067, + "step": 1210 + }, + { + "epoch": 0.21, + "learning_rate": 1.8353850021440962e-05, + "loss": 0.4815, + "step": 1211 + }, + { + "epoch": 0.21, + "learning_rate": 1.835076398952059e-05, + "loss": 0.4806, + "step": 1212 + }, + { + "epoch": 0.21, + "learning_rate": 1.8347675327573525e-05, + "loss": 0.4874, + "step": 1213 + }, + { + "epoch": 0.21, + "learning_rate": 1.834458403657253e-05, + "loss": 0.4903, + "step": 1214 + }, + { + "epoch": 0.21, + "learning_rate": 1.834149011749119e-05, + "loss": 0.4731, + "step": 1215 + }, + { + "epoch": 0.21, + "learning_rate": 1.8338393571303917e-05, + "loss": 0.4878, + "step": 1216 + }, + { + "epoch": 0.21, + "learning_rate": 1.8335294398985953e-05, + "loss": 0.4938, + "step": 1217 + }, + { + "epoch": 0.21, + "learning_rate": 1.8332192601513358e-05, + "loss": 0.4939, + "step": 1218 + }, + { + "epoch": 0.21, + "learning_rate": 1.8329088179863033e-05, + "loss": 0.4756, + "step": 1219 + }, + { + "epoch": 0.21, + "learning_rate": 1.8325981135012693e-05, + "loss": 0.4886, + "step": 1220 + }, + { + "epoch": 0.21, + "learning_rate": 1.832287146794089e-05, + "loss": 0.4813, + "step": 1221 + }, + { + "epoch": 0.21, + "learning_rate": 1.8319759179626992e-05, + "loss": 0.495, + "step": 1222 + }, + { + "epoch": 0.21, + "learning_rate": 1.8316644271051197e-05, + "loss": 0.4832, + "step": 1223 + }, + { + "epoch": 0.21, + "learning_rate": 1.8313526743194536e-05, + "loss": 0.483, + "step": 1224 + }, + { + "epoch": 0.21, + "learning_rate": 1.8310406597038843e-05, + "loss": 0.4954, + "step": 1225 + }, + { + "epoch": 0.21, + "learning_rate": 1.83072838335668e-05, + "loss": 0.4769, + "step": 1226 + }, + { + "epoch": 0.21, + "learning_rate": 1.8304158453761904e-05, + "loss": 0.4896, + "step": 1227 + }, + { + "epoch": 0.21, + "learning_rate": 1.8301030458608475e-05, + "loss": 0.4876, + "step": 1228 + }, + { + "epoch": 0.21, + "learning_rate": 1.8297899849091654e-05, + "loss": 0.4811, + "step": 1229 + }, + { + "epoch": 0.21, + "learning_rate": 1.8294766626197414e-05, + "loss": 0.4938, + "step": 1230 + }, + { + "epoch": 0.21, + "learning_rate": 1.8291630790912544e-05, + "loss": 0.4829, + "step": 1231 + }, + { + "epoch": 0.21, + "learning_rate": 1.828849234422466e-05, + "loss": 0.4799, + "step": 1232 + }, + { + "epoch": 0.21, + "learning_rate": 1.8285351287122192e-05, + "loss": 0.475, + "step": 1233 + }, + { + "epoch": 0.21, + "learning_rate": 1.8282207620594405e-05, + "loss": 0.4934, + "step": 1234 + }, + { + "epoch": 0.21, + "learning_rate": 1.827906134563138e-05, + "loss": 0.4791, + "step": 1235 + }, + { + "epoch": 0.21, + "learning_rate": 1.827591246322401e-05, + "loss": 0.4891, + "step": 1236 + }, + { + "epoch": 0.21, + "learning_rate": 1.827276097436403e-05, + "loss": 0.4866, + "step": 1237 + }, + { + "epoch": 0.21, + "learning_rate": 1.8269606880043974e-05, + "loss": 0.4862, + "step": 1238 + }, + { + "epoch": 0.21, + "learning_rate": 1.8266450181257213e-05, + "loss": 0.4986, + "step": 1239 + }, + { + "epoch": 0.21, + "learning_rate": 1.826329087899793e-05, + "loss": 0.4878, + "step": 1240 + }, + { + "epoch": 0.21, + "learning_rate": 1.8260128974261123e-05, + "loss": 0.4904, + "step": 1241 + }, + { + "epoch": 0.22, + "learning_rate": 1.8256964468042624e-05, + "loss": 0.4943, + "step": 1242 + }, + { + "epoch": 0.22, + "learning_rate": 1.8253797361339075e-05, + "loss": 0.4712, + "step": 1243 + }, + { + "epoch": 0.22, + "learning_rate": 1.825062765514794e-05, + "loss": 0.4856, + "step": 1244 + }, + { + "epoch": 0.22, + "learning_rate": 1.8247455350467496e-05, + "loss": 0.4751, + "step": 1245 + }, + { + "epoch": 0.22, + "learning_rate": 1.8244280448296852e-05, + "loss": 0.493, + "step": 1246 + }, + { + "epoch": 0.22, + "learning_rate": 1.824110294963591e-05, + "loss": 0.4863, + "step": 1247 + }, + { + "epoch": 0.22, + "learning_rate": 1.8237922855485422e-05, + "loss": 0.4936, + "step": 1248 + }, + { + "epoch": 0.22, + "learning_rate": 1.823474016684693e-05, + "loss": 0.4764, + "step": 1249 + }, + { + "epoch": 0.22, + "learning_rate": 1.8231554884722807e-05, + "loss": 0.4865, + "step": 1250 + }, + { + "epoch": 0.22, + "learning_rate": 1.8228367010116246e-05, + "loss": 0.4816, + "step": 1251 + }, + { + "epoch": 0.22, + "learning_rate": 1.822517654403124e-05, + "loss": 0.4873, + "step": 1252 + }, + { + "epoch": 0.22, + "learning_rate": 1.8221983487472617e-05, + "loss": 0.4851, + "step": 1253 + }, + { + "epoch": 0.22, + "learning_rate": 1.8218787841446003e-05, + "loss": 0.4765, + "step": 1254 + }, + { + "epoch": 0.22, + "learning_rate": 1.8215589606957862e-05, + "loss": 0.4912, + "step": 1255 + }, + { + "epoch": 0.22, + "learning_rate": 1.821238878501545e-05, + "loss": 0.4789, + "step": 1256 + }, + { + "epoch": 0.22, + "learning_rate": 1.820918537662685e-05, + "loss": 0.4815, + "step": 1257 + }, + { + "epoch": 0.22, + "learning_rate": 1.8205979382800963e-05, + "loss": 0.4828, + "step": 1258 + }, + { + "epoch": 0.22, + "learning_rate": 1.820277080454749e-05, + "loss": 0.4934, + "step": 1259 + }, + { + "epoch": 0.22, + "learning_rate": 1.8199559642876962e-05, + "loss": 0.4828, + "step": 1260 + }, + { + "epoch": 0.22, + "learning_rate": 1.8196345898800715e-05, + "loss": 0.4804, + "step": 1261 + }, + { + "epoch": 0.22, + "learning_rate": 1.8193129573330896e-05, + "loss": 0.4816, + "step": 1262 + }, + { + "epoch": 0.22, + "learning_rate": 1.8189910667480476e-05, + "loss": 0.4906, + "step": 1263 + }, + { + "epoch": 0.22, + "learning_rate": 1.8186689182263225e-05, + "loss": 0.4922, + "step": 1264 + }, + { + "epoch": 0.22, + "learning_rate": 1.818346511869373e-05, + "loss": 0.4915, + "step": 1265 + }, + { + "epoch": 0.22, + "learning_rate": 1.8180238477787406e-05, + "loss": 0.4774, + "step": 1266 + }, + { + "epoch": 0.22, + "learning_rate": 1.8177009260560447e-05, + "loss": 0.479, + "step": 1267 + }, + { + "epoch": 0.22, + "learning_rate": 1.817377746802989e-05, + "loss": 0.4945, + "step": 1268 + }, + { + "epoch": 0.22, + "learning_rate": 1.8170543101213565e-05, + "loss": 0.49, + "step": 1269 + }, + { + "epoch": 0.22, + "learning_rate": 1.816730616113012e-05, + "loss": 0.4786, + "step": 1270 + }, + { + "epoch": 0.22, + "learning_rate": 1.816406664879901e-05, + "loss": 0.4775, + "step": 1271 + }, + { + "epoch": 0.22, + "learning_rate": 1.8160824565240495e-05, + "loss": 0.4895, + "step": 1272 + }, + { + "epoch": 0.22, + "learning_rate": 1.8157579911475664e-05, + "loss": 0.4846, + "step": 1273 + }, + { + "epoch": 0.22, + "learning_rate": 1.8154332688526395e-05, + "loss": 0.4966, + "step": 1274 + }, + { + "epoch": 0.22, + "learning_rate": 1.8151082897415386e-05, + "loss": 0.4857, + "step": 1275 + }, + { + "epoch": 0.22, + "learning_rate": 1.8147830539166132e-05, + "loss": 0.4779, + "step": 1276 + }, + { + "epoch": 0.22, + "learning_rate": 1.8144575614802958e-05, + "loss": 0.4868, + "step": 1277 + }, + { + "epoch": 0.22, + "learning_rate": 1.8141318125350974e-05, + "loss": 0.4888, + "step": 1278 + }, + { + "epoch": 0.22, + "learning_rate": 1.8138058071836116e-05, + "loss": 0.4667, + "step": 1279 + }, + { + "epoch": 0.22, + "learning_rate": 1.8134795455285116e-05, + "loss": 0.5037, + "step": 1280 + }, + { + "epoch": 0.22, + "learning_rate": 1.8131530276725514e-05, + "loss": 0.4845, + "step": 1281 + }, + { + "epoch": 0.22, + "learning_rate": 1.8128262537185663e-05, + "loss": 0.4904, + "step": 1282 + }, + { + "epoch": 0.22, + "learning_rate": 1.812499223769472e-05, + "loss": 0.4663, + "step": 1283 + }, + { + "epoch": 0.22, + "learning_rate": 1.8121719379282646e-05, + "loss": 0.4887, + "step": 1284 + }, + { + "epoch": 0.22, + "learning_rate": 1.8118443962980215e-05, + "loss": 0.4854, + "step": 1285 + }, + { + "epoch": 0.22, + "learning_rate": 1.8115165989818992e-05, + "loss": 0.4785, + "step": 1286 + }, + { + "epoch": 0.22, + "learning_rate": 1.8111885460831362e-05, + "loss": 0.4927, + "step": 1287 + }, + { + "epoch": 0.22, + "learning_rate": 1.810860237705051e-05, + "loss": 0.4945, + "step": 1288 + }, + { + "epoch": 0.22, + "learning_rate": 1.8105316739510424e-05, + "loss": 0.4768, + "step": 1289 + }, + { + "epoch": 0.22, + "learning_rate": 1.8102028549245894e-05, + "loss": 0.5007, + "step": 1290 + }, + { + "epoch": 0.22, + "learning_rate": 1.8098737807292517e-05, + "loss": 0.4795, + "step": 1291 + }, + { + "epoch": 0.22, + "learning_rate": 1.8095444514686702e-05, + "loss": 0.4891, + "step": 1292 + }, + { + "epoch": 0.22, + "learning_rate": 1.8092148672465647e-05, + "loss": 0.4903, + "step": 1293 + }, + { + "epoch": 0.22, + "learning_rate": 1.8088850281667358e-05, + "loss": 0.4997, + "step": 1294 + }, + { + "epoch": 0.22, + "learning_rate": 1.808554934333065e-05, + "loss": 0.4868, + "step": 1295 + }, + { + "epoch": 0.22, + "learning_rate": 1.808224585849513e-05, + "loss": 0.5118, + "step": 1296 + }, + { + "epoch": 0.22, + "learning_rate": 1.8078939828201213e-05, + "loss": 0.4826, + "step": 1297 + }, + { + "epoch": 0.22, + "learning_rate": 1.807563125349012e-05, + "loss": 0.4833, + "step": 1298 + }, + { + "epoch": 0.23, + "learning_rate": 1.8072320135403862e-05, + "loss": 0.4942, + "step": 1299 + }, + { + "epoch": 0.23, + "learning_rate": 1.806900647498526e-05, + "loss": 0.4836, + "step": 1300 + }, + { + "epoch": 0.23, + "learning_rate": 1.8065690273277933e-05, + "loss": 0.4759, + "step": 1301 + }, + { + "epoch": 0.23, + "learning_rate": 1.8062371531326298e-05, + "loss": 0.4762, + "step": 1302 + }, + { + "epoch": 0.23, + "learning_rate": 1.8059050250175577e-05, + "loss": 0.4876, + "step": 1303 + }, + { + "epoch": 0.23, + "learning_rate": 1.805572643087179e-05, + "loss": 0.4878, + "step": 1304 + }, + { + "epoch": 0.23, + "learning_rate": 1.8052400074461752e-05, + "loss": 0.4943, + "step": 1305 + }, + { + "epoch": 0.23, + "learning_rate": 1.8049071181993083e-05, + "loss": 0.4735, + "step": 1306 + }, + { + "epoch": 0.23, + "learning_rate": 1.8045739754514197e-05, + "loss": 0.4954, + "step": 1307 + }, + { + "epoch": 0.23, + "learning_rate": 1.804240579307431e-05, + "loss": 0.4753, + "step": 1308 + }, + { + "epoch": 0.23, + "learning_rate": 1.8039069298723438e-05, + "loss": 0.4947, + "step": 1309 + }, + { + "epoch": 0.23, + "learning_rate": 1.8035730272512383e-05, + "loss": 0.4878, + "step": 1310 + }, + { + "epoch": 0.23, + "learning_rate": 1.803238871549276e-05, + "loss": 0.4959, + "step": 1311 + }, + { + "epoch": 0.23, + "learning_rate": 1.802904462871697e-05, + "loss": 0.4842, + "step": 1312 + }, + { + "epoch": 0.23, + "learning_rate": 1.8025698013238217e-05, + "loss": 0.479, + "step": 1313 + }, + { + "epoch": 0.23, + "learning_rate": 1.8022348870110495e-05, + "loss": 0.4788, + "step": 1314 + }, + { + "epoch": 0.23, + "learning_rate": 1.8018997200388605e-05, + "loss": 0.4837, + "step": 1315 + }, + { + "epoch": 0.23, + "learning_rate": 1.801564300512813e-05, + "loss": 0.4876, + "step": 1316 + }, + { + "epoch": 0.23, + "learning_rate": 1.8012286285385456e-05, + "loss": 0.4842, + "step": 1317 + }, + { + "epoch": 0.23, + "learning_rate": 1.800892704221777e-05, + "loss": 0.4833, + "step": 1318 + }, + { + "epoch": 0.23, + "learning_rate": 1.8005565276683038e-05, + "loss": 0.4918, + "step": 1319 + }, + { + "epoch": 0.23, + "learning_rate": 1.8002200989840034e-05, + "loss": 0.4897, + "step": 1320 + }, + { + "epoch": 0.23, + "learning_rate": 1.7998834182748318e-05, + "loss": 0.4864, + "step": 1321 + }, + { + "epoch": 0.23, + "learning_rate": 1.7995464856468253e-05, + "loss": 0.4754, + "step": 1322 + }, + { + "epoch": 0.23, + "learning_rate": 1.7992093012060988e-05, + "loss": 0.4875, + "step": 1323 + }, + { + "epoch": 0.23, + "learning_rate": 1.798871865058846e-05, + "loss": 0.4805, + "step": 1324 + }, + { + "epoch": 0.23, + "learning_rate": 1.7985341773113416e-05, + "loss": 0.4966, + "step": 1325 + }, + { + "epoch": 0.23, + "learning_rate": 1.7981962380699376e-05, + "loss": 0.4825, + "step": 1326 + }, + { + "epoch": 0.23, + "learning_rate": 1.7978580474410665e-05, + "loss": 0.4848, + "step": 1327 + }, + { + "epoch": 0.23, + "learning_rate": 1.7975196055312393e-05, + "loss": 0.4776, + "step": 1328 + }, + { + "epoch": 0.23, + "learning_rate": 1.797180912447047e-05, + "loss": 0.4793, + "step": 1329 + }, + { + "epoch": 0.23, + "learning_rate": 1.7968419682951584e-05, + "loss": 0.4984, + "step": 1330 + }, + { + "epoch": 0.23, + "learning_rate": 1.796502773182322e-05, + "loss": 0.4848, + "step": 1331 + }, + { + "epoch": 0.23, + "learning_rate": 1.7961633272153662e-05, + "loss": 0.4775, + "step": 1332 + }, + { + "epoch": 0.23, + "learning_rate": 1.7958236305011972e-05, + "loss": 0.5025, + "step": 1333 + }, + { + "epoch": 0.23, + "learning_rate": 1.7954836831468007e-05, + "loss": 0.4843, + "step": 1334 + }, + { + "epoch": 0.23, + "learning_rate": 1.7951434852592406e-05, + "loss": 0.4804, + "step": 1335 + }, + { + "epoch": 0.23, + "learning_rate": 1.794803036945661e-05, + "loss": 0.4761, + "step": 1336 + }, + { + "epoch": 0.23, + "learning_rate": 1.794462338313284e-05, + "loss": 0.4893, + "step": 1337 + }, + { + "epoch": 0.23, + "learning_rate": 1.7941213894694108e-05, + "loss": 0.4781, + "step": 1338 + }, + { + "epoch": 0.23, + "learning_rate": 1.7937801905214213e-05, + "loss": 0.4984, + "step": 1339 + }, + { + "epoch": 0.23, + "learning_rate": 1.7934387415767745e-05, + "loss": 0.4858, + "step": 1340 + }, + { + "epoch": 0.23, + "learning_rate": 1.7930970427430074e-05, + "loss": 0.4893, + "step": 1341 + }, + { + "epoch": 0.23, + "learning_rate": 1.7927550941277364e-05, + "loss": 0.4732, + "step": 1342 + }, + { + "epoch": 0.23, + "learning_rate": 1.7924128958386558e-05, + "loss": 0.4996, + "step": 1343 + }, + { + "epoch": 0.23, + "learning_rate": 1.79207044798354e-05, + "loss": 0.4775, + "step": 1344 + }, + { + "epoch": 0.23, + "learning_rate": 1.7917277506702406e-05, + "loss": 0.4816, + "step": 1345 + }, + { + "epoch": 0.23, + "learning_rate": 1.791384804006688e-05, + "loss": 0.4786, + "step": 1346 + }, + { + "epoch": 0.23, + "learning_rate": 1.7910416081008914e-05, + "loss": 0.4925, + "step": 1347 + }, + { + "epoch": 0.23, + "learning_rate": 1.7906981630609383e-05, + "loss": 0.4695, + "step": 1348 + }, + { + "epoch": 0.23, + "learning_rate": 1.7903544689949955e-05, + "loss": 0.4898, + "step": 1349 + }, + { + "epoch": 0.23, + "learning_rate": 1.7900105260113066e-05, + "loss": 0.4902, + "step": 1350 + }, + { + "epoch": 0.23, + "learning_rate": 1.7896663342181954e-05, + "loss": 0.4999, + "step": 1351 + }, + { + "epoch": 0.23, + "learning_rate": 1.7893218937240627e-05, + "loss": 0.4797, + "step": 1352 + }, + { + "epoch": 0.23, + "learning_rate": 1.788977204637388e-05, + "loss": 0.5008, + "step": 1353 + }, + { + "epoch": 0.23, + "learning_rate": 1.78863226706673e-05, + "loss": 0.4776, + "step": 1354 + }, + { + "epoch": 0.23, + "learning_rate": 1.788287081120724e-05, + "loss": 0.4872, + "step": 1355 + }, + { + "epoch": 0.23, + "learning_rate": 1.7879416469080847e-05, + "loss": 0.4703, + "step": 1356 + }, + { + "epoch": 0.24, + "learning_rate": 1.7875959645376043e-05, + "loss": 0.4942, + "step": 1357 + }, + { + "epoch": 0.24, + "learning_rate": 1.7872500341181546e-05, + "loss": 0.4934, + "step": 1358 + }, + { + "epoch": 0.24, + "learning_rate": 1.7869038557586832e-05, + "loss": 0.4881, + "step": 1359 + }, + { + "epoch": 0.24, + "learning_rate": 1.786557429568218e-05, + "loss": 0.4758, + "step": 1360 + }, + { + "epoch": 0.24, + "learning_rate": 1.7862107556558633e-05, + "loss": 0.4827, + "step": 1361 + }, + { + "epoch": 0.24, + "learning_rate": 1.7858638341308026e-05, + "loss": 0.4804, + "step": 1362 + }, + { + "epoch": 0.24, + "learning_rate": 1.785516665102297e-05, + "loss": 0.4954, + "step": 1363 + }, + { + "epoch": 0.24, + "learning_rate": 1.7851692486796847e-05, + "loss": 0.4928, + "step": 1364 + }, + { + "epoch": 0.24, + "learning_rate": 1.7848215849723836e-05, + "loss": 0.4783, + "step": 1365 + }, + { + "epoch": 0.24, + "learning_rate": 1.7844736740898876e-05, + "loss": 0.4845, + "step": 1366 + }, + { + "epoch": 0.24, + "learning_rate": 1.7841255161417698e-05, + "loss": 0.4874, + "step": 1367 + }, + { + "epoch": 0.24, + "learning_rate": 1.7837771112376804e-05, + "loss": 0.4759, + "step": 1368 + }, + { + "epoch": 0.24, + "learning_rate": 1.7834284594873478e-05, + "loss": 0.4842, + "step": 1369 + }, + { + "epoch": 0.24, + "learning_rate": 1.7830795610005775e-05, + "loss": 0.4797, + "step": 1370 + }, + { + "epoch": 0.24, + "learning_rate": 1.7827304158872538e-05, + "loss": 0.5033, + "step": 1371 + }, + { + "epoch": 0.24, + "learning_rate": 1.782381024257337e-05, + "loss": 0.4867, + "step": 1372 + }, + { + "epoch": 0.24, + "learning_rate": 1.782031386220867e-05, + "loss": 0.486, + "step": 1373 + }, + { + "epoch": 0.24, + "learning_rate": 1.78168150188796e-05, + "loss": 0.4841, + "step": 1374 + }, + { + "epoch": 0.24, + "learning_rate": 1.78133137136881e-05, + "loss": 0.4853, + "step": 1375 + }, + { + "epoch": 0.24, + "learning_rate": 1.7809809947736892e-05, + "loss": 0.4927, + "step": 1376 + }, + { + "epoch": 0.24, + "learning_rate": 1.780630372212946e-05, + "loss": 0.4803, + "step": 1377 + }, + { + "epoch": 0.24, + "learning_rate": 1.7802795037970076e-05, + "loss": 0.4791, + "step": 1378 + }, + { + "epoch": 0.24, + "learning_rate": 1.7799283896363778e-05, + "loss": 0.4781, + "step": 1379 + }, + { + "epoch": 0.24, + "learning_rate": 1.779577029841638e-05, + "loss": 0.4866, + "step": 1380 + }, + { + "epoch": 0.24, + "learning_rate": 1.779225424523447e-05, + "loss": 0.4822, + "step": 1381 + }, + { + "epoch": 0.24, + "learning_rate": 1.7788735737925414e-05, + "loss": 0.4758, + "step": 1382 + }, + { + "epoch": 0.24, + "learning_rate": 1.7785214777597342e-05, + "loss": 0.4866, + "step": 1383 + }, + { + "epoch": 0.24, + "learning_rate": 1.778169136535916e-05, + "loss": 0.4759, + "step": 1384 + }, + { + "epoch": 0.24, + "learning_rate": 1.777816550232055e-05, + "loss": 0.4994, + "step": 1385 + }, + { + "epoch": 0.24, + "learning_rate": 1.7774637189591963e-05, + "loss": 0.4736, + "step": 1386 + }, + { + "epoch": 0.24, + "learning_rate": 1.777110642828462e-05, + "loss": 0.4879, + "step": 1387 + }, + { + "epoch": 0.24, + "learning_rate": 1.776757321951051e-05, + "loss": 0.471, + "step": 1388 + }, + { + "epoch": 0.24, + "learning_rate": 1.776403756438241e-05, + "loss": 0.4798, + "step": 1389 + }, + { + "epoch": 0.24, + "learning_rate": 1.776049946401384e-05, + "loss": 0.4738, + "step": 1390 + }, + { + "epoch": 0.24, + "learning_rate": 1.7756958919519118e-05, + "loss": 0.4954, + "step": 1391 + }, + { + "epoch": 0.24, + "learning_rate": 1.7753415932013313e-05, + "loss": 0.4746, + "step": 1392 + }, + { + "epoch": 0.24, + "learning_rate": 1.7749870502612267e-05, + "loss": 0.4881, + "step": 1393 + }, + { + "epoch": 0.24, + "learning_rate": 1.7746322632432593e-05, + "loss": 0.48, + "step": 1394 + }, + { + "epoch": 0.24, + "learning_rate": 1.774277232259168e-05, + "loss": 0.4918, + "step": 1395 + }, + { + "epoch": 0.24, + "learning_rate": 1.7739219574207673e-05, + "loss": 0.4842, + "step": 1396 + }, + { + "epoch": 0.24, + "learning_rate": 1.7735664388399492e-05, + "loss": 0.4949, + "step": 1397 + }, + { + "epoch": 0.24, + "learning_rate": 1.773210676628682e-05, + "loss": 0.488, + "step": 1398 + }, + { + "epoch": 0.24, + "learning_rate": 1.772854670899011e-05, + "loss": 0.4911, + "step": 1399 + }, + { + "epoch": 0.24, + "learning_rate": 1.7724984217630594e-05, + "loss": 0.4925, + "step": 1400 + }, + { + "epoch": 0.24, + "learning_rate": 1.7721419293330245e-05, + "loss": 0.4714, + "step": 1401 + }, + { + "epoch": 0.24, + "learning_rate": 1.771785193721182e-05, + "loss": 0.4858, + "step": 1402 + }, + { + "epoch": 0.24, + "learning_rate": 1.771428215039884e-05, + "loss": 0.4903, + "step": 1403 + }, + { + "epoch": 0.24, + "learning_rate": 1.7710709934015585e-05, + "loss": 0.482, + "step": 1404 + }, + { + "epoch": 0.24, + "learning_rate": 1.7707135289187115e-05, + "loss": 0.4867, + "step": 1405 + }, + { + "epoch": 0.24, + "learning_rate": 1.7703558217039233e-05, + "loss": 0.4799, + "step": 1406 + }, + { + "epoch": 0.24, + "learning_rate": 1.769997871869852e-05, + "loss": 0.4765, + "step": 1407 + }, + { + "epoch": 0.24, + "learning_rate": 1.7696396795292324e-05, + "loss": 0.4905, + "step": 1408 + }, + { + "epoch": 0.24, + "learning_rate": 1.769281244794875e-05, + "loss": 0.4839, + "step": 1409 + }, + { + "epoch": 0.24, + "learning_rate": 1.7689225677796667e-05, + "loss": 0.4807, + "step": 1410 + }, + { + "epoch": 0.24, + "learning_rate": 1.7685636485965713e-05, + "loss": 0.4713, + "step": 1411 + }, + { + "epoch": 0.24, + "learning_rate": 1.7682044873586273e-05, + "loss": 0.4737, + "step": 1412 + }, + { + "epoch": 0.24, + "learning_rate": 1.7678450841789515e-05, + "loss": 0.4845, + "step": 1413 + }, + { + "epoch": 0.24, + "learning_rate": 1.7674854391707357e-05, + "loss": 0.48, + "step": 1414 + }, + { + "epoch": 0.25, + "learning_rate": 1.7671255524472482e-05, + "loss": 0.4823, + "step": 1415 + }, + { + "epoch": 0.25, + "learning_rate": 1.7667654241218332e-05, + "loss": 0.4718, + "step": 1416 + }, + { + "epoch": 0.25, + "learning_rate": 1.766405054307911e-05, + "loss": 0.4727, + "step": 1417 + }, + { + "epoch": 0.25, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4721, + "step": 1418 + }, + { + "epoch": 0.25, + "learning_rate": 1.7656835906686072e-05, + "loss": 0.4903, + "step": 1419 + }, + { + "epoch": 0.25, + "learning_rate": 1.7653224970704465e-05, + "loss": 0.4977, + "step": 1420 + }, + { + "epoch": 0.25, + "learning_rate": 1.7649611624382207e-05, + "loss": 0.4877, + "step": 1421 + }, + { + "epoch": 0.25, + "learning_rate": 1.76459958688573e-05, + "loss": 0.4823, + "step": 1422 + }, + { + "epoch": 0.25, + "learning_rate": 1.7642377705268505e-05, + "loss": 0.4861, + "step": 1423 + }, + { + "epoch": 0.25, + "learning_rate": 1.7638757134755346e-05, + "loss": 0.4747, + "step": 1424 + }, + { + "epoch": 0.25, + "learning_rate": 1.7635134158458095e-05, + "loss": 0.4974, + "step": 1425 + }, + { + "epoch": 0.25, + "learning_rate": 1.7631508777517794e-05, + "loss": 0.4708, + "step": 1426 + }, + { + "epoch": 0.25, + "learning_rate": 1.7627880993076237e-05, + "loss": 0.4903, + "step": 1427 + }, + { + "epoch": 0.25, + "learning_rate": 1.762425080627597e-05, + "loss": 0.461, + "step": 1428 + }, + { + "epoch": 0.25, + "learning_rate": 1.76206182182603e-05, + "loss": 0.487, + "step": 1429 + }, + { + "epoch": 0.25, + "learning_rate": 1.76169832301733e-05, + "loss": 0.4694, + "step": 1430 + }, + { + "epoch": 0.25, + "learning_rate": 1.7613345843159777e-05, + "loss": 0.4955, + "step": 1431 + }, + { + "epoch": 0.25, + "learning_rate": 1.760970605836531e-05, + "loss": 0.4795, + "step": 1432 + }, + { + "epoch": 0.25, + "learning_rate": 1.7606063876936235e-05, + "loss": 0.4889, + "step": 1433 + }, + { + "epoch": 0.25, + "learning_rate": 1.7602419300019627e-05, + "loss": 0.4722, + "step": 1434 + }, + { + "epoch": 0.25, + "learning_rate": 1.7598772328763335e-05, + "loss": 0.4881, + "step": 1435 + }, + { + "epoch": 0.25, + "learning_rate": 1.7595122964315945e-05, + "loss": 0.4842, + "step": 1436 + }, + { + "epoch": 0.25, + "learning_rate": 1.7591471207826804e-05, + "loss": 0.4988, + "step": 1437 + }, + { + "epoch": 0.25, + "learning_rate": 1.758781706044602e-05, + "loss": 0.4792, + "step": 1438 + }, + { + "epoch": 0.25, + "learning_rate": 1.7584160523324437e-05, + "loss": 0.4915, + "step": 1439 + }, + { + "epoch": 0.25, + "learning_rate": 1.7580501597613665e-05, + "loss": 0.4851, + "step": 1440 + }, + { + "epoch": 0.25, + "learning_rate": 1.7576840284466065e-05, + "loss": 0.4947, + "step": 1441 + }, + { + "epoch": 0.25, + "learning_rate": 1.7573176585034744e-05, + "loss": 0.4741, + "step": 1442 + }, + { + "epoch": 0.25, + "learning_rate": 1.7569510500473566e-05, + "loss": 0.4847, + "step": 1443 + }, + { + "epoch": 0.25, + "learning_rate": 1.756584203193714e-05, + "loss": 0.4885, + "step": 1444 + }, + { + "epoch": 0.25, + "learning_rate": 1.7562171180580834e-05, + "loss": 0.4767, + "step": 1445 + }, + { + "epoch": 0.25, + "learning_rate": 1.755849794756076e-05, + "loss": 0.4881, + "step": 1446 + }, + { + "epoch": 0.25, + "learning_rate": 1.7554822334033782e-05, + "loss": 0.4759, + "step": 1447 + }, + { + "epoch": 0.25, + "learning_rate": 1.7551144341157514e-05, + "loss": 0.4845, + "step": 1448 + }, + { + "epoch": 0.25, + "learning_rate": 1.7547463970090324e-05, + "loss": 0.4891, + "step": 1449 + }, + { + "epoch": 0.25, + "learning_rate": 1.7543781221991317e-05, + "loss": 0.4907, + "step": 1450 + }, + { + "epoch": 0.25, + "learning_rate": 1.7540096098020358e-05, + "loss": 0.5027, + "step": 1451 + }, + { + "epoch": 0.25, + "learning_rate": 1.753640859933806e-05, + "loss": 0.482, + "step": 1452 + }, + { + "epoch": 0.25, + "learning_rate": 1.7532718727105772e-05, + "loss": 0.4841, + "step": 1453 + }, + { + "epoch": 0.25, + "learning_rate": 1.7529026482485605e-05, + "loss": 0.4958, + "step": 1454 + }, + { + "epoch": 0.25, + "learning_rate": 1.7525331866640406e-05, + "loss": 0.4702, + "step": 1455 + }, + { + "epoch": 0.25, + "learning_rate": 1.752163488073378e-05, + "loss": 0.479, + "step": 1456 + }, + { + "epoch": 0.25, + "learning_rate": 1.7517935525930068e-05, + "loss": 0.4984, + "step": 1457 + }, + { + "epoch": 0.25, + "learning_rate": 1.751423380339436e-05, + "loss": 0.4763, + "step": 1458 + }, + { + "epoch": 0.25, + "learning_rate": 1.7510529714292497e-05, + "loss": 0.4875, + "step": 1459 + }, + { + "epoch": 0.25, + "learning_rate": 1.750682325979106e-05, + "loss": 0.4765, + "step": 1460 + }, + { + "epoch": 0.25, + "learning_rate": 1.7503114441057374e-05, + "loss": 0.486, + "step": 1461 + }, + { + "epoch": 0.25, + "learning_rate": 1.7499403259259515e-05, + "loss": 0.471, + "step": 1462 + }, + { + "epoch": 0.25, + "learning_rate": 1.749568971556629e-05, + "loss": 0.4966, + "step": 1463 + }, + { + "epoch": 0.25, + "learning_rate": 1.749197381114727e-05, + "loss": 0.4799, + "step": 1464 + }, + { + "epoch": 0.25, + "learning_rate": 1.7488255547172754e-05, + "loss": 0.4838, + "step": 1465 + }, + { + "epoch": 0.25, + "learning_rate": 1.7484534924813785e-05, + "loss": 0.4895, + "step": 1466 + }, + { + "epoch": 0.25, + "learning_rate": 1.748081194524216e-05, + "loss": 0.4809, + "step": 1467 + }, + { + "epoch": 0.25, + "learning_rate": 1.7477086609630403e-05, + "loss": 0.4868, + "step": 1468 + }, + { + "epoch": 0.25, + "learning_rate": 1.7473358919151792e-05, + "loss": 0.4892, + "step": 1469 + }, + { + "epoch": 0.25, + "learning_rate": 1.746962887498034e-05, + "loss": 0.4867, + "step": 1470 + }, + { + "epoch": 0.25, + "learning_rate": 1.746589647829081e-05, + "loss": 0.4791, + "step": 1471 + }, + { + "epoch": 0.26, + "learning_rate": 1.7462161730258688e-05, + "loss": 0.4751, + "step": 1472 + }, + { + "epoch": 0.26, + "learning_rate": 1.745842463206022e-05, + "loss": 0.4752, + "step": 1473 + }, + { + "epoch": 0.26, + "learning_rate": 1.7454685184872388e-05, + "loss": 0.4848, + "step": 1474 + }, + { + "epoch": 0.26, + "learning_rate": 1.74509433898729e-05, + "loss": 0.4824, + "step": 1475 + }, + { + "epoch": 0.26, + "learning_rate": 1.7447199248240222e-05, + "loss": 0.4817, + "step": 1476 + }, + { + "epoch": 0.26, + "learning_rate": 1.7443452761153546e-05, + "loss": 0.4745, + "step": 1477 + }, + { + "epoch": 0.26, + "learning_rate": 1.743970392979281e-05, + "loss": 0.4781, + "step": 1478 + }, + { + "epoch": 0.26, + "learning_rate": 1.743595275533869e-05, + "loss": 0.4853, + "step": 1479 + }, + { + "epoch": 0.26, + "learning_rate": 1.743219923897259e-05, + "loss": 0.4854, + "step": 1480 + }, + { + "epoch": 0.26, + "learning_rate": 1.7428443381876657e-05, + "loss": 0.4923, + "step": 1481 + }, + { + "epoch": 0.26, + "learning_rate": 1.7424685185233788e-05, + "loss": 0.4898, + "step": 1482 + }, + { + "epoch": 0.26, + "learning_rate": 1.7420924650227603e-05, + "loss": 0.4776, + "step": 1483 + }, + { + "epoch": 0.26, + "learning_rate": 1.7417161778042456e-05, + "loss": 0.4953, + "step": 1484 + }, + { + "epoch": 0.26, + "learning_rate": 1.741339656986345e-05, + "loss": 0.4786, + "step": 1485 + }, + { + "epoch": 0.26, + "learning_rate": 1.7409629026876412e-05, + "loss": 0.4889, + "step": 1486 + }, + { + "epoch": 0.26, + "learning_rate": 1.740585915026791e-05, + "loss": 0.4938, + "step": 1487 + }, + { + "epoch": 0.26, + "learning_rate": 1.7402086941225246e-05, + "loss": 0.4819, + "step": 1488 + }, + { + "epoch": 0.26, + "learning_rate": 1.739831240093645e-05, + "loss": 0.4728, + "step": 1489 + }, + { + "epoch": 0.26, + "learning_rate": 1.7394535530590305e-05, + "loss": 0.4836, + "step": 1490 + }, + { + "epoch": 0.26, + "learning_rate": 1.7390756331376307e-05, + "loss": 0.4717, + "step": 1491 + }, + { + "epoch": 0.26, + "learning_rate": 1.7386974804484694e-05, + "loss": 0.4874, + "step": 1492 + }, + { + "epoch": 0.26, + "learning_rate": 1.738319095110644e-05, + "loss": 0.4797, + "step": 1493 + }, + { + "epoch": 0.26, + "learning_rate": 1.7379404772433247e-05, + "loss": 0.4836, + "step": 1494 + }, + { + "epoch": 0.26, + "learning_rate": 1.7375616269657544e-05, + "loss": 0.4834, + "step": 1495 + }, + { + "epoch": 0.26, + "learning_rate": 1.7371825443972513e-05, + "loss": 0.4895, + "step": 1496 + }, + { + "epoch": 0.26, + "learning_rate": 1.736803229657204e-05, + "loss": 0.4899, + "step": 1497 + }, + { + "epoch": 0.26, + "learning_rate": 1.7364236828650768e-05, + "loss": 0.4932, + "step": 1498 + }, + { + "epoch": 0.26, + "learning_rate": 1.736043904140405e-05, + "loss": 0.4728, + "step": 1499 + }, + { + "epoch": 0.26, + "learning_rate": 1.7356638936027975e-05, + "loss": 0.4743, + "step": 1500 + }, + { + "epoch": 0.26, + "learning_rate": 1.7352836513719377e-05, + "loss": 0.4668, + "step": 1501 + }, + { + "epoch": 0.26, + "learning_rate": 1.7349031775675796e-05, + "loss": 0.4973, + "step": 1502 + }, + { + "epoch": 0.26, + "learning_rate": 1.734522472309552e-05, + "loss": 0.4741, + "step": 1503 + }, + { + "epoch": 0.26, + "learning_rate": 1.734141535717756e-05, + "loss": 0.4871, + "step": 1504 + }, + { + "epoch": 0.26, + "learning_rate": 1.7337603679121645e-05, + "loss": 0.4767, + "step": 1505 + }, + { + "epoch": 0.26, + "learning_rate": 1.7333789690128252e-05, + "loss": 0.4918, + "step": 1506 + }, + { + "epoch": 0.26, + "learning_rate": 1.7329973391398575e-05, + "loss": 0.4799, + "step": 1507 + }, + { + "epoch": 0.26, + "learning_rate": 1.732615478413453e-05, + "loss": 0.5018, + "step": 1508 + }, + { + "epoch": 0.26, + "learning_rate": 1.732233386953877e-05, + "loss": 0.465, + "step": 1509 + }, + { + "epoch": 0.26, + "learning_rate": 1.731851064881467e-05, + "loss": 0.4904, + "step": 1510 + }, + { + "epoch": 0.26, + "learning_rate": 1.7314685123166333e-05, + "loss": 0.4751, + "step": 1511 + }, + { + "epoch": 0.26, + "learning_rate": 1.7310857293798585e-05, + "loss": 0.4875, + "step": 1512 + }, + { + "epoch": 0.26, + "learning_rate": 1.730702716191698e-05, + "loss": 0.4971, + "step": 1513 + }, + { + "epoch": 0.26, + "learning_rate": 1.73031947287278e-05, + "loss": 0.4836, + "step": 1514 + }, + { + "epoch": 0.26, + "learning_rate": 1.7299359995438046e-05, + "loss": 0.4697, + "step": 1515 + }, + { + "epoch": 0.26, + "learning_rate": 1.7295522963255443e-05, + "loss": 0.4878, + "step": 1516 + }, + { + "epoch": 0.26, + "learning_rate": 1.729168363338845e-05, + "loss": 0.4813, + "step": 1517 + }, + { + "epoch": 0.26, + "learning_rate": 1.7287842007046232e-05, + "loss": 0.4766, + "step": 1518 + }, + { + "epoch": 0.26, + "learning_rate": 1.7283998085438703e-05, + "loss": 0.4687, + "step": 1519 + }, + { + "epoch": 0.26, + "learning_rate": 1.728015186977647e-05, + "loss": 0.4789, + "step": 1520 + }, + { + "epoch": 0.26, + "learning_rate": 1.7276303361270886e-05, + "loss": 0.472, + "step": 1521 + }, + { + "epoch": 0.26, + "learning_rate": 1.7272452561134015e-05, + "loss": 0.4761, + "step": 1522 + }, + { + "epoch": 0.26, + "learning_rate": 1.7268599470578644e-05, + "loss": 0.4844, + "step": 1523 + }, + { + "epoch": 0.26, + "learning_rate": 1.7264744090818284e-05, + "loss": 0.4951, + "step": 1524 + }, + { + "epoch": 0.26, + "learning_rate": 1.726088642306716e-05, + "loss": 0.474, + "step": 1525 + }, + { + "epoch": 0.26, + "learning_rate": 1.7257026468540238e-05, + "loss": 0.4939, + "step": 1526 + }, + { + "epoch": 0.26, + "learning_rate": 1.725316422845317e-05, + "loss": 0.4736, + "step": 1527 + }, + { + "epoch": 0.26, + "learning_rate": 1.724929970402236e-05, + "loss": 0.4868, + "step": 1528 + }, + { + "epoch": 0.26, + "learning_rate": 1.7245432896464913e-05, + "loss": 0.4686, + "step": 1529 + }, + { + "epoch": 0.27, + "learning_rate": 1.724156380699866e-05, + "loss": 0.4903, + "step": 1530 + }, + { + "epoch": 0.27, + "learning_rate": 1.723769243684215e-05, + "loss": 0.4675, + "step": 1531 + }, + { + "epoch": 0.27, + "learning_rate": 1.723381878721465e-05, + "loss": 0.4902, + "step": 1532 + }, + { + "epoch": 0.27, + "learning_rate": 1.7229942859336142e-05, + "loss": 0.4852, + "step": 1533 + }, + { + "epoch": 0.27, + "learning_rate": 1.7226064654427327e-05, + "loss": 0.4787, + "step": 1534 + }, + { + "epoch": 0.27, + "learning_rate": 1.7222184173709627e-05, + "loss": 0.4813, + "step": 1535 + }, + { + "epoch": 0.27, + "learning_rate": 1.721830141840518e-05, + "loss": 0.4895, + "step": 1536 + }, + { + "epoch": 0.27, + "learning_rate": 1.721441638973683e-05, + "loss": 0.4857, + "step": 1537 + }, + { + "epoch": 0.27, + "learning_rate": 1.7210529088928156e-05, + "loss": 0.4911, + "step": 1538 + }, + { + "epoch": 0.27, + "learning_rate": 1.7206639517203433e-05, + "loss": 0.4721, + "step": 1539 + }, + { + "epoch": 0.27, + "learning_rate": 1.7202747675787662e-05, + "loss": 0.481, + "step": 1540 + }, + { + "epoch": 0.27, + "learning_rate": 1.7198853565906558e-05, + "loss": 0.4851, + "step": 1541 + }, + { + "epoch": 0.27, + "learning_rate": 1.719495718878655e-05, + "loss": 0.4899, + "step": 1542 + }, + { + "epoch": 0.27, + "learning_rate": 1.7191058545654783e-05, + "loss": 0.4861, + "step": 1543 + }, + { + "epoch": 0.27, + "learning_rate": 1.7187157637739108e-05, + "loss": 0.4907, + "step": 1544 + }, + { + "epoch": 0.27, + "learning_rate": 1.7183254466268093e-05, + "loss": 0.4905, + "step": 1545 + }, + { + "epoch": 0.27, + "learning_rate": 1.7179349032471026e-05, + "loss": 0.492, + "step": 1546 + }, + { + "epoch": 0.27, + "learning_rate": 1.7175441337577897e-05, + "loss": 0.4805, + "step": 1547 + }, + { + "epoch": 0.27, + "learning_rate": 1.717153138281941e-05, + "loss": 0.4872, + "step": 1548 + }, + { + "epoch": 0.27, + "learning_rate": 1.7167619169426996e-05, + "loss": 0.4735, + "step": 1549 + }, + { + "epoch": 0.27, + "learning_rate": 1.7163704698632772e-05, + "loss": 0.4869, + "step": 1550 + }, + { + "epoch": 0.27, + "learning_rate": 1.7159787971669586e-05, + "loss": 0.4725, + "step": 1551 + }, + { + "epoch": 0.27, + "learning_rate": 1.7155868989770984e-05, + "loss": 0.4903, + "step": 1552 + }, + { + "epoch": 0.27, + "learning_rate": 1.715194775417123e-05, + "loss": 0.4845, + "step": 1553 + }, + { + "epoch": 0.27, + "learning_rate": 1.71480242661053e-05, + "loss": 0.4637, + "step": 1554 + }, + { + "epoch": 0.27, + "learning_rate": 1.7144098526808867e-05, + "loss": 0.4855, + "step": 1555 + }, + { + "epoch": 0.27, + "learning_rate": 1.7140170537518327e-05, + "loss": 0.474, + "step": 1556 + }, + { + "epoch": 0.27, + "learning_rate": 1.7136240299470772e-05, + "loss": 0.4753, + "step": 1557 + }, + { + "epoch": 0.27, + "learning_rate": 1.7132307813904016e-05, + "loss": 0.4766, + "step": 1558 + }, + { + "epoch": 0.27, + "learning_rate": 1.7128373082056567e-05, + "loss": 0.4801, + "step": 1559 + }, + { + "epoch": 0.27, + "learning_rate": 1.712443610516765e-05, + "loss": 0.472, + "step": 1560 + }, + { + "epoch": 0.27, + "learning_rate": 1.7120496884477196e-05, + "loss": 0.4802, + "step": 1561 + }, + { + "epoch": 0.27, + "learning_rate": 1.7116555421225837e-05, + "loss": 0.4882, + "step": 1562 + }, + { + "epoch": 0.27, + "learning_rate": 1.7112611716654918e-05, + "loss": 0.4937, + "step": 1563 + }, + { + "epoch": 0.27, + "learning_rate": 1.710866577200648e-05, + "loss": 0.486, + "step": 1564 + }, + { + "epoch": 0.27, + "learning_rate": 1.7104717588523285e-05, + "loss": 0.4797, + "step": 1565 + }, + { + "epoch": 0.27, + "learning_rate": 1.710076716744879e-05, + "loss": 0.4787, + "step": 1566 + }, + { + "epoch": 0.27, + "learning_rate": 1.709681451002715e-05, + "loss": 0.4634, + "step": 1567 + }, + { + "epoch": 0.27, + "learning_rate": 1.7092859617503242e-05, + "loss": 0.495, + "step": 1568 + }, + { + "epoch": 0.27, + "learning_rate": 1.7088902491122636e-05, + "loss": 0.4658, + "step": 1569 + }, + { + "epoch": 0.27, + "learning_rate": 1.7084943132131604e-05, + "loss": 0.4963, + "step": 1570 + }, + { + "epoch": 0.27, + "learning_rate": 1.7080981541777123e-05, + "loss": 0.4947, + "step": 1571 + }, + { + "epoch": 0.27, + "learning_rate": 1.7077017721306877e-05, + "loss": 0.488, + "step": 1572 + }, + { + "epoch": 0.27, + "learning_rate": 1.707305167196925e-05, + "loss": 0.487, + "step": 1573 + }, + { + "epoch": 0.27, + "learning_rate": 1.7069083395013323e-05, + "loss": 0.4934, + "step": 1574 + }, + { + "epoch": 0.27, + "learning_rate": 1.7065112891688883e-05, + "loss": 0.4877, + "step": 1575 + }, + { + "epoch": 0.27, + "learning_rate": 1.706114016324642e-05, + "loss": 0.4946, + "step": 1576 + }, + { + "epoch": 0.27, + "learning_rate": 1.7057165210937124e-05, + "loss": 0.4809, + "step": 1577 + }, + { + "epoch": 0.27, + "learning_rate": 1.7053188036012885e-05, + "loss": 0.4978, + "step": 1578 + }, + { + "epoch": 0.27, + "learning_rate": 1.704920863972629e-05, + "loss": 0.4744, + "step": 1579 + }, + { + "epoch": 0.27, + "learning_rate": 1.704522702333063e-05, + "loss": 0.4862, + "step": 1580 + }, + { + "epoch": 0.27, + "learning_rate": 1.7041243188079884e-05, + "loss": 0.4763, + "step": 1581 + }, + { + "epoch": 0.27, + "learning_rate": 1.7037257135228745e-05, + "loss": 0.4792, + "step": 1582 + }, + { + "epoch": 0.27, + "learning_rate": 1.7033268866032605e-05, + "loss": 0.4759, + "step": 1583 + }, + { + "epoch": 0.27, + "learning_rate": 1.7029278381747537e-05, + "loss": 0.4778, + "step": 1584 + }, + { + "epoch": 0.27, + "learning_rate": 1.7025285683630324e-05, + "loss": 0.4713, + "step": 1585 + }, + { + "epoch": 0.27, + "learning_rate": 1.7021290772938447e-05, + "loss": 0.4769, + "step": 1586 + }, + { + "epoch": 0.27, + "learning_rate": 1.7017293650930083e-05, + "loss": 0.4747, + "step": 1587 + }, + { + "epoch": 0.28, + "learning_rate": 1.7013294318864095e-05, + "loss": 0.4903, + "step": 1588 + }, + { + "epoch": 0.28, + "learning_rate": 1.7009292778000058e-05, + "loss": 0.4922, + "step": 1589 + }, + { + "epoch": 0.28, + "learning_rate": 1.7005289029598233e-05, + "loss": 0.4684, + "step": 1590 + }, + { + "epoch": 0.28, + "learning_rate": 1.7001283074919576e-05, + "loss": 0.4811, + "step": 1591 + }, + { + "epoch": 0.28, + "learning_rate": 1.699727491522574e-05, + "loss": 0.4888, + "step": 1592 + }, + { + "epoch": 0.28, + "learning_rate": 1.699326455177908e-05, + "loss": 0.4798, + "step": 1593 + }, + { + "epoch": 0.28, + "learning_rate": 1.698925198584263e-05, + "loss": 0.485, + "step": 1594 + }, + { + "epoch": 0.28, + "learning_rate": 1.6985237218680125e-05, + "loss": 0.4776, + "step": 1595 + }, + { + "epoch": 0.28, + "learning_rate": 1.6981220251555996e-05, + "loss": 0.4769, + "step": 1596 + }, + { + "epoch": 0.28, + "learning_rate": 1.6977201085735367e-05, + "loss": 0.477, + "step": 1597 + }, + { + "epoch": 0.28, + "learning_rate": 1.6973179722484048e-05, + "loss": 0.4779, + "step": 1598 + }, + { + "epoch": 0.28, + "learning_rate": 1.6969156163068547e-05, + "loss": 0.4849, + "step": 1599 + }, + { + "epoch": 0.28, + "learning_rate": 1.696513040875606e-05, + "loss": 0.4938, + "step": 1600 + }, + { + "epoch": 0.28, + "learning_rate": 1.696110246081448e-05, + "loss": 0.486, + "step": 1601 + }, + { + "epoch": 0.28, + "learning_rate": 1.695707232051238e-05, + "loss": 0.479, + "step": 1602 + }, + { + "epoch": 0.28, + "learning_rate": 1.6953039989119036e-05, + "loss": 0.488, + "step": 1603 + }, + { + "epoch": 0.28, + "learning_rate": 1.6949005467904405e-05, + "loss": 0.4918, + "step": 1604 + }, + { + "epoch": 0.28, + "learning_rate": 1.6944968758139144e-05, + "loss": 0.4734, + "step": 1605 + }, + { + "epoch": 0.28, + "learning_rate": 1.694092986109458e-05, + "loss": 0.4876, + "step": 1606 + }, + { + "epoch": 0.28, + "learning_rate": 1.693688877804275e-05, + "loss": 0.4866, + "step": 1607 + }, + { + "epoch": 0.28, + "learning_rate": 1.693284551025637e-05, + "loss": 0.4785, + "step": 1608 + }, + { + "epoch": 0.28, + "learning_rate": 1.6928800059008845e-05, + "loss": 0.4798, + "step": 1609 + }, + { + "epoch": 0.28, + "learning_rate": 1.6924752425574262e-05, + "loss": 0.4768, + "step": 1610 + }, + { + "epoch": 0.28, + "learning_rate": 1.6920702611227405e-05, + "loss": 0.4744, + "step": 1611 + }, + { + "epoch": 0.28, + "learning_rate": 1.691665061724374e-05, + "loss": 0.4773, + "step": 1612 + }, + { + "epoch": 0.28, + "learning_rate": 1.691259644489942e-05, + "loss": 0.4709, + "step": 1613 + }, + { + "epoch": 0.28, + "learning_rate": 1.6908540095471288e-05, + "loss": 0.4761, + "step": 1614 + }, + { + "epoch": 0.28, + "learning_rate": 1.690448157023686e-05, + "loss": 0.493, + "step": 1615 + }, + { + "epoch": 0.28, + "learning_rate": 1.6900420870474347e-05, + "loss": 0.4955, + "step": 1616 + }, + { + "epoch": 0.28, + "learning_rate": 1.6896357997462653e-05, + "loss": 0.4714, + "step": 1617 + }, + { + "epoch": 0.28, + "learning_rate": 1.6892292952481352e-05, + "loss": 0.4854, + "step": 1618 + }, + { + "epoch": 0.28, + "learning_rate": 1.6888225736810705e-05, + "loss": 0.472, + "step": 1619 + }, + { + "epoch": 0.28, + "learning_rate": 1.688415635173166e-05, + "loss": 0.4962, + "step": 1620 + }, + { + "epoch": 0.28, + "learning_rate": 1.6880084798525848e-05, + "loss": 0.4751, + "step": 1621 + }, + { + "epoch": 0.28, + "learning_rate": 1.6876011078475586e-05, + "loss": 0.4819, + "step": 1622 + }, + { + "epoch": 0.28, + "learning_rate": 1.6871935192863862e-05, + "loss": 0.4739, + "step": 1623 + }, + { + "epoch": 0.28, + "learning_rate": 1.6867857142974354e-05, + "loss": 0.4955, + "step": 1624 + }, + { + "epoch": 0.28, + "learning_rate": 1.686377693009143e-05, + "loss": 0.4622, + "step": 1625 + }, + { + "epoch": 0.28, + "learning_rate": 1.6859694555500125e-05, + "loss": 0.4855, + "step": 1626 + }, + { + "epoch": 0.28, + "learning_rate": 1.685561002048616e-05, + "loss": 0.4688, + "step": 1627 + }, + { + "epoch": 0.28, + "learning_rate": 1.6851523326335932e-05, + "loss": 0.4835, + "step": 1628 + }, + { + "epoch": 0.28, + "learning_rate": 1.684743447433653e-05, + "loss": 0.4752, + "step": 1629 + }, + { + "epoch": 0.28, + "learning_rate": 1.684334346577571e-05, + "loss": 0.4714, + "step": 1630 + }, + { + "epoch": 0.28, + "learning_rate": 1.6839250301941912e-05, + "loss": 0.494, + "step": 1631 + }, + { + "epoch": 0.28, + "learning_rate": 1.6835154984124266e-05, + "loss": 0.4731, + "step": 1632 + }, + { + "epoch": 0.28, + "learning_rate": 1.6831057513612554e-05, + "loss": 0.4931, + "step": 1633 + }, + { + "epoch": 0.28, + "learning_rate": 1.682695789169726e-05, + "loss": 0.4652, + "step": 1634 + }, + { + "epoch": 0.28, + "learning_rate": 1.682285611966954e-05, + "loss": 0.4901, + "step": 1635 + }, + { + "epoch": 0.28, + "learning_rate": 1.681875219882122e-05, + "loss": 0.4747, + "step": 1636 + }, + { + "epoch": 0.28, + "learning_rate": 1.6814646130444804e-05, + "loss": 0.4875, + "step": 1637 + }, + { + "epoch": 0.28, + "learning_rate": 1.681053791583348e-05, + "loss": 0.4651, + "step": 1638 + }, + { + "epoch": 0.28, + "learning_rate": 1.6806427556281105e-05, + "loss": 0.493, + "step": 1639 + }, + { + "epoch": 0.28, + "learning_rate": 1.6802315053082218e-05, + "loss": 0.4864, + "step": 1640 + }, + { + "epoch": 0.28, + "learning_rate": 1.6798200407532025e-05, + "loss": 0.4772, + "step": 1641 + }, + { + "epoch": 0.28, + "learning_rate": 1.6794083620926412e-05, + "loss": 0.4849, + "step": 1642 + }, + { + "epoch": 0.28, + "learning_rate": 1.6789964694561936e-05, + "loss": 0.4909, + "step": 1643 + }, + { + "epoch": 0.28, + "learning_rate": 1.6785843629735832e-05, + "loss": 0.4645, + "step": 1644 + }, + { + "epoch": 0.28, + "learning_rate": 1.6781720427746008e-05, + "loss": 0.488, + "step": 1645 + }, + { + "epoch": 0.29, + "learning_rate": 1.677759508989104e-05, + "loss": 0.4667, + "step": 1646 + }, + { + "epoch": 0.29, + "learning_rate": 1.6773467617470184e-05, + "loss": 0.4853, + "step": 1647 + }, + { + "epoch": 0.29, + "learning_rate": 1.6769338011783363e-05, + "loss": 0.478, + "step": 1648 + }, + { + "epoch": 0.29, + "learning_rate": 1.676520627413117e-05, + "loss": 0.4832, + "step": 1649 + }, + { + "epoch": 0.29, + "learning_rate": 1.676107240581488e-05, + "loss": 0.4763, + "step": 1650 + }, + { + "epoch": 0.29, + "learning_rate": 1.6756936408136423e-05, + "loss": 0.488, + "step": 1651 + }, + { + "epoch": 0.29, + "learning_rate": 1.6752798282398414e-05, + "loss": 0.472, + "step": 1652 + }, + { + "epoch": 0.29, + "learning_rate": 1.6748658029904132e-05, + "loss": 0.4785, + "step": 1653 + }, + { + "epoch": 0.29, + "learning_rate": 1.6744515651957525e-05, + "loss": 0.4887, + "step": 1654 + }, + { + "epoch": 0.29, + "learning_rate": 1.6740371149863212e-05, + "loss": 0.4847, + "step": 1655 + }, + { + "epoch": 0.29, + "learning_rate": 1.6736224524926487e-05, + "loss": 0.4734, + "step": 1656 + }, + { + "epoch": 0.29, + "learning_rate": 1.6732075778453298e-05, + "loss": 0.491, + "step": 1657 + }, + { + "epoch": 0.29, + "learning_rate": 1.6727924911750274e-05, + "loss": 0.4717, + "step": 1658 + }, + { + "epoch": 0.29, + "learning_rate": 1.6723771926124704e-05, + "loss": 0.4911, + "step": 1659 + }, + { + "epoch": 0.29, + "learning_rate": 1.6719616822884555e-05, + "loss": 0.4733, + "step": 1660 + }, + { + "epoch": 0.29, + "learning_rate": 1.6715459603338445e-05, + "loss": 0.4723, + "step": 1661 + }, + { + "epoch": 0.29, + "learning_rate": 1.6711300268795674e-05, + "loss": 0.4768, + "step": 1662 + }, + { + "epoch": 0.29, + "learning_rate": 1.6707138820566195e-05, + "loss": 0.4821, + "step": 1663 + }, + { + "epoch": 0.29, + "learning_rate": 1.670297525996064e-05, + "loss": 0.4755, + "step": 1664 + }, + { + "epoch": 0.29, + "learning_rate": 1.6698809588290292e-05, + "loss": 0.4733, + "step": 1665 + }, + { + "epoch": 0.29, + "learning_rate": 1.6694641806867112e-05, + "loss": 0.489, + "step": 1666 + }, + { + "epoch": 0.29, + "learning_rate": 1.6690471917003716e-05, + "loss": 0.4801, + "step": 1667 + }, + { + "epoch": 0.29, + "learning_rate": 1.6686299920013388e-05, + "loss": 0.484, + "step": 1668 + }, + { + "epoch": 0.29, + "learning_rate": 1.668212581721008e-05, + "loss": 0.4744, + "step": 1669 + }, + { + "epoch": 0.29, + "learning_rate": 1.6677949609908394e-05, + "loss": 0.4864, + "step": 1670 + }, + { + "epoch": 0.29, + "learning_rate": 1.6673771299423613e-05, + "loss": 0.4845, + "step": 1671 + }, + { + "epoch": 0.29, + "learning_rate": 1.666959088707166e-05, + "loss": 0.4727, + "step": 1672 + }, + { + "epoch": 0.29, + "learning_rate": 1.6665408374169144e-05, + "loss": 0.5003, + "step": 1673 + }, + { + "epoch": 0.29, + "learning_rate": 1.666122376203332e-05, + "loss": 0.4625, + "step": 1674 + }, + { + "epoch": 0.29, + "learning_rate": 1.665703705198211e-05, + "loss": 0.4741, + "step": 1675 + }, + { + "epoch": 0.29, + "learning_rate": 1.6652848245334097e-05, + "loss": 0.473, + "step": 1676 + }, + { + "epoch": 0.29, + "learning_rate": 1.6648657343408517e-05, + "loss": 0.4817, + "step": 1677 + }, + { + "epoch": 0.29, + "learning_rate": 1.6644464347525273e-05, + "loss": 0.4679, + "step": 1678 + }, + { + "epoch": 0.29, + "learning_rate": 1.664026925900492e-05, + "loss": 0.491, + "step": 1679 + }, + { + "epoch": 0.29, + "learning_rate": 1.663607207916869e-05, + "loss": 0.4781, + "step": 1680 + }, + { + "epoch": 0.29, + "learning_rate": 1.6631872809338456e-05, + "loss": 0.4882, + "step": 1681 + }, + { + "epoch": 0.29, + "learning_rate": 1.6627671450836753e-05, + "loss": 0.4775, + "step": 1682 + }, + { + "epoch": 0.29, + "learning_rate": 1.6623468004986774e-05, + "loss": 0.4884, + "step": 1683 + }, + { + "epoch": 0.29, + "learning_rate": 1.661926247311238e-05, + "loss": 0.4808, + "step": 1684 + }, + { + "epoch": 0.29, + "learning_rate": 1.6615054856538067e-05, + "loss": 0.4874, + "step": 1685 + }, + { + "epoch": 0.29, + "learning_rate": 1.661084515658901e-05, + "loss": 0.4707, + "step": 1686 + }, + { + "epoch": 0.29, + "learning_rate": 1.6606633374591022e-05, + "loss": 0.4841, + "step": 1687 + }, + { + "epoch": 0.29, + "learning_rate": 1.660241951187059e-05, + "loss": 0.4763, + "step": 1688 + }, + { + "epoch": 0.29, + "learning_rate": 1.6598203569754843e-05, + "loss": 0.4839, + "step": 1689 + }, + { + "epoch": 0.29, + "learning_rate": 1.6593985549571568e-05, + "loss": 0.4783, + "step": 1690 + }, + { + "epoch": 0.29, + "learning_rate": 1.6589765452649205e-05, + "loss": 0.4928, + "step": 1691 + }, + { + "epoch": 0.29, + "learning_rate": 1.6585543280316853e-05, + "loss": 0.4722, + "step": 1692 + }, + { + "epoch": 0.29, + "learning_rate": 1.658131903390426e-05, + "loss": 0.4848, + "step": 1693 + }, + { + "epoch": 0.29, + "learning_rate": 1.657709271474183e-05, + "loss": 0.4727, + "step": 1694 + }, + { + "epoch": 0.29, + "learning_rate": 1.6572864324160617e-05, + "loss": 0.4902, + "step": 1695 + }, + { + "epoch": 0.29, + "learning_rate": 1.6568633863492332e-05, + "loss": 0.4872, + "step": 1696 + }, + { + "epoch": 0.29, + "learning_rate": 1.6564401334069333e-05, + "loss": 0.4794, + "step": 1697 + }, + { + "epoch": 0.29, + "learning_rate": 1.656016673722463e-05, + "loss": 0.4731, + "step": 1698 + }, + { + "epoch": 0.29, + "learning_rate": 1.655593007429189e-05, + "loss": 0.4825, + "step": 1699 + }, + { + "epoch": 0.29, + "learning_rate": 1.6551691346605426e-05, + "loss": 0.4699, + "step": 1700 + }, + { + "epoch": 0.29, + "learning_rate": 1.65474505555002e-05, + "loss": 0.4846, + "step": 1701 + }, + { + "epoch": 0.29, + "learning_rate": 1.6543207702311822e-05, + "loss": 0.4806, + "step": 1702 + }, + { + "epoch": 0.3, + "learning_rate": 1.6538962788376557e-05, + "loss": 0.4719, + "step": 1703 + }, + { + "epoch": 0.3, + "learning_rate": 1.6534715815031325e-05, + "loss": 0.4718, + "step": 1704 + }, + { + "epoch": 0.3, + "learning_rate": 1.6530466783613674e-05, + "loss": 0.4824, + "step": 1705 + }, + { + "epoch": 0.3, + "learning_rate": 1.652621569546182e-05, + "loss": 0.4716, + "step": 1706 + }, + { + "epoch": 0.3, + "learning_rate": 1.652196255191462e-05, + "loss": 0.4657, + "step": 1707 + }, + { + "epoch": 0.3, + "learning_rate": 1.651770735431158e-05, + "loss": 0.482, + "step": 1708 + }, + { + "epoch": 0.3, + "learning_rate": 1.6513450103992844e-05, + "loss": 0.4808, + "step": 1709 + }, + { + "epoch": 0.3, + "learning_rate": 1.650919080229921e-05, + "loss": 0.4923, + "step": 1710 + }, + { + "epoch": 0.3, + "learning_rate": 1.650492945057213e-05, + "loss": 0.495, + "step": 1711 + }, + { + "epoch": 0.3, + "learning_rate": 1.6500666050153685e-05, + "loss": 0.4701, + "step": 1712 + }, + { + "epoch": 0.3, + "learning_rate": 1.649640060238661e-05, + "loss": 0.4815, + "step": 1713 + }, + { + "epoch": 0.3, + "learning_rate": 1.6492133108614284e-05, + "loss": 0.472, + "step": 1714 + }, + { + "epoch": 0.3, + "learning_rate": 1.6487863570180734e-05, + "loss": 0.4886, + "step": 1715 + }, + { + "epoch": 0.3, + "learning_rate": 1.6483591988430625e-05, + "loss": 0.4786, + "step": 1716 + }, + { + "epoch": 0.3, + "learning_rate": 1.6479318364709266e-05, + "loss": 0.4679, + "step": 1717 + }, + { + "epoch": 0.3, + "learning_rate": 1.647504270036262e-05, + "loss": 0.4775, + "step": 1718 + }, + { + "epoch": 0.3, + "learning_rate": 1.647076499673727e-05, + "loss": 0.4839, + "step": 1719 + }, + { + "epoch": 0.3, + "learning_rate": 1.6466485255180464e-05, + "loss": 0.4751, + "step": 1720 + }, + { + "epoch": 0.3, + "learning_rate": 1.646220347704008e-05, + "loss": 0.4819, + "step": 1721 + }, + { + "epoch": 0.3, + "learning_rate": 1.645791966366464e-05, + "loss": 0.4794, + "step": 1722 + }, + { + "epoch": 0.3, + "learning_rate": 1.6453633816403312e-05, + "loss": 0.481, + "step": 1723 + }, + { + "epoch": 0.3, + "learning_rate": 1.6449345936605894e-05, + "loss": 0.4795, + "step": 1724 + }, + { + "epoch": 0.3, + "learning_rate": 1.644505602562283e-05, + "loss": 0.4851, + "step": 1725 + }, + { + "epoch": 0.3, + "learning_rate": 1.6440764084805208e-05, + "loss": 0.4598, + "step": 1726 + }, + { + "epoch": 0.3, + "learning_rate": 1.6436470115504745e-05, + "loss": 0.4824, + "step": 1727 + }, + { + "epoch": 0.3, + "learning_rate": 1.643217411907381e-05, + "loss": 0.4777, + "step": 1728 + }, + { + "epoch": 0.3, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.4871, + "step": 1729 + }, + { + "epoch": 0.3, + "learning_rate": 1.6423576050233144e-05, + "loss": 0.4854, + "step": 1730 + }, + { + "epoch": 0.3, + "learning_rate": 1.6419273980531333e-05, + "loss": 0.4809, + "step": 1731 + }, + { + "epoch": 0.3, + "learning_rate": 1.6414969889114872e-05, + "loss": 0.4785, + "step": 1732 + }, + { + "epoch": 0.3, + "learning_rate": 1.641066377733931e-05, + "loss": 0.4757, + "step": 1733 + }, + { + "epoch": 0.3, + "learning_rate": 1.6406355646560838e-05, + "loss": 0.4696, + "step": 1734 + }, + { + "epoch": 0.3, + "learning_rate": 1.640204549813627e-05, + "loss": 0.493, + "step": 1735 + }, + { + "epoch": 0.3, + "learning_rate": 1.6397733333423072e-05, + "loss": 0.4724, + "step": 1736 + }, + { + "epoch": 0.3, + "learning_rate": 1.639341915377933e-05, + "loss": 0.4714, + "step": 1737 + }, + { + "epoch": 0.3, + "learning_rate": 1.6389102960563776e-05, + "loss": 0.4763, + "step": 1738 + }, + { + "epoch": 0.3, + "learning_rate": 1.6384784755135767e-05, + "loss": 0.4857, + "step": 1739 + }, + { + "epoch": 0.3, + "learning_rate": 1.63804645388553e-05, + "loss": 0.4857, + "step": 1740 + }, + { + "epoch": 0.3, + "learning_rate": 1.6376142313083e-05, + "loss": 0.4676, + "step": 1741 + }, + { + "epoch": 0.3, + "learning_rate": 1.6371818079180126e-05, + "loss": 0.4663, + "step": 1742 + }, + { + "epoch": 0.3, + "learning_rate": 1.636749183850858e-05, + "loss": 0.4854, + "step": 1743 + }, + { + "epoch": 0.3, + "learning_rate": 1.636316359243088e-05, + "loss": 0.475, + "step": 1744 + }, + { + "epoch": 0.3, + "learning_rate": 1.6358833342310192e-05, + "loss": 0.4826, + "step": 1745 + }, + { + "epoch": 0.3, + "learning_rate": 1.635450108951029e-05, + "loss": 0.4843, + "step": 1746 + }, + { + "epoch": 0.3, + "learning_rate": 1.6350166835395607e-05, + "loss": 0.4876, + "step": 1747 + }, + { + "epoch": 0.3, + "learning_rate": 1.6345830581331187e-05, + "loss": 0.4754, + "step": 1748 + }, + { + "epoch": 0.3, + "learning_rate": 1.6341492328682703e-05, + "loss": 0.4711, + "step": 1749 + }, + { + "epoch": 0.3, + "learning_rate": 1.6337152078816475e-05, + "loss": 0.4811, + "step": 1750 + }, + { + "epoch": 0.3, + "learning_rate": 1.633280983309943e-05, + "loss": 0.4787, + "step": 1751 + }, + { + "epoch": 0.3, + "learning_rate": 1.6328465592899142e-05, + "loss": 0.489, + "step": 1752 + }, + { + "epoch": 0.3, + "learning_rate": 1.63241193595838e-05, + "loss": 0.4747, + "step": 1753 + }, + { + "epoch": 0.3, + "learning_rate": 1.631977113452223e-05, + "loss": 0.4961, + "step": 1754 + }, + { + "epoch": 0.3, + "learning_rate": 1.631542091908387e-05, + "loss": 0.4681, + "step": 1755 + }, + { + "epoch": 0.3, + "learning_rate": 1.6311068714638817e-05, + "loss": 0.4746, + "step": 1756 + }, + { + "epoch": 0.3, + "learning_rate": 1.6306714522557755e-05, + "loss": 0.4848, + "step": 1757 + }, + { + "epoch": 0.3, + "learning_rate": 1.6302358344212025e-05, + "loss": 0.4704, + "step": 1758 + }, + { + "epoch": 0.3, + "learning_rate": 1.6298000180973572e-05, + "loss": 0.5019, + "step": 1759 + }, + { + "epoch": 0.3, + "learning_rate": 1.629364003421498e-05, + "loss": 0.4716, + "step": 1760 + }, + { + "epoch": 0.31, + "learning_rate": 1.628927790530945e-05, + "loss": 0.4922, + "step": 1761 + }, + { + "epoch": 0.31, + "learning_rate": 1.6284913795630813e-05, + "loss": 0.4904, + "step": 1762 + }, + { + "epoch": 0.31, + "learning_rate": 1.6280547706553525e-05, + "loss": 0.4808, + "step": 1763 + }, + { + "epoch": 0.31, + "learning_rate": 1.6276179639452654e-05, + "loss": 0.4619, + "step": 1764 + }, + { + "epoch": 0.31, + "learning_rate": 1.62718095957039e-05, + "loss": 0.4895, + "step": 1765 + }, + { + "epoch": 0.31, + "learning_rate": 1.6267437576683585e-05, + "loss": 0.4717, + "step": 1766 + }, + { + "epoch": 0.31, + "learning_rate": 1.6263063583768652e-05, + "loss": 0.4779, + "step": 1767 + }, + { + "epoch": 0.31, + "learning_rate": 1.625868761833667e-05, + "loss": 0.4694, + "step": 1768 + }, + { + "epoch": 0.31, + "learning_rate": 1.6254309681765814e-05, + "loss": 0.4825, + "step": 1769 + }, + { + "epoch": 0.31, + "learning_rate": 1.6249929775434903e-05, + "loss": 0.4637, + "step": 1770 + }, + { + "epoch": 0.31, + "learning_rate": 1.624554790072336e-05, + "loss": 0.4858, + "step": 1771 + }, + { + "epoch": 0.31, + "learning_rate": 1.6241164059011228e-05, + "loss": 0.4688, + "step": 1772 + }, + { + "epoch": 0.31, + "learning_rate": 1.6236778251679177e-05, + "loss": 0.4832, + "step": 1773 + }, + { + "epoch": 0.31, + "learning_rate": 1.6232390480108493e-05, + "loss": 0.4774, + "step": 1774 + }, + { + "epoch": 0.31, + "learning_rate": 1.6228000745681082e-05, + "loss": 0.4782, + "step": 1775 + }, + { + "epoch": 0.31, + "learning_rate": 1.622360904977946e-05, + "loss": 0.4663, + "step": 1776 + }, + { + "epoch": 0.31, + "learning_rate": 1.6219215393786772e-05, + "loss": 0.4862, + "step": 1777 + }, + { + "epoch": 0.31, + "learning_rate": 1.6214819779086774e-05, + "loss": 0.4758, + "step": 1778 + }, + { + "epoch": 0.31, + "learning_rate": 1.621042220706384e-05, + "loss": 0.4686, + "step": 1779 + }, + { + "epoch": 0.31, + "learning_rate": 1.6206022679102967e-05, + "loss": 0.4765, + "step": 1780 + }, + { + "epoch": 0.31, + "learning_rate": 1.6201621196589755e-05, + "loss": 0.4844, + "step": 1781 + }, + { + "epoch": 0.31, + "learning_rate": 1.6197217760910426e-05, + "loss": 0.4676, + "step": 1782 + }, + { + "epoch": 0.31, + "learning_rate": 1.619281237345182e-05, + "loss": 0.4866, + "step": 1783 + }, + { + "epoch": 0.31, + "learning_rate": 1.618840503560139e-05, + "loss": 0.4763, + "step": 1784 + }, + { + "epoch": 0.31, + "learning_rate": 1.6183995748747204e-05, + "loss": 0.4792, + "step": 1785 + }, + { + "epoch": 0.31, + "learning_rate": 1.6179584514277937e-05, + "loss": 0.4864, + "step": 1786 + }, + { + "epoch": 0.31, + "learning_rate": 1.6175171333582887e-05, + "loss": 0.4832, + "step": 1787 + }, + { + "epoch": 0.31, + "learning_rate": 1.617075620805196e-05, + "loss": 0.4923, + "step": 1788 + }, + { + "epoch": 0.31, + "learning_rate": 1.6166339139075676e-05, + "loss": 0.461, + "step": 1789 + }, + { + "epoch": 0.31, + "learning_rate": 1.616192012804516e-05, + "loss": 0.4977, + "step": 1790 + }, + { + "epoch": 0.31, + "learning_rate": 1.6157499176352164e-05, + "loss": 0.4786, + "step": 1791 + }, + { + "epoch": 0.31, + "learning_rate": 1.6153076285389036e-05, + "loss": 0.47, + "step": 1792 + }, + { + "epoch": 0.31, + "learning_rate": 1.614865145654875e-05, + "loss": 0.4768, + "step": 1793 + }, + { + "epoch": 0.31, + "learning_rate": 1.6144224691224868e-05, + "loss": 0.4867, + "step": 1794 + }, + { + "epoch": 0.31, + "learning_rate": 1.6139795990811583e-05, + "loss": 0.4725, + "step": 1795 + }, + { + "epoch": 0.31, + "learning_rate": 1.613536535670369e-05, + "loss": 0.4866, + "step": 1796 + }, + { + "epoch": 0.31, + "learning_rate": 1.6130932790296586e-05, + "loss": 0.475, + "step": 1797 + }, + { + "epoch": 0.31, + "learning_rate": 1.612649829298629e-05, + "loss": 0.4841, + "step": 1798 + }, + { + "epoch": 0.31, + "learning_rate": 1.612206186616942e-05, + "loss": 0.4907, + "step": 1799 + }, + { + "epoch": 0.31, + "learning_rate": 1.6117623511243204e-05, + "loss": 0.4836, + "step": 1800 + }, + { + "epoch": 0.31, + "learning_rate": 1.611318322960548e-05, + "loss": 0.4662, + "step": 1801 + }, + { + "epoch": 0.31, + "learning_rate": 1.6108741022654685e-05, + "loss": 0.4805, + "step": 1802 + }, + { + "epoch": 0.31, + "learning_rate": 1.6104296891789867e-05, + "loss": 0.4682, + "step": 1803 + }, + { + "epoch": 0.31, + "learning_rate": 1.6099850838410685e-05, + "loss": 0.4861, + "step": 1804 + }, + { + "epoch": 0.31, + "learning_rate": 1.6095402863917398e-05, + "loss": 0.4799, + "step": 1805 + }, + { + "epoch": 0.31, + "learning_rate": 1.6090952969710868e-05, + "loss": 0.4819, + "step": 1806 + }, + { + "epoch": 0.31, + "learning_rate": 1.608650115719257e-05, + "loss": 0.4732, + "step": 1807 + }, + { + "epoch": 0.31, + "learning_rate": 1.6082047427764572e-05, + "loss": 0.4735, + "step": 1808 + }, + { + "epoch": 0.31, + "learning_rate": 1.607759178282955e-05, + "loss": 0.4878, + "step": 1809 + }, + { + "epoch": 0.31, + "learning_rate": 1.607313422379079e-05, + "loss": 0.4873, + "step": 1810 + }, + { + "epoch": 0.31, + "learning_rate": 1.6068674752052168e-05, + "loss": 0.4742, + "step": 1811 + }, + { + "epoch": 0.31, + "learning_rate": 1.606421336901818e-05, + "loss": 0.4871, + "step": 1812 + }, + { + "epoch": 0.31, + "learning_rate": 1.605975007609391e-05, + "loss": 0.4788, + "step": 1813 + }, + { + "epoch": 0.31, + "learning_rate": 1.605528487468504e-05, + "loss": 0.4738, + "step": 1814 + }, + { + "epoch": 0.31, + "learning_rate": 1.605081776619787e-05, + "loss": 0.4719, + "step": 1815 + }, + { + "epoch": 0.31, + "learning_rate": 1.604634875203929e-05, + "loss": 0.4735, + "step": 1816 + }, + { + "epoch": 0.31, + "learning_rate": 1.6041877833616782e-05, + "loss": 0.4733, + "step": 1817 + }, + { + "epoch": 0.31, + "learning_rate": 1.6037405012338448e-05, + "loss": 0.4873, + "step": 1818 + }, + { + "epoch": 0.32, + "learning_rate": 1.6032930289612974e-05, + "loss": 0.4684, + "step": 1819 + }, + { + "epoch": 0.32, + "learning_rate": 1.6028453666849645e-05, + "loss": 0.4884, + "step": 1820 + }, + { + "epoch": 0.32, + "learning_rate": 1.6023975145458352e-05, + "loss": 0.4753, + "step": 1821 + }, + { + "epoch": 0.32, + "learning_rate": 1.6019494726849582e-05, + "loss": 0.4745, + "step": 1822 + }, + { + "epoch": 0.32, + "learning_rate": 1.6015012412434417e-05, + "loss": 0.4688, + "step": 1823 + }, + { + "epoch": 0.32, + "learning_rate": 1.6010528203624537e-05, + "loss": 0.4771, + "step": 1824 + }, + { + "epoch": 0.32, + "learning_rate": 1.6006042101832212e-05, + "loss": 0.4708, + "step": 1825 + }, + { + "epoch": 0.32, + "learning_rate": 1.6001554108470325e-05, + "loss": 0.4742, + "step": 1826 + }, + { + "epoch": 0.32, + "learning_rate": 1.5997064224952345e-05, + "loss": 0.4749, + "step": 1827 + }, + { + "epoch": 0.32, + "learning_rate": 1.5992572452692324e-05, + "loss": 0.4973, + "step": 1828 + }, + { + "epoch": 0.32, + "learning_rate": 1.598807879310493e-05, + "loss": 0.4844, + "step": 1829 + }, + { + "epoch": 0.32, + "learning_rate": 1.5983583247605414e-05, + "loss": 0.4757, + "step": 1830 + }, + { + "epoch": 0.32, + "learning_rate": 1.5979085817609625e-05, + "loss": 0.4706, + "step": 1831 + }, + { + "epoch": 0.32, + "learning_rate": 1.5974586504534e-05, + "loss": 0.4778, + "step": 1832 + }, + { + "epoch": 0.32, + "learning_rate": 1.5970085309795572e-05, + "loss": 0.4701, + "step": 1833 + }, + { + "epoch": 0.32, + "learning_rate": 1.5965582234811972e-05, + "loss": 0.4736, + "step": 1834 + }, + { + "epoch": 0.32, + "learning_rate": 1.5961077281001418e-05, + "loss": 0.4801, + "step": 1835 + }, + { + "epoch": 0.32, + "learning_rate": 1.5956570449782715e-05, + "loss": 0.4773, + "step": 1836 + }, + { + "epoch": 0.32, + "learning_rate": 1.5952061742575268e-05, + "loss": 0.463, + "step": 1837 + }, + { + "epoch": 0.32, + "learning_rate": 1.594755116079907e-05, + "loss": 0.4813, + "step": 1838 + }, + { + "epoch": 0.32, + "learning_rate": 1.5943038705874697e-05, + "loss": 0.4776, + "step": 1839 + }, + { + "epoch": 0.32, + "learning_rate": 1.593852437922333e-05, + "loss": 0.4953, + "step": 1840 + }, + { + "epoch": 0.32, + "learning_rate": 1.593400818226673e-05, + "loss": 0.4605, + "step": 1841 + }, + { + "epoch": 0.32, + "learning_rate": 1.5929490116427247e-05, + "loss": 0.4966, + "step": 1842 + }, + { + "epoch": 0.32, + "learning_rate": 1.592497018312782e-05, + "loss": 0.4885, + "step": 1843 + }, + { + "epoch": 0.32, + "learning_rate": 1.5920448383791972e-05, + "loss": 0.4816, + "step": 1844 + }, + { + "epoch": 0.32, + "learning_rate": 1.591592471984383e-05, + "loss": 0.491, + "step": 1845 + }, + { + "epoch": 0.32, + "learning_rate": 1.5911399192708085e-05, + "loss": 0.4932, + "step": 1846 + }, + { + "epoch": 0.32, + "learning_rate": 1.590687180381003e-05, + "loss": 0.4819, + "step": 1847 + }, + { + "epoch": 0.32, + "learning_rate": 1.590234255457555e-05, + "loss": 0.4898, + "step": 1848 + }, + { + "epoch": 0.32, + "learning_rate": 1.5897811446431096e-05, + "loss": 0.478, + "step": 1849 + }, + { + "epoch": 0.32, + "learning_rate": 1.5893278480803716e-05, + "loss": 0.4747, + "step": 1850 + }, + { + "epoch": 0.32, + "learning_rate": 1.588874365912105e-05, + "loss": 0.4751, + "step": 1851 + }, + { + "epoch": 0.32, + "learning_rate": 1.588420698281131e-05, + "loss": 0.482, + "step": 1852 + }, + { + "epoch": 0.32, + "learning_rate": 1.587966845330329e-05, + "loss": 0.4772, + "step": 1853 + }, + { + "epoch": 0.32, + "learning_rate": 1.587512807202639e-05, + "loss": 0.4687, + "step": 1854 + }, + { + "epoch": 0.32, + "learning_rate": 1.5870585840410565e-05, + "loss": 0.4734, + "step": 1855 + }, + { + "epoch": 0.32, + "learning_rate": 1.586604175988637e-05, + "loss": 0.4748, + "step": 1856 + }, + { + "epoch": 0.32, + "learning_rate": 1.5861495831884942e-05, + "loss": 0.4822, + "step": 1857 + }, + { + "epoch": 0.32, + "learning_rate": 1.585694805783799e-05, + "loss": 0.4839, + "step": 1858 + }, + { + "epoch": 0.32, + "learning_rate": 1.5852398439177813e-05, + "loss": 0.4657, + "step": 1859 + }, + { + "epoch": 0.32, + "learning_rate": 1.584784697733728e-05, + "loss": 0.4751, + "step": 1860 + }, + { + "epoch": 0.32, + "learning_rate": 1.5843293673749863e-05, + "loss": 0.4783, + "step": 1861 + }, + { + "epoch": 0.32, + "learning_rate": 1.583873852984959e-05, + "loss": 0.4855, + "step": 1862 + }, + { + "epoch": 0.32, + "learning_rate": 1.5834181547071082e-05, + "loss": 0.4744, + "step": 1863 + }, + { + "epoch": 0.32, + "learning_rate": 1.582962272684953e-05, + "loss": 0.4675, + "step": 1864 + }, + { + "epoch": 0.32, + "learning_rate": 1.582506207062072e-05, + "loss": 0.4887, + "step": 1865 + }, + { + "epoch": 0.32, + "learning_rate": 1.582049957982099e-05, + "loss": 0.4932, + "step": 1866 + }, + { + "epoch": 0.32, + "learning_rate": 1.5815935255887286e-05, + "loss": 0.4717, + "step": 1867 + }, + { + "epoch": 0.32, + "learning_rate": 1.5811369100257104e-05, + "loss": 0.4868, + "step": 1868 + }, + { + "epoch": 0.32, + "learning_rate": 1.5806801114368542e-05, + "loss": 0.4721, + "step": 1869 + }, + { + "epoch": 0.32, + "learning_rate": 1.580223129966025e-05, + "loss": 0.4768, + "step": 1870 + }, + { + "epoch": 0.32, + "learning_rate": 1.5797659657571475e-05, + "loss": 0.4676, + "step": 1871 + }, + { + "epoch": 0.32, + "learning_rate": 1.579308618954202e-05, + "loss": 0.4818, + "step": 1872 + }, + { + "epoch": 0.32, + "learning_rate": 1.5788510897012286e-05, + "loss": 0.4774, + "step": 1873 + }, + { + "epoch": 0.32, + "learning_rate": 1.5783933781423222e-05, + "loss": 0.4734, + "step": 1874 + }, + { + "epoch": 0.32, + "learning_rate": 1.5779354844216377e-05, + "loss": 0.4675, + "step": 1875 + }, + { + "epoch": 0.32, + "learning_rate": 1.5774774086833856e-05, + "loss": 0.4822, + "step": 1876 + }, + { + "epoch": 0.33, + "learning_rate": 1.577019151071835e-05, + "loss": 0.4683, + "step": 1877 + }, + { + "epoch": 0.33, + "learning_rate": 1.5765607117313097e-05, + "loss": 0.4903, + "step": 1878 + }, + { + "epoch": 0.33, + "learning_rate": 1.5761020908061947e-05, + "loss": 0.4545, + "step": 1879 + }, + { + "epoch": 0.33, + "learning_rate": 1.5756432884409297e-05, + "loss": 0.4954, + "step": 1880 + }, + { + "epoch": 0.33, + "learning_rate": 1.5751843047800107e-05, + "loss": 0.4833, + "step": 1881 + }, + { + "epoch": 0.33, + "learning_rate": 1.5747251399679937e-05, + "loss": 0.478, + "step": 1882 + }, + { + "epoch": 0.33, + "learning_rate": 1.574265794149489e-05, + "loss": 0.4749, + "step": 1883 + }, + { + "epoch": 0.33, + "learning_rate": 1.5738062674691657e-05, + "loss": 0.4749, + "step": 1884 + }, + { + "epoch": 0.33, + "learning_rate": 1.5733465600717486e-05, + "loss": 0.5003, + "step": 1885 + }, + { + "epoch": 0.33, + "learning_rate": 1.5728866721020203e-05, + "loss": 0.4891, + "step": 1886 + }, + { + "epoch": 0.33, + "learning_rate": 1.5724266037048196e-05, + "loss": 0.4746, + "step": 1887 + }, + { + "epoch": 0.33, + "learning_rate": 1.571966355025043e-05, + "loss": 0.4751, + "step": 1888 + }, + { + "epoch": 0.33, + "learning_rate": 1.571505926207643e-05, + "loss": 0.4775, + "step": 1889 + }, + { + "epoch": 0.33, + "learning_rate": 1.571045317397629e-05, + "loss": 0.4793, + "step": 1890 + }, + { + "epoch": 0.33, + "learning_rate": 1.5705845287400675e-05, + "loss": 0.4811, + "step": 1891 + }, + { + "epoch": 0.33, + "learning_rate": 1.5701235603800813e-05, + "loss": 0.4856, + "step": 1892 + }, + { + "epoch": 0.33, + "learning_rate": 1.5696624124628495e-05, + "loss": 0.4813, + "step": 1893 + }, + { + "epoch": 0.33, + "learning_rate": 1.569201085133608e-05, + "loss": 0.4919, + "step": 1894 + }, + { + "epoch": 0.33, + "learning_rate": 1.56873957853765e-05, + "loss": 0.4731, + "step": 1895 + }, + { + "epoch": 0.33, + "learning_rate": 1.5682778928203232e-05, + "loss": 0.4643, + "step": 1896 + }, + { + "epoch": 0.33, + "learning_rate": 1.5678160281270344e-05, + "loss": 0.4742, + "step": 1897 + }, + { + "epoch": 0.33, + "learning_rate": 1.567353984603244e-05, + "loss": 0.4752, + "step": 1898 + }, + { + "epoch": 0.33, + "learning_rate": 1.566891762394471e-05, + "loss": 0.4819, + "step": 1899 + }, + { + "epoch": 0.33, + "learning_rate": 1.5664293616462894e-05, + "loss": 0.4824, + "step": 1900 + }, + { + "epoch": 0.33, + "learning_rate": 1.56596678250433e-05, + "loss": 0.4747, + "step": 1901 + }, + { + "epoch": 0.33, + "learning_rate": 1.5655040251142787e-05, + "loss": 0.4805, + "step": 1902 + }, + { + "epoch": 0.33, + "learning_rate": 1.5650410896218788e-05, + "loss": 0.4683, + "step": 1903 + }, + { + "epoch": 0.33, + "learning_rate": 1.5645779761729297e-05, + "loss": 0.4902, + "step": 1904 + }, + { + "epoch": 0.33, + "learning_rate": 1.564114684913286e-05, + "loss": 0.4837, + "step": 1905 + }, + { + "epoch": 0.33, + "learning_rate": 1.563651215988859e-05, + "loss": 0.4874, + "step": 1906 + }, + { + "epoch": 0.33, + "learning_rate": 1.5631875695456154e-05, + "loss": 0.4736, + "step": 1907 + }, + { + "epoch": 0.33, + "learning_rate": 1.5627237457295778e-05, + "loss": 0.4728, + "step": 1908 + }, + { + "epoch": 0.33, + "learning_rate": 1.5622597446868254e-05, + "loss": 0.4907, + "step": 1909 + }, + { + "epoch": 0.33, + "learning_rate": 1.5617955665634925e-05, + "loss": 0.4877, + "step": 1910 + }, + { + "epoch": 0.33, + "learning_rate": 1.5613312115057697e-05, + "loss": 0.4736, + "step": 1911 + }, + { + "epoch": 0.33, + "learning_rate": 1.5608666796599026e-05, + "loss": 0.4774, + "step": 1912 + }, + { + "epoch": 0.33, + "learning_rate": 1.5604019711721935e-05, + "loss": 0.4786, + "step": 1913 + }, + { + "epoch": 0.33, + "learning_rate": 1.559937086188999e-05, + "loss": 0.4809, + "step": 1914 + }, + { + "epoch": 0.33, + "learning_rate": 1.5594720248567327e-05, + "loss": 0.4688, + "step": 1915 + }, + { + "epoch": 0.33, + "learning_rate": 1.5590067873218627e-05, + "loss": 0.4923, + "step": 1916 + }, + { + "epoch": 0.33, + "learning_rate": 1.5585413737309133e-05, + "loss": 0.4631, + "step": 1917 + }, + { + "epoch": 0.33, + "learning_rate": 1.558075784230464e-05, + "loss": 0.4852, + "step": 1918 + }, + { + "epoch": 0.33, + "learning_rate": 1.557610018967149e-05, + "loss": 0.4703, + "step": 1919 + }, + { + "epoch": 0.33, + "learning_rate": 1.5571440780876588e-05, + "loss": 0.4803, + "step": 1920 + }, + { + "epoch": 0.33, + "learning_rate": 1.556677961738739e-05, + "loss": 0.4872, + "step": 1921 + }, + { + "epoch": 0.33, + "learning_rate": 1.5562116700671907e-05, + "loss": 0.486, + "step": 1922 + }, + { + "epoch": 0.33, + "learning_rate": 1.555745203219869e-05, + "loss": 0.4732, + "step": 1923 + }, + { + "epoch": 0.33, + "learning_rate": 1.5552785613436853e-05, + "loss": 0.4689, + "step": 1924 + }, + { + "epoch": 0.33, + "learning_rate": 1.5548117445856067e-05, + "loss": 0.4753, + "step": 1925 + }, + { + "epoch": 0.33, + "learning_rate": 1.5543447530926536e-05, + "loss": 0.4887, + "step": 1926 + }, + { + "epoch": 0.33, + "learning_rate": 1.5538775870119026e-05, + "loss": 0.4671, + "step": 1927 + }, + { + "epoch": 0.33, + "learning_rate": 1.553410246490485e-05, + "loss": 0.4737, + "step": 1928 + }, + { + "epoch": 0.33, + "learning_rate": 1.5529427316755876e-05, + "loss": 0.4791, + "step": 1929 + }, + { + "epoch": 0.33, + "learning_rate": 1.552475042714451e-05, + "loss": 0.4842, + "step": 1930 + }, + { + "epoch": 0.33, + "learning_rate": 1.5520071797543717e-05, + "loss": 0.4761, + "step": 1931 + }, + { + "epoch": 0.33, + "learning_rate": 1.5515391429427e-05, + "loss": 0.4845, + "step": 1932 + }, + { + "epoch": 0.33, + "learning_rate": 1.5510709324268422e-05, + "loss": 0.4711, + "step": 1933 + }, + { + "epoch": 0.34, + "learning_rate": 1.5506025483542577e-05, + "loss": 0.472, + "step": 1934 + }, + { + "epoch": 0.34, + "learning_rate": 1.5501339908724624e-05, + "loss": 0.4945, + "step": 1935 + }, + { + "epoch": 0.34, + "learning_rate": 1.5496652601290253e-05, + "loss": 0.4747, + "step": 1936 + }, + { + "epoch": 0.34, + "learning_rate": 1.5491963562715705e-05, + "loss": 0.4855, + "step": 1937 + }, + { + "epoch": 0.34, + "learning_rate": 1.548727279447777e-05, + "loss": 0.4706, + "step": 1938 + }, + { + "epoch": 0.34, + "learning_rate": 1.548258029805378e-05, + "loss": 0.4798, + "step": 1939 + }, + { + "epoch": 0.34, + "learning_rate": 1.5477886074921604e-05, + "loss": 0.4732, + "step": 1940 + }, + { + "epoch": 0.34, + "learning_rate": 1.5473190126559667e-05, + "loss": 0.4786, + "step": 1941 + }, + { + "epoch": 0.34, + "learning_rate": 1.546849245444693e-05, + "loss": 0.4803, + "step": 1942 + }, + { + "epoch": 0.34, + "learning_rate": 1.5463793060062903e-05, + "loss": 0.478, + "step": 1943 + }, + { + "epoch": 0.34, + "learning_rate": 1.5459091944887626e-05, + "loss": 0.489, + "step": 1944 + }, + { + "epoch": 0.34, + "learning_rate": 1.5454389110401694e-05, + "loss": 0.4699, + "step": 1945 + }, + { + "epoch": 0.34, + "learning_rate": 1.5449684558086243e-05, + "loss": 0.4861, + "step": 1946 + }, + { + "epoch": 0.34, + "learning_rate": 1.5444978289422937e-05, + "loss": 0.4789, + "step": 1947 + }, + { + "epoch": 0.34, + "learning_rate": 1.5440270305893995e-05, + "loss": 0.479, + "step": 1948 + }, + { + "epoch": 0.34, + "learning_rate": 1.5435560608982166e-05, + "loss": 0.4813, + "step": 1949 + }, + { + "epoch": 0.34, + "learning_rate": 1.5430849200170747e-05, + "loss": 0.4849, + "step": 1950 + }, + { + "epoch": 0.34, + "learning_rate": 1.5426136080943566e-05, + "loss": 0.4782, + "step": 1951 + }, + { + "epoch": 0.34, + "learning_rate": 1.5421421252784998e-05, + "loss": 0.4741, + "step": 1952 + }, + { + "epoch": 0.34, + "learning_rate": 1.541670471717995e-05, + "loss": 0.4812, + "step": 1953 + }, + { + "epoch": 0.34, + "learning_rate": 1.5411986475613864e-05, + "loss": 0.4782, + "step": 1954 + }, + { + "epoch": 0.34, + "learning_rate": 1.540726652957273e-05, + "loss": 0.4738, + "step": 1955 + }, + { + "epoch": 0.34, + "learning_rate": 1.540254488054307e-05, + "loss": 0.472, + "step": 1956 + }, + { + "epoch": 0.34, + "learning_rate": 1.5397821530011935e-05, + "loss": 0.4885, + "step": 1957 + }, + { + "epoch": 0.34, + "learning_rate": 1.5393096479466922e-05, + "loss": 0.4813, + "step": 1958 + }, + { + "epoch": 0.34, + "learning_rate": 1.538836973039616e-05, + "loss": 0.4777, + "step": 1959 + }, + { + "epoch": 0.34, + "learning_rate": 1.5383641284288308e-05, + "loss": 0.4748, + "step": 1960 + }, + { + "epoch": 0.34, + "learning_rate": 1.537891114263257e-05, + "loss": 0.4864, + "step": 1961 + }, + { + "epoch": 0.34, + "learning_rate": 1.5374179306918674e-05, + "loss": 0.4671, + "step": 1962 + }, + { + "epoch": 0.34, + "learning_rate": 1.5369445778636885e-05, + "loss": 0.4594, + "step": 1963 + }, + { + "epoch": 0.34, + "learning_rate": 1.5364710559278e-05, + "loss": 0.4797, + "step": 1964 + }, + { + "epoch": 0.34, + "learning_rate": 1.5359973650333352e-05, + "loss": 0.4878, + "step": 1965 + }, + { + "epoch": 0.34, + "learning_rate": 1.535523505329481e-05, + "loss": 0.463, + "step": 1966 + }, + { + "epoch": 0.34, + "learning_rate": 1.535049476965476e-05, + "loss": 0.4725, + "step": 1967 + }, + { + "epoch": 0.34, + "learning_rate": 1.5345752800906128e-05, + "loss": 0.4716, + "step": 1968 + }, + { + "epoch": 0.34, + "learning_rate": 1.5341009148542378e-05, + "loss": 0.4837, + "step": 1969 + }, + { + "epoch": 0.34, + "learning_rate": 1.5336263814057493e-05, + "loss": 0.4715, + "step": 1970 + }, + { + "epoch": 0.34, + "learning_rate": 1.5331516798945987e-05, + "loss": 0.4911, + "step": 1971 + }, + { + "epoch": 0.34, + "learning_rate": 1.532676810470291e-05, + "loss": 0.4739, + "step": 1972 + }, + { + "epoch": 0.34, + "learning_rate": 1.5322017732823836e-05, + "loss": 0.4929, + "step": 1973 + }, + { + "epoch": 0.34, + "learning_rate": 1.5317265684804865e-05, + "loss": 0.4621, + "step": 1974 + }, + { + "epoch": 0.34, + "learning_rate": 1.5312511962142634e-05, + "loss": 0.4749, + "step": 1975 + }, + { + "epoch": 0.34, + "learning_rate": 1.5307756566334295e-05, + "loss": 0.467, + "step": 1976 + }, + { + "epoch": 0.34, + "learning_rate": 1.5302999498877537e-05, + "loss": 0.4822, + "step": 1977 + }, + { + "epoch": 0.34, + "learning_rate": 1.5298240761270575e-05, + "loss": 0.4682, + "step": 1978 + }, + { + "epoch": 0.34, + "learning_rate": 1.529348035501214e-05, + "loss": 0.4744, + "step": 1979 + }, + { + "epoch": 0.34, + "learning_rate": 1.52887182816015e-05, + "loss": 0.4743, + "step": 1980 + }, + { + "epoch": 0.34, + "learning_rate": 1.5283954542538442e-05, + "loss": 0.4805, + "step": 1981 + }, + { + "epoch": 0.34, + "learning_rate": 1.5279189139323284e-05, + "loss": 0.4704, + "step": 1982 + }, + { + "epoch": 0.34, + "learning_rate": 1.5274422073456853e-05, + "loss": 0.5015, + "step": 1983 + }, + { + "epoch": 0.34, + "learning_rate": 1.526965334644052e-05, + "loss": 0.4753, + "step": 1984 + }, + { + "epoch": 0.34, + "learning_rate": 1.5264882959776164e-05, + "loss": 0.4768, + "step": 1985 + }, + { + "epoch": 0.34, + "learning_rate": 1.526011091496619e-05, + "loss": 0.4517, + "step": 1986 + }, + { + "epoch": 0.34, + "learning_rate": 1.5255337213513532e-05, + "loss": 0.4776, + "step": 1987 + }, + { + "epoch": 0.34, + "learning_rate": 1.5250561856921638e-05, + "loss": 0.4706, + "step": 1988 + }, + { + "epoch": 0.34, + "learning_rate": 1.5245784846694483e-05, + "loss": 0.4815, + "step": 1989 + }, + { + "epoch": 0.34, + "learning_rate": 1.5241006184336553e-05, + "loss": 0.4739, + "step": 1990 + }, + { + "epoch": 0.34, + "learning_rate": 1.5236225871352867e-05, + "loss": 0.4761, + "step": 1991 + }, + { + "epoch": 0.35, + "learning_rate": 1.5231443909248956e-05, + "loss": 0.4844, + "step": 1992 + }, + { + "epoch": 0.35, + "learning_rate": 1.5226660299530874e-05, + "loss": 0.4789, + "step": 1993 + }, + { + "epoch": 0.35, + "learning_rate": 1.522187504370519e-05, + "loss": 0.4887, + "step": 1994 + }, + { + "epoch": 0.35, + "learning_rate": 1.5217088143278995e-05, + "loss": 0.4777, + "step": 1995 + }, + { + "epoch": 0.35, + "learning_rate": 1.5212299599759894e-05, + "loss": 0.4814, + "step": 1996 + }, + { + "epoch": 0.35, + "learning_rate": 1.5207509414656017e-05, + "loss": 0.4742, + "step": 1997 + }, + { + "epoch": 0.35, + "learning_rate": 1.5202717589476006e-05, + "loss": 0.4831, + "step": 1998 + }, + { + "epoch": 0.35, + "learning_rate": 1.5197924125729015e-05, + "loss": 0.4824, + "step": 1999 + }, + { + "epoch": 0.35, + "learning_rate": 1.519312902492472e-05, + "loss": 0.4619, + "step": 2000 + }, + { + "epoch": 0.35, + "learning_rate": 1.5188332288573313e-05, + "loss": 0.4846, + "step": 2001 + }, + { + "epoch": 0.35, + "learning_rate": 1.51835339181855e-05, + "loss": 0.4707, + "step": 2002 + }, + { + "epoch": 0.35, + "learning_rate": 1.5178733915272501e-05, + "loss": 0.4717, + "step": 2003 + }, + { + "epoch": 0.35, + "learning_rate": 1.5173932281346049e-05, + "loss": 0.473, + "step": 2004 + }, + { + "epoch": 0.35, + "learning_rate": 1.5169129017918389e-05, + "loss": 0.4676, + "step": 2005 + }, + { + "epoch": 0.35, + "learning_rate": 1.5164324126502287e-05, + "loss": 0.4798, + "step": 2006 + }, + { + "epoch": 0.35, + "learning_rate": 1.5159517608611015e-05, + "loss": 0.4926, + "step": 2007 + }, + { + "epoch": 0.35, + "learning_rate": 1.515470946575836e-05, + "loss": 0.4698, + "step": 2008 + }, + { + "epoch": 0.35, + "learning_rate": 1.514989969945862e-05, + "loss": 0.4893, + "step": 2009 + }, + { + "epoch": 0.35, + "learning_rate": 1.5145088311226599e-05, + "loss": 0.4794, + "step": 2010 + }, + { + "epoch": 0.35, + "learning_rate": 1.5140275302577627e-05, + "loss": 0.4828, + "step": 2011 + }, + { + "epoch": 0.35, + "learning_rate": 1.5135460675027525e-05, + "loss": 0.4627, + "step": 2012 + }, + { + "epoch": 0.35, + "learning_rate": 1.5130644430092638e-05, + "loss": 0.4708, + "step": 2013 + }, + { + "epoch": 0.35, + "learning_rate": 1.5125826569289812e-05, + "loss": 0.475, + "step": 2014 + }, + { + "epoch": 0.35, + "learning_rate": 1.512100709413641e-05, + "loss": 0.4832, + "step": 2015 + }, + { + "epoch": 0.35, + "learning_rate": 1.5116186006150294e-05, + "loss": 0.4714, + "step": 2016 + }, + { + "epoch": 0.35, + "learning_rate": 1.5111363306849845e-05, + "loss": 0.4816, + "step": 2017 + }, + { + "epoch": 0.35, + "learning_rate": 1.5106538997753938e-05, + "loss": 0.4778, + "step": 2018 + }, + { + "epoch": 0.35, + "learning_rate": 1.510171308038197e-05, + "loss": 0.467, + "step": 2019 + }, + { + "epoch": 0.35, + "learning_rate": 1.5096885556253833e-05, + "loss": 0.4623, + "step": 2020 + }, + { + "epoch": 0.35, + "learning_rate": 1.5092056426889923e-05, + "loss": 0.4858, + "step": 2021 + }, + { + "epoch": 0.35, + "learning_rate": 1.5087225693811159e-05, + "loss": 0.4515, + "step": 2022 + }, + { + "epoch": 0.35, + "learning_rate": 1.5082393358538946e-05, + "loss": 0.4879, + "step": 2023 + }, + { + "epoch": 0.35, + "learning_rate": 1.5077559422595202e-05, + "loss": 0.4633, + "step": 2024 + }, + { + "epoch": 0.35, + "learning_rate": 1.5072723887502352e-05, + "loss": 0.4859, + "step": 2025 + }, + { + "epoch": 0.35, + "learning_rate": 1.5067886754783316e-05, + "loss": 0.4942, + "step": 2026 + }, + { + "epoch": 0.35, + "learning_rate": 1.5063048025961523e-05, + "loss": 0.4801, + "step": 2027 + }, + { + "epoch": 0.35, + "learning_rate": 1.5058207702560907e-05, + "loss": 0.4776, + "step": 2028 + }, + { + "epoch": 0.35, + "learning_rate": 1.5053365786105898e-05, + "loss": 0.4837, + "step": 2029 + }, + { + "epoch": 0.35, + "learning_rate": 1.5048522278121432e-05, + "loss": 0.4721, + "step": 2030 + }, + { + "epoch": 0.35, + "learning_rate": 1.5043677180132946e-05, + "loss": 0.4912, + "step": 2031 + }, + { + "epoch": 0.35, + "learning_rate": 1.5038830493666371e-05, + "loss": 0.4744, + "step": 2032 + }, + { + "epoch": 0.35, + "learning_rate": 1.5033982220248151e-05, + "loss": 0.481, + "step": 2033 + }, + { + "epoch": 0.35, + "learning_rate": 1.5029132361405219e-05, + "loss": 0.4795, + "step": 2034 + }, + { + "epoch": 0.35, + "learning_rate": 1.502428091866501e-05, + "loss": 0.4778, + "step": 2035 + }, + { + "epoch": 0.35, + "learning_rate": 1.5019427893555462e-05, + "loss": 0.474, + "step": 2036 + }, + { + "epoch": 0.35, + "learning_rate": 1.501457328760501e-05, + "loss": 0.4732, + "step": 2037 + }, + { + "epoch": 0.35, + "learning_rate": 1.5009717102342577e-05, + "loss": 0.4752, + "step": 2038 + }, + { + "epoch": 0.35, + "learning_rate": 1.5004859339297601e-05, + "loss": 0.4855, + "step": 2039 + }, + { + "epoch": 0.35, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4876, + "step": 2040 + }, + { + "epoch": 0.35, + "learning_rate": 1.4995139085980203e-05, + "loss": 0.4735, + "step": 2041 + }, + { + "epoch": 0.35, + "learning_rate": 1.499027659876912e-05, + "loss": 0.4773, + "step": 2042 + }, + { + "epoch": 0.35, + "learning_rate": 1.498541253989817e-05, + "loss": 0.4759, + "step": 2043 + }, + { + "epoch": 0.35, + "learning_rate": 1.4980546910899261e-05, + "loss": 0.4661, + "step": 2044 + }, + { + "epoch": 0.35, + "learning_rate": 1.4975679713304794e-05, + "loss": 0.4859, + "step": 2045 + }, + { + "epoch": 0.35, + "learning_rate": 1.4970810948647664e-05, + "loss": 0.4868, + "step": 2046 + }, + { + "epoch": 0.35, + "learning_rate": 1.4965940618461265e-05, + "loss": 0.4837, + "step": 2047 + }, + { + "epoch": 0.35, + "learning_rate": 1.496106872427948e-05, + "loss": 0.4805, + "step": 2048 + }, + { + "epoch": 0.35, + "learning_rate": 1.4956195267636679e-05, + "loss": 0.4798, + "step": 2049 + }, + { + "epoch": 0.36, + "learning_rate": 1.495132025006774e-05, + "loss": 0.4697, + "step": 2050 + }, + { + "epoch": 0.36, + "learning_rate": 1.4946443673108015e-05, + "loss": 0.4797, + "step": 2051 + }, + { + "epoch": 0.36, + "learning_rate": 1.4941565538293358e-05, + "loss": 0.4784, + "step": 2052 + }, + { + "epoch": 0.36, + "learning_rate": 1.4936685847160113e-05, + "loss": 0.4813, + "step": 2053 + }, + { + "epoch": 0.36, + "learning_rate": 1.4931804601245105e-05, + "loss": 0.4836, + "step": 2054 + }, + { + "epoch": 0.36, + "learning_rate": 1.4926921802085662e-05, + "loss": 0.4865, + "step": 2055 + }, + { + "epoch": 0.36, + "learning_rate": 1.4922037451219586e-05, + "loss": 0.462, + "step": 2056 + }, + { + "epoch": 0.36, + "learning_rate": 1.4917151550185187e-05, + "loss": 0.4853, + "step": 2057 + }, + { + "epoch": 0.36, + "learning_rate": 1.4912264100521243e-05, + "loss": 0.4677, + "step": 2058 + }, + { + "epoch": 0.36, + "learning_rate": 1.4907375103767037e-05, + "loss": 0.4763, + "step": 2059 + }, + { + "epoch": 0.36, + "learning_rate": 1.4902484561462323e-05, + "loss": 0.4692, + "step": 2060 + }, + { + "epoch": 0.36, + "learning_rate": 1.4897592475147356e-05, + "loss": 0.4795, + "step": 2061 + }, + { + "epoch": 0.36, + "learning_rate": 1.489269884636287e-05, + "loss": 0.4737, + "step": 2062 + }, + { + "epoch": 0.36, + "learning_rate": 1.4887803676650083e-05, + "loss": 0.4805, + "step": 2063 + }, + { + "epoch": 0.36, + "learning_rate": 1.4882906967550708e-05, + "loss": 0.4819, + "step": 2064 + }, + { + "epoch": 0.36, + "learning_rate": 1.487800872060693e-05, + "loss": 0.479, + "step": 2065 + }, + { + "epoch": 0.36, + "learning_rate": 1.4873108937361429e-05, + "loss": 0.463, + "step": 2066 + }, + { + "epoch": 0.36, + "learning_rate": 1.4868207619357362e-05, + "loss": 0.4814, + "step": 2067 + }, + { + "epoch": 0.36, + "learning_rate": 1.4863304768138374e-05, + "loss": 0.4654, + "step": 2068 + }, + { + "epoch": 0.36, + "learning_rate": 1.4858400385248585e-05, + "loss": 0.4851, + "step": 2069 + }, + { + "epoch": 0.36, + "learning_rate": 1.4853494472232613e-05, + "loss": 0.479, + "step": 2070 + }, + { + "epoch": 0.36, + "learning_rate": 1.4848587030635537e-05, + "loss": 0.4832, + "step": 2071 + }, + { + "epoch": 0.36, + "learning_rate": 1.484367806200294e-05, + "loss": 0.4865, + "step": 2072 + }, + { + "epoch": 0.36, + "learning_rate": 1.4838767567880865e-05, + "loss": 0.475, + "step": 2073 + }, + { + "epoch": 0.36, + "learning_rate": 1.4833855549815848e-05, + "loss": 0.4734, + "step": 2074 + }, + { + "epoch": 0.36, + "learning_rate": 1.4828942009354902e-05, + "loss": 0.4751, + "step": 2075 + }, + { + "epoch": 0.36, + "learning_rate": 1.482402694804552e-05, + "loss": 0.4698, + "step": 2076 + }, + { + "epoch": 0.36, + "learning_rate": 1.4819110367435672e-05, + "loss": 0.4898, + "step": 2077 + }, + { + "epoch": 0.36, + "learning_rate": 1.4814192269073808e-05, + "loss": 0.4844, + "step": 2078 + }, + { + "epoch": 0.36, + "learning_rate": 1.4809272654508858e-05, + "loss": 0.4787, + "step": 2079 + }, + { + "epoch": 0.36, + "learning_rate": 1.4804351525290221e-05, + "loss": 0.4723, + "step": 2080 + }, + { + "epoch": 0.36, + "learning_rate": 1.4799428882967787e-05, + "loss": 0.489, + "step": 2081 + }, + { + "epoch": 0.36, + "learning_rate": 1.479450472909191e-05, + "loss": 0.473, + "step": 2082 + }, + { + "epoch": 0.36, + "learning_rate": 1.4789579065213425e-05, + "loss": 0.4792, + "step": 2083 + }, + { + "epoch": 0.36, + "learning_rate": 1.4784651892883644e-05, + "loss": 0.488, + "step": 2084 + }, + { + "epoch": 0.36, + "learning_rate": 1.4779723213654354e-05, + "loss": 0.4646, + "step": 2085 + }, + { + "epoch": 0.36, + "learning_rate": 1.477479302907781e-05, + "loss": 0.4817, + "step": 2086 + }, + { + "epoch": 0.36, + "learning_rate": 1.476986134070675e-05, + "loss": 0.4806, + "step": 2087 + }, + { + "epoch": 0.36, + "learning_rate": 1.4764928150094384e-05, + "loss": 0.4804, + "step": 2088 + }, + { + "epoch": 0.36, + "learning_rate": 1.4759993458794388e-05, + "loss": 0.4801, + "step": 2089 + }, + { + "epoch": 0.36, + "learning_rate": 1.475505726836092e-05, + "loss": 0.4725, + "step": 2090 + }, + { + "epoch": 0.36, + "learning_rate": 1.4750119580348601e-05, + "loss": 0.4812, + "step": 2091 + }, + { + "epoch": 0.36, + "learning_rate": 1.4745180396312533e-05, + "loss": 0.4754, + "step": 2092 + }, + { + "epoch": 0.36, + "learning_rate": 1.474023971780828e-05, + "loss": 0.4702, + "step": 2093 + }, + { + "epoch": 0.36, + "learning_rate": 1.4735297546391887e-05, + "loss": 0.4676, + "step": 2094 + }, + { + "epoch": 0.36, + "learning_rate": 1.4730353883619856e-05, + "loss": 0.4733, + "step": 2095 + }, + { + "epoch": 0.36, + "learning_rate": 1.4725408731049173e-05, + "loss": 0.4712, + "step": 2096 + }, + { + "epoch": 0.36, + "learning_rate": 1.4720462090237285e-05, + "loss": 0.4859, + "step": 2097 + }, + { + "epoch": 0.36, + "learning_rate": 1.4715513962742102e-05, + "loss": 0.4719, + "step": 2098 + }, + { + "epoch": 0.36, + "learning_rate": 1.471056435012202e-05, + "loss": 0.4891, + "step": 2099 + }, + { + "epoch": 0.36, + "learning_rate": 1.4705613253935886e-05, + "loss": 0.475, + "step": 2100 + }, + { + "epoch": 0.36, + "learning_rate": 1.4700660675743021e-05, + "loss": 0.4895, + "step": 2101 + }, + { + "epoch": 0.36, + "learning_rate": 1.469570661710321e-05, + "loss": 0.4623, + "step": 2102 + }, + { + "epoch": 0.36, + "learning_rate": 1.469075107957671e-05, + "loss": 0.4755, + "step": 2103 + }, + { + "epoch": 0.36, + "learning_rate": 1.4685794064724235e-05, + "loss": 0.4781, + "step": 2104 + }, + { + "epoch": 0.36, + "learning_rate": 1.4680835574106977e-05, + "loss": 0.4876, + "step": 2105 + }, + { + "epoch": 0.36, + "learning_rate": 1.4675875609286579e-05, + "loss": 0.491, + "step": 2106 + }, + { + "epoch": 0.37, + "learning_rate": 1.4670914171825157e-05, + "loss": 0.488, + "step": 2107 + }, + { + "epoch": 0.37, + "learning_rate": 1.4665951263285283e-05, + "loss": 0.477, + "step": 2108 + }, + { + "epoch": 0.37, + "learning_rate": 1.4660986885230002e-05, + "loss": 0.4932, + "step": 2109 + }, + { + "epoch": 0.37, + "learning_rate": 1.465602103922282e-05, + "loss": 0.472, + "step": 2110 + }, + { + "epoch": 0.37, + "learning_rate": 1.4651053726827695e-05, + "loss": 0.4847, + "step": 2111 + }, + { + "epoch": 0.37, + "learning_rate": 1.4646084949609062e-05, + "loss": 0.4797, + "step": 2112 + }, + { + "epoch": 0.37, + "learning_rate": 1.4641114709131805e-05, + "loss": 0.4709, + "step": 2113 + }, + { + "epoch": 0.37, + "learning_rate": 1.4636143006961279e-05, + "loss": 0.4815, + "step": 2114 + }, + { + "epoch": 0.37, + "learning_rate": 1.4631169844663284e-05, + "loss": 0.4732, + "step": 2115 + }, + { + "epoch": 0.37, + "learning_rate": 1.4626195223804101e-05, + "loss": 0.4884, + "step": 2116 + }, + { + "epoch": 0.37, + "learning_rate": 1.4621219145950452e-05, + "loss": 0.4591, + "step": 2117 + }, + { + "epoch": 0.37, + "learning_rate": 1.4616241612669523e-05, + "loss": 0.4718, + "step": 2118 + }, + { + "epoch": 0.37, + "learning_rate": 1.461126262552897e-05, + "loss": 0.4565, + "step": 2119 + }, + { + "epoch": 0.37, + "learning_rate": 1.460628218609689e-05, + "loss": 0.4789, + "step": 2120 + }, + { + "epoch": 0.37, + "learning_rate": 1.4601300295941847e-05, + "loss": 0.476, + "step": 2121 + }, + { + "epoch": 0.37, + "learning_rate": 1.4596316956632856e-05, + "loss": 0.4833, + "step": 2122 + }, + { + "epoch": 0.37, + "learning_rate": 1.45913321697394e-05, + "loss": 0.4711, + "step": 2123 + }, + { + "epoch": 0.37, + "learning_rate": 1.4586345936831404e-05, + "loss": 0.4863, + "step": 2124 + }, + { + "epoch": 0.37, + "learning_rate": 1.4581358259479252e-05, + "loss": 0.4724, + "step": 2125 + }, + { + "epoch": 0.37, + "learning_rate": 1.457636913925379e-05, + "loss": 0.4782, + "step": 2126 + }, + { + "epoch": 0.37, + "learning_rate": 1.4571378577726317e-05, + "loss": 0.4747, + "step": 2127 + }, + { + "epoch": 0.37, + "learning_rate": 1.4566386576468572e-05, + "loss": 0.4751, + "step": 2128 + }, + { + "epoch": 0.37, + "learning_rate": 1.4561393137052767e-05, + "loss": 0.4757, + "step": 2129 + }, + { + "epoch": 0.37, + "learning_rate": 1.4556398261051553e-05, + "loss": 0.4747, + "step": 2130 + }, + { + "epoch": 0.37, + "learning_rate": 1.455140195003804e-05, + "loss": 0.4643, + "step": 2131 + }, + { + "epoch": 0.37, + "learning_rate": 1.4546404205585789e-05, + "loss": 0.4826, + "step": 2132 + }, + { + "epoch": 0.37, + "learning_rate": 1.4541405029268813e-05, + "loss": 0.475, + "step": 2133 + }, + { + "epoch": 0.37, + "learning_rate": 1.4536404422661575e-05, + "loss": 0.4854, + "step": 2134 + }, + { + "epoch": 0.37, + "learning_rate": 1.4531402387338982e-05, + "loss": 0.4695, + "step": 2135 + }, + { + "epoch": 0.37, + "learning_rate": 1.4526398924876407e-05, + "loss": 0.4703, + "step": 2136 + }, + { + "epoch": 0.37, + "learning_rate": 1.4521394036849652e-05, + "loss": 0.4847, + "step": 2137 + }, + { + "epoch": 0.37, + "learning_rate": 1.4516387724834989e-05, + "loss": 0.4657, + "step": 2138 + }, + { + "epoch": 0.37, + "learning_rate": 1.4511379990409119e-05, + "loss": 0.4667, + "step": 2139 + }, + { + "epoch": 0.37, + "learning_rate": 1.4506370835149209e-05, + "loss": 0.471, + "step": 2140 + }, + { + "epoch": 0.37, + "learning_rate": 1.4501360260632855e-05, + "loss": 0.4593, + "step": 2141 + }, + { + "epoch": 0.37, + "learning_rate": 1.4496348268438116e-05, + "loss": 0.4654, + "step": 2142 + }, + { + "epoch": 0.37, + "learning_rate": 1.4491334860143494e-05, + "loss": 0.4815, + "step": 2143 + }, + { + "epoch": 0.37, + "learning_rate": 1.4486320037327924e-05, + "loss": 0.4863, + "step": 2144 + }, + { + "epoch": 0.37, + "learning_rate": 1.4481303801570805e-05, + "loss": 0.4664, + "step": 2145 + }, + { + "epoch": 0.37, + "learning_rate": 1.4476286154451968e-05, + "loss": 0.4875, + "step": 2146 + }, + { + "epoch": 0.37, + "learning_rate": 1.4471267097551698e-05, + "loss": 0.4843, + "step": 2147 + }, + { + "epoch": 0.37, + "learning_rate": 1.4466246632450714e-05, + "loss": 0.4747, + "step": 2148 + }, + { + "epoch": 0.37, + "learning_rate": 1.4461224760730189e-05, + "loss": 0.471, + "step": 2149 + }, + { + "epoch": 0.37, + "learning_rate": 1.4456201483971724e-05, + "loss": 0.4707, + "step": 2150 + }, + { + "epoch": 0.37, + "learning_rate": 1.4451176803757383e-05, + "loss": 0.4741, + "step": 2151 + }, + { + "epoch": 0.37, + "learning_rate": 1.4446150721669654e-05, + "loss": 0.4866, + "step": 2152 + }, + { + "epoch": 0.37, + "learning_rate": 1.4441123239291477e-05, + "loss": 0.467, + "step": 2153 + }, + { + "epoch": 0.37, + "learning_rate": 1.4436094358206224e-05, + "loss": 0.4785, + "step": 2154 + }, + { + "epoch": 0.37, + "learning_rate": 1.4431064079997723e-05, + "loss": 0.4608, + "step": 2155 + }, + { + "epoch": 0.37, + "learning_rate": 1.4426032406250228e-05, + "loss": 0.4788, + "step": 2156 + }, + { + "epoch": 0.37, + "learning_rate": 1.4420999338548432e-05, + "loss": 0.4773, + "step": 2157 + }, + { + "epoch": 0.37, + "learning_rate": 1.4415964878477477e-05, + "loss": 0.4902, + "step": 2158 + }, + { + "epoch": 0.37, + "learning_rate": 1.4410929027622932e-05, + "loss": 0.4673, + "step": 2159 + }, + { + "epoch": 0.37, + "learning_rate": 1.440589178757082e-05, + "loss": 0.4779, + "step": 2160 + }, + { + "epoch": 0.37, + "learning_rate": 1.4400853159907584e-05, + "loss": 0.4775, + "step": 2161 + }, + { + "epoch": 0.37, + "learning_rate": 1.4395813146220117e-05, + "loss": 0.4901, + "step": 2162 + }, + { + "epoch": 0.37, + "learning_rate": 1.4390771748095735e-05, + "loss": 0.4659, + "step": 2163 + }, + { + "epoch": 0.37, + "learning_rate": 1.4385728967122207e-05, + "loss": 0.4782, + "step": 2164 + }, + { + "epoch": 0.38, + "learning_rate": 1.4380684804887726e-05, + "loss": 0.482, + "step": 2165 + }, + { + "epoch": 0.38, + "learning_rate": 1.4375639262980921e-05, + "loss": 0.4734, + "step": 2166 + }, + { + "epoch": 0.38, + "learning_rate": 1.437059234299086e-05, + "loss": 0.4619, + "step": 2167 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365544046507039e-05, + "loss": 0.489, + "step": 2168 + }, + { + "epoch": 0.38, + "learning_rate": 1.4360494375119396e-05, + "loss": 0.4819, + "step": 2169 + }, + { + "epoch": 0.38, + "learning_rate": 1.4355443330418292e-05, + "loss": 0.4827, + "step": 2170 + }, + { + "epoch": 0.38, + "learning_rate": 1.435039091399453e-05, + "loss": 0.4666, + "step": 2171 + }, + { + "epoch": 0.38, + "learning_rate": 1.4345337127439333e-05, + "loss": 0.4795, + "step": 2172 + }, + { + "epoch": 0.38, + "learning_rate": 1.4340281972344374e-05, + "loss": 0.4665, + "step": 2173 + }, + { + "epoch": 0.38, + "learning_rate": 1.4335225450301735e-05, + "loss": 0.4774, + "step": 2174 + }, + { + "epoch": 0.38, + "learning_rate": 1.4330167562903948e-05, + "loss": 0.4776, + "step": 2175 + }, + { + "epoch": 0.38, + "learning_rate": 1.4325108311743959e-05, + "loss": 0.4809, + "step": 2176 + }, + { + "epoch": 0.38, + "learning_rate": 1.4320047698415156e-05, + "loss": 0.4686, + "step": 2177 + }, + { + "epoch": 0.38, + "learning_rate": 1.4314985724511353e-05, + "loss": 0.4643, + "step": 2178 + }, + { + "epoch": 0.38, + "learning_rate": 1.4309922391626784e-05, + "loss": 0.4634, + "step": 2179 + }, + { + "epoch": 0.38, + "learning_rate": 1.4304857701356123e-05, + "loss": 0.4828, + "step": 2180 + }, + { + "epoch": 0.38, + "learning_rate": 1.4299791655294461e-05, + "loss": 0.48, + "step": 2181 + }, + { + "epoch": 0.38, + "learning_rate": 1.4294724255037329e-05, + "loss": 0.4784, + "step": 2182 + }, + { + "epoch": 0.38, + "learning_rate": 1.4289655502180667e-05, + "loss": 0.4829, + "step": 2183 + }, + { + "epoch": 0.38, + "learning_rate": 1.428458539832086e-05, + "loss": 0.4745, + "step": 2184 + }, + { + "epoch": 0.38, + "learning_rate": 1.42795139450547e-05, + "loss": 0.4731, + "step": 2185 + }, + { + "epoch": 0.38, + "learning_rate": 1.4274441143979418e-05, + "loss": 0.4751, + "step": 2186 + }, + { + "epoch": 0.38, + "learning_rate": 1.4269366996692666e-05, + "loss": 0.4652, + "step": 2187 + }, + { + "epoch": 0.38, + "learning_rate": 1.4264291504792514e-05, + "loss": 0.468, + "step": 2188 + }, + { + "epoch": 0.38, + "learning_rate": 1.4259214669877462e-05, + "loss": 0.4812, + "step": 2189 + }, + { + "epoch": 0.38, + "learning_rate": 1.4254136493546432e-05, + "loss": 0.4743, + "step": 2190 + }, + { + "epoch": 0.38, + "learning_rate": 1.4249056977398767e-05, + "loss": 0.4736, + "step": 2191 + }, + { + "epoch": 0.38, + "learning_rate": 1.4243976123034231e-05, + "loss": 0.4811, + "step": 2192 + }, + { + "epoch": 0.38, + "learning_rate": 1.4238893932053013e-05, + "loss": 0.4814, + "step": 2193 + }, + { + "epoch": 0.38, + "learning_rate": 1.4233810406055718e-05, + "loss": 0.4729, + "step": 2194 + }, + { + "epoch": 0.38, + "learning_rate": 1.4228725546643373e-05, + "loss": 0.4779, + "step": 2195 + }, + { + "epoch": 0.38, + "learning_rate": 1.422363935541743e-05, + "loss": 0.4712, + "step": 2196 + }, + { + "epoch": 0.38, + "learning_rate": 1.4218551833979759e-05, + "loss": 0.467, + "step": 2197 + }, + { + "epoch": 0.38, + "learning_rate": 1.4213462983932641e-05, + "loss": 0.4695, + "step": 2198 + }, + { + "epoch": 0.38, + "learning_rate": 1.4208372806878782e-05, + "loss": 0.4685, + "step": 2199 + }, + { + "epoch": 0.38, + "learning_rate": 1.420328130442131e-05, + "loss": 0.4762, + "step": 2200 + }, + { + "epoch": 0.38, + "learning_rate": 1.419818847816376e-05, + "loss": 0.4696, + "step": 2201 + }, + { + "epoch": 0.38, + "learning_rate": 1.4193094329710089e-05, + "loss": 0.4629, + "step": 2202 + }, + { + "epoch": 0.38, + "learning_rate": 1.4187998860664672e-05, + "loss": 0.4587, + "step": 2203 + }, + { + "epoch": 0.38, + "learning_rate": 1.4182902072632301e-05, + "loss": 0.4809, + "step": 2204 + }, + { + "epoch": 0.38, + "learning_rate": 1.4177803967218178e-05, + "loss": 0.4734, + "step": 2205 + }, + { + "epoch": 0.38, + "learning_rate": 1.4172704546027926e-05, + "loss": 0.4799, + "step": 2206 + }, + { + "epoch": 0.38, + "learning_rate": 1.4167603810667578e-05, + "loss": 0.4761, + "step": 2207 + }, + { + "epoch": 0.38, + "learning_rate": 1.4162501762743579e-05, + "loss": 0.4763, + "step": 2208 + }, + { + "epoch": 0.38, + "learning_rate": 1.4157398403862794e-05, + "loss": 0.4727, + "step": 2209 + }, + { + "epoch": 0.38, + "learning_rate": 1.4152293735632498e-05, + "loss": 0.4846, + "step": 2210 + }, + { + "epoch": 0.38, + "learning_rate": 1.4147187759660377e-05, + "loss": 0.4676, + "step": 2211 + }, + { + "epoch": 0.38, + "learning_rate": 1.414208047755453e-05, + "loss": 0.4847, + "step": 2212 + }, + { + "epoch": 0.38, + "learning_rate": 1.4136971890923465e-05, + "loss": 0.4629, + "step": 2213 + }, + { + "epoch": 0.38, + "learning_rate": 1.4131862001376107e-05, + "loss": 0.4789, + "step": 2214 + }, + { + "epoch": 0.38, + "learning_rate": 1.4126750810521783e-05, + "loss": 0.4673, + "step": 2215 + }, + { + "epoch": 0.38, + "learning_rate": 1.4121638319970234e-05, + "loss": 0.4909, + "step": 2216 + }, + { + "epoch": 0.38, + "learning_rate": 1.4116524531331616e-05, + "loss": 0.4755, + "step": 2217 + }, + { + "epoch": 0.38, + "learning_rate": 1.4111409446216482e-05, + "loss": 0.4735, + "step": 2218 + }, + { + "epoch": 0.38, + "learning_rate": 1.4106293066235806e-05, + "loss": 0.4708, + "step": 2219 + }, + { + "epoch": 0.38, + "learning_rate": 1.410117539300096e-05, + "loss": 0.4804, + "step": 2220 + }, + { + "epoch": 0.38, + "learning_rate": 1.4096056428123721e-05, + "loss": 0.4605, + "step": 2221 + }, + { + "epoch": 0.38, + "learning_rate": 1.4090936173216289e-05, + "loss": 0.4638, + "step": 2222 + }, + { + "epoch": 0.39, + "learning_rate": 1.4085814629891252e-05, + "loss": 0.4724, + "step": 2223 + }, + { + "epoch": 0.39, + "learning_rate": 1.4080691799761618e-05, + "loss": 0.4739, + "step": 2224 + }, + { + "epoch": 0.39, + "learning_rate": 1.4075567684440788e-05, + "loss": 0.4719, + "step": 2225 + }, + { + "epoch": 0.39, + "learning_rate": 1.4070442285542579e-05, + "loss": 0.4873, + "step": 2226 + }, + { + "epoch": 0.39, + "learning_rate": 1.4065315604681198e-05, + "loss": 0.4739, + "step": 2227 + }, + { + "epoch": 0.39, + "learning_rate": 1.4060187643471276e-05, + "loss": 0.4852, + "step": 2228 + }, + { + "epoch": 0.39, + "learning_rate": 1.4055058403527828e-05, + "loss": 0.472, + "step": 2229 + }, + { + "epoch": 0.39, + "learning_rate": 1.4049927886466281e-05, + "loss": 0.4794, + "step": 2230 + }, + { + "epoch": 0.39, + "learning_rate": 1.4044796093902466e-05, + "loss": 0.4655, + "step": 2231 + }, + { + "epoch": 0.39, + "learning_rate": 1.403966302745261e-05, + "loss": 0.4765, + "step": 2232 + }, + { + "epoch": 0.39, + "learning_rate": 1.4034528688733344e-05, + "loss": 0.459, + "step": 2233 + }, + { + "epoch": 0.39, + "learning_rate": 1.4029393079361699e-05, + "loss": 0.4716, + "step": 2234 + }, + { + "epoch": 0.39, + "learning_rate": 1.402425620095511e-05, + "loss": 0.4742, + "step": 2235 + }, + { + "epoch": 0.39, + "learning_rate": 1.40191180551314e-05, + "loss": 0.4857, + "step": 2236 + }, + { + "epoch": 0.39, + "learning_rate": 1.4013978643508807e-05, + "loss": 0.4715, + "step": 2237 + }, + { + "epoch": 0.39, + "learning_rate": 1.4008837967705959e-05, + "loss": 0.4794, + "step": 2238 + }, + { + "epoch": 0.39, + "learning_rate": 1.4003696029341884e-05, + "loss": 0.4615, + "step": 2239 + }, + { + "epoch": 0.39, + "learning_rate": 1.3998552830036003e-05, + "loss": 0.4667, + "step": 2240 + }, + { + "epoch": 0.39, + "learning_rate": 1.399340837140814e-05, + "loss": 0.4716, + "step": 2241 + }, + { + "epoch": 0.39, + "learning_rate": 1.3988262655078514e-05, + "loss": 0.468, + "step": 2242 + }, + { + "epoch": 0.39, + "learning_rate": 1.3983115682667743e-05, + "loss": 0.4775, + "step": 2243 + }, + { + "epoch": 0.39, + "learning_rate": 1.3977967455796828e-05, + "loss": 0.4658, + "step": 2244 + }, + { + "epoch": 0.39, + "learning_rate": 1.3972817976087183e-05, + "loss": 0.459, + "step": 2245 + }, + { + "epoch": 0.39, + "learning_rate": 1.3967667245160608e-05, + "loss": 0.4804, + "step": 2246 + }, + { + "epoch": 0.39, + "learning_rate": 1.3962515264639291e-05, + "loss": 0.4736, + "step": 2247 + }, + { + "epoch": 0.39, + "learning_rate": 1.3957362036145826e-05, + "loss": 0.4788, + "step": 2248 + }, + { + "epoch": 0.39, + "learning_rate": 1.3952207561303188e-05, + "loss": 0.4674, + "step": 2249 + }, + { + "epoch": 0.39, + "learning_rate": 1.3947051841734756e-05, + "loss": 0.4654, + "step": 2250 + }, + { + "epoch": 0.39, + "learning_rate": 1.3941894879064289e-05, + "loss": 0.4666, + "step": 2251 + }, + { + "epoch": 0.39, + "learning_rate": 1.3936736674915947e-05, + "loss": 0.4889, + "step": 2252 + }, + { + "epoch": 0.39, + "learning_rate": 1.393157723091428e-05, + "loss": 0.4643, + "step": 2253 + }, + { + "epoch": 0.39, + "learning_rate": 1.3926416548684221e-05, + "loss": 0.4876, + "step": 2254 + }, + { + "epoch": 0.39, + "learning_rate": 1.3921254629851103e-05, + "loss": 0.4603, + "step": 2255 + }, + { + "epoch": 0.39, + "learning_rate": 1.391609147604064e-05, + "loss": 0.4882, + "step": 2256 + }, + { + "epoch": 0.39, + "learning_rate": 1.3910927088878943e-05, + "loss": 0.4752, + "step": 2257 + }, + { + "epoch": 0.39, + "learning_rate": 1.39057614699925e-05, + "loss": 0.4832, + "step": 2258 + }, + { + "epoch": 0.39, + "learning_rate": 1.3900594621008201e-05, + "loss": 0.4699, + "step": 2259 + }, + { + "epoch": 0.39, + "learning_rate": 1.3895426543553313e-05, + "loss": 0.4706, + "step": 2260 + }, + { + "epoch": 0.39, + "learning_rate": 1.3890257239255495e-05, + "loss": 0.4842, + "step": 2261 + }, + { + "epoch": 0.39, + "learning_rate": 1.3885086709742788e-05, + "loss": 0.4819, + "step": 2262 + }, + { + "epoch": 0.39, + "learning_rate": 1.3879914956643623e-05, + "loss": 0.4778, + "step": 2263 + }, + { + "epoch": 0.39, + "learning_rate": 1.3874741981586814e-05, + "loss": 0.4678, + "step": 2264 + }, + { + "epoch": 0.39, + "learning_rate": 1.3869567786201562e-05, + "loss": 0.4702, + "step": 2265 + }, + { + "epoch": 0.39, + "learning_rate": 1.386439237211745e-05, + "loss": 0.483, + "step": 2266 + }, + { + "epoch": 0.39, + "learning_rate": 1.3859215740964446e-05, + "loss": 0.4682, + "step": 2267 + }, + { + "epoch": 0.39, + "learning_rate": 1.3854037894372905e-05, + "loss": 0.4867, + "step": 2268 + }, + { + "epoch": 0.39, + "learning_rate": 1.3848858833973555e-05, + "loss": 0.4761, + "step": 2269 + }, + { + "epoch": 0.39, + "learning_rate": 1.3843678561397517e-05, + "loss": 0.4568, + "step": 2270 + }, + { + "epoch": 0.39, + "learning_rate": 1.3838497078276288e-05, + "loss": 0.4671, + "step": 2271 + }, + { + "epoch": 0.39, + "learning_rate": 1.3833314386241744e-05, + "loss": 0.4686, + "step": 2272 + }, + { + "epoch": 0.39, + "learning_rate": 1.3828130486926145e-05, + "loss": 0.468, + "step": 2273 + }, + { + "epoch": 0.39, + "learning_rate": 1.382294538196214e-05, + "loss": 0.4744, + "step": 2274 + }, + { + "epoch": 0.39, + "learning_rate": 1.3817759072982737e-05, + "loss": 0.4769, + "step": 2275 + }, + { + "epoch": 0.39, + "learning_rate": 1.3812571561621341e-05, + "loss": 0.4793, + "step": 2276 + }, + { + "epoch": 0.39, + "learning_rate": 1.3807382849511732e-05, + "loss": 0.4674, + "step": 2277 + }, + { + "epoch": 0.39, + "learning_rate": 1.3802192938288063e-05, + "loss": 0.4612, + "step": 2278 + }, + { + "epoch": 0.39, + "learning_rate": 1.3797001829584868e-05, + "loss": 0.4749, + "step": 2279 + }, + { + "epoch": 0.39, + "learning_rate": 1.3791809525037057e-05, + "loss": 0.4634, + "step": 2280 + }, + { + "epoch": 0.4, + "learning_rate": 1.3786616026279922e-05, + "loss": 0.4755, + "step": 2281 + }, + { + "epoch": 0.4, + "learning_rate": 1.378142133494912e-05, + "loss": 0.475, + "step": 2282 + }, + { + "epoch": 0.4, + "learning_rate": 1.3776225452680696e-05, + "loss": 0.4809, + "step": 2283 + }, + { + "epoch": 0.4, + "learning_rate": 1.377102838111106e-05, + "loss": 0.4685, + "step": 2284 + }, + { + "epoch": 0.4, + "learning_rate": 1.3765830121877004e-05, + "loss": 0.4739, + "step": 2285 + }, + { + "epoch": 0.4, + "learning_rate": 1.3760630676615685e-05, + "loss": 0.4748, + "step": 2286 + }, + { + "epoch": 0.4, + "learning_rate": 1.3755430046964649e-05, + "loss": 0.4793, + "step": 2287 + }, + { + "epoch": 0.4, + "learning_rate": 1.3750228234561796e-05, + "loss": 0.4773, + "step": 2288 + }, + { + "epoch": 0.4, + "learning_rate": 1.3745025241045414e-05, + "loss": 0.4722, + "step": 2289 + }, + { + "epoch": 0.4, + "learning_rate": 1.3739821068054153e-05, + "loss": 0.4733, + "step": 2290 + }, + { + "epoch": 0.4, + "learning_rate": 1.373461571722704e-05, + "loss": 0.473, + "step": 2291 + }, + { + "epoch": 0.4, + "learning_rate": 1.3729409190203475e-05, + "loss": 0.4796, + "step": 2292 + }, + { + "epoch": 0.4, + "learning_rate": 1.3724201488623216e-05, + "loss": 0.4749, + "step": 2293 + }, + { + "epoch": 0.4, + "learning_rate": 1.371899261412641e-05, + "loss": 0.4794, + "step": 2294 + }, + { + "epoch": 0.4, + "learning_rate": 1.3713782568353553e-05, + "loss": 0.4751, + "step": 2295 + }, + { + "epoch": 0.4, + "learning_rate": 1.3708571352945527e-05, + "loss": 0.48, + "step": 2296 + }, + { + "epoch": 0.4, + "learning_rate": 1.3703358969543575e-05, + "loss": 0.4806, + "step": 2297 + }, + { + "epoch": 0.4, + "learning_rate": 1.3698145419789302e-05, + "loss": 0.4642, + "step": 2298 + }, + { + "epoch": 0.4, + "learning_rate": 1.3692930705324697e-05, + "loss": 0.4907, + "step": 2299 + }, + { + "epoch": 0.4, + "learning_rate": 1.3687714827792093e-05, + "loss": 0.4663, + "step": 2300 + }, + { + "epoch": 0.4, + "learning_rate": 1.368249778883421e-05, + "loss": 0.476, + "step": 2301 + }, + { + "epoch": 0.4, + "learning_rate": 1.3677279590094123e-05, + "loss": 0.47, + "step": 2302 + }, + { + "epoch": 0.4, + "learning_rate": 1.3672060233215277e-05, + "loss": 0.4694, + "step": 2303 + }, + { + "epoch": 0.4, + "learning_rate": 1.3666839719841473e-05, + "loss": 0.4755, + "step": 2304 + }, + { + "epoch": 0.4, + "learning_rate": 1.3661618051616893e-05, + "loss": 0.4808, + "step": 2305 + }, + { + "epoch": 0.4, + "learning_rate": 1.3656395230186062e-05, + "loss": 0.4644, + "step": 2306 + }, + { + "epoch": 0.4, + "learning_rate": 1.3651171257193883e-05, + "loss": 0.4786, + "step": 2307 + }, + { + "epoch": 0.4, + "learning_rate": 1.3645946134285617e-05, + "loss": 0.4711, + "step": 2308 + }, + { + "epoch": 0.4, + "learning_rate": 1.3640719863106888e-05, + "loss": 0.4827, + "step": 2309 + }, + { + "epoch": 0.4, + "learning_rate": 1.3635492445303679e-05, + "loss": 0.4798, + "step": 2310 + }, + { + "epoch": 0.4, + "learning_rate": 1.3630263882522341e-05, + "loss": 0.4772, + "step": 2311 + }, + { + "epoch": 0.4, + "learning_rate": 1.3625034176409577e-05, + "loss": 0.4721, + "step": 2312 + }, + { + "epoch": 0.4, + "learning_rate": 1.3619803328612454e-05, + "loss": 0.49, + "step": 2313 + }, + { + "epoch": 0.4, + "learning_rate": 1.3614571340778398e-05, + "loss": 0.4814, + "step": 2314 + }, + { + "epoch": 0.4, + "learning_rate": 1.3609338214555195e-05, + "loss": 0.4737, + "step": 2315 + }, + { + "epoch": 0.4, + "learning_rate": 1.3604103951590993e-05, + "loss": 0.464, + "step": 2316 + }, + { + "epoch": 0.4, + "learning_rate": 1.3598868553534286e-05, + "loss": 0.4787, + "step": 2317 + }, + { + "epoch": 0.4, + "learning_rate": 1.359363202203394e-05, + "loss": 0.4692, + "step": 2318 + }, + { + "epoch": 0.4, + "learning_rate": 1.3588394358739167e-05, + "loss": 0.4746, + "step": 2319 + }, + { + "epoch": 0.4, + "learning_rate": 1.3583155565299544e-05, + "loss": 0.4711, + "step": 2320 + }, + { + "epoch": 0.4, + "learning_rate": 1.3577915643364997e-05, + "loss": 0.4707, + "step": 2321 + }, + { + "epoch": 0.4, + "learning_rate": 1.3572674594585813e-05, + "loss": 0.4784, + "step": 2322 + }, + { + "epoch": 0.4, + "learning_rate": 1.356743242061263e-05, + "loss": 0.4633, + "step": 2323 + }, + { + "epoch": 0.4, + "learning_rate": 1.3562189123096439e-05, + "loss": 0.4693, + "step": 2324 + }, + { + "epoch": 0.4, + "learning_rate": 1.3556944703688592e-05, + "loss": 0.4733, + "step": 2325 + }, + { + "epoch": 0.4, + "learning_rate": 1.3551699164040786e-05, + "loss": 0.4557, + "step": 2326 + }, + { + "epoch": 0.4, + "learning_rate": 1.3546452505805076e-05, + "loss": 0.4625, + "step": 2327 + }, + { + "epoch": 0.4, + "learning_rate": 1.3541204730633864e-05, + "loss": 0.473, + "step": 2328 + }, + { + "epoch": 0.4, + "learning_rate": 1.3535955840179918e-05, + "loss": 0.4798, + "step": 2329 + }, + { + "epoch": 0.4, + "learning_rate": 1.3530705836096333e-05, + "loss": 0.4713, + "step": 2330 + }, + { + "epoch": 0.4, + "learning_rate": 1.3525454720036581e-05, + "loss": 0.4861, + "step": 2331 + }, + { + "epoch": 0.4, + "learning_rate": 1.3520202493654466e-05, + "loss": 0.4849, + "step": 2332 + }, + { + "epoch": 0.4, + "learning_rate": 1.3514949158604147e-05, + "loss": 0.4749, + "step": 2333 + }, + { + "epoch": 0.4, + "learning_rate": 1.3509694716540135e-05, + "loss": 0.4686, + "step": 2334 + }, + { + "epoch": 0.4, + "learning_rate": 1.3504439169117283e-05, + "loss": 0.4771, + "step": 2335 + }, + { + "epoch": 0.4, + "learning_rate": 1.34991825179908e-05, + "loss": 0.4818, + "step": 2336 + }, + { + "epoch": 0.4, + "learning_rate": 1.349392476481624e-05, + "loss": 0.4784, + "step": 2337 + }, + { + "epoch": 0.41, + "learning_rate": 1.3488665911249503e-05, + "loss": 0.482, + "step": 2338 + }, + { + "epoch": 0.41, + "learning_rate": 1.348340595894683e-05, + "loss": 0.4707, + "step": 2339 + }, + { + "epoch": 0.41, + "learning_rate": 1.3478144909564824e-05, + "loss": 0.4692, + "step": 2340 + }, + { + "epoch": 0.41, + "learning_rate": 1.3472882764760414e-05, + "loss": 0.4772, + "step": 2341 + }, + { + "epoch": 0.41, + "learning_rate": 1.3467619526190885e-05, + "loss": 0.4598, + "step": 2342 + }, + { + "epoch": 0.41, + "learning_rate": 1.3462355195513868e-05, + "loss": 0.4806, + "step": 2343 + }, + { + "epoch": 0.41, + "learning_rate": 1.3457089774387333e-05, + "loss": 0.4645, + "step": 2344 + }, + { + "epoch": 0.41, + "learning_rate": 1.3451823264469595e-05, + "loss": 0.4834, + "step": 2345 + }, + { + "epoch": 0.41, + "learning_rate": 1.344655566741931e-05, + "loss": 0.4737, + "step": 2346 + }, + { + "epoch": 0.41, + "learning_rate": 1.3441286984895486e-05, + "loss": 0.4835, + "step": 2347 + }, + { + "epoch": 0.41, + "learning_rate": 1.3436017218557453e-05, + "loss": 0.4712, + "step": 2348 + }, + { + "epoch": 0.41, + "learning_rate": 1.3430746370064904e-05, + "loss": 0.461, + "step": 2349 + }, + { + "epoch": 0.41, + "learning_rate": 1.342547444107786e-05, + "loss": 0.4763, + "step": 2350 + }, + { + "epoch": 0.41, + "learning_rate": 1.342020143325669e-05, + "loss": 0.4925, + "step": 2351 + }, + { + "epoch": 0.41, + "learning_rate": 1.341492734826209e-05, + "loss": 0.4636, + "step": 2352 + }, + { + "epoch": 0.41, + "learning_rate": 1.3409652187755114e-05, + "loss": 0.4786, + "step": 2353 + }, + { + "epoch": 0.41, + "learning_rate": 1.3404375953397136e-05, + "loss": 0.4604, + "step": 2354 + }, + { + "epoch": 0.41, + "learning_rate": 1.339909864684988e-05, + "loss": 0.48, + "step": 2355 + }, + { + "epoch": 0.41, + "learning_rate": 1.3393820269775405e-05, + "loss": 0.471, + "step": 2356 + }, + { + "epoch": 0.41, + "learning_rate": 1.3388540823836103e-05, + "loss": 0.4639, + "step": 2357 + }, + { + "epoch": 0.41, + "learning_rate": 1.3383260310694712e-05, + "loss": 0.4765, + "step": 2358 + }, + { + "epoch": 0.41, + "learning_rate": 1.3377978732014295e-05, + "loss": 0.4856, + "step": 2359 + }, + { + "epoch": 0.41, + "learning_rate": 1.3372696089458264e-05, + "loss": 0.4787, + "step": 2360 + }, + { + "epoch": 0.41, + "learning_rate": 1.3367412384690346e-05, + "loss": 0.4897, + "step": 2361 + }, + { + "epoch": 0.41, + "learning_rate": 1.3362127619374622e-05, + "loss": 0.4831, + "step": 2362 + }, + { + "epoch": 0.41, + "learning_rate": 1.3356841795175494e-05, + "loss": 0.4737, + "step": 2363 + }, + { + "epoch": 0.41, + "learning_rate": 1.3351554913757712e-05, + "loss": 0.4593, + "step": 2364 + }, + { + "epoch": 0.41, + "learning_rate": 1.3346266976786341e-05, + "loss": 0.4735, + "step": 2365 + }, + { + "epoch": 0.41, + "learning_rate": 1.3340977985926793e-05, + "loss": 0.4674, + "step": 2366 + }, + { + "epoch": 0.41, + "learning_rate": 1.3335687942844806e-05, + "loss": 0.4677, + "step": 2367 + }, + { + "epoch": 0.41, + "learning_rate": 1.3330396849206447e-05, + "loss": 0.4696, + "step": 2368 + }, + { + "epoch": 0.41, + "learning_rate": 1.3325104706678116e-05, + "loss": 0.475, + "step": 2369 + }, + { + "epoch": 0.41, + "learning_rate": 1.3319811516926541e-05, + "loss": 0.4709, + "step": 2370 + }, + { + "epoch": 0.41, + "learning_rate": 1.3314517281618794e-05, + "loss": 0.4784, + "step": 2371 + }, + { + "epoch": 0.41, + "learning_rate": 1.3309222002422255e-05, + "loss": 0.4635, + "step": 2372 + }, + { + "epoch": 0.41, + "learning_rate": 1.3303925681004649e-05, + "loss": 0.4752, + "step": 2373 + }, + { + "epoch": 0.41, + "learning_rate": 1.3298628319034014e-05, + "loss": 0.4705, + "step": 2374 + }, + { + "epoch": 0.41, + "learning_rate": 1.3293329918178737e-05, + "loss": 0.4671, + "step": 2375 + }, + { + "epoch": 0.41, + "learning_rate": 1.3288030480107508e-05, + "loss": 0.4841, + "step": 2376 + }, + { + "epoch": 0.41, + "learning_rate": 1.3282730006489361e-05, + "loss": 0.4869, + "step": 2377 + }, + { + "epoch": 0.41, + "learning_rate": 1.327742849899365e-05, + "loss": 0.4633, + "step": 2378 + }, + { + "epoch": 0.41, + "learning_rate": 1.3272125959290059e-05, + "loss": 0.478, + "step": 2379 + }, + { + "epoch": 0.41, + "learning_rate": 1.326682238904859e-05, + "loss": 0.4815, + "step": 2380 + }, + { + "epoch": 0.41, + "learning_rate": 1.326151778993957e-05, + "loss": 0.4784, + "step": 2381 + }, + { + "epoch": 0.41, + "learning_rate": 1.325621216363366e-05, + "loss": 0.4718, + "step": 2382 + }, + { + "epoch": 0.41, + "learning_rate": 1.3250905511801831e-05, + "loss": 0.472, + "step": 2383 + }, + { + "epoch": 0.41, + "learning_rate": 1.3245597836115386e-05, + "loss": 0.4734, + "step": 2384 + }, + { + "epoch": 0.41, + "learning_rate": 1.3240289138245949e-05, + "loss": 0.4684, + "step": 2385 + }, + { + "epoch": 0.41, + "learning_rate": 1.3234979419865466e-05, + "loss": 0.4808, + "step": 2386 + }, + { + "epoch": 0.41, + "learning_rate": 1.3229668682646197e-05, + "loss": 0.4775, + "step": 2387 + }, + { + "epoch": 0.41, + "learning_rate": 1.3224356928260735e-05, + "loss": 0.467, + "step": 2388 + }, + { + "epoch": 0.41, + "learning_rate": 1.3219044158381988e-05, + "loss": 0.4793, + "step": 2389 + }, + { + "epoch": 0.41, + "learning_rate": 1.321373037468318e-05, + "loss": 0.4763, + "step": 2390 + }, + { + "epoch": 0.41, + "learning_rate": 1.3208415578837859e-05, + "loss": 0.4783, + "step": 2391 + }, + { + "epoch": 0.41, + "learning_rate": 1.3203099772519889e-05, + "loss": 0.4776, + "step": 2392 + }, + { + "epoch": 0.41, + "learning_rate": 1.3197782957403458e-05, + "loss": 0.4723, + "step": 2393 + }, + { + "epoch": 0.41, + "learning_rate": 1.3192465135163062e-05, + "loss": 0.4635, + "step": 2394 + }, + { + "epoch": 0.41, + "learning_rate": 1.3187146307473521e-05, + "loss": 0.4769, + "step": 2395 + }, + { + "epoch": 0.42, + "learning_rate": 1.3181826476009974e-05, + "loss": 0.4628, + "step": 2396 + }, + { + "epoch": 0.42, + "learning_rate": 1.317650564244787e-05, + "loss": 0.4688, + "step": 2397 + }, + { + "epoch": 0.42, + "learning_rate": 1.3171183808462969e-05, + "loss": 0.4708, + "step": 2398 + }, + { + "epoch": 0.42, + "learning_rate": 1.3165860975731363e-05, + "loss": 0.4826, + "step": 2399 + }, + { + "epoch": 0.42, + "learning_rate": 1.3160537145929447e-05, + "loss": 0.4562, + "step": 2400 + }, + { + "epoch": 0.42, + "learning_rate": 1.3155212320733925e-05, + "loss": 0.4821, + "step": 2401 + }, + { + "epoch": 0.42, + "learning_rate": 1.3149886501821831e-05, + "loss": 0.4732, + "step": 2402 + }, + { + "epoch": 0.42, + "learning_rate": 1.3144559690870494e-05, + "loss": 0.4798, + "step": 2403 + }, + { + "epoch": 0.42, + "learning_rate": 1.3139231889557568e-05, + "loss": 0.4572, + "step": 2404 + }, + { + "epoch": 0.42, + "learning_rate": 1.313390309956101e-05, + "loss": 0.4702, + "step": 2405 + }, + { + "epoch": 0.42, + "learning_rate": 1.3128573322559097e-05, + "loss": 0.4806, + "step": 2406 + }, + { + "epoch": 0.42, + "learning_rate": 1.3123242560230408e-05, + "loss": 0.4686, + "step": 2407 + }, + { + "epoch": 0.42, + "learning_rate": 1.3117910814253845e-05, + "loss": 0.4713, + "step": 2408 + }, + { + "epoch": 0.42, + "learning_rate": 1.3112578086308602e-05, + "loss": 0.4625, + "step": 2409 + }, + { + "epoch": 0.42, + "learning_rate": 1.3107244378074197e-05, + "loss": 0.4679, + "step": 2410 + }, + { + "epoch": 0.42, + "learning_rate": 1.3101909691230456e-05, + "loss": 0.4695, + "step": 2411 + }, + { + "epoch": 0.42, + "learning_rate": 1.3096574027457503e-05, + "loss": 0.4692, + "step": 2412 + }, + { + "epoch": 0.42, + "learning_rate": 1.3091237388435773e-05, + "loss": 0.4807, + "step": 2413 + }, + { + "epoch": 0.42, + "learning_rate": 1.3085899775846018e-05, + "loss": 0.4654, + "step": 2414 + }, + { + "epoch": 0.42, + "learning_rate": 1.3080561191369286e-05, + "loss": 0.4757, + "step": 2415 + }, + { + "epoch": 0.42, + "learning_rate": 1.3075221636686935e-05, + "loss": 0.465, + "step": 2416 + }, + { + "epoch": 0.42, + "learning_rate": 1.3069881113480629e-05, + "loss": 0.4844, + "step": 2417 + }, + { + "epoch": 0.42, + "learning_rate": 1.3064539623432331e-05, + "loss": 0.4861, + "step": 2418 + }, + { + "epoch": 0.42, + "learning_rate": 1.305919716822432e-05, + "loss": 0.4714, + "step": 2419 + }, + { + "epoch": 0.42, + "learning_rate": 1.3053853749539169e-05, + "loss": 0.4671, + "step": 2420 + }, + { + "epoch": 0.42, + "learning_rate": 1.3048509369059762e-05, + "loss": 0.4642, + "step": 2421 + }, + { + "epoch": 0.42, + "learning_rate": 1.3043164028469274e-05, + "loss": 0.4752, + "step": 2422 + }, + { + "epoch": 0.42, + "learning_rate": 1.3037817729451199e-05, + "loss": 0.4585, + "step": 2423 + }, + { + "epoch": 0.42, + "learning_rate": 1.3032470473689322e-05, + "loss": 0.4709, + "step": 2424 + }, + { + "epoch": 0.42, + "learning_rate": 1.3027122262867727e-05, + "loss": 0.4715, + "step": 2425 + }, + { + "epoch": 0.42, + "learning_rate": 1.3021773098670804e-05, + "loss": 0.4692, + "step": 2426 + }, + { + "epoch": 0.42, + "learning_rate": 1.301642298278325e-05, + "loss": 0.4687, + "step": 2427 + }, + { + "epoch": 0.42, + "learning_rate": 1.3011071916890049e-05, + "loss": 0.479, + "step": 2428 + }, + { + "epoch": 0.42, + "learning_rate": 1.3005719902676483e-05, + "loss": 0.4872, + "step": 2429 + }, + { + "epoch": 0.42, + "learning_rate": 1.300036694182815e-05, + "loss": 0.4764, + "step": 2430 + }, + { + "epoch": 0.42, + "learning_rate": 1.2995013036030932e-05, + "loss": 0.4736, + "step": 2431 + }, + { + "epoch": 0.42, + "learning_rate": 1.2989658186971007e-05, + "loss": 0.4796, + "step": 2432 + }, + { + "epoch": 0.42, + "learning_rate": 1.298430239633486e-05, + "loss": 0.4766, + "step": 2433 + }, + { + "epoch": 0.42, + "learning_rate": 1.2978945665809267e-05, + "loss": 0.4731, + "step": 2434 + }, + { + "epoch": 0.42, + "learning_rate": 1.2973587997081298e-05, + "loss": 0.4743, + "step": 2435 + }, + { + "epoch": 0.42, + "learning_rate": 1.2968229391838322e-05, + "loss": 0.4577, + "step": 2436 + }, + { + "epoch": 0.42, + "learning_rate": 1.2962869851768008e-05, + "loss": 0.4817, + "step": 2437 + }, + { + "epoch": 0.42, + "learning_rate": 1.2957509378558301e-05, + "loss": 0.4717, + "step": 2438 + }, + { + "epoch": 0.42, + "learning_rate": 1.2952147973897464e-05, + "loss": 0.475, + "step": 2439 + }, + { + "epoch": 0.42, + "learning_rate": 1.2946785639474034e-05, + "loss": 0.4607, + "step": 2440 + }, + { + "epoch": 0.42, + "learning_rate": 1.2941422376976851e-05, + "loss": 0.4735, + "step": 2441 + }, + { + "epoch": 0.42, + "learning_rate": 1.2936058188095045e-05, + "loss": 0.4658, + "step": 2442 + }, + { + "epoch": 0.42, + "learning_rate": 1.2930693074518038e-05, + "loss": 0.4786, + "step": 2443 + }, + { + "epoch": 0.42, + "learning_rate": 1.292532703793554e-05, + "loss": 0.4759, + "step": 2444 + }, + { + "epoch": 0.42, + "learning_rate": 1.2919960080037557e-05, + "loss": 0.4765, + "step": 2445 + }, + { + "epoch": 0.42, + "learning_rate": 1.2914592202514385e-05, + "loss": 0.4822, + "step": 2446 + }, + { + "epoch": 0.42, + "learning_rate": 1.2909223407056599e-05, + "loss": 0.4684, + "step": 2447 + }, + { + "epoch": 0.42, + "learning_rate": 1.290385369535508e-05, + "loss": 0.4725, + "step": 2448 + }, + { + "epoch": 0.42, + "learning_rate": 1.2898483069100982e-05, + "loss": 0.4593, + "step": 2449 + }, + { + "epoch": 0.42, + "learning_rate": 1.2893111529985761e-05, + "loss": 0.4803, + "step": 2450 + }, + { + "epoch": 0.42, + "learning_rate": 1.2887739079701147e-05, + "loss": 0.4645, + "step": 2451 + }, + { + "epoch": 0.42, + "learning_rate": 1.2882365719939167e-05, + "loss": 0.4601, + "step": 2452 + }, + { + "epoch": 0.42, + "learning_rate": 1.2876991452392124e-05, + "loss": 0.4691, + "step": 2453 + }, + { + "epoch": 0.43, + "learning_rate": 1.2871616278752628e-05, + "loss": 0.482, + "step": 2454 + }, + { + "epoch": 0.43, + "learning_rate": 1.2866240200713544e-05, + "loss": 0.4746, + "step": 2455 + }, + { + "epoch": 0.43, + "learning_rate": 1.2860863219968049e-05, + "loss": 0.4694, + "step": 2456 + }, + { + "epoch": 0.43, + "learning_rate": 1.285548533820959e-05, + "loss": 0.4724, + "step": 2457 + }, + { + "epoch": 0.43, + "learning_rate": 1.2850106557131898e-05, + "loss": 0.4836, + "step": 2458 + }, + { + "epoch": 0.43, + "learning_rate": 1.2844726878428993e-05, + "loss": 0.4568, + "step": 2459 + }, + { + "epoch": 0.43, + "learning_rate": 1.2839346303795173e-05, + "loss": 0.4723, + "step": 2460 + }, + { + "epoch": 0.43, + "learning_rate": 1.2833964834925024e-05, + "loss": 0.4683, + "step": 2461 + }, + { + "epoch": 0.43, + "learning_rate": 1.2828582473513405e-05, + "loss": 0.4734, + "step": 2462 + }, + { + "epoch": 0.43, + "learning_rate": 1.2823199221255467e-05, + "loss": 0.4639, + "step": 2463 + }, + { + "epoch": 0.43, + "learning_rate": 1.2817815079846627e-05, + "loss": 0.4841, + "step": 2464 + }, + { + "epoch": 0.43, + "learning_rate": 1.2812430050982596e-05, + "loss": 0.4697, + "step": 2465 + }, + { + "epoch": 0.43, + "learning_rate": 1.2807044136359358e-05, + "loss": 0.4852, + "step": 2466 + }, + { + "epoch": 0.43, + "learning_rate": 1.2801657337673176e-05, + "loss": 0.4572, + "step": 2467 + }, + { + "epoch": 0.43, + "learning_rate": 1.2796269656620593e-05, + "loss": 0.4676, + "step": 2468 + }, + { + "epoch": 0.43, + "learning_rate": 1.2790881094898428e-05, + "loss": 0.4739, + "step": 2469 + }, + { + "epoch": 0.43, + "learning_rate": 1.2785491654203781e-05, + "loss": 0.465, + "step": 2470 + }, + { + "epoch": 0.43, + "learning_rate": 1.2780101336234024e-05, + "loss": 0.4632, + "step": 2471 + }, + { + "epoch": 0.43, + "learning_rate": 1.277471014268681e-05, + "loss": 0.4684, + "step": 2472 + }, + { + "epoch": 0.43, + "learning_rate": 1.2769318075260064e-05, + "loss": 0.466, + "step": 2473 + }, + { + "epoch": 0.43, + "learning_rate": 1.2763925135651984e-05, + "loss": 0.4665, + "step": 2474 + }, + { + "epoch": 0.43, + "learning_rate": 1.2758531325561055e-05, + "loss": 0.4725, + "step": 2475 + }, + { + "epoch": 0.43, + "learning_rate": 1.275313664668602e-05, + "loss": 0.4775, + "step": 2476 + }, + { + "epoch": 0.43, + "learning_rate": 1.2747741100725906e-05, + "loss": 0.4611, + "step": 2477 + }, + { + "epoch": 0.43, + "learning_rate": 1.274234468938001e-05, + "loss": 0.4672, + "step": 2478 + }, + { + "epoch": 0.43, + "learning_rate": 1.27369474143479e-05, + "loss": 0.4659, + "step": 2479 + }, + { + "epoch": 0.43, + "learning_rate": 1.273154927732942e-05, + "loss": 0.4725, + "step": 2480 + }, + { + "epoch": 0.43, + "learning_rate": 1.2726150280024683e-05, + "loss": 0.4698, + "step": 2481 + }, + { + "epoch": 0.43, + "learning_rate": 1.2720750424134073e-05, + "loss": 0.4636, + "step": 2482 + }, + { + "epoch": 0.43, + "learning_rate": 1.2715349711358245e-05, + "loss": 0.4809, + "step": 2483 + }, + { + "epoch": 0.43, + "learning_rate": 1.270994814339812e-05, + "loss": 0.4763, + "step": 2484 + }, + { + "epoch": 0.43, + "learning_rate": 1.27045457219549e-05, + "loss": 0.476, + "step": 2485 + }, + { + "epoch": 0.43, + "learning_rate": 1.2699142448730037e-05, + "loss": 0.4695, + "step": 2486 + }, + { + "epoch": 0.43, + "learning_rate": 1.2693738325425272e-05, + "loss": 0.4781, + "step": 2487 + }, + { + "epoch": 0.43, + "learning_rate": 1.268833335374259e-05, + "loss": 0.4728, + "step": 2488 + }, + { + "epoch": 0.43, + "learning_rate": 1.2682927535384273e-05, + "loss": 0.4731, + "step": 2489 + }, + { + "epoch": 0.43, + "learning_rate": 1.2677520872052843e-05, + "loss": 0.4733, + "step": 2490 + }, + { + "epoch": 0.43, + "learning_rate": 1.2672113365451102e-05, + "loss": 0.4683, + "step": 2491 + }, + { + "epoch": 0.43, + "learning_rate": 1.2666705017282115e-05, + "loss": 0.4621, + "step": 2492 + }, + { + "epoch": 0.43, + "learning_rate": 1.2661295829249207e-05, + "loss": 0.4603, + "step": 2493 + }, + { + "epoch": 0.43, + "learning_rate": 1.2655885803055978e-05, + "loss": 0.4765, + "step": 2494 + }, + { + "epoch": 0.43, + "learning_rate": 1.2650474940406279e-05, + "loss": 0.465, + "step": 2495 + }, + { + "epoch": 0.43, + "learning_rate": 1.2645063243004236e-05, + "loss": 0.4827, + "step": 2496 + }, + { + "epoch": 0.43, + "learning_rate": 1.263965071255423e-05, + "loss": 0.4617, + "step": 2497 + }, + { + "epoch": 0.43, + "learning_rate": 1.2634237350760912e-05, + "loss": 0.479, + "step": 2498 + }, + { + "epoch": 0.43, + "learning_rate": 1.2628823159329182e-05, + "loss": 0.4593, + "step": 2499 + }, + { + "epoch": 0.43, + "learning_rate": 1.2623408139964216e-05, + "loss": 0.47, + "step": 2500 + }, + { + "epoch": 0.43, + "learning_rate": 1.2617992294371444e-05, + "loss": 0.4627, + "step": 2501 + }, + { + "epoch": 0.43, + "learning_rate": 1.2612575624256552e-05, + "loss": 0.4814, + "step": 2502 + }, + { + "epoch": 0.43, + "learning_rate": 1.2607158131325494e-05, + "loss": 0.4612, + "step": 2503 + }, + { + "epoch": 0.43, + "learning_rate": 1.260173981728448e-05, + "loss": 0.4736, + "step": 2504 + }, + { + "epoch": 0.43, + "learning_rate": 1.2596320683839976e-05, + "loss": 0.4656, + "step": 2505 + }, + { + "epoch": 0.43, + "learning_rate": 1.2590900732698707e-05, + "loss": 0.4845, + "step": 2506 + }, + { + "epoch": 0.43, + "learning_rate": 1.258547996556766e-05, + "loss": 0.4808, + "step": 2507 + }, + { + "epoch": 0.43, + "learning_rate": 1.258005838415407e-05, + "loss": 0.4772, + "step": 2508 + }, + { + "epoch": 0.43, + "learning_rate": 1.2574635990165438e-05, + "loss": 0.4645, + "step": 2509 + }, + { + "epoch": 0.43, + "learning_rate": 1.2569212785309517e-05, + "loss": 0.4762, + "step": 2510 + }, + { + "epoch": 0.43, + "learning_rate": 1.2563788771294316e-05, + "loss": 0.4611, + "step": 2511 + }, + { + "epoch": 0.44, + "learning_rate": 1.2558363949828092e-05, + "loss": 0.4709, + "step": 2512 + }, + { + "epoch": 0.44, + "learning_rate": 1.2552938322619368e-05, + "loss": 0.4674, + "step": 2513 + }, + { + "epoch": 0.44, + "learning_rate": 1.2547511891376916e-05, + "loss": 0.4781, + "step": 2514 + }, + { + "epoch": 0.44, + "learning_rate": 1.2542084657809754e-05, + "loss": 0.471, + "step": 2515 + }, + { + "epoch": 0.44, + "learning_rate": 1.2536656623627167e-05, + "loss": 0.4734, + "step": 2516 + }, + { + "epoch": 0.44, + "learning_rate": 1.2531227790538675e-05, + "loss": 0.4711, + "step": 2517 + }, + { + "epoch": 0.44, + "learning_rate": 1.252579816025407e-05, + "loss": 0.4753, + "step": 2518 + }, + { + "epoch": 0.44, + "learning_rate": 1.2520367734483376e-05, + "loss": 0.4738, + "step": 2519 + }, + { + "epoch": 0.44, + "learning_rate": 1.2514936514936878e-05, + "loss": 0.4843, + "step": 2520 + }, + { + "epoch": 0.44, + "learning_rate": 1.2509504503325106e-05, + "loss": 0.4699, + "step": 2521 + }, + { + "epoch": 0.44, + "learning_rate": 1.2504071701358842e-05, + "loss": 0.464, + "step": 2522 + }, + { + "epoch": 0.44, + "learning_rate": 1.2498638110749122e-05, + "loss": 0.4696, + "step": 2523 + }, + { + "epoch": 0.44, + "learning_rate": 1.2493203733207219e-05, + "loss": 0.4725, + "step": 2524 + }, + { + "epoch": 0.44, + "learning_rate": 1.2487768570444665e-05, + "loss": 0.4738, + "step": 2525 + }, + { + "epoch": 0.44, + "learning_rate": 1.2482332624173227e-05, + "loss": 0.4762, + "step": 2526 + }, + { + "epoch": 0.44, + "learning_rate": 1.2476895896104937e-05, + "loss": 0.4761, + "step": 2527 + }, + { + "epoch": 0.44, + "learning_rate": 1.2471458387952053e-05, + "loss": 0.4705, + "step": 2528 + }, + { + "epoch": 0.44, + "learning_rate": 1.2466020101427092e-05, + "loss": 0.4712, + "step": 2529 + }, + { + "epoch": 0.44, + "learning_rate": 1.246058103824281e-05, + "loss": 0.4657, + "step": 2530 + }, + { + "epoch": 0.44, + "learning_rate": 1.245514120011221e-05, + "loss": 0.4795, + "step": 2531 + }, + { + "epoch": 0.44, + "learning_rate": 1.2449700588748541e-05, + "loss": 0.4813, + "step": 2532 + }, + { + "epoch": 0.44, + "learning_rate": 1.2444259205865295e-05, + "loss": 0.4658, + "step": 2533 + }, + { + "epoch": 0.44, + "learning_rate": 1.2438817053176198e-05, + "loss": 0.4812, + "step": 2534 + }, + { + "epoch": 0.44, + "learning_rate": 1.243337413239523e-05, + "loss": 0.4641, + "step": 2535 + }, + { + "epoch": 0.44, + "learning_rate": 1.2427930445236611e-05, + "loss": 0.4854, + "step": 2536 + }, + { + "epoch": 0.44, + "learning_rate": 1.2422485993414795e-05, + "loss": 0.4503, + "step": 2537 + }, + { + "epoch": 0.44, + "learning_rate": 1.2417040778644487e-05, + "loss": 0.4716, + "step": 2538 + }, + { + "epoch": 0.44, + "learning_rate": 1.2411594802640621e-05, + "loss": 0.4662, + "step": 2539 + }, + { + "epoch": 0.44, + "learning_rate": 1.2406148067118387e-05, + "loss": 0.4983, + "step": 2540 + }, + { + "epoch": 0.44, + "learning_rate": 1.2400700573793191e-05, + "loss": 0.4679, + "step": 2541 + }, + { + "epoch": 0.44, + "learning_rate": 1.2395252324380701e-05, + "loss": 0.4713, + "step": 2542 + }, + { + "epoch": 0.44, + "learning_rate": 1.2389803320596806e-05, + "loss": 0.4604, + "step": 2543 + }, + { + "epoch": 0.44, + "learning_rate": 1.2384353564157646e-05, + "loss": 0.4689, + "step": 2544 + }, + { + "epoch": 0.44, + "learning_rate": 1.2378903056779584e-05, + "loss": 0.4677, + "step": 2545 + }, + { + "epoch": 0.44, + "learning_rate": 1.2373451800179235e-05, + "loss": 0.4716, + "step": 2546 + }, + { + "epoch": 0.44, + "learning_rate": 1.2367999796073436e-05, + "loss": 0.4647, + "step": 2547 + }, + { + "epoch": 0.44, + "learning_rate": 1.2362547046179265e-05, + "loss": 0.4787, + "step": 2548 + }, + { + "epoch": 0.44, + "learning_rate": 1.2357093552214043e-05, + "loss": 0.4702, + "step": 2549 + }, + { + "epoch": 0.44, + "learning_rate": 1.2351639315895309e-05, + "loss": 0.4917, + "step": 2550 + }, + { + "epoch": 0.44, + "learning_rate": 1.2346184338940847e-05, + "loss": 0.4759, + "step": 2551 + }, + { + "epoch": 0.44, + "learning_rate": 1.2340728623068671e-05, + "loss": 0.4709, + "step": 2552 + }, + { + "epoch": 0.44, + "learning_rate": 1.2335272169997034e-05, + "loss": 0.4675, + "step": 2553 + }, + { + "epoch": 0.44, + "learning_rate": 1.232981498144441e-05, + "loss": 0.4636, + "step": 2554 + }, + { + "epoch": 0.44, + "learning_rate": 1.2324357059129512e-05, + "loss": 0.4759, + "step": 2555 + }, + { + "epoch": 0.44, + "learning_rate": 1.231889840477128e-05, + "loss": 0.4661, + "step": 2556 + }, + { + "epoch": 0.44, + "learning_rate": 1.2313439020088889e-05, + "loss": 0.4721, + "step": 2557 + }, + { + "epoch": 0.44, + "learning_rate": 1.2307978906801738e-05, + "loss": 0.4802, + "step": 2558 + }, + { + "epoch": 0.44, + "learning_rate": 1.2302518066629467e-05, + "loss": 0.4725, + "step": 2559 + }, + { + "epoch": 0.44, + "learning_rate": 1.2297056501291932e-05, + "loss": 0.4564, + "step": 2560 + }, + { + "epoch": 0.44, + "learning_rate": 1.2291594212509224e-05, + "loss": 0.4795, + "step": 2561 + }, + { + "epoch": 0.44, + "learning_rate": 1.2286131202001661e-05, + "loss": 0.4759, + "step": 2562 + }, + { + "epoch": 0.44, + "learning_rate": 1.2280667471489784e-05, + "loss": 0.4716, + "step": 2563 + }, + { + "epoch": 0.44, + "learning_rate": 1.2275203022694371e-05, + "loss": 0.4811, + "step": 2564 + }, + { + "epoch": 0.44, + "learning_rate": 1.2269737857336412e-05, + "loss": 0.4767, + "step": 2565 + }, + { + "epoch": 0.44, + "learning_rate": 1.2264271977137136e-05, + "loss": 0.489, + "step": 2566 + }, + { + "epoch": 0.44, + "learning_rate": 1.2258805383817992e-05, + "loss": 0.4769, + "step": 2567 + }, + { + "epoch": 0.44, + "learning_rate": 1.2253338079100652e-05, + "loss": 0.4674, + "step": 2568 + }, + { + "epoch": 0.45, + "learning_rate": 1.224787006470701e-05, + "loss": 0.4674, + "step": 2569 + }, + { + "epoch": 0.45, + "learning_rate": 1.2242401342359188e-05, + "loss": 0.4662, + "step": 2570 + }, + { + "epoch": 0.45, + "learning_rate": 1.2236931913779534e-05, + "loss": 0.4616, + "step": 2571 + }, + { + "epoch": 0.45, + "learning_rate": 1.223146178069061e-05, + "loss": 0.4749, + "step": 2572 + }, + { + "epoch": 0.45, + "learning_rate": 1.2225990944815207e-05, + "loss": 0.4771, + "step": 2573 + }, + { + "epoch": 0.45, + "learning_rate": 1.222051940787633e-05, + "loss": 0.4713, + "step": 2574 + }, + { + "epoch": 0.45, + "learning_rate": 1.2215047171597214e-05, + "loss": 0.4797, + "step": 2575 + }, + { + "epoch": 0.45, + "learning_rate": 1.2209574237701306e-05, + "loss": 0.473, + "step": 2576 + }, + { + "epoch": 0.45, + "learning_rate": 1.2204100607912277e-05, + "loss": 0.472, + "step": 2577 + }, + { + "epoch": 0.45, + "learning_rate": 1.2198626283954016e-05, + "loss": 0.4965, + "step": 2578 + }, + { + "epoch": 0.45, + "learning_rate": 1.2193151267550631e-05, + "loss": 0.4688, + "step": 2579 + }, + { + "epoch": 0.45, + "learning_rate": 1.2187675560426448e-05, + "loss": 0.4699, + "step": 2580 + }, + { + "epoch": 0.45, + "learning_rate": 1.218219916430601e-05, + "loss": 0.4662, + "step": 2581 + }, + { + "epoch": 0.45, + "learning_rate": 1.2176722080914081e-05, + "loss": 0.4808, + "step": 2582 + }, + { + "epoch": 0.45, + "learning_rate": 1.2171244311975635e-05, + "loss": 0.4722, + "step": 2583 + }, + { + "epoch": 0.45, + "learning_rate": 1.2165765859215863e-05, + "loss": 0.477, + "step": 2584 + }, + { + "epoch": 0.45, + "learning_rate": 1.2160286724360177e-05, + "loss": 0.4682, + "step": 2585 + }, + { + "epoch": 0.45, + "learning_rate": 1.2154806909134198e-05, + "loss": 0.4701, + "step": 2586 + }, + { + "epoch": 0.45, + "learning_rate": 1.2149326415263762e-05, + "loss": 0.4716, + "step": 2587 + }, + { + "epoch": 0.45, + "learning_rate": 1.2143845244474925e-05, + "loss": 0.4747, + "step": 2588 + }, + { + "epoch": 0.45, + "learning_rate": 1.2138363398493946e-05, + "loss": 0.4615, + "step": 2589 + }, + { + "epoch": 0.45, + "learning_rate": 1.2132880879047307e-05, + "loss": 0.4785, + "step": 2590 + }, + { + "epoch": 0.45, + "learning_rate": 1.212739768786169e-05, + "loss": 0.4696, + "step": 2591 + }, + { + "epoch": 0.45, + "learning_rate": 1.2121913826664001e-05, + "loss": 0.4786, + "step": 2592 + }, + { + "epoch": 0.45, + "learning_rate": 1.211642929718135e-05, + "loss": 0.4572, + "step": 2593 + }, + { + "epoch": 0.45, + "learning_rate": 1.2110944101141058e-05, + "loss": 0.4768, + "step": 2594 + }, + { + "epoch": 0.45, + "learning_rate": 1.210545824027066e-05, + "loss": 0.48, + "step": 2595 + }, + { + "epoch": 0.45, + "learning_rate": 1.2099971716297896e-05, + "loss": 0.4893, + "step": 2596 + }, + { + "epoch": 0.45, + "learning_rate": 1.2094484530950714e-05, + "loss": 0.4605, + "step": 2597 + }, + { + "epoch": 0.45, + "learning_rate": 1.2088996685957277e-05, + "loss": 0.4703, + "step": 2598 + }, + { + "epoch": 0.45, + "learning_rate": 1.2083508183045947e-05, + "loss": 0.4669, + "step": 2599 + }, + { + "epoch": 0.45, + "learning_rate": 1.2078019023945298e-05, + "loss": 0.4827, + "step": 2600 + }, + { + "epoch": 0.45, + "learning_rate": 1.2072529210384113e-05, + "loss": 0.4603, + "step": 2601 + }, + { + "epoch": 0.45, + "learning_rate": 1.2067038744091375e-05, + "loss": 0.4719, + "step": 2602 + }, + { + "epoch": 0.45, + "learning_rate": 1.2061547626796276e-05, + "loss": 0.492, + "step": 2603 + }, + { + "epoch": 0.45, + "learning_rate": 1.205605586022822e-05, + "loss": 0.4637, + "step": 2604 + }, + { + "epoch": 0.45, + "learning_rate": 1.2050563446116798e-05, + "loss": 0.4739, + "step": 2605 + }, + { + "epoch": 0.45, + "learning_rate": 1.2045070386191822e-05, + "loss": 0.4676, + "step": 2606 + }, + { + "epoch": 0.45, + "learning_rate": 1.2039576682183295e-05, + "loss": 0.4634, + "step": 2607 + }, + { + "epoch": 0.45, + "learning_rate": 1.2034082335821436e-05, + "loss": 0.4728, + "step": 2608 + }, + { + "epoch": 0.45, + "learning_rate": 1.2028587348836653e-05, + "loss": 0.491, + "step": 2609 + }, + { + "epoch": 0.45, + "learning_rate": 1.2023091722959565e-05, + "loss": 0.4605, + "step": 2610 + }, + { + "epoch": 0.45, + "learning_rate": 1.2017595459920985e-05, + "loss": 0.4839, + "step": 2611 + }, + { + "epoch": 0.45, + "learning_rate": 1.2012098561451933e-05, + "loss": 0.4614, + "step": 2612 + }, + { + "epoch": 0.45, + "learning_rate": 1.2006601029283629e-05, + "loss": 0.4768, + "step": 2613 + }, + { + "epoch": 0.45, + "learning_rate": 1.2001102865147485e-05, + "loss": 0.4662, + "step": 2614 + }, + { + "epoch": 0.45, + "learning_rate": 1.199560407077512e-05, + "loss": 0.4783, + "step": 2615 + }, + { + "epoch": 0.45, + "learning_rate": 1.1990104647898349e-05, + "loss": 0.4689, + "step": 2616 + }, + { + "epoch": 0.45, + "learning_rate": 1.1984604598249186e-05, + "loss": 0.4901, + "step": 2617 + }, + { + "epoch": 0.45, + "learning_rate": 1.1979103923559836e-05, + "loss": 0.4675, + "step": 2618 + }, + { + "epoch": 0.45, + "learning_rate": 1.1973602625562712e-05, + "loss": 0.4807, + "step": 2619 + }, + { + "epoch": 0.45, + "learning_rate": 1.1968100705990411e-05, + "loss": 0.4636, + "step": 2620 + }, + { + "epoch": 0.45, + "learning_rate": 1.1962598166575737e-05, + "loss": 0.4763, + "step": 2621 + }, + { + "epoch": 0.45, + "learning_rate": 1.1957095009051683e-05, + "loss": 0.4741, + "step": 2622 + }, + { + "epoch": 0.45, + "learning_rate": 1.1951591235151438e-05, + "loss": 0.466, + "step": 2623 + }, + { + "epoch": 0.45, + "learning_rate": 1.1946086846608383e-05, + "loss": 0.4722, + "step": 2624 + }, + { + "epoch": 0.45, + "learning_rate": 1.1940581845156097e-05, + "loss": 0.4787, + "step": 2625 + }, + { + "epoch": 0.45, + "learning_rate": 1.1935076232528348e-05, + "loss": 0.4593, + "step": 2626 + }, + { + "epoch": 0.46, + "learning_rate": 1.1929570010459096e-05, + "loss": 0.4751, + "step": 2627 + }, + { + "epoch": 0.46, + "learning_rate": 1.19240631806825e-05, + "loss": 0.4698, + "step": 2628 + }, + { + "epoch": 0.46, + "learning_rate": 1.1918555744932905e-05, + "loss": 0.4768, + "step": 2629 + }, + { + "epoch": 0.46, + "learning_rate": 1.1913047704944845e-05, + "loss": 0.477, + "step": 2630 + }, + { + "epoch": 0.46, + "learning_rate": 1.1907539062453044e-05, + "loss": 0.4555, + "step": 2631 + }, + { + "epoch": 0.46, + "learning_rate": 1.1902029819192424e-05, + "loss": 0.4759, + "step": 2632 + }, + { + "epoch": 0.46, + "learning_rate": 1.1896519976898086e-05, + "loss": 0.4771, + "step": 2633 + }, + { + "epoch": 0.46, + "learning_rate": 1.1891009537305326e-05, + "loss": 0.4672, + "step": 2634 + }, + { + "epoch": 0.46, + "learning_rate": 1.1885498502149626e-05, + "loss": 0.474, + "step": 2635 + }, + { + "epoch": 0.46, + "learning_rate": 1.187998687316666e-05, + "loss": 0.4618, + "step": 2636 + }, + { + "epoch": 0.46, + "learning_rate": 1.1874474652092279e-05, + "loss": 0.474, + "step": 2637 + }, + { + "epoch": 0.46, + "learning_rate": 1.1868961840662525e-05, + "loss": 0.4633, + "step": 2638 + }, + { + "epoch": 0.46, + "learning_rate": 1.1863448440613634e-05, + "loss": 0.4843, + "step": 2639 + }, + { + "epoch": 0.46, + "learning_rate": 1.1857934453682016e-05, + "loss": 0.478, + "step": 2640 + }, + { + "epoch": 0.46, + "learning_rate": 1.1852419881604276e-05, + "loss": 0.475, + "step": 2641 + }, + { + "epoch": 0.46, + "learning_rate": 1.1846904726117187e-05, + "loss": 0.4687, + "step": 2642 + }, + { + "epoch": 0.46, + "learning_rate": 1.1841388988957728e-05, + "loss": 0.4864, + "step": 2643 + }, + { + "epoch": 0.46, + "learning_rate": 1.1835872671863042e-05, + "loss": 0.4612, + "step": 2644 + }, + { + "epoch": 0.46, + "learning_rate": 1.183035577657047e-05, + "loss": 0.4755, + "step": 2645 + }, + { + "epoch": 0.46, + "learning_rate": 1.1824838304817521e-05, + "loss": 0.4553, + "step": 2646 + }, + { + "epoch": 0.46, + "learning_rate": 1.1819320258341891e-05, + "loss": 0.494, + "step": 2647 + }, + { + "epoch": 0.46, + "learning_rate": 1.1813801638881466e-05, + "loss": 0.4708, + "step": 2648 + }, + { + "epoch": 0.46, + "learning_rate": 1.1808282448174295e-05, + "loss": 0.479, + "step": 2649 + }, + { + "epoch": 0.46, + "learning_rate": 1.1802762687958624e-05, + "loss": 0.4585, + "step": 2650 + }, + { + "epoch": 0.46, + "learning_rate": 1.1797242359972868e-05, + "loss": 0.4832, + "step": 2651 + }, + { + "epoch": 0.46, + "learning_rate": 1.1791721465955621e-05, + "loss": 0.4615, + "step": 2652 + }, + { + "epoch": 0.46, + "learning_rate": 1.1786200007645662e-05, + "loss": 0.4823, + "step": 2653 + }, + { + "epoch": 0.46, + "learning_rate": 1.1780677986781943e-05, + "loss": 0.4647, + "step": 2654 + }, + { + "epoch": 0.46, + "learning_rate": 1.177515540510359e-05, + "loss": 0.4725, + "step": 2655 + }, + { + "epoch": 0.46, + "learning_rate": 1.1769632264349914e-05, + "loss": 0.461, + "step": 2656 + }, + { + "epoch": 0.46, + "learning_rate": 1.1764108566260392e-05, + "loss": 0.478, + "step": 2657 + }, + { + "epoch": 0.46, + "learning_rate": 1.1758584312574693e-05, + "loss": 0.4814, + "step": 2658 + }, + { + "epoch": 0.46, + "learning_rate": 1.1753059505032636e-05, + "loss": 0.4764, + "step": 2659 + }, + { + "epoch": 0.46, + "learning_rate": 1.1747534145374236e-05, + "loss": 0.4802, + "step": 2660 + }, + { + "epoch": 0.46, + "learning_rate": 1.1742008235339677e-05, + "loss": 0.4746, + "step": 2661 + }, + { + "epoch": 0.46, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.465, + "step": 2662 + }, + { + "epoch": 0.46, + "learning_rate": 1.1730954771103653e-05, + "loss": 0.4773, + "step": 2663 + }, + { + "epoch": 0.46, + "learning_rate": 1.1725427220383421e-05, + "loss": 0.482, + "step": 2664 + }, + { + "epoch": 0.46, + "learning_rate": 1.1719899126249482e-05, + "loss": 0.4719, + "step": 2665 + }, + { + "epoch": 0.46, + "learning_rate": 1.1714370490442872e-05, + "loss": 0.4558, + "step": 2666 + }, + { + "epoch": 0.46, + "learning_rate": 1.1708841314704811e-05, + "loss": 0.4749, + "step": 2667 + }, + { + "epoch": 0.46, + "learning_rate": 1.1703311600776677e-05, + "loss": 0.4726, + "step": 2668 + }, + { + "epoch": 0.46, + "learning_rate": 1.1697781350400025e-05, + "loss": 0.4765, + "step": 2669 + }, + { + "epoch": 0.46, + "learning_rate": 1.1692250565316577e-05, + "loss": 0.4621, + "step": 2670 + }, + { + "epoch": 0.46, + "learning_rate": 1.1686719247268221e-05, + "loss": 0.477, + "step": 2671 + }, + { + "epoch": 0.46, + "learning_rate": 1.1681187397997018e-05, + "loss": 0.4871, + "step": 2672 + }, + { + "epoch": 0.46, + "learning_rate": 1.1675655019245191e-05, + "loss": 0.4632, + "step": 2673 + }, + { + "epoch": 0.46, + "learning_rate": 1.1670122112755134e-05, + "loss": 0.4695, + "step": 2674 + }, + { + "epoch": 0.46, + "learning_rate": 1.1664588680269403e-05, + "loss": 0.4929, + "step": 2675 + }, + { + "epoch": 0.46, + "learning_rate": 1.1659054723530721e-05, + "loss": 0.4679, + "step": 2676 + }, + { + "epoch": 0.46, + "learning_rate": 1.1653520244281975e-05, + "loss": 0.4802, + "step": 2677 + }, + { + "epoch": 0.46, + "learning_rate": 1.1647985244266226e-05, + "loss": 0.4704, + "step": 2678 + }, + { + "epoch": 0.46, + "learning_rate": 1.1642449725226685e-05, + "loss": 0.4741, + "step": 2679 + }, + { + "epoch": 0.46, + "learning_rate": 1.1636913688906739e-05, + "loss": 0.4736, + "step": 2680 + }, + { + "epoch": 0.46, + "learning_rate": 1.1631377137049925e-05, + "loss": 0.4716, + "step": 2681 + }, + { + "epoch": 0.46, + "learning_rate": 1.1625840071399952e-05, + "loss": 0.4609, + "step": 2682 + }, + { + "epoch": 0.46, + "learning_rate": 1.1620302493700689e-05, + "loss": 0.479, + "step": 2683 + }, + { + "epoch": 0.46, + "learning_rate": 1.1614764405696162e-05, + "loss": 0.4727, + "step": 2684 + }, + { + "epoch": 0.47, + "learning_rate": 1.1609225809130566e-05, + "loss": 0.4828, + "step": 2685 + }, + { + "epoch": 0.47, + "learning_rate": 1.1603686705748247e-05, + "loss": 0.4744, + "step": 2686 + }, + { + "epoch": 0.47, + "learning_rate": 1.1598147097293721e-05, + "loss": 0.4803, + "step": 2687 + }, + { + "epoch": 0.47, + "learning_rate": 1.1592606985511648e-05, + "loss": 0.4444, + "step": 2688 + }, + { + "epoch": 0.47, + "learning_rate": 1.1587066372146863e-05, + "loss": 0.4755, + "step": 2689 + }, + { + "epoch": 0.47, + "learning_rate": 1.1581525258944346e-05, + "loss": 0.4771, + "step": 2690 + }, + { + "epoch": 0.47, + "learning_rate": 1.1575983647649243e-05, + "loss": 0.4714, + "step": 2691 + }, + { + "epoch": 0.47, + "learning_rate": 1.1570441540006849e-05, + "loss": 0.4707, + "step": 2692 + }, + { + "epoch": 0.47, + "learning_rate": 1.1564898937762627e-05, + "loss": 0.4846, + "step": 2693 + }, + { + "epoch": 0.47, + "learning_rate": 1.1559355842662188e-05, + "loss": 0.4755, + "step": 2694 + }, + { + "epoch": 0.47, + "learning_rate": 1.155381225645129e-05, + "loss": 0.4822, + "step": 2695 + }, + { + "epoch": 0.47, + "learning_rate": 1.1548268180875868e-05, + "loss": 0.4682, + "step": 2696 + }, + { + "epoch": 0.47, + "learning_rate": 1.1542723617681989e-05, + "loss": 0.4668, + "step": 2697 + }, + { + "epoch": 0.47, + "learning_rate": 1.1537178568615879e-05, + "loss": 0.4676, + "step": 2698 + }, + { + "epoch": 0.47, + "learning_rate": 1.1531633035423931e-05, + "loss": 0.4818, + "step": 2699 + }, + { + "epoch": 0.47, + "learning_rate": 1.1526087019852673e-05, + "loss": 0.4701, + "step": 2700 + }, + { + "epoch": 0.47, + "learning_rate": 1.152054052364879e-05, + "loss": 0.4807, + "step": 2701 + }, + { + "epoch": 0.47, + "learning_rate": 1.151499354855913e-05, + "loss": 0.4669, + "step": 2702 + }, + { + "epoch": 0.47, + "learning_rate": 1.150944609633067e-05, + "loss": 0.469, + "step": 2703 + }, + { + "epoch": 0.47, + "learning_rate": 1.1503898168710555e-05, + "loss": 0.4634, + "step": 2704 + }, + { + "epoch": 0.47, + "learning_rate": 1.1498349767446072e-05, + "loss": 0.4728, + "step": 2705 + }, + { + "epoch": 0.47, + "learning_rate": 1.1492800894284664e-05, + "loss": 0.4676, + "step": 2706 + }, + { + "epoch": 0.47, + "learning_rate": 1.1487251550973914e-05, + "loss": 0.4762, + "step": 2707 + }, + { + "epoch": 0.47, + "learning_rate": 1.1481701739261557e-05, + "loss": 0.4664, + "step": 2708 + }, + { + "epoch": 0.47, + "learning_rate": 1.1476151460895476e-05, + "loss": 0.4752, + "step": 2709 + }, + { + "epoch": 0.47, + "learning_rate": 1.1470600717623699e-05, + "loss": 0.4611, + "step": 2710 + }, + { + "epoch": 0.47, + "learning_rate": 1.1465049511194404e-05, + "loss": 0.4794, + "step": 2711 + }, + { + "epoch": 0.47, + "learning_rate": 1.1459497843355907e-05, + "loss": 0.473, + "step": 2712 + }, + { + "epoch": 0.47, + "learning_rate": 1.1453945715856682e-05, + "loss": 0.4751, + "step": 2713 + }, + { + "epoch": 0.47, + "learning_rate": 1.1448393130445337e-05, + "loss": 0.4613, + "step": 2714 + }, + { + "epoch": 0.47, + "learning_rate": 1.1442840088870628e-05, + "loss": 0.4664, + "step": 2715 + }, + { + "epoch": 0.47, + "learning_rate": 1.1437286592881458e-05, + "loss": 0.4638, + "step": 2716 + }, + { + "epoch": 0.47, + "learning_rate": 1.1431732644226865e-05, + "loss": 0.4743, + "step": 2717 + }, + { + "epoch": 0.47, + "learning_rate": 1.1426178244656038e-05, + "loss": 0.4629, + "step": 2718 + }, + { + "epoch": 0.47, + "learning_rate": 1.1420623395918297e-05, + "loss": 0.4804, + "step": 2719 + }, + { + "epoch": 0.47, + "learning_rate": 1.1415068099763123e-05, + "loss": 0.4752, + "step": 2720 + }, + { + "epoch": 0.47, + "learning_rate": 1.1409512357940114e-05, + "loss": 0.4806, + "step": 2721 + }, + { + "epoch": 0.47, + "learning_rate": 1.140395617219903e-05, + "loss": 0.46, + "step": 2722 + }, + { + "epoch": 0.47, + "learning_rate": 1.1398399544289751e-05, + "loss": 0.4777, + "step": 2723 + }, + { + "epoch": 0.47, + "learning_rate": 1.1392842475962311e-05, + "loss": 0.4726, + "step": 2724 + }, + { + "epoch": 0.47, + "learning_rate": 1.1387284968966879e-05, + "loss": 0.4657, + "step": 2725 + }, + { + "epoch": 0.47, + "learning_rate": 1.1381727025053758e-05, + "loss": 0.4499, + "step": 2726 + }, + { + "epoch": 0.47, + "learning_rate": 1.1376168645973393e-05, + "loss": 0.4751, + "step": 2727 + }, + { + "epoch": 0.47, + "learning_rate": 1.1370609833476365e-05, + "loss": 0.46, + "step": 2728 + }, + { + "epoch": 0.47, + "learning_rate": 1.136505058931339e-05, + "loss": 0.47, + "step": 2729 + }, + { + "epoch": 0.47, + "learning_rate": 1.1359490915235323e-05, + "loss": 0.462, + "step": 2730 + }, + { + "epoch": 0.47, + "learning_rate": 1.135393081299315e-05, + "loss": 0.4797, + "step": 2731 + }, + { + "epoch": 0.47, + "learning_rate": 1.1348370284337996e-05, + "loss": 0.4655, + "step": 2732 + }, + { + "epoch": 0.47, + "learning_rate": 1.1342809331021117e-05, + "loss": 0.4653, + "step": 2733 + }, + { + "epoch": 0.47, + "learning_rate": 1.1337247954793904e-05, + "loss": 0.4585, + "step": 2734 + }, + { + "epoch": 0.47, + "learning_rate": 1.1331686157407887e-05, + "loss": 0.4755, + "step": 2735 + }, + { + "epoch": 0.47, + "learning_rate": 1.1326123940614715e-05, + "loss": 0.4748, + "step": 2736 + }, + { + "epoch": 0.47, + "learning_rate": 1.1320561306166182e-05, + "loss": 0.4718, + "step": 2737 + }, + { + "epoch": 0.47, + "learning_rate": 1.131499825581421e-05, + "loss": 0.4608, + "step": 2738 + }, + { + "epoch": 0.47, + "learning_rate": 1.1309434791310848e-05, + "loss": 0.4723, + "step": 2739 + }, + { + "epoch": 0.47, + "learning_rate": 1.1303870914408277e-05, + "loss": 0.4694, + "step": 2740 + }, + { + "epoch": 0.47, + "learning_rate": 1.1298306626858811e-05, + "loss": 0.4756, + "step": 2741 + }, + { + "epoch": 0.48, + "learning_rate": 1.1292741930414894e-05, + "loss": 0.471, + "step": 2742 + }, + { + "epoch": 0.48, + "learning_rate": 1.128717682682909e-05, + "loss": 0.4806, + "step": 2743 + }, + { + "epoch": 0.48, + "learning_rate": 1.1281611317854107e-05, + "loss": 0.4507, + "step": 2744 + }, + { + "epoch": 0.48, + "learning_rate": 1.1276045405242761e-05, + "loss": 0.4736, + "step": 2745 + }, + { + "epoch": 0.48, + "learning_rate": 1.127047909074801e-05, + "loss": 0.462, + "step": 2746 + }, + { + "epoch": 0.48, + "learning_rate": 1.1264912376122931e-05, + "loss": 0.471, + "step": 2747 + }, + { + "epoch": 0.48, + "learning_rate": 1.1259345263120738e-05, + "loss": 0.4628, + "step": 2748 + }, + { + "epoch": 0.48, + "learning_rate": 1.1253777753494753e-05, + "loss": 0.4649, + "step": 2749 + }, + { + "epoch": 0.48, + "learning_rate": 1.1248209848998433e-05, + "loss": 0.4619, + "step": 2750 + }, + { + "epoch": 0.48, + "learning_rate": 1.1242641551385365e-05, + "loss": 0.4657, + "step": 2751 + }, + { + "epoch": 0.48, + "learning_rate": 1.1237072862409248e-05, + "loss": 0.4603, + "step": 2752 + }, + { + "epoch": 0.48, + "learning_rate": 1.1231503783823914e-05, + "loss": 0.4749, + "step": 2753 + }, + { + "epoch": 0.48, + "learning_rate": 1.1225934317383305e-05, + "loss": 0.464, + "step": 2754 + }, + { + "epoch": 0.48, + "learning_rate": 1.1220364464841502e-05, + "loss": 0.4737, + "step": 2755 + }, + { + "epoch": 0.48, + "learning_rate": 1.1214794227952694e-05, + "loss": 0.4646, + "step": 2756 + }, + { + "epoch": 0.48, + "learning_rate": 1.1209223608471202e-05, + "loss": 0.4833, + "step": 2757 + }, + { + "epoch": 0.48, + "learning_rate": 1.1203652608151456e-05, + "loss": 0.4797, + "step": 2758 + }, + { + "epoch": 0.48, + "learning_rate": 1.1198081228748012e-05, + "loss": 0.4794, + "step": 2759 + }, + { + "epoch": 0.48, + "learning_rate": 1.1192509472015549e-05, + "loss": 0.473, + "step": 2760 + }, + { + "epoch": 0.48, + "learning_rate": 1.1186937339708856e-05, + "loss": 0.4608, + "step": 2761 + }, + { + "epoch": 0.48, + "learning_rate": 1.1181364833582848e-05, + "loss": 0.472, + "step": 2762 + }, + { + "epoch": 0.48, + "learning_rate": 1.1175791955392552e-05, + "loss": 0.477, + "step": 2763 + }, + { + "epoch": 0.48, + "learning_rate": 1.1170218706893121e-05, + "loss": 0.4608, + "step": 2764 + }, + { + "epoch": 0.48, + "learning_rate": 1.1164645089839812e-05, + "loss": 0.4727, + "step": 2765 + }, + { + "epoch": 0.48, + "learning_rate": 1.1159071105988012e-05, + "loss": 0.4593, + "step": 2766 + }, + { + "epoch": 0.48, + "learning_rate": 1.1153496757093205e-05, + "loss": 0.4681, + "step": 2767 + }, + { + "epoch": 0.48, + "learning_rate": 1.114792204491101e-05, + "loss": 0.4665, + "step": 2768 + }, + { + "epoch": 0.48, + "learning_rate": 1.1142346971197151e-05, + "loss": 0.4654, + "step": 2769 + }, + { + "epoch": 0.48, + "learning_rate": 1.1136771537707464e-05, + "loss": 0.4698, + "step": 2770 + }, + { + "epoch": 0.48, + "learning_rate": 1.1131195746197902e-05, + "loss": 0.4699, + "step": 2771 + }, + { + "epoch": 0.48, + "learning_rate": 1.1125619598424528e-05, + "loss": 0.482, + "step": 2772 + }, + { + "epoch": 0.48, + "learning_rate": 1.1120043096143523e-05, + "loss": 0.48, + "step": 2773 + }, + { + "epoch": 0.48, + "learning_rate": 1.1114466241111168e-05, + "loss": 0.4747, + "step": 2774 + }, + { + "epoch": 0.48, + "learning_rate": 1.110888903508387e-05, + "loss": 0.4621, + "step": 2775 + }, + { + "epoch": 0.48, + "learning_rate": 1.1103311479818133e-05, + "loss": 0.4847, + "step": 2776 + }, + { + "epoch": 0.48, + "learning_rate": 1.1097733577070585e-05, + "loss": 0.4682, + "step": 2777 + }, + { + "epoch": 0.48, + "learning_rate": 1.1092155328597945e-05, + "loss": 0.4803, + "step": 2778 + }, + { + "epoch": 0.48, + "learning_rate": 1.108657673615706e-05, + "loss": 0.4698, + "step": 2779 + }, + { + "epoch": 0.48, + "learning_rate": 1.1080997801504872e-05, + "loss": 0.4759, + "step": 2780 + }, + { + "epoch": 0.48, + "learning_rate": 1.1075418526398435e-05, + "loss": 0.4704, + "step": 2781 + }, + { + "epoch": 0.48, + "learning_rate": 1.1069838912594914e-05, + "loss": 0.4643, + "step": 2782 + }, + { + "epoch": 0.48, + "learning_rate": 1.1064258961851575e-05, + "loss": 0.4732, + "step": 2783 + }, + { + "epoch": 0.48, + "learning_rate": 1.1058678675925796e-05, + "loss": 0.4784, + "step": 2784 + }, + { + "epoch": 0.48, + "learning_rate": 1.105309805657505e-05, + "loss": 0.4694, + "step": 2785 + }, + { + "epoch": 0.48, + "learning_rate": 1.1047517105556933e-05, + "loss": 0.4724, + "step": 2786 + }, + { + "epoch": 0.48, + "learning_rate": 1.1041935824629121e-05, + "loss": 0.4654, + "step": 2787 + }, + { + "epoch": 0.48, + "learning_rate": 1.1036354215549422e-05, + "loss": 0.4745, + "step": 2788 + }, + { + "epoch": 0.48, + "learning_rate": 1.1030772280075714e-05, + "loss": 0.4658, + "step": 2789 + }, + { + "epoch": 0.48, + "learning_rate": 1.1025190019966017e-05, + "loss": 0.4662, + "step": 2790 + }, + { + "epoch": 0.48, + "learning_rate": 1.1019607436978419e-05, + "loss": 0.461, + "step": 2791 + }, + { + "epoch": 0.48, + "learning_rate": 1.1014024532871128e-05, + "loss": 0.473, + "step": 2792 + }, + { + "epoch": 0.48, + "learning_rate": 1.1008441309402448e-05, + "loss": 0.4741, + "step": 2793 + }, + { + "epoch": 0.48, + "learning_rate": 1.1002857768330786e-05, + "loss": 0.4655, + "step": 2794 + }, + { + "epoch": 0.48, + "learning_rate": 1.0997273911414648e-05, + "loss": 0.4662, + "step": 2795 + }, + { + "epoch": 0.48, + "learning_rate": 1.099168974041263e-05, + "loss": 0.4602, + "step": 2796 + }, + { + "epoch": 0.48, + "learning_rate": 1.0986105257083446e-05, + "loss": 0.4546, + "step": 2797 + }, + { + "epoch": 0.48, + "learning_rate": 1.0980520463185894e-05, + "loss": 0.4846, + "step": 2798 + }, + { + "epoch": 0.48, + "learning_rate": 1.0974935360478875e-05, + "loss": 0.4559, + "step": 2799 + }, + { + "epoch": 0.49, + "learning_rate": 1.0969349950721382e-05, + "loss": 0.4704, + "step": 2800 + }, + { + "epoch": 0.49, + "learning_rate": 1.0963764235672516e-05, + "loss": 0.4658, + "step": 2801 + }, + { + "epoch": 0.49, + "learning_rate": 1.0958178217091455e-05, + "loss": 0.4947, + "step": 2802 + }, + { + "epoch": 0.49, + "learning_rate": 1.0952591896737499e-05, + "loss": 0.4545, + "step": 2803 + }, + { + "epoch": 0.49, + "learning_rate": 1.094700527637002e-05, + "loss": 0.4725, + "step": 2804 + }, + { + "epoch": 0.49, + "learning_rate": 1.0941418357748493e-05, + "loss": 0.4617, + "step": 2805 + }, + { + "epoch": 0.49, + "learning_rate": 1.0935831142632489e-05, + "loss": 0.4739, + "step": 2806 + }, + { + "epoch": 0.49, + "learning_rate": 1.0930243632781669e-05, + "loss": 0.4565, + "step": 2807 + }, + { + "epoch": 0.49, + "learning_rate": 1.0924655829955793e-05, + "loss": 0.472, + "step": 2808 + }, + { + "epoch": 0.49, + "learning_rate": 1.09190677359147e-05, + "loss": 0.4593, + "step": 2809 + }, + { + "epoch": 0.49, + "learning_rate": 1.0913479352418336e-05, + "loss": 0.4592, + "step": 2810 + }, + { + "epoch": 0.49, + "learning_rate": 1.0907890681226728e-05, + "loss": 0.474, + "step": 2811 + }, + { + "epoch": 0.49, + "learning_rate": 1.09023017241e-05, + "loss": 0.4666, + "step": 2812 + }, + { + "epoch": 0.49, + "learning_rate": 1.0896712482798358e-05, + "loss": 0.4696, + "step": 2813 + }, + { + "epoch": 0.49, + "learning_rate": 1.0891122959082108e-05, + "loss": 0.4647, + "step": 2814 + }, + { + "epoch": 0.49, + "learning_rate": 1.0885533154711633e-05, + "loss": 0.4632, + "step": 2815 + }, + { + "epoch": 0.49, + "learning_rate": 1.0879943071447417e-05, + "loss": 0.4635, + "step": 2816 + }, + { + "epoch": 0.49, + "learning_rate": 1.087435271105002e-05, + "loss": 0.4699, + "step": 2817 + }, + { + "epoch": 0.49, + "learning_rate": 1.0868762075280102e-05, + "loss": 0.4664, + "step": 2818 + }, + { + "epoch": 0.49, + "learning_rate": 1.0863171165898398e-05, + "loss": 0.4585, + "step": 2819 + }, + { + "epoch": 0.49, + "learning_rate": 1.0857579984665733e-05, + "loss": 0.4537, + "step": 2820 + }, + { + "epoch": 0.49, + "learning_rate": 1.0851988533343022e-05, + "loss": 0.4682, + "step": 2821 + }, + { + "epoch": 0.49, + "learning_rate": 1.0846396813691258e-05, + "loss": 0.4823, + "step": 2822 + }, + { + "epoch": 0.49, + "learning_rate": 1.0840804827471523e-05, + "loss": 0.4675, + "step": 2823 + }, + { + "epoch": 0.49, + "learning_rate": 1.0835212576444983e-05, + "loss": 0.4673, + "step": 2824 + }, + { + "epoch": 0.49, + "learning_rate": 1.0829620062372887e-05, + "loss": 0.4716, + "step": 2825 + }, + { + "epoch": 0.49, + "learning_rate": 1.0824027287016566e-05, + "loss": 0.4714, + "step": 2826 + }, + { + "epoch": 0.49, + "learning_rate": 1.0818434252137428e-05, + "loss": 0.4608, + "step": 2827 + }, + { + "epoch": 0.49, + "learning_rate": 1.0812840959496978e-05, + "loss": 0.4751, + "step": 2828 + }, + { + "epoch": 0.49, + "learning_rate": 1.0807247410856783e-05, + "loss": 0.4667, + "step": 2829 + }, + { + "epoch": 0.49, + "learning_rate": 1.0801653607978506e-05, + "loss": 0.4661, + "step": 2830 + }, + { + "epoch": 0.49, + "learning_rate": 1.0796059552623881e-05, + "loss": 0.4628, + "step": 2831 + }, + { + "epoch": 0.49, + "learning_rate": 1.0790465246554728e-05, + "loss": 0.48, + "step": 2832 + }, + { + "epoch": 0.49, + "learning_rate": 1.078487069153294e-05, + "loss": 0.4635, + "step": 2833 + }, + { + "epoch": 0.49, + "learning_rate": 1.0779275889320495e-05, + "loss": 0.4781, + "step": 2834 + }, + { + "epoch": 0.49, + "learning_rate": 1.0773680841679437e-05, + "loss": 0.4668, + "step": 2835 + }, + { + "epoch": 0.49, + "learning_rate": 1.0768085550371902e-05, + "loss": 0.4714, + "step": 2836 + }, + { + "epoch": 0.49, + "learning_rate": 1.076249001716009e-05, + "loss": 0.459, + "step": 2837 + }, + { + "epoch": 0.49, + "learning_rate": 1.0756894243806291e-05, + "loss": 0.4767, + "step": 2838 + }, + { + "epoch": 0.49, + "learning_rate": 1.0751298232072856e-05, + "loss": 0.4708, + "step": 2839 + }, + { + "epoch": 0.49, + "learning_rate": 1.0745701983722219e-05, + "loss": 0.4797, + "step": 2840 + }, + { + "epoch": 0.49, + "learning_rate": 1.0740105500516889e-05, + "loss": 0.4657, + "step": 2841 + }, + { + "epoch": 0.49, + "learning_rate": 1.0734508784219446e-05, + "loss": 0.4839, + "step": 2842 + }, + { + "epoch": 0.49, + "learning_rate": 1.0728911836592548e-05, + "loss": 0.4604, + "step": 2843 + }, + { + "epoch": 0.49, + "learning_rate": 1.0723314659398916e-05, + "loss": 0.4856, + "step": 2844 + }, + { + "epoch": 0.49, + "learning_rate": 1.0717717254401356e-05, + "loss": 0.4612, + "step": 2845 + }, + { + "epoch": 0.49, + "learning_rate": 1.0712119623362738e-05, + "loss": 0.4742, + "step": 2846 + }, + { + "epoch": 0.49, + "learning_rate": 1.0706521768046006e-05, + "loss": 0.4681, + "step": 2847 + }, + { + "epoch": 0.49, + "learning_rate": 1.0700923690214166e-05, + "loss": 0.4755, + "step": 2848 + }, + { + "epoch": 0.49, + "learning_rate": 1.0695325391630309e-05, + "loss": 0.4642, + "step": 2849 + }, + { + "epoch": 0.49, + "learning_rate": 1.0689726874057589e-05, + "loss": 0.4708, + "step": 2850 + }, + { + "epoch": 0.49, + "learning_rate": 1.068412813925922e-05, + "loss": 0.4728, + "step": 2851 + }, + { + "epoch": 0.49, + "learning_rate": 1.06785291889985e-05, + "loss": 0.4745, + "step": 2852 + }, + { + "epoch": 0.49, + "learning_rate": 1.0672930025038783e-05, + "loss": 0.4644, + "step": 2853 + }, + { + "epoch": 0.49, + "learning_rate": 1.0667330649143498e-05, + "loss": 0.4762, + "step": 2854 + }, + { + "epoch": 0.49, + "learning_rate": 1.0661731063076134e-05, + "loss": 0.4733, + "step": 2855 + }, + { + "epoch": 0.49, + "learning_rate": 1.0656131268600254e-05, + "loss": 0.4657, + "step": 2856 + }, + { + "epoch": 0.49, + "learning_rate": 1.0650531267479477e-05, + "loss": 0.4802, + "step": 2857 + }, + { + "epoch": 0.5, + "learning_rate": 1.0644931061477492e-05, + "loss": 0.4758, + "step": 2858 + }, + { + "epoch": 0.5, + "learning_rate": 1.0639330652358058e-05, + "loss": 0.471, + "step": 2859 + }, + { + "epoch": 0.5, + "learning_rate": 1.0633730041884988e-05, + "loss": 0.4702, + "step": 2860 + }, + { + "epoch": 0.5, + "learning_rate": 1.0628129231822166e-05, + "loss": 0.4673, + "step": 2861 + }, + { + "epoch": 0.5, + "learning_rate": 1.0622528223933534e-05, + "loss": 0.4733, + "step": 2862 + }, + { + "epoch": 0.5, + "learning_rate": 1.06169270199831e-05, + "loss": 0.4704, + "step": 2863 + }, + { + "epoch": 0.5, + "learning_rate": 1.061132562173493e-05, + "loss": 0.4627, + "step": 2864 + }, + { + "epoch": 0.5, + "learning_rate": 1.0605724030953155e-05, + "loss": 0.4642, + "step": 2865 + }, + { + "epoch": 0.5, + "learning_rate": 1.0600122249401965e-05, + "loss": 0.4826, + "step": 2866 + }, + { + "epoch": 0.5, + "learning_rate": 1.059452027884561e-05, + "loss": 0.4649, + "step": 2867 + }, + { + "epoch": 0.5, + "learning_rate": 1.0588918121048396e-05, + "loss": 0.473, + "step": 2868 + }, + { + "epoch": 0.5, + "learning_rate": 1.0583315777774697e-05, + "loss": 0.4647, + "step": 2869 + }, + { + "epoch": 0.5, + "learning_rate": 1.0577713250788935e-05, + "loss": 0.4806, + "step": 2870 + }, + { + "epoch": 0.5, + "learning_rate": 1.0572110541855596e-05, + "loss": 0.4572, + "step": 2871 + }, + { + "epoch": 0.5, + "learning_rate": 1.0566507652739224e-05, + "loss": 0.4676, + "step": 2872 + }, + { + "epoch": 0.5, + "learning_rate": 1.056090458520442e-05, + "loss": 0.4635, + "step": 2873 + }, + { + "epoch": 0.5, + "learning_rate": 1.0555301341015832e-05, + "loss": 0.4789, + "step": 2874 + }, + { + "epoch": 0.5, + "learning_rate": 1.0549697921938172e-05, + "loss": 0.4621, + "step": 2875 + }, + { + "epoch": 0.5, + "learning_rate": 1.0544094329736213e-05, + "loss": 0.4799, + "step": 2876 + }, + { + "epoch": 0.5, + "learning_rate": 1.0538490566174766e-05, + "loss": 0.4554, + "step": 2877 + }, + { + "epoch": 0.5, + "learning_rate": 1.0532886633018711e-05, + "loss": 0.478, + "step": 2878 + }, + { + "epoch": 0.5, + "learning_rate": 1.052728253203297e-05, + "loss": 0.4675, + "step": 2879 + }, + { + "epoch": 0.5, + "learning_rate": 1.0521678264982534e-05, + "loss": 0.4751, + "step": 2880 + }, + { + "epoch": 0.5, + "learning_rate": 1.0516073833632424e-05, + "loss": 0.4759, + "step": 2881 + }, + { + "epoch": 0.5, + "learning_rate": 1.0510469239747731e-05, + "loss": 0.4729, + "step": 2882 + }, + { + "epoch": 0.5, + "learning_rate": 1.0504864485093588e-05, + "loss": 0.4586, + "step": 2883 + }, + { + "epoch": 0.5, + "learning_rate": 1.0499259571435185e-05, + "loss": 0.4834, + "step": 2884 + }, + { + "epoch": 0.5, + "learning_rate": 1.0493654500537756e-05, + "loss": 0.4687, + "step": 2885 + }, + { + "epoch": 0.5, + "learning_rate": 1.0488049274166583e-05, + "loss": 0.458, + "step": 2886 + }, + { + "epoch": 0.5, + "learning_rate": 1.0482443894087007e-05, + "loss": 0.4751, + "step": 2887 + }, + { + "epoch": 0.5, + "learning_rate": 1.0476838362064408e-05, + "loss": 0.4803, + "step": 2888 + }, + { + "epoch": 0.5, + "learning_rate": 1.047123267986422e-05, + "loss": 0.4636, + "step": 2889 + }, + { + "epoch": 0.5, + "learning_rate": 1.0465626849251919e-05, + "loss": 0.4917, + "step": 2890 + }, + { + "epoch": 0.5, + "learning_rate": 1.046002087199303e-05, + "loss": 0.4683, + "step": 2891 + }, + { + "epoch": 0.5, + "learning_rate": 1.0454414749853126e-05, + "loss": 0.4804, + "step": 2892 + }, + { + "epoch": 0.5, + "learning_rate": 1.0448808484597821e-05, + "loss": 0.4634, + "step": 2893 + }, + { + "epoch": 0.5, + "learning_rate": 1.044320207799278e-05, + "loss": 0.4697, + "step": 2894 + }, + { + "epoch": 0.5, + "learning_rate": 1.0437595531803713e-05, + "loss": 0.4658, + "step": 2895 + }, + { + "epoch": 0.5, + "learning_rate": 1.0431988847796361e-05, + "loss": 0.4785, + "step": 2896 + }, + { + "epoch": 0.5, + "learning_rate": 1.0426382027736524e-05, + "loss": 0.4607, + "step": 2897 + }, + { + "epoch": 0.5, + "learning_rate": 1.042077507339004e-05, + "loss": 0.4539, + "step": 2898 + }, + { + "epoch": 0.5, + "learning_rate": 1.0415167986522785e-05, + "loss": 0.4563, + "step": 2899 + }, + { + "epoch": 0.5, + "learning_rate": 1.040956076890068e-05, + "loss": 0.4754, + "step": 2900 + }, + { + "epoch": 0.5, + "learning_rate": 1.0403953422289687e-05, + "loss": 0.4598, + "step": 2901 + }, + { + "epoch": 0.5, + "learning_rate": 1.0398345948455815e-05, + "loss": 0.4637, + "step": 2902 + }, + { + "epoch": 0.5, + "learning_rate": 1.0392738349165097e-05, + "loss": 0.4655, + "step": 2903 + }, + { + "epoch": 0.5, + "learning_rate": 1.038713062618362e-05, + "loss": 0.4774, + "step": 2904 + }, + { + "epoch": 0.5, + "learning_rate": 1.0381522781277506e-05, + "loss": 0.4659, + "step": 2905 + }, + { + "epoch": 0.5, + "learning_rate": 1.0375914816212913e-05, + "loss": 0.4837, + "step": 2906 + }, + { + "epoch": 0.5, + "learning_rate": 1.0370306732756037e-05, + "loss": 0.4699, + "step": 2907 + }, + { + "epoch": 0.5, + "learning_rate": 1.0364698532673117e-05, + "loss": 0.4766, + "step": 2908 + }, + { + "epoch": 0.5, + "learning_rate": 1.035909021773042e-05, + "loss": 0.4821, + "step": 2909 + }, + { + "epoch": 0.5, + "learning_rate": 1.0353481789694258e-05, + "loss": 0.4764, + "step": 2910 + }, + { + "epoch": 0.5, + "learning_rate": 1.0347873250330971e-05, + "loss": 0.4531, + "step": 2911 + }, + { + "epoch": 0.5, + "learning_rate": 1.0342264601406936e-05, + "loss": 0.4722, + "step": 2912 + }, + { + "epoch": 0.5, + "learning_rate": 1.0336655844688571e-05, + "loss": 0.4608, + "step": 2913 + }, + { + "epoch": 0.5, + "learning_rate": 1.0331046981942311e-05, + "loss": 0.4781, + "step": 2914 + }, + { + "epoch": 0.5, + "learning_rate": 1.0325438014934655e-05, + "loss": 0.46, + "step": 2915 + }, + { + "epoch": 0.51, + "learning_rate": 1.03198289454321e-05, + "loss": 0.4754, + "step": 2916 + }, + { + "epoch": 0.51, + "learning_rate": 1.0314219775201198e-05, + "loss": 0.4663, + "step": 2917 + }, + { + "epoch": 0.51, + "learning_rate": 1.0308610506008527e-05, + "loss": 0.4763, + "step": 2918 + }, + { + "epoch": 0.51, + "learning_rate": 1.030300113962069e-05, + "loss": 0.4618, + "step": 2919 + }, + { + "epoch": 0.51, + "learning_rate": 1.029739167780433e-05, + "loss": 0.4673, + "step": 2920 + }, + { + "epoch": 0.51, + "learning_rate": 1.0291782122326112e-05, + "loss": 0.4567, + "step": 2921 + }, + { + "epoch": 0.51, + "learning_rate": 1.0286172474952742e-05, + "loss": 0.4812, + "step": 2922 + }, + { + "epoch": 0.51, + "learning_rate": 1.0280562737450938e-05, + "loss": 0.4611, + "step": 2923 + }, + { + "epoch": 0.51, + "learning_rate": 1.0274952911587464e-05, + "loss": 0.4646, + "step": 2924 + }, + { + "epoch": 0.51, + "learning_rate": 1.0269342999129097e-05, + "loss": 0.4584, + "step": 2925 + }, + { + "epoch": 0.51, + "learning_rate": 1.026373300184265e-05, + "loss": 0.4704, + "step": 2926 + }, + { + "epoch": 0.51, + "learning_rate": 1.025812292149496e-05, + "loss": 0.4644, + "step": 2927 + }, + { + "epoch": 0.51, + "learning_rate": 1.0252512759852891e-05, + "loss": 0.4719, + "step": 2928 + }, + { + "epoch": 0.51, + "learning_rate": 1.0246902518683331e-05, + "loss": 0.485, + "step": 2929 + }, + { + "epoch": 0.51, + "learning_rate": 1.0241292199753196e-05, + "loss": 0.461, + "step": 2930 + }, + { + "epoch": 0.51, + "learning_rate": 1.0235681804829426e-05, + "loss": 0.4711, + "step": 2931 + }, + { + "epoch": 0.51, + "learning_rate": 1.0230071335678982e-05, + "loss": 0.4688, + "step": 2932 + }, + { + "epoch": 0.51, + "learning_rate": 1.0224460794068849e-05, + "loss": 0.4713, + "step": 2933 + }, + { + "epoch": 0.51, + "learning_rate": 1.0218850181766038e-05, + "loss": 0.458, + "step": 2934 + }, + { + "epoch": 0.51, + "learning_rate": 1.0213239500537577e-05, + "loss": 0.4666, + "step": 2935 + }, + { + "epoch": 0.51, + "learning_rate": 1.020762875215052e-05, + "loss": 0.4552, + "step": 2936 + }, + { + "epoch": 0.51, + "learning_rate": 1.0202017938371947e-05, + "loss": 0.4701, + "step": 2937 + }, + { + "epoch": 0.51, + "learning_rate": 1.0196407060968942e-05, + "loss": 0.4638, + "step": 2938 + }, + { + "epoch": 0.51, + "learning_rate": 1.0190796121708627e-05, + "loss": 0.4686, + "step": 2939 + }, + { + "epoch": 0.51, + "learning_rate": 1.0185185122358139e-05, + "loss": 0.4612, + "step": 2940 + }, + { + "epoch": 0.51, + "learning_rate": 1.017957406468462e-05, + "loss": 0.4647, + "step": 2941 + }, + { + "epoch": 0.51, + "learning_rate": 1.0173962950455249e-05, + "loss": 0.4582, + "step": 2942 + }, + { + "epoch": 0.51, + "learning_rate": 1.0168351781437215e-05, + "loss": 0.4753, + "step": 2943 + }, + { + "epoch": 0.51, + "learning_rate": 1.0162740559397726e-05, + "loss": 0.465, + "step": 2944 + }, + { + "epoch": 0.51, + "learning_rate": 1.0157129286104e-05, + "loss": 0.461, + "step": 2945 + }, + { + "epoch": 0.51, + "learning_rate": 1.015151796332328e-05, + "loss": 0.4559, + "step": 2946 + }, + { + "epoch": 0.51, + "learning_rate": 1.0145906592822819e-05, + "loss": 0.4736, + "step": 2947 + }, + { + "epoch": 0.51, + "learning_rate": 1.014029517636989e-05, + "loss": 0.4715, + "step": 2948 + }, + { + "epoch": 0.51, + "learning_rate": 1.013468371573177e-05, + "loss": 0.4753, + "step": 2949 + }, + { + "epoch": 0.51, + "learning_rate": 1.0129072212675766e-05, + "loss": 0.467, + "step": 2950 + }, + { + "epoch": 0.51, + "learning_rate": 1.0123460668969184e-05, + "loss": 0.4833, + "step": 2951 + }, + { + "epoch": 0.51, + "learning_rate": 1.0117849086379355e-05, + "loss": 0.4622, + "step": 2952 + }, + { + "epoch": 0.51, + "learning_rate": 1.011223746667361e-05, + "loss": 0.4684, + "step": 2953 + }, + { + "epoch": 0.51, + "learning_rate": 1.0106625811619297e-05, + "loss": 0.4609, + "step": 2954 + }, + { + "epoch": 0.51, + "learning_rate": 1.010101412298378e-05, + "loss": 0.4705, + "step": 2955 + }, + { + "epoch": 0.51, + "learning_rate": 1.0095402402534423e-05, + "loss": 0.4534, + "step": 2956 + }, + { + "epoch": 0.51, + "learning_rate": 1.0089790652038613e-05, + "loss": 0.4711, + "step": 2957 + }, + { + "epoch": 0.51, + "learning_rate": 1.0084178873263735e-05, + "loss": 0.4686, + "step": 2958 + }, + { + "epoch": 0.51, + "learning_rate": 1.0078567067977193e-05, + "loss": 0.4894, + "step": 2959 + }, + { + "epoch": 0.51, + "learning_rate": 1.0072955237946383e-05, + "loss": 0.474, + "step": 2960 + }, + { + "epoch": 0.51, + "learning_rate": 1.0067343384938731e-05, + "loss": 0.4594, + "step": 2961 + }, + { + "epoch": 0.51, + "learning_rate": 1.0061731510721653e-05, + "loss": 0.4644, + "step": 2962 + }, + { + "epoch": 0.51, + "learning_rate": 1.005611961706258e-05, + "loss": 0.4748, + "step": 2963 + }, + { + "epoch": 0.51, + "learning_rate": 1.0050507705728943e-05, + "loss": 0.4724, + "step": 2964 + }, + { + "epoch": 0.51, + "learning_rate": 1.0044895778488184e-05, + "loss": 0.4678, + "step": 2965 + }, + { + "epoch": 0.51, + "learning_rate": 1.0039283837107753e-05, + "loss": 0.469, + "step": 2966 + }, + { + "epoch": 0.51, + "learning_rate": 1.0033671883355093e-05, + "loss": 0.4649, + "step": 2967 + }, + { + "epoch": 0.51, + "learning_rate": 1.0028059918997664e-05, + "loss": 0.4727, + "step": 2968 + }, + { + "epoch": 0.51, + "learning_rate": 1.0022447945802917e-05, + "loss": 0.4778, + "step": 2969 + }, + { + "epoch": 0.51, + "learning_rate": 1.0016835965538314e-05, + "loss": 0.4505, + "step": 2970 + }, + { + "epoch": 0.51, + "learning_rate": 1.0011223979971319e-05, + "loss": 0.4661, + "step": 2971 + }, + { + "epoch": 0.51, + "learning_rate": 1.0005611990869392e-05, + "loss": 0.472, + "step": 2972 + }, + { + "epoch": 0.52, + "learning_rate": 1e-05, + "loss": 0.4793, + "step": 2973 + }, + { + "epoch": 0.52, + "learning_rate": 9.99438800913061e-06, + "loss": 0.463, + "step": 2974 + }, + { + "epoch": 0.52, + "learning_rate": 9.988776020028685e-06, + "loss": 0.4748, + "step": 2975 + }, + { + "epoch": 0.52, + "learning_rate": 9.983164034461686e-06, + "loss": 0.4596, + "step": 2976 + }, + { + "epoch": 0.52, + "learning_rate": 9.977552054197088e-06, + "loss": 0.4713, + "step": 2977 + }, + { + "epoch": 0.52, + "learning_rate": 9.971940081002338e-06, + "loss": 0.4618, + "step": 2978 + }, + { + "epoch": 0.52, + "learning_rate": 9.96632811664491e-06, + "loss": 0.4759, + "step": 2979 + }, + { + "epoch": 0.52, + "learning_rate": 9.96071616289225e-06, + "loss": 0.4563, + "step": 2980 + }, + { + "epoch": 0.52, + "learning_rate": 9.955104221511816e-06, + "loss": 0.4776, + "step": 2981 + }, + { + "epoch": 0.52, + "learning_rate": 9.949492294271062e-06, + "loss": 0.4688, + "step": 2982 + }, + { + "epoch": 0.52, + "learning_rate": 9.943880382937426e-06, + "loss": 0.481, + "step": 2983 + }, + { + "epoch": 0.52, + "learning_rate": 9.938268489278352e-06, + "loss": 0.4643, + "step": 2984 + }, + { + "epoch": 0.52, + "learning_rate": 9.932656615061274e-06, + "loss": 0.4796, + "step": 2985 + }, + { + "epoch": 0.52, + "learning_rate": 9.927044762053622e-06, + "loss": 0.4643, + "step": 2986 + }, + { + "epoch": 0.52, + "learning_rate": 9.921432932022812e-06, + "loss": 0.4787, + "step": 2987 + }, + { + "epoch": 0.52, + "learning_rate": 9.915821126736266e-06, + "loss": 0.4675, + "step": 2988 + }, + { + "epoch": 0.52, + "learning_rate": 9.910209347961389e-06, + "loss": 0.4763, + "step": 2989 + }, + { + "epoch": 0.52, + "learning_rate": 9.904597597465577e-06, + "loss": 0.4723, + "step": 2990 + }, + { + "epoch": 0.52, + "learning_rate": 9.898985877016225e-06, + "loss": 0.4615, + "step": 2991 + }, + { + "epoch": 0.52, + "learning_rate": 9.893374188380705e-06, + "loss": 0.4593, + "step": 2992 + }, + { + "epoch": 0.52, + "learning_rate": 9.887762533326396e-06, + "loss": 0.4683, + "step": 2993 + }, + { + "epoch": 0.52, + "learning_rate": 9.882150913620648e-06, + "loss": 0.4717, + "step": 2994 + }, + { + "epoch": 0.52, + "learning_rate": 9.876539331030814e-06, + "loss": 0.4672, + "step": 2995 + }, + { + "epoch": 0.52, + "learning_rate": 9.870927787324236e-06, + "loss": 0.4518, + "step": 2996 + }, + { + "epoch": 0.52, + "learning_rate": 9.865316284268232e-06, + "loss": 0.4798, + "step": 2997 + }, + { + "epoch": 0.52, + "learning_rate": 9.859704823630115e-06, + "loss": 0.4739, + "step": 2998 + }, + { + "epoch": 0.52, + "learning_rate": 9.854093407177185e-06, + "loss": 0.4692, + "step": 2999 + }, + { + "epoch": 0.52, + "learning_rate": 9.848482036676725e-06, + "loss": 0.4533, + "step": 3000 + }, + { + "epoch": 0.52, + "learning_rate": 9.842870713896004e-06, + "loss": 0.4729, + "step": 3001 + }, + { + "epoch": 0.52, + "learning_rate": 9.837259440602274e-06, + "loss": 0.4635, + "step": 3002 + }, + { + "epoch": 0.52, + "learning_rate": 9.831648218562787e-06, + "loss": 0.4655, + "step": 3003 + }, + { + "epoch": 0.52, + "learning_rate": 9.82603704954475e-06, + "loss": 0.4707, + "step": 3004 + }, + { + "epoch": 0.52, + "learning_rate": 9.820425935315381e-06, + "loss": 0.4678, + "step": 3005 + }, + { + "epoch": 0.52, + "learning_rate": 9.814814877641865e-06, + "loss": 0.4674, + "step": 3006 + }, + { + "epoch": 0.52, + "learning_rate": 9.809203878291374e-06, + "loss": 0.4597, + "step": 3007 + }, + { + "epoch": 0.52, + "learning_rate": 9.80359293903106e-06, + "loss": 0.4624, + "step": 3008 + }, + { + "epoch": 0.52, + "learning_rate": 9.797982061628056e-06, + "loss": 0.4711, + "step": 3009 + }, + { + "epoch": 0.52, + "learning_rate": 9.792371247849481e-06, + "loss": 0.4489, + "step": 3010 + }, + { + "epoch": 0.52, + "learning_rate": 9.786760499462425e-06, + "loss": 0.4755, + "step": 3011 + }, + { + "epoch": 0.52, + "learning_rate": 9.781149818233969e-06, + "loss": 0.4623, + "step": 3012 + }, + { + "epoch": 0.52, + "learning_rate": 9.775539205931153e-06, + "loss": 0.4762, + "step": 3013 + }, + { + "epoch": 0.52, + "learning_rate": 9.769928664321021e-06, + "loss": 0.4694, + "step": 3014 + }, + { + "epoch": 0.52, + "learning_rate": 9.764318195170575e-06, + "loss": 0.4562, + "step": 3015 + }, + { + "epoch": 0.52, + "learning_rate": 9.758707800246806e-06, + "loss": 0.4686, + "step": 3016 + }, + { + "epoch": 0.52, + "learning_rate": 9.753097481316672e-06, + "loss": 0.4821, + "step": 3017 + }, + { + "epoch": 0.52, + "learning_rate": 9.747487240147112e-06, + "loss": 0.4611, + "step": 3018 + }, + { + "epoch": 0.52, + "learning_rate": 9.741877078505046e-06, + "loss": 0.4765, + "step": 3019 + }, + { + "epoch": 0.52, + "learning_rate": 9.736266998157353e-06, + "loss": 0.4685, + "step": 3020 + }, + { + "epoch": 0.52, + "learning_rate": 9.73065700087091e-06, + "loss": 0.4648, + "step": 3021 + }, + { + "epoch": 0.52, + "learning_rate": 9.725047088412538e-06, + "loss": 0.4586, + "step": 3022 + }, + { + "epoch": 0.52, + "learning_rate": 9.719437262549061e-06, + "loss": 0.4572, + "step": 3023 + }, + { + "epoch": 0.52, + "learning_rate": 9.713827525047261e-06, + "loss": 0.4636, + "step": 3024 + }, + { + "epoch": 0.52, + "learning_rate": 9.708217877673888e-06, + "loss": 0.4712, + "step": 3025 + }, + { + "epoch": 0.52, + "learning_rate": 9.702608322195674e-06, + "loss": 0.4651, + "step": 3026 + }, + { + "epoch": 0.52, + "learning_rate": 9.696998860379313e-06, + "loss": 0.4669, + "step": 3027 + }, + { + "epoch": 0.52, + "learning_rate": 9.691389493991478e-06, + "loss": 0.4548, + "step": 3028 + }, + { + "epoch": 0.52, + "learning_rate": 9.685780224798805e-06, + "loss": 0.4726, + "step": 3029 + }, + { + "epoch": 0.52, + "learning_rate": 9.6801710545679e-06, + "loss": 0.4694, + "step": 3030 + }, + { + "epoch": 0.53, + "learning_rate": 9.674561985065349e-06, + "loss": 0.4704, + "step": 3031 + }, + { + "epoch": 0.53, + "learning_rate": 9.668953018057687e-06, + "loss": 0.4679, + "step": 3032 + }, + { + "epoch": 0.53, + "learning_rate": 9.663344155311436e-06, + "loss": 0.4752, + "step": 3033 + }, + { + "epoch": 0.53, + "learning_rate": 9.657735398593068e-06, + "loss": 0.4705, + "step": 3034 + }, + { + "epoch": 0.53, + "learning_rate": 9.652126749669036e-06, + "loss": 0.4681, + "step": 3035 + }, + { + "epoch": 0.53, + "learning_rate": 9.646518210305747e-06, + "loss": 0.4661, + "step": 3036 + }, + { + "epoch": 0.53, + "learning_rate": 9.64090978226958e-06, + "loss": 0.4631, + "step": 3037 + }, + { + "epoch": 0.53, + "learning_rate": 9.635301467326888e-06, + "loss": 0.4626, + "step": 3038 + }, + { + "epoch": 0.53, + "learning_rate": 9.629693267243963e-06, + "loss": 0.4856, + "step": 3039 + }, + { + "epoch": 0.53, + "learning_rate": 9.62408518378709e-06, + "loss": 0.4672, + "step": 3040 + }, + { + "epoch": 0.53, + "learning_rate": 9.618477218722496e-06, + "loss": 0.4745, + "step": 3041 + }, + { + "epoch": 0.53, + "learning_rate": 9.612869373816383e-06, + "loss": 0.4665, + "step": 3042 + }, + { + "epoch": 0.53, + "learning_rate": 9.607261650834906e-06, + "loss": 0.4643, + "step": 3043 + }, + { + "epoch": 0.53, + "learning_rate": 9.601654051544188e-06, + "loss": 0.4712, + "step": 3044 + }, + { + "epoch": 0.53, + "learning_rate": 9.596046577710314e-06, + "loss": 0.4799, + "step": 3045 + }, + { + "epoch": 0.53, + "learning_rate": 9.59043923109932e-06, + "loss": 0.4698, + "step": 3046 + }, + { + "epoch": 0.53, + "learning_rate": 9.58483201347722e-06, + "loss": 0.4743, + "step": 3047 + }, + { + "epoch": 0.53, + "learning_rate": 9.579224926609962e-06, + "loss": 0.4615, + "step": 3048 + }, + { + "epoch": 0.53, + "learning_rate": 9.57361797226348e-06, + "loss": 0.4528, + "step": 3049 + }, + { + "epoch": 0.53, + "learning_rate": 9.568011152203642e-06, + "loss": 0.4662, + "step": 3050 + }, + { + "epoch": 0.53, + "learning_rate": 9.562404468196292e-06, + "loss": 0.4747, + "step": 3051 + }, + { + "epoch": 0.53, + "learning_rate": 9.556797922007221e-06, + "loss": 0.464, + "step": 3052 + }, + { + "epoch": 0.53, + "learning_rate": 9.55119151540218e-06, + "loss": 0.4595, + "step": 3053 + }, + { + "epoch": 0.53, + "learning_rate": 9.545585250146879e-06, + "loss": 0.4652, + "step": 3054 + }, + { + "epoch": 0.53, + "learning_rate": 9.539979128006971e-06, + "loss": 0.4774, + "step": 3055 + }, + { + "epoch": 0.53, + "learning_rate": 9.534373150748086e-06, + "loss": 0.4644, + "step": 3056 + }, + { + "epoch": 0.53, + "learning_rate": 9.528767320135783e-06, + "loss": 0.469, + "step": 3057 + }, + { + "epoch": 0.53, + "learning_rate": 9.523161637935592e-06, + "loss": 0.4634, + "step": 3058 + }, + { + "epoch": 0.53, + "learning_rate": 9.517556105912994e-06, + "loss": 0.4786, + "step": 3059 + }, + { + "epoch": 0.53, + "learning_rate": 9.511950725833418e-06, + "loss": 0.4586, + "step": 3060 + }, + { + "epoch": 0.53, + "learning_rate": 9.50634549946225e-06, + "loss": 0.4675, + "step": 3061 + }, + { + "epoch": 0.53, + "learning_rate": 9.500740428564819e-06, + "loss": 0.4654, + "step": 3062 + }, + { + "epoch": 0.53, + "learning_rate": 9.495135514906415e-06, + "loss": 0.4633, + "step": 3063 + }, + { + "epoch": 0.53, + "learning_rate": 9.489530760252272e-06, + "loss": 0.4626, + "step": 3064 + }, + { + "epoch": 0.53, + "learning_rate": 9.483926166367578e-06, + "loss": 0.4748, + "step": 3065 + }, + { + "epoch": 0.53, + "learning_rate": 9.478321735017471e-06, + "loss": 0.4612, + "step": 3066 + }, + { + "epoch": 0.53, + "learning_rate": 9.47271746796703e-06, + "loss": 0.4688, + "step": 3067 + }, + { + "epoch": 0.53, + "learning_rate": 9.467113366981294e-06, + "loss": 0.4579, + "step": 3068 + }, + { + "epoch": 0.53, + "learning_rate": 9.461509433825238e-06, + "loss": 0.4761, + "step": 3069 + }, + { + "epoch": 0.53, + "learning_rate": 9.455905670263792e-06, + "loss": 0.4584, + "step": 3070 + }, + { + "epoch": 0.53, + "learning_rate": 9.45030207806183e-06, + "loss": 0.4614, + "step": 3071 + }, + { + "epoch": 0.53, + "learning_rate": 9.44469865898417e-06, + "loss": 0.4731, + "step": 3072 + }, + { + "epoch": 0.53, + "learning_rate": 9.439095414795584e-06, + "loss": 0.4735, + "step": 3073 + }, + { + "epoch": 0.53, + "learning_rate": 9.433492347260776e-06, + "loss": 0.4629, + "step": 3074 + }, + { + "epoch": 0.53, + "learning_rate": 9.427889458144405e-06, + "loss": 0.478, + "step": 3075 + }, + { + "epoch": 0.53, + "learning_rate": 9.422286749211068e-06, + "loss": 0.4597, + "step": 3076 + }, + { + "epoch": 0.53, + "learning_rate": 9.416684222225308e-06, + "loss": 0.4684, + "step": 3077 + }, + { + "epoch": 0.53, + "learning_rate": 9.411081878951607e-06, + "loss": 0.4601, + "step": 3078 + }, + { + "epoch": 0.53, + "learning_rate": 9.40547972115439e-06, + "loss": 0.481, + "step": 3079 + }, + { + "epoch": 0.53, + "learning_rate": 9.39987775059804e-06, + "loss": 0.4723, + "step": 3080 + }, + { + "epoch": 0.53, + "learning_rate": 9.394275969046845e-06, + "loss": 0.4726, + "step": 3081 + }, + { + "epoch": 0.53, + "learning_rate": 9.388674378265074e-06, + "loss": 0.4632, + "step": 3082 + }, + { + "epoch": 0.53, + "learning_rate": 9.383072980016902e-06, + "loss": 0.4645, + "step": 3083 + }, + { + "epoch": 0.53, + "learning_rate": 9.377471776066469e-06, + "loss": 0.477, + "step": 3084 + }, + { + "epoch": 0.53, + "learning_rate": 9.371870768177836e-06, + "loss": 0.4647, + "step": 3085 + }, + { + "epoch": 0.53, + "learning_rate": 9.366269958115014e-06, + "loss": 0.4684, + "step": 3086 + }, + { + "epoch": 0.53, + "learning_rate": 9.360669347641946e-06, + "loss": 0.4689, + "step": 3087 + }, + { + "epoch": 0.53, + "learning_rate": 9.355068938522508e-06, + "loss": 0.4689, + "step": 3088 + }, + { + "epoch": 0.54, + "learning_rate": 9.349468732520529e-06, + "loss": 0.4681, + "step": 3089 + }, + { + "epoch": 0.54, + "learning_rate": 9.34386873139975e-06, + "loss": 0.4613, + "step": 3090 + }, + { + "epoch": 0.54, + "learning_rate": 9.33826893692387e-06, + "loss": 0.4658, + "step": 3091 + }, + { + "epoch": 0.54, + "learning_rate": 9.332669350856503e-06, + "loss": 0.4719, + "step": 3092 + }, + { + "epoch": 0.54, + "learning_rate": 9.327069974961219e-06, + "loss": 0.4618, + "step": 3093 + }, + { + "epoch": 0.54, + "learning_rate": 9.321470811001502e-06, + "loss": 0.4697, + "step": 3094 + }, + { + "epoch": 0.54, + "learning_rate": 9.315871860740782e-06, + "loss": 0.462, + "step": 3095 + }, + { + "epoch": 0.54, + "learning_rate": 9.310273125942418e-06, + "loss": 0.4703, + "step": 3096 + }, + { + "epoch": 0.54, + "learning_rate": 9.304674608369695e-06, + "loss": 0.4592, + "step": 3097 + }, + { + "epoch": 0.54, + "learning_rate": 9.299076309785839e-06, + "loss": 0.4773, + "step": 3098 + }, + { + "epoch": 0.54, + "learning_rate": 9.293478231954e-06, + "loss": 0.4705, + "step": 3099 + }, + { + "epoch": 0.54, + "learning_rate": 9.287880376637262e-06, + "loss": 0.476, + "step": 3100 + }, + { + "epoch": 0.54, + "learning_rate": 9.282282745598646e-06, + "loss": 0.4617, + "step": 3101 + }, + { + "epoch": 0.54, + "learning_rate": 9.276685340601085e-06, + "loss": 0.4752, + "step": 3102 + }, + { + "epoch": 0.54, + "learning_rate": 9.271088163407455e-06, + "loss": 0.463, + "step": 3103 + }, + { + "epoch": 0.54, + "learning_rate": 9.265491215780556e-06, + "loss": 0.4675, + "step": 3104 + }, + { + "epoch": 0.54, + "learning_rate": 9.259894499483116e-06, + "loss": 0.4748, + "step": 3105 + }, + { + "epoch": 0.54, + "learning_rate": 9.254298016277785e-06, + "loss": 0.4773, + "step": 3106 + }, + { + "epoch": 0.54, + "learning_rate": 9.248701767927146e-06, + "loss": 0.463, + "step": 3107 + }, + { + "epoch": 0.54, + "learning_rate": 9.243105756193714e-06, + "loss": 0.4689, + "step": 3108 + }, + { + "epoch": 0.54, + "learning_rate": 9.23750998283991e-06, + "loss": 0.4777, + "step": 3109 + }, + { + "epoch": 0.54, + "learning_rate": 9.231914449628103e-06, + "loss": 0.4587, + "step": 3110 + }, + { + "epoch": 0.54, + "learning_rate": 9.226319158320565e-06, + "loss": 0.4722, + "step": 3111 + }, + { + "epoch": 0.54, + "learning_rate": 9.22072411067951e-06, + "loss": 0.4691, + "step": 3112 + }, + { + "epoch": 0.54, + "learning_rate": 9.215129308467062e-06, + "loss": 0.4654, + "step": 3113 + }, + { + "epoch": 0.54, + "learning_rate": 9.20953475344527e-06, + "loss": 0.4789, + "step": 3114 + }, + { + "epoch": 0.54, + "learning_rate": 9.20394044737612e-06, + "loss": 0.4623, + "step": 3115 + }, + { + "epoch": 0.54, + "learning_rate": 9.198346392021494e-06, + "loss": 0.469, + "step": 3116 + }, + { + "epoch": 0.54, + "learning_rate": 9.192752589143219e-06, + "loss": 0.4741, + "step": 3117 + }, + { + "epoch": 0.54, + "learning_rate": 9.187159040503025e-06, + "loss": 0.4653, + "step": 3118 + }, + { + "epoch": 0.54, + "learning_rate": 9.181565747862575e-06, + "loss": 0.4558, + "step": 3119 + }, + { + "epoch": 0.54, + "learning_rate": 9.175972712983439e-06, + "loss": 0.4667, + "step": 3120 + }, + { + "epoch": 0.54, + "learning_rate": 9.170379937627116e-06, + "loss": 0.4752, + "step": 3121 + }, + { + "epoch": 0.54, + "learning_rate": 9.16478742355502e-06, + "loss": 0.4638, + "step": 3122 + }, + { + "epoch": 0.54, + "learning_rate": 9.159195172528478e-06, + "loss": 0.4571, + "step": 3123 + }, + { + "epoch": 0.54, + "learning_rate": 9.153603186308747e-06, + "loss": 0.4727, + "step": 3124 + }, + { + "epoch": 0.54, + "learning_rate": 9.148011466656981e-06, + "loss": 0.465, + "step": 3125 + }, + { + "epoch": 0.54, + "learning_rate": 9.14242001533427e-06, + "loss": 0.4761, + "step": 3126 + }, + { + "epoch": 0.54, + "learning_rate": 9.136828834101606e-06, + "loss": 0.4711, + "step": 3127 + }, + { + "epoch": 0.54, + "learning_rate": 9.1312379247199e-06, + "loss": 0.4669, + "step": 3128 + }, + { + "epoch": 0.54, + "learning_rate": 9.125647288949982e-06, + "loss": 0.4521, + "step": 3129 + }, + { + "epoch": 0.54, + "learning_rate": 9.120056928552586e-06, + "loss": 0.4658, + "step": 3130 + }, + { + "epoch": 0.54, + "learning_rate": 9.114466845288372e-06, + "loss": 0.463, + "step": 3131 + }, + { + "epoch": 0.54, + "learning_rate": 9.108877040917896e-06, + "loss": 0.4682, + "step": 3132 + }, + { + "epoch": 0.54, + "learning_rate": 9.103287517201647e-06, + "loss": 0.4614, + "step": 3133 + }, + { + "epoch": 0.54, + "learning_rate": 9.097698275900004e-06, + "loss": 0.4742, + "step": 3134 + }, + { + "epoch": 0.54, + "learning_rate": 9.092109318773274e-06, + "loss": 0.4581, + "step": 3135 + }, + { + "epoch": 0.54, + "learning_rate": 9.086520647581667e-06, + "loss": 0.4641, + "step": 3136 + }, + { + "epoch": 0.54, + "learning_rate": 9.080932264085302e-06, + "loss": 0.451, + "step": 3137 + }, + { + "epoch": 0.54, + "learning_rate": 9.075344170044212e-06, + "loss": 0.4747, + "step": 3138 + }, + { + "epoch": 0.54, + "learning_rate": 9.069756367218333e-06, + "loss": 0.4549, + "step": 3139 + }, + { + "epoch": 0.54, + "learning_rate": 9.064168857367514e-06, + "loss": 0.476, + "step": 3140 + }, + { + "epoch": 0.54, + "learning_rate": 9.05858164225151e-06, + "loss": 0.4573, + "step": 3141 + }, + { + "epoch": 0.54, + "learning_rate": 9.052994723629982e-06, + "loss": 0.4651, + "step": 3142 + }, + { + "epoch": 0.54, + "learning_rate": 9.047408103262503e-06, + "loss": 0.4638, + "step": 3143 + }, + { + "epoch": 0.54, + "learning_rate": 9.041821782908544e-06, + "loss": 0.4693, + "step": 3144 + }, + { + "epoch": 0.54, + "learning_rate": 9.03623576432749e-06, + "loss": 0.4805, + "step": 3145 + }, + { + "epoch": 0.54, + "learning_rate": 9.03065004927862e-06, + "loss": 0.4691, + "step": 3146 + }, + { + "epoch": 0.55, + "learning_rate": 9.02506463952113e-06, + "loss": 0.4667, + "step": 3147 + }, + { + "epoch": 0.55, + "learning_rate": 9.019479536814108e-06, + "loss": 0.4706, + "step": 3148 + }, + { + "epoch": 0.55, + "learning_rate": 9.013894742916554e-06, + "loss": 0.455, + "step": 3149 + }, + { + "epoch": 0.55, + "learning_rate": 9.008310259587374e-06, + "loss": 0.4606, + "step": 3150 + }, + { + "epoch": 0.55, + "learning_rate": 9.002726088585356e-06, + "loss": 0.4682, + "step": 3151 + }, + { + "epoch": 0.55, + "learning_rate": 8.997142231669217e-06, + "loss": 0.4523, + "step": 3152 + }, + { + "epoch": 0.55, + "learning_rate": 8.991558690597553e-06, + "loss": 0.473, + "step": 3153 + }, + { + "epoch": 0.55, + "learning_rate": 8.985975467128875e-06, + "loss": 0.4886, + "step": 3154 + }, + { + "epoch": 0.55, + "learning_rate": 8.980392563021585e-06, + "loss": 0.4663, + "step": 3155 + }, + { + "epoch": 0.55, + "learning_rate": 8.974809980033987e-06, + "loss": 0.4706, + "step": 3156 + }, + { + "epoch": 0.55, + "learning_rate": 8.969227719924289e-06, + "loss": 0.4569, + "step": 3157 + }, + { + "epoch": 0.55, + "learning_rate": 8.963645784450584e-06, + "loss": 0.4681, + "step": 3158 + }, + { + "epoch": 0.55, + "learning_rate": 8.958064175370884e-06, + "loss": 0.4673, + "step": 3159 + }, + { + "epoch": 0.55, + "learning_rate": 8.95248289444307e-06, + "loss": 0.4689, + "step": 3160 + }, + { + "epoch": 0.55, + "learning_rate": 8.946901943424951e-06, + "loss": 0.4577, + "step": 3161 + }, + { + "epoch": 0.55, + "learning_rate": 8.941321324074207e-06, + "loss": 0.4649, + "step": 3162 + }, + { + "epoch": 0.55, + "learning_rate": 8.935741038148426e-06, + "loss": 0.4548, + "step": 3163 + }, + { + "epoch": 0.55, + "learning_rate": 8.930161087405089e-06, + "loss": 0.476, + "step": 3164 + }, + { + "epoch": 0.55, + "learning_rate": 8.924581473601568e-06, + "loss": 0.4596, + "step": 3165 + }, + { + "epoch": 0.55, + "learning_rate": 8.919002198495135e-06, + "loss": 0.4701, + "step": 3166 + }, + { + "epoch": 0.55, + "learning_rate": 8.913423263842943e-06, + "loss": 0.4719, + "step": 3167 + }, + { + "epoch": 0.55, + "learning_rate": 8.90784467140206e-06, + "loss": 0.4662, + "step": 3168 + }, + { + "epoch": 0.55, + "learning_rate": 8.90226642292942e-06, + "loss": 0.4529, + "step": 3169 + }, + { + "epoch": 0.55, + "learning_rate": 8.896688520181867e-06, + "loss": 0.4792, + "step": 3170 + }, + { + "epoch": 0.55, + "learning_rate": 8.891110964916135e-06, + "loss": 0.4617, + "step": 3171 + }, + { + "epoch": 0.55, + "learning_rate": 8.885533758888835e-06, + "loss": 0.4782, + "step": 3172 + }, + { + "epoch": 0.55, + "learning_rate": 8.879956903856484e-06, + "loss": 0.4593, + "step": 3173 + }, + { + "epoch": 0.55, + "learning_rate": 8.874380401575476e-06, + "loss": 0.4656, + "step": 3174 + }, + { + "epoch": 0.55, + "learning_rate": 8.868804253802103e-06, + "loss": 0.4525, + "step": 3175 + }, + { + "epoch": 0.55, + "learning_rate": 8.863228462292537e-06, + "loss": 0.4732, + "step": 3176 + }, + { + "epoch": 0.55, + "learning_rate": 8.85765302880285e-06, + "loss": 0.4595, + "step": 3177 + }, + { + "epoch": 0.55, + "learning_rate": 8.852077955088993e-06, + "loss": 0.459, + "step": 3178 + }, + { + "epoch": 0.55, + "learning_rate": 8.846503242906798e-06, + "loss": 0.4555, + "step": 3179 + }, + { + "epoch": 0.55, + "learning_rate": 8.840928894011995e-06, + "loss": 0.4713, + "step": 3180 + }, + { + "epoch": 0.55, + "learning_rate": 8.83535491016019e-06, + "loss": 0.4572, + "step": 3181 + }, + { + "epoch": 0.55, + "learning_rate": 8.829781293106884e-06, + "loss": 0.4592, + "step": 3182 + }, + { + "epoch": 0.55, + "learning_rate": 8.82420804460745e-06, + "loss": 0.4783, + "step": 3183 + }, + { + "epoch": 0.55, + "learning_rate": 8.818635166417154e-06, + "loss": 0.4731, + "step": 3184 + }, + { + "epoch": 0.55, + "learning_rate": 8.813062660291146e-06, + "loss": 0.4696, + "step": 3185 + }, + { + "epoch": 0.55, + "learning_rate": 8.807490527984453e-06, + "loss": 0.4599, + "step": 3186 + }, + { + "epoch": 0.55, + "learning_rate": 8.80191877125199e-06, + "loss": 0.4681, + "step": 3187 + }, + { + "epoch": 0.55, + "learning_rate": 8.796347391848547e-06, + "loss": 0.476, + "step": 3188 + }, + { + "epoch": 0.55, + "learning_rate": 8.790776391528803e-06, + "loss": 0.4761, + "step": 3189 + }, + { + "epoch": 0.55, + "learning_rate": 8.785205772047308e-06, + "loss": 0.4663, + "step": 3190 + }, + { + "epoch": 0.55, + "learning_rate": 8.779635535158498e-06, + "loss": 0.4719, + "step": 3191 + }, + { + "epoch": 0.55, + "learning_rate": 8.774065682616699e-06, + "loss": 0.4611, + "step": 3192 + }, + { + "epoch": 0.55, + "learning_rate": 8.76849621617609e-06, + "loss": 0.4583, + "step": 3193 + }, + { + "epoch": 0.55, + "learning_rate": 8.762927137590757e-06, + "loss": 0.477, + "step": 3194 + }, + { + "epoch": 0.55, + "learning_rate": 8.757358448614636e-06, + "loss": 0.4614, + "step": 3195 + }, + { + "epoch": 0.55, + "learning_rate": 8.751790151001569e-06, + "loss": 0.4577, + "step": 3196 + }, + { + "epoch": 0.55, + "learning_rate": 8.74622224650525e-06, + "loss": 0.4662, + "step": 3197 + }, + { + "epoch": 0.55, + "learning_rate": 8.740654736879265e-06, + "loss": 0.4608, + "step": 3198 + }, + { + "epoch": 0.55, + "learning_rate": 8.73508762387707e-06, + "loss": 0.4625, + "step": 3199 + }, + { + "epoch": 0.55, + "learning_rate": 8.729520909251994e-06, + "loss": 0.4822, + "step": 3200 + }, + { + "epoch": 0.55, + "learning_rate": 8.723954594757244e-06, + "loss": 0.4694, + "step": 3201 + }, + { + "epoch": 0.55, + "learning_rate": 8.718388682145897e-06, + "loss": 0.4683, + "step": 3202 + }, + { + "epoch": 0.55, + "learning_rate": 8.712823173170914e-06, + "loss": 0.4658, + "step": 3203 + }, + { + "epoch": 0.56, + "learning_rate": 8.707258069585109e-06, + "loss": 0.4694, + "step": 3204 + }, + { + "epoch": 0.56, + "learning_rate": 8.70169337314119e-06, + "loss": 0.4782, + "step": 3205 + }, + { + "epoch": 0.56, + "learning_rate": 8.696129085591726e-06, + "loss": 0.4704, + "step": 3206 + }, + { + "epoch": 0.56, + "learning_rate": 8.690565208689157e-06, + "loss": 0.4619, + "step": 3207 + }, + { + "epoch": 0.56, + "learning_rate": 8.685001744185795e-06, + "loss": 0.477, + "step": 3208 + }, + { + "epoch": 0.56, + "learning_rate": 8.679438693833821e-06, + "loss": 0.4601, + "step": 3209 + }, + { + "epoch": 0.56, + "learning_rate": 8.67387605938529e-06, + "loss": 0.4735, + "step": 3210 + }, + { + "epoch": 0.56, + "learning_rate": 8.668313842592116e-06, + "loss": 0.4636, + "step": 3211 + }, + { + "epoch": 0.56, + "learning_rate": 8.662752045206096e-06, + "loss": 0.4682, + "step": 3212 + }, + { + "epoch": 0.56, + "learning_rate": 8.657190668978887e-06, + "loss": 0.4597, + "step": 3213 + }, + { + "epoch": 0.56, + "learning_rate": 8.651629715662006e-06, + "loss": 0.4815, + "step": 3214 + }, + { + "epoch": 0.56, + "learning_rate": 8.646069187006854e-06, + "loss": 0.4489, + "step": 3215 + }, + { + "epoch": 0.56, + "learning_rate": 8.640509084764682e-06, + "loss": 0.4722, + "step": 3216 + }, + { + "epoch": 0.56, + "learning_rate": 8.634949410686615e-06, + "loss": 0.4673, + "step": 3217 + }, + { + "epoch": 0.56, + "learning_rate": 8.629390166523638e-06, + "loss": 0.4812, + "step": 3218 + }, + { + "epoch": 0.56, + "learning_rate": 8.623831354026609e-06, + "loss": 0.4494, + "step": 3219 + }, + { + "epoch": 0.56, + "learning_rate": 8.618272974946244e-06, + "loss": 0.4666, + "step": 3220 + }, + { + "epoch": 0.56, + "learning_rate": 8.612715031033125e-06, + "loss": 0.4671, + "step": 3221 + }, + { + "epoch": 0.56, + "learning_rate": 8.607157524037692e-06, + "loss": 0.4623, + "step": 3222 + }, + { + "epoch": 0.56, + "learning_rate": 8.601600455710254e-06, + "loss": 0.4647, + "step": 3223 + }, + { + "epoch": 0.56, + "learning_rate": 8.596043827800976e-06, + "loss": 0.4815, + "step": 3224 + }, + { + "epoch": 0.56, + "learning_rate": 8.590487642059888e-06, + "loss": 0.4712, + "step": 3225 + }, + { + "epoch": 0.56, + "learning_rate": 8.584931900236879e-06, + "loss": 0.4617, + "step": 3226 + }, + { + "epoch": 0.56, + "learning_rate": 8.579376604081705e-06, + "loss": 0.4552, + "step": 3227 + }, + { + "epoch": 0.56, + "learning_rate": 8.573821755343965e-06, + "loss": 0.473, + "step": 3228 + }, + { + "epoch": 0.56, + "learning_rate": 8.568267355773137e-06, + "loss": 0.4719, + "step": 3229 + }, + { + "epoch": 0.56, + "learning_rate": 8.562713407118543e-06, + "loss": 0.4709, + "step": 3230 + }, + { + "epoch": 0.56, + "learning_rate": 8.557159911129373e-06, + "loss": 0.4589, + "step": 3231 + }, + { + "epoch": 0.56, + "learning_rate": 8.551606869554665e-06, + "loss": 0.4652, + "step": 3232 + }, + { + "epoch": 0.56, + "learning_rate": 8.54605428414332e-06, + "loss": 0.471, + "step": 3233 + }, + { + "epoch": 0.56, + "learning_rate": 8.540502156644096e-06, + "loss": 0.4634, + "step": 3234 + }, + { + "epoch": 0.56, + "learning_rate": 8.534950488805599e-06, + "loss": 0.4661, + "step": 3235 + }, + { + "epoch": 0.56, + "learning_rate": 8.529399282376306e-06, + "loss": 0.467, + "step": 3236 + }, + { + "epoch": 0.56, + "learning_rate": 8.523848539104527e-06, + "loss": 0.4558, + "step": 3237 + }, + { + "epoch": 0.56, + "learning_rate": 8.518298260738448e-06, + "loss": 0.4654, + "step": 3238 + }, + { + "epoch": 0.56, + "learning_rate": 8.512748449026087e-06, + "loss": 0.4705, + "step": 3239 + }, + { + "epoch": 0.56, + "learning_rate": 8.507199105715336e-06, + "loss": 0.4738, + "step": 3240 + }, + { + "epoch": 0.56, + "learning_rate": 8.50165023255393e-06, + "loss": 0.453, + "step": 3241 + }, + { + "epoch": 0.56, + "learning_rate": 8.496101831289447e-06, + "loss": 0.4738, + "step": 3242 + }, + { + "epoch": 0.56, + "learning_rate": 8.490553903669335e-06, + "loss": 0.4652, + "step": 3243 + }, + { + "epoch": 0.56, + "learning_rate": 8.485006451440874e-06, + "loss": 0.471, + "step": 3244 + }, + { + "epoch": 0.56, + "learning_rate": 8.479459476351213e-06, + "loss": 0.4559, + "step": 3245 + }, + { + "epoch": 0.56, + "learning_rate": 8.473912980147329e-06, + "loss": 0.4811, + "step": 3246 + }, + { + "epoch": 0.56, + "learning_rate": 8.46836696457607e-06, + "loss": 0.4489, + "step": 3247 + }, + { + "epoch": 0.56, + "learning_rate": 8.462821431384123e-06, + "loss": 0.4736, + "step": 3248 + }, + { + "epoch": 0.56, + "learning_rate": 8.457276382318016e-06, + "loss": 0.4585, + "step": 3249 + }, + { + "epoch": 0.56, + "learning_rate": 8.451731819124137e-06, + "loss": 0.4688, + "step": 3250 + }, + { + "epoch": 0.56, + "learning_rate": 8.446187743548711e-06, + "loss": 0.4674, + "step": 3251 + }, + { + "epoch": 0.56, + "learning_rate": 8.440644157337819e-06, + "loss": 0.4713, + "step": 3252 + }, + { + "epoch": 0.56, + "learning_rate": 8.435101062237377e-06, + "loss": 0.4648, + "step": 3253 + }, + { + "epoch": 0.56, + "learning_rate": 8.42955845999315e-06, + "loss": 0.4804, + "step": 3254 + }, + { + "epoch": 0.56, + "learning_rate": 8.42401635235076e-06, + "loss": 0.4546, + "step": 3255 + }, + { + "epoch": 0.56, + "learning_rate": 8.418474741055657e-06, + "loss": 0.4593, + "step": 3256 + }, + { + "epoch": 0.56, + "learning_rate": 8.412933627853142e-06, + "loss": 0.4624, + "step": 3257 + }, + { + "epoch": 0.56, + "learning_rate": 8.407393014488354e-06, + "loss": 0.4556, + "step": 3258 + }, + { + "epoch": 0.56, + "learning_rate": 8.401852902706285e-06, + "loss": 0.4685, + "step": 3259 + }, + { + "epoch": 0.56, + "learning_rate": 8.396313294251755e-06, + "loss": 0.4548, + "step": 3260 + }, + { + "epoch": 0.56, + "learning_rate": 8.390774190869434e-06, + "loss": 0.47, + "step": 3261 + }, + { + "epoch": 0.57, + "learning_rate": 8.385235594303842e-06, + "loss": 0.4616, + "step": 3262 + }, + { + "epoch": 0.57, + "learning_rate": 8.379697506299313e-06, + "loss": 0.4621, + "step": 3263 + }, + { + "epoch": 0.57, + "learning_rate": 8.374159928600051e-06, + "loss": 0.4602, + "step": 3264 + }, + { + "epoch": 0.57, + "learning_rate": 8.368622862950079e-06, + "loss": 0.4845, + "step": 3265 + }, + { + "epoch": 0.57, + "learning_rate": 8.363086311093266e-06, + "loss": 0.4663, + "step": 3266 + }, + { + "epoch": 0.57, + "learning_rate": 8.357550274773317e-06, + "loss": 0.4665, + "step": 3267 + }, + { + "epoch": 0.57, + "learning_rate": 8.352014755733775e-06, + "loss": 0.458, + "step": 3268 + }, + { + "epoch": 0.57, + "learning_rate": 8.346479755718028e-06, + "loss": 0.4712, + "step": 3269 + }, + { + "epoch": 0.57, + "learning_rate": 8.340945276469282e-06, + "loss": 0.4693, + "step": 3270 + }, + { + "epoch": 0.57, + "learning_rate": 8.335411319730604e-06, + "loss": 0.4677, + "step": 3271 + }, + { + "epoch": 0.57, + "learning_rate": 8.329877887244867e-06, + "loss": 0.4688, + "step": 3272 + }, + { + "epoch": 0.57, + "learning_rate": 8.32434498075481e-06, + "loss": 0.4869, + "step": 3273 + }, + { + "epoch": 0.57, + "learning_rate": 8.318812602002984e-06, + "loss": 0.4563, + "step": 3274 + }, + { + "epoch": 0.57, + "learning_rate": 8.313280752731779e-06, + "loss": 0.4772, + "step": 3275 + }, + { + "epoch": 0.57, + "learning_rate": 8.307749434683426e-06, + "loss": 0.4574, + "step": 3276 + }, + { + "epoch": 0.57, + "learning_rate": 8.302218649599978e-06, + "loss": 0.4658, + "step": 3277 + }, + { + "epoch": 0.57, + "learning_rate": 8.296688399223327e-06, + "loss": 0.4539, + "step": 3278 + }, + { + "epoch": 0.57, + "learning_rate": 8.29115868529519e-06, + "loss": 0.4621, + "step": 3279 + }, + { + "epoch": 0.57, + "learning_rate": 8.285629509557132e-06, + "loss": 0.458, + "step": 3280 + }, + { + "epoch": 0.57, + "learning_rate": 8.28010087375052e-06, + "loss": 0.4663, + "step": 3281 + }, + { + "epoch": 0.57, + "learning_rate": 8.274572779616579e-06, + "loss": 0.4599, + "step": 3282 + }, + { + "epoch": 0.57, + "learning_rate": 8.269045228896349e-06, + "loss": 0.4721, + "step": 3283 + }, + { + "epoch": 0.57, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4592, + "step": 3284 + }, + { + "epoch": 0.57, + "learning_rate": 8.25799176466033e-06, + "loss": 0.4667, + "step": 3285 + }, + { + "epoch": 0.57, + "learning_rate": 8.252465854625766e-06, + "loss": 0.4715, + "step": 3286 + }, + { + "epoch": 0.57, + "learning_rate": 8.246940494967369e-06, + "loss": 0.4611, + "step": 3287 + }, + { + "epoch": 0.57, + "learning_rate": 8.24141568742531e-06, + "loss": 0.4716, + "step": 3288 + }, + { + "epoch": 0.57, + "learning_rate": 8.235891433739606e-06, + "loss": 0.4684, + "step": 3289 + }, + { + "epoch": 0.57, + "learning_rate": 8.230367735650088e-06, + "loss": 0.4594, + "step": 3290 + }, + { + "epoch": 0.57, + "learning_rate": 8.224844594896411e-06, + "loss": 0.4731, + "step": 3291 + }, + { + "epoch": 0.57, + "learning_rate": 8.219322013218062e-06, + "loss": 0.4606, + "step": 3292 + }, + { + "epoch": 0.57, + "learning_rate": 8.213799992354341e-06, + "loss": 0.4584, + "step": 3293 + }, + { + "epoch": 0.57, + "learning_rate": 8.208278534044382e-06, + "loss": 0.465, + "step": 3294 + }, + { + "epoch": 0.57, + "learning_rate": 8.202757640027137e-06, + "loss": 0.4589, + "step": 3295 + }, + { + "epoch": 0.57, + "learning_rate": 8.197237312041377e-06, + "loss": 0.4627, + "step": 3296 + }, + { + "epoch": 0.57, + "learning_rate": 8.191717551825707e-06, + "loss": 0.4657, + "step": 3297 + }, + { + "epoch": 0.57, + "learning_rate": 8.186198361118537e-06, + "loss": 0.4591, + "step": 3298 + }, + { + "epoch": 0.57, + "learning_rate": 8.18067974165811e-06, + "loss": 0.4741, + "step": 3299 + }, + { + "epoch": 0.57, + "learning_rate": 8.175161695182484e-06, + "loss": 0.4651, + "step": 3300 + }, + { + "epoch": 0.57, + "learning_rate": 8.169644223429535e-06, + "loss": 0.461, + "step": 3301 + }, + { + "epoch": 0.57, + "learning_rate": 8.16412732813696e-06, + "loss": 0.4654, + "step": 3302 + }, + { + "epoch": 0.57, + "learning_rate": 8.158611011042272e-06, + "loss": 0.4785, + "step": 3303 + }, + { + "epoch": 0.57, + "learning_rate": 8.153095273882816e-06, + "loss": 0.4581, + "step": 3304 + }, + { + "epoch": 0.57, + "learning_rate": 8.147580118395728e-06, + "loss": 0.4684, + "step": 3305 + }, + { + "epoch": 0.57, + "learning_rate": 8.142065546317988e-06, + "loss": 0.4578, + "step": 3306 + }, + { + "epoch": 0.57, + "learning_rate": 8.136551559386368e-06, + "loss": 0.4739, + "step": 3307 + }, + { + "epoch": 0.57, + "learning_rate": 8.131038159337478e-06, + "loss": 0.4797, + "step": 3308 + }, + { + "epoch": 0.57, + "learning_rate": 8.125525347907726e-06, + "loss": 0.4735, + "step": 3309 + }, + { + "epoch": 0.57, + "learning_rate": 8.120013126833344e-06, + "loss": 0.4692, + "step": 3310 + }, + { + "epoch": 0.57, + "learning_rate": 8.114501497850375e-06, + "loss": 0.462, + "step": 3311 + }, + { + "epoch": 0.57, + "learning_rate": 8.108990462694676e-06, + "loss": 0.4587, + "step": 3312 + }, + { + "epoch": 0.57, + "learning_rate": 8.103480023101919e-06, + "loss": 0.4697, + "step": 3313 + }, + { + "epoch": 0.57, + "learning_rate": 8.097970180807577e-06, + "loss": 0.4634, + "step": 3314 + }, + { + "epoch": 0.57, + "learning_rate": 8.09246093754696e-06, + "loss": 0.4669, + "step": 3315 + }, + { + "epoch": 0.57, + "learning_rate": 8.086952295055158e-06, + "loss": 0.4627, + "step": 3316 + }, + { + "epoch": 0.57, + "learning_rate": 8.081444255067096e-06, + "loss": 0.4747, + "step": 3317 + }, + { + "epoch": 0.57, + "learning_rate": 8.075936819317501e-06, + "loss": 0.4707, + "step": 3318 + }, + { + "epoch": 0.57, + "learning_rate": 8.070429989540905e-06, + "loss": 0.4572, + "step": 3319 + }, + { + "epoch": 0.58, + "learning_rate": 8.064923767471657e-06, + "loss": 0.4639, + "step": 3320 + }, + { + "epoch": 0.58, + "learning_rate": 8.059418154843908e-06, + "loss": 0.4774, + "step": 3321 + }, + { + "epoch": 0.58, + "learning_rate": 8.053913153391622e-06, + "loss": 0.4606, + "step": 3322 + }, + { + "epoch": 0.58, + "learning_rate": 8.048408764848565e-06, + "loss": 0.4735, + "step": 3323 + }, + { + "epoch": 0.58, + "learning_rate": 8.042904990948319e-06, + "loss": 0.4653, + "step": 3324 + }, + { + "epoch": 0.58, + "learning_rate": 8.037401833424265e-06, + "loss": 0.4701, + "step": 3325 + }, + { + "epoch": 0.58, + "learning_rate": 8.03189929400959e-06, + "loss": 0.4588, + "step": 3326 + }, + { + "epoch": 0.58, + "learning_rate": 8.026397374437294e-06, + "loss": 0.471, + "step": 3327 + }, + { + "epoch": 0.58, + "learning_rate": 8.020896076440169e-06, + "loss": 0.4658, + "step": 3328 + }, + { + "epoch": 0.58, + "learning_rate": 8.015395401750816e-06, + "loss": 0.468, + "step": 3329 + }, + { + "epoch": 0.58, + "learning_rate": 8.009895352101656e-06, + "loss": 0.4561, + "step": 3330 + }, + { + "epoch": 0.58, + "learning_rate": 8.004395929224881e-06, + "loss": 0.4632, + "step": 3331 + }, + { + "epoch": 0.58, + "learning_rate": 7.998897134852518e-06, + "loss": 0.462, + "step": 3332 + }, + { + "epoch": 0.58, + "learning_rate": 7.993398970716375e-06, + "loss": 0.4553, + "step": 3333 + }, + { + "epoch": 0.58, + "learning_rate": 7.987901438548069e-06, + "loss": 0.4636, + "step": 3334 + }, + { + "epoch": 0.58, + "learning_rate": 7.982404540079018e-06, + "loss": 0.4721, + "step": 3335 + }, + { + "epoch": 0.58, + "learning_rate": 7.976908277040438e-06, + "loss": 0.4641, + "step": 3336 + }, + { + "epoch": 0.58, + "learning_rate": 7.97141265116335e-06, + "loss": 0.4606, + "step": 3337 + }, + { + "epoch": 0.58, + "learning_rate": 7.965917664178564e-06, + "loss": 0.4607, + "step": 3338 + }, + { + "epoch": 0.58, + "learning_rate": 7.960423317816708e-06, + "loss": 0.4664, + "step": 3339 + }, + { + "epoch": 0.58, + "learning_rate": 7.95492961380818e-06, + "loss": 0.4586, + "step": 3340 + }, + { + "epoch": 0.58, + "learning_rate": 7.949436553883203e-06, + "loss": 0.4743, + "step": 3341 + }, + { + "epoch": 0.58, + "learning_rate": 7.943944139771784e-06, + "loss": 0.4768, + "step": 3342 + }, + { + "epoch": 0.58, + "learning_rate": 7.938452373203722e-06, + "loss": 0.4768, + "step": 3343 + }, + { + "epoch": 0.58, + "learning_rate": 7.932961255908628e-06, + "loss": 0.4606, + "step": 3344 + }, + { + "epoch": 0.58, + "learning_rate": 7.92747078961589e-06, + "loss": 0.4795, + "step": 3345 + }, + { + "epoch": 0.58, + "learning_rate": 7.921980976054707e-06, + "loss": 0.4605, + "step": 3346 + }, + { + "epoch": 0.58, + "learning_rate": 7.916491816954055e-06, + "loss": 0.4802, + "step": 3347 + }, + { + "epoch": 0.58, + "learning_rate": 7.91100331404273e-06, + "loss": 0.4603, + "step": 3348 + }, + { + "epoch": 0.58, + "learning_rate": 7.905515469049287e-06, + "loss": 0.4667, + "step": 3349 + }, + { + "epoch": 0.58, + "learning_rate": 7.900028283702106e-06, + "loss": 0.4751, + "step": 3350 + }, + { + "epoch": 0.58, + "learning_rate": 7.894541759729344e-06, + "loss": 0.4701, + "step": 3351 + }, + { + "epoch": 0.58, + "learning_rate": 7.889055898858943e-06, + "loss": 0.4495, + "step": 3352 + }, + { + "epoch": 0.58, + "learning_rate": 7.883570702818654e-06, + "loss": 0.4592, + "step": 3353 + }, + { + "epoch": 0.58, + "learning_rate": 7.878086173336004e-06, + "loss": 0.4667, + "step": 3354 + }, + { + "epoch": 0.58, + "learning_rate": 7.872602312138316e-06, + "loss": 0.4748, + "step": 3355 + }, + { + "epoch": 0.58, + "learning_rate": 7.867119120952698e-06, + "loss": 0.464, + "step": 3356 + }, + { + "epoch": 0.58, + "learning_rate": 7.861636601506056e-06, + "loss": 0.4657, + "step": 3357 + }, + { + "epoch": 0.58, + "learning_rate": 7.856154755525078e-06, + "loss": 0.461, + "step": 3358 + }, + { + "epoch": 0.58, + "learning_rate": 7.85067358473624e-06, + "loss": 0.4753, + "step": 3359 + }, + { + "epoch": 0.58, + "learning_rate": 7.845193090865807e-06, + "loss": 0.468, + "step": 3360 + }, + { + "epoch": 0.58, + "learning_rate": 7.839713275639826e-06, + "loss": 0.4724, + "step": 3361 + }, + { + "epoch": 0.58, + "learning_rate": 7.83423414078414e-06, + "loss": 0.4642, + "step": 3362 + }, + { + "epoch": 0.58, + "learning_rate": 7.828755688024369e-06, + "loss": 0.4805, + "step": 3363 + }, + { + "epoch": 0.58, + "learning_rate": 7.823277919085919e-06, + "loss": 0.465, + "step": 3364 + }, + { + "epoch": 0.58, + "learning_rate": 7.817800835693993e-06, + "loss": 0.475, + "step": 3365 + }, + { + "epoch": 0.58, + "learning_rate": 7.812324439573554e-06, + "loss": 0.4709, + "step": 3366 + }, + { + "epoch": 0.58, + "learning_rate": 7.806848732449372e-06, + "loss": 0.4754, + "step": 3367 + }, + { + "epoch": 0.58, + "learning_rate": 7.801373716045987e-06, + "loss": 0.4573, + "step": 3368 + }, + { + "epoch": 0.58, + "learning_rate": 7.795899392087728e-06, + "loss": 0.4714, + "step": 3369 + }, + { + "epoch": 0.58, + "learning_rate": 7.790425762298698e-06, + "loss": 0.4527, + "step": 3370 + }, + { + "epoch": 0.58, + "learning_rate": 7.784952828402789e-06, + "loss": 0.464, + "step": 3371 + }, + { + "epoch": 0.58, + "learning_rate": 7.779480592123673e-06, + "loss": 0.4656, + "step": 3372 + }, + { + "epoch": 0.58, + "learning_rate": 7.774009055184795e-06, + "loss": 0.4687, + "step": 3373 + }, + { + "epoch": 0.58, + "learning_rate": 7.768538219309392e-06, + "loss": 0.4685, + "step": 3374 + }, + { + "epoch": 0.58, + "learning_rate": 7.763068086220467e-06, + "loss": 0.4703, + "step": 3375 + }, + { + "epoch": 0.58, + "learning_rate": 7.757598657640813e-06, + "loss": 0.4587, + "step": 3376 + }, + { + "epoch": 0.59, + "learning_rate": 7.752129935292993e-06, + "loss": 0.4714, + "step": 3377 + }, + { + "epoch": 0.59, + "learning_rate": 7.746661920899351e-06, + "loss": 0.4716, + "step": 3378 + }, + { + "epoch": 0.59, + "learning_rate": 7.74119461618201e-06, + "loss": 0.4672, + "step": 3379 + }, + { + "epoch": 0.59, + "learning_rate": 7.735728022862865e-06, + "loss": 0.4719, + "step": 3380 + }, + { + "epoch": 0.59, + "learning_rate": 7.730262142663591e-06, + "loss": 0.4781, + "step": 3381 + }, + { + "epoch": 0.59, + "learning_rate": 7.724796977305632e-06, + "loss": 0.4406, + "step": 3382 + }, + { + "epoch": 0.59, + "learning_rate": 7.71933252851022e-06, + "loss": 0.4652, + "step": 3383 + }, + { + "epoch": 0.59, + "learning_rate": 7.713868797998342e-06, + "loss": 0.4732, + "step": 3384 + }, + { + "epoch": 0.59, + "learning_rate": 7.708405787490777e-06, + "loss": 0.4693, + "step": 3385 + }, + { + "epoch": 0.59, + "learning_rate": 7.702943498708069e-06, + "loss": 0.4669, + "step": 3386 + }, + { + "epoch": 0.59, + "learning_rate": 7.697481933370535e-06, + "loss": 0.4611, + "step": 3387 + }, + { + "epoch": 0.59, + "learning_rate": 7.692021093198264e-06, + "loss": 0.4602, + "step": 3388 + }, + { + "epoch": 0.59, + "learning_rate": 7.686560979911115e-06, + "loss": 0.4677, + "step": 3389 + }, + { + "epoch": 0.59, + "learning_rate": 7.681101595228727e-06, + "loss": 0.463, + "step": 3390 + }, + { + "epoch": 0.59, + "learning_rate": 7.67564294087049e-06, + "loss": 0.4554, + "step": 3391 + }, + { + "epoch": 0.59, + "learning_rate": 7.670185018555592e-06, + "loss": 0.4666, + "step": 3392 + }, + { + "epoch": 0.59, + "learning_rate": 7.664727830002967e-06, + "loss": 0.4804, + "step": 3393 + }, + { + "epoch": 0.59, + "learning_rate": 7.659271376931327e-06, + "loss": 0.4709, + "step": 3394 + }, + { + "epoch": 0.59, + "learning_rate": 7.653815661059156e-06, + "loss": 0.4663, + "step": 3395 + }, + { + "epoch": 0.59, + "learning_rate": 7.648360684104695e-06, + "loss": 0.4557, + "step": 3396 + }, + { + "epoch": 0.59, + "learning_rate": 7.642906447785962e-06, + "loss": 0.4693, + "step": 3397 + }, + { + "epoch": 0.59, + "learning_rate": 7.637452953820737e-06, + "loss": 0.4672, + "step": 3398 + }, + { + "epoch": 0.59, + "learning_rate": 7.632000203926564e-06, + "loss": 0.4754, + "step": 3399 + }, + { + "epoch": 0.59, + "learning_rate": 7.626548199820768e-06, + "loss": 0.4541, + "step": 3400 + }, + { + "epoch": 0.59, + "learning_rate": 7.621096943220416e-06, + "loss": 0.483, + "step": 3401 + }, + { + "epoch": 0.59, + "learning_rate": 7.6156464358423586e-06, + "loss": 0.4626, + "step": 3402 + }, + { + "epoch": 0.59, + "learning_rate": 7.610196679403195e-06, + "loss": 0.4719, + "step": 3403 + }, + { + "epoch": 0.59, + "learning_rate": 7.6047476756193035e-06, + "loss": 0.4599, + "step": 3404 + }, + { + "epoch": 0.59, + "learning_rate": 7.599299426206812e-06, + "loss": 0.4793, + "step": 3405 + }, + { + "epoch": 0.59, + "learning_rate": 7.5938519328816156e-06, + "loss": 0.4594, + "step": 3406 + }, + { + "epoch": 0.59, + "learning_rate": 7.588405197359381e-06, + "loss": 0.4724, + "step": 3407 + }, + { + "epoch": 0.59, + "learning_rate": 7.582959221355514e-06, + "loss": 0.4478, + "step": 3408 + }, + { + "epoch": 0.59, + "learning_rate": 7.577514006585209e-06, + "loss": 0.4568, + "step": 3409 + }, + { + "epoch": 0.59, + "learning_rate": 7.572069554763391e-06, + "loss": 0.4599, + "step": 3410 + }, + { + "epoch": 0.59, + "learning_rate": 7.5666258676047735e-06, + "loss": 0.468, + "step": 3411 + }, + { + "epoch": 0.59, + "learning_rate": 7.561182946823805e-06, + "loss": 0.4635, + "step": 3412 + }, + { + "epoch": 0.59, + "learning_rate": 7.5557407941347095e-06, + "loss": 0.4705, + "step": 3413 + }, + { + "epoch": 0.59, + "learning_rate": 7.550299411251461e-06, + "loss": 0.469, + "step": 3414 + }, + { + "epoch": 0.59, + "learning_rate": 7.54485879988779e-06, + "loss": 0.4667, + "step": 3415 + }, + { + "epoch": 0.59, + "learning_rate": 7.539418961757195e-06, + "loss": 0.464, + "step": 3416 + }, + { + "epoch": 0.59, + "learning_rate": 7.533979898572909e-06, + "loss": 0.4575, + "step": 3417 + }, + { + "epoch": 0.59, + "learning_rate": 7.528541612047953e-06, + "loss": 0.4699, + "step": 3418 + }, + { + "epoch": 0.59, + "learning_rate": 7.523104103895066e-06, + "loss": 0.4639, + "step": 3419 + }, + { + "epoch": 0.59, + "learning_rate": 7.517667375826772e-06, + "loss": 0.4799, + "step": 3420 + }, + { + "epoch": 0.59, + "learning_rate": 7.512231429555339e-06, + "loss": 0.4692, + "step": 3421 + }, + { + "epoch": 0.59, + "learning_rate": 7.506796266792782e-06, + "loss": 0.4812, + "step": 3422 + }, + { + "epoch": 0.59, + "learning_rate": 7.501361889250882e-06, + "loss": 0.4523, + "step": 3423 + }, + { + "epoch": 0.59, + "learning_rate": 7.4959282986411595e-06, + "loss": 0.4693, + "step": 3424 + }, + { + "epoch": 0.59, + "learning_rate": 7.490495496674899e-06, + "loss": 0.4711, + "step": 3425 + }, + { + "epoch": 0.59, + "learning_rate": 7.485063485063124e-06, + "loss": 0.4732, + "step": 3426 + }, + { + "epoch": 0.59, + "learning_rate": 7.479632265516626e-06, + "loss": 0.4637, + "step": 3427 + }, + { + "epoch": 0.59, + "learning_rate": 7.474201839745932e-06, + "loss": 0.4777, + "step": 3428 + }, + { + "epoch": 0.59, + "learning_rate": 7.468772209461324e-06, + "loss": 0.4588, + "step": 3429 + }, + { + "epoch": 0.59, + "learning_rate": 7.463343376372837e-06, + "loss": 0.4633, + "step": 3430 + }, + { + "epoch": 0.59, + "learning_rate": 7.457915342190247e-06, + "loss": 0.4508, + "step": 3431 + }, + { + "epoch": 0.59, + "learning_rate": 7.452488108623089e-06, + "loss": 0.4695, + "step": 3432 + }, + { + "epoch": 0.59, + "learning_rate": 7.447061677380635e-06, + "loss": 0.4594, + "step": 3433 + }, + { + "epoch": 0.59, + "learning_rate": 7.441636050171909e-06, + "loss": 0.4679, + "step": 3434 + }, + { + "epoch": 0.6, + "learning_rate": 7.436211228705687e-06, + "loss": 0.4748, + "step": 3435 + }, + { + "epoch": 0.6, + "learning_rate": 7.430787214690485e-06, + "loss": 0.4642, + "step": 3436 + }, + { + "epoch": 0.6, + "learning_rate": 7.425364009834563e-06, + "loss": 0.4695, + "step": 3437 + }, + { + "epoch": 0.6, + "learning_rate": 7.4199416158459316e-06, + "loss": 0.4621, + "step": 3438 + }, + { + "epoch": 0.6, + "learning_rate": 7.414520034432345e-06, + "loss": 0.465, + "step": 3439 + }, + { + "epoch": 0.6, + "learning_rate": 7.409099267301296e-06, + "loss": 0.4635, + "step": 3440 + }, + { + "epoch": 0.6, + "learning_rate": 7.403679316160024e-06, + "loss": 0.4551, + "step": 3441 + }, + { + "epoch": 0.6, + "learning_rate": 7.398260182715524e-06, + "loss": 0.4667, + "step": 3442 + }, + { + "epoch": 0.6, + "learning_rate": 7.392841868674506e-06, + "loss": 0.4818, + "step": 3443 + }, + { + "epoch": 0.6, + "learning_rate": 7.387424375743451e-06, + "loss": 0.4771, + "step": 3444 + }, + { + "epoch": 0.6, + "learning_rate": 7.3820077056285595e-06, + "loss": 0.4621, + "step": 3445 + }, + { + "epoch": 0.6, + "learning_rate": 7.3765918600357875e-06, + "loss": 0.4668, + "step": 3446 + }, + { + "epoch": 0.6, + "learning_rate": 7.371176840670822e-06, + "loss": 0.4691, + "step": 3447 + }, + { + "epoch": 0.6, + "learning_rate": 7.365762649239092e-06, + "loss": 0.4661, + "step": 3448 + }, + { + "epoch": 0.6, + "learning_rate": 7.360349287445774e-06, + "loss": 0.464, + "step": 3449 + }, + { + "epoch": 0.6, + "learning_rate": 7.354936756995766e-06, + "loss": 0.4699, + "step": 3450 + }, + { + "epoch": 0.6, + "learning_rate": 7.349525059593725e-06, + "loss": 0.4499, + "step": 3451 + }, + { + "epoch": 0.6, + "learning_rate": 7.344114196944023e-06, + "loss": 0.4578, + "step": 3452 + }, + { + "epoch": 0.6, + "learning_rate": 7.338704170750794e-06, + "loss": 0.4681, + "step": 3453 + }, + { + "epoch": 0.6, + "learning_rate": 7.333294982717887e-06, + "loss": 0.4698, + "step": 3454 + }, + { + "epoch": 0.6, + "learning_rate": 7.327886634548899e-06, + "loss": 0.4616, + "step": 3455 + }, + { + "epoch": 0.6, + "learning_rate": 7.32247912794716e-06, + "loss": 0.4617, + "step": 3456 + }, + { + "epoch": 0.6, + "learning_rate": 7.3170724646157284e-06, + "loss": 0.4624, + "step": 3457 + }, + { + "epoch": 0.6, + "learning_rate": 7.311666646257412e-06, + "loss": 0.4685, + "step": 3458 + }, + { + "epoch": 0.6, + "learning_rate": 7.3062616745747325e-06, + "loss": 0.4553, + "step": 3459 + }, + { + "epoch": 0.6, + "learning_rate": 7.300857551269969e-06, + "loss": 0.4675, + "step": 3460 + }, + { + "epoch": 0.6, + "learning_rate": 7.295454278045104e-06, + "loss": 0.4565, + "step": 3461 + }, + { + "epoch": 0.6, + "learning_rate": 7.290051856601879e-06, + "loss": 0.4733, + "step": 3462 + }, + { + "epoch": 0.6, + "learning_rate": 7.28465028864176e-06, + "loss": 0.45, + "step": 3463 + }, + { + "epoch": 0.6, + "learning_rate": 7.279249575865929e-06, + "loss": 0.4632, + "step": 3464 + }, + { + "epoch": 0.6, + "learning_rate": 7.27384971997532e-06, + "loss": 0.4666, + "step": 3465 + }, + { + "epoch": 0.6, + "learning_rate": 7.268450722670582e-06, + "loss": 0.4735, + "step": 3466 + }, + { + "epoch": 0.6, + "learning_rate": 7.263052585652104e-06, + "loss": 0.4551, + "step": 3467 + }, + { + "epoch": 0.6, + "learning_rate": 7.257655310619996e-06, + "loss": 0.473, + "step": 3468 + }, + { + "epoch": 0.6, + "learning_rate": 7.252258899274096e-06, + "loss": 0.4582, + "step": 3469 + }, + { + "epoch": 0.6, + "learning_rate": 7.246863353313983e-06, + "loss": 0.4638, + "step": 3470 + }, + { + "epoch": 0.6, + "learning_rate": 7.241468674438947e-06, + "loss": 0.4695, + "step": 3471 + }, + { + "epoch": 0.6, + "learning_rate": 7.236074864348017e-06, + "loss": 0.4695, + "step": 3472 + }, + { + "epoch": 0.6, + "learning_rate": 7.230681924739939e-06, + "loss": 0.4592, + "step": 3473 + }, + { + "epoch": 0.6, + "learning_rate": 7.225289857313194e-06, + "loss": 0.4716, + "step": 3474 + }, + { + "epoch": 0.6, + "learning_rate": 7.219898663765979e-06, + "loss": 0.477, + "step": 3475 + }, + { + "epoch": 0.6, + "learning_rate": 7.214508345796218e-06, + "loss": 0.464, + "step": 3476 + }, + { + "epoch": 0.6, + "learning_rate": 7.209118905101575e-06, + "loss": 0.4659, + "step": 3477 + }, + { + "epoch": 0.6, + "learning_rate": 7.203730343379408e-06, + "loss": 0.4538, + "step": 3478 + }, + { + "epoch": 0.6, + "learning_rate": 7.198342662326827e-06, + "loss": 0.4664, + "step": 3479 + }, + { + "epoch": 0.6, + "learning_rate": 7.192955863640645e-06, + "loss": 0.4818, + "step": 3480 + }, + { + "epoch": 0.6, + "learning_rate": 7.187569949017408e-06, + "loss": 0.4638, + "step": 3481 + }, + { + "epoch": 0.6, + "learning_rate": 7.1821849201533765e-06, + "loss": 0.4564, + "step": 3482 + }, + { + "epoch": 0.6, + "learning_rate": 7.176800778744537e-06, + "loss": 0.4707, + "step": 3483 + }, + { + "epoch": 0.6, + "learning_rate": 7.1714175264865975e-06, + "loss": 0.4766, + "step": 3484 + }, + { + "epoch": 0.6, + "learning_rate": 7.166035165074976e-06, + "loss": 0.4582, + "step": 3485 + }, + { + "epoch": 0.6, + "learning_rate": 7.16065369620483e-06, + "loss": 0.4723, + "step": 3486 + }, + { + "epoch": 0.6, + "learning_rate": 7.155273121571009e-06, + "loss": 0.4649, + "step": 3487 + }, + { + "epoch": 0.6, + "learning_rate": 7.149893442868105e-06, + "loss": 0.4558, + "step": 3488 + }, + { + "epoch": 0.6, + "learning_rate": 7.1445146617904135e-06, + "loss": 0.4584, + "step": 3489 + }, + { + "epoch": 0.6, + "learning_rate": 7.139136780031953e-06, + "loss": 0.4733, + "step": 3490 + }, + { + "epoch": 0.6, + "learning_rate": 7.133759799286458e-06, + "loss": 0.4657, + "step": 3491 + }, + { + "epoch": 0.6, + "learning_rate": 7.128383721247376e-06, + "loss": 0.4882, + "step": 3492 + }, + { + "epoch": 0.61, + "learning_rate": 7.123008547607877e-06, + "loss": 0.4711, + "step": 3493 + }, + { + "epoch": 0.61, + "learning_rate": 7.1176342800608365e-06, + "loss": 0.4656, + "step": 3494 + }, + { + "epoch": 0.61, + "learning_rate": 7.112260920298859e-06, + "loss": 0.4632, + "step": 3495 + }, + { + "epoch": 0.61, + "learning_rate": 7.1068884700142416e-06, + "loss": 0.4702, + "step": 3496 + }, + { + "epoch": 0.61, + "learning_rate": 7.101516930899019e-06, + "loss": 0.4586, + "step": 3497 + }, + { + "epoch": 0.61, + "learning_rate": 7.096146304644924e-06, + "loss": 0.475, + "step": 3498 + }, + { + "epoch": 0.61, + "learning_rate": 7.090776592943402e-06, + "loss": 0.456, + "step": 3499 + }, + { + "epoch": 0.61, + "learning_rate": 7.08540779748562e-06, + "loss": 0.4742, + "step": 3500 + }, + { + "epoch": 0.61, + "learning_rate": 7.080039919962445e-06, + "loss": 0.4584, + "step": 3501 + }, + { + "epoch": 0.61, + "learning_rate": 7.074672962064464e-06, + "loss": 0.4636, + "step": 3502 + }, + { + "epoch": 0.61, + "learning_rate": 7.069306925481965e-06, + "loss": 0.4546, + "step": 3503 + }, + { + "epoch": 0.61, + "learning_rate": 7.063941811904956e-06, + "loss": 0.4525, + "step": 3504 + }, + { + "epoch": 0.61, + "learning_rate": 7.058577623023153e-06, + "loss": 0.4522, + "step": 3505 + }, + { + "epoch": 0.61, + "learning_rate": 7.0532143605259686e-06, + "loss": 0.4889, + "step": 3506 + }, + { + "epoch": 0.61, + "learning_rate": 7.047852026102541e-06, + "loss": 0.4739, + "step": 3507 + }, + { + "epoch": 0.61, + "learning_rate": 7.042490621441701e-06, + "loss": 0.4732, + "step": 3508 + }, + { + "epoch": 0.61, + "learning_rate": 7.0371301482319985e-06, + "loss": 0.4569, + "step": 3509 + }, + { + "epoch": 0.61, + "learning_rate": 7.0317706081616785e-06, + "loss": 0.4685, + "step": 3510 + }, + { + "epoch": 0.61, + "learning_rate": 7.026412002918701e-06, + "loss": 0.4413, + "step": 3511 + }, + { + "epoch": 0.61, + "learning_rate": 7.021054334190736e-06, + "loss": 0.4756, + "step": 3512 + }, + { + "epoch": 0.61, + "learning_rate": 7.015697603665141e-06, + "loss": 0.456, + "step": 3513 + }, + { + "epoch": 0.61, + "learning_rate": 7.010341813028996e-06, + "loss": 0.4734, + "step": 3514 + }, + { + "epoch": 0.61, + "learning_rate": 7.004986963969072e-06, + "loss": 0.457, + "step": 3515 + }, + { + "epoch": 0.61, + "learning_rate": 6.999633058171853e-06, + "loss": 0.4699, + "step": 3516 + }, + { + "epoch": 0.61, + "learning_rate": 6.994280097323519e-06, + "loss": 0.469, + "step": 3517 + }, + { + "epoch": 0.61, + "learning_rate": 6.988928083109954e-06, + "loss": 0.4659, + "step": 3518 + }, + { + "epoch": 0.61, + "learning_rate": 6.9835770172167535e-06, + "loss": 0.4597, + "step": 3519 + }, + { + "epoch": 0.61, + "learning_rate": 6.978226901329195e-06, + "loss": 0.4713, + "step": 3520 + }, + { + "epoch": 0.61, + "learning_rate": 6.9728777371322775e-06, + "loss": 0.4692, + "step": 3521 + }, + { + "epoch": 0.61, + "learning_rate": 6.967529526310681e-06, + "loss": 0.4704, + "step": 3522 + }, + { + "epoch": 0.61, + "learning_rate": 6.962182270548803e-06, + "loss": 0.462, + "step": 3523 + }, + { + "epoch": 0.61, + "learning_rate": 6.9568359715307265e-06, + "loss": 0.4608, + "step": 3524 + }, + { + "epoch": 0.61, + "learning_rate": 6.951490630940241e-06, + "loss": 0.4685, + "step": 3525 + }, + { + "epoch": 0.61, + "learning_rate": 6.9461462504608335e-06, + "loss": 0.4449, + "step": 3526 + }, + { + "epoch": 0.61, + "learning_rate": 6.94080283177568e-06, + "loss": 0.4617, + "step": 3527 + }, + { + "epoch": 0.61, + "learning_rate": 6.935460376567673e-06, + "loss": 0.484, + "step": 3528 + }, + { + "epoch": 0.61, + "learning_rate": 6.930118886519374e-06, + "loss": 0.4661, + "step": 3529 + }, + { + "epoch": 0.61, + "learning_rate": 6.924778363313071e-06, + "loss": 0.4587, + "step": 3530 + }, + { + "epoch": 0.61, + "learning_rate": 6.919438808630716e-06, + "loss": 0.4628, + "step": 3531 + }, + { + "epoch": 0.61, + "learning_rate": 6.914100224153983e-06, + "loss": 0.471, + "step": 3532 + }, + { + "epoch": 0.61, + "learning_rate": 6.90876261156423e-06, + "loss": 0.4574, + "step": 3533 + }, + { + "epoch": 0.61, + "learning_rate": 6.903425972542501e-06, + "loss": 0.465, + "step": 3534 + }, + { + "epoch": 0.61, + "learning_rate": 6.898090308769548e-06, + "loss": 0.4648, + "step": 3535 + }, + { + "epoch": 0.61, + "learning_rate": 6.892755621925804e-06, + "loss": 0.4702, + "step": 3536 + }, + { + "epoch": 0.61, + "learning_rate": 6.887421913691402e-06, + "loss": 0.4554, + "step": 3537 + }, + { + "epoch": 0.61, + "learning_rate": 6.882089185746158e-06, + "loss": 0.4733, + "step": 3538 + }, + { + "epoch": 0.61, + "learning_rate": 6.876757439769592e-06, + "loss": 0.4595, + "step": 3539 + }, + { + "epoch": 0.61, + "learning_rate": 6.871426677440907e-06, + "loss": 0.4607, + "step": 3540 + }, + { + "epoch": 0.61, + "learning_rate": 6.866096900438992e-06, + "loss": 0.4549, + "step": 3541 + }, + { + "epoch": 0.61, + "learning_rate": 6.860768110442438e-06, + "loss": 0.4729, + "step": 3542 + }, + { + "epoch": 0.61, + "learning_rate": 6.855440309129509e-06, + "loss": 0.4623, + "step": 3543 + }, + { + "epoch": 0.61, + "learning_rate": 6.850113498178173e-06, + "loss": 0.4746, + "step": 3544 + }, + { + "epoch": 0.61, + "learning_rate": 6.844787679266076e-06, + "loss": 0.4501, + "step": 3545 + }, + { + "epoch": 0.61, + "learning_rate": 6.839462854070554e-06, + "loss": 0.4743, + "step": 3546 + }, + { + "epoch": 0.61, + "learning_rate": 6.834139024268638e-06, + "loss": 0.4688, + "step": 3547 + }, + { + "epoch": 0.61, + "learning_rate": 6.828816191537032e-06, + "loss": 0.4658, + "step": 3548 + }, + { + "epoch": 0.61, + "learning_rate": 6.8234943575521365e-06, + "loss": 0.4566, + "step": 3549 + }, + { + "epoch": 0.61, + "learning_rate": 6.818173523990029e-06, + "loss": 0.4841, + "step": 3550 + }, + { + "epoch": 0.62, + "learning_rate": 6.812853692526482e-06, + "loss": 0.4527, + "step": 3551 + }, + { + "epoch": 0.62, + "learning_rate": 6.807534864836942e-06, + "loss": 0.4757, + "step": 3552 + }, + { + "epoch": 0.62, + "learning_rate": 6.802217042596544e-06, + "loss": 0.4675, + "step": 3553 + }, + { + "epoch": 0.62, + "learning_rate": 6.7969002274801145e-06, + "loss": 0.4678, + "step": 3554 + }, + { + "epoch": 0.62, + "learning_rate": 6.791584421162143e-06, + "loss": 0.4627, + "step": 3555 + }, + { + "epoch": 0.62, + "learning_rate": 6.7862696253168225e-06, + "loss": 0.4719, + "step": 3556 + }, + { + "epoch": 0.62, + "learning_rate": 6.780955841618013e-06, + "loss": 0.4641, + "step": 3557 + }, + { + "epoch": 0.62, + "learning_rate": 6.775643071739267e-06, + "loss": 0.4674, + "step": 3558 + }, + { + "epoch": 0.62, + "learning_rate": 6.770331317353804e-06, + "loss": 0.4595, + "step": 3559 + }, + { + "epoch": 0.62, + "learning_rate": 6.765020580134538e-06, + "loss": 0.4496, + "step": 3560 + }, + { + "epoch": 0.62, + "learning_rate": 6.759710861754054e-06, + "loss": 0.4621, + "step": 3561 + }, + { + "epoch": 0.62, + "learning_rate": 6.7544021638846145e-06, + "loss": 0.4679, + "step": 3562 + }, + { + "epoch": 0.62, + "learning_rate": 6.749094488198173e-06, + "loss": 0.4621, + "step": 3563 + }, + { + "epoch": 0.62, + "learning_rate": 6.743787836366343e-06, + "loss": 0.4696, + "step": 3564 + }, + { + "epoch": 0.62, + "learning_rate": 6.738482210060433e-06, + "loss": 0.4578, + "step": 3565 + }, + { + "epoch": 0.62, + "learning_rate": 6.733177610951414e-06, + "loss": 0.4576, + "step": 3566 + }, + { + "epoch": 0.62, + "learning_rate": 6.727874040709943e-06, + "loss": 0.459, + "step": 3567 + }, + { + "epoch": 0.62, + "learning_rate": 6.7225715010063516e-06, + "loss": 0.47, + "step": 3568 + }, + { + "epoch": 0.62, + "learning_rate": 6.717269993510642e-06, + "loss": 0.4695, + "step": 3569 + }, + { + "epoch": 0.62, + "learning_rate": 6.711969519892499e-06, + "loss": 0.4753, + "step": 3570 + }, + { + "epoch": 0.62, + "learning_rate": 6.706670081821267e-06, + "loss": 0.4626, + "step": 3571 + }, + { + "epoch": 0.62, + "learning_rate": 6.70137168096599e-06, + "loss": 0.4655, + "step": 3572 + }, + { + "epoch": 0.62, + "learning_rate": 6.696074318995355e-06, + "loss": 0.457, + "step": 3573 + }, + { + "epoch": 0.62, + "learning_rate": 6.690777997577745e-06, + "loss": 0.4677, + "step": 3574 + }, + { + "epoch": 0.62, + "learning_rate": 6.685482718381209e-06, + "loss": 0.4578, + "step": 3575 + }, + { + "epoch": 0.62, + "learning_rate": 6.680188483073458e-06, + "loss": 0.4644, + "step": 3576 + }, + { + "epoch": 0.62, + "learning_rate": 6.6748952933218895e-06, + "loss": 0.4594, + "step": 3577 + }, + { + "epoch": 0.62, + "learning_rate": 6.6696031507935575e-06, + "loss": 0.4721, + "step": 3578 + }, + { + "epoch": 0.62, + "learning_rate": 6.664312057155199e-06, + "loss": 0.4788, + "step": 3579 + }, + { + "epoch": 0.62, + "learning_rate": 6.659022014073209e-06, + "loss": 0.4659, + "step": 3580 + }, + { + "epoch": 0.62, + "learning_rate": 6.653733023213658e-06, + "loss": 0.4614, + "step": 3581 + }, + { + "epoch": 0.62, + "learning_rate": 6.64844508624229e-06, + "loss": 0.4639, + "step": 3582 + }, + { + "epoch": 0.62, + "learning_rate": 6.643158204824506e-06, + "loss": 0.4665, + "step": 3583 + }, + { + "epoch": 0.62, + "learning_rate": 6.637872380625383e-06, + "loss": 0.4654, + "step": 3584 + }, + { + "epoch": 0.62, + "learning_rate": 6.632587615309658e-06, + "loss": 0.4663, + "step": 3585 + }, + { + "epoch": 0.62, + "learning_rate": 6.627303910541743e-06, + "loss": 0.4588, + "step": 3586 + }, + { + "epoch": 0.62, + "learning_rate": 6.622021267985705e-06, + "loss": 0.4555, + "step": 3587 + }, + { + "epoch": 0.62, + "learning_rate": 6.616739689305287e-06, + "loss": 0.4616, + "step": 3588 + }, + { + "epoch": 0.62, + "learning_rate": 6.6114591761638995e-06, + "loss": 0.4486, + "step": 3589 + }, + { + "epoch": 0.62, + "learning_rate": 6.606179730224598e-06, + "loss": 0.4681, + "step": 3590 + }, + { + "epoch": 0.62, + "learning_rate": 6.600901353150123e-06, + "loss": 0.468, + "step": 3591 + }, + { + "epoch": 0.62, + "learning_rate": 6.595624046602867e-06, + "loss": 0.4731, + "step": 3592 + }, + { + "epoch": 0.62, + "learning_rate": 6.59034781224489e-06, + "loss": 0.453, + "step": 3593 + }, + { + "epoch": 0.62, + "learning_rate": 6.585072651737911e-06, + "loss": 0.4699, + "step": 3594 + }, + { + "epoch": 0.62, + "learning_rate": 6.579798566743314e-06, + "loss": 0.4721, + "step": 3595 + }, + { + "epoch": 0.62, + "learning_rate": 6.574525558922142e-06, + "loss": 0.4519, + "step": 3596 + }, + { + "epoch": 0.62, + "learning_rate": 6.5692536299350974e-06, + "loss": 0.4623, + "step": 3597 + }, + { + "epoch": 0.62, + "learning_rate": 6.563982781442551e-06, + "loss": 0.4699, + "step": 3598 + }, + { + "epoch": 0.62, + "learning_rate": 6.558713015104519e-06, + "loss": 0.4544, + "step": 3599 + }, + { + "epoch": 0.62, + "learning_rate": 6.553444332580692e-06, + "loss": 0.4631, + "step": 3600 + }, + { + "epoch": 0.62, + "learning_rate": 6.54817673553041e-06, + "loss": 0.4713, + "step": 3601 + }, + { + "epoch": 0.62, + "learning_rate": 6.54291022561267e-06, + "loss": 0.461, + "step": 3602 + }, + { + "epoch": 0.62, + "learning_rate": 6.537644804486136e-06, + "loss": 0.4755, + "step": 3603 + }, + { + "epoch": 0.62, + "learning_rate": 6.532380473809118e-06, + "loss": 0.452, + "step": 3604 + }, + { + "epoch": 0.62, + "learning_rate": 6.527117235239591e-06, + "loss": 0.471, + "step": 3605 + }, + { + "epoch": 0.62, + "learning_rate": 6.521855090435178e-06, + "loss": 0.4683, + "step": 3606 + }, + { + "epoch": 0.62, + "learning_rate": 6.516594041053173e-06, + "loss": 0.4794, + "step": 3607 + }, + { + "epoch": 0.63, + "learning_rate": 6.511334088750501e-06, + "loss": 0.4724, + "step": 3608 + }, + { + "epoch": 0.63, + "learning_rate": 6.50607523518376e-06, + "loss": 0.4787, + "step": 3609 + }, + { + "epoch": 0.63, + "learning_rate": 6.500817482009201e-06, + "loss": 0.4538, + "step": 3610 + }, + { + "epoch": 0.63, + "learning_rate": 6.495560830882719e-06, + "loss": 0.4778, + "step": 3611 + }, + { + "epoch": 0.63, + "learning_rate": 6.49030528345987e-06, + "loss": 0.4512, + "step": 3612 + }, + { + "epoch": 0.63, + "learning_rate": 6.4850508413958564e-06, + "loss": 0.4702, + "step": 3613 + }, + { + "epoch": 0.63, + "learning_rate": 6.479797506345539e-06, + "loss": 0.4632, + "step": 3614 + }, + { + "epoch": 0.63, + "learning_rate": 6.47454527996342e-06, + "loss": 0.4605, + "step": 3615 + }, + { + "epoch": 0.63, + "learning_rate": 6.469294163903666e-06, + "loss": 0.4625, + "step": 3616 + }, + { + "epoch": 0.63, + "learning_rate": 6.464044159820086e-06, + "loss": 0.4677, + "step": 3617 + }, + { + "epoch": 0.63, + "learning_rate": 6.458795269366136e-06, + "loss": 0.4603, + "step": 3618 + }, + { + "epoch": 0.63, + "learning_rate": 6.453547494194929e-06, + "loss": 0.4607, + "step": 3619 + }, + { + "epoch": 0.63, + "learning_rate": 6.448300835959218e-06, + "loss": 0.4366, + "step": 3620 + }, + { + "epoch": 0.63, + "learning_rate": 6.443055296311413e-06, + "loss": 0.4668, + "step": 3621 + }, + { + "epoch": 0.63, + "learning_rate": 6.4378108769035644e-06, + "loss": 0.452, + "step": 3622 + }, + { + "epoch": 0.63, + "learning_rate": 6.432567579387372e-06, + "loss": 0.4678, + "step": 3623 + }, + { + "epoch": 0.63, + "learning_rate": 6.427325405414189e-06, + "loss": 0.4571, + "step": 3624 + }, + { + "epoch": 0.63, + "learning_rate": 6.422084356635003e-06, + "loss": 0.4564, + "step": 3625 + }, + { + "epoch": 0.63, + "learning_rate": 6.41684443470046e-06, + "loss": 0.4556, + "step": 3626 + }, + { + "epoch": 0.63, + "learning_rate": 6.4116056412608355e-06, + "loss": 0.4651, + "step": 3627 + }, + { + "epoch": 0.63, + "learning_rate": 6.406367977966066e-06, + "loss": 0.4597, + "step": 3628 + }, + { + "epoch": 0.63, + "learning_rate": 6.4011314464657186e-06, + "loss": 0.4807, + "step": 3629 + }, + { + "epoch": 0.63, + "learning_rate": 6.3958960484090094e-06, + "loss": 0.4596, + "step": 3630 + }, + { + "epoch": 0.63, + "learning_rate": 6.390661785444809e-06, + "loss": 0.4634, + "step": 3631 + }, + { + "epoch": 0.63, + "learning_rate": 6.385428659221604e-06, + "loss": 0.4588, + "step": 3632 + }, + { + "epoch": 0.63, + "learning_rate": 6.38019667138755e-06, + "loss": 0.477, + "step": 3633 + }, + { + "epoch": 0.63, + "learning_rate": 6.374965823590425e-06, + "loss": 0.4646, + "step": 3634 + }, + { + "epoch": 0.63, + "learning_rate": 6.369736117477662e-06, + "loss": 0.4688, + "step": 3635 + }, + { + "epoch": 0.63, + "learning_rate": 6.364507554696322e-06, + "loss": 0.4649, + "step": 3636 + }, + { + "epoch": 0.63, + "learning_rate": 6.3592801368931134e-06, + "loss": 0.4653, + "step": 3637 + }, + { + "epoch": 0.63, + "learning_rate": 6.354053865714387e-06, + "loss": 0.4684, + "step": 3638 + }, + { + "epoch": 0.63, + "learning_rate": 6.348828742806122e-06, + "loss": 0.4755, + "step": 3639 + }, + { + "epoch": 0.63, + "learning_rate": 6.343604769813945e-06, + "loss": 0.468, + "step": 3640 + }, + { + "epoch": 0.63, + "learning_rate": 6.338381948383111e-06, + "loss": 0.4654, + "step": 3641 + }, + { + "epoch": 0.63, + "learning_rate": 6.33316028015853e-06, + "loss": 0.4646, + "step": 3642 + }, + { + "epoch": 0.63, + "learning_rate": 6.3279397667847265e-06, + "loss": 0.4656, + "step": 3643 + }, + { + "epoch": 0.63, + "learning_rate": 6.322720409905878e-06, + "loss": 0.4594, + "step": 3644 + }, + { + "epoch": 0.63, + "learning_rate": 6.317502211165794e-06, + "loss": 0.4647, + "step": 3645 + }, + { + "epoch": 0.63, + "learning_rate": 6.312285172207909e-06, + "loss": 0.4608, + "step": 3646 + }, + { + "epoch": 0.63, + "learning_rate": 6.30706929467531e-06, + "loss": 0.4775, + "step": 3647 + }, + { + "epoch": 0.63, + "learning_rate": 6.3018545802107e-06, + "loss": 0.4598, + "step": 3648 + }, + { + "epoch": 0.63, + "learning_rate": 6.2966410304564304e-06, + "loss": 0.4789, + "step": 3649 + }, + { + "epoch": 0.63, + "learning_rate": 6.291428647054474e-06, + "loss": 0.4667, + "step": 3650 + }, + { + "epoch": 0.63, + "learning_rate": 6.286217431646447e-06, + "loss": 0.4702, + "step": 3651 + }, + { + "epoch": 0.63, + "learning_rate": 6.281007385873594e-06, + "loss": 0.454, + "step": 3652 + }, + { + "epoch": 0.63, + "learning_rate": 6.275798511376785e-06, + "loss": 0.4691, + "step": 3653 + }, + { + "epoch": 0.63, + "learning_rate": 6.270590809796531e-06, + "loss": 0.4717, + "step": 3654 + }, + { + "epoch": 0.63, + "learning_rate": 6.265384282772961e-06, + "loss": 0.4571, + "step": 3655 + }, + { + "epoch": 0.63, + "learning_rate": 6.260178931945852e-06, + "loss": 0.4604, + "step": 3656 + }, + { + "epoch": 0.63, + "learning_rate": 6.25497475895459e-06, + "loss": 0.4663, + "step": 3657 + }, + { + "epoch": 0.63, + "learning_rate": 6.249771765438205e-06, + "loss": 0.4655, + "step": 3658 + }, + { + "epoch": 0.63, + "learning_rate": 6.244569953035355e-06, + "loss": 0.4662, + "step": 3659 + }, + { + "epoch": 0.63, + "learning_rate": 6.2393693233843155e-06, + "loss": 0.4591, + "step": 3660 + }, + { + "epoch": 0.63, + "learning_rate": 6.234169878123001e-06, + "loss": 0.475, + "step": 3661 + }, + { + "epoch": 0.63, + "learning_rate": 6.228971618888943e-06, + "loss": 0.466, + "step": 3662 + }, + { + "epoch": 0.63, + "learning_rate": 6.223774547319308e-06, + "loss": 0.4608, + "step": 3663 + }, + { + "epoch": 0.63, + "learning_rate": 6.218578665050883e-06, + "loss": 0.4663, + "step": 3664 + }, + { + "epoch": 0.63, + "learning_rate": 6.2133839737200795e-06, + "loss": 0.4641, + "step": 3665 + }, + { + "epoch": 0.64, + "learning_rate": 6.208190474962945e-06, + "loss": 0.4789, + "step": 3666 + }, + { + "epoch": 0.64, + "learning_rate": 6.202998170415133e-06, + "loss": 0.4632, + "step": 3667 + }, + { + "epoch": 0.64, + "learning_rate": 6.19780706171194e-06, + "loss": 0.4625, + "step": 3668 + }, + { + "epoch": 0.64, + "learning_rate": 6.19261715048827e-06, + "loss": 0.4738, + "step": 3669 + }, + { + "epoch": 0.64, + "learning_rate": 6.187428438378662e-06, + "loss": 0.4628, + "step": 3670 + }, + { + "epoch": 0.64, + "learning_rate": 6.1822409270172665e-06, + "loss": 0.4553, + "step": 3671 + }, + { + "epoch": 0.64, + "learning_rate": 6.177054618037866e-06, + "loss": 0.4724, + "step": 3672 + }, + { + "epoch": 0.64, + "learning_rate": 6.171869513073858e-06, + "loss": 0.4736, + "step": 3673 + }, + { + "epoch": 0.64, + "learning_rate": 6.166685613758259e-06, + "loss": 0.4536, + "step": 3674 + }, + { + "epoch": 0.64, + "learning_rate": 6.161502921723719e-06, + "loss": 0.4635, + "step": 3675 + }, + { + "epoch": 0.64, + "learning_rate": 6.156321438602484e-06, + "loss": 0.4506, + "step": 3676 + }, + { + "epoch": 0.64, + "learning_rate": 6.1511411660264485e-06, + "loss": 0.4714, + "step": 3677 + }, + { + "epoch": 0.64, + "learning_rate": 6.145962105627097e-06, + "loss": 0.4637, + "step": 3678 + }, + { + "epoch": 0.64, + "learning_rate": 6.140784259035553e-06, + "loss": 0.4725, + "step": 3679 + }, + { + "epoch": 0.64, + "learning_rate": 6.1356076278825516e-06, + "loss": 0.4605, + "step": 3680 + }, + { + "epoch": 0.64, + "learning_rate": 6.130432213798441e-06, + "loss": 0.4652, + "step": 3681 + }, + { + "epoch": 0.64, + "learning_rate": 6.125258018413191e-06, + "loss": 0.4574, + "step": 3682 + }, + { + "epoch": 0.64, + "learning_rate": 6.120085043356378e-06, + "loss": 0.4687, + "step": 3683 + }, + { + "epoch": 0.64, + "learning_rate": 6.114913290257219e-06, + "loss": 0.4612, + "step": 3684 + }, + { + "epoch": 0.64, + "learning_rate": 6.109742760744508e-06, + "loss": 0.4657, + "step": 3685 + }, + { + "epoch": 0.64, + "learning_rate": 6.104573456446687e-06, + "loss": 0.4554, + "step": 3686 + }, + { + "epoch": 0.64, + "learning_rate": 6.0994053789918004e-06, + "loss": 0.473, + "step": 3687 + }, + { + "epoch": 0.64, + "learning_rate": 6.094238530007501e-06, + "loss": 0.4553, + "step": 3688 + }, + { + "epoch": 0.64, + "learning_rate": 6.089072911121061e-06, + "loss": 0.4625, + "step": 3689 + }, + { + "epoch": 0.64, + "learning_rate": 6.083908523959362e-06, + "loss": 0.4584, + "step": 3690 + }, + { + "epoch": 0.64, + "learning_rate": 6.078745370148902e-06, + "loss": 0.4723, + "step": 3691 + }, + { + "epoch": 0.64, + "learning_rate": 6.073583451315782e-06, + "loss": 0.4441, + "step": 3692 + }, + { + "epoch": 0.64, + "learning_rate": 6.068422769085722e-06, + "loss": 0.4579, + "step": 3693 + }, + { + "epoch": 0.64, + "learning_rate": 6.063263325084054e-06, + "loss": 0.4471, + "step": 3694 + }, + { + "epoch": 0.64, + "learning_rate": 6.0581051209357135e-06, + "loss": 0.4524, + "step": 3695 + }, + { + "epoch": 0.64, + "learning_rate": 6.052948158265248e-06, + "loss": 0.4682, + "step": 3696 + }, + { + "epoch": 0.64, + "learning_rate": 6.047792438696813e-06, + "loss": 0.4705, + "step": 3697 + }, + { + "epoch": 0.64, + "learning_rate": 6.042637963854179e-06, + "loss": 0.4437, + "step": 3698 + }, + { + "epoch": 0.64, + "learning_rate": 6.037484735360711e-06, + "loss": 0.4598, + "step": 3699 + }, + { + "epoch": 0.64, + "learning_rate": 6.0323327548393926e-06, + "loss": 0.4514, + "step": 3700 + }, + { + "epoch": 0.64, + "learning_rate": 6.027182023912819e-06, + "loss": 0.4683, + "step": 3701 + }, + { + "epoch": 0.64, + "learning_rate": 6.0220325442031714e-06, + "loss": 0.4591, + "step": 3702 + }, + { + "epoch": 0.64, + "learning_rate": 6.016884317332261e-06, + "loss": 0.459, + "step": 3703 + }, + { + "epoch": 0.64, + "learning_rate": 6.011737344921487e-06, + "loss": 0.4688, + "step": 3704 + }, + { + "epoch": 0.64, + "learning_rate": 6.0065916285918625e-06, + "loss": 0.4723, + "step": 3705 + }, + { + "epoch": 0.64, + "learning_rate": 6.001447169964e-06, + "loss": 0.4551, + "step": 3706 + }, + { + "epoch": 0.64, + "learning_rate": 5.996303970658119e-06, + "loss": 0.472, + "step": 3707 + }, + { + "epoch": 0.64, + "learning_rate": 5.991162032294042e-06, + "loss": 0.475, + "step": 3708 + }, + { + "epoch": 0.64, + "learning_rate": 5.986021356491192e-06, + "loss": 0.4661, + "step": 3709 + }, + { + "epoch": 0.64, + "learning_rate": 5.980881944868604e-06, + "loss": 0.4583, + "step": 3710 + }, + { + "epoch": 0.64, + "learning_rate": 5.975743799044894e-06, + "loss": 0.4576, + "step": 3711 + }, + { + "epoch": 0.64, + "learning_rate": 5.970606920638304e-06, + "loss": 0.4629, + "step": 3712 + }, + { + "epoch": 0.64, + "learning_rate": 5.965471311266658e-06, + "loss": 0.4794, + "step": 3713 + }, + { + "epoch": 0.64, + "learning_rate": 5.960336972547391e-06, + "loss": 0.4629, + "step": 3714 + }, + { + "epoch": 0.64, + "learning_rate": 5.955203906097537e-06, + "loss": 0.4614, + "step": 3715 + }, + { + "epoch": 0.64, + "learning_rate": 5.9500721135337205e-06, + "loss": 0.4714, + "step": 3716 + }, + { + "epoch": 0.64, + "learning_rate": 5.944941596472176e-06, + "loss": 0.4577, + "step": 3717 + }, + { + "epoch": 0.64, + "learning_rate": 5.939812356528727e-06, + "loss": 0.4576, + "step": 3718 + }, + { + "epoch": 0.64, + "learning_rate": 5.934684395318806e-06, + "loss": 0.4625, + "step": 3719 + }, + { + "epoch": 0.64, + "learning_rate": 5.929557714457425e-06, + "loss": 0.4628, + "step": 3720 + }, + { + "epoch": 0.64, + "learning_rate": 5.924432315559213e-06, + "loss": 0.4631, + "step": 3721 + }, + { + "epoch": 0.64, + "learning_rate": 5.919308200238385e-06, + "loss": 0.4518, + "step": 3722 + }, + { + "epoch": 0.64, + "learning_rate": 5.914185370108749e-06, + "loss": 0.4709, + "step": 3723 + }, + { + "epoch": 0.65, + "learning_rate": 5.9090638267837144e-06, + "loss": 0.4643, + "step": 3724 + }, + { + "epoch": 0.65, + "learning_rate": 5.90394357187628e-06, + "loss": 0.4795, + "step": 3725 + }, + { + "epoch": 0.65, + "learning_rate": 5.898824606999047e-06, + "loss": 0.444, + "step": 3726 + }, + { + "epoch": 0.65, + "learning_rate": 5.893706933764196e-06, + "loss": 0.4698, + "step": 3727 + }, + { + "epoch": 0.65, + "learning_rate": 5.888590553783517e-06, + "loss": 0.4571, + "step": 3728 + }, + { + "epoch": 0.65, + "learning_rate": 5.883475468668387e-06, + "loss": 0.4757, + "step": 3729 + }, + { + "epoch": 0.65, + "learning_rate": 5.8783616800297675e-06, + "loss": 0.4632, + "step": 3730 + }, + { + "epoch": 0.65, + "learning_rate": 5.873249189478221e-06, + "loss": 0.4665, + "step": 3731 + }, + { + "epoch": 0.65, + "learning_rate": 5.868137998623897e-06, + "loss": 0.4642, + "step": 3732 + }, + { + "epoch": 0.65, + "learning_rate": 5.8630281090765386e-06, + "loss": 0.47, + "step": 3733 + }, + { + "epoch": 0.65, + "learning_rate": 5.857919522445475e-06, + "loss": 0.461, + "step": 3734 + }, + { + "epoch": 0.65, + "learning_rate": 5.8528122403396226e-06, + "loss": 0.4655, + "step": 3735 + }, + { + "epoch": 0.65, + "learning_rate": 5.847706264367503e-06, + "loss": 0.4718, + "step": 3736 + }, + { + "epoch": 0.65, + "learning_rate": 5.842601596137206e-06, + "loss": 0.4683, + "step": 3737 + }, + { + "epoch": 0.65, + "learning_rate": 5.8374982372564255e-06, + "loss": 0.4635, + "step": 3738 + }, + { + "epoch": 0.65, + "learning_rate": 5.832396189332423e-06, + "loss": 0.464, + "step": 3739 + }, + { + "epoch": 0.65, + "learning_rate": 5.8272954539720775e-06, + "loss": 0.4756, + "step": 3740 + }, + { + "epoch": 0.65, + "learning_rate": 5.822196032781824e-06, + "loss": 0.4625, + "step": 3741 + }, + { + "epoch": 0.65, + "learning_rate": 5.817097927367701e-06, + "loss": 0.4692, + "step": 3742 + }, + { + "epoch": 0.65, + "learning_rate": 5.812001139335329e-06, + "loss": 0.4654, + "step": 3743 + }, + { + "epoch": 0.65, + "learning_rate": 5.806905670289913e-06, + "loss": 0.4728, + "step": 3744 + }, + { + "epoch": 0.65, + "learning_rate": 5.801811521836246e-06, + "loss": 0.4626, + "step": 3745 + }, + { + "epoch": 0.65, + "learning_rate": 5.796718695578695e-06, + "loss": 0.4652, + "step": 3746 + }, + { + "epoch": 0.65, + "learning_rate": 5.7916271931212185e-06, + "loss": 0.4602, + "step": 3747 + }, + { + "epoch": 0.65, + "learning_rate": 5.786537016067362e-06, + "loss": 0.4601, + "step": 3748 + }, + { + "epoch": 0.65, + "learning_rate": 5.781448166020242e-06, + "loss": 0.465, + "step": 3749 + }, + { + "epoch": 0.65, + "learning_rate": 5.776360644582569e-06, + "loss": 0.4587, + "step": 3750 + }, + { + "epoch": 0.65, + "learning_rate": 5.771274453356628e-06, + "loss": 0.467, + "step": 3751 + }, + { + "epoch": 0.65, + "learning_rate": 5.766189593944289e-06, + "loss": 0.4571, + "step": 3752 + }, + { + "epoch": 0.65, + "learning_rate": 5.761106067946993e-06, + "loss": 0.4588, + "step": 3753 + }, + { + "epoch": 0.65, + "learning_rate": 5.756023876965773e-06, + "loss": 0.4725, + "step": 3754 + }, + { + "epoch": 0.65, + "learning_rate": 5.7509430226012365e-06, + "loss": 0.4601, + "step": 3755 + }, + { + "epoch": 0.65, + "learning_rate": 5.745863506453569e-06, + "loss": 0.4668, + "step": 3756 + }, + { + "epoch": 0.65, + "learning_rate": 5.740785330122542e-06, + "loss": 0.4774, + "step": 3757 + }, + { + "epoch": 0.65, + "learning_rate": 5.735708495207486e-06, + "loss": 0.4693, + "step": 3758 + }, + { + "epoch": 0.65, + "learning_rate": 5.730633003307338e-06, + "loss": 0.4639, + "step": 3759 + }, + { + "epoch": 0.65, + "learning_rate": 5.725558856020584e-06, + "loss": 0.4714, + "step": 3760 + }, + { + "epoch": 0.65, + "learning_rate": 5.7204860549453025e-06, + "loss": 0.4505, + "step": 3761 + }, + { + "epoch": 0.65, + "learning_rate": 5.715414601679144e-06, + "loss": 0.4769, + "step": 3762 + }, + { + "epoch": 0.65, + "learning_rate": 5.710344497819333e-06, + "loss": 0.4562, + "step": 3763 + }, + { + "epoch": 0.65, + "learning_rate": 5.705275744962676e-06, + "loss": 0.4739, + "step": 3764 + }, + { + "epoch": 0.65, + "learning_rate": 5.700208344705537e-06, + "loss": 0.4644, + "step": 3765 + }, + { + "epoch": 0.65, + "learning_rate": 5.695142298643881e-06, + "loss": 0.4559, + "step": 3766 + }, + { + "epoch": 0.65, + "learning_rate": 5.690077608373219e-06, + "loss": 0.4567, + "step": 3767 + }, + { + "epoch": 0.65, + "learning_rate": 5.685014275488649e-06, + "loss": 0.4767, + "step": 3768 + }, + { + "epoch": 0.65, + "learning_rate": 5.679952301584844e-06, + "loss": 0.4589, + "step": 3769 + }, + { + "epoch": 0.65, + "learning_rate": 5.674891688256041e-06, + "loss": 0.4607, + "step": 3770 + }, + { + "epoch": 0.65, + "learning_rate": 5.669832437096058e-06, + "loss": 0.4569, + "step": 3771 + }, + { + "epoch": 0.65, + "learning_rate": 5.664774549698269e-06, + "loss": 0.4761, + "step": 3772 + }, + { + "epoch": 0.65, + "learning_rate": 5.659718027655631e-06, + "loss": 0.4579, + "step": 3773 + }, + { + "epoch": 0.65, + "learning_rate": 5.6546628725606675e-06, + "loss": 0.4528, + "step": 3774 + }, + { + "epoch": 0.65, + "learning_rate": 5.649609086005476e-06, + "loss": 0.458, + "step": 3775 + }, + { + "epoch": 0.65, + "learning_rate": 5.644556669581709e-06, + "loss": 0.4681, + "step": 3776 + }, + { + "epoch": 0.65, + "learning_rate": 5.639505624880604e-06, + "loss": 0.4688, + "step": 3777 + }, + { + "epoch": 0.65, + "learning_rate": 5.634455953492964e-06, + "loss": 0.4686, + "step": 3778 + }, + { + "epoch": 0.65, + "learning_rate": 5.629407657009143e-06, + "loss": 0.4591, + "step": 3779 + }, + { + "epoch": 0.65, + "learning_rate": 5.624360737019081e-06, + "loss": 0.4553, + "step": 3780 + }, + { + "epoch": 0.66, + "learning_rate": 5.619315195112276e-06, + "loss": 0.4691, + "step": 3781 + }, + { + "epoch": 0.66, + "learning_rate": 5.614271032877799e-06, + "loss": 0.4607, + "step": 3782 + }, + { + "epoch": 0.66, + "learning_rate": 5.609228251904265e-06, + "loss": 0.4665, + "step": 3783 + }, + { + "epoch": 0.66, + "learning_rate": 5.6041868537798845e-06, + "loss": 0.4743, + "step": 3784 + }, + { + "epoch": 0.66, + "learning_rate": 5.59914684009242e-06, + "loss": 0.4555, + "step": 3785 + }, + { + "epoch": 0.66, + "learning_rate": 5.594108212429183e-06, + "loss": 0.4663, + "step": 3786 + }, + { + "epoch": 0.66, + "learning_rate": 5.589070972377068e-06, + "loss": 0.458, + "step": 3787 + }, + { + "epoch": 0.66, + "learning_rate": 5.584035121522526e-06, + "loss": 0.4612, + "step": 3788 + }, + { + "epoch": 0.66, + "learning_rate": 5.579000661451574e-06, + "loss": 0.4473, + "step": 3789 + }, + { + "epoch": 0.66, + "learning_rate": 5.573967593749778e-06, + "loss": 0.456, + "step": 3790 + }, + { + "epoch": 0.66, + "learning_rate": 5.568935920002276e-06, + "loss": 0.4696, + "step": 3791 + }, + { + "epoch": 0.66, + "learning_rate": 5.563905641793776e-06, + "loss": 0.4693, + "step": 3792 + }, + { + "epoch": 0.66, + "learning_rate": 5.558876760708527e-06, + "loss": 0.4759, + "step": 3793 + }, + { + "epoch": 0.66, + "learning_rate": 5.553849278330349e-06, + "loss": 0.4616, + "step": 3794 + }, + { + "epoch": 0.66, + "learning_rate": 5.54882319624262e-06, + "loss": 0.4664, + "step": 3795 + }, + { + "epoch": 0.66, + "learning_rate": 5.54379851602828e-06, + "loss": 0.4681, + "step": 3796 + }, + { + "epoch": 0.66, + "learning_rate": 5.538775239269818e-06, + "loss": 0.4507, + "step": 3797 + }, + { + "epoch": 0.66, + "learning_rate": 5.533753367549285e-06, + "loss": 0.4625, + "step": 3798 + }, + { + "epoch": 0.66, + "learning_rate": 5.528732902448305e-06, + "loss": 0.4612, + "step": 3799 + }, + { + "epoch": 0.66, + "learning_rate": 5.523713845548033e-06, + "loss": 0.4672, + "step": 3800 + }, + { + "epoch": 0.66, + "learning_rate": 5.518696198429201e-06, + "loss": 0.4616, + "step": 3801 + }, + { + "epoch": 0.66, + "learning_rate": 5.513679962672076e-06, + "loss": 0.4722, + "step": 3802 + }, + { + "epoch": 0.66, + "learning_rate": 5.508665139856513e-06, + "loss": 0.4661, + "step": 3803 + }, + { + "epoch": 0.66, + "learning_rate": 5.503651731561887e-06, + "loss": 0.4585, + "step": 3804 + }, + { + "epoch": 0.66, + "learning_rate": 5.498639739367148e-06, + "loss": 0.4713, + "step": 3805 + }, + { + "epoch": 0.66, + "learning_rate": 5.493629164850795e-06, + "loss": 0.4669, + "step": 3806 + }, + { + "epoch": 0.66, + "learning_rate": 5.488620009590881e-06, + "loss": 0.4646, + "step": 3807 + }, + { + "epoch": 0.66, + "learning_rate": 5.483612275165018e-06, + "loss": 0.464, + "step": 3808 + }, + { + "epoch": 0.66, + "learning_rate": 5.478605963150348e-06, + "loss": 0.442, + "step": 3809 + }, + { + "epoch": 0.66, + "learning_rate": 5.473601075123599e-06, + "loss": 0.4687, + "step": 3810 + }, + { + "epoch": 0.66, + "learning_rate": 5.468597612661021e-06, + "loss": 0.4591, + "step": 3811 + }, + { + "epoch": 0.66, + "learning_rate": 5.4635955773384295e-06, + "loss": 0.4737, + "step": 3812 + }, + { + "epoch": 0.66, + "learning_rate": 5.458594970731188e-06, + "loss": 0.4548, + "step": 3813 + }, + { + "epoch": 0.66, + "learning_rate": 5.453595794414211e-06, + "loss": 0.4619, + "step": 3814 + }, + { + "epoch": 0.66, + "learning_rate": 5.448598049961964e-06, + "loss": 0.4589, + "step": 3815 + }, + { + "epoch": 0.66, + "learning_rate": 5.443601738948452e-06, + "loss": 0.462, + "step": 3816 + }, + { + "epoch": 0.66, + "learning_rate": 5.438606862947237e-06, + "loss": 0.4483, + "step": 3817 + }, + { + "epoch": 0.66, + "learning_rate": 5.433613423531432e-06, + "loss": 0.4479, + "step": 3818 + }, + { + "epoch": 0.66, + "learning_rate": 5.428621422273687e-06, + "loss": 0.4714, + "step": 3819 + }, + { + "epoch": 0.66, + "learning_rate": 5.4236308607462095e-06, + "loss": 0.4632, + "step": 3820 + }, + { + "epoch": 0.66, + "learning_rate": 5.418641740520748e-06, + "loss": 0.4571, + "step": 3821 + }, + { + "epoch": 0.66, + "learning_rate": 5.413654063168602e-06, + "loss": 0.4652, + "step": 3822 + }, + { + "epoch": 0.66, + "learning_rate": 5.408667830260603e-06, + "loss": 0.4642, + "step": 3823 + }, + { + "epoch": 0.66, + "learning_rate": 5.403683043367145e-06, + "loss": 0.4725, + "step": 3824 + }, + { + "epoch": 0.66, + "learning_rate": 5.398699704058156e-06, + "loss": 0.4654, + "step": 3825 + }, + { + "epoch": 0.66, + "learning_rate": 5.393717813903112e-06, + "loss": 0.4514, + "step": 3826 + }, + { + "epoch": 0.66, + "learning_rate": 5.388737374471032e-06, + "loss": 0.4707, + "step": 3827 + }, + { + "epoch": 0.66, + "learning_rate": 5.383758387330476e-06, + "loss": 0.4746, + "step": 3828 + }, + { + "epoch": 0.66, + "learning_rate": 5.378780854049553e-06, + "loss": 0.4654, + "step": 3829 + }, + { + "epoch": 0.66, + "learning_rate": 5.373804776195903e-06, + "loss": 0.4837, + "step": 3830 + }, + { + "epoch": 0.66, + "learning_rate": 5.368830155336717e-06, + "loss": 0.4546, + "step": 3831 + }, + { + "epoch": 0.66, + "learning_rate": 5.363856993038725e-06, + "loss": 0.4698, + "step": 3832 + }, + { + "epoch": 0.66, + "learning_rate": 5.358885290868195e-06, + "loss": 0.4645, + "step": 3833 + }, + { + "epoch": 0.66, + "learning_rate": 5.353915050390941e-06, + "loss": 0.4748, + "step": 3834 + }, + { + "epoch": 0.66, + "learning_rate": 5.3489462731723045e-06, + "loss": 0.4708, + "step": 3835 + }, + { + "epoch": 0.66, + "learning_rate": 5.343978960777184e-06, + "loss": 0.4718, + "step": 3836 + }, + { + "epoch": 0.66, + "learning_rate": 5.3390131147699995e-06, + "loss": 0.4653, + "step": 3837 + }, + { + "epoch": 0.66, + "learning_rate": 5.3340487367147195e-06, + "loss": 0.4753, + "step": 3838 + }, + { + "epoch": 0.67, + "learning_rate": 5.329085828174847e-06, + "loss": 0.4544, + "step": 3839 + }, + { + "epoch": 0.67, + "learning_rate": 5.324124390713423e-06, + "loss": 0.4679, + "step": 3840 + }, + { + "epoch": 0.67, + "learning_rate": 5.3191644258930275e-06, + "loss": 0.4557, + "step": 3841 + }, + { + "epoch": 0.67, + "learning_rate": 5.3142059352757625e-06, + "loss": 0.4634, + "step": 3842 + }, + { + "epoch": 0.67, + "learning_rate": 5.309248920423293e-06, + "loss": 0.4606, + "step": 3843 + }, + { + "epoch": 0.67, + "learning_rate": 5.304293382896792e-06, + "loss": 0.4601, + "step": 3844 + }, + { + "epoch": 0.67, + "learning_rate": 5.299339324256986e-06, + "loss": 0.4662, + "step": 3845 + }, + { + "epoch": 0.67, + "learning_rate": 5.294386746064115e-06, + "loss": 0.4661, + "step": 3846 + }, + { + "epoch": 0.67, + "learning_rate": 5.28943564987798e-06, + "loss": 0.4621, + "step": 3847 + }, + { + "epoch": 0.67, + "learning_rate": 5.2844860372578995e-06, + "loss": 0.457, + "step": 3848 + }, + { + "epoch": 0.67, + "learning_rate": 5.2795379097627195e-06, + "loss": 0.4558, + "step": 3849 + }, + { + "epoch": 0.67, + "learning_rate": 5.274591268950828e-06, + "loss": 0.4629, + "step": 3850 + }, + { + "epoch": 0.67, + "learning_rate": 5.2696461163801445e-06, + "loss": 0.4588, + "step": 3851 + }, + { + "epoch": 0.67, + "learning_rate": 5.264702453608119e-06, + "loss": 0.4654, + "step": 3852 + }, + { + "epoch": 0.67, + "learning_rate": 5.2597602821917206e-06, + "loss": 0.4633, + "step": 3853 + }, + { + "epoch": 0.67, + "learning_rate": 5.254819603687469e-06, + "loss": 0.4682, + "step": 3854 + }, + { + "epoch": 0.67, + "learning_rate": 5.249880419651403e-06, + "loss": 0.4468, + "step": 3855 + }, + { + "epoch": 0.67, + "learning_rate": 5.244942731639084e-06, + "loss": 0.4686, + "step": 3856 + }, + { + "epoch": 0.67, + "learning_rate": 5.2400065412056136e-06, + "loss": 0.4523, + "step": 3857 + }, + { + "epoch": 0.67, + "learning_rate": 5.235071849905617e-06, + "loss": 0.4693, + "step": 3858 + }, + { + "epoch": 0.67, + "learning_rate": 5.230138659293254e-06, + "loss": 0.467, + "step": 3859 + }, + { + "epoch": 0.67, + "learning_rate": 5.2252069709221945e-06, + "loss": 0.4571, + "step": 3860 + }, + { + "epoch": 0.67, + "learning_rate": 5.220276786345648e-06, + "loss": 0.4575, + "step": 3861 + }, + { + "epoch": 0.67, + "learning_rate": 5.21534810711636e-06, + "loss": 0.4704, + "step": 3862 + }, + { + "epoch": 0.67, + "learning_rate": 5.2104209347865786e-06, + "loss": 0.4462, + "step": 3863 + }, + { + "epoch": 0.67, + "learning_rate": 5.205495270908094e-06, + "loss": 0.4624, + "step": 3864 + }, + { + "epoch": 0.67, + "learning_rate": 5.200571117032216e-06, + "loss": 0.4513, + "step": 3865 + }, + { + "epoch": 0.67, + "learning_rate": 5.195648474709783e-06, + "loss": 0.4586, + "step": 3866 + }, + { + "epoch": 0.67, + "learning_rate": 5.190727345491149e-06, + "loss": 0.4619, + "step": 3867 + }, + { + "epoch": 0.67, + "learning_rate": 5.185807730926191e-06, + "loss": 0.4741, + "step": 3868 + }, + { + "epoch": 0.67, + "learning_rate": 5.180889632564331e-06, + "loss": 0.4711, + "step": 3869 + }, + { + "epoch": 0.67, + "learning_rate": 5.175973051954482e-06, + "loss": 0.4641, + "step": 3870 + }, + { + "epoch": 0.67, + "learning_rate": 5.171057990645098e-06, + "loss": 0.4523, + "step": 3871 + }, + { + "epoch": 0.67, + "learning_rate": 5.166144450184154e-06, + "loss": 0.4696, + "step": 3872 + }, + { + "epoch": 0.67, + "learning_rate": 5.16123243211914e-06, + "loss": 0.4567, + "step": 3873 + }, + { + "epoch": 0.67, + "learning_rate": 5.156321937997064e-06, + "loss": 0.4642, + "step": 3874 + }, + { + "epoch": 0.67, + "learning_rate": 5.151412969364464e-06, + "loss": 0.4562, + "step": 3875 + }, + { + "epoch": 0.67, + "learning_rate": 5.1465055277673915e-06, + "loss": 0.4813, + "step": 3876 + }, + { + "epoch": 0.67, + "learning_rate": 5.141599614751416e-06, + "loss": 0.4642, + "step": 3877 + }, + { + "epoch": 0.67, + "learning_rate": 5.136695231861633e-06, + "loss": 0.463, + "step": 3878 + }, + { + "epoch": 0.67, + "learning_rate": 5.131792380642639e-06, + "loss": 0.4429, + "step": 3879 + }, + { + "epoch": 0.67, + "learning_rate": 5.126891062638575e-06, + "loss": 0.4724, + "step": 3880 + }, + { + "epoch": 0.67, + "learning_rate": 5.121991279393073e-06, + "loss": 0.4663, + "step": 3881 + }, + { + "epoch": 0.67, + "learning_rate": 5.117093032449297e-06, + "loss": 0.4603, + "step": 3882 + }, + { + "epoch": 0.67, + "learning_rate": 5.112196323349918e-06, + "loss": 0.4549, + "step": 3883 + }, + { + "epoch": 0.67, + "learning_rate": 5.107301153637133e-06, + "loss": 0.4709, + "step": 3884 + }, + { + "epoch": 0.67, + "learning_rate": 5.10240752485265e-06, + "loss": 0.4503, + "step": 3885 + }, + { + "epoch": 0.67, + "learning_rate": 5.097515438537678e-06, + "loss": 0.462, + "step": 3886 + }, + { + "epoch": 0.67, + "learning_rate": 5.092624896232969e-06, + "loss": 0.4536, + "step": 3887 + }, + { + "epoch": 0.67, + "learning_rate": 5.087735899478759e-06, + "loss": 0.4736, + "step": 3888 + }, + { + "epoch": 0.67, + "learning_rate": 5.082848449814816e-06, + "loss": 0.4562, + "step": 3889 + }, + { + "epoch": 0.67, + "learning_rate": 5.0779625487804125e-06, + "loss": 0.4619, + "step": 3890 + }, + { + "epoch": 0.67, + "learning_rate": 5.073078197914341e-06, + "loss": 0.467, + "step": 3891 + }, + { + "epoch": 0.67, + "learning_rate": 5.068195398754898e-06, + "loss": 0.4736, + "step": 3892 + }, + { + "epoch": 0.67, + "learning_rate": 5.063314152839891e-06, + "loss": 0.4517, + "step": 3893 + }, + { + "epoch": 0.67, + "learning_rate": 5.058434461706642e-06, + "loss": 0.4644, + "step": 3894 + }, + { + "epoch": 0.67, + "learning_rate": 5.053556326891986e-06, + "loss": 0.4685, + "step": 3895 + }, + { + "epoch": 0.67, + "learning_rate": 5.048679749932261e-06, + "loss": 0.4656, + "step": 3896 + }, + { + "epoch": 0.68, + "learning_rate": 5.043804732363321e-06, + "loss": 0.4573, + "step": 3897 + }, + { + "epoch": 0.68, + "learning_rate": 5.038931275720522e-06, + "loss": 0.4655, + "step": 3898 + }, + { + "epoch": 0.68, + "learning_rate": 5.03405938153874e-06, + "loss": 0.4577, + "step": 3899 + }, + { + "epoch": 0.68, + "learning_rate": 5.029189051352339e-06, + "loss": 0.4766, + "step": 3900 + }, + { + "epoch": 0.68, + "learning_rate": 5.02432028669521e-06, + "loss": 0.4757, + "step": 3901 + }, + { + "epoch": 0.68, + "learning_rate": 5.0194530891007405e-06, + "loss": 0.4582, + "step": 3902 + }, + { + "epoch": 0.68, + "learning_rate": 5.01458746010183e-06, + "loss": 0.4724, + "step": 3903 + }, + { + "epoch": 0.68, + "learning_rate": 5.0097234012308836e-06, + "loss": 0.4504, + "step": 3904 + }, + { + "epoch": 0.68, + "learning_rate": 5.004860914019798e-06, + "loss": 0.4703, + "step": 3905 + }, + { + "epoch": 0.68, + "learning_rate": 5.000000000000003e-06, + "loss": 0.4569, + "step": 3906 + }, + { + "epoch": 0.68, + "learning_rate": 4.9951406607024024e-06, + "loss": 0.4723, + "step": 3907 + }, + { + "epoch": 0.68, + "learning_rate": 4.990282897657425e-06, + "loss": 0.4625, + "step": 3908 + }, + { + "epoch": 0.68, + "learning_rate": 4.985426712394994e-06, + "loss": 0.4656, + "step": 3909 + }, + { + "epoch": 0.68, + "learning_rate": 4.980572106444539e-06, + "loss": 0.4499, + "step": 3910 + }, + { + "epoch": 0.68, + "learning_rate": 4.9757190813349945e-06, + "loss": 0.462, + "step": 3911 + }, + { + "epoch": 0.68, + "learning_rate": 4.970867638594783e-06, + "loss": 0.452, + "step": 3912 + }, + { + "epoch": 0.68, + "learning_rate": 4.966017779751854e-06, + "loss": 0.4632, + "step": 3913 + }, + { + "epoch": 0.68, + "learning_rate": 4.961169506333632e-06, + "loss": 0.4513, + "step": 3914 + }, + { + "epoch": 0.68, + "learning_rate": 4.956322819867059e-06, + "loss": 0.4621, + "step": 3915 + }, + { + "epoch": 0.68, + "learning_rate": 4.9514777218785704e-06, + "loss": 0.4767, + "step": 3916 + }, + { + "epoch": 0.68, + "learning_rate": 4.946634213894104e-06, + "loss": 0.4647, + "step": 3917 + }, + { + "epoch": 0.68, + "learning_rate": 4.941792297439098e-06, + "loss": 0.4511, + "step": 3918 + }, + { + "epoch": 0.68, + "learning_rate": 4.936951974038481e-06, + "loss": 0.4713, + "step": 3919 + }, + { + "epoch": 0.68, + "learning_rate": 4.932113245216689e-06, + "loss": 0.4471, + "step": 3920 + }, + { + "epoch": 0.68, + "learning_rate": 4.927276112497652e-06, + "loss": 0.4602, + "step": 3921 + }, + { + "epoch": 0.68, + "learning_rate": 4.922440577404804e-06, + "loss": 0.4603, + "step": 3922 + }, + { + "epoch": 0.68, + "learning_rate": 4.917606641461056e-06, + "loss": 0.4642, + "step": 3923 + }, + { + "epoch": 0.68, + "learning_rate": 4.912774306188842e-06, + "loss": 0.4735, + "step": 3924 + }, + { + "epoch": 0.68, + "learning_rate": 4.90794357311008e-06, + "loss": 0.4675, + "step": 3925 + }, + { + "epoch": 0.68, + "learning_rate": 4.903114443746173e-06, + "loss": 0.4655, + "step": 3926 + }, + { + "epoch": 0.68, + "learning_rate": 4.898286919618034e-06, + "loss": 0.4552, + "step": 3927 + }, + { + "epoch": 0.68, + "learning_rate": 4.8934610022460635e-06, + "loss": 0.4642, + "step": 3928 + }, + { + "epoch": 0.68, + "learning_rate": 4.888636693150161e-06, + "loss": 0.4609, + "step": 3929 + }, + { + "epoch": 0.68, + "learning_rate": 4.883813993849706e-06, + "loss": 0.4498, + "step": 3930 + }, + { + "epoch": 0.68, + "learning_rate": 4.878992905863591e-06, + "loss": 0.4568, + "step": 3931 + }, + { + "epoch": 0.68, + "learning_rate": 4.874173430710192e-06, + "loss": 0.4625, + "step": 3932 + }, + { + "epoch": 0.68, + "learning_rate": 4.869355569907367e-06, + "loss": 0.4587, + "step": 3933 + }, + { + "epoch": 0.68, + "learning_rate": 4.864539324972478e-06, + "loss": 0.47, + "step": 3934 + }, + { + "epoch": 0.68, + "learning_rate": 4.859724697422377e-06, + "loss": 0.4627, + "step": 3935 + }, + { + "epoch": 0.68, + "learning_rate": 4.8549116887734045e-06, + "loss": 0.4528, + "step": 3936 + }, + { + "epoch": 0.68, + "learning_rate": 4.850100300541386e-06, + "loss": 0.4505, + "step": 3937 + }, + { + "epoch": 0.68, + "learning_rate": 4.8452905342416405e-06, + "loss": 0.4524, + "step": 3938 + }, + { + "epoch": 0.68, + "learning_rate": 4.840482391388988e-06, + "loss": 0.4619, + "step": 3939 + }, + { + "epoch": 0.68, + "learning_rate": 4.835675873497716e-06, + "loss": 0.4656, + "step": 3940 + }, + { + "epoch": 0.68, + "learning_rate": 4.830870982081614e-06, + "loss": 0.4612, + "step": 3941 + }, + { + "epoch": 0.68, + "learning_rate": 4.8260677186539554e-06, + "loss": 0.4606, + "step": 3942 + }, + { + "epoch": 0.68, + "learning_rate": 4.821266084727505e-06, + "loss": 0.4662, + "step": 3943 + }, + { + "epoch": 0.68, + "learning_rate": 4.816466081814504e-06, + "loss": 0.4507, + "step": 3944 + }, + { + "epoch": 0.68, + "learning_rate": 4.811667711426686e-06, + "loss": 0.4686, + "step": 3945 + }, + { + "epoch": 0.68, + "learning_rate": 4.8068709750752825e-06, + "loss": 0.4615, + "step": 3946 + }, + { + "epoch": 0.68, + "learning_rate": 4.802075874270988e-06, + "loss": 0.4663, + "step": 3947 + }, + { + "epoch": 0.68, + "learning_rate": 4.797282410523997e-06, + "loss": 0.4541, + "step": 3948 + }, + { + "epoch": 0.68, + "learning_rate": 4.792490585343983e-06, + "loss": 0.4682, + "step": 3949 + }, + { + "epoch": 0.68, + "learning_rate": 4.787700400240108e-06, + "loss": 0.4638, + "step": 3950 + }, + { + "epoch": 0.68, + "learning_rate": 4.78291185672101e-06, + "loss": 0.4696, + "step": 3951 + }, + { + "epoch": 0.68, + "learning_rate": 4.7781249562948136e-06, + "loss": 0.4554, + "step": 3952 + }, + { + "epoch": 0.68, + "learning_rate": 4.773339700469129e-06, + "loss": 0.4688, + "step": 3953 + }, + { + "epoch": 0.68, + "learning_rate": 4.7685560907510465e-06, + "loss": 0.4552, + "step": 3954 + }, + { + "epoch": 0.69, + "learning_rate": 4.7637741286471385e-06, + "loss": 0.4667, + "step": 3955 + }, + { + "epoch": 0.69, + "learning_rate": 4.7589938156634485e-06, + "loss": 0.4608, + "step": 3956 + }, + { + "epoch": 0.69, + "learning_rate": 4.7542151533055235e-06, + "loss": 0.4648, + "step": 3957 + }, + { + "epoch": 0.69, + "learning_rate": 4.7494381430783656e-06, + "loss": 0.4578, + "step": 3958 + }, + { + "epoch": 0.69, + "learning_rate": 4.744662786486471e-06, + "loss": 0.4605, + "step": 3959 + }, + { + "epoch": 0.69, + "learning_rate": 4.739889085033812e-06, + "loss": 0.458, + "step": 3960 + }, + { + "epoch": 0.69, + "learning_rate": 4.73511704022384e-06, + "loss": 0.4681, + "step": 3961 + }, + { + "epoch": 0.69, + "learning_rate": 4.730346653559486e-06, + "loss": 0.4721, + "step": 3962 + }, + { + "epoch": 0.69, + "learning_rate": 4.725577926543151e-06, + "loss": 0.4547, + "step": 3963 + }, + { + "epoch": 0.69, + "learning_rate": 4.720810860676722e-06, + "loss": 0.4637, + "step": 3964 + }, + { + "epoch": 0.69, + "learning_rate": 4.7160454574615596e-06, + "loss": 0.472, + "step": 3965 + }, + { + "epoch": 0.69, + "learning_rate": 4.711281718398503e-06, + "loss": 0.4559, + "step": 3966 + }, + { + "epoch": 0.69, + "learning_rate": 4.706519644987863e-06, + "loss": 0.4531, + "step": 3967 + }, + { + "epoch": 0.69, + "learning_rate": 4.701759238729428e-06, + "loss": 0.4634, + "step": 3968 + }, + { + "epoch": 0.69, + "learning_rate": 4.697000501122466e-06, + "loss": 0.4764, + "step": 3969 + }, + { + "epoch": 0.69, + "learning_rate": 4.6922434336657095e-06, + "loss": 0.4595, + "step": 3970 + }, + { + "epoch": 0.69, + "learning_rate": 4.68748803785737e-06, + "loss": 0.4585, + "step": 3971 + }, + { + "epoch": 0.69, + "learning_rate": 4.682734315195138e-06, + "loss": 0.4674, + "step": 3972 + }, + { + "epoch": 0.69, + "learning_rate": 4.677982267176168e-06, + "loss": 0.4676, + "step": 3973 + }, + { + "epoch": 0.69, + "learning_rate": 4.673231895297092e-06, + "loss": 0.4559, + "step": 3974 + }, + { + "epoch": 0.69, + "learning_rate": 4.668483201054013e-06, + "loss": 0.476, + "step": 3975 + }, + { + "epoch": 0.69, + "learning_rate": 4.663736185942512e-06, + "loss": 0.451, + "step": 3976 + }, + { + "epoch": 0.69, + "learning_rate": 4.658990851457625e-06, + "loss": 0.4596, + "step": 3977 + }, + { + "epoch": 0.69, + "learning_rate": 4.654247199093873e-06, + "loss": 0.4557, + "step": 3978 + }, + { + "epoch": 0.69, + "learning_rate": 4.649505230345244e-06, + "loss": 0.4646, + "step": 3979 + }, + { + "epoch": 0.69, + "learning_rate": 4.644764946705193e-06, + "loss": 0.4544, + "step": 3980 + }, + { + "epoch": 0.69, + "learning_rate": 4.640026349666651e-06, + "loss": 0.4833, + "step": 3981 + }, + { + "epoch": 0.69, + "learning_rate": 4.635289440722001e-06, + "loss": 0.4612, + "step": 3982 + }, + { + "epoch": 0.69, + "learning_rate": 4.6305542213631205e-06, + "loss": 0.4913, + "step": 3983 + }, + { + "epoch": 0.69, + "learning_rate": 4.625820693081331e-06, + "loss": 0.4662, + "step": 3984 + }, + { + "epoch": 0.69, + "learning_rate": 4.621088857367433e-06, + "loss": 0.4635, + "step": 3985 + }, + { + "epoch": 0.69, + "learning_rate": 4.616358715711693e-06, + "loss": 0.4612, + "step": 3986 + }, + { + "epoch": 0.69, + "learning_rate": 4.611630269603842e-06, + "loss": 0.4644, + "step": 3987 + }, + { + "epoch": 0.69, + "learning_rate": 4.606903520533082e-06, + "loss": 0.4544, + "step": 3988 + }, + { + "epoch": 0.69, + "learning_rate": 4.602178469988064e-06, + "loss": 0.4627, + "step": 3989 + }, + { + "epoch": 0.69, + "learning_rate": 4.5974551194569336e-06, + "loss": 0.4668, + "step": 3990 + }, + { + "epoch": 0.69, + "learning_rate": 4.592733470427272e-06, + "loss": 0.4575, + "step": 3991 + }, + { + "epoch": 0.69, + "learning_rate": 4.588013524386138e-06, + "loss": 0.4608, + "step": 3992 + }, + { + "epoch": 0.69, + "learning_rate": 4.5832952828200535e-06, + "loss": 0.4691, + "step": 3993 + }, + { + "epoch": 0.69, + "learning_rate": 4.578578747215003e-06, + "loss": 0.4686, + "step": 3994 + }, + { + "epoch": 0.69, + "learning_rate": 4.573863919056438e-06, + "loss": 0.4526, + "step": 3995 + }, + { + "epoch": 0.69, + "learning_rate": 4.569150799829257e-06, + "loss": 0.4591, + "step": 3996 + }, + { + "epoch": 0.69, + "learning_rate": 4.564439391017836e-06, + "loss": 0.4828, + "step": 3997 + }, + { + "epoch": 0.69, + "learning_rate": 4.559729694106008e-06, + "loss": 0.4569, + "step": 3998 + }, + { + "epoch": 0.69, + "learning_rate": 4.555021710577068e-06, + "loss": 0.475, + "step": 3999 + }, + { + "epoch": 0.69, + "learning_rate": 4.550315441913759e-06, + "loss": 0.4542, + "step": 4000 + }, + { + "epoch": 0.69, + "learning_rate": 4.545610889598304e-06, + "loss": 0.4579, + "step": 4001 + }, + { + "epoch": 0.69, + "learning_rate": 4.540908055112378e-06, + "loss": 0.4683, + "step": 4002 + }, + { + "epoch": 0.69, + "learning_rate": 4.536206939937101e-06, + "loss": 0.4582, + "step": 4003 + }, + { + "epoch": 0.69, + "learning_rate": 4.531507545553072e-06, + "loss": 0.4596, + "step": 4004 + }, + { + "epoch": 0.69, + "learning_rate": 4.526809873440335e-06, + "loss": 0.4677, + "step": 4005 + }, + { + "epoch": 0.69, + "learning_rate": 4.522113925078402e-06, + "loss": 0.4544, + "step": 4006 + }, + { + "epoch": 0.69, + "learning_rate": 4.517419701946224e-06, + "loss": 0.4599, + "step": 4007 + }, + { + "epoch": 0.69, + "learning_rate": 4.51272720552223e-06, + "loss": 0.4528, + "step": 4008 + }, + { + "epoch": 0.69, + "learning_rate": 4.508036437284298e-06, + "loss": 0.4655, + "step": 4009 + }, + { + "epoch": 0.69, + "learning_rate": 4.503347398709751e-06, + "loss": 0.4552, + "step": 4010 + }, + { + "epoch": 0.69, + "learning_rate": 4.498660091275379e-06, + "loss": 0.4575, + "step": 4011 + }, + { + "epoch": 0.7, + "learning_rate": 4.493974516457423e-06, + "loss": 0.4682, + "step": 4012 + }, + { + "epoch": 0.7, + "learning_rate": 4.489290675731584e-06, + "loss": 0.465, + "step": 4013 + }, + { + "epoch": 0.7, + "learning_rate": 4.484608570573002e-06, + "loss": 0.4485, + "step": 4014 + }, + { + "epoch": 0.7, + "learning_rate": 4.479928202456283e-06, + "loss": 0.4514, + "step": 4015 + }, + { + "epoch": 0.7, + "learning_rate": 4.475249572855492e-06, + "loss": 0.4706, + "step": 4016 + }, + { + "epoch": 0.7, + "learning_rate": 4.470572683244127e-06, + "loss": 0.465, + "step": 4017 + }, + { + "epoch": 0.7, + "learning_rate": 4.4658975350951505e-06, + "loss": 0.4532, + "step": 4018 + }, + { + "epoch": 0.7, + "learning_rate": 4.461224129880976e-06, + "loss": 0.4539, + "step": 4019 + }, + { + "epoch": 0.7, + "learning_rate": 4.45655246907347e-06, + "loss": 0.4543, + "step": 4020 + }, + { + "epoch": 0.7, + "learning_rate": 4.451882554143938e-06, + "loss": 0.4633, + "step": 4021 + }, + { + "epoch": 0.7, + "learning_rate": 4.447214386563145e-06, + "loss": 0.4477, + "step": 4022 + }, + { + "epoch": 0.7, + "learning_rate": 4.442547967801314e-06, + "loss": 0.4694, + "step": 4023 + }, + { + "epoch": 0.7, + "learning_rate": 4.437883299328097e-06, + "loss": 0.4691, + "step": 4024 + }, + { + "epoch": 0.7, + "learning_rate": 4.433220382612614e-06, + "loss": 0.4572, + "step": 4025 + }, + { + "epoch": 0.7, + "learning_rate": 4.4285592191234125e-06, + "loss": 0.4435, + "step": 4026 + }, + { + "epoch": 0.7, + "learning_rate": 4.423899810328512e-06, + "loss": 0.4609, + "step": 4027 + }, + { + "epoch": 0.7, + "learning_rate": 4.419242157695364e-06, + "loss": 0.4541, + "step": 4028 + }, + { + "epoch": 0.7, + "learning_rate": 4.4145862626908684e-06, + "loss": 0.4774, + "step": 4029 + }, + { + "epoch": 0.7, + "learning_rate": 4.409932126781373e-06, + "loss": 0.4526, + "step": 4030 + }, + { + "epoch": 0.7, + "learning_rate": 4.405279751432674e-06, + "loss": 0.4674, + "step": 4031 + }, + { + "epoch": 0.7, + "learning_rate": 4.400629138110014e-06, + "loss": 0.4585, + "step": 4032 + }, + { + "epoch": 0.7, + "learning_rate": 4.395980288278067e-06, + "loss": 0.4693, + "step": 4033 + }, + { + "epoch": 0.7, + "learning_rate": 4.391333203400974e-06, + "loss": 0.4503, + "step": 4034 + }, + { + "epoch": 0.7, + "learning_rate": 4.386687884942307e-06, + "loss": 0.4735, + "step": 4035 + }, + { + "epoch": 0.7, + "learning_rate": 4.382044334365078e-06, + "loss": 0.4385, + "step": 4036 + }, + { + "epoch": 0.7, + "learning_rate": 4.3774025531317476e-06, + "loss": 0.4545, + "step": 4037 + }, + { + "epoch": 0.7, + "learning_rate": 4.372762542704223e-06, + "loss": 0.4537, + "step": 4038 + }, + { + "epoch": 0.7, + "learning_rate": 4.368124304543852e-06, + "loss": 0.4692, + "step": 4039 + }, + { + "epoch": 0.7, + "learning_rate": 4.363487840111413e-06, + "loss": 0.4754, + "step": 4040 + }, + { + "epoch": 0.7, + "learning_rate": 4.358853150867137e-06, + "loss": 0.4589, + "step": 4041 + }, + { + "epoch": 0.7, + "learning_rate": 4.354220238270705e-06, + "loss": 0.4623, + "step": 4042 + }, + { + "epoch": 0.7, + "learning_rate": 4.349589103781212e-06, + "loss": 0.4689, + "step": 4043 + }, + { + "epoch": 0.7, + "learning_rate": 4.344959748857215e-06, + "loss": 0.4627, + "step": 4044 + }, + { + "epoch": 0.7, + "learning_rate": 4.340332174956703e-06, + "loss": 0.4543, + "step": 4045 + }, + { + "epoch": 0.7, + "learning_rate": 4.335706383537109e-06, + "loss": 0.4541, + "step": 4046 + }, + { + "epoch": 0.7, + "learning_rate": 4.331082376055292e-06, + "loss": 0.4708, + "step": 4047 + }, + { + "epoch": 0.7, + "learning_rate": 4.326460153967558e-06, + "loss": 0.4497, + "step": 4048 + }, + { + "epoch": 0.7, + "learning_rate": 4.32183971872966e-06, + "loss": 0.477, + "step": 4049 + }, + { + "epoch": 0.7, + "learning_rate": 4.317221071796768e-06, + "loss": 0.4651, + "step": 4050 + }, + { + "epoch": 0.7, + "learning_rate": 4.312604214623504e-06, + "loss": 0.4612, + "step": 4051 + }, + { + "epoch": 0.7, + "learning_rate": 4.307989148663921e-06, + "loss": 0.4646, + "step": 4052 + }, + { + "epoch": 0.7, + "learning_rate": 4.3033758753715095e-06, + "loss": 0.4634, + "step": 4053 + }, + { + "epoch": 0.7, + "learning_rate": 4.298764396199191e-06, + "loss": 0.4535, + "step": 4054 + }, + { + "epoch": 0.7, + "learning_rate": 4.294154712599325e-06, + "loss": 0.4565, + "step": 4055 + }, + { + "epoch": 0.7, + "learning_rate": 4.28954682602371e-06, + "loss": 0.4544, + "step": 4056 + }, + { + "epoch": 0.7, + "learning_rate": 4.284940737923571e-06, + "loss": 0.4611, + "step": 4057 + }, + { + "epoch": 0.7, + "learning_rate": 4.280336449749573e-06, + "loss": 0.4682, + "step": 4058 + }, + { + "epoch": 0.7, + "learning_rate": 4.275733962951804e-06, + "loss": 0.472, + "step": 4059 + }, + { + "epoch": 0.7, + "learning_rate": 4.271133278979802e-06, + "loss": 0.461, + "step": 4060 + }, + { + "epoch": 0.7, + "learning_rate": 4.266534399282517e-06, + "loss": 0.4797, + "step": 4061 + }, + { + "epoch": 0.7, + "learning_rate": 4.261937325308347e-06, + "loss": 0.4607, + "step": 4062 + }, + { + "epoch": 0.7, + "learning_rate": 4.257342058505109e-06, + "loss": 0.4616, + "step": 4063 + }, + { + "epoch": 0.7, + "learning_rate": 4.252748600320063e-06, + "loss": 0.4599, + "step": 4064 + }, + { + "epoch": 0.7, + "learning_rate": 4.248156952199895e-06, + "loss": 0.4682, + "step": 4065 + }, + { + "epoch": 0.7, + "learning_rate": 4.243567115590705e-06, + "loss": 0.4599, + "step": 4066 + }, + { + "epoch": 0.7, + "learning_rate": 4.238979091938054e-06, + "loss": 0.4683, + "step": 4067 + }, + { + "epoch": 0.7, + "learning_rate": 4.234392882686904e-06, + "loss": 0.4566, + "step": 4068 + }, + { + "epoch": 0.7, + "learning_rate": 4.2298084892816574e-06, + "loss": 0.4679, + "step": 4069 + }, + { + "epoch": 0.71, + "learning_rate": 4.225225913166146e-06, + "loss": 0.4593, + "step": 4070 + }, + { + "epoch": 0.71, + "learning_rate": 4.2206451557836235e-06, + "loss": 0.4572, + "step": 4071 + }, + { + "epoch": 0.71, + "learning_rate": 4.2160662185767805e-06, + "loss": 0.466, + "step": 4072 + }, + { + "epoch": 0.71, + "learning_rate": 4.21148910298772e-06, + "loss": 0.4592, + "step": 4073 + }, + { + "epoch": 0.71, + "learning_rate": 4.2069138104579825e-06, + "loss": 0.4481, + "step": 4074 + }, + { + "epoch": 0.71, + "learning_rate": 4.202340342428529e-06, + "loss": 0.4599, + "step": 4075 + }, + { + "epoch": 0.71, + "learning_rate": 4.197768700339752e-06, + "loss": 0.4645, + "step": 4076 + }, + { + "epoch": 0.71, + "learning_rate": 4.19319888563146e-06, + "loss": 0.4528, + "step": 4077 + }, + { + "epoch": 0.71, + "learning_rate": 4.188630899742894e-06, + "loss": 0.4721, + "step": 4078 + }, + { + "epoch": 0.71, + "learning_rate": 4.184064744112718e-06, + "loss": 0.4717, + "step": 4079 + }, + { + "epoch": 0.71, + "learning_rate": 4.179500420179011e-06, + "loss": 0.4654, + "step": 4080 + }, + { + "epoch": 0.71, + "learning_rate": 4.174937929379285e-06, + "loss": 0.4483, + "step": 4081 + }, + { + "epoch": 0.71, + "learning_rate": 4.17037727315047e-06, + "loss": 0.4791, + "step": 4082 + }, + { + "epoch": 0.71, + "learning_rate": 4.16581845292892e-06, + "loss": 0.4593, + "step": 4083 + }, + { + "epoch": 0.71, + "learning_rate": 4.161261470150414e-06, + "loss": 0.4577, + "step": 4084 + }, + { + "epoch": 0.71, + "learning_rate": 4.156706326250137e-06, + "loss": 0.4593, + "step": 4085 + }, + { + "epoch": 0.71, + "learning_rate": 4.15215302266272e-06, + "loss": 0.4555, + "step": 4086 + }, + { + "epoch": 0.71, + "learning_rate": 4.147601560822192e-06, + "loss": 0.4354, + "step": 4087 + }, + { + "epoch": 0.71, + "learning_rate": 4.143051942162013e-06, + "loss": 0.475, + "step": 4088 + }, + { + "epoch": 0.71, + "learning_rate": 4.138504168115059e-06, + "loss": 0.451, + "step": 4089 + }, + { + "epoch": 0.71, + "learning_rate": 4.133958240113629e-06, + "loss": 0.461, + "step": 4090 + }, + { + "epoch": 0.71, + "learning_rate": 4.129414159589438e-06, + "loss": 0.4575, + "step": 4091 + }, + { + "epoch": 0.71, + "learning_rate": 4.124871927973611e-06, + "loss": 0.4669, + "step": 4092 + }, + { + "epoch": 0.71, + "learning_rate": 4.120331546696711e-06, + "loss": 0.4473, + "step": 4093 + }, + { + "epoch": 0.71, + "learning_rate": 4.115793017188695e-06, + "loss": 0.4759, + "step": 4094 + }, + { + "epoch": 0.71, + "learning_rate": 4.111256340878952e-06, + "loss": 0.4549, + "step": 4095 + }, + { + "epoch": 0.71, + "learning_rate": 4.106721519196284e-06, + "loss": 0.4605, + "step": 4096 + }, + { + "epoch": 0.71, + "learning_rate": 4.102188553568905e-06, + "loss": 0.4536, + "step": 4097 + }, + { + "epoch": 0.71, + "learning_rate": 4.097657445424454e-06, + "loss": 0.4647, + "step": 4098 + }, + { + "epoch": 0.71, + "learning_rate": 4.093128196189971e-06, + "loss": 0.4607, + "step": 4099 + }, + { + "epoch": 0.71, + "learning_rate": 4.088600807291918e-06, + "loss": 0.4582, + "step": 4100 + }, + { + "epoch": 0.71, + "learning_rate": 4.084075280156175e-06, + "loss": 0.4684, + "step": 4101 + }, + { + "epoch": 0.71, + "learning_rate": 4.079551616208032e-06, + "loss": 0.4605, + "step": 4102 + }, + { + "epoch": 0.71, + "learning_rate": 4.075029816872183e-06, + "loss": 0.4753, + "step": 4103 + }, + { + "epoch": 0.71, + "learning_rate": 4.070509883572754e-06, + "loss": 0.4575, + "step": 4104 + }, + { + "epoch": 0.71, + "learning_rate": 4.065991817733272e-06, + "loss": 0.4594, + "step": 4105 + }, + { + "epoch": 0.71, + "learning_rate": 4.061475620776672e-06, + "loss": 0.4531, + "step": 4106 + }, + { + "epoch": 0.71, + "learning_rate": 4.056961294125305e-06, + "loss": 0.459, + "step": 4107 + }, + { + "epoch": 0.71, + "learning_rate": 4.052448839200935e-06, + "loss": 0.4658, + "step": 4108 + }, + { + "epoch": 0.71, + "learning_rate": 4.04793825742474e-06, + "loss": 0.46, + "step": 4109 + }, + { + "epoch": 0.71, + "learning_rate": 4.0434295502172885e-06, + "loss": 0.4643, + "step": 4110 + }, + { + "epoch": 0.71, + "learning_rate": 4.038922718998585e-06, + "loss": 0.4642, + "step": 4111 + }, + { + "epoch": 0.71, + "learning_rate": 4.034417765188031e-06, + "loss": 0.464, + "step": 4112 + }, + { + "epoch": 0.71, + "learning_rate": 4.0299146902044304e-06, + "loss": 0.4568, + "step": 4113 + }, + { + "epoch": 0.71, + "learning_rate": 4.025413495466004e-06, + "loss": 0.4665, + "step": 4114 + }, + { + "epoch": 0.71, + "learning_rate": 4.020914182390379e-06, + "loss": 0.454, + "step": 4115 + }, + { + "epoch": 0.71, + "learning_rate": 4.016416752394591e-06, + "loss": 0.4543, + "step": 4116 + }, + { + "epoch": 0.71, + "learning_rate": 4.011921206895074e-06, + "loss": 0.4674, + "step": 4117 + }, + { + "epoch": 0.71, + "learning_rate": 4.007427547307676e-06, + "loss": 0.4708, + "step": 4118 + }, + { + "epoch": 0.71, + "learning_rate": 4.00293577504766e-06, + "loss": 0.4561, + "step": 4119 + }, + { + "epoch": 0.71, + "learning_rate": 3.998445891529675e-06, + "loss": 0.465, + "step": 4120 + }, + { + "epoch": 0.71, + "learning_rate": 3.993957898167788e-06, + "loss": 0.455, + "step": 4121 + }, + { + "epoch": 0.71, + "learning_rate": 3.989471796375466e-06, + "loss": 0.4655, + "step": 4122 + }, + { + "epoch": 0.71, + "learning_rate": 3.9849875875655875e-06, + "loss": 0.4592, + "step": 4123 + }, + { + "epoch": 0.71, + "learning_rate": 3.980505273150421e-06, + "loss": 0.4649, + "step": 4124 + }, + { + "epoch": 0.71, + "learning_rate": 3.9760248545416465e-06, + "loss": 0.4585, + "step": 4125 + }, + { + "epoch": 0.71, + "learning_rate": 3.971546333150358e-06, + "loss": 0.4722, + "step": 4126 + }, + { + "epoch": 0.71, + "learning_rate": 3.967069710387029e-06, + "loss": 0.4527, + "step": 4127 + }, + { + "epoch": 0.72, + "learning_rate": 3.962594987661557e-06, + "loss": 0.4672, + "step": 4128 + }, + { + "epoch": 0.72, + "learning_rate": 3.958122166383217e-06, + "loss": 0.4467, + "step": 4129 + }, + { + "epoch": 0.72, + "learning_rate": 3.953651247960715e-06, + "loss": 0.4607, + "step": 4130 + }, + { + "epoch": 0.72, + "learning_rate": 3.949182233802131e-06, + "loss": 0.4523, + "step": 4131 + }, + { + "epoch": 0.72, + "learning_rate": 3.944715125314961e-06, + "loss": 0.4652, + "step": 4132 + }, + { + "epoch": 0.72, + "learning_rate": 3.940249923906093e-06, + "loss": 0.456, + "step": 4133 + }, + { + "epoch": 0.72, + "learning_rate": 3.935786630981819e-06, + "loss": 0.4622, + "step": 4134 + }, + { + "epoch": 0.72, + "learning_rate": 3.931325247947834e-06, + "loss": 0.4512, + "step": 4135 + }, + { + "epoch": 0.72, + "learning_rate": 3.926865776209212e-06, + "loss": 0.4694, + "step": 4136 + }, + { + "epoch": 0.72, + "learning_rate": 3.922408217170454e-06, + "loss": 0.4577, + "step": 4137 + }, + { + "epoch": 0.72, + "learning_rate": 3.917952572235433e-06, + "loss": 0.4587, + "step": 4138 + }, + { + "epoch": 0.72, + "learning_rate": 3.913498842807433e-06, + "loss": 0.457, + "step": 4139 + }, + { + "epoch": 0.72, + "learning_rate": 3.909047030289131e-06, + "loss": 0.4623, + "step": 4140 + }, + { + "epoch": 0.72, + "learning_rate": 3.9045971360826014e-06, + "loss": 0.4521, + "step": 4141 + }, + { + "epoch": 0.72, + "learning_rate": 3.900149161589317e-06, + "loss": 0.4721, + "step": 4142 + }, + { + "epoch": 0.72, + "learning_rate": 3.895703108210135e-06, + "loss": 0.4549, + "step": 4143 + }, + { + "epoch": 0.72, + "learning_rate": 3.891258977345319e-06, + "loss": 0.4636, + "step": 4144 + }, + { + "epoch": 0.72, + "learning_rate": 3.886816770394524e-06, + "loss": 0.4565, + "step": 4145 + }, + { + "epoch": 0.72, + "learning_rate": 3.882376488756797e-06, + "loss": 0.4719, + "step": 4146 + }, + { + "epoch": 0.72, + "learning_rate": 3.877938133830581e-06, + "loss": 0.4526, + "step": 4147 + }, + { + "epoch": 0.72, + "learning_rate": 3.873501707013711e-06, + "loss": 0.4708, + "step": 4148 + }, + { + "epoch": 0.72, + "learning_rate": 3.869067209703418e-06, + "loss": 0.4645, + "step": 4149 + }, + { + "epoch": 0.72, + "learning_rate": 3.8646346432963165e-06, + "loss": 0.4581, + "step": 4150 + }, + { + "epoch": 0.72, + "learning_rate": 3.860204009188421e-06, + "loss": 0.4715, + "step": 4151 + }, + { + "epoch": 0.72, + "learning_rate": 3.8557753087751345e-06, + "loss": 0.4606, + "step": 4152 + }, + { + "epoch": 0.72, + "learning_rate": 3.851348543451253e-06, + "loss": 0.4533, + "step": 4153 + }, + { + "epoch": 0.72, + "learning_rate": 3.846923714610962e-06, + "loss": 0.4532, + "step": 4154 + }, + { + "epoch": 0.72, + "learning_rate": 3.8425008236478355e-06, + "loss": 0.458, + "step": 4155 + }, + { + "epoch": 0.72, + "learning_rate": 3.838079871954842e-06, + "loss": 0.455, + "step": 4156 + }, + { + "epoch": 0.72, + "learning_rate": 3.833660860924328e-06, + "loss": 0.4683, + "step": 4157 + }, + { + "epoch": 0.72, + "learning_rate": 3.829243791948043e-06, + "loss": 0.462, + "step": 4158 + }, + { + "epoch": 0.72, + "learning_rate": 3.824828666417114e-06, + "loss": 0.4533, + "step": 4159 + }, + { + "epoch": 0.72, + "learning_rate": 3.820415485722064e-06, + "loss": 0.4669, + "step": 4160 + }, + { + "epoch": 0.72, + "learning_rate": 3.8160042512528e-06, + "loss": 0.4668, + "step": 4161 + }, + { + "epoch": 0.72, + "learning_rate": 3.8115949643986095e-06, + "loss": 0.461, + "step": 4162 + }, + { + "epoch": 0.72, + "learning_rate": 3.8071876265481823e-06, + "loss": 0.4602, + "step": 4163 + }, + { + "epoch": 0.72, + "learning_rate": 3.8027822390895774e-06, + "loss": 0.4562, + "step": 4164 + }, + { + "epoch": 0.72, + "learning_rate": 3.7983788034102488e-06, + "loss": 0.4569, + "step": 4165 + }, + { + "epoch": 0.72, + "learning_rate": 3.7939773208970353e-06, + "loss": 0.4684, + "step": 4166 + }, + { + "epoch": 0.72, + "learning_rate": 3.7895777929361586e-06, + "loss": 0.4488, + "step": 4167 + }, + { + "epoch": 0.72, + "learning_rate": 3.7851802209132303e-06, + "loss": 0.4698, + "step": 4168 + }, + { + "epoch": 0.72, + "learning_rate": 3.7807846062132293e-06, + "loss": 0.4496, + "step": 4169 + }, + { + "epoch": 0.72, + "learning_rate": 3.776390950220544e-06, + "loss": 0.463, + "step": 4170 + }, + { + "epoch": 0.72, + "learning_rate": 3.7719992543189233e-06, + "loss": 0.4654, + "step": 4171 + }, + { + "epoch": 0.72, + "learning_rate": 3.767609519891513e-06, + "loss": 0.475, + "step": 4172 + }, + { + "epoch": 0.72, + "learning_rate": 3.7632217483208242e-06, + "loss": 0.4586, + "step": 4173 + }, + { + "epoch": 0.72, + "learning_rate": 3.758835940988773e-06, + "loss": 0.4617, + "step": 4174 + }, + { + "epoch": 0.72, + "learning_rate": 3.7544520992766454e-06, + "loss": 0.462, + "step": 4175 + }, + { + "epoch": 0.72, + "learning_rate": 3.7500702245651e-06, + "loss": 0.4621, + "step": 4176 + }, + { + "epoch": 0.72, + "learning_rate": 3.745690318234186e-06, + "loss": 0.4515, + "step": 4177 + }, + { + "epoch": 0.72, + "learning_rate": 3.7413123816633344e-06, + "loss": 0.4545, + "step": 4178 + }, + { + "epoch": 0.72, + "learning_rate": 3.7369364162313528e-06, + "loss": 0.453, + "step": 4179 + }, + { + "epoch": 0.72, + "learning_rate": 3.7325624233164157e-06, + "loss": 0.4785, + "step": 4180 + }, + { + "epoch": 0.72, + "learning_rate": 3.7281904042961016e-06, + "loss": 0.4586, + "step": 4181 + }, + { + "epoch": 0.72, + "learning_rate": 3.723820360547351e-06, + "loss": 0.4756, + "step": 4182 + }, + { + "epoch": 0.72, + "learning_rate": 3.7194522934464785e-06, + "loss": 0.4498, + "step": 4183 + }, + { + "epoch": 0.72, + "learning_rate": 3.715086204369186e-06, + "loss": 0.4692, + "step": 4184 + }, + { + "epoch": 0.72, + "learning_rate": 3.7107220946905497e-06, + "loss": 0.4498, + "step": 4185 + }, + { + "epoch": 0.73, + "learning_rate": 3.7063599657850248e-06, + "loss": 0.4646, + "step": 4186 + }, + { + "epoch": 0.73, + "learning_rate": 3.701999819026432e-06, + "loss": 0.4478, + "step": 4187 + }, + { + "epoch": 0.73, + "learning_rate": 3.6976416557879757e-06, + "loss": 0.4595, + "step": 4188 + }, + { + "epoch": 0.73, + "learning_rate": 3.6932854774422457e-06, + "loss": 0.4676, + "step": 4189 + }, + { + "epoch": 0.73, + "learning_rate": 3.6889312853611857e-06, + "loss": 0.4629, + "step": 4190 + }, + { + "epoch": 0.73, + "learning_rate": 3.6845790809161273e-06, + "loss": 0.4552, + "step": 4191 + }, + { + "epoch": 0.73, + "learning_rate": 3.680228865477774e-06, + "loss": 0.4722, + "step": 4192 + }, + { + "epoch": 0.73, + "learning_rate": 3.675880640416205e-06, + "loss": 0.4545, + "step": 4193 + }, + { + "epoch": 0.73, + "learning_rate": 3.671534407100863e-06, + "loss": 0.4684, + "step": 4194 + }, + { + "epoch": 0.73, + "learning_rate": 3.6671901669005683e-06, + "loss": 0.4703, + "step": 4195 + }, + { + "epoch": 0.73, + "learning_rate": 3.662847921183528e-06, + "loss": 0.4646, + "step": 4196 + }, + { + "epoch": 0.73, + "learning_rate": 3.658507671317296e-06, + "loss": 0.4587, + "step": 4197 + }, + { + "epoch": 0.73, + "learning_rate": 3.654169418668815e-06, + "loss": 0.464, + "step": 4198 + }, + { + "epoch": 0.73, + "learning_rate": 3.6498331646043917e-06, + "loss": 0.4713, + "step": 4199 + }, + { + "epoch": 0.73, + "learning_rate": 3.6454989104897097e-06, + "loss": 0.4587, + "step": 4200 + }, + { + "epoch": 0.73, + "learning_rate": 3.641166657689812e-06, + "loss": 0.4594, + "step": 4201 + }, + { + "epoch": 0.73, + "learning_rate": 3.63683640756912e-06, + "loss": 0.4769, + "step": 4202 + }, + { + "epoch": 0.73, + "learning_rate": 3.6325081614914216e-06, + "loss": 0.4718, + "step": 4203 + }, + { + "epoch": 0.73, + "learning_rate": 3.6281819208198744e-06, + "loss": 0.4522, + "step": 4204 + }, + { + "epoch": 0.73, + "learning_rate": 3.6238576869170074e-06, + "loss": 0.4739, + "step": 4205 + }, + { + "epoch": 0.73, + "learning_rate": 3.6195354611447033e-06, + "loss": 0.4603, + "step": 4206 + }, + { + "epoch": 0.73, + "learning_rate": 3.6152152448642374e-06, + "loss": 0.4557, + "step": 4207 + }, + { + "epoch": 0.73, + "learning_rate": 3.6108970394362274e-06, + "loss": 0.4721, + "step": 4208 + }, + { + "epoch": 0.73, + "learning_rate": 3.606580846220671e-06, + "loss": 0.457, + "step": 4209 + }, + { + "epoch": 0.73, + "learning_rate": 3.602266666576929e-06, + "loss": 0.4608, + "step": 4210 + }, + { + "epoch": 0.73, + "learning_rate": 3.59795450186373e-06, + "loss": 0.4588, + "step": 4211 + }, + { + "epoch": 0.73, + "learning_rate": 3.5936443534391676e-06, + "loss": 0.4771, + "step": 4212 + }, + { + "epoch": 0.73, + "learning_rate": 3.58933622266069e-06, + "loss": 0.4578, + "step": 4213 + }, + { + "epoch": 0.73, + "learning_rate": 3.5850301108851326e-06, + "loss": 0.4705, + "step": 4214 + }, + { + "epoch": 0.73, + "learning_rate": 3.580726019468671e-06, + "loss": 0.455, + "step": 4215 + }, + { + "epoch": 0.73, + "learning_rate": 3.5764239497668584e-06, + "loss": 0.462, + "step": 4216 + }, + { + "epoch": 0.73, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.4613, + "step": 4217 + }, + { + "epoch": 0.73, + "learning_rate": 3.5678258809261935e-06, + "loss": 0.4637, + "step": 4218 + }, + { + "epoch": 0.73, + "learning_rate": 3.563529884495259e-06, + "loss": 0.4509, + "step": 4219 + }, + { + "epoch": 0.73, + "learning_rate": 3.5592359151947974e-06, + "loss": 0.4659, + "step": 4220 + }, + { + "epoch": 0.73, + "learning_rate": 3.554943974377174e-06, + "loss": 0.4512, + "step": 4221 + }, + { + "epoch": 0.73, + "learning_rate": 3.55065406339411e-06, + "loss": 0.4715, + "step": 4222 + }, + { + "epoch": 0.73, + "learning_rate": 3.546366183596691e-06, + "loss": 0.4478, + "step": 4223 + }, + { + "epoch": 0.73, + "learning_rate": 3.5420803363353604e-06, + "loss": 0.463, + "step": 4224 + }, + { + "epoch": 0.73, + "learning_rate": 3.537796522959921e-06, + "loss": 0.4407, + "step": 4225 + }, + { + "epoch": 0.73, + "learning_rate": 3.5335147448195406e-06, + "loss": 0.4692, + "step": 4226 + }, + { + "epoch": 0.73, + "learning_rate": 3.5292350032627344e-06, + "loss": 0.4446, + "step": 4227 + }, + { + "epoch": 0.73, + "learning_rate": 3.524957299637386e-06, + "loss": 0.4563, + "step": 4228 + }, + { + "epoch": 0.73, + "learning_rate": 3.5206816352907347e-06, + "loss": 0.4592, + "step": 4229 + }, + { + "epoch": 0.73, + "learning_rate": 3.5164080115693767e-06, + "loss": 0.471, + "step": 4230 + }, + { + "epoch": 0.73, + "learning_rate": 3.5121364298192673e-06, + "loss": 0.4637, + "step": 4231 + }, + { + "epoch": 0.73, + "learning_rate": 3.507866891385716e-06, + "loss": 0.457, + "step": 4232 + }, + { + "epoch": 0.73, + "learning_rate": 3.503599397613394e-06, + "loss": 0.4632, + "step": 4233 + }, + { + "epoch": 0.73, + "learning_rate": 3.4993339498463197e-06, + "loss": 0.4578, + "step": 4234 + }, + { + "epoch": 0.73, + "learning_rate": 3.4950705494278723e-06, + "loss": 0.4661, + "step": 4235 + }, + { + "epoch": 0.73, + "learning_rate": 3.4908091977007896e-06, + "loss": 0.4629, + "step": 4236 + }, + { + "epoch": 0.73, + "learning_rate": 3.4865498960071576e-06, + "loss": 0.4547, + "step": 4237 + }, + { + "epoch": 0.73, + "learning_rate": 3.4822926456884243e-06, + "loss": 0.452, + "step": 4238 + }, + { + "epoch": 0.73, + "learning_rate": 3.4780374480853774e-06, + "loss": 0.4657, + "step": 4239 + }, + { + "epoch": 0.73, + "learning_rate": 3.47378430453818e-06, + "loss": 0.4537, + "step": 4240 + }, + { + "epoch": 0.73, + "learning_rate": 3.469533216386328e-06, + "loss": 0.4718, + "step": 4241 + }, + { + "epoch": 0.73, + "learning_rate": 3.465284184968679e-06, + "loss": 0.4629, + "step": 4242 + }, + { + "epoch": 0.74, + "learning_rate": 3.4610372116234425e-06, + "loss": 0.4636, + "step": 4243 + }, + { + "epoch": 0.74, + "learning_rate": 3.456792297688181e-06, + "loss": 0.4467, + "step": 4244 + }, + { + "epoch": 0.74, + "learning_rate": 3.4525494444998064e-06, + "loss": 0.4652, + "step": 4245 + }, + { + "epoch": 0.74, + "learning_rate": 3.4483086533945776e-06, + "loss": 0.4378, + "step": 4246 + }, + { + "epoch": 0.74, + "learning_rate": 3.4440699257081113e-06, + "loss": 0.4639, + "step": 4247 + }, + { + "epoch": 0.74, + "learning_rate": 3.4398332627753704e-06, + "loss": 0.4473, + "step": 4248 + }, + { + "epoch": 0.74, + "learning_rate": 3.435598665930672e-06, + "loss": 0.4607, + "step": 4249 + }, + { + "epoch": 0.74, + "learning_rate": 3.431366136507669e-06, + "loss": 0.4599, + "step": 4250 + }, + { + "epoch": 0.74, + "learning_rate": 3.4271356758393827e-06, + "loss": 0.4609, + "step": 4251 + }, + { + "epoch": 0.74, + "learning_rate": 3.4229072852581735e-06, + "loss": 0.4724, + "step": 4252 + }, + { + "epoch": 0.74, + "learning_rate": 3.4186809660957433e-06, + "loss": 0.4685, + "step": 4253 + }, + { + "epoch": 0.74, + "learning_rate": 3.41445671968315e-06, + "loss": 0.4552, + "step": 4254 + }, + { + "epoch": 0.74, + "learning_rate": 3.410234547350797e-06, + "loss": 0.458, + "step": 4255 + }, + { + "epoch": 0.74, + "learning_rate": 3.4060144504284375e-06, + "loss": 0.4633, + "step": 4256 + }, + { + "epoch": 0.74, + "learning_rate": 3.4017964302451578e-06, + "loss": 0.4646, + "step": 4257 + }, + { + "epoch": 0.74, + "learning_rate": 3.3975804881294095e-06, + "loss": 0.4643, + "step": 4258 + }, + { + "epoch": 0.74, + "learning_rate": 3.393366625408979e-06, + "loss": 0.4545, + "step": 4259 + }, + { + "epoch": 0.74, + "learning_rate": 3.3891548434109942e-06, + "loss": 0.4573, + "step": 4260 + }, + { + "epoch": 0.74, + "learning_rate": 3.384945143461936e-06, + "loss": 0.4598, + "step": 4261 + }, + { + "epoch": 0.74, + "learning_rate": 3.380737526887624e-06, + "loss": 0.4526, + "step": 4262 + }, + { + "epoch": 0.74, + "learning_rate": 3.376531995013228e-06, + "loss": 0.4548, + "step": 4263 + }, + { + "epoch": 0.74, + "learning_rate": 3.3723285491632508e-06, + "loss": 0.4643, + "step": 4264 + }, + { + "epoch": 0.74, + "learning_rate": 3.368127190661543e-06, + "loss": 0.4648, + "step": 4265 + }, + { + "epoch": 0.74, + "learning_rate": 3.3639279208313113e-06, + "loss": 0.456, + "step": 4266 + }, + { + "epoch": 0.74, + "learning_rate": 3.35973074099508e-06, + "loss": 0.4522, + "step": 4267 + }, + { + "epoch": 0.74, + "learning_rate": 3.3555356524747307e-06, + "loss": 0.4633, + "step": 4268 + }, + { + "epoch": 0.74, + "learning_rate": 3.3513426565914854e-06, + "loss": 0.4587, + "step": 4269 + }, + { + "epoch": 0.74, + "learning_rate": 3.3471517546659072e-06, + "loss": 0.4573, + "step": 4270 + }, + { + "epoch": 0.74, + "learning_rate": 3.3429629480178905e-06, + "loss": 0.4629, + "step": 4271 + }, + { + "epoch": 0.74, + "learning_rate": 3.3387762379666753e-06, + "loss": 0.4619, + "step": 4272 + }, + { + "epoch": 0.74, + "learning_rate": 3.3345916258308565e-06, + "loss": 0.4738, + "step": 4273 + }, + { + "epoch": 0.74, + "learning_rate": 3.33040911292834e-06, + "loss": 0.4561, + "step": 4274 + }, + { + "epoch": 0.74, + "learning_rate": 3.3262287005763915e-06, + "loss": 0.4697, + "step": 4275 + }, + { + "epoch": 0.74, + "learning_rate": 3.3220503900916068e-06, + "loss": 0.4553, + "step": 4276 + }, + { + "epoch": 0.74, + "learning_rate": 3.3178741827899253e-06, + "loss": 0.4582, + "step": 4277 + }, + { + "epoch": 0.74, + "learning_rate": 3.3137000799866148e-06, + "loss": 0.4789, + "step": 4278 + }, + { + "epoch": 0.74, + "learning_rate": 3.309528082996287e-06, + "loss": 0.4688, + "step": 4279 + }, + { + "epoch": 0.74, + "learning_rate": 3.3053581931328914e-06, + "loss": 0.4618, + "step": 4280 + }, + { + "epoch": 0.74, + "learning_rate": 3.3011904117097093e-06, + "loss": 0.4571, + "step": 4281 + }, + { + "epoch": 0.74, + "learning_rate": 3.297024740039366e-06, + "loss": 0.443, + "step": 4282 + }, + { + "epoch": 0.74, + "learning_rate": 3.292861179433805e-06, + "loss": 0.4692, + "step": 4283 + }, + { + "epoch": 0.74, + "learning_rate": 3.28869973120433e-06, + "loss": 0.4474, + "step": 4284 + }, + { + "epoch": 0.74, + "learning_rate": 3.2845403966615574e-06, + "loss": 0.4649, + "step": 4285 + }, + { + "epoch": 0.74, + "learning_rate": 3.2803831771154483e-06, + "loss": 0.4625, + "step": 4286 + }, + { + "epoch": 0.74, + "learning_rate": 3.276228073875296e-06, + "loss": 0.4664, + "step": 4287 + }, + { + "epoch": 0.74, + "learning_rate": 3.2720750882497276e-06, + "loss": 0.4606, + "step": 4288 + }, + { + "epoch": 0.74, + "learning_rate": 3.2679242215467066e-06, + "loss": 0.4553, + "step": 4289 + }, + { + "epoch": 0.74, + "learning_rate": 3.263775475073514e-06, + "loss": 0.4546, + "step": 4290 + }, + { + "epoch": 0.74, + "learning_rate": 3.259628850136789e-06, + "loss": 0.4592, + "step": 4291 + }, + { + "epoch": 0.74, + "learning_rate": 3.255484348042478e-06, + "loss": 0.464, + "step": 4292 + }, + { + "epoch": 0.74, + "learning_rate": 3.2513419700958715e-06, + "loss": 0.4555, + "step": 4293 + }, + { + "epoch": 0.74, + "learning_rate": 3.2472017176015893e-06, + "loss": 0.4587, + "step": 4294 + }, + { + "epoch": 0.74, + "learning_rate": 3.24306359186358e-06, + "loss": 0.4514, + "step": 4295 + }, + { + "epoch": 0.74, + "learning_rate": 3.238927594185127e-06, + "loss": 0.4553, + "step": 4296 + }, + { + "epoch": 0.74, + "learning_rate": 3.2347937258688342e-06, + "loss": 0.461, + "step": 4297 + }, + { + "epoch": 0.74, + "learning_rate": 3.2306619882166414e-06, + "loss": 0.4527, + "step": 4298 + }, + { + "epoch": 0.74, + "learning_rate": 3.226532382529819e-06, + "loss": 0.4625, + "step": 4299 + }, + { + "epoch": 0.74, + "learning_rate": 3.2224049101089616e-06, + "loss": 0.4595, + "step": 4300 + }, + { + "epoch": 0.75, + "learning_rate": 3.218279572253994e-06, + "loss": 0.4629, + "step": 4301 + }, + { + "epoch": 0.75, + "learning_rate": 3.214156370264169e-06, + "loss": 0.4642, + "step": 4302 + }, + { + "epoch": 0.75, + "learning_rate": 3.2100353054380683e-06, + "loss": 0.456, + "step": 4303 + }, + { + "epoch": 0.75, + "learning_rate": 3.2059163790735927e-06, + "loss": 0.4647, + "step": 4304 + }, + { + "epoch": 0.75, + "learning_rate": 3.201799592467978e-06, + "loss": 0.46, + "step": 4305 + }, + { + "epoch": 0.75, + "learning_rate": 3.197684946917784e-06, + "loss": 0.4564, + "step": 4306 + }, + { + "epoch": 0.75, + "learning_rate": 3.1935724437188954e-06, + "loss": 0.4499, + "step": 4307 + }, + { + "epoch": 0.75, + "learning_rate": 3.1894620841665248e-06, + "loss": 0.4549, + "step": 4308 + }, + { + "epoch": 0.75, + "learning_rate": 3.1853538695551965e-06, + "loss": 0.4534, + "step": 4309 + }, + { + "epoch": 0.75, + "learning_rate": 3.181247801178785e-06, + "loss": 0.4736, + "step": 4310 + }, + { + "epoch": 0.75, + "learning_rate": 3.177143880330463e-06, + "loss": 0.4637, + "step": 4311 + }, + { + "epoch": 0.75, + "learning_rate": 3.1730421083027395e-06, + "loss": 0.466, + "step": 4312 + }, + { + "epoch": 0.75, + "learning_rate": 3.1689424863874473e-06, + "loss": 0.4582, + "step": 4313 + }, + { + "epoch": 0.75, + "learning_rate": 3.1648450158757373e-06, + "loss": 0.4527, + "step": 4314 + }, + { + "epoch": 0.75, + "learning_rate": 3.1607496980580897e-06, + "loss": 0.447, + "step": 4315 + }, + { + "epoch": 0.75, + "learning_rate": 3.1566565342242916e-06, + "loss": 0.4677, + "step": 4316 + }, + { + "epoch": 0.75, + "learning_rate": 3.1525655256634757e-06, + "loss": 0.4666, + "step": 4317 + }, + { + "epoch": 0.75, + "learning_rate": 3.1484766736640717e-06, + "loss": 0.4664, + "step": 4318 + }, + { + "epoch": 0.75, + "learning_rate": 3.1443899795138454e-06, + "loss": 0.4636, + "step": 4319 + }, + { + "epoch": 0.75, + "learning_rate": 3.140305444499877e-06, + "loss": 0.4435, + "step": 4320 + }, + { + "epoch": 0.75, + "learning_rate": 3.1362230699085693e-06, + "loss": 0.4718, + "step": 4321 + }, + { + "epoch": 0.75, + "learning_rate": 3.1321428570256464e-06, + "loss": 0.451, + "step": 4322 + }, + { + "epoch": 0.75, + "learning_rate": 3.128064807136142e-06, + "loss": 0.468, + "step": 4323 + }, + { + "epoch": 0.75, + "learning_rate": 3.123988921524418e-06, + "loss": 0.4656, + "step": 4324 + }, + { + "epoch": 0.75, + "learning_rate": 3.119915201474153e-06, + "loss": 0.4552, + "step": 4325 + }, + { + "epoch": 0.75, + "learning_rate": 3.115843648268344e-06, + "loss": 0.4616, + "step": 4326 + }, + { + "epoch": 0.75, + "learning_rate": 3.1117742631892965e-06, + "loss": 0.469, + "step": 4327 + }, + { + "epoch": 0.75, + "learning_rate": 3.107707047518649e-06, + "loss": 0.4706, + "step": 4328 + }, + { + "epoch": 0.75, + "learning_rate": 3.103642002537349e-06, + "loss": 0.4639, + "step": 4329 + }, + { + "epoch": 0.75, + "learning_rate": 3.099579129525653e-06, + "loss": 0.4644, + "step": 4330 + }, + { + "epoch": 0.75, + "learning_rate": 3.0955184297631437e-06, + "loss": 0.4657, + "step": 4331 + }, + { + "epoch": 0.75, + "learning_rate": 3.0914599045287165e-06, + "loss": 0.4497, + "step": 4332 + }, + { + "epoch": 0.75, + "learning_rate": 3.087403555100583e-06, + "loss": 0.4677, + "step": 4333 + }, + { + "epoch": 0.75, + "learning_rate": 3.0833493827562598e-06, + "loss": 0.4448, + "step": 4334 + }, + { + "epoch": 0.75, + "learning_rate": 3.079297388772595e-06, + "loss": 0.4665, + "step": 4335 + }, + { + "epoch": 0.75, + "learning_rate": 3.0752475744257414e-06, + "loss": 0.4676, + "step": 4336 + }, + { + "epoch": 0.75, + "learning_rate": 3.0711999409911587e-06, + "loss": 0.4557, + "step": 4337 + }, + { + "epoch": 0.75, + "learning_rate": 3.067154489743631e-06, + "loss": 0.4644, + "step": 4338 + }, + { + "epoch": 0.75, + "learning_rate": 3.06311122195725e-06, + "loss": 0.452, + "step": 4339 + }, + { + "epoch": 0.75, + "learning_rate": 3.0590701389054235e-06, + "loss": 0.4546, + "step": 4340 + }, + { + "epoch": 0.75, + "learning_rate": 3.0550312418608617e-06, + "loss": 0.455, + "step": 4341 + }, + { + "epoch": 0.75, + "learning_rate": 3.0509945320955925e-06, + "loss": 0.4618, + "step": 4342 + }, + { + "epoch": 0.75, + "learning_rate": 3.046960010880966e-06, + "loss": 0.4677, + "step": 4343 + }, + { + "epoch": 0.75, + "learning_rate": 3.042927679487622e-06, + "loss": 0.4728, + "step": 4344 + }, + { + "epoch": 0.75, + "learning_rate": 3.0388975391855226e-06, + "loss": 0.4678, + "step": 4345 + }, + { + "epoch": 0.75, + "learning_rate": 3.03486959124394e-06, + "loss": 0.456, + "step": 4346 + }, + { + "epoch": 0.75, + "learning_rate": 3.0308438369314563e-06, + "loss": 0.4641, + "step": 4347 + }, + { + "epoch": 0.75, + "learning_rate": 3.026820277515955e-06, + "loss": 0.4568, + "step": 4348 + }, + { + "epoch": 0.75, + "learning_rate": 3.022798914264633e-06, + "loss": 0.4769, + "step": 4349 + }, + { + "epoch": 0.75, + "learning_rate": 3.018779748444005e-06, + "loss": 0.4551, + "step": 4350 + }, + { + "epoch": 0.75, + "learning_rate": 3.0147627813198777e-06, + "loss": 0.4693, + "step": 4351 + }, + { + "epoch": 0.75, + "learning_rate": 3.0107480141573763e-06, + "loss": 0.46, + "step": 4352 + }, + { + "epoch": 0.75, + "learning_rate": 3.006735448220922e-06, + "loss": 0.4506, + "step": 4353 + }, + { + "epoch": 0.75, + "learning_rate": 3.002725084774262e-06, + "loss": 0.4529, + "step": 4354 + }, + { + "epoch": 0.75, + "learning_rate": 2.998716925080427e-06, + "loss": 0.4752, + "step": 4355 + }, + { + "epoch": 0.75, + "learning_rate": 2.9947109704017707e-06, + "loss": 0.4475, + "step": 4356 + }, + { + "epoch": 0.75, + "learning_rate": 2.9907072219999443e-06, + "loss": 0.4598, + "step": 4357 + }, + { + "epoch": 0.75, + "learning_rate": 2.9867056811359063e-06, + "loss": 0.4503, + "step": 4358 + }, + { + "epoch": 0.76, + "learning_rate": 2.9827063490699225e-06, + "loss": 0.4649, + "step": 4359 + }, + { + "epoch": 0.76, + "learning_rate": 2.9787092270615527e-06, + "loss": 0.4645, + "step": 4360 + }, + { + "epoch": 0.76, + "learning_rate": 2.974714316369679e-06, + "loss": 0.4648, + "step": 4361 + }, + { + "epoch": 0.76, + "learning_rate": 2.9707216182524667e-06, + "loss": 0.461, + "step": 4362 + }, + { + "epoch": 0.76, + "learning_rate": 2.966731133967399e-06, + "loss": 0.4601, + "step": 4363 + }, + { + "epoch": 0.76, + "learning_rate": 2.9627428647712553e-06, + "loss": 0.4539, + "step": 4364 + }, + { + "epoch": 0.76, + "learning_rate": 2.9587568119201193e-06, + "loss": 0.4541, + "step": 4365 + }, + { + "epoch": 0.76, + "learning_rate": 2.954772976669378e-06, + "loss": 0.4597, + "step": 4366 + }, + { + "epoch": 0.76, + "learning_rate": 2.950791360273714e-06, + "loss": 0.4622, + "step": 4367 + }, + { + "epoch": 0.76, + "learning_rate": 2.9468119639871163e-06, + "loss": 0.4503, + "step": 4368 + }, + { + "epoch": 0.76, + "learning_rate": 2.942834789062876e-06, + "loss": 0.4597, + "step": 4369 + }, + { + "epoch": 0.76, + "learning_rate": 2.9388598367535793e-06, + "loss": 0.4547, + "step": 4370 + }, + { + "epoch": 0.76, + "learning_rate": 2.9348871083111185e-06, + "loss": 0.4495, + "step": 4371 + }, + { + "epoch": 0.76, + "learning_rate": 2.93091660498668e-06, + "loss": 0.4566, + "step": 4372 + }, + { + "epoch": 0.76, + "learning_rate": 2.926948328030755e-06, + "loss": 0.462, + "step": 4373 + }, + { + "epoch": 0.76, + "learning_rate": 2.9229822786931263e-06, + "loss": 0.4492, + "step": 4374 + }, + { + "epoch": 0.76, + "learning_rate": 2.9190184582228787e-06, + "loss": 0.4665, + "step": 4375 + }, + { + "epoch": 0.76, + "learning_rate": 2.9150568678683987e-06, + "loss": 0.4561, + "step": 4376 + }, + { + "epoch": 0.76, + "learning_rate": 2.911097508877365e-06, + "loss": 0.4533, + "step": 4377 + }, + { + "epoch": 0.76, + "learning_rate": 2.907140382496757e-06, + "loss": 0.4547, + "step": 4378 + }, + { + "epoch": 0.76, + "learning_rate": 2.9031854899728485e-06, + "loss": 0.4533, + "step": 4379 + }, + { + "epoch": 0.76, + "learning_rate": 2.899232832551214e-06, + "loss": 0.4637, + "step": 4380 + }, + { + "epoch": 0.76, + "learning_rate": 2.8952824114767164e-06, + "loss": 0.465, + "step": 4381 + }, + { + "epoch": 0.76, + "learning_rate": 2.891334227993521e-06, + "loss": 0.4696, + "step": 4382 + }, + { + "epoch": 0.76, + "learning_rate": 2.8873882833450863e-06, + "loss": 0.4657, + "step": 4383 + }, + { + "epoch": 0.76, + "learning_rate": 2.8834445787741647e-06, + "loss": 0.4593, + "step": 4384 + }, + { + "epoch": 0.76, + "learning_rate": 2.8795031155228083e-06, + "loss": 0.4618, + "step": 4385 + }, + { + "epoch": 0.76, + "learning_rate": 2.8755638948323494e-06, + "loss": 0.4623, + "step": 4386 + }, + { + "epoch": 0.76, + "learning_rate": 2.8716269179434366e-06, + "loss": 0.4635, + "step": 4387 + }, + { + "epoch": 0.76, + "learning_rate": 2.8676921860959874e-06, + "loss": 0.4454, + "step": 4388 + }, + { + "epoch": 0.76, + "learning_rate": 2.8637597005292295e-06, + "loss": 0.4648, + "step": 4389 + }, + { + "epoch": 0.76, + "learning_rate": 2.859829462481676e-06, + "loss": 0.4558, + "step": 4390 + }, + { + "epoch": 0.76, + "learning_rate": 2.855901473191134e-06, + "loss": 0.4496, + "step": 4391 + }, + { + "epoch": 0.76, + "learning_rate": 2.851975733894705e-06, + "loss": 0.4724, + "step": 4392 + }, + { + "epoch": 0.76, + "learning_rate": 2.8480522458287686e-06, + "loss": 0.4589, + "step": 4393 + }, + { + "epoch": 0.76, + "learning_rate": 2.8441310102290187e-06, + "loss": 0.4592, + "step": 4394 + }, + { + "epoch": 0.76, + "learning_rate": 2.840212028330418e-06, + "loss": 0.4554, + "step": 4395 + }, + { + "epoch": 0.76, + "learning_rate": 2.8362953013672325e-06, + "loss": 0.4692, + "step": 4396 + }, + { + "epoch": 0.76, + "learning_rate": 2.8323808305730062e-06, + "loss": 0.451, + "step": 4397 + }, + { + "epoch": 0.76, + "learning_rate": 2.8284686171805875e-06, + "loss": 0.452, + "step": 4398 + }, + { + "epoch": 0.76, + "learning_rate": 2.8245586624221076e-06, + "loss": 0.4629, + "step": 4399 + }, + { + "epoch": 0.76, + "learning_rate": 2.8206509675289785e-06, + "loss": 0.4694, + "step": 4400 + }, + { + "epoch": 0.76, + "learning_rate": 2.8167455337319084e-06, + "loss": 0.4702, + "step": 4401 + }, + { + "epoch": 0.76, + "learning_rate": 2.8128423622608947e-06, + "loss": 0.4631, + "step": 4402 + }, + { + "epoch": 0.76, + "learning_rate": 2.808941454345221e-06, + "loss": 0.4537, + "step": 4403 + }, + { + "epoch": 0.76, + "learning_rate": 2.8050428112134474e-06, + "loss": 0.4615, + "step": 4404 + }, + { + "epoch": 0.76, + "learning_rate": 2.8011464340934403e-06, + "loss": 0.4602, + "step": 4405 + }, + { + "epoch": 0.76, + "learning_rate": 2.7972523242123407e-06, + "loss": 0.4631, + "step": 4406 + }, + { + "epoch": 0.76, + "learning_rate": 2.79336048279657e-06, + "loss": 0.4574, + "step": 4407 + }, + { + "epoch": 0.76, + "learning_rate": 2.7894709110718476e-06, + "loss": 0.4615, + "step": 4408 + }, + { + "epoch": 0.76, + "learning_rate": 2.7855836102631707e-06, + "loss": 0.4502, + "step": 4409 + }, + { + "epoch": 0.76, + "learning_rate": 2.781698581594826e-06, + "loss": 0.4604, + "step": 4410 + }, + { + "epoch": 0.76, + "learning_rate": 2.7778158262903764e-06, + "loss": 0.4571, + "step": 4411 + }, + { + "epoch": 0.76, + "learning_rate": 2.7739353455726735e-06, + "loss": 0.4692, + "step": 4412 + }, + { + "epoch": 0.76, + "learning_rate": 2.7700571406638633e-06, + "loss": 0.4588, + "step": 4413 + }, + { + "epoch": 0.76, + "learning_rate": 2.7661812127853536e-06, + "loss": 0.4522, + "step": 4414 + }, + { + "epoch": 0.76, + "learning_rate": 2.762307563157852e-06, + "loss": 0.4688, + "step": 4415 + }, + { + "epoch": 0.77, + "learning_rate": 2.7584361930013413e-06, + "loss": 0.4656, + "step": 4416 + }, + { + "epoch": 0.77, + "learning_rate": 2.7545671035350907e-06, + "loss": 0.4602, + "step": 4417 + }, + { + "epoch": 0.77, + "learning_rate": 2.7507002959776443e-06, + "loss": 0.4645, + "step": 4418 + }, + { + "epoch": 0.77, + "learning_rate": 2.7468357715468296e-06, + "loss": 0.4498, + "step": 4419 + }, + { + "epoch": 0.77, + "learning_rate": 2.742973531459767e-06, + "loss": 0.4576, + "step": 4420 + }, + { + "epoch": 0.77, + "learning_rate": 2.739113576932838e-06, + "loss": 0.4691, + "step": 4421 + }, + { + "epoch": 0.77, + "learning_rate": 2.735255909181719e-06, + "loss": 0.466, + "step": 4422 + }, + { + "epoch": 0.77, + "learning_rate": 2.7314005294213573e-06, + "loss": 0.4498, + "step": 4423 + }, + { + "epoch": 0.77, + "learning_rate": 2.7275474388659896e-06, + "loss": 0.4618, + "step": 4424 + }, + { + "epoch": 0.77, + "learning_rate": 2.7236966387291176e-06, + "loss": 0.4587, + "step": 4425 + }, + { + "epoch": 0.77, + "learning_rate": 2.7198481302235325e-06, + "loss": 0.4686, + "step": 4426 + }, + { + "epoch": 0.77, + "learning_rate": 2.7160019145613002e-06, + "loss": 0.4644, + "step": 4427 + }, + { + "epoch": 0.77, + "learning_rate": 2.7121579929537677e-06, + "loss": 0.4603, + "step": 4428 + }, + { + "epoch": 0.77, + "learning_rate": 2.7083163666115564e-06, + "loss": 0.4479, + "step": 4429 + }, + { + "epoch": 0.77, + "learning_rate": 2.7044770367445583e-06, + "loss": 0.4653, + "step": 4430 + }, + { + "epoch": 0.77, + "learning_rate": 2.7006400045619597e-06, + "loss": 0.4453, + "step": 4431 + }, + { + "epoch": 0.77, + "learning_rate": 2.6968052712722037e-06, + "loss": 0.4655, + "step": 4432 + }, + { + "epoch": 0.77, + "learning_rate": 2.692972838083022e-06, + "loss": 0.4513, + "step": 4433 + }, + { + "epoch": 0.77, + "learning_rate": 2.6891427062014184e-06, + "loss": 0.4551, + "step": 4434 + }, + { + "epoch": 0.77, + "learning_rate": 2.6853148768336703e-06, + "loss": 0.4565, + "step": 4435 + }, + { + "epoch": 0.77, + "learning_rate": 2.6814893511853347e-06, + "loss": 0.4567, + "step": 4436 + }, + { + "epoch": 0.77, + "learning_rate": 2.677666130461232e-06, + "loss": 0.4506, + "step": 4437 + }, + { + "epoch": 0.77, + "learning_rate": 2.6738452158654736e-06, + "loss": 0.4658, + "step": 4438 + }, + { + "epoch": 0.77, + "learning_rate": 2.670026608601429e-06, + "loss": 0.4617, + "step": 4439 + }, + { + "epoch": 0.77, + "learning_rate": 2.6662103098717485e-06, + "loss": 0.4667, + "step": 4440 + }, + { + "epoch": 0.77, + "learning_rate": 2.6623963208783553e-06, + "loss": 0.4617, + "step": 4441 + }, + { + "epoch": 0.77, + "learning_rate": 2.658584642822444e-06, + "loss": 0.4574, + "step": 4442 + }, + { + "epoch": 0.77, + "learning_rate": 2.654775276904483e-06, + "loss": 0.4585, + "step": 4443 + }, + { + "epoch": 0.77, + "learning_rate": 2.6509682243242074e-06, + "loss": 0.4679, + "step": 4444 + }, + { + "epoch": 0.77, + "learning_rate": 2.6471634862806272e-06, + "loss": 0.4718, + "step": 4445 + }, + { + "epoch": 0.77, + "learning_rate": 2.6433610639720265e-06, + "loss": 0.4694, + "step": 4446 + }, + { + "epoch": 0.77, + "learning_rate": 2.6395609585959547e-06, + "loss": 0.4645, + "step": 4447 + }, + { + "epoch": 0.77, + "learning_rate": 2.635763171349235e-06, + "loss": 0.4486, + "step": 4448 + }, + { + "epoch": 0.77, + "learning_rate": 2.631967703427959e-06, + "loss": 0.4579, + "step": 4449 + }, + { + "epoch": 0.77, + "learning_rate": 2.628174556027492e-06, + "loss": 0.4639, + "step": 4450 + }, + { + "epoch": 0.77, + "learning_rate": 2.624383730342457e-06, + "loss": 0.4476, + "step": 4451 + }, + { + "epoch": 0.77, + "learning_rate": 2.620595227566758e-06, + "loss": 0.4616, + "step": 4452 + }, + { + "epoch": 0.77, + "learning_rate": 2.616809048893563e-06, + "loss": 0.4561, + "step": 4453 + }, + { + "epoch": 0.77, + "learning_rate": 2.6130251955153063e-06, + "loss": 0.4682, + "step": 4454 + }, + { + "epoch": 0.77, + "learning_rate": 2.6092436686236966e-06, + "loss": 0.4561, + "step": 4455 + }, + { + "epoch": 0.77, + "learning_rate": 2.6054644694096942e-06, + "loss": 0.4541, + "step": 4456 + }, + { + "epoch": 0.77, + "learning_rate": 2.601687599063549e-06, + "loss": 0.471, + "step": 4457 + }, + { + "epoch": 0.77, + "learning_rate": 2.597913058774758e-06, + "loss": 0.4713, + "step": 4458 + }, + { + "epoch": 0.77, + "learning_rate": 2.594140849732092e-06, + "loss": 0.4573, + "step": 4459 + }, + { + "epoch": 0.77, + "learning_rate": 2.590370973123589e-06, + "loss": 0.4741, + "step": 4460 + }, + { + "epoch": 0.77, + "learning_rate": 2.5866034301365505e-06, + "loss": 0.4652, + "step": 4461 + }, + { + "epoch": 0.77, + "learning_rate": 2.5828382219575467e-06, + "loss": 0.4584, + "step": 4462 + }, + { + "epoch": 0.77, + "learning_rate": 2.5790753497723986e-06, + "loss": 0.4553, + "step": 4463 + }, + { + "epoch": 0.77, + "learning_rate": 2.5753148147662145e-06, + "loss": 0.4543, + "step": 4464 + }, + { + "epoch": 0.77, + "learning_rate": 2.5715566181233454e-06, + "loss": 0.4604, + "step": 4465 + }, + { + "epoch": 0.77, + "learning_rate": 2.567800761027417e-06, + "loss": 0.4679, + "step": 4466 + }, + { + "epoch": 0.77, + "learning_rate": 2.564047244661316e-06, + "loss": 0.4518, + "step": 4467 + }, + { + "epoch": 0.77, + "learning_rate": 2.5602960702071913e-06, + "loss": 0.4688, + "step": 4468 + }, + { + "epoch": 0.77, + "learning_rate": 2.556547238846456e-06, + "loss": 0.4524, + "step": 4469 + }, + { + "epoch": 0.77, + "learning_rate": 2.5528007517597807e-06, + "loss": 0.4525, + "step": 4470 + }, + { + "epoch": 0.77, + "learning_rate": 2.549056610127101e-06, + "loss": 0.4516, + "step": 4471 + }, + { + "epoch": 0.77, + "learning_rate": 2.5453148151276153e-06, + "loss": 0.4685, + "step": 4472 + }, + { + "epoch": 0.77, + "learning_rate": 2.5415753679397827e-06, + "loss": 0.457, + "step": 4473 + }, + { + "epoch": 0.78, + "learning_rate": 2.537838269741314e-06, + "loss": 0.4706, + "step": 4474 + }, + { + "epoch": 0.78, + "learning_rate": 2.534103521709195e-06, + "loss": 0.4706, + "step": 4475 + }, + { + "epoch": 0.78, + "learning_rate": 2.530371125019664e-06, + "loss": 0.4684, + "step": 4476 + }, + { + "epoch": 0.78, + "learning_rate": 2.526641080848212e-06, + "loss": 0.4625, + "step": 4477 + }, + { + "epoch": 0.78, + "learning_rate": 2.5229133903696012e-06, + "loss": 0.4558, + "step": 4478 + }, + { + "epoch": 0.78, + "learning_rate": 2.519188054757844e-06, + "loss": 0.4569, + "step": 4479 + }, + { + "epoch": 0.78, + "learning_rate": 2.5154650751862197e-06, + "loss": 0.4576, + "step": 4480 + }, + { + "epoch": 0.78, + "learning_rate": 2.5117444528272496e-06, + "loss": 0.4697, + "step": 4481 + }, + { + "epoch": 0.78, + "learning_rate": 2.5080261888527314e-06, + "loss": 0.4654, + "step": 4482 + }, + { + "epoch": 0.78, + "learning_rate": 2.504310284433713e-06, + "loss": 0.4708, + "step": 4483 + }, + { + "epoch": 0.78, + "learning_rate": 2.500596740740491e-06, + "loss": 0.4518, + "step": 4484 + }, + { + "epoch": 0.78, + "learning_rate": 2.4968855589426288e-06, + "loss": 0.4566, + "step": 4485 + }, + { + "epoch": 0.78, + "learning_rate": 2.4931767402089423e-06, + "loss": 0.4543, + "step": 4486 + }, + { + "epoch": 0.78, + "learning_rate": 2.489470285707507e-06, + "loss": 0.4584, + "step": 4487 + }, + { + "epoch": 0.78, + "learning_rate": 2.4857661966056423e-06, + "loss": 0.452, + "step": 4488 + }, + { + "epoch": 0.78, + "learning_rate": 2.4820644740699327e-06, + "loss": 0.4568, + "step": 4489 + }, + { + "epoch": 0.78, + "learning_rate": 2.478365119266223e-06, + "loss": 0.4539, + "step": 4490 + }, + { + "epoch": 0.78, + "learning_rate": 2.4746681333595957e-06, + "loss": 0.4531, + "step": 4491 + }, + { + "epoch": 0.78, + "learning_rate": 2.4709735175143977e-06, + "loss": 0.4556, + "step": 4492 + }, + { + "epoch": 0.78, + "learning_rate": 2.4672812728942295e-06, + "loss": 0.4429, + "step": 4493 + }, + { + "epoch": 0.78, + "learning_rate": 2.4635914006619454e-06, + "loss": 0.4626, + "step": 4494 + }, + { + "epoch": 0.78, + "learning_rate": 2.4599039019796444e-06, + "loss": 0.4629, + "step": 4495 + }, + { + "epoch": 0.78, + "learning_rate": 2.4562187780086834e-06, + "loss": 0.4611, + "step": 4496 + }, + { + "epoch": 0.78, + "learning_rate": 2.45253602990968e-06, + "loss": 0.4483, + "step": 4497 + }, + { + "epoch": 0.78, + "learning_rate": 2.448855658842487e-06, + "loss": 0.4526, + "step": 4498 + }, + { + "epoch": 0.78, + "learning_rate": 2.4451776659662207e-06, + "loss": 0.4572, + "step": 4499 + }, + { + "epoch": 0.78, + "learning_rate": 2.441502052439243e-06, + "loss": 0.4578, + "step": 4500 + }, + { + "epoch": 0.78, + "learning_rate": 2.4378288194191714e-06, + "loss": 0.4605, + "step": 4501 + }, + { + "epoch": 0.78, + "learning_rate": 2.4341579680628637e-06, + "loss": 0.467, + "step": 4502 + }, + { + "epoch": 0.78, + "learning_rate": 2.430489499526438e-06, + "loss": 0.4525, + "step": 4503 + }, + { + "epoch": 0.78, + "learning_rate": 2.4268234149652582e-06, + "loss": 0.4609, + "step": 4504 + }, + { + "epoch": 0.78, + "learning_rate": 2.423159715533937e-06, + "loss": 0.4632, + "step": 4505 + }, + { + "epoch": 0.78, + "learning_rate": 2.419498402386338e-06, + "loss": 0.4622, + "step": 4506 + }, + { + "epoch": 0.78, + "learning_rate": 2.4158394766755645e-06, + "loss": 0.4534, + "step": 4507 + }, + { + "epoch": 0.78, + "learning_rate": 2.4121829395539854e-06, + "loss": 0.4598, + "step": 4508 + }, + { + "epoch": 0.78, + "learning_rate": 2.4085287921731972e-06, + "loss": 0.4521, + "step": 4509 + }, + { + "epoch": 0.78, + "learning_rate": 2.4048770356840577e-06, + "loss": 0.4724, + "step": 4510 + }, + { + "epoch": 0.78, + "learning_rate": 2.401227671236668e-06, + "loss": 0.4629, + "step": 4511 + }, + { + "epoch": 0.78, + "learning_rate": 2.3975806999803717e-06, + "loss": 0.464, + "step": 4512 + }, + { + "epoch": 0.78, + "learning_rate": 2.3939361230637692e-06, + "loss": 0.4545, + "step": 4513 + }, + { + "epoch": 0.78, + "learning_rate": 2.3902939416346917e-06, + "loss": 0.4662, + "step": 4514 + }, + { + "epoch": 0.78, + "learning_rate": 2.386654156840226e-06, + "loss": 0.4666, + "step": 4515 + }, + { + "epoch": 0.78, + "learning_rate": 2.3830167698267038e-06, + "loss": 0.47, + "step": 4516 + }, + { + "epoch": 0.78, + "learning_rate": 2.379381781739699e-06, + "loss": 0.4547, + "step": 4517 + }, + { + "epoch": 0.78, + "learning_rate": 2.375749193724032e-06, + "loss": 0.467, + "step": 4518 + }, + { + "epoch": 0.78, + "learning_rate": 2.3721190069237655e-06, + "loss": 0.4426, + "step": 4519 + }, + { + "epoch": 0.78, + "learning_rate": 2.3684912224822086e-06, + "loss": 0.4621, + "step": 4520 + }, + { + "epoch": 0.78, + "learning_rate": 2.364865841541908e-06, + "loss": 0.4545, + "step": 4521 + }, + { + "epoch": 0.78, + "learning_rate": 2.3612428652446586e-06, + "loss": 0.4635, + "step": 4522 + }, + { + "epoch": 0.78, + "learning_rate": 2.3576222947314962e-06, + "loss": 0.4416, + "step": 4523 + }, + { + "epoch": 0.78, + "learning_rate": 2.354004131142702e-06, + "loss": 0.4648, + "step": 4524 + }, + { + "epoch": 0.78, + "learning_rate": 2.3503883756177935e-06, + "loss": 0.4496, + "step": 4525 + }, + { + "epoch": 0.78, + "learning_rate": 2.346775029295535e-06, + "loss": 0.4434, + "step": 4526 + }, + { + "epoch": 0.78, + "learning_rate": 2.343164093313931e-06, + "loss": 0.4577, + "step": 4527 + }, + { + "epoch": 0.78, + "learning_rate": 2.339555568810221e-06, + "loss": 0.4519, + "step": 4528 + }, + { + "epoch": 0.78, + "learning_rate": 2.3359494569208927e-06, + "loss": 0.4616, + "step": 4529 + }, + { + "epoch": 0.78, + "learning_rate": 2.33234575878167e-06, + "loss": 0.4537, + "step": 4530 + }, + { + "epoch": 0.78, + "learning_rate": 2.328744475527519e-06, + "loss": 0.4601, + "step": 4531 + }, + { + "epoch": 0.79, + "learning_rate": 2.325145608292646e-06, + "loss": 0.4775, + "step": 4532 + }, + { + "epoch": 0.79, + "learning_rate": 2.3215491582104855e-06, + "loss": 0.4496, + "step": 4533 + }, + { + "epoch": 0.79, + "learning_rate": 2.31795512641373e-06, + "loss": 0.464, + "step": 4534 + }, + { + "epoch": 0.79, + "learning_rate": 2.3143635140342936e-06, + "loss": 0.4599, + "step": 4535 + }, + { + "epoch": 0.79, + "learning_rate": 2.310774322203335e-06, + "loss": 0.4569, + "step": 4536 + }, + { + "epoch": 0.79, + "learning_rate": 2.307187552051252e-06, + "loss": 0.4522, + "step": 4537 + }, + { + "epoch": 0.79, + "learning_rate": 2.3036032047076774e-06, + "loss": 0.4585, + "step": 4538 + }, + { + "epoch": 0.79, + "learning_rate": 2.300021281301483e-06, + "loss": 0.4488, + "step": 4539 + }, + { + "epoch": 0.79, + "learning_rate": 2.29644178296077e-06, + "loss": 0.4625, + "step": 4540 + }, + { + "epoch": 0.79, + "learning_rate": 2.292864710812891e-06, + "loss": 0.4622, + "step": 4541 + }, + { + "epoch": 0.79, + "learning_rate": 2.2892900659844154e-06, + "loss": 0.4641, + "step": 4542 + }, + { + "epoch": 0.79, + "learning_rate": 2.2857178496011633e-06, + "loss": 0.4577, + "step": 4543 + }, + { + "epoch": 0.79, + "learning_rate": 2.282148062788182e-06, + "loss": 0.4583, + "step": 4544 + }, + { + "epoch": 0.79, + "learning_rate": 2.278580706669757e-06, + "loss": 0.4488, + "step": 4545 + }, + { + "epoch": 0.79, + "learning_rate": 2.27501578236941e-06, + "loss": 0.475, + "step": 4546 + }, + { + "epoch": 0.79, + "learning_rate": 2.2714532910098885e-06, + "loss": 0.4611, + "step": 4547 + }, + { + "epoch": 0.79, + "learning_rate": 2.267893233713182e-06, + "loss": 0.4476, + "step": 4548 + }, + { + "epoch": 0.79, + "learning_rate": 2.264335611600511e-06, + "loss": 0.4498, + "step": 4549 + }, + { + "epoch": 0.79, + "learning_rate": 2.2607804257923316e-06, + "loss": 0.4632, + "step": 4550 + }, + { + "epoch": 0.79, + "learning_rate": 2.2572276774083212e-06, + "loss": 0.459, + "step": 4551 + }, + { + "epoch": 0.79, + "learning_rate": 2.253677367567406e-06, + "loss": 0.4535, + "step": 4552 + }, + { + "epoch": 0.79, + "learning_rate": 2.2501294973877374e-06, + "loss": 0.4536, + "step": 4553 + }, + { + "epoch": 0.79, + "learning_rate": 2.2465840679866923e-06, + "loss": 0.4461, + "step": 4554 + }, + { + "epoch": 0.79, + "learning_rate": 2.2430410804808842e-06, + "loss": 0.4591, + "step": 4555 + }, + { + "epoch": 0.79, + "learning_rate": 2.23950053598616e-06, + "loss": 0.4658, + "step": 4556 + }, + { + "epoch": 0.79, + "learning_rate": 2.235962435617596e-06, + "loss": 0.4751, + "step": 4557 + }, + { + "epoch": 0.79, + "learning_rate": 2.2324267804894895e-06, + "loss": 0.4686, + "step": 4558 + }, + { + "epoch": 0.79, + "learning_rate": 2.2288935717153825e-06, + "loss": 0.4607, + "step": 4559 + }, + { + "epoch": 0.79, + "learning_rate": 2.2253628104080415e-06, + "loss": 0.4609, + "step": 4560 + }, + { + "epoch": 0.79, + "learning_rate": 2.2218344976794527e-06, + "loss": 0.4695, + "step": 4561 + }, + { + "epoch": 0.79, + "learning_rate": 2.218308634640842e-06, + "loss": 0.4632, + "step": 4562 + }, + { + "epoch": 0.79, + "learning_rate": 2.214785222402661e-06, + "loss": 0.4513, + "step": 4563 + }, + { + "epoch": 0.79, + "learning_rate": 2.2112642620745906e-06, + "loss": 0.4658, + "step": 4564 + }, + { + "epoch": 0.79, + "learning_rate": 2.2077457547655325e-06, + "loss": 0.469, + "step": 4565 + }, + { + "epoch": 0.79, + "learning_rate": 2.204229701583621e-06, + "loss": 0.4621, + "step": 4566 + }, + { + "epoch": 0.79, + "learning_rate": 2.2007161036362255e-06, + "loss": 0.4633, + "step": 4567 + }, + { + "epoch": 0.79, + "learning_rate": 2.1972049620299273e-06, + "loss": 0.4548, + "step": 4568 + }, + { + "epoch": 0.79, + "learning_rate": 2.1936962778705417e-06, + "loss": 0.4551, + "step": 4569 + }, + { + "epoch": 0.79, + "learning_rate": 2.1901900522631114e-06, + "loss": 0.4452, + "step": 4570 + }, + { + "epoch": 0.79, + "learning_rate": 2.186686286311903e-06, + "loss": 0.4726, + "step": 4571 + }, + { + "epoch": 0.79, + "learning_rate": 2.183184981120404e-06, + "loss": 0.4621, + "step": 4572 + }, + { + "epoch": 0.79, + "learning_rate": 2.1796861377913304e-06, + "loss": 0.4636, + "step": 4573 + }, + { + "epoch": 0.79, + "learning_rate": 2.176189757426633e-06, + "loss": 0.4523, + "step": 4574 + }, + { + "epoch": 0.79, + "learning_rate": 2.172695841127468e-06, + "loss": 0.4625, + "step": 4575 + }, + { + "epoch": 0.79, + "learning_rate": 2.1692043899942304e-06, + "loss": 0.4616, + "step": 4576 + }, + { + "epoch": 0.79, + "learning_rate": 2.165715405126525e-06, + "loss": 0.4617, + "step": 4577 + }, + { + "epoch": 0.79, + "learning_rate": 2.1622288876232e-06, + "loss": 0.4576, + "step": 4578 + }, + { + "epoch": 0.79, + "learning_rate": 2.158744838582305e-06, + "loss": 0.4586, + "step": 4579 + }, + { + "epoch": 0.79, + "learning_rate": 2.155263259101127e-06, + "loss": 0.455, + "step": 4580 + }, + { + "epoch": 0.79, + "learning_rate": 2.1517841502761672e-06, + "loss": 0.4683, + "step": 4581 + }, + { + "epoch": 0.79, + "learning_rate": 2.148307513203154e-06, + "loss": 0.4508, + "step": 4582 + }, + { + "epoch": 0.79, + "learning_rate": 2.144833348977037e-06, + "loss": 0.4752, + "step": 4583 + }, + { + "epoch": 0.79, + "learning_rate": 2.141361658691975e-06, + "loss": 0.451, + "step": 4584 + }, + { + "epoch": 0.79, + "learning_rate": 2.1378924434413708e-06, + "loss": 0.4566, + "step": 4585 + }, + { + "epoch": 0.79, + "learning_rate": 2.1344257043178253e-06, + "loss": 0.4511, + "step": 4586 + }, + { + "epoch": 0.79, + "learning_rate": 2.130961442413171e-06, + "loss": 0.4548, + "step": 4587 + }, + { + "epoch": 0.79, + "learning_rate": 2.127499658818458e-06, + "loss": 0.4382, + "step": 4588 + }, + { + "epoch": 0.79, + "learning_rate": 2.1240403546239575e-06, + "loss": 0.4616, + "step": 4589 + }, + { + "epoch": 0.8, + "learning_rate": 2.1205835309191593e-06, + "loss": 0.4636, + "step": 4590 + }, + { + "epoch": 0.8, + "learning_rate": 2.117129188792765e-06, + "loss": 0.4684, + "step": 4591 + }, + { + "epoch": 0.8, + "learning_rate": 2.113677329332704e-06, + "loss": 0.4547, + "step": 4592 + }, + { + "epoch": 0.8, + "learning_rate": 2.1102279536261193e-06, + "loss": 0.4541, + "step": 4593 + }, + { + "epoch": 0.8, + "learning_rate": 2.1067810627593744e-06, + "loss": 0.466, + "step": 4594 + }, + { + "epoch": 0.8, + "learning_rate": 2.1033366578180468e-06, + "loss": 0.4506, + "step": 4595 + }, + { + "epoch": 0.8, + "learning_rate": 2.099894739886933e-06, + "loss": 0.4586, + "step": 4596 + }, + { + "epoch": 0.8, + "learning_rate": 2.0964553100500495e-06, + "loss": 0.4676, + "step": 4597 + }, + { + "epoch": 0.8, + "learning_rate": 2.093018369390619e-06, + "loss": 0.4577, + "step": 4598 + }, + { + "epoch": 0.8, + "learning_rate": 2.0895839189910906e-06, + "loss": 0.4736, + "step": 4599 + }, + { + "epoch": 0.8, + "learning_rate": 2.0861519599331236e-06, + "loss": 0.4652, + "step": 4600 + }, + { + "epoch": 0.8, + "learning_rate": 2.0827224932975963e-06, + "loss": 0.4546, + "step": 4601 + }, + { + "epoch": 0.8, + "learning_rate": 2.0792955201646005e-06, + "loss": 0.4617, + "step": 4602 + }, + { + "epoch": 0.8, + "learning_rate": 2.075871041613441e-06, + "loss": 0.4547, + "step": 4603 + }, + { + "epoch": 0.8, + "learning_rate": 2.07244905872264e-06, + "loss": 0.452, + "step": 4604 + }, + { + "epoch": 0.8, + "learning_rate": 2.0690295725699292e-06, + "loss": 0.4653, + "step": 4605 + }, + { + "epoch": 0.8, + "learning_rate": 2.0656125842322574e-06, + "loss": 0.4589, + "step": 4606 + }, + { + "epoch": 0.8, + "learning_rate": 2.0621980947857865e-06, + "loss": 0.4498, + "step": 4607 + }, + { + "epoch": 0.8, + "learning_rate": 2.0587861053058924e-06, + "loss": 0.4534, + "step": 4608 + }, + { + "epoch": 0.8, + "learning_rate": 2.055376616867164e-06, + "loss": 0.4682, + "step": 4609 + }, + { + "epoch": 0.8, + "learning_rate": 2.0519696305433913e-06, + "loss": 0.4458, + "step": 4610 + }, + { + "epoch": 0.8, + "learning_rate": 2.0485651474075987e-06, + "loss": 0.4538, + "step": 4611 + }, + { + "epoch": 0.8, + "learning_rate": 2.0451631685319995e-06, + "loss": 0.4492, + "step": 4612 + }, + { + "epoch": 0.8, + "learning_rate": 2.0417636949880316e-06, + "loss": 0.468, + "step": 4613 + }, + { + "epoch": 0.8, + "learning_rate": 2.038366727846339e-06, + "loss": 0.4535, + "step": 4614 + }, + { + "epoch": 0.8, + "learning_rate": 2.0349722681767794e-06, + "loss": 0.4561, + "step": 4615 + }, + { + "epoch": 0.8, + "learning_rate": 2.0315803170484204e-06, + "loss": 0.4518, + "step": 4616 + }, + { + "epoch": 0.8, + "learning_rate": 2.028190875529532e-06, + "loss": 0.4601, + "step": 4617 + }, + { + "epoch": 0.8, + "learning_rate": 2.0248039446876078e-06, + "loss": 0.4445, + "step": 4618 + }, + { + "epoch": 0.8, + "learning_rate": 2.0214195255893365e-06, + "loss": 0.466, + "step": 4619 + }, + { + "epoch": 0.8, + "learning_rate": 2.018037619300628e-06, + "loss": 0.4525, + "step": 4620 + }, + { + "epoch": 0.8, + "learning_rate": 2.0146582268865854e-06, + "loss": 0.4705, + "step": 4621 + }, + { + "epoch": 0.8, + "learning_rate": 2.011281349411539e-06, + "loss": 0.4509, + "step": 4622 + }, + { + "epoch": 0.8, + "learning_rate": 2.0079069879390156e-06, + "loss": 0.4678, + "step": 4623 + }, + { + "epoch": 0.8, + "learning_rate": 2.0045351435317484e-06, + "loss": 0.4413, + "step": 4624 + }, + { + "epoch": 0.8, + "learning_rate": 2.0011658172516823e-06, + "loss": 0.4621, + "step": 4625 + }, + { + "epoch": 0.8, + "learning_rate": 1.9977990101599687e-06, + "loss": 0.4394, + "step": 4626 + }, + { + "epoch": 0.8, + "learning_rate": 1.994434723316967e-06, + "loss": 0.4719, + "step": 4627 + }, + { + "epoch": 0.8, + "learning_rate": 1.991072957782233e-06, + "loss": 0.4662, + "step": 4628 + }, + { + "epoch": 0.8, + "learning_rate": 1.987713714614543e-06, + "loss": 0.4462, + "step": 4629 + }, + { + "epoch": 0.8, + "learning_rate": 1.9843569948718744e-06, + "loss": 0.4586, + "step": 4630 + }, + { + "epoch": 0.8, + "learning_rate": 1.981002799611399e-06, + "loss": 0.4606, + "step": 4631 + }, + { + "epoch": 0.8, + "learning_rate": 1.9776511298895064e-06, + "loss": 0.4556, + "step": 4632 + }, + { + "epoch": 0.8, + "learning_rate": 1.9743019867617864e-06, + "loss": 0.4375, + "step": 4633 + }, + { + "epoch": 0.8, + "learning_rate": 1.970955371283034e-06, + "loss": 0.4534, + "step": 4634 + }, + { + "epoch": 0.8, + "learning_rate": 1.9676112845072447e-06, + "loss": 0.4627, + "step": 4635 + }, + { + "epoch": 0.8, + "learning_rate": 1.9642697274876178e-06, + "loss": 0.458, + "step": 4636 + }, + { + "epoch": 0.8, + "learning_rate": 1.9609307012765664e-06, + "loss": 0.4652, + "step": 4637 + }, + { + "epoch": 0.8, + "learning_rate": 1.9575942069256914e-06, + "loss": 0.4623, + "step": 4638 + }, + { + "epoch": 0.8, + "learning_rate": 1.954260245485804e-06, + "loss": 0.4648, + "step": 4639 + }, + { + "epoch": 0.8, + "learning_rate": 1.9509288180069185e-06, + "loss": 0.4569, + "step": 4640 + }, + { + "epoch": 0.8, + "learning_rate": 1.9475999255382516e-06, + "loss": 0.4541, + "step": 4641 + }, + { + "epoch": 0.8, + "learning_rate": 1.944273569128213e-06, + "loss": 0.4501, + "step": 4642 + }, + { + "epoch": 0.8, + "learning_rate": 1.940949749824422e-06, + "loss": 0.4718, + "step": 4643 + }, + { + "epoch": 0.8, + "learning_rate": 1.9376284686737036e-06, + "loss": 0.4484, + "step": 4644 + }, + { + "epoch": 0.8, + "learning_rate": 1.93430972672207e-06, + "loss": 0.4633, + "step": 4645 + }, + { + "epoch": 0.8, + "learning_rate": 1.9309935250147417e-06, + "loss": 0.4523, + "step": 4646 + }, + { + "epoch": 0.81, + "learning_rate": 1.9276798645961392e-06, + "loss": 0.472, + "step": 4647 + }, + { + "epoch": 0.81, + "learning_rate": 1.924368746509884e-06, + "loss": 0.4423, + "step": 4648 + }, + { + "epoch": 0.81, + "learning_rate": 1.9210601717987887e-06, + "loss": 0.4608, + "step": 4649 + }, + { + "epoch": 0.81, + "learning_rate": 1.9177541415048728e-06, + "loss": 0.4644, + "step": 4650 + }, + { + "epoch": 0.81, + "learning_rate": 1.914450656669353e-06, + "loss": 0.4609, + "step": 4651 + }, + { + "epoch": 0.81, + "learning_rate": 1.9111497183326433e-06, + "loss": 0.4549, + "step": 4652 + }, + { + "epoch": 0.81, + "learning_rate": 1.907851327534358e-06, + "loss": 0.4677, + "step": 4653 + }, + { + "epoch": 0.81, + "learning_rate": 1.9045554853132986e-06, + "loss": 0.4447, + "step": 4654 + }, + { + "epoch": 0.81, + "learning_rate": 1.9012621927074849e-06, + "loss": 0.47, + "step": 4655 + }, + { + "epoch": 0.81, + "learning_rate": 1.8979714507541103e-06, + "loss": 0.4524, + "step": 4656 + }, + { + "epoch": 0.81, + "learning_rate": 1.8946832604895805e-06, + "loss": 0.4636, + "step": 4657 + }, + { + "epoch": 0.81, + "learning_rate": 1.8913976229494924e-06, + "loss": 0.4436, + "step": 4658 + }, + { + "epoch": 0.81, + "learning_rate": 1.8881145391686384e-06, + "loss": 0.46, + "step": 4659 + }, + { + "epoch": 0.81, + "learning_rate": 1.8848340101810114e-06, + "loss": 0.4482, + "step": 4660 + }, + { + "epoch": 0.81, + "learning_rate": 1.881556037019787e-06, + "loss": 0.462, + "step": 4661 + }, + { + "epoch": 0.81, + "learning_rate": 1.8782806207173542e-06, + "loss": 0.4598, + "step": 4662 + }, + { + "epoch": 0.81, + "learning_rate": 1.875007762305282e-06, + "loss": 0.4697, + "step": 4663 + }, + { + "epoch": 0.81, + "learning_rate": 1.8717374628143391e-06, + "loss": 0.4405, + "step": 4664 + }, + { + "epoch": 0.81, + "learning_rate": 1.8684697232744886e-06, + "loss": 0.4616, + "step": 4665 + }, + { + "epoch": 0.81, + "learning_rate": 1.865204544714888e-06, + "loss": 0.4561, + "step": 4666 + }, + { + "epoch": 0.81, + "learning_rate": 1.8619419281638883e-06, + "loss": 0.4652, + "step": 4667 + }, + { + "epoch": 0.81, + "learning_rate": 1.8586818746490288e-06, + "loss": 0.4573, + "step": 4668 + }, + { + "epoch": 0.81, + "learning_rate": 1.8554243851970466e-06, + "loss": 0.4537, + "step": 4669 + }, + { + "epoch": 0.81, + "learning_rate": 1.8521694608338692e-06, + "loss": 0.4525, + "step": 4670 + }, + { + "epoch": 0.81, + "learning_rate": 1.8489171025846198e-06, + "loss": 0.4616, + "step": 4671 + }, + { + "epoch": 0.81, + "learning_rate": 1.845667311473608e-06, + "loss": 0.4585, + "step": 4672 + }, + { + "epoch": 0.81, + "learning_rate": 1.842420088524337e-06, + "loss": 0.4477, + "step": 4673 + }, + { + "epoch": 0.81, + "learning_rate": 1.839175434759507e-06, + "loss": 0.4571, + "step": 4674 + }, + { + "epoch": 0.81, + "learning_rate": 1.8359333512009959e-06, + "loss": 0.4569, + "step": 4675 + }, + { + "epoch": 0.81, + "learning_rate": 1.8326938388698846e-06, + "loss": 0.4577, + "step": 4676 + }, + { + "epoch": 0.81, + "learning_rate": 1.829456898786437e-06, + "loss": 0.4595, + "step": 4677 + }, + { + "epoch": 0.81, + "learning_rate": 1.8262225319701122e-06, + "loss": 0.46, + "step": 4678 + }, + { + "epoch": 0.81, + "learning_rate": 1.8229907394395562e-06, + "loss": 0.4578, + "step": 4679 + }, + { + "epoch": 0.81, + "learning_rate": 1.8197615222125975e-06, + "loss": 0.4488, + "step": 4680 + }, + { + "epoch": 0.81, + "learning_rate": 1.81653488130627e-06, + "loss": 0.4586, + "step": 4681 + }, + { + "epoch": 0.81, + "learning_rate": 1.8133108177367787e-06, + "loss": 0.4556, + "step": 4682 + }, + { + "epoch": 0.81, + "learning_rate": 1.810089332519528e-06, + "loss": 0.4613, + "step": 4683 + }, + { + "epoch": 0.81, + "learning_rate": 1.806870426669105e-06, + "loss": 0.4579, + "step": 4684 + }, + { + "epoch": 0.81, + "learning_rate": 1.803654101199288e-06, + "loss": 0.4692, + "step": 4685 + }, + { + "epoch": 0.81, + "learning_rate": 1.8004403571230422e-06, + "loss": 0.4557, + "step": 4686 + }, + { + "epoch": 0.81, + "learning_rate": 1.79722919545251e-06, + "loss": 0.4528, + "step": 4687 + }, + { + "epoch": 0.81, + "learning_rate": 1.7940206171990416e-06, + "loss": 0.4531, + "step": 4688 + }, + { + "epoch": 0.81, + "learning_rate": 1.7908146233731515e-06, + "loss": 0.4565, + "step": 4689 + }, + { + "epoch": 0.81, + "learning_rate": 1.7876112149845526e-06, + "loss": 0.4536, + "step": 4690 + }, + { + "epoch": 0.81, + "learning_rate": 1.7844103930421409e-06, + "loss": 0.474, + "step": 4691 + }, + { + "epoch": 0.81, + "learning_rate": 1.7812121585539964e-06, + "loss": 0.4487, + "step": 4692 + }, + { + "epoch": 0.81, + "learning_rate": 1.7780165125273885e-06, + "loss": 0.467, + "step": 4693 + }, + { + "epoch": 0.81, + "learning_rate": 1.7748234559687628e-06, + "loss": 0.4548, + "step": 4694 + }, + { + "epoch": 0.81, + "learning_rate": 1.771632989883758e-06, + "loss": 0.466, + "step": 4695 + }, + { + "epoch": 0.81, + "learning_rate": 1.7684451152771932e-06, + "loss": 0.4491, + "step": 4696 + }, + { + "epoch": 0.81, + "learning_rate": 1.7652598331530734e-06, + "loss": 0.4749, + "step": 4697 + }, + { + "epoch": 0.81, + "learning_rate": 1.76207714451458e-06, + "loss": 0.4563, + "step": 4698 + }, + { + "epoch": 0.81, + "learning_rate": 1.758897050364089e-06, + "loss": 0.4695, + "step": 4699 + }, + { + "epoch": 0.81, + "learning_rate": 1.7557195517031532e-06, + "loss": 0.4566, + "step": 4700 + }, + { + "epoch": 0.81, + "learning_rate": 1.7525446495325038e-06, + "loss": 0.4568, + "step": 4701 + }, + { + "epoch": 0.81, + "learning_rate": 1.7493723448520616e-06, + "loss": 0.4316, + "step": 4702 + }, + { + "epoch": 0.81, + "learning_rate": 1.7462026386609253e-06, + "loss": 0.4566, + "step": 4703 + }, + { + "epoch": 0.81, + "learning_rate": 1.7430355319573799e-06, + "loss": 0.4596, + "step": 4704 + }, + { + "epoch": 0.82, + "learning_rate": 1.7398710257388784e-06, + "loss": 0.4623, + "step": 4705 + }, + { + "epoch": 0.82, + "learning_rate": 1.7367091210020748e-06, + "loss": 0.4465, + "step": 4706 + }, + { + "epoch": 0.82, + "learning_rate": 1.7335498187427912e-06, + "loss": 0.4636, + "step": 4707 + }, + { + "epoch": 0.82, + "learning_rate": 1.7303931199560286e-06, + "loss": 0.4593, + "step": 4708 + }, + { + "epoch": 0.82, + "learning_rate": 1.7272390256359728e-06, + "loss": 0.4531, + "step": 4709 + }, + { + "epoch": 0.82, + "learning_rate": 1.7240875367759902e-06, + "loss": 0.4527, + "step": 4710 + }, + { + "epoch": 0.82, + "learning_rate": 1.7209386543686247e-06, + "loss": 0.4652, + "step": 4711 + }, + { + "epoch": 0.82, + "learning_rate": 1.7177923794055974e-06, + "loss": 0.4445, + "step": 4712 + }, + { + "epoch": 0.82, + "learning_rate": 1.7146487128778077e-06, + "loss": 0.4478, + "step": 4713 + }, + { + "epoch": 0.82, + "learning_rate": 1.711507655775344e-06, + "loss": 0.4577, + "step": 4714 + }, + { + "epoch": 0.82, + "learning_rate": 1.708369209087457e-06, + "loss": 0.469, + "step": 4715 + }, + { + "epoch": 0.82, + "learning_rate": 1.7052333738025873e-06, + "loss": 0.4628, + "step": 4716 + }, + { + "epoch": 0.82, + "learning_rate": 1.7021001509083457e-06, + "loss": 0.4574, + "step": 4717 + }, + { + "epoch": 0.82, + "learning_rate": 1.6989695413915286e-06, + "loss": 0.4702, + "step": 4718 + }, + { + "epoch": 0.82, + "learning_rate": 1.6958415462380983e-06, + "loss": 0.4528, + "step": 4719 + }, + { + "epoch": 0.82, + "learning_rate": 1.6927161664331992e-06, + "loss": 0.4627, + "step": 4720 + }, + { + "epoch": 0.82, + "learning_rate": 1.6895934029611593e-06, + "loss": 0.4677, + "step": 4721 + }, + { + "epoch": 0.82, + "learning_rate": 1.6864732568054687e-06, + "loss": 0.4795, + "step": 4722 + }, + { + "epoch": 0.82, + "learning_rate": 1.6833557289488046e-06, + "loss": 0.4558, + "step": 4723 + }, + { + "epoch": 0.82, + "learning_rate": 1.6802408203730092e-06, + "loss": 0.4652, + "step": 4724 + }, + { + "epoch": 0.82, + "learning_rate": 1.6771285320591112e-06, + "loss": 0.4564, + "step": 4725 + }, + { + "epoch": 0.82, + "learning_rate": 1.674018864987309e-06, + "loss": 0.4535, + "step": 4726 + }, + { + "epoch": 0.82, + "learning_rate": 1.6709118201369702e-06, + "loss": 0.4631, + "step": 4727 + }, + { + "epoch": 0.82, + "learning_rate": 1.6678073984866438e-06, + "loss": 0.4622, + "step": 4728 + }, + { + "epoch": 0.82, + "learning_rate": 1.6647056010140495e-06, + "loss": 0.4565, + "step": 4729 + }, + { + "epoch": 0.82, + "learning_rate": 1.6616064286960852e-06, + "loss": 0.4612, + "step": 4730 + }, + { + "epoch": 0.82, + "learning_rate": 1.6585098825088086e-06, + "loss": 0.4598, + "step": 4731 + }, + { + "epoch": 0.82, + "learning_rate": 1.6554159634274692e-06, + "loss": 0.4607, + "step": 4732 + }, + { + "epoch": 0.82, + "learning_rate": 1.6523246724264775e-06, + "loss": 0.4438, + "step": 4733 + }, + { + "epoch": 0.82, + "learning_rate": 1.6492360104794158e-06, + "loss": 0.4751, + "step": 4734 + }, + { + "epoch": 0.82, + "learning_rate": 1.6461499785590407e-06, + "loss": 0.458, + "step": 4735 + }, + { + "epoch": 0.82, + "learning_rate": 1.6430665776372834e-06, + "loss": 0.4495, + "step": 4736 + }, + { + "epoch": 0.82, + "learning_rate": 1.639985808685245e-06, + "loss": 0.4514, + "step": 4737 + }, + { + "epoch": 0.82, + "learning_rate": 1.6369076726731913e-06, + "loss": 0.4569, + "step": 4738 + }, + { + "epoch": 0.82, + "learning_rate": 1.6338321705705651e-06, + "loss": 0.465, + "step": 4739 + }, + { + "epoch": 0.82, + "learning_rate": 1.6307593033459856e-06, + "loss": 0.4623, + "step": 4740 + }, + { + "epoch": 0.82, + "learning_rate": 1.6276890719672277e-06, + "loss": 0.4724, + "step": 4741 + }, + { + "epoch": 0.82, + "learning_rate": 1.6246214774012458e-06, + "loss": 0.4702, + "step": 4742 + }, + { + "epoch": 0.82, + "learning_rate": 1.6215565206141638e-06, + "loss": 0.4578, + "step": 4743 + }, + { + "epoch": 0.82, + "learning_rate": 1.6184942025712725e-06, + "loss": 0.4601, + "step": 4744 + }, + { + "epoch": 0.82, + "learning_rate": 1.6154345242370283e-06, + "loss": 0.4708, + "step": 4745 + }, + { + "epoch": 0.82, + "learning_rate": 1.6123774865750607e-06, + "loss": 0.4546, + "step": 4746 + }, + { + "epoch": 0.82, + "learning_rate": 1.6093230905481727e-06, + "loss": 0.448, + "step": 4747 + }, + { + "epoch": 0.82, + "learning_rate": 1.6062713371183225e-06, + "loss": 0.4579, + "step": 4748 + }, + { + "epoch": 0.82, + "learning_rate": 1.6032222272466458e-06, + "loss": 0.455, + "step": 4749 + }, + { + "epoch": 0.82, + "learning_rate": 1.600175761893442e-06, + "loss": 0.4588, + "step": 4750 + }, + { + "epoch": 0.82, + "learning_rate": 1.597131942018182e-06, + "loss": 0.4521, + "step": 4751 + }, + { + "epoch": 0.82, + "learning_rate": 1.5940907685794926e-06, + "loss": 0.4582, + "step": 4752 + }, + { + "epoch": 0.82, + "learning_rate": 1.5910522425351805e-06, + "loss": 0.4698, + "step": 4753 + }, + { + "epoch": 0.82, + "learning_rate": 1.5880163648422099e-06, + "loss": 0.459, + "step": 4754 + }, + { + "epoch": 0.82, + "learning_rate": 1.5849831364567137e-06, + "loss": 0.4697, + "step": 4755 + }, + { + "epoch": 0.82, + "learning_rate": 1.5819525583339945e-06, + "loss": 0.4598, + "step": 4756 + }, + { + "epoch": 0.82, + "learning_rate": 1.5789246314285067e-06, + "loss": 0.4548, + "step": 4757 + }, + { + "epoch": 0.82, + "learning_rate": 1.5758993566938907e-06, + "loss": 0.4695, + "step": 4758 + }, + { + "epoch": 0.82, + "learning_rate": 1.572876735082931e-06, + "loss": 0.4452, + "step": 4759 + }, + { + "epoch": 0.82, + "learning_rate": 1.5698567675475884e-06, + "loss": 0.4694, + "step": 4760 + }, + { + "epoch": 0.82, + "learning_rate": 1.5668394550389854e-06, + "loss": 0.4591, + "step": 4761 + }, + { + "epoch": 0.82, + "learning_rate": 1.5638247985074084e-06, + "loss": 0.4627, + "step": 4762 + }, + { + "epoch": 0.83, + "learning_rate": 1.560812798902307e-06, + "loss": 0.4537, + "step": 4763 + }, + { + "epoch": 0.83, + "learning_rate": 1.5578034571722879e-06, + "loss": 0.4618, + "step": 4764 + }, + { + "epoch": 0.83, + "learning_rate": 1.554796774265137e-06, + "loss": 0.4487, + "step": 4765 + }, + { + "epoch": 0.83, + "learning_rate": 1.5517927511277832e-06, + "loss": 0.473, + "step": 4766 + }, + { + "epoch": 0.83, + "learning_rate": 1.54879138870633e-06, + "loss": 0.449, + "step": 4767 + }, + { + "epoch": 0.83, + "learning_rate": 1.5457926879460404e-06, + "loss": 0.4628, + "step": 4768 + }, + { + "epoch": 0.83, + "learning_rate": 1.5427966497913383e-06, + "loss": 0.4592, + "step": 4769 + }, + { + "epoch": 0.83, + "learning_rate": 1.5398032751858117e-06, + "loss": 0.4573, + "step": 4770 + }, + { + "epoch": 0.83, + "learning_rate": 1.5368125650722021e-06, + "loss": 0.4605, + "step": 4771 + }, + { + "epoch": 0.83, + "learning_rate": 1.5338245203924196e-06, + "loss": 0.4631, + "step": 4772 + }, + { + "epoch": 0.83, + "learning_rate": 1.5308391420875312e-06, + "loss": 0.4626, + "step": 4773 + }, + { + "epoch": 0.83, + "learning_rate": 1.5278564310977673e-06, + "loss": 0.4625, + "step": 4774 + }, + { + "epoch": 0.83, + "learning_rate": 1.5248763883625162e-06, + "loss": 0.4395, + "step": 4775 + }, + { + "epoch": 0.83, + "learning_rate": 1.5218990148203228e-06, + "loss": 0.4601, + "step": 4776 + }, + { + "epoch": 0.83, + "learning_rate": 1.5189243114089003e-06, + "loss": 0.4473, + "step": 4777 + }, + { + "epoch": 0.83, + "learning_rate": 1.5159522790651072e-06, + "loss": 0.4738, + "step": 4778 + }, + { + "epoch": 0.83, + "learning_rate": 1.5129829187249734e-06, + "loss": 0.4613, + "step": 4779 + }, + { + "epoch": 0.83, + "learning_rate": 1.51001623132368e-06, + "loss": 0.4755, + "step": 4780 + }, + { + "epoch": 0.83, + "learning_rate": 1.5070522177955716e-06, + "loss": 0.4657, + "step": 4781 + }, + { + "epoch": 0.83, + "learning_rate": 1.5040908790741448e-06, + "loss": 0.4627, + "step": 4782 + }, + { + "epoch": 0.83, + "learning_rate": 1.5011322160920594e-06, + "loss": 0.4536, + "step": 4783 + }, + { + "epoch": 0.83, + "learning_rate": 1.4981762297811308e-06, + "loss": 0.4732, + "step": 4784 + }, + { + "epoch": 0.83, + "learning_rate": 1.4952229210723257e-06, + "loss": 0.4543, + "step": 4785 + }, + { + "epoch": 0.83, + "learning_rate": 1.4922722908957743e-06, + "loss": 0.4656, + "step": 4786 + }, + { + "epoch": 0.83, + "learning_rate": 1.4893243401807622e-06, + "loss": 0.4625, + "step": 4787 + }, + { + "epoch": 0.83, + "learning_rate": 1.4863790698557301e-06, + "loss": 0.4676, + "step": 4788 + }, + { + "epoch": 0.83, + "learning_rate": 1.483436480848276e-06, + "loss": 0.4544, + "step": 4789 + }, + { + "epoch": 0.83, + "learning_rate": 1.4804965740851451e-06, + "loss": 0.4508, + "step": 4790 + }, + { + "epoch": 0.83, + "learning_rate": 1.4775593504922547e-06, + "loss": 0.4606, + "step": 4791 + }, + { + "epoch": 0.83, + "learning_rate": 1.4746248109946592e-06, + "loss": 0.4709, + "step": 4792 + }, + { + "epoch": 0.83, + "learning_rate": 1.4716929565165784e-06, + "loss": 0.4516, + "step": 4793 + }, + { + "epoch": 0.83, + "learning_rate": 1.4687637879813832e-06, + "loss": 0.4483, + "step": 4794 + }, + { + "epoch": 0.83, + "learning_rate": 1.4658373063115993e-06, + "loss": 0.4591, + "step": 4795 + }, + { + "epoch": 0.83, + "learning_rate": 1.4629135124289084e-06, + "loss": 0.4539, + "step": 4796 + }, + { + "epoch": 0.83, + "learning_rate": 1.459992407254137e-06, + "loss": 0.4629, + "step": 4797 + }, + { + "epoch": 0.83, + "learning_rate": 1.4570739917072752e-06, + "loss": 0.467, + "step": 4798 + }, + { + "epoch": 0.83, + "learning_rate": 1.4541582667074606e-06, + "loss": 0.4629, + "step": 4799 + }, + { + "epoch": 0.83, + "learning_rate": 1.4512452331729864e-06, + "loss": 0.449, + "step": 4800 + }, + { + "epoch": 0.83, + "learning_rate": 1.4483348920212913e-06, + "loss": 0.4625, + "step": 4801 + }, + { + "epoch": 0.83, + "learning_rate": 1.4454272441689764e-06, + "loss": 0.4711, + "step": 4802 + }, + { + "epoch": 0.83, + "learning_rate": 1.4425222905317892e-06, + "loss": 0.4425, + "step": 4803 + }, + { + "epoch": 0.83, + "learning_rate": 1.4396200320246256e-06, + "loss": 0.4623, + "step": 4804 + }, + { + "epoch": 0.83, + "learning_rate": 1.4367204695615367e-06, + "loss": 0.4434, + "step": 4805 + }, + { + "epoch": 0.83, + "learning_rate": 1.433823604055723e-06, + "loss": 0.4681, + "step": 4806 + }, + { + "epoch": 0.83, + "learning_rate": 1.4309294364195403e-06, + "loss": 0.4519, + "step": 4807 + }, + { + "epoch": 0.83, + "learning_rate": 1.4280379675644817e-06, + "loss": 0.4702, + "step": 4808 + }, + { + "epoch": 0.83, + "learning_rate": 1.4251491984012089e-06, + "loss": 0.4474, + "step": 4809 + }, + { + "epoch": 0.83, + "learning_rate": 1.4222631298395207e-06, + "loss": 0.4577, + "step": 4810 + }, + { + "epoch": 0.83, + "learning_rate": 1.4193797627883655e-06, + "loss": 0.4602, + "step": 4811 + }, + { + "epoch": 0.83, + "learning_rate": 1.4164990981558458e-06, + "loss": 0.4659, + "step": 4812 + }, + { + "epoch": 0.83, + "learning_rate": 1.4136211368492104e-06, + "loss": 0.4534, + "step": 4813 + }, + { + "epoch": 0.83, + "learning_rate": 1.4107458797748596e-06, + "loss": 0.4612, + "step": 4814 + }, + { + "epoch": 0.83, + "learning_rate": 1.407873327838335e-06, + "loss": 0.4476, + "step": 4815 + }, + { + "epoch": 0.83, + "learning_rate": 1.4050034819443315e-06, + "loss": 0.4596, + "step": 4816 + }, + { + "epoch": 0.83, + "learning_rate": 1.4021363429966984e-06, + "loss": 0.4659, + "step": 4817 + }, + { + "epoch": 0.83, + "learning_rate": 1.3992719118984167e-06, + "loss": 0.4677, + "step": 4818 + }, + { + "epoch": 0.83, + "learning_rate": 1.3964101895516259e-06, + "loss": 0.4455, + "step": 4819 + }, + { + "epoch": 0.83, + "learning_rate": 1.3935511768576092e-06, + "loss": 0.4581, + "step": 4820 + }, + { + "epoch": 0.84, + "learning_rate": 1.3906948747168003e-06, + "loss": 0.4482, + "step": 4821 + }, + { + "epoch": 0.84, + "learning_rate": 1.3878412840287713e-06, + "loss": 0.4671, + "step": 4822 + }, + { + "epoch": 0.84, + "learning_rate": 1.3849904056922424e-06, + "loss": 0.4556, + "step": 4823 + }, + { + "epoch": 0.84, + "learning_rate": 1.382142240605091e-06, + "loss": 0.4594, + "step": 4824 + }, + { + "epoch": 0.84, + "learning_rate": 1.3792967896643228e-06, + "loss": 0.4634, + "step": 4825 + }, + { + "epoch": 0.84, + "learning_rate": 1.3764540537660997e-06, + "loss": 0.4604, + "step": 4826 + }, + { + "epoch": 0.84, + "learning_rate": 1.3736140338057247e-06, + "loss": 0.461, + "step": 4827 + }, + { + "epoch": 0.84, + "learning_rate": 1.3707767306776498e-06, + "loss": 0.464, + "step": 4828 + }, + { + "epoch": 0.84, + "learning_rate": 1.3679421452754627e-06, + "loss": 0.4531, + "step": 4829 + }, + { + "epoch": 0.84, + "learning_rate": 1.3651102784919024e-06, + "loss": 0.4397, + "step": 4830 + }, + { + "epoch": 0.84, + "learning_rate": 1.3622811312188489e-06, + "loss": 0.4567, + "step": 4831 + }, + { + "epoch": 0.84, + "learning_rate": 1.3594547043473283e-06, + "loss": 0.4537, + "step": 4832 + }, + { + "epoch": 0.84, + "learning_rate": 1.3566309987675087e-06, + "loss": 0.4479, + "step": 4833 + }, + { + "epoch": 0.84, + "learning_rate": 1.353810015368694e-06, + "loss": 0.4678, + "step": 4834 + }, + { + "epoch": 0.84, + "learning_rate": 1.350991755039347e-06, + "loss": 0.4594, + "step": 4835 + }, + { + "epoch": 0.84, + "learning_rate": 1.3481762186670556e-06, + "loss": 0.4674, + "step": 4836 + }, + { + "epoch": 0.84, + "learning_rate": 1.3453634071385591e-06, + "loss": 0.4529, + "step": 4837 + }, + { + "epoch": 0.84, + "learning_rate": 1.342553321339738e-06, + "loss": 0.4667, + "step": 4838 + }, + { + "epoch": 0.84, + "learning_rate": 1.339745962155613e-06, + "loss": 0.4576, + "step": 4839 + }, + { + "epoch": 0.84, + "learning_rate": 1.3369413304703481e-06, + "loss": 0.4639, + "step": 4840 + }, + { + "epoch": 0.84, + "learning_rate": 1.3341394271672403e-06, + "loss": 0.4409, + "step": 4841 + }, + { + "epoch": 0.84, + "learning_rate": 1.3313402531287423e-06, + "loss": 0.4684, + "step": 4842 + }, + { + "epoch": 0.84, + "learning_rate": 1.328543809236431e-06, + "loss": 0.4511, + "step": 4843 + }, + { + "epoch": 0.84, + "learning_rate": 1.3257500963710336e-06, + "loss": 0.4654, + "step": 4844 + }, + { + "epoch": 0.84, + "learning_rate": 1.3229591154124132e-06, + "loss": 0.4546, + "step": 4845 + }, + { + "epoch": 0.84, + "learning_rate": 1.3201708672395762e-06, + "loss": 0.4644, + "step": 4846 + }, + { + "epoch": 0.84, + "learning_rate": 1.3173853527306658e-06, + "loss": 0.449, + "step": 4847 + }, + { + "epoch": 0.84, + "learning_rate": 1.3146025727629618e-06, + "loss": 0.4428, + "step": 4848 + }, + { + "epoch": 0.84, + "learning_rate": 1.3118225282128861e-06, + "loss": 0.457, + "step": 4849 + }, + { + "epoch": 0.84, + "learning_rate": 1.3090452199559988e-06, + "loss": 0.4526, + "step": 4850 + }, + { + "epoch": 0.84, + "learning_rate": 1.3062706488669974e-06, + "loss": 0.4526, + "step": 4851 + }, + { + "epoch": 0.84, + "learning_rate": 1.3034988158197171e-06, + "loss": 0.4551, + "step": 4852 + }, + { + "epoch": 0.84, + "learning_rate": 1.3007297216871328e-06, + "loss": 0.4581, + "step": 4853 + }, + { + "epoch": 0.84, + "learning_rate": 1.2979633673413571e-06, + "loss": 0.4569, + "step": 4854 + }, + { + "epoch": 0.84, + "learning_rate": 1.295199753653633e-06, + "loss": 0.4456, + "step": 4855 + }, + { + "epoch": 0.84, + "learning_rate": 1.2924388814943467e-06, + "loss": 0.4659, + "step": 4856 + }, + { + "epoch": 0.84, + "learning_rate": 1.2896807517330211e-06, + "loss": 0.4537, + "step": 4857 + }, + { + "epoch": 0.84, + "learning_rate": 1.2869253652383141e-06, + "loss": 0.4776, + "step": 4858 + }, + { + "epoch": 0.84, + "learning_rate": 1.2841727228780188e-06, + "loss": 0.4571, + "step": 4859 + }, + { + "epoch": 0.84, + "learning_rate": 1.2814228255190608e-06, + "loss": 0.4624, + "step": 4860 + }, + { + "epoch": 0.84, + "learning_rate": 1.278675674027513e-06, + "loss": 0.4688, + "step": 4861 + }, + { + "epoch": 0.84, + "learning_rate": 1.275931269268569e-06, + "loss": 0.4514, + "step": 4862 + }, + { + "epoch": 0.84, + "learning_rate": 1.2731896121065645e-06, + "loss": 0.4537, + "step": 4863 + }, + { + "epoch": 0.84, + "learning_rate": 1.2704507034049717e-06, + "loss": 0.4614, + "step": 4864 + }, + { + "epoch": 0.84, + "learning_rate": 1.2677145440263927e-06, + "loss": 0.4525, + "step": 4865 + }, + { + "epoch": 0.84, + "learning_rate": 1.2649811348325691e-06, + "loss": 0.4668, + "step": 4866 + }, + { + "epoch": 0.84, + "learning_rate": 1.2622504766843657e-06, + "loss": 0.4593, + "step": 4867 + }, + { + "epoch": 0.84, + "learning_rate": 1.2595225704417958e-06, + "loss": 0.4559, + "step": 4868 + }, + { + "epoch": 0.84, + "learning_rate": 1.2567974169639941e-06, + "loss": 0.455, + "step": 4869 + }, + { + "epoch": 0.84, + "learning_rate": 1.254075017109233e-06, + "loss": 0.4632, + "step": 4870 + }, + { + "epoch": 0.84, + "learning_rate": 1.251355371734918e-06, + "loss": 0.4641, + "step": 4871 + }, + { + "epoch": 0.84, + "learning_rate": 1.248638481697586e-06, + "loss": 0.4654, + "step": 4872 + }, + { + "epoch": 0.84, + "learning_rate": 1.2459243478529094e-06, + "loss": 0.463, + "step": 4873 + }, + { + "epoch": 0.84, + "learning_rate": 1.2432129710556828e-06, + "loss": 0.4561, + "step": 4874 + }, + { + "epoch": 0.84, + "learning_rate": 1.2405043521598448e-06, + "loss": 0.4463, + "step": 4875 + }, + { + "epoch": 0.84, + "learning_rate": 1.2377984920184571e-06, + "loss": 0.4678, + "step": 4876 + }, + { + "epoch": 0.84, + "learning_rate": 1.2350953914837182e-06, + "loss": 0.4545, + "step": 4877 + }, + { + "epoch": 0.85, + "learning_rate": 1.2323950514069483e-06, + "loss": 0.449, + "step": 4878 + }, + { + "epoch": 0.85, + "learning_rate": 1.2296974726386124e-06, + "loss": 0.4515, + "step": 4879 + }, + { + "epoch": 0.85, + "learning_rate": 1.2270026560282955e-06, + "loss": 0.4654, + "step": 4880 + }, + { + "epoch": 0.85, + "learning_rate": 1.224310602424712e-06, + "loss": 0.4686, + "step": 4881 + }, + { + "epoch": 0.85, + "learning_rate": 1.2216213126757115e-06, + "loss": 0.4766, + "step": 4882 + }, + { + "epoch": 0.85, + "learning_rate": 1.2189347876282697e-06, + "loss": 0.4535, + "step": 4883 + }, + { + "epoch": 0.85, + "learning_rate": 1.2162510281284967e-06, + "loss": 0.4661, + "step": 4884 + }, + { + "epoch": 0.85, + "learning_rate": 1.2135700350216207e-06, + "loss": 0.4507, + "step": 4885 + }, + { + "epoch": 0.85, + "learning_rate": 1.2108918091520106e-06, + "loss": 0.465, + "step": 4886 + }, + { + "epoch": 0.85, + "learning_rate": 1.2082163513631595e-06, + "loss": 0.4533, + "step": 4887 + }, + { + "epoch": 0.85, + "learning_rate": 1.2055436624976847e-06, + "loss": 0.4669, + "step": 4888 + }, + { + "epoch": 0.85, + "learning_rate": 1.2028737433973358e-06, + "loss": 0.4558, + "step": 4889 + }, + { + "epoch": 0.85, + "learning_rate": 1.2002065949029896e-06, + "loss": 0.4458, + "step": 4890 + }, + { + "epoch": 0.85, + "learning_rate": 1.1975422178546502e-06, + "loss": 0.4476, + "step": 4891 + }, + { + "epoch": 0.85, + "learning_rate": 1.194880613091447e-06, + "loss": 0.4452, + "step": 4892 + }, + { + "epoch": 0.85, + "learning_rate": 1.1922217814516345e-06, + "loss": 0.4614, + "step": 4893 + }, + { + "epoch": 0.85, + "learning_rate": 1.1895657237726055e-06, + "loss": 0.4551, + "step": 4894 + }, + { + "epoch": 0.85, + "learning_rate": 1.1869124408908627e-06, + "loss": 0.4591, + "step": 4895 + }, + { + "epoch": 0.85, + "learning_rate": 1.1842619336420469e-06, + "loss": 0.4667, + "step": 4896 + }, + { + "epoch": 0.85, + "learning_rate": 1.1816142028609189e-06, + "loss": 0.4559, + "step": 4897 + }, + { + "epoch": 0.85, + "learning_rate": 1.1789692493813688e-06, + "loss": 0.4649, + "step": 4898 + }, + { + "epoch": 0.85, + "learning_rate": 1.1763270740364074e-06, + "loss": 0.454, + "step": 4899 + }, + { + "epoch": 0.85, + "learning_rate": 1.1736876776581706e-06, + "loss": 0.4562, + "step": 4900 + }, + { + "epoch": 0.85, + "learning_rate": 1.1710510610779314e-06, + "loss": 0.4628, + "step": 4901 + }, + { + "epoch": 0.85, + "learning_rate": 1.1684172251260684e-06, + "loss": 0.46, + "step": 4902 + }, + { + "epoch": 0.85, + "learning_rate": 1.1657861706320983e-06, + "loss": 0.4451, + "step": 4903 + }, + { + "epoch": 0.85, + "learning_rate": 1.1631578984246516e-06, + "loss": 0.4541, + "step": 4904 + }, + { + "epoch": 0.85, + "learning_rate": 1.1605324093314951e-06, + "loss": 0.4652, + "step": 4905 + }, + { + "epoch": 0.85, + "learning_rate": 1.1579097041795073e-06, + "loss": 0.4575, + "step": 4906 + }, + { + "epoch": 0.85, + "learning_rate": 1.1552897837946963e-06, + "loss": 0.4577, + "step": 4907 + }, + { + "epoch": 0.85, + "learning_rate": 1.15267264900219e-06, + "loss": 0.4464, + "step": 4908 + }, + { + "epoch": 0.85, + "learning_rate": 1.1500583006262423e-06, + "loss": 0.4721, + "step": 4909 + }, + { + "epoch": 0.85, + "learning_rate": 1.1474467394902288e-06, + "loss": 0.4498, + "step": 4910 + }, + { + "epoch": 0.85, + "learning_rate": 1.144837966416641e-06, + "loss": 0.4546, + "step": 4911 + }, + { + "epoch": 0.85, + "learning_rate": 1.1422319822271044e-06, + "loss": 0.455, + "step": 4912 + }, + { + "epoch": 0.85, + "learning_rate": 1.1396287877423528e-06, + "loss": 0.4723, + "step": 4913 + }, + { + "epoch": 0.85, + "learning_rate": 1.1370283837822515e-06, + "loss": 0.4466, + "step": 4914 + }, + { + "epoch": 0.85, + "learning_rate": 1.134430771165782e-06, + "loss": 0.4503, + "step": 4915 + }, + { + "epoch": 0.85, + "learning_rate": 1.1318359507110489e-06, + "loss": 0.4498, + "step": 4916 + }, + { + "epoch": 0.85, + "learning_rate": 1.1292439232352781e-06, + "loss": 0.4733, + "step": 4917 + }, + { + "epoch": 0.85, + "learning_rate": 1.1266546895548091e-06, + "loss": 0.4523, + "step": 4918 + }, + { + "epoch": 0.85, + "learning_rate": 1.1240682504851108e-06, + "loss": 0.4667, + "step": 4919 + }, + { + "epoch": 0.85, + "learning_rate": 1.1214846068407658e-06, + "loss": 0.4549, + "step": 4920 + }, + { + "epoch": 0.85, + "learning_rate": 1.118903759435479e-06, + "loss": 0.472, + "step": 4921 + }, + { + "epoch": 0.85, + "learning_rate": 1.1163257090820745e-06, + "loss": 0.4643, + "step": 4922 + }, + { + "epoch": 0.85, + "learning_rate": 1.113750456592494e-06, + "loss": 0.4533, + "step": 4923 + }, + { + "epoch": 0.85, + "learning_rate": 1.1111780027778019e-06, + "loss": 0.4689, + "step": 4924 + }, + { + "epoch": 0.85, + "learning_rate": 1.1086083484481735e-06, + "loss": 0.456, + "step": 4925 + }, + { + "epoch": 0.85, + "learning_rate": 1.1060414944129106e-06, + "loss": 0.4549, + "step": 4926 + }, + { + "epoch": 0.85, + "learning_rate": 1.1034774414804273e-06, + "loss": 0.4662, + "step": 4927 + }, + { + "epoch": 0.85, + "learning_rate": 1.1009161904582588e-06, + "loss": 0.4497, + "step": 4928 + }, + { + "epoch": 0.85, + "learning_rate": 1.0983577421530578e-06, + "loss": 0.4653, + "step": 4929 + }, + { + "epoch": 0.85, + "learning_rate": 1.0958020973705918e-06, + "loss": 0.4561, + "step": 4930 + }, + { + "epoch": 0.85, + "learning_rate": 1.0932492569157505e-06, + "loss": 0.4618, + "step": 4931 + }, + { + "epoch": 0.85, + "learning_rate": 1.0906992215925315e-06, + "loss": 0.4698, + "step": 4932 + }, + { + "epoch": 0.85, + "learning_rate": 1.0881519922040574e-06, + "loss": 0.4597, + "step": 4933 + }, + { + "epoch": 0.85, + "learning_rate": 1.0856075695525624e-06, + "loss": 0.4554, + "step": 4934 + }, + { + "epoch": 0.85, + "learning_rate": 1.0830659544393996e-06, + "loss": 0.4674, + "step": 4935 + }, + { + "epoch": 0.86, + "learning_rate": 1.0805271476650382e-06, + "loss": 0.4485, + "step": 4936 + }, + { + "epoch": 0.86, + "learning_rate": 1.077991150029054e-06, + "loss": 0.4717, + "step": 4937 + }, + { + "epoch": 0.86, + "learning_rate": 1.0754579623301564e-06, + "loss": 0.4524, + "step": 4938 + }, + { + "epoch": 0.86, + "learning_rate": 1.0729275853661503e-06, + "loss": 0.4571, + "step": 4939 + }, + { + "epoch": 0.86, + "learning_rate": 1.070400019933966e-06, + "loss": 0.4484, + "step": 4940 + }, + { + "epoch": 0.86, + "learning_rate": 1.0678752668296466e-06, + "loss": 0.45, + "step": 4941 + }, + { + "epoch": 0.86, + "learning_rate": 1.0653533268483495e-06, + "loss": 0.4516, + "step": 4942 + }, + { + "epoch": 0.86, + "learning_rate": 1.0628342007843472e-06, + "loss": 0.4651, + "step": 4943 + }, + { + "epoch": 0.86, + "learning_rate": 1.0603178894310185e-06, + "loss": 0.4505, + "step": 4944 + }, + { + "epoch": 0.86, + "learning_rate": 1.0578043935808702e-06, + "loss": 0.4601, + "step": 4945 + }, + { + "epoch": 0.86, + "learning_rate": 1.055293714025506e-06, + "loss": 0.4572, + "step": 4946 + }, + { + "epoch": 0.86, + "learning_rate": 1.0527858515556565e-06, + "loss": 0.4456, + "step": 4947 + }, + { + "epoch": 0.86, + "learning_rate": 1.0502808069611537e-06, + "loss": 0.4486, + "step": 4948 + }, + { + "epoch": 0.86, + "learning_rate": 1.0477785810309504e-06, + "loss": 0.4663, + "step": 4949 + }, + { + "epoch": 0.86, + "learning_rate": 1.045279174553111e-06, + "loss": 0.4539, + "step": 4950 + }, + { + "epoch": 0.86, + "learning_rate": 1.0427825883148057e-06, + "loss": 0.4528, + "step": 4951 + }, + { + "epoch": 0.86, + "learning_rate": 1.0402888231023212e-06, + "loss": 0.4556, + "step": 4952 + }, + { + "epoch": 0.86, + "learning_rate": 1.0377978797010558e-06, + "loss": 0.4717, + "step": 4953 + }, + { + "epoch": 0.86, + "learning_rate": 1.0353097588955198e-06, + "loss": 0.4627, + "step": 4954 + }, + { + "epoch": 0.86, + "learning_rate": 1.0328244614693285e-06, + "loss": 0.4615, + "step": 4955 + }, + { + "epoch": 0.86, + "learning_rate": 1.0303419882052157e-06, + "loss": 0.4536, + "step": 4956 + }, + { + "epoch": 0.86, + "learning_rate": 1.0278623398850251e-06, + "loss": 0.4785, + "step": 4957 + }, + { + "epoch": 0.86, + "learning_rate": 1.025385517289703e-06, + "loss": 0.4565, + "step": 4958 + }, + { + "epoch": 0.86, + "learning_rate": 1.0229115211993146e-06, + "loss": 0.4651, + "step": 4959 + }, + { + "epoch": 0.86, + "learning_rate": 1.0204403523930284e-06, + "loss": 0.4497, + "step": 4960 + }, + { + "epoch": 0.86, + "learning_rate": 1.0179720116491288e-06, + "loss": 0.4652, + "step": 4961 + }, + { + "epoch": 0.86, + "learning_rate": 1.0155064997450026e-06, + "loss": 0.4497, + "step": 4962 + }, + { + "epoch": 0.86, + "learning_rate": 1.0130438174571478e-06, + "loss": 0.4602, + "step": 4963 + }, + { + "epoch": 0.86, + "learning_rate": 1.0105839655611783e-06, + "loss": 0.4488, + "step": 4964 + }, + { + "epoch": 0.86, + "learning_rate": 1.0081269448318065e-06, + "loss": 0.4584, + "step": 4965 + }, + { + "epoch": 0.86, + "learning_rate": 1.005672756042858e-06, + "loss": 0.4623, + "step": 4966 + }, + { + "epoch": 0.86, + "learning_rate": 1.003221399967267e-06, + "loss": 0.4594, + "step": 4967 + }, + { + "epoch": 0.86, + "learning_rate": 1.0007728773770753e-06, + "loss": 0.4573, + "step": 4968 + }, + { + "epoch": 0.86, + "learning_rate": 9.983271890434277e-07, + "loss": 0.4533, + "step": 4969 + }, + { + "epoch": 0.86, + "learning_rate": 9.95884335736581e-07, + "loss": 0.4615, + "step": 4970 + }, + { + "epoch": 0.86, + "learning_rate": 9.934443182259023e-07, + "loss": 0.454, + "step": 4971 + }, + { + "epoch": 0.86, + "learning_rate": 9.91007137279858e-07, + "loss": 0.465, + "step": 4972 + }, + { + "epoch": 0.86, + "learning_rate": 9.88572793666026e-07, + "loss": 0.4574, + "step": 4973 + }, + { + "epoch": 0.86, + "learning_rate": 9.861412881510891e-07, + "loss": 0.4652, + "step": 4974 + }, + { + "epoch": 0.86, + "learning_rate": 9.837126215008374e-07, + "loss": 0.462, + "step": 4975 + }, + { + "epoch": 0.86, + "learning_rate": 9.81286794480163e-07, + "loss": 0.4675, + "step": 4976 + }, + { + "epoch": 0.86, + "learning_rate": 9.788638078530689e-07, + "loss": 0.4563, + "step": 4977 + }, + { + "epoch": 0.86, + "learning_rate": 9.764436623826601e-07, + "loss": 0.4531, + "step": 4978 + }, + { + "epoch": 0.86, + "learning_rate": 9.740263588311483e-07, + "loss": 0.4551, + "step": 4979 + }, + { + "epoch": 0.86, + "learning_rate": 9.716118979598533e-07, + "loss": 0.4514, + "step": 4980 + }, + { + "epoch": 0.86, + "learning_rate": 9.692002805291888e-07, + "loss": 0.4601, + "step": 4981 + }, + { + "epoch": 0.86, + "learning_rate": 9.667915072986877e-07, + "loss": 0.4578, + "step": 4982 + }, + { + "epoch": 0.86, + "learning_rate": 9.643855790269752e-07, + "loss": 0.4563, + "step": 4983 + }, + { + "epoch": 0.86, + "learning_rate": 9.619824964717873e-07, + "loss": 0.4612, + "step": 4984 + }, + { + "epoch": 0.86, + "learning_rate": 9.595822603899584e-07, + "loss": 0.4669, + "step": 4985 + }, + { + "epoch": 0.86, + "learning_rate": 9.571848715374333e-07, + "loss": 0.4589, + "step": 4986 + }, + { + "epoch": 0.86, + "learning_rate": 9.547903306692562e-07, + "loss": 0.4595, + "step": 4987 + }, + { + "epoch": 0.86, + "learning_rate": 9.523986385395689e-07, + "loss": 0.4563, + "step": 4988 + }, + { + "epoch": 0.86, + "learning_rate": 9.500097959016297e-07, + "loss": 0.4678, + "step": 4989 + }, + { + "epoch": 0.86, + "learning_rate": 9.476238035077855e-07, + "loss": 0.4641, + "step": 4990 + }, + { + "epoch": 0.86, + "learning_rate": 9.452406621094923e-07, + "loss": 0.4592, + "step": 4991 + }, + { + "epoch": 0.86, + "learning_rate": 9.428603724573083e-07, + "loss": 0.4424, + "step": 4992 + }, + { + "epoch": 0.86, + "learning_rate": 9.404829353008915e-07, + "loss": 0.468, + "step": 4993 + }, + { + "epoch": 0.87, + "learning_rate": 9.381083513890055e-07, + "loss": 0.4511, + "step": 4994 + }, + { + "epoch": 0.87, + "learning_rate": 9.357366214695074e-07, + "loss": 0.4791, + "step": 4995 + }, + { + "epoch": 0.87, + "learning_rate": 9.333677462893643e-07, + "loss": 0.4511, + "step": 4996 + }, + { + "epoch": 0.87, + "learning_rate": 9.310017265946381e-07, + "loss": 0.4723, + "step": 4997 + }, + { + "epoch": 0.87, + "learning_rate": 9.286385631304939e-07, + "loss": 0.4526, + "step": 4998 + }, + { + "epoch": 0.87, + "learning_rate": 9.262782566411976e-07, + "loss": 0.4586, + "step": 4999 + }, + { + "epoch": 0.87, + "learning_rate": 9.239208078701145e-07, + "loss": 0.4481, + "step": 5000 + }, + { + "epoch": 0.87, + "learning_rate": 9.215662175597106e-07, + "loss": 0.4651, + "step": 5001 + }, + { + "epoch": 0.87, + "learning_rate": 9.192144864515495e-07, + "loss": 0.4539, + "step": 5002 + }, + { + "epoch": 0.87, + "learning_rate": 9.168656152862965e-07, + "loss": 0.4531, + "step": 5003 + }, + { + "epoch": 0.87, + "learning_rate": 9.145196048037142e-07, + "loss": 0.4536, + "step": 5004 + }, + { + "epoch": 0.87, + "learning_rate": 9.121764557426682e-07, + "loss": 0.4593, + "step": 5005 + }, + { + "epoch": 0.87, + "learning_rate": 9.098361688411206e-07, + "loss": 0.4562, + "step": 5006 + }, + { + "epoch": 0.87, + "learning_rate": 9.074987448361261e-07, + "loss": 0.4564, + "step": 5007 + }, + { + "epoch": 0.87, + "learning_rate": 9.051641844638515e-07, + "loss": 0.4582, + "step": 5008 + }, + { + "epoch": 0.87, + "learning_rate": 9.028324884595474e-07, + "loss": 0.4658, + "step": 5009 + }, + { + "epoch": 0.87, + "learning_rate": 9.00503657557571e-07, + "loss": 0.4541, + "step": 5010 + }, + { + "epoch": 0.87, + "learning_rate": 8.981776924913743e-07, + "loss": 0.4616, + "step": 5011 + }, + { + "epoch": 0.87, + "learning_rate": 8.958545939935059e-07, + "loss": 0.473, + "step": 5012 + }, + { + "epoch": 0.87, + "learning_rate": 8.935343627956172e-07, + "loss": 0.4581, + "step": 5013 + }, + { + "epoch": 0.87, + "learning_rate": 8.912169996284447e-07, + "loss": 0.4719, + "step": 5014 + }, + { + "epoch": 0.87, + "learning_rate": 8.889025052218359e-07, + "loss": 0.4643, + "step": 5015 + }, + { + "epoch": 0.87, + "learning_rate": 8.865908803047241e-07, + "loss": 0.4535, + "step": 5016 + }, + { + "epoch": 0.87, + "learning_rate": 8.842821256051437e-07, + "loss": 0.4527, + "step": 5017 + }, + { + "epoch": 0.87, + "learning_rate": 8.819762418502242e-07, + "loss": 0.4523, + "step": 5018 + }, + { + "epoch": 0.87, + "learning_rate": 8.796732297661914e-07, + "loss": 0.4632, + "step": 5019 + }, + { + "epoch": 0.87, + "learning_rate": 8.773730900783672e-07, + "loss": 0.46, + "step": 5020 + }, + { + "epoch": 0.87, + "learning_rate": 8.750758235111644e-07, + "loss": 0.4641, + "step": 5021 + }, + { + "epoch": 0.87, + "learning_rate": 8.727814307880956e-07, + "loss": 0.4665, + "step": 5022 + }, + { + "epoch": 0.87, + "learning_rate": 8.70489912631769e-07, + "loss": 0.4578, + "step": 5023 + }, + { + "epoch": 0.87, + "learning_rate": 8.682012697638842e-07, + "loss": 0.4444, + "step": 5024 + }, + { + "epoch": 0.87, + "learning_rate": 8.659155029052346e-07, + "loss": 0.4695, + "step": 5025 + }, + { + "epoch": 0.87, + "learning_rate": 8.636326127757121e-07, + "loss": 0.4478, + "step": 5026 + }, + { + "epoch": 0.87, + "learning_rate": 8.613526000943029e-07, + "loss": 0.4595, + "step": 5027 + }, + { + "epoch": 0.87, + "learning_rate": 8.590754655790779e-07, + "loss": 0.4503, + "step": 5028 + }, + { + "epoch": 0.87, + "learning_rate": 8.568012099472123e-07, + "loss": 0.4582, + "step": 5029 + }, + { + "epoch": 0.87, + "learning_rate": 8.545298339149699e-07, + "loss": 0.4453, + "step": 5030 + }, + { + "epoch": 0.87, + "learning_rate": 8.522613381977074e-07, + "loss": 0.4529, + "step": 5031 + }, + { + "epoch": 0.87, + "learning_rate": 8.499957235098722e-07, + "loss": 0.4622, + "step": 5032 + }, + { + "epoch": 0.87, + "learning_rate": 8.477329905650111e-07, + "loss": 0.4481, + "step": 5033 + }, + { + "epoch": 0.87, + "learning_rate": 8.454731400757599e-07, + "loss": 0.4599, + "step": 5034 + }, + { + "epoch": 0.87, + "learning_rate": 8.432161727538424e-07, + "loss": 0.4695, + "step": 5035 + }, + { + "epoch": 0.87, + "learning_rate": 8.40962089310079e-07, + "loss": 0.458, + "step": 5036 + }, + { + "epoch": 0.87, + "learning_rate": 8.387108904543817e-07, + "loss": 0.4724, + "step": 5037 + }, + { + "epoch": 0.87, + "learning_rate": 8.364625768957535e-07, + "loss": 0.4504, + "step": 5038 + }, + { + "epoch": 0.87, + "learning_rate": 8.342171493422857e-07, + "loss": 0.4565, + "step": 5039 + }, + { + "epoch": 0.87, + "learning_rate": 8.319746085011627e-07, + "loss": 0.4611, + "step": 5040 + }, + { + "epoch": 0.87, + "learning_rate": 8.29734955078666e-07, + "loss": 0.4466, + "step": 5041 + }, + { + "epoch": 0.87, + "learning_rate": 8.274981897801571e-07, + "loss": 0.4624, + "step": 5042 + }, + { + "epoch": 0.87, + "learning_rate": 8.252643133100935e-07, + "loss": 0.4657, + "step": 5043 + }, + { + "epoch": 0.87, + "learning_rate": 8.230333263720225e-07, + "loss": 0.4603, + "step": 5044 + }, + { + "epoch": 0.87, + "learning_rate": 8.208052296685842e-07, + "loss": 0.4567, + "step": 5045 + }, + { + "epoch": 0.87, + "learning_rate": 8.185800239014996e-07, + "loss": 0.4677, + "step": 5046 + }, + { + "epoch": 0.87, + "learning_rate": 8.163577097715858e-07, + "loss": 0.4488, + "step": 5047 + }, + { + "epoch": 0.87, + "learning_rate": 8.14138287978754e-07, + "loss": 0.4595, + "step": 5048 + }, + { + "epoch": 0.87, + "learning_rate": 8.119217592219919e-07, + "loss": 0.4542, + "step": 5049 + }, + { + "epoch": 0.87, + "learning_rate": 8.097081241993865e-07, + "loss": 0.4716, + "step": 5050 + }, + { + "epoch": 0.88, + "learning_rate": 8.074973836081102e-07, + "loss": 0.4672, + "step": 5051 + }, + { + "epoch": 0.88, + "learning_rate": 8.052895381444226e-07, + "loss": 0.466, + "step": 5052 + }, + { + "epoch": 0.88, + "learning_rate": 8.03084588503672e-07, + "loss": 0.4464, + "step": 5053 + }, + { + "epoch": 0.88, + "learning_rate": 8.008825353802941e-07, + "loss": 0.4606, + "step": 5054 + }, + { + "epoch": 0.88, + "learning_rate": 7.986833794678139e-07, + "loss": 0.4489, + "step": 5055 + }, + { + "epoch": 0.88, + "learning_rate": 7.964871214588455e-07, + "loss": 0.4624, + "step": 5056 + }, + { + "epoch": 0.88, + "learning_rate": 7.942937620450864e-07, + "loss": 0.4473, + "step": 5057 + }, + { + "epoch": 0.88, + "learning_rate": 7.921033019173208e-07, + "loss": 0.4586, + "step": 5058 + }, + { + "epoch": 0.88, + "learning_rate": 7.899157417654268e-07, + "loss": 0.438, + "step": 5059 + }, + { + "epoch": 0.88, + "learning_rate": 7.877310822783613e-07, + "loss": 0.4699, + "step": 5060 + }, + { + "epoch": 0.88, + "learning_rate": 7.855493241441692e-07, + "loss": 0.4454, + "step": 5061 + }, + { + "epoch": 0.88, + "learning_rate": 7.833704680499865e-07, + "loss": 0.4564, + "step": 5062 + }, + { + "epoch": 0.88, + "learning_rate": 7.811945146820299e-07, + "loss": 0.4445, + "step": 5063 + }, + { + "epoch": 0.88, + "learning_rate": 7.790214647256044e-07, + "loss": 0.4501, + "step": 5064 + }, + { + "epoch": 0.88, + "learning_rate": 7.768513188650995e-07, + "loss": 0.4467, + "step": 5065 + }, + { + "epoch": 0.88, + "learning_rate": 7.746840777839903e-07, + "loss": 0.4536, + "step": 5066 + }, + { + "epoch": 0.88, + "learning_rate": 7.725197421648389e-07, + "loss": 0.4573, + "step": 5067 + }, + { + "epoch": 0.88, + "learning_rate": 7.703583126892889e-07, + "loss": 0.4523, + "step": 5068 + }, + { + "epoch": 0.88, + "learning_rate": 7.68199790038071e-07, + "loss": 0.4583, + "step": 5069 + }, + { + "epoch": 0.88, + "learning_rate": 7.660441748909997e-07, + "loss": 0.4582, + "step": 5070 + }, + { + "epoch": 0.88, + "learning_rate": 7.638914679269772e-07, + "loss": 0.4713, + "step": 5071 + }, + { + "epoch": 0.88, + "learning_rate": 7.617416698239821e-07, + "loss": 0.457, + "step": 5072 + }, + { + "epoch": 0.88, + "learning_rate": 7.595947812590832e-07, + "loss": 0.4698, + "step": 5073 + }, + { + "epoch": 0.88, + "learning_rate": 7.574508029084315e-07, + "loss": 0.4464, + "step": 5074 + }, + { + "epoch": 0.88, + "learning_rate": 7.553097354472594e-07, + "loss": 0.4571, + "step": 5075 + }, + { + "epoch": 0.88, + "learning_rate": 7.531715795498861e-07, + "loss": 0.4684, + "step": 5076 + }, + { + "epoch": 0.88, + "learning_rate": 7.510363358897122e-07, + "loss": 0.4622, + "step": 5077 + }, + { + "epoch": 0.88, + "learning_rate": 7.489040051392204e-07, + "loss": 0.4578, + "step": 5078 + }, + { + "epoch": 0.88, + "learning_rate": 7.46774587969975e-07, + "loss": 0.4585, + "step": 5079 + }, + { + "epoch": 0.88, + "learning_rate": 7.446480850526239e-07, + "loss": 0.4705, + "step": 5080 + }, + { + "epoch": 0.88, + "learning_rate": 7.42524497056899e-07, + "loss": 0.4539, + "step": 5081 + }, + { + "epoch": 0.88, + "learning_rate": 7.404038246516121e-07, + "loss": 0.4642, + "step": 5082 + }, + { + "epoch": 0.88, + "learning_rate": 7.382860685046589e-07, + "loss": 0.4536, + "step": 5083 + }, + { + "epoch": 0.88, + "learning_rate": 7.361712292830092e-07, + "loss": 0.4567, + "step": 5084 + }, + { + "epoch": 0.88, + "learning_rate": 7.34059307652728e-07, + "loss": 0.445, + "step": 5085 + }, + { + "epoch": 0.88, + "learning_rate": 7.319503042789467e-07, + "loss": 0.4614, + "step": 5086 + }, + { + "epoch": 0.88, + "learning_rate": 7.298442198258871e-07, + "loss": 0.4591, + "step": 5087 + }, + { + "epoch": 0.88, + "learning_rate": 7.277410549568476e-07, + "loss": 0.4633, + "step": 5088 + }, + { + "epoch": 0.88, + "learning_rate": 7.256408103342095e-07, + "loss": 0.4729, + "step": 5089 + }, + { + "epoch": 0.88, + "learning_rate": 7.235434866194335e-07, + "loss": 0.4619, + "step": 5090 + }, + { + "epoch": 0.88, + "learning_rate": 7.21449084473056e-07, + "loss": 0.4475, + "step": 5091 + }, + { + "epoch": 0.88, + "learning_rate": 7.193576045547034e-07, + "loss": 0.4671, + "step": 5092 + }, + { + "epoch": 0.88, + "learning_rate": 7.172690475230715e-07, + "loss": 0.4508, + "step": 5093 + }, + { + "epoch": 0.88, + "learning_rate": 7.151834140359404e-07, + "loss": 0.4615, + "step": 5094 + }, + { + "epoch": 0.88, + "learning_rate": 7.131007047501703e-07, + "loss": 0.4544, + "step": 5095 + }, + { + "epoch": 0.88, + "learning_rate": 7.110209203216967e-07, + "loss": 0.4578, + "step": 5096 + }, + { + "epoch": 0.88, + "learning_rate": 7.089440614055398e-07, + "loss": 0.4484, + "step": 5097 + }, + { + "epoch": 0.88, + "learning_rate": 7.0687012865579e-07, + "loss": 0.469, + "step": 5098 + }, + { + "epoch": 0.88, + "learning_rate": 7.047991227256235e-07, + "loss": 0.4572, + "step": 5099 + }, + { + "epoch": 0.88, + "learning_rate": 7.027310442672919e-07, + "loss": 0.4567, + "step": 5100 + }, + { + "epoch": 0.88, + "learning_rate": 7.006658939321265e-07, + "loss": 0.4512, + "step": 5101 + }, + { + "epoch": 0.88, + "learning_rate": 6.986036723705303e-07, + "loss": 0.4632, + "step": 5102 + }, + { + "epoch": 0.88, + "learning_rate": 6.965443802319927e-07, + "loss": 0.4554, + "step": 5103 + }, + { + "epoch": 0.88, + "learning_rate": 6.94488018165077e-07, + "loss": 0.4726, + "step": 5104 + }, + { + "epoch": 0.88, + "learning_rate": 6.924345868174187e-07, + "loss": 0.4646, + "step": 5105 + }, + { + "epoch": 0.88, + "learning_rate": 6.903840868357382e-07, + "loss": 0.4524, + "step": 5106 + }, + { + "epoch": 0.88, + "learning_rate": 6.883365188658275e-07, + "loss": 0.4595, + "step": 5107 + }, + { + "epoch": 0.88, + "learning_rate": 6.86291883552559e-07, + "loss": 0.4705, + "step": 5108 + }, + { + "epoch": 0.89, + "learning_rate": 6.842501815398739e-07, + "loss": 0.4498, + "step": 5109 + }, + { + "epoch": 0.89, + "learning_rate": 6.822114134707991e-07, + "loss": 0.4548, + "step": 5110 + }, + { + "epoch": 0.89, + "learning_rate": 6.801755799874354e-07, + "loss": 0.4446, + "step": 5111 + }, + { + "epoch": 0.89, + "learning_rate": 6.781426817309522e-07, + "loss": 0.468, + "step": 5112 + }, + { + "epoch": 0.89, + "learning_rate": 6.761127193416018e-07, + "loss": 0.4473, + "step": 5113 + }, + { + "epoch": 0.89, + "learning_rate": 6.740856934587092e-07, + "loss": 0.4591, + "step": 5114 + }, + { + "epoch": 0.89, + "learning_rate": 6.720616047206774e-07, + "loss": 0.4511, + "step": 5115 + }, + { + "epoch": 0.89, + "learning_rate": 6.700404537649774e-07, + "loss": 0.4592, + "step": 5116 + }, + { + "epoch": 0.89, + "learning_rate": 6.680222412281601e-07, + "loss": 0.4624, + "step": 5117 + }, + { + "epoch": 0.89, + "learning_rate": 6.660069677458558e-07, + "loss": 0.4514, + "step": 5118 + }, + { + "epoch": 0.89, + "learning_rate": 6.63994633952757e-07, + "loss": 0.4521, + "step": 5119 + }, + { + "epoch": 0.89, + "learning_rate": 6.61985240482641e-07, + "loss": 0.4507, + "step": 5120 + }, + { + "epoch": 0.89, + "learning_rate": 6.59978787968354e-07, + "loss": 0.444, + "step": 5121 + }, + { + "epoch": 0.89, + "learning_rate": 6.57975277041818e-07, + "loss": 0.4518, + "step": 5122 + }, + { + "epoch": 0.89, + "learning_rate": 6.559747083340251e-07, + "loss": 0.4569, + "step": 5123 + }, + { + "epoch": 0.89, + "learning_rate": 6.539770824750447e-07, + "loss": 0.4668, + "step": 5124 + }, + { + "epoch": 0.89, + "learning_rate": 6.519824000940178e-07, + "loss": 0.4592, + "step": 5125 + }, + { + "epoch": 0.89, + "learning_rate": 6.499906618191598e-07, + "loss": 0.454, + "step": 5126 + }, + { + "epoch": 0.89, + "learning_rate": 6.480018682777578e-07, + "loss": 0.4523, + "step": 5127 + }, + { + "epoch": 0.89, + "learning_rate": 6.460160200961662e-07, + "loss": 0.4587, + "step": 5128 + }, + { + "epoch": 0.89, + "learning_rate": 6.44033117899825e-07, + "loss": 0.453, + "step": 5129 + }, + { + "epoch": 0.89, + "learning_rate": 6.42053162313232e-07, + "loss": 0.4617, + "step": 5130 + }, + { + "epoch": 0.89, + "learning_rate": 6.400761539599653e-07, + "loss": 0.4639, + "step": 5131 + }, + { + "epoch": 0.89, + "learning_rate": 6.381020934626725e-07, + "loss": 0.4716, + "step": 5132 + }, + { + "epoch": 0.89, + "learning_rate": 6.361309814430727e-07, + "loss": 0.4593, + "step": 5133 + }, + { + "epoch": 0.89, + "learning_rate": 6.341628185219583e-07, + "loss": 0.4644, + "step": 5134 + }, + { + "epoch": 0.89, + "learning_rate": 6.32197605319187e-07, + "loss": 0.4514, + "step": 5135 + }, + { + "epoch": 0.89, + "learning_rate": 6.302353424536977e-07, + "loss": 0.463, + "step": 5136 + }, + { + "epoch": 0.89, + "learning_rate": 6.282760305434899e-07, + "loss": 0.4533, + "step": 5137 + }, + { + "epoch": 0.89, + "learning_rate": 6.263196702056395e-07, + "loss": 0.4631, + "step": 5138 + }, + { + "epoch": 0.89, + "learning_rate": 6.243662620562918e-07, + "loss": 0.4516, + "step": 5139 + }, + { + "epoch": 0.89, + "learning_rate": 6.2241580671066e-07, + "loss": 0.4672, + "step": 5140 + }, + { + "epoch": 0.89, + "learning_rate": 6.204683047830318e-07, + "loss": 0.4477, + "step": 5141 + }, + { + "epoch": 0.89, + "learning_rate": 6.185237568867597e-07, + "loss": 0.4681, + "step": 5142 + }, + { + "epoch": 0.89, + "learning_rate": 6.165821636342684e-07, + "loss": 0.46, + "step": 5143 + }, + { + "epoch": 0.89, + "learning_rate": 6.146435256370531e-07, + "loss": 0.4561, + "step": 5144 + }, + { + "epoch": 0.89, + "learning_rate": 6.127078435056766e-07, + "loss": 0.4513, + "step": 5145 + }, + { + "epoch": 0.89, + "learning_rate": 6.107751178497722e-07, + "loss": 0.4697, + "step": 5146 + }, + { + "epoch": 0.89, + "learning_rate": 6.088453492780388e-07, + "loss": 0.4588, + "step": 5147 + }, + { + "epoch": 0.89, + "learning_rate": 6.069185383982501e-07, + "loss": 0.4566, + "step": 5148 + }, + { + "epoch": 0.89, + "learning_rate": 6.049946858172395e-07, + "loss": 0.4567, + "step": 5149 + }, + { + "epoch": 0.89, + "learning_rate": 6.030737921409169e-07, + "loss": 0.4653, + "step": 5150 + }, + { + "epoch": 0.89, + "learning_rate": 6.01155857974256e-07, + "loss": 0.4675, + "step": 5151 + }, + { + "epoch": 0.89, + "learning_rate": 5.992408839213005e-07, + "loss": 0.4616, + "step": 5152 + }, + { + "epoch": 0.89, + "learning_rate": 5.973288705851587e-07, + "loss": 0.4532, + "step": 5153 + }, + { + "epoch": 0.89, + "learning_rate": 5.954198185680116e-07, + "loss": 0.461, + "step": 5154 + }, + { + "epoch": 0.89, + "learning_rate": 5.935137284711035e-07, + "loss": 0.4463, + "step": 5155 + }, + { + "epoch": 0.89, + "learning_rate": 5.916106008947454e-07, + "loss": 0.4567, + "step": 5156 + }, + { + "epoch": 0.89, + "learning_rate": 5.897104364383177e-07, + "loss": 0.4475, + "step": 5157 + }, + { + "epoch": 0.89, + "learning_rate": 5.878132357002663e-07, + "loss": 0.46, + "step": 5158 + }, + { + "epoch": 0.89, + "learning_rate": 5.859189992781045e-07, + "loss": 0.461, + "step": 5159 + }, + { + "epoch": 0.89, + "learning_rate": 5.840277277684136e-07, + "loss": 0.4669, + "step": 5160 + }, + { + "epoch": 0.89, + "learning_rate": 5.821394217668331e-07, + "loss": 0.4558, + "step": 5161 + }, + { + "epoch": 0.89, + "learning_rate": 5.802540818680814e-07, + "loss": 0.4566, + "step": 5162 + }, + { + "epoch": 0.89, + "learning_rate": 5.783717086659302e-07, + "loss": 0.4571, + "step": 5163 + }, + { + "epoch": 0.89, + "learning_rate": 5.764923027532265e-07, + "loss": 0.4475, + "step": 5164 + }, + { + "epoch": 0.89, + "learning_rate": 5.746158647218758e-07, + "loss": 0.4604, + "step": 5165 + }, + { + "epoch": 0.89, + "learning_rate": 5.727423951628541e-07, + "loss": 0.4634, + "step": 5166 + }, + { + "epoch": 0.9, + "learning_rate": 5.708718946662006e-07, + "loss": 0.4448, + "step": 5167 + }, + { + "epoch": 0.9, + "learning_rate": 5.690043638210141e-07, + "loss": 0.4579, + "step": 5168 + }, + { + "epoch": 0.9, + "learning_rate": 5.671398032154707e-07, + "loss": 0.4518, + "step": 5169 + }, + { + "epoch": 0.9, + "learning_rate": 5.652782134367974e-07, + "loss": 0.4809, + "step": 5170 + }, + { + "epoch": 0.9, + "learning_rate": 5.634195950712939e-07, + "loss": 0.4448, + "step": 5171 + }, + { + "epoch": 0.9, + "learning_rate": 5.615639487043201e-07, + "loss": 0.4605, + "step": 5172 + }, + { + "epoch": 0.9, + "learning_rate": 5.597112749203038e-07, + "loss": 0.4666, + "step": 5173 + }, + { + "epoch": 0.9, + "learning_rate": 5.578615743027338e-07, + "loss": 0.4622, + "step": 5174 + }, + { + "epoch": 0.9, + "learning_rate": 5.56014847434162e-07, + "loss": 0.4523, + "step": 5175 + }, + { + "epoch": 0.9, + "learning_rate": 5.541710948962043e-07, + "loss": 0.4649, + "step": 5176 + }, + { + "epoch": 0.9, + "learning_rate": 5.523303172695427e-07, + "loss": 0.457, + "step": 5177 + }, + { + "epoch": 0.9, + "learning_rate": 5.504925151339191e-07, + "loss": 0.453, + "step": 5178 + }, + { + "epoch": 0.9, + "learning_rate": 5.48657689068135e-07, + "loss": 0.4498, + "step": 5179 + }, + { + "epoch": 0.9, + "learning_rate": 5.468258396500636e-07, + "loss": 0.4503, + "step": 5180 + }, + { + "epoch": 0.9, + "learning_rate": 5.449969674566369e-07, + "loss": 0.4526, + "step": 5181 + }, + { + "epoch": 0.9, + "learning_rate": 5.431710730638428e-07, + "loss": 0.4634, + "step": 5182 + }, + { + "epoch": 0.9, + "learning_rate": 5.413481570467382e-07, + "loss": 0.463, + "step": 5183 + }, + { + "epoch": 0.9, + "learning_rate": 5.395282199794427e-07, + "loss": 0.4711, + "step": 5184 + }, + { + "epoch": 0.9, + "learning_rate": 5.377112624351355e-07, + "loss": 0.4462, + "step": 5185 + }, + { + "epoch": 0.9, + "learning_rate": 5.358972849860533e-07, + "loss": 0.4639, + "step": 5186 + }, + { + "epoch": 0.9, + "learning_rate": 5.340862882034992e-07, + "loss": 0.4728, + "step": 5187 + }, + { + "epoch": 0.9, + "learning_rate": 5.322782726578413e-07, + "loss": 0.4621, + "step": 5188 + }, + { + "epoch": 0.9, + "learning_rate": 5.304732389184986e-07, + "loss": 0.4525, + "step": 5189 + }, + { + "epoch": 0.9, + "learning_rate": 5.286711875539585e-07, + "loss": 0.4578, + "step": 5190 + }, + { + "epoch": 0.9, + "learning_rate": 5.268721191317683e-07, + "loss": 0.4595, + "step": 5191 + }, + { + "epoch": 0.9, + "learning_rate": 5.250760342185335e-07, + "loss": 0.4715, + "step": 5192 + }, + { + "epoch": 0.9, + "learning_rate": 5.232829333799205e-07, + "loss": 0.4446, + "step": 5193 + }, + { + "epoch": 0.9, + "learning_rate": 5.214928171806543e-07, + "loss": 0.4671, + "step": 5194 + }, + { + "epoch": 0.9, + "learning_rate": 5.197056861845284e-07, + "loss": 0.4451, + "step": 5195 + }, + { + "epoch": 0.9, + "learning_rate": 5.179215409543848e-07, + "loss": 0.4566, + "step": 5196 + }, + { + "epoch": 0.9, + "learning_rate": 5.161403820521305e-07, + "loss": 0.453, + "step": 5197 + }, + { + "epoch": 0.9, + "learning_rate": 5.143622100387336e-07, + "loss": 0.4703, + "step": 5198 + }, + { + "epoch": 0.9, + "learning_rate": 5.125870254742182e-07, + "loss": 0.4572, + "step": 5199 + }, + { + "epoch": 0.9, + "learning_rate": 5.108148289176685e-07, + "loss": 0.4512, + "step": 5200 + }, + { + "epoch": 0.9, + "learning_rate": 5.090456209272276e-07, + "loss": 0.4503, + "step": 5201 + }, + { + "epoch": 0.9, + "learning_rate": 5.07279402060099e-07, + "loss": 0.4579, + "step": 5202 + }, + { + "epoch": 0.9, + "learning_rate": 5.055161728725433e-07, + "loss": 0.4633, + "step": 5203 + }, + { + "epoch": 0.9, + "learning_rate": 5.037559339198805e-07, + "loss": 0.4608, + "step": 5204 + }, + { + "epoch": 0.9, + "learning_rate": 5.01998685756484e-07, + "loss": 0.4749, + "step": 5205 + }, + { + "epoch": 0.9, + "learning_rate": 5.002444289357955e-07, + "loss": 0.4498, + "step": 5206 + }, + { + "epoch": 0.9, + "learning_rate": 4.984931640103041e-07, + "loss": 0.4603, + "step": 5207 + }, + { + "epoch": 0.9, + "learning_rate": 4.967448915315609e-07, + "loss": 0.4564, + "step": 5208 + }, + { + "epoch": 0.9, + "learning_rate": 4.949996120501765e-07, + "loss": 0.4628, + "step": 5209 + }, + { + "epoch": 0.9, + "learning_rate": 4.932573261158169e-07, + "loss": 0.4512, + "step": 5210 + }, + { + "epoch": 0.9, + "learning_rate": 4.915180342772053e-07, + "loss": 0.4598, + "step": 5211 + }, + { + "epoch": 0.9, + "learning_rate": 4.89781737082119e-07, + "loss": 0.4577, + "step": 5212 + }, + { + "epoch": 0.9, + "learning_rate": 4.880484350774007e-07, + "loss": 0.4422, + "step": 5213 + }, + { + "epoch": 0.9, + "learning_rate": 4.863181288089391e-07, + "loss": 0.4424, + "step": 5214 + }, + { + "epoch": 0.9, + "learning_rate": 4.845908188216874e-07, + "loss": 0.4558, + "step": 5215 + }, + { + "epoch": 0.9, + "learning_rate": 4.828665056596504e-07, + "loss": 0.4684, + "step": 5216 + }, + { + "epoch": 0.9, + "learning_rate": 4.811451898658925e-07, + "loss": 0.4609, + "step": 5217 + }, + { + "epoch": 0.9, + "learning_rate": 4.794268719825334e-07, + "loss": 0.4519, + "step": 5218 + }, + { + "epoch": 0.9, + "learning_rate": 4.777115525507447e-07, + "loss": 0.4669, + "step": 5219 + }, + { + "epoch": 0.9, + "learning_rate": 4.759992321107587e-07, + "loss": 0.453, + "step": 5220 + }, + { + "epoch": 0.9, + "learning_rate": 4.7428991120186065e-07, + "loss": 0.4488, + "step": 5221 + }, + { + "epoch": 0.9, + "learning_rate": 4.725835903623921e-07, + "loss": 0.4633, + "step": 5222 + }, + { + "epoch": 0.9, + "learning_rate": 4.708802701297499e-07, + "loss": 0.4644, + "step": 5223 + }, + { + "epoch": 0.9, + "learning_rate": 4.6917995104038384e-07, + "loss": 0.4636, + "step": 5224 + }, + { + "epoch": 0.91, + "learning_rate": 4.6748263362980105e-07, + "loss": 0.466, + "step": 5225 + }, + { + "epoch": 0.91, + "learning_rate": 4.6578831843256176e-07, + "loss": 0.4613, + "step": 5226 + }, + { + "epoch": 0.91, + "learning_rate": 4.6409700598228025e-07, + "loss": 0.4511, + "step": 5227 + }, + { + "epoch": 0.91, + "learning_rate": 4.6240869681162814e-07, + "loss": 0.453, + "step": 5228 + }, + { + "epoch": 0.91, + "learning_rate": 4.607233914523268e-07, + "loss": 0.4489, + "step": 5229 + }, + { + "epoch": 0.91, + "learning_rate": 4.590410904351561e-07, + "loss": 0.4446, + "step": 5230 + }, + { + "epoch": 0.91, + "learning_rate": 4.573617942899433e-07, + "loss": 0.4639, + "step": 5231 + }, + { + "epoch": 0.91, + "learning_rate": 4.556855035455787e-07, + "loss": 0.4521, + "step": 5232 + }, + { + "epoch": 0.91, + "learning_rate": 4.540122187299978e-07, + "loss": 0.4636, + "step": 5233 + }, + { + "epoch": 0.91, + "learning_rate": 4.523419403701923e-07, + "loss": 0.4583, + "step": 5234 + }, + { + "epoch": 0.91, + "learning_rate": 4.5067466899220703e-07, + "loss": 0.4539, + "step": 5235 + }, + { + "epoch": 0.91, + "learning_rate": 4.490104051211408e-07, + "loss": 0.4485, + "step": 5236 + }, + { + "epoch": 0.91, + "learning_rate": 4.4734914928114435e-07, + "loss": 0.447, + "step": 5237 + }, + { + "epoch": 0.91, + "learning_rate": 4.456909019954181e-07, + "loss": 0.4452, + "step": 5238 + }, + { + "epoch": 0.91, + "learning_rate": 4.440356637862231e-07, + "loss": 0.4675, + "step": 5239 + }, + { + "epoch": 0.91, + "learning_rate": 4.4238343517486237e-07, + "loss": 0.4524, + "step": 5240 + }, + { + "epoch": 0.91, + "learning_rate": 4.407342166816997e-07, + "loss": 0.4583, + "step": 5241 + }, + { + "epoch": 0.91, + "learning_rate": 4.3908800882614397e-07, + "loss": 0.4549, + "step": 5242 + }, + { + "epoch": 0.91, + "learning_rate": 4.3744481212666167e-07, + "loss": 0.4564, + "step": 5243 + }, + { + "epoch": 0.91, + "learning_rate": 4.358046271007699e-07, + "loss": 0.4557, + "step": 5244 + }, + { + "epoch": 0.91, + "learning_rate": 4.3416745426503095e-07, + "loss": 0.4698, + "step": 5245 + }, + { + "epoch": 0.91, + "learning_rate": 4.325332941350668e-07, + "loss": 0.4578, + "step": 5246 + }, + { + "epoch": 0.91, + "learning_rate": 4.30902147225547e-07, + "loss": 0.4563, + "step": 5247 + }, + { + "epoch": 0.91, + "learning_rate": 4.2927401405019166e-07, + "loss": 0.4425, + "step": 5248 + }, + { + "epoch": 0.91, + "learning_rate": 4.276488951217705e-07, + "loss": 0.471, + "step": 5249 + }, + { + "epoch": 0.91, + "learning_rate": 4.2602679095210766e-07, + "loss": 0.4542, + "step": 5250 + }, + { + "epoch": 0.91, + "learning_rate": 4.244077020520776e-07, + "loss": 0.4553, + "step": 5251 + }, + { + "epoch": 0.91, + "learning_rate": 4.227916289316003e-07, + "loss": 0.4466, + "step": 5252 + }, + { + "epoch": 0.91, + "learning_rate": 4.2117857209964863e-07, + "loss": 0.4621, + "step": 5253 + }, + { + "epoch": 0.91, + "learning_rate": 4.195685320642484e-07, + "loss": 0.462, + "step": 5254 + }, + { + "epoch": 0.91, + "learning_rate": 4.179615093324729e-07, + "loss": 0.4546, + "step": 5255 + }, + { + "epoch": 0.91, + "learning_rate": 4.1635750441044067e-07, + "loss": 0.4504, + "step": 5256 + }, + { + "epoch": 0.91, + "learning_rate": 4.147565178033286e-07, + "loss": 0.4651, + "step": 5257 + }, + { + "epoch": 0.91, + "learning_rate": 4.131585500153579e-07, + "loss": 0.4507, + "step": 5258 + }, + { + "epoch": 0.91, + "learning_rate": 4.1156360154979813e-07, + "loss": 0.4504, + "step": 5259 + }, + { + "epoch": 0.91, + "learning_rate": 4.099716729089698e-07, + "loss": 0.4499, + "step": 5260 + }, + { + "epoch": 0.91, + "learning_rate": 4.083827645942429e-07, + "loss": 0.4594, + "step": 5261 + }, + { + "epoch": 0.91, + "learning_rate": 4.067968771060349e-07, + "loss": 0.4668, + "step": 5262 + }, + { + "epoch": 0.91, + "learning_rate": 4.0521401094381186e-07, + "loss": 0.4552, + "step": 5263 + }, + { + "epoch": 0.91, + "learning_rate": 4.036341666060872e-07, + "loss": 0.4517, + "step": 5264 + }, + { + "epoch": 0.91, + "learning_rate": 4.0205734459042854e-07, + "loss": 0.4649, + "step": 5265 + }, + { + "epoch": 0.91, + "learning_rate": 4.004835453934419e-07, + "loss": 0.4472, + "step": 5266 + }, + { + "epoch": 0.91, + "learning_rate": 3.9891276951079083e-07, + "loss": 0.4725, + "step": 5267 + }, + { + "epoch": 0.91, + "learning_rate": 3.9734501743717956e-07, + "loss": 0.4478, + "step": 5268 + }, + { + "epoch": 0.91, + "learning_rate": 3.957802896663665e-07, + "loss": 0.4535, + "step": 5269 + }, + { + "epoch": 0.91, + "learning_rate": 3.9421858669114966e-07, + "loss": 0.4643, + "step": 5270 + }, + { + "epoch": 0.91, + "learning_rate": 3.9265990900337893e-07, + "loss": 0.4498, + "step": 5271 + }, + { + "epoch": 0.91, + "learning_rate": 3.9110425709395606e-07, + "loss": 0.4652, + "step": 5272 + }, + { + "epoch": 0.91, + "learning_rate": 3.8955163145282024e-07, + "loss": 0.4634, + "step": 5273 + }, + { + "epoch": 0.91, + "learning_rate": 3.8800203256896483e-07, + "loss": 0.4447, + "step": 5274 + }, + { + "epoch": 0.91, + "learning_rate": 3.8645546093042385e-07, + "loss": 0.459, + "step": 5275 + }, + { + "epoch": 0.91, + "learning_rate": 3.8491191702428654e-07, + "loss": 0.4555, + "step": 5276 + }, + { + "epoch": 0.91, + "learning_rate": 3.833714013366796e-07, + "loss": 0.4627, + "step": 5277 + }, + { + "epoch": 0.91, + "learning_rate": 3.8183391435278163e-07, + "loss": 0.4461, + "step": 5278 + }, + { + "epoch": 0.91, + "learning_rate": 3.802994565568141e-07, + "loss": 0.4504, + "step": 5279 + }, + { + "epoch": 0.91, + "learning_rate": 3.787680284320472e-07, + "loss": 0.461, + "step": 5280 + }, + { + "epoch": 0.91, + "learning_rate": 3.7723963046079724e-07, + "loss": 0.4498, + "step": 5281 + }, + { + "epoch": 0.92, + "learning_rate": 3.757142631244204e-07, + "loss": 0.4495, + "step": 5282 + }, + { + "epoch": 0.92, + "learning_rate": 3.7419192690332786e-07, + "loss": 0.4553, + "step": 5283 + }, + { + "epoch": 0.92, + "learning_rate": 3.726726222769672e-07, + "loss": 0.4456, + "step": 5284 + }, + { + "epoch": 0.92, + "learning_rate": 3.7115634972383464e-07, + "loss": 0.4597, + "step": 5285 + }, + { + "epoch": 0.92, + "learning_rate": 3.696431097214748e-07, + "loss": 0.4608, + "step": 5286 + }, + { + "epoch": 0.92, + "learning_rate": 3.6813290274647197e-07, + "loss": 0.4527, + "step": 5287 + }, + { + "epoch": 0.92, + "learning_rate": 3.6662572927445907e-07, + "loss": 0.4614, + "step": 5288 + }, + { + "epoch": 0.92, + "learning_rate": 3.651215897801097e-07, + "loss": 0.453, + "step": 5289 + }, + { + "epoch": 0.92, + "learning_rate": 3.6362048473714496e-07, + "loss": 0.4504, + "step": 5290 + }, + { + "epoch": 0.92, + "learning_rate": 3.6212241461833107e-07, + "loss": 0.4459, + "step": 5291 + }, + { + "epoch": 0.92, + "learning_rate": 3.606273798954751e-07, + "loss": 0.4535, + "step": 5292 + }, + { + "epoch": 0.92, + "learning_rate": 3.5913538103943155e-07, + "loss": 0.4683, + "step": 5293 + }, + { + "epoch": 0.92, + "learning_rate": 3.5764641852009565e-07, + "loss": 0.4617, + "step": 5294 + }, + { + "epoch": 0.92, + "learning_rate": 3.5616049280640995e-07, + "loss": 0.4643, + "step": 5295 + }, + { + "epoch": 0.92, + "learning_rate": 3.5467760436635577e-07, + "loss": 0.4484, + "step": 5296 + }, + { + "epoch": 0.92, + "learning_rate": 3.5319775366696175e-07, + "loss": 0.4557, + "step": 5297 + }, + { + "epoch": 0.92, + "learning_rate": 3.517209411742994e-07, + "loss": 0.4481, + "step": 5298 + }, + { + "epoch": 0.92, + "learning_rate": 3.502471673534824e-07, + "loss": 0.4605, + "step": 5299 + }, + { + "epoch": 0.92, + "learning_rate": 3.48776432668666e-07, + "loss": 0.4491, + "step": 5300 + }, + { + "epoch": 0.92, + "learning_rate": 3.4730873758305193e-07, + "loss": 0.4689, + "step": 5301 + }, + { + "epoch": 0.92, + "learning_rate": 3.458440825588827e-07, + "loss": 0.457, + "step": 5302 + }, + { + "epoch": 0.92, + "learning_rate": 3.4438246805744034e-07, + "loss": 0.4624, + "step": 5303 + }, + { + "epoch": 0.92, + "learning_rate": 3.429238945390556e-07, + "loss": 0.4581, + "step": 5304 + }, + { + "epoch": 0.92, + "learning_rate": 3.4146836246309656e-07, + "loss": 0.4609, + "step": 5305 + }, + { + "epoch": 0.92, + "learning_rate": 3.4001587228797427e-07, + "loss": 0.4566, + "step": 5306 + }, + { + "epoch": 0.92, + "learning_rate": 3.385664244711451e-07, + "loss": 0.4571, + "step": 5307 + }, + { + "epoch": 0.92, + "learning_rate": 3.3712001946910046e-07, + "loss": 0.4469, + "step": 5308 + }, + { + "epoch": 0.92, + "learning_rate": 3.3567665773738156e-07, + "loss": 0.4617, + "step": 5309 + }, + { + "epoch": 0.92, + "learning_rate": 3.342363397305648e-07, + "loss": 0.4555, + "step": 5310 + }, + { + "epoch": 0.92, + "learning_rate": 3.327990659022706e-07, + "loss": 0.4569, + "step": 5311 + }, + { + "epoch": 0.92, + "learning_rate": 3.313648367051614e-07, + "loss": 0.4567, + "step": 5312 + }, + { + "epoch": 0.92, + "learning_rate": 3.299336525909391e-07, + "loss": 0.4615, + "step": 5313 + }, + { + "epoch": 0.92, + "learning_rate": 3.2850551401034767e-07, + "loss": 0.4533, + "step": 5314 + }, + { + "epoch": 0.92, + "learning_rate": 3.270804214131684e-07, + "loss": 0.4612, + "step": 5315 + }, + { + "epoch": 0.92, + "learning_rate": 3.2565837524823227e-07, + "loss": 0.4474, + "step": 5316 + }, + { + "epoch": 0.92, + "learning_rate": 3.242393759633988e-07, + "loss": 0.459, + "step": 5317 + }, + { + "epoch": 0.92, + "learning_rate": 3.228234240055772e-07, + "loss": 0.4459, + "step": 5318 + }, + { + "epoch": 0.92, + "learning_rate": 3.2141051982071293e-07, + "loss": 0.4676, + "step": 5319 + }, + { + "epoch": 0.92, + "learning_rate": 3.2000066385379225e-07, + "loss": 0.4667, + "step": 5320 + }, + { + "epoch": 0.92, + "learning_rate": 3.185938565488422e-07, + "loss": 0.4633, + "step": 5321 + }, + { + "epoch": 0.92, + "learning_rate": 3.171900983489273e-07, + "loss": 0.4517, + "step": 5322 + }, + { + "epoch": 0.92, + "learning_rate": 3.1578938969615394e-07, + "loss": 0.458, + "step": 5323 + }, + { + "epoch": 0.92, + "learning_rate": 3.143917310316691e-07, + "loss": 0.4582, + "step": 5324 + }, + { + "epoch": 0.92, + "learning_rate": 3.129971227956563e-07, + "loss": 0.4636, + "step": 5325 + }, + { + "epoch": 0.92, + "learning_rate": 3.1160556542733757e-07, + "loss": 0.448, + "step": 5326 + }, + { + "epoch": 0.92, + "learning_rate": 3.1021705936498005e-07, + "loss": 0.4799, + "step": 5327 + }, + { + "epoch": 0.92, + "learning_rate": 3.0883160504588504e-07, + "loss": 0.4445, + "step": 5328 + }, + { + "epoch": 0.92, + "learning_rate": 3.0744920290639247e-07, + "loss": 0.4554, + "step": 5329 + }, + { + "epoch": 0.92, + "learning_rate": 3.0606985338188177e-07, + "loss": 0.4439, + "step": 5330 + }, + { + "epoch": 0.92, + "learning_rate": 3.0469355690677216e-07, + "loss": 0.4707, + "step": 5331 + }, + { + "epoch": 0.92, + "learning_rate": 3.0332031391452243e-07, + "loss": 0.45, + "step": 5332 + }, + { + "epoch": 0.92, + "learning_rate": 3.019501248376244e-07, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 0.92, + "learning_rate": 3.0058299010761294e-07, + "loss": 0.4555, + "step": 5334 + }, + { + "epoch": 0.92, + "learning_rate": 2.992189101550613e-07, + "loss": 0.4653, + "step": 5335 + }, + { + "epoch": 0.92, + "learning_rate": 2.9785788540957706e-07, + "loss": 0.4473, + "step": 5336 + }, + { + "epoch": 0.92, + "learning_rate": 2.964999162998072e-07, + "loss": 0.4552, + "step": 5337 + }, + { + "epoch": 0.92, + "learning_rate": 2.951450032534364e-07, + "loss": 0.4465, + "step": 5338 + }, + { + "epoch": 0.92, + "learning_rate": 2.937931466971888e-07, + "loss": 0.4582, + "step": 5339 + }, + { + "epoch": 0.93, + "learning_rate": 2.9244434705682276e-07, + "loss": 0.4413, + "step": 5340 + }, + { + "epoch": 0.93, + "learning_rate": 2.9109860475713403e-07, + "loss": 0.4547, + "step": 5341 + }, + { + "epoch": 0.93, + "learning_rate": 2.897559202219602e-07, + "loss": 0.4603, + "step": 5342 + }, + { + "epoch": 0.93, + "learning_rate": 2.884162938741686e-07, + "loss": 0.4502, + "step": 5343 + }, + { + "epoch": 0.93, + "learning_rate": 2.870797261356684e-07, + "loss": 0.4443, + "step": 5344 + }, + { + "epoch": 0.93, + "learning_rate": 2.8574621742740506e-07, + "loss": 0.4499, + "step": 5345 + }, + { + "epoch": 0.93, + "learning_rate": 2.8441576816936043e-07, + "loss": 0.4647, + "step": 5346 + }, + { + "epoch": 0.93, + "learning_rate": 2.830883787805494e-07, + "loss": 0.4481, + "step": 5347 + }, + { + "epoch": 0.93, + "learning_rate": 2.817640496790275e-07, + "loss": 0.455, + "step": 5348 + }, + { + "epoch": 0.93, + "learning_rate": 2.8044278128188327e-07, + "loss": 0.467, + "step": 5349 + }, + { + "epoch": 0.93, + "learning_rate": 2.791245740052451e-07, + "loss": 0.4679, + "step": 5350 + }, + { + "epoch": 0.93, + "learning_rate": 2.7780942826427514e-07, + "loss": 0.461, + "step": 5351 + }, + { + "epoch": 0.93, + "learning_rate": 2.7649734447316777e-07, + "loss": 0.4574, + "step": 5352 + }, + { + "epoch": 0.93, + "learning_rate": 2.751883230451613e-07, + "loss": 0.4629, + "step": 5353 + }, + { + "epoch": 0.93, + "learning_rate": 2.738823643925215e-07, + "loss": 0.4565, + "step": 5354 + }, + { + "epoch": 0.93, + "learning_rate": 2.725794689265537e-07, + "loss": 0.4407, + "step": 5355 + }, + { + "epoch": 0.93, + "learning_rate": 2.7127963705759653e-07, + "loss": 0.459, + "step": 5356 + }, + { + "epoch": 0.93, + "learning_rate": 2.699828691950268e-07, + "loss": 0.4583, + "step": 5357 + }, + { + "epoch": 0.93, + "learning_rate": 2.6868916574725347e-07, + "loss": 0.4489, + "step": 5358 + }, + { + "epoch": 0.93, + "learning_rate": 2.6739852712171946e-07, + "loss": 0.4553, + "step": 5359 + }, + { + "epoch": 0.93, + "learning_rate": 2.661109537249085e-07, + "loss": 0.4625, + "step": 5360 + }, + { + "epoch": 0.93, + "learning_rate": 2.6482644596232953e-07, + "loss": 0.4665, + "step": 5361 + }, + { + "epoch": 0.93, + "learning_rate": 2.6354500423853457e-07, + "loss": 0.452, + "step": 5362 + }, + { + "epoch": 0.93, + "learning_rate": 2.622666289571063e-07, + "loss": 0.4487, + "step": 5363 + }, + { + "epoch": 0.93, + "learning_rate": 2.6099132052066044e-07, + "loss": 0.4503, + "step": 5364 + }, + { + "epoch": 0.93, + "learning_rate": 2.5971907933085016e-07, + "loss": 0.4621, + "step": 5365 + }, + { + "epoch": 0.93, + "learning_rate": 2.5844990578835825e-07, + "loss": 0.4508, + "step": 5366 + }, + { + "epoch": 0.93, + "learning_rate": 2.571838002929061e-07, + "loss": 0.4531, + "step": 5367 + }, + { + "epoch": 0.93, + "learning_rate": 2.559207632432448e-07, + "loss": 0.4536, + "step": 5368 + }, + { + "epoch": 0.93, + "learning_rate": 2.546607950371627e-07, + "loss": 0.4642, + "step": 5369 + }, + { + "epoch": 0.93, + "learning_rate": 2.534038960714791e-07, + "loss": 0.4454, + "step": 5370 + }, + { + "epoch": 0.93, + "learning_rate": 2.5215006674204625e-07, + "loss": 0.4569, + "step": 5371 + }, + { + "epoch": 0.93, + "learning_rate": 2.508993074437527e-07, + "loss": 0.4533, + "step": 5372 + }, + { + "epoch": 0.93, + "learning_rate": 2.4965161857051667e-07, + "loss": 0.4574, + "step": 5373 + }, + { + "epoch": 0.93, + "learning_rate": 2.4840700051529054e-07, + "loss": 0.4555, + "step": 5374 + }, + { + "epoch": 0.93, + "learning_rate": 2.4716545367006186e-07, + "loss": 0.4636, + "step": 5375 + }, + { + "epoch": 0.93, + "learning_rate": 2.459269784258467e-07, + "loss": 0.4602, + "step": 5376 + }, + { + "epoch": 0.93, + "learning_rate": 2.4469157517269636e-07, + "loss": 0.4591, + "step": 5377 + }, + { + "epoch": 0.93, + "learning_rate": 2.4345924429969523e-07, + "loss": 0.4491, + "step": 5378 + }, + { + "epoch": 0.93, + "learning_rate": 2.4222998619495953e-07, + "loss": 0.4593, + "step": 5379 + }, + { + "epoch": 0.93, + "learning_rate": 2.41003801245635e-07, + "loss": 0.4663, + "step": 5380 + }, + { + "epoch": 0.93, + "learning_rate": 2.3978068983790294e-07, + "loss": 0.4496, + "step": 5381 + }, + { + "epoch": 0.93, + "learning_rate": 2.3856065235697613e-07, + "loss": 0.4602, + "step": 5382 + }, + { + "epoch": 0.93, + "learning_rate": 2.3734368918709838e-07, + "loss": 0.4575, + "step": 5383 + }, + { + "epoch": 0.93, + "learning_rate": 2.3612980071154534e-07, + "loss": 0.4551, + "step": 5384 + }, + { + "epoch": 0.93, + "learning_rate": 2.349189873126223e-07, + "loss": 0.4377, + "step": 5385 + }, + { + "epoch": 0.93, + "learning_rate": 2.33711249371672e-07, + "loss": 0.4714, + "step": 5386 + }, + { + "epoch": 0.93, + "learning_rate": 2.325065872690624e-07, + "loss": 0.4479, + "step": 5387 + }, + { + "epoch": 0.93, + "learning_rate": 2.3130500138419553e-07, + "loss": 0.4634, + "step": 5388 + }, + { + "epoch": 0.93, + "learning_rate": 2.3010649209550428e-07, + "loss": 0.4547, + "step": 5389 + }, + { + "epoch": 0.93, + "learning_rate": 2.2891105978045336e-07, + "loss": 0.4579, + "step": 5390 + }, + { + "epoch": 0.93, + "learning_rate": 2.2771870481553715e-07, + "loss": 0.4419, + "step": 5391 + }, + { + "epoch": 0.93, + "learning_rate": 2.265294275762786e-07, + "loss": 0.4609, + "step": 5392 + }, + { + "epoch": 0.93, + "learning_rate": 2.25343228437237e-07, + "loss": 0.4552, + "step": 5393 + }, + { + "epoch": 0.93, + "learning_rate": 2.2416010777199904e-07, + "loss": 0.4539, + "step": 5394 + }, + { + "epoch": 0.93, + "learning_rate": 2.229800659531811e-07, + "loss": 0.4517, + "step": 5395 + }, + { + "epoch": 0.93, + "learning_rate": 2.218031033524304e-07, + "loss": 0.4577, + "step": 5396 + }, + { + "epoch": 0.93, + "learning_rate": 2.2062922034042478e-07, + "loss": 0.4423, + "step": 5397 + }, + { + "epoch": 0.94, + "learning_rate": 2.194584172868741e-07, + "loss": 0.4606, + "step": 5398 + }, + { + "epoch": 0.94, + "learning_rate": 2.1829069456051456e-07, + "loss": 0.4527, + "step": 5399 + }, + { + "epoch": 0.94, + "learning_rate": 2.17126052529113e-07, + "loss": 0.4753, + "step": 5400 + }, + { + "epoch": 0.94, + "learning_rate": 2.159644915594694e-07, + "loss": 0.4676, + "step": 5401 + }, + { + "epoch": 0.94, + "learning_rate": 2.1480601201741004e-07, + "loss": 0.4511, + "step": 5402 + }, + { + "epoch": 0.94, + "learning_rate": 2.1365061426778967e-07, + "loss": 0.4427, + "step": 5403 + }, + { + "epoch": 0.94, + "learning_rate": 2.1249829867449723e-07, + "loss": 0.4649, + "step": 5404 + }, + { + "epoch": 0.94, + "learning_rate": 2.11349065600448e-07, + "loss": 0.4483, + "step": 5405 + }, + { + "epoch": 0.94, + "learning_rate": 2.1020291540758352e-07, + "loss": 0.4481, + "step": 5406 + }, + { + "epoch": 0.94, + "learning_rate": 2.0905984845687954e-07, + "loss": 0.4496, + "step": 5407 + }, + { + "epoch": 0.94, + "learning_rate": 2.0791986510833918e-07, + "loss": 0.4565, + "step": 5408 + }, + { + "epoch": 0.94, + "learning_rate": 2.067829657209941e-07, + "loss": 0.466, + "step": 5409 + }, + { + "epoch": 0.94, + "learning_rate": 2.0564915065290237e-07, + "loss": 0.4785, + "step": 5410 + }, + { + "epoch": 0.94, + "learning_rate": 2.0451842026115277e-07, + "loss": 0.4514, + "step": 5411 + }, + { + "epoch": 0.94, + "learning_rate": 2.0339077490186488e-07, + "loss": 0.4763, + "step": 5412 + }, + { + "epoch": 0.94, + "learning_rate": 2.022662149301824e-07, + "loss": 0.4583, + "step": 5413 + }, + { + "epoch": 0.94, + "learning_rate": 2.011447407002809e-07, + "loss": 0.4565, + "step": 5414 + }, + { + "epoch": 0.94, + "learning_rate": 2.0002635256536008e-07, + "loss": 0.4539, + "step": 5415 + }, + { + "epoch": 0.94, + "learning_rate": 1.9891105087765371e-07, + "loss": 0.4502, + "step": 5416 + }, + { + "epoch": 0.94, + "learning_rate": 1.977988359884153e-07, + "loss": 0.4453, + "step": 5417 + }, + { + "epoch": 0.94, + "learning_rate": 1.9668970824793355e-07, + "loss": 0.4491, + "step": 5418 + }, + { + "epoch": 0.94, + "learning_rate": 1.955836680055223e-07, + "loss": 0.4513, + "step": 5419 + }, + { + "epoch": 0.94, + "learning_rate": 1.9448071560952187e-07, + "loss": 0.4656, + "step": 5420 + }, + { + "epoch": 0.94, + "learning_rate": 1.93380851407301e-07, + "loss": 0.453, + "step": 5421 + }, + { + "epoch": 0.94, + "learning_rate": 1.92284075745256e-07, + "loss": 0.4534, + "step": 5422 + }, + { + "epoch": 0.94, + "learning_rate": 1.9119038896880938e-07, + "loss": 0.4653, + "step": 5423 + }, + { + "epoch": 0.94, + "learning_rate": 1.9009979142241453e-07, + "loss": 0.4533, + "step": 5424 + }, + { + "epoch": 0.94, + "learning_rate": 1.8901228344954558e-07, + "loss": 0.4484, + "step": 5425 + }, + { + "epoch": 0.94, + "learning_rate": 1.8792786539270967e-07, + "loss": 0.4684, + "step": 5426 + }, + { + "epoch": 0.94, + "learning_rate": 1.8684653759343586e-07, + "loss": 0.4482, + "step": 5427 + }, + { + "epoch": 0.94, + "learning_rate": 1.85768300392285e-07, + "loss": 0.4614, + "step": 5428 + }, + { + "epoch": 0.94, + "learning_rate": 1.8469315412883882e-07, + "loss": 0.4721, + "step": 5429 + }, + { + "epoch": 0.94, + "learning_rate": 1.83621099141712e-07, + "loss": 0.4699, + "step": 5430 + }, + { + "epoch": 0.94, + "learning_rate": 1.8255213576854115e-07, + "loss": 0.4545, + "step": 5431 + }, + { + "epoch": 0.94, + "learning_rate": 1.8148626434598916e-07, + "loss": 0.4776, + "step": 5432 + }, + { + "epoch": 0.94, + "learning_rate": 1.804234852097464e-07, + "loss": 0.4429, + "step": 5433 + }, + { + "epoch": 0.94, + "learning_rate": 1.793637986945307e-07, + "loss": 0.4531, + "step": 5434 + }, + { + "epoch": 0.94, + "learning_rate": 1.7830720513408395e-07, + "loss": 0.4535, + "step": 5435 + }, + { + "epoch": 0.94, + "learning_rate": 1.7725370486117333e-07, + "loss": 0.4703, + "step": 5436 + }, + { + "epoch": 0.94, + "learning_rate": 1.762032982075934e-07, + "loss": 0.4514, + "step": 5437 + }, + { + "epoch": 0.94, + "learning_rate": 1.7515598550416625e-07, + "loss": 0.464, + "step": 5438 + }, + { + "epoch": 0.94, + "learning_rate": 1.741117670807335e-07, + "loss": 0.4535, + "step": 5439 + }, + { + "epoch": 0.94, + "learning_rate": 1.7307064326616775e-07, + "loss": 0.4614, + "step": 5440 + }, + { + "epoch": 0.94, + "learning_rate": 1.7203261438836439e-07, + "loss": 0.4606, + "step": 5441 + }, + { + "epoch": 0.94, + "learning_rate": 1.709976807742475e-07, + "loss": 0.4658, + "step": 5442 + }, + { + "epoch": 0.94, + "learning_rate": 1.699658427497597e-07, + "loss": 0.4466, + "step": 5443 + }, + { + "epoch": 0.94, + "learning_rate": 1.6893710063987433e-07, + "loss": 0.4664, + "step": 5444 + }, + { + "epoch": 0.94, + "learning_rate": 1.6791145476858894e-07, + "loss": 0.4565, + "step": 5445 + }, + { + "epoch": 0.94, + "learning_rate": 1.66888905458924e-07, + "loss": 0.4685, + "step": 5446 + }, + { + "epoch": 0.94, + "learning_rate": 1.6586945303292633e-07, + "loss": 0.4544, + "step": 5447 + }, + { + "epoch": 0.94, + "learning_rate": 1.648530978116658e-07, + "loss": 0.4624, + "step": 5448 + }, + { + "epoch": 0.94, + "learning_rate": 1.6383984011523967e-07, + "loss": 0.4592, + "step": 5449 + }, + { + "epoch": 0.94, + "learning_rate": 1.6282968026276602e-07, + "loss": 0.4581, + "step": 5450 + }, + { + "epoch": 0.94, + "learning_rate": 1.6182261857238812e-07, + "loss": 0.4612, + "step": 5451 + }, + { + "epoch": 0.94, + "learning_rate": 1.6081865536127895e-07, + "loss": 0.4542, + "step": 5452 + }, + { + "epoch": 0.94, + "learning_rate": 1.5981779094562667e-07, + "loss": 0.4658, + "step": 5453 + }, + { + "epoch": 0.94, + "learning_rate": 1.5882002564065025e-07, + "loss": 0.4668, + "step": 5454 + }, + { + "epoch": 0.94, + "learning_rate": 1.578253597605872e-07, + "loss": 0.4458, + "step": 5455 + }, + { + "epoch": 0.95, + "learning_rate": 1.56833793618707e-07, + "loss": 0.4646, + "step": 5456 + }, + { + "epoch": 0.95, + "learning_rate": 1.558453275272942e-07, + "loss": 0.4627, + "step": 5457 + }, + { + "epoch": 0.95, + "learning_rate": 1.5485996179766206e-07, + "loss": 0.4641, + "step": 5458 + }, + { + "epoch": 0.95, + "learning_rate": 1.5387769674014563e-07, + "loss": 0.4484, + "step": 5459 + }, + { + "epoch": 0.95, + "learning_rate": 1.5289853266410416e-07, + "loss": 0.4618, + "step": 5460 + }, + { + "epoch": 0.95, + "learning_rate": 1.519224698779198e-07, + "loss": 0.4555, + "step": 5461 + }, + { + "epoch": 0.95, + "learning_rate": 1.5094950868899672e-07, + "loss": 0.4498, + "step": 5462 + }, + { + "epoch": 0.95, + "learning_rate": 1.4997964940376752e-07, + "loss": 0.4479, + "step": 5463 + }, + { + "epoch": 0.95, + "learning_rate": 1.4901289232767903e-07, + "loss": 0.4602, + "step": 5464 + }, + { + "epoch": 0.95, + "learning_rate": 1.4804923776520985e-07, + "loss": 0.4452, + "step": 5465 + }, + { + "epoch": 0.95, + "learning_rate": 1.4708868601985503e-07, + "loss": 0.4645, + "step": 5466 + }, + { + "epoch": 0.95, + "learning_rate": 1.4613123739413704e-07, + "loss": 0.4587, + "step": 5467 + }, + { + "epoch": 0.95, + "learning_rate": 1.4517689218959907e-07, + "loss": 0.465, + "step": 5468 + }, + { + "epoch": 0.95, + "learning_rate": 1.4422565070680406e-07, + "loss": 0.4604, + "step": 5469 + }, + { + "epoch": 0.95, + "learning_rate": 1.4327751324534233e-07, + "loss": 0.4602, + "step": 5470 + }, + { + "epoch": 0.95, + "learning_rate": 1.4233248010382506e-07, + "loss": 0.4438, + "step": 5471 + }, + { + "epoch": 0.95, + "learning_rate": 1.4139055157988303e-07, + "loss": 0.4658, + "step": 5472 + }, + { + "epoch": 0.95, + "learning_rate": 1.4045172797017336e-07, + "loss": 0.4561, + "step": 5473 + }, + { + "epoch": 0.95, + "learning_rate": 1.3951600957037292e-07, + "loss": 0.4513, + "step": 5474 + }, + { + "epoch": 0.95, + "learning_rate": 1.385833966751815e-07, + "loss": 0.458, + "step": 5475 + }, + { + "epoch": 0.95, + "learning_rate": 1.376538895783186e-07, + "loss": 0.4642, + "step": 5476 + }, + { + "epoch": 0.95, + "learning_rate": 1.3672748857252783e-07, + "loss": 0.4616, + "step": 5477 + }, + { + "epoch": 0.95, + "learning_rate": 1.358041939495758e-07, + "loss": 0.4598, + "step": 5478 + }, + { + "epoch": 0.95, + "learning_rate": 1.3488400600024654e-07, + "loss": 0.45, + "step": 5479 + }, + { + "epoch": 0.95, + "learning_rate": 1.339669250143505e-07, + "loss": 0.4489, + "step": 5480 + }, + { + "epoch": 0.95, + "learning_rate": 1.3305295128071437e-07, + "loss": 0.4467, + "step": 5481 + }, + { + "epoch": 0.95, + "learning_rate": 1.321420850871935e-07, + "loss": 0.4734, + "step": 5482 + }, + { + "epoch": 0.95, + "learning_rate": 1.3123432672065506e-07, + "loss": 0.4501, + "step": 5483 + }, + { + "epoch": 0.95, + "learning_rate": 1.303296764669959e-07, + "loss": 0.463, + "step": 5484 + }, + { + "epoch": 0.95, + "learning_rate": 1.2942813461112924e-07, + "loss": 0.4432, + "step": 5485 + }, + { + "epoch": 0.95, + "learning_rate": 1.2852970143699129e-07, + "loss": 0.4676, + "step": 5486 + }, + { + "epoch": 0.95, + "learning_rate": 1.276343772275379e-07, + "loss": 0.453, + "step": 5487 + }, + { + "epoch": 0.95, + "learning_rate": 1.267421622647469e-07, + "loss": 0.4718, + "step": 5488 + }, + { + "epoch": 0.95, + "learning_rate": 1.2585305682961679e-07, + "loss": 0.4574, + "step": 5489 + }, + { + "epoch": 0.95, + "learning_rate": 1.2496706120216585e-07, + "loss": 0.4711, + "step": 5490 + }, + { + "epoch": 0.95, + "learning_rate": 1.2408417566143306e-07, + "loss": 0.4482, + "step": 5491 + }, + { + "epoch": 0.95, + "learning_rate": 1.2320440048547933e-07, + "loss": 0.4482, + "step": 5492 + }, + { + "epoch": 0.95, + "learning_rate": 1.2232773595138415e-07, + "loss": 0.4576, + "step": 5493 + }, + { + "epoch": 0.95, + "learning_rate": 1.2145418233524886e-07, + "loss": 0.4591, + "step": 5494 + }, + { + "epoch": 0.95, + "learning_rate": 1.2058373991219341e-07, + "loss": 0.4595, + "step": 5495 + }, + { + "epoch": 0.95, + "learning_rate": 1.197164089563596e-07, + "loss": 0.4454, + "step": 5496 + }, + { + "epoch": 0.95, + "learning_rate": 1.1885218974090895e-07, + "loss": 0.4451, + "step": 5497 + }, + { + "epoch": 0.95, + "learning_rate": 1.1799108253802149e-07, + "loss": 0.4504, + "step": 5498 + }, + { + "epoch": 0.95, + "learning_rate": 1.1713308761889696e-07, + "loss": 0.4535, + "step": 5499 + }, + { + "epoch": 0.95, + "learning_rate": 1.1627820525375811e-07, + "loss": 0.4496, + "step": 5500 + }, + { + "epoch": 0.95, + "learning_rate": 1.1542643571184619e-07, + "loss": 0.4506, + "step": 5501 + }, + { + "epoch": 0.95, + "learning_rate": 1.1457777926141889e-07, + "loss": 0.4528, + "step": 5502 + }, + { + "epoch": 0.95, + "learning_rate": 1.1373223616975681e-07, + "loss": 0.454, + "step": 5503 + }, + { + "epoch": 0.95, + "learning_rate": 1.1288980670315918e-07, + "loss": 0.4688, + "step": 5504 + }, + { + "epoch": 0.95, + "learning_rate": 1.1205049112694488e-07, + "loss": 0.4584, + "step": 5505 + }, + { + "epoch": 0.95, + "learning_rate": 1.1121428970545023e-07, + "loss": 0.4515, + "step": 5506 + }, + { + "epoch": 0.95, + "learning_rate": 1.1038120270203345e-07, + "loss": 0.4583, + "step": 5507 + }, + { + "epoch": 0.95, + "learning_rate": 1.0955123037907134e-07, + "loss": 0.4628, + "step": 5508 + }, + { + "epoch": 0.95, + "learning_rate": 1.0872437299795701e-07, + "loss": 0.4589, + "step": 5509 + }, + { + "epoch": 0.95, + "learning_rate": 1.079006308191055e-07, + "loss": 0.4698, + "step": 5510 + }, + { + "epoch": 0.95, + "learning_rate": 1.0708000410195041e-07, + "loss": 0.452, + "step": 5511 + }, + { + "epoch": 0.95, + "learning_rate": 1.0626249310494385e-07, + "loss": 0.4537, + "step": 5512 + }, + { + "epoch": 0.96, + "learning_rate": 1.0544809808555545e-07, + "loss": 0.4577, + "step": 5513 + }, + { + "epoch": 0.96, + "learning_rate": 1.0463681930027336e-07, + "loss": 0.4576, + "step": 5514 + }, + { + "epoch": 0.96, + "learning_rate": 1.0382865700460876e-07, + "loss": 0.4449, + "step": 5515 + }, + { + "epoch": 0.96, + "learning_rate": 1.030236114530847e-07, + "loss": 0.4702, + "step": 5516 + }, + { + "epoch": 0.96, + "learning_rate": 1.0222168289924616e-07, + "loss": 0.4356, + "step": 5517 + }, + { + "epoch": 0.96, + "learning_rate": 1.0142287159565778e-07, + "loss": 0.4542, + "step": 5518 + }, + { + "epoch": 0.96, + "learning_rate": 1.0062717779389942e-07, + "loss": 0.4467, + "step": 5519 + }, + { + "epoch": 0.96, + "learning_rate": 9.98346017445706e-08, + "loss": 0.4777, + "step": 5520 + }, + { + "epoch": 0.96, + "learning_rate": 9.904514369728724e-08, + "loss": 0.4518, + "step": 5521 + }, + { + "epoch": 0.96, + "learning_rate": 9.82588039006882e-08, + "loss": 0.4577, + "step": 5522 + }, + { + "epoch": 0.96, + "learning_rate": 9.74755826024254e-08, + "loss": 0.462, + "step": 5523 + }, + { + "epoch": 0.96, + "learning_rate": 9.669548004916817e-08, + "loss": 0.47, + "step": 5524 + }, + { + "epoch": 0.96, + "learning_rate": 9.591849648660779e-08, + "loss": 0.4533, + "step": 5525 + }, + { + "epoch": 0.96, + "learning_rate": 9.51446321594507e-08, + "loss": 0.4538, + "step": 5526 + }, + { + "epoch": 0.96, + "learning_rate": 9.437388731141861e-08, + "loss": 0.4458, + "step": 5527 + }, + { + "epoch": 0.96, + "learning_rate": 9.360626218525625e-08, + "loss": 0.4658, + "step": 5528 + }, + { + "epoch": 0.96, + "learning_rate": 9.284175702272246e-08, + "loss": 0.4435, + "step": 5529 + }, + { + "epoch": 0.96, + "learning_rate": 9.208037206459242e-08, + "loss": 0.445, + "step": 5530 + }, + { + "epoch": 0.96, + "learning_rate": 9.132210755066096e-08, + "loss": 0.4653, + "step": 5531 + }, + { + "epoch": 0.96, + "learning_rate": 9.056696371973928e-08, + "loss": 0.4495, + "step": 5532 + }, + { + "epoch": 0.96, + "learning_rate": 8.981494080965602e-08, + "loss": 0.4581, + "step": 5533 + }, + { + "epoch": 0.96, + "learning_rate": 8.906603905725619e-08, + "loss": 0.4604, + "step": 5534 + }, + { + "epoch": 0.96, + "learning_rate": 8.832025869840222e-08, + "loss": 0.4697, + "step": 5535 + }, + { + "epoch": 0.96, + "learning_rate": 8.757759996797399e-08, + "loss": 0.4452, + "step": 5536 + }, + { + "epoch": 0.96, + "learning_rate": 8.683806309986776e-08, + "loss": 0.4541, + "step": 5537 + }, + { + "epoch": 0.96, + "learning_rate": 8.610164832699608e-08, + "loss": 0.4456, + "step": 5538 + }, + { + "epoch": 0.96, + "learning_rate": 8.536835588128678e-08, + "loss": 0.4716, + "step": 5539 + }, + { + "epoch": 0.96, + "learning_rate": 8.463818599369067e-08, + "loss": 0.455, + "step": 5540 + }, + { + "epoch": 0.96, + "learning_rate": 8.391113889416713e-08, + "loss": 0.4659, + "step": 5541 + }, + { + "epoch": 0.96, + "learning_rate": 8.318721481169633e-08, + "loss": 0.45, + "step": 5542 + }, + { + "epoch": 0.96, + "learning_rate": 8.24664139742759e-08, + "loss": 0.4638, + "step": 5543 + }, + { + "epoch": 0.96, + "learning_rate": 8.174873660891536e-08, + "loss": 0.4472, + "step": 5544 + }, + { + "epoch": 0.96, + "learning_rate": 8.103418294164611e-08, + "loss": 0.4584, + "step": 5545 + }, + { + "epoch": 0.96, + "learning_rate": 8.032275319750926e-08, + "loss": 0.4668, + "step": 5546 + }, + { + "epoch": 0.96, + "learning_rate": 7.96144476005689e-08, + "loss": 0.465, + "step": 5547 + }, + { + "epoch": 0.96, + "learning_rate": 7.890926637390106e-08, + "loss": 0.4387, + "step": 5548 + }, + { + "epoch": 0.96, + "learning_rate": 7.820720973959694e-08, + "loss": 0.4637, + "step": 5549 + }, + { + "epoch": 0.96, + "learning_rate": 7.750827791876747e-08, + "loss": 0.4694, + "step": 5550 + }, + { + "epoch": 0.96, + "learning_rate": 7.681247113153655e-08, + "loss": 0.4576, + "step": 5551 + }, + { + "epoch": 0.96, + "learning_rate": 7.611978959704558e-08, + "loss": 0.4624, + "step": 5552 + }, + { + "epoch": 0.96, + "learning_rate": 7.543023353344892e-08, + "loss": 0.4565, + "step": 5553 + }, + { + "epoch": 0.96, + "learning_rate": 7.474380315791951e-08, + "loss": 0.4679, + "step": 5554 + }, + { + "epoch": 0.96, + "learning_rate": 7.406049868664445e-08, + "loss": 0.4649, + "step": 5555 + }, + { + "epoch": 0.96, + "learning_rate": 7.338032033482712e-08, + "loss": 0.4597, + "step": 5556 + }, + { + "epoch": 0.96, + "learning_rate": 7.270326831668617e-08, + "loss": 0.4514, + "step": 5557 + }, + { + "epoch": 0.96, + "learning_rate": 7.202934284545438e-08, + "loss": 0.4614, + "step": 5558 + }, + { + "epoch": 0.96, + "learning_rate": 7.135854413338194e-08, + "loss": 0.4515, + "step": 5559 + }, + { + "epoch": 0.96, + "learning_rate": 7.069087239173211e-08, + "loss": 0.4506, + "step": 5560 + }, + { + "epoch": 0.96, + "learning_rate": 7.002632783078445e-08, + "loss": 0.4676, + "step": 5561 + }, + { + "epoch": 0.96, + "learning_rate": 6.936491065983486e-08, + "loss": 0.4422, + "step": 5562 + }, + { + "epoch": 0.96, + "learning_rate": 6.870662108719117e-08, + "loss": 0.4611, + "step": 5563 + }, + { + "epoch": 0.96, + "learning_rate": 6.805145932017975e-08, + "loss": 0.4621, + "step": 5564 + }, + { + "epoch": 0.96, + "learning_rate": 6.73994255651389e-08, + "loss": 0.4611, + "step": 5565 + }, + { + "epoch": 0.96, + "learning_rate": 6.675052002742321e-08, + "loss": 0.4516, + "step": 5566 + }, + { + "epoch": 0.96, + "learning_rate": 6.610474291140257e-08, + "loss": 0.4512, + "step": 5567 + }, + { + "epoch": 0.96, + "learning_rate": 6.546209442046093e-08, + "loss": 0.4645, + "step": 5568 + }, + { + "epoch": 0.96, + "learning_rate": 6.482257475699526e-08, + "loss": 0.453, + "step": 5569 + }, + { + "epoch": 0.96, + "learning_rate": 6.418618412242116e-08, + "loss": 0.4429, + "step": 5570 + }, + { + "epoch": 0.97, + "learning_rate": 6.355292271716495e-08, + "loss": 0.4523, + "step": 5571 + }, + { + "epoch": 0.97, + "learning_rate": 6.292279074066821e-08, + "loss": 0.4651, + "step": 5572 + }, + { + "epoch": 0.97, + "learning_rate": 6.229578839138772e-08, + "loss": 0.4459, + "step": 5573 + }, + { + "epoch": 0.97, + "learning_rate": 6.167191586679556e-08, + "loss": 0.4407, + "step": 5574 + }, + { + "epoch": 0.97, + "learning_rate": 6.105117336337674e-08, + "loss": 0.4617, + "step": 5575 + }, + { + "epoch": 0.97, + "learning_rate": 6.043356107662823e-08, + "loss": 0.4538, + "step": 5576 + }, + { + "epoch": 0.97, + "learning_rate": 5.981907920106667e-08, + "loss": 0.4598, + "step": 5577 + }, + { + "epoch": 0.97, + "learning_rate": 5.920772793021945e-08, + "loss": 0.4442, + "step": 5578 + }, + { + "epoch": 0.97, + "learning_rate": 5.8599507456625907e-08, + "loss": 0.4693, + "step": 5579 + }, + { + "epoch": 0.97, + "learning_rate": 5.799441797184391e-08, + "loss": 0.4507, + "step": 5580 + }, + { + "epoch": 0.97, + "learning_rate": 5.739245966644102e-08, + "loss": 0.4531, + "step": 5581 + }, + { + "epoch": 0.97, + "learning_rate": 5.6793632730003375e-08, + "loss": 0.4542, + "step": 5582 + }, + { + "epoch": 0.97, + "learning_rate": 5.6197937351125664e-08, + "loss": 0.4596, + "step": 5583 + }, + { + "epoch": 0.97, + "learning_rate": 5.560537371742003e-08, + "loss": 0.4511, + "step": 5584 + }, + { + "epoch": 0.97, + "learning_rate": 5.501594201551164e-08, + "loss": 0.4645, + "step": 5585 + }, + { + "epoch": 0.97, + "learning_rate": 5.4429642431036435e-08, + "loss": 0.4548, + "step": 5586 + }, + { + "epoch": 0.97, + "learning_rate": 5.3846475148648936e-08, + "loss": 0.4547, + "step": 5587 + }, + { + "epoch": 0.97, + "learning_rate": 5.326644035201334e-08, + "loss": 0.4662, + "step": 5588 + }, + { + "epoch": 0.97, + "learning_rate": 5.268953822380796e-08, + "loss": 0.4589, + "step": 5589 + }, + { + "epoch": 0.97, + "learning_rate": 5.211576894572523e-08, + "loss": 0.4518, + "step": 5590 + }, + { + "epoch": 0.97, + "learning_rate": 5.154513269847061e-08, + "loss": 0.4601, + "step": 5591 + }, + { + "epoch": 0.97, + "learning_rate": 5.097762966176256e-08, + "loss": 0.4462, + "step": 5592 + }, + { + "epoch": 0.97, + "learning_rate": 5.041326001433366e-08, + "loss": 0.464, + "step": 5593 + }, + { + "epoch": 0.97, + "learning_rate": 4.985202393392841e-08, + "loss": 0.4583, + "step": 5594 + }, + { + "epoch": 0.97, + "learning_rate": 4.9293921597305396e-08, + "loss": 0.4461, + "step": 5595 + }, + { + "epoch": 0.97, + "learning_rate": 4.873895318023625e-08, + "loss": 0.4444, + "step": 5596 + }, + { + "epoch": 0.97, + "learning_rate": 4.818711885750338e-08, + "loss": 0.4643, + "step": 5597 + }, + { + "epoch": 0.97, + "learning_rate": 4.763841880290554e-08, + "loss": 0.4594, + "step": 5598 + }, + { + "epoch": 0.97, + "learning_rate": 4.7092853189252273e-08, + "loss": 0.4535, + "step": 5599 + }, + { + "epoch": 0.97, + "learning_rate": 4.655042218836725e-08, + "loss": 0.4511, + "step": 5600 + }, + { + "epoch": 0.97, + "learning_rate": 4.6011125971084924e-08, + "loss": 0.4572, + "step": 5601 + }, + { + "epoch": 0.97, + "learning_rate": 4.547496470725388e-08, + "loss": 0.4642, + "step": 5602 + }, + { + "epoch": 0.97, + "learning_rate": 4.49419385657357e-08, + "loss": 0.4607, + "step": 5603 + }, + { + "epoch": 0.97, + "learning_rate": 4.4412047714402774e-08, + "loss": 0.4552, + "step": 5604 + }, + { + "epoch": 0.97, + "learning_rate": 4.388529232014271e-08, + "loss": 0.457, + "step": 5605 + }, + { + "epoch": 0.97, + "learning_rate": 4.336167254885393e-08, + "loss": 0.4658, + "step": 5606 + }, + { + "epoch": 0.97, + "learning_rate": 4.2841188565446724e-08, + "loss": 0.474, + "step": 5607 + }, + { + "epoch": 0.97, + "learning_rate": 4.232384053384553e-08, + "loss": 0.4547, + "step": 5608 + }, + { + "epoch": 0.97, + "learning_rate": 4.1809628616985564e-08, + "loss": 0.454, + "step": 5609 + }, + { + "epoch": 0.97, + "learning_rate": 4.129855297681618e-08, + "loss": 0.4624, + "step": 5610 + }, + { + "epoch": 0.97, + "learning_rate": 4.0790613774295274e-08, + "loss": 0.4564, + "step": 5611 + }, + { + "epoch": 0.97, + "learning_rate": 4.028581116939823e-08, + "loss": 0.452, + "step": 5612 + }, + { + "epoch": 0.97, + "learning_rate": 3.978414532110897e-08, + "loss": 0.4578, + "step": 5613 + }, + { + "epoch": 0.97, + "learning_rate": 3.928561638742334e-08, + "loss": 0.447, + "step": 5614 + }, + { + "epoch": 0.97, + "learning_rate": 3.8790224525352416e-08, + "loss": 0.4691, + "step": 5615 + }, + { + "epoch": 0.97, + "learning_rate": 3.829796989091472e-08, + "loss": 0.4469, + "step": 5616 + }, + { + "epoch": 0.97, + "learning_rate": 3.780885263914402e-08, + "loss": 0.4586, + "step": 5617 + }, + { + "epoch": 0.97, + "learning_rate": 3.7322872924084876e-08, + "loss": 0.4491, + "step": 5618 + }, + { + "epoch": 0.97, + "learning_rate": 3.684003089879484e-08, + "loss": 0.4598, + "step": 5619 + }, + { + "epoch": 0.97, + "learning_rate": 3.636032671534229e-08, + "loss": 0.4636, + "step": 5620 + }, + { + "epoch": 0.97, + "learning_rate": 3.5883760524805244e-08, + "loss": 0.4654, + "step": 5621 + }, + { + "epoch": 0.97, + "learning_rate": 3.5410332477278096e-08, + "loss": 0.4607, + "step": 5622 + }, + { + "epoch": 0.97, + "learning_rate": 3.49400427218638e-08, + "loss": 0.4674, + "step": 5623 + }, + { + "epoch": 0.97, + "learning_rate": 3.447289140667609e-08, + "loss": 0.4592, + "step": 5624 + }, + { + "epoch": 0.97, + "learning_rate": 3.4008878678843946e-08, + "loss": 0.4642, + "step": 5625 + }, + { + "epoch": 0.97, + "learning_rate": 3.35480046845027e-08, + "loss": 0.4505, + "step": 5626 + }, + { + "epoch": 0.97, + "learning_rate": 3.309026956880512e-08, + "loss": 0.464, + "step": 5627 + }, + { + "epoch": 0.97, + "learning_rate": 3.2635673475910345e-08, + "loss": 0.4551, + "step": 5628 + }, + { + "epoch": 0.98, + "learning_rate": 3.218421654899162e-08, + "loss": 0.4552, + "step": 5629 + }, + { + "epoch": 0.98, + "learning_rate": 3.173589893023188e-08, + "loss": 0.4475, + "step": 5630 + }, + { + "epoch": 0.98, + "learning_rate": 3.1290720760828176e-08, + "loss": 0.4659, + "step": 5631 + }, + { + "epoch": 0.98, + "learning_rate": 3.0848682180985025e-08, + "loss": 0.4481, + "step": 5632 + }, + { + "epoch": 0.98, + "learning_rate": 3.040978332992106e-08, + "loss": 0.4626, + "step": 5633 + }, + { + "epoch": 0.98, + "learning_rate": 2.9974024345864604e-08, + "loss": 0.457, + "step": 5634 + }, + { + "epoch": 0.98, + "learning_rate": 2.9541405366054764e-08, + "loss": 0.4577, + "step": 5635 + }, + { + "epoch": 0.98, + "learning_rate": 2.9111926526744772e-08, + "loss": 0.4611, + "step": 5636 + }, + { + "epoch": 0.98, + "learning_rate": 2.8685587963194206e-08, + "loss": 0.4485, + "step": 5637 + }, + { + "epoch": 0.98, + "learning_rate": 2.826238980967788e-08, + "loss": 0.453, + "step": 5638 + }, + { + "epoch": 0.98, + "learning_rate": 2.7842332199478074e-08, + "loss": 0.4563, + "step": 5639 + }, + { + "epoch": 0.98, + "learning_rate": 2.7425415264890065e-08, + "loss": 0.4569, + "step": 5640 + }, + { + "epoch": 0.98, + "learning_rate": 2.7011639137221046e-08, + "loss": 0.4554, + "step": 5641 + }, + { + "epoch": 0.98, + "learning_rate": 2.6601003946784555e-08, + "loss": 0.4503, + "step": 5642 + }, + { + "epoch": 0.98, + "learning_rate": 2.6193509822910466e-08, + "loss": 0.4741, + "step": 5643 + }, + { + "epoch": 0.98, + "learning_rate": 2.578915689393613e-08, + "loss": 0.4402, + "step": 5644 + }, + { + "epoch": 0.98, + "learning_rate": 2.538794528720967e-08, + "loss": 0.4572, + "step": 5645 + }, + { + "epoch": 0.98, + "learning_rate": 2.4989875129091124e-08, + "loss": 0.4563, + "step": 5646 + }, + { + "epoch": 0.98, + "learning_rate": 2.4594946544949094e-08, + "loss": 0.4601, + "step": 5647 + }, + { + "epoch": 0.98, + "learning_rate": 2.420315965916631e-08, + "loss": 0.463, + "step": 5648 + }, + { + "epoch": 0.98, + "learning_rate": 2.3814514595132955e-08, + "loss": 0.4661, + "step": 5649 + }, + { + "epoch": 0.98, + "learning_rate": 2.3429011475250008e-08, + "loss": 0.4664, + "step": 5650 + }, + { + "epoch": 0.98, + "learning_rate": 2.304665042092924e-08, + "loss": 0.4606, + "step": 5651 + }, + { + "epoch": 0.98, + "learning_rate": 2.266743155259432e-08, + "loss": 0.457, + "step": 5652 + }, + { + "epoch": 0.98, + "learning_rate": 2.2291354989677492e-08, + "loss": 0.458, + "step": 5653 + }, + { + "epoch": 0.98, + "learning_rate": 2.19184208506229e-08, + "loss": 0.453, + "step": 5654 + }, + { + "epoch": 0.98, + "learning_rate": 2.154862925288326e-08, + "loss": 0.4644, + "step": 5655 + }, + { + "epoch": 0.98, + "learning_rate": 2.118198031292207e-08, + "loss": 0.4466, + "step": 5656 + }, + { + "epoch": 0.98, + "learning_rate": 2.0818474146212518e-08, + "loss": 0.4588, + "step": 5657 + }, + { + "epoch": 0.98, + "learning_rate": 2.045811086724192e-08, + "loss": 0.4617, + "step": 5658 + }, + { + "epoch": 0.98, + "learning_rate": 2.010089058950171e-08, + "loss": 0.4629, + "step": 5659 + }, + { + "epoch": 0.98, + "learning_rate": 1.9746813425498555e-08, + "loss": 0.4552, + "step": 5660 + }, + { + "epoch": 0.98, + "learning_rate": 1.9395879486745483e-08, + "loss": 0.4621, + "step": 5661 + }, + { + "epoch": 0.98, + "learning_rate": 1.9048088883767414e-08, + "loss": 0.453, + "step": 5662 + }, + { + "epoch": 0.98, + "learning_rate": 1.870344172610006e-08, + "loss": 0.4623, + "step": 5663 + }, + { + "epoch": 0.98, + "learning_rate": 1.8361938122287704e-08, + "loss": 0.4524, + "step": 5664 + }, + { + "epoch": 0.98, + "learning_rate": 1.8023578179884315e-08, + "loss": 0.4618, + "step": 5665 + }, + { + "epoch": 0.98, + "learning_rate": 1.7688362005454653e-08, + "loss": 0.4413, + "step": 5666 + }, + { + "epoch": 0.98, + "learning_rate": 1.7356289704574257e-08, + "loss": 0.4618, + "step": 5667 + }, + { + "epoch": 0.98, + "learning_rate": 1.7027361381826147e-08, + "loss": 0.4502, + "step": 5668 + }, + { + "epoch": 0.98, + "learning_rate": 1.6701577140805225e-08, + "loss": 0.4669, + "step": 5669 + }, + { + "epoch": 0.98, + "learning_rate": 1.6378937084114978e-08, + "loss": 0.4464, + "step": 5670 + }, + { + "epoch": 0.98, + "learning_rate": 1.6059441313369672e-08, + "loss": 0.4608, + "step": 5671 + }, + { + "epoch": 0.98, + "learning_rate": 1.5743089929193266e-08, + "loss": 0.4479, + "step": 5672 + }, + { + "epoch": 0.98, + "learning_rate": 1.5429883031217173e-08, + "loss": 0.4573, + "step": 5673 + }, + { + "epoch": 0.98, + "learning_rate": 1.511982071808471e-08, + "loss": 0.4414, + "step": 5674 + }, + { + "epoch": 0.98, + "learning_rate": 1.4812903087448872e-08, + "loss": 0.452, + "step": 5675 + }, + { + "epoch": 0.98, + "learning_rate": 1.4509130235971226e-08, + "loss": 0.4685, + "step": 5676 + }, + { + "epoch": 0.98, + "learning_rate": 1.420850225932302e-08, + "loss": 0.4814, + "step": 5677 + }, + { + "epoch": 0.98, + "learning_rate": 1.3911019252187408e-08, + "loss": 0.4546, + "step": 5678 + }, + { + "epoch": 0.98, + "learning_rate": 1.3616681308251667e-08, + "loss": 0.46, + "step": 5679 + }, + { + "epoch": 0.98, + "learning_rate": 1.332548852021831e-08, + "loss": 0.4617, + "step": 5680 + }, + { + "epoch": 0.98, + "learning_rate": 1.30374409797962e-08, + "loss": 0.4526, + "step": 5681 + }, + { + "epoch": 0.98, + "learning_rate": 1.2752538777704993e-08, + "loss": 0.4703, + "step": 5682 + }, + { + "epoch": 0.98, + "learning_rate": 1.24707820036718e-08, + "loss": 0.4595, + "step": 5683 + }, + { + "epoch": 0.98, + "learning_rate": 1.2192170746434529e-08, + "loss": 0.4681, + "step": 5684 + }, + { + "epoch": 0.98, + "learning_rate": 1.1916705093740766e-08, + "loss": 0.4605, + "step": 5685 + }, + { + "epoch": 0.99, + "learning_rate": 1.164438513234667e-08, + "loss": 0.4527, + "step": 5686 + }, + { + "epoch": 0.99, + "learning_rate": 1.1375210948019188e-08, + "loss": 0.4723, + "step": 5687 + }, + { + "epoch": 0.99, + "learning_rate": 1.1109182625531622e-08, + "loss": 0.4574, + "step": 5688 + }, + { + "epoch": 0.99, + "learning_rate": 1.0846300248668063e-08, + "loss": 0.4398, + "step": 5689 + }, + { + "epoch": 0.99, + "learning_rate": 1.0586563900222279e-08, + "loss": 0.4609, + "step": 5690 + }, + { + "epoch": 0.99, + "learning_rate": 1.0329973661996617e-08, + "loss": 0.4525, + "step": 5691 + }, + { + "epoch": 0.99, + "learning_rate": 1.0076529614804209e-08, + "loss": 0.4581, + "step": 5692 + }, + { + "epoch": 0.99, + "learning_rate": 9.82623183846343e-09, + "loss": 0.437, + "step": 5693 + }, + { + "epoch": 0.99, + "learning_rate": 9.579080411805664e-09, + "loss": 0.4612, + "step": 5694 + }, + { + "epoch": 0.99, + "learning_rate": 9.335075412669758e-09, + "loss": 0.462, + "step": 5695 + }, + { + "epoch": 0.99, + "learning_rate": 9.094216917903131e-09, + "loss": 0.4575, + "step": 5696 + }, + { + "epoch": 0.99, + "learning_rate": 8.85650500336288e-09, + "loss": 0.4613, + "step": 5697 + }, + { + "epoch": 0.99, + "learning_rate": 8.621939743916896e-09, + "loss": 0.4579, + "step": 5698 + }, + { + "epoch": 0.99, + "learning_rate": 8.390521213437197e-09, + "loss": 0.4488, + "step": 5699 + }, + { + "epoch": 0.99, + "learning_rate": 8.162249484809926e-09, + "loss": 0.4607, + "step": 5700 + } + ], + "logging_steps": 1.0, + "max_steps": 5772, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5700/vision_tower/config.json b/checkpoint-5700/vision_tower/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb3d0280a1f3ffca65b21f8638511fdc4cdc4404 --- /dev/null +++ b/checkpoint-5700/vision_tower/config.json @@ -0,0 +1,19 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/vision_tower", + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "patch_size": 14, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/checkpoint-5700/vision_tower/model.safetensors b/checkpoint-5700/vision_tower/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be5f88a49f854350a277e7410c9fac0cf146bf9c --- /dev/null +++ b/checkpoint-5700/vision_tower/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54df69cd28429bc24cb942fa555a88dae4b1805d97d039cb72282b16aa59de95 +size 856506120 diff --git a/checkpoint-5700/vision_tower/preprocessor_config.json b/checkpoint-5700/vision_tower/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f13134ed29056f82f3ab7e0246f0ab973e7ecf3 --- /dev/null +++ b/checkpoint-5700/vision_tower/preprocessor_config.json @@ -0,0 +1,24 @@ +{ + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "SiglipImageProcessor", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "SiglipProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } +} diff --git a/checkpoint-5700/zero_to_fp32.py b/checkpoint-5700/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c5246ff52274e1d6142001ccf085186d3545ce57 --- /dev/null +++ b/checkpoint-5700/zero_to_fp32.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage == 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dicts.append(torch.load(f, map_location=device)) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage == 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage == 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage == 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec510df2c4068ab6ffb2ec2ddb9b899dba4692cd --- /dev/null +++ b/config.json @@ -0,0 +1,253 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask", + "architectures": [ + "LlavaLlamaModel" + ], + "drop_path_rate": 0.0, + "hidden_size": 2560, + "image_aspect_ratio": "resize", + "interpolate_mode": "linear", + "llm_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/llm", + "add_cross_attention": false, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2560, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6912, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 4096, + "min_length": 0, + "model_max_length": 4096, + "model_type": "llama", + "no_repeat_ngram_size": 0, + "num_attention_heads": 20, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 32000 + }, + "mm_hidden_size": 1152, + "mm_projector_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/mm_projector", + "add_cross_attention": false, + "architectures": [ + "MultimodalProjector" + ], + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "mm_projector_lr": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "cls_patch", + "mm_vision_select_layer": -2, + "model_dtype": "torch.bfloat16", + "model_type": "llava_llama", + "num_video_frames": 8, + "resume_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask", + "s2": false, + "s2_max_split_size": 336, + "s2_scales": "336,672,1008", + "transformers_version": "4.36.2", + "tune_language_model": true, + "tune_mm_projector": true, + "tune_vision_tower": true, + "vision_resolution": -1, + "vision_tower_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/vision_tower", + "add_cross_attention": false, + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 384, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + } +} diff --git a/llm/config.json b/llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c57517e8304d2fc3c47317cc6f214e0927bb892 --- /dev/null +++ b/llm/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/llm", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 4096, + "model_max_length": 4096, + "model_type": "llama", + "num_attention_heads": 20, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/llm/generation_config.json b/llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f --- /dev/null +++ b/llm/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.36.2" +} diff --git a/llm/model-00001-of-00002.safetensors b/llm/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1fa422caf4fae8e0bd73939bc01e5e80ddb52da0 --- /dev/null +++ b/llm/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0643dac0d5297b93c78108073a85b9e2242d93e0ad2e2773fbdc5a5ecadabe5 +size 4974521464 diff --git a/llm/model-00002-of-00002.safetensors b/llm/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b57b958599b8aa0814eaac11c89d7a369565c05c --- /dev/null +++ b/llm/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629885d5424ea3bb4203261764d63120ece254b8f08ec9f488abad0cd83ce604 +size 428632856 diff --git a/llm/model.safetensors.index.json b/llm/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b173c9ac8194749df58c92051618c0ff74c4c20 --- /dev/null +++ b/llm/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 5403120640 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/llm/special_tokens_map.json b/llm/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/llm/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/llm/tokenizer.model b/llm/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3b7eab905db502ae7629c8a3c1f8412a3178c4c2 --- /dev/null +++ b/llm/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aedb3582ecda9fa99ee9242c17a9658f6744db083ee6ebdc8fb14857f84d220 +size 499723 diff --git a/llm/tokenizer_config.json b/llm/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..47ab96cd62cc374653a0ea0fb77f9457e0f53481 --- /dev/null +++ b/llm/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 4096, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/mm_projector/config.json b/mm_projector/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4893e3d03ada6a587c508fbe39bcbc93a4fde069 --- /dev/null +++ b/mm_projector/config.json @@ -0,0 +1,10 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/mm_projector", + "architectures": [ + "MultimodalProjector" + ], + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/mm_projector/model.safetensors b/mm_projector/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b95c1918af2fb52bee373a2fd13a8244b711dff4 --- /dev/null +++ b/mm_projector/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9682d0f9b77fbcb0161b2ce5796278d1c31a3845c5f72279370fa81215f1e450 +size 36729360 diff --git a/terminal.log b/terminal.log new file mode 100644 index 0000000000000000000000000000000000000000..81b3533d0f95154be1cdd566580bd43d2f6ae6ea --- /dev/null +++ b/terminal.log @@ -0,0 +1,27890 @@ +srun: job 6683204 queued and waiting for resources +srun: job 6683204 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-0010 +JobID: 6683204 | Full list: batch-block1-0010 batch-block1-10014 +NETWORK=Efficient-Large-Model/VILA1.5-3b +MASTER_ADDR=batch-block1-0010 +JobID: 6683204 | Full list: batch-block1-0010 batch-block1-10014 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,546] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,609] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,629] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:16,719] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,712] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 13:42:17,713] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 13:42:17,713] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( + Fetching 17 files: 0%| | 0/17 [00:00\nWould this person be more likely to be a type a or b person?\nAnswer the question using a single word or phrase.'}, {'from': 'gpt', 'value': ''}]] (ignored) + 3%|▎ | 156/5772 [16:38<9:51:35, 6.32s/it] 3%|▎ | 156/5772 [16:31<9:51:36, 6.32s/it] {'loss': 0.5273, 'learning_rate': 1.7931034482758623e-05, 'epoch': 0.03} + 3%|▎ | 156/5772 [16:38<9:51:35, 6.32s/it] {'loss': 0.5273, 'learning_rate': 1.7931034482758623e-05, 'epoch': 0.03} + 3%|▎ | 156/5772 [16:31<9:51:36, 6.32s/it] 3%|▎ | 157/5772 [16:37<9:37:44, 6.17s/it] 3%|▎ | 157/5772 [16:44<9:37:45, 6.17s/it] {'loss': 0.5355, 'learning_rate': 1.8045977011494254e-05, 'epoch': 0.03} + 3%|▎ | 157/5772 [16:44<9:37:45, 6.17s/it] {'loss': 0.5355, 'learning_rate': 1.8045977011494254e-05, 'epoch': 0.03} + 3%|▎ | 157/5772 [16:37<9:37:44, 6.17s/it] 3%|▎ | 158/5772 [16:43<9:31:27, 6.11s/it] 3%|▎ | 158/5772 [16:50<9:31:28, 6.11s/it] {'loss': 0.5371, 'learning_rate': 1.8160919540229885e-05, 'epoch': 0.03} + 3%|▎ | 158/5772 [16:50<9:31:28, 6.11s/it] {'loss': 0.5371, 'learning_rate': 1.8160919540229885e-05, 'epoch': 0.03} + 3%|▎ | 158/5772 [16:43<9:31:27, 6.11s/it] 3%|▎ | 159/5772 [16:56<9:34:12, 6.14s/it] 3%|▎ | 159/5772 [16:49<9:34:12, 6.14s/it] {'loss': 0.5453, 'learning_rate': 1.827586206896552e-05, 'epoch': 0.03} + 3%|▎ | 159/5772 [16:56<9:34:12, 6.14s/it] {'loss': 0.5453, 'learning_rate': 1.827586206896552e-05, 'epoch': 0.03} + 3%|▎ | 159/5772 [16:49<9:34:12, 6.14s/it] 3%|▎ | 160/5772 [17:02<9:36:56, 6.17s/it] 3%|▎ | 160/5772 [16:55<9:36:56, 6.17s/it] {'loss': 0.5329, 'learning_rate': 1.839080459770115e-05, 'epoch': 0.03} + 3%|▎ | 160/5772 [17:02<9:36:56, 6.17s/it] {'loss': 0.5329, 'learning_rate': 1.839080459770115e-05, 'epoch': 0.03} + 3%|▎ | 160/5772 [16:55<9:36:56, 6.17s/it] 3%|▎ | 161/5772 [17:08<9:33:20, 6.13s/it] 3%|▎ | 161/5772 [17:01<9:33:20, 6.13s/it] {'loss': 0.5241, 'learning_rate': 1.8505747126436784e-05, 'epoch': 0.03} + 3%|▎ | 161/5772 [17:08<9:33:20, 6.13s/it] {'loss': 0.5241, 'learning_rate': 1.8505747126436784e-05, 'epoch': 0.03} + 3%|▎ | 161/5772 [17:01<9:33:20, 6.13s/it] 3%|▎ | 162/5772 [17:15<9:33:50, 6.14s/it] 3%|▎ | 162/5772 [17:07<9:33:50, 6.14s/it] {'loss': 0.5136, 'learning_rate': 1.8620689655172415e-05, 'epoch': 0.03} + 3%|▎ | 162/5772 [17:15<9:33:50, 6.14s/it] {'loss': 0.5136, 'learning_rate': 1.8620689655172415e-05, 'epoch': 0.03} + 3%|▎ | 162/5772 [17:07<9:33:50, 6.14s/it] 3%|▎ | 163/5772 [17:21<9:44:13, 6.25s/it] 3%|▎ | 163/5772 [17:14<9:44:13, 6.25s/it] {'loss': 0.5355, 'learning_rate': 1.873563218390805e-05, 'epoch': 0.03} + 3%|▎ | 163/5772 [17:21<9:44:13, 6.25s/it] {'loss': 0.5355, 'learning_rate': 1.873563218390805e-05, 'epoch': 0.03} + 3%|▎ | 163/5772 [17:14<9:44:13, 6.25s/it] 3%|▎ | 164/5772 [17:27<9:43:49, 6.25s/it] 3%|▎ | 164/5772 [17:20<9:43:49, 6.25s/it] {'loss': 0.5372, 'learning_rate': 1.885057471264368e-05, 'epoch': 0.03} + 3%|▎ | 164/5772 [17:27<9:43:49, 6.25s/it] {'loss': 0.5372, 'learning_rate': 1.885057471264368e-05, 'epoch': 0.03} + 3%|▎ | 164/5772 [17:20<9:43:49, 6.25s/it] 3%|▎ | 165/5772 [17:33<9:41:56, 6.23s/it] 3%|▎ | 165/5772 [17:26<9:41:56, 6.23s/it] {'loss': 0.5187, 'learning_rate': 1.896551724137931e-05, 'epoch': 0.03} + 3%|▎ | 165/5772 [17:33<9:41:56, 6.23s/it] {'loss': 0.5187, 'learning_rate': 1.896551724137931e-05, 'epoch': 0.03} + 3%|▎ | 165/5772 [17:26<9:41:56, 6.23s/it] 3%|▎ | 166/5772 [17:40<9:39:21, 6.20s/it] 3%|▎ | 166/5772 [17:33<9:39:21, 6.20s/it] {'loss': 0.5368, 'learning_rate': 1.908045977011494e-05, 'epoch': 0.03} + 3%|▎ | 166/5772 [17:40<9:39:21, 6.20s/it] {'loss': 0.5368, 'learning_rate': 1.908045977011494e-05, 'epoch': 0.03} + 3%|▎ | 166/5772 [17:33<9:39:21, 6.20s/it] 3%|▎ | 167/5772 [17:46<9:36:28, 6.17s/it] 3%|▎ | 167/5772 [17:39<9:36:28, 6.17s/it] {'loss': 0.5113, 'learning_rate': 1.9195402298850576e-05, 'epoch': 0.03} + 3%|▎ | 167/5772 [17:46<9:36:28, 6.17s/it] {'loss': 0.5113, 'learning_rate': 1.9195402298850576e-05, 'epoch': 0.03} + 3%|▎ | 167/5772 [17:39<9:36:28, 6.17s/it] 3%|▎ | 168/5772 [17:52<9:37:04, 6.18s/it] 3%|▎ | 168/5772 [17:45<9:37:04, 6.18s/it] {'loss': 0.5277, 'learning_rate': 1.931034482758621e-05, 'epoch': 0.03} + 3%|▎ | 168/5772 [17:52<9:37:04, 6.18s/it] {'loss': 0.5277, 'learning_rate': 1.931034482758621e-05, 'epoch': 0.03} + 3%|▎ | 168/5772 [17:45<9:37:04, 6.18s/it] 3%|▎ | 169/5772 [17:58<9:33:29, 6.14s/it] 3%|▎ | 169/5772 [17:51<9:33:29, 6.14s/it] {'loss': 0.5277, 'learning_rate': 1.942528735632184e-05, 'epoch': 0.03} + 3%|▎ | 169/5772 [17:58<9:33:29, 6.14s/it] {'loss': 0.5277, 'learning_rate': 1.942528735632184e-05, 'epoch': 0.03} + 3%|▎ | 169/5772 [17:51<9:33:29, 6.14s/it] 3%|▎ | 170/5772 [18:04<9:31:28, 6.12s/it] 3%|▎ | 170/5772 [17:57<9:31:28, 6.12s/it] {'loss': 0.5299, 'learning_rate': 1.9540229885057475e-05, 'epoch': 0.03} + 3%|▎ | 170/5772 [18:04<9:31:28, 6.12s/it] {'loss': 0.5299, 'learning_rate': 1.9540229885057475e-05, 'epoch': 0.03} + 3%|▎ | 170/5772 [17:57<9:31:28, 6.12s/it] 3%|▎ | 171/5772 [18:10<9:30:29, 6.11s/it] 3%|▎ | 171/5772 [18:03<9:30:29, 6.11s/it] {'loss': 0.5241, 'learning_rate': 1.9655172413793106e-05, 'epoch': 0.03} + 3%|▎ | 171/5772 [18:10<9:30:29, 6.11s/it] {'loss': 0.5241, 'learning_rate': 1.9655172413793106e-05, 'epoch': 0.03} + 3%|▎ | 171/5772 [18:03<9:30:29, 6.11s/it] 3%|▎ | 172/5772 [18:16<9:31:14, 6.12s/it] 3%|▎ | 172/5772 [18:09<9:31:16, 6.12s/it] {'loss': 0.5478, 'learning_rate': 1.9770114942528737e-05, 'epoch': 0.03} + 3%|▎ | 172/5772 [18:16<9:31:14, 6.12s/it] {'loss': 0.5478, 'learning_rate': 1.9770114942528737e-05, 'epoch': 0.03} + 3%|▎ | 172/5772 [18:09<9:31:16, 6.12s/it] 3%|▎ | 173/5772 [18:22<9:34:39, 6.16s/it] 3%|▎ | 173/5772 [18:15<9:34:39, 6.16s/it] {'loss': 0.5343, 'learning_rate': 1.9885057471264367e-05, 'epoch': 0.03} + 3%|▎ | 173/5772 [18:22<9:34:39, 6.16s/it] {'loss': 0.5343, 'learning_rate': 1.9885057471264367e-05, 'epoch': 0.03} + 3%|▎ | 173/5772 [18:15<9:34:39, 6.16s/it] 3%|▎ | 174/5772 [18:29<9:33:23, 6.15s/it] 3%|▎ | 174/5772 [18:22<9:33:23, 6.15s/it] {'loss': 0.5179, 'learning_rate': 2e-05, 'epoch': 0.03} + 3%|▎ | 174/5772 [18:29<9:33:23, 6.15s/it] {'loss': 0.5179, 'learning_rate': 2e-05, 'epoch': 0.03} + 3%|▎ | 174/5772 [18:22<9:33:23, 6.15s/it] 3%|▎ | 175/5772 [18:35<9:35:06, 6.17s/it] 3%|▎ | 175/5772 [18:28<9:35:06, 6.17s/it] {'loss': 0.5239, 'learning_rate': 1.99999984252778e-05, 'epoch': 0.03} + 3%|▎ | 175/5772 [18:35<9:35:06, 6.17s/it] {'loss': 0.5239, 'learning_rate': 1.99999984252778e-05, 'epoch': 0.03} + 3%|▎ | 175/5772 [18:28<9:35:06, 6.17s/it] 3%|▎ | 176/5772 [18:41<9:34:48, 6.16s/it] 3%|▎ | 176/5772 [18:34<9:34:47, 6.16s/it] {'loss': 0.5314, 'learning_rate': 1.9999993701111697e-05, 'epoch': 0.03} + 3%|▎ | 176/5772 [18:41<9:34:48, 6.16s/it] {'loss': 0.5314, 'learning_rate': 1.9999993701111697e-05, 'epoch': 0.03} + 3%|▎ | 176/5772 [18:34<9:34:47, 6.16s/it] 3%|▎ | 177/5772 [18:48<9:52:22, 6.35s/it] 3%|▎ | 177/5772 [18:41<9:52:22, 6.35s/it] {'loss': 0.535, 'learning_rate': 1.9999985827503177e-05, 'epoch': 0.03} + 3%|▎ | 177/5772 [18:48<9:52:22, 6.35s/it] {'loss': 0.535, 'learning_rate': 1.9999985827503177e-05, 'epoch': 0.03} + 3%|▎ | 177/5772 [18:41<9:52:22, 6.35s/it] 3%|▎ | 178/5772 [18:54<9:45:55, 6.28s/it] 3%|▎ | 178/5772 [18:47<9:45:55, 6.28s/it] {'loss': 0.5273, 'learning_rate': 1.9999974804454722e-05, 'epoch': 0.03} + 3%|▎ | 178/5772 [18:54<9:45:55, 6.28s/it] {'loss': 0.5273, 'learning_rate': 1.9999974804454722e-05, 'epoch': 0.03} + 3%|▎ | 178/5772 [18:47<9:45:55, 6.28s/it] 3%|▎ | 179/5772 [19:00<9:46:38, 6.29s/it] 3%|▎ | 179/5772 [18:53<9:46:38, 6.29s/it] {'loss': 0.5348, 'learning_rate': 1.99999606319698e-05, 'epoch': 0.03} + 3%|▎ | 179/5772 [19:00<9:46:38, 6.29s/it] {'loss': 0.5348, 'learning_rate': 1.99999606319698e-05, 'epoch': 0.03} + 3%|▎ | 179/5772 [18:53<9:46:38, 6.29s/it] 3%|▎ | 180/5772 [19:06<9:33:27, 6.15s/it] 3%|▎ | 180/5772 [18:59<9:33:27, 6.15s/it] {'loss': 0.516, 'learning_rate': 1.999994331005288e-05, 'epoch': 0.03} + {'loss': 0.516, 'learning_rate': 1.999994331005288e-05, 'epoch': 0.03} + 3%|▎ | 180/5772 [19:06<9:33:27, 6.15s/it] 3%|▎ | 180/5772 [18:59<9:33:27, 6.15s/it] 3%|▎ | 181/5772 [19:12<9:31:27, 6.13s/it] 3%|▎ | 181/5772 [19:05<9:31:27, 6.13s/it] {'loss': 0.523, 'learning_rate': 1.9999922838709414e-05, 'epoch': 0.03} + 3%|▎ | 181/5772 [19:12<9:31:27, 6.13s/it] {'loss': 0.523, 'learning_rate': 1.9999922838709414e-05, 'epoch': 0.03} + 3%|▎ | 181/5772 [19:05<9:31:27, 6.13s/it] 3%|▎ | 182/5772 [19:18<9:31:35, 6.14s/it] 3%|▎ | 182/5772 [19:11<9:31:35, 6.14s/it] {'loss': 0.5273, 'learning_rate': 1.9999899217945845e-05, 'epoch': 0.03} + 3%|▎ | 182/5772 [19:18<9:31:35, 6.14s/it] {'loss': 0.5273, 'learning_rate': 1.9999899217945845e-05, 'epoch': 0.03} + 3%|▎ | 182/5772 [19:11<9:31:35, 6.14s/it] 3%|▎ | 183/5772 [19:24<9:33:09, 6.15s/it] 3%|▎ | 183/5772 [19:17<9:33:09, 6.15s/it] {'loss': 0.526, 'learning_rate': 1.9999872447769624e-05, 'epoch': 0.03} + 3%|▎ | 183/5772 [19:24<9:33:09, 6.15s/it] {'loss': 0.526, 'learning_rate': 1.9999872447769624e-05, 'epoch': 0.03} + 3%|▎ | 183/5772 [19:17<9:33:09, 6.15s/it] 3%|▎ | 184/5772 [19:24<9:41:46, 6.25s/it] 3%|▎ | 184/5772 [19:31<9:41:46, 6.25s/it] {'loss': 0.5228, 'learning_rate': 1.999984252818917e-05, 'epoch': 0.03} + 3%|▎ | 184/5772 [19:31<9:41:46, 6.25s/it] {'loss': 0.5228, 'learning_rate': 1.999984252818917e-05, 'epoch': 0.03} + 3%|▎ | 184/5772 [19:24<9:41:46, 6.25s/it] 3%|▎ | 185/5772 [19:37<9:33:52, 6.16s/it] 3%|▎ | 185/5772 [19:30<9:33:52, 6.16s/it] {'loss': 0.5195, 'learning_rate': 1.9999809459213914e-05, 'epoch': 0.03} + 3%|▎ | 185/5772 [19:37<9:33:52, 6.16s/it] {'loss': 0.5195, 'learning_rate': 1.9999809459213914e-05, 'epoch': 0.03} + 3%|▎ | 185/5772 [19:30<9:33:52, 6.16s/it] 3%|▎ | 186/5772 [19:43<9:32:15, 6.15s/it] 3%|▎ | 186/5772 [19:36<9:32:15, 6.15s/it] {'loss': 0.5289, 'learning_rate': 1.9999773240854266e-05, 'epoch': 0.03} + 3%|▎ | 186/5772 [19:43<9:32:15, 6.15s/it] {'loss': 0.5289, 'learning_rate': 1.9999773240854266e-05, 'epoch': 0.03} + 3%|▎ | 186/5772 [19:36<9:32:15, 6.15s/it] 3%|▎ | 187/5772 [19:49<9:19:23, 6.01s/it] 3%|▎ | 187/5772 [19:42<9:19:23, 6.01s/it] {'loss': 0.5073, 'learning_rate': 1.9999733873121638e-05, 'epoch': 0.03} + 3%|▎ | 187/5772 [19:49<9:19:23, 6.01s/it] {'loss': 0.5073, 'learning_rate': 1.9999733873121638e-05, 'epoch': 0.03} + 3%|▎ | 187/5772 [19:42<9:19:23, 6.01s/it] 3%|▎ | 188/5772 [19:54<9:13:47, 5.95s/it] 3%|▎ | 188/5772 [19:47<9:13:47, 5.95s/it] {'loss': 0.5311, 'learning_rate': 1.9999691356028422e-05, 'epoch': 0.03} + 3%|▎ | 188/5772 [19:54<9:13:47, 5.95s/it] {'loss': 0.5311, 'learning_rate': 1.9999691356028422e-05, 'epoch': 0.03} + 3%|▎ | 188/5772 [19:47<9:13:47, 5.95s/it] 3%|▎ | 189/5772 [20:01<9:21:32, 6.03s/it] 3%|▎ | 189/5772 [19:54<9:21:32, 6.03s/it] {'loss': 0.5266, 'learning_rate': 1.999964568958801e-05, 'epoch': 0.03} + 3%|▎ | 189/5772 [20:01<9:21:32, 6.03s/it] {'loss': 0.5266, 'learning_rate': 1.999964568958801e-05, 'epoch': 0.03} + 3%|▎ | 189/5772 [19:54<9:21:32, 6.03s/it] 3%|▎ | 190/5772 [20:07<9:39:51, 6.23s/it] 3%|▎ | 190/5772 [20:00<9:39:51, 6.23s/it] {'loss': 0.5216, 'learning_rate': 1.999959687381479e-05, 'epoch': 0.03} + 3%|▎ | 190/5772 [20:07<9:39:51, 6.23s/it] {'loss': 0.5216, 'learning_rate': 1.999959687381479e-05, 'epoch': 0.03} + 3%|▎ | 190/5772 [20:00<9:39:51, 6.23s/it] 3%|▎ | 191/5772 [20:14<9:40:12, 6.24s/it] 3%|▎ | 191/5772 [20:07<9:40:12, 6.24s/it] {'loss': 0.5132, 'learning_rate': 1.999954490872413e-05, 'epoch': 0.03} + 3%|▎ | 191/5772 [20:14<9:40:12, 6.24s/it] {'loss': 0.5132, 'learning_rate': 1.999954490872413e-05, 'epoch': 0.03} + 3%|▎ | 191/5772 [20:07<9:40:12, 6.24s/it] 3%|▎ | 192/5772 [20:20<9:42:19, 6.26s/it] 3%|▎ | 192/5772 [20:13<9:42:19, 6.26s/it] {'loss': 0.5388, 'learning_rate': 1.9999489794332404e-05, 'epoch': 0.03} + 3%|▎ | 192/5772 [20:20<9:42:19, 6.26s/it] {'loss': 0.5388, 'learning_rate': 1.9999489794332404e-05, 'epoch': 0.03} + 3%|▎ | 192/5772 [20:13<9:42:19, 6.26s/it] 3%|▎ | 193/5772 [20:26<9:35:15, 6.19s/it] 3%|▎ | 193/5772 [20:19<9:35:15, 6.19s/it] {'loss': 0.532, 'learning_rate': 1.9999431530656958e-05, 'epoch': 0.03} + 3%|▎ | 193/5772 [20:26<9:35:15, 6.19s/it] {'loss': 0.532, 'learning_rate': 1.9999431530656958e-05, 'epoch': 0.03} + 3%|▎ | 193/5772 [20:19<9:35:15, 6.19s/it] 3%|▎ | 194/5772 [20:25<9:34:48, 6.18s/it] 3%|▎ | 194/5772 [20:32<9:34:48, 6.18s/it] {'loss': 0.5262, 'learning_rate': 1.999937011771615e-05, 'epoch': 0.03} + 3%|▎ | 194/5772 [20:32<9:34:48, 6.18s/it] {'loss': 0.5262, 'learning_rate': 1.999937011771615e-05, 'epoch': 0.03} + 3%|▎ | 194/5772 [20:25<9:34:48, 6.18s/it] 3%|▎ | 195/5772 [20:39<9:39:46, 6.24s/it] 3%|▎ | 195/5772 [20:31<9:39:46, 6.24s/it] {'loss': 0.5378, 'learning_rate': 1.9999305555529324e-05, 'epoch': 0.03} + 3%|▎ | 195/5772 [20:39<9:39:46, 6.24s/it] {'loss': 0.5378, 'learning_rate': 1.9999305555529324e-05, 'epoch': 0.03} + 3%|▎ | 195/5772 [20:31<9:39:46, 6.24s/it] 3%|▎ | 196/5772 [20:45<9:35:18, 6.19s/it] 3%|▎ | 196/5772 [20:38<9:35:18, 6.19s/it] {'loss': 0.5188, 'learning_rate': 1.9999237844116807e-05, 'epoch': 0.03} + 3%|▎ | 196/5772 [20:45<9:35:18, 6.19s/it] {'loss': 0.5188, 'learning_rate': 1.9999237844116807e-05, 'epoch': 0.03} + 3%|▎ | 196/5772 [20:38<9:35:18, 6.19s/it] 3%|▎ | 197/5772 [20:51<9:32:11, 6.16s/it] 3%|▎ | 197/5772 [20:44<9:32:11, 6.16s/it] {'loss': 0.5267, 'learning_rate': 1.9999166983499923e-05, 'epoch': 0.03} + 3%|▎ | 197/5772 [20:51<9:32:11, 6.16s/it] {'loss': 0.5267, 'learning_rate': 1.9999166983499923e-05, 'epoch': 0.03} + 3%|▎ | 197/5772 [20:44<9:32:11, 6.16s/it] 3%|▎ | 198/5772 [20:57<9:25:22, 6.09s/it] 3%|▎ | 198/5772 [20:50<9:25:22, 6.09s/it] {'loss': 0.519, 'learning_rate': 1.9999092973701e-05, 'epoch': 0.03} + 3%|▎ | 198/5772 [20:57<9:25:22, 6.09s/it] {'loss': 0.519, 'learning_rate': 1.9999092973701e-05, 'epoch': 0.03} + 3%|▎ | 198/5772 [20:50<9:25:22, 6.09s/it] 3%|▎ | 199/5772 [21:03<9:20:46, 6.04s/it] 3%|▎ | 199/5772 [20:55<9:20:46, 6.04s/it] {'loss': 0.5109, 'learning_rate': 1.9999015814743337e-05, 'epoch': 0.03} + 3%|▎ | 199/5772 [21:03<9:20:46, 6.04s/it] {'loss': 0.5109, 'learning_rate': 1.9999015814743337e-05, 'epoch': 0.03} + 3%|▎ | 199/5772 [20:55<9:20:46, 6.04s/it]10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 3%|▎ | 200/5772 [21:09<9:22:20, 6.06s/it]4 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... + 3%|▎ | 200/5772 [21:02<9:22:21, 6.06s/it]1 AutoResumeHook: Checking whether to suspend... +149 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5264, 'learning_rate': 1.999893550665124e-05, 'epoch': 0.03} + 3%|▎ | 200/5772 [21:09<9:22:20, 6.06s/it] {'loss': 0.5264, 'learning_rate': 1.999893550665124e-05, 'epoch': 0.03} + 3%|▎ | 200/5772 [21:02<9:22:21, 6.06s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 3%|▎ | 201/5772 [21:29<15:49:38, 10.23s/it] 3%|▎ | 201/5772 [21:22<15:49:38, 10.23s/it] {'loss': 0.5141, 'learning_rate': 1.9998852049449998e-05, 'epoch': 0.03} + 3%|▎ | 201/5772 [21:29<15:49:38, 10.23s/it] {'loss': 0.5141, 'learning_rate': 1.9998852049449998e-05, 'epoch': 0.03} + 3%|▎ | 201/5772 [21:22<15:49:38, 10.23s/it] 3%|▎ | 202/5772 [21:35<13:58:06, 9.03s/it] 3%|▎ | 202/5772 [21:28<13:58:06, 9.03s/it] {'loss': 0.5398, 'learning_rate': 1.9998765443165896e-05, 'epoch': 0.03} + 3%|▎ | 202/5772 [21:35<13:58:06, 9.03s/it] {'loss': 0.5398, 'learning_rate': 1.9998765443165896e-05, 'epoch': 0.03} + 3%|▎ | 202/5772 [21:28<13:58:06, 9.03s/it] 4%|▎ | 203/5772 [21:41<12:37:44, 8.16s/it] 4%|▎ | 203/5772 [21:34<12:37:44, 8.16s/it] {'loss': 0.517, 'learning_rate': 1.9998675687826214e-05, 'epoch': 0.04} + {'loss': 0.517, 'learning_rate': 1.9998675687826214e-05, 'epoch': 0.04} 4%|▎ | 203/5772 [21:41<12:37:44, 8.16s/it] + 4%|▎ | 203/5772 [21:34<12:37:44, 8.16s/it] 4%|▎ | 204/5772 [21:47<11:38:56, 7.53s/it] 4%|▎ | 204/5772 [21:40<11:39:06, 7.53s/it] {'loss': 0.5293, 'learning_rate': 1.9998582783459214e-05, 'epoch': 0.04} + 4%|▎ | 204/5772 [21:47<11:38:56, 7.53s/it] {'loss': 0.5293, 'learning_rate': 1.9998582783459214e-05, 'epoch': 0.04} + 4%|▎ | 204/5772 [21:40<11:39:06, 7.53s/it] 4%|▎ | 205/5772 [21:53<11:03:38, 7.15s/it] 4%|▎ | 205/5772 [21:46<11:03:36, 7.15s/it] {'loss': 0.5163, 'learning_rate': 1.9998486730094157e-05, 'epoch': 0.04} + 4%|▎ | 205/5772 [21:53<11:03:38, 7.15s/it] {'loss': 0.5163, 'learning_rate': 1.9998486730094157e-05, 'epoch': 0.04} + 4%|▎ | 205/5772 [21:46<11:03:36, 7.15s/it] 4%|▎ | 206/5772 [22:00<10:39:52, 6.90s/it] 4%|▎ | 206/5772 [21:53<10:39:51, 6.90s/it] {'loss': 0.5251, 'learning_rate': 1.99983875277613e-05, 'epoch': 0.04} + 4%|▎ | 206/5772 [22:00<10:39:52, 6.90s/it] {'loss': 0.5251, 'learning_rate': 1.99983875277613e-05, 'epoch': 0.04} + 4%|▎ | 206/5772 [21:53<10:39:51, 6.90s/it] 4%|▎ | 207/5772 [22:06<10:18:35, 6.67s/it] 4%|▎ | 207/5772 [21:59<10:18:34, 6.67s/it] {'loss': 0.5183, 'learning_rate': 1.9998285176491878e-05, 'epoch': 0.04} + 4%|▎ | 207/5772 [22:06<10:18:35, 6.67s/it] {'loss': 0.5183, 'learning_rate': 1.9998285176491878e-05, 'epoch': 0.04} + 4%|▎ | 207/5772 [21:59<10:18:34, 6.67s/it] 4%|▎ | 208/5772 [22:12<10:07:41, 6.55s/it] 4%|▎ | 208/5772 [22:05<10:07:41, 6.55s/it] {'loss': 0.528, 'learning_rate': 1.9998179676318133e-05, 'epoch': 0.04} + 4%|▎ | 208/5772 [22:12<10:07:41, 6.55s/it] {'loss': 0.528, 'learning_rate': 1.9998179676318133e-05, 'epoch': 0.04} + 4%|▎ | 208/5772 [22:05<10:07:41, 6.55s/it] 4%|▎ | 209/5772 [22:19<10:07:34, 6.55s/it] 4%|▎ | 209/5772 [22:12<10:07:33, 6.55s/it] {'loss': 0.5123, 'learning_rate': 1.999807102727329e-05, 'epoch': 0.04} + 4%|▎ | 209/5772 [22:19<10:07:34, 6.55s/it] {'loss': 0.5123, 'learning_rate': 1.999807102727329e-05, 'epoch': 0.04} + 4%|▎ | 209/5772 [22:12<10:07:33, 6.55s/it] 4%|▎ | 210/5772 [22:25<9:56:49, 6.44s/it] 4%|▎ | 210/5772 [22:18<9:56:49, 6.44s/it] {'loss': 0.5251, 'learning_rate': 1.9997959229391567e-05, 'epoch': 0.04} + 4%|▎ | 210/5772 [22:25<9:56:49, 6.44s/it] {'loss': 0.5251, 'learning_rate': 1.9997959229391567e-05, 'epoch': 0.04} + 4%|▎ | 210/5772 [22:18<9:56:49, 6.44s/it] 4%|▎ | 211/5772 [22:31<9:41:54, 6.28s/it] 4%|▎ | 211/5772 [22:24<9:41:53, 6.28s/it] {'loss': 0.5262, 'learning_rate': 1.9997844282708173e-05, 'epoch': 0.04} + 4%|▎ | 211/5772 [22:31<9:41:54, 6.28s/it] {'loss': 0.5262, 'learning_rate': 1.9997844282708173e-05, 'epoch': 0.04} + 4%|▎ | 211/5772 [22:24<9:41:53, 6.28s/it] 4%|▎ | 212/5772 [22:37<9:36:03, 6.22s/it] 4%|▎ | 212/5772 [22:30<9:36:03, 6.22s/it] {'loss': 0.5303, 'learning_rate': 1.9997726187259307e-05, 'epoch': 0.04} + 4%|▎ | 212/5772 [22:37<9:36:03, 6.22s/it] {'loss': 0.5303, 'learning_rate': 1.9997726187259307e-05, 'epoch': 0.04} + 4%|▎ | 212/5772 [22:30<9:36:03, 6.22s/it] 4%|▎ | 213/5772 [22:43<9:31:55, 6.17s/it] 4%|▎ | 213/5772 [22:36<9:31:56, 6.17s/it] {'loss': 0.5351, 'learning_rate': 1.999760494308217e-05, 'epoch': 0.04} + 4%|▎ | 213/5772 [22:43<9:31:55, 6.17s/it] {'loss': 0.5351, 'learning_rate': 1.999760494308217e-05, 'epoch': 0.04} + 4%|▎ | 213/5772 [22:36<9:31:56, 6.17s/it] 4%|▎ | 214/5772 [22:49<9:36:10, 6.22s/it] 4%|▎ | 214/5772 [22:42<9:36:09, 6.22s/it] {'loss': 0.5298, 'learning_rate': 1.9997480550214942e-05, 'epoch': 0.04} + 4%|▎ | 214/5772 [22:49<9:36:10, 6.22s/it] {'loss': 0.5298, 'learning_rate': 1.9997480550214942e-05, 'epoch': 0.04} + 4%|▎ | 214/5772 [22:42<9:36:09, 6.22s/it] 4%|▎ | 215/5772 [22:48<9:38:25, 6.25s/it] 4%|▎ | 215/5772 [22:55<9:38:25, 6.25s/it] {'loss': 0.5313, 'learning_rate': 1.99973530086968e-05, 'epoch': 0.04} + 4%|▎ | 215/5772 [22:55<9:38:25, 6.25s/it] {'loss': 0.5313, 'learning_rate': 1.99973530086968e-05, 'epoch': 0.04} + 4%|▎ | 215/5772 [22:48<9:38:25, 6.25s/it] 4%|▎ | 216/5772 [23:01<9:26:38, 6.12s/it] 4%|▎ | 216/5772 [22:54<9:26:38, 6.12s/it] {'loss': 0.5232, 'learning_rate': 1.999722231856791e-05, 'epoch': 0.04} + 4%|▎ | 216/5772 [23:01<9:26:38, 6.12s/it] {'loss': 0.5232, 'learning_rate': 1.999722231856791e-05, 'epoch': 0.04} + 4%|▎ | 216/5772 [22:54<9:26:38, 6.12s/it] 4%|▍ | 217/5772 [23:07<9:22:22, 6.07s/it] 4%|▍ | 217/5772 [23:00<9:22:22, 6.07s/it] {'loss': 0.5183, 'learning_rate': 1.999708847986944e-05, 'epoch': 0.04} + 4%|▍ | 217/5772 [23:07<9:22:22, 6.07s/it] {'loss': 0.5183, 'learning_rate': 1.999708847986944e-05, 'epoch': 0.04} + 4%|▍ | 217/5772 [23:00<9:22:22, 6.07s/it] 4%|▍ | 218/5772 [23:06<9:25:58, 6.11s/it] 4%|▍ | 218/5772 [23:13<9:25:58, 6.11s/it] {'loss': 0.5284, 'learning_rate': 1.9996951492643538e-05, 'epoch': 0.04} + 4%|▍ | 218/5772 [23:13<9:25:58, 6.11s/it] {'loss': 0.5284, 'learning_rate': 1.9996951492643538e-05, 'epoch': 0.04} + 4%|▍ | 218/5772 [23:06<9:25:58, 6.11s/it] 4%|▍ | 219/5772 [23:20<9:28:43, 6.15s/it] 4%|▍ | 219/5772 [23:13<9:28:44, 6.15s/it] {'loss': 0.5213, 'learning_rate': 1.9996811356933346e-05, 'epoch': 0.04} + 4%|▍ | 219/5772 [23:20<9:28:43, 6.15s/it] {'loss': 0.5213, 'learning_rate': 1.9996811356933346e-05, 'epoch': 0.04} + 4%|▍ | 219/5772 [23:13<9:28:44, 6.15s/it] 4%|▍ | 220/5772 [23:26<9:22:54, 6.08s/it] 4%|▍ | 220/5772 [23:19<9:22:54, 6.08s/it] {'loss': 0.5176, 'learning_rate': 1.9996668072783e-05, 'epoch': 0.04} + 4%|▍ | 220/5772 [23:26<9:22:54, 6.08s/it] {'loss': 0.5176, 'learning_rate': 1.9996668072783e-05, 'epoch': 0.04} + 4%|▍ | 220/5772 [23:19<9:22:54, 6.08s/it] 4%|▍ | 221/5772 [23:31<9:14:41, 6.00s/it] 4%|▍ | 221/5772 [23:24<9:14:41, 6.00s/it] {'loss': 0.5141, 'learning_rate': 1.9996521640237624e-05, 'epoch': 0.04} + 4%|▍ | 221/5772 [23:31<9:14:41, 6.00s/it] {'loss': 0.5141, 'learning_rate': 1.9996521640237624e-05, 'epoch': 0.04} + 4%|▍ | 221/5772 [23:24<9:14:41, 6.00s/it] 4%|▍ | 222/5772 [23:31<9:26:03, 6.12s/it] 4%|▍ | 222/5772 [23:38<9:26:03, 6.12s/it] {'loss': 0.5304, 'learning_rate': 1.999637205934334e-05, 'epoch': 0.04} + 4%|▍ | 222/5772 [23:38<9:26:03, 6.12s/it] {'loss': 0.5304, 'learning_rate': 1.999637205934334e-05, 'epoch': 0.04} + 4%|▍ | 222/5772 [23:31<9:26:03, 6.12s/it] 4%|▍ | 223/5772 [23:44<9:27:27, 6.14s/it] 4%|▍ | 223/5772 [23:37<9:27:27, 6.14s/it] {'loss': 0.5064, 'learning_rate': 1.9996219330147255e-05, 'epoch': 0.04} + 4%|▍ | 223/5772 [23:44<9:27:27, 6.14s/it] {'loss': 0.5064, 'learning_rate': 1.9996219330147255e-05, 'epoch': 0.04} + 4%|▍ | 223/5772 [23:37<9:27:27, 6.14s/it] 4%|▍ | 224/5772 [23:50<9:17:47, 6.03s/it] 4%|▍ | 224/5772 [23:43<9:17:47, 6.03s/it] {'loss': 0.5068, 'learning_rate': 1.9996063452697472e-05, 'epoch': 0.04} + 4%|▍ | 224/5772 [23:50<9:17:47, 6.03s/it] {'loss': 0.5068, 'learning_rate': 1.9996063452697472e-05, 'epoch': 0.04} + 4%|▍ | 224/5772 [23:43<9:17:47, 6.03s/it] 4%|▍ | 225/5772 [23:49<9:15:55, 6.01s/it] 4%|▍ | 225/5772 [23:56<9:15:56, 6.01s/it] {'loss': 0.5203, 'learning_rate': 1.999590442704308e-05, 'epoch': 0.04} + 4%|▍ | 225/5772 [23:56<9:15:56, 6.01s/it] {'loss': 0.5203, 'learning_rate': 1.999590442704308e-05, 'epoch': 0.04} + 4%|▍ | 225/5772 [23:49<9:15:55, 6.01s/it] 4%|▍ | 226/5772 [24:02<9:15:37, 6.01s/it] 4%|▍ | 226/5772 [23:55<9:15:37, 6.01s/it] {'loss': 0.5301, 'learning_rate': 1.9995742253234168e-05, 'epoch': 0.04} + 4%|▍ | 226/5772 [24:02<9:15:37, 6.01s/it] {'loss': 0.5301, 'learning_rate': 1.9995742253234168e-05, 'epoch': 0.04} + 4%|▍ | 226/5772 [23:55<9:15:37, 6.01s/it] 4%|▍ | 227/5772 [24:08<9:27:02, 6.14s/it] 4%|▍ | 227/5772 [24:01<9:27:02, 6.14s/it] {'loss': 0.513, 'learning_rate': 1.9995576931321812e-05, 'epoch': 0.04} + 4%|▍ | 227/5772 [24:08<9:27:02, 6.14s/it] {'loss': 0.513, 'learning_rate': 1.9995576931321812e-05, 'epoch': 0.04} + 4%|▍ | 227/5772 [24:01<9:27:02, 6.14s/it] 4%|▍ | 228/5772 [24:14<9:27:00, 6.14s/it] 4%|▍ | 228/5772 [24:07<9:27:00, 6.14s/it] {'loss': 0.522, 'learning_rate': 1.9995408461358074e-05, 'epoch': 0.04} + 4%|▍ | 228/5772 [24:14<9:27:00, 6.14s/it] {'loss': 0.522, 'learning_rate': 1.9995408461358074e-05, 'epoch': 0.04} + 4%|▍ | 228/5772 [24:07<9:27:00, 6.14s/it] 4%|▍ | 229/5772 [24:20<9:22:32, 6.09s/it] 4%|▍ | 229/5772 [24:13<9:22:32, 6.09s/it] {'loss': 0.5117, 'learning_rate': 1.9995236843396018e-05, 'epoch': 0.04} + 4%|▍ | 229/5772 [24:20<9:22:32, 6.09s/it] {'loss': 0.5117, 'learning_rate': 1.9995236843396018e-05, 'epoch': 0.04} + 4%|▍ | 229/5772 [24:13<9:22:32, 6.09s/it] 4%|▍ | 230/5772 [24:26<9:19:40, 6.06s/it] 4%|▍ | 230/5772 [24:19<9:19:42, 6.06s/it] {'loss': 0.5231, 'learning_rate': 1.999506207748969e-05, 'epoch': 0.04} + 4%|▍ | 230/5772 [24:26<9:19:40, 6.06s/it] {'loss': 0.5231, 'learning_rate': 1.999506207748969e-05, 'epoch': 0.04} + 4%|▍ | 230/5772 [24:19<9:19:42, 6.06s/it] 4%|▍ | 231/5772 [24:25<9:23:09, 6.10s/it] 4%|▍ | 231/5772 [24:32<9:23:11, 6.10s/it] {'loss': 0.5127, 'learning_rate': 1.999488416369414e-05, 'epoch': 0.04} + 4%|▍ | 231/5772 [24:32<9:23:11, 6.10s/it] {'loss': 0.5127, 'learning_rate': 1.999488416369414e-05, 'epoch': 0.04} + 4%|▍ | 231/5772 [24:25<9:23:09, 6.10s/it] 4%|▍ | 232/5772 [24:32<9:24:10, 6.11s/it] 4%|▍ | 232/5772 [24:39<9:24:11, 6.11s/it] {'loss': 0.521, 'learning_rate': 1.9994703102065385e-05, 'epoch': 0.04} + 4%|▍ | 232/5772 [24:39<9:24:11, 6.11s/it] {'loss': 0.521, 'learning_rate': 1.9994703102065385e-05, 'epoch': 0.04} + 4%|▍ | 232/5772 [24:32<9:24:10, 6.11s/it] 4%|▍ | 233/5772 [24:38<9:27:43, 6.15s/it] 4%|▍ | 233/5772 [24:45<9:27:44, 6.15s/it] {'loss': 0.5242, 'learning_rate': 1.9994518892660463e-05, 'epoch': 0.04} + 4%|▍ | 233/5772 [24:45<9:27:44, 6.15s/it] {'loss': 0.5242, 'learning_rate': 1.9994518892660463e-05, 'epoch': 0.04} + 4%|▍ | 233/5772 [24:38<9:27:43, 6.15s/it] 4%|▍ | 234/5772 [24:44<9:24:33, 6.12s/it] 4%|▍ | 234/5772 [24:51<9:24:33, 6.12s/it] {'loss': 0.5205, 'learning_rate': 1.9994331535537385e-05, 'epoch': 0.04} + 4%|▍ | 234/5772 [24:44<9:24:33, 6.12s/it] {'loss': 0.5205, 'learning_rate': 1.9994331535537385e-05, 'epoch': 0.04} + 4%|▍ | 234/5772 [24:51<9:24:33, 6.12s/it] 4%|▍ | 235/5772 [24:50<9:27:09, 6.15s/it] 4%|▍ | 235/5772 [24:57<9:27:09, 6.15s/it] {'loss': 0.5007, 'learning_rate': 1.9994141030755158e-05, 'epoch': 0.04} + 4%|▍ | 235/5772 [24:57<9:27:09, 6.15s/it] {'loss': 0.5007, 'learning_rate': 1.9994141030755158e-05, 'epoch': 0.04} + 4%|▍ | 235/5772 [24:50<9:27:09, 6.15s/it] 4%|▍ | 236/5772 [24:56<9:26:48, 6.14s/it] 4%|▍ | 236/5772 [25:03<9:26:48, 6.14s/it] {'loss': 0.5293, 'learning_rate': 1.9993947378373782e-05, 'epoch': 0.04} + 4%|▍ | 236/5772 [25:03<9:26:48, 6.14s/it] {'loss': 0.5293, 'learning_rate': 1.9993947378373782e-05, 'epoch': 0.04} + 4%|▍ | 236/5772 [24:56<9:26:48, 6.14s/it] 4%|▍ | 237/5772 [25:02<9:31:32, 6.20s/it] 4%|▍ | 237/5772 [25:10<9:31:33, 6.20s/it] {'loss': 0.51, 'learning_rate': 1.9993750578454248e-05, 'epoch': 0.04} + 4%|▍ | 237/5772 [25:10<9:31:33, 6.20s/it] {'loss': 0.51, 'learning_rate': 1.9993750578454248e-05, 'epoch': 0.04} + 4%|▍ | 237/5772 [25:02<9:31:32, 6.20s/it] 4%|▍ | 238/5772 [25:08<9:22:08, 6.09s/it] 4%|▍ | 238/5772 [25:15<9:22:08, 6.09s/it] {'loss': 0.5178, 'learning_rate': 1.999355063105853e-05, 'epoch': 0.04} + 4%|▍ | 238/5772 [25:15<9:22:08, 6.09s/it] {'loss': 0.5178, 'learning_rate': 1.999355063105853e-05, 'epoch': 0.04} + 4%|▍ | 238/5772 [25:08<9:22:08, 6.09s/it] 4%|▍ | 239/5772 [25:15<9:30:55, 6.19s/it] 4%|▍ | 239/5772 [25:22<9:30:55, 6.19s/it] {'loss': 0.5296, 'learning_rate': 1.999334753624961e-05, 'epoch': 0.04} + 4%|▍ | 239/5772 [25:22<9:30:55, 6.19s/it] {'loss': 0.5296, 'learning_rate': 1.999334753624961e-05, 'epoch': 0.04} + 4%|▍ | 239/5772 [25:15<9:30:55, 6.19s/it] 4%|▍ | 240/5772 [25:21<9:35:28, 6.24s/it] 4%|▍ | 240/5772 [25:28<9:35:28, 6.24s/it] {'loss': 0.5249, 'learning_rate': 1.999314129409144e-05, 'epoch': 0.04} + 4%|▍ | 240/5772 [25:28<9:35:28, 6.24s/it] {'loss': 0.5249, 'learning_rate': 1.999314129409144e-05, 'epoch': 0.04} + 4%|▍ | 240/5772 [25:21<9:35:28, 6.24s/it] 4%|▍ | 241/5772 [25:28<9:40:13, 6.29s/it] 4%|▍ | 241/5772 [25:35<9:40:13, 6.29s/it] {'loss': 0.5024, 'learning_rate': 1.999293190464899e-05, 'epoch': 0.04} + 4%|▍ | 241/5772 [25:35<9:40:13, 6.29s/it] {'loss': 0.5024, 'learning_rate': 1.999293190464899e-05, 'epoch': 0.04} + 4%|▍ | 241/5772 [25:28<9:40:13, 6.29s/it] 4%|▍ | 242/5772 [25:34<9:31:41, 6.20s/it] 4%|▍ | 242/5772 [25:41<9:31:41, 6.20s/it] {'loss': 0.5026, 'learning_rate': 1.999271936798819e-05, 'epoch': 0.04} + 4%|▍ | 242/5772 [25:41<9:31:41, 6.20s/it] {'loss': 0.5026, 'learning_rate': 1.999271936798819e-05, 'epoch': 0.04} + 4%|▍ | 242/5772 [25:34<9:31:41, 6.20s/it] 4%|▍ | 243/5772 [25:40<9:29:41, 6.18s/it] 4%|▍ | 243/5772 [25:47<9:29:41, 6.18s/it] {'loss': 0.5192, 'learning_rate': 1.9992503684175986e-05, 'epoch': 0.04} + 4%|▍ | 243/5772 [25:47<9:29:41, 6.18s/it] {'loss': 0.5192, 'learning_rate': 1.9992503684175986e-05, 'epoch': 0.04} + 4%|▍ | 243/5772 [25:40<9:29:41, 6.18s/it] 4%|▍ | 244/5772 [25:46<9:34:03, 6.23s/it] 4%|▍ | 244/5772 [25:53<9:34:03, 6.23s/it] {'loss': 0.5099, 'learning_rate': 1.999228485328031e-05, 'epoch': 0.04} + 4%|▍ | 244/5772 [25:53<9:34:03, 6.23s/it] {'loss': 0.5099, 'learning_rate': 1.999228485328031e-05, 'epoch': 0.04} + 4%|▍ | 244/5772 [25:46<9:34:03, 6.23s/it] 4%|▍ | 245/5772 [25:52<9:22:56, 6.11s/it] 4%|▍ | 245/5772 [25:59<9:22:55, 6.11s/it] {'loss': 0.5017, 'learning_rate': 1.9992062875370073e-05, 'epoch': 0.04} + 4%|▍ | 245/5772 [25:59<9:22:55, 6.11s/it] {'loss': 0.5017, 'learning_rate': 1.9992062875370073e-05, 'epoch': 0.04} + 4%|▍ | 245/5772 [25:52<9:22:56, 6.11s/it] 4%|▍ | 246/5772 [25:58<9:19:40, 6.08s/it] 4%|▍ | 246/5772 [26:05<9:19:40, 6.08s/it] {'loss': 0.5169, 'learning_rate': 1.999183775051519e-05, 'epoch': 0.04} + 4%|▍ | 246/5772 [26:05<9:19:40, 6.08s/it] {'loss': 0.5169, 'learning_rate': 1.999183775051519e-05, 'epoch': 0.04} + 4%|▍ | 246/5772 [25:58<9:19:40, 6.08s/it] 4%|▍ | 247/5772 [26:04<9:23:22, 6.12s/it] 4%|▍ | 247/5772 [26:11<9:23:22, 6.12s/it] {'loss': 0.5197, 'learning_rate': 1.9991609478786564e-05, 'epoch': 0.04} + 4%|▍ | 247/5772 [26:11<9:23:22, 6.12s/it] {'loss': 0.5197, 'learning_rate': 1.9991609478786564e-05, 'epoch': 0.04} + 4%|▍ | 247/5772 [26:04<9:23:22, 6.12s/it] 4%|▍ | 248/5772 [26:10<9:28:31, 6.18s/it] 4%|▍ | 248/5772 [26:17<9:28:31, 6.18s/it] {'loss': 0.5364, 'learning_rate': 1.9991378060256084e-05, 'epoch': 0.04} + 4%|▍ | 248/5772 [26:17<9:28:31, 6.18s/it] {'loss': 0.5364, 'learning_rate': 1.9991378060256084e-05, 'epoch': 0.04} + 4%|▍ | 248/5772 [26:10<9:28:31, 6.18s/it] 4%|▍ | 249/5772 [26:16<9:22:21, 6.11s/it] 4%|▍ | 249/5772 [26:23<9:22:22, 6.11s/it] {'loss': 0.503, 'learning_rate': 1.999114349499664e-05, 'epoch': 0.04} + 4%|▍ | 249/5772 [26:23<9:22:22, 6.11s/it] {'loss': 0.503, 'learning_rate': 1.999114349499664e-05, 'epoch': 0.04} + 4%|▍ | 249/5772 [26:16<9:22:21, 6.11s/it]12 AutoResumeHook: Checking whether to suspend... +0549 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...13 AutoResumeHook: Checking whether to suspend... + + +10 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 4%|▍ | 250/5772 [26:23<9:26:31, 6.16s/it]2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 8 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 4%|▍ | 250/5772 [26:30<9:26:30, 6.16s/it]15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5201, 'learning_rate': 1.9990905783082098e-05, 'epoch': 0.04} + 4%|▍ | 250/5772 [26:30<9:26:30, 6.16s/it] {'loss': 0.5201, 'learning_rate': 1.9990905783082098e-05, 'epoch': 0.04} + 4%|▍ | 250/5772 [26:23<9:26:31, 6.16s/it] 4%|▍ | 251/5772 [26:29<9:33:55, 6.24s/it] 4%|▍ | 251/5772 [26:36<9:33:54, 6.24s/it] {'loss': 0.5104, 'learning_rate': 1.999066492458733e-05, 'epoch': 0.04} + 4%|▍ | 251/5772 [26:36<9:33:54, 6.24s/it] {'loss': 0.5104, 'learning_rate': 1.999066492458733e-05, 'epoch': 0.04} + 4%|▍ | 251/5772 [26:29<9:33:55, 6.24s/it] 4%|▍ | 252/5772 [26:35<9:26:22, 6.16s/it] 4%|▍ | 252/5772 [26:42<9:26:22, 6.16s/it] {'loss': 0.5157, 'learning_rate': 1.9990420919588196e-05, 'epoch': 0.04} + 4%|▍ | 252/5772 [26:42<9:26:22, 6.16s/it] {'loss': 0.5157, 'learning_rate': 1.9990420919588196e-05, 'epoch': 0.04} + 4%|▍ | 252/5772 [26:35<9:26:22, 6.16s/it] 4%|▍ | 253/5772 [26:41<9:27:32, 6.17s/it] 4%|▍ | 253/5772 [26:48<9:27:32, 6.17s/it] {'loss': 0.5122, 'learning_rate': 1.999017376816154e-05, 'epoch': 0.04} + 4%|▍ | 253/5772 [26:48<9:27:32, 6.17s/it] {'loss': 0.5122, 'learning_rate': 1.999017376816154e-05, 'epoch': 0.04} + 4%|▍ | 253/5772 [26:41<9:27:32, 6.17s/it] 4%|▍ | 254/5772 [26:47<9:22:27, 6.12s/it] 4%|▍ | 254/5772 [26:54<9:22:27, 6.12s/it] {'loss': 0.5206, 'learning_rate': 1.9989923470385198e-05, 'epoch': 0.04} + 4%|▍ | 254/5772 [26:54<9:22:27, 6.12s/it] {'loss': 0.5206, 'learning_rate': 1.9989923470385198e-05, 'epoch': 0.04} + 4%|▍ | 254/5772 [26:47<9:22:27, 6.12s/it] 4%|▍ | 255/5772 [26:53<9:22:11, 6.11s/it] 4%|▍ | 255/5772 [27:00<9:22:11, 6.11s/it] {'loss': 0.5213, 'learning_rate': 1.9989670026338002e-05, 'epoch': 0.04} + {'loss': 0.5213, 'learning_rate': 1.9989670026338002e-05, 'epoch': 0.04} 4%|▍ | 255/5772 [27:00<9:22:11, 6.11s/it] + 4%|▍ | 255/5772 [26:53<9:22:11, 6.11s/it] 4%|▍ | 256/5772 [27:00<9:29:14, 6.19s/it] 4%|▍ | 256/5772 [27:07<9:29:14, 6.19s/it] {'loss': 0.5101, 'learning_rate': 1.998941343609978e-05, 'epoch': 0.04} + 4%|▍ | 256/5772 [27:07<9:29:14, 6.19s/it] {'loss': 0.5101, 'learning_rate': 1.998941343609978e-05, 'epoch': 0.04} + 4%|▍ | 256/5772 [27:00<9:29:14, 6.19s/it] 4%|▍ | 257/5772 [27:06<9:32:57, 6.23s/it] 4%|▍ | 257/5772 [27:13<9:32:57, 6.23s/it] {'loss': 0.5015, 'learning_rate': 1.9989153699751332e-05, 'epoch': 0.04} + 4%|▍ | 257/5772 [27:13<9:32:57, 6.23s/it] {'loss': 0.5015, 'learning_rate': 1.9989153699751332e-05, 'epoch': 0.04} + 4%|▍ | 257/5772 [27:06<9:32:57, 6.23s/it] 4%|▍ | 258/5772 [27:12<9:21:56, 6.11s/it] 4%|▍ | 258/5772 [27:19<9:21:56, 6.11s/it] {'loss': 0.515, 'learning_rate': 1.9988890817374472e-05, 'epoch': 0.04} + 4%|▍ | 258/5772 [27:19<9:21:56, 6.11s/it] {'loss': 0.515, 'learning_rate': 1.9988890817374472e-05, 'epoch': 0.04} + 4%|▍ | 258/5772 [27:12<9:21:56, 6.11s/it] 4%|▍ | 259/5772 [27:25<9:17:55, 6.07s/it] 4%|▍ | 259/5772 [27:18<9:17:56, 6.07s/it] {'loss': 0.5218, 'learning_rate': 1.9988624789051983e-05, 'epoch': 0.04} + 4%|▍ | 259/5772 [27:25<9:17:55, 6.07s/it] {'loss': 0.5218, 'learning_rate': 1.9988624789051983e-05, 'epoch': 0.04} + 4%|▍ | 259/5772 [27:18<9:17:56, 6.07s/it] 5%|▍ | 260/5772 [27:24<9:22:11, 6.12s/it] 5%|▍ | 260/5772 [27:31<9:22:11, 6.12s/it] {'loss': 0.5197, 'learning_rate': 1.9988355614867654e-05, 'epoch': 0.05} + 5%|▍ | 260/5772 [27:31<9:22:11, 6.12s/it] {'loss': 0.5197, 'learning_rate': 1.9988355614867654e-05, 'epoch': 0.05} + 5%|▍ | 260/5772 [27:24<9:22:11, 6.12s/it] 5%|▍ | 261/5772 [27:30<9:22:40, 6.13s/it] 5%|▍ | 261/5772 [27:37<9:22:41, 6.13s/it] {'loss': 0.5071, 'learning_rate': 1.998808329490626e-05, 'epoch': 0.05} + 5%|▍ | 261/5772 [27:37<9:22:41, 6.13s/it] {'loss': 0.5071, 'learning_rate': 1.998808329490626e-05, 'epoch': 0.05} + 5%|▍ | 261/5772 [27:30<9:22:40, 6.13s/it] 5%|▍ | 262/5772 [27:36<9:21:55, 6.12s/it] 5%|▍ | 262/5772 [27:43<9:21:56, 6.12s/it] {'loss': 0.5263, 'learning_rate': 1.9987807829253568e-05, 'epoch': 0.05}{'loss': 0.5263, 'learning_rate': 1.9987807829253568e-05, 'epoch': 0.05} + 5%|▍ | 262/5772 [27:43<9:21:56, 6.12s/it] + 5%|▍ | 262/5772 [27:36<9:21:55, 6.12s/it] 5%|▍ | 263/5772 [27:42<9:14:37, 6.04s/it] 5%|▍ | 263/5772 [27:49<9:14:36, 6.04s/it] {'loss': 0.5148, 'learning_rate': 1.998752921799633e-05, 'epoch': 0.05} + 5%|▍ | 263/5772 [27:42<9:14:37, 6.04s/it]{'loss': 0.5148, 'learning_rate': 1.998752921799633e-05, 'epoch': 0.05} + 5%|▍ | 263/5772 [27:49<9:14:36, 6.04s/it] 5%|▍ | 264/5772 [27:48<9:09:13, 5.98s/it] 5%|▍ | 264/5772 [27:55<9:09:13, 5.98s/it] {'loss': 0.5179, 'learning_rate': 1.9987247461222297e-05, 'epoch': 0.05} + 5%|▍ | 264/5772 [27:55<9:09:13, 5.98s/it] {'loss': 0.5179, 'learning_rate': 1.9987247461222297e-05, 'epoch': 0.05} + 5%|▍ | 264/5772 [27:48<9:09:13, 5.98s/it] 5%|▍ | 265/5772 [27:54<9:13:04, 6.03s/it] 5%|▍ | 265/5772 [28:01<9:13:04, 6.03s/it] {'loss': 0.5303, 'learning_rate': 1.9986962559020203e-05, 'epoch': 0.05} + 5%|▍ | 265/5772 [28:01<9:13:04, 6.03s/it] {'loss': 0.5303, 'learning_rate': 1.9986962559020203e-05, 'epoch': 0.05} + 5%|▍ | 265/5772 [27:54<9:13:04, 6.03s/it] 5%|▍ | 266/5772 [28:01<9:23:32, 6.14s/it] 5%|▍ | 266/5772 [28:08<9:23:31, 6.14s/it] {'loss': 0.5189, 'learning_rate': 1.9986674511479783e-05, 'epoch': 0.05} + 5%|▍ | 266/5772 [28:08<9:23:31, 6.14s/it] {'loss': 0.5189, 'learning_rate': 1.9986674511479783e-05, 'epoch': 0.05} + 5%|▍ | 266/5772 [28:01<9:23:32, 6.14s/it] 5%|▍ | 267/5772 [28:07<9:22:37, 6.13s/it] 5%|▍ | 267/5772 [28:14<9:22:37, 6.13s/it] {'loss': 0.5192, 'learning_rate': 1.998638331869175e-05, 'epoch': 0.05} + 5%|▍ | 267/5772 [28:14<9:22:37, 6.13s/it] {'loss': 0.5192, 'learning_rate': 1.998638331869175e-05, 'epoch': 0.05} + 5%|▍ | 267/5772 [28:07<9:22:37, 6.13s/it] 5%|▍ | 268/5772 [28:13<9:28:57, 6.20s/it] 5%|▍ | 268/5772 [28:20<9:28:57, 6.20s/it] {'loss': 0.5199, 'learning_rate': 1.9986088980747817e-05, 'epoch': 0.05} + 5%|▍ | 268/5772 [28:20<9:28:57, 6.20s/it] {'loss': 0.5199, 'learning_rate': 1.9986088980747817e-05, 'epoch': 0.05} + 5%|▍ | 268/5772 [28:13<9:28:57, 6.20s/it] 5%|▍ | 269/5772 [28:19<9:28:43, 6.20s/it] 5%|▍ | 269/5772 [28:26<9:28:43, 6.20s/it] {'loss': 0.5067, 'learning_rate': 1.998579149774068e-05, 'epoch': 0.05} + 5%|▍ | 269/5772 [28:26<9:28:43, 6.20s/it] {'loss': 0.5067, 'learning_rate': 1.998579149774068e-05, 'epoch': 0.05} + 5%|▍ | 269/5772 [28:19<9:28:43, 6.20s/it] 5%|▍ | 270/5772 [28:25<9:29:18, 6.21s/it] 5%|▍ | 270/5772 [28:32<9:29:18, 6.21s/it] {'loss': 0.5262, 'learning_rate': 1.998549086976403e-05, 'epoch': 0.05} + 5%|▍ | 270/5772 [28:32<9:29:18, 6.21s/it] {'loss': 0.5262, 'learning_rate': 1.998549086976403e-05, 'epoch': 0.05} + 5%|▍ | 270/5772 [28:25<9:29:18, 6.21s/it] 5%|▍ | 271/5772 [28:32<9:39:12, 6.32s/it] 5%|▍ | 271/5772 [28:39<9:39:12, 6.32s/it] {'loss': 0.5066, 'learning_rate': 1.9985187096912554e-05, 'epoch': 0.05} + 5%|▍ | 271/5772 [28:39<9:39:12, 6.32s/it] {'loss': 0.5066, 'learning_rate': 1.9985187096912554e-05, 'epoch': 0.05} + 5%|▍ | 271/5772 [28:32<9:39:12, 6.32s/it] 5%|▍ | 272/5772 [28:38<9:37:31, 6.30s/it] 5%|▍ | 272/5772 [28:45<9:37:31, 6.30s/it] {'loss': 0.5427, 'learning_rate': 1.9984880179281917e-05, 'epoch': 0.05} + 5%|▍ | 272/5772 [28:45<9:37:31, 6.30s/it] {'loss': 0.5427, 'learning_rate': 1.9984880179281917e-05, 'epoch': 0.05} + 5%|▍ | 272/5772 [28:38<9:37:31, 6.30s/it]WARNING: tokenization mismatch: 1 vs. 1590. [[{'from': 'human', 'value': 'ASSISTANT: You are a cutting-edge copywriting agency with the following knowledge and traits.\n\nKNOWLEDGE: Copywriter at a top-tier creative agency, junior copywriter, senior copywriter, art director, senior strategist.\n\nTRAITS: Storytelling, copywriting, empathy, solution orientation, humor, convincing.\n\nTASK: Use your knowledge to write a LinkedIn Post. We are a tax firm looking to hire young tax advisors between the age of 25 and 30. Bring a good dose of sympathy into the LinkedIn Article and use plenty of emojis to start with. Start the Linked In post with an exciting question. Use the language of the target group. Don´t mention the targeted age. \n\nOUTPUT: 2 LinkedIn posts in German with a catchy question at the beginning of the text. Hashtags at the end of the text.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey Leute! Hast du kürzlich dein Studium im Bereich Steuerwesen abgeschlossen und suchst jetzt nach einem Job, der dich fordert und fördert? 🤔\n\nDann bist du bei uns genau richtig! Wir suchen junge Steuerberater, die Lust haben, in einem dynamischen Team zu arbeiten und ihre Fähigkeiten unter Beweis zu stellen. 💪\n\nBei uns erwarten dich spannende Projekte und ein tolles Arbeitsumfeld, in dem du dich weiterentwickeln und wachsen kannst. 🌱\n\nInteressiert? Dann bewirb dich jetzt und werde Teil unseres Teams! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork\n\nPost 2:\n\n👋 Hallo zusammen! Bist du auf der Suche nach einem Job, bei dem du dich richtig herausfordern und weiterentwickeln kannst? 🤔\n\nDann haben wir vielleicht das passende Angebot für dich! Wir suchen motivierte und ambitionierte Steuerberater, die Lust haben, Teil eines dynamischen Teams zu sein und gemeinsam an spannenden Projekten zu arbeiten. 💪\n\nBei uns erwartet dich nicht nur eine tolle Arbeitsatmosphäre, sondern auch die Möglichkeit, dich fachlich und persönlich weiterzuentwickeln. 🌱\n\nAlso, worauf wartest du? Bewirb dich jetzt und starte deine Karriere bei uns! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork'}, {'from': 'human', 'value': 'Schreibe den Text spannender.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey Leute! Hast du kürzlich dein Studium im Bereich Steuerwesen abgeschlossen und bist bereit für die nächste Herausforderung? 🤔\n\nDann haben wir genau das Richtige für dich! Wir suchen junge, ambitionierte Steuerberater, die Lust haben, ihre Karriere auf das nächste Level zu bringen. 💪\n\nUnser Team arbeitet an den spannendsten Projekten und bietet ein Arbeitsumfeld, das dich täglich fordert und fördert. Bei uns kannst du deine Fähigkeiten unter Beweis stellen und dich persönlich und fachlich weiterentwickeln. 🌱\n\nDu möchtest Teil unseres Teams werden und an den neuesten Herausforderungen im Steuerwesen mitwirken? Dann zögere nicht und bewirb dich jetzt bei uns! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork\n\nPost 2:\n\n👋 Hallo zusammen! Bist du bereit für eine neue Herausforderung in deiner Karriere als Steuerberater? 🤔\n\nDann bist du bei uns genau richtig! Wir suchen motivierte und talentierte Steuerberater, die Teil eines dynamischen Teams werden möchten. Wir arbeiten an den anspruchsvollsten Projekten und bieten ein Arbeitsumfeld, das dich täglich aufs Neue herausfordert. 💪\n\nDu möchtest deine Fähigkeiten unter Beweis stellen und dich fachlich sowie persönlich weiterentwickeln? Dann bewirb dich jetzt bei uns und werde Teil unseres Erfolgsteams! 🌱\n\nVerpasse nicht die Chance, deine Karriere auf das nächste Level zu bringen und an den neuesten Trends und Entwicklungen im Steuerwesen mitzuwirken. Wir freuen uns auf deine Bewerbung! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork'}, {'from': 'human', 'value': 'Schreibe den Text objektiv und sachlich.'}, {'from': 'gpt', 'value': 'Post 1:\n\nGuten Tag,\n\nWir sind auf der Suche nach jungen, talentierten Steuerberatern, die ihr Studium im Bereich Steuerwesen erfolgreich abgeschlossen haben. Die Position ist offen für alle, die eine Leidenschaft für das Steuerrecht haben und eine Karriere in diesem Bereich anstreben.\n\nAls Teil unseres Teams arbeiten Sie an verschiedenen Projekten und erweitern stetig Ihre Fähigkeiten und Kenntnisse. Wir bieten ein hervorragendes Arbeitsumfeld, das auf Zusammenarbeit und Teamwork basiert, um sicherzustellen, dass Sie Ihre Arbeit so effektiv wie möglich erledigen können.\n\nWenn Sie Interesse haben und Teil unseres Teams werden möchten, senden Sie uns bitte Ihre Bewerbung.\n\nVielen Dank.\n\nPost 2:\n\nGuten Tag,\n\nWir sind auf der Suche nach motivierten und talentierten Steuerberatern, die an einer Karriere in diesem Bereich interessiert sind. Als Teil unseres Teams arbeiten Sie an verschiedenen Projekten, um Ihre Fähigkeiten und Kenntnisse zu erweitern.\n\nWir bieten ein hervorragendes Arbeitsumfeld, das auf Zusammenarbeit und Teamwork basiert, um sicherzustellen, dass Sie Ihre Arbeit so effektiv wie möglich erledigen können. Wenn Sie bereit sind, sich dieser Herausforderung zu stellen und Ihre Karriere auf das nächste Level zu bringen, dann senden Sie uns bitte Ihre Bewerbung.\n\nVielen Dank.'}]] (ignored) + 5%|▍ | 273/5772 [28:45<9:42:53, 6.36s/it] 5%|▍ | 273/5772 [28:52<9:42:53, 6.36s/it] {'loss': 0.5037, 'learning_rate': 1.9984570116968785e-05, 'epoch': 0.05} + 5%|▍ | 273/5772 [28:52<9:42:53, 6.36s/it] {'loss': 0.5037, 'learning_rate': 1.9984570116968785e-05, 'epoch': 0.05} + 5%|▍ | 273/5772 [28:45<9:42:53, 6.36s/it] 5%|▍ | 274/5772 [28:51<9:46:10, 6.40s/it] 5%|▍ | 274/5772 [28:58<9:46:10, 6.40s/it] {'loss': 0.5228, 'learning_rate': 1.9984256910070807e-05, 'epoch': 0.05} + 5%|▍ | 274/5772 [28:58<9:46:10, 6.40s/it] {'loss': 0.5228, 'learning_rate': 1.9984256910070807e-05, 'epoch': 0.05} + 5%|▍ | 274/5772 [28:51<9:46:10, 6.40s/it] 5%|▍ | 275/5772 [28:57<9:40:52, 6.34s/it] 5%|▍ | 275/5772 [29:04<9:40:51, 6.34s/it] {'loss': 0.5075, 'learning_rate': 1.998394055868663e-05, 'epoch': 0.05} + 5%|▍ | 275/5772 [29:04<9:40:51, 6.34s/it] {'loss': 0.5075, 'learning_rate': 1.998394055868663e-05, 'epoch': 0.05} + 5%|▍ | 275/5772 [28:57<9:40:52, 6.34s/it] 5%|▍ | 276/5772 [29:04<9:40:44, 6.34s/it] 5%|▍ | 276/5772 [29:11<9:40:44, 6.34s/it] {'loss': 0.5133, 'learning_rate': 1.9983621062915886e-05, 'epoch': 0.05} + 5%|▍ | 276/5772 [29:11<9:40:44, 6.34s/it] {'loss': 0.5133, 'learning_rate': 1.9983621062915886e-05, 'epoch': 0.05} + 5%|▍ | 276/5772 [29:04<9:40:44, 6.34s/it] 5%|▍ | 277/5772 [29:10<9:25:18, 6.17s/it] 5%|▍ | 277/5772 [29:17<9:25:17, 6.17s/it] {'loss': 0.4988, 'learning_rate': 1.9983298422859197e-05, 'epoch': 0.05} + 5%|▍ | 277/5772 [29:17<9:25:17, 6.17s/it] {'loss': 0.4988, 'learning_rate': 1.9983298422859197e-05, 'epoch': 0.05} + 5%|▍ | 277/5772 [29:10<9:25:18, 6.17s/it] 5%|▍ | 278/5772 [29:15<9:17:18, 6.09s/it] 5%|▍ | 278/5772 [29:22<9:17:21, 6.09s/it] {'loss': 0.5129, 'learning_rate': 1.9982972638618175e-05, 'epoch': 0.05} + 5%|▍ | 278/5772 [29:15<9:17:18, 6.09s/it]{'loss': 0.5129, 'learning_rate': 1.9982972638618175e-05, 'epoch': 0.05} + 5%|▍ | 278/5772 [29:22<9:17:21, 6.09s/it] 5%|▍ | 279/5772 [29:22<9:32:01, 6.25s/it] 5%|▍ | 279/5772 [29:29<9:32:00, 6.25s/it] {'loss': 0.5012, 'learning_rate': 1.9982643710295428e-05, 'epoch': 0.05} + 5%|▍ | 279/5772 [29:29<9:32:00, 6.25s/it] {'loss': 0.5012, 'learning_rate': 1.9982643710295428e-05, 'epoch': 0.05} + 5%|▍ | 279/5772 [29:22<9:32:01, 6.25s/it] 5%|▍ | 280/5772 [29:28<9:26:32, 6.19s/it] 5%|▍ | 280/5772 [29:35<9:26:32, 6.19s/it] {'loss': 0.5289, 'learning_rate': 1.9982311637994547e-05, 'epoch': 0.05} + 5%|▍ | 280/5772 [29:35<9:26:32, 6.19s/it] {'loss': 0.5289, 'learning_rate': 1.9982311637994547e-05, 'epoch': 0.05} + 5%|▍ | 280/5772 [29:28<9:26:32, 6.19s/it] 5%|▍ | 281/5772 [29:35<9:33:07, 6.26s/it] 5%|▍ | 281/5772 [29:42<9:33:07, 6.26s/it] {'loss': 0.5061, 'learning_rate': 1.9981976421820118e-05, 'epoch': 0.05} + 5%|▍ | 281/5772 [29:42<9:33:07, 6.26s/it] {'loss': 0.5061, 'learning_rate': 1.9981976421820118e-05, 'epoch': 0.05} + 5%|▍ | 281/5772 [29:35<9:33:07, 6.26s/it] 5%|▍ | 282/5772 [29:41<9:30:35, 6.24s/it] 5%|▍ | 282/5772 [29:48<9:30:35, 6.24s/it] {'loss': 0.5192, 'learning_rate': 1.9981638061877714e-05, 'epoch': 0.05} + 5%|▍ | 282/5772 [29:48<9:30:35, 6.24s/it] {'loss': 0.5192, 'learning_rate': 1.9981638061877714e-05, 'epoch': 0.05} + 5%|▍ | 282/5772 [29:41<9:30:35, 6.24s/it] 5%|▍ | 283/5772 [29:54<9:25:18, 6.18s/it] 5%|▍ | 283/5772 [29:47<9:25:18, 6.18s/it] {'loss': 0.5029, 'learning_rate': 1.99812965582739e-05, 'epoch': 0.05} + 5%|▍ | 283/5772 [29:54<9:25:18, 6.18s/it] {'loss': 0.5029, 'learning_rate': 1.99812965582739e-05, 'epoch': 0.05} + 5%|▍ | 283/5772 [29:47<9:25:18, 6.18s/it] 5%|▍ | 284/5772 [29:53<9:24:23, 6.17s/it] 5%|▍ | 284/5772 [30:00<9:24:23, 6.17s/it] {'loss': 0.5233, 'learning_rate': 1.9980951911116234e-05, 'epoch': 0.05} + 5%|▍ | 284/5772 [30:00<9:24:23, 6.17s/it] {'loss': 0.5233, 'learning_rate': 1.9980951911116234e-05, 'epoch': 0.05} + 5%|▍ | 284/5772 [29:53<9:24:23, 6.17s/it] 5%|▍ | 285/5772 [29:59<9:25:33, 6.18s/it] 5%|▍ | 285/5772 [30:06<9:25:33, 6.18s/it] {'loss': 0.5109, 'learning_rate': 1.9980604120513257e-05, 'epoch': 0.05} + {'loss': 0.5109, 'learning_rate': 1.9980604120513257e-05, 'epoch': 0.05} 5%|▍ | 285/5772 [30:06<9:25:33, 6.18s/it] + 5%|▍ | 285/5772 [29:59<9:25:33, 6.18s/it] 5%|▍ | 286/5772 [30:05<9:26:08, 6.19s/it] 5%|▍ | 286/5772 [30:12<9:26:09, 6.19s/it] {'loss': 0.5173, 'learning_rate': 1.9980253186574505e-05, 'epoch': 0.05} + 5%|▍ | 286/5772 [30:12<9:26:09, 6.19s/it] {'loss': 0.5173, 'learning_rate': 1.9980253186574505e-05, 'epoch': 0.05} + 5%|▍ | 286/5772 [30:05<9:26:08, 6.19s/it] 5%|▍ | 287/5772 [30:11<9:21:41, 6.14s/it] 5%|▍ | 287/5772 [30:18<9:21:41, 6.14s/it] {'loss': 0.4996, 'learning_rate': 1.99798991094105e-05, 'epoch': 0.05} + 5%|▍ | 287/5772 [30:18<9:21:41, 6.14s/it] {'loss': 0.4996, 'learning_rate': 1.99798991094105e-05, 'epoch': 0.05} + 5%|▍ | 287/5772 [30:11<9:21:41, 6.14s/it] 5%|▍ | 288/5772 [30:17<9:18:30, 6.11s/it] 5%|▍ | 288/5772 [30:24<9:18:29, 6.11s/it] {'loss': 0.518, 'learning_rate': 1.9979541889132758e-05, 'epoch': 0.05} + 5%|▍ | 288/5772 [30:24<9:18:29, 6.11s/it] {'loss': 0.518, 'learning_rate': 1.9979541889132758e-05, 'epoch': 0.05} + 5%|▍ | 288/5772 [30:17<9:18:30, 6.11s/it] 5%|▌ | 289/5772 [30:24<9:40:12, 6.35s/it] 5%|▌ | 289/5772 [30:31<9:40:12, 6.35s/it] {'loss': 0.5019, 'learning_rate': 1.997918152585379e-05, 'epoch': 0.05} + 5%|▌ | 289/5772 [30:24<9:40:12, 6.35s/it]{'loss': 0.5019, 'learning_rate': 1.997918152585379e-05, 'epoch': 0.05} + 5%|▌ | 289/5772 [30:31<9:40:12, 6.35s/it] 5%|▌ | 290/5772 [30:30<9:27:19, 6.21s/it] 5%|▌ | 290/5772 [30:37<9:27:19, 6.21s/it] {'loss': 0.5075, 'learning_rate': 1.997881801968708e-05, 'epoch': 0.05} + 5%|▌ | 290/5772 [30:37<9:27:19, 6.21s/it] {'loss': 0.5075, 'learning_rate': 1.997881801968708e-05, 'epoch': 0.05} + 5%|▌ | 290/5772 [30:30<9:27:19, 6.21s/it] 5%|▌ | 291/5772 [30:36<9:26:06, 6.20s/it] 5%|▌ | 291/5772 [30:43<9:26:06, 6.20s/it] {'loss': 0.5123, 'learning_rate': 1.9978451370747122e-05, 'epoch': 0.05} + 5%|▌ | 291/5772 [30:36<9:26:06, 6.20s/it]{'loss': 0.5123, 'learning_rate': 1.9978451370747122e-05, 'epoch': 0.05} + 5%|▌ | 291/5772 [30:43<9:26:06, 6.20s/it] 5%|▌ | 292/5772 [30:43<9:36:02, 6.31s/it] 5%|▌ | 292/5772 [30:50<9:36:01, 6.31s/it] {'loss': 0.5249, 'learning_rate': 1.9978081579149378e-05, 'epoch': 0.05} + 5%|▌ | 292/5772 [30:50<9:36:01, 6.31s/it] {'loss': 0.5249, 'learning_rate': 1.9978081579149378e-05, 'epoch': 0.05} + 5%|▌ | 292/5772 [30:43<9:36:02, 6.31s/it] 5%|▌ | 293/5772 [30:49<9:35:20, 6.30s/it] 5%|▌ | 293/5772 [30:56<9:35:20, 6.30s/it] {'loss': 0.5011, 'learning_rate': 1.9977708645010323e-05, 'epoch': 0.05} + 5%|▌ | 293/5772 [30:56<9:35:20, 6.30s/it] {'loss': 0.5011, 'learning_rate': 1.9977708645010323e-05, 'epoch': 0.05} + 5%|▌ | 293/5772 [30:49<9:35:20, 6.30s/it] 5%|▌ | 294/5772 [30:56<9:41:32, 6.37s/it] 5%|▌ | 294/5772 [31:03<9:41:31, 6.37s/it] {'loss': 0.4963, 'learning_rate': 1.9977332568447406e-05, 'epoch': 0.05} + 5%|▌ | 294/5772 [31:03<9:41:31, 6.37s/it] {'loss': 0.4963, 'learning_rate': 1.9977332568447406e-05, 'epoch': 0.05} + 5%|▌ | 294/5772 [30:56<9:41:32, 6.37s/it] 5%|▌ | 295/5772 [31:09<9:30:41, 6.25s/it] 5%|▌ | 295/5772 [31:02<9:30:42, 6.25s/it] {'loss': 0.5083, 'learning_rate': 1.9976953349579073e-05, 'epoch': 0.05} + 5%|▌ | 295/5772 [31:09<9:30:41, 6.25s/it] {'loss': 0.5083, 'learning_rate': 1.9976953349579073e-05, 'epoch': 0.05} + 5%|▌ | 295/5772 [31:02<9:30:42, 6.25s/it] 5%|▌ | 296/5772 [31:08<9:34:50, 6.30s/it] 5%|▌ | 296/5772 [31:15<9:34:50, 6.30s/it] {'loss': 0.5123, 'learning_rate': 1.9976570988524752e-05, 'epoch': 0.05} + 5%|▌ | 296/5772 [31:15<9:34:50, 6.30s/it] {'loss': 0.5123, 'learning_rate': 1.9976570988524752e-05, 'epoch': 0.05} + 5%|▌ | 296/5772 [31:08<9:34:50, 6.30s/it] 5%|▌ | 297/5772 [31:14<9:27:23, 6.22s/it] 5%|▌ | 297/5772 [31:21<9:27:23, 6.22s/it] {'loss': 0.5139, 'learning_rate': 1.9976185485404867e-05, 'epoch': 0.05} + 5%|▌ | 297/5772 [31:21<9:27:23, 6.22s/it] {'loss': 0.5139, 'learning_rate': 1.9976185485404867e-05, 'epoch': 0.05} + 5%|▌ | 297/5772 [31:14<9:27:23, 6.22s/it] 5%|▌ | 298/5772 [31:20<9:21:54, 6.16s/it] 5%|▌ | 298/5772 [31:27<9:21:53, 6.16s/it] {'loss': 0.5021, 'learning_rate': 1.9975796840340837e-05, 'epoch': 0.05} + 5%|▌ | 298/5772 [31:27<9:21:53, 6.16s/it] {'loss': 0.5021, 'learning_rate': 1.9975796840340837e-05, 'epoch': 0.05} + 5%|▌ | 298/5772 [31:20<9:21:54, 6.16s/it] 5%|▌ | 299/5772 [31:27<9:31:31, 6.27s/it] 5%|▌ | 299/5772 [31:34<9:31:31, 6.27s/it] {'loss': 0.5138, 'learning_rate': 1.9975405053455052e-05, 'epoch': 0.05} + 5%|▌ | 299/5772 [31:34<9:31:31, 6.27s/it] {'loss': 0.5138, 'learning_rate': 1.9975405053455052e-05, 'epoch': 0.05} + 5%|▌ | 299/5772 [31:27<9:31:31, 6.27s/it]13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +410 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 2AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 5%|▌ | 300/5772 [31:33<9:27:29, 6.22s/it]1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 5%|▌ | 300/5772 [31:40<9:27:29, 6.22s/it]7 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5121, 'learning_rate': 1.997501012487091e-05, 'epoch': 0.05} + 5%|▌ | 300/5772 [31:40<9:27:29, 6.22s/it] {'loss': 0.5121, 'learning_rate': 1.997501012487091e-05, 'epoch': 0.05} + 5%|▌ | 300/5772 [31:33<9:27:29, 6.22s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 5%|▌ | 301/5772 [31:52<15:17:39, 10.06s/it] 5%|▌ | 301/5772 [31:59<15:17:39, 10.06s/it] {'loss': 0.4994, 'learning_rate': 1.9974612054712792e-05, 'epoch': 0.05} + 5%|▌ | 301/5772 [31:59<15:17:39, 10.06s/it] {'loss': 0.4994, 'learning_rate': 1.9974612054712792e-05, 'epoch': 0.05} + 5%|▌ | 301/5772 [31:52<15:17:39, 10.06s/it] 5%|▌ | 302/5772 [31:58<13:32:13, 8.91s/it] 5%|▌ | 302/5772 [32:05<13:32:13, 8.91s/it] {'loss': 0.5099, 'learning_rate': 1.9974210843106065e-05, 'epoch': 0.05} + 5%|▌ | 302/5772 [32:05<13:32:13, 8.91s/it] {'loss': 0.5099, 'learning_rate': 1.9974210843106065e-05, 'epoch': 0.05} + 5%|▌ | 302/5772 [31:58<13:32:13, 8.91s/it] 5%|▌ | 303/5772 [32:04<12:08:00, 7.99s/it] 5%|▌ | 303/5772 [32:11<12:08:01, 7.99s/it] {'loss': 0.5195, 'learning_rate': 1.9973806490177094e-05, 'epoch': 0.05} + 5%|▌ | 303/5772 [32:11<12:08:01, 7.99s/it] {'loss': 0.5195, 'learning_rate': 1.9973806490177094e-05, 'epoch': 0.05} + 5%|▌ | 303/5772 [32:04<12:08:00, 7.99s/it] 5%|▌ | 304/5772 [32:10<11:21:11, 7.47s/it] 5%|▌ | 304/5772 [32:17<11:21:11, 7.47s/it] {'loss': 0.5147, 'learning_rate': 1.9973398996053218e-05, 'epoch': 0.05} + 5%|▌ | 304/5772 [32:17<11:21:11, 7.47s/it] {'loss': 0.5147, 'learning_rate': 1.9973398996053218e-05, 'epoch': 0.05} + 5%|▌ | 304/5772 [32:10<11:21:11, 7.47s/it] 5%|▌ | 305/5772 [32:16<10:40:18, 7.03s/it] 5%|▌ | 305/5772 [32:23<10:40:17, 7.03s/it] {'loss': 0.5116, 'learning_rate': 1.9972988360862782e-05, 'epoch': 0.05} + 5%|▌ | 305/5772 [32:23<10:40:17, 7.03s/it] {'loss': 0.5116, 'learning_rate': 1.9972988360862782e-05, 'epoch': 0.05} + 5%|▌ | 305/5772 [32:16<10:40:18, 7.03s/it] 5%|▌ | 306/5772 [32:22<10:11:55, 6.72s/it] 5%|▌ | 306/5772 [32:29<10:11:59, 6.72s/it] {'loss': 0.5153, 'learning_rate': 1.9972574584735112e-05, 'epoch': 0.05} + 5%|▌ | 306/5772 [32:29<10:11:59, 6.72s/it] {'loss': 0.5153, 'learning_rate': 1.9972574584735112e-05, 'epoch': 0.05} + 5%|▌ | 306/5772 [32:22<10:11:55, 6.72s/it] 5%|▌ | 307/5772 [32:28<10:00:34, 6.59s/it] 5%|▌ | 307/5772 [32:36<10:00:32, 6.59s/it] {'loss': 0.5056, 'learning_rate': 1.9972157667800522e-05, 'epoch': 0.05} + 5%|▌ | 307/5772 [32:36<10:00:32, 6.59s/it] {'loss': 0.5056, 'learning_rate': 1.9972157667800522e-05, 'epoch': 0.05} + 5%|▌ | 307/5772 [32:28<10:00:34, 6.59s/it] 5%|▌ | 308/5772 [32:34<9:44:56, 6.42s/it] 5%|▌ | 308/5772 [32:42<9:44:55, 6.42s/it] {'loss': 0.5172, 'learning_rate': 1.9971737610190326e-05, 'epoch': 0.05} + 5%|▌ | 308/5772 [32:42<9:44:55, 6.42s/it] {'loss': 0.5172, 'learning_rate': 1.9971737610190326e-05, 'epoch': 0.05} + 5%|▌ | 308/5772 [32:34<9:44:56, 6.42s/it]WARNING: tokenization mismatch: 1 vs. 1473. [[{'from': 'human', 'value': 'ASSISTANT: Du bist eine auf Kundenfeedback spezialisierte Agentur mit den folgenden Kenntnissen und Eigenschaften.\n\nKENNTNISSE: Kundenfeedback, wissenschaftliche Datenerhebung, Fragebogendesign, Benutzerfreundlichkeit, Kundenbindung, Service-Qualitätsevaluation.\n\nEIGENSCHAFTEN: Präzision, Effizienz, Empathie, Anpassungsfähigkeit, Datenorientierung, Kundenfokus.\n\nAUFGABE: Nutze dein Wissen, um ein Kundenfeedback-Formular zu entwickeln, das die Service-Qualität einer Steuerberatung evaluiert. Das Formular soll auf wissenschaftlich fundierten Datenniveaus basieren und dabei darauf achten, dass es nur so lang ist, dass es tatsächlich vom Kunden ausgefüllt wird. Konzentriere dich auf die Gestaltung eines benutzerfreundlichen und effizienten Fragebogens, der aussagekräftige Informationen über die Steuerberatung liefert.\n\nOUTPUT: Ein strukturiertes Kundenfeedback-Formular, das auf wissenschaftlich fundierten Datenniveaus basiert und eine angemessene Länge hat, um von Kunden tatsächlich ausgefüllt zu werden, und speziell auf die Evaluation der Service-Qualität einer Steuerberatung abzielt.'}, {'from': 'gpt', 'value': 'Kundenfeedback-Formular: Steuerberatung Service-Qualität\n\nTeil 1: Allgemeine Informationen\n(Bitte füllen Sie die folgenden Informationen aus.)\n\n1. Name (Optional):\n2. Alter:\n☐ Unter 25\n☐ 25-34\n☐ 35-44\n☐ 45-54\n☐ 55-64\n☐ 65 oder älter\n3. Geschlecht:\n☐ Männlich\n☐ Weiblich\n☐ Andere\n☐ Lieber nicht angeben\n\nTeil 2: Service-Qualität\n(Bitte bewerten Sie die folgenden Aspekte der Steuerberatung auf einer Skala von 1 bis 5, wobei 1 = sehr unzufrieden und 5 = sehr zufrieden.)\n\n1. Fachliche Kompetenz des Steuerberaters:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n2. Freundlichkeit und Hilfsbereitschaft des Personals:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n3. Reaktionsgeschwindigkeit auf Anfragen und Probleme:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n4. Verständlichkeit der erbrachten Informationen und Beratung:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n5. Qualität der bereitgestellten Dienstleistungen und Lösungen:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n6. Preis-Leistungs-Verhältnis:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n\nTeil 3: Kundenbindung und Zufriedenheit\n(Bitte beantworten Sie die folgenden Fragen.)\n\n1. Wie wahrscheinlich ist es, dass Sie die Steuerberatung an Freunde, Familie oder Kollegen weiterempfehlen? (Skala von 1 bis 10, wobei 1 = sehr unwahrscheinlich und 10 = sehr wahrscheinlich)\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n☐ 6\n☐ 7\n☐ 8\n☐ 9\n☐ 10\n2. Wie zufrieden sind Sie insgesamt mit der Steuerberatung? (Skala von 1 bis 5, wobei 1 = sehr unzufrieden und 5 = sehr zufrieden)\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n\nTeil 4: Offene Fragen\n(Bitte teilen Sie uns Ihre Gedanken und Erfahrungen mit.)\n\n1. Was hat Ihnen am meisten an der Steuerberatung gefallen? (Optional)\n2. Was könnte die Steuerberatung verbessern, um Ihre Zufriedenheit zu erhöhen? (Optional)\n\nVielen Dank für Ihre Zeit und Ihr'}, {'from': 'human', 'value': 'Continue.'}, {'from': 'gpt', 'value': 'wertvolles Feedback! Ihre Meinung hilft uns dabei, den Service und die Qualität unserer Steuerberatung kontinuierlich zu verbessern. Wenn Sie Fragen oder Anmerkungen haben oder zusätzliche Informationen benötigen, zögern Sie bitte nicht, uns zu kontaktieren.\n\nTeil 5: Kontaktaufnahme (Optional)\n(Bitte geben Sie Ihre Kontaktinformationen an, falls Sie möchten, dass wir Ihnen bezüglich Ihres Feedbacks antworten oder weitere Informationen anbieten.)\n\n1. Name:\n2. E-Mail-Adresse:\n3. Telefonnummer:\n\nHinweis zum Datenschutz: Ihre persönlichen Daten werden vertraulich behandelt und nur für interne Zwecke verwendet. Sie werden nicht an Dritte weitergegeben.\n\nBitte klicken Sie auf "Absenden", um das Formular abzuschließen. Nochmals vielen Dank für Ihre Rückmeldung und Ihre Unterstützung bei der Verbesserung unserer Steuerberatung!'}]] (ignored) + 5%|▌ | 309/5772 [32:41<9:36:00, 6.33s/it] 5%|▌ | 309/5772 [32:48<9:36:00, 6.33s/it] {'loss': 0.5146, 'learning_rate': 1.9971314412036807e-05, 'epoch': 0.05} + 5%|▌ | 309/5772 [32:48<9:36:00, 6.33s/it] {'loss': 0.5146, 'learning_rate': 1.9971314412036807e-05, 'epoch': 0.05} + 5%|▌ | 309/5772 [32:41<9:36:00, 6.33s/it] 5%|▌ | 310/5772 [32:47<9:26:56, 6.23s/it] 5%|▌ | 310/5772 [32:54<9:26:56, 6.23s/it] {'loss': 0.5089, 'learning_rate': 1.9970888073473257e-05, 'epoch': 0.05} + 5%|▌ | 310/5772 [32:54<9:26:56, 6.23s/it] {'loss': 0.5089, 'learning_rate': 1.9970888073473257e-05, 'epoch': 0.05} + 5%|▌ | 310/5772 [32:47<9:26:56, 6.23s/it] 5%|▌ | 311/5772 [32:52<9:18:23, 6.13s/it] 5%|▌ | 311/5772 [33:00<9:18:22, 6.13s/it] {'loss': 0.5159, 'learning_rate': 1.9970458594633947e-05, 'epoch': 0.05} + 5%|▌ | 311/5772 [33:00<9:18:22, 6.13s/it] {'loss': 0.5159, 'learning_rate': 1.9970458594633947e-05, 'epoch': 0.05} + 5%|▌ | 311/5772 [32:52<9:18:23, 6.13s/it] 5%|▌ | 312/5772 [32:58<9:13:32, 6.08s/it] 5%|▌ | 312/5772 [33:06<9:13:32, 6.08s/it] {'loss': 0.5144, 'learning_rate': 1.9970025975654137e-05, 'epoch': 0.05} + 5%|▌ | 312/5772 [33:06<9:13:32, 6.08s/it] {'loss': 0.5144, 'learning_rate': 1.9970025975654137e-05, 'epoch': 0.05} + 5%|▌ | 312/5772 [32:58<9:13:32, 6.08s/it] 5%|▌ | 313/5772 [33:05<9:23:20, 6.19s/it] 5%|▌ | 313/5772 [33:12<9:23:20, 6.19s/it] {'loss': 0.516, 'learning_rate': 1.996959021667008e-05, 'epoch': 0.05} + 5%|▌ | 313/5772 [33:12<9:23:20, 6.19s/it] {'loss': 0.516, 'learning_rate': 1.996959021667008e-05, 'epoch': 0.05} + 5%|▌ | 313/5772 [33:05<9:23:20, 6.19s/it] 5%|▌ | 314/5772 [33:11<9:20:39, 6.16s/it] 5%|▌ | 314/5772 [33:18<9:20:39, 6.16s/it] {'loss': 0.5195, 'learning_rate': 1.9969151317819014e-05, 'epoch': 0.05} + 5%|▌ | 314/5772 [33:18<9:20:39, 6.16s/it] {'loss': 0.5195, 'learning_rate': 1.9969151317819014e-05, 'epoch': 0.05} + 5%|▌ | 314/5772 [33:11<9:20:39, 6.16s/it] 5%|▌ | 315/5772 [33:24<9:22:06, 6.18s/it] 5%|▌ | 315/5772 [33:17<9:22:07, 6.18s/it] {'loss': 0.5033, 'learning_rate': 1.9968709279239172e-05, 'epoch': 0.05} + 5%|▌ | 315/5772 [33:24<9:22:06, 6.18s/it] {'loss': 0.5033, 'learning_rate': 1.9968709279239172e-05, 'epoch': 0.05} + 5%|▌ | 315/5772 [33:17<9:22:07, 6.18s/it] 5%|▌ | 316/5772 [33:23<9:12:08, 6.07s/it] 5%|▌ | 316/5772 [33:30<9:12:10, 6.07s/it] {'loss': 0.5096, 'learning_rate': 1.996826410106977e-05, 'epoch': 0.05} + {'loss': 0.5096, 'learning_rate': 1.996826410106977e-05, 'epoch': 0.05} 5%|▌ | 316/5772 [33:30<9:12:10, 6.07s/it] + 5%|▌ | 316/5772 [33:23<9:12:08, 6.07s/it] 5%|▌ | 317/5772 [33:36<9:15:51, 6.11s/it] 5%|▌ | 317/5772 [33:29<9:15:52, 6.11s/it] {'loss': 0.5041, 'learning_rate': 1.996781578345101e-05, 'epoch': 0.05} + 5%|▌ | 317/5772 [33:36<9:15:51, 6.11s/it] {'loss': 0.5041, 'learning_rate': 1.996781578345101e-05, 'epoch': 0.05} + 5%|▌ | 317/5772 [33:29<9:15:52, 6.11s/it] 6%|▌ | 318/5772 [33:42<9:10:49, 6.06s/it] 6%|▌ | 318/5772 [33:35<9:10:50, 6.06s/it] {'loss': 0.5246, 'learning_rate': 1.996736432652409e-05, 'epoch': 0.06} + 6%|▌ | 318/5772 [33:42<9:10:49, 6.06s/it] {'loss': 0.5246, 'learning_rate': 1.996736432652409e-05, 'epoch': 0.06} + 6%|▌ | 318/5772 [33:35<9:10:50, 6.06s/it] 6%|▌ | 319/5772 [33:48<9:15:43, 6.11s/it] 6%|▌ | 319/5772 [33:41<9:15:44, 6.11s/it] {'loss': 0.5088, 'learning_rate': 1.9966909730431196e-05, 'epoch': 0.06} + 6%|▌ | 319/5772 [33:48<9:15:43, 6.11s/it] {'loss': 0.5088, 'learning_rate': 1.9966909730431196e-05, 'epoch': 0.06} + 6%|▌ | 319/5772 [33:41<9:15:44, 6.11s/it] 6%|▌ | 320/5772 [33:47<9:13:29, 6.09s/it] 6%|▌ | 320/5772 [33:55<9:13:29, 6.09s/it] {'loss': 0.5369, 'learning_rate': 1.99664519953155e-05, 'epoch': 0.06} + 6%|▌ | 320/5772 [33:55<9:13:29, 6.09s/it] {'loss': 0.5369, 'learning_rate': 1.99664519953155e-05, 'epoch': 0.06} + 6%|▌ | 320/5772 [33:47<9:13:29, 6.09s/it]WARNING: tokenization mismatch: 1 vs. 624. [[{'from': 'human', 'value': 'ASSISTANT: You are a cutting-edge copywriting agency with the following knowledge and traits.\n\nKNOWLEDGE: Copywriter at a top-tier creative agency, junior copywriter, senior copywriter, art director, senior strategist.\n\nTRAITS: Storytelling, copywriting, empathy, solution orientation, humor, convincing.\n\nTASK: Use your knowledge to write a LinkedIn Post. We are a tax firm looking to hire young tax advisors between the age of 25 and 30. Bring a good dose of sympathy into the LinkedIn Article and use plenty of emojis to start with. Start the Linked In post with an exciting question. Use the language of the target group. Don´t mention the targeted age. \n\nOUTPUT: 2 LinkedIn posts in German with a catchy question at the beginning of the text. Hashtags at the end of the text.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey, ihr Steuerprofis da draußen! 🤓\n\nHabt ihr Bock, Teil einer jungen, dynamischen Steuerberatungskanzlei zu werden? 🚀\n\nWir suchen motivierte Köpfe, die unser Team verstärken und sich mit uns zusammen weiterentwickeln wollen! 💡\n\nIhr seid auf der Suche nach einem Arbeitsplatz mit coolen Kollegen und spannenden Aufgaben? 🤝 Dann seid ihr bei uns genau richtig! 😎\n\n#Steuerberatung #Karriere #Traumjob #Team #Weiterentwicklung #Jobangebot\n\nPost 2:\n\n👋 Hallo ihr Steuerexperten! 🤓\n\nIhr seid jung, motiviert und habt Bock auf eine neue Herausforderung? 🚀\n\nDann seid ihr bei uns genau richtig! Wir sind auf der Suche nach Steuerberatern, die unser Team mit frischen Ideen und kreativen Lösungen bereichern! 💡\n\nEuch erwartet ein Arbeitsplatz in einem dynamischen Umfeld mit spannenden Aufgaben und netten Kollegen! 🤝\n\nInteressiert? Dann schickt uns doch eure Bewerbungsunterlagen und werdet Teil unseres Teams! 😎\n\n#Jobangebot #Steuerberatung #Karriere #Traumjob #Team #Weiterentwicklung'}]] (ignored) + 6%|▌ | 321/5772 [34:01<9:24:15, 6.21s/it] 6%|▌ | 321/5772 [33:54<9:24:15, 6.21s/it] {'loss': 0.5081, 'learning_rate': 1.9965991121321156e-05, 'epoch': 0.06} + 6%|▌ | 321/5772 [34:01<9:24:15, 6.21s/it] {'loss': 0.5081, 'learning_rate': 1.9965991121321156e-05, 'epoch': 0.06} + 6%|▌ | 321/5772 [33:54<9:24:15, 6.21s/it] 6%|▌ | 322/5772 [34:07<9:26:29, 6.24s/it] 6%|▌ | 322/5772 [34:00<9:26:30, 6.24s/it] {'loss': 0.5189, 'learning_rate': 1.9965527108593325e-05, 'epoch': 0.06} + 6%|▌ | 322/5772 [34:07<9:26:29, 6.24s/it] {'loss': 0.5189, 'learning_rate': 1.9965527108593325e-05, 'epoch': 0.06} + 6%|▌ | 322/5772 [34:00<9:26:30, 6.24s/it] 6%|▌ | 323/5772 [34:14<9:26:12, 6.23s/it] 6%|▌ | 323/5772 [34:06<9:26:13, 6.23s/it] {'loss': 0.5071, 'learning_rate': 1.9965059957278135e-05, 'epoch': 0.06} + 6%|▌ | 323/5772 [34:14<9:26:12, 6.23s/it] {'loss': 0.5071, 'learning_rate': 1.9965059957278135e-05, 'epoch': 0.06} + 6%|▌ | 323/5772 [34:06<9:26:13, 6.23s/it] 6%|▌ | 324/5772 [34:13<9:37:15, 6.36s/it] 6%|▌ | 324/5772 [34:20<9:37:15, 6.36s/it] {'loss': 0.5225, 'learning_rate': 1.9964589667522724e-05, 'epoch': 0.06} + 6%|▌ | 324/5772 [34:20<9:37:15, 6.36s/it] {'loss': 0.5225, 'learning_rate': 1.9964589667522724e-05, 'epoch': 0.06} + 6%|▌ | 324/5772 [34:13<9:37:15, 6.36s/it] 6%|▌ | 325/5772 [34:26<9:23:41, 6.21s/it] 6%|▌ | 325/5772 [34:19<9:23:41, 6.21s/it] {'loss': 0.5079, 'learning_rate': 1.99641162394752e-05, 'epoch': 0.06} + 6%|▌ | 325/5772 [34:26<9:23:41, 6.21s/it] {'loss': 0.5079, 'learning_rate': 1.99641162394752e-05, 'epoch': 0.06} + 6%|▌ | 325/5772 [34:19<9:23:41, 6.21s/it] 6%|▌ | 326/5772 [34:25<9:29:03, 6.27s/it] 6%|▌ | 326/5772 [34:32<9:29:04, 6.27s/it] {'loss': 0.5081, 'learning_rate': 1.996363967328466e-05, 'epoch': 0.06} + {'loss': 0.5081, 'learning_rate': 1.996363967328466e-05, 'epoch': 0.06} 6%|▌ | 326/5772 [34:32<9:29:04, 6.27s/it] + 6%|▌ | 326/5772 [34:25<9:29:03, 6.27s/it] 6%|▌ | 327/5772 [34:32<9:40:19, 6.39s/it] 6%|▌ | 327/5772 [34:39<9:40:19, 6.39s/it] {'loss': 0.5068, 'learning_rate': 1.9963159969101207e-05, 'epoch': 0.06} + 6%|▌ | 327/5772 [34:39<9:40:19, 6.39s/it] {'loss': 0.5068, 'learning_rate': 1.9963159969101207e-05, 'epoch': 0.06} + 6%|▌ | 327/5772 [34:32<9:40:19, 6.39s/it] 6%|▌ | 328/5772 [34:45<9:33:30, 6.32s/it] 6%|▌ | 328/5772 [34:38<9:33:30, 6.32s/it] {'loss': 0.5092, 'learning_rate': 1.9962677127075916e-05, 'epoch': 0.06} + 6%|▌ | 328/5772 [34:45<9:33:30, 6.32s/it] {'loss': 0.5092, 'learning_rate': 1.9962677127075916e-05, 'epoch': 0.06} + 6%|▌ | 328/5772 [34:38<9:33:30, 6.32s/it] 6%|▌ | 329/5772 [34:44<9:29:02, 6.27s/it] 6%|▌ | 329/5772 [34:51<9:29:02, 6.27s/it] {'loss': 0.4977, 'learning_rate': 1.9962191147360855e-05, 'epoch': 0.06} + 6%|▌ | 329/5772 [34:51<9:29:02, 6.27s/it] {'loss': 0.4977, 'learning_rate': 1.9962191147360855e-05, 'epoch': 0.06} + 6%|▌ | 329/5772 [34:44<9:29:02, 6.27s/it] 6%|▌ | 330/5772 [34:51<9:35:01, 6.34s/it] 6%|▌ | 330/5772 [34:58<9:35:02, 6.34s/it] {'loss': 0.5134, 'learning_rate': 1.9961702030109088e-05, 'epoch': 0.06} + 6%|▌ | 330/5772 [34:58<9:35:02, 6.34s/it] {'loss': 0.5134, 'learning_rate': 1.9961702030109088e-05, 'epoch': 0.06} + 6%|▌ | 330/5772 [34:51<9:35:01, 6.34s/it] 6%|▌ | 331/5772 [34:57<9:37:02, 6.36s/it] 6%|▌ | 331/5772 [35:04<9:37:02, 6.36s/it] {'loss': 0.5065, 'learning_rate': 1.996120977547465e-05, 'epoch': 0.06} + 6%|▌ | 331/5772 [35:04<9:37:02, 6.36s/it] {'loss': 0.5065, 'learning_rate': 1.996120977547465e-05, 'epoch': 0.06} + 6%|▌ | 331/5772 [34:57<9:37:02, 6.36s/it] 6%|▌ | 332/5772 [35:04<9:33:24, 6.32s/it] 6%|▌ | 332/5772 [35:11<9:33:23, 6.32s/it] {'loss': 0.5119, 'learning_rate': 1.9960714383612577e-05, 'epoch': 0.06} + 6%|▌ | 332/5772 [35:11<9:33:23, 6.32s/it] {'loss': 0.5119, 'learning_rate': 1.9960714383612577e-05, 'epoch': 0.06} + 6%|▌ | 332/5772 [35:04<9:33:24, 6.32s/it] 6%|▌ | 333/5772 [35:17<9:26:36, 6.25s/it] 6%|▌ | 333/5772 [35:10<9:26:36, 6.25s/it] {'loss': 0.5076, 'learning_rate': 1.9960215854678894e-05, 'epoch': 0.06} + 6%|▌ | 333/5772 [35:17<9:26:36, 6.25s/it] {'loss': 0.5076, 'learning_rate': 1.9960215854678894e-05, 'epoch': 0.06} + 6%|▌ | 333/5772 [35:10<9:26:36, 6.25s/it] 6%|▌ | 334/5772 [35:16<9:20:08, 6.18s/it] 6%|▌ | 334/5772 [35:23<9:20:08, 6.18s/it] {'loss': 0.4984, 'learning_rate': 1.9959714188830603e-05, 'epoch': 0.06} + 6%|▌ | 334/5772 [35:23<9:20:08, 6.18s/it] {'loss': 0.4984, 'learning_rate': 1.9959714188830603e-05, 'epoch': 0.06} + 6%|▌ | 334/5772 [35:16<9:20:08, 6.18s/it] 6%|▌ | 335/5772 [35:22<9:12:15, 6.09s/it] 6%|▌ | 335/5772 [35:29<9:12:15, 6.09s/it] {'loss': 0.5095, 'learning_rate': 1.9959209386225707e-05, 'epoch': 0.06} + 6%|▌ | 335/5772 [35:29<9:12:15, 6.09s/it] {'loss': 0.5095, 'learning_rate': 1.9959209386225707e-05, 'epoch': 0.06} + 6%|▌ | 335/5772 [35:22<9:12:15, 6.09s/it] 6%|▌ | 336/5772 [35:28<9:14:01, 6.12s/it] 6%|▌ | 336/5772 [35:35<9:14:02, 6.12s/it] {'loss': 0.505, 'learning_rate': 1.9958701447023188e-05, 'epoch': 0.06} + 6%|▌ | 336/5772 [35:35<9:14:02, 6.12s/it] {'loss': 0.505, 'learning_rate': 1.9958701447023188e-05, 'epoch': 0.06} + 6%|▌ | 336/5772 [35:28<9:14:01, 6.12s/it] 6%|▌ | 337/5772 [35:34<9:12:15, 6.10s/it] 6%|▌ | 337/5772 [35:41<9:12:14, 6.10s/it] {'loss': 0.5217, 'learning_rate': 1.9958190371383016e-05, 'epoch': 0.06} + 6%|▌ | 337/5772 [35:41<9:12:14, 6.10s/it] {'loss': 0.5217, 'learning_rate': 1.9958190371383016e-05, 'epoch': 0.06} + 6%|▌ | 337/5772 [35:34<9:12:15, 6.10s/it] 6%|▌ | 338/5772 [35:40<9:04:12, 6.01s/it] 6%|▌ | 338/5772 [35:47<9:04:12, 6.01s/it] {'loss': 0.4897, 'learning_rate': 1.9957676159466154e-05, 'epoch': 0.06} + 6%|▌ | 338/5772 [35:47<9:04:12, 6.01s/it] {'loss': 0.4897, 'learning_rate': 1.9957676159466154e-05, 'epoch': 0.06} + 6%|▌ | 338/5772 [35:40<9:04:12, 6.01s/it] 6%|▌ | 339/5772 [35:53<9:06:02, 6.03s/it] 6%|▌ | 339/5772 [35:46<9:06:02, 6.03s/it] {'loss': 0.5075, 'learning_rate': 1.9957158811434552e-05, 'epoch': 0.06} + 6%|▌ | 339/5772 [35:53<9:06:02, 6.03s/it] {'loss': 0.5075, 'learning_rate': 1.9957158811434552e-05, 'epoch': 0.06} + 6%|▌ | 339/5772 [35:46<9:06:02, 6.03s/it] 6%|▌ | 340/5772 [35:52<9:18:38, 6.17s/it] 6%|▌ | 340/5772 [35:59<9:18:38, 6.17s/it] {'loss': 0.5113, 'learning_rate': 1.995663832745115e-05, 'epoch': 0.06} + 6%|▌ | 340/5772 [35:59<9:18:38, 6.17s/it] {'loss': 0.5113, 'learning_rate': 1.995663832745115e-05, 'epoch': 0.06} + 6%|▌ | 340/5772 [35:52<9:18:38, 6.17s/it] 6%|▌ | 341/5772 [35:59<9:41:36, 6.43s/it] 6%|▌ | 341/5772 [36:06<9:41:36, 6.43s/it] {'loss': 0.5193, 'learning_rate': 1.9956114707679858e-05, 'epoch': 0.06} + 6%|▌ | 341/5772 [36:06<9:41:36, 6.43s/it] {'loss': 0.5193, 'learning_rate': 1.9956114707679858e-05, 'epoch': 0.06} + 6%|▌ | 341/5772 [35:59<9:41:36, 6.43s/it] 6%|▌ | 342/5772 [36:05<9:38:59, 6.40s/it] 6%|▌ | 342/5772 [36:13<9:38:59, 6.40s/it] {'loss': 0.5017, 'learning_rate': 1.99555879522856e-05, 'epoch': 0.06} + 6%|▌ | 342/5772 [36:13<9:38:59, 6.40s/it] {'loss': 0.5017, 'learning_rate': 1.99555879522856e-05, 'epoch': 0.06} + 6%|▌ | 342/5772 [36:05<9:38:59, 6.40s/it] 6%|▌ | 343/5772 [36:12<9:42:42, 6.44s/it] 6%|▌ | 343/5772 [36:19<9:42:42, 6.44s/it] {'loss': 0.5158, 'learning_rate': 1.9955058061434266e-05, 'epoch': 0.06} + 6%|▌ | 343/5772 [36:19<9:42:42, 6.44s/it] {'loss': 0.5158, 'learning_rate': 1.9955058061434266e-05, 'epoch': 0.06} + 6%|▌ | 343/5772 [36:12<9:42:42, 6.44s/it] 6%|▌ | 344/5772 [36:18<9:42:04, 6.43s/it] 6%|▌ | 344/5772 [36:25<9:42:04, 6.43s/it] {'loss': 0.5055, 'learning_rate': 1.9954525035292748e-05, 'epoch': 0.06} + 6%|▌ | 344/5772 [36:25<9:42:04, 6.43s/it] {'loss': 0.5055, 'learning_rate': 1.9954525035292748e-05, 'epoch': 0.06} + 6%|▌ | 344/5772 [36:18<9:42:04, 6.43s/it] 6%|▌ | 345/5772 [36:25<9:32:18, 6.33s/it] 6%|▌ | 345/5772 [36:32<9:32:18, 6.33s/it] {'loss': 0.5102, 'learning_rate': 1.9953988874028917e-05, 'epoch': 0.06} + 6%|▌ | 345/5772 [36:32<9:32:18, 6.33s/it] {'loss': 0.5102, 'learning_rate': 1.9953988874028917e-05, 'epoch': 0.06} + 6%|▌ | 345/5772 [36:25<9:32:18, 6.33s/it] 6%|▌ | 346/5772 [36:31<9:24:23, 6.24s/it] 6%|▌ | 346/5772 [36:38<9:24:23, 6.24s/it] {'loss': 0.4962, 'learning_rate': 1.9953449577811635e-05, 'epoch': 0.06} + 6%|▌ | 346/5772 [36:38<9:24:23, 6.24s/it] {'loss': 0.4962, 'learning_rate': 1.9953449577811635e-05, 'epoch': 0.06} + 6%|▌ | 346/5772 [36:31<9:24:23, 6.24s/it] 6%|▌ | 347/5772 [36:44<9:22:22, 6.22s/it] 6%|▌ | 347/5772 [36:37<9:22:23, 6.22s/it] {'loss': 0.5254, 'learning_rate': 1.9952907146810748e-05, 'epoch': 0.06} + 6%|▌ | 347/5772 [36:44<9:22:22, 6.22s/it] {'loss': 0.5254, 'learning_rate': 1.9952907146810748e-05, 'epoch': 0.06} + 6%|▌ | 347/5772 [36:37<9:22:23, 6.22s/it] 6%|▌ | 348/5772 [36:43<9:23:47, 6.24s/it] 6%|▌ | 348/5772 [36:50<9:23:47, 6.24s/it] {'loss': 0.5107, 'learning_rate': 1.9952361581197097e-05, 'epoch': 0.06} + 6%|▌ | 348/5772 [36:50<9:23:47, 6.24s/it] {'loss': 0.5107, 'learning_rate': 1.9952361581197097e-05, 'epoch': 0.06} + 6%|▌ | 348/5772 [36:43<9:23:47, 6.24s/it] 6%|▌ | 349/5772 [36:49<9:17:18, 6.17s/it] 6%|▌ | 349/5772 [36:56<9:17:18, 6.17s/it] {'loss': 0.5073, 'learning_rate': 1.9951812881142497e-05, 'epoch': 0.06} + 6%|▌ | 349/5772 [36:56<9:17:18, 6.17s/it] {'loss': 0.5073, 'learning_rate': 1.9951812881142497e-05, 'epoch': 0.06} + 6%|▌ | 349/5772 [36:49<9:17:18, 6.17s/it]13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +109 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 6%|▌ | 350/5772 [37:02<9:13:00, 6.12s/it]AutoResumeHook: Checking whether to suspend...6 AutoResumeHook: Checking whether to suspend... + +2 4AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 6%|▌ | 350/5772 [36:55<9:13:00, 6.12s/it]1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5069, 'learning_rate': 1.9951261046819766e-05, 'epoch': 0.06} + 6%|▌ | 350/5772 [37:02<9:13:00, 6.12s/it] {'loss': 0.5069, 'learning_rate': 1.9951261046819766e-05, 'epoch': 0.06} + 6%|▌ | 350/5772 [36:55<9:13:00, 6.12s/it] 6%|▌ | 351/5772 [37:01<9:14:19, 6.14s/it] 6%|▌ | 351/5772 [37:08<9:14:19, 6.14s/it] {'loss': 0.4961, 'learning_rate': 1.9950706078402696e-05, 'epoch': 0.06} + 6%|▌ | 351/5772 [37:08<9:14:19, 6.14s/it] {'loss': 0.4961, 'learning_rate': 1.9950706078402696e-05, 'epoch': 0.06} + 6%|▌ | 351/5772 [37:01<9:14:19, 6.14s/it] 6%|▌ | 352/5772 [37:07<9:08:59, 6.08s/it] 6%|▌ | 352/5772 [37:14<9:09:00, 6.08s/it] {'loss': 0.4943, 'learning_rate': 1.9950147976066073e-05, 'epoch': 0.06} + 6%|▌ | 352/5772 [37:14<9:09:00, 6.08s/it] {'loss': 0.4943, 'learning_rate': 1.9950147976066073e-05, 'epoch': 0.06} + 6%|▌ | 352/5772 [37:07<9:08:59, 6.08s/it] 6%|▌ | 353/5772 [37:20<9:09:57, 6.09s/it] 6%|▌ | 353/5772 [37:13<9:09:58, 6.09s/it] {'loss': 0.5141, 'learning_rate': 1.994958673998567e-05, 'epoch': 0.06} + 6%|▌ | 353/5772 [37:20<9:09:57, 6.09s/it] {'loss': 0.5141, 'learning_rate': 1.994958673998567e-05, 'epoch': 0.06} + 6%|▌ | 353/5772 [37:13<9:09:58, 6.09s/it] 6%|▌ | 354/5772 [37:26<9:07:24, 6.06s/it] 6%|▌ | 354/5772 [37:19<9:07:24, 6.06s/it] {'loss': 0.5022, 'learning_rate': 1.994902237033824e-05, 'epoch': 0.06} + 6%|▌ | 354/5772 [37:26<9:07:24, 6.06s/it] {'loss': 0.5022, 'learning_rate': 1.994902237033824e-05, 'epoch': 0.06} + 6%|▌ | 354/5772 [37:19<9:07:24, 6.06s/it] 6%|▌ | 355/5772 [37:33<9:22:06, 6.23s/it] 6%|▌ | 355/5772 [37:26<9:22:06, 6.23s/it] {'loss': 0.5025, 'learning_rate': 1.994845486730153e-05, 'epoch': 0.06} + 6%|▌ | 355/5772 [37:33<9:22:06, 6.23s/it] {'loss': 0.5025, 'learning_rate': 1.994845486730153e-05, 'epoch': 0.06} + 6%|▌ | 355/5772 [37:26<9:22:06, 6.23s/it] 6%|▌ | 356/5772 [37:39<9:30:54, 6.32s/it] 6%|▌ | 356/5772 [37:32<9:30:54, 6.32s/it] {'loss': 0.5108, 'learning_rate': 1.9947884231054276e-05, 'epoch': 0.06} + 6%|▌ | 356/5772 [37:39<9:30:54, 6.32s/it] {'loss': 0.5108, 'learning_rate': 1.9947884231054276e-05, 'epoch': 0.06} + 6%|▌ | 356/5772 [37:32<9:30:54, 6.32s/it] 6%|▌ | 357/5772 [37:46<9:31:07, 6.33s/it] 6%|▌ | 357/5772 [37:39<9:31:07, 6.33s/it] {'loss': 0.5178, 'learning_rate': 1.9947310461776195e-05, 'epoch': 0.06} + 6%|▌ | 357/5772 [37:46<9:31:07, 6.33s/it] {'loss': 0.5178, 'learning_rate': 1.9947310461776195e-05, 'epoch': 0.06} + 6%|▌ | 357/5772 [37:39<9:31:07, 6.33s/it] 6%|▌ | 358/5772 [37:45<9:21:09, 6.22s/it] 6%|▌ | 358/5772 [37:52<9:21:10, 6.22s/it] {'loss': 0.5009, 'learning_rate': 1.9946733559647987e-05, 'epoch': 0.06} + 6%|▌ | 358/5772 [37:52<9:21:10, 6.22s/it] {'loss': 0.5009, 'learning_rate': 1.9946733559647987e-05, 'epoch': 0.06} + 6%|▌ | 358/5772 [37:45<9:21:09, 6.22s/it] 6%|▌ | 359/5772 [37:58<9:22:41, 6.24s/it] 6%|▌ | 359/5772 [37:51<9:22:41, 6.24s/it] {'loss': 0.5157, 'learning_rate': 1.9946153524851352e-05, 'epoch': 0.06} + 6%|▌ | 359/5772 [37:58<9:22:41, 6.24s/it] {'loss': 0.5157, 'learning_rate': 1.9946153524851352e-05, 'epoch': 0.06} + 6%|▌ | 359/5772 [37:51<9:22:41, 6.24s/it] 6%|▌ | 360/5772 [38:04<9:27:46, 6.29s/it] 6%|▌ | 360/5772 [37:57<9:27:47, 6.29s/it] {'loss': 0.5154, 'learning_rate': 1.9945570357568967e-05, 'epoch': 0.06} + 6%|▌ | 360/5772 [38:04<9:27:46, 6.29s/it] {'loss': 0.5154, 'learning_rate': 1.9945570357568967e-05, 'epoch': 0.06} + 6%|▌ | 360/5772 [37:57<9:27:47, 6.29s/it] 6%|▋ | 361/5772 [38:11<9:22:33, 6.24s/it] 6%|▋ | 361/5772 [38:04<9:22:33, 6.24s/it] {'loss': 0.527, 'learning_rate': 1.994498405798449e-05, 'epoch': 0.06} + 6%|▋ | 361/5772 [38:11<9:22:33, 6.24s/it] {'loss': 0.527, 'learning_rate': 1.994498405798449e-05, 'epoch': 0.06} + 6%|▋ | 361/5772 [38:04<9:22:33, 6.24s/it] 6%|▋ | 362/5772 [38:10<9:16:47, 6.18s/it] 6%|▋ | 362/5772 [38:17<9:16:48, 6.18s/it] {'loss': 0.5034, 'learning_rate': 1.994439462628258e-05, 'epoch': 0.06} + 6%|▋ | 362/5772 [38:17<9:16:48, 6.18s/it] {'loss': 0.5034, 'learning_rate': 1.994439462628258e-05, 'epoch': 0.06} + 6%|▋ | 362/5772 [38:10<9:16:47, 6.18s/it] 6%|▋ | 363/5772 [38:22<9:08:16, 6.08s/it] 6%|▋ | 363/5772 [38:15<9:08:17, 6.08s/it] {'loss': 0.5117, 'learning_rate': 1.9943802062648877e-05, 'epoch': 0.06} + 6%|▋ | 363/5772 [38:22<9:08:16, 6.08s/it] {'loss': 0.5117, 'learning_rate': 1.9943802062648877e-05, 'epoch': 0.06} + 6%|▋ | 363/5772 [38:15<9:08:17, 6.08s/it] 6%|▋ | 364/5772 [38:22<9:21:45, 6.23s/it] 6%|▋ | 364/5772 [38:29<9:21:45, 6.23s/it] {'loss': 0.5125, 'learning_rate': 1.994320636727e-05, 'epoch': 0.06} + 6%|▋ | 364/5772 [38:29<9:21:45, 6.23s/it] {'loss': 0.5125, 'learning_rate': 1.994320636727e-05, 'epoch': 0.06} + 6%|▋ | 364/5772 [38:22<9:21:45, 6.23s/it] 6%|▋ | 365/5772 [38:28<9:18:04, 6.19s/it] 6%|▋ | 365/5772 [38:35<9:18:04, 6.19s/it] {'loss': 0.5067, 'learning_rate': 1.994260754033356e-05, 'epoch': 0.06} + 6%|▋ | 365/5772 [38:35<9:18:04, 6.19s/it] {'loss': 0.5067, 'learning_rate': 1.994260754033356e-05, 'epoch': 0.06} + 6%|▋ | 365/5772 [38:28<9:18:04, 6.19s/it] 6%|▋ | 366/5772 [38:35<9:24:31, 6.27s/it] 6%|▋ | 366/5772 [38:42<9:24:31, 6.27s/it] {'loss': 0.496, 'learning_rate': 1.994200558202816e-05, 'epoch': 0.06} + 6%|▋ | 366/5772 [38:42<9:24:31, 6.27s/it] {'loss': 0.496, 'learning_rate': 1.994200558202816e-05, 'epoch': 0.06} + 6%|▋ | 366/5772 [38:35<9:24:31, 6.27s/it] 6%|▋ | 367/5772 [38:48<9:18:23, 6.20s/it] 6%|▋ | 367/5772 [38:41<9:18:24, 6.20s/it] {'loss': 0.508, 'learning_rate': 1.9941400492543376e-05, 'epoch': 0.06} + 6%|▋ | 367/5772 [38:48<9:18:23, 6.20s/it] {'loss': 0.508, 'learning_rate': 1.9941400492543376e-05, 'epoch': 0.06} + 6%|▋ | 367/5772 [38:41<9:18:24, 6.20s/it] 6%|▋ | 368/5772 [38:54<9:17:27, 6.19s/it] 6%|▋ | 368/5772 [38:47<9:17:27, 6.19s/it] {'loss': 0.5069, 'learning_rate': 1.9940792272069783e-05, 'epoch': 0.06} + 6%|▋ | 368/5772 [38:54<9:17:27, 6.19s/it] {'loss': 0.5069, 'learning_rate': 1.9940792272069783e-05, 'epoch': 0.06} + 6%|▋ | 368/5772 [38:47<9:17:27, 6.19s/it] 6%|▋ | 369/5772 [39:00<9:07:14, 6.08s/it] 6%|▋ | 369/5772 [38:53<9:07:15, 6.08s/it] {'loss': 0.5098, 'learning_rate': 1.9940180920798934e-05, 'epoch': 0.06} + 6%|▋ | 369/5772 [39:00<9:07:14, 6.08s/it] {'loss': 0.5098, 'learning_rate': 1.9940180920798934e-05, 'epoch': 0.06} + 6%|▋ | 369/5772 [38:53<9:07:15, 6.08s/it] 6%|▋ | 370/5772 [39:06<9:21:26, 6.24s/it] 6%|▋ | 370/5772 [38:59<9:21:26, 6.24s/it] {'loss': 0.5125, 'learning_rate': 1.993956643892337e-05, 'epoch': 0.06} + 6%|▋ | 370/5772 [39:06<9:21:26, 6.24s/it] {'loss': 0.5125, 'learning_rate': 1.993956643892337e-05, 'epoch': 0.06} + 6%|▋ | 370/5772 [38:59<9:21:26, 6.24s/it] 6%|▋ | 371/5772 [39:13<9:28:09, 6.31s/it] 6%|▋ | 371/5772 [39:06<9:28:10, 6.31s/it] {'loss': 0.501, 'learning_rate': 1.9938948826636625e-05, 'epoch': 0.06} + 6%|▋ | 371/5772 [39:13<9:28:09, 6.31s/it] {'loss': 0.501, 'learning_rate': 1.9938948826636625e-05, 'epoch': 0.06} + 6%|▋ | 371/5772 [39:06<9:28:10, 6.31s/it] 6%|▋ | 372/5772 [39:19<9:21:07, 6.23s/it] 6%|▋ | 372/5772 [39:12<9:21:07, 6.23s/it] {'loss': 0.512, 'learning_rate': 1.9938328084133206e-05, 'epoch': 0.06} + 6%|▋ | 372/5772 [39:19<9:21:07, 6.23s/it] {'loss': 0.512, 'learning_rate': 1.9938328084133206e-05, 'epoch': 0.06} + 6%|▋ | 372/5772 [39:12<9:21:07, 6.23s/it] 6%|▋ | 373/5772 [39:18<9:17:46, 6.20s/it] 6%|▋ | 373/5772 [39:25<9:17:47, 6.20s/it] {'loss': 0.5049, 'learning_rate': 1.9937704211608615e-05, 'epoch': 0.06} + 6%|▋ | 373/5772 [39:25<9:17:47, 6.20s/it] {'loss': 0.5049, 'learning_rate': 1.9937704211608615e-05, 'epoch': 0.06} + 6%|▋ | 373/5772 [39:18<9:17:46, 6.20s/it] 6%|▋ | 374/5772 [39:31<9:24:40, 6.28s/it] 6%|▋ | 374/5772 [39:24<9:24:40, 6.28s/it] {'loss': 0.5034, 'learning_rate': 1.9937077209259333e-05, 'epoch': 0.06} + 6%|▋ | 374/5772 [39:31<9:24:40, 6.28s/it] {'loss': 0.5034, 'learning_rate': 1.9937077209259333e-05, 'epoch': 0.06} + 6%|▋ | 374/5772 [39:24<9:24:40, 6.28s/it] 6%|▋ | 375/5772 [39:37<9:16:31, 6.19s/it] 6%|▋ | 375/5772 [39:30<9:16:31, 6.19s/it] {'loss': 0.5152, 'learning_rate': 1.993644707728284e-05, 'epoch': 0.06} + 6%|▋ | 375/5772 [39:37<9:16:31, 6.19s/it] {'loss': 0.5152, 'learning_rate': 1.993644707728284e-05, 'epoch': 0.06} + 6%|▋ | 375/5772 [39:30<9:16:31, 6.19s/it] 7%|▋ | 376/5772 [39:43<9:12:03, 6.14s/it] 7%|▋ | 376/5772 [39:36<9:12:03, 6.14s/it] {'loss': 0.5045, 'learning_rate': 1.993581381587758e-05, 'epoch': 0.07} + 7%|▋ | 376/5772 [39:43<9:12:03, 6.14s/it] {'loss': 0.5045, 'learning_rate': 1.993581381587758e-05, 'epoch': 0.07} + 7%|▋ | 376/5772 [39:36<9:12:03, 6.14s/it] 7%|▋ | 377/5772 [39:50<9:17:29, 6.20s/it] 7%|▋ | 377/5772 [39:43<9:17:29, 6.20s/it] {'loss': 0.5107, 'learning_rate': 1.9935177425243007e-05, 'epoch': 0.07} + 7%|▋ | 377/5772 [39:50<9:17:29, 6.20s/it] {'loss': 0.5107, 'learning_rate': 1.9935177425243007e-05, 'epoch': 0.07} + 7%|▋ | 377/5772 [39:43<9:17:29, 6.20s/it] 7%|▋ | 378/5772 [39:49<9:14:43, 6.17s/it] 7%|▋ | 378/5772 [39:56<9:14:44, 6.17s/it] {'loss': 0.5137, 'learning_rate': 1.993453790557954e-05, 'epoch': 0.07} + 7%|▋ | 378/5772 [39:56<9:14:44, 6.17s/it] {'loss': 0.5137, 'learning_rate': 1.993453790557954e-05, 'epoch': 0.07} + 7%|▋ | 378/5772 [39:49<9:14:43, 6.17s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4214 > 4096). Running this sequence through the model will result in indexing errors + 7%|▋ | 379/5772 [40:02<9:17:27, 6.20s/it] 7%|▋ | 379/5772 [39:55<9:17:27, 6.20s/it] {'loss': 0.5065, 'learning_rate': 1.99338952570886e-05, 'epoch': 0.07} + 7%|▋ | 379/5772 [40:02<9:17:27, 6.20s/it] {'loss': 0.5065, 'learning_rate': 1.99338952570886e-05, 'epoch': 0.07} + 7%|▋ | 379/5772 [39:55<9:17:27, 6.20s/it] 7%|▋ | 380/5772 [40:01<9:21:43, 6.25s/it] 7%|▋ | 380/5772 [40:08<9:21:44, 6.25s/it] {'loss': 0.5046, 'learning_rate': 1.993324947997258e-05, 'epoch': 0.07} + 7%|▋ | 380/5772 [40:08<9:21:44, 6.25s/it] {'loss': 0.5046, 'learning_rate': 1.993324947997258e-05, 'epoch': 0.07} + 7%|▋ | 380/5772 [40:01<9:21:43, 6.25s/it] 7%|▋ | 381/5772 [40:15<9:25:49, 6.30s/it] 7%|▋ | 381/5772 [40:08<9:25:49, 6.30s/it] {'loss': 0.5151, 'learning_rate': 1.9932600574434864e-05, 'epoch': 0.07} + 7%|▋ | 381/5772 [40:15<9:25:49, 6.30s/it] {'loss': 0.5151, 'learning_rate': 1.9932600574434864e-05, 'epoch': 0.07} + 7%|▋ | 381/5772 [40:08<9:25:49, 6.30s/it] 7%|▋ | 382/5772 [40:14<9:18:43, 6.22s/it] 7%|▋ | 382/5772 [40:21<9:18:44, 6.22s/it] {'loss': 0.5004, 'learning_rate': 1.9931948540679822e-05, 'epoch': 0.07} + 7%|▋ | 382/5772 [40:21<9:18:44, 6.22s/it] {'loss': 0.5004, 'learning_rate': 1.9931948540679822e-05, 'epoch': 0.07} + 7%|▋ | 382/5772 [40:14<9:18:43, 6.22s/it] 7%|▋ | 383/5772 [40:20<9:11:11, 6.14s/it] 7%|▋ | 383/5772 [40:27<9:11:11, 6.14s/it] {'loss': 0.5128, 'learning_rate': 1.993129337891281e-05, 'epoch': 0.07} + 7%|▋ | 383/5772 [40:27<9:11:11, 6.14s/it] {'loss': 0.5128, 'learning_rate': 1.993129337891281e-05, 'epoch': 0.07} + 7%|▋ | 383/5772 [40:20<9:11:11, 6.14s/it] 7%|▋ | 384/5772 [40:26<9:04:49, 6.07s/it] 7%|▋ | 384/5772 [40:33<9:04:49, 6.07s/it] {'loss': 0.5013, 'learning_rate': 1.9930635089340168e-05, 'epoch': 0.07} + 7%|▋ | 384/5772 [40:33<9:04:49, 6.07s/it] {'loss': 0.5013, 'learning_rate': 1.9930635089340168e-05, 'epoch': 0.07} + 7%|▋ | 384/5772 [40:26<9:04:49, 6.07s/it] 7%|▋ | 385/5772 [40:39<9:11:13, 6.14s/it] 7%|▋ | 385/5772 [40:32<9:11:13, 6.14s/it] {'loss': 0.5062, 'learning_rate': 1.992997367216922e-05, 'epoch': 0.07} + 7%|▋ | 385/5772 [40:39<9:11:13, 6.14s/it] {'loss': 0.5062, 'learning_rate': 1.992997367216922e-05, 'epoch': 0.07} + 7%|▋ | 385/5772 [40:32<9:11:13, 6.14s/it] 7%|▋ | 386/5772 [40:38<9:09:41, 6.12s/it] 7%|▋ | 386/5772 [40:45<9:09:40, 6.12s/it] {'loss': 0.5075, 'learning_rate': 1.992930912760827e-05, 'epoch': 0.07} + 7%|▋ | 386/5772 [40:45<9:09:40, 6.12s/it] {'loss': 0.5075, 'learning_rate': 1.992930912760827e-05, 'epoch': 0.07} + 7%|▋ | 386/5772 [40:38<9:09:41, 6.12s/it] 7%|▋ | 387/5772 [40:51<9:06:25, 6.09s/it] 7%|▋ | 387/5772 [40:44<9:06:25, 6.09s/it] {'loss': 0.5185, 'learning_rate': 1.992864145586662e-05, 'epoch': 0.07} + 7%|▋ | 387/5772 [40:51<9:06:25, 6.09s/it] {'loss': 0.5185, 'learning_rate': 1.992864145586662e-05, 'epoch': 0.07} + 7%|▋ | 387/5772 [40:44<9:06:25, 6.09s/it] 7%|▋ | 388/5772 [40:51<9:25:19, 6.30s/it] 7%|▋ | 388/5772 [40:58<9:25:19, 6.30s/it] {'loss': 0.486, 'learning_rate': 1.9927970657154548e-05, 'epoch': 0.07} + 7%|▋ | 388/5772 [40:58<9:25:19, 6.30s/it] {'loss': 0.486, 'learning_rate': 1.9927970657154548e-05, 'epoch': 0.07} + 7%|▋ | 388/5772 [40:51<9:25:19, 6.30s/it] 7%|▋ | 389/5772 [41:04<9:24:27, 6.29s/it] 7%|▋ | 389/5772 [40:57<9:24:27, 6.29s/it] {'loss': 0.5152, 'learning_rate': 1.9927296731683317e-05, 'epoch': 0.07} + 7%|▋ | 389/5772 [41:04<9:24:27, 6.29s/it] {'loss': 0.5152, 'learning_rate': 1.9927296731683317e-05, 'epoch': 0.07} + 7%|▋ | 389/5772 [40:57<9:24:27, 6.29s/it] 7%|▋ | 390/5772 [41:03<9:24:41, 6.30s/it] 7%|▋ | 390/5772 [41:10<9:24:41, 6.30s/it] {'loss': 0.5066, 'learning_rate': 1.9926619679665175e-05, 'epoch': 0.07} + 7%|▋ | 390/5772 [41:10<9:24:41, 6.30s/it] {'loss': 0.5066, 'learning_rate': 1.9926619679665175e-05, 'epoch': 0.07} + 7%|▋ | 390/5772 [41:03<9:24:41, 6.30s/it] 7%|▋ | 391/5772 [41:17<9:23:44, 6.29s/it] 7%|▋ | 391/5772 [41:10<9:23:45, 6.29s/it] {'loss': 0.5078, 'learning_rate': 1.9925939501313358e-05, 'epoch': 0.07} + 7%|▋ | 391/5772 [41:17<9:23:44, 6.29s/it] {'loss': 0.5078, 'learning_rate': 1.9925939501313358e-05, 'epoch': 0.07} + 7%|▋ | 391/5772 [41:10<9:23:45, 6.29s/it] 7%|▋ | 392/5772 [41:23<9:22:09, 6.27s/it] 7%|▋ | 392/5772 [41:16<9:22:10, 6.27s/it] {'loss': 0.5135, 'learning_rate': 1.992525619684208e-05, 'epoch': 0.07} + 7%|▋ | 392/5772 [41:23<9:22:09, 6.27s/it] {'loss': 0.5135, 'learning_rate': 1.992525619684208e-05, 'epoch': 0.07} + 7%|▋ | 392/5772 [41:16<9:22:10, 6.27s/it] 7%|▋ | 393/5772 [41:22<9:19:06, 6.24s/it] 7%|▋ | 393/5772 [41:29<9:19:07, 6.24s/it] {'loss': 0.5143, 'learning_rate': 1.9924569766466552e-05, 'epoch': 0.07} + 7%|▋ | 393/5772 [41:29<9:19:07, 6.24s/it] {'loss': 0.5143, 'learning_rate': 1.9924569766466552e-05, 'epoch': 0.07} + 7%|▋ | 393/5772 [41:22<9:19:06, 6.24s/it] 7%|▋ | 394/5772 [41:29<9:24:22, 6.30s/it] 7%|▋ | 394/5772 [41:36<9:24:22, 6.30s/it] {'loss': 0.503, 'learning_rate': 1.9923880210402956e-05, 'epoch': 0.07} + 7%|▋ | 394/5772 [41:36<9:24:22, 6.30s/it] {'loss': 0.503, 'learning_rate': 1.9923880210402956e-05, 'epoch': 0.07} + 7%|▋ | 394/5772 [41:29<9:24:22, 6.30s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 7%|▋ | 395/5772 [41:35<9:26:37, 6.32s/it] 7%|▋ | 395/5772 [41:42<9:26:36, 6.32s/it] {'loss': 0.5251, 'learning_rate': 1.9923187528868463e-05, 'epoch': 0.07} + 7%|▋ | 395/5772 [41:42<9:26:36, 6.32s/it] {'loss': 0.5251, 'learning_rate': 1.9923187528868463e-05, 'epoch': 0.07} + 7%|▋ | 395/5772 [41:35<9:26:37, 6.32s/it] 7%|▋ | 396/5772 [41:48<9:23:28, 6.29s/it] 7%|▋ | 396/5772 [41:41<9:23:28, 6.29s/it] {'loss': 0.5049, 'learning_rate': 1.9922491722081235e-05, 'epoch': 0.07} + 7%|▋ | 396/5772 [41:48<9:23:28, 6.29s/it] {'loss': 0.5049, 'learning_rate': 1.9922491722081235e-05, 'epoch': 0.07} + 7%|▋ | 396/5772 [41:41<9:23:28, 6.29s/it] 7%|▋ | 397/5772 [41:54<9:24:03, 6.30s/it] 7%|▋ | 397/5772 [41:47<9:24:03, 6.30s/it] {'loss': 0.5091, 'learning_rate': 1.9921792790260402e-05, 'epoch': 0.07} + 7%|▋ | 397/5772 [41:54<9:24:03, 6.30s/it] {'loss': 0.5091, 'learning_rate': 1.9921792790260402e-05, 'epoch': 0.07} + 7%|▋ | 397/5772 [41:47<9:24:03, 6.30s/it] 7%|▋ | 398/5772 [41:54<9:18:42, 6.24s/it] 7%|▋ | 398/5772 [42:01<9:18:42, 6.24s/it] {'loss': 0.4971, 'learning_rate': 1.9921090733626102e-05, 'epoch': 0.07} + 7%|▋ | 398/5772 [42:01<9:18:42, 6.24s/it] {'loss': 0.4971, 'learning_rate': 1.9921090733626102e-05, 'epoch': 0.07} + 7%|▋ | 398/5772 [41:54<9:18:42, 6.24s/it] 7%|▋ | 399/5772 [42:00<9:18:06, 6.23s/it] 7%|▋ | 399/5772 [42:07<9:18:05, 6.23s/it] {'loss': 0.5029, 'learning_rate': 1.9920385552399434e-05, 'epoch': 0.07} + 7%|▋ | 399/5772 [42:07<9:18:05, 6.23s/it] {'loss': 0.5029, 'learning_rate': 1.9920385552399434e-05, 'epoch': 0.07} + 7%|▋ | 399/5772 [42:00<9:18:06, 6.23s/it]13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +81215 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + +11 AutoResumeHook: Checking whether to suspend... +05 7%|▋ | 400/5772 [42:13<9:24:03, 6.30s/it]9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +46 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 7%|▋ | 400/5772 [42:06<9:24:03, 6.30s/it]7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5008, 'learning_rate': 1.9919677246802492e-05, 'epoch': 0.07} + 7%|▋ | 400/5772 [42:13<9:24:03, 6.30s/it] {'loss': 0.5008, 'learning_rate': 1.9919677246802492e-05, 'epoch': 0.07} + 7%|▋ | 400/5772 [42:06<9:24:03, 6.30s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 7%|▋ | 401/5772 [42:26<15:30:31, 10.39s/it] 7%|▋ | 401/5772 [42:33<15:30:31, 10.39s/it] {'loss': 0.4976, 'learning_rate': 1.9918965817058357e-05, 'epoch': 0.07} + 7%|▋ | 401/5772 [42:33<15:30:31, 10.39s/it] {'loss': 0.4976, 'learning_rate': 1.9918965817058357e-05, 'epoch': 0.07} + 7%|▋ | 401/5772 [42:26<15:30:31, 10.39s/it] 7%|▋ | 402/5772 [42:39<13:33:12, 9.09s/it] 7%|▋ | 402/5772 [42:32<13:33:12, 9.09s/it] {'loss': 0.5129, 'learning_rate': 1.9918251263391086e-05, 'epoch': 0.07} + 7%|▋ | 402/5772 [42:39<13:33:12, 9.09s/it] {'loss': 0.5129, 'learning_rate': 1.9918251263391086e-05, 'epoch': 0.07} + 7%|▋ | 402/5772 [42:32<13:33:12, 9.09s/it] 7%|▋ | 403/5772 [42:45<12:10:38, 8.17s/it] 7%|▋ | 403/5772 [42:38<12:10:39, 8.17s/it] {'loss': 0.5045, 'learning_rate': 1.9917533586025725e-05, 'epoch': 0.07} + 7%|▋ | 403/5772 [42:45<12:10:38, 8.17s/it] {'loss': 0.5045, 'learning_rate': 1.9917533586025725e-05, 'epoch': 0.07} + 7%|▋ | 403/5772 [42:38<12:10:39, 8.17s/it] 7%|▋ | 404/5772 [42:52<11:24:20, 7.65s/it] 7%|▋ | 404/5772 [42:45<11:24:20, 7.65s/it] {'loss': 0.5052, 'learning_rate': 1.9916812785188305e-05, 'epoch': 0.07} + 7%|▋ | 404/5772 [42:52<11:24:20, 7.65s/it] {'loss': 0.5052, 'learning_rate': 1.9916812785188305e-05, 'epoch': 0.07} + 7%|▋ | 404/5772 [42:45<11:24:20, 7.65s/it] 7%|▋ | 405/5772 [42:58<10:43:11, 7.19s/it] 7%|▋ | 405/5772 [42:51<10:43:11, 7.19s/it] {'loss': 0.5116, 'learning_rate': 1.9916088861105835e-05, 'epoch': 0.07} + 7%|▋ | 405/5772 [42:58<10:43:11, 7.19s/it] {'loss': 0.5116, 'learning_rate': 1.9916088861105835e-05, 'epoch': 0.07} + 7%|▋ | 405/5772 [42:51<10:43:11, 7.19s/it] 7%|▋ | 406/5772 [43:04<10:18:33, 6.92s/it] 7%|▋ | 406/5772 [42:57<10:18:33, 6.92s/it] {'loss': 0.5115, 'learning_rate': 1.9915361814006312e-05, 'epoch': 0.07} + 7%|▋ | 406/5772 [43:04<10:18:33, 6.92s/it] {'loss': 0.5115, 'learning_rate': 1.9915361814006312e-05, 'epoch': 0.07} + 7%|▋ | 406/5772 [42:57<10:18:33, 6.92s/it] 7%|▋ | 407/5772 [43:10<10:03:52, 6.75s/it] 7%|▋ | 407/5772 [43:03<10:03:52, 6.75s/it] {'loss': 0.5029, 'learning_rate': 1.9914631644118712e-05, 'epoch': 0.07} + 7%|▋ | 407/5772 [43:10<10:03:52, 6.75s/it] {'loss': 0.5029, 'learning_rate': 1.9914631644118712e-05, 'epoch': 0.07} + 7%|▋ | 407/5772 [43:03<10:03:52, 6.75s/it] 7%|▋ | 408/5772 [43:16<9:39:49, 6.49s/it] 7%|▋ | 408/5772 [43:09<9:39:49, 6.49s/it] {'loss': 0.504, 'learning_rate': 1.9913898351673006e-05, 'epoch': 0.07} + 7%|▋ | 408/5772 [43:16<9:39:49, 6.49s/it] {'loss': 0.504, 'learning_rate': 1.9913898351673006e-05, 'epoch': 0.07} + 7%|▋ | 408/5772 [43:09<9:39:49, 6.49s/it] 7%|▋ | 409/5772 [43:22<9:23:06, 6.30s/it] 7%|▋ | 409/5772 [43:15<9:23:06, 6.30s/it] {'loss': 0.5115, 'learning_rate': 1.9913161936900135e-05, 'epoch': 0.07} + 7%|▋ | 409/5772 [43:22<9:23:06, 6.30s/it] {'loss': 0.5115, 'learning_rate': 1.9913161936900135e-05, 'epoch': 0.07} + 7%|▋ | 409/5772 [43:15<9:23:06, 6.30s/it] 7%|▋ | 410/5772 [43:29<9:24:26, 6.32s/it] 7%|▋ | 410/5772 [43:22<9:24:26, 6.32s/it] {'loss': 0.4922, 'learning_rate': 1.9912422400032027e-05, 'epoch': 0.07} + 7%|▋ | 410/5772 [43:29<9:24:26, 6.32s/it] {'loss': 0.4922, 'learning_rate': 1.9912422400032027e-05, 'epoch': 0.07} + 7%|▋ | 410/5772 [43:22<9:24:26, 6.32s/it] 7%|▋ | 411/5772 [43:34<9:13:41, 6.20s/it] 7%|▋ | 411/5772 [43:27<9:13:41, 6.20s/it] {'loss': 0.5161, 'learning_rate': 1.99116797413016e-05, 'epoch': 0.07} + 7%|▋ | 411/5772 [43:34<9:13:41, 6.20s/it] {'loss': 0.5161, 'learning_rate': 1.99116797413016e-05, 'epoch': 0.07} + 7%|▋ | 411/5772 [43:27<9:13:41, 6.20s/it] 7%|▋ | 412/5772 [43:40<9:02:16, 6.07s/it] 7%|▋ | 412/5772 [43:33<9:02:16, 6.07s/it] {'loss': 0.5055, 'learning_rate': 1.9910933960942747e-05, 'epoch': 0.07} + 7%|▋ | 412/5772 [43:40<9:02:16, 6.07s/it] {'loss': 0.5055, 'learning_rate': 1.9910933960942747e-05, 'epoch': 0.07} + 7%|▋ | 412/5772 [43:33<9:02:16, 6.07s/it] 7%|▋ | 413/5772 [43:47<9:14:38, 6.21s/it] 7%|▋ | 413/5772 [43:40<9:14:38, 6.21s/it] {'loss': 0.5047, 'learning_rate': 1.9910185059190344e-05, 'epoch': 0.07} + 7%|▋ | 413/5772 [43:47<9:14:38, 6.21s/it] {'loss': 0.5047, 'learning_rate': 1.9910185059190344e-05, 'epoch': 0.07} + 7%|▋ | 413/5772 [43:40<9:14:38, 6.21s/it] 7%|▋ | 414/5772 [43:46<9:23:25, 6.31s/it] 7%|▋ | 414/5772 [43:53<9:23:27, 6.31s/it] {'loss': 0.5034, 'learning_rate': 1.990943303628026e-05, 'epoch': 0.07} + 7%|▋ | 414/5772 [43:53<9:23:27, 6.31s/it] {'loss': 0.5034, 'learning_rate': 1.990943303628026e-05, 'epoch': 0.07} + 7%|▋ | 414/5772 [43:46<9:23:25, 6.31s/it] 7%|▋ | 415/5772 [43:59<9:19:06, 6.26s/it] 7%|▋ | 415/5772 [43:52<9:19:06, 6.26s/it] {'loss': 0.5254, 'learning_rate': 1.990867789244934e-05, 'epoch': 0.07} + 7%|▋ | 415/5772 [43:59<9:19:06, 6.26s/it] {'loss': 0.5254, 'learning_rate': 1.990867789244934e-05, 'epoch': 0.07} + 7%|▋ | 415/5772 [43:52<9:19:06, 6.26s/it] 7%|▋ | 416/5772 [44:05<9:11:37, 6.18s/it] 7%|▋ | 416/5772 [43:58<9:11:37, 6.18s/it] {'loss': 0.4984, 'learning_rate': 1.990791962793541e-05, 'epoch': 0.07} + 7%|▋ | 416/5772 [44:05<9:11:37, 6.18s/it] {'loss': 0.4984, 'learning_rate': 1.990791962793541e-05, 'epoch': 0.07} + 7%|▋ | 416/5772 [43:58<9:11:37, 6.18s/it] 7%|▋ | 417/5772 [44:12<9:15:27, 6.22s/it] 7%|▋ | 417/5772 [44:05<9:15:27, 6.22s/it] {'loss': 0.5208, 'learning_rate': 1.990715824297728e-05, 'epoch': 0.07} + 7%|▋ | 417/5772 [44:12<9:15:27, 6.22s/it] {'loss': 0.5208, 'learning_rate': 1.990715824297728e-05, 'epoch': 0.07} + 7%|▋ | 417/5772 [44:05<9:15:27, 6.22s/it] 7%|▋ | 418/5772 [44:18<9:17:50, 6.25s/it] 7%|▋ | 418/5772 [44:11<9:17:50, 6.25s/it] {'loss': 0.5087, 'learning_rate': 1.9906393737814748e-05, 'epoch': 0.07} + 7%|▋ | 418/5772 [44:18<9:17:50, 6.25s/it] {'loss': 0.5087, 'learning_rate': 1.9906393737814748e-05, 'epoch': 0.07} + 7%|▋ | 418/5772 [44:11<9:17:50, 6.25s/it] 7%|▋ | 419/5772 [44:24<9:13:44, 6.21s/it] 7%|▋ | 419/5772 [44:17<9:13:44, 6.21s/it] {'loss': 0.4982, 'learning_rate': 1.990562611268858e-05, 'epoch': 0.07} + 7%|▋ | 419/5772 [44:24<9:13:44, 6.21s/it] {'loss': 0.4982, 'learning_rate': 1.990562611268858e-05, 'epoch': 0.07} + 7%|▋ | 419/5772 [44:17<9:13:44, 6.21s/it] 7%|▋ | 420/5772 [44:30<9:12:11, 6.19s/it] 7%|▋ | 420/5772 [44:23<9:12:11, 6.19s/it] {'loss': 0.4952, 'learning_rate': 1.990485536784055e-05, 'epoch': 0.07} + 7%|▋ | 420/5772 [44:30<9:12:11, 6.19s/it] {'loss': 0.4952, 'learning_rate': 1.990485536784055e-05, 'epoch': 0.07} + 7%|▋ | 420/5772 [44:23<9:12:11, 6.19s/it] 7%|▋ | 421/5772 [44:36<9:01:49, 6.08s/it] 7%|▋ | 421/5772 [44:29<9:01:49, 6.08s/it] {'loss': 0.5156, 'learning_rate': 1.9904081503513395e-05, 'epoch': 0.07} + 7%|▋ | 421/5772 [44:36<9:01:49, 6.08s/it] {'loss': 0.5156, 'learning_rate': 1.9904081503513395e-05, 'epoch': 0.07} + 7%|▋ | 421/5772 [44:29<9:01:49, 6.08s/it] 7%|▋ | 422/5772 [44:42<9:07:45, 6.14s/it] 7%|▋ | 422/5772 [44:35<9:07:45, 6.14s/it] {'loss': 0.5065, 'learning_rate': 1.9903304519950833e-05, 'epoch': 0.07} + 7%|▋ | 422/5772 [44:42<9:07:45, 6.14s/it] {'loss': 0.5065, 'learning_rate': 1.9903304519950833e-05, 'epoch': 0.07} + 7%|▋ | 422/5772 [44:35<9:07:45, 6.14s/it] 7%|▋ | 423/5772 [44:49<9:13:06, 6.20s/it] 7%|▋ | 423/5772 [44:42<9:13:07, 6.20s/it] {'loss': 0.5126, 'learning_rate': 1.990252441739758e-05, 'epoch': 0.07} + 7%|▋ | 423/5772 [44:49<9:13:06, 6.20s/it] {'loss': 0.5126, 'learning_rate': 1.990252441739758e-05, 'epoch': 0.07} + 7%|▋ | 423/5772 [44:42<9:13:07, 6.20s/it] 7%|▋ | 424/5772 [44:48<9:19:03, 6.27s/it] 7%|▋ | 424/5772 [44:55<9:19:03, 6.27s/it] {'loss': 0.4999, 'learning_rate': 1.9901741196099313e-05, 'epoch': 0.07} + 7%|▋ | 424/5772 [44:55<9:19:03, 6.27s/it] {'loss': 0.4999, 'learning_rate': 1.9901741196099313e-05, 'epoch': 0.07} + 7%|▋ | 424/5772 [44:48<9:19:03, 6.27s/it] 7%|▋ | 425/5772 [45:02<9:20:13, 6.29s/it] 7%|▋ | 425/5772 [44:55<9:20:13, 6.29s/it] {'loss': 0.5116, 'learning_rate': 1.9900954856302715e-05, 'epoch': 0.07} + 7%|▋ | 425/5772 [45:02<9:20:13, 6.29s/it] {'loss': 0.5116, 'learning_rate': 1.9900954856302715e-05, 'epoch': 0.07} + 7%|▋ | 425/5772 [44:55<9:20:13, 6.29s/it] 7%|▋ | 426/5772 [45:01<9:15:34, 6.24s/it] 7%|▋ | 426/5772 [45:08<9:15:34, 6.24s/it] {'loss': 0.4924, 'learning_rate': 1.9900165398255434e-05, 'epoch': 0.07} + 7%|▋ | 426/5772 [45:08<9:15:34, 6.24s/it] {'loss': 0.4924, 'learning_rate': 1.9900165398255434e-05, 'epoch': 0.07} + 7%|▋ | 426/5772 [45:01<9:15:34, 6.24s/it] 7%|▋ | 427/5772 [45:07<9:19:20, 6.28s/it] 7%|▋ | 427/5772 [45:14<9:19:20, 6.28s/it] {'loss': 0.5066, 'learning_rate': 1.9899372822206105e-05, 'epoch': 0.07} + 7%|▋ | 427/5772 [45:14<9:19:20, 6.28s/it] {'loss': 0.5066, 'learning_rate': 1.9899372822206105e-05, 'epoch': 0.07} + 7%|▋ | 427/5772 [45:07<9:19:20, 6.28s/it] 7%|▋ | 428/5772 [45:14<9:33:11, 6.44s/it] 7%|▋ | 428/5772 [45:21<9:33:12, 6.44s/it] {'loss': 0.5091, 'learning_rate': 1.9898577128404343e-05, 'epoch': 0.07} + 7%|▋ | 428/5772 [45:21<9:33:12, 6.44s/it] {'loss': 0.5091, 'learning_rate': 1.9898577128404343e-05, 'epoch': 0.07} + 7%|▋ | 428/5772 [45:14<9:33:11, 6.44s/it] 7%|▋ | 429/5772 [45:20<9:32:09, 6.43s/it] 7%|▋ | 429/5772 [45:27<9:32:09, 6.43s/it] {'loss': 0.4911, 'learning_rate': 1.9897778317100754e-05, 'epoch': 0.07} + 7%|▋ | 429/5772 [45:27<9:32:09, 6.43s/it] {'loss': 0.4911, 'learning_rate': 1.9897778317100754e-05, 'epoch': 0.07} + 7%|▋ | 429/5772 [45:20<9:32:09, 6.43s/it] 7%|▋ | 430/5772 [45:26<9:25:18, 6.35s/it] 7%|▋ | 430/5772 [45:33<9:25:19, 6.35s/it] {'loss': 0.4944, 'learning_rate': 1.9896976388546915e-05, 'epoch': 0.07} + 7%|▋ | 430/5772 [45:33<9:25:19, 6.35s/it] {'loss': 0.4944, 'learning_rate': 1.9896976388546915e-05, 'epoch': 0.07} + 7%|▋ | 430/5772 [45:26<9:25:18, 6.35s/it] 7%|▋ | 431/5772 [45:32<9:17:51, 6.27s/it] 7%|▋ | 431/5772 [45:40<9:17:51, 6.27s/it] {'loss': 0.5091, 'learning_rate': 1.9896171342995392e-05, 'epoch': 0.07} + 7%|▋ | 431/5772 [45:40<9:17:51, 6.27s/it] {'loss': 0.5091, 'learning_rate': 1.9896171342995392e-05, 'epoch': 0.07} + 7%|▋ | 431/5772 [45:32<9:17:51, 6.27s/it] 7%|▋ | 432/5772 [45:38<9:10:06, 6.18s/it] 7%|▋ | 432/5772 [45:46<9:10:06, 6.18s/it] {'loss': 0.5162, 'learning_rate': 1.989536318069973e-05, 'epoch': 0.07} + 7%|▋ | 432/5772 [45:46<9:10:06, 6.18s/it] {'loss': 0.5162, 'learning_rate': 1.989536318069973e-05, 'epoch': 0.07} + 7%|▋ | 432/5772 [45:38<9:10:06, 6.18s/it] 8%|▊ | 433/5772 [45:44<9:03:30, 6.11s/it] 8%|▊ | 433/5772 [45:51<9:03:30, 6.11s/it] {'loss': 0.5103, 'learning_rate': 1.9894551901914445e-05, 'epoch': 0.08} + 8%|▊ | 433/5772 [45:51<9:03:30, 6.11s/it] {'loss': 0.5103, 'learning_rate': 1.9894551901914445e-05, 'epoch': 0.08} + 8%|▊ | 433/5772 [45:44<9:03:30, 6.11s/it] 8%|▊ | 434/5772 [45:50<8:51:08, 5.97s/it] 8%|▊ | 434/5772 [45:57<8:51:08, 5.97s/it] {'loss': 0.4951, 'learning_rate': 1.989373750689506e-05, 'epoch': 0.08} + 8%|▊ | 434/5772 [45:57<8:51:08, 5.97s/it] {'loss': 0.4951, 'learning_rate': 1.989373750689506e-05, 'epoch': 0.08} + 8%|▊ | 434/5772 [45:50<8:51:08, 5.97s/it] 8%|▊ | 435/5772 [45:56<8:50:02, 5.96s/it] 8%|▊ | 435/5772 [46:03<8:50:02, 5.96s/it] {'loss': 0.5102, 'learning_rate': 1.9892919995898052e-05, 'epoch': 0.08} + 8%|▊ | 435/5772 [46:03<8:50:02, 5.96s/it] {'loss': 0.5102, 'learning_rate': 1.9892919995898052e-05, 'epoch': 0.08} + 8%|▊ | 435/5772 [45:56<8:50:02, 5.96s/it] 8%|▊ | 436/5772 [46:02<8:58:02, 6.05s/it] 8%|▊ | 436/5772 [46:09<8:58:02, 6.05s/it] {'loss': 0.51, 'learning_rate': 1.98920993691809e-05, 'epoch': 0.08} + {'loss': 0.51, 'learning_rate': 1.98920993691809e-05, 'epoch': 0.08} + 8%|▊ | 436/5772 [46:09<8:58:02, 6.05s/it] 8%|▊ | 436/5772 [46:02<8:58:02, 6.05s/it] 8%|▊ | 437/5772 [46:08<8:57:04, 6.04s/it] 8%|▊ | 437/5772 [46:15<8:57:04, 6.04s/it] {'loss': 0.5001, 'learning_rate': 1.9891275627002043e-05, 'epoch': 0.08} + 8%|▊ | 437/5772 [46:15<8:57:04, 6.04s/it] {'loss': 0.5001, 'learning_rate': 1.9891275627002043e-05, 'epoch': 0.08} + 8%|▊ | 437/5772 [46:08<8:57:04, 6.04s/it] 8%|▊ | 438/5772 [46:14<8:58:06, 6.05s/it] 8%|▊ | 438/5772 [46:21<8:58:06, 6.05s/it] {'loss': 0.4947, 'learning_rate': 1.9890448769620932e-05, 'epoch': 0.08} + 8%|▊ | 438/5772 [46:21<8:58:06, 6.05s/it] {'loss': 0.4947, 'learning_rate': 1.9890448769620932e-05, 'epoch': 0.08} + 8%|▊ | 438/5772 [46:14<8:58:06, 6.05s/it] 8%|▊ | 439/5772 [46:20<8:56:30, 6.04s/it] 8%|▊ | 439/5772 [46:27<8:56:31, 6.04s/it] {'loss': 0.5059, 'learning_rate': 1.988961879729797e-05, 'epoch': 0.08} + 8%|▊ | 439/5772 [46:27<8:56:31, 6.04s/it] {'loss': 0.5059, 'learning_rate': 1.988961879729797e-05, 'epoch': 0.08} + 8%|▊ | 439/5772 [46:20<8:56:30, 6.04s/it] 8%|▊ | 440/5772 [46:27<9:00:46, 6.09s/it] 8%|▊ | 440/5772 [46:34<9:00:53, 6.09s/it] {'loss': 0.5001, 'learning_rate': 1.9888785710294552e-05, 'epoch': 0.08} + 8%|▊ | 440/5772 [46:34<9:00:53, 6.09s/it] {'loss': 0.5001, 'learning_rate': 1.9888785710294552e-05, 'epoch': 0.08} + 8%|▊ | 440/5772 [46:27<9:00:46, 6.09s/it] 8%|▊ | 441/5772 [46:33<9:01:45, 6.10s/it] 8%|▊ | 441/5772 [46:40<9:01:42, 6.10s/it] {'loss': 0.5009, 'learning_rate': 1.9887949508873058e-05, 'epoch': 0.08} + 8%|▊ | 441/5772 [46:40<9:01:42, 6.10s/it] {'loss': 0.5009, 'learning_rate': 1.9887949508873058e-05, 'epoch': 0.08} + 8%|▊ | 441/5772 [46:33<9:01:45, 6.10s/it] 8%|▊ | 442/5772 [46:39<9:01:46, 6.10s/it] 8%|▊ | 442/5772 [46:46<9:01:45, 6.10s/it] {'loss': 0.4887, 'learning_rate': 1.988711019329684e-05, 'epoch': 0.08} + 8%|▊ | 442/5772 [46:39<9:01:46, 6.10s/it]{'loss': 0.4887, 'learning_rate': 1.988711019329684e-05, 'epoch': 0.08} + 8%|▊ | 442/5772 [46:46<9:01:45, 6.10s/it] 8%|▊ | 443/5772 [46:45<8:54:27, 6.02s/it] 8%|▊ | 443/5772 [46:52<8:54:27, 6.02s/it] {'loss': 0.5005, 'learning_rate': 1.9886267763830245e-05, 'epoch': 0.08} + {'loss': 0.5005, 'learning_rate': 1.9886267763830245e-05, 'epoch': 0.08} 8%|▊ | 443/5772 [46:52<8:54:27, 6.02s/it] + 8%|▊ | 443/5772 [46:45<8:54:27, 6.02s/it] 8%|▊ | 444/5772 [46:58<8:54:06, 6.01s/it] 8%|▊ | 444/5772 [46:51<8:54:07, 6.01s/it] {'loss': 0.4987, 'learning_rate': 1.9885422220738583e-05, 'epoch': 0.08} + 8%|▊ | 444/5772 [46:58<8:54:06, 6.01s/it] {'loss': 0.4987, 'learning_rate': 1.9885422220738583e-05, 'epoch': 0.08} + 8%|▊ | 444/5772 [46:51<8:54:07, 6.01s/it] 8%|▊ | 445/5772 [46:56<8:50:21, 5.97s/it] 8%|▊ | 445/5772 [47:04<8:50:21, 5.97s/it] {'loss': 0.4964, 'learning_rate': 1.9884573564288154e-05, 'epoch': 0.08} + 8%|▊ | 445/5772 [47:04<8:50:21, 5.97s/it] {'loss': 0.4964, 'learning_rate': 1.9884573564288154e-05, 'epoch': 0.08} + 8%|▊ | 445/5772 [46:56<8:50:21, 5.97s/it] 8%|▊ | 446/5772 [47:03<8:58:58, 6.07s/it] 8%|▊ | 446/5772 [47:10<8:58:58, 6.07s/it] {'loss': 0.4959, 'learning_rate': 1.9883721794746242e-05, 'epoch': 0.08} + 8%|▊ | 446/5772 [47:10<8:58:58, 6.07s/it] {'loss': 0.4959, 'learning_rate': 1.9883721794746242e-05, 'epoch': 0.08} + 8%|▊ | 446/5772 [47:03<8:58:58, 6.07s/it] 8%|▊ | 447/5772 [47:16<9:07:20, 6.17s/it] 8%|▊ | 447/5772 [47:09<9:07:21, 6.17s/it] {'loss': 0.5038, 'learning_rate': 1.9882866912381105e-05, 'epoch': 0.08} + 8%|▊ | 447/5772 [47:16<9:07:20, 6.17s/it] {'loss': 0.5038, 'learning_rate': 1.9882866912381105e-05, 'epoch': 0.08} + 8%|▊ | 447/5772 [47:09<9:07:21, 6.17s/it] 8%|▊ | 448/5772 [47:15<9:00:52, 6.10s/it] 8%|▊ | 448/5772 [47:22<9:00:53, 6.10s/it] {'loss': 0.5196, 'learning_rate': 1.988200891746198e-05, 'epoch': 0.08} + 8%|▊ | 448/5772 [47:22<9:00:53, 6.10s/it] {'loss': 0.5196, 'learning_rate': 1.988200891746198e-05, 'epoch': 0.08} + 8%|▊ | 448/5772 [47:15<9:00:52, 6.10s/it] 8%|▊ | 449/5772 [47:21<9:07:07, 6.17s/it] 8%|▊ | 449/5772 [47:28<9:07:06, 6.17s/it] {'loss': 0.5063, 'learning_rate': 1.9881147810259094e-05, 'epoch': 0.08} + 8%|▊ | 449/5772 [47:28<9:07:06, 6.17s/it] {'loss': 0.5063, 'learning_rate': 1.9881147810259094e-05, 'epoch': 0.08} + 8%|▊ | 449/5772 [47:21<9:07:07, 6.17s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1510 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +11 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...276 + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + +3 AutoResumeHook: Checking whether to suspend... + 8%|▊ | 450/5772 [47:28<9:13:16, 6.24s/it]9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 8%|▊ | 450/5772 [47:35<9:13:16, 6.24s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.51, 'learning_rate': 1.988028359104364e-05, 'epoch': 0.08} + 8%|▊ | 450/5772 [47:35<9:13:16, 6.24s/it] {'loss': 0.51, 'learning_rate': 1.988028359104364e-05, 'epoch': 0.08} + 8%|▊ | 450/5772 [47:28<9:13:16, 6.24s/it] 8%|▊ | 451/5772 [47:34<9:09:28, 6.20s/it] 8%|▊ | 451/5772 [47:41<9:09:28, 6.20s/it] {'loss': 0.5078, 'learning_rate': 1.9879416260087808e-05, 'epoch': 0.08} + 8%|▊ | 451/5772 [47:41<9:09:28, 6.20s/it] {'loss': 0.5078, 'learning_rate': 1.9879416260087808e-05, 'epoch': 0.08} + 8%|▊ | 451/5772 [47:34<9:09:28, 6.20s/it] 8%|▊ | 452/5772 [47:40<9:04:19, 6.14s/it] 8%|▊ | 452/5772 [47:47<9:04:19, 6.14s/it] {'loss': 0.498, 'learning_rate': 1.9878545817664752e-05, 'epoch': 0.08} + 8%|▊ | 452/5772 [47:47<9:04:19, 6.14s/it] {'loss': 0.498, 'learning_rate': 1.9878545817664752e-05, 'epoch': 0.08} + 8%|▊ | 452/5772 [47:40<9:04:19, 6.14s/it] 8%|▊ | 453/5772 [47:46<8:56:47, 6.06s/it] 8%|▊ | 453/5772 [47:53<8:56:47, 6.06s/it] {'loss': 0.5075, 'learning_rate': 1.9877672264048618e-05, 'epoch': 0.08} + 8%|▊ | 453/5772 [47:53<8:56:47, 6.06s/it] {'loss': 0.5075, 'learning_rate': 1.9877672264048618e-05, 'epoch': 0.08} + 8%|▊ | 453/5772 [47:46<8:56:47, 6.06s/it] 8%|▊ | 454/5772 [47:53<9:16:48, 6.28s/it] 8%|▊ | 454/5772 [48:00<9:16:48, 6.28s/it] {'loss': 0.5062, 'learning_rate': 1.9876795599514523e-05, 'epoch': 0.08} + 8%|▊ | 454/5772 [48:00<9:16:48, 6.28s/it] {'loss': 0.5062, 'learning_rate': 1.9876795599514523e-05, 'epoch': 0.08} + 8%|▊ | 454/5772 [47:53<9:16:48, 6.28s/it] 8%|▊ | 455/5772 [48:06<9:17:46, 6.29s/it] 8%|▊ | 455/5772 [47:59<9:17:47, 6.29s/it] {'loss': 0.5133, 'learning_rate': 1.987591582433857e-05, 'epoch': 0.08} + 8%|▊ | 455/5772 [48:06<9:17:46, 6.29s/it] {'loss': 0.5133, 'learning_rate': 1.987591582433857e-05, 'epoch': 0.08} + 8%|▊ | 455/5772 [47:59<9:17:47, 6.29s/it] 8%|▊ | 456/5772 [48:05<9:18:20, 6.30s/it] 8%|▊ | 456/5772 [48:12<9:18:19, 6.30s/it] {'loss': 0.4992, 'learning_rate': 1.9875032938797837e-05, 'epoch': 0.08} + {'loss': 0.4992, 'learning_rate': 1.9875032938797837e-05, 'epoch': 0.08} 8%|▊ | 456/5772 [48:12<9:18:19, 6.30s/it] + 8%|▊ | 456/5772 [48:05<9:18:20, 6.30s/it] 8%|▊ | 457/5772 [48:18<9:07:52, 6.18s/it] 8%|▊ | 457/5772 [48:11<9:07:52, 6.18s/it] {'loss': 0.4929, 'learning_rate': 1.9874146943170386e-05, 'epoch': 0.08} + 8%|▊ | 457/5772 [48:18<9:07:52, 6.18s/it] {'loss': 0.4929, 'learning_rate': 1.9874146943170386e-05, 'epoch': 0.08} + 8%|▊ | 457/5772 [48:11<9:07:52, 6.18s/it] 8%|▊ | 458/5772 [48:25<9:11:56, 6.23s/it] 8%|▊ | 458/5772 [48:18<9:11:56, 6.23s/it] {'loss': 0.5224, 'learning_rate': 1.9873257837735257e-05, 'epoch': 0.08} + 8%|▊ | 458/5772 [48:25<9:11:56, 6.23s/it] {'loss': 0.5224, 'learning_rate': 1.9873257837735257e-05, 'epoch': 0.08} + 8%|▊ | 458/5772 [48:18<9:11:56, 6.23s/it] 8%|▊ | 459/5772 [48:24<9:13:02, 6.25s/it] 8%|▊ | 459/5772 [48:31<9:13:02, 6.25s/it] {'loss': 0.5048, 'learning_rate': 1.9872365622772464e-05, 'epoch': 0.08} + 8%|▊ | 459/5772 [48:31<9:13:02, 6.25s/it] {'loss': 0.5048, 'learning_rate': 1.9872365622772464e-05, 'epoch': 0.08} + 8%|▊ | 459/5772 [48:24<9:13:02, 6.25s/it] 8%|▊ | 460/5772 [48:37<9:03:36, 6.14s/it] 8%|▊ | 460/5772 [48:30<9:03:36, 6.14s/it] {'loss': 0.4979, 'learning_rate': 1.987147029856301e-05, 'epoch': 0.08} + 8%|▊ | 460/5772 [48:37<9:03:36, 6.14s/it] {'loss': 0.4979, 'learning_rate': 1.987147029856301e-05, 'epoch': 0.08} + 8%|▊ | 460/5772 [48:30<9:03:36, 6.14s/it] 8%|▊ | 461/5772 [48:43<9:05:26, 6.16s/it] 8%|▊ | 461/5772 [48:36<9:05:26, 6.16s/it] {'loss': 0.5262, 'learning_rate': 1.9870571865388873e-05, 'epoch': 0.08} + {'loss': 0.5262, 'learning_rate': 1.9870571865388873e-05, 'epoch': 0.08} 8%|▊ | 461/5772 [48:43<9:05:26, 6.16s/it] + 8%|▊ | 461/5772 [48:36<9:05:26, 6.16s/it] 8%|▊ | 462/5772 [48:42<9:03:49, 6.14s/it] 8%|▊ | 462/5772 [48:49<9:03:49, 6.14s/it] {'loss': 0.4916, 'learning_rate': 1.9869670323533005e-05, 'epoch': 0.08} + 8%|▊ | 462/5772 [48:49<9:03:49, 6.14s/it] {'loss': 0.4916, 'learning_rate': 1.9869670323533005e-05, 'epoch': 0.08} + 8%|▊ | 462/5772 [48:42<9:03:49, 6.14s/it] 8%|▊ | 463/5772 [48:55<8:55:12, 6.05s/it] 8%|▊ | 463/5772 [48:48<8:55:12, 6.05s/it] {'loss': 0.4974, 'learning_rate': 1.9868765673279347e-05, 'epoch': 0.08} + 8%|▊ | 463/5772 [48:55<8:55:12, 6.05s/it] {'loss': 0.4974, 'learning_rate': 1.9868765673279347e-05, 'epoch': 0.08} + 8%|▊ | 463/5772 [48:48<8:55:12, 6.05s/it] 8%|▊ | 464/5772 [48:54<8:55:15, 6.05s/it] 8%|▊ | 464/5772 [49:01<8:55:16, 6.05s/it] {'loss': 0.495, 'learning_rate': 1.9867857914912808e-05, 'epoch': 0.08} + 8%|▊ | 464/5772 [49:01<8:55:16, 6.05s/it] {'loss': 0.495, 'learning_rate': 1.9867857914912808e-05, 'epoch': 0.08} + 8%|▊ | 464/5772 [48:54<8:55:15, 6.05s/it] 8%|▊ | 465/5772 [49:00<9:03:53, 6.15s/it] 8%|▊ | 465/5772 [49:07<9:03:54, 6.15s/it] {'loss': 0.5099, 'learning_rate': 1.9866947048719285e-05, 'epoch': 0.08} + 8%|▊ | 465/5772 [49:07<9:03:54, 6.15s/it] {'loss': 0.5099, 'learning_rate': 1.9866947048719285e-05, 'epoch': 0.08} + 8%|▊ | 465/5772 [49:00<9:03:53, 6.15s/it] 8%|▊ | 466/5772 [49:07<9:09:35, 6.21s/it] 8%|▊ | 466/5772 [49:14<9:09:35, 6.21s/it] {'loss': 0.4879, 'learning_rate': 1.986603307498565e-05, 'epoch': 0.08} + 8%|▊ | 466/5772 [49:14<9:09:35, 6.21s/it] {'loss': 0.4879, 'learning_rate': 1.986603307498565e-05, 'epoch': 0.08} + 8%|▊ | 466/5772 [49:07<9:09:35, 6.21s/it] 8%|▊ | 467/5772 [49:13<9:07:08, 6.19s/it] 8%|▊ | 467/5772 [49:20<9:07:09, 6.19s/it] {'loss': 0.5051, 'learning_rate': 1.9865115993999755e-05, 'epoch': 0.08} + 8%|▊ | 467/5772 [49:20<9:07:09, 6.19s/it] {'loss': 0.5051, 'learning_rate': 1.9865115993999755e-05, 'epoch': 0.08} + 8%|▊ | 467/5772 [49:13<9:07:08, 6.19s/it] 8%|▊ | 468/5772 [49:19<8:56:50, 6.07s/it] 8%|▊ | 468/5772 [49:26<8:56:50, 6.07s/it] {'loss': 0.4841, 'learning_rate': 1.9864195806050425e-05, 'epoch': 0.08} + 8%|▊ | 468/5772 [49:26<8:56:50, 6.07s/it] {'loss': 0.4841, 'learning_rate': 1.9864195806050425e-05, 'epoch': 0.08} + 8%|▊ | 468/5772 [49:19<8:56:50, 6.07s/it] 8%|▊ | 469/5772 [49:25<9:04:39, 6.16s/it] 8%|▊ | 469/5772 [49:32<9:04:39, 6.16s/it] {'loss': 0.5163, 'learning_rate': 1.9863272511427475e-05, 'epoch': 0.08} + 8%|▊ | 469/5772 [49:32<9:04:39, 6.16s/it] {'loss': 0.5163, 'learning_rate': 1.9863272511427475e-05, 'epoch': 0.08} + 8%|▊ | 469/5772 [49:25<9:04:39, 6.16s/it] 8%|▊ | 470/5772 [49:32<9:20:16, 6.34s/it] 8%|▊ | 470/5772 [49:39<9:20:16, 6.34s/it] {'loss': 0.4991, 'learning_rate': 1.9862346110421682e-05, 'epoch': 0.08} + 8%|▊ | 470/5772 [49:39<9:20:16, 6.34s/it] {'loss': 0.4991, 'learning_rate': 1.9862346110421682e-05, 'epoch': 0.08} + 8%|▊ | 470/5772 [49:32<9:20:16, 6.34s/it] 8%|▊ | 471/5772 [49:45<9:17:45, 6.31s/it] 8%|▊ | 471/5772 [49:38<9:17:46, 6.31s/it] {'loss': 0.508, 'learning_rate': 1.986141660332482e-05, 'epoch': 0.08} + 8%|▊ | 471/5772 [49:45<9:17:45, 6.31s/it] {'loss': 0.508, 'learning_rate': 1.986141660332482e-05, 'epoch': 0.08} + 8%|▊ | 471/5772 [49:38<9:17:46, 6.31s/it] 8%|▊ | 472/5772 [49:44<9:08:43, 6.21s/it] 8%|▊ | 472/5772 [49:51<9:08:43, 6.21s/it] {'loss': 0.501, 'learning_rate': 1.986048399042963e-05, 'epoch': 0.08} + 8%|▊ | 472/5772 [49:51<9:08:43, 6.21s/it] {'loss': 0.501, 'learning_rate': 1.986048399042963e-05, 'epoch': 0.08} + 8%|▊ | 472/5772 [49:44<9:08:43, 6.21s/it] 8%|▊ | 473/5772 [49:50<9:09:21, 6.22s/it] 8%|▊ | 473/5772 [49:57<9:09:21, 6.22s/it] {'loss': 0.4963, 'learning_rate': 1.9859548272029828e-05, 'epoch': 0.08} + 8%|▊ | 473/5772 [49:57<9:09:21, 6.22s/it] {'loss': 0.4963, 'learning_rate': 1.9859548272029828e-05, 'epoch': 0.08} + 8%|▊ | 473/5772 [49:50<9:09:21, 6.22s/it] 8%|▊ | 474/5772 [49:56<9:02:13, 6.14s/it] 8%|▊ | 474/5772 [50:03<9:02:14, 6.14s/it] {'loss': 0.4996, 'learning_rate': 1.9858609448420118e-05, 'epoch': 0.08} + 8%|▊ | 474/5772 [50:03<9:02:14, 6.14s/it] {'loss': 0.4996, 'learning_rate': 1.9858609448420118e-05, 'epoch': 0.08} + 8%|▊ | 474/5772 [49:56<9:02:13, 6.14s/it] 8%|▊ | 475/5772 [50:03<9:10:05, 6.23s/it] 8%|▊ | 475/5772 [50:10<9:10:06, 6.23s/it] {'loss': 0.4953, 'learning_rate': 1.9857667519896176e-05, 'epoch': 0.08} + 8%|▊ | 475/5772 [50:10<9:10:06, 6.23s/it] {'loss': 0.4953, 'learning_rate': 1.9857667519896176e-05, 'epoch': 0.08} + 8%|▊ | 475/5772 [50:03<9:10:05, 6.23s/it] 8%|▊ | 476/5772 [50:09<9:10:53, 6.24s/it] 8%|▊ | 476/5772 [50:16<9:10:53, 6.24s/it] {'loss': 0.4918, 'learning_rate': 1.985672248675466e-05, 'epoch': 0.08} + 8%|▊ | 476/5772 [50:16<9:10:53, 6.24s/it] {'loss': 0.4918, 'learning_rate': 1.985672248675466e-05, 'epoch': 0.08} + 8%|▊ | 476/5772 [50:09<9:10:53, 6.24s/it] 8%|▊ | 477/5772 [50:15<9:03:34, 6.16s/it] 8%|▊ | 477/5772 [50:22<9:03:34, 6.16s/it] {'loss': 0.5062, 'learning_rate': 1.98557743492932e-05, 'epoch': 0.08} + 8%|▊ | 477/5772 [50:22<9:03:34, 6.16s/it] {'loss': 0.5062, 'learning_rate': 1.98557743492932e-05, 'epoch': 0.08} + 8%|▊ | 477/5772 [50:15<9:03:34, 6.16s/it] 8%|▊ | 478/5772 [50:21<9:05:51, 6.19s/it] 8%|▊ | 478/5772 [50:28<9:05:51, 6.19s/it] {'loss': 0.4982, 'learning_rate': 1.9854823107810402e-05, 'epoch': 0.08} + 8%|▊ | 478/5772 [50:28<9:05:51, 6.19s/it] {'loss': 0.4982, 'learning_rate': 1.9854823107810402e-05, 'epoch': 0.08} + 8%|▊ | 478/5772 [50:21<9:05:51, 6.19s/it] 8%|▊ | 479/5772 [50:27<9:00:38, 6.13s/it] 8%|▊ | 479/5772 [50:34<9:00:37, 6.13s/it] {'loss': 0.4986, 'learning_rate': 1.9853868762605865e-05, 'epoch': 0.08} + 8%|▊ | 479/5772 [50:34<9:00:37, 6.13s/it] {'loss': 0.4986, 'learning_rate': 1.9853868762605865e-05, 'epoch': 0.08} + 8%|▊ | 479/5772 [50:27<9:00:38, 6.13s/it] 8%|▊ | 480/5772 [50:33<8:54:30, 6.06s/it] 8%|▊ | 480/5772 [50:40<8:54:38, 6.06s/it] {'loss': 0.4995, 'learning_rate': 1.9852911313980146e-05, 'epoch': 0.08} + 8%|▊ | 480/5772 [50:40<8:54:38, 6.06s/it] {'loss': 0.4995, 'learning_rate': 1.9852911313980146e-05, 'epoch': 0.08} + 8%|▊ | 480/5772 [50:33<8:54:30, 6.06s/it] 8%|▊ | 481/5772 [50:39<9:02:51, 6.16s/it] 8%|▊ | 481/5772 [50:46<9:02:49, 6.16s/it] {'loss': 0.4999, 'learning_rate': 1.9851950762234794e-05, 'epoch': 0.08} + 8%|▊ | 481/5772 [50:46<9:02:49, 6.16s/it] {'loss': 0.4999, 'learning_rate': 1.9851950762234794e-05, 'epoch': 0.08} + 8%|▊ | 481/5772 [50:39<9:02:51, 6.16s/it] 8%|▊ | 482/5772 [50:46<9:07:13, 6.21s/it] 8%|▊ | 482/5772 [50:53<9:07:12, 6.21s/it] {'loss': 0.4976, 'learning_rate': 1.9850987107672322e-05, 'epoch': 0.08} + 8%|▊ | 482/5772 [50:53<9:07:12, 6.21s/it] {'loss': 0.4976, 'learning_rate': 1.9850987107672322e-05, 'epoch': 0.08} + 8%|▊ | 482/5772 [50:46<9:07:13, 6.21s/it] 8%|▊ | 483/5772 [50:52<9:11:08, 6.25s/it] 8%|▊ | 483/5772 [50:59<9:11:07, 6.25s/it] {'loss': 0.5057, 'learning_rate': 1.9850020350596237e-05, 'epoch': 0.08} + 8%|▊ | 483/5772 [50:59<9:11:07, 6.25s/it] {'loss': 0.5057, 'learning_rate': 1.9850020350596237e-05, 'epoch': 0.08} + 8%|▊ | 483/5772 [50:52<9:11:08, 6.25s/it] 8%|▊ | 484/5772 [50:58<9:11:31, 6.26s/it] 8%|▊ | 484/5772 [51:05<9:11:30, 6.26s/it] {'loss': 0.5108, 'learning_rate': 1.9849050491311005e-05, 'epoch': 0.08} + 8%|▊ | 484/5772 [51:05<9:11:30, 6.26s/it] {'loss': 0.5108, 'learning_rate': 1.9849050491311005e-05, 'epoch': 0.08} + 8%|▊ | 484/5772 [50:58<9:11:31, 6.26s/it] 8%|▊ | 485/5772 [51:05<9:16:53, 6.32s/it] 8%|▊ | 485/5772 [51:12<9:16:52, 6.32s/it] {'loss': 0.5035, 'learning_rate': 1.9848077530122083e-05, 'epoch': 0.08} + 8%|▊ | 485/5772 [51:12<9:16:52, 6.32s/it] {'loss': 0.5035, 'learning_rate': 1.9848077530122083e-05, 'epoch': 0.08} + 8%|▊ | 485/5772 [51:05<9:16:53, 6.32s/it] 8%|▊ | 486/5772 [51:11<9:05:21, 6.19s/it] 8%|▊ | 486/5772 [51:18<9:05:21, 6.19s/it] {'loss': 0.4953, 'learning_rate': 1.9847101467335895e-05, 'epoch': 0.08} + 8%|▊ | 486/5772 [51:18<9:05:21, 6.19s/it] {'loss': 0.4953, 'learning_rate': 1.9847101467335895e-05, 'epoch': 0.08} + 8%|▊ | 486/5772 [51:11<9:05:21, 6.19s/it] 8%|▊ | 487/5772 [51:17<9:05:30, 6.19s/it] 8%|▊ | 487/5772 [51:24<9:05:30, 6.19s/it] {'loss': 0.5039, 'learning_rate': 1.9846122303259855e-05, 'epoch': 0.08} + 8%|▊ | 487/5772 [51:24<9:05:30, 6.19s/it] {'loss': 0.5039, 'learning_rate': 1.9846122303259855e-05, 'epoch': 0.08} + 8%|▊ | 487/5772 [51:17<9:05:30, 6.19s/it] 8%|▊ | 488/5772 [51:23<9:09:53, 6.24s/it] 8%|▊ | 488/5772 [51:30<9:09:53, 6.24s/it] {'loss': 0.4933, 'learning_rate': 1.9845140038202338e-05, 'epoch': 0.08}{'loss': 0.4933, 'learning_rate': 1.9845140038202338e-05, 'epoch': 0.08} + 8%|▊ | 488/5772 [51:30<9:09:53, 6.24s/it] + 8%|▊ | 488/5772 [51:23<9:09:53, 6.24s/it] 8%|▊ | 489/5772 [51:30<9:18:31, 6.34s/it] 8%|▊ | 489/5772 [51:37<9:18:30, 6.34s/it] {'loss': 0.4985, 'learning_rate': 1.9844154672472707e-05, 'epoch': 0.08} + 8%|▊ | 489/5772 [51:37<9:18:30, 6.34s/it] {'loss': 0.4985, 'learning_rate': 1.9844154672472707e-05, 'epoch': 0.08} + 8%|▊ | 489/5772 [51:30<9:18:31, 6.34s/it] 8%|▊ | 490/5772 [51:43<9:18:18, 6.34s/it] 8%|▊ | 490/5772 [51:36<9:18:19, 6.34s/it] {'loss': 0.5092, 'learning_rate': 1.9843166206381296e-05, 'epoch': 0.08} + 8%|▊ | 490/5772 [51:43<9:18:18, 6.34s/it] {'loss': 0.5092, 'learning_rate': 1.9843166206381296e-05, 'epoch': 0.08} + 8%|▊ | 490/5772 [51:36<9:18:19, 6.34s/it] 9%|▊ | 491/5772 [51:42<9:16:44, 6.33s/it] 9%|▊ | 491/5772 [51:49<9:16:44, 6.33s/it] {'loss': 0.5042, 'learning_rate': 1.9842174640239415e-05, 'epoch': 0.09} + 9%|▊ | 491/5772 [51:49<9:16:44, 6.33s/it] {'loss': 0.5042, 'learning_rate': 1.9842174640239415e-05, 'epoch': 0.09} + 9%|▊ | 491/5772 [51:42<9:16:44, 6.33s/it] 9%|▊ | 492/5772 [51:55<9:05:14, 6.20s/it] 9%|▊ | 492/5772 [51:48<9:05:14, 6.20s/it] {'loss': 0.5104, 'learning_rate': 1.984117997435935e-05, 'epoch': 0.09} + 9%|▊ | 492/5772 [51:55<9:05:14, 6.20s/it] {'loss': 0.5104, 'learning_rate': 1.984117997435935e-05, 'epoch': 0.09} + 9%|▊ | 492/5772 [51:48<9:05:14, 6.20s/it] 9%|▊ | 493/5772 [51:55<9:07:04, 6.22s/it] 9%|▊ | 493/5772 [52:02<9:07:04, 6.22s/it] {'loss': 0.4955, 'learning_rate': 1.9840182209054377e-05, 'epoch': 0.09} + 9%|▊ | 493/5772 [52:02<9:07:04, 6.22s/it] {'loss': 0.4955, 'learning_rate': 1.9840182209054377e-05, 'epoch': 0.09} + 9%|▊ | 493/5772 [51:55<9:07:04, 6.22s/it] 9%|▊ | 494/5772 [52:07<8:55:48, 6.09s/it] 9%|▊ | 494/5772 [52:00<8:55:48, 6.09s/it] {'loss': 0.508, 'learning_rate': 1.9839181344638722e-05, 'epoch': 0.09} + 9%|▊ | 494/5772 [52:07<8:55:48, 6.09s/it] {'loss': 0.508, 'learning_rate': 1.9839181344638722e-05, 'epoch': 0.09} + 9%|▊ | 494/5772 [52:00<8:55:48, 6.09s/it] 9%|▊ | 495/5772 [52:14<9:04:37, 6.19s/it] 9%|▊ | 495/5772 [52:07<9:04:37, 6.19s/it] {'loss': 0.5043, 'learning_rate': 1.9838177381427613e-05, 'epoch': 0.09} + 9%|▊ | 495/5772 [52:14<9:04:37, 6.19s/it] {'loss': 0.5043, 'learning_rate': 1.9838177381427613e-05, 'epoch': 0.09} + 9%|▊ | 495/5772 [52:07<9:04:37, 6.19s/it] 9%|▊ | 496/5772 [52:13<9:04:35, 6.19s/it] 9%|▊ | 496/5772 [52:20<9:04:37, 6.19s/it] {'loss': 0.5014, 'learning_rate': 1.9837170319737236e-05, 'epoch': 0.09} + 9%|▊ | 496/5772 [52:20<9:04:37, 6.19s/it] {'loss': 0.5014, 'learning_rate': 1.9837170319737236e-05, 'epoch': 0.09} + 9%|▊ | 496/5772 [52:13<9:04:35, 6.19s/it] 9%|▊ | 497/5772 [52:19<9:01:20, 6.16s/it] 9%|▊ | 497/5772 [52:26<9:01:20, 6.16s/it] {'loss': 0.4948, 'learning_rate': 1.9836160159884762e-05, 'epoch': 0.09} + 9%|▊ | 497/5772 [52:26<9:01:20, 6.16s/it] {'loss': 0.4948, 'learning_rate': 1.9836160159884762e-05, 'epoch': 0.09} + 9%|▊ | 497/5772 [52:19<9:01:20, 6.16s/it] 9%|▊ | 498/5772 [52:25<9:06:16, 6.21s/it] 9%|▊ | 498/5772 [52:32<9:06:16, 6.21s/it] {'loss': 0.5076, 'learning_rate': 1.9835146902188336e-05, 'epoch': 0.09} + 9%|▊ | 498/5772 [52:32<9:06:16, 6.21s/it] {'loss': 0.5076, 'learning_rate': 1.9835146902188336e-05, 'epoch': 0.09} + 9%|▊ | 498/5772 [52:25<9:06:16, 6.21s/it] 9%|▊ | 499/5772 [52:32<9:07:41, 6.23s/it] 9%|▊ | 499/5772 [52:39<9:07:40, 6.23s/it] {'loss': 0.5062, 'learning_rate': 1.9834130546967073e-05, 'epoch': 0.09} + 9%|▊ | 499/5772 [52:39<9:07:40, 6.23s/it] {'loss': 0.5062, 'learning_rate': 1.9834130546967073e-05, 'epoch': 0.09} + 9%|▊ | 499/5772 [52:32<9:07:41, 6.23s/it]5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 9%|▊ | 500/5772 [52:44<8:55:11, 6.09s/it]7AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + +11 9%|▊ | 500/5772 [52:37<8:55:12, 6.09s/it]6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4908, 'learning_rate': 1.983311109454108e-05, 'epoch': 0.09} + 9%|▊ | 500/5772 [52:44<8:55:11, 6.09s/it] {'loss': 0.4908, 'learning_rate': 1.983311109454108e-05, 'epoch': 0.09} + 9%|▊ | 500/5772 [52:37<8:55:12, 6.09s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 9%|▊ | 501/5772 [53:04<14:47:58, 10.11s/it] 9%|▊ | 501/5772 [52:57<14:47:58, 10.11s/it] {'loss': 0.499, 'learning_rate': 1.983208854523141e-05, 'epoch': 0.09} + 9%|▊ | 501/5772 [53:04<14:47:58, 10.11s/it] {'loss': 0.499, 'learning_rate': 1.983208854523141e-05, 'epoch': 0.09} + 9%|▊ | 501/5772 [52:57<14:47:58, 10.11s/it] 9%|▊ | 502/5772 [53:10<13:07:57, 8.97s/it] 9%|▊ | 502/5772 [53:03<13:07:57, 8.97s/it] {'loss': 0.4919, 'learning_rate': 1.983106289936013e-05, 'epoch': 0.09} + 9%|▊ | 502/5772 [53:10<13:07:57, 8.97s/it] {'loss': 0.4919, 'learning_rate': 1.983106289936013e-05, 'epoch': 0.09} + 9%|▊ | 502/5772 [53:03<13:07:57, 8.97s/it] 9%|▊ | 503/5772 [53:09<11:52:28, 8.11s/it] 9%|▊ | 503/5772 [53:16<11:52:29, 8.11s/it] {'loss': 0.5054, 'learning_rate': 1.9830034157250245e-05, 'epoch': 0.09} + 9%|▊ | 503/5772 [53:16<11:52:29, 8.11s/it] {'loss': 0.5054, 'learning_rate': 1.9830034157250245e-05, 'epoch': 0.09} + 9%|▊ | 503/5772 [53:09<11:52:28, 8.11s/it] 9%|▊ | 504/5772 [53:15<10:55:31, 7.47s/it] 9%|▊ | 504/5772 [53:22<10:55:31, 7.47s/it] {'loss': 0.4974, 'learning_rate': 1.9829002319225754e-05, 'epoch': 0.09} + 9%|▊ | 504/5772 [53:22<10:55:31, 7.47s/it] {'loss': 0.4974, 'learning_rate': 1.9829002319225754e-05, 'epoch': 0.09} + 9%|▊ | 504/5772 [53:15<10:55:31, 7.47s/it] 9%|▊ | 505/5772 [53:28<10:17:45, 7.04s/it] 9%|▊ | 505/5772 [53:21<10:17:46, 7.04s/it] {'loss': 0.4992, 'learning_rate': 1.9827967385611638e-05, 'epoch': 0.09} + 9%|▊ | 505/5772 [53:28<10:17:45, 7.04s/it] {'loss': 0.4992, 'learning_rate': 1.9827967385611638e-05, 'epoch': 0.09} + 9%|▊ | 505/5772 [53:21<10:17:46, 7.04s/it] 9%|▉ | 506/5772 [53:27<9:51:31, 6.74s/it] 9%|▉ | 506/5772 [53:34<9:51:31, 6.74s/it] {'loss': 0.5083, 'learning_rate': 1.9826929356733836e-05, 'epoch': 0.09} + 9%|▉ | 506/5772 [53:34<9:51:31, 6.74s/it] {'loss': 0.5083, 'learning_rate': 1.9826929356733836e-05, 'epoch': 0.09} + 9%|▉ | 506/5772 [53:27<9:51:31, 6.74s/it] 9%|▉ | 507/5772 [53:33<9:29:37, 6.49s/it] 9%|▉ | 507/5772 [53:40<9:29:38, 6.49s/it] {'loss': 0.5134, 'learning_rate': 1.9825888232919268e-05, 'epoch': 0.09} + 9%|▉ | 507/5772 [53:40<9:29:38, 6.49s/it] {'loss': 0.5134, 'learning_rate': 1.9825888232919268e-05, 'epoch': 0.09} + 9%|▉ | 507/5772 [53:33<9:29:37, 6.49s/it] 9%|▉ | 508/5772 [53:40<9:30:48, 6.51s/it] 9%|▉ | 508/5772 [53:47<9:30:48, 6.51s/it] {'loss': 0.5082, 'learning_rate': 1.9824844014495835e-05, 'epoch': 0.09} + 9%|▉ | 508/5772 [53:47<9:30:48, 6.51s/it] {'loss': 0.5082, 'learning_rate': 1.9824844014495835e-05, 'epoch': 0.09} + 9%|▉ | 508/5772 [53:40<9:30:48, 6.51s/it] 9%|▉ | 509/5772 [53:53<9:16:26, 6.34s/it] 9%|▉ | 509/5772 [53:46<9:16:27, 6.34s/it] {'loss': 0.4943, 'learning_rate': 1.9823796701792405e-05, 'epoch': 0.09} + 9%|▉ | 509/5772 [53:53<9:16:26, 6.34s/it] {'loss': 0.4943, 'learning_rate': 1.9823796701792405e-05, 'epoch': 0.09} + 9%|▉ | 509/5772 [53:46<9:16:27, 6.34s/it] 9%|▉ | 510/5772 [53:52<9:21:13, 6.40s/it] 9%|▉ | 510/5772 [53:59<9:21:14, 6.40s/it] {'loss': 0.5056, 'learning_rate': 1.9822746295138827e-05, 'epoch': 0.09} + 9%|▉ | 510/5772 [53:59<9:21:14, 6.40s/it] {'loss': 0.5056, 'learning_rate': 1.9822746295138827e-05, 'epoch': 0.09} + 9%|▉ | 510/5772 [53:52<9:21:13, 6.40s/it] 9%|▉ | 511/5772 [54:05<9:11:39, 6.29s/it] 9%|▉ | 511/5772 [53:58<9:11:39, 6.29s/it] {'loss': 0.495, 'learning_rate': 1.9821692794865918e-05, 'epoch': 0.09} + 9%|▉ | 511/5772 [54:05<9:11:39, 6.29s/it] {'loss': 0.495, 'learning_rate': 1.9821692794865918e-05, 'epoch': 0.09} + 9%|▉ | 511/5772 [53:58<9:11:39, 6.29s/it] 9%|▉ | 512/5772 [54:05<9:08:47, 6.26s/it] 9%|▉ | 512/5772 [54:12<9:08:47, 6.26s/it] {'loss': 0.4864, 'learning_rate': 1.982063620130547e-05, 'epoch': 0.09} + 9%|▉ | 512/5772 [54:12<9:08:47, 6.26s/it] {'loss': 0.4864, 'learning_rate': 1.982063620130547e-05, 'epoch': 0.09} + 9%|▉ | 512/5772 [54:05<9:08:47, 6.26s/it] 9%|▉ | 513/5772 [54:11<9:10:27, 6.28s/it] 9%|▉ | 513/5772 [54:18<9:10:28, 6.28s/it] {'loss': 0.4939, 'learning_rate': 1.9819576514790254e-05, 'epoch': 0.09} + 9%|▉ | 513/5772 [54:18<9:10:28, 6.28s/it] {'loss': 0.4939, 'learning_rate': 1.9819576514790254e-05, 'epoch': 0.09} + 9%|▉ | 513/5772 [54:11<9:10:27, 6.28s/it] 9%|▉ | 514/5772 [54:24<8:59:11, 6.15s/it] 9%|▉ | 514/5772 [54:17<8:59:11, 6.15s/it] {'loss': 0.5137, 'learning_rate': 1.9818513735654012e-05, 'epoch': 0.09} + 9%|▉ | 514/5772 [54:24<8:59:11, 6.15s/it] {'loss': 0.5137, 'learning_rate': 1.9818513735654012e-05, 'epoch': 0.09} + 9%|▉ | 514/5772 [54:17<8:59:11, 6.15s/it] 9%|▉ | 515/5772 [54:23<8:55:52, 6.12s/it] 9%|▉ | 515/5772 [54:30<8:55:52, 6.12s/it] {'loss': 0.5134, 'learning_rate': 1.981744786423146e-05, 'epoch': 0.09} + 9%|▉ | 515/5772 [54:30<8:55:52, 6.12s/it] {'loss': 0.5134, 'learning_rate': 1.981744786423146e-05, 'epoch': 0.09} + 9%|▉ | 515/5772 [54:23<8:55:52, 6.12s/it] 9%|▉ | 516/5772 [54:36<8:51:53, 6.07s/it] 9%|▉ | 516/5772 [54:29<8:51:54, 6.07s/it] {'loss': 0.4942, 'learning_rate': 1.9816378900858288e-05, 'epoch': 0.09} + 9%|▉ | 516/5772 [54:36<8:51:53, 6.07s/it] {'loss': 0.4942, 'learning_rate': 1.9816378900858288e-05, 'epoch': 0.09} + 9%|▉ | 516/5772 [54:29<8:51:54, 6.07s/it] 9%|▉ | 517/5772 [54:42<8:54:17, 6.10s/it] 9%|▉ | 517/5772 [54:35<8:54:17, 6.10s/it] {'loss': 0.4944, 'learning_rate': 1.9815306845871163e-05, 'epoch': 0.09} + 9%|▉ | 517/5772 [54:42<8:54:17, 6.10s/it] {'loss': 0.4944, 'learning_rate': 1.9815306845871163e-05, 'epoch': 0.09} + 9%|▉ | 517/5772 [54:35<8:54:17, 6.10s/it] 9%|▉ | 518/5772 [54:41<8:54:24, 6.10s/it] 9%|▉ | 518/5772 [54:48<8:54:24, 6.10s/it] {'loss': 0.5076, 'learning_rate': 1.981423169960772e-05, 'epoch': 0.09} + 9%|▉ | 518/5772 [54:48<8:54:24, 6.10s/it] {'loss': 0.5076, 'learning_rate': 1.981423169960772e-05, 'epoch': 0.09} + 9%|▉ | 518/5772 [54:41<8:54:24, 6.10s/it] 9%|▉ | 519/5772 [54:54<8:49:11, 6.04s/it] 9%|▉ | 519/5772 [54:47<8:49:11, 6.04s/it] {'loss': 0.4909, 'learning_rate': 1.981315346240657e-05, 'epoch': 0.09} + 9%|▉ | 519/5772 [54:54<8:49:11, 6.04s/it] {'loss': 0.4909, 'learning_rate': 1.981315346240657e-05, 'epoch': 0.09} + 9%|▉ | 519/5772 [54:47<8:49:11, 6.04s/it] 9%|▉ | 520/5772 [54:53<8:58:28, 6.15s/it] 9%|▉ | 520/5772 [55:00<8:58:30, 6.15s/it] {'loss': 0.4992, 'learning_rate': 1.981207213460729e-05, 'epoch': 0.09} + 9%|▉ | 520/5772 [55:00<8:58:30, 6.15s/it] {'loss': 0.4992, 'learning_rate': 1.981207213460729e-05, 'epoch': 0.09} + 9%|▉ | 520/5772 [54:53<8:58:28, 6.15s/it] 9%|▉ | 521/5772 [55:07<9:08:17, 6.26s/it] 9%|▉ | 521/5772 [55:00<9:08:17, 6.27s/it] {'loss': 0.4944, 'learning_rate': 1.9810987716550458e-05, 'epoch': 0.09} + 9%|▉ | 521/5772 [55:07<9:08:17, 6.26s/it] {'loss': 0.4944, 'learning_rate': 1.9810987716550458e-05, 'epoch': 0.09} + 9%|▉ | 521/5772 [55:00<9:08:17, 6.27s/it] 9%|▉ | 522/5772 [55:06<8:58:44, 6.16s/it] 9%|▉ | 522/5772 [55:13<8:58:44, 6.16s/it] {'loss': 0.4966, 'learning_rate': 1.9809900208577586e-05, 'epoch': 0.09} + 9%|▉ | 522/5772 [55:13<8:58:44, 6.16s/it] {'loss': 0.4966, 'learning_rate': 1.9809900208577586e-05, 'epoch': 0.09} + 9%|▉ | 522/5772 [55:06<8:58:44, 6.16s/it] 9%|▉ | 523/5772 [55:12<8:56:35, 6.13s/it] 9%|▉ | 523/5772 [55:19<8:56:35, 6.13s/it] {'loss': 0.4817, 'learning_rate': 1.980880961103119e-05, 'epoch': 0.09} + 9%|▉ | 523/5772 [55:19<8:56:35, 6.13s/it] {'loss': 0.4817, 'learning_rate': 1.980880961103119e-05, 'epoch': 0.09} + 9%|▉ | 523/5772 [55:12<8:56:35, 6.13s/it] 9%|▉ | 524/5772 [55:25<8:49:21, 6.05s/it] 9%|▉ | 524/5772 [55:18<8:49:22, 6.05s/it] {'loss': 0.4879, 'learning_rate': 1.9807715924254743e-05, 'epoch': 0.09} + 9%|▉ | 524/5772 [55:25<8:49:21, 6.05s/it] {'loss': 0.4879, 'learning_rate': 1.9807715924254743e-05, 'epoch': 0.09} + 9%|▉ | 524/5772 [55:18<8:49:22, 6.05s/it] 9%|▉ | 525/5772 [55:24<8:48:42, 6.05s/it] 9%|▉ | 525/5772 [55:31<8:48:43, 6.05s/it] {'loss': 0.5011, 'learning_rate': 1.98066191485927e-05, 'epoch': 0.09} + 9%|▉ | 525/5772 [55:31<8:48:43, 6.05s/it] {'loss': 0.5011, 'learning_rate': 1.98066191485927e-05, 'epoch': 0.09} + 9%|▉ | 525/5772 [55:24<8:48:42, 6.05s/it] 9%|▉ | 526/5772 [55:30<8:52:42, 6.09s/it] 9%|▉ | 526/5772 [55:37<8:52:42, 6.09s/it] {'loss': 0.5055, 'learning_rate': 1.980551928439048e-05, 'epoch': 0.09} + 9%|▉ | 526/5772 [55:37<8:52:42, 6.09s/it] {'loss': 0.5055, 'learning_rate': 1.980551928439048e-05, 'epoch': 0.09} + 9%|▉ | 526/5772 [55:30<8:52:42, 6.09s/it] 9%|▉ | 527/5772 [55:43<8:59:27, 6.17s/it] 9%|▉ | 527/5772 [55:36<8:59:27, 6.17s/it] {'loss': 0.4915, 'learning_rate': 1.980441633199448e-05, 'epoch': 0.09} + 9%|▉ | 527/5772 [55:43<8:59:27, 6.17s/it] {'loss': 0.4915, 'learning_rate': 1.980441633199448e-05, 'epoch': 0.09} + 9%|▉ | 527/5772 [55:36<8:59:27, 6.17s/it] 9%|▉ | 528/5772 [55:42<8:52:51, 6.10s/it] 9%|▉ | 528/5772 [55:49<8:52:53, 6.10s/it] {'loss': 0.5072, 'learning_rate': 1.980331029175207e-05, 'epoch': 0.09} + 9%|▉ | 528/5772 [55:49<8:52:53, 6.10s/it] {'loss': 0.5072, 'learning_rate': 1.980331029175207e-05, 'epoch': 0.09} + 9%|▉ | 528/5772 [55:42<8:52:51, 6.10s/it] 9%|▉ | 529/5772 [55:55<8:47:41, 6.04s/it] 9%|▉ | 529/5772 [55:48<8:47:41, 6.04s/it] {'loss': 0.4988, 'learning_rate': 1.9802201164011587e-05, 'epoch': 0.09} + 9%|▉ | 529/5772 [55:55<8:47:41, 6.04s/it] {'loss': 0.4988, 'learning_rate': 1.9802201164011587e-05, 'epoch': 0.09} + 9%|▉ | 529/5772 [55:48<8:47:41, 6.04s/it] 9%|▉ | 530/5772 [55:54<8:43:19, 5.99s/it] 9%|▉ | 530/5772 [56:01<8:43:19, 5.99s/it] {'loss': 0.4962, 'learning_rate': 1.980108894912235e-05, 'epoch': 0.09} + 9%|▉ | 530/5772 [56:01<8:43:19, 5.99s/it] {'loss': 0.4962, 'learning_rate': 1.980108894912235e-05, 'epoch': 0.09} + 9%|▉ | 530/5772 [55:54<8:43:19, 5.99s/it] 9%|▉ | 531/5772 [56:07<8:47:08, 6.03s/it] 9%|▉ | 531/5772 [56:00<8:47:09, 6.03s/it] {'loss': 0.4929, 'learning_rate': 1.979997364743464e-05, 'epoch': 0.09} + 9%|▉ | 531/5772 [56:07<8:47:08, 6.03s/it] {'loss': 0.4929, 'learning_rate': 1.979997364743464e-05, 'epoch': 0.09} + 9%|▉ | 531/5772 [56:00<8:47:09, 6.03s/it] 9%|▉ | 532/5772 [56:13<8:55:16, 6.13s/it] 9%|▉ | 532/5772 [56:06<8:55:16, 6.13s/it] {'loss': 0.5002, 'learning_rate': 1.979885525929972e-05, 'epoch': 0.09} + 9%|▉ | 532/5772 [56:13<8:55:16, 6.13s/it] {'loss': 0.5002, 'learning_rate': 1.979885525929972e-05, 'epoch': 0.09} + 9%|▉ | 532/5772 [56:06<8:55:16, 6.13s/it] 9%|▉ | 533/5772 [56:13<8:55:01, 6.13s/it] 9%|▉ | 533/5772 [56:20<8:55:01, 6.13s/it] {'loss': 0.4916, 'learning_rate': 1.979773378506982e-05, 'epoch': 0.09} + 9%|▉ | 533/5772 [56:20<8:55:01, 6.13s/it] {'loss': 0.4916, 'learning_rate': 1.979773378506982e-05, 'epoch': 0.09} + 9%|▉ | 533/5772 [56:13<8:55:01, 6.13s/it] 9%|▉ | 534/5772 [56:26<8:53:19, 6.11s/it] 9%|▉ | 534/5772 [56:19<8:53:19, 6.11s/it] {'loss': 0.5075, 'learning_rate': 1.9796609225098136e-05, 'epoch': 0.09} + 9%|▉ | 534/5772 [56:26<8:53:19, 6.11s/it] {'loss': 0.5075, 'learning_rate': 1.9796609225098136e-05, 'epoch': 0.09} + 9%|▉ | 534/5772 [56:19<8:53:19, 6.11s/it] 9%|▉ | 535/5772 [56:25<9:01:35, 6.21s/it] 9%|▉ | 535/5772 [56:32<9:01:37, 6.21s/it] {'loss': 0.5014, 'learning_rate': 1.9795481579738848e-05, 'epoch': 0.09} + 9%|▉ | 535/5772 [56:32<9:01:37, 6.21s/it] {'loss': 0.5014, 'learning_rate': 1.9795481579738848e-05, 'epoch': 0.09} + 9%|▉ | 535/5772 [56:25<9:01:35, 6.21s/it] 9%|▉ | 536/5772 [56:31<8:50:17, 6.08s/it] 9%|▉ | 536/5772 [56:38<8:50:24, 6.08s/it] {'loss': 0.5107, 'learning_rate': 1.97943508493471e-05, 'epoch': 0.09} + {'loss': 0.5107, 'learning_rate': 1.97943508493471e-05, 'epoch': 0.09} 9%|▉ | 536/5772 [56:38<8:50:24, 6.08s/it] + 9%|▉ | 536/5772 [56:31<8:50:17, 6.08s/it] 9%|▉ | 537/5772 [56:37<8:49:18, 6.07s/it] 9%|▉ | 537/5772 [56:44<8:49:16, 6.07s/it] {'loss': 0.5007, 'learning_rate': 1.979321703427901e-05, 'epoch': 0.09} + 9%|▉ | 537/5772 [56:44<8:49:16, 6.07s/it] {'loss': 0.5007, 'learning_rate': 1.979321703427901e-05, 'epoch': 0.09} + 9%|▉ | 537/5772 [56:37<8:49:18, 6.07s/it] 9%|▉ | 538/5772 [56:43<8:50:01, 6.08s/it] 9%|▉ | 538/5772 [56:50<8:50:01, 6.08s/it] {'loss': 0.5064, 'learning_rate': 1.9792080134891662e-05, 'epoch': 0.09} + 9%|▉ | 538/5772 [56:50<8:50:01, 6.08s/it] {'loss': 0.5064, 'learning_rate': 1.9792080134891662e-05, 'epoch': 0.09} + 9%|▉ | 538/5772 [56:43<8:50:01, 6.08s/it] 9%|▉ | 539/5772 [56:49<8:53:22, 6.12s/it] 9%|▉ | 539/5772 [56:56<8:53:21, 6.12s/it] {'loss': 0.5029, 'learning_rate': 1.9790940151543122e-05, 'epoch': 0.09} + 9%|▉ | 539/5772 [56:56<8:53:21, 6.12s/it] {'loss': 0.5029, 'learning_rate': 1.9790940151543122e-05, 'epoch': 0.09} + 9%|▉ | 539/5772 [56:49<8:53:22, 6.12s/it] 9%|▉ | 540/5772 [56:55<8:47:08, 6.05s/it] 9%|▉ | 540/5772 [57:02<8:47:07, 6.05s/it] {'loss': 0.5052, 'learning_rate': 1.9789797084592418e-05, 'epoch': 0.09} + 9%|▉ | 540/5772 [57:02<8:47:07, 6.05s/it] {'loss': 0.5052, 'learning_rate': 1.9789797084592418e-05, 'epoch': 0.09} + 9%|▉ | 540/5772 [56:55<8:47:08, 6.05s/it] 9%|▉ | 541/5772 [57:01<8:42:34, 5.99s/it] 9%|▉ | 541/5772 [57:08<8:42:33, 5.99s/it] {'loss': 0.4913, 'learning_rate': 1.9788650934399553e-05, 'epoch': 0.09} + 9%|▉ | 541/5772 [57:08<8:42:33, 5.99s/it] {'loss': 0.4913, 'learning_rate': 1.9788650934399553e-05, 'epoch': 0.09} + 9%|▉ | 541/5772 [57:01<8:42:34, 5.99s/it] 9%|▉ | 542/5772 [57:07<8:53:45, 6.12s/it] 9%|▉ | 542/5772 [57:14<8:53:45, 6.12s/it] {'loss': 0.502, 'learning_rate': 1.9787501701325505e-05, 'epoch': 0.09} + 9%|▉ | 542/5772 [57:14<8:53:45, 6.12s/it] {'loss': 0.502, 'learning_rate': 1.9787501701325505e-05, 'epoch': 0.09} + 9%|▉ | 542/5772 [57:07<8:53:45, 6.12s/it] 9%|▉ | 543/5772 [57:13<8:48:04, 6.06s/it] 9%|▉ | 543/5772 [57:20<8:48:04, 6.06s/it] {'loss': 0.4988, 'learning_rate': 1.9786349385732212e-05, 'epoch': 0.09} + 9%|▉ | 543/5772 [57:20<8:48:04, 6.06s/it] {'loss': 0.4988, 'learning_rate': 1.9786349385732212e-05, 'epoch': 0.09} + 9%|▉ | 543/5772 [57:13<8:48:04, 6.06s/it] 9%|▉ | 544/5772 [57:20<8:52:17, 6.11s/it] 9%|▉ | 544/5772 [57:27<8:52:17, 6.11s/it] {'loss': 0.5049, 'learning_rate': 1.9785193987982593e-05, 'epoch': 0.09} + 9%|▉ | 544/5772 [57:27<8:52:17, 6.11s/it] {'loss': 0.5049, 'learning_rate': 1.9785193987982593e-05, 'epoch': 0.09} + 9%|▉ | 544/5772 [57:20<8:52:17, 6.11s/it] 9%|▉ | 545/5772 [57:26<9:02:22, 6.23s/it] 9%|▉ | 545/5772 [57:33<9:02:23, 6.23s/it] {'loss': 0.4889, 'learning_rate': 1.9784035508440534e-05, 'epoch': 0.09} + 9%|▉ | 545/5772 [57:33<9:02:23, 6.23s/it] {'loss': 0.4889, 'learning_rate': 1.9784035508440534e-05, 'epoch': 0.09} + 9%|▉ | 545/5772 [57:26<9:02:22, 6.23s/it] 9%|▉ | 546/5772 [57:32<9:04:22, 6.25s/it] 9%|▉ | 546/5772 [57:39<9:04:22, 6.25s/it] {'loss': 0.499, 'learning_rate': 1.9782873947470888e-05, 'epoch': 0.09} + 9%|▉ | 546/5772 [57:39<9:04:22, 6.25s/it] {'loss': 0.499, 'learning_rate': 1.9782873947470888e-05, 'epoch': 0.09} + 9%|▉ | 546/5772 [57:32<9:04:22, 6.25s/it] 9%|▉ | 547/5772 [57:38<8:57:41, 6.17s/it] 9%|▉ | 547/5772 [57:45<8:57:41, 6.17s/it] {'loss': 0.5083, 'learning_rate': 1.9781709305439486e-05, 'epoch': 0.09} + 9%|▉ | 547/5772 [57:45<8:57:41, 6.17s/it] {'loss': 0.5083, 'learning_rate': 1.9781709305439486e-05, 'epoch': 0.09} + 9%|▉ | 547/5772 [57:38<8:57:41, 6.17s/it] 9%|▉ | 548/5772 [57:45<8:59:56, 6.20s/it] 9%|▉ | 548/5772 [57:52<8:59:55, 6.20s/it] {'loss': 0.5017, 'learning_rate': 1.9780541582713128e-05, 'epoch': 0.09} + 9%|▉ | 548/5772 [57:52<8:59:55, 6.20s/it] {'loss': 0.5017, 'learning_rate': 1.9780541582713128e-05, 'epoch': 0.09} + 9%|▉ | 548/5772 [57:45<8:59:56, 6.20s/it] 10%|▉ | 549/5772 [57:51<9:01:13, 6.22s/it] 10%|▉ | 549/5772 [57:58<9:01:13, 6.22s/it] {'loss': 0.4985, 'learning_rate': 1.9779370779659578e-05, 'epoch': 0.1} + 10%|▉ | 549/5772 [57:58<9:01:13, 6.22s/it] {'loss': 0.4985, 'learning_rate': 1.9779370779659578e-05, 'epoch': 0.1} + 10%|▉ | 549/5772 [57:51<9:01:13, 6.22s/it]2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 15AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + + 10%|▉ | 550/5772 [58:04<8:50:02, 6.09s/it]8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 10%|▉ | 550/5772 [57:57<8:50:03, 6.09s/it]37 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4972, 'learning_rate': 1.9778196896647572e-05, 'epoch': 0.1} + 10%|▉ | 550/5772 [58:04<8:50:02, 6.09s/it] {'loss': 0.4972, 'learning_rate': 1.9778196896647572e-05, 'epoch': 0.1} + 10%|▉ | 550/5772 [57:57<8:50:03, 6.09s/it] 10%|▉ | 551/5772 [58:03<8:45:11, 6.04s/it] 10%|▉ | 551/5772 [58:10<8:45:11, 6.04s/it] {'loss': 0.5008, 'learning_rate': 1.977701993404682e-05, 'epoch': 0.1} + 10%|▉ | 551/5772 [58:10<8:45:11, 6.04s/it] {'loss': 0.5008, 'learning_rate': 1.977701993404682e-05, 'epoch': 0.1} + 10%|▉ | 551/5772 [58:03<8:45:11, 6.04s/it] 10%|▉ | 552/5772 [58:08<8:39:57, 5.98s/it] 10%|▉ | 552/5772 [58:15<8:39:57, 5.98s/it] {'loss': 0.5087, 'learning_rate': 1.9775839892228004e-05, 'epoch': 0.1} + 10%|▉ | 552/5772 [58:15<8:39:57, 5.98s/it] {'loss': 0.5087, 'learning_rate': 1.9775839892228004e-05, 'epoch': 0.1} + 10%|▉ | 552/5772 [58:08<8:39:57, 5.98s/it] 10%|▉ | 553/5772 [58:15<8:47:05, 6.06s/it] 10%|▉ | 553/5772 [58:22<8:47:05, 6.06s/it] {'loss': 0.4935, 'learning_rate': 1.9774656771562764e-05, 'epoch': 0.1} + 10%|▉ | 553/5772 [58:22<8:47:05, 6.06s/it] {'loss': 0.4935, 'learning_rate': 1.9774656771562764e-05, 'epoch': 0.1} + 10%|▉ | 553/5772 [58:15<8:47:05, 6.06s/it] 10%|▉ | 554/5772 [58:21<8:47:02, 6.06s/it] 10%|▉ | 554/5772 [58:28<8:47:03, 6.06s/it] {'loss': 0.5055, 'learning_rate': 1.977347057242372e-05, 'epoch': 0.1} + 10%|▉ | 554/5772 [58:28<8:47:03, 6.06s/it] {'loss': 0.5055, 'learning_rate': 1.977347057242372e-05, 'epoch': 0.1} + 10%|▉ | 554/5772 [58:21<8:47:02, 6.06s/it] 10%|▉ | 555/5772 [58:27<8:45:45, 6.05s/it] 10%|▉ | 555/5772 [58:34<8:45:45, 6.05s/it] {'loss': 0.5013, 'learning_rate': 1.9772281295184465e-05, 'epoch': 0.1} + 10%|▉ | 555/5772 [58:34<8:45:45, 6.05s/it] {'loss': 0.5013, 'learning_rate': 1.9772281295184465e-05, 'epoch': 0.1} + 10%|▉ | 555/5772 [58:27<8:45:45, 6.05s/it] 10%|▉ | 556/5772 [58:33<8:43:07, 6.02s/it] 10%|▉ | 556/5772 [58:40<8:43:07, 6.02s/it] {'loss': 0.5, 'learning_rate': 1.9771088940219546e-05, 'epoch': 0.1} + 10%|▉ | 556/5772 [58:40<8:43:07, 6.02s/it] {'loss': 0.5, 'learning_rate': 1.9771088940219546e-05, 'epoch': 0.1} + 10%|▉ | 556/5772 [58:33<8:43:07, 6.02s/it] 10%|▉ | 557/5772 [58:39<8:48:48, 6.08s/it] 10%|▉ | 557/5772 [58:46<8:48:48, 6.08s/it] {'loss': 0.4929, 'learning_rate': 1.97698935079045e-05, 'epoch': 0.1} + 10%|▉ | 557/5772 [58:46<8:48:48, 6.08s/it] {'loss': 0.4929, 'learning_rate': 1.97698935079045e-05, 'epoch': 0.1} + 10%|▉ | 557/5772 [58:39<8:48:48, 6.08s/it] 10%|▉ | 558/5772 [58:45<8:51:10, 6.11s/it] 10%|▉ | 558/5772 [58:52<8:51:11, 6.11s/it] {'loss': 0.5258, 'learning_rate': 1.9768694998615805e-05, 'epoch': 0.1} + 10%|▉ | 558/5772 [58:52<8:51:11, 6.11s/it] {'loss': 0.5258, 'learning_rate': 1.9768694998615805e-05, 'epoch': 0.1} + 10%|▉ | 558/5772 [58:45<8:51:10, 6.11s/it] 10%|▉ | 559/5772 [58:59<8:59:16, 6.21s/it] 10%|▉ | 559/5772 [58:51<8:59:17, 6.21s/it] {'loss': 0.5014, 'learning_rate': 1.976749341273094e-05, 'epoch': 0.1} + 10%|▉ | 559/5772 [58:59<8:59:16, 6.21s/it] {'loss': 0.5014, 'learning_rate': 1.976749341273094e-05, 'epoch': 0.1} + 10%|▉ | 559/5772 [58:51<8:59:17, 6.21s/it] 10%|▉ | 560/5772 [58:58<8:58:56, 6.20s/it] 10%|▉ | 560/5772 [59:05<8:58:56, 6.20s/it] {'loss': 0.4991, 'learning_rate': 1.9766288750628327e-05, 'epoch': 0.1} + 10%|▉ | 560/5772 [59:05<8:58:56, 6.20s/it] {'loss': 0.4991, 'learning_rate': 1.9766288750628327e-05, 'epoch': 0.1} + 10%|▉ | 560/5772 [58:58<8:58:56, 6.20s/it] 10%|▉ | 561/5772 [59:11<9:00:36, 6.22s/it] 10%|▉ | 561/5772 [59:04<9:00:36, 6.22s/it] {'loss': 0.4859, 'learning_rate': 1.976508101268738e-05, 'epoch': 0.1} + 10%|▉ | 561/5772 [59:11<9:00:36, 6.22s/it] {'loss': 0.4859, 'learning_rate': 1.976508101268738e-05, 'epoch': 0.1} + 10%|▉ | 561/5772 [59:04<9:00:36, 6.22s/it] 10%|▉ | 562/5772 [59:10<9:03:13, 6.26s/it] 10%|▉ | 562/5772 [59:17<9:03:13, 6.26s/it] {'loss': 0.4919, 'learning_rate': 1.976387019928846e-05, 'epoch': 0.1} + 10%|▉ | 562/5772 [59:17<9:03:13, 6.26s/it] {'loss': 0.4919, 'learning_rate': 1.976387019928846e-05, 'epoch': 0.1} + 10%|▉ | 562/5772 [59:10<9:03:13, 6.26s/it] 10%|▉ | 563/5772 [59:16<9:01:11, 6.23s/it] 10%|▉ | 563/5772 [59:24<9:01:11, 6.23s/it] {'loss': 0.496, 'learning_rate': 1.97626563108129e-05, 'epoch': 0.1} + 10%|▉ | 563/5772 [59:24<9:01:11, 6.23s/it] {'loss': 0.496, 'learning_rate': 1.97626563108129e-05, 'epoch': 0.1} + 10%|▉ | 563/5772 [59:16<9:01:11, 6.23s/it] 10%|▉ | 564/5772 [59:23<9:04:45, 6.28s/it] 10%|▉ | 564/5772 [59:30<9:04:45, 6.28s/it] {'loss': 0.5063, 'learning_rate': 1.9761439347643027e-05, 'epoch': 0.1} + 10%|▉ | 564/5772 [59:30<9:04:45, 6.28s/it] {'loss': 0.5063, 'learning_rate': 1.9761439347643027e-05, 'epoch': 0.1} + 10%|▉ | 564/5772 [59:23<9:04:45, 6.28s/it] 10%|▉ | 565/5772 [59:29<8:59:28, 6.22s/it] 10%|▉ | 565/5772 [59:36<8:59:28, 6.22s/it] {'loss': 0.4906, 'learning_rate': 1.97602193101621e-05, 'epoch': 0.1} + 10%|▉ | 565/5772 [59:36<8:59:28, 6.22s/it] {'loss': 0.4906, 'learning_rate': 1.97602193101621e-05, 'epoch': 0.1} + 10%|▉ | 565/5772 [59:29<8:59:28, 6.22s/it] 10%|▉ | 566/5772 [59:35<8:49:26, 6.10s/it] 10%|▉ | 566/5772 [59:42<8:49:26, 6.10s/it] {'loss': 0.4971, 'learning_rate': 1.9758996198754364e-05, 'epoch': 0.1} + 10%|▉ | 566/5772 [59:42<8:49:26, 6.10s/it] {'loss': 0.4971, 'learning_rate': 1.9758996198754364e-05, 'epoch': 0.1} + 10%|▉ | 566/5772 [59:35<8:49:26, 6.10s/it] 10%|▉ | 567/5772 [59:41<9:04:32, 6.28s/it] 10%|▉ | 567/5772 [59:48<9:04:32, 6.28s/it] {'loss': 0.5023, 'learning_rate': 1.975777001380504e-05, 'epoch': 0.1} + 10%|▉ | 567/5772 [59:48<9:04:32, 6.28s/it] {'loss': 0.5023, 'learning_rate': 1.975777001380504e-05, 'epoch': 0.1} + 10%|▉ | 567/5772 [59:41<9:04:32, 6.28s/it] 10%|▉ | 568/5772 [59:47<8:56:06, 6.18s/it] 10%|▉ | 568/5772 [59:54<8:56:06, 6.18s/it] {'loss': 0.4956, 'learning_rate': 1.9756540755700308e-05, 'epoch': 0.1} + 10%|▉ | 568/5772 [59:54<8:56:06, 6.18s/it] {'loss': 0.4956, 'learning_rate': 1.9756540755700308e-05, 'epoch': 0.1} + 10%|▉ | 568/5772 [59:47<8:56:06, 6.18s/it] 10%|▉ | 569/5772 [1:00:01<9:09:29, 6.34s/it] 10%|▉ | 569/5772 [59:54<9:09:29, 6.34s/it] {'loss': 0.5023, 'learning_rate': 1.9755308424827303e-05, 'epoch': 0.1} + 10%|▉ | 569/5772 [1:00:01<9:09:29, 6.34s/it] {'loss': 0.5023, 'learning_rate': 1.9755308424827303e-05, 'epoch': 0.1} + 10%|▉ | 569/5772 [59:54<9:09:29, 6.34s/it] 10%|▉ | 570/5772 [1:00:01<9:14:01, 6.39s/it] 10%|▉ | 570/5772 [1:00:08<9:14:02, 6.39s/it] {'loss': 0.4913, 'learning_rate': 1.9754073021574153e-05, 'epoch': 0.1} + 10%|▉ | 570/5772 [1:00:08<9:14:02, 6.39s/it] {'loss': 0.4913, 'learning_rate': 1.9754073021574153e-05, 'epoch': 0.1} + 10%|▉ | 570/5772 [1:00:01<9:14:01, 6.39s/it] 10%|▉ | 571/5772 [1:00:07<9:14:03, 6.39s/it] 10%|▉ | 571/5772 [1:00:14<9:14:02, 6.39s/it] {'loss': 0.4865, 'learning_rate': 1.9752834546329944e-05, 'epoch': 0.1} + 10%|▉ | 571/5772 [1:00:14<9:14:02, 6.39s/it] {'loss': 0.4865, 'learning_rate': 1.9752834546329944e-05, 'epoch': 0.1} + 10%|▉ | 571/5772 [1:00:07<9:14:03, 6.39s/it] 10%|▉ | 572/5772 [1:00:13<9:02:43, 6.26s/it] 10%|▉ | 572/5772 [1:00:20<9:02:42, 6.26s/it] {'loss': 0.5035, 'learning_rate': 1.9751592999484713e-05, 'epoch': 0.1} + 10%|▉ | 572/5772 [1:00:20<9:02:42, 6.26s/it] {'loss': 0.5035, 'learning_rate': 1.9751592999484713e-05, 'epoch': 0.1} + 10%|▉ | 572/5772 [1:00:13<9:02:43, 6.26s/it] 10%|▉ | 573/5772 [1:00:19<8:52:35, 6.15s/it] 10%|▉ | 573/5772 [1:00:26<8:52:34, 6.15s/it] {'loss': 0.4981, 'learning_rate': 1.9750348381429484e-05, 'epoch': 0.1} + 10%|▉ | 573/5772 [1:00:26<8:52:34, 6.15s/it] {'loss': 0.4981, 'learning_rate': 1.9750348381429484e-05, 'epoch': 0.1} + 10%|▉ | 573/5772 [1:00:19<8:52:35, 6.15s/it] 10%|▉ | 574/5772 [1:00:25<8:49:44, 6.11s/it] 10%|▉ | 574/5772 [1:00:32<8:49:44, 6.11s/it] {'loss': 0.5027, 'learning_rate': 1.974910069255625e-05, 'epoch': 0.1} + 10%|▉ | 574/5772 [1:00:32<8:49:44, 6.11s/it] {'loss': 0.5027, 'learning_rate': 1.974910069255625e-05, 'epoch': 0.1} + 10%|▉ | 574/5772 [1:00:25<8:49:44, 6.11s/it] 10%|▉ | 575/5772 [1:00:38<8:50:44, 6.13s/it] 10%|▉ | 575/5772 [1:00:31<8:50:44, 6.13s/it] {'loss': 0.4961, 'learning_rate': 1.9747849933257955e-05, 'epoch': 0.1} + 10%|▉ | 575/5772 [1:00:38<8:50:44, 6.13s/it] {'loss': 0.4961, 'learning_rate': 1.9747849933257955e-05, 'epoch': 0.1} + 10%|▉ | 575/5772 [1:00:31<8:50:44, 6.13s/it] 10%|▉ | 576/5772 [1:00:37<8:48:54, 6.11s/it] 10%|▉ | 576/5772 [1:00:44<8:48:55, 6.11s/it] {'loss': 0.5046, 'learning_rate': 1.9746596103928524e-05, 'epoch': 0.1} + 10%|▉ | 576/5772 [1:00:44<8:48:55, 6.11s/it] {'loss': 0.5046, 'learning_rate': 1.9746596103928524e-05, 'epoch': 0.1} + 10%|▉ | 576/5772 [1:00:37<8:48:54, 6.11s/it] 10%|▉ | 577/5772 [1:00:50<8:46:59, 6.09s/it] 10%|▉ | 577/5772 [1:00:43<8:47:00, 6.09s/it] {'loss': 0.4902, 'learning_rate': 1.974533920496284e-05, 'epoch': 0.1} + 10%|▉ | 577/5772 [1:00:50<8:46:59, 6.09s/it] {'loss': 0.4902, 'learning_rate': 1.974533920496284e-05, 'epoch': 0.1} + 10%|▉ | 577/5772 [1:00:43<8:47:00, 6.09s/it] 10%|█ | 578/5772 [1:00:56<8:41:35, 6.03s/it] 10%|█ | 578/5772 [1:00:49<8:41:35, 6.03s/it] {'loss': 0.5076, 'learning_rate': 1.9744079236756756e-05, 'epoch': 0.1} + 10%|█ | 578/5772 [1:00:56<8:41:35, 6.03s/it] {'loss': 0.5076, 'learning_rate': 1.9744079236756756e-05, 'epoch': 0.1} + 10%|█ | 578/5772 [1:00:49<8:41:35, 6.03s/it] 10%|█ | 579/5772 [1:00:55<8:40:44, 6.02s/it] 10%|█ | 579/5772 [1:01:02<8:40:44, 6.02s/it] {'loss': 0.483, 'learning_rate': 1.9742816199707096e-05, 'epoch': 0.1} + 10%|█ | 579/5772 [1:01:02<8:40:44, 6.02s/it] {'loss': 0.483, 'learning_rate': 1.9742816199707096e-05, 'epoch': 0.1} + 10%|█ | 579/5772 [1:00:55<8:40:44, 6.02s/it] 10%|█ | 580/5772 [1:01:01<8:37:45, 5.98s/it] 10%|█ | 580/5772 [1:01:08<8:37:45, 5.98s/it] {'loss': 0.5074, 'learning_rate': 1.9741550094211647e-05, 'epoch': 0.1} + 10%|█ | 580/5772 [1:01:08<8:37:45, 5.98s/it] {'loss': 0.5074, 'learning_rate': 1.9741550094211647e-05, 'epoch': 0.1} + 10%|█ | 580/5772 [1:01:01<8:37:45, 5.98s/it] 10%|█ | 581/5772 [1:01:14<8:39:49, 6.01s/it] 10%|█ | 581/5772 [1:01:07<8:39:49, 6.01s/it] {'loss': 0.4936, 'learning_rate': 1.9740280920669153e-05, 'epoch': 0.1} + 10%|█ | 581/5772 [1:01:14<8:39:49, 6.01s/it] {'loss': 0.4936, 'learning_rate': 1.9740280920669153e-05, 'epoch': 0.1} + 10%|█ | 581/5772 [1:01:07<8:39:49, 6.01s/it] 10%|█ | 582/5772 [1:01:20<8:37:13, 5.98s/it] 10%|█ | 582/5772 [1:01:13<8:37:13, 5.98s/it] {'loss': 0.4946, 'learning_rate': 1.973900867947934e-05, 'epoch': 0.1} + 10%|█ | 582/5772 [1:01:20<8:37:13, 5.98s/it] {'loss': 0.4946, 'learning_rate': 1.973900867947934e-05, 'epoch': 0.1} + 10%|█ | 582/5772 [1:01:13<8:37:13, 5.98s/it] 10%|█ | 583/5772 [1:01:26<8:33:55, 5.94s/it] 10%|█ | 583/5772 [1:01:19<8:33:55, 5.94s/it] {'loss': 0.4804, 'learning_rate': 1.9737733371042894e-05, 'epoch': 0.1} + 10%|█ | 583/5772 [1:01:26<8:33:55, 5.94s/it] {'loss': 0.4804, 'learning_rate': 1.9737733371042894e-05, 'epoch': 0.1} + 10%|█ | 583/5772 [1:01:19<8:33:55, 5.94s/it] 10%|█ | 584/5772 [1:01:25<8:33:29, 5.94s/it] 10%|█ | 584/5772 [1:01:32<8:33:30, 5.94s/it] {'loss': 0.4976, 'learning_rate': 1.9736454995761468e-05, 'epoch': 0.1} + 10%|█ | 584/5772 [1:01:32<8:33:30, 5.94s/it] {'loss': 0.4976, 'learning_rate': 1.9736454995761468e-05, 'epoch': 0.1} + 10%|█ | 584/5772 [1:01:25<8:33:29, 5.94s/it] 10%|█ | 585/5772 [1:01:31<8:30:54, 5.91s/it] 10%|█ | 585/5772 [1:01:38<8:30:56, 5.91s/it] {'loss': 0.502, 'learning_rate': 1.973517355403767e-05, 'epoch': 0.1} + 10%|█ | 585/5772 [1:01:38<8:30:56, 5.91s/it] {'loss': 0.502, 'learning_rate': 1.973517355403767e-05, 'epoch': 0.1} + 10%|█ | 585/5772 [1:01:31<8:30:54, 5.91s/it] 10%|█ | 586/5772 [1:01:36<8:28:23, 5.88s/it] 10%|█ | 586/5772 [1:01:43<8:28:23, 5.88s/it] {'loss': 0.5016, 'learning_rate': 1.9733889046275095e-05, 'epoch': 0.1} + 10%|█ | 586/5772 [1:01:43<8:28:23, 5.88s/it] {'loss': 0.5016, 'learning_rate': 1.9733889046275095e-05, 'epoch': 0.1} + 10%|█ | 586/5772 [1:01:36<8:28:23, 5.88s/it] 10%|█ | 587/5772 [1:01:50<8:34:07, 5.95s/it] 10%|█ | 587/5772 [1:01:42<8:34:07, 5.95s/it] {'loss': 0.5068, 'learning_rate': 1.9732601472878282e-05, 'epoch': 0.1} + 10%|█ | 587/5772 [1:01:50<8:34:07, 5.95s/it] {'loss': 0.5068, 'learning_rate': 1.9732601472878282e-05, 'epoch': 0.1} + 10%|█ | 587/5772 [1:01:42<8:34:07, 5.95s/it] 10%|█ | 588/5772 [1:01:49<8:37:54, 5.99s/it] 10%|█ | 588/5772 [1:01:56<8:37:54, 5.99s/it] {'loss': 0.5192, 'learning_rate': 1.9731310834252747e-05, 'epoch': 0.1} + 10%|█ | 588/5772 [1:01:56<8:37:54, 5.99s/it] {'loss': 0.5192, 'learning_rate': 1.9731310834252747e-05, 'epoch': 0.1} + 10%|█ | 588/5772 [1:01:49<8:37:54, 5.99s/it] 10%|█ | 589/5772 [1:01:55<8:47:35, 6.11s/it] 10%|█ | 589/5772 [1:02:02<8:47:37, 6.11s/it] {'loss': 0.4961, 'learning_rate': 1.9730017130804976e-05, 'epoch': 0.1} + 10%|█ | 589/5772 [1:02:02<8:47:37, 6.11s/it] {'loss': 0.4961, 'learning_rate': 1.9730017130804976e-05, 'epoch': 0.1} + 10%|█ | 589/5772 [1:01:55<8:47:35, 6.11s/it] 10%|█ | 590/5772 [1:02:08<8:40:40, 6.03s/it] 10%|█ | 590/5772 [1:02:01<8:40:41, 6.03s/it] {'loss': 0.5111, 'learning_rate': 1.9728720362942404e-05, 'epoch': 0.1} + 10%|█ | 590/5772 [1:02:08<8:40:40, 6.03s/it] {'loss': 0.5111, 'learning_rate': 1.9728720362942404e-05, 'epoch': 0.1} + 10%|█ | 590/5772 [1:02:01<8:40:41, 6.03s/it] 10%|█ | 591/5772 [1:02:14<8:47:58, 6.11s/it] 10%|█ | 591/5772 [1:02:07<8:47:59, 6.11s/it] {'loss': 0.5004, 'learning_rate': 1.9727420531073447e-05, 'epoch': 0.1} + 10%|█ | 591/5772 [1:02:14<8:47:58, 6.11s/it] {'loss': 0.5004, 'learning_rate': 1.9727420531073447e-05, 'epoch': 0.1} + 10%|█ | 591/5772 [1:02:07<8:47:59, 6.11s/it] 10%|█ | 592/5772 [1:02:21<8:57:31, 6.23s/it] 10%|█ | 592/5772 [1:02:14<8:57:31, 6.23s/it] {'loss': 0.4893, 'learning_rate': 1.972611763560748e-05, 'epoch': 0.1} + 10%|█ | 592/5772 [1:02:21<8:57:31, 6.23s/it] {'loss': 0.4893, 'learning_rate': 1.972611763560748e-05, 'epoch': 0.1} + 10%|█ | 592/5772 [1:02:14<8:57:31, 6.23s/it] 10%|█ | 593/5772 [1:02:27<8:50:58, 6.15s/it] 10%|█ | 593/5772 [1:02:20<8:50:59, 6.15s/it] {'loss': 0.4979, 'learning_rate': 1.972481167695484e-05, 'epoch': 0.1} + 10%|█ | 593/5772 [1:02:27<8:50:58, 6.15s/it] {'loss': 0.4979, 'learning_rate': 1.972481167695484e-05, 'epoch': 0.1} + 10%|█ | 593/5772 [1:02:20<8:50:59, 6.15s/it] 10%|█ | 594/5772 [1:02:26<8:55:00, 6.20s/it] 10%|█ | 594/5772 [1:02:33<8:55:01, 6.20s/it] {'loss': 0.4872, 'learning_rate': 1.9723502655526832e-05, 'epoch': 0.1} + 10%|█ | 594/5772 [1:02:33<8:55:01, 6.20s/it] {'loss': 0.4872, 'learning_rate': 1.9723502655526832e-05, 'epoch': 0.1} + 10%|█ | 594/5772 [1:02:26<8:55:00, 6.20s/it] 10%|█ | 595/5772 [1:02:32<9:05:13, 6.32s/it] 10%|█ | 595/5772 [1:02:40<9:05:13, 6.32s/it] {'loss': 0.4964, 'learning_rate': 1.9722190571735725e-05, 'epoch': 0.1} + 10%|█ | 595/5772 [1:02:40<9:05:13, 6.32s/it] {'loss': 0.4964, 'learning_rate': 1.9722190571735725e-05, 'epoch': 0.1} + 10%|█ | 595/5772 [1:02:32<9:05:13, 6.32s/it] 10%|█ | 596/5772 [1:02:38<8:56:37, 6.22s/it] 10%|█ | 596/5772 [1:02:46<8:56:37, 6.22s/it] {'loss': 0.5013, 'learning_rate': 1.9720875425994758e-05, 'epoch': 0.1} + 10%|█ | 596/5772 [1:02:46<8:56:37, 6.22s/it] {'loss': 0.5013, 'learning_rate': 1.9720875425994758e-05, 'epoch': 0.1} + 10%|█ | 596/5772 [1:02:38<8:56:37, 6.22s/it] 10%|█ | 597/5772 [1:02:52<8:50:51, 6.15s/it] 10%|█ | 597/5772 [1:02:44<8:50:55, 6.16s/it] {'loss': 0.4933, 'learning_rate': 1.9719557218718116e-05, 'epoch': 0.1} + 10%|█ | 597/5772 [1:02:52<8:50:51, 6.15s/it] {'loss': 0.4933, 'learning_rate': 1.9719557218718116e-05, 'epoch': 0.1} + 10%|█ | 597/5772 [1:02:44<8:50:55, 6.16s/it] 10%|█ | 598/5772 [1:02:58<9:05:17, 6.32s/it] 10%|█ | 598/5772 [1:02:51<9:05:16, 6.32s/it] {'loss': 0.5033, 'learning_rate': 1.9718235950320978e-05, 'epoch': 0.1} + 10%|█ | 598/5772 [1:02:58<9:05:17, 6.32s/it] {'loss': 0.5033, 'learning_rate': 1.9718235950320978e-05, 'epoch': 0.1} + 10%|█ | 598/5772 [1:02:51<9:05:16, 6.32s/it] 10%|█ | 599/5772 [1:03:05<9:04:28, 6.32s/it] 10%|█ | 599/5772 [1:02:57<9:04:28, 6.32s/it] {'loss': 0.5092, 'learning_rate': 1.9716911621219453e-05, 'epoch': 0.1} + 10%|█ | 599/5772 [1:03:05<9:04:28, 6.32s/it] {'loss': 0.5092, 'learning_rate': 1.9716911621219453e-05, 'epoch': 0.1} + 10%|█ | 599/5772 [1:02:57<9:04:28, 6.32s/it]5 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +138 AutoResumeHook: Checking whether to suspend...10AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 10%|█ | 600/5772 [1:03:11<8:59:28, 6.26s/it]14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 10%|█ | 600/5772 [1:03:04<8:59:28, 6.26s/it]1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4879, 'learning_rate': 1.9715584231830642e-05, 'epoch': 0.1} + 10%|█ | 600/5772 [1:03:11<8:59:28, 6.26s/it] {'loss': 0.4879, 'learning_rate': 1.9715584231830642e-05, 'epoch': 0.1} + 10%|█ | 600/5772 [1:03:04<8:59:28, 6.26s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 10%|█ | 601/5772 [1:03:31<15:09:02, 10.55s/it] 10%|█ | 601/5772 [1:03:24<15:09:01, 10.55s/it] {'loss': 0.5023, 'learning_rate': 1.9714253782572598e-05, 'epoch': 0.1} + 10%|█ | 601/5772 [1:03:31<15:09:02, 10.55s/it] {'loss': 0.5023, 'learning_rate': 1.9714253782572598e-05, 'epoch': 0.1} + 10%|█ | 601/5772 [1:03:24<15:09:01, 10.55s/it] 10%|█ | 602/5772 [1:03:31<13:21:08, 9.30s/it] 10%|█ | 602/5772 [1:03:38<13:21:08, 9.30s/it] {'loss': 0.4989, 'learning_rate': 1.9712920273864333e-05, 'epoch': 0.1} + 10%|█ | 602/5772 [1:03:38<13:21:08, 9.30s/it] {'loss': 0.4989, 'learning_rate': 1.9712920273864333e-05, 'epoch': 0.1} + 10%|█ | 602/5772 [1:03:31<13:21:08, 9.30s/it] 10%|█ | 603/5772 [1:03:37<12:04:23, 8.41s/it] 10%|█ | 603/5772 [1:03:44<12:04:24, 8.41s/it] {'loss': 0.4996, 'learning_rate': 1.9711583706125835e-05, 'epoch': 0.1} + 10%|█ | 603/5772 [1:03:44<12:04:24, 8.41s/it] {'loss': 0.4996, 'learning_rate': 1.9711583706125835e-05, 'epoch': 0.1} + 10%|█ | 603/5772 [1:03:37<12:04:23, 8.41s/it] 10%|█ | 604/5772 [1:03:50<11:04:56, 7.72s/it] 10%|█ | 604/5772 [1:03:43<11:04:56, 7.72s/it] {'loss': 0.502, 'learning_rate': 1.9710244079778042e-05, 'epoch': 0.1} + 10%|█ | 604/5772 [1:03:50<11:04:56, 7.72s/it] {'loss': 0.502, 'learning_rate': 1.9710244079778042e-05, 'epoch': 0.1} + 10%|█ | 604/5772 [1:03:43<11:04:56, 7.72s/it] 10%|█ | 605/5772 [1:03:56<10:26:13, 7.27s/it] 10%|█ | 605/5772 [1:03:49<10:26:13, 7.27s/it] {'loss': 0.5031, 'learning_rate': 1.970890139524287e-05, 'epoch': 0.1} + 10%|█ | 605/5772 [1:03:56<10:26:13, 7.27s/it] {'loss': 0.5031, 'learning_rate': 1.970890139524287e-05, 'epoch': 0.1} + 10%|█ | 605/5772 [1:03:49<10:26:13, 7.27s/it] 10%|█ | 606/5772 [1:04:02<9:57:54, 6.94s/it] 10%|█ | 606/5772 [1:03:55<9:57:54, 6.94s/it] {'loss': 0.507, 'learning_rate': 1.970755565294318e-05, 'epoch': 0.1} + 10%|█ | 606/5772 [1:04:02<9:57:54, 6.94s/it] {'loss': 0.507, 'learning_rate': 1.970755565294318e-05, 'epoch': 0.1} + 10%|█ | 606/5772 [1:03:55<9:57:54, 6.94s/it] 11%|█ | 607/5772 [1:04:08<9:33:58, 6.67s/it] 11%|█ | 607/5772 [1:04:01<9:33:59, 6.67s/it] {'loss': 0.4954, 'learning_rate': 1.9706206853302815e-05, 'epoch': 0.11} + 11%|█ | 607/5772 [1:04:08<9:33:58, 6.67s/it] {'loss': 0.4954, 'learning_rate': 1.9706206853302815e-05, 'epoch': 0.11} + 11%|█ | 607/5772 [1:04:01<9:33:59, 6.67s/it] 11%|█ | 608/5772 [1:04:15<9:23:16, 6.54s/it] 11%|█ | 608/5772 [1:04:08<9:23:16, 6.54s/it] {'loss': 0.4885, 'learning_rate': 1.9704854996746565e-05, 'epoch': 0.11} + 11%|█ | 608/5772 [1:04:15<9:23:16, 6.54s/it] {'loss': 0.4885, 'learning_rate': 1.9704854996746565e-05, 'epoch': 0.11} + 11%|█ | 608/5772 [1:04:08<9:23:16, 6.54s/it] 11%|█ | 609/5772 [1:04:21<9:14:32, 6.44s/it] 11%|█ | 609/5772 [1:04:14<9:14:32, 6.44s/it] {'loss': 0.4874, 'learning_rate': 1.9703500083700196e-05, 'epoch': 0.11} + 11%|█ | 609/5772 [1:04:21<9:14:32, 6.44s/it] {'loss': 0.4874, 'learning_rate': 1.9703500083700196e-05, 'epoch': 0.11} + 11%|█ | 609/5772 [1:04:14<9:14:32, 6.44s/it] 11%|█ | 610/5772 [1:04:27<8:58:34, 6.26s/it] 11%|█ | 610/5772 [1:04:20<8:58:34, 6.26s/it] {'loss': 0.5074, 'learning_rate': 1.9702142114590426e-05, 'epoch': 0.11} + 11%|█ | 610/5772 [1:04:27<8:58:34, 6.26s/it] {'loss': 0.5074, 'learning_rate': 1.9702142114590426e-05, 'epoch': 0.11} + 11%|█ | 610/5772 [1:04:20<8:58:34, 6.26s/it] 11%|█ | 611/5772 [1:04:26<9:03:32, 6.32s/it] 11%|█ | 611/5772 [1:04:33<9:03:33, 6.32s/it] {'loss': 0.4931, 'learning_rate': 1.970078108984494e-05, 'epoch': 0.11} + 11%|█ | 611/5772 [1:04:33<9:03:33, 6.32s/it] {'loss': 0.4931, 'learning_rate': 1.970078108984494e-05, 'epoch': 0.11} + 11%|█ | 611/5772 [1:04:26<9:03:32, 6.32s/it] 11%|█ | 612/5772 [1:04:39<8:57:23, 6.25s/it] 11%|█ | 612/5772 [1:04:32<8:57:22, 6.25s/it] {'loss': 0.4933, 'learning_rate': 1.969941700989239e-05, 'epoch': 0.11} + 11%|█ | 612/5772 [1:04:39<8:57:23, 6.25s/it] {'loss': 0.4933, 'learning_rate': 1.969941700989239e-05, 'epoch': 0.11} + 11%|█ | 612/5772 [1:04:32<8:57:22, 6.25s/it] 11%|█ | 613/5772 [1:04:39<9:01:25, 6.30s/it] 11%|█ | 613/5772 [1:04:46<9:01:25, 6.30s/it] {'loss': 0.4932, 'learning_rate': 1.9698049875162377e-05, 'epoch': 0.11} + 11%|█ | 613/5772 [1:04:46<9:01:25, 6.30s/it] {'loss': 0.4932, 'learning_rate': 1.9698049875162377e-05, 'epoch': 0.11} + 11%|█ | 613/5772 [1:04:39<9:01:25, 6.30s/it] 11%|█ | 614/5772 [1:04:45<8:57:22, 6.25s/it] 11%|█ | 614/5772 [1:04:52<8:57:22, 6.25s/it] {'loss': 0.5098, 'learning_rate': 1.969667968608548e-05, 'epoch': 0.11} + 11%|█ | 614/5772 [1:04:52<8:57:22, 6.25s/it] {'loss': 0.5098, 'learning_rate': 1.969667968608548e-05, 'epoch': 0.11} + 11%|█ | 614/5772 [1:04:45<8:57:22, 6.25s/it] 11%|█ | 615/5772 [1:04:51<9:07:28, 6.37s/it] 11%|█ | 615/5772 [1:04:59<9:07:27, 6.37s/it] {'loss': 0.4902, 'learning_rate': 1.969530644309323e-05, 'epoch': 0.11} + 11%|█ | 615/5772 [1:04:59<9:07:27, 6.37s/it] {'loss': 0.4902, 'learning_rate': 1.969530644309323e-05, 'epoch': 0.11} + 11%|█ | 615/5772 [1:04:51<9:07:28, 6.37s/it] 11%|█ | 616/5772 [1:04:58<9:02:54, 6.32s/it] 11%|█ | 616/5772 [1:05:05<9:02:54, 6.32s/it] {'loss': 0.5083, 'learning_rate': 1.969393014661812e-05, 'epoch': 0.11} + 11%|█ | 616/5772 [1:05:05<9:02:54, 6.32s/it] {'loss': 0.5083, 'learning_rate': 1.969393014661812e-05, 'epoch': 0.11} + 11%|█ | 616/5772 [1:04:58<9:02:54, 6.32s/it] 11%|█ | 617/5772 [1:05:04<9:02:45, 6.32s/it] 11%|█ | 617/5772 [1:05:11<9:02:45, 6.32s/it] {'loss': 0.4884, 'learning_rate': 1.969255079709361e-05, 'epoch': 0.11} + 11%|█ | 617/5772 [1:05:11<9:02:45, 6.32s/it] {'loss': 0.4884, 'learning_rate': 1.969255079709361e-05, 'epoch': 0.11} + 11%|█ | 617/5772 [1:05:04<9:02:45, 6.32s/it] 11%|█ | 618/5772 [1:05:11<9:16:08, 6.47s/it] 11%|█ | 618/5772 [1:05:18<9:16:09, 6.47s/it] {'loss': 0.488, 'learning_rate': 1.9691168394954117e-05, 'epoch': 0.11} + 11%|█ | 618/5772 [1:05:18<9:16:09, 6.47s/it] {'loss': 0.488, 'learning_rate': 1.9691168394954117e-05, 'epoch': 0.11} + 11%|█ | 618/5772 [1:05:11<9:16:08, 6.47s/it] 11%|█ | 619/5772 [1:05:24<9:05:14, 6.35s/it] 11%|█ | 619/5772 [1:05:17<9:05:14, 6.35s/it] {'loss': 0.5094, 'learning_rate': 1.968978294063502e-05, 'epoch': 0.11} + 11%|█ | 619/5772 [1:05:24<9:05:14, 6.35s/it] {'loss': 0.5094, 'learning_rate': 1.968978294063502e-05, 'epoch': 0.11} + 11%|█ | 619/5772 [1:05:17<9:05:14, 6.35s/it] 11%|█ | 620/5772 [1:05:23<9:03:11, 6.33s/it] 11%|█ | 620/5772 [1:05:30<9:03:12, 6.33s/it] {'loss': 0.5111, 'learning_rate': 1.9688394434572666e-05, 'epoch': 0.11} + 11%|█ | 620/5772 [1:05:30<9:03:12, 6.33s/it] {'loss': 0.5111, 'learning_rate': 1.9688394434572666e-05, 'epoch': 0.11} + 11%|█ | 620/5772 [1:05:23<9:03:11, 6.33s/it] 11%|█ | 621/5772 [1:05:29<8:59:50, 6.29s/it] 11%|█ | 621/5772 [1:05:36<8:59:50, 6.29s/it] {'loss': 0.4851, 'learning_rate': 1.9687002877204347e-05, 'epoch': 0.11} + 11%|█ | 621/5772 [1:05:36<8:59:50, 6.29s/it] {'loss': 0.4851, 'learning_rate': 1.9687002877204347e-05, 'epoch': 0.11} + 11%|█ | 621/5772 [1:05:29<8:59:50, 6.29s/it] 11%|█ | 622/5772 [1:05:43<8:59:20, 6.28s/it] 11%|█ | 622/5772 [1:05:36<8:59:20, 6.28s/it] {'loss': 0.5144, 'learning_rate': 1.968560826896833e-05, 'epoch': 0.11} + 11%|█ | 622/5772 [1:05:43<8:59:20, 6.28s/it] {'loss': 0.5144, 'learning_rate': 1.968560826896833e-05, 'epoch': 0.11} + 11%|█ | 622/5772 [1:05:36<8:59:20, 6.28s/it] 11%|█ | 623/5772 [1:05:49<9:02:45, 6.32s/it] 11%|█ | 623/5772 [1:05:42<9:02:46, 6.32s/it] {'loss': 0.5036, 'learning_rate': 1.9684210610303848e-05, 'epoch': 0.11} + 11%|█ | 623/5772 [1:05:49<9:02:45, 6.32s/it] {'loss': 0.5036, 'learning_rate': 1.9684210610303848e-05, 'epoch': 0.11} + 11%|█ | 623/5772 [1:05:42<9:02:46, 6.32s/it] 11%|█ | 624/5772 [1:05:55<8:49:43, 6.17s/it] 11%|█ | 624/5772 [1:05:48<8:49:44, 6.17s/it] {'loss': 0.5031, 'learning_rate': 1.9682809901651074e-05, 'epoch': 0.11} + 11%|█ | 624/5772 [1:05:55<8:49:43, 6.17s/it] {'loss': 0.5031, 'learning_rate': 1.9682809901651074e-05, 'epoch': 0.11} + 11%|█ | 624/5772 [1:05:48<8:49:44, 6.17s/it] 11%|█ | 625/5772 [1:06:01<8:48:57, 6.17s/it] 11%|█ | 625/5772 [1:05:54<8:48:58, 6.17s/it] {'loss': 0.5042, 'learning_rate': 1.968140614345116e-05, 'epoch': 0.11} + 11%|█ | 625/5772 [1:06:01<8:48:57, 6.17s/it] {'loss': 0.5042, 'learning_rate': 1.968140614345116e-05, 'epoch': 0.11} + 11%|█ | 625/5772 [1:05:54<8:48:58, 6.17s/it] 11%|█ | 626/5772 [1:06:00<8:55:25, 6.24s/it] 11%|█ | 626/5772 [1:06:07<8:55:26, 6.24s/it] {'loss': 0.4961, 'learning_rate': 1.967999933614621e-05, 'epoch': 0.11} + 11%|█ | 626/5772 [1:06:07<8:55:26, 6.24s/it] {'loss': 0.4961, 'learning_rate': 1.967999933614621e-05, 'epoch': 0.11} + 11%|█ | 626/5772 [1:06:00<8:55:25, 6.24s/it] 11%|█ | 627/5772 [1:06:14<8:51:43, 6.20s/it] 11%|█ | 627/5772 [1:06:07<8:51:43, 6.20s/it] {'loss': 0.4822, 'learning_rate': 1.967858948017929e-05, 'epoch': 0.11} + 11%|█ | 627/5772 [1:06:14<8:51:43, 6.20s/it] {'loss': 0.4822, 'learning_rate': 1.967858948017929e-05, 'epoch': 0.11} + 11%|█ | 627/5772 [1:06:07<8:51:43, 6.20s/it] 11%|█ | 628/5772 [1:06:13<8:56:29, 6.26s/it] 11%|█ | 628/5772 [1:06:20<8:56:30, 6.26s/it] {'loss': 0.5064, 'learning_rate': 1.9677176575994425e-05, 'epoch': 0.11} + 11%|█ | 628/5772 [1:06:20<8:56:30, 6.26s/it] {'loss': 0.5064, 'learning_rate': 1.9677176575994425e-05, 'epoch': 0.11} + 11%|█ | 628/5772 [1:06:13<8:56:29, 6.26s/it] 11%|█ | 629/5772 [1:06:26<8:54:56, 6.24s/it] 11%|█ | 629/5772 [1:06:19<8:54:56, 6.24s/it] {'loss': 0.5004, 'learning_rate': 1.9675760624036605e-05, 'epoch': 0.11} + 11%|█ | 629/5772 [1:06:26<8:54:56, 6.24s/it] {'loss': 0.5004, 'learning_rate': 1.9675760624036605e-05, 'epoch': 0.11} + 11%|█ | 629/5772 [1:06:19<8:54:56, 6.24s/it] 11%|█ | 630/5772 [1:06:32<8:52:07, 6.21s/it] 11%|█ | 630/5772 [1:06:25<8:52:07, 6.21s/it] {'loss': 0.4946, 'learning_rate': 1.967434162475177e-05, 'epoch': 0.11} + 11%|█ | 630/5772 [1:06:32<8:52:07, 6.21s/it] {'loss': 0.4946, 'learning_rate': 1.967434162475177e-05, 'epoch': 0.11} + 11%|█ | 630/5772 [1:06:25<8:52:07, 6.21s/it] 11%|█ | 631/5772 [1:06:39<9:10:13, 6.42s/it] 11%|█ | 631/5772 [1:06:32<9:10:13, 6.42s/it] {'loss': 0.4925, 'learning_rate': 1.9672919578586832e-05, 'epoch': 0.11} + 11%|█ | 631/5772 [1:06:39<9:10:13, 6.42s/it] {'loss': 0.4925, 'learning_rate': 1.9672919578586832e-05, 'epoch': 0.11} + 11%|█ | 631/5772 [1:06:32<9:10:13, 6.42s/it] 11%|█ | 632/5772 [1:06:45<8:55:03, 6.25s/it] 11%|█ | 632/5772 [1:06:38<8:55:03, 6.25s/it] {'loss': 0.4967, 'learning_rate': 1.9671494485989656e-05, 'epoch': 0.11} + 11%|█ | 632/5772 [1:06:45<8:55:03, 6.25s/it] {'loss': 0.4967, 'learning_rate': 1.9671494485989656e-05, 'epoch': 0.11} + 11%|█ | 632/5772 [1:06:38<8:55:03, 6.25s/it] 11%|█ | 633/5772 [1:06:51<8:46:37, 6.15s/it] 11%|█ | 633/5772 [1:06:44<8:46:37, 6.15s/it] {'loss': 0.4867, 'learning_rate': 1.9670066347409063e-05, 'epoch': 0.11} + 11%|█ | 633/5772 [1:06:51<8:46:37, 6.15s/it] {'loss': 0.4867, 'learning_rate': 1.9670066347409063e-05, 'epoch': 0.11} + 11%|█ | 633/5772 [1:06:44<8:46:37, 6.15s/it] 11%|█ | 634/5772 [1:06:57<8:50:08, 6.19s/it] 11%|█ | 634/5772 [1:06:50<8:50:08, 6.19s/it] {'loss': 0.5025, 'learning_rate': 1.966863516329484e-05, 'epoch': 0.11} + 11%|█ | 634/5772 [1:06:57<8:50:08, 6.19s/it] {'loss': 0.5025, 'learning_rate': 1.966863516329484e-05, 'epoch': 0.11} + 11%|█ | 634/5772 [1:06:50<8:50:08, 6.19s/it] 11%|█ | 635/5772 [1:07:03<8:41:19, 6.09s/it] 11%|█ | 635/5772 [1:06:56<8:41:19, 6.09s/it] {'loss': 0.4966, 'learning_rate': 1.966720093409773e-05, 'epoch': 0.11} + 11%|█ | 635/5772 [1:07:03<8:41:19, 6.09s/it] {'loss': 0.4966, 'learning_rate': 1.966720093409773e-05, 'epoch': 0.11} + 11%|█ | 635/5772 [1:06:56<8:41:19, 6.09s/it] 11%|█ | 636/5772 [1:07:09<8:38:09, 6.05s/it] 11%|█ | 636/5772 [1:07:02<8:38:09, 6.05s/it] {'loss': 0.4956, 'learning_rate': 1.9665763660269436e-05, 'epoch': 0.11} + 11%|█ | 636/5772 [1:07:09<8:38:09, 6.05s/it] {'loss': 0.4956, 'learning_rate': 1.9665763660269436e-05, 'epoch': 0.11} + 11%|█ | 636/5772 [1:07:02<8:38:09, 6.05s/it] 11%|█ | 637/5772 [1:07:15<8:46:17, 6.15s/it] 11%|█ | 637/5772 [1:07:08<8:46:17, 6.15s/it] {'loss': 0.4999, 'learning_rate': 1.9664323342262623e-05, 'epoch': 0.11} + 11%|█ | 637/5772 [1:07:15<8:46:17, 6.15s/it] {'loss': 0.4999, 'learning_rate': 1.9664323342262623e-05, 'epoch': 0.11} + 11%|█ | 637/5772 [1:07:08<8:46:17, 6.15s/it] 11%|█ | 638/5772 [1:07:14<8:40:56, 6.09s/it] 11%|█ | 638/5772 [1:07:21<8:40:56, 6.09s/it] {'loss': 0.4919, 'learning_rate': 1.96628799805309e-05, 'epoch': 0.11} + 11%|█ | 638/5772 [1:07:21<8:40:56, 6.09s/it] {'loss': 0.4919, 'learning_rate': 1.96628799805309e-05, 'epoch': 0.11} + 11%|█ | 638/5772 [1:07:14<8:40:56, 6.09s/it] 11%|█ | 639/5772 [1:07:21<8:44:46, 6.13s/it] 11%|█ | 639/5772 [1:07:28<8:44:47, 6.13s/it] {'loss': 0.4875, 'learning_rate': 1.966143357552886e-05, 'epoch': 0.11} + 11%|█ | 639/5772 [1:07:28<8:44:47, 6.13s/it] {'loss': 0.4875, 'learning_rate': 1.966143357552886e-05, 'epoch': 0.11} + 11%|█ | 639/5772 [1:07:21<8:44:46, 6.13s/it] 11%|█ | 640/5772 [1:07:27<8:44:36, 6.13s/it] 11%|█ | 640/5772 [1:07:34<8:44:36, 6.13s/it] {'loss': 0.4946, 'learning_rate': 1.9659984127712027e-05, 'epoch': 0.11} + 11%|█ | 640/5772 [1:07:34<8:44:36, 6.13s/it] {'loss': 0.4946, 'learning_rate': 1.9659984127712027e-05, 'epoch': 0.11} + 11%|█ | 640/5772 [1:07:27<8:44:36, 6.13s/it] 11%|█ | 641/5772 [1:07:33<8:42:52, 6.11s/it] 11%|█ | 641/5772 [1:07:40<8:42:51, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.9658531637536905e-05, 'epoch': 0.11} + 11%|█ | 641/5772 [1:07:40<8:42:51, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.9658531637536905e-05, 'epoch': 0.11} + 11%|█ | 641/5772 [1:07:33<8:42:52, 6.11s/it] 11%|█ | 642/5772 [1:07:39<8:42:55, 6.12s/it] 11%|█ | 642/5772 [1:07:46<8:42:55, 6.12s/it] {'loss': 0.4926, 'learning_rate': 1.9657076105460945e-05, 'epoch': 0.11} + 11%|█ | 642/5772 [1:07:46<8:42:55, 6.12s/it] {'loss': 0.4926, 'learning_rate': 1.9657076105460945e-05, 'epoch': 0.11} + 11%|█ | 642/5772 [1:07:39<8:42:55, 6.12s/it] 11%|█ | 643/5772 [1:07:52<8:38:13, 6.06s/it] 11%|█ | 643/5772 [1:07:45<8:38:13, 6.06s/it] {'loss': 0.4923, 'learning_rate': 1.965561753194256e-05, 'epoch': 0.11} + 11%|█ | 643/5772 [1:07:52<8:38:13, 6.06s/it] {'loss': 0.4923, 'learning_rate': 1.965561753194256e-05, 'epoch': 0.11} + 11%|█ | 643/5772 [1:07:45<8:38:13, 6.06s/it] 11%|█ | 644/5772 [1:07:58<8:42:06, 6.11s/it] 11%|█ | 644/5772 [1:07:51<8:42:06, 6.11s/it] {'loss': 0.4929, 'learning_rate': 1.965415591744112e-05, 'epoch': 0.11} + 11%|█ | 644/5772 [1:07:58<8:42:06, 6.11s/it] {'loss': 0.4929, 'learning_rate': 1.965415591744112e-05, 'epoch': 0.11} + 11%|█ | 644/5772 [1:07:51<8:42:06, 6.11s/it] 11%|█ | 645/5772 [1:08:04<8:41:46, 6.11s/it] 11%|█ | 645/5772 [1:07:57<8:41:46, 6.11s/it] {'loss': 0.4983, 'learning_rate': 1.965269126241695e-05, 'epoch': 0.11} + 11%|█ | 645/5772 [1:08:04<8:41:46, 6.11s/it] {'loss': 0.4983, 'learning_rate': 1.965269126241695e-05, 'epoch': 0.11} + 11%|█ | 645/5772 [1:07:57<8:41:46, 6.11s/it] 11%|█ | 646/5772 [1:08:10<8:45:14, 6.15s/it] 11%|█ | 646/5772 [1:08:03<8:45:15, 6.15s/it] {'loss': 0.4932, 'learning_rate': 1.9651223567331333e-05, 'epoch': 0.11} + 11%|█ | 646/5772 [1:08:10<8:45:14, 6.15s/it] {'loss': 0.4932, 'learning_rate': 1.9651223567331333e-05, 'epoch': 0.11} + 11%|█ | 646/5772 [1:08:03<8:45:15, 6.15s/it] 11%|█ | 647/5772 [1:08:17<8:53:58, 6.25s/it] 11%|█ | 647/5772 [1:08:10<8:53:58, 6.25s/it] {'loss': 0.4953, 'learning_rate': 1.964975283264652e-05, 'epoch': 0.11} + 11%|█ | 647/5772 [1:08:17<8:53:58, 6.25s/it] {'loss': 0.4953, 'learning_rate': 1.964975283264652e-05, 'epoch': 0.11} + 11%|█ | 647/5772 [1:08:10<8:53:58, 6.25s/it] 11%|█ | 648/5772 [1:08:23<8:53:26, 6.25s/it] 11%|█ | 648/5772 [1:08:16<8:53:26, 6.25s/it] {'loss': 0.493, 'learning_rate': 1.9648279058825702e-05, 'epoch': 0.11} + 11%|█ | 648/5772 [1:08:23<8:53:26, 6.25s/it] {'loss': 0.493, 'learning_rate': 1.9648279058825702e-05, 'epoch': 0.11} + 11%|█ | 648/5772 [1:08:16<8:53:26, 6.25s/it] 11%|█ | 649/5772 [1:08:23<8:56:58, 6.29s/it] 11%|█ | 649/5772 [1:08:30<8:56:59, 6.29s/it] {'loss': 0.4916, 'learning_rate': 1.964680224633304e-05, 'epoch': 0.11} + 11%|█ | 649/5772 [1:08:30<8:56:59, 6.29s/it] {'loss': 0.4916, 'learning_rate': 1.964680224633304e-05, 'epoch': 0.11} + 11%|█ | 649/5772 [1:08:23<8:56:58, 6.29s/it]4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +32 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1113 11%|█▏ | 650/5772 [1:08:35<8:43:09, 6.13s/it]14 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + +15 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 11%|█▏ | 650/5772 [1:08:28<8:43:10, 6.13s/it]1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4984, 'learning_rate': 1.9645322395633647e-05, 'epoch': 0.11} + 11%|█▏ | 650/5772 [1:08:35<8:43:09, 6.13s/it] {'loss': 0.4984, 'learning_rate': 1.9645322395633647e-05, 'epoch': 0.11} + 11%|█▏ | 650/5772 [1:08:28<8:43:10, 6.13s/it] 11%|█▏ | 651/5772 [1:08:41<8:38:36, 6.08s/it] 11%|█▏ | 651/5772 [1:08:34<8:38:37, 6.08s/it] {'loss': 0.4904, 'learning_rate': 1.964383950719359e-05, 'epoch': 0.11} + 11%|█▏ | 651/5772 [1:08:41<8:38:36, 6.08s/it] {'loss': 0.4904, 'learning_rate': 1.964383950719359e-05, 'epoch': 0.11} + 11%|█▏ | 651/5772 [1:08:34<8:38:37, 6.08s/it] 11%|█▏ | 652/5772 [1:08:48<8:48:34, 6.19s/it] 11%|█▏ | 652/5772 [1:08:41<8:48:34, 6.19s/it] {'loss': 0.5009, 'learning_rate': 1.9642353581479904e-05, 'epoch': 0.11} + 11%|█▏ | 652/5772 [1:08:48<8:48:34, 6.19s/it] {'loss': 0.5009, 'learning_rate': 1.9642353581479904e-05, 'epoch': 0.11} + 11%|█▏ | 652/5772 [1:08:41<8:48:34, 6.19s/it] 11%|█▏ | 653/5772 [1:08:54<8:42:15, 6.12s/it] 11%|█▏ | 653/5772 [1:08:47<8:42:15, 6.12s/it] {'loss': 0.501, 'learning_rate': 1.964086461896057e-05, 'epoch': 0.11} + 11%|█▏ | 653/5772 [1:08:54<8:42:15, 6.12s/it] {'loss': 0.501, 'learning_rate': 1.964086461896057e-05, 'epoch': 0.11} + 11%|█▏ | 653/5772 [1:08:47<8:42:15, 6.12s/it] 11%|█▏ | 654/5772 [1:09:00<8:47:26, 6.18s/it] 11%|█▏ | 654/5772 [1:08:53<8:47:26, 6.18s/it] {'loss': 0.503, 'learning_rate': 1.9639372620104527e-05, 'epoch': 0.11} + 11%|█▏ | 654/5772 [1:09:00<8:47:26, 6.18s/it] {'loss': 0.503, 'learning_rate': 1.9639372620104527e-05, 'epoch': 0.11} + 11%|█▏ | 654/5772 [1:08:53<8:47:26, 6.18s/it] 11%|█▏ | 655/5772 [1:09:06<8:49:12, 6.21s/it] 11%|█▏ | 655/5772 [1:08:59<8:49:12, 6.21s/it] {'loss': 0.504, 'learning_rate': 1.9637877585381672e-05, 'epoch': 0.11} + 11%|█▏ | 655/5772 [1:09:06<8:49:12, 6.21s/it] {'loss': 0.504, 'learning_rate': 1.9637877585381672e-05, 'epoch': 0.11} + 11%|█▏ | 655/5772 [1:08:59<8:49:12, 6.21s/it] 11%|█▏ | 656/5772 [1:09:12<8:40:39, 6.11s/it] 11%|█▏ | 656/5772 [1:09:05<8:40:39, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.9636379515262857e-05, 'epoch': 0.11} + 11%|█▏ | 656/5772 [1:09:12<8:40:39, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.9636379515262857e-05, 'epoch': 0.11} + 11%|█▏ | 656/5772 [1:09:05<8:40:39, 6.11s/it] 11%|█▏ | 657/5772 [1:09:18<8:35:27, 6.05s/it] 11%|█▏ | 657/5772 [1:09:11<8:35:27, 6.05s/it] {'loss': 0.4932, 'learning_rate': 1.9634878410219893e-05, 'epoch': 0.11} + 11%|█▏ | 657/5772 [1:09:18<8:35:27, 6.05s/it] {'loss': 0.4932, 'learning_rate': 1.9634878410219893e-05, 'epoch': 0.11} + 11%|█▏ | 657/5772 [1:09:11<8:35:27, 6.05s/it] 11%|█▏ | 658/5772 [1:09:17<8:37:40, 6.07s/it] 11%|█▏ | 658/5772 [1:09:24<8:37:40, 6.07s/it] {'loss': 0.5057, 'learning_rate': 1.9633374270725546e-05, 'epoch': 0.11} + 11%|█▏ | 658/5772 [1:09:24<8:37:40, 6.07s/it] {'loss': 0.5057, 'learning_rate': 1.9633374270725546e-05, 'epoch': 0.11} + 11%|█▏ | 658/5772 [1:09:17<8:37:40, 6.07s/it] 11%|█▏ | 659/5772 [1:09:30<8:30:52, 5.99s/it] 11%|█▏ | 659/5772 [1:09:23<8:30:52, 5.99s/it] {'loss': 0.5004, 'learning_rate': 1.963186709725353e-05, 'epoch': 0.11} + 11%|█▏ | 659/5772 [1:09:30<8:30:52, 5.99s/it] {'loss': 0.5004, 'learning_rate': 1.963186709725353e-05, 'epoch': 0.11} + 11%|█▏ | 659/5772 [1:09:23<8:30:52, 5.99s/it] 11%|█▏ | 660/5772 [1:09:36<8:41:53, 6.13s/it] 11%|█▏ | 660/5772 [1:09:29<8:41:53, 6.13s/it] {'loss': 0.4987, 'learning_rate': 1.9630356890278527e-05, 'epoch': 0.11} + 11%|█▏ | 660/5772 [1:09:36<8:41:53, 6.13s/it] {'loss': 0.4987, 'learning_rate': 1.9630356890278527e-05, 'epoch': 0.11} + 11%|█▏ | 660/5772 [1:09:29<8:41:53, 6.13s/it] 11%|█▏ | 661/5772 [1:09:42<8:32:46, 6.02s/it] 11%|█▏ | 661/5772 [1:09:35<8:32:46, 6.02s/it] {'loss': 0.486, 'learning_rate': 1.9628843650276167e-05, 'epoch': 0.11} + 11%|█▏ | 661/5772 [1:09:42<8:32:46, 6.02s/it] {'loss': 0.486, 'learning_rate': 1.9628843650276167e-05, 'epoch': 0.11} + 11%|█▏ | 661/5772 [1:09:35<8:32:46, 6.02s/it] 11%|█▏ | 662/5772 [1:09:48<8:36:05, 6.06s/it] 11%|█▏ | 662/5772 [1:09:41<8:36:05, 6.06s/it] {'loss': 0.4997, 'learning_rate': 1.9627327377723035e-05, 'epoch': 0.11} + 11%|█▏ | 662/5772 [1:09:48<8:36:05, 6.06s/it] {'loss': 0.4997, 'learning_rate': 1.9627327377723035e-05, 'epoch': 0.11} + 11%|█▏ | 662/5772 [1:09:41<8:36:05, 6.06s/it] 11%|█▏ | 663/5772 [1:09:47<8:35:05, 6.05s/it] 11%|█▏ | 663/5772 [1:09:54<8:35:05, 6.05s/it] {'loss': 0.5092, 'learning_rate': 1.9625808073096676e-05, 'epoch': 0.11} + 11%|█▏ | 663/5772 [1:09:54<8:35:05, 6.05s/it] {'loss': 0.5092, 'learning_rate': 1.9625808073096676e-05, 'epoch': 0.11} + 11%|█▏ | 663/5772 [1:09:47<8:35:05, 6.05s/it] 12%|█▏ | 664/5772 [1:10:01<8:42:38, 6.14s/it] 12%|█▏ | 664/5772 [1:09:54<8:42:38, 6.14s/it] {'loss': 0.4952, 'learning_rate': 1.962428573687558e-05, 'epoch': 0.12} + 12%|█▏ | 664/5772 [1:10:01<8:42:38, 6.14s/it] {'loss': 0.4952, 'learning_rate': 1.962428573687558e-05, 'epoch': 0.12} + 12%|█▏ | 664/5772 [1:09:54<8:42:38, 6.14s/it] 12%|█▏ | 665/5772 [1:10:07<8:48:49, 6.21s/it] 12%|█▏ | 665/5772 [1:10:00<8:48:49, 6.21s/it] {'loss': 0.5028, 'learning_rate': 1.9622760369539206e-05, 'epoch': 0.12} + 12%|█▏ | 665/5772 [1:10:07<8:48:49, 6.21s/it] {'loss': 0.5028, 'learning_rate': 1.9622760369539206e-05, 'epoch': 0.12} + 12%|█▏ | 665/5772 [1:10:00<8:48:49, 6.21s/it] 12%|█▏ | 666/5772 [1:10:13<8:44:02, 6.16s/it] 12%|█▏ | 666/5772 [1:10:06<8:44:02, 6.16s/it] {'loss': 0.5023, 'learning_rate': 1.9621231971567955e-05, 'epoch': 0.12} + 12%|█▏ | 666/5772 [1:10:13<8:44:02, 6.16s/it] {'loss': 0.5023, 'learning_rate': 1.9621231971567955e-05, 'epoch': 0.12} + 12%|█▏ | 666/5772 [1:10:06<8:44:02, 6.16s/it] 12%|█▏ | 667/5772 [1:10:19<8:41:46, 6.13s/it] 12%|█▏ | 667/5772 [1:10:12<8:41:46, 6.13s/it] {'loss': 0.4897, 'learning_rate': 1.9619700543443187e-05, 'epoch': 0.12} + 12%|█▏ | 667/5772 [1:10:19<8:41:46, 6.13s/it] {'loss': 0.4897, 'learning_rate': 1.9619700543443187e-05, 'epoch': 0.12} + 12%|█▏ | 667/5772 [1:10:12<8:41:46, 6.13s/it] 12%|█▏ | 668/5772 [1:10:25<8:35:02, 6.05s/it] 12%|█▏ | 668/5772 [1:10:18<8:35:02, 6.05s/it] {'loss': 0.4956, 'learning_rate': 1.961816608564722e-05, 'epoch': 0.12} + 12%|█▏ | 668/5772 [1:10:25<8:35:02, 6.05s/it] {'loss': 0.4956, 'learning_rate': 1.961816608564722e-05, 'epoch': 0.12} + 12%|█▏ | 668/5772 [1:10:18<8:35:02, 6.05s/it] 12%|█▏ | 669/5772 [1:10:31<8:33:28, 6.04s/it] 12%|█▏ | 669/5772 [1:10:24<8:33:29, 6.04s/it] {'loss': 0.5053, 'learning_rate': 1.9616628598663322e-05, 'epoch': 0.12} + 12%|█▏ | 669/5772 [1:10:31<8:33:28, 6.04s/it] {'loss': 0.5053, 'learning_rate': 1.9616628598663322e-05, 'epoch': 0.12} + 12%|█▏ | 669/5772 [1:10:24<8:33:29, 6.04s/it] 12%|█▏ | 670/5772 [1:10:37<8:32:51, 6.03s/it] 12%|█▏ | 670/5772 [1:10:30<8:32:51, 6.03s/it] {'loss': 0.4903, 'learning_rate': 1.9615088082975715e-05, 'epoch': 0.12} + 12%|█▏ | 670/5772 [1:10:37<8:32:51, 6.03s/it] {'loss': 0.4903, 'learning_rate': 1.9615088082975715e-05, 'epoch': 0.12} + 12%|█▏ | 670/5772 [1:10:30<8:32:51, 6.03s/it] 12%|█▏ | 671/5772 [1:10:43<8:36:37, 6.08s/it] 12%|█▏ | 671/5772 [1:10:36<8:36:37, 6.08s/it] {'loss': 0.4986, 'learning_rate': 1.9613544539069577e-05, 'epoch': 0.12} + 12%|█▏ | 671/5772 [1:10:43<8:36:37, 6.08s/it] {'loss': 0.4986, 'learning_rate': 1.9613544539069577e-05, 'epoch': 0.12} + 12%|█▏ | 671/5772 [1:10:36<8:36:37, 6.08s/it] 12%|█▏ | 672/5772 [1:10:49<8:36:46, 6.08s/it] 12%|█▏ | 672/5772 [1:10:42<8:36:46, 6.08s/it] {'loss': 0.4863, 'learning_rate': 1.9611997967431037e-05, 'epoch': 0.12} + 12%|█▏ | 672/5772 [1:10:49<8:36:46, 6.08s/it] {'loss': 0.4863, 'learning_rate': 1.9611997967431037e-05, 'epoch': 0.12} + 12%|█▏ | 672/5772 [1:10:42<8:36:46, 6.08s/it] 12%|█▏ | 673/5772 [1:10:48<8:29:41, 6.00s/it] 12%|█▏ | 673/5772 [1:10:55<8:29:41, 6.00s/it] {'loss': 0.4977, 'learning_rate': 1.9610448368547182e-05, 'epoch': 0.12} + 12%|█▏ | 673/5772 [1:10:55<8:29:41, 6.00s/it] {'loss': 0.4977, 'learning_rate': 1.9610448368547182e-05, 'epoch': 0.12} + 12%|█▏ | 673/5772 [1:10:48<8:29:41, 6.00s/it] 12%|█▏ | 674/5772 [1:11:01<8:31:50, 6.02s/it] 12%|█▏ | 674/5772 [1:10:54<8:31:50, 6.02s/it] {'loss': 0.4801, 'learning_rate': 1.9608895742906046e-05, 'epoch': 0.12} + 12%|█▏ | 674/5772 [1:11:01<8:31:50, 6.02s/it] {'loss': 0.4801, 'learning_rate': 1.9608895742906046e-05, 'epoch': 0.12} + 12%|█▏ | 674/5772 [1:10:54<8:31:50, 6.02s/it] 12%|█▏ | 675/5772 [1:11:07<8:35:18, 6.07s/it] 12%|█▏ | 675/5772 [1:11:00<8:35:18, 6.07s/it] {'loss': 0.4961, 'learning_rate': 1.960734009099662e-05, 'epoch': 0.12} + 12%|█▏ | 675/5772 [1:11:07<8:35:18, 6.07s/it] {'loss': 0.4961, 'learning_rate': 1.960734009099662e-05, 'epoch': 0.12} + 12%|█▏ | 675/5772 [1:11:00<8:35:18, 6.07s/it] 12%|█▏ | 676/5772 [1:11:14<8:37:24, 6.09s/it] 12%|█▏ | 676/5772 [1:11:07<8:37:24, 6.09s/it] {'loss': 0.4988, 'learning_rate': 1.9605781413308852e-05, 'epoch': 0.12} + 12%|█▏ | 676/5772 [1:11:14<8:37:24, 6.09s/it] {'loss': 0.4988, 'learning_rate': 1.9605781413308852e-05, 'epoch': 0.12} + 12%|█▏ | 676/5772 [1:11:07<8:37:24, 6.09s/it] 12%|█▏ | 677/5772 [1:11:20<8:32:44, 6.04s/it] 12%|█▏ | 677/5772 [1:11:12<8:32:44, 6.04s/it] {'loss': 0.4955, 'learning_rate': 1.9604219710333637e-05, 'epoch': 0.12} + 12%|█▏ | 677/5772 [1:11:20<8:32:44, 6.04s/it] {'loss': 0.4955, 'learning_rate': 1.9604219710333637e-05, 'epoch': 0.12} + 12%|█▏ | 677/5772 [1:11:12<8:32:44, 6.04s/it] 12%|█▏ | 678/5772 [1:11:26<8:33:59, 6.05s/it] 12%|█▏ | 678/5772 [1:11:19<8:33:58, 6.05s/it] {'loss': 0.4939, 'learning_rate': 1.9602654982562822e-05, 'epoch': 0.12} + 12%|█▏ | 678/5772 [1:11:26<8:33:59, 6.05s/it] {'loss': 0.4939, 'learning_rate': 1.9602654982562822e-05, 'epoch': 0.12} + 12%|█▏ | 678/5772 [1:11:19<8:33:58, 6.05s/it] 12%|█▏ | 679/5772 [1:11:32<8:31:30, 6.03s/it] 12%|█▏ | 679/5772 [1:11:25<8:31:30, 6.03s/it] {'loss': 0.5024, 'learning_rate': 1.960108723048921e-05, 'epoch': 0.12} + 12%|█▏ | 679/5772 [1:11:32<8:31:30, 6.03s/it] {'loss': 0.5024, 'learning_rate': 1.960108723048921e-05, 'epoch': 0.12} + 12%|█▏ | 679/5772 [1:11:25<8:31:30, 6.03s/it] 12%|█▏ | 680/5772 [1:11:38<8:40:51, 6.14s/it] 12%|█▏ | 680/5772 [1:11:31<8:40:51, 6.14s/it] {'loss': 0.4843, 'learning_rate': 1.959951645460656e-05, 'epoch': 0.12} + 12%|█▏ | 680/5772 [1:11:38<8:40:51, 6.14s/it] {'loss': 0.4843, 'learning_rate': 1.959951645460656e-05, 'epoch': 0.12} + 12%|█▏ | 680/5772 [1:11:31<8:40:51, 6.14s/it] 12%|█▏ | 681/5772 [1:11:44<8:49:46, 6.24s/it] 12%|█▏ | 681/5772 [1:11:37<8:49:46, 6.24s/it] {'loss': 0.4985, 'learning_rate': 1.9597942655409574e-05, 'epoch': 0.12} + 12%|█▏ | 681/5772 [1:11:44<8:49:46, 6.24s/it] {'loss': 0.4985, 'learning_rate': 1.9597942655409574e-05, 'epoch': 0.12} + 12%|█▏ | 681/5772 [1:11:37<8:49:46, 6.24s/it] 12%|█▏ | 682/5772 [1:11:51<8:46:49, 6.21s/it] 12%|█▏ | 682/5772 [1:11:44<8:46:49, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.9596365833393913e-05, 'epoch': 0.12} + 12%|█▏ | 682/5772 [1:11:51<8:46:49, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.9596365833393913e-05, 'epoch': 0.12} + 12%|█▏ | 682/5772 [1:11:44<8:46:49, 6.21s/it] 12%|█▏ | 683/5772 [1:11:57<8:46:13, 6.20s/it] 12%|█▏ | 683/5772 [1:11:50<8:46:13, 6.20s/it] {'loss': 0.511, 'learning_rate': 1.959478598905619e-05, 'epoch': 0.12} + 12%|█▏ | 683/5772 [1:11:57<8:46:13, 6.20s/it] {'loss': 0.511, 'learning_rate': 1.959478598905619e-05, 'epoch': 0.12} + 12%|█▏ | 683/5772 [1:11:50<8:46:13, 6.20s/it] 12%|█▏ | 684/5772 [1:11:56<8:43:35, 6.17s/it] 12%|█▏ | 684/5772 [1:12:03<8:43:35, 6.17s/it] {'loss': 0.4879, 'learning_rate': 1.9593203122893966e-05, 'epoch': 0.12} + 12%|█▏ | 684/5772 [1:12:03<8:43:35, 6.17s/it] {'loss': 0.4879, 'learning_rate': 1.9593203122893966e-05, 'epoch': 0.12} + 12%|█▏ | 684/5772 [1:11:56<8:43:35, 6.17s/it] 12%|█▏ | 685/5772 [1:12:02<8:37:11, 6.10s/it] 12%|█▏ | 685/5772 [1:12:09<8:37:12, 6.10s/it] {'loss': 0.4948, 'learning_rate': 1.959161723540576e-05, 'epoch': 0.12} + 12%|█▏ | 685/5772 [1:12:09<8:37:12, 6.10s/it] {'loss': 0.4948, 'learning_rate': 1.959161723540576e-05, 'epoch': 0.12} + 12%|█▏ | 685/5772 [1:12:02<8:37:11, 6.10s/it] 12%|█▏ | 686/5772 [1:12:08<8:33:32, 6.06s/it] 12%|█▏ | 686/5772 [1:12:15<8:33:31, 6.06s/it] {'loss': 0.4896, 'learning_rate': 1.959002832709103e-05, 'epoch': 0.12} + 12%|█▏ | 686/5772 [1:12:15<8:33:31, 6.06s/it] {'loss': 0.4896, 'learning_rate': 1.959002832709103e-05, 'epoch': 0.12} + 12%|█▏ | 686/5772 [1:12:08<8:33:32, 6.06s/it] 12%|█▏ | 687/5772 [1:12:21<8:34:23, 6.07s/it] 12%|█▏ | 687/5772 [1:12:14<8:34:24, 6.07s/it] {'loss': 0.505, 'learning_rate': 1.9588436398450206e-05, 'epoch': 0.12} + 12%|█▏ | 687/5772 [1:12:21<8:34:23, 6.07s/it] {'loss': 0.505, 'learning_rate': 1.9588436398450206e-05, 'epoch': 0.12} + 12%|█▏ | 687/5772 [1:12:14<8:34:24, 6.07s/it] 12%|█▏ | 688/5772 [1:12:28<8:51:36, 6.27s/it] 12%|█▏ | 688/5772 [1:12:21<8:51:36, 6.27s/it] {'loss': 0.4988, 'learning_rate': 1.9586841449984643e-05, 'epoch': 0.12} + 12%|█▏ | 688/5772 [1:12:28<8:51:36, 6.27s/it] {'loss': 0.4988, 'learning_rate': 1.9586841449984643e-05, 'epoch': 0.12} + 12%|█▏ | 688/5772 [1:12:21<8:51:36, 6.27s/it] 12%|█▏ | 689/5772 [1:12:27<9:00:50, 6.38s/it] 12%|█▏ | 689/5772 [1:12:34<9:00:50, 6.38s/it] {'loss': 0.5032, 'learning_rate': 1.958524348219667e-05, 'epoch': 0.12} + 12%|█▏ | 689/5772 [1:12:34<9:00:50, 6.38s/it] {'loss': 0.5032, 'learning_rate': 1.958524348219667e-05, 'epoch': 0.12} + 12%|█▏ | 689/5772 [1:12:27<9:00:50, 6.38s/it] 12%|█▏ | 690/5772 [1:12:40<8:56:18, 6.33s/it] 12%|█▏ | 690/5772 [1:12:33<8:56:18, 6.33s/it] {'loss': 0.4961, 'learning_rate': 1.958364249558956e-05, 'epoch': 0.12} + 12%|█▏ | 690/5772 [1:12:40<8:56:18, 6.33s/it] {'loss': 0.4961, 'learning_rate': 1.958364249558956e-05, 'epoch': 0.12} + 12%|█▏ | 690/5772 [1:12:33<8:56:18, 6.33s/it] 12%|█▏ | 691/5772 [1:12:40<8:51:48, 6.28s/it] 12%|█▏ | 691/5772 [1:12:47<8:51:48, 6.28s/it] {'loss': 0.4944, 'learning_rate': 1.9582038490667532e-05, 'epoch': 0.12} + 12%|█▏ | 691/5772 [1:12:47<8:51:48, 6.28s/it] {'loss': 0.4944, 'learning_rate': 1.9582038490667532e-05, 'epoch': 0.12} + 12%|█▏ | 691/5772 [1:12:40<8:51:48, 6.28s/it] 12%|█▏ | 692/5772 [1:12:53<8:44:41, 6.20s/it] 12%|█▏ | 692/5772 [1:12:46<8:44:42, 6.20s/it] {'loss': 0.4867, 'learning_rate': 1.9580431467935753e-05, 'epoch': 0.12} + 12%|█▏ | 692/5772 [1:12:53<8:44:41, 6.20s/it] {'loss': 0.4867, 'learning_rate': 1.9580431467935753e-05, 'epoch': 0.12} + 12%|█▏ | 692/5772 [1:12:46<8:44:42, 6.20s/it] 12%|█▏ | 693/5772 [1:12:59<8:41:32, 6.16s/it] 12%|█▏ | 693/5772 [1:12:52<8:41:32, 6.16s/it] {'loss': 0.5004, 'learning_rate': 1.957882142790035e-05, 'epoch': 0.12} + 12%|█▏ | 693/5772 [1:12:59<8:41:32, 6.16s/it] {'loss': 0.5004, 'learning_rate': 1.957882142790035e-05, 'epoch': 0.12} + 12%|█▏ | 693/5772 [1:12:52<8:41:32, 6.16s/it] 12%|█▏ | 694/5772 [1:13:05<8:41:12, 6.16s/it] 12%|█▏ | 694/5772 [1:12:58<8:41:12, 6.16s/it] {'loss': 0.5042, 'learning_rate': 1.95772083710684e-05, 'epoch': 0.12} + 12%|█▏ | 694/5772 [1:13:05<8:41:12, 6.16s/it] {'loss': 0.5042, 'learning_rate': 1.95772083710684e-05, 'epoch': 0.12} + 12%|█▏ | 694/5772 [1:12:58<8:41:12, 6.16s/it] 12%|█▏ | 695/5772 [1:13:12<8:57:46, 6.36s/it] 12%|█▏ | 695/5772 [1:13:05<8:57:47, 6.36s/it] {'loss': 0.4973, 'learning_rate': 1.9575592297947926e-05, 'epoch': 0.12} + 12%|█▏ | 695/5772 [1:13:12<8:57:46, 6.36s/it] {'loss': 0.4973, 'learning_rate': 1.9575592297947926e-05, 'epoch': 0.12} + 12%|█▏ | 695/5772 [1:13:05<8:57:47, 6.36s/it] 12%|█▏ | 696/5772 [1:13:11<8:55:58, 6.34s/it] 12%|█▏ | 696/5772 [1:13:18<8:55:59, 6.34s/it] {'loss': 0.5008, 'learning_rate': 1.9573973209047893e-05, 'epoch': 0.12} + 12%|█▏ | 696/5772 [1:13:18<8:55:59, 6.34s/it] {'loss': 0.5008, 'learning_rate': 1.9573973209047893e-05, 'epoch': 0.12} + 12%|█▏ | 696/5772 [1:13:11<8:55:58, 6.34s/it] 12%|█▏ | 697/5772 [1:13:17<9:00:05, 6.39s/it] 12%|█▏ | 697/5772 [1:13:24<9:00:05, 6.39s/it] {'loss': 0.4941, 'learning_rate': 1.9572351104878232e-05, 'epoch': 0.12} + 12%|█▏ | 697/5772 [1:13:24<9:00:05, 6.39s/it] {'loss': 0.4941, 'learning_rate': 1.9572351104878232e-05, 'epoch': 0.12} + 12%|█▏ | 697/5772 [1:13:17<9:00:05, 6.39s/it] 12%|█▏ | 698/5772 [1:13:30<8:49:25, 6.26s/it] 12%|█▏ | 698/5772 [1:13:23<8:49:25, 6.26s/it] {'loss': 0.4974, 'learning_rate': 1.957072598594981e-05, 'epoch': 0.12} + 12%|█▏ | 698/5772 [1:13:30<8:49:25, 6.26s/it] {'loss': 0.4974, 'learning_rate': 1.957072598594981e-05, 'epoch': 0.12} + 12%|█▏ | 698/5772 [1:13:23<8:49:25, 6.26s/it] 12%|█▏ | 699/5772 [1:13:37<8:50:34, 6.28s/it] 12%|█▏ | 699/5772 [1:13:30<8:50:35, 6.28s/it] {'loss': 0.4998, 'learning_rate': 1.9569097852774456e-05, 'epoch': 0.12} + 12%|█▏ | 699/5772 [1:13:37<8:50:34, 6.28s/it] {'loss': 0.4998, 'learning_rate': 1.9569097852774456e-05, 'epoch': 0.12} + 12%|█▏ | 699/5772 [1:13:30<8:50:35, 6.28s/it]13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 12%|█▏ | 700/5772 [1:13:43<8:53:36, 6.31s/it]9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 12%|█▏ | 700/5772 [1:13:36<8:53:37, 6.31s/it]3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4911, 'learning_rate': 1.9567466705864934e-05, 'epoch': 0.12} + 12%|█▏ | 700/5772 [1:13:43<8:53:36, 6.31s/it] {'loss': 0.4911, 'learning_rate': 1.9567466705864934e-05, 'epoch': 0.12} + 12%|█▏ | 700/5772 [1:13:36<8:53:37, 6.31s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 12%|█▏ | 701/5772 [1:14:04<14:52:05, 10.56s/it] 12%|█▏ | 701/5772 [1:13:57<14:52:05, 10.56s/it] {'loss': 0.4998, 'learning_rate': 1.9565832545734972e-05, 'epoch': 0.12} + 12%|█▏ | 701/5772 [1:14:04<14:52:05, 10.56s/it] {'loss': 0.4998, 'learning_rate': 1.9565832545734972e-05, 'epoch': 0.12} + 12%|█▏ | 701/5772 [1:13:57<14:52:05, 10.56s/it] 12%|█▏ | 702/5772 [1:14:10<12:55:05, 9.17s/it] 12%|█▏ | 702/5772 [1:14:03<12:55:04, 9.17s/it] {'loss': 0.4952, 'learning_rate': 1.9564195372899233e-05, 'epoch': 0.12} + 12%|█▏ | 702/5772 [1:14:10<12:55:05, 9.17s/it] {'loss': 0.4952, 'learning_rate': 1.9564195372899233e-05, 'epoch': 0.12} + 12%|█▏ | 702/5772 [1:14:03<12:55:04, 9.17s/it] 12%|█▏ | 703/5772 [1:14:16<11:48:40, 8.39s/it] 12%|█▏ | 703/5772 [1:14:09<11:48:40, 8.39s/it] {'loss': 0.5178, 'learning_rate': 1.956255518787334e-05, 'epoch': 0.12} + 12%|█▏ | 703/5772 [1:14:16<11:48:40, 8.39s/it] {'loss': 0.5178, 'learning_rate': 1.956255518787334e-05, 'epoch': 0.12} + 12%|█▏ | 703/5772 [1:14:09<11:48:40, 8.39s/it] 12%|█▏ | 704/5772 [1:14:22<10:54:29, 7.75s/it] 12%|█▏ | 704/5772 [1:14:15<10:54:29, 7.75s/it] {'loss': 0.491, 'learning_rate': 1.9560911991173856e-05, 'epoch': 0.12} + 12%|█▏ | 704/5772 [1:14:22<10:54:29, 7.75s/it] {'loss': 0.491, 'learning_rate': 1.9560911991173856e-05, 'epoch': 0.12} + 12%|█▏ | 704/5772 [1:14:15<10:54:29, 7.75s/it] 12%|█▏ | 705/5772 [1:14:22<10:15:51, 7.29s/it] 12%|█▏ | 705/5772 [1:14:29<10:15:51, 7.29s/it] {'loss': 0.503, 'learning_rate': 1.9559265783318304e-05, 'epoch': 0.12} + 12%|█▏ | 705/5772 [1:14:29<10:15:51, 7.29s/it] {'loss': 0.503, 'learning_rate': 1.9559265783318304e-05, 'epoch': 0.12} + 12%|█▏ | 705/5772 [1:14:22<10:15:51, 7.29s/it] 12%|█▏ | 706/5772 [1:14:35<9:49:24, 6.98s/it] 12%|█▏ | 706/5772 [1:14:28<9:49:24, 6.98s/it] {'loss': 0.5068, 'learning_rate': 1.9557616564825138e-05, 'epoch': 0.12} + 12%|█▏ | 706/5772 [1:14:35<9:49:24, 6.98s/it] {'loss': 0.5068, 'learning_rate': 1.9557616564825138e-05, 'epoch': 0.12} + 12%|█▏ | 706/5772 [1:14:28<9:49:24, 6.98s/it] 12%|█▏ | 707/5772 [1:14:41<9:21:02, 6.65s/it] 12%|█▏ | 707/5772 [1:14:34<9:21:03, 6.65s/it] {'loss': 0.4874, 'learning_rate': 1.955596433621378e-05, 'epoch': 0.12} + 12%|█▏ | 707/5772 [1:14:41<9:21:02, 6.65s/it] {'loss': 0.4874, 'learning_rate': 1.955596433621378e-05, 'epoch': 0.12} + 12%|█▏ | 707/5772 [1:14:34<9:21:03, 6.65s/it] 12%|█▏ | 708/5772 [1:14:47<9:05:26, 6.46s/it] 12%|█▏ | 708/5772 [1:14:40<9:05:26, 6.46s/it] {'loss': 0.5021, 'learning_rate': 1.9554309098004583e-05, 'epoch': 0.12} + 12%|█▏ | 708/5772 [1:14:47<9:05:26, 6.46s/it] {'loss': 0.5021, 'learning_rate': 1.9554309098004583e-05, 'epoch': 0.12} + 12%|█▏ | 708/5772 [1:14:40<9:05:26, 6.46s/it] 12%|█▏ | 709/5772 [1:14:46<9:11:54, 6.54s/it] 12%|█▏ | 709/5772 [1:14:53<9:11:54, 6.54s/it] {'loss': 0.504, 'learning_rate': 1.955265085071886e-05, 'epoch': 0.12} + 12%|█▏ | 709/5772 [1:14:53<9:11:54, 6.54s/it] {'loss': 0.504, 'learning_rate': 1.955265085071886e-05, 'epoch': 0.12} + 12%|█▏ | 709/5772 [1:14:46<9:11:54, 6.54s/it] 12%|█▏ | 710/5772 [1:15:00<9:01:32, 6.42s/it] 12%|█▏ | 710/5772 [1:14:53<9:01:33, 6.42s/it] {'loss': 0.4918, 'learning_rate': 1.9550989594878862e-05, 'epoch': 0.12} + 12%|█▏ | 710/5772 [1:15:00<9:01:32, 6.42s/it] {'loss': 0.4918, 'learning_rate': 1.9550989594878862e-05, 'epoch': 0.12} + 12%|█▏ | 710/5772 [1:14:53<9:01:33, 6.42s/it] 12%|█▏ | 711/5772 [1:15:06<8:52:42, 6.32s/it] 12%|█▏ | 711/5772 [1:14:59<8:52:42, 6.32s/it] {'loss': 0.5056, 'learning_rate': 1.9549325331007795e-05, 'epoch': 0.12} + 12%|█▏ | 711/5772 [1:15:06<8:52:42, 6.32s/it] {'loss': 0.5056, 'learning_rate': 1.9549325331007795e-05, 'epoch': 0.12} + 12%|█▏ | 711/5772 [1:14:59<8:52:42, 6.32s/it] 12%|█▏ | 712/5772 [1:15:05<8:50:10, 6.29s/it] 12%|█▏ | 712/5772 [1:15:12<8:50:10, 6.29s/it] {'loss': 0.4862, 'learning_rate': 1.954765805962981e-05, 'epoch': 0.12} + 12%|█▏ | 712/5772 [1:15:12<8:50:10, 6.29s/it] {'loss': 0.4862, 'learning_rate': 1.954765805962981e-05, 'epoch': 0.12} + 12%|█▏ | 712/5772 [1:15:05<8:50:10, 6.29s/it] 12%|█▏ | 713/5772 [1:15:11<8:47:41, 6.26s/it] 12%|█▏ | 713/5772 [1:15:18<8:47:41, 6.26s/it] {'loss': 0.5218, 'learning_rate': 1.9545987781270007e-05, 'epoch': 0.12} + 12%|█▏ | 713/5772 [1:15:18<8:47:41, 6.26s/it] {'loss': 0.5218, 'learning_rate': 1.9545987781270007e-05, 'epoch': 0.12} + 12%|█▏ | 713/5772 [1:15:11<8:47:41, 6.26s/it] 12%|█▏ | 714/5772 [1:15:24<8:48:22, 6.27s/it] 12%|█▏ | 714/5772 [1:15:17<8:48:23, 6.27s/it] {'loss': 0.4962, 'learning_rate': 1.9544314496454423e-05, 'epoch': 0.12} + 12%|█▏ | 714/5772 [1:15:24<8:48:22, 6.27s/it] {'loss': 0.4962, 'learning_rate': 1.9544314496454423e-05, 'epoch': 0.12} + 12%|█▏ | 714/5772 [1:15:17<8:48:23, 6.27s/it] 12%|█▏ | 715/5772 [1:15:30<8:41:30, 6.19s/it] 12%|█▏ | 715/5772 [1:15:23<8:41:30, 6.19s/it] {'loss': 0.5025, 'learning_rate': 1.9542638205710058e-05, 'epoch': 0.12} + 12%|█▏ | 715/5772 [1:15:30<8:41:30, 6.19s/it] {'loss': 0.5025, 'learning_rate': 1.9542638205710058e-05, 'epoch': 0.12} + 12%|█▏ | 715/5772 [1:15:23<8:41:30, 6.19s/it] 12%|█▏ | 716/5772 [1:15:29<8:37:12, 6.14s/it] 12%|█▏ | 716/5772 [1:15:36<8:37:12, 6.14s/it] {'loss': 0.4878, 'learning_rate': 1.9540958909564846e-05, 'epoch': 0.12} + 12%|█▏ | 716/5772 [1:15:36<8:37:12, 6.14s/it] {'loss': 0.4878, 'learning_rate': 1.9540958909564846e-05, 'epoch': 0.12} + 12%|█▏ | 716/5772 [1:15:29<8:37:12, 6.14s/it] 12%|█▏ | 717/5772 [1:15:35<8:35:12, 6.12s/it] 12%|█▏ | 717/5772 [1:15:42<8:35:12, 6.12s/it] {'loss': 0.4929, 'learning_rate': 1.9539276608547676e-05, 'epoch': 0.12} + 12%|█▏ | 717/5772 [1:15:42<8:35:12, 6.12s/it] {'loss': 0.4929, 'learning_rate': 1.9539276608547676e-05, 'epoch': 0.12} + 12%|█▏ | 717/5772 [1:15:35<8:35:12, 6.12s/it] 12%|█▏ | 718/5772 [1:15:42<8:40:25, 6.18s/it] 12%|█▏ | 718/5772 [1:15:49<8:40:25, 6.18s/it] {'loss': 0.4846, 'learning_rate': 1.9537591303188375e-05, 'epoch': 0.12} + 12%|█▏ | 718/5772 [1:15:49<8:40:25, 6.18s/it] {'loss': 0.4846, 'learning_rate': 1.9537591303188375e-05, 'epoch': 0.12} + 12%|█▏ | 718/5772 [1:15:42<8:40:25, 6.18s/it] 12%|█▏ | 719/5772 [1:15:48<8:39:15, 6.17s/it] 12%|█▏ | 719/5772 [1:15:55<8:39:15, 6.17s/it] {'loss': 0.4969, 'learning_rate': 1.953590299401772e-05, 'epoch': 0.12} + 12%|█▏ | 719/5772 [1:15:55<8:39:15, 6.17s/it] {'loss': 0.4969, 'learning_rate': 1.953590299401772e-05, 'epoch': 0.12} + 12%|█▏ | 719/5772 [1:15:48<8:39:15, 6.17s/it] 12%|█▏ | 720/5772 [1:15:54<8:40:50, 6.19s/it] 12%|█▏ | 720/5772 [1:16:01<8:40:50, 6.19s/it] {'loss': 0.4985, 'learning_rate': 1.953421168156744e-05, 'epoch': 0.12} + 12%|█▏ | 720/5772 [1:16:01<8:40:50, 6.19s/it] {'loss': 0.4985, 'learning_rate': 1.953421168156744e-05, 'epoch': 0.12} + 12%|█▏ | 720/5772 [1:15:54<8:40:50, 6.19s/it] 12%|█▏ | 721/5772 [1:16:00<8:36:20, 6.13s/it] 12%|█▏ | 721/5772 [1:16:07<8:36:20, 6.13s/it] {'loss': 0.4978, 'learning_rate': 1.9532517366370203e-05, 'epoch': 0.12} + 12%|█▏ | 721/5772 [1:16:07<8:36:20, 6.13s/it] {'loss': 0.4978, 'learning_rate': 1.9532517366370203e-05, 'epoch': 0.12} + 12%|█▏ | 721/5772 [1:16:00<8:36:20, 6.13s/it] 13%|█▎ | 722/5772 [1:16:06<8:36:12, 6.13s/it] 13%|█▎ | 722/5772 [1:16:13<8:36:12, 6.13s/it] {'loss': 0.4921, 'learning_rate': 1.9530820048959616e-05, 'epoch': 0.13} + 13%|█▎ | 722/5772 [1:16:13<8:36:12, 6.13s/it] {'loss': 0.4921, 'learning_rate': 1.9530820048959616e-05, 'epoch': 0.13} + 13%|█▎ | 722/5772 [1:16:06<8:36:12, 6.13s/it] 13%|█▎ | 723/5772 [1:16:19<8:36:44, 6.14s/it] 13%|█▎ | 723/5772 [1:16:12<8:36:45, 6.14s/it] {'loss': 0.5012, 'learning_rate': 1.9529119729870253e-05, 'epoch': 0.13} + 13%|█▎ | 723/5772 [1:16:19<8:36:44, 6.14s/it] {'loss': 0.5012, 'learning_rate': 1.9529119729870253e-05, 'epoch': 0.13} + 13%|█▎ | 723/5772 [1:16:12<8:36:45, 6.14s/it] 13%|█▎ | 724/5772 [1:16:25<8:33:22, 6.10s/it] 13%|█▎ | 724/5772 [1:16:18<8:33:23, 6.10s/it] {'loss': 0.4827, 'learning_rate': 1.952741640963761e-05, 'epoch': 0.13} + 13%|█▎ | 724/5772 [1:16:25<8:33:22, 6.10s/it] {'loss': 0.4827, 'learning_rate': 1.952741640963761e-05, 'epoch': 0.13} + 13%|█▎ | 724/5772 [1:16:18<8:33:23, 6.10s/it] 13%|█▎ | 725/5772 [1:16:24<8:27:07, 6.03s/it] 13%|█▎ | 725/5772 [1:16:31<8:27:08, 6.03s/it] {'loss': 0.5087, 'learning_rate': 1.9525710088798142e-05, 'epoch': 0.13} + 13%|█▎ | 725/5772 [1:16:31<8:27:08, 6.03s/it] {'loss': 0.5087, 'learning_rate': 1.9525710088798142e-05, 'epoch': 0.13} + 13%|█▎ | 725/5772 [1:16:24<8:27:07, 6.03s/it] 13%|█▎ | 726/5772 [1:16:38<8:38:31, 6.17s/it] 13%|█▎ | 726/5772 [1:16:31<8:38:31, 6.17s/it] {'loss': 0.4864, 'learning_rate': 1.9524000767889243e-05, 'epoch': 0.13} + 13%|█▎ | 726/5772 [1:16:38<8:38:31, 6.17s/it] {'loss': 0.4864, 'learning_rate': 1.9524000767889243e-05, 'epoch': 0.13} + 13%|█▎ | 726/5772 [1:16:31<8:38:31, 6.17s/it] 13%|█▎ | 727/5772 [1:16:44<8:43:20, 6.22s/it] 13%|█▎ | 727/5772 [1:16:37<8:43:20, 6.22s/it] {'loss': 0.4965, 'learning_rate': 1.952228844744926e-05, 'epoch': 0.13} + 13%|█▎ | 727/5772 [1:16:44<8:43:20, 6.22s/it] {'loss': 0.4965, 'learning_rate': 1.952228844744926e-05, 'epoch': 0.13} + 13%|█▎ | 727/5772 [1:16:37<8:43:20, 6.22s/it] 13%|█▎ | 728/5772 [1:16:43<8:40:23, 6.19s/it] 13%|█▎ | 728/5772 [1:16:50<8:40:23, 6.19s/it] {'loss': 0.4973, 'learning_rate': 1.9520573128017467e-05, 'epoch': 0.13} + 13%|█▎ | 728/5772 [1:16:50<8:40:23, 6.19s/it] {'loss': 0.4973, 'learning_rate': 1.9520573128017467e-05, 'epoch': 0.13} + 13%|█▎ | 728/5772 [1:16:43<8:40:23, 6.19s/it] 13%|█▎ | 729/5772 [1:16:49<8:40:58, 6.20s/it] 13%|█▎ | 729/5772 [1:16:57<8:40:58, 6.20s/it] {'loss': 0.4982, 'learning_rate': 1.951885481013411e-05, 'epoch': 0.13} + 13%|█▎ | 729/5772 [1:16:57<8:40:58, 6.20s/it] {'loss': 0.4982, 'learning_rate': 1.951885481013411e-05, 'epoch': 0.13} + 13%|█▎ | 729/5772 [1:16:49<8:40:58, 6.20s/it] 13%|█▎ | 730/5772 [1:16:56<8:55:36, 6.37s/it] 13%|█▎ | 730/5772 [1:17:03<8:55:37, 6.37s/it] {'loss': 0.4904, 'learning_rate': 1.951713349434035e-05, 'epoch': 0.13} + 13%|█▎ | 730/5772 [1:17:03<8:55:37, 6.37s/it] {'loss': 0.4904, 'learning_rate': 1.951713349434035e-05, 'epoch': 0.13} + 13%|█▎ | 730/5772 [1:16:56<8:55:36, 6.37s/it] 13%|█▎ | 731/5772 [1:17:02<8:36:31, 6.15s/it] 13%|█▎ | 731/5772 [1:17:09<8:36:31, 6.15s/it] {'loss': 0.4877, 'learning_rate': 1.9515409181178315e-05, 'epoch': 0.13} + 13%|█▎ | 731/5772 [1:17:09<8:36:31, 6.15s/it] {'loss': 0.4877, 'learning_rate': 1.9515409181178315e-05, 'epoch': 0.13} + 13%|█▎ | 731/5772 [1:17:02<8:36:31, 6.15s/it] 13%|█▎ | 732/5772 [1:17:08<8:37:38, 6.16s/it] 13%|█▎ | 732/5772 [1:17:15<8:37:39, 6.16s/it] {'loss': 0.4991, 'learning_rate': 1.9513681871191063e-05, 'epoch': 0.13} + 13%|█▎ | 732/5772 [1:17:15<8:37:39, 6.16s/it] {'loss': 0.4991, 'learning_rate': 1.9513681871191063e-05, 'epoch': 0.13} + 13%|█▎ | 732/5772 [1:17:08<8:37:38, 6.16s/it] 13%|█▎ | 733/5772 [1:17:14<8:31:25, 6.09s/it] 13%|█▎ | 733/5772 [1:17:21<8:31:26, 6.09s/it] {'loss': 0.492, 'learning_rate': 1.95119515649226e-05, 'epoch': 0.13} + 13%|█▎ | 733/5772 [1:17:21<8:31:26, 6.09s/it] {'loss': 0.492, 'learning_rate': 1.95119515649226e-05, 'epoch': 0.13} + 13%|█▎ | 733/5772 [1:17:14<8:31:25, 6.09s/it] 13%|█▎ | 734/5772 [1:17:20<8:35:41, 6.14s/it] 13%|█▎ | 734/5772 [1:17:27<8:35:41, 6.14s/it] {'loss': 0.4919, 'learning_rate': 1.9510218262917883e-05, 'epoch': 0.13} + 13%|█▎ | 734/5772 [1:17:27<8:35:41, 6.14s/it] {'loss': 0.4919, 'learning_rate': 1.9510218262917883e-05, 'epoch': 0.13} + 13%|█▎ | 734/5772 [1:17:20<8:35:41, 6.14s/it] 13%|█▎ | 735/5772 [1:17:27<8:40:49, 6.20s/it] 13%|█▎ | 735/5772 [1:17:34<8:40:49, 6.20s/it] {'loss': 0.4863, 'learning_rate': 1.9508481965722798e-05, 'epoch': 0.13} + 13%|█▎ | 735/5772 [1:17:34<8:40:49, 6.20s/it] {'loss': 0.4863, 'learning_rate': 1.9508481965722798e-05, 'epoch': 0.13} + 13%|█▎ | 735/5772 [1:17:27<8:40:49, 6.20s/it] 13%|█▎ | 736/5772 [1:17:33<8:33:16, 6.12s/it] 13%|█▎ | 736/5772 [1:17:40<8:33:16, 6.12s/it] {'loss': 0.5002, 'learning_rate': 1.9506742673884186e-05, 'epoch': 0.13} + 13%|█▎ | 736/5772 [1:17:40<8:33:16, 6.12s/it] {'loss': 0.5002, 'learning_rate': 1.9506742673884186e-05, 'epoch': 0.13} + 13%|█▎ | 736/5772 [1:17:33<8:33:16, 6.12s/it] 13%|█▎ | 737/5772 [1:17:39<8:34:34, 6.13s/it] 13%|█▎ | 737/5772 [1:17:46<8:34:34, 6.13s/it] {'loss': 0.4839, 'learning_rate': 1.9505000387949825e-05, 'epoch': 0.13} + 13%|█▎ | 737/5772 [1:17:46<8:34:34, 6.13s/it] {'loss': 0.4839, 'learning_rate': 1.9505000387949825e-05, 'epoch': 0.13} + 13%|█▎ | 737/5772 [1:17:39<8:34:34, 6.13s/it] 13%|█▎ | 738/5772 [1:17:45<8:36:30, 6.16s/it] 13%|█▎ | 738/5772 [1:17:52<8:36:30, 6.16s/it] {'loss': 0.4935, 'learning_rate': 1.950325510846844e-05, 'epoch': 0.13} + 13%|█▎ | 738/5772 [1:17:52<8:36:30, 6.16s/it] {'loss': 0.4935, 'learning_rate': 1.950325510846844e-05, 'epoch': 0.13} + 13%|█▎ | 738/5772 [1:17:45<8:36:30, 6.16s/it] 13%|█▎ | 739/5772 [1:17:51<8:37:03, 6.16s/it] 13%|█▎ | 739/5772 [1:17:58<8:37:03, 6.16s/it] {'loss': 0.4995, 'learning_rate': 1.95015068359897e-05, 'epoch': 0.13} + 13%|█▎ | 739/5772 [1:17:58<8:37:03, 6.16s/it] {'loss': 0.4995, 'learning_rate': 1.95015068359897e-05, 'epoch': 0.13} + 13%|█▎ | 739/5772 [1:17:51<8:37:03, 6.16s/it] 13%|█▎ | 740/5772 [1:17:57<8:32:21, 6.11s/it] 13%|█▎ | 740/5772 [1:18:04<8:32:21, 6.11s/it] {'loss': 0.4957, 'learning_rate': 1.949975557106421e-05, 'epoch': 0.13} + 13%|█▎ | 740/5772 [1:18:04<8:32:21, 6.11s/it] {'loss': 0.4957, 'learning_rate': 1.949975557106421e-05, 'epoch': 0.13} + 13%|█▎ | 740/5772 [1:17:57<8:32:21, 6.11s/it] 13%|█▎ | 741/5772 [1:18:03<8:38:16, 6.18s/it] 13%|█▎ | 741/5772 [1:18:10<8:38:18, 6.18s/it] {'loss': 0.4914, 'learning_rate': 1.949800131424352e-05, 'epoch': 0.13} + 13%|█▎ | 741/5772 [1:18:10<8:38:18, 6.18s/it] {'loss': 0.4914, 'learning_rate': 1.949800131424352e-05, 'epoch': 0.13} + 13%|█▎ | 741/5772 [1:18:03<8:38:16, 6.18s/it] 13%|█▎ | 742/5772 [1:18:10<8:37:49, 6.18s/it] 13%|█▎ | 742/5772 [1:18:17<8:37:49, 6.18s/it] {'loss': 0.4897, 'learning_rate': 1.9496244066080122e-05, 'epoch': 0.13} + 13%|█▎ | 742/5772 [1:18:17<8:37:49, 6.18s/it] {'loss': 0.4897, 'learning_rate': 1.9496244066080122e-05, 'epoch': 0.13} + 13%|█▎ | 742/5772 [1:18:10<8:37:49, 6.18s/it] 13%|█▎ | 743/5772 [1:18:23<8:40:52, 6.21s/it] 13%|█▎ | 743/5772 [1:18:16<8:40:53, 6.21s/it] {'loss': 0.501, 'learning_rate': 1.949448382712746e-05, 'epoch': 0.13} + 13%|█▎ | 743/5772 [1:18:23<8:40:52, 6.21s/it] {'loss': 0.501, 'learning_rate': 1.949448382712746e-05, 'epoch': 0.13} + 13%|█▎ | 743/5772 [1:18:16<8:40:53, 6.21s/it] 13%|█▎ | 744/5772 [1:18:22<8:35:13, 6.15s/it] 13%|█▎ | 744/5772 [1:18:29<8:35:13, 6.15s/it] {'loss': 0.4924, 'learning_rate': 1.9492720597939902e-05, 'epoch': 0.13} + 13%|█▎ | 744/5772 [1:18:29<8:35:13, 6.15s/it] {'loss': 0.4924, 'learning_rate': 1.9492720597939902e-05, 'epoch': 0.13} + 13%|█▎ | 744/5772 [1:18:22<8:35:13, 6.15s/it] 13%|█▎ | 745/5772 [1:18:28<8:28:44, 6.07s/it] 13%|█▎ | 745/5772 [1:18:35<8:28:44, 6.07s/it] {'loss': 0.4974, 'learning_rate': 1.9490954379072775e-05, 'epoch': 0.13} + 13%|█▎ | 745/5772 [1:18:35<8:28:44, 6.07s/it] {'loss': 0.4974, 'learning_rate': 1.9490954379072775e-05, 'epoch': 0.13} + 13%|█▎ | 745/5772 [1:18:28<8:28:44, 6.07s/it] 13%|█▎ | 746/5772 [1:18:34<8:29:26, 6.08s/it] 13%|█▎ | 746/5772 [1:18:41<8:29:25, 6.08s/it] {'loss': 0.4968, 'learning_rate': 1.9489185171082334e-05, 'epoch': 0.13} + 13%|█▎ | 746/5772 [1:18:41<8:29:25, 6.08s/it] {'loss': 0.4968, 'learning_rate': 1.9489185171082334e-05, 'epoch': 0.13} + 13%|█▎ | 746/5772 [1:18:34<8:29:26, 6.08s/it] 13%|█▎ | 747/5772 [1:18:40<8:24:22, 6.02s/it] 13%|█▎ | 747/5772 [1:18:47<8:24:22, 6.02s/it] {'loss': 0.4888, 'learning_rate': 1.9487412974525784e-05, 'epoch': 0.13} + 13%|█▎ | 747/5772 [1:18:47<8:24:22, 6.02s/it] {'loss': 0.4888, 'learning_rate': 1.9487412974525784e-05, 'epoch': 0.13} + 13%|█▎ | 747/5772 [1:18:40<8:24:22, 6.02s/it] 13%|█▎ | 748/5772 [1:18:46<8:21:26, 5.99s/it] 13%|█▎ | 748/5772 [1:18:53<8:21:26, 5.99s/it] {'loss': 0.4854, 'learning_rate': 1.948563778996127e-05, 'epoch': 0.13} + 13%|█▎ | 748/5772 [1:18:53<8:21:26, 5.99s/it] {'loss': 0.4854, 'learning_rate': 1.948563778996127e-05, 'epoch': 0.13} + 13%|█▎ | 748/5772 [1:18:46<8:21:26, 5.99s/it] 13%|█▎ | 749/5772 [1:18:52<8:28:04, 6.07s/it] 13%|█▎ | 749/5772 [1:18:59<8:28:03, 6.07s/it] {'loss': 0.5014, 'learning_rate': 1.948385961794787e-05, 'epoch': 0.13} + 13%|█▎ | 749/5772 [1:18:59<8:28:03, 6.07s/it] {'loss': 0.5014, 'learning_rate': 1.948385961794787e-05, 'epoch': 0.13} + 13%|█▎ | 749/5772 [1:18:52<8:28:04, 6.07s/it]4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + 11 AutoResumeHook: Checking whether to suspend... +13 15AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend...3 + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 13%|█▎ | 750/5772 [1:18:58<8:22:01, 6.00s/it]8 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 13%|█▎ | 750/5772 [1:19:05<8:22:01, 6.00s/it]10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4971, 'learning_rate': 1.9482078459045617e-05, 'epoch': 0.13} + 13%|█▎ | 750/5772 [1:19:05<8:22:01, 6.00s/it] {'loss': 0.4971, 'learning_rate': 1.9482078459045617e-05, 'epoch': 0.13} + 13%|█▎ | 750/5772 [1:18:58<8:22:01, 6.00s/it] 13%|█▎ | 751/5772 [1:19:11<8:22:50, 6.01s/it] 13%|█▎ | 751/5772 [1:19:04<8:22:52, 6.01s/it] {'loss': 0.4993, 'learning_rate': 1.9480294313815472e-05, 'epoch': 0.13} + 13%|█▎ | 751/5772 [1:19:11<8:22:50, 6.01s/it] {'loss': 0.4993, 'learning_rate': 1.9480294313815472e-05, 'epoch': 0.13} + 13%|█▎ | 751/5772 [1:19:04<8:22:52, 6.01s/it] 13%|█▎ | 752/5772 [1:19:10<8:20:45, 5.99s/it] 13%|█▎ | 752/5772 [1:19:17<8:20:46, 5.99s/it] {'loss': 0.4901, 'learning_rate': 1.9478507182819345e-05, 'epoch': 0.13} + 13%|█▎ | 752/5772 [1:19:17<8:20:46, 5.99s/it] {'loss': 0.4901, 'learning_rate': 1.9478507182819345e-05, 'epoch': 0.13} + 13%|█▎ | 752/5772 [1:19:10<8:20:45, 5.99s/it] 13%|█▎ | 753/5772 [1:19:16<8:30:13, 6.10s/it] 13%|█▎ | 753/5772 [1:19:23<8:30:14, 6.10s/it] {'loss': 0.5011, 'learning_rate': 1.9476717066620082e-05, 'epoch': 0.13} + 13%|█▎ | 753/5772 [1:19:23<8:30:14, 6.10s/it] {'loss': 0.5011, 'learning_rate': 1.9476717066620082e-05, 'epoch': 0.13} + 13%|█▎ | 753/5772 [1:19:16<8:30:13, 6.10s/it] 13%|█▎ | 754/5772 [1:19:22<8:32:35, 6.13s/it] 13%|█▎ | 754/5772 [1:19:29<8:32:35, 6.13s/it] {'loss': 0.5017, 'learning_rate': 1.947492396578147e-05, 'epoch': 0.13} + 13%|█▎ | 754/5772 [1:19:29<8:32:35, 6.13s/it] {'loss': 0.5017, 'learning_rate': 1.947492396578147e-05, 'epoch': 0.13} + 13%|█▎ | 754/5772 [1:19:22<8:32:35, 6.13s/it] 13%|█▎ | 755/5772 [1:19:29<8:40:58, 6.23s/it] 13%|█▎ | 755/5772 [1:19:36<8:40:59, 6.23s/it] {'loss': 0.5016, 'learning_rate': 1.9473127880868233e-05, 'epoch': 0.13} + 13%|█▎ | 755/5772 [1:19:36<8:40:59, 6.23s/it] {'loss': 0.5016, 'learning_rate': 1.9473127880868233e-05, 'epoch': 0.13} + 13%|█▎ | 755/5772 [1:19:29<8:40:58, 6.23s/it] 13%|█▎ | 756/5772 [1:19:35<8:39:53, 6.22s/it] 13%|█▎ | 756/5772 [1:19:42<8:39:53, 6.22s/it] {'loss': 0.4976, 'learning_rate': 1.9471328812446045e-05, 'epoch': 0.13} + 13%|█▎ | 756/5772 [1:19:42<8:39:53, 6.22s/it] {'loss': 0.4976, 'learning_rate': 1.9471328812446045e-05, 'epoch': 0.13} + 13%|█▎ | 756/5772 [1:19:35<8:39:53, 6.22s/it] 13%|█▎ | 757/5772 [1:19:41<8:43:02, 6.26s/it] 13%|█▎ | 757/5772 [1:19:48<8:43:02, 6.26s/it] {'loss': 0.4896, 'learning_rate': 1.9469526761081504e-05, 'epoch': 0.13} + 13%|█▎ | 757/5772 [1:19:48<8:43:02, 6.26s/it] {'loss': 0.4896, 'learning_rate': 1.9469526761081504e-05, 'epoch': 0.13} + 13%|█▎ | 757/5772 [1:19:41<8:43:02, 6.26s/it] 13%|█▎ | 758/5772 [1:19:48<8:42:33, 6.25s/it] 13%|█▎ | 758/5772 [1:19:55<8:42:33, 6.25s/it] {'loss': 0.4892, 'learning_rate': 1.946772172734216e-05, 'epoch': 0.13} + 13%|█▎ | 758/5772 [1:19:55<8:42:33, 6.25s/it] {'loss': 0.4892, 'learning_rate': 1.946772172734216e-05, 'epoch': 0.13} + 13%|█▎ | 758/5772 [1:19:48<8:42:33, 6.25s/it] 13%|█▎ | 759/5772 [1:20:00<8:33:40, 6.15s/it] 13%|█▎ | 759/5772 [1:19:53<8:33:40, 6.15s/it] {'loss': 0.5003, 'learning_rate': 1.9465913711796502e-05, 'epoch': 0.13} + 13%|█▎ | 759/5772 [1:20:00<8:33:40, 6.15s/it] {'loss': 0.5003, 'learning_rate': 1.9465913711796502e-05, 'epoch': 0.13} + 13%|█▎ | 759/5772 [1:19:53<8:33:40, 6.15s/it] 13%|█▎ | 760/5772 [1:20:00<8:41:32, 6.24s/it] 13%|█▎ | 760/5772 [1:20:07<8:41:32, 6.24s/it] {'loss': 0.4832, 'learning_rate': 1.946410271501395e-05, 'epoch': 0.13} + 13%|█▎ | 760/5772 [1:20:07<8:41:32, 6.24s/it] {'loss': 0.4832, 'learning_rate': 1.946410271501395e-05, 'epoch': 0.13} + 13%|█▎ | 760/5772 [1:20:00<8:41:32, 6.24s/it] 13%|█▎ | 761/5772 [1:20:06<8:39:03, 6.22s/it] 13%|█▎ | 761/5772 [1:20:13<8:39:03, 6.22s/it] {'loss': 0.5087, 'learning_rate': 1.946228873756487e-05, 'epoch': 0.13} + 13%|█▎ | 761/5772 [1:20:13<8:39:03, 6.22s/it] {'loss': 0.5087, 'learning_rate': 1.946228873756487e-05, 'epoch': 0.13} + 13%|█▎ | 761/5772 [1:20:06<8:39:03, 6.22s/it] 13%|█▎ | 762/5772 [1:20:12<8:33:15, 6.15s/it] 13%|█▎ | 762/5772 [1:20:19<8:33:15, 6.15s/it] {'loss': 0.4891, 'learning_rate': 1.946047178002056e-05, 'epoch': 0.13} + 13%|█▎ | 762/5772 [1:20:19<8:33:15, 6.15s/it] {'loss': 0.4891, 'learning_rate': 1.946047178002056e-05, 'epoch': 0.13} + 13%|█▎ | 762/5772 [1:20:12<8:33:15, 6.15s/it] 13%|█▎ | 763/5772 [1:20:25<8:32:04, 6.13s/it] 13%|█▎ | 763/5772 [1:20:18<8:32:05, 6.13s/it] {'loss': 0.5023, 'learning_rate': 1.9458651842953264e-05, 'epoch': 0.13} + 13%|█▎ | 763/5772 [1:20:25<8:32:04, 6.13s/it] {'loss': 0.5023, 'learning_rate': 1.9458651842953264e-05, 'epoch': 0.13} + 13%|█▎ | 763/5772 [1:20:18<8:32:05, 6.13s/it] 13%|█▎ | 764/5772 [1:20:31<8:32:10, 6.14s/it] 13%|█▎ | 764/5772 [1:20:24<8:32:10, 6.14s/it] {'loss': 0.4886, 'learning_rate': 1.945682892693616e-05, 'epoch': 0.13} + 13%|█▎ | 764/5772 [1:20:31<8:32:10, 6.14s/it] {'loss': 0.4886, 'learning_rate': 1.945682892693616e-05, 'epoch': 0.13} + 13%|█▎ | 764/5772 [1:20:24<8:32:10, 6.14s/it] 13%|█▎ | 765/5772 [1:20:31<8:35:06, 6.17s/it] 13%|█▎ | 765/5772 [1:20:38<8:35:06, 6.17s/it] {'loss': 0.5015, 'learning_rate': 1.9455003032543366e-05, 'epoch': 0.13} + 13%|█▎ | 765/5772 [1:20:38<8:35:06, 6.17s/it] {'loss': 0.5015, 'learning_rate': 1.9455003032543366e-05, 'epoch': 0.13} + 13%|█▎ | 765/5772 [1:20:31<8:35:06, 6.17s/it] 13%|█▎ | 766/5772 [1:20:37<8:39:51, 6.23s/it] 13%|█▎ | 766/5772 [1:20:44<8:39:52, 6.23s/it] {'loss': 0.4845, 'learning_rate': 1.9453174160349938e-05, 'epoch': 0.13} + 13%|█▎ | 766/5772 [1:20:44<8:39:52, 6.23s/it] {'loss': 0.4845, 'learning_rate': 1.9453174160349938e-05, 'epoch': 0.13} + 13%|█▎ | 766/5772 [1:20:37<8:39:51, 6.23s/it] 13%|█▎ | 767/5772 [1:20:43<8:40:14, 6.24s/it] 13%|█▎ | 767/5772 [1:20:50<8:40:14, 6.24s/it] {'loss': 0.4935, 'learning_rate': 1.9451342310931866e-05, 'epoch': 0.13} + 13%|█▎ | 767/5772 [1:20:50<8:40:14, 6.24s/it] {'loss': 0.4935, 'learning_rate': 1.9451342310931866e-05, 'epoch': 0.13} + 13%|█▎ | 767/5772 [1:20:43<8:40:14, 6.24s/it] 13%|█▎ | 768/5772 [1:20:49<8:40:25, 6.24s/it] 13%|█▎ | 768/5772 [1:20:56<8:40:25, 6.24s/it] {'loss': 0.4867, 'learning_rate': 1.9449507484866084e-05, 'epoch': 0.13} + 13%|█▎ | 768/5772 [1:20:56<8:40:25, 6.24s/it] {'loss': 0.4867, 'learning_rate': 1.9449507484866084e-05, 'epoch': 0.13} + 13%|█▎ | 768/5772 [1:20:49<8:40:25, 6.24s/it] 13%|█▎ | 769/5772 [1:20:56<8:52:22, 6.38s/it] 13%|█▎ | 769/5772 [1:21:03<8:52:22, 6.38s/it] {'loss': 0.4937, 'learning_rate': 1.944766968273046e-05, 'epoch': 0.13} + 13%|█▎ | 769/5772 [1:21:03<8:52:22, 6.38s/it] {'loss': 0.4937, 'learning_rate': 1.944766968273046e-05, 'epoch': 0.13} + 13%|█▎ | 769/5772 [1:20:56<8:52:22, 6.38s/it] 13%|█▎ | 770/5772 [1:21:09<8:41:31, 6.26s/it] 13%|█▎ | 770/5772 [1:21:02<8:41:31, 6.26s/it] {'loss': 0.4944, 'learning_rate': 1.9445828905103797e-05, 'epoch': 0.13} + 13%|█▎ | 770/5772 [1:21:09<8:41:31, 6.26s/it] {'loss': 0.4944, 'learning_rate': 1.9445828905103797e-05, 'epoch': 0.13} + 13%|█▎ | 770/5772 [1:21:02<8:41:31, 6.26s/it] 13%|█▎ | 771/5772 [1:21:15<8:37:45, 6.21s/it] 13%|█▎ | 771/5772 [1:21:08<8:37:46, 6.21s/it] {'loss': 0.4985, 'learning_rate': 1.944398515256584e-05, 'epoch': 0.13} + 13%|█▎ | 771/5772 [1:21:15<8:37:45, 6.21s/it] {'loss': 0.4985, 'learning_rate': 1.944398515256584e-05, 'epoch': 0.13} + 13%|█▎ | 771/5772 [1:21:08<8:37:46, 6.21s/it] 13%|█▎ | 772/5772 [1:21:14<8:33:51, 6.17s/it] 13%|█▎ | 772/5772 [1:21:21<8:33:51, 6.17s/it] {'loss': 0.4973, 'learning_rate': 1.944213842569727e-05, 'epoch': 0.13} + 13%|█▎ | 772/5772 [1:21:21<8:33:51, 6.17s/it] {'loss': 0.4973, 'learning_rate': 1.944213842569727e-05, 'epoch': 0.13} + 13%|█▎ | 772/5772 [1:21:14<8:33:51, 6.17s/it] 13%|█▎ | 773/5772 [1:21:20<8:32:58, 6.16s/it] 13%|█▎ | 773/5772 [1:21:27<8:32:58, 6.16s/it] {'loss': 0.4921, 'learning_rate': 1.94402887250797e-05, 'epoch': 0.13} + 13%|█▎ | 773/5772 [1:21:27<8:32:58, 6.16s/it] {'loss': 0.4921, 'learning_rate': 1.94402887250797e-05, 'epoch': 0.13} + 13%|█▎ | 773/5772 [1:21:20<8:32:58, 6.16s/it] 13%|█▎ | 774/5772 [1:21:33<8:26:03, 6.08s/it] 13%|█▎ | 774/5772 [1:21:26<8:26:03, 6.08s/it] {'loss': 0.493, 'learning_rate': 1.943843605129568e-05, 'epoch': 0.13} + 13%|█▎ | 774/5772 [1:21:33<8:26:03, 6.08s/it] {'loss': 0.493, 'learning_rate': 1.943843605129568e-05, 'epoch': 0.13} + 13%|█▎ | 774/5772 [1:21:26<8:26:03, 6.08s/it] 13%|█▎ | 775/5772 [1:21:33<8:30:18, 6.13s/it] 13%|█▎ | 775/5772 [1:21:40<8:30:18, 6.13s/it] {'loss': 0.4867, 'learning_rate': 1.943658040492871e-05, 'epoch': 0.13} + 13%|█▎ | 775/5772 [1:21:40<8:30:18, 6.13s/it] {'loss': 0.4867, 'learning_rate': 1.943658040492871e-05, 'epoch': 0.13} + 13%|█▎ | 775/5772 [1:21:33<8:30:18, 6.13s/it] 13%|█▎ | 776/5772 [1:21:38<8:21:59, 6.03s/it] 13%|█▎ | 776/5772 [1:21:45<8:21:59, 6.03s/it] {'loss': 0.4912, 'learning_rate': 1.9434721786563204e-05, 'epoch': 0.13} + 13%|█▎ | 776/5772 [1:21:45<8:21:59, 6.03s/it] {'loss': 0.4912, 'learning_rate': 1.9434721786563204e-05, 'epoch': 0.13} + 13%|█▎ | 776/5772 [1:21:38<8:21:59, 6.03s/it] 13%|█▎ | 777/5772 [1:21:44<8:22:26, 6.04s/it] 13%|█▎ | 777/5772 [1:21:51<8:22:26, 6.04s/it] {'loss': 0.4909, 'learning_rate': 1.9432860196784533e-05, 'epoch': 0.13} + 13%|█▎ | 777/5772 [1:21:51<8:22:26, 6.04s/it] {'loss': 0.4909, 'learning_rate': 1.9432860196784533e-05, 'epoch': 0.13} + 13%|█▎ | 777/5772 [1:21:44<8:22:26, 6.04s/it] 13%|█▎ | 778/5772 [1:21:50<8:23:09, 6.05s/it] 13%|█▎ | 778/5772 [1:21:57<8:23:09, 6.05s/it] {'loss': 0.4876, 'learning_rate': 1.9430995636178986e-05, 'epoch': 0.13} + 13%|█▎ | 778/5772 [1:21:57<8:23:09, 6.05s/it] {'loss': 0.4876, 'learning_rate': 1.9430995636178986e-05, 'epoch': 0.13} + 13%|█▎ | 778/5772 [1:21:50<8:23:09, 6.05s/it] 13%|█▎ | 779/5772 [1:22:03<8:18:26, 5.99s/it] 13%|█▎ | 779/5772 [1:21:56<8:18:27, 5.99s/it] {'loss': 0.4934, 'learning_rate': 1.9429128105333802e-05, 'epoch': 0.13} + 13%|█▎ | 779/5772 [1:22:03<8:18:26, 5.99s/it] {'loss': 0.4934, 'learning_rate': 1.9429128105333802e-05, 'epoch': 0.13} + 13%|█▎ | 779/5772 [1:21:56<8:18:27, 5.99s/it] 14%|█▎ | 780/5772 [1:22:02<8:23:18, 6.05s/it] 14%|█▎ | 780/5772 [1:22:10<8:23:18, 6.05s/it] {'loss': 0.4827, 'learning_rate': 1.9427257604837146e-05, 'epoch': 0.14} + 14%|█▎ | 780/5772 [1:22:10<8:23:18, 6.05s/it] {'loss': 0.4827, 'learning_rate': 1.9427257604837146e-05, 'epoch': 0.14} + 14%|█▎ | 780/5772 [1:22:02<8:23:18, 6.05s/it] 14%|█▎ | 781/5772 [1:22:09<8:30:44, 6.14s/it] 14%|█▎ | 781/5772 [1:22:16<8:30:45, 6.14s/it] {'loss': 0.5015, 'learning_rate': 1.9425384135278126e-05, 'epoch': 0.14} + 14%|█▎ | 781/5772 [1:22:16<8:30:45, 6.14s/it] {'loss': 0.5015, 'learning_rate': 1.9425384135278126e-05, 'epoch': 0.14} + 14%|█▎ | 781/5772 [1:22:09<8:30:44, 6.14s/it] 14%|█▎ | 782/5772 [1:22:15<8:28:21, 6.11s/it] 14%|█▎ | 782/5772 [1:22:22<8:28:21, 6.11s/it] {'loss': 0.4936, 'learning_rate': 1.942350769724678e-05, 'epoch': 0.14} + 14%|█▎ | 782/5772 [1:22:22<8:28:21, 6.11s/it] {'loss': 0.4936, 'learning_rate': 1.942350769724678e-05, 'epoch': 0.14} + 14%|█▎ | 782/5772 [1:22:15<8:28:21, 6.11s/it] 14%|█▎ | 783/5772 [1:22:21<8:30:08, 6.14s/it] 14%|█▎ | 783/5772 [1:22:28<8:30:08, 6.14s/it] {'loss': 0.4904, 'learning_rate': 1.9421628291334072e-05, 'epoch': 0.14} + 14%|█▎ | 783/5772 [1:22:28<8:30:08, 6.14s/it] {'loss': 0.4904, 'learning_rate': 1.9421628291334072e-05, 'epoch': 0.14} + 14%|█▎ | 783/5772 [1:22:21<8:30:08, 6.14s/it] 14%|█▎ | 784/5772 [1:22:27<8:24:31, 6.07s/it] 14%|█▎ | 784/5772 [1:22:34<8:24:31, 6.07s/it] {'loss': 0.4869, 'learning_rate': 1.941974591813192e-05, 'epoch': 0.14} + 14%|█▎ | 784/5772 [1:22:34<8:24:31, 6.07s/it] {'loss': 0.4869, 'learning_rate': 1.941974591813192e-05, 'epoch': 0.14} + 14%|█▎ | 784/5772 [1:22:27<8:24:31, 6.07s/it] 14%|█▎ | 785/5772 [1:22:33<8:27:06, 6.10s/it] 14%|█▎ | 785/5772 [1:22:40<8:27:07, 6.10s/it] {'loss': 0.4918, 'learning_rate': 1.941786057823317e-05, 'epoch': 0.14} + 14%|█▎ | 785/5772 [1:22:40<8:27:07, 6.10s/it] {'loss': 0.4918, 'learning_rate': 1.941786057823317e-05, 'epoch': 0.14} + 14%|█▎ | 785/5772 [1:22:33<8:27:06, 6.10s/it] 14%|█▎ | 786/5772 [1:22:39<8:27:12, 6.10s/it] 14%|█▎ | 786/5772 [1:22:46<8:27:11, 6.10s/it] {'loss': 0.5043, 'learning_rate': 1.941597227223159e-05, 'epoch': 0.14} + 14%|█▎ | 786/5772 [1:22:46<8:27:11, 6.10s/it] {'loss': 0.5043, 'learning_rate': 1.941597227223159e-05, 'epoch': 0.14} + 14%|█▎ | 786/5772 [1:22:39<8:27:12, 6.10s/it] 14%|█▎ | 787/5772 [1:22:46<8:35:42, 6.21s/it] 14%|█▎ | 787/5772 [1:22:53<8:35:42, 6.21s/it] {'loss': 0.5034, 'learning_rate': 1.9414081000721898e-05, 'epoch': 0.14} + 14%|█▎ | 787/5772 [1:22:53<8:35:42, 6.21s/it] {'loss': 0.5034, 'learning_rate': 1.9414081000721898e-05, 'epoch': 0.14} + 14%|█▎ | 787/5772 [1:22:46<8:35:42, 6.21s/it] 14%|█▎ | 788/5772 [1:22:52<8:31:10, 6.15s/it] 14%|█▎ | 788/5772 [1:22:59<8:31:10, 6.15s/it] {'loss': 0.4977, 'learning_rate': 1.9412186764299738e-05, 'epoch': 0.14} + 14%|█▎ | 788/5772 [1:22:59<8:31:10, 6.15s/it] {'loss': 0.4977, 'learning_rate': 1.9412186764299738e-05, 'epoch': 0.14} + 14%|█▎ | 788/5772 [1:22:52<8:31:10, 6.15s/it] 14%|█▎ | 789/5772 [1:22:58<8:36:13, 6.22s/it] 14%|█▎ | 789/5772 [1:23:05<8:36:13, 6.22s/it] {'loss': 0.5031, 'learning_rate': 1.9410289563561685e-05, 'epoch': 0.14} + 14%|█▎ | 789/5772 [1:23:05<8:36:13, 6.22s/it] {'loss': 0.5031, 'learning_rate': 1.9410289563561685e-05, 'epoch': 0.14} + 14%|█▎ | 789/5772 [1:22:58<8:36:13, 6.22s/it] 14%|█▎ | 790/5772 [1:23:04<8:36:30, 6.22s/it] 14%|█▎ | 790/5772 [1:23:11<8:36:30, 6.22s/it] {'loss': 0.4781, 'learning_rate': 1.9408389399105257e-05, 'epoch': 0.14} + 14%|█▎ | 790/5772 [1:23:11<8:36:30, 6.22s/it] {'loss': 0.4781, 'learning_rate': 1.9408389399105257e-05, 'epoch': 0.14} + 14%|█▎ | 790/5772 [1:23:04<8:36:30, 6.22s/it] 14%|█▎ | 791/5772 [1:23:11<8:41:29, 6.28s/it] 14%|█▎ | 791/5772 [1:23:18<8:41:29, 6.28s/it] {'loss': 0.5038, 'learning_rate': 1.9406486271528896e-05, 'epoch': 0.14} + 14%|█▎ | 791/5772 [1:23:18<8:41:29, 6.28s/it] {'loss': 0.5038, 'learning_rate': 1.9406486271528896e-05, 'epoch': 0.14} + 14%|█▎ | 791/5772 [1:23:11<8:41:29, 6.28s/it] 14%|█▎ | 792/5772 [1:23:24<8:38:23, 6.25s/it] 14%|█▎ | 792/5772 [1:23:17<8:38:23, 6.25s/it] {'loss': 0.4921, 'learning_rate': 1.940458018143199e-05, 'epoch': 0.14} + 14%|█▎ | 792/5772 [1:23:24<8:38:23, 6.25s/it] {'loss': 0.4921, 'learning_rate': 1.940458018143199e-05, 'epoch': 0.14} + 14%|█▎ | 792/5772 [1:23:17<8:38:23, 6.25s/it] 14%|█▎ | 793/5772 [1:23:30<8:25:47, 6.10s/it] 14%|█▎ | 793/5772 [1:23:23<8:25:48, 6.10s/it] {'loss': 0.4901, 'learning_rate': 1.9402671129414844e-05, 'epoch': 0.14} + 14%|█▎ | 793/5772 [1:23:30<8:25:47, 6.10s/it] {'loss': 0.4901, 'learning_rate': 1.9402671129414844e-05, 'epoch': 0.14} + 14%|█▎ | 793/5772 [1:23:23<8:25:48, 6.10s/it] 14%|█▍ | 794/5772 [1:23:36<8:23:03, 6.06s/it] 14%|█▍ | 794/5772 [1:23:29<8:23:03, 6.06s/it] {'loss': 0.4883, 'learning_rate': 1.9400759116078703e-05, 'epoch': 0.14} + 14%|█▍ | 794/5772 [1:23:36<8:23:03, 6.06s/it] {'loss': 0.4883, 'learning_rate': 1.9400759116078703e-05, 'epoch': 0.14} + 14%|█▍ | 794/5772 [1:23:29<8:23:03, 6.06s/it] 14%|█▍ | 795/5772 [1:23:42<8:21:54, 6.05s/it] 14%|█▍ | 795/5772 [1:23:35<8:21:54, 6.05s/it] {'loss': 0.4904, 'learning_rate': 1.9398844142025746e-05, 'epoch': 0.14} + 14%|█▍ | 795/5772 [1:23:42<8:21:54, 6.05s/it] {'loss': 0.4904, 'learning_rate': 1.9398844142025746e-05, 'epoch': 0.14} + 14%|█▍ | 795/5772 [1:23:35<8:21:54, 6.05s/it] 14%|█▍ | 796/5772 [1:23:48<8:23:26, 6.07s/it] 14%|█▍ | 796/5772 [1:23:41<8:23:26, 6.07s/it] {'loss': 0.4899, 'learning_rate': 1.9396926207859085e-05, 'epoch': 0.14} + 14%|█▍ | 796/5772 [1:23:48<8:23:26, 6.07s/it] {'loss': 0.4899, 'learning_rate': 1.9396926207859085e-05, 'epoch': 0.14} + 14%|█▍ | 796/5772 [1:23:41<8:23:26, 6.07s/it] 14%|█▍ | 797/5772 [1:23:54<8:35:14, 6.21s/it] 14%|█▍ | 797/5772 [1:23:47<8:35:15, 6.21s/it] {'loss': 0.5099, 'learning_rate': 1.9395005314182765e-05, 'epoch': 0.14} + 14%|█▍ | 797/5772 [1:23:54<8:35:14, 6.21s/it] {'loss': 0.5099, 'learning_rate': 1.9395005314182765e-05, 'epoch': 0.14} + 14%|█▍ | 797/5772 [1:23:47<8:35:15, 6.21s/it] 14%|█▍ | 798/5772 [1:24:00<8:24:25, 6.08s/it] 14%|█▍ | 798/5772 [1:23:53<8:24:25, 6.08s/it] {'loss': 0.4862, 'learning_rate': 1.9393081461601752e-05, 'epoch': 0.14} + 14%|█▍ | 798/5772 [1:24:00<8:24:25, 6.08s/it] {'loss': 0.4862, 'learning_rate': 1.9393081461601752e-05, 'epoch': 0.14} + 14%|█▍ | 798/5772 [1:23:53<8:24:25, 6.08s/it] 14%|█▍ | 799/5772 [1:24:06<8:17:58, 6.01s/it] 14%|█▍ | 799/5772 [1:23:59<8:17:58, 6.01s/it] {'loss': 0.4944, 'learning_rate': 1.939115465072196e-05, 'epoch': 0.14} + 14%|█▍ | 799/5772 [1:24:06<8:17:58, 6.01s/it] {'loss': 0.4944, 'learning_rate': 1.939115465072196e-05, 'epoch': 0.14} + 14%|█▍ | 799/5772 [1:23:59<8:17:58, 6.01s/it]5 AutoResumeHook: Checking whether to suspend...4 + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +3AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend... + 14%|█▍ | 800/5772 [1:24:12<8:20:53, 6.04s/it] 14%|█▍ | 800/5772 [1:24:05<8:20:53, 6.04s/it]9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4799, 'learning_rate': 1.938922488215023e-05, 'epoch': 0.14} + 14%|█▍ | 800/5772 [1:24:12<8:20:53, 6.04s/it] {'loss': 0.4799, 'learning_rate': 1.938922488215023e-05, 'epoch': 0.14} + 14%|█▍ | 800/5772 [1:24:05<8:20:53, 6.04s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 14%|█▍ | 801/5772 [1:24:26<14:28:31, 10.48s/it] 14%|█▍ | 801/5772 [1:24:33<14:28:31, 10.48s/it] {'loss': 0.4979, 'learning_rate': 1.9387292156494326e-05, 'epoch': 0.14} + 14%|█▍ | 801/5772 [1:24:33<14:28:31, 10.48s/it] {'loss': 0.4979, 'learning_rate': 1.9387292156494326e-05, 'epoch': 0.14} + 14%|█▍ | 801/5772 [1:24:26<14:28:31, 10.48s/it] 14%|█▍ | 802/5772 [1:24:32<12:48:05, 9.27s/it] 14%|█▍ | 802/5772 [1:24:39<12:48:06, 9.27s/it] {'loss': 0.4922, 'learning_rate': 1.938535647436295e-05, 'epoch': 0.14} + 14%|█▍ | 802/5772 [1:24:39<12:48:06, 9.27s/it] {'loss': 0.4922, 'learning_rate': 1.938535647436295e-05, 'epoch': 0.14} + 14%|█▍ | 802/5772 [1:24:32<12:48:05, 9.27s/it] 14%|█▍ | 803/5772 [1:24:38<11:23:07, 8.25s/it] 14%|█▍ | 803/5772 [1:24:45<11:23:07, 8.25s/it] {'loss': 0.4974, 'learning_rate': 1.9383417836365734e-05, 'epoch': 0.14} + 14%|█▍ | 803/5772 [1:24:45<11:23:07, 8.25s/it] {'loss': 0.4974, 'learning_rate': 1.9383417836365734e-05, 'epoch': 0.14} + 14%|█▍ | 803/5772 [1:24:38<11:23:07, 8.25s/it] 14%|█▍ | 804/5772 [1:24:44<10:32:22, 7.64s/it] 14%|█▍ | 804/5772 [1:24:51<10:32:22, 7.64s/it] {'loss': 0.4915, 'learning_rate': 1.9381476243113243e-05, 'epoch': 0.14} + 14%|█▍ | 804/5772 [1:24:51<10:32:22, 7.64s/it] {'loss': 0.4915, 'learning_rate': 1.9381476243113243e-05, 'epoch': 0.14} + 14%|█▍ | 804/5772 [1:24:44<10:32:22, 7.64s/it] 14%|█▍ | 805/5772 [1:24:51<10:16:11, 7.44s/it] 14%|█▍ | 805/5772 [1:24:58<10:16:12, 7.44s/it] {'loss': 0.4953, 'learning_rate': 1.937953169521697e-05, 'epoch': 0.14} + 14%|█▍ | 805/5772 [1:24:58<10:16:12, 7.44s/it] {'loss': 0.4953, 'learning_rate': 1.937953169521697e-05, 'epoch': 0.14} + 14%|█▍ | 805/5772 [1:24:51<10:16:11, 7.44s/it] 14%|█▍ | 806/5772 [1:24:57<9:38:29, 6.99s/it] 14%|█▍ | 806/5772 [1:25:04<9:38:29, 6.99s/it] {'loss': 0.4721, 'learning_rate': 1.937758419328934e-05, 'epoch': 0.14} + 14%|█▍ | 806/5772 [1:25:04<9:38:29, 6.99s/it] {'loss': 0.4721, 'learning_rate': 1.937758419328934e-05, 'epoch': 0.14} + 14%|█▍ | 806/5772 [1:24:57<9:38:29, 6.99s/it] 14%|█▍ | 807/5772 [1:25:04<9:21:08, 6.78s/it] 14%|█▍ | 807/5772 [1:25:11<9:21:08, 6.78s/it] {'loss': 0.4937, 'learning_rate': 1.937563373794371e-05, 'epoch': 0.14} + 14%|█▍ | 807/5772 [1:25:11<9:21:08, 6.78s/it] {'loss': 0.4937, 'learning_rate': 1.937563373794371e-05, 'epoch': 0.14} + 14%|█▍ | 807/5772 [1:25:04<9:21:08, 6.78s/it] 14%|█▍ | 808/5772 [1:25:10<9:18:00, 6.74s/it] 14%|█▍ | 808/5772 [1:25:17<9:18:00, 6.74s/it] {'loss': 0.4752, 'learning_rate': 1.937368032979436e-05, 'epoch': 0.14} + 14%|█▍ | 808/5772 [1:25:17<9:18:00, 6.74s/it] {'loss': 0.4752, 'learning_rate': 1.937368032979436e-05, 'epoch': 0.14} + 14%|█▍ | 808/5772 [1:25:10<9:18:00, 6.74s/it] 14%|█▍ | 809/5772 [1:25:17<9:08:12, 6.63s/it] 14%|█▍ | 809/5772 [1:25:24<9:08:13, 6.63s/it] {'loss': 0.4992, 'learning_rate': 1.9371723969456512e-05, 'epoch': 0.14} + 14%|█▍ | 809/5772 [1:25:24<9:08:13, 6.63s/it] {'loss': 0.4992, 'learning_rate': 1.9371723969456512e-05, 'epoch': 0.14} + 14%|█▍ | 809/5772 [1:25:17<9:08:12, 6.63s/it] 14%|█▍ | 810/5772 [1:25:23<8:52:41, 6.44s/it] 14%|█▍ | 810/5772 [1:25:30<8:52:41, 6.44s/it] {'loss': 0.5051, 'learning_rate': 1.9369764657546307e-05, 'epoch': 0.14} + 14%|█▍ | 810/5772 [1:25:30<8:52:41, 6.44s/it] {'loss': 0.5051, 'learning_rate': 1.9369764657546307e-05, 'epoch': 0.14} + 14%|█▍ | 810/5772 [1:25:23<8:52:41, 6.44s/it] 14%|█▍ | 811/5772 [1:25:29<8:38:59, 6.28s/it] 14%|█▍ | 811/5772 [1:25:36<8:38:59, 6.28s/it] {'loss': 0.4984, 'learning_rate': 1.9367802394680816e-05, 'epoch': 0.14} + 14%|█▍ | 811/5772 [1:25:36<8:38:59, 6.28s/it] {'loss': 0.4984, 'learning_rate': 1.9367802394680816e-05, 'epoch': 0.14} + 14%|█▍ | 811/5772 [1:25:29<8:38:59, 6.28s/it] 14%|█▍ | 812/5772 [1:25:35<8:44:05, 6.34s/it] 14%|█▍ | 812/5772 [1:25:42<8:44:05, 6.34s/it] {'loss': 0.4727, 'learning_rate': 1.9365837181478044e-05, 'epoch': 0.14} + 14%|█▍ | 812/5772 [1:25:42<8:44:05, 6.34s/it] {'loss': 0.4727, 'learning_rate': 1.9365837181478044e-05, 'epoch': 0.14} + 14%|█▍ | 812/5772 [1:25:35<8:44:05, 6.34s/it] 14%|█▍ | 813/5772 [1:25:41<8:35:08, 6.23s/it] 14%|█▍ | 813/5772 [1:25:48<8:35:08, 6.23s/it] {'loss': 0.4883, 'learning_rate': 1.9363869018556928e-05, 'epoch': 0.14} + 14%|█▍ | 813/5772 [1:25:48<8:35:08, 6.23s/it] {'loss': 0.4883, 'learning_rate': 1.9363869018556928e-05, 'epoch': 0.14} + 14%|█▍ | 813/5772 [1:25:41<8:35:08, 6.23s/it] 14%|█▍ | 814/5772 [1:25:47<8:25:36, 6.12s/it] 14%|█▍ | 814/5772 [1:25:54<8:25:36, 6.12s/it] {'loss': 0.4926, 'learning_rate': 1.936189790653733e-05, 'epoch': 0.14} + 14%|█▍ | 814/5772 [1:25:54<8:25:36, 6.12s/it] {'loss': 0.4926, 'learning_rate': 1.936189790653733e-05, 'epoch': 0.14} + 14%|█▍ | 814/5772 [1:25:47<8:25:36, 6.12s/it] 14%|█▍ | 815/5772 [1:25:53<8:29:40, 6.17s/it] 14%|█▍ | 815/5772 [1:26:00<8:29:40, 6.17s/it] {'loss': 0.4914, 'learning_rate': 1.9359923846040035e-05, 'epoch': 0.14} + 14%|█▍ | 815/5772 [1:26:00<8:29:40, 6.17s/it] {'loss': 0.4914, 'learning_rate': 1.9359923846040035e-05, 'epoch': 0.14} + 14%|█▍ | 815/5772 [1:25:53<8:29:40, 6.17s/it] 14%|█▍ | 816/5772 [1:26:00<8:39:43, 6.29s/it] 14%|█▍ | 816/5772 [1:26:07<8:39:44, 6.29s/it] {'loss': 0.5043, 'learning_rate': 1.935794683768677e-05, 'epoch': 0.14} + 14%|█▍ | 816/5772 [1:26:07<8:39:44, 6.29s/it] {'loss': 0.5043, 'learning_rate': 1.935794683768677e-05, 'epoch': 0.14} + 14%|█▍ | 816/5772 [1:26:00<8:39:43, 6.29s/it] 14%|█▍ | 817/5772 [1:26:06<8:27:12, 6.14s/it] 14%|█▍ | 817/5772 [1:26:13<8:27:12, 6.14s/it] {'loss': 0.4854, 'learning_rate': 1.935596688210018e-05, 'epoch': 0.14} + 14%|█▍ | 817/5772 [1:26:13<8:27:12, 6.14s/it] {'loss': 0.4854, 'learning_rate': 1.935596688210018e-05, 'epoch': 0.14} + 14%|█▍ | 817/5772 [1:26:06<8:27:12, 6.14s/it] 14%|█▍ | 818/5772 [1:26:12<8:25:47, 6.13s/it] 14%|█▍ | 818/5772 [1:26:19<8:25:47, 6.13s/it] {'loss': 0.4882, 'learning_rate': 1.9353983979903836e-05, 'epoch': 0.14} + 14%|█▍ | 818/5772 [1:26:19<8:25:47, 6.13s/it] {'loss': 0.4882, 'learning_rate': 1.9353983979903836e-05, 'epoch': 0.14} + 14%|█▍ | 818/5772 [1:26:12<8:25:47, 6.13s/it] 14%|█▍ | 819/5772 [1:26:18<8:20:30, 6.06s/it] 14%|█▍ | 819/5772 [1:26:25<8:20:30, 6.06s/it] {'loss': 0.4945, 'learning_rate': 1.9351998131722244e-05, 'epoch': 0.14} + 14%|█▍ | 819/5772 [1:26:25<8:20:30, 6.06s/it] {'loss': 0.4945, 'learning_rate': 1.9351998131722244e-05, 'epoch': 0.14} + 14%|█▍ | 819/5772 [1:26:18<8:20:30, 6.06s/it] 14%|█▍ | 820/5772 [1:26:31<8:19:05, 6.05s/it] 14%|█▍ | 820/5772 [1:26:24<8:19:06, 6.05s/it] {'loss': 0.492, 'learning_rate': 1.9350009338180842e-05, 'epoch': 0.14} + 14%|█▍ | 820/5772 [1:26:31<8:19:05, 6.05s/it] {'loss': 0.492, 'learning_rate': 1.9350009338180842e-05, 'epoch': 0.14} + 14%|█▍ | 820/5772 [1:26:24<8:19:06, 6.05s/it] 14%|█▍ | 821/5772 [1:26:30<8:29:06, 6.17s/it] 14%|█▍ | 821/5772 [1:26:37<8:29:06, 6.17s/it] {'loss': 0.493, 'learning_rate': 1.9348017599905984e-05, 'epoch': 0.14} + 14%|█▍ | 821/5772 [1:26:37<8:29:06, 6.17s/it] {'loss': 0.493, 'learning_rate': 1.9348017599905984e-05, 'epoch': 0.14} + 14%|█▍ | 821/5772 [1:26:30<8:29:06, 6.17s/it] 14%|█▍ | 822/5772 [1:26:36<8:25:14, 6.12s/it] 14%|█▍ | 822/5772 [1:26:43<8:25:14, 6.12s/it] {'loss': 0.503, 'learning_rate': 1.9346022917524958e-05, 'epoch': 0.14} + 14%|█▍ | 822/5772 [1:26:43<8:25:14, 6.12s/it] {'loss': 0.503, 'learning_rate': 1.9346022917524958e-05, 'epoch': 0.14} + 14%|█▍ | 822/5772 [1:26:36<8:25:14, 6.12s/it] 14%|█▍ | 823/5772 [1:26:42<8:27:45, 6.16s/it] 14%|█▍ | 823/5772 [1:26:49<8:27:46, 6.16s/it] {'loss': 0.4995, 'learning_rate': 1.9344025291665978e-05, 'epoch': 0.14} + 14%|█▍ | 823/5772 [1:26:49<8:27:46, 6.16s/it] {'loss': 0.4995, 'learning_rate': 1.9344025291665978e-05, 'epoch': 0.14} + 14%|█▍ | 823/5772 [1:26:42<8:27:45, 6.16s/it] 14%|█▍ | 824/5772 [1:26:49<8:33:18, 6.22s/it] 14%|█▍ | 824/5772 [1:26:56<8:33:19, 6.22s/it] {'loss': 0.497, 'learning_rate': 1.9342024722958187e-05, 'epoch': 0.14} + 14%|█▍ | 824/5772 [1:26:56<8:33:19, 6.22s/it] {'loss': 0.497, 'learning_rate': 1.9342024722958187e-05, 'epoch': 0.14} + 14%|█▍ | 824/5772 [1:26:49<8:33:18, 6.22s/it] 14%|█▍ | 825/5772 [1:26:55<8:26:31, 6.14s/it] 14%|█▍ | 825/5772 [1:27:02<8:26:31, 6.14s/it] {'loss': 0.4962, 'learning_rate': 1.9340021212031647e-05, 'epoch': 0.14} + 14%|█▍ | 825/5772 [1:27:02<8:26:31, 6.14s/it] {'loss': 0.4962, 'learning_rate': 1.9340021212031647e-05, 'epoch': 0.14} + 14%|█▍ | 825/5772 [1:26:55<8:26:31, 6.14s/it] 14%|█▍ | 826/5772 [1:27:01<8:26:05, 6.14s/it] 14%|█▍ | 826/5772 [1:27:08<8:26:06, 6.14s/it] {'loss': 0.4983, 'learning_rate': 1.933801475951736e-05, 'epoch': 0.14} + 14%|█▍ | 826/5772 [1:27:08<8:26:06, 6.14s/it] {'loss': 0.4983, 'learning_rate': 1.933801475951736e-05, 'epoch': 0.14} + 14%|█▍ | 826/5772 [1:27:01<8:26:05, 6.14s/it] 14%|█▍ | 827/5772 [1:27:07<8:28:22, 6.17s/it] 14%|█▍ | 827/5772 [1:27:14<8:28:22, 6.17s/it] {'loss': 0.4809, 'learning_rate': 1.9336005366047246e-05, 'epoch': 0.14} + 14%|█▍ | 827/5772 [1:27:14<8:28:22, 6.17s/it] {'loss': 0.4809, 'learning_rate': 1.9336005366047246e-05, 'epoch': 0.14} + 14%|█▍ | 827/5772 [1:27:07<8:28:22, 6.17s/it] 14%|█▍ | 828/5772 [1:27:13<8:21:57, 6.09s/it] 14%|█▍ | 828/5772 [1:27:20<8:21:57, 6.09s/it] {'loss': 0.5052, 'learning_rate': 1.933399303225415e-05, 'epoch': 0.14} + 14%|█▍ | 828/5772 [1:27:20<8:21:57, 6.09s/it] {'loss': 0.5052, 'learning_rate': 1.933399303225415e-05, 'epoch': 0.14} + 14%|█▍ | 828/5772 [1:27:13<8:21:57, 6.09s/it] 14%|█▍ | 829/5772 [1:27:19<8:21:14, 6.08s/it] 14%|█▍ | 829/5772 [1:27:26<8:21:14, 6.08s/it] {'loss': 0.4871, 'learning_rate': 1.933197775877184e-05, 'epoch': 0.14} + 14%|█▍ | 829/5772 [1:27:26<8:21:14, 6.08s/it] {'loss': 0.4871, 'learning_rate': 1.933197775877184e-05, 'epoch': 0.14} + 14%|█▍ | 829/5772 [1:27:19<8:21:14, 6.08s/it] 14%|█▍ | 830/5772 [1:27:25<8:14:43, 6.01s/it] 14%|█▍ | 830/5772 [1:27:32<8:14:43, 6.01s/it] {'loss': 0.4884, 'learning_rate': 1.9329959546235028e-05, 'epoch': 0.14} + 14%|█▍ | 830/5772 [1:27:32<8:14:43, 6.01s/it] {'loss': 0.4884, 'learning_rate': 1.9329959546235028e-05, 'epoch': 0.14} + 14%|█▍ | 830/5772 [1:27:25<8:14:43, 6.01s/it] 14%|█▍ | 831/5772 [1:27:32<8:33:48, 6.24s/it] 14%|█▍ | 831/5772 [1:27:39<8:33:49, 6.24s/it] {'loss': 0.4829, 'learning_rate': 1.9327938395279325e-05, 'epoch': 0.14} + 14%|█▍ | 831/5772 [1:27:39<8:33:49, 6.24s/it] {'loss': 0.4829, 'learning_rate': 1.9327938395279325e-05, 'epoch': 0.14} + 14%|█▍ | 831/5772 [1:27:32<8:33:48, 6.24s/it] 14%|█▍ | 832/5772 [1:27:38<8:30:15, 6.20s/it] 14%|█▍ | 832/5772 [1:27:45<8:30:14, 6.20s/it] {'loss': 0.5091, 'learning_rate': 1.9325914306541294e-05, 'epoch': 0.14} + 14%|█▍ | 832/5772 [1:27:45<8:30:14, 6.20s/it] {'loss': 0.5091, 'learning_rate': 1.9325914306541294e-05, 'epoch': 0.14} + 14%|█▍ | 832/5772 [1:27:38<8:30:15, 6.20s/it] 14%|█▍ | 833/5772 [1:27:44<8:26:22, 6.15s/it] 14%|█▍ | 833/5772 [1:27:51<8:26:22, 6.15s/it] {'loss': 0.4914, 'learning_rate': 1.93238872806584e-05, 'epoch': 0.14} + 14%|█▍ | 833/5772 [1:27:51<8:26:22, 6.15s/it] {'loss': 0.4914, 'learning_rate': 1.93238872806584e-05, 'epoch': 0.14} + 14%|█▍ | 833/5772 [1:27:44<8:26:22, 6.15s/it] 14%|█▍ | 834/5772 [1:27:50<8:40:50, 6.33s/it] 14%|█▍ | 834/5772 [1:27:57<8:40:50, 6.33s/it] {'loss': 0.4964, 'learning_rate': 1.932185731826905e-05, 'epoch': 0.14} + 14%|█▍ | 834/5772 [1:27:57<8:40:50, 6.33s/it] {'loss': 0.4964, 'learning_rate': 1.932185731826905e-05, 'epoch': 0.14} + 14%|█▍ | 834/5772 [1:27:50<8:40:50, 6.33s/it] 14%|█▍ | 835/5772 [1:27:57<8:40:12, 6.32s/it] 14%|█▍ | 835/5772 [1:28:04<8:40:12, 6.32s/it] {'loss': 0.4823, 'learning_rate': 1.9319824420012566e-05, 'epoch': 0.14} + 14%|█▍ | 835/5772 [1:28:04<8:40:12, 6.32s/it] {'loss': 0.4823, 'learning_rate': 1.9319824420012566e-05, 'epoch': 0.14} + 14%|█▍ | 835/5772 [1:27:57<8:40:12, 6.32s/it] 14%|█▍ | 836/5772 [1:28:03<8:33:10, 6.24s/it] 14%|█▍ | 836/5772 [1:28:10<8:33:09, 6.24s/it] {'loss': 0.501, 'learning_rate': 1.93177885865292e-05, 'epoch': 0.14} + 14%|█▍ | 836/5772 [1:28:10<8:33:09, 6.24s/it] {'loss': 0.501, 'learning_rate': 1.93177885865292e-05, 'epoch': 0.14} + 14%|█▍ | 836/5772 [1:28:03<8:33:10, 6.24s/it] 15%|█▍ | 837/5772 [1:28:09<8:31:15, 6.22s/it] 15%|█▍ | 837/5772 [1:28:16<8:31:15, 6.22s/it] {'loss': 0.4884, 'learning_rate': 1.9315749818460127e-05, 'epoch': 0.14} + 15%|█▍ | 837/5772 [1:28:16<8:31:15, 6.22s/it] {'loss': 0.4884, 'learning_rate': 1.9315749818460127e-05, 'epoch': 0.14} + 15%|█▍ | 837/5772 [1:28:09<8:31:15, 6.22s/it] 15%|█▍ | 838/5772 [1:28:15<8:31:55, 6.23s/it] 15%|█▍ | 838/5772 [1:28:22<8:31:55, 6.23s/it] {'loss': 0.5083, 'learning_rate': 1.9313708116447446e-05, 'epoch': 0.15} + 15%|█▍ | 838/5772 [1:28:22<8:31:55, 6.23s/it] {'loss': 0.5083, 'learning_rate': 1.9313708116447446e-05, 'epoch': 0.15} + 15%|█▍ | 838/5772 [1:28:15<8:31:55, 6.23s/it] 15%|█▍ | 839/5772 [1:28:21<8:23:47, 6.13s/it] 15%|█▍ | 839/5772 [1:28:28<8:23:48, 6.13s/it] {'loss': 0.5016, 'learning_rate': 1.9311663481134174e-05, 'epoch': 0.15} + 15%|█▍ | 839/5772 [1:28:28<8:23:48, 6.13s/it] {'loss': 0.5016, 'learning_rate': 1.9311663481134174e-05, 'epoch': 0.15} + 15%|█▍ | 839/5772 [1:28:21<8:23:47, 6.13s/it] 15%|█▍ | 840/5772 [1:28:28<8:32:28, 6.23s/it] 15%|█▍ | 840/5772 [1:28:35<8:32:29, 6.23s/it] {'loss': 0.5033, 'learning_rate': 1.9309615913164262e-05, 'epoch': 0.15} + 15%|█▍ | 840/5772 [1:28:28<8:32:28, 6.23s/it]{'loss': 0.5033, 'learning_rate': 1.9309615913164262e-05, 'epoch': 0.15} + 15%|█▍ | 840/5772 [1:28:35<8:32:29, 6.23s/it] 15%|█▍ | 841/5772 [1:28:34<8:34:59, 6.27s/it] 15%|█▍ | 841/5772 [1:28:41<8:34:59, 6.27s/it] {'loss': 0.4805, 'learning_rate': 1.9307565413182582e-05, 'epoch': 0.15} + 15%|█▍ | 841/5772 [1:28:41<8:34:59, 6.27s/it] {'loss': 0.4805, 'learning_rate': 1.9307565413182582e-05, 'epoch': 0.15} + 15%|█▍ | 841/5772 [1:28:34<8:34:59, 6.27s/it] 15%|█▍ | 842/5772 [1:28:40<8:27:18, 6.17s/it] 15%|█▍ | 842/5772 [1:28:47<8:27:18, 6.17s/it] {'loss': 0.4876, 'learning_rate': 1.9305511981834927e-05, 'epoch': 0.15} + 15%|█▍ | 842/5772 [1:28:47<8:27:18, 6.17s/it] {'loss': 0.4876, 'learning_rate': 1.9305511981834927e-05, 'epoch': 0.15} + 15%|█▍ | 842/5772 [1:28:40<8:27:18, 6.17s/it] 15%|█▍ | 843/5772 [1:28:46<8:19:53, 6.09s/it] 15%|█▍ | 843/5772 [1:28:53<8:19:53, 6.09s/it] {'loss': 0.4936, 'learning_rate': 1.9303455619768006e-05, 'epoch': 0.15} + 15%|█▍ | 843/5772 [1:28:53<8:19:53, 6.09s/it] {'loss': 0.4936, 'learning_rate': 1.9303455619768006e-05, 'epoch': 0.15} + 15%|█▍ | 843/5772 [1:28:46<8:19:53, 6.09s/it] 15%|█▍ | 844/5772 [1:28:52<8:13:00, 6.00s/it] 15%|█▍ | 844/5772 [1:28:59<8:13:00, 6.00s/it] {'loss': 0.5003, 'learning_rate': 1.930139632762947e-05, 'epoch': 0.15} + 15%|█▍ | 844/5772 [1:28:59<8:13:00, 6.00s/it] {'loss': 0.5003, 'learning_rate': 1.930139632762947e-05, 'epoch': 0.15} + 15%|█▍ | 844/5772 [1:28:52<8:13:00, 6.00s/it] 15%|█▍ | 845/5772 [1:28:58<8:13:50, 6.01s/it] 15%|█▍ | 845/5772 [1:29:05<8:13:49, 6.01s/it] {'loss': 0.492, 'learning_rate': 1.9299334106067874e-05, 'epoch': 0.15} + 15%|█▍ | 845/5772 [1:29:05<8:13:49, 6.01s/it] {'loss': 0.492, 'learning_rate': 1.9299334106067874e-05, 'epoch': 0.15} + 15%|█▍ | 845/5772 [1:28:58<8:13:50, 6.01s/it] 15%|█▍ | 846/5772 [1:29:04<8:25:39, 6.16s/it] 15%|█▍ | 846/5772 [1:29:11<8:25:39, 6.16s/it] {'loss': 0.4804, 'learning_rate': 1.9297268955732707e-05, 'epoch': 0.15} + 15%|█▍ | 846/5772 [1:29:11<8:25:39, 6.16s/it] {'loss': 0.4804, 'learning_rate': 1.9297268955732707e-05, 'epoch': 0.15} + 15%|█▍ | 846/5772 [1:29:04<8:25:39, 6.16s/it] 15%|█▍ | 847/5772 [1:29:10<8:24:01, 6.14s/it] 15%|█▍ | 847/5772 [1:29:17<8:24:01, 6.14s/it] {'loss': 0.4867, 'learning_rate': 1.929520087727438e-05, 'epoch': 0.15} + 15%|█▍ | 847/5772 [1:29:17<8:24:01, 6.14s/it] {'loss': 0.4867, 'learning_rate': 1.929520087727438e-05, 'epoch': 0.15} + 15%|█▍ | 847/5772 [1:29:10<8:24:01, 6.14s/it] 15%|█▍ | 848/5772 [1:29:16<8:26:01, 6.17s/it] 15%|█▍ | 848/5772 [1:29:23<8:26:02, 6.17s/it] {'loss': 0.5015, 'learning_rate': 1.9293129871344215e-05, 'epoch': 0.15} + 15%|█▍ | 848/5772 [1:29:23<8:26:02, 6.17s/it] {'loss': 0.5015, 'learning_rate': 1.9293129871344215e-05, 'epoch': 0.15} + 15%|█▍ | 848/5772 [1:29:16<8:26:01, 6.17s/it] 15%|█▍ | 849/5772 [1:29:23<8:26:40, 6.18s/it] 15%|█▍ | 849/5772 [1:29:30<8:26:41, 6.18s/it] {'loss': 0.4911, 'learning_rate': 1.9291055938594464e-05, 'epoch': 0.15} + 15%|█▍ | 849/5772 [1:29:30<8:26:41, 6.18s/it] {'loss': 0.4911, 'learning_rate': 1.9291055938594464e-05, 'epoch': 0.15} + 15%|█▍ | 849/5772 [1:29:23<8:26:40, 6.18s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +57 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend...10 + AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 15%|█▍ | 850/5772 [1:29:29<8:31:58, 6.24s/it]3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 15%|█▍ | 850/5772 [1:29:36<8:31:58, 6.24s/it]9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4982, 'learning_rate': 1.9288979079678306e-05, 'epoch': 0.15} + 15%|█▍ | 850/5772 [1:29:36<8:31:58, 6.24s/it] {'loss': 0.4982, 'learning_rate': 1.9288979079678306e-05, 'epoch': 0.15} + 15%|█▍ | 850/5772 [1:29:29<8:31:58, 6.24s/it] 15%|█▍ | 851/5772 [1:29:35<8:26:42, 6.18s/it] 15%|█▍ | 851/5772 [1:29:42<8:26:43, 6.18s/it] {'loss': 0.4947, 'learning_rate': 1.928689929524983e-05, 'epoch': 0.15} + 15%|█▍ | 851/5772 [1:29:42<8:26:43, 6.18s/it] {'loss': 0.4947, 'learning_rate': 1.928689929524983e-05, 'epoch': 0.15} + 15%|█▍ | 851/5772 [1:29:35<8:26:42, 6.18s/it] 15%|█▍ | 852/5772 [1:29:41<8:18:20, 6.08s/it] 15%|█▍ | 852/5772 [1:29:48<8:18:20, 6.08s/it] {'loss': 0.4926, 'learning_rate': 1.928481658596406e-05, 'epoch': 0.15} + 15%|█▍ | 852/5772 [1:29:48<8:18:20, 6.08s/it] {'loss': 0.4926, 'learning_rate': 1.928481658596406e-05, 'epoch': 0.15} + 15%|█▍ | 852/5772 [1:29:41<8:18:20, 6.08s/it] 15%|█▍ | 853/5772 [1:29:47<8:22:32, 6.13s/it] 15%|█▍ | 853/5772 [1:29:54<8:22:31, 6.13s/it] {'loss': 0.4768, 'learning_rate': 1.9282730952476928e-05, 'epoch': 0.15} + 15%|█▍ | 853/5772 [1:29:54<8:22:31, 6.13s/it] {'loss': 0.4768, 'learning_rate': 1.9282730952476928e-05, 'epoch': 0.15} + 15%|█▍ | 853/5772 [1:29:47<8:22:32, 6.13s/it] 15%|█▍ | 854/5772 [1:29:53<8:18:29, 6.08s/it] 15%|█▍ | 854/5772 [1:30:00<8:18:30, 6.08s/it] {'loss': 0.4899, 'learning_rate': 1.9280642395445298e-05, 'epoch': 0.15} + 15%|█▍ | 854/5772 [1:30:00<8:18:30, 6.08s/it] {'loss': 0.4899, 'learning_rate': 1.9280642395445298e-05, 'epoch': 0.15} + 15%|█▍ | 854/5772 [1:29:53<8:18:29, 6.08s/it] 15%|█▍ | 855/5772 [1:29:59<8:16:08, 6.05s/it] 15%|█▍ | 855/5772 [1:30:06<8:16:08, 6.05s/it] {'loss': 0.4818, 'learning_rate': 1.9278550915526947e-05, 'epoch': 0.15} + 15%|█▍ | 855/5772 [1:30:06<8:16:08, 6.05s/it] {'loss': 0.4818, 'learning_rate': 1.9278550915526947e-05, 'epoch': 0.15} + 15%|█▍ | 855/5772 [1:29:59<8:16:08, 6.05s/it] 15%|█▍ | 856/5772 [1:30:05<8:08:59, 5.97s/it] 15%|█▍ | 856/5772 [1:30:12<8:08:59, 5.97s/it] {'loss': 0.4917, 'learning_rate': 1.927645651338057e-05, 'epoch': 0.15} + 15%|█▍ | 856/5772 [1:30:12<8:08:59, 5.97s/it] {'loss': 0.4917, 'learning_rate': 1.927645651338057e-05, 'epoch': 0.15} + 15%|█▍ | 856/5772 [1:30:05<8:08:59, 5.97s/it] 15%|█▍ | 857/5772 [1:30:12<8:34:04, 6.28s/it] 15%|█▍ | 857/5772 [1:30:19<8:34:04, 6.28s/it] {'loss': 0.4928, 'learning_rate': 1.9274359189665792e-05, 'epoch': 0.15} + 15%|█▍ | 857/5772 [1:30:19<8:34:04, 6.28s/it] {'loss': 0.4928, 'learning_rate': 1.9274359189665792e-05, 'epoch': 0.15} + 15%|█▍ | 857/5772 [1:30:12<8:34:04, 6.28s/it] 15%|█▍ | 858/5772 [1:30:18<8:26:22, 6.18s/it] 15%|█▍ | 858/5772 [1:30:25<8:26:22, 6.18s/it] {'loss': 0.4905, 'learning_rate': 1.9272258945043154e-05, 'epoch': 0.15} + 15%|█▍ | 858/5772 [1:30:25<8:26:22, 6.18s/it] {'loss': 0.4905, 'learning_rate': 1.9272258945043154e-05, 'epoch': 0.15} + 15%|█▍ | 858/5772 [1:30:18<8:26:22, 6.18s/it] 15%|█▍ | 859/5772 [1:30:24<8:25:24, 6.17s/it] 15%|█▍ | 859/5772 [1:30:31<8:25:24, 6.17s/it] {'loss': 0.4839, 'learning_rate': 1.9270155780174113e-05, 'epoch': 0.15} + 15%|█▍ | 859/5772 [1:30:31<8:25:24, 6.17s/it] {'loss': 0.4839, 'learning_rate': 1.9270155780174113e-05, 'epoch': 0.15} + 15%|█▍ | 859/5772 [1:30:24<8:25:24, 6.17s/it] 15%|█▍ | 860/5772 [1:30:30<8:22:36, 6.14s/it] 15%|█▍ | 860/5772 [1:30:37<8:22:35, 6.14s/it] {'loss': 0.4801, 'learning_rate': 1.9268049695721055e-05, 'epoch': 0.15} + 15%|█▍ | 860/5772 [1:30:37<8:22:35, 6.14s/it] {'loss': 0.4801, 'learning_rate': 1.9268049695721055e-05, 'epoch': 0.15} + 15%|█▍ | 860/5772 [1:30:30<8:22:36, 6.14s/it] 15%|█▍ | 861/5772 [1:30:43<8:11:46, 6.01s/it] 15%|█▍ | 861/5772 [1:30:36<8:11:46, 6.01s/it] {'loss': 0.4792, 'learning_rate': 1.9265940692347276e-05, 'epoch': 0.15} + 15%|█▍ | 861/5772 [1:30:36<8:11:46, 6.01s/it]{'loss': 0.4792, 'learning_rate': 1.9265940692347276e-05, 'epoch': 0.15} + 15%|█▍ | 861/5772 [1:30:43<8:11:46, 6.01s/it] 15%|█▍ | 862/5772 [1:30:42<8:13:08, 6.03s/it] 15%|█▍ | 862/5772 [1:30:49<8:13:08, 6.03s/it] {'loss': 0.4887, 'learning_rate': 1.9263828770716993e-05, 'epoch': 0.15} + 15%|█▍ | 862/5772 [1:30:49<8:13:08, 6.03s/it] {'loss': 0.4887, 'learning_rate': 1.9263828770716993e-05, 'epoch': 0.15} + 15%|█▍ | 862/5772 [1:30:42<8:13:08, 6.03s/it] 15%|█▍ | 863/5772 [1:30:48<8:09:05, 5.98s/it] 15%|█▍ | 863/5772 [1:30:55<8:09:05, 5.98s/it] {'loss': 0.4897, 'learning_rate': 1.9261713931495344e-05, 'epoch': 0.15} + 15%|█▍ | 863/5772 [1:30:55<8:09:05, 5.98s/it] {'loss': 0.4897, 'learning_rate': 1.9261713931495344e-05, 'epoch': 0.15} + 15%|█▍ | 863/5772 [1:30:48<8:09:05, 5.98s/it] 15%|█▍ | 864/5772 [1:30:54<8:04:59, 5.93s/it] 15%|█▍ | 864/5772 [1:31:01<8:05:00, 5.93s/it] {'loss': 0.4902, 'learning_rate': 1.925959617534839e-05, 'epoch': 0.15} + 15%|█▍ | 864/5772 [1:31:01<8:05:00, 5.93s/it] {'loss': 0.4902, 'learning_rate': 1.925959617534839e-05, 'epoch': 0.15} + 15%|█▍ | 864/5772 [1:30:54<8:04:59, 5.93s/it] 15%|█▍ | 865/5772 [1:30:59<8:03:11, 5.91s/it] 15%|█▍ | 865/5772 [1:31:06<8:03:11, 5.91s/it] {'loss': 0.4804, 'learning_rate': 1.92574755029431e-05, 'epoch': 0.15} + 15%|█▍ | 865/5772 [1:31:06<8:03:11, 5.91s/it] {'loss': 0.4804, 'learning_rate': 1.92574755029431e-05, 'epoch': 0.15} + 15%|█▍ | 865/5772 [1:30:59<8:03:11, 5.91s/it] 15%|█▌ | 866/5772 [1:31:06<8:17:08, 6.08s/it] 15%|█▌ | 866/5772 [1:31:13<8:17:08, 6.08s/it] {'loss': 0.5123, 'learning_rate': 1.925535191494738e-05, 'epoch': 0.15} + 15%|█▌ | 866/5772 [1:31:13<8:17:08, 6.08s/it] {'loss': 0.5123, 'learning_rate': 1.925535191494738e-05, 'epoch': 0.15} + 15%|█▌ | 866/5772 [1:31:06<8:17:08, 6.08s/it] 15%|█▌ | 867/5772 [1:31:12<8:20:51, 6.13s/it] 15%|█▌ | 867/5772 [1:31:19<8:20:51, 6.13s/it] {'loss': 0.4983, 'learning_rate': 1.9253225412030028e-05, 'epoch': 0.15} + 15%|█▌ | 867/5772 [1:31:19<8:20:51, 6.13s/it] {'loss': 0.4983, 'learning_rate': 1.9253225412030028e-05, 'epoch': 0.15} + 15%|█▌ | 867/5772 [1:31:12<8:20:51, 6.13s/it] 15%|█▌ | 868/5772 [1:31:25<8:21:46, 6.14s/it] 15%|█▌ | 868/5772 [1:31:18<8:21:47, 6.14s/it] {'loss': 0.5005, 'learning_rate': 1.9251095994860782e-05, 'epoch': 0.15} + 15%|█▌ | 868/5772 [1:31:25<8:21:46, 6.14s/it] {'loss': 0.5005, 'learning_rate': 1.9251095994860782e-05, 'epoch': 0.15} + 15%|█▌ | 868/5772 [1:31:18<8:21:47, 6.14s/it] 15%|█▌ | 869/5772 [1:31:24<8:20:13, 6.12s/it] 15%|█▌ | 869/5772 [1:31:31<8:20:13, 6.12s/it] {'loss': 0.4775, 'learning_rate': 1.924896366411029e-05, 'epoch': 0.15} + 15%|█▌ | 869/5772 [1:31:31<8:20:13, 6.12s/it] {'loss': 0.4775, 'learning_rate': 1.924896366411029e-05, 'epoch': 0.15} + 15%|█▌ | 869/5772 [1:31:24<8:20:13, 6.12s/it] 15%|█▌ | 870/5772 [1:31:38<8:29:58, 6.24s/it] 15%|█▌ | 870/5772 [1:31:31<8:29:58, 6.24s/it] {'loss': 0.4955, 'learning_rate': 1.9246828420450113e-05, 'epoch': 0.15} + 15%|█▌ | 870/5772 [1:31:38<8:29:58, 6.24s/it] {'loss': 0.4955, 'learning_rate': 1.9246828420450113e-05, 'epoch': 0.15} + 15%|█▌ | 870/5772 [1:31:31<8:29:58, 6.24s/it] 15%|█▌ | 871/5772 [1:31:44<8:21:24, 6.14s/it] 15%|█▌ | 871/5772 [1:31:37<8:21:24, 6.14s/it] {'loss': 0.4846, 'learning_rate': 1.9244690264552745e-05, 'epoch': 0.15} + 15%|█▌ | 871/5772 [1:31:44<8:21:24, 6.14s/it] {'loss': 0.4846, 'learning_rate': 1.9244690264552745e-05, 'epoch': 0.15} + 15%|█▌ | 871/5772 [1:31:37<8:21:24, 6.14s/it] 15%|█▌ | 872/5772 [1:31:43<8:23:47, 6.17s/it] 15%|█▌ | 872/5772 [1:31:50<8:23:48, 6.17s/it] {'loss': 0.4919, 'learning_rate': 1.924254919709157e-05, 'epoch': 0.15} + 15%|█▌ | 872/5772 [1:31:50<8:23:48, 6.17s/it] {'loss': 0.4919, 'learning_rate': 1.924254919709157e-05, 'epoch': 0.15} + 15%|█▌ | 872/5772 [1:31:43<8:23:47, 6.17s/it] 15%|█▌ | 873/5772 [1:31:49<8:20:22, 6.13s/it] 15%|█▌ | 873/5772 [1:31:56<8:20:22, 6.13s/it] {'loss': 0.4945, 'learning_rate': 1.924040521874092e-05, 'epoch': 0.15} + 15%|█▌ | 873/5772 [1:31:56<8:20:22, 6.13s/it] {'loss': 0.4945, 'learning_rate': 1.924040521874092e-05, 'epoch': 0.15} + 15%|█▌ | 873/5772 [1:31:49<8:20:22, 6.13s/it] 15%|█▌ | 874/5772 [1:32:02<8:27:07, 6.21s/it] 15%|█▌ | 874/5772 [1:31:55<8:27:07, 6.21s/it] {'loss': 0.4845, 'learning_rate': 1.923825833017602e-05, 'epoch': 0.15} + 15%|█▌ | 874/5772 [1:32:02<8:27:07, 6.21s/it] {'loss': 0.4845, 'learning_rate': 1.923825833017602e-05, 'epoch': 0.15} + 15%|█▌ | 874/5772 [1:31:55<8:27:07, 6.21s/it] 15%|█▌ | 875/5772 [1:32:09<8:22:40, 6.16s/it] 15%|█▌ | 875/5772 [1:32:01<8:22:40, 6.16s/it] {'loss': 0.4783, 'learning_rate': 1.9236108532073025e-05, 'epoch': 0.15} + 15%|█▌ | 875/5772 [1:32:09<8:22:40, 6.16s/it] {'loss': 0.4783, 'learning_rate': 1.9236108532073025e-05, 'epoch': 0.15} + 15%|█▌ | 875/5772 [1:32:01<8:22:40, 6.16s/it] 15%|█▌ | 876/5772 [1:32:07<8:11:58, 6.03s/it] 15%|█▌ | 876/5772 [1:32:14<8:11:58, 6.03s/it] {'loss': 0.4993, 'learning_rate': 1.9233955825109e-05, 'epoch': 0.15} + 15%|█▌ | 876/5772 [1:32:14<8:11:58, 6.03s/it] {'loss': 0.4993, 'learning_rate': 1.9233955825109e-05, 'epoch': 0.15} + 15%|█▌ | 876/5772 [1:32:07<8:11:58, 6.03s/it] 15%|█▌ | 877/5772 [1:32:13<8:16:28, 6.09s/it] 15%|█▌ | 877/5772 [1:32:20<8:16:28, 6.09s/it] {'loss': 0.4976, 'learning_rate': 1.9231800209961932e-05, 'epoch': 0.15} + 15%|█▌ | 877/5772 [1:32:20<8:16:28, 6.09s/it] {'loss': 0.4976, 'learning_rate': 1.9231800209961932e-05, 'epoch': 0.15} + 15%|█▌ | 877/5772 [1:32:13<8:16:28, 6.09s/it] 15%|█▌ | 878/5772 [1:32:20<8:17:29, 6.10s/it] 15%|█▌ | 878/5772 [1:32:27<8:17:29, 6.10s/it] {'loss': 0.4995, 'learning_rate': 1.9229641687310714e-05, 'epoch': 0.15} + 15%|█▌ | 878/5772 [1:32:27<8:17:29, 6.10s/it] {'loss': 0.4995, 'learning_rate': 1.9229641687310714e-05, 'epoch': 0.15} + 15%|█▌ | 878/5772 [1:32:20<8:17:29, 6.10s/it] 15%|█▌ | 879/5772 [1:32:26<8:16:44, 6.09s/it] 15%|█▌ | 879/5772 [1:32:33<8:16:44, 6.09s/it] {'loss': 0.4897, 'learning_rate': 1.9227480257835163e-05, 'epoch': 0.15} + 15%|█▌ | 879/5772 [1:32:33<8:16:44, 6.09s/it] {'loss': 0.4897, 'learning_rate': 1.9227480257835163e-05, 'epoch': 0.15} + 15%|█▌ | 879/5772 [1:32:26<8:16:44, 6.09s/it] 15%|█▌ | 880/5772 [1:32:32<8:18:06, 6.11s/it] 15%|█▌ | 880/5772 [1:32:39<8:18:06, 6.11s/it] {'loss': 0.4904, 'learning_rate': 1.922531592221601e-05, 'epoch': 0.15} + 15%|█▌ | 880/5772 [1:32:39<8:18:06, 6.11s/it] {'loss': 0.4904, 'learning_rate': 1.922531592221601e-05, 'epoch': 0.15} + 15%|█▌ | 880/5772 [1:32:32<8:18:06, 6.11s/it] 15%|█▌ | 881/5772 [1:32:45<8:22:02, 6.16s/it] 15%|█▌ | 881/5772 [1:32:38<8:22:03, 6.16s/it] {'loss': 0.4959, 'learning_rate': 1.92231486811349e-05, 'epoch': 0.15} + 15%|█▌ | 881/5772 [1:32:45<8:22:02, 6.16s/it] {'loss': 0.4959, 'learning_rate': 1.92231486811349e-05, 'epoch': 0.15} + 15%|█▌ | 881/5772 [1:32:38<8:22:03, 6.16s/it] 15%|█▌ | 882/5772 [1:32:44<8:20:31, 6.14s/it] 15%|█▌ | 882/5772 [1:32:51<8:20:31, 6.14s/it] {'loss': 0.4898, 'learning_rate': 1.9220978535274398e-05, 'epoch': 0.15} + 15%|█▌ | 882/5772 [1:32:51<8:20:31, 6.14s/it] {'loss': 0.4898, 'learning_rate': 1.9220978535274398e-05, 'epoch': 0.15} + 15%|█▌ | 882/5772 [1:32:44<8:20:31, 6.14s/it] 15%|█▌ | 883/5772 [1:32:50<8:21:10, 6.15s/it] 15%|█▌ | 883/5772 [1:32:57<8:21:10, 6.15s/it] {'loss': 0.4901, 'learning_rate': 1.9218805485317973e-05, 'epoch': 0.15} + 15%|█▌ | 883/5772 [1:32:57<8:21:10, 6.15s/it] {'loss': 0.4901, 'learning_rate': 1.9218805485317973e-05, 'epoch': 0.15} + 15%|█▌ | 883/5772 [1:32:50<8:21:10, 6.15s/it] 15%|█▌ | 884/5772 [1:32:57<8:22:26, 6.17s/it] 15%|█▌ | 884/5772 [1:33:04<8:22:26, 6.17s/it] {'loss': 0.4828, 'learning_rate': 1.9216629531950014e-05, 'epoch': 0.15} + 15%|█▌ | 884/5772 [1:33:04<8:22:26, 6.17s/it] {'loss': 0.4828, 'learning_rate': 1.9216629531950014e-05, 'epoch': 0.15} + 15%|█▌ | 884/5772 [1:32:57<8:22:26, 6.17s/it] 15%|█▌ | 885/5772 [1:33:03<8:30:23, 6.27s/it] 15%|█▌ | 885/5772 [1:33:10<8:30:24, 6.27s/it] {'loss': 0.4791, 'learning_rate': 1.9214450675855832e-05, 'epoch': 0.15} + 15%|█▌ | 885/5772 [1:33:10<8:30:24, 6.27s/it] {'loss': 0.4791, 'learning_rate': 1.9214450675855832e-05, 'epoch': 0.15} + 15%|█▌ | 885/5772 [1:33:03<8:30:23, 6.27s/it] 15%|█▌ | 886/5772 [1:33:09<8:23:35, 6.18s/it] 15%|█▌ | 886/5772 [1:33:16<8:23:35, 6.18s/it] {'loss': 0.4885, 'learning_rate': 1.9212268917721643e-05, 'epoch': 0.15} + 15%|█▌ | 886/5772 [1:33:16<8:23:35, 6.18s/it] {'loss': 0.4885, 'learning_rate': 1.9212268917721643e-05, 'epoch': 0.15} + 15%|█▌ | 886/5772 [1:33:09<8:23:35, 6.18s/it] 15%|█▌ | 887/5772 [1:33:16<8:34:10, 6.32s/it] 15%|█▌ | 887/5772 [1:33:23<8:34:10, 6.32s/it] {'loss': 0.4964, 'learning_rate': 1.9210084258234576e-05, 'epoch': 0.15} + 15%|█▌ | 887/5772 [1:33:23<8:34:10, 6.32s/it] {'loss': 0.4964, 'learning_rate': 1.9210084258234576e-05, 'epoch': 0.15} + 15%|█▌ | 887/5772 [1:33:16<8:34:10, 6.32s/it] 15%|█▌ | 888/5772 [1:33:29<8:34:54, 6.33s/it] 15%|█▌ | 888/5772 [1:33:22<8:34:54, 6.33s/it] {'loss': 0.5053, 'learning_rate': 1.920789669808268e-05, 'epoch': 0.15} + 15%|█▌ | 888/5772 [1:33:29<8:34:54, 6.33s/it] {'loss': 0.5053, 'learning_rate': 1.920789669808268e-05, 'epoch': 0.15} + 15%|█▌ | 888/5772 [1:33:22<8:34:54, 6.33s/it] 15%|█▌ | 889/5772 [1:33:35<8:30:57, 6.28s/it] 15%|█▌ | 889/5772 [1:33:28<8:30:57, 6.28s/it] {'loss': 0.491, 'learning_rate': 1.9205706237954914e-05, 'epoch': 0.15} + 15%|█▌ | 889/5772 [1:33:35<8:30:57, 6.28s/it] {'loss': 0.491, 'learning_rate': 1.9205706237954914e-05, 'epoch': 0.15} + 15%|█▌ | 889/5772 [1:33:28<8:30:57, 6.28s/it] 15%|█▌ | 890/5772 [1:33:34<8:24:36, 6.20s/it] 15%|█▌ | 890/5772 [1:33:41<8:24:37, 6.20s/it] {'loss': 0.4959, 'learning_rate': 1.9203512878541156e-05, 'epoch': 0.15} + 15%|█▌ | 890/5772 [1:33:41<8:24:37, 6.20s/it] {'loss': 0.4959, 'learning_rate': 1.9203512878541156e-05, 'epoch': 0.15} + 15%|█▌ | 890/5772 [1:33:34<8:24:36, 6.20s/it] 15%|█▌ | 891/5772 [1:33:47<8:24:23, 6.20s/it] 15%|█▌ | 891/5772 [1:33:40<8:24:23, 6.20s/it] {'loss': 0.4881, 'learning_rate': 1.9201316620532186e-05, 'epoch': 0.15} + 15%|█▌ | 891/5772 [1:33:47<8:24:23, 6.20s/it] {'loss': 0.4881, 'learning_rate': 1.9201316620532186e-05, 'epoch': 0.15} + 15%|█▌ | 891/5772 [1:33:40<8:24:23, 6.20s/it] 15%|█▌ | 892/5772 [1:33:47<8:23:32, 6.19s/it] 15%|█▌ | 892/5772 [1:33:54<8:23:33, 6.19s/it] {'loss': 0.4935, 'learning_rate': 1.919911746461971e-05, 'epoch': 0.15} + 15%|█▌ | 892/5772 [1:33:54<8:23:33, 6.19s/it] {'loss': 0.4935, 'learning_rate': 1.919911746461971e-05, 'epoch': 0.15} + 15%|█▌ | 892/5772 [1:33:47<8:23:32, 6.19s/it] 15%|█▌ | 893/5772 [1:33:53<8:25:01, 6.21s/it] 15%|█▌ | 893/5772 [1:34:00<8:25:03, 6.21s/it] {'loss': 0.4887, 'learning_rate': 1.919691541149633e-05, 'epoch': 0.15} + 15%|█▌ | 893/5772 [1:34:00<8:25:03, 6.21s/it] {'loss': 0.4887, 'learning_rate': 1.919691541149633e-05, 'epoch': 0.15} + 15%|█▌ | 893/5772 [1:33:53<8:25:01, 6.21s/it] 15%|█▌ | 894/5772 [1:33:59<8:15:45, 6.10s/it] 15%|█▌ | 894/5772 [1:34:06<8:15:45, 6.10s/it] {'loss': 0.4885, 'learning_rate': 1.919471046185558e-05, 'epoch': 0.15} + 15%|█▌ | 894/5772 [1:34:06<8:15:45, 6.10s/it] {'loss': 0.4885, 'learning_rate': 1.919471046185558e-05, 'epoch': 0.15} + 15%|█▌ | 894/5772 [1:33:59<8:15:45, 6.10s/it] 16%|█▌ | 895/5772 [1:34:05<8:15:18, 6.09s/it] 16%|█▌ | 895/5772 [1:34:12<8:15:18, 6.09s/it] {'loss': 0.4915, 'learning_rate': 1.919250261639189e-05, 'epoch': 0.16} + 16%|█▌ | 895/5772 [1:34:12<8:15:18, 6.09s/it] {'loss': 0.4915, 'learning_rate': 1.919250261639189e-05, 'epoch': 0.16} + 16%|█▌ | 895/5772 [1:34:05<8:15:18, 6.09s/it] 16%|█▌ | 896/5772 [1:34:11<8:22:21, 6.18s/it] 16%|█▌ | 896/5772 [1:34:18<8:22:20, 6.18s/it] {'loss': 0.4944, 'learning_rate': 1.9190291875800616e-05, 'epoch': 0.16} + 16%|█▌ | 896/5772 [1:34:18<8:22:20, 6.18s/it] {'loss': 0.4944, 'learning_rate': 1.9190291875800616e-05, 'epoch': 0.16} + 16%|█▌ | 896/5772 [1:34:11<8:22:21, 6.18s/it] 16%|█▌ | 897/5772 [1:34:17<8:14:29, 6.09s/it] 16%|█▌ | 897/5772 [1:34:24<8:14:29, 6.09s/it] {'loss': 0.4916, 'learning_rate': 1.918807824077801e-05, 'epoch': 0.16} + 16%|█▌ | 897/5772 [1:34:24<8:14:29, 6.09s/it] {'loss': 0.4916, 'learning_rate': 1.918807824077801e-05, 'epoch': 0.16} + 16%|█▌ | 897/5772 [1:34:17<8:14:29, 6.09s/it] 16%|█▌ | 898/5772 [1:34:23<8:16:18, 6.11s/it] 16%|█▌ | 898/5772 [1:34:30<8:16:19, 6.11s/it] {'loss': 0.4934, 'learning_rate': 1.918586171202125e-05, 'epoch': 0.16} + 16%|█▌ | 898/5772 [1:34:30<8:16:19, 6.11s/it] {'loss': 0.4934, 'learning_rate': 1.918586171202125e-05, 'epoch': 0.16} + 16%|█▌ | 898/5772 [1:34:23<8:16:18, 6.11s/it] 16%|█▌ | 899/5772 [1:34:36<8:20:20, 6.16s/it] 16%|█▌ | 899/5772 [1:34:29<8:20:21, 6.16s/it] {'loss': 0.4836, 'learning_rate': 1.9183642290228415e-05, 'epoch': 0.16} + 16%|█▌ | 899/5772 [1:34:36<8:20:20, 6.16s/it] {'loss': 0.4836, 'learning_rate': 1.9183642290228415e-05, 'epoch': 0.16} + 16%|█▌ | 899/5772 [1:34:29<8:20:21, 6.16s/it]4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +51 6AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1512 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 16%|█▌ | 900/5772 [1:34:42<8:15:59, 6.11s/it]0 3 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 16%|█▌ | 900/5772 [1:34:35<8:16:00, 6.11s/it] {'loss': 0.5045, 'learning_rate': 1.9181419976098503e-05, 'epoch': 0.16} + 16%|█▌ | 900/5772 [1:34:42<8:15:59, 6.11s/it] {'loss': 0.5045, 'learning_rate': 1.9181419976098503e-05, 'epoch': 0.16} + 16%|█▌ | 900/5772 [1:34:35<8:16:00, 6.11s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 16%|█▌ | 901/5772 [1:35:01<13:22:25, 9.88s/it] 16%|█▌ | 901/5772 [1:34:54<13:22:25, 9.88s/it] {'loss': 0.4787, 'learning_rate': 1.9179194770331418e-05, 'epoch': 0.16} + 16%|█▌ | 901/5772 [1:35:01<13:22:25, 9.88s/it] {'loss': 0.4787, 'learning_rate': 1.9179194770331418e-05, 'epoch': 0.16} + 16%|█▌ | 901/5772 [1:34:54<13:22:25, 9.88s/it] 16%|█▌ | 902/5772 [1:35:07<11:45:44, 8.69s/it] 16%|█▌ | 902/5772 [1:35:00<11:45:44, 8.69s/it] {'loss': 0.4937, 'learning_rate': 1.917696667362798e-05, 'epoch': 0.16} + 16%|█▌ | 902/5772 [1:35:07<11:45:44, 8.69s/it] {'loss': 0.4937, 'learning_rate': 1.917696667362798e-05, 'epoch': 0.16} + 16%|█▌ | 902/5772 [1:35:00<11:45:44, 8.69s/it] 16%|█▌ | 903/5772 [1:35:06<10:41:31, 7.91s/it] 16%|█▌ | 903/5772 [1:35:13<10:41:31, 7.91s/it] {'loss': 0.4817, 'learning_rate': 1.917473568668991e-05, 'epoch': 0.16} + 16%|█▌ | 903/5772 [1:35:13<10:41:31, 7.91s/it] {'loss': 0.4817, 'learning_rate': 1.917473568668991e-05, 'epoch': 0.16} + 16%|█▌ | 903/5772 [1:35:06<10:41:31, 7.91s/it] 16%|█▌ | 904/5772 [1:35:12<9:59:06, 7.38s/it] 16%|█▌ | 904/5772 [1:35:19<9:59:07, 7.38s/it] {'loss': 0.4836, 'learning_rate': 1.9172501810219844e-05, 'epoch': 0.16} + 16%|█▌ | 904/5772 [1:35:19<9:59:07, 7.38s/it] {'loss': 0.4836, 'learning_rate': 1.9172501810219844e-05, 'epoch': 0.16} + 16%|█▌ | 904/5772 [1:35:12<9:59:06, 7.38s/it] 16%|█▌ | 905/5772 [1:35:18<9:24:31, 6.96s/it] 16%|█▌ | 905/5772 [1:35:25<9:24:31, 6.96s/it] {'loss': 0.472, 'learning_rate': 1.9170265044921338e-05, 'epoch': 0.16} + 16%|█▌ | 905/5772 [1:35:25<9:24:31, 6.96s/it] {'loss': 0.472, 'learning_rate': 1.9170265044921338e-05, 'epoch': 0.16} + 16%|█▌ | 905/5772 [1:35:18<9:24:31, 6.96s/it] 16%|█▌ | 906/5772 [1:35:24<9:05:13, 6.72s/it] 16%|█▌ | 906/5772 [1:35:31<9:05:13, 6.72s/it] {'loss': 0.5059, 'learning_rate': 1.9168025391498837e-05, 'epoch': 0.16} + 16%|█▌ | 906/5772 [1:35:31<9:05:13, 6.72s/it] {'loss': 0.5059, 'learning_rate': 1.9168025391498837e-05, 'epoch': 0.16} + 16%|█▌ | 906/5772 [1:35:24<9:05:13, 6.72s/it] 16%|█▌ | 907/5772 [1:35:30<8:44:24, 6.47s/it] 16%|█▌ | 907/5772 [1:35:37<8:44:24, 6.47s/it] {'loss': 0.4813, 'learning_rate': 1.9165782850657716e-05, 'epoch': 0.16} + 16%|█▌ | 907/5772 [1:35:37<8:44:24, 6.47s/it] {'loss': 0.4813, 'learning_rate': 1.9165782850657716e-05, 'epoch': 0.16} + 16%|█▌ | 907/5772 [1:35:30<8:44:24, 6.47s/it] 16%|█▌ | 908/5772 [1:35:44<8:47:16, 6.50s/it] 16%|█▌ | 908/5772 [1:35:37<8:47:16, 6.50s/it] {'loss': 0.496, 'learning_rate': 1.916353742310425e-05, 'epoch': 0.16} + 16%|█▌ | 908/5772 [1:35:44<8:47:16, 6.50s/it] {'loss': 0.496, 'learning_rate': 1.916353742310425e-05, 'epoch': 0.16} + 16%|█▌ | 908/5772 [1:35:37<8:47:16, 6.50s/it] 16%|█▌ | 909/5772 [1:35:43<8:34:15, 6.34s/it] 16%|█▌ | 909/5772 [1:35:50<8:34:15, 6.34s/it] {'loss': 0.4805, 'learning_rate': 1.916128910954562e-05, 'epoch': 0.16} + 16%|█▌ | 909/5772 [1:35:50<8:34:15, 6.34s/it] {'loss': 0.4805, 'learning_rate': 1.916128910954562e-05, 'epoch': 0.16} + 16%|█▌ | 909/5772 [1:35:43<8:34:15, 6.34s/it] 16%|█▌ | 910/5772 [1:35:49<8:27:23, 6.26s/it] 16%|█▌ | 910/5772 [1:35:56<8:27:23, 6.26s/it] {'loss': 0.4943, 'learning_rate': 1.9159037910689925e-05, 'epoch': 0.16} + 16%|█▌ | 910/5772 [1:35:56<8:27:23, 6.26s/it] {'loss': 0.4943, 'learning_rate': 1.9159037910689925e-05, 'epoch': 0.16} + 16%|█▌ | 910/5772 [1:35:49<8:27:23, 6.26s/it] 16%|█▌ | 911/5772 [1:35:55<8:20:40, 6.18s/it] 16%|█▌ | 911/5772 [1:36:02<8:20:40, 6.18s/it] {'loss': 0.4927, 'learning_rate': 1.915678382724616e-05, 'epoch': 0.16} + 16%|█▌ | 911/5772 [1:36:02<8:20:40, 6.18s/it] {'loss': 0.4927, 'learning_rate': 1.915678382724616e-05, 'epoch': 0.16} + 16%|█▌ | 911/5772 [1:35:55<8:20:40, 6.18s/it] 16%|█▌ | 912/5772 [1:36:08<8:18:35, 6.16s/it] 16%|█▌ | 912/5772 [1:36:01<8:18:35, 6.16s/it] {'loss': 0.4999, 'learning_rate': 1.9154526859924242e-05, 'epoch': 0.16} + 16%|█▌ | 912/5772 [1:36:08<8:18:35, 6.16s/it] {'loss': 0.4999, 'learning_rate': 1.9154526859924242e-05, 'epoch': 0.16} + 16%|█▌ | 912/5772 [1:36:01<8:18:35, 6.16s/it] 16%|█▌ | 913/5772 [1:36:07<8:23:23, 6.22s/it] 16%|█▌ | 913/5772 [1:36:14<8:23:23, 6.22s/it] {'loss': 0.487, 'learning_rate': 1.915226700943499e-05, 'epoch': 0.16} + 16%|█▌ | 913/5772 [1:36:14<8:23:23, 6.22s/it] {'loss': 0.487, 'learning_rate': 1.915226700943499e-05, 'epoch': 0.16} + 16%|█▌ | 913/5772 [1:36:07<8:23:23, 6.22s/it] 16%|█▌ | 914/5772 [1:36:21<8:24:48, 6.23s/it] 16%|█▌ | 914/5772 [1:36:14<8:24:50, 6.24s/it] {'loss': 0.4971, 'learning_rate': 1.915000427649013e-05, 'epoch': 0.16} + 16%|█▌ | 914/5772 [1:36:21<8:24:48, 6.23s/it] {'loss': 0.4971, 'learning_rate': 1.915000427649013e-05, 'epoch': 0.16} + 16%|█▌ | 914/5772 [1:36:14<8:24:50, 6.24s/it] 16%|█▌ | 915/5772 [1:36:20<8:21:48, 6.20s/it] 16%|█▌ | 915/5772 [1:36:27<8:21:49, 6.20s/it] {'loss': 0.4923, 'learning_rate': 1.9147738661802295e-05, 'epoch': 0.16} + 16%|█▌ | 915/5772 [1:36:27<8:21:49, 6.20s/it] {'loss': 0.4923, 'learning_rate': 1.9147738661802295e-05, 'epoch': 0.16} + 16%|█▌ | 915/5772 [1:36:20<8:21:48, 6.20s/it] 16%|█▌ | 916/5772 [1:36:33<8:19:34, 6.17s/it] 16%|█▌ | 916/5772 [1:36:26<8:19:34, 6.17s/it] {'loss': 0.4784, 'learning_rate': 1.9145470166085034e-05, 'epoch': 0.16} + 16%|█▌ | 916/5772 [1:36:33<8:19:34, 6.17s/it] {'loss': 0.4784, 'learning_rate': 1.9145470166085034e-05, 'epoch': 0.16} + 16%|█▌ | 916/5772 [1:36:26<8:19:34, 6.17s/it] 16%|█▌ | 917/5772 [1:36:39<8:14:03, 6.11s/it] 16%|█▌ | 917/5772 [1:36:32<8:14:04, 6.11s/it] {'loss': 0.4825, 'learning_rate': 1.9143198790052788e-05, 'epoch': 0.16} + 16%|█▌ | 917/5772 [1:36:39<8:14:03, 6.11s/it] {'loss': 0.4825, 'learning_rate': 1.9143198790052788e-05, 'epoch': 0.16} + 16%|█▌ | 917/5772 [1:36:32<8:14:04, 6.11s/it] 16%|█▌ | 918/5772 [1:36:45<8:18:14, 6.16s/it] 16%|█▌ | 918/5772 [1:36:38<8:18:14, 6.16s/it] {'loss': 0.4898, 'learning_rate': 1.9140924534420924e-05, 'epoch': 0.16} + 16%|█▌ | 918/5772 [1:36:45<8:18:14, 6.16s/it] {'loss': 0.4898, 'learning_rate': 1.9140924534420924e-05, 'epoch': 0.16} + 16%|█▌ | 918/5772 [1:36:38<8:18:14, 6.16s/it] 16%|█▌ | 919/5772 [1:36:44<8:16:15, 6.14s/it] 16%|█▌ | 919/5772 [1:36:51<8:16:16, 6.14s/it] {'loss': 0.4846, 'learning_rate': 1.91386473999057e-05, 'epoch': 0.16} + 16%|█▌ | 919/5772 [1:36:51<8:16:16, 6.14s/it] {'loss': 0.4846, 'learning_rate': 1.91386473999057e-05, 'epoch': 0.16} + 16%|█▌ | 919/5772 [1:36:44<8:16:15, 6.14s/it] 16%|█▌ | 920/5772 [1:36:50<8:16:25, 6.14s/it] 16%|█▌ | 920/5772 [1:36:57<8:16:25, 6.14s/it] {'loss': 0.4944, 'learning_rate': 1.9136367387224288e-05, 'epoch': 0.16} + 16%|█▌ | 920/5772 [1:36:57<8:16:25, 6.14s/it] {'loss': 0.4944, 'learning_rate': 1.9136367387224288e-05, 'epoch': 0.16} + 16%|█▌ | 920/5772 [1:36:50<8:16:25, 6.14s/it] 16%|█▌ | 921/5772 [1:37:04<8:23:37, 6.23s/it] 16%|█▌ | 921/5772 [1:36:57<8:23:37, 6.23s/it] {'loss': 0.4891, 'learning_rate': 1.9134084497094766e-05, 'epoch': 0.16} + 16%|█▌ | 921/5772 [1:37:04<8:23:37, 6.23s/it] {'loss': 0.4891, 'learning_rate': 1.9134084497094766e-05, 'epoch': 0.16} + 16%|█▌ | 921/5772 [1:36:57<8:23:37, 6.23s/it] 16%|█▌ | 922/5772 [1:37:10<8:23:23, 6.23s/it] 16%|█▌ | 922/5772 [1:37:03<8:23:24, 6.23s/it] {'loss': 0.4883, 'learning_rate': 1.9131798730236116e-05, 'epoch': 0.16} + 16%|█▌ | 922/5772 [1:37:10<8:23:23, 6.23s/it] {'loss': 0.4883, 'learning_rate': 1.9131798730236116e-05, 'epoch': 0.16} + 16%|█▌ | 922/5772 [1:37:03<8:23:24, 6.23s/it] 16%|█▌ | 923/5772 [1:37:09<8:22:02, 6.21s/it] 16%|█▌ | 923/5772 [1:37:16<8:22:03, 6.21s/it] {'loss': 0.488, 'learning_rate': 1.9129510087368234e-05, 'epoch': 0.16} + 16%|█▌ | 923/5772 [1:37:16<8:22:03, 6.21s/it] {'loss': 0.488, 'learning_rate': 1.9129510087368234e-05, 'epoch': 0.16} + 16%|█▌ | 923/5772 [1:37:09<8:22:02, 6.21s/it] 16%|█▌ | 924/5772 [1:37:16<8:26:03, 6.26s/it] 16%|█▌ | 924/5772 [1:37:23<8:26:03, 6.26s/it] {'loss': 0.4985, 'learning_rate': 1.9127218569211905e-05, 'epoch': 0.16} + 16%|█▌ | 924/5772 [1:37:23<8:26:03, 6.26s/it] {'loss': 0.4985, 'learning_rate': 1.9127218569211905e-05, 'epoch': 0.16} + 16%|█▌ | 924/5772 [1:37:16<8:26:03, 6.26s/it] 16%|█▌ | 925/5772 [1:37:21<8:18:09, 6.17s/it] 16%|█▌ | 925/5772 [1:37:28<8:18:10, 6.17s/it] {'loss': 0.4915, 'learning_rate': 1.9124924176488838e-05, 'epoch': 0.16} + 16%|█▌ | 925/5772 [1:37:28<8:18:10, 6.17s/it] {'loss': 0.4915, 'learning_rate': 1.9124924176488838e-05, 'epoch': 0.16} + 16%|█▌ | 925/5772 [1:37:21<8:18:09, 6.17s/it] 16%|█▌ | 926/5772 [1:37:28<8:20:54, 6.20s/it] 16%|█▌ | 926/5772 [1:37:35<8:20:54, 6.20s/it] {'loss': 0.4934, 'learning_rate': 1.9122626909921637e-05, 'epoch': 0.16} + 16%|█▌ | 926/5772 [1:37:35<8:20:54, 6.20s/it] {'loss': 0.4934, 'learning_rate': 1.9122626909921637e-05, 'epoch': 0.16} + 16%|█▌ | 926/5772 [1:37:28<8:20:54, 6.20s/it] 16%|█▌ | 927/5772 [1:37:41<8:16:53, 6.15s/it] 16%|█▌ | 927/5772 [1:37:34<8:16:53, 6.15s/it] {'loss': 0.4974, 'learning_rate': 1.912032677023381e-05, 'epoch': 0.16} + 16%|█▌ | 927/5772 [1:37:41<8:16:53, 6.15s/it] {'loss': 0.4974, 'learning_rate': 1.912032677023381e-05, 'epoch': 0.16} + 16%|█▌ | 927/5772 [1:37:34<8:16:53, 6.15s/it] 16%|█▌ | 928/5772 [1:37:47<8:15:33, 6.14s/it] 16%|█▌ | 928/5772 [1:37:40<8:15:33, 6.14s/it] {'loss': 0.4957, 'learning_rate': 1.9118023758149777e-05, 'epoch': 0.16} + 16%|█▌ | 928/5772 [1:37:47<8:15:33, 6.14s/it] {'loss': 0.4957, 'learning_rate': 1.9118023758149777e-05, 'epoch': 0.16} + 16%|█▌ | 928/5772 [1:37:40<8:15:33, 6.14s/it] 16%|█▌ | 929/5772 [1:37:53<8:12:48, 6.11s/it] 16%|█▌ | 929/5772 [1:37:46<8:12:48, 6.11s/it] {'loss': 0.4833, 'learning_rate': 1.9115717874394856e-05, 'epoch': 0.16} + 16%|█▌ | 929/5772 [1:37:53<8:12:48, 6.11s/it] {'loss': 0.4833, 'learning_rate': 1.9115717874394856e-05, 'epoch': 0.16} + 16%|█▌ | 929/5772 [1:37:46<8:12:48, 6.11s/it] 16%|█▌ | 930/5772 [1:37:59<8:20:33, 6.20s/it] 16%|█▌ | 930/5772 [1:37:52<8:20:33, 6.20s/it] {'loss': 0.4893, 'learning_rate': 1.9113409119695276e-05, 'epoch': 0.16} + 16%|█▌ | 930/5772 [1:37:59<8:20:33, 6.20s/it] {'loss': 0.4893, 'learning_rate': 1.9113409119695276e-05, 'epoch': 0.16} + 16%|█▌ | 930/5772 [1:37:52<8:20:33, 6.20s/it] 16%|█▌ | 931/5772 [1:37:59<8:26:43, 6.28s/it] 16%|█▌ | 931/5772 [1:38:06<8:26:44, 6.28s/it] {'loss': 0.4852, 'learning_rate': 1.9111097494778164e-05, 'epoch': 0.16} + 16%|█▌ | 931/5772 [1:38:06<8:26:44, 6.28s/it] {'loss': 0.4852, 'learning_rate': 1.9111097494778164e-05, 'epoch': 0.16} + 16%|█▌ | 931/5772 [1:37:59<8:26:43, 6.28s/it] 16%|█▌ | 932/5772 [1:38:12<8:18:17, 6.18s/it] 16%|█▌ | 932/5772 [1:38:05<8:18:18, 6.18s/it] {'loss': 0.488, 'learning_rate': 1.9108783000371555e-05, 'epoch': 0.16} + 16%|█▌ | 932/5772 [1:38:12<8:18:17, 6.18s/it] {'loss': 0.488, 'learning_rate': 1.9108783000371555e-05, 'epoch': 0.16} + 16%|█▌ | 932/5772 [1:38:05<8:18:18, 6.18s/it] 16%|█▌ | 933/5772 [1:38:18<8:17:36, 6.17s/it] 16%|█▌ | 933/5772 [1:38:11<8:17:36, 6.17s/it] {'loss': 0.4887, 'learning_rate': 1.910646563720439e-05, 'epoch': 0.16} + 16%|█▌ | 933/5772 [1:38:18<8:17:36, 6.17s/it] {'loss': 0.4887, 'learning_rate': 1.910646563720439e-05, 'epoch': 0.16} + 16%|█▌ | 933/5772 [1:38:11<8:17:36, 6.17s/it] 16%|█▌ | 934/5772 [1:38:24<8:15:53, 6.15s/it] 16%|█▌ | 934/5772 [1:38:17<8:15:53, 6.15s/it] {'loss': 0.4851, 'learning_rate': 1.9104145406006495e-05, 'epoch': 0.16} + 16%|█▌ | 934/5772 [1:38:24<8:15:53, 6.15s/it] {'loss': 0.4851, 'learning_rate': 1.9104145406006495e-05, 'epoch': 0.16} + 16%|█▌ | 934/5772 [1:38:17<8:15:53, 6.15s/it] 16%|█▌ | 935/5772 [1:38:30<8:14:10, 6.13s/it] 16%|█▌ | 935/5772 [1:38:23<8:14:10, 6.13s/it] {'loss': 0.4894, 'learning_rate': 1.9101822307508628e-05, 'epoch': 0.16} + 16%|█▌ | 935/5772 [1:38:30<8:14:10, 6.13s/it] {'loss': 0.4894, 'learning_rate': 1.9101822307508628e-05, 'epoch': 0.16} + 16%|█▌ | 935/5772 [1:38:23<8:14:10, 6.13s/it] 16%|█▌ | 936/5772 [1:38:36<8:09:25, 6.07s/it] 16%|█▌ | 936/5772 [1:38:29<8:09:25, 6.07s/it] {'loss': 0.4945, 'learning_rate': 1.9099496342442432e-05, 'epoch': 0.16} + 16%|█▌ | 936/5772 [1:38:36<8:09:25, 6.07s/it] {'loss': 0.4945, 'learning_rate': 1.9099496342442432e-05, 'epoch': 0.16} + 16%|█▌ | 936/5772 [1:38:29<8:09:25, 6.07s/it] 16%|█▌ | 937/5772 [1:38:43<8:19:13, 6.20s/it] 16%|█▌ | 937/5772 [1:38:35<8:19:14, 6.20s/it] {'loss': 0.4975, 'learning_rate': 1.9097167511540453e-05, 'epoch': 0.16} + 16%|█▌ | 937/5772 [1:38:43<8:19:13, 6.20s/it] {'loss': 0.4975, 'learning_rate': 1.9097167511540453e-05, 'epoch': 0.16} + 16%|█▌ | 937/5772 [1:38:35<8:19:14, 6.20s/it] 16%|█▋ | 938/5772 [1:38:49<8:20:06, 6.21s/it] 16%|█▋ | 938/5772 [1:38:42<8:20:07, 6.21s/it] {'loss': 0.4941, 'learning_rate': 1.909483581553615e-05, 'epoch': 0.16} + 16%|█▋ | 938/5772 [1:38:49<8:20:06, 6.21s/it] {'loss': 0.4941, 'learning_rate': 1.909483581553615e-05, 'epoch': 0.16} + 16%|█▋ | 938/5772 [1:38:42<8:20:07, 6.21s/it] 16%|█▋ | 939/5772 [1:38:55<8:18:25, 6.19s/it] 16%|█▋ | 939/5772 [1:38:48<8:18:24, 6.19s/it] {'loss': 0.4986, 'learning_rate': 1.9092501255163874e-05, 'epoch': 0.16} + 16%|█▋ | 939/5772 [1:38:55<8:18:25, 6.19s/it] {'loss': 0.4986, 'learning_rate': 1.9092501255163874e-05, 'epoch': 0.16} + 16%|█▋ | 939/5772 [1:38:48<8:18:24, 6.19s/it] 16%|█▋ | 940/5772 [1:39:01<8:09:59, 6.08s/it] 16%|█▋ | 940/5772 [1:38:54<8:09:59, 6.08s/it] {'loss': 0.4856, 'learning_rate': 1.9090163831158883e-05, 'epoch': 0.16} + 16%|█▋ | 940/5772 [1:39:01<8:09:59, 6.08s/it] {'loss': 0.4856, 'learning_rate': 1.9090163831158883e-05, 'epoch': 0.16} + 16%|█▋ | 940/5772 [1:38:54<8:09:59, 6.08s/it] 16%|█▋ | 941/5772 [1:39:07<8:11:19, 6.10s/it] 16%|█▋ | 941/5772 [1:39:00<8:11:19, 6.10s/it] {'loss': 0.4816, 'learning_rate': 1.9087823544257334e-05, 'epoch': 0.16} + 16%|█▋ | 941/5772 [1:39:07<8:11:19, 6.10s/it] {'loss': 0.4816, 'learning_rate': 1.9087823544257334e-05, 'epoch': 0.16} + 16%|█▋ | 941/5772 [1:39:00<8:11:19, 6.10s/it] 16%|█▋ | 942/5772 [1:39:06<8:12:20, 6.12s/it] 16%|█▋ | 942/5772 [1:39:13<8:12:21, 6.12s/it] {'loss': 0.5001, 'learning_rate': 1.9085480395196287e-05, 'epoch': 0.16} + 16%|█▋ | 942/5772 [1:39:13<8:12:21, 6.12s/it] {'loss': 0.5001, 'learning_rate': 1.9085480395196287e-05, 'epoch': 0.16} + 16%|█▋ | 942/5772 [1:39:06<8:12:20, 6.12s/it] 16%|█▋ | 943/5772 [1:39:12<8:06:23, 6.04s/it] 16%|█▋ | 943/5772 [1:39:19<8:06:23, 6.04s/it] {'loss': 0.4829, 'learning_rate': 1.9083134384713708e-05, 'epoch': 0.16} + 16%|█▋ | 943/5772 [1:39:19<8:06:23, 6.04s/it] {'loss': 0.4829, 'learning_rate': 1.9083134384713708e-05, 'epoch': 0.16} + 16%|█▋ | 943/5772 [1:39:12<8:06:23, 6.04s/it] 16%|█▋ | 944/5772 [1:39:18<8:08:02, 6.07s/it] 16%|█▋ | 944/5772 [1:39:25<8:08:02, 6.07s/it] {'loss': 0.5031, 'learning_rate': 1.9080785513548454e-05, 'epoch': 0.16} + 16%|█▋ | 944/5772 [1:39:25<8:08:02, 6.07s/it] {'loss': 0.5031, 'learning_rate': 1.9080785513548454e-05, 'epoch': 0.16} + 16%|█▋ | 944/5772 [1:39:18<8:08:02, 6.07s/it] 16%|█▋ | 945/5772 [1:39:24<8:10:39, 6.10s/it] 16%|█▋ | 945/5772 [1:39:31<8:10:39, 6.10s/it] {'loss': 0.4799, 'learning_rate': 1.9078433782440292e-05, 'epoch': 0.16} + 16%|█▋ | 945/5772 [1:39:31<8:10:39, 6.10s/it] {'loss': 0.4799, 'learning_rate': 1.9078433782440292e-05, 'epoch': 0.16} + 16%|█▋ | 945/5772 [1:39:24<8:10:39, 6.10s/it] 16%|█▋ | 946/5772 [1:39:37<8:10:38, 6.10s/it] 16%|█▋ | 946/5772 [1:39:30<8:10:39, 6.10s/it] {'loss': 0.5045, 'learning_rate': 1.9076079192129886e-05, 'epoch': 0.16} + 16%|█▋ | 946/5772 [1:39:37<8:10:38, 6.10s/it] {'loss': 0.5045, 'learning_rate': 1.9076079192129886e-05, 'epoch': 0.16} + 16%|█▋ | 946/5772 [1:39:30<8:10:39, 6.10s/it] 16%|█▋ | 947/5772 [1:39:37<8:17:14, 6.18s/it] 16%|█▋ | 947/5772 [1:39:44<8:17:14, 6.18s/it] {'loss': 0.4859, 'learning_rate': 1.9073721743358805e-05, 'epoch': 0.16} + 16%|█▋ | 947/5772 [1:39:44<8:17:14, 6.18s/it] {'loss': 0.4859, 'learning_rate': 1.9073721743358805e-05, 'epoch': 0.16} + 16%|█▋ | 947/5772 [1:39:37<8:17:14, 6.18s/it] 16%|█▋ | 948/5772 [1:39:50<8:12:40, 6.13s/it] 16%|█▋ | 948/5772 [1:39:43<8:12:40, 6.13s/it] {'loss': 0.4885, 'learning_rate': 1.907136143686951e-05, 'epoch': 0.16} + 16%|█▋ | 948/5772 [1:39:50<8:12:40, 6.13s/it] {'loss': 0.4885, 'learning_rate': 1.907136143686951e-05, 'epoch': 0.16} + 16%|█▋ | 948/5772 [1:39:43<8:12:40, 6.13s/it] 16%|█▋ | 949/5772 [1:39:56<8:26:07, 6.30s/it] 16%|█▋ | 949/5772 [1:39:49<8:26:07, 6.30s/it] {'loss': 0.5062, 'learning_rate': 1.9068998273405364e-05, 'epoch': 0.16} + 16%|█▋ | 949/5772 [1:39:56<8:26:07, 6.30s/it] {'loss': 0.5062, 'learning_rate': 1.9068998273405364e-05, 'epoch': 0.16} + 16%|█▋ | 949/5772 [1:39:49<8:26:07, 6.30s/it]13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +45 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...3 + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 16%|█▋ | 950/5772 [1:39:55<8:21:23, 6.24s/it] 16%|█▋ | 950/5772 [1:40:02<8:21:23, 6.24s/it]9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4889, 'learning_rate': 1.9066632253710636e-05, 'epoch': 0.16} + 16%|█▋ | 950/5772 [1:40:02<8:21:23, 6.24s/it] {'loss': 0.4889, 'learning_rate': 1.9066632253710636e-05, 'epoch': 0.16} + 16%|█▋ | 950/5772 [1:39:55<8:21:23, 6.24s/it] 16%|█▋ | 951/5772 [1:40:09<8:20:40, 6.23s/it] 16%|█▋ | 951/5772 [1:40:02<8:20:40, 6.23s/it] {'loss': 0.4893, 'learning_rate': 1.9064263378530495e-05, 'epoch': 0.16} + 16%|█▋ | 951/5772 [1:40:09<8:20:40, 6.23s/it] {'loss': 0.4893, 'learning_rate': 1.9064263378530495e-05, 'epoch': 0.16} + 16%|█▋ | 951/5772 [1:40:02<8:20:40, 6.23s/it] 16%|█▋ | 952/5772 [1:40:15<8:18:37, 6.21s/it] 16%|█▋ | 952/5772 [1:40:08<8:18:36, 6.21s/it] {'loss': 0.492, 'learning_rate': 1.9061891648610997e-05, 'epoch': 0.16} + 16%|█▋ | 952/5772 [1:40:15<8:18:37, 6.21s/it] {'loss': 0.492, 'learning_rate': 1.9061891648610997e-05, 'epoch': 0.16} + 16%|█▋ | 952/5772 [1:40:08<8:18:36, 6.21s/it] 17%|█▋ | 953/5772 [1:40:21<8:22:49, 6.26s/it] 17%|█▋ | 953/5772 [1:40:14<8:22:49, 6.26s/it] {'loss': 0.4924, 'learning_rate': 1.905951706469911e-05, 'epoch': 0.17} + 17%|█▋ | 953/5772 [1:40:21<8:22:49, 6.26s/it] {'loss': 0.4924, 'learning_rate': 1.905951706469911e-05, 'epoch': 0.17} + 17%|█▋ | 953/5772 [1:40:14<8:22:49, 6.26s/it] 17%|█▋ | 954/5772 [1:40:20<8:18:33, 6.21s/it] 17%|█▋ | 954/5772 [1:40:27<8:18:33, 6.21s/it] {'loss': 0.5003, 'learning_rate': 1.9057139627542693e-05, 'epoch': 0.17} + 17%|█▋ | 954/5772 [1:40:27<8:18:33, 6.21s/it] {'loss': 0.5003, 'learning_rate': 1.9057139627542693e-05, 'epoch': 0.17} + 17%|█▋ | 954/5772 [1:40:20<8:18:33, 6.21s/it] 17%|█▋ | 955/5772 [1:40:34<8:18:17, 6.21s/it] 17%|█▋ | 955/5772 [1:40:26<8:18:17, 6.21s/it] {'loss': 0.4777, 'learning_rate': 1.905475933789051e-05, 'epoch': 0.17} + 17%|█▋ | 955/5772 [1:40:34<8:18:17, 6.21s/it] {'loss': 0.4777, 'learning_rate': 1.905475933789051e-05, 'epoch': 0.17} + 17%|█▋ | 955/5772 [1:40:26<8:18:17, 6.21s/it] 17%|█▋ | 956/5772 [1:40:33<8:16:16, 6.18s/it] 17%|█▋ | 956/5772 [1:40:40<8:16:16, 6.18s/it] {'loss': 0.4971, 'learning_rate': 1.9052376196492218e-05, 'epoch': 0.17} + 17%|█▋ | 956/5772 [1:40:40<8:16:16, 6.18s/it] {'loss': 0.4971, 'learning_rate': 1.9052376196492218e-05, 'epoch': 0.17} + 17%|█▋ | 956/5772 [1:40:33<8:16:16, 6.18s/it] 17%|█▋ | 957/5772 [1:40:46<8:14:49, 6.17s/it] 17%|█▋ | 957/5772 [1:40:39<8:14:49, 6.17s/it] {'loss': 0.4724, 'learning_rate': 1.904999020409837e-05, 'epoch': 0.17} + 17%|█▋ | 957/5772 [1:40:46<8:14:49, 6.17s/it] {'loss': 0.4724, 'learning_rate': 1.904999020409837e-05, 'epoch': 0.17} + 17%|█▋ | 957/5772 [1:40:39<8:14:49, 6.17s/it] 17%|█▋ | 958/5772 [1:40:45<8:13:43, 6.15s/it] 17%|█▋ | 958/5772 [1:40:52<8:13:43, 6.15s/it] {'loss': 0.4922, 'learning_rate': 1.904760136146043e-05, 'epoch': 0.17} + 17%|█▋ | 958/5772 [1:40:52<8:13:43, 6.15s/it] {'loss': 0.4922, 'learning_rate': 1.904760136146043e-05, 'epoch': 0.17} + 17%|█▋ | 958/5772 [1:40:45<8:13:43, 6.15s/it] 17%|█▋ | 959/5772 [1:40:58<8:13:55, 6.16s/it] 17%|█▋ | 959/5772 [1:40:51<8:13:55, 6.16s/it] {'loss': 0.4811, 'learning_rate': 1.9045209669330747e-05, 'epoch': 0.17} + 17%|█▋ | 959/5772 [1:40:58<8:13:55, 6.16s/it] {'loss': 0.4811, 'learning_rate': 1.9045209669330747e-05, 'epoch': 0.17} + 17%|█▋ | 959/5772 [1:40:51<8:13:55, 6.16s/it] 17%|█▋ | 960/5772 [1:41:04<8:09:21, 6.10s/it] 17%|█▋ | 960/5772 [1:40:57<8:09:21, 6.10s/it] {'loss': 0.4858, 'learning_rate': 1.904281512846257e-05, 'epoch': 0.17} + 17%|█▋ | 960/5772 [1:41:04<8:09:21, 6.10s/it] {'loss': 0.4858, 'learning_rate': 1.904281512846257e-05, 'epoch': 0.17} + 17%|█▋ | 960/5772 [1:40:57<8:09:21, 6.10s/it] 17%|█▋ | 961/5772 [1:41:10<8:10:32, 6.12s/it] 17%|█▋ | 961/5772 [1:41:03<8:10:32, 6.12s/it] {'loss': 0.4897, 'learning_rate': 1.904041773961004e-05, 'epoch': 0.17} + 17%|█▋ | 961/5772 [1:41:10<8:10:32, 6.12s/it] {'loss': 0.4897, 'learning_rate': 1.904041773961004e-05, 'epoch': 0.17} + 17%|█▋ | 961/5772 [1:41:03<8:10:32, 6.12s/it] 17%|█▋ | 962/5772 [1:41:16<8:08:34, 6.09s/it] 17%|█▋ | 962/5772 [1:41:09<8:08:34, 6.09s/it] {'loss': 0.4933, 'learning_rate': 1.9038017503528215e-05, 'epoch': 0.17} + 17%|█▋ | 962/5772 [1:41:16<8:08:34, 6.09s/it] {'loss': 0.4933, 'learning_rate': 1.9038017503528215e-05, 'epoch': 0.17} + 17%|█▋ | 962/5772 [1:41:09<8:08:34, 6.09s/it] 17%|█▋ | 963/5772 [1:41:15<8:06:23, 6.07s/it] 17%|█▋ | 963/5772 [1:41:22<8:06:24, 6.07s/it] {'loss': 0.4866, 'learning_rate': 1.9035614420973026e-05, 'epoch': 0.17} + 17%|█▋ | 963/5772 [1:41:22<8:06:24, 6.07s/it] {'loss': 0.4866, 'learning_rate': 1.9035614420973026e-05, 'epoch': 0.17} + 17%|█▋ | 963/5772 [1:41:15<8:06:23, 6.07s/it] 17%|█▋ | 964/5772 [1:41:28<8:07:00, 6.08s/it] 17%|█▋ | 964/5772 [1:41:21<8:07:00, 6.08s/it] {'loss': 0.4981, 'learning_rate': 1.9033208492701316e-05, 'epoch': 0.17} + 17%|█▋ | 964/5772 [1:41:28<8:07:00, 6.08s/it] {'loss': 0.4981, 'learning_rate': 1.9033208492701316e-05, 'epoch': 0.17} + 17%|█▋ | 964/5772 [1:41:21<8:07:00, 6.08s/it] 17%|█▋ | 965/5772 [1:41:35<8:14:22, 6.17s/it] 17%|█▋ | 965/5772 [1:41:28<8:14:23, 6.17s/it] {'loss': 0.4943, 'learning_rate': 1.903079971947081e-05, 'epoch': 0.17} + 17%|█▋ | 965/5772 [1:41:35<8:14:22, 6.17s/it] {'loss': 0.4943, 'learning_rate': 1.903079971947081e-05, 'epoch': 0.17} + 17%|█▋ | 965/5772 [1:41:28<8:14:23, 6.17s/it] 17%|█▋ | 966/5772 [1:41:41<8:06:18, 6.07s/it] 17%|█▋ | 966/5772 [1:41:34<8:06:18, 6.07s/it] {'loss': 0.4872, 'learning_rate': 1.902838810204015e-05, 'epoch': 0.17} + 17%|█▋ | 966/5772 [1:41:41<8:06:18, 6.07s/it] {'loss': 0.4872, 'learning_rate': 1.902838810204015e-05, 'epoch': 0.17} + 17%|█▋ | 966/5772 [1:41:34<8:06:18, 6.07s/it] 17%|█▋ | 967/5772 [1:41:47<8:09:34, 6.11s/it] 17%|█▋ | 967/5772 [1:41:40<8:09:34, 6.11s/it] {'loss': 0.4959, 'learning_rate': 1.9025973641168854e-05, 'epoch': 0.17} + 17%|█▋ | 967/5772 [1:41:47<8:09:34, 6.11s/it] {'loss': 0.4959, 'learning_rate': 1.9025973641168854e-05, 'epoch': 0.17} + 17%|█▋ | 967/5772 [1:41:40<8:09:34, 6.11s/it] 17%|█▋ | 968/5772 [1:41:46<8:12:08, 6.15s/it] 17%|█▋ | 968/5772 [1:41:53<8:12:08, 6.15s/it] {'loss': 0.4882, 'learning_rate': 1.9023556337617343e-05, 'epoch': 0.17} + 17%|█▋ | 968/5772 [1:41:53<8:12:08, 6.15s/it] {'loss': 0.4882, 'learning_rate': 1.9023556337617343e-05, 'epoch': 0.17} + 17%|█▋ | 968/5772 [1:41:46<8:12:08, 6.15s/it] 17%|█▋ | 969/5772 [1:41:52<8:14:37, 6.18s/it] 17%|█▋ | 969/5772 [1:41:59<8:14:37, 6.18s/it] {'loss': 0.4891, 'learning_rate': 1.9021136192146936e-05, 'epoch': 0.17} + 17%|█▋ | 969/5772 [1:41:59<8:14:37, 6.18s/it] {'loss': 0.4891, 'learning_rate': 1.9021136192146936e-05, 'epoch': 0.17} + 17%|█▋ | 969/5772 [1:41:52<8:14:37, 6.18s/it] 17%|█▋ | 970/5772 [1:42:05<8:14:00, 6.17s/it] 17%|█▋ | 970/5772 [1:41:58<8:14:00, 6.17s/it] {'loss': 0.4902, 'learning_rate': 1.901871320551984e-05, 'epoch': 0.17} + 17%|█▋ | 970/5772 [1:42:05<8:14:00, 6.17s/it] {'loss': 0.4902, 'learning_rate': 1.901871320551984e-05, 'epoch': 0.17} + 17%|█▋ | 970/5772 [1:41:58<8:14:00, 6.17s/it] 17%|█▋ | 971/5772 [1:42:11<8:09:29, 6.12s/it] 17%|█▋ | 971/5772 [1:42:04<8:09:29, 6.12s/it] {'loss': 0.4823, 'learning_rate': 1.9016287378499167e-05, 'epoch': 0.17} + 17%|█▋ | 971/5772 [1:42:11<8:09:29, 6.12s/it] {'loss': 0.4823, 'learning_rate': 1.9016287378499167e-05, 'epoch': 0.17} + 17%|█▋ | 971/5772 [1:42:04<8:09:29, 6.12s/it] 17%|█▋ | 972/5772 [1:42:17<8:05:39, 6.07s/it] 17%|█▋ | 972/5772 [1:42:10<8:05:40, 6.07s/it] {'loss': 0.4887, 'learning_rate': 1.9013858711848914e-05, 'epoch': 0.17} + 17%|█▋ | 972/5772 [1:42:17<8:05:39, 6.07s/it] {'loss': 0.4887, 'learning_rate': 1.9013858711848914e-05, 'epoch': 0.17} + 17%|█▋ | 972/5772 [1:42:10<8:05:40, 6.07s/it] 17%|█▋ | 973/5772 [1:42:24<8:09:59, 6.13s/it] 17%|█▋ | 973/5772 [1:42:17<8:10:00, 6.13s/it] {'loss': 0.4975, 'learning_rate': 1.9011427206333976e-05, 'epoch': 0.17} + 17%|█▋ | 973/5772 [1:42:24<8:09:59, 6.13s/it] {'loss': 0.4975, 'learning_rate': 1.9011427206333976e-05, 'epoch': 0.17} + 17%|█▋ | 973/5772 [1:42:17<8:10:00, 6.13s/it] 17%|█▋ | 974/5772 [1:42:23<8:08:12, 6.11s/it] 17%|█▋ | 974/5772 [1:42:30<8:08:13, 6.11s/it] {'loss': 0.4885, 'learning_rate': 1.9008992862720145e-05, 'epoch': 0.17} + 17%|█▋ | 974/5772 [1:42:30<8:08:13, 6.11s/it] {'loss': 0.4885, 'learning_rate': 1.9008992862720145e-05, 'epoch': 0.17} + 17%|█▋ | 974/5772 [1:42:23<8:08:12, 6.11s/it] 17%|█▋ | 975/5772 [1:42:35<7:59:50, 6.00s/it] 17%|█▋ | 975/5772 [1:42:28<7:59:50, 6.00s/it] {'loss': 0.4819, 'learning_rate': 1.90065556817741e-05, 'epoch': 0.17} + 17%|█▋ | 975/5772 [1:42:35<7:59:50, 6.00s/it] {'loss': 0.4819, 'learning_rate': 1.90065556817741e-05, 'epoch': 0.17} + 17%|█▋ | 975/5772 [1:42:28<7:59:50, 6.00s/it] 17%|█▋ | 976/5772 [1:42:41<8:00:50, 6.02s/it] 17%|█▋ | 976/5772 [1:42:34<8:00:50, 6.02s/it] {'loss': 0.4903, 'learning_rate': 1.900411566426342e-05, 'epoch': 0.17} + 17%|█▋ | 976/5772 [1:42:41<8:00:50, 6.02s/it] {'loss': 0.4903, 'learning_rate': 1.900411566426342e-05, 'epoch': 0.17} + 17%|█▋ | 976/5772 [1:42:34<8:00:50, 6.02s/it] 17%|█▋ | 977/5772 [1:42:48<8:08:46, 6.12s/it] 17%|█▋ | 977/5772 [1:42:41<8:08:46, 6.12s/it] {'loss': 0.4949, 'learning_rate': 1.9001672810956575e-05, 'epoch': 0.17} + 17%|█▋ | 977/5772 [1:42:48<8:08:46, 6.12s/it] {'loss': 0.4949, 'learning_rate': 1.9001672810956575e-05, 'epoch': 0.17} + 17%|█▋ | 977/5772 [1:42:41<8:08:46, 6.12s/it] 17%|█▋ | 978/5772 [1:42:54<8:09:02, 6.12s/it] 17%|█▋ | 978/5772 [1:42:47<8:09:02, 6.12s/it] {'loss': 0.4802, 'learning_rate': 1.899922712262293e-05, 'epoch': 0.17} + 17%|█▋ | 978/5772 [1:42:54<8:09:02, 6.12s/it] {'loss': 0.4802, 'learning_rate': 1.899922712262293e-05, 'epoch': 0.17} + 17%|█▋ | 978/5772 [1:42:47<8:09:02, 6.12s/it] 17%|█▋ | 979/5772 [1:43:00<8:04:51, 6.07s/it] 17%|█▋ | 979/5772 [1:42:53<8:04:51, 6.07s/it] {'loss': 0.488, 'learning_rate': 1.8996778600032736e-05, 'epoch': 0.17} + 17%|█▋ | 979/5772 [1:43:00<8:04:51, 6.07s/it] {'loss': 0.488, 'learning_rate': 1.8996778600032736e-05, 'epoch': 0.17} + 17%|█▋ | 979/5772 [1:42:53<8:04:51, 6.07s/it] 17%|█▋ | 980/5772 [1:43:06<8:05:34, 6.08s/it] 17%|█▋ | 980/5772 [1:42:59<8:05:34, 6.08s/it] {'loss': 0.482, 'learning_rate': 1.8994327243957143e-05, 'epoch': 0.17} + 17%|█▋ | 980/5772 [1:43:06<8:05:34, 6.08s/it] {'loss': 0.482, 'learning_rate': 1.8994327243957143e-05, 'epoch': 0.17} + 17%|█▋ | 980/5772 [1:42:59<8:05:34, 6.08s/it] 17%|█▋ | 981/5772 [1:43:12<8:02:58, 6.05s/it] 17%|█▋ | 981/5772 [1:43:05<8:02:58, 6.05s/it] {'loss': 0.4865, 'learning_rate': 1.8991873055168194e-05, 'epoch': 0.17} + 17%|█▋ | 981/5772 [1:43:12<8:02:58, 6.05s/it] {'loss': 0.4865, 'learning_rate': 1.8991873055168194e-05, 'epoch': 0.17} + 17%|█▋ | 981/5772 [1:43:05<8:02:58, 6.05s/it] 17%|█▋ | 982/5772 [1:43:19<8:16:04, 6.21s/it] 17%|█▋ | 982/5772 [1:43:12<8:16:05, 6.21s/it] {'loss': 0.4803, 'learning_rate': 1.8989416034438823e-05, 'epoch': 0.17} + 17%|█▋ | 982/5772 [1:43:19<8:16:04, 6.21s/it] {'loss': 0.4803, 'learning_rate': 1.8989416034438823e-05, 'epoch': 0.17} + 17%|█▋ | 982/5772 [1:43:12<8:16:05, 6.21s/it] 17%|█▋ | 983/5772 [1:43:25<8:26:08, 6.34s/it] 17%|█▋ | 983/5772 [1:43:18<8:26:08, 6.34s/it] {'loss': 0.502, 'learning_rate': 1.8986956182542853e-05, 'epoch': 0.17} + 17%|█▋ | 983/5772 [1:43:25<8:26:08, 6.34s/it] {'loss': 0.502, 'learning_rate': 1.8986956182542853e-05, 'epoch': 0.17} + 17%|█▋ | 983/5772 [1:43:18<8:26:08, 6.34s/it] 17%|█▋ | 984/5772 [1:43:31<8:19:06, 6.25s/it] 17%|█▋ | 984/5772 [1:43:24<8:19:05, 6.25s/it] {'loss': 0.4831, 'learning_rate': 1.8984493500255e-05, 'epoch': 0.17} + 17%|█▋ | 984/5772 [1:43:31<8:19:06, 6.25s/it] {'loss': 0.4831, 'learning_rate': 1.8984493500255e-05, 'epoch': 0.17} + 17%|█▋ | 984/5772 [1:43:24<8:19:05, 6.25s/it] 17%|█▋ | 985/5772 [1:43:38<8:19:43, 6.26s/it] 17%|█▋ | 985/5772 [1:43:31<8:19:43, 6.26s/it] {'loss': 0.4882, 'learning_rate': 1.8982027988350877e-05, 'epoch': 0.17} + 17%|█▋ | 985/5772 [1:43:38<8:19:43, 6.26s/it] {'loss': 0.4882, 'learning_rate': 1.8982027988350877e-05, 'epoch': 0.17} + 17%|█▋ | 985/5772 [1:43:31<8:19:43, 6.26s/it] 17%|█▋ | 986/5772 [1:43:44<8:20:04, 6.27s/it] 17%|█▋ | 986/5772 [1:43:37<8:20:04, 6.27s/it] {'loss': 0.4845, 'learning_rate': 1.8979559647606973e-05, 'epoch': 0.17} + 17%|█▋ | 986/5772 [1:43:44<8:20:04, 6.27s/it] {'loss': 0.4845, 'learning_rate': 1.8979559647606973e-05, 'epoch': 0.17} + 17%|█▋ | 986/5772 [1:43:37<8:20:04, 6.27s/it] 17%|█▋ | 987/5772 [1:43:50<8:23:59, 6.32s/it] 17%|█▋ | 987/5772 [1:43:43<8:24:00, 6.32s/it] {'loss': 0.4836, 'learning_rate': 1.8977088478800687e-05, 'epoch': 0.17} + 17%|█▋ | 987/5772 [1:43:50<8:23:59, 6.32s/it] {'loss': 0.4836, 'learning_rate': 1.8977088478800687e-05, 'epoch': 0.17} + 17%|█▋ | 987/5772 [1:43:43<8:24:00, 6.32s/it] 17%|█▋ | 988/5772 [1:43:56<8:14:45, 6.21s/it] 17%|█▋ | 988/5772 [1:43:49<8:14:45, 6.21s/it] {'loss': 0.4937, 'learning_rate': 1.89746144827103e-05, 'epoch': 0.17} + 17%|█▋ | 988/5772 [1:43:56<8:14:45, 6.21s/it] {'loss': 0.4937, 'learning_rate': 1.89746144827103e-05, 'epoch': 0.17} + 17%|█▋ | 988/5772 [1:43:49<8:14:45, 6.21s/it] 17%|█▋ | 989/5772 [1:44:03<8:22:22, 6.30s/it] 17%|█▋ | 989/5772 [1:43:56<8:22:22, 6.30s/it] {'loss': 0.5059, 'learning_rate': 1.8972137660114977e-05, 'epoch': 0.17} + 17%|█▋ | 989/5772 [1:44:03<8:22:22, 6.30s/it] {'loss': 0.5059, 'learning_rate': 1.8972137660114977e-05, 'epoch': 0.17} + 17%|█▋ | 989/5772 [1:43:56<8:22:22, 6.30s/it] 17%|█▋ | 990/5772 [1:44:09<8:18:41, 6.26s/it] 17%|█▋ | 990/5772 [1:44:02<8:18:41, 6.26s/it] {'loss': 0.4897, 'learning_rate': 1.8969658011794785e-05, 'epoch': 0.17} + 17%|█▋ | 990/5772 [1:44:09<8:18:41, 6.26s/it] {'loss': 0.4897, 'learning_rate': 1.8969658011794785e-05, 'epoch': 0.17} + 17%|█▋ | 990/5772 [1:44:02<8:18:41, 6.26s/it] 17%|█▋ | 991/5772 [1:44:15<8:18:15, 6.25s/it] 17%|█▋ | 991/5772 [1:44:08<8:18:15, 6.25s/it] {'loss': 0.4968, 'learning_rate': 1.8967175538530675e-05, 'epoch': 0.17} + 17%|█▋ | 991/5772 [1:44:15<8:18:15, 6.25s/it] {'loss': 0.4968, 'learning_rate': 1.8967175538530675e-05, 'epoch': 0.17} + 17%|█▋ | 991/5772 [1:44:08<8:18:15, 6.25s/it] 17%|█▋ | 992/5772 [1:44:15<8:28:24, 6.38s/it] 17%|█▋ | 992/5772 [1:44:22<8:28:24, 6.38s/it] {'loss': 0.5004, 'learning_rate': 1.8964690241104484e-05, 'epoch': 0.17} + 17%|█▋ | 992/5772 [1:44:22<8:28:24, 6.38s/it] {'loss': 0.5004, 'learning_rate': 1.8964690241104484e-05, 'epoch': 0.17} + 17%|█▋ | 992/5772 [1:44:15<8:28:24, 6.38s/it] 17%|█▋ | 993/5772 [1:44:21<8:22:21, 6.31s/it] 17%|█▋ | 993/5772 [1:44:28<8:22:22, 6.31s/it] {'loss': 0.4861, 'learning_rate': 1.8962202120298948e-05, 'epoch': 0.17} + 17%|█▋ | 993/5772 [1:44:28<8:22:22, 6.31s/it] {'loss': 0.4861, 'learning_rate': 1.8962202120298948e-05, 'epoch': 0.17} + 17%|█▋ | 993/5772 [1:44:21<8:22:21, 6.31s/it] 17%|█▋ | 994/5772 [1:44:34<8:11:56, 6.18s/it] 17%|█▋ | 994/5772 [1:44:27<8:11:57, 6.18s/it] {'loss': 0.4945, 'learning_rate': 1.8959711176897682e-05, 'epoch': 0.17} + 17%|█▋ | 994/5772 [1:44:34<8:11:56, 6.18s/it] {'loss': 0.4945, 'learning_rate': 1.8959711176897682e-05, 'epoch': 0.17} + 17%|█▋ | 994/5772 [1:44:27<8:11:57, 6.18s/it] 17%|█▋ | 995/5772 [1:44:40<8:13:07, 6.19s/it] 17%|█▋ | 995/5772 [1:44:33<8:13:08, 6.19s/it] {'loss': 0.4937, 'learning_rate': 1.8957217411685197e-05, 'epoch': 0.17} + 17%|█▋ | 995/5772 [1:44:40<8:13:07, 6.19s/it] {'loss': 0.4937, 'learning_rate': 1.8957217411685197e-05, 'epoch': 0.17} + 17%|█▋ | 995/5772 [1:44:33<8:13:08, 6.19s/it] 17%|█▋ | 996/5772 [1:44:46<8:02:49, 6.07s/it] 17%|█▋ | 996/5772 [1:44:39<8:02:49, 6.07s/it] {'loss': 0.4894, 'learning_rate': 1.8954720825446893e-05, 'epoch': 0.17} + 17%|█▋ | 996/5772 [1:44:46<8:02:49, 6.07s/it] {'loss': 0.4894, 'learning_rate': 1.8954720825446893e-05, 'epoch': 0.17} + 17%|█▋ | 996/5772 [1:44:39<8:02:49, 6.07s/it] 17%|█▋ | 997/5772 [1:44:52<8:04:42, 6.09s/it] 17%|█▋ | 997/5772 [1:44:45<8:04:42, 6.09s/it] {'loss': 0.4847, 'learning_rate': 1.895222141896905e-05, 'epoch': 0.17} + 17%|█▋ | 997/5772 [1:44:52<8:04:42, 6.09s/it] {'loss': 0.4847, 'learning_rate': 1.895222141896905e-05, 'epoch': 0.17} + 17%|█▋ | 997/5772 [1:44:45<8:04:42, 6.09s/it] 17%|█▋ | 998/5772 [1:44:51<7:57:50, 6.01s/it] 17%|█▋ | 998/5772 [1:44:58<7:57:50, 6.01s/it] {'loss': 0.4885, 'learning_rate': 1.8949719193038847e-05, 'epoch': 0.17} + 17%|█▋ | 998/5772 [1:44:58<7:57:50, 6.01s/it] {'loss': 0.4885, 'learning_rate': 1.8949719193038847e-05, 'epoch': 0.17} + 17%|█▋ | 998/5772 [1:44:51<7:57:50, 6.01s/it] 17%|█▋ | 999/5772 [1:44:57<7:59:22, 6.03s/it] 17%|█▋ | 999/5772 [1:45:04<7:59:23, 6.03s/it] {'loss': 0.4816, 'learning_rate': 1.8947214148444346e-05, 'epoch': 0.17} + 17%|█▋ | 999/5772 [1:45:04<7:59:23, 6.03s/it] {'loss': 0.4816, 'learning_rate': 1.8947214148444346e-05, 'epoch': 0.17} + 17%|█▋ | 999/5772 [1:44:57<7:59:22, 6.03s/it]5 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +1112 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +0 17%|█▋ | 1000/5772 [1:45:10<8:04:59, 6.10s/it]4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + + 17%|█▋ | 1000/5772 [1:45:03<8:04:59, 6.10s/it]15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4887, 'learning_rate': 1.8944706285974496e-05, 'epoch': 0.17} + 17%|█▋ | 1000/5772 [1:45:10<8:04:59, 6.10s/it] {'loss': 0.4887, 'learning_rate': 1.8944706285974496e-05, 'epoch': 0.17} + 17%|█▋ | 1000/5772 [1:45:03<8:04:59, 6.10s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 17%|█▋ | 1001/5772 [1:45:29<13:03:45, 9.86s/it] 17%|█▋ | 1001/5772 [1:45:22<13:03:45, 9.86s/it] {'loss': 0.492, 'learning_rate': 1.8942195606419133e-05, 'epoch': 0.17} + 17%|█▋ | 1001/5772 [1:45:29<13:03:45, 9.86s/it] {'loss': 0.492, 'learning_rate': 1.8942195606419133e-05, 'epoch': 0.17} + 17%|█▋ | 1001/5772 [1:45:22<13:03:45, 9.86s/it] 17%|█▋ | 1002/5772 [1:45:35<11:38:43, 8.79s/it] 17%|█▋ | 1002/5772 [1:45:28<11:38:43, 8.79s/it] {'loss': 0.493, 'learning_rate': 1.8939682110568982e-05, 'epoch': 0.17} + 17%|█▋ | 1002/5772 [1:45:35<11:38:43, 8.79s/it] {'loss': 0.493, 'learning_rate': 1.8939682110568982e-05, 'epoch': 0.17} + 17%|█▋ | 1002/5772 [1:45:28<11:38:43, 8.79s/it] 17%|█▋ | 1003/5772 [1:45:34<10:35:29, 8.00s/it] 17%|█▋ | 1003/5772 [1:45:41<10:35:29, 8.00s/it] {'loss': 0.5016, 'learning_rate': 1.8937165799215657e-05, 'epoch': 0.17} + 17%|█▋ | 1003/5772 [1:45:41<10:35:29, 8.00s/it] {'loss': 0.5016, 'learning_rate': 1.8937165799215657e-05, 'epoch': 0.17} + 17%|█▋ | 1003/5772 [1:45:34<10:35:29, 8.00s/it] 17%|█▋ | 1004/5772 [1:45:47<9:49:30, 7.42s/it] 17%|█▋ | 1004/5772 [1:45:40<9:49:30, 7.42s/it] {'loss': 0.4833, 'learning_rate': 1.8934646673151655e-05, 'epoch': 0.17} + 17%|█▋ | 1004/5772 [1:45:47<9:49:30, 7.42s/it] {'loss': 0.4833, 'learning_rate': 1.8934646673151655e-05, 'epoch': 0.17} + 17%|█▋ | 1004/5772 [1:45:40<9:49:30, 7.42s/it] 17%|█▋ | 1005/5772 [1:45:54<9:21:43, 7.07s/it] 17%|█▋ | 1005/5772 [1:45:47<9:21:43, 7.07s/it] {'loss': 0.5154, 'learning_rate': 1.8932124733170357e-05, 'epoch': 0.17} + 17%|█▋ | 1005/5772 [1:45:54<9:21:43, 7.07s/it] {'loss': 0.5154, 'learning_rate': 1.8932124733170357e-05, 'epoch': 0.17} + 17%|█▋ | 1005/5772 [1:45:47<9:21:43, 7.07s/it] 17%|█▋ | 1006/5772 [1:46:00<8:58:03, 6.77s/it] 17%|█▋ | 1006/5772 [1:45:53<8:58:04, 6.77s/it] {'loss': 0.4956, 'learning_rate': 1.8929599980066034e-05, 'epoch': 0.17} + 17%|█▋ | 1006/5772 [1:46:00<8:58:03, 6.77s/it] {'loss': 0.4956, 'learning_rate': 1.8929599980066034e-05, 'epoch': 0.17} + 17%|█▋ | 1006/5772 [1:45:53<8:58:04, 6.77s/it] 17%|█▋ | 1007/5772 [1:46:06<8:45:55, 6.62s/it] 17%|█▋ | 1007/5772 [1:45:59<8:45:55, 6.62s/it] {'loss': 0.5025, 'learning_rate': 1.892707241463385e-05, 'epoch': 0.17} + 17%|█▋ | 1007/5772 [1:46:06<8:45:55, 6.62s/it] {'loss': 0.5025, 'learning_rate': 1.892707241463385e-05, 'epoch': 0.17} + 17%|█▋ | 1007/5772 [1:45:59<8:45:55, 6.62s/it] 17%|█▋ | 1008/5772 [1:46:12<8:33:19, 6.47s/it] 17%|█▋ | 1008/5772 [1:46:05<8:33:19, 6.47s/it] {'loss': 0.4741, 'learning_rate': 1.8924542037669845e-05, 'epoch': 0.17} + 17%|█▋ | 1008/5772 [1:46:12<8:33:19, 6.47s/it] {'loss': 0.4741, 'learning_rate': 1.8924542037669845e-05, 'epoch': 0.17} + 17%|█▋ | 1008/5772 [1:46:05<8:33:19, 6.47s/it] 17%|█▋ | 1009/5772 [1:46:18<8:27:28, 6.39s/it] 17%|█▋ | 1009/5772 [1:46:11<8:27:31, 6.39s/it] {'loss': 0.49, 'learning_rate': 1.8922008849970947e-05, 'epoch': 0.17} + 17%|█▋ | 1009/5772 [1:46:18<8:27:28, 6.39s/it] {'loss': 0.49, 'learning_rate': 1.8922008849970947e-05, 'epoch': 0.17} + 17%|█▋ | 1009/5772 [1:46:11<8:27:31, 6.39s/it] 17%|█▋ | 1010/5772 [1:46:24<8:22:32, 6.33s/it] 17%|█▋ | 1010/5772 [1:46:17<8:22:31, 6.33s/it] {'loss': 0.4839, 'learning_rate': 1.8919472852334964e-05, 'epoch': 0.17} + 17%|█▋ | 1010/5772 [1:46:24<8:22:32, 6.33s/it] {'loss': 0.4839, 'learning_rate': 1.8919472852334964e-05, 'epoch': 0.17} + 17%|█▋ | 1010/5772 [1:46:17<8:22:31, 6.33s/it] 18%|█▊ | 1011/5772 [1:46:23<8:12:31, 6.21s/it] 18%|█▊ | 1011/5772 [1:46:30<8:12:32, 6.21s/it] {'loss': 0.4931, 'learning_rate': 1.8916934045560603e-05, 'epoch': 0.18} + 18%|█▊ | 1011/5772 [1:46:30<8:12:32, 6.21s/it] {'loss': 0.4931, 'learning_rate': 1.8916934045560603e-05, 'epoch': 0.18} + 18%|█▊ | 1011/5772 [1:46:23<8:12:31, 6.21s/it] 18%|█▊ | 1012/5772 [1:46:37<8:21:56, 6.33s/it] 18%|█▊ | 1012/5772 [1:46:30<8:21:55, 6.33s/it] {'loss': 0.4864, 'learning_rate': 1.891439243044744e-05, 'epoch': 0.18} + 18%|█▊ | 1012/5772 [1:46:37<8:21:56, 6.33s/it] {'loss': 0.4864, 'learning_rate': 1.891439243044744e-05, 'epoch': 0.18} + 18%|█▊ | 1012/5772 [1:46:30<8:21:55, 6.33s/it] 18%|█▊ | 1013/5772 [1:46:43<8:18:28, 6.28s/it] 18%|█▊ | 1013/5772 [1:46:36<8:18:27, 6.28s/it] {'loss': 0.497, 'learning_rate': 1.8911848007795944e-05, 'epoch': 0.18} + 18%|█▊ | 1013/5772 [1:46:43<8:18:28, 6.28s/it] {'loss': 0.497, 'learning_rate': 1.8911848007795944e-05, 'epoch': 0.18} + 18%|█▊ | 1013/5772 [1:46:36<8:18:27, 6.28s/it] 18%|█▊ | 1014/5772 [1:46:43<8:31:42, 6.45s/it] 18%|█▊ | 1014/5772 [1:46:50<8:31:43, 6.45s/it] {'loss': 0.4826, 'learning_rate': 1.890930077840747e-05, 'epoch': 0.18} + 18%|█▊ | 1014/5772 [1:46:50<8:31:43, 6.45s/it] {'loss': 0.4826, 'learning_rate': 1.890930077840747e-05, 'epoch': 0.18} + 18%|█▊ | 1014/5772 [1:46:43<8:31:42, 6.45s/it] 18%|█▊ | 1015/5772 [1:46:56<8:24:25, 6.36s/it] 18%|█▊ | 1015/5772 [1:46:49<8:24:25, 6.36s/it] {'loss': 0.4936, 'learning_rate': 1.890675074308425e-05, 'epoch': 0.18} + 18%|█▊ | 1015/5772 [1:46:56<8:24:25, 6.36s/it] {'loss': 0.4936, 'learning_rate': 1.890675074308425e-05, 'epoch': 0.18} + 18%|█▊ | 1015/5772 [1:46:49<8:24:25, 6.36s/it] 18%|█▊ | 1016/5772 [1:47:03<8:26:39, 6.39s/it] 18%|█▊ | 1016/5772 [1:46:56<8:26:39, 6.39s/it] {'loss': 0.4765, 'learning_rate': 1.8904197902629408e-05, 'epoch': 0.18} + 18%|█▊ | 1016/5772 [1:47:03<8:26:39, 6.39s/it] {'loss': 0.4765, 'learning_rate': 1.8904197902629408e-05, 'epoch': 0.18} + 18%|█▊ | 1016/5772 [1:46:56<8:26:39, 6.39s/it] 18%|█▊ | 1017/5772 [1:47:01<8:12:09, 6.21s/it] 18%|█▊ | 1017/5772 [1:47:08<8:12:09, 6.21s/it] {'loss': 0.4886, 'learning_rate': 1.8901642257846943e-05, 'epoch': 0.18} + 18%|█▊ | 1017/5772 [1:47:08<8:12:09, 6.21s/it] {'loss': 0.4886, 'learning_rate': 1.8901642257846943e-05, 'epoch': 0.18} + 18%|█▊ | 1017/5772 [1:47:01<8:12:09, 6.21s/it] 18%|█▊ | 1018/5772 [1:47:15<8:12:09, 6.21s/it] 18%|█▊ | 1018/5772 [1:47:08<8:12:10, 6.21s/it] {'loss': 0.4853, 'learning_rate': 1.889908380954174e-05, 'epoch': 0.18} + 18%|█▊ | 1018/5772 [1:47:15<8:12:09, 6.21s/it] {'loss': 0.4853, 'learning_rate': 1.889908380954174e-05, 'epoch': 0.18} + 18%|█▊ | 1018/5772 [1:47:08<8:12:10, 6.21s/it] 18%|█▊ | 1019/5772 [1:47:21<8:10:29, 6.19s/it] 18%|█▊ | 1019/5772 [1:47:14<8:10:29, 6.19s/it] {'loss': 0.5038, 'learning_rate': 1.8896522558519574e-05, 'epoch': 0.18} + 18%|█▊ | 1019/5772 [1:47:21<8:10:29, 6.19s/it] {'loss': 0.5038, 'learning_rate': 1.8896522558519574e-05, 'epoch': 0.18} + 18%|█▊ | 1019/5772 [1:47:14<8:10:29, 6.19s/it] 18%|█▊ | 1020/5772 [1:47:20<8:04:59, 6.12s/it] 18%|█▊ | 1020/5772 [1:47:27<8:04:59, 6.12s/it] {'loss': 0.4756, 'learning_rate': 1.8893958505587093e-05, 'epoch': 0.18} + 18%|█▊ | 1020/5772 [1:47:27<8:04:59, 6.12s/it] {'loss': 0.4756, 'learning_rate': 1.8893958505587093e-05, 'epoch': 0.18} + 18%|█▊ | 1020/5772 [1:47:20<8:04:59, 6.12s/it] 18%|█▊ | 1021/5772 [1:47:33<8:11:32, 6.21s/it] 18%|█▊ | 1021/5772 [1:47:26<8:11:32, 6.21s/it] {'loss': 0.4978, 'learning_rate': 1.8891391651551826e-05, 'epoch': 0.18} + 18%|█▊ | 1021/5772 [1:47:33<8:11:32, 6.21s/it] {'loss': 0.4978, 'learning_rate': 1.8891391651551826e-05, 'epoch': 0.18} + 18%|█▊ | 1021/5772 [1:47:26<8:11:32, 6.21s/it] 18%|█▊ | 1022/5772 [1:47:33<8:19:33, 6.31s/it] 18%|█▊ | 1022/5772 [1:47:40<8:19:34, 6.31s/it] {'loss': 0.4758, 'learning_rate': 1.88888219972222e-05, 'epoch': 0.18} + 18%|█▊ | 1022/5772 [1:47:40<8:19:34, 6.31s/it] {'loss': 0.4758, 'learning_rate': 1.88888219972222e-05, 'epoch': 0.18} + 18%|█▊ | 1022/5772 [1:47:33<8:19:33, 6.31s/it] 18%|█▊ | 1023/5772 [1:47:46<8:15:24, 6.26s/it] 18%|█▊ | 1023/5772 [1:47:39<8:15:24, 6.26s/it] {'loss': 0.5042, 'learning_rate': 1.8886249543407505e-05, 'epoch': 0.18} + 18%|█▊ | 1023/5772 [1:47:46<8:15:24, 6.26s/it] {'loss': 0.5042, 'learning_rate': 1.8886249543407505e-05, 'epoch': 0.18} + 18%|█▊ | 1023/5772 [1:47:39<8:15:24, 6.26s/it] 18%|█▊ | 1024/5772 [1:47:52<8:12:34, 6.22s/it] 18%|█▊ | 1024/5772 [1:47:45<8:12:34, 6.22s/it] {'loss': 0.4856, 'learning_rate': 1.8883674290917927e-05, 'epoch': 0.18} + 18%|█▊ | 1024/5772 [1:47:52<8:12:34, 6.22s/it] {'loss': 0.4856, 'learning_rate': 1.8883674290917927e-05, 'epoch': 0.18} + 18%|█▊ | 1024/5772 [1:47:45<8:12:34, 6.22s/it] 18%|█▊ | 1025/5772 [1:47:58<8:16:22, 6.27s/it] 18%|█▊ | 1025/5772 [1:47:51<8:16:22, 6.27s/it] {'loss': 0.4897, 'learning_rate': 1.8881096240564523e-05, 'epoch': 0.18} + 18%|█▊ | 1025/5772 [1:47:58<8:16:22, 6.27s/it] {'loss': 0.4897, 'learning_rate': 1.8881096240564523e-05, 'epoch': 0.18} + 18%|█▊ | 1025/5772 [1:47:51<8:16:22, 6.27s/it] 18%|█▊ | 1026/5772 [1:48:05<8:16:41, 6.28s/it] 18%|█▊ | 1026/5772 [1:47:58<8:16:41, 6.28s/it] {'loss': 0.4896, 'learning_rate': 1.8878515393159236e-05, 'epoch': 0.18} + 18%|█▊ | 1026/5772 [1:48:05<8:16:41, 6.28s/it] {'loss': 0.4896, 'learning_rate': 1.8878515393159236e-05, 'epoch': 0.18} + 18%|█▊ | 1026/5772 [1:47:58<8:16:41, 6.28s/it] 18%|█▊ | 1027/5772 [1:48:11<8:20:07, 6.32s/it] 18%|█▊ | 1027/5772 [1:48:04<8:20:07, 6.32s/it] {'loss': 0.4969, 'learning_rate': 1.8875931749514893e-05, 'epoch': 0.18} + 18%|█▊ | 1027/5772 [1:48:11<8:20:07, 6.32s/it] {'loss': 0.4969, 'learning_rate': 1.8875931749514893e-05, 'epoch': 0.18} + 18%|█▊ | 1027/5772 [1:48:04<8:20:07, 6.32s/it] 18%|█▊ | 1028/5772 [1:48:17<8:20:31, 6.33s/it] 18%|█▊ | 1028/5772 [1:48:10<8:20:31, 6.33s/it] {'loss': 0.4821, 'learning_rate': 1.8873345310445193e-05, 'epoch': 0.18} + 18%|█▊ | 1028/5772 [1:48:17<8:20:31, 6.33s/it] {'loss': 0.4821, 'learning_rate': 1.8873345310445193e-05, 'epoch': 0.18} + 18%|█▊ | 1028/5772 [1:48:10<8:20:31, 6.33s/it] 18%|█▊ | 1029/5772 [1:48:24<8:17:07, 6.29s/it] 18%|█▊ | 1029/5772 [1:48:17<8:17:07, 6.29s/it] {'loss': 0.4879, 'learning_rate': 1.8870756076764728e-05, 'epoch': 0.18} + 18%|█▊ | 1029/5772 [1:48:24<8:17:07, 6.29s/it] {'loss': 0.4879, 'learning_rate': 1.8870756076764728e-05, 'epoch': 0.18} + 18%|█▊ | 1029/5772 [1:48:17<8:17:07, 6.29s/it] 18%|█▊ | 1030/5772 [1:48:23<8:12:58, 6.24s/it] 18%|█▊ | 1030/5772 [1:48:30<8:12:59, 6.24s/it] {'loss': 0.4839, 'learning_rate': 1.8868164049288954e-05, 'epoch': 0.18} + 18%|█▊ | 1030/5772 [1:48:30<8:12:59, 6.24s/it] {'loss': 0.4839, 'learning_rate': 1.8868164049288954e-05, 'epoch': 0.18} + 18%|█▊ | 1030/5772 [1:48:23<8:12:58, 6.24s/it] 18%|█▊ | 1031/5772 [1:48:29<8:17:29, 6.30s/it] 18%|█▊ | 1031/5772 [1:48:36<8:17:29, 6.30s/it] {'loss': 0.5094, 'learning_rate': 1.886556922883422e-05, 'epoch': 0.18} + 18%|█▊ | 1031/5772 [1:48:36<8:17:29, 6.30s/it] {'loss': 0.5094, 'learning_rate': 1.886556922883422e-05, 'epoch': 0.18} + 18%|█▊ | 1031/5772 [1:48:29<8:17:29, 6.30s/it] 18%|█▊ | 1032/5772 [1:48:35<8:18:08, 6.31s/it] 18%|█▊ | 1032/5772 [1:48:42<8:18:08, 6.31s/it] {'loss': 0.487, 'learning_rate': 1.8862971616217753e-05, 'epoch': 0.18} + 18%|█▊ | 1032/5772 [1:48:42<8:18:08, 6.31s/it] {'loss': 0.487, 'learning_rate': 1.8862971616217753e-05, 'epoch': 0.18} + 18%|█▊ | 1032/5772 [1:48:35<8:18:08, 6.31s/it] 18%|█▊ | 1033/5772 [1:48:42<8:21:23, 6.35s/it] 18%|█▊ | 1033/5772 [1:48:49<8:21:23, 6.35s/it] {'loss': 0.485, 'learning_rate': 1.8860371212257648e-05, 'epoch': 0.18} + 18%|█▊ | 1033/5772 [1:48:49<8:21:23, 6.35s/it] {'loss': 0.485, 'learning_rate': 1.8860371212257648e-05, 'epoch': 0.18} + 18%|█▊ | 1033/5772 [1:48:42<8:21:23, 6.35s/it] 18%|█▊ | 1034/5772 [1:48:55<8:14:27, 6.26s/it] 18%|█▊ | 1034/5772 [1:48:48<8:14:28, 6.26s/it] {'loss': 0.4799, 'learning_rate': 1.88577680177729e-05, 'epoch': 0.18} + 18%|█▊ | 1034/5772 [1:48:55<8:14:27, 6.26s/it] {'loss': 0.4799, 'learning_rate': 1.88577680177729e-05, 'epoch': 0.18} + 18%|█▊ | 1034/5772 [1:48:48<8:14:28, 6.26s/it] 18%|█▊ | 1035/5772 [1:49:01<8:07:04, 6.17s/it] 18%|█▊ | 1035/5772 [1:48:54<8:07:05, 6.17s/it] {'loss': 0.4906, 'learning_rate': 1.885516203358336e-05, 'epoch': 0.18} + 18%|█▊ | 1035/5772 [1:49:01<8:07:04, 6.17s/it] {'loss': 0.4906, 'learning_rate': 1.885516203358336e-05, 'epoch': 0.18} + 18%|█▊ | 1035/5772 [1:48:54<8:07:05, 6.17s/it] 18%|█▊ | 1036/5772 [1:49:07<8:04:34, 6.14s/it] 18%|█▊ | 1036/5772 [1:49:00<8:04:34, 6.14s/it] {'loss': 0.4869, 'learning_rate': 1.8852553260509775e-05, 'epoch': 0.18} + 18%|█▊ | 1036/5772 [1:49:07<8:04:34, 6.14s/it] {'loss': 0.4869, 'learning_rate': 1.8852553260509775e-05, 'epoch': 0.18} + 18%|█▊ | 1036/5772 [1:49:00<8:04:34, 6.14s/it] 18%|█▊ | 1037/5772 [1:49:13<8:11:46, 6.23s/it] 18%|█▊ | 1037/5772 [1:49:06<8:11:47, 6.23s/it] {'loss': 0.4998, 'learning_rate': 1.884994169937376e-05, 'epoch': 0.18} + 18%|█▊ | 1037/5772 [1:49:13<8:11:46, 6.23s/it] {'loss': 0.4998, 'learning_rate': 1.884994169937376e-05, 'epoch': 0.18} + 18%|█▊ | 1037/5772 [1:49:06<8:11:47, 6.23s/it] 18%|█▊ | 1038/5772 [1:49:20<8:16:28, 6.29s/it] 18%|█▊ | 1038/5772 [1:49:13<8:16:28, 6.29s/it] {'loss': 0.4949, 'learning_rate': 1.8847327350997814e-05, 'epoch': 0.18} + 18%|█▊ | 1038/5772 [1:49:20<8:16:28, 6.29s/it] {'loss': 0.4949, 'learning_rate': 1.8847327350997814e-05, 'epoch': 0.18} + 18%|█▊ | 1038/5772 [1:49:13<8:16:28, 6.29s/it] 18%|█▊ | 1039/5772 [1:49:26<8:14:05, 6.26s/it] 18%|█▊ | 1039/5772 [1:49:19<8:14:05, 6.26s/it] {'loss': 0.4873, 'learning_rate': 1.8844710216205306e-05, 'epoch': 0.18} + 18%|█▊ | 1039/5772 [1:49:26<8:14:05, 6.26s/it] {'loss': 0.4873, 'learning_rate': 1.8844710216205306e-05, 'epoch': 0.18} + 18%|█▊ | 1039/5772 [1:49:19<8:14:05, 6.26s/it] 18%|█▊ | 1040/5772 [1:49:32<8:08:09, 6.19s/it] 18%|█▊ | 1040/5772 [1:49:25<8:08:09, 6.19s/it] {'loss': 0.4882, 'learning_rate': 1.8842090295820497e-05, 'epoch': 0.18} + 18%|█▊ | 1040/5772 [1:49:32<8:08:09, 6.19s/it] {'loss': 0.4882, 'learning_rate': 1.8842090295820497e-05, 'epoch': 0.18} + 18%|█▊ | 1040/5772 [1:49:25<8:08:09, 6.19s/it] 18%|█▊ | 1041/5772 [1:49:38<8:04:17, 6.14s/it] 18%|█▊ | 1041/5772 [1:49:31<8:04:17, 6.14s/it] {'loss': 0.4983, 'learning_rate': 1.8839467590668507e-05, 'epoch': 0.18} + 18%|█▊ | 1041/5772 [1:49:38<8:04:17, 6.14s/it] {'loss': 0.4983, 'learning_rate': 1.8839467590668507e-05, 'epoch': 0.18} + 18%|█▊ | 1041/5772 [1:49:31<8:04:17, 6.14s/it] 18%|█▊ | 1042/5772 [1:49:44<7:59:28, 6.08s/it] 18%|█▊ | 1042/5772 [1:49:37<7:59:28, 6.08s/it] {'loss': 0.4784, 'learning_rate': 1.883684210157535e-05, 'epoch': 0.18} + 18%|█▊ | 1042/5772 [1:49:44<7:59:28, 6.08s/it] {'loss': 0.4784, 'learning_rate': 1.883684210157535e-05, 'epoch': 0.18} + 18%|█▊ | 1042/5772 [1:49:37<7:59:28, 6.08s/it] 18%|█▊ | 1043/5772 [1:49:51<8:12:24, 6.25s/it] 18%|█▊ | 1043/5772 [1:49:44<8:12:24, 6.25s/it] {'loss': 0.4906, 'learning_rate': 1.8834213829367908e-05, 'epoch': 0.18} + 18%|█▊ | 1043/5772 [1:49:51<8:12:24, 6.25s/it] {'loss': 0.4906, 'learning_rate': 1.8834213829367908e-05, 'epoch': 0.18} + 18%|█▊ | 1043/5772 [1:49:44<8:12:24, 6.25s/it] 18%|█▊ | 1044/5772 [1:49:57<8:12:17, 6.25s/it] 18%|█▊ | 1044/5772 [1:49:50<8:12:17, 6.25s/it] {'loss': 0.4817, 'learning_rate': 1.8831582774873935e-05, 'epoch': 0.18} + 18%|█▊ | 1044/5772 [1:49:57<8:12:17, 6.25s/it] {'loss': 0.4817, 'learning_rate': 1.8831582774873935e-05, 'epoch': 0.18} + 18%|█▊ | 1044/5772 [1:49:50<8:12:17, 6.25s/it] 18%|█▊ | 1045/5772 [1:50:04<8:20:34, 6.35s/it] 18%|█▊ | 1045/5772 [1:49:56<8:20:34, 6.35s/it] {'loss': 0.4962, 'learning_rate': 1.8828948938922073e-05, 'epoch': 0.18} + 18%|█▊ | 1045/5772 [1:50:04<8:20:34, 6.35s/it] {'loss': 0.4962, 'learning_rate': 1.8828948938922073e-05, 'epoch': 0.18} + 18%|█▊ | 1045/5772 [1:49:56<8:20:34, 6.35s/it] 18%|█▊ | 1046/5772 [1:50:10<8:27:07, 6.44s/it] 18%|█▊ | 1046/5772 [1:50:03<8:27:06, 6.44s/it] {'loss': 0.4835, 'learning_rate': 1.882631232234183e-05, 'epoch': 0.18} + 18%|█▊ | 1046/5772 [1:50:10<8:27:07, 6.44s/it] {'loss': 0.4835, 'learning_rate': 1.882631232234183e-05, 'epoch': 0.18} + 18%|█▊ | 1046/5772 [1:50:03<8:27:06, 6.44s/it] 18%|█▊ | 1047/5772 [1:50:16<8:23:02, 6.39s/it] 18%|█▊ | 1047/5772 [1:50:09<8:23:03, 6.39s/it] {'loss': 0.5017, 'learning_rate': 1.8823672925963598e-05, 'epoch': 0.18} + 18%|█▊ | 1047/5772 [1:50:16<8:23:02, 6.39s/it] {'loss': 0.5017, 'learning_rate': 1.8823672925963598e-05, 'epoch': 0.18} + 18%|█▊ | 1047/5772 [1:50:09<8:23:03, 6.39s/it] 18%|█▊ | 1048/5772 [1:50:23<8:21:07, 6.36s/it] 18%|█▊ | 1048/5772 [1:50:16<8:21:07, 6.36s/it] {'loss': 0.4919, 'learning_rate': 1.8821030750618633e-05, 'epoch': 0.18} + 18%|█▊ | 1048/5772 [1:50:23<8:21:07, 6.36s/it] {'loss': 0.4919, 'learning_rate': 1.8821030750618633e-05, 'epoch': 0.18} + 18%|█▊ | 1048/5772 [1:50:16<8:21:07, 6.36s/it] 18%|█▊ | 1049/5772 [1:50:29<8:26:30, 6.43s/it] 18%|█▊ | 1049/5772 [1:50:22<8:26:29, 6.43s/it] {'loss': 0.4901, 'learning_rate': 1.8818385797139083e-05, 'epoch': 0.18} + 18%|█▊ | 1049/5772 [1:50:29<8:26:30, 6.43s/it] {'loss': 0.4901, 'learning_rate': 1.8818385797139083e-05, 'epoch': 0.18} + 18%|█▊ | 1049/5772 [1:50:22<8:26:29, 6.43s/it]6 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...15 + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +27 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 18%|█▊ | 1050/5772 [1:50:36<8:27:54, 6.45s/it]10 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 18%|█▊ | 1050/5772 [1:50:29<8:27:54, 6.45s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4795, 'learning_rate': 1.8815738066357954e-05, 'epoch': 0.18} + 18%|█▊ | 1050/5772 [1:50:36<8:27:54, 6.45s/it] {'loss': 0.4795, 'learning_rate': 1.8815738066357954e-05, 'epoch': 0.18} + 18%|█▊ | 1050/5772 [1:50:29<8:27:54, 6.45s/it] 18%|█▊ | 1051/5772 [1:50:42<8:13:46, 6.28s/it] 18%|█▊ | 1051/5772 [1:50:35<8:13:45, 6.28s/it] {'loss': 0.4837, 'learning_rate': 1.8813087559109137e-05, 'epoch': 0.18} + 18%|█▊ | 1051/5772 [1:50:42<8:13:46, 6.28s/it] {'loss': 0.4837, 'learning_rate': 1.8813087559109137e-05, 'epoch': 0.18} + 18%|█▊ | 1051/5772 [1:50:35<8:13:45, 6.28s/it] 18%|█▊ | 1052/5772 [1:50:48<8:14:48, 6.29s/it] 18%|█▊ | 1052/5772 [1:50:41<8:14:49, 6.29s/it] {'loss': 0.4824, 'learning_rate': 1.8810434276227397e-05, 'epoch': 0.18} + 18%|█▊ | 1052/5772 [1:50:48<8:14:48, 6.29s/it] {'loss': 0.4824, 'learning_rate': 1.8810434276227397e-05, 'epoch': 0.18} + 18%|█▊ | 1052/5772 [1:50:41<8:14:49, 6.29s/it] 18%|█▊ | 1053/5772 [1:50:54<8:02:25, 6.13s/it] 18%|█▊ | 1053/5772 [1:50:47<8:02:25, 6.13s/it] {'loss': 0.4876, 'learning_rate': 1.8807778218548364e-05, 'epoch': 0.18} + 18%|█▊ | 1053/5772 [1:50:54<8:02:25, 6.13s/it] {'loss': 0.4876, 'learning_rate': 1.8807778218548364e-05, 'epoch': 0.18} + 18%|█▊ | 1053/5772 [1:50:47<8:02:25, 6.13s/it] 18%|█▊ | 1054/5772 [1:51:00<7:52:22, 6.01s/it] 18%|█▊ | 1054/5772 [1:50:52<7:52:22, 6.01s/it] {'loss': 0.4757, 'learning_rate': 1.8805119386908556e-05, 'epoch': 0.18} + 18%|█▊ | 1054/5772 [1:51:00<7:52:22, 6.01s/it] {'loss': 0.4757, 'learning_rate': 1.8805119386908556e-05, 'epoch': 0.18} + 18%|█▊ | 1054/5772 [1:50:52<7:52:22, 6.01s/it] 18%|█▊ | 1055/5772 [1:51:06<7:52:34, 6.01s/it] 18%|█▊ | 1055/5772 [1:50:58<7:52:34, 6.01s/it] {'loss': 0.4832, 'learning_rate': 1.8802457782145352e-05, 'epoch': 0.18} + 18%|█▊ | 1055/5772 [1:51:06<7:52:34, 6.01s/it] {'loss': 0.4832, 'learning_rate': 1.8802457782145352e-05, 'epoch': 0.18} + 18%|█▊ | 1055/5772 [1:50:58<7:52:34, 6.01s/it] 18%|█▊ | 1056/5772 [1:51:12<8:05:17, 6.17s/it] 18%|█▊ | 1056/5772 [1:51:05<8:05:17, 6.17s/it] {'loss': 0.4731, 'learning_rate': 1.879979340509701e-05, 'epoch': 0.18} + 18%|█▊ | 1056/5772 [1:51:12<8:05:17, 6.17s/it] {'loss': 0.4731, 'learning_rate': 1.879979340509701e-05, 'epoch': 0.18} + 18%|█▊ | 1056/5772 [1:51:05<8:05:17, 6.17s/it] 18%|█▊ | 1057/5772 [1:51:18<8:09:03, 6.22s/it] 18%|█▊ | 1057/5772 [1:51:11<8:09:03, 6.22s/it] {'loss': 0.5019, 'learning_rate': 1.8797126256602666e-05, 'epoch': 0.18} + 18%|█▊ | 1057/5772 [1:51:18<8:09:03, 6.22s/it] {'loss': 0.5019, 'learning_rate': 1.8797126256602666e-05, 'epoch': 0.18} + 18%|█▊ | 1057/5772 [1:51:11<8:09:03, 6.22s/it] 18%|█▊ | 1058/5772 [1:51:25<8:06:59, 6.20s/it] 18%|█▊ | 1058/5772 [1:51:18<8:07:00, 6.20s/it] {'loss': 0.4823, 'learning_rate': 1.8794456337502318e-05, 'epoch': 0.18} + 18%|█▊ | 1058/5772 [1:51:25<8:06:59, 6.20s/it] {'loss': 0.4823, 'learning_rate': 1.8794456337502318e-05, 'epoch': 0.18} + 18%|█▊ | 1058/5772 [1:51:18<8:07:00, 6.20s/it] 18%|█▊ | 1059/5772 [1:51:31<8:00:49, 6.12s/it] 18%|█▊ | 1059/5772 [1:51:23<8:00:49, 6.12s/it] {'loss': 0.494, 'learning_rate': 1.8791783648636844e-05, 'epoch': 0.18} + 18%|█▊ | 1059/5772 [1:51:31<8:00:49, 6.12s/it] {'loss': 0.494, 'learning_rate': 1.8791783648636844e-05, 'epoch': 0.18} + 18%|█▊ | 1059/5772 [1:51:23<8:00:49, 6.12s/it] 18%|█▊ | 1060/5772 [1:51:30<8:01:35, 6.13s/it] 18%|█▊ | 1060/5772 [1:51:37<8:01:36, 6.13s/it] {'loss': 0.4827, 'learning_rate': 1.878910819084799e-05, 'epoch': 0.18} + 18%|█▊ | 1060/5772 [1:51:37<8:01:36, 6.13s/it] {'loss': 0.4827, 'learning_rate': 1.878910819084799e-05, 'epoch': 0.18} + 18%|█▊ | 1060/5772 [1:51:30<8:01:35, 6.13s/it] 18%|█▊ | 1061/5772 [1:51:43<7:54:56, 6.05s/it] 18%|█▊ | 1061/5772 [1:51:35<7:54:56, 6.05s/it] {'loss': 0.4991, 'learning_rate': 1.878642996497838e-05, 'epoch': 0.18} + 18%|█▊ | 1061/5772 [1:51:43<7:54:56, 6.05s/it] {'loss': 0.4991, 'learning_rate': 1.878642996497838e-05, 'epoch': 0.18} + 18%|█▊ | 1061/5772 [1:51:35<7:54:56, 6.05s/it] 18%|█▊ | 1062/5772 [1:51:49<8:05:26, 6.18s/it] 18%|█▊ | 1062/5772 [1:51:42<8:05:26, 6.18s/it] {'loss': 0.4858, 'learning_rate': 1.8783748971871508e-05, 'epoch': 0.18} + 18%|█▊ | 1062/5772 [1:51:49<8:05:26, 6.18s/it] {'loss': 0.4858, 'learning_rate': 1.8783748971871508e-05, 'epoch': 0.18} + 18%|█▊ | 1062/5772 [1:51:42<8:05:26, 6.18s/it] 18%|█▊ | 1063/5772 [1:51:55<8:01:50, 6.14s/it] 18%|█▊ | 1063/5772 [1:51:48<8:01:50, 6.14s/it] {'loss': 0.5024, 'learning_rate': 1.8781065212371732e-05, 'epoch': 0.18} + 18%|█▊ | 1063/5772 [1:51:55<8:01:50, 6.14s/it] {'loss': 0.5024, 'learning_rate': 1.8781065212371732e-05, 'epoch': 0.18} + 18%|█▊ | 1063/5772 [1:51:48<8:01:50, 6.14s/it] 18%|█▊ | 1064/5772 [1:52:01<8:01:45, 6.14s/it] 18%|█▊ | 1064/5772 [1:51:54<8:01:45, 6.14s/it] {'loss': 0.492, 'learning_rate': 1.877837868732429e-05, 'epoch': 0.18} + 18%|█▊ | 1064/5772 [1:52:01<8:01:45, 6.14s/it] {'loss': 0.492, 'learning_rate': 1.877837868732429e-05, 'epoch': 0.18} + 18%|█▊ | 1064/5772 [1:51:54<8:01:45, 6.14s/it] 18%|█▊ | 1065/5772 [1:52:07<8:02:49, 6.15s/it] 18%|█▊ | 1065/5772 [1:52:00<8:02:50, 6.15s/it] {'loss': 0.5046, 'learning_rate': 1.877568939757529e-05, 'epoch': 0.18} + 18%|█▊ | 1065/5772 [1:52:07<8:02:49, 6.15s/it] {'loss': 0.5046, 'learning_rate': 1.877568939757529e-05, 'epoch': 0.18} + 18%|█▊ | 1065/5772 [1:52:00<8:02:50, 6.15s/it] 18%|█▊ | 1066/5772 [1:52:14<8:03:35, 6.17s/it] 18%|█▊ | 1066/5772 [1:52:07<8:03:36, 6.17s/it] {'loss': 0.4811, 'learning_rate': 1.8772997343971708e-05, 'epoch': 0.18} + 18%|█▊ | 1066/5772 [1:52:14<8:03:35, 6.17s/it] {'loss': 0.4811, 'learning_rate': 1.8772997343971708e-05, 'epoch': 0.18} + 18%|█▊ | 1066/5772 [1:52:07<8:03:36, 6.17s/it] 18%|█▊ | 1067/5772 [1:52:20<8:04:28, 6.18s/it] 18%|█▊ | 1067/5772 [1:52:13<8:04:28, 6.18s/it] {'loss': 0.4876, 'learning_rate': 1.877030252736139e-05, 'epoch': 0.18} + 18%|█▊ | 1067/5772 [1:52:20<8:04:28, 6.18s/it] {'loss': 0.4876, 'learning_rate': 1.877030252736139e-05, 'epoch': 0.18} + 18%|█▊ | 1067/5772 [1:52:13<8:04:28, 6.18s/it] 19%|█▊ | 1068/5772 [1:52:26<7:57:11, 6.09s/it] 19%|█▊ | 1068/5772 [1:52:19<7:57:11, 6.09s/it] {'loss': 0.4762, 'learning_rate': 1.8767604948593052e-05, 'epoch': 0.19} + 19%|█▊ | 1068/5772 [1:52:26<7:57:11, 6.09s/it] {'loss': 0.4762, 'learning_rate': 1.8767604948593052e-05, 'epoch': 0.19} + 19%|█▊ | 1068/5772 [1:52:19<7:57:11, 6.09s/it] 19%|█▊ | 1069/5772 [1:52:32<7:58:04, 6.10s/it] 19%|█▊ | 1069/5772 [1:52:25<7:58:04, 6.10s/it] {'loss': 0.4956, 'learning_rate': 1.8764904608516287e-05, 'epoch': 0.19} + 19%|█▊ | 1069/5772 [1:52:32<7:58:04, 6.10s/it] {'loss': 0.4956, 'learning_rate': 1.8764904608516287e-05, 'epoch': 0.19} + 19%|█▊ | 1069/5772 [1:52:25<7:58:04, 6.10s/it] 19%|█▊ | 1070/5772 [1:52:38<7:58:06, 6.10s/it] 19%|█▊ | 1070/5772 [1:52:31<7:58:06, 6.10s/it] {'loss': 0.4916, 'learning_rate': 1.8762201507981546e-05, 'epoch': 0.19} + 19%|█▊ | 1070/5772 [1:52:38<7:58:06, 6.10s/it] {'loss': 0.4916, 'learning_rate': 1.8762201507981546e-05, 'epoch': 0.19} + 19%|█▊ | 1070/5772 [1:52:31<7:58:06, 6.10s/it] 19%|█▊ | 1071/5772 [1:52:37<7:52:52, 6.04s/it] 19%|█▊ | 1071/5772 [1:52:44<7:52:53, 6.04s/it] {'loss': 0.4886, 'learning_rate': 1.8759495647840158e-05, 'epoch': 0.19} + 19%|█▊ | 1071/5772 [1:52:44<7:52:53, 6.04s/it] {'loss': 0.4886, 'learning_rate': 1.8759495647840158e-05, 'epoch': 0.19} + 19%|█▊ | 1071/5772 [1:52:37<7:52:52, 6.04s/it] 19%|█▊ | 1072/5772 [1:52:50<7:53:29, 6.04s/it] 19%|█▊ | 1072/5772 [1:52:43<7:53:29, 6.04s/it] {'loss': 0.5012, 'learning_rate': 1.875678702894432e-05, 'epoch': 0.19} + 19%|█▊ | 1072/5772 [1:52:50<7:53:29, 6.04s/it] {'loss': 0.5012, 'learning_rate': 1.875678702894432e-05, 'epoch': 0.19} + 19%|█▊ | 1072/5772 [1:52:43<7:53:29, 6.04s/it] 19%|█▊ | 1073/5772 [1:52:56<8:01:38, 6.15s/it] 19%|█▊ | 1073/5772 [1:52:49<8:01:39, 6.15s/it] {'loss': 0.4869, 'learning_rate': 1.8754075652147094e-05, 'epoch': 0.19} + 19%|█▊ | 1073/5772 [1:52:56<8:01:38, 6.15s/it] {'loss': 0.4869, 'learning_rate': 1.8754075652147094e-05, 'epoch': 0.19} + 19%|█▊ | 1073/5772 [1:52:49<8:01:39, 6.15s/it] 19%|█▊ | 1074/5772 [1:53:02<7:58:48, 6.12s/it] 19%|█▊ | 1074/5772 [1:52:55<7:58:48, 6.12s/it] {'loss': 0.4958, 'learning_rate': 1.8751361518302413e-05, 'epoch': 0.19} + 19%|█▊ | 1074/5772 [1:53:02<7:58:48, 6.12s/it] {'loss': 0.4958, 'learning_rate': 1.8751361518302413e-05, 'epoch': 0.19} + 19%|█▊ | 1074/5772 [1:52:55<7:58:48, 6.12s/it] 19%|█▊ | 1075/5772 [1:53:08<7:56:44, 6.09s/it] 19%|█▊ | 1075/5772 [1:53:01<7:56:44, 6.09s/it] {'loss': 0.4942, 'learning_rate': 1.8748644628265085e-05, 'epoch': 0.19} + 19%|█▊ | 1075/5772 [1:53:08<7:56:44, 6.09s/it] {'loss': 0.4942, 'learning_rate': 1.8748644628265085e-05, 'epoch': 0.19} + 19%|█▊ | 1075/5772 [1:53:01<7:56:44, 6.09s/it] 19%|█▊ | 1076/5772 [1:53:14<7:56:16, 6.09s/it] 19%|█▊ | 1076/5772 [1:53:07<7:56:16, 6.09s/it] {'loss': 0.4724, 'learning_rate': 1.874592498289077e-05, 'epoch': 0.19} + 19%|█▊ | 1076/5772 [1:53:14<7:56:16, 6.09s/it] {'loss': 0.4724, 'learning_rate': 1.874592498289077e-05, 'epoch': 0.19} + 19%|█▊ | 1076/5772 [1:53:07<7:56:16, 6.09s/it] 19%|█▊ | 1077/5772 [1:53:14<7:58:56, 6.12s/it] 19%|█▊ | 1077/5772 [1:53:21<7:58:58, 6.12s/it] {'loss': 0.4914, 'learning_rate': 1.874320258303601e-05, 'epoch': 0.19} + 19%|█▊ | 1077/5772 [1:53:21<7:58:58, 6.12s/it] {'loss': 0.4914, 'learning_rate': 1.874320258303601e-05, 'epoch': 0.19} + 19%|█▊ | 1077/5772 [1:53:14<7:58:56, 6.12s/it] 19%|█▊ | 1078/5772 [1:53:27<8:02:36, 6.17s/it] 19%|█▊ | 1078/5772 [1:53:20<8:02:37, 6.17s/it] {'loss': 0.4907, 'learning_rate': 1.8740477429558205e-05, 'epoch': 0.19} + 19%|█▊ | 1078/5772 [1:53:27<8:02:36, 6.17s/it] {'loss': 0.4907, 'learning_rate': 1.8740477429558205e-05, 'epoch': 0.19} + 19%|█▊ | 1078/5772 [1:53:20<8:02:37, 6.17s/it] 19%|█▊ | 1079/5772 [1:53:26<7:58:52, 6.12s/it] 19%|█▊ | 1079/5772 [1:53:33<7:58:53, 6.12s/it] {'loss': 0.4783, 'learning_rate': 1.8737749523315636e-05, 'epoch': 0.19} + 19%|█▊ | 1079/5772 [1:53:33<7:58:53, 6.12s/it] {'loss': 0.4783, 'learning_rate': 1.8737749523315636e-05, 'epoch': 0.19} + 19%|█▊ | 1079/5772 [1:53:26<7:58:52, 6.12s/it] 19%|█▊ | 1080/5772 [1:53:32<7:53:30, 6.06s/it] 19%|█▊ | 1080/5772 [1:53:39<7:53:32, 6.06s/it] {'loss': 0.4902, 'learning_rate': 1.8735018865167433e-05, 'epoch': 0.19} + 19%|█▊ | 1080/5772 [1:53:39<7:53:32, 6.06s/it] {'loss': 0.4902, 'learning_rate': 1.8735018865167433e-05, 'epoch': 0.19} + 19%|█▊ | 1080/5772 [1:53:32<7:53:30, 6.06s/it] 19%|█▊ | 1081/5772 [1:53:45<7:59:44, 6.14s/it] 19%|█▊ | 1081/5772 [1:53:38<7:59:46, 6.14s/it] {'loss': 0.4966, 'learning_rate': 1.873228545597361e-05, 'epoch': 0.19} + 19%|█▊ | 1081/5772 [1:53:45<7:59:44, 6.14s/it] {'loss': 0.4966, 'learning_rate': 1.873228545597361e-05, 'epoch': 0.19} + 19%|█▊ | 1081/5772 [1:53:38<7:59:46, 6.14s/it] 19%|█▊ | 1082/5772 [1:53:44<8:03:15, 6.18s/it] 19%|█▊ | 1082/5772 [1:53:51<8:03:14, 6.18s/it] {'loss': 0.4806, 'learning_rate': 1.872954929659503e-05, 'epoch': 0.19} + 19%|█▊ | 1082/5772 [1:53:51<8:03:14, 6.18s/it] {'loss': 0.4806, 'learning_rate': 1.872954929659503e-05, 'epoch': 0.19} + 19%|█▊ | 1082/5772 [1:53:44<8:03:15, 6.18s/it] 19%|█▉ | 1083/5772 [1:53:58<8:03:11, 6.18s/it] 19%|█▉ | 1083/5772 [1:53:51<8:03:12, 6.18s/it] {'loss': 0.4954, 'learning_rate': 1.8726810387893438e-05, 'epoch': 0.19} + 19%|█▉ | 1083/5772 [1:53:58<8:03:11, 6.18s/it] {'loss': 0.4954, 'learning_rate': 1.8726810387893438e-05, 'epoch': 0.19} + 19%|█▉ | 1083/5772 [1:53:51<8:03:12, 6.18s/it] 19%|█▉ | 1084/5772 [1:54:04<8:12:57, 6.31s/it] 19%|█▉ | 1084/5772 [1:53:57<8:12:57, 6.31s/it] {'loss': 0.4813, 'learning_rate': 1.8724068730731436e-05, 'epoch': 0.19} + 19%|█▉ | 1084/5772 [1:54:04<8:12:57, 6.31s/it] {'loss': 0.4813, 'learning_rate': 1.8724068730731436e-05, 'epoch': 0.19} + 19%|█▉ | 1084/5772 [1:53:57<8:12:57, 6.31s/it] 19%|█▉ | 1085/5772 [1:54:10<8:11:37, 6.29s/it] 19%|█▉ | 1085/5772 [1:54:03<8:11:37, 6.29s/it] {'loss': 0.4887, 'learning_rate': 1.872132432597249e-05, 'epoch': 0.19} + 19%|█▉ | 1085/5772 [1:54:10<8:11:37, 6.29s/it] {'loss': 0.4887, 'learning_rate': 1.872132432597249e-05, 'epoch': 0.19} + 19%|█▉ | 1085/5772 [1:54:03<8:11:37, 6.29s/it] 19%|█▉ | 1086/5772 [1:54:09<8:07:08, 6.24s/it] 19%|█▉ | 1086/5772 [1:54:17<8:07:08, 6.24s/it] {'loss': 0.4834, 'learning_rate': 1.8718577174480938e-05, 'epoch': 0.19} + 19%|█▉ | 1086/5772 [1:54:17<8:07:08, 6.24s/it] {'loss': 0.4834, 'learning_rate': 1.8718577174480938e-05, 'epoch': 0.19} + 19%|█▉ | 1086/5772 [1:54:09<8:07:08, 6.24s/it] 19%|█▉ | 1087/5772 [1:54:23<8:06:18, 6.23s/it] 19%|█▉ | 1087/5772 [1:54:16<8:06:18, 6.23s/it] {'loss': 0.4913, 'learning_rate': 1.8715827277121982e-05, 'epoch': 0.19} + 19%|█▉ | 1087/5772 [1:54:23<8:06:18, 6.23s/it] {'loss': 0.4913, 'learning_rate': 1.8715827277121982e-05, 'epoch': 0.19} + 19%|█▉ | 1087/5772 [1:54:16<8:06:18, 6.23s/it] 19%|█▉ | 1088/5772 [1:54:22<8:03:04, 6.19s/it] 19%|█▉ | 1088/5772 [1:54:29<8:03:05, 6.19s/it] {'loss': 0.4868, 'learning_rate': 1.8713074634761687e-05, 'epoch': 0.19} + 19%|█▉ | 1088/5772 [1:54:29<8:03:05, 6.19s/it] {'loss': 0.4868, 'learning_rate': 1.8713074634761687e-05, 'epoch': 0.19} + 19%|█▉ | 1088/5772 [1:54:22<8:03:04, 6.19s/it] 19%|█▉ | 1089/5772 [1:54:35<8:07:03, 6.24s/it] 19%|█▉ | 1089/5772 [1:54:28<8:07:03, 6.24s/it] {'loss': 0.4912, 'learning_rate': 1.8710319248266978e-05, 'epoch': 0.19} + 19%|█▉ | 1089/5772 [1:54:35<8:07:03, 6.24s/it] {'loss': 0.4912, 'learning_rate': 1.8710319248266978e-05, 'epoch': 0.19} + 19%|█▉ | 1089/5772 [1:54:28<8:07:03, 6.24s/it] 19%|█▉ | 1090/5772 [1:54:41<8:02:45, 6.19s/it] 19%|█▉ | 1090/5772 [1:54:34<8:02:45, 6.19s/it] {'loss': 0.4802, 'learning_rate': 1.8707561118505656e-05, 'epoch': 0.19} + 19%|█▉ | 1090/5772 [1:54:41<8:02:45, 6.19s/it] {'loss': 0.4802, 'learning_rate': 1.8707561118505656e-05, 'epoch': 0.19} + 19%|█▉ | 1090/5772 [1:54:34<8:02:45, 6.19s/it] 19%|█▉ | 1091/5772 [1:54:40<7:53:23, 6.07s/it] 19%|█▉ | 1091/5772 [1:54:47<7:53:29, 6.07s/it] {'loss': 0.4976, 'learning_rate': 1.8704800246346367e-05, 'epoch': 0.19} + 19%|█▉ | 1091/5772 [1:54:47<7:53:29, 6.07s/it] {'loss': 0.4976, 'learning_rate': 1.8704800246346367e-05, 'epoch': 0.19} + 19%|█▉ | 1091/5772 [1:54:40<7:53:23, 6.07s/it] 19%|█▉ | 1092/5772 [1:54:46<7:51:17, 6.04s/it] 19%|█▉ | 1092/5772 [1:54:53<7:51:15, 6.04s/it] {'loss': 0.4892, 'learning_rate': 1.8702036632658646e-05, 'epoch': 0.19} + 19%|█▉ | 1092/5772 [1:54:53<7:51:15, 6.04s/it] {'loss': 0.4892, 'learning_rate': 1.8702036632658646e-05, 'epoch': 0.19} + 19%|█▉ | 1092/5772 [1:54:46<7:51:17, 6.04s/it] 19%|█▉ | 1093/5772 [1:54:59<7:57:14, 6.12s/it] 19%|█▉ | 1093/5772 [1:54:52<7:57:16, 6.12s/it] {'loss': 0.4902, 'learning_rate': 1.869927027831287e-05, 'epoch': 0.19} + 19%|█▉ | 1093/5772 [1:54:59<7:57:14, 6.12s/it] {'loss': 0.4902, 'learning_rate': 1.869927027831287e-05, 'epoch': 0.19} + 19%|█▉ | 1093/5772 [1:54:52<7:57:16, 6.12s/it] 19%|█▉ | 1094/5772 [1:55:06<7:58:54, 6.14s/it] 19%|█▉ | 1094/5772 [1:54:58<7:58:55, 6.14s/it] {'loss': 0.4751, 'learning_rate': 1.8696501184180283e-05, 'epoch': 0.19} + 19%|█▉ | 1094/5772 [1:55:06<7:58:54, 6.14s/it] {'loss': 0.4751, 'learning_rate': 1.8696501184180283e-05, 'epoch': 0.19} + 19%|█▉ | 1094/5772 [1:54:58<7:58:55, 6.14s/it] 19%|█▉ | 1095/5772 [1:55:12<8:07:28, 6.25s/it] 19%|█▉ | 1095/5772 [1:55:05<8:07:29, 6.25s/it] {'loss': 0.4891, 'learning_rate': 1.8693729351133005e-05, 'epoch': 0.19} + 19%|█▉ | 1095/5772 [1:55:12<8:07:28, 6.25s/it] {'loss': 0.4891, 'learning_rate': 1.8693729351133005e-05, 'epoch': 0.19} + 19%|█▉ | 1095/5772 [1:55:05<8:07:29, 6.25s/it] 19%|█▉ | 1096/5772 [1:55:18<8:08:47, 6.27s/it] 19%|█▉ | 1096/5772 [1:55:11<8:08:48, 6.27s/it] {'loss': 0.4905, 'learning_rate': 1.8690954780044004e-05, 'epoch': 0.19} + 19%|█▉ | 1096/5772 [1:55:18<8:08:47, 6.27s/it] {'loss': 0.4905, 'learning_rate': 1.8690954780044004e-05, 'epoch': 0.19} + 19%|█▉ | 1096/5772 [1:55:11<8:08:48, 6.27s/it] 19%|█▉ | 1097/5772 [1:55:24<8:00:25, 6.17s/it] 19%|█▉ | 1097/5772 [1:55:17<8:00:25, 6.17s/it] {'loss': 0.4763, 'learning_rate': 1.8688177471787118e-05, 'epoch': 0.19} + 19%|█▉ | 1097/5772 [1:55:24<8:00:25, 6.17s/it] {'loss': 0.4763, 'learning_rate': 1.8688177471787118e-05, 'epoch': 0.19} + 19%|█▉ | 1097/5772 [1:55:17<8:00:25, 6.17s/it] 19%|█▉ | 1098/5772 [1:55:24<8:08:14, 6.27s/it] 19%|█▉ | 1098/5772 [1:55:31<8:08:15, 6.27s/it] {'loss': 0.4752, 'learning_rate': 1.8685397427237043e-05, 'epoch': 0.19} + 19%|█▉ | 1098/5772 [1:55:31<8:08:15, 6.27s/it] {'loss': 0.4752, 'learning_rate': 1.8685397427237043e-05, 'epoch': 0.19} + 19%|█▉ | 1098/5772 [1:55:24<8:08:14, 6.27s/it] 19%|█▉ | 1099/5772 [1:55:37<8:09:24, 6.28s/it] 19%|█▉ | 1099/5772 [1:55:30<8:09:24, 6.28s/it] {'loss': 0.4936, 'learning_rate': 1.868261464726934e-05, 'epoch': 0.19} + 19%|█▉ | 1099/5772 [1:55:37<8:09:24, 6.28s/it] {'loss': 0.4936, 'learning_rate': 1.868261464726934e-05, 'epoch': 0.19} + 19%|█▉ | 1099/5772 [1:55:30<8:09:24, 6.28s/it]6 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 19%|█▉ | 1100/5772 [1:55:43<8:03:24, 6.21s/it]12 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend...4 + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 19%|█▉ | 1100/5772 [1:55:36<8:03:25, 6.21s/it]15 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.487, 'learning_rate': 1.8679829132760427e-05, 'epoch': 0.19} + 19%|█▉ | 1100/5772 [1:55:43<8:03:24, 6.21s/it] {'loss': 0.487, 'learning_rate': 1.8679829132760427e-05, 'epoch': 0.19} + 19%|█▉ | 1100/5772 [1:55:36<8:03:25, 6.21s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 19%|█▉ | 1101/5772 [1:56:05<14:13:32, 10.96s/it] 19%|█▉ | 1101/5772 [1:55:58<14:13:32, 10.96s/it] {'loss': 0.4899, 'learning_rate': 1.867704088458759e-05, 'epoch': 0.19} + 19%|█▉ | 1101/5772 [1:56:05<14:13:32, 10.96s/it] {'loss': 0.4899, 'learning_rate': 1.867704088458759e-05, 'epoch': 0.19} + 19%|█▉ | 1101/5772 [1:55:58<14:13:32, 10.96s/it] 19%|█▉ | 1102/5772 [1:56:12<12:25:02, 9.57s/it] 19%|█▉ | 1102/5772 [1:56:04<12:25:02, 9.57s/it] {'loss': 0.5005, 'learning_rate': 1.867424990362897e-05, 'epoch': 0.19} + 19%|█▉ | 1102/5772 [1:56:12<12:25:02, 9.57s/it] {'loss': 0.5005, 'learning_rate': 1.867424990362897e-05, 'epoch': 0.19} + 19%|█▉ | 1102/5772 [1:56:04<12:25:02, 9.57s/it] 19%|█▉ | 1103/5772 [1:56:18<11:10:39, 8.62s/it] 19%|█▉ | 1103/5772 [1:56:11<11:10:45, 8.62s/it] {'loss': 0.5067, 'learning_rate': 1.8671456190763572e-05, 'epoch': 0.19} + 19%|█▉ | 1103/5772 [1:56:18<11:10:39, 8.62s/it] {'loss': 0.5067, 'learning_rate': 1.8671456190763572e-05, 'epoch': 0.19} + 19%|█▉ | 1103/5772 [1:56:11<11:10:45, 8.62s/it] 19%|█▉ | 1104/5772 [1:56:17<10:16:14, 7.92s/it] 19%|█▉ | 1104/5772 [1:56:24<10:16:16, 7.92s/it] {'loss': 0.4883, 'learning_rate': 1.866865974687126e-05, 'epoch': 0.19} + 19%|█▉ | 1104/5772 [1:56:24<10:16:16, 7.92s/it] {'loss': 0.4883, 'learning_rate': 1.866865974687126e-05, 'epoch': 0.19} + 19%|█▉ | 1104/5772 [1:56:17<10:16:14, 7.92s/it] 19%|█▉ | 1105/5772 [1:56:30<9:34:33, 7.39s/it] 19%|█▉ | 1105/5772 [1:56:23<9:34:32, 7.39s/it] {'loss': 0.4883, 'learning_rate': 1.866586057283276e-05, 'epoch': 0.19} + 19%|█▉ | 1105/5772 [1:56:30<9:34:33, 7.39s/it] {'loss': 0.4883, 'learning_rate': 1.866586057283276e-05, 'epoch': 0.19} + 19%|█▉ | 1105/5772 [1:56:23<9:34:32, 7.39s/it] 19%|█▉ | 1106/5772 [1:56:36<9:04:43, 7.00s/it] 19%|█▉ | 1106/5772 [1:56:29<9:04:43, 7.00s/it] {'loss': 0.482, 'learning_rate': 1.8663058669529654e-05, 'epoch': 0.19} + 19%|█▉ | 1106/5772 [1:56:36<9:04:43, 7.00s/it] {'loss': 0.482, 'learning_rate': 1.8663058669529654e-05, 'epoch': 0.19} + 19%|█▉ | 1106/5772 [1:56:29<9:04:43, 7.00s/it] 19%|█▉ | 1107/5772 [1:56:43<8:50:27, 6.82s/it] 19%|█▉ | 1107/5772 [1:56:36<8:50:26, 6.82s/it] {'loss': 0.5132, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.19} + 19%|█▉ | 1107/5772 [1:56:43<8:50:27, 6.82s/it] {'loss': 0.5132, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.19} + 19%|█▉ | 1107/5772 [1:56:36<8:50:26, 6.82s/it] 19%|█▉ | 1108/5772 [1:56:49<8:37:01, 6.65s/it] 19%|█▉ | 1108/5772 [1:56:42<8:37:01, 6.65s/it] {'loss': 0.4922, 'learning_rate': 1.8657446678660264e-05, 'epoch': 0.19} + 19%|█▉ | 1108/5772 [1:56:49<8:37:01, 6.65s/it] {'loss': 0.4922, 'learning_rate': 1.8657446678660264e-05, 'epoch': 0.19} + 19%|█▉ | 1108/5772 [1:56:42<8:37:01, 6.65s/it] 19%|█▉ | 1109/5772 [1:56:56<8:31:34, 6.58s/it] 19%|█▉ | 1109/5772 [1:56:48<8:31:34, 6.58s/it] {'loss': 0.4843, 'learning_rate': 1.865463659286144e-05, 'epoch': 0.19} + 19%|█▉ | 1109/5772 [1:56:56<8:31:34, 6.58s/it] {'loss': 0.4843, 'learning_rate': 1.865463659286144e-05, 'epoch': 0.19} + 19%|█▉ | 1109/5772 [1:56:48<8:31:34, 6.58s/it] 19%|█▉ | 1110/5772 [1:56:55<8:35:49, 6.64s/it] 19%|█▉ | 1110/5772 [1:57:02<8:35:50, 6.64s/it] {'loss': 0.4855, 'learning_rate': 1.8651823781332948e-05, 'epoch': 0.19} + 19%|█▉ | 1110/5772 [1:57:02<8:35:50, 6.64s/it] {'loss': 0.4855, 'learning_rate': 1.8651823781332948e-05, 'epoch': 0.19} + 19%|█▉ | 1110/5772 [1:56:55<8:35:49, 6.64s/it] 19%|█▉ | 1111/5772 [1:57:09<8:30:33, 6.57s/it] 19%|█▉ | 1111/5772 [1:57:02<8:30:33, 6.57s/it] {'loss': 0.4881, 'learning_rate': 1.8649008244960657e-05, 'epoch': 0.19} + 19%|█▉ | 1111/5772 [1:57:09<8:30:33, 6.57s/it] {'loss': 0.4881, 'learning_rate': 1.8649008244960657e-05, 'epoch': 0.19} + 19%|█▉ | 1111/5772 [1:57:02<8:30:33, 6.57s/it] 19%|█▉ | 1112/5772 [1:57:08<8:30:24, 6.57s/it] 19%|█▉ | 1112/5772 [1:57:15<8:30:24, 6.57s/it] {'loss': 0.4867, 'learning_rate': 1.8646189984631306e-05, 'epoch': 0.19} + 19%|█▉ | 1112/5772 [1:57:15<8:30:24, 6.57s/it] {'loss': 0.4867, 'learning_rate': 1.8646189984631306e-05, 'epoch': 0.19} + 19%|█▉ | 1112/5772 [1:57:08<8:30:24, 6.57s/it] 19%|█▉ | 1113/5772 [1:57:21<8:19:43, 6.44s/it] 19%|█▉ | 1113/5772 [1:57:14<8:19:43, 6.44s/it] {'loss': 0.4921, 'learning_rate': 1.8643369001232498e-05, 'epoch': 0.19} + 19%|█▉ | 1113/5772 [1:57:21<8:19:43, 6.44s/it] {'loss': 0.4921, 'learning_rate': 1.8643369001232498e-05, 'epoch': 0.19} + 19%|█▉ | 1113/5772 [1:57:14<8:19:43, 6.44s/it] 19%|█▉ | 1114/5772 [1:57:28<8:21:58, 6.47s/it] 19%|█▉ | 1114/5772 [1:57:21<8:21:58, 6.47s/it] {'loss': 0.4784, 'learning_rate': 1.864054529565267e-05, 'epoch': 0.19} + 19%|█▉ | 1114/5772 [1:57:28<8:21:58, 6.47s/it] {'loss': 0.4784, 'learning_rate': 1.864054529565267e-05, 'epoch': 0.19} + 19%|█▉ | 1114/5772 [1:57:21<8:21:58, 6.47s/it] 19%|█▉ | 1115/5772 [1:57:34<8:16:03, 6.39s/it] 19%|█▉ | 1115/5772 [1:57:27<8:16:02, 6.39s/it] {'loss': 0.4994, 'learning_rate': 1.8637718868781154e-05, 'epoch': 0.19} + 19%|█▉ | 1115/5772 [1:57:34<8:16:03, 6.39s/it] {'loss': 0.4994, 'learning_rate': 1.8637718868781154e-05, 'epoch': 0.19} + 19%|█▉ | 1115/5772 [1:57:27<8:16:02, 6.39s/it] 19%|█▉ | 1116/5772 [1:57:40<8:10:52, 6.33s/it] 19%|█▉ | 1116/5772 [1:57:33<8:10:52, 6.33s/it] {'loss': 0.4842, 'learning_rate': 1.86348897215081e-05, 'epoch': 0.19} + 19%|█▉ | 1116/5772 [1:57:40<8:10:52, 6.33s/it] {'loss': 0.4842, 'learning_rate': 1.86348897215081e-05, 'epoch': 0.19} + 19%|█▉ | 1116/5772 [1:57:33<8:10:52, 6.33s/it] 19%|█▉ | 1117/5772 [1:57:47<8:07:43, 6.29s/it] 19%|█▉ | 1117/5772 [1:57:39<8:07:44, 6.29s/it] {'loss': 0.4868, 'learning_rate': 1.863205785472454e-05, 'epoch': 0.19} + 19%|█▉ | 1117/5772 [1:57:47<8:07:43, 6.29s/it] {'loss': 0.4868, 'learning_rate': 1.863205785472454e-05, 'epoch': 0.19} + 19%|█▉ | 1117/5772 [1:57:39<8:07:44, 6.29s/it] 19%|█▉ | 1118/5772 [1:57:53<8:08:50, 6.30s/it] 19%|█▉ | 1118/5772 [1:57:46<8:08:49, 6.30s/it] {'loss': 0.4885, 'learning_rate': 1.8629223269322353e-05, 'epoch': 0.19} + 19%|█▉ | 1118/5772 [1:57:53<8:08:50, 6.30s/it] {'loss': 0.4885, 'learning_rate': 1.8629223269322353e-05, 'epoch': 0.19} + 19%|█▉ | 1118/5772 [1:57:46<8:08:49, 6.30s/it] 19%|█▉ | 1119/5772 [1:57:59<7:57:31, 6.16s/it] 19%|█▉ | 1119/5772 [1:57:52<7:57:31, 6.16s/it] {'loss': 0.488, 'learning_rate': 1.8626385966194275e-05, 'epoch': 0.19} + 19%|█▉ | 1119/5772 [1:57:59<7:57:31, 6.16s/it] {'loss': 0.488, 'learning_rate': 1.8626385966194275e-05, 'epoch': 0.19} + 19%|█▉ | 1119/5772 [1:57:52<7:57:31, 6.16s/it] 19%|█▉ | 1120/5772 [1:57:58<7:53:05, 6.10s/it] 19%|█▉ | 1120/5772 [1:58:05<7:53:05, 6.10s/it] {'loss': 0.4855, 'learning_rate': 1.86235459462339e-05, 'epoch': 0.19} + 19%|█▉ | 1120/5772 [1:58:05<7:53:05, 6.10s/it] {'loss': 0.4855, 'learning_rate': 1.86235459462339e-05, 'epoch': 0.19} + 19%|█▉ | 1120/5772 [1:57:58<7:53:05, 6.10s/it] 19%|█▉ | 1121/5772 [1:58:11<7:50:28, 6.07s/it] 19%|█▉ | 1121/5772 [1:58:04<7:50:28, 6.07s/it] {'loss': 0.4894, 'learning_rate': 1.862070321033568e-05, 'epoch': 0.19} + 19%|█▉ | 1121/5772 [1:58:11<7:50:28, 6.07s/it] {'loss': 0.4894, 'learning_rate': 1.862070321033568e-05, 'epoch': 0.19} + 19%|█▉ | 1121/5772 [1:58:04<7:50:28, 6.07s/it] 19%|█▉ | 1122/5772 [1:58:17<7:53:29, 6.11s/it] 19%|█▉ | 1122/5772 [1:58:10<7:53:29, 6.11s/it] {'loss': 0.478, 'learning_rate': 1.8617857759394913e-05, 'epoch': 0.19} + 19%|█▉ | 1122/5772 [1:58:17<7:53:29, 6.11s/it] {'loss': 0.478, 'learning_rate': 1.8617857759394913e-05, 'epoch': 0.19} + 19%|█▉ | 1122/5772 [1:58:10<7:53:29, 6.11s/it] 19%|█▉ | 1123/5772 [1:58:24<8:06:21, 6.28s/it] 19%|█▉ | 1123/5772 [1:58:16<8:06:22, 6.28s/it] {'loss': 0.4825, 'learning_rate': 1.8615009594307757e-05, 'epoch': 0.19} + 19%|█▉ | 1123/5772 [1:58:24<8:06:21, 6.28s/it] {'loss': 0.4825, 'learning_rate': 1.8615009594307757e-05, 'epoch': 0.19} + 19%|█▉ | 1123/5772 [1:58:16<8:06:22, 6.28s/it] 19%|█▉ | 1124/5772 [1:58:22<7:54:49, 6.13s/it] 19%|█▉ | 1124/5772 [1:58:29<7:54:49, 6.13s/it] {'loss': 0.4743, 'learning_rate': 1.861215871597123e-05, 'epoch': 0.19} + 19%|█▉ | 1124/5772 [1:58:29<7:54:49, 6.13s/it] {'loss': 0.4743, 'learning_rate': 1.861215871597123e-05, 'epoch': 0.19} + 19%|█▉ | 1124/5772 [1:58:22<7:54:49, 6.13s/it] 19%|█▉ | 1125/5772 [1:58:36<8:01:59, 6.22s/it] 19%|█▉ | 1125/5772 [1:58:29<8:01:59, 6.22s/it] {'loss': 0.4972, 'learning_rate': 1.8609305125283202e-05, 'epoch': 0.19} + 19%|█▉ | 1125/5772 [1:58:36<8:01:59, 6.22s/it] {'loss': 0.4972, 'learning_rate': 1.8609305125283202e-05, 'epoch': 0.19} + 19%|█▉ | 1125/5772 [1:58:29<8:01:59, 6.22s/it] 20%|█▉ | 1126/5772 [1:58:42<7:56:49, 6.16s/it] 20%|█▉ | 1126/5772 [1:58:35<7:56:49, 6.16s/it] {'loss': 0.4808, 'learning_rate': 1.860644882314239e-05, 'epoch': 0.2} + 20%|█▉ | 1126/5772 [1:58:42<7:56:49, 6.16s/it] {'loss': 0.4808, 'learning_rate': 1.860644882314239e-05, 'epoch': 0.2} + 20%|█▉ | 1126/5772 [1:58:35<7:56:49, 6.16s/it] 20%|█▉ | 1127/5772 [1:58:48<7:53:24, 6.12s/it] 20%|█▉ | 1127/5772 [1:58:41<7:53:25, 6.12s/it] {'loss': 0.4905, 'learning_rate': 1.8603589810448377e-05, 'epoch': 0.2} + 20%|█▉ | 1127/5772 [1:58:48<7:53:24, 6.12s/it] {'loss': 0.4905, 'learning_rate': 1.8603589810448377e-05, 'epoch': 0.2} + 20%|█▉ | 1127/5772 [1:58:41<7:53:25, 6.12s/it] 20%|█▉ | 1128/5772 [1:58:47<7:57:07, 6.16s/it] 20%|█▉ | 1128/5772 [1:58:54<7:57:08, 6.16s/it] {'loss': 0.4849, 'learning_rate': 1.8600728088101587e-05, 'epoch': 0.2} + 20%|█▉ | 1128/5772 [1:58:54<7:57:08, 6.16s/it] {'loss': 0.4849, 'learning_rate': 1.8600728088101587e-05, 'epoch': 0.2} + 20%|█▉ | 1128/5772 [1:58:47<7:57:07, 6.16s/it] 20%|█▉ | 1129/5772 [1:59:00<8:01:40, 6.22s/it] 20%|█▉ | 1129/5772 [1:58:53<8:01:40, 6.22s/it] {'loss': 0.4904, 'learning_rate': 1.8597863657003303e-05, 'epoch': 0.2} + 20%|█▉ | 1129/5772 [1:59:00<8:01:40, 6.22s/it] {'loss': 0.4904, 'learning_rate': 1.8597863657003303e-05, 'epoch': 0.2} + 20%|█▉ | 1129/5772 [1:58:53<8:01:40, 6.22s/it] 20%|█▉ | 1130/5772 [1:59:06<7:57:33, 6.17s/it] 20%|█▉ | 1130/5772 [1:58:59<7:57:33, 6.17s/it] {'loss': 0.4808, 'learning_rate': 1.859499651805567e-05, 'epoch': 0.2} + 20%|█▉ | 1130/5772 [1:59:06<7:57:33, 6.17s/it] {'loss': 0.4808, 'learning_rate': 1.859499651805567e-05, 'epoch': 0.2} + 20%|█▉ | 1130/5772 [1:58:59<7:57:33, 6.17s/it] 20%|█▉ | 1131/5772 [1:59:13<7:55:41, 6.15s/it] 20%|█▉ | 1131/5772 [1:59:06<7:55:41, 6.15s/it] {'loss': 0.4904, 'learning_rate': 1.859212667216167e-05, 'epoch': 0.2} + 20%|█▉ | 1131/5772 [1:59:13<7:55:41, 6.15s/it] {'loss': 0.4904, 'learning_rate': 1.859212667216167e-05, 'epoch': 0.2} + 20%|█▉ | 1131/5772 [1:59:06<7:55:41, 6.15s/it] 20%|█▉ | 1132/5772 [1:59:19<7:52:20, 6.11s/it] 20%|█▉ | 1132/5772 [1:59:12<7:52:21, 6.11s/it] {'loss': 0.4728, 'learning_rate': 1.8589254120225145e-05, 'epoch': 0.2} + 20%|█▉ | 1132/5772 [1:59:19<7:52:20, 6.11s/it] {'loss': 0.4728, 'learning_rate': 1.8589254120225145e-05, 'epoch': 0.2} + 20%|█▉ | 1132/5772 [1:59:12<7:52:21, 6.11s/it] 20%|█▉ | 1133/5772 [1:59:25<8:01:05, 6.22s/it] 20%|█▉ | 1133/5772 [1:59:18<8:01:05, 6.22s/it] {'loss': 0.4987, 'learning_rate': 1.858637886315079e-05, 'epoch': 0.2} + 20%|█▉ | 1133/5772 [1:59:25<8:01:05, 6.22s/it] {'loss': 0.4987, 'learning_rate': 1.858637886315079e-05, 'epoch': 0.2} + 20%|█▉ | 1133/5772 [1:59:18<8:01:05, 6.22s/it] 20%|█▉ | 1134/5772 [1:59:24<7:56:47, 6.17s/it] 20%|█▉ | 1134/5772 [1:59:31<7:56:47, 6.17s/it] {'loss': 0.4776, 'learning_rate': 1.8583500901844157e-05, 'epoch': 0.2} + 20%|█▉ | 1134/5772 [1:59:31<7:56:47, 6.17s/it] {'loss': 0.4776, 'learning_rate': 1.8583500901844157e-05, 'epoch': 0.2} + 20%|█▉ | 1134/5772 [1:59:24<7:56:47, 6.17s/it] 20%|█▉ | 1135/5772 [1:59:37<8:01:51, 6.23s/it] 20%|█▉ | 1135/5772 [1:59:30<8:01:51, 6.23s/it] {'loss': 0.4862, 'learning_rate': 1.858062023721164e-05, 'epoch': 0.2} + 20%|█▉ | 1135/5772 [1:59:37<8:01:51, 6.23s/it] {'loss': 0.4862, 'learning_rate': 1.858062023721164e-05, 'epoch': 0.2} + 20%|█▉ | 1135/5772 [1:59:30<8:01:51, 6.23s/it] 20%|█▉ | 1136/5772 [1:59:44<7:59:04, 6.20s/it] 20%|█▉ | 1136/5772 [1:59:37<7:59:04, 6.20s/it] {'loss': 0.488, 'learning_rate': 1.8577736870160482e-05, 'epoch': 0.2} + 20%|█▉ | 1136/5772 [1:59:44<7:59:04, 6.20s/it] {'loss': 0.488, 'learning_rate': 1.8577736870160482e-05, 'epoch': 0.2} + 20%|█▉ | 1136/5772 [1:59:37<7:59:04, 6.20s/it] 20%|█▉ | 1137/5772 [1:59:50<7:56:06, 6.16s/it] 20%|█▉ | 1137/5772 [1:59:43<7:56:06, 6.16s/it] {'loss': 0.4913, 'learning_rate': 1.857485080159879e-05, 'epoch': 0.2} + 20%|█▉ | 1137/5772 [1:59:50<7:56:06, 6.16s/it] {'loss': 0.4913, 'learning_rate': 1.857485080159879e-05, 'epoch': 0.2} + 20%|█▉ | 1137/5772 [1:59:43<7:56:06, 6.16s/it] 20%|█▉ | 1138/5772 [1:59:49<7:49:58, 6.09s/it] 20%|█▉ | 1138/5772 [1:59:56<7:49:59, 6.09s/it] {'loss': 0.4903, 'learning_rate': 1.857196203243552e-05, 'epoch': 0.2} + 20%|█▉ | 1138/5772 [1:59:56<7:49:59, 6.09s/it] {'loss': 0.4903, 'learning_rate': 1.857196203243552e-05, 'epoch': 0.2} + 20%|█▉ | 1138/5772 [1:59:49<7:49:58, 6.09s/it] 20%|█▉ | 1139/5772 [2:00:02<7:45:47, 6.03s/it] 20%|█▉ | 1139/5772 [1:59:54<7:45:48, 6.03s/it] {'loss': 0.4746, 'learning_rate': 1.8569070563580466e-05, 'epoch': 0.2} + 20%|█▉ | 1139/5772 [2:00:02<7:45:47, 6.03s/it] {'loss': 0.4746, 'learning_rate': 1.8569070563580466e-05, 'epoch': 0.2} + 20%|█▉ | 1139/5772 [1:59:54<7:45:48, 6.03s/it] 20%|█▉ | 1140/5772 [2:00:08<7:46:12, 6.04s/it] 20%|█▉ | 1140/5772 [2:00:01<7:46:12, 6.04s/it] {'loss': 0.4835, 'learning_rate': 1.8566176395944277e-05, 'epoch': 0.2} + 20%|█▉ | 1140/5772 [2:00:08<7:46:12, 6.04s/it] {'loss': 0.4835, 'learning_rate': 1.8566176395944277e-05, 'epoch': 0.2} + 20%|█▉ | 1140/5772 [2:00:01<7:46:12, 6.04s/it] 20%|█▉ | 1141/5772 [2:00:14<7:48:49, 6.07s/it] 20%|█▉ | 1141/5772 [2:00:07<7:48:49, 6.07s/it] {'loss': 0.4722, 'learning_rate': 1.8563279530438464e-05, 'epoch': 0.2} + 20%|█▉ | 1141/5772 [2:00:14<7:48:49, 6.07s/it] {'loss': 0.4722, 'learning_rate': 1.8563279530438464e-05, 'epoch': 0.2} + 20%|█▉ | 1141/5772 [2:00:07<7:48:49, 6.07s/it] 20%|█▉ | 1142/5772 [2:00:13<7:49:57, 6.09s/it] 20%|█▉ | 1142/5772 [2:00:20<7:49:57, 6.09s/it] {'loss': 0.4798, 'learning_rate': 1.8560379967975376e-05, 'epoch': 0.2} + 20%|█▉ | 1142/5772 [2:00:20<7:49:57, 6.09s/it] {'loss': 0.4798, 'learning_rate': 1.8560379967975376e-05, 'epoch': 0.2} + 20%|█▉ | 1142/5772 [2:00:13<7:49:57, 6.09s/it] 20%|█▉ | 1143/5772 [2:00:19<7:49:10, 6.08s/it] 20%|█▉ | 1143/5772 [2:00:26<7:49:10, 6.08s/it] {'loss': 0.4928, 'learning_rate': 1.8557477709468214e-05, 'epoch': 0.2} + 20%|█▉ | 1143/5772 [2:00:26<7:49:10, 6.08s/it] {'loss': 0.4928, 'learning_rate': 1.8557477709468214e-05, 'epoch': 0.2} + 20%|█▉ | 1143/5772 [2:00:19<7:49:10, 6.08s/it] 20%|█▉ | 1144/5772 [2:00:25<7:47:51, 6.07s/it] 20%|█▉ | 1144/5772 [2:00:32<7:47:51, 6.07s/it] {'loss': 0.4819, 'learning_rate': 1.8554572755831026e-05, 'epoch': 0.2} + 20%|█▉ | 1144/5772 [2:00:32<7:47:51, 6.07s/it] {'loss': 0.4819, 'learning_rate': 1.8554572755831026e-05, 'epoch': 0.2} + 20%|█▉ | 1144/5772 [2:00:25<7:47:51, 6.07s/it] 20%|█▉ | 1145/5772 [2:00:31<7:47:50, 6.07s/it] 20%|█▉ | 1145/5772 [2:00:38<7:47:50, 6.07s/it] {'loss': 0.4879, 'learning_rate': 1.8551665107978708e-05, 'epoch': 0.2} + 20%|█▉ | 1145/5772 [2:00:38<7:47:50, 6.07s/it] {'loss': 0.4879, 'learning_rate': 1.8551665107978708e-05, 'epoch': 0.2} + 20%|█▉ | 1145/5772 [2:00:31<7:47:50, 6.07s/it] 20%|█▉ | 1146/5772 [2:00:37<7:44:01, 6.02s/it] 20%|█▉ | 1146/5772 [2:00:44<7:44:02, 6.02s/it] {'loss': 0.4904, 'learning_rate': 1.8548754766827016e-05, 'epoch': 0.2} + 20%|█▉ | 1146/5772 [2:00:44<7:44:02, 6.02s/it] {'loss': 0.4904, 'learning_rate': 1.8548754766827016e-05, 'epoch': 0.2} + 20%|█▉ | 1146/5772 [2:00:37<7:44:01, 6.02s/it] 20%|█▉ | 1147/5772 [2:00:43<7:51:43, 6.12s/it] 20%|█▉ | 1147/5772 [2:00:50<7:51:44, 6.12s/it] {'loss': 0.4838, 'learning_rate': 1.8545841733292543e-05, 'epoch': 0.2} + 20%|█▉ | 1147/5772 [2:00:50<7:51:44, 6.12s/it] {'loss': 0.4838, 'learning_rate': 1.8545841733292543e-05, 'epoch': 0.2} + 20%|█▉ | 1147/5772 [2:00:43<7:51:43, 6.12s/it] 20%|█▉ | 1148/5772 [2:00:50<7:55:57, 6.18s/it] 20%|█▉ | 1148/5772 [2:00:57<7:55:57, 6.18s/it] {'loss': 0.4819, 'learning_rate': 1.8542926008292726e-05, 'epoch': 0.2} + 20%|█▉ | 1148/5772 [2:00:57<7:55:57, 6.18s/it] {'loss': 0.4819, 'learning_rate': 1.8542926008292726e-05, 'epoch': 0.2} + 20%|█▉ | 1148/5772 [2:00:50<7:55:57, 6.18s/it] 20%|█▉ | 1149/5772 [2:00:56<7:57:22, 6.20s/it] 20%|█▉ | 1149/5772 [2:01:03<7:57:22, 6.20s/it] {'loss': 0.4732, 'learning_rate': 1.8540007592745865e-05, 'epoch': 0.2} + 20%|█▉ | 1149/5772 [2:01:03<7:57:22, 6.20s/it] {'loss': 0.4732, 'learning_rate': 1.8540007592745865e-05, 'epoch': 0.2} + 20%|█▉ | 1149/5772 [2:00:56<7:57:22, 6.20s/it]5 AutoResumeHook: Checking whether to suspend... +01210 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +7 2AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 20%|█▉ | 1150/5772 [2:01:02<7:54:00, 6.15s/it]1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 20%|█▉ | 1150/5772 [2:01:09<7:54:00, 6.15s/it] {'loss': 0.4843, 'learning_rate': 1.8537086487571095e-05, 'epoch': 0.2} + 20%|█▉ | 1150/5772 [2:01:09<7:54:00, 6.15s/it] {'loss': 0.4843, 'learning_rate': 1.8537086487571095e-05, 'epoch': 0.2} + 20%|█▉ | 1150/5772 [2:01:02<7:54:00, 6.15s/it] 20%|█▉ | 1151/5772 [2:01:08<7:53:07, 6.14s/it] 20%|█▉ | 1151/5772 [2:01:15<7:53:07, 6.14s/it] {'loss': 0.4837, 'learning_rate': 1.85341626936884e-05, 'epoch': 0.2} + 20%|█▉ | 1151/5772 [2:01:15<7:53:07, 6.14s/it] {'loss': 0.4837, 'learning_rate': 1.85341626936884e-05, 'epoch': 0.2} + 20%|█▉ | 1151/5772 [2:01:08<7:53:07, 6.14s/it] 20%|█▉ | 1152/5772 [2:01:14<8:00:49, 6.24s/it] 20%|█▉ | 1152/5772 [2:01:21<8:00:50, 6.24s/it] {'loss': 0.4972, 'learning_rate': 1.8531236212018616e-05, 'epoch': 0.2} + 20%|█▉ | 1152/5772 [2:01:21<8:00:50, 6.24s/it] {'loss': 0.4972, 'learning_rate': 1.8531236212018616e-05, 'epoch': 0.2} + 20%|█▉ | 1152/5772 [2:01:14<8:00:49, 6.24s/it] 20%|█▉ | 1153/5772 [2:01:21<8:04:44, 6.30s/it] 20%|█▉ | 1153/5772 [2:01:28<8:04:44, 6.30s/it] {'loss': 0.4893, 'learning_rate': 1.8528307043483425e-05, 'epoch': 0.2} + 20%|█▉ | 1153/5772 [2:01:28<8:04:44, 6.30s/it] {'loss': 0.4893, 'learning_rate': 1.8528307043483425e-05, 'epoch': 0.2} + 20%|█▉ | 1153/5772 [2:01:21<8:04:44, 6.30s/it] 20%|█▉ | 1154/5772 [2:01:27<8:07:56, 6.34s/it] 20%|█▉ | 1154/5772 [2:01:34<8:07:56, 6.34s/it] {'loss': 0.4974, 'learning_rate': 1.8525375189005345e-05, 'epoch': 0.2} + 20%|█▉ | 1154/5772 [2:01:34<8:07:56, 6.34s/it] {'loss': 0.4974, 'learning_rate': 1.8525375189005345e-05, 'epoch': 0.2} + 20%|█▉ | 1154/5772 [2:01:27<8:07:56, 6.34s/it] 20%|██ | 1155/5772 [2:01:33<8:01:31, 6.26s/it] 20%|██ | 1155/5772 [2:01:40<8:01:31, 6.26s/it] {'loss': 0.4873, 'learning_rate': 1.852244064950775e-05, 'epoch': 0.2} + 20%|██ | 1155/5772 [2:01:40<8:01:31, 6.26s/it] {'loss': 0.4873, 'learning_rate': 1.852244064950775e-05, 'epoch': 0.2} + 20%|██ | 1155/5772 [2:01:33<8:01:31, 6.26s/it] 20%|██ | 1156/5772 [2:01:39<7:54:46, 6.17s/it] 20%|██ | 1156/5772 [2:01:46<7:54:46, 6.17s/it] {'loss': 0.5025, 'learning_rate': 1.8519503425914857e-05, 'epoch': 0.2} + 20%|██ | 1156/5772 [2:01:46<7:54:46, 6.17s/it] {'loss': 0.5025, 'learning_rate': 1.8519503425914857e-05, 'epoch': 0.2} + 20%|██ | 1156/5772 [2:01:39<7:54:46, 6.17s/it] 20%|██ | 1157/5772 [2:01:45<7:50:14, 6.11s/it] 20%|██ | 1157/5772 [2:01:52<7:50:14, 6.11s/it] {'loss': 0.4822, 'learning_rate': 1.851656351915173e-05, 'epoch': 0.2} + 20%|██ | 1157/5772 [2:01:52<7:50:14, 6.11s/it] {'loss': 0.4822, 'learning_rate': 1.851656351915173e-05, 'epoch': 0.2} + 20%|██ | 1157/5772 [2:01:45<7:50:14, 6.11s/it] 20%|██ | 1158/5772 [2:01:52<7:57:37, 6.21s/it] 20%|██ | 1158/5772 [2:01:59<7:57:37, 6.21s/it] {'loss': 0.4985, 'learning_rate': 1.851362093014427e-05, 'epoch': 0.2} + 20%|██ | 1158/5772 [2:01:59<7:57:37, 6.21s/it] {'loss': 0.4985, 'learning_rate': 1.851362093014427e-05, 'epoch': 0.2} + 20%|██ | 1158/5772 [2:01:52<7:57:37, 6.21s/it] 20%|██ | 1159/5772 [2:01:58<8:03:14, 6.29s/it] 20%|██ | 1159/5772 [2:02:05<8:03:14, 6.29s/it] {'loss': 0.4862, 'learning_rate': 1.851067565981924e-05, 'epoch': 0.2} + 20%|██ | 1159/5772 [2:02:05<8:03:14, 6.29s/it] {'loss': 0.4862, 'learning_rate': 1.851067565981924e-05, 'epoch': 0.2} + 20%|██ | 1159/5772 [2:01:58<8:03:14, 6.29s/it] 20%|██ | 1160/5772 [2:02:04<7:51:21, 6.13s/it] 20%|██ | 1160/5772 [2:02:11<7:51:21, 6.13s/it] {'loss': 0.4824, 'learning_rate': 1.850772770910423e-05, 'epoch': 0.2} + 20%|██ | 1160/5772 [2:02:11<7:51:21, 6.13s/it] {'loss': 0.4824, 'learning_rate': 1.850772770910423e-05, 'epoch': 0.2} + 20%|██ | 1160/5772 [2:02:04<7:51:21, 6.13s/it] 20%|██ | 1161/5772 [2:02:10<7:54:38, 6.18s/it] 20%|██ | 1161/5772 [2:02:17<7:54:38, 6.18s/it] {'loss': 0.471, 'learning_rate': 1.850477707892768e-05, 'epoch': 0.2} + 20%|██ | 1161/5772 [2:02:17<7:54:38, 6.18s/it] {'loss': 0.471, 'learning_rate': 1.850477707892768e-05, 'epoch': 0.2} + 20%|██ | 1161/5772 [2:02:10<7:54:38, 6.18s/it] 20%|██ | 1162/5772 [2:02:16<7:54:43, 6.18s/it] 20%|██ | 1162/5772 [2:02:23<7:54:43, 6.18s/it] {'loss': 0.4918, 'learning_rate': 1.8501823770218873e-05, 'epoch': 0.2} + 20%|██ | 1162/5772 [2:02:23<7:54:43, 6.18s/it] {'loss': 0.4918, 'learning_rate': 1.8501823770218873e-05, 'epoch': 0.2} + 20%|██ | 1162/5772 [2:02:16<7:54:43, 6.18s/it] 20%|██ | 1163/5772 [2:02:22<7:47:31, 6.09s/it] 20%|██ | 1163/5772 [2:02:29<7:47:32, 6.09s/it] {'loss': 0.499, 'learning_rate': 1.8498867783907942e-05, 'epoch': 0.2} + 20%|██ | 1163/5772 [2:02:29<7:47:32, 6.09s/it] {'loss': 0.499, 'learning_rate': 1.8498867783907942e-05, 'epoch': 0.2} + 20%|██ | 1163/5772 [2:02:22<7:47:31, 6.09s/it] 20%|██ | 1164/5772 [2:02:28<7:43:44, 6.04s/it] 20%|██ | 1164/5772 [2:02:35<7:43:44, 6.04s/it] {'loss': 0.5001, 'learning_rate': 1.8495909120925857e-05, 'epoch': 0.2} + 20%|██ | 1164/5772 [2:02:35<7:43:44, 6.04s/it] {'loss': 0.5001, 'learning_rate': 1.8495909120925857e-05, 'epoch': 0.2} + 20%|██ | 1164/5772 [2:02:28<7:43:44, 6.04s/it] 20%|██ | 1165/5772 [2:02:35<7:52:10, 6.15s/it] 20%|██ | 1165/5772 [2:02:42<7:52:10, 6.15s/it] {'loss': 0.4884, 'learning_rate': 1.849294778220443e-05, 'epoch': 0.2} + 20%|██ | 1165/5772 [2:02:42<7:52:10, 6.15s/it] {'loss': 0.4884, 'learning_rate': 1.849294778220443e-05, 'epoch': 0.2} + 20%|██ | 1165/5772 [2:02:35<7:52:10, 6.15s/it] 20%|██ | 1166/5772 [2:02:41<7:52:17, 6.15s/it] 20%|██ | 1166/5772 [2:02:48<7:52:17, 6.15s/it] {'loss': 0.4888, 'learning_rate': 1.8489983768676322e-05, 'epoch': 0.2} + 20%|██ | 1166/5772 [2:02:48<7:52:17, 6.15s/it] {'loss': 0.4888, 'learning_rate': 1.8489983768676322e-05, 'epoch': 0.2} + 20%|██ | 1166/5772 [2:02:41<7:52:17, 6.15s/it] 20%|██ | 1167/5772 [2:02:47<7:49:38, 6.12s/it] 20%|██ | 1167/5772 [2:02:54<7:49:38, 6.12s/it] {'loss': 0.4697, 'learning_rate': 1.8487017081275028e-05, 'epoch': 0.2} + 20%|██ | 1167/5772 [2:02:54<7:49:38, 6.12s/it] {'loss': 0.4697, 'learning_rate': 1.8487017081275028e-05, 'epoch': 0.2} + 20%|██ | 1167/5772 [2:02:47<7:49:38, 6.12s/it] 20%|██ | 1168/5772 [2:02:53<7:45:35, 6.07s/it] 20%|██ | 1168/5772 [2:03:00<7:45:41, 6.07s/it] {'loss': 0.4897, 'learning_rate': 1.8484047720934898e-05, 'epoch': 0.2} + 20%|██ | 1168/5772 [2:03:00<7:45:41, 6.07s/it] {'loss': 0.4897, 'learning_rate': 1.8484047720934898e-05, 'epoch': 0.2} + 20%|██ | 1168/5772 [2:02:53<7:45:35, 6.07s/it] 20%|██ | 1169/5772 [2:03:00<8:05:55, 6.33s/it] 20%|██ | 1169/5772 [2:03:07<8:05:53, 6.33s/it] {'loss': 0.4859, 'learning_rate': 1.8481075688591104e-05, 'epoch': 0.2} + 20%|██ | 1169/5772 [2:03:07<8:05:53, 6.33s/it] {'loss': 0.4859, 'learning_rate': 1.8481075688591104e-05, 'epoch': 0.2} + 20%|██ | 1169/5772 [2:03:00<8:05:55, 6.33s/it] 20%|██ | 1170/5772 [2:03:06<8:07:18, 6.35s/it] 20%|██ | 1170/5772 [2:03:13<8:07:16, 6.35s/it] {'loss': 0.4838, 'learning_rate': 1.8478100985179676e-05, 'epoch': 0.2} + 20%|██ | 1170/5772 [2:03:13<8:07:16, 6.35s/it] {'loss': 0.4838, 'learning_rate': 1.8478100985179676e-05, 'epoch': 0.2} + 20%|██ | 1170/5772 [2:03:06<8:07:18, 6.35s/it] 20%|██ | 1171/5772 [2:03:13<8:08:55, 6.38s/it] 20%|██ | 1171/5772 [2:03:20<8:08:54, 6.38s/it] {'loss': 0.4881, 'learning_rate': 1.8475123611637485e-05, 'epoch': 0.2} + 20%|██ | 1171/5772 [2:03:20<8:08:54, 6.38s/it] {'loss': 0.4881, 'learning_rate': 1.8475123611637485e-05, 'epoch': 0.2} + 20%|██ | 1171/5772 [2:03:13<8:08:55, 6.38s/it] 20%|██ | 1172/5772 [2:03:18<7:57:35, 6.23s/it] 20%|██ | 1172/5772 [2:03:25<7:57:34, 6.23s/it] {'loss': 0.4959, 'learning_rate': 1.8472143568902235e-05, 'epoch': 0.2} + 20%|██ | 1172/5772 [2:03:26<7:57:34, 6.23s/it] {'loss': 0.4959, 'learning_rate': 1.8472143568902235e-05, 'epoch': 0.2} + 20%|██ | 1172/5772 [2:03:18<7:57:35, 6.23s/it] 20%|██ | 1173/5772 [2:03:25<7:57:49, 6.23s/it] 20%|██ | 1173/5772 [2:03:32<7:57:49, 6.23s/it] {'loss': 0.4892, 'learning_rate': 1.846916085791247e-05, 'epoch': 0.2} + 20%|██ | 1173/5772 [2:03:32<7:57:49, 6.23s/it] {'loss': 0.4892, 'learning_rate': 1.846916085791247e-05, 'epoch': 0.2} + 20%|██ | 1173/5772 [2:03:25<7:57:49, 6.23s/it] 20%|██ | 1174/5772 [2:03:31<7:51:31, 6.15s/it] 20%|██ | 1174/5772 [2:03:38<7:51:31, 6.15s/it] {'loss': 0.4869, 'learning_rate': 1.8466175479607583e-05, 'epoch': 0.2} + 20%|██ | 1174/5772 [2:03:38<7:51:31, 6.15s/it] {'loss': 0.4869, 'learning_rate': 1.8466175479607583e-05, 'epoch': 0.2} + 20%|██ | 1174/5772 [2:03:31<7:51:31, 6.15s/it] 20%|██ | 1175/5772 [2:03:37<7:58:35, 6.25s/it] 20%|██ | 1175/5772 [2:03:44<7:58:35, 6.25s/it] {'loss': 0.4968, 'learning_rate': 1.84631874349278e-05, 'epoch': 0.2} + 20%|██ | 1175/5772 [2:03:44<7:58:35, 6.25s/it] {'loss': 0.4968, 'learning_rate': 1.84631874349278e-05, 'epoch': 0.2} + 20%|██ | 1175/5772 [2:03:37<7:58:35, 6.25s/it] 20%|██ | 1176/5772 [2:03:50<7:52:30, 6.17s/it] 20%|██ | 1176/5772 [2:03:43<7:52:31, 6.17s/it] {'loss': 0.4881, 'learning_rate': 1.8460196724814193e-05, 'epoch': 0.2} + 20%|██ | 1176/5772 [2:03:50<7:52:30, 6.17s/it] {'loss': 0.4881, 'learning_rate': 1.8460196724814193e-05, 'epoch': 0.2} + 20%|██ | 1176/5772 [2:03:43<7:52:31, 6.17s/it] 20%|██ | 1177/5772 [2:03:57<8:10:27, 6.40s/it] 20%|██ | 1177/5772 [2:03:50<8:10:28, 6.40s/it] {'loss': 0.4959, 'learning_rate': 1.8457203350208664e-05, 'epoch': 0.2} + 20%|██ | 1177/5772 [2:03:57<8:10:27, 6.40s/it] {'loss': 0.4959, 'learning_rate': 1.8457203350208664e-05, 'epoch': 0.2} + 20%|██ | 1177/5772 [2:03:50<8:10:28, 6.40s/it] 20%|██ | 1178/5772 [2:03:56<8:00:13, 6.27s/it] 20%|██ | 1178/5772 [2:04:03<8:00:13, 6.27s/it] {'loss': 0.4905, 'learning_rate': 1.845420731205396e-05, 'epoch': 0.2} + 20%|██ | 1178/5772 [2:04:03<8:00:13, 6.27s/it] {'loss': 0.4905, 'learning_rate': 1.845420731205396e-05, 'epoch': 0.2} + 20%|██ | 1178/5772 [2:03:56<8:00:13, 6.27s/it] 20%|██ | 1179/5772 [2:04:03<8:08:30, 6.38s/it] 20%|██ | 1179/5772 [2:04:10<8:08:30, 6.38s/it] {'loss': 0.4918, 'learning_rate': 1.8451208611293672e-05, 'epoch': 0.2} + 20%|██ | 1179/5772 [2:04:10<8:08:30, 6.38s/it] {'loss': 0.4918, 'learning_rate': 1.8451208611293672e-05, 'epoch': 0.2} + 20%|██ | 1179/5772 [2:04:03<8:08:30, 6.38s/it] 20%|██ | 1180/5772 [2:04:09<8:00:36, 6.28s/it] 20%|██ | 1180/5772 [2:04:16<8:00:36, 6.28s/it] {'loss': 0.4899, 'learning_rate': 1.844820724887222e-05, 'epoch': 0.2} + 20%|██ | 1180/5772 [2:04:16<8:00:36, 6.28s/it] {'loss': 0.4899, 'learning_rate': 1.844820724887222e-05, 'epoch': 0.2} + 20%|██ | 1180/5772 [2:04:09<8:00:36, 6.28s/it] 20%|██ | 1181/5772 [2:04:15<7:52:20, 6.17s/it] 20%|██ | 1181/5772 [2:04:22<7:52:20, 6.17s/it] {'loss': 0.4855, 'learning_rate': 1.8445203225734866e-05, 'epoch': 0.2} + 20%|██ | 1181/5772 [2:04:22<7:52:20, 6.17s/it] {'loss': 0.4855, 'learning_rate': 1.8445203225734866e-05, 'epoch': 0.2} + 20%|██ | 1181/5772 [2:04:15<7:52:20, 6.17s/it] 20%|██ | 1182/5772 [2:04:20<7:44:47, 6.08s/it] 20%|██ | 1182/5772 [2:04:28<7:44:47, 6.08s/it] {'loss': 0.4858, 'learning_rate': 1.8442196542827712e-05, 'epoch': 0.2} + 20%|██ | 1182/5772 [2:04:28<7:44:47, 6.08s/it] {'loss': 0.4858, 'learning_rate': 1.8442196542827712e-05, 'epoch': 0.2} + 20%|██ | 1182/5772 [2:04:20<7:44:47, 6.08s/it] 20%|██ | 1183/5772 [2:04:27<7:48:01, 6.12s/it] 20%|██ | 1183/5772 [2:04:34<7:48:01, 6.12s/it] {'loss': 0.4913, 'learning_rate': 1.8439187201097696e-05, 'epoch': 0.2} + 20%|██ | 1183/5772 [2:04:34<7:48:01, 6.12s/it] {'loss': 0.4913, 'learning_rate': 1.8439187201097696e-05, 'epoch': 0.2} + 20%|██ | 1183/5772 [2:04:27<7:48:01, 6.12s/it] 21%|██ | 1184/5772 [2:04:33<7:53:02, 6.19s/it] 21%|██ | 1184/5772 [2:04:40<7:53:03, 6.19s/it] {'loss': 0.4818, 'learning_rate': 1.8436175201492594e-05, 'epoch': 0.21} + 21%|██ | 1184/5772 [2:04:40<7:53:03, 6.19s/it] {'loss': 0.4818, 'learning_rate': 1.8436175201492594e-05, 'epoch': 0.21} + 21%|██ | 1184/5772 [2:04:33<7:53:02, 6.19s/it] 21%|██ | 1185/5772 [2:04:39<7:50:49, 6.16s/it] 21%|██ | 1185/5772 [2:04:46<7:50:49, 6.16s/it] {'loss': 0.4775, 'learning_rate': 1.8433160544961017e-05, 'epoch': 0.21} + 21%|██ | 1185/5772 [2:04:46<7:50:49, 6.16s/it] {'loss': 0.4775, 'learning_rate': 1.8433160544961017e-05, 'epoch': 0.21} + 21%|██ | 1185/5772 [2:04:39<7:50:49, 6.16s/it] 21%|██ | 1186/5772 [2:04:45<7:53:07, 6.19s/it] 21%|██ | 1186/5772 [2:04:52<7:53:07, 6.19s/it] {'loss': 0.4777, 'learning_rate': 1.8430143232452413e-05, 'epoch': 0.21} + 21%|██ | 1186/5772 [2:04:52<7:53:07, 6.19s/it] {'loss': 0.4777, 'learning_rate': 1.8430143232452413e-05, 'epoch': 0.21} + 21%|██ | 1186/5772 [2:04:45<7:53:07, 6.19s/it] 21%|██ | 1187/5772 [2:04:51<7:47:47, 6.12s/it] 21%|██ | 1187/5772 [2:04:58<7:47:47, 6.12s/it] {'loss': 0.4813, 'learning_rate': 1.8427123264917074e-05, 'epoch': 0.21} + 21%|██ | 1187/5772 [2:04:58<7:47:47, 6.12s/it] {'loss': 0.4813, 'learning_rate': 1.8427123264917074e-05, 'epoch': 0.21} + 21%|██ | 1187/5772 [2:04:51<7:47:47, 6.12s/it] 21%|██ | 1188/5772 [2:04:58<7:55:08, 6.22s/it] 21%|██ | 1188/5772 [2:05:05<7:55:08, 6.22s/it] {'loss': 0.4798, 'learning_rate': 1.8424100643306113e-05, 'epoch': 0.21} + 21%|██ | 1188/5772 [2:05:05<7:55:08, 6.22s/it] {'loss': 0.4798, 'learning_rate': 1.8424100643306113e-05, 'epoch': 0.21} + 21%|██ | 1188/5772 [2:04:58<7:55:08, 6.22s/it] 21%|██ | 1189/5772 [2:05:04<7:56:38, 6.24s/it] 21%|██ | 1189/5772 [2:05:11<7:56:38, 6.24s/it] {'loss': 0.4814, 'learning_rate': 1.8421075368571493e-05, 'epoch': 0.21} + 21%|██ | 1189/5772 [2:05:11<7:56:38, 6.24s/it] {'loss': 0.4814, 'learning_rate': 1.8421075368571493e-05, 'epoch': 0.21} + 21%|██ | 1189/5772 [2:05:04<7:56:38, 6.24s/it] 21%|██ | 1190/5772 [2:05:10<7:55:52, 6.23s/it] 21%|██ | 1190/5772 [2:05:17<7:55:52, 6.23s/it] {'loss': 0.4834, 'learning_rate': 1.8418047441666012e-05, 'epoch': 0.21} + 21%|██ | 1190/5772 [2:05:17<7:55:52, 6.23s/it] {'loss': 0.4834, 'learning_rate': 1.8418047441666012e-05, 'epoch': 0.21} + 21%|██ | 1190/5772 [2:05:10<7:55:52, 6.23s/it] 21%|██ | 1191/5772 [2:05:17<8:03:08, 6.33s/it] 21%|██ | 1191/5772 [2:05:24<8:03:08, 6.33s/it] {'loss': 0.4783, 'learning_rate': 1.8415016863543286e-05, 'epoch': 0.21} + 21%|██ | 1191/5772 [2:05:24<8:03:08, 6.33s/it] {'loss': 0.4783, 'learning_rate': 1.8415016863543286e-05, 'epoch': 0.21} + 21%|██ | 1191/5772 [2:05:17<8:03:08, 6.33s/it] 21%|██ | 1192/5772 [2:05:23<7:59:42, 6.28s/it] 21%|██ | 1192/5772 [2:05:30<7:59:43, 6.28s/it] {'loss': 0.4856, 'learning_rate': 1.8411983635157792e-05, 'epoch': 0.21} + 21%|██ | 1192/5772 [2:05:30<7:59:43, 6.28s/it] {'loss': 0.4856, 'learning_rate': 1.8411983635157792e-05, 'epoch': 0.21} + 21%|██ | 1192/5772 [2:05:23<7:59:42, 6.28s/it] 21%|██ | 1193/5772 [2:05:30<8:10:51, 6.43s/it] 21%|██ | 1193/5772 [2:05:37<8:10:51, 6.43s/it] {'loss': 0.4851, 'learning_rate': 1.8408947757464825e-05, 'epoch': 0.21} + 21%|██ | 1193/5772 [2:05:37<8:10:51, 6.43s/it] {'loss': 0.4851, 'learning_rate': 1.8408947757464825e-05, 'epoch': 0.21} + 21%|██ | 1193/5772 [2:05:30<8:10:51, 6.43s/it] 21%|██ | 1194/5772 [2:05:36<8:11:56, 6.45s/it] 21%|██ | 1194/5772 [2:05:43<8:11:56, 6.45s/it] {'loss': 0.484, 'learning_rate': 1.840590923142051e-05, 'epoch': 0.21} + 21%|██ | 1194/5772 [2:05:43<8:11:56, 6.45s/it] {'loss': 0.484, 'learning_rate': 1.840590923142051e-05, 'epoch': 0.21} + 21%|██ | 1194/5772 [2:05:36<8:11:56, 6.45s/it] 21%|██ | 1195/5772 [2:05:42<8:03:04, 6.33s/it] 21%|██ | 1195/5772 [2:05:49<8:03:04, 6.33s/it] {'loss': 0.4843, 'learning_rate': 1.8402868057981823e-05, 'epoch': 0.21} + 21%|██ | 1195/5772 [2:05:49<8:03:04, 6.33s/it] {'loss': 0.4843, 'learning_rate': 1.8402868057981823e-05, 'epoch': 0.21} + 21%|██ | 1195/5772 [2:05:42<8:03:04, 6.33s/it] 21%|██ | 1196/5772 [2:05:49<8:00:35, 6.30s/it] 21%|██ | 1196/5772 [2:05:56<8:00:35, 6.30s/it] {'loss': 0.4877, 'learning_rate': 1.839982423810656e-05, 'epoch': 0.21} + 21%|██ | 1196/5772 [2:05:56<8:00:35, 6.30s/it] {'loss': 0.4877, 'learning_rate': 1.839982423810656e-05, 'epoch': 0.21} + 21%|██ | 1196/5772 [2:05:49<8:00:35, 6.30s/it] 21%|██ | 1197/5772 [2:05:55<8:02:34, 6.33s/it] 21%|██ | 1197/5772 [2:06:02<8:02:34, 6.33s/it] {'loss': 0.488, 'learning_rate': 1.8396777772753355e-05, 'epoch': 0.21} + 21%|██ | 1197/5772 [2:06:02<8:02:34, 6.33s/it] {'loss': 0.488, 'learning_rate': 1.8396777772753355e-05, 'epoch': 0.21} + 21%|██ | 1197/5772 [2:05:55<8:02:34, 6.33s/it] 21%|██ | 1198/5772 [2:06:01<8:05:47, 6.37s/it] 21%|██ | 1198/5772 [2:06:09<8:05:47, 6.37s/it] {'loss': 0.4934, 'learning_rate': 1.839372866288168e-05, 'epoch': 0.21} + 21%|██ | 1198/5772 [2:06:09<8:05:47, 6.37s/it] {'loss': 0.4934, 'learning_rate': 1.839372866288168e-05, 'epoch': 0.21} + 21%|██ | 1198/5772 [2:06:01<8:05:47, 6.37s/it] 21%|██ | 1199/5772 [2:06:08<7:58:30, 6.28s/it] 21%|██ | 1199/5772 [2:06:15<7:58:30, 6.28s/it] {'loss': 0.4813, 'learning_rate': 1.839067690945183e-05, 'epoch': 0.21} + 21%|██ | 1199/5772 [2:06:15<7:58:30, 6.28s/it] {'loss': 0.4813, 'learning_rate': 1.839067690945183e-05, 'epoch': 0.21} + 21%|██ | 1199/5772 [2:06:08<7:58:30, 6.28s/it]5 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +912 AutoResumeHook: Checking whether to suspend... 21%|██ | 1200/5772 [2:06:14<7:54:03, 6.22s/it]7 AutoResumeHook: Checking whether to suspend...2 + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 21%|██ | 1200/5772 [2:06:21<7:54:03, 6.22s/it]1 AutoResumeHook: Checking whether to suspend...15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + + {'loss': 0.4897, 'learning_rate': 1.8387622513424942e-05, 'epoch': 0.21} + 21%|██ | 1200/5772 [2:06:21<7:54:03, 6.22s/it] {'loss': 0.4897, 'learning_rate': 1.8387622513424942e-05, 'epoch': 0.21} + 21%|██ | 1200/5772 [2:06:14<7:54:03, 6.22s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 21%|██ | 1201/5772 [2:06:39<12:39:04, 9.96s/it] 21%|██ | 1201/5772 [2:06:32<12:39:04, 9.96s/it] {'loss': 0.4739, 'learning_rate': 1.8384565475762978e-05, 'epoch': 0.21} + 21%|██ | 1201/5772 [2:06:39<12:39:04, 9.96s/it] {'loss': 0.4739, 'learning_rate': 1.8384565475762978e-05, 'epoch': 0.21} + 21%|██ | 1201/5772 [2:06:32<12:39:04, 9.96s/it] 21%|██ | 1202/5772 [2:06:45<11:06:42, 8.75s/it] 21%|██ | 1202/5772 [2:06:38<11:06:43, 8.75s/it] {'loss': 0.5024, 'learning_rate': 1.838150579742873e-05, 'epoch': 0.21} + 21%|██ | 1202/5772 [2:06:45<11:06:42, 8.75s/it] {'loss': 0.5024, 'learning_rate': 1.838150579742873e-05, 'epoch': 0.21} + 21%|██ | 1202/5772 [2:06:38<11:06:43, 8.75s/it] 21%|██ | 1203/5772 [2:06:44<10:02:51, 7.92s/it] 21%|██ | 1203/5772 [2:06:51<10:02:51, 7.92s/it] {'loss': 0.4796, 'learning_rate': 1.837844347938584e-05, 'epoch': 0.21} + 21%|██ | 1203/5772 [2:06:51<10:02:51, 7.92s/it] {'loss': 0.4796, 'learning_rate': 1.837844347938584e-05, 'epoch': 0.21} + 21%|██ | 1203/5772 [2:06:44<10:02:51, 7.92s/it] 21%|██ | 1204/5772 [2:06:51<9:26:55, 7.45s/it] 21%|██ | 1204/5772 [2:06:58<9:26:55, 7.45s/it] {'loss': 0.4995, 'learning_rate': 1.8375378522598756e-05, 'epoch': 0.21} + 21%|██ | 1204/5772 [2:06:58<9:26:55, 7.45s/it] {'loss': 0.4995, 'learning_rate': 1.8375378522598756e-05, 'epoch': 0.21} + 21%|██ | 1204/5772 [2:06:51<9:26:55, 7.45s/it] 21%|██ | 1205/5772 [2:06:56<8:47:32, 6.93s/it] 21%|██ | 1205/5772 [2:07:03<8:47:32, 6.93s/it] {'loss': 0.4788, 'learning_rate': 1.8372310928032774e-05, 'epoch': 0.21} + 21%|██ | 1205/5772 [2:07:03<8:47:32, 6.93s/it] {'loss': 0.4788, 'learning_rate': 1.8372310928032774e-05, 'epoch': 0.21} + 21%|██ | 1205/5772 [2:06:56<8:47:32, 6.93s/it] 21%|██ | 1206/5772 [2:07:03<8:33:00, 6.74s/it] 21%|██ | 1206/5772 [2:07:10<8:33:05, 6.74s/it] {'loss': 0.4866, 'learning_rate': 1.8369240696654017e-05, 'epoch': 0.21} + 21%|██ | 1206/5772 [2:07:10<8:33:05, 6.74s/it] {'loss': 0.4866, 'learning_rate': 1.8369240696654017e-05, 'epoch': 0.21} + 21%|██ | 1206/5772 [2:07:03<8:33:00, 6.74s/it] 21%|██ | 1207/5772 [2:07:09<8:17:01, 6.53s/it] 21%|██ | 1207/5772 [2:07:16<8:17:00, 6.53s/it] {'loss': 0.4749, 'learning_rate': 1.8366167829429434e-05, 'epoch': 0.21} + 21%|██ | 1207/5772 [2:07:16<8:17:00, 6.53s/it] {'loss': 0.4749, 'learning_rate': 1.8366167829429434e-05, 'epoch': 0.21} + 21%|██ | 1207/5772 [2:07:09<8:17:01, 6.53s/it] 21%|██ | 1208/5772 [2:07:22<8:08:10, 6.42s/it] 21%|██ | 1208/5772 [2:07:15<8:08:11, 6.42s/it] {'loss': 0.4948, 'learning_rate': 1.836309232732681e-05, 'epoch': 0.21} + 21%|██ | 1208/5772 [2:07:22<8:08:10, 6.42s/it] {'loss': 0.4948, 'learning_rate': 1.836309232732681e-05, 'epoch': 0.21} + 21%|██ | 1208/5772 [2:07:15<8:08:11, 6.42s/it] 21%|██ | 1209/5772 [2:07:21<8:00:44, 6.32s/it] 21%|██ | 1209/5772 [2:07:28<8:00:43, 6.32s/it] {'loss': 0.4762, 'learning_rate': 1.836001419131476e-05, 'epoch': 0.21} + 21%|██ | 1209/5772 [2:07:28<8:00:43, 6.32s/it] {'loss': 0.4762, 'learning_rate': 1.836001419131476e-05, 'epoch': 0.21} + 21%|██ | 1209/5772 [2:07:21<8:00:44, 6.32s/it] 21%|██ | 1210/5772 [2:07:27<8:01:59, 6.34s/it] 21%|██ | 1210/5772 [2:07:34<8:01:59, 6.34s/it] {'loss': 0.5067, 'learning_rate': 1.835693342236272e-05, 'epoch': 0.21} + 21%|██ | 1210/5772 [2:07:34<8:01:59, 6.34s/it] {'loss': 0.5067, 'learning_rate': 1.835693342236272e-05, 'epoch': 0.21} + 21%|██ | 1210/5772 [2:07:27<8:01:59, 6.34s/it] 21%|██ | 1211/5772 [2:07:33<7:56:06, 6.26s/it] 21%|██ | 1211/5772 [2:07:40<7:56:06, 6.26s/it] {'loss': 0.4815, 'learning_rate': 1.8353850021440962e-05, 'epoch': 0.21} + 21%|██ | 1211/5772 [2:07:40<7:56:06, 6.26s/it] {'loss': 0.4815, 'learning_rate': 1.8353850021440962e-05, 'epoch': 0.21} + 21%|██ | 1211/5772 [2:07:33<7:56:06, 6.26s/it] 21%|██ | 1212/5772 [2:07:47<7:53:45, 6.23s/it] 21%|██ | 1212/5772 [2:07:40<7:53:46, 6.23s/it] {'loss': 0.4806, 'learning_rate': 1.835076398952059e-05, 'epoch': 0.21} + 21%|██ | 1212/5772 [2:07:47<7:53:45, 6.23s/it] {'loss': 0.4806, 'learning_rate': 1.835076398952059e-05, 'epoch': 0.21} + 21%|██ | 1212/5772 [2:07:40<7:53:46, 6.23s/it] 21%|██ | 1213/5772 [2:07:53<7:53:24, 6.23s/it] 21%|██ | 1213/5772 [2:07:46<7:53:25, 6.23s/it] {'loss': 0.4874, 'learning_rate': 1.8347675327573525e-05, 'epoch': 0.21} + 21%|██ | 1213/5772 [2:07:53<7:53:24, 6.23s/it] {'loss': 0.4874, 'learning_rate': 1.8347675327573525e-05, 'epoch': 0.21} + 21%|██ | 1213/5772 [2:07:46<7:53:25, 6.23s/it] 21%|██ | 1214/5772 [2:07:52<7:48:31, 6.17s/it] 21%|██ | 1214/5772 [2:07:59<7:48:31, 6.17s/it] {'loss': 0.4903, 'learning_rate': 1.834458403657253e-05, 'epoch': 0.21} + 21%|██ | 1214/5772 [2:07:59<7:48:31, 6.17s/it] {'loss': 0.4903, 'learning_rate': 1.834458403657253e-05, 'epoch': 0.21} + 21%|██ | 1214/5772 [2:07:52<7:48:31, 6.17s/it] 21%|██ | 1215/5772 [2:07:58<7:44:24, 6.11s/it] 21%|██ | 1215/5772 [2:08:05<7:44:24, 6.11s/it] {'loss': 0.4731, 'learning_rate': 1.834149011749119e-05, 'epoch': 0.21} + 21%|██ | 1215/5772 [2:08:05<7:44:24, 6.11s/it] {'loss': 0.4731, 'learning_rate': 1.834149011749119e-05, 'epoch': 0.21} + 21%|██ | 1215/5772 [2:07:58<7:44:24, 6.11s/it] 21%|██ | 1216/5772 [2:08:04<7:48:27, 6.17s/it] 21%|██ | 1216/5772 [2:08:11<7:48:28, 6.17s/it] {'loss': 0.4878, 'learning_rate': 1.8338393571303917e-05, 'epoch': 0.21} + 21%|██ | 1216/5772 [2:08:11<7:48:28, 6.17s/it] {'loss': 0.4878, 'learning_rate': 1.8338393571303917e-05, 'epoch': 0.21} + 21%|██ | 1216/5772 [2:08:04<7:48:27, 6.17s/it] 21%|██ | 1217/5772 [2:08:10<7:47:51, 6.16s/it] 21%|██ | 1217/5772 [2:08:17<7:47:52, 6.16s/it] {'loss': 0.4938, 'learning_rate': 1.8335294398985953e-05, 'epoch': 0.21} + 21%|██ | 1217/5772 [2:08:17<7:47:52, 6.16s/it] {'loss': 0.4938, 'learning_rate': 1.8335294398985953e-05, 'epoch': 0.21} + 21%|██ | 1217/5772 [2:08:10<7:47:51, 6.16s/it] 21%|██ | 1218/5772 [2:08:17<7:51:39, 6.21s/it] 21%|██ | 1218/5772 [2:08:24<7:51:39, 6.21s/it] {'loss': 0.4939, 'learning_rate': 1.8332192601513358e-05, 'epoch': 0.21} + 21%|██ | 1218/5772 [2:08:24<7:51:39, 6.21s/it] {'loss': 0.4939, 'learning_rate': 1.8332192601513358e-05, 'epoch': 0.21} + 21%|██ | 1218/5772 [2:08:17<7:51:39, 6.21s/it] 21%|██ | 1219/5772 [2:08:23<7:49:02, 6.18s/it] 21%|██ | 1219/5772 [2:08:30<7:49:02, 6.18s/it] {'loss': 0.4756, 'learning_rate': 1.8329088179863033e-05, 'epoch': 0.21} + 21%|██ | 1219/5772 [2:08:30<7:49:02, 6.18s/it] {'loss': 0.4756, 'learning_rate': 1.8329088179863033e-05, 'epoch': 0.21} + 21%|██ | 1219/5772 [2:08:23<7:49:02, 6.18s/it] 21%|██ | 1220/5772 [2:08:29<7:49:19, 6.19s/it] 21%|██ | 1220/5772 [2:08:36<7:49:19, 6.19s/it] {'loss': 0.4886, 'learning_rate': 1.8325981135012693e-05, 'epoch': 0.21} + 21%|██ | 1220/5772 [2:08:36<7:49:19, 6.19s/it] {'loss': 0.4886, 'learning_rate': 1.8325981135012693e-05, 'epoch': 0.21} + 21%|██ | 1220/5772 [2:08:29<7:49:19, 6.19s/it] 21%|██ | 1221/5772 [2:08:35<7:53:05, 6.24s/it] 21%|██ | 1221/5772 [2:08:42<7:53:05, 6.24s/it] {'loss': 0.4813, 'learning_rate': 1.832287146794089e-05, 'epoch': 0.21} + 21%|██ | 1221/5772 [2:08:42<7:53:05, 6.24s/it] {'loss': 0.4813, 'learning_rate': 1.832287146794089e-05, 'epoch': 0.21} + 21%|██ | 1221/5772 [2:08:35<7:53:05, 6.24s/it] 21%|██ | 1222/5772 [2:08:49<8:00:11, 6.33s/it] 21%|██ | 1222/5772 [2:08:42<8:00:11, 6.33s/it] {'loss': 0.495, 'learning_rate': 1.8319759179626992e-05, 'epoch': 0.21} + 21%|██ | 1222/5772 [2:08:49<8:00:11, 6.33s/it] {'loss': 0.495, 'learning_rate': 1.8319759179626992e-05, 'epoch': 0.21} + 21%|██ | 1222/5772 [2:08:42<8:00:11, 6.33s/it] 21%|██ | 1223/5772 [2:08:48<7:59:23, 6.32s/it] 21%|██ | 1223/5772 [2:08:55<7:59:23, 6.32s/it] {'loss': 0.4832, 'learning_rate': 1.8316644271051197e-05, 'epoch': 0.21} + 21%|██ | 1223/5772 [2:08:55<7:59:23, 6.32s/it] {'loss': 0.4832, 'learning_rate': 1.8316644271051197e-05, 'epoch': 0.21} + 21%|██ | 1223/5772 [2:08:48<7:59:23, 6.32s/it] 21%|██ | 1224/5772 [2:08:54<7:47:22, 6.17s/it] 21%|██ | 1224/5772 [2:09:01<7:47:22, 6.17s/it] {'loss': 0.483, 'learning_rate': 1.8313526743194536e-05, 'epoch': 0.21} + 21%|██ | 1224/5772 [2:09:01<7:47:22, 6.17s/it] {'loss': 0.483, 'learning_rate': 1.8313526743194536e-05, 'epoch': 0.21} + 21%|██ | 1224/5772 [2:08:54<7:47:22, 6.17s/it] 21%|██ | 1225/5772 [2:09:00<7:48:49, 6.19s/it] 21%|██ | 1225/5772 [2:09:07<7:48:49, 6.19s/it] {'loss': 0.4954, 'learning_rate': 1.8310406597038843e-05, 'epoch': 0.21} + 21%|██ | 1225/5772 [2:09:07<7:48:49, 6.19s/it] {'loss': 0.4954, 'learning_rate': 1.8310406597038843e-05, 'epoch': 0.21} + 21%|██ | 1225/5772 [2:09:00<7:48:49, 6.19s/it] 21%|██ | 1226/5772 [2:09:06<7:48:58, 6.19s/it] 21%|██ | 1226/5772 [2:09:13<7:48:59, 6.19s/it] {'loss': 0.4769, 'learning_rate': 1.83072838335668e-05, 'epoch': 0.21} + 21%|██ | 1226/5772 [2:09:13<7:48:59, 6.19s/it] {'loss': 0.4769, 'learning_rate': 1.83072838335668e-05, 'epoch': 0.21} + 21%|██ | 1226/5772 [2:09:06<7:48:58, 6.19s/it] 21%|██▏ | 1227/5772 [2:09:12<7:46:36, 6.16s/it] 21%|██▏ | 1227/5772 [2:09:19<7:46:37, 6.16s/it] {'loss': 0.4896, 'learning_rate': 1.8304158453761904e-05, 'epoch': 0.21} + 21%|██▏ | 1227/5772 [2:09:19<7:46:37, 6.16s/it] {'loss': 0.4896, 'learning_rate': 1.8304158453761904e-05, 'epoch': 0.21} + 21%|██▏ | 1227/5772 [2:09:12<7:46:36, 6.16s/it] 21%|██▏ | 1228/5772 [2:09:19<7:47:09, 6.17s/it] 21%|██▏ | 1228/5772 [2:09:26<7:47:09, 6.17s/it] {'loss': 0.4876, 'learning_rate': 1.8301030458608475e-05, 'epoch': 0.21} + 21%|██▏ | 1228/5772 [2:09:26<7:47:09, 6.17s/it] {'loss': 0.4876, 'learning_rate': 1.8301030458608475e-05, 'epoch': 0.21} + 21%|██▏ | 1228/5772 [2:09:19<7:47:09, 6.17s/it] 21%|██▏ | 1229/5772 [2:09:24<7:40:04, 6.08s/it] 21%|██▏ | 1229/5772 [2:09:31<7:40:04, 6.08s/it] {'loss': 0.4811, 'learning_rate': 1.8297899849091654e-05, 'epoch': 0.21} + 21%|██▏ | 1229/5772 [2:09:31<7:40:04, 6.08s/it] {'loss': 0.4811, 'learning_rate': 1.8297899849091654e-05, 'epoch': 0.21} + 21%|██▏ | 1229/5772 [2:09:24<7:40:04, 6.08s/it] 21%|██▏ | 1230/5772 [2:09:31<7:42:14, 6.11s/it] 21%|██▏ | 1230/5772 [2:09:38<7:42:15, 6.11s/it] {'loss': 0.4938, 'learning_rate': 1.8294766626197414e-05, 'epoch': 0.21} + 21%|██▏ | 1230/5772 [2:09:38<7:42:15, 6.11s/it] {'loss': 0.4938, 'learning_rate': 1.8294766626197414e-05, 'epoch': 0.21} + 21%|██▏ | 1230/5772 [2:09:31<7:42:14, 6.11s/it] 21%|██▏ | 1231/5772 [2:09:37<7:45:51, 6.16s/it] 21%|██▏ | 1231/5772 [2:09:44<7:45:50, 6.16s/it] {'loss': 0.4829, 'learning_rate': 1.8291630790912544e-05, 'epoch': 0.21} + 21%|██▏ | 1231/5772 [2:09:44<7:45:50, 6.16s/it] {'loss': 0.4829, 'learning_rate': 1.8291630790912544e-05, 'epoch': 0.21} + 21%|██▏ | 1231/5772 [2:09:37<7:45:51, 6.16s/it] 21%|██▏ | 1232/5772 [2:09:43<7:44:50, 6.14s/it] 21%|██▏ | 1232/5772 [2:09:50<7:44:50, 6.14s/it] {'loss': 0.4799, 'learning_rate': 1.828849234422466e-05, 'epoch': 0.21} + 21%|██▏ | 1232/5772 [2:09:50<7:44:50, 6.14s/it] {'loss': 0.4799, 'learning_rate': 1.828849234422466e-05, 'epoch': 0.21} + 21%|██▏ | 1232/5772 [2:09:43<7:44:50, 6.14s/it] 21%|██▏ | 1233/5772 [2:09:49<7:45:45, 6.16s/it] 21%|██▏ | 1233/5772 [2:09:56<7:45:45, 6.16s/it] {'loss': 0.475, 'learning_rate': 1.8285351287122192e-05, 'epoch': 0.21} + 21%|██▏ | 1233/5772 [2:09:56<7:45:45, 6.16s/it] {'loss': 0.475, 'learning_rate': 1.8285351287122192e-05, 'epoch': 0.21} + 21%|██▏ | 1233/5772 [2:09:49<7:45:45, 6.16s/it] 21%|██▏ | 1234/5772 [2:09:55<7:42:02, 6.11s/it] 21%|██▏ | 1234/5772 [2:10:02<7:42:01, 6.11s/it] {'loss': 0.4934, 'learning_rate': 1.8282207620594405e-05, 'epoch': 0.21} + 21%|██▏ | 1234/5772 [2:10:02<7:42:01, 6.11s/it] {'loss': 0.4934, 'learning_rate': 1.8282207620594405e-05, 'epoch': 0.21} + 21%|██▏ | 1234/5772 [2:09:55<7:42:02, 6.11s/it] 21%|██▏ | 1235/5772 [2:10:01<7:42:34, 6.12s/it] 21%|██▏ | 1235/5772 [2:10:08<7:42:34, 6.12s/it] {'loss': 0.4791, 'learning_rate': 1.827906134563138e-05, 'epoch': 0.21} + 21%|██▏ | 1235/5772 [2:10:08<7:42:34, 6.12s/it] {'loss': 0.4791, 'learning_rate': 1.827906134563138e-05, 'epoch': 0.21} + 21%|██▏ | 1235/5772 [2:10:01<7:42:34, 6.12s/it] 21%|██▏ | 1236/5772 [2:10:07<7:41:31, 6.10s/it] 21%|██▏ | 1236/5772 [2:10:14<7:41:31, 6.10s/it] {'loss': 0.4891, 'learning_rate': 1.827591246322401e-05, 'epoch': 0.21} + 21%|██▏ | 1236/5772 [2:10:14<7:41:31, 6.10s/it] {'loss': 0.4891, 'learning_rate': 1.827591246322401e-05, 'epoch': 0.21} + 21%|██▏ | 1236/5772 [2:10:07<7:41:31, 6.10s/it] 21%|██▏ | 1237/5772 [2:10:14<7:52:11, 6.25s/it] 21%|██▏ | 1237/5772 [2:10:21<7:52:11, 6.25s/it] {'loss': 0.4866, 'learning_rate': 1.827276097436403e-05, 'epoch': 0.21} + 21%|██▏ | 1237/5772 [2:10:21<7:52:11, 6.25s/it] {'loss': 0.4866, 'learning_rate': 1.827276097436403e-05, 'epoch': 0.21} + 21%|██▏ | 1237/5772 [2:10:14<7:52:11, 6.25s/it] 21%|██▏ | 1238/5772 [2:10:20<7:48:16, 6.20s/it] 21%|██▏ | 1238/5772 [2:10:27<7:48:15, 6.20s/it] {'loss': 0.4862, 'learning_rate': 1.8269606880043974e-05, 'epoch': 0.21} + 21%|██▏ | 1238/5772 [2:10:27<7:48:15, 6.20s/it] {'loss': 0.4862, 'learning_rate': 1.8269606880043974e-05, 'epoch': 0.21} + 21%|██▏ | 1238/5772 [2:10:20<7:48:16, 6.20s/it] 21%|██▏ | 1239/5772 [2:10:26<7:50:52, 6.23s/it] 21%|██▏ | 1239/5772 [2:10:33<7:50:51, 6.23s/it] {'loss': 0.4986, 'learning_rate': 1.8266450181257213e-05, 'epoch': 0.21} + 21%|██▏ | 1239/5772 [2:10:33<7:50:51, 6.23s/it] {'loss': 0.4986, 'learning_rate': 1.8266450181257213e-05, 'epoch': 0.21} + 21%|██▏ | 1239/5772 [2:10:26<7:50:52, 6.23s/it] 21%|██▏ | 1240/5772 [2:10:32<7:48:16, 6.20s/it] 21%|██▏ | 1240/5772 [2:10:40<7:48:17, 6.20s/it] {'loss': 0.4878, 'learning_rate': 1.826329087899793e-05, 'epoch': 0.21} + 21%|██▏ | 1240/5772 [2:10:40<7:48:17, 6.20s/it] {'loss': 0.4878, 'learning_rate': 1.826329087899793e-05, 'epoch': 0.21} + 21%|██▏ | 1240/5772 [2:10:32<7:48:16, 6.20s/it] 22%|██▏ | 1241/5772 [2:10:39<7:54:10, 6.28s/it] 22%|██▏ | 1241/5772 [2:10:46<7:54:10, 6.28s/it] {'loss': 0.4904, 'learning_rate': 1.8260128974261123e-05, 'epoch': 0.21} + 22%|██▏ | 1241/5772 [2:10:46<7:54:10, 6.28s/it] {'loss': 0.4904, 'learning_rate': 1.8260128974261123e-05, 'epoch': 0.21} + 22%|██▏ | 1241/5772 [2:10:39<7:54:10, 6.28s/it] 22%|██▏ | 1242/5772 [2:10:45<7:52:28, 6.26s/it] 22%|██▏ | 1242/5772 [2:10:52<7:52:28, 6.26s/it] {'loss': 0.4943, 'learning_rate': 1.8256964468042624e-05, 'epoch': 0.22} + 22%|██▏ | 1242/5772 [2:10:52<7:52:28, 6.26s/it] {'loss': 0.4943, 'learning_rate': 1.8256964468042624e-05, 'epoch': 0.22} + 22%|██▏ | 1242/5772 [2:10:45<7:52:28, 6.26s/it] 22%|██▏ | 1243/5772 [2:10:51<7:47:13, 6.19s/it] 22%|██▏ | 1243/5772 [2:10:58<7:47:12, 6.19s/it] {'loss': 0.4712, 'learning_rate': 1.8253797361339075e-05, 'epoch': 0.22} + 22%|██▏ | 1243/5772 [2:10:58<7:47:12, 6.19s/it] {'loss': 0.4712, 'learning_rate': 1.8253797361339075e-05, 'epoch': 0.22} + 22%|██▏ | 1243/5772 [2:10:51<7:47:13, 6.19s/it] 22%|██▏ | 1244/5772 [2:10:57<7:45:05, 6.16s/it] 22%|██▏ | 1244/5772 [2:11:04<7:45:05, 6.16s/it] {'loss': 0.4856, 'learning_rate': 1.825062765514794e-05, 'epoch': 0.22} + 22%|██▏ | 1244/5772 [2:11:04<7:45:05, 6.16s/it] {'loss': 0.4856, 'learning_rate': 1.825062765514794e-05, 'epoch': 0.22} + 22%|██▏ | 1244/5772 [2:10:57<7:45:05, 6.16s/it] 22%|██▏ | 1245/5772 [2:11:04<7:51:24, 6.25s/it] 22%|██▏ | 1245/5772 [2:11:11<7:51:24, 6.25s/it] {'loss': 0.4751, 'learning_rate': 1.8247455350467496e-05, 'epoch': 0.22} + 22%|██▏ | 1245/5772 [2:11:11<7:51:24, 6.25s/it] {'loss': 0.4751, 'learning_rate': 1.8247455350467496e-05, 'epoch': 0.22} + 22%|██▏ | 1245/5772 [2:11:04<7:51:24, 6.25s/it] 22%|██▏ | 1246/5772 [2:11:10<7:42:10, 6.13s/it] 22%|██▏ | 1246/5772 [2:11:17<7:42:09, 6.13s/it] {'loss': 0.493, 'learning_rate': 1.8244280448296852e-05, 'epoch': 0.22} + 22%|██▏ | 1246/5772 [2:11:17<7:42:09, 6.13s/it] {'loss': 0.493, 'learning_rate': 1.8244280448296852e-05, 'epoch': 0.22} + 22%|██▏ | 1246/5772 [2:11:10<7:42:10, 6.13s/it] 22%|██▏ | 1247/5772 [2:11:16<7:43:00, 6.14s/it] 22%|██▏ | 1247/5772 [2:11:23<7:43:00, 6.14s/it] {'loss': 0.4863, 'learning_rate': 1.824110294963591e-05, 'epoch': 0.22} + 22%|██▏ | 1247/5772 [2:11:23<7:43:00, 6.14s/it] {'loss': 0.4863, 'learning_rate': 1.824110294963591e-05, 'epoch': 0.22} + 22%|██▏ | 1247/5772 [2:11:16<7:43:00, 6.14s/it] 22%|██▏ | 1248/5772 [2:11:22<7:44:57, 6.17s/it] 22%|██▏ | 1248/5772 [2:11:29<7:44:56, 6.17s/it] {'loss': 0.4936, 'learning_rate': 1.8237922855485422e-05, 'epoch': 0.22} + 22%|██▏ | 1248/5772 [2:11:29<7:44:56, 6.17s/it] {'loss': 0.4936, 'learning_rate': 1.8237922855485422e-05, 'epoch': 0.22} + 22%|██▏ | 1248/5772 [2:11:22<7:44:57, 6.17s/it] 22%|██▏ | 1249/5772 [2:11:28<7:48:35, 6.22s/it] 22%|██▏ | 1249/5772 [2:11:35<7:48:35, 6.22s/it] {'loss': 0.4764, 'learning_rate': 1.823474016684693e-05, 'epoch': 0.22} + 22%|██▏ | 1249/5772 [2:11:35<7:48:35, 6.22s/it] {'loss': 0.4764, 'learning_rate': 1.823474016684693e-05, 'epoch': 0.22} + 22%|██▏ | 1249/5772 [2:11:28<7:48:35, 6.22s/it]13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +1410 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +15 12AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...3 + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 22%|██▏ | 1250/5772 [2:11:42<7:54:55, 6.30s/it] 22%|██▏ | 1250/5772 [2:11:35<7:54:55, 6.30s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4865, 'learning_rate': 1.8231554884722807e-05, 'epoch': 0.22} + 22%|██▏ | 1250/5772 [2:11:42<7:54:55, 6.30s/it] {'loss': 0.4865, 'learning_rate': 1.8231554884722807e-05, 'epoch': 0.22} + 22%|██▏ | 1250/5772 [2:11:35<7:54:55, 6.30s/it] 22%|██▏ | 1251/5772 [2:11:41<7:46:23, 6.19s/it] 22%|██▏ | 1251/5772 [2:11:48<7:46:24, 6.19s/it] {'loss': 0.4816, 'learning_rate': 1.8228367010116246e-05, 'epoch': 0.22} + 22%|██▏ | 1251/5772 [2:11:48<7:46:24, 6.19s/it] {'loss': 0.4816, 'learning_rate': 1.8228367010116246e-05, 'epoch': 0.22} + 22%|██▏ | 1251/5772 [2:11:41<7:46:23, 6.19s/it] 22%|██▏ | 1252/5772 [2:11:47<7:40:40, 6.12s/it] 22%|██▏ | 1252/5772 [2:11:54<7:40:40, 6.12s/it] {'loss': 0.4873, 'learning_rate': 1.822517654403124e-05, 'epoch': 0.22} + 22%|██▏ | 1252/5772 [2:11:54<7:40:40, 6.12s/it] {'loss': 0.4873, 'learning_rate': 1.822517654403124e-05, 'epoch': 0.22} + 22%|██▏ | 1252/5772 [2:11:47<7:40:40, 6.12s/it] 22%|██▏ | 1253/5772 [2:11:53<7:36:49, 6.07s/it] 22%|██▏ | 1253/5772 [2:12:00<7:36:49, 6.07s/it] {'loss': 0.4851, 'learning_rate': 1.8221983487472617e-05, 'epoch': 0.22} + 22%|██▏ | 1253/5772 [2:12:00<7:36:49, 6.07s/it] {'loss': 0.4851, 'learning_rate': 1.8221983487472617e-05, 'epoch': 0.22} + 22%|██▏ | 1253/5772 [2:11:53<7:36:49, 6.07s/it] 22%|██▏ | 1254/5772 [2:11:58<7:30:18, 5.98s/it] 22%|██▏ | 1254/5772 [2:12:05<7:30:18, 5.98s/it] {'loss': 0.4765, 'learning_rate': 1.8218787841446003e-05, 'epoch': 0.22} + 22%|██▏ | 1254/5772 [2:12:05<7:30:18, 5.98s/it] {'loss': 0.4765, 'learning_rate': 1.8218787841446003e-05, 'epoch': 0.22} + 22%|██▏ | 1254/5772 [2:11:58<7:30:18, 5.98s/it] 22%|██▏ | 1255/5772 [2:12:05<7:40:59, 6.12s/it] 22%|██▏ | 1255/5772 [2:12:12<7:40:58, 6.12s/it] {'loss': 0.4912, 'learning_rate': 1.8215589606957862e-05, 'epoch': 0.22} + 22%|██▏ | 1255/5772 [2:12:12<7:40:58, 6.12s/it] {'loss': 0.4912, 'learning_rate': 1.8215589606957862e-05, 'epoch': 0.22} + 22%|██▏ | 1255/5772 [2:12:05<7:40:59, 6.12s/it] 22%|██▏ | 1256/5772 [2:12:11<7:39:51, 6.11s/it] 22%|██▏ | 1256/5772 [2:12:18<7:39:51, 6.11s/it] {'loss': 0.4789, 'learning_rate': 1.821238878501545e-05, 'epoch': 0.22} + 22%|██▏ | 1256/5772 [2:12:18<7:39:51, 6.11s/it] {'loss': 0.4789, 'learning_rate': 1.821238878501545e-05, 'epoch': 0.22} + 22%|██▏ | 1256/5772 [2:12:11<7:39:51, 6.11s/it] 22%|██▏ | 1257/5772 [2:12:17<7:35:59, 6.06s/it] 22%|██▏ | 1257/5772 [2:12:24<7:36:00, 6.06s/it] {'loss': 0.4815, 'learning_rate': 1.820918537662685e-05, 'epoch': 0.22} + 22%|██▏ | 1257/5772 [2:12:24<7:36:00, 6.06s/it] {'loss': 0.4815, 'learning_rate': 1.820918537662685e-05, 'epoch': 0.22} + 22%|██▏ | 1257/5772 [2:12:17<7:35:59, 6.06s/it] 22%|██▏ | 1258/5772 [2:12:23<7:28:26, 5.96s/it] 22%|██▏ | 1258/5772 [2:12:30<7:28:25, 5.96s/it] {'loss': 0.4828, 'learning_rate': 1.8205979382800963e-05, 'epoch': 0.22} + 22%|██▏ | 1258/5772 [2:12:30<7:28:25, 5.96s/it] {'loss': 0.4828, 'learning_rate': 1.8205979382800963e-05, 'epoch': 0.22} + 22%|██▏ | 1258/5772 [2:12:23<7:28:26, 5.96s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (5002 > 4096). Running this sequence through the model will result in indexing errors + 22%|██▏ | 1259/5772 [2:12:29<7:28:25, 5.96s/it] 22%|██▏ | 1259/5772 [2:12:36<7:28:25, 5.96s/it] {'loss': 0.4934, 'learning_rate': 1.820277080454749e-05, 'epoch': 0.22} + 22%|██▏ | 1259/5772 [2:12:36<7:28:25, 5.96s/it] {'loss': 0.4934, 'learning_rate': 1.820277080454749e-05, 'epoch': 0.22} + 22%|██▏ | 1259/5772 [2:12:29<7:28:25, 5.96s/it] 22%|██▏ | 1260/5772 [2:12:35<7:36:26, 6.07s/it] 22%|██▏ | 1260/5772 [2:12:42<7:36:26, 6.07s/it] {'loss': 0.4828, 'learning_rate': 1.8199559642876962e-05, 'epoch': 0.22} + 22%|██▏ | 1260/5772 [2:12:42<7:36:26, 6.07s/it] {'loss': 0.4828, 'learning_rate': 1.8199559642876962e-05, 'epoch': 0.22} + 22%|██▏ | 1260/5772 [2:12:35<7:36:26, 6.07s/it] 22%|██▏ | 1261/5772 [2:12:41<7:35:48, 6.06s/it] 22%|██▏ | 1261/5772 [2:12:48<7:35:48, 6.06s/it] {'loss': 0.4804, 'learning_rate': 1.8196345898800715e-05, 'epoch': 0.22} + 22%|██▏ | 1261/5772 [2:12:48<7:35:48, 6.06s/it] {'loss': 0.4804, 'learning_rate': 1.8196345898800715e-05, 'epoch': 0.22} + 22%|██▏ | 1261/5772 [2:12:41<7:35:48, 6.06s/it] 22%|██▏ | 1262/5772 [2:12:47<7:40:55, 6.13s/it] 22%|██▏ | 1262/5772 [2:12:54<7:40:55, 6.13s/it] {'loss': 0.4816, 'learning_rate': 1.8193129573330896e-05, 'epoch': 0.22} + {'loss': 0.4816, 'learning_rate': 1.8193129573330896e-05, 'epoch': 0.22} + 22%|██▏ | 1262/5772 [2:12:54<7:40:55, 6.13s/it] 22%|██▏ | 1262/5772 [2:12:47<7:40:55, 6.13s/it] 22%|██▏ | 1263/5772 [2:12:54<7:47:32, 6.22s/it] 22%|██▏ | 1263/5772 [2:13:01<7:47:32, 6.22s/it] {'loss': 0.4906, 'learning_rate': 1.8189910667480476e-05, 'epoch': 0.22} + 22%|██▏ | 1263/5772 [2:13:01<7:47:32, 6.22s/it] {'loss': 0.4906, 'learning_rate': 1.8189910667480476e-05, 'epoch': 0.22} + 22%|██▏ | 1263/5772 [2:12:54<7:47:32, 6.22s/it] 22%|██▏ | 1264/5772 [2:13:00<7:44:58, 6.19s/it] 22%|██▏ | 1264/5772 [2:13:07<7:44:59, 6.19s/it] {'loss': 0.4922, 'learning_rate': 1.8186689182263225e-05, 'epoch': 0.22} + 22%|██▏ | 1264/5772 [2:13:07<7:44:59, 6.19s/it] {'loss': 0.4922, 'learning_rate': 1.8186689182263225e-05, 'epoch': 0.22} + 22%|██▏ | 1264/5772 [2:13:00<7:44:58, 6.19s/it] 22%|██▏ | 1265/5772 [2:13:06<7:47:12, 6.22s/it] 22%|██▏ | 1265/5772 [2:13:13<7:47:13, 6.22s/it] {'loss': 0.4915, 'learning_rate': 1.818346511869373e-05, 'epoch': 0.22} + 22%|██▏ | 1265/5772 [2:13:13<7:47:13, 6.22s/it] {'loss': 0.4915, 'learning_rate': 1.818346511869373e-05, 'epoch': 0.22} + 22%|██▏ | 1265/5772 [2:13:06<7:47:12, 6.22s/it] 22%|██▏ | 1266/5772 [2:13:12<7:47:22, 6.22s/it] 22%|██▏ | 1266/5772 [2:13:19<7:47:21, 6.22s/it] {'loss': 0.4774, 'learning_rate': 1.8180238477787406e-05, 'epoch': 0.22} + 22%|██▏ | 1266/5772 [2:13:19<7:47:21, 6.22s/it] {'loss': 0.4774, 'learning_rate': 1.8180238477787406e-05, 'epoch': 0.22} + 22%|██▏ | 1266/5772 [2:13:12<7:47:22, 6.22s/it] 22%|██▏ | 1267/5772 [2:13:25<7:42:42, 6.16s/it] 22%|██▏ | 1267/5772 [2:13:18<7:42:43, 6.16s/it] {'loss': 0.479, 'learning_rate': 1.8177009260560447e-05, 'epoch': 0.22} + 22%|██▏ | 1267/5772 [2:13:25<7:42:42, 6.16s/it] {'loss': 0.479, 'learning_rate': 1.8177009260560447e-05, 'epoch': 0.22} + 22%|██▏ | 1267/5772 [2:13:18<7:42:43, 6.16s/it] 22%|██▏ | 1268/5772 [2:13:32<7:48:45, 6.24s/it] 22%|██▏ | 1268/5772 [2:13:25<7:48:46, 6.24s/it] {'loss': 0.4945, 'learning_rate': 1.817377746802989e-05, 'epoch': 0.22} + 22%|██▏ | 1268/5772 [2:13:32<7:48:45, 6.24s/it] {'loss': 0.4945, 'learning_rate': 1.817377746802989e-05, 'epoch': 0.22} + 22%|██▏ | 1268/5772 [2:13:25<7:48:46, 6.24s/it] 22%|██▏ | 1269/5772 [2:13:31<7:50:38, 6.27s/it] 22%|██▏ | 1269/5772 [2:13:38<7:50:38, 6.27s/it] {'loss': 0.49, 'learning_rate': 1.8170543101213565e-05, 'epoch': 0.22} + 22%|██▏ | 1269/5772 [2:13:38<7:50:38, 6.27s/it] {'loss': 0.49, 'learning_rate': 1.8170543101213565e-05, 'epoch': 0.22} + 22%|██▏ | 1269/5772 [2:13:31<7:50:38, 6.27s/it] 22%|██▏ | 1270/5772 [2:13:44<7:46:20, 6.22s/it] 22%|██▏ | 1270/5772 [2:13:37<7:46:21, 6.22s/it] {'loss': 0.4786, 'learning_rate': 1.816730616113012e-05, 'epoch': 0.22} + 22%|██▏ | 1270/5772 [2:13:44<7:46:20, 6.22s/it] {'loss': 0.4786, 'learning_rate': 1.816730616113012e-05, 'epoch': 0.22} + 22%|██▏ | 1270/5772 [2:13:37<7:46:21, 6.22s/it] 22%|██▏ | 1271/5772 [2:13:50<7:41:30, 6.15s/it] 22%|██▏ | 1271/5772 [2:13:43<7:41:31, 6.15s/it] {'loss': 0.4775, 'learning_rate': 1.816406664879901e-05, 'epoch': 0.22} + 22%|██▏ | 1271/5772 [2:13:50<7:41:30, 6.15s/it] {'loss': 0.4775, 'learning_rate': 1.816406664879901e-05, 'epoch': 0.22} + 22%|██▏ | 1271/5772 [2:13:43<7:41:31, 6.15s/it] 22%|██▏ | 1272/5772 [2:13:49<7:41:19, 6.15s/it] 22%|██▏ | 1272/5772 [2:13:56<7:41:19, 6.15s/it] {'loss': 0.4895, 'learning_rate': 1.8160824565240495e-05, 'epoch': 0.22} + 22%|██▏ | 1272/5772 [2:13:56<7:41:19, 6.15s/it] {'loss': 0.4895, 'learning_rate': 1.8160824565240495e-05, 'epoch': 0.22} + 22%|██▏ | 1272/5772 [2:13:49<7:41:19, 6.15s/it] 22%|██▏ | 1273/5772 [2:13:56<7:54:08, 6.32s/it] 22%|██▏ | 1273/5772 [2:14:03<7:54:09, 6.32s/it] {'loss': 0.4846, 'learning_rate': 1.8157579911475664e-05, 'epoch': 0.22} + 22%|██▏ | 1273/5772 [2:14:03<7:54:09, 6.32s/it] {'loss': 0.4846, 'learning_rate': 1.8157579911475664e-05, 'epoch': 0.22} + 22%|██▏ | 1273/5772 [2:13:56<7:54:08, 6.32s/it] 22%|██▏ | 1274/5772 [2:14:02<7:45:11, 6.21s/it] 22%|██▏ | 1274/5772 [2:14:09<7:45:11, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.8154332688526395e-05, 'epoch': 0.22} + 22%|██▏ | 1274/5772 [2:14:09<7:45:11, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.8154332688526395e-05, 'epoch': 0.22} + 22%|██▏ | 1274/5772 [2:14:02<7:45:11, 6.21s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 22%|██▏ | 1275/5772 [2:14:09<8:00:51, 6.42s/it] 22%|██▏ | 1275/5772 [2:14:16<8:00:51, 6.42s/it] {'loss': 0.4857, 'learning_rate': 1.8151082897415386e-05, 'epoch': 0.22} + 22%|██▏ | 1275/5772 [2:14:16<8:00:51, 6.42s/it] {'loss': 0.4857, 'learning_rate': 1.8151082897415386e-05, 'epoch': 0.22} + 22%|██▏ | 1275/5772 [2:14:09<8:00:51, 6.42s/it] 22%|██▏ | 1276/5772 [2:14:15<7:49:22, 6.26s/it] 22%|██▏ | 1276/5772 [2:14:22<7:49:22, 6.26s/it] {'loss': 0.4779, 'learning_rate': 1.8147830539166132e-05, 'epoch': 0.22} + 22%|██▏ | 1276/5772 [2:14:22<7:49:22, 6.26s/it] {'loss': 0.4779, 'learning_rate': 1.8147830539166132e-05, 'epoch': 0.22} + 22%|██▏ | 1276/5772 [2:14:15<7:49:22, 6.26s/it] 22%|██▏ | 1277/5772 [2:14:21<7:45:27, 6.21s/it] 22%|██▏ | 1277/5772 [2:14:28<7:45:28, 6.21s/it] {'loss': 0.4868, 'learning_rate': 1.8144575614802958e-05, 'epoch': 0.22} + 22%|██▏ | 1277/5772 [2:14:28<7:45:28, 6.21s/it] {'loss': 0.4868, 'learning_rate': 1.8144575614802958e-05, 'epoch': 0.22} + 22%|██▏ | 1277/5772 [2:14:21<7:45:27, 6.21s/it] 22%|██▏ | 1278/5772 [2:14:27<7:45:27, 6.21s/it] 22%|██▏ | 1278/5772 [2:14:34<7:45:28, 6.21s/it] {'loss': 0.4888, 'learning_rate': 1.8141318125350974e-05, 'epoch': 0.22} + 22%|██▏ | 1278/5772 [2:14:34<7:45:28, 6.21s/it] {'loss': 0.4888, 'learning_rate': 1.8141318125350974e-05, 'epoch': 0.22} + 22%|██▏ | 1278/5772 [2:14:27<7:45:27, 6.21s/it] 22%|██▏ | 1279/5772 [2:14:33<7:42:33, 6.18s/it] 22%|██▏ | 1279/5772 [2:14:40<7:42:32, 6.18s/it] {'loss': 0.4667, 'learning_rate': 1.8138058071836116e-05, 'epoch': 0.22} + 22%|██▏ | 1279/5772 [2:14:40<7:42:32, 6.18s/it] {'loss': 0.4667, 'learning_rate': 1.8138058071836116e-05, 'epoch': 0.22} + 22%|██▏ | 1279/5772 [2:14:33<7:42:33, 6.18s/it] 22%|██▏ | 1280/5772 [2:14:39<7:40:16, 6.15s/it] 22%|██▏ | 1280/5772 [2:14:46<7:40:16, 6.15s/it] {'loss': 0.5037, 'learning_rate': 1.8134795455285116e-05, 'epoch': 0.22} + 22%|██▏ | 1280/5772 [2:14:46<7:40:16, 6.15s/it] {'loss': 0.5037, 'learning_rate': 1.8134795455285116e-05, 'epoch': 0.22} + 22%|██▏ | 1280/5772 [2:14:39<7:40:16, 6.15s/it] 22%|██▏ | 1281/5772 [2:14:45<7:38:40, 6.13s/it] 22%|██▏ | 1281/5772 [2:14:52<7:38:39, 6.13s/it] {'loss': 0.4845, 'learning_rate': 1.8131530276725514e-05, 'epoch': 0.22} + 22%|██▏ | 1281/5772 [2:14:52<7:38:39, 6.13s/it] {'loss': 0.4845, 'learning_rate': 1.8131530276725514e-05, 'epoch': 0.22} + 22%|██▏ | 1281/5772 [2:14:45<7:38:40, 6.13s/it] 22%|██▏ | 1282/5772 [2:14:52<7:42:15, 6.18s/it] 22%|██▏ | 1282/5772 [2:14:59<7:42:15, 6.18s/it] {'loss': 0.4904, 'learning_rate': 1.8128262537185663e-05, 'epoch': 0.22} + 22%|██▏ | 1282/5772 [2:14:59<7:42:15, 6.18s/it] {'loss': 0.4904, 'learning_rate': 1.8128262537185663e-05, 'epoch': 0.22} + 22%|██▏ | 1282/5772 [2:14:52<7:42:15, 6.18s/it] 22%|██▏ | 1283/5772 [2:14:58<7:37:17, 6.11s/it] 22%|██▏ | 1283/5772 [2:15:05<7:37:17, 6.11s/it] {'loss': 0.4663, 'learning_rate': 1.812499223769472e-05, 'epoch': 0.22} + 22%|██▏ | 1283/5772 [2:15:05<7:37:17, 6.11s/it] {'loss': 0.4663, 'learning_rate': 1.812499223769472e-05, 'epoch': 0.22} + 22%|██▏ | 1283/5772 [2:14:58<7:37:17, 6.11s/it] 22%|██▏ | 1284/5772 [2:15:04<7:37:45, 6.12s/it] 22%|██▏ | 1284/5772 [2:15:11<7:37:44, 6.12s/it] {'loss': 0.4887, 'learning_rate': 1.8121719379282646e-05, 'epoch': 0.22} + 22%|██▏ | 1284/5772 [2:15:11<7:37:44, 6.12s/it] {'loss': 0.4887, 'learning_rate': 1.8121719379282646e-05, 'epoch': 0.22} + 22%|██▏ | 1284/5772 [2:15:04<7:37:45, 6.12s/it] 22%|██▏ | 1285/5772 [2:15:10<7:37:47, 6.12s/it] 22%|██▏ | 1285/5772 [2:15:17<7:37:47, 6.12s/it] {'loss': 0.4854, 'learning_rate': 1.8118443962980215e-05, 'epoch': 0.22} + 22%|██▏ | 1285/5772 [2:15:17<7:37:47, 6.12s/it] {'loss': 0.4854, 'learning_rate': 1.8118443962980215e-05, 'epoch': 0.22} + 22%|██▏ | 1285/5772 [2:15:10<7:37:47, 6.12s/it] 22%|██▏ | 1286/5772 [2:15:23<7:32:09, 6.05s/it] 22%|██▏ | 1286/5772 [2:15:16<7:32:10, 6.05s/it] {'loss': 0.4785, 'learning_rate': 1.8115165989818992e-05, 'epoch': 0.22} + 22%|██▏ | 1286/5772 [2:15:23<7:32:09, 6.05s/it] {'loss': 0.4785, 'learning_rate': 1.8115165989818992e-05, 'epoch': 0.22} + 22%|██▏ | 1286/5772 [2:15:16<7:32:10, 6.05s/it] 22%|██▏ | 1287/5772 [2:15:22<7:37:14, 6.12s/it] 22%|██▏ | 1287/5772 [2:15:29<7:37:14, 6.12s/it] {'loss': 0.4927, 'learning_rate': 1.8111885460831362e-05, 'epoch': 0.22} + 22%|██▏ | 1287/5772 [2:15:29<7:37:14, 6.12s/it] {'loss': 0.4927, 'learning_rate': 1.8111885460831362e-05, 'epoch': 0.22} + 22%|██▏ | 1287/5772 [2:15:22<7:37:14, 6.12s/it] 22%|██▏ | 1288/5772 [2:15:28<7:37:22, 6.12s/it] 22%|██▏ | 1288/5772 [2:15:35<7:37:22, 6.12s/it] {'loss': 0.4945, 'learning_rate': 1.810860237705051e-05, 'epoch': 0.22} + 22%|██▏ | 1288/5772 [2:15:35<7:37:22, 6.12s/it] {'loss': 0.4945, 'learning_rate': 1.810860237705051e-05, 'epoch': 0.22} + 22%|██▏ | 1288/5772 [2:15:28<7:37:22, 6.12s/it] 22%|██▏ | 1289/5772 [2:15:34<7:33:19, 6.07s/it] 22%|██▏ | 1289/5772 [2:15:41<7:33:19, 6.07s/it] {'loss': 0.4768, 'learning_rate': 1.8105316739510424e-05, 'epoch': 0.22} + 22%|██▏ | 1289/5772 [2:15:41<7:33:19, 6.07s/it] {'loss': 0.4768, 'learning_rate': 1.8105316739510424e-05, 'epoch': 0.22} + 22%|██▏ | 1289/5772 [2:15:34<7:33:19, 6.07s/it] 22%|██▏ | 1290/5772 [2:15:47<7:38:54, 6.14s/it] 22%|██▏ | 1290/5772 [2:15:40<7:38:54, 6.14s/it] {'loss': 0.5007, 'learning_rate': 1.8102028549245894e-05, 'epoch': 0.22} + 22%|██▏ | 1290/5772 [2:15:47<7:38:54, 6.14s/it] {'loss': 0.5007, 'learning_rate': 1.8102028549245894e-05, 'epoch': 0.22} + 22%|██▏ | 1290/5772 [2:15:40<7:38:54, 6.14s/it] 22%|██▏ | 1291/5772 [2:15:47<7:41:12, 6.18s/it] 22%|██▏ | 1291/5772 [2:15:54<7:41:12, 6.18s/it] {'loss': 0.4795, 'learning_rate': 1.8098737807292517e-05, 'epoch': 0.22} + 22%|██▏ | 1291/5772 [2:15:54<7:41:12, 6.18s/it] {'loss': 0.4795, 'learning_rate': 1.8098737807292517e-05, 'epoch': 0.22} + 22%|██▏ | 1291/5772 [2:15:47<7:41:12, 6.18s/it] 22%|██▏ | 1292/5772 [2:16:00<7:44:56, 6.23s/it] 22%|██▏ | 1292/5772 [2:15:53<7:44:57, 6.23s/it] {'loss': 0.4891, 'learning_rate': 1.8095444514686702e-05, 'epoch': 0.22} + 22%|██▏ | 1292/5772 [2:16:00<7:44:56, 6.23s/it] {'loss': 0.4891, 'learning_rate': 1.8095444514686702e-05, 'epoch': 0.22} + 22%|██▏ | 1292/5772 [2:15:53<7:44:57, 6.23s/it] 22%|██▏ | 1293/5772 [2:16:00<7:52:04, 6.32s/it] 22%|██▏ | 1293/5772 [2:16:07<7:52:04, 6.32s/it] {'loss': 0.4903, 'learning_rate': 1.8092148672465647e-05, 'epoch': 0.22} + 22%|██▏ | 1293/5772 [2:16:07<7:52:04, 6.32s/it] {'loss': 0.4903, 'learning_rate': 1.8092148672465647e-05, 'epoch': 0.22} + 22%|██▏ | 1293/5772 [2:16:00<7:52:04, 6.32s/it] 22%|██▏ | 1294/5772 [2:16:06<7:54:30, 6.36s/it] 22%|██▏ | 1294/5772 [2:16:13<7:54:30, 6.36s/it] {'loss': 0.4997, 'learning_rate': 1.8088850281667358e-05, 'epoch': 0.22} + 22%|██▏ | 1294/5772 [2:16:13<7:54:30, 6.36s/it] {'loss': 0.4997, 'learning_rate': 1.8088850281667358e-05, 'epoch': 0.22} + 22%|██▏ | 1294/5772 [2:16:06<7:54:30, 6.36s/it] 22%|██▏ | 1295/5772 [2:16:12<7:56:02, 6.38s/it] 22%|██▏ | 1295/5772 [2:16:19<7:56:03, 6.38s/it] {'loss': 0.4868, 'learning_rate': 1.808554934333065e-05, 'epoch': 0.22} + 22%|██▏ | 1295/5772 [2:16:19<7:56:03, 6.38s/it] {'loss': 0.4868, 'learning_rate': 1.808554934333065e-05, 'epoch': 0.22} + 22%|██▏ | 1295/5772 [2:16:12<7:56:02, 6.38s/it] 22%|██▏ | 1296/5772 [2:16:19<7:50:42, 6.31s/it] 22%|██▏ | 1296/5772 [2:16:26<7:50:42, 6.31s/it] {'loss': 0.5118, 'learning_rate': 1.808224585849513e-05, 'epoch': 0.22} + 22%|██▏ | 1296/5772 [2:16:26<7:50:42, 6.31s/it] {'loss': 0.5118, 'learning_rate': 1.808224585849513e-05, 'epoch': 0.22} + 22%|██▏ | 1296/5772 [2:16:19<7:50:42, 6.31s/it] 22%|██▏ | 1297/5772 [2:16:25<7:44:30, 6.23s/it] 22%|██▏ | 1297/5772 [2:16:32<7:44:30, 6.23s/it] {'loss': 0.4826, 'learning_rate': 1.8078939828201213e-05, 'epoch': 0.22} + 22%|██▏ | 1297/5772 [2:16:32<7:44:30, 6.23s/it] {'loss': 0.4826, 'learning_rate': 1.8078939828201213e-05, 'epoch': 0.22} + 22%|██▏ | 1297/5772 [2:16:25<7:44:30, 6.23s/it] 22%|██▏ | 1298/5772 [2:16:30<7:34:21, 6.09s/it] 22%|██▏ | 1298/5772 [2:16:37<7:34:21, 6.09s/it] {'loss': 0.4833, 'learning_rate': 1.807563125349012e-05, 'epoch': 0.22} + 22%|██▏ | 1298/5772 [2:16:37<7:34:21, 6.09s/it] {'loss': 0.4833, 'learning_rate': 1.807563125349012e-05, 'epoch': 0.22} + 22%|██▏ | 1298/5772 [2:16:30<7:34:21, 6.09s/it] 23%|██▎ | 1299/5772 [2:16:36<7:32:07, 6.06s/it] 23%|██▎ | 1299/5772 [2:16:43<7:32:07, 6.06s/it] {'loss': 0.4942, 'learning_rate': 1.8072320135403862e-05, 'epoch': 0.23} + 23%|██▎ | 1299/5772 [2:16:43<7:32:07, 6.06s/it] {'loss': 0.4942, 'learning_rate': 1.8072320135403862e-05, 'epoch': 0.23} + 23%|██▎ | 1299/5772 [2:16:36<7:32:07, 6.06s/it]11 AutoResumeHook: Checking whether to suspend... +09 AutoResumeHook: Checking whether to suspend... +81312 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +72 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 23%|██▎ | 1300/5772 [2:16:50<7:32:51, 6.08s/it]AutoResumeHook: Checking whether to suspend...15 5 + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 23%|██▎ | 1300/5772 [2:16:42<7:32:51, 6.08s/it]3 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4836, 'learning_rate': 1.806900647498526e-05, 'epoch': 0.23} + 23%|██▎ | 1300/5772 [2:16:50<7:32:51, 6.08s/it] {'loss': 0.4836, 'learning_rate': 1.806900647498526e-05, 'epoch': 0.23} + 23%|██▎ | 1300/5772 [2:16:42<7:32:51, 6.08s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 23%|██▎ | 1301/5772 [2:17:02<12:25:30, 10.00s/it] 23%|██▎ | 1301/5772 [2:17:09<12:25:31, 10.00s/it] {'loss': 0.4759, 'learning_rate': 1.8065690273277933e-05, 'epoch': 0.23} + 23%|██▎ | 1301/5772 [2:17:09<12:25:31, 10.00s/it] {'loss': 0.4759, 'learning_rate': 1.8065690273277933e-05, 'epoch': 0.23} + 23%|██▎ | 1301/5772 [2:17:02<12:25:30, 10.00s/it] 23%|██▎ | 1302/5772 [2:17:08<10:54:50, 8.79s/it] 23%|██▎ | 1302/5772 [2:17:15<10:54:50, 8.79s/it] {'loss': 0.4762, 'learning_rate': 1.8062371531326298e-05, 'epoch': 0.23} + 23%|██▎ | 1302/5772 [2:17:15<10:54:50, 8.79s/it] {'loss': 0.4762, 'learning_rate': 1.8062371531326298e-05, 'epoch': 0.23} + 23%|██▎ | 1302/5772 [2:17:08<10:54:50, 8.79s/it] 23%|██▎ | 1303/5772 [2:17:14<9:55:24, 7.99s/it] 23%|██▎ | 1303/5772 [2:17:21<9:55:24, 7.99s/it] {'loss': 0.4876, 'learning_rate': 1.8059050250175577e-05, 'epoch': 0.23} + 23%|██▎ | 1303/5772 [2:17:21<9:55:24, 7.99s/it] {'loss': 0.4876, 'learning_rate': 1.8059050250175577e-05, 'epoch': 0.23} + 23%|██▎ | 1303/5772 [2:17:14<9:55:24, 7.99s/it] 23%|██▎ | 1304/5772 [2:17:27<9:11:35, 7.41s/it] 23%|██▎ | 1304/5772 [2:17:20<9:11:35, 7.41s/it] {'loss': 0.4878, 'learning_rate': 1.805572643087179e-05, 'epoch': 0.23} + 23%|██▎ | 1304/5772 [2:17:27<9:11:35, 7.41s/it] {'loss': 0.4878, 'learning_rate': 1.805572643087179e-05, 'epoch': 0.23} + 23%|██▎ | 1304/5772 [2:17:20<9:11:35, 7.41s/it] 23%|██▎ | 1305/5772 [2:17:34<9:00:46, 7.26s/it] 23%|██▎ | 1305/5772 [2:17:27<9:00:47, 7.26s/it] {'loss': 0.4943, 'learning_rate': 1.8052400074461752e-05, 'epoch': 0.23} + 23%|██▎ | 1305/5772 [2:17:34<9:00:46, 7.26s/it] {'loss': 0.4943, 'learning_rate': 1.8052400074461752e-05, 'epoch': 0.23} + 23%|██▎ | 1305/5772 [2:17:27<9:00:47, 7.26s/it] 23%|██▎ | 1306/5772 [2:17:39<8:24:49, 6.78s/it] 23%|██▎ | 1306/5772 [2:17:32<8:24:50, 6.78s/it] {'loss': 0.4735, 'learning_rate': 1.8049071181993083e-05, 'epoch': 0.23} + 23%|██▎ | 1306/5772 [2:17:39<8:24:49, 6.78s/it] {'loss': 0.4735, 'learning_rate': 1.8049071181993083e-05, 'epoch': 0.23} + 23%|██▎ | 1306/5772 [2:17:32<8:24:50, 6.78s/it] 23%|██▎ | 1307/5772 [2:17:39<8:10:56, 6.60s/it] 23%|██▎ | 1307/5772 [2:17:46<8:10:57, 6.60s/it] {'loss': 0.4954, 'learning_rate': 1.8045739754514197e-05, 'epoch': 0.23} + 23%|██▎ | 1307/5772 [2:17:46<8:10:57, 6.60s/it] {'loss': 0.4954, 'learning_rate': 1.8045739754514197e-05, 'epoch': 0.23} + 23%|██▎ | 1307/5772 [2:17:39<8:10:56, 6.60s/it] 23%|██▎ | 1308/5772 [2:17:45<7:59:39, 6.45s/it] 23%|██▎ | 1308/5772 [2:17:52<7:59:40, 6.45s/it] {'loss': 0.4753, 'learning_rate': 1.804240579307431e-05, 'epoch': 0.23} + 23%|██▎ | 1308/5772 [2:17:52<7:59:40, 6.45s/it] {'loss': 0.4753, 'learning_rate': 1.804240579307431e-05, 'epoch': 0.23} + 23%|██▎ | 1308/5772 [2:17:45<7:59:39, 6.45s/it] 23%|██▎ | 1309/5772 [2:17:58<7:50:08, 6.32s/it] 23%|██▎ | 1309/5772 [2:17:51<7:50:08, 6.32s/it] {'loss': 0.4947, 'learning_rate': 1.8039069298723438e-05, 'epoch': 0.23} + 23%|██▎ | 1309/5772 [2:17:58<7:50:08, 6.32s/it] {'loss': 0.4947, 'learning_rate': 1.8039069298723438e-05, 'epoch': 0.23} + 23%|██▎ | 1309/5772 [2:17:51<7:50:08, 6.32s/it] 23%|██▎ | 1310/5772 [2:18:03<7:38:01, 6.16s/it] 23%|██▎ | 1310/5772 [2:17:56<7:38:02, 6.16s/it] {'loss': 0.4878, 'learning_rate': 1.8035730272512383e-05, 'epoch': 0.23} + 23%|██▎ | 1310/5772 [2:18:03<7:38:01, 6.16s/it] {'loss': 0.4878, 'learning_rate': 1.8035730272512383e-05, 'epoch': 0.23} + 23%|██▎ | 1310/5772 [2:17:56<7:38:02, 6.16s/it] 23%|██▎ | 1311/5772 [2:18:10<7:37:19, 6.15s/it] 23%|██▎ | 1311/5772 [2:18:03<7:37:19, 6.15s/it] {'loss': 0.4959, 'learning_rate': 1.803238871549276e-05, 'epoch': 0.23} + 23%|██▎ | 1311/5772 [2:18:10<7:37:19, 6.15s/it] {'loss': 0.4959, 'learning_rate': 1.803238871549276e-05, 'epoch': 0.23} + 23%|██▎ | 1311/5772 [2:18:03<7:37:19, 6.15s/it] 23%|██▎ | 1312/5772 [2:18:16<7:41:40, 6.21s/it] 23%|██▎ | 1312/5772 [2:18:09<7:41:41, 6.21s/it] {'loss': 0.4842, 'learning_rate': 1.802904462871697e-05, 'epoch': 0.23} + 23%|██▎ | 1312/5772 [2:18:16<7:41:40, 6.21s/it] {'loss': 0.4842, 'learning_rate': 1.802904462871697e-05, 'epoch': 0.23} + 23%|██▎ | 1312/5772 [2:18:09<7:41:41, 6.21s/it] 23%|██▎ | 1313/5772 [2:18:22<7:41:45, 6.21s/it] 23%|██▎ | 1313/5772 [2:18:15<7:41:46, 6.21s/it] {'loss': 0.479, 'learning_rate': 1.8025698013238217e-05, 'epoch': 0.23} + 23%|██▎ | 1313/5772 [2:18:22<7:41:45, 6.21s/it] {'loss': 0.479, 'learning_rate': 1.8025698013238217e-05, 'epoch': 0.23} + 23%|██▎ | 1313/5772 [2:18:15<7:41:46, 6.21s/it] 23%|██▎ | 1314/5772 [2:18:21<7:33:30, 6.10s/it] 23%|██▎ | 1314/5772 [2:18:28<7:33:31, 6.10s/it] {'loss': 0.4788, 'learning_rate': 1.8022348870110495e-05, 'epoch': 0.23} + 23%|██▎ | 1314/5772 [2:18:21<7:33:30, 6.10s/it]{'loss': 0.4788, 'learning_rate': 1.8022348870110495e-05, 'epoch': 0.23} + 23%|██▎ | 1314/5772 [2:18:28<7:33:31, 6.10s/it] 23%|██▎ | 1315/5772 [2:18:35<7:41:51, 6.22s/it] 23%|██▎ | 1315/5772 [2:18:27<7:41:52, 6.22s/it] {'loss': 0.4837, 'learning_rate': 1.8018997200388605e-05, 'epoch': 0.23} + 23%|██▎ | 1315/5772 [2:18:35<7:41:51, 6.22s/it] {'loss': 0.4837, 'learning_rate': 1.8018997200388605e-05, 'epoch': 0.23} + 23%|██▎ | 1315/5772 [2:18:27<7:41:52, 6.22s/it] 23%|██▎ | 1316/5772 [2:18:41<7:42:02, 6.22s/it] 23%|██▎ | 1316/5772 [2:18:34<7:42:04, 6.22s/it] {'loss': 0.4876, 'learning_rate': 1.801564300512813e-05, 'epoch': 0.23} + 23%|██▎ | 1316/5772 [2:18:41<7:42:02, 6.22s/it] {'loss': 0.4876, 'learning_rate': 1.801564300512813e-05, 'epoch': 0.23} + 23%|██▎ | 1316/5772 [2:18:34<7:42:04, 6.22s/it] 23%|██▎ | 1317/5772 [2:18:47<7:46:48, 6.29s/it] 23%|██▎ | 1317/5772 [2:18:40<7:46:48, 6.29s/it] {'loss': 0.4842, 'learning_rate': 1.8012286285385456e-05, 'epoch': 0.23} + 23%|██▎ | 1317/5772 [2:18:47<7:46:48, 6.29s/it] {'loss': 0.4842, 'learning_rate': 1.8012286285385456e-05, 'epoch': 0.23} + 23%|██▎ | 1317/5772 [2:18:40<7:46:48, 6.29s/it] 23%|██▎ | 1318/5772 [2:18:46<7:37:57, 6.17s/it] 23%|██▎ | 1318/5772 [2:18:53<7:37:58, 6.17s/it] {'loss': 0.4833, 'learning_rate': 1.800892704221777e-05, 'epoch': 0.23} + 23%|██▎ | 1318/5772 [2:18:53<7:37:58, 6.17s/it] {'loss': 0.4833, 'learning_rate': 1.800892704221777e-05, 'epoch': 0.23} + 23%|██▎ | 1318/5772 [2:18:46<7:37:57, 6.17s/it] 23%|██▎ | 1319/5772 [2:18:59<7:38:04, 6.17s/it] 23%|██▎ | 1319/5772 [2:18:52<7:38:04, 6.17s/it] {'loss': 0.4918, 'learning_rate': 1.8005565276683038e-05, 'epoch': 0.23} + 23%|██▎ | 1319/5772 [2:18:59<7:38:04, 6.17s/it] {'loss': 0.4918, 'learning_rate': 1.8005565276683038e-05, 'epoch': 0.23} + 23%|██▎ | 1319/5772 [2:18:52<7:38:04, 6.17s/it] 23%|██▎ | 1320/5772 [2:19:06<7:45:55, 6.28s/it] 23%|██▎ | 1320/5772 [2:18:59<7:45:55, 6.28s/it] {'loss': 0.4897, 'learning_rate': 1.8002200989840034e-05, 'epoch': 0.23} + 23%|██▎ | 1320/5772 [2:19:06<7:45:55, 6.28s/it] {'loss': 0.4897, 'learning_rate': 1.8002200989840034e-05, 'epoch': 0.23} + 23%|██▎ | 1320/5772 [2:18:59<7:45:55, 6.28s/it] 23%|██▎ | 1321/5772 [2:19:12<7:48:18, 6.31s/it] 23%|██▎ | 1321/5772 [2:19:05<7:48:19, 6.31s/it] {'loss': 0.4864, 'learning_rate': 1.7998834182748318e-05, 'epoch': 0.23} + 23%|██▎ | 1321/5772 [2:19:12<7:48:18, 6.31s/it] {'loss': 0.4864, 'learning_rate': 1.7998834182748318e-05, 'epoch': 0.23} + 23%|██▎ | 1321/5772 [2:19:05<7:48:19, 6.31s/it] 23%|██▎ | 1322/5772 [2:19:18<7:45:14, 6.27s/it] 23%|██▎ | 1322/5772 [2:19:11<7:45:14, 6.27s/it] {'loss': 0.4754, 'learning_rate': 1.7995464856468253e-05, 'epoch': 0.23} + 23%|██▎ | 1322/5772 [2:19:18<7:45:14, 6.27s/it] {'loss': 0.4754, 'learning_rate': 1.7995464856468253e-05, 'epoch': 0.23} + 23%|██▎ | 1322/5772 [2:19:11<7:45:14, 6.27s/it] 23%|██▎ | 1323/5772 [2:19:25<7:50:05, 6.34s/it] 23%|██▎ | 1323/5772 [2:19:18<7:50:05, 6.34s/it] {'loss': 0.4875, 'learning_rate': 1.7992093012060988e-05, 'epoch': 0.23} + 23%|██▎ | 1323/5772 [2:19:25<7:50:05, 6.34s/it] {'loss': 0.4875, 'learning_rate': 1.7992093012060988e-05, 'epoch': 0.23} + 23%|██▎ | 1323/5772 [2:19:18<7:50:05, 6.34s/it] 23%|██▎ | 1324/5772 [2:19:31<7:47:54, 6.31s/it] 23%|██▎ | 1324/5772 [2:19:24<7:47:54, 6.31s/it] {'loss': 0.4805, 'learning_rate': 1.798871865058846e-05, 'epoch': 0.23} + 23%|██▎ | 1324/5772 [2:19:31<7:47:54, 6.31s/it] {'loss': 0.4805, 'learning_rate': 1.798871865058846e-05, 'epoch': 0.23} + 23%|██▎ | 1324/5772 [2:19:24<7:47:54, 6.31s/it] 23%|██▎ | 1325/5772 [2:19:30<7:41:55, 6.23s/it] 23%|██▎ | 1325/5772 [2:19:37<7:41:56, 6.23s/it] {'loss': 0.4966, 'learning_rate': 1.7985341773113416e-05, 'epoch': 0.23} + 23%|██▎ | 1325/5772 [2:19:37<7:41:56, 6.23s/it] {'loss': 0.4966, 'learning_rate': 1.7985341773113416e-05, 'epoch': 0.23} + 23%|██▎ | 1325/5772 [2:19:30<7:41:55, 6.23s/it] 23%|██▎ | 1326/5772 [2:19:43<7:40:30, 6.21s/it] 23%|██▎ | 1326/5772 [2:19:36<7:40:30, 6.21s/it] {'loss': 0.4825, 'learning_rate': 1.7981962380699376e-05, 'epoch': 0.23} + 23%|██▎ | 1326/5772 [2:19:43<7:40:30, 6.21s/it] {'loss': 0.4825, 'learning_rate': 1.7981962380699376e-05, 'epoch': 0.23} + 23%|██▎ | 1326/5772 [2:19:36<7:40:30, 6.21s/it] 23%|██▎ | 1327/5772 [2:19:42<7:39:54, 6.21s/it] 23%|██▎ | 1327/5772 [2:19:50<7:39:54, 6.21s/it] {'loss': 0.4848, 'learning_rate': 1.7978580474410665e-05, 'epoch': 0.23} + 23%|██▎ | 1327/5772 [2:19:50<7:39:54, 6.21s/it] {'loss': 0.4848, 'learning_rate': 1.7978580474410665e-05, 'epoch': 0.23} + 23%|██▎ | 1327/5772 [2:19:42<7:39:54, 6.21s/it] 23%|██▎ | 1328/5772 [2:19:48<7:35:00, 6.14s/it] 23%|██▎ | 1328/5772 [2:19:56<7:35:01, 6.14s/it] {'loss': 0.4776, 'learning_rate': 1.7975196055312393e-05, 'epoch': 0.23} + 23%|██▎ | 1328/5772 [2:19:56<7:35:01, 6.14s/it] {'loss': 0.4776, 'learning_rate': 1.7975196055312393e-05, 'epoch': 0.23} + 23%|██▎ | 1328/5772 [2:19:48<7:35:00, 6.14s/it] 23%|██▎ | 1329/5772 [2:19:54<7:31:06, 6.09s/it] 23%|██▎ | 1329/5772 [2:20:01<7:31:06, 6.09s/it] {'loss': 0.4793, 'learning_rate': 1.797180912447047e-05, 'epoch': 0.23} + 23%|██▎ | 1329/5772 [2:20:01<7:31:06, 6.09s/it] {'loss': 0.4793, 'learning_rate': 1.797180912447047e-05, 'epoch': 0.23} + 23%|██▎ | 1329/5772 [2:19:54<7:31:06, 6.09s/it] 23%|██▎ | 1330/5772 [2:20:01<7:32:40, 6.11s/it] 23%|██▎ | 1330/5772 [2:20:08<7:32:39, 6.11s/it] {'loss': 0.4984, 'learning_rate': 1.7968419682951584e-05, 'epoch': 0.23} + 23%|██▎ | 1330/5772 [2:20:08<7:32:39, 6.11s/it] {'loss': 0.4984, 'learning_rate': 1.7968419682951584e-05, 'epoch': 0.23} + 23%|██▎ | 1330/5772 [2:20:01<7:32:40, 6.11s/it] 23%|██▎ | 1331/5772 [2:20:07<7:29:25, 6.07s/it] 23%|██▎ | 1331/5772 [2:20:14<7:29:25, 6.07s/it] {'loss': 0.4848, 'learning_rate': 1.796502773182322e-05, 'epoch': 0.23} + 23%|██▎ | 1331/5772 [2:20:14<7:29:25, 6.07s/it] {'loss': 0.4848, 'learning_rate': 1.796502773182322e-05, 'epoch': 0.23} + 23%|██▎ | 1331/5772 [2:20:07<7:29:25, 6.07s/it] 23%|██▎ | 1332/5772 [2:20:13<7:27:02, 6.04s/it] 23%|██▎ | 1332/5772 [2:20:20<7:27:02, 6.04s/it] {'loss': 0.4775, 'learning_rate': 1.7961633272153662e-05, 'epoch': 0.23} + 23%|██▎ | 1332/5772 [2:20:20<7:27:02, 6.04s/it] {'loss': 0.4775, 'learning_rate': 1.7961633272153662e-05, 'epoch': 0.23} + 23%|██▎ | 1332/5772 [2:20:13<7:27:02, 6.04s/it] 23%|██▎ | 1333/5772 [2:20:19<7:27:05, 6.04s/it] 23%|██▎ | 1333/5772 [2:20:26<7:27:05, 6.04s/it] {'loss': 0.5025, 'learning_rate': 1.7958236305011972e-05, 'epoch': 0.23} + 23%|██▎ | 1333/5772 [2:20:26<7:27:05, 6.04s/it] {'loss': 0.5025, 'learning_rate': 1.7958236305011972e-05, 'epoch': 0.23} + 23%|██▎ | 1333/5772 [2:20:19<7:27:05, 6.04s/it] 23%|██▎ | 1334/5772 [2:20:25<7:30:33, 6.09s/it] 23%|██▎ | 1334/5772 [2:20:32<7:30:33, 6.09s/it] {'loss': 0.4843, 'learning_rate': 1.7954836831468007e-05, 'epoch': 0.23} + 23%|██▎ | 1334/5772 [2:20:32<7:30:33, 6.09s/it] {'loss': 0.4843, 'learning_rate': 1.7954836831468007e-05, 'epoch': 0.23} + 23%|██▎ | 1334/5772 [2:20:25<7:30:33, 6.09s/it] 23%|██▎ | 1335/5772 [2:20:31<7:24:16, 6.01s/it] 23%|██▎ | 1335/5772 [2:20:38<7:24:16, 6.01s/it] {'loss': 0.4804, 'learning_rate': 1.7951434852592406e-05, 'epoch': 0.23} + 23%|██▎ | 1335/5772 [2:20:38<7:24:16, 6.01s/it] {'loss': 0.4804, 'learning_rate': 1.7951434852592406e-05, 'epoch': 0.23} + 23%|██▎ | 1335/5772 [2:20:31<7:24:16, 6.01s/it] 23%|██▎ | 1336/5772 [2:20:37<7:21:35, 5.97s/it] 23%|██▎ | 1336/5772 [2:20:44<7:21:35, 5.97s/it] {'loss': 0.4761, 'learning_rate': 1.794803036945661e-05, 'epoch': 0.23} + 23%|██▎ | 1336/5772 [2:20:44<7:21:35, 5.97s/it] {'loss': 0.4761, 'learning_rate': 1.794803036945661e-05, 'epoch': 0.23} + 23%|██▎ | 1336/5772 [2:20:37<7:21:35, 5.97s/it] 23%|██▎ | 1337/5772 [2:20:43<7:28:04, 6.06s/it] 23%|██▎ | 1337/5772 [2:20:50<7:28:04, 6.06s/it] {'loss': 0.4893, 'learning_rate': 1.794462338313284e-05, 'epoch': 0.23} + 23%|██▎ | 1337/5772 [2:20:50<7:28:04, 6.06s/it] {'loss': 0.4893, 'learning_rate': 1.794462338313284e-05, 'epoch': 0.23} + 23%|██▎ | 1337/5772 [2:20:43<7:28:04, 6.06s/it] 23%|██▎ | 1338/5772 [2:20:56<7:29:51, 6.09s/it] 23%|██▎ | 1338/5772 [2:20:49<7:29:52, 6.09s/it] {'loss': 0.4781, 'learning_rate': 1.7941213894694108e-05, 'epoch': 0.23} + 23%|██▎ | 1338/5772 [2:20:56<7:29:51, 6.09s/it] {'loss': 0.4781, 'learning_rate': 1.7941213894694108e-05, 'epoch': 0.23} + 23%|██▎ | 1338/5772 [2:20:49<7:29:52, 6.09s/it] 23%|██▎ | 1339/5772 [2:20:55<7:26:23, 6.04s/it] 23%|██▎ | 1339/5772 [2:21:02<7:26:23, 6.04s/it] {'loss': 0.4984, 'learning_rate': 1.7937801905214213e-05, 'epoch': 0.23} + 23%|██▎ | 1339/5772 [2:21:02<7:26:23, 6.04s/it] {'loss': 0.4984, 'learning_rate': 1.7937801905214213e-05, 'epoch': 0.23} + 23%|██▎ | 1339/5772 [2:20:55<7:26:23, 6.04s/it] 23%|██▎ | 1340/5772 [2:21:02<7:42:19, 6.26s/it] 23%|██▎ | 1340/5772 [2:21:09<7:42:20, 6.26s/it] {'loss': 0.4858, 'learning_rate': 1.7934387415767745e-05, 'epoch': 0.23} + 23%|██▎ | 1340/5772 [2:21:09<7:42:20, 6.26s/it] {'loss': 0.4858, 'learning_rate': 1.7934387415767745e-05, 'epoch': 0.23} + 23%|██▎ | 1340/5772 [2:21:02<7:42:19, 6.26s/it] 23%|██▎ | 1341/5772 [2:21:07<7:30:14, 6.10s/it] 23%|██▎ | 1341/5772 [2:21:14<7:30:15, 6.10s/it] {'loss': 0.4893, 'learning_rate': 1.7930970427430074e-05, 'epoch': 0.23} + 23%|██▎ | 1341/5772 [2:21:14<7:30:15, 6.10s/it] {'loss': 0.4893, 'learning_rate': 1.7930970427430074e-05, 'epoch': 0.23} + 23%|██▎ | 1341/5772 [2:21:07<7:30:14, 6.10s/it] 23%|██▎ | 1342/5772 [2:21:20<7:25:46, 6.04s/it] 23%|██▎ | 1342/5772 [2:21:13<7:25:46, 6.04s/it] {'loss': 0.4732, 'learning_rate': 1.7927550941277364e-05, 'epoch': 0.23} + 23%|██▎ | 1342/5772 [2:21:20<7:25:46, 6.04s/it] {'loss': 0.4732, 'learning_rate': 1.7927550941277364e-05, 'epoch': 0.23} + 23%|██▎ | 1342/5772 [2:21:13<7:25:46, 6.04s/it] 23%|██▎ | 1343/5772 [2:21:27<7:41:16, 6.25s/it] 23%|██▎ | 1343/5772 [2:21:20<7:41:16, 6.25s/it] {'loss': 0.4996, 'learning_rate': 1.7924128958386558e-05, 'epoch': 0.23} + 23%|██▎ | 1343/5772 [2:21:27<7:41:16, 6.25s/it] {'loss': 0.4996, 'learning_rate': 1.7924128958386558e-05, 'epoch': 0.23} + 23%|██▎ | 1343/5772 [2:21:20<7:41:16, 6.25s/it] 23%|██▎ | 1344/5772 [2:21:26<7:35:06, 6.17s/it] 23%|██▎ | 1344/5772 [2:21:33<7:35:07, 6.17s/it] {'loss': 0.4775, 'learning_rate': 1.79207044798354e-05, 'epoch': 0.23} + 23%|██▎ | 1344/5772 [2:21:33<7:35:07, 6.17s/it] {'loss': 0.4775, 'learning_rate': 1.79207044798354e-05, 'epoch': 0.23} + 23%|██▎ | 1344/5772 [2:21:26<7:35:06, 6.17s/it] 23%|██▎ | 1345/5772 [2:21:32<7:32:43, 6.14s/it] 23%|██▎ | 1345/5772 [2:21:39<7:32:44, 6.14s/it] {'loss': 0.4816, 'learning_rate': 1.7917277506702406e-05, 'epoch': 0.23} + 23%|██▎ | 1345/5772 [2:21:39<7:32:44, 6.14s/it] {'loss': 0.4816, 'learning_rate': 1.7917277506702406e-05, 'epoch': 0.23} + 23%|██▎ | 1345/5772 [2:21:32<7:32:43, 6.14s/it] 23%|██▎ | 1346/5772 [2:21:38<7:35:42, 6.18s/it] 23%|██▎ | 1346/5772 [2:21:45<7:35:42, 6.18s/it] {'loss': 0.4786, 'learning_rate': 1.791384804006688e-05, 'epoch': 0.23} + 23%|██▎ | 1346/5772 [2:21:45<7:35:42, 6.18s/it] {'loss': 0.4786, 'learning_rate': 1.791384804006688e-05, 'epoch': 0.23} + 23%|██▎ | 1346/5772 [2:21:38<7:35:42, 6.18s/it] 23%|██▎ | 1347/5772 [2:21:44<7:33:40, 6.15s/it] 23%|██▎ | 1347/5772 [2:21:51<7:33:40, 6.15s/it] {'loss': 0.4925, 'learning_rate': 1.7910416081008914e-05, 'epoch': 0.23} + 23%|██▎ | 1347/5772 [2:21:51<7:33:40, 6.15s/it] {'loss': 0.4925, 'learning_rate': 1.7910416081008914e-05, 'epoch': 0.23} + 23%|██▎ | 1347/5772 [2:21:44<7:33:40, 6.15s/it] 23%|██▎ | 1348/5772 [2:21:51<7:32:37, 6.14s/it] 23%|██▎ | 1348/5772 [2:21:58<7:32:36, 6.14s/it] {'loss': 0.4695, 'learning_rate': 1.7906981630609383e-05, 'epoch': 0.23} + 23%|██▎ | 1348/5772 [2:21:58<7:32:36, 6.14s/it] {'loss': 0.4695, 'learning_rate': 1.7906981630609383e-05, 'epoch': 0.23} + 23%|██▎ | 1348/5772 [2:21:51<7:32:37, 6.14s/it] 23%|██▎ | 1349/5772 [2:21:57<7:43:18, 6.28s/it] 23%|██▎ | 1349/5772 [2:22:04<7:43:17, 6.28s/it] {'loss': 0.4898, 'learning_rate': 1.7903544689949955e-05, 'epoch': 0.23} + 23%|██▎ | 1349/5772 [2:22:04<7:43:17, 6.28s/it] {'loss': 0.4898, 'learning_rate': 1.7903544689949955e-05, 'epoch': 0.23} + 23%|██▎ | 1349/5772 [2:21:57<7:43:18, 6.28s/it]10 AutoResumeHook: Checking whether to suspend... +20 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +7 23%|██▎ | 1350/5772 [2:22:03<7:39:19, 6.23s/it] AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 23%|██▎ | 1350/5772 [2:22:10<7:39:19, 6.23s/it]3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4902, 'learning_rate': 1.7900105260113066e-05, 'epoch': 0.23} + 23%|██▎ | 1350/5772 [2:22:10<7:39:19, 6.23s/it] {'loss': 0.4902, 'learning_rate': 1.7900105260113066e-05, 'epoch': 0.23} + 23%|██▎ | 1350/5772 [2:22:03<7:39:19, 6.23s/it] 23%|██▎ | 1351/5772 [2:22:09<7:38:55, 6.23s/it] 23%|██▎ | 1351/5772 [2:22:17<7:38:55, 6.23s/it] {'loss': 0.4999, 'learning_rate': 1.7896663342181954e-05, 'epoch': 0.23} + 23%|██▎ | 1351/5772 [2:22:17<7:38:55, 6.23s/it] {'loss': 0.4999, 'learning_rate': 1.7896663342181954e-05, 'epoch': 0.23} + 23%|██▎ | 1351/5772 [2:22:09<7:38:55, 6.23s/it] 23%|██▎ | 1352/5772 [2:22:16<7:41:53, 6.27s/it] 23%|██▎ | 1352/5772 [2:22:23<7:41:54, 6.27s/it] {'loss': 0.4797, 'learning_rate': 1.7893218937240627e-05, 'epoch': 0.23} + 23%|██▎ | 1352/5772 [2:22:23<7:41:54, 6.27s/it] {'loss': 0.4797, 'learning_rate': 1.7893218937240627e-05, 'epoch': 0.23} + 23%|██▎ | 1352/5772 [2:22:16<7:41:53, 6.27s/it] 23%|██▎ | 1353/5772 [2:22:22<7:35:08, 6.18s/it] 23%|██▎ | 1353/5772 [2:22:29<7:35:09, 6.18s/it] {'loss': 0.5008, 'learning_rate': 1.788977204637388e-05, 'epoch': 0.23} + 23%|██▎ | 1353/5772 [2:22:29<7:35:09, 6.18s/it] {'loss': 0.5008, 'learning_rate': 1.788977204637388e-05, 'epoch': 0.23} + 23%|██▎ | 1353/5772 [2:22:22<7:35:08, 6.18s/it] 23%|██▎ | 1354/5772 [2:22:28<7:33:12, 6.15s/it] 23%|██▎ | 1354/5772 [2:22:35<7:33:11, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.78863226706673e-05, 'epoch': 0.23} + 23%|██▎ | 1354/5772 [2:22:35<7:33:11, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.78863226706673e-05, 'epoch': 0.23} + 23%|██▎ | 1354/5772 [2:22:28<7:33:12, 6.15s/it] 23%|██▎ | 1355/5772 [2:22:34<7:31:49, 6.14s/it] 23%|██▎ | 1355/5772 [2:22:41<7:31:48, 6.14s/it] {'loss': 0.4872, 'learning_rate': 1.788287081120724e-05, 'epoch': 0.23} + 23%|██▎ | 1355/5772 [2:22:41<7:31:48, 6.14s/it] {'loss': 0.4872, 'learning_rate': 1.788287081120724e-05, 'epoch': 0.23} + 23%|██▎ | 1355/5772 [2:22:34<7:31:49, 6.14s/it] 23%|██▎ | 1356/5772 [2:22:40<7:30:14, 6.12s/it] 23%|██▎ | 1356/5772 [2:22:47<7:30:14, 6.12s/it] {'loss': 0.4703, 'learning_rate': 1.7879416469080847e-05, 'epoch': 0.23} + 23%|██▎ | 1356/5772 [2:22:47<7:30:14, 6.12s/it] {'loss': 0.4703, 'learning_rate': 1.7879416469080847e-05, 'epoch': 0.23} + 23%|██▎ | 1356/5772 [2:22:40<7:30:14, 6.12s/it] 24%|██▎ | 1357/5772 [2:22:46<7:28:22, 6.09s/it] 24%|██▎ | 1357/5772 [2:22:53<7:28:22, 6.09s/it] {'loss': 0.4942, 'learning_rate': 1.7875959645376043e-05, 'epoch': 0.24} + 24%|██▎ | 1357/5772 [2:22:53<7:28:22, 6.09s/it] {'loss': 0.4942, 'learning_rate': 1.7875959645376043e-05, 'epoch': 0.24} + 24%|██▎ | 1357/5772 [2:22:46<7:28:22, 6.09s/it] 24%|██▎ | 1358/5772 [2:22:52<7:34:02, 6.17s/it] 24%|██▎ | 1358/5772 [2:22:59<7:34:01, 6.17s/it] {'loss': 0.4934, 'learning_rate': 1.7872500341181546e-05, 'epoch': 0.24} + 24%|██▎ | 1358/5772 [2:22:59<7:34:01, 6.17s/it] {'loss': 0.4934, 'learning_rate': 1.7872500341181546e-05, 'epoch': 0.24} + 24%|██▎ | 1358/5772 [2:22:52<7:34:02, 6.17s/it] 24%|██▎ | 1359/5772 [2:22:59<7:32:39, 6.15s/it] 24%|██▎ | 1359/5772 [2:23:06<7:32:39, 6.15s/it] {'loss': 0.4881, 'learning_rate': 1.7869038557586832e-05, 'epoch': 0.24} + 24%|██▎ | 1359/5772 [2:23:06<7:32:39, 6.15s/it] {'loss': 0.4881, 'learning_rate': 1.7869038557586832e-05, 'epoch': 0.24} + 24%|██▎ | 1359/5772 [2:22:59<7:32:39, 6.15s/it] 24%|██▎ | 1360/5772 [2:23:12<7:26:51, 6.08s/it] 24%|██▎ | 1360/5772 [2:23:04<7:26:51, 6.08s/it] {'loss': 0.4758, 'learning_rate': 1.786557429568218e-05, 'epoch': 0.24} + 24%|██▎ | 1360/5772 [2:23:12<7:26:51, 6.08s/it] {'loss': 0.4758, 'learning_rate': 1.786557429568218e-05, 'epoch': 0.24} + 24%|██▎ | 1360/5772 [2:23:04<7:26:51, 6.08s/it] 24%|██▎ | 1361/5772 [2:23:11<7:28:38, 6.10s/it] 24%|██▎ | 1361/5772 [2:23:18<7:28:38, 6.10s/it] {'loss': 0.4827, 'learning_rate': 1.7862107556558633e-05, 'epoch': 0.24} + 24%|██▎ | 1361/5772 [2:23:18<7:28:38, 6.10s/it] {'loss': 0.4827, 'learning_rate': 1.7862107556558633e-05, 'epoch': 0.24} + 24%|██▎ | 1361/5772 [2:23:11<7:28:38, 6.10s/it] 24%|██▎ | 1362/5772 [2:23:17<7:29:36, 6.12s/it] 24%|██▎ | 1362/5772 [2:23:24<7:29:36, 6.12s/it] {'loss': 0.4804, 'learning_rate': 1.7858638341308026e-05, 'epoch': 0.24} + 24%|██▎ | 1362/5772 [2:23:24<7:29:36, 6.12s/it] {'loss': 0.4804, 'learning_rate': 1.7858638341308026e-05, 'epoch': 0.24} + 24%|██▎ | 1362/5772 [2:23:17<7:29:36, 6.12s/it] 24%|██▎ | 1363/5772 [2:23:23<7:34:18, 6.18s/it] 24%|██▎ | 1363/5772 [2:23:30<7:34:18, 6.18s/it] {'loss': 0.4954, 'learning_rate': 1.785516665102297e-05, 'epoch': 0.24} + 24%|██▎ | 1363/5772 [2:23:30<7:34:18, 6.18s/it] {'loss': 0.4954, 'learning_rate': 1.785516665102297e-05, 'epoch': 0.24} + 24%|██▎ | 1363/5772 [2:23:23<7:34:18, 6.18s/it] 24%|██▎ | 1364/5772 [2:23:29<7:35:03, 6.19s/it] 24%|██▎ | 1364/5772 [2:23:36<7:35:03, 6.19s/it] {'loss': 0.4928, 'learning_rate': 1.7851692486796847e-05, 'epoch': 0.24} + 24%|██▎ | 1364/5772 [2:23:36<7:35:03, 6.19s/it] {'loss': 0.4928, 'learning_rate': 1.7851692486796847e-05, 'epoch': 0.24} + 24%|██▎ | 1364/5772 [2:23:29<7:35:03, 6.19s/it] 24%|██▎ | 1365/5772 [2:23:35<7:28:28, 6.11s/it] 24%|██▎ | 1365/5772 [2:23:42<7:28:28, 6.11s/it] {'loss': 0.4783, 'learning_rate': 1.7848215849723836e-05, 'epoch': 0.24} + 24%|██▎ | 1365/5772 [2:23:42<7:28:28, 6.11s/it] {'loss': 0.4783, 'learning_rate': 1.7848215849723836e-05, 'epoch': 0.24} + 24%|██▎ | 1365/5772 [2:23:35<7:28:28, 6.11s/it] 24%|██▎ | 1366/5772 [2:23:41<7:27:49, 6.10s/it] 24%|██▎ | 1366/5772 [2:23:48<7:27:49, 6.10s/it] {'loss': 0.4845, 'learning_rate': 1.7844736740898876e-05, 'epoch': 0.24} + 24%|██▎ | 1366/5772 [2:23:48<7:27:49, 6.10s/it] {'loss': 0.4845, 'learning_rate': 1.7844736740898876e-05, 'epoch': 0.24} + 24%|██▎ | 1366/5772 [2:23:41<7:27:49, 6.10s/it] 24%|██▎ | 1367/5772 [2:23:47<7:24:40, 6.06s/it] 24%|██▎ | 1367/5772 [2:23:54<7:24:40, 6.06s/it] {'loss': 0.4874, 'learning_rate': 1.7841255161417698e-05, 'epoch': 0.24} + 24%|██▎ | 1367/5772 [2:23:54<7:24:40, 6.06s/it] {'loss': 0.4874, 'learning_rate': 1.7841255161417698e-05, 'epoch': 0.24} + 24%|██▎ | 1367/5772 [2:23:47<7:24:40, 6.06s/it] 24%|██▎ | 1368/5772 [2:23:53<7:22:06, 6.02s/it] 24%|██▎ | 1368/5772 [2:24:00<7:22:07, 6.02s/it] {'loss': 0.4759, 'learning_rate': 1.7837771112376804e-05, 'epoch': 0.24} + 24%|██▎ | 1368/5772 [2:24:00<7:22:07, 6.02s/it] {'loss': 0.4759, 'learning_rate': 1.7837771112376804e-05, 'epoch': 0.24} + 24%|██▎ | 1368/5772 [2:23:53<7:22:06, 6.02s/it] 24%|██▎ | 1369/5772 [2:23:59<7:23:39, 6.05s/it] 24%|██▎ | 1369/5772 [2:24:06<7:23:39, 6.05s/it] {'loss': 0.4842, 'learning_rate': 1.7834284594873478e-05, 'epoch': 0.24} + 24%|██▎ | 1369/5772 [2:24:06<7:23:39, 6.05s/it] {'loss': 0.4842, 'learning_rate': 1.7834284594873478e-05, 'epoch': 0.24} + 24%|██▎ | 1369/5772 [2:23:59<7:23:39, 6.05s/it] 24%|██▎ | 1370/5772 [2:24:05<7:23:06, 6.04s/it] 24%|██▎ | 1370/5772 [2:24:12<7:23:06, 6.04s/it] {'loss': 0.4797, 'learning_rate': 1.7830795610005775e-05, 'epoch': 0.24} + 24%|██▎ | 1370/5772 [2:24:12<7:23:06, 6.04s/it] {'loss': 0.4797, 'learning_rate': 1.7830795610005775e-05, 'epoch': 0.24} + 24%|██▎ | 1370/5772 [2:24:05<7:23:06, 6.04s/it] 24%|██▍ | 1371/5772 [2:24:12<7:26:00, 6.08s/it] 24%|██▍ | 1371/5772 [2:24:19<7:26:00, 6.08s/it] {'loss': 0.5033, 'learning_rate': 1.7827304158872538e-05, 'epoch': 0.24} + 24%|██▍ | 1371/5772 [2:24:19<7:26:00, 6.08s/it] {'loss': 0.5033, 'learning_rate': 1.7827304158872538e-05, 'epoch': 0.24} + 24%|██▍ | 1371/5772 [2:24:12<7:26:00, 6.08s/it] 24%|██▍ | 1372/5772 [2:24:25<7:23:14, 6.04s/it] 24%|██▍ | 1372/5772 [2:24:17<7:23:14, 6.04s/it] {'loss': 0.4867, 'learning_rate': 1.782381024257337e-05, 'epoch': 0.24} + 24%|██▍ | 1372/5772 [2:24:25<7:23:14, 6.04s/it] {'loss': 0.4867, 'learning_rate': 1.782381024257337e-05, 'epoch': 0.24} + 24%|██▍ | 1372/5772 [2:24:17<7:23:14, 6.04s/it] 24%|██▍ | 1373/5772 [2:24:24<7:28:37, 6.12s/it] 24%|██▍ | 1373/5772 [2:24:31<7:28:37, 6.12s/it] {'loss': 0.486, 'learning_rate': 1.782031386220867e-05, 'epoch': 0.24} + 24%|██▍ | 1373/5772 [2:24:31<7:28:37, 6.12s/it] {'loss': 0.486, 'learning_rate': 1.782031386220867e-05, 'epoch': 0.24} + 24%|██▍ | 1373/5772 [2:24:24<7:28:37, 6.12s/it] 24%|██▍ | 1374/5772 [2:24:30<7:29:51, 6.14s/it] 24%|██▍ | 1374/5772 [2:24:37<7:29:51, 6.14s/it] {'loss': 0.4841, 'learning_rate': 1.78168150188796e-05, 'epoch': 0.24} + 24%|██▍ | 1374/5772 [2:24:37<7:29:51, 6.14s/it] {'loss': 0.4841, 'learning_rate': 1.78168150188796e-05, 'epoch': 0.24} + 24%|██▍ | 1374/5772 [2:24:30<7:29:51, 6.14s/it] 24%|██▍ | 1375/5772 [2:24:36<7:29:04, 6.13s/it] 24%|██▍ | 1375/5772 [2:24:43<7:29:04, 6.13s/it] {'loss': 0.4853, 'learning_rate': 1.78133137136881e-05, 'epoch': 0.24} + 24%|██▍ | 1375/5772 [2:24:43<7:29:04, 6.13s/it] {'loss': 0.4853, 'learning_rate': 1.78133137136881e-05, 'epoch': 0.24} + 24%|██▍ | 1375/5772 [2:24:36<7:29:04, 6.13s/it] 24%|██▍ | 1376/5772 [2:24:42<7:33:01, 6.18s/it] 24%|██▍ | 1376/5772 [2:24:49<7:33:01, 6.18s/it] {'loss': 0.4927, 'learning_rate': 1.7809809947736892e-05, 'epoch': 0.24} + 24%|██▍ | 1376/5772 [2:24:49<7:33:01, 6.18s/it] {'loss': 0.4927, 'learning_rate': 1.7809809947736892e-05, 'epoch': 0.24} + 24%|██▍ | 1376/5772 [2:24:42<7:33:01, 6.18s/it] 24%|██▍ | 1377/5772 [2:24:49<7:40:47, 6.29s/it] 24%|██▍ | 1377/5772 [2:24:56<7:40:47, 6.29s/it] {'loss': 0.4803, 'learning_rate': 1.780630372212946e-05, 'epoch': 0.24} + 24%|██▍ | 1377/5772 [2:24:56<7:40:47, 6.29s/it] {'loss': 0.4803, 'learning_rate': 1.780630372212946e-05, 'epoch': 0.24} + 24%|██▍ | 1377/5772 [2:24:49<7:40:47, 6.29s/it] 24%|██▍ | 1378/5772 [2:24:55<7:35:06, 6.21s/it] 24%|██▍ | 1378/5772 [2:25:02<7:35:06, 6.21s/it] {'loss': 0.4791, 'learning_rate': 1.7802795037970076e-05, 'epoch': 0.24} + 24%|██▍ | 1378/5772 [2:25:02<7:35:06, 6.21s/it] {'loss': 0.4791, 'learning_rate': 1.7802795037970076e-05, 'epoch': 0.24} + 24%|██▍ | 1378/5772 [2:24:55<7:35:06, 6.21s/it] 24%|██▍ | 1379/5772 [2:25:01<7:30:26, 6.15s/it] 24%|██▍ | 1379/5772 [2:25:08<7:30:26, 6.15s/it] {'loss': 0.4781, 'learning_rate': 1.7799283896363778e-05, 'epoch': 0.24} + 24%|██▍ | 1379/5772 [2:25:08<7:30:26, 6.15s/it] {'loss': 0.4781, 'learning_rate': 1.7799283896363778e-05, 'epoch': 0.24} + 24%|██▍ | 1379/5772 [2:25:01<7:30:26, 6.15s/it] 24%|██▍ | 1380/5772 [2:25:07<7:26:07, 6.09s/it] 24%|██▍ | 1380/5772 [2:25:14<7:26:07, 6.09s/it] {'loss': 0.4866, 'learning_rate': 1.779577029841638e-05, 'epoch': 0.24} + 24%|██▍ | 1380/5772 [2:25:14<7:26:07, 6.09s/it] {'loss': 0.4866, 'learning_rate': 1.779577029841638e-05, 'epoch': 0.24} + 24%|██▍ | 1380/5772 [2:25:07<7:26:07, 6.09s/it] 24%|██▍ | 1381/5772 [2:25:13<7:29:42, 6.14s/it] 24%|██▍ | 1381/5772 [2:25:20<7:29:42, 6.14s/it] {'loss': 0.4822, 'learning_rate': 1.779225424523447e-05, 'epoch': 0.24} + 24%|██▍ | 1381/5772 [2:25:20<7:29:42, 6.14s/it] {'loss': 0.4822, 'learning_rate': 1.779225424523447e-05, 'epoch': 0.24} + 24%|██▍ | 1381/5772 [2:25:13<7:29:42, 6.14s/it] 24%|██▍ | 1382/5772 [2:25:20<7:37:47, 6.26s/it] 24%|██▍ | 1382/5772 [2:25:27<7:37:47, 6.26s/it] {'loss': 0.4758, 'learning_rate': 1.7788735737925414e-05, 'epoch': 0.24} + 24%|██▍ | 1382/5772 [2:25:27<7:37:47, 6.26s/it] {'loss': 0.4758, 'learning_rate': 1.7788735737925414e-05, 'epoch': 0.24} + 24%|██▍ | 1382/5772 [2:25:20<7:37:47, 6.26s/it] 24%|██▍ | 1383/5772 [2:25:26<7:39:32, 6.28s/it] 24%|██▍ | 1383/5772 [2:25:33<7:39:32, 6.28s/it] {'loss': 0.4866, 'learning_rate': 1.7785214777597342e-05, 'epoch': 0.24} + 24%|██▍ | 1383/5772 [2:25:33<7:39:32, 6.28s/it] {'loss': 0.4866, 'learning_rate': 1.7785214777597342e-05, 'epoch': 0.24} + 24%|██▍ | 1383/5772 [2:25:26<7:39:32, 6.28s/it] 24%|██▍ | 1384/5772 [2:25:32<7:40:25, 6.30s/it] 24%|██▍ | 1384/5772 [2:25:39<7:40:24, 6.30s/it] {'loss': 0.4759, 'learning_rate': 1.778169136535916e-05, 'epoch': 0.24} + 24%|██▍ | 1384/5772 [2:25:39<7:40:24, 6.30s/it] {'loss': 0.4759, 'learning_rate': 1.778169136535916e-05, 'epoch': 0.24} + 24%|██▍ | 1384/5772 [2:25:32<7:40:25, 6.30s/it] 24%|██▍ | 1385/5772 [2:25:39<7:48:22, 6.41s/it] 24%|██▍ | 1385/5772 [2:25:46<7:48:22, 6.41s/it] {'loss': 0.4994, 'learning_rate': 1.777816550232055e-05, 'epoch': 0.24} + 24%|██▍ | 1385/5772 [2:25:46<7:48:22, 6.41s/it] {'loss': 0.4994, 'learning_rate': 1.777816550232055e-05, 'epoch': 0.24} + 24%|██▍ | 1385/5772 [2:25:39<7:48:22, 6.41s/it] 24%|██▍ | 1386/5772 [2:25:45<7:36:26, 6.24s/it] 24%|██▍ | 1386/5772 [2:25:52<7:36:26, 6.24s/it] {'loss': 0.4736, 'learning_rate': 1.7774637189591963e-05, 'epoch': 0.24} + {'loss': 0.4736, 'learning_rate': 1.7774637189591963e-05, 'epoch': 0.24} 24%|██▍ | 1386/5772 [2:25:52<7:36:26, 6.24s/it] + 24%|██▍ | 1386/5772 [2:25:45<7:36:26, 6.24s/it] 24%|██▍ | 1387/5772 [2:25:51<7:42:06, 6.32s/it] 24%|██▍ | 1387/5772 [2:25:58<7:42:06, 6.32s/it] {'loss': 0.4879, 'learning_rate': 1.777110642828462e-05, 'epoch': 0.24} + 24%|██▍ | 1387/5772 [2:25:58<7:42:06, 6.32s/it] {'loss': 0.4879, 'learning_rate': 1.777110642828462e-05, 'epoch': 0.24} + 24%|██▍ | 1387/5772 [2:25:51<7:42:06, 6.32s/it] 24%|██▍ | 1388/5772 [2:25:58<7:39:53, 6.29s/it] 24%|██▍ | 1388/5772 [2:26:05<7:39:53, 6.29s/it] {'loss': 0.471, 'learning_rate': 1.776757321951051e-05, 'epoch': 0.24} + 24%|██▍ | 1388/5772 [2:26:05<7:39:53, 6.29s/it] {'loss': 0.471, 'learning_rate': 1.776757321951051e-05, 'epoch': 0.24} + 24%|██▍ | 1388/5772 [2:25:58<7:39:53, 6.29s/it] 24%|██▍ | 1389/5772 [2:26:11<7:30:21, 6.17s/it] 24%|██▍ | 1389/5772 [2:26:03<7:30:22, 6.17s/it] {'loss': 0.4798, 'learning_rate': 1.776403756438241e-05, 'epoch': 0.24} + 24%|██▍ | 1389/5772 [2:26:11<7:30:21, 6.17s/it] {'loss': 0.4798, 'learning_rate': 1.776403756438241e-05, 'epoch': 0.24} + 24%|██▍ | 1389/5772 [2:26:03<7:30:22, 6.17s/it] 24%|██▍ | 1390/5772 [2:26:09<7:24:23, 6.08s/it] 24%|██▍ | 1390/5772 [2:26:16<7:24:24, 6.08s/it] {'loss': 0.4738, 'learning_rate': 1.776049946401384e-05, 'epoch': 0.24} + 24%|██▍ | 1390/5772 [2:26:16<7:24:24, 6.08s/it] {'loss': 0.4738, 'learning_rate': 1.776049946401384e-05, 'epoch': 0.24} + 24%|██▍ | 1390/5772 [2:26:09<7:24:23, 6.08s/it] 24%|██▍ | 1391/5772 [2:26:15<7:19:58, 6.03s/it] 24%|██▍ | 1391/5772 [2:26:22<7:19:58, 6.03s/it] {'loss': 0.4954, 'learning_rate': 1.7756958919519118e-05, 'epoch': 0.24} + 24%|██▍ | 1391/5772 [2:26:22<7:19:58, 6.03s/it] {'loss': 0.4954, 'learning_rate': 1.7756958919519118e-05, 'epoch': 0.24} + 24%|██▍ | 1391/5772 [2:26:15<7:19:58, 6.03s/it] 24%|██▍ | 1392/5772 [2:26:21<7:23:57, 6.08s/it] 24%|██▍ | 1392/5772 [2:26:29<7:23:58, 6.08s/it] {'loss': 0.4746, 'learning_rate': 1.7753415932013313e-05, 'epoch': 0.24} + 24%|██▍ | 1392/5772 [2:26:29<7:23:58, 6.08s/it] {'loss': 0.4746, 'learning_rate': 1.7753415932013313e-05, 'epoch': 0.24} + 24%|██▍ | 1392/5772 [2:26:21<7:23:57, 6.08s/it] 24%|██▍ | 1393/5772 [2:26:28<7:30:19, 6.17s/it] 24%|██▍ | 1393/5772 [2:26:35<7:30:19, 6.17s/it] {'loss': 0.4881, 'learning_rate': 1.7749870502612267e-05, 'epoch': 0.24} + 24%|██▍ | 1393/5772 [2:26:35<7:30:19, 6.17s/it] {'loss': 0.4881, 'learning_rate': 1.7749870502612267e-05, 'epoch': 0.24} + 24%|██▍ | 1393/5772 [2:26:28<7:30:19, 6.17s/it] 24%|██▍ | 1394/5772 [2:26:41<7:28:24, 6.15s/it] 24%|██▍ | 1394/5772 [2:26:34<7:28:25, 6.15s/it] {'loss': 0.48, 'learning_rate': 1.7746322632432593e-05, 'epoch': 0.24} + 24%|██▍ | 1394/5772 [2:26:41<7:28:24, 6.15s/it] {'loss': 0.48, 'learning_rate': 1.7746322632432593e-05, 'epoch': 0.24} + 24%|██▍ | 1394/5772 [2:26:34<7:28:25, 6.15s/it] 24%|██▍ | 1395/5772 [2:26:40<7:30:13, 6.17s/it] 24%|██▍ | 1395/5772 [2:26:47<7:30:13, 6.17s/it] {'loss': 0.4918, 'learning_rate': 1.774277232259168e-05, 'epoch': 0.24} + 24%|██▍ | 1395/5772 [2:26:47<7:30:13, 6.17s/it] {'loss': 0.4918, 'learning_rate': 1.774277232259168e-05, 'epoch': 0.24} + 24%|██▍ | 1395/5772 [2:26:40<7:30:13, 6.17s/it] 24%|██▍ | 1396/5772 [2:26:46<7:27:19, 6.13s/it] 24%|██▍ | 1396/5772 [2:26:53<7:27:19, 6.13s/it] {'loss': 0.4842, 'learning_rate': 1.7739219574207673e-05, 'epoch': 0.24} + 24%|██▍ | 1396/5772 [2:26:53<7:27:19, 6.13s/it] {'loss': 0.4842, 'learning_rate': 1.7739219574207673e-05, 'epoch': 0.24} + 24%|██▍ | 1396/5772 [2:26:46<7:27:19, 6.13s/it] 24%|██▍ | 1397/5772 [2:26:59<7:25:15, 6.11s/it] 24%|██▍ | 1397/5772 [2:26:52<7:25:15, 6.11s/it] {'loss': 0.4949, 'learning_rate': 1.7735664388399492e-05, 'epoch': 0.24} + 24%|██▍ | 1397/5772 [2:26:59<7:25:15, 6.11s/it] {'loss': 0.4949, 'learning_rate': 1.7735664388399492e-05, 'epoch': 0.24} + 24%|██▍ | 1397/5772 [2:26:52<7:25:15, 6.11s/it] 24%|██▍ | 1398/5772 [2:26:58<7:25:09, 6.11s/it] 24%|██▍ | 1398/5772 [2:27:05<7:25:10, 6.11s/it] {'loss': 0.488, 'learning_rate': 1.773210676628682e-05, 'epoch': 0.24} + 24%|██▍ | 1398/5772 [2:27:05<7:25:10, 6.11s/it] {'loss': 0.488, 'learning_rate': 1.773210676628682e-05, 'epoch': 0.24} + 24%|██▍ | 1398/5772 [2:26:58<7:25:09, 6.11s/it] 24%|██▍ | 1399/5772 [2:27:04<7:22:26, 6.07s/it] 24%|██▍ | 1399/5772 [2:27:11<7:22:27, 6.07s/it] {'loss': 0.4911, 'learning_rate': 1.772854670899011e-05, 'epoch': 0.24} + 24%|██▍ | 1399/5772 [2:27:11<7:22:27, 6.07s/it] {'loss': 0.4911, 'learning_rate': 1.772854670899011e-05, 'epoch': 0.24} + 24%|██▍ | 1399/5772 [2:27:04<7:22:26, 6.07s/it]11 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +09 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 5 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 24%|██▍ | 1400/5772 [2:27:11<7:31:44, 6.20s/it]158 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 24%|██▍ | 1400/5772 [2:27:18<7:31:44, 6.20s/it]43 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4925, 'learning_rate': 1.7724984217630594e-05, 'epoch': 0.24} + 24%|██▍ | 1400/5772 [2:27:18<7:31:44, 6.20s/it] {'loss': 0.4925, 'learning_rate': 1.7724984217630594e-05, 'epoch': 0.24} + 24%|██▍ | 1400/5772 [2:27:11<7:31:44, 6.20s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 24%|██▍ | 1401/5772 [2:27:30<12:07:14, 9.98s/it] 24%|██▍ | 1401/5772 [2:27:37<12:07:14, 9.98s/it] {'loss': 0.4714, 'learning_rate': 1.7721419293330245e-05, 'epoch': 0.24} + 24%|██▍ | 1401/5772 [2:27:37<12:07:14, 9.98s/it] {'loss': 0.4714, 'learning_rate': 1.7721419293330245e-05, 'epoch': 0.24} + 24%|██▍ | 1401/5772 [2:27:30<12:07:14, 9.98s/it] 24%|██▍ | 1402/5772 [2:27:36<10:48:09, 8.90s/it] 24%|██▍ | 1402/5772 [2:27:43<10:48:09, 8.90s/it] {'loss': 0.4858, 'learning_rate': 1.771785193721182e-05, 'epoch': 0.24} + 24%|██▍ | 1402/5772 [2:27:43<10:48:09, 8.90s/it] {'loss': 0.4858, 'learning_rate': 1.771785193721182e-05, 'epoch': 0.24} + 24%|██▍ | 1402/5772 [2:27:36<10:48:09, 8.90s/it] 24%|██▍ | 1403/5772 [2:27:42<9:47:14, 8.06s/it] 24%|██▍ | 1403/5772 [2:27:49<9:47:14, 8.06s/it] {'loss': 0.4903, 'learning_rate': 1.771428215039884e-05, 'epoch': 0.24} + 24%|██▍ | 1403/5772 [2:27:49<9:47:14, 8.06s/it] {'loss': 0.4903, 'learning_rate': 1.771428215039884e-05, 'epoch': 0.24} + 24%|██▍ | 1403/5772 [2:27:42<9:47:14, 8.06s/it] 24%|██▍ | 1404/5772 [2:27:55<9:07:06, 7.52s/it] 24%|██▍ | 1404/5772 [2:27:48<9:07:07, 7.52s/it] {'loss': 0.482, 'learning_rate': 1.7710709934015585e-05, 'epoch': 0.24} + 24%|██▍ | 1404/5772 [2:27:55<9:07:06, 7.52s/it] {'loss': 0.482, 'learning_rate': 1.7710709934015585e-05, 'epoch': 0.24} + 24%|██▍ | 1404/5772 [2:27:48<9:07:07, 7.52s/it] 24%|██▍ | 1405/5772 [2:27:54<8:34:09, 7.06s/it] 24%|██▍ | 1405/5772 [2:28:01<8:34:09, 7.06s/it] {'loss': 0.4867, 'learning_rate': 1.7707135289187115e-05, 'epoch': 0.24} + 24%|██▍ | 1405/5772 [2:28:01<8:34:09, 7.06s/it] {'loss': 0.4867, 'learning_rate': 1.7707135289187115e-05, 'epoch': 0.24} + 24%|██▍ | 1405/5772 [2:27:54<8:34:09, 7.06s/it] 24%|██▍ | 1406/5772 [2:28:01<8:15:34, 6.81s/it] 24%|██▍ | 1406/5772 [2:28:08<8:15:34, 6.81s/it] {'loss': 0.4799, 'learning_rate': 1.7703558217039233e-05, 'epoch': 0.24} + 24%|██▍ | 1406/5772 [2:28:08<8:15:34, 6.81s/it] {'loss': 0.4799, 'learning_rate': 1.7703558217039233e-05, 'epoch': 0.24} + 24%|██▍ | 1406/5772 [2:28:01<8:15:34, 6.81s/it] 24%|██▍ | 1407/5772 [2:28:07<7:58:57, 6.58s/it] 24%|██▍ | 1407/5772 [2:28:14<7:58:58, 6.58s/it] {'loss': 0.4765, 'learning_rate': 1.769997871869852e-05, 'epoch': 0.24} + 24%|██▍ | 1407/5772 [2:28:14<7:58:58, 6.58s/it] {'loss': 0.4765, 'learning_rate': 1.769997871869852e-05, 'epoch': 0.24} + 24%|██▍ | 1407/5772 [2:28:07<7:58:57, 6.58s/it] 24%|██▍ | 1408/5772 [2:28:13<7:50:17, 6.47s/it] 24%|██▍ | 1408/5772 [2:28:20<7:50:17, 6.47s/it] {'loss': 0.4905, 'learning_rate': 1.7696396795292324e-05, 'epoch': 0.24} + 24%|██▍ | 1408/5772 [2:28:20<7:50:17, 6.47s/it] {'loss': 0.4905, 'learning_rate': 1.7696396795292324e-05, 'epoch': 0.24} + 24%|██▍ | 1408/5772 [2:28:13<7:50:17, 6.47s/it] 24%|██▍ | 1409/5772 [2:28:19<7:37:28, 6.29s/it] 24%|██▍ | 1409/5772 [2:28:26<7:37:28, 6.29s/it] {'loss': 0.4839, 'learning_rate': 1.769281244794875e-05, 'epoch': 0.24} + 24%|██▍ | 1409/5772 [2:28:26<7:37:28, 6.29s/it] {'loss': 0.4839, 'learning_rate': 1.769281244794875e-05, 'epoch': 0.24} + 24%|██▍ | 1409/5772 [2:28:19<7:37:28, 6.29s/it] 24%|██▍ | 1410/5772 [2:28:25<7:31:32, 6.21s/it] 24%|██▍ | 1410/5772 [2:28:32<7:31:32, 6.21s/it] {'loss': 0.4807, 'learning_rate': 1.7689225677796667e-05, 'epoch': 0.24} + 24%|██▍ | 1410/5772 [2:28:32<7:31:32, 6.21s/it] {'loss': 0.4807, 'learning_rate': 1.7689225677796667e-05, 'epoch': 0.24} + 24%|██▍ | 1410/5772 [2:28:25<7:31:32, 6.21s/it] 24%|██▍ | 1411/5772 [2:28:31<7:35:37, 6.27s/it] 24%|██▍ | 1411/5772 [2:28:38<7:35:37, 6.27s/it] {'loss': 0.4713, 'learning_rate': 1.7685636485965713e-05, 'epoch': 0.24} + 24%|██▍ | 1411/5772 [2:28:38<7:35:37, 6.27s/it] {'loss': 0.4713, 'learning_rate': 1.7685636485965713e-05, 'epoch': 0.24} + 24%|██▍ | 1411/5772 [2:28:31<7:35:37, 6.27s/it] 24%|██▍ | 1412/5772 [2:28:38<7:38:17, 6.31s/it] 24%|██▍ | 1412/5772 [2:28:45<7:38:17, 6.31s/it] {'loss': 0.4737, 'learning_rate': 1.7682044873586273e-05, 'epoch': 0.24} + 24%|██▍ | 1412/5772 [2:28:45<7:38:17, 6.31s/it] {'loss': 0.4737, 'learning_rate': 1.7682044873586273e-05, 'epoch': 0.24} + 24%|██▍ | 1412/5772 [2:28:38<7:38:17, 6.31s/it] 24%|██▍ | 1413/5772 [2:28:44<7:37:18, 6.29s/it] 24%|██▍ | 1413/5772 [2:28:51<7:37:18, 6.29s/it] {'loss': 0.4845, 'learning_rate': 1.7678450841789515e-05, 'epoch': 0.24} + 24%|██▍ | 1413/5772 [2:28:51<7:37:18, 6.29s/it] {'loss': 0.4845, 'learning_rate': 1.7678450841789515e-05, 'epoch': 0.24} + 24%|██▍ | 1413/5772 [2:28:44<7:37:18, 6.29s/it] 24%|██▍ | 1414/5772 [2:28:50<7:38:13, 6.31s/it] 24%|██▍ | 1414/5772 [2:28:57<7:38:14, 6.31s/it] {'loss': 0.48, 'learning_rate': 1.7674854391707357e-05, 'epoch': 0.24} + 24%|██▍ | 1414/5772 [2:28:57<7:38:14, 6.31s/it] {'loss': 0.48, 'learning_rate': 1.7674854391707357e-05, 'epoch': 0.24} + 24%|██▍ | 1414/5772 [2:28:50<7:38:13, 6.31s/it] 25%|██▍ | 1415/5772 [2:28:56<7:35:49, 6.28s/it] 25%|██▍ | 1415/5772 [2:29:03<7:35:49, 6.28s/it] {'loss': 0.4823, 'learning_rate': 1.7671255524472482e-05, 'epoch': 0.25} + 25%|██▍ | 1415/5772 [2:29:03<7:35:49, 6.28s/it] {'loss': 0.4823, 'learning_rate': 1.7671255524472482e-05, 'epoch': 0.25} + 25%|██▍ | 1415/5772 [2:28:56<7:35:49, 6.28s/it] 25%|██▍ | 1416/5772 [2:29:02<7:29:59, 6.20s/it] 25%|██▍ | 1416/5772 [2:29:09<7:29:59, 6.20s/it] {'loss': 0.4718, 'learning_rate': 1.7667654241218332e-05, 'epoch': 0.25} + 25%|██▍ | 1416/5772 [2:29:09<7:29:59, 6.20s/it] {'loss': 0.4718, 'learning_rate': 1.7667654241218332e-05, 'epoch': 0.25} + 25%|██▍ | 1416/5772 [2:29:02<7:29:59, 6.20s/it] 25%|██▍ | 1417/5772 [2:29:08<7:26:51, 6.16s/it] 25%|██▍ | 1417/5772 [2:29:16<7:26:52, 6.16s/it] {'loss': 0.4727, 'learning_rate': 1.766405054307911e-05, 'epoch': 0.25} + 25%|██▍ | 1417/5772 [2:29:16<7:26:52, 6.16s/it] {'loss': 0.4727, 'learning_rate': 1.766405054307911e-05, 'epoch': 0.25} + 25%|██▍ | 1417/5772 [2:29:08<7:26:51, 6.16s/it] 25%|██▍ | 1418/5772 [2:29:15<7:29:45, 6.20s/it] 25%|██▍ | 1418/5772 [2:29:22<7:29:45, 6.20s/it] {'loss': 0.4721, 'learning_rate': 1.766044443118978e-05, 'epoch': 0.25} + 25%|██▍ | 1418/5772 [2:29:22<7:29:45, 6.20s/it] {'loss': 0.4721, 'learning_rate': 1.766044443118978e-05, 'epoch': 0.25} + 25%|██▍ | 1418/5772 [2:29:15<7:29:45, 6.20s/it] 25%|██▍ | 1419/5772 [2:29:21<7:26:28, 6.15s/it] 25%|██▍ | 1419/5772 [2:29:28<7:26:28, 6.15s/it] {'loss': 0.4903, 'learning_rate': 1.7656835906686072e-05, 'epoch': 0.25} + 25%|██▍ | 1419/5772 [2:29:28<7:26:28, 6.15s/it] {'loss': 0.4903, 'learning_rate': 1.7656835906686072e-05, 'epoch': 0.25} + 25%|██▍ | 1419/5772 [2:29:21<7:26:28, 6.15s/it] 25%|██▍ | 1420/5772 [2:29:27<7:26:00, 6.15s/it] 25%|██▍ | 1420/5772 [2:29:34<7:25:59, 6.15s/it] {'loss': 0.4977, 'learning_rate': 1.7653224970704465e-05, 'epoch': 0.25} + 25%|██▍ | 1420/5772 [2:29:34<7:25:59, 6.15s/it] {'loss': 0.4977, 'learning_rate': 1.7653224970704465e-05, 'epoch': 0.25} + 25%|██▍ | 1420/5772 [2:29:27<7:26:00, 6.15s/it] 25%|██▍ | 1421/5772 [2:29:33<7:23:28, 6.12s/it] 25%|██▍ | 1421/5772 [2:29:40<7:23:28, 6.12s/it] {'loss': 0.4877, 'learning_rate': 1.7649611624382207e-05, 'epoch': 0.25} + 25%|██▍ | 1421/5772 [2:29:40<7:23:28, 6.12s/it] {'loss': 0.4877, 'learning_rate': 1.7649611624382207e-05, 'epoch': 0.25} + 25%|██▍ | 1421/5772 [2:29:33<7:23:28, 6.12s/it] 25%|██▍ | 1422/5772 [2:29:39<7:23:58, 6.12s/it] 25%|██▍ | 1422/5772 [2:29:46<7:23:58, 6.12s/it] {'loss': 0.4823, 'learning_rate': 1.76459958688573e-05, 'epoch': 0.25} + 25%|██▍ | 1422/5772 [2:29:46<7:23:58, 6.12s/it] {'loss': 0.4823, 'learning_rate': 1.76459958688573e-05, 'epoch': 0.25} + 25%|██▍ | 1422/5772 [2:29:39<7:23:58, 6.12s/it] 25%|██▍ | 1423/5772 [2:29:52<7:21:17, 6.09s/it] 25%|██▍ | 1423/5772 [2:29:45<7:21:18, 6.09s/it] {'loss': 0.4861, 'learning_rate': 1.7642377705268505e-05, 'epoch': 0.25} + 25%|██▍ | 1423/5772 [2:29:52<7:21:17, 6.09s/it] {'loss': 0.4861, 'learning_rate': 1.7642377705268505e-05, 'epoch': 0.25} + 25%|██▍ | 1423/5772 [2:29:45<7:21:18, 6.09s/it] 25%|██▍ | 1424/5772 [2:29:59<7:31:38, 6.23s/it] 25%|██▍ | 1424/5772 [2:29:52<7:31:39, 6.23s/it] {'loss': 0.4747, 'learning_rate': 1.7638757134755346e-05, 'epoch': 0.25} + 25%|██▍ | 1424/5772 [2:29:59<7:31:38, 6.23s/it] {'loss': 0.4747, 'learning_rate': 1.7638757134755346e-05, 'epoch': 0.25} + 25%|██▍ | 1424/5772 [2:29:52<7:31:39, 6.23s/it] 25%|██▍ | 1425/5772 [2:29:58<7:40:22, 6.35s/it] 25%|██▍ | 1425/5772 [2:30:05<7:40:22, 6.35s/it] {'loss': 0.4974, 'learning_rate': 1.7635134158458095e-05, 'epoch': 0.25} + 25%|██▍ | 1425/5772 [2:30:05<7:40:22, 6.35s/it] {'loss': 0.4974, 'learning_rate': 1.7635134158458095e-05, 'epoch': 0.25} + 25%|██▍ | 1425/5772 [2:29:58<7:40:22, 6.35s/it] 25%|██▍ | 1426/5772 [2:30:04<7:35:55, 6.29s/it] 25%|██▍ | 1426/5772 [2:30:12<7:35:54, 6.29s/it] {'loss': 0.4708, 'learning_rate': 1.7631508777517794e-05, 'epoch': 0.25} + 25%|██▍ | 1426/5772 [2:30:12<7:35:54, 6.29s/it] {'loss': 0.4708, 'learning_rate': 1.7631508777517794e-05, 'epoch': 0.25} + 25%|██▍ | 1426/5772 [2:30:04<7:35:55, 6.29s/it] 25%|██▍ | 1427/5772 [2:30:11<7:41:27, 6.37s/it] 25%|██▍ | 1427/5772 [2:30:18<7:41:28, 6.37s/it] {'loss': 0.4903, 'learning_rate': 1.7627880993076237e-05, 'epoch': 0.25} + 25%|██▍ | 1427/5772 [2:30:18<7:41:28, 6.37s/it] {'loss': 0.4903, 'learning_rate': 1.7627880993076237e-05, 'epoch': 0.25} + 25%|██▍ | 1427/5772 [2:30:11<7:41:27, 6.37s/it] 25%|██▍ | 1428/5772 [2:30:17<7:39:23, 6.35s/it] 25%|██▍ | 1428/5772 [2:30:24<7:39:23, 6.35s/it] {'loss': 0.461, 'learning_rate': 1.762425080627597e-05, 'epoch': 0.25} + 25%|██▍ | 1428/5772 [2:30:24<7:39:23, 6.35s/it] {'loss': 0.461, 'learning_rate': 1.762425080627597e-05, 'epoch': 0.25} + 25%|██▍ | 1428/5772 [2:30:17<7:39:23, 6.35s/it] 25%|██▍ | 1429/5772 [2:30:23<7:30:54, 6.23s/it] 25%|██▍ | 1429/5772 [2:30:30<7:30:54, 6.23s/it] {'loss': 0.487, 'learning_rate': 1.76206182182603e-05, 'epoch': 0.25} + 25%|██▍ | 1429/5772 [2:30:30<7:30:54, 6.23s/it] {'loss': 0.487, 'learning_rate': 1.76206182182603e-05, 'epoch': 0.25} + 25%|██▍ | 1429/5772 [2:30:23<7:30:54, 6.23s/it] 25%|██▍ | 1430/5772 [2:30:37<7:31:21, 6.24s/it] 25%|██▍ | 1430/5772 [2:30:30<7:31:21, 6.24s/it] {'loss': 0.4694, 'learning_rate': 1.76169832301733e-05, 'epoch': 0.25} + 25%|██▍ | 1430/5772 [2:30:37<7:31:21, 6.24s/it] {'loss': 0.4694, 'learning_rate': 1.76169832301733e-05, 'epoch': 0.25} + 25%|██▍ | 1430/5772 [2:30:30<7:31:21, 6.24s/it] 25%|██▍ | 1431/5772 [2:30:43<7:31:38, 6.24s/it] 25%|██▍ | 1431/5772 [2:30:36<7:31:38, 6.24s/it] {'loss': 0.4955, 'learning_rate': 1.7613345843159777e-05, 'epoch': 0.25} + 25%|██▍ | 1431/5772 [2:30:43<7:31:38, 6.24s/it] {'loss': 0.4955, 'learning_rate': 1.7613345843159777e-05, 'epoch': 0.25} + 25%|██▍ | 1431/5772 [2:30:36<7:31:38, 6.24s/it] 25%|██▍ | 1432/5772 [2:30:42<7:28:14, 6.20s/it] 25%|██▍ | 1432/5772 [2:30:49<7:28:14, 6.20s/it] {'loss': 0.4795, 'learning_rate': 1.760970605836531e-05, 'epoch': 0.25} + 25%|██▍ | 1432/5772 [2:30:49<7:28:14, 6.20s/it] {'loss': 0.4795, 'learning_rate': 1.760970605836531e-05, 'epoch': 0.25} + 25%|██▍ | 1432/5772 [2:30:42<7:28:14, 6.20s/it] 25%|██▍ | 1433/5772 [2:30:48<7:26:31, 6.17s/it] 25%|██▍ | 1433/5772 [2:30:55<7:26:31, 6.17s/it] {'loss': 0.4889, 'learning_rate': 1.7606063876936235e-05, 'epoch': 0.25} + 25%|██▍ | 1433/5772 [2:30:55<7:26:31, 6.17s/it] {'loss': 0.4889, 'learning_rate': 1.7606063876936235e-05, 'epoch': 0.25} + 25%|██▍ | 1433/5772 [2:30:48<7:26:31, 6.17s/it] 25%|██▍ | 1434/5772 [2:30:55<7:34:18, 6.28s/it] 25%|██▍ | 1434/5772 [2:31:02<7:34:18, 6.28s/it] {'loss': 0.4722, 'learning_rate': 1.7602419300019627e-05, 'epoch': 0.25} + 25%|██▍ | 1434/5772 [2:31:02<7:34:18, 6.28s/it] {'loss': 0.4722, 'learning_rate': 1.7602419300019627e-05, 'epoch': 0.25} + 25%|██▍ | 1434/5772 [2:30:55<7:34:18, 6.28s/it] 25%|██▍ | 1435/5772 [2:31:08<7:31:41, 6.25s/it] 25%|██▍ | 1435/5772 [2:31:01<7:31:41, 6.25s/it] {'loss': 0.4881, 'learning_rate': 1.7598772328763335e-05, 'epoch': 0.25} + 25%|██▍ | 1435/5772 [2:31:08<7:31:41, 6.25s/it] {'loss': 0.4881, 'learning_rate': 1.7598772328763335e-05, 'epoch': 0.25} + 25%|██▍ | 1435/5772 [2:31:01<7:31:41, 6.25s/it] 25%|██▍ | 1436/5772 [2:31:07<7:37:42, 6.33s/it] 25%|██▍ | 1436/5772 [2:31:14<7:37:42, 6.33s/it] {'loss': 0.4842, 'learning_rate': 1.7595122964315945e-05, 'epoch': 0.25} + 25%|██▍ | 1436/5772 [2:31:14<7:37:42, 6.33s/it] {'loss': 0.4842, 'learning_rate': 1.7595122964315945e-05, 'epoch': 0.25} + 25%|██▍ | 1436/5772 [2:31:07<7:37:42, 6.33s/it] 25%|██▍ | 1437/5772 [2:31:14<7:40:56, 6.38s/it] 25%|██▍ | 1437/5772 [2:31:21<7:40:56, 6.38s/it] {'loss': 0.4988, 'learning_rate': 1.7591471207826804e-05, 'epoch': 0.25} + 25%|██▍ | 1437/5772 [2:31:21<7:40:56, 6.38s/it] {'loss': 0.4988, 'learning_rate': 1.7591471207826804e-05, 'epoch': 0.25} + 25%|██▍ | 1437/5772 [2:31:14<7:40:56, 6.38s/it] 25%|██▍ | 1438/5772 [2:31:20<7:32:53, 6.27s/it] 25%|██▍ | 1438/5772 [2:31:27<7:32:53, 6.27s/it] {'loss': 0.4792, 'learning_rate': 1.758781706044602e-05, 'epoch': 0.25} + 25%|██▍ | 1438/5772 [2:31:27<7:32:53, 6.27s/it] {'loss': 0.4792, 'learning_rate': 1.758781706044602e-05, 'epoch': 0.25} + 25%|██▍ | 1438/5772 [2:31:20<7:32:53, 6.27s/it] 25%|██▍ | 1439/5772 [2:31:26<7:35:03, 6.30s/it] 25%|██▍ | 1439/5772 [2:31:33<7:35:03, 6.30s/it] {'loss': 0.4915, 'learning_rate': 1.7584160523324437e-05, 'epoch': 0.25} + 25%|██▍ | 1439/5772 [2:31:33<7:35:03, 6.30s/it] {'loss': 0.4915, 'learning_rate': 1.7584160523324437e-05, 'epoch': 0.25} + 25%|██▍ | 1439/5772 [2:31:26<7:35:03, 6.30s/it] 25%|██▍ | 1440/5772 [2:31:32<7:31:16, 6.25s/it] 25%|██▍ | 1440/5772 [2:31:39<7:31:15, 6.25s/it] {'loss': 0.4851, 'learning_rate': 1.7580501597613665e-05, 'epoch': 0.25} + 25%|██▍ | 1440/5772 [2:31:39<7:31:15, 6.25s/it] {'loss': 0.4851, 'learning_rate': 1.7580501597613665e-05, 'epoch': 0.25} + 25%|██▍ | 1440/5772 [2:31:32<7:31:16, 6.25s/it] 25%|██▍ | 1441/5772 [2:31:39<7:36:19, 6.32s/it] 25%|██▍ | 1441/5772 [2:31:46<7:36:19, 6.32s/it] {'loss': 0.4947, 'learning_rate': 1.7576840284466065e-05, 'epoch': 0.25} + 25%|██▍ | 1441/5772 [2:31:46<7:36:19, 6.32s/it] {'loss': 0.4947, 'learning_rate': 1.7576840284466065e-05, 'epoch': 0.25} + 25%|██▍ | 1441/5772 [2:31:39<7:36:19, 6.32s/it] 25%|██▍ | 1442/5772 [2:31:44<7:22:41, 6.13s/it] 25%|██▍ | 1442/5772 [2:31:51<7:22:41, 6.13s/it] {'loss': 0.4741, 'learning_rate': 1.7573176585034744e-05, 'epoch': 0.25} + 25%|██▍ | 1442/5772 [2:31:51<7:22:41, 6.13s/it] {'loss': 0.4741, 'learning_rate': 1.7573176585034744e-05, 'epoch': 0.25} + 25%|██▍ | 1442/5772 [2:31:44<7:22:41, 6.13s/it] 25%|██▌ | 1443/5772 [2:31:51<7:33:24, 6.28s/it] 25%|██▌ | 1443/5772 [2:31:58<7:33:24, 6.28s/it] {'loss': 0.4847, 'learning_rate': 1.7569510500473566e-05, 'epoch': 0.25} + 25%|██▌ | 1443/5772 [2:31:58<7:33:24, 6.28s/it] {'loss': 0.4847, 'learning_rate': 1.7569510500473566e-05, 'epoch': 0.25} + 25%|██▌ | 1443/5772 [2:31:51<7:33:24, 6.28s/it] 25%|██▌ | 1444/5772 [2:31:57<7:26:37, 6.19s/it] 25%|██▌ | 1444/5772 [2:32:04<7:26:37, 6.19s/it] {'loss': 0.4885, 'learning_rate': 1.756584203193714e-05, 'epoch': 0.25} + 25%|██▌ | 1444/5772 [2:32:04<7:26:37, 6.19s/it] {'loss': 0.4885, 'learning_rate': 1.756584203193714e-05, 'epoch': 0.25} + 25%|██▌ | 1444/5772 [2:31:57<7:26:37, 6.19s/it] 25%|██▌ | 1445/5772 [2:32:03<7:17:42, 6.07s/it] 25%|██▌ | 1445/5772 [2:32:10<7:17:42, 6.07s/it] {'loss': 0.4767, 'learning_rate': 1.7562171180580834e-05, 'epoch': 0.25} + 25%|██▌ | 1445/5772 [2:32:10<7:17:42, 6.07s/it] {'loss': 0.4767, 'learning_rate': 1.7562171180580834e-05, 'epoch': 0.25} + 25%|██▌ | 1445/5772 [2:32:03<7:17:42, 6.07s/it] 25%|██▌ | 1446/5772 [2:32:09<7:13:35, 6.01s/it] 25%|██▌ | 1446/5772 [2:32:16<7:13:35, 6.01s/it] {'loss': 0.4881, 'learning_rate': 1.755849794756076e-05, 'epoch': 0.25} + 25%|██▌ | 1446/5772 [2:32:16<7:13:35, 6.01s/it] {'loss': 0.4881, 'learning_rate': 1.755849794756076e-05, 'epoch': 0.25} + 25%|██▌ | 1446/5772 [2:32:09<7:13:35, 6.01s/it] 25%|██▌ | 1447/5772 [2:32:22<7:20:03, 6.10s/it] 25%|██▌ | 1447/5772 [2:32:15<7:20:03, 6.10s/it] {'loss': 0.4759, 'learning_rate': 1.7554822334033782e-05, 'epoch': 0.25} + 25%|██▌ | 1447/5772 [2:32:22<7:20:03, 6.10s/it] {'loss': 0.4759, 'learning_rate': 1.7554822334033782e-05, 'epoch': 0.25} + 25%|██▌ | 1447/5772 [2:32:15<7:20:03, 6.10s/it] 25%|██▌ | 1448/5772 [2:32:21<7:19:24, 6.10s/it] 25%|██▌ | 1448/5772 [2:32:28<7:19:25, 6.10s/it] {'loss': 0.4845, 'learning_rate': 1.7551144341157514e-05, 'epoch': 0.25} + 25%|██▌ | 1448/5772 [2:32:28<7:19:25, 6.10s/it] {'loss': 0.4845, 'learning_rate': 1.7551144341157514e-05, 'epoch': 0.25} + 25%|██▌ | 1448/5772 [2:32:21<7:19:24, 6.10s/it] 25%|██▌ | 1449/5772 [2:32:27<7:19:34, 6.10s/it] 25%|██▌ | 1449/5772 [2:32:34<7:19:34, 6.10s/it] {'loss': 0.4891, 'learning_rate': 1.7547463970090324e-05, 'epoch': 0.25} + 25%|██▌ | 1449/5772 [2:32:34<7:19:34, 6.10s/it] {'loss': 0.4891, 'learning_rate': 1.7547463970090324e-05, 'epoch': 0.25} + 25%|██▌ | 1449/5772 [2:32:27<7:19:34, 6.10s/it]11 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend...8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 25%|██▌ | 1450/5772 [2:32:40<7:18:40, 6.09s/it]14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 + AutoResumeHook: Checking whether to suspend... + 6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 25%|██▌ | 1450/5772 [2:32:33<7:18:41, 6.09s/it]13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4907, 'learning_rate': 1.7543781221991317e-05, 'epoch': 0.25} + 25%|██▌ | 1450/5772 [2:32:40<7:18:40, 6.09s/it] {'loss': 0.4907, 'learning_rate': 1.7543781221991317e-05, 'epoch': 0.25} + 25%|██▌ | 1450/5772 [2:32:33<7:18:41, 6.09s/it] 25%|██▌ | 1451/5772 [2:32:39<7:18:00, 6.08s/it] 25%|██▌ | 1451/5772 [2:32:46<7:18:01, 6.08s/it] {'loss': 0.5027, 'learning_rate': 1.7540096098020358e-05, 'epoch': 0.25} + 25%|██▌ | 1451/5772 [2:32:46<7:18:01, 6.08s/it] {'loss': 0.5027, 'learning_rate': 1.7540096098020358e-05, 'epoch': 0.25} + 25%|██▌ | 1451/5772 [2:32:39<7:18:00, 6.08s/it] 25%|██▌ | 1452/5772 [2:32:53<7:27:50, 6.22s/it] 25%|██▌ | 1452/5772 [2:32:46<7:27:51, 6.22s/it] {'loss': 0.482, 'learning_rate': 1.753640859933806e-05, 'epoch': 0.25} + 25%|██▌ | 1452/5772 [2:32:46<7:27:51, 6.22s/it]{'loss': 0.482, 'learning_rate': 1.753640859933806e-05, 'epoch': 0.25} + 25%|██▌ | 1452/5772 [2:32:53<7:27:50, 6.22s/it] 25%|██▌ | 1453/5772 [2:32:59<7:27:39, 6.22s/it] 25%|██▌ | 1453/5772 [2:32:52<7:27:39, 6.22s/it] {'loss': 0.4841, 'learning_rate': 1.7532718727105772e-05, 'epoch': 0.25} + 25%|██▌ | 1453/5772 [2:32:59<7:27:39, 6.22s/it] {'loss': 0.4841, 'learning_rate': 1.7532718727105772e-05, 'epoch': 0.25} + 25%|██▌ | 1453/5772 [2:32:52<7:27:39, 6.22s/it] 25%|██▌ | 1454/5772 [2:32:59<7:40:33, 6.40s/it] 25%|██▌ | 1454/5772 [2:33:06<7:40:33, 6.40s/it] {'loss': 0.4958, 'learning_rate': 1.7529026482485605e-05, 'epoch': 0.25} + 25%|██▌ | 1454/5772 [2:33:06<7:40:33, 6.40s/it] {'loss': 0.4958, 'learning_rate': 1.7529026482485605e-05, 'epoch': 0.25} + 25%|██▌ | 1454/5772 [2:32:59<7:40:33, 6.40s/it] 25%|██▌ | 1455/5772 [2:33:12<7:35:43, 6.33s/it] 25%|██▌ | 1455/5772 [2:33:05<7:35:43, 6.33s/it] {'loss': 0.4702, 'learning_rate': 1.7525331866640406e-05, 'epoch': 0.25} + 25%|██▌ | 1455/5772 [2:33:12<7:35:43, 6.33s/it] {'loss': 0.4702, 'learning_rate': 1.7525331866640406e-05, 'epoch': 0.25} + 25%|██▌ | 1455/5772 [2:33:05<7:35:43, 6.33s/it] 25%|██▌ | 1456/5772 [2:33:18<7:31:39, 6.28s/it] 25%|██▌ | 1456/5772 [2:33:11<7:31:40, 6.28s/it] {'loss': 0.479, 'learning_rate': 1.752163488073378e-05, 'epoch': 0.25} + 25%|██▌ | 1456/5772 [2:33:18<7:31:39, 6.28s/it] {'loss': 0.479, 'learning_rate': 1.752163488073378e-05, 'epoch': 0.25} + 25%|██▌ | 1456/5772 [2:33:11<7:31:40, 6.28s/it] 25%|██▌ | 1457/5772 [2:33:24<7:24:53, 6.19s/it] 25%|██▌ | 1457/5772 [2:33:17<7:24:52, 6.19s/it] {'loss': 0.4984, 'learning_rate': 1.7517935525930068e-05, 'epoch': 0.25} + 25%|██▌ | 1457/5772 [2:33:24<7:24:53, 6.19s/it] {'loss': 0.4984, 'learning_rate': 1.7517935525930068e-05, 'epoch': 0.25} + 25%|██▌ | 1457/5772 [2:33:17<7:24:52, 6.19s/it] 25%|██▌ | 1458/5772 [2:33:23<7:22:56, 6.16s/it] 25%|██▌ | 1458/5772 [2:33:30<7:22:57, 6.16s/it] {'loss': 0.4763, 'learning_rate': 1.751423380339436e-05, 'epoch': 0.25} + 25%|██▌ | 1458/5772 [2:33:23<7:22:56, 6.16s/it]{'loss': 0.4763, 'learning_rate': 1.751423380339436e-05, 'epoch': 0.25} + 25%|██▌ | 1458/5772 [2:33:30<7:22:57, 6.16s/it] 25%|██▌ | 1459/5772 [2:33:29<7:20:03, 6.12s/it] 25%|██▌ | 1459/5772 [2:33:36<7:20:03, 6.12s/it] {'loss': 0.4875, 'learning_rate': 1.7510529714292497e-05, 'epoch': 0.25} + 25%|██▌ | 1459/5772 [2:33:36<7:20:03, 6.12s/it] {'loss': 0.4875, 'learning_rate': 1.7510529714292497e-05, 'epoch': 0.25} + 25%|██▌ | 1459/5772 [2:33:29<7:20:03, 6.12s/it] 25%|██▌ | 1460/5772 [2:33:42<7:14:38, 6.05s/it] 25%|██▌ | 1460/5772 [2:33:35<7:14:38, 6.05s/it] {'loss': 0.4765, 'learning_rate': 1.750682325979106e-05, 'epoch': 0.25} + 25%|██▌ | 1460/5772 [2:33:42<7:14:38, 6.05s/it] {'loss': 0.4765, 'learning_rate': 1.750682325979106e-05, 'epoch': 0.25} + 25%|██▌ | 1460/5772 [2:33:35<7:14:38, 6.05s/it] 25%|██▌ | 1461/5772 [2:33:49<7:22:04, 6.15s/it] 25%|██▌ | 1461/5772 [2:33:42<7:22:04, 6.15s/it] {'loss': 0.486, 'learning_rate': 1.7503114441057374e-05, 'epoch': 0.25} + 25%|██▌ | 1461/5772 [2:33:49<7:22:04, 6.15s/it] {'loss': 0.486, 'learning_rate': 1.7503114441057374e-05, 'epoch': 0.25} + 25%|██▌ | 1461/5772 [2:33:42<7:22:04, 6.15s/it] 25%|██▌ | 1462/5772 [2:33:48<7:22:51, 6.17s/it] 25%|██▌ | 1462/5772 [2:33:55<7:22:51, 6.17s/it] {'loss': 0.471, 'learning_rate': 1.7499403259259515e-05, 'epoch': 0.25} + 25%|██▌ | 1462/5772 [2:33:55<7:22:51, 6.17s/it] {'loss': 0.471, 'learning_rate': 1.7499403259259515e-05, 'epoch': 0.25} + 25%|██▌ | 1462/5772 [2:33:48<7:22:51, 6.17s/it] 25%|██▌ | 1463/5772 [2:33:54<7:24:34, 6.19s/it] 25%|██▌ | 1463/5772 [2:34:01<7:24:34, 6.19s/it] {'loss': 0.4966, 'learning_rate': 1.749568971556629e-05, 'epoch': 0.25} + 25%|██▌ | 1463/5772 [2:34:01<7:24:34, 6.19s/it] {'loss': 0.4966, 'learning_rate': 1.749568971556629e-05, 'epoch': 0.25} + 25%|██▌ | 1463/5772 [2:33:54<7:24:34, 6.19s/it] 25%|██▌ | 1464/5772 [2:34:00<7:17:21, 6.09s/it] 25%|██▌ | 1464/5772 [2:34:07<7:17:21, 6.09s/it] {'loss': 0.4799, 'learning_rate': 1.749197381114727e-05, 'epoch': 0.25} + 25%|██▌ | 1464/5772 [2:34:07<7:17:21, 6.09s/it] {'loss': 0.4799, 'learning_rate': 1.749197381114727e-05, 'epoch': 0.25} + 25%|██▌ | 1464/5772 [2:34:00<7:17:21, 6.09s/it] 25%|██▌ | 1465/5772 [2:34:13<7:18:21, 6.11s/it] 25%|██▌ | 1465/5772 [2:34:06<7:18:22, 6.11s/it] {'loss': 0.4838, 'learning_rate': 1.7488255547172754e-05, 'epoch': 0.25} + 25%|██▌ | 1465/5772 [2:34:13<7:18:21, 6.11s/it] {'loss': 0.4838, 'learning_rate': 1.7488255547172754e-05, 'epoch': 0.25} + 25%|██▌ | 1465/5772 [2:34:06<7:18:22, 6.11s/it] 25%|██▌ | 1466/5772 [2:34:19<7:12:18, 6.02s/it] 25%|██▌ | 1466/5772 [2:34:12<7:12:18, 6.02s/it] {'loss': 0.4895, 'learning_rate': 1.7484534924813785e-05, 'epoch': 0.25} + 25%|██▌ | 1466/5772 [2:34:19<7:12:18, 6.02s/it] {'loss': 0.4895, 'learning_rate': 1.7484534924813785e-05, 'epoch': 0.25} + 25%|██▌ | 1466/5772 [2:34:12<7:12:18, 6.02s/it] 25%|██▌ | 1467/5772 [2:34:25<7:09:57, 5.99s/it] 25%|██▌ | 1467/5772 [2:34:18<7:09:57, 5.99s/it] {'loss': 0.4809, 'learning_rate': 1.748081194524216e-05, 'epoch': 0.25} + 25%|██▌ | 1467/5772 [2:34:25<7:09:57, 5.99s/it] {'loss': 0.4809, 'learning_rate': 1.748081194524216e-05, 'epoch': 0.25} + 25%|██▌ | 1467/5772 [2:34:18<7:09:57, 5.99s/it] 25%|██▌ | 1468/5772 [2:34:24<7:12:41, 6.03s/it] 25%|██▌ | 1468/5772 [2:34:31<7:12:42, 6.03s/it] {'loss': 0.4868, 'learning_rate': 1.7477086609630403e-05, 'epoch': 0.25} + 25%|██▌ | 1468/5772 [2:34:31<7:12:42, 6.03s/it] {'loss': 0.4868, 'learning_rate': 1.7477086609630403e-05, 'epoch': 0.25} + 25%|██▌ | 1468/5772 [2:34:24<7:12:41, 6.03s/it] 25%|██▌ | 1469/5772 [2:34:30<7:13:37, 6.05s/it] 25%|██▌ | 1469/5772 [2:34:37<7:13:37, 6.05s/it] {'loss': 0.4892, 'learning_rate': 1.7473358919151792e-05, 'epoch': 0.25} + 25%|██▌ | 1469/5772 [2:34:37<7:13:37, 6.05s/it] {'loss': 0.4892, 'learning_rate': 1.7473358919151792e-05, 'epoch': 0.25} + 25%|██▌ | 1469/5772 [2:34:30<7:13:37, 6.05s/it] 25%|██▌ | 1470/5772 [2:34:36<7:11:54, 6.02s/it] 25%|██▌ | 1470/5772 [2:34:43<7:11:54, 6.02s/it] {'loss': 0.4867, 'learning_rate': 1.746962887498034e-05, 'epoch': 0.25} + 25%|██▌ | 1470/5772 [2:34:43<7:11:54, 6.02s/it] {'loss': 0.4867, 'learning_rate': 1.746962887498034e-05, 'epoch': 0.25} + 25%|██▌ | 1470/5772 [2:34:36<7:11:54, 6.02s/it] 25%|██▌ | 1471/5772 [2:34:42<7:12:43, 6.04s/it] 25%|██▌ | 1471/5772 [2:34:49<7:12:43, 6.04s/it] {'loss': 0.4791, 'learning_rate': 1.746589647829081e-05, 'epoch': 0.25} + 25%|██▌ | 1471/5772 [2:34:49<7:12:43, 6.04s/it] {'loss': 0.4791, 'learning_rate': 1.746589647829081e-05, 'epoch': 0.25} + 25%|██▌ | 1471/5772 [2:34:42<7:12:43, 6.04s/it] 26%|██▌ | 1472/5772 [2:34:48<7:11:21, 6.02s/it] 26%|██▌ | 1472/5772 [2:34:55<7:11:21, 6.02s/it] {'loss': 0.4751, 'learning_rate': 1.7462161730258688e-05, 'epoch': 0.26} + 26%|██▌ | 1472/5772 [2:34:55<7:11:21, 6.02s/it] {'loss': 0.4751, 'learning_rate': 1.7462161730258688e-05, 'epoch': 0.26} + 26%|██▌ | 1472/5772 [2:34:48<7:11:21, 6.02s/it] 26%|██▌ | 1473/5772 [2:34:54<7:04:58, 5.93s/it] 26%|██▌ | 1473/5772 [2:35:01<7:04:58, 5.93s/it] {'loss': 0.4752, 'learning_rate': 1.745842463206022e-05, 'epoch': 0.26} + 26%|██▌ | 1473/5772 [2:35:01<7:04:58, 5.93s/it] {'loss': 0.4752, 'learning_rate': 1.745842463206022e-05, 'epoch': 0.26} + 26%|██▌ | 1473/5772 [2:34:54<7:04:58, 5.93s/it] 26%|██▌ | 1474/5772 [2:35:00<7:03:22, 5.91s/it] 26%|██▌ | 1474/5772 [2:35:07<7:03:22, 5.91s/it] {'loss': 0.4848, 'learning_rate': 1.7454685184872388e-05, 'epoch': 0.26} + 26%|██▌ | 1474/5772 [2:35:07<7:03:22, 5.91s/it] {'loss': 0.4848, 'learning_rate': 1.7454685184872388e-05, 'epoch': 0.26} + 26%|██▌ | 1474/5772 [2:35:00<7:03:22, 5.91s/it] 26%|██▌ | 1475/5772 [2:35:06<7:16:37, 6.10s/it] 26%|██▌ | 1475/5772 [2:35:13<7:16:37, 6.10s/it] {'loss': 0.4824, 'learning_rate': 1.74509433898729e-05, 'epoch': 0.26} + 26%|██▌ | 1475/5772 [2:35:13<7:16:37, 6.10s/it] {'loss': 0.4824, 'learning_rate': 1.74509433898729e-05, 'epoch': 0.26} + 26%|██▌ | 1475/5772 [2:35:06<7:16:37, 6.10s/it] 26%|██▌ | 1476/5772 [2:35:19<7:19:35, 6.14s/it] 26%|██▌ | 1476/5772 [2:35:12<7:19:36, 6.14s/it] {'loss': 0.4817, 'learning_rate': 1.7447199248240222e-05, 'epoch': 0.26} + 26%|██▌ | 1476/5772 [2:35:19<7:19:35, 6.14s/it] {'loss': 0.4817, 'learning_rate': 1.7447199248240222e-05, 'epoch': 0.26} + 26%|██▌ | 1476/5772 [2:35:12<7:19:36, 6.14s/it] 26%|██▌ | 1477/5772 [2:35:18<7:12:41, 6.04s/it] 26%|██▌ | 1477/5772 [2:35:25<7:12:41, 6.04s/it] {'loss': 0.4745, 'learning_rate': 1.7443452761153546e-05, 'epoch': 0.26} + 26%|██▌ | 1477/5772 [2:35:25<7:12:41, 6.04s/it] {'loss': 0.4745, 'learning_rate': 1.7443452761153546e-05, 'epoch': 0.26} + 26%|██▌ | 1477/5772 [2:35:18<7:12:41, 6.04s/it] 26%|██▌ | 1478/5772 [2:35:32<7:20:36, 6.16s/it] 26%|██▌ | 1478/5772 [2:35:25<7:20:36, 6.16s/it] {'loss': 0.4781, 'learning_rate': 1.743970392979281e-05, 'epoch': 0.26} + 26%|██▌ | 1478/5772 [2:35:32<7:20:36, 6.16s/it] {'loss': 0.4781, 'learning_rate': 1.743970392979281e-05, 'epoch': 0.26} + 26%|██▌ | 1478/5772 [2:35:25<7:20:36, 6.16s/it] 26%|██▌ | 1479/5772 [2:35:31<7:29:46, 6.29s/it] 26%|██▌ | 1479/5772 [2:35:38<7:29:46, 6.29s/it] {'loss': 0.4853, 'learning_rate': 1.743595275533869e-05, 'epoch': 0.26} + 26%|██▌ | 1479/5772 [2:35:38<7:29:46, 6.29s/it] {'loss': 0.4853, 'learning_rate': 1.743595275533869e-05, 'epoch': 0.26} + 26%|██▌ | 1479/5772 [2:35:31<7:29:46, 6.29s/it] 26%|██▌ | 1480/5772 [2:35:45<7:34:10, 6.35s/it] 26%|██▌ | 1480/5772 [2:35:38<7:34:10, 6.35s/it] {'loss': 0.4854, 'learning_rate': 1.743219923897259e-05, 'epoch': 0.26} + 26%|██▌ | 1480/5772 [2:35:45<7:34:10, 6.35s/it] {'loss': 0.4854, 'learning_rate': 1.743219923897259e-05, 'epoch': 0.26} + 26%|██▌ | 1480/5772 [2:35:38<7:34:10, 6.35s/it] 26%|██▌ | 1481/5772 [2:35:44<7:28:16, 6.27s/it] 26%|██▌ | 1481/5772 [2:35:51<7:28:16, 6.27s/it] {'loss': 0.4923, 'learning_rate': 1.7428443381876657e-05, 'epoch': 0.26} + 26%|██▌ | 1481/5772 [2:35:51<7:28:16, 6.27s/it] {'loss': 0.4923, 'learning_rate': 1.7428443381876657e-05, 'epoch': 0.26} + 26%|██▌ | 1481/5772 [2:35:44<7:28:16, 6.27s/it] 26%|██▌ | 1482/5772 [2:35:57<7:24:52, 6.22s/it] 26%|██▌ | 1482/5772 [2:35:50<7:24:52, 6.22s/it] {'loss': 0.4898, 'learning_rate': 1.7424685185233788e-05, 'epoch': 0.26} + 26%|██▌ | 1482/5772 [2:35:57<7:24:52, 6.22s/it] {'loss': 0.4898, 'learning_rate': 1.7424685185233788e-05, 'epoch': 0.26} + 26%|██▌ | 1482/5772 [2:35:50<7:24:52, 6.22s/it] 26%|██▌ | 1483/5772 [2:35:56<7:19:24, 6.15s/it] 26%|██▌ | 1483/5772 [2:36:03<7:19:24, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.7420924650227603e-05, 'epoch': 0.26} + 26%|██▌ | 1483/5772 [2:36:03<7:19:24, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.7420924650227603e-05, 'epoch': 0.26} + 26%|██▌ | 1483/5772 [2:35:56<7:19:24, 6.15s/it] 26%|██▌ | 1484/5772 [2:36:09<7:15:47, 6.10s/it] 26%|██▌ | 1484/5772 [2:36:02<7:15:47, 6.10s/it] {'loss': 0.4953, 'learning_rate': 1.7417161778042456e-05, 'epoch': 0.26} + 26%|██▌ | 1484/5772 [2:36:09<7:15:47, 6.10s/it] {'loss': 0.4953, 'learning_rate': 1.7417161778042456e-05, 'epoch': 0.26} + 26%|██▌ | 1484/5772 [2:36:02<7:15:47, 6.10s/it] 26%|██▌ | 1485/5772 [2:36:15<7:15:36, 6.10s/it] 26%|██▌ | 1485/5772 [2:36:08<7:15:36, 6.10s/it] {'loss': 0.4786, 'learning_rate': 1.741339656986345e-05, 'epoch': 0.26} + 26%|██▌ | 1485/5772 [2:36:15<7:15:36, 6.10s/it] {'loss': 0.4786, 'learning_rate': 1.741339656986345e-05, 'epoch': 0.26} + 26%|██▌ | 1485/5772 [2:36:08<7:15:36, 6.10s/it] 26%|██▌ | 1486/5772 [2:36:21<7:11:35, 6.04s/it] 26%|██▌ | 1486/5772 [2:36:14<7:11:35, 6.04s/it] {'loss': 0.4889, 'learning_rate': 1.7409629026876412e-05, 'epoch': 0.26} + 26%|██▌ | 1486/5772 [2:36:21<7:11:35, 6.04s/it] {'loss': 0.4889, 'learning_rate': 1.7409629026876412e-05, 'epoch': 0.26} + 26%|██▌ | 1486/5772 [2:36:14<7:11:35, 6.04s/it] 26%|██▌ | 1487/5772 [2:36:27<7:12:12, 6.05s/it] 26%|██▌ | 1487/5772 [2:36:20<7:12:12, 6.05s/it] {'loss': 0.4938, 'learning_rate': 1.740585915026791e-05, 'epoch': 0.26} + 26%|██▌ | 1487/5772 [2:36:27<7:12:12, 6.05s/it] {'loss': 0.4938, 'learning_rate': 1.740585915026791e-05, 'epoch': 0.26} + 26%|██▌ | 1487/5772 [2:36:20<7:12:12, 6.05s/it] 26%|██▌ | 1488/5772 [2:36:33<7:03:56, 5.94s/it] 26%|██▌ | 1488/5772 [2:36:26<7:03:56, 5.94s/it] {'loss': 0.4819, 'learning_rate': 1.7402086941225246e-05, 'epoch': 0.26} + 26%|██▌ | 1488/5772 [2:36:33<7:03:56, 5.94s/it] {'loss': 0.4819, 'learning_rate': 1.7402086941225246e-05, 'epoch': 0.26} + 26%|██▌ | 1488/5772 [2:36:26<7:03:56, 5.94s/it] 26%|██▌ | 1489/5772 [2:36:32<7:05:56, 5.97s/it] 26%|██▌ | 1489/5772 [2:36:39<7:05:57, 5.97s/it] {'loss': 0.4728, 'learning_rate': 1.739831240093645e-05, 'epoch': 0.26} + 26%|██▌ | 1489/5772 [2:36:39<7:05:57, 5.97s/it] {'loss': 0.4728, 'learning_rate': 1.739831240093645e-05, 'epoch': 0.26} + 26%|██▌ | 1489/5772 [2:36:32<7:05:56, 5.97s/it] 26%|██▌ | 1490/5772 [2:36:45<7:10:02, 6.03s/it] 26%|██▌ | 1490/5772 [2:36:38<7:10:01, 6.03s/it] {'loss': 0.4836, 'learning_rate': 1.7394535530590305e-05, 'epoch': 0.26} + 26%|██▌ | 1490/5772 [2:36:45<7:10:02, 6.03s/it] {'loss': 0.4836, 'learning_rate': 1.7394535530590305e-05, 'epoch': 0.26} + 26%|██▌ | 1490/5772 [2:36:38<7:10:01, 6.03s/it] 26%|██▌ | 1491/5772 [2:36:51<7:09:44, 6.02s/it] 26%|██▌ | 1491/5772 [2:36:44<7:09:44, 6.02s/it] {'loss': 0.4717, 'learning_rate': 1.7390756331376307e-05, 'epoch': 0.26} + 26%|██▌ | 1491/5772 [2:36:51<7:09:44, 6.02s/it] {'loss': 0.4717, 'learning_rate': 1.7390756331376307e-05, 'epoch': 0.26} + 26%|██▌ | 1491/5772 [2:36:44<7:09:44, 6.02s/it] 26%|██▌ | 1492/5772 [2:36:57<7:20:11, 6.17s/it] 26%|██▌ | 1492/5772 [2:36:50<7:20:11, 6.17s/it] {'loss': 0.4874, 'learning_rate': 1.7386974804484694e-05, 'epoch': 0.26} + 26%|██▌ | 1492/5772 [2:36:57<7:20:11, 6.17s/it] {'loss': 0.4874, 'learning_rate': 1.7386974804484694e-05, 'epoch': 0.26} + 26%|██▌ | 1492/5772 [2:36:50<7:20:11, 6.17s/it] 26%|██▌ | 1493/5772 [2:37:04<7:37:18, 6.41s/it] 26%|██▌ | 1493/5772 [2:36:57<7:37:18, 6.41s/it] {'loss': 0.4797, 'learning_rate': 1.738319095110644e-05, 'epoch': 0.26} + 26%|██▌ | 1493/5772 [2:37:04<7:37:18, 6.41s/it] {'loss': 0.4797, 'learning_rate': 1.738319095110644e-05, 'epoch': 0.26} + 26%|██▌ | 1493/5772 [2:36:57<7:37:18, 6.41s/it] 26%|██▌ | 1494/5772 [2:37:10<7:26:40, 6.26s/it] 26%|██▌ | 1494/5772 [2:37:03<7:26:41, 6.26s/it] {'loss': 0.4836, 'learning_rate': 1.7379404772433247e-05, 'epoch': 0.26} + 26%|██▌ | 1494/5772 [2:37:10<7:26:40, 6.26s/it] {'loss': 0.4836, 'learning_rate': 1.7379404772433247e-05, 'epoch': 0.26} + 26%|██▌ | 1494/5772 [2:37:03<7:26:41, 6.26s/it] 26%|██▌ | 1495/5772 [2:37:16<7:21:21, 6.19s/it] 26%|██▌ | 1495/5772 [2:37:09<7:21:21, 6.19s/it] {'loss': 0.4834, 'learning_rate': 1.7375616269657544e-05, 'epoch': 0.26} + 26%|██▌ | 1495/5772 [2:37:16<7:21:21, 6.19s/it] {'loss': 0.4834, 'learning_rate': 1.7375616269657544e-05, 'epoch': 0.26} + 26%|██▌ | 1495/5772 [2:37:09<7:21:21, 6.19s/it] 26%|██▌ | 1496/5772 [2:37:15<7:16:13, 6.12s/it] 26%|██▌ | 1496/5772 [2:37:22<7:16:13, 6.12s/it] {'loss': 0.4895, 'learning_rate': 1.7371825443972513e-05, 'epoch': 0.26} + 26%|██▌ | 1496/5772 [2:37:22<7:16:13, 6.12s/it] {'loss': 0.4895, 'learning_rate': 1.7371825443972513e-05, 'epoch': 0.26} + 26%|██▌ | 1496/5772 [2:37:15<7:16:13, 6.12s/it] 26%|██▌ | 1497/5772 [2:37:21<7:17:27, 6.14s/it] 26%|██▌ | 1497/5772 [2:37:28<7:17:28, 6.14s/it] {'loss': 0.4899, 'learning_rate': 1.736803229657204e-05, 'epoch': 0.26} + {'loss': 0.4899, 'learning_rate': 1.736803229657204e-05, 'epoch': 0.26} 26%|██▌ | 1497/5772 [2:37:28<7:17:28, 6.14s/it] + 26%|██▌ | 1497/5772 [2:37:21<7:17:27, 6.14s/it] 26%|██▌ | 1498/5772 [2:37:27<7:15:43, 6.12s/it] 26%|██▌ | 1498/5772 [2:37:35<7:15:43, 6.12s/it] {'loss': 0.4932, 'learning_rate': 1.7364236828650768e-05, 'epoch': 0.26} + 26%|██▌ | 1498/5772 [2:37:35<7:15:43, 6.12s/it] {'loss': 0.4932, 'learning_rate': 1.7364236828650768e-05, 'epoch': 0.26} + 26%|██▌ | 1498/5772 [2:37:27<7:15:43, 6.12s/it] 26%|██▌ | 1499/5772 [2:37:34<7:19:09, 6.17s/it] 26%|██▌ | 1499/5772 [2:37:41<7:19:09, 6.17s/it] {'loss': 0.4728, 'learning_rate': 1.736043904140405e-05, 'epoch': 0.26} + 26%|██▌ | 1499/5772 [2:37:41<7:19:09, 6.17s/it] {'loss': 0.4728, 'learning_rate': 1.736043904140405e-05, 'epoch': 0.26} + 26%|██▌ | 1499/5772 [2:37:34<7:19:09, 6.17s/it]5 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 9 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend... + 26%|██▌ | 1500/5772 [2:37:40<7:19:46, 6.18s/it]2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 8 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 26%|██▌ | 1500/5772 [2:37:47<7:19:45, 6.18s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4743, 'learning_rate': 1.7356638936027975e-05, 'epoch': 0.26} + 26%|██▌ | 1500/5772 [2:37:47<7:19:45, 6.18s/it] {'loss': 0.4743, 'learning_rate': 1.7356638936027975e-05, 'epoch': 0.26} + 26%|██▌ | 1500/5772 [2:37:40<7:19:46, 6.18s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 26%|██▌ | 1501/5772 [2:37:58<11:40:45, 9.84s/it] 26%|██▌ | 1501/5772 [2:38:05<11:40:46, 9.84s/it] {'loss': 0.4668, 'learning_rate': 1.7352836513719377e-05, 'epoch': 0.26} + 26%|██▌ | 1501/5772 [2:38:05<11:40:46, 9.84s/it] {'loss': 0.4668, 'learning_rate': 1.7352836513719377e-05, 'epoch': 0.26} + 26%|██▌ | 1501/5772 [2:37:58<11:40:45, 9.84s/it] 26%|██▌ | 1502/5772 [2:38:04<10:20:32, 8.72s/it] 26%|██▌ | 1502/5772 [2:38:12<10:20:32, 8.72s/it] {'loss': 0.4973, 'learning_rate': 1.7349031775675796e-05, 'epoch': 0.26} + 26%|██▌ | 1502/5772 [2:38:12<10:20:32, 8.72s/it] {'loss': 0.4973, 'learning_rate': 1.7349031775675796e-05, 'epoch': 0.26} + 26%|██▌ | 1502/5772 [2:38:04<10:20:32, 8.72s/it] 26%|██▌ | 1503/5772 [2:38:11<9:27:22, 7.97s/it] 26%|██▌ | 1503/5772 [2:38:18<9:27:21, 7.97s/it] {'loss': 0.4741, 'learning_rate': 1.734522472309552e-05, 'epoch': 0.26} + 26%|██▌ | 1503/5772 [2:38:18<9:27:21, 7.97s/it] {'loss': 0.4741, 'learning_rate': 1.734522472309552e-05, 'epoch': 0.26} + 26%|██▌ | 1503/5772 [2:38:11<9:27:22, 7.97s/it] 26%|██▌ | 1504/5772 [2:38:17<8:44:44, 7.38s/it] 26%|██▌ | 1504/5772 [2:38:24<8:44:44, 7.38s/it] {'loss': 0.4871, 'learning_rate': 1.734141535717756e-05, 'epoch': 0.26} + 26%|██▌ | 1504/5772 [2:38:24<8:44:44, 7.38s/it] {'loss': 0.4871, 'learning_rate': 1.734141535717756e-05, 'epoch': 0.26} + 26%|██▌ | 1504/5772 [2:38:17<8:44:44, 7.38s/it] 26%|██▌ | 1505/5772 [2:38:22<8:11:04, 6.91s/it] 26%|██▌ | 1505/5772 [2:38:30<8:11:04, 6.91s/it] {'loss': 0.4767, 'learning_rate': 1.7337603679121645e-05, 'epoch': 0.26} + 26%|██▌ | 1505/5772 [2:38:30<8:11:04, 6.91s/it] {'loss': 0.4767, 'learning_rate': 1.7337603679121645e-05, 'epoch': 0.26} + 26%|██▌ | 1505/5772 [2:38:23<8:11:04, 6.91s/it] 26%|██▌ | 1506/5772 [2:38:29<7:56:35, 6.70s/it] 26%|██▌ | 1506/5772 [2:38:36<7:56:35, 6.70s/it] {'loss': 0.4918, 'learning_rate': 1.7333789690128252e-05, 'epoch': 0.26} + 26%|██▌ | 1506/5772 [2:38:36<7:56:35, 6.70s/it] {'loss': 0.4918, 'learning_rate': 1.7333789690128252e-05, 'epoch': 0.26} + 26%|██▌ | 1506/5772 [2:38:29<7:56:35, 6.70s/it] 26%|██▌ | 1507/5772 [2:38:35<7:49:51, 6.61s/it] 26%|██▌ | 1507/5772 [2:38:42<7:49:52, 6.61s/it] {'loss': 0.4799, 'learning_rate': 1.7329973391398575e-05, 'epoch': 0.26} + 26%|██▌ | 1507/5772 [2:38:42<7:49:52, 6.61s/it] {'loss': 0.4799, 'learning_rate': 1.7329973391398575e-05, 'epoch': 0.26} + 26%|██▌ | 1507/5772 [2:38:35<7:49:51, 6.61s/it] 26%|██▌ | 1508/5772 [2:38:41<7:43:59, 6.53s/it] 26%|██▌ | 1508/5772 [2:38:49<7:43:59, 6.53s/it] {'loss': 0.5018, 'learning_rate': 1.732615478413453e-05, 'epoch': 0.26} + 26%|██▌ | 1508/5772 [2:38:49<7:43:59, 6.53s/it] {'loss': 0.5018, 'learning_rate': 1.732615478413453e-05, 'epoch': 0.26} + 26%|██▌ | 1508/5772 [2:38:41<7:43:59, 6.53s/it] 26%|██▌ | 1509/5772 [2:38:47<7:33:22, 6.38s/it] 26%|██▌ | 1509/5772 [2:38:55<7:33:22, 6.38s/it] {'loss': 0.465, 'learning_rate': 1.732233386953877e-05, 'epoch': 0.26} + 26%|██▌ | 1509/5772 [2:38:55<7:33:22, 6.38s/it] {'loss': 0.465, 'learning_rate': 1.732233386953877e-05, 'epoch': 0.26} + 26%|██▌ | 1509/5772 [2:38:48<7:33:22, 6.38s/it] 26%|██▌ | 1510/5772 [2:38:54<7:35:31, 6.41s/it] 26%|██▌ | 1510/5772 [2:39:01<7:35:32, 6.41s/it] {'loss': 0.4904, 'learning_rate': 1.731851064881467e-05, 'epoch': 0.26} + 26%|██▌ | 1510/5772 [2:39:01<7:35:32, 6.41s/it] {'loss': 0.4904, 'learning_rate': 1.731851064881467e-05, 'epoch': 0.26} + 26%|██▌ | 1510/5772 [2:38:54<7:35:31, 6.41s/it] 26%|██▌ | 1511/5772 [2:39:00<7:27:56, 6.31s/it] 26%|██▌ | 1511/5772 [2:39:07<7:27:56, 6.31s/it] {'loss': 0.4751, 'learning_rate': 1.7314685123166333e-05, 'epoch': 0.26} + 26%|██▌ | 1511/5772 [2:39:07<7:27:56, 6.31s/it] {'loss': 0.4751, 'learning_rate': 1.7314685123166333e-05, 'epoch': 0.26} + 26%|██▌ | 1511/5772 [2:39:00<7:27:56, 6.31s/it] 26%|██▌ | 1512/5772 [2:39:06<7:24:11, 6.26s/it] 26%|██▌ | 1512/5772 [2:39:13<7:24:11, 6.26s/it] {'loss': 0.4875, 'learning_rate': 1.7310857293798585e-05, 'epoch': 0.26} + 26%|██▌ | 1512/5772 [2:39:13<7:24:11, 6.26s/it] {'loss': 0.4875, 'learning_rate': 1.7310857293798585e-05, 'epoch': 0.26} + 26%|██▌ | 1512/5772 [2:39:06<7:24:11, 6.26s/it] 26%|██▌ | 1513/5772 [2:39:13<7:30:40, 6.35s/it] 26%|██▌ | 1513/5772 [2:39:20<7:30:40, 6.35s/it] {'loss': 0.4971, 'learning_rate': 1.730702716191698e-05, 'epoch': 0.26} + 26%|██▌ | 1513/5772 [2:39:20<7:30:40, 6.35s/it] {'loss': 0.4971, 'learning_rate': 1.730702716191698e-05, 'epoch': 0.26} + 26%|██▌ | 1513/5772 [2:39:13<7:30:40, 6.35s/it] 26%|██▌ | 1514/5772 [2:39:26<7:19:09, 6.19s/it] 26%|██▌ | 1514/5772 [2:39:19<7:19:10, 6.19s/it] {'loss': 0.4836, 'learning_rate': 1.73031947287278e-05, 'epoch': 0.26} + 26%|██▌ | 1514/5772 [2:39:26<7:19:09, 6.19s/it] {'loss': 0.4836, 'learning_rate': 1.73031947287278e-05, 'epoch': 0.26} + 26%|██▌ | 1514/5772 [2:39:19<7:19:10, 6.19s/it] 26%|██▌ | 1515/5772 [2:39:25<7:14:47, 6.13s/it] 26%|██▌ | 1515/5772 [2:39:32<7:14:48, 6.13s/it] {'loss': 0.4697, 'learning_rate': 1.7299359995438046e-05, 'epoch': 0.26} + 26%|██▌ | 1515/5772 [2:39:32<7:14:48, 6.13s/it] {'loss': 0.4697, 'learning_rate': 1.7299359995438046e-05, 'epoch': 0.26} + 26%|██▌ | 1515/5772 [2:39:25<7:14:47, 6.13s/it] 26%|██▋ | 1516/5772 [2:39:38<7:14:35, 6.13s/it] 26%|██▋ | 1516/5772 [2:39:31<7:14:35, 6.13s/it] {'loss': 0.4878, 'learning_rate': 1.7295522963255443e-05, 'epoch': 0.26} + 26%|██▋ | 1516/5772 [2:39:38<7:14:35, 6.13s/it] {'loss': 0.4878, 'learning_rate': 1.7295522963255443e-05, 'epoch': 0.26} + 26%|██▋ | 1516/5772 [2:39:31<7:14:35, 6.13s/it] 26%|██▋ | 1517/5772 [2:39:44<7:12:20, 6.10s/it] 26%|██▋ | 1517/5772 [2:39:37<7:12:21, 6.10s/it] {'loss': 0.4813, 'learning_rate': 1.729168363338845e-05, 'epoch': 0.26} + 26%|██▋ | 1517/5772 [2:39:44<7:12:20, 6.10s/it] {'loss': 0.4813, 'learning_rate': 1.729168363338845e-05, 'epoch': 0.26} + 26%|██▋ | 1517/5772 [2:39:37<7:12:21, 6.10s/it] 26%|██▋ | 1518/5772 [2:39:50<7:07:10, 6.03s/it] 26%|██▋ | 1518/5772 [2:39:43<7:07:10, 6.03s/it] {'loss': 0.4766, 'learning_rate': 1.7287842007046232e-05, 'epoch': 0.26} + 26%|██▋ | 1518/5772 [2:39:50<7:07:10, 6.03s/it] {'loss': 0.4766, 'learning_rate': 1.7287842007046232e-05, 'epoch': 0.26} + 26%|██▋ | 1518/5772 [2:39:43<7:07:10, 6.03s/it] 26%|██▋ | 1519/5772 [2:39:56<7:16:37, 6.16s/it] 26%|██▋ | 1519/5772 [2:39:49<7:16:38, 6.16s/it] {'loss': 0.4687, 'learning_rate': 1.7283998085438703e-05, 'epoch': 0.26} + 26%|██▋ | 1519/5772 [2:39:56<7:16:37, 6.16s/it] {'loss': 0.4687, 'learning_rate': 1.7283998085438703e-05, 'epoch': 0.26} + 26%|██▋ | 1519/5772 [2:39:49<7:16:38, 6.16s/it] 26%|██▋ | 1520/5772 [2:39:55<7:07:32, 6.03s/it] 26%|██▋ | 1520/5772 [2:40:02<7:07:33, 6.03s/it] {'loss': 0.4789, 'learning_rate': 1.728015186977647e-05, 'epoch': 0.26} + 26%|██▋ | 1520/5772 [2:40:02<7:07:33, 6.03s/it] {'loss': 0.4789, 'learning_rate': 1.728015186977647e-05, 'epoch': 0.26} + 26%|██▋ | 1520/5772 [2:39:55<7:07:32, 6.03s/it] 26%|██▋ | 1521/5772 [2:40:08<7:11:45, 6.09s/it] 26%|██▋ | 1521/5772 [2:40:01<7:11:45, 6.09s/it] {'loss': 0.472, 'learning_rate': 1.7276303361270886e-05, 'epoch': 0.26} + 26%|██▋ | 1521/5772 [2:40:08<7:11:45, 6.09s/it] {'loss': 0.472, 'learning_rate': 1.7276303361270886e-05, 'epoch': 0.26} + 26%|██▋ | 1521/5772 [2:40:01<7:11:45, 6.09s/it] 26%|██▋ | 1522/5772 [2:40:07<7:10:13, 6.07s/it] 26%|██▋ | 1522/5772 [2:40:14<7:10:13, 6.07s/it] {'loss': 0.4761, 'learning_rate': 1.7272452561134015e-05, 'epoch': 0.26} + 26%|██▋ | 1522/5772 [2:40:14<7:10:13, 6.07s/it] {'loss': 0.4761, 'learning_rate': 1.7272452561134015e-05, 'epoch': 0.26} + 26%|██▋ | 1522/5772 [2:40:07<7:10:13, 6.07s/it] 26%|██▋ | 1523/5772 [2:40:20<7:12:40, 6.11s/it] 26%|██▋ | 1523/5772 [2:40:13<7:12:40, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.7268599470578644e-05, 'epoch': 0.26} + 26%|██▋ | 1523/5772 [2:40:20<7:12:40, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.7268599470578644e-05, 'epoch': 0.26} + 26%|██▋ | 1523/5772 [2:40:13<7:12:40, 6.11s/it] 26%|██▋ | 1524/5772 [2:40:27<7:22:00, 6.24s/it] 26%|██▋ | 1524/5772 [2:40:20<7:22:00, 6.24s/it] {'loss': 0.4951, 'learning_rate': 1.7264744090818284e-05, 'epoch': 0.26} + 26%|██▋ | 1524/5772 [2:40:27<7:22:00, 6.24s/it] {'loss': 0.4951, 'learning_rate': 1.7264744090818284e-05, 'epoch': 0.26} + 26%|██▋ | 1524/5772 [2:40:20<7:22:00, 6.24s/it] 26%|██▋ | 1525/5772 [2:40:26<7:19:25, 6.21s/it] 26%|██▋ | 1525/5772 [2:40:33<7:19:25, 6.21s/it] {'loss': 0.474, 'learning_rate': 1.726088642306716e-05, 'epoch': 0.26} + 26%|██▋ | 1525/5772 [2:40:33<7:19:25, 6.21s/it] {'loss': 0.474, 'learning_rate': 1.726088642306716e-05, 'epoch': 0.26} + 26%|██▋ | 1525/5772 [2:40:26<7:19:25, 6.21s/it] 26%|██▋ | 1526/5772 [2:40:39<7:21:23, 6.24s/it] 26%|██▋ | 1526/5772 [2:40:32<7:21:23, 6.24s/it] {'loss': 0.4939, 'learning_rate': 1.7257026468540238e-05, 'epoch': 0.26} + 26%|██▋ | 1526/5772 [2:40:39<7:21:23, 6.24s/it] {'loss': 0.4939, 'learning_rate': 1.7257026468540238e-05, 'epoch': 0.26} + 26%|██▋ | 1526/5772 [2:40:32<7:21:23, 6.24s/it] 26%|██▋ | 1527/5772 [2:40:39<7:24:35, 6.28s/it] 26%|██▋ | 1527/5772 [2:40:46<7:24:35, 6.28s/it] {'loss': 0.4736, 'learning_rate': 1.725316422845317e-05, 'epoch': 0.26} + 26%|██▋ | 1527/5772 [2:40:46<7:24:35, 6.28s/it] {'loss': 0.4736, 'learning_rate': 1.725316422845317e-05, 'epoch': 0.26} + 26%|██▋ | 1527/5772 [2:40:39<7:24:35, 6.28s/it] 26%|██▋ | 1528/5772 [2:40:52<7:20:00, 6.22s/it] 26%|██▋ | 1528/5772 [2:40:45<7:20:00, 6.22s/it] {'loss': 0.4868, 'learning_rate': 1.724929970402236e-05, 'epoch': 0.26} + 26%|██▋ | 1528/5772 [2:40:52<7:20:00, 6.22s/it] {'loss': 0.4868, 'learning_rate': 1.724929970402236e-05, 'epoch': 0.26} + 26%|██▋ | 1528/5772 [2:40:45<7:20:00, 6.22s/it] 26%|██▋ | 1529/5772 [2:40:58<7:13:55, 6.14s/it] 26%|██▋ | 1529/5772 [2:40:51<7:13:56, 6.14s/it] {'loss': 0.4686, 'learning_rate': 1.7245432896464913e-05, 'epoch': 0.26} + 26%|██▋ | 1529/5772 [2:40:58<7:13:55, 6.14s/it] {'loss': 0.4686, 'learning_rate': 1.7245432896464913e-05, 'epoch': 0.26} + 26%|██▋ | 1529/5772 [2:40:51<7:13:56, 6.14s/it] 27%|██▋ | 1530/5772 [2:41:04<7:15:54, 6.17s/it] 27%|██▋ | 1530/5772 [2:40:57<7:15:54, 6.17s/it] {'loss': 0.4903, 'learning_rate': 1.724156380699866e-05, 'epoch': 0.27} + 27%|██▋ | 1530/5772 [2:41:04<7:15:54, 6.17s/it] {'loss': 0.4903, 'learning_rate': 1.724156380699866e-05, 'epoch': 0.27} + 27%|██▋ | 1530/5772 [2:40:57<7:15:54, 6.17s/it] 27%|██▋ | 1531/5772 [2:41:10<7:21:13, 6.24s/it] 27%|██▋ | 1531/5772 [2:41:03<7:21:13, 6.24s/it] {'loss': 0.4675, 'learning_rate': 1.723769243684215e-05, 'epoch': 0.27} + 27%|██▋ | 1531/5772 [2:41:10<7:21:13, 6.24s/it] {'loss': 0.4675, 'learning_rate': 1.723769243684215e-05, 'epoch': 0.27} + 27%|██▋ | 1531/5772 [2:41:03<7:21:13, 6.24s/it] 27%|██▋ | 1532/5772 [2:41:16<7:19:14, 6.22s/it] 27%|██▋ | 1532/5772 [2:41:09<7:19:14, 6.22s/it] {'loss': 0.4902, 'learning_rate': 1.723381878721465e-05, 'epoch': 0.27} + 27%|██▋ | 1532/5772 [2:41:16<7:19:14, 6.22s/it] {'loss': 0.4902, 'learning_rate': 1.723381878721465e-05, 'epoch': 0.27} + 27%|██▋ | 1532/5772 [2:41:09<7:19:14, 6.22s/it] 27%|██▋ | 1533/5772 [2:41:16<7:21:33, 6.25s/it] 27%|██▋ | 1533/5772 [2:41:23<7:21:34, 6.25s/it] {'loss': 0.4852, 'learning_rate': 1.7229942859336142e-05, 'epoch': 0.27} + 27%|██▋ | 1533/5772 [2:41:23<7:21:34, 6.25s/it] {'loss': 0.4852, 'learning_rate': 1.7229942859336142e-05, 'epoch': 0.27} + 27%|██▋ | 1533/5772 [2:41:16<7:21:33, 6.25s/it] 27%|██▋ | 1534/5772 [2:41:29<7:18:57, 6.21s/it] 27%|██▋ | 1534/5772 [2:41:22<7:18:57, 6.21s/it] {'loss': 0.4787, 'learning_rate': 1.7226064654427327e-05, 'epoch': 0.27} + 27%|██▋ | 1534/5772 [2:41:29<7:18:57, 6.21s/it] {'loss': 0.4787, 'learning_rate': 1.7226064654427327e-05, 'epoch': 0.27} + 27%|██▋ | 1534/5772 [2:41:22<7:18:57, 6.21s/it] 27%|██▋ | 1535/5772 [2:41:35<7:15:52, 6.17s/it] 27%|██▋ | 1535/5772 [2:41:28<7:15:52, 6.17s/it] {'loss': 0.4813, 'learning_rate': 1.7222184173709627e-05, 'epoch': 0.27} + 27%|██▋ | 1535/5772 [2:41:35<7:15:52, 6.17s/it] {'loss': 0.4813, 'learning_rate': 1.7222184173709627e-05, 'epoch': 0.27} + 27%|██▋ | 1535/5772 [2:41:28<7:15:52, 6.17s/it] 27%|██▋ | 1536/5772 [2:41:42<7:24:25, 6.30s/it] 27%|██▋ | 1536/5772 [2:41:35<7:24:25, 6.30s/it] {'loss': 0.4895, 'learning_rate': 1.721830141840518e-05, 'epoch': 0.27} + 27%|██▋ | 1536/5772 [2:41:42<7:24:25, 6.30s/it] {'loss': 0.4895, 'learning_rate': 1.721830141840518e-05, 'epoch': 0.27} + 27%|██▋ | 1536/5772 [2:41:35<7:24:25, 6.30s/it] 27%|██▋ | 1537/5772 [2:41:41<7:25:17, 6.31s/it] 27%|██▋ | 1537/5772 [2:41:48<7:25:17, 6.31s/it] {'loss': 0.4857, 'learning_rate': 1.721441638973683e-05, 'epoch': 0.27} + 27%|██▋ | 1537/5772 [2:41:48<7:25:17, 6.31s/it] {'loss': 0.4857, 'learning_rate': 1.721441638973683e-05, 'epoch': 0.27} + 27%|██▋ | 1537/5772 [2:41:41<7:25:17, 6.31s/it] 27%|██▋ | 1538/5772 [2:41:54<7:22:00, 6.26s/it] 27%|██▋ | 1538/5772 [2:41:47<7:22:00, 6.26s/it] {'loss': 0.4911, 'learning_rate': 1.7210529088928156e-05, 'epoch': 0.27} + 27%|██▋ | 1538/5772 [2:41:54<7:22:00, 6.26s/it] {'loss': 0.4911, 'learning_rate': 1.7210529088928156e-05, 'epoch': 0.27} + 27%|██▋ | 1538/5772 [2:41:47<7:22:00, 6.26s/it] 27%|██▋ | 1539/5772 [2:42:01<7:27:50, 6.35s/it] 27%|██▋ | 1539/5772 [2:41:54<7:27:49, 6.35s/it] {'loss': 0.4721, 'learning_rate': 1.7206639517203433e-05, 'epoch': 0.27} + 27%|██▋ | 1539/5772 [2:42:01<7:27:50, 6.35s/it] {'loss': 0.4721, 'learning_rate': 1.7206639517203433e-05, 'epoch': 0.27} + 27%|██▋ | 1539/5772 [2:41:54<7:27:49, 6.35s/it] 27%|██▋ | 1540/5772 [2:42:06<7:14:48, 6.16s/it] 27%|██▋ | 1540/5772 [2:41:59<7:14:48, 6.16s/it] {'loss': 0.481, 'learning_rate': 1.7202747675787662e-05, 'epoch': 0.27} + 27%|██▋ | 1540/5772 [2:42:06<7:14:48, 6.16s/it] {'loss': 0.481, 'learning_rate': 1.7202747675787662e-05, 'epoch': 0.27} + 27%|██▋ | 1540/5772 [2:41:59<7:14:48, 6.16s/it] 27%|██▋ | 1541/5772 [2:42:13<7:21:17, 6.26s/it] 27%|██▋ | 1541/5772 [2:42:06<7:21:18, 6.26s/it] {'loss': 0.4851, 'learning_rate': 1.7198853565906558e-05, 'epoch': 0.27} + 27%|██▋ | 1541/5772 [2:42:13<7:21:17, 6.26s/it] {'loss': 0.4851, 'learning_rate': 1.7198853565906558e-05, 'epoch': 0.27} + 27%|██▋ | 1541/5772 [2:42:06<7:21:18, 6.26s/it] 27%|██▋ | 1542/5772 [2:42:19<7:20:36, 6.25s/it] 27%|██▋ | 1542/5772 [2:42:12<7:20:36, 6.25s/it] {'loss': 0.4899, 'learning_rate': 1.719495718878655e-05, 'epoch': 0.27} + 27%|██▋ | 1542/5772 [2:42:19<7:20:36, 6.25s/it] {'loss': 0.4899, 'learning_rate': 1.719495718878655e-05, 'epoch': 0.27} + 27%|██▋ | 1542/5772 [2:42:12<7:20:36, 6.25s/it] 27%|██▋ | 1543/5772 [2:42:25<7:11:14, 6.12s/it] 27%|██▋ | 1543/5772 [2:42:18<7:11:14, 6.12s/it] {'loss': 0.4861, 'learning_rate': 1.7191058545654783e-05, 'epoch': 0.27} + 27%|██▋ | 1543/5772 [2:42:25<7:11:14, 6.12s/it] {'loss': 0.4861, 'learning_rate': 1.7191058545654783e-05, 'epoch': 0.27} + 27%|██▋ | 1543/5772 [2:42:18<7:11:14, 6.12s/it] 27%|██▋ | 1544/5772 [2:42:31<7:06:08, 6.05s/it] 27%|██▋ | 1544/5772 [2:42:24<7:06:08, 6.05s/it] {'loss': 0.4907, 'learning_rate': 1.7187157637739108e-05, 'epoch': 0.27} + 27%|██▋ | 1544/5772 [2:42:31<7:06:08, 6.05s/it] {'loss': 0.4907, 'learning_rate': 1.7187157637739108e-05, 'epoch': 0.27} + 27%|██▋ | 1544/5772 [2:42:24<7:06:08, 6.05s/it] 27%|██▋ | 1545/5772 [2:42:37<7:08:42, 6.09s/it] 27%|██▋ | 1545/5772 [2:42:30<7:08:42, 6.09s/it] {'loss': 0.4905, 'learning_rate': 1.7183254466268093e-05, 'epoch': 0.27} + 27%|██▋ | 1545/5772 [2:42:37<7:08:42, 6.09s/it] {'loss': 0.4905, 'learning_rate': 1.7183254466268093e-05, 'epoch': 0.27} + 27%|██▋ | 1545/5772 [2:42:30<7:08:42, 6.09s/it] 27%|██▋ | 1546/5772 [2:42:43<7:10:15, 6.11s/it] 27%|██▋ | 1546/5772 [2:42:36<7:10:15, 6.11s/it] {'loss': 0.492, 'learning_rate': 1.7179349032471026e-05, 'epoch': 0.27} + 27%|██▋ | 1546/5772 [2:42:43<7:10:15, 6.11s/it] {'loss': 0.492, 'learning_rate': 1.7179349032471026e-05, 'epoch': 0.27} + 27%|██▋ | 1546/5772 [2:42:36<7:10:15, 6.11s/it] 27%|██▋ | 1547/5772 [2:42:42<7:14:37, 6.17s/it] 27%|██▋ | 1547/5772 [2:42:49<7:14:37, 6.17s/it] {'loss': 0.4805, 'learning_rate': 1.7175441337577897e-05, 'epoch': 0.27} + 27%|██▋ | 1547/5772 [2:42:49<7:14:37, 6.17s/it] {'loss': 0.4805, 'learning_rate': 1.7175441337577897e-05, 'epoch': 0.27} + 27%|██▋ | 1547/5772 [2:42:42<7:14:37, 6.17s/it] 27%|██▋ | 1548/5772 [2:42:56<7:13:24, 6.16s/it] 27%|██▋ | 1548/5772 [2:42:49<7:13:24, 6.16s/it] {'loss': 0.4872, 'learning_rate': 1.717153138281941e-05, 'epoch': 0.27} + 27%|██▋ | 1548/5772 [2:42:56<7:13:24, 6.16s/it] {'loss': 0.4872, 'learning_rate': 1.717153138281941e-05, 'epoch': 0.27} + 27%|██▋ | 1548/5772 [2:42:49<7:13:24, 6.16s/it] 27%|██▋ | 1549/5772 [2:43:02<7:21:08, 6.27s/it] 27%|██▋ | 1549/5772 [2:42:55<7:21:08, 6.27s/it] {'loss': 0.4735, 'learning_rate': 1.7167619169426996e-05, 'epoch': 0.27} + 27%|██▋ | 1549/5772 [2:43:02<7:21:08, 6.27s/it] {'loss': 0.4735, 'learning_rate': 1.7167619169426996e-05, 'epoch': 0.27} + 27%|██▋ | 1549/5772 [2:42:55<7:21:08, 6.27s/it]14 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +813 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +010 AutoResumeHook: Checking whether to suspend... + 27%|██▋ | 1550/5772 [2:43:09<7:32:16, 6.43s/it]15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 27%|██▋ | 1550/5772 [2:43:02<7:32:16, 6.43s/it]6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4869, 'learning_rate': 1.7163704698632772e-05, 'epoch': 0.27} + 27%|██▋ | 1550/5772 [2:43:09<7:32:16, 6.43s/it] {'loss': 0.4869, 'learning_rate': 1.7163704698632772e-05, 'epoch': 0.27} + 27%|██▋ | 1550/5772 [2:43:02<7:32:16, 6.43s/it] 27%|██▋ | 1551/5772 [2:43:15<7:26:33, 6.35s/it] 27%|██▋ | 1551/5772 [2:43:08<7:26:33, 6.35s/it] {'loss': 0.4725, 'learning_rate': 1.7159787971669586e-05, 'epoch': 0.27} + 27%|██▋ | 1551/5772 [2:43:15<7:26:33, 6.35s/it] {'loss': 0.4725, 'learning_rate': 1.7159787971669586e-05, 'epoch': 0.27} + 27%|██▋ | 1551/5772 [2:43:08<7:26:33, 6.35s/it] 27%|██▋ | 1552/5772 [2:43:22<7:38:27, 6.52s/it] 27%|██▋ | 1552/5772 [2:43:15<7:38:27, 6.52s/it] {'loss': 0.4903, 'learning_rate': 1.7155868989770984e-05, 'epoch': 0.27} + 27%|██▋ | 1552/5772 [2:43:22<7:38:27, 6.52s/it] {'loss': 0.4903, 'learning_rate': 1.7155868989770984e-05, 'epoch': 0.27} + 27%|██▋ | 1552/5772 [2:43:15<7:38:27, 6.52s/it] 27%|██▋ | 1553/5772 [2:43:21<7:37:15, 6.50s/it] 27%|██▋ | 1553/5772 [2:43:28<7:37:15, 6.50s/it] {'loss': 0.4845, 'learning_rate': 1.715194775417123e-05, 'epoch': 0.27} + 27%|██▋ | 1553/5772 [2:43:28<7:37:15, 6.50s/it] {'loss': 0.4845, 'learning_rate': 1.715194775417123e-05, 'epoch': 0.27} + 27%|██▋ | 1553/5772 [2:43:21<7:37:15, 6.50s/it] 27%|██▋ | 1554/5772 [2:43:34<7:25:21, 6.34s/it] 27%|██▋ | 1554/5772 [2:43:27<7:25:21, 6.34s/it] {'loss': 0.4637, 'learning_rate': 1.71480242661053e-05, 'epoch': 0.27} + 27%|██▋ | 1554/5772 [2:43:34<7:25:21, 6.34s/it] {'loss': 0.4637, 'learning_rate': 1.71480242661053e-05, 'epoch': 0.27} + 27%|██▋ | 1554/5772 [2:43:27<7:25:21, 6.34s/it] 27%|██▋ | 1555/5772 [2:43:40<7:19:38, 6.26s/it] 27%|██▋ | 1555/5772 [2:43:33<7:19:38, 6.26s/it] {'loss': 0.4855, 'learning_rate': 1.7144098526808867e-05, 'epoch': 0.27} + 27%|██▋ | 1555/5772 [2:43:40<7:19:38, 6.26s/it] {'loss': 0.4855, 'learning_rate': 1.7144098526808867e-05, 'epoch': 0.27} + 27%|██▋ | 1555/5772 [2:43:33<7:19:38, 6.26s/it] 27%|██▋ | 1556/5772 [2:43:47<7:17:38, 6.23s/it] 27%|██▋ | 1556/5772 [2:43:40<7:17:38, 6.23s/it] {'loss': 0.474, 'learning_rate': 1.7140170537518327e-05, 'epoch': 0.27} + 27%|██▋ | 1556/5772 [2:43:47<7:17:38, 6.23s/it] {'loss': 0.474, 'learning_rate': 1.7140170537518327e-05, 'epoch': 0.27} + 27%|██▋ | 1556/5772 [2:43:40<7:17:38, 6.23s/it] 27%|██▋ | 1557/5772 [2:43:53<7:15:27, 6.20s/it] 27%|██▋ | 1557/5772 [2:43:46<7:15:26, 6.20s/it] {'loss': 0.4753, 'learning_rate': 1.7136240299470772e-05, 'epoch': 0.27} + 27%|██▋ | 1557/5772 [2:43:53<7:15:27, 6.20s/it] {'loss': 0.4753, 'learning_rate': 1.7136240299470772e-05, 'epoch': 0.27} + 27%|██▋ | 1557/5772 [2:43:46<7:15:26, 6.20s/it] 27%|██▋ | 1558/5772 [2:43:59<7:07:56, 6.09s/it] 27%|██▋ | 1558/5772 [2:43:52<7:07:56, 6.09s/it] {'loss': 0.4766, 'learning_rate': 1.7132307813904016e-05, 'epoch': 0.27} + 27%|██▋ | 1558/5772 [2:43:59<7:07:56, 6.09s/it] {'loss': 0.4766, 'learning_rate': 1.7132307813904016e-05, 'epoch': 0.27} + 27%|██▋ | 1558/5772 [2:43:52<7:07:56, 6.09s/it] 27%|██▋ | 1559/5772 [2:44:05<7:14:03, 6.18s/it] 27%|██▋ | 1559/5772 [2:43:58<7:14:03, 6.18s/it] {'loss': 0.4801, 'learning_rate': 1.7128373082056567e-05, 'epoch': 0.27} + 27%|██▋ | 1559/5772 [2:44:05<7:14:03, 6.18s/it] {'loss': 0.4801, 'learning_rate': 1.7128373082056567e-05, 'epoch': 0.27} + 27%|██▋ | 1559/5772 [2:43:58<7:14:03, 6.18s/it] 27%|██▋ | 1560/5772 [2:44:11<7:10:24, 6.13s/it] 27%|██▋ | 1560/5772 [2:44:04<7:10:24, 6.13s/it] {'loss': 0.472, 'learning_rate': 1.712443610516765e-05, 'epoch': 0.27} + 27%|██▋ | 1560/5772 [2:44:11<7:10:24, 6.13s/it] {'loss': 0.472, 'learning_rate': 1.712443610516765e-05, 'epoch': 0.27} + 27%|██▋ | 1560/5772 [2:44:04<7:10:24, 6.13s/it] 27%|██▋ | 1561/5772 [2:44:18<7:31:17, 6.43s/it] 27%|██▋ | 1561/5772 [2:44:11<7:31:17, 6.43s/it] {'loss': 0.4802, 'learning_rate': 1.7120496884477196e-05, 'epoch': 0.27} + 27%|██▋ | 1561/5772 [2:44:18<7:31:17, 6.43s/it] {'loss': 0.4802, 'learning_rate': 1.7120496884477196e-05, 'epoch': 0.27} + 27%|██▋ | 1561/5772 [2:44:11<7:31:17, 6.43s/it] 27%|██▋ | 1562/5772 [2:44:24<7:20:32, 6.28s/it] 27%|██▋ | 1562/5772 [2:44:17<7:20:32, 6.28s/it] {'loss': 0.4882, 'learning_rate': 1.7116555421225837e-05, 'epoch': 0.27} + 27%|██▋ | 1562/5772 [2:44:24<7:20:32, 6.28s/it] {'loss': 0.4882, 'learning_rate': 1.7116555421225837e-05, 'epoch': 0.27} + 27%|██▋ | 1562/5772 [2:44:17<7:20:32, 6.28s/it] 27%|██▋ | 1563/5772 [2:44:30<7:22:28, 6.31s/it] 27%|██▋ | 1563/5772 [2:44:23<7:22:27, 6.31s/it] {'loss': 0.4937, 'learning_rate': 1.7112611716654918e-05, 'epoch': 0.27} + 27%|██▋ | 1563/5772 [2:44:23<7:22:27, 6.31s/it]{'loss': 0.4937, 'learning_rate': 1.7112611716654918e-05, 'epoch': 0.27} + 27%|██▋ | 1563/5772 [2:44:30<7:22:28, 6.31s/it] 27%|██▋ | 1564/5772 [2:44:37<7:22:21, 6.31s/it] 27%|██▋ | 1564/5772 [2:44:30<7:22:21, 6.31s/it] {'loss': 0.486, 'learning_rate': 1.710866577200648e-05, 'epoch': 0.27} + 27%|██▋ | 1564/5772 [2:44:37<7:22:21, 6.31s/it] {'loss': 0.486, 'learning_rate': 1.710866577200648e-05, 'epoch': 0.27} + 27%|██▋ | 1564/5772 [2:44:30<7:22:21, 6.31s/it] 27%|██▋ | 1565/5772 [2:44:36<7:18:53, 6.26s/it] 27%|██▋ | 1565/5772 [2:44:43<7:18:54, 6.26s/it] {'loss': 0.4797, 'learning_rate': 1.7104717588523285e-05, 'epoch': 0.27} + 27%|██▋ | 1565/5772 [2:44:43<7:18:54, 6.26s/it] {'loss': 0.4797, 'learning_rate': 1.7104717588523285e-05, 'epoch': 0.27} + 27%|██▋ | 1565/5772 [2:44:36<7:18:53, 6.26s/it] 27%|██▋ | 1566/5772 [2:44:42<7:13:29, 6.18s/it] 27%|██▋ | 1566/5772 [2:44:49<7:13:29, 6.18s/it] {'loss': 0.4787, 'learning_rate': 1.710076716744879e-05, 'epoch': 0.27} + 27%|██▋ | 1566/5772 [2:44:49<7:13:29, 6.18s/it] {'loss': 0.4787, 'learning_rate': 1.710076716744879e-05, 'epoch': 0.27} + 27%|██▋ | 1566/5772 [2:44:42<7:13:29, 6.18s/it] 27%|██▋ | 1567/5772 [2:44:55<7:08:58, 6.12s/it] 27%|██▋ | 1567/5772 [2:44:48<7:08:59, 6.12s/it] {'loss': 0.4634, 'learning_rate': 1.709681451002715e-05, 'epoch': 0.27} + 27%|██▋ | 1567/5772 [2:44:55<7:08:58, 6.12s/it] {'loss': 0.4634, 'learning_rate': 1.709681451002715e-05, 'epoch': 0.27} + 27%|██▋ | 1567/5772 [2:44:48<7:08:59, 6.12s/it] 27%|██▋ | 1568/5772 [2:44:54<7:12:14, 6.17s/it] 27%|██▋ | 1568/5772 [2:45:01<7:12:15, 6.17s/it] {'loss': 0.495, 'learning_rate': 1.7092859617503242e-05, 'epoch': 0.27} + 27%|██▋ | 1568/5772 [2:45:01<7:12:15, 6.17s/it] {'loss': 0.495, 'learning_rate': 1.7092859617503242e-05, 'epoch': 0.27} + 27%|██▋ | 1568/5772 [2:44:54<7:12:14, 6.17s/it] 27%|██▋ | 1569/5772 [2:45:00<7:15:22, 6.22s/it] 27%|██▋ | 1569/5772 [2:45:07<7:15:23, 6.22s/it] {'loss': 0.4658, 'learning_rate': 1.7088902491122636e-05, 'epoch': 0.27} + 27%|██▋ | 1569/5772 [2:45:07<7:15:23, 6.22s/it] {'loss': 0.4658, 'learning_rate': 1.7088902491122636e-05, 'epoch': 0.27} + 27%|██▋ | 1569/5772 [2:45:00<7:15:22, 6.22s/it] 27%|██▋ | 1570/5772 [2:45:14<7:13:44, 6.19s/it] 27%|██▋ | 1570/5772 [2:45:07<7:13:44, 6.19s/it] {'loss': 0.4963, 'learning_rate': 1.7084943132131604e-05, 'epoch': 0.27} + 27%|██▋ | 1570/5772 [2:45:14<7:13:44, 6.19s/it] {'loss': 0.4963, 'learning_rate': 1.7084943132131604e-05, 'epoch': 0.27} + 27%|██▋ | 1570/5772 [2:45:07<7:13:44, 6.19s/it] 27%|██▋ | 1571/5772 [2:45:20<7:20:23, 6.29s/it] 27%|██▋ | 1571/5772 [2:45:13<7:20:23, 6.29s/it] {'loss': 0.4947, 'learning_rate': 1.7080981541777123e-05, 'epoch': 0.27} + 27%|██▋ | 1571/5772 [2:45:20<7:20:23, 6.29s/it] {'loss': 0.4947, 'learning_rate': 1.7080981541777123e-05, 'epoch': 0.27} + 27%|██▋ | 1571/5772 [2:45:13<7:20:23, 6.29s/it] 27%|██▋ | 1572/5772 [2:45:27<7:30:24, 6.43s/it] 27%|██▋ | 1572/5772 [2:45:20<7:30:24, 6.43s/it] {'loss': 0.488, 'learning_rate': 1.7077017721306877e-05, 'epoch': 0.27} + 27%|██▋ | 1572/5772 [2:45:27<7:30:24, 6.43s/it] {'loss': 0.488, 'learning_rate': 1.7077017721306877e-05, 'epoch': 0.27} + 27%|██▋ | 1572/5772 [2:45:20<7:30:24, 6.43s/it] 27%|██▋ | 1573/5772 [2:45:26<7:26:26, 6.38s/it] 27%|██▋ | 1573/5772 [2:45:33<7:26:26, 6.38s/it] {'loss': 0.487, 'learning_rate': 1.707305167196925e-05, 'epoch': 0.27} + 27%|██▋ | 1573/5772 [2:45:33<7:26:26, 6.38s/it] {'loss': 0.487, 'learning_rate': 1.707305167196925e-05, 'epoch': 0.27} + 27%|██▋ | 1573/5772 [2:45:26<7:26:26, 6.38s/it] 27%|██▋ | 1574/5772 [2:45:39<7:26:04, 6.38s/it] 27%|██▋ | 1574/5772 [2:45:32<7:26:04, 6.38s/it] {'loss': 0.4934, 'learning_rate': 1.7069083395013323e-05, 'epoch': 0.27} + 27%|██▋ | 1574/5772 [2:45:32<7:26:04, 6.38s/it] {'loss': 0.4934, 'learning_rate': 1.7069083395013323e-05, 'epoch': 0.27} + 27%|██▋ | 1574/5772 [2:45:39<7:26:04, 6.38s/it] 27%|██▋ | 1575/5772 [2:45:45<7:16:24, 6.24s/it] 27%|██▋ | 1575/5772 [2:45:38<7:16:24, 6.24s/it] {'loss': 0.4877, 'learning_rate': 1.7065112891688883e-05, 'epoch': 0.27} + 27%|██▋ | 1575/5772 [2:45:45<7:16:24, 6.24s/it] {'loss': 0.4877, 'learning_rate': 1.7065112891688883e-05, 'epoch': 0.27} + 27%|██▋ | 1575/5772 [2:45:38<7:16:24, 6.24s/it] 27%|██▋ | 1576/5772 [2:45:45<7:22:47, 6.33s/it] 27%|██▋ | 1576/5772 [2:45:52<7:22:47, 6.33s/it] {'loss': 0.4946, 'learning_rate': 1.706114016324642e-05, 'epoch': 0.27} + 27%|██▋ | 1576/5772 [2:45:52<7:22:47, 6.33s/it] {'loss': 0.4946, 'learning_rate': 1.706114016324642e-05, 'epoch': 0.27} + 27%|██▋ | 1576/5772 [2:45:45<7:22:47, 6.33s/it] 27%|██▋ | 1577/5772 [2:45:58<7:25:42, 6.37s/it] 27%|██▋ | 1577/5772 [2:45:51<7:25:42, 6.37s/it] {'loss': 0.4809, 'learning_rate': 1.7057165210937124e-05, 'epoch': 0.27} + 27%|██▋ | 1577/5772 [2:45:58<7:25:42, 6.37s/it] {'loss': 0.4809, 'learning_rate': 1.7057165210937124e-05, 'epoch': 0.27} + 27%|██▋ | 1577/5772 [2:45:51<7:25:42, 6.37s/it] 27%|██▋ | 1578/5772 [2:46:05<7:21:08, 6.31s/it] 27%|██▋ | 1578/5772 [2:45:58<7:21:08, 6.31s/it] {'loss': 0.4978, 'learning_rate': 1.7053188036012885e-05, 'epoch': 0.27} + 27%|██▋ | 1578/5772 [2:46:05<7:21:08, 6.31s/it] {'loss': 0.4978, 'learning_rate': 1.7053188036012885e-05, 'epoch': 0.27} + 27%|██▋ | 1578/5772 [2:45:58<7:21:08, 6.31s/it] 27%|██▋ | 1579/5772 [2:46:04<7:16:31, 6.25s/it] 27%|██▋ | 1579/5772 [2:46:11<7:16:32, 6.25s/it] {'loss': 0.4744, 'learning_rate': 1.704920863972629e-05, 'epoch': 0.27} + 27%|██▋ | 1579/5772 [2:46:11<7:16:32, 6.25s/it] {'loss': 0.4744, 'learning_rate': 1.704920863972629e-05, 'epoch': 0.27} + 27%|██▋ | 1579/5772 [2:46:04<7:16:31, 6.25s/it] 27%|██▋ | 1580/5772 [2:46:17<7:09:47, 6.15s/it] 27%|██▋ | 1580/5772 [2:46:10<7:09:47, 6.15s/it] {'loss': 0.4862, 'learning_rate': 1.704522702333063e-05, 'epoch': 0.27} + 27%|██▋ | 1580/5772 [2:46:17<7:09:47, 6.15s/it] {'loss': 0.4862, 'learning_rate': 1.704522702333063e-05, 'epoch': 0.27} + 27%|██▋ | 1580/5772 [2:46:10<7:09:47, 6.15s/it] 27%|██▋ | 1581/5772 [2:46:16<7:13:01, 6.20s/it] 27%|██▋ | 1581/5772 [2:46:23<7:13:01, 6.20s/it] {'loss': 0.4763, 'learning_rate': 1.7041243188079884e-05, 'epoch': 0.27} + 27%|██▋ | 1581/5772 [2:46:23<7:13:01, 6.20s/it] {'loss': 0.4763, 'learning_rate': 1.7041243188079884e-05, 'epoch': 0.27} + 27%|██▋ | 1581/5772 [2:46:16<7:13:01, 6.20s/it] 27%|██▋ | 1582/5772 [2:46:29<7:05:01, 6.09s/it] 27%|██▋ | 1582/5772 [2:46:22<7:05:01, 6.09s/it] {'loss': 0.4792, 'learning_rate': 1.7037257135228745e-05, 'epoch': 0.27} + 27%|██▋ | 1582/5772 [2:46:29<7:05:01, 6.09s/it] {'loss': 0.4792, 'learning_rate': 1.7037257135228745e-05, 'epoch': 0.27} + 27%|██▋ | 1582/5772 [2:46:22<7:05:01, 6.09s/it] 27%|██▋ | 1583/5772 [2:46:35<7:06:16, 6.11s/it] 27%|██▋ | 1583/5772 [2:46:28<7:06:16, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.7033268866032605e-05, 'epoch': 0.27} + 27%|██▋ | 1583/5772 [2:46:35<7:06:16, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.7033268866032605e-05, 'epoch': 0.27} + 27%|██▋ | 1583/5772 [2:46:28<7:06:16, 6.11s/it] 27%|██▋ | 1584/5772 [2:46:34<7:15:43, 6.24s/it] 27%|██▋ | 1584/5772 [2:46:41<7:15:44, 6.24s/it] {'loss': 0.4778, 'learning_rate': 1.7029278381747537e-05, 'epoch': 0.27} + 27%|██▋ | 1584/5772 [2:46:41<7:15:44, 6.24s/it] {'loss': 0.4778, 'learning_rate': 1.7029278381747537e-05, 'epoch': 0.27} + 27%|██▋ | 1584/5772 [2:46:34<7:15:43, 6.24s/it] 27%|██▋ | 1585/5772 [2:46:48<7:14:16, 6.22s/it] 27%|██▋ | 1585/5772 [2:46:41<7:14:16, 6.22s/it] {'loss': 0.4713, 'learning_rate': 1.7025285683630324e-05, 'epoch': 0.27} + 27%|██▋ | 1585/5772 [2:46:48<7:14:16, 6.22s/it] {'loss': 0.4713, 'learning_rate': 1.7025285683630324e-05, 'epoch': 0.27} + 27%|██▋ | 1585/5772 [2:46:41<7:14:16, 6.22s/it] 27%|██▋ | 1586/5772 [2:46:54<7:17:08, 6.27s/it] 27%|██▋ | 1586/5772 [2:46:47<7:17:09, 6.27s/it] {'loss': 0.4769, 'learning_rate': 1.7021290772938447e-05, 'epoch': 0.27} + 27%|██▋ | 1586/5772 [2:46:54<7:17:08, 6.27s/it] {'loss': 0.4769, 'learning_rate': 1.7021290772938447e-05, 'epoch': 0.27} + 27%|██▋ | 1586/5772 [2:46:47<7:17:09, 6.27s/it] 27%|██▋ | 1587/5772 [2:46:53<7:10:44, 6.18s/it] 27%|██▋ | 1587/5772 [2:47:00<7:10:45, 6.18s/it] {'loss': 0.4747, 'learning_rate': 1.7017293650930083e-05, 'epoch': 0.27} + 27%|██▋ | 1587/5772 [2:47:00<7:10:45, 6.18s/it] {'loss': 0.4747, 'learning_rate': 1.7017293650930083e-05, 'epoch': 0.27} + 27%|██▋ | 1587/5772 [2:46:53<7:10:44, 6.18s/it] 28%|██▊ | 1588/5772 [2:47:06<7:15:52, 6.25s/it] 28%|██▊ | 1588/5772 [2:46:59<7:15:53, 6.25s/it] {'loss': 0.4903, 'learning_rate': 1.7013294318864095e-05, 'epoch': 0.28} + 28%|██▊ | 1588/5772 [2:47:06<7:15:52, 6.25s/it] {'loss': 0.4903, 'learning_rate': 1.7013294318864095e-05, 'epoch': 0.28} + 28%|██▊ | 1588/5772 [2:46:59<7:15:53, 6.25s/it] 28%|██▊ | 1589/5772 [2:47:12<7:10:54, 6.18s/it] 28%|██▊ | 1589/5772 [2:47:05<7:10:54, 6.18s/it] {'loss': 0.4922, 'learning_rate': 1.7009292778000058e-05, 'epoch': 0.28} + 28%|██▊ | 1589/5772 [2:47:12<7:10:54, 6.18s/it] {'loss': 0.4922, 'learning_rate': 1.7009292778000058e-05, 'epoch': 0.28} + 28%|██▊ | 1589/5772 [2:47:05<7:10:54, 6.18s/it] 28%|██▊ | 1590/5772 [2:47:18<7:08:30, 6.15s/it] 28%|██▊ | 1590/5772 [2:47:11<7:08:30, 6.15s/it] {'loss': 0.4684, 'learning_rate': 1.7005289029598233e-05, 'epoch': 0.28} + 28%|██▊ | 1590/5772 [2:47:18<7:08:30, 6.15s/it] {'loss': 0.4684, 'learning_rate': 1.7005289029598233e-05, 'epoch': 0.28} + 28%|██▊ | 1590/5772 [2:47:11<7:08:30, 6.15s/it] 28%|██▊ | 1591/5772 [2:47:25<7:06:35, 6.12s/it] 28%|██▊ | 1591/5772 [2:47:18<7:06:35, 6.12s/it] {'loss': 0.4811, 'learning_rate': 1.7001283074919576e-05, 'epoch': 0.28} + 28%|██▊ | 1591/5772 [2:47:25<7:06:35, 6.12s/it] {'loss': 0.4811, 'learning_rate': 1.7001283074919576e-05, 'epoch': 0.28} + 28%|██▊ | 1591/5772 [2:47:18<7:06:35, 6.12s/it] 28%|██▊ | 1592/5772 [2:47:24<7:05:54, 6.11s/it] 28%|██▊ | 1592/5772 [2:47:31<7:05:54, 6.11s/it] {'loss': 0.4888, 'learning_rate': 1.699727491522574e-05, 'epoch': 0.28} + 28%|██▊ | 1592/5772 [2:47:31<7:05:54, 6.11s/it] {'loss': 0.4888, 'learning_rate': 1.699727491522574e-05, 'epoch': 0.28} + 28%|██▊ | 1592/5772 [2:47:24<7:05:54, 6.11s/it] 28%|██▊ | 1593/5772 [2:47:37<7:03:36, 6.08s/it] 28%|██▊ | 1593/5772 [2:47:30<7:03:36, 6.08s/it] {'loss': 0.4798, 'learning_rate': 1.699326455177908e-05, 'epoch': 0.28} + 28%|██▊ | 1593/5772 [2:47:37<7:03:36, 6.08s/it] {'loss': 0.4798, 'learning_rate': 1.699326455177908e-05, 'epoch': 0.28} + 28%|██▊ | 1593/5772 [2:47:30<7:03:36, 6.08s/it] 28%|██▊ | 1594/5772 [2:47:43<7:00:48, 6.04s/it] 28%|██▊ | 1594/5772 [2:47:36<7:00:48, 6.04s/it] {'loss': 0.485, 'learning_rate': 1.698925198584263e-05, 'epoch': 0.28} + 28%|██▊ | 1594/5772 [2:47:43<7:00:48, 6.04s/it] {'loss': 0.485, 'learning_rate': 1.698925198584263e-05, 'epoch': 0.28} + 28%|██▊ | 1594/5772 [2:47:36<7:00:48, 6.04s/it] 28%|██▊ | 1595/5772 [2:47:49<7:02:54, 6.07s/it] 28%|██▊ | 1595/5772 [2:47:42<7:02:54, 6.07s/it] {'loss': 0.4776, 'learning_rate': 1.6985237218680125e-05, 'epoch': 0.28} + 28%|██▊ | 1595/5772 [2:47:49<7:02:54, 6.07s/it] {'loss': 0.4776, 'learning_rate': 1.6985237218680125e-05, 'epoch': 0.28} + 28%|██▊ | 1595/5772 [2:47:42<7:02:54, 6.07s/it] 28%|██▊ | 1596/5772 [2:47:55<7:00:41, 6.04s/it] 28%|██▊ | 1596/5772 [2:47:48<7:00:42, 6.04s/it] {'loss': 0.4769, 'learning_rate': 1.6981220251555996e-05, 'epoch': 0.28} + 28%|██▊ | 1596/5772 [2:47:55<7:00:41, 6.04s/it] {'loss': 0.4769, 'learning_rate': 1.6981220251555996e-05, 'epoch': 0.28} + 28%|██▊ | 1596/5772 [2:47:48<7:00:42, 6.04s/it] 28%|██▊ | 1597/5772 [2:47:53<6:54:16, 5.95s/it] 28%|██▊ | 1597/5772 [2:48:00<6:54:16, 5.95s/it] {'loss': 0.477, 'learning_rate': 1.6977201085735367e-05, 'epoch': 0.28} + 28%|██▊ | 1597/5772 [2:48:00<6:54:16, 5.95s/it] {'loss': 0.477, 'learning_rate': 1.6977201085735367e-05, 'epoch': 0.28} + 28%|██▊ | 1597/5772 [2:47:53<6:54:16, 5.95s/it] 28%|██▊ | 1598/5772 [2:48:06<6:52:05, 5.92s/it] 28%|██▊ | 1598/5772 [2:47:59<6:52:05, 5.92s/it] {'loss': 0.4779, 'learning_rate': 1.6973179722484048e-05, 'epoch': 0.28} + 28%|██▊ | 1598/5772 [2:48:06<6:52:05, 5.92s/it] {'loss': 0.4779, 'learning_rate': 1.6973179722484048e-05, 'epoch': 0.28} + 28%|██▊ | 1598/5772 [2:47:59<6:52:05, 5.92s/it] 28%|██▊ | 1599/5772 [2:48:05<6:51:06, 5.91s/it] 28%|██▊ | 1599/5772 [2:48:12<6:51:07, 5.91s/it] {'loss': 0.4849, 'learning_rate': 1.6969156163068547e-05, 'epoch': 0.28} + 28%|██▊ | 1599/5772 [2:48:12<6:51:07, 5.91s/it] {'loss': 0.4849, 'learning_rate': 1.6969156163068547e-05, 'epoch': 0.28} + 28%|██▊ | 1599/5772 [2:48:05<6:51:06, 5.91s/it]9 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 28%|██▊ | 1600/5772 [2:48:11<6:54:11, 5.96s/it]10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 28%|██▊ | 1600/5772 [2:48:18<6:54:11, 5.96s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4938, 'learning_rate': 1.696513040875606e-05, 'epoch': 0.28} + 28%|██▊ | 1600/5772 [2:48:18<6:54:11, 5.96s/it] {'loss': 0.4938, 'learning_rate': 1.696513040875606e-05, 'epoch': 0.28} + 28%|██▊ | 1600/5772 [2:48:11<6:54:11, 5.96s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 28%|██▊ | 1601/5772 [2:48:30<11:19:40, 9.78s/it] 28%|██▊ | 1601/5772 [2:48:37<11:19:40, 9.78s/it] {'loss': 0.486, 'learning_rate': 1.696110246081448e-05, 'epoch': 0.28} + 28%|██▊ | 1601/5772 [2:48:37<11:19:40, 9.78s/it] {'loss': 0.486, 'learning_rate': 1.696110246081448e-05, 'epoch': 0.28} + 28%|██▊ | 1601/5772 [2:48:30<11:19:40, 9.78s/it] 28%|██▊ | 1602/5772 [2:48:36<9:55:26, 8.57s/it] 28%|██▊ | 1602/5772 [2:48:43<9:55:26, 8.57s/it] {'loss': 0.479, 'learning_rate': 1.695707232051238e-05, 'epoch': 0.28} + 28%|██▊ | 1602/5772 [2:48:43<9:55:26, 8.57s/it] {'loss': 0.479, 'learning_rate': 1.695707232051238e-05, 'epoch': 0.28} + 28%|██▊ | 1602/5772 [2:48:36<9:55:26, 8.57s/it] 28%|██▊ | 1603/5772 [2:48:42<9:07:41, 7.88s/it] 28%|██▊ | 1603/5772 [2:48:49<9:07:41, 7.88s/it] {'loss': 0.488, 'learning_rate': 1.6953039989119036e-05, 'epoch': 0.28} + 28%|██▊ | 1603/5772 [2:48:49<9:07:41, 7.88s/it] {'loss': 0.488, 'learning_rate': 1.6953039989119036e-05, 'epoch': 0.28} + 28%|██▊ | 1603/5772 [2:48:42<9:07:41, 7.88s/it] 28%|██▊ | 1604/5772 [2:48:55<8:28:23, 7.32s/it] 28%|██▊ | 1604/5772 [2:48:48<8:28:24, 7.32s/it] {'loss': 0.4918, 'learning_rate': 1.6949005467904405e-05, 'epoch': 0.28} + 28%|██▊ | 1604/5772 [2:48:55<8:28:23, 7.32s/it] {'loss': 0.4918, 'learning_rate': 1.6949005467904405e-05, 'epoch': 0.28} + 28%|██▊ | 1604/5772 [2:48:48<8:28:24, 7.32s/it] 28%|██▊ | 1605/5772 [2:48:54<8:05:08, 6.99s/it] 28%|██▊ | 1605/5772 [2:49:01<8:05:08, 6.99s/it] {'loss': 0.4734, 'learning_rate': 1.6944968758139144e-05, 'epoch': 0.28} + 28%|██▊ | 1605/5772 [2:49:01<8:05:08, 6.99s/it] {'loss': 0.4734, 'learning_rate': 1.6944968758139144e-05, 'epoch': 0.28} + 28%|██▊ | 1605/5772 [2:48:54<8:05:08, 6.99s/it] 28%|██▊ | 1606/5772 [2:49:01<7:53:42, 6.82s/it] 28%|██▊ | 1606/5772 [2:49:08<7:53:42, 6.82s/it] {'loss': 0.4876, 'learning_rate': 1.694092986109458e-05, 'epoch': 0.28} + 28%|██▊ | 1606/5772 [2:49:08<7:53:42, 6.82s/it] {'loss': 0.4876, 'learning_rate': 1.694092986109458e-05, 'epoch': 0.28} + 28%|██▊ | 1606/5772 [2:49:01<7:53:42, 6.82s/it] 28%|██▊ | 1607/5772 [2:49:14<7:38:13, 6.60s/it] 28%|██▊ | 1607/5772 [2:49:07<7:38:13, 6.60s/it] {'loss': 0.4866, 'learning_rate': 1.693688877804275e-05, 'epoch': 0.28} + 28%|██▊ | 1607/5772 [2:49:14<7:38:13, 6.60s/it] {'loss': 0.4866, 'learning_rate': 1.693688877804275e-05, 'epoch': 0.28} + 28%|██▊ | 1607/5772 [2:49:07<7:38:13, 6.60s/it] 28%|██▊ | 1608/5772 [2:49:13<7:33:19, 6.53s/it] 28%|██▊ | 1608/5772 [2:49:20<7:33:19, 6.53s/it] {'loss': 0.4785, 'learning_rate': 1.693284551025637e-05, 'epoch': 0.28} + 28%|██▊ | 1608/5772 [2:49:20<7:33:19, 6.53s/it] {'loss': 0.4785, 'learning_rate': 1.693284551025637e-05, 'epoch': 0.28} + 28%|██▊ | 1608/5772 [2:49:13<7:33:19, 6.53s/it] 28%|██▊ | 1609/5772 [2:49:19<7:26:48, 6.44s/it] 28%|██▊ | 1609/5772 [2:49:26<7:26:49, 6.44s/it] {'loss': 0.4798, 'learning_rate': 1.6928800059008845e-05, 'epoch': 0.28} + 28%|██▊ | 1609/5772 [2:49:26<7:26:49, 6.44s/it] {'loss': 0.4798, 'learning_rate': 1.6928800059008845e-05, 'epoch': 0.28} + 28%|██▊ | 1609/5772 [2:49:19<7:26:48, 6.44s/it] 28%|██▊ | 1610/5772 [2:49:25<7:18:53, 6.33s/it] 28%|██▊ | 1610/5772 [2:49:32<7:18:54, 6.33s/it] {'loss': 0.4768, 'learning_rate': 1.6924752425574262e-05, 'epoch': 0.28} + 28%|██▊ | 1610/5772 [2:49:32<7:18:54, 6.33s/it] {'loss': 0.4768, 'learning_rate': 1.6924752425574262e-05, 'epoch': 0.28} + 28%|██▊ | 1610/5772 [2:49:25<7:18:53, 6.33s/it] 28%|██▊ | 1611/5772 [2:49:31<7:13:07, 6.25s/it] 28%|██▊ | 1611/5772 [2:49:38<7:13:07, 6.25s/it] {'loss': 0.4744, 'learning_rate': 1.6920702611227405e-05, 'epoch': 0.28} + 28%|██▊ | 1611/5772 [2:49:38<7:13:07, 6.25s/it] {'loss': 0.4744, 'learning_rate': 1.6920702611227405e-05, 'epoch': 0.28} + 28%|██▊ | 1611/5772 [2:49:31<7:13:07, 6.25s/it] 28%|██▊ | 1612/5772 [2:49:38<7:11:27, 6.22s/it] 28%|██▊ | 1612/5772 [2:49:45<7:11:27, 6.22s/it] {'loss': 0.4773, 'learning_rate': 1.691665061724374e-05, 'epoch': 0.28} + 28%|██▊ | 1612/5772 [2:49:45<7:11:27, 6.22s/it] {'loss': 0.4773, 'learning_rate': 1.691665061724374e-05, 'epoch': 0.28} + 28%|██▊ | 1612/5772 [2:49:38<7:11:27, 6.22s/it] 28%|██▊ | 1613/5772 [2:49:44<7:13:32, 6.25s/it] 28%|██▊ | 1613/5772 [2:49:51<7:13:32, 6.25s/it] {'loss': 0.4709, 'learning_rate': 1.691259644489942e-05, 'epoch': 0.28} + 28%|██▊ | 1613/5772 [2:49:51<7:13:32, 6.25s/it] {'loss': 0.4709, 'learning_rate': 1.691259644489942e-05, 'epoch': 0.28} + 28%|██▊ | 1613/5772 [2:49:44<7:13:32, 6.25s/it] 28%|██▊ | 1614/5772 [2:49:50<7:13:42, 6.26s/it] 28%|██▊ | 1614/5772 [2:49:57<7:13:42, 6.26s/it] {'loss': 0.4761, 'learning_rate': 1.6908540095471288e-05, 'epoch': 0.28} + 28%|██▊ | 1614/5772 [2:49:57<7:13:42, 6.26s/it] {'loss': 0.4761, 'learning_rate': 1.6908540095471288e-05, 'epoch': 0.28} + 28%|██▊ | 1614/5772 [2:49:50<7:13:42, 6.26s/it] 28%|██▊ | 1615/5772 [2:49:56<7:11:15, 6.22s/it] 28%|██▊ | 1615/5772 [2:50:03<7:11:15, 6.22s/it] {'loss': 0.493, 'learning_rate': 1.690448157023686e-05, 'epoch': 0.28} + 28%|██▊ | 1615/5772 [2:50:03<7:11:15, 6.22s/it] {'loss': 0.493, 'learning_rate': 1.690448157023686e-05, 'epoch': 0.28} + 28%|██▊ | 1615/5772 [2:49:56<7:11:15, 6.22s/it] 28%|██▊ | 1616/5772 [2:50:02<7:05:12, 6.14s/it] 28%|██▊ | 1616/5772 [2:50:09<7:05:13, 6.14s/it] {'loss': 0.4955, 'learning_rate': 1.6900420870474347e-05, 'epoch': 0.28} + 28%|██▊ | 1616/5772 [2:50:09<7:05:13, 6.14s/it] {'loss': 0.4955, 'learning_rate': 1.6900420870474347e-05, 'epoch': 0.28} + 28%|██▊ | 1616/5772 [2:50:02<7:05:12, 6.14s/it] 28%|██▊ | 1617/5772 [2:50:08<7:06:33, 6.16s/it] 28%|██▊ | 1617/5772 [2:50:16<7:06:33, 6.16s/it] {'loss': 0.4714, 'learning_rate': 1.6896357997462653e-05, 'epoch': 0.28} + 28%|██▊ | 1617/5772 [2:50:16<7:06:33, 6.16s/it] {'loss': 0.4714, 'learning_rate': 1.6896357997462653e-05, 'epoch': 0.28} + 28%|██▊ | 1617/5772 [2:50:08<7:06:33, 6.16s/it] 28%|██▊ | 1618/5772 [2:50:15<7:05:17, 6.14s/it] 28%|██▊ | 1618/5772 [2:50:22<7:05:17, 6.14s/it] {'loss': 0.4854, 'learning_rate': 1.6892292952481352e-05, 'epoch': 0.28} + 28%|██▊ | 1618/5772 [2:50:22<7:05:17, 6.14s/it] {'loss': 0.4854, 'learning_rate': 1.6892292952481352e-05, 'epoch': 0.28} + 28%|██▊ | 1618/5772 [2:50:15<7:05:17, 6.14s/it] 28%|██▊ | 1619/5772 [2:50:21<7:04:57, 6.14s/it] 28%|██▊ | 1619/5772 [2:50:28<7:04:56, 6.14s/it] {'loss': 0.472, 'learning_rate': 1.6888225736810705e-05, 'epoch': 0.28} + 28%|██▊ | 1619/5772 [2:50:28<7:04:56, 6.14s/it] {'loss': 0.472, 'learning_rate': 1.6888225736810705e-05, 'epoch': 0.28} + 28%|██▊ | 1619/5772 [2:50:21<7:04:57, 6.14s/it] 28%|██▊ | 1620/5772 [2:50:34<7:02:26, 6.10s/it] 28%|██▊ | 1620/5772 [2:50:27<7:02:26, 6.10s/it] {'loss': 0.4962, 'learning_rate': 1.688415635173166e-05, 'epoch': 0.28} + 28%|██▊ | 1620/5772 [2:50:34<7:02:26, 6.10s/it] {'loss': 0.4962, 'learning_rate': 1.688415635173166e-05, 'epoch': 0.28} + 28%|██▊ | 1620/5772 [2:50:27<7:02:26, 6.10s/it] 28%|██▊ | 1621/5772 [2:50:33<7:03:05, 6.12s/it] 28%|██▊ | 1621/5772 [2:50:40<7:03:05, 6.12s/it] {'loss': 0.4751, 'learning_rate': 1.6880084798525848e-05, 'epoch': 0.28} + 28%|██▊ | 1621/5772 [2:50:40<7:03:05, 6.12s/it] {'loss': 0.4751, 'learning_rate': 1.6880084798525848e-05, 'epoch': 0.28} + 28%|██▊ | 1621/5772 [2:50:33<7:03:05, 6.12s/it] 28%|██▊ | 1622/5772 [2:50:46<6:55:53, 6.01s/it] 28%|██▊ | 1622/5772 [2:50:39<6:55:53, 6.01s/it] {'loss': 0.4819, 'learning_rate': 1.6876011078475586e-05, 'epoch': 0.28} + 28%|██▊ | 1622/5772 [2:50:46<6:55:53, 6.01s/it] {'loss': 0.4819, 'learning_rate': 1.6876011078475586e-05, 'epoch': 0.28} + 28%|██▊ | 1622/5772 [2:50:39<6:55:53, 6.01s/it] 28%|██▊ | 1623/5772 [2:50:52<6:56:50, 6.03s/it] 28%|██▊ | 1623/5772 [2:50:45<6:56:50, 6.03s/it] {'loss': 0.4739, 'learning_rate': 1.6871935192863862e-05, 'epoch': 0.28} + 28%|██▊ | 1623/5772 [2:50:52<6:56:50, 6.03s/it] {'loss': 0.4739, 'learning_rate': 1.6871935192863862e-05, 'epoch': 0.28} + 28%|██▊ | 1623/5772 [2:50:45<6:56:50, 6.03s/it] 28%|██▊ | 1624/5772 [2:50:51<6:56:51, 6.03s/it] 28%|██▊ | 1624/5772 [2:50:58<6:56:52, 6.03s/it] {'loss': 0.4955, 'learning_rate': 1.6867857142974354e-05, 'epoch': 0.28} + 28%|██▊ | 1624/5772 [2:50:58<6:56:52, 6.03s/it] {'loss': 0.4955, 'learning_rate': 1.6867857142974354e-05, 'epoch': 0.28} + 28%|██▊ | 1624/5772 [2:50:51<6:56:51, 6.03s/it] 28%|██▊ | 1625/5772 [2:50:57<7:03:40, 6.13s/it] 28%|██▊ | 1625/5772 [2:51:04<7:03:41, 6.13s/it] {'loss': 0.4622, 'learning_rate': 1.686377693009143e-05, 'epoch': 0.28} + 28%|██▊ | 1625/5772 [2:51:04<7:03:41, 6.13s/it] {'loss': 0.4622, 'learning_rate': 1.686377693009143e-05, 'epoch': 0.28} + 28%|██▊ | 1625/5772 [2:50:57<7:03:40, 6.13s/it] 28%|██▊ | 1626/5772 [2:51:10<7:05:23, 6.16s/it] 28%|██▊ | 1626/5772 [2:51:03<7:05:23, 6.16s/it] {'loss': 0.4855, 'learning_rate': 1.6859694555500125e-05, 'epoch': 0.28} + 28%|██▊ | 1626/5772 [2:51:10<7:05:23, 6.16s/it] {'loss': 0.4855, 'learning_rate': 1.6859694555500125e-05, 'epoch': 0.28} + 28%|██▊ | 1626/5772 [2:51:03<7:05:23, 6.16s/it] 28%|██▊ | 1627/5772 [2:51:09<7:05:03, 6.15s/it] 28%|██▊ | 1627/5772 [2:51:17<7:05:03, 6.15s/it] {'loss': 0.4688, 'learning_rate': 1.685561002048616e-05, 'epoch': 0.28} + 28%|██▊ | 1627/5772 [2:51:17<7:05:03, 6.15s/it] {'loss': 0.4688, 'learning_rate': 1.685561002048616e-05, 'epoch': 0.28} + 28%|██▊ | 1627/5772 [2:51:09<7:05:03, 6.15s/it] 28%|██▊ | 1628/5772 [2:51:23<7:10:25, 6.23s/it] 28%|██▊ | 1628/5772 [2:51:16<7:10:25, 6.23s/it] {'loss': 0.4835, 'learning_rate': 1.6851523326335932e-05, 'epoch': 0.28} + 28%|██▊ | 1628/5772 [2:51:23<7:10:25, 6.23s/it] {'loss': 0.4835, 'learning_rate': 1.6851523326335932e-05, 'epoch': 0.28} + 28%|██▊ | 1628/5772 [2:51:16<7:10:25, 6.23s/it] 28%|██▊ | 1629/5772 [2:51:22<7:03:09, 6.13s/it] 28%|██▊ | 1629/5772 [2:51:29<7:03:09, 6.13s/it] {'loss': 0.4752, 'learning_rate': 1.684743447433653e-05, 'epoch': 0.28} + 28%|██▊ | 1629/5772 [2:51:29<7:03:09, 6.13s/it] {'loss': 0.4752, 'learning_rate': 1.684743447433653e-05, 'epoch': 0.28} + 28%|██▊ | 1629/5772 [2:51:22<7:03:09, 6.13s/it] 28%|██▊ | 1630/5772 [2:51:28<6:56:18, 6.03s/it] 28%|██▊ | 1630/5772 [2:51:35<6:56:19, 6.03s/it] {'loss': 0.4714, 'learning_rate': 1.684334346577571e-05, 'epoch': 0.28} + 28%|██▊ | 1630/5772 [2:51:35<6:56:19, 6.03s/it] {'loss': 0.4714, 'learning_rate': 1.684334346577571e-05, 'epoch': 0.28} + 28%|██▊ | 1630/5772 [2:51:28<6:56:18, 6.03s/it] 28%|██▊ | 1631/5772 [2:51:41<7:05:54, 6.17s/it] 28%|██▊ | 1631/5772 [2:51:34<7:05:55, 6.17s/it] {'loss': 0.494, 'learning_rate': 1.6839250301941912e-05, 'epoch': 0.28} + 28%|██▊ | 1631/5772 [2:51:41<7:05:54, 6.17s/it] {'loss': 0.494, 'learning_rate': 1.6839250301941912e-05, 'epoch': 0.28} + 28%|██▊ | 1631/5772 [2:51:34<7:05:55, 6.17s/it] 28%|██▊ | 1632/5772 [2:51:40<6:58:27, 6.06s/it] 28%|██▊ | 1632/5772 [2:51:47<6:58:27, 6.06s/it] {'loss': 0.4731, 'learning_rate': 1.6835154984124266e-05, 'epoch': 0.28} + 28%|██▊ | 1632/5772 [2:51:47<6:58:27, 6.06s/it] {'loss': 0.4731, 'learning_rate': 1.6835154984124266e-05, 'epoch': 0.28} + 28%|██▊ | 1632/5772 [2:51:40<6:58:27, 6.06s/it] 28%|██▊ | 1633/5772 [2:51:46<6:54:48, 6.01s/it] 28%|██▊ | 1633/5772 [2:51:53<6:54:49, 6.01s/it] {'loss': 0.4931, 'learning_rate': 1.6831057513612554e-05, 'epoch': 0.28} + 28%|██▊ | 1633/5772 [2:51:53<6:54:49, 6.01s/it] {'loss': 0.4931, 'learning_rate': 1.6831057513612554e-05, 'epoch': 0.28} + 28%|██▊ | 1633/5772 [2:51:46<6:54:48, 6.01s/it] 28%|██▊ | 1634/5772 [2:51:59<6:53:13, 5.99s/it] 28%|██▊ | 1634/5772 [2:51:52<6:53:13, 5.99s/it] {'loss': 0.4652, 'learning_rate': 1.682695789169726e-05, 'epoch': 0.28} + 28%|██▊ | 1634/5772 [2:51:59<6:53:13, 5.99s/it] {'loss': 0.4652, 'learning_rate': 1.682695789169726e-05, 'epoch': 0.28} + 28%|██▊ | 1634/5772 [2:51:52<6:53:13, 5.99s/it] 28%|██▊ | 1635/5772 [2:51:59<7:10:04, 6.24s/it] 28%|██▊ | 1635/5772 [2:52:06<7:10:04, 6.24s/it] {'loss': 0.4901, 'learning_rate': 1.682285611966954e-05, 'epoch': 0.28} + 28%|██▊ | 1635/5772 [2:52:06<7:10:04, 6.24s/it] {'loss': 0.4901, 'learning_rate': 1.682285611966954e-05, 'epoch': 0.28} + 28%|██▊ | 1635/5772 [2:51:59<7:10:04, 6.24s/it] 28%|██▊ | 1636/5772 [2:52:05<7:07:41, 6.20s/it] 28%|██▊ | 1636/5772 [2:52:12<7:07:41, 6.20s/it] {'loss': 0.4747, 'learning_rate': 1.681875219882122e-05, 'epoch': 0.28} + 28%|██▊ | 1636/5772 [2:52:12<7:07:41, 6.20s/it] {'loss': 0.4747, 'learning_rate': 1.681875219882122e-05, 'epoch': 0.28} + 28%|██▊ | 1636/5772 [2:52:05<7:07:41, 6.20s/it] 28%|██▊ | 1637/5772 [2:52:11<7:07:12, 6.20s/it] 28%|██▊ | 1637/5772 [2:52:18<7:07:13, 6.20s/it] {'loss': 0.4875, 'learning_rate': 1.6814646130444804e-05, 'epoch': 0.28} + 28%|██▊ | 1637/5772 [2:52:18<7:07:13, 6.20s/it] {'loss': 0.4875, 'learning_rate': 1.6814646130444804e-05, 'epoch': 0.28} + 28%|██▊ | 1637/5772 [2:52:11<7:07:12, 6.20s/it] 28%|██▊ | 1638/5772 [2:52:17<6:57:01, 6.05s/it] 28%|██▊ | 1638/5772 [2:52:24<6:57:01, 6.05s/it] {'loss': 0.4651, 'learning_rate': 1.681053791583348e-05, 'epoch': 0.28} + 28%|██▊ | 1638/5772 [2:52:24<6:57:01, 6.05s/it] {'loss': 0.4651, 'learning_rate': 1.681053791583348e-05, 'epoch': 0.28} + 28%|██▊ | 1638/5772 [2:52:17<6:57:01, 6.05s/it] 28%|██▊ | 1639/5772 [2:52:30<7:00:06, 6.10s/it] 28%|██▊ | 1639/5772 [2:52:23<7:00:06, 6.10s/it] {'loss': 0.493, 'learning_rate': 1.6806427556281105e-05, 'epoch': 0.28} + 28%|██▊ | 1639/5772 [2:52:30<7:00:06, 6.10s/it] {'loss': 0.493, 'learning_rate': 1.6806427556281105e-05, 'epoch': 0.28} + 28%|██▊ | 1639/5772 [2:52:23<7:00:06, 6.10s/it] 28%|██▊ | 1640/5772 [2:52:29<7:08:58, 6.23s/it] 28%|██▊ | 1640/5772 [2:52:36<7:08:59, 6.23s/it] {'loss': 0.4864, 'learning_rate': 1.6802315053082218e-05, 'epoch': 0.28} + 28%|██▊ | 1640/5772 [2:52:36<7:08:59, 6.23s/it] {'loss': 0.4864, 'learning_rate': 1.6802315053082218e-05, 'epoch': 0.28} + 28%|██▊ | 1640/5772 [2:52:29<7:08:58, 6.23s/it] 28%|██▊ | 1641/5772 [2:52:42<7:03:13, 6.15s/it] 28%|██▊ | 1641/5772 [2:52:35<7:03:14, 6.15s/it] {'loss': 0.4772, 'learning_rate': 1.6798200407532025e-05, 'epoch': 0.28} + 28%|██▊ | 1641/5772 [2:52:42<7:03:13, 6.15s/it] {'loss': 0.4772, 'learning_rate': 1.6798200407532025e-05, 'epoch': 0.28} + 28%|██▊ | 1641/5772 [2:52:35<7:03:14, 6.15s/it] 28%|██▊ | 1642/5772 [2:52:48<7:00:47, 6.11s/it] 28%|██▊ | 1642/5772 [2:52:41<7:00:48, 6.11s/it] {'loss': 0.4849, 'learning_rate': 1.6794083620926412e-05, 'epoch': 0.28} + 28%|██▊ | 1642/5772 [2:52:48<7:00:47, 6.11s/it] {'loss': 0.4849, 'learning_rate': 1.6794083620926412e-05, 'epoch': 0.28} + 28%|██▊ | 1642/5772 [2:52:41<7:00:48, 6.11s/it] 28%|██▊ | 1643/5772 [2:52:54<7:00:33, 6.11s/it] 28%|██▊ | 1643/5772 [2:52:47<7:00:33, 6.11s/it] {'loss': 0.4909, 'learning_rate': 1.6789964694561936e-05, 'epoch': 0.28} + 28%|██▊ | 1643/5772 [2:52:54<7:00:33, 6.11s/it] {'loss': 0.4909, 'learning_rate': 1.6789964694561936e-05, 'epoch': 0.28} + 28%|██▊ | 1643/5772 [2:52:47<7:00:33, 6.11s/it] 28%|██▊ | 1644/5772 [2:53:00<6:53:51, 6.02s/it] 28%|██▊ | 1644/5772 [2:52:53<6:53:51, 6.02s/it] {'loss': 0.4645, 'learning_rate': 1.6785843629735832e-05, 'epoch': 0.28} + 28%|██▊ | 1644/5772 [2:53:00<6:53:51, 6.02s/it] {'loss': 0.4645, 'learning_rate': 1.6785843629735832e-05, 'epoch': 0.28} + 28%|██▊ | 1644/5772 [2:52:53<6:53:51, 6.02s/it] 28%|██▊ | 1645/5772 [2:52:59<6:52:19, 5.99s/it] 28%|██▊ | 1645/5772 [2:53:06<6:52:19, 5.99s/it] {'loss': 0.488, 'learning_rate': 1.6781720427746008e-05, 'epoch': 0.28} + 28%|██▊ | 1645/5772 [2:53:06<6:52:19, 5.99s/it] {'loss': 0.488, 'learning_rate': 1.6781720427746008e-05, 'epoch': 0.28} + 28%|██▊ | 1645/5772 [2:52:59<6:52:19, 5.99s/it] 29%|██▊ | 1646/5772 [2:53:05<6:53:24, 6.01s/it] 29%|██▊ | 1646/5772 [2:53:12<6:53:25, 6.01s/it] {'loss': 0.4667, 'learning_rate': 1.677759508989104e-05, 'epoch': 0.29} + 29%|██▊ | 1646/5772 [2:53:12<6:53:25, 6.01s/it] {'loss': 0.4667, 'learning_rate': 1.677759508989104e-05, 'epoch': 0.29} + 29%|██▊ | 1646/5772 [2:53:05<6:53:24, 6.01s/it] 29%|██▊ | 1647/5772 [2:53:18<6:52:16, 6.00s/it] 29%|██▊ | 1647/5772 [2:53:11<6:52:16, 6.00s/it] {'loss': 0.4853, 'learning_rate': 1.6773467617470184e-05, 'epoch': 0.29} + 29%|██▊ | 1647/5772 [2:53:18<6:52:16, 6.00s/it] {'loss': 0.4853, 'learning_rate': 1.6773467617470184e-05, 'epoch': 0.29} + 29%|██▊ | 1647/5772 [2:53:11<6:52:16, 6.00s/it] 29%|██▊ | 1648/5772 [2:53:25<7:01:32, 6.13s/it] 29%|██▊ | 1648/5772 [2:53:18<7:01:32, 6.13s/it] {'loss': 0.478, 'learning_rate': 1.6769338011783363e-05, 'epoch': 0.29} + 29%|██▊ | 1648/5772 [2:53:25<7:01:32, 6.13s/it] {'loss': 0.478, 'learning_rate': 1.6769338011783363e-05, 'epoch': 0.29} + 29%|██▊ | 1648/5772 [2:53:18<7:01:32, 6.13s/it] 29%|██▊ | 1649/5772 [2:53:31<7:05:46, 6.20s/it] 29%|██▊ | 1649/5772 [2:53:24<7:05:45, 6.20s/it] {'loss': 0.4832, 'learning_rate': 1.676520627413117e-05, 'epoch': 0.29} + 29%|██▊ | 1649/5772 [2:53:31<7:05:46, 6.20s/it] {'loss': 0.4832, 'learning_rate': 1.676520627413117e-05, 'epoch': 0.29} + 29%|██▊ | 1649/5772 [2:53:24<7:05:45, 6.20s/it]5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...14 + AutoResumeHook: Checking whether to suspend... +42 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +0 29%|██▊ | 1650/5772 [2:53:37<7:08:41, 6.24s/it]9 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 29%|██▊ | 1650/5772 [2:53:30<7:08:42, 6.24s/it]11 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4763, 'learning_rate': 1.676107240581488e-05, 'epoch': 0.29} + 29%|██▊ | 1650/5772 [2:53:37<7:08:41, 6.24s/it] {'loss': 0.4763, 'learning_rate': 1.676107240581488e-05, 'epoch': 0.29} + 29%|██▊ | 1650/5772 [2:53:30<7:08:42, 6.24s/it] 29%|██▊ | 1651/5772 [2:53:37<7:09:39, 6.26s/it] 29%|██▊ | 1651/5772 [2:53:44<7:09:43, 6.26s/it] {'loss': 0.488, 'learning_rate': 1.6756936408136423e-05, 'epoch': 0.29} + 29%|██▊ | 1651/5772 [2:53:44<7:09:43, 6.26s/it] {'loss': 0.488, 'learning_rate': 1.6756936408136423e-05, 'epoch': 0.29} + 29%|██▊ | 1651/5772 [2:53:37<7:09:39, 6.26s/it] 29%|██▊ | 1652/5772 [2:53:50<7:09:58, 6.26s/it] 29%|██▊ | 1652/5772 [2:53:43<7:09:59, 6.26s/it] {'loss': 0.472, 'learning_rate': 1.6752798282398414e-05, 'epoch': 0.29} + 29%|██▊ | 1652/5772 [2:53:50<7:09:58, 6.26s/it] {'loss': 0.472, 'learning_rate': 1.6752798282398414e-05, 'epoch': 0.29} + 29%|██▊ | 1652/5772 [2:53:43<7:09:59, 6.26s/it] 29%|██▊ | 1653/5772 [2:53:56<7:03:53, 6.17s/it] 29%|██▊ | 1653/5772 [2:53:49<7:03:54, 6.17s/it] {'loss': 0.4785, 'learning_rate': 1.6748658029904132e-05, 'epoch': 0.29} + 29%|██▊ | 1653/5772 [2:53:49<7:03:54, 6.17s/it]{'loss': 0.4785, 'learning_rate': 1.6748658029904132e-05, 'epoch': 0.29} + 29%|██▊ | 1653/5772 [2:53:56<7:03:53, 6.17s/it] 29%|██▊ | 1654/5772 [2:53:55<7:10:40, 6.28s/it] 29%|██▊ | 1654/5772 [2:54:02<7:10:40, 6.27s/it] {'loss': 0.4887, 'learning_rate': 1.6744515651957525e-05, 'epoch': 0.29} + 29%|██▊ | 1654/5772 [2:54:02<7:10:40, 6.27s/it] {'loss': 0.4887, 'learning_rate': 1.6744515651957525e-05, 'epoch': 0.29} + 29%|██▊ | 1654/5772 [2:53:55<7:10:40, 6.28s/it] 29%|██▊ | 1655/5772 [2:54:08<7:05:22, 6.20s/it] 29%|██▊ | 1655/5772 [2:54:01<7:05:23, 6.20s/it] {'loss': 0.4847, 'learning_rate': 1.6740371149863212e-05, 'epoch': 0.29} + 29%|██▊ | 1655/5772 [2:54:08<7:05:22, 6.20s/it] {'loss': 0.4847, 'learning_rate': 1.6740371149863212e-05, 'epoch': 0.29} + 29%|██▊ | 1655/5772 [2:54:01<7:05:23, 6.20s/it] 29%|██▊ | 1656/5772 [2:54:15<7:07:43, 6.24s/it] 29%|██▊ | 1656/5772 [2:54:08<7:07:43, 6.24s/it] {'loss': 0.4734, 'learning_rate': 1.6736224524926487e-05, 'epoch': 0.29} + 29%|██▊ | 1656/5772 [2:54:15<7:07:43, 6.24s/it] {'loss': 0.4734, 'learning_rate': 1.6736224524926487e-05, 'epoch': 0.29} + 29%|██▊ | 1656/5772 [2:54:08<7:07:43, 6.24s/it] 29%|██▊ | 1657/5772 [2:54:21<7:01:43, 6.15s/it] 29%|██▊ | 1657/5772 [2:54:14<7:01:44, 6.15s/it] {'loss': 0.491, 'learning_rate': 1.6732075778453298e-05, 'epoch': 0.29} + 29%|██▊ | 1657/5772 [2:54:21<7:01:43, 6.15s/it] {'loss': 0.491, 'learning_rate': 1.6732075778453298e-05, 'epoch': 0.29} + 29%|██▊ | 1657/5772 [2:54:14<7:01:44, 6.15s/it] 29%|██▊ | 1658/5772 [2:54:27<6:57:31, 6.09s/it] 29%|██▊ | 1658/5772 [2:54:20<6:57:32, 6.09s/it] {'loss': 0.4717, 'learning_rate': 1.6727924911750274e-05, 'epoch': 0.29} + 29%|██▊ | 1658/5772 [2:54:27<6:57:31, 6.09s/it] {'loss': 0.4717, 'learning_rate': 1.6727924911750274e-05, 'epoch': 0.29} + 29%|██▊ | 1658/5772 [2:54:20<6:57:32, 6.09s/it] 29%|██▊ | 1659/5772 [2:54:26<7:00:47, 6.14s/it] 29%|██▊ | 1659/5772 [2:54:33<7:00:48, 6.14s/it] {'loss': 0.4911, 'learning_rate': 1.6723771926124704e-05, 'epoch': 0.29} + 29%|██▊ | 1659/5772 [2:54:33<7:00:48, 6.14s/it] {'loss': 0.4911, 'learning_rate': 1.6723771926124704e-05, 'epoch': 0.29} + 29%|██▊ | 1659/5772 [2:54:26<7:00:47, 6.14s/it] 29%|██▉ | 1660/5772 [2:54:32<6:58:31, 6.11s/it] 29%|██▉ | 1660/5772 [2:54:39<6:58:31, 6.11s/it] {'loss': 0.4733, 'learning_rate': 1.6719616822884555e-05, 'epoch': 0.29} + 29%|██▉ | 1660/5772 [2:54:39<6:58:31, 6.11s/it] {'loss': 0.4733, 'learning_rate': 1.6719616822884555e-05, 'epoch': 0.29} + 29%|██▉ | 1660/5772 [2:54:32<6:58:31, 6.11s/it] 29%|██▉ | 1661/5772 [2:54:38<6:52:53, 6.03s/it] 29%|██▉ | 1661/5772 [2:54:45<6:52:53, 6.03s/it] {'loss': 0.4723, 'learning_rate': 1.6715459603338445e-05, 'epoch': 0.29} + 29%|██▉ | 1661/5772 [2:54:45<6:52:53, 6.03s/it] {'loss': 0.4723, 'learning_rate': 1.6715459603338445e-05, 'epoch': 0.29} + 29%|██▉ | 1661/5772 [2:54:38<6:52:53, 6.03s/it] 29%|██▉ | 1662/5772 [2:54:44<6:57:39, 6.10s/it] 29%|██▉ | 1662/5772 [2:54:51<6:57:40, 6.10s/it] {'loss': 0.4768, 'learning_rate': 1.6711300268795674e-05, 'epoch': 0.29} + 29%|██▉ | 1662/5772 [2:54:51<6:57:40, 6.10s/it] {'loss': 0.4768, 'learning_rate': 1.6711300268795674e-05, 'epoch': 0.29} + 29%|██▉ | 1662/5772 [2:54:44<6:57:39, 6.10s/it] 29%|██▉ | 1663/5772 [2:54:57<6:55:55, 6.07s/it] 29%|██▉ | 1663/5772 [2:54:50<6:55:55, 6.07s/it] {'loss': 0.4821, 'learning_rate': 1.6707138820566195e-05, 'epoch': 0.29} + 29%|██▉ | 1663/5772 [2:54:57<6:55:55, 6.07s/it] {'loss': 0.4821, 'learning_rate': 1.6707138820566195e-05, 'epoch': 0.29} + 29%|██▉ | 1663/5772 [2:54:50<6:55:55, 6.07s/it] 29%|██▉ | 1664/5772 [2:54:56<6:56:23, 6.08s/it] 29%|██▉ | 1664/5772 [2:55:03<6:56:23, 6.08s/it] {'loss': 0.4755, 'learning_rate': 1.670297525996064e-05, 'epoch': 0.29} + 29%|██▉ | 1664/5772 [2:55:03<6:56:23, 6.08s/it] {'loss': 0.4755, 'learning_rate': 1.670297525996064e-05, 'epoch': 0.29} + 29%|██▉ | 1664/5772 [2:54:56<6:56:23, 6.08s/it] 29%|██▉ | 1665/5772 [2:55:03<7:06:45, 6.23s/it] 29%|██▉ | 1665/5772 [2:55:10<7:06:45, 6.23s/it] {'loss': 0.4733, 'learning_rate': 1.6698809588290292e-05, 'epoch': 0.29} + 29%|██▉ | 1665/5772 [2:55:10<7:06:45, 6.23s/it] {'loss': 0.4733, 'learning_rate': 1.6698809588290292e-05, 'epoch': 0.29} + 29%|██▉ | 1665/5772 [2:55:03<7:06:45, 6.23s/it] 29%|██▉ | 1666/5772 [2:55:16<6:58:49, 6.12s/it] 29%|██▉ | 1666/5772 [2:55:09<6:58:49, 6.12s/it] {'loss': 0.489, 'learning_rate': 1.6694641806867112e-05, 'epoch': 0.29} + 29%|██▉ | 1666/5772 [2:55:16<6:58:49, 6.12s/it] {'loss': 0.489, 'learning_rate': 1.6694641806867112e-05, 'epoch': 0.29} + 29%|██▉ | 1666/5772 [2:55:09<6:58:49, 6.12s/it] 29%|██▉ | 1667/5772 [2:55:22<7:04:00, 6.20s/it] 29%|██▉ | 1667/5772 [2:55:15<7:04:00, 6.20s/it] {'loss': 0.4801, 'learning_rate': 1.6690471917003716e-05, 'epoch': 0.29} + 29%|██▉ | 1667/5772 [2:55:22<7:04:00, 6.20s/it] {'loss': 0.4801, 'learning_rate': 1.6690471917003716e-05, 'epoch': 0.29} + 29%|██▉ | 1667/5772 [2:55:15<7:04:00, 6.20s/it] 29%|██▉ | 1668/5772 [2:55:21<7:09:02, 6.27s/it] 29%|██▉ | 1668/5772 [2:55:28<7:09:02, 6.27s/it] {'loss': 0.484, 'learning_rate': 1.6686299920013388e-05, 'epoch': 0.29} + 29%|██▉ | 1668/5772 [2:55:28<7:09:02, 6.27s/it] {'loss': 0.484, 'learning_rate': 1.6686299920013388e-05, 'epoch': 0.29} + 29%|██▉ | 1668/5772 [2:55:21<7:09:02, 6.27s/it] 29%|██▉ | 1669/5772 [2:55:35<7:06:51, 6.24s/it] 29%|██▉ | 1669/5772 [2:55:28<7:06:51, 6.24s/it] {'loss': 0.4744, 'learning_rate': 1.668212581721008e-05, 'epoch': 0.29} + 29%|██▉ | 1669/5772 [2:55:35<7:06:51, 6.24s/it] {'loss': 0.4744, 'learning_rate': 1.668212581721008e-05, 'epoch': 0.29} + 29%|██▉ | 1669/5772 [2:55:28<7:06:51, 6.24s/it] 29%|██▉ | 1670/5772 [2:55:41<7:09:52, 6.29s/it] 29%|██▉ | 1670/5772 [2:55:34<7:09:52, 6.29s/it] {'loss': 0.4864, 'learning_rate': 1.6677949609908394e-05, 'epoch': 0.29} + 29%|██▉ | 1670/5772 [2:55:41<7:09:52, 6.29s/it] {'loss': 0.4864, 'learning_rate': 1.6677949609908394e-05, 'epoch': 0.29} + 29%|██▉ | 1670/5772 [2:55:34<7:09:52, 6.29s/it] 29%|██▉ | 1671/5772 [2:55:47<7:13:51, 6.35s/it] 29%|██▉ | 1671/5772 [2:55:40<7:13:51, 6.35s/it] {'loss': 0.4845, 'learning_rate': 1.6673771299423613e-05, 'epoch': 0.29} + 29%|██▉ | 1671/5772 [2:55:47<7:13:51, 6.35s/it] {'loss': 0.4845, 'learning_rate': 1.6673771299423613e-05, 'epoch': 0.29} + 29%|██▉ | 1671/5772 [2:55:40<7:13:51, 6.35s/it] 29%|██▉ | 1672/5772 [2:55:53<7:05:50, 6.23s/it] 29%|██▉ | 1672/5772 [2:55:46<7:05:50, 6.23s/it] {'loss': 0.4727, 'learning_rate': 1.666959088707166e-05, 'epoch': 0.29} + 29%|██▉ | 1672/5772 [2:55:53<7:05:50, 6.23s/it] {'loss': 0.4727, 'learning_rate': 1.666959088707166e-05, 'epoch': 0.29} + 29%|██▉ | 1672/5772 [2:55:46<7:05:50, 6.23s/it] 29%|██▉ | 1673/5772 [2:56:00<7:05:50, 6.23s/it] 29%|██▉ | 1673/5772 [2:55:53<7:05:50, 6.23s/it] {'loss': 0.5003, 'learning_rate': 1.6665408374169144e-05, 'epoch': 0.29} + 29%|██▉ | 1673/5772 [2:56:00<7:05:50, 6.23s/it] {'loss': 0.5003, 'learning_rate': 1.6665408374169144e-05, 'epoch': 0.29} + 29%|██▉ | 1673/5772 [2:55:53<7:05:50, 6.23s/it] 29%|██▉ | 1674/5772 [2:56:06<6:58:40, 6.13s/it] 29%|██▉ | 1674/5772 [2:55:58<6:58:41, 6.13s/it] {'loss': 0.4625, 'learning_rate': 1.666122376203332e-05, 'epoch': 0.29} + 29%|██▉ | 1674/5772 [2:56:06<6:58:40, 6.13s/it] {'loss': 0.4625, 'learning_rate': 1.666122376203332e-05, 'epoch': 0.29} + 29%|██▉ | 1674/5772 [2:55:58<6:58:41, 6.13s/it] 29%|██▉ | 1675/5772 [2:56:11<6:54:41, 6.07s/it] 29%|██▉ | 1675/5772 [2:56:04<6:54:42, 6.07s/it] {'loss': 0.4741, 'learning_rate': 1.665703705198211e-05, 'epoch': 0.29} + 29%|██▉ | 1675/5772 [2:56:11<6:54:41, 6.07s/it] {'loss': 0.4741, 'learning_rate': 1.665703705198211e-05, 'epoch': 0.29} + 29%|██▉ | 1675/5772 [2:56:04<6:54:42, 6.07s/it] 29%|██▉ | 1676/5772 [2:56:18<7:01:40, 6.18s/it] 29%|██▉ | 1676/5772 [2:56:11<7:01:40, 6.18s/it] {'loss': 0.473, 'learning_rate': 1.6652848245334097e-05, 'epoch': 0.29} + 29%|██▉ | 1676/5772 [2:56:18<7:01:40, 6.18s/it] {'loss': 0.473, 'learning_rate': 1.6652848245334097e-05, 'epoch': 0.29} + 29%|██▉ | 1676/5772 [2:56:11<7:01:40, 6.18s/it] 29%|██▉ | 1677/5772 [2:56:17<7:01:18, 6.17s/it] 29%|██▉ | 1677/5772 [2:56:24<7:01:19, 6.17s/it] {'loss': 0.4817, 'learning_rate': 1.6648657343408517e-05, 'epoch': 0.29} + 29%|██▉ | 1677/5772 [2:56:24<7:01:19, 6.17s/it] {'loss': 0.4817, 'learning_rate': 1.6648657343408517e-05, 'epoch': 0.29} + 29%|██▉ | 1677/5772 [2:56:17<7:01:18, 6.17s/it] 29%|██▉ | 1678/5772 [2:56:23<7:04:11, 6.22s/it] 29%|██▉ | 1678/5772 [2:56:30<7:04:12, 6.22s/it] {'loss': 0.4679, 'learning_rate': 1.6644464347525273e-05, 'epoch': 0.29} + 29%|██▉ | 1678/5772 [2:56:30<7:04:12, 6.22s/it] {'loss': 0.4679, 'learning_rate': 1.6644464347525273e-05, 'epoch': 0.29} + 29%|██▉ | 1678/5772 [2:56:23<7:04:11, 6.22s/it] 29%|██▉ | 1679/5772 [2:56:29<6:59:34, 6.15s/it] 29%|██▉ | 1679/5772 [2:56:36<6:59:34, 6.15s/it] {'loss': 0.491, 'learning_rate': 1.664026925900492e-05, 'epoch': 0.29} + 29%|██▉ | 1679/5772 [2:56:36<6:59:34, 6.15s/it] {'loss': 0.491, 'learning_rate': 1.664026925900492e-05, 'epoch': 0.29} + 29%|██▉ | 1679/5772 [2:56:29<6:59:34, 6.15s/it] 29%|██▉ | 1680/5772 [2:56:42<6:55:03, 6.09s/it] 29%|██▉ | 1680/5772 [2:56:35<6:55:05, 6.09s/it] {'loss': 0.4781, 'learning_rate': 1.663607207916869e-05, 'epoch': 0.29} + 29%|██▉ | 1680/5772 [2:56:42<6:55:03, 6.09s/it] {'loss': 0.4781, 'learning_rate': 1.663607207916869e-05, 'epoch': 0.29} + 29%|██▉ | 1680/5772 [2:56:35<6:55:05, 6.09s/it] 29%|██▉ | 1681/5772 [2:56:42<7:00:31, 6.17s/it] 29%|██▉ | 1681/5772 [2:56:49<7:00:32, 6.17s/it] {'loss': 0.4882, 'learning_rate': 1.6631872809338456e-05, 'epoch': 0.29} + 29%|██▉ | 1681/5772 [2:56:49<7:00:32, 6.17s/it] {'loss': 0.4882, 'learning_rate': 1.6631872809338456e-05, 'epoch': 0.29} + 29%|██▉ | 1681/5772 [2:56:42<7:00:31, 6.17s/it] 29%|██▉ | 1682/5772 [2:56:55<7:01:19, 6.18s/it] 29%|██▉ | 1682/5772 [2:56:48<7:01:19, 6.18s/it] {'loss': 0.4775, 'learning_rate': 1.6627671450836753e-05, 'epoch': 0.29} + 29%|██▉ | 1682/5772 [2:56:55<7:01:19, 6.18s/it] {'loss': 0.4775, 'learning_rate': 1.6627671450836753e-05, 'epoch': 0.29} + 29%|██▉ | 1682/5772 [2:56:48<7:01:19, 6.18s/it] 29%|██▉ | 1683/5772 [2:57:01<7:00:35, 6.17s/it] 29%|██▉ | 1683/5772 [2:56:54<7:00:35, 6.17s/it] {'loss': 0.4884, 'learning_rate': 1.6623468004986774e-05, 'epoch': 0.29} + 29%|██▉ | 1683/5772 [2:57:01<7:00:35, 6.17s/it] {'loss': 0.4884, 'learning_rate': 1.6623468004986774e-05, 'epoch': 0.29} + 29%|██▉ | 1683/5772 [2:56:54<7:00:35, 6.17s/it] 29%|██▉ | 1684/5772 [2:57:07<6:58:13, 6.14s/it] 29%|██▉ | 1684/5772 [2:57:00<6:58:13, 6.14s/it] {'loss': 0.4808, 'learning_rate': 1.661926247311238e-05, 'epoch': 0.29} + 29%|██▉ | 1684/5772 [2:57:07<6:58:13, 6.14s/it] {'loss': 0.4808, 'learning_rate': 1.661926247311238e-05, 'epoch': 0.29} + 29%|██▉ | 1684/5772 [2:57:00<6:58:13, 6.14s/it] 29%|██▉ | 1685/5772 [2:57:13<7:01:03, 6.18s/it] 29%|██▉ | 1685/5772 [2:57:06<7:01:03, 6.18s/it] {'loss': 0.4874, 'learning_rate': 1.6615054856538067e-05, 'epoch': 0.29} + 29%|██▉ | 1685/5772 [2:57:13<7:01:03, 6.18s/it] {'loss': 0.4874, 'learning_rate': 1.6615054856538067e-05, 'epoch': 0.29} + 29%|██▉ | 1685/5772 [2:57:06<7:01:03, 6.18s/it] 29%|██▉ | 1686/5772 [2:57:19<6:57:51, 6.14s/it] 29%|██▉ | 1686/5772 [2:57:12<6:57:51, 6.14s/it] {'loss': 0.4707, 'learning_rate': 1.661084515658901e-05, 'epoch': 0.29} + 29%|██▉ | 1686/5772 [2:57:19<6:57:51, 6.14s/it] {'loss': 0.4707, 'learning_rate': 1.661084515658901e-05, 'epoch': 0.29} + 29%|██▉ | 1686/5772 [2:57:12<6:57:51, 6.14s/it] 29%|██▉ | 1687/5772 [2:57:18<6:57:25, 6.13s/it] 29%|██▉ | 1687/5772 [2:57:26<6:57:25, 6.13s/it] {'loss': 0.4841, 'learning_rate': 1.6606633374591022e-05, 'epoch': 0.29} + 29%|██▉ | 1687/5772 [2:57:26<6:57:25, 6.13s/it] {'loss': 0.4841, 'learning_rate': 1.6606633374591022e-05, 'epoch': 0.29} + 29%|██▉ | 1687/5772 [2:57:18<6:57:25, 6.13s/it] 29%|██▉ | 1688/5772 [2:57:32<6:59:03, 6.16s/it] 29%|██▉ | 1688/5772 [2:57:25<6:59:03, 6.16s/it] {'loss': 0.4763, 'learning_rate': 1.660241951187059e-05, 'epoch': 0.29} + 29%|██▉ | 1688/5772 [2:57:32<6:59:03, 6.16s/it] {'loss': 0.4763, 'learning_rate': 1.660241951187059e-05, 'epoch': 0.29} + 29%|██▉ | 1688/5772 [2:57:25<6:59:03, 6.16s/it] 29%|██▉ | 1689/5772 [2:57:31<6:56:36, 6.12s/it] 29%|██▉ | 1689/5772 [2:57:38<6:56:36, 6.12s/it] {'loss': 0.4839, 'learning_rate': 1.6598203569754843e-05, 'epoch': 0.29} + 29%|██▉ | 1689/5772 [2:57:38<6:56:36, 6.12s/it] {'loss': 0.4839, 'learning_rate': 1.6598203569754843e-05, 'epoch': 0.29} + 29%|██▉ | 1689/5772 [2:57:31<6:56:36, 6.12s/it] 29%|██▉ | 1690/5772 [2:57:44<7:02:52, 6.22s/it] 29%|██▉ | 1690/5772 [2:57:37<7:02:52, 6.22s/it] {'loss': 0.4783, 'learning_rate': 1.6593985549571568e-05, 'epoch': 0.29} + 29%|██▉ | 1690/5772 [2:57:44<7:02:52, 6.22s/it] {'loss': 0.4783, 'learning_rate': 1.6593985549571568e-05, 'epoch': 0.29} + 29%|██▉ | 1690/5772 [2:57:37<7:02:52, 6.22s/it] 29%|██▉ | 1691/5772 [2:57:43<6:57:06, 6.13s/it] 29%|██▉ | 1691/5772 [2:57:50<6:57:12, 6.13s/it] {'loss': 0.4928, 'learning_rate': 1.6589765452649205e-05, 'epoch': 0.29} + 29%|██▉ | 1691/5772 [2:57:50<6:57:12, 6.13s/it] {'loss': 0.4928, 'learning_rate': 1.6589765452649205e-05, 'epoch': 0.29} + 29%|██▉ | 1691/5772 [2:57:43<6:57:06, 6.13s/it] 29%|██▉ | 1692/5772 [2:57:56<6:56:33, 6.13s/it] 29%|██▉ | 1692/5772 [2:57:49<6:56:36, 6.13s/it] {'loss': 0.4722, 'learning_rate': 1.6585543280316853e-05, 'epoch': 0.29} + 29%|██▉ | 1692/5772 [2:57:56<6:56:33, 6.13s/it] {'loss': 0.4722, 'learning_rate': 1.6585543280316853e-05, 'epoch': 0.29} + 29%|██▉ | 1692/5772 [2:57:49<6:56:36, 6.13s/it] 29%|██▉ | 1693/5772 [2:58:02<6:57:38, 6.14s/it] 29%|██▉ | 1693/5772 [2:57:55<6:57:39, 6.14s/it] {'loss': 0.4848, 'learning_rate': 1.658131903390426e-05, 'epoch': 0.29} + 29%|██▉ | 1693/5772 [2:58:02<6:57:38, 6.14s/it] {'loss': 0.4848, 'learning_rate': 1.658131903390426e-05, 'epoch': 0.29} + 29%|██▉ | 1693/5772 [2:57:55<6:57:39, 6.14s/it] 29%|██▉ | 1694/5772 [2:58:09<6:58:14, 6.15s/it] 29%|██▉ | 1694/5772 [2:58:02<6:58:14, 6.15s/it] {'loss': 0.4727, 'learning_rate': 1.657709271474183e-05, 'epoch': 0.29} + 29%|██▉ | 1694/5772 [2:58:09<6:58:14, 6.15s/it] {'loss': 0.4727, 'learning_rate': 1.657709271474183e-05, 'epoch': 0.29} + 29%|██▉ | 1694/5772 [2:58:02<6:58:14, 6.15s/it] 29%|██▉ | 1695/5772 [2:58:15<7:00:17, 6.19s/it] 29%|██▉ | 1695/5772 [2:58:08<7:00:18, 6.19s/it] {'loss': 0.4902, 'learning_rate': 1.6572864324160617e-05, 'epoch': 0.29} + 29%|██▉ | 1695/5772 [2:58:15<7:00:17, 6.19s/it] {'loss': 0.4902, 'learning_rate': 1.6572864324160617e-05, 'epoch': 0.29} + 29%|██▉ | 1695/5772 [2:58:08<7:00:18, 6.19s/it] 29%|██▉ | 1696/5772 [2:58:14<7:02:50, 6.22s/it] 29%|██▉ | 1696/5772 [2:58:21<7:02:50, 6.22s/it] {'loss': 0.4872, 'learning_rate': 1.6568633863492332e-05, 'epoch': 0.29} + 29%|██▉ | 1696/5772 [2:58:21<7:02:50, 6.22s/it] {'loss': 0.4872, 'learning_rate': 1.6568633863492332e-05, 'epoch': 0.29} + 29%|██▉ | 1696/5772 [2:58:14<7:02:50, 6.22s/it] 29%|██▉ | 1697/5772 [2:58:27<6:56:19, 6.13s/it] 29%|██▉ | 1697/5772 [2:58:20<6:56:20, 6.13s/it] {'loss': 0.4794, 'learning_rate': 1.6564401334069333e-05, 'epoch': 0.29} + 29%|██▉ | 1697/5772 [2:58:27<6:56:19, 6.13s/it] {'loss': 0.4794, 'learning_rate': 1.6564401334069333e-05, 'epoch': 0.29} + 29%|██▉ | 1697/5772 [2:58:20<6:56:20, 6.13s/it] 29%|██▉ | 1698/5772 [2:58:26<6:53:48, 6.09s/it] 29%|██▉ | 1698/5772 [2:58:33<6:53:48, 6.09s/it] {'loss': 0.4731, 'learning_rate': 1.656016673722463e-05, 'epoch': 0.29} + 29%|██▉ | 1698/5772 [2:58:33<6:53:48, 6.09s/it] {'loss': 0.4731, 'learning_rate': 1.656016673722463e-05, 'epoch': 0.29} + 29%|██▉ | 1698/5772 [2:58:26<6:53:48, 6.09s/it] 29%|██▉ | 1699/5772 [2:58:39<6:56:24, 6.13s/it] 29%|██▉ | 1699/5772 [2:58:32<6:56:24, 6.13s/it] {'loss': 0.4825, 'learning_rate': 1.655593007429189e-05, 'epoch': 0.29} + 29%|██▉ | 1699/5772 [2:58:39<6:56:24, 6.13s/it] {'loss': 0.4825, 'learning_rate': 1.655593007429189e-05, 'epoch': 0.29} + 29%|██▉ | 1699/5772 [2:58:32<6:56:24, 6.13s/it]5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +0 29%|██▉ | 1700/5772 [2:58:45<6:56:54, 6.14s/it]11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 29%|██▉ | 1700/5772 [2:58:38<6:56:54, 6.14s/it] {'loss': 0.4699, 'learning_rate': 1.6551691346605426e-05, 'epoch': 0.29} + 29%|██▉ | 1700/5772 [2:58:46<6:56:54, 6.14s/it] {'loss': 0.4699, 'learning_rate': 1.6551691346605426e-05, 'epoch': 0.29} + 29%|██▉ | 1700/5772 [2:58:38<6:56:54, 6.14s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 29%|██▉ | 1701/5772 [2:59:04<11:12:05, 9.91s/it] 29%|██▉ | 1701/5772 [2:58:57<11:12:06, 9.91s/it] {'loss': 0.4846, 'learning_rate': 1.65474505555002e-05, 'epoch': 0.29} + 29%|██▉ | 1701/5772 [2:59:04<11:12:05, 9.91s/it] {'loss': 0.4846, 'learning_rate': 1.65474505555002e-05, 'epoch': 0.29} + 29%|██▉ | 1701/5772 [2:58:57<11:12:06, 9.91s/it] 29%|██▉ | 1702/5772 [2:59:04<10:03:49, 8.90s/it] 29%|██▉ | 1702/5772 [2:59:11<10:03:50, 8.90s/it] {'loss': 0.4806, 'learning_rate': 1.6543207702311822e-05, 'epoch': 0.29} + 29%|██▉ | 1702/5772 [2:59:11<10:03:50, 8.90s/it] {'loss': 0.4806, 'learning_rate': 1.6543207702311822e-05, 'epoch': 0.29} + 29%|██▉ | 1702/5772 [2:59:04<10:03:49, 8.90s/it] 30%|██▉ | 1703/5772 [2:59:17<9:19:13, 8.25s/it] 30%|██▉ | 1703/5772 [2:59:10<9:19:14, 8.25s/it] {'loss': 0.4719, 'learning_rate': 1.6538962788376557e-05, 'epoch': 0.3} + 30%|██▉ | 1703/5772 [2:59:17<9:19:13, 8.25s/it] {'loss': 0.4719, 'learning_rate': 1.6538962788376557e-05, 'epoch': 0.3} + 30%|██▉ | 1703/5772 [2:59:10<9:19:14, 8.25s/it] 30%|██▉ | 1704/5772 [2:59:16<8:30:27, 7.53s/it] 30%|██▉ | 1704/5772 [2:59:23<8:30:28, 7.53s/it] {'loss': 0.4718, 'learning_rate': 1.6534715815031325e-05, 'epoch': 0.3} + 30%|██▉ | 1704/5772 [2:59:23<8:30:28, 7.53s/it] {'loss': 0.4718, 'learning_rate': 1.6534715815031325e-05, 'epoch': 0.3} + 30%|██▉ | 1704/5772 [2:59:16<8:30:27, 7.53s/it] 30%|██▉ | 1705/5772 [2:59:30<8:03:19, 7.13s/it] 30%|██▉ | 1705/5772 [2:59:22<8:03:19, 7.13s/it] {'loss': 0.4824, 'learning_rate': 1.6530466783613674e-05, 'epoch': 0.3} + 30%|██▉ | 1705/5772 [2:59:30<8:03:19, 7.13s/it] {'loss': 0.4824, 'learning_rate': 1.6530466783613674e-05, 'epoch': 0.3} + 30%|██▉ | 1705/5772 [2:59:22<8:03:19, 7.13s/it] 30%|██▉ | 1706/5772 [2:59:29<7:43:39, 6.84s/it] 30%|██▉ | 1706/5772 [2:59:36<7:43:39, 6.84s/it] {'loss': 0.4716, 'learning_rate': 1.652621569546182e-05, 'epoch': 0.3} + 30%|██▉ | 1706/5772 [2:59:36<7:43:39, 6.84s/it] {'loss': 0.4716, 'learning_rate': 1.652621569546182e-05, 'epoch': 0.3} + 30%|██▉ | 1706/5772 [2:59:29<7:43:39, 6.84s/it] 30%|██▉ | 1707/5772 [2:59:35<7:26:18, 6.59s/it] 30%|██▉ | 1707/5772 [2:59:42<7:26:19, 6.59s/it] {'loss': 0.4657, 'learning_rate': 1.652196255191462e-05, 'epoch': 0.3} + 30%|██▉ | 1707/5772 [2:59:42<7:26:19, 6.59s/it] {'loss': 0.4657, 'learning_rate': 1.652196255191462e-05, 'epoch': 0.3} + 30%|██▉ | 1707/5772 [2:59:35<7:26:18, 6.59s/it] 30%|██▉ | 1708/5772 [2:59:48<7:16:17, 6.44s/it] 30%|██▉ | 1708/5772 [2:59:41<7:16:17, 6.44s/it] {'loss': 0.482, 'learning_rate': 1.651770735431158e-05, 'epoch': 0.3} + 30%|██▉ | 1708/5772 [2:59:48<7:16:17, 6.44s/it] {'loss': 0.482, 'learning_rate': 1.651770735431158e-05, 'epoch': 0.3} + 30%|██▉ | 1708/5772 [2:59:41<7:16:17, 6.44s/it] 30%|██▉ | 1709/5772 [2:59:47<7:10:00, 6.35s/it] 30%|██▉ | 1709/5772 [2:59:54<7:10:00, 6.35s/it] {'loss': 0.4808, 'learning_rate': 1.6513450103992844e-05, 'epoch': 0.3} + 30%|██▉ | 1709/5772 [2:59:54<7:10:00, 6.35s/it] {'loss': 0.4808, 'learning_rate': 1.6513450103992844e-05, 'epoch': 0.3} + 30%|██▉ | 1709/5772 [2:59:47<7:10:00, 6.35s/it] 30%|██▉ | 1710/5772 [3:00:00<7:09:11, 6.34s/it] 30%|██▉ | 1710/5772 [2:59:53<7:09:11, 6.34s/it] {'loss': 0.4923, 'learning_rate': 1.650919080229921e-05, 'epoch': 0.3} + 30%|██▉ | 1710/5772 [3:00:00<7:09:11, 6.34s/it] {'loss': 0.4923, 'learning_rate': 1.650919080229921e-05, 'epoch': 0.3} + 30%|██▉ | 1710/5772 [2:59:53<7:09:11, 6.34s/it] 30%|██▉ | 1711/5772 [3:00:00<7:09:26, 6.34s/it] 30%|██▉ | 1711/5772 [3:00:07<7:09:27, 6.35s/it] {'loss': 0.495, 'learning_rate': 1.650492945057213e-05, 'epoch': 0.3} + 30%|██▉ | 1711/5772 [3:00:07<7:09:27, 6.35s/it] {'loss': 0.495, 'learning_rate': 1.650492945057213e-05, 'epoch': 0.3} + 30%|██▉ | 1711/5772 [3:00:00<7:09:26, 6.34s/it] 30%|██▉ | 1712/5772 [3:00:06<7:09:05, 6.34s/it] 30%|██▉ | 1712/5772 [3:00:13<7:09:05, 6.34s/it] {'loss': 0.4701, 'learning_rate': 1.6500666050153685e-05, 'epoch': 0.3} + 30%|██▉ | 1712/5772 [3:00:13<7:09:05, 6.34s/it] {'loss': 0.4701, 'learning_rate': 1.6500666050153685e-05, 'epoch': 0.3} + 30%|██▉ | 1712/5772 [3:00:06<7:09:05, 6.34s/it] 30%|██▉ | 1713/5772 [3:00:12<7:03:10, 6.26s/it] 30%|██▉ | 1713/5772 [3:00:19<7:03:10, 6.26s/it] {'loss': 0.4815, 'learning_rate': 1.649640060238661e-05, 'epoch': 0.3} + 30%|██▉ | 1713/5772 [3:00:19<7:03:10, 6.26s/it] {'loss': 0.4815, 'learning_rate': 1.649640060238661e-05, 'epoch': 0.3} + 30%|██▉ | 1713/5772 [3:00:12<7:03:10, 6.26s/it] 30%|██▉ | 1714/5772 [3:00:19<7:11:01, 6.37s/it] 30%|██▉ | 1714/5772 [3:00:26<7:11:02, 6.37s/it] {'loss': 0.472, 'learning_rate': 1.6492133108614284e-05, 'epoch': 0.3} + 30%|██▉ | 1714/5772 [3:00:26<7:11:02, 6.37s/it] {'loss': 0.472, 'learning_rate': 1.6492133108614284e-05, 'epoch': 0.3} + 30%|██▉ | 1714/5772 [3:00:19<7:11:01, 6.37s/it] 30%|██▉ | 1715/5772 [3:00:25<7:07:30, 6.32s/it] 30%|██▉ | 1715/5772 [3:00:32<7:07:30, 6.32s/it] {'loss': 0.4886, 'learning_rate': 1.6487863570180734e-05, 'epoch': 0.3} + 30%|██▉ | 1715/5772 [3:00:32<7:07:30, 6.32s/it] {'loss': 0.4886, 'learning_rate': 1.6487863570180734e-05, 'epoch': 0.3} + 30%|██▉ | 1715/5772 [3:00:25<7:07:30, 6.32s/it] 30%|██▉ | 1716/5772 [3:00:31<6:58:37, 6.19s/it] 30%|██▉ | 1716/5772 [3:00:38<6:58:37, 6.19s/it] {'loss': 0.4786, 'learning_rate': 1.6483591988430625e-05, 'epoch': 0.3} + 30%|██▉ | 1716/5772 [3:00:38<6:58:37, 6.19s/it] {'loss': 0.4786, 'learning_rate': 1.6483591988430625e-05, 'epoch': 0.3} + 30%|██▉ | 1716/5772 [3:00:31<6:58:37, 6.19s/it] 30%|██▉ | 1717/5772 [3:00:36<6:50:51, 6.08s/it] 30%|██▉ | 1717/5772 [3:00:44<6:50:52, 6.08s/it] {'loss': 0.4679, 'learning_rate': 1.6479318364709266e-05, 'epoch': 0.3} + 30%|██▉ | 1717/5772 [3:00:44<6:50:52, 6.08s/it] {'loss': 0.4679, 'learning_rate': 1.6479318364709266e-05, 'epoch': 0.3} + 30%|██▉ | 1717/5772 [3:00:36<6:50:51, 6.08s/it] 30%|██▉ | 1718/5772 [3:00:43<6:50:28, 6.08s/it] 30%|██▉ | 1718/5772 [3:00:50<6:50:28, 6.08s/it] {'loss': 0.4775, 'learning_rate': 1.647504270036262e-05, 'epoch': 0.3} + 30%|██▉ | 1718/5772 [3:00:50<6:50:28, 6.08s/it] {'loss': 0.4775, 'learning_rate': 1.647504270036262e-05, 'epoch': 0.3} + 30%|██▉ | 1718/5772 [3:00:43<6:50:28, 6.08s/it] 30%|██▉ | 1719/5772 [3:00:49<6:56:55, 6.17s/it] 30%|██▉ | 1719/5772 [3:00:56<6:56:55, 6.17s/it] {'loss': 0.4839, 'learning_rate': 1.647076499673727e-05, 'epoch': 0.3} + 30%|██▉ | 1719/5772 [3:00:56<6:56:55, 6.17s/it] {'loss': 0.4839, 'learning_rate': 1.647076499673727e-05, 'epoch': 0.3} + 30%|██▉ | 1719/5772 [3:00:49<6:56:55, 6.17s/it] 30%|██▉ | 1720/5772 [3:01:02<6:50:58, 6.09s/it] 30%|██▉ | 1720/5772 [3:00:55<6:50:59, 6.09s/it] {'loss': 0.4751, 'learning_rate': 1.6466485255180464e-05, 'epoch': 0.3} + 30%|██▉ | 1720/5772 [3:01:02<6:50:58, 6.09s/it] {'loss': 0.4751, 'learning_rate': 1.6466485255180464e-05, 'epoch': 0.3} + 30%|██▉ | 1720/5772 [3:00:55<6:50:59, 6.09s/it] 30%|██▉ | 1721/5772 [3:01:01<6:50:47, 6.08s/it] 30%|██▉ | 1721/5772 [3:01:08<6:50:47, 6.08s/it] {'loss': 0.4819, 'learning_rate': 1.646220347704008e-05, 'epoch': 0.3} + 30%|██▉ | 1721/5772 [3:01:08<6:50:47, 6.08s/it] {'loss': 0.4819, 'learning_rate': 1.646220347704008e-05, 'epoch': 0.3} + 30%|██▉ | 1721/5772 [3:01:01<6:50:47, 6.08s/it] 30%|██▉ | 1722/5772 [3:01:07<6:50:33, 6.08s/it] 30%|██▉ | 1722/5772 [3:01:14<6:50:34, 6.08s/it] {'loss': 0.4794, 'learning_rate': 1.645791966366464e-05, 'epoch': 0.3} + 30%|██▉ | 1722/5772 [3:01:14<6:50:34, 6.08s/it] {'loss': 0.4794, 'learning_rate': 1.645791966366464e-05, 'epoch': 0.3} + 30%|██▉ | 1722/5772 [3:01:07<6:50:33, 6.08s/it] 30%|██▉ | 1723/5772 [3:01:13<6:44:38, 6.00s/it] 30%|██▉ | 1723/5772 [3:01:20<6:44:37, 6.00s/it] {'loss': 0.481, 'learning_rate': 1.6453633816403312e-05, 'epoch': 0.3} + 30%|██▉ | 1723/5772 [3:01:20<6:44:37, 6.00s/it] {'loss': 0.481, 'learning_rate': 1.6453633816403312e-05, 'epoch': 0.3} + 30%|██▉ | 1723/5772 [3:01:13<6:44:38, 6.00s/it] 30%|██▉ | 1724/5772 [3:01:19<6:48:12, 6.05s/it] 30%|██▉ | 1724/5772 [3:01:26<6:48:12, 6.05s/it] {'loss': 0.4795, 'learning_rate': 1.6449345936605894e-05, 'epoch': 0.3} + 30%|██▉ | 1724/5772 [3:01:26<6:48:12, 6.05s/it] {'loss': 0.4795, 'learning_rate': 1.6449345936605894e-05, 'epoch': 0.3} + 30%|██▉ | 1724/5772 [3:01:19<6:48:12, 6.05s/it] 30%|██▉ | 1725/5772 [3:01:25<6:52:13, 6.11s/it] 30%|██▉ | 1725/5772 [3:01:32<6:52:14, 6.11s/it] {'loss': 0.4851, 'learning_rate': 1.644505602562283e-05, 'epoch': 0.3} + 30%|██▉ | 1725/5772 [3:01:32<6:52:14, 6.11s/it] {'loss': 0.4851, 'learning_rate': 1.644505602562283e-05, 'epoch': 0.3} + 30%|██▉ | 1725/5772 [3:01:25<6:52:13, 6.11s/it] 30%|██▉ | 1726/5772 [3:01:35<8:01:25, 7.14s/it] 30%|██▉ | 1726/5772 [3:01:42<8:01:24, 7.14s/it] {'loss': 0.4598, 'learning_rate': 1.6440764084805208e-05, 'epoch': 0.3} + 30%|██▉ | 1726/5772 [3:01:42<8:01:24, 7.14s/it] {'loss': 0.4598, 'learning_rate': 1.6440764084805208e-05, 'epoch': 0.3} + 30%|██▉ | 1726/5772 [3:01:35<8:01:25, 7.14s/it] 30%|██▉ | 1727/5772 [3:01:41<7:43:13, 6.87s/it] 30%|██▉ | 1727/5772 [3:01:48<7:43:12, 6.87s/it] {'loss': 0.4824, 'learning_rate': 1.6436470115504745e-05, 'epoch': 0.3} + 30%|██▉ | 1727/5772 [3:01:48<7:43:12, 6.87s/it] {'loss': 0.4824, 'learning_rate': 1.6436470115504745e-05, 'epoch': 0.3} + 30%|██▉ | 1727/5772 [3:01:41<7:43:13, 6.87s/it] 30%|██▉ | 1728/5772 [3:01:47<7:28:25, 6.65s/it] 30%|██▉ | 1728/5772 [3:01:54<7:28:25, 6.65s/it] {'loss': 0.4777, 'learning_rate': 1.643217411907381e-05, 'epoch': 0.3} + 30%|██▉ | 1728/5772 [3:01:54<7:28:25, 6.65s/it] {'loss': 0.4777, 'learning_rate': 1.643217411907381e-05, 'epoch': 0.3} + 30%|██▉ | 1728/5772 [3:01:47<7:28:25, 6.65s/it] 30%|██▉ | 1729/5772 [3:02:00<7:18:12, 6.50s/it] 30%|██▉ | 1729/5772 [3:01:53<7:18:12, 6.50s/it] {'loss': 0.4871, 'learning_rate': 1.6427876096865394e-05, 'epoch': 0.3} + 30%|██▉ | 1729/5772 [3:02:00<7:18:12, 6.50s/it] {'loss': 0.4871, 'learning_rate': 1.6427876096865394e-05, 'epoch': 0.3} + 30%|██▉ | 1729/5772 [3:01:53<7:18:12, 6.50s/it] 30%|██▉ | 1730/5772 [3:01:59<7:09:13, 6.37s/it] 30%|██▉ | 1730/5772 [3:02:06<7:09:13, 6.37s/it] {'loss': 0.4854, 'learning_rate': 1.6423576050233144e-05, 'epoch': 0.3} + 30%|██▉ | 1730/5772 [3:02:06<7:09:13, 6.37s/it] {'loss': 0.4854, 'learning_rate': 1.6423576050233144e-05, 'epoch': 0.3} + 30%|██▉ | 1730/5772 [3:01:59<7:09:13, 6.37s/it] 30%|██▉ | 1731/5772 [3:02:05<7:00:47, 6.25s/it] 30%|██▉ | 1731/5772 [3:02:12<7:00:47, 6.25s/it] {'loss': 0.4809, 'learning_rate': 1.6419273980531333e-05, 'epoch': 0.3} + 30%|██▉ | 1731/5772 [3:02:12<7:00:47, 6.25s/it] {'loss': 0.4809, 'learning_rate': 1.6419273980531333e-05, 'epoch': 0.3} + 30%|██▉ | 1731/5772 [3:02:05<7:00:47, 6.25s/it] 30%|███ | 1732/5772 [3:02:18<6:57:24, 6.20s/it] 30%|███ | 1732/5772 [3:02:11<6:57:24, 6.20s/it] {'loss': 0.4785, 'learning_rate': 1.6414969889114872e-05, 'epoch': 0.3} + 30%|███ | 1732/5772 [3:02:18<6:57:24, 6.20s/it] {'loss': 0.4785, 'learning_rate': 1.6414969889114872e-05, 'epoch': 0.3} + 30%|███ | 1732/5772 [3:02:11<6:57:24, 6.20s/it] 30%|███ | 1733/5772 [3:02:17<6:49:10, 6.08s/it] 30%|███ | 1733/5772 [3:02:24<6:49:10, 6.08s/it] {'loss': 0.4757, 'learning_rate': 1.641066377733931e-05, 'epoch': 0.3} + 30%|███ | 1733/5772 [3:02:24<6:49:10, 6.08s/it] {'loss': 0.4757, 'learning_rate': 1.641066377733931e-05, 'epoch': 0.3} + 30%|███ | 1733/5772 [3:02:17<6:49:10, 6.08s/it] 30%|███ | 1734/5772 [3:02:31<6:53:16, 6.14s/it] 30%|███ | 1734/5772 [3:02:23<6:53:16, 6.14s/it] {'loss': 0.4696, 'learning_rate': 1.6406355646560838e-05, 'epoch': 0.3} + 30%|███ | 1734/5772 [3:02:31<6:53:16, 6.14s/it] {'loss': 0.4696, 'learning_rate': 1.6406355646560838e-05, 'epoch': 0.3} + 30%|███ | 1734/5772 [3:02:23<6:53:16, 6.14s/it] 30%|███ | 1735/5772 [3:02:36<6:46:08, 6.04s/it] 30%|███ | 1735/5772 [3:02:29<6:46:08, 6.04s/it] {'loss': 0.493, 'learning_rate': 1.640204549813627e-05, 'epoch': 0.3} + 30%|███ | 1735/5772 [3:02:36<6:46:08, 6.04s/it] {'loss': 0.493, 'learning_rate': 1.640204549813627e-05, 'epoch': 0.3} + 30%|███ | 1735/5772 [3:02:29<6:46:08, 6.04s/it] 30%|███ | 1736/5772 [3:02:35<6:49:25, 6.09s/it] 30%|███ | 1736/5772 [3:02:43<6:49:25, 6.09s/it] {'loss': 0.4724, 'learning_rate': 1.6397733333423072e-05, 'epoch': 0.3} + 30%|███ | 1736/5772 [3:02:43<6:49:25, 6.09s/it] {'loss': 0.4724, 'learning_rate': 1.6397733333423072e-05, 'epoch': 0.3} + 30%|███ | 1736/5772 [3:02:35<6:49:25, 6.09s/it] 30%|███ | 1737/5772 [3:02:49<6:50:32, 6.10s/it] 30%|███ | 1737/5772 [3:02:42<6:50:32, 6.10s/it] {'loss': 0.4714, 'learning_rate': 1.639341915377933e-05, 'epoch': 0.3} + 30%|███ | 1737/5772 [3:02:49<6:50:32, 6.10s/it] {'loss': 0.4714, 'learning_rate': 1.639341915377933e-05, 'epoch': 0.3} + 30%|███ | 1737/5772 [3:02:42<6:50:32, 6.10s/it] 30%|███ | 1738/5772 [3:02:48<6:52:53, 6.14s/it] 30%|███ | 1738/5772 [3:02:55<6:52:53, 6.14s/it] {'loss': 0.4763, 'learning_rate': 1.6389102960563776e-05, 'epoch': 0.3} + 30%|███ | 1738/5772 [3:02:55<6:52:53, 6.14s/it] {'loss': 0.4763, 'learning_rate': 1.6389102960563776e-05, 'epoch': 0.3} + 30%|███ | 1738/5772 [3:02:48<6:52:53, 6.14s/it] 30%|███ | 1739/5772 [3:02:54<6:44:57, 6.02s/it] 30%|███ | 1739/5772 [3:03:01<6:44:57, 6.02s/it] {'loss': 0.4857, 'learning_rate': 1.6384784755135767e-05, 'epoch': 0.3} + 30%|███ | 1739/5772 [3:03:01<6:44:57, 6.02s/it] {'loss': 0.4857, 'learning_rate': 1.6384784755135767e-05, 'epoch': 0.3} + 30%|███ | 1739/5772 [3:02:54<6:44:57, 6.02s/it] 30%|███ | 1740/5772 [3:03:07<6:48:48, 6.08s/it] 30%|███ | 1740/5772 [3:03:00<6:48:49, 6.08s/it] {'loss': 0.4857, 'learning_rate': 1.63804645388553e-05, 'epoch': 0.3} + 30%|███ | 1740/5772 [3:03:07<6:48:48, 6.08s/it] {'loss': 0.4857, 'learning_rate': 1.63804645388553e-05, 'epoch': 0.3} + 30%|███ | 1740/5772 [3:03:00<6:48:49, 6.08s/it] 30%|███ | 1741/5772 [3:03:13<6:44:28, 6.02s/it] 30%|███ | 1741/5772 [3:03:06<6:44:28, 6.02s/it] {'loss': 0.4676, 'learning_rate': 1.6376142313083e-05, 'epoch': 0.3} + 30%|███ | 1741/5772 [3:03:13<6:44:28, 6.02s/it] {'loss': 0.4676, 'learning_rate': 1.6376142313083e-05, 'epoch': 0.3} + 30%|███ | 1741/5772 [3:03:06<6:44:28, 6.02s/it] 30%|███ | 1742/5772 [3:03:15<7:54:18, 7.06s/it] 30%|███ | 1742/5772 [3:03:22<7:54:19, 7.06s/it] {'loss': 0.4663, 'learning_rate': 1.6371818079180126e-05, 'epoch': 0.3} + 30%|███ | 1742/5772 [3:03:22<7:54:19, 7.06s/it] {'loss': 0.4663, 'learning_rate': 1.6371818079180126e-05, 'epoch': 0.3} + 30%|███ | 1742/5772 [3:03:15<7:54:18, 7.06s/it] 30%|███ | 1743/5772 [3:03:28<7:30:38, 6.71s/it] 30%|███ | 1743/5772 [3:03:21<7:30:38, 6.71s/it] {'loss': 0.4854, 'learning_rate': 1.636749183850858e-05, 'epoch': 0.3} + 30%|███ | 1743/5772 [3:03:28<7:30:38, 6.71s/it] {'loss': 0.4854, 'learning_rate': 1.636749183850858e-05, 'epoch': 0.3} + 30%|███ | 1743/5772 [3:03:21<7:30:38, 6.71s/it] 30%|███ | 1744/5772 [3:03:34<7:14:42, 6.48s/it] 30%|███ | 1744/5772 [3:03:27<7:14:43, 6.48s/it] {'loss': 0.475, 'learning_rate': 1.636316359243088e-05, 'epoch': 0.3} + 30%|███ | 1744/5772 [3:03:34<7:14:42, 6.48s/it] {'loss': 0.475, 'learning_rate': 1.636316359243088e-05, 'epoch': 0.3} + 30%|███ | 1744/5772 [3:03:27<7:14:43, 6.48s/it] 30%|███ | 1745/5772 [3:03:40<7:11:00, 6.42s/it] 30%|███ | 1745/5772 [3:03:33<7:11:00, 6.42s/it] {'loss': 0.4826, 'learning_rate': 1.6358833342310192e-05, 'epoch': 0.3} + 30%|███ | 1745/5772 [3:03:40<7:11:00, 6.42s/it] {'loss': 0.4826, 'learning_rate': 1.6358833342310192e-05, 'epoch': 0.3} + 30%|███ | 1745/5772 [3:03:33<7:11:00, 6.42s/it] 30%|███ | 1746/5772 [3:03:40<7:17:53, 6.53s/it] 30%|███ | 1746/5772 [3:03:47<7:17:54, 6.53s/it] {'loss': 0.4843, 'learning_rate': 1.635450108951029e-05, 'epoch': 0.3} + 30%|███ | 1746/5772 [3:03:47<7:17:54, 6.53s/it] {'loss': 0.4843, 'learning_rate': 1.635450108951029e-05, 'epoch': 0.3} + 30%|███ | 1746/5772 [3:03:40<7:17:53, 6.53s/it] 30%|███ | 1747/5772 [3:03:46<7:07:02, 6.37s/it] 30%|███ | 1747/5772 [3:03:53<7:07:02, 6.37s/it] {'loss': 0.4876, 'learning_rate': 1.6350166835395607e-05, 'epoch': 0.3} + 30%|███ | 1747/5772 [3:03:53<7:07:02, 6.37s/it] {'loss': 0.4876, 'learning_rate': 1.6350166835395607e-05, 'epoch': 0.3} + 30%|███ | 1747/5772 [3:03:46<7:07:02, 6.37s/it] 30%|███ | 1748/5772 [3:03:52<7:07:51, 6.38s/it] 30%|███ | 1748/5772 [3:04:00<7:07:53, 6.38s/it] {'loss': 0.4754, 'learning_rate': 1.6345830581331187e-05, 'epoch': 0.3} + 30%|███ | 1748/5772 [3:04:00<7:07:53, 6.38s/it] {'loss': 0.4754, 'learning_rate': 1.6345830581331187e-05, 'epoch': 0.3} + 30%|███ | 1748/5772 [3:03:52<7:07:51, 6.38s/it] 30%|███ | 1749/5772 [3:04:06<7:06:25, 6.36s/it] 30%|███ | 1749/5772 [3:03:59<7:06:26, 6.36s/it] {'loss': 0.4711, 'learning_rate': 1.6341492328682703e-05, 'epoch': 0.3} + 30%|███ | 1749/5772 [3:04:06<7:06:25, 6.36s/it] {'loss': 0.4711, 'learning_rate': 1.6341492328682703e-05, 'epoch': 0.3} + 30%|███ | 1749/5772 [3:03:59<7:06:26, 6.36s/it]13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1214 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 30%|███ | 1750/5772 [3:04:12<7:02:50, 6.31s/it]2 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 30%|███ | 1750/5772 [3:04:05<7:02:51, 6.31s/it]3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4811, 'learning_rate': 1.6337152078816475e-05, 'epoch': 0.3} + 30%|███ | 1750/5772 [3:04:12<7:02:50, 6.31s/it] {'loss': 0.4811, 'learning_rate': 1.6337152078816475e-05, 'epoch': 0.3} + 30%|███ | 1750/5772 [3:04:05<7:02:51, 6.31s/it] 30%|███ | 1751/5772 [3:04:18<7:00:25, 6.27s/it] 30%|███ | 1751/5772 [3:04:11<7:00:26, 6.27s/it] {'loss': 0.4787, 'learning_rate': 1.633280983309943e-05, 'epoch': 0.3} + 30%|███ | 1751/5772 [3:04:18<7:00:25, 6.27s/it] {'loss': 0.4787, 'learning_rate': 1.633280983309943e-05, 'epoch': 0.3} + 30%|███ | 1751/5772 [3:04:11<7:00:26, 6.27s/it] 30%|███ | 1752/5772 [3:04:18<7:09:25, 6.41s/it] 30%|███ | 1752/5772 [3:04:25<7:09:25, 6.41s/it] {'loss': 0.489, 'learning_rate': 1.6328465592899142e-05, 'epoch': 0.3} + 30%|███ | 1752/5772 [3:04:25<7:09:25, 6.41s/it] {'loss': 0.489, 'learning_rate': 1.6328465592899142e-05, 'epoch': 0.3} + 30%|███ | 1752/5772 [3:04:18<7:09:25, 6.41s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4901 > 4096). Running this sequence through the model will result in indexing errors + 30%|███ | 1753/5772 [3:04:24<6:59:18, 6.26s/it] 30%|███ | 1753/5772 [3:04:31<6:59:18, 6.26s/it] {'loss': 0.4747, 'learning_rate': 1.63241193595838e-05, 'epoch': 0.3} + 30%|███ | 1753/5772 [3:04:31<6:59:18, 6.26s/it] {'loss': 0.4747, 'learning_rate': 1.63241193595838e-05, 'epoch': 0.3} + 30%|███ | 1753/5772 [3:04:24<6:59:18, 6.26s/it] 30%|███ | 1754/5772 [3:04:37<6:53:02, 6.17s/it] 30%|███ | 1754/5772 [3:04:30<6:53:02, 6.17s/it] {'loss': 0.4961, 'learning_rate': 1.631977113452223e-05, 'epoch': 0.3} + 30%|███ | 1754/5772 [3:04:37<6:53:02, 6.17s/it] {'loss': 0.4961, 'learning_rate': 1.631977113452223e-05, 'epoch': 0.3} + 30%|███ | 1754/5772 [3:04:30<6:53:02, 6.17s/it] 30%|███ | 1755/5772 [3:04:43<6:51:41, 6.15s/it] 30%|███ | 1755/5772 [3:04:36<6:51:41, 6.15s/it] {'loss': 0.4681, 'learning_rate': 1.631542091908387e-05, 'epoch': 0.3} + 30%|███ | 1755/5772 [3:04:43<6:51:41, 6.15s/it] {'loss': 0.4681, 'learning_rate': 1.631542091908387e-05, 'epoch': 0.3} + 30%|███ | 1755/5772 [3:04:36<6:51:41, 6.15s/it] 30%|███ | 1756/5772 [3:04:52<7:48:39, 7.00s/it] 30%|███ | 1756/5772 [3:04:45<7:48:39, 7.00s/it] {'loss': 0.4746, 'learning_rate': 1.6311068714638817e-05, 'epoch': 0.3} + 30%|███ | 1756/5772 [3:04:52<7:48:39, 7.00s/it] {'loss': 0.4746, 'learning_rate': 1.6311068714638817e-05, 'epoch': 0.3} + 30%|███ | 1756/5772 [3:04:45<7:48:39, 7.00s/it] 30%|███ | 1757/5772 [3:04:51<7:38:59, 6.86s/it] 30%|███ | 1757/5772 [3:04:58<7:38:59, 6.86s/it] {'loss': 0.4848, 'learning_rate': 1.6306714522557755e-05, 'epoch': 0.3} + 30%|███ | 1757/5772 [3:04:58<7:38:59, 6.86s/it] {'loss': 0.4848, 'learning_rate': 1.6306714522557755e-05, 'epoch': 0.3} + 30%|███ | 1757/5772 [3:04:51<7:38:59, 6.86s/it] 30%|███ | 1758/5772 [3:04:57<7:19:20, 6.57s/it] 30%|███ | 1758/5772 [3:05:04<7:19:20, 6.57s/it] {'loss': 0.4704, 'learning_rate': 1.6302358344212025e-05, 'epoch': 0.3} + 30%|███ | 1758/5772 [3:05:04<7:19:20, 6.57s/it] {'loss': 0.4704, 'learning_rate': 1.6302358344212025e-05, 'epoch': 0.3} + 30%|███ | 1758/5772 [3:04:57<7:19:20, 6.57s/it] 30%|███ | 1759/5772 [3:05:03<7:06:18, 6.37s/it] 30%|███ | 1759/5772 [3:05:10<7:06:18, 6.37s/it] {'loss': 0.5019, 'learning_rate': 1.6298000180973572e-05, 'epoch': 0.3} + 30%|███ | 1759/5772 [3:05:10<7:06:18, 6.37s/it] {'loss': 0.5019, 'learning_rate': 1.6298000180973572e-05, 'epoch': 0.3} + 30%|███ | 1759/5772 [3:05:03<7:06:18, 6.37s/it] 30%|███ | 1760/5772 [3:05:16<7:01:44, 6.31s/it] 30%|███ | 1760/5772 [3:05:09<7:01:44, 6.31s/it] {'loss': 0.4716, 'learning_rate': 1.629364003421498e-05, 'epoch': 0.3} + 30%|███ | 1760/5772 [3:05:16<7:01:44, 6.31s/it] {'loss': 0.4716, 'learning_rate': 1.629364003421498e-05, 'epoch': 0.3} + 30%|███ | 1760/5772 [3:05:09<7:01:44, 6.31s/it] 31%|███ | 1761/5772 [3:05:16<7:01:37, 6.31s/it] 31%|███ | 1761/5772 [3:05:23<7:01:37, 6.31s/it] {'loss': 0.4922, 'learning_rate': 1.628927790530945e-05, 'epoch': 0.31} + 31%|███ | 1761/5772 [3:05:23<7:01:37, 6.31s/it] {'loss': 0.4922, 'learning_rate': 1.628927790530945e-05, 'epoch': 0.31} + 31%|███ | 1761/5772 [3:05:16<7:01:37, 6.31s/it] 31%|███ | 1762/5772 [3:05:25<8:01:57, 7.21s/it] 31%|███ | 1762/5772 [3:05:32<8:01:57, 7.21s/it] {'loss': 0.4904, 'learning_rate': 1.6284913795630813e-05, 'epoch': 0.31} + 31%|███ | 1762/5772 [3:05:32<8:01:57, 7.21s/it] {'loss': 0.4904, 'learning_rate': 1.6284913795630813e-05, 'epoch': 0.31} + 31%|███ | 1762/5772 [3:05:25<8:01:57, 7.21s/it] 31%|███ | 1763/5772 [3:05:31<7:44:52, 6.96s/it] 31%|███ | 1763/5772 [3:05:38<7:44:52, 6.96s/it] {'loss': 0.4808, 'learning_rate': 1.6280547706553525e-05, 'epoch': 0.31} + 31%|███ | 1763/5772 [3:05:38<7:44:52, 6.96s/it] {'loss': 0.4808, 'learning_rate': 1.6280547706553525e-05, 'epoch': 0.31} + 31%|███ | 1763/5772 [3:05:31<7:44:52, 6.96s/it] 31%|███ | 1764/5772 [3:05:44<7:22:53, 6.63s/it] 31%|███ | 1764/5772 [3:05:37<7:22:53, 6.63s/it] {'loss': 0.4619, 'learning_rate': 1.6276179639452654e-05, 'epoch': 0.31} + 31%|███ | 1764/5772 [3:05:44<7:22:53, 6.63s/it] {'loss': 0.4619, 'learning_rate': 1.6276179639452654e-05, 'epoch': 0.31} + 31%|███ | 1764/5772 [3:05:37<7:22:53, 6.63s/it] 31%|███ | 1765/5772 [3:05:55<10:59:07, 9.87s/it] 31%|███ | 1765/5772 [3:06:02<10:59:08, 9.87s/it] {'loss': 0.4895, 'learning_rate': 1.62718095957039e-05, 'epoch': 0.31} + 31%|███ | 1765/5772 [3:06:02<10:59:08, 9.87s/it] {'loss': 0.4895, 'learning_rate': 1.62718095957039e-05, 'epoch': 0.31} + 31%|███ | 1765/5772 [3:05:55<10:59:07, 9.87s/it] 31%|███ | 1766/5772 [3:06:01<9:43:49, 8.74s/it] 31%|███ | 1766/5772 [3:06:08<9:43:49, 8.74s/it] {'loss': 0.4717, 'learning_rate': 1.6267437576683585e-05, 'epoch': 0.31} + 31%|███ | 1766/5772 [3:06:08<9:43:49, 8.74s/it] {'loss': 0.4717, 'learning_rate': 1.6267437576683585e-05, 'epoch': 0.31} + 31%|███ | 1766/5772 [3:06:01<9:43:49, 8.74s/it] 31%|███ | 1767/5772 [3:06:09<9:37:50, 8.66s/it] 31%|███ | 1767/5772 [3:06:16<9:37:50, 8.66s/it] {'loss': 0.4779, 'learning_rate': 1.6263063583768652e-05, 'epoch': 0.31} + 31%|███ | 1767/5772 [3:06:16<9:37:50, 8.66s/it] {'loss': 0.4779, 'learning_rate': 1.6263063583768652e-05, 'epoch': 0.31} + 31%|███ | 1767/5772 [3:06:09<9:37:50, 8.66s/it] 31%|███ | 1768/5772 [3:06:15<8:43:23, 7.84s/it] 31%|███ | 1768/5772 [3:06:22<8:43:23, 7.84s/it] {'loss': 0.4694, 'learning_rate': 1.625868761833667e-05, 'epoch': 0.31} + 31%|███ | 1768/5772 [3:06:22<8:43:23, 7.84s/it] {'loss': 0.4694, 'learning_rate': 1.625868761833667e-05, 'epoch': 0.31} + 31%|███ | 1768/5772 [3:06:15<8:43:23, 7.84s/it] 31%|███ | 1769/5772 [3:06:21<8:08:13, 7.32s/it] 31%|███ | 1769/5772 [3:06:28<8:08:14, 7.32s/it] {'loss': 0.4825, 'learning_rate': 1.6254309681765814e-05, 'epoch': 0.31} + 31%|███ | 1769/5772 [3:06:28<8:08:14, 7.32s/it] {'loss': 0.4825, 'learning_rate': 1.6254309681765814e-05, 'epoch': 0.31} + 31%|███ | 1769/5772 [3:06:21<8:08:13, 7.32s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 31%|███ | 1770/5772 [3:06:31<8:56:09, 8.04s/it] 31%|███ | 1770/5772 [3:06:38<8:56:09, 8.04s/it] {'loss': 0.4637, 'learning_rate': 1.6249929775434903e-05, 'epoch': 0.31} + 31%|███ | 1770/5772 [3:06:38<8:56:09, 8.04s/it] {'loss': 0.4637, 'learning_rate': 1.6249929775434903e-05, 'epoch': 0.31} + 31%|███ | 1770/5772 [3:06:31<8:56:09, 8.04s/it] 31%|███ | 1771/5772 [3:06:37<8:16:39, 7.45s/it] 31%|███ | 1771/5772 [3:06:44<8:16:39, 7.45s/it] {'loss': 0.4858, 'learning_rate': 1.624554790072336e-05, 'epoch': 0.31} + 31%|███ | 1771/5772 [3:06:44<8:16:39, 7.45s/it] {'loss': 0.4858, 'learning_rate': 1.624554790072336e-05, 'epoch': 0.31} + 31%|███ | 1771/5772 [3:06:37<8:16:39, 7.45s/it] 31%|███ | 1772/5772 [3:06:50<10:02:15, 9.03s/it] 31%|███ | 1772/5772 [3:06:57<10:02:15, 9.03s/it] {'loss': 0.4688, 'learning_rate': 1.6241164059011228e-05, 'epoch': 0.31} + 31%|███ | 1772/5772 [3:06:57<10:02:15, 9.03s/it] {'loss': 0.4688, 'learning_rate': 1.6241164059011228e-05, 'epoch': 0.31} + 31%|███ | 1772/5772 [3:06:50<10:02:15, 9.03s/it] 31%|███ | 1773/5772 [3:06:56<9:07:06, 8.21s/it] 31%|███ | 1773/5772 [3:07:03<9:07:06, 8.21s/it] {'loss': 0.4832, 'learning_rate': 1.6236778251679177e-05, 'epoch': 0.31} + 31%|███ | 1773/5772 [3:07:03<9:07:06, 8.21s/it] {'loss': 0.4832, 'learning_rate': 1.6236778251679177e-05, 'epoch': 0.31} + 31%|███ | 1773/5772 [3:06:56<9:07:06, 8.21s/it] 31%|███ | 1774/5772 [3:07:06<9:32:22, 8.59s/it] 31%|███ | 1774/5772 [3:07:13<9:32:23, 8.59s/it] {'loss': 0.4774, 'learning_rate': 1.6232390480108493e-05, 'epoch': 0.31} + 31%|███ | 1774/5772 [3:07:13<9:32:23, 8.59s/it] {'loss': 0.4774, 'learning_rate': 1.6232390480108493e-05, 'epoch': 0.31} + 31%|███ | 1774/5772 [3:07:06<9:32:22, 8.59s/it] 31%|███ | 1775/5772 [3:07:15<9:43:35, 8.76s/it] 31%|███ | 1775/5772 [3:07:22<9:43:34, 8.76s/it] {'loss': 0.4782, 'learning_rate': 1.6228000745681082e-05, 'epoch': 0.31} + 31%|███ | 1775/5772 [3:07:22<9:43:34, 8.76s/it] {'loss': 0.4782, 'learning_rate': 1.6228000745681082e-05, 'epoch': 0.31} + 31%|███ | 1775/5772 [3:07:15<9:43:35, 8.76s/it] 31%|███ | 1776/5772 [3:07:21<8:55:38, 8.04s/it] 31%|███ | 1776/5772 [3:07:28<8:55:38, 8.04s/it] {'loss': 0.4663, 'learning_rate': 1.622360904977946e-05, 'epoch': 0.31} + 31%|███ | 1776/5772 [3:07:28<8:55:38, 8.04s/it] {'loss': 0.4663, 'learning_rate': 1.622360904977946e-05, 'epoch': 0.31} + 31%|███ | 1776/5772 [3:07:21<8:55:38, 8.04s/it] 31%|███ | 1777/5772 [3:07:30<9:10:15, 8.26s/it] 31%|███ | 1777/5772 [3:07:37<9:10:15, 8.26s/it] {'loss': 0.4862, 'learning_rate': 1.6219215393786772e-05, 'epoch': 0.31} + 31%|███ | 1777/5772 [3:07:37<9:10:15, 8.26s/it] {'loss': 0.4862, 'learning_rate': 1.6219215393786772e-05, 'epoch': 0.31} + 31%|███ | 1777/5772 [3:07:30<9:10:15, 8.26s/it] 31%|███ | 1778/5772 [3:07:36<8:27:03, 7.62s/it] 31%|███ | 1778/5772 [3:07:43<8:27:03, 7.62s/it] {'loss': 0.4758, 'learning_rate': 1.6214819779086774e-05, 'epoch': 0.31} + 31%|███ | 1778/5772 [3:07:43<8:27:03, 7.62s/it] {'loss': 0.4758, 'learning_rate': 1.6214819779086774e-05, 'epoch': 0.31} + 31%|███ | 1778/5772 [3:07:36<8:27:03, 7.62s/it] 31%|███ | 1779/5772 [3:07:42<7:54:38, 7.13s/it] 31%|███ | 1779/5772 [3:07:49<7:54:38, 7.13s/it] {'loss': 0.4686, 'learning_rate': 1.621042220706384e-05, 'epoch': 0.31} + 31%|███ | 1779/5772 [3:07:49<7:54:38, 7.13s/it] {'loss': 0.4686, 'learning_rate': 1.621042220706384e-05, 'epoch': 0.31} + 31%|███ | 1779/5772 [3:07:42<7:54:38, 7.13s/it] 31%|███ | 1780/5772 [3:07:48<7:41:11, 6.93s/it] 31%|███ | 1780/5772 [3:07:55<7:41:11, 6.93s/it] {'loss': 0.4765, 'learning_rate': 1.6206022679102967e-05, 'epoch': 0.31} + 31%|███ | 1780/5772 [3:07:55<7:41:11, 6.93s/it] {'loss': 0.4765, 'learning_rate': 1.6206022679102967e-05, 'epoch': 0.31} + 31%|███ | 1780/5772 [3:07:48<7:41:11, 6.93s/it] 31%|███ | 1781/5772 [3:07:55<7:33:39, 6.82s/it] 31%|███ | 1781/5772 [3:08:02<7:33:39, 6.82s/it] {'loss': 0.4844, 'learning_rate': 1.6201621196589755e-05, 'epoch': 0.31} + 31%|███ | 1781/5772 [3:08:02<7:33:39, 6.82s/it] {'loss': 0.4844, 'learning_rate': 1.6201621196589755e-05, 'epoch': 0.31} + 31%|███ | 1781/5772 [3:07:55<7:33:39, 6.82s/it] 31%|███ | 1782/5772 [3:08:01<7:19:19, 6.61s/it] 31%|███ | 1782/5772 [3:08:08<7:19:19, 6.61s/it] {'loss': 0.4676, 'learning_rate': 1.6197217760910426e-05, 'epoch': 0.31} + 31%|███ | 1782/5772 [3:08:08<7:19:19, 6.61s/it] {'loss': 0.4676, 'learning_rate': 1.6197217760910426e-05, 'epoch': 0.31} + 31%|███ | 1782/5772 [3:08:01<7:19:19, 6.61s/it] 31%|███ | 1783/5772 [3:08:09<7:54:48, 7.14s/it] 31%|███ | 1783/5772 [3:08:17<7:54:48, 7.14s/it] {'loss': 0.4866, 'learning_rate': 1.619281237345182e-05, 'epoch': 0.31} + 31%|███ | 1783/5772 [3:08:17<7:54:48, 7.14s/it] {'loss': 0.4866, 'learning_rate': 1.619281237345182e-05, 'epoch': 0.31} + 31%|███ | 1783/5772 [3:08:09<7:54:48, 7.14s/it] 31%|███ | 1784/5772 [3:08:16<7:34:31, 6.84s/it] 31%|███ | 1784/5772 [3:08:23<7:34:31, 6.84s/it] {'loss': 0.4763, 'learning_rate': 1.618840503560139e-05, 'epoch': 0.31} + 31%|███ | 1784/5772 [3:08:23<7:34:31, 6.84s/it] {'loss': 0.4763, 'learning_rate': 1.618840503560139e-05, 'epoch': 0.31} + 31%|███ | 1784/5772 [3:08:16<7:34:31, 6.84s/it] 31%|███ | 1785/5772 [3:08:22<7:16:57, 6.58s/it] 31%|███ | 1785/5772 [3:08:29<7:16:57, 6.58s/it] {'loss': 0.4792, 'learning_rate': 1.6183995748747204e-05, 'epoch': 0.31} + 31%|███ | 1785/5772 [3:08:29<7:16:57, 6.58s/it] {'loss': 0.4792, 'learning_rate': 1.6183995748747204e-05, 'epoch': 0.31} + 31%|███ | 1785/5772 [3:08:22<7:16:57, 6.58s/it] 31%|███ | 1786/5772 [3:08:28<7:06:59, 6.43s/it] 31%|███ | 1786/5772 [3:08:35<7:06:59, 6.43s/it] {'loss': 0.4864, 'learning_rate': 1.6179584514277937e-05, 'epoch': 0.31} + 31%|███ | 1786/5772 [3:08:35<7:06:59, 6.43s/it] {'loss': 0.4864, 'learning_rate': 1.6179584514277937e-05, 'epoch': 0.31} + 31%|███ | 1786/5772 [3:08:28<7:06:59, 6.43s/it] 31%|███ | 1787/5772 [3:08:34<7:03:36, 6.38s/it] 31%|███ | 1787/5772 [3:08:41<7:03:36, 6.38s/it] {'loss': 0.4832, 'learning_rate': 1.6175171333582887e-05, 'epoch': 0.31} + 31%|███ | 1787/5772 [3:08:41<7:03:36, 6.38s/it] {'loss': 0.4832, 'learning_rate': 1.6175171333582887e-05, 'epoch': 0.31} + 31%|███ | 1787/5772 [3:08:34<7:03:36, 6.38s/it] 31%|███ | 1788/5772 [3:08:40<6:54:51, 6.25s/it] 31%|███ | 1788/5772 [3:08:47<6:54:51, 6.25s/it] {'loss': 0.4923, 'learning_rate': 1.617075620805196e-05, 'epoch': 0.31} + 31%|███ | 1788/5772 [3:08:47<6:54:51, 6.25s/it] {'loss': 0.4923, 'learning_rate': 1.617075620805196e-05, 'epoch': 0.31} + 31%|███ | 1788/5772 [3:08:40<6:54:51, 6.25s/it] 31%|███ | 1789/5772 [3:08:46<6:58:01, 6.30s/it] 31%|███ | 1789/5772 [3:08:53<6:58:01, 6.30s/it] {'loss': 0.461, 'learning_rate': 1.6166339139075676e-05, 'epoch': 0.31} + 31%|███ | 1789/5772 [3:08:53<6:58:01, 6.30s/it] {'loss': 0.461, 'learning_rate': 1.6166339139075676e-05, 'epoch': 0.31} + 31%|███ | 1789/5772 [3:08:46<6:58:01, 6.30s/it] 31%|███ | 1790/5772 [3:08:53<6:57:20, 6.29s/it] 31%|███ | 1790/5772 [3:09:00<6:57:20, 6.29s/it] {'loss': 0.4977, 'learning_rate': 1.616192012804516e-05, 'epoch': 0.31} + 31%|███ | 1790/5772 [3:09:00<6:57:20, 6.29s/it] {'loss': 0.4977, 'learning_rate': 1.616192012804516e-05, 'epoch': 0.31} + 31%|███ | 1790/5772 [3:08:53<6:57:20, 6.29s/it] 31%|███ | 1791/5772 [3:08:59<6:53:54, 6.24s/it] 31%|███ | 1791/5772 [3:09:06<6:53:53, 6.24s/it] {'loss': 0.4786, 'learning_rate': 1.6157499176352164e-05, 'epoch': 0.31} + 31%|███ | 1791/5772 [3:09:06<6:53:53, 6.24s/it] {'loss': 0.4786, 'learning_rate': 1.6157499176352164e-05, 'epoch': 0.31} + 31%|███ | 1791/5772 [3:08:59<6:53:54, 6.24s/it] 31%|███ | 1792/5772 [3:09:05<6:46:19, 6.13s/it] 31%|███ | 1792/5772 [3:09:12<6:46:19, 6.13s/it] {'loss': 0.47, 'learning_rate': 1.6153076285389036e-05, 'epoch': 0.31} + 31%|███ | 1792/5772 [3:09:12<6:46:19, 6.13s/it] {'loss': 0.47, 'learning_rate': 1.6153076285389036e-05, 'epoch': 0.31} + 31%|███ | 1792/5772 [3:09:05<6:46:19, 6.13s/it] 31%|███ | 1793/5772 [3:09:11<6:48:30, 6.16s/it] 31%|███ | 1793/5772 [3:09:18<6:48:30, 6.16s/it] {'loss': 0.4768, 'learning_rate': 1.614865145654875e-05, 'epoch': 0.31} + 31%|███ | 1793/5772 [3:09:18<6:48:30, 6.16s/it] {'loss': 0.4768, 'learning_rate': 1.614865145654875e-05, 'epoch': 0.31} + 31%|███ | 1793/5772 [3:09:11<6:48:30, 6.16s/it] 31%|███ | 1794/5772 [3:09:17<6:55:09, 6.26s/it] 31%|███ | 1794/5772 [3:09:24<6:55:09, 6.26s/it] {'loss': 0.4867, 'learning_rate': 1.6144224691224868e-05, 'epoch': 0.31} + 31%|███ | 1794/5772 [3:09:24<6:55:09, 6.26s/it] {'loss': 0.4867, 'learning_rate': 1.6144224691224868e-05, 'epoch': 0.31} + 31%|███ | 1794/5772 [3:09:17<6:55:09, 6.26s/it] 31%|███ | 1795/5772 [3:09:24<7:03:08, 6.38s/it] 31%|███ | 1795/5772 [3:09:31<7:03:08, 6.38s/it] {'loss': 0.4725, 'learning_rate': 1.6139795990811583e-05, 'epoch': 0.31} + 31%|███ | 1795/5772 [3:09:31<7:03:08, 6.38s/it] {'loss': 0.4725, 'learning_rate': 1.6139795990811583e-05, 'epoch': 0.31} + 31%|███ | 1795/5772 [3:09:24<7:03:08, 6.38s/it] 31%|███ | 1796/5772 [3:09:30<6:55:52, 6.28s/it] 31%|███ | 1796/5772 [3:09:37<6:55:52, 6.28s/it] {'loss': 0.4866, 'learning_rate': 1.613536535670369e-05, 'epoch': 0.31} + 31%|███ | 1796/5772 [3:09:37<6:55:52, 6.28s/it] {'loss': 0.4866, 'learning_rate': 1.613536535670369e-05, 'epoch': 0.31} + 31%|███ | 1796/5772 [3:09:30<6:55:52, 6.28s/it] 31%|███ | 1797/5772 [3:09:36<6:54:59, 6.26s/it] 31%|███ | 1797/5772 [3:09:43<6:54:59, 6.26s/it] {'loss': 0.475, 'learning_rate': 1.6130932790296586e-05, 'epoch': 0.31} + 31%|███ | 1797/5772 [3:09:43<6:54:59, 6.26s/it] {'loss': 0.475, 'learning_rate': 1.6130932790296586e-05, 'epoch': 0.31} + 31%|███ | 1797/5772 [3:09:36<6:54:59, 6.26s/it] 31%|███ | 1798/5772 [3:09:42<6:50:27, 6.20s/it] 31%|███ | 1798/5772 [3:09:49<6:50:27, 6.20s/it] {'loss': 0.4841, 'learning_rate': 1.612649829298629e-05, 'epoch': 0.31} + 31%|███ | 1798/5772 [3:09:49<6:50:27, 6.20s/it] {'loss': 0.4841, 'learning_rate': 1.612649829298629e-05, 'epoch': 0.31} + 31%|███ | 1798/5772 [3:09:42<6:50:27, 6.20s/it] 31%|███ | 1799/5772 [3:09:48<6:48:21, 6.17s/it] 31%|███ | 1799/5772 [3:09:55<6:48:21, 6.17s/it] {'loss': 0.4907, 'learning_rate': 1.612206186616942e-05, 'epoch': 0.31} + 31%|███ | 1799/5772 [3:09:55<6:48:21, 6.17s/it] {'loss': 0.4907, 'learning_rate': 1.612206186616942e-05, 'epoch': 0.31} + 31%|███ | 1799/5772 [3:09:48<6:48:21, 6.17s/it]7 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +1214 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1110 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 31%|███ | 1800/5772 [3:09:54<6:47:17, 6.15s/it] 31%|███ | 1800/5772 [3:10:01<6:47:17, 6.15s/it]15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4836, 'learning_rate': 1.6117623511243204e-05, 'epoch': 0.31} + 31%|███ | 1800/5772 [3:10:02<6:47:17, 6.15s/it] {'loss': 0.4836, 'learning_rate': 1.6117623511243204e-05, 'epoch': 0.31} + 31%|███ | 1800/5772 [3:09:54<6:47:17, 6.15s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 31%|███ | 1801/5772 [3:10:13<10:52:13, 9.85s/it] 31%|███ | 1801/5772 [3:10:20<10:52:13, 9.85s/it] {'loss': 0.4662, 'learning_rate': 1.611318322960548e-05, 'epoch': 0.31} + 31%|███ | 1801/5772 [3:10:20<10:52:13, 9.85s/it] {'loss': 0.4662, 'learning_rate': 1.611318322960548e-05, 'epoch': 0.31} + 31%|███ | 1801/5772 [3:10:13<10:52:13, 9.85s/it] 31%|███ | 1802/5772 [3:10:19<9:41:27, 8.79s/it] 31%|███ | 1802/5772 [3:10:26<9:41:27, 8.79s/it] {'loss': 0.4805, 'learning_rate': 1.6108741022654685e-05, 'epoch': 0.31} + 31%|███ | 1802/5772 [3:10:26<9:41:27, 8.79s/it] {'loss': 0.4805, 'learning_rate': 1.6108741022654685e-05, 'epoch': 0.31} + 31%|███ | 1802/5772 [3:10:19<9:41:27, 8.79s/it] 31%|███ | 1803/5772 [3:10:25<8:46:45, 7.96s/it] 31%|███ | 1803/5772 [3:10:32<8:46:45, 7.96s/it] {'loss': 0.4682, 'learning_rate': 1.6104296891789867e-05, 'epoch': 0.31} + 31%|███ | 1803/5772 [3:10:32<8:46:45, 7.96s/it] {'loss': 0.4682, 'learning_rate': 1.6104296891789867e-05, 'epoch': 0.31} + 31%|███ | 1803/5772 [3:10:25<8:46:45, 7.96s/it] 31%|███▏ | 1804/5772 [3:10:38<8:06:49, 7.36s/it] 31%|███▏ | 1804/5772 [3:10:31<8:06:50, 7.36s/it] {'loss': 0.4861, 'learning_rate': 1.6099850838410685e-05, 'epoch': 0.31} + 31%|███▏ | 1804/5772 [3:10:38<8:06:49, 7.36s/it] {'loss': 0.4861, 'learning_rate': 1.6099850838410685e-05, 'epoch': 0.31} + 31%|███▏ | 1804/5772 [3:10:31<8:06:50, 7.36s/it] 31%|███▏ | 1805/5772 [3:10:45<7:46:43, 7.06s/it] 31%|███▏ | 1805/5772 [3:10:38<7:46:43, 7.06s/it] {'loss': 0.4799, 'learning_rate': 1.6095402863917398e-05, 'epoch': 0.31} + 31%|███▏ | 1805/5772 [3:10:45<7:46:43, 7.06s/it] {'loss': 0.4799, 'learning_rate': 1.6095402863917398e-05, 'epoch': 0.31} + 31%|███▏ | 1805/5772 [3:10:38<7:46:43, 7.06s/it] 31%|███▏ | 1806/5772 [3:10:50<7:21:04, 6.67s/it] 31%|███▏ | 1806/5772 [3:10:43<7:21:04, 6.67s/it] {'loss': 0.4819, 'learning_rate': 1.6090952969710868e-05, 'epoch': 0.31} + 31%|███▏ | 1806/5772 [3:10:50<7:21:04, 6.67s/it] {'loss': 0.4819, 'learning_rate': 1.6090952969710868e-05, 'epoch': 0.31} + 31%|███▏ | 1806/5772 [3:10:43<7:21:04, 6.67s/it] 31%|███▏ | 1807/5772 [3:10:57<7:09:26, 6.50s/it] 31%|███▏ | 1807/5772 [3:10:49<7:09:26, 6.50s/it] {'loss': 0.4732, 'learning_rate': 1.608650115719257e-05, 'epoch': 0.31} + 31%|███▏ | 1807/5772 [3:10:57<7:09:26, 6.50s/it] {'loss': 0.4732, 'learning_rate': 1.608650115719257e-05, 'epoch': 0.31} + 31%|███▏ | 1807/5772 [3:10:49<7:09:26, 6.50s/it] 31%|███▏ | 1808/5772 [3:11:02<6:58:48, 6.34s/it] 31%|███▏ | 1808/5772 [3:10:55<6:58:48, 6.34s/it] {'loss': 0.4735, 'learning_rate': 1.6082047427764572e-05, 'epoch': 0.31} + 31%|███▏ | 1808/5772 [3:11:02<6:58:48, 6.34s/it] {'loss': 0.4735, 'learning_rate': 1.6082047427764572e-05, 'epoch': 0.31} + 31%|███▏ | 1808/5772 [3:10:55<6:58:48, 6.34s/it] 31%|███▏ | 1809/5772 [3:11:09<7:01:31, 6.38s/it] 31%|███▏ | 1809/5772 [3:11:02<7:01:31, 6.38s/it] {'loss': 0.4878, 'learning_rate': 1.607759178282955e-05, 'epoch': 0.31} + 31%|███▏ | 1809/5772 [3:11:09<7:01:31, 6.38s/it] {'loss': 0.4878, 'learning_rate': 1.607759178282955e-05, 'epoch': 0.31} + 31%|███▏ | 1809/5772 [3:11:02<7:01:31, 6.38s/it] 31%|███▏ | 1810/5772 [3:11:08<7:05:08, 6.44s/it] 31%|███▏ | 1810/5772 [3:11:16<7:05:08, 6.44s/it] {'loss': 0.4873, 'learning_rate': 1.607313422379079e-05, 'epoch': 0.31} + 31%|███▏ | 1810/5772 [3:11:16<7:05:08, 6.44s/it] {'loss': 0.4873, 'learning_rate': 1.607313422379079e-05, 'epoch': 0.31} + 31%|███▏ | 1810/5772 [3:11:08<7:05:08, 6.44s/it] 31%|███▏ | 1811/5772 [3:11:15<7:03:03, 6.41s/it] 31%|███▏ | 1811/5772 [3:11:22<7:03:03, 6.41s/it] {'loss': 0.4742, 'learning_rate': 1.6068674752052168e-05, 'epoch': 0.31} + 31%|███▏ | 1811/5772 [3:11:22<7:03:03, 6.41s/it] {'loss': 0.4742, 'learning_rate': 1.6068674752052168e-05, 'epoch': 0.31} + 31%|███▏ | 1811/5772 [3:11:15<7:03:03, 6.41s/it] 31%|███▏ | 1812/5772 [3:11:21<6:56:38, 6.31s/it] 31%|███▏ | 1812/5772 [3:11:28<6:56:39, 6.31s/it] {'loss': 0.4871, 'learning_rate': 1.606421336901818e-05, 'epoch': 0.31} + 31%|███▏ | 1812/5772 [3:11:28<6:56:39, 6.31s/it] {'loss': 0.4871, 'learning_rate': 1.606421336901818e-05, 'epoch': 0.31} + 31%|███▏ | 1812/5772 [3:11:21<6:56:38, 6.31s/it] 31%|███▏ | 1813/5772 [3:11:27<6:50:29, 6.22s/it] 31%|███▏ | 1813/5772 [3:11:34<6:50:30, 6.22s/it] {'loss': 0.4788, 'learning_rate': 1.605975007609391e-05, 'epoch': 0.31} + 31%|███▏ | 1813/5772 [3:11:34<6:50:30, 6.22s/it] {'loss': 0.4788, 'learning_rate': 1.605975007609391e-05, 'epoch': 0.31} + 31%|███▏ | 1813/5772 [3:11:27<6:50:29, 6.22s/it] 31%|███▏ | 1814/5772 [3:11:33<6:42:41, 6.10s/it] 31%|███▏ | 1814/5772 [3:11:40<6:42:41, 6.10s/it] {'loss': 0.4738, 'learning_rate': 1.605528487468504e-05, 'epoch': 0.31} + 31%|███▏ | 1814/5772 [3:11:40<6:42:41, 6.10s/it] {'loss': 0.4738, 'learning_rate': 1.605528487468504e-05, 'epoch': 0.31} + 31%|███▏ | 1814/5772 [3:11:33<6:42:41, 6.10s/it] 31%|███▏ | 1815/5772 [3:11:46<6:41:18, 6.08s/it] 31%|███▏ | 1815/5772 [3:11:39<6:41:18, 6.08s/it] {'loss': 0.4719, 'learning_rate': 1.605081776619787e-05, 'epoch': 0.31} + 31%|███▏ | 1815/5772 [3:11:46<6:41:18, 6.08s/it] {'loss': 0.4719, 'learning_rate': 1.605081776619787e-05, 'epoch': 0.31} + 31%|███▏ | 1815/5772 [3:11:39<6:41:18, 6.08s/it] 31%|███▏ | 1816/5772 [3:11:45<6:38:08, 6.04s/it] 31%|███▏ | 1816/5772 [3:11:52<6:38:08, 6.04s/it] {'loss': 0.4735, 'learning_rate': 1.604634875203929e-05, 'epoch': 0.31} + 31%|███▏ | 1816/5772 [3:11:52<6:38:08, 6.04s/it] {'loss': 0.4735, 'learning_rate': 1.604634875203929e-05, 'epoch': 0.31} + 31%|███▏ | 1816/5772 [3:11:45<6:38:08, 6.04s/it] 31%|███▏ | 1817/5772 [3:11:51<6:35:39, 6.00s/it] 31%|███▏ | 1817/5772 [3:11:58<6:35:39, 6.00s/it] {'loss': 0.4733, 'learning_rate': 1.6041877833616782e-05, 'epoch': 0.31} + 31%|███▏ | 1817/5772 [3:11:58<6:35:39, 6.00s/it] {'loss': 0.4733, 'learning_rate': 1.6041877833616782e-05, 'epoch': 0.31} + 31%|███▏ | 1817/5772 [3:11:51<6:35:39, 6.00s/it] 31%|███▏ | 1818/5772 [3:11:57<6:39:29, 6.06s/it] 31%|███▏ | 1818/5772 [3:12:04<6:39:29, 6.06s/it] {'loss': 0.4873, 'learning_rate': 1.6037405012338448e-05, 'epoch': 0.31} + 31%|███▏ | 1818/5772 [3:12:04<6:39:29, 6.06s/it] {'loss': 0.4873, 'learning_rate': 1.6037405012338448e-05, 'epoch': 0.31} + 31%|███▏ | 1818/5772 [3:11:57<6:39:29, 6.06s/it] 32%|███▏ | 1819/5772 [3:12:03<6:46:35, 6.17s/it] 32%|███▏ | 1819/5772 [3:12:10<6:46:35, 6.17s/it] {'loss': 0.4684, 'learning_rate': 1.6032930289612974e-05, 'epoch': 0.32} + 32%|███▏ | 1819/5772 [3:12:10<6:46:35, 6.17s/it] {'loss': 0.4684, 'learning_rate': 1.6032930289612974e-05, 'epoch': 0.32} + 32%|███▏ | 1819/5772 [3:12:03<6:46:35, 6.17s/it] 32%|███▏ | 1820/5772 [3:12:09<6:43:20, 6.12s/it] 32%|███▏ | 1820/5772 [3:12:16<6:43:19, 6.12s/it] {'loss': 0.4884, 'learning_rate': 1.6028453666849645e-05, 'epoch': 0.32} + 32%|███▏ | 1820/5772 [3:12:16<6:43:19, 6.12s/it] {'loss': 0.4884, 'learning_rate': 1.6028453666849645e-05, 'epoch': 0.32} + 32%|███▏ | 1820/5772 [3:12:09<6:43:20, 6.12s/it] 32%|███▏ | 1821/5772 [3:12:15<6:39:53, 6.07s/it] 32%|███▏ | 1821/5772 [3:12:22<6:39:54, 6.07s/it] {'loss': 0.4753, 'learning_rate': 1.6023975145458352e-05, 'epoch': 0.32} + 32%|███▏ | 1821/5772 [3:12:22<6:39:54, 6.07s/it] {'loss': 0.4753, 'learning_rate': 1.6023975145458352e-05, 'epoch': 0.32} + 32%|███▏ | 1821/5772 [3:12:15<6:39:53, 6.07s/it] 32%|███▏ | 1822/5772 [3:12:21<6:38:01, 6.05s/it] 32%|███▏ | 1822/5772 [3:12:28<6:38:00, 6.05s/it] {'loss': 0.4745, 'learning_rate': 1.6019494726849582e-05, 'epoch': 0.32} + 32%|███▏ | 1822/5772 [3:12:28<6:38:00, 6.05s/it] {'loss': 0.4745, 'learning_rate': 1.6019494726849582e-05, 'epoch': 0.32} + 32%|███▏ | 1822/5772 [3:12:21<6:38:01, 6.05s/it] 32%|███▏ | 1823/5772 [3:12:27<6:33:01, 5.97s/it] 32%|███▏ | 1823/5772 [3:12:34<6:33:01, 5.97s/it] {'loss': 0.4688, 'learning_rate': 1.6015012412434417e-05, 'epoch': 0.32} + 32%|███▏ | 1823/5772 [3:12:34<6:33:01, 5.97s/it] {'loss': 0.4688, 'learning_rate': 1.6015012412434417e-05, 'epoch': 0.32} + 32%|███▏ | 1823/5772 [3:12:27<6:33:01, 5.97s/it] 32%|███▏ | 1824/5772 [3:12:33<6:33:35, 5.98s/it] 32%|███▏ | 1824/5772 [3:12:40<6:33:35, 5.98s/it] {'loss': 0.4771, 'learning_rate': 1.6010528203624537e-05, 'epoch': 0.32} + 32%|███▏ | 1824/5772 [3:12:40<6:33:35, 5.98s/it] {'loss': 0.4771, 'learning_rate': 1.6010528203624537e-05, 'epoch': 0.32} + 32%|███▏ | 1824/5772 [3:12:33<6:33:35, 5.98s/it] 32%|███▏ | 1825/5772 [3:12:39<6:33:41, 5.98s/it] 32%|███▏ | 1825/5772 [3:12:46<6:33:41, 5.98s/it] {'loss': 0.4708, 'learning_rate': 1.6006042101832212e-05, 'epoch': 0.32} + 32%|███▏ | 1825/5772 [3:12:46<6:33:41, 5.98s/it] {'loss': 0.4708, 'learning_rate': 1.6006042101832212e-05, 'epoch': 0.32} + 32%|███▏ | 1825/5772 [3:12:39<6:33:41, 5.98s/it] 32%|███▏ | 1826/5772 [3:12:45<6:29:21, 5.92s/it] 32%|███▏ | 1826/5772 [3:12:52<6:29:21, 5.92s/it] {'loss': 0.4742, 'learning_rate': 1.6001554108470325e-05, 'epoch': 0.32} + 32%|███▏ | 1826/5772 [3:12:52<6:29:21, 5.92s/it] {'loss': 0.4742, 'learning_rate': 1.6001554108470325e-05, 'epoch': 0.32} + 32%|███▏ | 1826/5772 [3:12:45<6:29:21, 5.92s/it] 32%|███▏ | 1827/5772 [3:12:51<6:32:51, 5.97s/it] 32%|███▏ | 1827/5772 [3:12:58<6:32:51, 5.97s/it] {'loss': 0.4749, 'learning_rate': 1.5997064224952345e-05, 'epoch': 0.32} + 32%|███▏ | 1827/5772 [3:12:58<6:32:51, 5.97s/it] {'loss': 0.4749, 'learning_rate': 1.5997064224952345e-05, 'epoch': 0.32} + 32%|███▏ | 1827/5772 [3:12:51<6:32:51, 5.97s/it] 32%|███▏ | 1828/5772 [3:12:57<6:31:57, 5.96s/it] 32%|███▏ | 1828/5772 [3:13:04<6:31:57, 5.96s/it] {'loss': 0.4973, 'learning_rate': 1.5992572452692324e-05, 'epoch': 0.32} + {'loss': 0.4973, 'learning_rate': 1.5992572452692324e-05, 'epoch': 0.32} 32%|███▏ | 1828/5772 [3:13:04<6:31:57, 5.96s/it] + 32%|███▏ | 1828/5772 [3:12:57<6:31:57, 5.96s/it] 32%|███▏ | 1829/5772 [3:13:03<6:36:30, 6.03s/it] 32%|███▏ | 1829/5772 [3:13:10<6:36:30, 6.03s/it] {'loss': 0.4844, 'learning_rate': 1.598807879310493e-05, 'epoch': 0.32} + 32%|███▏ | 1829/5772 [3:13:10<6:36:30, 6.03s/it] {'loss': 0.4844, 'learning_rate': 1.598807879310493e-05, 'epoch': 0.32} + 32%|███▏ | 1829/5772 [3:13:03<6:36:30, 6.03s/it] 32%|███▏ | 1830/5772 [3:13:09<6:37:01, 6.04s/it] 32%|███▏ | 1830/5772 [3:13:16<6:37:01, 6.04s/it] {'loss': 0.4757, 'learning_rate': 1.5983583247605414e-05, 'epoch': 0.32} + 32%|███▏ | 1830/5772 [3:13:16<6:37:01, 6.04s/it] {'loss': 0.4757, 'learning_rate': 1.5983583247605414e-05, 'epoch': 0.32} + 32%|███▏ | 1830/5772 [3:13:09<6:37:01, 6.04s/it] 32%|███▏ | 1831/5772 [3:13:16<6:49:40, 6.24s/it] 32%|███▏ | 1831/5772 [3:13:23<6:49:40, 6.24s/it] {'loss': 0.4706, 'learning_rate': 1.5979085817609625e-05, 'epoch': 0.32} + 32%|███▏ | 1831/5772 [3:13:23<6:49:40, 6.24s/it] {'loss': 0.4706, 'learning_rate': 1.5979085817609625e-05, 'epoch': 0.32} + 32%|███▏ | 1831/5772 [3:13:16<6:49:40, 6.24s/it] 32%|███▏ | 1832/5772 [3:13:22<6:46:22, 6.19s/it] 32%|███▏ | 1832/5772 [3:13:29<6:46:22, 6.19s/it] {'loss': 0.4778, 'learning_rate': 1.5974586504534e-05, 'epoch': 0.32} + 32%|███▏ | 1832/5772 [3:13:29<6:46:22, 6.19s/it] {'loss': 0.4778, 'learning_rate': 1.5974586504534e-05, 'epoch': 0.32} + 32%|███▏ | 1832/5772 [3:13:22<6:46:22, 6.19s/it] 32%|███▏ | 1833/5772 [3:13:35<6:54:01, 6.31s/it] 32%|███▏ | 1833/5772 [3:13:28<6:54:02, 6.31s/it] {'loss': 0.4701, 'learning_rate': 1.5970085309795572e-05, 'epoch': 0.32} + 32%|███▏ | 1833/5772 [3:13:35<6:54:01, 6.31s/it] {'loss': 0.4701, 'learning_rate': 1.5970085309795572e-05, 'epoch': 0.32} + 32%|███▏ | 1833/5772 [3:13:28<6:54:02, 6.31s/it] 32%|███▏ | 1834/5772 [3:13:34<6:45:34, 6.18s/it] 32%|███▏ | 1834/5772 [3:13:41<6:45:35, 6.18s/it] {'loss': 0.4736, 'learning_rate': 1.5965582234811972e-05, 'epoch': 0.32} + 32%|███▏ | 1834/5772 [3:13:41<6:45:35, 6.18s/it] {'loss': 0.4736, 'learning_rate': 1.5965582234811972e-05, 'epoch': 0.32} + 32%|███▏ | 1834/5772 [3:13:34<6:45:34, 6.18s/it] 32%|███▏ | 1835/5772 [3:13:40<6:44:52, 6.17s/it] 32%|███▏ | 1835/5772 [3:13:47<6:44:52, 6.17s/it] {'loss': 0.4801, 'learning_rate': 1.5961077281001418e-05, 'epoch': 0.32} + 32%|███▏ | 1835/5772 [3:13:48<6:44:52, 6.17s/it] {'loss': 0.4801, 'learning_rate': 1.5961077281001418e-05, 'epoch': 0.32} + 32%|███▏ | 1835/5772 [3:13:40<6:44:52, 6.17s/it] 32%|███▏ | 1836/5772 [3:13:54<6:41:50, 6.13s/it] 32%|███▏ | 1836/5772 [3:13:46<6:41:50, 6.13s/it] {'loss': 0.4773, 'learning_rate': 1.5956570449782715e-05, 'epoch': 0.32} + 32%|███▏ | 1836/5772 [3:13:54<6:41:50, 6.13s/it] {'loss': 0.4773, 'learning_rate': 1.5956570449782715e-05, 'epoch': 0.32} + 32%|███▏ | 1836/5772 [3:13:46<6:41:50, 6.13s/it] 32%|███▏ | 1837/5772 [3:13:52<6:39:22, 6.09s/it] 32%|███▏ | 1837/5772 [3:14:00<6:39:22, 6.09s/it] {'loss': 0.463, 'learning_rate': 1.5952061742575268e-05, 'epoch': 0.32} + 32%|███▏ | 1837/5772 [3:14:00<6:39:22, 6.09s/it] {'loss': 0.463, 'learning_rate': 1.5952061742575268e-05, 'epoch': 0.32} + 32%|███▏ | 1837/5772 [3:13:52<6:39:22, 6.09s/it] 32%|███▏ | 1838/5772 [3:13:59<6:43:33, 6.15s/it] 32%|███▏ | 1838/5772 [3:14:06<6:43:33, 6.16s/it] {'loss': 0.4813, 'learning_rate': 1.594755116079907e-05, 'epoch': 0.32} + 32%|███▏ | 1838/5772 [3:14:06<6:43:33, 6.16s/it] {'loss': 0.4813, 'learning_rate': 1.594755116079907e-05, 'epoch': 0.32} + 32%|███▏ | 1838/5772 [3:13:59<6:43:33, 6.15s/it] 32%|███▏ | 1839/5772 [3:14:05<6:44:12, 6.17s/it] 32%|███▏ | 1839/5772 [3:14:12<6:44:12, 6.17s/it] {'loss': 0.4776, 'learning_rate': 1.5943038705874697e-05, 'epoch': 0.32} + 32%|███▏ | 1839/5772 [3:14:12<6:44:12, 6.17s/it] {'loss': 0.4776, 'learning_rate': 1.5943038705874697e-05, 'epoch': 0.32} + 32%|███▏ | 1839/5772 [3:14:05<6:44:12, 6.17s/it] 32%|███▏ | 1840/5772 [3:14:11<6:46:22, 6.20s/it] 32%|███▏ | 1840/5772 [3:14:18<6:46:22, 6.20s/it] {'loss': 0.4953, 'learning_rate': 1.593852437922333e-05, 'epoch': 0.32} + 32%|███▏ | 1840/5772 [3:14:18<6:46:22, 6.20s/it] {'loss': 0.4953, 'learning_rate': 1.593852437922333e-05, 'epoch': 0.32} + 32%|███▏ | 1840/5772 [3:14:11<6:46:22, 6.20s/it] 32%|███▏ | 1841/5772 [3:14:17<6:41:28, 6.13s/it] 32%|███▏ | 1841/5772 [3:14:24<6:41:28, 6.13s/it] {'loss': 0.4605, 'learning_rate': 1.593400818226673e-05, 'epoch': 0.32} + 32%|███▏ | 1841/5772 [3:14:24<6:41:28, 6.13s/it] {'loss': 0.4605, 'learning_rate': 1.593400818226673e-05, 'epoch': 0.32} + 32%|███▏ | 1841/5772 [3:14:17<6:41:28, 6.13s/it] 32%|███▏ | 1842/5772 [3:14:24<6:46:43, 6.21s/it] 32%|███▏ | 1842/5772 [3:14:31<6:46:43, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.5929490116427247e-05, 'epoch': 0.32} + 32%|███▏ | 1842/5772 [3:14:31<6:46:43, 6.21s/it] {'loss': 0.4966, 'learning_rate': 1.5929490116427247e-05, 'epoch': 0.32} + 32%|███▏ | 1842/5772 [3:14:24<6:46:43, 6.21s/it] 32%|███▏ | 1843/5772 [3:14:30<6:46:58, 6.22s/it] 32%|███▏ | 1843/5772 [3:14:37<6:46:59, 6.22s/it] {'loss': 0.4885, 'learning_rate': 1.592497018312782e-05, 'epoch': 0.32} + 32%|███▏ | 1843/5772 [3:14:37<6:46:59, 6.22s/it] {'loss': 0.4885, 'learning_rate': 1.592497018312782e-05, 'epoch': 0.32} + 32%|███▏ | 1843/5772 [3:14:30<6:46:58, 6.22s/it] 32%|███▏ | 1844/5772 [3:14:36<6:45:59, 6.20s/it] 32%|███▏ | 1844/5772 [3:14:43<6:45:59, 6.20s/it] {'loss': 0.4816, 'learning_rate': 1.5920448383791972e-05, 'epoch': 0.32} + 32%|███▏ | 1844/5772 [3:14:43<6:45:59, 6.20s/it] {'loss': 0.4816, 'learning_rate': 1.5920448383791972e-05, 'epoch': 0.32} + 32%|███▏ | 1844/5772 [3:14:36<6:45:59, 6.20s/it] 32%|███▏ | 1845/5772 [3:14:42<6:43:37, 6.17s/it] 32%|███▏ | 1845/5772 [3:14:49<6:43:37, 6.17s/it] {'loss': 0.491, 'learning_rate': 1.591592471984383e-05, 'epoch': 0.32} + 32%|███▏ | 1845/5772 [3:14:49<6:43:37, 6.17s/it] {'loss': 0.491, 'learning_rate': 1.591592471984383e-05, 'epoch': 0.32} + 32%|███▏ | 1845/5772 [3:14:42<6:43:37, 6.17s/it] 32%|███▏ | 1846/5772 [3:14:48<6:43:59, 6.17s/it] 32%|███▏ | 1846/5772 [3:14:55<6:43:59, 6.17s/it] {'loss': 0.4932, 'learning_rate': 1.5911399192708085e-05, 'epoch': 0.32} + 32%|███▏ | 1846/5772 [3:14:55<6:43:59, 6.17s/it] {'loss': 0.4932, 'learning_rate': 1.5911399192708085e-05, 'epoch': 0.32} + 32%|███▏ | 1846/5772 [3:14:48<6:43:59, 6.17s/it] 32%|███▏ | 1847/5772 [3:14:55<6:50:04, 6.27s/it] 32%|███▏ | 1847/5772 [3:15:02<6:50:05, 6.27s/it] {'loss': 0.4819, 'learning_rate': 1.590687180381003e-05, 'epoch': 0.32} + 32%|███▏ | 1847/5772 [3:15:02<6:50:05, 6.27s/it] {'loss': 0.4819, 'learning_rate': 1.590687180381003e-05, 'epoch': 0.32} + 32%|███▏ | 1847/5772 [3:14:55<6:50:04, 6.27s/it] 32%|███▏ | 1848/5772 [3:15:01<6:48:10, 6.24s/it] 32%|███▏ | 1848/5772 [3:15:08<6:48:10, 6.24s/it] {'loss': 0.4898, 'learning_rate': 1.590234255457555e-05, 'epoch': 0.32} + 32%|███▏ | 1848/5772 [3:15:08<6:48:10, 6.24s/it] {'loss': 0.4898, 'learning_rate': 1.590234255457555e-05, 'epoch': 0.32} + 32%|███▏ | 1848/5772 [3:15:01<6:48:10, 6.24s/it] 32%|███▏ | 1849/5772 [3:15:07<6:47:15, 6.23s/it] 32%|███▏ | 1849/5772 [3:15:14<6:47:15, 6.23s/it] {'loss': 0.478, 'learning_rate': 1.5897811446431096e-05, 'epoch': 0.32} + 32%|███▏ | 1849/5772 [3:15:14<6:47:15, 6.23s/it] {'loss': 0.478, 'learning_rate': 1.5897811446431096e-05, 'epoch': 0.32} + 32%|███▏ | 1849/5772 [3:15:07<6:47:15, 6.23s/it]5 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +01 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...4 AutoResumeHook: Checking whether to suspend... + +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 32%|███▏ | 1850/5772 [3:15:13<6:44:55, 6.19s/it]2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend...15 + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 3AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 32%|███▏ | 1850/5772 [3:15:20<6:44:55, 6.19s/it] {'loss': 0.4747, 'learning_rate': 1.5893278480803716e-05, 'epoch': 0.32} + 32%|███▏ | 1850/5772 [3:15:20<6:44:55, 6.19s/it] {'loss': 0.4747, 'learning_rate': 1.5893278480803716e-05, 'epoch': 0.32} + 32%|███▏ | 1850/5772 [3:15:13<6:44:55, 6.19s/it] 32%|███▏ | 1851/5772 [3:15:19<6:38:51, 6.10s/it] 32%|███▏ | 1851/5772 [3:15:26<6:38:50, 6.10s/it] {'loss': 0.4751, 'learning_rate': 1.588874365912105e-05, 'epoch': 0.32} + 32%|███▏ | 1851/5772 [3:15:26<6:38:50, 6.10s/it] {'loss': 0.4751, 'learning_rate': 1.588874365912105e-05, 'epoch': 0.32} + 32%|███▏ | 1851/5772 [3:15:19<6:38:51, 6.10s/it] 32%|███▏ | 1852/5772 [3:15:25<6:39:08, 6.11s/it] 32%|███▏ | 1852/5772 [3:15:32<6:39:08, 6.11s/it] {'loss': 0.482, 'learning_rate': 1.588420698281131e-05, 'epoch': 0.32} + 32%|███▏ | 1852/5772 [3:15:32<6:39:08, 6.11s/it] {'loss': 0.482, 'learning_rate': 1.588420698281131e-05, 'epoch': 0.32} + 32%|███▏ | 1852/5772 [3:15:25<6:39:08, 6.11s/it] 32%|███▏ | 1853/5772 [3:15:31<6:34:31, 6.04s/it] 32%|███▏ | 1853/5772 [3:15:38<6:34:31, 6.04s/it] {'loss': 0.4772, 'learning_rate': 1.587966845330329e-05, 'epoch': 0.32} + 32%|███▏ | 1853/5772 [3:15:38<6:34:31, 6.04s/it] {'loss': 0.4772, 'learning_rate': 1.587966845330329e-05, 'epoch': 0.32} + 32%|███▏ | 1853/5772 [3:15:31<6:34:31, 6.04s/it] 32%|███▏ | 1854/5772 [3:15:37<6:32:38, 6.01s/it] 32%|███▏ | 1854/5772 [3:15:44<6:32:38, 6.01s/it] {'loss': 0.4687, 'learning_rate': 1.587512807202639e-05, 'epoch': 0.32} + 32%|███▏ | 1854/5772 [3:15:44<6:32:38, 6.01s/it] {'loss': 0.4687, 'learning_rate': 1.587512807202639e-05, 'epoch': 0.32} + 32%|███▏ | 1854/5772 [3:15:37<6:32:38, 6.01s/it] 32%|███▏ | 1855/5772 [3:15:50<6:37:11, 6.08s/it] 32%|███▏ | 1855/5772 [3:15:43<6:37:11, 6.08s/it] {'loss': 0.4734, 'learning_rate': 1.5870585840410565e-05, 'epoch': 0.32} + 32%|███▏ | 1855/5772 [3:15:50<6:37:11, 6.08s/it] {'loss': 0.4734, 'learning_rate': 1.5870585840410565e-05, 'epoch': 0.32} + 32%|███▏ | 1855/5772 [3:15:43<6:37:11, 6.08s/it] 32%|███▏ | 1856/5772 [3:15:49<6:34:20, 6.04s/it] 32%|███▏ | 1856/5772 [3:15:56<6:34:20, 6.04s/it] {'loss': 0.4748, 'learning_rate': 1.586604175988637e-05, 'epoch': 0.32} + 32%|███▏ | 1856/5772 [3:15:56<6:34:20, 6.04s/it] {'loss': 0.4748, 'learning_rate': 1.586604175988637e-05, 'epoch': 0.32} + 32%|███▏ | 1856/5772 [3:15:49<6:34:20, 6.04s/it] 32%|███▏ | 1857/5772 [3:15:55<6:33:52, 6.04s/it] 32%|███▏ | 1857/5772 [3:16:02<6:33:52, 6.04s/it] {'loss': 0.4822, 'learning_rate': 1.5861495831884942e-05, 'epoch': 0.32} + 32%|███▏ | 1857/5772 [3:16:02<6:33:52, 6.04s/it] {'loss': 0.4822, 'learning_rate': 1.5861495831884942e-05, 'epoch': 0.32} + 32%|███▏ | 1857/5772 [3:15:55<6:33:52, 6.04s/it] 32%|███▏ | 1858/5772 [3:16:01<6:29:31, 5.97s/it] 32%|███▏ | 1858/5772 [3:16:08<6:29:32, 5.97s/it] {'loss': 0.4839, 'learning_rate': 1.585694805783799e-05, 'epoch': 0.32} + 32%|███▏ | 1858/5772 [3:16:08<6:29:32, 5.97s/it] {'loss': 0.4839, 'learning_rate': 1.585694805783799e-05, 'epoch': 0.32} + 32%|███▏ | 1858/5772 [3:16:01<6:29:31, 5.97s/it] 32%|███▏ | 1859/5772 [3:16:07<6:28:35, 5.96s/it] 32%|███▏ | 1859/5772 [3:16:14<6:28:35, 5.96s/it] {'loss': 0.4657, 'learning_rate': 1.5852398439177813e-05, 'epoch': 0.32} + 32%|███▏ | 1859/5772 [3:16:14<6:28:35, 5.96s/it] {'loss': 0.4657, 'learning_rate': 1.5852398439177813e-05, 'epoch': 0.32} + 32%|███▏ | 1859/5772 [3:16:07<6:28:35, 5.96s/it] 32%|███▏ | 1860/5772 [3:16:14<6:39:14, 6.12s/it] 32%|███▏ | 1860/5772 [3:16:21<6:39:14, 6.12s/it] {'loss': 0.4751, 'learning_rate': 1.584784697733728e-05, 'epoch': 0.32} + 32%|███▏ | 1860/5772 [3:16:21<6:39:14, 6.12s/it] {'loss': 0.4751, 'learning_rate': 1.584784697733728e-05, 'epoch': 0.32} + 32%|███▏ | 1860/5772 [3:16:14<6:39:14, 6.12s/it] 32%|███▏ | 1861/5772 [3:16:20<6:40:13, 6.14s/it] 32%|███▏ | 1861/5772 [3:16:27<6:40:13, 6.14s/it] {'loss': 0.4783, 'learning_rate': 1.5843293673749863e-05, 'epoch': 0.32} + 32%|███▏ | 1861/5772 [3:16:27<6:40:13, 6.14s/it] {'loss': 0.4783, 'learning_rate': 1.5843293673749863e-05, 'epoch': 0.32} + 32%|███▏ | 1861/5772 [3:16:20<6:40:13, 6.14s/it] 32%|███▏ | 1862/5772 [3:16:26<6:41:31, 6.16s/it] 32%|███▏ | 1862/5772 [3:16:33<6:41:31, 6.16s/it] {'loss': 0.4855, 'learning_rate': 1.583873852984959e-05, 'epoch': 0.32} + 32%|███▏ | 1862/5772 [3:16:33<6:41:31, 6.16s/it] {'loss': 0.4855, 'learning_rate': 1.583873852984959e-05, 'epoch': 0.32} + 32%|███▏ | 1862/5772 [3:16:26<6:41:31, 6.16s/it] 32%|███▏ | 1863/5772 [3:16:33<6:52:28, 6.33s/it] 32%|███▏ | 1863/5772 [3:16:40<6:52:28, 6.33s/it] {'loss': 0.4744, 'learning_rate': 1.5834181547071082e-05, 'epoch': 0.32} + 32%|███▏ | 1863/5772 [3:16:40<6:52:28, 6.33s/it] {'loss': 0.4744, 'learning_rate': 1.5834181547071082e-05, 'epoch': 0.32} + 32%|███▏ | 1863/5772 [3:16:33<6:52:28, 6.33s/it] 32%|███▏ | 1864/5772 [3:16:39<6:52:11, 6.33s/it] 32%|███▏ | 1864/5772 [3:16:46<6:52:11, 6.33s/it] {'loss': 0.4675, 'learning_rate': 1.582962272684953e-05, 'epoch': 0.32} + 32%|███▏ | 1864/5772 [3:16:46<6:52:11, 6.33s/it] {'loss': 0.4675, 'learning_rate': 1.582962272684953e-05, 'epoch': 0.32} + 32%|███▏ | 1864/5772 [3:16:39<6:52:11, 6.33s/it] 32%|███▏ | 1865/5772 [3:16:46<6:54:56, 6.37s/it] 32%|███▏ | 1865/5772 [3:16:53<6:54:56, 6.37s/it] {'loss': 0.4887, 'learning_rate': 1.582506207062072e-05, 'epoch': 0.32} + 32%|███▏ | 1865/5772 [3:16:53<6:54:56, 6.37s/it] {'loss': 0.4887, 'learning_rate': 1.582506207062072e-05, 'epoch': 0.32} + 32%|███▏ | 1865/5772 [3:16:46<6:54:56, 6.37s/it] 32%|███▏ | 1866/5772 [3:16:52<6:54:16, 6.36s/it] 32%|███▏ | 1866/5772 [3:16:59<6:54:17, 6.36s/it] {'loss': 0.4932, 'learning_rate': 1.582049957982099e-05, 'epoch': 0.32} + 32%|███▏ | 1866/5772 [3:16:59<6:54:17, 6.36s/it] {'loss': 0.4932, 'learning_rate': 1.582049957982099e-05, 'epoch': 0.32} + 32%|███▏ | 1866/5772 [3:16:52<6:54:16, 6.36s/it] 32%|███▏ | 1867/5772 [3:16:58<6:52:32, 6.34s/it] 32%|███▏ | 1867/5772 [3:17:05<6:52:31, 6.34s/it] {'loss': 0.4717, 'learning_rate': 1.5815935255887286e-05, 'epoch': 0.32} + 32%|███▏ | 1867/5772 [3:17:05<6:52:31, 6.34s/it] {'loss': 0.4717, 'learning_rate': 1.5815935255887286e-05, 'epoch': 0.32} + 32%|███▏ | 1867/5772 [3:16:58<6:52:32, 6.34s/it] 32%|███▏ | 1868/5772 [3:17:04<6:46:28, 6.25s/it] 32%|███▏ | 1868/5772 [3:17:11<6:46:28, 6.25s/it] {'loss': 0.4868, 'learning_rate': 1.5811369100257104e-05, 'epoch': 0.32} + 32%|███▏ | 1868/5772 [3:17:11<6:46:28, 6.25s/it] {'loss': 0.4868, 'learning_rate': 1.5811369100257104e-05, 'epoch': 0.32} + 32%|███▏ | 1868/5772 [3:17:04<6:46:28, 6.25s/it] 32%|███▏ | 1869/5772 [3:17:10<6:44:27, 6.22s/it] 32%|███▏ | 1869/5772 [3:17:17<6:44:27, 6.22s/it] {'loss': 0.4721, 'learning_rate': 1.5806801114368542e-05, 'epoch': 0.32} + 32%|███▏ | 1869/5772 [3:17:17<6:44:27, 6.22s/it] {'loss': 0.4721, 'learning_rate': 1.5806801114368542e-05, 'epoch': 0.32} + 32%|███▏ | 1869/5772 [3:17:10<6:44:27, 6.22s/it] 32%|███▏ | 1870/5772 [3:17:24<6:49:24, 6.30s/it] 32%|███▏ | 1870/5772 [3:17:17<6:49:24, 6.30s/it] {'loss': 0.4768, 'learning_rate': 1.580223129966025e-05, 'epoch': 0.32} + 32%|███▏ | 1870/5772 [3:17:24<6:49:24, 6.30s/it] {'loss': 0.4768, 'learning_rate': 1.580223129966025e-05, 'epoch': 0.32} + 32%|███▏ | 1870/5772 [3:17:17<6:49:24, 6.30s/it] 32%|███▏ | 1871/5772 [3:17:23<6:48:52, 6.29s/it] 32%|███▏ | 1871/5772 [3:17:30<6:48:52, 6.29s/it] {'loss': 0.4676, 'learning_rate': 1.5797659657571475e-05, 'epoch': 0.32} + 32%|███▏ | 1871/5772 [3:17:30<6:48:52, 6.29s/it] {'loss': 0.4676, 'learning_rate': 1.5797659657571475e-05, 'epoch': 0.32} + 32%|███▏ | 1871/5772 [3:17:23<6:48:52, 6.29s/it] 32%|███▏ | 1872/5772 [3:17:29<6:44:30, 6.22s/it] 32%|███▏ | 1872/5772 [3:17:36<6:44:31, 6.22s/it] {'loss': 0.4818, 'learning_rate': 1.579308618954202e-05, 'epoch': 0.32} + 32%|███▏ | 1872/5772 [3:17:36<6:44:31, 6.22s/it] {'loss': 0.4818, 'learning_rate': 1.579308618954202e-05, 'epoch': 0.32} + 32%|███▏ | 1872/5772 [3:17:29<6:44:30, 6.22s/it] 32%|███▏ | 1873/5772 [3:17:35<6:44:05, 6.22s/it] 32%|███▏ | 1873/5772 [3:17:42<6:44:04, 6.22s/it] {'loss': 0.4774, 'learning_rate': 1.5788510897012286e-05, 'epoch': 0.32} + 32%|███▏ | 1873/5772 [3:17:42<6:44:04, 6.22s/it] {'loss': 0.4774, 'learning_rate': 1.5788510897012286e-05, 'epoch': 0.32} + 32%|███▏ | 1873/5772 [3:17:35<6:44:05, 6.22s/it] 32%|███▏ | 1874/5772 [3:17:48<6:39:19, 6.15s/it] 32%|███▏ | 1874/5772 [3:17:41<6:39:19, 6.15s/it] {'loss': 0.4734, 'learning_rate': 1.5783933781423222e-05, 'epoch': 0.32} + 32%|███▏ | 1874/5772 [3:17:48<6:39:19, 6.15s/it] {'loss': 0.4734, 'learning_rate': 1.5783933781423222e-05, 'epoch': 0.32} + 32%|███▏ | 1874/5772 [3:17:41<6:39:19, 6.15s/it] 32%|███▏ | 1875/5772 [3:17:55<6:50:12, 6.32s/it] 32%|███▏ | 1875/5772 [3:17:48<6:50:12, 6.32s/it] {'loss': 0.4675, 'learning_rate': 1.5779354844216377e-05, 'epoch': 0.32} + 32%|███▏ | 1875/5772 [3:17:55<6:50:12, 6.32s/it] {'loss': 0.4675, 'learning_rate': 1.5779354844216377e-05, 'epoch': 0.32} + 32%|███▏ | 1875/5772 [3:17:48<6:50:12, 6.32s/it] 33%|███▎ | 1876/5772 [3:18:01<6:49:04, 6.30s/it] 33%|███▎ | 1876/5772 [3:17:54<6:49:04, 6.30s/it] {'loss': 0.4822, 'learning_rate': 1.5774774086833856e-05, 'epoch': 0.32} + 33%|███▎ | 1876/5772 [3:18:01<6:49:04, 6.30s/it] {'loss': 0.4822, 'learning_rate': 1.5774774086833856e-05, 'epoch': 0.32} + 33%|███▎ | 1876/5772 [3:17:54<6:49:04, 6.30s/it] 33%|███▎ | 1877/5772 [3:18:08<6:54:03, 6.38s/it] 33%|███▎ | 1877/5772 [3:18:01<6:54:03, 6.38s/it] {'loss': 0.4683, 'learning_rate': 1.577019151071835e-05, 'epoch': 0.33} + 33%|███▎ | 1877/5772 [3:18:08<6:54:03, 6.38s/it] {'loss': 0.4683, 'learning_rate': 1.577019151071835e-05, 'epoch': 0.33} + 33%|███▎ | 1877/5772 [3:18:01<6:54:03, 6.38s/it] 33%|███▎ | 1878/5772 [3:18:07<6:45:13, 6.24s/it] 33%|███▎ | 1878/5772 [3:18:14<6:45:14, 6.24s/it] {'loss': 0.4903, 'learning_rate': 1.5765607117313097e-05, 'epoch': 0.33} + 33%|███▎ | 1878/5772 [3:18:14<6:45:14, 6.24s/it] {'loss': 0.4903, 'learning_rate': 1.5765607117313097e-05, 'epoch': 0.33} + 33%|███▎ | 1878/5772 [3:18:07<6:45:13, 6.24s/it] 33%|███▎ | 1879/5772 [3:18:13<6:39:10, 6.15s/it] 33%|███▎ | 1879/5772 [3:18:20<6:39:10, 6.15s/it] {'loss': 0.4545, 'learning_rate': 1.5761020908061947e-05, 'epoch': 0.33} + 33%|███▎ | 1879/5772 [3:18:20<6:39:10, 6.15s/it] {'loss': 0.4545, 'learning_rate': 1.5761020908061947e-05, 'epoch': 0.33} + 33%|███▎ | 1879/5772 [3:18:13<6:39:10, 6.15s/it] 33%|███▎ | 1880/5772 [3:18:19<6:38:30, 6.14s/it] 33%|███▎ | 1880/5772 [3:18:26<6:38:31, 6.14s/it] {'loss': 0.4954, 'learning_rate': 1.5756432884409297e-05, 'epoch': 0.33} + 33%|███▎ | 1880/5772 [3:18:26<6:38:31, 6.14s/it] {'loss': 0.4954, 'learning_rate': 1.5756432884409297e-05, 'epoch': 0.33} + 33%|███▎ | 1880/5772 [3:18:19<6:38:30, 6.14s/it] 33%|███▎ | 1881/5772 [3:18:25<6:43:58, 6.23s/it] 33%|███▎ | 1881/5772 [3:18:32<6:43:59, 6.23s/it] {'loss': 0.4833, 'learning_rate': 1.5751843047800107e-05, 'epoch': 0.33} + 33%|███▎ | 1881/5772 [3:18:32<6:43:59, 6.23s/it] {'loss': 0.4833, 'learning_rate': 1.5751843047800107e-05, 'epoch': 0.33} + 33%|███▎ | 1881/5772 [3:18:25<6:43:58, 6.23s/it] 33%|███▎ | 1882/5772 [3:18:38<6:39:54, 6.17s/it] 33%|███▎ | 1882/5772 [3:18:31<6:39:55, 6.17s/it] {'loss': 0.478, 'learning_rate': 1.5747251399679937e-05, 'epoch': 0.33} + 33%|███▎ | 1882/5772 [3:18:38<6:39:54, 6.17s/it] {'loss': 0.478, 'learning_rate': 1.5747251399679937e-05, 'epoch': 0.33} + 33%|███▎ | 1882/5772 [3:18:31<6:39:55, 6.17s/it] 33%|███▎ | 1883/5772 [3:18:38<6:41:43, 6.20s/it] 33%|███▎ | 1883/5772 [3:18:45<6:41:43, 6.20s/it] {'loss': 0.4749, 'learning_rate': 1.574265794149489e-05, 'epoch': 0.33} + 33%|███▎ | 1883/5772 [3:18:45<6:41:43, 6.20s/it] {'loss': 0.4749, 'learning_rate': 1.574265794149489e-05, 'epoch': 0.33} + 33%|███▎ | 1883/5772 [3:18:38<6:41:43, 6.20s/it] 33%|███▎ | 1884/5772 [3:18:44<6:47:57, 6.30s/it] 33%|███▎ | 1884/5772 [3:18:51<6:47:57, 6.30s/it] {'loss': 0.4749, 'learning_rate': 1.5738062674691657e-05, 'epoch': 0.33} + 33%|███▎ | 1884/5772 [3:18:51<6:47:57, 6.30s/it] {'loss': 0.4749, 'learning_rate': 1.5738062674691657e-05, 'epoch': 0.33} + 33%|███▎ | 1884/5772 [3:18:44<6:47:57, 6.30s/it] 33%|███▎ | 1885/5772 [3:18:50<6:44:37, 6.25s/it] 33%|███▎ | 1885/5772 [3:18:57<6:44:37, 6.25s/it] {'loss': 0.5003, 'learning_rate': 1.5733465600717486e-05, 'epoch': 0.33} + 33%|███▎ | 1885/5772 [3:18:57<6:44:37, 6.25s/it] {'loss': 0.5003, 'learning_rate': 1.5733465600717486e-05, 'epoch': 0.33} + 33%|███▎ | 1885/5772 [3:18:50<6:44:37, 6.25s/it] 33%|███▎ | 1886/5772 [3:18:56<6:41:00, 6.19s/it] 33%|███▎ | 1886/5772 [3:19:03<6:41:00, 6.19s/it] {'loss': 0.4891, 'learning_rate': 1.5728866721020203e-05, 'epoch': 0.33} + 33%|███▎ | 1886/5772 [3:19:03<6:41:00, 6.19s/it] {'loss': 0.4891, 'learning_rate': 1.5728866721020203e-05, 'epoch': 0.33} + 33%|███▎ | 1886/5772 [3:18:56<6:41:00, 6.19s/it] 33%|███▎ | 1887/5772 [3:19:02<6:39:11, 6.17s/it] 33%|███▎ | 1887/5772 [3:19:09<6:39:11, 6.17s/it] {'loss': 0.4746, 'learning_rate': 1.5724266037048196e-05, 'epoch': 0.33} + 33%|███▎ | 1887/5772 [3:19:09<6:39:11, 6.17s/it] {'loss': 0.4746, 'learning_rate': 1.5724266037048196e-05, 'epoch': 0.33} + 33%|███▎ | 1887/5772 [3:19:02<6:39:11, 6.17s/it] 33%|███▎ | 1888/5772 [3:19:08<6:37:06, 6.13s/it] 33%|███▎ | 1888/5772 [3:19:15<6:37:07, 6.13s/it] {'loss': 0.4751, 'learning_rate': 1.571966355025043e-05, 'epoch': 0.33} + 33%|███▎ | 1888/5772 [3:19:15<6:37:07, 6.13s/it] {'loss': 0.4751, 'learning_rate': 1.571966355025043e-05, 'epoch': 0.33} + 33%|███▎ | 1888/5772 [3:19:08<6:37:06, 6.13s/it] 33%|███▎ | 1889/5772 [3:19:15<6:38:01, 6.15s/it] 33%|███▎ | 1889/5772 [3:19:22<6:38:01, 6.15s/it] {'loss': 0.4775, 'learning_rate': 1.571505926207643e-05, 'epoch': 0.33} + 33%|███▎ | 1889/5772 [3:19:22<6:38:01, 6.15s/it] {'loss': 0.4775, 'learning_rate': 1.571505926207643e-05, 'epoch': 0.33} + 33%|███▎ | 1889/5772 [3:19:15<6:38:01, 6.15s/it] 33%|███▎ | 1890/5772 [3:19:21<6:37:27, 6.14s/it] 33%|███▎ | 1890/5772 [3:19:28<6:37:27, 6.14s/it] {'loss': 0.4793, 'learning_rate': 1.571045317397629e-05, 'epoch': 0.33} + 33%|███▎ | 1890/5772 [3:19:28<6:37:27, 6.14s/it] {'loss': 0.4793, 'learning_rate': 1.571045317397629e-05, 'epoch': 0.33} + 33%|███▎ | 1890/5772 [3:19:21<6:37:27, 6.14s/it] 33%|███▎ | 1891/5772 [3:19:27<6:36:16, 6.13s/it] 33%|███▎ | 1891/5772 [3:19:34<6:36:16, 6.13s/it] {'loss': 0.4811, 'learning_rate': 1.5705845287400675e-05, 'epoch': 0.33} + 33%|███▎ | 1891/5772 [3:19:34<6:36:16, 6.13s/it] {'loss': 0.4811, 'learning_rate': 1.5705845287400675e-05, 'epoch': 0.33} + 33%|███▎ | 1891/5772 [3:19:27<6:36:16, 6.13s/it] 33%|███▎ | 1892/5772 [3:19:33<6:30:47, 6.04s/it] 33%|███▎ | 1892/5772 [3:19:40<6:30:47, 6.04s/it] {'loss': 0.4856, 'learning_rate': 1.5701235603800813e-05, 'epoch': 0.33} + 33%|███▎ | 1892/5772 [3:19:40<6:30:47, 6.04s/it] {'loss': 0.4856, 'learning_rate': 1.5701235603800813e-05, 'epoch': 0.33} + 33%|███▎ | 1892/5772 [3:19:33<6:30:47, 6.04s/it] 33%|███▎ | 1893/5772 [3:19:39<6:34:33, 6.10s/it] 33%|███▎ | 1893/5772 [3:19:46<6:34:33, 6.10s/it] {'loss': 0.4813, 'learning_rate': 1.5696624124628495e-05, 'epoch': 0.33} + 33%|███▎ | 1893/5772 [3:19:46<6:34:33, 6.10s/it] {'loss': 0.4813, 'learning_rate': 1.5696624124628495e-05, 'epoch': 0.33} + 33%|███▎ | 1893/5772 [3:19:39<6:34:33, 6.10s/it] 33%|███▎ | 1894/5772 [3:19:45<6:33:07, 6.08s/it] 33%|███▎ | 1894/5772 [3:19:52<6:33:07, 6.08s/it] {'loss': 0.4919, 'learning_rate': 1.569201085133608e-05, 'epoch': 0.33} + 33%|███▎ | 1894/5772 [3:19:52<6:33:07, 6.08s/it] {'loss': 0.4919, 'learning_rate': 1.569201085133608e-05, 'epoch': 0.33} + 33%|███▎ | 1894/5772 [3:19:45<6:33:07, 6.08s/it] 33%|███▎ | 1895/5772 [3:19:51<6:38:28, 6.17s/it] 33%|███▎ | 1895/5772 [3:19:58<6:38:28, 6.17s/it] {'loss': 0.4731, 'learning_rate': 1.56873957853765e-05, 'epoch': 0.33} + 33%|███▎ | 1895/5772 [3:19:58<6:38:28, 6.17s/it] {'loss': 0.4731, 'learning_rate': 1.56873957853765e-05, 'epoch': 0.33} + 33%|███▎ | 1895/5772 [3:19:51<6:38:28, 6.17s/it] 33%|███▎ | 1896/5772 [3:19:58<6:40:45, 6.20s/it] 33%|███▎ | 1896/5772 [3:20:05<6:40:45, 6.20s/it] {'loss': 0.4643, 'learning_rate': 1.5682778928203232e-05, 'epoch': 0.33} + 33%|███▎ | 1896/5772 [3:20:05<6:40:45, 6.20s/it] {'loss': 0.4643, 'learning_rate': 1.5682778928203232e-05, 'epoch': 0.33} + 33%|███▎ | 1896/5772 [3:19:58<6:40:45, 6.20s/it]WARNING: tokenization mismatch: 1 vs. 737. [[{'from': 'human', 'value': 'ASSISTANT: You are a top consultant specialising in advising tax firms and have the following skills and qualities.\n\nKNOWLEDGE: Consultant in a top consulting firm specialising in tax consulting, senior strategist, digital expert.\n\nCHARACTERISTICS: Customer-focused, empathetic, solution-oriented, personable, persuasive, creative.\n\nTASK: Use your knowledge to develop 10 measures to encourage clients to submit their documents digitally instead of by post. The measures should be formulated in such a way that the client recognises the advantages of digital filing. The law firm is digitised and can accept digital documents.\n\nOUTPUT: A list of the 10 actions, each with the name of the action as a heading and 2 short additional bullet points.\n\nOUTPUT LANGUAGE: German'}, {'from': 'gpt', 'value': '1. Persönlicher Kontakt und Beratung:\n\t* Persönliche Beratung und Unterstützung bei der Umstellung auf digitales Einreichen\n\t* Betonung der Vorteile, wie Zeit- und Kosteneinsparungen\n2. Schulung und Unterstützung:\n\t* Schulungen für die digitale Einreichung von Dokumenten\n\t* Anleitung zur Nutzung von Tools und Software\n3. Anreize schaffen:\n\t* Ermäßigung bei der Beauftragung digitaler Dienstleistungen\n\t* Sonderangebote für Kunden, die ihre Dokumente digital einreichen\n4. Informationen bereitstellen:\n\t* Veröffentlichung von Leitfäden und Handbüchern für die digitale Einreichung von Dokumenten\n\t* FAQ-Liste für häufig gestellte Fragen zum digitalen Einreichen\n5. Klare Kommunikation:\n\t* Verständliche Kommunikation der Vorteile der digitalen Einreichung\n\t* Bereitstellung von Informationen über die Sicherheit der digitalen Übermittlung\n6. Einfache Zugänglichkeit:\n\t* Vereinfachung des digitalen Einreichungsprozesses durch intuitive Online-Plattformen\n\t* Bereitstellung von Unterstützung für den Fall technischer Schwierigkeiten\n7. Zeitersparnis betonen:\n\t* Betonung der Vorteile von Zeitersparnis durch digitales Einreichen\n\t* Erinnerungen an Fristen und den Nutzen der pünktlichen Einreichung\n8. Umweltbewusstsein:\n\t* Betonung des Umweltvorteils durch digitales Einreichen von Dokumenten\n\t* Verweis auf die Möglichkeit, Papierverbrauch zu reduzieren\n9. Kosteneffizienz:\n\t* Betonung der Vorteile von Kosteneinsparungen durch digitales Einreichen\n\t* Verweis auf die Möglichkeit, Papier- und Portokosten zu sparen\n10. Feedback einholen:\n* Umfrage zur Bewertung des digitalen Einreichungsprozesses\n* Analyse der Ergebnisse zur Verbesserung der digitalen Dienstleistungen'}]] (ignored) + 33%|███▎ | 1897/5772 [3:20:04<6:39:23, 6.18s/it] 33%|███▎ | 1897/5772 [3:20:11<6:39:23, 6.18s/it] {'loss': 0.4742, 'learning_rate': 1.5678160281270344e-05, 'epoch': 0.33} + 33%|███▎ | 1897/5772 [3:20:11<6:39:23, 6.18s/it] {'loss': 0.4742, 'learning_rate': 1.5678160281270344e-05, 'epoch': 0.33} + 33%|███▎ | 1897/5772 [3:20:04<6:39:23, 6.18s/it] 33%|███▎ | 1898/5772 [3:20:10<6:37:44, 6.16s/it] 33%|███▎ | 1898/5772 [3:20:17<6:37:44, 6.16s/it] {'loss': 0.4752, 'learning_rate': 1.567353984603244e-05, 'epoch': 0.33} + 33%|███▎ | 1898/5772 [3:20:17<6:37:44, 6.16s/it] {'loss': 0.4752, 'learning_rate': 1.567353984603244e-05, 'epoch': 0.33} + 33%|███▎ | 1898/5772 [3:20:10<6:37:44, 6.16s/it] 33%|███▎ | 1899/5772 [3:20:16<6:37:47, 6.16s/it] 33%|███▎ | 1899/5772 [3:20:23<6:37:47, 6.16s/it] {'loss': 0.4819, 'learning_rate': 1.566891762394471e-05, 'epoch': 0.33} + 33%|███▎ | 1899/5772 [3:20:23<6:37:47, 6.16s/it] {'loss': 0.4819, 'learning_rate': 1.566891762394471e-05, 'epoch': 0.33} + 33%|███▎ | 1899/5772 [3:20:16<6:37:47, 6.16s/it]WARNING: tokenization mismatch: 1 vs. 1419. [[{'from': 'human', 'value': 'ASSISTANT: You are an innovative tax advisory firm with the following knowledge and traits.\n\nKNOWLEDGE: Tax advisor, junior tax advisor, senior tax advisor, financial strategist, tax regulations.\n\nTRAITS: Empathy, innovation, solution orientation, professionalism, communication, client focus.\n\nTASK: Use your knowledge to develop an internal strategy document. We are a tax advisory firm looking to identify 10 creative ways to improve client satisfaction and loyalty. Focus on ideas that are tailored to the needs of tax advisors and enhance the overall client experience.\n\nOUTPUT: A table in German containing 10 creative strategies to improve client satisfaction and loyalty for a tax advisory firm, including a brief description for each.'}, {'from': 'gpt', 'value': 'Tabelle: 10 kreative Strategien zur Steigerung der Kundenzufriedenheit und -loyalität in einer Steuerberatungskanzlei\n\n| Nr. | Strategie | Beschreibung |\n| --- | --- | --- |\n| 1 | Personalisierter Service | Jeder Kunde erhält einen dedizierten Steuerberater, der auf seine individuellen Bedürfnisse und Anliegen eingeht. |\n| 2 | Proaktive Steuerplanung | Regelmäßige Updates und Empfehlungen zur Steueroptimierung, basierend auf aktuellen Gesetzesänderungen und Kundenzielen. |\n| 3 | Umfassende Schulungsangebote | Kostenlose Workshops und Webinare, um das Steuerwissen der Kunden zu erweitern und den Umgang mit Steuerthemen zu erleichtern. |\n| 4 | Digitale Plattform | Eine benutzerfreundliche Online-Plattform, die den Kunden den Zugriff auf ihre Steuerdokumente und den Kontakt zum Berater erleichtert. |\n| 5 | Einführung von Steuer-Coaching | Persönliche Coaching-Sitzungen, um den Kunden dabei zu unterstützen, ihre steuerlichen Ziele und Möglichkeiten besser zu verstehen. |\n| 6 | Schnelle Reaktionszeiten | Verbindliches Versprechen, innerhalb von 24 Stunden auf Kundenanfragen zu reagieren, um eine effiziente Kommunikation zu gewährleisten. |\n| 7 | Transparente Preisgestaltung | Klare und faire Preisstrukturen, die den Kunden im Voraus bekannt sind, um böse Überraschungen zu vermeiden. |\n| 8 | Individuelle Jahresgespräche | Regelmäßige persönliche Jahresgespräche, um auf die Bedürfnisse und Ziele der Kunden einzugehen und die Zusammenarbeit zu stärken. |\n| 9 | Kunden-Feedback-System | Einrichtung eines einfachen Feedback-Systems, um Kundenmeinungen und Verbesserungsvorschläge aktiv einzuholen und umzusetzen. |\n| 10 | Steuer-Newsletter und Informationsangebot | Monatlicher Newsletter mit aktuellen Steuerinformationen und Tipps, der den Kunden einen Mehrwert bietet und sie auf dem Laufenden hält. |'}, {'from': 'human', 'value': 'Bitte etwas kreativer und auf bezugnehmend auf die zwischenmenschliche Ebene.'}, {'from': 'gpt', 'value': 'Tabelle: 10 kreative Strategien zur Steigerung der Kundenzufriedenheit und -loyalität in einer Steuerberatungskanzlei mit Fokus auf zwischenmenschlicher Ebene\n\n| Nr. | Strategie | Beschreibung |\n| --- | --- | --- |\n| 1 | Kunden-Events | Organisation von regelmäßigen, informellen Veranstaltungen, um den persönlichen Kontakt zwischen Klienten und Beratern zu fördern. |\n| 2 | Persönliche Geburtstagsgrüße | Individuelle Geburtstagskarten oder -nachrichten für jeden Kunden, um Wertschätzung zu zeigen und Beziehungen zu stärken. |\n| 3 | Kundenbefragung & Anpassung | Aktives Einholen von Kundenmeinungen zur individuellen Betreuung und Anpassung unserer Leistungen entsprechend der Kundenbedürfnisse. |\n| 4 | Teamvorstellung & Empathie | Vorstellung unserer Berater auf der Firmenwebsite und in sozialen Medien, um eine persönliche und empathische Beziehung zum Kunden aufzubauen. |\n| 5 | Kunden-Mentoring-Programm | Schaffung eines Programms, bei dem langjährige Kunden als Mentoren für Neukunden fungieren, um Vertrauen und Loyalität zu fördern. |\n| 6 | "Tag der offenen Tür" | Einladung der Kunden zu einem "Tag der offenen Tür" in unserer Kanzlei, um ihnen einen persönlichen Einblick in unsere Arbeitsweise zu geben. |\n| 7 | Individuelle Erfolgsgeschichten | Erstellung von individuellen Erfolgsgeschichten für jeden Kunden, um ihre steuerlichen Erfolge hervorzuheben und ihre Loyalität zu stärken. |\n| 8 | Gemeinsame soziale Projekte | Zusammenarbeit mit Kunden an sozialen Projekten und Spendenaktionen, um gemeinsame Werte zu teilen und die Zusammenarbeit zu fördern. |\n| 9 | Flexibilität in der Kommunikation | Anbieten verschiedener Kommunikationswege (Telefon, E-Mail, Videoanruf, etc.), um auf die individuellen Bedürfnisse der Kunden einzugehen. |\n| 10 | Jährliches Kunden-Dankeschön | Überraschung der Kunden mit einer kleinen Aufmerksamkeit oder einem Dankeschön-Event, um die Wertschätzung für ihre Treue auszudrücken. |'}]] (ignored) +13 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend...10 + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 33%|███▎ | 1900/5772 [3:20:22<6:34:25, 6.11s/it]3 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 33%|███▎ | 1900/5772 [3:20:29<6:34:25, 6.11s/it]15 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4824, 'learning_rate': 1.5664293616462894e-05, 'epoch': 0.33} + 33%|███▎ | 1900/5772 [3:20:29<6:34:25, 6.11s/it] {'loss': 0.4824, 'learning_rate': 1.5664293616462894e-05, 'epoch': 0.33} + 33%|███▎ | 1900/5772 [3:20:22<6:34:25, 6.11s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-1900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 33%|███▎ | 1901/5772 [3:20:41<10:50:15, 10.08s/it] 33%|███▎ | 1901/5772 [3:20:48<10:50:17, 10.08s/it] {'loss': 0.4747, 'learning_rate': 1.56596678250433e-05, 'epoch': 0.33} + 33%|███▎ | 1901/5772 [3:20:48<10:50:17, 10.08s/it] {'loss': 0.4747, 'learning_rate': 1.56596678250433e-05, 'epoch': 0.33} + 33%|███▎ | 1901/5772 [3:20:41<10:50:15, 10.08s/it] 33%|███▎ | 1902/5772 [3:20:47<9:32:46, 8.88s/it] 33%|███▎ | 1902/5772 [3:20:55<9:32:46, 8.88s/it] {'loss': 0.4805, 'learning_rate': 1.5655040251142787e-05, 'epoch': 0.33} + 33%|███▎ | 1902/5772 [3:20:55<9:32:46, 8.88s/it] {'loss': 0.4805, 'learning_rate': 1.5655040251142787e-05, 'epoch': 0.33} + 33%|███▎ | 1902/5772 [3:20:47<9:32:46, 8.88s/it] 33%|███▎ | 1903/5772 [3:20:53<8:36:35, 8.01s/it] 33%|███▎ | 1903/5772 [3:21:00<8:36:35, 8.01s/it] {'loss': 0.4683, 'learning_rate': 1.5650410896218788e-05, 'epoch': 0.33} + 33%|███▎ | 1903/5772 [3:21:00<8:36:35, 8.01s/it] {'loss': 0.4683, 'learning_rate': 1.5650410896218788e-05, 'epoch': 0.33} + 33%|███▎ | 1903/5772 [3:20:53<8:36:35, 8.01s/it] 33%|███▎ | 1904/5772 [3:21:00<8:02:59, 7.49s/it] 33%|███▎ | 1904/5772 [3:21:07<8:02:59, 7.49s/it] {'loss': 0.4902, 'learning_rate': 1.5645779761729297e-05, 'epoch': 0.33} + 33%|███▎ | 1904/5772 [3:21:07<8:02:59, 7.49s/it] {'loss': 0.4902, 'learning_rate': 1.5645779761729297e-05, 'epoch': 0.33} + 33%|███▎ | 1904/5772 [3:21:00<8:02:59, 7.49s/it] 33%|███▎ | 1905/5772 [3:21:06<7:38:22, 7.11s/it] 33%|███▎ | 1905/5772 [3:21:13<7:38:21, 7.11s/it] {'loss': 0.4837, 'learning_rate': 1.564114684913286e-05, 'epoch': 0.33} + 33%|███▎ | 1905/5772 [3:21:13<7:38:21, 7.11s/it] {'loss': 0.4837, 'learning_rate': 1.564114684913286e-05, 'epoch': 0.33} + 33%|███▎ | 1905/5772 [3:21:06<7:38:22, 7.11s/it] 33%|███▎ | 1906/5772 [3:21:12<7:26:38, 6.93s/it] 33%|███▎ | 1906/5772 [3:21:20<7:26:38, 6.93s/it] {'loss': 0.4874, 'learning_rate': 1.563651215988859e-05, 'epoch': 0.33} + 33%|███▎ | 1906/5772 [3:21:20<7:26:38, 6.93s/it] {'loss': 0.4874, 'learning_rate': 1.563651215988859e-05, 'epoch': 0.33} + 33%|███▎ | 1906/5772 [3:21:12<7:26:38, 6.93s/it] 33%|███▎ | 1907/5772 [3:21:19<7:10:54, 6.69s/it] 33%|███▎ | 1907/5772 [3:21:26<7:10:54, 6.69s/it] {'loss': 0.4736, 'learning_rate': 1.5631875695456154e-05, 'epoch': 0.33} + 33%|███▎ | 1907/5772 [3:21:26<7:10:54, 6.69s/it] {'loss': 0.4736, 'learning_rate': 1.5631875695456154e-05, 'epoch': 0.33} + 33%|███▎ | 1907/5772 [3:21:19<7:10:54, 6.69s/it] 33%|███▎ | 1908/5772 [3:21:25<7:01:16, 6.54s/it] 33%|███▎ | 1908/5772 [3:21:32<7:01:16, 6.54s/it] {'loss': 0.4728, 'learning_rate': 1.5627237457295778e-05, 'epoch': 0.33} + 33%|███▎ | 1908/5772 [3:21:25<7:01:16, 6.54s/it]{'loss': 0.4728, 'learning_rate': 1.5627237457295778e-05, 'epoch': 0.33} + 33%|███▎ | 1908/5772 [3:21:32<7:01:16, 6.54s/it] 33%|███▎ | 1909/5772 [3:21:31<6:50:09, 6.37s/it] 33%|███▎ | 1909/5772 [3:21:38<6:50:09, 6.37s/it] {'loss': 0.4907, 'learning_rate': 1.5622597446868254e-05, 'epoch': 0.33} + 33%|███▎ | 1909/5772 [3:21:38<6:50:09, 6.37s/it] {'loss': 0.4907, 'learning_rate': 1.5622597446868254e-05, 'epoch': 0.33} + 33%|███▎ | 1909/5772 [3:21:31<6:50:09, 6.37s/it] 33%|███▎ | 1910/5772 [3:21:37<6:49:09, 6.36s/it] 33%|███▎ | 1910/5772 [3:21:44<6:49:09, 6.36s/it] {'loss': 0.4877, 'learning_rate': 1.5617955665634925e-05, 'epoch': 0.33} + 33%|███▎ | 1910/5772 [3:21:44<6:49:09, 6.36s/it] {'loss': 0.4877, 'learning_rate': 1.5617955665634925e-05, 'epoch': 0.33} + 33%|███▎ | 1910/5772 [3:21:37<6:49:09, 6.36s/it] 33%|███▎ | 1911/5772 [3:21:43<6:44:06, 6.28s/it] 33%|███▎ | 1911/5772 [3:21:50<6:44:05, 6.28s/it] {'loss': 0.4736, 'learning_rate': 1.5613312115057697e-05, 'epoch': 0.33} + 33%|███▎ | 1911/5772 [3:21:50<6:44:05, 6.28s/it] {'loss': 0.4736, 'learning_rate': 1.5613312115057697e-05, 'epoch': 0.33} + 33%|███▎ | 1911/5772 [3:21:43<6:44:06, 6.28s/it] 33%|███▎ | 1912/5772 [3:21:49<6:37:04, 6.17s/it] 33%|███▎ | 1912/5772 [3:21:56<6:37:04, 6.17s/it] {'loss': 0.4774, 'learning_rate': 1.5608666796599026e-05, 'epoch': 0.33} + 33%|███▎ | 1912/5772 [3:21:56<6:37:04, 6.17s/it] {'loss': 0.4774, 'learning_rate': 1.5608666796599026e-05, 'epoch': 0.33} + 33%|███▎ | 1912/5772 [3:21:49<6:37:04, 6.17s/it] 33%|███▎ | 1913/5772 [3:21:55<6:37:47, 6.18s/it] 33%|███▎ | 1913/5772 [3:22:02<6:37:47, 6.18s/it] {'loss': 0.4786, 'learning_rate': 1.5604019711721935e-05, 'epoch': 0.33} + 33%|███▎ | 1913/5772 [3:22:02<6:37:47, 6.18s/it] {'loss': 0.4786, 'learning_rate': 1.5604019711721935e-05, 'epoch': 0.33} + 33%|███▎ | 1913/5772 [3:21:55<6:37:47, 6.18s/it] 33%|███▎ | 1914/5772 [3:22:02<6:44:28, 6.29s/it] 33%|███▎ | 1914/5772 [3:22:09<6:44:28, 6.29s/it] {'loss': 0.4809, 'learning_rate': 1.559937086188999e-05, 'epoch': 0.33} + 33%|███▎ | 1914/5772 [3:22:09<6:44:28, 6.29s/it] {'loss': 0.4809, 'learning_rate': 1.559937086188999e-05, 'epoch': 0.33} + 33%|███▎ | 1914/5772 [3:22:02<6:44:28, 6.29s/it] 33%|███▎ | 1915/5772 [3:22:08<6:40:56, 6.24s/it] 33%|███▎ | 1915/5772 [3:22:15<6:40:56, 6.24s/it] {'loss': 0.4688, 'learning_rate': 1.5594720248567327e-05, 'epoch': 0.33} + 33%|███▎ | 1915/5772 [3:22:15<6:40:56, 6.24s/it] {'loss': 0.4688, 'learning_rate': 1.5594720248567327e-05, 'epoch': 0.33} + 33%|███▎ | 1915/5772 [3:22:08<6:40:56, 6.24s/it] 33%|███▎ | 1916/5772 [3:22:14<6:34:16, 6.14s/it] 33%|███▎ | 1916/5772 [3:22:21<6:34:16, 6.13s/it] {'loss': 0.4923, 'learning_rate': 1.5590067873218627e-05, 'epoch': 0.33} + 33%|███▎ | 1916/5772 [3:22:21<6:34:16, 6.13s/it] {'loss': 0.4923, 'learning_rate': 1.5590067873218627e-05, 'epoch': 0.33} + 33%|███▎ | 1916/5772 [3:22:14<6:34:16, 6.14s/it] 33%|███▎ | 1917/5772 [3:22:21<6:47:43, 6.35s/it] 33%|███▎ | 1917/5772 [3:22:28<6:47:43, 6.35s/it] {'loss': 0.4631, 'learning_rate': 1.5585413737309133e-05, 'epoch': 0.33} + 33%|███▎ | 1917/5772 [3:22:28<6:47:43, 6.35s/it] {'loss': 0.4631, 'learning_rate': 1.5585413737309133e-05, 'epoch': 0.33} + 33%|███▎ | 1917/5772 [3:22:21<6:47:43, 6.35s/it] 33%|███▎ | 1918/5772 [3:22:27<6:48:40, 6.36s/it] 33%|███▎ | 1918/5772 [3:22:34<6:48:41, 6.36s/it] {'loss': 0.4852, 'learning_rate': 1.558075784230464e-05, 'epoch': 0.33} + 33%|███▎ | 1918/5772 [3:22:34<6:48:41, 6.36s/it] {'loss': 0.4852, 'learning_rate': 1.558075784230464e-05, 'epoch': 0.33} + 33%|███▎ | 1918/5772 [3:22:27<6:48:40, 6.36s/it] 33%|███▎ | 1919/5772 [3:22:40<6:39:29, 6.22s/it] 33%|███▎ | 1919/5772 [3:22:33<6:39:30, 6.22s/it] {'loss': 0.4703, 'learning_rate': 1.557610018967149e-05, 'epoch': 0.33} + 33%|███▎ | 1919/5772 [3:22:40<6:39:29, 6.22s/it] {'loss': 0.4703, 'learning_rate': 1.557610018967149e-05, 'epoch': 0.33} + 33%|███▎ | 1919/5772 [3:22:33<6:39:30, 6.22s/it] 33%|███▎ | 1920/5772 [3:22:46<6:30:48, 6.09s/it] 33%|███▎ | 1920/5772 [3:22:39<6:30:48, 6.09s/it] {'loss': 0.4803, 'learning_rate': 1.5571440780876588e-05, 'epoch': 0.33} + 33%|███▎ | 1920/5772 [3:22:46<6:30:48, 6.09s/it] {'loss': 0.4803, 'learning_rate': 1.5571440780876588e-05, 'epoch': 0.33} + 33%|███▎ | 1920/5772 [3:22:39<6:30:48, 6.09s/it] 33%|███▎ | 1921/5772 [3:22:45<6:29:06, 6.06s/it] 33%|███▎ | 1921/5772 [3:22:52<6:29:06, 6.06s/it] {'loss': 0.4872, 'learning_rate': 1.556677961738739e-05, 'epoch': 0.33} + 33%|███▎ | 1921/5772 [3:22:52<6:29:06, 6.06s/it] {'loss': 0.4872, 'learning_rate': 1.556677961738739e-05, 'epoch': 0.33} + 33%|███▎ | 1921/5772 [3:22:45<6:29:06, 6.06s/it] 33%|███▎ | 1922/5772 [3:22:51<6:30:06, 6.08s/it] 33%|███▎ | 1922/5772 [3:22:58<6:30:07, 6.08s/it] {'loss': 0.486, 'learning_rate': 1.5562116700671907e-05, 'epoch': 0.33} + 33%|███▎ | 1922/5772 [3:22:58<6:30:07, 6.08s/it] {'loss': 0.486, 'learning_rate': 1.5562116700671907e-05, 'epoch': 0.33} + 33%|███▎ | 1922/5772 [3:22:51<6:30:06, 6.08s/it] 33%|███▎ | 1923/5772 [3:22:57<6:29:56, 6.08s/it] 33%|███▎ | 1923/5772 [3:23:04<6:29:56, 6.08s/it] {'loss': 0.4732, 'learning_rate': 1.555745203219869e-05, 'epoch': 0.33} + 33%|███▎ | 1923/5772 [3:23:04<6:29:56, 6.08s/it] {'loss': 0.4732, 'learning_rate': 1.555745203219869e-05, 'epoch': 0.33} + 33%|███▎ | 1923/5772 [3:22:57<6:29:56, 6.08s/it] 33%|███▎ | 1924/5772 [3:23:03<6:37:20, 6.20s/it] 33%|███▎ | 1924/5772 [3:23:10<6:37:20, 6.20s/it] {'loss': 0.4689, 'learning_rate': 1.5552785613436853e-05, 'epoch': 0.33} + 33%|███▎ | 1924/5772 [3:23:10<6:37:20, 6.20s/it] {'loss': 0.4689, 'learning_rate': 1.5552785613436853e-05, 'epoch': 0.33} + 33%|███▎ | 1924/5772 [3:23:03<6:37:20, 6.20s/it] 33%|███▎ | 1925/5772 [3:23:10<6:35:33, 6.17s/it] 33%|███▎ | 1925/5772 [3:23:17<6:35:33, 6.17s/it] {'loss': 0.4753, 'learning_rate': 1.5548117445856067e-05, 'epoch': 0.33} + 33%|███▎ | 1925/5772 [3:23:17<6:35:33, 6.17s/it] {'loss': 0.4753, 'learning_rate': 1.5548117445856067e-05, 'epoch': 0.33} + 33%|███▎ | 1925/5772 [3:23:10<6:35:33, 6.17s/it] 33%|███▎ | 1926/5772 [3:23:16<6:39:06, 6.23s/it] 33%|███▎ | 1926/5772 [3:23:23<6:39:06, 6.23s/it] {'loss': 0.4887, 'learning_rate': 1.5543447530926536e-05, 'epoch': 0.33} + 33%|███▎ | 1926/5772 [3:23:23<6:39:06, 6.23s/it] {'loss': 0.4887, 'learning_rate': 1.5543447530926536e-05, 'epoch': 0.33} + 33%|███▎ | 1926/5772 [3:23:16<6:39:06, 6.23s/it] 33%|███▎ | 1927/5772 [3:23:22<6:30:59, 6.10s/it] 33%|███▎ | 1927/5772 [3:23:29<6:30:59, 6.10s/it] {'loss': 0.4671, 'learning_rate': 1.5538775870119026e-05, 'epoch': 0.33} + 33%|███▎ | 1927/5772 [3:23:29<6:30:59, 6.10s/it] {'loss': 0.4671, 'learning_rate': 1.5538775870119026e-05, 'epoch': 0.33} + 33%|███▎ | 1927/5772 [3:23:22<6:30:59, 6.10s/it] 33%|███▎ | 1928/5772 [3:23:28<6:35:41, 6.18s/it] 33%|███▎ | 1928/5772 [3:23:35<6:35:41, 6.18s/it] {'loss': 0.4737, 'learning_rate': 1.553410246490485e-05, 'epoch': 0.33} + 33%|███▎ | 1928/5772 [3:23:35<6:35:41, 6.18s/it] {'loss': 0.4737, 'learning_rate': 1.553410246490485e-05, 'epoch': 0.33} + 33%|███▎ | 1928/5772 [3:23:28<6:35:41, 6.18s/it] 33%|███▎ | 1929/5772 [3:23:34<6:30:05, 6.09s/it] 33%|███▎ | 1929/5772 [3:23:41<6:30:05, 6.09s/it] {'loss': 0.4791, 'learning_rate': 1.5529427316755876e-05, 'epoch': 0.33} + 33%|███▎ | 1929/5772 [3:23:41<6:30:05, 6.09s/it] {'loss': 0.4791, 'learning_rate': 1.5529427316755876e-05, 'epoch': 0.33} + 33%|███▎ | 1929/5772 [3:23:34<6:30:05, 6.09s/it] 33%|███▎ | 1930/5772 [3:23:40<6:29:20, 6.08s/it] 33%|███▎ | 1930/5772 [3:23:47<6:29:20, 6.08s/it] {'loss': 0.4842, 'learning_rate': 1.552475042714451e-05, 'epoch': 0.33} + 33%|███▎ | 1930/5772 [3:23:47<6:29:20, 6.08s/it] {'loss': 0.4842, 'learning_rate': 1.552475042714451e-05, 'epoch': 0.33} + 33%|███▎ | 1930/5772 [3:23:40<6:29:20, 6.08s/it] 33%|███▎ | 1931/5772 [3:23:47<6:38:19, 6.22s/it] 33%|███▎ | 1931/5772 [3:23:54<6:38:19, 6.22s/it] {'loss': 0.4761, 'learning_rate': 1.5520071797543717e-05, 'epoch': 0.33} + 33%|███▎ | 1931/5772 [3:23:54<6:38:19, 6.22s/it] {'loss': 0.4761, 'learning_rate': 1.5520071797543717e-05, 'epoch': 0.33} + 33%|███▎ | 1931/5772 [3:23:47<6:38:19, 6.22s/it] 33%|███▎ | 1932/5772 [3:23:53<6:34:39, 6.17s/it] 33%|███▎ | 1932/5772 [3:24:00<6:34:39, 6.17s/it] {'loss': 0.4845, 'learning_rate': 1.5515391429427e-05, 'epoch': 0.33} + 33%|███▎ | 1932/5772 [3:24:00<6:34:39, 6.17s/it] {'loss': 0.4845, 'learning_rate': 1.5515391429427e-05, 'epoch': 0.33} + 33%|███▎ | 1932/5772 [3:23:53<6:34:39, 6.17s/it] 33%|███▎ | 1933/5772 [3:23:59<6:40:29, 6.26s/it] 33%|███▎ | 1933/5772 [3:24:06<6:40:29, 6.26s/it] {'loss': 0.4711, 'learning_rate': 1.5510709324268422e-05, 'epoch': 0.33} + 33%|███▎ | 1933/5772 [3:24:06<6:40:29, 6.26s/it] {'loss': 0.4711, 'learning_rate': 1.5510709324268422e-05, 'epoch': 0.33} + 33%|███▎ | 1933/5772 [3:23:59<6:40:29, 6.26s/it] 34%|███▎ | 1934/5772 [3:24:05<6:35:23, 6.18s/it] 34%|███▎ | 1934/5772 [3:24:12<6:35:23, 6.18s/it] {'loss': 0.472, 'learning_rate': 1.5506025483542577e-05, 'epoch': 0.34} + 34%|███▎ | 1934/5772 [3:24:12<6:35:23, 6.18s/it] {'loss': 0.472, 'learning_rate': 1.5506025483542577e-05, 'epoch': 0.34} + 34%|███▎ | 1934/5772 [3:24:05<6:35:23, 6.18s/it] 34%|███▎ | 1935/5772 [3:24:11<6:33:46, 6.16s/it] 34%|███▎ | 1935/5772 [3:24:18<6:33:47, 6.16s/it] {'loss': 0.4945, 'learning_rate': 1.5501339908724624e-05, 'epoch': 0.34} + 34%|███▎ | 1935/5772 [3:24:18<6:33:47, 6.16s/it] {'loss': 0.4945, 'learning_rate': 1.5501339908724624e-05, 'epoch': 0.34} + 34%|███▎ | 1935/5772 [3:24:11<6:33:46, 6.16s/it] 34%|███▎ | 1936/5772 [3:24:17<6:30:46, 6.11s/it] 34%|███▎ | 1936/5772 [3:24:24<6:30:46, 6.11s/it] {'loss': 0.4747, 'learning_rate': 1.5496652601290253e-05, 'epoch': 0.34} + 34%|███▎ | 1936/5772 [3:24:24<6:30:46, 6.11s/it] {'loss': 0.4747, 'learning_rate': 1.5496652601290253e-05, 'epoch': 0.34} + 34%|███▎ | 1936/5772 [3:24:17<6:30:46, 6.11s/it] 34%|███▎ | 1937/5772 [3:24:23<6:29:36, 6.10s/it] 34%|███▎ | 1937/5772 [3:24:30<6:29:35, 6.10s/it] {'loss': 0.4855, 'learning_rate': 1.5491963562715705e-05, 'epoch': 0.34} +{'loss': 0.4855, 'learning_rate': 1.5491963562715705e-05, 'epoch': 0.34} + 34%|███▎ | 1937/5772 [3:24:23<6:29:36, 6.10s/it] 34%|███▎ | 1937/5772 [3:24:30<6:29:35, 6.10s/it] 34%|███▎ | 1938/5772 [3:24:30<6:35:46, 6.19s/it] 34%|███▎ | 1938/5772 [3:24:37<6:35:46, 6.19s/it] {'loss': 0.4706, 'learning_rate': 1.548727279447777e-05, 'epoch': 0.34} + 34%|███▎ | 1938/5772 [3:24:37<6:35:46, 6.19s/it] {'loss': 0.4706, 'learning_rate': 1.548727279447777e-05, 'epoch': 0.34} + 34%|███▎ | 1938/5772 [3:24:30<6:35:46, 6.19s/it] 34%|███▎ | 1939/5772 [3:24:36<6:29:29, 6.10s/it] 34%|███▎ | 1939/5772 [3:24:43<6:29:29, 6.10s/it] {'loss': 0.4798, 'learning_rate': 1.548258029805378e-05, 'epoch': 0.34} + 34%|███▎ | 1939/5772 [3:24:43<6:29:29, 6.10s/it] {'loss': 0.4798, 'learning_rate': 1.548258029805378e-05, 'epoch': 0.34} + 34%|███▎ | 1939/5772 [3:24:36<6:29:29, 6.10s/it] 34%|███▎ | 1940/5772 [3:24:42<6:31:06, 6.12s/it] 34%|███▎ | 1940/5772 [3:24:49<6:31:06, 6.12s/it] {'loss': 0.4732, 'learning_rate': 1.5477886074921604e-05, 'epoch': 0.34} + 34%|███▎ | 1940/5772 [3:24:49<6:31:06, 6.12s/it] {'loss': 0.4732, 'learning_rate': 1.5477886074921604e-05, 'epoch': 0.34} + 34%|███▎ | 1940/5772 [3:24:42<6:31:06, 6.12s/it] 34%|███▎ | 1941/5772 [3:24:48<6:35:19, 6.19s/it] 34%|███▎ | 1941/5772 [3:24:55<6:35:19, 6.19s/it] {'loss': 0.4786, 'learning_rate': 1.5473190126559667e-05, 'epoch': 0.34} + 34%|███▎ | 1941/5772 [3:24:55<6:35:19, 6.19s/it] {'loss': 0.4786, 'learning_rate': 1.5473190126559667e-05, 'epoch': 0.34} + 34%|███▎ | 1941/5772 [3:24:48<6:35:19, 6.19s/it] 34%|███▎ | 1942/5772 [3:24:54<6:35:50, 6.20s/it] 34%|███▎ | 1942/5772 [3:25:01<6:35:50, 6.20s/it] {'loss': 0.4803, 'learning_rate': 1.546849245444693e-05, 'epoch': 0.34} + 34%|███▎ | 1942/5772 [3:25:01<6:35:50, 6.20s/it] {'loss': 0.4803, 'learning_rate': 1.546849245444693e-05, 'epoch': 0.34} + 34%|███▎ | 1942/5772 [3:24:54<6:35:50, 6.20s/it] 34%|███▎ | 1943/5772 [3:25:00<6:30:42, 6.12s/it] 34%|███▎ | 1943/5772 [3:25:07<6:30:42, 6.12s/it] {'loss': 0.478, 'learning_rate': 1.5463793060062903e-05, 'epoch': 0.34} + 34%|███▎ | 1943/5772 [3:25:07<6:30:42, 6.12s/it] {'loss': 0.478, 'learning_rate': 1.5463793060062903e-05, 'epoch': 0.34} + 34%|███▎ | 1943/5772 [3:25:00<6:30:42, 6.12s/it] 34%|███▎ | 1944/5772 [3:25:06<6:32:51, 6.16s/it] 34%|███▎ | 1944/5772 [3:25:14<6:32:51, 6.16s/it] {'loss': 0.489, 'learning_rate': 1.5459091944887626e-05, 'epoch': 0.34} + 34%|███▎ | 1944/5772 [3:25:14<6:32:51, 6.16s/it] {'loss': 0.489, 'learning_rate': 1.5459091944887626e-05, 'epoch': 0.34} + 34%|███▎ | 1944/5772 [3:25:06<6:32:51, 6.16s/it] 34%|███▎ | 1945/5772 [3:25:12<6:28:17, 6.09s/it] 34%|███▎ | 1945/5772 [3:25:19<6:28:18, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.5454389110401694e-05, 'epoch': 0.34} + 34%|███▎ | 1945/5772 [3:25:19<6:28:18, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.5454389110401694e-05, 'epoch': 0.34} + 34%|███▎ | 1945/5772 [3:25:12<6:28:17, 6.09s/it] 34%|███▎ | 1946/5772 [3:25:18<6:27:21, 6.07s/it] 34%|███▎ | 1946/5772 [3:25:25<6:27:21, 6.07s/it] {'loss': 0.4861, 'learning_rate': 1.5449684558086243e-05, 'epoch': 0.34} + 34%|███▎ | 1946/5772 [3:25:25<6:27:21, 6.07s/it] {'loss': 0.4861, 'learning_rate': 1.5449684558086243e-05, 'epoch': 0.34} + 34%|███▎ | 1946/5772 [3:25:18<6:27:21, 6.07s/it] 34%|███▎ | 1947/5772 [3:25:24<6:22:40, 6.00s/it] 34%|███▎ | 1947/5772 [3:25:31<6:22:40, 6.00s/it] {'loss': 0.4789, 'learning_rate': 1.5444978289422937e-05, 'epoch': 0.34} + 34%|███▎ | 1947/5772 [3:25:31<6:22:40, 6.00s/it] {'loss': 0.4789, 'learning_rate': 1.5444978289422937e-05, 'epoch': 0.34} + 34%|███▎ | 1947/5772 [3:25:24<6:22:40, 6.00s/it] 34%|███▎ | 1948/5772 [3:25:31<6:27:22, 6.08s/it] 34%|███▎ | 1948/5772 [3:25:38<6:27:22, 6.08s/it] {'loss': 0.479, 'learning_rate': 1.5440270305893995e-05, 'epoch': 0.34} + 34%|███▎ | 1948/5772 [3:25:38<6:27:22, 6.08s/it] {'loss': 0.479, 'learning_rate': 1.5440270305893995e-05, 'epoch': 0.34} + 34%|███▎ | 1948/5772 [3:25:31<6:27:22, 6.08s/it] 34%|███▍ | 1949/5772 [3:25:37<6:29:53, 6.12s/it] 34%|███▍ | 1949/5772 [3:25:44<6:29:53, 6.12s/it] {'loss': 0.4813, 'learning_rate': 1.5435560608982166e-05, 'epoch': 0.34} + 34%|███▍ | 1949/5772 [3:25:44<6:29:53, 6.12s/it] {'loss': 0.4813, 'learning_rate': 1.5435560608982166e-05, 'epoch': 0.34} + 34%|███▍ | 1949/5772 [3:25:37<6:29:53, 6.12s/it]05 AutoResumeHook: Checking whether to suspend... +128 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 34%|███▍ | 1950/5772 [3:25:50<6:26:07, 6.06s/it]11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +461 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 34%|███▍ | 1950/5772 [3:25:43<6:26:07, 6.06s/it]2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4849, 'learning_rate': 1.5430849200170747e-05, 'epoch': 0.34} + 34%|███▍ | 1950/5772 [3:25:50<6:26:07, 6.06s/it] {'loss': 0.4849, 'learning_rate': 1.5430849200170747e-05, 'epoch': 0.34} + 34%|███▍ | 1950/5772 [3:25:43<6:26:07, 6.06s/it] 34%|███▍ | 1951/5772 [3:25:49<6:36:29, 6.23s/it] 34%|███▍ | 1951/5772 [3:25:56<6:36:29, 6.23s/it] {'loss': 0.4782, 'learning_rate': 1.5426136080943566e-05, 'epoch': 0.34} + 34%|███▍ | 1951/5772 [3:25:56<6:36:29, 6.23s/it] {'loss': 0.4782, 'learning_rate': 1.5426136080943566e-05, 'epoch': 0.34} + 34%|███▍ | 1951/5772 [3:25:49<6:36:29, 6.23s/it] 34%|███▍ | 1952/5772 [3:25:55<6:32:51, 6.17s/it] 34%|███▍ | 1952/5772 [3:26:02<6:32:51, 6.17s/it] {'loss': 0.4741, 'learning_rate': 1.5421421252784998e-05, 'epoch': 0.34} + 34%|███▍ | 1952/5772 [3:26:02<6:32:51, 6.17s/it] {'loss': 0.4741, 'learning_rate': 1.5421421252784998e-05, 'epoch': 0.34} + 34%|███▍ | 1952/5772 [3:25:55<6:32:51, 6.17s/it] 34%|███▍ | 1953/5772 [3:26:01<6:30:45, 6.14s/it] 34%|███▍ | 1953/5772 [3:26:08<6:30:45, 6.14s/it] {'loss': 0.4812, 'learning_rate': 1.541670471717995e-05, 'epoch': 0.34} + 34%|███▍ | 1953/5772 [3:26:08<6:30:45, 6.14s/it] {'loss': 0.4812, 'learning_rate': 1.541670471717995e-05, 'epoch': 0.34} + 34%|███▍ | 1953/5772 [3:26:01<6:30:45, 6.14s/it] 34%|███▍ | 1954/5772 [3:26:07<6:30:08, 6.13s/it] 34%|███▍ | 1954/5772 [3:26:15<6:30:08, 6.13s/it] {'loss': 0.4782, 'learning_rate': 1.5411986475613864e-05, 'epoch': 0.34} + 34%|███▍ | 1954/5772 [3:26:15<6:30:08, 6.13s/it] {'loss': 0.4782, 'learning_rate': 1.5411986475613864e-05, 'epoch': 0.34} + 34%|███▍ | 1954/5772 [3:26:07<6:30:08, 6.13s/it] 34%|███▍ | 1955/5772 [3:26:13<6:27:25, 6.09s/it] 34%|███▍ | 1955/5772 [3:26:21<6:27:25, 6.09s/it] {'loss': 0.4738, 'learning_rate': 1.540726652957273e-05, 'epoch': 0.34} + 34%|███▍ | 1955/5772 [3:26:21<6:27:25, 6.09s/it] {'loss': 0.4738, 'learning_rate': 1.540726652957273e-05, 'epoch': 0.34} + 34%|███▍ | 1955/5772 [3:26:13<6:27:25, 6.09s/it] 34%|███▍ | 1956/5772 [3:26:19<6:24:22, 6.04s/it] 34%|███▍ | 1956/5772 [3:26:26<6:24:22, 6.04s/it] {'loss': 0.472, 'learning_rate': 1.540254488054307e-05, 'epoch': 0.34} + 34%|███▍ | 1956/5772 [3:26:26<6:24:22, 6.04s/it] {'loss': 0.472, 'learning_rate': 1.540254488054307e-05, 'epoch': 0.34} + 34%|███▍ | 1956/5772 [3:26:19<6:24:22, 6.04s/it] 34%|███▍ | 1957/5772 [3:26:26<6:25:16, 6.06s/it] 34%|███▍ | 1957/5772 [3:26:33<6:25:16, 6.06s/it] {'loss': 0.4885, 'learning_rate': 1.5397821530011935e-05, 'epoch': 0.34} + 34%|███▍ | 1957/5772 [3:26:33<6:25:16, 6.06s/it] {'loss': 0.4885, 'learning_rate': 1.5397821530011935e-05, 'epoch': 0.34} + 34%|███▍ | 1957/5772 [3:26:26<6:25:16, 6.06s/it] 34%|███▍ | 1958/5772 [3:26:32<6:27:22, 6.09s/it] 34%|███▍ | 1958/5772 [3:26:39<6:27:22, 6.09s/it] {'loss': 0.4813, 'learning_rate': 1.5393096479466922e-05, 'epoch': 0.34} + 34%|███▍ | 1958/5772 [3:26:39<6:27:22, 6.09s/it] {'loss': 0.4813, 'learning_rate': 1.5393096479466922e-05, 'epoch': 0.34} + 34%|███▍ | 1958/5772 [3:26:32<6:27:22, 6.09s/it] 34%|███▍ | 1959/5772 [3:26:38<6:33:12, 6.19s/it] 34%|███▍ | 1959/5772 [3:26:45<6:33:12, 6.19s/it] {'loss': 0.4777, 'learning_rate': 1.538836973039616e-05, 'epoch': 0.34} + 34%|███▍ | 1959/5772 [3:26:45<6:33:12, 6.19s/it] {'loss': 0.4777, 'learning_rate': 1.538836973039616e-05, 'epoch': 0.34} + 34%|███▍ | 1959/5772 [3:26:38<6:33:12, 6.19s/it] 34%|███▍ | 1960/5772 [3:26:44<6:27:29, 6.10s/it] 34%|███▍ | 1960/5772 [3:26:51<6:27:29, 6.10s/it] {'loss': 0.4748, 'learning_rate': 1.5383641284288308e-05, 'epoch': 0.34} + 34%|███▍ | 1960/5772 [3:26:51<6:27:29, 6.10s/it] {'loss': 0.4748, 'learning_rate': 1.5383641284288308e-05, 'epoch': 0.34} + 34%|███▍ | 1960/5772 [3:26:44<6:27:29, 6.10s/it] 34%|███▍ | 1961/5772 [3:26:50<6:25:08, 6.06s/it] 34%|███▍ | 1961/5772 [3:26:57<6:25:08, 6.06s/it] {'loss': 0.4864, 'learning_rate': 1.537891114263257e-05, 'epoch': 0.34} + 34%|███▍ | 1961/5772 [3:26:57<6:25:08, 6.06s/it] {'loss': 0.4864, 'learning_rate': 1.537891114263257e-05, 'epoch': 0.34} + 34%|███▍ | 1961/5772 [3:26:50<6:25:08, 6.06s/it] 34%|███▍ | 1962/5772 [3:26:56<6:21:09, 6.00s/it] 34%|███▍ | 1962/5772 [3:27:03<6:21:09, 6.00s/it] {'loss': 0.4671, 'learning_rate': 1.5374179306918674e-05, 'epoch': 0.34} + 34%|███▍ | 1962/5772 [3:27:03<6:21:09, 6.00s/it] {'loss': 0.4671, 'learning_rate': 1.5374179306918674e-05, 'epoch': 0.34} + 34%|███▍ | 1962/5772 [3:26:56<6:21:09, 6.00s/it] 34%|███▍ | 1963/5772 [3:27:02<6:21:56, 6.02s/it] 34%|███▍ | 1963/5772 [3:27:09<6:21:56, 6.02s/it] {'loss': 0.4594, 'learning_rate': 1.5369445778636885e-05, 'epoch': 0.34} + 34%|███▍ | 1963/5772 [3:27:09<6:21:56, 6.02s/it] {'loss': 0.4594, 'learning_rate': 1.5369445778636885e-05, 'epoch': 0.34} + 34%|███▍ | 1963/5772 [3:27:02<6:21:56, 6.02s/it] 34%|███▍ | 1964/5772 [3:27:08<6:25:45, 6.08s/it] 34%|███▍ | 1964/5772 [3:27:15<6:25:44, 6.08s/it] {'loss': 0.4797, 'learning_rate': 1.5364710559278e-05, 'epoch': 0.34} + 34%|███▍ | 1964/5772 [3:27:15<6:25:44, 6.08s/it] {'loss': 0.4797, 'learning_rate': 1.5364710559278e-05, 'epoch': 0.34} + 34%|███▍ | 1964/5772 [3:27:08<6:25:45, 6.08s/it] 34%|███▍ | 1965/5772 [3:27:15<6:33:38, 6.20s/it] 34%|███▍ | 1965/5772 [3:27:22<6:33:39, 6.20s/it] {'loss': 0.4878, 'learning_rate': 1.5359973650333352e-05, 'epoch': 0.34} + 34%|███▍ | 1965/5772 [3:27:22<6:33:39, 6.20s/it] {'loss': 0.4878, 'learning_rate': 1.5359973650333352e-05, 'epoch': 0.34} + 34%|███▍ | 1965/5772 [3:27:15<6:33:38, 6.20s/it] 34%|███▍ | 1966/5772 [3:27:21<6:31:29, 6.17s/it] 34%|███▍ | 1966/5772 [3:27:28<6:31:29, 6.17s/it] {'loss': 0.463, 'learning_rate': 1.535523505329481e-05, 'epoch': 0.34} + 34%|███▍ | 1966/5772 [3:27:28<6:31:29, 6.17s/it] {'loss': 0.463, 'learning_rate': 1.535523505329481e-05, 'epoch': 0.34} + 34%|███▍ | 1966/5772 [3:27:21<6:31:29, 6.17s/it] 34%|███▍ | 1967/5772 [3:27:27<6:35:14, 6.23s/it] 34%|███▍ | 1967/5772 [3:27:34<6:35:14, 6.23s/it] {'loss': 0.4725, 'learning_rate': 1.535049476965476e-05, 'epoch': 0.34} + 34%|███▍ | 1967/5772 [3:27:34<6:35:14, 6.23s/it] {'loss': 0.4725, 'learning_rate': 1.535049476965476e-05, 'epoch': 0.34} + 34%|███▍ | 1967/5772 [3:27:27<6:35:14, 6.23s/it] 34%|███▍ | 1968/5772 [3:27:41<6:40:58, 6.32s/it] 34%|███▍ | 1968/5772 [3:27:34<6:40:59, 6.32s/it] {'loss': 0.4716, 'learning_rate': 1.5345752800906128e-05, 'epoch': 0.34} + 34%|███▍ | 1968/5772 [3:27:41<6:40:58, 6.32s/it] {'loss': 0.4716, 'learning_rate': 1.5345752800906128e-05, 'epoch': 0.34} + 34%|███▍ | 1968/5772 [3:27:34<6:40:59, 6.32s/it] 34%|███▍ | 1969/5772 [3:27:40<6:34:58, 6.23s/it] 34%|███▍ | 1969/5772 [3:27:47<6:34:58, 6.23s/it] {'loss': 0.4837, 'learning_rate': 1.5341009148542378e-05, 'epoch': 0.34} + 34%|███▍ | 1969/5772 [3:27:47<6:34:58, 6.23s/it] {'loss': 0.4837, 'learning_rate': 1.5341009148542378e-05, 'epoch': 0.34} + 34%|███▍ | 1969/5772 [3:27:40<6:34:58, 6.23s/it] 34%|███▍ | 1970/5772 [3:27:46<6:35:04, 6.23s/it] 34%|███▍ | 1970/5772 [3:27:53<6:35:05, 6.23s/it] {'loss': 0.4715, 'learning_rate': 1.5336263814057493e-05, 'epoch': 0.34} + 34%|███▍ | 1970/5772 [3:27:53<6:35:05, 6.23s/it] {'loss': 0.4715, 'learning_rate': 1.5336263814057493e-05, 'epoch': 0.34} + 34%|███▍ | 1970/5772 [3:27:46<6:35:04, 6.23s/it] 34%|███▍ | 1971/5772 [3:27:52<6:30:20, 6.16s/it] 34%|███▍ | 1971/5772 [3:27:59<6:30:20, 6.16s/it] {'loss': 0.4911, 'learning_rate': 1.5331516798945987e-05, 'epoch': 0.34} + 34%|███▍ | 1971/5772 [3:27:59<6:30:20, 6.16s/it] {'loss': 0.4911, 'learning_rate': 1.5331516798945987e-05, 'epoch': 0.34} + 34%|███▍ | 1971/5772 [3:27:52<6:30:20, 6.16s/it] 34%|███▍ | 1972/5772 [3:27:59<6:41:56, 6.35s/it] 34%|███▍ | 1972/5772 [3:28:06<6:41:56, 6.35s/it] {'loss': 0.4739, 'learning_rate': 1.532676810470291e-05, 'epoch': 0.34} + 34%|███▍ | 1972/5772 [3:28:06<6:41:56, 6.35s/it] {'loss': 0.4739, 'learning_rate': 1.532676810470291e-05, 'epoch': 0.34} + 34%|███▍ | 1972/5772 [3:27:59<6:41:56, 6.35s/it] 34%|███▍ | 1973/5772 [3:28:05<6:35:41, 6.25s/it] 34%|███▍ | 1973/5772 [3:28:12<6:35:41, 6.25s/it] {'loss': 0.4929, 'learning_rate': 1.5322017732823836e-05, 'epoch': 0.34} + 34%|███▍ | 1973/5772 [3:28:12<6:35:41, 6.25s/it] {'loss': 0.4929, 'learning_rate': 1.5322017732823836e-05, 'epoch': 0.34} + 34%|███▍ | 1973/5772 [3:28:05<6:35:41, 6.25s/it] 34%|███▍ | 1974/5772 [3:28:11<6:44:17, 6.39s/it] 34%|███▍ | 1974/5772 [3:28:18<6:44:17, 6.39s/it] {'loss': 0.4621, 'learning_rate': 1.5317265684804865e-05, 'epoch': 0.34} + 34%|███▍ | 1974/5772 [3:28:18<6:44:17, 6.39s/it] {'loss': 0.4621, 'learning_rate': 1.5317265684804865e-05, 'epoch': 0.34} + 34%|███▍ | 1974/5772 [3:28:11<6:44:17, 6.39s/it] 34%|███▍ | 1975/5772 [3:28:17<6:37:59, 6.29s/it] 34%|███▍ | 1975/5772 [3:28:24<6:37:59, 6.29s/it] {'loss': 0.4749, 'learning_rate': 1.5312511962142634e-05, 'epoch': 0.34} + 34%|███▍ | 1975/5772 [3:28:24<6:37:59, 6.29s/it] {'loss': 0.4749, 'learning_rate': 1.5312511962142634e-05, 'epoch': 0.34} + 34%|███▍ | 1975/5772 [3:28:17<6:37:59, 6.29s/it] 34%|███▍ | 1976/5772 [3:28:23<6:33:13, 6.22s/it] 34%|███▍ | 1976/5772 [3:28:31<6:33:13, 6.22s/it] {'loss': 0.467, 'learning_rate': 1.5307756566334295e-05, 'epoch': 0.34} + 34%|███▍ | 1976/5772 [3:28:31<6:33:13, 6.22s/it] {'loss': 0.467, 'learning_rate': 1.5307756566334295e-05, 'epoch': 0.34} + 34%|███▍ | 1976/5772 [3:28:23<6:33:13, 6.22s/it] 34%|███▍ | 1977/5772 [3:28:29<6:26:14, 6.11s/it] 34%|███▍ | 1977/5772 [3:28:36<6:26:14, 6.11s/it] {'loss': 0.4822, 'learning_rate': 1.5302999498877537e-05, 'epoch': 0.34} + 34%|███▍ | 1977/5772 [3:28:36<6:26:14, 6.11s/it] {'loss': 0.4822, 'learning_rate': 1.5302999498877537e-05, 'epoch': 0.34} + 34%|███▍ | 1977/5772 [3:28:29<6:26:14, 6.11s/it] 34%|███▍ | 1978/5772 [3:28:36<6:35:51, 6.26s/it] 34%|███▍ | 1978/5772 [3:28:43<6:35:51, 6.26s/it] {'loss': 0.4682, 'learning_rate': 1.5298240761270575e-05, 'epoch': 0.34} + 34%|███▍ | 1978/5772 [3:28:43<6:35:51, 6.26s/it] {'loss': 0.4682, 'learning_rate': 1.5298240761270575e-05, 'epoch': 0.34} + 34%|███▍ | 1978/5772 [3:28:36<6:35:51, 6.26s/it] 34%|███▍ | 1979/5772 [3:28:42<6:37:51, 6.29s/it] 34%|███▍ | 1979/5772 [3:28:49<6:37:51, 6.29s/it] {'loss': 0.4744, 'learning_rate': 1.529348035501214e-05, 'epoch': 0.34} + 34%|███▍ | 1979/5772 [3:28:49<6:37:51, 6.29s/it] {'loss': 0.4744, 'learning_rate': 1.529348035501214e-05, 'epoch': 0.34} + 34%|███▍ | 1979/5772 [3:28:42<6:37:51, 6.29s/it] 34%|███▍ | 1980/5772 [3:28:48<6:32:53, 6.22s/it] 34%|███▍ | 1980/5772 [3:28:55<6:32:53, 6.22s/it] {'loss': 0.4743, 'learning_rate': 1.52887182816015e-05, 'epoch': 0.34} + 34%|███▍ | 1980/5772 [3:28:55<6:32:53, 6.22s/it] {'loss': 0.4743, 'learning_rate': 1.52887182816015e-05, 'epoch': 0.34} + 34%|███▍ | 1980/5772 [3:28:48<6:32:53, 6.22s/it] 34%|███▍ | 1981/5772 [3:28:55<6:31:29, 6.20s/it] 34%|███▍ | 1981/5772 [3:29:02<6:31:30, 6.20s/it] {'loss': 0.4805, 'learning_rate': 1.5283954542538442e-05, 'epoch': 0.34} + 34%|███▍ | 1981/5772 [3:29:02<6:31:30, 6.20s/it] {'loss': 0.4805, 'learning_rate': 1.5283954542538442e-05, 'epoch': 0.34} + 34%|███▍ | 1981/5772 [3:28:55<6:31:29, 6.20s/it] 34%|███▍ | 1982/5772 [3:29:01<6:38:53, 6.31s/it] 34%|███▍ | 1982/5772 [3:29:08<6:38:53, 6.31s/it] {'loss': 0.4704, 'learning_rate': 1.5279189139323284e-05, 'epoch': 0.34} + 34%|███▍ | 1982/5772 [3:29:08<6:38:53, 6.31s/it] {'loss': 0.4704, 'learning_rate': 1.5279189139323284e-05, 'epoch': 0.34} + 34%|███▍ | 1982/5772 [3:29:01<6:38:53, 6.31s/it] 34%|███▍ | 1983/5772 [3:29:07<6:37:34, 6.30s/it] 34%|███▍ | 1983/5772 [3:29:14<6:37:34, 6.30s/it] {'loss': 0.5015, 'learning_rate': 1.5274422073456853e-05, 'epoch': 0.34} + 34%|███▍ | 1983/5772 [3:29:14<6:37:34, 6.30s/it] {'loss': 0.5015, 'learning_rate': 1.5274422073456853e-05, 'epoch': 0.34} + 34%|███▍ | 1983/5772 [3:29:07<6:37:34, 6.30s/it] 34%|███▍ | 1984/5772 [3:29:14<6:40:48, 6.35s/it] {'loss': 0.4753, 'learning_rate': 1.526965334644052e-05, 'epoch': 0.34} + 34%|███▍ | 1984/5772 [3:29:14<6:40:48, 6.35s/it] 34%|███▍ | 1984/5772 [3:29:21<6:40:47, 6.35s/it] {'loss': 0.4753, 'learning_rate': 1.526965334644052e-05, 'epoch': 0.34} + 34%|███▍ | 1984/5772 [3:29:21<6:40:47, 6.35s/it] 34%|███▍ | 1985/5772 [3:29:20<6:38:37, 6.32s/it] 34%|███▍ | 1985/5772 [3:29:27<6:38:37, 6.32s/it] {'loss': 0.4768, 'learning_rate': 1.5264882959776164e-05, 'epoch': 0.34} + 34%|███▍ | 1985/5772 [3:29:27<6:38:37, 6.32s/it] {'loss': 0.4768, 'learning_rate': 1.5264882959776164e-05, 'epoch': 0.34} + 34%|███▍ | 1985/5772 [3:29:20<6:38:37, 6.32s/it] 34%|███▍ | 1986/5772 [3:29:26<6:29:14, 6.17s/it] 34%|███▍ | 1986/5772 [3:29:33<6:29:14, 6.17s/it] {'loss': 0.4517, 'learning_rate': 1.526011091496619e-05, 'epoch': 0.34} + 34%|███▍ | 1986/5772 [3:29:33<6:29:14, 6.17s/it] {'loss': 0.4517, 'learning_rate': 1.526011091496619e-05, 'epoch': 0.34} + 34%|███▍ | 1986/5772 [3:29:26<6:29:14, 6.17s/it] 34%|███▍ | 1987/5772 [3:29:32<6:25:59, 6.12s/it] 34%|███▍ | 1987/5772 [3:29:39<6:25:59, 6.12s/it] {'loss': 0.4776, 'learning_rate': 1.5255337213513532e-05, 'epoch': 0.34} + 34%|███▍ | 1987/5772 [3:29:39<6:25:59, 6.12s/it] {'loss': 0.4776, 'learning_rate': 1.5255337213513532e-05, 'epoch': 0.34} + 34%|███▍ | 1987/5772 [3:29:32<6:25:59, 6.12s/it] 34%|███▍ | 1988/5772 [3:29:38<6:27:30, 6.14s/it] 34%|███▍ | 1988/5772 [3:29:45<6:27:30, 6.14s/it] {'loss': 0.4706, 'learning_rate': 1.5250561856921638e-05, 'epoch': 0.34} + 34%|███▍ | 1988/5772 [3:29:45<6:27:30, 6.14s/it] {'loss': 0.4706, 'learning_rate': 1.5250561856921638e-05, 'epoch': 0.34} + 34%|███▍ | 1988/5772 [3:29:38<6:27:30, 6.14s/it] 34%|███▍ | 1989/5772 [3:29:44<6:30:29, 6.19s/it] 34%|███▍ | 1989/5772 [3:29:51<6:30:30, 6.19s/it] {'loss': 0.4815, 'learning_rate': 1.5245784846694483e-05, 'epoch': 0.34} + 34%|███▍ | 1989/5772 [3:29:51<6:30:30, 6.19s/it] {'loss': 0.4815, 'learning_rate': 1.5245784846694483e-05, 'epoch': 0.34} + 34%|███▍ | 1989/5772 [3:29:44<6:30:29, 6.19s/it] 34%|███▍ | 1990/5772 [3:29:50<6:27:28, 6.15s/it] 34%|███▍ | 1990/5772 [3:29:57<6:27:28, 6.15s/it] {'loss': 0.4739, 'learning_rate': 1.5241006184336553e-05, 'epoch': 0.34} + 34%|███▍ | 1990/5772 [3:29:57<6:27:28, 6.15s/it] {'loss': 0.4739, 'learning_rate': 1.5241006184336553e-05, 'epoch': 0.34} + 34%|███▍ | 1990/5772 [3:29:50<6:27:28, 6.15s/it] 34%|███▍ | 1991/5772 [3:29:57<6:26:20, 6.13s/it] 34%|███▍ | 1991/5772 [3:30:04<6:26:19, 6.13s/it] {'loss': 0.4761, 'learning_rate': 1.5236225871352867e-05, 'epoch': 0.34} + 34%|███▍ | 1991/5772 [3:30:04<6:26:19, 6.13s/it] {'loss': 0.4761, 'learning_rate': 1.5236225871352867e-05, 'epoch': 0.34} + 34%|███▍ | 1991/5772 [3:29:57<6:26:20, 6.13s/it] 35%|███▍ | 1992/5772 [3:30:03<6:33:09, 6.24s/it] 35%|███▍ | 1992/5772 [3:30:10<6:33:09, 6.24s/it] {'loss': 0.4844, 'learning_rate': 1.5231443909248956e-05, 'epoch': 0.35} + 35%|███▍ | 1992/5772 [3:30:10<6:33:09, 6.24s/it] {'loss': 0.4844, 'learning_rate': 1.5231443909248956e-05, 'epoch': 0.35} + 35%|███▍ | 1992/5772 [3:30:03<6:33:09, 6.24s/it] 35%|███▍ | 1993/5772 [3:30:09<6:36:47, 6.30s/it] 35%|███▍ | 1993/5772 [3:30:17<6:36:47, 6.30s/it] {'loss': 0.4789, 'learning_rate': 1.5226660299530874e-05, 'epoch': 0.35} + 35%|███▍ | 1993/5772 [3:30:17<6:36:47, 6.30s/it] {'loss': 0.4789, 'learning_rate': 1.5226660299530874e-05, 'epoch': 0.35} + 35%|███▍ | 1993/5772 [3:30:09<6:36:47, 6.30s/it] 35%|███▍ | 1994/5772 [3:30:15<6:28:51, 6.18s/it] 35%|███▍ | 1994/5772 [3:30:22<6:28:51, 6.18s/it] {'loss': 0.4887, 'learning_rate': 1.522187504370519e-05, 'epoch': 0.35} + 35%|███▍ | 1994/5772 [3:30:22<6:28:51, 6.18s/it] {'loss': 0.4887, 'learning_rate': 1.522187504370519e-05, 'epoch': 0.35} + 35%|███▍ | 1994/5772 [3:30:15<6:28:51, 6.18s/it] 35%|███▍ | 1995/5772 [3:30:22<6:28:26, 6.17s/it] 35%|███▍ | 1995/5772 [3:30:29<6:28:27, 6.17s/it] {'loss': 0.4777, 'learning_rate': 1.5217088143278995e-05, 'epoch': 0.35} + 35%|███▍ | 1995/5772 [3:30:29<6:28:27, 6.17s/it] {'loss': 0.4777, 'learning_rate': 1.5217088143278995e-05, 'epoch': 0.35} + 35%|███▍ | 1995/5772 [3:30:22<6:28:26, 6.17s/it] 35%|███▍ | 1996/5772 [3:30:28<6:29:21, 6.19s/it] 35%|███▍ | 1996/5772 [3:30:35<6:29:21, 6.19s/it] {'loss': 0.4814, 'learning_rate': 1.5212299599759894e-05, 'epoch': 0.35} + 35%|███▍ | 1996/5772 [3:30:35<6:29:21, 6.19s/it] {'loss': 0.4814, 'learning_rate': 1.5212299599759894e-05, 'epoch': 0.35} + 35%|███▍ | 1996/5772 [3:30:28<6:29:21, 6.19s/it] 35%|███▍ | 1997/5772 [3:30:34<6:28:03, 6.17s/it] 35%|███▍ | 1997/5772 [3:30:41<6:28:03, 6.17s/it] {'loss': 0.4742, 'learning_rate': 1.5207509414656017e-05, 'epoch': 0.35} + 35%|███▍ | 1997/5772 [3:30:41<6:28:03, 6.17s/it] {'loss': 0.4742, 'learning_rate': 1.5207509414656017e-05, 'epoch': 0.35} + 35%|███▍ | 1997/5772 [3:30:34<6:28:03, 6.17s/it] 35%|███▍ | 1998/5772 [3:30:40<6:21:50, 6.07s/it] 35%|███▍ | 1998/5772 [3:30:47<6:21:49, 6.07s/it] {'loss': 0.4831, 'learning_rate': 1.5202717589476006e-05, 'epoch': 0.35} + 35%|███▍ | 1998/5772 [3:30:47<6:21:49, 6.07s/it] {'loss': 0.4831, 'learning_rate': 1.5202717589476006e-05, 'epoch': 0.35} + 35%|███▍ | 1998/5772 [3:30:40<6:21:50, 6.07s/it] 35%|███▍ | 1999/5772 [3:30:46<6:18:05, 6.01s/it] 35%|███▍ | 1999/5772 [3:30:53<6:18:05, 6.01s/it] {'loss': 0.4824, 'learning_rate': 1.5197924125729015e-05, 'epoch': 0.35} + 35%|███▍ | 1999/5772 [3:30:53<6:18:05, 6.01s/it] {'loss': 0.4824, 'learning_rate': 1.5197924125729015e-05, 'epoch': 0.35} + 35%|███▍ | 1999/5772 [3:30:46<6:18:05, 6.01s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 7AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + +12 14AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 35%|███▍ | 2000/5772 [3:30:52<6:28:56, 6.19s/it]10 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 35%|███▍ | 2000/5772 [3:30:59<6:28:56, 6.19s/it]2 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4619, 'learning_rate': 1.519312902492472e-05, 'epoch': 0.35} + 35%|███▍ | 2000/5772 [3:30:59<6:28:56, 6.19s/it] {'loss': 0.4619, 'learning_rate': 1.519312902492472e-05, 'epoch': 0.35} + 35%|███▍ | 2000/5772 [3:30:52<6:28:56, 6.19s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 35%|███▍ | 2001/5772 [3:31:19<10:37:21, 10.14s/it] 35%|███▍ | 2001/5772 [3:31:12<10:37:21, 10.14s/it] {'loss': 0.4846, 'learning_rate': 1.5188332288573313e-05, 'epoch': 0.35} + 35%|███▍ | 2001/5772 [3:31:19<10:37:21, 10.14s/it] {'loss': 0.4846, 'learning_rate': 1.5188332288573313e-05, 'epoch': 0.35} + 35%|███▍ | 2001/5772 [3:31:12<10:37:21, 10.14s/it] 35%|███▍ | 2002/5772 [3:31:18<9:21:46, 8.94s/it] 35%|███▍ | 2002/5772 [3:31:25<9:21:46, 8.94s/it] {'loss': 0.4707, 'learning_rate': 1.51835339181855e-05, 'epoch': 0.35} + 35%|███▍ | 2002/5772 [3:31:25<9:21:46, 8.94s/it] {'loss': 0.4707, 'learning_rate': 1.51835339181855e-05, 'epoch': 0.35} + 35%|███▍ | 2002/5772 [3:31:18<9:21:46, 8.94s/it] 35%|███▍ | 2003/5772 [3:31:24<8:25:40, 8.05s/it] 35%|███▍ | 2003/5772 [3:31:31<8:25:40, 8.05s/it] {'loss': 0.4717, 'learning_rate': 1.5178733915272501e-05, 'epoch': 0.35} + 35%|███▍ | 2003/5772 [3:31:31<8:25:40, 8.05s/it] {'loss': 0.4717, 'learning_rate': 1.5178733915272501e-05, 'epoch': 0.35} + 35%|███▍ | 2003/5772 [3:31:24<8:25:40, 8.05s/it] 35%|███▍ | 2004/5772 [3:31:29<7:43:56, 7.39s/it] 35%|███▍ | 2004/5772 [3:31:37<7:43:56, 7.39s/it] {'loss': 0.473, 'learning_rate': 1.5173932281346049e-05, 'epoch': 0.35} + 35%|███▍ | 2004/5772 [3:31:37<7:43:56, 7.39s/it] {'loss': 0.473, 'learning_rate': 1.5173932281346049e-05, 'epoch': 0.35} + 35%|███▍ | 2004/5772 [3:31:29<7:43:56, 7.39s/it] 35%|███▍ | 2005/5772 [3:31:36<7:24:16, 7.08s/it] 35%|███▍ | 2005/5772 [3:31:43<7:24:16, 7.08s/it]{'loss': 0.4676, 'learning_rate': 1.5169129017918389e-05, 'epoch': 0.35} + {'loss': 0.4676, 'learning_rate': 1.5169129017918389e-05, 'epoch': 0.35} + 35%|███▍ | 2005/5772 [3:31:43<7:24:16, 7.08s/it] 35%|███▍ | 2005/5772 [3:31:36<7:24:16, 7.08s/it] 35%|███▍ | 2006/5772 [3:31:42<7:09:26, 6.84s/it] 35%|███▍ | 2006/5772 [3:31:49<7:09:26, 6.84s/it] {'loss': 0.4798, 'learning_rate': 1.5164324126502287e-05, 'epoch': 0.35} + 35%|███▍ | 2006/5772 [3:31:49<7:09:26, 6.84s/it] {'loss': 0.4798, 'learning_rate': 1.5164324126502287e-05, 'epoch': 0.35} + 35%|███▍ | 2006/5772 [3:31:42<7:09:26, 6.84s/it] 35%|███▍ | 2007/5772 [3:31:48<6:58:06, 6.66s/it] 35%|███▍ | 2007/5772 [3:31:55<6:58:07, 6.66s/it] {'loss': 0.4926, 'learning_rate': 1.5159517608611015e-05, 'epoch': 0.35} + 35%|███▍ | 2007/5772 [3:31:55<6:58:07, 6.66s/it] {'loss': 0.4926, 'learning_rate': 1.5159517608611015e-05, 'epoch': 0.35} + 35%|███▍ | 2007/5772 [3:31:48<6:58:06, 6.66s/it] 35%|███▍ | 2008/5772 [3:31:55<6:51:56, 6.57s/it] 35%|███▍ | 2008/5772 [3:32:02<6:51:56, 6.57s/it] {'loss': 0.4698, 'learning_rate': 1.515470946575836e-05, 'epoch': 0.35} + 35%|███▍ | 2008/5772 [3:32:02<6:51:56, 6.57s/it] {'loss': 0.4698, 'learning_rate': 1.515470946575836e-05, 'epoch': 0.35} + 35%|███▍ | 2008/5772 [3:31:55<6:51:56, 6.57s/it] 35%|███▍ | 2009/5772 [3:32:01<6:44:11, 6.44s/it] 35%|███▍ | 2009/5772 [3:32:08<6:44:11, 6.44s/it] {'loss': 0.4893, 'learning_rate': 1.514989969945862e-05, 'epoch': 0.35} + 35%|███▍ | 2009/5772 [3:32:08<6:44:11, 6.44s/it] {'loss': 0.4893, 'learning_rate': 1.514989969945862e-05, 'epoch': 0.35} + 35%|███▍ | 2009/5772 [3:32:01<6:44:11, 6.44s/it] 35%|███▍ | 2010/5772 [3:32:07<6:34:50, 6.30s/it] 35%|███▍ | 2010/5772 [3:32:14<6:34:50, 6.30s/it] {'loss': 0.4794, 'learning_rate': 1.5145088311226599e-05, 'epoch': 0.35} + 35%|███▍ | 2010/5772 [3:32:14<6:34:50, 6.30s/it] {'loss': 0.4794, 'learning_rate': 1.5145088311226599e-05, 'epoch': 0.35} + 35%|███▍ | 2010/5772 [3:32:07<6:34:50, 6.30s/it] 35%|███▍ | 2011/5772 [3:32:13<6:28:19, 6.20s/it] 35%|███▍ | 2011/5772 [3:32:20<6:28:19, 6.19s/it] {'loss': 0.4828, 'learning_rate': 1.5140275302577627e-05, 'epoch': 0.35} + 35%|███▍ | 2011/5772 [3:32:20<6:28:19, 6.19s/it] {'loss': 0.4828, 'learning_rate': 1.5140275302577627e-05, 'epoch': 0.35} + 35%|███▍ | 2011/5772 [3:32:13<6:28:19, 6.20s/it] 35%|███▍ | 2012/5772 [3:32:19<6:37:00, 6.34s/it] 35%|███▍ | 2012/5772 [3:32:27<6:37:00, 6.34s/it] {'loss': 0.4627, 'learning_rate': 1.5135460675027525e-05, 'epoch': 0.35} + 35%|███▍ | 2012/5772 [3:32:27<6:37:00, 6.34s/it] {'loss': 0.4627, 'learning_rate': 1.5135460675027525e-05, 'epoch': 0.35} + 35%|███▍ | 2012/5772 [3:32:19<6:37:00, 6.34s/it] 35%|███▍ | 2013/5772 [3:32:25<6:29:44, 6.22s/it] 35%|███▍ | 2013/5772 [3:32:32<6:29:44, 6.22s/it] {'loss': 0.4708, 'learning_rate': 1.5130644430092638e-05, 'epoch': 0.35} + 35%|███▍ | 2013/5772 [3:32:32<6:29:44, 6.22s/it] {'loss': 0.4708, 'learning_rate': 1.5130644430092638e-05, 'epoch': 0.35} + 35%|███▍ | 2013/5772 [3:32:25<6:29:44, 6.22s/it] 35%|███▍ | 2014/5772 [3:32:31<6:24:37, 6.14s/it] 35%|███▍ | 2014/5772 [3:32:38<6:24:37, 6.14s/it] {'loss': 0.475, 'learning_rate': 1.5125826569289812e-05, 'epoch': 0.35} + 35%|███▍ | 2014/5772 [3:32:38<6:24:37, 6.14s/it] {'loss': 0.475, 'learning_rate': 1.5125826569289812e-05, 'epoch': 0.35} + 35%|███▍ | 2014/5772 [3:32:31<6:24:37, 6.14s/it] 35%|███▍ | 2015/5772 [3:32:45<6:27:10, 6.18s/it] 35%|███▍ | 2015/5772 [3:32:38<6:27:11, 6.18s/it] {'loss': 0.4832, 'learning_rate': 1.512100709413641e-05, 'epoch': 0.35} + 35%|███▍ | 2015/5772 [3:32:45<6:27:10, 6.18s/it] {'loss': 0.4832, 'learning_rate': 1.512100709413641e-05, 'epoch': 0.35} + 35%|███▍ | 2015/5772 [3:32:38<6:27:11, 6.18s/it] 35%|███▍ | 2016/5772 [3:32:44<6:25:05, 6.15s/it] 35%|███▍ | 2016/5772 [3:32:51<6:25:05, 6.15s/it] {'loss': 0.4714, 'learning_rate': 1.5116186006150294e-05, 'epoch': 0.35} + 35%|███▍ | 2016/5772 [3:32:51<6:25:05, 6.15s/it] {'loss': 0.4714, 'learning_rate': 1.5116186006150294e-05, 'epoch': 0.35} + 35%|███▍ | 2016/5772 [3:32:44<6:25:05, 6.15s/it] 35%|███▍ | 2017/5772 [3:32:57<6:24:13, 6.14s/it] 35%|███▍ | 2017/5772 [3:32:50<6:24:14, 6.14s/it] {'loss': 0.4816, 'learning_rate': 1.5111363306849845e-05, 'epoch': 0.35} + 35%|███▍ | 2017/5772 [3:32:57<6:24:13, 6.14s/it] {'loss': 0.4816, 'learning_rate': 1.5111363306849845e-05, 'epoch': 0.35} + 35%|███▍ | 2017/5772 [3:32:50<6:24:14, 6.14s/it] 35%|███▍ | 2018/5772 [3:33:03<6:21:33, 6.10s/it] 35%|███▍ | 2018/5772 [3:32:56<6:21:33, 6.10s/it] {'loss': 0.4778, 'learning_rate': 1.5106538997753938e-05, 'epoch': 0.35} + 35%|███▍ | 2018/5772 [3:33:03<6:21:33, 6.10s/it] {'loss': 0.4778, 'learning_rate': 1.5106538997753938e-05, 'epoch': 0.35} + 35%|███▍ | 2018/5772 [3:32:56<6:21:33, 6.10s/it] 35%|███▍ | 2019/5772 [3:33:02<6:21:22, 6.10s/it] 35%|███▍ | 2019/5772 [3:33:09<6:21:22, 6.10s/it] {'loss': 0.467, 'learning_rate': 1.510171308038197e-05, 'epoch': 0.35} + 35%|███▍ | 2019/5772 [3:33:09<6:21:22, 6.10s/it] {'loss': 0.467, 'learning_rate': 1.510171308038197e-05, 'epoch': 0.35} + 35%|███▍ | 2019/5772 [3:33:02<6:21:22, 6.10s/it] 35%|███▍ | 2020/5772 [3:33:08<6:17:25, 6.04s/it] 35%|███▍ | 2020/5772 [3:33:15<6:17:25, 6.04s/it] {'loss': 0.4623, 'learning_rate': 1.5096885556253833e-05, 'epoch': 0.35} + 35%|███▍ | 2020/5772 [3:33:15<6:17:25, 6.04s/it] {'loss': 0.4623, 'learning_rate': 1.5096885556253833e-05, 'epoch': 0.35} + 35%|███▍ | 2020/5772 [3:33:08<6:17:25, 6.04s/it] 35%|███▌ | 2021/5772 [3:33:14<6:18:11, 6.05s/it] 35%|███▌ | 2021/5772 [3:33:21<6:18:11, 6.05s/it] {'loss': 0.4858, 'learning_rate': 1.5092056426889923e-05, 'epoch': 0.35} + 35%|███▌ | 2021/5772 [3:33:21<6:18:11, 6.05s/it] {'loss': 0.4858, 'learning_rate': 1.5092056426889923e-05, 'epoch': 0.35} + 35%|███▌ | 2021/5772 [3:33:14<6:18:11, 6.05s/it] 35%|███▌ | 2022/5772 [3:33:21<6:30:13, 6.24s/it] 35%|███▌ | 2022/5772 [3:33:28<6:30:13, 6.24s/it] {'loss': 0.4515, 'learning_rate': 1.5087225693811159e-05, 'epoch': 0.35} + 35%|███▌ | 2022/5772 [3:33:28<6:30:13, 6.24s/it] {'loss': 0.4515, 'learning_rate': 1.5087225693811159e-05, 'epoch': 0.35} + 35%|███▌ | 2022/5772 [3:33:21<6:30:13, 6.24s/it] 35%|███▌ | 2023/5772 [3:33:27<6:25:00, 6.16s/it] 35%|███▌ | 2023/5772 [3:33:34<6:25:00, 6.16s/it] {'loss': 0.4879, 'learning_rate': 1.5082393358538946e-05, 'epoch': 0.35} + 35%|███▌ | 2023/5772 [3:33:34<6:25:00, 6.16s/it] {'loss': 0.4879, 'learning_rate': 1.5082393358538946e-05, 'epoch': 0.35} + 35%|███▌ | 2023/5772 [3:33:27<6:25:00, 6.16s/it] 35%|███▌ | 2024/5772 [3:33:32<6:19:57, 6.08s/it] 35%|███▌ | 2024/5772 [3:33:40<6:19:57, 6.08s/it] {'loss': 0.4633, 'learning_rate': 1.5077559422595202e-05, 'epoch': 0.35} + 35%|███▌ | 2024/5772 [3:33:40<6:19:57, 6.08s/it] {'loss': 0.4633, 'learning_rate': 1.5077559422595202e-05, 'epoch': 0.35} + 35%|███▌ | 2024/5772 [3:33:32<6:19:57, 6.08s/it] 35%|███▌ | 2025/5772 [3:33:39<6:22:04, 6.12s/it] 35%|███▌ | 2025/5772 [3:33:46<6:22:04, 6.12s/it] {'loss': 0.4859, 'learning_rate': 1.5072723887502352e-05, 'epoch': 0.35} + 35%|███▌ | 2025/5772 [3:33:46<6:22:04, 6.12s/it] {'loss': 0.4859, 'learning_rate': 1.5072723887502352e-05, 'epoch': 0.35} + 35%|███▌ | 2025/5772 [3:33:39<6:22:04, 6.12s/it] 35%|███▌ | 2026/5772 [3:33:45<6:27:06, 6.20s/it] 35%|███▌ | 2026/5772 [3:33:52<6:27:06, 6.20s/it] {'loss': 0.4942, 'learning_rate': 1.5067886754783316e-05, 'epoch': 0.35} + 35%|███▌ | 2026/5772 [3:33:52<6:27:06, 6.20s/it] {'loss': 0.4942, 'learning_rate': 1.5067886754783316e-05, 'epoch': 0.35} + 35%|███▌ | 2026/5772 [3:33:45<6:27:06, 6.20s/it] 35%|███▌ | 2027/5772 [3:33:51<6:24:29, 6.16s/it] 35%|███▌ | 2027/5772 [3:33:58<6:24:29, 6.16s/it] {'loss': 0.4801, 'learning_rate': 1.5063048025961523e-05, 'epoch': 0.35} + 35%|███▌ | 2027/5772 [3:33:58<6:24:29, 6.16s/it] {'loss': 0.4801, 'learning_rate': 1.5063048025961523e-05, 'epoch': 0.35} + 35%|███▌ | 2027/5772 [3:33:51<6:24:29, 6.16s/it] 35%|███▌ | 2028/5772 [3:33:57<6:23:47, 6.15s/it] 35%|███▌ | 2028/5772 [3:34:04<6:23:47, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.5058207702560907e-05, 'epoch': 0.35} + 35%|███▌ | 2028/5772 [3:34:04<6:23:47, 6.15s/it] {'loss': 0.4776, 'learning_rate': 1.5058207702560907e-05, 'epoch': 0.35} + 35%|███▌ | 2028/5772 [3:33:57<6:23:47, 6.15s/it] 35%|███▌ | 2029/5772 [3:34:10<6:21:46, 6.12s/it] 35%|███▌ | 2029/5772 [3:34:03<6:21:46, 6.12s/it] {'loss': 0.4837, 'learning_rate': 1.5053365786105898e-05, 'epoch': 0.35} + 35%|███▌ | 2029/5772 [3:34:10<6:21:46, 6.12s/it] {'loss': 0.4837, 'learning_rate': 1.5053365786105898e-05, 'epoch': 0.35} + 35%|███▌ | 2029/5772 [3:34:03<6:21:46, 6.12s/it] 35%|███▌ | 2030/5772 [3:34:17<6:23:08, 6.14s/it] 35%|███▌ | 2030/5772 [3:34:10<6:23:08, 6.14s/it] {'loss': 0.4721, 'learning_rate': 1.5048522278121432e-05, 'epoch': 0.35} + 35%|███▌ | 2030/5772 [3:34:17<6:23:08, 6.14s/it] {'loss': 0.4721, 'learning_rate': 1.5048522278121432e-05, 'epoch': 0.35} + 35%|███▌ | 2030/5772 [3:34:10<6:23:08, 6.14s/it] 35%|███▌ | 2031/5772 [3:34:16<6:21:24, 6.12s/it] 35%|███▌ | 2031/5772 [3:34:23<6:21:24, 6.12s/it] {'loss': 0.4912, 'learning_rate': 1.5043677180132946e-05, 'epoch': 0.35} + 35%|███▌ | 2031/5772 [3:34:23<6:21:24, 6.12s/it] {'loss': 0.4912, 'learning_rate': 1.5043677180132946e-05, 'epoch': 0.35} + 35%|███▌ | 2031/5772 [3:34:16<6:21:24, 6.12s/it] 35%|███▌ | 2032/5772 [3:34:22<6:25:57, 6.19s/it] 35%|███▌ | 2032/5772 [3:34:29<6:25:59, 6.19s/it] {'loss': 0.4744, 'learning_rate': 1.5038830493666371e-05, 'epoch': 0.35} + 35%|███▌ | 2032/5772 [3:34:29<6:25:59, 6.19s/it] {'loss': 0.4744, 'learning_rate': 1.5038830493666371e-05, 'epoch': 0.35} + 35%|███▌ | 2032/5772 [3:34:22<6:25:57, 6.19s/it] 35%|███▌ | 2033/5772 [3:34:28<6:29:05, 6.24s/it] 35%|███▌ | 2033/5772 [3:34:35<6:29:05, 6.24s/it] {'loss': 0.481, 'learning_rate': 1.5033982220248151e-05, 'epoch': 0.35} + 35%|███▌ | 2033/5772 [3:34:35<6:29:05, 6.24s/it] {'loss': 0.481, 'learning_rate': 1.5033982220248151e-05, 'epoch': 0.35} + 35%|███▌ | 2033/5772 [3:34:28<6:29:05, 6.24s/it] 35%|███▌ | 2034/5772 [3:34:34<6:23:21, 6.15s/it] 35%|███▌ | 2034/5772 [3:34:41<6:23:21, 6.15s/it] {'loss': 0.4795, 'learning_rate': 1.5029132361405219e-05, 'epoch': 0.35} + 35%|███▌ | 2034/5772 [3:34:41<6:23:21, 6.15s/it] {'loss': 0.4795, 'learning_rate': 1.5029132361405219e-05, 'epoch': 0.35} + 35%|███▌ | 2034/5772 [3:34:34<6:23:21, 6.15s/it] 35%|███▌ | 2035/5772 [3:34:40<6:19:21, 6.09s/it] 35%|███▌ | 2035/5772 [3:34:47<6:19:21, 6.09s/it] {'loss': 0.4778, 'learning_rate': 1.502428091866501e-05, 'epoch': 0.35} + 35%|███▌ | 2035/5772 [3:34:47<6:19:21, 6.09s/it] {'loss': 0.4778, 'learning_rate': 1.502428091866501e-05, 'epoch': 0.35} + 35%|███▌ | 2035/5772 [3:34:40<6:19:21, 6.09s/it] 35%|███▌ | 2036/5772 [3:34:46<6:15:29, 6.03s/it] 35%|███▌ | 2036/5772 [3:34:53<6:15:29, 6.03s/it] {'loss': 0.474, 'learning_rate': 1.5019427893555462e-05, 'epoch': 0.35} + 35%|███▌ | 2036/5772 [3:34:53<6:15:29, 6.03s/it] {'loss': 0.474, 'learning_rate': 1.5019427893555462e-05, 'epoch': 0.35} + 35%|███▌ | 2036/5772 [3:34:46<6:15:29, 6.03s/it] 35%|███▌ | 2037/5772 [3:34:52<6:10:29, 5.95s/it] 35%|███▌ | 2037/5772 [3:34:59<6:10:29, 5.95s/it] {'loss': 0.4732, 'learning_rate': 1.501457328760501e-05, 'epoch': 0.35} + 35%|███▌ | 2037/5772 [3:34:59<6:10:29, 5.95s/it] {'loss': 0.4732, 'learning_rate': 1.501457328760501e-05, 'epoch': 0.35} + 35%|███▌ | 2037/5772 [3:34:52<6:10:29, 5.95s/it] 35%|███▌ | 2038/5772 [3:34:58<6:11:36, 5.97s/it] 35%|███▌ | 2038/5772 [3:35:05<6:11:36, 5.97s/it] {'loss': 0.4752, 'learning_rate': 1.5009717102342577e-05, 'epoch': 0.35} + 35%|███▌ | 2038/5772 [3:35:05<6:11:36, 5.97s/it] {'loss': 0.4752, 'learning_rate': 1.5009717102342577e-05, 'epoch': 0.35} + 35%|███▌ | 2038/5772 [3:34:58<6:11:36, 5.97s/it] 35%|███▌ | 2039/5772 [3:35:04<6:14:30, 6.02s/it] 35%|███▌ | 2039/5772 [3:35:11<6:14:30, 6.02s/it] {'loss': 0.4855, 'learning_rate': 1.5004859339297601e-05, 'epoch': 0.35} + 35%|███▌ | 2039/5772 [3:35:11<6:14:30, 6.02s/it] {'loss': 0.4855, 'learning_rate': 1.5004859339297601e-05, 'epoch': 0.35} + 35%|███▌ | 2039/5772 [3:35:04<6:14:30, 6.02s/it] 35%|███▌ | 2040/5772 [3:35:10<6:18:17, 6.08s/it] 35%|███▌ | 2040/5772 [3:35:17<6:18:18, 6.08s/it] {'loss': 0.4876, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.35} + 35%|███▌ | 2040/5772 [3:35:17<6:18:18, 6.08s/it] {'loss': 0.4876, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.35} + 35%|███▌ | 2040/5772 [3:35:10<6:18:17, 6.08s/it] 35%|███▌ | 2041/5772 [3:35:16<6:16:27, 6.05s/it] 35%|███▌ | 2041/5772 [3:35:23<6:16:27, 6.05s/it] {'loss': 0.4735, 'learning_rate': 1.4995139085980203e-05, 'epoch': 0.35} + 35%|███▌ | 2041/5772 [3:35:23<6:16:27, 6.05s/it] {'loss': 0.4735, 'learning_rate': 1.4995139085980203e-05, 'epoch': 0.35} + 35%|███▌ | 2041/5772 [3:35:16<6:16:27, 6.05s/it] 35%|███▌ | 2042/5772 [3:35:23<6:21:41, 6.14s/it] 35%|███▌ | 2042/5772 [3:35:30<6:21:40, 6.14s/it] {'loss': 0.4773, 'learning_rate': 1.499027659876912e-05, 'epoch': 0.35} + 35%|███▌ | 2042/5772 [3:35:30<6:21:40, 6.14s/it] {'loss': 0.4773, 'learning_rate': 1.499027659876912e-05, 'epoch': 0.35} + 35%|███▌ | 2042/5772 [3:35:23<6:21:41, 6.14s/it] 35%|███▌ | 2043/5772 [3:35:29<6:23:32, 6.17s/it] 35%|███▌ | 2043/5772 [3:35:36<6:23:32, 6.17s/it] {'loss': 0.4759, 'learning_rate': 1.498541253989817e-05, 'epoch': 0.35} + 35%|███▌ | 2043/5772 [3:35:36<6:23:32, 6.17s/it] {'loss': 0.4759, 'learning_rate': 1.498541253989817e-05, 'epoch': 0.35} + 35%|███▌ | 2043/5772 [3:35:29<6:23:32, 6.17s/it] 35%|███▌ | 2044/5772 [3:35:35<6:27:01, 6.23s/it] 35%|███▌ | 2044/5772 [3:35:42<6:27:00, 6.23s/it] {'loss': 0.4661, 'learning_rate': 1.4980546910899261e-05, 'epoch': 0.35} + 35%|███▌ | 2044/5772 [3:35:42<6:27:00, 6.23s/it] {'loss': 0.4661, 'learning_rate': 1.4980546910899261e-05, 'epoch': 0.35} + 35%|███▌ | 2044/5772 [3:35:35<6:27:01, 6.23s/it] 35%|███▌ | 2045/5772 [3:35:42<6:31:11, 6.30s/it] 35%|███▌ | 2045/5772 [3:35:49<6:31:11, 6.30s/it] {'loss': 0.4859, 'learning_rate': 1.4975679713304794e-05, 'epoch': 0.35} + 35%|███▌ | 2045/5772 [3:35:49<6:31:11, 6.30s/it] {'loss': 0.4859, 'learning_rate': 1.4975679713304794e-05, 'epoch': 0.35} + 35%|███▌ | 2045/5772 [3:35:42<6:31:11, 6.30s/it] 35%|███▌ | 2046/5772 [3:35:48<6:31:04, 6.30s/it] 35%|███▌ | 2046/5772 [3:35:55<6:31:04, 6.30s/it] {'loss': 0.4868, 'learning_rate': 1.4970810948647664e-05, 'epoch': 0.35} + 35%|███▌ | 2046/5772 [3:35:55<6:31:04, 6.30s/it] {'loss': 0.4868, 'learning_rate': 1.4970810948647664e-05, 'epoch': 0.35} + 35%|███▌ | 2046/5772 [3:35:48<6:31:04, 6.30s/it] 35%|███▌ | 2047/5772 [3:35:54<6:24:51, 6.20s/it] 35%|███▌ | 2047/5772 [3:36:01<6:24:51, 6.20s/it] {'loss': 0.4837, 'learning_rate': 1.4965940618461265e-05, 'epoch': 0.35} + 35%|███▌ | 2047/5772 [3:36:01<6:24:51, 6.20s/it] {'loss': 0.4837, 'learning_rate': 1.4965940618461265e-05, 'epoch': 0.35} + 35%|███▌ | 2047/5772 [3:35:54<6:24:51, 6.20s/it] 35%|███▌ | 2048/5772 [3:36:00<6:18:48, 6.10s/it] 35%|███▌ | 2048/5772 [3:36:07<6:18:48, 6.10s/it] {'loss': 0.4805, 'learning_rate': 1.496106872427948e-05, 'epoch': 0.35} + 35%|███▌ | 2048/5772 [3:36:07<6:18:48, 6.10s/it] {'loss': 0.4805, 'learning_rate': 1.496106872427948e-05, 'epoch': 0.35} + 35%|███▌ | 2048/5772 [3:36:00<6:18:48, 6.10s/it] 35%|███▌ | 2049/5772 [3:36:06<6:17:37, 6.09s/it] 35%|███▌ | 2049/5772 [3:36:13<6:17:36, 6.09s/it] {'loss': 0.4798, 'learning_rate': 1.4956195267636679e-05, 'epoch': 0.35} + 35%|███▌ | 2049/5772 [3:36:13<6:17:36, 6.09s/it] {'loss': 0.4798, 'learning_rate': 1.4956195267636679e-05, 'epoch': 0.35} + 35%|███▌ | 2049/5772 [3:36:06<6:17:37, 6.09s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5312 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...11 AutoResumeHook: Checking whether to suspend... + + +15 AutoResumeHook: Checking whether to suspend... +07 14 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 36%|███▌ | 2050/5772 [3:36:12<6:19:24, 6.12s/it] +2 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 36%|███▌ | 2050/5772 [3:36:19<6:19:24, 6.12s/it]9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4697, 'learning_rate': 1.495132025006774e-05, 'epoch': 0.36} + 36%|███▌ | 2050/5772 [3:36:19<6:19:24, 6.12s/it] {'loss': 0.4697, 'learning_rate': 1.495132025006774e-05, 'epoch': 0.36} + 36%|███▌ | 2050/5772 [3:36:12<6:19:24, 6.12s/it] 36%|███▌ | 2051/5772 [3:36:18<6:21:13, 6.15s/it] 36%|███▌ | 2051/5772 [3:36:25<6:21:13, 6.15s/it] {'loss': 0.4797, 'learning_rate': 1.4946443673108015e-05, 'epoch': 0.36} + 36%|███▌ | 2051/5772 [3:36:25<6:21:13, 6.15s/it] {'loss': 0.4797, 'learning_rate': 1.4946443673108015e-05, 'epoch': 0.36} + 36%|███▌ | 2051/5772 [3:36:18<6:21:13, 6.15s/it] 36%|███▌ | 2052/5772 [3:36:24<6:17:22, 6.09s/it] 36%|███▌ | 2052/5772 [3:36:31<6:17:22, 6.09s/it] {'loss': 0.4784, 'learning_rate': 1.4941565538293358e-05, 'epoch': 0.36} + 36%|███▌ | 2052/5772 [3:36:31<6:17:22, 6.09s/it] {'loss': 0.4784, 'learning_rate': 1.4941565538293358e-05, 'epoch': 0.36} + 36%|███▌ | 2052/5772 [3:36:24<6:17:22, 6.09s/it] 36%|███▌ | 2053/5772 [3:36:30<6:17:08, 6.08s/it] 36%|███▌ | 2053/5772 [3:36:37<6:17:09, 6.08s/it] {'loss': 0.4813, 'learning_rate': 1.4936685847160113e-05, 'epoch': 0.36} + 36%|███▌ | 2053/5772 [3:36:37<6:17:09, 6.08s/it] {'loss': 0.4813, 'learning_rate': 1.4936685847160113e-05, 'epoch': 0.36} + 36%|███▌ | 2053/5772 [3:36:30<6:17:08, 6.08s/it] 36%|███▌ | 2054/5772 [3:36:36<6:18:34, 6.11s/it] 36%|███▌ | 2054/5772 [3:36:43<6:18:34, 6.11s/it] {'loss': 0.4836, 'learning_rate': 1.4931804601245105e-05, 'epoch': 0.36} + 36%|███▌ | 2054/5772 [3:36:43<6:18:34, 6.11s/it] {'loss': 0.4836, 'learning_rate': 1.4931804601245105e-05, 'epoch': 0.36} + 36%|███▌ | 2054/5772 [3:36:36<6:18:34, 6.11s/it] 36%|███▌ | 2055/5772 [3:36:43<6:21:30, 6.16s/it] 36%|███▌ | 2055/5772 [3:36:50<6:21:29, 6.16s/it] {'loss': 0.4865, 'learning_rate': 1.4926921802085662e-05, 'epoch': 0.36} + 36%|███▌ | 2055/5772 [3:36:50<6:21:29, 6.16s/it] {'loss': 0.4865, 'learning_rate': 1.4926921802085662e-05, 'epoch': 0.36} + 36%|███▌ | 2055/5772 [3:36:43<6:21:30, 6.16s/it] 36%|███▌ | 2056/5772 [3:36:49<6:20:49, 6.15s/it] 36%|███▌ | 2056/5772 [3:36:56<6:20:49, 6.15s/it] {'loss': 0.462, 'learning_rate': 1.4922037451219586e-05, 'epoch': 0.36} + 36%|███▌ | 2056/5772 [3:36:56<6:20:49, 6.15s/it] {'loss': 0.462, 'learning_rate': 1.4922037451219586e-05, 'epoch': 0.36} + 36%|███▌ | 2056/5772 [3:36:49<6:20:49, 6.15s/it] 36%|███▌ | 2057/5772 [3:36:55<6:28:05, 6.27s/it] 36%|███▌ | 2057/5772 [3:37:02<6:28:05, 6.27s/it] {'loss': 0.4853, 'learning_rate': 1.4917151550185187e-05, 'epoch': 0.36} + 36%|███▌ | 2057/5772 [3:37:02<6:28:05, 6.27s/it] {'loss': 0.4853, 'learning_rate': 1.4917151550185187e-05, 'epoch': 0.36} + 36%|███▌ | 2057/5772 [3:36:55<6:28:05, 6.27s/it] 36%|███▌ | 2058/5772 [3:37:01<6:23:17, 6.19s/it] 36%|███▌ | 2058/5772 [3:37:08<6:23:17, 6.19s/it] {'loss': 0.4677, 'learning_rate': 1.4912264100521243e-05, 'epoch': 0.36} + 36%|███▌ | 2058/5772 [3:37:08<6:23:17, 6.19s/it] {'loss': 0.4677, 'learning_rate': 1.4912264100521243e-05, 'epoch': 0.36} + 36%|███▌ | 2058/5772 [3:37:01<6:23:17, 6.19s/it] 36%|███▌ | 2059/5772 [3:37:07<6:21:54, 6.17s/it] 36%|███▌ | 2059/5772 [3:37:15<6:21:54, 6.17s/it] {'loss': 0.4763, 'learning_rate': 1.4907375103767037e-05, 'epoch': 0.36} + 36%|███▌ | 2059/5772 [3:37:15<6:21:54, 6.17s/it] {'loss': 0.4763, 'learning_rate': 1.4907375103767037e-05, 'epoch': 0.36} + 36%|███▌ | 2059/5772 [3:37:07<6:21:54, 6.17s/it] 36%|███▌ | 2060/5772 [3:37:13<6:17:42, 6.11s/it] 36%|███▌ | 2060/5772 [3:37:20<6:17:43, 6.11s/it] {'loss': 0.4692, 'learning_rate': 1.4902484561462323e-05, 'epoch': 0.36} + 36%|███▌ | 2060/5772 [3:37:20<6:17:43, 6.11s/it] {'loss': 0.4692, 'learning_rate': 1.4902484561462323e-05, 'epoch': 0.36} + 36%|███▌ | 2060/5772 [3:37:13<6:17:42, 6.11s/it] 36%|███▌ | 2061/5772 [3:37:20<6:17:30, 6.10s/it] 36%|███▌ | 2061/5772 [3:37:27<6:17:30, 6.10s/it] {'loss': 0.4795, 'learning_rate': 1.4897592475147356e-05, 'epoch': 0.36} + 36%|███▌ | 2061/5772 [3:37:27<6:17:30, 6.10s/it] {'loss': 0.4795, 'learning_rate': 1.4897592475147356e-05, 'epoch': 0.36} + 36%|███▌ | 2061/5772 [3:37:20<6:17:30, 6.10s/it] 36%|███▌ | 2062/5772 [3:37:26<6:25:07, 6.23s/it] 36%|███▌ | 2062/5772 [3:37:33<6:25:07, 6.23s/it] {'loss': 0.4737, 'learning_rate': 1.489269884636287e-05, 'epoch': 0.36} + 36%|███▌ | 2062/5772 [3:37:33<6:25:07, 6.23s/it] {'loss': 0.4737, 'learning_rate': 1.489269884636287e-05, 'epoch': 0.36} + 36%|███▌ | 2062/5772 [3:37:26<6:25:07, 6.23s/it] 36%|███▌ | 2063/5772 [3:37:32<6:17:04, 6.10s/it] 36%|███▌ | 2063/5772 [3:37:39<6:17:03, 6.10s/it] {'loss': 0.4805, 'learning_rate': 1.4887803676650083e-05, 'epoch': 0.36} + 36%|███▌ | 2063/5772 [3:37:39<6:17:03, 6.10s/it] {'loss': 0.4805, 'learning_rate': 1.4887803676650083e-05, 'epoch': 0.36} + 36%|███▌ | 2063/5772 [3:37:32<6:17:04, 6.10s/it] 36%|███▌ | 2064/5772 [3:37:45<6:17:16, 6.10s/it] 36%|███▌ | 2064/5772 [3:37:38<6:17:16, 6.10s/it] {'loss': 0.4819, 'learning_rate': 1.4882906967550708e-05, 'epoch': 0.36} + 36%|███▌ | 2064/5772 [3:37:45<6:17:16, 6.10s/it] {'loss': 0.4819, 'learning_rate': 1.4882906967550708e-05, 'epoch': 0.36} + 36%|███▌ | 2064/5772 [3:37:38<6:17:16, 6.10s/it] 36%|███▌ | 2065/5772 [3:37:44<6:18:33, 6.13s/it] 36%|███▌ | 2065/5772 [3:37:51<6:18:33, 6.13s/it] {'loss': 0.479, 'learning_rate': 1.487800872060693e-05, 'epoch': 0.36} + 36%|███▌ | 2065/5772 [3:37:51<6:18:33, 6.13s/it] {'loss': 0.479, 'learning_rate': 1.487800872060693e-05, 'epoch': 0.36} + 36%|███▌ | 2065/5772 [3:37:44<6:18:33, 6.13s/it] 36%|███▌ | 2066/5772 [3:37:57<6:14:27, 6.06s/it] 36%|███▌ | 2066/5772 [3:37:50<6:14:28, 6.06s/it] {'loss': 0.463, 'learning_rate': 1.4873108937361429e-05, 'epoch': 0.36} + 36%|███▌ | 2066/5772 [3:37:57<6:14:27, 6.06s/it] {'loss': 0.463, 'learning_rate': 1.4873108937361429e-05, 'epoch': 0.36} + 36%|███▌ | 2066/5772 [3:37:50<6:14:28, 6.06s/it] 36%|███▌ | 2067/5772 [3:37:56<6:17:34, 6.11s/it] 36%|███▌ | 2067/5772 [3:38:03<6:17:34, 6.11s/it] {'loss': 0.4814, 'learning_rate': 1.4868207619357362e-05, 'epoch': 0.36} + 36%|███▌ | 2067/5772 [3:38:03<6:17:34, 6.11s/it] {'loss': 0.4814, 'learning_rate': 1.4868207619357362e-05, 'epoch': 0.36} + 36%|███▌ | 2067/5772 [3:37:56<6:17:34, 6.11s/it] 36%|███▌ | 2068/5772 [3:38:02<6:17:04, 6.11s/it] 36%|███▌ | 2068/5772 [3:38:09<6:17:04, 6.11s/it] {'loss': 0.4654, 'learning_rate': 1.4863304768138374e-05, 'epoch': 0.36} + 36%|███▌ | 2068/5772 [3:38:09<6:17:04, 6.11s/it] {'loss': 0.4654, 'learning_rate': 1.4863304768138374e-05, 'epoch': 0.36} + 36%|███▌ | 2068/5772 [3:38:02<6:17:04, 6.11s/it] 36%|███▌ | 2069/5772 [3:38:08<6:14:45, 6.07s/it] 36%|███▌ | 2069/5772 [3:38:15<6:14:46, 6.07s/it] {'loss': 0.4851, 'learning_rate': 1.4858400385248585e-05, 'epoch': 0.36} + 36%|███▌ | 2069/5772 [3:38:15<6:14:46, 6.07s/it] {'loss': 0.4851, 'learning_rate': 1.4858400385248585e-05, 'epoch': 0.36} + 36%|███▌ | 2069/5772 [3:38:08<6:14:45, 6.07s/it] 36%|███▌ | 2070/5772 [3:38:15<6:20:59, 6.17s/it] 36%|███▌ | 2070/5772 [3:38:22<6:20:59, 6.17s/it] {'loss': 0.479, 'learning_rate': 1.4853494472232613e-05, 'epoch': 0.36} + 36%|███▌ | 2070/5772 [3:38:22<6:20:59, 6.17s/it] {'loss': 0.479, 'learning_rate': 1.4853494472232613e-05, 'epoch': 0.36} + 36%|███▌ | 2070/5772 [3:38:15<6:20:59, 6.17s/it] 36%|███▌ | 2071/5772 [3:38:21<6:20:56, 6.18s/it] 36%|███▌ | 2071/5772 [3:38:28<6:20:56, 6.18s/it] {'loss': 0.4832, 'learning_rate': 1.4848587030635537e-05, 'epoch': 0.36} + 36%|███▌ | 2071/5772 [3:38:28<6:20:56, 6.18s/it] {'loss': 0.4832, 'learning_rate': 1.4848587030635537e-05, 'epoch': 0.36} + 36%|███▌ | 2071/5772 [3:38:21<6:20:56, 6.18s/it] 36%|███▌ | 2072/5772 [3:38:27<6:20:54, 6.18s/it] 36%|███▌ | 2072/5772 [3:38:34<6:20:54, 6.18s/it] {'loss': 0.4865, 'learning_rate': 1.484367806200294e-05, 'epoch': 0.36} + 36%|███▌ | 2072/5772 [3:38:27<6:20:54, 6.18s/it]{'loss': 0.4865, 'learning_rate': 1.484367806200294e-05, 'epoch': 0.36} + 36%|███▌ | 2072/5772 [3:38:34<6:20:54, 6.18s/it] 36%|███▌ | 2073/5772 [3:38:33<6:20:43, 6.18s/it] 36%|███▌ | 2073/5772 [3:38:40<6:20:43, 6.18s/it] {'loss': 0.475, 'learning_rate': 1.4838767567880865e-05, 'epoch': 0.36} + 36%|███▌ | 2073/5772 [3:38:40<6:20:43, 6.18s/it] {'loss': 0.475, 'learning_rate': 1.4838767567880865e-05, 'epoch': 0.36} + 36%|███▌ | 2073/5772 [3:38:33<6:20:43, 6.18s/it] 36%|███▌ | 2074/5772 [3:38:40<6:21:02, 6.18s/it] 36%|███▌ | 2074/5772 [3:38:47<6:21:02, 6.18s/it] {'loss': 0.4734, 'learning_rate': 1.4833855549815848e-05, 'epoch': 0.36} + 36%|███▌ | 2074/5772 [3:38:47<6:21:02, 6.18s/it] {'loss': 0.4734, 'learning_rate': 1.4833855549815848e-05, 'epoch': 0.36} + 36%|███▌ | 2074/5772 [3:38:40<6:21:02, 6.18s/it] 36%|███▌ | 2075/5772 [3:38:46<6:19:46, 6.16s/it] 36%|███▌ | 2075/5772 [3:38:53<6:19:46, 6.16s/it] {'loss': 0.4751, 'learning_rate': 1.4828942009354902e-05, 'epoch': 0.36} + 36%|███▌ | 2075/5772 [3:38:53<6:19:46, 6.16s/it] {'loss': 0.4751, 'learning_rate': 1.4828942009354902e-05, 'epoch': 0.36} + 36%|███▌ | 2075/5772 [3:38:46<6:19:46, 6.16s/it] 36%|███▌ | 2076/5772 [3:38:52<6:24:28, 6.24s/it] 36%|███▌ | 2076/5772 [3:38:59<6:24:28, 6.24s/it] {'loss': 0.4698, 'learning_rate': 1.482402694804552e-05, 'epoch': 0.36} + 36%|███▌ | 2076/5772 [3:38:59<6:24:28, 6.24s/it] {'loss': 0.4698, 'learning_rate': 1.482402694804552e-05, 'epoch': 0.36} + 36%|███▌ | 2076/5772 [3:38:52<6:24:28, 6.24s/it] 36%|███▌ | 2077/5772 [3:38:58<6:20:59, 6.19s/it] 36%|███▌ | 2077/5772 [3:39:05<6:20:59, 6.19s/it] {'loss': 0.4898, 'learning_rate': 1.4819110367435672e-05, 'epoch': 0.36} + 36%|███▌ | 2077/5772 [3:39:05<6:20:59, 6.19s/it] {'loss': 0.4898, 'learning_rate': 1.4819110367435672e-05, 'epoch': 0.36} + 36%|███▌ | 2077/5772 [3:38:58<6:20:59, 6.19s/it] 36%|███▌ | 2078/5772 [3:39:11<6:20:21, 6.18s/it] 36%|███▌ | 2078/5772 [3:39:04<6:20:21, 6.18s/it] {'loss': 0.4844, 'learning_rate': 1.4814192269073808e-05, 'epoch': 0.36} + 36%|███▌ | 2078/5772 [3:39:11<6:20:21, 6.18s/it] {'loss': 0.4844, 'learning_rate': 1.4814192269073808e-05, 'epoch': 0.36} + 36%|███▌ | 2078/5772 [3:39:04<6:20:21, 6.18s/it] 36%|███▌ | 2079/5772 [3:39:10<6:19:27, 6.17s/it] 36%|███▌ | 2079/5772 [3:39:17<6:19:28, 6.17s/it]{'loss': 0.4787, 'learning_rate': 1.4809272654508858e-05, 'epoch': 0.36} + {'loss': 0.4787, 'learning_rate': 1.4809272654508858e-05, 'epoch': 0.36} + 36%|███▌ | 2079/5772 [3:39:17<6:19:28, 6.17s/it] 36%|███▌ | 2079/5772 [3:39:10<6:19:27, 6.17s/it] 36%|███▌ | 2080/5772 [3:39:17<6:18:03, 6.14s/it] 36%|███▌ | 2080/5772 [3:39:24<6:18:03, 6.14s/it] {'loss': 0.4723, 'learning_rate': 1.4804351525290221e-05, 'epoch': 0.36} + 36%|███▌ | 2080/5772 [3:39:24<6:18:03, 6.14s/it] {'loss': 0.4723, 'learning_rate': 1.4804351525290221e-05, 'epoch': 0.36} + 36%|███▌ | 2080/5772 [3:39:17<6:18:03, 6.14s/it] 36%|███▌ | 2081/5772 [3:39:22<6:14:43, 6.09s/it] 36%|███▌ | 2081/5772 [3:39:30<6:14:43, 6.09s/it] {'loss': 0.489, 'learning_rate': 1.4799428882967787e-05, 'epoch': 0.36} + 36%|███▌ | 2081/5772 [3:39:30<6:14:43, 6.09s/it] {'loss': 0.489, 'learning_rate': 1.4799428882967787e-05, 'epoch': 0.36} + 36%|███▌ | 2081/5772 [3:39:22<6:14:43, 6.09s/it] 36%|███▌ | 2082/5772 [3:39:29<6:22:58, 6.23s/it] 36%|███▌ | 2082/5772 [3:39:36<6:22:58, 6.23s/it] {'loss': 0.473, 'learning_rate': 1.479450472909191e-05, 'epoch': 0.36} + 36%|███▌ | 2082/5772 [3:39:36<6:22:58, 6.23s/it] {'loss': 0.473, 'learning_rate': 1.479450472909191e-05, 'epoch': 0.36} + 36%|███▌ | 2082/5772 [3:39:29<6:22:58, 6.23s/it] 36%|███▌ | 2083/5772 [3:39:42<6:14:38, 6.09s/it] 36%|███▌ | 2083/5772 [3:39:35<6:14:38, 6.09s/it] {'loss': 0.4792, 'learning_rate': 1.4789579065213425e-05, 'epoch': 0.36} + 36%|███▌ | 2083/5772 [3:39:42<6:14:38, 6.09s/it] {'loss': 0.4792, 'learning_rate': 1.4789579065213425e-05, 'epoch': 0.36} + 36%|███▌ | 2083/5772 [3:39:35<6:14:38, 6.09s/it] 36%|███▌ | 2084/5772 [3:39:48<6:18:13, 6.15s/it] 36%|███▌ | 2084/5772 [3:39:41<6:18:13, 6.15s/it] {'loss': 0.488, 'learning_rate': 1.4784651892883644e-05, 'epoch': 0.36} + 36%|███▌ | 2084/5772 [3:39:48<6:18:13, 6.15s/it] {'loss': 0.488, 'learning_rate': 1.4784651892883644e-05, 'epoch': 0.36} + 36%|███▌ | 2084/5772 [3:39:41<6:18:13, 6.15s/it] 36%|███▌ | 2085/5772 [3:39:54<6:10:13, 6.02s/it] 36%|███▌ | 2085/5772 [3:39:47<6:10:13, 6.02s/it] {'loss': 0.4646, 'learning_rate': 1.4779723213654354e-05, 'epoch': 0.36} + 36%|███▌ | 2085/5772 [3:39:54<6:10:13, 6.02s/it] {'loss': 0.4646, 'learning_rate': 1.4779723213654354e-05, 'epoch': 0.36} + 36%|███▌ | 2085/5772 [3:39:47<6:10:13, 6.02s/it] 36%|███▌ | 2086/5772 [3:39:53<6:11:34, 6.05s/it] 36%|███▌ | 2086/5772 [3:40:00<6:11:34, 6.05s/it] {'loss': 0.4817, 'learning_rate': 1.477479302907781e-05, 'epoch': 0.36} + 36%|███▌ | 2086/5772 [3:40:00<6:11:34, 6.05s/it] {'loss': 0.4817, 'learning_rate': 1.477479302907781e-05, 'epoch': 0.36} + 36%|███▌ | 2086/5772 [3:39:53<6:11:34, 6.05s/it] 36%|███▌ | 2087/5772 [3:40:06<6:19:02, 6.17s/it] 36%|███▌ | 2087/5772 [3:39:59<6:19:03, 6.17s/it] {'loss': 0.4806, 'learning_rate': 1.476986134070675e-05, 'epoch': 0.36} + 36%|███▌ | 2087/5772 [3:40:06<6:19:02, 6.17s/it] {'loss': 0.4806, 'learning_rate': 1.476986134070675e-05, 'epoch': 0.36} + 36%|███▌ | 2087/5772 [3:39:59<6:19:03, 6.17s/it] 36%|███▌ | 2088/5772 [3:40:13<6:18:55, 6.17s/it] 36%|███▌ | 2088/5772 [3:40:06<6:18:55, 6.17s/it] {'loss': 0.4804, 'learning_rate': 1.4764928150094384e-05, 'epoch': 0.36} + 36%|███▌ | 2088/5772 [3:40:13<6:18:55, 6.17s/it] {'loss': 0.4804, 'learning_rate': 1.4764928150094384e-05, 'epoch': 0.36} + 36%|███▌ | 2088/5772 [3:40:06<6:18:55, 6.17s/it] 36%|███▌ | 2089/5772 [3:40:19<6:21:34, 6.22s/it] 36%|███▌ | 2089/5772 [3:40:12<6:21:35, 6.22s/it] {'loss': 0.4801, 'learning_rate': 1.4759993458794388e-05, 'epoch': 0.36} + 36%|███▌ | 2089/5772 [3:40:19<6:21:34, 6.22s/it] {'loss': 0.4801, 'learning_rate': 1.4759993458794388e-05, 'epoch': 0.36} + 36%|███▌ | 2089/5772 [3:40:12<6:21:35, 6.22s/it] 36%|███▌ | 2090/5772 [3:40:25<6:20:12, 6.20s/it] 36%|███▌ | 2090/5772 [3:40:18<6:20:12, 6.20s/it] {'loss': 0.4725, 'learning_rate': 1.475505726836092e-05, 'epoch': 0.36} + 36%|███▌ | 2090/5772 [3:40:25<6:20:12, 6.20s/it] {'loss': 0.4725, 'learning_rate': 1.475505726836092e-05, 'epoch': 0.36} + 36%|███▌ | 2090/5772 [3:40:18<6:20:12, 6.20s/it] 36%|███▌ | 2091/5772 [3:40:31<6:19:30, 6.19s/it] 36%|███▌ | 2091/5772 [3:40:24<6:19:30, 6.19s/it] {'loss': 0.4812, 'learning_rate': 1.4750119580348601e-05, 'epoch': 0.36} + 36%|███▌ | 2091/5772 [3:40:31<6:19:30, 6.19s/it] {'loss': 0.4812, 'learning_rate': 1.4750119580348601e-05, 'epoch': 0.36} + 36%|███▌ | 2091/5772 [3:40:24<6:19:30, 6.19s/it] 36%|███▌ | 2092/5772 [3:40:30<6:20:58, 6.21s/it] 36%|███▌ | 2092/5772 [3:40:38<6:20:59, 6.21s/it] {'loss': 0.4754, 'learning_rate': 1.4745180396312533e-05, 'epoch': 0.36} + 36%|███▌ | 2092/5772 [3:40:38<6:20:59, 6.21s/it] {'loss': 0.4754, 'learning_rate': 1.4745180396312533e-05, 'epoch': 0.36} + 36%|███▌ | 2092/5772 [3:40:30<6:20:58, 6.21s/it] 36%|███▋ | 2093/5772 [3:40:44<6:20:16, 6.20s/it] 36%|███▋ | 2093/5772 [3:40:37<6:20:16, 6.20s/it] {'loss': 0.4702, 'learning_rate': 1.474023971780828e-05, 'epoch': 0.36} + 36%|███▋ | 2093/5772 [3:40:44<6:20:16, 6.20s/it] {'loss': 0.4702, 'learning_rate': 1.474023971780828e-05, 'epoch': 0.36} + 36%|███▋ | 2093/5772 [3:40:37<6:20:16, 6.20s/it] 36%|███▋ | 2094/5772 [3:40:50<6:19:32, 6.19s/it] 36%|███▋ | 2094/5772 [3:40:43<6:19:32, 6.19s/it] {'loss': 0.4676, 'learning_rate': 1.4735297546391887e-05, 'epoch': 0.36} + 36%|███▋ | 2094/5772 [3:40:50<6:19:32, 6.19s/it] {'loss': 0.4676, 'learning_rate': 1.4735297546391887e-05, 'epoch': 0.36} + 36%|███▋ | 2094/5772 [3:40:43<6:19:32, 6.19s/it] 36%|███▋ | 2095/5772 [3:40:49<6:24:01, 6.27s/it] 36%|███▋ | 2095/5772 [3:40:56<6:24:01, 6.27s/it] {'loss': 0.4733, 'learning_rate': 1.4730353883619856e-05, 'epoch': 0.36} + 36%|███▋ | 2095/5772 [3:40:56<6:24:01, 6.27s/it] {'loss': 0.4733, 'learning_rate': 1.4730353883619856e-05, 'epoch': 0.36} + 36%|███▋ | 2095/5772 [3:40:49<6:24:01, 6.27s/it] 36%|███▋ | 2096/5772 [3:40:55<6:19:38, 6.20s/it] 36%|███▋ | 2096/5772 [3:41:02<6:19:39, 6.20s/it] {'loss': 0.4712, 'learning_rate': 1.4725408731049173e-05, 'epoch': 0.36} + 36%|███▋ | 2096/5772 [3:41:02<6:19:39, 6.20s/it] {'loss': 0.4712, 'learning_rate': 1.4725408731049173e-05, 'epoch': 0.36} + 36%|███▋ | 2096/5772 [3:40:55<6:19:38, 6.20s/it] 36%|███▋ | 2097/5772 [3:41:09<6:22:49, 6.25s/it] 36%|███▋ | 2097/5772 [3:41:02<6:22:49, 6.25s/it] {'loss': 0.4859, 'learning_rate': 1.4720462090237285e-05, 'epoch': 0.36} + 36%|███▋ | 2097/5772 [3:41:09<6:22:49, 6.25s/it] {'loss': 0.4859, 'learning_rate': 1.4720462090237285e-05, 'epoch': 0.36} + 36%|███▋ | 2097/5772 [3:41:02<6:22:49, 6.25s/it] 36%|███▋ | 2098/5772 [3:41:15<6:24:14, 6.28s/it] 36%|███▋ | 2098/5772 [3:41:08<6:24:14, 6.28s/it] {'loss': 0.4719, 'learning_rate': 1.4715513962742102e-05, 'epoch': 0.36} + 36%|███▋ | 2098/5772 [3:41:15<6:24:14, 6.28s/it] {'loss': 0.4719, 'learning_rate': 1.4715513962742102e-05, 'epoch': 0.36} + 36%|███▋ | 2098/5772 [3:41:08<6:24:14, 6.28s/it] 36%|███▋ | 2099/5772 [3:41:21<6:22:42, 6.25s/it] 36%|███▋ | 2099/5772 [3:41:14<6:22:42, 6.25s/it] {'loss': 0.4891, 'learning_rate': 1.471056435012202e-05, 'epoch': 0.36} + 36%|███▋ | 2099/5772 [3:41:21<6:22:42, 6.25s/it] {'loss': 0.4891, 'learning_rate': 1.471056435012202e-05, 'epoch': 0.36} + 36%|███▋ | 2099/5772 [3:41:14<6:22:42, 6.25s/it]13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 36%|███▋ | 2100/5772 [3:41:28<6:24:32, 6.28s/it]10 9AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 36%|███▋ | 2100/5772 [3:41:21<6:24:32, 6.28s/it]2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.475, 'learning_rate': 1.4705613253935886e-05, 'epoch': 0.36} + 36%|███▋ | 2100/5772 [3:41:28<6:24:32, 6.28s/it] {'loss': 0.475, 'learning_rate': 1.4705613253935886e-05, 'epoch': 0.36} + 36%|███▋ | 2100/5772 [3:41:21<6:24:32, 6.28s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 36%|███▋ | 2101/5772 [3:41:39<10:14:46, 10.05s/it] 36%|███▋ | 2101/5772 [3:41:46<10:14:47, 10.05s/it] {'loss': 0.4895, 'learning_rate': 1.4700660675743021e-05, 'epoch': 0.36} + 36%|███▋ | 2101/5772 [3:41:46<10:14:47, 10.05s/it] {'loss': 0.4895, 'learning_rate': 1.4700660675743021e-05, 'epoch': 0.36} + 36%|███▋ | 2101/5772 [3:41:39<10:14:46, 10.05s/it] 36%|███▋ | 2102/5772 [3:41:46<9:08:35, 8.97s/it] 36%|███▋ | 2102/5772 [3:41:53<9:08:35, 8.97s/it] {'loss': 0.4623, 'learning_rate': 1.469570661710321e-05, 'epoch': 0.36} + 36%|███▋ | 2102/5772 [3:41:53<9:08:35, 8.97s/it] {'loss': 0.4623, 'learning_rate': 1.469570661710321e-05, 'epoch': 0.36} + 36%|███▋ | 2102/5772 [3:41:46<9:08:35, 8.97s/it] 36%|███▋ | 2103/5772 [3:41:52<8:21:29, 8.20s/it] 36%|███▋ | 2103/5772 [3:41:59<8:21:29, 8.20s/it] {'loss': 0.4755, 'learning_rate': 1.469075107957671e-05, 'epoch': 0.36} + 36%|███▋ | 2103/5772 [3:41:59<8:21:29, 8.20s/it] {'loss': 0.4755, 'learning_rate': 1.469075107957671e-05, 'epoch': 0.36} + 36%|███▋ | 2103/5772 [3:41:52<8:21:29, 8.20s/it] 36%|███▋ | 2104/5772 [3:41:58<7:45:12, 7.61s/it] 36%|███▋ | 2104/5772 [3:42:06<7:45:12, 7.61s/it] {'loss': 0.4781, 'learning_rate': 1.4685794064724235e-05, 'epoch': 0.36} + 36%|███▋ | 2104/5772 [3:42:06<7:45:12, 7.61s/it] {'loss': 0.4781, 'learning_rate': 1.4685794064724235e-05, 'epoch': 0.36} + 36%|███▋ | 2104/5772 [3:41:58<7:45:12, 7.61s/it] 36%|███▋ | 2105/5772 [3:42:05<7:23:35, 7.26s/it] 36%|███▋ | 2105/5772 [3:42:12<7:23:36, 7.26s/it] {'loss': 0.4876, 'learning_rate': 1.4680835574106977e-05, 'epoch': 0.36} + 36%|███▋ | 2105/5772 [3:42:12<7:23:36, 7.26s/it] {'loss': 0.4876, 'learning_rate': 1.4680835574106977e-05, 'epoch': 0.36} + 36%|███▋ | 2105/5772 [3:42:05<7:23:35, 7.26s/it] 36%|███▋ | 2106/5772 [3:42:11<7:03:15, 6.93s/it] 36%|███▋ | 2106/5772 [3:42:18<7:03:14, 6.93s/it] {'loss': 0.491, 'learning_rate': 1.4675875609286579e-05, 'epoch': 0.36} + 36%|███▋ | 2106/5772 [3:42:18<7:03:14, 6.93s/it] {'loss': 0.491, 'learning_rate': 1.4675875609286579e-05, 'epoch': 0.36} + 36%|███▋ | 2106/5772 [3:42:11<7:03:15, 6.93s/it] 37%|███▋ | 2107/5772 [3:42:17<6:51:38, 6.74s/it] 37%|███▋ | 2107/5772 [3:42:24<6:51:39, 6.74s/it] {'loss': 0.488, 'learning_rate': 1.4670914171825157e-05, 'epoch': 0.37} + 37%|███▋ | 2107/5772 [3:42:24<6:51:39, 6.74s/it] {'loss': 0.488, 'learning_rate': 1.4670914171825157e-05, 'epoch': 0.37} + 37%|███▋ | 2107/5772 [3:42:17<6:51:38, 6.74s/it] 37%|███▋ | 2108/5772 [3:42:30<6:35:53, 6.48s/it] 37%|███▋ | 2108/5772 [3:42:23<6:35:53, 6.48s/it] {'loss': 0.477, 'learning_rate': 1.4665951263285283e-05, 'epoch': 0.37} + 37%|███▋ | 2108/5772 [3:42:30<6:35:53, 6.48s/it]{'loss': 0.477, 'learning_rate': 1.4665951263285283e-05, 'epoch': 0.37} + 37%|███▋ | 2108/5772 [3:42:23<6:35:53, 6.48s/it] 37%|███▋ | 2109/5772 [3:42:29<6:25:57, 6.32s/it] 37%|███▋ | 2109/5772 [3:42:36<6:25:57, 6.32s/it] {'loss': 0.4932, 'learning_rate': 1.4660986885230002e-05, 'epoch': 0.37} + 37%|███▋ | 2109/5772 [3:42:36<6:25:57, 6.32s/it] {'loss': 0.4932, 'learning_rate': 1.4660986885230002e-05, 'epoch': 0.37} + 37%|███▋ | 2109/5772 [3:42:29<6:25:57, 6.32s/it] 37%|███▋ | 2110/5772 [3:42:35<6:20:03, 6.23s/it] 37%|███▋ | 2110/5772 [3:42:42<6:20:03, 6.23s/it] {'loss': 0.472, 'learning_rate': 1.465602103922282e-05, 'epoch': 0.37} + 37%|███▋ | 2110/5772 [3:42:42<6:20:03, 6.23s/it] {'loss': 0.472, 'learning_rate': 1.465602103922282e-05, 'epoch': 0.37} + 37%|███▋ | 2110/5772 [3:42:35<6:20:03, 6.23s/it] 37%|███▋ | 2111/5772 [3:42:41<6:20:21, 6.23s/it] 37%|███▋ | 2111/5772 [3:42:48<6:20:21, 6.23s/it] {'loss': 0.4847, 'learning_rate': 1.4651053726827695e-05, 'epoch': 0.37} + 37%|███▋ | 2111/5772 [3:42:48<6:20:21, 6.23s/it] {'loss': 0.4847, 'learning_rate': 1.4651053726827695e-05, 'epoch': 0.37} + 37%|███▋ | 2111/5772 [3:42:41<6:20:21, 6.23s/it] 37%|███▋ | 2112/5772 [3:42:48<6:17:59, 6.20s/it] 37%|███▋ | 2112/5772 [3:42:55<6:17:58, 6.20s/it] {'loss': 0.4797, 'learning_rate': 1.4646084949609062e-05, 'epoch': 0.37} + 37%|███▋ | 2112/5772 [3:42:55<6:17:58, 6.20s/it] {'loss': 0.4797, 'learning_rate': 1.4646084949609062e-05, 'epoch': 0.37} + 37%|███▋ | 2112/5772 [3:42:48<6:17:59, 6.20s/it] 37%|███▋ | 2113/5772 [3:43:01<6:13:38, 6.13s/it] 37%|███▋ | 2113/5772 [3:42:54<6:13:39, 6.13s/it] {'loss': 0.4709, 'learning_rate': 1.4641114709131805e-05, 'epoch': 0.37} + 37%|███▋ | 2113/5772 [3:43:01<6:13:38, 6.13s/it] {'loss': 0.4709, 'learning_rate': 1.4641114709131805e-05, 'epoch': 0.37} + 37%|███▋ | 2113/5772 [3:42:54<6:13:39, 6.13s/it] 37%|███▋ | 2114/5772 [3:43:07<6:19:12, 6.22s/it] 37%|███▋ | 2114/5772 [3:43:00<6:19:13, 6.22s/it] {'loss': 0.4815, 'learning_rate': 1.4636143006961279e-05, 'epoch': 0.37} + 37%|███▋ | 2114/5772 [3:43:07<6:19:12, 6.22s/it] {'loss': 0.4815, 'learning_rate': 1.4636143006961279e-05, 'epoch': 0.37} + 37%|███▋ | 2114/5772 [3:43:00<6:19:13, 6.22s/it] 37%|███▋ | 2115/5772 [3:43:06<6:19:56, 6.23s/it] 37%|███▋ | 2115/5772 [3:43:13<6:19:56, 6.23s/it] {'loss': 0.4732, 'learning_rate': 1.4631169844663284e-05, 'epoch': 0.37} + 37%|███▋ | 2115/5772 [3:43:13<6:19:56, 6.23s/it] {'loss': 0.4732, 'learning_rate': 1.4631169844663284e-05, 'epoch': 0.37} + 37%|███▋ | 2115/5772 [3:43:06<6:19:56, 6.23s/it] 37%|███▋ | 2116/5772 [3:43:13<6:24:05, 6.30s/it] 37%|███▋ | 2116/5772 [3:43:20<6:24:05, 6.30s/it] {'loss': 0.4884, 'learning_rate': 1.4626195223804101e-05, 'epoch': 0.37} + 37%|███▋ | 2116/5772 [3:43:20<6:24:05, 6.30s/it] {'loss': 0.4884, 'learning_rate': 1.4626195223804101e-05, 'epoch': 0.37} + 37%|███▋ | 2116/5772 [3:43:13<6:24:05, 6.30s/it] 37%|███▋ | 2117/5772 [3:43:19<6:17:39, 6.20s/it] 37%|███▋ | 2117/5772 [3:43:26<6:17:39, 6.20s/it] {'loss': 0.4591, 'learning_rate': 1.4621219145950452e-05, 'epoch': 0.37} + 37%|███▋ | 2117/5772 [3:43:26<6:17:39, 6.20s/it] {'loss': 0.4591, 'learning_rate': 1.4621219145950452e-05, 'epoch': 0.37} + 37%|███▋ | 2117/5772 [3:43:19<6:17:39, 6.20s/it] 37%|███▋ | 2118/5772 [3:43:32<6:14:09, 6.14s/it] 37%|███▋ | 2118/5772 [3:43:25<6:14:09, 6.14s/it] {'loss': 0.4718, 'learning_rate': 1.4616241612669523e-05, 'epoch': 0.37} + 37%|███▋ | 2118/5772 [3:43:32<6:14:09, 6.14s/it] {'loss': 0.4718, 'learning_rate': 1.4616241612669523e-05, 'epoch': 0.37} + 37%|███▋ | 2118/5772 [3:43:25<6:14:09, 6.14s/it] 37%|███▋ | 2119/5772 [3:43:31<6:12:38, 6.12s/it] 37%|███▋ | 2119/5772 [3:43:38<6:12:38, 6.12s/it] {'loss': 0.4565, 'learning_rate': 1.461126262552897e-05, 'epoch': 0.37} + 37%|███▋ | 2119/5772 [3:43:38<6:12:38, 6.12s/it] {'loss': 0.4565, 'learning_rate': 1.461126262552897e-05, 'epoch': 0.37} + 37%|███▋ | 2119/5772 [3:43:31<6:12:38, 6.12s/it] 37%|███▋ | 2120/5772 [3:43:44<6:07:01, 6.03s/it] 37%|███▋ | 2120/5772 [3:43:37<6:07:01, 6.03s/it] {'loss': 0.4789, 'learning_rate': 1.460628218609689e-05, 'epoch': 0.37} + 37%|███▋ | 2120/5772 [3:43:44<6:07:01, 6.03s/it] {'loss': 0.4789, 'learning_rate': 1.460628218609689e-05, 'epoch': 0.37} + 37%|███▋ | 2120/5772 [3:43:37<6:07:01, 6.03s/it] 37%|███▋ | 2121/5772 [3:43:50<6:13:43, 6.14s/it] 37%|███▋ | 2121/5772 [3:43:43<6:13:44, 6.14s/it] {'loss': 0.476, 'learning_rate': 1.4601300295941847e-05, 'epoch': 0.37} + 37%|███▋ | 2121/5772 [3:43:50<6:13:43, 6.14s/it] {'loss': 0.476, 'learning_rate': 1.4601300295941847e-05, 'epoch': 0.37} + 37%|███▋ | 2121/5772 [3:43:43<6:13:44, 6.14s/it] 37%|███▋ | 2122/5772 [3:43:49<6:17:55, 6.21s/it] 37%|███▋ | 2122/5772 [3:43:56<6:17:55, 6.21s/it] {'loss': 0.4833, 'learning_rate': 1.4596316956632856e-05, 'epoch': 0.37} + 37%|███▋ | 2122/5772 [3:43:56<6:17:55, 6.21s/it] {'loss': 0.4833, 'learning_rate': 1.4596316956632856e-05, 'epoch': 0.37} + 37%|███▋ | 2122/5772 [3:43:49<6:17:55, 6.21s/it] 37%|███▋ | 2123/5772 [3:44:02<6:11:56, 6.12s/it] 37%|███▋ | 2123/5772 [3:43:55<6:11:57, 6.12s/it] {'loss': 0.4711, 'learning_rate': 1.45913321697394e-05, 'epoch': 0.37} + 37%|███▋ | 2123/5772 [3:44:02<6:11:56, 6.12s/it] {'loss': 0.4711, 'learning_rate': 1.45913321697394e-05, 'epoch': 0.37} + 37%|███▋ | 2123/5772 [3:43:55<6:11:57, 6.12s/it] 37%|███▋ | 2124/5772 [3:44:08<6:08:29, 6.06s/it] 37%|███▋ | 2124/5772 [3:44:01<6:08:29, 6.06s/it] {'loss': 0.4863, 'learning_rate': 1.4586345936831404e-05, 'epoch': 0.37} + 37%|███▋ | 2124/5772 [3:44:08<6:08:29, 6.06s/it] {'loss': 0.4863, 'learning_rate': 1.4586345936831404e-05, 'epoch': 0.37} + 37%|███▋ | 2124/5772 [3:44:01<6:08:29, 6.06s/it] 37%|███▋ | 2125/5772 [3:44:14<6:06:14, 6.03s/it] 37%|███▋ | 2125/5772 [3:44:07<6:06:15, 6.03s/it] {'loss': 0.4724, 'learning_rate': 1.4581358259479252e-05, 'epoch': 0.37} + 37%|███▋ | 2125/5772 [3:44:14<6:06:14, 6.03s/it] {'loss': 0.4724, 'learning_rate': 1.4581358259479252e-05, 'epoch': 0.37} + 37%|███▋ | 2125/5772 [3:44:07<6:06:15, 6.03s/it] 37%|███▋ | 2126/5772 [3:44:13<6:05:34, 6.02s/it] 37%|███▋ | 2126/5772 [3:44:20<6:05:34, 6.02s/it] {'loss': 0.4782, 'learning_rate': 1.457636913925379e-05, 'epoch': 0.37} + 37%|███▋ | 2126/5772 [3:44:20<6:05:34, 6.02s/it] {'loss': 0.4782, 'learning_rate': 1.457636913925379e-05, 'epoch': 0.37} + 37%|███▋ | 2126/5772 [3:44:13<6:05:34, 6.02s/it] 37%|███▋ | 2127/5772 [3:44:26<6:08:53, 6.07s/it] 37%|███▋ | 2127/5772 [3:44:19<6:08:53, 6.07s/it] {'loss': 0.4747, 'learning_rate': 1.4571378577726317e-05, 'epoch': 0.37} + 37%|███▋ | 2127/5772 [3:44:26<6:08:53, 6.07s/it] {'loss': 0.4747, 'learning_rate': 1.4571378577726317e-05, 'epoch': 0.37} + 37%|███▋ | 2127/5772 [3:44:19<6:08:53, 6.07s/it] 37%|███▋ | 2128/5772 [3:44:26<6:13:06, 6.14s/it] 37%|███▋ | 2128/5772 [3:44:33<6:13:06, 6.14s/it]{'loss': 0.4751, 'learning_rate': 1.4566386576468572e-05, 'epoch': 0.37} {'loss': 0.4751, 'learning_rate': 1.4566386576468572e-05, 'epoch': 0.37} + + 37%|███▋ | 2128/5772 [3:44:33<6:13:06, 6.14s/it] 37%|███▋ | 2128/5772 [3:44:26<6:13:06, 6.14s/it] 37%|███▋ | 2129/5772 [3:44:39<6:14:32, 6.17s/it] 37%|███▋ | 2129/5772 [3:44:32<6:14:32, 6.17s/it] {'loss': 0.4757, 'learning_rate': 1.4561393137052767e-05, 'epoch': 0.37} + 37%|███▋ | 2129/5772 [3:44:39<6:14:32, 6.17s/it] {'loss': 0.4757, 'learning_rate': 1.4561393137052767e-05, 'epoch': 0.37} + 37%|███▋ | 2129/5772 [3:44:32<6:14:32, 6.17s/it] 37%|███▋ | 2130/5772 [3:44:38<6:15:33, 6.19s/it] 37%|███▋ | 2130/5772 [3:44:45<6:15:33, 6.19s/it] {'loss': 0.4747, 'learning_rate': 1.4556398261051553e-05, 'epoch': 0.37} + 37%|███▋ | 2130/5772 [3:44:45<6:15:33, 6.19s/it] {'loss': 0.4747, 'learning_rate': 1.4556398261051553e-05, 'epoch': 0.37} + 37%|███▋ | 2130/5772 [3:44:38<6:15:33, 6.19s/it] 37%|███▋ | 2131/5772 [3:44:44<6:13:02, 6.15s/it] 37%|███▋ | 2131/5772 [3:44:51<6:13:02, 6.15s/it] {'loss': 0.4643, 'learning_rate': 1.455140195003804e-05, 'epoch': 0.37} + 37%|███▋ | 2131/5772 [3:44:51<6:13:02, 6.15s/it] {'loss': 0.4643, 'learning_rate': 1.455140195003804e-05, 'epoch': 0.37} + 37%|███▋ | 2131/5772 [3:44:44<6:13:02, 6.15s/it] 37%|███▋ | 2132/5772 [3:44:57<6:05:30, 6.02s/it] 37%|███▋ | 2132/5772 [3:44:50<6:05:30, 6.02s/it] {'loss': 0.4826, 'learning_rate': 1.4546404205585789e-05, 'epoch': 0.37} + 37%|███▋ | 2132/5772 [3:44:57<6:05:30, 6.02s/it] {'loss': 0.4826, 'learning_rate': 1.4546404205585789e-05, 'epoch': 0.37} + 37%|███▋ | 2132/5772 [3:44:50<6:05:30, 6.02s/it] 37%|███▋ | 2133/5772 [3:44:56<6:04:49, 6.02s/it] 37%|███▋ | 2133/5772 [3:45:03<6:04:49, 6.02s/it] {'loss': 0.475, 'learning_rate': 1.4541405029268813e-05, 'epoch': 0.37} + 37%|███▋ | 2133/5772 [3:45:03<6:04:49, 6.02s/it] {'loss': 0.475, 'learning_rate': 1.4541405029268813e-05, 'epoch': 0.37} + 37%|███▋ | 2133/5772 [3:44:56<6:04:49, 6.02s/it] 37%|███▋ | 2134/5772 [3:45:09<6:13:37, 6.16s/it] 37%|███▋ | 2134/5772 [3:45:02<6:13:38, 6.16s/it] {'loss': 0.4854, 'learning_rate': 1.4536404422661575e-05, 'epoch': 0.37} + 37%|███▋ | 2134/5772 [3:45:09<6:13:37, 6.16s/it] {'loss': 0.4854, 'learning_rate': 1.4536404422661575e-05, 'epoch': 0.37} + 37%|███▋ | 2134/5772 [3:45:02<6:13:38, 6.16s/it] 37%|███▋ | 2135/5772 [3:45:16<6:12:59, 6.15s/it] 37%|███▋ | 2135/5772 [3:45:08<6:12:59, 6.15s/it] {'loss': 0.4695, 'learning_rate': 1.4531402387338982e-05, 'epoch': 0.37} + 37%|███▋ | 2135/5772 [3:45:16<6:12:59, 6.15s/it] {'loss': 0.4695, 'learning_rate': 1.4531402387338982e-05, 'epoch': 0.37} + 37%|███▋ | 2135/5772 [3:45:08<6:12:59, 6.15s/it] 37%|███▋ | 2136/5772 [3:45:14<6:08:04, 6.07s/it] 37%|███▋ | 2136/5772 [3:45:21<6:08:05, 6.07s/it] {'loss': 0.4703, 'learning_rate': 1.4526398924876407e-05, 'epoch': 0.37} + 37%|███▋ | 2136/5772 [3:45:21<6:08:05, 6.07s/it] {'loss': 0.4703, 'learning_rate': 1.4526398924876407e-05, 'epoch': 0.37} + 37%|███▋ | 2136/5772 [3:45:14<6:08:04, 6.07s/it] 37%|███▋ | 2137/5772 [3:45:28<6:15:11, 6.19s/it] 37%|███▋ | 2137/5772 [3:45:21<6:15:11, 6.19s/it] {'loss': 0.4847, 'learning_rate': 1.4521394036849652e-05, 'epoch': 0.37} + 37%|███▋ | 2137/5772 [3:45:28<6:15:11, 6.19s/it] {'loss': 0.4847, 'learning_rate': 1.4521394036849652e-05, 'epoch': 0.37} + 37%|███▋ | 2137/5772 [3:45:21<6:15:11, 6.19s/it] 37%|███▋ | 2138/5772 [3:45:27<6:11:08, 6.13s/it] 37%|███▋ | 2138/5772 [3:45:34<6:11:08, 6.13s/it] {'loss': 0.4657, 'learning_rate': 1.4516387724834989e-05, 'epoch': 0.37} + 37%|███▋ | 2138/5772 [3:45:34<6:11:08, 6.13s/it] {'loss': 0.4657, 'learning_rate': 1.4516387724834989e-05, 'epoch': 0.37} + 37%|███▋ | 2138/5772 [3:45:27<6:11:08, 6.13s/it] 37%|███▋ | 2139/5772 [3:45:40<6:08:30, 6.09s/it] 37%|███▋ | 2139/5772 [3:45:33<6:08:30, 6.09s/it] {'loss': 0.4667, 'learning_rate': 1.4511379990409119e-05, 'epoch': 0.37} + 37%|███▋ | 2139/5772 [3:45:40<6:08:30, 6.09s/it] {'loss': 0.4667, 'learning_rate': 1.4511379990409119e-05, 'epoch': 0.37} + 37%|███▋ | 2139/5772 [3:45:33<6:08:30, 6.09s/it] 37%|███▋ | 2140/5772 [3:45:39<6:12:17, 6.15s/it] 37%|███▋ | 2140/5772 [3:45:46<6:12:17, 6.15s/it] {'loss': 0.471, 'learning_rate': 1.4506370835149209e-05, 'epoch': 0.37} + 37%|███▋ | 2140/5772 [3:45:46<6:12:17, 6.15s/it] {'loss': 0.471, 'learning_rate': 1.4506370835149209e-05, 'epoch': 0.37} + 37%|███▋ | 2140/5772 [3:45:39<6:12:17, 6.15s/it] 37%|███▋ | 2141/5772 [3:45:45<6:12:24, 6.15s/it] 37%|███▋ | 2141/5772 [3:45:52<6:12:24, 6.15s/it] {'loss': 0.4593, 'learning_rate': 1.4501360260632855e-05, 'epoch': 0.37} + 37%|███▋ | 2141/5772 [3:45:52<6:12:24, 6.15s/it] {'loss': 0.4593, 'learning_rate': 1.4501360260632855e-05, 'epoch': 0.37} + 37%|███▋ | 2141/5772 [3:45:45<6:12:24, 6.15s/it] 37%|███▋ | 2142/5772 [3:45:51<6:12:17, 6.15s/it] 37%|███▋ | 2142/5772 [3:45:58<6:12:17, 6.15s/it] {'loss': 0.4654, 'learning_rate': 1.4496348268438116e-05, 'epoch': 0.37} + 37%|███▋ | 2142/5772 [3:45:58<6:12:17, 6.15s/it] {'loss': 0.4654, 'learning_rate': 1.4496348268438116e-05, 'epoch': 0.37} + 37%|███▋ | 2142/5772 [3:45:51<6:12:17, 6.15s/it] 37%|███▋ | 2143/5772 [3:45:58<6:14:26, 6.19s/it] 37%|███▋ | 2143/5772 [3:46:05<6:14:26, 6.19s/it] {'loss': 0.4815, 'learning_rate': 1.4491334860143494e-05, 'epoch': 0.37} + 37%|███▋ | 2143/5772 [3:46:05<6:14:26, 6.19s/it] {'loss': 0.4815, 'learning_rate': 1.4491334860143494e-05, 'epoch': 0.37} + 37%|███▋ | 2143/5772 [3:45:58<6:14:26, 6.19s/it] 37%|███▋ | 2144/5772 [3:46:04<6:12:20, 6.16s/it] 37%|███▋ | 2144/5772 [3:46:11<6:12:20, 6.16s/it] {'loss': 0.4863, 'learning_rate': 1.4486320037327924e-05, 'epoch': 0.37} + 37%|███▋ | 2144/5772 [3:46:11<6:12:20, 6.16s/it] {'loss': 0.4863, 'learning_rate': 1.4486320037327924e-05, 'epoch': 0.37} + 37%|███▋ | 2144/5772 [3:46:04<6:12:20, 6.16s/it] 37%|███▋ | 2145/5772 [3:46:10<6:10:39, 6.13s/it] 37%|███▋ | 2145/5772 [3:46:17<6:10:39, 6.13s/it] {'loss': 0.4664, 'learning_rate': 1.4481303801570805e-05, 'epoch': 0.37} + 37%|███▋ | 2145/5772 [3:46:17<6:10:39, 6.13s/it] {'loss': 0.4664, 'learning_rate': 1.4481303801570805e-05, 'epoch': 0.37} + 37%|███▋ | 2145/5772 [3:46:10<6:10:39, 6.13s/it] 37%|███▋ | 2146/5772 [3:46:16<6:17:49, 6.25s/it] 37%|███▋ | 2146/5772 [3:46:23<6:17:49, 6.25s/it] {'loss': 0.4875, 'learning_rate': 1.4476286154451968e-05, 'epoch': 0.37} + 37%|███▋ | 2146/5772 [3:46:23<6:17:49, 6.25s/it] {'loss': 0.4875, 'learning_rate': 1.4476286154451968e-05, 'epoch': 0.37} + 37%|███▋ | 2146/5772 [3:46:16<6:17:49, 6.25s/it] 37%|███▋ | 2147/5772 [3:46:30<6:25:23, 6.38s/it] 37%|███▋ | 2147/5772 [3:46:23<6:25:23, 6.38s/it] {'loss': 0.4843, 'learning_rate': 1.4471267097551698e-05, 'epoch': 0.37} + {'loss': 0.4843, 'learning_rate': 1.4471267097551698e-05, 'epoch': 0.37} 37%|███▋ | 2147/5772 [3:46:30<6:25:23, 6.38s/it] + 37%|███▋ | 2147/5772 [3:46:23<6:25:23, 6.38s/it] 37%|███▋ | 2148/5772 [3:46:29<6:21:14, 6.31s/it] 37%|███▋ | 2148/5772 [3:46:36<6:21:14, 6.31s/it] {'loss': 0.4747, 'learning_rate': 1.4466246632450714e-05, 'epoch': 0.37} + 37%|███▋ | 2148/5772 [3:46:36<6:21:14, 6.31s/it] {'loss': 0.4747, 'learning_rate': 1.4466246632450714e-05, 'epoch': 0.37} + 37%|███▋ | 2148/5772 [3:46:29<6:21:14, 6.31s/it] 37%|███▋ | 2149/5772 [3:46:42<6:17:25, 6.25s/it] 37%|███▋ | 2149/5772 [3:46:35<6:17:26, 6.25s/it] {'loss': 0.471, 'learning_rate': 1.4461224760730189e-05, 'epoch': 0.37} + 37%|███▋ | 2149/5772 [3:46:42<6:17:25, 6.25s/it] {'loss': 0.471, 'learning_rate': 1.4461224760730189e-05, 'epoch': 0.37} + 37%|███▋ | 2149/5772 [3:46:35<6:17:26, 6.25s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 14 AutoResumeHook: Checking whether to suspend...11 + AutoResumeHook: Checking whether to suspend... +910 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... + 37%|███▋ | 2150/5772 [3:46:42<6:19:21, 6.28s/it]12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 37%|███▋ | 2150/5772 [3:46:49<6:19:21, 6.28s/it] {'loss': 0.4707, 'learning_rate': 1.4456201483971724e-05, 'epoch': 0.37} + 37%|███▋ | 2150/5772 [3:46:49<6:19:21, 6.28s/it] {'loss': 0.4707, 'learning_rate': 1.4456201483971724e-05, 'epoch': 0.37} + 37%|███▋ | 2150/5772 [3:46:42<6:19:21, 6.28s/it] 37%|███▋ | 2151/5772 [3:46:55<6:17:14, 6.25s/it] 37%|███▋ | 2151/5772 [3:46:48<6:17:15, 6.25s/it] {'loss': 0.4741, 'learning_rate': 1.4451176803757383e-05, 'epoch': 0.37} + 37%|███▋ | 2151/5772 [3:46:55<6:17:14, 6.25s/it] {'loss': 0.4741, 'learning_rate': 1.4451176803757383e-05, 'epoch': 0.37} + 37%|███▋ | 2151/5772 [3:46:48<6:17:15, 6.25s/it] 37%|███▋ | 2152/5772 [3:46:54<6:16:05, 6.23s/it] 37%|███▋ | 2152/5772 [3:47:01<6:16:04, 6.23s/it] {'loss': 0.4866, 'learning_rate': 1.4446150721669654e-05, 'epoch': 0.37} + 37%|███▋ | 2152/5772 [3:47:01<6:16:04, 6.23s/it] {'loss': 0.4866, 'learning_rate': 1.4446150721669654e-05, 'epoch': 0.37} + 37%|███▋ | 2152/5772 [3:46:54<6:16:05, 6.23s/it] 37%|███▋ | 2153/5772 [3:47:07<6:08:52, 6.12s/it] 37%|███▋ | 2153/5772 [3:47:00<6:08:53, 6.12s/it] {'loss': 0.467, 'learning_rate': 1.4441123239291477e-05, 'epoch': 0.37} + 37%|███▋ | 2153/5772 [3:47:07<6:08:52, 6.12s/it] {'loss': 0.467, 'learning_rate': 1.4441123239291477e-05, 'epoch': 0.37} + 37%|███▋ | 2153/5772 [3:47:00<6:08:53, 6.12s/it] 37%|███▋ | 2154/5772 [3:47:06<6:07:23, 6.09s/it] 37%|███▋ | 2154/5772 [3:47:13<6:07:23, 6.09s/it] {'loss': 0.4785, 'learning_rate': 1.4436094358206224e-05, 'epoch': 0.37} + 37%|███▋ | 2154/5772 [3:47:13<6:07:23, 6.09s/it] {'loss': 0.4785, 'learning_rate': 1.4436094358206224e-05, 'epoch': 0.37} + 37%|███▋ | 2154/5772 [3:47:06<6:07:23, 6.09s/it] 37%|███▋ | 2155/5772 [3:47:12<6:15:28, 6.23s/it] 37%|███▋ | 2155/5772 [3:47:20<6:15:28, 6.23s/it] {'loss': 0.4608, 'learning_rate': 1.4431064079997723e-05, 'epoch': 0.37} + 37%|███▋ | 2155/5772 [3:47:20<6:15:28, 6.23s/it] {'loss': 0.4608, 'learning_rate': 1.4431064079997723e-05, 'epoch': 0.37} + 37%|███▋ | 2155/5772 [3:47:12<6:15:28, 6.23s/it] 37%|███▋ | 2156/5772 [3:47:25<6:10:55, 6.15s/it] 37%|███▋ | 2156/5772 [3:47:18<6:10:55, 6.15s/it] {'loss': 0.4788, 'learning_rate': 1.4426032406250228e-05, 'epoch': 0.37} + 37%|███▋ | 2156/5772 [3:47:26<6:10:55, 6.15s/it] {'loss': 0.4788, 'learning_rate': 1.4426032406250228e-05, 'epoch': 0.37} + 37%|███▋ | 2156/5772 [3:47:18<6:10:55, 6.15s/it] 37%|███▋ | 2157/5772 [3:47:25<6:13:35, 6.20s/it] 37%|███▋ | 2157/5772 [3:47:32<6:13:35, 6.20s/it] {'loss': 0.4773, 'learning_rate': 1.4420999338548432e-05, 'epoch': 0.37} + 37%|███▋ | 2157/5772 [3:47:32<6:13:35, 6.20s/it] {'loss': 0.4773, 'learning_rate': 1.4420999338548432e-05, 'epoch': 0.37} + 37%|███▋ | 2157/5772 [3:47:25<6:13:35, 6.20s/it] 37%|███▋ | 2158/5772 [3:47:38<6:15:45, 6.24s/it] 37%|███▋ | 2158/5772 [3:47:31<6:15:45, 6.24s/it] {'loss': 0.4902, 'learning_rate': 1.4415964878477477e-05, 'epoch': 0.37} + 37%|███▋ | 2158/5772 [3:47:38<6:15:45, 6.24s/it] {'loss': 0.4902, 'learning_rate': 1.4415964878477477e-05, 'epoch': 0.37} + 37%|███▋ | 2158/5772 [3:47:31<6:15:45, 6.24s/it] 37%|███▋ | 2159/5772 [3:47:44<6:09:06, 6.13s/it] 37%|███▋ | 2159/5772 [3:47:37<6:09:06, 6.13s/it] {'loss': 0.4673, 'learning_rate': 1.4410929027622932e-05, 'epoch': 0.37} + 37%|███▋ | 2159/5772 [3:47:44<6:09:06, 6.13s/it] {'loss': 0.4673, 'learning_rate': 1.4410929027622932e-05, 'epoch': 0.37} + 37%|███▋ | 2159/5772 [3:47:37<6:09:06, 6.13s/it] 37%|███▋ | 2160/5772 [3:47:43<6:07:10, 6.10s/it] 37%|███▋ | 2160/5772 [3:47:50<6:07:11, 6.10s/it] {'loss': 0.4779, 'learning_rate': 1.440589178757082e-05, 'epoch': 0.37} + 37%|███▋ | 2160/5772 [3:47:50<6:07:11, 6.10s/it] {'loss': 0.4779, 'learning_rate': 1.440589178757082e-05, 'epoch': 0.37} + 37%|███▋ | 2160/5772 [3:47:43<6:07:10, 6.10s/it] 37%|███▋ | 2161/5772 [3:47:49<6:13:14, 6.20s/it] 37%|███▋ | 2161/5772 [3:47:56<6:13:14, 6.20s/it] {'loss': 0.4775, 'learning_rate': 1.4400853159907584e-05, 'epoch': 0.37} + 37%|███▋ | 2161/5772 [3:47:56<6:13:14, 6.20s/it] {'loss': 0.4775, 'learning_rate': 1.4400853159907584e-05, 'epoch': 0.37} + 37%|███▋ | 2161/5772 [3:47:49<6:13:14, 6.20s/it] 37%|███▋ | 2162/5772 [3:48:03<6:13:06, 6.20s/it] 37%|███▋ | 2162/5772 [3:47:56<6:13:06, 6.20s/it] {'loss': 0.4901, 'learning_rate': 1.4395813146220117e-05, 'epoch': 0.37} + 37%|███▋ | 2162/5772 [3:48:03<6:13:06, 6.20s/it] {'loss': 0.4901, 'learning_rate': 1.4395813146220117e-05, 'epoch': 0.37} + 37%|███▋ | 2162/5772 [3:47:56<6:13:06, 6.20s/it] 37%|███▋ | 2163/5772 [3:48:02<6:15:00, 6.23s/it] 37%|███▋ | 2163/5772 [3:48:09<6:15:00, 6.23s/it] {'loss': 0.4659, 'learning_rate': 1.4390771748095735e-05, 'epoch': 0.37} + 37%|███▋ | 2163/5772 [3:48:09<6:15:00, 6.23s/it] {'loss': 0.4659, 'learning_rate': 1.4390771748095735e-05, 'epoch': 0.37} + 37%|███▋ | 2163/5772 [3:48:02<6:15:00, 6.23s/it] 37%|███▋ | 2164/5772 [3:48:15<6:14:39, 6.23s/it] 37%|███▋ | 2164/5772 [3:48:08<6:14:39, 6.23s/it] {'loss': 0.4782, 'learning_rate': 1.4385728967122207e-05, 'epoch': 0.37} + 37%|███▋ | 2164/5772 [3:48:15<6:14:39, 6.23s/it] {'loss': 0.4782, 'learning_rate': 1.4385728967122207e-05, 'epoch': 0.37} + 37%|███▋ | 2164/5772 [3:48:08<6:14:39, 6.23s/it] 38%|███▊ | 2165/5772 [3:48:21<6:07:50, 6.12s/it] 38%|███▊ | 2165/5772 [3:48:14<6:07:50, 6.12s/it] {'loss': 0.482, 'learning_rate': 1.4380684804887726e-05, 'epoch': 0.38} + 38%|███▊ | 2165/5772 [3:48:21<6:07:50, 6.12s/it] {'loss': 0.482, 'learning_rate': 1.4380684804887726e-05, 'epoch': 0.38} + 38%|███▊ | 2165/5772 [3:48:14<6:07:50, 6.12s/it] 38%|███▊ | 2166/5772 [3:48:20<6:12:45, 6.20s/it] 38%|███▊ | 2166/5772 [3:48:27<6:12:45, 6.20s/it] {'loss': 0.4734, 'learning_rate': 1.4375639262980921e-05, 'epoch': 0.38} + 38%|███▊ | 2166/5772 [3:48:27<6:12:45, 6.20s/it] {'loss': 0.4734, 'learning_rate': 1.4375639262980921e-05, 'epoch': 0.38} + 38%|███▊ | 2166/5772 [3:48:20<6:12:45, 6.20s/it] 38%|███▊ | 2167/5772 [3:48:33<6:08:06, 6.13s/it] 38%|███▊ | 2167/5772 [3:48:26<6:08:06, 6.13s/it] {'loss': 0.4619, 'learning_rate': 1.437059234299086e-05, 'epoch': 0.38} + 38%|███▊ | 2167/5772 [3:48:33<6:08:06, 6.13s/it] {'loss': 0.4619, 'learning_rate': 1.437059234299086e-05, 'epoch': 0.38} + 38%|███▊ | 2167/5772 [3:48:26<6:08:06, 6.13s/it] 38%|███▊ | 2168/5772 [3:48:40<6:12:34, 6.20s/it] 38%|███▊ | 2168/5772 [3:48:33<6:12:34, 6.20s/it] {'loss': 0.489, 'learning_rate': 1.4365544046507039e-05, 'epoch': 0.38} + 38%|███▊ | 2168/5772 [3:48:40<6:12:34, 6.20s/it] {'loss': 0.489, 'learning_rate': 1.4365544046507039e-05, 'epoch': 0.38} + 38%|███▊ | 2168/5772 [3:48:33<6:12:34, 6.20s/it] 38%|███▊ | 2169/5772 [3:48:39<6:11:44, 6.19s/it] 38%|███▊ | 2169/5772 [3:48:46<6:11:44, 6.19s/it] {'loss': 0.4819, 'learning_rate': 1.4360494375119396e-05, 'epoch': 0.38} + 38%|███▊ | 2169/5772 [3:48:46<6:11:44, 6.19s/it] {'loss': 0.4819, 'learning_rate': 1.4360494375119396e-05, 'epoch': 0.38} + 38%|███▊ | 2169/5772 [3:48:39<6:11:44, 6.19s/it] 38%|███▊ | 2170/5772 [3:48:45<6:15:33, 6.26s/it] 38%|███▊ | 2170/5772 [3:48:52<6:15:33, 6.26s/it] {'loss': 0.4827, 'learning_rate': 1.4355443330418292e-05, 'epoch': 0.38} + 38%|███▊ | 2170/5772 [3:48:52<6:15:33, 6.26s/it] {'loss': 0.4827, 'learning_rate': 1.4355443330418292e-05, 'epoch': 0.38} + 38%|███▊ | 2170/5772 [3:48:45<6:15:33, 6.26s/it] 38%|███▊ | 2171/5772 [3:48:58<6:12:47, 6.21s/it] 38%|███▊ | 2171/5772 [3:48:51<6:12:47, 6.21s/it] {'loss': 0.4666, 'learning_rate': 1.435039091399453e-05, 'epoch': 0.38} + 38%|███▊ | 2171/5772 [3:48:58<6:12:47, 6.21s/it] {'loss': 0.4666, 'learning_rate': 1.435039091399453e-05, 'epoch': 0.38} + 38%|███▊ | 2171/5772 [3:48:51<6:12:47, 6.21s/it] 38%|███▊ | 2172/5772 [3:48:58<6:19:44, 6.33s/it] 38%|███▊ | 2172/5772 [3:49:05<6:19:44, 6.33s/it] {'loss': 0.4795, 'learning_rate': 1.4345337127439333e-05, 'epoch': 0.38} + 38%|███▊ | 2172/5772 [3:49:05<6:19:44, 6.33s/it] {'loss': 0.4795, 'learning_rate': 1.4345337127439333e-05, 'epoch': 0.38} + 38%|███▊ | 2172/5772 [3:48:58<6:19:44, 6.33s/it] 38%|███▊ | 2173/5772 [3:49:11<6:14:01, 6.24s/it] 38%|███▊ | 2173/5772 [3:49:04<6:14:01, 6.24s/it] {'loss': 0.4665, 'learning_rate': 1.4340281972344374e-05, 'epoch': 0.38} + 38%|███▊ | 2173/5772 [3:49:11<6:14:01, 6.24s/it] {'loss': 0.4665, 'learning_rate': 1.4340281972344374e-05, 'epoch': 0.38} + 38%|███▊ | 2173/5772 [3:49:04<6:14:01, 6.24s/it] 38%|███▊ | 2174/5772 [3:49:10<6:14:15, 6.24s/it] 38%|███▊ | 2174/5772 [3:49:17<6:14:15, 6.24s/it] {'loss': 0.4774, 'learning_rate': 1.4335225450301735e-05, 'epoch': 0.38} + 38%|███▊ | 2174/5772 [3:49:17<6:14:15, 6.24s/it] {'loss': 0.4774, 'learning_rate': 1.4335225450301735e-05, 'epoch': 0.38} + 38%|███▊ | 2174/5772 [3:49:10<6:14:15, 6.24s/it] 38%|███▊ | 2175/5772 [3:49:24<6:16:58, 6.29s/it] 38%|███▊ | 2175/5772 [3:49:17<6:16:59, 6.29s/it]{'loss': 0.4776, 'learning_rate': 1.4330167562903948e-05, 'epoch': 0.38} + {'loss': 0.4776, 'learning_rate': 1.4330167562903948e-05, 'epoch': 0.38} + 38%|███▊ | 2175/5772 [3:49:24<6:16:58, 6.29s/it] 38%|███▊ | 2175/5772 [3:49:17<6:16:59, 6.29s/it] 38%|███▊ | 2176/5772 [3:49:30<6:22:45, 6.39s/it] 38%|███▊ | 2176/5772 [3:49:23<6:22:45, 6.39s/it] {'loss': 0.4809, 'learning_rate': 1.4325108311743959e-05, 'epoch': 0.38} + 38%|███▊ | 2176/5772 [3:49:30<6:22:45, 6.39s/it] {'loss': 0.4809, 'learning_rate': 1.4325108311743959e-05, 'epoch': 0.38} + 38%|███▊ | 2176/5772 [3:49:23<6:22:45, 6.39s/it] 38%|███▊ | 2177/5772 [3:49:37<6:23:34, 6.40s/it] 38%|███▊ | 2177/5772 [3:49:30<6:23:34, 6.40s/it] {'loss': 0.4686, 'learning_rate': 1.4320047698415156e-05, 'epoch': 0.38} + 38%|███▊ | 2177/5772 [3:49:37<6:23:34, 6.40s/it] {'loss': 0.4686, 'learning_rate': 1.4320047698415156e-05, 'epoch': 0.38} + 38%|███▊ | 2177/5772 [3:49:30<6:23:34, 6.40s/it] 38%|███▊ | 2178/5772 [3:49:36<6:18:52, 6.33s/it] 38%|███▊ | 2178/5772 [3:49:43<6:18:53, 6.33s/it] {'loss': 0.4643, 'learning_rate': 1.4314985724511353e-05, 'epoch': 0.38} + 38%|███▊ | 2178/5772 [3:49:43<6:18:53, 6.33s/it] {'loss': 0.4643, 'learning_rate': 1.4314985724511353e-05, 'epoch': 0.38} + 38%|███▊ | 2178/5772 [3:49:36<6:18:52, 6.33s/it] 38%|███▊ | 2179/5772 [3:49:49<6:14:51, 6.26s/it] 38%|███▊ | 2179/5772 [3:49:42<6:14:51, 6.26s/it] {'loss': 0.4634, 'learning_rate': 1.4309922391626784e-05, 'epoch': 0.38} + 38%|███▊ | 2179/5772 [3:49:49<6:14:51, 6.26s/it] {'loss': 0.4634, 'learning_rate': 1.4309922391626784e-05, 'epoch': 0.38} + 38%|███▊ | 2179/5772 [3:49:42<6:14:51, 6.26s/it] 38%|███▊ | 2180/5772 [3:49:56<6:20:27, 6.36s/it] 38%|███▊ | 2180/5772 [3:49:49<6:20:27, 6.36s/it] {'loss': 0.4828, 'learning_rate': 1.4304857701356123e-05, 'epoch': 0.38} + 38%|███▊ | 2180/5772 [3:49:56<6:20:27, 6.36s/it] {'loss': 0.4828, 'learning_rate': 1.4304857701356123e-05, 'epoch': 0.38} + 38%|███▊ | 2180/5772 [3:49:49<6:20:27, 6.36s/it] 38%|███▊ | 2181/5772 [3:50:02<6:17:51, 6.31s/it] 38%|███▊ | 2181/5772 [3:49:55<6:17:51, 6.31s/it] {'loss': 0.48, 'learning_rate': 1.4299791655294461e-05, 'epoch': 0.38} + 38%|███▊ | 2181/5772 [3:50:02<6:17:51, 6.31s/it] {'loss': 0.48, 'learning_rate': 1.4299791655294461e-05, 'epoch': 0.38} + 38%|███▊ | 2181/5772 [3:49:55<6:17:51, 6.31s/it] 38%|███▊ | 2182/5772 [3:50:02<6:26:36, 6.46s/it] 38%|███▊ | 2182/5772 [3:50:09<6:26:37, 6.46s/it] {'loss': 0.4784, 'learning_rate': 1.4294724255037329e-05, 'epoch': 0.38} + 38%|███▊ | 2182/5772 [3:50:09<6:26:37, 6.46s/it] {'loss': 0.4784, 'learning_rate': 1.4294724255037329e-05, 'epoch': 0.38} + 38%|███▊ | 2182/5772 [3:50:02<6:26:36, 6.46s/it] 38%|███▊ | 2183/5772 [3:50:15<6:19:38, 6.35s/it] 38%|███▊ | 2183/5772 [3:50:08<6:19:38, 6.35s/it] {'loss': 0.4829, 'learning_rate': 1.4289655502180667e-05, 'epoch': 0.38} + 38%|███▊ | 2183/5772 [3:50:15<6:19:38, 6.35s/it] {'loss': 0.4829, 'learning_rate': 1.4289655502180667e-05, 'epoch': 0.38} + 38%|███▊ | 2183/5772 [3:50:08<6:19:38, 6.35s/it] 38%|███▊ | 2184/5772 [3:50:14<6:15:19, 6.28s/it] 38%|███▊ | 2184/5772 [3:50:21<6:15:19, 6.28s/it] {'loss': 0.4745, 'learning_rate': 1.428458539832086e-05, 'epoch': 0.38} + 38%|███▊ | 2184/5772 [3:50:21<6:15:19, 6.28s/it] {'loss': 0.4745, 'learning_rate': 1.428458539832086e-05, 'epoch': 0.38} + 38%|███▊ | 2184/5772 [3:50:14<6:15:19, 6.28s/it] 38%|███▊ | 2185/5772 [3:50:20<6:11:03, 6.21s/it] 38%|███▊ | 2185/5772 [3:50:27<6:11:03, 6.21s/it] {'loss': 0.4731, 'learning_rate': 1.42795139450547e-05, 'epoch': 0.38} + 38%|███▊ | 2185/5772 [3:50:27<6:11:03, 6.21s/it] {'loss': 0.4731, 'learning_rate': 1.42795139450547e-05, 'epoch': 0.38} + 38%|███▊ | 2185/5772 [3:50:20<6:11:03, 6.21s/it] 38%|███▊ | 2186/5772 [3:50:26<6:10:14, 6.19s/it] 38%|███▊ | 2186/5772 [3:50:33<6:10:14, 6.19s/it] {'loss': 0.4751, 'learning_rate': 1.4274441143979418e-05, 'epoch': 0.38} + 38%|███▊ | 2186/5772 [3:50:33<6:10:14, 6.19s/it] {'loss': 0.4751, 'learning_rate': 1.4274441143979418e-05, 'epoch': 0.38} + 38%|███▊ | 2186/5772 [3:50:26<6:10:14, 6.19s/it] 38%|███▊ | 2187/5772 [3:50:32<6:10:53, 6.21s/it] 38%|███▊ | 2187/5772 [3:50:39<6:10:54, 6.21s/it] {'loss': 0.4652, 'learning_rate': 1.4269366996692666e-05, 'epoch': 0.38} + 38%|███▊ | 2187/5772 [3:50:39<6:10:54, 6.21s/it] {'loss': 0.4652, 'learning_rate': 1.4269366996692666e-05, 'epoch': 0.38} + 38%|███▊ | 2187/5772 [3:50:32<6:10:53, 6.21s/it] 38%|███▊ | 2188/5772 [3:50:38<6:07:39, 6.15s/it] 38%|███▊ | 2188/5772 [3:50:45<6:07:39, 6.15s/it] {'loss': 0.468, 'learning_rate': 1.4264291504792514e-05, 'epoch': 0.38} + 38%|███▊ | 2188/5772 [3:50:45<6:07:39, 6.15s/it] {'loss': 0.468, 'learning_rate': 1.4264291504792514e-05, 'epoch': 0.38} + 38%|███▊ | 2188/5772 [3:50:38<6:07:39, 6.15s/it] 38%|███▊ | 2189/5772 [3:50:44<6:07:50, 6.16s/it] 38%|███▊ | 2189/5772 [3:50:52<6:07:50, 6.16s/it] {'loss': 0.4812, 'learning_rate': 1.4259214669877462e-05, 'epoch': 0.38} + 38%|███▊ | 2189/5772 [3:50:52<6:07:50, 6.16s/it] {'loss': 0.4812, 'learning_rate': 1.4259214669877462e-05, 'epoch': 0.38} + 38%|███▊ | 2189/5772 [3:50:44<6:07:50, 6.16s/it] 38%|███▊ | 2190/5772 [3:50:57<6:01:14, 6.05s/it] 38%|███▊ | 2190/5772 [3:50:50<6:01:15, 6.05s/it] {'loss': 0.4743, 'learning_rate': 1.4254136493546432e-05, 'epoch': 0.38} + 38%|███▊ | 2190/5772 [3:50:57<6:01:14, 6.05s/it] {'loss': 0.4743, 'learning_rate': 1.4254136493546432e-05, 'epoch': 0.38} + 38%|███▊ | 2190/5772 [3:50:50<6:01:15, 6.05s/it] 38%|███▊ | 2191/5772 [3:50:56<5:55:42, 5.96s/it] 38%|███▊ | 2191/5772 [3:51:03<5:55:42, 5.96s/it] {'loss': 0.4736, 'learning_rate': 1.4249056977398767e-05, 'epoch': 0.38} + 38%|███▊ | 2191/5772 [3:51:03<5:55:42, 5.96s/it] {'loss': 0.4736, 'learning_rate': 1.4249056977398767e-05, 'epoch': 0.38} + 38%|███▊ | 2191/5772 [3:50:56<5:55:42, 5.96s/it] 38%|███▊ | 2192/5772 [3:51:02<5:54:46, 5.95s/it] 38%|███▊ | 2192/5772 [3:51:09<5:54:46, 5.95s/it] {'loss': 0.4811, 'learning_rate': 1.4243976123034231e-05, 'epoch': 0.38} + 38%|███▊ | 2192/5772 [3:51:09<5:54:46, 5.95s/it] {'loss': 0.4811, 'learning_rate': 1.4243976123034231e-05, 'epoch': 0.38} + 38%|███▊ | 2192/5772 [3:51:02<5:54:46, 5.95s/it] 38%|███▊ | 2193/5772 [3:51:08<6:03:57, 6.10s/it] 38%|███▊ | 2193/5772 [3:51:15<6:03:57, 6.10s/it] {'loss': 0.4814, 'learning_rate': 1.4238893932053013e-05, 'epoch': 0.38} + 38%|███▊ | 2193/5772 [3:51:15<6:03:57, 6.10s/it] {'loss': 0.4814, 'learning_rate': 1.4238893932053013e-05, 'epoch': 0.38} + 38%|███▊ | 2193/5772 [3:51:08<6:03:57, 6.10s/it] 38%|███▊ | 2194/5772 [3:51:14<6:04:02, 6.10s/it] 38%|███▊ | 2194/5772 [3:51:22<6:04:03, 6.10s/it] {'loss': 0.4729, 'learning_rate': 1.4233810406055718e-05, 'epoch': 0.38} + 38%|███▊ | 2194/5772 [3:51:22<6:04:03, 6.10s/it] {'loss': 0.4729, 'learning_rate': 1.4233810406055718e-05, 'epoch': 0.38} + 38%|███▊ | 2194/5772 [3:51:14<6:04:02, 6.10s/it] 38%|███▊ | 2195/5772 [3:51:28<6:07:07, 6.16s/it] 38%|███▊ | 2195/5772 [3:51:21<6:07:09, 6.16s/it] {'loss': 0.4779, 'learning_rate': 1.4228725546643373e-05, 'epoch': 0.38} + 38%|███▊ | 2195/5772 [3:51:28<6:07:07, 6.16s/it] {'loss': 0.4779, 'learning_rate': 1.4228725546643373e-05, 'epoch': 0.38} + 38%|███▊ | 2195/5772 [3:51:21<6:07:09, 6.16s/it] 38%|███▊ | 2196/5772 [3:51:27<6:05:02, 6.12s/it] 38%|███▊ | 2196/5772 [3:51:34<6:05:02, 6.12s/it] {'loss': 0.4712, 'learning_rate': 1.422363935541743e-05, 'epoch': 0.38} + 38%|███▊ | 2196/5772 [3:51:34<6:05:02, 6.12s/it] {'loss': 0.4712, 'learning_rate': 1.422363935541743e-05, 'epoch': 0.38} + 38%|███▊ | 2196/5772 [3:51:27<6:05:02, 6.12s/it] 38%|███▊ | 2197/5772 [3:51:33<6:10:20, 6.22s/it] 38%|███▊ | 2197/5772 [3:51:40<6:10:20, 6.22s/it] {'loss': 0.467, 'learning_rate': 1.4218551833979759e-05, 'epoch': 0.38} + 38%|███▊ | 2197/5772 [3:51:40<6:10:20, 6.22s/it] {'loss': 0.467, 'learning_rate': 1.4218551833979759e-05, 'epoch': 0.38} + 38%|███▊ | 2197/5772 [3:51:33<6:10:20, 6.22s/it] 38%|███▊ | 2198/5772 [3:51:40<6:12:04, 6.25s/it] 38%|███▊ | 2198/5772 [3:51:47<6:12:05, 6.25s/it] {'loss': 0.4695, 'learning_rate': 1.4213462983932641e-05, 'epoch': 0.38} + 38%|███▊ | 2198/5772 [3:51:47<6:12:05, 6.25s/it] {'loss': 0.4695, 'learning_rate': 1.4213462983932641e-05, 'epoch': 0.38} + 38%|███▊ | 2198/5772 [3:51:40<6:12:04, 6.25s/it] 38%|███▊ | 2199/5772 [3:51:46<6:13:12, 6.27s/it] 38%|███▊ | 2199/5772 [3:51:53<6:13:11, 6.27s/it] {'loss': 0.4685, 'learning_rate': 1.4208372806878782e-05, 'epoch': 0.38} + 38%|███▊ | 2199/5772 [3:51:53<6:13:11, 6.27s/it] {'loss': 0.4685, 'learning_rate': 1.4208372806878782e-05, 'epoch': 0.38} + 38%|███▊ | 2199/5772 [3:51:46<6:13:12, 6.27s/it]13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 38%|███▊ | 2200/5772 [3:51:59<6:17:34, 6.34s/it]5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 38%|███▊ | 2200/5772 [3:51:52<6:17:34, 6.34s/it]14 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4762, 'learning_rate': 1.420328130442131e-05, 'epoch': 0.38} + 38%|███▊ | 2200/5772 [3:51:59<6:17:34, 6.34s/it] {'loss': 0.4762, 'learning_rate': 1.420328130442131e-05, 'epoch': 0.38} + 38%|███▊ | 2200/5772 [3:51:52<6:17:34, 6.34s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 38%|███▊ | 2201/5772 [3:52:14<10:55:11, 11.01s/it] 38%|███▊ | 2201/5772 [3:52:21<10:55:11, 11.01s/it] {'loss': 0.4696, 'learning_rate': 1.419818847816376e-05, 'epoch': 0.38} + 38%|███▊ | 2201/5772 [3:52:21<10:55:11, 11.01s/it] {'loss': 0.4696, 'learning_rate': 1.419818847816376e-05, 'epoch': 0.38} + 38%|███▊ | 2201/5772 [3:52:14<10:55:11, 11.01s/it] 38%|███▊ | 2202/5772 [3:52:20<9:24:41, 9.49s/it] 38%|███▊ | 2202/5772 [3:52:27<9:24:41, 9.49s/it] {'loss': 0.4628, 'learning_rate': 1.4193094329710089e-05, 'epoch': 0.38} + 38%|███▊ | 2202/5772 [3:52:27<9:24:41, 9.49s/it] {'loss': 0.4628, 'learning_rate': 1.4193094329710089e-05, 'epoch': 0.38} + 38%|███▊ | 2202/5772 [3:52:20<9:24:41, 9.49s/it] 38%|███▊ | 2203/5772 [3:52:26<8:21:12, 8.43s/it] 38%|███▊ | 2203/5772 [3:52:33<8:21:12, 8.43s/it] {'loss': 0.4587, 'learning_rate': 1.4187998860664672e-05, 'epoch': 0.38} + 38%|███▊ | 2203/5772 [3:52:33<8:21:12, 8.43s/it] {'loss': 0.4587, 'learning_rate': 1.4187998860664672e-05, 'epoch': 0.38} + 38%|███▊ | 2203/5772 [3:52:26<8:21:12, 8.43s/it] 38%|███▊ | 2204/5772 [3:52:32<7:34:26, 7.64s/it] 38%|███▊ | 2204/5772 [3:52:39<7:34:27, 7.64s/it] {'loss': 0.4808, 'learning_rate': 1.4182902072632301e-05, 'epoch': 0.38} + 38%|███▊ | 2204/5772 [3:52:39<7:34:27, 7.64s/it] {'loss': 0.4808, 'learning_rate': 1.4182902072632301e-05, 'epoch': 0.38} + 38%|███▊ | 2204/5772 [3:52:32<7:34:26, 7.64s/it] 38%|███▊ | 2205/5772 [3:52:38<7:08:02, 7.20s/it] 38%|███▊ | 2205/5772 [3:52:45<7:08:03, 7.20s/it] {'loss': 0.4736, 'learning_rate': 1.4177803967218178e-05, 'epoch': 0.38} + 38%|███▊ | 2205/5772 [3:52:45<7:08:03, 7.20s/it] {'loss': 0.4736, 'learning_rate': 1.4177803967218178e-05, 'epoch': 0.38} + 38%|███▊ | 2205/5772 [3:52:38<7:08:02, 7.20s/it] 38%|███▊ | 2206/5772 [3:52:44<6:48:19, 6.87s/it] 38%|███▊ | 2206/5772 [3:52:51<6:48:19, 6.87s/it] {'loss': 0.4798, 'learning_rate': 1.4172704546027926e-05, 'epoch': 0.38} + 38%|███▊ | 2206/5772 [3:52:51<6:48:19, 6.87s/it] {'loss': 0.4798, 'learning_rate': 1.4172704546027926e-05, 'epoch': 0.38} + 38%|███▊ | 2206/5772 [3:52:44<6:48:19, 6.87s/it] 38%|███▊ | 2207/5772 [3:52:51<6:36:59, 6.68s/it] 38%|███▊ | 2207/5772 [3:52:58<6:36:59, 6.68s/it] {'loss': 0.4762, 'learning_rate': 1.4167603810667578e-05, 'epoch': 0.38} + 38%|███▊ | 2207/5772 [3:52:58<6:36:59, 6.68s/it] {'loss': 0.4762, 'learning_rate': 1.4167603810667578e-05, 'epoch': 0.38} + 38%|███▊ | 2207/5772 [3:52:51<6:36:59, 6.68s/it] 38%|███▊ | 2208/5772 [3:52:57<6:25:47, 6.49s/it] 38%|███▊ | 2208/5772 [3:53:04<6:25:47, 6.49s/it] {'loss': 0.4764, 'learning_rate': 1.4162501762743579e-05, 'epoch': 0.38} + 38%|███▊ | 2208/5772 [3:53:04<6:25:47, 6.49s/it] {'loss': 0.4764, 'learning_rate': 1.4162501762743579e-05, 'epoch': 0.38} + 38%|███▊ | 2208/5772 [3:52:57<6:25:47, 6.49s/it] 38%|███▊ | 2209/5772 [3:53:03<6:24:01, 6.47s/it] 38%|███▊ | 2209/5772 [3:53:10<6:24:01, 6.47s/it] {'loss': 0.4728, 'learning_rate': 1.4157398403862794e-05, 'epoch': 0.38} + 38%|███▊ | 2209/5772 [3:53:10<6:24:01, 6.47s/it] {'loss': 0.4728, 'learning_rate': 1.4157398403862794e-05, 'epoch': 0.38} + 38%|███▊ | 2209/5772 [3:53:03<6:24:01, 6.47s/it] 38%|███▊ | 2210/5772 [3:53:09<6:20:24, 6.41s/it] 38%|███▊ | 2210/5772 [3:53:16<6:20:24, 6.41s/it] {'loss': 0.4847, 'learning_rate': 1.4152293735632498e-05, 'epoch': 0.38} + 38%|███▊ | 2210/5772 [3:53:16<6:20:24, 6.41s/it] {'loss': 0.4847, 'learning_rate': 1.4152293735632498e-05, 'epoch': 0.38} + 38%|███▊ | 2210/5772 [3:53:09<6:20:24, 6.41s/it] 38%|███▊ | 2211/5772 [3:53:15<6:13:45, 6.30s/it] 38%|███▊ | 2211/5772 [3:53:22<6:13:46, 6.30s/it] {'loss': 0.4677, 'learning_rate': 1.4147187759660377e-05, 'epoch': 0.38} + 38%|███▊ | 2211/5772 [3:53:22<6:13:46, 6.30s/it] {'loss': 0.4677, 'learning_rate': 1.4147187759660377e-05, 'epoch': 0.38} + 38%|███▊ | 2211/5772 [3:53:15<6:13:45, 6.30s/it] 38%|███▊ | 2212/5772 [3:53:22<6:13:50, 6.30s/it] 38%|███▊ | 2212/5772 [3:53:29<6:13:49, 6.30s/it] {'loss': 0.4848, 'learning_rate': 1.414208047755453e-05, 'epoch': 0.38} + 38%|███▊ | 2212/5772 [3:53:29<6:13:49, 6.30s/it] {'loss': 0.4848, 'learning_rate': 1.414208047755453e-05, 'epoch': 0.38} + 38%|███▊ | 2212/5772 [3:53:22<6:13:50, 6.30s/it] 38%|███▊ | 2213/5772 [3:53:28<6:07:44, 6.20s/it] 38%|███▊ | 2213/5772 [3:53:35<6:07:44, 6.20s/it] {'loss': 0.4629, 'learning_rate': 1.4136971890923465e-05, 'epoch': 0.38} + 38%|███▊ | 2213/5772 [3:53:35<6:07:44, 6.20s/it] {'loss': 0.4629, 'learning_rate': 1.4136971890923465e-05, 'epoch': 0.38} + 38%|███▊ | 2213/5772 [3:53:28<6:07:44, 6.20s/it] 38%|███▊ | 2214/5772 [3:53:34<6:08:26, 6.21s/it] 38%|███▊ | 2214/5772 [3:53:41<6:08:26, 6.21s/it] {'loss': 0.4789, 'learning_rate': 1.4131862001376107e-05, 'epoch': 0.38} + 38%|███▊ | 2214/5772 [3:53:41<6:08:26, 6.21s/it] {'loss': 0.4789, 'learning_rate': 1.4131862001376107e-05, 'epoch': 0.38} + 38%|███▊ | 2214/5772 [3:53:34<6:08:26, 6.21s/it] 38%|███▊ | 2215/5772 [3:53:40<6:07:17, 6.20s/it] 38%|███▊ | 2215/5772 [3:53:47<6:07:17, 6.20s/it] {'loss': 0.4672, 'learning_rate': 1.4126750810521783e-05, 'epoch': 0.38} + 38%|███▊ | 2215/5772 [3:53:47<6:07:17, 6.20s/it] {'loss': 0.4672, 'learning_rate': 1.4126750810521783e-05, 'epoch': 0.38} + 38%|███▊ | 2215/5772 [3:53:40<6:07:17, 6.20s/it] 38%|███▊ | 2216/5772 [3:53:46<6:10:35, 6.25s/it] 38%|███▊ | 2216/5772 [3:53:53<6:10:34, 6.25s/it] {'loss': 0.4909, 'learning_rate': 1.4121638319970234e-05, 'epoch': 0.38} + 38%|███▊ | 2216/5772 [3:53:53<6:10:34, 6.25s/it] {'loss': 0.4909, 'learning_rate': 1.4121638319970234e-05, 'epoch': 0.38} + 38%|███▊ | 2216/5772 [3:53:46<6:10:35, 6.25s/it] 38%|███▊ | 2217/5772 [3:53:52<6:07:09, 6.20s/it] 38%|███▊ | 2217/5772 [3:53:59<6:07:08, 6.20s/it] {'loss': 0.4755, 'learning_rate': 1.4116524531331616e-05, 'epoch': 0.38} + 38%|███▊ | 2217/5772 [3:53:59<6:07:08, 6.20s/it] {'loss': 0.4755, 'learning_rate': 1.4116524531331616e-05, 'epoch': 0.38} + 38%|███▊ | 2217/5772 [3:53:52<6:07:09, 6.20s/it] 38%|███▊ | 2218/5772 [3:53:59<6:05:27, 6.17s/it] 38%|███▊ | 2218/5772 [3:54:06<6:05:27, 6.17s/it] {'loss': 0.4736, 'learning_rate': 1.4111409446216482e-05, 'epoch': 0.38} + 38%|███▊ | 2218/5772 [3:54:06<6:05:27, 6.17s/it] {'loss': 0.4736, 'learning_rate': 1.4111409446216482e-05, 'epoch': 0.38} + 38%|███▊ | 2218/5772 [3:53:59<6:05:27, 6.17s/it] 38%|███▊ | 2219/5772 [3:54:04<6:01:53, 6.11s/it] 38%|███▊ | 2219/5772 [3:54:12<6:01:53, 6.11s/it] {'loss': 0.4709, 'learning_rate': 1.4106293066235806e-05, 'epoch': 0.38} + 38%|███▊ | 2219/5772 [3:54:12<6:01:53, 6.11s/it] {'loss': 0.4709, 'learning_rate': 1.4106293066235806e-05, 'epoch': 0.38} + 38%|███▊ | 2219/5772 [3:54:04<6:01:53, 6.11s/it] 38%|███▊ | 2220/5772 [3:54:11<6:02:35, 6.12s/it] 38%|███▊ | 2220/5772 [3:54:18<6:02:35, 6.12s/it] {'loss': 0.4803, 'learning_rate': 1.410117539300096e-05, 'epoch': 0.38} + 38%|███▊ | 2220/5772 [3:54:18<6:02:35, 6.12s/it] {'loss': 0.4803, 'learning_rate': 1.410117539300096e-05, 'epoch': 0.38} + 38%|███▊ | 2220/5772 [3:54:11<6:02:35, 6.12s/it] 38%|███▊ | 2221/5772 [3:54:16<5:57:43, 6.04s/it] 38%|███▊ | 2221/5772 [3:54:24<5:57:43, 6.04s/it] {'loss': 0.4605, 'learning_rate': 1.4096056428123721e-05, 'epoch': 0.38} + 38%|███▊ | 2221/5772 [3:54:24<5:57:43, 6.04s/it] {'loss': 0.4605, 'learning_rate': 1.4096056428123721e-05, 'epoch': 0.38} + 38%|███▊ | 2221/5772 [3:54:16<5:57:43, 6.04s/it] 38%|███▊ | 2222/5772 [3:54:22<5:54:13, 5.99s/it] 38%|███▊ | 2222/5772 [3:54:29<5:54:12, 5.99s/it] {'loss': 0.4636, 'learning_rate': 1.4090936173216289e-05, 'epoch': 0.38} + 38%|███▊ | 2222/5772 [3:54:29<5:54:12, 5.99s/it] {'loss': 0.4636, 'learning_rate': 1.4090936173216289e-05, 'epoch': 0.38} + 38%|███▊ | 2222/5772 [3:54:22<5:54:13, 5.99s/it] 39%|███▊ | 2223/5772 [3:54:29<5:59:32, 6.08s/it] 39%|███▊ | 2223/5772 [3:54:36<5:59:32, 6.08s/it] {'loss': 0.4725, 'learning_rate': 1.4085814629891252e-05, 'epoch': 0.39} + 39%|███▊ | 2223/5772 [3:54:36<5:59:32, 6.08s/it] {'loss': 0.4725, 'learning_rate': 1.4085814629891252e-05, 'epoch': 0.39} + 39%|███▊ | 2223/5772 [3:54:29<5:59:32, 6.08s/it] 39%|███▊ | 2224/5772 [3:54:35<5:57:29, 6.05s/it] 39%|███▊ | 2224/5772 [3:54:42<5:57:29, 6.05s/it] {'loss': 0.4739, 'learning_rate': 1.4080691799761618e-05, 'epoch': 0.39} + 39%|███▊ | 2224/5772 [3:54:42<5:57:29, 6.05s/it] {'loss': 0.4739, 'learning_rate': 1.4080691799761618e-05, 'epoch': 0.39} + 39%|███▊ | 2224/5772 [3:54:35<5:57:29, 6.05s/it] 39%|███▊ | 2225/5772 [3:54:40<5:52:12, 5.96s/it] 39%|███▊ | 2225/5772 [3:54:47<5:52:12, 5.96s/it] {'loss': 0.4719, 'learning_rate': 1.4075567684440788e-05, 'epoch': 0.39} + 39%|███▊ | 2225/5772 [3:54:47<5:52:12, 5.96s/it] {'loss': 0.4719, 'learning_rate': 1.4075567684440788e-05, 'epoch': 0.39} + 39%|███▊ | 2225/5772 [3:54:40<5:52:12, 5.96s/it] 39%|███▊ | 2226/5772 [3:54:46<5:55:09, 6.01s/it] 39%|███▊ | 2226/5772 [3:54:54<5:55:11, 6.01s/it] {'loss': 0.4875, 'learning_rate': 1.4070442285542579e-05, 'epoch': 0.39} + {'loss': 0.4875, 'learning_rate': 1.4070442285542579e-05, 'epoch': 0.39} 39%|███▊ | 2226/5772 [3:54:54<5:55:11, 6.01s/it] + 39%|███▊ | 2226/5772 [3:54:46<5:55:09, 6.01s/it] 39%|███▊ | 2227/5772 [3:54:52<5:54:18, 6.00s/it] 39%|███▊ | 2227/5772 [3:55:00<5:54:17, 6.00s/it] {'loss': 0.4738, 'learning_rate': 1.4065315604681198e-05, 'epoch': 0.39} + 39%|███▊ | 2227/5772 [3:55:00<5:54:17, 6.00s/it] {'loss': 0.4738, 'learning_rate': 1.4065315604681198e-05, 'epoch': 0.39} + 39%|███▊ | 2227/5772 [3:54:52<5:54:18, 6.00s/it] 39%|███▊ | 2228/5772 [3:54:59<6:03:21, 6.15s/it] 39%|███▊ | 2228/5772 [3:55:06<6:03:21, 6.15s/it] {'loss': 0.485, 'learning_rate': 1.4060187643471276e-05, 'epoch': 0.39} + 39%|███▊ | 2228/5772 [3:55:06<6:03:21, 6.15s/it] {'loss': 0.485, 'learning_rate': 1.4060187643471276e-05, 'epoch': 0.39} + 39%|███▊ | 2228/5772 [3:54:59<6:03:21, 6.15s/it] 39%|███▊ | 2229/5772 [3:55:05<6:02:08, 6.13s/it] 39%|███▊ | 2229/5772 [3:55:12<6:02:07, 6.13s/it] {'loss': 0.4719, 'learning_rate': 1.4055058403527828e-05, 'epoch': 0.39} + 39%|███▊ | 2229/5772 [3:55:12<6:02:07, 6.13s/it] {'loss': 0.4719, 'learning_rate': 1.4055058403527828e-05, 'epoch': 0.39} + 39%|███▊ | 2229/5772 [3:55:05<6:02:08, 6.13s/it] 39%|███▊ | 2230/5772 [3:55:11<6:03:10, 6.15s/it] 39%|███▊ | 2230/5772 [3:55:18<6:03:09, 6.15s/it] {'loss': 0.4794, 'learning_rate': 1.4049927886466281e-05, 'epoch': 0.39} + 39%|███▊ | 2230/5772 [3:55:18<6:03:09, 6.15s/it] {'loss': 0.4794, 'learning_rate': 1.4049927886466281e-05, 'epoch': 0.39} + 39%|███▊ | 2230/5772 [3:55:11<6:03:10, 6.15s/it] 39%|███▊ | 2231/5772 [3:55:17<5:55:40, 6.03s/it] 39%|███▊ | 2231/5772 [3:55:24<5:55:39, 6.03s/it] {'loss': 0.4657, 'learning_rate': 1.4044796093902466e-05, 'epoch': 0.39} + 39%|███▊ | 2231/5772 [3:55:24<5:55:39, 6.03s/it] {'loss': 0.4657, 'learning_rate': 1.4044796093902466e-05, 'epoch': 0.39} + 39%|███▊ | 2231/5772 [3:55:17<5:55:40, 6.03s/it] 39%|███▊ | 2232/5772 [3:55:23<6:02:17, 6.14s/it] 39%|███▊ | 2232/5772 [3:55:30<6:02:17, 6.14s/it] {'loss': 0.4766, 'learning_rate': 1.403966302745261e-05, 'epoch': 0.39} + 39%|███▊ | 2232/5772 [3:55:30<6:02:17, 6.14s/it] {'loss': 0.4766, 'learning_rate': 1.403966302745261e-05, 'epoch': 0.39} + 39%|███▊ | 2232/5772 [3:55:23<6:02:17, 6.14s/it] 39%|███▊ | 2233/5772 [3:55:29<6:01:08, 6.12s/it] 39%|███▊ | 2233/5772 [3:55:37<6:01:08, 6.12s/it] {'loss': 0.4589, 'learning_rate': 1.4034528688733344e-05, 'epoch': 0.39} + 39%|███▊ | 2233/5772 [3:55:37<6:01:08, 6.12s/it] {'loss': 0.4589, 'learning_rate': 1.4034528688733344e-05, 'epoch': 0.39} + 39%|███▊ | 2233/5772 [3:55:29<6:01:08, 6.12s/it] 39%|███▊ | 2234/5772 [3:55:36<6:04:57, 6.19s/it] 39%|███▊ | 2234/5772 [3:55:43<6:04:56, 6.19s/it] {'loss': 0.4717, 'learning_rate': 1.4029393079361699e-05, 'epoch': 0.39} + 39%|███▊ | 2234/5772 [3:55:43<6:04:56, 6.19s/it] {'loss': 0.4717, 'learning_rate': 1.4029393079361699e-05, 'epoch': 0.39} + 39%|███▊ | 2234/5772 [3:55:36<6:04:57, 6.19s/it] 39%|███▊ | 2235/5772 [3:55:42<6:02:25, 6.15s/it] 39%|███▊ | 2235/5772 [3:55:49<6:02:26, 6.15s/it] {'loss': 0.474, 'learning_rate': 1.402425620095511e-05, 'epoch': 0.39} + {'loss': 0.474, 'learning_rate': 1.402425620095511e-05, 'epoch': 0.39} 39%|███▊ | 2235/5772 [3:55:49<6:02:26, 6.15s/it] + 39%|███▊ | 2235/5772 [3:55:42<6:02:25, 6.15s/it] 39%|███▊ | 2236/5772 [3:55:48<6:01:42, 6.14s/it] 39%|███▊ | 2236/5772 [3:55:55<6:01:42, 6.14s/it] {'loss': 0.4857, 'learning_rate': 1.40191180551314e-05, 'epoch': 0.39} + 39%|███▊ | 2236/5772 [3:55:55<6:01:42, 6.14s/it] {'loss': 0.4857, 'learning_rate': 1.40191180551314e-05, 'epoch': 0.39} + 39%|███▊ | 2236/5772 [3:55:48<6:01:42, 6.14s/it] 39%|███▉ | 2237/5772 [3:55:54<6:01:14, 6.13s/it] 39%|███▉ | 2237/5772 [3:56:01<6:01:14, 6.13s/it] {'loss': 0.4714, 'learning_rate': 1.4013978643508807e-05, 'epoch': 0.39} + 39%|███▉ | 2237/5772 [3:56:01<6:01:14, 6.13s/it] {'loss': 0.4714, 'learning_rate': 1.4013978643508807e-05, 'epoch': 0.39} + 39%|███▉ | 2237/5772 [3:55:54<6:01:14, 6.13s/it] 39%|███▉ | 2238/5772 [3:56:00<5:56:19, 6.05s/it] 39%|███▉ | 2238/5772 [3:56:07<5:56:18, 6.05s/it] {'loss': 0.4794, 'learning_rate': 1.4008837967705959e-05, 'epoch': 0.39} + 39%|███▉ | 2238/5772 [3:56:07<5:56:18, 6.05s/it] {'loss': 0.4794, 'learning_rate': 1.4008837967705959e-05, 'epoch': 0.39} + 39%|███▉ | 2238/5772 [3:56:00<5:56:19, 6.05s/it] 39%|███▉ | 2239/5772 [3:56:06<5:54:45, 6.02s/it] 39%|███▉ | 2239/5772 [3:56:13<5:54:45, 6.02s/it] {'loss': 0.4615, 'learning_rate': 1.4003696029341884e-05, 'epoch': 0.39} + 39%|███▉ | 2239/5772 [3:56:13<5:54:45, 6.02s/it] {'loss': 0.4615, 'learning_rate': 1.4003696029341884e-05, 'epoch': 0.39} + 39%|███▉ | 2239/5772 [3:56:06<5:54:45, 6.02s/it] 39%|███▉ | 2240/5772 [3:56:12<5:53:50, 6.01s/it] 39%|███▉ | 2240/5772 [3:56:19<5:53:50, 6.01s/it] {'loss': 0.4666, 'learning_rate': 1.3998552830036003e-05, 'epoch': 0.39} + 39%|███▉ | 2240/5772 [3:56:19<5:53:50, 6.01s/it] {'loss': 0.4666, 'learning_rate': 1.3998552830036003e-05, 'epoch': 0.39} + 39%|███▉ | 2240/5772 [3:56:12<5:53:50, 6.01s/it] 39%|███▉ | 2241/5772 [3:56:18<5:57:28, 6.07s/it] 39%|███▉ | 2241/5772 [3:56:25<5:57:29, 6.07s/it] {'loss': 0.4718, 'learning_rate': 1.399340837140814e-05, 'epoch': 0.39} + 39%|███▉ | 2241/5772 [3:56:25<5:57:29, 6.07s/it] {'loss': 0.4718, 'learning_rate': 1.399340837140814e-05, 'epoch': 0.39} + 39%|███▉ | 2241/5772 [3:56:18<5:57:28, 6.07s/it] 39%|███▉ | 2242/5772 [3:56:24<5:53:48, 6.01s/it] 39%|███▉ | 2242/5772 [3:56:31<5:53:47, 6.01s/it] {'loss': 0.4681, 'learning_rate': 1.3988262655078514e-05, 'epoch': 0.39} + 39%|███▉ | 2242/5772 [3:56:31<5:53:47, 6.01s/it] {'loss': 0.4681, 'learning_rate': 1.3988262655078514e-05, 'epoch': 0.39} + 39%|███▉ | 2242/5772 [3:56:24<5:53:48, 6.01s/it] 39%|███▉ | 2243/5772 [3:56:30<5:59:11, 6.11s/it] 39%|███▉ | 2243/5772 [3:56:37<5:59:11, 6.11s/it] {'loss': 0.4775, 'learning_rate': 1.3983115682667743e-05, 'epoch': 0.39} + 39%|███▉ | 2243/5772 [3:56:37<5:59:11, 6.11s/it] {'loss': 0.4775, 'learning_rate': 1.3983115682667743e-05, 'epoch': 0.39} + 39%|███▉ | 2243/5772 [3:56:30<5:59:11, 6.11s/it] 39%|███▉ | 2244/5772 [3:56:36<5:51:42, 5.98s/it] 39%|███▉ | 2244/5772 [3:56:43<5:51:42, 5.98s/it] {'loss': 0.4658, 'learning_rate': 1.3977967455796828e-05, 'epoch': 0.39} + 39%|███▉ | 2244/5772 [3:56:43<5:51:42, 5.98s/it] {'loss': 0.4658, 'learning_rate': 1.3977967455796828e-05, 'epoch': 0.39} + 39%|███▉ | 2244/5772 [3:56:36<5:51:42, 5.98s/it] 39%|███▉ | 2245/5772 [3:56:42<5:51:17, 5.98s/it] 39%|███▉ | 2245/5772 [3:56:49<5:51:17, 5.98s/it] {'loss': 0.4591, 'learning_rate': 1.3972817976087183e-05, 'epoch': 0.39} + 39%|███▉ | 2245/5772 [3:56:49<5:51:17, 5.98s/it] {'loss': 0.4591, 'learning_rate': 1.3972817976087183e-05, 'epoch': 0.39} + 39%|███▉ | 2245/5772 [3:56:42<5:51:17, 5.98s/it] 39%|███▉ | 2246/5772 [3:56:48<5:52:44, 6.00s/it] 39%|███▉ | 2246/5772 [3:56:55<5:52:44, 6.00s/it] {'loss': 0.4805, 'learning_rate': 1.3967667245160608e-05, 'epoch': 0.39} + 39%|███▉ | 2246/5772 [3:56:55<5:52:44, 6.00s/it] {'loss': 0.4805, 'learning_rate': 1.3967667245160608e-05, 'epoch': 0.39} + 39%|███▉ | 2246/5772 [3:56:48<5:52:44, 6.00s/it] 39%|███▉ | 2247/5772 [3:56:54<5:58:52, 6.11s/it] 39%|███▉ | 2247/5772 [3:57:01<5:58:52, 6.11s/it] {'loss': 0.4736, 'learning_rate': 1.3962515264639291e-05, 'epoch': 0.39} + 39%|███▉ | 2247/5772 [3:57:01<5:58:52, 6.11s/it] {'loss': 0.4736, 'learning_rate': 1.3962515264639291e-05, 'epoch': 0.39} + 39%|███▉ | 2247/5772 [3:56:54<5:58:52, 6.11s/it] 39%|███▉ | 2248/5772 [3:57:00<5:57:17, 6.08s/it] 39%|███▉ | 2248/5772 [3:57:07<5:57:17, 6.08s/it] {'loss': 0.4789, 'learning_rate': 1.3957362036145826e-05, 'epoch': 0.39} + 39%|███▉ | 2248/5772 [3:57:07<5:57:17, 6.08s/it] {'loss': 0.4789, 'learning_rate': 1.3957362036145826e-05, 'epoch': 0.39} + 39%|███▉ | 2248/5772 [3:57:00<5:57:17, 6.08s/it] 39%|███▉ | 2249/5772 [3:57:07<6:00:26, 6.14s/it] 39%|███▉ | 2249/5772 [3:57:14<6:00:26, 6.14s/it] {'loss': 0.4675, 'learning_rate': 1.3952207561303188e-05, 'epoch': 0.39} + 39%|███▉ | 2249/5772 [3:57:14<6:00:26, 6.14s/it] {'loss': 0.4675, 'learning_rate': 1.3952207561303188e-05, 'epoch': 0.39} + 39%|███▉ | 2249/5772 [3:57:07<6:00:26, 6.14s/it]3 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +60 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 25 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + + 39%|███▉ | 2250/5772 [3:57:13<5:58:15, 6.10s/it]8 AutoResumeHook: Checking whether to suspend... + 39%|███▉ | 2250/5772 [3:57:20<5:58:15, 6.10s/it] {'loss': 0.4655, 'learning_rate': 1.3947051841734756e-05, 'epoch': 0.39} + 39%|███▉ | 2250/5772 [3:57:20<5:58:15, 6.10s/it] {'loss': 0.4655, 'learning_rate': 1.3947051841734756e-05, 'epoch': 0.39} + 39%|███▉ | 2250/5772 [3:57:13<5:58:15, 6.10s/it] 39%|███▉ | 2251/5772 [3:57:19<6:02:10, 6.17s/it] 39%|███▉ | 2251/5772 [3:57:26<6:02:10, 6.17s/it] {'loss': 0.4666, 'learning_rate': 1.3941894879064289e-05, 'epoch': 0.39} + 39%|███▉ | 2251/5772 [3:57:26<6:02:10, 6.17s/it] {'loss': 0.4666, 'learning_rate': 1.3941894879064289e-05, 'epoch': 0.39} + 39%|███▉ | 2251/5772 [3:57:19<6:02:10, 6.17s/it] 39%|███▉ | 2252/5772 [3:57:25<6:01:04, 6.15s/it] 39%|███▉ | 2252/5772 [3:57:32<6:01:04, 6.15s/it] {'loss': 0.4887, 'learning_rate': 1.3936736674915947e-05, 'epoch': 0.39} + 39%|███▉ | 2252/5772 [3:57:32<6:01:04, 6.15s/it] {'loss': 0.4887, 'learning_rate': 1.3936736674915947e-05, 'epoch': 0.39} + 39%|███▉ | 2252/5772 [3:57:25<6:01:04, 6.15s/it] 39%|███▉ | 2253/5772 [3:57:31<5:57:51, 6.10s/it] 39%|███▉ | 2253/5772 [3:57:38<5:57:51, 6.10s/it] {'loss': 0.4643, 'learning_rate': 1.393157723091428e-05, 'epoch': 0.39} + 39%|███▉ | 2253/5772 [3:57:38<5:57:51, 6.10s/it] {'loss': 0.4643, 'learning_rate': 1.393157723091428e-05, 'epoch': 0.39} + 39%|███▉ | 2253/5772 [3:57:31<5:57:51, 6.10s/it] 39%|███▉ | 2254/5772 [3:57:38<6:04:30, 6.22s/it] 39%|███▉ | 2254/5772 [3:57:45<6:04:30, 6.22s/it] {'loss': 0.4875, 'learning_rate': 1.3926416548684221e-05, 'epoch': 0.39} + 39%|███▉ | 2254/5772 [3:57:45<6:04:30, 6.22s/it] {'loss': 0.4875, 'learning_rate': 1.3926416548684221e-05, 'epoch': 0.39} + 39%|███▉ | 2254/5772 [3:57:38<6:04:30, 6.22s/it]Apr 09 17:41:33.221821 110516 slurmstepd 0x155550ab8700: error: *** STEP 6683204.0 ON batch-block1-0010 CANCELLED AT 2025-04-09T17:41:33 DUE TO TIME LIMIT *** +srun: Job step aborted: Waiting up to 122 seconds for job step to finish. + 39%|███▉ | 2255/5772 [3:57:43<5:57:22, 6.10s/it] 39%|███▉ | 2255/5772 [3:57:50<5:57:22, 6.10s/it] {'loss': 0.4604, 'learning_rate': 1.3921254629851103e-05, 'epoch': 0.39} + 39%|███▉ | 2255/5772 [3:57:50<5:57:22, 6.10s/it] {'loss': 0.4604, 'learning_rate': 1.3921254629851103e-05, 'epoch': 0.39} + 39%|███▉ | 2255/5772 [3:57:43<5:57:22, 6.10s/it]srun: error: batch-block1-10014: task 1: Terminated +srun: Terminating StepId=6683204.0 +srun: error: batch-block1-0010: task 0: Terminated +srun: job 6692876 queued and waiting for resources +srun: job 6692876 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2085 +JobID: 6692876 | Full list: batch-block1-2085 batch-block1-2109 +NETWORK=Efficient-Large-Model/VILA1.5-3b +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2085 +JobID: 6692876 | Full list: batch-block1-2085 batch-block1-2109 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,581] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,870] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,872] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:48,872] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:49,727] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:49,727] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 01:57:50,215] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 01:57:50,215] [INFO] [comm.py:594:init_distributed] cdb=None +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-04-10 01:57:58,459] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 2.70B parameters + Loading checkpoint shards: 0%| | 0/2 [00:00 4096). Running this sequence through the model will result in indexing errors + 39%|███▉ | 2275/5772 [07:58<6:08:16, 6.32s/it] 39%|███▉ | 2275/5772 [07:56<6:08:16, 6.32s/it] {'loss': 0.4769, 'learning_rate': 1.3817759072982737e-05, 'epoch': 0.39} + 39%|███▉ | 2275/5772 [07:58<6:08:16, 6.32s/it] {'loss': 0.4769, 'learning_rate': 1.3817759072982737e-05, 'epoch': 0.39} + 39%|███▉ | 2275/5772 [07:56<6:08:16, 6.32s/it] 39%|███▉ | 2276/5772 [08:02<6:02:41, 6.22s/it] 39%|███▉ | 2276/5772 [08:04<6:02:42, 6.22s/it] {'loss': 0.4793, 'learning_rate': 1.3812571561621341e-05, 'epoch': 0.39} + 39%|███▉ | 2276/5772 [08:02<6:02:41, 6.22s/it]{'loss': 0.4793, 'learning_rate': 1.3812571561621341e-05, 'epoch': 0.39} + 39%|███▉ | 2276/5772 [08:04<6:02:42, 6.22s/it] 39%|███▉ | 2277/5772 [08:10<6:02:29, 6.22s/it] 39%|███▉ | 2277/5772 [08:08<6:02:29, 6.22s/it] {'loss': 0.4674, 'learning_rate': 1.3807382849511732e-05, 'epoch': 0.39} + 39%|███▉ | 2277/5772 [08:10<6:02:29, 6.22s/it] {'loss': 0.4674, 'learning_rate': 1.3807382849511732e-05, 'epoch': 0.39} + 39%|███▉ | 2277/5772 [08:08<6:02:29, 6.22s/it] 39%|███▉ | 2278/5772 [08:16<5:59:40, 6.18s/it] 39%|███▉ | 2278/5772 [08:14<5:59:40, 6.18s/it] {'loss': 0.4612, 'learning_rate': 1.3802192938288063e-05, 'epoch': 0.39} + 39%|███▉ | 2278/5772 [08:16<5:59:40, 6.18s/it] {'loss': 0.4612, 'learning_rate': 1.3802192938288063e-05, 'epoch': 0.39} + 39%|███▉ | 2278/5772 [08:14<5:59:40, 6.18s/it] 39%|███▉ | 2279/5772 [08:22<5:57:38, 6.14s/it] 39%|███▉ | 2279/5772 [08:20<5:57:38, 6.14s/it] {'loss': 0.4749, 'learning_rate': 1.3797001829584868e-05, 'epoch': 0.39} + 39%|███▉ | 2279/5772 [08:22<5:57:38, 6.14s/it] {'loss': 0.4749, 'learning_rate': 1.3797001829584868e-05, 'epoch': 0.39} + 39%|███▉ | 2279/5772 [08:20<5:57:38, 6.14s/it] 40%|███▉ | 2280/5772 [08:28<5:55:18, 6.11s/it] 40%|███▉ | 2280/5772 [08:26<5:55:18, 6.10s/it] {'loss': 0.4634, 'learning_rate': 1.3791809525037057e-05, 'epoch': 0.39} + 40%|███▉ | 2280/5772 [08:28<5:55:18, 6.11s/it] {'loss': 0.4634, 'learning_rate': 1.3791809525037057e-05, 'epoch': 0.39} + 40%|███▉ | 2280/5772 [08:26<5:55:18, 6.10s/it] 40%|███▉ | 2281/5772 [08:34<5:55:11, 6.10s/it] 40%|███▉ | 2281/5772 [08:32<5:55:11, 6.10s/it] {'loss': 0.4755, 'learning_rate': 1.3786616026279922e-05, 'epoch': 0.4} + 40%|███▉ | 2281/5772 [08:34<5:55:11, 6.10s/it] {'loss': 0.4755, 'learning_rate': 1.3786616026279922e-05, 'epoch': 0.4} + 40%|███▉ | 2281/5772 [08:32<5:55:11, 6.10s/it] 40%|███▉ | 2282/5772 [08:40<5:51:37, 6.05s/it] 40%|███▉ | 2282/5772 [08:38<5:51:37, 6.05s/it] {'loss': 0.475, 'learning_rate': 1.378142133494912e-05, 'epoch': 0.4} + 40%|███▉ | 2282/5772 [08:40<5:51:37, 6.05s/it] {'loss': 0.475, 'learning_rate': 1.378142133494912e-05, 'epoch': 0.4} + 40%|███▉ | 2282/5772 [08:38<5:51:37, 6.05s/it] 40%|███▉ | 2283/5772 [08:47<5:58:25, 6.16s/it] 40%|███▉ | 2283/5772 [08:45<5:58:26, 6.16s/it] {'loss': 0.4809, 'learning_rate': 1.3776225452680696e-05, 'epoch': 0.4} + 40%|███▉ | 2283/5772 [08:47<5:58:25, 6.16s/it] {'loss': 0.4809, 'learning_rate': 1.3776225452680696e-05, 'epoch': 0.4} + 40%|███▉ | 2283/5772 [08:45<5:58:26, 6.16s/it] 40%|███▉ | 2284/5772 [08:53<5:56:40, 6.14s/it] 40%|███▉ | 2284/5772 [08:51<5:56:40, 6.14s/it] {'loss': 0.4685, 'learning_rate': 1.377102838111106e-05, 'epoch': 0.4} + 40%|███▉ | 2284/5772 [08:53<5:56:40, 6.14s/it] {'loss': 0.4685, 'learning_rate': 1.377102838111106e-05, 'epoch': 0.4} + 40%|███▉ | 2284/5772 [08:51<5:56:40, 6.14s/it] 40%|███▉ | 2285/5772 [08:59<5:57:46, 6.16s/it] 40%|███▉ | 2285/5772 [08:57<5:57:46, 6.16s/it] {'loss': 0.4739, 'learning_rate': 1.3765830121877004e-05, 'epoch': 0.4} + 40%|███▉ | 2285/5772 [08:59<5:57:46, 6.16s/it] {'loss': 0.4739, 'learning_rate': 1.3765830121877004e-05, 'epoch': 0.4} + 40%|███▉ | 2285/5772 [08:57<5:57:46, 6.16s/it] 40%|███▉ | 2286/5772 [09:05<5:56:30, 6.14s/it] 40%|███▉ | 2286/5772 [09:03<5:56:30, 6.14s/it] {'loss': 0.4748, 'learning_rate': 1.3760630676615685e-05, 'epoch': 0.4} + 40%|███▉ | 2286/5772 [09:05<5:56:30, 6.14s/it] {'loss': 0.4748, 'learning_rate': 1.3760630676615685e-05, 'epoch': 0.4} + 40%|███▉ | 2286/5772 [09:03<5:56:30, 6.14s/it] 40%|███▉ | 2287/5772 [09:11<5:56:25, 6.14s/it] 40%|███▉ | 2287/5772 [09:09<5:56:25, 6.14s/it] {'loss': 0.4793, 'learning_rate': 1.3755430046964649e-05, 'epoch': 0.4} + 40%|███▉ | 2287/5772 [09:11<5:56:25, 6.14s/it] {'loss': 0.4793, 'learning_rate': 1.3755430046964649e-05, 'epoch': 0.4} + 40%|███▉ | 2287/5772 [09:09<5:56:25, 6.14s/it] 40%|███▉ | 2288/5772 [09:17<5:49:50, 6.02s/it] 40%|███▉ | 2288/5772 [09:15<5:49:50, 6.02s/it] {'loss': 0.4773, 'learning_rate': 1.3750228234561796e-05, 'epoch': 0.4} + 40%|███▉ | 2288/5772 [09:17<5:49:50, 6.02s/it] {'loss': 0.4773, 'learning_rate': 1.3750228234561796e-05, 'epoch': 0.4} + 40%|███▉ | 2288/5772 [09:15<5:49:50, 6.02s/it] 40%|███▉ | 2289/5772 [09:23<5:54:59, 6.12s/it] 40%|███▉ | 2289/5772 [09:21<5:54:59, 6.12s/it] {'loss': 0.4722, 'learning_rate': 1.3745025241045414e-05, 'epoch': 0.4} + 40%|███▉ | 2289/5772 [09:23<5:54:59, 6.12s/it] {'loss': 0.4722, 'learning_rate': 1.3745025241045414e-05, 'epoch': 0.4} + 40%|███▉ | 2289/5772 [09:21<5:54:59, 6.12s/it] 40%|███▉ | 2290/5772 [09:29<5:53:32, 6.09s/it] 40%|███▉ | 2290/5772 [09:27<5:53:32, 6.09s/it] {'loss': 0.4733, 'learning_rate': 1.3739821068054153e-05, 'epoch': 0.4} + 40%|███▉ | 2290/5772 [09:29<5:53:32, 6.09s/it] {'loss': 0.4733, 'learning_rate': 1.3739821068054153e-05, 'epoch': 0.4} + 40%|███▉ | 2290/5772 [09:27<5:53:32, 6.09s/it] 40%|███▉ | 2291/5772 [09:35<5:47:13, 5.99s/it] 40%|███▉ | 2291/5772 [09:33<5:47:13, 5.99s/it] {'loss': 0.473, 'learning_rate': 1.373461571722704e-05, 'epoch': 0.4} + 40%|███▉ | 2291/5772 [09:35<5:47:13, 5.99s/it] {'loss': 0.473, 'learning_rate': 1.373461571722704e-05, 'epoch': 0.4} + 40%|███▉ | 2291/5772 [09:33<5:47:13, 5.99s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 40%|███▉ | 2292/5772 [09:42<5:56:57, 6.15s/it] 40%|███▉ | 2292/5772 [09:40<5:56:57, 6.15s/it] {'loss': 0.4796, 'learning_rate': 1.3729409190203475e-05, 'epoch': 0.4} + 40%|███▉ | 2292/5772 [09:42<5:56:57, 6.15s/it] {'loss': 0.4796, 'learning_rate': 1.3729409190203475e-05, 'epoch': 0.4} + 40%|███▉ | 2292/5772 [09:40<5:56:57, 6.15s/it] 40%|███▉ | 2293/5772 [09:48<5:59:14, 6.20s/it] 40%|███▉ | 2293/5772 [09:46<5:59:14, 6.20s/it] {'loss': 0.4749, 'learning_rate': 1.3724201488623216e-05, 'epoch': 0.4} + 40%|███▉ | 2293/5772 [09:48<5:59:14, 6.20s/it] {'loss': 0.4749, 'learning_rate': 1.3724201488623216e-05, 'epoch': 0.4} + 40%|███▉ | 2293/5772 [09:46<5:59:14, 6.20s/it] 40%|███▉ | 2294/5772 [09:54<6:02:37, 6.26s/it] 40%|███▉ | 2294/5772 [09:52<6:02:37, 6.26s/it] {'loss': 0.4794, 'learning_rate': 1.371899261412641e-05, 'epoch': 0.4} + 40%|███▉ | 2294/5772 [09:54<6:02:37, 6.26s/it] {'loss': 0.4794, 'learning_rate': 1.371899261412641e-05, 'epoch': 0.4} + 40%|███▉ | 2294/5772 [09:52<6:02:37, 6.26s/it] 40%|███▉ | 2295/5772 [10:00<5:56:17, 6.15s/it] 40%|███▉ | 2295/5772 [09:58<5:56:16, 6.15s/it] {'loss': 0.4751, 'learning_rate': 1.3713782568353553e-05, 'epoch': 0.4} + 40%|███▉ | 2295/5772 [10:00<5:56:17, 6.15s/it] {'loss': 0.4751, 'learning_rate': 1.3713782568353553e-05, 'epoch': 0.4} + 40%|███▉ | 2295/5772 [09:58<5:56:16, 6.15s/it] 40%|███▉ | 2296/5772 [10:06<5:51:59, 6.08s/it] 40%|███▉ | 2296/5772 [10:04<5:52:00, 6.08s/it] {'loss': 0.48, 'learning_rate': 1.3708571352945527e-05, 'epoch': 0.4} + 40%|███▉ | 2296/5772 [10:06<5:51:59, 6.08s/it] {'loss': 0.48, 'learning_rate': 1.3708571352945527e-05, 'epoch': 0.4} + 40%|███▉ | 2296/5772 [10:04<5:52:00, 6.08s/it] 40%|███▉ | 2297/5772 [10:12<5:50:49, 6.06s/it] 40%|███▉ | 2297/5772 [10:10<5:50:49, 6.06s/it] {'loss': 0.4806, 'learning_rate': 1.3703358969543575e-05, 'epoch': 0.4} + 40%|███▉ | 2297/5772 [10:12<5:50:49, 6.06s/it] {'loss': 0.4806, 'learning_rate': 1.3703358969543575e-05, 'epoch': 0.4} + 40%|███▉ | 2297/5772 [10:10<5:50:49, 6.06s/it] 40%|███▉ | 2298/5772 [10:18<5:57:33, 6.18s/it] 40%|███▉ | 2298/5772 [10:17<5:57:34, 6.18s/it] {'loss': 0.4642, 'learning_rate': 1.3698145419789302e-05, 'epoch': 0.4} + 40%|███▉ | 2298/5772 [10:18<5:57:33, 6.18s/it] {'loss': 0.4642, 'learning_rate': 1.3698145419789302e-05, 'epoch': 0.4} + 40%|███▉ | 2298/5772 [10:17<5:57:34, 6.18s/it] 40%|███▉ | 2299/5772 [10:25<5:55:12, 6.14s/it] 40%|███▉ | 2299/5772 [10:23<5:55:12, 6.14s/it] {'loss': 0.4907, 'learning_rate': 1.3692930705324697e-05, 'epoch': 0.4} + 40%|███▉ | 2299/5772 [10:25<5:55:12, 6.14s/it] {'loss': 0.4907, 'learning_rate': 1.3692930705324697e-05, 'epoch': 0.4} + 40%|███▉ | 2299/5772 [10:23<5:55:12, 6.14s/it]12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +1315 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 40%|███▉ | 2300/5772 [10:31<6:03:02, 6.27s/it]14 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +046 1 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + + 40%|███▉ | 2300/5772 [10:29<6:03:02, 6.27s/it] {'loss': 0.4663, 'learning_rate': 1.3687714827792093e-05, 'epoch': 0.4} + 40%|███▉ | 2300/5772 [10:31<6:03:02, 6.27s/it] {'loss': 0.4663, 'learning_rate': 1.3687714827792093e-05, 'epoch': 0.4} + 40%|███▉ | 2300/5772 [10:29<6:03:02, 6.27s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 40%|███▉ | 2301/5772 [10:56<11:26:49, 11.87s/it] 40%|███▉ | 2301/5772 [10:54<11:26:49, 11.87s/it] {'loss': 0.476, 'learning_rate': 1.368249778883421e-05, 'epoch': 0.4} + 40%|███▉ | 2301/5772 [10:56<11:26:49, 11.87s/it] {'loss': 0.476, 'learning_rate': 1.368249778883421e-05, 'epoch': 0.4} + 40%|███▉ | 2301/5772 [10:54<11:26:49, 11.87s/it] 40%|███▉ | 2302/5772 [11:02<9:49:52, 10.20s/it] 40%|███▉ | 2302/5772 [11:00<9:49:52, 10.20s/it] {'loss': 0.47, 'learning_rate': 1.3677279590094123e-05, 'epoch': 0.4} + 40%|███▉ | 2302/5772 [11:02<9:49:52, 10.20s/it] {'loss': 0.47, 'learning_rate': 1.3677279590094123e-05, 'epoch': 0.4} + 40%|███▉ | 2302/5772 [11:00<9:49:52, 10.20s/it] 40%|███▉ | 2303/5772 [11:08<8:36:23, 8.93s/it] 40%|███▉ | 2303/5772 [11:06<8:36:22, 8.93s/it] {'loss': 0.4694, 'learning_rate': 1.3672060233215277e-05, 'epoch': 0.4} + 40%|███▉ | 2303/5772 [11:08<8:36:23, 8.93s/it] {'loss': 0.4694, 'learning_rate': 1.3672060233215277e-05, 'epoch': 0.4} + 40%|███▉ | 2303/5772 [11:06<8:36:22, 8.93s/it] 40%|███▉ | 2304/5772 [11:14<7:47:17, 8.08s/it] 40%|███▉ | 2304/5772 [11:12<7:47:17, 8.08s/it] {'loss': 0.4755, 'learning_rate': 1.3666839719841473e-05, 'epoch': 0.4} + 40%|███▉ | 2304/5772 [11:14<7:47:17, 8.08s/it] {'loss': 0.4755, 'learning_rate': 1.3666839719841473e-05, 'epoch': 0.4} + 40%|███▉ | 2304/5772 [11:12<7:47:17, 8.08s/it] 40%|███▉ | 2305/5772 [11:21<7:13:52, 7.51s/it] 40%|███▉ | 2305/5772 [11:19<7:13:52, 7.51s/it] {'loss': 0.4808, 'learning_rate': 1.3661618051616893e-05, 'epoch': 0.4} + 40%|███▉ | 2305/5772 [11:21<7:13:52, 7.51s/it] {'loss': 0.4808, 'learning_rate': 1.3661618051616893e-05, 'epoch': 0.4} + 40%|███▉ | 2305/5772 [11:19<7:13:52, 7.51s/it] 40%|███▉ | 2306/5772 [11:27<6:46:20, 7.03s/it] 40%|███▉ | 2306/5772 [11:25<6:46:24, 7.04s/it] {'loss': 0.4644, 'learning_rate': 1.3656395230186062e-05, 'epoch': 0.4} + 40%|███▉ | 2306/5772 [11:27<6:46:20, 7.03s/it] {'loss': 0.4644, 'learning_rate': 1.3656395230186062e-05, 'epoch': 0.4} + 40%|███▉ | 2306/5772 [11:25<6:46:24, 7.04s/it] 40%|███▉ | 2307/5772 [11:31<6:28:37, 6.73s/it] 40%|███▉ | 2307/5772 [11:33<6:28:39, 6.73s/it] {'loss': 0.4786, 'learning_rate': 1.3651171257193883e-05, 'epoch': 0.4} + 40%|███▉ | 2307/5772 [11:33<6:28:39, 6.73s/it] {'loss': 0.4786, 'learning_rate': 1.3651171257193883e-05, 'epoch': 0.4} + 40%|███▉ | 2307/5772 [11:31<6:28:37, 6.73s/it] 40%|███▉ | 2308/5772 [11:39<6:24:37, 6.66s/it] 40%|███▉ | 2308/5772 [11:37<6:24:36, 6.66s/it] {'loss': 0.4711, 'learning_rate': 1.3645946134285617e-05, 'epoch': 0.4} + 40%|███▉ | 2308/5772 [11:39<6:24:37, 6.66s/it] {'loss': 0.4711, 'learning_rate': 1.3645946134285617e-05, 'epoch': 0.4} + 40%|███▉ | 2308/5772 [11:37<6:24:36, 6.66s/it] 40%|████ | 2309/5772 [11:45<6:14:43, 6.49s/it] 40%|████ | 2309/5772 [11:43<6:14:42, 6.49s/it] {'loss': 0.4827, 'learning_rate': 1.3640719863106888e-05, 'epoch': 0.4} + 40%|████ | 2309/5772 [11:45<6:14:43, 6.49s/it] {'loss': 0.4827, 'learning_rate': 1.3640719863106888e-05, 'epoch': 0.4} + 40%|████ | 2309/5772 [11:43<6:14:42, 6.49s/it] 40%|████ | 2310/5772 [11:52<6:14:01, 6.48s/it] 40%|████ | 2310/5772 [11:50<6:14:00, 6.48s/it] {'loss': 0.4798, 'learning_rate': 1.3635492445303679e-05, 'epoch': 0.4} + 40%|████ | 2310/5772 [11:52<6:14:01, 6.48s/it] {'loss': 0.4798, 'learning_rate': 1.3635492445303679e-05, 'epoch': 0.4} + 40%|████ | 2310/5772 [11:50<6:14:00, 6.48s/it] 40%|████ | 2311/5772 [11:55<6:03:05, 6.29s/it] 40%|████ | 2311/5772 [11:57<6:03:05, 6.29s/it] {'loss': 0.4772, 'learning_rate': 1.3630263882522341e-05, 'epoch': 0.4} + 40%|████ | 2311/5772 [11:57<6:03:05, 6.29s/it] {'loss': 0.4772, 'learning_rate': 1.3630263882522341e-05, 'epoch': 0.4} + 40%|████ | 2311/5772 [11:55<6:03:05, 6.29s/it] 40%|████ | 2312/5772 [12:03<5:55:19, 6.16s/it] 40%|████ | 2312/5772 [12:01<5:55:19, 6.16s/it] {'loss': 0.4721, 'learning_rate': 1.3625034176409577e-05, 'epoch': 0.4} + 40%|████ | 2312/5772 [12:03<5:55:19, 6.16s/it] {'loss': 0.4721, 'learning_rate': 1.3625034176409577e-05, 'epoch': 0.4} + 40%|████ | 2312/5772 [12:01<5:55:19, 6.16s/it] 40%|████ | 2313/5772 [12:07<5:54:16, 6.15s/it] 40%|████ | 2313/5772 [12:09<5:54:16, 6.15s/it] {'loss': 0.49, 'learning_rate': 1.3619803328612454e-05, 'epoch': 0.4} + 40%|████ | 2313/5772 [12:09<5:54:16, 6.15s/it] {'loss': 0.49, 'learning_rate': 1.3619803328612454e-05, 'epoch': 0.4} + 40%|████ | 2313/5772 [12:07<5:54:16, 6.15s/it] 40%|████ | 2314/5772 [12:16<5:53:29, 6.13s/it] 40%|████ | 2314/5772 [12:14<5:53:30, 6.13s/it] {'loss': 0.4814, 'learning_rate': 1.3614571340778398e-05, 'epoch': 0.4} + 40%|████ | 2314/5772 [12:16<5:53:29, 6.13s/it] {'loss': 0.4814, 'learning_rate': 1.3614571340778398e-05, 'epoch': 0.4} + 40%|████ | 2314/5772 [12:14<5:53:30, 6.13s/it] 40%|████ | 2315/5772 [12:22<5:51:54, 6.11s/it] 40%|████ | 2315/5772 [12:20<5:51:54, 6.11s/it] {'loss': 0.4737, 'learning_rate': 1.3609338214555195e-05, 'epoch': 0.4} + 40%|████ | 2315/5772 [12:22<5:51:54, 6.11s/it] {'loss': 0.4737, 'learning_rate': 1.3609338214555195e-05, 'epoch': 0.4} + 40%|████ | 2315/5772 [12:20<5:51:54, 6.11s/it] 40%|████ | 2316/5772 [12:28<5:53:20, 6.13s/it] 40%|████ | 2316/5772 [12:26<5:53:20, 6.13s/it] {'loss': 0.464, 'learning_rate': 1.3604103951590993e-05, 'epoch': 0.4} + 40%|████ | 2316/5772 [12:28<5:53:20, 6.13s/it] {'loss': 0.464, 'learning_rate': 1.3604103951590993e-05, 'epoch': 0.4} + 40%|████ | 2316/5772 [12:26<5:53:20, 6.13s/it] 40%|████ | 2317/5772 [12:34<5:56:53, 6.20s/it] 40%|████ | 2317/5772 [12:32<5:56:53, 6.20s/it] {'loss': 0.4787, 'learning_rate': 1.3598868553534286e-05, 'epoch': 0.4} + 40%|████ | 2317/5772 [12:34<5:56:53, 6.20s/it] {'loss': 0.4787, 'learning_rate': 1.3598868553534286e-05, 'epoch': 0.4} + 40%|████ | 2317/5772 [12:32<5:56:53, 6.20s/it] 40%|████ | 2318/5772 [12:40<5:56:28, 6.19s/it] 40%|████ | 2318/5772 [12:38<5:56:28, 6.19s/it] {'loss': 0.4692, 'learning_rate': 1.359363202203394e-05, 'epoch': 0.4} + 40%|████ | 2318/5772 [12:40<5:56:28, 6.19s/it] {'loss': 0.4692, 'learning_rate': 1.359363202203394e-05, 'epoch': 0.4} + 40%|████ | 2318/5772 [12:38<5:56:28, 6.19s/it] 40%|████ | 2319/5772 [12:45<6:04:28, 6.33s/it] 40%|████ | 2319/5772 [12:47<6:04:29, 6.33s/it] {'loss': 0.4746, 'learning_rate': 1.3588394358739167e-05, 'epoch': 0.4} + 40%|████ | 2319/5772 [12:47<6:04:29, 6.33s/it] {'loss': 0.4746, 'learning_rate': 1.3588394358739167e-05, 'epoch': 0.4} + 40%|████ | 2319/5772 [12:45<6:04:28, 6.33s/it] 40%|████ | 2320/5772 [12:51<5:58:10, 6.23s/it] 40%|████ | 2320/5772 [12:53<5:58:10, 6.23s/it] {'loss': 0.4711, 'learning_rate': 1.3583155565299544e-05, 'epoch': 0.4} + 40%|████ | 2320/5772 [12:53<5:58:10, 6.23s/it] {'loss': 0.4711, 'learning_rate': 1.3583155565299544e-05, 'epoch': 0.4} + 40%|████ | 2320/5772 [12:51<5:58:10, 6.23s/it] 40%|████ | 2321/5772 [12:59<5:49:12, 6.07s/it] 40%|████ | 2321/5772 [12:57<5:49:13, 6.07s/it] {'loss': 0.4707, 'learning_rate': 1.3577915643364997e-05, 'epoch': 0.4} + 40%|████ | 2321/5772 [12:59<5:49:12, 6.07s/it] {'loss': 0.4707, 'learning_rate': 1.3577915643364997e-05, 'epoch': 0.4} + 40%|████ | 2321/5772 [12:57<5:49:13, 6.07s/it] 40%|████ | 2322/5772 [13:05<5:51:17, 6.11s/it] 40%|████ | 2322/5772 [13:03<5:51:18, 6.11s/it] {'loss': 0.4784, 'learning_rate': 1.3572674594585813e-05, 'epoch': 0.4} + 40%|████ | 2322/5772 [13:05<5:51:17, 6.11s/it] {'loss': 0.4784, 'learning_rate': 1.3572674594585813e-05, 'epoch': 0.4} + 40%|████ | 2322/5772 [13:03<5:51:18, 6.11s/it] 40%|████ | 2323/5772 [13:11<5:50:34, 6.10s/it] 40%|████ | 2323/5772 [13:09<5:50:34, 6.10s/it] {'loss': 0.4633, 'learning_rate': 1.356743242061263e-05, 'epoch': 0.4} + 40%|████ | 2323/5772 [13:11<5:50:34, 6.10s/it] {'loss': 0.4633, 'learning_rate': 1.356743242061263e-05, 'epoch': 0.4} + 40%|████ | 2323/5772 [13:09<5:50:34, 6.10s/it] 40%|████ | 2324/5772 [13:18<5:59:10, 6.25s/it] 40%|████ | 2324/5772 [13:16<5:59:10, 6.25s/it] {'loss': 0.4693, 'learning_rate': 1.3562189123096439e-05, 'epoch': 0.4} + 40%|████ | 2324/5772 [13:18<5:59:10, 6.25s/it] {'loss': 0.4693, 'learning_rate': 1.3562189123096439e-05, 'epoch': 0.4} + 40%|████ | 2324/5772 [13:16<5:59:10, 6.25s/it] 40%|████ | 2325/5772 [13:22<5:55:54, 6.20s/it] 40%|████ | 2325/5772 [13:24<5:55:54, 6.20s/it] {'loss': 0.4733, 'learning_rate': 1.3556944703688592e-05, 'epoch': 0.4} + 40%|████ | 2325/5772 [13:24<5:55:54, 6.20s/it] {'loss': 0.4733, 'learning_rate': 1.3556944703688592e-05, 'epoch': 0.4} + 40%|████ | 2325/5772 [13:22<5:55:54, 6.20s/it] 40%|████ | 2326/5772 [13:30<5:56:19, 6.20s/it] 40%|████ | 2326/5772 [13:28<5:56:19, 6.20s/it] {'loss': 0.4557, 'learning_rate': 1.3551699164040786e-05, 'epoch': 0.4} + 40%|████ | 2326/5772 [13:30<5:56:19, 6.20s/it] {'loss': 0.4557, 'learning_rate': 1.3551699164040786e-05, 'epoch': 0.4} + 40%|████ | 2326/5772 [13:28<5:56:19, 6.20s/it] 40%|████ | 2327/5772 [13:36<5:53:26, 6.16s/it] 40%|████ | 2327/5772 [13:34<5:53:26, 6.16s/it] {'loss': 0.4625, 'learning_rate': 1.3546452505805076e-05, 'epoch': 0.4} + 40%|████ | 2327/5772 [13:36<5:53:26, 6.16s/it] {'loss': 0.4625, 'learning_rate': 1.3546452505805076e-05, 'epoch': 0.4} + 40%|████ | 2327/5772 [13:34<5:53:26, 6.16s/it] 40%|████ | 2328/5772 [13:40<5:50:52, 6.11s/it] 40%|████ | 2328/5772 [13:42<5:50:52, 6.11s/it] {'loss': 0.473, 'learning_rate': 1.3541204730633864e-05, 'epoch': 0.4} + 40%|████ | 2328/5772 [13:42<5:50:52, 6.11s/it] {'loss': 0.473, 'learning_rate': 1.3541204730633864e-05, 'epoch': 0.4} + 40%|████ | 2328/5772 [13:40<5:50:52, 6.11s/it] 40%|████ | 2329/5772 [13:46<5:48:23, 6.07s/it] 40%|████ | 2329/5772 [13:48<5:48:24, 6.07s/it] {'loss': 0.4798, 'learning_rate': 1.3535955840179918e-05, 'epoch': 0.4} + 40%|████ | 2329/5772 [13:48<5:48:24, 6.07s/it] {'loss': 0.4798, 'learning_rate': 1.3535955840179918e-05, 'epoch': 0.4} + 40%|████ | 2329/5772 [13:46<5:48:23, 6.07s/it] 40%|████ | 2330/5772 [13:52<5:50:32, 6.11s/it] 40%|████ | 2330/5772 [13:54<5:50:32, 6.11s/it] {'loss': 0.4713, 'learning_rate': 1.3530705836096333e-05, 'epoch': 0.4} + 40%|████ | 2330/5772 [13:54<5:50:32, 6.11s/it] {'loss': 0.4713, 'learning_rate': 1.3530705836096333e-05, 'epoch': 0.4} + 40%|████ | 2330/5772 [13:52<5:50:32, 6.11s/it] 40%|████ | 2331/5772 [13:58<5:49:21, 6.09s/it] 40%|████ | 2331/5772 [14:00<5:49:21, 6.09s/it] {'loss': 0.4861, 'learning_rate': 1.3525454720036581e-05, 'epoch': 0.4} + 40%|████ | 2331/5772 [14:00<5:49:21, 6.09s/it] {'loss': 0.4861, 'learning_rate': 1.3525454720036581e-05, 'epoch': 0.4} + 40%|████ | 2331/5772 [13:58<5:49:21, 6.09s/it] 40%|████ | 2332/5772 [14:06<5:48:30, 6.08s/it] 40%|████ | 2332/5772 [14:04<5:48:30, 6.08s/it] {'loss': 0.4849, 'learning_rate': 1.3520202493654466e-05, 'epoch': 0.4} + 40%|████ | 2332/5772 [14:06<5:48:30, 6.08s/it] {'loss': 0.4849, 'learning_rate': 1.3520202493654466e-05, 'epoch': 0.4} + 40%|████ | 2332/5772 [14:04<5:48:30, 6.08s/it] 40%|████ | 2333/5772 [14:12<5:47:36, 6.06s/it] 40%|████ | 2333/5772 [14:10<5:47:36, 6.06s/it] {'loss': 0.4749, 'learning_rate': 1.3514949158604147e-05, 'epoch': 0.4} + 40%|████ | 2333/5772 [14:12<5:47:36, 6.06s/it] {'loss': 0.4749, 'learning_rate': 1.3514949158604147e-05, 'epoch': 0.4} + 40%|████ | 2333/5772 [14:10<5:47:36, 6.06s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4399 > 4096). Running this sequence through the model will result in indexing errors + 40%|████ | 2334/5772 [14:18<5:44:48, 6.02s/it] 40%|████ | 2334/5772 [14:16<5:44:48, 6.02s/it] {'loss': 0.4686, 'learning_rate': 1.3509694716540135e-05, 'epoch': 0.4} + 40%|████ | 2334/5772 [14:18<5:44:48, 6.02s/it] {'loss': 0.4686, 'learning_rate': 1.3509694716540135e-05, 'epoch': 0.4} + 40%|████ | 2334/5772 [14:16<5:44:48, 6.02s/it] 40%|████ | 2335/5772 [14:24<5:44:16, 6.01s/it] 40%|████ | 2335/5772 [14:22<5:44:16, 6.01s/it] {'loss': 0.4771, 'learning_rate': 1.3504439169117283e-05, 'epoch': 0.4} + 40%|████ | 2335/5772 [14:24<5:44:16, 6.01s/it] {'loss': 0.4771, 'learning_rate': 1.3504439169117283e-05, 'epoch': 0.4} + 40%|████ | 2335/5772 [14:22<5:44:16, 6.01s/it] 40%|████ | 2336/5772 [14:30<5:45:33, 6.03s/it] 40%|████ | 2336/5772 [14:28<5:45:33, 6.03s/it] {'loss': 0.4818, 'learning_rate': 1.34991825179908e-05, 'epoch': 0.4} + 40%|████ | 2336/5772 [14:30<5:45:33, 6.03s/it] {'loss': 0.4818, 'learning_rate': 1.34991825179908e-05, 'epoch': 0.4} + 40%|████ | 2336/5772 [14:28<5:45:33, 6.03s/it] 40%|████ | 2337/5772 [14:36<5:46:18, 6.05s/it] 40%|████ | 2337/5772 [14:34<5:46:18, 6.05s/it] {'loss': 0.4784, 'learning_rate': 1.349392476481624e-05, 'epoch': 0.4} + 40%|████ | 2337/5772 [14:36<5:46:18, 6.05s/it] {'loss': 0.4784, 'learning_rate': 1.349392476481624e-05, 'epoch': 0.4} + 40%|████ | 2337/5772 [14:34<5:46:18, 6.05s/it] 41%|████ | 2338/5772 [14:40<5:46:58, 6.06s/it] 41%|████ | 2338/5772 [14:42<5:46:58, 6.06s/it] {'loss': 0.482, 'learning_rate': 1.3488665911249503e-05, 'epoch': 0.41} + 41%|████ | 2338/5772 [14:42<5:46:58, 6.06s/it] {'loss': 0.482, 'learning_rate': 1.3488665911249503e-05, 'epoch': 0.41} + 41%|████ | 2338/5772 [14:40<5:46:58, 6.06s/it] 41%|████ | 2339/5772 [14:48<5:45:21, 6.04s/it] 41%|████ | 2339/5772 [14:46<5:45:21, 6.04s/it] {'loss': 0.4707, 'learning_rate': 1.348340595894683e-05, 'epoch': 0.41} + 41%|████ | 2339/5772 [14:48<5:45:21, 6.04s/it] {'loss': 0.4707, 'learning_rate': 1.348340595894683e-05, 'epoch': 0.41} + 41%|████ | 2339/5772 [14:46<5:45:21, 6.04s/it] 41%|████ | 2340/5772 [14:53<5:48:44, 6.10s/it] 41%|████ | 2340/5772 [14:55<5:48:44, 6.10s/it] {'loss': 0.4692, 'learning_rate': 1.3478144909564824e-05, 'epoch': 0.41} + 41%|████ | 2340/5772 [14:55<5:48:44, 6.10s/it] {'loss': 0.4692, 'learning_rate': 1.3478144909564824e-05, 'epoch': 0.41} + 41%|████ | 2340/5772 [14:53<5:48:44, 6.10s/it] 41%|████ | 2341/5772 [14:59<5:46:44, 6.06s/it] 41%|████ | 2341/5772 [15:01<5:46:44, 6.06s/it] {'loss': 0.4772, 'learning_rate': 1.3472882764760414e-05, 'epoch': 0.41} + 41%|████ | 2341/5772 [15:01<5:46:44, 6.06s/it] {'loss': 0.4772, 'learning_rate': 1.3472882764760414e-05, 'epoch': 0.41} + 41%|████ | 2341/5772 [14:59<5:46:44, 6.06s/it] 41%|████ | 2342/5772 [15:05<5:50:09, 6.13s/it] 41%|████ | 2342/5772 [15:07<5:50:09, 6.13s/it] {'loss': 0.4598, 'learning_rate': 1.3467619526190885e-05, 'epoch': 0.41} + 41%|████ | 2342/5772 [15:07<5:50:09, 6.13s/it] {'loss': 0.4598, 'learning_rate': 1.3467619526190885e-05, 'epoch': 0.41} + 41%|████ | 2342/5772 [15:05<5:50:09, 6.13s/it] 41%|████ | 2343/5772 [15:11<5:43:32, 6.01s/it] 41%|████ | 2343/5772 [15:13<5:43:32, 6.01s/it] {'loss': 0.4806, 'learning_rate': 1.3462355195513868e-05, 'epoch': 0.41} + 41%|████ | 2343/5772 [15:13<5:43:32, 6.01s/it] {'loss': 0.4806, 'learning_rate': 1.3462355195513868e-05, 'epoch': 0.41} + 41%|████ | 2343/5772 [15:11<5:43:32, 6.01s/it] 41%|████ | 2344/5772 [15:17<5:45:18, 6.04s/it] 41%|████ | 2344/5772 [15:19<5:45:18, 6.04s/it] {'loss': 0.4645, 'learning_rate': 1.3457089774387333e-05, 'epoch': 0.41} + 41%|████ | 2344/5772 [15:19<5:45:18, 6.04s/it] {'loss': 0.4645, 'learning_rate': 1.3457089774387333e-05, 'epoch': 0.41} + 41%|████ | 2344/5772 [15:17<5:45:18, 6.04s/it] 41%|████ | 2345/5772 [15:23<5:45:31, 6.05s/it] 41%|████ | 2345/5772 [15:25<5:45:32, 6.05s/it] {'loss': 0.4834, 'learning_rate': 1.3451823264469595e-05, 'epoch': 0.41} + 41%|████ | 2345/5772 [15:25<5:45:32, 6.05s/it] {'loss': 0.4834, 'learning_rate': 1.3451823264469595e-05, 'epoch': 0.41} + 41%|████ | 2345/5772 [15:23<5:45:31, 6.05s/it] 41%|████ | 2346/5772 [15:29<5:47:59, 6.09s/it] 41%|████ | 2346/5772 [15:31<5:47:59, 6.09s/it] {'loss': 0.4737, 'learning_rate': 1.344655566741931e-05, 'epoch': 0.41} + 41%|████ | 2346/5772 [15:31<5:47:59, 6.09s/it] {'loss': 0.4737, 'learning_rate': 1.344655566741931e-05, 'epoch': 0.41} + 41%|████ | 2346/5772 [15:29<5:47:59, 6.09s/it] 41%|████ | 2347/5772 [15:35<5:53:55, 6.20s/it] 41%|████ | 2347/5772 [15:37<5:53:55, 6.20s/it] {'loss': 0.4835, 'learning_rate': 1.3441286984895486e-05, 'epoch': 0.41} + 41%|████ | 2347/5772 [15:37<5:53:55, 6.20s/it] {'loss': 0.4835, 'learning_rate': 1.3441286984895486e-05, 'epoch': 0.41} + 41%|████ | 2347/5772 [15:35<5:53:55, 6.20s/it] 41%|████ | 2348/5772 [15:44<5:56:14, 6.24s/it] 41%|████ | 2348/5772 [15:42<5:56:14, 6.24s/it] {'loss': 0.4712, 'learning_rate': 1.3436017218557453e-05, 'epoch': 0.41} + 41%|████ | 2348/5772 [15:44<5:56:14, 6.24s/it] {'loss': 0.4712, 'learning_rate': 1.3436017218557453e-05, 'epoch': 0.41} + 41%|████ | 2348/5772 [15:42<5:56:14, 6.24s/it] 41%|████ | 2349/5772 [15:48<6:02:50, 6.36s/it] 41%|████ | 2349/5772 [15:50<6:02:50, 6.36s/it] {'loss': 0.461, 'learning_rate': 1.3430746370064904e-05, 'epoch': 0.41} + 41%|████ | 2349/5772 [15:50<6:02:50, 6.36s/it] {'loss': 0.461, 'learning_rate': 1.3430746370064904e-05, 'epoch': 0.41} + 41%|████ | 2349/5772 [15:48<6:02:50, 6.36s/it]11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1213 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend...15 AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +3 5AutoResumeHook: Checking whether to suspend... 0 +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 41%|████ | 2350/5772 [15:57<6:01:40, 6.34s/it]AutoResumeHook: Checking whether to suspend... + 41%|████ | 2350/5772 [15:55<6:01:40, 6.34s/it]2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4763, 'learning_rate': 1.342547444107786e-05, 'epoch': 0.41} + 41%|████ | 2350/5772 [15:57<6:01:40, 6.34s/it] {'loss': 0.4763, 'learning_rate': 1.342547444107786e-05, 'epoch': 0.41} + 41%|████ | 2350/5772 [15:55<6:01:40, 6.34s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 41%|████ | 2351/5772 [16:01<6:01:25, 6.34s/it] 41%|████ | 2351/5772 [16:03<6:01:26, 6.34s/it] {'loss': 0.4925, 'learning_rate': 1.342020143325669e-05, 'epoch': 0.41} + 41%|████ | 2351/5772 [16:03<6:01:26, 6.34s/it] {'loss': 0.4925, 'learning_rate': 1.342020143325669e-05, 'epoch': 0.41} + 41%|████ | 2351/5772 [16:01<6:01:25, 6.34s/it] 41%|████ | 2352/5772 [16:09<5:59:51, 6.31s/it] 41%|████ | 2352/5772 [16:07<5:59:52, 6.31s/it] {'loss': 0.4636, 'learning_rate': 1.341492734826209e-05, 'epoch': 0.41} + 41%|████ | 2352/5772 [16:09<5:59:51, 6.31s/it] {'loss': 0.4636, 'learning_rate': 1.341492734826209e-05, 'epoch': 0.41} + 41%|████ | 2352/5772 [16:07<5:59:52, 6.31s/it] 41%|████ | 2353/5772 [16:15<5:56:30, 6.26s/it] 41%|████ | 2353/5772 [16:13<5:56:33, 6.26s/it] {'loss': 0.4786, 'learning_rate': 1.3409652187755114e-05, 'epoch': 0.41} + 41%|████ | 2353/5772 [16:15<5:56:30, 6.26s/it] {'loss': 0.4786, 'learning_rate': 1.3409652187755114e-05, 'epoch': 0.41} + 41%|████ | 2353/5772 [16:13<5:56:33, 6.26s/it] 41%|████ | 2354/5772 [16:22<5:55:26, 6.24s/it] 41%|████ | 2354/5772 [16:20<5:55:25, 6.24s/it] {'loss': 0.4604, 'learning_rate': 1.3404375953397136e-05, 'epoch': 0.41} + 41%|████ | 2354/5772 [16:22<5:55:26, 6.24s/it] {'loss': 0.4604, 'learning_rate': 1.3404375953397136e-05, 'epoch': 0.41} + 41%|████ | 2354/5772 [16:20<5:55:25, 6.24s/it] 41%|████ | 2355/5772 [16:26<6:00:55, 6.34s/it] 41%|████ | 2355/5772 [16:28<6:00:56, 6.34s/it] {'loss': 0.48, 'learning_rate': 1.339909864684988e-05, 'epoch': 0.41} + 41%|████ | 2355/5772 [16:28<6:00:56, 6.34s/it] {'loss': 0.48, 'learning_rate': 1.339909864684988e-05, 'epoch': 0.41} + 41%|████ | 2355/5772 [16:26<6:00:55, 6.34s/it] 41%|████ | 2356/5772 [16:32<5:55:34, 6.25s/it] 41%|████ | 2356/5772 [16:34<5:55:34, 6.25s/it] {'loss': 0.471, 'learning_rate': 1.3393820269775405e-05, 'epoch': 0.41} + 41%|████ | 2356/5772 [16:34<5:55:34, 6.25s/it] {'loss': 0.471, 'learning_rate': 1.3393820269775405e-05, 'epoch': 0.41} + 41%|████ | 2356/5772 [16:32<5:55:34, 6.25s/it] 41%|████ | 2357/5772 [16:41<6:02:21, 6.37s/it] 41%|████ | 2357/5772 [16:39<6:02:21, 6.37s/it] {'loss': 0.4639, 'learning_rate': 1.3388540823836103e-05, 'epoch': 0.41} + 41%|████ | 2357/5772 [16:41<6:02:21, 6.37s/it] {'loss': 0.4639, 'learning_rate': 1.3388540823836103e-05, 'epoch': 0.41} + 41%|████ | 2357/5772 [16:39<6:02:21, 6.37s/it] 41%|████ | 2358/5772 [16:45<5:56:55, 6.27s/it] 41%|████ | 2358/5772 [16:47<5:56:55, 6.27s/it] {'loss': 0.4765, 'learning_rate': 1.3383260310694712e-05, 'epoch': 0.41} + 41%|████ | 2358/5772 [16:47<5:56:55, 6.27s/it] {'loss': 0.4765, 'learning_rate': 1.3383260310694712e-05, 'epoch': 0.41} + 41%|████ | 2358/5772 [16:45<5:56:55, 6.27s/it] 41%|████ | 2359/5772 [16:51<5:54:57, 6.24s/it] 41%|████ | 2359/5772 [16:53<5:54:57, 6.24s/it] {'loss': 0.4856, 'learning_rate': 1.3377978732014295e-05, 'epoch': 0.41} + 41%|████ | 2359/5772 [16:53<5:54:57, 6.24s/it] {'loss': 0.4856, 'learning_rate': 1.3377978732014295e-05, 'epoch': 0.41} + 41%|████ | 2359/5772 [16:51<5:54:57, 6.24s/it] 41%|████ | 2360/5772 [16:59<5:53:52, 6.22s/it] 41%|████ | 2360/5772 [16:57<5:53:53, 6.22s/it] {'loss': 0.4787, 'learning_rate': 1.3372696089458264e-05, 'epoch': 0.41} + 41%|████ | 2360/5772 [16:59<5:53:52, 6.22s/it] {'loss': 0.4787, 'learning_rate': 1.3372696089458264e-05, 'epoch': 0.41} + 41%|████ | 2360/5772 [16:57<5:53:53, 6.22s/it] 41%|████ | 2361/5772 [17:05<5:50:46, 6.17s/it] 41%|████ | 2361/5772 [17:03<5:50:46, 6.17s/it] {'loss': 0.4897, 'learning_rate': 1.3367412384690346e-05, 'epoch': 0.41} + 41%|████ | 2361/5772 [17:05<5:50:46, 6.17s/it] {'loss': 0.4897, 'learning_rate': 1.3367412384690346e-05, 'epoch': 0.41} + 41%|████ | 2361/5772 [17:03<5:50:46, 6.17s/it] 41%|████ | 2362/5772 [17:10<5:54:55, 6.25s/it] 41%|████ | 2362/5772 [17:12<5:54:56, 6.25s/it] {'loss': 0.4831, 'learning_rate': 1.3362127619374622e-05, 'epoch': 0.41} + 41%|████ | 2362/5772 [17:12<5:54:56, 6.25s/it] {'loss': 0.4831, 'learning_rate': 1.3362127619374622e-05, 'epoch': 0.41} + 41%|████ | 2362/5772 [17:10<5:54:55, 6.25s/it] 41%|████ | 2363/5772 [17:16<5:55:35, 6.26s/it] 41%|████ | 2363/5772 [17:18<5:55:35, 6.26s/it] {'loss': 0.4737, 'learning_rate': 1.3356841795175494e-05, 'epoch': 0.41} + 41%|████ | 2363/5772 [17:18<5:55:35, 6.26s/it] {'loss': 0.4737, 'learning_rate': 1.3356841795175494e-05, 'epoch': 0.41} + 41%|████ | 2363/5772 [17:16<5:55:35, 6.26s/it] 41%|████ | 2364/5772 [17:22<5:54:59, 6.25s/it] 41%|████ | 2364/5772 [17:24<5:55:00, 6.25s/it] {'loss': 0.4593, 'learning_rate': 1.3351554913757712e-05, 'epoch': 0.41} + 41%|████ | 2364/5772 [17:24<5:55:00, 6.25s/it] {'loss': 0.4593, 'learning_rate': 1.3351554913757712e-05, 'epoch': 0.41} + 41%|████ | 2364/5772 [17:22<5:54:59, 6.25s/it] 41%|████ | 2365/5772 [17:29<5:56:50, 6.28s/it] 41%|████ | 2365/5772 [17:31<5:56:51, 6.28s/it] {'loss': 0.4735, 'learning_rate': 1.3346266976786341e-05, 'epoch': 0.41} + 41%|████ | 2365/5772 [17:31<5:56:51, 6.28s/it] {'loss': 0.4735, 'learning_rate': 1.3346266976786341e-05, 'epoch': 0.41} + 41%|████ | 2365/5772 [17:29<5:56:50, 6.28s/it] 41%|████ | 2366/5772 [17:35<5:51:03, 6.18s/it] 41%|████ | 2366/5772 [17:37<5:51:02, 6.18s/it] {'loss': 0.4674, 'learning_rate': 1.3340977985926793e-05, 'epoch': 0.41} + 41%|████ | 2366/5772 [17:37<5:51:02, 6.18s/it] {'loss': 0.4674, 'learning_rate': 1.3340977985926793e-05, 'epoch': 0.41} + 41%|████ | 2366/5772 [17:35<5:51:03, 6.18s/it] 41%|████ | 2367/5772 [17:41<5:51:02, 6.19s/it] 41%|████ | 2367/5772 [17:43<5:51:02, 6.19s/it] {'loss': 0.4677, 'learning_rate': 1.3335687942844806e-05, 'epoch': 0.41} + 41%|████ | 2367/5772 [17:43<5:51:02, 6.19s/it] {'loss': 0.4677, 'learning_rate': 1.3335687942844806e-05, 'epoch': 0.41} + 41%|████ | 2367/5772 [17:41<5:51:02, 6.19s/it] 41%|████ | 2368/5772 [17:47<5:51:58, 6.20s/it] 41%|████ | 2368/5772 [17:49<5:51:59, 6.20s/it] {'loss': 0.4696, 'learning_rate': 1.3330396849206447e-05, 'epoch': 0.41} + 41%|████ | 2368/5772 [17:49<5:51:59, 6.20s/it] {'loss': 0.4696, 'learning_rate': 1.3330396849206447e-05, 'epoch': 0.41} + 41%|████ | 2368/5772 [17:47<5:51:58, 6.20s/it] 41%|████ | 2369/5772 [17:53<5:53:20, 6.23s/it] 41%|████ | 2369/5772 [17:55<5:53:20, 6.23s/it] {'loss': 0.475, 'learning_rate': 1.3325104706678116e-05, 'epoch': 0.41} + {'loss': 0.475, 'learning_rate': 1.3325104706678116e-05, 'epoch': 0.41} 41%|████ | 2369/5772 [17:55<5:53:20, 6.23s/it] + 41%|████ | 2369/5772 [17:53<5:53:20, 6.23s/it] 41%|████ | 2370/5772 [17:59<5:50:33, 6.18s/it] 41%|████ | 2370/5772 [18:01<5:50:33, 6.18s/it] {'loss': 0.4709, 'learning_rate': 1.3319811516926541e-05, 'epoch': 0.41} + 41%|████ | 2370/5772 [18:01<5:50:33, 6.18s/it] {'loss': 0.4709, 'learning_rate': 1.3319811516926541e-05, 'epoch': 0.41} + 41%|████ | 2370/5772 [17:59<5:50:33, 6.18s/it] 41%|████ | 2371/5772 [18:06<5:53:27, 6.24s/it] 41%|████ | 2371/5772 [18:08<5:53:27, 6.24s/it] {'loss': 0.4784, 'learning_rate': 1.3314517281618794e-05, 'epoch': 0.41} + 41%|████ | 2371/5772 [18:08<5:53:27, 6.24s/it] {'loss': 0.4784, 'learning_rate': 1.3314517281618794e-05, 'epoch': 0.41} + 41%|████ | 2371/5772 [18:06<5:53:27, 6.24s/it] 41%|████ | 2372/5772 [18:14<5:53:33, 6.24s/it] 41%|████ | 2372/5772 [18:12<5:53:34, 6.24s/it] {'loss': 0.4635, 'learning_rate': 1.3309222002422255e-05, 'epoch': 0.41} + 41%|████ | 2372/5772 [18:14<5:53:33, 6.24s/it] {'loss': 0.4635, 'learning_rate': 1.3309222002422255e-05, 'epoch': 0.41} + 41%|████ | 2372/5772 [18:12<5:53:34, 6.24s/it] 41%|████ | 2373/5772 [18:18<5:51:31, 6.21s/it] 41%|████ | 2373/5772 [18:20<5:51:31, 6.21s/it] {'loss': 0.4752, 'learning_rate': 1.3303925681004649e-05, 'epoch': 0.41} + 41%|████ | 2373/5772 [18:20<5:51:31, 6.21s/it] {'loss': 0.4752, 'learning_rate': 1.3303925681004649e-05, 'epoch': 0.41} + 41%|████ | 2373/5772 [18:18<5:51:31, 6.21s/it] 41%|████ | 2374/5772 [18:24<5:49:44, 6.18s/it] 41%|████ | 2374/5772 [18:26<5:49:44, 6.18s/it] {'loss': 0.4705, 'learning_rate': 1.3298628319034014e-05, 'epoch': 0.41} + 41%|████ | 2374/5772 [18:26<5:49:44, 6.18s/it] {'loss': 0.4705, 'learning_rate': 1.3298628319034014e-05, 'epoch': 0.41} + 41%|████ | 2374/5772 [18:24<5:49:44, 6.18s/it] 41%|████ | 2375/5772 [18:30<5:51:59, 6.22s/it] 41%|████ | 2375/5772 [18:32<5:51:59, 6.22s/it] {'loss': 0.4671, 'learning_rate': 1.3293329918178737e-05, 'epoch': 0.41} + 41%|████ | 2375/5772 [18:32<5:51:59, 6.22s/it] {'loss': 0.4671, 'learning_rate': 1.3293329918178737e-05, 'epoch': 0.41} + 41%|████ | 2375/5772 [18:30<5:51:59, 6.22s/it] 41%|████ | 2376/5772 [18:36<5:46:14, 6.12s/it] 41%|████ | 2376/5772 [18:38<5:46:14, 6.12s/it] {'loss': 0.4841, 'learning_rate': 1.3288030480107508e-05, 'epoch': 0.41} + 41%|████ | 2376/5772 [18:38<5:46:14, 6.12s/it] {'loss': 0.4841, 'learning_rate': 1.3288030480107508e-05, 'epoch': 0.41} + 41%|████ | 2376/5772 [18:36<5:46:14, 6.12s/it] 41%|████ | 2377/5772 [18:44<5:46:11, 6.12s/it] 41%|████ | 2377/5772 [18:42<5:46:11, 6.12s/it] {'loss': 0.4869, 'learning_rate': 1.3282730006489361e-05, 'epoch': 0.41} + 41%|████ | 2377/5772 [18:44<5:46:11, 6.12s/it] {'loss': 0.4869, 'learning_rate': 1.3282730006489361e-05, 'epoch': 0.41} + 41%|████ | 2377/5772 [18:42<5:46:11, 6.12s/it] 41%|████ | 2378/5772 [18:49<5:46:05, 6.12s/it] 41%|████ | 2378/5772 [18:51<5:46:05, 6.12s/it] {'loss': 0.4633, 'learning_rate': 1.327742849899365e-05, 'epoch': 0.41} + 41%|████ | 2378/5772 [18:51<5:46:05, 6.12s/it] {'loss': 0.4633, 'learning_rate': 1.327742849899365e-05, 'epoch': 0.41} + 41%|████ | 2378/5772 [18:49<5:46:05, 6.12s/it] 41%|████ | 2379/5772 [18:55<5:43:12, 6.07s/it] 41%|████ | 2379/5772 [18:57<5:43:12, 6.07s/it] {'loss': 0.478, 'learning_rate': 1.3272125959290059e-05, 'epoch': 0.41} + 41%|████ | 2379/5772 [18:57<5:43:12, 6.07s/it] {'loss': 0.478, 'learning_rate': 1.3272125959290059e-05, 'epoch': 0.41} + 41%|████ | 2379/5772 [18:55<5:43:12, 6.07s/it] 41%|████ | 2380/5772 [19:01<5:47:47, 6.15s/it] 41%|████ | 2380/5772 [19:03<5:47:47, 6.15s/it] {'loss': 0.4815, 'learning_rate': 1.326682238904859e-05, 'epoch': 0.41} + 41%|████ | 2380/5772 [19:03<5:47:47, 6.15s/it] {'loss': 0.4815, 'learning_rate': 1.326682238904859e-05, 'epoch': 0.41} + 41%|████ | 2380/5772 [19:01<5:47:47, 6.15s/it] 41%|████▏ | 2381/5772 [19:09<5:40:47, 6.03s/it] 41%|████▏ | 2381/5772 [19:07<5:40:47, 6.03s/it] {'loss': 0.4784, 'learning_rate': 1.326151778993957e-05, 'epoch': 0.41} + 41%|████▏ | 2381/5772 [19:09<5:40:47, 6.03s/it] {'loss': 0.4784, 'learning_rate': 1.326151778993957e-05, 'epoch': 0.41} + 41%|████▏ | 2381/5772 [19:07<5:40:47, 6.03s/it] 41%|████▏ | 2382/5772 [19:13<5:39:33, 6.01s/it] 41%|████▏ | 2382/5772 [19:15<5:39:33, 6.01s/it] {'loss': 0.4718, 'learning_rate': 1.325621216363366e-05, 'epoch': 0.41} + 41%|████▏ | 2382/5772 [19:15<5:39:33, 6.01s/it] {'loss': 0.4718, 'learning_rate': 1.325621216363366e-05, 'epoch': 0.41} + 41%|████▏ | 2382/5772 [19:13<5:39:33, 6.01s/it] 41%|████▏ | 2383/5772 [19:21<5:39:39, 6.01s/it] 41%|████▏ | 2383/5772 [19:19<5:39:39, 6.01s/it] {'loss': 0.472, 'learning_rate': 1.3250905511801831e-05, 'epoch': 0.41} + 41%|████▏ | 2383/5772 [19:21<5:39:39, 6.01s/it] {'loss': 0.472, 'learning_rate': 1.3250905511801831e-05, 'epoch': 0.41} + 41%|████▏ | 2383/5772 [19:19<5:39:39, 6.01s/it] 41%|████▏ | 2384/5772 [19:27<5:37:21, 5.97s/it] 41%|████▏ | 2384/5772 [19:25<5:37:21, 5.97s/it] {'loss': 0.4734, 'learning_rate': 1.3245597836115386e-05, 'epoch': 0.41} + 41%|████▏ | 2384/5772 [19:27<5:37:21, 5.97s/it] {'loss': 0.4734, 'learning_rate': 1.3245597836115386e-05, 'epoch': 0.41} + 41%|████▏ | 2384/5772 [19:25<5:37:21, 5.97s/it] 41%|████▏ | 2385/5772 [19:31<5:37:26, 5.98s/it] 41%|████▏ | 2385/5772 [19:32<5:37:26, 5.98s/it] {'loss': 0.4684, 'learning_rate': 1.3240289138245949e-05, 'epoch': 0.41} + 41%|████▏ | 2385/5772 [19:32<5:37:26, 5.98s/it] {'loss': 0.4684, 'learning_rate': 1.3240289138245949e-05, 'epoch': 0.41} + 41%|████▏ | 2385/5772 [19:31<5:37:26, 5.98s/it] 41%|████▏ | 2386/5772 [19:37<5:40:03, 6.03s/it] 41%|████▏ | 2386/5772 [19:39<5:40:03, 6.03s/it] {'loss': 0.4808, 'learning_rate': 1.3234979419865466e-05, 'epoch': 0.41} + 41%|████▏ | 2386/5772 [19:39<5:40:03, 6.03s/it] {'loss': 0.4808, 'learning_rate': 1.3234979419865466e-05, 'epoch': 0.41} + 41%|████▏ | 2386/5772 [19:37<5:40:03, 6.03s/it] 41%|████▏ | 2387/5772 [19:43<5:46:08, 6.14s/it] 41%|████▏ | 2387/5772 [19:45<5:46:08, 6.14s/it] {'loss': 0.4775, 'learning_rate': 1.3229668682646197e-05, 'epoch': 0.41} + 41%|████▏ | 2387/5772 [19:45<5:46:08, 6.14s/it] {'loss': 0.4775, 'learning_rate': 1.3229668682646197e-05, 'epoch': 0.41} + 41%|████▏ | 2387/5772 [19:43<5:46:08, 6.14s/it] 41%|████▏ | 2388/5772 [19:49<5:46:27, 6.14s/it] 41%|████▏ | 2388/5772 [19:51<5:46:27, 6.14s/it] {'loss': 0.467, 'learning_rate': 1.3224356928260735e-05, 'epoch': 0.41} + 41%|████▏ | 2388/5772 [19:51<5:46:27, 6.14s/it] {'loss': 0.467, 'learning_rate': 1.3224356928260735e-05, 'epoch': 0.41} + 41%|████▏ | 2388/5772 [19:49<5:46:27, 6.14s/it] 41%|████▏ | 2389/5772 [19:55<5:43:37, 6.09s/it] 41%|████▏ | 2389/5772 [19:57<5:43:37, 6.09s/it] {'loss': 0.4793, 'learning_rate': 1.3219044158381988e-05, 'epoch': 0.41} + 41%|████▏ | 2389/5772 [19:57<5:43:37, 6.09s/it] {'loss': 0.4793, 'learning_rate': 1.3219044158381988e-05, 'epoch': 0.41} + 41%|████▏ | 2389/5772 [19:55<5:43:37, 6.09s/it] 41%|████▏ | 2390/5772 [20:01<5:45:32, 6.13s/it] 41%|████▏ | 2390/5772 [20:03<5:45:32, 6.13s/it] {'loss': 0.4763, 'learning_rate': 1.321373037468318e-05, 'epoch': 0.41} + 41%|████▏ | 2390/5772 [20:03<5:45:32, 6.13s/it] {'loss': 0.4763, 'learning_rate': 1.321373037468318e-05, 'epoch': 0.41} + 41%|████▏ | 2390/5772 [20:01<5:45:32, 6.13s/it] 41%|████▏ | 2391/5772 [20:09<5:41:51, 6.07s/it] 41%|████▏ | 2391/5772 [20:07<5:41:51, 6.07s/it] {'loss': 0.4783, 'learning_rate': 1.3208415578837859e-05, 'epoch': 0.41} + 41%|████▏ | 2391/5772 [20:09<5:41:51, 6.07s/it] {'loss': 0.4783, 'learning_rate': 1.3208415578837859e-05, 'epoch': 0.41} + 41%|████▏ | 2391/5772 [20:07<5:41:51, 6.07s/it] 41%|████▏ | 2392/5772 [20:15<5:38:34, 6.01s/it] 41%|████▏ | 2392/5772 [20:13<5:38:34, 6.01s/it] {'loss': 0.4776, 'learning_rate': 1.3203099772519889e-05, 'epoch': 0.41} + 41%|████▏ | 2392/5772 [20:15<5:38:34, 6.01s/it] {'loss': 0.4776, 'learning_rate': 1.3203099772519889e-05, 'epoch': 0.41} + 41%|████▏ | 2392/5772 [20:13<5:38:34, 6.01s/it] 41%|████▏ | 2393/5772 [20:21<5:40:49, 6.05s/it] 41%|████▏ | 2393/5772 [20:19<5:40:49, 6.05s/it] {'loss': 0.4723, 'learning_rate': 1.3197782957403458e-05, 'epoch': 0.41} + 41%|████▏ | 2393/5772 [20:21<5:40:49, 6.05s/it] {'loss': 0.4723, 'learning_rate': 1.3197782957403458e-05, 'epoch': 0.41} + 41%|████▏ | 2393/5772 [20:19<5:40:49, 6.05s/it] 41%|████▏ | 2394/5772 [20:26<5:46:11, 6.15s/it] 41%|████▏ | 2394/5772 [20:28<5:46:12, 6.15s/it] {'loss': 0.4635, 'learning_rate': 1.3192465135163062e-05, 'epoch': 0.41} + 41%|████▏ | 2394/5772 [20:28<5:46:12, 6.15s/it] {'loss': 0.4635, 'learning_rate': 1.3192465135163062e-05, 'epoch': 0.41} + 41%|████▏ | 2394/5772 [20:26<5:46:11, 6.15s/it] 41%|████▏ | 2395/5772 [20:32<5:42:33, 6.09s/it] 41%|████▏ | 2395/5772 [20:34<5:42:33, 6.09s/it] {'loss': 0.4769, 'learning_rate': 1.3187146307473521e-05, 'epoch': 0.41} + 41%|████▏ | 2395/5772 [20:34<5:42:33, 6.09s/it] {'loss': 0.4769, 'learning_rate': 1.3187146307473521e-05, 'epoch': 0.41} + 41%|████▏ | 2395/5772 [20:32<5:42:33, 6.09s/it] 42%|████▏ | 2396/5772 [20:38<5:47:23, 6.17s/it] 42%|████▏ | 2396/5772 [20:40<5:47:23, 6.17s/it] {'loss': 0.4628, 'learning_rate': 1.3181826476009974e-05, 'epoch': 0.42} + 42%|████▏ | 2396/5772 [20:40<5:47:23, 6.17s/it] {'loss': 0.4628, 'learning_rate': 1.3181826476009974e-05, 'epoch': 0.42} + 42%|████▏ | 2396/5772 [20:38<5:47:23, 6.17s/it] 42%|████▏ | 2397/5772 [20:46<5:45:56, 6.15s/it] 42%|████▏ | 2397/5772 [20:44<5:45:56, 6.15s/it] {'loss': 0.4688, 'learning_rate': 1.317650564244787e-05, 'epoch': 0.42} + 42%|████▏ | 2397/5772 [20:46<5:45:56, 6.15s/it] {'loss': 0.4688, 'learning_rate': 1.317650564244787e-05, 'epoch': 0.42} + 42%|████▏ | 2397/5772 [20:44<5:45:56, 6.15s/it] 42%|████▏ | 2398/5772 [20:50<5:46:33, 6.16s/it] 42%|████▏ | 2398/5772 [20:52<5:46:33, 6.16s/it] {'loss': 0.4708, 'learning_rate': 1.3171183808462969e-05, 'epoch': 0.42} + 42%|████▏ | 2398/5772 [20:52<5:46:33, 6.16s/it] {'loss': 0.4708, 'learning_rate': 1.3171183808462969e-05, 'epoch': 0.42} + 42%|████▏ | 2398/5772 [20:50<5:46:33, 6.16s/it] 42%|████▏ | 2399/5772 [20:56<5:42:46, 6.10s/it] 42%|████▏ | 2399/5772 [20:58<5:42:46, 6.10s/it] {'loss': 0.4826, 'learning_rate': 1.3165860975731363e-05, 'epoch': 0.42} + 42%|████▏ | 2399/5772 [20:58<5:42:46, 6.10s/it] {'loss': 0.4826, 'learning_rate': 1.3165860975731363e-05, 'epoch': 0.42} + 42%|████▏ | 2399/5772 [20:56<5:42:46, 6.10s/it]1291110 4 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + +14 AutoResumeHook: Checking whether to suspend... +1513 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...3AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +70 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 2400/5772 [21:02<5:39:58, 6.05s/it] 42%|████▏ | 2400/5772 [21:04<5:39:57, 6.05s/it]2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4562, 'learning_rate': 1.3160537145929447e-05, 'epoch': 0.42} + 42%|████▏ | 2400/5772 [21:04<5:39:57, 6.05s/it] {'loss': 0.4562, 'learning_rate': 1.3160537145929447e-05, 'epoch': 0.42} + 42%|████▏ | 2400/5772 [21:02<5:39:58, 6.05s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 42%|████▏ | 2401/5772 [21:34<12:26:38, 13.29s/it] 42%|████▏ | 2401/5772 [21:32<12:26:38, 13.29s/it] {'loss': 0.4821, 'learning_rate': 1.3155212320733925e-05, 'epoch': 0.42} + 42%|████▏ | 2401/5772 [21:34<12:26:38, 13.29s/it] {'loss': 0.4821, 'learning_rate': 1.3155212320733925e-05, 'epoch': 0.42} + 42%|████▏ | 2401/5772 [21:32<12:26:38, 13.29s/it] 42%|████▏ | 2402/5772 [21:40<10:24:48, 11.12s/it] 42%|████▏ | 2402/5772 [21:38<10:24:49, 11.12s/it] {'loss': 0.4732, 'learning_rate': 1.3149886501821831e-05, 'epoch': 0.42} + 42%|████▏ | 2402/5772 [21:40<10:24:48, 11.12s/it] {'loss': 0.4732, 'learning_rate': 1.3149886501821831e-05, 'epoch': 0.42} + 42%|████▏ | 2402/5772 [21:38<10:24:49, 11.12s/it] 42%|████▏ | 2403/5772 [21:46<8:58:10, 9.58s/it] 42%|████▏ | 2403/5772 [21:44<8:58:10, 9.58s/it] {'loss': 0.4798, 'learning_rate': 1.3144559690870494e-05, 'epoch': 0.42} + 42%|████▏ | 2403/5772 [21:46<8:58:10, 9.58s/it] {'loss': 0.4798, 'learning_rate': 1.3144559690870494e-05, 'epoch': 0.42} + 42%|████▏ | 2403/5772 [21:44<8:58:10, 9.58s/it] 42%|████▏ | 2404/5772 [21:53<8:00:45, 8.56s/it] 42%|████▏ | 2404/5772 [21:51<8:00:45, 8.56s/it] {'loss': 0.4572, 'learning_rate': 1.3139231889557568e-05, 'epoch': 0.42} + 42%|████▏ | 2404/5772 [21:53<8:00:45, 8.56s/it] {'loss': 0.4572, 'learning_rate': 1.3139231889557568e-05, 'epoch': 0.42} + 42%|████▏ | 2404/5772 [21:51<8:00:45, 8.56s/it] 42%|████▏ | 2405/5772 [21:58<7:14:05, 7.74s/it] 42%|████▏ | 2405/5772 [21:56<7:14:04, 7.74s/it] {'loss': 0.4702, 'learning_rate': 1.313390309956101e-05, 'epoch': 0.42} + 42%|████▏ | 2405/5772 [21:58<7:14:05, 7.74s/it] {'loss': 0.4702, 'learning_rate': 1.313390309956101e-05, 'epoch': 0.42} + 42%|████▏ | 2405/5772 [21:56<7:14:04, 7.74s/it] 42%|████▏ | 2406/5772 [22:03<6:57:17, 7.44s/it] 42%|████▏ | 2406/5772 [22:05<6:57:17, 7.44s/it] {'loss': 0.4806, 'learning_rate': 1.3128573322559097e-05, 'epoch': 0.42} + 42%|████▏ | 2406/5772 [22:05<6:57:17, 7.44s/it] {'loss': 0.4806, 'learning_rate': 1.3128573322559097e-05, 'epoch': 0.42} + 42%|████▏ | 2406/5772 [22:03<6:57:17, 7.44s/it] 42%|████▏ | 2407/5772 [22:09<6:32:28, 7.00s/it] 42%|████▏ | 2407/5772 [22:11<6:32:28, 7.00s/it] {'loss': 0.4686, 'learning_rate': 1.3123242560230408e-05, 'epoch': 0.42} + 42%|████▏ | 2407/5772 [22:11<6:32:28, 7.00s/it] {'loss': 0.4686, 'learning_rate': 1.3123242560230408e-05, 'epoch': 0.42} + 42%|████▏ | 2407/5772 [22:09<6:32:28, 7.00s/it] 42%|████▏ | 2408/5772 [22:15<6:16:56, 6.72s/it] 42%|████▏ | 2408/5772 [22:17<6:16:56, 6.72s/it] {'loss': 0.4713, 'learning_rate': 1.3117910814253845e-05, 'epoch': 0.42} + 42%|████▏ | 2408/5772 [22:17<6:16:56, 6.72s/it] {'loss': 0.4713, 'learning_rate': 1.3117910814253845e-05, 'epoch': 0.42} + 42%|████▏ | 2408/5772 [22:15<6:16:56, 6.72s/it] 42%|████▏ | 2409/5772 [22:24<6:11:21, 6.63s/it] 42%|████▏ | 2409/5772 [22:22<6:11:21, 6.63s/it] {'loss': 0.4625, 'learning_rate': 1.3112578086308602e-05, 'epoch': 0.42} + 42%|████▏ | 2409/5772 [22:24<6:11:21, 6.63s/it] {'loss': 0.4625, 'learning_rate': 1.3112578086308602e-05, 'epoch': 0.42} + 42%|████▏ | 2409/5772 [22:22<6:11:21, 6.63s/it] 42%|████▏ | 2410/5772 [22:30<6:03:42, 6.49s/it] 42%|████▏ | 2410/5772 [22:28<6:03:43, 6.49s/it] {'loss': 0.4679, 'learning_rate': 1.3107244378074197e-05, 'epoch': 0.42} + 42%|████▏ | 2410/5772 [22:30<6:03:42, 6.49s/it] {'loss': 0.4679, 'learning_rate': 1.3107244378074197e-05, 'epoch': 0.42} + 42%|████▏ | 2410/5772 [22:28<6:03:43, 6.49s/it] 42%|████▏ | 2411/5772 [22:35<5:49:53, 6.25s/it] 42%|████▏ | 2411/5772 [22:33<5:49:53, 6.25s/it] {'loss': 0.4695, 'learning_rate': 1.3101909691230456e-05, 'epoch': 0.42} + 42%|████▏ | 2411/5772 [22:35<5:49:53, 6.25s/it] {'loss': 0.4695, 'learning_rate': 1.3101909691230456e-05, 'epoch': 0.42} + 42%|████▏ | 2411/5772 [22:33<5:49:53, 6.25s/it] 42%|████▏ | 2412/5772 [22:39<5:44:53, 6.16s/it] 42%|████▏ | 2412/5772 [22:41<5:44:54, 6.16s/it] {'loss': 0.4692, 'learning_rate': 1.3096574027457503e-05, 'epoch': 0.42} + 42%|████▏ | 2412/5772 [22:41<5:44:54, 6.16s/it] {'loss': 0.4692, 'learning_rate': 1.3096574027457503e-05, 'epoch': 0.42} + 42%|████▏ | 2412/5772 [22:39<5:44:53, 6.16s/it] 42%|████▏ | 2413/5772 [22:47<5:42:44, 6.12s/it] 42%|████▏ | 2413/5772 [22:45<5:42:44, 6.12s/it] {'loss': 0.4807, 'learning_rate': 1.3091237388435773e-05, 'epoch': 0.42} + 42%|████▏ | 2413/5772 [22:47<5:42:44, 6.12s/it] {'loss': 0.4807, 'learning_rate': 1.3091237388435773e-05, 'epoch': 0.42} + 42%|████▏ | 2413/5772 [22:45<5:42:44, 6.12s/it] 42%|████▏ | 2414/5772 [22:52<5:50:39, 6.27s/it] 42%|████▏ | 2414/5772 [22:54<5:50:40, 6.27s/it] {'loss': 0.4654, 'learning_rate': 1.3085899775846018e-05, 'epoch': 0.42} + 42%|████▏ | 2414/5772 [22:54<5:50:40, 6.27s/it] {'loss': 0.4654, 'learning_rate': 1.3085899775846018e-05, 'epoch': 0.42} + 42%|████▏ | 2414/5772 [22:52<5:50:39, 6.27s/it] 42%|████▏ | 2415/5772 [22:58<5:47:59, 6.22s/it] 42%|████▏ | 2415/5772 [23:00<5:47:58, 6.22s/it] {'loss': 0.4757, 'learning_rate': 1.3080561191369286e-05, 'epoch': 0.42} + 42%|████▏ | 2415/5772 [23:00<5:47:58, 6.22s/it] {'loss': 0.4757, 'learning_rate': 1.3080561191369286e-05, 'epoch': 0.42} + 42%|████▏ | 2415/5772 [22:58<5:47:59, 6.22s/it] 42%|████▏ | 2416/5772 [23:06<5:46:33, 6.20s/it] 42%|████▏ | 2416/5772 [23:04<5:46:33, 6.20s/it] {'loss': 0.465, 'learning_rate': 1.3075221636686935e-05, 'epoch': 0.42} + 42%|████▏ | 2416/5772 [23:06<5:46:33, 6.20s/it] {'loss': 0.465, 'learning_rate': 1.3075221636686935e-05, 'epoch': 0.42} + 42%|████▏ | 2416/5772 [23:04<5:46:33, 6.20s/it] 42%|████▏ | 2417/5772 [23:12<5:41:52, 6.11s/it] 42%|████▏ | 2417/5772 [23:10<5:41:52, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.3069881113480629e-05, 'epoch': 0.42} + 42%|████▏ | 2417/5772 [23:12<5:41:52, 6.11s/it] {'loss': 0.4844, 'learning_rate': 1.3069881113480629e-05, 'epoch': 0.42} + 42%|████▏ | 2417/5772 [23:10<5:41:52, 6.11s/it] 42%|████▏ | 2418/5772 [23:16<5:38:47, 6.06s/it] 42%|████▏ | 2418/5772 [23:18<5:38:47, 6.06s/it] {'loss': 0.4861, 'learning_rate': 1.3064539623432331e-05, 'epoch': 0.42} + 42%|████▏ | 2418/5772 [23:18<5:38:47, 6.06s/it] {'loss': 0.4861, 'learning_rate': 1.3064539623432331e-05, 'epoch': 0.42} + 42%|████▏ | 2418/5772 [23:16<5:38:47, 6.06s/it] 42%|████▏ | 2419/5772 [23:25<5:43:30, 6.15s/it] 42%|████▏ | 2419/5772 [23:23<5:43:31, 6.15s/it] {'loss': 0.4714, 'learning_rate': 1.305919716822432e-05, 'epoch': 0.42} + 42%|████▏ | 2419/5772 [23:25<5:43:30, 6.15s/it] {'loss': 0.4714, 'learning_rate': 1.305919716822432e-05, 'epoch': 0.42} + 42%|████▏ | 2419/5772 [23:23<5:43:31, 6.15s/it] 42%|████▏ | 2420/5772 [23:29<5:52:46, 6.31s/it] 42%|████▏ | 2420/5772 [23:31<5:52:47, 6.31s/it] {'loss': 0.4671, 'learning_rate': 1.3053853749539169e-05, 'epoch': 0.42} + 42%|████▏ | 2420/5772 [23:31<5:52:47, 6.31s/it] {'loss': 0.4671, 'learning_rate': 1.3053853749539169e-05, 'epoch': 0.42} + 42%|████▏ | 2420/5772 [23:29<5:52:46, 6.31s/it] 42%|████▏ | 2421/5772 [23:36<5:52:47, 6.32s/it] 42%|████▏ | 2421/5772 [23:38<5:52:48, 6.32s/it] {'loss': 0.4642, 'learning_rate': 1.3048509369059762e-05, 'epoch': 0.42} + 42%|████▏ | 2421/5772 [23:38<5:52:48, 6.32s/it] {'loss': 0.4642, 'learning_rate': 1.3048509369059762e-05, 'epoch': 0.42} + 42%|████▏ | 2421/5772 [23:36<5:52:47, 6.32s/it] 42%|████▏ | 2422/5772 [23:42<5:59:19, 6.44s/it] 42%|████▏ | 2422/5772 [23:44<5:59:20, 6.44s/it] {'loss': 0.4752, 'learning_rate': 1.3043164028469274e-05, 'epoch': 0.42} + 42%|████▏ | 2422/5772 [23:44<5:59:20, 6.44s/it] {'loss': 0.4752, 'learning_rate': 1.3043164028469274e-05, 'epoch': 0.42} + 42%|████▏ | 2422/5772 [23:42<5:59:19, 6.44s/it] 42%|████▏ | 2423/5772 [23:50<5:48:28, 6.24s/it] 42%|████▏ | 2423/5772 [23:48<5:48:28, 6.24s/it] {'loss': 0.4585, 'learning_rate': 1.3037817729451199e-05, 'epoch': 0.42} + 42%|████▏ | 2423/5772 [23:50<5:48:28, 6.24s/it] {'loss': 0.4585, 'learning_rate': 1.3037817729451199e-05, 'epoch': 0.42} + 42%|████▏ | 2423/5772 [23:48<5:48:28, 6.24s/it] 42%|████▏ | 2424/5772 [23:55<5:51:37, 6.30s/it] 42%|████▏ | 2424/5772 [23:56<5:51:37, 6.30s/it] {'loss': 0.4709, 'learning_rate': 1.3032470473689322e-05, 'epoch': 0.42} + 42%|████▏ | 2424/5772 [23:56<5:51:37, 6.30s/it] {'loss': 0.4709, 'learning_rate': 1.3032470473689322e-05, 'epoch': 0.42} + 42%|████▏ | 2424/5772 [23:55<5:51:37, 6.30s/it] 42%|████▏ | 2425/5772 [24:01<5:55:28, 6.37s/it] 42%|████▏ | 2425/5772 [24:03<5:55:28, 6.37s/it] {'loss': 0.4715, 'learning_rate': 1.3027122262867727e-05, 'epoch': 0.42} + 42%|████▏ | 2425/5772 [24:03<5:55:28, 6.37s/it] {'loss': 0.4715, 'learning_rate': 1.3027122262867727e-05, 'epoch': 0.42} + 42%|████▏ | 2425/5772 [24:01<5:55:28, 6.37s/it] 42%|████▏ | 2426/5772 [24:07<5:52:47, 6.33s/it] 42%|████▏ | 2426/5772 [24:09<5:52:47, 6.33s/it] {'loss': 0.4692, 'learning_rate': 1.3021773098670804e-05, 'epoch': 0.42} + 42%|████▏ | 2426/5772 [24:09<5:52:47, 6.33s/it] {'loss': 0.4692, 'learning_rate': 1.3021773098670804e-05, 'epoch': 0.42} + 42%|████▏ | 2426/5772 [24:07<5:52:47, 6.33s/it] 42%|████▏ | 2427/5772 [24:15<5:51:18, 6.30s/it] 42%|████▏ | 2427/5772 [24:14<5:51:18, 6.30s/it] {'loss': 0.4687, 'learning_rate': 1.301642298278325e-05, 'epoch': 0.42} + 42%|████▏ | 2427/5772 [24:15<5:51:18, 6.30s/it] {'loss': 0.4687, 'learning_rate': 1.301642298278325e-05, 'epoch': 0.42} + 42%|████▏ | 2427/5772 [24:14<5:51:18, 6.30s/it] 42%|████▏ | 2428/5772 [24:20<5:54:09, 6.35s/it] 42%|████▏ | 2428/5772 [24:22<5:54:09, 6.35s/it] {'loss': 0.479, 'learning_rate': 1.3011071916890049e-05, 'epoch': 0.42} + 42%|████▏ | 2428/5772 [24:22<5:54:09, 6.35s/it] {'loss': 0.479, 'learning_rate': 1.3011071916890049e-05, 'epoch': 0.42} + 42%|████▏ | 2428/5772 [24:20<5:54:09, 6.35s/it] 42%|████▏ | 2429/5772 [24:28<5:53:50, 6.35s/it] 42%|████▏ | 2429/5772 [24:26<5:53:50, 6.35s/it] {'loss': 0.4872, 'learning_rate': 1.3005719902676483e-05, 'epoch': 0.42} + 42%|████▏ | 2429/5772 [24:28<5:53:50, 6.35s/it] {'loss': 0.4872, 'learning_rate': 1.3005719902676483e-05, 'epoch': 0.42} + 42%|████▏ | 2429/5772 [24:26<5:53:50, 6.35s/it] 42%|████▏ | 2430/5772 [24:33<5:54:10, 6.36s/it] 42%|████▏ | 2430/5772 [24:35<5:54:10, 6.36s/it] {'loss': 0.4764, 'learning_rate': 1.300036694182815e-05, 'epoch': 0.42} + 42%|████▏ | 2430/5772 [24:35<5:54:10, 6.36s/it] {'loss': 0.4764, 'learning_rate': 1.300036694182815e-05, 'epoch': 0.42} + 42%|████▏ | 2430/5772 [24:33<5:54:10, 6.36s/it] 42%|████▏ | 2431/5772 [24:39<5:50:13, 6.29s/it] 42%|████▏ | 2431/5772 [24:41<5:50:13, 6.29s/it] {'loss': 0.4736, 'learning_rate': 1.2995013036030932e-05, 'epoch': 0.42} + 42%|████▏ | 2431/5772 [24:41<5:50:13, 6.29s/it] {'loss': 0.4736, 'learning_rate': 1.2995013036030932e-05, 'epoch': 0.42} + 42%|████▏ | 2431/5772 [24:39<5:50:13, 6.29s/it] 42%|████▏ | 2432/5772 [24:45<5:51:46, 6.32s/it] 42%|████▏ | 2432/5772 [24:47<5:51:47, 6.32s/it] {'loss': 0.4796, 'learning_rate': 1.2989658186971007e-05, 'epoch': 0.42} + 42%|████▏ | 2432/5772 [24:47<5:51:47, 6.32s/it] {'loss': 0.4796, 'learning_rate': 1.2989658186971007e-05, 'epoch': 0.42} + 42%|████▏ | 2432/5772 [24:45<5:51:46, 6.32s/it] 42%|████▏ | 2433/5772 [24:51<5:48:24, 6.26s/it] 42%|████▏ | 2433/5772 [24:53<5:48:24, 6.26s/it] {'loss': 0.4766, 'learning_rate': 1.298430239633486e-05, 'epoch': 0.42} + 42%|████▏ | 2433/5772 [24:53<5:48:24, 6.26s/it] {'loss': 0.4766, 'learning_rate': 1.298430239633486e-05, 'epoch': 0.42} + 42%|████▏ | 2433/5772 [24:51<5:48:24, 6.26s/it] 42%|████▏ | 2434/5772 [24:57<5:45:05, 6.20s/it] 42%|████▏ | 2434/5772 [24:59<5:45:05, 6.20s/it] {'loss': 0.4731, 'learning_rate': 1.2978945665809267e-05, 'epoch': 0.42} + 42%|████▏ | 2434/5772 [24:59<5:45:05, 6.20s/it] {'loss': 0.4731, 'learning_rate': 1.2978945665809267e-05, 'epoch': 0.42} + 42%|████▏ | 2434/5772 [24:57<5:45:05, 6.20s/it] 42%|████▏ | 2435/5772 [25:04<5:49:16, 6.28s/it] 42%|████▏ | 2435/5772 [25:06<5:49:16, 6.28s/it] {'loss': 0.4743, 'learning_rate': 1.2973587997081298e-05, 'epoch': 0.42} + 42%|████▏ | 2435/5772 [25:06<5:49:16, 6.28s/it] {'loss': 0.4743, 'learning_rate': 1.2973587997081298e-05, 'epoch': 0.42} + 42%|████▏ | 2435/5772 [25:04<5:49:16, 6.28s/it] 42%|████▏ | 2436/5772 [25:12<5:49:09, 6.28s/it] 42%|████▏ | 2436/5772 [25:10<5:49:09, 6.28s/it] {'loss': 0.4577, 'learning_rate': 1.2968229391838322e-05, 'epoch': 0.42} + 42%|████▏ | 2436/5772 [25:12<5:49:09, 6.28s/it] {'loss': 0.4577, 'learning_rate': 1.2968229391838322e-05, 'epoch': 0.42} + 42%|████▏ | 2436/5772 [25:10<5:49:09, 6.28s/it] 42%|████▏ | 2437/5772 [25:17<5:53:44, 6.36s/it] 42%|████▏ | 2437/5772 [25:19<5:53:44, 6.36s/it] {'loss': 0.4817, 'learning_rate': 1.2962869851768008e-05, 'epoch': 0.42} + 42%|████▏ | 2437/5772 [25:19<5:53:44, 6.36s/it] {'loss': 0.4817, 'learning_rate': 1.2962869851768008e-05, 'epoch': 0.42} + 42%|████▏ | 2437/5772 [25:17<5:53:44, 6.36s/it] 42%|████▏ | 2438/5772 [25:25<5:54:43, 6.38s/it] 42%|████▏ | 2438/5772 [25:23<5:54:43, 6.38s/it] {'loss': 0.4717, 'learning_rate': 1.2957509378558301e-05, 'epoch': 0.42} + 42%|████▏ | 2438/5772 [25:25<5:54:43, 6.38s/it] {'loss': 0.4717, 'learning_rate': 1.2957509378558301e-05, 'epoch': 0.42} + 42%|████▏ | 2438/5772 [25:23<5:54:43, 6.38s/it] 42%|████▏ | 2439/5772 [25:31<5:48:45, 6.28s/it] 42%|████▏ | 2439/5772 [25:29<5:48:45, 6.28s/it] {'loss': 0.475, 'learning_rate': 1.2952147973897464e-05, 'epoch': 0.42} + 42%|████▏ | 2439/5772 [25:31<5:48:45, 6.28s/it] {'loss': 0.475, 'learning_rate': 1.2952147973897464e-05, 'epoch': 0.42} + 42%|████▏ | 2439/5772 [25:29<5:48:45, 6.28s/it] 42%|████▏ | 2440/5772 [25:37<5:48:20, 6.27s/it] 42%|████▏ | 2440/5772 [25:35<5:48:20, 6.27s/it] {'loss': 0.4607, 'learning_rate': 1.2946785639474034e-05, 'epoch': 0.42} + 42%|████▏ | 2440/5772 [25:37<5:48:20, 6.27s/it] {'loss': 0.4607, 'learning_rate': 1.2946785639474034e-05, 'epoch': 0.42} + 42%|████▏ | 2440/5772 [25:35<5:48:20, 6.27s/it] 42%|████▏ | 2441/5772 [25:43<5:43:09, 6.18s/it] 42%|████▏ | 2441/5772 [25:41<5:43:09, 6.18s/it] {'loss': 0.4735, 'learning_rate': 1.2941422376976851e-05, 'epoch': 0.42} + 42%|████▏ | 2441/5772 [25:43<5:43:09, 6.18s/it] {'loss': 0.4735, 'learning_rate': 1.2941422376976851e-05, 'epoch': 0.42} + 42%|████▏ | 2441/5772 [25:41<5:43:09, 6.18s/it] 42%|████▏ | 2442/5772 [25:50<5:42:32, 6.17s/it] 42%|████▏ | 2442/5772 [25:48<5:42:32, 6.17s/it] {'loss': 0.4658, 'learning_rate': 1.2936058188095045e-05, 'epoch': 0.42} + 42%|████▏ | 2442/5772 [25:50<5:42:32, 6.17s/it] {'loss': 0.4658, 'learning_rate': 1.2936058188095045e-05, 'epoch': 0.42} + 42%|████▏ | 2442/5772 [25:48<5:42:32, 6.17s/it] 42%|████▏ | 2443/5772 [25:55<5:37:58, 6.09s/it] 42%|████▏ | 2443/5772 [25:53<5:37:58, 6.09s/it] {'loss': 0.4786, 'learning_rate': 1.2930693074518038e-05, 'epoch': 0.42} + 42%|████▏ | 2443/5772 [25:55<5:37:58, 6.09s/it] {'loss': 0.4786, 'learning_rate': 1.2930693074518038e-05, 'epoch': 0.42} + 42%|████▏ | 2443/5772 [25:53<5:37:58, 6.09s/it] 42%|████▏ | 2444/5772 [26:02<5:38:55, 6.11s/it] 42%|████▏ | 2444/5772 [26:00<5:38:55, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.292532703793554e-05, 'epoch': 0.42} + 42%|████▏ | 2444/5772 [26:02<5:38:55, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.292532703793554e-05, 'epoch': 0.42} + 42%|████▏ | 2444/5772 [26:00<5:38:55, 6.11s/it] 42%|████▏ | 2445/5772 [26:08<5:36:17, 6.06s/it] 42%|████▏ | 2445/5772 [26:06<5:36:17, 6.06s/it] {'loss': 0.4765, 'learning_rate': 1.2919960080037557e-05, 'epoch': 0.42} + 42%|████▏ | 2445/5772 [26:08<5:36:17, 6.06s/it] {'loss': 0.4765, 'learning_rate': 1.2919960080037557e-05, 'epoch': 0.42} + 42%|████▏ | 2445/5772 [26:06<5:36:17, 6.06s/it] 42%|████▏ | 2446/5772 [26:14<5:38:59, 6.12s/it] 42%|████▏ | 2446/5772 [26:12<5:38:59, 6.12s/it] {'loss': 0.4822, 'learning_rate': 1.2914592202514385e-05, 'epoch': 0.42} + 42%|████▏ | 2446/5772 [26:14<5:38:59, 6.12s/it] {'loss': 0.4822, 'learning_rate': 1.2914592202514385e-05, 'epoch': 0.42} + 42%|████▏ | 2446/5772 [26:12<5:38:59, 6.12s/it] 42%|████▏ | 2447/5772 [26:20<5:38:16, 6.10s/it] 42%|████▏ | 2447/5772 [26:18<5:38:16, 6.10s/it] {'loss': 0.4684, 'learning_rate': 1.2909223407056599e-05, 'epoch': 0.42} + 42%|████▏ | 2447/5772 [26:20<5:38:16, 6.10s/it] {'loss': 0.4684, 'learning_rate': 1.2909223407056599e-05, 'epoch': 0.42} + 42%|████▏ | 2447/5772 [26:18<5:38:16, 6.10s/it] 42%|████▏ | 2448/5772 [26:26<5:36:17, 6.07s/it] 42%|████▏ | 2448/5772 [26:24<5:36:17, 6.07s/it] {'loss': 0.4725, 'learning_rate': 1.290385369535508e-05, 'epoch': 0.42} + 42%|████▏ | 2448/5772 [26:26<5:36:17, 6.07s/it] {'loss': 0.4725, 'learning_rate': 1.290385369535508e-05, 'epoch': 0.42} + 42%|████▏ | 2448/5772 [26:24<5:36:17, 6.07s/it] 42%|████▏ | 2449/5772 [26:32<5:39:15, 6.13s/it] 42%|████▏ | 2449/5772 [26:30<5:39:15, 6.13s/it] {'loss': 0.4593, 'learning_rate': 1.2898483069100982e-05, 'epoch': 0.42} + 42%|████▏ | 2449/5772 [26:32<5:39:15, 6.13s/it] {'loss': 0.4593, 'learning_rate': 1.2898483069100982e-05, 'epoch': 0.42} + 42%|████▏ | 2449/5772 [26:30<5:39:15, 6.13s/it]13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +8 12AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 2450/5772 [26:38<5:38:10, 6.11s/it]5 AutoResumeHook: Checking whether to suspend... +03 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 2450/5772 [26:36<5:38:11, 6.11s/it] {'loss': 0.4803, 'learning_rate': 1.2893111529985761e-05, 'epoch': 0.42} + 42%|████▏ | 2450/5772 [26:38<5:38:10, 6.11s/it] {'loss': 0.4803, 'learning_rate': 1.2893111529985761e-05, 'epoch': 0.42} + 42%|████▏ | 2450/5772 [26:36<5:38:11, 6.11s/it] 42%|████▏ | 2451/5772 [26:45<5:44:20, 6.22s/it] 42%|████▏ | 2451/5772 [26:43<5:44:20, 6.22s/it] {'loss': 0.4645, 'learning_rate': 1.2887739079701147e-05, 'epoch': 0.42} + 42%|████▏ | 2451/5772 [26:45<5:44:20, 6.22s/it] {'loss': 0.4645, 'learning_rate': 1.2887739079701147e-05, 'epoch': 0.42} + 42%|████▏ | 2451/5772 [26:43<5:44:20, 6.22s/it] 42%|████▏ | 2452/5772 [26:51<5:45:44, 6.25s/it] 42%|████▏ | 2452/5772 [26:49<5:45:45, 6.25s/it] {'loss': 0.4601, 'learning_rate': 1.2882365719939167e-05, 'epoch': 0.42} + 42%|████▏ | 2452/5772 [26:51<5:45:44, 6.25s/it] {'loss': 0.4601, 'learning_rate': 1.2882365719939167e-05, 'epoch': 0.42} + 42%|████▏ | 2452/5772 [26:49<5:45:45, 6.25s/it] 42%|████▏ | 2453/5772 [26:57<5:42:54, 6.20s/it] 42%|████▏ | 2453/5772 [26:55<5:42:54, 6.20s/it] {'loss': 0.4691, 'learning_rate': 1.2876991452392124e-05, 'epoch': 0.42} + 42%|████▏ | 2453/5772 [26:57<5:42:54, 6.20s/it] {'loss': 0.4691, 'learning_rate': 1.2876991452392124e-05, 'epoch': 0.42} + 42%|████▏ | 2453/5772 [26:55<5:42:54, 6.20s/it] 43%|████▎ | 2454/5772 [27:01<5:46:11, 6.26s/it] 43%|████▎ | 2454/5772 [27:03<5:46:11, 6.26s/it] {'loss': 0.482, 'learning_rate': 1.2871616278752628e-05, 'epoch': 0.43} + 43%|████▎ | 2454/5772 [27:03<5:46:11, 6.26s/it] {'loss': 0.482, 'learning_rate': 1.2871616278752628e-05, 'epoch': 0.43} + 43%|████▎ | 2454/5772 [27:01<5:46:11, 6.26s/it] 43%|████▎ | 2455/5772 [27:10<5:46:39, 6.27s/it] 43%|████▎ | 2455/5772 [27:08<5:46:39, 6.27s/it] {'loss': 0.4746, 'learning_rate': 1.2866240200713544e-05, 'epoch': 0.43} + 43%|████▎ | 2455/5772 [27:10<5:46:39, 6.27s/it] {'loss': 0.4746, 'learning_rate': 1.2866240200713544e-05, 'epoch': 0.43} + 43%|████▎ | 2455/5772 [27:08<5:46:39, 6.27s/it] 43%|████▎ | 2456/5772 [27:16<5:40:50, 6.17s/it] 43%|████▎ | 2456/5772 [27:14<5:40:50, 6.17s/it] {'loss': 0.4694, 'learning_rate': 1.2860863219968049e-05, 'epoch': 0.43} + 43%|████▎ | 2456/5772 [27:16<5:40:50, 6.17s/it] {'loss': 0.4694, 'learning_rate': 1.2860863219968049e-05, 'epoch': 0.43} + 43%|████▎ | 2456/5772 [27:14<5:40:50, 6.17s/it] 43%|████▎ | 2457/5772 [27:22<5:41:42, 6.18s/it] 43%|████▎ | 2457/5772 [27:20<5:41:41, 6.18s/it] {'loss': 0.4724, 'learning_rate': 1.285548533820959e-05, 'epoch': 0.43} + 43%|████▎ | 2457/5772 [27:22<5:41:42, 6.18s/it] {'loss': 0.4724, 'learning_rate': 1.285548533820959e-05, 'epoch': 0.43} + 43%|████▎ | 2457/5772 [27:20<5:41:41, 6.18s/it] 43%|████▎ | 2458/5772 [27:28<5:39:01, 6.14s/it] 43%|████▎ | 2458/5772 [27:26<5:39:01, 6.14s/it] {'loss': 0.4836, 'learning_rate': 1.2850106557131898e-05, 'epoch': 0.43} + 43%|████▎ | 2458/5772 [27:28<5:39:01, 6.14s/it] {'loss': 0.4836, 'learning_rate': 1.2850106557131898e-05, 'epoch': 0.43} + 43%|████▎ | 2458/5772 [27:26<5:39:01, 6.14s/it] 43%|████▎ | 2459/5772 [27:34<5:35:21, 6.07s/it] 43%|████▎ | 2459/5772 [27:32<5:35:21, 6.07s/it] {'loss': 0.4568, 'learning_rate': 1.2844726878428993e-05, 'epoch': 0.43} + 43%|████▎ | 2459/5772 [27:34<5:35:21, 6.07s/it] {'loss': 0.4568, 'learning_rate': 1.2844726878428993e-05, 'epoch': 0.43} + 43%|████▎ | 2459/5772 [27:32<5:35:21, 6.07s/it] 43%|████▎ | 2460/5772 [27:40<5:38:32, 6.13s/it] 43%|████▎ | 2460/5772 [27:38<5:38:32, 6.13s/it] {'loss': 0.4723, 'learning_rate': 1.2839346303795173e-05, 'epoch': 0.43} + 43%|████▎ | 2460/5772 [27:40<5:38:32, 6.13s/it] {'loss': 0.4723, 'learning_rate': 1.2839346303795173e-05, 'epoch': 0.43} + 43%|████▎ | 2460/5772 [27:38<5:38:32, 6.13s/it] 43%|████▎ | 2461/5772 [27:44<5:36:09, 6.09s/it] 43%|████▎ | 2461/5772 [27:46<5:36:09, 6.09s/it] {'loss': 0.4683, 'learning_rate': 1.2833964834925024e-05, 'epoch': 0.43} + 43%|████▎ | 2461/5772 [27:46<5:36:09, 6.09s/it] {'loss': 0.4683, 'learning_rate': 1.2833964834925024e-05, 'epoch': 0.43} + 43%|████▎ | 2461/5772 [27:44<5:36:09, 6.09s/it] 43%|████▎ | 2462/5772 [27:52<5:39:40, 6.16s/it] 43%|████▎ | 2462/5772 [27:50<5:39:40, 6.16s/it] {'loss': 0.4734, 'learning_rate': 1.2828582473513405e-05, 'epoch': 0.43} + 43%|████▎ | 2462/5772 [27:52<5:39:40, 6.16s/it] {'loss': 0.4734, 'learning_rate': 1.2828582473513405e-05, 'epoch': 0.43} + 43%|████▎ | 2462/5772 [27:50<5:39:40, 6.16s/it] 43%|████▎ | 2463/5772 [27:58<5:34:08, 6.06s/it] 43%|████▎ | 2463/5772 [27:56<5:34:08, 6.06s/it] {'loss': 0.4639, 'learning_rate': 1.2823199221255467e-05, 'epoch': 0.43} + 43%|████▎ | 2463/5772 [27:58<5:34:08, 6.06s/it] {'loss': 0.4639, 'learning_rate': 1.2823199221255467e-05, 'epoch': 0.43} + 43%|████▎ | 2463/5772 [27:56<5:34:08, 6.06s/it] 43%|████▎ | 2464/5772 [28:05<5:37:36, 6.12s/it] 43%|████▎ | 2464/5772 [28:03<5:37:37, 6.12s/it] {'loss': 0.4841, 'learning_rate': 1.2817815079846627e-05, 'epoch': 0.43} + 43%|████▎ | 2464/5772 [28:05<5:37:36, 6.12s/it] {'loss': 0.4841, 'learning_rate': 1.2817815079846627e-05, 'epoch': 0.43} + 43%|████▎ | 2464/5772 [28:03<5:37:37, 6.12s/it] 43%|████▎ | 2465/5772 [28:11<5:35:08, 6.08s/it] 43%|████▎ | 2465/5772 [28:09<5:35:08, 6.08s/it] {'loss': 0.4697, 'learning_rate': 1.2812430050982596e-05, 'epoch': 0.43} + 43%|████▎ | 2465/5772 [28:11<5:35:08, 6.08s/it] {'loss': 0.4697, 'learning_rate': 1.2812430050982596e-05, 'epoch': 0.43} + 43%|████▎ | 2465/5772 [28:09<5:35:08, 6.08s/it] 43%|████▎ | 2466/5772 [28:16<5:32:05, 6.03s/it] 43%|████▎ | 2466/5772 [28:14<5:32:05, 6.03s/it] {'loss': 0.4852, 'learning_rate': 1.2807044136359358e-05, 'epoch': 0.43} + 43%|████▎ | 2466/5772 [28:16<5:32:05, 6.03s/it] {'loss': 0.4852, 'learning_rate': 1.2807044136359358e-05, 'epoch': 0.43} + 43%|████▎ | 2466/5772 [28:14<5:32:05, 6.03s/it] 43%|████▎ | 2467/5772 [28:21<5:38:20, 6.14s/it] 43%|████▎ | 2467/5772 [28:23<5:38:21, 6.14s/it] {'loss': 0.4572, 'learning_rate': 1.2801657337673176e-05, 'epoch': 0.43} + 43%|████▎ | 2467/5772 [28:21<5:38:20, 6.14s/it] {'loss': 0.4572, 'learning_rate': 1.2801657337673176e-05, 'epoch': 0.43} + 43%|████▎ | 2467/5772 [28:23<5:38:21, 6.14s/it] 43%|████▎ | 2468/5772 [28:27<5:33:38, 6.06s/it] 43%|████▎ | 2468/5772 [28:29<5:33:38, 6.06s/it] {'loss': 0.4676, 'learning_rate': 1.2796269656620593e-05, 'epoch': 0.43} + 43%|████▎ | 2468/5772 [28:29<5:33:38, 6.06s/it] {'loss': 0.4676, 'learning_rate': 1.2796269656620593e-05, 'epoch': 0.43} + 43%|████▎ | 2468/5772 [28:27<5:33:38, 6.06s/it] 43%|████▎ | 2469/5772 [28:35<5:35:06, 6.09s/it] 43%|████▎ | 2469/5772 [28:33<5:35:06, 6.09s/it] {'loss': 0.4739, 'learning_rate': 1.2790881094898428e-05, 'epoch': 0.43} + 43%|████▎ | 2469/5772 [28:35<5:35:06, 6.09s/it] {'loss': 0.4739, 'learning_rate': 1.2790881094898428e-05, 'epoch': 0.43} + 43%|████▎ | 2469/5772 [28:33<5:35:06, 6.09s/it] 43%|████▎ | 2470/5772 [28:41<5:32:47, 6.05s/it] 43%|████▎ | 2470/5772 [28:39<5:32:47, 6.05s/it] {'loss': 0.465, 'learning_rate': 1.2785491654203781e-05, 'epoch': 0.43} + 43%|████▎ | 2470/5772 [28:41<5:32:47, 6.05s/it] {'loss': 0.465, 'learning_rate': 1.2785491654203781e-05, 'epoch': 0.43} + 43%|████▎ | 2470/5772 [28:39<5:32:47, 6.05s/it] 43%|████▎ | 2471/5772 [28:47<5:30:47, 6.01s/it] 43%|████▎ | 2471/5772 [28:45<5:30:46, 6.01s/it] {'loss': 0.4632, 'learning_rate': 1.2780101336234024e-05, 'epoch': 0.43} + 43%|████▎ | 2471/5772 [28:47<5:30:47, 6.01s/it] {'loss': 0.4632, 'learning_rate': 1.2780101336234024e-05, 'epoch': 0.43} + 43%|████▎ | 2471/5772 [28:45<5:30:46, 6.01s/it] 43%|████▎ | 2472/5772 [28:53<5:36:55, 6.13s/it] 43%|████▎ | 2472/5772 [28:51<5:36:55, 6.13s/it] {'loss': 0.4684, 'learning_rate': 1.277471014268681e-05, 'epoch': 0.43} + 43%|████▎ | 2472/5772 [28:53<5:36:55, 6.13s/it] {'loss': 0.4684, 'learning_rate': 1.277471014268681e-05, 'epoch': 0.43} + 43%|████▎ | 2472/5772 [28:51<5:36:55, 6.13s/it] 43%|████▎ | 2473/5772 [28:57<5:38:51, 6.16s/it] 43%|████▎ | 2473/5772 [28:59<5:38:51, 6.16s/it] {'loss': 0.466, 'learning_rate': 1.2769318075260064e-05, 'epoch': 0.43} + 43%|████▎ | 2473/5772 [28:59<5:38:51, 6.16s/it] {'loss': 0.466, 'learning_rate': 1.2769318075260064e-05, 'epoch': 0.43} + 43%|████▎ | 2473/5772 [28:57<5:38:51, 6.16s/it] 43%|████▎ | 2474/5772 [29:05<5:31:44, 6.04s/it] 43%|████▎ | 2474/5772 [29:03<5:31:44, 6.04s/it] {'loss': 0.4665, 'learning_rate': 1.2763925135651984e-05, 'epoch': 0.43} + 43%|████▎ | 2474/5772 [29:05<5:31:44, 6.04s/it] {'loss': 0.4665, 'learning_rate': 1.2763925135651984e-05, 'epoch': 0.43} + 43%|████▎ | 2474/5772 [29:03<5:31:44, 6.04s/it] 43%|████▎ | 2475/5772 [29:12<5:40:22, 6.19s/it] 43%|████▎ | 2475/5772 [29:10<5:40:22, 6.19s/it] {'loss': 0.4725, 'learning_rate': 1.2758531325561055e-05, 'epoch': 0.43} + 43%|████▎ | 2475/5772 [29:12<5:40:22, 6.19s/it] {'loss': 0.4725, 'learning_rate': 1.2758531325561055e-05, 'epoch': 0.43} + 43%|████▎ | 2475/5772 [29:10<5:40:22, 6.19s/it] 43%|████▎ | 2476/5772 [29:18<5:38:21, 6.16s/it] 43%|████▎ | 2476/5772 [29:16<5:38:21, 6.16s/it] {'loss': 0.4775, 'learning_rate': 1.275313664668602e-05, 'epoch': 0.43} + 43%|████▎ | 2476/5772 [29:18<5:38:21, 6.16s/it] {'loss': 0.4775, 'learning_rate': 1.275313664668602e-05, 'epoch': 0.43} + 43%|████▎ | 2476/5772 [29:16<5:38:21, 6.16s/it] 43%|████▎ | 2477/5772 [29:22<5:46:59, 6.32s/it] 43%|████▎ | 2477/5772 [29:24<5:47:00, 6.32s/it] {'loss': 0.4611, 'learning_rate': 1.2747741100725906e-05, 'epoch': 0.43} + 43%|████▎ | 2477/5772 [29:24<5:47:00, 6.32s/it] {'loss': 0.4611, 'learning_rate': 1.2747741100725906e-05, 'epoch': 0.43} + 43%|████▎ | 2477/5772 [29:22<5:46:59, 6.32s/it] 43%|████▎ | 2478/5772 [29:29<5:45:29, 6.29s/it] 43%|████▎ | 2478/5772 [29:31<5:45:29, 6.29s/it] {'loss': 0.4672, 'learning_rate': 1.274234468938001e-05, 'epoch': 0.43} + 43%|████▎ | 2478/5772 [29:31<5:45:29, 6.29s/it] {'loss': 0.4672, 'learning_rate': 1.274234468938001e-05, 'epoch': 0.43} + 43%|████▎ | 2478/5772 [29:29<5:45:29, 6.29s/it] 43%|████▎ | 2479/5772 [29:37<5:39:12, 6.18s/it] 43%|████▎ | 2479/5772 [29:35<5:39:12, 6.18s/it] {'loss': 0.4659, 'learning_rate': 1.27369474143479e-05, 'epoch': 0.43} + 43%|████▎ | 2479/5772 [29:37<5:39:12, 6.18s/it] {'loss': 0.4659, 'learning_rate': 1.27369474143479e-05, 'epoch': 0.43} + 43%|████▎ | 2479/5772 [29:35<5:39:12, 6.18s/it] 43%|████▎ | 2480/5772 [29:43<5:36:17, 6.13s/it] 43%|████▎ | 2480/5772 [29:41<5:36:17, 6.13s/it] {'loss': 0.4725, 'learning_rate': 1.273154927732942e-05, 'epoch': 0.43} + 43%|████▎ | 2480/5772 [29:43<5:36:17, 6.13s/it] {'loss': 0.4725, 'learning_rate': 1.273154927732942e-05, 'epoch': 0.43} + 43%|████▎ | 2480/5772 [29:41<5:36:17, 6.13s/it] 43%|████▎ | 2481/5772 [29:49<5:35:42, 6.12s/it] 43%|████▎ | 2481/5772 [29:47<5:35:42, 6.12s/it] {'loss': 0.4698, 'learning_rate': 1.2726150280024683e-05, 'epoch': 0.43} + 43%|████▎ | 2481/5772 [29:49<5:35:42, 6.12s/it] {'loss': 0.4698, 'learning_rate': 1.2726150280024683e-05, 'epoch': 0.43} + 43%|████▎ | 2481/5772 [29:47<5:35:42, 6.12s/it] 43%|████▎ | 2482/5772 [29:55<5:31:11, 6.04s/it] 43%|████▎ | 2482/5772 [29:53<5:31:11, 6.04s/it] {'loss': 0.4636, 'learning_rate': 1.2720750424134073e-05, 'epoch': 0.43} + 43%|████▎ | 2482/5772 [29:55<5:31:11, 6.04s/it] {'loss': 0.4636, 'learning_rate': 1.2720750424134073e-05, 'epoch': 0.43} + 43%|████▎ | 2482/5772 [29:53<5:31:11, 6.04s/it] 43%|████▎ | 2483/5772 [30:01<5:37:53, 6.16s/it] 43%|████▎ | 2483/5772 [29:59<5:37:53, 6.16s/it] {'loss': 0.4809, 'learning_rate': 1.2715349711358245e-05, 'epoch': 0.43} + 43%|████▎ | 2483/5772 [30:01<5:37:53, 6.16s/it] {'loss': 0.4809, 'learning_rate': 1.2715349711358245e-05, 'epoch': 0.43} + 43%|████▎ | 2483/5772 [29:59<5:37:53, 6.16s/it] 43%|████▎ | 2484/5772 [30:07<5:35:50, 6.13s/it] 43%|████▎ | 2484/5772 [30:05<5:35:50, 6.13s/it] {'loss': 0.4763, 'learning_rate': 1.270994814339812e-05, 'epoch': 0.43} + 43%|████▎ | 2484/5772 [30:07<5:35:50, 6.13s/it] {'loss': 0.4763, 'learning_rate': 1.270994814339812e-05, 'epoch': 0.43} + 43%|████▎ | 2484/5772 [30:05<5:35:50, 6.13s/it] 43%|████▎ | 2485/5772 [30:13<5:39:13, 6.19s/it] 43%|████▎ | 2485/5772 [30:11<5:39:13, 6.19s/it] {'loss': 0.476, 'learning_rate': 1.27045457219549e-05, 'epoch': 0.43} + 43%|████▎ | 2485/5772 [30:13<5:39:13, 6.19s/it] {'loss': 0.476, 'learning_rate': 1.27045457219549e-05, 'epoch': 0.43} + 43%|████▎ | 2485/5772 [30:11<5:39:13, 6.19s/it] 43%|████▎ | 2486/5772 [30:20<5:39:28, 6.20s/it] 43%|████▎ | 2486/5772 [30:18<5:39:28, 6.20s/it] {'loss': 0.4695, 'learning_rate': 1.2699142448730037e-05, 'epoch': 0.43} + 43%|████▎ | 2486/5772 [30:20<5:39:28, 6.20s/it] {'loss': 0.4695, 'learning_rate': 1.2699142448730037e-05, 'epoch': 0.43} + 43%|████▎ | 2486/5772 [30:18<5:39:28, 6.20s/it] 43%|████▎ | 2487/5772 [30:26<5:41:32, 6.24s/it] 43%|████▎ | 2487/5772 [30:24<5:41:32, 6.24s/it] {'loss': 0.4781, 'learning_rate': 1.2693738325425272e-05, 'epoch': 0.43} + 43%|████▎ | 2487/5772 [30:26<5:41:32, 6.24s/it] {'loss': 0.4781, 'learning_rate': 1.2693738325425272e-05, 'epoch': 0.43} + 43%|████▎ | 2487/5772 [30:24<5:41:32, 6.24s/it] 43%|████▎ | 2488/5772 [30:32<5:35:22, 6.13s/it] 43%|████▎ | 2488/5772 [30:30<5:35:22, 6.13s/it] {'loss': 0.4728, 'learning_rate': 1.268833335374259e-05, 'epoch': 0.43} + 43%|████▎ | 2488/5772 [30:32<5:35:22, 6.13s/it] {'loss': 0.4728, 'learning_rate': 1.268833335374259e-05, 'epoch': 0.43} + 43%|████▎ | 2488/5772 [30:30<5:35:22, 6.13s/it] 43%|████▎ | 2489/5772 [30:38<5:31:43, 6.06s/it] 43%|████▎ | 2489/5772 [30:36<5:31:43, 6.06s/it] {'loss': 0.4731, 'learning_rate': 1.2682927535384273e-05, 'epoch': 0.43} + 43%|████▎ | 2489/5772 [30:38<5:31:43, 6.06s/it] {'loss': 0.4731, 'learning_rate': 1.2682927535384273e-05, 'epoch': 0.43} + 43%|████▎ | 2489/5772 [30:36<5:31:43, 6.06s/it] 43%|████▎ | 2490/5772 [30:44<5:32:46, 6.08s/it] 43%|████▎ | 2490/5772 [30:42<5:32:46, 6.08s/it] {'loss': 0.4733, 'learning_rate': 1.2677520872052843e-05, 'epoch': 0.43} + 43%|████▎ | 2490/5772 [30:44<5:32:46, 6.08s/it] {'loss': 0.4733, 'learning_rate': 1.2677520872052843e-05, 'epoch': 0.43} + 43%|████▎ | 2490/5772 [30:42<5:32:46, 6.08s/it] 43%|████▎ | 2491/5772 [30:50<5:34:07, 6.11s/it] 43%|████▎ | 2491/5772 [30:48<5:34:07, 6.11s/it] {'loss': 0.4683, 'learning_rate': 1.2672113365451102e-05, 'epoch': 0.43} + 43%|████▎ | 2491/5772 [30:50<5:34:07, 6.11s/it] {'loss': 0.4683, 'learning_rate': 1.2672113365451102e-05, 'epoch': 0.43} + 43%|████▎ | 2491/5772 [30:48<5:34:07, 6.11s/it] 43%|████▎ | 2492/5772 [30:56<5:31:28, 6.06s/it] 43%|████▎ | 2492/5772 [30:54<5:31:29, 6.06s/it] {'loss': 0.4621, 'learning_rate': 1.2666705017282115e-05, 'epoch': 0.43} + 43%|████▎ | 2492/5772 [30:56<5:31:28, 6.06s/it] {'loss': 0.4621, 'learning_rate': 1.2666705017282115e-05, 'epoch': 0.43} + 43%|████▎ | 2492/5772 [30:54<5:31:29, 6.06s/it] 43%|████▎ | 2493/5772 [31:02<5:31:32, 6.07s/it] 43%|████▎ | 2493/5772 [31:00<5:31:31, 6.07s/it] {'loss': 0.4603, 'learning_rate': 1.2661295829249207e-05, 'epoch': 0.43} + 43%|████▎ | 2493/5772 [31:02<5:31:32, 6.07s/it] {'loss': 0.4603, 'learning_rate': 1.2661295829249207e-05, 'epoch': 0.43} + 43%|████▎ | 2493/5772 [31:00<5:31:31, 6.07s/it] 43%|████▎ | 2494/5772 [31:08<5:33:30, 6.10s/it] 43%|████▎ | 2494/5772 [31:06<5:33:30, 6.10s/it] {'loss': 0.4765, 'learning_rate': 1.2655885803055978e-05, 'epoch': 0.43} + 43%|████▎ | 2494/5772 [31:08<5:33:30, 6.10s/it] {'loss': 0.4765, 'learning_rate': 1.2655885803055978e-05, 'epoch': 0.43} + 43%|████▎ | 2494/5772 [31:06<5:33:30, 6.10s/it] 43%|████▎ | 2495/5772 [31:14<5:32:10, 6.08s/it] 43%|████▎ | 2495/5772 [31:12<5:32:10, 6.08s/it] {'loss': 0.465, 'learning_rate': 1.2650474940406279e-05, 'epoch': 0.43} + 43%|████▎ | 2495/5772 [31:14<5:32:10, 6.08s/it] {'loss': 0.465, 'learning_rate': 1.2650474940406279e-05, 'epoch': 0.43} + 43%|████▎ | 2495/5772 [31:12<5:32:10, 6.08s/it] 43%|████▎ | 2496/5772 [31:20<5:31:56, 6.08s/it] 43%|████▎ | 2496/5772 [31:18<5:31:56, 6.08s/it] {'loss': 0.4827, 'learning_rate': 1.2645063243004236e-05, 'epoch': 0.43} + 43%|████▎ | 2496/5772 [31:20<5:31:56, 6.08s/it] {'loss': 0.4827, 'learning_rate': 1.2645063243004236e-05, 'epoch': 0.43} + 43%|████▎ | 2496/5772 [31:18<5:31:56, 6.08s/it] 43%|████▎ | 2497/5772 [31:26<5:31:02, 6.06s/it] 43%|████▎ | 2497/5772 [31:24<5:31:02, 6.06s/it] {'loss': 0.4617, 'learning_rate': 1.263965071255423e-05, 'epoch': 0.43} + 43%|████▎ | 2497/5772 [31:26<5:31:02, 6.06s/it] {'loss': 0.4617, 'learning_rate': 1.263965071255423e-05, 'epoch': 0.43} + 43%|████▎ | 2497/5772 [31:24<5:31:02, 6.06s/it] 43%|████▎ | 2498/5772 [31:33<5:34:00, 6.12s/it] 43%|████▎ | 2498/5772 [31:31<5:34:01, 6.12s/it] {'loss': 0.479, 'learning_rate': 1.2634237350760912e-05, 'epoch': 0.43} + 43%|████▎ | 2498/5772 [31:33<5:34:00, 6.12s/it] {'loss': 0.479, 'learning_rate': 1.2634237350760912e-05, 'epoch': 0.43} + 43%|████▎ | 2498/5772 [31:31<5:34:01, 6.12s/it] 43%|████▎ | 2499/5772 [31:39<5:37:38, 6.19s/it] 43%|████▎ | 2499/5772 [31:37<5:37:37, 6.19s/it] {'loss': 0.4593, 'learning_rate': 1.2628823159329182e-05, 'epoch': 0.43} + 43%|████▎ | 2499/5772 [31:39<5:37:38, 6.19s/it] {'loss': 0.4593, 'learning_rate': 1.2628823159329182e-05, 'epoch': 0.43} + 43%|████▎ | 2499/5772 [31:37<5:37:37, 6.19s/it]11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 43%|████▎ | 2500/5772 [31:45<5:38:34, 6.21s/it]5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 43%|████▎ | 2500/5772 [31:43<5:38:34, 6.21s/it]2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.47, 'learning_rate': 1.2623408139964216e-05, 'epoch': 0.43} + 43%|████▎ | 2500/5772 [31:45<5:38:34, 6.21s/it] {'loss': 0.47, 'learning_rate': 1.2623408139964216e-05, 'epoch': 0.43} + 43%|████▎ | 2500/5772 [31:43<5:38:34, 6.21s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 43%|████▎ | 2501/5772 [32:07<10:18:02, 11.34s/it] 43%|████▎ | 2501/5772 [32:09<10:18:03, 11.34s/it] {'loss': 0.4627, 'learning_rate': 1.2617992294371444e-05, 'epoch': 0.43} + 43%|████▎ | 2501/5772 [32:09<10:18:03, 11.34s/it] {'loss': 0.4627, 'learning_rate': 1.2617992294371444e-05, 'epoch': 0.43} + 43%|████▎ | 2501/5772 [32:07<10:18:02, 11.34s/it] 43%|████▎ | 2502/5772 [32:13<9:01:00, 9.93s/it] 43%|████▎ | 2502/5772 [32:15<9:01:01, 9.93s/it] {'loss': 0.4814, 'learning_rate': 1.2612575624256552e-05, 'epoch': 0.43} + 43%|████▎ | 2502/5772 [32:15<9:01:01, 9.93s/it] {'loss': 0.4814, 'learning_rate': 1.2612575624256552e-05, 'epoch': 0.43} + 43%|████▎ | 2502/5772 [32:13<9:01:00, 9.93s/it] 43%|████▎ | 2503/5772 [32:19<7:55:00, 8.72s/it] 43%|████▎ | 2503/5772 [32:21<7:55:00, 8.72s/it] {'loss': 0.4612, 'learning_rate': 1.2607158131325494e-05, 'epoch': 0.43} + 43%|████▎ | 2503/5772 [32:21<7:55:00, 8.72s/it] {'loss': 0.4612, 'learning_rate': 1.2607158131325494e-05, 'epoch': 0.43} + 43%|████▎ | 2503/5772 [32:19<7:55:00, 8.72s/it] 43%|████▎ | 2504/5772 [32:25<7:06:54, 7.84s/it] 43%|████▎ | 2504/5772 [32:27<7:06:54, 7.84s/it] {'loss': 0.4736, 'learning_rate': 1.260173981728448e-05, 'epoch': 0.43} + 43%|████▎ | 2504/5772 [32:27<7:06:54, 7.84s/it] {'loss': 0.4736, 'learning_rate': 1.260173981728448e-05, 'epoch': 0.43} + 43%|████▎ | 2504/5772 [32:25<7:06:54, 7.84s/it] 43%|████▎ | 2505/5772 [32:31<6:44:53, 7.44s/it] 43%|████▎ | 2505/5772 [32:33<6:44:54, 7.44s/it] {'loss': 0.4656, 'learning_rate': 1.2596320683839976e-05, 'epoch': 0.43} + 43%|████▎ | 2505/5772 [32:33<6:44:54, 7.44s/it] {'loss': 0.4656, 'learning_rate': 1.2596320683839976e-05, 'epoch': 0.43} + 43%|████▎ | 2505/5772 [32:31<6:44:53, 7.44s/it] 43%|████▎ | 2506/5772 [32:40<6:28:28, 7.14s/it] 43%|████▎ | 2506/5772 [32:38<6:28:29, 7.14s/it] {'loss': 0.4845, 'learning_rate': 1.2590900732698707e-05, 'epoch': 0.43} + 43%|████▎ | 2506/5772 [32:40<6:28:28, 7.14s/it] {'loss': 0.4845, 'learning_rate': 1.2590900732698707e-05, 'epoch': 0.43} + 43%|████▎ | 2506/5772 [32:38<6:28:29, 7.14s/it] 43%|████▎ | 2507/5772 [32:46<6:12:53, 6.85s/it] 43%|████▎ | 2507/5772 [32:44<6:12:54, 6.85s/it] {'loss': 0.4808, 'learning_rate': 1.258547996556766e-05, 'epoch': 0.43} + 43%|████▎ | 2507/5772 [32:46<6:12:53, 6.85s/it] {'loss': 0.4808, 'learning_rate': 1.258547996556766e-05, 'epoch': 0.43} + 43%|████▎ | 2507/5772 [32:44<6:12:54, 6.85s/it] 43%|████▎ | 2508/5772 [32:50<5:58:48, 6.60s/it] 43%|████▎ | 2508/5772 [32:52<5:58:48, 6.60s/it] {'loss': 0.4772, 'learning_rate': 1.258005838415407e-05, 'epoch': 0.43} + 43%|████▎ | 2508/5772 [32:52<5:58:48, 6.60s/it] {'loss': 0.4772, 'learning_rate': 1.258005838415407e-05, 'epoch': 0.43} + 43%|████▎ | 2508/5772 [32:50<5:58:48, 6.60s/it] 43%|████▎ | 2509/5772 [32:58<5:49:42, 6.43s/it] 43%|████▎ | 2509/5772 [32:56<5:49:42, 6.43s/it] {'loss': 0.4645, 'learning_rate': 1.2574635990165438e-05, 'epoch': 0.43} + 43%|████▎ | 2509/5772 [32:58<5:49:42, 6.43s/it] {'loss': 0.4645, 'learning_rate': 1.2574635990165438e-05, 'epoch': 0.43} + 43%|████▎ | 2509/5772 [32:56<5:49:42, 6.43s/it] 43%|████▎ | 2510/5772 [33:04<5:48:29, 6.41s/it] 43%|████▎ | 2510/5772 [33:02<5:48:29, 6.41s/it] {'loss': 0.4762, 'learning_rate': 1.2569212785309517e-05, 'epoch': 0.43} + 43%|████▎ | 2510/5772 [33:04<5:48:29, 6.41s/it] {'loss': 0.4762, 'learning_rate': 1.2569212785309517e-05, 'epoch': 0.43} + 43%|████▎ | 2510/5772 [33:02<5:48:29, 6.41s/it] 44%|████▎ | 2511/5772 [33:11<5:44:50, 6.34s/it] 44%|████▎ | 2511/5772 [33:09<5:44:50, 6.34s/it] {'loss': 0.4611, 'learning_rate': 1.2563788771294316e-05, 'epoch': 0.43} + 44%|████▎ | 2511/5772 [33:11<5:44:50, 6.34s/it] {'loss': 0.4611, 'learning_rate': 1.2563788771294316e-05, 'epoch': 0.43} + 44%|████▎ | 2511/5772 [33:09<5:44:50, 6.34s/it] 44%|████▎ | 2512/5772 [33:17<5:40:06, 6.26s/it] 44%|████▎ | 2512/5772 [33:15<5:40:06, 6.26s/it] {'loss': 0.4709, 'learning_rate': 1.2558363949828092e-05, 'epoch': 0.44} + 44%|████▎ | 2512/5772 [33:17<5:40:06, 6.26s/it] {'loss': 0.4709, 'learning_rate': 1.2558363949828092e-05, 'epoch': 0.44} + 44%|████▎ | 2512/5772 [33:15<5:40:06, 6.26s/it] 44%|████▎ | 2513/5772 [33:23<5:34:24, 6.16s/it] 44%|████▎ | 2513/5772 [33:21<5:34:24, 6.16s/it] {'loss': 0.4674, 'learning_rate': 1.2552938322619368e-05, 'epoch': 0.44} + 44%|████▎ | 2513/5772 [33:23<5:34:24, 6.16s/it] {'loss': 0.4674, 'learning_rate': 1.2552938322619368e-05, 'epoch': 0.44} + 44%|████▎ | 2513/5772 [33:21<5:34:24, 6.16s/it] 44%|████▎ | 2514/5772 [33:27<5:40:45, 6.28s/it] 44%|████▎ | 2514/5772 [33:29<5:40:45, 6.28s/it] {'loss': 0.4781, 'learning_rate': 1.2547511891376916e-05, 'epoch': 0.44} + 44%|████▎ | 2514/5772 [33:29<5:40:45, 6.28s/it] {'loss': 0.4781, 'learning_rate': 1.2547511891376916e-05, 'epoch': 0.44} + 44%|████▎ | 2514/5772 [33:27<5:40:45, 6.28s/it] 44%|████▎ | 2515/5772 [33:33<5:36:20, 6.20s/it] 44%|████▎ | 2515/5772 [33:35<5:36:20, 6.20s/it] {'loss': 0.471, 'learning_rate': 1.2542084657809754e-05, 'epoch': 0.44} + 44%|████▎ | 2515/5772 [33:35<5:36:20, 6.20s/it] {'loss': 0.471, 'learning_rate': 1.2542084657809754e-05, 'epoch': 0.44} + 44%|████▎ | 2515/5772 [33:33<5:36:20, 6.20s/it] 44%|████▎ | 2516/5772 [33:40<5:42:19, 6.31s/it] 44%|████▎ | 2516/5772 [33:42<5:42:19, 6.31s/it] {'loss': 0.4734, 'learning_rate': 1.2536656623627167e-05, 'epoch': 0.44} + 44%|████▎ | 2516/5772 [33:42<5:42:19, 6.31s/it] {'loss': 0.4734, 'learning_rate': 1.2536656623627167e-05, 'epoch': 0.44} + 44%|████▎ | 2516/5772 [33:40<5:42:19, 6.31s/it] 44%|████▎ | 2517/5772 [33:48<5:44:53, 6.36s/it] 44%|████▎ | 2517/5772 [33:46<5:44:54, 6.36s/it] {'loss': 0.4711, 'learning_rate': 1.2531227790538675e-05, 'epoch': 0.44} + 44%|████▎ | 2517/5772 [33:48<5:44:53, 6.36s/it] {'loss': 0.4711, 'learning_rate': 1.2531227790538675e-05, 'epoch': 0.44} + 44%|████▎ | 2517/5772 [33:46<5:44:54, 6.36s/it] 44%|████▎ | 2518/5772 [33:52<5:39:14, 6.26s/it] 44%|████▎ | 2518/5772 [33:54<5:39:15, 6.26s/it] {'loss': 0.4753, 'learning_rate': 1.252579816025407e-05, 'epoch': 0.44} + 44%|████▎ | 2518/5772 [33:54<5:39:15, 6.26s/it] {'loss': 0.4753, 'learning_rate': 1.252579816025407e-05, 'epoch': 0.44} + 44%|████▎ | 2518/5772 [33:52<5:39:14, 6.26s/it] 44%|████▎ | 2519/5772 [34:00<5:39:40, 6.27s/it] 44%|████▎ | 2519/5772 [33:58<5:39:41, 6.27s/it] {'loss': 0.4738, 'learning_rate': 1.2520367734483376e-05, 'epoch': 0.44} + 44%|████▎ | 2519/5772 [34:00<5:39:40, 6.27s/it] {'loss': 0.4738, 'learning_rate': 1.2520367734483376e-05, 'epoch': 0.44} + 44%|████▎ | 2519/5772 [33:58<5:39:41, 6.27s/it] 44%|████▎ | 2520/5772 [34:07<5:37:22, 6.22s/it] 44%|████▎ | 2520/5772 [34:05<5:37:22, 6.22s/it] {'loss': 0.4843, 'learning_rate': 1.2514936514936878e-05, 'epoch': 0.44} + 44%|████▎ | 2520/5772 [34:07<5:37:22, 6.22s/it] {'loss': 0.4843, 'learning_rate': 1.2514936514936878e-05, 'epoch': 0.44} + 44%|████▎ | 2520/5772 [34:05<5:37:22, 6.22s/it] 44%|████▎ | 2521/5772 [34:13<5:40:57, 6.29s/it] 44%|████▎ | 2521/5772 [34:11<5:40:58, 6.29s/it] {'loss': 0.4699, 'learning_rate': 1.2509504503325106e-05, 'epoch': 0.44} + 44%|████▎ | 2521/5772 [34:13<5:40:57, 6.29s/it] {'loss': 0.4699, 'learning_rate': 1.2509504503325106e-05, 'epoch': 0.44} + 44%|████▎ | 2521/5772 [34:11<5:40:58, 6.29s/it] 44%|████▎ | 2522/5772 [34:17<5:37:56, 6.24s/it] 44%|████▎ | 2522/5772 [34:19<5:37:56, 6.24s/it] {'loss': 0.464, 'learning_rate': 1.2504071701358842e-05, 'epoch': 0.44} + 44%|████▎ | 2522/5772 [34:19<5:37:56, 6.24s/it] {'loss': 0.464, 'learning_rate': 1.2504071701358842e-05, 'epoch': 0.44} + 44%|████▎ | 2522/5772 [34:17<5:37:56, 6.24s/it] 44%|████▎ | 2523/5772 [34:26<5:41:26, 6.31s/it] 44%|████▎ | 2523/5772 [34:24<5:41:26, 6.31s/it] {'loss': 0.4696, 'learning_rate': 1.2498638110749122e-05, 'epoch': 0.44} + 44%|████▎ | 2523/5772 [34:26<5:41:26, 6.31s/it] {'loss': 0.4696, 'learning_rate': 1.2498638110749122e-05, 'epoch': 0.44} + 44%|████▎ | 2523/5772 [34:24<5:41:26, 6.31s/it] 44%|████▎ | 2524/5772 [34:32<5:35:52, 6.20s/it] 44%|████▎ | 2524/5772 [34:30<5:35:52, 6.20s/it] {'loss': 0.4725, 'learning_rate': 1.2493203733207219e-05, 'epoch': 0.44} + 44%|████▎ | 2524/5772 [34:32<5:35:52, 6.20s/it] {'loss': 0.4725, 'learning_rate': 1.2493203733207219e-05, 'epoch': 0.44} + 44%|████▎ | 2524/5772 [34:30<5:35:52, 6.20s/it] 44%|████▎ | 2525/5772 [34:38<5:36:09, 6.21s/it] 44%|████▎ | 2525/5772 [34:36<5:36:09, 6.21s/it] {'loss': 0.4738, 'learning_rate': 1.2487768570444665e-05, 'epoch': 0.44} + 44%|████▎ | 2525/5772 [34:38<5:36:09, 6.21s/it] {'loss': 0.4738, 'learning_rate': 1.2487768570444665e-05, 'epoch': 0.44} + 44%|████▎ | 2525/5772 [34:36<5:36:09, 6.21s/it] 44%|████▍ | 2526/5772 [34:44<5:33:38, 6.17s/it] 44%|████▍ | 2526/5772 [34:42<5:33:38, 6.17s/it] {'loss': 0.4762, 'learning_rate': 1.2482332624173227e-05, 'epoch': 0.44} + 44%|████▍ | 2526/5772 [34:44<5:33:38, 6.17s/it] {'loss': 0.4762, 'learning_rate': 1.2482332624173227e-05, 'epoch': 0.44} + 44%|████▍ | 2526/5772 [34:42<5:33:38, 6.17s/it] 44%|████▍ | 2527/5772 [34:50<5:31:23, 6.13s/it] 44%|████▍ | 2527/5772 [34:48<5:31:23, 6.13s/it] {'loss': 0.4761, 'learning_rate': 1.2476895896104937e-05, 'epoch': 0.44} + 44%|████▍ | 2527/5772 [34:50<5:31:23, 6.13s/it] {'loss': 0.4761, 'learning_rate': 1.2476895896104937e-05, 'epoch': 0.44} + 44%|████▍ | 2527/5772 [34:48<5:31:23, 6.13s/it] 44%|████▍ | 2528/5772 [34:56<5:30:23, 6.11s/it] 44%|████▍ | 2528/5772 [34:54<5:30:23, 6.11s/it] {'loss': 0.4705, 'learning_rate': 1.2471458387952053e-05, 'epoch': 0.44} + 44%|████▍ | 2528/5772 [34:56<5:30:23, 6.11s/it] {'loss': 0.4705, 'learning_rate': 1.2471458387952053e-05, 'epoch': 0.44} + 44%|████▍ | 2528/5772 [34:54<5:30:23, 6.11s/it] 44%|████▍ | 2529/5772 [35:02<5:34:23, 6.19s/it] 44%|████▍ | 2529/5772 [35:00<5:34:23, 6.19s/it] {'loss': 0.4712, 'learning_rate': 1.2466020101427092e-05, 'epoch': 0.44} + 44%|████▍ | 2529/5772 [35:02<5:34:23, 6.19s/it] {'loss': 0.4712, 'learning_rate': 1.2466020101427092e-05, 'epoch': 0.44} + 44%|████▍ | 2529/5772 [35:00<5:34:23, 6.19s/it] 44%|████▍ | 2530/5772 [35:08<5:32:50, 6.16s/it] 44%|████▍ | 2530/5772 [35:06<5:32:50, 6.16s/it] {'loss': 0.4657, 'learning_rate': 1.246058103824281e-05, 'epoch': 0.44} + 44%|████▍ | 2530/5772 [35:08<5:32:50, 6.16s/it] {'loss': 0.4657, 'learning_rate': 1.246058103824281e-05, 'epoch': 0.44} + 44%|████▍ | 2530/5772 [35:06<5:32:50, 6.16s/it] 44%|████▍ | 2531/5772 [35:14<5:30:21, 6.12s/it] 44%|████▍ | 2531/5772 [35:12<5:30:21, 6.12s/it] {'loss': 0.4795, 'learning_rate': 1.245514120011221e-05, 'epoch': 0.44} + 44%|████▍ | 2531/5772 [35:14<5:30:21, 6.12s/it] {'loss': 0.4795, 'learning_rate': 1.245514120011221e-05, 'epoch': 0.44} + 44%|████▍ | 2531/5772 [35:12<5:30:21, 6.12s/it] 44%|████▍ | 2532/5772 [35:20<5:28:17, 6.08s/it] 44%|████▍ | 2532/5772 [35:18<5:28:18, 6.08s/it] {'loss': 0.4813, 'learning_rate': 1.2449700588748541e-05, 'epoch': 0.44} + 44%|████▍ | 2532/5772 [35:20<5:28:17, 6.08s/it] {'loss': 0.4813, 'learning_rate': 1.2449700588748541e-05, 'epoch': 0.44} + 44%|████▍ | 2532/5772 [35:18<5:28:18, 6.08s/it] 44%|████▍ | 2533/5772 [35:27<5:34:55, 6.20s/it] 44%|████▍ | 2533/5772 [35:25<5:34:55, 6.20s/it] {'loss': 0.4658, 'learning_rate': 1.2444259205865295e-05, 'epoch': 0.44} + 44%|████▍ | 2533/5772 [35:27<5:34:55, 6.20s/it] {'loss': 0.4658, 'learning_rate': 1.2444259205865295e-05, 'epoch': 0.44} + 44%|████▍ | 2533/5772 [35:25<5:34:55, 6.20s/it] 44%|████▍ | 2534/5772 [35:33<5:36:20, 6.23s/it] 44%|████▍ | 2534/5772 [35:31<5:36:20, 6.23s/it] {'loss': 0.4812, 'learning_rate': 1.2438817053176198e-05, 'epoch': 0.44} + 44%|████▍ | 2534/5772 [35:33<5:36:20, 6.23s/it] {'loss': 0.4812, 'learning_rate': 1.2438817053176198e-05, 'epoch': 0.44} + 44%|████▍ | 2534/5772 [35:31<5:36:20, 6.23s/it] 44%|████▍ | 2535/5772 [35:40<5:42:56, 6.36s/it] 44%|████▍ | 2535/5772 [35:38<5:42:56, 6.36s/it] {'loss': 0.4641, 'learning_rate': 1.243337413239523e-05, 'epoch': 0.44} + 44%|████▍ | 2535/5772 [35:40<5:42:56, 6.36s/it] {'loss': 0.4641, 'learning_rate': 1.243337413239523e-05, 'epoch': 0.44} + 44%|████▍ | 2535/5772 [35:38<5:42:56, 6.36s/it] 44%|████▍ | 2536/5772 [35:46<5:36:45, 6.24s/it] 44%|████▍ | 2536/5772 [35:44<5:36:45, 6.24s/it] {'loss': 0.4854, 'learning_rate': 1.2427930445236611e-05, 'epoch': 0.44} + 44%|████▍ | 2536/5772 [35:46<5:36:45, 6.24s/it] {'loss': 0.4854, 'learning_rate': 1.2427930445236611e-05, 'epoch': 0.44} + 44%|████▍ | 2536/5772 [35:44<5:36:45, 6.24s/it] 44%|████▍ | 2537/5772 [35:52<5:31:10, 6.14s/it] 44%|████▍ | 2537/5772 [35:50<5:31:10, 6.14s/it] {'loss': 0.4503, 'learning_rate': 1.2422485993414795e-05, 'epoch': 0.44} + 44%|████▍ | 2537/5772 [35:52<5:31:10, 6.14s/it] {'loss': 0.4503, 'learning_rate': 1.2422485993414795e-05, 'epoch': 0.44} + 44%|████▍ | 2537/5772 [35:50<5:31:10, 6.14s/it] 44%|████▍ | 2538/5772 [35:58<5:33:24, 6.19s/it] 44%|████▍ | 2538/5772 [35:56<5:33:24, 6.19s/it] {'loss': 0.4716, 'learning_rate': 1.2417040778644487e-05, 'epoch': 0.44} + 44%|████▍ | 2538/5772 [35:58<5:33:24, 6.19s/it] {'loss': 0.4716, 'learning_rate': 1.2417040778644487e-05, 'epoch': 0.44} + 44%|████▍ | 2538/5772 [35:56<5:33:24, 6.19s/it] 44%|████▍ | 2539/5772 [36:04<5:31:15, 6.15s/it] 44%|████▍ | 2539/5772 [36:02<5:31:15, 6.15s/it] {'loss': 0.4662, 'learning_rate': 1.2411594802640621e-05, 'epoch': 0.44} + 44%|████▍ | 2539/5772 [36:04<5:31:15, 6.15s/it] {'loss': 0.4662, 'learning_rate': 1.2411594802640621e-05, 'epoch': 0.44} + 44%|████▍ | 2539/5772 [36:02<5:31:15, 6.15s/it] 44%|████▍ | 2540/5772 [36:10<5:28:24, 6.10s/it] 44%|████▍ | 2540/5772 [36:08<5:28:24, 6.10s/it] {'loss': 0.4983, 'learning_rate': 1.2406148067118387e-05, 'epoch': 0.44} + 44%|████▍ | 2540/5772 [36:10<5:28:24, 6.10s/it] {'loss': 0.4983, 'learning_rate': 1.2406148067118387e-05, 'epoch': 0.44} + 44%|████▍ | 2540/5772 [36:08<5:28:24, 6.10s/it] 44%|████▍ | 2541/5772 [36:16<5:30:53, 6.14s/it] 44%|████▍ | 2541/5772 [36:14<5:30:53, 6.14s/it] {'loss': 0.4679, 'learning_rate': 1.2400700573793191e-05, 'epoch': 0.44} + 44%|████▍ | 2541/5772 [36:16<5:30:53, 6.14s/it] {'loss': 0.4679, 'learning_rate': 1.2400700573793191e-05, 'epoch': 0.44} + 44%|████▍ | 2541/5772 [36:14<5:30:53, 6.14s/it] 44%|████▍ | 2542/5772 [36:23<5:33:50, 6.20s/it] 44%|████▍ | 2542/5772 [36:21<5:33:50, 6.20s/it] {'loss': 0.4713, 'learning_rate': 1.2395252324380701e-05, 'epoch': 0.44} + 44%|████▍ | 2542/5772 [36:23<5:33:50, 6.20s/it] {'loss': 0.4713, 'learning_rate': 1.2395252324380701e-05, 'epoch': 0.44} + 44%|████▍ | 2542/5772 [36:21<5:33:50, 6.20s/it] 44%|████▍ | 2543/5772 [36:28<5:26:29, 6.07s/it] 44%|████▍ | 2543/5772 [36:26<5:26:29, 6.07s/it] {'loss': 0.4604, 'learning_rate': 1.2389803320596806e-05, 'epoch': 0.44} + 44%|████▍ | 2543/5772 [36:28<5:26:29, 6.07s/it] {'loss': 0.4604, 'learning_rate': 1.2389803320596806e-05, 'epoch': 0.44} + 44%|████▍ | 2543/5772 [36:26<5:26:29, 6.07s/it] 44%|████▍ | 2544/5772 [36:34<5:23:16, 6.01s/it] 44%|████▍ | 2544/5772 [36:32<5:23:16, 6.01s/it] {'loss': 0.4689, 'learning_rate': 1.2384353564157646e-05, 'epoch': 0.44} + 44%|████▍ | 2544/5772 [36:34<5:23:16, 6.01s/it] {'loss': 0.4689, 'learning_rate': 1.2384353564157646e-05, 'epoch': 0.44} + 44%|████▍ | 2544/5772 [36:32<5:23:16, 6.01s/it] 44%|████▍ | 2545/5772 [36:41<5:33:49, 6.21s/it] 44%|████▍ | 2545/5772 [36:39<5:33:49, 6.21s/it] {'loss': 0.4677, 'learning_rate': 1.2378903056779584e-05, 'epoch': 0.44} + 44%|████▍ | 2545/5772 [36:41<5:33:49, 6.21s/it] {'loss': 0.4677, 'learning_rate': 1.2378903056779584e-05, 'epoch': 0.44} + 44%|████▍ | 2545/5772 [36:39<5:33:49, 6.21s/it] 44%|████▍ | 2546/5772 [36:47<5:25:04, 6.05s/it] 44%|████▍ | 2546/5772 [36:45<5:25:04, 6.05s/it] {'loss': 0.4716, 'learning_rate': 1.2373451800179235e-05, 'epoch': 0.44} + 44%|████▍ | 2546/5772 [36:47<5:25:04, 6.05s/it] {'loss': 0.4716, 'learning_rate': 1.2373451800179235e-05, 'epoch': 0.44} + 44%|████▍ | 2546/5772 [36:45<5:25:04, 6.05s/it] 44%|████▍ | 2547/5772 [36:53<5:26:35, 6.08s/it] 44%|████▍ | 2547/5772 [36:51<5:26:35, 6.08s/it] {'loss': 0.4647, 'learning_rate': 1.2367999796073436e-05, 'epoch': 0.44} + 44%|████▍ | 2547/5772 [36:53<5:26:35, 6.08s/it] {'loss': 0.4647, 'learning_rate': 1.2367999796073436e-05, 'epoch': 0.44} + 44%|████▍ | 2547/5772 [36:51<5:26:35, 6.08s/it] 44%|████▍ | 2548/5772 [36:59<5:21:15, 5.98s/it] 44%|████▍ | 2548/5772 [36:57<5:21:15, 5.98s/it] {'loss': 0.4787, 'learning_rate': 1.2362547046179265e-05, 'epoch': 0.44} + 44%|████▍ | 2548/5772 [36:59<5:21:15, 5.98s/it] {'loss': 0.4787, 'learning_rate': 1.2362547046179265e-05, 'epoch': 0.44} + 44%|████▍ | 2548/5772 [36:57<5:21:15, 5.98s/it] 44%|████▍ | 2549/5772 [37:05<5:26:20, 6.08s/it] 44%|████▍ | 2549/5772 [37:03<5:26:21, 6.08s/it] {'loss': 0.4702, 'learning_rate': 1.2357093552214043e-05, 'epoch': 0.44} + 44%|████▍ | 2549/5772 [37:05<5:26:20, 6.08s/it] {'loss': 0.4702, 'learning_rate': 1.2357093552214043e-05, 'epoch': 0.44} + 44%|████▍ | 2549/5772 [37:03<5:26:21, 6.08s/it]11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +54 1 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 44%|████▍ | 2550/5772 [37:11<5:30:47, 6.16s/it]0 AutoResumeHook: Checking whether to suspend... + 44%|████▍ | 2550/5772 [37:09<5:30:47, 6.16s/it] {'loss': 0.4917, 'learning_rate': 1.2351639315895309e-05, 'epoch': 0.44} + 44%|████▍ | 2550/5772 [37:11<5:30:47, 6.16s/it] {'loss': 0.4917, 'learning_rate': 1.2351639315895309e-05, 'epoch': 0.44} + 44%|████▍ | 2550/5772 [37:09<5:30:47, 6.16s/it] 44%|████▍ | 2551/5772 [37:17<5:27:52, 6.11s/it] 44%|████▍ | 2551/5772 [37:15<5:27:53, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.2346184338940847e-05, 'epoch': 0.44} + 44%|████▍ | 2551/5772 [37:17<5:27:52, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.2346184338940847e-05, 'epoch': 0.44} + 44%|████▍ | 2551/5772 [37:15<5:27:53, 6.11s/it] 44%|████▍ | 2552/5772 [37:23<5:25:50, 6.07s/it] 44%|████▍ | 2552/5772 [37:21<5:25:50, 6.07s/it] {'loss': 0.4709, 'learning_rate': 1.2340728623068671e-05, 'epoch': 0.44} + 44%|████▍ | 2552/5772 [37:23<5:25:50, 6.07s/it] {'loss': 0.4709, 'learning_rate': 1.2340728623068671e-05, 'epoch': 0.44} + 44%|████▍ | 2552/5772 [37:21<5:25:50, 6.07s/it] 44%|████▍ | 2553/5772 [37:29<5:25:21, 6.06s/it] 44%|████▍ | 2553/5772 [37:27<5:25:20, 6.06s/it] {'loss': 0.4675, 'learning_rate': 1.2335272169997034e-05, 'epoch': 0.44} + 44%|████▍ | 2553/5772 [37:29<5:25:21, 6.06s/it] {'loss': 0.4675, 'learning_rate': 1.2335272169997034e-05, 'epoch': 0.44} + 44%|████▍ | 2553/5772 [37:27<5:25:20, 6.06s/it] 44%|████▍ | 2554/5772 [37:35<5:23:29, 6.03s/it] 44%|████▍ | 2554/5772 [37:33<5:23:29, 6.03s/it] {'loss': 0.4636, 'learning_rate': 1.232981498144441e-05, 'epoch': 0.44} + 44%|████▍ | 2554/5772 [37:35<5:23:29, 6.03s/it] {'loss': 0.4636, 'learning_rate': 1.232981498144441e-05, 'epoch': 0.44} + 44%|████▍ | 2554/5772 [37:33<5:23:29, 6.03s/it] 44%|████▍ | 2555/5772 [37:41<5:27:25, 6.11s/it] 44%|████▍ | 2555/5772 [37:40<5:27:25, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.2324357059129512e-05, 'epoch': 0.44} + 44%|████▍ | 2555/5772 [37:41<5:27:25, 6.11s/it] {'loss': 0.4759, 'learning_rate': 1.2324357059129512e-05, 'epoch': 0.44} + 44%|████▍ | 2555/5772 [37:40<5:27:25, 6.11s/it] 44%|████▍ | 2556/5772 [37:47<5:25:50, 6.08s/it] 44%|████▍ | 2556/5772 [37:46<5:25:51, 6.08s/it] {'loss': 0.4661, 'learning_rate': 1.231889840477128e-05, 'epoch': 0.44} + 44%|████▍ | 2556/5772 [37:47<5:25:50, 6.08s/it] {'loss': 0.4661, 'learning_rate': 1.231889840477128e-05, 'epoch': 0.44} + 44%|████▍ | 2556/5772 [37:46<5:25:51, 6.08s/it] 44%|████▍ | 2557/5772 [37:54<5:26:05, 6.09s/it] 44%|████▍ | 2557/5772 [37:52<5:26:04, 6.09s/it] {'loss': 0.4721, 'learning_rate': 1.2313439020088889e-05, 'epoch': 0.44} + 44%|████▍ | 2557/5772 [37:54<5:26:05, 6.09s/it] {'loss': 0.4721, 'learning_rate': 1.2313439020088889e-05, 'epoch': 0.44} + 44%|████▍ | 2557/5772 [37:52<5:26:04, 6.09s/it] 44%|████▍ | 2558/5772 [38:00<5:30:50, 6.18s/it] 44%|████▍ | 2558/5772 [37:58<5:30:49, 6.18s/it] {'loss': 0.4802, 'learning_rate': 1.2307978906801738e-05, 'epoch': 0.44} + 44%|████▍ | 2558/5772 [38:00<5:30:50, 6.18s/it] {'loss': 0.4802, 'learning_rate': 1.2307978906801738e-05, 'epoch': 0.44} + 44%|████▍ | 2558/5772 [37:58<5:30:49, 6.18s/it] 44%|████▍ | 2559/5772 [38:06<5:34:24, 6.24s/it] 44%|████▍ | 2559/5772 [38:04<5:34:24, 6.24s/it] {'loss': 0.4725, 'learning_rate': 1.2302518066629467e-05, 'epoch': 0.44} + 44%|████▍ | 2559/5772 [38:06<5:34:24, 6.24s/it] {'loss': 0.4725, 'learning_rate': 1.2302518066629467e-05, 'epoch': 0.44} + 44%|████▍ | 2559/5772 [38:04<5:34:24, 6.24s/it] 44%|████▍ | 2560/5772 [38:13<5:36:27, 6.28s/it] 44%|████▍ | 2560/5772 [38:11<5:36:26, 6.28s/it] {'loss': 0.4564, 'learning_rate': 1.2297056501291932e-05, 'epoch': 0.44} + 44%|████▍ | 2560/5772 [38:13<5:36:27, 6.28s/it] {'loss': 0.4564, 'learning_rate': 1.2297056501291932e-05, 'epoch': 0.44} + 44%|████▍ | 2560/5772 [38:11<5:36:26, 6.28s/it] 44%|████▍ | 2561/5772 [38:19<5:31:48, 6.20s/it] 44%|████▍ | 2561/5772 [38:17<5:31:48, 6.20s/it] {'loss': 0.4795, 'learning_rate': 1.2291594212509224e-05, 'epoch': 0.44} + 44%|████▍ | 2561/5772 [38:19<5:31:48, 6.20s/it] {'loss': 0.4795, 'learning_rate': 1.2291594212509224e-05, 'epoch': 0.44} + 44%|████▍ | 2561/5772 [38:17<5:31:48, 6.20s/it] 44%|████▍ | 2562/5772 [38:23<5:28:53, 6.15s/it] 44%|████▍ | 2562/5772 [38:25<5:28:53, 6.15s/it] {'loss': 0.4759, 'learning_rate': 1.2286131202001661e-05, 'epoch': 0.44} + 44%|████▍ | 2562/5772 [38:25<5:28:53, 6.15s/it] {'loss': 0.4759, 'learning_rate': 1.2286131202001661e-05, 'epoch': 0.44} + 44%|████▍ | 2562/5772 [38:23<5:28:53, 6.15s/it] 44%|████▍ | 2563/5772 [38:31<5:36:09, 6.29s/it] 44%|████▍ | 2563/5772 [38:29<5:36:10, 6.29s/it] {'loss': 0.4716, 'learning_rate': 1.2280667471489784e-05, 'epoch': 0.44} + 44%|████▍ | 2563/5772 [38:31<5:36:09, 6.29s/it] {'loss': 0.4716, 'learning_rate': 1.2280667471489784e-05, 'epoch': 0.44} + 44%|████▍ | 2563/5772 [38:29<5:36:10, 6.29s/it] 44%|████▍ | 2564/5772 [38:37<5:30:35, 6.18s/it] 44%|████▍ | 2564/5772 [38:35<5:30:35, 6.18s/it] {'loss': 0.4811, 'learning_rate': 1.2275203022694371e-05, 'epoch': 0.44} + 44%|████▍ | 2564/5772 [38:37<5:30:35, 6.18s/it] {'loss': 0.4811, 'learning_rate': 1.2275203022694371e-05, 'epoch': 0.44} + 44%|████▍ | 2564/5772 [38:35<5:30:35, 6.18s/it] 44%|████▍ | 2565/5772 [38:44<5:36:19, 6.29s/it] 44%|████▍ | 2565/5772 [38:42<5:36:19, 6.29s/it] {'loss': 0.4767, 'learning_rate': 1.2269737857336412e-05, 'epoch': 0.44} + 44%|████▍ | 2565/5772 [38:44<5:36:19, 6.29s/it] {'loss': 0.4767, 'learning_rate': 1.2269737857336412e-05, 'epoch': 0.44} + 44%|████▍ | 2565/5772 [38:42<5:36:19, 6.29s/it] 44%|████▍ | 2566/5772 [38:50<5:29:24, 6.16s/it] 44%|████▍ | 2566/5772 [38:48<5:29:24, 6.16s/it] {'loss': 0.489, 'learning_rate': 1.2264271977137136e-05, 'epoch': 0.44} + 44%|████▍ | 2566/5772 [38:50<5:29:24, 6.16s/it] {'loss': 0.489, 'learning_rate': 1.2264271977137136e-05, 'epoch': 0.44} + 44%|████▍ | 2566/5772 [38:48<5:29:24, 6.16s/it] 44%|████▍ | 2567/5772 [38:56<5:26:41, 6.12s/it] 44%|████▍ | 2567/5772 [38:54<5:26:41, 6.12s/it] {'loss': 0.4769, 'learning_rate': 1.2258805383817992e-05, 'epoch': 0.44} + 44%|████▍ | 2567/5772 [38:56<5:26:41, 6.12s/it] {'loss': 0.4769, 'learning_rate': 1.2258805383817992e-05, 'epoch': 0.44} + 44%|████▍ | 2567/5772 [38:54<5:26:41, 6.12s/it] 44%|████▍ | 2568/5772 [39:02<5:25:06, 6.09s/it] 44%|████▍ | 2568/5772 [39:00<5:25:06, 6.09s/it] {'loss': 0.4674, 'learning_rate': 1.2253338079100652e-05, 'epoch': 0.44} + 44%|████▍ | 2568/5772 [39:02<5:25:06, 6.09s/it] {'loss': 0.4674, 'learning_rate': 1.2253338079100652e-05, 'epoch': 0.44} + 44%|████▍ | 2568/5772 [39:00<5:25:06, 6.09s/it] 45%|████▍ | 2569/5772 [39:08<5:30:07, 6.18s/it] 45%|████▍ | 2569/5772 [39:06<5:30:07, 6.18s/it] {'loss': 0.4674, 'learning_rate': 1.224787006470701e-05, 'epoch': 0.45} + 45%|████▍ | 2569/5772 [39:08<5:30:07, 6.18s/it] {'loss': 0.4674, 'learning_rate': 1.224787006470701e-05, 'epoch': 0.45} + 45%|████▍ | 2569/5772 [39:06<5:30:07, 6.18s/it] 45%|████▍ | 2570/5772 [39:14<5:28:59, 6.16s/it] 45%|████▍ | 2570/5772 [39:12<5:28:59, 6.16s/it] {'loss': 0.4662, 'learning_rate': 1.2242401342359188e-05, 'epoch': 0.45} + 45%|████▍ | 2570/5772 [39:14<5:28:59, 6.16s/it] {'loss': 0.4662, 'learning_rate': 1.2242401342359188e-05, 'epoch': 0.45} + 45%|████▍ | 2570/5772 [39:12<5:28:59, 6.16s/it] 45%|████▍ | 2571/5772 [39:21<5:34:36, 6.27s/it] 45%|████▍ | 2571/5772 [39:19<5:34:36, 6.27s/it] {'loss': 0.4616, 'learning_rate': 1.2236931913779534e-05, 'epoch': 0.45} + 45%|████▍ | 2571/5772 [39:21<5:34:36, 6.27s/it] {'loss': 0.4616, 'learning_rate': 1.2236931913779534e-05, 'epoch': 0.45} + 45%|████▍ | 2571/5772 [39:19<5:34:36, 6.27s/it] 45%|████▍ | 2572/5772 [39:27<5:34:38, 6.27s/it] 45%|████▍ | 2572/5772 [39:25<5:34:37, 6.27s/it] {'loss': 0.4749, 'learning_rate': 1.223146178069061e-05, 'epoch': 0.45} + 45%|████▍ | 2572/5772 [39:27<5:34:38, 6.27s/it] {'loss': 0.4749, 'learning_rate': 1.223146178069061e-05, 'epoch': 0.45} + 45%|████▍ | 2572/5772 [39:25<5:34:37, 6.27s/it] 45%|████▍ | 2573/5772 [39:33<5:32:58, 6.25s/it] 45%|████▍ | 2573/5772 [39:31<5:32:58, 6.25s/it] {'loss': 0.4771, 'learning_rate': 1.2225990944815207e-05, 'epoch': 0.45} + 45%|████▍ | 2573/5772 [39:33<5:32:58, 6.25s/it] {'loss': 0.4771, 'learning_rate': 1.2225990944815207e-05, 'epoch': 0.45} + 45%|████▍ | 2573/5772 [39:31<5:32:58, 6.25s/it] 45%|████▍ | 2574/5772 [39:39<5:27:20, 6.14s/it] 45%|████▍ | 2574/5772 [39:37<5:27:21, 6.14s/it] {'loss': 0.4713, 'learning_rate': 1.222051940787633e-05, 'epoch': 0.45} + 45%|████▍ | 2574/5772 [39:39<5:27:20, 6.14s/it] {'loss': 0.4713, 'learning_rate': 1.222051940787633e-05, 'epoch': 0.45} + 45%|████▍ | 2574/5772 [39:37<5:27:21, 6.14s/it] 45%|████▍ | 2575/5772 [39:46<5:33:05, 6.25s/it] 45%|████▍ | 2575/5772 [39:44<5:33:05, 6.25s/it] {'loss': 0.4797, 'learning_rate': 1.2215047171597214e-05, 'epoch': 0.45} + 45%|████▍ | 2575/5772 [39:46<5:33:05, 6.25s/it] {'loss': 0.4797, 'learning_rate': 1.2215047171597214e-05, 'epoch': 0.45} + 45%|████▍ | 2575/5772 [39:44<5:33:05, 6.25s/it] 45%|████▍ | 2576/5772 [39:52<5:27:11, 6.14s/it] 45%|████▍ | 2576/5772 [39:50<5:27:11, 6.14s/it] {'loss': 0.473, 'learning_rate': 1.2209574237701306e-05, 'epoch': 0.45} + 45%|████▍ | 2576/5772 [39:52<5:27:11, 6.14s/it] {'loss': 0.473, 'learning_rate': 1.2209574237701306e-05, 'epoch': 0.45} + 45%|████▍ | 2576/5772 [39:50<5:27:11, 6.14s/it] 45%|████▍ | 2577/5772 [39:58<5:29:29, 6.19s/it] 45%|████▍ | 2577/5772 [39:56<5:29:29, 6.19s/it] {'loss': 0.472, 'learning_rate': 1.2204100607912277e-05, 'epoch': 0.45} + 45%|████▍ | 2577/5772 [39:58<5:29:29, 6.19s/it] {'loss': 0.472, 'learning_rate': 1.2204100607912277e-05, 'epoch': 0.45} + 45%|████▍ | 2577/5772 [39:56<5:29:29, 6.19s/it] 45%|████▍ | 2578/5772 [40:04<5:28:38, 6.17s/it] 45%|████▍ | 2578/5772 [40:02<5:28:38, 6.17s/it] {'loss': 0.4965, 'learning_rate': 1.2198626283954016e-05, 'epoch': 0.45} + 45%|████▍ | 2578/5772 [40:04<5:28:38, 6.17s/it] {'loss': 0.4965, 'learning_rate': 1.2198626283954016e-05, 'epoch': 0.45} + 45%|████▍ | 2578/5772 [40:02<5:28:38, 6.17s/it] 45%|████▍ | 2579/5772 [40:10<5:29:05, 6.18s/it] 45%|████▍ | 2579/5772 [40:08<5:29:05, 6.18s/it] {'loss': 0.4688, 'learning_rate': 1.2193151267550631e-05, 'epoch': 0.45} + 45%|████▍ | 2579/5772 [40:10<5:29:05, 6.18s/it] {'loss': 0.4688, 'learning_rate': 1.2193151267550631e-05, 'epoch': 0.45} + 45%|████▍ | 2579/5772 [40:08<5:29:05, 6.18s/it] 45%|████▍ | 2580/5772 [40:16<5:25:04, 6.11s/it] 45%|████▍ | 2580/5772 [40:14<5:25:04, 6.11s/it] {'loss': 0.4699, 'learning_rate': 1.2187675560426448e-05, 'epoch': 0.45} + 45%|████▍ | 2580/5772 [40:16<5:25:04, 6.11s/it] {'loss': 0.4699, 'learning_rate': 1.2187675560426448e-05, 'epoch': 0.45} + 45%|████▍ | 2580/5772 [40:14<5:25:04, 6.11s/it] 45%|████▍ | 2581/5772 [40:22<5:23:01, 6.07s/it] 45%|████▍ | 2581/5772 [40:20<5:23:01, 6.07s/it] {'loss': 0.4662, 'learning_rate': 1.218219916430601e-05, 'epoch': 0.45} + 45%|████▍ | 2581/5772 [40:22<5:23:01, 6.07s/it] {'loss': 0.4662, 'learning_rate': 1.218219916430601e-05, 'epoch': 0.45} + 45%|████▍ | 2581/5772 [40:20<5:23:01, 6.07s/it] 45%|████▍ | 2582/5772 [40:28<5:22:05, 6.06s/it] 45%|████▍ | 2582/5772 [40:26<5:22:05, 6.06s/it] {'loss': 0.4808, 'learning_rate': 1.2176722080914081e-05, 'epoch': 0.45} + 45%|████▍ | 2582/5772 [40:28<5:22:05, 6.06s/it] {'loss': 0.4808, 'learning_rate': 1.2176722080914081e-05, 'epoch': 0.45} + 45%|████▍ | 2582/5772 [40:26<5:22:05, 6.06s/it] 45%|████▍ | 2583/5772 [40:34<5:24:32, 6.11s/it] 45%|████▍ | 2583/5772 [40:32<5:24:33, 6.11s/it] {'loss': 0.4722, 'learning_rate': 1.2171244311975635e-05, 'epoch': 0.45} + 45%|████▍ | 2583/5772 [40:34<5:24:32, 6.11s/it] {'loss': 0.4722, 'learning_rate': 1.2171244311975635e-05, 'epoch': 0.45} + 45%|████▍ | 2583/5772 [40:32<5:24:33, 6.11s/it] 45%|████▍ | 2584/5772 [40:40<5:22:21, 6.07s/it] 45%|████▍ | 2584/5772 [40:38<5:22:22, 6.07s/it] {'loss': 0.477, 'learning_rate': 1.2165765859215863e-05, 'epoch': 0.45} + 45%|████▍ | 2584/5772 [40:40<5:22:21, 6.07s/it] {'loss': 0.477, 'learning_rate': 1.2165765859215863e-05, 'epoch': 0.45} + 45%|████▍ | 2584/5772 [40:38<5:22:22, 6.07s/it] 45%|████▍ | 2585/5772 [40:46<5:21:36, 6.05s/it] 45%|████▍ | 2585/5772 [40:44<5:21:36, 6.05s/it] {'loss': 0.4682, 'learning_rate': 1.2160286724360177e-05, 'epoch': 0.45} + 45%|████▍ | 2585/5772 [40:46<5:21:36, 6.05s/it] {'loss': 0.4682, 'learning_rate': 1.2160286724360177e-05, 'epoch': 0.45} + 45%|████▍ | 2585/5772 [40:44<5:21:36, 6.05s/it] 45%|████▍ | 2586/5772 [40:53<5:26:32, 6.15s/it] 45%|████▍ | 2586/5772 [40:51<5:26:31, 6.15s/it] {'loss': 0.4701, 'learning_rate': 1.2154806909134198e-05, 'epoch': 0.45} + 45%|████▍ | 2586/5772 [40:53<5:26:32, 6.15s/it] {'loss': 0.4701, 'learning_rate': 1.2154806909134198e-05, 'epoch': 0.45} + 45%|████▍ | 2586/5772 [40:51<5:26:31, 6.15s/it] 45%|████▍ | 2587/5772 [40:59<5:31:07, 6.24s/it] 45%|████▍ | 2587/5772 [40:57<5:31:06, 6.24s/it] {'loss': 0.4716, 'learning_rate': 1.2149326415263762e-05, 'epoch': 0.45} + 45%|████▍ | 2587/5772 [40:59<5:31:07, 6.24s/it] {'loss': 0.4716, 'learning_rate': 1.2149326415263762e-05, 'epoch': 0.45} + 45%|████▍ | 2587/5772 [40:57<5:31:06, 6.24s/it] 45%|████▍ | 2588/5772 [41:05<5:26:26, 6.15s/it] 45%|████▍ | 2588/5772 [41:03<5:26:25, 6.15s/it] {'loss': 0.4747, 'learning_rate': 1.2143845244474925e-05, 'epoch': 0.45} + 45%|████▍ | 2588/5772 [41:05<5:26:26, 6.15s/it] {'loss': 0.4747, 'learning_rate': 1.2143845244474925e-05, 'epoch': 0.45} + 45%|████▍ | 2588/5772 [41:03<5:26:25, 6.15s/it] 45%|████▍ | 2589/5772 [41:11<5:24:51, 6.12s/it] 45%|████▍ | 2589/5772 [41:09<5:24:50, 6.12s/it] {'loss': 0.4615, 'learning_rate': 1.2138363398493946e-05, 'epoch': 0.45} + 45%|████▍ | 2589/5772 [41:11<5:24:51, 6.12s/it] {'loss': 0.4615, 'learning_rate': 1.2138363398493946e-05, 'epoch': 0.45} + 45%|████▍ | 2589/5772 [41:09<5:24:50, 6.12s/it] 45%|████▍ | 2590/5772 [41:18<5:29:34, 6.21s/it] 45%|████▍ | 2590/5772 [41:16<5:29:35, 6.21s/it] {'loss': 0.4785, 'learning_rate': 1.2132880879047307e-05, 'epoch': 0.45} + 45%|████▍ | 2590/5772 [41:18<5:29:34, 6.21s/it] {'loss': 0.4785, 'learning_rate': 1.2132880879047307e-05, 'epoch': 0.45} + 45%|████▍ | 2590/5772 [41:16<5:29:35, 6.21s/it] 45%|████▍ | 2591/5772 [41:24<5:27:49, 6.18s/it] 45%|████▍ | 2591/5772 [41:22<5:27:49, 6.18s/it] {'loss': 0.4696, 'learning_rate': 1.212739768786169e-05, 'epoch': 0.45} + 45%|████▍ | 2591/5772 [41:24<5:27:49, 6.18s/it] {'loss': 0.4696, 'learning_rate': 1.212739768786169e-05, 'epoch': 0.45} + 45%|████▍ | 2591/5772 [41:22<5:27:49, 6.18s/it] 45%|████▍ | 2592/5772 [41:30<5:29:04, 6.21s/it] 45%|████▍ | 2592/5772 [41:28<5:29:03, 6.21s/it] {'loss': 0.4786, 'learning_rate': 1.2121913826664001e-05, 'epoch': 0.45} + 45%|████▍ | 2592/5772 [41:30<5:29:04, 6.21s/it] {'loss': 0.4786, 'learning_rate': 1.2121913826664001e-05, 'epoch': 0.45} + 45%|████▍ | 2592/5772 [41:28<5:29:03, 6.21s/it] 45%|████▍ | 2593/5772 [41:34<5:29:58, 6.23s/it] 45%|████▍ | 2593/5772 [41:36<5:29:59, 6.23s/it] {'loss': 0.4572, 'learning_rate': 1.211642929718135e-05, 'epoch': 0.45} + 45%|████▍ | 2593/5772 [41:36<5:29:59, 6.23s/it] {'loss': 0.4572, 'learning_rate': 1.211642929718135e-05, 'epoch': 0.45} + 45%|████▍ | 2593/5772 [41:34<5:29:58, 6.23s/it] 45%|████▍ | 2594/5772 [41:43<5:35:16, 6.33s/it] 45%|████▍ | 2594/5772 [41:41<5:35:16, 6.33s/it] {'loss': 0.4768, 'learning_rate': 1.2110944101141058e-05, 'epoch': 0.45} + 45%|████▍ | 2594/5772 [41:43<5:35:16, 6.33s/it] {'loss': 0.4768, 'learning_rate': 1.2110944101141058e-05, 'epoch': 0.45} + 45%|████▍ | 2594/5772 [41:41<5:35:16, 6.33s/it] 45%|████▍ | 2595/5772 [41:49<5:31:55, 6.27s/it] 45%|████▍ | 2595/5772 [41:47<5:31:55, 6.27s/it] {'loss': 0.48, 'learning_rate': 1.210545824027066e-05, 'epoch': 0.45} + 45%|████▍ | 2595/5772 [41:49<5:31:55, 6.27s/it] {'loss': 0.48, 'learning_rate': 1.210545824027066e-05, 'epoch': 0.45} + 45%|████▍ | 2595/5772 [41:47<5:31:55, 6.27s/it] 45%|████▍ | 2596/5772 [41:53<5:24:51, 6.14s/it] 45%|████▍ | 2596/5772 [41:55<5:24:52, 6.14s/it] {'loss': 0.4893, 'learning_rate': 1.2099971716297896e-05, 'epoch': 0.45} + {'loss': 0.4893, 'learning_rate': 1.2099971716297896e-05, 'epoch': 0.45} 45%|████▍ | 2596/5772 [41:55<5:24:52, 6.14s/it] + 45%|████▍ | 2596/5772 [41:53<5:24:51, 6.14s/it] 45%|████▍ | 2597/5772 [42:01<5:23:17, 6.11s/it] 45%|████▍ | 2597/5772 [41:59<5:23:17, 6.11s/it] {'loss': 0.4605, 'learning_rate': 1.2094484530950714e-05, 'epoch': 0.45} + 45%|████▍ | 2597/5772 [42:01<5:23:17, 6.11s/it] {'loss': 0.4605, 'learning_rate': 1.2094484530950714e-05, 'epoch': 0.45} + 45%|████▍ | 2597/5772 [41:59<5:23:17, 6.11s/it] 45%|████▌ | 2598/5772 [42:05<5:24:09, 6.13s/it] 45%|████▌ | 2598/5772 [42:07<5:24:09, 6.13s/it] {'loss': 0.4703, 'learning_rate': 1.2088996685957277e-05, 'epoch': 0.45} + 45%|████▌ | 2598/5772 [42:07<5:24:09, 6.13s/it] {'loss': 0.4703, 'learning_rate': 1.2088996685957277e-05, 'epoch': 0.45} + 45%|████▌ | 2598/5772 [42:05<5:24:09, 6.13s/it] 45%|████▌ | 2599/5772 [42:11<5:24:29, 6.14s/it] 45%|████▌ | 2599/5772 [42:13<5:24:29, 6.14s/it] {'loss': 0.4669, 'learning_rate': 1.2083508183045947e-05, 'epoch': 0.45} + 45%|████▌ | 2599/5772 [42:13<5:24:29, 6.14s/it] {'loss': 0.4669, 'learning_rate': 1.2083508183045947e-05, 'epoch': 0.45} + 45%|████▌ | 2599/5772 [42:11<5:24:29, 6.14s/it]7 AutoResumeHook: Checking whether to suspend... +1110 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...1 +2AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 0AutoResumeHook: Checking whether to suspend... + 45%|████▌ | 2600/5772 [42:20<5:29:20, 6.23s/it]AutoResumeHook: Checking whether to suspend... + 45%|████▌ | 2600/5772 [42:18<5:29:21, 6.23s/it] {'loss': 0.4827, 'learning_rate': 1.2078019023945298e-05, 'epoch': 0.45} + 45%|████▌ | 2600/5772 [42:20<5:29:20, 6.23s/it] {'loss': 0.4827, 'learning_rate': 1.2078019023945298e-05, 'epoch': 0.45} + 45%|████▌ | 2600/5772 [42:18<5:29:21, 6.23s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 45%|████▌ | 2601/5772 [42:42<9:39:42, 10.97s/it] 45%|████▌ | 2601/5772 [42:40<9:39:42, 10.97s/it] {'loss': 0.4603, 'learning_rate': 1.2072529210384113e-05, 'epoch': 0.45} + 45%|████▌ | 2601/5772 [42:42<9:39:42, 10.97s/it] {'loss': 0.4603, 'learning_rate': 1.2072529210384113e-05, 'epoch': 0.45} + 45%|████▌ | 2601/5772 [42:40<9:39:42, 10.97s/it] 45%|████▌ | 2602/5772 [42:46<8:19:40, 9.46s/it] 45%|████▌ | 2602/5772 [42:48<8:19:40, 9.46s/it] {'loss': 0.4719, 'learning_rate': 1.2067038744091375e-05, 'epoch': 0.45} + 45%|████▌ | 2602/5772 [42:48<8:19:40, 9.46s/it] {'loss': 0.4719, 'learning_rate': 1.2067038744091375e-05, 'epoch': 0.45} + 45%|████▌ | 2602/5772 [42:46<8:19:40, 9.46s/it] 45%|████▌ | 2603/5772 [42:52<7:30:59, 8.54s/it] 45%|████▌ | 2603/5772 [42:54<7:31:00, 8.54s/it] {'loss': 0.492, 'learning_rate': 1.2061547626796276e-05, 'epoch': 0.45} + 45%|████▌ | 2603/5772 [42:54<7:31:00, 8.54s/it] {'loss': 0.492, 'learning_rate': 1.2061547626796276e-05, 'epoch': 0.45} + 45%|████▌ | 2603/5772 [42:52<7:30:59, 8.54s/it] 45%|████▌ | 2604/5772 [42:58<6:52:12, 7.81s/it] 45%|████▌ | 2604/5772 [43:00<6:52:12, 7.81s/it] {'loss': 0.4637, 'learning_rate': 1.205605586022822e-05, 'epoch': 0.45} + 45%|████▌ | 2604/5772 [43:00<6:52:12, 7.81s/it] {'loss': 0.4637, 'learning_rate': 1.205605586022822e-05, 'epoch': 0.45} + 45%|████▌ | 2604/5772 [42:58<6:52:12, 7.81s/it] 45%|████▌ | 2605/5772 [43:04<6:22:04, 7.24s/it] 45%|████▌ | 2605/5772 [43:06<6:22:04, 7.24s/it] {'loss': 0.4739, 'learning_rate': 1.2050563446116798e-05, 'epoch': 0.45} + 45%|████▌ | 2605/5772 [43:06<6:22:04, 7.24s/it] {'loss': 0.4739, 'learning_rate': 1.2050563446116798e-05, 'epoch': 0.45} + 45%|████▌ | 2605/5772 [43:04<6:22:04, 7.24s/it] 45%|████▌ | 2606/5772 [43:12<6:03:54, 6.90s/it] 45%|████▌ | 2606/5772 [43:10<6:03:55, 6.90s/it] {'loss': 0.4676, 'learning_rate': 1.2045070386191822e-05, 'epoch': 0.45} + 45%|████▌ | 2606/5772 [43:12<6:03:54, 6.90s/it] {'loss': 0.4676, 'learning_rate': 1.2045070386191822e-05, 'epoch': 0.45} + 45%|████▌ | 2606/5772 [43:10<6:03:55, 6.90s/it] 45%|████▌ | 2607/5772 [43:18<5:48:02, 6.60s/it] 45%|████▌ | 2607/5772 [43:16<5:48:02, 6.60s/it] {'loss': 0.4634, 'learning_rate': 1.2039576682183295e-05, 'epoch': 0.45} + 45%|████▌ | 2607/5772 [43:18<5:48:02, 6.60s/it] {'loss': 0.4634, 'learning_rate': 1.2039576682183295e-05, 'epoch': 0.45} + 45%|████▌ | 2607/5772 [43:16<5:48:02, 6.60s/it] 45%|████▌ | 2608/5772 [43:24<5:41:07, 6.47s/it] 45%|████▌ | 2608/5772 [43:22<5:41:08, 6.47s/it] {'loss': 0.4728, 'learning_rate': 1.2034082335821436e-05, 'epoch': 0.45} + 45%|████▌ | 2608/5772 [43:24<5:41:07, 6.47s/it] {'loss': 0.4728, 'learning_rate': 1.2034082335821436e-05, 'epoch': 0.45} + 45%|████▌ | 2608/5772 [43:22<5:41:08, 6.47s/it] 45%|████▌ | 2609/5772 [43:30<5:32:47, 6.31s/it] 45%|████▌ | 2609/5772 [43:28<5:32:48, 6.31s/it] {'loss': 0.491, 'learning_rate': 1.2028587348836653e-05, 'epoch': 0.45} + 45%|████▌ | 2609/5772 [43:30<5:32:47, 6.31s/it] {'loss': 0.491, 'learning_rate': 1.2028587348836653e-05, 'epoch': 0.45} + 45%|████▌ | 2609/5772 [43:28<5:32:48, 6.31s/it] 45%|████▌ | 2610/5772 [43:34<5:27:58, 6.22s/it] 45%|████▌ | 2610/5772 [43:36<5:27:59, 6.22s/it] {'loss': 0.4605, 'learning_rate': 1.2023091722959565e-05, 'epoch': 0.45} + 45%|████▌ | 2610/5772 [43:36<5:27:59, 6.22s/it] {'loss': 0.4605, 'learning_rate': 1.2023091722959565e-05, 'epoch': 0.45} + 45%|████▌ | 2610/5772 [43:34<5:27:58, 6.22s/it] 45%|████▌ | 2611/5772 [43:42<5:22:28, 6.12s/it] 45%|████▌ | 2611/5772 [43:40<5:22:28, 6.12s/it] {'loss': 0.4839, 'learning_rate': 1.2017595459920985e-05, 'epoch': 0.45} + 45%|████▌ | 2611/5772 [43:42<5:22:28, 6.12s/it] {'loss': 0.4839, 'learning_rate': 1.2017595459920985e-05, 'epoch': 0.45} + 45%|████▌ | 2611/5772 [43:40<5:22:28, 6.12s/it] 45%|████▌ | 2612/5772 [43:49<5:30:12, 6.27s/it] 45%|████▌ | 2612/5772 [43:47<5:30:12, 6.27s/it] {'loss': 0.4614, 'learning_rate': 1.2012098561451933e-05, 'epoch': 0.45} + 45%|████▌ | 2612/5772 [43:49<5:30:12, 6.27s/it] {'loss': 0.4614, 'learning_rate': 1.2012098561451933e-05, 'epoch': 0.45} + 45%|████▌ | 2612/5772 [43:47<5:30:12, 6.27s/it] 45%|████▌ | 2613/5772 [43:53<5:28:34, 6.24s/it] 45%|████▌ | 2613/5772 [43:55<5:28:34, 6.24s/it] {'loss': 0.4768, 'learning_rate': 1.2006601029283629e-05, 'epoch': 0.45} + 45%|████▌ | 2613/5772 [43:55<5:28:34, 6.24s/it] {'loss': 0.4768, 'learning_rate': 1.2006601029283629e-05, 'epoch': 0.45} + 45%|████▌ | 2613/5772 [43:53<5:28:34, 6.24s/it] 45%|████▌ | 2614/5772 [44:14<9:18:08, 10.60s/it] 45%|████▌ | 2614/5772 [44:16<9:18:10, 10.60s/it] {'loss': 0.4662, 'learning_rate': 1.2001102865147485e-05, 'epoch': 0.45} + 45%|████▌ | 2614/5772 [44:16<9:18:10, 10.60s/it] {'loss': 0.4662, 'learning_rate': 1.2001102865147485e-05, 'epoch': 0.45} + 45%|████▌ | 2614/5772 [44:14<9:18:08, 10.60s/it] 45%|████▌ | 2615/5772 [44:20<8:04:06, 9.20s/it] 45%|████▌ | 2615/5772 [44:22<8:04:06, 9.20s/it] {'loss': 0.4783, 'learning_rate': 1.199560407077512e-05, 'epoch': 0.45} + 45%|████▌ | 2615/5772 [44:22<8:04:06, 9.20s/it] {'loss': 0.4783, 'learning_rate': 1.199560407077512e-05, 'epoch': 0.45} + 45%|████▌ | 2615/5772 [44:20<8:04:06, 9.20s/it] 45%|████▌ | 2616/5772 [44:26<7:14:15, 8.26s/it] 45%|████▌ | 2616/5772 [44:28<7:14:15, 8.26s/it] {'loss': 0.4689, 'learning_rate': 1.1990104647898349e-05, 'epoch': 0.45} + 45%|████▌ | 2616/5772 [44:28<7:14:15, 8.26s/it] {'loss': 0.4689, 'learning_rate': 1.1990104647898349e-05, 'epoch': 0.45} + 45%|████▌ | 2616/5772 [44:26<7:14:15, 8.26s/it] 45%|████▌ | 2617/5772 [44:32<6:40:27, 7.62s/it] 45%|████▌ | 2617/5772 [44:34<6:40:27, 7.62s/it] {'loss': 0.4901, 'learning_rate': 1.1984604598249186e-05, 'epoch': 0.45} + 45%|████▌ | 2617/5772 [44:34<6:40:27, 7.62s/it] {'loss': 0.4901, 'learning_rate': 1.1984604598249186e-05, 'epoch': 0.45} + 45%|████▌ | 2617/5772 [44:32<6:40:27, 7.62s/it] 45%|████▌ | 2618/5772 [44:38<6:17:20, 7.18s/it] 45%|████▌ | 2618/5772 [44:40<6:17:20, 7.18s/it] {'loss': 0.4675, 'learning_rate': 1.1979103923559836e-05, 'epoch': 0.45} + 45%|████▌ | 2618/5772 [44:40<6:17:20, 7.18s/it] {'loss': 0.4675, 'learning_rate': 1.1979103923559836e-05, 'epoch': 0.45} + 45%|████▌ | 2618/5772 [44:38<6:17:20, 7.18s/it] 45%|████▌ | 2619/5772 [44:44<5:57:22, 6.80s/it] 45%|████▌ | 2619/5772 [44:46<5:57:23, 6.80s/it] {'loss': 0.4807, 'learning_rate': 1.1973602625562712e-05, 'epoch': 0.45} + 45%|████▌ | 2619/5772 [44:46<5:57:23, 6.80s/it] {'loss': 0.4807, 'learning_rate': 1.1973602625562712e-05, 'epoch': 0.45} + 45%|████▌ | 2619/5772 [44:44<5:57:22, 6.80s/it] 45%|████▌ | 2620/5772 [44:50<5:47:52, 6.62s/it] 45%|████▌ | 2620/5772 [44:52<5:47:52, 6.62s/it] {'loss': 0.4636, 'learning_rate': 1.1968100705990411e-05, 'epoch': 0.45} + 45%|████▌ | 2620/5772 [44:52<5:47:52, 6.62s/it] {'loss': 0.4636, 'learning_rate': 1.1968100705990411e-05, 'epoch': 0.45} + 45%|████▌ | 2620/5772 [44:50<5:47:52, 6.62s/it] 45%|████▌ | 2621/5772 [44:56<5:39:57, 6.47s/it] 45%|████▌ | 2621/5772 [44:58<5:39:57, 6.47s/it] {'loss': 0.4763, 'learning_rate': 1.1962598166575737e-05, 'epoch': 0.45} + 45%|████▌ | 2621/5772 [44:58<5:39:57, 6.47s/it] {'loss': 0.4763, 'learning_rate': 1.1962598166575737e-05, 'epoch': 0.45} + 45%|████▌ | 2621/5772 [44:56<5:39:57, 6.47s/it] 45%|████▌ | 2622/5772 [45:02<5:34:02, 6.36s/it] 45%|████▌ | 2622/5772 [45:04<5:34:02, 6.36s/it] {'loss': 0.4741, 'learning_rate': 1.1957095009051683e-05, 'epoch': 0.45} + 45%|████▌ | 2622/5772 [45:04<5:34:02, 6.36s/it] {'loss': 0.4741, 'learning_rate': 1.1957095009051683e-05, 'epoch': 0.45} + 45%|████▌ | 2622/5772 [45:02<5:34:02, 6.36s/it] 45%|████▌ | 2623/5772 [45:11<5:36:30, 6.41s/it] 45%|████▌ | 2623/5772 [45:09<5:36:31, 6.41s/it] {'loss': 0.466, 'learning_rate': 1.1951591235151438e-05, 'epoch': 0.45} + 45%|████▌ | 2623/5772 [45:11<5:36:30, 6.41s/it] {'loss': 0.466, 'learning_rate': 1.1951591235151438e-05, 'epoch': 0.45} + 45%|████▌ | 2623/5772 [45:09<5:36:31, 6.41s/it] 45%|████▌ | 2624/5772 [45:15<5:29:04, 6.27s/it] 45%|████▌ | 2624/5772 [45:17<5:29:04, 6.27s/it] {'loss': 0.4722, 'learning_rate': 1.1946086846608383e-05, 'epoch': 0.45} + 45%|████▌ | 2624/5772 [45:17<5:29:04, 6.27s/it] {'loss': 0.4722, 'learning_rate': 1.1946086846608383e-05, 'epoch': 0.45} + 45%|████▌ | 2624/5772 [45:15<5:29:04, 6.27s/it] 45%|████▌ | 2625/5772 [45:23<5:24:13, 6.18s/it] 45%|████▌ | 2625/5772 [45:21<5:24:14, 6.18s/it] {'loss': 0.4787, 'learning_rate': 1.1940581845156097e-05, 'epoch': 0.45} + 45%|████▌ | 2625/5772 [45:23<5:24:13, 6.18s/it] {'loss': 0.4787, 'learning_rate': 1.1940581845156097e-05, 'epoch': 0.45} + 45%|████▌ | 2625/5772 [45:21<5:24:14, 6.18s/it] 45%|████▌ | 2626/5772 [45:29<5:19:47, 6.10s/it] 45%|████▌ | 2626/5772 [45:27<5:19:48, 6.10s/it] {'loss': 0.4593, 'learning_rate': 1.1935076232528348e-05, 'epoch': 0.45} + 45%|████▌ | 2626/5772 [45:29<5:19:47, 6.10s/it] {'loss': 0.4593, 'learning_rate': 1.1935076232528348e-05, 'epoch': 0.45} + 45%|████▌ | 2626/5772 [45:27<5:19:48, 6.10s/it] 46%|████▌ | 2627/5772 [45:34<5:16:43, 6.04s/it] 46%|████▌ | 2627/5772 [45:32<5:16:43, 6.04s/it] {'loss': 0.4751, 'learning_rate': 1.1929570010459096e-05, 'epoch': 0.46} + 46%|████▌ | 2627/5772 [45:34<5:16:43, 6.04s/it] {'loss': 0.4751, 'learning_rate': 1.1929570010459096e-05, 'epoch': 0.46} + 46%|████▌ | 2627/5772 [45:32<5:16:43, 6.04s/it] 46%|████▌ | 2628/5772 [45:41<5:25:37, 6.21s/it] 46%|████▌ | 2628/5772 [45:39<5:25:37, 6.21s/it] {'loss': 0.4698, 'learning_rate': 1.19240631806825e-05, 'epoch': 0.46} + 46%|████▌ | 2628/5772 [45:41<5:25:37, 6.21s/it] {'loss': 0.4698, 'learning_rate': 1.19240631806825e-05, 'epoch': 0.46} + 46%|████▌ | 2628/5772 [45:39<5:25:37, 6.21s/it] 46%|████▌ | 2629/5772 [45:45<5:28:16, 6.27s/it] 46%|████▌ | 2629/5772 [45:47<5:28:16, 6.27s/it] {'loss': 0.4768, 'learning_rate': 1.1918555744932905e-05, 'epoch': 0.46} + 46%|████▌ | 2629/5772 [45:47<5:28:16, 6.27s/it] {'loss': 0.4768, 'learning_rate': 1.1918555744932905e-05, 'epoch': 0.46} + 46%|████▌ | 2629/5772 [45:45<5:28:16, 6.27s/it] 46%|████▌ | 2630/5772 [45:52<5:28:42, 6.28s/it] 46%|████▌ | 2630/5772 [45:54<5:28:42, 6.28s/it] {'loss': 0.477, 'learning_rate': 1.1913047704944845e-05, 'epoch': 0.46} + 46%|████▌ | 2630/5772 [45:54<5:28:42, 6.28s/it] {'loss': 0.477, 'learning_rate': 1.1913047704944845e-05, 'epoch': 0.46} + 46%|████▌ | 2630/5772 [45:52<5:28:42, 6.28s/it] 46%|████▌ | 2631/5772 [46:00<5:29:00, 6.28s/it] 46%|████▌ | 2631/5772 [45:58<5:29:00, 6.28s/it] {'loss': 0.4555, 'learning_rate': 1.1907539062453044e-05, 'epoch': 0.46} + 46%|████▌ | 2631/5772 [46:00<5:29:00, 6.28s/it] {'loss': 0.4555, 'learning_rate': 1.1907539062453044e-05, 'epoch': 0.46} + 46%|████▌ | 2631/5772 [45:58<5:29:00, 6.28s/it] 46%|████▌ | 2632/5772 [46:04<5:25:51, 6.23s/it] 46%|████▌ | 2632/5772 [46:06<5:25:51, 6.23s/it] {'loss': 0.4759, 'learning_rate': 1.1902029819192424e-05, 'epoch': 0.46} + 46%|████▌ | 2632/5772 [46:06<5:25:51, 6.23s/it] {'loss': 0.4759, 'learning_rate': 1.1902029819192424e-05, 'epoch': 0.46} + 46%|████▌ | 2632/5772 [46:04<5:25:51, 6.23s/it] 46%|████▌ | 2633/5772 [46:13<5:36:17, 6.43s/it] 46%|████▌ | 2633/5772 [46:11<5:36:17, 6.43s/it] {'loss': 0.4771, 'learning_rate': 1.1896519976898086e-05, 'epoch': 0.46} + 46%|████▌ | 2633/5772 [46:13<5:36:17, 6.43s/it] {'loss': 0.4771, 'learning_rate': 1.1896519976898086e-05, 'epoch': 0.46} + 46%|████▌ | 2633/5772 [46:11<5:36:17, 6.43s/it] 46%|████▌ | 2634/5772 [46:19<5:32:43, 6.36s/it] 46%|████▌ | 2634/5772 [46:17<5:32:43, 6.36s/it] {'loss': 0.4672, 'learning_rate': 1.1891009537305326e-05, 'epoch': 0.46} + 46%|████▌ | 2634/5772 [46:19<5:32:43, 6.36s/it] {'loss': 0.4672, 'learning_rate': 1.1891009537305326e-05, 'epoch': 0.46} + 46%|████▌ | 2634/5772 [46:17<5:32:43, 6.36s/it] 46%|████▌ | 2635/5772 [46:23<5:27:49, 6.27s/it] 46%|████▌ | 2635/5772 [46:25<5:27:49, 6.27s/it]{'loss': 0.474, 'learning_rate': 1.1885498502149626e-05, 'epoch': 0.46} + {'loss': 0.474, 'learning_rate': 1.1885498502149626e-05, 'epoch': 0.46} + 46%|████▌ | 2635/5772 [46:25<5:27:49, 6.27s/it] 46%|████▌ | 2635/5772 [46:23<5:27:49, 6.27s/it] 46%|████▌ | 2636/5772 [46:29<5:20:28, 6.13s/it] 46%|████▌ | 2636/5772 [46:31<5:20:29, 6.13s/it] {'loss': 0.4618, 'learning_rate': 1.187998687316666e-05, 'epoch': 0.46} + 46%|████▌ | 2636/5772 [46:31<5:20:29, 6.13s/it] {'loss': 0.4618, 'learning_rate': 1.187998687316666e-05, 'epoch': 0.46} + 46%|████▌ | 2636/5772 [46:29<5:20:28, 6.13s/it] 46%|████▌ | 2637/5772 [46:36<5:23:53, 6.20s/it] 46%|████▌ | 2637/5772 [46:37<5:23:53, 6.20s/it] {'loss': 0.474, 'learning_rate': 1.1874474652092279e-05, 'epoch': 0.46} + 46%|████▌ | 2637/5772 [46:37<5:23:53, 6.20s/it] {'loss': 0.474, 'learning_rate': 1.1874474652092279e-05, 'epoch': 0.46} + 46%|████▌ | 2637/5772 [46:36<5:23:53, 6.20s/it] 46%|████▌ | 2638/5772 [46:44<5:21:01, 6.15s/it] 46%|████▌ | 2638/5772 [46:42<5:21:01, 6.15s/it] {'loss': 0.4633, 'learning_rate': 1.1868961840662525e-05, 'epoch': 0.46} + 46%|████▌ | 2638/5772 [46:44<5:21:01, 6.15s/it] {'loss': 0.4633, 'learning_rate': 1.1868961840662525e-05, 'epoch': 0.46} + 46%|████▌ | 2638/5772 [46:42<5:21:01, 6.15s/it] 46%|████▌ | 2639/5772 [46:49<5:15:30, 6.04s/it] 46%|████▌ | 2639/5772 [46:47<5:15:30, 6.04s/it] {'loss': 0.4843, 'learning_rate': 1.1863448440613634e-05, 'epoch': 0.46} + 46%|████▌ | 2639/5772 [46:49<5:15:30, 6.04s/it] {'loss': 0.4843, 'learning_rate': 1.1863448440613634e-05, 'epoch': 0.46} + 46%|████▌ | 2639/5772 [46:47<5:15:30, 6.04s/it] 46%|████▌ | 2640/5772 [46:55<5:14:57, 6.03s/it] 46%|████▌ | 2640/5772 [46:53<5:14:57, 6.03s/it] {'loss': 0.478, 'learning_rate': 1.1857934453682016e-05, 'epoch': 0.46} + 46%|████▌ | 2640/5772 [46:55<5:14:57, 6.03s/it] {'loss': 0.478, 'learning_rate': 1.1857934453682016e-05, 'epoch': 0.46} + 46%|████▌ | 2640/5772 [46:53<5:14:57, 6.03s/it] 46%|████▌ | 2641/5772 [47:01<5:15:38, 6.05s/it] 46%|████▌ | 2641/5772 [46:59<5:15:38, 6.05s/it] {'loss': 0.475, 'learning_rate': 1.1852419881604276e-05, 'epoch': 0.46} + 46%|████▌ | 2641/5772 [47:01<5:15:38, 6.05s/it] {'loss': 0.475, 'learning_rate': 1.1852419881604276e-05, 'epoch': 0.46} + 46%|████▌ | 2641/5772 [46:59<5:15:38, 6.05s/it] 46%|████▌ | 2642/5772 [47:06<5:16:29, 6.07s/it] 46%|████▌ | 2642/5772 [47:08<5:16:29, 6.07s/it] {'loss': 0.4687, 'learning_rate': 1.1846904726117187e-05, 'epoch': 0.46} + 46%|████▌ | 2642/5772 [47:08<5:16:29, 6.07s/it] {'loss': 0.4687, 'learning_rate': 1.1846904726117187e-05, 'epoch': 0.46} + 46%|████▌ | 2642/5772 [47:06<5:16:29, 6.07s/it] 46%|████▌ | 2643/5772 [47:13<5:14:46, 6.04s/it] 46%|████▌ | 2643/5772 [47:12<5:14:47, 6.04s/it] {'loss': 0.4864, 'learning_rate': 1.1841388988957728e-05, 'epoch': 0.46} + 46%|████▌ | 2643/5772 [47:13<5:14:46, 6.04s/it] {'loss': 0.4864, 'learning_rate': 1.1841388988957728e-05, 'epoch': 0.46} + 46%|████▌ | 2643/5772 [47:12<5:14:47, 6.04s/it] 46%|████▌ | 2644/5772 [47:19<5:13:32, 6.01s/it] 46%|████▌ | 2644/5772 [47:17<5:13:32, 6.01s/it] {'loss': 0.4612, 'learning_rate': 1.1835872671863042e-05, 'epoch': 0.46} + 46%|████▌ | 2644/5772 [47:19<5:13:32, 6.01s/it] {'loss': 0.4612, 'learning_rate': 1.1835872671863042e-05, 'epoch': 0.46} + 46%|████▌ | 2644/5772 [47:17<5:13:32, 6.01s/it] 46%|████▌ | 2645/5772 [47:26<5:16:40, 6.08s/it] 46%|████▌ | 2645/5772 [47:24<5:16:40, 6.08s/it] {'loss': 0.4755, 'learning_rate': 1.183035577657047e-05, 'epoch': 0.46} + 46%|████▌ | 2645/5772 [47:26<5:16:40, 6.08s/it] {'loss': 0.4755, 'learning_rate': 1.183035577657047e-05, 'epoch': 0.46} + 46%|████▌ | 2645/5772 [47:24<5:16:40, 6.08s/it] 46%|████▌ | 2646/5772 [47:32<5:13:10, 6.01s/it] 46%|████▌ | 2646/5772 [47:30<5:13:11, 6.01s/it] {'loss': 0.4553, 'learning_rate': 1.1824838304817521e-05, 'epoch': 0.46} + 46%|████▌ | 2646/5772 [47:32<5:13:10, 6.01s/it] {'loss': 0.4553, 'learning_rate': 1.1824838304817521e-05, 'epoch': 0.46} + 46%|████▌ | 2646/5772 [47:30<5:13:11, 6.01s/it] 46%|████▌ | 2647/5772 [47:36<5:15:51, 6.06s/it] 46%|████▌ | 2647/5772 [47:38<5:15:51, 6.06s/it] {'loss': 0.494, 'learning_rate': 1.1819320258341891e-05, 'epoch': 0.46} + 46%|████▌ | 2647/5772 [47:38<5:15:51, 6.06s/it] {'loss': 0.494, 'learning_rate': 1.1819320258341891e-05, 'epoch': 0.46} + 46%|████▌ | 2647/5772 [47:36<5:15:51, 6.06s/it] 46%|████▌ | 2648/5772 [47:42<5:14:19, 6.04s/it] 46%|████▌ | 2648/5772 [47:44<5:14:19, 6.04s/it] {'loss': 0.4708, 'learning_rate': 1.1813801638881466e-05, 'epoch': 0.46} + 46%|████▌ | 2648/5772 [47:44<5:14:19, 6.04s/it] {'loss': 0.4708, 'learning_rate': 1.1813801638881466e-05, 'epoch': 0.46} + 46%|████▌ | 2648/5772 [47:42<5:14:19, 6.04s/it] 46%|████▌ | 2649/5772 [47:48<5:17:50, 6.11s/it] 46%|████▌ | 2649/5772 [47:50<5:17:50, 6.11s/it] {'loss': 0.479, 'learning_rate': 1.1808282448174295e-05, 'epoch': 0.46} + 46%|████▌ | 2649/5772 [47:50<5:17:50, 6.11s/it] {'loss': 0.479, 'learning_rate': 1.1808282448174295e-05, 'epoch': 0.46} + 46%|████▌ | 2649/5772 [47:48<5:17:50, 6.11s/it]3 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +159 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 46%|████▌ | 2650/5772 [47:56<5:18:03, 6.11s/it] 46%|████▌ | 2650/5772 [47:54<5:18:03, 6.11s/it] {'loss': 0.4585, 'learning_rate': 1.1802762687958624e-05, 'epoch': 0.46} + 46%|████▌ | 2650/5772 [47:56<5:18:03, 6.11s/it] {'loss': 0.4585, 'learning_rate': 1.1802762687958624e-05, 'epoch': 0.46} + 46%|████▌ | 2650/5772 [47:54<5:18:03, 6.11s/it] 46%|████▌ | 2651/5772 [48:03<5:23:33, 6.22s/it] 46%|████▌ | 2651/5772 [48:01<5:23:33, 6.22s/it] {'loss': 0.4832, 'learning_rate': 1.1797242359972868e-05, 'epoch': 0.46} + 46%|████▌ | 2651/5772 [48:03<5:23:33, 6.22s/it] {'loss': 0.4832, 'learning_rate': 1.1797242359972868e-05, 'epoch': 0.46} + 46%|████▌ | 2651/5772 [48:01<5:23:33, 6.22s/it] 46%|████▌ | 2652/5772 [48:09<5:20:22, 6.16s/it] 46%|████▌ | 2652/5772 [48:07<5:20:23, 6.16s/it] {'loss': 0.4615, 'learning_rate': 1.1791721465955621e-05, 'epoch': 0.46} + 46%|████▌ | 2652/5772 [48:09<5:20:22, 6.16s/it] {'loss': 0.4615, 'learning_rate': 1.1791721465955621e-05, 'epoch': 0.46} + 46%|████▌ | 2652/5772 [48:07<5:20:23, 6.16s/it] 46%|████▌ | 2653/5772 [48:14<5:15:34, 6.07s/it] 46%|████▌ | 2653/5772 [48:12<5:15:33, 6.07s/it] {'loss': 0.4823, 'learning_rate': 1.1786200007645662e-05, 'epoch': 0.46} + 46%|████▌ | 2653/5772 [48:14<5:15:34, 6.07s/it] {'loss': 0.4823, 'learning_rate': 1.1786200007645662e-05, 'epoch': 0.46} + 46%|████▌ | 2653/5772 [48:12<5:15:33, 6.07s/it] 46%|████▌ | 2654/5772 [48:19<5:21:51, 6.19s/it] 46%|████▌ | 2654/5772 [48:21<5:21:51, 6.19s/it] {'loss': 0.4647, 'learning_rate': 1.1780677986781943e-05, 'epoch': 0.46} + 46%|████▌ | 2654/5772 [48:21<5:21:51, 6.19s/it] {'loss': 0.4647, 'learning_rate': 1.1780677986781943e-05, 'epoch': 0.46} + 46%|████▌ | 2654/5772 [48:19<5:21:51, 6.19s/it] 46%|████▌ | 2655/5772 [48:27<5:20:53, 6.18s/it] 46%|████▌ | 2655/5772 [48:25<5:20:53, 6.18s/it] {'loss': 0.4725, 'learning_rate': 1.177515540510359e-05, 'epoch': 0.46} + 46%|████▌ | 2655/5772 [48:27<5:20:53, 6.18s/it] {'loss': 0.4725, 'learning_rate': 1.177515540510359e-05, 'epoch': 0.46} + 46%|████▌ | 2655/5772 [48:25<5:20:53, 6.18s/it] 46%|████▌ | 2656/5772 [48:31<5:20:38, 6.17s/it] 46%|████▌ | 2656/5772 [48:33<5:20:39, 6.17s/it] {'loss': 0.461, 'learning_rate': 1.1769632264349914e-05, 'epoch': 0.46} + 46%|████▌ | 2656/5772 [48:33<5:20:39, 6.17s/it] {'loss': 0.461, 'learning_rate': 1.1769632264349914e-05, 'epoch': 0.46} + 46%|████▌ | 2656/5772 [48:31<5:20:38, 6.17s/it] 46%|████▌ | 2657/5772 [48:39<5:17:10, 6.11s/it] 46%|████▌ | 2657/5772 [48:37<5:17:11, 6.11s/it] {'loss': 0.478, 'learning_rate': 1.1764108566260392e-05, 'epoch': 0.46} + 46%|████▌ | 2657/5772 [48:39<5:17:10, 6.11s/it] {'loss': 0.478, 'learning_rate': 1.1764108566260392e-05, 'epoch': 0.46} + 46%|████▌ | 2657/5772 [48:37<5:17:11, 6.11s/it] 46%|████▌ | 2658/5772 [48:44<5:23:22, 6.23s/it] 46%|████▌ | 2658/5772 [48:46<5:23:22, 6.23s/it] {'loss': 0.4814, 'learning_rate': 1.1758584312574693e-05, 'epoch': 0.46} + 46%|████▌ | 2658/5772 [48:46<5:23:22, 6.23s/it] {'loss': 0.4814, 'learning_rate': 1.1758584312574693e-05, 'epoch': 0.46} + 46%|████▌ | 2658/5772 [48:44<5:23:22, 6.23s/it] 46%|████▌ | 2659/5772 [48:50<5:22:47, 6.22s/it] 46%|████▌ | 2659/5772 [48:52<5:22:47, 6.22s/it] {'loss': 0.4764, 'learning_rate': 1.1753059505032636e-05, 'epoch': 0.46} + 46%|████▌ | 2659/5772 [48:52<5:22:47, 6.22s/it] {'loss': 0.4764, 'learning_rate': 1.1753059505032636e-05, 'epoch': 0.46} + 46%|████▌ | 2659/5772 [48:50<5:22:47, 6.22s/it] 46%|████▌ | 2660/5772 [48:56<5:25:16, 6.27s/it] 46%|████▌ | 2660/5772 [48:58<5:25:18, 6.27s/it] {'loss': 0.4802, 'learning_rate': 1.1747534145374236e-05, 'epoch': 0.46} + 46%|████▌ | 2660/5772 [48:58<5:25:18, 6.27s/it] {'loss': 0.4802, 'learning_rate': 1.1747534145374236e-05, 'epoch': 0.46} + 46%|████▌ | 2660/5772 [48:56<5:25:16, 6.27s/it] 46%|████▌ | 2661/5772 [49:03<5:25:30, 6.28s/it] 46%|████▌ | 2661/5772 [49:05<5:25:30, 6.28s/it] {'loss': 0.4746, 'learning_rate': 1.1742008235339677e-05, 'epoch': 0.46} + 46%|████▌ | 2661/5772 [49:05<5:25:30, 6.28s/it] {'loss': 0.4746, 'learning_rate': 1.1742008235339677e-05, 'epoch': 0.46} + 46%|████▌ | 2661/5772 [49:03<5:25:30, 6.28s/it] 46%|████▌ | 2662/5772 [49:09<5:22:06, 6.21s/it] 46%|████▌ | 2662/5772 [49:11<5:22:06, 6.21s/it] {'loss': 0.465, 'learning_rate': 1.1736481776669307e-05, 'epoch': 0.46} + 46%|████▌ | 2662/5772 [49:11<5:22:06, 6.21s/it] {'loss': 0.465, 'learning_rate': 1.1736481776669307e-05, 'epoch': 0.46} + 46%|████▌ | 2662/5772 [49:09<5:22:06, 6.21s/it] 46%|████▌ | 2663/5772 [49:15<5:19:18, 6.16s/it] 46%|████▌ | 2663/5772 [49:17<5:19:18, 6.16s/it] {'loss': 0.4773, 'learning_rate': 1.1730954771103653e-05, 'epoch': 0.46} + 46%|████▌ | 2663/5772 [49:17<5:19:18, 6.16s/it] {'loss': 0.4773, 'learning_rate': 1.1730954771103653e-05, 'epoch': 0.46} + 46%|████▌ | 2663/5772 [49:15<5:19:18, 6.16s/it] 46%|████▌ | 2664/5772 [49:23<5:20:27, 6.19s/it] 46%|████▌ | 2664/5772 [49:21<5:20:27, 6.19s/it] {'loss': 0.482, 'learning_rate': 1.1725427220383421e-05, 'epoch': 0.46} + 46%|████▌ | 2664/5772 [49:23<5:20:27, 6.19s/it] {'loss': 0.482, 'learning_rate': 1.1725427220383421e-05, 'epoch': 0.46} + 46%|████▌ | 2664/5772 [49:21<5:20:27, 6.19s/it] 46%|████▌ | 2665/5772 [49:29<5:20:06, 6.18s/it] 46%|████▌ | 2665/5772 [49:27<5:20:06, 6.18s/it] {'loss': 0.4719, 'learning_rate': 1.1719899126249482e-05, 'epoch': 0.46} + 46%|████▌ | 2665/5772 [49:29<5:20:06, 6.18s/it] {'loss': 0.4719, 'learning_rate': 1.1719899126249482e-05, 'epoch': 0.46} + 46%|████▌ | 2665/5772 [49:27<5:20:06, 6.18s/it] 46%|████▌ | 2666/5772 [49:33<5:15:03, 6.09s/it] 46%|████▌ | 2666/5772 [49:35<5:15:03, 6.09s/it] {'loss': 0.4558, 'learning_rate': 1.1714370490442872e-05, 'epoch': 0.46} + 46%|████▌ | 2666/5772 [49:35<5:15:03, 6.09s/it] {'loss': 0.4558, 'learning_rate': 1.1714370490442872e-05, 'epoch': 0.46} + 46%|████▌ | 2666/5772 [49:33<5:15:03, 6.09s/it] 46%|████▌ | 2667/5772 [49:41<5:10:46, 6.01s/it] 46%|████▌ | 2667/5772 [49:39<5:10:46, 6.01s/it] {'loss': 0.4749, 'learning_rate': 1.1708841314704811e-05, 'epoch': 0.46} + 46%|████▌ | 2667/5772 [49:41<5:10:46, 6.01s/it] {'loss': 0.4749, 'learning_rate': 1.1708841314704811e-05, 'epoch': 0.46} + 46%|████▌ | 2667/5772 [49:39<5:10:46, 6.01s/it] 46%|████▌ | 2668/5772 [49:47<5:14:42, 6.08s/it] 46%|████▌ | 2668/5772 [49:45<5:14:43, 6.08s/it] {'loss': 0.4726, 'learning_rate': 1.1703311600776677e-05, 'epoch': 0.46} + 46%|████▌ | 2668/5772 [49:47<5:14:42, 6.08s/it] {'loss': 0.4726, 'learning_rate': 1.1703311600776677e-05, 'epoch': 0.46} + 46%|████▌ | 2668/5772 [49:45<5:14:43, 6.08s/it] 46%|████▌ | 2669/5772 [49:52<5:21:40, 6.22s/it] 46%|████▌ | 2669/5772 [49:54<5:21:40, 6.22s/it] {'loss': 0.4765, 'learning_rate': 1.1697781350400025e-05, 'epoch': 0.46} + 46%|████▌ | 2669/5772 [49:54<5:21:40, 6.22s/it] {'loss': 0.4765, 'learning_rate': 1.1697781350400025e-05, 'epoch': 0.46} + 46%|████▌ | 2669/5772 [49:52<5:21:40, 6.22s/it] 46%|████▋ | 2670/5772 [50:00<5:20:36, 6.20s/it] 46%|████▋ | 2670/5772 [49:58<5:20:36, 6.20s/it] {'loss': 0.4621, 'learning_rate': 1.1692250565316577e-05, 'epoch': 0.46} + 46%|████▋ | 2670/5772 [50:00<5:20:36, 6.20s/it] {'loss': 0.4621, 'learning_rate': 1.1692250565316577e-05, 'epoch': 0.46} + 46%|████▋ | 2670/5772 [49:58<5:20:36, 6.20s/it] 46%|████▋ | 2671/5772 [50:06<5:16:15, 6.12s/it] 46%|████▋ | 2671/5772 [50:04<5:16:15, 6.12s/it] {'loss': 0.477, 'learning_rate': 1.1686719247268221e-05, 'epoch': 0.46} + 46%|████▋ | 2671/5772 [50:06<5:16:15, 6.12s/it] {'loss': 0.477, 'learning_rate': 1.1686719247268221e-05, 'epoch': 0.46} + 46%|████▋ | 2671/5772 [50:04<5:16:15, 6.12s/it] 46%|████▋ | 2672/5772 [50:12<5:19:11, 6.18s/it] 46%|████▋ | 2672/5772 [50:10<5:19:11, 6.18s/it] {'loss': 0.4871, 'learning_rate': 1.1681187397997018e-05, 'epoch': 0.46} + 46%|████▋ | 2672/5772 [50:12<5:19:11, 6.18s/it] {'loss': 0.4871, 'learning_rate': 1.1681187397997018e-05, 'epoch': 0.46} + 46%|████▋ | 2672/5772 [50:10<5:19:11, 6.18s/it] 46%|████▋ | 2673/5772 [50:18<5:16:35, 6.13s/it] 46%|████▋ | 2673/5772 [50:16<5:16:35, 6.13s/it] {'loss': 0.4632, 'learning_rate': 1.1675655019245191e-05, 'epoch': 0.46} + 46%|████▋ | 2673/5772 [50:18<5:16:35, 6.13s/it] {'loss': 0.4632, 'learning_rate': 1.1675655019245191e-05, 'epoch': 0.46} + 46%|████▋ | 2673/5772 [50:16<5:16:35, 6.13s/it] 46%|████▋ | 2674/5772 [50:24<5:18:33, 6.17s/it] 46%|████▋ | 2674/5772 [50:22<5:18:33, 6.17s/it] {'loss': 0.4695, 'learning_rate': 1.1670122112755134e-05, 'epoch': 0.46} + 46%|████▋ | 2674/5772 [50:24<5:18:33, 6.17s/it] {'loss': 0.4695, 'learning_rate': 1.1670122112755134e-05, 'epoch': 0.46} + 46%|████▋ | 2674/5772 [50:22<5:18:33, 6.17s/it] 46%|████▋ | 2675/5772 [50:30<5:19:17, 6.19s/it] 46%|████▋ | 2675/5772 [50:29<5:19:16, 6.19s/it] {'loss': 0.4929, 'learning_rate': 1.1664588680269403e-05, 'epoch': 0.46} + 46%|████▋ | 2675/5772 [50:30<5:19:17, 6.19s/it] {'loss': 0.4929, 'learning_rate': 1.1664588680269403e-05, 'epoch': 0.46} + 46%|████▋ | 2675/5772 [50:29<5:19:16, 6.19s/it] 46%|████▋ | 2676/5772 [50:35<5:16:25, 6.13s/it] 46%|████▋ | 2676/5772 [50:36<5:16:25, 6.13s/it] {'loss': 0.4679, 'learning_rate': 1.1659054723530721e-05, 'epoch': 0.46} + 46%|████▋ | 2676/5772 [50:36<5:16:25, 6.13s/it] {'loss': 0.4679, 'learning_rate': 1.1659054723530721e-05, 'epoch': 0.46} + 46%|████▋ | 2676/5772 [50:35<5:16:25, 6.13s/it] 46%|████▋ | 2677/5772 [50:41<5:18:54, 6.18s/it] 46%|████▋ | 2677/5772 [50:43<5:18:55, 6.18s/it] {'loss': 0.4802, 'learning_rate': 1.1653520244281975e-05, 'epoch': 0.46} + 46%|████▋ | 2677/5772 [50:43<5:18:55, 6.18s/it] {'loss': 0.4802, 'learning_rate': 1.1653520244281975e-05, 'epoch': 0.46} + 46%|████▋ | 2677/5772 [50:41<5:18:54, 6.18s/it] 46%|████▋ | 2678/5772 [50:49<5:18:05, 6.17s/it] 46%|████▋ | 2678/5772 [50:47<5:18:06, 6.17s/it] {'loss': 0.4704, 'learning_rate': 1.1647985244266226e-05, 'epoch': 0.46} + 46%|████▋ | 2678/5772 [50:49<5:18:05, 6.17s/it] {'loss': 0.4704, 'learning_rate': 1.1647985244266226e-05, 'epoch': 0.46} + 46%|████▋ | 2678/5772 [50:47<5:18:06, 6.17s/it] 46%|████▋ | 2679/5772 [50:55<5:17:10, 6.15s/it] 46%|████▋ | 2679/5772 [50:53<5:17:11, 6.15s/it] {'loss': 0.4741, 'learning_rate': 1.1642449725226685e-05, 'epoch': 0.46} + 46%|████▋ | 2679/5772 [50:55<5:17:10, 6.15s/it] {'loss': 0.4741, 'learning_rate': 1.1642449725226685e-05, 'epoch': 0.46} + 46%|████▋ | 2679/5772 [50:53<5:17:11, 6.15s/it] 46%|████▋ | 2680/5772 [50:59<5:19:02, 6.19s/it] 46%|████▋ | 2680/5772 [51:01<5:19:02, 6.19s/it] {'loss': 0.4736, 'learning_rate': 1.1636913688906739e-05, 'epoch': 0.46} + 46%|████▋ | 2680/5772 [51:01<5:19:02, 6.19s/it] {'loss': 0.4736, 'learning_rate': 1.1636913688906739e-05, 'epoch': 0.46} + 46%|████▋ | 2680/5772 [50:59<5:19:02, 6.19s/it] 46%|████▋ | 2681/5772 [51:07<5:15:01, 6.11s/it] 46%|████▋ | 2681/5772 [51:05<5:15:01, 6.11s/it] {'loss': 0.4716, 'learning_rate': 1.1631377137049925e-05, 'epoch': 0.46} + 46%|████▋ | 2681/5772 [51:07<5:15:01, 6.11s/it] {'loss': 0.4716, 'learning_rate': 1.1631377137049925e-05, 'epoch': 0.46} + 46%|████▋ | 2681/5772 [51:05<5:15:01, 6.11s/it] 46%|████▋ | 2682/5772 [51:13<5:10:16, 6.02s/it] 46%|████▋ | 2682/5772 [51:11<5:10:16, 6.02s/it] {'loss': 0.4609, 'learning_rate': 1.1625840071399952e-05, 'epoch': 0.46} + 46%|████▋ | 2682/5772 [51:13<5:10:16, 6.02s/it] {'loss': 0.4609, 'learning_rate': 1.1625840071399952e-05, 'epoch': 0.46} + 46%|████▋ | 2682/5772 [51:11<5:10:16, 6.02s/it] 46%|████▋ | 2683/5772 [51:17<5:13:13, 6.08s/it] 46%|████▋ | 2683/5772 [51:19<5:13:14, 6.08s/it] {'loss': 0.479, 'learning_rate': 1.1620302493700689e-05, 'epoch': 0.46} + 46%|████▋ | 2683/5772 [51:19<5:13:14, 6.08s/it] {'loss': 0.479, 'learning_rate': 1.1620302493700689e-05, 'epoch': 0.46} + 46%|████▋ | 2683/5772 [51:17<5:13:13, 6.08s/it] 47%|████▋ | 2684/5772 [51:23<5:11:15, 6.05s/it] 47%|████▋ | 2684/5772 [51:25<5:11:15, 6.05s/it] {'loss': 0.4727, 'learning_rate': 1.1614764405696162e-05, 'epoch': 0.46} + 47%|████▋ | 2684/5772 [51:25<5:11:15, 6.05s/it] {'loss': 0.4727, 'learning_rate': 1.1614764405696162e-05, 'epoch': 0.46} + 47%|████▋ | 2684/5772 [51:23<5:11:15, 6.05s/it] 47%|████▋ | 2685/5772 [51:29<5:10:23, 6.03s/it] 47%|████▋ | 2685/5772 [51:31<5:10:23, 6.03s/it] {'loss': 0.4828, 'learning_rate': 1.1609225809130566e-05, 'epoch': 0.47} + 47%|████▋ | 2685/5772 [51:31<5:10:23, 6.03s/it] {'loss': 0.4828, 'learning_rate': 1.1609225809130566e-05, 'epoch': 0.47} + 47%|████▋ | 2685/5772 [51:29<5:10:23, 6.03s/it] 47%|████▋ | 2686/5772 [51:35<5:10:48, 6.04s/it] 47%|████▋ | 2686/5772 [51:37<5:10:48, 6.04s/it] {'loss': 0.4744, 'learning_rate': 1.1603686705748247e-05, 'epoch': 0.47} + 47%|████▋ | 2686/5772 [51:37<5:10:48, 6.04s/it] {'loss': 0.4744, 'learning_rate': 1.1603686705748247e-05, 'epoch': 0.47} + 47%|████▋ | 2686/5772 [51:35<5:10:48, 6.04s/it] 47%|████▋ | 2687/5772 [51:42<5:14:42, 6.12s/it] 47%|████▋ | 2687/5772 [51:44<5:14:42, 6.12s/it] {'loss': 0.4803, 'learning_rate': 1.1598147097293721e-05, 'epoch': 0.47} + 47%|████▋ | 2687/5772 [51:44<5:14:42, 6.12s/it] {'loss': 0.4803, 'learning_rate': 1.1598147097293721e-05, 'epoch': 0.47} + 47%|████▋ | 2687/5772 [51:42<5:14:42, 6.12s/it] 47%|████▋ | 2688/5772 [51:49<5:09:27, 6.02s/it] 47%|████▋ | 2688/5772 [51:47<5:09:27, 6.02s/it] {'loss': 0.4444, 'learning_rate': 1.1592606985511648e-05, 'epoch': 0.47} + 47%|████▋ | 2688/5772 [51:49<5:09:27, 6.02s/it] {'loss': 0.4444, 'learning_rate': 1.1592606985511648e-05, 'epoch': 0.47} + 47%|████▋ | 2688/5772 [51:47<5:09:27, 6.02s/it] 47%|████▋ | 2689/5772 [51:53<5:05:32, 5.95s/it] 47%|████▋ | 2689/5772 [51:55<5:05:32, 5.95s/it] {'loss': 0.4755, 'learning_rate': 1.1587066372146863e-05, 'epoch': 0.47} + 47%|████▋ | 2689/5772 [51:55<5:05:32, 5.95s/it] {'loss': 0.4755, 'learning_rate': 1.1587066372146863e-05, 'epoch': 0.47} + 47%|████▋ | 2689/5772 [51:53<5:05:32, 5.95s/it] 47%|████▋ | 2690/5772 [51:59<5:08:05, 6.00s/it] 47%|████▋ | 2690/5772 [52:01<5:08:06, 6.00s/it] {'loss': 0.4771, 'learning_rate': 1.1581525258944346e-05, 'epoch': 0.47} + 47%|████▋ | 2690/5772 [52:01<5:08:06, 6.00s/it] {'loss': 0.4771, 'learning_rate': 1.1581525258944346e-05, 'epoch': 0.47} + 47%|████▋ | 2690/5772 [51:59<5:08:05, 6.00s/it] 47%|████▋ | 2691/5772 [52:08<5:18:46, 6.21s/it] 47%|████▋ | 2691/5772 [52:06<5:18:46, 6.21s/it] {'loss': 0.4714, 'learning_rate': 1.1575983647649243e-05, 'epoch': 0.47} + 47%|████▋ | 2691/5772 [52:08<5:18:46, 6.21s/it] {'loss': 0.4714, 'learning_rate': 1.1575983647649243e-05, 'epoch': 0.47} + 47%|████▋ | 2691/5772 [52:06<5:18:46, 6.21s/it] 47%|████▋ | 2692/5772 [52:15<5:27:44, 6.38s/it] 47%|████▋ | 2692/5772 [52:13<5:27:44, 6.38s/it] {'loss': 0.4707, 'learning_rate': 1.1570441540006849e-05, 'epoch': 0.47} + 47%|████▋ | 2692/5772 [52:15<5:27:44, 6.38s/it] {'loss': 0.4707, 'learning_rate': 1.1570441540006849e-05, 'epoch': 0.47} + 47%|████▋ | 2692/5772 [52:13<5:27:44, 6.38s/it] 47%|████▋ | 2693/5772 [52:21<5:22:53, 6.29s/it] 47%|████▋ | 2693/5772 [52:19<5:22:53, 6.29s/it] {'loss': 0.4846, 'learning_rate': 1.1564898937762627e-05, 'epoch': 0.47} + 47%|████▋ | 2693/5772 [52:21<5:22:53, 6.29s/it] {'loss': 0.4846, 'learning_rate': 1.1564898937762627e-05, 'epoch': 0.47} + 47%|████▋ | 2693/5772 [52:19<5:22:53, 6.29s/it] 47%|████▋ | 2694/5772 [52:27<5:21:17, 6.26s/it] 47%|████▋ | 2694/5772 [52:25<5:21:17, 6.26s/it] {'loss': 0.4755, 'learning_rate': 1.1559355842662188e-05, 'epoch': 0.47} + 47%|████▋ | 2694/5772 [52:27<5:21:17, 6.26s/it] {'loss': 0.4755, 'learning_rate': 1.1559355842662188e-05, 'epoch': 0.47} + 47%|████▋ | 2694/5772 [52:25<5:21:17, 6.26s/it] 47%|████▋ | 2695/5772 [52:31<5:20:57, 6.26s/it] {'loss': 0.4822, 'learning_rate': 1.155381225645129e-05, 'epoch': 0.47} + 47%|████▋ | 2695/5772 [52:31<5:20:57, 6.26s/it] 47%|████▋ | 2695/5772 [52:33<5:20:57, 6.26s/it] {'loss': 0.4822, 'learning_rate': 1.155381225645129e-05, 'epoch': 0.47} + 47%|████▋ | 2695/5772 [52:33<5:20:57, 6.26s/it] 47%|████▋ | 2696/5772 [52:38<5:23:03, 6.30s/it] 47%|████▋ | 2696/5772 [52:40<5:23:03, 6.30s/it] {'loss': 0.4682, 'learning_rate': 1.1548268180875868e-05, 'epoch': 0.47} + 47%|████▋ | 2696/5772 [52:40<5:23:03, 6.30s/it] {'loss': 0.4682, 'learning_rate': 1.1548268180875868e-05, 'epoch': 0.47} + 47%|████▋ | 2696/5772 [52:38<5:23:03, 6.30s/it] 47%|████▋ | 2697/5772 [52:46<5:21:59, 6.28s/it] 47%|████▋ | 2697/5772 [52:44<5:21:59, 6.28s/it] {'loss': 0.4668, 'learning_rate': 1.1542723617681989e-05, 'epoch': 0.47} + 47%|████▋ | 2697/5772 [52:46<5:21:59, 6.28s/it] {'loss': 0.4668, 'learning_rate': 1.1542723617681989e-05, 'epoch': 0.47} + 47%|████▋ | 2697/5772 [52:44<5:21:59, 6.28s/it] 47%|████▋ | 2698/5772 [52:52<5:18:09, 6.21s/it] 47%|████▋ | 2698/5772 [52:50<5:18:09, 6.21s/it] {'loss': 0.4676, 'learning_rate': 1.1537178568615879e-05, 'epoch': 0.47} + 47%|████▋ | 2698/5772 [52:52<5:18:09, 6.21s/it] {'loss': 0.4676, 'learning_rate': 1.1537178568615879e-05, 'epoch': 0.47} + 47%|████▋ | 2698/5772 [52:50<5:18:09, 6.21s/it] 47%|████▋ | 2699/5772 [52:58<5:14:50, 6.15s/it] 47%|████▋ | 2699/5772 [52:56<5:14:50, 6.15s/it] {'loss': 0.4818, 'learning_rate': 1.1531633035423931e-05, 'epoch': 0.47} + 47%|████▋ | 2699/5772 [52:58<5:14:50, 6.15s/it] {'loss': 0.4818, 'learning_rate': 1.1531633035423931e-05, 'epoch': 0.47} + 47%|████▋ | 2699/5772 [52:56<5:14:50, 6.15s/it]3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +104 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +015 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 47%|████▋ | 2700/5772 [53:04<5:15:38, 6.16s/it] 47%|████▋ | 2700/5772 [53:02<5:15:38, 6.16s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4701, 'learning_rate': 1.1526087019852673e-05, 'epoch': 0.47} + 47%|████▋ | 2700/5772 [53:04<5:15:38, 6.16s/it] {'loss': 0.4701, 'learning_rate': 1.1526087019852673e-05, 'epoch': 0.47} + 47%|████▋ | 2700/5772 [53:02<5:15:38, 6.16s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 47%|████▋ | 2701/5772 [53:27<10:02:00, 11.76s/it] 47%|████▋ | 2701/5772 [53:29<10:02:01, 11.76s/it] {'loss': 0.4807, 'learning_rate': 1.152054052364879e-05, 'epoch': 0.47} + 47%|████▋ | 2701/5772 [53:29<10:02:01, 11.76s/it] {'loss': 0.4807, 'learning_rate': 1.152054052364879e-05, 'epoch': 0.47} + 47%|████▋ | 2701/5772 [53:27<10:02:00, 11.76s/it] 47%|████▋ | 2702/5772 [53:33<8:38:16, 10.13s/it] 47%|████▋ | 2702/5772 [53:35<8:38:17, 10.13s/it] {'loss': 0.4669, 'learning_rate': 1.151499354855913e-05, 'epoch': 0.47} + 47%|████▋ | 2702/5772 [53:35<8:38:17, 10.13s/it] {'loss': 0.4669, 'learning_rate': 1.151499354855913e-05, 'epoch': 0.47} + 47%|████▋ | 2702/5772 [53:33<8:38:16, 10.13s/it] 47%|████▋ | 2703/5772 [53:39<7:35:22, 8.90s/it] 47%|████▋ | 2703/5772 [53:41<7:35:22, 8.90s/it] {'loss': 0.469, 'learning_rate': 1.150944609633067e-05, 'epoch': 0.47} + 47%|████▋ | 2703/5772 [53:41<7:35:22, 8.90s/it] {'loss': 0.469, 'learning_rate': 1.150944609633067e-05, 'epoch': 0.47} + 47%|████▋ | 2703/5772 [53:39<7:35:22, 8.90s/it] 47%|████▋ | 2704/5772 [53:45<6:49:58, 8.02s/it] 47%|████▋ | 2704/5772 [53:47<6:49:58, 8.02s/it] {'loss': 0.4634, 'learning_rate': 1.1503898168710555e-05, 'epoch': 0.47} + 47%|████▋ | 2704/5772 [53:47<6:49:58, 8.02s/it] {'loss': 0.4634, 'learning_rate': 1.1503898168710555e-05, 'epoch': 0.47} + 47%|████▋ | 2704/5772 [53:45<6:49:58, 8.02s/it] 47%|████▋ | 2705/5772 [53:51<6:20:07, 7.44s/it] 47%|████▋ | 2705/5772 [53:53<6:20:07, 7.44s/it] {'loss': 0.4728, 'learning_rate': 1.1498349767446072e-05, 'epoch': 0.47} + 47%|████▋ | 2705/5772 [53:53<6:20:07, 7.44s/it] {'loss': 0.4728, 'learning_rate': 1.1498349767446072e-05, 'epoch': 0.47} + 47%|████▋ | 2705/5772 [53:51<6:20:07, 7.44s/it] 47%|████▋ | 2706/5772 [53:58<6:00:45, 7.06s/it] 47%|████▋ | 2706/5772 [54:00<6:00:45, 7.06s/it] {'loss': 0.4676, 'learning_rate': 1.1492800894284664e-05, 'epoch': 0.47} + 47%|████▋ | 2706/5772 [54:00<6:00:45, 7.06s/it] {'loss': 0.4676, 'learning_rate': 1.1492800894284664e-05, 'epoch': 0.47} + 47%|████▋ | 2706/5772 [53:58<6:00:45, 7.06s/it] 47%|████▋ | 2707/5772 [54:04<5:51:52, 6.89s/it] 47%|████▋ | 2707/5772 [54:06<5:51:52, 6.89s/it] {'loss': 0.4762, 'learning_rate': 1.1487251550973914e-05, 'epoch': 0.47} + 47%|████▋ | 2707/5772 [54:06<5:51:52, 6.89s/it] {'loss': 0.4762, 'learning_rate': 1.1487251550973914e-05, 'epoch': 0.47} + 47%|████▋ | 2707/5772 [54:04<5:51:52, 6.89s/it] 47%|████▋ | 2708/5772 [54:10<5:38:08, 6.62s/it] 47%|████▋ | 2708/5772 [54:12<5:38:08, 6.62s/it] {'loss': 0.4664, 'learning_rate': 1.1481701739261557e-05, 'epoch': 0.47} + 47%|████▋ | 2708/5772 [54:12<5:38:08, 6.62s/it] {'loss': 0.4664, 'learning_rate': 1.1481701739261557e-05, 'epoch': 0.47} + 47%|████▋ | 2708/5772 [54:10<5:38:08, 6.62s/it] 47%|████▋ | 2709/5772 [54:16<5:30:00, 6.46s/it] 47%|████▋ | 2709/5772 [54:18<5:30:00, 6.46s/it] {'loss': 0.4752, 'learning_rate': 1.1476151460895476e-05, 'epoch': 0.47} + 47%|████▋ | 2709/5772 [54:18<5:30:00, 6.46s/it] {'loss': 0.4752, 'learning_rate': 1.1476151460895476e-05, 'epoch': 0.47} + 47%|████▋ | 2709/5772 [54:16<5:30:00, 6.46s/it] 47%|████▋ | 2710/5772 [54:22<5:24:08, 6.35s/it] 47%|████▋ | 2710/5772 [54:24<5:24:08, 6.35s/it] {'loss': 0.4611, 'learning_rate': 1.1470600717623699e-05, 'epoch': 0.47} + 47%|████▋ | 2710/5772 [54:24<5:24:08, 6.35s/it] {'loss': 0.4611, 'learning_rate': 1.1470600717623699e-05, 'epoch': 0.47} + 47%|████▋ | 2710/5772 [54:22<5:24:08, 6.35s/it] 47%|████▋ | 2711/5772 [54:28<5:20:34, 6.28s/it] 47%|████▋ | 2711/5772 [54:30<5:20:34, 6.28s/it] {'loss': 0.4794, 'learning_rate': 1.1465049511194404e-05, 'epoch': 0.47} + 47%|████▋ | 2711/5772 [54:30<5:20:34, 6.28s/it] {'loss': 0.4794, 'learning_rate': 1.1465049511194404e-05, 'epoch': 0.47} + 47%|████▋ | 2711/5772 [54:28<5:20:34, 6.28s/it] 47%|████▋ | 2712/5772 [54:35<5:25:58, 6.39s/it] 47%|████▋ | 2712/5772 [54:37<5:25:58, 6.39s/it] {'loss': 0.473, 'learning_rate': 1.1459497843355907e-05, 'epoch': 0.47} + 47%|████▋ | 2712/5772 [54:37<5:25:58, 6.39s/it] {'loss': 0.473, 'learning_rate': 1.1459497843355907e-05, 'epoch': 0.47} + 47%|████▋ | 2712/5772 [54:35<5:25:58, 6.39s/it] 47%|████▋ | 2713/5772 [54:43<5:17:11, 6.22s/it] 47%|████▋ | 2713/5772 [54:41<5:17:11, 6.22s/it] {'loss': 0.4751, 'learning_rate': 1.1453945715856682e-05, 'epoch': 0.47} + 47%|████▋ | 2713/5772 [54:43<5:17:11, 6.22s/it] {'loss': 0.4751, 'learning_rate': 1.1453945715856682e-05, 'epoch': 0.47} + 47%|████▋ | 2713/5772 [54:41<5:17:11, 6.22s/it] 47%|████▋ | 2714/5772 [54:47<5:12:35, 6.13s/it] 47%|████▋ | 2714/5772 [54:49<5:12:36, 6.13s/it] {'loss': 0.4613, 'learning_rate': 1.1448393130445337e-05, 'epoch': 0.47} + 47%|████▋ | 2714/5772 [54:49<5:12:36, 6.13s/it] {'loss': 0.4613, 'learning_rate': 1.1448393130445337e-05, 'epoch': 0.47} + 47%|████▋ | 2714/5772 [54:47<5:12:35, 6.13s/it] 47%|████▋ | 2715/5772 [54:53<5:09:39, 6.08s/it] 47%|████▋ | 2715/5772 [54:55<5:09:39, 6.08s/it] {'loss': 0.4664, 'learning_rate': 1.1442840088870628e-05, 'epoch': 0.47} + 47%|████▋ | 2715/5772 [54:55<5:09:39, 6.08s/it] {'loss': 0.4664, 'learning_rate': 1.1442840088870628e-05, 'epoch': 0.47} + 47%|████▋ | 2715/5772 [54:53<5:09:39, 6.08s/it] 47%|████▋ | 2716/5772 [55:01<5:12:50, 6.14s/it] 47%|████▋ | 2716/5772 [54:59<5:12:50, 6.14s/it] {'loss': 0.4638, 'learning_rate': 1.1437286592881458e-05, 'epoch': 0.47} + 47%|████▋ | 2716/5772 [55:01<5:12:50, 6.14s/it] {'loss': 0.4638, 'learning_rate': 1.1437286592881458e-05, 'epoch': 0.47} + 47%|████▋ | 2716/5772 [54:59<5:12:50, 6.14s/it] 47%|████▋ | 2717/5772 [55:05<5:16:10, 6.21s/it] 47%|████▋ | 2717/5772 [55:07<5:16:10, 6.21s/it] {'loss': 0.4743, 'learning_rate': 1.1431732644226865e-05, 'epoch': 0.47} + 47%|████▋ | 2717/5772 [55:07<5:16:10, 6.21s/it] {'loss': 0.4743, 'learning_rate': 1.1431732644226865e-05, 'epoch': 0.47} + 47%|████▋ | 2717/5772 [55:05<5:16:10, 6.21s/it] 47%|████▋ | 2718/5772 [55:13<5:12:42, 6.14s/it] 47%|████▋ | 2718/5772 [55:11<5:12:43, 6.14s/it] {'loss': 0.4629, 'learning_rate': 1.1426178244656038e-05, 'epoch': 0.47} + 47%|████▋ | 2718/5772 [55:13<5:12:42, 6.14s/it] {'loss': 0.4629, 'learning_rate': 1.1426178244656038e-05, 'epoch': 0.47} + 47%|████▋ | 2718/5772 [55:11<5:12:43, 6.14s/it] 47%|████▋ | 2719/5772 [55:17<5:09:30, 6.08s/it] 47%|████▋ | 2719/5772 [55:19<5:09:30, 6.08s/it] {'loss': 0.4804, 'learning_rate': 1.1420623395918297e-05, 'epoch': 0.47} + 47%|████▋ | 2719/5772 [55:19<5:09:30, 6.08s/it] {'loss': 0.4804, 'learning_rate': 1.1420623395918297e-05, 'epoch': 0.47} + 47%|████▋ | 2719/5772 [55:17<5:09:30, 6.08s/it] 47%|████▋ | 2720/5772 [55:23<5:07:28, 6.04s/it] 47%|████▋ | 2720/5772 [55:25<5:07:28, 6.04s/it] {'loss': 0.4752, 'learning_rate': 1.1415068099763123e-05, 'epoch': 0.47} + 47%|████▋ | 2720/5772 [55:25<5:07:28, 6.04s/it] {'loss': 0.4752, 'learning_rate': 1.1415068099763123e-05, 'epoch': 0.47} + 47%|████▋ | 2720/5772 [55:23<5:07:28, 6.04s/it] 47%|████▋ | 2721/5772 [55:31<5:04:25, 5.99s/it] 47%|████▋ | 2721/5772 [55:29<5:04:25, 5.99s/it] {'loss': 0.4806, 'learning_rate': 1.1409512357940114e-05, 'epoch': 0.47} + 47%|████▋ | 2721/5772 [55:31<5:04:25, 5.99s/it] {'loss': 0.4806, 'learning_rate': 1.1409512357940114e-05, 'epoch': 0.47} + 47%|████▋ | 2721/5772 [55:29<5:04:25, 5.99s/it] 47%|████▋ | 2722/5772 [55:35<5:05:32, 6.01s/it] 47%|████▋ | 2722/5772 [55:37<5:05:32, 6.01s/it] {'loss': 0.46, 'learning_rate': 1.140395617219903e-05, 'epoch': 0.47} + 47%|████▋ | 2722/5772 [55:37<5:05:32, 6.01s/it] {'loss': 0.46, 'learning_rate': 1.140395617219903e-05, 'epoch': 0.47} + 47%|████▋ | 2722/5772 [55:35<5:05:32, 6.01s/it] 47%|████▋ | 2723/5772 [55:41<5:06:09, 6.02s/it] 47%|████▋ | 2723/5772 [55:43<5:06:10, 6.03s/it] {'loss': 0.4777, 'learning_rate': 1.1398399544289751e-05, 'epoch': 0.47} + 47%|████▋ | 2723/5772 [55:43<5:06:10, 6.03s/it] {'loss': 0.4777, 'learning_rate': 1.1398399544289751e-05, 'epoch': 0.47} + 47%|████▋ | 2723/5772 [55:41<5:06:09, 6.02s/it] 47%|████▋ | 2724/5772 [55:49<5:03:16, 5.97s/it] 47%|████▋ | 2724/5772 [55:47<5:03:17, 5.97s/it] {'loss': 0.4726, 'learning_rate': 1.1392842475962311e-05, 'epoch': 0.47} + 47%|████▋ | 2724/5772 [55:49<5:03:16, 5.97s/it] {'loss': 0.4726, 'learning_rate': 1.1392842475962311e-05, 'epoch': 0.47} + 47%|████▋ | 2724/5772 [55:47<5:03:17, 5.97s/it] 47%|████▋ | 2725/5772 [55:53<5:07:01, 6.05s/it] 47%|████▋ | 2725/5772 [55:55<5:07:02, 6.05s/it] {'loss': 0.4657, 'learning_rate': 1.1387284968966879e-05, 'epoch': 0.47} + 47%|████▋ | 2725/5772 [55:55<5:07:02, 6.05s/it] {'loss': 0.4657, 'learning_rate': 1.1387284968966879e-05, 'epoch': 0.47} + 47%|████▋ | 2725/5772 [55:53<5:07:01, 6.05s/it] 47%|████▋ | 2726/5772 [55:59<5:06:00, 6.03s/it] 47%|████▋ | 2726/5772 [56:01<5:06:00, 6.03s/it] {'loss': 0.4499, 'learning_rate': 1.1381727025053758e-05, 'epoch': 0.47} + 47%|████▋ | 2726/5772 [56:01<5:06:00, 6.03s/it] {'loss': 0.4499, 'learning_rate': 1.1381727025053758e-05, 'epoch': 0.47} + 47%|████▋ | 2726/5772 [55:59<5:06:00, 6.03s/it] 47%|████▋ | 2727/5772 [56:06<5:08:49, 6.09s/it] 47%|████▋ | 2727/5772 [56:08<5:08:49, 6.09s/it] {'loss': 0.4751, 'learning_rate': 1.1376168645973393e-05, 'epoch': 0.47} + 47%|████▋ | 2727/5772 [56:08<5:08:49, 6.09s/it] {'loss': 0.4751, 'learning_rate': 1.1376168645973393e-05, 'epoch': 0.47} + 47%|████▋ | 2727/5772 [56:06<5:08:49, 6.09s/it] 47%|████▋ | 2728/5772 [56:12<5:08:22, 6.08s/it] 47%|████▋ | 2728/5772 [56:14<5:08:22, 6.08s/it] {'loss': 0.46, 'learning_rate': 1.1370609833476365e-05, 'epoch': 0.47} + 47%|████▋ | 2728/5772 [56:14<5:08:22, 6.08s/it] {'loss': 0.46, 'learning_rate': 1.1370609833476365e-05, 'epoch': 0.47} + 47%|████▋ | 2728/5772 [56:12<5:08:22, 6.08s/it] 47%|████▋ | 2729/5772 [56:18<5:11:30, 6.14s/it] 47%|████▋ | 2729/5772 [56:20<5:11:30, 6.14s/it] {'loss': 0.47, 'learning_rate': 1.136505058931339e-05, 'epoch': 0.47} + 47%|████▋ | 2729/5772 [56:20<5:11:30, 6.14s/it] {'loss': 0.47, 'learning_rate': 1.136505058931339e-05, 'epoch': 0.47} + 47%|████▋ | 2729/5772 [56:18<5:11:30, 6.14s/it] 47%|████▋ | 2730/5772 [56:26<5:13:53, 6.19s/it] 47%|████▋ | 2730/5772 [56:24<5:13:54, 6.19s/it] {'loss': 0.462, 'learning_rate': 1.1359490915235323e-05, 'epoch': 0.47} + 47%|████▋ | 2730/5772 [56:26<5:13:53, 6.19s/it] {'loss': 0.462, 'learning_rate': 1.1359490915235323e-05, 'epoch': 0.47} + 47%|████▋ | 2730/5772 [56:24<5:13:54, 6.19s/it] 47%|████▋ | 2731/5772 [56:32<5:08:07, 6.08s/it] 47%|████▋ | 2731/5772 [56:30<5:08:08, 6.08s/it] {'loss': 0.4797, 'learning_rate': 1.135393081299315e-05, 'epoch': 0.47} + 47%|████▋ | 2731/5772 [56:32<5:08:07, 6.08s/it] {'loss': 0.4797, 'learning_rate': 1.135393081299315e-05, 'epoch': 0.47} + 47%|████▋ | 2731/5772 [56:30<5:08:08, 6.08s/it] 47%|████▋ | 2732/5772 [56:36<5:07:20, 6.07s/it] 47%|████▋ | 2732/5772 [56:38<5:07:20, 6.07s/it] {'loss': 0.4655, 'learning_rate': 1.1348370284337996e-05, 'epoch': 0.47} + 47%|████▋ | 2732/5772 [56:36<5:07:20, 6.07s/it]{'loss': 0.4655, 'learning_rate': 1.1348370284337996e-05, 'epoch': 0.47} + 47%|████▋ | 2732/5772 [56:38<5:07:20, 6.07s/it] 47%|████▋ | 2733/5772 [56:42<5:02:59, 5.98s/it] 47%|████▋ | 2733/5772 [56:44<5:03:00, 5.98s/it] {'loss': 0.4653, 'learning_rate': 1.1342809331021117e-05, 'epoch': 0.47} + 47%|████▋ | 2733/5772 [56:44<5:03:00, 5.98s/it] {'loss': 0.4653, 'learning_rate': 1.1342809331021117e-05, 'epoch': 0.47} + 47%|████▋ | 2733/5772 [56:42<5:02:59, 5.98s/it] 47%|████▋ | 2734/5772 [56:50<5:05:56, 6.04s/it] 47%|████▋ | 2734/5772 [56:48<5:05:56, 6.04s/it] {'loss': 0.4585, 'learning_rate': 1.1337247954793904e-05, 'epoch': 0.47} + 47%|████▋ | 2734/5772 [56:50<5:05:56, 6.04s/it] {'loss': 0.4585, 'learning_rate': 1.1337247954793904e-05, 'epoch': 0.47} + 47%|████▋ | 2734/5772 [56:48<5:05:56, 6.04s/it] 47%|████▋ | 2735/5772 [56:54<5:08:35, 6.10s/it] 47%|████▋ | 2735/5772 [56:56<5:08:36, 6.10s/it] {'loss': 0.4755, 'learning_rate': 1.1331686157407887e-05, 'epoch': 0.47} + 47%|████▋ | 2735/5772 [56:56<5:08:36, 6.10s/it] {'loss': 0.4755, 'learning_rate': 1.1331686157407887e-05, 'epoch': 0.47} + 47%|████▋ | 2735/5772 [56:54<5:08:35, 6.10s/it] 47%|████▋ | 2736/5772 [57:01<5:15:14, 6.23s/it] 47%|████▋ | 2736/5772 [57:03<5:15:14, 6.23s/it] {'loss': 0.4748, 'learning_rate': 1.1326123940614715e-05, 'epoch': 0.47} + 47%|████▋ | 2736/5772 [57:03<5:15:14, 6.23s/it] {'loss': 0.4748, 'learning_rate': 1.1326123940614715e-05, 'epoch': 0.47} + 47%|████▋ | 2736/5772 [57:01<5:15:14, 6.23s/it] 47%|████▋ | 2737/5772 [57:07<5:11:54, 6.17s/it] 47%|████▋ | 2737/5772 [57:09<5:11:54, 6.17s/it] {'loss': 0.4718, 'learning_rate': 1.1320561306166182e-05, 'epoch': 0.47} + 47%|████▋ | 2737/5772 [57:09<5:11:54, 6.17s/it] {'loss': 0.4718, 'learning_rate': 1.1320561306166182e-05, 'epoch': 0.47} + 47%|████▋ | 2737/5772 [57:07<5:11:54, 6.17s/it] 47%|████▋ | 2738/5772 [57:15<5:08:12, 6.09s/it] 47%|████▋ | 2738/5772 [57:13<5:08:12, 6.10s/it] {'loss': 0.4608, 'learning_rate': 1.131499825581421e-05, 'epoch': 0.47} + 47%|████▋ | 2738/5772 [57:15<5:08:12, 6.09s/it] {'loss': 0.4608, 'learning_rate': 1.131499825581421e-05, 'epoch': 0.47} + 47%|████▋ | 2738/5772 [57:13<5:08:12, 6.10s/it] 47%|████▋ | 2739/5772 [57:19<5:12:46, 6.19s/it] 47%|████▋ | 2739/5772 [57:21<5:12:46, 6.19s/it] {'loss': 0.4723, 'learning_rate': 1.1309434791310848e-05, 'epoch': 0.47} + 47%|████▋ | 2739/5772 [57:21<5:12:46, 6.19s/it] {'loss': 0.4723, 'learning_rate': 1.1309434791310848e-05, 'epoch': 0.47} + 47%|████▋ | 2739/5772 [57:19<5:12:46, 6.19s/it] 47%|████▋ | 2740/5772 [57:27<5:08:12, 6.10s/it] 47%|████▋ | 2740/5772 [57:25<5:08:12, 6.10s/it] {'loss': 0.4694, 'learning_rate': 1.1303870914408277e-05, 'epoch': 0.47} + 47%|████▋ | 2740/5772 [57:27<5:08:12, 6.10s/it] {'loss': 0.4694, 'learning_rate': 1.1303870914408277e-05, 'epoch': 0.47} + 47%|████▋ | 2740/5772 [57:25<5:08:12, 6.10s/it] 47%|████▋ | 2741/5772 [57:31<5:01:34, 5.97s/it] 47%|████▋ | 2741/5772 [57:33<5:01:34, 5.97s/it] {'loss': 0.4756, 'learning_rate': 1.1298306626858811e-05, 'epoch': 0.47} + 47%|████▋ | 2741/5772 [57:33<5:01:34, 5.97s/it] {'loss': 0.4756, 'learning_rate': 1.1298306626858811e-05, 'epoch': 0.47} + 47%|████▋ | 2741/5772 [57:31<5:01:34, 5.97s/it] 48%|████▊ | 2742/5772 [57:37<5:00:40, 5.95s/it] 48%|████▊ | 2742/5772 [57:39<5:00:40, 5.95s/it] {'loss': 0.471, 'learning_rate': 1.1292741930414894e-05, 'epoch': 0.48} + 48%|████▊ | 2742/5772 [57:39<5:00:40, 5.95s/it] {'loss': 0.471, 'learning_rate': 1.1292741930414894e-05, 'epoch': 0.48} + 48%|████▊ | 2742/5772 [57:37<5:00:40, 5.95s/it] 48%|████▊ | 2743/5772 [57:43<5:08:58, 6.12s/it] 48%|████▊ | 2743/5772 [57:45<5:08:58, 6.12s/it] {'loss': 0.4806, 'learning_rate': 1.128717682682909e-05, 'epoch': 0.48} + 48%|████▊ | 2743/5772 [57:45<5:08:58, 6.12s/it] {'loss': 0.4806, 'learning_rate': 1.128717682682909e-05, 'epoch': 0.48} + 48%|████▊ | 2743/5772 [57:43<5:08:58, 6.12s/it] 48%|████▊ | 2744/5772 [57:49<5:06:35, 6.08s/it] 48%|████▊ | 2744/5772 [57:51<5:06:35, 6.08s/it] {'loss': 0.4507, 'learning_rate': 1.1281611317854107e-05, 'epoch': 0.48} + 48%|████▊ | 2744/5772 [57:51<5:06:35, 6.08s/it] {'loss': 0.4507, 'learning_rate': 1.1281611317854107e-05, 'epoch': 0.48} + 48%|████▊ | 2744/5772 [57:49<5:06:35, 6.08s/it] 48%|████▊ | 2745/5772 [57:55<5:05:20, 6.05s/it] 48%|████▊ | 2745/5772 [57:57<5:05:20, 6.05s/it] {'loss': 0.4736, 'learning_rate': 1.1276045405242761e-05, 'epoch': 0.48} + 48%|████▊ | 2745/5772 [57:57<5:05:20, 6.05s/it] {'loss': 0.4736, 'learning_rate': 1.1276045405242761e-05, 'epoch': 0.48} + 48%|████▊ | 2745/5772 [57:55<5:05:20, 6.05s/it] 48%|████▊ | 2746/5772 [58:01<5:09:42, 6.14s/it] 48%|████▊ | 2746/5772 [58:03<5:09:42, 6.14s/it] {'loss': 0.462, 'learning_rate': 1.127047909074801e-05, 'epoch': 0.48} + 48%|████▊ | 2746/5772 [58:03<5:09:42, 6.14s/it] {'loss': 0.462, 'learning_rate': 1.127047909074801e-05, 'epoch': 0.48} + 48%|████▊ | 2746/5772 [58:01<5:09:42, 6.14s/it] 48%|████▊ | 2747/5772 [58:07<5:04:13, 6.03s/it] 48%|████▊ | 2747/5772 [58:09<5:04:14, 6.03s/it] {'loss': 0.471, 'learning_rate': 1.1264912376122931e-05, 'epoch': 0.48} + 48%|████▊ | 2747/5772 [58:09<5:04:14, 6.03s/it] {'loss': 0.471, 'learning_rate': 1.1264912376122931e-05, 'epoch': 0.48} + 48%|████▊ | 2747/5772 [58:07<5:04:13, 6.03s/it] 48%|████▊ | 2748/5772 [58:15<5:06:12, 6.08s/it] 48%|████▊ | 2748/5772 [58:13<5:06:12, 6.08s/it] {'loss': 0.4628, 'learning_rate': 1.1259345263120738e-05, 'epoch': 0.48} + 48%|████▊ | 2748/5772 [58:15<5:06:12, 6.08s/it] {'loss': 0.4628, 'learning_rate': 1.1259345263120738e-05, 'epoch': 0.48} + 48%|████▊ | 2748/5772 [58:13<5:06:12, 6.08s/it] 48%|████▊ | 2749/5772 [58:20<5:06:32, 6.08s/it] 48%|████▊ | 2749/5772 [58:22<5:06:32, 6.08s/it] {'loss': 0.4649, 'learning_rate': 1.1253777753494753e-05, 'epoch': 0.48} + 48%|████▊ | 2749/5772 [58:22<5:06:32, 6.08s/it] {'loss': 0.4649, 'learning_rate': 1.1253777753494753e-05, 'epoch': 0.48} + 48%|████▊ | 2749/5772 [58:20<5:06:32, 6.08s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 48%|████▊ | 2750/5772 [58:28<5:11:36, 6.19s/it]12 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1 48%|████▊ | 2750/5772 [58:26<5:11:37, 6.19s/it]AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4619, 'learning_rate': 1.1248209848998433e-05, 'epoch': 0.48} + 48%|████▊ | 2750/5772 [58:28<5:11:36, 6.19s/it] {'loss': 0.4619, 'learning_rate': 1.1248209848998433e-05, 'epoch': 0.48} + 48%|████▊ | 2750/5772 [58:26<5:11:37, 6.19s/it] 48%|████▊ | 2751/5772 [58:32<5:08:56, 6.14s/it] 48%|████▊ | 2751/5772 [58:34<5:08:56, 6.14s/it] {'loss': 0.4657, 'learning_rate': 1.1242641551385365e-05, 'epoch': 0.48} + 48%|████▊ | 2751/5772 [58:34<5:08:56, 6.14s/it] {'loss': 0.4657, 'learning_rate': 1.1242641551385365e-05, 'epoch': 0.48} + 48%|████▊ | 2751/5772 [58:32<5:08:56, 6.14s/it] 48%|████▊ | 2752/5772 [58:38<5:06:17, 6.09s/it] 48%|████▊ | 2752/5772 [58:40<5:06:17, 6.09s/it] {'loss': 0.4603, 'learning_rate': 1.1237072862409248e-05, 'epoch': 0.48} + 48%|████▊ | 2752/5772 [58:40<5:06:17, 6.09s/it] {'loss': 0.4603, 'learning_rate': 1.1237072862409248e-05, 'epoch': 0.48} + 48%|████▊ | 2752/5772 [58:38<5:06:17, 6.09s/it] 48%|████▊ | 2753/5772 [58:46<5:04:16, 6.05s/it] 48%|████▊ | 2753/5772 [58:44<5:04:16, 6.05s/it] {'loss': 0.4749, 'learning_rate': 1.1231503783823914e-05, 'epoch': 0.48} + 48%|████▊ | 2753/5772 [58:46<5:04:16, 6.05s/it] {'loss': 0.4749, 'learning_rate': 1.1231503783823914e-05, 'epoch': 0.48} + 48%|████▊ | 2753/5772 [58:44<5:04:16, 6.05s/it] 48%|████▊ | 2754/5772 [58:50<4:58:55, 5.94s/it] 48%|████▊ | 2754/5772 [58:52<4:58:55, 5.94s/it] {'loss': 0.464, 'learning_rate': 1.1225934317383305e-05, 'epoch': 0.48} + 48%|████▊ | 2754/5772 [58:52<4:58:55, 5.94s/it] {'loss': 0.464, 'learning_rate': 1.1225934317383305e-05, 'epoch': 0.48} + 48%|████▊ | 2754/5772 [58:50<4:58:55, 5.94s/it] 48%|████▊ | 2755/5772 [58:56<5:02:47, 6.02s/it] 48%|████▊ | 2755/5772 [58:58<5:02:47, 6.02s/it] {'loss': 0.4737, 'learning_rate': 1.1220364464841502e-05, 'epoch': 0.48} + 48%|████▊ | 2755/5772 [58:58<5:02:47, 6.02s/it] {'loss': 0.4737, 'learning_rate': 1.1220364464841502e-05, 'epoch': 0.48} + 48%|████▊ | 2755/5772 [58:56<5:02:47, 6.02s/it] 48%|████▊ | 2756/5772 [59:02<5:09:37, 6.16s/it] 48%|████▊ | 2756/5772 [59:04<5:09:37, 6.16s/it] {'loss': 0.4646, 'learning_rate': 1.1214794227952694e-05, 'epoch': 0.48} + 48%|████▊ | 2756/5772 [59:04<5:09:37, 6.16s/it] {'loss': 0.4646, 'learning_rate': 1.1214794227952694e-05, 'epoch': 0.48} + 48%|████▊ | 2756/5772 [59:02<5:09:37, 6.16s/it] 48%|████▊ | 2757/5772 [59:09<5:12:51, 6.23s/it] 48%|████▊ | 2757/5772 [59:11<5:12:50, 6.23s/it] {'loss': 0.4833, 'learning_rate': 1.1209223608471202e-05, 'epoch': 0.48} + 48%|████▊ | 2757/5772 [59:11<5:12:50, 6.23s/it] {'loss': 0.4833, 'learning_rate': 1.1209223608471202e-05, 'epoch': 0.48} + 48%|████▊ | 2757/5772 [59:09<5:12:51, 6.23s/it] 48%|████▊ | 2758/5772 [59:15<5:12:16, 6.22s/it] 48%|████▊ | 2758/5772 [59:17<5:12:16, 6.22s/it] {'loss': 0.4797, 'learning_rate': 1.1203652608151456e-05, 'epoch': 0.48} + 48%|████▊ | 2758/5772 [59:17<5:12:16, 6.22s/it] {'loss': 0.4797, 'learning_rate': 1.1203652608151456e-05, 'epoch': 0.48} + 48%|████▊ | 2758/5772 [59:15<5:12:16, 6.22s/it] 48%|████▊ | 2759/5772 [59:21<5:08:13, 6.14s/it] 48%|████▊ | 2759/5772 [59:23<5:08:13, 6.14s/it] {'loss': 0.4794, 'learning_rate': 1.1198081228748012e-05, 'epoch': 0.48} + 48%|████▊ | 2759/5772 [59:23<5:08:13, 6.14s/it] {'loss': 0.4794, 'learning_rate': 1.1198081228748012e-05, 'epoch': 0.48} + 48%|████▊ | 2759/5772 [59:21<5:08:13, 6.14s/it] 48%|████▊ | 2760/5772 [59:27<5:02:31, 6.03s/it] 48%|████▊ | 2760/5772 [59:29<5:02:31, 6.03s/it] {'loss': 0.473, 'learning_rate': 1.1192509472015549e-05, 'epoch': 0.48} + 48%|████▊ | 2760/5772 [59:29<5:02:31, 6.03s/it] {'loss': 0.473, 'learning_rate': 1.1192509472015549e-05, 'epoch': 0.48} + 48%|████▊ | 2760/5772 [59:27<5:02:31, 6.03s/it] 48%|████▊ | 2761/5772 [59:32<4:59:26, 5.97s/it] 48%|████▊ | 2761/5772 [59:34<4:59:26, 5.97s/it] {'loss': 0.4608, 'learning_rate': 1.1186937339708856e-05, 'epoch': 0.48} + 48%|████▊ | 2761/5772 [59:34<4:59:26, 5.97s/it] {'loss': 0.4608, 'learning_rate': 1.1186937339708856e-05, 'epoch': 0.48} + 48%|████▊ | 2761/5772 [59:32<4:59:26, 5.97s/it] 48%|████▊ | 2762/5772 [59:41<5:02:49, 6.04s/it] 48%|████▊ | 2762/5772 [59:39<5:02:50, 6.04s/it] {'loss': 0.472, 'learning_rate': 1.1181364833582848e-05, 'epoch': 0.48} + 48%|████▊ | 2762/5772 [59:41<5:02:49, 6.04s/it] {'loss': 0.472, 'learning_rate': 1.1181364833582848e-05, 'epoch': 0.48} + 48%|████▊ | 2762/5772 [59:39<5:02:50, 6.04s/it] 48%|████▊ | 2763/5772 [59:45<5:03:38, 6.05s/it] 48%|████▊ | 2763/5772 [59:47<5:03:38, 6.05s/it] {'loss': 0.477, 'learning_rate': 1.1175791955392552e-05, 'epoch': 0.48} + 48%|████▊ | 2763/5772 [59:47<5:03:38, 6.05s/it] {'loss': 0.477, 'learning_rate': 1.1175791955392552e-05, 'epoch': 0.48} + 48%|████▊ | 2763/5772 [59:45<5:03:38, 6.05s/it] 48%|████▊ | 2764/5772 [59:51<5:05:18, 6.09s/it] 48%|████▊ | 2764/5772 [59:53<5:05:19, 6.09s/it] {'loss': 0.4608, 'learning_rate': 1.1170218706893121e-05, 'epoch': 0.48} + 48%|████▊ | 2764/5772 [59:53<5:05:19, 6.09s/it] {'loss': 0.4608, 'learning_rate': 1.1170218706893121e-05, 'epoch': 0.48} + 48%|████▊ | 2764/5772 [59:51<5:05:18, 6.09s/it] 48%|████▊ | 2765/5772 [59:57<5:07:17, 6.13s/it] 48%|████▊ | 2765/5772 [59:59<5:07:17, 6.13s/it] {'loss': 0.4727, 'learning_rate': 1.1164645089839812e-05, 'epoch': 0.48} + 48%|████▊ | 2765/5772 [59:59<5:07:17, 6.13s/it] {'loss': 0.4727, 'learning_rate': 1.1164645089839812e-05, 'epoch': 0.48} + 48%|████▊ | 2765/5772 [59:57<5:07:17, 6.13s/it] 48%|████▊ | 2766/5772 [1:00:03<5:08:07, 6.15s/it] 48%|████▊ | 2766/5772 [1:00:05<5:08:07, 6.15s/it] {'loss': 0.4593, 'learning_rate': 1.1159071105988012e-05, 'epoch': 0.48} + 48%|████▊ | 2766/5772 [1:00:05<5:08:07, 6.15s/it] {'loss': 0.4593, 'learning_rate': 1.1159071105988012e-05, 'epoch': 0.48} + 48%|████▊ | 2766/5772 [1:00:03<5:08:07, 6.15s/it] 48%|████▊ | 2767/5772 [1:00:09<5:06:17, 6.12s/it] 48%|████▊ | 2767/5772 [1:00:11<5:06:17, 6.12s/it] {'loss': 0.4681, 'learning_rate': 1.1153496757093205e-05, 'epoch': 0.48} + 48%|████▊ | 2767/5772 [1:00:11<5:06:17, 6.12s/it] {'loss': 0.4681, 'learning_rate': 1.1153496757093205e-05, 'epoch': 0.48} + 48%|████▊ | 2767/5772 [1:00:09<5:06:17, 6.12s/it] 48%|████▊ | 2768/5772 [1:00:15<5:05:16, 6.10s/it] 48%|████▊ | 2768/5772 [1:00:17<5:05:16, 6.10s/it] {'loss': 0.4665, 'learning_rate': 1.114792204491101e-05, 'epoch': 0.48} + 48%|████▊ | 2768/5772 [1:00:17<5:05:16, 6.10s/it] {'loss': 0.4665, 'learning_rate': 1.114792204491101e-05, 'epoch': 0.48} + 48%|████▊ | 2768/5772 [1:00:15<5:05:16, 6.10s/it] 48%|████▊ | 2769/5772 [1:00:21<5:02:54, 6.05s/it] 48%|████▊ | 2769/5772 [1:00:23<5:02:54, 6.05s/it] {'loss': 0.4654, 'learning_rate': 1.1142346971197151e-05, 'epoch': 0.48} + 48%|████▊ | 2769/5772 [1:00:23<5:02:54, 6.05s/it] {'loss': 0.4654, 'learning_rate': 1.1142346971197151e-05, 'epoch': 0.48} + 48%|████▊ | 2769/5772 [1:00:21<5:02:54, 6.05s/it] 48%|████▊ | 2770/5772 [1:00:27<5:03:20, 6.06s/it] 48%|████▊ | 2770/5772 [1:00:29<5:03:20, 6.06s/it] {'loss': 0.4698, 'learning_rate': 1.1136771537707464e-05, 'epoch': 0.48} + 48%|████▊ | 2770/5772 [1:00:29<5:03:20, 6.06s/it] {'loss': 0.4698, 'learning_rate': 1.1136771537707464e-05, 'epoch': 0.48} + 48%|████▊ | 2770/5772 [1:00:27<5:03:20, 6.06s/it] 48%|████▊ | 2771/5772 [1:00:34<5:04:36, 6.09s/it] 48%|████▊ | 2771/5772 [1:00:36<5:04:36, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.1131195746197902e-05, 'epoch': 0.48} + 48%|████▊ | 2771/5772 [1:00:36<5:04:36, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.1131195746197902e-05, 'epoch': 0.48} + 48%|████▊ | 2771/5772 [1:00:34<5:04:36, 6.09s/it] 48%|████▊ | 2772/5772 [1:00:40<5:08:06, 6.16s/it] 48%|████▊ | 2772/5772 [1:00:42<5:08:06, 6.16s/it] {'loss': 0.482, 'learning_rate': 1.1125619598424528e-05, 'epoch': 0.48} + 48%|████▊ | 2772/5772 [1:00:42<5:08:06, 6.16s/it] {'loss': 0.482, 'learning_rate': 1.1125619598424528e-05, 'epoch': 0.48} + 48%|████▊ | 2772/5772 [1:00:40<5:08:06, 6.16s/it] 48%|████▊ | 2773/5772 [1:00:48<5:12:29, 6.25s/it] 48%|████▊ | 2773/5772 [1:00:46<5:12:29, 6.25s/it] {'loss': 0.48, 'learning_rate': 1.1120043096143523e-05, 'epoch': 0.48} + 48%|████▊ | 2773/5772 [1:00:48<5:12:29, 6.25s/it] {'loss': 0.48, 'learning_rate': 1.1120043096143523e-05, 'epoch': 0.48} + 48%|████▊ | 2773/5772 [1:00:46<5:12:29, 6.25s/it] 48%|████▊ | 2774/5772 [1:00:52<5:06:23, 6.13s/it] 48%|████▊ | 2774/5772 [1:00:54<5:06:23, 6.13s/it] {'loss': 0.4747, 'learning_rate': 1.1114466241111168e-05, 'epoch': 0.48} + 48%|████▊ | 2774/5772 [1:00:54<5:06:23, 6.13s/it] {'loss': 0.4747, 'learning_rate': 1.1114466241111168e-05, 'epoch': 0.48} + 48%|████▊ | 2774/5772 [1:00:52<5:06:23, 6.13s/it] 48%|████▊ | 2775/5772 [1:00:58<5:04:39, 6.10s/it] 48%|████▊ | 2775/5772 [1:01:00<5:04:39, 6.10s/it] {'loss': 0.4621, 'learning_rate': 1.110888903508387e-05, 'epoch': 0.48} + 48%|████▊ | 2775/5772 [1:01:00<5:04:39, 6.10s/it] {'loss': 0.4621, 'learning_rate': 1.110888903508387e-05, 'epoch': 0.48} + 48%|████▊ | 2775/5772 [1:00:58<5:04:39, 6.10s/it] 48%|████▊ | 2776/5772 [1:01:05<5:07:00, 6.15s/it] 48%|████▊ | 2776/5772 [1:01:06<5:07:01, 6.15s/it] {'loss': 0.4847, 'learning_rate': 1.1103311479818133e-05, 'epoch': 0.48} + 48%|████▊ | 2776/5772 [1:01:06<5:07:01, 6.15s/it] {'loss': 0.4847, 'learning_rate': 1.1103311479818133e-05, 'epoch': 0.48} + 48%|████▊ | 2776/5772 [1:01:05<5:07:00, 6.15s/it] 48%|████▊ | 2777/5772 [1:01:11<5:08:09, 6.17s/it] 48%|████▊ | 2777/5772 [1:01:13<5:08:09, 6.17s/it] {'loss': 0.4682, 'learning_rate': 1.1097733577070585e-05, 'epoch': 0.48} + 48%|████▊ | 2777/5772 [1:01:13<5:08:09, 6.17s/it] {'loss': 0.4682, 'learning_rate': 1.1097733577070585e-05, 'epoch': 0.48} + 48%|████▊ | 2777/5772 [1:01:11<5:08:09, 6.17s/it] 48%|████▊ | 2778/5772 [1:01:19<5:11:34, 6.24s/it] 48%|████▊ | 2778/5772 [1:01:17<5:11:34, 6.24s/it] {'loss': 0.4803, 'learning_rate': 1.1092155328597945e-05, 'epoch': 0.48} + 48%|████▊ | 2778/5772 [1:01:17<5:11:34, 6.24s/it]{'loss': 0.4803, 'learning_rate': 1.1092155328597945e-05, 'epoch': 0.48} + 48%|████▊ | 2778/5772 [1:01:19<5:11:34, 6.24s/it] 48%|████▊ | 2779/5772 [1:01:25<5:08:20, 6.18s/it] 48%|████▊ | 2779/5772 [1:01:23<5:08:20, 6.18s/it] {'loss': 0.4698, 'learning_rate': 1.108657673615706e-05, 'epoch': 0.48} + 48%|████▊ | 2779/5772 [1:01:25<5:08:20, 6.18s/it] {'loss': 0.4698, 'learning_rate': 1.108657673615706e-05, 'epoch': 0.48} + 48%|████▊ | 2779/5772 [1:01:23<5:08:20, 6.18s/it] 48%|████▊ | 2780/5772 [1:01:29<5:09:14, 6.20s/it] 48%|████▊ | 2780/5772 [1:01:31<5:09:14, 6.20s/it] {'loss': 0.4759, 'learning_rate': 1.1080997801504872e-05, 'epoch': 0.48} + 48%|████▊ | 2780/5772 [1:01:31<5:09:14, 6.20s/it] {'loss': 0.4759, 'learning_rate': 1.1080997801504872e-05, 'epoch': 0.48} + 48%|████▊ | 2780/5772 [1:01:29<5:09:14, 6.20s/it] 48%|████▊ | 2781/5772 [1:01:38<5:08:46, 6.19s/it] 48%|████▊ | 2781/5772 [1:01:36<5:08:46, 6.19s/it] {'loss': 0.4704, 'learning_rate': 1.1075418526398435e-05, 'epoch': 0.48} + 48%|████▊ | 2781/5772 [1:01:38<5:08:46, 6.19s/it] {'loss': 0.4704, 'learning_rate': 1.1075418526398435e-05, 'epoch': 0.48} + 48%|████▊ | 2781/5772 [1:01:36<5:08:46, 6.19s/it] 48%|████▊ | 2782/5772 [1:01:42<5:13:28, 6.29s/it] 48%|████▊ | 2782/5772 [1:01:44<5:13:28, 6.29s/it] {'loss': 0.4643, 'learning_rate': 1.1069838912594914e-05, 'epoch': 0.48} + 48%|████▊ | 2782/5772 [1:01:44<5:13:28, 6.29s/it] {'loss': 0.4643, 'learning_rate': 1.1069838912594914e-05, 'epoch': 0.48} + 48%|████▊ | 2782/5772 [1:01:42<5:13:28, 6.29s/it] 48%|████▊ | 2783/5772 [1:01:48<5:13:00, 6.28s/it] 48%|████▊ | 2783/5772 [1:01:50<5:13:00, 6.28s/it] {'loss': 0.4732, 'learning_rate': 1.1064258961851575e-05, 'epoch': 0.48} + 48%|████▊ | 2783/5772 [1:01:50<5:13:00, 6.28s/it] {'loss': 0.4732, 'learning_rate': 1.1064258961851575e-05, 'epoch': 0.48} + 48%|████▊ | 2783/5772 [1:01:48<5:13:00, 6.28s/it] 48%|████▊ | 2784/5772 [1:01:55<5:17:08, 6.37s/it] 48%|████▊ | 2784/5772 [1:01:57<5:17:09, 6.37s/it] {'loss': 0.4784, 'learning_rate': 1.1058678675925796e-05, 'epoch': 0.48} + 48%|████▊ | 2784/5772 [1:01:57<5:17:09, 6.37s/it] {'loss': 0.4784, 'learning_rate': 1.1058678675925796e-05, 'epoch': 0.48} + 48%|████▊ | 2784/5772 [1:01:55<5:17:08, 6.37s/it] 48%|████▊ | 2785/5772 [1:02:01<5:13:23, 6.30s/it] 48%|████▊ | 2785/5772 [1:02:03<5:13:24, 6.30s/it] {'loss': 0.4694, 'learning_rate': 1.105309805657505e-05, 'epoch': 0.48} + 48%|████▊ | 2785/5772 [1:02:03<5:13:24, 6.30s/it] {'loss': 0.4694, 'learning_rate': 1.105309805657505e-05, 'epoch': 0.48} + 48%|████▊ | 2785/5772 [1:02:01<5:13:23, 6.30s/it] 48%|████▊ | 2786/5772 [1:02:07<5:07:27, 6.18s/it] 48%|████▊ | 2786/5772 [1:02:09<5:07:27, 6.18s/it] {'loss': 0.4724, 'learning_rate': 1.1047517105556933e-05, 'epoch': 0.48} + 48%|████▊ | 2786/5772 [1:02:09<5:07:27, 6.18s/it] {'loss': 0.4724, 'learning_rate': 1.1047517105556933e-05, 'epoch': 0.48} + 48%|████▊ | 2786/5772 [1:02:07<5:07:27, 6.18s/it] 48%|████▊ | 2787/5772 [1:02:13<5:04:40, 6.12s/it] 48%|████▊ | 2787/5772 [1:02:15<5:04:40, 6.12s/it] {'loss': 0.4654, 'learning_rate': 1.1041935824629121e-05, 'epoch': 0.48} + 48%|████▊ | 2787/5772 [1:02:15<5:04:40, 6.12s/it] {'loss': 0.4654, 'learning_rate': 1.1041935824629121e-05, 'epoch': 0.48} + 48%|████▊ | 2787/5772 [1:02:13<5:04:40, 6.12s/it] 48%|████▊ | 2788/5772 [1:02:19<5:06:23, 6.16s/it] 48%|████▊ | 2788/5772 [1:02:21<5:06:24, 6.16s/it] {'loss': 0.4745, 'learning_rate': 1.1036354215549422e-05, 'epoch': 0.48} + 48%|████▊ | 2788/5772 [1:02:21<5:06:24, 6.16s/it] {'loss': 0.4745, 'learning_rate': 1.1036354215549422e-05, 'epoch': 0.48} + 48%|████▊ | 2788/5772 [1:02:19<5:06:23, 6.16s/it] 48%|████▊ | 2789/5772 [1:02:26<5:08:41, 6.21s/it] 48%|████▊ | 2789/5772 [1:02:28<5:08:41, 6.21s/it] {'loss': 0.4658, 'learning_rate': 1.1030772280075714e-05, 'epoch': 0.48} + 48%|████▊ | 2789/5772 [1:02:28<5:08:41, 6.21s/it] {'loss': 0.4658, 'learning_rate': 1.1030772280075714e-05, 'epoch': 0.48} + 48%|████▊ | 2789/5772 [1:02:26<5:08:41, 6.21s/it] 48%|████▊ | 2790/5772 [1:02:34<5:05:40, 6.15s/it] 48%|████▊ | 2790/5772 [1:02:32<5:05:41, 6.15s/it] {'loss': 0.4662, 'learning_rate': 1.1025190019966017e-05, 'epoch': 0.48} + 48%|████▊ | 2790/5772 [1:02:34<5:05:40, 6.15s/it] {'loss': 0.4662, 'learning_rate': 1.1025190019966017e-05, 'epoch': 0.48} + 48%|████▊ | 2790/5772 [1:02:32<5:05:41, 6.15s/it] 48%|████▊ | 2791/5772 [1:02:37<5:00:50, 6.06s/it] 48%|████▊ | 2791/5772 [1:02:39<5:00:50, 6.06s/it] {'loss': 0.461, 'learning_rate': 1.1019607436978419e-05, 'epoch': 0.48} + 48%|████▊ | 2791/5772 [1:02:39<5:00:50, 6.06s/it] {'loss': 0.461, 'learning_rate': 1.1019607436978419e-05, 'epoch': 0.48} + 48%|████▊ | 2791/5772 [1:02:37<5:00:50, 6.06s/it] 48%|████▊ | 2792/5772 [1:02:43<4:58:05, 6.00s/it] 48%|████▊ | 2792/5772 [1:02:45<4:58:05, 6.00s/it] {'loss': 0.473, 'learning_rate': 1.1014024532871128e-05, 'epoch': 0.48} + 48%|████▊ | 2792/5772 [1:02:45<4:58:05, 6.00s/it] {'loss': 0.473, 'learning_rate': 1.1014024532871128e-05, 'epoch': 0.48} + 48%|████▊ | 2792/5772 [1:02:43<4:58:05, 6.00s/it] 48%|████▊ | 2793/5772 [1:02:49<4:59:49, 6.04s/it] 48%|████▊ | 2793/5772 [1:02:51<4:59:49, 6.04s/it] {'loss': 0.4741, 'learning_rate': 1.1008441309402448e-05, 'epoch': 0.48} + 48%|████▊ | 2793/5772 [1:02:51<4:59:49, 6.04s/it] {'loss': 0.4741, 'learning_rate': 1.1008441309402448e-05, 'epoch': 0.48} + 48%|████▊ | 2793/5772 [1:02:49<4:59:49, 6.04s/it] 48%|████▊ | 2794/5772 [1:02:58<5:01:44, 6.08s/it] 48%|████▊ | 2794/5772 [1:02:56<5:01:44, 6.08s/it] {'loss': 0.4655, 'learning_rate': 1.1002857768330786e-05, 'epoch': 0.48} + 48%|████▊ | 2794/5772 [1:02:58<5:01:44, 6.08s/it] {'loss': 0.4655, 'learning_rate': 1.1002857768330786e-05, 'epoch': 0.48} + 48%|████▊ | 2794/5772 [1:02:56<5:01:44, 6.08s/it] 48%|████▊ | 2795/5772 [1:03:01<4:58:44, 6.02s/it] 48%|████▊ | 2795/5772 [1:03:03<4:58:44, 6.02s/it] {'loss': 0.4662, 'learning_rate': 1.0997273911414648e-05, 'epoch': 0.48} + 48%|████▊ | 2795/5772 [1:03:03<4:58:44, 6.02s/it] {'loss': 0.4662, 'learning_rate': 1.0997273911414648e-05, 'epoch': 0.48} + 48%|████▊ | 2795/5772 [1:03:01<4:58:44, 6.02s/it] 48%|████▊ | 2796/5772 [1:03:07<4:54:31, 5.94s/it] 48%|████▊ | 2796/5772 [1:03:09<4:54:32, 5.94s/it] {'loss': 0.4602, 'learning_rate': 1.099168974041263e-05, 'epoch': 0.48} + 48%|████▊ | 2796/5772 [1:03:09<4:54:32, 5.94s/it] {'loss': 0.4602, 'learning_rate': 1.099168974041263e-05, 'epoch': 0.48} + 48%|████▊ | 2796/5772 [1:03:07<4:54:31, 5.94s/it] 48%|████▊ | 2797/5772 [1:03:14<5:02:57, 6.11s/it] 48%|████▊ | 2797/5772 [1:03:16<5:02:57, 6.11s/it] {'loss': 0.4546, 'learning_rate': 1.0986105257083446e-05, 'epoch': 0.48} + 48%|████▊ | 2797/5772 [1:03:16<5:02:57, 6.11s/it] {'loss': 0.4546, 'learning_rate': 1.0986105257083446e-05, 'epoch': 0.48} + 48%|████▊ | 2797/5772 [1:03:14<5:02:57, 6.11s/it] 48%|████▊ | 2798/5772 [1:03:20<5:03:57, 6.13s/it] 48%|████▊ | 2798/5772 [1:03:22<5:03:57, 6.13s/it] {'loss': 0.4846, 'learning_rate': 1.0980520463185894e-05, 'epoch': 0.48} + 48%|████▊ | 2798/5772 [1:03:22<5:03:57, 6.13s/it] {'loss': 0.4846, 'learning_rate': 1.0980520463185894e-05, 'epoch': 0.48} + 48%|████▊ | 2798/5772 [1:03:20<5:03:57, 6.13s/it] 48%|████▊ | 2799/5772 [1:03:26<5:04:34, 6.15s/it] 48%|████▊ | 2799/5772 [1:03:28<5:04:33, 6.15s/it] {'loss': 0.4559, 'learning_rate': 1.0974935360478875e-05, 'epoch': 0.48} + 48%|████▊ | 2799/5772 [1:03:28<5:04:33, 6.15s/it] {'loss': 0.4559, 'learning_rate': 1.0974935360478875e-05, 'epoch': 0.48} + 48%|████▊ | 2799/5772 [1:03:26<5:04:34, 6.15s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +56 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +015 12AutoResumeHook: Checking whether to suspend...8 +AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 49%|████▊ | 2800/5772 [1:03:32<5:01:02, 6.08s/it] 49%|████▊ | 2800/5772 [1:03:34<5:01:02, 6.08s/it]2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4704, 'learning_rate': 1.0969349950721382e-05, 'epoch': 0.49} + 49%|████▊ | 2800/5772 [1:03:34<5:01:02, 6.08s/it] {'loss': 0.4704, 'learning_rate': 1.0969349950721382e-05, 'epoch': 0.49} + 49%|████▊ | 2800/5772 [1:03:32<5:01:02, 6.08s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 49%|████▊ | 2801/5772 [1:04:00<10:29:34, 12.71s/it] 49%|████▊ | 2801/5772 [1:04:02<10:29:34, 12.71s/it] {'loss': 0.4658, 'learning_rate': 1.0963764235672516e-05, 'epoch': 0.49} + 49%|████▊ | 2801/5772 [1:04:02<10:29:34, 12.71s/it] {'loss': 0.4658, 'learning_rate': 1.0963764235672516e-05, 'epoch': 0.49} + 49%|████▊ | 2801/5772 [1:04:00<10:29:34, 12.71s/it] 49%|████▊ | 2802/5772 [1:04:06<8:52:34, 10.76s/it] 49%|████▊ | 2802/5772 [1:04:08<8:52:34, 10.76s/it] {'loss': 0.4947, 'learning_rate': 1.0958178217091455e-05, 'epoch': 0.49} + 49%|████▊ | 2802/5772 [1:04:08<8:52:34, 10.76s/it] {'loss': 0.4947, 'learning_rate': 1.0958178217091455e-05, 'epoch': 0.49} + 49%|████▊ | 2802/5772 [1:04:06<8:52:34, 10.76s/it] 49%|████▊ | 2803/5772 [1:04:12<7:42:37, 9.35s/it] 49%|████▊ | 2803/5772 [1:04:14<7:42:37, 9.35s/it] {'loss': 0.4545, 'learning_rate': 1.0952591896737499e-05, 'epoch': 0.49} + 49%|████▊ | 2803/5772 [1:04:14<7:42:37, 9.35s/it] {'loss': 0.4545, 'learning_rate': 1.0952591896737499e-05, 'epoch': 0.49} + 49%|████▊ | 2803/5772 [1:04:12<7:42:37, 9.35s/it] 49%|████▊ | 2804/5772 [1:04:19<6:54:00, 8.37s/it] 49%|████▊ | 2804/5772 [1:04:21<6:54:01, 8.37s/it] {'loss': 0.4725, 'learning_rate': 1.094700527637002e-05, 'epoch': 0.49} + 49%|████▊ | 2804/5772 [1:04:21<6:54:01, 8.37s/it] {'loss': 0.4725, 'learning_rate': 1.094700527637002e-05, 'epoch': 0.49} + 49%|████▊ | 2804/5772 [1:04:19<6:54:00, 8.37s/it] 49%|████▊ | 2805/5772 [1:04:25<6:18:28, 7.65s/it] 49%|████▊ | 2805/5772 [1:04:26<6:18:28, 7.65s/it] {'loss': 0.4617, 'learning_rate': 1.0941418357748493e-05, 'epoch': 0.49} + 49%|████▊ | 2805/5772 [1:04:27<6:18:28, 7.65s/it] {'loss': 0.4617, 'learning_rate': 1.0941418357748493e-05, 'epoch': 0.49} + 49%|████▊ | 2805/5772 [1:04:25<6:18:28, 7.65s/it] 49%|████▊ | 2806/5772 [1:04:31<5:54:47, 7.18s/it] 49%|████▊ | 2806/5772 [1:04:33<5:54:47, 7.18s/it] {'loss': 0.4739, 'learning_rate': 1.0935831142632489e-05, 'epoch': 0.49} + 49%|████▊ | 2806/5772 [1:04:33<5:54:47, 7.18s/it] {'loss': 0.4739, 'learning_rate': 1.0935831142632489e-05, 'epoch': 0.49} + 49%|████▊ | 2806/5772 [1:04:31<5:54:47, 7.18s/it] 49%|████▊ | 2807/5772 [1:04:39<5:46:34, 7.01s/it] 49%|████▊ | 2807/5772 [1:04:37<5:46:34, 7.01s/it] {'loss': 0.4565, 'learning_rate': 1.0930243632781669e-05, 'epoch': 0.49} + 49%|████▊ | 2807/5772 [1:04:39<5:46:34, 7.01s/it] {'loss': 0.4565, 'learning_rate': 1.0930243632781669e-05, 'epoch': 0.49} + 49%|████▊ | 2807/5772 [1:04:37<5:46:34, 7.01s/it] 49%|████▊ | 2808/5772 [1:04:46<5:37:58, 6.84s/it] 49%|████▊ | 2808/5772 [1:04:44<5:37:58, 6.84s/it] {'loss': 0.472, 'learning_rate': 1.0924655829955793e-05, 'epoch': 0.49} + 49%|████▊ | 2808/5772 [1:04:46<5:37:58, 6.84s/it] {'loss': 0.472, 'learning_rate': 1.0924655829955793e-05, 'epoch': 0.49} + 49%|████▊ | 2808/5772 [1:04:44<5:37:58, 6.84s/it] 49%|████▊ | 2809/5772 [1:04:49<5:22:08, 6.52s/it] 49%|████▊ | 2809/5772 [1:04:51<5:22:09, 6.52s/it] {'loss': 0.4593, 'learning_rate': 1.09190677359147e-05, 'epoch': 0.49} + 49%|████▊ | 2809/5772 [1:04:51<5:22:09, 6.52s/it] {'loss': 0.4593, 'learning_rate': 1.09190677359147e-05, 'epoch': 0.49} + 49%|████▊ | 2809/5772 [1:04:49<5:22:08, 6.52s/it] 49%|████▊ | 2810/5772 [1:04:56<5:15:57, 6.40s/it] 49%|████▊ | 2810/5772 [1:04:58<5:15:56, 6.40s/it] {'loss': 0.4592, 'learning_rate': 1.0913479352418336e-05, 'epoch': 0.49} + 49%|████▊ | 2810/5772 [1:04:58<5:15:56, 6.40s/it] {'loss': 0.4592, 'learning_rate': 1.0913479352418336e-05, 'epoch': 0.49} + 49%|████▊ | 2810/5772 [1:04:56<5:15:57, 6.40s/it] 49%|████▊ | 2811/5772 [1:05:02<5:13:44, 6.36s/it] 49%|████▊ | 2811/5772 [1:05:04<5:13:44, 6.36s/it] {'loss': 0.474, 'learning_rate': 1.0907890681226728e-05, 'epoch': 0.49} + 49%|████▊ | 2811/5772 [1:05:04<5:13:44, 6.36s/it] {'loss': 0.474, 'learning_rate': 1.0907890681226728e-05, 'epoch': 0.49} + 49%|████▊ | 2811/5772 [1:05:02<5:13:44, 6.36s/it] 49%|████▊ | 2812/5772 [1:05:10<5:12:00, 6.32s/it] 49%|████▊ | 2812/5772 [1:05:08<5:12:00, 6.32s/it] {'loss': 0.4666, 'learning_rate': 1.09023017241e-05, 'epoch': 0.49} + 49%|████▊ | 2812/5772 [1:05:10<5:12:00, 6.32s/it] {'loss': 0.4666, 'learning_rate': 1.09023017241e-05, 'epoch': 0.49} + 49%|████▊ | 2812/5772 [1:05:08<5:12:00, 6.32s/it] 49%|████▊ | 2813/5772 [1:05:14<5:06:49, 6.22s/it] 49%|████▊ | 2813/5772 [1:05:16<5:06:50, 6.22s/it] {'loss': 0.4696, 'learning_rate': 1.0896712482798358e-05, 'epoch': 0.49} + 49%|████▊ | 2813/5772 [1:05:16<5:06:50, 6.22s/it] {'loss': 0.4696, 'learning_rate': 1.0896712482798358e-05, 'epoch': 0.49} + 49%|████▊ | 2813/5772 [1:05:14<5:06:49, 6.22s/it] 49%|████▉ | 2814/5772 [1:05:20<5:07:38, 6.24s/it] 49%|████▉ | 2814/5772 [1:05:22<5:07:38, 6.24s/it] {'loss': 0.4647, 'learning_rate': 1.0891122959082108e-05, 'epoch': 0.49} + 49%|████▉ | 2814/5772 [1:05:22<5:07:38, 6.24s/it] {'loss': 0.4647, 'learning_rate': 1.0891122959082108e-05, 'epoch': 0.49} + 49%|████▉ | 2814/5772 [1:05:20<5:07:38, 6.24s/it] 49%|████▉ | 2815/5772 [1:05:27<5:09:17, 6.28s/it] 49%|████▉ | 2815/5772 [1:05:29<5:09:18, 6.28s/it] {'loss': 0.4632, 'learning_rate': 1.0885533154711633e-05, 'epoch': 0.49} + 49%|████▉ | 2815/5772 [1:05:29<5:09:18, 6.28s/it] {'loss': 0.4632, 'learning_rate': 1.0885533154711633e-05, 'epoch': 0.49} + 49%|████▉ | 2815/5772 [1:05:27<5:09:17, 6.28s/it] 49%|████▉ | 2816/5772 [1:05:33<5:05:38, 6.20s/it] 49%|████▉ | 2816/5772 [1:05:35<5:05:37, 6.20s/it] {'loss': 0.4635, 'learning_rate': 1.0879943071447417e-05, 'epoch': 0.49} + 49%|████▉ | 2816/5772 [1:05:35<5:05:37, 6.20s/it] {'loss': 0.4635, 'learning_rate': 1.0879943071447417e-05, 'epoch': 0.49} + 49%|████▉ | 2816/5772 [1:05:33<5:05:38, 6.20s/it] 49%|████▉ | 2817/5772 [1:05:39<5:03:04, 6.15s/it] 49%|████▉ | 2817/5772 [1:05:41<5:03:04, 6.15s/it] {'loss': 0.4699, 'learning_rate': 1.087435271105002e-05, 'epoch': 0.49} + 49%|████▉ | 2817/5772 [1:05:41<5:03:04, 6.15s/it] {'loss': 0.4699, 'learning_rate': 1.087435271105002e-05, 'epoch': 0.49} + 49%|████▉ | 2817/5772 [1:05:39<5:03:04, 6.15s/it] 49%|████▉ | 2818/5772 [1:05:45<4:59:30, 6.08s/it] 49%|████▉ | 2818/5772 [1:05:47<4:59:30, 6.08s/it] {'loss': 0.4664, 'learning_rate': 1.0868762075280102e-05, 'epoch': 0.49} + 49%|████▉ | 2818/5772 [1:05:47<4:59:30, 6.08s/it] {'loss': 0.4664, 'learning_rate': 1.0868762075280102e-05, 'epoch': 0.49} + 49%|████▉ | 2818/5772 [1:05:45<4:59:30, 6.08s/it] 49%|████▉ | 2819/5772 [1:05:51<5:00:44, 6.11s/it] 49%|████▉ | 2819/5772 [1:05:53<5:00:44, 6.11s/it] {'loss': 0.4585, 'learning_rate': 1.0863171165898398e-05, 'epoch': 0.49} + 49%|████▉ | 2819/5772 [1:05:53<5:00:44, 6.11s/it] {'loss': 0.4585, 'learning_rate': 1.0863171165898398e-05, 'epoch': 0.49} + 49%|████▉ | 2819/5772 [1:05:51<5:00:44, 6.11s/it] 49%|████▉ | 2820/5772 [1:05:57<4:57:27, 6.05s/it] 49%|████▉ | 2820/5772 [1:05:59<4:57:27, 6.05s/it] {'loss': 0.4537, 'learning_rate': 1.0857579984665733e-05, 'epoch': 0.49} + 49%|████▉ | 2820/5772 [1:05:59<4:57:27, 6.05s/it] {'loss': 0.4537, 'learning_rate': 1.0857579984665733e-05, 'epoch': 0.49} + 49%|████▉ | 2820/5772 [1:05:57<4:57:27, 6.05s/it] 49%|████▉ | 2821/5772 [1:06:03<4:59:28, 6.09s/it] 49%|████▉ | 2821/5772 [1:06:05<4:59:28, 6.09s/it] {'loss': 0.4682, 'learning_rate': 1.0851988533343022e-05, 'epoch': 0.49} + 49%|████▉ | 2821/5772 [1:06:05<4:59:28, 6.09s/it] {'loss': 0.4682, 'learning_rate': 1.0851988533343022e-05, 'epoch': 0.49} + 49%|████▉ | 2821/5772 [1:06:03<4:59:28, 6.09s/it] 49%|████▉ | 2822/5772 [1:06:09<4:58:59, 6.08s/it] 49%|████▉ | 2822/5772 [1:06:11<4:58:59, 6.08s/it] {'loss': 0.4823, 'learning_rate': 1.0846396813691258e-05, 'epoch': 0.49} + 49%|████▉ | 2822/5772 [1:06:11<4:58:59, 6.08s/it] {'loss': 0.4823, 'learning_rate': 1.0846396813691258e-05, 'epoch': 0.49} + 49%|████▉ | 2822/5772 [1:06:09<4:58:59, 6.08s/it] 49%|████▉ | 2823/5772 [1:06:15<4:56:35, 6.03s/it] 49%|████▉ | 2823/5772 [1:06:17<4:56:35, 6.03s/it] {'loss': 0.4675, 'learning_rate': 1.0840804827471523e-05, 'epoch': 0.49} + 49%|████▉ | 2823/5772 [1:06:17<4:56:35, 6.03s/it] {'loss': 0.4675, 'learning_rate': 1.0840804827471523e-05, 'epoch': 0.49} + 49%|████▉ | 2823/5772 [1:06:15<4:56:35, 6.03s/it] 49%|████▉ | 2824/5772 [1:06:21<4:53:33, 5.97s/it] 49%|████▉ | 2824/5772 [1:06:23<4:53:33, 5.97s/it] {'loss': 0.4673, 'learning_rate': 1.0835212576444983e-05, 'epoch': 0.49} + 49%|████▉ | 2824/5772 [1:06:23<4:53:33, 5.97s/it] {'loss': 0.4673, 'learning_rate': 1.0835212576444983e-05, 'epoch': 0.49} + 49%|████▉ | 2824/5772 [1:06:21<4:53:33, 5.97s/it] 49%|████▉ | 2825/5772 [1:06:27<5:00:27, 6.12s/it] 49%|████▉ | 2825/5772 [1:06:29<5:00:27, 6.12s/it] {'loss': 0.4716, 'learning_rate': 1.0829620062372887e-05, 'epoch': 0.49} + 49%|████▉ | 2825/5772 [1:06:29<5:00:27, 6.12s/it] {'loss': 0.4716, 'learning_rate': 1.0829620062372887e-05, 'epoch': 0.49} + 49%|████▉ | 2825/5772 [1:06:27<5:00:27, 6.12s/it] 49%|████▉ | 2826/5772 [1:06:33<4:59:56, 6.11s/it] 49%|████▉ | 2826/5772 [1:06:35<4:59:57, 6.11s/it] {'loss': 0.4714, 'learning_rate': 1.0824027287016566e-05, 'epoch': 0.49} + 49%|████▉ | 2826/5772 [1:06:35<4:59:57, 6.11s/it] {'loss': 0.4714, 'learning_rate': 1.0824027287016566e-05, 'epoch': 0.49} + 49%|████▉ | 2826/5772 [1:06:33<4:59:56, 6.11s/it] 49%|████▉ | 2827/5772 [1:06:39<4:59:18, 6.10s/it] 49%|████▉ | 2827/5772 [1:06:41<4:59:18, 6.10s/it] {'loss': 0.4608, 'learning_rate': 1.0818434252137428e-05, 'epoch': 0.49} + 49%|████▉ | 2827/5772 [1:06:41<4:59:18, 6.10s/it] {'loss': 0.4608, 'learning_rate': 1.0818434252137428e-05, 'epoch': 0.49} + 49%|████▉ | 2827/5772 [1:06:39<4:59:18, 6.10s/it] 49%|████▉ | 2828/5772 [1:06:47<5:00:01, 6.11s/it] 49%|████▉ | 2828/5772 [1:06:46<5:00:02, 6.11s/it] {'loss': 0.4751, 'learning_rate': 1.0812840959496978e-05, 'epoch': 0.49} + 49%|████▉ | 2828/5772 [1:06:48<5:00:01, 6.11s/it] {'loss': 0.4751, 'learning_rate': 1.0812840959496978e-05, 'epoch': 0.49} + 49%|████▉ | 2828/5772 [1:06:46<5:00:02, 6.11s/it] 49%|████▉ | 2829/5772 [1:06:52<4:59:12, 6.10s/it] 49%|████▉ | 2829/5772 [1:06:54<4:59:12, 6.10s/it] {'loss': 0.4667, 'learning_rate': 1.0807247410856783e-05, 'epoch': 0.49} + 49%|████▉ | 2829/5772 [1:06:54<4:59:12, 6.10s/it] {'loss': 0.4667, 'learning_rate': 1.0807247410856783e-05, 'epoch': 0.49} + 49%|████▉ | 2829/5772 [1:06:52<4:59:12, 6.10s/it] 49%|████▉ | 2830/5772 [1:06:58<4:58:03, 6.08s/it] 49%|████▉ | 2830/5772 [1:07:00<4:58:03, 6.08s/it] {'loss': 0.4661, 'learning_rate': 1.0801653607978506e-05, 'epoch': 0.49} + 49%|████▉ | 2830/5772 [1:07:00<4:58:03, 6.08s/it] {'loss': 0.4661, 'learning_rate': 1.0801653607978506e-05, 'epoch': 0.49} + 49%|████▉ | 2830/5772 [1:06:58<4:58:03, 6.08s/it] 49%|████▉ | 2831/5772 [1:07:04<5:00:48, 6.14s/it] 49%|████▉ | 2831/5772 [1:07:06<5:00:48, 6.14s/it] {'loss': 0.4628, 'learning_rate': 1.0796059552623881e-05, 'epoch': 0.49} + 49%|████▉ | 2831/5772 [1:07:06<5:00:48, 6.14s/it] {'loss': 0.4628, 'learning_rate': 1.0796059552623881e-05, 'epoch': 0.49} + 49%|████▉ | 2831/5772 [1:07:04<5:00:48, 6.14s/it] 49%|████▉ | 2832/5772 [1:07:10<5:03:36, 6.20s/it] 49%|████▉ | 2832/5772 [1:07:12<5:03:36, 6.20s/it] {'loss': 0.48, 'learning_rate': 1.0790465246554728e-05, 'epoch': 0.49} + 49%|████▉ | 2832/5772 [1:07:12<5:03:36, 6.20s/it] {'loss': 0.48, 'learning_rate': 1.0790465246554728e-05, 'epoch': 0.49} + 49%|████▉ | 2832/5772 [1:07:10<5:03:36, 6.20s/it] 49%|████▉ | 2833/5772 [1:07:16<4:58:37, 6.10s/it] 49%|████▉ | 2833/5772 [1:07:18<4:58:37, 6.10s/it] {'loss': 0.4635, 'learning_rate': 1.078487069153294e-05, 'epoch': 0.49} + 49%|████▉ | 2833/5772 [1:07:18<4:58:37, 6.10s/it] {'loss': 0.4635, 'learning_rate': 1.078487069153294e-05, 'epoch': 0.49} + 49%|████▉ | 2833/5772 [1:07:16<4:58:37, 6.10s/it] 49%|████▉ | 2834/5772 [1:07:23<5:06:17, 6.26s/it] 49%|████▉ | 2834/5772 [1:07:25<5:06:16, 6.25s/it] {'loss': 0.4781, 'learning_rate': 1.0779275889320495e-05, 'epoch': 0.49} + 49%|████▉ | 2834/5772 [1:07:25<5:06:16, 6.25s/it] {'loss': 0.4781, 'learning_rate': 1.0779275889320495e-05, 'epoch': 0.49} + 49%|████▉ | 2834/5772 [1:07:23<5:06:17, 6.26s/it] 49%|████▉ | 2835/5772 [1:07:29<5:03:30, 6.20s/it] 49%|████▉ | 2835/5772 [1:07:31<5:03:29, 6.20s/it] {'loss': 0.4668, 'learning_rate': 1.0773680841679437e-05, 'epoch': 0.49} + 49%|████▉ | 2835/5772 [1:07:31<5:03:29, 6.20s/it] {'loss': 0.4668, 'learning_rate': 1.0773680841679437e-05, 'epoch': 0.49} + 49%|████▉ | 2835/5772 [1:07:29<5:03:30, 6.20s/it] 49%|████▉ | 2836/5772 [1:07:35<5:01:45, 6.17s/it] 49%|████▉ | 2836/5772 [1:07:37<5:01:45, 6.17s/it] {'loss': 0.4714, 'learning_rate': 1.0768085550371902e-05, 'epoch': 0.49} + 49%|████▉ | 2836/5772 [1:07:37<5:01:45, 6.17s/it] {'loss': 0.4714, 'learning_rate': 1.0768085550371902e-05, 'epoch': 0.49} + 49%|████▉ | 2836/5772 [1:07:35<5:01:45, 6.17s/it] 49%|████▉ | 2837/5772 [1:07:42<5:11:46, 6.37s/it] 49%|████▉ | 2837/5772 [1:07:44<5:11:46, 6.37s/it] {'loss': 0.459, 'learning_rate': 1.076249001716009e-05, 'epoch': 0.49} + 49%|████▉ | 2837/5772 [1:07:44<5:11:46, 6.37s/it] {'loss': 0.459, 'learning_rate': 1.076249001716009e-05, 'epoch': 0.49} + 49%|████▉ | 2837/5772 [1:07:42<5:11:46, 6.37s/it] 49%|████▉ | 2838/5772 [1:07:48<5:14:43, 6.44s/it] 49%|████▉ | 2838/5772 [1:07:50<5:14:44, 6.44s/it] {'loss': 0.4767, 'learning_rate': 1.0756894243806291e-05, 'epoch': 0.49} + 49%|████▉ | 2838/5772 [1:07:50<5:14:44, 6.44s/it] {'loss': 0.4767, 'learning_rate': 1.0756894243806291e-05, 'epoch': 0.49} + 49%|████▉ | 2838/5772 [1:07:48<5:14:43, 6.44s/it] 49%|████▉ | 2839/5772 [1:07:55<5:13:54, 6.42s/it] 49%|████▉ | 2839/5772 [1:07:57<5:13:54, 6.42s/it] {'loss': 0.4708, 'learning_rate': 1.0751298232072856e-05, 'epoch': 0.49} + 49%|████▉ | 2839/5772 [1:07:57<5:13:54, 6.42s/it] {'loss': 0.4708, 'learning_rate': 1.0751298232072856e-05, 'epoch': 0.49} + 49%|████▉ | 2839/5772 [1:07:55<5:13:54, 6.42s/it] 49%|████▉ | 2840/5772 [1:08:01<5:09:22, 6.33s/it] 49%|████▉ | 2840/5772 [1:08:03<5:09:22, 6.33s/it] {'loss': 0.4797, 'learning_rate': 1.0745701983722219e-05, 'epoch': 0.49} + 49%|████▉ | 2840/5772 [1:08:03<5:09:22, 6.33s/it] {'loss': 0.4797, 'learning_rate': 1.0745701983722219e-05, 'epoch': 0.49} + 49%|████▉ | 2840/5772 [1:08:01<5:09:22, 6.33s/it] 49%|████▉ | 2841/5772 [1:08:09<5:12:35, 6.40s/it] 49%|████▉ | 2841/5772 [1:08:07<5:12:36, 6.40s/it] {'loss': 0.4657, 'learning_rate': 1.0740105500516889e-05, 'epoch': 0.49} + 49%|████▉ | 2841/5772 [1:08:09<5:12:35, 6.40s/it] {'loss': 0.4657, 'learning_rate': 1.0740105500516889e-05, 'epoch': 0.49} + 49%|████▉ | 2841/5772 [1:08:07<5:12:36, 6.40s/it] 49%|████▉ | 2842/5772 [1:08:15<5:06:39, 6.28s/it] 49%|████▉ | 2842/5772 [1:08:13<5:06:39, 6.28s/it] {'loss': 0.4839, 'learning_rate': 1.0734508784219446e-05, 'epoch': 0.49} + 49%|████▉ | 2842/5772 [1:08:15<5:06:39, 6.28s/it] {'loss': 0.4839, 'learning_rate': 1.0734508784219446e-05, 'epoch': 0.49} + 49%|████▉ | 2842/5772 [1:08:13<5:06:39, 6.28s/it] 49%|████▉ | 2843/5772 [1:08:21<5:01:36, 6.18s/it] 49%|████▉ | 2843/5772 [1:08:19<5:01:36, 6.18s/it] {'loss': 0.4604, 'learning_rate': 1.0728911836592548e-05, 'epoch': 0.49} + 49%|████▉ | 2843/5772 [1:08:21<5:01:36, 6.18s/it] {'loss': 0.4604, 'learning_rate': 1.0728911836592548e-05, 'epoch': 0.49} + 49%|████▉ | 2843/5772 [1:08:19<5:01:36, 6.18s/it] 49%|████▉ | 2844/5772 [1:08:27<5:00:45, 6.16s/it] 49%|████▉ | 2844/5772 [1:08:25<5:00:45, 6.16s/it] {'loss': 0.4856, 'learning_rate': 1.0723314659398916e-05, 'epoch': 0.49} + 49%|████▉ | 2844/5772 [1:08:27<5:00:45, 6.16s/it] {'loss': 0.4856, 'learning_rate': 1.0723314659398916e-05, 'epoch': 0.49} + 49%|████▉ | 2844/5772 [1:08:25<5:00:45, 6.16s/it] 49%|████▉ | 2845/5772 [1:08:32<4:59:08, 6.13s/it] 49%|████▉ | 2845/5772 [1:08:33<4:59:08, 6.13s/it] {'loss': 0.4612, 'learning_rate': 1.0717717254401356e-05, 'epoch': 0.49} + 49%|████▉ | 2845/5772 [1:08:33<4:59:08, 6.13s/it] {'loss': 0.4612, 'learning_rate': 1.0717717254401356e-05, 'epoch': 0.49} + 49%|████▉ | 2845/5772 [1:08:32<4:59:08, 6.13s/it] 49%|████▉ | 2846/5772 [1:08:40<5:00:59, 6.17s/it] 49%|████▉ | 2846/5772 [1:08:38<5:00:59, 6.17s/it] {'loss': 0.4742, 'learning_rate': 1.0712119623362738e-05, 'epoch': 0.49} + 49%|████▉ | 2846/5772 [1:08:40<5:00:59, 6.17s/it] {'loss': 0.4742, 'learning_rate': 1.0712119623362738e-05, 'epoch': 0.49} + 49%|████▉ | 2846/5772 [1:08:38<5:00:59, 6.17s/it] 49%|████▉ | 2847/5772 [1:08:44<4:58:17, 6.12s/it] 49%|████▉ | 2847/5772 [1:08:46<4:58:17, 6.12s/it] {'loss': 0.4681, 'learning_rate': 1.0706521768046006e-05, 'epoch': 0.49} + 49%|████▉ | 2847/5772 [1:08:46<4:58:17, 6.12s/it] {'loss': 0.4681, 'learning_rate': 1.0706521768046006e-05, 'epoch': 0.49} + 49%|████▉ | 2847/5772 [1:08:44<4:58:17, 6.12s/it] 49%|████▉ | 2848/5772 [1:08:50<5:02:40, 6.21s/it] 49%|████▉ | 2848/5772 [1:08:52<5:02:40, 6.21s/it] {'loss': 0.4755, 'learning_rate': 1.0700923690214166e-05, 'epoch': 0.49} + 49%|████▉ | 2848/5772 [1:08:52<5:02:40, 6.21s/it] {'loss': 0.4755, 'learning_rate': 1.0700923690214166e-05, 'epoch': 0.49} + 49%|████▉ | 2848/5772 [1:08:50<5:02:40, 6.21s/it] 49%|████▉ | 2849/5772 [1:08:56<5:00:22, 6.17s/it] 49%|████▉ | 2849/5772 [1:08:58<5:00:23, 6.17s/it] {'loss': 0.4642, 'learning_rate': 1.0695325391630309e-05, 'epoch': 0.49} + 49%|████▉ | 2849/5772 [1:08:58<5:00:23, 6.17s/it] {'loss': 0.4642, 'learning_rate': 1.0695325391630309e-05, 'epoch': 0.49} + 49%|████▉ | 2849/5772 [1:08:56<5:00:22, 6.17s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +0758 AutoResumeHook: Checking whether to suspend...1 + 49%|████▉ | 2850/5772 [1:09:04<5:00:10, 6.16s/it] 12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 49%|████▉ | 2850/5772 [1:09:02<5:00:10, 6.16s/it] {'loss': 0.4708, 'learning_rate': 1.0689726874057589e-05, 'epoch': 0.49} + 49%|████▉ | 2850/5772 [1:09:04<5:00:10, 6.16s/it] {'loss': 0.4708, 'learning_rate': 1.0689726874057589e-05, 'epoch': 0.49} + 49%|████▉ | 2850/5772 [1:09:02<5:00:10, 6.16s/it] 49%|████▉ | 2851/5772 [1:09:08<4:54:56, 6.06s/it] 49%|████▉ | 2851/5772 [1:09:10<4:54:56, 6.06s/it] {'loss': 0.4728, 'learning_rate': 1.068412813925922e-05, 'epoch': 0.49} + 49%|████▉ | 2851/5772 [1:09:10<4:54:56, 6.06s/it] {'loss': 0.4728, 'learning_rate': 1.068412813925922e-05, 'epoch': 0.49} + 49%|████▉ | 2851/5772 [1:09:08<4:54:56, 6.06s/it] 49%|████▉ | 2852/5772 [1:09:15<4:59:22, 6.15s/it] 49%|████▉ | 2852/5772 [1:09:17<4:59:22, 6.15s/it] {'loss': 0.4745, 'learning_rate': 1.06785291889985e-05, 'epoch': 0.49} + 49%|████▉ | 2852/5772 [1:09:17<4:59:22, 6.15s/it] {'loss': 0.4745, 'learning_rate': 1.06785291889985e-05, 'epoch': 0.49} + 49%|████▉ | 2852/5772 [1:09:15<4:59:22, 6.15s/it] 49%|████▉ | 2853/5772 [1:09:22<4:54:14, 6.05s/it] 49%|████▉ | 2853/5772 [1:09:20<4:54:14, 6.05s/it] {'loss': 0.4644, 'learning_rate': 1.0672930025038783e-05, 'epoch': 0.49} + 49%|████▉ | 2853/5772 [1:09:22<4:54:14, 6.05s/it] {'loss': 0.4644, 'learning_rate': 1.0672930025038783e-05, 'epoch': 0.49} + 49%|████▉ | 2853/5772 [1:09:20<4:54:14, 6.05s/it] 49%|████▉ | 2854/5772 [1:09:29<4:58:38, 6.14s/it] 49%|████▉ | 2854/5772 [1:09:27<4:58:38, 6.14s/it] {'loss': 0.4762, 'learning_rate': 1.0667330649143498e-05, 'epoch': 0.49} + 49%|████▉ | 2854/5772 [1:09:29<4:58:38, 6.14s/it] {'loss': 0.4762, 'learning_rate': 1.0667330649143498e-05, 'epoch': 0.49} + 49%|████▉ | 2854/5772 [1:09:27<4:58:38, 6.14s/it] 49%|████▉ | 2855/5772 [1:09:35<4:59:33, 6.16s/it] 49%|████▉ | 2855/5772 [1:09:33<4:59:34, 6.16s/it] {'loss': 0.4733, 'learning_rate': 1.0661731063076134e-05, 'epoch': 0.49} + 49%|████▉ | 2855/5772 [1:09:35<4:59:33, 6.16s/it] {'loss': 0.4733, 'learning_rate': 1.0661731063076134e-05, 'epoch': 0.49} + 49%|████▉ | 2855/5772 [1:09:33<4:59:34, 6.16s/it] 49%|████▉ | 2856/5772 [1:09:39<4:53:46, 6.04s/it] 49%|████▉ | 2856/5772 [1:09:41<4:53:47, 6.05s/it] {'loss': 0.4657, 'learning_rate': 1.0656131268600254e-05, 'epoch': 0.49} + 49%|████▉ | 2856/5772 [1:09:41<4:53:47, 6.05s/it] {'loss': 0.4657, 'learning_rate': 1.0656131268600254e-05, 'epoch': 0.49} + 49%|████▉ | 2856/5772 [1:09:39<4:53:46, 6.04s/it] 49%|████▉ | 2857/5772 [1:09:45<4:58:24, 6.14s/it] 49%|████▉ | 2857/5772 [1:09:47<4:58:24, 6.14s/it] {'loss': 0.4802, 'learning_rate': 1.0650531267479477e-05, 'epoch': 0.49} + 49%|████▉ | 2857/5772 [1:09:47<4:58:24, 6.14s/it] {'loss': 0.4802, 'learning_rate': 1.0650531267479477e-05, 'epoch': 0.49} + 49%|████▉ | 2857/5772 [1:09:45<4:58:24, 6.14s/it] 50%|████▉ | 2858/5772 [1:09:53<4:52:53, 6.03s/it] 50%|████▉ | 2858/5772 [1:09:51<4:52:54, 6.03s/it] {'loss': 0.4758, 'learning_rate': 1.0644931061477492e-05, 'epoch': 0.5} + 50%|████▉ | 2858/5772 [1:09:53<4:52:53, 6.03s/it] {'loss': 0.4758, 'learning_rate': 1.0644931061477492e-05, 'epoch': 0.5} + 50%|████▉ | 2858/5772 [1:09:51<4:52:54, 6.03s/it] 50%|████▉ | 2859/5772 [1:09:57<4:57:05, 6.12s/it] 50%|████▉ | 2859/5772 [1:09:59<4:57:05, 6.12s/it] {'loss': 0.471, 'learning_rate': 1.0639330652358058e-05, 'epoch': 0.5} + 50%|████▉ | 2859/5772 [1:09:59<4:57:05, 6.12s/it] {'loss': 0.471, 'learning_rate': 1.0639330652358058e-05, 'epoch': 0.5} + 50%|████▉ | 2859/5772 [1:09:57<4:57:05, 6.12s/it] 50%|████▉ | 2860/5772 [1:10:03<4:57:45, 6.14s/it] 50%|████▉ | 2860/5772 [1:10:05<4:57:45, 6.14s/it] {'loss': 0.4702, 'learning_rate': 1.0633730041884988e-05, 'epoch': 0.5} + 50%|████▉ | 2860/5772 [1:10:05<4:57:45, 6.14s/it] {'loss': 0.4702, 'learning_rate': 1.0633730041884988e-05, 'epoch': 0.5} + 50%|████▉ | 2860/5772 [1:10:03<4:57:45, 6.14s/it] 50%|████▉ | 2861/5772 [1:10:11<4:54:25, 6.07s/it] 50%|████▉ | 2861/5772 [1:10:09<4:54:25, 6.07s/it] {'loss': 0.4673, 'learning_rate': 1.0628129231822166e-05, 'epoch': 0.5} + 50%|████▉ | 2861/5772 [1:10:11<4:54:25, 6.07s/it] {'loss': 0.4673, 'learning_rate': 1.0628129231822166e-05, 'epoch': 0.5} + 50%|████▉ | 2861/5772 [1:10:09<4:54:25, 6.07s/it] 50%|████▉ | 2862/5772 [1:10:16<5:06:32, 6.32s/it] 50%|████▉ | 2862/5772 [1:10:18<5:06:32, 6.32s/it] {'loss': 0.4733, 'learning_rate': 1.0622528223933534e-05, 'epoch': 0.5} + 50%|████▉ | 2862/5772 [1:10:18<5:06:32, 6.32s/it] {'loss': 0.4733, 'learning_rate': 1.0622528223933534e-05, 'epoch': 0.5} + 50%|████▉ | 2862/5772 [1:10:16<5:06:32, 6.32s/it] 50%|████▉ | 2863/5772 [1:10:22<5:02:51, 6.25s/it] 50%|████▉ | 2863/5772 [1:10:24<5:02:51, 6.25s/it] {'loss': 0.4704, 'learning_rate': 1.06169270199831e-05, 'epoch': 0.5} + 50%|████▉ | 2863/5772 [1:10:24<5:02:51, 6.25s/it] {'loss': 0.4704, 'learning_rate': 1.06169270199831e-05, 'epoch': 0.5} + 50%|████▉ | 2863/5772 [1:10:22<5:02:51, 6.25s/it] 50%|████▉ | 2864/5772 [1:10:28<5:00:02, 6.19s/it] 50%|████▉ | 2864/5772 [1:10:30<5:00:02, 6.19s/it] {'loss': 0.4627, 'learning_rate': 1.061132562173493e-05, 'epoch': 0.5} + 50%|████▉ | 2864/5772 [1:10:30<5:00:02, 6.19s/it] {'loss': 0.4627, 'learning_rate': 1.061132562173493e-05, 'epoch': 0.5} + 50%|████▉ | 2864/5772 [1:10:28<5:00:02, 6.19s/it] 50%|████▉ | 2865/5772 [1:10:35<5:01:58, 6.23s/it] 50%|████▉ | 2865/5772 [1:10:37<5:01:58, 6.23s/it] {'loss': 0.4642, 'learning_rate': 1.0605724030953155e-05, 'epoch': 0.5} + 50%|████▉ | 2865/5772 [1:10:37<5:01:58, 6.23s/it] {'loss': 0.4642, 'learning_rate': 1.0605724030953155e-05, 'epoch': 0.5} + 50%|████▉ | 2865/5772 [1:10:35<5:01:58, 6.23s/it] 50%|████▉ | 2866/5772 [1:10:43<5:04:45, 6.29s/it] 50%|████▉ | 2866/5772 [1:10:41<5:04:46, 6.29s/it] {'loss': 0.4826, 'learning_rate': 1.0600122249401965e-05, 'epoch': 0.5} + 50%|████▉ | 2866/5772 [1:10:43<5:04:45, 6.29s/it] {'loss': 0.4826, 'learning_rate': 1.0600122249401965e-05, 'epoch': 0.5} + 50%|████▉ | 2866/5772 [1:10:41<5:04:46, 6.29s/it] 50%|████▉ | 2867/5772 [1:10:47<4:58:54, 6.17s/it] 50%|████▉ | 2867/5772 [1:10:49<4:58:55, 6.17s/it] {'loss': 0.4649, 'learning_rate': 1.059452027884561e-05, 'epoch': 0.5} + 50%|████▉ | 2867/5772 [1:10:49<4:58:55, 6.17s/it] {'loss': 0.4649, 'learning_rate': 1.059452027884561e-05, 'epoch': 0.5} + 50%|████▉ | 2867/5772 [1:10:47<4:58:54, 6.17s/it] 50%|████▉ | 2868/5772 [1:10:55<4:55:09, 6.10s/it] 50%|████▉ | 2868/5772 [1:10:53<4:55:09, 6.10s/it] {'loss': 0.473, 'learning_rate': 1.0588918121048396e-05, 'epoch': 0.5} + 50%|████▉ | 2868/5772 [1:10:55<4:55:09, 6.10s/it] {'loss': 0.473, 'learning_rate': 1.0588918121048396e-05, 'epoch': 0.5} + 50%|████▉ | 2868/5772 [1:10:53<4:55:09, 6.10s/it] 50%|████▉ | 2869/5772 [1:10:59<4:58:14, 6.16s/it] 50%|████▉ | 2869/5772 [1:11:01<4:58:14, 6.16s/it] {'loss': 0.4647, 'learning_rate': 1.0583315777774697e-05, 'epoch': 0.5} + 50%|████▉ | 2869/5772 [1:11:01<4:58:14, 6.16s/it] {'loss': 0.4647, 'learning_rate': 1.0583315777774697e-05, 'epoch': 0.5} + 50%|████▉ | 2869/5772 [1:10:59<4:58:14, 6.16s/it] 50%|████▉ | 2870/5772 [1:11:05<4:51:25, 6.03s/it] 50%|████▉ | 2870/5772 [1:11:07<4:51:25, 6.03s/it] {'loss': 0.4806, 'learning_rate': 1.0577713250788935e-05, 'epoch': 0.5} + 50%|████▉ | 2870/5772 [1:11:07<4:51:25, 6.03s/it] {'loss': 0.4806, 'learning_rate': 1.0577713250788935e-05, 'epoch': 0.5} + 50%|████▉ | 2870/5772 [1:11:05<4:51:25, 6.03s/it] 50%|████▉ | 2871/5772 [1:11:13<4:51:41, 6.03s/it] 50%|████▉ | 2871/5772 [1:11:11<4:51:41, 6.03s/it] {'loss': 0.4572, 'learning_rate': 1.0572110541855596e-05, 'epoch': 0.5} + 50%|████▉ | 2871/5772 [1:11:13<4:51:41, 6.03s/it] {'loss': 0.4572, 'learning_rate': 1.0572110541855596e-05, 'epoch': 0.5} + 50%|████▉ | 2871/5772 [1:11:11<4:51:41, 6.03s/it] 50%|████▉ | 2872/5772 [1:11:17<4:56:19, 6.13s/it] 50%|████▉ | 2872/5772 [1:11:19<4:56:19, 6.13s/it] {'loss': 0.4676, 'learning_rate': 1.0566507652739224e-05, 'epoch': 0.5} + 50%|████▉ | 2872/5772 [1:11:19<4:56:19, 6.13s/it] {'loss': 0.4676, 'learning_rate': 1.0566507652739224e-05, 'epoch': 0.5} + 50%|████▉ | 2872/5772 [1:11:17<4:56:19, 6.13s/it] 50%|████▉ | 2873/5772 [1:11:23<4:55:50, 6.12s/it] 50%|████▉ | 2873/5772 [1:11:25<4:55:50, 6.12s/it] {'loss': 0.4635, 'learning_rate': 1.056090458520442e-05, 'epoch': 0.5} + 50%|████▉ | 2873/5772 [1:11:25<4:55:50, 6.12s/it] {'loss': 0.4635, 'learning_rate': 1.056090458520442e-05, 'epoch': 0.5} + 50%|████▉ | 2873/5772 [1:11:23<4:55:50, 6.12s/it] 50%|████▉ | 2874/5772 [1:11:30<4:58:56, 6.19s/it] 50%|████▉ | 2874/5772 [1:11:32<4:58:56, 6.19s/it] {'loss': 0.4789, 'learning_rate': 1.0555301341015832e-05, 'epoch': 0.5} + 50%|████▉ | 2874/5772 [1:11:32<4:58:56, 6.19s/it] {'loss': 0.4789, 'learning_rate': 1.0555301341015832e-05, 'epoch': 0.5} + 50%|████▉ | 2874/5772 [1:11:30<4:58:56, 6.19s/it] 50%|████▉ | 2875/5772 [1:11:38<4:56:58, 6.15s/it] 50%|████▉ | 2875/5772 [1:11:36<4:56:58, 6.15s/it] {'loss': 0.4621, 'learning_rate': 1.0549697921938172e-05, 'epoch': 0.5} + 50%|████▉ | 2875/5772 [1:11:38<4:56:58, 6.15s/it] {'loss': 0.4621, 'learning_rate': 1.0549697921938172e-05, 'epoch': 0.5} + 50%|████▉ | 2875/5772 [1:11:36<4:56:58, 6.15s/it] 50%|████▉ | 2876/5772 [1:11:42<4:55:55, 6.13s/it] 50%|████▉ | 2876/5772 [1:11:44<4:55:55, 6.13s/it] {'loss': 0.4799, 'learning_rate': 1.0544094329736213e-05, 'epoch': 0.5} + 50%|████▉ | 2876/5772 [1:11:44<4:55:55, 6.13s/it] {'loss': 0.4799, 'learning_rate': 1.0544094329736213e-05, 'epoch': 0.5} + 50%|████▉ | 2876/5772 [1:11:42<4:55:55, 6.13s/it] 50%|████▉ | 2877/5772 [1:11:49<5:02:38, 6.27s/it] 50%|████▉ | 2877/5772 [1:11:51<5:02:39, 6.27s/it] {'loss': 0.4554, 'learning_rate': 1.0538490566174766e-05, 'epoch': 0.5} + 50%|████▉ | 2877/5772 [1:11:51<5:02:39, 6.27s/it] {'loss': 0.4554, 'learning_rate': 1.0538490566174766e-05, 'epoch': 0.5} + 50%|████▉ | 2877/5772 [1:11:49<5:02:38, 6.27s/it] 50%|████▉ | 2878/5772 [1:11:56<4:56:41, 6.15s/it] 50%|████▉ | 2878/5772 [1:11:54<4:56:41, 6.15s/it] {'loss': 0.478, 'learning_rate': 1.0532886633018711e-05, 'epoch': 0.5} + 50%|████▉ | 2878/5772 [1:11:56<4:56:41, 6.15s/it] {'loss': 0.478, 'learning_rate': 1.0532886633018711e-05, 'epoch': 0.5} + 50%|████▉ | 2878/5772 [1:11:54<4:56:41, 6.15s/it] 50%|████▉ | 2879/5772 [1:12:03<4:57:35, 6.17s/it] 50%|████▉ | 2879/5772 [1:12:01<4:57:36, 6.17s/it] {'loss': 0.4675, 'learning_rate': 1.052728253203297e-05, 'epoch': 0.5} + 50%|████▉ | 2879/5772 [1:12:03<4:57:35, 6.17s/it] {'loss': 0.4675, 'learning_rate': 1.052728253203297e-05, 'epoch': 0.5} + 50%|████▉ | 2879/5772 [1:12:01<4:57:36, 6.17s/it] 50%|████▉ | 2880/5772 [1:12:07<5:01:40, 6.26s/it] 50%|████▉ | 2880/5772 [1:12:09<5:01:40, 6.26s/it] {'loss': 0.4751, 'learning_rate': 1.0521678264982534e-05, 'epoch': 0.5} + 50%|████▉ | 2880/5772 [1:12:09<5:01:40, 6.26s/it] {'loss': 0.4751, 'learning_rate': 1.0521678264982534e-05, 'epoch': 0.5} + 50%|████▉ | 2880/5772 [1:12:07<5:01:40, 6.26s/it] 50%|████▉ | 2881/5772 [1:12:14<5:05:57, 6.35s/it] 50%|████▉ | 2881/5772 [1:12:16<5:05:58, 6.35s/it] {'loss': 0.4759, 'learning_rate': 1.0516073833632424e-05, 'epoch': 0.5} + 50%|████▉ | 2881/5772 [1:12:16<5:05:58, 6.35s/it] {'loss': 0.4759, 'learning_rate': 1.0516073833632424e-05, 'epoch': 0.5} + 50%|████▉ | 2881/5772 [1:12:14<5:05:57, 6.35s/it] 50%|████▉ | 2882/5772 [1:12:20<5:08:46, 6.41s/it] 50%|████▉ | 2882/5772 [1:12:22<5:08:46, 6.41s/it] {'loss': 0.4729, 'learning_rate': 1.0510469239747731e-05, 'epoch': 0.5} + 50%|████▉ | 2882/5772 [1:12:22<5:08:46, 6.41s/it] {'loss': 0.4729, 'learning_rate': 1.0510469239747731e-05, 'epoch': 0.5} + 50%|████▉ | 2882/5772 [1:12:20<5:08:46, 6.41s/it] 50%|████▉ | 2883/5772 [1:12:26<5:05:50, 6.35s/it] 50%|████▉ | 2883/5772 [1:12:28<5:05:50, 6.35s/it] {'loss': 0.4586, 'learning_rate': 1.0504864485093588e-05, 'epoch': 0.5} + 50%|████▉ | 2883/5772 [1:12:28<5:05:50, 6.35s/it] {'loss': 0.4586, 'learning_rate': 1.0504864485093588e-05, 'epoch': 0.5} + 50%|████▉ | 2883/5772 [1:12:26<5:05:50, 6.35s/it] 50%|████▉ | 2884/5772 [1:12:35<5:03:41, 6.31s/it] 50%|████▉ | 2884/5772 [1:12:33<5:03:42, 6.31s/it] {'loss': 0.4834, 'learning_rate': 1.0499259571435185e-05, 'epoch': 0.5} + 50%|████▉ | 2884/5772 [1:12:35<5:03:41, 6.31s/it] {'loss': 0.4834, 'learning_rate': 1.0499259571435185e-05, 'epoch': 0.5} + 50%|████▉ | 2884/5772 [1:12:33<5:03:42, 6.31s/it] 50%|████▉ | 2885/5772 [1:12:41<5:00:27, 6.24s/it] 50%|████▉ | 2885/5772 [1:12:39<5:00:27, 6.24s/it] {'loss': 0.4687, 'learning_rate': 1.0493654500537756e-05, 'epoch': 0.5} + 50%|████▉ | 2885/5772 [1:12:41<5:00:27, 6.24s/it] {'loss': 0.4687, 'learning_rate': 1.0493654500537756e-05, 'epoch': 0.5} + 50%|████▉ | 2885/5772 [1:12:39<5:00:27, 6.24s/it] 50%|█████ | 2886/5772 [1:12:47<5:01:21, 6.27s/it] 50%|█████ | 2886/5772 [1:12:45<5:01:21, 6.27s/it] {'loss': 0.458, 'learning_rate': 1.0488049274166583e-05, 'epoch': 0.5} + 50%|█████ | 2886/5772 [1:12:47<5:01:21, 6.27s/it] {'loss': 0.458, 'learning_rate': 1.0488049274166583e-05, 'epoch': 0.5} + 50%|█████ | 2886/5772 [1:12:45<5:01:21, 6.27s/it] 50%|█████ | 2887/5772 [1:12:51<5:00:57, 6.26s/it] 50%|█████ | 2887/5772 [1:12:53<5:00:57, 6.26s/it] {'loss': 0.4751, 'learning_rate': 1.0482443894087007e-05, 'epoch': 0.5} + 50%|█████ | 2887/5772 [1:12:53<5:00:57, 6.26s/it] {'loss': 0.4751, 'learning_rate': 1.0482443894087007e-05, 'epoch': 0.5} + 50%|█████ | 2887/5772 [1:12:51<5:00:57, 6.26s/it] 50%|█████ | 2888/5772 [1:12:57<4:53:07, 6.10s/it] 50%|█████ | 2888/5772 [1:12:59<4:53:07, 6.10s/it] {'loss': 0.4803, 'learning_rate': 1.0476838362064408e-05, 'epoch': 0.5} + 50%|█████ | 2888/5772 [1:12:59<4:53:07, 6.10s/it] {'loss': 0.4803, 'learning_rate': 1.0476838362064408e-05, 'epoch': 0.5} + 50%|█████ | 2888/5772 [1:12:57<4:53:07, 6.10s/it] 50%|█████ | 2889/5772 [1:13:04<4:59:18, 6.23s/it] 50%|█████ | 2889/5772 [1:13:06<4:59:18, 6.23s/it] {'loss': 0.4636, 'learning_rate': 1.047123267986422e-05, 'epoch': 0.5} + 50%|█████ | 2889/5772 [1:13:06<4:59:18, 6.23s/it] {'loss': 0.4636, 'learning_rate': 1.047123267986422e-05, 'epoch': 0.5} + 50%|█████ | 2889/5772 [1:13:04<4:59:18, 6.23s/it] 50%|█████ | 2890/5772 [1:13:12<5:01:17, 6.27s/it] 50%|█████ | 2890/5772 [1:13:10<5:01:17, 6.27s/it] {'loss': 0.4917, 'learning_rate': 1.0465626849251919e-05, 'epoch': 0.5} + 50%|█████ | 2890/5772 [1:13:12<5:01:17, 6.27s/it] {'loss': 0.4917, 'learning_rate': 1.0465626849251919e-05, 'epoch': 0.5} + 50%|█████ | 2890/5772 [1:13:10<5:01:17, 6.27s/it] 50%|█████ | 2891/5772 [1:13:16<4:56:36, 6.18s/it] 50%|█████ | 2891/5772 [1:13:18<4:56:36, 6.18s/it] {'loss': 0.4683, 'learning_rate': 1.046002087199303e-05, 'epoch': 0.5} + 50%|█████ | 2891/5772 [1:13:18<4:56:36, 6.18s/it] {'loss': 0.4683, 'learning_rate': 1.046002087199303e-05, 'epoch': 0.5} + 50%|█████ | 2891/5772 [1:13:16<4:56:36, 6.18s/it] 50%|█████ | 2892/5772 [1:13:22<4:57:36, 6.20s/it] 50%|█████ | 2892/5772 [1:13:24<4:57:36, 6.20s/it] {'loss': 0.4804, 'learning_rate': 1.0454414749853126e-05, 'epoch': 0.5} + 50%|█████ | 2892/5772 [1:13:24<4:57:36, 6.20s/it] {'loss': 0.4804, 'learning_rate': 1.0454414749853126e-05, 'epoch': 0.5} + 50%|█████ | 2892/5772 [1:13:22<4:57:36, 6.20s/it] 50%|█████ | 2893/5772 [1:13:28<4:54:32, 6.14s/it] 50%|█████ | 2893/5772 [1:13:30<4:54:32, 6.14s/it] {'loss': 0.4634, 'learning_rate': 1.0448808484597821e-05, 'epoch': 0.5} + 50%|█████ | 2893/5772 [1:13:30<4:54:32, 6.14s/it] {'loss': 0.4634, 'learning_rate': 1.0448808484597821e-05, 'epoch': 0.5} + 50%|█████ | 2893/5772 [1:13:28<4:54:32, 6.14s/it] 50%|█████ | 2894/5772 [1:13:34<4:53:21, 6.12s/it] 50%|█████ | 2894/5772 [1:13:36<4:53:21, 6.12s/it] {'loss': 0.4697, 'learning_rate': 1.044320207799278e-05, 'epoch': 0.5} + 50%|█████ | 2894/5772 [1:13:36<4:53:21, 6.12s/it] {'loss': 0.4697, 'learning_rate': 1.044320207799278e-05, 'epoch': 0.5} + 50%|█████ | 2894/5772 [1:13:34<4:53:21, 6.12s/it] 50%|█████ | 2895/5772 [1:13:42<4:55:31, 6.16s/it] 50%|█████ | 2895/5772 [1:13:40<4:55:32, 6.16s/it] {'loss': 0.4658, 'learning_rate': 1.0437595531803713e-05, 'epoch': 0.5} + 50%|█████ | 2895/5772 [1:13:42<4:55:31, 6.16s/it] {'loss': 0.4658, 'learning_rate': 1.0437595531803713e-05, 'epoch': 0.5} + 50%|█████ | 2895/5772 [1:13:40<4:55:32, 6.16s/it] 50%|█████ | 2896/5772 [1:13:47<4:58:21, 6.22s/it] 50%|█████ | 2896/5772 [1:13:49<4:58:21, 6.22s/it] {'loss': 0.4785, 'learning_rate': 1.0431988847796361e-05, 'epoch': 0.5} + 50%|█████ | 2896/5772 [1:13:49<4:58:21, 6.22s/it] {'loss': 0.4785, 'learning_rate': 1.0431988847796361e-05, 'epoch': 0.5} + 50%|█████ | 2896/5772 [1:13:47<4:58:21, 6.22s/it] 50%|█████ | 2897/5772 [1:13:53<4:58:30, 6.23s/it] 50%|█████ | 2897/5772 [1:13:55<4:58:30, 6.23s/it] {'loss': 0.4607, 'learning_rate': 1.0426382027736524e-05, 'epoch': 0.5} + 50%|█████ | 2897/5772 [1:13:55<4:58:30, 6.23s/it] {'loss': 0.4607, 'learning_rate': 1.0426382027736524e-05, 'epoch': 0.5} + 50%|█████ | 2897/5772 [1:13:53<4:58:30, 6.23s/it] 50%|█████ | 2898/5772 [1:13:59<4:57:24, 6.21s/it] 50%|█████ | 2898/5772 [1:14:01<4:57:24, 6.21s/it] {'loss': 0.4539, 'learning_rate': 1.042077507339004e-05, 'epoch': 0.5} + 50%|█████ | 2898/5772 [1:14:01<4:57:24, 6.21s/it] {'loss': 0.4539, 'learning_rate': 1.042077507339004e-05, 'epoch': 0.5} + 50%|█████ | 2898/5772 [1:13:59<4:57:24, 6.21s/it] 50%|█████ | 2899/5772 [1:14:05<4:58:08, 6.23s/it] 50%|█████ | 2899/5772 [1:14:07<4:58:09, 6.23s/it] {'loss': 0.4563, 'learning_rate': 1.0415167986522785e-05, 'epoch': 0.5} + 50%|█████ | 2899/5772 [1:14:07<4:58:09, 6.23s/it] {'loss': 0.4563, 'learning_rate': 1.0415167986522785e-05, 'epoch': 0.5} + 50%|█████ | 2899/5772 [1:14:05<4:58:08, 6.23s/it]11 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +0812 AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 50%|█████ | 2900/5772 [1:14:12<4:57:12, 6.21s/it] AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 50%|█████ | 2900/5772 [1:14:14<4:57:12, 6.21s/it]1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4754, 'learning_rate': 1.040956076890068e-05, 'epoch': 0.5} + 50%|█████ | 2900/5772 [1:14:14<4:57:12, 6.21s/it] {'loss': 0.4754, 'learning_rate': 1.040956076890068e-05, 'epoch': 0.5} + 50%|█████ | 2900/5772 [1:14:12<4:57:12, 6.21s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-2900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 50%|█████ | 2901/5772 [1:14:33<8:39:20, 10.85s/it] 50%|█████ | 2901/5772 [1:14:35<8:39:21, 10.85s/it] {'loss': 0.4598, 'learning_rate': 1.0403953422289687e-05, 'epoch': 0.5} + 50%|█████ | 2901/5772 [1:14:35<8:39:21, 10.85s/it] {'loss': 0.4598, 'learning_rate': 1.0403953422289687e-05, 'epoch': 0.5} + 50%|█████ | 2901/5772 [1:14:33<8:39:20, 10.85s/it] 50%|█████ | 2902/5772 [1:14:39<7:29:38, 9.40s/it] 50%|█████ | 2902/5772 [1:14:41<7:29:38, 9.40s/it] {'loss': 0.4637, 'learning_rate': 1.0398345948455815e-05, 'epoch': 0.5} + 50%|█████ | 2902/5772 [1:14:41<7:29:38, 9.40s/it] {'loss': 0.4637, 'learning_rate': 1.0398345948455815e-05, 'epoch': 0.5} + 50%|█████ | 2902/5772 [1:14:39<7:29:38, 9.40s/it] 50%|█████ | 2903/5772 [1:14:46<6:46:34, 8.50s/it] 50%|█████ | 2903/5772 [1:14:48<6:46:34, 8.50s/it] {'loss': 0.4655, 'learning_rate': 1.0392738349165097e-05, 'epoch': 0.5} + 50%|█████ | 2903/5772 [1:14:48<6:46:34, 8.50s/it] {'loss': 0.4655, 'learning_rate': 1.0392738349165097e-05, 'epoch': 0.5} + 50%|█████ | 2903/5772 [1:14:46<6:46:34, 8.50s/it] 50%|█████ | 2904/5772 [1:14:54<6:16:34, 7.88s/it] 50%|█████ | 2904/5772 [1:14:52<6:16:34, 7.88s/it] {'loss': 0.4774, 'learning_rate': 1.038713062618362e-05, 'epoch': 0.5} + 50%|█████ | 2904/5772 [1:14:54<6:16:34, 7.88s/it] {'loss': 0.4774, 'learning_rate': 1.038713062618362e-05, 'epoch': 0.5} + 50%|█████ | 2904/5772 [1:14:52<6:16:34, 7.88s/it] 50%|█████ | 2905/5772 [1:15:00<5:52:04, 7.37s/it] 50%|█████ | 2905/5772 [1:14:58<5:52:05, 7.37s/it] {'loss': 0.4659, 'learning_rate': 1.0381522781277506e-05, 'epoch': 0.5} + 50%|█████ | 2905/5772 [1:15:00<5:52:04, 7.37s/it] {'loss': 0.4659, 'learning_rate': 1.0381522781277506e-05, 'epoch': 0.5} + 50%|█████ | 2905/5772 [1:14:58<5:52:05, 7.37s/it] 50%|█████ | 2906/5772 [1:15:06<5:34:08, 7.00s/it] 50%|█████ | 2906/5772 [1:15:04<5:34:08, 7.00s/it] {'loss': 0.4837, 'learning_rate': 1.0375914816212913e-05, 'epoch': 0.5} + 50%|█████ | 2906/5772 [1:15:06<5:34:08, 7.00s/it] {'loss': 0.4837, 'learning_rate': 1.0375914816212913e-05, 'epoch': 0.5} + 50%|█████ | 2906/5772 [1:15:04<5:34:08, 7.00s/it] 50%|█████ | 2907/5772 [1:15:11<5:22:46, 6.76s/it] 50%|█████ | 2907/5772 [1:15:13<5:22:46, 6.76s/it] {'loss': 0.4699, 'learning_rate': 1.0370306732756037e-05, 'epoch': 0.5} + 50%|█████ | 2907/5772 [1:15:13<5:22:46, 6.76s/it] {'loss': 0.4699, 'learning_rate': 1.0370306732756037e-05, 'epoch': 0.5} + 50%|█████ | 2907/5772 [1:15:11<5:22:46, 6.76s/it] 50%|█████ | 2908/5772 [1:15:19<5:16:29, 6.63s/it] 50%|█████ | 2908/5772 [1:15:17<5:16:29, 6.63s/it] {'loss': 0.4766, 'learning_rate': 1.0364698532673117e-05, 'epoch': 0.5} + 50%|█████ | 2908/5772 [1:15:19<5:16:29, 6.63s/it] {'loss': 0.4766, 'learning_rate': 1.0364698532673117e-05, 'epoch': 0.5} + 50%|█████ | 2908/5772 [1:15:17<5:16:29, 6.63s/it] 50%|█████ | 2909/5772 [1:15:25<5:11:48, 6.53s/it] 50%|█████ | 2909/5772 [1:15:23<5:11:48, 6.53s/it] {'loss': 0.4821, 'learning_rate': 1.035909021773042e-05, 'epoch': 0.5} + 50%|█████ | 2909/5772 [1:15:25<5:11:48, 6.53s/it] {'loss': 0.4821, 'learning_rate': 1.035909021773042e-05, 'epoch': 0.5} + 50%|█████ | 2909/5772 [1:15:23<5:11:48, 6.53s/it] 50%|█████ | 2910/5772 [1:15:30<5:06:40, 6.43s/it] 50%|█████ | 2910/5772 [1:15:32<5:06:40, 6.43s/it] {'loss': 0.4764, 'learning_rate': 1.0353481789694258e-05, 'epoch': 0.5} + 50%|█████ | 2910/5772 [1:15:32<5:06:40, 6.43s/it] {'loss': 0.4764, 'learning_rate': 1.0353481789694258e-05, 'epoch': 0.5} + 50%|█████ | 2910/5772 [1:15:30<5:06:40, 6.43s/it] 50%|█████ | 2911/5772 [1:15:38<5:06:29, 6.43s/it] 50%|█████ | 2911/5772 [1:15:36<5:06:29, 6.43s/it] {'loss': 0.4531, 'learning_rate': 1.0347873250330971e-05, 'epoch': 0.5} + 50%|█████ | 2911/5772 [1:15:38<5:06:29, 6.43s/it] {'loss': 0.4531, 'learning_rate': 1.0347873250330971e-05, 'epoch': 0.5} + 50%|█████ | 2911/5772 [1:15:36<5:06:29, 6.43s/it] 50%|█████ | 2912/5772 [1:15:42<5:00:51, 6.31s/it] 50%|█████ | 2912/5772 [1:15:44<5:00:51, 6.31s/it] {'loss': 0.4722, 'learning_rate': 1.0342264601406936e-05, 'epoch': 0.5} + 50%|█████ | 2912/5772 [1:15:44<5:00:51, 6.31s/it] {'loss': 0.4722, 'learning_rate': 1.0342264601406936e-05, 'epoch': 0.5} + 50%|█████ | 2912/5772 [1:15:42<5:00:51, 6.31s/it] 50%|█████ | 2913/5772 [1:15:48<4:58:17, 6.26s/it] 50%|█████ | 2913/5772 [1:15:50<4:58:18, 6.26s/it] {'loss': 0.4608, 'learning_rate': 1.0336655844688571e-05, 'epoch': 0.5} + 50%|█████ | 2913/5772 [1:15:50<4:58:18, 6.26s/it] {'loss': 0.4608, 'learning_rate': 1.0336655844688571e-05, 'epoch': 0.5} + 50%|█████ | 2913/5772 [1:15:48<4:58:17, 6.26s/it] 50%|█████ | 2914/5772 [1:15:54<4:51:11, 6.11s/it] 50%|█████ | 2914/5772 [1:15:56<4:51:11, 6.11s/it] {'loss': 0.4781, 'learning_rate': 1.0331046981942311e-05, 'epoch': 0.5} + 50%|█████ | 2914/5772 [1:15:56<4:51:11, 6.11s/it] {'loss': 0.4781, 'learning_rate': 1.0331046981942311e-05, 'epoch': 0.5} + 50%|█████ | 2914/5772 [1:15:54<4:51:11, 6.11s/it] 51%|█████ | 2915/5772 [1:16:02<4:52:03, 6.13s/it] 51%|█████ | 2915/5772 [1:16:00<4:52:03, 6.13s/it] {'loss': 0.46, 'learning_rate': 1.0325438014934655e-05, 'epoch': 0.5} + 51%|█████ | 2915/5772 [1:16:02<4:52:03, 6.13s/it] {'loss': 0.46, 'learning_rate': 1.0325438014934655e-05, 'epoch': 0.5} + 51%|█████ | 2915/5772 [1:16:00<4:52:03, 6.13s/it] 51%|█████ | 2916/5772 [1:16:06<4:50:51, 6.11s/it] 51%|█████ | 2916/5772 [1:16:08<4:50:52, 6.11s/it] {'loss': 0.4754, 'learning_rate': 1.03198289454321e-05, 'epoch': 0.51} + 51%|█████ | 2916/5772 [1:16:08<4:50:52, 6.11s/it] {'loss': 0.4754, 'learning_rate': 1.03198289454321e-05, 'epoch': 0.51} + 51%|█████ | 2916/5772 [1:16:06<4:50:51, 6.11s/it] 51%|█████ | 2917/5772 [1:16:14<4:48:52, 6.07s/it] 51%|█████ | 2917/5772 [1:16:12<4:48:52, 6.07s/it] {'loss': 0.4663, 'learning_rate': 1.0314219775201198e-05, 'epoch': 0.51} + 51%|█████ | 2917/5772 [1:16:14<4:48:52, 6.07s/it] {'loss': 0.4663, 'learning_rate': 1.0314219775201198e-05, 'epoch': 0.51} + 51%|█████ | 2917/5772 [1:16:12<4:48:52, 6.07s/it] 51%|█████ | 2918/5772 [1:16:18<4:50:27, 6.11s/it] 51%|█████ | 2918/5772 [1:16:20<4:50:27, 6.11s/it] {'loss': 0.4763, 'learning_rate': 1.0308610506008527e-05, 'epoch': 0.51} + 51%|█████ | 2918/5772 [1:16:20<4:50:27, 6.11s/it] {'loss': 0.4763, 'learning_rate': 1.0308610506008527e-05, 'epoch': 0.51} + 51%|█████ | 2918/5772 [1:16:18<4:50:27, 6.11s/it] 51%|█████ | 2919/5772 [1:16:26<4:49:26, 6.09s/it] 51%|█████ | 2919/5772 [1:16:24<4:49:26, 6.09s/it] {'loss': 0.4618, 'learning_rate': 1.030300113962069e-05, 'epoch': 0.51} + 51%|█████ | 2919/5772 [1:16:26<4:49:26, 6.09s/it] {'loss': 0.4618, 'learning_rate': 1.030300113962069e-05, 'epoch': 0.51} + 51%|█████ | 2919/5772 [1:16:24<4:49:26, 6.09s/it] 51%|█████ | 2920/5772 [1:16:30<4:48:27, 6.07s/it] 51%|█████ | 2920/5772 [1:16:32<4:48:27, 6.07s/it] {'loss': 0.4673, 'learning_rate': 1.029739167780433e-05, 'epoch': 0.51} + 51%|█████ | 2920/5772 [1:16:32<4:48:27, 6.07s/it] {'loss': 0.4673, 'learning_rate': 1.029739167780433e-05, 'epoch': 0.51} + 51%|█████ | 2920/5772 [1:16:30<4:48:27, 6.07s/it] 51%|█████ | 2921/5772 [1:16:39<4:51:15, 6.13s/it] 51%|█████ | 2921/5772 [1:16:37<4:51:16, 6.13s/it] {'loss': 0.4567, 'learning_rate': 1.0291782122326112e-05, 'epoch': 0.51} + 51%|█████ | 2921/5772 [1:16:39<4:51:15, 6.13s/it] {'loss': 0.4567, 'learning_rate': 1.0291782122326112e-05, 'epoch': 0.51} + 51%|█████ | 2921/5772 [1:16:37<4:51:16, 6.13s/it] 51%|█████ | 2922/5772 [1:16:42<4:47:04, 6.04s/it] 51%|█████ | 2922/5772 [1:16:44<4:47:04, 6.04s/it] {'loss': 0.4812, 'learning_rate': 1.0286172474952742e-05, 'epoch': 0.51} + 51%|█████ | 2922/5772 [1:16:44<4:47:04, 6.04s/it] {'loss': 0.4812, 'learning_rate': 1.0286172474952742e-05, 'epoch': 0.51} + 51%|█████ | 2922/5772 [1:16:42<4:47:04, 6.04s/it] 51%|█████ | 2923/5772 [1:16:49<4:48:40, 6.08s/it] 51%|█████ | 2923/5772 [1:16:51<4:48:41, 6.08s/it] {'loss': 0.4611, 'learning_rate': 1.0280562737450938e-05, 'epoch': 0.51} + 51%|█████ | 2923/5772 [1:16:51<4:48:41, 6.08s/it] {'loss': 0.4611, 'learning_rate': 1.0280562737450938e-05, 'epoch': 0.51} + 51%|█████ | 2923/5772 [1:16:49<4:48:40, 6.08s/it] 51%|█████ | 2924/5772 [1:16:55<4:54:19, 6.20s/it] 51%|█████ | 2924/5772 [1:16:57<4:54:18, 6.20s/it] {'loss': 0.4646, 'learning_rate': 1.0274952911587464e-05, 'epoch': 0.51} + 51%|█████ | 2924/5772 [1:16:57<4:54:18, 6.20s/it] {'loss': 0.4646, 'learning_rate': 1.0274952911587464e-05, 'epoch': 0.51} + 51%|█████ | 2924/5772 [1:16:55<4:54:19, 6.20s/it] 51%|█████ | 2925/5772 [1:17:01<4:49:04, 6.09s/it] 51%|█████ | 2925/5772 [1:17:03<4:49:04, 6.09s/it] {'loss': 0.4584, 'learning_rate': 1.0269342999129097e-05, 'epoch': 0.51} + 51%|█████ | 2925/5772 [1:17:03<4:49:04, 6.09s/it] {'loss': 0.4584, 'learning_rate': 1.0269342999129097e-05, 'epoch': 0.51} + 51%|█████ | 2925/5772 [1:17:01<4:49:04, 6.09s/it] 51%|█████ | 2926/5772 [1:17:07<4:54:37, 6.21s/it] 51%|█████ | 2926/5772 [1:17:09<4:54:37, 6.21s/it] {'loss': 0.4704, 'learning_rate': 1.026373300184265e-05, 'epoch': 0.51} + 51%|█████ | 2926/5772 [1:17:09<4:54:37, 6.21s/it] {'loss': 0.4704, 'learning_rate': 1.026373300184265e-05, 'epoch': 0.51} + 51%|█████ | 2926/5772 [1:17:07<4:54:37, 6.21s/it] 51%|█████ | 2927/5772 [1:17:16<4:57:47, 6.28s/it] 51%|█████ | 2927/5772 [1:17:14<4:57:48, 6.28s/it] {'loss': 0.4644, 'learning_rate': 1.025812292149496e-05, 'epoch': 0.51} + 51%|█████ | 2927/5772 [1:17:16<4:57:47, 6.28s/it] {'loss': 0.4644, 'learning_rate': 1.025812292149496e-05, 'epoch': 0.51} + 51%|█████ | 2927/5772 [1:17:14<4:57:48, 6.28s/it] 51%|█████ | 2928/5772 [1:17:22<4:55:09, 6.23s/it] 51%|█████ | 2928/5772 [1:17:20<4:55:09, 6.23s/it] {'loss': 0.4719, 'learning_rate': 1.0252512759852891e-05, 'epoch': 0.51} + 51%|█████ | 2928/5772 [1:17:22<4:55:09, 6.23s/it] {'loss': 0.4719, 'learning_rate': 1.0252512759852891e-05, 'epoch': 0.51} + 51%|█████ | 2928/5772 [1:17:20<4:55:09, 6.23s/it] 51%|█████ | 2929/5772 [1:17:26<4:55:46, 6.24s/it] 51%|█████ | 2929/5772 [1:17:28<4:55:46, 6.24s/it] {'loss': 0.485, 'learning_rate': 1.0246902518683331e-05, 'epoch': 0.51} + 51%|█████ | 2929/5772 [1:17:28<4:55:46, 6.24s/it] {'loss': 0.485, 'learning_rate': 1.0246902518683331e-05, 'epoch': 0.51} + 51%|█████ | 2929/5772 [1:17:26<4:55:46, 6.24s/it] 51%|█████ | 2930/5772 [1:17:34<4:53:36, 6.20s/it] 51%|█████ | 2930/5772 [1:17:32<4:53:36, 6.20s/it] {'loss': 0.461, 'learning_rate': 1.0241292199753196e-05, 'epoch': 0.51} + 51%|█████ | 2930/5772 [1:17:34<4:53:36, 6.20s/it] {'loss': 0.461, 'learning_rate': 1.0241292199753196e-05, 'epoch': 0.51} + 51%|█████ | 2930/5772 [1:17:32<4:53:36, 6.20s/it] 51%|█████ | 2931/5772 [1:17:40<4:49:06, 6.11s/it] 51%|█████ | 2931/5772 [1:17:38<4:49:06, 6.11s/it] {'loss': 0.4711, 'learning_rate': 1.0235681804829426e-05, 'epoch': 0.51} + 51%|█████ | 2931/5772 [1:17:40<4:49:06, 6.11s/it] {'loss': 0.4711, 'learning_rate': 1.0235681804829426e-05, 'epoch': 0.51} + 51%|█████ | 2931/5772 [1:17:38<4:49:06, 6.11s/it] 51%|█████ | 2932/5772 [1:17:46<4:47:31, 6.07s/it] 51%|█████ | 2932/5772 [1:17:44<4:47:31, 6.07s/it] {'loss': 0.4688, 'learning_rate': 1.0230071335678982e-05, 'epoch': 0.51} + 51%|█████ | 2932/5772 [1:17:46<4:47:31, 6.07s/it] {'loss': 0.4688, 'learning_rate': 1.0230071335678982e-05, 'epoch': 0.51} + 51%|█████ | 2932/5772 [1:17:44<4:47:31, 6.07s/it] 51%|█████ | 2933/5772 [1:17:50<4:46:28, 6.05s/it] 51%|█████ | 2933/5772 [1:17:52<4:46:28, 6.05s/it] {'loss': 0.4713, 'learning_rate': 1.0224460794068849e-05, 'epoch': 0.51} + 51%|█████ | 2933/5772 [1:17:52<4:46:28, 6.05s/it] {'loss': 0.4713, 'learning_rate': 1.0224460794068849e-05, 'epoch': 0.51} + 51%|█████ | 2933/5772 [1:17:50<4:46:28, 6.05s/it] 51%|█████ | 2934/5772 [1:17:57<4:50:04, 6.13s/it] 51%|█████ | 2934/5772 [1:17:59<4:50:04, 6.13s/it] {'loss': 0.458, 'learning_rate': 1.0218850181766038e-05, 'epoch': 0.51} + 51%|█████ | 2934/5772 [1:17:59<4:50:04, 6.13s/it] {'loss': 0.458, 'learning_rate': 1.0218850181766038e-05, 'epoch': 0.51} + 51%|█████ | 2934/5772 [1:17:57<4:50:04, 6.13s/it] 51%|█████ | 2935/5772 [1:18:05<4:47:25, 6.08s/it] 51%|█████ | 2935/5772 [1:18:03<4:47:25, 6.08s/it] {'loss': 0.4666, 'learning_rate': 1.0213239500537577e-05, 'epoch': 0.51} + 51%|█████ | 2935/5772 [1:18:05<4:47:25, 6.08s/it] {'loss': 0.4666, 'learning_rate': 1.0213239500537577e-05, 'epoch': 0.51} + 51%|█████ | 2935/5772 [1:18:03<4:47:25, 6.08s/it] 51%|█████ | 2936/5772 [1:18:11<4:53:23, 6.21s/it] 51%|█████ | 2936/5772 [1:18:09<4:53:23, 6.21s/it] {'loss': 0.4552, 'learning_rate': 1.020762875215052e-05, 'epoch': 0.51} + 51%|█████ | 2936/5772 [1:18:11<4:53:23, 6.21s/it] {'loss': 0.4552, 'learning_rate': 1.020762875215052e-05, 'epoch': 0.51} + 51%|█████ | 2936/5772 [1:18:09<4:53:23, 6.21s/it] 51%|█████ | 2937/5772 [1:18:15<4:52:57, 6.20s/it] 51%|█████ | 2937/5772 [1:18:17<4:52:57, 6.20s/it] {'loss': 0.4701, 'learning_rate': 1.0202017938371947e-05, 'epoch': 0.51} + 51%|█████ | 2937/5772 [1:18:17<4:52:57, 6.20s/it] {'loss': 0.4701, 'learning_rate': 1.0202017938371947e-05, 'epoch': 0.51} + 51%|█████ | 2937/5772 [1:18:15<4:52:57, 6.20s/it] 51%|█████ | 2938/5772 [1:18:21<4:50:58, 6.16s/it] 51%|█████ | 2938/5772 [1:18:23<4:50:58, 6.16s/it] {'loss': 0.4638, 'learning_rate': 1.0196407060968942e-05, 'epoch': 0.51} + 51%|█████ | 2938/5772 [1:18:23<4:50:58, 6.16s/it] {'loss': 0.4638, 'learning_rate': 1.0196407060968942e-05, 'epoch': 0.51} + 51%|█████ | 2938/5772 [1:18:21<4:50:58, 6.16s/it] 51%|█████ | 2939/5772 [1:18:29<4:48:56, 6.12s/it] 51%|█████ | 2939/5772 [1:18:27<4:48:56, 6.12s/it] {'loss': 0.4686, 'learning_rate': 1.0190796121708627e-05, 'epoch': 0.51} + 51%|█████ | 2939/5772 [1:18:29<4:48:56, 6.12s/it] {'loss': 0.4686, 'learning_rate': 1.0190796121708627e-05, 'epoch': 0.51} + 51%|█████ | 2939/5772 [1:18:27<4:48:56, 6.12s/it] 51%|█████ | 2940/5772 [1:18:33<4:49:17, 6.13s/it] 51%|█████ | 2940/5772 [1:18:35<4:49:17, 6.13s/it] {'loss': 0.4612, 'learning_rate': 1.0185185122358139e-05, 'epoch': 0.51} + 51%|█████ | 2940/5772 [1:18:35<4:49:17, 6.13s/it] {'loss': 0.4612, 'learning_rate': 1.0185185122358139e-05, 'epoch': 0.51} + 51%|█████ | 2940/5772 [1:18:33<4:49:17, 6.13s/it] 51%|█████ | 2941/5772 [1:18:40<4:53:05, 6.21s/it] 51%|█████ | 2941/5772 [1:18:42<4:53:06, 6.21s/it] {'loss': 0.4647, 'learning_rate': 1.017957406468462e-05, 'epoch': 0.51} + 51%|█████ | 2941/5772 [1:18:42<4:53:06, 6.21s/it] {'loss': 0.4647, 'learning_rate': 1.017957406468462e-05, 'epoch': 0.51} + 51%|█████ | 2941/5772 [1:18:40<4:53:05, 6.21s/it] 51%|█████ | 2942/5772 [1:18:46<4:54:05, 6.23s/it] 51%|█████ | 2942/5772 [1:18:48<4:54:05, 6.24s/it] {'loss': 0.4582, 'learning_rate': 1.0173962950455249e-05, 'epoch': 0.51} + 51%|█████ | 2942/5772 [1:18:48<4:54:05, 6.24s/it] {'loss': 0.4582, 'learning_rate': 1.0173962950455249e-05, 'epoch': 0.51} + 51%|█████ | 2942/5772 [1:18:46<4:54:05, 6.23s/it] 51%|█████ | 2943/5772 [1:18:52<4:53:42, 6.23s/it] 51%|█████ | 2943/5772 [1:18:54<4:53:42, 6.23s/it] {'loss': 0.4753, 'learning_rate': 1.0168351781437215e-05, 'epoch': 0.51} + 51%|█████ | 2943/5772 [1:18:54<4:53:42, 6.23s/it] {'loss': 0.4753, 'learning_rate': 1.0168351781437215e-05, 'epoch': 0.51} + 51%|█████ | 2943/5772 [1:18:52<4:53:42, 6.23s/it] 51%|█████ | 2944/5772 [1:18:59<4:57:07, 6.30s/it] 51%|█████ | 2944/5772 [1:19:01<4:57:07, 6.30s/it] {'loss': 0.465, 'learning_rate': 1.0162740559397726e-05, 'epoch': 0.51} + 51%|█████ | 2944/5772 [1:19:01<4:57:07, 6.30s/it] {'loss': 0.465, 'learning_rate': 1.0162740559397726e-05, 'epoch': 0.51} + 51%|█████ | 2944/5772 [1:18:59<4:57:07, 6.30s/it] 51%|█████ | 2945/5772 [1:19:05<4:51:22, 6.18s/it] 51%|█████ | 2945/5772 [1:19:07<4:51:22, 6.18s/it] {'loss': 0.461, 'learning_rate': 1.0157129286104e-05, 'epoch': 0.51} + 51%|█████ | 2945/5772 [1:19:07<4:51:22, 6.18s/it] {'loss': 0.461, 'learning_rate': 1.0157129286104e-05, 'epoch': 0.51} + 51%|█████ | 2945/5772 [1:19:05<4:51:22, 6.18s/it] 51%|█████ | 2946/5772 [1:19:11<4:48:00, 6.11s/it] 51%|█████ | 2946/5772 [1:19:13<4:48:00, 6.11s/it] {'loss': 0.4559, 'learning_rate': 1.015151796332328e-05, 'epoch': 0.51} + 51%|█████ | 2946/5772 [1:19:13<4:48:00, 6.11s/it] {'loss': 0.4559, 'learning_rate': 1.015151796332328e-05, 'epoch': 0.51} + 51%|█████ | 2946/5772 [1:19:11<4:48:00, 6.11s/it] 51%|█████ | 2947/5772 [1:19:17<4:47:31, 6.11s/it] 51%|█████ | 2947/5772 [1:19:19<4:47:31, 6.11s/it] {'loss': 0.4736, 'learning_rate': 1.0145906592822819e-05, 'epoch': 0.51} + 51%|█████ | 2947/5772 [1:19:19<4:47:31, 6.11s/it] {'loss': 0.4736, 'learning_rate': 1.0145906592822819e-05, 'epoch': 0.51} + 51%|█████ | 2947/5772 [1:19:17<4:47:31, 6.11s/it] 51%|█████ | 2948/5772 [1:19:23<4:48:37, 6.13s/it] 51%|█████ | 2948/5772 [1:19:25<4:48:37, 6.13s/it] {'loss': 0.4715, 'learning_rate': 1.014029517636989e-05, 'epoch': 0.51} + 51%|█████ | 2948/5772 [1:19:23<4:48:37, 6.13s/it] {'loss': 0.4715, 'learning_rate': 1.014029517636989e-05, 'epoch': 0.51} + 51%|█████ | 2948/5772 [1:19:25<4:48:37, 6.13s/it] 51%|█████ | 2949/5772 [1:19:29<4:52:00, 6.21s/it] 51%|█████ | 2949/5772 [1:19:31<4:52:00, 6.21s/it] {'loss': 0.4753, 'learning_rate': 1.013468371573177e-05, 'epoch': 0.51} + 51%|█████ | 2949/5772 [1:19:31<4:52:00, 6.21s/it] {'loss': 0.4753, 'learning_rate': 1.013468371573177e-05, 'epoch': 0.51} + 51%|█████ | 2949/5772 [1:19:29<4:52:00, 6.21s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +013 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 51%|█████ | 2950/5772 [1:19:36<4:50:40, 6.18s/it]11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 51%|█████ | 2950/5772 [1:19:37<4:50:40, 6.18s/it]1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + {'loss': 0.467, 'learning_rate': 1.0129072212675766e-05, 'epoch': 0.51} + 51%|█████ | 2950/5772 [1:19:37<4:50:40, 6.18s/it] {'loss': 0.467, 'learning_rate': 1.0129072212675766e-05, 'epoch': 0.51} + 51%|█████ | 2950/5772 [1:19:36<4:50:40, 6.18s/it] 51%|█████ | 2951/5772 [1:19:42<4:52:09, 6.21s/it] 51%|█████ | 2951/5772 [1:19:44<4:52:09, 6.21s/it] {'loss': 0.4833, 'learning_rate': 1.0123460668969184e-05, 'epoch': 0.51} + 51%|█████ | 2951/5772 [1:19:44<4:52:09, 6.21s/it] {'loss': 0.4833, 'learning_rate': 1.0123460668969184e-05, 'epoch': 0.51} + 51%|█████ | 2951/5772 [1:19:42<4:52:09, 6.21s/it] 51%|█████ | 2952/5772 [1:19:48<4:56:45, 6.31s/it] 51%|█████ | 2952/5772 [1:19:50<4:56:45, 6.31s/it] {'loss': 0.4622, 'learning_rate': 1.0117849086379355e-05, 'epoch': 0.51} + 51%|█████ | 2952/5772 [1:19:50<4:56:45, 6.31s/it] {'loss': 0.4622, 'learning_rate': 1.0117849086379355e-05, 'epoch': 0.51} + 51%|█████ | 2952/5772 [1:19:48<4:56:45, 6.31s/it] 51%|█████ | 2953/5772 [1:19:54<4:51:59, 6.21s/it] 51%|█████ | 2953/5772 [1:19:56<4:52:00, 6.22s/it] {'loss': 0.4684, 'learning_rate': 1.011223746667361e-05, 'epoch': 0.51} + 51%|█████ | 2953/5772 [1:19:56<4:52:00, 6.22s/it] {'loss': 0.4684, 'learning_rate': 1.011223746667361e-05, 'epoch': 0.51} + 51%|█████ | 2953/5772 [1:19:54<4:51:59, 6.21s/it] 51%|█████ | 2954/5772 [1:20:01<4:58:44, 6.36s/it] 51%|█████ | 2954/5772 [1:20:03<4:58:44, 6.36s/it] {'loss': 0.4609, 'learning_rate': 1.0106625811619297e-05, 'epoch': 0.51} + 51%|█████ | 2954/5772 [1:20:03<4:58:44, 6.36s/it] {'loss': 0.4609, 'learning_rate': 1.0106625811619297e-05, 'epoch': 0.51} + 51%|█████ | 2954/5772 [1:20:01<4:58:44, 6.36s/it] 51%|█████ | 2955/5772 [1:20:08<5:04:06, 6.48s/it] 51%|█████ | 2955/5772 [1:20:10<5:04:06, 6.48s/it] {'loss': 0.4705, 'learning_rate': 1.010101412298378e-05, 'epoch': 0.51} + 51%|█████ | 2955/5772 [1:20:10<5:04:06, 6.48s/it] {'loss': 0.4705, 'learning_rate': 1.010101412298378e-05, 'epoch': 0.51} + 51%|█████ | 2955/5772 [1:20:08<5:04:06, 6.48s/it] 51%|█████ | 2956/5772 [1:20:15<5:09:57, 6.60s/it] 51%|█████ | 2956/5772 [1:20:17<5:09:57, 6.60s/it] {'loss': 0.4534, 'learning_rate': 1.0095402402534423e-05, 'epoch': 0.51} + 51%|█████ | 2956/5772 [1:20:17<5:09:57, 6.60s/it] {'loss': 0.4534, 'learning_rate': 1.0095402402534423e-05, 'epoch': 0.51} + 51%|█████ | 2956/5772 [1:20:15<5:09:57, 6.60s/it] 51%|█████ | 2957/5772 [1:20:21<5:07:42, 6.56s/it] 51%|█████ | 2957/5772 [1:20:23<5:07:42, 6.56s/it] {'loss': 0.4711, 'learning_rate': 1.0089790652038613e-05, 'epoch': 0.51} + 51%|█████ | 2957/5772 [1:20:23<5:07:42, 6.56s/it] {'loss': 0.4711, 'learning_rate': 1.0089790652038613e-05, 'epoch': 0.51} + 51%|█████ | 2957/5772 [1:20:21<5:07:42, 6.56s/it] 51%|█████ | 2958/5772 [1:20:28<5:05:42, 6.52s/it] 51%|█████ | 2958/5772 [1:20:30<5:05:43, 6.52s/it] {'loss': 0.4686, 'learning_rate': 1.0084178873263735e-05, 'epoch': 0.51} + 51%|█████ | 2958/5772 [1:20:30<5:05:43, 6.52s/it] {'loss': 0.4686, 'learning_rate': 1.0084178873263735e-05, 'epoch': 0.51} + 51%|█████ | 2958/5772 [1:20:28<5:05:42, 6.52s/it] 51%|█████▏ | 2959/5772 [1:20:34<5:02:34, 6.45s/it] 51%|█████▏ | 2959/5772 [1:20:36<5:02:34, 6.45s/it] {'loss': 0.4894, 'learning_rate': 1.0078567067977193e-05, 'epoch': 0.51} + 51%|█████▏ | 2959/5772 [1:20:36<5:02:34, 6.45s/it] {'loss': 0.4894, 'learning_rate': 1.0078567067977193e-05, 'epoch': 0.51} + 51%|█████▏ | 2959/5772 [1:20:34<5:02:34, 6.45s/it] 51%|█████▏ | 2960/5772 [1:20:40<4:58:34, 6.37s/it] 51%|█████▏ | 2960/5772 [1:20:42<4:58:34, 6.37s/it] {'loss': 0.474, 'learning_rate': 1.0072955237946383e-05, 'epoch': 0.51} + 51%|█████▏ | 2960/5772 [1:20:42<4:58:34, 6.37s/it] {'loss': 0.474, 'learning_rate': 1.0072955237946383e-05, 'epoch': 0.51} + 51%|█████▏ | 2960/5772 [1:20:40<4:58:34, 6.37s/it] 51%|█████▏ | 2961/5772 [1:20:46<4:53:25, 6.26s/it] 51%|█████▏ | 2961/5772 [1:20:48<4:53:25, 6.26s/it] {'loss': 0.4594, 'learning_rate': 1.0067343384938731e-05, 'epoch': 0.51} + 51%|█████▏ | 2961/5772 [1:20:48<4:53:25, 6.26s/it] {'loss': 0.4594, 'learning_rate': 1.0067343384938731e-05, 'epoch': 0.51} + 51%|█████▏ | 2961/5772 [1:20:46<4:53:25, 6.26s/it] 51%|█████▏ | 2962/5772 [1:20:52<4:48:40, 6.16s/it] 51%|█████▏ | 2962/5772 [1:20:54<4:48:40, 6.16s/it] {'loss': 0.4644, 'learning_rate': 1.0061731510721653e-05, 'epoch': 0.51} + 51%|█████▏ | 2962/5772 [1:20:54<4:48:40, 6.16s/it] {'loss': 0.4644, 'learning_rate': 1.0061731510721653e-05, 'epoch': 0.51} + 51%|█████▏ | 2962/5772 [1:20:52<4:48:40, 6.16s/it] 51%|█████▏ | 2963/5772 [1:20:58<4:46:12, 6.11s/it] 51%|█████▏ | 2963/5772 [1:21:00<4:46:11, 6.11s/it] {'loss': 0.4748, 'learning_rate': 1.005611961706258e-05, 'epoch': 0.51} + 51%|█████▏ | 2963/5772 [1:21:00<4:46:11, 6.11s/it] {'loss': 0.4748, 'learning_rate': 1.005611961706258e-05, 'epoch': 0.51} + 51%|█████▏ | 2963/5772 [1:20:58<4:46:12, 6.11s/it] 51%|█████▏ | 2964/5772 [1:21:04<4:46:58, 6.13s/it] 51%|█████▏ | 2964/5772 [1:21:06<4:46:58, 6.13s/it] {'loss': 0.4724, 'learning_rate': 1.0050507705728943e-05, 'epoch': 0.51} + 51%|█████▏ | 2964/5772 [1:21:06<4:46:58, 6.13s/it] {'loss': 0.4724, 'learning_rate': 1.0050507705728943e-05, 'epoch': 0.51} + 51%|█████▏ | 2964/5772 [1:21:04<4:46:58, 6.13s/it] 51%|█████▏ | 2965/5772 [1:21:10<4:43:33, 6.06s/it] 51%|█████▏ | 2965/5772 [1:21:12<4:43:33, 6.06s/it] {'loss': 0.4678, 'learning_rate': 1.0044895778488184e-05, 'epoch': 0.51} + 51%|█████▏ | 2965/5772 [1:21:12<4:43:33, 6.06s/it] {'loss': 0.4678, 'learning_rate': 1.0044895778488184e-05, 'epoch': 0.51} + 51%|█████▏ | 2965/5772 [1:21:10<4:43:33, 6.06s/it] 51%|█████▏ | 2966/5772 [1:21:16<4:43:02, 6.05s/it] 51%|█████▏ | 2966/5772 [1:21:18<4:43:02, 6.05s/it] {'loss': 0.469, 'learning_rate': 1.0039283837107753e-05, 'epoch': 0.51} + 51%|█████▏ | 2966/5772 [1:21:18<4:43:02, 6.05s/it] {'loss': 0.469, 'learning_rate': 1.0039283837107753e-05, 'epoch': 0.51} + 51%|█████▏ | 2966/5772 [1:21:16<4:43:02, 6.05s/it] 51%|█████▏ | 2967/5772 [1:21:23<4:50:15, 6.21s/it] 51%|█████▏ | 2967/5772 [1:21:25<4:50:15, 6.21s/it] {'loss': 0.4649, 'learning_rate': 1.0033671883355093e-05, 'epoch': 0.51} + 51%|█████▏ | 2967/5772 [1:21:25<4:50:15, 6.21s/it] {'loss': 0.4649, 'learning_rate': 1.0033671883355093e-05, 'epoch': 0.51} + 51%|█████▏ | 2967/5772 [1:21:23<4:50:15, 6.21s/it] 51%|█████▏ | 2968/5772 [1:21:29<4:46:45, 6.14s/it] 51%|█████▏ | 2968/5772 [1:21:31<4:46:45, 6.14s/it] {'loss': 0.4727, 'learning_rate': 1.0028059918997664e-05, 'epoch': 0.51} + 51%|█████▏ | 2968/5772 [1:21:31<4:46:45, 6.14s/it] {'loss': 0.4727, 'learning_rate': 1.0028059918997664e-05, 'epoch': 0.51} + 51%|█████▏ | 2968/5772 [1:21:29<4:46:45, 6.14s/it] 51%|█████▏ | 2969/5772 [1:21:35<4:46:02, 6.12s/it] 51%|█████▏ | 2969/5772 [1:21:37<4:46:01, 6.12s/it] {'loss': 0.4778, 'learning_rate': 1.0022447945802917e-05, 'epoch': 0.51} + 51%|█████▏ | 2969/5772 [1:21:37<4:46:01, 6.12s/it] {'loss': 0.4778, 'learning_rate': 1.0022447945802917e-05, 'epoch': 0.51} + 51%|█████▏ | 2969/5772 [1:21:35<4:46:02, 6.12s/it] 51%|█████▏ | 2970/5772 [1:21:41<4:42:27, 6.05s/it] 51%|█████▏ | 2970/5772 [1:21:43<4:42:27, 6.05s/it] {'loss': 0.4505, 'learning_rate': 1.0016835965538314e-05, 'epoch': 0.51} + 51%|█████▏ | 2970/5772 [1:21:43<4:42:27, 6.05s/it] {'loss': 0.4505, 'learning_rate': 1.0016835965538314e-05, 'epoch': 0.51} + 51%|█████▏ | 2970/5772 [1:21:41<4:42:27, 6.05s/it] 51%|█████▏ | 2971/5772 [1:21:47<4:46:38, 6.14s/it] 51%|█████▏ | 2971/5772 [1:21:49<4:46:38, 6.14s/it] {'loss': 0.4661, 'learning_rate': 1.0011223979971319e-05, 'epoch': 0.51} + 51%|█████▏ | 2971/5772 [1:21:49<4:46:38, 6.14s/it] {'loss': 0.4661, 'learning_rate': 1.0011223979971319e-05, 'epoch': 0.51} + 51%|█████▏ | 2971/5772 [1:21:47<4:46:38, 6.14s/it] 51%|█████▏ | 2972/5772 [1:21:53<4:42:32, 6.05s/it] 51%|█████▏ | 2972/5772 [1:21:55<4:42:32, 6.05s/it] {'loss': 0.472, 'learning_rate': 1.0005611990869392e-05, 'epoch': 0.51} + 51%|█████▏ | 2972/5772 [1:21:55<4:42:32, 6.05s/it] {'loss': 0.472, 'learning_rate': 1.0005611990869392e-05, 'epoch': 0.51} + 51%|█████▏ | 2972/5772 [1:21:53<4:42:32, 6.05s/it] 52%|█████▏ | 2973/5772 [1:21:59<4:40:12, 6.01s/it] 52%|█████▏ | 2973/5772 [1:22:01<4:40:12, 6.01s/it] {'loss': 0.4793, 'learning_rate': 1e-05, 'epoch': 0.52} + 52%|█████▏ | 2973/5772 [1:22:01<4:40:12, 6.01s/it] {'loss': 0.4793, 'learning_rate': 1e-05, 'epoch': 0.52} + 52%|█████▏ | 2973/5772 [1:21:59<4:40:12, 6.01s/it] 52%|█████▏ | 2974/5772 [1:22:05<4:40:56, 6.02s/it] 52%|█████▏ | 2974/5772 [1:22:07<4:40:56, 6.02s/it] {'loss': 0.463, 'learning_rate': 9.99438800913061e-06, 'epoch': 0.52} + 52%|█████▏ | 2974/5772 [1:22:07<4:40:56, 6.02s/it] {'loss': 0.463, 'learning_rate': 9.99438800913061e-06, 'epoch': 0.52} + 52%|█████▏ | 2974/5772 [1:22:05<4:40:56, 6.02s/it] 52%|█████▏ | 2975/5772 [1:22:11<4:43:22, 6.08s/it] 52%|█████▏ | 2975/5772 [1:22:13<4:43:22, 6.08s/it] {'loss': 0.4748, 'learning_rate': 9.988776020028685e-06, 'epoch': 0.52} + 52%|█████▏ | 2975/5772 [1:22:13<4:43:22, 6.08s/it] {'loss': 0.4748, 'learning_rate': 9.988776020028685e-06, 'epoch': 0.52} + 52%|█████▏ | 2975/5772 [1:22:11<4:43:22, 6.08s/it] 52%|█████▏ | 2976/5772 [1:22:17<4:39:43, 6.00s/it] 52%|█████▏ | 2976/5772 [1:22:19<4:39:43, 6.00s/it] {'loss': 0.4596, 'learning_rate': 9.983164034461686e-06, 'epoch': 0.52} + 52%|█████▏ | 2976/5772 [1:22:19<4:39:43, 6.00s/it] {'loss': 0.4596, 'learning_rate': 9.983164034461686e-06, 'epoch': 0.52} + 52%|█████▏ | 2976/5772 [1:22:17<4:39:43, 6.00s/it] 52%|█████▏ | 2977/5772 [1:22:23<4:37:52, 5.97s/it] 52%|█████▏ | 2977/5772 [1:22:25<4:37:52, 5.97s/it] {'loss': 0.4713, 'learning_rate': 9.977552054197088e-06, 'epoch': 0.52} + 52%|█████▏ | 2977/5772 [1:22:25<4:37:52, 5.97s/it] {'loss': 0.4713, 'learning_rate': 9.977552054197088e-06, 'epoch': 0.52} + 52%|█████▏ | 2977/5772 [1:22:23<4:37:52, 5.97s/it] 52%|█████▏ | 2978/5772 [1:22:29<4:41:17, 6.04s/it] 52%|█████▏ | 2978/5772 [1:22:31<4:41:17, 6.04s/it] {'loss': 0.4618, 'learning_rate': 9.971940081002338e-06, 'epoch': 0.52} + 52%|█████▏ | 2978/5772 [1:22:31<4:41:17, 6.04s/it] {'loss': 0.4618, 'learning_rate': 9.971940081002338e-06, 'epoch': 0.52} + 52%|█████▏ | 2978/5772 [1:22:29<4:41:17, 6.04s/it] 52%|█████▏ | 2979/5772 [1:22:35<4:41:26, 6.05s/it] 52%|█████▏ | 2979/5772 [1:22:37<4:41:26, 6.05s/it] {'loss': 0.4759, 'learning_rate': 9.96632811664491e-06, 'epoch': 0.52} + 52%|█████▏ | 2979/5772 [1:22:37<4:41:26, 6.05s/it] {'loss': 0.4759, 'learning_rate': 9.96632811664491e-06, 'epoch': 0.52} + 52%|█████▏ | 2979/5772 [1:22:35<4:41:26, 6.05s/it] 52%|█████▏ | 2980/5772 [1:22:41<4:42:34, 6.07s/it] 52%|█████▏ | 2980/5772 [1:22:43<4:42:34, 6.07s/it] {'loss': 0.4563, 'learning_rate': 9.96071616289225e-06, 'epoch': 0.52} + 52%|█████▏ | 2980/5772 [1:22:43<4:42:34, 6.07s/it] {'loss': 0.4563, 'learning_rate': 9.96071616289225e-06, 'epoch': 0.52} + 52%|█████▏ | 2980/5772 [1:22:41<4:42:34, 6.07s/it] 52%|█████▏ | 2981/5772 [1:22:47<4:37:26, 5.96s/it] 52%|█████▏ | 2981/5772 [1:22:49<4:37:26, 5.96s/it] {'loss': 0.4776, 'learning_rate': 9.955104221511816e-06, 'epoch': 0.52} + 52%|█████▏ | 2981/5772 [1:22:49<4:37:26, 5.96s/it] {'loss': 0.4776, 'learning_rate': 9.955104221511816e-06, 'epoch': 0.52} + 52%|█████▏ | 2981/5772 [1:22:47<4:37:26, 5.96s/it] 52%|█████▏ | 2982/5772 [1:22:53<4:43:59, 6.11s/it] 52%|█████▏ | 2982/5772 [1:22:55<4:43:59, 6.11s/it] {'loss': 0.4688, 'learning_rate': 9.949492294271062e-06, 'epoch': 0.52} + 52%|█████▏ | 2982/5772 [1:22:55<4:43:59, 6.11s/it] {'loss': 0.4688, 'learning_rate': 9.949492294271062e-06, 'epoch': 0.52} + 52%|█████▏ | 2982/5772 [1:22:53<4:43:59, 6.11s/it] 52%|█████▏ | 2983/5772 [1:22:59<4:41:58, 6.07s/it] 52%|█████▏ | 2983/5772 [1:23:01<4:41:58, 6.07s/it] {'loss': 0.481, 'learning_rate': 9.943880382937426e-06, 'epoch': 0.52} + 52%|█████▏ | 2983/5772 [1:23:01<4:41:58, 6.07s/it] {'loss': 0.481, 'learning_rate': 9.943880382937426e-06, 'epoch': 0.52} + 52%|█████▏ | 2983/5772 [1:22:59<4:41:58, 6.07s/it] 52%|█████▏ | 2984/5772 [1:23:06<4:46:38, 6.17s/it] 52%|█████▏ | 2984/5772 [1:23:08<4:46:38, 6.17s/it] {'loss': 0.4643, 'learning_rate': 9.938268489278352e-06, 'epoch': 0.52} + 52%|█████▏ | 2984/5772 [1:23:08<4:46:38, 6.17s/it] {'loss': 0.4643, 'learning_rate': 9.938268489278352e-06, 'epoch': 0.52} + 52%|█████▏ | 2984/5772 [1:23:06<4:46:38, 6.17s/it] 52%|█████▏ | 2985/5772 [1:23:12<4:47:37, 6.19s/it] 52%|█████▏ | 2985/5772 [1:23:14<4:47:37, 6.19s/it] {'loss': 0.4796, 'learning_rate': 9.932656615061274e-06, 'epoch': 0.52} + 52%|█████▏ | 2985/5772 [1:23:14<4:47:37, 6.19s/it] {'loss': 0.4796, 'learning_rate': 9.932656615061274e-06, 'epoch': 0.52} + 52%|█████▏ | 2985/5772 [1:23:12<4:47:37, 6.19s/it] 52%|█████▏ | 2986/5772 [1:23:18<4:43:01, 6.10s/it] 52%|█████▏ | 2986/5772 [1:23:20<4:43:01, 6.10s/it] {'loss': 0.4643, 'learning_rate': 9.927044762053622e-06, 'epoch': 0.52} + 52%|█████▏ | 2986/5772 [1:23:20<4:43:01, 6.10s/it] {'loss': 0.4643, 'learning_rate': 9.927044762053622e-06, 'epoch': 0.52} + 52%|█████▏ | 2986/5772 [1:23:18<4:43:01, 6.10s/it] 52%|█████▏ | 2987/5772 [1:23:24<4:48:07, 6.21s/it] 52%|█████▏ | 2987/5772 [1:23:26<4:48:07, 6.21s/it] {'loss': 0.4787, 'learning_rate': 9.921432932022812e-06, 'epoch': 0.52} + 52%|█████▏ | 2987/5772 [1:23:26<4:48:07, 6.21s/it] {'loss': 0.4787, 'learning_rate': 9.921432932022812e-06, 'epoch': 0.52} + 52%|█████▏ | 2987/5772 [1:23:24<4:48:07, 6.21s/it] 52%|█████▏ | 2988/5772 [1:23:30<4:44:56, 6.14s/it] 52%|█████▏ | 2988/5772 [1:23:32<4:44:56, 6.14s/it] {'loss': 0.4675, 'learning_rate': 9.915821126736266e-06, 'epoch': 0.52} + 52%|█████▏ | 2988/5772 [1:23:32<4:44:56, 6.14s/it] {'loss': 0.4675, 'learning_rate': 9.915821126736266e-06, 'epoch': 0.52} + 52%|█████▏ | 2988/5772 [1:23:30<4:44:56, 6.14s/it] 52%|█████▏ | 2989/5772 [1:23:36<4:40:32, 6.05s/it] 52%|█████▏ | 2989/5772 [1:23:38<4:40:32, 6.05s/it] {'loss': 0.4763, 'learning_rate': 9.910209347961389e-06, 'epoch': 0.52} + 52%|█████▏ | 2989/5772 [1:23:38<4:40:32, 6.05s/it] {'loss': 0.4763, 'learning_rate': 9.910209347961389e-06, 'epoch': 0.52} + 52%|█████▏ | 2989/5772 [1:23:36<4:40:32, 6.05s/it] 52%|█████▏ | 2990/5772 [1:23:42<4:41:48, 6.08s/it] 52%|█████▏ | 2990/5772 [1:23:44<4:41:48, 6.08s/it] {'loss': 0.4723, 'learning_rate': 9.904597597465577e-06, 'epoch': 0.52} + 52%|█████▏ | 2990/5772 [1:23:44<4:41:48, 6.08s/it] {'loss': 0.4723, 'learning_rate': 9.904597597465577e-06, 'epoch': 0.52} + 52%|█████▏ | 2990/5772 [1:23:42<4:41:48, 6.08s/it] 52%|█████▏ | 2991/5772 [1:23:48<4:42:48, 6.10s/it] 52%|█████▏ | 2991/5772 [1:23:50<4:42:48, 6.10s/it] {'loss': 0.4615, 'learning_rate': 9.898985877016225e-06, 'epoch': 0.52} + 52%|█████▏ | 2991/5772 [1:23:50<4:42:48, 6.10s/it] {'loss': 0.4615, 'learning_rate': 9.898985877016225e-06, 'epoch': 0.52} + 52%|█████▏ | 2991/5772 [1:23:48<4:42:48, 6.10s/it] 52%|█████▏ | 2992/5772 [1:23:54<4:40:45, 6.06s/it] 52%|█████▏ | 2992/5772 [1:23:56<4:40:45, 6.06s/it] {'loss': 0.4593, 'learning_rate': 9.893374188380705e-06, 'epoch': 0.52} + 52%|█████▏ | 2992/5772 [1:23:56<4:40:45, 6.06s/it] {'loss': 0.4593, 'learning_rate': 9.893374188380705e-06, 'epoch': 0.52} + 52%|█████▏ | 2992/5772 [1:23:54<4:40:45, 6.06s/it] 52%|█████▏ | 2993/5772 [1:24:00<4:40:25, 6.05s/it] 52%|█████▏ | 2993/5772 [1:24:02<4:40:25, 6.05s/it] {'loss': 0.4683, 'learning_rate': 9.887762533326396e-06, 'epoch': 0.52} + 52%|█████▏ | 2993/5772 [1:24:02<4:40:25, 6.05s/it] {'loss': 0.4683, 'learning_rate': 9.887762533326396e-06, 'epoch': 0.52} + 52%|█████▏ | 2993/5772 [1:24:00<4:40:25, 6.05s/it] 52%|█████▏ | 2994/5772 [1:24:06<4:36:58, 5.98s/it] 52%|█████▏ | 2994/5772 [1:24:08<4:36:58, 5.98s/it] {'loss': 0.4717, 'learning_rate': 9.882150913620648e-06, 'epoch': 0.52} + 52%|█████▏ | 2994/5772 [1:24:08<4:36:58, 5.98s/it] {'loss': 0.4717, 'learning_rate': 9.882150913620648e-06, 'epoch': 0.52} + 52%|█████▏ | 2994/5772 [1:24:06<4:36:58, 5.98s/it] 52%|█████▏ | 2995/5772 [1:24:12<4:37:27, 5.99s/it] 52%|█████▏ | 2995/5772 [1:24:14<4:37:27, 5.99s/it] {'loss': 0.4672, 'learning_rate': 9.876539331030814e-06, 'epoch': 0.52} + 52%|█████▏ | 2995/5772 [1:24:14<4:37:27, 5.99s/it] {'loss': 0.4672, 'learning_rate': 9.876539331030814e-06, 'epoch': 0.52} + 52%|█████▏ | 2995/5772 [1:24:12<4:37:27, 5.99s/it] 52%|█████▏ | 2996/5772 [1:24:18<4:35:06, 5.95s/it] 52%|█████▏ | 2996/5772 [1:24:20<4:35:06, 5.95s/it] {'loss': 0.4518, 'learning_rate': 9.870927787324236e-06, 'epoch': 0.52} + 52%|█████▏ | 2996/5772 [1:24:20<4:35:06, 5.95s/it] {'loss': 0.4518, 'learning_rate': 9.870927787324236e-06, 'epoch': 0.52} + 52%|█████▏ | 2996/5772 [1:24:18<4:35:06, 5.95s/it] 52%|█████▏ | 2997/5772 [1:24:24<4:41:50, 6.09s/it] 52%|█████▏ | 2997/5772 [1:24:26<4:41:51, 6.09s/it] {'loss': 0.4798, 'learning_rate': 9.865316284268232e-06, 'epoch': 0.52} + 52%|█████▏ | 2997/5772 [1:24:26<4:41:51, 6.09s/it] {'loss': 0.4798, 'learning_rate': 9.865316284268232e-06, 'epoch': 0.52} + 52%|█████▏ | 2997/5772 [1:24:24<4:41:50, 6.09s/it] 52%|█████▏ | 2998/5772 [1:24:30<4:39:48, 6.05s/it] 52%|█████▏ | 2998/5772 [1:24:32<4:39:48, 6.05s/it] {'loss': 0.4739, 'learning_rate': 9.859704823630115e-06, 'epoch': 0.52} + 52%|█████▏ | 2998/5772 [1:24:32<4:39:48, 6.05s/it] {'loss': 0.4739, 'learning_rate': 9.859704823630115e-06, 'epoch': 0.52} + 52%|█████▏ | 2998/5772 [1:24:30<4:39:48, 6.05s/it] 52%|█████▏ | 2999/5772 [1:24:37<4:44:54, 6.16s/it] 52%|█████▏ | 2999/5772 [1:24:39<4:44:54, 6.16s/it] {'loss': 0.4692, 'learning_rate': 9.854093407177185e-06, 'epoch': 0.52} + 52%|█████▏ | 2999/5772 [1:24:39<4:44:54, 6.16s/it] {'loss': 0.4692, 'learning_rate': 9.854093407177185e-06, 'epoch': 0.52} + 52%|█████▏ | 2999/5772 [1:24:37<4:44:54, 6.16s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend...15 + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 52%|█████▏ | 3000/5772 [1:24:43<4:40:19, 6.07s/it]12 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 52%|█████▏ | 3000/5772 [1:24:45<4:40:19, 6.07s/it]9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4533, 'learning_rate': 9.848482036676725e-06, 'epoch': 0.52} + 52%|█████▏ | 3000/5772 [1:24:45<4:40:19, 6.07s/it] {'loss': 0.4533, 'learning_rate': 9.848482036676725e-06, 'epoch': 0.52} + 52%|█████▏ | 3000/5772 [1:24:43<4:40:19, 6.07s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 52%|█████▏ | 3001/5772 [1:25:06<8:33:59, 11.13s/it] 52%|█████▏ | 3001/5772 [1:25:08<8:33:59, 11.13s/it] {'loss': 0.4729, 'learning_rate': 9.842870713896004e-06, 'epoch': 0.52} + 52%|█████▏ | 3001/5772 [1:25:08<8:33:59, 11.13s/it] {'loss': 0.4729, 'learning_rate': 9.842870713896004e-06, 'epoch': 0.52} + 52%|█████▏ | 3001/5772 [1:25:06<8:33:59, 11.13s/it] 52%|█████▏ | 3002/5772 [1:25:12<7:23:02, 9.60s/it] 52%|█████▏ | 3002/5772 [1:25:14<7:23:02, 9.60s/it] {'loss': 0.4635, 'learning_rate': 9.837259440602274e-06, 'epoch': 0.52} + 52%|█████▏ | 3002/5772 [1:25:14<7:23:02, 9.60s/it] {'loss': 0.4635, 'learning_rate': 9.837259440602274e-06, 'epoch': 0.52} + 52%|█████▏ | 3002/5772 [1:25:12<7:23:02, 9.60s/it] 52%|█████▏ | 3003/5772 [1:25:18<6:34:18, 8.54s/it] 52%|█████▏ | 3003/5772 [1:25:20<6:34:18, 8.54s/it] {'loss': 0.4655, 'learning_rate': 9.831648218562787e-06, 'epoch': 0.52} + 52%|█████▏ | 3003/5772 [1:25:20<6:34:18, 8.54s/it] {'loss': 0.4655, 'learning_rate': 9.831648218562787e-06, 'epoch': 0.52} + 52%|█████▏ | 3003/5772 [1:25:18<6:34:18, 8.54s/it] 52%|█████▏ | 3004/5772 [1:25:24<6:06:02, 7.93s/it] 52%|█████▏ | 3004/5772 [1:25:26<6:06:02, 7.93s/it] {'loss': 0.4707, 'learning_rate': 9.82603704954475e-06, 'epoch': 0.52} + 52%|█████▏ | 3004/5772 [1:25:26<6:06:02, 7.93s/it] {'loss': 0.4707, 'learning_rate': 9.82603704954475e-06, 'epoch': 0.52} + 52%|█████▏ | 3004/5772 [1:25:24<6:06:02, 7.93s/it] 52%|█████▏ | 3005/5772 [1:25:30<5:39:21, 7.36s/it] 52%|█████▏ | 3005/5772 [1:25:32<5:39:21, 7.36s/it] {'loss': 0.4678, 'learning_rate': 9.820425935315381e-06, 'epoch': 0.52} + 52%|█████▏ | 3005/5772 [1:25:32<5:39:21, 7.36s/it] {'loss': 0.4678, 'learning_rate': 9.820425935315381e-06, 'epoch': 0.52} + 52%|█████▏ | 3005/5772 [1:25:30<5:39:21, 7.36s/it] 52%|█████▏ | 3006/5772 [1:25:36<5:22:56, 7.01s/it] 52%|█████▏ | 3006/5772 [1:25:38<5:22:56, 7.01s/it] {'loss': 0.4674, 'learning_rate': 9.814814877641865e-06, 'epoch': 0.52} + 52%|█████▏ | 3006/5772 [1:25:38<5:22:56, 7.01s/it] {'loss': 0.4674, 'learning_rate': 9.814814877641865e-06, 'epoch': 0.52} + 52%|█████▏ | 3006/5772 [1:25:36<5:22:56, 7.01s/it] 52%|█████▏ | 3007/5772 [1:25:42<5:07:47, 6.68s/it] 52%|█████▏ | 3007/5772 [1:25:44<5:07:46, 6.68s/it] {'loss': 0.4597, 'learning_rate': 9.809203878291374e-06, 'epoch': 0.52} + 52%|█████▏ | 3007/5772 [1:25:44<5:07:46, 6.68s/it] {'loss': 0.4597, 'learning_rate': 9.809203878291374e-06, 'epoch': 0.52} + 52%|█████▏ | 3007/5772 [1:25:42<5:07:47, 6.68s/it] 52%|█████▏ | 3008/5772 [1:25:48<4:59:20, 6.50s/it] 52%|█████▏ | 3008/5772 [1:25:50<4:59:21, 6.50s/it] {'loss': 0.4624, 'learning_rate': 9.80359293903106e-06, 'epoch': 0.52} + 52%|█████▏ | 3008/5772 [1:25:50<4:59:21, 6.50s/it] {'loss': 0.4624, 'learning_rate': 9.80359293903106e-06, 'epoch': 0.52} + 52%|█████▏ | 3008/5772 [1:25:48<4:59:20, 6.50s/it] 52%|█████▏ | 3009/5772 [1:25:54<4:53:18, 6.37s/it] 52%|█████▏ | 3009/5772 [1:25:56<4:53:18, 6.37s/it] {'loss': 0.4711, 'learning_rate': 9.797982061628056e-06, 'epoch': 0.52} + 52%|█████▏ | 3009/5772 [1:25:56<4:53:18, 6.37s/it] {'loss': 0.4711, 'learning_rate': 9.797982061628056e-06, 'epoch': 0.52} + 52%|█████▏ | 3009/5772 [1:25:54<4:53:18, 6.37s/it] 52%|█████▏ | 3010/5772 [1:26:01<4:59:50, 6.51s/it] 52%|█████▏ | 3010/5772 [1:26:03<4:59:50, 6.51s/it] {'loss': 0.4489, 'learning_rate': 9.792371247849481e-06, 'epoch': 0.52} + 52%|█████▏ | 3010/5772 [1:26:03<4:59:50, 6.51s/it] {'loss': 0.4489, 'learning_rate': 9.792371247849481e-06, 'epoch': 0.52} + 52%|█████▏ | 3010/5772 [1:26:01<4:59:50, 6.51s/it] 52%|█████▏ | 3011/5772 [1:26:07<4:53:51, 6.39s/it] 52%|█████▏ | 3011/5772 [1:26:09<4:53:51, 6.39s/it] {'loss': 0.4755, 'learning_rate': 9.786760499462425e-06, 'epoch': 0.52} + 52%|█████▏ | 3011/5772 [1:26:09<4:53:51, 6.39s/it] {'loss': 0.4755, 'learning_rate': 9.786760499462425e-06, 'epoch': 0.52} + 52%|█████▏ | 3011/5772 [1:26:07<4:53:51, 6.39s/it] 52%|█████▏ | 3012/5772 [1:26:14<4:51:58, 6.35s/it] 52%|█████▏ | 3012/5772 [1:26:16<4:51:58, 6.35s/it] {'loss': 0.4623, 'learning_rate': 9.781149818233969e-06, 'epoch': 0.52} + 52%|█████▏ | 3012/5772 [1:26:16<4:51:58, 6.35s/it] {'loss': 0.4623, 'learning_rate': 9.781149818233969e-06, 'epoch': 0.52} + 52%|█████▏ | 3012/5772 [1:26:14<4:51:58, 6.35s/it] 52%|█████▏ | 3013/5772 [1:26:20<4:52:08, 6.35s/it] 52%|█████▏ | 3013/5772 [1:26:22<4:52:08, 6.35s/it] {'loss': 0.4762, 'learning_rate': 9.775539205931153e-06, 'epoch': 0.52} + 52%|█████▏ | 3013/5772 [1:26:22<4:52:08, 6.35s/it] {'loss': 0.4762, 'learning_rate': 9.775539205931153e-06, 'epoch': 0.52} + 52%|█████▏ | 3013/5772 [1:26:20<4:52:08, 6.35s/it] 52%|█████▏ | 3014/5772 [1:26:26<4:47:39, 6.26s/it] 52%|█████▏ | 3014/5772 [1:26:28<4:47:39, 6.26s/it] {'loss': 0.4694, 'learning_rate': 9.769928664321021e-06, 'epoch': 0.52} + 52%|█████▏ | 3014/5772 [1:26:28<4:47:39, 6.26s/it] {'loss': 0.4694, 'learning_rate': 9.769928664321021e-06, 'epoch': 0.52} + 52%|█████▏ | 3014/5772 [1:26:26<4:47:39, 6.26s/it] 52%|█████▏ | 3015/5772 [1:26:32<4:44:49, 6.20s/it] 52%|█████▏ | 3015/5772 [1:26:34<4:44:49, 6.20s/it] {'loss': 0.4562, 'learning_rate': 9.764318195170575e-06, 'epoch': 0.52} + 52%|█████▏ | 3015/5772 [1:26:34<4:44:49, 6.20s/it] {'loss': 0.4562, 'learning_rate': 9.764318195170575e-06, 'epoch': 0.52} + 52%|█████▏ | 3015/5772 [1:26:32<4:44:49, 6.20s/it] 52%|█████▏ | 3016/5772 [1:26:38<4:37:45, 6.05s/it] 52%|█████▏ | 3016/5772 [1:26:40<4:37:45, 6.05s/it] {'loss': 0.4686, 'learning_rate': 9.758707800246806e-06, 'epoch': 0.52} + 52%|█████▏ | 3016/5772 [1:26:40<4:37:45, 6.05s/it] {'loss': 0.4686, 'learning_rate': 9.758707800246806e-06, 'epoch': 0.52} + 52%|█████▏ | 3016/5772 [1:26:38<4:37:45, 6.05s/it] 52%|█████▏ | 3017/5772 [1:26:44<4:38:36, 6.07s/it] 52%|█████▏ | 3017/5772 [1:26:46<4:38:36, 6.07s/it] {'loss': 0.4821, 'learning_rate': 9.753097481316672e-06, 'epoch': 0.52} + 52%|█████▏ | 3017/5772 [1:26:46<4:38:36, 6.07s/it] {'loss': 0.4821, 'learning_rate': 9.753097481316672e-06, 'epoch': 0.52} + 52%|█████▏ | 3017/5772 [1:26:44<4:38:36, 6.07s/it] 52%|█████▏ | 3018/5772 [1:26:50<4:39:11, 6.08s/it] 52%|█████▏ | 3018/5772 [1:26:52<4:39:11, 6.08s/it] {'loss': 0.4611, 'learning_rate': 9.747487240147112e-06, 'epoch': 0.52} + 52%|█████▏ | 3018/5772 [1:26:52<4:39:11, 6.08s/it] {'loss': 0.4611, 'learning_rate': 9.747487240147112e-06, 'epoch': 0.52} + 52%|█████▏ | 3018/5772 [1:26:50<4:39:11, 6.08s/it] 52%|█████▏ | 3019/5772 [1:26:57<4:47:45, 6.27s/it] 52%|█████▏ | 3019/5772 [1:26:59<4:47:46, 6.27s/it] {'loss': 0.4765, 'learning_rate': 9.741877078505046e-06, 'epoch': 0.52} + 52%|█████▏ | 3019/5772 [1:26:59<4:47:46, 6.27s/it] {'loss': 0.4765, 'learning_rate': 9.741877078505046e-06, 'epoch': 0.52} + 52%|█████▏ | 3019/5772 [1:26:57<4:47:45, 6.27s/it] 52%|█████▏ | 3020/5772 [1:27:03<4:43:17, 6.18s/it] 52%|█████▏ | 3020/5772 [1:27:05<4:43:16, 6.18s/it] {'loss': 0.4685, 'learning_rate': 9.736266998157353e-06, 'epoch': 0.52} + 52%|█████▏ | 3020/5772 [1:27:05<4:43:16, 6.18s/it] {'loss': 0.4685, 'learning_rate': 9.736266998157353e-06, 'epoch': 0.52} + 52%|█████▏ | 3020/5772 [1:27:03<4:43:17, 6.18s/it] 52%|█████▏ | 3021/5772 [1:27:09<4:39:03, 6.09s/it] 52%|█████▏ | 3021/5772 [1:27:11<4:39:03, 6.09s/it] {'loss': 0.4648, 'learning_rate': 9.73065700087091e-06, 'epoch': 0.52} + 52%|█████▏ | 3021/5772 [1:27:11<4:39:03, 6.09s/it] {'loss': 0.4648, 'learning_rate': 9.73065700087091e-06, 'epoch': 0.52} + 52%|█████▏ | 3021/5772 [1:27:09<4:39:03, 6.09s/it] 52%|█████▏ | 3022/5772 [1:27:15<4:39:15, 6.09s/it] 52%|█████▏ | 3022/5772 [1:27:17<4:39:15, 6.09s/it] {'loss': 0.4586, 'learning_rate': 9.725047088412538e-06, 'epoch': 0.52} + 52%|█████▏ | 3022/5772 [1:27:17<4:39:15, 6.09s/it] {'loss': 0.4586, 'learning_rate': 9.725047088412538e-06, 'epoch': 0.52} + 52%|█████▏ | 3022/5772 [1:27:15<4:39:15, 6.09s/it] 52%|█████▏ | 3023/5772 [1:27:21<4:42:53, 6.17s/it] 52%|█████▏ | 3023/5772 [1:27:23<4:42:53, 6.17s/it] {'loss': 0.4572, 'learning_rate': 9.719437262549061e-06, 'epoch': 0.52} + 52%|█████▏ | 3023/5772 [1:27:23<4:42:53, 6.17s/it] {'loss': 0.4572, 'learning_rate': 9.719437262549061e-06, 'epoch': 0.52} + 52%|█████▏ | 3023/5772 [1:27:21<4:42:53, 6.17s/it] 52%|█████▏ | 3024/5772 [1:27:27<4:43:18, 6.19s/it] 52%|█████▏ | 3024/5772 [1:27:29<4:43:18, 6.19s/it] {'loss': 0.4636, 'learning_rate': 9.713827525047261e-06, 'epoch': 0.52} + 52%|█████▏ | 3024/5772 [1:27:29<4:43:18, 6.19s/it] {'loss': 0.4636, 'learning_rate': 9.713827525047261e-06, 'epoch': 0.52} + 52%|█████▏ | 3024/5772 [1:27:27<4:43:18, 6.19s/it] 52%|█████▏ | 3025/5772 [1:27:34<4:44:18, 6.21s/it] 52%|█████▏ | 3025/5772 [1:27:36<4:44:18, 6.21s/it] {'loss': 0.4712, 'learning_rate': 9.708217877673888e-06, 'epoch': 0.52} + 52%|█████▏ | 3025/5772 [1:27:36<4:44:18, 6.21s/it] {'loss': 0.4712, 'learning_rate': 9.708217877673888e-06, 'epoch': 0.52} + 52%|█████▏ | 3025/5772 [1:27:34<4:44:18, 6.21s/it] 52%|█████▏ | 3026/5772 [1:27:40<4:48:36, 6.31s/it] 52%|█████▏ | 3026/5772 [1:27:42<4:48:36, 6.31s/it] {'loss': 0.4651, 'learning_rate': 9.702608322195674e-06, 'epoch': 0.52} + 52%|█████▏ | 3026/5772 [1:27:42<4:48:36, 6.31s/it] {'loss': 0.4651, 'learning_rate': 9.702608322195674e-06, 'epoch': 0.52} + 52%|█████▏ | 3026/5772 [1:27:40<4:48:36, 6.31s/it] 52%|█████▏ | 3027/5772 [1:27:47<4:51:43, 6.38s/it] 52%|█████▏ | 3027/5772 [1:27:49<4:51:43, 6.38s/it] {'loss': 0.4669, 'learning_rate': 9.696998860379313e-06, 'epoch': 0.52} + 52%|█████▏ | 3027/5772 [1:27:49<4:51:43, 6.38s/it] {'loss': 0.4669, 'learning_rate': 9.696998860379313e-06, 'epoch': 0.52} + 52%|█████▏ | 3027/5772 [1:27:47<4:51:43, 6.38s/it] 52%|█████▏ | 3028/5772 [1:27:53<4:51:20, 6.37s/it] 52%|█████▏ | 3028/5772 [1:27:55<4:51:19, 6.37s/it] {'loss': 0.4548, 'learning_rate': 9.691389493991478e-06, 'epoch': 0.52} + 52%|█████▏ | 3028/5772 [1:27:55<4:51:19, 6.37s/it] {'loss': 0.4548, 'learning_rate': 9.691389493991478e-06, 'epoch': 0.52} + 52%|█████▏ | 3028/5772 [1:27:53<4:51:20, 6.37s/it] 52%|█████▏ | 3029/5772 [1:27:59<4:47:38, 6.29s/it] 52%|█████▏ | 3029/5772 [1:28:01<4:47:38, 6.29s/it] {'loss': 0.4726, 'learning_rate': 9.685780224798805e-06, 'epoch': 0.52} + 52%|█████▏ | 3029/5772 [1:28:01<4:47:38, 6.29s/it] {'loss': 0.4726, 'learning_rate': 9.685780224798805e-06, 'epoch': 0.52} + 52%|█████▏ | 3029/5772 [1:27:59<4:47:38, 6.29s/it] 52%|█████▏ | 3030/5772 [1:28:05<4:44:11, 6.22s/it] 52%|█████▏ | 3030/5772 [1:28:07<4:44:11, 6.22s/it] {'loss': 0.4694, 'learning_rate': 9.6801710545679e-06, 'epoch': 0.52} + 52%|█████▏ | 3030/5772 [1:28:07<4:44:11, 6.22s/it] {'loss': 0.4694, 'learning_rate': 9.6801710545679e-06, 'epoch': 0.52} + 52%|█████▏ | 3030/5772 [1:28:05<4:44:11, 6.22s/it] 53%|█████▎ | 3031/5772 [1:28:11<4:38:05, 6.09s/it] 53%|█████▎ | 3031/5772 [1:28:13<4:38:05, 6.09s/it] {'loss': 0.4704, 'learning_rate': 9.674561985065349e-06, 'epoch': 0.53} + 53%|█████▎ | 3031/5772 [1:28:13<4:38:05, 6.09s/it] {'loss': 0.4704, 'learning_rate': 9.674561985065349e-06, 'epoch': 0.53} + 53%|█████▎ | 3031/5772 [1:28:11<4:38:05, 6.09s/it] 53%|█████▎ | 3032/5772 [1:28:17<4:35:10, 6.03s/it] 53%|█████▎ | 3032/5772 [1:28:19<4:35:10, 6.03s/it] {'loss': 0.4679, 'learning_rate': 9.668953018057687e-06, 'epoch': 0.53} + 53%|█████▎ | 3032/5772 [1:28:19<4:35:10, 6.03s/it] {'loss': 0.4679, 'learning_rate': 9.668953018057687e-06, 'epoch': 0.53} + 53%|█████▎ | 3032/5772 [1:28:17<4:35:10, 6.03s/it] 53%|█████▎ | 3033/5772 [1:28:23<4:37:14, 6.07s/it] 53%|█████▎ | 3033/5772 [1:28:25<4:37:14, 6.07s/it] {'loss': 0.4752, 'learning_rate': 9.663344155311436e-06, 'epoch': 0.53} + 53%|█████▎ | 3033/5772 [1:28:25<4:37:14, 6.07s/it] {'loss': 0.4752, 'learning_rate': 9.663344155311436e-06, 'epoch': 0.53} + 53%|█████▎ | 3033/5772 [1:28:23<4:37:14, 6.07s/it] 53%|█████▎ | 3034/5772 [1:28:29<4:37:58, 6.09s/it] 53%|█████▎ | 3034/5772 [1:28:31<4:37:58, 6.09s/it] {'loss': 0.4705, 'learning_rate': 9.657735398593068e-06, 'epoch': 0.53} + 53%|█████▎ | 3034/5772 [1:28:31<4:37:58, 6.09s/it] {'loss': 0.4705, 'learning_rate': 9.657735398593068e-06, 'epoch': 0.53} + 53%|█████▎ | 3034/5772 [1:28:29<4:37:58, 6.09s/it] 53%|█████▎ | 3035/5772 [1:28:35<4:40:47, 6.16s/it] 53%|█████▎ | 3035/5772 [1:28:37<4:40:47, 6.16s/it] {'loss': 0.4681, 'learning_rate': 9.652126749669036e-06, 'epoch': 0.53} + 53%|█████▎ | 3035/5772 [1:28:37<4:40:47, 6.16s/it] {'loss': 0.4681, 'learning_rate': 9.652126749669036e-06, 'epoch': 0.53} + 53%|█████▎ | 3035/5772 [1:28:35<4:40:47, 6.16s/it] 53%|█████▎ | 3036/5772 [1:28:41<4:35:32, 6.04s/it] 53%|█████▎ | 3036/5772 [1:28:43<4:35:32, 6.04s/it] {'loss': 0.4661, 'learning_rate': 9.646518210305747e-06, 'epoch': 0.53} + 53%|█████▎ | 3036/5772 [1:28:43<4:35:32, 6.04s/it] {'loss': 0.4661, 'learning_rate': 9.646518210305747e-06, 'epoch': 0.53} + 53%|█████▎ | 3036/5772 [1:28:41<4:35:32, 6.04s/it] 53%|█████▎ | 3037/5772 [1:28:47<4:37:04, 6.08s/it] 53%|█████▎ | 3037/5772 [1:28:49<4:37:04, 6.08s/it] {'loss': 0.4631, 'learning_rate': 9.64090978226958e-06, 'epoch': 0.53} + 53%|█████▎ | 3037/5772 [1:28:49<4:37:04, 6.08s/it] {'loss': 0.4631, 'learning_rate': 9.64090978226958e-06, 'epoch': 0.53} + 53%|█████▎ | 3037/5772 [1:28:47<4:37:04, 6.08s/it] 53%|█████▎ | 3038/5772 [1:28:54<4:43:39, 6.23s/it] 53%|█████▎ | 3038/5772 [1:28:56<4:43:39, 6.23s/it] {'loss': 0.4626, 'learning_rate': 9.635301467326888e-06, 'epoch': 0.53} + 53%|█████▎ | 3038/5772 [1:28:56<4:43:39, 6.23s/it] {'loss': 0.4626, 'learning_rate': 9.635301467326888e-06, 'epoch': 0.53} + 53%|█████▎ | 3038/5772 [1:28:54<4:43:39, 6.23s/it] 53%|█████▎ | 3039/5772 [1:29:00<4:45:52, 6.28s/it] 53%|█████▎ | 3039/5772 [1:29:02<4:45:52, 6.28s/it] {'loss': 0.4856, 'learning_rate': 9.629693267243963e-06, 'epoch': 0.53} + 53%|█████▎ | 3039/5772 [1:29:02<4:45:52, 6.28s/it] {'loss': 0.4856, 'learning_rate': 9.629693267243963e-06, 'epoch': 0.53} + 53%|█████▎ | 3039/5772 [1:29:00<4:45:52, 6.28s/it] 53%|█████▎ | 3040/5772 [1:29:07<4:45:18, 6.27s/it] 53%|█████▎ | 3040/5772 [1:29:09<4:45:18, 6.27s/it] {'loss': 0.4672, 'learning_rate': 9.62408518378709e-06, 'epoch': 0.53} + 53%|█████▎ | 3040/5772 [1:29:09<4:45:18, 6.27s/it] {'loss': 0.4672, 'learning_rate': 9.62408518378709e-06, 'epoch': 0.53} + 53%|█████▎ | 3040/5772 [1:29:07<4:45:18, 6.27s/it] 53%|█████▎ | 3041/5772 [1:29:13<4:43:05, 6.22s/it] 53%|█████▎ | 3041/5772 [1:29:15<4:43:05, 6.22s/it] {'loss': 0.4745, 'learning_rate': 9.618477218722496e-06, 'epoch': 0.53} + 53%|█████▎ | 3041/5772 [1:29:15<4:43:05, 6.22s/it] {'loss': 0.4745, 'learning_rate': 9.618477218722496e-06, 'epoch': 0.53} + 53%|█████▎ | 3041/5772 [1:29:13<4:43:05, 6.22s/it] 53%|█████▎ | 3042/5772 [1:29:19<4:41:02, 6.18s/it] 53%|█████▎ | 3042/5772 [1:29:21<4:41:06, 6.18s/it] {'loss': 0.4665, 'learning_rate': 9.612869373816383e-06, 'epoch': 0.53} + {'loss': 0.4665, 'learning_rate': 9.612869373816383e-06, 'epoch': 0.53} 53%|█████▎ | 3042/5772 [1:29:21<4:41:06, 6.18s/it] + 53%|█████▎ | 3042/5772 [1:29:19<4:41:02, 6.18s/it] 53%|█████▎ | 3043/5772 [1:29:25<4:41:16, 6.18s/it] 53%|█████▎ | 3043/5772 [1:29:27<4:41:15, 6.18s/it] {'loss': 0.4643, 'learning_rate': 9.607261650834906e-06, 'epoch': 0.53} + 53%|█████▎ | 3043/5772 [1:29:27<4:41:15, 6.18s/it] {'loss': 0.4643, 'learning_rate': 9.607261650834906e-06, 'epoch': 0.53} + 53%|█████▎ | 3043/5772 [1:29:25<4:41:16, 6.18s/it] 53%|█████▎ | 3044/5772 [1:29:31<4:40:06, 6.16s/it] 53%|█████▎ | 3044/5772 [1:29:33<4:40:05, 6.16s/it] {'loss': 0.4712, 'learning_rate': 9.601654051544188e-06, 'epoch': 0.53} + 53%|█████▎ | 3044/5772 [1:29:33<4:40:05, 6.16s/it] {'loss': 0.4712, 'learning_rate': 9.601654051544188e-06, 'epoch': 0.53} + 53%|█████▎ | 3044/5772 [1:29:31<4:40:06, 6.16s/it] 53%|█████▎ | 3045/5772 [1:29:37<4:42:27, 6.21s/it] 53%|█████▎ | 3045/5772 [1:29:39<4:42:26, 6.21s/it] {'loss': 0.4799, 'learning_rate': 9.596046577710314e-06, 'epoch': 0.53} + 53%|█████▎ | 3045/5772 [1:29:39<4:42:26, 6.21s/it] {'loss': 0.4799, 'learning_rate': 9.596046577710314e-06, 'epoch': 0.53} + 53%|█████▎ | 3045/5772 [1:29:37<4:42:27, 6.21s/it] 53%|█████▎ | 3046/5772 [1:29:44<4:45:18, 6.28s/it] 53%|█████▎ | 3046/5772 [1:29:46<4:45:18, 6.28s/it] {'loss': 0.4698, 'learning_rate': 9.59043923109932e-06, 'epoch': 0.53} + 53%|█████▎ | 3046/5772 [1:29:46<4:45:18, 6.28s/it] {'loss': 0.4698, 'learning_rate': 9.59043923109932e-06, 'epoch': 0.53} + 53%|█████▎ | 3046/5772 [1:29:44<4:45:18, 6.28s/it] 53%|█████▎ | 3047/5772 [1:29:50<4:42:10, 6.21s/it] 53%|█████▎ | 3047/5772 [1:29:52<4:42:10, 6.21s/it] {'loss': 0.4743, 'learning_rate': 9.58483201347722e-06, 'epoch': 0.53} + 53%|█████▎ | 3047/5772 [1:29:52<4:42:10, 6.21s/it] {'loss': 0.4743, 'learning_rate': 9.58483201347722e-06, 'epoch': 0.53} + 53%|█████▎ | 3047/5772 [1:29:50<4:42:10, 6.21s/it] 53%|█████▎ | 3048/5772 [1:29:56<4:38:16, 6.13s/it] 53%|█████▎ | 3048/5772 [1:29:58<4:38:15, 6.13s/it] {'loss': 0.4615, 'learning_rate': 9.579224926609962e-06, 'epoch': 0.53} + 53%|█████▎ | 3048/5772 [1:29:58<4:38:15, 6.13s/it] {'loss': 0.4615, 'learning_rate': 9.579224926609962e-06, 'epoch': 0.53} + 53%|█████▎ | 3048/5772 [1:29:56<4:38:16, 6.13s/it] 53%|█████▎ | 3049/5772 [1:30:02<4:42:55, 6.23s/it] 53%|█████▎ | 3049/5772 [1:30:04<4:42:55, 6.23s/it] {'loss': 0.4528, 'learning_rate': 9.57361797226348e-06, 'epoch': 0.53} + 53%|█████▎ | 3049/5772 [1:30:04<4:42:55, 6.23s/it] {'loss': 0.4528, 'learning_rate': 9.57361797226348e-06, 'epoch': 0.53} + 53%|█████▎ | 3049/5772 [1:30:02<4:42:55, 6.23s/it]10 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +0 6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 3050/5772 [1:30:08<4:38:31, 6.14s/it]1 AutoResumeHook: Checking whether to suspend... +1215 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 3050/5772 [1:30:10<4:38:31, 6.14s/it] {'loss': 0.4662, 'learning_rate': 9.568011152203642e-06, 'epoch': 0.53} + 53%|█████▎ | 3050/5772 [1:30:10<4:38:31, 6.14s/it] {'loss': 0.4662, 'learning_rate': 9.568011152203642e-06, 'epoch': 0.53} + 53%|█████▎ | 3050/5772 [1:30:08<4:38:31, 6.14s/it]WARNING: tokenization mismatch: 1 vs. 64. [[{'from': 'human', 'value': '\nWhat vitamin is this vegetable associated with?\nAnswer the question using a single word or phrase.'}, {'from': 'gpt', 'value': ''}]] (ignored) + 53%|█████▎ | 3051/5772 [1:30:14<4:36:50, 6.10s/it] 53%|█████▎ | 3051/5772 [1:30:16<4:36:49, 6.10s/it] {'loss': 0.4747, 'learning_rate': 9.562404468196292e-06, 'epoch': 0.53} + 53%|█████▎ | 3051/5772 [1:30:16<4:36:49, 6.10s/it] {'loss': 0.4747, 'learning_rate': 9.562404468196292e-06, 'epoch': 0.53} + 53%|█████▎ | 3051/5772 [1:30:14<4:36:50, 6.10s/it] 53%|█████▎ | 3052/5772 [1:30:21<4:41:45, 6.22s/it] 53%|█████▎ | 3052/5772 [1:30:23<4:41:44, 6.22s/it] {'loss': 0.464, 'learning_rate': 9.556797922007221e-06, 'epoch': 0.53} + 53%|█████▎ | 3052/5772 [1:30:23<4:41:44, 6.22s/it] {'loss': 0.464, 'learning_rate': 9.556797922007221e-06, 'epoch': 0.53} + 53%|█████▎ | 3052/5772 [1:30:21<4:41:45, 6.22s/it] 53%|█████▎ | 3053/5772 [1:30:27<4:38:53, 6.15s/it] 53%|█████▎ | 3053/5772 [1:30:29<4:38:53, 6.15s/it] {'loss': 0.4595, 'learning_rate': 9.55119151540218e-06, 'epoch': 0.53} + 53%|█████▎ | 3053/5772 [1:30:29<4:38:53, 6.15s/it] {'loss': 0.4595, 'learning_rate': 9.55119151540218e-06, 'epoch': 0.53} + 53%|█████▎ | 3053/5772 [1:30:27<4:38:53, 6.15s/it] 53%|█████▎ | 3054/5772 [1:30:33<4:35:19, 6.08s/it] 53%|█████▎ | 3054/5772 [1:30:35<4:35:19, 6.08s/it] {'loss': 0.4652, 'learning_rate': 9.545585250146879e-06, 'epoch': 0.53} + 53%|█████▎ | 3054/5772 [1:30:35<4:35:19, 6.08s/it] {'loss': 0.4652, 'learning_rate': 9.545585250146879e-06, 'epoch': 0.53} + 53%|█████▎ | 3054/5772 [1:30:33<4:35:19, 6.08s/it] 53%|█████▎ | 3055/5772 [1:30:39<4:37:55, 6.14s/it] 53%|█████▎ | 3055/5772 [1:30:41<4:37:55, 6.14s/it] {'loss': 0.4774, 'learning_rate': 9.539979128006971e-06, 'epoch': 0.53} + 53%|█████▎ | 3055/5772 [1:30:41<4:37:55, 6.14s/it] {'loss': 0.4774, 'learning_rate': 9.539979128006971e-06, 'epoch': 0.53} + 53%|█████▎ | 3055/5772 [1:30:39<4:37:55, 6.14s/it] 53%|█████▎ | 3056/5772 [1:30:45<4:38:46, 6.16s/it] 53%|█████▎ | 3056/5772 [1:30:47<4:38:46, 6.16s/it] {'loss': 0.4644, 'learning_rate': 9.534373150748086e-06, 'epoch': 0.53} + 53%|█████▎ | 3056/5772 [1:30:47<4:38:46, 6.16s/it] {'loss': 0.4644, 'learning_rate': 9.534373150748086e-06, 'epoch': 0.53} + 53%|█████▎ | 3056/5772 [1:30:45<4:38:46, 6.16s/it] 53%|█████▎ | 3057/5772 [1:30:52<4:42:48, 6.25s/it] 53%|█████▎ | 3057/5772 [1:30:54<4:42:48, 6.25s/it] {'loss': 0.469, 'learning_rate': 9.528767320135783e-06, 'epoch': 0.53} + 53%|█████▎ | 3057/5772 [1:30:54<4:42:48, 6.25s/it] {'loss': 0.469, 'learning_rate': 9.528767320135783e-06, 'epoch': 0.53} + 53%|█████▎ | 3057/5772 [1:30:52<4:42:48, 6.25s/it] 53%|█████▎ | 3058/5772 [1:30:58<4:40:15, 6.20s/it] 53%|█████▎ | 3058/5772 [1:31:00<4:40:16, 6.20s/it] {'loss': 0.4634, 'learning_rate': 9.523161637935592e-06, 'epoch': 0.53} + 53%|█████▎ | 3058/5772 [1:31:00<4:40:16, 6.20s/it] {'loss': 0.4634, 'learning_rate': 9.523161637935592e-06, 'epoch': 0.53} + 53%|█████▎ | 3058/5772 [1:30:58<4:40:15, 6.20s/it] 53%|█████▎ | 3059/5772 [1:31:04<4:38:10, 6.15s/it] 53%|█████▎ | 3059/5772 [1:31:06<4:38:10, 6.15s/it] {'loss': 0.4786, 'learning_rate': 9.517556105912994e-06, 'epoch': 0.53} + 53%|█████▎ | 3059/5772 [1:31:06<4:38:10, 6.15s/it] {'loss': 0.4786, 'learning_rate': 9.517556105912994e-06, 'epoch': 0.53} + 53%|█████▎ | 3059/5772 [1:31:04<4:38:10, 6.15s/it] 53%|█████▎ | 3060/5772 [1:31:10<4:40:03, 6.20s/it] 53%|█████▎ | 3060/5772 [1:31:12<4:40:03, 6.20s/it] {'loss': 0.4586, 'learning_rate': 9.511950725833418e-06, 'epoch': 0.53} + 53%|█████▎ | 3060/5772 [1:31:12<4:40:03, 6.20s/it] {'loss': 0.4586, 'learning_rate': 9.511950725833418e-06, 'epoch': 0.53} + 53%|█████▎ | 3060/5772 [1:31:10<4:40:03, 6.20s/it] 53%|█████▎ | 3061/5772 [1:31:16<4:37:41, 6.15s/it] 53%|█████▎ | 3061/5772 [1:31:18<4:37:41, 6.15s/it] {'loss': 0.4675, 'learning_rate': 9.50634549946225e-06, 'epoch': 0.53} + 53%|█████▎ | 3061/5772 [1:31:18<4:37:41, 6.15s/it] {'loss': 0.4675, 'learning_rate': 9.50634549946225e-06, 'epoch': 0.53} + 53%|█████▎ | 3061/5772 [1:31:16<4:37:41, 6.15s/it] 53%|█████▎ | 3062/5772 [1:31:22<4:37:35, 6.15s/it] 53%|█████▎ | 3062/5772 [1:31:24<4:37:35, 6.15s/it] {'loss': 0.4654, 'learning_rate': 9.500740428564819e-06, 'epoch': 0.53} + 53%|█████▎ | 3062/5772 [1:31:24<4:37:35, 6.15s/it] {'loss': 0.4654, 'learning_rate': 9.500740428564819e-06, 'epoch': 0.53} + 53%|█████▎ | 3062/5772 [1:31:22<4:37:35, 6.15s/it] 53%|█████▎ | 3063/5772 [1:31:28<4:33:49, 6.06s/it] 53%|█████▎ | 3063/5772 [1:31:30<4:33:49, 6.06s/it] {'loss': 0.4633, 'learning_rate': 9.495135514906415e-06, 'epoch': 0.53} + 53%|█████▎ | 3063/5772 [1:31:30<4:33:49, 6.06s/it] {'loss': 0.4633, 'learning_rate': 9.495135514906415e-06, 'epoch': 0.53} + 53%|█████▎ | 3063/5772 [1:31:28<4:33:49, 6.06s/it] 53%|█████▎ | 3064/5772 [1:31:35<4:39:50, 6.20s/it] 53%|█████▎ | 3064/5772 [1:31:37<4:39:50, 6.20s/it] {'loss': 0.4626, 'learning_rate': 9.489530760252272e-06, 'epoch': 0.53} + 53%|█████▎ | 3064/5772 [1:31:37<4:39:50, 6.20s/it] {'loss': 0.4626, 'learning_rate': 9.489530760252272e-06, 'epoch': 0.53} + 53%|█████▎ | 3064/5772 [1:31:35<4:39:50, 6.20s/it] 53%|█████▎ | 3065/5772 [1:31:41<4:40:06, 6.21s/it] 53%|█████▎ | 3065/5772 [1:31:43<4:40:06, 6.21s/it]{'loss': 0.4748, 'learning_rate': 9.483926166367578e-06, 'epoch': 0.53} + {'loss': 0.4748, 'learning_rate': 9.483926166367578e-06, 'epoch': 0.53} + 53%|█████▎ | 3065/5772 [1:31:43<4:40:06, 6.21s/it] 53%|█████▎ | 3065/5772 [1:31:41<4:40:06, 6.21s/it] 53%|█████▎ | 3066/5772 [1:31:47<4:37:44, 6.16s/it] 53%|█████▎ | 3066/5772 [1:31:49<4:37:43, 6.16s/it] {'loss': 0.4612, 'learning_rate': 9.478321735017471e-06, 'epoch': 0.53} + 53%|█████▎ | 3066/5772 [1:31:49<4:37:43, 6.16s/it] {'loss': 0.4612, 'learning_rate': 9.478321735017471e-06, 'epoch': 0.53} + 53%|█████▎ | 3066/5772 [1:31:47<4:37:44, 6.16s/it] 53%|█████▎ | 3067/5772 [1:31:53<4:39:25, 6.20s/it] 53%|█████▎ | 3067/5772 [1:31:55<4:39:25, 6.20s/it] {'loss': 0.4688, 'learning_rate': 9.47271746796703e-06, 'epoch': 0.53} + 53%|█████▎ | 3067/5772 [1:31:55<4:39:25, 6.20s/it] {'loss': 0.4688, 'learning_rate': 9.47271746796703e-06, 'epoch': 0.53} + 53%|█████▎ | 3067/5772 [1:31:53<4:39:25, 6.20s/it] 53%|█████▎ | 3068/5772 [1:32:00<4:45:30, 6.34s/it] 53%|█████▎ | 3068/5772 [1:32:02<4:45:30, 6.34s/it] {'loss': 0.4579, 'learning_rate': 9.467113366981294e-06, 'epoch': 0.53} + 53%|█████▎ | 3068/5772 [1:32:02<4:45:30, 6.34s/it] {'loss': 0.4579, 'learning_rate': 9.467113366981294e-06, 'epoch': 0.53} + 53%|█████▎ | 3068/5772 [1:32:00<4:45:30, 6.34s/it] 53%|█████▎ | 3069/5772 [1:32:06<4:42:45, 6.28s/it] 53%|█████▎ | 3069/5772 [1:32:08<4:42:45, 6.28s/it] {'loss': 0.4761, 'learning_rate': 9.461509433825238e-06, 'epoch': 0.53} + 53%|█████▎ | 3069/5772 [1:32:08<4:42:45, 6.28s/it] {'loss': 0.4761, 'learning_rate': 9.461509433825238e-06, 'epoch': 0.53} + 53%|█████▎ | 3069/5772 [1:32:06<4:42:45, 6.28s/it] 53%|█████▎ | 3070/5772 [1:32:12<4:37:34, 6.16s/it] 53%|█████▎ | 3070/5772 [1:32:14<4:37:34, 6.16s/it] {'loss': 0.4584, 'learning_rate': 9.455905670263792e-06, 'epoch': 0.53} + 53%|█████▎ | 3070/5772 [1:32:14<4:37:34, 6.16s/it] {'loss': 0.4584, 'learning_rate': 9.455905670263792e-06, 'epoch': 0.53} + 53%|█████▎ | 3070/5772 [1:32:12<4:37:34, 6.16s/it] 53%|█████▎ | 3071/5772 [1:32:18<4:37:09, 6.16s/it] 53%|█████▎ | 3071/5772 [1:32:20<4:37:09, 6.16s/it] {'loss': 0.4614, 'learning_rate': 9.45030207806183e-06, 'epoch': 0.53} + 53%|█████▎ | 3071/5772 [1:32:20<4:37:09, 6.16s/it] {'loss': 0.4614, 'learning_rate': 9.45030207806183e-06, 'epoch': 0.53} + 53%|█████▎ | 3071/5772 [1:32:18<4:37:09, 6.16s/it] 53%|█████▎ | 3072/5772 [1:32:24<4:36:32, 6.15s/it] 53%|█████▎ | 3072/5772 [1:32:26<4:36:32, 6.15s/it] {'loss': 0.4731, 'learning_rate': 9.44469865898417e-06, 'epoch': 0.53} + 53%|█████▎ | 3072/5772 [1:32:26<4:36:32, 6.15s/it] {'loss': 0.4731, 'learning_rate': 9.44469865898417e-06, 'epoch': 0.53} + 53%|█████▎ | 3072/5772 [1:32:24<4:36:32, 6.15s/it] 53%|█████▎ | 3073/5772 [1:32:30<4:34:02, 6.09s/it] 53%|█████▎ | 3073/5772 [1:32:32<4:34:02, 6.09s/it] {'loss': 0.4735, 'learning_rate': 9.439095414795584e-06, 'epoch': 0.53} + 53%|█████▎ | 3073/5772 [1:32:32<4:34:02, 6.09s/it] {'loss': 0.4735, 'learning_rate': 9.439095414795584e-06, 'epoch': 0.53} + 53%|█████▎ | 3073/5772 [1:32:30<4:34:02, 6.09s/it] 53%|█████▎ | 3074/5772 [1:32:36<4:32:13, 6.05s/it] 53%|█████▎ | 3074/5772 [1:32:38<4:32:13, 6.05s/it] {'loss': 0.4629, 'learning_rate': 9.433492347260776e-06, 'epoch': 0.53} + 53%|█████▎ | 3074/5772 [1:32:38<4:32:13, 6.05s/it] {'loss': 0.4629, 'learning_rate': 9.433492347260776e-06, 'epoch': 0.53} + 53%|█████▎ | 3074/5772 [1:32:36<4:32:13, 6.05s/it] 53%|█████▎ | 3075/5772 [1:32:42<4:32:47, 6.07s/it] 53%|█████▎ | 3075/5772 [1:32:44<4:32:47, 6.07s/it] {'loss': 0.478, 'learning_rate': 9.427889458144405e-06, 'epoch': 0.53} + 53%|█████▎ | 3075/5772 [1:32:44<4:32:47, 6.07s/it] {'loss': 0.478, 'learning_rate': 9.427889458144405e-06, 'epoch': 0.53} + 53%|█████▎ | 3075/5772 [1:32:42<4:32:47, 6.07s/it] 53%|█████▎ | 3076/5772 [1:32:48<4:35:35, 6.13s/it] 53%|█████▎ | 3076/5772 [1:32:50<4:35:35, 6.13s/it] {'loss': 0.4597, 'learning_rate': 9.422286749211068e-06, 'epoch': 0.53} + 53%|█████▎ | 3076/5772 [1:32:50<4:35:35, 6.13s/it] {'loss': 0.4597, 'learning_rate': 9.422286749211068e-06, 'epoch': 0.53} + 53%|█████▎ | 3076/5772 [1:32:48<4:35:35, 6.13s/it] 53%|█████▎ | 3077/5772 [1:32:54<4:32:23, 6.06s/it] 53%|█████▎ | 3077/5772 [1:32:56<4:32:23, 6.06s/it] {'loss': 0.4684, 'learning_rate': 9.416684222225308e-06, 'epoch': 0.53} + 53%|█████▎ | 3077/5772 [1:32:56<4:32:23, 6.06s/it] {'loss': 0.4684, 'learning_rate': 9.416684222225308e-06, 'epoch': 0.53} + 53%|█████▎ | 3077/5772 [1:32:54<4:32:23, 6.06s/it] 53%|█████▎ | 3078/5772 [1:33:00<4:30:41, 6.03s/it] 53%|█████▎ | 3078/5772 [1:33:02<4:30:41, 6.03s/it] {'loss': 0.4601, 'learning_rate': 9.411081878951607e-06, 'epoch': 0.53} + 53%|█████▎ | 3078/5772 [1:33:02<4:30:41, 6.03s/it] {'loss': 0.4601, 'learning_rate': 9.411081878951607e-06, 'epoch': 0.53} + 53%|█████▎ | 3078/5772 [1:33:00<4:30:41, 6.03s/it] 53%|█████▎ | 3079/5772 [1:33:06<4:31:46, 6.06s/it] 53%|█████▎ | 3079/5772 [1:33:08<4:31:46, 6.06s/it] {'loss': 0.481, 'learning_rate': 9.40547972115439e-06, 'epoch': 0.53} + 53%|█████▎ | 3079/5772 [1:33:08<4:31:46, 6.06s/it] {'loss': 0.481, 'learning_rate': 9.40547972115439e-06, 'epoch': 0.53} + 53%|█████▎ | 3079/5772 [1:33:06<4:31:46, 6.06s/it] 53%|█████▎ | 3080/5772 [1:33:12<4:29:47, 6.01s/it] 53%|█████▎ | 3080/5772 [1:33:14<4:29:47, 6.01s/it] {'loss': 0.4723, 'learning_rate': 9.39987775059804e-06, 'epoch': 0.53} + 53%|█████▎ | 3080/5772 [1:33:14<4:29:47, 6.01s/it] {'loss': 0.4723, 'learning_rate': 9.39987775059804e-06, 'epoch': 0.53} + 53%|█████▎ | 3080/5772 [1:33:12<4:29:47, 6.01s/it] 53%|█████▎ | 3081/5772 [1:33:18<4:29:07, 6.00s/it] 53%|█████▎ | 3081/5772 [1:33:20<4:29:07, 6.00s/it] {'loss': 0.4726, 'learning_rate': 9.394275969046845e-06, 'epoch': 0.53} + 53%|█████▎ | 3081/5772 [1:33:20<4:29:07, 6.00s/it] {'loss': 0.4726, 'learning_rate': 9.394275969046845e-06, 'epoch': 0.53} + 53%|█████▎ | 3081/5772 [1:33:18<4:29:07, 6.00s/it] 53%|█████▎ | 3082/5772 [1:33:24<4:27:40, 5.97s/it] 53%|█████▎ | 3082/5772 [1:33:26<4:27:40, 5.97s/it] {'loss': 0.4632, 'learning_rate': 9.388674378265074e-06, 'epoch': 0.53} + 53%|█████▎ | 3082/5772 [1:33:26<4:27:40, 5.97s/it] {'loss': 0.4632, 'learning_rate': 9.388674378265074e-06, 'epoch': 0.53} + 53%|█████▎ | 3082/5772 [1:33:24<4:27:40, 5.97s/it] 53%|█████▎ | 3083/5772 [1:33:30<4:32:02, 6.07s/it] 53%|█████▎ | 3083/5772 [1:33:32<4:32:02, 6.07s/it] {'loss': 0.4645, 'learning_rate': 9.383072980016902e-06, 'epoch': 0.53} + 53%|█████▎ | 3083/5772 [1:33:32<4:32:02, 6.07s/it] {'loss': 0.4645, 'learning_rate': 9.383072980016902e-06, 'epoch': 0.53} + 53%|█████▎ | 3083/5772 [1:33:30<4:32:02, 6.07s/it] 53%|█████▎ | 3084/5772 [1:33:37<4:34:45, 6.13s/it] 53%|█████▎ | 3084/5772 [1:33:39<4:34:45, 6.13s/it] {'loss': 0.477, 'learning_rate': 9.377471776066469e-06, 'epoch': 0.53} + 53%|█████▎ | 3084/5772 [1:33:39<4:34:45, 6.13s/it] {'loss': 0.477, 'learning_rate': 9.377471776066469e-06, 'epoch': 0.53} + 53%|█████▎ | 3084/5772 [1:33:37<4:34:45, 6.13s/it] 53%|█████▎ | 3085/5772 [1:33:43<4:34:35, 6.13s/it] 53%|█████▎ | 3085/5772 [1:33:45<4:34:36, 6.13s/it] {'loss': 0.4647, 'learning_rate': 9.371870768177836e-06, 'epoch': 0.53} + 53%|█████▎ | 3085/5772 [1:33:45<4:34:36, 6.13s/it] {'loss': 0.4647, 'learning_rate': 9.371870768177836e-06, 'epoch': 0.53} + 53%|█████▎ | 3085/5772 [1:33:43<4:34:35, 6.13s/it] 53%|█████▎ | 3086/5772 [1:33:49<4:33:23, 6.11s/it] 53%|█████▎ | 3086/5772 [1:33:51<4:33:23, 6.11s/it] {'loss': 0.4684, 'learning_rate': 9.366269958115014e-06, 'epoch': 0.53} + 53%|█████▎ | 3086/5772 [1:33:51<4:33:23, 6.11s/it] {'loss': 0.4684, 'learning_rate': 9.366269958115014e-06, 'epoch': 0.53} + 53%|█████▎ | 3086/5772 [1:33:49<4:33:23, 6.11s/it] 53%|█████▎ | 3087/5772 [1:33:55<4:32:55, 6.10s/it] 53%|█████▎ | 3087/5772 [1:33:57<4:32:55, 6.10s/it] {'loss': 0.4689, 'learning_rate': 9.360669347641946e-06, 'epoch': 0.53} + 53%|█████▎ | 3087/5772 [1:33:57<4:32:55, 6.10s/it] {'loss': 0.4689, 'learning_rate': 9.360669347641946e-06, 'epoch': 0.53} + 53%|█████▎ | 3087/5772 [1:33:55<4:32:55, 6.10s/it] 53%|█████▎ | 3088/5772 [1:34:01<4:34:16, 6.13s/it] 53%|█████▎ | 3088/5772 [1:34:03<4:34:17, 6.13s/it] {'loss': 0.4689, 'learning_rate': 9.355068938522508e-06, 'epoch': 0.53} + 53%|█████▎ | 3088/5772 [1:34:03<4:34:17, 6.13s/it] {'loss': 0.4689, 'learning_rate': 9.355068938522508e-06, 'epoch': 0.53} + 53%|█████▎ | 3088/5772 [1:34:01<4:34:16, 6.13s/it] 54%|█████▎ | 3089/5772 [1:34:07<4:35:31, 6.16s/it] 54%|█████▎ | 3089/5772 [1:34:09<4:35:30, 6.16s/it] {'loss': 0.4681, 'learning_rate': 9.349468732520529e-06, 'epoch': 0.54} + 54%|█████▎ | 3089/5772 [1:34:09<4:35:30, 6.16s/it] {'loss': 0.4681, 'learning_rate': 9.349468732520529e-06, 'epoch': 0.54} + 54%|█████▎ | 3089/5772 [1:34:07<4:35:31, 6.16s/it] 54%|█████▎ | 3090/5772 [1:34:14<4:36:44, 6.19s/it] 54%|█████▎ | 3090/5772 [1:34:16<4:36:44, 6.19s/it] {'loss': 0.4613, 'learning_rate': 9.34386873139975e-06, 'epoch': 0.54} + 54%|█████▎ | 3090/5772 [1:34:16<4:36:44, 6.19s/it] {'loss': 0.4613, 'learning_rate': 9.34386873139975e-06, 'epoch': 0.54} + 54%|█████▎ | 3090/5772 [1:34:14<4:36:44, 6.19s/it] 54%|█████▎ | 3091/5772 [1:34:20<4:31:56, 6.09s/it] 54%|█████▎ | 3091/5772 [1:34:22<4:31:56, 6.09s/it] {'loss': 0.4658, 'learning_rate': 9.33826893692387e-06, 'epoch': 0.54} + 54%|█████▎ | 3091/5772 [1:34:22<4:31:56, 6.09s/it] {'loss': 0.4658, 'learning_rate': 9.33826893692387e-06, 'epoch': 0.54} + 54%|█████▎ | 3091/5772 [1:34:20<4:31:56, 6.09s/it] 54%|█████▎ | 3092/5772 [1:34:26<4:30:56, 6.07s/it] 54%|█████▎ | 3092/5772 [1:34:28<4:30:56, 6.07s/it] {'loss': 0.4719, 'learning_rate': 9.332669350856503e-06, 'epoch': 0.54} + 54%|█████▎ | 3092/5772 [1:34:28<4:30:56, 6.07s/it] {'loss': 0.4719, 'learning_rate': 9.332669350856503e-06, 'epoch': 0.54} + 54%|█████▎ | 3092/5772 [1:34:26<4:30:56, 6.07s/it] 54%|█████▎ | 3093/5772 [1:34:32<4:33:07, 6.12s/it] 54%|█████▎ | 3093/5772 [1:34:34<4:33:07, 6.12s/it] {'loss': 0.4618, 'learning_rate': 9.327069974961219e-06, 'epoch': 0.54} + 54%|█████▎ | 3093/5772 [1:34:34<4:33:07, 6.12s/it] {'loss': 0.4618, 'learning_rate': 9.327069974961219e-06, 'epoch': 0.54} + 54%|█████▎ | 3093/5772 [1:34:32<4:33:07, 6.12s/it] 54%|█████▎ | 3094/5772 [1:34:38<4:28:43, 6.02s/it] 54%|█████▎ | 3094/5772 [1:34:40<4:28:43, 6.02s/it] {'loss': 0.4697, 'learning_rate': 9.321470811001502e-06, 'epoch': 0.54} + 54%|█████▎ | 3094/5772 [1:34:40<4:28:43, 6.02s/it] {'loss': 0.4697, 'learning_rate': 9.321470811001502e-06, 'epoch': 0.54} + 54%|█████▎ | 3094/5772 [1:34:38<4:28:43, 6.02s/it] 54%|█████▎ | 3095/5772 [1:34:44<4:28:11, 6.01s/it] 54%|█████▎ | 3095/5772 [1:34:46<4:28:11, 6.01s/it] {'loss': 0.462, 'learning_rate': 9.315871860740782e-06, 'epoch': 0.54} + 54%|█████▎ | 3095/5772 [1:34:46<4:28:11, 6.01s/it] {'loss': 0.462, 'learning_rate': 9.315871860740782e-06, 'epoch': 0.54} + 54%|█████▎ | 3095/5772 [1:34:44<4:28:11, 6.01s/it] 54%|█████▎ | 3096/5772 [1:34:50<4:29:00, 6.03s/it] 54%|█████▎ | 3096/5772 [1:34:52<4:29:00, 6.03s/it] {'loss': 0.4703, 'learning_rate': 9.310273125942418e-06, 'epoch': 0.54} + 54%|█████▎ | 3096/5772 [1:34:52<4:29:00, 6.03s/it] {'loss': 0.4703, 'learning_rate': 9.310273125942418e-06, 'epoch': 0.54} + 54%|█████▎ | 3096/5772 [1:34:50<4:29:00, 6.03s/it] 54%|█████▎ | 3097/5772 [1:34:56<4:27:36, 6.00s/it] 54%|█████▎ | 3097/5772 [1:34:58<4:27:35, 6.00s/it] {'loss': 0.4592, 'learning_rate': 9.304674608369695e-06, 'epoch': 0.54} + 54%|█████▎ | 3097/5772 [1:34:58<4:27:35, 6.00s/it] {'loss': 0.4592, 'learning_rate': 9.304674608369695e-06, 'epoch': 0.54} + 54%|█████▎ | 3097/5772 [1:34:56<4:27:36, 6.00s/it] 54%|█████▎ | 3098/5772 [1:35:02<4:32:46, 6.12s/it] 54%|█████▎ | 3098/5772 [1:35:04<4:32:46, 6.12s/it] {'loss': 0.4773, 'learning_rate': 9.299076309785839e-06, 'epoch': 0.54} + 54%|█████▎ | 3098/5772 [1:35:04<4:32:46, 6.12s/it] {'loss': 0.4773, 'learning_rate': 9.299076309785839e-06, 'epoch': 0.54} + 54%|█████▎ | 3098/5772 [1:35:02<4:32:46, 6.12s/it] 54%|█████▎ | 3099/5772 [1:35:08<4:34:53, 6.17s/it] 54%|█████▎ | 3099/5772 [1:35:10<4:34:53, 6.17s/it] {'loss': 0.4705, 'learning_rate': 9.293478231954e-06, 'epoch': 0.54} + 54%|█████▎ | 3099/5772 [1:35:10<4:34:53, 6.17s/it] {'loss': 0.4705, 'learning_rate': 9.293478231954e-06, 'epoch': 0.54} + 54%|█████▎ | 3099/5772 [1:35:08<4:34:53, 6.17s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 54%|█████▎ | 3100/5772 [1:35:15<4:38:20, 6.25s/it]2 AutoResumeHook: Checking whether to suspend... +815 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 54%|█████▎ | 3100/5772 [1:35:17<4:38:19, 6.25s/it]71 5AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.476, 'learning_rate': 9.287880376637262e-06, 'epoch': 0.54} + 54%|█████▎ | 3100/5772 [1:35:17<4:38:19, 6.25s/it] {'loss': 0.476, 'learning_rate': 9.287880376637262e-06, 'epoch': 0.54} + 54%|█████▎ | 3100/5772 [1:35:15<4:38:20, 6.25s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 54%|█████▎ | 3101/5772 [1:35:39<8:32:49, 11.52s/it] 54%|█████▎ | 3101/5772 [1:35:40<8:32:49, 11.52s/it] {'loss': 0.4617, 'learning_rate': 9.282282745598646e-06, 'epoch': 0.54} + 54%|█████▎ | 3101/5772 [1:35:40<8:32:49, 11.52s/it] {'loss': 0.4617, 'learning_rate': 9.282282745598646e-06, 'epoch': 0.54} + 54%|█████▎ | 3101/5772 [1:35:39<8:32:49, 11.52s/it] 54%|█████▎ | 3102/5772 [1:35:45<7:20:30, 9.90s/it] 54%|█████▎ | 3102/5772 [1:35:47<7:20:30, 9.90s/it] {'loss': 0.4752, 'learning_rate': 9.276685340601085e-06, 'epoch': 0.54} + 54%|█████▎ | 3102/5772 [1:35:47<7:20:30, 9.90s/it] {'loss': 0.4752, 'learning_rate': 9.276685340601085e-06, 'epoch': 0.54} + 54%|█████▎ | 3102/5772 [1:35:45<7:20:30, 9.90s/it] 54%|█████▍ | 3103/5772 [1:35:51<6:30:07, 8.77s/it] 54%|█████▍ | 3103/5772 [1:35:53<6:30:07, 8.77s/it] {'loss': 0.463, 'learning_rate': 9.271088163407455e-06, 'epoch': 0.54} + 54%|█████▍ | 3103/5772 [1:35:53<6:30:07, 8.77s/it] {'loss': 0.463, 'learning_rate': 9.271088163407455e-06, 'epoch': 0.54} + 54%|█████▍ | 3103/5772 [1:35:51<6:30:07, 8.77s/it] 54%|█████▍ | 3104/5772 [1:35:57<5:55:43, 8.00s/it] 54%|█████▍ | 3104/5772 [1:35:59<5:55:43, 8.00s/it] {'loss': 0.4675, 'learning_rate': 9.265491215780556e-06, 'epoch': 0.54} + 54%|█████▍ | 3104/5772 [1:35:59<5:55:43, 8.00s/it] {'loss': 0.4675, 'learning_rate': 9.265491215780556e-06, 'epoch': 0.54} + 54%|█████▍ | 3104/5772 [1:35:57<5:55:43, 8.00s/it] 54%|█████▍ | 3105/5772 [1:36:03<5:30:07, 7.43s/it] 54%|█████▍ | 3105/5772 [1:36:05<5:30:07, 7.43s/it] {'loss': 0.4748, 'learning_rate': 9.259894499483116e-06, 'epoch': 0.54} + 54%|█████▍ | 3105/5772 [1:36:05<5:30:07, 7.43s/it] {'loss': 0.4748, 'learning_rate': 9.259894499483116e-06, 'epoch': 0.54} + 54%|█████▍ | 3105/5772 [1:36:03<5:30:07, 7.43s/it] 54%|█████▍ | 3106/5772 [1:36:09<5:13:45, 7.06s/it] 54%|█████▍ | 3106/5772 [1:36:11<5:13:45, 7.06s/it] {'loss': 0.4773, 'learning_rate': 9.254298016277785e-06, 'epoch': 0.54} + 54%|█████▍ | 3106/5772 [1:36:11<5:13:45, 7.06s/it] {'loss': 0.4773, 'learning_rate': 9.254298016277785e-06, 'epoch': 0.54} + 54%|█████▍ | 3106/5772 [1:36:09<5:13:45, 7.06s/it] 54%|█████▍ | 3107/5772 [1:36:15<5:00:43, 6.77s/it] 54%|█████▍ | 3107/5772 [1:36:17<5:00:43, 6.77s/it] {'loss': 0.463, 'learning_rate': 9.248701767927146e-06, 'epoch': 0.54} + 54%|█████▍ | 3107/5772 [1:36:17<5:00:43, 6.77s/it] {'loss': 0.463, 'learning_rate': 9.248701767927146e-06, 'epoch': 0.54} + 54%|█████▍ | 3107/5772 [1:36:15<5:00:43, 6.77s/it] 54%|█████▍ | 3108/5772 [1:36:22<4:54:13, 6.63s/it] 54%|█████▍ | 3108/5772 [1:36:24<4:54:13, 6.63s/it] {'loss': 0.4689, 'learning_rate': 9.243105756193714e-06, 'epoch': 0.54} + {'loss': 0.4689, 'learning_rate': 9.243105756193714e-06, 'epoch': 0.54} 54%|█████▍ | 3108/5772 [1:36:24<4:54:13, 6.63s/it] + 54%|█████▍ | 3108/5772 [1:36:22<4:54:13, 6.63s/it] 54%|█████▍ | 3109/5772 [1:36:28<4:54:24, 6.63s/it] 54%|█████▍ | 3109/5772 [1:36:30<4:54:24, 6.63s/it] {'loss': 0.4777, 'learning_rate': 9.23750998283991e-06, 'epoch': 0.54} + 54%|█████▍ | 3109/5772 [1:36:30<4:54:24, 6.63s/it] {'loss': 0.4777, 'learning_rate': 9.23750998283991e-06, 'epoch': 0.54} + 54%|█████▍ | 3109/5772 [1:36:28<4:54:24, 6.63s/it] 54%|█████▍ | 3110/5772 [1:36:34<4:46:39, 6.46s/it] 54%|█████▍ | 3110/5772 [1:36:36<4:46:39, 6.46s/it] {'loss': 0.4587, 'learning_rate': 9.231914449628103e-06, 'epoch': 0.54} + 54%|█████▍ | 3110/5772 [1:36:36<4:46:39, 6.46s/it] {'loss': 0.4587, 'learning_rate': 9.231914449628103e-06, 'epoch': 0.54} + 54%|█████▍ | 3110/5772 [1:36:34<4:46:39, 6.46s/it] 54%|█████▍ | 3111/5772 [1:36:41<4:43:08, 6.38s/it] 54%|█████▍ | 3111/5772 [1:36:43<4:43:08, 6.38s/it] {'loss': 0.4722, 'learning_rate': 9.226319158320565e-06, 'epoch': 0.54} + 54%|█████▍ | 3111/5772 [1:36:43<4:43:08, 6.38s/it] {'loss': 0.4722, 'learning_rate': 9.226319158320565e-06, 'epoch': 0.54} + 54%|█████▍ | 3111/5772 [1:36:41<4:43:08, 6.38s/it] 54%|█████▍ | 3112/5772 [1:36:46<4:36:00, 6.23s/it] 54%|█████▍ | 3112/5772 [1:36:48<4:36:00, 6.23s/it] {'loss': 0.4691, 'learning_rate': 9.22072411067951e-06, 'epoch': 0.54} + 54%|█████▍ | 3112/5772 [1:36:48<4:36:00, 6.23s/it] {'loss': 0.4691, 'learning_rate': 9.22072411067951e-06, 'epoch': 0.54} + 54%|█████▍ | 3112/5772 [1:36:46<4:36:00, 6.23s/it] 54%|█████▍ | 3113/5772 [1:36:52<4:31:20, 6.12s/it] 54%|█████▍ | 3113/5772 [1:36:54<4:31:20, 6.12s/it] {'loss': 0.4654, 'learning_rate': 9.215129308467062e-06, 'epoch': 0.54} + 54%|█████▍ | 3113/5772 [1:36:54<4:31:20, 6.12s/it] {'loss': 0.4654, 'learning_rate': 9.215129308467062e-06, 'epoch': 0.54} + 54%|█████▍ | 3113/5772 [1:36:52<4:31:20, 6.12s/it] 54%|█████▍ | 3114/5772 [1:36:58<4:29:16, 6.08s/it] 54%|█████▍ | 3114/5772 [1:37:00<4:29:16, 6.08s/it] {'loss': 0.4789, 'learning_rate': 9.20953475344527e-06, 'epoch': 0.54} + 54%|█████▍ | 3114/5772 [1:37:00<4:29:16, 6.08s/it] {'loss': 0.4789, 'learning_rate': 9.20953475344527e-06, 'epoch': 0.54} + 54%|█████▍ | 3114/5772 [1:36:58<4:29:16, 6.08s/it] 54%|█████▍ | 3115/5772 [1:37:04<4:28:32, 6.06s/it] 54%|█████▍ | 3115/5772 [1:37:06<4:28:32, 6.06s/it] {'loss': 0.4623, 'learning_rate': 9.20394044737612e-06, 'epoch': 0.54} + 54%|█████▍ | 3115/5772 [1:37:06<4:28:32, 6.06s/it] {'loss': 0.4623, 'learning_rate': 9.20394044737612e-06, 'epoch': 0.54} + 54%|█████▍ | 3115/5772 [1:37:04<4:28:32, 6.06s/it] 54%|█████▍ | 3116/5772 [1:37:10<4:26:13, 6.01s/it] 54%|█████▍ | 3116/5772 [1:37:12<4:26:13, 6.01s/it] {'loss': 0.469, 'learning_rate': 9.198346392021494e-06, 'epoch': 0.54} + 54%|█████▍ | 3116/5772 [1:37:12<4:26:13, 6.01s/it] {'loss': 0.469, 'learning_rate': 9.198346392021494e-06, 'epoch': 0.54} + 54%|█████▍ | 3116/5772 [1:37:10<4:26:13, 6.01s/it] 54%|█████▍ | 3117/5772 [1:37:16<4:27:57, 6.06s/it] 54%|█████▍ | 3117/5772 [1:37:18<4:27:57, 6.06s/it] {'loss': 0.4741, 'learning_rate': 9.192752589143219e-06, 'epoch': 0.54} + 54%|█████▍ | 3117/5772 [1:37:18<4:27:57, 6.06s/it] {'loss': 0.4741, 'learning_rate': 9.192752589143219e-06, 'epoch': 0.54} + 54%|█████▍ | 3117/5772 [1:37:16<4:27:57, 6.06s/it] 54%|█████▍ | 3118/5772 [1:37:22<4:28:14, 6.06s/it] 54%|█████▍ | 3118/5772 [1:37:24<4:28:14, 6.06s/it] {'loss': 0.4653, 'learning_rate': 9.187159040503025e-06, 'epoch': 0.54} + 54%|█████▍ | 3118/5772 [1:37:24<4:28:14, 6.06s/it] {'loss': 0.4653, 'learning_rate': 9.187159040503025e-06, 'epoch': 0.54} + 54%|█████▍ | 3118/5772 [1:37:22<4:28:14, 6.06s/it] 54%|█████▍ | 3119/5772 [1:37:28<4:26:20, 6.02s/it] 54%|█████▍ | 3119/5772 [1:37:30<4:26:20, 6.02s/it] {'loss': 0.4558, 'learning_rate': 9.181565747862575e-06, 'epoch': 0.54} + 54%|█████▍ | 3119/5772 [1:37:30<4:26:20, 6.02s/it] {'loss': 0.4558, 'learning_rate': 9.181565747862575e-06, 'epoch': 0.54} + 54%|█████▍ | 3119/5772 [1:37:28<4:26:20, 6.02s/it] 54%|█████▍ | 3120/5772 [1:37:34<4:23:39, 5.97s/it] 54%|█████▍ | 3120/5772 [1:37:36<4:23:39, 5.97s/it] {'loss': 0.4667, 'learning_rate': 9.175972712983439e-06, 'epoch': 0.54} + 54%|█████▍ | 3120/5772 [1:37:36<4:23:39, 5.97s/it] {'loss': 0.4667, 'learning_rate': 9.175972712983439e-06, 'epoch': 0.54} + 54%|█████▍ | 3120/5772 [1:37:34<4:23:39, 5.97s/it] 54%|█████▍ | 3121/5772 [1:37:40<4:25:01, 6.00s/it] 54%|█████▍ | 3121/5772 [1:37:42<4:25:00, 6.00s/it] {'loss': 0.4752, 'learning_rate': 9.170379937627116e-06, 'epoch': 0.54} + 54%|█████▍ | 3121/5772 [1:37:42<4:25:00, 6.00s/it] {'loss': 0.4752, 'learning_rate': 9.170379937627116e-06, 'epoch': 0.54} + 54%|█████▍ | 3121/5772 [1:37:40<4:25:01, 6.00s/it] 54%|█████▍ | 3122/5772 [1:37:46<4:25:57, 6.02s/it] 54%|█████▍ | 3122/5772 [1:37:48<4:25:57, 6.02s/it] {'loss': 0.4638, 'learning_rate': 9.16478742355502e-06, 'epoch': 0.54} + 54%|█████▍ | 3122/5772 [1:37:48<4:25:57, 6.02s/it] {'loss': 0.4638, 'learning_rate': 9.16478742355502e-06, 'epoch': 0.54} + 54%|█████▍ | 3122/5772 [1:37:46<4:25:57, 6.02s/it] 54%|█████▍ | 3123/5772 [1:37:52<4:27:01, 6.05s/it] 54%|█████▍ | 3123/5772 [1:37:54<4:27:01, 6.05s/it] {'loss': 0.4571, 'learning_rate': 9.159195172528478e-06, 'epoch': 0.54} + 54%|█████▍ | 3123/5772 [1:37:54<4:27:01, 6.05s/it] {'loss': 0.4571, 'learning_rate': 9.159195172528478e-06, 'epoch': 0.54} + 54%|█████▍ | 3123/5772 [1:37:52<4:27:01, 6.05s/it] 54%|█████▍ | 3124/5772 [1:37:58<4:25:22, 6.01s/it] 54%|█████▍ | 3124/5772 [1:38:00<4:25:22, 6.01s/it] {'loss': 0.4727, 'learning_rate': 9.153603186308747e-06, 'epoch': 0.54} + 54%|█████▍ | 3124/5772 [1:38:00<4:25:22, 6.01s/it] {'loss': 0.4727, 'learning_rate': 9.153603186308747e-06, 'epoch': 0.54} + 54%|█████▍ | 3124/5772 [1:37:58<4:25:22, 6.01s/it] 54%|█████▍ | 3125/5772 [1:38:05<4:32:17, 6.17s/it] 54%|█████▍ | 3125/5772 [1:38:07<4:32:17, 6.17s/it] {'loss': 0.465, 'learning_rate': 9.148011466656981e-06, 'epoch': 0.54} + 54%|█████▍ | 3125/5772 [1:38:07<4:32:17, 6.17s/it] {'loss': 0.465, 'learning_rate': 9.148011466656981e-06, 'epoch': 0.54} + 54%|█████▍ | 3125/5772 [1:38:05<4:32:17, 6.17s/it] 54%|█████▍ | 3126/5772 [1:38:11<4:30:53, 6.14s/it] 54%|█████▍ | 3126/5772 [1:38:13<4:30:54, 6.14s/it] {'loss': 0.4761, 'learning_rate': 9.14242001533427e-06, 'epoch': 0.54} + 54%|█████▍ | 3126/5772 [1:38:13<4:30:54, 6.14s/it] {'loss': 0.4761, 'learning_rate': 9.14242001533427e-06, 'epoch': 0.54} + 54%|█████▍ | 3126/5772 [1:38:11<4:30:53, 6.14s/it] 54%|█████▍ | 3127/5772 [1:38:17<4:30:53, 6.15s/it] 54%|█████▍ | 3127/5772 [1:38:19<4:30:53, 6.15s/it] {'loss': 0.4711, 'learning_rate': 9.136828834101606e-06, 'epoch': 0.54} + 54%|█████▍ | 3127/5772 [1:38:19<4:30:53, 6.15s/it] {'loss': 0.4711, 'learning_rate': 9.136828834101606e-06, 'epoch': 0.54} + 54%|█████▍ | 3127/5772 [1:38:17<4:30:53, 6.15s/it] 54%|█████▍ | 3128/5772 [1:38:23<4:29:38, 6.12s/it] 54%|█████▍ | 3128/5772 [1:38:25<4:29:38, 6.12s/it] {'loss': 0.4669, 'learning_rate': 9.1312379247199e-06, 'epoch': 0.54} + 54%|█████▍ | 3128/5772 [1:38:25<4:29:38, 6.12s/it] {'loss': 0.4669, 'learning_rate': 9.1312379247199e-06, 'epoch': 0.54} + 54%|█████▍ | 3128/5772 [1:38:23<4:29:38, 6.12s/it] 54%|█████▍ | 3129/5772 [1:38:29<4:27:16, 6.07s/it] 54%|█████▍ | 3129/5772 [1:38:31<4:27:16, 6.07s/it] {'loss': 0.4521, 'learning_rate': 9.125647288949982e-06, 'epoch': 0.54} + 54%|█████▍ | 3129/5772 [1:38:31<4:27:16, 6.07s/it] {'loss': 0.4521, 'learning_rate': 9.125647288949982e-06, 'epoch': 0.54} + 54%|█████▍ | 3129/5772 [1:38:29<4:27:16, 6.07s/it] 54%|█████▍ | 3130/5772 [1:38:35<4:30:28, 6.14s/it] 54%|█████▍ | 3130/5772 [1:38:37<4:30:28, 6.14s/it] {'loss': 0.4658, 'learning_rate': 9.120056928552586e-06, 'epoch': 0.54} + 54%|█████▍ | 3130/5772 [1:38:37<4:30:28, 6.14s/it] {'loss': 0.4658, 'learning_rate': 9.120056928552586e-06, 'epoch': 0.54} + 54%|█████▍ | 3130/5772 [1:38:35<4:30:28, 6.14s/it] 54%|█████▍ | 3131/5772 [1:38:42<4:32:43, 6.20s/it] 54%|█████▍ | 3131/5772 [1:38:44<4:32:43, 6.20s/it] {'loss': 0.463, 'learning_rate': 9.114466845288372e-06, 'epoch': 0.54} + 54%|█████▍ | 3131/5772 [1:38:44<4:32:43, 6.20s/it] {'loss': 0.463, 'learning_rate': 9.114466845288372e-06, 'epoch': 0.54} + 54%|█████▍ | 3131/5772 [1:38:42<4:32:43, 6.20s/it] 54%|█████▍ | 3132/5772 [1:38:48<4:28:55, 6.11s/it] 54%|█████▍ | 3132/5772 [1:38:50<4:28:55, 6.11s/it] {'loss': 0.4682, 'learning_rate': 9.108877040917896e-06, 'epoch': 0.54} + 54%|█████▍ | 3132/5772 [1:38:50<4:28:55, 6.11s/it] {'loss': 0.4682, 'learning_rate': 9.108877040917896e-06, 'epoch': 0.54} + 54%|█████▍ | 3132/5772 [1:38:48<4:28:55, 6.11s/it] 54%|█████▍ | 3133/5772 [1:38:54<4:29:35, 6.13s/it] 54%|█████▍ | 3133/5772 [1:38:56<4:29:35, 6.13s/it] {'loss': 0.4614, 'learning_rate': 9.103287517201647e-06, 'epoch': 0.54} + 54%|█████▍ | 3133/5772 [1:38:56<4:29:35, 6.13s/it] {'loss': 0.4614, 'learning_rate': 9.103287517201647e-06, 'epoch': 0.54} + 54%|█████▍ | 3133/5772 [1:38:54<4:29:35, 6.13s/it] 54%|█████▍ | 3134/5772 [1:39:00<4:29:01, 6.12s/it] 54%|█████▍ | 3134/5772 [1:39:02<4:29:01, 6.12s/it] {'loss': 0.4742, 'learning_rate': 9.097698275900004e-06, 'epoch': 0.54} + 54%|█████▍ | 3134/5772 [1:39:02<4:29:01, 6.12s/it] {'loss': 0.4742, 'learning_rate': 9.097698275900004e-06, 'epoch': 0.54} + 54%|█████▍ | 3134/5772 [1:39:00<4:29:01, 6.12s/it] 54%|█████▍ | 3135/5772 [1:39:07<4:35:35, 6.27s/it] 54%|█████▍ | 3135/5772 [1:39:09<4:35:35, 6.27s/it] {'loss': 0.4581, 'learning_rate': 9.092109318773274e-06, 'epoch': 0.54} + {'loss': 0.4581, 'learning_rate': 9.092109318773274e-06, 'epoch': 0.54} 54%|█████▍ | 3135/5772 [1:39:09<4:35:35, 6.27s/it] + 54%|█████▍ | 3135/5772 [1:39:07<4:35:35, 6.27s/it] 54%|█████▍ | 3136/5772 [1:39:13<4:40:00, 6.37s/it] 54%|█████▍ | 3136/5772 [1:39:15<4:40:00, 6.37s/it] {'loss': 0.4641, 'learning_rate': 9.086520647581667e-06, 'epoch': 0.54} + 54%|█████▍ | 3136/5772 [1:39:15<4:40:00, 6.37s/it] {'loss': 0.4641, 'learning_rate': 9.086520647581667e-06, 'epoch': 0.54} + 54%|█████▍ | 3136/5772 [1:39:13<4:40:00, 6.37s/it] 54%|█████▍ | 3137/5772 [1:39:19<4:36:49, 6.30s/it] 54%|█████▍ | 3137/5772 [1:39:21<4:36:49, 6.30s/it] {'loss': 0.451, 'learning_rate': 9.080932264085302e-06, 'epoch': 0.54} + 54%|█████▍ | 3137/5772 [1:39:21<4:36:49, 6.30s/it] {'loss': 0.451, 'learning_rate': 9.080932264085302e-06, 'epoch': 0.54} + 54%|█████▍ | 3137/5772 [1:39:19<4:36:49, 6.30s/it] 54%|█████▍ | 3138/5772 [1:39:25<4:33:23, 6.23s/it] 54%|█████▍ | 3138/5772 [1:39:27<4:33:23, 6.23s/it] {'loss': 0.4747, 'learning_rate': 9.075344170044212e-06, 'epoch': 0.54} + 54%|█████▍ | 3138/5772 [1:39:27<4:33:23, 6.23s/it] {'loss': 0.4747, 'learning_rate': 9.075344170044212e-06, 'epoch': 0.54} + 54%|█████▍ | 3138/5772 [1:39:25<4:33:23, 6.23s/it] 54%|█████▍ | 3139/5772 [1:39:31<4:30:34, 6.17s/it] 54%|█████▍ | 3139/5772 [1:39:33<4:30:34, 6.17s/it] {'loss': 0.4549, 'learning_rate': 9.069756367218333e-06, 'epoch': 0.54} + 54%|█████▍ | 3139/5772 [1:39:33<4:30:34, 6.17s/it] {'loss': 0.4549, 'learning_rate': 9.069756367218333e-06, 'epoch': 0.54} + 54%|█████▍ | 3139/5772 [1:39:31<4:30:34, 6.17s/it] 54%|█████▍ | 3140/5772 [1:39:38<4:29:11, 6.14s/it] 54%|█████▍ | 3140/5772 [1:39:39<4:29:10, 6.14s/it] {'loss': 0.476, 'learning_rate': 9.064168857367514e-06, 'epoch': 0.54} + 54%|█████▍ | 3140/5772 [1:39:39<4:29:10, 6.14s/it] {'loss': 0.476, 'learning_rate': 9.064168857367514e-06, 'epoch': 0.54} + 54%|█████▍ | 3140/5772 [1:39:38<4:29:11, 6.14s/it] 54%|█████▍ | 3141/5772 [1:39:43<4:26:24, 6.08s/it] 54%|█████▍ | 3141/5772 [1:39:45<4:26:24, 6.08s/it] {'loss': 0.4573, 'learning_rate': 9.05858164225151e-06, 'epoch': 0.54} + 54%|█████▍ | 3141/5772 [1:39:45<4:26:24, 6.08s/it] {'loss': 0.4573, 'learning_rate': 9.05858164225151e-06, 'epoch': 0.54} + 54%|█████▍ | 3141/5772 [1:39:43<4:26:24, 6.08s/it] 54%|█████▍ | 3142/5772 [1:39:50<4:26:16, 6.07s/it] 54%|█████▍ | 3142/5772 [1:39:51<4:26:16, 6.07s/it] {'loss': 0.4651, 'learning_rate': 9.052994723629982e-06, 'epoch': 0.54} + 54%|█████▍ | 3142/5772 [1:39:51<4:26:16, 6.07s/it] {'loss': 0.4651, 'learning_rate': 9.052994723629982e-06, 'epoch': 0.54} + 54%|█████▍ | 3142/5772 [1:39:50<4:26:16, 6.07s/it] 54%|█████▍ | 3143/5772 [1:39:56<4:28:27, 6.13s/it] 54%|█████▍ | 3143/5772 [1:39:58<4:28:28, 6.13s/it] {'loss': 0.4638, 'learning_rate': 9.047408103262503e-06, 'epoch': 0.54} + 54%|█████▍ | 3143/5772 [1:39:58<4:28:28, 6.13s/it] {'loss': 0.4638, 'learning_rate': 9.047408103262503e-06, 'epoch': 0.54} + 54%|█████▍ | 3143/5772 [1:39:56<4:28:27, 6.13s/it] 54%|█████▍ | 3144/5772 [1:40:02<4:25:13, 6.06s/it] 54%|█████▍ | 3144/5772 [1:40:04<4:25:13, 6.06s/it] {'loss': 0.4693, 'learning_rate': 9.041821782908544e-06, 'epoch': 0.54} + 54%|█████▍ | 3144/5772 [1:40:04<4:25:13, 6.06s/it] {'loss': 0.4693, 'learning_rate': 9.041821782908544e-06, 'epoch': 0.54} + 54%|█████▍ | 3144/5772 [1:40:02<4:25:13, 6.06s/it] 54%|█████▍ | 3145/5772 [1:40:08<4:31:28, 6.20s/it] 54%|█████▍ | 3145/5772 [1:40:10<4:31:28, 6.20s/it] {'loss': 0.4805, 'learning_rate': 9.03623576432749e-06, 'epoch': 0.54} + 54%|█████▍ | 3145/5772 [1:40:10<4:31:28, 6.20s/it] {'loss': 0.4805, 'learning_rate': 9.03623576432749e-06, 'epoch': 0.54} + 54%|█████▍ | 3145/5772 [1:40:08<4:31:28, 6.20s/it] 55%|█████▍ | 3146/5772 [1:40:15<4:35:09, 6.29s/it] 55%|█████▍ | 3146/5772 [1:40:17<4:35:09, 6.29s/it] {'loss': 0.4691, 'learning_rate': 9.03065004927862e-06, 'epoch': 0.54} + 55%|█████▍ | 3146/5772 [1:40:17<4:35:09, 6.29s/it] {'loss': 0.4691, 'learning_rate': 9.03065004927862e-06, 'epoch': 0.54} + 55%|█████▍ | 3146/5772 [1:40:15<4:35:09, 6.29s/it] 55%|█████▍ | 3147/5772 [1:40:20<4:28:49, 6.14s/it] 55%|█████▍ | 3147/5772 [1:40:22<4:28:49, 6.14s/it] {'loss': 0.4667, 'learning_rate': 9.02506463952113e-06, 'epoch': 0.55} + 55%|█████▍ | 3147/5772 [1:40:22<4:28:49, 6.14s/it] {'loss': 0.4667, 'learning_rate': 9.02506463952113e-06, 'epoch': 0.55} + 55%|█████▍ | 3147/5772 [1:40:20<4:28:49, 6.14s/it] 55%|█████▍ | 3148/5772 [1:40:26<4:26:33, 6.10s/it] 55%|█████▍ | 3148/5772 [1:40:28<4:26:33, 6.10s/it] {'loss': 0.4706, 'learning_rate': 9.019479536814108e-06, 'epoch': 0.55} + 55%|█████▍ | 3148/5772 [1:40:28<4:26:33, 6.10s/it] {'loss': 0.4706, 'learning_rate': 9.019479536814108e-06, 'epoch': 0.55} + 55%|█████▍ | 3148/5772 [1:40:26<4:26:33, 6.10s/it] 55%|█████▍ | 3149/5772 [1:40:32<4:25:05, 6.06s/it] 55%|█████▍ | 3149/5772 [1:40:34<4:25:05, 6.06s/it] {'loss': 0.455, 'learning_rate': 9.013894742916554e-06, 'epoch': 0.55} + 55%|█████▍ | 3149/5772 [1:40:34<4:25:05, 6.06s/it] {'loss': 0.455, 'learning_rate': 9.013894742916554e-06, 'epoch': 0.55} + 55%|█████▍ | 3149/5772 [1:40:32<4:25:05, 6.06s/it]3 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +5 0AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 55%|█████▍ | 3150/5772 [1:40:39<4:28:49, 6.15s/it]14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1112 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 55%|█████▍ | 3150/5772 [1:40:41<4:28:49, 6.15s/it] {'loss': 0.4606, 'learning_rate': 9.008310259587374e-06, 'epoch': 0.55} + 55%|█████▍ | 3150/5772 [1:40:41<4:28:49, 6.15s/it] {'loss': 0.4606, 'learning_rate': 9.008310259587374e-06, 'epoch': 0.55} + 55%|█████▍ | 3150/5772 [1:40:39<4:28:49, 6.15s/it] 55%|█████▍ | 3151/5772 [1:40:45<4:28:24, 6.14s/it] 55%|█████▍ | 3151/5772 [1:40:47<4:28:24, 6.14s/it] {'loss': 0.4682, 'learning_rate': 9.002726088585356e-06, 'epoch': 0.55} + 55%|█████▍ | 3151/5772 [1:40:47<4:28:24, 6.14s/it] {'loss': 0.4682, 'learning_rate': 9.002726088585356e-06, 'epoch': 0.55} + 55%|█████▍ | 3151/5772 [1:40:45<4:28:24, 6.14s/it] 55%|█████▍ | 3152/5772 [1:40:51<4:25:58, 6.09s/it] 55%|█████▍ | 3152/5772 [1:40:53<4:25:58, 6.09s/it] {'loss': 0.4523, 'learning_rate': 8.997142231669217e-06, 'epoch': 0.55} + 55%|█████▍ | 3152/5772 [1:40:53<4:25:58, 6.09s/it] {'loss': 0.4523, 'learning_rate': 8.997142231669217e-06, 'epoch': 0.55} + 55%|█████▍ | 3152/5772 [1:40:51<4:25:58, 6.09s/it] 55%|█████▍ | 3153/5772 [1:40:57<4:29:42, 6.18s/it] 55%|█████▍ | 3153/5772 [1:40:59<4:29:42, 6.18s/it] {'loss': 0.473, 'learning_rate': 8.991558690597553e-06, 'epoch': 0.55} + 55%|█████▍ | 3153/5772 [1:40:59<4:29:42, 6.18s/it] {'loss': 0.473, 'learning_rate': 8.991558690597553e-06, 'epoch': 0.55} + 55%|█████▍ | 3153/5772 [1:40:57<4:29:42, 6.18s/it] 55%|█████▍ | 3154/5772 [1:41:03<4:26:56, 6.12s/it] 55%|█████▍ | 3154/5772 [1:41:05<4:26:56, 6.12s/it] {'loss': 0.4886, 'learning_rate': 8.985975467128875e-06, 'epoch': 0.55} + 55%|█████▍ | 3154/5772 [1:41:05<4:26:56, 6.12s/it] {'loss': 0.4886, 'learning_rate': 8.985975467128875e-06, 'epoch': 0.55} + 55%|█████▍ | 3154/5772 [1:41:03<4:26:56, 6.12s/it] 55%|█████▍ | 3155/5772 [1:41:09<4:25:08, 6.08s/it] 55%|█████▍ | 3155/5772 [1:41:11<4:25:08, 6.08s/it] {'loss': 0.4663, 'learning_rate': 8.980392563021585e-06, 'epoch': 0.55} + 55%|█████▍ | 3155/5772 [1:41:11<4:25:08, 6.08s/it] {'loss': 0.4663, 'learning_rate': 8.980392563021585e-06, 'epoch': 0.55} + 55%|█████▍ | 3155/5772 [1:41:09<4:25:08, 6.08s/it] 55%|█████▍ | 3156/5772 [1:41:16<4:27:41, 6.14s/it] 55%|█████▍ | 3156/5772 [1:41:18<4:27:40, 6.14s/it] {'loss': 0.4706, 'learning_rate': 8.974809980033987e-06, 'epoch': 0.55} + 55%|█████▍ | 3156/5772 [1:41:18<4:27:40, 6.14s/it] {'loss': 0.4706, 'learning_rate': 8.974809980033987e-06, 'epoch': 0.55} + 55%|█████▍ | 3156/5772 [1:41:16<4:27:41, 6.14s/it] 55%|█████▍ | 3157/5772 [1:41:21<4:24:02, 6.06s/it] 55%|█████▍ | 3157/5772 [1:41:23<4:24:02, 6.06s/it] {'loss': 0.4569, 'learning_rate': 8.969227719924289e-06, 'epoch': 0.55} + 55%|█████▍ | 3157/5772 [1:41:23<4:24:02, 6.06s/it] {'loss': 0.4569, 'learning_rate': 8.969227719924289e-06, 'epoch': 0.55} + 55%|█████▍ | 3157/5772 [1:41:21<4:24:02, 6.06s/it] 55%|█████▍ | 3158/5772 [1:41:28<4:27:44, 6.15s/it] 55%|█████▍ | 3158/5772 [1:41:30<4:27:44, 6.15s/it] {'loss': 0.4681, 'learning_rate': 8.963645784450584e-06, 'epoch': 0.55} + 55%|█████▍ | 3158/5772 [1:41:30<4:27:44, 6.15s/it] {'loss': 0.4681, 'learning_rate': 8.963645784450584e-06, 'epoch': 0.55} + 55%|█████▍ | 3158/5772 [1:41:28<4:27:44, 6.15s/it] 55%|█████▍ | 3159/5772 [1:41:34<4:24:20, 6.07s/it] 55%|█████▍ | 3159/5772 [1:41:36<4:24:20, 6.07s/it] {'loss': 0.4673, 'learning_rate': 8.958064175370884e-06, 'epoch': 0.55} + 55%|█████▍ | 3159/5772 [1:41:36<4:24:20, 6.07s/it] {'loss': 0.4673, 'learning_rate': 8.958064175370884e-06, 'epoch': 0.55} + 55%|█████▍ | 3159/5772 [1:41:34<4:24:20, 6.07s/it] 55%|█████▍ | 3160/5772 [1:41:40<4:31:00, 6.23s/it] 55%|█████▍ | 3160/5772 [1:41:42<4:31:00, 6.23s/it] {'loss': 0.4689, 'learning_rate': 8.95248289444307e-06, 'epoch': 0.55} + 55%|█████▍ | 3160/5772 [1:41:42<4:31:00, 6.23s/it] {'loss': 0.4689, 'learning_rate': 8.95248289444307e-06, 'epoch': 0.55} + 55%|█████▍ | 3160/5772 [1:41:40<4:31:00, 6.23s/it] 55%|█████▍ | 3161/5772 [1:41:46<4:30:56, 6.23s/it] 55%|█████▍ | 3161/5772 [1:41:48<4:30:56, 6.23s/it] {'loss': 0.4577, 'learning_rate': 8.946901943424951e-06, 'epoch': 0.55} + 55%|█████▍ | 3161/5772 [1:41:48<4:30:56, 6.23s/it] {'loss': 0.4577, 'learning_rate': 8.946901943424951e-06, 'epoch': 0.55} + 55%|█████▍ | 3161/5772 [1:41:46<4:30:56, 6.23s/it] 55%|█████▍ | 3162/5772 [1:41:52<4:27:03, 6.14s/it] 55%|█████▍ | 3162/5772 [1:41:54<4:27:04, 6.14s/it] {'loss': 0.4649, 'learning_rate': 8.941321324074207e-06, 'epoch': 0.55} + 55%|█████▍ | 3162/5772 [1:41:54<4:27:04, 6.14s/it] {'loss': 0.4649, 'learning_rate': 8.941321324074207e-06, 'epoch': 0.55} + 55%|█████▍ | 3162/5772 [1:41:52<4:27:03, 6.14s/it] 55%|█████▍ | 3163/5772 [1:41:59<4:28:55, 6.18s/it] 55%|█████▍ | 3163/5772 [1:42:01<4:28:55, 6.18s/it] {'loss': 0.4548, 'learning_rate': 8.935741038148426e-06, 'epoch': 0.55} + 55%|█████▍ | 3163/5772 [1:42:01<4:28:55, 6.18s/it] {'loss': 0.4548, 'learning_rate': 8.935741038148426e-06, 'epoch': 0.55} + 55%|█████▍ | 3163/5772 [1:41:59<4:28:55, 6.18s/it] 55%|█████▍ | 3164/5772 [1:42:05<4:24:12, 6.08s/it] 55%|█████▍ | 3164/5772 [1:42:07<4:24:12, 6.08s/it] {'loss': 0.476, 'learning_rate': 8.930161087405089e-06, 'epoch': 0.55} + 55%|█████▍ | 3164/5772 [1:42:07<4:24:12, 6.08s/it] {'loss': 0.476, 'learning_rate': 8.930161087405089e-06, 'epoch': 0.55} + 55%|█████▍ | 3164/5772 [1:42:05<4:24:12, 6.08s/it] 55%|█████▍ | 3165/5772 [1:42:11<4:25:28, 6.11s/it] 55%|█████▍ | 3165/5772 [1:42:13<4:25:28, 6.11s/it] {'loss': 0.4596, 'learning_rate': 8.924581473601568e-06, 'epoch': 0.55} + 55%|█████▍ | 3165/5772 [1:42:13<4:25:28, 6.11s/it] {'loss': 0.4596, 'learning_rate': 8.924581473601568e-06, 'epoch': 0.55} + 55%|█████▍ | 3165/5772 [1:42:11<4:25:28, 6.11s/it] 55%|█████▍ | 3166/5772 [1:42:17<4:24:50, 6.10s/it] 55%|█████▍ | 3166/5772 [1:42:19<4:24:50, 6.10s/it] {'loss': 0.4701, 'learning_rate': 8.919002198495135e-06, 'epoch': 0.55} + 55%|█████▍ | 3166/5772 [1:42:19<4:24:50, 6.10s/it] {'loss': 0.4701, 'learning_rate': 8.919002198495135e-06, 'epoch': 0.55} + 55%|█████▍ | 3166/5772 [1:42:17<4:24:50, 6.10s/it] 55%|█████▍ | 3167/5772 [1:42:23<4:31:08, 6.24s/it] 55%|█████▍ | 3167/5772 [1:42:25<4:31:07, 6.24s/it] {'loss': 0.4719, 'learning_rate': 8.913423263842943e-06, 'epoch': 0.55} + 55%|█████▍ | 3167/5772 [1:42:25<4:31:07, 6.24s/it] {'loss': 0.4719, 'learning_rate': 8.913423263842943e-06, 'epoch': 0.55} + 55%|█████▍ | 3167/5772 [1:42:23<4:31:08, 6.24s/it] 55%|█████▍ | 3168/5772 [1:42:29<4:26:42, 6.15s/it] 55%|█████▍ | 3168/5772 [1:42:31<4:26:42, 6.15s/it] {'loss': 0.4662, 'learning_rate': 8.90784467140206e-06, 'epoch': 0.55} + 55%|█████▍ | 3168/5772 [1:42:31<4:26:42, 6.15s/it] {'loss': 0.4662, 'learning_rate': 8.90784467140206e-06, 'epoch': 0.55} + 55%|█████▍ | 3168/5772 [1:42:29<4:26:42, 6.15s/it] 55%|█████▍ | 3169/5772 [1:42:35<4:24:02, 6.09s/it] 55%|█████▍ | 3169/5772 [1:42:37<4:24:02, 6.09s/it] {'loss': 0.4529, 'learning_rate': 8.90226642292942e-06, 'epoch': 0.55} + 55%|█████▍ | 3169/5772 [1:42:37<4:24:02, 6.09s/it] {'loss': 0.4529, 'learning_rate': 8.90226642292942e-06, 'epoch': 0.55} + 55%|█████▍ | 3169/5772 [1:42:35<4:24:02, 6.09s/it] 55%|█████▍ | 3170/5772 [1:42:41<4:22:24, 6.05s/it] 55%|█████▍ | 3170/5772 [1:42:43<4:22:24, 6.05s/it] {'loss': 0.4792, 'learning_rate': 8.896688520181867e-06, 'epoch': 0.55} + 55%|█████▍ | 3170/5772 [1:42:43<4:22:24, 6.05s/it] {'loss': 0.4792, 'learning_rate': 8.896688520181867e-06, 'epoch': 0.55} + 55%|█████▍ | 3170/5772 [1:42:41<4:22:24, 6.05s/it] 55%|█████▍ | 3171/5772 [1:42:47<4:23:01, 6.07s/it] 55%|█████▍ | 3171/5772 [1:42:49<4:23:02, 6.07s/it] {'loss': 0.4617, 'learning_rate': 8.891110964916135e-06, 'epoch': 0.55} + 55%|█████▍ | 3171/5772 [1:42:49<4:23:02, 6.07s/it] {'loss': 0.4617, 'learning_rate': 8.891110964916135e-06, 'epoch': 0.55} + 55%|█████▍ | 3171/5772 [1:42:47<4:23:01, 6.07s/it] 55%|█████▍ | 3172/5772 [1:42:53<4:22:35, 6.06s/it] 55%|█████▍ | 3172/5772 [1:42:55<4:22:35, 6.06s/it] {'loss': 0.4782, 'learning_rate': 8.885533758888835e-06, 'epoch': 0.55} + 55%|█████▍ | 3172/5772 [1:42:55<4:22:35, 6.06s/it] {'loss': 0.4782, 'learning_rate': 8.885533758888835e-06, 'epoch': 0.55} + 55%|█████▍ | 3172/5772 [1:42:53<4:22:35, 6.06s/it] 55%|█████▍ | 3173/5772 [1:43:00<4:25:53, 6.14s/it] 55%|█████▍ | 3173/5772 [1:43:02<4:25:52, 6.14s/it] {'loss': 0.4593, 'learning_rate': 8.879956903856484e-06, 'epoch': 0.55} + 55%|█████▍ | 3173/5772 [1:43:02<4:25:52, 6.14s/it] {'loss': 0.4593, 'learning_rate': 8.879956903856484e-06, 'epoch': 0.55} + 55%|█████▍ | 3173/5772 [1:43:00<4:25:53, 6.14s/it] 55%|█████▍ | 3174/5772 [1:43:05<4:20:37, 6.02s/it] 55%|█████▍ | 3174/5772 [1:43:07<4:20:37, 6.02s/it] {'loss': 0.4656, 'learning_rate': 8.874380401575476e-06, 'epoch': 0.55} + 55%|█████▍ | 3174/5772 [1:43:07<4:20:37, 6.02s/it] {'loss': 0.4656, 'learning_rate': 8.874380401575476e-06, 'epoch': 0.55} + 55%|█████▍ | 3174/5772 [1:43:05<4:20:37, 6.02s/it] 55%|█████▌ | 3175/5772 [1:43:11<4:19:47, 6.00s/it] 55%|█████▌ | 3175/5772 [1:43:13<4:19:47, 6.00s/it] {'loss': 0.4525, 'learning_rate': 8.868804253802103e-06, 'epoch': 0.55} + 55%|█████▌ | 3175/5772 [1:43:13<4:19:47, 6.00s/it] {'loss': 0.4525, 'learning_rate': 8.868804253802103e-06, 'epoch': 0.55} + 55%|█████▌ | 3175/5772 [1:43:11<4:19:47, 6.00s/it] 55%|█████▌ | 3176/5772 [1:43:17<4:16:48, 5.94s/it] 55%|█████▌ | 3176/5772 [1:43:19<4:16:48, 5.94s/it] {'loss': 0.4732, 'learning_rate': 8.863228462292537e-06, 'epoch': 0.55} + 55%|█████▌ | 3176/5772 [1:43:19<4:16:48, 5.94s/it] {'loss': 0.4732, 'learning_rate': 8.863228462292537e-06, 'epoch': 0.55} + 55%|█████▌ | 3176/5772 [1:43:17<4:16:48, 5.94s/it] 55%|█████▌ | 3177/5772 [1:43:23<4:16:04, 5.92s/it] 55%|█████▌ | 3177/5772 [1:43:25<4:16:04, 5.92s/it] {'loss': 0.4595, 'learning_rate': 8.85765302880285e-06, 'epoch': 0.55} + 55%|█████▌ | 3177/5772 [1:43:25<4:16:04, 5.92s/it] {'loss': 0.4595, 'learning_rate': 8.85765302880285e-06, 'epoch': 0.55} + 55%|█████▌ | 3177/5772 [1:43:23<4:16:04, 5.92s/it] 55%|█████▌ | 3178/5772 [1:43:29<4:19:11, 6.00s/it] 55%|█████▌ | 3178/5772 [1:43:31<4:19:11, 6.00s/it] {'loss': 0.459, 'learning_rate': 8.852077955088993e-06, 'epoch': 0.55} + 55%|█████▌ | 3178/5772 [1:43:31<4:19:11, 6.00s/it] {'loss': 0.459, 'learning_rate': 8.852077955088993e-06, 'epoch': 0.55} + 55%|█████▌ | 3178/5772 [1:43:29<4:19:11, 6.00s/it] 55%|█████▌ | 3179/5772 [1:43:35<4:16:41, 5.94s/it] 55%|█████▌ | 3179/5772 [1:43:37<4:16:41, 5.94s/it] {'loss': 0.4555, 'learning_rate': 8.846503242906798e-06, 'epoch': 0.55} + 55%|█████▌ | 3179/5772 [1:43:37<4:16:41, 5.94s/it] {'loss': 0.4555, 'learning_rate': 8.846503242906798e-06, 'epoch': 0.55} + 55%|█████▌ | 3179/5772 [1:43:35<4:16:41, 5.94s/it] 55%|█████▌ | 3180/5772 [1:43:41<4:20:05, 6.02s/it] 55%|█████▌ | 3180/5772 [1:43:43<4:20:05, 6.02s/it] {'loss': 0.4713, 'learning_rate': 8.840928894011995e-06, 'epoch': 0.55} + 55%|█████▌ | 3180/5772 [1:43:43<4:20:05, 6.02s/it] {'loss': 0.4713, 'learning_rate': 8.840928894011995e-06, 'epoch': 0.55} + 55%|█████▌ | 3180/5772 [1:43:41<4:20:05, 6.02s/it] 55%|█████▌ | 3181/5772 [1:43:48<4:26:02, 6.16s/it] 55%|█████▌ | 3181/5772 [1:43:50<4:26:02, 6.16s/it] {'loss': 0.4572, 'learning_rate': 8.83535491016019e-06, 'epoch': 0.55} + 55%|█████▌ | 3181/5772 [1:43:50<4:26:02, 6.16s/it] {'loss': 0.4572, 'learning_rate': 8.83535491016019e-06, 'epoch': 0.55} + 55%|█████▌ | 3181/5772 [1:43:48<4:26:02, 6.16s/it] 55%|█████▌ | 3182/5772 [1:43:54<4:23:46, 6.11s/it] 55%|█████▌ | 3182/5772 [1:43:56<4:23:46, 6.11s/it] {'loss': 0.4592, 'learning_rate': 8.829781293106884e-06, 'epoch': 0.55} + 55%|█████▌ | 3182/5772 [1:43:56<4:23:46, 6.11s/it] {'loss': 0.4592, 'learning_rate': 8.829781293106884e-06, 'epoch': 0.55} + 55%|█████▌ | 3182/5772 [1:43:54<4:23:46, 6.11s/it] 55%|█████▌ | 3183/5772 [1:44:00<4:23:49, 6.11s/it] 55%|█████▌ | 3183/5772 [1:44:02<4:23:49, 6.11s/it] {'loss': 0.4783, 'learning_rate': 8.82420804460745e-06, 'epoch': 0.55} + 55%|█████▌ | 3183/5772 [1:44:02<4:23:49, 6.11s/it] {'loss': 0.4783, 'learning_rate': 8.82420804460745e-06, 'epoch': 0.55} + 55%|█████▌ | 3183/5772 [1:44:00<4:23:49, 6.11s/it] 55%|█████▌ | 3184/5772 [1:44:06<4:23:53, 6.12s/it] 55%|█████▌ | 3184/5772 [1:44:08<4:23:53, 6.12s/it] {'loss': 0.4731, 'learning_rate': 8.818635166417154e-06, 'epoch': 0.55} + 55%|█████▌ | 3184/5772 [1:44:08<4:23:53, 6.12s/it] {'loss': 0.4731, 'learning_rate': 8.818635166417154e-06, 'epoch': 0.55} + 55%|█████▌ | 3184/5772 [1:44:06<4:23:53, 6.12s/it] 55%|█████▌ | 3185/5772 [1:44:12<4:24:59, 6.15s/it] 55%|█████▌ | 3185/5772 [1:44:14<4:24:59, 6.15s/it] {'loss': 0.4696, 'learning_rate': 8.813062660291146e-06, 'epoch': 0.55} + 55%|█████▌ | 3185/5772 [1:44:14<4:24:59, 6.15s/it] {'loss': 0.4696, 'learning_rate': 8.813062660291146e-06, 'epoch': 0.55} + 55%|█████▌ | 3185/5772 [1:44:12<4:24:59, 6.15s/it] 55%|█████▌ | 3186/5772 [1:44:18<4:20:05, 6.03s/it] 55%|█████▌ | 3186/5772 [1:44:20<4:20:05, 6.03s/it] {'loss': 0.4599, 'learning_rate': 8.807490527984453e-06, 'epoch': 0.55} + 55%|█████▌ | 3186/5772 [1:44:20<4:20:05, 6.03s/it] {'loss': 0.4599, 'learning_rate': 8.807490527984453e-06, 'epoch': 0.55} + 55%|█████▌ | 3186/5772 [1:44:18<4:20:05, 6.03s/it] 55%|█████▌ | 3187/5772 [1:44:24<4:19:43, 6.03s/it] 55%|█████▌ | 3187/5772 [1:44:26<4:19:43, 6.03s/it] {'loss': 0.4681, 'learning_rate': 8.80191877125199e-06, 'epoch': 0.55} + 55%|█████▌ | 3187/5772 [1:44:26<4:19:43, 6.03s/it] {'loss': 0.4681, 'learning_rate': 8.80191877125199e-06, 'epoch': 0.55} + 55%|█████▌ | 3187/5772 [1:44:24<4:19:43, 6.03s/it] 55%|█████▌ | 3188/5772 [1:44:30<4:21:07, 6.06s/it] 55%|█████▌ | 3188/5772 [1:44:32<4:21:07, 6.06s/it] {'loss': 0.476, 'learning_rate': 8.796347391848547e-06, 'epoch': 0.55} + 55%|█████▌ | 3188/5772 [1:44:32<4:21:07, 6.06s/it] {'loss': 0.476, 'learning_rate': 8.796347391848547e-06, 'epoch': 0.55} + 55%|█████▌ | 3188/5772 [1:44:30<4:21:07, 6.06s/it] 55%|█████▌ | 3189/5772 [1:44:37<4:25:49, 6.17s/it] 55%|█████▌ | 3189/5772 [1:44:39<4:25:49, 6.17s/it] {'loss': 0.4761, 'learning_rate': 8.790776391528803e-06, 'epoch': 0.55} + 55%|█████▌ | 3189/5772 [1:44:39<4:25:49, 6.17s/it] {'loss': 0.4761, 'learning_rate': 8.790776391528803e-06, 'epoch': 0.55} + 55%|█████▌ | 3189/5772 [1:44:37<4:25:49, 6.17s/it] 55%|█████▌ | 3190/5772 [1:44:43<4:23:21, 6.12s/it] 55%|█████▌ | 3190/5772 [1:44:45<4:23:21, 6.12s/it] {'loss': 0.4663, 'learning_rate': 8.785205772047308e-06, 'epoch': 0.55} + 55%|█████▌ | 3190/5772 [1:44:45<4:23:21, 6.12s/it] {'loss': 0.4663, 'learning_rate': 8.785205772047308e-06, 'epoch': 0.55} + 55%|█████▌ | 3190/5772 [1:44:43<4:23:21, 6.12s/it] 55%|█████▌ | 3191/5772 [1:44:49<4:23:32, 6.13s/it] 55%|█████▌ | 3191/5772 [1:44:51<4:23:32, 6.13s/it] {'loss': 0.4719, 'learning_rate': 8.779635535158498e-06, 'epoch': 0.55} + 55%|█████▌ | 3191/5772 [1:44:51<4:23:32, 6.13s/it] {'loss': 0.4719, 'learning_rate': 8.779635535158498e-06, 'epoch': 0.55} + 55%|█████▌ | 3191/5772 [1:44:49<4:23:32, 6.13s/it] 55%|█████▌ | 3192/5772 [1:44:55<4:22:56, 6.12s/it] 55%|█████▌ | 3192/5772 [1:44:57<4:22:57, 6.12s/it] {'loss': 0.4611, 'learning_rate': 8.774065682616699e-06, 'epoch': 0.55} + 55%|█████▌ | 3192/5772 [1:44:57<4:22:57, 6.12s/it] {'loss': 0.4611, 'learning_rate': 8.774065682616699e-06, 'epoch': 0.55} + 55%|█████▌ | 3192/5772 [1:44:55<4:22:56, 6.12s/it] 55%|█████▌ | 3193/5772 [1:45:01<4:21:49, 6.09s/it] 55%|█████▌ | 3193/5772 [1:45:03<4:21:49, 6.09s/it] {'loss': 0.4583, 'learning_rate': 8.76849621617609e-06, 'epoch': 0.55} + 55%|█████▌ | 3193/5772 [1:45:03<4:21:49, 6.09s/it] {'loss': 0.4583, 'learning_rate': 8.76849621617609e-06, 'epoch': 0.55} + 55%|█████▌ | 3193/5772 [1:45:01<4:21:49, 6.09s/it] 55%|█████▌ | 3194/5772 [1:45:07<4:26:08, 6.19s/it] 55%|█████▌ | 3194/5772 [1:45:09<4:26:08, 6.19s/it] {'loss': 0.477, 'learning_rate': 8.762927137590757e-06, 'epoch': 0.55} + 55%|█████▌ | 3194/5772 [1:45:09<4:26:08, 6.19s/it] {'loss': 0.477, 'learning_rate': 8.762927137590757e-06, 'epoch': 0.55} + 55%|█████▌ | 3194/5772 [1:45:07<4:26:08, 6.19s/it] 55%|█████▌ | 3195/5772 [1:45:14<4:27:59, 6.24s/it] 55%|█████▌ | 3195/5772 [1:45:16<4:27:59, 6.24s/it] {'loss': 0.4614, 'learning_rate': 8.757358448614636e-06, 'epoch': 0.55} + 55%|█████▌ | 3195/5772 [1:45:16<4:27:59, 6.24s/it] {'loss': 0.4614, 'learning_rate': 8.757358448614636e-06, 'epoch': 0.55} + 55%|█████▌ | 3195/5772 [1:45:14<4:27:59, 6.24s/it] 55%|█████▌ | 3196/5772 [1:45:22<4:24:48, 6.17s/it] 55%|█████▌ | 3196/5772 [1:45:20<4:24:49, 6.17s/it] {'loss': 0.4577, 'learning_rate': 8.751790151001569e-06, 'epoch': 0.55} + 55%|█████▌ | 3196/5772 [1:45:22<4:24:48, 6.17s/it] {'loss': 0.4577, 'learning_rate': 8.751790151001569e-06, 'epoch': 0.55} + 55%|█████▌ | 3196/5772 [1:45:20<4:24:49, 6.17s/it] 55%|█████▌ | 3197/5772 [1:45:26<4:25:48, 6.19s/it] 55%|█████▌ | 3197/5772 [1:45:28<4:25:48, 6.19s/it] {'loss': 0.4662, 'learning_rate': 8.74622224650525e-06, 'epoch': 0.55} + 55%|█████▌ | 3197/5772 [1:45:28<4:25:48, 6.19s/it] {'loss': 0.4662, 'learning_rate': 8.74622224650525e-06, 'epoch': 0.55} + 55%|█████▌ | 3197/5772 [1:45:26<4:25:48, 6.19s/it] 55%|█████▌ | 3198/5772 [1:45:32<4:26:27, 6.21s/it] 55%|█████▌ | 3198/5772 [1:45:34<4:26:28, 6.21s/it] {'loss': 0.4608, 'learning_rate': 8.740654736879265e-06, 'epoch': 0.55} + 55%|█████▌ | 3198/5772 [1:45:34<4:26:28, 6.21s/it] {'loss': 0.4608, 'learning_rate': 8.740654736879265e-06, 'epoch': 0.55} + 55%|█████▌ | 3198/5772 [1:45:32<4:26:27, 6.21s/it] 55%|█████▌ | 3199/5772 [1:45:38<4:27:53, 6.25s/it] 55%|█████▌ | 3199/5772 [1:45:40<4:27:53, 6.25s/it] {'loss': 0.4625, 'learning_rate': 8.73508762387707e-06, 'epoch': 0.55} + 55%|█████▌ | 3199/5772 [1:45:40<4:27:53, 6.25s/it] {'loss': 0.4625, 'learning_rate': 8.73508762387707e-06, 'epoch': 0.55} + 55%|█████▌ | 3199/5772 [1:45:38<4:27:53, 6.25s/it]3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +010 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 55%|█████▌ | 3200/5772 [1:45:44<4:24:50, 6.18s/it]71 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 55%|█████▌ | 3200/5772 [1:45:46<4:24:50, 6.18s/it] {'loss': 0.4822, 'learning_rate': 8.729520909251994e-06, 'epoch': 0.55} + 55%|█████▌ | 3200/5772 [1:45:46<4:24:50, 6.18s/it] {'loss': 0.4822, 'learning_rate': 8.729520909251994e-06, 'epoch': 0.55} + 55%|█████▌ | 3200/5772 [1:45:44<4:24:50, 6.18s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 55%|█████▌ | 3201/5772 [1:46:11<8:48:14, 12.33s/it] 55%|█████▌ | 3201/5772 [1:46:13<8:48:14, 12.33s/it] {'loss': 0.4694, 'learning_rate': 8.723954594757244e-06, 'epoch': 0.55} + 55%|█████▌ | 3201/5772 [1:46:13<8:48:14, 12.33s/it] {'loss': 0.4694, 'learning_rate': 8.723954594757244e-06, 'epoch': 0.55} + 55%|█████▌ | 3201/5772 [1:46:11<8:48:14, 12.33s/it] 55%|█████▌ | 3202/5772 [1:46:17<7:26:33, 10.43s/it] 55%|█████▌ | 3202/5772 [1:46:19<7:26:33, 10.43s/it] {'loss': 0.4683, 'learning_rate': 8.718388682145897e-06, 'epoch': 0.55} + 55%|█████▌ | 3202/5772 [1:46:19<7:26:33, 10.43s/it] {'loss': 0.4683, 'learning_rate': 8.718388682145897e-06, 'epoch': 0.55} + 55%|█████▌ | 3202/5772 [1:46:17<7:26:33, 10.43s/it] 55%|█████▌ | 3203/5772 [1:46:25<6:26:41, 9.03s/it] 55%|█████▌ | 3203/5772 [1:46:23<6:26:42, 9.03s/it] {'loss': 0.4658, 'learning_rate': 8.712823173170914e-06, 'epoch': 0.55} + 55%|█████▌ | 3203/5772 [1:46:25<6:26:41, 9.03s/it] {'loss': 0.4658, 'learning_rate': 8.712823173170914e-06, 'epoch': 0.55} + 55%|█████▌ | 3203/5772 [1:46:23<6:26:42, 9.03s/it] 56%|█████▌ | 3204/5772 [1:46:29<5:45:42, 8.08s/it] 56%|█████▌ | 3204/5772 [1:46:31<5:45:42, 8.08s/it] {'loss': 0.4694, 'learning_rate': 8.707258069585109e-06, 'epoch': 0.56} + 56%|█████▌ | 3204/5772 [1:46:31<5:45:42, 8.08s/it] {'loss': 0.4694, 'learning_rate': 8.707258069585109e-06, 'epoch': 0.56} + 56%|█████▌ | 3204/5772 [1:46:29<5:45:42, 8.08s/it] 56%|█████▌ | 3205/5772 [1:46:35<5:22:13, 7.53s/it] 56%|█████▌ | 3205/5772 [1:46:37<5:22:13, 7.53s/it] {'loss': 0.4782, 'learning_rate': 8.70169337314119e-06, 'epoch': 0.56} + 56%|█████▌ | 3205/5772 [1:46:37<5:22:13, 7.53s/it] {'loss': 0.4782, 'learning_rate': 8.70169337314119e-06, 'epoch': 0.56} + 56%|█████▌ | 3205/5772 [1:46:35<5:22:13, 7.53s/it] 56%|█████▌ | 3206/5772 [1:46:41<5:03:11, 7.09s/it] 56%|█████▌ | 3206/5772 [1:46:43<5:03:12, 7.09s/it] {'loss': 0.4704, 'learning_rate': 8.696129085591726e-06, 'epoch': 0.56} + 56%|█████▌ | 3206/5772 [1:46:43<5:03:12, 7.09s/it] {'loss': 0.4704, 'learning_rate': 8.696129085591726e-06, 'epoch': 0.56} + 56%|█████▌ | 3206/5772 [1:46:41<5:03:11, 7.09s/it] 56%|█████▌ | 3207/5772 [1:46:47<4:45:55, 6.69s/it] 56%|█████▌ | 3207/5772 [1:46:49<4:45:54, 6.69s/it] {'loss': 0.4619, 'learning_rate': 8.690565208689157e-06, 'epoch': 0.56} + 56%|█████▌ | 3207/5772 [1:46:49<4:45:54, 6.69s/it] {'loss': 0.4619, 'learning_rate': 8.690565208689157e-06, 'epoch': 0.56} + 56%|█████▌ | 3207/5772 [1:46:47<4:45:55, 6.69s/it] 56%|█████▌ | 3208/5772 [1:46:55<4:38:43, 6.52s/it] 56%|█████▌ | 3208/5772 [1:46:53<4:38:44, 6.52s/it] {'loss': 0.477, 'learning_rate': 8.685001744185795e-06, 'epoch': 0.56} + 56%|█████▌ | 3208/5772 [1:46:55<4:38:43, 6.52s/it] {'loss': 0.477, 'learning_rate': 8.685001744185795e-06, 'epoch': 0.56} + 56%|█████▌ | 3208/5772 [1:46:53<4:38:44, 6.52s/it] 56%|█████▌ | 3209/5772 [1:47:00<4:39:51, 6.55s/it] 56%|█████▌ | 3209/5772 [1:47:02<4:39:51, 6.55s/it] {'loss': 0.4601, 'learning_rate': 8.679438693833821e-06, 'epoch': 0.56} + 56%|█████▌ | 3209/5772 [1:47:02<4:39:51, 6.55s/it] {'loss': 0.4601, 'learning_rate': 8.679438693833821e-06, 'epoch': 0.56} + 56%|█████▌ | 3209/5772 [1:47:00<4:39:51, 6.55s/it] 56%|█████▌ | 3210/5772 [1:47:06<4:36:01, 6.46s/it] 56%|█████▌ | 3210/5772 [1:47:08<4:36:01, 6.46s/it] {'loss': 0.4735, 'learning_rate': 8.67387605938529e-06, 'epoch': 0.56} + 56%|█████▌ | 3210/5772 [1:47:08<4:36:01, 6.46s/it] {'loss': 0.4735, 'learning_rate': 8.67387605938529e-06, 'epoch': 0.56} + 56%|█████▌ | 3210/5772 [1:47:06<4:36:01, 6.46s/it] 56%|█████▌ | 3211/5772 [1:47:14<4:29:43, 6.32s/it] 56%|█████▌ | 3211/5772 [1:47:12<4:29:43, 6.32s/it] {'loss': 0.4636, 'learning_rate': 8.668313842592116e-06, 'epoch': 0.56} + 56%|█████▌ | 3211/5772 [1:47:14<4:29:43, 6.32s/it] {'loss': 0.4636, 'learning_rate': 8.668313842592116e-06, 'epoch': 0.56} + 56%|█████▌ | 3211/5772 [1:47:12<4:29:43, 6.32s/it] 56%|█████▌ | 3212/5772 [1:47:18<4:24:13, 6.19s/it] 56%|█████▌ | 3212/5772 [1:47:20<4:24:14, 6.19s/it] {'loss': 0.4682, 'learning_rate': 8.662752045206096e-06, 'epoch': 0.56} + 56%|█████▌ | 3212/5772 [1:47:20<4:24:14, 6.19s/it] {'loss': 0.4682, 'learning_rate': 8.662752045206096e-06, 'epoch': 0.56} + 56%|█████▌ | 3212/5772 [1:47:18<4:24:13, 6.19s/it] 56%|█████▌ | 3213/5772 [1:47:24<4:23:36, 6.18s/it] 56%|█████▌ | 3213/5772 [1:47:26<4:23:36, 6.18s/it] {'loss': 0.4597, 'learning_rate': 8.657190668978887e-06, 'epoch': 0.56} + 56%|█████▌ | 3213/5772 [1:47:26<4:23:36, 6.18s/it] {'loss': 0.4597, 'learning_rate': 8.657190668978887e-06, 'epoch': 0.56} + 56%|█████▌ | 3213/5772 [1:47:24<4:23:36, 6.18s/it] 56%|█████▌ | 3214/5772 [1:47:32<4:21:15, 6.13s/it] 56%|█████▌ | 3214/5772 [1:47:30<4:21:16, 6.13s/it] {'loss': 0.4815, 'learning_rate': 8.651629715662006e-06, 'epoch': 0.56} + 56%|█████▌ | 3214/5772 [1:47:32<4:21:15, 6.13s/it] {'loss': 0.4815, 'learning_rate': 8.651629715662006e-06, 'epoch': 0.56} + 56%|█████▌ | 3214/5772 [1:47:30<4:21:16, 6.13s/it] 56%|█████▌ | 3215/5772 [1:47:36<4:20:55, 6.12s/it] 56%|█████▌ | 3215/5772 [1:47:38<4:20:55, 6.12s/it] {'loss': 0.4489, 'learning_rate': 8.646069187006854e-06, 'epoch': 0.56} + 56%|█████▌ | 3215/5772 [1:47:38<4:20:55, 6.12s/it] {'loss': 0.4489, 'learning_rate': 8.646069187006854e-06, 'epoch': 0.56} + 56%|█████▌ | 3215/5772 [1:47:36<4:20:55, 6.12s/it] 56%|█████▌ | 3216/5772 [1:47:42<4:19:13, 6.09s/it] 56%|█████▌ | 3216/5772 [1:47:44<4:19:13, 6.09s/it] {'loss': 0.4722, 'learning_rate': 8.640509084764682e-06, 'epoch': 0.56} + 56%|█████▌ | 3216/5772 [1:47:44<4:19:13, 6.09s/it] {'loss': 0.4722, 'learning_rate': 8.640509084764682e-06, 'epoch': 0.56} + 56%|█████▌ | 3216/5772 [1:47:42<4:19:13, 6.09s/it] 56%|█████▌ | 3217/5772 [1:47:49<4:26:15, 6.25s/it] 56%|█████▌ | 3217/5772 [1:47:51<4:26:15, 6.25s/it] {'loss': 0.4673, 'learning_rate': 8.634949410686615e-06, 'epoch': 0.56} + 56%|█████▌ | 3217/5772 [1:47:51<4:26:15, 6.25s/it] {'loss': 0.4673, 'learning_rate': 8.634949410686615e-06, 'epoch': 0.56} + 56%|█████▌ | 3217/5772 [1:47:49<4:26:15, 6.25s/it] 56%|█████▌ | 3218/5772 [1:47:55<4:26:12, 6.25s/it] 56%|█████▌ | 3218/5772 [1:47:57<4:26:12, 6.25s/it] {'loss': 0.4812, 'learning_rate': 8.629390166523638e-06, 'epoch': 0.56} + 56%|█████▌ | 3218/5772 [1:47:57<4:26:12, 6.25s/it] {'loss': 0.4812, 'learning_rate': 8.629390166523638e-06, 'epoch': 0.56} + 56%|█████▌ | 3218/5772 [1:47:55<4:26:12, 6.25s/it] 56%|█████▌ | 3219/5772 [1:48:01<4:21:42, 6.15s/it] 56%|█████▌ | 3219/5772 [1:48:03<4:21:42, 6.15s/it] {'loss': 0.4494, 'learning_rate': 8.623831354026609e-06, 'epoch': 0.56} + 56%|█████▌ | 3219/5772 [1:48:03<4:21:42, 6.15s/it] {'loss': 0.4494, 'learning_rate': 8.623831354026609e-06, 'epoch': 0.56} + 56%|█████▌ | 3219/5772 [1:48:01<4:21:42, 6.15s/it] 56%|█████▌ | 3220/5772 [1:48:07<4:17:42, 6.06s/it] 56%|█████▌ | 3220/5772 [1:48:09<4:17:43, 6.06s/it] {'loss': 0.4666, 'learning_rate': 8.618272974946244e-06, 'epoch': 0.56} + 56%|█████▌ | 3220/5772 [1:48:09<4:17:43, 6.06s/it] {'loss': 0.4666, 'learning_rate': 8.618272974946244e-06, 'epoch': 0.56} + 56%|█████▌ | 3220/5772 [1:48:07<4:17:42, 6.06s/it] 56%|█████▌ | 3221/5772 [1:48:13<4:16:36, 6.04s/it] 56%|█████▌ | 3221/5772 [1:48:15<4:16:36, 6.04s/it] {'loss': 0.4671, 'learning_rate': 8.612715031033125e-06, 'epoch': 0.56} + 56%|█████▌ | 3221/5772 [1:48:15<4:16:36, 6.04s/it] {'loss': 0.4671, 'learning_rate': 8.612715031033125e-06, 'epoch': 0.56} + 56%|█████▌ | 3221/5772 [1:48:13<4:16:36, 6.04s/it] 56%|█████▌ | 3222/5772 [1:48:19<4:15:56, 6.02s/it] 56%|█████▌ | 3222/5772 [1:48:21<4:15:56, 6.02s/it] {'loss': 0.4623, 'learning_rate': 8.607157524037692e-06, 'epoch': 0.56} + 56%|█████▌ | 3222/5772 [1:48:21<4:15:56, 6.02s/it] {'loss': 0.4623, 'learning_rate': 8.607157524037692e-06, 'epoch': 0.56} + 56%|█████▌ | 3222/5772 [1:48:19<4:15:56, 6.02s/it] 56%|█████▌ | 3223/5772 [1:48:25<4:18:51, 6.09s/it] 56%|█████▌ | 3223/5772 [1:48:27<4:18:51, 6.09s/it] {'loss': 0.4647, 'learning_rate': 8.601600455710254e-06, 'epoch': 0.56} + 56%|█████▌ | 3223/5772 [1:48:27<4:18:51, 6.09s/it] {'loss': 0.4647, 'learning_rate': 8.601600455710254e-06, 'epoch': 0.56} + 56%|█████▌ | 3223/5772 [1:48:25<4:18:51, 6.09s/it] 56%|█████▌ | 3224/5772 [1:48:31<4:16:23, 6.04s/it] 56%|█████▌ | 3224/5772 [1:48:33<4:16:23, 6.04s/it] {'loss': 0.4815, 'learning_rate': 8.596043827800976e-06, 'epoch': 0.56} + 56%|█████▌ | 3224/5772 [1:48:33<4:16:23, 6.04s/it] {'loss': 0.4815, 'learning_rate': 8.596043827800976e-06, 'epoch': 0.56} + 56%|█████▌ | 3224/5772 [1:48:31<4:16:23, 6.04s/it] 56%|█████▌ | 3225/5772 [1:48:39<4:24:12, 6.22s/it] {'loss': 0.4712, 'learning_rate': 8.590487642059888e-06, 'epoch': 0.56} + 56%|█████▌ | 3225/5772 [1:48:37<4:24:12, 6.22s/it] 56%|█████▌ | 3225/5772 [1:48:39<4:24:12, 6.22s/it] {'loss': 0.4712, 'learning_rate': 8.590487642059888e-06, 'epoch': 0.56} + 56%|█████▌ | 3225/5772 [1:48:37<4:24:12, 6.22s/it] 56%|█████▌ | 3226/5772 [1:48:43<4:19:10, 6.11s/it] 56%|█████▌ | 3226/5772 [1:48:45<4:19:11, 6.11s/it] {'loss': 0.4617, 'learning_rate': 8.584931900236879e-06, 'epoch': 0.56} + 56%|█████▌ | 3226/5772 [1:48:45<4:19:11, 6.11s/it] {'loss': 0.4617, 'learning_rate': 8.584931900236879e-06, 'epoch': 0.56} + 56%|█████▌ | 3226/5772 [1:48:43<4:19:10, 6.11s/it] 56%|█████▌ | 3227/5772 [1:48:49<4:19:30, 6.12s/it] 56%|█████▌ | 3227/5772 [1:48:51<4:19:30, 6.12s/it] {'loss': 0.4552, 'learning_rate': 8.579376604081705e-06, 'epoch': 0.56} + 56%|█████▌ | 3227/5772 [1:48:51<4:19:30, 6.12s/it] {'loss': 0.4552, 'learning_rate': 8.579376604081705e-06, 'epoch': 0.56} + 56%|█████▌ | 3227/5772 [1:48:49<4:19:30, 6.12s/it] 56%|█████▌ | 3228/5772 [1:48:56<4:20:26, 6.14s/it] 56%|█████▌ | 3228/5772 [1:48:58<4:20:26, 6.14s/it] {'loss': 0.473, 'learning_rate': 8.573821755343965e-06, 'epoch': 0.56} + 56%|█████▌ | 3228/5772 [1:48:58<4:20:26, 6.14s/it] {'loss': 0.473, 'learning_rate': 8.573821755343965e-06, 'epoch': 0.56} + 56%|█████▌ | 3228/5772 [1:48:56<4:20:26, 6.14s/it] 56%|█████▌ | 3229/5772 [1:49:02<4:22:48, 6.20s/it] 56%|█████▌ | 3229/5772 [1:49:04<4:22:48, 6.20s/it] {'loss': 0.4719, 'learning_rate': 8.568267355773137e-06, 'epoch': 0.56} + 56%|█████▌ | 3229/5772 [1:49:04<4:22:48, 6.20s/it] {'loss': 0.4719, 'learning_rate': 8.568267355773137e-06, 'epoch': 0.56} + 56%|█████▌ | 3229/5772 [1:49:02<4:22:48, 6.20s/it] 56%|█████▌ | 3230/5772 [1:49:08<4:19:54, 6.13s/it] 56%|█████▌ | 3230/5772 [1:49:10<4:19:54, 6.13s/it] {'loss': 0.4709, 'learning_rate': 8.562713407118543e-06, 'epoch': 0.56} + 56%|█████▌ | 3230/5772 [1:49:10<4:19:54, 6.13s/it] {'loss': 0.4709, 'learning_rate': 8.562713407118543e-06, 'epoch': 0.56} + 56%|█████▌ | 3230/5772 [1:49:08<4:19:54, 6.13s/it] 56%|█████▌ | 3231/5772 [1:49:14<4:17:08, 6.07s/it] 56%|█████▌ | 3231/5772 [1:49:16<4:17:08, 6.07s/it] {'loss': 0.4589, 'learning_rate': 8.557159911129373e-06, 'epoch': 0.56} + 56%|█████▌ | 3231/5772 [1:49:16<4:17:08, 6.07s/it] {'loss': 0.4589, 'learning_rate': 8.557159911129373e-06, 'epoch': 0.56} + 56%|█████▌ | 3231/5772 [1:49:14<4:17:08, 6.07s/it] 56%|█████▌ | 3232/5772 [1:49:20<4:18:18, 6.10s/it] 56%|█████▌ | 3232/5772 [1:49:22<4:18:18, 6.10s/it] {'loss': 0.4652, 'learning_rate': 8.551606869554665e-06, 'epoch': 0.56} + 56%|█████▌ | 3232/5772 [1:49:22<4:18:18, 6.10s/it] {'loss': 0.4652, 'learning_rate': 8.551606869554665e-06, 'epoch': 0.56} + 56%|█████▌ | 3232/5772 [1:49:20<4:18:18, 6.10s/it] 56%|█████▌ | 3233/5772 [1:49:26<4:20:46, 6.16s/it] 56%|█████▌ | 3233/5772 [1:49:28<4:20:46, 6.16s/it] {'loss': 0.471, 'learning_rate': 8.54605428414332e-06, 'epoch': 0.56} + 56%|█████▌ | 3233/5772 [1:49:28<4:20:46, 6.16s/it] {'loss': 0.471, 'learning_rate': 8.54605428414332e-06, 'epoch': 0.56} + 56%|█████▌ | 3233/5772 [1:49:26<4:20:46, 6.16s/it] 56%|█████▌ | 3234/5772 [1:49:32<4:19:47, 6.14s/it] 56%|█████▌ | 3234/5772 [1:49:34<4:19:47, 6.14s/it] {'loss': 0.4634, 'learning_rate': 8.540502156644096e-06, 'epoch': 0.56} + 56%|█████▌ | 3234/5772 [1:49:34<4:19:47, 6.14s/it] {'loss': 0.4634, 'learning_rate': 8.540502156644096e-06, 'epoch': 0.56} + 56%|█████▌ | 3234/5772 [1:49:32<4:19:47, 6.14s/it] 56%|█████▌ | 3235/5772 [1:49:38<4:19:12, 6.13s/it] 56%|█████▌ | 3235/5772 [1:49:40<4:19:12, 6.13s/it] {'loss': 0.4661, 'learning_rate': 8.534950488805599e-06, 'epoch': 0.56} + 56%|█████▌ | 3235/5772 [1:49:40<4:19:12, 6.13s/it] {'loss': 0.4661, 'learning_rate': 8.534950488805599e-06, 'epoch': 0.56} + 56%|█████▌ | 3235/5772 [1:49:38<4:19:12, 6.13s/it] 56%|█████▌ | 3236/5772 [1:49:45<4:19:20, 6.14s/it] 56%|█████▌ | 3236/5772 [1:49:47<4:19:21, 6.14s/it] {'loss': 0.467, 'learning_rate': 8.529399282376306e-06, 'epoch': 0.56} + 56%|█████▌ | 3236/5772 [1:49:47<4:19:21, 6.14s/it] {'loss': 0.467, 'learning_rate': 8.529399282376306e-06, 'epoch': 0.56} + 56%|█████▌ | 3236/5772 [1:49:45<4:19:20, 6.14s/it] 56%|█████▌ | 3237/5772 [1:49:51<4:21:10, 6.18s/it] 56%|█████▌ | 3237/5772 [1:49:53<4:21:10, 6.18s/it] {'loss': 0.4558, 'learning_rate': 8.523848539104527e-06, 'epoch': 0.56} + 56%|█████▌ | 3237/5772 [1:49:53<4:21:10, 6.18s/it] {'loss': 0.4558, 'learning_rate': 8.523848539104527e-06, 'epoch': 0.56} + 56%|█████▌ | 3237/5772 [1:49:51<4:21:10, 6.18s/it] 56%|█████▌ | 3238/5772 [1:49:57<4:18:52, 6.13s/it] 56%|█████▌ | 3238/5772 [1:49:59<4:18:52, 6.13s/it] {'loss': 0.4654, 'learning_rate': 8.518298260738448e-06, 'epoch': 0.56} + 56%|█████▌ | 3238/5772 [1:49:59<4:18:52, 6.13s/it] {'loss': 0.4654, 'learning_rate': 8.518298260738448e-06, 'epoch': 0.56} + 56%|█████▌ | 3238/5772 [1:49:57<4:18:52, 6.13s/it] 56%|█████▌ | 3239/5772 [1:50:03<4:18:41, 6.13s/it] 56%|█████▌ | 3239/5772 [1:50:05<4:18:41, 6.13s/it] {'loss': 0.4705, 'learning_rate': 8.512748449026087e-06, 'epoch': 0.56} + 56%|█████▌ | 3239/5772 [1:50:05<4:18:41, 6.13s/it] {'loss': 0.4705, 'learning_rate': 8.512748449026087e-06, 'epoch': 0.56} + 56%|█████▌ | 3239/5772 [1:50:03<4:18:41, 6.13s/it] 56%|█████▌ | 3240/5772 [1:50:09<4:22:12, 6.21s/it] 56%|█████▌ | 3240/5772 [1:50:11<4:22:11, 6.21s/it] {'loss': 0.4738, 'learning_rate': 8.507199105715336e-06, 'epoch': 0.56} + 56%|█████▌ | 3240/5772 [1:50:11<4:22:11, 6.21s/it] {'loss': 0.4738, 'learning_rate': 8.507199105715336e-06, 'epoch': 0.56} + 56%|█████▌ | 3240/5772 [1:50:09<4:22:12, 6.21s/it] 56%|█████▌ | 3241/5772 [1:50:16<4:20:31, 6.18s/it] 56%|█████▌ | 3241/5772 [1:50:18<4:20:31, 6.18s/it] {'loss': 0.453, 'learning_rate': 8.50165023255393e-06, 'epoch': 0.56} + 56%|█████▌ | 3241/5772 [1:50:18<4:20:31, 6.18s/it] {'loss': 0.453, 'learning_rate': 8.50165023255393e-06, 'epoch': 0.56} + 56%|█████▌ | 3241/5772 [1:50:16<4:20:31, 6.18s/it] 56%|█████▌ | 3242/5772 [1:50:22<4:24:30, 6.27s/it] 56%|█████▌ | 3242/5772 [1:50:24<4:24:30, 6.27s/it] {'loss': 0.4738, 'learning_rate': 8.496101831289447e-06, 'epoch': 0.56} + 56%|█████▌ | 3242/5772 [1:50:24<4:24:30, 6.27s/it] {'loss': 0.4738, 'learning_rate': 8.496101831289447e-06, 'epoch': 0.56} + 56%|█████▌ | 3242/5772 [1:50:22<4:24:30, 6.27s/it] 56%|█████▌ | 3243/5772 [1:50:28<4:22:22, 6.22s/it] 56%|█████▌ | 3243/5772 [1:50:30<4:22:22, 6.22s/it] {'loss': 0.4652, 'learning_rate': 8.490553903669335e-06, 'epoch': 0.56} + 56%|█████▌ | 3243/5772 [1:50:30<4:22:22, 6.22s/it] {'loss': 0.4652, 'learning_rate': 8.490553903669335e-06, 'epoch': 0.56} + 56%|█████▌ | 3243/5772 [1:50:28<4:22:22, 6.22s/it] 56%|█████▌ | 3244/5772 [1:50:34<4:20:05, 6.17s/it] 56%|█████▌ | 3244/5772 [1:50:36<4:20:06, 6.17s/it] {'loss': 0.471, 'learning_rate': 8.485006451440874e-06, 'epoch': 0.56} + 56%|█████▌ | 3244/5772 [1:50:36<4:20:06, 6.17s/it] {'loss': 0.471, 'learning_rate': 8.485006451440874e-06, 'epoch': 0.56} + 56%|█████▌ | 3244/5772 [1:50:34<4:20:05, 6.17s/it] 56%|█████▌ | 3245/5772 [1:50:40<4:16:26, 6.09s/it] 56%|█████▌ | 3245/5772 [1:50:42<4:16:26, 6.09s/it] {'loss': 0.4559, 'learning_rate': 8.479459476351213e-06, 'epoch': 0.56} + 56%|█████▌ | 3245/5772 [1:50:42<4:16:26, 6.09s/it] {'loss': 0.4559, 'learning_rate': 8.479459476351213e-06, 'epoch': 0.56} + 56%|█████▌ | 3245/5772 [1:50:40<4:16:26, 6.09s/it] 56%|█████▌ | 3246/5772 [1:50:46<4:13:42, 6.03s/it] 56%|█████▌ | 3246/5772 [1:50:48<4:13:42, 6.03s/it] {'loss': 0.4811, 'learning_rate': 8.473912980147329e-06, 'epoch': 0.56} + 56%|█████▌ | 3246/5772 [1:50:48<4:13:42, 6.03s/it] {'loss': 0.4811, 'learning_rate': 8.473912980147329e-06, 'epoch': 0.56} + 56%|█████▌ | 3246/5772 [1:50:46<4:13:42, 6.03s/it] 56%|█████▋ | 3247/5772 [1:50:52<4:15:26, 6.07s/it] 56%|█████▋ | 3247/5772 [1:50:54<4:15:26, 6.07s/it] {'loss': 0.4489, 'learning_rate': 8.46836696457607e-06, 'epoch': 0.56} + 56%|█████▋ | 3247/5772 [1:50:54<4:15:26, 6.07s/it] {'loss': 0.4489, 'learning_rate': 8.46836696457607e-06, 'epoch': 0.56} + 56%|█████▋ | 3247/5772 [1:50:52<4:15:26, 6.07s/it] 56%|█████▋ | 3248/5772 [1:50:58<4:15:25, 6.07s/it] 56%|█████▋ | 3248/5772 [1:51:00<4:15:25, 6.07s/it] {'loss': 0.4736, 'learning_rate': 8.462821431384123e-06, 'epoch': 0.56} + 56%|█████▋ | 3248/5772 [1:51:00<4:15:25, 6.07s/it] {'loss': 0.4736, 'learning_rate': 8.462821431384123e-06, 'epoch': 0.56} + 56%|█████▋ | 3248/5772 [1:50:58<4:15:25, 6.07s/it] 56%|█████▋ | 3249/5772 [1:51:04<4:15:52, 6.09s/it] 56%|█████▋ | 3249/5772 [1:51:06<4:15:52, 6.09s/it] {'loss': 0.4585, 'learning_rate': 8.457276382318016e-06, 'epoch': 0.56} + 56%|█████▋ | 3249/5772 [1:51:06<4:15:52, 6.09s/it] {'loss': 0.4585, 'learning_rate': 8.457276382318016e-06, 'epoch': 0.56} + 56%|█████▋ | 3249/5772 [1:51:04<4:15:52, 6.09s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 56%|█████▋ | 3250/5772 [1:51:10<4:11:55, 5.99s/it]6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 56%|█████▋ | 3250/5772 [1:51:12<4:11:55, 5.99s/it]9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4688, 'learning_rate': 8.451731819124137e-06, 'epoch': 0.56} + 56%|█████▋ | 3250/5772 [1:51:12<4:11:55, 5.99s/it] {'loss': 0.4688, 'learning_rate': 8.451731819124137e-06, 'epoch': 0.56} + 56%|█████▋ | 3250/5772 [1:51:10<4:11:55, 5.99s/it] 56%|█████▋ | 3251/5772 [1:51:16<4:13:27, 6.03s/it] 56%|█████▋ | 3251/5772 [1:51:18<4:13:27, 6.03s/it] {'loss': 0.4674, 'learning_rate': 8.446187743548711e-06, 'epoch': 0.56} + 56%|█████▋ | 3251/5772 [1:51:18<4:13:27, 6.03s/it] {'loss': 0.4674, 'learning_rate': 8.446187743548711e-06, 'epoch': 0.56} + 56%|█████▋ | 3251/5772 [1:51:16<4:13:27, 6.03s/it] 56%|█████▋ | 3252/5772 [1:51:22<4:10:59, 5.98s/it] 56%|█████▋ | 3252/5772 [1:51:24<4:10:59, 5.98s/it] {'loss': 0.4713, 'learning_rate': 8.440644157337819e-06, 'epoch': 0.56} + 56%|█████▋ | 3252/5772 [1:51:24<4:10:59, 5.98s/it] {'loss': 0.4713, 'learning_rate': 8.440644157337819e-06, 'epoch': 0.56} + 56%|█████▋ | 3252/5772 [1:51:22<4:10:59, 5.98s/it] 56%|█████▋ | 3253/5772 [1:51:28<4:14:46, 6.07s/it] 56%|█████▋ | 3253/5772 [1:51:30<4:14:46, 6.07s/it] {'loss': 0.4648, 'learning_rate': 8.435101062237377e-06, 'epoch': 0.56} + 56%|█████▋ | 3253/5772 [1:51:30<4:14:46, 6.07s/it] {'loss': 0.4648, 'learning_rate': 8.435101062237377e-06, 'epoch': 0.56} + 56%|█████▋ | 3253/5772 [1:51:28<4:14:46, 6.07s/it] 56%|█████▋ | 3254/5772 [1:51:35<4:16:27, 6.11s/it] 56%|█████▋ | 3254/5772 [1:51:37<4:16:27, 6.11s/it] {'loss': 0.4804, 'learning_rate': 8.42955845999315e-06, 'epoch': 0.56} + 56%|█████▋ | 3254/5772 [1:51:37<4:16:27, 6.11s/it] {'loss': 0.4804, 'learning_rate': 8.42955845999315e-06, 'epoch': 0.56} + 56%|█████▋ | 3254/5772 [1:51:35<4:16:27, 6.11s/it] 56%|█████▋ | 3255/5772 [1:51:41<4:14:10, 6.06s/it] 56%|█████▋ | 3255/5772 [1:51:43<4:14:09, 6.06s/it] {'loss': 0.4546, 'learning_rate': 8.42401635235076e-06, 'epoch': 0.56} + 56%|█████▋ | 3255/5772 [1:51:43<4:14:09, 6.06s/it] {'loss': 0.4546, 'learning_rate': 8.42401635235076e-06, 'epoch': 0.56} + 56%|█████▋ | 3255/5772 [1:51:41<4:14:10, 6.06s/it] 56%|█████▋ | 3256/5772 [1:51:47<4:16:57, 6.13s/it] 56%|█████▋ | 3256/5772 [1:51:49<4:16:57, 6.13s/it] {'loss': 0.4593, 'learning_rate': 8.418474741055657e-06, 'epoch': 0.56} + 56%|█████▋ | 3256/5772 [1:51:47<4:16:57, 6.13s/it]{'loss': 0.4593, 'learning_rate': 8.418474741055657e-06, 'epoch': 0.56} + 56%|█████▋ | 3256/5772 [1:51:49<4:16:57, 6.13s/it] 56%|█████▋ | 3257/5772 [1:51:53<4:21:45, 6.24s/it] 56%|█████▋ | 3257/5772 [1:51:55<4:21:45, 6.24s/it] {'loss': 0.4624, 'learning_rate': 8.412933627853142e-06, 'epoch': 0.56} + 56%|█████▋ | 3257/5772 [1:51:55<4:21:45, 6.24s/it] {'loss': 0.4624, 'learning_rate': 8.412933627853142e-06, 'epoch': 0.56} + 56%|█████▋ | 3257/5772 [1:51:53<4:21:45, 6.24s/it] 56%|█████▋ | 3258/5772 [1:51:59<4:19:03, 6.18s/it] 56%|█████▋ | 3258/5772 [1:52:01<4:19:03, 6.18s/it] {'loss': 0.4556, 'learning_rate': 8.407393014488354e-06, 'epoch': 0.56} + 56%|█████▋ | 3258/5772 [1:52:01<4:19:03, 6.18s/it] {'loss': 0.4556, 'learning_rate': 8.407393014488354e-06, 'epoch': 0.56} + 56%|█████▋ | 3258/5772 [1:51:59<4:19:03, 6.18s/it] 56%|█████▋ | 3259/5772 [1:52:05<4:17:56, 6.16s/it] 56%|█████▋ | 3259/5772 [1:52:07<4:17:56, 6.16s/it] {'loss': 0.4685, 'learning_rate': 8.401852902706285e-06, 'epoch': 0.56} + 56%|█████▋ | 3259/5772 [1:52:07<4:17:56, 6.16s/it] {'loss': 0.4685, 'learning_rate': 8.401852902706285e-06, 'epoch': 0.56} + 56%|█████▋ | 3259/5772 [1:52:05<4:17:56, 6.16s/it] 56%|█████▋ | 3260/5772 [1:52:11<4:15:39, 6.11s/it] 56%|█████▋ | 3260/5772 [1:52:13<4:15:39, 6.11s/it] {'loss': 0.4548, 'learning_rate': 8.396313294251755e-06, 'epoch': 0.56} + 56%|█████▋ | 3260/5772 [1:52:13<4:15:39, 6.11s/it] {'loss': 0.4548, 'learning_rate': 8.396313294251755e-06, 'epoch': 0.56} + 56%|█████▋ | 3260/5772 [1:52:11<4:15:39, 6.11s/it] 56%|█████▋ | 3261/5772 [1:52:18<4:16:19, 6.12s/it] 56%|█████▋ | 3261/5772 [1:52:20<4:16:19, 6.12s/it] {'loss': 0.47, 'learning_rate': 8.390774190869434e-06, 'epoch': 0.56} + 56%|█████▋ | 3261/5772 [1:52:20<4:16:19, 6.12s/it] {'loss': 0.47, 'learning_rate': 8.390774190869434e-06, 'epoch': 0.56} + 56%|█████▋ | 3261/5772 [1:52:18<4:16:19, 6.12s/it] 57%|█████▋ | 3262/5772 [1:52:24<4:18:05, 6.17s/it] 57%|█████▋ | 3262/5772 [1:52:26<4:18:05, 6.17s/it] {'loss': 0.4616, 'learning_rate': 8.385235594303842e-06, 'epoch': 0.57} + 57%|█████▋ | 3262/5772 [1:52:26<4:18:05, 6.17s/it] {'loss': 0.4616, 'learning_rate': 8.385235594303842e-06, 'epoch': 0.57} + 57%|█████▋ | 3262/5772 [1:52:24<4:18:05, 6.17s/it] 57%|█████▋ | 3263/5772 [1:52:30<4:16:31, 6.13s/it] 57%|█████▋ | 3263/5772 [1:52:32<4:16:31, 6.13s/it] {'loss': 0.4621, 'learning_rate': 8.379697506299313e-06, 'epoch': 0.57} + 57%|█████▋ | 3263/5772 [1:52:32<4:16:31, 6.13s/it] {'loss': 0.4621, 'learning_rate': 8.379697506299313e-06, 'epoch': 0.57} + 57%|█████▋ | 3263/5772 [1:52:30<4:16:31, 6.13s/it] 57%|█████▋ | 3264/5772 [1:52:36<4:14:02, 6.08s/it] 57%|█████▋ | 3264/5772 [1:52:38<4:14:02, 6.08s/it] {'loss': 0.4602, 'learning_rate': 8.374159928600051e-06, 'epoch': 0.57} + 57%|█████▋ | 3264/5772 [1:52:38<4:14:02, 6.08s/it] {'loss': 0.4602, 'learning_rate': 8.374159928600051e-06, 'epoch': 0.57} + 57%|█████▋ | 3264/5772 [1:52:36<4:14:02, 6.08s/it] 57%|█████▋ | 3265/5772 [1:52:42<4:13:32, 6.07s/it] 57%|█████▋ | 3265/5772 [1:52:44<4:13:32, 6.07s/it] {'loss': 0.4845, 'learning_rate': 8.368622862950079e-06, 'epoch': 0.57} + 57%|█████▋ | 3265/5772 [1:52:44<4:13:32, 6.07s/it] {'loss': 0.4845, 'learning_rate': 8.368622862950079e-06, 'epoch': 0.57} + 57%|█████▋ | 3265/5772 [1:52:42<4:13:32, 6.07s/it] 57%|█████▋ | 3266/5772 [1:52:48<4:11:13, 6.01s/it] 57%|█████▋ | 3266/5772 [1:52:50<4:11:13, 6.02s/it] {'loss': 0.4663, 'learning_rate': 8.363086311093266e-06, 'epoch': 0.57} + 57%|█████▋ | 3266/5772 [1:52:50<4:11:13, 6.02s/it] {'loss': 0.4663, 'learning_rate': 8.363086311093266e-06, 'epoch': 0.57} + 57%|█████▋ | 3266/5772 [1:52:48<4:11:13, 6.01s/it] 57%|█████▋ | 3267/5772 [1:52:54<4:13:50, 6.08s/it] 57%|█████▋ | 3267/5772 [1:52:56<4:13:50, 6.08s/it] {'loss': 0.4665, 'learning_rate': 8.357550274773317e-06, 'epoch': 0.57} + 57%|█████▋ | 3267/5772 [1:52:56<4:13:50, 6.08s/it] {'loss': 0.4665, 'learning_rate': 8.357550274773317e-06, 'epoch': 0.57} + 57%|█████▋ | 3267/5772 [1:52:54<4:13:50, 6.08s/it] 57%|█████▋ | 3268/5772 [1:53:00<4:12:41, 6.05s/it] 57%|█████▋ | 3268/5772 [1:53:02<4:12:41, 6.05s/it] {'loss': 0.458, 'learning_rate': 8.352014755733775e-06, 'epoch': 0.57} + 57%|█████▋ | 3268/5772 [1:53:02<4:12:41, 6.05s/it] {'loss': 0.458, 'learning_rate': 8.352014755733775e-06, 'epoch': 0.57} + 57%|█████▋ | 3268/5772 [1:53:00<4:12:41, 6.05s/it] 57%|█████▋ | 3269/5772 [1:53:06<4:11:58, 6.04s/it] 57%|█████▋ | 3269/5772 [1:53:08<4:11:58, 6.04s/it] {'loss': 0.4712, 'learning_rate': 8.346479755718028e-06, 'epoch': 0.57} + 57%|█████▋ | 3269/5772 [1:53:08<4:11:58, 6.04s/it] {'loss': 0.4712, 'learning_rate': 8.346479755718028e-06, 'epoch': 0.57} + 57%|█████▋ | 3269/5772 [1:53:06<4:11:58, 6.04s/it] 57%|█████▋ | 3270/5772 [1:53:12<4:12:09, 6.05s/it] 57%|█████▋ | 3270/5772 [1:53:14<4:12:09, 6.05s/it] {'loss': 0.4693, 'learning_rate': 8.340945276469282e-06, 'epoch': 0.57} + 57%|█████▋ | 3270/5772 [1:53:14<4:12:09, 6.05s/it] {'loss': 0.4693, 'learning_rate': 8.340945276469282e-06, 'epoch': 0.57} + 57%|█████▋ | 3270/5772 [1:53:12<4:12:09, 6.05s/it] 57%|█████▋ | 3271/5772 [1:53:18<4:10:32, 6.01s/it] 57%|█████▋ | 3271/5772 [1:53:20<4:10:32, 6.01s/it] {'loss': 0.4677, 'learning_rate': 8.335411319730604e-06, 'epoch': 0.57} + 57%|█████▋ | 3271/5772 [1:53:20<4:10:32, 6.01s/it] {'loss': 0.4677, 'learning_rate': 8.335411319730604e-06, 'epoch': 0.57} + 57%|█████▋ | 3271/5772 [1:53:18<4:10:32, 6.01s/it] 57%|█████▋ | 3272/5772 [1:53:26<4:11:33, 6.04s/it] 57%|█████▋ | 3272/5772 [1:53:24<4:11:33, 6.04s/it] {'loss': 0.4688, 'learning_rate': 8.329877887244867e-06, 'epoch': 0.57} + 57%|█████▋ | 3272/5772 [1:53:26<4:11:33, 6.04s/it] {'loss': 0.4688, 'learning_rate': 8.329877887244867e-06, 'epoch': 0.57} + 57%|█████▋ | 3272/5772 [1:53:24<4:11:33, 6.04s/it] 57%|█████▋ | 3273/5772 [1:53:32<4:14:38, 6.11s/it] 57%|█████▋ | 3273/5772 [1:53:30<4:14:38, 6.11s/it] {'loss': 0.4869, 'learning_rate': 8.32434498075481e-06, 'epoch': 0.57} + 57%|█████▋ | 3273/5772 [1:53:32<4:14:38, 6.11s/it] {'loss': 0.4869, 'learning_rate': 8.32434498075481e-06, 'epoch': 0.57} + 57%|█████▋ | 3273/5772 [1:53:30<4:14:38, 6.11s/it] 57%|█████▋ | 3274/5772 [1:53:38<4:12:57, 6.08s/it] 57%|█████▋ | 3274/5772 [1:53:36<4:12:57, 6.08s/it] {'loss': 0.4563, 'learning_rate': 8.318812602002984e-06, 'epoch': 0.57} + 57%|█████▋ | 3274/5772 [1:53:38<4:12:57, 6.08s/it] {'loss': 0.4563, 'learning_rate': 8.318812602002984e-06, 'epoch': 0.57} + 57%|█████▋ | 3274/5772 [1:53:36<4:12:57, 6.08s/it] 57%|█████▋ | 3275/5772 [1:53:42<4:11:00, 6.03s/it] 57%|█████▋ | 3275/5772 [1:53:44<4:11:00, 6.03s/it] {'loss': 0.4772, 'learning_rate': 8.313280752731779e-06, 'epoch': 0.57} + 57%|█████▋ | 3275/5772 [1:53:44<4:11:00, 6.03s/it] {'loss': 0.4772, 'learning_rate': 8.313280752731779e-06, 'epoch': 0.57} + 57%|█████▋ | 3275/5772 [1:53:42<4:11:00, 6.03s/it] 57%|█████▋ | 3276/5772 [1:53:49<4:14:31, 6.12s/it] 57%|█████▋ | 3276/5772 [1:53:51<4:14:32, 6.12s/it] {'loss': 0.4574, 'learning_rate': 8.307749434683426e-06, 'epoch': 0.57} + 57%|█████▋ | 3276/5772 [1:53:51<4:14:32, 6.12s/it] {'loss': 0.4574, 'learning_rate': 8.307749434683426e-06, 'epoch': 0.57} + 57%|█████▋ | 3276/5772 [1:53:49<4:14:31, 6.12s/it] 57%|█████▋ | 3277/5772 [1:53:57<4:15:32, 6.15s/it] 57%|█████▋ | 3277/5772 [1:53:55<4:15:32, 6.15s/it] {'loss': 0.4658, 'learning_rate': 8.302218649599978e-06, 'epoch': 0.57} + 57%|█████▋ | 3277/5772 [1:53:57<4:15:32, 6.15s/it] {'loss': 0.4658, 'learning_rate': 8.302218649599978e-06, 'epoch': 0.57} + 57%|█████▋ | 3277/5772 [1:53:55<4:15:32, 6.15s/it] 57%|█████▋ | 3278/5772 [1:54:01<4:17:37, 6.20s/it] 57%|█████▋ | 3278/5772 [1:54:03<4:17:37, 6.20s/it] {'loss': 0.4539, 'learning_rate': 8.296688399223327e-06, 'epoch': 0.57} + 57%|█████▋ | 3278/5772 [1:54:03<4:17:37, 6.20s/it] {'loss': 0.4539, 'learning_rate': 8.296688399223327e-06, 'epoch': 0.57} + 57%|█████▋ | 3278/5772 [1:54:01<4:17:37, 6.20s/it] 57%|█████▋ | 3279/5772 [1:54:07<4:13:48, 6.11s/it] 57%|█████▋ | 3279/5772 [1:54:09<4:13:48, 6.11s/it] {'loss': 0.4621, 'learning_rate': 8.29115868529519e-06, 'epoch': 0.57} + 57%|█████▋ | 3279/5772 [1:54:09<4:13:48, 6.11s/it] {'loss': 0.4621, 'learning_rate': 8.29115868529519e-06, 'epoch': 0.57} + 57%|█████▋ | 3279/5772 [1:54:07<4:13:48, 6.11s/it] 57%|█████▋ | 3280/5772 [1:54:13<4:10:42, 6.04s/it] 57%|█████▋ | 3280/5772 [1:54:15<4:10:42, 6.04s/it] {'loss': 0.458, 'learning_rate': 8.285629509557132e-06, 'epoch': 0.57} + 57%|█████▋ | 3280/5772 [1:54:15<4:10:42, 6.04s/it] {'loss': 0.458, 'learning_rate': 8.285629509557132e-06, 'epoch': 0.57} + 57%|█████▋ | 3280/5772 [1:54:13<4:10:42, 6.04s/it] 57%|█████▋ | 3281/5772 [1:54:19<4:14:39, 6.13s/it] 57%|█████▋ | 3281/5772 [1:54:21<4:14:39, 6.13s/it] {'loss': 0.4663, 'learning_rate': 8.28010087375052e-06, 'epoch': 0.57} + 57%|█████▋ | 3281/5772 [1:54:21<4:14:39, 6.13s/it] {'loss': 0.4663, 'learning_rate': 8.28010087375052e-06, 'epoch': 0.57} + 57%|█████▋ | 3281/5772 [1:54:19<4:14:39, 6.13s/it] 57%|█████▋ | 3282/5772 [1:54:25<4:12:49, 6.09s/it] 57%|█████▋ | 3282/5772 [1:54:27<4:12:49, 6.09s/it] {'loss': 0.4599, 'learning_rate': 8.274572779616579e-06, 'epoch': 0.57} + 57%|█████▋ | 3282/5772 [1:54:27<4:12:49, 6.09s/it] {'loss': 0.4599, 'learning_rate': 8.274572779616579e-06, 'epoch': 0.57} + 57%|█████▋ | 3282/5772 [1:54:25<4:12:49, 6.09s/it] 57%|█████▋ | 3283/5772 [1:54:33<4:08:12, 5.98s/it] 57%|█████▋ | 3283/5772 [1:54:31<4:08:12, 5.98s/it] {'loss': 0.4721, 'learning_rate': 8.269045228896349e-06, 'epoch': 0.57} + 57%|█████▋ | 3283/5772 [1:54:33<4:08:12, 5.98s/it] {'loss': 0.4721, 'learning_rate': 8.269045228896349e-06, 'epoch': 0.57} + 57%|█████▋ | 3283/5772 [1:54:31<4:08:12, 5.98s/it] 57%|█████▋ | 3284/5772 [1:54:37<4:08:41, 6.00s/it] 57%|█████▋ | 3284/5772 [1:54:39<4:08:41, 6.00s/it] {'loss': 0.4592, 'learning_rate': 8.263518223330698e-06, 'epoch': 0.57} + 57%|█████▋ | 3284/5772 [1:54:39<4:08:41, 6.00s/it] {'loss': 0.4592, 'learning_rate': 8.263518223330698e-06, 'epoch': 0.57} + 57%|█████▋ | 3284/5772 [1:54:37<4:08:41, 6.00s/it] 57%|█████▋ | 3285/5772 [1:54:45<4:10:01, 6.03s/it] 57%|█████▋ | 3285/5772 [1:54:43<4:10:01, 6.03s/it] {'loss': 0.4667, 'learning_rate': 8.25799176466033e-06, 'epoch': 0.57} + 57%|█████▋ | 3285/5772 [1:54:45<4:10:01, 6.03s/it] {'loss': 0.4667, 'learning_rate': 8.25799176466033e-06, 'epoch': 0.57} + 57%|█████▋ | 3285/5772 [1:54:43<4:10:01, 6.03s/it] 57%|█████▋ | 3286/5772 [1:54:50<4:15:03, 6.16s/it] 57%|█████▋ | 3286/5772 [1:54:52<4:15:03, 6.16s/it] {'loss': 0.4715, 'learning_rate': 8.252465854625766e-06, 'epoch': 0.57} + 57%|█████▋ | 3286/5772 [1:54:52<4:15:03, 6.16s/it] {'loss': 0.4715, 'learning_rate': 8.252465854625766e-06, 'epoch': 0.57} + 57%|█████▋ | 3286/5772 [1:54:50<4:15:03, 6.16s/it] 57%|█████▋ | 3287/5772 [1:54:55<4:09:27, 6.02s/it] 57%|█████▋ | 3287/5772 [1:54:57<4:09:27, 6.02s/it] {'loss': 0.4611, 'learning_rate': 8.246940494967369e-06, 'epoch': 0.57} + 57%|█████▋ | 3287/5772 [1:54:57<4:09:27, 6.02s/it] {'loss': 0.4611, 'learning_rate': 8.246940494967369e-06, 'epoch': 0.57} + 57%|█████▋ | 3287/5772 [1:54:55<4:09:27, 6.02s/it] 57%|█████▋ | 3288/5772 [1:55:03<4:06:38, 5.96s/it] 57%|█████▋ | 3288/5772 [1:55:01<4:06:38, 5.96s/it] {'loss': 0.4716, 'learning_rate': 8.24141568742531e-06, 'epoch': 0.57} + 57%|█████▋ | 3288/5772 [1:55:03<4:06:38, 5.96s/it] {'loss': 0.4716, 'learning_rate': 8.24141568742531e-06, 'epoch': 0.57} + 57%|█████▋ | 3288/5772 [1:55:01<4:06:38, 5.96s/it] 57%|█████▋ | 3289/5772 [1:55:09<4:05:18, 5.93s/it] 57%|█████▋ | 3289/5772 [1:55:07<4:05:18, 5.93s/it] {'loss': 0.4684, 'learning_rate': 8.235891433739606e-06, 'epoch': 0.57} + 57%|█████▋ | 3289/5772 [1:55:09<4:05:18, 5.93s/it] {'loss': 0.4684, 'learning_rate': 8.235891433739606e-06, 'epoch': 0.57} + 57%|█████▋ | 3289/5772 [1:55:07<4:05:18, 5.93s/it] 57%|█████▋ | 3290/5772 [1:55:15<4:07:17, 5.98s/it] 57%|█████▋ | 3290/5772 [1:55:13<4:07:17, 5.98s/it] {'loss': 0.4594, 'learning_rate': 8.230367735650088e-06, 'epoch': 0.57} + 57%|█████▋ | 3290/5772 [1:55:15<4:07:17, 5.98s/it] {'loss': 0.4594, 'learning_rate': 8.230367735650088e-06, 'epoch': 0.57} + 57%|█████▋ | 3290/5772 [1:55:13<4:07:17, 5.98s/it] 57%|█████▋ | 3291/5772 [1:55:21<4:08:55, 6.02s/it] 57%|█████▋ | 3291/5772 [1:55:19<4:08:55, 6.02s/it] {'loss': 0.4731, 'learning_rate': 8.224844594896411e-06, 'epoch': 0.57} + 57%|█████▋ | 3291/5772 [1:55:21<4:08:55, 6.02s/it] {'loss': 0.4731, 'learning_rate': 8.224844594896411e-06, 'epoch': 0.57} + 57%|█████▋ | 3291/5772 [1:55:19<4:08:55, 6.02s/it] 57%|█████▋ | 3292/5772 [1:55:25<4:11:12, 6.08s/it] 57%|█████▋ | 3292/5772 [1:55:27<4:11:12, 6.08s/it] {'loss': 0.4606, 'learning_rate': 8.219322013218062e-06, 'epoch': 0.57} + 57%|█████▋ | 3292/5772 [1:55:27<4:11:12, 6.08s/it] {'loss': 0.4606, 'learning_rate': 8.219322013218062e-06, 'epoch': 0.57} + 57%|█████▋ | 3292/5772 [1:55:25<4:11:12, 6.08s/it] 57%|█████▋ | 3293/5772 [1:55:32<4:13:59, 6.15s/it] 57%|█████▋ | 3293/5772 [1:55:34<4:13:59, 6.15s/it] {'loss': 0.4584, 'learning_rate': 8.213799992354341e-06, 'epoch': 0.57} + 57%|█████▋ | 3293/5772 [1:55:34<4:13:59, 6.15s/it] {'loss': 0.4584, 'learning_rate': 8.213799992354341e-06, 'epoch': 0.57} + 57%|█████▋ | 3293/5772 [1:55:32<4:13:59, 6.15s/it] 57%|█████▋ | 3294/5772 [1:55:40<4:19:18, 6.28s/it] 57%|█████▋ | 3294/5772 [1:55:38<4:19:18, 6.28s/it] {'loss': 0.465, 'learning_rate': 8.208278534044382e-06, 'epoch': 0.57} + 57%|█████▋ | 3294/5772 [1:55:40<4:19:18, 6.28s/it] {'loss': 0.465, 'learning_rate': 8.208278534044382e-06, 'epoch': 0.57} + 57%|█████▋ | 3294/5772 [1:55:38<4:19:18, 6.28s/it] 57%|█████▋ | 3295/5772 [1:55:47<4:18:20, 6.26s/it] 57%|█████▋ | 3295/5772 [1:55:45<4:18:20, 6.26s/it] {'loss': 0.4589, 'learning_rate': 8.202757640027137e-06, 'epoch': 0.57} + 57%|█████▋ | 3295/5772 [1:55:47<4:18:20, 6.26s/it] {'loss': 0.4589, 'learning_rate': 8.202757640027137e-06, 'epoch': 0.57} + 57%|█████▋ | 3295/5772 [1:55:45<4:18:20, 6.26s/it] 57%|█████▋ | 3296/5772 [1:55:53<4:16:17, 6.21s/it] 57%|█████▋ | 3296/5772 [1:55:51<4:16:17, 6.21s/it] {'loss': 0.4627, 'learning_rate': 8.197237312041377e-06, 'epoch': 0.57} + 57%|█████▋ | 3296/5772 [1:55:53<4:16:17, 6.21s/it] {'loss': 0.4627, 'learning_rate': 8.197237312041377e-06, 'epoch': 0.57} + 57%|█████▋ | 3296/5772 [1:55:51<4:16:17, 6.21s/it] 57%|█████▋ | 3297/5772 [1:55:59<4:13:26, 6.14s/it] 57%|█████▋ | 3297/5772 [1:55:57<4:13:26, 6.14s/it] {'loss': 0.4657, 'learning_rate': 8.191717551825707e-06, 'epoch': 0.57} + 57%|█████▋ | 3297/5772 [1:55:59<4:13:26, 6.14s/it] {'loss': 0.4657, 'learning_rate': 8.191717551825707e-06, 'epoch': 0.57} + 57%|█████▋ | 3297/5772 [1:55:57<4:13:26, 6.14s/it] 57%|█████▋ | 3298/5772 [1:56:05<4:12:15, 6.12s/it] 57%|█████▋ | 3298/5772 [1:56:03<4:12:15, 6.12s/it] {'loss': 0.4591, 'learning_rate': 8.186198361118537e-06, 'epoch': 0.57} + 57%|█████▋ | 3298/5772 [1:56:05<4:12:15, 6.12s/it] {'loss': 0.4591, 'learning_rate': 8.186198361118537e-06, 'epoch': 0.57} + 57%|█████▋ | 3298/5772 [1:56:03<4:12:15, 6.12s/it] 57%|█████▋ | 3299/5772 [1:56:11<4:10:00, 6.07s/it] 57%|█████▋ | 3299/5772 [1:56:09<4:10:00, 6.07s/it] {'loss': 0.4741, 'learning_rate': 8.18067974165811e-06, 'epoch': 0.57} + 57%|█████▋ | 3299/5772 [1:56:11<4:10:00, 6.07s/it] {'loss': 0.4741, 'learning_rate': 8.18067974165811e-06, 'epoch': 0.57} + 57%|█████▋ | 3299/5772 [1:56:09<4:10:00, 6.07s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1113 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 57%|█████▋ | 3300/5772 [1:56:16<4:05:28, 5.96s/it] 57%|█████▋ | 3300/5772 [1:56:14<4:05:28, 5.96s/it]9 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4651, 'learning_rate': 8.175161695182484e-06, 'epoch': 0.57} + 57%|█████▋ | 3300/5772 [1:56:16<4:05:28, 5.96s/it] {'loss': 0.4651, 'learning_rate': 8.175161695182484e-06, 'epoch': 0.57} + 57%|█████▋ | 3300/5772 [1:56:14<4:05:28, 5.96s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 57%|█████▋ | 3301/5772 [1:56:39<7:27:00, 10.85s/it] 57%|█████▋ | 3301/5772 [1:56:37<7:27:00, 10.85s/it] {'loss': 0.461, 'learning_rate': 8.169644223429535e-06, 'epoch': 0.57} + 57%|█████▋ | 3301/5772 [1:56:39<7:27:00, 10.85s/it] {'loss': 0.461, 'learning_rate': 8.169644223429535e-06, 'epoch': 0.57} + 57%|█████▋ | 3301/5772 [1:56:37<7:27:00, 10.85s/it] 57%|█████▋ | 3302/5772 [1:56:43<6:29:46, 9.47s/it] 57%|█████▋ | 3302/5772 [1:56:45<6:29:47, 9.47s/it] {'loss': 0.4654, 'learning_rate': 8.16412732813696e-06, 'epoch': 0.57} + 57%|█████▋ | 3302/5772 [1:56:45<6:29:47, 9.47s/it] {'loss': 0.4654, 'learning_rate': 8.16412732813696e-06, 'epoch': 0.57} + 57%|█████▋ | 3302/5772 [1:56:43<6:29:46, 9.47s/it] 57%|█████▋ | 3303/5772 [1:56:49<5:48:47, 8.48s/it] 57%|█████▋ | 3303/5772 [1:56:51<5:48:47, 8.48s/it] {'loss': 0.4785, 'learning_rate': 8.158611011042272e-06, 'epoch': 0.57} + 57%|█████▋ | 3303/5772 [1:56:51<5:48:47, 8.48s/it] {'loss': 0.4785, 'learning_rate': 8.158611011042272e-06, 'epoch': 0.57} + 57%|█████▋ | 3303/5772 [1:56:49<5:48:47, 8.48s/it] 57%|█████▋ | 3304/5772 [1:56:57<5:19:51, 7.78s/it] 57%|█████▋ | 3304/5772 [1:56:55<5:19:52, 7.78s/it] {'loss': 0.4581, 'learning_rate': 8.153095273882816e-06, 'epoch': 0.57} + 57%|█████▋ | 3304/5772 [1:56:57<5:19:51, 7.78s/it] {'loss': 0.4581, 'learning_rate': 8.153095273882816e-06, 'epoch': 0.57} + 57%|█████▋ | 3304/5772 [1:56:55<5:19:52, 7.78s/it] 57%|█████▋ | 3305/5772 [1:57:01<4:56:54, 7.22s/it] 57%|█████▋ | 3305/5772 [1:57:03<4:56:54, 7.22s/it] {'loss': 0.4684, 'learning_rate': 8.147580118395728e-06, 'epoch': 0.57} + 57%|█████▋ | 3305/5772 [1:57:03<4:56:54, 7.22s/it] {'loss': 0.4684, 'learning_rate': 8.147580118395728e-06, 'epoch': 0.57} + 57%|█████▋ | 3305/5772 [1:57:01<4:56:54, 7.22s/it] 57%|█████▋ | 3306/5772 [1:57:07<4:44:05, 6.91s/it] 57%|█████▋ | 3306/5772 [1:57:09<4:44:05, 6.91s/it] {'loss': 0.4578, 'learning_rate': 8.142065546317988e-06, 'epoch': 0.57} + 57%|█████▋ | 3306/5772 [1:57:09<4:44:05, 6.91s/it] {'loss': 0.4578, 'learning_rate': 8.142065546317988e-06, 'epoch': 0.57} + 57%|█████▋ | 3306/5772 [1:57:07<4:44:05, 6.91s/it] 57%|█████▋ | 3307/5772 [1:57:16<4:35:30, 6.71s/it] 57%|█████▋ | 3307/5772 [1:57:14<4:35:30, 6.71s/it] {'loss': 0.4739, 'learning_rate': 8.136551559386368e-06, 'epoch': 0.57} + 57%|█████▋ | 3307/5772 [1:57:16<4:35:30, 6.71s/it] {'loss': 0.4739, 'learning_rate': 8.136551559386368e-06, 'epoch': 0.57} + 57%|█████▋ | 3307/5772 [1:57:14<4:35:30, 6.71s/it] 57%|█████▋ | 3308/5772 [1:57:22<4:29:28, 6.56s/it] 57%|█████▋ | 3308/5772 [1:57:20<4:29:28, 6.56s/it] {'loss': 0.4797, 'learning_rate': 8.131038159337478e-06, 'epoch': 0.57} + 57%|█████▋ | 3308/5772 [1:57:22<4:29:28, 6.56s/it] {'loss': 0.4797, 'learning_rate': 8.131038159337478e-06, 'epoch': 0.57} + 57%|█████▋ | 3308/5772 [1:57:20<4:29:28, 6.56s/it] 57%|█████▋ | 3309/5772 [1:57:28<4:24:45, 6.45s/it] 57%|█████▋ | 3309/5772 [1:57:26<4:24:45, 6.45s/it] {'loss': 0.4735, 'learning_rate': 8.125525347907726e-06, 'epoch': 0.57} + 57%|█████▋ | 3309/5772 [1:57:28<4:24:45, 6.45s/it] {'loss': 0.4735, 'learning_rate': 8.125525347907726e-06, 'epoch': 0.57} + 57%|█████▋ | 3309/5772 [1:57:26<4:24:45, 6.45s/it] 57%|█████▋ | 3310/5772 [1:57:32<4:20:09, 6.34s/it] 57%|█████▋ | 3310/5772 [1:57:34<4:20:09, 6.34s/it] {'loss': 0.4692, 'learning_rate': 8.120013126833344e-06, 'epoch': 0.57} + 57%|█████▋ | 3310/5772 [1:57:34<4:20:09, 6.34s/it] {'loss': 0.4692, 'learning_rate': 8.120013126833344e-06, 'epoch': 0.57} + 57%|█████▋ | 3310/5772 [1:57:32<4:20:09, 6.34s/it] 57%|█████▋ | 3311/5772 [1:57:38<4:16:53, 6.26s/it] 57%|█████▋ | 3311/5772 [1:57:40<4:16:53, 6.26s/it] {'loss': 0.462, 'learning_rate': 8.114501497850375e-06, 'epoch': 0.57} + 57%|█████▋ | 3311/5772 [1:57:40<4:16:53, 6.26s/it] {'loss': 0.462, 'learning_rate': 8.114501497850375e-06, 'epoch': 0.57} + 57%|█████▋ | 3311/5772 [1:57:38<4:16:53, 6.26s/it] 57%|█████▋ | 3312/5772 [1:57:47<4:24:08, 6.44s/it] 57%|█████▋ | 3312/5772 [1:57:45<4:24:08, 6.44s/it] {'loss': 0.4587, 'learning_rate': 8.108990462694676e-06, 'epoch': 0.57} + 57%|█████▋ | 3312/5772 [1:57:47<4:24:08, 6.44s/it] {'loss': 0.4587, 'learning_rate': 8.108990462694676e-06, 'epoch': 0.57} + 57%|█████▋ | 3312/5772 [1:57:45<4:24:08, 6.44s/it] 57%|█████▋ | 3313/5772 [1:57:51<4:18:55, 6.32s/it] 57%|█████▋ | 3313/5772 [1:57:53<4:18:55, 6.32s/it] {'loss': 0.4697, 'learning_rate': 8.103480023101919e-06, 'epoch': 0.57} + 57%|█████▋ | 3313/5772 [1:57:53<4:18:55, 6.32s/it] {'loss': 0.4697, 'learning_rate': 8.103480023101919e-06, 'epoch': 0.57} + 57%|█████▋ | 3313/5772 [1:57:51<4:18:55, 6.32s/it] 57%|█████▋ | 3314/5772 [1:57:57<4:14:45, 6.22s/it] 57%|█████▋ | 3314/5772 [1:57:59<4:14:45, 6.22s/it] {'loss': 0.4634, 'learning_rate': 8.097970180807577e-06, 'epoch': 0.57} + 57%|█████▋ | 3314/5772 [1:57:59<4:14:45, 6.22s/it] {'loss': 0.4634, 'learning_rate': 8.097970180807577e-06, 'epoch': 0.57} + 57%|█████▋ | 3314/5772 [1:57:57<4:14:45, 6.22s/it] 57%|█████▋ | 3315/5772 [1:58:03<4:16:11, 6.26s/it] 57%|█████▋ | 3315/5772 [1:58:05<4:16:11, 6.26s/it] {'loss': 0.4669, 'learning_rate': 8.09246093754696e-06, 'epoch': 0.57} + 57%|█████▋ | 3315/5772 [1:58:05<4:16:11, 6.26s/it] {'loss': 0.4669, 'learning_rate': 8.09246093754696e-06, 'epoch': 0.57} + 57%|█████▋ | 3315/5772 [1:58:03<4:16:11, 6.26s/it] 57%|█████▋ | 3316/5772 [1:58:11<4:11:38, 6.15s/it] 57%|█████▋ | 3316/5772 [1:58:09<4:11:39, 6.15s/it] {'loss': 0.4627, 'learning_rate': 8.086952295055158e-06, 'epoch': 0.57} + 57%|█████▋ | 3316/5772 [1:58:11<4:11:38, 6.15s/it] {'loss': 0.4627, 'learning_rate': 8.086952295055158e-06, 'epoch': 0.57} + 57%|█████▋ | 3316/5772 [1:58:09<4:11:39, 6.15s/it] 57%|█████▋ | 3317/5772 [1:58:17<4:11:08, 6.14s/it] 57%|█████▋ | 3317/5772 [1:58:15<4:11:09, 6.14s/it] {'loss': 0.4747, 'learning_rate': 8.081444255067096e-06, 'epoch': 0.57} + 57%|█████▋ | 3317/5772 [1:58:17<4:11:08, 6.14s/it] {'loss': 0.4747, 'learning_rate': 8.081444255067096e-06, 'epoch': 0.57} + 57%|█████▋ | 3317/5772 [1:58:15<4:11:09, 6.14s/it] 57%|█████▋ | 3318/5772 [1:58:24<4:11:49, 6.16s/it] 57%|█████▋ | 3318/5772 [1:58:22<4:11:49, 6.16s/it] {'loss': 0.4707, 'learning_rate': 8.075936819317501e-06, 'epoch': 0.57} + 57%|█████▋ | 3318/5772 [1:58:24<4:11:49, 6.16s/it] {'loss': 0.4707, 'learning_rate': 8.075936819317501e-06, 'epoch': 0.57} + 57%|█████▋ | 3318/5772 [1:58:22<4:11:49, 6.16s/it] 58%|█████▊ | 3319/5772 [1:58:30<4:16:01, 6.26s/it] 58%|█████▊ | 3319/5772 [1:58:28<4:16:02, 6.26s/it] {'loss': 0.4572, 'learning_rate': 8.070429989540905e-06, 'epoch': 0.57} + 58%|█████▊ | 3319/5772 [1:58:30<4:16:01, 6.26s/it] {'loss': 0.4572, 'learning_rate': 8.070429989540905e-06, 'epoch': 0.57} + 58%|█████▊ | 3319/5772 [1:58:28<4:16:02, 6.26s/it] 58%|█████▊ | 3320/5772 [1:58:36<4:16:10, 6.27s/it] 58%|█████▊ | 3320/5772 [1:58:34<4:16:11, 6.27s/it] {'loss': 0.4639, 'learning_rate': 8.064923767471657e-06, 'epoch': 0.58} + 58%|█████▊ | 3320/5772 [1:58:36<4:16:10, 6.27s/it] {'loss': 0.4639, 'learning_rate': 8.064923767471657e-06, 'epoch': 0.58} + 58%|█████▊ | 3320/5772 [1:58:34<4:16:11, 6.27s/it] 58%|█████▊ | 3321/5772 [1:58:43<4:15:32, 6.26s/it] 58%|█████▊ | 3321/5772 [1:58:41<4:15:32, 6.26s/it] {'loss': 0.4774, 'learning_rate': 8.059418154843908e-06, 'epoch': 0.58} + 58%|█████▊ | 3321/5772 [1:58:43<4:15:32, 6.26s/it] {'loss': 0.4774, 'learning_rate': 8.059418154843908e-06, 'epoch': 0.58} + 58%|█████▊ | 3321/5772 [1:58:41<4:15:32, 6.26s/it] 58%|█████▊ | 3322/5772 [1:58:49<4:18:45, 6.34s/it] 58%|█████▊ | 3322/5772 [1:58:47<4:18:45, 6.34s/it] {'loss': 0.4606, 'learning_rate': 8.053913153391622e-06, 'epoch': 0.58} + 58%|█████▊ | 3322/5772 [1:58:49<4:18:45, 6.34s/it] {'loss': 0.4606, 'learning_rate': 8.053913153391622e-06, 'epoch': 0.58} + 58%|█████▊ | 3322/5772 [1:58:47<4:18:45, 6.34s/it] 58%|█████▊ | 3323/5772 [1:58:56<4:20:14, 6.38s/it] 58%|█████▊ | 3323/5772 [1:58:54<4:20:14, 6.38s/it] {'loss': 0.4735, 'learning_rate': 8.048408764848565e-06, 'epoch': 0.58} + 58%|█████▊ | 3323/5772 [1:58:56<4:20:14, 6.38s/it] {'loss': 0.4735, 'learning_rate': 8.048408764848565e-06, 'epoch': 0.58} + 58%|█████▊ | 3323/5772 [1:58:54<4:20:14, 6.38s/it] 58%|█████▊ | 3324/5772 [1:59:02<4:19:45, 6.37s/it] 58%|█████▊ | 3324/5772 [1:59:00<4:19:45, 6.37s/it] {'loss': 0.4653, 'learning_rate': 8.042904990948319e-06, 'epoch': 0.58} + 58%|█████▊ | 3324/5772 [1:59:02<4:19:45, 6.37s/it] {'loss': 0.4653, 'learning_rate': 8.042904990948319e-06, 'epoch': 0.58} + 58%|█████▊ | 3324/5772 [1:59:00<4:19:45, 6.37s/it] 58%|█████▊ | 3325/5772 [1:59:08<4:13:05, 6.21s/it] 58%|█████▊ | 3325/5772 [1:59:06<4:13:05, 6.21s/it] {'loss': 0.4701, 'learning_rate': 8.037401833424265e-06, 'epoch': 0.58} + 58%|█████▊ | 3325/5772 [1:59:08<4:13:05, 6.21s/it] {'loss': 0.4701, 'learning_rate': 8.037401833424265e-06, 'epoch': 0.58} + 58%|█████▊ | 3325/5772 [1:59:06<4:13:05, 6.21s/it] 58%|█████▊ | 3326/5772 [1:59:12<4:17:51, 6.33s/it] 58%|█████▊ | 3326/5772 [1:59:14<4:17:52, 6.33s/it] {'loss': 0.4588, 'learning_rate': 8.03189929400959e-06, 'epoch': 0.58} + 58%|█████▊ | 3326/5772 [1:59:14<4:17:52, 6.33s/it] {'loss': 0.4588, 'learning_rate': 8.03189929400959e-06, 'epoch': 0.58} + 58%|█████▊ | 3326/5772 [1:59:12<4:17:51, 6.33s/it] 58%|█████▊ | 3327/5772 [1:59:19<4:19:24, 6.37s/it] 58%|█████▊ | 3327/5772 [1:59:21<4:19:24, 6.37s/it] {'loss': 0.471, 'learning_rate': 8.026397374437294e-06, 'epoch': 0.58} + 58%|█████▊ | 3327/5772 [1:59:21<4:19:24, 6.37s/it] {'loss': 0.471, 'learning_rate': 8.026397374437294e-06, 'epoch': 0.58} + 58%|█████▊ | 3327/5772 [1:59:19<4:19:24, 6.37s/it] 58%|█████▊ | 3328/5772 [1:59:25<4:16:56, 6.31s/it] 58%|█████▊ | 3328/5772 [1:59:27<4:16:56, 6.31s/it] {'loss': 0.4658, 'learning_rate': 8.020896076440169e-06, 'epoch': 0.58} + 58%|█████▊ | 3328/5772 [1:59:27<4:16:56, 6.31s/it] {'loss': 0.4658, 'learning_rate': 8.020896076440169e-06, 'epoch': 0.58} + 58%|█████▊ | 3328/5772 [1:59:25<4:16:56, 6.31s/it] 58%|█████▊ | 3329/5772 [1:59:33<4:13:45, 6.23s/it] 58%|█████▊ | 3329/5772 [1:59:31<4:13:45, 6.23s/it] {'loss': 0.468, 'learning_rate': 8.015395401750816e-06, 'epoch': 0.58} + 58%|█████▊ | 3329/5772 [1:59:33<4:13:45, 6.23s/it] {'loss': 0.468, 'learning_rate': 8.015395401750816e-06, 'epoch': 0.58} + 58%|█████▊ | 3329/5772 [1:59:31<4:13:45, 6.23s/it] 58%|█████▊ | 3330/5772 [1:59:39<4:12:55, 6.21s/it] 58%|█████▊ | 3330/5772 [1:59:37<4:12:55, 6.21s/it] {'loss': 0.4561, 'learning_rate': 8.009895352101656e-06, 'epoch': 0.58} + 58%|█████▊ | 3330/5772 [1:59:39<4:12:55, 6.21s/it] {'loss': 0.4561, 'learning_rate': 8.009895352101656e-06, 'epoch': 0.58} + 58%|█████▊ | 3330/5772 [1:59:37<4:12:55, 6.21s/it] 58%|█████▊ | 3331/5772 [1:59:43<4:13:37, 6.23s/it] 58%|█████▊ | 3331/5772 [1:59:45<4:13:37, 6.23s/it] {'loss': 0.4632, 'learning_rate': 8.004395929224881e-06, 'epoch': 0.58} + 58%|█████▊ | 3331/5772 [1:59:45<4:13:37, 6.23s/it] {'loss': 0.4632, 'learning_rate': 8.004395929224881e-06, 'epoch': 0.58} + 58%|█████▊ | 3331/5772 [1:59:43<4:13:37, 6.23s/it] 58%|█████▊ | 3332/5772 [1:59:51<4:10:39, 6.16s/it] 58%|█████▊ | 3332/5772 [1:59:49<4:10:39, 6.16s/it] {'loss': 0.462, 'learning_rate': 7.998897134852518e-06, 'epoch': 0.58} + 58%|█████▊ | 3332/5772 [1:59:51<4:10:39, 6.16s/it] {'loss': 0.462, 'learning_rate': 7.998897134852518e-06, 'epoch': 0.58} + 58%|█████▊ | 3332/5772 [1:59:49<4:10:39, 6.16s/it] 58%|█████▊ | 3333/5772 [1:59:57<4:07:47, 6.10s/it] 58%|█████▊ | 3333/5772 [1:59:55<4:07:47, 6.10s/it] {'loss': 0.4553, 'learning_rate': 7.993398970716375e-06, 'epoch': 0.58} + 58%|█████▊ | 3333/5772 [1:59:57<4:07:47, 6.10s/it] {'loss': 0.4553, 'learning_rate': 7.993398970716375e-06, 'epoch': 0.58} + 58%|█████▊ | 3333/5772 [1:59:55<4:07:47, 6.10s/it] 58%|█████▊ | 3334/5772 [2:00:01<4:05:08, 6.03s/it] 58%|█████▊ | 3334/5772 [2:00:03<4:05:08, 6.03s/it] {'loss': 0.4636, 'learning_rate': 7.987901438548069e-06, 'epoch': 0.58} + 58%|█████▊ | 3334/5772 [2:00:03<4:05:08, 6.03s/it] {'loss': 0.4636, 'learning_rate': 7.987901438548069e-06, 'epoch': 0.58} + 58%|█████▊ | 3334/5772 [2:00:01<4:05:08, 6.03s/it] 58%|█████▊ | 3335/5772 [2:00:07<4:05:48, 6.05s/it] 58%|█████▊ | 3335/5772 [2:00:09<4:05:48, 6.05s/it] {'loss': 0.4721, 'learning_rate': 7.982404540079018e-06, 'epoch': 0.58} + 58%|█████▊ | 3335/5772 [2:00:09<4:05:48, 6.05s/it] {'loss': 0.4721, 'learning_rate': 7.982404540079018e-06, 'epoch': 0.58} + 58%|█████▊ | 3335/5772 [2:00:07<4:05:48, 6.05s/it] 58%|█████▊ | 3336/5772 [2:00:15<4:04:27, 6.02s/it] 58%|█████▊ | 3336/5772 [2:00:13<4:04:27, 6.02s/it] {'loss': 0.4641, 'learning_rate': 7.976908277040438e-06, 'epoch': 0.58} + 58%|█████▊ | 3336/5772 [2:00:15<4:04:27, 6.02s/it] {'loss': 0.4641, 'learning_rate': 7.976908277040438e-06, 'epoch': 0.58} + 58%|█████▊ | 3336/5772 [2:00:13<4:04:27, 6.02s/it] 58%|█████▊ | 3337/5772 [2:00:19<4:05:39, 6.05s/it] 58%|█████▊ | 3337/5772 [2:00:21<4:05:39, 6.05s/it] {'loss': 0.4606, 'learning_rate': 7.97141265116335e-06, 'epoch': 0.58} + 58%|█████▊ | 3337/5772 [2:00:21<4:05:39, 6.05s/it] {'loss': 0.4606, 'learning_rate': 7.97141265116335e-06, 'epoch': 0.58} + 58%|█████▊ | 3337/5772 [2:00:19<4:05:39, 6.05s/it] 58%|█████▊ | 3338/5772 [2:00:26<4:12:20, 6.22s/it] 58%|█████▊ | 3338/5772 [2:00:28<4:12:20, 6.22s/it] {'loss': 0.4607, 'learning_rate': 7.965917664178564e-06, 'epoch': 0.58} + 58%|█████▊ | 3338/5772 [2:00:28<4:12:20, 6.22s/it] {'loss': 0.4607, 'learning_rate': 7.965917664178564e-06, 'epoch': 0.58} + 58%|█████▊ | 3338/5772 [2:00:26<4:12:20, 6.22s/it] 58%|█████▊ | 3339/5772 [2:00:34<4:13:02, 6.24s/it] 58%|█████▊ | 3339/5772 [2:00:32<4:13:02, 6.24s/it] {'loss': 0.4664, 'learning_rate': 7.960423317816708e-06, 'epoch': 0.58} + 58%|█████▊ | 3339/5772 [2:00:34<4:13:02, 6.24s/it] {'loss': 0.4664, 'learning_rate': 7.960423317816708e-06, 'epoch': 0.58} + 58%|█████▊ | 3339/5772 [2:00:32<4:13:02, 6.24s/it] 58%|█████▊ | 3340/5772 [2:00:40<4:11:34, 6.21s/it] 58%|█████▊ | 3340/5772 [2:00:39<4:11:34, 6.21s/it] {'loss': 0.4586, 'learning_rate': 7.95492961380818e-06, 'epoch': 0.58} + 58%|█████▊ | 3340/5772 [2:00:40<4:11:34, 6.21s/it] {'loss': 0.4586, 'learning_rate': 7.95492961380818e-06, 'epoch': 0.58} + 58%|█████▊ | 3340/5772 [2:00:39<4:11:34, 6.21s/it] 58%|█████▊ | 3341/5772 [2:00:47<4:09:39, 6.16s/it] 58%|█████▊ | 3341/5772 [2:00:45<4:09:39, 6.16s/it] {'loss': 0.4743, 'learning_rate': 7.949436553883203e-06, 'epoch': 0.58} + 58%|█████▊ | 3341/5772 [2:00:47<4:09:39, 6.16s/it] {'loss': 0.4743, 'learning_rate': 7.949436553883203e-06, 'epoch': 0.58} + 58%|█████▊ | 3341/5772 [2:00:45<4:09:39, 6.16s/it] 58%|█████▊ | 3342/5772 [2:00:51<4:07:26, 6.11s/it] 58%|█████▊ | 3342/5772 [2:00:53<4:07:27, 6.11s/it] {'loss': 0.4768, 'learning_rate': 7.943944139771784e-06, 'epoch': 0.58} + 58%|█████▊ | 3342/5772 [2:00:53<4:07:27, 6.11s/it] {'loss': 0.4768, 'learning_rate': 7.943944139771784e-06, 'epoch': 0.58} + 58%|█████▊ | 3342/5772 [2:00:51<4:07:26, 6.11s/it] 58%|█████▊ | 3343/5772 [2:00:57<4:08:57, 6.15s/it] 58%|█████▊ | 3343/5772 [2:00:59<4:08:58, 6.15s/it] {'loss': 0.4768, 'learning_rate': 7.938452373203722e-06, 'epoch': 0.58} + 58%|█████▊ | 3343/5772 [2:00:59<4:08:58, 6.15s/it] {'loss': 0.4768, 'learning_rate': 7.938452373203722e-06, 'epoch': 0.58} + 58%|█████▊ | 3343/5772 [2:00:57<4:08:57, 6.15s/it] 58%|█████▊ | 3344/5772 [2:01:05<4:08:38, 6.14s/it] 58%|█████▊ | 3344/5772 [2:01:03<4:08:39, 6.14s/it] {'loss': 0.4606, 'learning_rate': 7.932961255908628e-06, 'epoch': 0.58} + 58%|█████▊ | 3344/5772 [2:01:05<4:08:38, 6.14s/it] {'loss': 0.4606, 'learning_rate': 7.932961255908628e-06, 'epoch': 0.58} + 58%|█████▊ | 3344/5772 [2:01:03<4:08:39, 6.14s/it] 58%|█████▊ | 3345/5772 [2:01:09<4:10:01, 6.18s/it] 58%|█████▊ | 3345/5772 [2:01:11<4:10:01, 6.18s/it] {'loss': 0.4795, 'learning_rate': 7.92747078961589e-06, 'epoch': 0.58} + 58%|█████▊ | 3345/5772 [2:01:11<4:10:01, 6.18s/it] {'loss': 0.4795, 'learning_rate': 7.92747078961589e-06, 'epoch': 0.58} + 58%|█████▊ | 3345/5772 [2:01:09<4:10:01, 6.18s/it] 58%|█████▊ | 3346/5772 [2:01:15<4:06:07, 6.09s/it] 58%|█████▊ | 3346/5772 [2:01:17<4:06:07, 6.09s/it] {'loss': 0.4605, 'learning_rate': 7.921980976054707e-06, 'epoch': 0.58} + 58%|█████▊ | 3346/5772 [2:01:17<4:06:07, 6.09s/it] {'loss': 0.4605, 'learning_rate': 7.921980976054707e-06, 'epoch': 0.58} + 58%|█████▊ | 3346/5772 [2:01:15<4:06:07, 6.09s/it] 58%|█████▊ | 3347/5772 [2:01:21<4:06:16, 6.09s/it] 58%|█████▊ | 3347/5772 [2:01:23<4:06:16, 6.09s/it] {'loss': 0.4802, 'learning_rate': 7.916491816954055e-06, 'epoch': 0.58} + 58%|█████▊ | 3347/5772 [2:01:23<4:06:16, 6.09s/it] {'loss': 0.4802, 'learning_rate': 7.916491816954055e-06, 'epoch': 0.58} + 58%|█████▊ | 3347/5772 [2:01:21<4:06:16, 6.09s/it] 58%|█████▊ | 3348/5772 [2:01:27<4:02:15, 6.00s/it] 58%|█████▊ | 3348/5772 [2:01:29<4:02:15, 6.00s/it] {'loss': 0.4603, 'learning_rate': 7.91100331404273e-06, 'epoch': 0.58} + 58%|█████▊ | 3348/5772 [2:01:29<4:02:15, 6.00s/it] {'loss': 0.4603, 'learning_rate': 7.91100331404273e-06, 'epoch': 0.58} + 58%|█████▊ | 3348/5772 [2:01:27<4:02:15, 6.00s/it] 58%|█████▊ | 3349/5772 [2:01:35<4:08:53, 6.16s/it] 58%|█████▊ | 3349/5772 [2:01:33<4:08:53, 6.16s/it] {'loss': 0.4667, 'learning_rate': 7.905515469049287e-06, 'epoch': 0.58} + 58%|█████▊ | 3349/5772 [2:01:35<4:08:53, 6.16s/it] {'loss': 0.4667, 'learning_rate': 7.905515469049287e-06, 'epoch': 0.58} + 58%|█████▊ | 3349/5772 [2:01:33<4:08:53, 6.16s/it]10 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... + 9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 58%|█████▊ | 3350/5772 [2:01:40<4:07:28, 6.13s/it]8 AutoResumeHook: Checking whether to suspend... + 58%|█████▊ | 3350/5772 [2:01:42<4:07:28, 6.13s/it]2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4751, 'learning_rate': 7.900028283702106e-06, 'epoch': 0.58} + 58%|█████▊ | 3350/5772 [2:01:42<4:07:28, 6.13s/it] {'loss': 0.4751, 'learning_rate': 7.900028283702106e-06, 'epoch': 0.58} + 58%|█████▊ | 3350/5772 [2:01:40<4:07:28, 6.13s/it] 58%|█████▊ | 3351/5772 [2:01:46<4:08:04, 6.15s/it] 58%|█████▊ | 3351/5772 [2:01:48<4:08:04, 6.15s/it] {'loss': 0.4701, 'learning_rate': 7.894541759729344e-06, 'epoch': 0.58} + 58%|█████▊ | 3351/5772 [2:01:48<4:08:04, 6.15s/it] {'loss': 0.4701, 'learning_rate': 7.894541759729344e-06, 'epoch': 0.58} + 58%|█████▊ | 3351/5772 [2:01:46<4:08:04, 6.15s/it] 58%|█████▊ | 3352/5772 [2:01:52<4:07:34, 6.14s/it] 58%|█████▊ | 3352/5772 [2:01:54<4:07:34, 6.14s/it] {'loss': 0.4495, 'learning_rate': 7.889055898858943e-06, 'epoch': 0.58} + 58%|█████▊ | 3352/5772 [2:01:54<4:07:34, 6.14s/it] {'loss': 0.4495, 'learning_rate': 7.889055898858943e-06, 'epoch': 0.58} + 58%|█████▊ | 3352/5772 [2:01:52<4:07:34, 6.14s/it] 58%|█████▊ | 3353/5772 [2:01:58<4:05:17, 6.08s/it] 58%|█████▊ | 3353/5772 [2:02:00<4:05:17, 6.08s/it] {'loss': 0.4592, 'learning_rate': 7.883570702818654e-06, 'epoch': 0.58} + 58%|█████▊ | 3353/5772 [2:02:00<4:05:17, 6.08s/it] {'loss': 0.4592, 'learning_rate': 7.883570702818654e-06, 'epoch': 0.58} + 58%|█████▊ | 3353/5772 [2:01:58<4:05:17, 6.08s/it] 58%|█████▊ | 3354/5772 [2:02:06<4:05:46, 6.10s/it] 58%|█████▊ | 3354/5772 [2:02:04<4:05:47, 6.10s/it] {'loss': 0.4667, 'learning_rate': 7.878086173336004e-06, 'epoch': 0.58} + 58%|█████▊ | 3354/5772 [2:02:06<4:05:46, 6.10s/it] {'loss': 0.4667, 'learning_rate': 7.878086173336004e-06, 'epoch': 0.58} + 58%|█████▊ | 3354/5772 [2:02:04<4:05:47, 6.10s/it] 58%|█████▊ | 3355/5772 [2:02:12<4:07:50, 6.15s/it] 58%|█████▊ | 3355/5772 [2:02:10<4:07:50, 6.15s/it] {'loss': 0.4748, 'learning_rate': 7.872602312138316e-06, 'epoch': 0.58} + 58%|█████▊ | 3355/5772 [2:02:12<4:07:50, 6.15s/it] {'loss': 0.4748, 'learning_rate': 7.872602312138316e-06, 'epoch': 0.58} + 58%|█████▊ | 3355/5772 [2:02:10<4:07:50, 6.15s/it] 58%|█████▊ | 3356/5772 [2:02:17<4:14:02, 6.31s/it] 58%|█████▊ | 3356/5772 [2:02:19<4:14:02, 6.31s/it] {'loss': 0.464, 'learning_rate': 7.867119120952698e-06, 'epoch': 0.58} + 58%|█████▊ | 3356/5772 [2:02:19<4:14:02, 6.31s/it] {'loss': 0.464, 'learning_rate': 7.867119120952698e-06, 'epoch': 0.58} + 58%|█████▊ | 3356/5772 [2:02:17<4:14:02, 6.31s/it] 58%|█████▊ | 3357/5772 [2:02:23<4:08:52, 6.18s/it] 58%|█████▊ | 3357/5772 [2:02:25<4:08:52, 6.18s/it] {'loss': 0.4657, 'learning_rate': 7.861636601506056e-06, 'epoch': 0.58} + 58%|█████▊ | 3357/5772 [2:02:25<4:08:52, 6.18s/it] {'loss': 0.4657, 'learning_rate': 7.861636601506056e-06, 'epoch': 0.58} + 58%|█████▊ | 3357/5772 [2:02:23<4:08:52, 6.18s/it] 58%|█████▊ | 3358/5772 [2:02:29<4:07:51, 6.16s/it] 58%|█████▊ | 3358/5772 [2:02:31<4:07:51, 6.16s/it] {'loss': 0.461, 'learning_rate': 7.856154755525078e-06, 'epoch': 0.58} + 58%|█████▊ | 3358/5772 [2:02:31<4:07:51, 6.16s/it] {'loss': 0.461, 'learning_rate': 7.856154755525078e-06, 'epoch': 0.58} + 58%|█████▊ | 3358/5772 [2:02:29<4:07:51, 6.16s/it] 58%|█████▊ | 3359/5772 [2:02:37<4:02:36, 6.03s/it] 58%|█████▊ | 3359/5772 [2:02:35<4:02:36, 6.03s/it] {'loss': 0.4753, 'learning_rate': 7.85067358473624e-06, 'epoch': 0.58} + 58%|█████▊ | 3359/5772 [2:02:37<4:02:36, 6.03s/it] {'loss': 0.4753, 'learning_rate': 7.85067358473624e-06, 'epoch': 0.58} + 58%|█████▊ | 3359/5772 [2:02:35<4:02:36, 6.03s/it] 58%|█████▊ | 3360/5772 [2:02:42<3:59:50, 5.97s/it] 58%|█████▊ | 3360/5772 [2:02:40<3:59:51, 5.97s/it] {'loss': 0.468, 'learning_rate': 7.845193090865807e-06, 'epoch': 0.58} + 58%|█████▊ | 3360/5772 [2:02:42<3:59:50, 5.97s/it] {'loss': 0.468, 'learning_rate': 7.845193090865807e-06, 'epoch': 0.58} + 58%|█████▊ | 3360/5772 [2:02:40<3:59:51, 5.97s/it] 58%|█████▊ | 3361/5772 [2:02:49<4:04:18, 6.08s/it] 58%|█████▊ | 3361/5772 [2:02:47<4:04:18, 6.08s/it] {'loss': 0.4724, 'learning_rate': 7.839713275639826e-06, 'epoch': 0.58} + 58%|█████▊ | 3361/5772 [2:02:49<4:04:18, 6.08s/it] {'loss': 0.4724, 'learning_rate': 7.839713275639826e-06, 'epoch': 0.58} + 58%|█████▊ | 3361/5772 [2:02:47<4:04:18, 6.08s/it] 58%|█████▊ | 3362/5772 [2:02:53<4:00:14, 5.98s/it] 58%|█████▊ | 3362/5772 [2:02:55<4:00:14, 5.98s/it] {'loss': 0.4642, 'learning_rate': 7.83423414078414e-06, 'epoch': 0.58} + 58%|█████▊ | 3362/5772 [2:02:55<4:00:14, 5.98s/it] {'loss': 0.4642, 'learning_rate': 7.83423414078414e-06, 'epoch': 0.58} + 58%|█████▊ | 3362/5772 [2:02:53<4:00:14, 5.98s/it] 58%|█████▊ | 3363/5772 [2:03:01<4:04:35, 6.09s/it] 58%|█████▊ | 3363/5772 [2:02:59<4:04:36, 6.09s/it] {'loss': 0.4805, 'learning_rate': 7.828755688024369e-06, 'epoch': 0.58} + 58%|█████▊ | 3363/5772 [2:03:01<4:04:35, 6.09s/it] {'loss': 0.4805, 'learning_rate': 7.828755688024369e-06, 'epoch': 0.58} + 58%|█████▊ | 3363/5772 [2:02:59<4:04:36, 6.09s/it] 58%|█████▊ | 3364/5772 [2:03:05<4:05:16, 6.11s/it] 58%|█████▊ | 3364/5772 [2:03:07<4:05:17, 6.11s/it] {'loss': 0.465, 'learning_rate': 7.823277919085919e-06, 'epoch': 0.58} + 58%|█████▊ | 3364/5772 [2:03:07<4:05:17, 6.11s/it] {'loss': 0.465, 'learning_rate': 7.823277919085919e-06, 'epoch': 0.58} + 58%|█████▊ | 3364/5772 [2:03:05<4:05:16, 6.11s/it] 58%|█████▊ | 3365/5772 [2:03:11<4:03:08, 6.06s/it] 58%|█████▊ | 3365/5772 [2:03:13<4:03:09, 6.06s/it] {'loss': 0.475, 'learning_rate': 7.817800835693993e-06, 'epoch': 0.58} + 58%|█████▊ | 3365/5772 [2:03:13<4:03:09, 6.06s/it] {'loss': 0.475, 'learning_rate': 7.817800835693993e-06, 'epoch': 0.58} + 58%|█████▊ | 3365/5772 [2:03:11<4:03:08, 6.06s/it] 58%|█████▊ | 3366/5772 [2:03:19<4:01:55, 6.03s/it] 58%|█████▊ | 3366/5772 [2:03:17<4:01:55, 6.03s/it] {'loss': 0.4709, 'learning_rate': 7.812324439573554e-06, 'epoch': 0.58} + 58%|█████▊ | 3366/5772 [2:03:19<4:01:55, 6.03s/it] {'loss': 0.4709, 'learning_rate': 7.812324439573554e-06, 'epoch': 0.58} + 58%|█████▊ | 3366/5772 [2:03:17<4:01:55, 6.03s/it] 58%|█████▊ | 3367/5772 [2:03:25<4:01:01, 6.01s/it] 58%|█████▊ | 3367/5772 [2:03:23<4:01:01, 6.01s/it] {'loss': 0.4754, 'learning_rate': 7.806848732449372e-06, 'epoch': 0.58} + 58%|█████▊ | 3367/5772 [2:03:25<4:01:01, 6.01s/it] {'loss': 0.4754, 'learning_rate': 7.806848732449372e-06, 'epoch': 0.58} + 58%|█████▊ | 3367/5772 [2:03:23<4:01:01, 6.01s/it] 58%|█████▊ | 3368/5772 [2:03:31<4:01:54, 6.04s/it] 58%|█████▊ | 3368/5772 [2:03:29<4:01:54, 6.04s/it] {'loss': 0.4573, 'learning_rate': 7.801373716045987e-06, 'epoch': 0.58} + 58%|█████▊ | 3368/5772 [2:03:31<4:01:54, 6.04s/it] {'loss': 0.4573, 'learning_rate': 7.801373716045987e-06, 'epoch': 0.58} + 58%|█████▊ | 3368/5772 [2:03:29<4:01:54, 6.04s/it] 58%|█████▊ | 3369/5772 [2:03:35<4:00:45, 6.01s/it] 58%|█████▊ | 3369/5772 [2:03:37<4:00:46, 6.01s/it] {'loss': 0.4714, 'learning_rate': 7.795899392087728e-06, 'epoch': 0.58} + 58%|█████▊ | 3369/5772 [2:03:37<4:00:46, 6.01s/it] {'loss': 0.4714, 'learning_rate': 7.795899392087728e-06, 'epoch': 0.58} + 58%|█████▊ | 3369/5772 [2:03:35<4:00:45, 6.01s/it] 58%|█████▊ | 3370/5772 [2:03:43<3:56:13, 5.90s/it] 58%|█████▊ | 3370/5772 [2:03:41<3:56:14, 5.90s/it] {'loss': 0.4527, 'learning_rate': 7.790425762298698e-06, 'epoch': 0.58} + 58%|█████▊ | 3370/5772 [2:03:43<3:56:13, 5.90s/it] {'loss': 0.4527, 'learning_rate': 7.790425762298698e-06, 'epoch': 0.58} + 58%|█████▊ | 3370/5772 [2:03:41<3:56:14, 5.90s/it] 58%|█████▊ | 3371/5772 [2:03:49<4:00:16, 6.00s/it] 58%|█████▊ | 3371/5772 [2:03:47<4:00:16, 6.00s/it] {'loss': 0.464, 'learning_rate': 7.784952828402789e-06, 'epoch': 0.58} + 58%|█████▊ | 3371/5772 [2:03:49<4:00:16, 6.00s/it] {'loss': 0.464, 'learning_rate': 7.784952828402789e-06, 'epoch': 0.58} + 58%|█████▊ | 3371/5772 [2:03:47<4:00:16, 6.00s/it] 58%|█████▊ | 3372/5772 [2:03:54<4:08:55, 6.22s/it] 58%|█████▊ | 3372/5772 [2:03:56<4:08:55, 6.22s/it] {'loss': 0.4656, 'learning_rate': 7.779480592123673e-06, 'epoch': 0.58} + 58%|█████▊ | 3372/5772 [2:03:56<4:08:55, 6.22s/it] {'loss': 0.4656, 'learning_rate': 7.779480592123673e-06, 'epoch': 0.58} + 58%|█████▊ | 3372/5772 [2:03:54<4:08:55, 6.22s/it] 58%|█████▊ | 3373/5772 [2:04:00<4:09:10, 6.23s/it] 58%|█████▊ | 3373/5772 [2:04:02<4:09:10, 6.23s/it] {'loss': 0.4687, 'learning_rate': 7.774009055184795e-06, 'epoch': 0.58} + 58%|█████▊ | 3373/5772 [2:04:02<4:09:10, 6.23s/it] {'loss': 0.4687, 'learning_rate': 7.774009055184795e-06, 'epoch': 0.58} + 58%|█████▊ | 3373/5772 [2:04:00<4:09:10, 6.23s/it] 58%|█████▊ | 3374/5772 [2:04:06<4:06:01, 6.16s/it] 58%|█████▊ | 3374/5772 [2:04:08<4:06:01, 6.16s/it] {'loss': 0.4685, 'learning_rate': 7.768538219309392e-06, 'epoch': 0.58} + 58%|█████▊ | 3374/5772 [2:04:08<4:06:01, 6.16s/it] {'loss': 0.4685, 'learning_rate': 7.768538219309392e-06, 'epoch': 0.58} + 58%|█████▊ | 3374/5772 [2:04:06<4:06:01, 6.16s/it] 58%|█████▊ | 3375/5772 [2:04:14<4:04:12, 6.11s/it] 58%|█████▊ | 3375/5772 [2:04:12<4:04:12, 6.11s/it] {'loss': 0.4703, 'learning_rate': 7.763068086220467e-06, 'epoch': 0.58} + 58%|█████▊ | 3375/5772 [2:04:14<4:04:12, 6.11s/it] {'loss': 0.4703, 'learning_rate': 7.763068086220467e-06, 'epoch': 0.58} + 58%|█████▊ | 3375/5772 [2:04:12<4:04:12, 6.11s/it] 58%|█████▊ | 3376/5772 [2:04:20<4:01:43, 6.05s/it] 58%|█████▊ | 3376/5772 [2:04:18<4:01:43, 6.05s/it] {'loss': 0.4587, 'learning_rate': 7.757598657640813e-06, 'epoch': 0.58} + 58%|█████▊ | 3376/5772 [2:04:20<4:01:43, 6.05s/it] {'loss': 0.4587, 'learning_rate': 7.757598657640813e-06, 'epoch': 0.58} + 58%|█████▊ | 3376/5772 [2:04:18<4:01:43, 6.05s/it] 59%|█████▊ | 3377/5772 [2:04:26<4:01:33, 6.05s/it] 59%|█████▊ | 3377/5772 [2:04:24<4:01:33, 6.05s/it] {'loss': 0.4714, 'learning_rate': 7.752129935292993e-06, 'epoch': 0.59} + 59%|█████▊ | 3377/5772 [2:04:26<4:01:33, 6.05s/it] {'loss': 0.4714, 'learning_rate': 7.752129935292993e-06, 'epoch': 0.59} + 59%|█████▊ | 3377/5772 [2:04:24<4:01:33, 6.05s/it] 59%|█████▊ | 3378/5772 [2:04:32<4:05:14, 6.15s/it] 59%|█████▊ | 3378/5772 [2:04:30<4:05:15, 6.15s/it] {'loss': 0.4716, 'learning_rate': 7.746661920899351e-06, 'epoch': 0.59} + 59%|█████▊ | 3378/5772 [2:04:32<4:05:14, 6.15s/it] {'loss': 0.4716, 'learning_rate': 7.746661920899351e-06, 'epoch': 0.59} + 59%|█████▊ | 3378/5772 [2:04:30<4:05:15, 6.15s/it] 59%|█████▊ | 3379/5772 [2:04:38<4:01:25, 6.05s/it] 59%|█████▊ | 3379/5772 [2:04:36<4:01:25, 6.05s/it] {'loss': 0.4672, 'learning_rate': 7.74119461618201e-06, 'epoch': 0.59} + 59%|█████▊ | 3379/5772 [2:04:38<4:01:25, 6.05s/it] {'loss': 0.4672, 'learning_rate': 7.74119461618201e-06, 'epoch': 0.59} + 59%|█████▊ | 3379/5772 [2:04:36<4:01:25, 6.05s/it] 59%|█████▊ | 3380/5772 [2:04:42<3:59:08, 6.00s/it] 59%|█████▊ | 3380/5772 [2:04:44<3:59:08, 6.00s/it] {'loss': 0.4719, 'learning_rate': 7.735728022862865e-06, 'epoch': 0.59} + 59%|█████▊ | 3380/5772 [2:04:44<3:59:08, 6.00s/it] {'loss': 0.4719, 'learning_rate': 7.735728022862865e-06, 'epoch': 0.59} + 59%|█████▊ | 3380/5772 [2:04:42<3:59:08, 6.00s/it] 59%|█████▊ | 3381/5772 [2:04:48<3:59:58, 6.02s/it] 59%|█████▊ | 3381/5772 [2:04:50<3:59:59, 6.02s/it] {'loss': 0.4781, 'learning_rate': 7.730262142663591e-06, 'epoch': 0.59} + 59%|█████▊ | 3381/5772 [2:04:50<3:59:59, 6.02s/it] {'loss': 0.4781, 'learning_rate': 7.730262142663591e-06, 'epoch': 0.59} + 59%|█████▊ | 3381/5772 [2:04:48<3:59:58, 6.02s/it] 59%|█████▊ | 3382/5772 [2:04:54<4:00:58, 6.05s/it] 59%|█████▊ | 3382/5772 [2:04:56<4:00:58, 6.05s/it] {'loss': 0.4406, 'learning_rate': 7.724796977305632e-06, 'epoch': 0.59} + 59%|█████▊ | 3382/5772 [2:04:56<4:00:58, 6.05s/it] {'loss': 0.4406, 'learning_rate': 7.724796977305632e-06, 'epoch': 0.59} + 59%|█████▊ | 3382/5772 [2:04:54<4:00:58, 6.05s/it] 59%|█████▊ | 3383/5772 [2:05:00<4:03:22, 6.11s/it] 59%|█████▊ | 3383/5772 [2:05:02<4:03:23, 6.11s/it] {'loss': 0.4652, 'learning_rate': 7.71933252851022e-06, 'epoch': 0.59} + 59%|█████▊ | 3383/5772 [2:05:02<4:03:23, 6.11s/it] {'loss': 0.4652, 'learning_rate': 7.71933252851022e-06, 'epoch': 0.59} + 59%|█████▊ | 3383/5772 [2:05:00<4:03:22, 6.11s/it] 59%|█████▊ | 3384/5772 [2:05:07<4:10:10, 6.29s/it] 59%|█████▊ | 3384/5772 [2:05:09<4:10:10, 6.29s/it] {'loss': 0.4732, 'learning_rate': 7.713868797998342e-06, 'epoch': 0.59} + 59%|█████▊ | 3384/5772 [2:05:09<4:10:10, 6.29s/it] {'loss': 0.4732, 'learning_rate': 7.713868797998342e-06, 'epoch': 0.59} + 59%|█████▊ | 3384/5772 [2:05:07<4:10:10, 6.29s/it] 59%|█████▊ | 3385/5772 [2:05:13<4:09:04, 6.26s/it] 59%|█████▊ | 3385/5772 [2:05:15<4:09:04, 6.26s/it] {'loss': 0.4693, 'learning_rate': 7.708405787490777e-06, 'epoch': 0.59} + 59%|█████▊ | 3385/5772 [2:05:15<4:09:04, 6.26s/it] {'loss': 0.4693, 'learning_rate': 7.708405787490777e-06, 'epoch': 0.59} + 59%|█████▊ | 3385/5772 [2:05:13<4:09:04, 6.26s/it] 59%|█████▊ | 3386/5772 [2:05:19<4:08:31, 6.25s/it] 59%|█████▊ | 3386/5772 [2:05:21<4:08:31, 6.25s/it] {'loss': 0.4669, 'learning_rate': 7.702943498708069e-06, 'epoch': 0.59} + 59%|█████▊ | 3386/5772 [2:05:21<4:08:31, 6.25s/it] {'loss': 0.4669, 'learning_rate': 7.702943498708069e-06, 'epoch': 0.59} + 59%|█████▊ | 3386/5772 [2:05:19<4:08:31, 6.25s/it] 59%|█████▊ | 3387/5772 [2:05:26<4:08:01, 6.24s/it] 59%|█████▊ | 3387/5772 [2:05:28<4:08:00, 6.24s/it] {'loss': 0.4611, 'learning_rate': 7.697481933370535e-06, 'epoch': 0.59} + 59%|█████▊ | 3387/5772 [2:05:28<4:08:00, 6.24s/it] {'loss': 0.4611, 'learning_rate': 7.697481933370535e-06, 'epoch': 0.59} + 59%|█████▊ | 3387/5772 [2:05:26<4:08:01, 6.24s/it] 59%|█████▊ | 3388/5772 [2:05:32<4:08:39, 6.26s/it] 59%|█████▊ | 3388/5772 [2:05:34<4:08:39, 6.26s/it] {'loss': 0.4602, 'learning_rate': 7.692021093198264e-06, 'epoch': 0.59} + 59%|█████▊ | 3388/5772 [2:05:34<4:08:39, 6.26s/it] {'loss': 0.4602, 'learning_rate': 7.692021093198264e-06, 'epoch': 0.59} + 59%|█████▊ | 3388/5772 [2:05:32<4:08:39, 6.26s/it] 59%|█████▊ | 3389/5772 [2:05:38<4:09:34, 6.28s/it] 59%|█████▊ | 3389/5772 [2:05:40<4:09:34, 6.28s/it] {'loss': 0.4677, 'learning_rate': 7.686560979911115e-06, 'epoch': 0.59} + 59%|█████▊ | 3389/5772 [2:05:40<4:09:34, 6.28s/it] {'loss': 0.4677, 'learning_rate': 7.686560979911115e-06, 'epoch': 0.59} + 59%|█████▊ | 3389/5772 [2:05:38<4:09:34, 6.28s/it] 59%|█████▊ | 3390/5772 [2:05:44<4:06:55, 6.22s/it] 59%|█████▊ | 3390/5772 [2:05:46<4:06:55, 6.22s/it] {'loss': 0.463, 'learning_rate': 7.681101595228727e-06, 'epoch': 0.59} + {'loss': 0.463, 'learning_rate': 7.681101595228727e-06, 'epoch': 0.59} 59%|█████▊ | 3390/5772 [2:05:46<4:06:55, 6.22s/it] + 59%|█████▊ | 3390/5772 [2:05:44<4:06:55, 6.22s/it] 59%|█████▊ | 3391/5772 [2:05:51<4:06:37, 6.21s/it] 59%|█████▊ | 3391/5772 [2:05:53<4:06:37, 6.21s/it] {'loss': 0.4554, 'learning_rate': 7.67564294087049e-06, 'epoch': 0.59} + 59%|█████▊ | 3391/5772 [2:05:53<4:06:37, 6.21s/it] {'loss': 0.4554, 'learning_rate': 7.67564294087049e-06, 'epoch': 0.59} + 59%|█████▊ | 3391/5772 [2:05:51<4:06:37, 6.21s/it] 59%|█████▉ | 3392/5772 [2:05:59<4:04:36, 6.17s/it] 59%|█████▉ | 3392/5772 [2:05:57<4:04:36, 6.17s/it] {'loss': 0.4666, 'learning_rate': 7.670185018555592e-06, 'epoch': 0.59} + 59%|█████▉ | 3392/5772 [2:05:59<4:04:36, 6.17s/it] {'loss': 0.4666, 'learning_rate': 7.670185018555592e-06, 'epoch': 0.59} + 59%|█████▉ | 3392/5772 [2:05:57<4:04:36, 6.17s/it] 59%|█████▉ | 3393/5772 [2:06:03<4:06:50, 6.23s/it] 59%|█████▉ | 3393/5772 [2:06:05<4:06:50, 6.23s/it] {'loss': 0.4804, 'learning_rate': 7.664727830002967e-06, 'epoch': 0.59} + 59%|█████▉ | 3393/5772 [2:06:05<4:06:50, 6.23s/it] {'loss': 0.4804, 'learning_rate': 7.664727830002967e-06, 'epoch': 0.59} + 59%|█████▉ | 3393/5772 [2:06:03<4:06:50, 6.23s/it] 59%|█████▉ | 3394/5772 [2:06:09<4:08:10, 6.26s/it] 59%|█████▉ | 3394/5772 [2:06:11<4:08:10, 6.26s/it] {'loss': 0.4709, 'learning_rate': 7.659271376931327e-06, 'epoch': 0.59} + 59%|█████▉ | 3394/5772 [2:06:11<4:08:10, 6.26s/it] {'loss': 0.4709, 'learning_rate': 7.659271376931327e-06, 'epoch': 0.59} + 59%|█████▉ | 3394/5772 [2:06:09<4:08:10, 6.26s/it] 59%|█████▉ | 3395/5772 [2:06:17<4:06:58, 6.23s/it] 59%|█████▉ | 3395/5772 [2:06:15<4:06:58, 6.23s/it] {'loss': 0.4663, 'learning_rate': 7.653815661059156e-06, 'epoch': 0.59} + 59%|█████▉ | 3395/5772 [2:06:17<4:06:58, 6.23s/it] {'loss': 0.4663, 'learning_rate': 7.653815661059156e-06, 'epoch': 0.59} + 59%|█████▉ | 3395/5772 [2:06:15<4:06:58, 6.23s/it] 59%|█████▉ | 3396/5772 [2:06:24<4:06:35, 6.23s/it] 59%|█████▉ | 3396/5772 [2:06:22<4:06:35, 6.23s/it] {'loss': 0.4557, 'learning_rate': 7.648360684104695e-06, 'epoch': 0.59} + 59%|█████▉ | 3396/5772 [2:06:24<4:06:35, 6.23s/it] {'loss': 0.4557, 'learning_rate': 7.648360684104695e-06, 'epoch': 0.59} + 59%|█████▉ | 3396/5772 [2:06:22<4:06:35, 6.23s/it] 59%|█████▉ | 3397/5772 [2:06:30<4:02:09, 6.12s/it] 59%|█████▉ | 3397/5772 [2:06:28<4:02:09, 6.12s/it] {'loss': 0.4693, 'learning_rate': 7.642906447785962e-06, 'epoch': 0.59} + 59%|█████▉ | 3397/5772 [2:06:30<4:02:09, 6.12s/it] {'loss': 0.4693, 'learning_rate': 7.642906447785962e-06, 'epoch': 0.59} + 59%|█████▉ | 3397/5772 [2:06:28<4:02:09, 6.12s/it] 59%|█████▉ | 3398/5772 [2:06:36<4:02:45, 6.14s/it] 59%|█████▉ | 3398/5772 [2:06:34<4:02:45, 6.14s/it] {'loss': 0.4672, 'learning_rate': 7.637452953820737e-06, 'epoch': 0.59} + 59%|█████▉ | 3398/5772 [2:06:36<4:02:45, 6.14s/it] {'loss': 0.4672, 'learning_rate': 7.637452953820737e-06, 'epoch': 0.59} + 59%|█████▉ | 3398/5772 [2:06:34<4:02:45, 6.14s/it] 59%|█████▉ | 3399/5772 [2:06:40<4:09:45, 6.31s/it] 59%|█████▉ | 3399/5772 [2:06:42<4:09:45, 6.31s/it] {'loss': 0.4754, 'learning_rate': 7.632000203926564e-06, 'epoch': 0.59} + 59%|█████▉ | 3399/5772 [2:06:42<4:09:45, 6.31s/it] {'loss': 0.4754, 'learning_rate': 7.632000203926564e-06, 'epoch': 0.59} + 59%|█████▉ | 3399/5772 [2:06:40<4:09:45, 6.31s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +51 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 59%|█████▉ | 3400/5772 [2:06:49<4:09:03, 6.30s/it]4 AutoResumeHook: Checking whether to suspend... + 59%|█████▉ | 3400/5772 [2:06:47<4:09:03, 6.30s/it]7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4541, 'learning_rate': 7.626548199820768e-06, 'epoch': 0.59} + 59%|█████▉ | 3400/5772 [2:06:47<4:09:03, 6.30s/it]{'loss': 0.4541, 'learning_rate': 7.626548199820768e-06, 'epoch': 0.59} + 59%|█████▉ | 3400/5772 [2:06:49<4:09:03, 6.30s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 59%|█████▉ | 3401/5772 [2:07:11<7:40:06, 11.64s/it] 59%|█████▉ | 3401/5772 [2:07:13<7:40:06, 11.64s/it] {'loss': 0.483, 'learning_rate': 7.621096943220416e-06, 'epoch': 0.59} + 59%|█████▉ | 3401/5772 [2:07:13<7:40:06, 11.64s/it] {'loss': 0.483, 'learning_rate': 7.621096943220416e-06, 'epoch': 0.59} + 59%|█████▉ | 3401/5772 [2:07:11<7:40:06, 11.64s/it] 59%|█████▉ | 3402/5772 [2:07:19<6:36:40, 10.04s/it] 59%|█████▉ | 3402/5772 [2:07:17<6:36:39, 10.04s/it] {'loss': 0.4626, 'learning_rate': 7.6156464358423586e-06, 'epoch': 0.59} + 59%|█████▉ | 3402/5772 [2:07:19<6:36:40, 10.04s/it] {'loss': 0.4626, 'learning_rate': 7.6156464358423586e-06, 'epoch': 0.59} + 59%|█████▉ | 3402/5772 [2:07:17<6:36:39, 10.04s/it] 59%|█████▉ | 3403/5772 [2:07:24<5:56:01, 9.02s/it] 59%|█████▉ | 3403/5772 [2:07:26<5:56:01, 9.02s/it] {'loss': 0.4719, 'learning_rate': 7.610196679403195e-06, 'epoch': 0.59} + 59%|█████▉ | 3403/5772 [2:07:26<5:56:01, 9.02s/it] {'loss': 0.4719, 'learning_rate': 7.610196679403195e-06, 'epoch': 0.59} + 59%|█████▉ | 3403/5772 [2:07:24<5:56:01, 9.02s/it] 59%|█████▉ | 3404/5772 [2:07:30<5:22:15, 8.17s/it] 59%|█████▉ | 3404/5772 [2:07:32<5:22:15, 8.17s/it] {'loss': 0.4599, 'learning_rate': 7.6047476756193035e-06, 'epoch': 0.59} + 59%|█████▉ | 3404/5772 [2:07:32<5:22:15, 8.17s/it] {'loss': 0.4599, 'learning_rate': 7.6047476756193035e-06, 'epoch': 0.59} + 59%|█████▉ | 3404/5772 [2:07:30<5:22:15, 8.17s/it] 59%|█████▉ | 3405/5772 [2:07:36<4:57:06, 7.53s/it] 59%|█████▉ | 3405/5772 [2:07:38<4:57:07, 7.53s/it] {'loss': 0.4793, 'learning_rate': 7.599299426206812e-06, 'epoch': 0.59} + 59%|█████▉ | 3405/5772 [2:07:38<4:57:07, 7.53s/it] {'loss': 0.4793, 'learning_rate': 7.599299426206812e-06, 'epoch': 0.59} + 59%|█████▉ | 3405/5772 [2:07:36<4:57:06, 7.53s/it] 59%|█████▉ | 3406/5772 [2:07:42<4:41:31, 7.14s/it] 59%|█████▉ | 3406/5772 [2:07:44<4:41:31, 7.14s/it] {'loss': 0.4594, 'learning_rate': 7.5938519328816156e-06, 'epoch': 0.59} + 59%|█████▉ | 3406/5772 [2:07:44<4:41:31, 7.14s/it] {'loss': 0.4594, 'learning_rate': 7.5938519328816156e-06, 'epoch': 0.59} + 59%|█████▉ | 3406/5772 [2:07:42<4:41:31, 7.14s/it] 59%|█████▉ | 3407/5772 [2:07:48<4:29:19, 6.83s/it] 59%|█████▉ | 3407/5772 [2:07:50<4:29:19, 6.83s/it] {'loss': 0.4724, 'learning_rate': 7.588405197359381e-06, 'epoch': 0.59} + 59%|█████▉ | 3407/5772 [2:07:50<4:29:19, 6.83s/it] {'loss': 0.4724, 'learning_rate': 7.588405197359381e-06, 'epoch': 0.59} + 59%|█████▉ | 3407/5772 [2:07:48<4:29:19, 6.83s/it] 59%|█████▉ | 3408/5772 [2:07:54<4:19:46, 6.59s/it] 59%|█████▉ | 3408/5772 [2:07:56<4:19:46, 6.59s/it] {'loss': 0.4478, 'learning_rate': 7.582959221355514e-06, 'epoch': 0.59} + 59%|█████▉ | 3408/5772 [2:07:56<4:19:46, 6.59s/it] {'loss': 0.4478, 'learning_rate': 7.582959221355514e-06, 'epoch': 0.59} + 59%|█████▉ | 3408/5772 [2:07:54<4:19:46, 6.59s/it] 59%|█████▉ | 3409/5772 [2:08:01<4:15:36, 6.49s/it] 59%|█████▉ | 3409/5772 [2:08:03<4:15:36, 6.49s/it] {'loss': 0.4568, 'learning_rate': 7.577514006585209e-06, 'epoch': 0.59} + 59%|█████▉ | 3409/5772 [2:08:03<4:15:36, 6.49s/it] {'loss': 0.4568, 'learning_rate': 7.577514006585209e-06, 'epoch': 0.59} + 59%|█████▉ | 3409/5772 [2:08:01<4:15:36, 6.49s/it] 59%|█████▉ | 3410/5772 [2:08:07<4:10:37, 6.37s/it] 59%|█████▉ | 3410/5772 [2:08:09<4:10:38, 6.37s/it] {'loss': 0.4599, 'learning_rate': 7.572069554763391e-06, 'epoch': 0.59} + 59%|█████▉ | 3410/5772 [2:08:09<4:10:38, 6.37s/it] {'loss': 0.4599, 'learning_rate': 7.572069554763391e-06, 'epoch': 0.59} + 59%|█████▉ | 3410/5772 [2:08:07<4:10:37, 6.37s/it] 59%|█████▉ | 3411/5772 [2:08:15<4:05:41, 6.24s/it] 59%|█████▉ | 3411/5772 [2:08:13<4:05:43, 6.24s/it] {'loss': 0.468, 'learning_rate': 7.5666258676047735e-06, 'epoch': 0.59} + 59%|█████▉ | 3411/5772 [2:08:15<4:05:41, 6.24s/it] {'loss': 0.468, 'learning_rate': 7.5666258676047735e-06, 'epoch': 0.59} + 59%|█████▉ | 3411/5772 [2:08:13<4:05:43, 6.24s/it] 59%|█████▉ | 3412/5772 [2:08:21<4:10:25, 6.37s/it] 59%|█████▉ | 3412/5772 [2:08:19<4:10:25, 6.37s/it] {'loss': 0.4635, 'learning_rate': 7.561182946823805e-06, 'epoch': 0.59} + 59%|█████▉ | 3412/5772 [2:08:21<4:10:25, 6.37s/it] {'loss': 0.4635, 'learning_rate': 7.561182946823805e-06, 'epoch': 0.59} + 59%|█████▉ | 3412/5772 [2:08:19<4:10:25, 6.37s/it] 59%|█████▉ | 3413/5772 [2:08:28<4:09:13, 6.34s/it] 59%|█████▉ | 3413/5772 [2:08:26<4:09:14, 6.34s/it] {'loss': 0.4705, 'learning_rate': 7.5557407941347095e-06, 'epoch': 0.59} + 59%|█████▉ | 3413/5772 [2:08:28<4:09:13, 6.34s/it] {'loss': 0.4705, 'learning_rate': 7.5557407941347095e-06, 'epoch': 0.59} + 59%|█████▉ | 3413/5772 [2:08:26<4:09:14, 6.34s/it] 59%|█████▉ | 3414/5772 [2:08:32<4:07:10, 6.29s/it] 59%|█████▉ | 3414/5772 [2:08:34<4:07:11, 6.29s/it] {'loss': 0.469, 'learning_rate': 7.550299411251461e-06, 'epoch': 0.59} + 59%|█████▉ | 3414/5772 [2:08:34<4:07:11, 6.29s/it] {'loss': 0.469, 'learning_rate': 7.550299411251461e-06, 'epoch': 0.59} + 59%|█████▉ | 3414/5772 [2:08:32<4:07:10, 6.29s/it] 59%|█████▉ | 3415/5772 [2:08:38<4:07:30, 6.30s/it] 59%|█████▉ | 3415/5772 [2:08:40<4:07:31, 6.30s/it] {'loss': 0.4667, 'learning_rate': 7.54485879988779e-06, 'epoch': 0.59} + 59%|█████▉ | 3415/5772 [2:08:40<4:07:31, 6.30s/it] {'loss': 0.4667, 'learning_rate': 7.54485879988779e-06, 'epoch': 0.59} + 59%|█████▉ | 3415/5772 [2:08:38<4:07:30, 6.30s/it] 59%|█████▉ | 3416/5772 [2:08:44<4:02:26, 6.17s/it] 59%|█████▉ | 3416/5772 [2:08:46<4:02:25, 6.17s/it] {'loss': 0.464, 'learning_rate': 7.539418961757195e-06, 'epoch': 0.59} + 59%|█████▉ | 3416/5772 [2:08:46<4:02:25, 6.17s/it] {'loss': 0.464, 'learning_rate': 7.539418961757195e-06, 'epoch': 0.59} + 59%|█████▉ | 3416/5772 [2:08:44<4:02:26, 6.17s/it] 59%|█████▉ | 3417/5772 [2:08:50<3:58:57, 6.09s/it] 59%|█████▉ | 3417/5772 [2:08:52<3:58:57, 6.09s/it] {'loss': 0.4575, 'learning_rate': 7.533979898572909e-06, 'epoch': 0.59} + 59%|█████▉ | 3417/5772 [2:08:52<3:58:57, 6.09s/it] {'loss': 0.4575, 'learning_rate': 7.533979898572909e-06, 'epoch': 0.59} + 59%|█████▉ | 3417/5772 [2:08:50<3:58:57, 6.09s/it] 59%|█████▉ | 3418/5772 [2:08:56<3:56:43, 6.03s/it] 59%|█████▉ | 3418/5772 [2:08:58<3:56:43, 6.03s/it] {'loss': 0.4699, 'learning_rate': 7.528541612047953e-06, 'epoch': 0.59} + 59%|█████▉ | 3418/5772 [2:08:58<3:56:43, 6.03s/it] {'loss': 0.4699, 'learning_rate': 7.528541612047953e-06, 'epoch': 0.59} + 59%|█████▉ | 3418/5772 [2:08:56<3:56:43, 6.03s/it] 59%|█████▉ | 3419/5772 [2:09:04<3:57:01, 6.04s/it] 59%|█████▉ | 3419/5772 [2:09:02<3:57:02, 6.04s/it] {'loss': 0.4639, 'learning_rate': 7.523104103895066e-06, 'epoch': 0.59} + 59%|█████▉ | 3419/5772 [2:09:04<3:57:01, 6.04s/it] {'loss': 0.4639, 'learning_rate': 7.523104103895066e-06, 'epoch': 0.59} + 59%|█████▉ | 3419/5772 [2:09:02<3:57:02, 6.04s/it] 59%|█████▉ | 3420/5772 [2:09:08<3:56:31, 6.03s/it] 59%|█████▉ | 3420/5772 [2:09:10<3:56:31, 6.03s/it] {'loss': 0.4799, 'learning_rate': 7.517667375826772e-06, 'epoch': 0.59} + 59%|█████▉ | 3420/5772 [2:09:10<3:56:31, 6.03s/it] {'loss': 0.4799, 'learning_rate': 7.517667375826772e-06, 'epoch': 0.59} + 59%|█████▉ | 3420/5772 [2:09:08<3:56:31, 6.03s/it] 59%|█████▉ | 3421/5772 [2:09:14<3:56:52, 6.05s/it] 59%|█████▉ | 3421/5772 [2:09:16<3:56:52, 6.05s/it] {'loss': 0.4692, 'learning_rate': 7.512231429555339e-06, 'epoch': 0.59} + 59%|█████▉ | 3421/5772 [2:09:16<3:56:52, 6.05s/it] {'loss': 0.4692, 'learning_rate': 7.512231429555339e-06, 'epoch': 0.59} + 59%|█████▉ | 3421/5772 [2:09:14<3:56:52, 6.05s/it] 59%|█████▉ | 3422/5772 [2:09:20<4:01:35, 6.17s/it] 59%|█████▉ | 3422/5772 [2:09:22<4:01:35, 6.17s/it] {'loss': 0.4812, 'learning_rate': 7.506796266792782e-06, 'epoch': 0.59} + 59%|█████▉ | 3422/5772 [2:09:22<4:01:35, 6.17s/it] {'loss': 0.4812, 'learning_rate': 7.506796266792782e-06, 'epoch': 0.59} + 59%|█████▉ | 3422/5772 [2:09:20<4:01:35, 6.17s/it] 59%|█████▉ | 3423/5772 [2:09:26<3:56:23, 6.04s/it] 59%|█████▉ | 3423/5772 [2:09:28<3:56:23, 6.04s/it] {'loss': 0.4523, 'learning_rate': 7.501361889250882e-06, 'epoch': 0.59} + 59%|█████▉ | 3423/5772 [2:09:28<3:56:23, 6.04s/it] {'loss': 0.4523, 'learning_rate': 7.501361889250882e-06, 'epoch': 0.59} + 59%|█████▉ | 3423/5772 [2:09:26<3:56:23, 6.04s/it] 59%|█████▉ | 3424/5772 [2:09:33<4:07:15, 6.32s/it] 59%|█████▉ | 3424/5772 [2:09:35<4:07:15, 6.32s/it] {'loss': 0.4693, 'learning_rate': 7.4959282986411595e-06, 'epoch': 0.59} + 59%|█████▉ | 3424/5772 [2:09:35<4:07:15, 6.32s/it] {'loss': 0.4693, 'learning_rate': 7.4959282986411595e-06, 'epoch': 0.59} + 59%|█████▉ | 3424/5772 [2:09:33<4:07:15, 6.32s/it] 59%|█████▉ | 3425/5772 [2:09:39<4:03:44, 6.23s/it] 59%|█████▉ | 3425/5772 [2:09:41<4:03:44, 6.23s/it] {'loss': 0.4711, 'learning_rate': 7.490495496674899e-06, 'epoch': 0.59} + 59%|█████▉ | 3425/5772 [2:09:41<4:03:44, 6.23s/it] {'loss': 0.4711, 'learning_rate': 7.490495496674899e-06, 'epoch': 0.59} + 59%|█████▉ | 3425/5772 [2:09:39<4:03:44, 6.23s/it] 59%|█████▉ | 3426/5772 [2:09:45<3:59:44, 6.13s/it] 59%|█████▉ | 3426/5772 [2:09:47<3:59:44, 6.13s/it] {'loss': 0.4732, 'learning_rate': 7.485063485063124e-06, 'epoch': 0.59} + 59%|█████▉ | 3426/5772 [2:09:47<3:59:44, 6.13s/it] {'loss': 0.4732, 'learning_rate': 7.485063485063124e-06, 'epoch': 0.59} + 59%|█████▉ | 3426/5772 [2:09:45<3:59:44, 6.13s/it] 59%|█████▉ | 3427/5772 [2:09:52<4:08:03, 6.35s/it] 59%|█████▉ | 3427/5772 [2:09:54<4:08:03, 6.35s/it] {'loss': 0.4637, 'learning_rate': 7.479632265516626e-06, 'epoch': 0.59} + 59%|█████▉ | 3427/5772 [2:09:54<4:08:03, 6.35s/it] {'loss': 0.4637, 'learning_rate': 7.479632265516626e-06, 'epoch': 0.59} + 59%|█████▉ | 3427/5772 [2:09:52<4:08:03, 6.35s/it] 59%|█████▉ | 3428/5772 [2:09:58<4:05:53, 6.29s/it] 59%|█████▉ | 3428/5772 [2:10:00<4:05:54, 6.29s/it] {'loss': 0.4777, 'learning_rate': 7.474201839745932e-06, 'epoch': 0.59} + 59%|█████▉ | 3428/5772 [2:10:00<4:05:54, 6.29s/it] {'loss': 0.4777, 'learning_rate': 7.474201839745932e-06, 'epoch': 0.59} + 59%|█████▉ | 3428/5772 [2:09:58<4:05:53, 6.29s/it] 59%|█████▉ | 3429/5772 [2:10:04<4:03:04, 6.22s/it] 59%|█████▉ | 3429/5772 [2:10:06<4:03:04, 6.22s/it] {'loss': 0.4588, 'learning_rate': 7.468772209461324e-06, 'epoch': 0.59} + 59%|█████▉ | 3429/5772 [2:10:06<4:03:04, 6.22s/it] {'loss': 0.4588, 'learning_rate': 7.468772209461324e-06, 'epoch': 0.59} + 59%|█████▉ | 3429/5772 [2:10:04<4:03:04, 6.22s/it] 59%|█████▉ | 3430/5772 [2:10:10<4:04:36, 6.27s/it] 59%|█████▉ | 3430/5772 [2:10:12<4:04:36, 6.27s/it] {'loss': 0.4633, 'learning_rate': 7.463343376372837e-06, 'epoch': 0.59} + 59%|█████▉ | 3430/5772 [2:10:12<4:04:36, 6.27s/it] {'loss': 0.4633, 'learning_rate': 7.463343376372837e-06, 'epoch': 0.59} + 59%|█████▉ | 3430/5772 [2:10:10<4:04:36, 6.27s/it] 59%|█████▉ | 3431/5772 [2:10:17<4:07:13, 6.34s/it] 59%|█████▉ | 3431/5772 [2:10:19<4:07:13, 6.34s/it] {'loss': 0.4508, 'learning_rate': 7.457915342190247e-06, 'epoch': 0.59} + 59%|█████▉ | 3431/5772 [2:10:19<4:07:13, 6.34s/it] {'loss': 0.4508, 'learning_rate': 7.457915342190247e-06, 'epoch': 0.59} + 59%|█████▉ | 3431/5772 [2:10:17<4:07:13, 6.34s/it] 59%|█████▉ | 3432/5772 [2:10:23<4:00:42, 6.17s/it] 59%|█████▉ | 3432/5772 [2:10:25<4:00:42, 6.17s/it] {'loss': 0.4695, 'learning_rate': 7.452488108623089e-06, 'epoch': 0.59} + 59%|█████▉ | 3432/5772 [2:10:25<4:00:42, 6.17s/it] {'loss': 0.4695, 'learning_rate': 7.452488108623089e-06, 'epoch': 0.59} + 59%|█████▉ | 3432/5772 [2:10:23<4:00:42, 6.17s/it] 59%|█████▉ | 3433/5772 [2:10:29<4:03:56, 6.26s/it] 59%|█████▉ | 3433/5772 [2:10:31<4:03:56, 6.26s/it] {'loss': 0.4594, 'learning_rate': 7.447061677380635e-06, 'epoch': 0.59} + 59%|█████▉ | 3433/5772 [2:10:31<4:03:56, 6.26s/it] {'loss': 0.4594, 'learning_rate': 7.447061677380635e-06, 'epoch': 0.59} + 59%|█████▉ | 3433/5772 [2:10:29<4:03:56, 6.26s/it] 59%|█████▉ | 3434/5772 [2:10:35<3:59:13, 6.14s/it] 59%|█████▉ | 3434/5772 [2:10:37<3:59:13, 6.14s/it] {'loss': 0.4679, 'learning_rate': 7.441636050171909e-06, 'epoch': 0.59} + 59%|█████▉ | 3434/5772 [2:10:37<3:59:13, 6.14s/it] {'loss': 0.4679, 'learning_rate': 7.441636050171909e-06, 'epoch': 0.59} + 59%|█████▉ | 3434/5772 [2:10:35<3:59:13, 6.14s/it] 60%|█████▉ | 3435/5772 [2:10:41<3:58:28, 6.12s/it] 60%|█████▉ | 3435/5772 [2:10:43<3:58:28, 6.12s/it] {'loss': 0.4748, 'learning_rate': 7.436211228705687e-06, 'epoch': 0.6} + 60%|█████▉ | 3435/5772 [2:10:43<3:58:28, 6.12s/it] {'loss': 0.4748, 'learning_rate': 7.436211228705687e-06, 'epoch': 0.6} + 60%|█████▉ | 3435/5772 [2:10:41<3:58:28, 6.12s/it] 60%|█████▉ | 3436/5772 [2:10:47<3:55:38, 6.05s/it] 60%|█████▉ | 3436/5772 [2:10:49<3:55:39, 6.05s/it] {'loss': 0.4642, 'learning_rate': 7.430787214690485e-06, 'epoch': 0.6} + 60%|█████▉ | 3436/5772 [2:10:49<3:55:39, 6.05s/it] {'loss': 0.4642, 'learning_rate': 7.430787214690485e-06, 'epoch': 0.6} + 60%|█████▉ | 3436/5772 [2:10:47<3:55:38, 6.05s/it] 60%|█████▉ | 3437/5772 [2:10:53<3:59:07, 6.14s/it] 60%|█████▉ | 3437/5772 [2:10:55<3:59:07, 6.14s/it] {'loss': 0.4695, 'learning_rate': 7.425364009834563e-06, 'epoch': 0.6} + 60%|█████▉ | 3437/5772 [2:10:55<3:59:07, 6.14s/it] {'loss': 0.4695, 'learning_rate': 7.425364009834563e-06, 'epoch': 0.6} + 60%|█████▉ | 3437/5772 [2:10:53<3:59:07, 6.14s/it] 60%|█████▉ | 3438/5772 [2:10:59<3:55:52, 6.06s/it] 60%|█████▉ | 3438/5772 [2:11:01<3:55:53, 6.06s/it] {'loss': 0.4621, 'learning_rate': 7.4199416158459316e-06, 'epoch': 0.6} + 60%|█████▉ | 3438/5772 [2:11:01<3:55:53, 6.06s/it] {'loss': 0.4621, 'learning_rate': 7.4199416158459316e-06, 'epoch': 0.6} + 60%|█████▉ | 3438/5772 [2:10:59<3:55:52, 6.06s/it] 60%|█████▉ | 3439/5772 [2:11:05<3:55:29, 6.06s/it] 60%|█████▉ | 3439/5772 [2:11:07<3:55:29, 6.06s/it] {'loss': 0.465, 'learning_rate': 7.414520034432345e-06, 'epoch': 0.6} + 60%|█████▉ | 3439/5772 [2:11:07<3:55:29, 6.06s/it] {'loss': 0.465, 'learning_rate': 7.414520034432345e-06, 'epoch': 0.6} + 60%|█████▉ | 3439/5772 [2:11:05<3:55:29, 6.06s/it] 60%|█████▉ | 3440/5772 [2:11:11<3:51:56, 5.97s/it] 60%|█████▉ | 3440/5772 [2:11:13<3:51:56, 5.97s/it] {'loss': 0.4635, 'learning_rate': 7.409099267301296e-06, 'epoch': 0.6} + 60%|█████▉ | 3440/5772 [2:11:13<3:51:56, 5.97s/it] {'loss': 0.4635, 'learning_rate': 7.409099267301296e-06, 'epoch': 0.6} + 60%|█████▉ | 3440/5772 [2:11:11<3:51:56, 5.97s/it] 60%|█████▉ | 3441/5772 [2:11:17<3:50:37, 5.94s/it] 60%|█████▉ | 3441/5772 [2:11:19<3:50:37, 5.94s/it] {'loss': 0.4551, 'learning_rate': 7.403679316160024e-06, 'epoch': 0.6} + 60%|█████▉ | 3441/5772 [2:11:19<3:50:37, 5.94s/it] {'loss': 0.4551, 'learning_rate': 7.403679316160024e-06, 'epoch': 0.6} + 60%|█████▉ | 3441/5772 [2:11:17<3:50:37, 5.94s/it] 60%|█████▉ | 3442/5772 [2:11:23<3:49:01, 5.90s/it] 60%|█████▉ | 3442/5772 [2:11:25<3:49:01, 5.90s/it] {'loss': 0.4667, 'learning_rate': 7.398260182715524e-06, 'epoch': 0.6} + 60%|█████▉ | 3442/5772 [2:11:25<3:49:01, 5.90s/it] {'loss': 0.4667, 'learning_rate': 7.398260182715524e-06, 'epoch': 0.6} + 60%|█████▉ | 3442/5772 [2:11:23<3:49:01, 5.90s/it] 60%|█████▉ | 3443/5772 [2:11:29<3:49:03, 5.90s/it] 60%|█████▉ | 3443/5772 [2:11:31<3:49:03, 5.90s/it] {'loss': 0.4818, 'learning_rate': 7.392841868674506e-06, 'epoch': 0.6} + 60%|█████▉ | 3443/5772 [2:11:31<3:49:03, 5.90s/it] {'loss': 0.4818, 'learning_rate': 7.392841868674506e-06, 'epoch': 0.6} + 60%|█████▉ | 3443/5772 [2:11:29<3:49:03, 5.90s/it] 60%|█████▉ | 3444/5772 [2:11:35<3:49:22, 5.91s/it] 60%|█████▉ | 3444/5772 [2:11:37<3:49:22, 5.91s/it] {'loss': 0.4771, 'learning_rate': 7.387424375743451e-06, 'epoch': 0.6} + 60%|█████▉ | 3444/5772 [2:11:37<3:49:22, 5.91s/it] {'loss': 0.4771, 'learning_rate': 7.387424375743451e-06, 'epoch': 0.6} + 60%|█████▉ | 3444/5772 [2:11:35<3:49:22, 5.91s/it] 60%|█████▉ | 3445/5772 [2:11:40<3:47:19, 5.86s/it] 60%|█████▉ | 3445/5772 [2:11:42<3:47:19, 5.86s/it] {'loss': 0.4621, 'learning_rate': 7.3820077056285595e-06, 'epoch': 0.6} + 60%|█████▉ | 3445/5772 [2:11:42<3:47:19, 5.86s/it] {'loss': 0.4621, 'learning_rate': 7.3820077056285595e-06, 'epoch': 0.6} + 60%|█████▉ | 3445/5772 [2:11:40<3:47:19, 5.86s/it] 60%|█████▉ | 3446/5772 [2:11:47<3:52:14, 5.99s/it] 60%|█████▉ | 3446/5772 [2:11:49<3:52:15, 5.99s/it] {'loss': 0.4668, 'learning_rate': 7.3765918600357875e-06, 'epoch': 0.6} + 60%|█████▉ | 3446/5772 [2:11:49<3:52:15, 5.99s/it] {'loss': 0.4668, 'learning_rate': 7.3765918600357875e-06, 'epoch': 0.6} + 60%|█████▉ | 3446/5772 [2:11:47<3:52:14, 5.99s/it] 60%|█████▉ | 3447/5772 [2:11:53<3:53:07, 6.02s/it] 60%|█████▉ | 3447/5772 [2:11:55<3:53:07, 6.02s/it] {'loss': 0.4691, 'learning_rate': 7.371176840670822e-06, 'epoch': 0.6} + 60%|█████▉ | 3447/5772 [2:11:55<3:53:07, 6.02s/it] {'loss': 0.4691, 'learning_rate': 7.371176840670822e-06, 'epoch': 0.6} + 60%|█████▉ | 3447/5772 [2:11:53<3:53:07, 6.02s/it] 60%|█████▉ | 3448/5772 [2:11:59<3:57:13, 6.12s/it] 60%|█████▉ | 3448/5772 [2:12:01<3:57:12, 6.12s/it] {'loss': 0.4661, 'learning_rate': 7.365762649239092e-06, 'epoch': 0.6} + 60%|█████▉ | 3448/5772 [2:12:01<3:57:12, 6.12s/it] {'loss': 0.4661, 'learning_rate': 7.365762649239092e-06, 'epoch': 0.6} + 60%|█████▉ | 3448/5772 [2:11:59<3:57:13, 6.12s/it] 60%|█████▉ | 3449/5772 [2:12:05<3:57:10, 6.13s/it] 60%|█████▉ | 3449/5772 [2:12:07<3:57:10, 6.13s/it] {'loss': 0.464, 'learning_rate': 7.360349287445774e-06, 'epoch': 0.6} + 60%|█████▉ | 3449/5772 [2:12:07<3:57:10, 6.13s/it] {'loss': 0.464, 'learning_rate': 7.360349287445774e-06, 'epoch': 0.6} + 60%|█████▉ | 3449/5772 [2:12:05<3:57:10, 6.13s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 60%|█████▉ | 3450/5772 [2:12:13<3:58:13, 6.16s/it] 60%|█████▉ | 3450/5772 [2:12:11<3:58:14, 6.16s/it]7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4699, 'learning_rate': 7.354936756995766e-06, 'epoch': 0.6} + 60%|█████▉ | 3450/5772 [2:12:13<3:58:13, 6.16s/it] {'loss': 0.4699, 'learning_rate': 7.354936756995766e-06, 'epoch': 0.6} + 60%|█████▉ | 3450/5772 [2:12:11<3:58:14, 6.16s/it] 60%|█████▉ | 3451/5772 [2:12:17<3:54:43, 6.07s/it] 60%|█████▉ | 3451/5772 [2:12:19<3:54:44, 6.07s/it] {'loss': 0.4499, 'learning_rate': 7.349525059593725e-06, 'epoch': 0.6} + 60%|█████▉ | 3451/5772 [2:12:19<3:54:44, 6.07s/it] {'loss': 0.4499, 'learning_rate': 7.349525059593725e-06, 'epoch': 0.6} + 60%|█████▉ | 3451/5772 [2:12:17<3:54:43, 6.07s/it] 60%|█████▉ | 3452/5772 [2:12:23<3:54:21, 6.06s/it] 60%|█████▉ | 3452/5772 [2:12:25<3:54:20, 6.06s/it] {'loss': 0.4578, 'learning_rate': 7.344114196944023e-06, 'epoch': 0.6} + 60%|█████▉ | 3452/5772 [2:12:25<3:54:20, 6.06s/it] {'loss': 0.4578, 'learning_rate': 7.344114196944023e-06, 'epoch': 0.6} + 60%|█████▉ | 3452/5772 [2:12:23<3:54:21, 6.06s/it] 60%|█████▉ | 3453/5772 [2:12:29<3:52:14, 6.01s/it] 60%|█████▉ | 3453/5772 [2:12:31<3:52:15, 6.01s/it] {'loss': 0.4681, 'learning_rate': 7.338704170750794e-06, 'epoch': 0.6} + 60%|█████▉ | 3453/5772 [2:12:31<3:52:15, 6.01s/it] {'loss': 0.4681, 'learning_rate': 7.338704170750794e-06, 'epoch': 0.6} + 60%|█████▉ | 3453/5772 [2:12:29<3:52:14, 6.01s/it] 60%|█████▉ | 3454/5772 [2:12:35<3:54:20, 6.07s/it] 60%|█████▉ | 3454/5772 [2:12:37<3:54:20, 6.07s/it] {'loss': 0.4698, 'learning_rate': 7.333294982717887e-06, 'epoch': 0.6} + 60%|█████▉ | 3454/5772 [2:12:37<3:54:20, 6.07s/it] {'loss': 0.4698, 'learning_rate': 7.333294982717887e-06, 'epoch': 0.6} + 60%|█████▉ | 3454/5772 [2:12:35<3:54:20, 6.07s/it] 60%|█████▉ | 3455/5772 [2:12:42<3:55:43, 6.10s/it] 60%|█████▉ | 3455/5772 [2:12:44<3:55:43, 6.10s/it] {'loss': 0.4616, 'learning_rate': 7.327886634548899e-06, 'epoch': 0.6} + 60%|█████▉ | 3455/5772 [2:12:44<3:55:43, 6.10s/it] {'loss': 0.4616, 'learning_rate': 7.327886634548899e-06, 'epoch': 0.6} + 60%|█████▉ | 3455/5772 [2:12:42<3:55:43, 6.10s/it] 60%|█████▉ | 3456/5772 [2:12:48<3:53:54, 6.06s/it] 60%|█████▉ | 3456/5772 [2:12:50<3:53:54, 6.06s/it] {'loss': 0.4617, 'learning_rate': 7.32247912794716e-06, 'epoch': 0.6} + 60%|█████▉ | 3456/5772 [2:12:50<3:53:54, 6.06s/it] {'loss': 0.4617, 'learning_rate': 7.32247912794716e-06, 'epoch': 0.6} + 60%|█████▉ | 3456/5772 [2:12:48<3:53:54, 6.06s/it] 60%|█████▉ | 3457/5772 [2:12:54<3:55:08, 6.09s/it] 60%|█████▉ | 3457/5772 [2:12:56<3:55:08, 6.09s/it] {'loss': 0.4624, 'learning_rate': 7.3170724646157284e-06, 'epoch': 0.6} + 60%|█████▉ | 3457/5772 [2:12:56<3:55:08, 6.09s/it] {'loss': 0.4624, 'learning_rate': 7.3170724646157284e-06, 'epoch': 0.6} + 60%|█████▉ | 3457/5772 [2:12:54<3:55:08, 6.09s/it] 60%|█████▉ | 3458/5772 [2:13:00<4:01:05, 6.25s/it] 60%|█████▉ | 3458/5772 [2:13:02<4:01:05, 6.25s/it] {'loss': 0.4685, 'learning_rate': 7.311666646257412e-06, 'epoch': 0.6} + 60%|█████▉ | 3458/5772 [2:13:02<4:01:05, 6.25s/it] {'loss': 0.4685, 'learning_rate': 7.311666646257412e-06, 'epoch': 0.6} + 60%|█████▉ | 3458/5772 [2:13:00<4:01:05, 6.25s/it] 60%|█████▉ | 3459/5772 [2:13:06<3:57:39, 6.16s/it] 60%|█████▉ | 3459/5772 [2:13:08<3:57:39, 6.16s/it] {'loss': 0.4553, 'learning_rate': 7.3062616745747325e-06, 'epoch': 0.6} + 60%|█████▉ | 3459/5772 [2:13:08<3:57:39, 6.16s/it] {'loss': 0.4553, 'learning_rate': 7.3062616745747325e-06, 'epoch': 0.6} + 60%|█████▉ | 3459/5772 [2:13:06<3:57:39, 6.16s/it] 60%|█████▉ | 3460/5772 [2:13:13<3:57:35, 6.17s/it] 60%|█████▉ | 3460/5772 [2:13:14<3:57:35, 6.17s/it] {'loss': 0.4675, 'learning_rate': 7.300857551269969e-06, 'epoch': 0.6} + 60%|█████▉ | 3460/5772 [2:13:14<3:57:35, 6.17s/it] {'loss': 0.4675, 'learning_rate': 7.300857551269969e-06, 'epoch': 0.6} + 60%|█████▉ | 3460/5772 [2:13:13<3:57:35, 6.17s/it] 60%|█████▉ | 3461/5772 [2:13:19<3:58:35, 6.19s/it] 60%|█████▉ | 3461/5772 [2:13:21<3:58:35, 6.19s/it] {'loss': 0.4565, 'learning_rate': 7.295454278045104e-06, 'epoch': 0.6} + 60%|█████▉ | 3461/5772 [2:13:21<3:58:35, 6.19s/it] {'loss': 0.4565, 'learning_rate': 7.295454278045104e-06, 'epoch': 0.6} + 60%|█████▉ | 3461/5772 [2:13:19<3:58:35, 6.19s/it] 60%|█████▉ | 3462/5772 [2:13:25<3:59:42, 6.23s/it] 60%|█████▉ | 3462/5772 [2:13:27<3:59:42, 6.23s/it] {'loss': 0.4733, 'learning_rate': 7.290051856601879e-06, 'epoch': 0.6} + 60%|█████▉ | 3462/5772 [2:13:27<3:59:42, 6.23s/it] {'loss': 0.4733, 'learning_rate': 7.290051856601879e-06, 'epoch': 0.6} + 60%|█████▉ | 3462/5772 [2:13:25<3:59:42, 6.23s/it] 60%|█████▉ | 3463/5772 [2:13:31<4:00:38, 6.25s/it] 60%|█████▉ | 3463/5772 [2:13:33<4:00:38, 6.25s/it] {'loss': 0.45, 'learning_rate': 7.28465028864176e-06, 'epoch': 0.6} + 60%|█████▉ | 3463/5772 [2:13:33<4:00:38, 6.25s/it] {'loss': 0.45, 'learning_rate': 7.28465028864176e-06, 'epoch': 0.6} + 60%|█████▉ | 3463/5772 [2:13:31<4:00:38, 6.25s/it] 60%|██████ | 3464/5772 [2:13:37<3:56:33, 6.15s/it] 60%|██████ | 3464/5772 [2:13:39<3:56:32, 6.15s/it] {'loss': 0.4632, 'learning_rate': 7.279249575865929e-06, 'epoch': 0.6} + 60%|██████ | 3464/5772 [2:13:39<3:56:32, 6.15s/it] {'loss': 0.4632, 'learning_rate': 7.279249575865929e-06, 'epoch': 0.6} + 60%|██████ | 3464/5772 [2:13:37<3:56:33, 6.15s/it] 60%|██████ | 3465/5772 [2:13:43<3:55:08, 6.12s/it] 60%|██████ | 3465/5772 [2:13:45<3:55:08, 6.12s/it] {'loss': 0.4666, 'learning_rate': 7.27384971997532e-06, 'epoch': 0.6} + 60%|██████ | 3465/5772 [2:13:45<3:55:08, 6.12s/it] {'loss': 0.4666, 'learning_rate': 7.27384971997532e-06, 'epoch': 0.6} + 60%|██████ | 3465/5772 [2:13:43<3:55:08, 6.12s/it] 60%|██████ | 3466/5772 [2:13:49<3:52:25, 6.05s/it] 60%|██████ | 3466/5772 [2:13:51<3:52:25, 6.05s/it] {'loss': 0.4735, 'learning_rate': 7.268450722670582e-06, 'epoch': 0.6} + 60%|██████ | 3466/5772 [2:13:51<3:52:25, 6.05s/it] {'loss': 0.4735, 'learning_rate': 7.268450722670582e-06, 'epoch': 0.6} + 60%|██████ | 3466/5772 [2:13:49<3:52:25, 6.05s/it] 60%|██████ | 3467/5772 [2:13:58<3:55:31, 6.13s/it] 60%|██████ | 3467/5772 [2:13:56<3:55:31, 6.13s/it] {'loss': 0.4551, 'learning_rate': 7.263052585652104e-06, 'epoch': 0.6} + 60%|██████ | 3467/5772 [2:13:58<3:55:31, 6.13s/it] {'loss': 0.4551, 'learning_rate': 7.263052585652104e-06, 'epoch': 0.6} + 60%|██████ | 3467/5772 [2:13:56<3:55:31, 6.13s/it] 60%|██████ | 3468/5772 [2:14:02<3:58:43, 6.22s/it] 60%|██████ | 3468/5772 [2:14:04<3:58:42, 6.22s/it] {'loss': 0.473, 'learning_rate': 7.257655310619996e-06, 'epoch': 0.6} + 60%|██████ | 3468/5772 [2:14:04<3:58:42, 6.22s/it] {'loss': 0.473, 'learning_rate': 7.257655310619996e-06, 'epoch': 0.6} + 60%|██████ | 3468/5772 [2:14:02<3:58:43, 6.22s/it] 60%|██████ | 3469/5772 [2:14:08<3:54:53, 6.12s/it] 60%|██████ | 3469/5772 [2:14:10<3:54:53, 6.12s/it] {'loss': 0.4582, 'learning_rate': 7.252258899274096e-06, 'epoch': 0.6} + 60%|██████ | 3469/5772 [2:14:10<3:54:53, 6.12s/it] {'loss': 0.4582, 'learning_rate': 7.252258899274096e-06, 'epoch': 0.6} + 60%|██████ | 3469/5772 [2:14:08<3:54:53, 6.12s/it] 60%|██████ | 3470/5772 [2:14:14<3:51:39, 6.04s/it] 60%|██████ | 3470/5772 [2:14:16<3:51:39, 6.04s/it] {'loss': 0.4638, 'learning_rate': 7.246863353313983e-06, 'epoch': 0.6} + 60%|██████ | 3470/5772 [2:14:16<3:51:39, 6.04s/it] {'loss': 0.4638, 'learning_rate': 7.246863353313983e-06, 'epoch': 0.6} + 60%|██████ | 3470/5772 [2:14:14<3:51:39, 6.04s/it] 60%|██████ | 3471/5772 [2:14:20<3:50:21, 6.01s/it] 60%|██████ | 3471/5772 [2:14:22<3:50:21, 6.01s/it] {'loss': 0.4695, 'learning_rate': 7.241468674438947e-06, 'epoch': 0.6} + 60%|██████ | 3471/5772 [2:14:22<3:50:21, 6.01s/it] {'loss': 0.4695, 'learning_rate': 7.241468674438947e-06, 'epoch': 0.6} + 60%|██████ | 3471/5772 [2:14:20<3:50:21, 6.01s/it] 60%|██████ | 3472/5772 [2:14:26<3:52:58, 6.08s/it] 60%|██████ | 3472/5772 [2:14:28<3:52:58, 6.08s/it] {'loss': 0.4695, 'learning_rate': 7.236074864348017e-06, 'epoch': 0.6} + 60%|██████ | 3472/5772 [2:14:28<3:52:58, 6.08s/it] {'loss': 0.4695, 'learning_rate': 7.236074864348017e-06, 'epoch': 0.6} + 60%|██████ | 3472/5772 [2:14:26<3:52:58, 6.08s/it] 60%|██████ | 3473/5772 [2:14:34<3:51:30, 6.04s/it] {'loss': 0.4592, 'learning_rate': 7.230681924739939e-06, 'epoch': 0.6} + 60%|██████ | 3473/5772 [2:14:34<3:51:30, 6.04s/it] 60%|██████ | 3473/5772 [2:14:32<3:51:30, 6.04s/it] {'loss': 0.4592, 'learning_rate': 7.230681924739939e-06, 'epoch': 0.6} + 60%|██████ | 3473/5772 [2:14:32<3:51:30, 6.04s/it] 60%|██████ | 3474/5772 [2:14:38<3:54:30, 6.12s/it] 60%|██████ | 3474/5772 [2:14:40<3:54:30, 6.12s/it] {'loss': 0.4716, 'learning_rate': 7.225289857313194e-06, 'epoch': 0.6} + 60%|██████ | 3474/5772 [2:14:40<3:54:30, 6.12s/it] {'loss': 0.4716, 'learning_rate': 7.225289857313194e-06, 'epoch': 0.6} + 60%|██████ | 3474/5772 [2:14:38<3:54:30, 6.12s/it] 60%|██████ | 3475/5772 [2:14:44<3:51:08, 6.04s/it] 60%|██████ | 3475/5772 [2:14:46<3:51:08, 6.04s/it] {'loss': 0.477, 'learning_rate': 7.219898663765979e-06, 'epoch': 0.6} + 60%|██████ | 3475/5772 [2:14:46<3:51:08, 6.04s/it] {'loss': 0.477, 'learning_rate': 7.219898663765979e-06, 'epoch': 0.6} + 60%|██████ | 3475/5772 [2:14:44<3:51:08, 6.04s/it] 60%|██████ | 3476/5772 [2:14:50<3:52:03, 6.06s/it] 60%|██████ | 3476/5772 [2:14:52<3:52:04, 6.06s/it] {'loss': 0.464, 'learning_rate': 7.214508345796218e-06, 'epoch': 0.6} + 60%|██████ | 3476/5772 [2:14:52<3:52:04, 6.06s/it] {'loss': 0.464, 'learning_rate': 7.214508345796218e-06, 'epoch': 0.6} + 60%|██████ | 3476/5772 [2:14:50<3:52:03, 6.06s/it] 60%|██████ | 3477/5772 [2:14:56<3:48:10, 5.97s/it] 60%|██████ | 3477/5772 [2:14:58<3:48:10, 5.97s/it] {'loss': 0.4659, 'learning_rate': 7.209118905101575e-06, 'epoch': 0.6} + 60%|██████ | 3477/5772 [2:14:58<3:48:10, 5.97s/it] {'loss': 0.4659, 'learning_rate': 7.209118905101575e-06, 'epoch': 0.6} + 60%|██████ | 3477/5772 [2:14:56<3:48:10, 5.97s/it] 60%|██████ | 3478/5772 [2:15:02<3:54:39, 6.14s/it] 60%|██████ | 3478/5772 [2:15:04<3:54:38, 6.14s/it] {'loss': 0.4538, 'learning_rate': 7.203730343379408e-06, 'epoch': 0.6} + 60%|██████ | 3478/5772 [2:15:04<3:54:38, 6.14s/it] {'loss': 0.4538, 'learning_rate': 7.203730343379408e-06, 'epoch': 0.6} + 60%|██████ | 3478/5772 [2:15:02<3:54:39, 6.14s/it] 60%|██████ | 3479/5772 [2:15:11<3:55:23, 6.16s/it] 60%|██████ | 3479/5772 [2:15:09<3:55:24, 6.16s/it] {'loss': 0.4664, 'learning_rate': 7.198342662326827e-06, 'epoch': 0.6} + 60%|██████ | 3479/5772 [2:15:11<3:55:23, 6.16s/it] {'loss': 0.4664, 'learning_rate': 7.198342662326827e-06, 'epoch': 0.6} + 60%|██████ | 3479/5772 [2:15:09<3:55:24, 6.16s/it] 60%|██████ | 3480/5772 [2:15:17<3:57:27, 6.22s/it] 60%|██████ | 3480/5772 [2:15:15<3:57:28, 6.22s/it] {'loss': 0.4818, 'learning_rate': 7.192955863640645e-06, 'epoch': 0.6} + 60%|██████ | 3480/5772 [2:15:17<3:57:27, 6.22s/it] {'loss': 0.4818, 'learning_rate': 7.192955863640645e-06, 'epoch': 0.6} + 60%|██████ | 3480/5772 [2:15:15<3:57:28, 6.22s/it] 60%|██████ | 3481/5772 [2:15:21<3:57:48, 6.23s/it] 60%|██████ | 3481/5772 [2:15:23<3:57:49, 6.23s/it] {'loss': 0.4638, 'learning_rate': 7.187569949017408e-06, 'epoch': 0.6} + 60%|██████ | 3481/5772 [2:15:23<3:57:49, 6.23s/it] {'loss': 0.4638, 'learning_rate': 7.187569949017408e-06, 'epoch': 0.6} + 60%|██████ | 3481/5772 [2:15:21<3:57:48, 6.23s/it] 60%|██████ | 3482/5772 [2:15:28<3:58:56, 6.26s/it] 60%|██████ | 3482/5772 [2:15:30<3:58:56, 6.26s/it] {'loss': 0.4564, 'learning_rate': 7.1821849201533765e-06, 'epoch': 0.6} + 60%|██████ | 3482/5772 [2:15:30<3:58:56, 6.26s/it] {'loss': 0.4564, 'learning_rate': 7.1821849201533765e-06, 'epoch': 0.6} + 60%|██████ | 3482/5772 [2:15:28<3:58:56, 6.26s/it] 60%|██████ | 3483/5772 [2:15:34<3:55:43, 6.18s/it] 60%|██████ | 3483/5772 [2:15:36<3:55:43, 6.18s/it] {'loss': 0.4707, 'learning_rate': 7.176800778744537e-06, 'epoch': 0.6} + 60%|██████ | 3483/5772 [2:15:36<3:55:43, 6.18s/it] {'loss': 0.4707, 'learning_rate': 7.176800778744537e-06, 'epoch': 0.6} + 60%|██████ | 3483/5772 [2:15:34<3:55:43, 6.18s/it] 60%|██████ | 3484/5772 [2:15:40<3:54:05, 6.14s/it] 60%|██████ | 3484/5772 [2:15:42<3:54:06, 6.14s/it] {'loss': 0.4766, 'learning_rate': 7.1714175264865975e-06, 'epoch': 0.6} + 60%|██████ | 3484/5772 [2:15:42<3:54:06, 6.14s/it] {'loss': 0.4766, 'learning_rate': 7.1714175264865975e-06, 'epoch': 0.6} + 60%|██████ | 3484/5772 [2:15:40<3:54:05, 6.14s/it] 60%|██████ | 3485/5772 [2:15:46<3:51:36, 6.08s/it] 60%|██████ | 3485/5772 [2:15:47<3:51:36, 6.08s/it] {'loss': 0.4582, 'learning_rate': 7.166035165074976e-06, 'epoch': 0.6} + 60%|██████ | 3485/5772 [2:15:47<3:51:36, 6.08s/it] {'loss': 0.4582, 'learning_rate': 7.166035165074976e-06, 'epoch': 0.6} + 60%|██████ | 3485/5772 [2:15:46<3:51:36, 6.08s/it] 60%|██████ | 3486/5772 [2:15:54<4:00:11, 6.30s/it] 60%|██████ | 3486/5772 [2:15:52<4:00:11, 6.30s/it] {'loss': 0.4723, 'learning_rate': 7.16065369620483e-06, 'epoch': 0.6} + 60%|██████ | 3486/5772 [2:15:54<4:00:11, 6.30s/it] {'loss': 0.4723, 'learning_rate': 7.16065369620483e-06, 'epoch': 0.6} + 60%|██████ | 3486/5772 [2:15:52<4:00:11, 6.30s/it] 60%|██████ | 3487/5772 [2:15:58<3:58:04, 6.25s/it] 60%|██████ | 3487/5772 [2:16:00<3:58:04, 6.25s/it] {'loss': 0.4649, 'learning_rate': 7.155273121571009e-06, 'epoch': 0.6} + 60%|██████ | 3487/5772 [2:16:00<3:58:04, 6.25s/it] {'loss': 0.4649, 'learning_rate': 7.155273121571009e-06, 'epoch': 0.6} + 60%|██████ | 3487/5772 [2:15:58<3:58:04, 6.25s/it] 60%|██████ | 3488/5772 [2:16:05<3:57:28, 6.24s/it] 60%|██████ | 3488/5772 [2:16:07<3:57:29, 6.24s/it] {'loss': 0.4558, 'learning_rate': 7.149893442868105e-06, 'epoch': 0.6} + 60%|██████ | 3488/5772 [2:16:07<3:57:29, 6.24s/it] {'loss': 0.4558, 'learning_rate': 7.149893442868105e-06, 'epoch': 0.6} + 60%|██████ | 3488/5772 [2:16:05<3:57:28, 6.24s/it] 60%|██████ | 3489/5772 [2:16:11<3:55:49, 6.20s/it] 60%|██████ | 3489/5772 [2:16:13<3:55:48, 6.20s/it] {'loss': 0.4584, 'learning_rate': 7.1445146617904135e-06, 'epoch': 0.6} + 60%|██████ | 3489/5772 [2:16:13<3:55:48, 6.20s/it] {'loss': 0.4584, 'learning_rate': 7.1445146617904135e-06, 'epoch': 0.6} + 60%|██████ | 3489/5772 [2:16:11<3:55:49, 6.20s/it] 60%|██████ | 3490/5772 [2:16:17<3:57:20, 6.24s/it] 60%|██████ | 3490/5772 [2:16:19<3:57:20, 6.24s/it] {'loss': 0.4733, 'learning_rate': 7.139136780031953e-06, 'epoch': 0.6} + 60%|██████ | 3490/5772 [2:16:19<3:57:20, 6.24s/it] {'loss': 0.4733, 'learning_rate': 7.139136780031953e-06, 'epoch': 0.6} + 60%|██████ | 3490/5772 [2:16:17<3:57:20, 6.24s/it] 60%|██████ | 3491/5772 [2:16:25<3:52:28, 6.11s/it] 60%|██████ | 3491/5772 [2:16:23<3:52:28, 6.12s/it] {'loss': 0.4657, 'learning_rate': 7.133759799286458e-06, 'epoch': 0.6} + 60%|██████ | 3491/5772 [2:16:25<3:52:28, 6.11s/it] {'loss': 0.4657, 'learning_rate': 7.133759799286458e-06, 'epoch': 0.6} + 60%|██████ | 3491/5772 [2:16:23<3:52:28, 6.12s/it] 60%|██████ | 3492/5772 [2:16:29<3:54:54, 6.18s/it] 60%|██████ | 3492/5772 [2:16:31<3:54:54, 6.18s/it] {'loss': 0.4882, 'learning_rate': 7.128383721247376e-06, 'epoch': 0.6} + 60%|██████ | 3492/5772 [2:16:31<3:54:54, 6.18s/it] {'loss': 0.4882, 'learning_rate': 7.128383721247376e-06, 'epoch': 0.6} + 60%|██████ | 3492/5772 [2:16:29<3:54:54, 6.18s/it] 61%|██████ | 3493/5772 [2:16:36<3:55:42, 6.21s/it] 61%|██████ | 3493/5772 [2:16:38<3:55:42, 6.21s/it] {'loss': 0.4711, 'learning_rate': 7.123008547607877e-06, 'epoch': 0.61} + 61%|██████ | 3493/5772 [2:16:38<3:55:42, 6.21s/it] {'loss': 0.4711, 'learning_rate': 7.123008547607877e-06, 'epoch': 0.61} + 61%|██████ | 3493/5772 [2:16:36<3:55:42, 6.21s/it] 61%|██████ | 3494/5772 [2:16:42<3:54:43, 6.18s/it] 61%|██████ | 3494/5772 [2:16:44<3:54:43, 6.18s/it] {'loss': 0.4656, 'learning_rate': 7.1176342800608365e-06, 'epoch': 0.61} + 61%|██████ | 3494/5772 [2:16:44<3:54:43, 6.18s/it] {'loss': 0.4656, 'learning_rate': 7.1176342800608365e-06, 'epoch': 0.61} + 61%|██████ | 3494/5772 [2:16:42<3:54:43, 6.18s/it] 61%|██████ | 3495/5772 [2:16:48<3:52:36, 6.13s/it] 61%|██████ | 3495/5772 [2:16:50<3:52:36, 6.13s/it] {'loss': 0.4632, 'learning_rate': 7.112260920298859e-06, 'epoch': 0.61} + 61%|██████ | 3495/5772 [2:16:50<3:52:36, 6.13s/it] {'loss': 0.4632, 'learning_rate': 7.112260920298859e-06, 'epoch': 0.61} + 61%|██████ | 3495/5772 [2:16:48<3:52:36, 6.13s/it] 61%|██████ | 3496/5772 [2:16:54<3:50:30, 6.08s/it] 61%|██████ | 3496/5772 [2:16:56<3:50:30, 6.08s/it] {'loss': 0.4702, 'learning_rate': 7.1068884700142416e-06, 'epoch': 0.61} + 61%|██████ | 3496/5772 [2:16:56<3:50:30, 6.08s/it] {'loss': 0.4702, 'learning_rate': 7.1068884700142416e-06, 'epoch': 0.61} + 61%|██████ | 3496/5772 [2:16:54<3:50:30, 6.08s/it] 61%|██████ | 3497/5772 [2:17:00<3:49:53, 6.06s/it] 61%|██████ | 3497/5772 [2:17:02<3:49:53, 6.06s/it] {'loss': 0.4586, 'learning_rate': 7.101516930899019e-06, 'epoch': 0.61} + 61%|██████ | 3497/5772 [2:17:02<3:49:53, 6.06s/it] {'loss': 0.4586, 'learning_rate': 7.101516930899019e-06, 'epoch': 0.61} + 61%|██████ | 3497/5772 [2:17:00<3:49:53, 6.06s/it] 61%|██████ | 3498/5772 [2:17:06<3:48:02, 6.02s/it] 61%|██████ | 3498/5772 [2:17:08<3:48:02, 6.02s/it] {'loss': 0.475, 'learning_rate': 7.096146304644924e-06, 'epoch': 0.61} + 61%|██████ | 3498/5772 [2:17:08<3:48:02, 6.02s/it] {'loss': 0.475, 'learning_rate': 7.096146304644924e-06, 'epoch': 0.61} + 61%|██████ | 3498/5772 [2:17:06<3:48:02, 6.02s/it] 61%|██████ | 3499/5772 [2:17:13<3:46:24, 5.98s/it] 61%|██████ | 3499/5772 [2:17:11<3:46:24, 5.98s/it] {'loss': 0.456, 'learning_rate': 7.090776592943402e-06, 'epoch': 0.61} + 61%|██████ | 3499/5772 [2:17:13<3:46:24, 5.98s/it] {'loss': 0.456, 'learning_rate': 7.090776592943402e-06, 'epoch': 0.61} + 61%|██████ | 3499/5772 [2:17:11<3:46:24, 5.98s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +7 61%|██████ | 3500/5772 [2:17:20<3:50:10, 6.08s/it]AutoResumeHook: Checking whether to suspend... + 61%|██████ | 3500/5772 [2:17:18<3:50:11, 6.08s/it]4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4742, 'learning_rate': 7.08540779748562e-06, 'epoch': 0.61} + 61%|██████ | 3500/5772 [2:17:20<3:50:10, 6.08s/it] {'loss': 0.4742, 'learning_rate': 7.08540779748562e-06, 'epoch': 0.61} + 61%|██████ | 3500/5772 [2:17:18<3:50:11, 6.08s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 61%|██████ | 3501/5772 [2:17:40<6:54:30, 10.95s/it] 61%|██████ | 3501/5772 [2:17:42<6:54:30, 10.95s/it] {'loss': 0.4584, 'learning_rate': 7.080039919962445e-06, 'epoch': 0.61} + 61%|██████ | 3501/5772 [2:17:42<6:54:30, 10.95s/it] {'loss': 0.4584, 'learning_rate': 7.080039919962445e-06, 'epoch': 0.61} + 61%|██████ | 3501/5772 [2:17:40<6:54:30, 10.95s/it] 61%|██████ | 3502/5772 [2:17:46<5:59:09, 9.49s/it] 61%|██████ | 3502/5772 [2:17:48<5:59:09, 9.49s/it] {'loss': 0.4636, 'learning_rate': 7.074672962064464e-06, 'epoch': 0.61} + 61%|██████ | 3502/5772 [2:17:48<5:59:09, 9.49s/it] {'loss': 0.4636, 'learning_rate': 7.074672962064464e-06, 'epoch': 0.61} + 61%|██████ | 3502/5772 [2:17:46<5:59:09, 9.49s/it] 61%|██████ | 3503/5772 [2:17:52<5:22:44, 8.53s/it] 61%|██████ | 3503/5772 [2:17:54<5:22:44, 8.53s/it] {'loss': 0.4546, 'learning_rate': 7.069306925481965e-06, 'epoch': 0.61} + 61%|██████ | 3503/5772 [2:17:54<5:22:44, 8.53s/it] {'loss': 0.4546, 'learning_rate': 7.069306925481965e-06, 'epoch': 0.61} + 61%|██████ | 3503/5772 [2:17:52<5:22:44, 8.53s/it] 61%|██████ | 3504/5772 [2:18:01<4:54:29, 7.79s/it] 61%|██████ | 3504/5772 [2:17:59<4:54:30, 7.79s/it] {'loss': 0.4525, 'learning_rate': 7.063941811904956e-06, 'epoch': 0.61} + 61%|██████ | 3504/5772 [2:18:01<4:54:29, 7.79s/it] {'loss': 0.4525, 'learning_rate': 7.063941811904956e-06, 'epoch': 0.61} + 61%|██████ | 3504/5772 [2:17:59<4:54:30, 7.79s/it] 61%|██████ | 3505/5772 [2:18:05<4:33:55, 7.25s/it] 61%|██████ | 3505/5772 [2:18:06<4:33:55, 7.25s/it] {'loss': 0.4522, 'learning_rate': 7.058577623023153e-06, 'epoch': 0.61} + 61%|██████ | 3505/5772 [2:18:06<4:33:55, 7.25s/it] {'loss': 0.4522, 'learning_rate': 7.058577623023153e-06, 'epoch': 0.61} + 61%|██████ | 3505/5772 [2:18:05<4:33:55, 7.25s/it] 61%|██████ | 3506/5772 [2:18:11<4:25:35, 7.03s/it] 61%|██████ | 3506/5772 [2:18:13<4:25:35, 7.03s/it] {'loss': 0.4889, 'learning_rate': 7.0532143605259686e-06, 'epoch': 0.61} + 61%|██████ | 3506/5772 [2:18:13<4:25:35, 7.03s/it] {'loss': 0.4889, 'learning_rate': 7.0532143605259686e-06, 'epoch': 0.61} + 61%|██████ | 3506/5772 [2:18:11<4:25:35, 7.03s/it] 61%|██████ | 3507/5772 [2:18:19<4:13:28, 6.71s/it] 61%|██████ | 3507/5772 [2:18:17<4:13:28, 6.71s/it] {'loss': 0.4739, 'learning_rate': 7.047852026102541e-06, 'epoch': 0.61} + 61%|██████ | 3507/5772 [2:18:19<4:13:28, 6.71s/it] {'loss': 0.4739, 'learning_rate': 7.047852026102541e-06, 'epoch': 0.61} + 61%|██████ | 3507/5772 [2:18:17<4:13:28, 6.71s/it] 61%|██████ | 3508/5772 [2:18:23<4:06:24, 6.53s/it] 61%|██████ | 3508/5772 [2:18:25<4:06:24, 6.53s/it] {'loss': 0.4732, 'learning_rate': 7.042490621441701e-06, 'epoch': 0.61} + 61%|██████ | 3508/5772 [2:18:25<4:06:24, 6.53s/it] {'loss': 0.4732, 'learning_rate': 7.042490621441701e-06, 'epoch': 0.61} + 61%|██████ | 3508/5772 [2:18:23<4:06:24, 6.53s/it] 61%|██████ | 3509/5772 [2:18:29<4:01:30, 6.40s/it] 61%|██████ | 3509/5772 [2:18:31<4:01:31, 6.40s/it] {'loss': 0.4569, 'learning_rate': 7.0371301482319985e-06, 'epoch': 0.61} + 61%|██████ | 3509/5772 [2:18:31<4:01:31, 6.40s/it] {'loss': 0.4569, 'learning_rate': 7.0371301482319985e-06, 'epoch': 0.61} + 61%|██████ | 3509/5772 [2:18:29<4:01:30, 6.40s/it] 61%|██████ | 3510/5772 [2:18:35<3:54:49, 6.23s/it] 61%|██████ | 3510/5772 [2:18:37<3:54:50, 6.23s/it] {'loss': 0.4685, 'learning_rate': 7.0317706081616785e-06, 'epoch': 0.61} + 61%|██████ | 3510/5772 [2:18:37<3:54:50, 6.23s/it] {'loss': 0.4685, 'learning_rate': 7.0317706081616785e-06, 'epoch': 0.61} + 61%|██████ | 3510/5772 [2:18:35<3:54:49, 6.23s/it] 61%|██████ | 3511/5772 [2:18:41<3:50:47, 6.12s/it] 61%|██████ | 3511/5772 [2:18:43<3:50:47, 6.12s/it] {'loss': 0.4413, 'learning_rate': 7.026412002918701e-06, 'epoch': 0.61} + 61%|██████ | 3511/5772 [2:18:43<3:50:47, 6.12s/it] {'loss': 0.4413, 'learning_rate': 7.026412002918701e-06, 'epoch': 0.61} + 61%|██████ | 3511/5772 [2:18:41<3:50:47, 6.12s/it] 61%|██████ | 3512/5772 [2:18:47<3:49:26, 6.09s/it] 61%|██████ | 3512/5772 [2:18:49<3:49:26, 6.09s/it] {'loss': 0.4756, 'learning_rate': 7.021054334190736e-06, 'epoch': 0.61} + 61%|██████ | 3512/5772 [2:18:49<3:49:26, 6.09s/it] {'loss': 0.4756, 'learning_rate': 7.021054334190736e-06, 'epoch': 0.61} + 61%|██████ | 3512/5772 [2:18:47<3:49:26, 6.09s/it] 61%|██████ | 3513/5772 [2:18:53<3:49:59, 6.11s/it] 61%|██████ | 3513/5772 [2:18:55<3:49:59, 6.11s/it] {'loss': 0.456, 'learning_rate': 7.015697603665141e-06, 'epoch': 0.61} + 61%|██████ | 3513/5772 [2:18:55<3:49:59, 6.11s/it] {'loss': 0.456, 'learning_rate': 7.015697603665141e-06, 'epoch': 0.61} + 61%|██████ | 3513/5772 [2:18:53<3:49:59, 6.11s/it] 61%|██████ | 3514/5772 [2:19:00<3:54:12, 6.22s/it] 61%|██████ | 3514/5772 [2:19:02<3:54:12, 6.22s/it] {'loss': 0.4734, 'learning_rate': 7.010341813028996e-06, 'epoch': 0.61} + 61%|██████ | 3514/5772 [2:19:02<3:54:12, 6.22s/it] {'loss': 0.4734, 'learning_rate': 7.010341813028996e-06, 'epoch': 0.61} + 61%|██████ | 3514/5772 [2:19:00<3:54:12, 6.22s/it] 61%|██████ | 3515/5772 [2:19:06<3:51:49, 6.16s/it] 61%|██████ | 3515/5772 [2:19:08<3:51:48, 6.16s/it] {'loss': 0.457, 'learning_rate': 7.004986963969072e-06, 'epoch': 0.61} + 61%|██████ | 3515/5772 [2:19:08<3:51:48, 6.16s/it] {'loss': 0.457, 'learning_rate': 7.004986963969072e-06, 'epoch': 0.61} + 61%|██████ | 3515/5772 [2:19:06<3:51:49, 6.16s/it] 61%|██████ | 3516/5772 [2:19:12<3:55:49, 6.27s/it] 61%|██████ | 3516/5772 [2:19:14<3:55:49, 6.27s/it] {'loss': 0.4699, 'learning_rate': 6.999633058171853e-06, 'epoch': 0.61} + 61%|██████ | 3516/5772 [2:19:14<3:55:49, 6.27s/it] {'loss': 0.4699, 'learning_rate': 6.999633058171853e-06, 'epoch': 0.61} + 61%|██████ | 3516/5772 [2:19:12<3:55:49, 6.27s/it] 61%|██████ | 3517/5772 [2:19:20<3:52:40, 6.19s/it] 61%|██████ | 3517/5772 [2:19:18<3:52:41, 6.19s/it] {'loss': 0.469, 'learning_rate': 6.994280097323519e-06, 'epoch': 0.61} + 61%|██████ | 3517/5772 [2:19:20<3:52:40, 6.19s/it] {'loss': 0.469, 'learning_rate': 6.994280097323519e-06, 'epoch': 0.61} + 61%|██████ | 3517/5772 [2:19:18<3:52:41, 6.19s/it] 61%|██████ | 3518/5772 [2:19:24<3:49:16, 6.10s/it] 61%|██████ | 3518/5772 [2:19:26<3:49:16, 6.10s/it] {'loss': 0.4659, 'learning_rate': 6.988928083109954e-06, 'epoch': 0.61} + 61%|██████ | 3518/5772 [2:19:26<3:49:16, 6.10s/it] {'loss': 0.4659, 'learning_rate': 6.988928083109954e-06, 'epoch': 0.61} + 61%|██████ | 3518/5772 [2:19:24<3:49:16, 6.10s/it] 61%|██████ | 3519/5772 [2:19:30<3:50:59, 6.15s/it] 61%|██████ | 3519/5772 [2:19:32<3:50:59, 6.15s/it] {'loss': 0.4597, 'learning_rate': 6.9835770172167535e-06, 'epoch': 0.61} + 61%|██████ | 3519/5772 [2:19:32<3:50:59, 6.15s/it] {'loss': 0.4597, 'learning_rate': 6.9835770172167535e-06, 'epoch': 0.61} + 61%|██████ | 3519/5772 [2:19:30<3:50:59, 6.15s/it] 61%|██████ | 3520/5772 [2:19:36<3:49:22, 6.11s/it] 61%|██████ | 3520/5772 [2:19:38<3:49:22, 6.11s/it] {'loss': 0.4713, 'learning_rate': 6.978226901329195e-06, 'epoch': 0.61} + 61%|██████ | 3520/5772 [2:19:38<3:49:22, 6.11s/it] {'loss': 0.4713, 'learning_rate': 6.978226901329195e-06, 'epoch': 0.61} + 61%|██████ | 3520/5772 [2:19:36<3:49:22, 6.11s/it] 61%|██████ | 3521/5772 [2:19:43<3:53:24, 6.22s/it] 61%|██████ | 3521/5772 [2:19:45<3:53:24, 6.22s/it] {'loss': 0.4692, 'learning_rate': 6.9728777371322775e-06, 'epoch': 0.61} + 61%|██████ | 3521/5772 [2:19:45<3:53:24, 6.22s/it] {'loss': 0.4692, 'learning_rate': 6.9728777371322775e-06, 'epoch': 0.61} + 61%|██████ | 3521/5772 [2:19:43<3:53:24, 6.22s/it] 61%|██████ | 3522/5772 [2:19:49<3:57:08, 6.32s/it] 61%|██████ | 3522/5772 [2:19:51<3:57:08, 6.32s/it] {'loss': 0.4704, 'learning_rate': 6.967529526310681e-06, 'epoch': 0.61} + 61%|██████ | 3522/5772 [2:19:51<3:57:08, 6.32s/it] {'loss': 0.4704, 'learning_rate': 6.967529526310681e-06, 'epoch': 0.61} + 61%|██████ | 3522/5772 [2:19:49<3:57:08, 6.32s/it] 61%|██████ | 3523/5772 [2:19:57<3:52:49, 6.21s/it] 61%|██████ | 3523/5772 [2:19:55<3:52:50, 6.21s/it] {'loss': 0.462, 'learning_rate': 6.962182270548803e-06, 'epoch': 0.61} + 61%|██████ | 3523/5772 [2:19:57<3:52:49, 6.21s/it] {'loss': 0.462, 'learning_rate': 6.962182270548803e-06, 'epoch': 0.61} + 61%|██████ | 3523/5772 [2:19:55<3:52:50, 6.21s/it] 61%|██████ | 3524/5772 [2:20:01<3:50:51, 6.16s/it] 61%|██████ | 3524/5772 [2:20:03<3:50:51, 6.16s/it] {'loss': 0.4608, 'learning_rate': 6.9568359715307265e-06, 'epoch': 0.61} + 61%|██████ | 3524/5772 [2:20:03<3:50:51, 6.16s/it] {'loss': 0.4608, 'learning_rate': 6.9568359715307265e-06, 'epoch': 0.61} + 61%|██████ | 3524/5772 [2:20:01<3:50:51, 6.16s/it] 61%|██████ | 3525/5772 [2:20:07<3:49:46, 6.14s/it] 61%|██████ | 3525/5772 [2:20:09<3:49:45, 6.14s/it] {'loss': 0.4685, 'learning_rate': 6.951490630940241e-06, 'epoch': 0.61} + 61%|██████ | 3525/5772 [2:20:09<3:49:45, 6.14s/it] {'loss': 0.4685, 'learning_rate': 6.951490630940241e-06, 'epoch': 0.61} + 61%|██████ | 3525/5772 [2:20:07<3:49:46, 6.14s/it] 61%|██████ | 3526/5772 [2:20:13<3:46:14, 6.04s/it] 61%|██████ | 3526/5772 [2:20:15<3:46:14, 6.04s/it] {'loss': 0.4449, 'learning_rate': 6.9461462504608335e-06, 'epoch': 0.61} + 61%|██████ | 3526/5772 [2:20:15<3:46:14, 6.04s/it] {'loss': 0.4449, 'learning_rate': 6.9461462504608335e-06, 'epoch': 0.61} + 61%|██████ | 3526/5772 [2:20:13<3:46:14, 6.04s/it] 61%|██████ | 3527/5772 [2:20:19<3:45:02, 6.01s/it] 61%|██████ | 3527/5772 [2:20:21<3:45:02, 6.01s/it] {'loss': 0.4617, 'learning_rate': 6.94080283177568e-06, 'epoch': 0.61} + 61%|██████ | 3527/5772 [2:20:21<3:45:02, 6.01s/it] {'loss': 0.4617, 'learning_rate': 6.94080283177568e-06, 'epoch': 0.61} + 61%|██████ | 3527/5772 [2:20:19<3:45:02, 6.01s/it] 61%|██████ | 3528/5772 [2:20:25<3:44:25, 6.00s/it] 61%|██████ | 3528/5772 [2:20:27<3:44:25, 6.00s/it] {'loss': 0.484, 'learning_rate': 6.935460376567673e-06, 'epoch': 0.61} + 61%|██████ | 3528/5772 [2:20:27<3:44:25, 6.00s/it] {'loss': 0.484, 'learning_rate': 6.935460376567673e-06, 'epoch': 0.61} + 61%|██████ | 3528/5772 [2:20:25<3:44:25, 6.00s/it] 61%|██████ | 3529/5772 [2:20:33<3:46:24, 6.06s/it] 61%|██████ | 3529/5772 [2:20:31<3:46:24, 6.06s/it] {'loss': 0.4661, 'learning_rate': 6.930118886519374e-06, 'epoch': 0.61} + 61%|██████ | 3529/5772 [2:20:33<3:46:24, 6.06s/it] {'loss': 0.4661, 'learning_rate': 6.930118886519374e-06, 'epoch': 0.61} + 61%|██████ | 3529/5772 [2:20:31<3:46:24, 6.06s/it] 61%|██████ | 3530/5772 [2:20:37<3:46:19, 6.06s/it] 61%|██████ | 3530/5772 [2:20:39<3:46:19, 6.06s/it] {'loss': 0.4587, 'learning_rate': 6.924778363313071e-06, 'epoch': 0.61} + 61%|██████ | 3530/5772 [2:20:39<3:46:19, 6.06s/it] {'loss': 0.4587, 'learning_rate': 6.924778363313071e-06, 'epoch': 0.61} + 61%|██████ | 3530/5772 [2:20:37<3:46:19, 6.06s/it] 61%|██████ | 3531/5772 [2:20:43<3:45:01, 6.02s/it] 61%|██████ | 3531/5772 [2:20:45<3:45:01, 6.02s/it] {'loss': 0.4628, 'learning_rate': 6.919438808630716e-06, 'epoch': 0.61} + 61%|██████ | 3531/5772 [2:20:45<3:45:01, 6.02s/it] {'loss': 0.4628, 'learning_rate': 6.919438808630716e-06, 'epoch': 0.61} + 61%|██████ | 3531/5772 [2:20:43<3:45:01, 6.02s/it] 61%|██████ | 3532/5772 [2:20:50<3:46:43, 6.07s/it] 61%|██████ | 3532/5772 [2:20:52<3:46:43, 6.07s/it] {'loss': 0.471, 'learning_rate': 6.914100224153983e-06, 'epoch': 0.61} + 61%|██████ | 3532/5772 [2:20:52<3:46:43, 6.07s/it] {'loss': 0.471, 'learning_rate': 6.914100224153983e-06, 'epoch': 0.61} + 61%|██████ | 3532/5772 [2:20:50<3:46:43, 6.07s/it] 61%|██████ | 3533/5772 [2:20:56<3:47:04, 6.08s/it] 61%|██████ | 3533/5772 [2:20:58<3:47:03, 6.08s/it] {'loss': 0.4574, 'learning_rate': 6.90876261156423e-06, 'epoch': 0.61} + 61%|██████ | 3533/5772 [2:20:58<3:47:03, 6.08s/it] {'loss': 0.4574, 'learning_rate': 6.90876261156423e-06, 'epoch': 0.61} + 61%|██████ | 3533/5772 [2:20:56<3:47:04, 6.08s/it] 61%|██████ | 3534/5772 [2:21:02<3:44:54, 6.03s/it] 61%|██████ | 3534/5772 [2:21:04<3:44:53, 6.03s/it] {'loss': 0.465, 'learning_rate': 6.903425972542501e-06, 'epoch': 0.61} + 61%|██████ | 3534/5772 [2:21:04<3:44:53, 6.03s/it] {'loss': 0.465, 'learning_rate': 6.903425972542501e-06, 'epoch': 0.61} + 61%|██████ | 3534/5772 [2:21:02<3:44:54, 6.03s/it] 61%|██████ | 3535/5772 [2:21:08<3:47:55, 6.11s/it] 61%|██████ | 3535/5772 [2:21:10<3:47:55, 6.11s/it] {'loss': 0.4648, 'learning_rate': 6.898090308769548e-06, 'epoch': 0.61} + 61%|██████ | 3535/5772 [2:21:10<3:47:55, 6.11s/it] {'loss': 0.4648, 'learning_rate': 6.898090308769548e-06, 'epoch': 0.61} + 61%|██████ | 3535/5772 [2:21:08<3:47:55, 6.11s/it] 61%|██████▏ | 3536/5772 [2:21:14<3:47:45, 6.11s/it] 61%|██████▏ | 3536/5772 [2:21:16<3:47:45, 6.11s/it] {'loss': 0.4702, 'learning_rate': 6.892755621925804e-06, 'epoch': 0.61} + 61%|██████▏ | 3536/5772 [2:21:16<3:47:45, 6.11s/it] {'loss': 0.4702, 'learning_rate': 6.892755621925804e-06, 'epoch': 0.61} + 61%|██████▏ | 3536/5772 [2:21:14<3:47:45, 6.11s/it] 61%|██████▏ | 3537/5772 [2:21:22<3:48:24, 6.13s/it] 61%|██████▏ | 3537/5772 [2:21:20<3:48:24, 6.13s/it] {'loss': 0.4554, 'learning_rate': 6.887421913691402e-06, 'epoch': 0.61} + 61%|██████▏ | 3537/5772 [2:21:22<3:48:24, 6.13s/it] {'loss': 0.4554, 'learning_rate': 6.887421913691402e-06, 'epoch': 0.61} + 61%|██████▏ | 3537/5772 [2:21:20<3:48:24, 6.13s/it] 61%|██████▏ | 3538/5772 [2:21:26<3:48:49, 6.15s/it] 61%|██████▏ | 3538/5772 [2:21:28<3:48:49, 6.15s/it] {'loss': 0.4733, 'learning_rate': 6.882089185746158e-06, 'epoch': 0.61} + 61%|██████▏ | 3538/5772 [2:21:28<3:48:49, 6.15s/it] {'loss': 0.4733, 'learning_rate': 6.882089185746158e-06, 'epoch': 0.61} + 61%|██████▏ | 3538/5772 [2:21:26<3:48:49, 6.15s/it] 61%|██████▏ | 3539/5772 [2:21:34<3:48:03, 6.13s/it] 61%|██████▏ | 3539/5772 [2:21:32<3:48:03, 6.13s/it] {'loss': 0.4595, 'learning_rate': 6.876757439769592e-06, 'epoch': 0.61} + 61%|██████▏ | 3539/5772 [2:21:34<3:48:03, 6.13s/it] {'loss': 0.4595, 'learning_rate': 6.876757439769592e-06, 'epoch': 0.61} + 61%|██████▏ | 3539/5772 [2:21:32<3:48:03, 6.13s/it] 61%|██████▏ | 3540/5772 [2:21:40<3:45:40, 6.07s/it] 61%|██████▏ | 3540/5772 [2:21:38<3:45:40, 6.07s/it] {'loss': 0.4607, 'learning_rate': 6.871426677440907e-06, 'epoch': 0.61} + 61%|██████▏ | 3540/5772 [2:21:40<3:45:40, 6.07s/it] {'loss': 0.4607, 'learning_rate': 6.871426677440907e-06, 'epoch': 0.61} + 61%|██████▏ | 3540/5772 [2:21:38<3:45:40, 6.07s/it] 61%|██████▏ | 3541/5772 [2:21:45<3:47:25, 6.12s/it] 61%|██████▏ | 3541/5772 [2:21:47<3:47:25, 6.12s/it] {'loss': 0.4549, 'learning_rate': 6.866096900438992e-06, 'epoch': 0.61} + 61%|██████▏ | 3541/5772 [2:21:47<3:47:25, 6.12s/it] {'loss': 0.4549, 'learning_rate': 6.866096900438992e-06, 'epoch': 0.61} + 61%|██████▏ | 3541/5772 [2:21:45<3:47:25, 6.12s/it] 61%|██████▏ | 3542/5772 [2:21:51<3:47:46, 6.13s/it] 61%|██████▏ | 3542/5772 [2:21:53<3:47:46, 6.13s/it] {'loss': 0.4729, 'learning_rate': 6.860768110442438e-06, 'epoch': 0.61} + 61%|██████▏ | 3542/5772 [2:21:53<3:47:46, 6.13s/it] {'loss': 0.4729, 'learning_rate': 6.860768110442438e-06, 'epoch': 0.61} + 61%|██████▏ | 3542/5772 [2:21:51<3:47:46, 6.13s/it] 61%|██████▏ | 3543/5772 [2:21:57<3:48:47, 6.16s/it] 61%|██████▏ | 3543/5772 [2:21:59<3:48:47, 6.16s/it] {'loss': 0.4623, 'learning_rate': 6.855440309129509e-06, 'epoch': 0.61} + 61%|██████▏ | 3543/5772 [2:21:59<3:48:47, 6.16s/it] {'loss': 0.4623, 'learning_rate': 6.855440309129509e-06, 'epoch': 0.61} + 61%|██████▏ | 3543/5772 [2:21:57<3:48:47, 6.16s/it] 61%|██████▏ | 3544/5772 [2:22:03<3:44:05, 6.03s/it] 61%|██████▏ | 3544/5772 [2:22:05<3:44:05, 6.03s/it] {'loss': 0.4746, 'learning_rate': 6.850113498178173e-06, 'epoch': 0.61} + 61%|██████▏ | 3544/5772 [2:22:05<3:44:05, 6.03s/it] {'loss': 0.4746, 'learning_rate': 6.850113498178173e-06, 'epoch': 0.61} + 61%|██████▏ | 3544/5772 [2:22:03<3:44:05, 6.03s/it] 61%|██████▏ | 3545/5772 [2:22:11<3:42:39, 6.00s/it] 61%|██████▏ | 3545/5772 [2:22:09<3:42:40, 6.00s/it] {'loss': 0.4501, 'learning_rate': 6.844787679266076e-06, 'epoch': 0.61} + 61%|██████▏ | 3545/5772 [2:22:11<3:42:39, 6.00s/it] {'loss': 0.4501, 'learning_rate': 6.844787679266076e-06, 'epoch': 0.61} + 61%|██████▏ | 3545/5772 [2:22:09<3:42:40, 6.00s/it] 61%|██████▏ | 3546/5772 [2:22:17<3:45:31, 6.08s/it] 61%|██████▏ | 3546/5772 [2:22:15<3:45:31, 6.08s/it] {'loss': 0.4743, 'learning_rate': 6.839462854070554e-06, 'epoch': 0.61} + 61%|██████▏ | 3546/5772 [2:22:17<3:45:31, 6.08s/it] {'loss': 0.4743, 'learning_rate': 6.839462854070554e-06, 'epoch': 0.61} + 61%|██████▏ | 3546/5772 [2:22:15<3:45:31, 6.08s/it] 61%|██████▏ | 3547/5772 [2:22:21<3:46:43, 6.11s/it] 61%|██████▏ | 3547/5772 [2:22:23<3:46:44, 6.11s/it] {'loss': 0.4688, 'learning_rate': 6.834139024268638e-06, 'epoch': 0.61} + 61%|██████▏ | 3547/5772 [2:22:23<3:46:44, 6.11s/it] {'loss': 0.4688, 'learning_rate': 6.834139024268638e-06, 'epoch': 0.61} + 61%|██████▏ | 3547/5772 [2:22:21<3:46:43, 6.11s/it] 61%|██████▏ | 3548/5772 [2:22:27<3:45:47, 6.09s/it] 61%|██████▏ | 3548/5772 [2:22:29<3:45:47, 6.09s/it] {'loss': 0.4658, 'learning_rate': 6.828816191537032e-06, 'epoch': 0.61} + 61%|██████▏ | 3548/5772 [2:22:29<3:45:47, 6.09s/it] {'loss': 0.4658, 'learning_rate': 6.828816191537032e-06, 'epoch': 0.61} + 61%|██████▏ | 3548/5772 [2:22:27<3:45:47, 6.09s/it] 61%|██████▏ | 3549/5772 [2:22:33<3:46:21, 6.11s/it] 61%|██████▏ | 3549/5772 [2:22:35<3:46:21, 6.11s/it] {'loss': 0.4566, 'learning_rate': 6.8234943575521365e-06, 'epoch': 0.61} + 61%|██████▏ | 3549/5772 [2:22:35<3:46:21, 6.11s/it] {'loss': 0.4566, 'learning_rate': 6.8234943575521365e-06, 'epoch': 0.61} + 61%|██████▏ | 3549/5772 [2:22:33<3:46:21, 6.11s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +30 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...2 AutoResumeHook: Checking whether to suspend... + +7 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 62%|██████▏ | 3550/5772 [2:22:40<3:51:06, 6.24s/it]4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 62%|██████▏ | 3550/5772 [2:22:42<3:51:07, 6.24s/it]5 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4841, 'learning_rate': 6.818173523990029e-06, 'epoch': 0.61} + 62%|██████▏ | 3550/5772 [2:22:42<3:51:07, 6.24s/it] {'loss': 0.4841, 'learning_rate': 6.818173523990029e-06, 'epoch': 0.61} + 62%|██████▏ | 3550/5772 [2:22:40<3:51:06, 6.24s/it] 62%|██████▏ | 3551/5772 [2:22:46<3:50:25, 6.22s/it] 62%|██████▏ | 3551/5772 [2:22:48<3:50:25, 6.22s/it] {'loss': 0.4527, 'learning_rate': 6.812853692526482e-06, 'epoch': 0.62} + 62%|██████▏ | 3551/5772 [2:22:48<3:50:25, 6.22s/it] {'loss': 0.4527, 'learning_rate': 6.812853692526482e-06, 'epoch': 0.62} + 62%|██████▏ | 3551/5772 [2:22:46<3:50:25, 6.22s/it] 62%|██████▏ | 3552/5772 [2:22:52<3:50:48, 6.24s/it] 62%|██████▏ | 3552/5772 [2:22:54<3:50:48, 6.24s/it] {'loss': 0.4757, 'learning_rate': 6.807534864836942e-06, 'epoch': 0.62} + 62%|██████▏ | 3552/5772 [2:22:54<3:50:48, 6.24s/it] {'loss': 0.4757, 'learning_rate': 6.807534864836942e-06, 'epoch': 0.62} + 62%|██████▏ | 3552/5772 [2:22:52<3:50:48, 6.24s/it] 62%|██████▏ | 3553/5772 [2:23:00<3:47:43, 6.16s/it] 62%|██████▏ | 3553/5772 [2:22:58<3:47:43, 6.16s/it] {'loss': 0.4675, 'learning_rate': 6.802217042596544e-06, 'epoch': 0.62} + 62%|██████▏ | 3553/5772 [2:23:00<3:47:43, 6.16s/it] {'loss': 0.4675, 'learning_rate': 6.802217042596544e-06, 'epoch': 0.62} + 62%|██████▏ | 3553/5772 [2:22:58<3:47:43, 6.16s/it] 62%|██████▏ | 3554/5772 [2:23:04<3:44:26, 6.07s/it] 62%|██████▏ | 3554/5772 [2:23:06<3:44:26, 6.07s/it] {'loss': 0.4678, 'learning_rate': 6.7969002274801145e-06, 'epoch': 0.62} + 62%|██████▏ | 3554/5772 [2:23:06<3:44:26, 6.07s/it] {'loss': 0.4678, 'learning_rate': 6.7969002274801145e-06, 'epoch': 0.62} + 62%|██████▏ | 3554/5772 [2:23:04<3:44:26, 6.07s/it] 62%|██████▏ | 3555/5772 [2:23:10<3:43:35, 6.05s/it] 62%|██████▏ | 3555/5772 [2:23:12<3:43:35, 6.05s/it] {'loss': 0.4627, 'learning_rate': 6.791584421162143e-06, 'epoch': 0.62} + 62%|██████▏ | 3555/5772 [2:23:12<3:43:35, 6.05s/it] {'loss': 0.4627, 'learning_rate': 6.791584421162143e-06, 'epoch': 0.62} + 62%|██████▏ | 3555/5772 [2:23:10<3:43:35, 6.05s/it] 62%|██████▏ | 3556/5772 [2:23:16<3:45:51, 6.12s/it] 62%|██████▏ | 3556/5772 [2:23:18<3:45:50, 6.12s/it] {'loss': 0.4719, 'learning_rate': 6.7862696253168225e-06, 'epoch': 0.62} + 62%|██████▏ | 3556/5772 [2:23:18<3:45:50, 6.12s/it] {'loss': 0.4719, 'learning_rate': 6.7862696253168225e-06, 'epoch': 0.62} + 62%|██████▏ | 3556/5772 [2:23:16<3:45:51, 6.12s/it] 62%|██████▏ | 3557/5772 [2:23:23<3:51:58, 6.28s/it] 62%|██████▏ | 3557/5772 [2:23:25<3:51:58, 6.28s/it] {'loss': 0.4641, 'learning_rate': 6.780955841618013e-06, 'epoch': 0.62} + 62%|██████▏ | 3557/5772 [2:23:25<3:51:58, 6.28s/it] {'loss': 0.4641, 'learning_rate': 6.780955841618013e-06, 'epoch': 0.62} + 62%|██████▏ | 3557/5772 [2:23:23<3:51:58, 6.28s/it] 62%|██████▏ | 3558/5772 [2:23:29<3:49:09, 6.21s/it] 62%|██████▏ | 3558/5772 [2:23:31<3:49:09, 6.21s/it] {'loss': 0.4674, 'learning_rate': 6.775643071739267e-06, 'epoch': 0.62} + 62%|██████▏ | 3558/5772 [2:23:31<3:49:09, 6.21s/it] {'loss': 0.4674, 'learning_rate': 6.775643071739267e-06, 'epoch': 0.62} + 62%|██████▏ | 3558/5772 [2:23:29<3:49:09, 6.21s/it] 62%|██████▏ | 3559/5772 [2:23:35<3:48:34, 6.20s/it] 62%|██████▏ | 3559/5772 [2:23:37<3:48:35, 6.20s/it] {'loss': 0.4595, 'learning_rate': 6.770331317353804e-06, 'epoch': 0.62} + 62%|██████▏ | 3559/5772 [2:23:37<3:48:35, 6.20s/it] {'loss': 0.4595, 'learning_rate': 6.770331317353804e-06, 'epoch': 0.62} + 62%|██████▏ | 3559/5772 [2:23:35<3:48:34, 6.20s/it] 62%|██████▏ | 3560/5772 [2:23:41<3:47:37, 6.17s/it] 62%|██████▏ | 3560/5772 [2:23:43<3:47:37, 6.17s/it] {'loss': 0.4496, 'learning_rate': 6.765020580134538e-06, 'epoch': 0.62} + 62%|██████▏ | 3560/5772 [2:23:43<3:47:37, 6.17s/it] {'loss': 0.4496, 'learning_rate': 6.765020580134538e-06, 'epoch': 0.62} + 62%|██████▏ | 3560/5772 [2:23:41<3:47:37, 6.17s/it] 62%|██████▏ | 3561/5772 [2:23:48<3:49:04, 6.22s/it] 62%|██████▏ | 3561/5772 [2:23:50<3:49:04, 6.22s/it] {'loss': 0.4621, 'learning_rate': 6.759710861754054e-06, 'epoch': 0.62} + 62%|██████▏ | 3561/5772 [2:23:50<3:49:04, 6.22s/it] {'loss': 0.4621, 'learning_rate': 6.759710861754054e-06, 'epoch': 0.62} + 62%|██████▏ | 3561/5772 [2:23:48<3:49:04, 6.22s/it] 62%|██████▏ | 3562/5772 [2:23:56<3:44:56, 6.11s/it] 62%|██████▏ | 3562/5772 [2:23:54<3:44:56, 6.11s/it] {'loss': 0.4679, 'learning_rate': 6.7544021638846145e-06, 'epoch': 0.62} + 62%|██████▏ | 3562/5772 [2:23:56<3:44:56, 6.11s/it] {'loss': 0.4679, 'learning_rate': 6.7544021638846145e-06, 'epoch': 0.62} + 62%|██████▏ | 3562/5772 [2:23:54<3:44:56, 6.11s/it] 62%|██████▏ | 3563/5772 [2:24:02<3:49:40, 6.24s/it] 62%|██████▏ | 3563/5772 [2:24:00<3:49:40, 6.24s/it] {'loss': 0.4621, 'learning_rate': 6.749094488198173e-06, 'epoch': 0.62} + 62%|██████▏ | 3563/5772 [2:24:02<3:49:40, 6.24s/it] {'loss': 0.4621, 'learning_rate': 6.749094488198173e-06, 'epoch': 0.62} + 62%|██████▏ | 3563/5772 [2:24:00<3:49:40, 6.24s/it] 62%|██████▏ | 3564/5772 [2:24:08<3:45:32, 6.13s/it] 62%|██████▏ | 3564/5772 [2:24:06<3:45:33, 6.13s/it] {'loss': 0.4696, 'learning_rate': 6.743787836366343e-06, 'epoch': 0.62} + 62%|██████▏ | 3564/5772 [2:24:08<3:45:32, 6.13s/it] {'loss': 0.4696, 'learning_rate': 6.743787836366343e-06, 'epoch': 0.62} + 62%|██████▏ | 3564/5772 [2:24:06<3:45:33, 6.13s/it] 62%|██████▏ | 3565/5772 [2:24:12<3:45:42, 6.14s/it] 62%|██████▏ | 3565/5772 [2:24:14<3:45:42, 6.14s/it] {'loss': 0.4578, 'learning_rate': 6.738482210060433e-06, 'epoch': 0.62} + 62%|██████▏ | 3565/5772 [2:24:14<3:45:42, 6.14s/it] {'loss': 0.4578, 'learning_rate': 6.738482210060433e-06, 'epoch': 0.62} + 62%|██████▏ | 3565/5772 [2:24:12<3:45:42, 6.14s/it] 62%|██████▏ | 3566/5772 [2:24:18<3:42:24, 6.05s/it] 62%|██████▏ | 3566/5772 [2:24:20<3:42:24, 6.05s/it] {'loss': 0.4576, 'learning_rate': 6.733177610951414e-06, 'epoch': 0.62} + 62%|██████▏ | 3566/5772 [2:24:20<3:42:24, 6.05s/it] {'loss': 0.4576, 'learning_rate': 6.733177610951414e-06, 'epoch': 0.62} + 62%|██████▏ | 3566/5772 [2:24:18<3:42:24, 6.05s/it] 62%|██████▏ | 3567/5772 [2:24:26<3:40:27, 6.00s/it] 62%|██████▏ | 3567/5772 [2:24:24<3:40:27, 6.00s/it] {'loss': 0.459, 'learning_rate': 6.727874040709943e-06, 'epoch': 0.62} + 62%|██████▏ | 3567/5772 [2:24:26<3:40:27, 6.00s/it] {'loss': 0.459, 'learning_rate': 6.727874040709943e-06, 'epoch': 0.62} + 62%|██████▏ | 3567/5772 [2:24:24<3:40:27, 6.00s/it] 62%|██████▏ | 3568/5772 [2:24:32<3:44:56, 6.12s/it] 62%|██████▏ | 3568/5772 [2:24:30<3:44:56, 6.12s/it] {'loss': 0.47, 'learning_rate': 6.7225715010063516e-06, 'epoch': 0.62} + 62%|██████▏ | 3568/5772 [2:24:32<3:44:56, 6.12s/it] {'loss': 0.47, 'learning_rate': 6.7225715010063516e-06, 'epoch': 0.62} + 62%|██████▏ | 3568/5772 [2:24:30<3:44:56, 6.12s/it] 62%|██████▏ | 3569/5772 [2:24:37<3:47:42, 6.20s/it] 62%|██████▏ | 3569/5772 [2:24:39<3:47:42, 6.20s/it] {'loss': 0.4695, 'learning_rate': 6.717269993510642e-06, 'epoch': 0.62} + 62%|██████▏ | 3569/5772 [2:24:39<3:47:42, 6.20s/it] {'loss': 0.4695, 'learning_rate': 6.717269993510642e-06, 'epoch': 0.62} + 62%|██████▏ | 3569/5772 [2:24:37<3:47:42, 6.20s/it] 62%|██████▏ | 3570/5772 [2:24:45<3:50:20, 6.28s/it] 62%|██████▏ | 3570/5772 [2:24:43<3:50:21, 6.28s/it] {'loss': 0.4753, 'learning_rate': 6.711969519892499e-06, 'epoch': 0.62} + 62%|██████▏ | 3570/5772 [2:24:45<3:50:20, 6.28s/it] {'loss': 0.4753, 'learning_rate': 6.711969519892499e-06, 'epoch': 0.62} + 62%|██████▏ | 3570/5772 [2:24:43<3:50:21, 6.28s/it] 62%|██████▏ | 3571/5772 [2:24:51<3:51:41, 6.32s/it] 62%|██████▏ | 3571/5772 [2:24:50<3:51:41, 6.32s/it] {'loss': 0.4626, 'learning_rate': 6.706670081821267e-06, 'epoch': 0.62} + 62%|██████▏ | 3571/5772 [2:24:51<3:51:41, 6.32s/it] {'loss': 0.4626, 'learning_rate': 6.706670081821267e-06, 'epoch': 0.62} + 62%|██████▏ | 3571/5772 [2:24:50<3:51:41, 6.32s/it] 62%|██████▏ | 3572/5772 [2:24:56<3:55:09, 6.41s/it] 62%|██████▏ | 3572/5772 [2:24:58<3:55:09, 6.41s/it] {'loss': 0.4655, 'learning_rate': 6.70137168096599e-06, 'epoch': 0.62} + 62%|██████▏ | 3572/5772 [2:24:58<3:55:09, 6.41s/it] {'loss': 0.4655, 'learning_rate': 6.70137168096599e-06, 'epoch': 0.62} + 62%|██████▏ | 3572/5772 [2:24:56<3:55:09, 6.41s/it] 62%|██████▏ | 3573/5772 [2:25:02<3:53:55, 6.38s/it] 62%|██████▏ | 3573/5772 [2:25:04<3:53:56, 6.38s/it] {'loss': 0.457, 'learning_rate': 6.696074318995355e-06, 'epoch': 0.62} + 62%|██████▏ | 3573/5772 [2:25:04<3:53:56, 6.38s/it] {'loss': 0.457, 'learning_rate': 6.696074318995355e-06, 'epoch': 0.62} + 62%|██████▏ | 3573/5772 [2:25:02<3:53:55, 6.38s/it] 62%|██████▏ | 3574/5772 [2:25:08<3:46:30, 6.18s/it] 62%|██████▏ | 3574/5772 [2:25:10<3:46:30, 6.18s/it] {'loss': 0.4677, 'learning_rate': 6.690777997577745e-06, 'epoch': 0.62} + 62%|██████▏ | 3574/5772 [2:25:10<3:46:30, 6.18s/it] {'loss': 0.4677, 'learning_rate': 6.690777997577745e-06, 'epoch': 0.62} + 62%|██████▏ | 3574/5772 [2:25:08<3:46:30, 6.18s/it] 62%|██████▏ | 3575/5772 [2:25:15<3:49:01, 6.25s/it] 62%|██████▏ | 3575/5772 [2:25:17<3:49:02, 6.25s/it] {'loss': 0.4578, 'learning_rate': 6.685482718381209e-06, 'epoch': 0.62} + 62%|██████▏ | 3575/5772 [2:25:17<3:49:02, 6.25s/it] {'loss': 0.4578, 'learning_rate': 6.685482718381209e-06, 'epoch': 0.62} + 62%|██████▏ | 3575/5772 [2:25:15<3:49:01, 6.25s/it] 62%|██████▏ | 3576/5772 [2:25:21<3:45:34, 6.16s/it] 62%|██████▏ | 3576/5772 [2:25:23<3:45:34, 6.16s/it] {'loss': 0.4644, 'learning_rate': 6.680188483073458e-06, 'epoch': 0.62} + 62%|██████▏ | 3576/5772 [2:25:23<3:45:34, 6.16s/it] {'loss': 0.4644, 'learning_rate': 6.680188483073458e-06, 'epoch': 0.62} + 62%|██████▏ | 3576/5772 [2:25:21<3:45:34, 6.16s/it] 62%|██████▏ | 3577/5772 [2:25:27<3:45:36, 6.17s/it] 62%|██████▏ | 3577/5772 [2:25:29<3:45:36, 6.17s/it] {'loss': 0.4594, 'learning_rate': 6.6748952933218895e-06, 'epoch': 0.62} + 62%|██████▏ | 3577/5772 [2:25:29<3:45:36, 6.17s/it] {'loss': 0.4594, 'learning_rate': 6.6748952933218895e-06, 'epoch': 0.62} + 62%|██████▏ | 3577/5772 [2:25:27<3:45:36, 6.17s/it] 62%|██████▏ | 3578/5772 [2:25:33<3:42:43, 6.09s/it] 62%|██████▏ | 3578/5772 [2:25:35<3:42:43, 6.09s/it] {'loss': 0.4721, 'learning_rate': 6.6696031507935575e-06, 'epoch': 0.62} + 62%|██████▏ | 3578/5772 [2:25:35<3:42:43, 6.09s/it] {'loss': 0.4721, 'learning_rate': 6.6696031507935575e-06, 'epoch': 0.62} + 62%|██████▏ | 3578/5772 [2:25:33<3:42:43, 6.09s/it] 62%|██████▏ | 3579/5772 [2:25:41<3:44:47, 6.15s/it] 62%|██████▏ | 3579/5772 [2:25:39<3:44:47, 6.15s/it] {'loss': 0.4788, 'learning_rate': 6.664312057155199e-06, 'epoch': 0.62} + 62%|██████▏ | 3579/5772 [2:25:41<3:44:47, 6.15s/it] {'loss': 0.4788, 'learning_rate': 6.664312057155199e-06, 'epoch': 0.62} + 62%|██████▏ | 3579/5772 [2:25:39<3:44:47, 6.15s/it] 62%|██████▏ | 3580/5772 [2:25:45<3:46:13, 6.19s/it] 62%|██████▏ | 3580/5772 [2:25:47<3:46:14, 6.19s/it] {'loss': 0.4659, 'learning_rate': 6.659022014073209e-06, 'epoch': 0.62} + 62%|██████▏ | 3580/5772 [2:25:47<3:46:14, 6.19s/it] {'loss': 0.4659, 'learning_rate': 6.659022014073209e-06, 'epoch': 0.62} + 62%|██████▏ | 3580/5772 [2:25:45<3:46:13, 6.19s/it] 62%|██████▏ | 3581/5772 [2:25:51<3:45:10, 6.17s/it] 62%|██████▏ | 3581/5772 [2:25:53<3:45:11, 6.17s/it] {'loss': 0.4614, 'learning_rate': 6.653733023213658e-06, 'epoch': 0.62} + 62%|██████▏ | 3581/5772 [2:25:53<3:45:11, 6.17s/it] {'loss': 0.4614, 'learning_rate': 6.653733023213658e-06, 'epoch': 0.62} + 62%|██████▏ | 3581/5772 [2:25:51<3:45:10, 6.17s/it] 62%|██████▏ | 3582/5772 [2:25:58<3:46:19, 6.20s/it] 62%|██████▏ | 3582/5772 [2:26:00<3:46:19, 6.20s/it] {'loss': 0.4639, 'learning_rate': 6.64844508624229e-06, 'epoch': 0.62} + 62%|██████▏ | 3582/5772 [2:26:00<3:46:19, 6.20s/it] {'loss': 0.4639, 'learning_rate': 6.64844508624229e-06, 'epoch': 0.62} + 62%|██████▏ | 3582/5772 [2:25:58<3:46:19, 6.20s/it] 62%|██████▏ | 3583/5772 [2:26:04<3:45:56, 6.19s/it] 62%|██████▏ | 3583/5772 [2:26:06<3:45:56, 6.19s/it] {'loss': 0.4665, 'learning_rate': 6.643158204824506e-06, 'epoch': 0.62} + 62%|██████▏ | 3583/5772 [2:26:06<3:45:56, 6.19s/it] {'loss': 0.4665, 'learning_rate': 6.643158204824506e-06, 'epoch': 0.62} + 62%|██████▏ | 3583/5772 [2:26:04<3:45:56, 6.19s/it] 62%|██████▏ | 3584/5772 [2:26:12<3:45:04, 6.17s/it] 62%|██████▏ | 3584/5772 [2:26:10<3:45:05, 6.17s/it] {'loss': 0.4654, 'learning_rate': 6.637872380625383e-06, 'epoch': 0.62} + 62%|██████▏ | 3584/5772 [2:26:12<3:45:04, 6.17s/it] {'loss': 0.4654, 'learning_rate': 6.637872380625383e-06, 'epoch': 0.62} + 62%|██████▏ | 3584/5772 [2:26:10<3:45:05, 6.17s/it] 62%|██████▏ | 3585/5772 [2:26:16<3:44:57, 6.17s/it] 62%|██████▏ | 3585/5772 [2:26:18<3:44:57, 6.17s/it] {'loss': 0.4663, 'learning_rate': 6.632587615309658e-06, 'epoch': 0.62} + 62%|██████▏ | 3585/5772 [2:26:18<3:44:57, 6.17s/it] {'loss': 0.4663, 'learning_rate': 6.632587615309658e-06, 'epoch': 0.62} + 62%|██████▏ | 3585/5772 [2:26:16<3:44:57, 6.17s/it] 62%|██████▏ | 3586/5772 [2:26:22<3:42:22, 6.10s/it] 62%|██████▏ | 3586/5772 [2:26:24<3:42:22, 6.10s/it] {'loss': 0.4588, 'learning_rate': 6.627303910541743e-06, 'epoch': 0.62} + 62%|██████▏ | 3586/5772 [2:26:24<3:42:22, 6.10s/it] {'loss': 0.4588, 'learning_rate': 6.627303910541743e-06, 'epoch': 0.62} + 62%|██████▏ | 3586/5772 [2:26:22<3:42:22, 6.10s/it] 62%|██████▏ | 3587/5772 [2:26:29<3:48:59, 6.29s/it] 62%|██████▏ | 3587/5772 [2:26:31<3:48:59, 6.29s/it] {'loss': 0.4555, 'learning_rate': 6.622021267985705e-06, 'epoch': 0.62} + 62%|██████▏ | 3587/5772 [2:26:31<3:48:59, 6.29s/it] {'loss': 0.4555, 'learning_rate': 6.622021267985705e-06, 'epoch': 0.62} + 62%|██████▏ | 3587/5772 [2:26:29<3:48:59, 6.29s/it] 62%|██████▏ | 3588/5772 [2:26:35<3:47:53, 6.26s/it] 62%|██████▏ | 3588/5772 [2:26:37<3:47:53, 6.26s/it] {'loss': 0.4616, 'learning_rate': 6.616739689305287e-06, 'epoch': 0.62} + 62%|██████▏ | 3588/5772 [2:26:37<3:47:53, 6.26s/it] {'loss': 0.4616, 'learning_rate': 6.616739689305287e-06, 'epoch': 0.62} + 62%|██████▏ | 3588/5772 [2:26:35<3:47:53, 6.26s/it] 62%|██████▏ | 3589/5772 [2:26:41<3:47:19, 6.25s/it] 62%|██████▏ | 3589/5772 [2:26:43<3:47:19, 6.25s/it] {'loss': 0.4486, 'learning_rate': 6.6114591761638995e-06, 'epoch': 0.62} + 62%|██████▏ | 3589/5772 [2:26:43<3:47:19, 6.25s/it] {'loss': 0.4486, 'learning_rate': 6.6114591761638995e-06, 'epoch': 0.62} + 62%|██████▏ | 3589/5772 [2:26:41<3:47:19, 6.25s/it] 62%|██████▏ | 3590/5772 [2:26:47<3:44:18, 6.17s/it] 62%|██████▏ | 3590/5772 [2:26:49<3:44:18, 6.17s/it] {'loss': 0.4681, 'learning_rate': 6.606179730224598e-06, 'epoch': 0.62} + 62%|██████▏ | 3590/5772 [2:26:49<3:44:18, 6.17s/it] {'loss': 0.4681, 'learning_rate': 6.606179730224598e-06, 'epoch': 0.62} + 62%|██████▏ | 3590/5772 [2:26:47<3:44:18, 6.17s/it] 62%|██████▏ | 3591/5772 [2:26:55<3:45:12, 6.20s/it] 62%|██████▏ | 3591/5772 [2:26:53<3:45:13, 6.20s/it] {'loss': 0.468, 'learning_rate': 6.600901353150123e-06, 'epoch': 0.62} + 62%|██████▏ | 3591/5772 [2:26:55<3:45:12, 6.20s/it] {'loss': 0.468, 'learning_rate': 6.600901353150123e-06, 'epoch': 0.62} + 62%|██████▏ | 3591/5772 [2:26:53<3:45:13, 6.20s/it] 62%|██████▏ | 3592/5772 [2:27:02<3:45:35, 6.21s/it] 62%|██████▏ | 3592/5772 [2:27:00<3:45:35, 6.21s/it] {'loss': 0.4731, 'learning_rate': 6.595624046602867e-06, 'epoch': 0.62} + 62%|██████▏ | 3592/5772 [2:27:02<3:45:35, 6.21s/it] {'loss': 0.4731, 'learning_rate': 6.595624046602867e-06, 'epoch': 0.62} + 62%|██████▏ | 3592/5772 [2:27:00<3:45:35, 6.21s/it] 62%|██████▏ | 3593/5772 [2:27:06<3:44:06, 6.17s/it] 62%|██████▏ | 3593/5772 [2:27:08<3:44:06, 6.17s/it] {'loss': 0.453, 'learning_rate': 6.59034781224489e-06, 'epoch': 0.62} + 62%|██████▏ | 3593/5772 [2:27:08<3:44:06, 6.17s/it] {'loss': 0.453, 'learning_rate': 6.59034781224489e-06, 'epoch': 0.62} + 62%|██████▏ | 3593/5772 [2:27:06<3:44:06, 6.17s/it] 62%|██████▏ | 3594/5772 [2:27:14<3:43:10, 6.15s/it] 62%|██████▏ | 3594/5772 [2:27:12<3:43:10, 6.15s/it] {'loss': 0.4699, 'learning_rate': 6.585072651737911e-06, 'epoch': 0.62} + 62%|██████▏ | 3594/5772 [2:27:14<3:43:10, 6.15s/it] {'loss': 0.4699, 'learning_rate': 6.585072651737911e-06, 'epoch': 0.62} + 62%|██████▏ | 3594/5772 [2:27:12<3:43:10, 6.15s/it] 62%|██████▏ | 3595/5772 [2:27:20<3:42:05, 6.12s/it] 62%|██████▏ | 3595/5772 [2:27:18<3:42:05, 6.12s/it] {'loss': 0.4721, 'learning_rate': 6.579798566743314e-06, 'epoch': 0.62} + 62%|██████▏ | 3595/5772 [2:27:20<3:42:05, 6.12s/it] {'loss': 0.4721, 'learning_rate': 6.579798566743314e-06, 'epoch': 0.62} + 62%|██████▏ | 3595/5772 [2:27:18<3:42:05, 6.12s/it] 62%|██████▏ | 3596/5772 [2:27:24<3:41:20, 6.10s/it] 62%|██████▏ | 3596/5772 [2:27:26<3:41:20, 6.10s/it] {'loss': 0.4519, 'learning_rate': 6.574525558922142e-06, 'epoch': 0.62} + 62%|██████▏ | 3596/5772 [2:27:26<3:41:20, 6.10s/it] {'loss': 0.4519, 'learning_rate': 6.574525558922142e-06, 'epoch': 0.62} + 62%|██████▏ | 3596/5772 [2:27:24<3:41:20, 6.10s/it] 62%|██████▏ | 3597/5772 [2:27:30<3:37:53, 6.01s/it] 62%|██████▏ | 3597/5772 [2:27:32<3:37:53, 6.01s/it] {'loss': 0.4623, 'learning_rate': 6.5692536299350974e-06, 'epoch': 0.62} + 62%|██████▏ | 3597/5772 [2:27:32<3:37:53, 6.01s/it] {'loss': 0.4623, 'learning_rate': 6.5692536299350974e-06, 'epoch': 0.62} + 62%|██████▏ | 3597/5772 [2:27:30<3:37:53, 6.01s/it] 62%|██████▏ | 3598/5772 [2:27:36<3:39:37, 6.06s/it] 62%|██████▏ | 3598/5772 [2:27:38<3:39:37, 6.06s/it] {'loss': 0.4699, 'learning_rate': 6.563982781442551e-06, 'epoch': 0.62} + 62%|██████▏ | 3598/5772 [2:27:38<3:39:37, 6.06s/it] {'loss': 0.4699, 'learning_rate': 6.563982781442551e-06, 'epoch': 0.62} + 62%|██████▏ | 3598/5772 [2:27:36<3:39:37, 6.06s/it] 62%|██████▏ | 3599/5772 [2:27:42<3:35:45, 5.96s/it] 62%|██████▏ | 3599/5772 [2:27:44<3:35:45, 5.96s/it] {'loss': 0.4544, 'learning_rate': 6.558713015104519e-06, 'epoch': 0.62} + 62%|██████▏ | 3599/5772 [2:27:44<3:35:45, 5.96s/it] {'loss': 0.4544, 'learning_rate': 6.558713015104519e-06, 'epoch': 0.62} + 62%|██████▏ | 3599/5772 [2:27:42<3:35:45, 5.96s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +312 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...4 AutoResumeHook: Checking whether to suspend... + 62%|██████▏ | 3600/5772 [2:27:48<3:39:59, 6.08s/it] + 62%|██████▏ | 3600/5772 [2:27:50<3:39:59, 6.08s/it] {'loss': 0.4631, 'learning_rate': 6.553444332580692e-06, 'epoch': 0.62} + 62%|██████▏ | 3600/5772 [2:27:50<3:39:59, 6.08s/it] {'loss': 0.4631, 'learning_rate': 6.553444332580692e-06, 'epoch': 0.62} + 62%|██████▏ | 3600/5772 [2:27:48<3:39:59, 6.08s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 62%|██████▏ | 3601/5772 [2:28:11<6:22:28, 10.57s/it] 62%|██████▏ | 3601/5772 [2:28:09<6:22:28, 10.57s/it] {'loss': 0.4713, 'learning_rate': 6.54817673553041e-06, 'epoch': 0.62} + 62%|██████▏ | 3601/5772 [2:28:11<6:22:28, 10.57s/it] {'loss': 0.4713, 'learning_rate': 6.54817673553041e-06, 'epoch': 0.62} + 62%|██████▏ | 3601/5772 [2:28:09<6:22:28, 10.57s/it] 62%|██████▏ | 3602/5772 [2:28:17<5:36:54, 9.32s/it] 62%|██████▏ | 3602/5772 [2:28:15<5:36:54, 9.32s/it] {'loss': 0.461, 'learning_rate': 6.54291022561267e-06, 'epoch': 0.62} + 62%|██████▏ | 3602/5772 [2:28:17<5:36:54, 9.32s/it] {'loss': 0.461, 'learning_rate': 6.54291022561267e-06, 'epoch': 0.62} + 62%|██████▏ | 3602/5772 [2:28:15<5:36:54, 9.32s/it] 62%|██████▏ | 3603/5772 [2:28:24<5:02:30, 8.37s/it] 62%|██████▏ | 3603/5772 [2:28:22<5:02:30, 8.37s/it] {'loss': 0.4755, 'learning_rate': 6.537644804486136e-06, 'epoch': 0.62} + 62%|██████▏ | 3603/5772 [2:28:24<5:02:30, 8.37s/it] {'loss': 0.4755, 'learning_rate': 6.537644804486136e-06, 'epoch': 0.62} + 62%|██████▏ | 3603/5772 [2:28:22<5:02:30, 8.37s/it] 62%|██████▏ | 3604/5772 [2:28:30<4:40:09, 7.75s/it] 62%|██████▏ | 3604/5772 [2:28:28<4:40:09, 7.75s/it] {'loss': 0.452, 'learning_rate': 6.532380473809118e-06, 'epoch': 0.62} + 62%|██████▏ | 3604/5772 [2:28:30<4:40:09, 7.75s/it] {'loss': 0.452, 'learning_rate': 6.532380473809118e-06, 'epoch': 0.62} + 62%|██████▏ | 3604/5772 [2:28:28<4:40:09, 7.75s/it] 62%|██████▏ | 3605/5772 [2:28:34<4:24:21, 7.32s/it] 62%|██████▏ | 3605/5772 [2:28:36<4:24:22, 7.32s/it] {'loss': 0.471, 'learning_rate': 6.527117235239591e-06, 'epoch': 0.62} + 62%|██████▏ | 3605/5772 [2:28:36<4:24:22, 7.32s/it] {'loss': 0.471, 'learning_rate': 6.527117235239591e-06, 'epoch': 0.62} + 62%|██████▏ | 3605/5772 [2:28:34<4:24:21, 7.32s/it] 62%|██████▏ | 3606/5772 [2:28:43<4:13:47, 7.03s/it] 62%|██████▏ | 3606/5772 [2:28:41<4:13:47, 7.03s/it] {'loss': 0.4683, 'learning_rate': 6.521855090435178e-06, 'epoch': 0.62} + 62%|██████▏ | 3606/5772 [2:28:43<4:13:47, 7.03s/it] {'loss': 0.4683, 'learning_rate': 6.521855090435178e-06, 'epoch': 0.62} + 62%|██████▏ | 3606/5772 [2:28:41<4:13:47, 7.03s/it] 62%|██████▏ | 3607/5772 [2:28:48<4:01:09, 6.68s/it] 62%|██████▏ | 3607/5772 [2:28:46<4:01:09, 6.68s/it] {'loss': 0.4794, 'learning_rate': 6.516594041053173e-06, 'epoch': 0.62} + 62%|██████▏ | 3607/5772 [2:28:48<4:01:09, 6.68s/it] {'loss': 0.4794, 'learning_rate': 6.516594041053173e-06, 'epoch': 0.62} + 62%|██████▏ | 3607/5772 [2:28:46<4:01:09, 6.68s/it] 63%|██████▎ | 3608/5772 [2:28:53<3:56:17, 6.55s/it] 63%|██████▎ | 3608/5772 [2:28:55<3:56:17, 6.55s/it] {'loss': 0.4724, 'learning_rate': 6.511334088750501e-06, 'epoch': 0.63} + 63%|██████▎ | 3608/5772 [2:28:55<3:56:17, 6.55s/it] {'loss': 0.4724, 'learning_rate': 6.511334088750501e-06, 'epoch': 0.63} + 63%|██████▎ | 3608/5772 [2:28:53<3:56:17, 6.55s/it] 63%|██████▎ | 3609/5772 [2:29:01<3:53:37, 6.48s/it] 63%|██████▎ | 3609/5772 [2:28:59<3:53:37, 6.48s/it] {'loss': 0.4787, 'learning_rate': 6.50607523518376e-06, 'epoch': 0.63} + 63%|██████▎ | 3609/5772 [2:29:01<3:53:37, 6.48s/it] {'loss': 0.4787, 'learning_rate': 6.50607523518376e-06, 'epoch': 0.63} + 63%|██████▎ | 3609/5772 [2:28:59<3:53:37, 6.48s/it] 63%|██████▎ | 3610/5772 [2:29:05<3:51:03, 6.41s/it] 63%|██████▎ | 3610/5772 [2:29:07<3:51:03, 6.41s/it] {'loss': 0.4538, 'learning_rate': 6.500817482009201e-06, 'epoch': 0.63} + 63%|██████▎ | 3610/5772 [2:29:07<3:51:03, 6.41s/it] {'loss': 0.4538, 'learning_rate': 6.500817482009201e-06, 'epoch': 0.63} + 63%|██████▎ | 3610/5772 [2:29:05<3:51:03, 6.41s/it] 63%|██████▎ | 3611/5772 [2:29:11<3:48:04, 6.33s/it] 63%|██████▎ | 3611/5772 [2:29:13<3:48:04, 6.33s/it] {'loss': 0.4778, 'learning_rate': 6.495560830882719e-06, 'epoch': 0.63} + 63%|██████▎ | 3611/5772 [2:29:13<3:48:04, 6.33s/it] {'loss': 0.4778, 'learning_rate': 6.495560830882719e-06, 'epoch': 0.63} + 63%|██████▎ | 3611/5772 [2:29:11<3:48:04, 6.33s/it] 63%|██████▎ | 3612/5772 [2:29:20<3:47:03, 6.31s/it] 63%|██████▎ | 3612/5772 [2:29:18<3:47:03, 6.31s/it] {'loss': 0.4512, 'learning_rate': 6.49030528345987e-06, 'epoch': 0.63} + 63%|██████▎ | 3612/5772 [2:29:20<3:47:03, 6.31s/it] {'loss': 0.4512, 'learning_rate': 6.49030528345987e-06, 'epoch': 0.63} + 63%|██████▎ | 3612/5772 [2:29:18<3:47:03, 6.31s/it] 63%|██████▎ | 3613/5772 [2:29:24<3:45:47, 6.27s/it] 63%|██████▎ | 3613/5772 [2:29:26<3:45:47, 6.27s/it] {'loss': 0.4702, 'learning_rate': 6.4850508413958564e-06, 'epoch': 0.63} + 63%|██████▎ | 3613/5772 [2:29:26<3:45:47, 6.27s/it] {'loss': 0.4702, 'learning_rate': 6.4850508413958564e-06, 'epoch': 0.63} + 63%|██████▎ | 3613/5772 [2:29:24<3:45:47, 6.27s/it] 63%|██████▎ | 3614/5772 [2:29:30<3:41:44, 6.17s/it] 63%|██████▎ | 3614/5772 [2:29:32<3:41:44, 6.17s/it] {'loss': 0.4632, 'learning_rate': 6.479797506345539e-06, 'epoch': 0.63} + 63%|██████▎ | 3614/5772 [2:29:32<3:41:44, 6.17s/it] {'loss': 0.4632, 'learning_rate': 6.479797506345539e-06, 'epoch': 0.63} + 63%|██████▎ | 3614/5772 [2:29:30<3:41:44, 6.17s/it] 63%|██████▎ | 3615/5772 [2:29:36<3:46:08, 6.29s/it] 63%|██████▎ | 3615/5772 [2:29:38<3:46:08, 6.29s/it] {'loss': 0.4605, 'learning_rate': 6.47454527996342e-06, 'epoch': 0.63} + 63%|██████▎ | 3615/5772 [2:29:38<3:46:08, 6.29s/it] {'loss': 0.4605, 'learning_rate': 6.47454527996342e-06, 'epoch': 0.63} + 63%|██████▎ | 3615/5772 [2:29:36<3:46:08, 6.29s/it] 63%|██████▎ | 3616/5772 [2:29:43<3:47:48, 6.34s/it] 63%|██████▎ | 3616/5772 [2:29:45<3:47:49, 6.34s/it] {'loss': 0.4625, 'learning_rate': 6.469294163903666e-06, 'epoch': 0.63} + 63%|██████▎ | 3616/5772 [2:29:45<3:47:49, 6.34s/it] {'loss': 0.4625, 'learning_rate': 6.469294163903666e-06, 'epoch': 0.63} + 63%|██████▎ | 3616/5772 [2:29:43<3:47:48, 6.34s/it] 63%|██████▎ | 3617/5772 [2:29:49<3:44:41, 6.26s/it] 63%|██████▎ | 3617/5772 [2:29:51<3:44:41, 6.26s/it] {'loss': 0.4677, 'learning_rate': 6.464044159820086e-06, 'epoch': 0.63} + 63%|██████▎ | 3617/5772 [2:29:51<3:44:41, 6.26s/it] {'loss': 0.4677, 'learning_rate': 6.464044159820086e-06, 'epoch': 0.63} + 63%|██████▎ | 3617/5772 [2:29:49<3:44:41, 6.26s/it] 63%|██████▎ | 3618/5772 [2:29:55<3:46:01, 6.30s/it] 63%|██████▎ | 3618/5772 [2:29:57<3:46:01, 6.30s/it] {'loss': 0.4603, 'learning_rate': 6.458795269366136e-06, 'epoch': 0.63} + 63%|██████▎ | 3618/5772 [2:29:57<3:46:01, 6.30s/it] {'loss': 0.4603, 'learning_rate': 6.458795269366136e-06, 'epoch': 0.63} + 63%|██████▎ | 3618/5772 [2:29:55<3:46:01, 6.30s/it] 63%|██████▎ | 3619/5772 [2:30:02<3:54:17, 6.53s/it] 63%|██████▎ | 3619/5772 [2:30:04<3:54:16, 6.53s/it] {'loss': 0.4607, 'learning_rate': 6.453547494194929e-06, 'epoch': 0.63} + 63%|██████▎ | 3619/5772 [2:30:04<3:54:16, 6.53s/it] {'loss': 0.4607, 'learning_rate': 6.453547494194929e-06, 'epoch': 0.63} + 63%|██████▎ | 3619/5772 [2:30:02<3:54:17, 6.53s/it] 63%|██████▎ | 3620/5772 [2:30:08<3:49:10, 6.39s/it] 63%|██████▎ | 3620/5772 [2:30:10<3:49:10, 6.39s/it] {'loss': 0.4366, 'learning_rate': 6.448300835959218e-06, 'epoch': 0.63} + 63%|██████▎ | 3620/5772 [2:30:10<3:49:10, 6.39s/it] {'loss': 0.4366, 'learning_rate': 6.448300835959218e-06, 'epoch': 0.63} + 63%|██████▎ | 3620/5772 [2:30:08<3:49:10, 6.39s/it] 63%|██████▎ | 3621/5772 [2:30:14<3:45:09, 6.28s/it] 63%|██████▎ | 3621/5772 [2:30:16<3:45:09, 6.28s/it] {'loss': 0.4668, 'learning_rate': 6.443055296311413e-06, 'epoch': 0.63} + 63%|██████▎ | 3621/5772 [2:30:16<3:45:09, 6.28s/it] {'loss': 0.4668, 'learning_rate': 6.443055296311413e-06, 'epoch': 0.63} + 63%|██████▎ | 3621/5772 [2:30:14<3:45:09, 6.28s/it] 63%|██████▎ | 3622/5772 [2:30:20<3:41:45, 6.19s/it] 63%|██████▎ | 3622/5772 [2:30:22<3:41:45, 6.19s/it] {'loss': 0.452, 'learning_rate': 6.4378108769035644e-06, 'epoch': 0.63} + 63%|██████▎ | 3622/5772 [2:30:22<3:41:45, 6.19s/it] {'loss': 0.452, 'learning_rate': 6.4378108769035644e-06, 'epoch': 0.63} + 63%|██████▎ | 3622/5772 [2:30:20<3:41:45, 6.19s/it] 63%|██████▎ | 3623/5772 [2:30:27<3:42:37, 6.22s/it] 63%|██████▎ | 3623/5772 [2:30:29<3:42:38, 6.22s/it] {'loss': 0.4678, 'learning_rate': 6.432567579387372e-06, 'epoch': 0.63} + 63%|██████▎ | 3623/5772 [2:30:29<3:42:38, 6.22s/it] {'loss': 0.4678, 'learning_rate': 6.432567579387372e-06, 'epoch': 0.63} + 63%|██████▎ | 3623/5772 [2:30:27<3:42:37, 6.22s/it] 63%|██████▎ | 3624/5772 [2:30:33<3:43:58, 6.26s/it] 63%|██████▎ | 3624/5772 [2:30:35<3:43:58, 6.26s/it] {'loss': 0.4571, 'learning_rate': 6.427325405414189e-06, 'epoch': 0.63} + 63%|██████▎ | 3624/5772 [2:30:35<3:43:58, 6.26s/it] {'loss': 0.4571, 'learning_rate': 6.427325405414189e-06, 'epoch': 0.63} + 63%|██████▎ | 3624/5772 [2:30:33<3:43:58, 6.26s/it] 63%|██████▎ | 3625/5772 [2:30:39<3:37:11, 6.07s/it] 63%|██████▎ | 3625/5772 [2:30:41<3:37:11, 6.07s/it] {'loss': 0.4564, 'learning_rate': 6.422084356635003e-06, 'epoch': 0.63} + 63%|██████▎ | 3625/5772 [2:30:41<3:37:11, 6.07s/it] {'loss': 0.4564, 'learning_rate': 6.422084356635003e-06, 'epoch': 0.63} + 63%|██████▎ | 3625/5772 [2:30:39<3:37:11, 6.07s/it] 63%|██████▎ | 3626/5772 [2:30:45<3:36:06, 6.04s/it] 63%|██████▎ | 3626/5772 [2:30:47<3:36:05, 6.04s/it] {'loss': 0.4556, 'learning_rate': 6.41684443470046e-06, 'epoch': 0.63} + 63%|██████▎ | 3626/5772 [2:30:47<3:36:05, 6.04s/it] {'loss': 0.4556, 'learning_rate': 6.41684443470046e-06, 'epoch': 0.63} + 63%|██████▎ | 3626/5772 [2:30:45<3:36:06, 6.04s/it] 63%|██████▎ | 3627/5772 [2:30:51<3:35:29, 6.03s/it] 63%|██████▎ | 3627/5772 [2:30:53<3:35:28, 6.03s/it] {'loss': 0.4651, 'learning_rate': 6.4116056412608355e-06, 'epoch': 0.63} + 63%|██████▎ | 3627/5772 [2:30:53<3:35:28, 6.03s/it] {'loss': 0.4651, 'learning_rate': 6.4116056412608355e-06, 'epoch': 0.63} + 63%|██████▎ | 3627/5772 [2:30:51<3:35:29, 6.03s/it] 63%|██████▎ | 3628/5772 [2:30:57<3:39:41, 6.15s/it] 63%|██████▎ | 3628/5772 [2:30:59<3:39:41, 6.15s/it] {'loss': 0.4597, 'learning_rate': 6.406367977966066e-06, 'epoch': 0.63} + 63%|██████▎ | 3628/5772 [2:30:59<3:39:41, 6.15s/it] {'loss': 0.4597, 'learning_rate': 6.406367977966066e-06, 'epoch': 0.63} + 63%|██████▎ | 3628/5772 [2:30:57<3:39:41, 6.15s/it] 63%|██████▎ | 3629/5772 [2:31:05<3:41:00, 6.19s/it] 63%|██████▎ | 3629/5772 [2:31:03<3:41:01, 6.19s/it] {'loss': 0.4807, 'learning_rate': 6.4011314464657186e-06, 'epoch': 0.63} + 63%|██████▎ | 3629/5772 [2:31:05<3:41:00, 6.19s/it] {'loss': 0.4807, 'learning_rate': 6.4011314464657186e-06, 'epoch': 0.63} + 63%|██████▎ | 3629/5772 [2:31:03<3:41:01, 6.19s/it] 63%|██████▎ | 3630/5772 [2:31:09<3:39:38, 6.15s/it] 63%|██████▎ | 3630/5772 [2:31:11<3:39:38, 6.15s/it] {'loss': 0.4596, 'learning_rate': 6.3958960484090094e-06, 'epoch': 0.63} + 63%|██████▎ | 3630/5772 [2:31:11<3:39:38, 6.15s/it] {'loss': 0.4596, 'learning_rate': 6.3958960484090094e-06, 'epoch': 0.63} + 63%|██████▎ | 3630/5772 [2:31:09<3:39:38, 6.15s/it] 63%|██████▎ | 3631/5772 [2:31:17<3:37:20, 6.09s/it] 63%|██████▎ | 3631/5772 [2:31:15<3:37:20, 6.09s/it] {'loss': 0.4634, 'learning_rate': 6.390661785444809e-06, 'epoch': 0.63} + 63%|██████▎ | 3631/5772 [2:31:17<3:37:20, 6.09s/it] {'loss': 0.4634, 'learning_rate': 6.390661785444809e-06, 'epoch': 0.63} + 63%|██████▎ | 3631/5772 [2:31:15<3:37:20, 6.09s/it] 63%|██████▎ | 3632/5772 [2:31:23<3:33:17, 5.98s/it] 63%|██████▎ | 3632/5772 [2:31:21<3:33:18, 5.98s/it] {'loss': 0.4588, 'learning_rate': 6.385428659221604e-06, 'epoch': 0.63} + 63%|██████▎ | 3632/5772 [2:31:23<3:33:17, 5.98s/it] {'loss': 0.4588, 'learning_rate': 6.385428659221604e-06, 'epoch': 0.63} + 63%|██████▎ | 3632/5772 [2:31:21<3:33:18, 5.98s/it] 63%|██████▎ | 3633/5772 [2:31:29<3:36:54, 6.08s/it] 63%|██████▎ | 3633/5772 [2:31:27<3:36:54, 6.08s/it] {'loss': 0.477, 'learning_rate': 6.38019667138755e-06, 'epoch': 0.63} + 63%|██████▎ | 3633/5772 [2:31:29<3:36:54, 6.08s/it] {'loss': 0.477, 'learning_rate': 6.38019667138755e-06, 'epoch': 0.63} + 63%|██████▎ | 3633/5772 [2:31:27<3:36:54, 6.08s/it] 63%|██████▎ | 3634/5772 [2:31:33<3:36:50, 6.09s/it] 63%|██████▎ | 3634/5772 [2:31:35<3:36:51, 6.09s/it] {'loss': 0.4646, 'learning_rate': 6.374965823590425e-06, 'epoch': 0.63} + 63%|██████▎ | 3634/5772 [2:31:35<3:36:51, 6.09s/it] {'loss': 0.4646, 'learning_rate': 6.374965823590425e-06, 'epoch': 0.63} + 63%|██████▎ | 3634/5772 [2:31:33<3:36:50, 6.09s/it] 63%|██████▎ | 3635/5772 [2:31:40<3:36:27, 6.08s/it] 63%|██████▎ | 3635/5772 [2:31:42<3:36:27, 6.08s/it] {'loss': 0.4688, 'learning_rate': 6.369736117477662e-06, 'epoch': 0.63} + 63%|██████▎ | 3635/5772 [2:31:42<3:36:27, 6.08s/it] {'loss': 0.4688, 'learning_rate': 6.369736117477662e-06, 'epoch': 0.63} + 63%|██████▎ | 3635/5772 [2:31:40<3:36:27, 6.08s/it] 63%|██████▎ | 3636/5772 [2:31:48<3:37:48, 6.12s/it] 63%|██████▎ | 3636/5772 [2:31:46<3:37:48, 6.12s/it] {'loss': 0.4649, 'learning_rate': 6.364507554696322e-06, 'epoch': 0.63} + 63%|██████▎ | 3636/5772 [2:31:48<3:37:48, 6.12s/it] {'loss': 0.4649, 'learning_rate': 6.364507554696322e-06, 'epoch': 0.63} + 63%|██████▎ | 3636/5772 [2:31:46<3:37:48, 6.12s/it] 63%|██████▎ | 3637/5772 [2:31:54<3:40:05, 6.19s/it] 63%|██████▎ | 3637/5772 [2:31:52<3:40:05, 6.19s/it] {'loss': 0.4653, 'learning_rate': 6.3592801368931134e-06, 'epoch': 0.63} + 63%|██████▎ | 3637/5772 [2:31:54<3:40:05, 6.19s/it] {'loss': 0.4653, 'learning_rate': 6.3592801368931134e-06, 'epoch': 0.63} + 63%|██████▎ | 3637/5772 [2:31:52<3:40:05, 6.19s/it] 63%|██████▎ | 3638/5772 [2:32:00<3:36:56, 6.10s/it] 63%|██████▎ | 3638/5772 [2:31:58<3:36:56, 6.10s/it] {'loss': 0.4684, 'learning_rate': 6.354053865714387e-06, 'epoch': 0.63} + 63%|██████▎ | 3638/5772 [2:32:00<3:36:56, 6.10s/it] {'loss': 0.4684, 'learning_rate': 6.354053865714387e-06, 'epoch': 0.63} + 63%|██████▎ | 3638/5772 [2:31:58<3:36:56, 6.10s/it] 63%|██████▎ | 3639/5772 [2:32:04<3:39:38, 6.18s/it] 63%|██████▎ | 3639/5772 [2:32:06<3:39:38, 6.18s/it] {'loss': 0.4755, 'learning_rate': 6.348828742806122e-06, 'epoch': 0.63} + 63%|██████▎ | 3639/5772 [2:32:06<3:39:38, 6.18s/it] {'loss': 0.4755, 'learning_rate': 6.348828742806122e-06, 'epoch': 0.63} + 63%|██████▎ | 3639/5772 [2:32:04<3:39:38, 6.18s/it] 63%|██████▎ | 3640/5772 [2:32:13<3:41:47, 6.24s/it] 63%|██████▎ | 3640/5772 [2:32:11<3:41:48, 6.24s/it] {'loss': 0.468, 'learning_rate': 6.343604769813945e-06, 'epoch': 0.63} + 63%|██████▎ | 3640/5772 [2:32:13<3:41:47, 6.24s/it] {'loss': 0.468, 'learning_rate': 6.343604769813945e-06, 'epoch': 0.63} + 63%|██████▎ | 3640/5772 [2:32:11<3:41:48, 6.24s/it] 63%|██████▎ | 3641/5772 [2:32:19<3:39:14, 6.17s/it] 63%|██████▎ | 3641/5772 [2:32:17<3:39:14, 6.17s/it] {'loss': 0.4654, 'learning_rate': 6.338381948383111e-06, 'epoch': 0.63} + 63%|██████▎ | 3641/5772 [2:32:19<3:39:14, 6.17s/it] {'loss': 0.4654, 'learning_rate': 6.338381948383111e-06, 'epoch': 0.63} + 63%|██████▎ | 3641/5772 [2:32:17<3:39:14, 6.17s/it] 63%|██████▎ | 3642/5772 [2:32:23<3:37:03, 6.11s/it] 63%|██████▎ | 3642/5772 [2:32:25<3:37:04, 6.11s/it] {'loss': 0.4646, 'learning_rate': 6.33316028015853e-06, 'epoch': 0.63} + 63%|██████▎ | 3642/5772 [2:32:25<3:37:04, 6.11s/it] {'loss': 0.4646, 'learning_rate': 6.33316028015853e-06, 'epoch': 0.63} + 63%|██████▎ | 3642/5772 [2:32:23<3:37:03, 6.11s/it] 63%|██████▎ | 3643/5772 [2:32:29<3:37:24, 6.13s/it] 63%|██████▎ | 3643/5772 [2:32:31<3:37:24, 6.13s/it] {'loss': 0.4656, 'learning_rate': 6.3279397667847265e-06, 'epoch': 0.63} + 63%|██████▎ | 3643/5772 [2:32:31<3:37:24, 6.13s/it] {'loss': 0.4656, 'learning_rate': 6.3279397667847265e-06, 'epoch': 0.63} + 63%|██████▎ | 3643/5772 [2:32:29<3:37:24, 6.13s/it] 63%|██████▎ | 3644/5772 [2:32:35<3:35:18, 6.07s/it] 63%|██████▎ | 3644/5772 [2:32:37<3:35:18, 6.07s/it] {'loss': 0.4594, 'learning_rate': 6.322720409905878e-06, 'epoch': 0.63} + 63%|██████▎ | 3644/5772 [2:32:37<3:35:18, 6.07s/it] {'loss': 0.4594, 'learning_rate': 6.322720409905878e-06, 'epoch': 0.63} + 63%|██████▎ | 3644/5772 [2:32:35<3:35:18, 6.07s/it] 63%|██████▎ | 3645/5772 [2:32:41<3:36:04, 6.10s/it] 63%|██████▎ | 3645/5772 [2:32:43<3:36:04, 6.10s/it] {'loss': 0.4647, 'learning_rate': 6.317502211165794e-06, 'epoch': 0.63} + 63%|██████▎ | 3645/5772 [2:32:43<3:36:04, 6.10s/it] {'loss': 0.4647, 'learning_rate': 6.317502211165794e-06, 'epoch': 0.63} + 63%|██████▎ | 3645/5772 [2:32:41<3:36:04, 6.10s/it] 63%|██████▎ | 3646/5772 [2:32:47<3:33:56, 6.04s/it] 63%|██████▎ | 3646/5772 [2:32:49<3:33:57, 6.04s/it] {'loss': 0.4608, 'learning_rate': 6.312285172207909e-06, 'epoch': 0.63} + 63%|██████▎ | 3646/5772 [2:32:49<3:33:57, 6.04s/it] {'loss': 0.4608, 'learning_rate': 6.312285172207909e-06, 'epoch': 0.63} + 63%|██████▎ | 3646/5772 [2:32:47<3:33:56, 6.04s/it] 63%|██████▎ | 3647/5772 [2:32:53<3:34:48, 6.07s/it] 63%|██████▎ | 3647/5772 [2:32:55<3:34:49, 6.07s/it] {'loss': 0.4775, 'learning_rate': 6.30706929467531e-06, 'epoch': 0.63} + 63%|██████▎ | 3647/5772 [2:32:55<3:34:49, 6.07s/it] {'loss': 0.4775, 'learning_rate': 6.30706929467531e-06, 'epoch': 0.63} + 63%|██████▎ | 3647/5772 [2:32:53<3:34:48, 6.07s/it] 63%|██████▎ | 3648/5772 [2:33:01<3:32:38, 6.01s/it] 63%|██████▎ | 3648/5772 [2:32:59<3:32:39, 6.01s/it] {'loss': 0.4598, 'learning_rate': 6.3018545802107e-06, 'epoch': 0.63} + 63%|██████▎ | 3648/5772 [2:33:01<3:32:38, 6.01s/it] {'loss': 0.4598, 'learning_rate': 6.3018545802107e-06, 'epoch': 0.63} + 63%|██████▎ | 3648/5772 [2:32:59<3:32:39, 6.01s/it] 63%|██████▎ | 3649/5772 [2:33:05<3:30:32, 5.95s/it] 63%|██████▎ | 3649/5772 [2:33:07<3:30:32, 5.95s/it] {'loss': 0.4789, 'learning_rate': 6.2966410304564304e-06, 'epoch': 0.63} + 63%|██████▎ | 3649/5772 [2:33:07<3:30:32, 5.95s/it] {'loss': 0.4789, 'learning_rate': 6.2966410304564304e-06, 'epoch': 0.63} + 63%|██████▎ | 3649/5772 [2:33:05<3:30:32, 5.95s/it]10 AutoResumeHook: Checking whether to suspend... +1415 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +26 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +01 AutoResumeHook: Checking whether to suspend... + 5 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 63%|██████▎ | 3650/5772 [2:33:13<3:33:00, 6.02s/it]1311 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 63%|██████▎ | 3650/5772 [2:33:11<3:33:01, 6.02s/it]7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4667, 'learning_rate': 6.291428647054474e-06, 'epoch': 0.63} + 63%|██████▎ | 3650/5772 [2:33:13<3:33:00, 6.02s/it] {'loss': 0.4667, 'learning_rate': 6.291428647054474e-06, 'epoch': 0.63} + 63%|██████▎ | 3650/5772 [2:33:11<3:33:01, 6.02s/it] 63%|██████▎ | 3651/5772 [2:33:17<3:30:27, 5.95s/it] 63%|██████▎ | 3651/5772 [2:33:19<3:30:27, 5.95s/it] {'loss': 0.4702, 'learning_rate': 6.286217431646447e-06, 'epoch': 0.63} + 63%|██████▎ | 3651/5772 [2:33:19<3:30:27, 5.95s/it] {'loss': 0.4702, 'learning_rate': 6.286217431646447e-06, 'epoch': 0.63} + 63%|██████▎ | 3651/5772 [2:33:17<3:30:27, 5.95s/it] 63%|██████▎ | 3652/5772 [2:33:23<3:31:14, 5.98s/it] 63%|██████▎ | 3652/5772 [2:33:25<3:31:14, 5.98s/it] {'loss': 0.454, 'learning_rate': 6.281007385873594e-06, 'epoch': 0.63} + 63%|██████▎ | 3652/5772 [2:33:25<3:31:14, 5.98s/it] {'loss': 0.454, 'learning_rate': 6.281007385873594e-06, 'epoch': 0.63} + 63%|██████▎ | 3652/5772 [2:33:23<3:31:14, 5.98s/it] 63%|██████▎ | 3653/5772 [2:33:29<3:35:22, 6.10s/it] 63%|██████▎ | 3653/5772 [2:33:31<3:35:22, 6.10s/it] {'loss': 0.4691, 'learning_rate': 6.275798511376785e-06, 'epoch': 0.63} + 63%|██████▎ | 3653/5772 [2:33:31<3:35:22, 6.10s/it] {'loss': 0.4691, 'learning_rate': 6.275798511376785e-06, 'epoch': 0.63} + 63%|██████▎ | 3653/5772 [2:33:29<3:35:22, 6.10s/it] 63%|██████▎ | 3654/5772 [2:33:36<3:41:09, 6.26s/it] 63%|██████▎ | 3654/5772 [2:33:38<3:41:08, 6.26s/it] {'loss': 0.4717, 'learning_rate': 6.270590809796531e-06, 'epoch': 0.63} + 63%|██████▎ | 3654/5772 [2:33:38<3:41:08, 6.26s/it] {'loss': 0.4717, 'learning_rate': 6.270590809796531e-06, 'epoch': 0.63} + 63%|██████▎ | 3654/5772 [2:33:36<3:41:09, 6.26s/it] 63%|██████▎ | 3655/5772 [2:33:44<3:38:25, 6.19s/it] 63%|██████▎ | 3655/5772 [2:33:42<3:38:26, 6.19s/it] {'loss': 0.4571, 'learning_rate': 6.265384282772961e-06, 'epoch': 0.63} + 63%|██████▎ | 3655/5772 [2:33:44<3:38:25, 6.19s/it] {'loss': 0.4571, 'learning_rate': 6.265384282772961e-06, 'epoch': 0.63} + 63%|██████▎ | 3655/5772 [2:33:42<3:38:26, 6.19s/it] 63%|██████▎ | 3656/5772 [2:33:50<3:37:26, 6.17s/it] 63%|██████▎ | 3656/5772 [2:33:48<3:37:27, 6.17s/it] {'loss': 0.4604, 'learning_rate': 6.260178931945852e-06, 'epoch': 0.63} + 63%|██████▎ | 3656/5772 [2:33:50<3:37:26, 6.17s/it] {'loss': 0.4604, 'learning_rate': 6.260178931945852e-06, 'epoch': 0.63} + 63%|██████▎ | 3656/5772 [2:33:48<3:37:27, 6.17s/it] 63%|██████▎ | 3657/5772 [2:33:54<3:36:44, 6.15s/it] 63%|██████▎ | 3657/5772 [2:33:56<3:36:44, 6.15s/it] {'loss': 0.4663, 'learning_rate': 6.25497475895459e-06, 'epoch': 0.63} + 63%|██████▎ | 3657/5772 [2:33:56<3:36:44, 6.15s/it] {'loss': 0.4663, 'learning_rate': 6.25497475895459e-06, 'epoch': 0.63} + 63%|██████▎ | 3657/5772 [2:33:54<3:36:44, 6.15s/it] 63%|██████▎ | 3658/5772 [2:34:00<3:37:54, 6.18s/it] 63%|██████▎ | 3658/5772 [2:34:02<3:37:55, 6.18s/it] {'loss': 0.4655, 'learning_rate': 6.249771765438205e-06, 'epoch': 0.63} + 63%|██████▎ | 3658/5772 [2:34:02<3:37:55, 6.18s/it] {'loss': 0.4655, 'learning_rate': 6.249771765438205e-06, 'epoch': 0.63} + 63%|██████▎ | 3658/5772 [2:34:00<3:37:54, 6.18s/it] 63%|██████▎ | 3659/5772 [2:34:06<3:34:48, 6.10s/it] 63%|██████▎ | 3659/5772 [2:34:08<3:34:48, 6.10s/it] {'loss': 0.4662, 'learning_rate': 6.244569953035355e-06, 'epoch': 0.63} + 63%|██████▎ | 3659/5772 [2:34:08<3:34:48, 6.10s/it] {'loss': 0.4662, 'learning_rate': 6.244569953035355e-06, 'epoch': 0.63} + 63%|██████▎ | 3659/5772 [2:34:06<3:34:48, 6.10s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 63%|██████▎ | 3660/5772 [2:34:15<3:37:55, 6.19s/it] 63%|██████▎ | 3660/5772 [2:34:13<3:37:55, 6.19s/it] {'loss': 0.4591, 'learning_rate': 6.2393693233843155e-06, 'epoch': 0.63} + 63%|██████▎ | 3660/5772 [2:34:15<3:37:55, 6.19s/it] {'loss': 0.4591, 'learning_rate': 6.2393693233843155e-06, 'epoch': 0.63} + 63%|██████▎ | 3660/5772 [2:34:13<3:37:55, 6.19s/it] 63%|██████▎ | 3661/5772 [2:34:20<3:35:11, 6.12s/it] 63%|██████▎ | 3661/5772 [2:34:19<3:35:11, 6.12s/it] {'loss': 0.475, 'learning_rate': 6.234169878123001e-06, 'epoch': 0.63} + 63%|██████▎ | 3661/5772 [2:34:20<3:35:11, 6.12s/it] {'loss': 0.475, 'learning_rate': 6.234169878123001e-06, 'epoch': 0.63} + 63%|██████▎ | 3661/5772 [2:34:19<3:35:11, 6.12s/it] 63%|██████▎ | 3662/5772 [2:34:27<3:34:50, 6.11s/it] 63%|██████▎ | 3662/5772 [2:34:25<3:34:50, 6.11s/it] {'loss': 0.466, 'learning_rate': 6.228971618888943e-06, 'epoch': 0.63} + 63%|██████▎ | 3662/5772 [2:34:27<3:34:50, 6.11s/it] {'loss': 0.466, 'learning_rate': 6.228971618888943e-06, 'epoch': 0.63} + 63%|██████▎ | 3662/5772 [2:34:25<3:34:50, 6.11s/it] 63%|██████▎ | 3663/5772 [2:34:31<3:38:37, 6.22s/it] 63%|██████▎ | 3663/5772 [2:34:33<3:38:38, 6.22s/it] {'loss': 0.4608, 'learning_rate': 6.223774547319308e-06, 'epoch': 0.63} + 63%|██████▎ | 3663/5772 [2:34:33<3:38:38, 6.22s/it] {'loss': 0.4608, 'learning_rate': 6.223774547319308e-06, 'epoch': 0.63} + 63%|██████▎ | 3663/5772 [2:34:31<3:38:37, 6.22s/it] 63%|██████▎ | 3664/5772 [2:34:39<3:36:06, 6.15s/it] 63%|██████▎ | 3664/5772 [2:34:37<3:36:06, 6.15s/it] {'loss': 0.4663, 'learning_rate': 6.218578665050883e-06, 'epoch': 0.63} + 63%|██████▎ | 3664/5772 [2:34:39<3:36:06, 6.15s/it] {'loss': 0.4663, 'learning_rate': 6.218578665050883e-06, 'epoch': 0.63} + 63%|██████▎ | 3664/5772 [2:34:37<3:36:06, 6.15s/it] 63%|██████▎ | 3665/5772 [2:34:45<3:38:37, 6.23s/it] 63%|██████▎ | 3665/5772 [2:34:43<3:38:37, 6.23s/it] {'loss': 0.4641, 'learning_rate': 6.2133839737200795e-06, 'epoch': 0.63} + 63%|██████▎ | 3665/5772 [2:34:45<3:38:37, 6.23s/it] {'loss': 0.4641, 'learning_rate': 6.2133839737200795e-06, 'epoch': 0.63} + 63%|██████▎ | 3665/5772 [2:34:43<3:38:37, 6.23s/it] 64%|██████▎ | 3666/5772 [2:34:52<3:40:31, 6.28s/it] 64%|██████▎ | 3666/5772 [2:34:50<3:40:31, 6.28s/it] {'loss': 0.4789, 'learning_rate': 6.208190474962945e-06, 'epoch': 0.64} + 64%|██████▎ | 3666/5772 [2:34:52<3:40:31, 6.28s/it] {'loss': 0.4789, 'learning_rate': 6.208190474962945e-06, 'epoch': 0.64} + 64%|██████▎ | 3666/5772 [2:34:50<3:40:31, 6.28s/it] 64%|██████▎ | 3667/5772 [2:34:58<3:37:08, 6.19s/it] 64%|██████▎ | 3667/5772 [2:34:56<3:37:08, 6.19s/it] {'loss': 0.4632, 'learning_rate': 6.202998170415133e-06, 'epoch': 0.64} + 64%|██████▎ | 3667/5772 [2:34:58<3:37:08, 6.19s/it] {'loss': 0.4632, 'learning_rate': 6.202998170415133e-06, 'epoch': 0.64} + 64%|██████▎ | 3667/5772 [2:34:56<3:37:08, 6.19s/it] 64%|██████▎ | 3668/5772 [2:35:04<3:40:04, 6.28s/it] 64%|██████▎ | 3668/5772 [2:35:02<3:40:04, 6.28s/it] {'loss': 0.4625, 'learning_rate': 6.19780706171194e-06, 'epoch': 0.64} + 64%|██████▎ | 3668/5772 [2:35:04<3:40:04, 6.28s/it] {'loss': 0.4625, 'learning_rate': 6.19780706171194e-06, 'epoch': 0.64} + 64%|██████▎ | 3668/5772 [2:35:02<3:40:04, 6.28s/it] 64%|██████▎ | 3669/5772 [2:35:10<3:34:44, 6.13s/it] 64%|██████▎ | 3669/5772 [2:35:08<3:34:44, 6.13s/it] {'loss': 0.4738, 'learning_rate': 6.19261715048827e-06, 'epoch': 0.64} + 64%|██████▎ | 3669/5772 [2:35:10<3:34:44, 6.13s/it] {'loss': 0.4738, 'learning_rate': 6.19261715048827e-06, 'epoch': 0.64} + 64%|██████▎ | 3669/5772 [2:35:08<3:34:44, 6.13s/it] 64%|██████▎ | 3670/5772 [2:35:14<3:32:40, 6.07s/it] 64%|██████▎ | 3670/5772 [2:35:16<3:32:40, 6.07s/it] {'loss': 0.4628, 'learning_rate': 6.187428438378662e-06, 'epoch': 0.64} + 64%|██████▎ | 3670/5772 [2:35:16<3:32:40, 6.07s/it] {'loss': 0.4628, 'learning_rate': 6.187428438378662e-06, 'epoch': 0.64} + 64%|██████▎ | 3670/5772 [2:35:14<3:32:40, 6.07s/it] 64%|██████▎ | 3671/5772 [2:35:22<3:34:14, 6.12s/it] 64%|██████▎ | 3671/5772 [2:35:20<3:34:14, 6.12s/it] {'loss': 0.4553, 'learning_rate': 6.1822409270172665e-06, 'epoch': 0.64} + 64%|██████▎ | 3671/5772 [2:35:22<3:34:14, 6.12s/it] {'loss': 0.4553, 'learning_rate': 6.1822409270172665e-06, 'epoch': 0.64} + 64%|██████▎ | 3671/5772 [2:35:20<3:34:14, 6.12s/it] 64%|██████▎ | 3672/5772 [2:35:29<3:37:27, 6.21s/it] 64%|██████▎ | 3672/5772 [2:35:27<3:37:28, 6.21s/it] {'loss': 0.4724, 'learning_rate': 6.177054618037866e-06, 'epoch': 0.64} + 64%|██████▎ | 3672/5772 [2:35:29<3:37:27, 6.21s/it] {'loss': 0.4724, 'learning_rate': 6.177054618037866e-06, 'epoch': 0.64} + 64%|██████▎ | 3672/5772 [2:35:27<3:37:28, 6.21s/it] 64%|██████▎ | 3673/5772 [2:35:35<3:42:16, 6.35s/it] 64%|██████▎ | 3673/5772 [2:35:33<3:42:16, 6.35s/it] {'loss': 0.4736, 'learning_rate': 6.171869513073858e-06, 'epoch': 0.64} + 64%|██████▎ | 3673/5772 [2:35:35<3:42:16, 6.35s/it] {'loss': 0.4736, 'learning_rate': 6.171869513073858e-06, 'epoch': 0.64} + 64%|██████▎ | 3673/5772 [2:35:33<3:42:16, 6.35s/it] 64%|██████▎ | 3674/5772 [2:35:39<3:36:34, 6.19s/it] 64%|██████▎ | 3674/5772 [2:35:41<3:36:34, 6.19s/it] {'loss': 0.4536, 'learning_rate': 6.166685613758259e-06, 'epoch': 0.64} + 64%|██████▎ | 3674/5772 [2:35:41<3:36:34, 6.19s/it] {'loss': 0.4536, 'learning_rate': 6.166685613758259e-06, 'epoch': 0.64} + 64%|██████▎ | 3674/5772 [2:35:39<3:36:34, 6.19s/it] 64%|██████▎ | 3675/5772 [2:35:47<3:33:54, 6.12s/it] 64%|██████▎ | 3675/5772 [2:35:45<3:33:54, 6.12s/it] {'loss': 0.4635, 'learning_rate': 6.161502921723719e-06, 'epoch': 0.64} + 64%|██████▎ | 3675/5772 [2:35:47<3:33:54, 6.12s/it] {'loss': 0.4635, 'learning_rate': 6.161502921723719e-06, 'epoch': 0.64} + 64%|██████▎ | 3675/5772 [2:35:45<3:33:54, 6.12s/it] 64%|██████▎ | 3676/5772 [2:35:51<3:32:41, 6.09s/it] 64%|██████▎ | 3676/5772 [2:35:53<3:32:41, 6.09s/it] {'loss': 0.4506, 'learning_rate': 6.156321438602484e-06, 'epoch': 0.64} + 64%|██████▎ | 3676/5772 [2:35:53<3:32:41, 6.09s/it] {'loss': 0.4506, 'learning_rate': 6.156321438602484e-06, 'epoch': 0.64} + 64%|██████▎ | 3676/5772 [2:35:51<3:32:41, 6.09s/it] 64%|██████▎ | 3677/5772 [2:35:57<3:32:55, 6.10s/it] 64%|██████▎ | 3677/5772 [2:35:59<3:32:55, 6.10s/it] {'loss': 0.4714, 'learning_rate': 6.1511411660264485e-06, 'epoch': 0.64} + 64%|██████▎ | 3677/5772 [2:35:59<3:32:55, 6.10s/it] {'loss': 0.4714, 'learning_rate': 6.1511411660264485e-06, 'epoch': 0.64} + 64%|██████▎ | 3677/5772 [2:35:57<3:32:55, 6.10s/it] 64%|██████▎ | 3678/5772 [2:36:03<3:33:12, 6.11s/it] 64%|██████▎ | 3678/5772 [2:36:05<3:33:12, 6.11s/it] {'loss': 0.4637, 'learning_rate': 6.145962105627097e-06, 'epoch': 0.64} + 64%|██████▎ | 3678/5772 [2:36:05<3:33:12, 6.11s/it] {'loss': 0.4637, 'learning_rate': 6.145962105627097e-06, 'epoch': 0.64} + 64%|██████▎ | 3678/5772 [2:36:03<3:33:12, 6.11s/it] 64%|██████▎ | 3679/5772 [2:36:10<3:33:31, 6.12s/it] 64%|██████▎ | 3679/5772 [2:36:12<3:33:32, 6.12s/it] {'loss': 0.4725, 'learning_rate': 6.140784259035553e-06, 'epoch': 0.64} + 64%|██████▎ | 3679/5772 [2:36:12<3:33:32, 6.12s/it] {'loss': 0.4725, 'learning_rate': 6.140784259035553e-06, 'epoch': 0.64} + 64%|██████▎ | 3679/5772 [2:36:10<3:33:31, 6.12s/it] 64%|██████▍ | 3680/5772 [2:36:18<3:37:00, 6.22s/it] 64%|██████▍ | 3680/5772 [2:36:16<3:37:01, 6.22s/it] {'loss': 0.4605, 'learning_rate': 6.1356076278825516e-06, 'epoch': 0.64} + 64%|██████▍ | 3680/5772 [2:36:18<3:37:00, 6.22s/it] {'loss': 0.4605, 'learning_rate': 6.1356076278825516e-06, 'epoch': 0.64} + 64%|██████▍ | 3680/5772 [2:36:16<3:37:01, 6.22s/it] 64%|██████▍ | 3681/5772 [2:36:23<3:41:08, 6.35s/it] 64%|██████▍ | 3681/5772 [2:36:25<3:41:08, 6.35s/it] {'loss': 0.4652, 'learning_rate': 6.130432213798441e-06, 'epoch': 0.64} + 64%|██████▍ | 3681/5772 [2:36:25<3:41:08, 6.35s/it] {'loss': 0.4652, 'learning_rate': 6.130432213798441e-06, 'epoch': 0.64} + 64%|██████▍ | 3681/5772 [2:36:23<3:41:08, 6.35s/it] 64%|██████▍ | 3682/5772 [2:36:29<3:39:55, 6.31s/it] 64%|██████▍ | 3682/5772 [2:36:31<3:39:55, 6.31s/it] {'loss': 0.4574, 'learning_rate': 6.125258018413191e-06, 'epoch': 0.64} + 64%|██████▍ | 3682/5772 [2:36:31<3:39:55, 6.31s/it] {'loss': 0.4574, 'learning_rate': 6.125258018413191e-06, 'epoch': 0.64} + 64%|██████▍ | 3682/5772 [2:36:29<3:39:55, 6.31s/it] 64%|██████▍ | 3683/5772 [2:36:35<3:38:46, 6.28s/it] 64%|██████▍ | 3683/5772 [2:36:37<3:38:46, 6.28s/it] {'loss': 0.4687, 'learning_rate': 6.120085043356378e-06, 'epoch': 0.64} + 64%|██████▍ | 3683/5772 [2:36:37<3:38:46, 6.28s/it] {'loss': 0.4687, 'learning_rate': 6.120085043356378e-06, 'epoch': 0.64} + 64%|██████▍ | 3683/5772 [2:36:35<3:38:46, 6.28s/it] 64%|██████▍ | 3684/5772 [2:36:41<3:34:07, 6.15s/it] 64%|██████▍ | 3684/5772 [2:36:43<3:34:07, 6.15s/it] {'loss': 0.4612, 'learning_rate': 6.114913290257219e-06, 'epoch': 0.64} + 64%|██████▍ | 3684/5772 [2:36:43<3:34:07, 6.15s/it] {'loss': 0.4612, 'learning_rate': 6.114913290257219e-06, 'epoch': 0.64} + 64%|██████▍ | 3684/5772 [2:36:41<3:34:07, 6.15s/it] 64%|██████▍ | 3685/5772 [2:36:47<3:30:43, 6.06s/it] 64%|██████▍ | 3685/5772 [2:36:49<3:30:43, 6.06s/it] {'loss': 0.4657, 'learning_rate': 6.109742760744508e-06, 'epoch': 0.64} + 64%|██████▍ | 3685/5772 [2:36:47<3:30:43, 6.06s/it]{'loss': 0.4657, 'learning_rate': 6.109742760744508e-06, 'epoch': 0.64} + 64%|██████▍ | 3685/5772 [2:36:49<3:30:43, 6.06s/it] 64%|██████▍ | 3686/5772 [2:36:53<3:33:20, 6.14s/it] 64%|██████▍ | 3686/5772 [2:36:55<3:33:20, 6.14s/it] {'loss': 0.4554, 'learning_rate': 6.104573456446687e-06, 'epoch': 0.64} + 64%|██████▍ | 3686/5772 [2:36:55<3:33:20, 6.14s/it] {'loss': 0.4554, 'learning_rate': 6.104573456446687e-06, 'epoch': 0.64} + 64%|██████▍ | 3686/5772 [2:36:53<3:33:20, 6.14s/it] 64%|██████▍ | 3687/5772 [2:36:59<3:34:45, 6.18s/it] 64%|██████▍ | 3687/5772 [2:37:01<3:34:45, 6.18s/it] {'loss': 0.473, 'learning_rate': 6.0994053789918004e-06, 'epoch': 0.64} + 64%|██████▍ | 3687/5772 [2:37:01<3:34:45, 6.18s/it] {'loss': 0.473, 'learning_rate': 6.0994053789918004e-06, 'epoch': 0.64} + 64%|██████▍ | 3687/5772 [2:36:59<3:34:45, 6.18s/it] 64%|██████▍ | 3688/5772 [2:37:07<3:32:13, 6.11s/it] 64%|██████▍ | 3688/5772 [2:37:05<3:32:13, 6.11s/it] {'loss': 0.4553, 'learning_rate': 6.094238530007501e-06, 'epoch': 0.64} + 64%|██████▍ | 3688/5772 [2:37:07<3:32:13, 6.11s/it] {'loss': 0.4553, 'learning_rate': 6.094238530007501e-06, 'epoch': 0.64} + 64%|██████▍ | 3688/5772 [2:37:05<3:32:13, 6.11s/it] 64%|██████▍ | 3689/5772 [2:37:13<3:29:04, 6.02s/it] 64%|██████▍ | 3689/5772 [2:37:11<3:29:04, 6.02s/it] {'loss': 0.4625, 'learning_rate': 6.089072911121061e-06, 'epoch': 0.64} + 64%|██████▍ | 3689/5772 [2:37:13<3:29:04, 6.02s/it] {'loss': 0.4625, 'learning_rate': 6.089072911121061e-06, 'epoch': 0.64} + 64%|██████▍ | 3689/5772 [2:37:11<3:29:04, 6.02s/it] 64%|██████▍ | 3690/5772 [2:37:17<3:30:06, 6.06s/it] 64%|██████▍ | 3690/5772 [2:37:19<3:30:06, 6.06s/it] {'loss': 0.4584, 'learning_rate': 6.083908523959362e-06, 'epoch': 0.64} + 64%|██████▍ | 3690/5772 [2:37:19<3:30:06, 6.06s/it] {'loss': 0.4584, 'learning_rate': 6.083908523959362e-06, 'epoch': 0.64} + 64%|██████▍ | 3690/5772 [2:37:17<3:30:06, 6.06s/it] 64%|██████▍ | 3691/5772 [2:37:24<3:31:17, 6.09s/it] 64%|██████▍ | 3691/5772 [2:37:25<3:31:17, 6.09s/it] {'loss': 0.4723, 'learning_rate': 6.078745370148902e-06, 'epoch': 0.64} + 64%|██████▍ | 3691/5772 [2:37:25<3:31:17, 6.09s/it] {'loss': 0.4723, 'learning_rate': 6.078745370148902e-06, 'epoch': 0.64} + 64%|██████▍ | 3691/5772 [2:37:24<3:31:17, 6.09s/it] 64%|██████▍ | 3692/5772 [2:37:30<3:33:09, 6.15s/it] 64%|██████▍ | 3692/5772 [2:37:32<3:33:09, 6.15s/it] {'loss': 0.4441, 'learning_rate': 6.073583451315782e-06, 'epoch': 0.64} + 64%|██████▍ | 3692/5772 [2:37:32<3:33:09, 6.15s/it] {'loss': 0.4441, 'learning_rate': 6.073583451315782e-06, 'epoch': 0.64} + 64%|██████▍ | 3692/5772 [2:37:30<3:33:09, 6.15s/it] 64%|██████▍ | 3693/5772 [2:37:38<3:30:41, 6.08s/it] 64%|██████▍ | 3693/5772 [2:37:36<3:30:41, 6.08s/it] {'loss': 0.4579, 'learning_rate': 6.068422769085722e-06, 'epoch': 0.64} + 64%|██████▍ | 3693/5772 [2:37:38<3:30:41, 6.08s/it] {'loss': 0.4579, 'learning_rate': 6.068422769085722e-06, 'epoch': 0.64} + 64%|██████▍ | 3693/5772 [2:37:36<3:30:41, 6.08s/it] 64%|██████▍ | 3694/5772 [2:37:42<3:30:17, 6.07s/it] 64%|██████▍ | 3694/5772 [2:37:44<3:30:17, 6.07s/it] {'loss': 0.4471, 'learning_rate': 6.063263325084054e-06, 'epoch': 0.64} + 64%|██████▍ | 3694/5772 [2:37:44<3:30:17, 6.07s/it] {'loss': 0.4471, 'learning_rate': 6.063263325084054e-06, 'epoch': 0.64} + 64%|██████▍ | 3694/5772 [2:37:42<3:30:17, 6.07s/it] 64%|██████▍ | 3695/5772 [2:37:50<3:31:41, 6.12s/it] 64%|██████▍ | 3695/5772 [2:37:48<3:31:41, 6.12s/it] {'loss': 0.4524, 'learning_rate': 6.0581051209357135e-06, 'epoch': 0.64} + 64%|██████▍ | 3695/5772 [2:37:50<3:31:41, 6.12s/it] {'loss': 0.4524, 'learning_rate': 6.0581051209357135e-06, 'epoch': 0.64} + 64%|██████▍ | 3695/5772 [2:37:48<3:31:41, 6.12s/it] 64%|██████▍ | 3696/5772 [2:37:56<3:31:08, 6.10s/it] 64%|██████▍ | 3696/5772 [2:37:54<3:31:08, 6.10s/it] {'loss': 0.4682, 'learning_rate': 6.052948158265248e-06, 'epoch': 0.64} + 64%|██████▍ | 3696/5772 [2:37:56<3:31:08, 6.10s/it] {'loss': 0.4682, 'learning_rate': 6.052948158265248e-06, 'epoch': 0.64} + 64%|██████▍ | 3696/5772 [2:37:54<3:31:08, 6.10s/it] 64%|██████▍ | 3697/5772 [2:38:00<3:29:17, 6.05s/it] 64%|██████▍ | 3697/5772 [2:38:02<3:29:18, 6.05s/it] {'loss': 0.4705, 'learning_rate': 6.047792438696813e-06, 'epoch': 0.64} + 64%|██████▍ | 3697/5772 [2:38:02<3:29:18, 6.05s/it] {'loss': 0.4705, 'learning_rate': 6.047792438696813e-06, 'epoch': 0.64} + 64%|██████▍ | 3697/5772 [2:38:00<3:29:17, 6.05s/it] 64%|██████▍ | 3698/5772 [2:38:08<3:32:38, 6.15s/it] 64%|██████▍ | 3698/5772 [2:38:06<3:32:38, 6.15s/it] {'loss': 0.4437, 'learning_rate': 6.042637963854179e-06, 'epoch': 0.64} + 64%|██████▍ | 3698/5772 [2:38:08<3:32:38, 6.15s/it] {'loss': 0.4437, 'learning_rate': 6.042637963854179e-06, 'epoch': 0.64} + 64%|██████▍ | 3698/5772 [2:38:06<3:32:38, 6.15s/it] 64%|██████▍ | 3699/5772 [2:38:15<3:34:31, 6.21s/it] 64%|██████▍ | 3699/5772 [2:38:13<3:34:31, 6.21s/it] {'loss': 0.4598, 'learning_rate': 6.037484735360711e-06, 'epoch': 0.64} + 64%|██████▍ | 3699/5772 [2:38:15<3:34:31, 6.21s/it] {'loss': 0.4598, 'learning_rate': 6.037484735360711e-06, 'epoch': 0.64} + 64%|██████▍ | 3699/5772 [2:38:13<3:34:31, 6.21s/it]10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... 64%|██████▍ | 3700/5772 [2:38:21<3:31:21, 6.12s/it]9 AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... + 64%|██████▍ | 3700/5772 [2:38:19<3:31:21, 6.12s/it] {'loss': 0.4514, 'learning_rate': 6.0323327548393926e-06, 'epoch': 0.64} + 64%|██████▍ | 3700/5772 [2:38:21<3:31:21, 6.12s/it] {'loss': 0.4514, 'learning_rate': 6.0323327548393926e-06, 'epoch': 0.64} + 64%|██████▍ | 3700/5772 [2:38:19<3:31:21, 6.12s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 64%|██████▍ | 3701/5772 [2:38:40<6:14:01, 10.84s/it] 64%|██████▍ | 3701/5772 [2:38:42<6:14:03, 10.84s/it] {'loss': 0.4683, 'learning_rate': 6.027182023912819e-06, 'epoch': 0.64} + 64%|██████▍ | 3701/5772 [2:38:42<6:14:03, 10.84s/it] {'loss': 0.4683, 'learning_rate': 6.027182023912819e-06, 'epoch': 0.64} + 64%|██████▍ | 3701/5772 [2:38:40<6:14:01, 10.84s/it] 64%|██████▍ | 3702/5772 [2:38:46<5:23:50, 9.39s/it] 64%|██████▍ | 3702/5772 [2:38:48<5:23:50, 9.39s/it] {'loss': 0.4591, 'learning_rate': 6.0220325442031714e-06, 'epoch': 0.64} + 64%|██████▍ | 3702/5772 [2:38:46<5:23:50, 9.39s/it]{'loss': 0.4591, 'learning_rate': 6.0220325442031714e-06, 'epoch': 0.64} + 64%|██████▍ | 3702/5772 [2:38:48<5:23:50, 9.39s/it] 64%|██████▍ | 3703/5772 [2:38:52<4:45:45, 8.29s/it] 64%|██████▍ | 3703/5772 [2:38:54<4:45:45, 8.29s/it] {'loss': 0.459, 'learning_rate': 6.016884317332261e-06, 'epoch': 0.64} + 64%|██████▍ | 3703/5772 [2:38:54<4:45:45, 8.29s/it] {'loss': 0.459, 'learning_rate': 6.016884317332261e-06, 'epoch': 0.64} + 64%|██████▍ | 3703/5772 [2:38:52<4:45:45, 8.29s/it] 64%|██████▍ | 3704/5772 [2:38:59<4:25:48, 7.71s/it] 64%|██████▍ | 3704/5772 [2:39:01<4:25:49, 7.71s/it] {'loss': 0.4688, 'learning_rate': 6.011737344921487e-06, 'epoch': 0.64} + 64%|██████▍ | 3704/5772 [2:39:01<4:25:49, 7.71s/it] {'loss': 0.4688, 'learning_rate': 6.011737344921487e-06, 'epoch': 0.64} + 64%|██████▍ | 3704/5772 [2:38:59<4:25:48, 7.71s/it] 64%|██████▍ | 3705/5772 [2:39:05<4:07:28, 7.18s/it] 64%|██████▍ | 3705/5772 [2:39:06<4:07:27, 7.18s/it] {'loss': 0.4723, 'learning_rate': 6.0065916285918625e-06, 'epoch': 0.64} + 64%|██████▍ | 3705/5772 [2:39:06<4:07:27, 7.18s/it] {'loss': 0.4723, 'learning_rate': 6.0065916285918625e-06, 'epoch': 0.64} + 64%|██████▍ | 3705/5772 [2:39:05<4:07:28, 7.18s/it] 64%|██████▍ | 3706/5772 [2:39:10<3:54:18, 6.80s/it] 64%|██████▍ | 3706/5772 [2:39:12<3:54:18, 6.80s/it] {'loss': 0.4551, 'learning_rate': 6.001447169964e-06, 'epoch': 0.64} + 64%|██████▍ | 3706/5772 [2:39:12<3:54:18, 6.80s/it] {'loss': 0.4551, 'learning_rate': 6.001447169964e-06, 'epoch': 0.64} + 64%|██████▍ | 3706/5772 [2:39:10<3:54:18, 6.80s/it] 64%|██████▍ | 3707/5772 [2:39:17<3:51:23, 6.72s/it] 64%|██████▍ | 3707/5772 [2:39:19<3:51:22, 6.72s/it] {'loss': 0.472, 'learning_rate': 5.996303970658119e-06, 'epoch': 0.64} + 64%|██████▍ | 3707/5772 [2:39:19<3:51:22, 6.72s/it] {'loss': 0.472, 'learning_rate': 5.996303970658119e-06, 'epoch': 0.64} + 64%|██████▍ | 3707/5772 [2:39:17<3:51:23, 6.72s/it] 64%|██████▍ | 3708/5772 [2:39:23<3:42:20, 6.46s/it] 64%|██████▍ | 3708/5772 [2:39:25<3:42:20, 6.46s/it] {'loss': 0.475, 'learning_rate': 5.991162032294042e-06, 'epoch': 0.64} + 64%|██████▍ | 3708/5772 [2:39:25<3:42:20, 6.46s/it] {'loss': 0.475, 'learning_rate': 5.991162032294042e-06, 'epoch': 0.64} + 64%|██████▍ | 3708/5772 [2:39:23<3:42:20, 6.46s/it] 64%|██████▍ | 3709/5772 [2:39:29<3:37:41, 6.33s/it] 64%|██████▍ | 3709/5772 [2:39:31<3:37:41, 6.33s/it] {'loss': 0.4661, 'learning_rate': 5.986021356491192e-06, 'epoch': 0.64} + 64%|██████▍ | 3709/5772 [2:39:31<3:37:41, 6.33s/it] {'loss': 0.4661, 'learning_rate': 5.986021356491192e-06, 'epoch': 0.64} + 64%|██████▍ | 3709/5772 [2:39:29<3:37:41, 6.33s/it] 64%|██████▍ | 3710/5772 [2:39:35<3:35:18, 6.26s/it] 64%|██████▍ | 3710/5772 [2:39:37<3:35:18, 6.26s/it] {'loss': 0.4583, 'learning_rate': 5.980881944868604e-06, 'epoch': 0.64} + 64%|██████▍ | 3710/5772 [2:39:37<3:35:18, 6.26s/it] {'loss': 0.4583, 'learning_rate': 5.980881944868604e-06, 'epoch': 0.64} + 64%|██████▍ | 3710/5772 [2:39:35<3:35:18, 6.26s/it] 64%|██████▍ | 3711/5772 [2:39:41<3:31:47, 6.17s/it] 64%|██████▍ | 3711/5772 [2:39:43<3:31:47, 6.17s/it] {'loss': 0.4576, 'learning_rate': 5.975743799044894e-06, 'epoch': 0.64} + 64%|██████▍ | 3711/5772 [2:39:43<3:31:47, 6.17s/it] {'loss': 0.4576, 'learning_rate': 5.975743799044894e-06, 'epoch': 0.64} + 64%|██████▍ | 3711/5772 [2:39:41<3:31:47, 6.17s/it] 64%|██████▍ | 3712/5772 [2:39:47<3:32:35, 6.19s/it] 64%|██████▍ | 3712/5772 [2:39:49<3:32:35, 6.19s/it] {'loss': 0.4629, 'learning_rate': 5.970606920638304e-06, 'epoch': 0.64} + 64%|██████▍ | 3712/5772 [2:39:49<3:32:35, 6.19s/it] {'loss': 0.4629, 'learning_rate': 5.970606920638304e-06, 'epoch': 0.64} + 64%|██████▍ | 3712/5772 [2:39:47<3:32:35, 6.19s/it] 64%|██████▍ | 3713/5772 [2:39:53<3:32:23, 6.19s/it] 64%|██████▍ | 3713/5772 [2:39:55<3:32:23, 6.19s/it] {'loss': 0.4794, 'learning_rate': 5.965471311266658e-06, 'epoch': 0.64} + 64%|██████▍ | 3713/5772 [2:39:55<3:32:23, 6.19s/it] {'loss': 0.4794, 'learning_rate': 5.965471311266658e-06, 'epoch': 0.64} + 64%|██████▍ | 3713/5772 [2:39:53<3:32:23, 6.19s/it] 64%|██████▍ | 3714/5772 [2:40:00<3:36:47, 6.32s/it] 64%|██████▍ | 3714/5772 [2:40:02<3:36:47, 6.32s/it] {'loss': 0.4629, 'learning_rate': 5.960336972547391e-06, 'epoch': 0.64} + 64%|██████▍ | 3714/5772 [2:40:02<3:36:47, 6.32s/it] {'loss': 0.4629, 'learning_rate': 5.960336972547391e-06, 'epoch': 0.64} + 64%|██████▍ | 3714/5772 [2:40:00<3:36:47, 6.32s/it] 64%|██████▍ | 3715/5772 [2:40:06<3:32:59, 6.21s/it] 64%|██████▍ | 3715/5772 [2:40:08<3:32:58, 6.21s/it] {'loss': 0.4614, 'learning_rate': 5.955203906097537e-06, 'epoch': 0.64} + 64%|██████▍ | 3715/5772 [2:40:08<3:32:58, 6.21s/it] {'loss': 0.4614, 'learning_rate': 5.955203906097537e-06, 'epoch': 0.64} + 64%|██████▍ | 3715/5772 [2:40:06<3:32:59, 6.21s/it] 64%|██████▍ | 3716/5772 [2:40:12<3:34:03, 6.25s/it] 64%|██████▍ | 3716/5772 [2:40:14<3:34:03, 6.25s/it] {'loss': 0.4714, 'learning_rate': 5.9500721135337205e-06, 'epoch': 0.64} + 64%|██████▍ | 3716/5772 [2:40:14<3:34:03, 6.25s/it] {'loss': 0.4714, 'learning_rate': 5.9500721135337205e-06, 'epoch': 0.64} + 64%|██████▍ | 3716/5772 [2:40:12<3:34:03, 6.25s/it] 64%|██████▍ | 3717/5772 [2:40:21<3:35:24, 6.29s/it] 64%|██████▍ | 3717/5772 [2:40:19<3:35:24, 6.29s/it] {'loss': 0.4577, 'learning_rate': 5.944941596472176e-06, 'epoch': 0.64} + 64%|██████▍ | 3717/5772 [2:40:21<3:35:24, 6.29s/it] {'loss': 0.4577, 'learning_rate': 5.944941596472176e-06, 'epoch': 0.64} + 64%|██████▍ | 3717/5772 [2:40:19<3:35:24, 6.29s/it] 64%|██████▍ | 3718/5772 [2:40:27<3:32:04, 6.20s/it] 64%|██████▍ | 3718/5772 [2:40:25<3:32:05, 6.20s/it] {'loss': 0.4576, 'learning_rate': 5.939812356528727e-06, 'epoch': 0.64} + 64%|██████▍ | 3718/5772 [2:40:27<3:32:04, 6.20s/it] {'loss': 0.4576, 'learning_rate': 5.939812356528727e-06, 'epoch': 0.64} + 64%|██████▍ | 3718/5772 [2:40:25<3:32:05, 6.20s/it] 64%|██████▍ | 3719/5772 [2:40:31<3:30:39, 6.16s/it] {'loss': 0.4625, 'learning_rate': 5.934684395318806e-06, 'epoch': 0.64} + 64%|██████▍ | 3719/5772 [2:40:31<3:30:39, 6.16s/it] 64%|██████▍ | 3719/5772 [2:40:33<3:30:39, 6.16s/it] {'loss': 0.4625, 'learning_rate': 5.934684395318806e-06, 'epoch': 0.64} + 64%|██████▍ | 3719/5772 [2:40:33<3:30:39, 6.16s/it] 64%|██████▍ | 3720/5772 [2:40:39<3:29:07, 6.11s/it] 64%|██████▍ | 3720/5772 [2:40:37<3:29:07, 6.11s/it] {'loss': 0.4628, 'learning_rate': 5.929557714457425e-06, 'epoch': 0.64} + 64%|██████▍ | 3720/5772 [2:40:39<3:29:07, 6.11s/it] {'loss': 0.4628, 'learning_rate': 5.929557714457425e-06, 'epoch': 0.64} + 64%|██████▍ | 3720/5772 [2:40:37<3:29:07, 6.11s/it] 64%|██████▍ | 3721/5772 [2:40:45<3:30:23, 6.15s/it] 64%|██████▍ | 3721/5772 [2:40:43<3:30:23, 6.15s/it] {'loss': 0.4631, 'learning_rate': 5.924432315559213e-06, 'epoch': 0.64} + 64%|██████▍ | 3721/5772 [2:40:43<3:30:23, 6.15s/it] {'loss': 0.4631, 'learning_rate': 5.924432315559213e-06, 'epoch': 0.64} + 64%|██████▍ | 3721/5772 [2:40:45<3:30:23, 6.15s/it] 64%|██████▍ | 3722/5772 [2:40:49<3:26:15, 6.04s/it] 64%|██████▍ | 3722/5772 [2:40:51<3:26:15, 6.04s/it] {'loss': 0.4518, 'learning_rate': 5.919308200238385e-06, 'epoch': 0.64} + 64%|██████▍ | 3722/5772 [2:40:51<3:26:15, 6.04s/it] {'loss': 0.4518, 'learning_rate': 5.919308200238385e-06, 'epoch': 0.64} + 64%|██████▍ | 3722/5772 [2:40:49<3:26:15, 6.04s/it] 65%|██████▍ | 3723/5772 [2:40:55<3:26:06, 6.04s/it] 65%|██████▍ | 3723/5772 [2:40:57<3:26:06, 6.04s/it] {'loss': 0.4709, 'learning_rate': 5.914185370108749e-06, 'epoch': 0.64} + 65%|██████▍ | 3723/5772 [2:40:57<3:26:06, 6.04s/it] {'loss': 0.4709, 'learning_rate': 5.914185370108749e-06, 'epoch': 0.64} + 65%|██████▍ | 3723/5772 [2:40:55<3:26:06, 6.04s/it] 65%|██████▍ | 3724/5772 [2:41:01<3:26:35, 6.05s/it] 65%|██████▍ | 3724/5772 [2:41:03<3:26:35, 6.05s/it] {'loss': 0.4643, 'learning_rate': 5.9090638267837144e-06, 'epoch': 0.65} + 65%|██████▍ | 3724/5772 [2:41:03<3:26:35, 6.05s/it] {'loss': 0.4643, 'learning_rate': 5.9090638267837144e-06, 'epoch': 0.65} + 65%|██████▍ | 3724/5772 [2:41:01<3:26:35, 6.05s/it] 65%|██████▍ | 3725/5772 [2:41:09<3:27:21, 6.08s/it] 65%|██████▍ | 3725/5772 [2:41:07<3:27:21, 6.08s/it] {'loss': 0.4795, 'learning_rate': 5.90394357187628e-06, 'epoch': 0.65} + 65%|██████▍ | 3725/5772 [2:41:09<3:27:21, 6.08s/it] {'loss': 0.4795, 'learning_rate': 5.90394357187628e-06, 'epoch': 0.65} + 65%|██████▍ | 3725/5772 [2:41:07<3:27:21, 6.08s/it] 65%|██████▍ | 3726/5772 [2:41:16<3:33:11, 6.25s/it] 65%|██████▍ | 3726/5772 [2:41:14<3:33:11, 6.25s/it] {'loss': 0.444, 'learning_rate': 5.898824606999047e-06, 'epoch': 0.65} + 65%|██████▍ | 3726/5772 [2:41:16<3:33:11, 6.25s/it] {'loss': 0.444, 'learning_rate': 5.898824606999047e-06, 'epoch': 0.65} + 65%|██████▍ | 3726/5772 [2:41:14<3:33:11, 6.25s/it] 65%|██████▍ | 3727/5772 [2:41:20<3:34:16, 6.29s/it] 65%|██████▍ | 3727/5772 [2:41:22<3:34:16, 6.29s/it]{'loss': 0.4698, 'learning_rate': 5.893706933764196e-06, 'epoch': 0.65} + 65%|██████▍ | 3727/5772 [2:41:20<3:34:16, 6.29s/it] {'loss': 0.4698, 'learning_rate': 5.893706933764196e-06, 'epoch': 0.65} + 65%|██████▍ | 3727/5772 [2:41:22<3:34:16, 6.29s/it] 65%|██████▍ | 3728/5772 [2:41:26<3:33:09, 6.26s/it] 65%|██████▍ | 3728/5772 [2:41:28<3:33:09, 6.26s/it] {'loss': 0.4571, 'learning_rate': 5.888590553783517e-06, 'epoch': 0.65} + 65%|██████▍ | 3728/5772 [2:41:28<3:33:09, 6.26s/it] {'loss': 0.4571, 'learning_rate': 5.888590553783517e-06, 'epoch': 0.65} + 65%|██████▍ | 3728/5772 [2:41:26<3:33:09, 6.26s/it] 65%|██████▍ | 3729/5772 [2:41:32<3:32:15, 6.23s/it] 65%|██████▍ | 3729/5772 [2:41:34<3:32:15, 6.23s/it] {'loss': 0.4757, 'learning_rate': 5.883475468668387e-06, 'epoch': 0.65} + 65%|██████▍ | 3729/5772 [2:41:34<3:32:15, 6.23s/it] {'loss': 0.4757, 'learning_rate': 5.883475468668387e-06, 'epoch': 0.65} + 65%|██████▍ | 3729/5772 [2:41:32<3:32:15, 6.23s/it] 65%|██████▍ | 3730/5772 [2:41:39<3:32:19, 6.24s/it] 65%|██████▍ | 3730/5772 [2:41:41<3:32:19, 6.24s/it] {'loss': 0.4632, 'learning_rate': 5.8783616800297675e-06, 'epoch': 0.65} + 65%|██████▍ | 3730/5772 [2:41:41<3:32:19, 6.24s/it] {'loss': 0.4632, 'learning_rate': 5.8783616800297675e-06, 'epoch': 0.65} + 65%|██████▍ | 3730/5772 [2:41:39<3:32:19, 6.24s/it] 65%|██████▍ | 3731/5772 [2:41:45<3:32:00, 6.23s/it] 65%|██████▍ | 3731/5772 [2:41:47<3:32:00, 6.23s/it] {'loss': 0.4665, 'learning_rate': 5.873249189478221e-06, 'epoch': 0.65} + 65%|██████▍ | 3731/5772 [2:41:47<3:32:00, 6.23s/it] {'loss': 0.4665, 'learning_rate': 5.873249189478221e-06, 'epoch': 0.65} + 65%|██████▍ | 3731/5772 [2:41:45<3:32:00, 6.23s/it] 65%|██████▍ | 3732/5772 [2:41:53<3:33:16, 6.27s/it] 65%|██████▍ | 3732/5772 [2:41:51<3:33:16, 6.27s/it] {'loss': 0.4642, 'learning_rate': 5.868137998623897e-06, 'epoch': 0.65} + 65%|██████▍ | 3732/5772 [2:41:53<3:33:16, 6.27s/it] {'loss': 0.4642, 'learning_rate': 5.868137998623897e-06, 'epoch': 0.65} + 65%|██████▍ | 3732/5772 [2:41:51<3:33:16, 6.27s/it] 65%|██████▍ | 3733/5772 [2:42:00<3:35:47, 6.35s/it] 65%|██████▍ | 3733/5772 [2:41:58<3:35:47, 6.35s/it] {'loss': 0.47, 'learning_rate': 5.8630281090765386e-06, 'epoch': 0.65} + 65%|██████▍ | 3733/5772 [2:42:00<3:35:47, 6.35s/it] {'loss': 0.47, 'learning_rate': 5.8630281090765386e-06, 'epoch': 0.65} + 65%|██████▍ | 3733/5772 [2:41:58<3:35:47, 6.35s/it] 65%|██████▍ | 3734/5772 [2:42:04<3:34:29, 6.31s/it] 65%|██████▍ | 3734/5772 [2:42:06<3:34:29, 6.31s/it] {'loss': 0.461, 'learning_rate': 5.857919522445475e-06, 'epoch': 0.65} + 65%|██████▍ | 3734/5772 [2:42:06<3:34:29, 6.31s/it] {'loss': 0.461, 'learning_rate': 5.857919522445475e-06, 'epoch': 0.65} + 65%|██████▍ | 3734/5772 [2:42:04<3:34:29, 6.31s/it] 65%|██████▍ | 3735/5772 [2:42:10<3:31:41, 6.24s/it] 65%|██████▍ | 3735/5772 [2:42:12<3:31:41, 6.24s/it] {'loss': 0.4655, 'learning_rate': 5.8528122403396226e-06, 'epoch': 0.65} + 65%|██████▍ | 3735/5772 [2:42:12<3:31:41, 6.24s/it] {'loss': 0.4655, 'learning_rate': 5.8528122403396226e-06, 'epoch': 0.65} + 65%|██████▍ | 3735/5772 [2:42:10<3:31:41, 6.24s/it] 65%|██████▍ | 3736/5772 [2:42:16<3:28:59, 6.16s/it] 65%|██████▍ | 3736/5772 [2:42:18<3:28:59, 6.16s/it] {'loss': 0.4718, 'learning_rate': 5.847706264367503e-06, 'epoch': 0.65} + 65%|██████▍ | 3736/5772 [2:42:18<3:28:59, 6.16s/it] {'loss': 0.4718, 'learning_rate': 5.847706264367503e-06, 'epoch': 0.65} + 65%|██████▍ | 3736/5772 [2:42:16<3:28:59, 6.16s/it] 65%|██████▍ | 3737/5772 [2:42:22<3:29:20, 6.17s/it] 65%|██████▍ | 3737/5772 [2:42:24<3:29:20, 6.17s/it] {'loss': 0.4683, 'learning_rate': 5.842601596137206e-06, 'epoch': 0.65} + 65%|██████▍ | 3737/5772 [2:42:24<3:29:20, 6.17s/it] {'loss': 0.4683, 'learning_rate': 5.842601596137206e-06, 'epoch': 0.65} + 65%|██████▍ | 3737/5772 [2:42:22<3:29:20, 6.17s/it] 65%|██████▍ | 3738/5772 [2:42:28<3:28:15, 6.14s/it] 65%|██████▍ | 3738/5772 [2:42:30<3:28:15, 6.14s/it] {'loss': 0.4635, 'learning_rate': 5.8374982372564255e-06, 'epoch': 0.65} + 65%|██████▍ | 3738/5772 [2:42:30<3:28:15, 6.14s/it] {'loss': 0.4635, 'learning_rate': 5.8374982372564255e-06, 'epoch': 0.65} + 65%|██████▍ | 3738/5772 [2:42:28<3:28:15, 6.14s/it] 65%|██████▍ | 3739/5772 [2:42:34<3:25:15, 6.06s/it] 65%|██████▍ | 3739/5772 [2:42:36<3:25:15, 6.06s/it] {'loss': 0.464, 'learning_rate': 5.832396189332423e-06, 'epoch': 0.65} + 65%|██████▍ | 3739/5772 [2:42:36<3:25:15, 6.06s/it] {'loss': 0.464, 'learning_rate': 5.832396189332423e-06, 'epoch': 0.65} + 65%|██████▍ | 3739/5772 [2:42:34<3:25:15, 6.06s/it] 65%|██████▍ | 3740/5772 [2:42:40<3:26:13, 6.09s/it] 65%|██████▍ | 3740/5772 [2:42:42<3:26:13, 6.09s/it] {'loss': 0.4756, 'learning_rate': 5.8272954539720775e-06, 'epoch': 0.65} + 65%|██████▍ | 3740/5772 [2:42:42<3:26:13, 6.09s/it] {'loss': 0.4756, 'learning_rate': 5.8272954539720775e-06, 'epoch': 0.65} + 65%|██████▍ | 3740/5772 [2:42:40<3:26:13, 6.09s/it] 65%|██████▍ | 3741/5772 [2:42:47<3:28:10, 6.15s/it] 65%|██████▍ | 3741/5772 [2:42:49<3:28:11, 6.15s/it] {'loss': 0.4625, 'learning_rate': 5.822196032781824e-06, 'epoch': 0.65} + 65%|██████▍ | 3741/5772 [2:42:49<3:28:11, 6.15s/it] {'loss': 0.4625, 'learning_rate': 5.822196032781824e-06, 'epoch': 0.65} + 65%|██████▍ | 3741/5772 [2:42:47<3:28:10, 6.15s/it] 65%|██████▍ | 3742/5772 [2:42:53<3:27:26, 6.13s/it] 65%|██████▍ | 3742/5772 [2:42:55<3:27:26, 6.13s/it] {'loss': 0.4692, 'learning_rate': 5.817097927367701e-06, 'epoch': 0.65} + 65%|██████▍ | 3742/5772 [2:42:55<3:27:26, 6.13s/it] {'loss': 0.4692, 'learning_rate': 5.817097927367701e-06, 'epoch': 0.65} + 65%|██████▍ | 3742/5772 [2:42:53<3:27:26, 6.13s/it] 65%|██████▍ | 3743/5772 [2:42:59<3:26:04, 6.09s/it] 65%|██████▍ | 3743/5772 [2:43:01<3:26:04, 6.09s/it] {'loss': 0.4654, 'learning_rate': 5.812001139335329e-06, 'epoch': 0.65} + 65%|██████▍ | 3743/5772 [2:43:01<3:26:04, 6.09s/it] {'loss': 0.4654, 'learning_rate': 5.812001139335329e-06, 'epoch': 0.65} + 65%|██████▍ | 3743/5772 [2:42:59<3:26:04, 6.09s/it] 65%|██████▍ | 3744/5772 [2:43:05<3:24:59, 6.06s/it] 65%|██████▍ | 3744/5772 [2:43:07<3:24:59, 6.06s/it] {'loss': 0.4728, 'learning_rate': 5.806905670289913e-06, 'epoch': 0.65} + 65%|██████▍ | 3744/5772 [2:43:07<3:24:59, 6.06s/it] {'loss': 0.4728, 'learning_rate': 5.806905670289913e-06, 'epoch': 0.65} + 65%|██████▍ | 3744/5772 [2:43:05<3:24:59, 6.06s/it] 65%|██████▍ | 3745/5772 [2:43:11<3:25:46, 6.09s/it] 65%|██████▍ | 3745/5772 [2:43:13<3:25:46, 6.09s/it] {'loss': 0.4626, 'learning_rate': 5.801811521836246e-06, 'epoch': 0.65} + 65%|██████▍ | 3745/5772 [2:43:13<3:25:46, 6.09s/it] {'loss': 0.4626, 'learning_rate': 5.801811521836246e-06, 'epoch': 0.65} + 65%|██████▍ | 3745/5772 [2:43:11<3:25:46, 6.09s/it] 65%|██████▍ | 3746/5772 [2:43:17<3:28:19, 6.17s/it] 65%|██████▍ | 3746/5772 [2:43:19<3:28:20, 6.17s/it] {'loss': 0.4652, 'learning_rate': 5.796718695578695e-06, 'epoch': 0.65} + 65%|██████▍ | 3746/5772 [2:43:19<3:28:20, 6.17s/it] {'loss': 0.4652, 'learning_rate': 5.796718695578695e-06, 'epoch': 0.65} + 65%|██████▍ | 3746/5772 [2:43:17<3:28:19, 6.17s/it] 65%|██████▍ | 3747/5772 [2:43:26<3:30:23, 6.23s/it] 65%|██████▍ | 3747/5772 [2:43:24<3:30:23, 6.23s/it] {'loss': 0.4602, 'learning_rate': 5.7916271931212185e-06, 'epoch': 0.65} + 65%|██████▍ | 3747/5772 [2:43:26<3:30:23, 6.23s/it] {'loss': 0.4602, 'learning_rate': 5.7916271931212185e-06, 'epoch': 0.65} + 65%|██████▍ | 3747/5772 [2:43:24<3:30:23, 6.23s/it] 65%|██████▍ | 3748/5772 [2:43:32<3:28:56, 6.19s/it] 65%|██████▍ | 3748/5772 [2:43:30<3:28:57, 6.19s/it] {'loss': 0.4601, 'learning_rate': 5.786537016067362e-06, 'epoch': 0.65} + 65%|██████▍ | 3748/5772 [2:43:32<3:28:56, 6.19s/it] {'loss': 0.4601, 'learning_rate': 5.786537016067362e-06, 'epoch': 0.65} + 65%|██████▍ | 3748/5772 [2:43:30<3:28:57, 6.19s/it] 65%|██████▍ | 3749/5772 [2:43:38<3:26:55, 6.14s/it] 65%|██████▍ | 3749/5772 [2:43:36<3:26:55, 6.14s/it] {'loss': 0.465, 'learning_rate': 5.781448166020242e-06, 'epoch': 0.65} + 65%|██████▍ | 3749/5772 [2:43:38<3:26:55, 6.14s/it] {'loss': 0.465, 'learning_rate': 5.781448166020242e-06, 'epoch': 0.65} + 65%|██████▍ | 3749/5772 [2:43:36<3:26:55, 6.14s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 4 AutoResumeHook: Checking whether to suspend...6 + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 65%|██████▍ | 3750/5772 [2:43:44<3:28:54, 6.20s/it]2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 65%|██████▍ | 3750/5772 [2:43:42<3:29:06, 6.20s/it] {'loss': 0.4587, 'learning_rate': 5.776360644582569e-06, 'epoch': 0.65} + 65%|██████▍ | 3750/5772 [2:43:44<3:28:54, 6.20s/it] {'loss': 0.4587, 'learning_rate': 5.776360644582569e-06, 'epoch': 0.65} + 65%|██████▍ | 3750/5772 [2:43:42<3:29:06, 6.20s/it] 65%|██████▍ | 3751/5772 [2:43:48<3:29:33, 6.22s/it] 65%|██████▍ | 3751/5772 [2:43:50<3:29:37, 6.22s/it] {'loss': 0.467, 'learning_rate': 5.771274453356628e-06, 'epoch': 0.65} + 65%|██████▍ | 3751/5772 [2:43:50<3:29:37, 6.22s/it] {'loss': 0.467, 'learning_rate': 5.771274453356628e-06, 'epoch': 0.65} + 65%|██████▍ | 3751/5772 [2:43:48<3:29:33, 6.22s/it] 65%|██████▌ | 3752/5772 [2:43:55<3:32:55, 6.32s/it] 65%|██████▌ | 3752/5772 [2:43:57<3:32:57, 6.33s/it] {'loss': 0.4571, 'learning_rate': 5.766189593944289e-06, 'epoch': 0.65} + 65%|██████▌ | 3752/5772 [2:43:57<3:32:57, 6.33s/it] {'loss': 0.4571, 'learning_rate': 5.766189593944289e-06, 'epoch': 0.65} + 65%|██████▌ | 3752/5772 [2:43:55<3:32:55, 6.32s/it] 65%|██████▌ | 3753/5772 [2:44:03<3:31:12, 6.28s/it] 65%|██████▌ | 3753/5772 [2:44:01<3:31:11, 6.28s/it] {'loss': 0.4588, 'learning_rate': 5.761106067946993e-06, 'epoch': 0.65} + 65%|██████▌ | 3753/5772 [2:44:03<3:31:12, 6.28s/it] {'loss': 0.4588, 'learning_rate': 5.761106067946993e-06, 'epoch': 0.65} + 65%|██████▌ | 3753/5772 [2:44:01<3:31:11, 6.28s/it] 65%|██████▌ | 3754/5772 [2:44:07<3:30:47, 6.27s/it] 65%|██████▌ | 3754/5772 [2:44:09<3:30:48, 6.27s/it] {'loss': 0.4725, 'learning_rate': 5.756023876965773e-06, 'epoch': 0.65} + 65%|██████▌ | 3754/5772 [2:44:09<3:30:48, 6.27s/it] {'loss': 0.4725, 'learning_rate': 5.756023876965773e-06, 'epoch': 0.65} + 65%|██████▌ | 3754/5772 [2:44:07<3:30:47, 6.27s/it] 65%|██████▌ | 3755/5772 [2:44:13<3:27:06, 6.16s/it] 65%|██████▌ | 3755/5772 [2:44:15<3:27:08, 6.16s/it] {'loss': 0.4601, 'learning_rate': 5.7509430226012365e-06, 'epoch': 0.65} + 65%|██████▌ | 3755/5772 [2:44:15<3:27:08, 6.16s/it] {'loss': 0.4601, 'learning_rate': 5.7509430226012365e-06, 'epoch': 0.65} + 65%|██████▌ | 3755/5772 [2:44:13<3:27:06, 6.16s/it] 65%|██████▌ | 3756/5772 [2:44:19<3:27:17, 6.17s/it] 65%|██████▌ | 3756/5772 [2:44:21<3:27:17, 6.17s/it] {'loss': 0.4668, 'learning_rate': 5.745863506453569e-06, 'epoch': 0.65} + 65%|██████▌ | 3756/5772 [2:44:21<3:27:17, 6.17s/it] {'loss': 0.4668, 'learning_rate': 5.745863506453569e-06, 'epoch': 0.65} + 65%|██████▌ | 3756/5772 [2:44:19<3:27:17, 6.17s/it] 65%|██████▌ | 3757/5772 [2:44:26<3:27:35, 6.18s/it] 65%|██████▌ | 3757/5772 [2:44:28<3:27:35, 6.18s/it] {'loss': 0.4774, 'learning_rate': 5.740785330122542e-06, 'epoch': 0.65} + 65%|██████▌ | 3757/5772 [2:44:28<3:27:35, 6.18s/it] {'loss': 0.4774, 'learning_rate': 5.740785330122542e-06, 'epoch': 0.65} + 65%|██████▌ | 3757/5772 [2:44:26<3:27:35, 6.18s/it] 65%|██████▌ | 3758/5772 [2:44:32<3:27:12, 6.17s/it] 65%|██████▌ | 3758/5772 [2:44:34<3:27:12, 6.17s/it] {'loss': 0.4693, 'learning_rate': 5.735708495207486e-06, 'epoch': 0.65} + 65%|██████▌ | 3758/5772 [2:44:34<3:27:12, 6.17s/it] {'loss': 0.4693, 'learning_rate': 5.735708495207486e-06, 'epoch': 0.65} + 65%|██████▌ | 3758/5772 [2:44:32<3:27:12, 6.17s/it] 65%|██████▌ | 3759/5772 [2:44:38<3:25:22, 6.12s/it] 65%|██████▌ | 3759/5772 [2:44:40<3:25:22, 6.12s/it] {'loss': 0.4639, 'learning_rate': 5.730633003307338e-06, 'epoch': 0.65} + 65%|██████▌ | 3759/5772 [2:44:40<3:25:22, 6.12s/it] {'loss': 0.4639, 'learning_rate': 5.730633003307338e-06, 'epoch': 0.65} + 65%|██████▌ | 3759/5772 [2:44:38<3:25:22, 6.12s/it] 65%|██████▌ | 3760/5772 [2:44:44<3:25:33, 6.13s/it] 65%|██████▌ | 3760/5772 [2:44:46<3:25:33, 6.13s/it] {'loss': 0.4714, 'learning_rate': 5.725558856020584e-06, 'epoch': 0.65} + 65%|██████▌ | 3760/5772 [2:44:46<3:25:33, 6.13s/it] {'loss': 0.4714, 'learning_rate': 5.725558856020584e-06, 'epoch': 0.65} + 65%|██████▌ | 3760/5772 [2:44:44<3:25:33, 6.13s/it] 65%|██████▌ | 3761/5772 [2:44:50<3:23:36, 6.08s/it] 65%|██████▌ | 3761/5772 [2:44:52<3:23:36, 6.08s/it] {'loss': 0.4505, 'learning_rate': 5.7204860549453025e-06, 'epoch': 0.65} + 65%|██████▌ | 3761/5772 [2:44:52<3:23:36, 6.08s/it] {'loss': 0.4505, 'learning_rate': 5.7204860549453025e-06, 'epoch': 0.65} + 65%|██████▌ | 3761/5772 [2:44:50<3:23:36, 6.08s/it] 65%|██████▌ | 3762/5772 [2:44:56<3:22:17, 6.04s/it] 65%|██████▌ | 3762/5772 [2:44:58<3:22:17, 6.04s/it] {'loss': 0.4769, 'learning_rate': 5.715414601679144e-06, 'epoch': 0.65} + 65%|██████▌ | 3762/5772 [2:44:58<3:22:17, 6.04s/it] {'loss': 0.4769, 'learning_rate': 5.715414601679144e-06, 'epoch': 0.65} + 65%|██████▌ | 3762/5772 [2:44:56<3:22:17, 6.04s/it] 65%|██████▌ | 3763/5772 [2:45:04<3:23:02, 6.06s/it] 65%|██████▌ | 3763/5772 [2:45:02<3:23:02, 6.06s/it] {'loss': 0.4562, 'learning_rate': 5.710344497819333e-06, 'epoch': 0.65} + 65%|██████▌ | 3763/5772 [2:45:04<3:23:02, 6.06s/it] {'loss': 0.4562, 'learning_rate': 5.710344497819333e-06, 'epoch': 0.65} + 65%|██████▌ | 3763/5772 [2:45:02<3:23:02, 6.06s/it] 65%|██████▌ | 3764/5772 [2:45:10<3:20:10, 5.98s/it] 65%|██████▌ | 3764/5772 [2:45:08<3:20:11, 5.98s/it] {'loss': 0.4739, 'learning_rate': 5.705275744962676e-06, 'epoch': 0.65} + 65%|██████▌ | 3764/5772 [2:45:10<3:20:10, 5.98s/it] {'loss': 0.4739, 'learning_rate': 5.705275744962676e-06, 'epoch': 0.65} + 65%|██████▌ | 3764/5772 [2:45:08<3:20:11, 5.98s/it] 65%|██████▌ | 3765/5772 [2:45:16<3:22:12, 6.05s/it] 65%|██████▌ | 3765/5772 [2:45:14<3:22:13, 6.05s/it] {'loss': 0.4644, 'learning_rate': 5.700208344705537e-06, 'epoch': 0.65} + 65%|██████▌ | 3765/5772 [2:45:16<3:22:12, 6.05s/it] {'loss': 0.4644, 'learning_rate': 5.700208344705537e-06, 'epoch': 0.65} + 65%|██████▌ | 3765/5772 [2:45:14<3:22:13, 6.05s/it] 65%|██████▌ | 3766/5772 [2:45:20<3:22:14, 6.05s/it] 65%|██████▌ | 3766/5772 [2:45:22<3:22:14, 6.05s/it] {'loss': 0.4559, 'learning_rate': 5.695142298643881e-06, 'epoch': 0.65} + 65%|██████▌ | 3766/5772 [2:45:22<3:22:14, 6.05s/it] {'loss': 0.4559, 'learning_rate': 5.695142298643881e-06, 'epoch': 0.65} + 65%|██████▌ | 3766/5772 [2:45:20<3:22:14, 6.05s/it] 65%|██████▌ | 3767/5772 [2:45:26<3:27:18, 6.20s/it] 65%|██████▌ | 3767/5772 [2:45:28<3:27:18, 6.20s/it] {'loss': 0.4567, 'learning_rate': 5.690077608373219e-06, 'epoch': 0.65} + 65%|██████▌ | 3767/5772 [2:45:28<3:27:18, 6.20s/it] {'loss': 0.4567, 'learning_rate': 5.690077608373219e-06, 'epoch': 0.65} + 65%|██████▌ | 3767/5772 [2:45:26<3:27:18, 6.20s/it] 65%|██████▌ | 3768/5772 [2:45:33<3:25:38, 6.16s/it] 65%|██████▌ | 3768/5772 [2:45:35<3:25:38, 6.16s/it] {'loss': 0.4767, 'learning_rate': 5.685014275488649e-06, 'epoch': 0.65} + 65%|██████▌ | 3768/5772 [2:45:35<3:25:38, 6.16s/it] {'loss': 0.4767, 'learning_rate': 5.685014275488649e-06, 'epoch': 0.65} + 65%|██████▌ | 3768/5772 [2:45:33<3:25:38, 6.16s/it] 65%|██████▌ | 3769/5772 [2:45:39<3:24:55, 6.14s/it] 65%|██████▌ | 3769/5772 [2:45:41<3:24:55, 6.14s/it] {'loss': 0.4589, 'learning_rate': 5.679952301584844e-06, 'epoch': 0.65} + 65%|██████▌ | 3769/5772 [2:45:41<3:24:55, 6.14s/it] {'loss': 0.4589, 'learning_rate': 5.679952301584844e-06, 'epoch': 0.65} + 65%|██████▌ | 3769/5772 [2:45:39<3:24:55, 6.14s/it] 65%|██████▌ | 3770/5772 [2:45:45<3:26:11, 6.18s/it] 65%|██████▌ | 3770/5772 [2:45:47<3:26:11, 6.18s/it] {'loss': 0.4607, 'learning_rate': 5.674891688256041e-06, 'epoch': 0.65} + 65%|██████▌ | 3770/5772 [2:45:47<3:26:11, 6.18s/it] {'loss': 0.4607, 'learning_rate': 5.674891688256041e-06, 'epoch': 0.65} + 65%|██████▌ | 3770/5772 [2:45:45<3:26:11, 6.18s/it] 65%|██████▌ | 3771/5772 [2:45:51<3:29:21, 6.28s/it] 65%|██████▌ | 3771/5772 [2:45:53<3:29:22, 6.28s/it] {'loss': 0.4569, 'learning_rate': 5.669832437096058e-06, 'epoch': 0.65} + 65%|██████▌ | 3771/5772 [2:45:53<3:29:22, 6.28s/it] {'loss': 0.4569, 'learning_rate': 5.669832437096058e-06, 'epoch': 0.65} + 65%|██████▌ | 3771/5772 [2:45:51<3:29:21, 6.28s/it] 65%|██████▌ | 3772/5772 [2:46:00<3:27:51, 6.24s/it] 65%|██████▌ | 3772/5772 [2:45:58<3:27:51, 6.24s/it] {'loss': 0.4761, 'learning_rate': 5.664774549698269e-06, 'epoch': 0.65} + 65%|██████▌ | 3772/5772 [2:46:00<3:27:51, 6.24s/it] {'loss': 0.4761, 'learning_rate': 5.664774549698269e-06, 'epoch': 0.65} + 65%|██████▌ | 3772/5772 [2:45:58<3:27:51, 6.24s/it] 65%|██████▌ | 3773/5772 [2:46:06<3:27:29, 6.23s/it] 65%|██████▌ | 3773/5772 [2:46:04<3:27:30, 6.23s/it] {'loss': 0.4579, 'learning_rate': 5.659718027655631e-06, 'epoch': 0.65} + 65%|██████▌ | 3773/5772 [2:46:06<3:27:29, 6.23s/it] {'loss': 0.4579, 'learning_rate': 5.659718027655631e-06, 'epoch': 0.65} + 65%|██████▌ | 3773/5772 [2:46:04<3:27:30, 6.23s/it] 65%|██████▌ | 3774/5772 [2:46:12<3:26:16, 6.19s/it] 65%|██████▌ | 3774/5772 [2:46:10<3:26:16, 6.19s/it] {'loss': 0.4528, 'learning_rate': 5.6546628725606675e-06, 'epoch': 0.65} + 65%|██████▌ | 3774/5772 [2:46:12<3:26:16, 6.19s/it] {'loss': 0.4528, 'learning_rate': 5.6546628725606675e-06, 'epoch': 0.65} + 65%|██████▌ | 3774/5772 [2:46:10<3:26:16, 6.19s/it] 65%|██████▌ | 3775/5772 [2:46:16<3:24:07, 6.13s/it] 65%|██████▌ | 3775/5772 [2:46:18<3:24:07, 6.13s/it] {'loss': 0.458, 'learning_rate': 5.649609086005476e-06, 'epoch': 0.65} + 65%|██████▌ | 3775/5772 [2:46:18<3:24:07, 6.13s/it] {'loss': 0.458, 'learning_rate': 5.649609086005476e-06, 'epoch': 0.65} + 65%|██████▌ | 3775/5772 [2:46:16<3:24:07, 6.13s/it] 65%|██████▌ | 3776/5772 [2:46:22<3:24:42, 6.15s/it] 65%|██████▌ | 3776/5772 [2:46:24<3:24:42, 6.15s/it] {'loss': 0.4681, 'learning_rate': 5.644556669581709e-06, 'epoch': 0.65} + 65%|██████▌ | 3776/5772 [2:46:24<3:24:42, 6.15s/it] {'loss': 0.4681, 'learning_rate': 5.644556669581709e-06, 'epoch': 0.65} + 65%|██████▌ | 3776/5772 [2:46:22<3:24:42, 6.15s/it] 65%|██████▌ | 3777/5772 [2:46:28<3:24:43, 6.16s/it] 65%|██████▌ | 3777/5772 [2:46:30<3:24:43, 6.16s/it] {'loss': 0.4688, 'learning_rate': 5.639505624880604e-06, 'epoch': 0.65} + 65%|██████▌ | 3777/5772 [2:46:30<3:24:43, 6.16s/it] {'loss': 0.4688, 'learning_rate': 5.639505624880604e-06, 'epoch': 0.65} + 65%|██████▌ | 3777/5772 [2:46:28<3:24:43, 6.16s/it] 65%|██████▌ | 3778/5772 [2:46:34<3:22:17, 6.09s/it] 65%|██████▌ | 3778/5772 [2:46:36<3:22:17, 6.09s/it] {'loss': 0.4686, 'learning_rate': 5.634455953492964e-06, 'epoch': 0.65} + 65%|██████▌ | 3778/5772 [2:46:36<3:22:17, 6.09s/it] {'loss': 0.4686, 'learning_rate': 5.634455953492964e-06, 'epoch': 0.65} + 65%|██████▌ | 3778/5772 [2:46:34<3:22:17, 6.09s/it] 65%|██████▌ | 3779/5772 [2:46:42<3:23:49, 6.14s/it] 65%|██████▌ | 3779/5772 [2:46:40<3:23:50, 6.14s/it] {'loss': 0.4591, 'learning_rate': 5.629407657009143e-06, 'epoch': 0.65} + 65%|██████▌ | 3779/5772 [2:46:42<3:23:49, 6.14s/it] {'loss': 0.4591, 'learning_rate': 5.629407657009143e-06, 'epoch': 0.65} + 65%|██████▌ | 3779/5772 [2:46:40<3:23:50, 6.14s/it] 65%|██████▌ | 3780/5772 [2:46:46<3:21:08, 6.06s/it] 65%|██████▌ | 3780/5772 [2:46:48<3:21:08, 6.06s/it] {'loss': 0.4553, 'learning_rate': 5.624360737019081e-06, 'epoch': 0.65} + 65%|██████▌ | 3780/5772 [2:46:48<3:21:08, 6.06s/it] {'loss': 0.4553, 'learning_rate': 5.624360737019081e-06, 'epoch': 0.65} + 65%|██████▌ | 3780/5772 [2:46:46<3:21:08, 6.06s/it] 66%|██████▌ | 3781/5772 [2:46:54<3:20:56, 6.06s/it] 66%|██████▌ | 3781/5772 [2:46:52<3:20:56, 6.06s/it] {'loss': 0.4691, 'learning_rate': 5.619315195112276e-06, 'epoch': 0.66} + 66%|██████▌ | 3781/5772 [2:46:54<3:20:56, 6.06s/it] {'loss': 0.4691, 'learning_rate': 5.619315195112276e-06, 'epoch': 0.66} + 66%|██████▌ | 3781/5772 [2:46:52<3:20:56, 6.06s/it] 66%|██████▌ | 3782/5772 [2:47:00<3:20:46, 6.05s/it] 66%|██████▌ | 3782/5772 [2:46:58<3:20:47, 6.05s/it] {'loss': 0.4607, 'learning_rate': 5.614271032877799e-06, 'epoch': 0.66} + 66%|██████▌ | 3782/5772 [2:47:00<3:20:46, 6.05s/it] {'loss': 0.4607, 'learning_rate': 5.614271032877799e-06, 'epoch': 0.66} + 66%|██████▌ | 3782/5772 [2:46:58<3:20:47, 6.05s/it] 66%|██████▌ | 3783/5772 [2:47:06<3:19:47, 6.03s/it] 66%|██████▌ | 3783/5772 [2:47:04<3:19:47, 6.03s/it] {'loss': 0.4665, 'learning_rate': 5.609228251904265e-06, 'epoch': 0.66} + 66%|██████▌ | 3783/5772 [2:47:06<3:19:47, 6.03s/it] {'loss': 0.4665, 'learning_rate': 5.609228251904265e-06, 'epoch': 0.66} + 66%|██████▌ | 3783/5772 [2:47:04<3:19:47, 6.03s/it] 66%|██████▌ | 3784/5772 [2:47:10<3:19:45, 6.03s/it] 66%|██████▌ | 3784/5772 [2:47:12<3:19:45, 6.03s/it] {'loss': 0.4743, 'learning_rate': 5.6041868537798845e-06, 'epoch': 0.66} + 66%|██████▌ | 3784/5772 [2:47:12<3:19:45, 6.03s/it] {'loss': 0.4743, 'learning_rate': 5.6041868537798845e-06, 'epoch': 0.66} + 66%|██████▌ | 3784/5772 [2:47:10<3:19:45, 6.03s/it] 66%|██████▌ | 3785/5772 [2:47:18<3:19:22, 6.02s/it] 66%|██████▌ | 3785/5772 [2:47:16<3:19:22, 6.02s/it] {'loss': 0.4555, 'learning_rate': 5.59914684009242e-06, 'epoch': 0.66} + 66%|██████▌ | 3785/5772 [2:47:18<3:19:22, 6.02s/it] {'loss': 0.4555, 'learning_rate': 5.59914684009242e-06, 'epoch': 0.66} + 66%|██████▌ | 3785/5772 [2:47:16<3:19:22, 6.02s/it] 66%|██████▌ | 3786/5772 [2:47:23<3:21:31, 6.09s/it] 66%|██████▌ | 3786/5772 [2:47:25<3:21:31, 6.09s/it] {'loss': 0.4663, 'learning_rate': 5.594108212429183e-06, 'epoch': 0.66} + 66%|██████▌ | 3786/5772 [2:47:25<3:21:31, 6.09s/it] {'loss': 0.4663, 'learning_rate': 5.594108212429183e-06, 'epoch': 0.66} + 66%|██████▌ | 3786/5772 [2:47:23<3:21:31, 6.09s/it] 66%|██████▌ | 3787/5772 [2:47:29<3:22:00, 6.11s/it] 66%|██████▌ | 3787/5772 [2:47:31<3:22:00, 6.11s/it] {'loss': 0.458, 'learning_rate': 5.589070972377068e-06, 'epoch': 0.66} + 66%|██████▌ | 3787/5772 [2:47:31<3:22:00, 6.11s/it] {'loss': 0.458, 'learning_rate': 5.589070972377068e-06, 'epoch': 0.66} + 66%|██████▌ | 3787/5772 [2:47:29<3:22:00, 6.11s/it] 66%|██████▌ | 3788/5772 [2:47:37<3:21:00, 6.08s/it] 66%|██████▌ | 3788/5772 [2:47:35<3:21:01, 6.08s/it] {'loss': 0.4612, 'learning_rate': 5.584035121522526e-06, 'epoch': 0.66} + 66%|██████▌ | 3788/5772 [2:47:37<3:21:00, 6.08s/it] {'loss': 0.4612, 'learning_rate': 5.584035121522526e-06, 'epoch': 0.66} + 66%|██████▌ | 3788/5772 [2:47:35<3:21:01, 6.08s/it] 66%|██████▌ | 3789/5772 [2:47:41<3:20:58, 6.08s/it] 66%|██████▌ | 3789/5772 [2:47:43<3:20:59, 6.08s/it] {'loss': 0.4473, 'learning_rate': 5.579000661451574e-06, 'epoch': 0.66} + 66%|██████▌ | 3789/5772 [2:47:43<3:20:59, 6.08s/it] {'loss': 0.4473, 'learning_rate': 5.579000661451574e-06, 'epoch': 0.66} + 66%|██████▌ | 3789/5772 [2:47:41<3:20:58, 6.08s/it] 66%|██████▌ | 3790/5772 [2:47:47<3:22:18, 6.12s/it] 66%|██████▌ | 3790/5772 [2:47:49<3:22:18, 6.12s/it] {'loss': 0.456, 'learning_rate': 5.573967593749778e-06, 'epoch': 0.66} + 66%|██████▌ | 3790/5772 [2:47:49<3:22:18, 6.12s/it] {'loss': 0.456, 'learning_rate': 5.573967593749778e-06, 'epoch': 0.66} + 66%|██████▌ | 3790/5772 [2:47:47<3:22:18, 6.12s/it] 66%|██████▌ | 3791/5772 [2:47:53<3:20:54, 6.09s/it] 66%|██████▌ | 3791/5772 [2:47:55<3:20:54, 6.09s/it] {'loss': 0.4696, 'learning_rate': 5.568935920002276e-06, 'epoch': 0.66} + 66%|██████▌ | 3791/5772 [2:47:55<3:20:54, 6.09s/it] {'loss': 0.4696, 'learning_rate': 5.568935920002276e-06, 'epoch': 0.66} + 66%|██████▌ | 3791/5772 [2:47:53<3:20:54, 6.09s/it] 66%|██████▌ | 3792/5772 [2:48:00<3:25:14, 6.22s/it] 66%|██████▌ | 3792/5772 [2:48:02<3:25:14, 6.22s/it] {'loss': 0.4693, 'learning_rate': 5.563905641793776e-06, 'epoch': 0.66} + 66%|██████▌ | 3792/5772 [2:48:02<3:25:14, 6.22s/it] {'loss': 0.4693, 'learning_rate': 5.563905641793776e-06, 'epoch': 0.66} + 66%|██████▌ | 3792/5772 [2:48:00<3:25:14, 6.22s/it] 66%|██████▌ | 3793/5772 [2:48:06<3:24:13, 6.19s/it] 66%|██████▌ | 3793/5772 [2:48:08<3:24:13, 6.19s/it] {'loss': 0.4759, 'learning_rate': 5.558876760708527e-06, 'epoch': 0.66} + 66%|██████▌ | 3793/5772 [2:48:08<3:24:13, 6.19s/it] {'loss': 0.4759, 'learning_rate': 5.558876760708527e-06, 'epoch': 0.66} + 66%|██████▌ | 3793/5772 [2:48:06<3:24:13, 6.19s/it] 66%|██████▌ | 3794/5772 [2:48:12<3:24:03, 6.19s/it] 66%|██████▌ | 3794/5772 [2:48:14<3:24:03, 6.19s/it] {'loss': 0.4616, 'learning_rate': 5.553849278330349e-06, 'epoch': 0.66} + 66%|██████▌ | 3794/5772 [2:48:14<3:24:03, 6.19s/it] {'loss': 0.4616, 'learning_rate': 5.553849278330349e-06, 'epoch': 0.66} + 66%|██████▌ | 3794/5772 [2:48:12<3:24:03, 6.19s/it] 66%|██████▌ | 3795/5772 [2:48:20<3:24:11, 6.20s/it] 66%|██████▌ | 3795/5772 [2:48:18<3:24:12, 6.20s/it] {'loss': 0.4664, 'learning_rate': 5.54882319624262e-06, 'epoch': 0.66} + 66%|██████▌ | 3795/5772 [2:48:20<3:24:11, 6.20s/it] {'loss': 0.4664, 'learning_rate': 5.54882319624262e-06, 'epoch': 0.66} + 66%|██████▌ | 3795/5772 [2:48:18<3:24:12, 6.20s/it] 66%|██████▌ | 3796/5772 [2:48:26<3:24:55, 6.22s/it] 66%|██████▌ | 3796/5772 [2:48:24<3:24:55, 6.22s/it] {'loss': 0.4681, 'learning_rate': 5.54379851602828e-06, 'epoch': 0.66} + 66%|██████▌ | 3796/5772 [2:48:26<3:24:55, 6.22s/it] {'loss': 0.4681, 'learning_rate': 5.54379851602828e-06, 'epoch': 0.66} + 66%|██████▌ | 3796/5772 [2:48:24<3:24:55, 6.22s/it] 66%|██████▌ | 3797/5772 [2:48:30<3:20:09, 6.08s/it] 66%|██████▌ | 3797/5772 [2:48:32<3:20:09, 6.08s/it] {'loss': 0.4507, 'learning_rate': 5.538775239269818e-06, 'epoch': 0.66} + 66%|██████▌ | 3797/5772 [2:48:32<3:20:09, 6.08s/it] {'loss': 0.4507, 'learning_rate': 5.538775239269818e-06, 'epoch': 0.66} + 66%|██████▌ | 3797/5772 [2:48:30<3:20:09, 6.08s/it] 66%|██████▌ | 3798/5772 [2:48:36<3:19:12, 6.05s/it] 66%|██████▌ | 3798/5772 [2:48:38<3:19:12, 6.05s/it] {'loss': 0.4625, 'learning_rate': 5.533753367549285e-06, 'epoch': 0.66} + 66%|██████▌ | 3798/5772 [2:48:38<3:19:12, 6.05s/it] {'loss': 0.4625, 'learning_rate': 5.533753367549285e-06, 'epoch': 0.66} + 66%|██████▌ | 3798/5772 [2:48:36<3:19:12, 6.05s/it] 66%|██████▌ | 3799/5772 [2:48:42<3:17:50, 6.02s/it] 66%|██████▌ | 3799/5772 [2:48:44<3:17:50, 6.02s/it] {'loss': 0.4612, 'learning_rate': 5.528732902448305e-06, 'epoch': 0.66} + 66%|██████▌ | 3799/5772 [2:48:44<3:17:50, 6.02s/it] {'loss': 0.4612, 'learning_rate': 5.528732902448305e-06, 'epoch': 0.66} + 66%|██████▌ | 3799/5772 [2:48:42<3:17:50, 6.02s/it]10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 66%|██████▌ | 3800/5772 [2:48:50<3:17:48, 6.02s/it]6 AutoResumeHook: Checking whether to suspend... + 66%|██████▌ | 3800/5772 [2:48:48<3:17:48, 6.02s/it]15 AutoResumeHook: Checking whether to suspend... +27 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + {'loss': 0.4672, 'learning_rate': 5.523713845548033e-06, 'epoch': 0.66} + 66%|██████▌ | 3800/5772 [2:48:50<3:17:48, 6.02s/it] {'loss': 0.4672, 'learning_rate': 5.523713845548033e-06, 'epoch': 0.66} + 66%|██████▌ | 3800/5772 [2:48:48<3:17:48, 6.02s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 66%|██████▌ | 3801/5772 [2:49:11<5:59:13, 10.94s/it] 66%|██████▌ | 3801/5772 [2:49:13<5:59:13, 10.94s/it] {'loss': 0.4616, 'learning_rate': 5.518696198429201e-06, 'epoch': 0.66} + 66%|██████▌ | 3801/5772 [2:49:13<5:59:13, 10.94s/it] {'loss': 0.4616, 'learning_rate': 5.518696198429201e-06, 'epoch': 0.66} + 66%|██████▌ | 3801/5772 [2:49:11<5:59:13, 10.94s/it] 66%|██████▌ | 3802/5772 [2:49:17<5:10:21, 9.45s/it] 66%|██████▌ | 3802/5772 [2:49:19<5:10:21, 9.45s/it] {'loss': 0.4722, 'learning_rate': 5.513679962672076e-06, 'epoch': 0.66} + 66%|██████▌ | 3802/5772 [2:49:19<5:10:21, 9.45s/it] {'loss': 0.4722, 'learning_rate': 5.513679962672076e-06, 'epoch': 0.66} + 66%|██████▌ | 3802/5772 [2:49:17<5:10:21, 9.45s/it] 66%|██████▌ | 3803/5772 [2:49:23<4:43:43, 8.65s/it] 66%|██████▌ | 3803/5772 [2:49:25<4:43:43, 8.65s/it] {'loss': 0.4661, 'learning_rate': 5.508665139856513e-06, 'epoch': 0.66} + 66%|██████▌ | 3803/5772 [2:49:23<4:43:43, 8.65s/it]{'loss': 0.4661, 'learning_rate': 5.508665139856513e-06, 'epoch': 0.66} + 66%|██████▌ | 3803/5772 [2:49:25<4:43:43, 8.65s/it] 66%|██████▌ | 3804/5772 [2:49:29<4:18:29, 7.88s/it] 66%|██████▌ | 3804/5772 [2:49:31<4:18:29, 7.88s/it] {'loss': 0.4585, 'learning_rate': 5.503651731561887e-06, 'epoch': 0.66} + 66%|██████▌ | 3804/5772 [2:49:31<4:18:29, 7.88s/it] {'loss': 0.4585, 'learning_rate': 5.503651731561887e-06, 'epoch': 0.66} + 66%|██████▌ | 3804/5772 [2:49:29<4:18:29, 7.88s/it] 66%|██████▌ | 3805/5772 [2:49:36<4:01:02, 7.35s/it] 66%|██████▌ | 3805/5772 [2:49:37<4:01:03, 7.35s/it] {'loss': 0.4713, 'learning_rate': 5.498639739367148e-06, 'epoch': 0.66} + {'loss': 0.4713, 'learning_rate': 5.498639739367148e-06, 'epoch': 0.66} 66%|██████▌ | 3805/5772 [2:49:37<4:01:03, 7.35s/it] + 66%|██████▌ | 3805/5772 [2:49:36<4:01:02, 7.35s/it] 66%|██████▌ | 3806/5772 [2:49:41<3:46:28, 6.91s/it] 66%|██████▌ | 3806/5772 [2:49:43<3:46:28, 6.91s/it] {'loss': 0.4669, 'learning_rate': 5.493629164850795e-06, 'epoch': 0.66} + 66%|██████▌ | 3806/5772 [2:49:43<3:46:28, 6.91s/it] {'loss': 0.4669, 'learning_rate': 5.493629164850795e-06, 'epoch': 0.66} + 66%|██████▌ | 3806/5772 [2:49:41<3:46:28, 6.91s/it] 66%|██████▌ | 3807/5772 [2:49:48<3:39:54, 6.71s/it] 66%|██████▌ | 3807/5772 [2:49:50<3:39:53, 6.71s/it] {'loss': 0.4646, 'learning_rate': 5.488620009590881e-06, 'epoch': 0.66} + 66%|██████▌ | 3807/5772 [2:49:50<3:39:53, 6.71s/it] {'loss': 0.4646, 'learning_rate': 5.488620009590881e-06, 'epoch': 0.66} + 66%|██████▌ | 3807/5772 [2:49:48<3:39:54, 6.71s/it] 66%|██████▌ | 3808/5772 [2:49:54<3:34:16, 6.55s/it] 66%|██████▌ | 3808/5772 [2:49:56<3:34:16, 6.55s/it] {'loss': 0.464, 'learning_rate': 5.483612275165018e-06, 'epoch': 0.66} + 66%|██████▌ | 3808/5772 [2:49:56<3:34:16, 6.55s/it] {'loss': 0.464, 'learning_rate': 5.483612275165018e-06, 'epoch': 0.66} + 66%|██████▌ | 3808/5772 [2:49:54<3:34:16, 6.55s/it] 66%|██████▌ | 3809/5772 [2:50:00<3:27:49, 6.35s/it] 66%|██████▌ | 3809/5772 [2:50:02<3:27:48, 6.35s/it] {'loss': 0.442, 'learning_rate': 5.478605963150348e-06, 'epoch': 0.66} + 66%|██████▌ | 3809/5772 [2:50:02<3:27:48, 6.35s/it] {'loss': 0.442, 'learning_rate': 5.478605963150348e-06, 'epoch': 0.66} + 66%|██████▌ | 3809/5772 [2:50:00<3:27:49, 6.35s/it] 66%|██████▌ | 3810/5772 [2:50:08<3:26:04, 6.30s/it] 66%|██████▌ | 3810/5772 [2:50:06<3:26:05, 6.30s/it] {'loss': 0.4687, 'learning_rate': 5.473601075123599e-06, 'epoch': 0.66} + 66%|██████▌ | 3810/5772 [2:50:08<3:26:04, 6.30s/it] {'loss': 0.4687, 'learning_rate': 5.473601075123599e-06, 'epoch': 0.66} + 66%|██████▌ | 3810/5772 [2:50:06<3:26:05, 6.30s/it] 66%|██████▌ | 3811/5772 [2:50:14<3:25:03, 6.27s/it] 66%|██████▌ | 3811/5772 [2:50:12<3:25:03, 6.27s/it] {'loss': 0.4591, 'learning_rate': 5.468597612661021e-06, 'epoch': 0.66} + 66%|██████▌ | 3811/5772 [2:50:14<3:25:03, 6.27s/it] {'loss': 0.4591, 'learning_rate': 5.468597612661021e-06, 'epoch': 0.66} + 66%|██████▌ | 3811/5772 [2:50:12<3:25:03, 6.27s/it] 66%|██████▌ | 3812/5772 [2:50:18<3:22:48, 6.21s/it] 66%|██████▌ | 3812/5772 [2:50:20<3:22:48, 6.21s/it] {'loss': 0.4737, 'learning_rate': 5.4635955773384295e-06, 'epoch': 0.66} + 66%|██████▌ | 3812/5772 [2:50:20<3:22:48, 6.21s/it] {'loss': 0.4737, 'learning_rate': 5.4635955773384295e-06, 'epoch': 0.66} + 66%|██████▌ | 3812/5772 [2:50:18<3:22:48, 6.21s/it] 66%|██████▌ | 3813/5772 [2:50:24<3:20:10, 6.13s/it] 66%|██████▌ | 3813/5772 [2:50:26<3:20:10, 6.13s/it] {'loss': 0.4548, 'learning_rate': 5.458594970731188e-06, 'epoch': 0.66} + 66%|██████▌ | 3813/5772 [2:50:26<3:20:10, 6.13s/it] {'loss': 0.4548, 'learning_rate': 5.458594970731188e-06, 'epoch': 0.66} + 66%|██████▌ | 3813/5772 [2:50:24<3:20:10, 6.13s/it] 66%|██████▌ | 3814/5772 [2:50:30<3:17:32, 6.05s/it] 66%|██████▌ | 3814/5772 [2:50:32<3:17:33, 6.05s/it] {'loss': 0.4619, 'learning_rate': 5.453595794414211e-06, 'epoch': 0.66} + 66%|██████▌ | 3814/5772 [2:50:32<3:17:33, 6.05s/it] {'loss': 0.4619, 'learning_rate': 5.453595794414211e-06, 'epoch': 0.66} + 66%|██████▌ | 3814/5772 [2:50:30<3:17:32, 6.05s/it] 66%|██████▌ | 3815/5772 [2:50:36<3:16:38, 6.03s/it] 66%|██████▌ | 3815/5772 [2:50:38<3:16:37, 6.03s/it] {'loss': 0.4589, 'learning_rate': 5.448598049961964e-06, 'epoch': 0.66} + 66%|██████▌ | 3815/5772 [2:50:38<3:16:37, 6.03s/it] {'loss': 0.4589, 'learning_rate': 5.448598049961964e-06, 'epoch': 0.66} + 66%|██████▌ | 3815/5772 [2:50:36<3:16:38, 6.03s/it] 66%|██████▌ | 3816/5772 [2:50:42<3:16:18, 6.02s/it] 66%|██████▌ | 3816/5772 [2:50:44<3:16:18, 6.02s/it] {'loss': 0.462, 'learning_rate': 5.443601738948452e-06, 'epoch': 0.66} + 66%|██████▌ | 3816/5772 [2:50:44<3:16:18, 6.02s/it] {'loss': 0.462, 'learning_rate': 5.443601738948452e-06, 'epoch': 0.66} + 66%|██████▌ | 3816/5772 [2:50:42<3:16:18, 6.02s/it] 66%|██████▌ | 3817/5772 [2:50:48<3:15:09, 5.99s/it] 66%|██████▌ | 3817/5772 [2:50:50<3:15:09, 5.99s/it] {'loss': 0.4483, 'learning_rate': 5.438606862947237e-06, 'epoch': 0.66} + 66%|██████▌ | 3817/5772 [2:50:50<3:15:09, 5.99s/it] {'loss': 0.4483, 'learning_rate': 5.438606862947237e-06, 'epoch': 0.66} + 66%|██████▌ | 3817/5772 [2:50:48<3:15:09, 5.99s/it] 66%|██████▌ | 3818/5772 [2:50:54<3:12:15, 5.90s/it] 66%|██████▌ | 3818/5772 [2:50:56<3:12:15, 5.90s/it] {'loss': 0.4479, 'learning_rate': 5.433613423531432e-06, 'epoch': 0.66} + 66%|██████▌ | 3818/5772 [2:50:56<3:12:15, 5.90s/it] {'loss': 0.4479, 'learning_rate': 5.433613423531432e-06, 'epoch': 0.66} + 66%|██████▌ | 3818/5772 [2:50:54<3:12:15, 5.90s/it] 66%|██████▌ | 3819/5772 [2:50:59<3:11:55, 5.90s/it] 66%|██████▌ | 3819/5772 [2:51:01<3:11:55, 5.90s/it] {'loss': 0.4714, 'learning_rate': 5.428621422273687e-06, 'epoch': 0.66} + 66%|██████▌ | 3819/5772 [2:51:01<3:11:55, 5.90s/it] {'loss': 0.4714, 'learning_rate': 5.428621422273687e-06, 'epoch': 0.66} + 66%|██████▌ | 3819/5772 [2:50:59<3:11:55, 5.90s/it] 66%|██████▌ | 3820/5772 [2:51:05<3:12:24, 5.91s/it] 66%|██████▌ | 3820/5772 [2:51:07<3:12:24, 5.91s/it] {'loss': 0.4632, 'learning_rate': 5.4236308607462095e-06, 'epoch': 0.66} + 66%|██████▌ | 3820/5772 [2:51:07<3:12:24, 5.91s/it] {'loss': 0.4632, 'learning_rate': 5.4236308607462095e-06, 'epoch': 0.66} + 66%|██████▌ | 3820/5772 [2:51:05<3:12:24, 5.91s/it] 66%|██████▌ | 3821/5772 [2:51:12<3:17:29, 6.07s/it] 66%|██████▌ | 3821/5772 [2:51:14<3:17:29, 6.07s/it] {'loss': 0.4571, 'learning_rate': 5.418641740520748e-06, 'epoch': 0.66} + 66%|██████▌ | 3821/5772 [2:51:14<3:17:29, 6.07s/it] {'loss': 0.4571, 'learning_rate': 5.418641740520748e-06, 'epoch': 0.66} + 66%|██████▌ | 3821/5772 [2:51:12<3:17:29, 6.07s/it] 66%|██████▌ | 3822/5772 [2:51:18<3:20:59, 6.18s/it] 66%|██████▌ | 3822/5772 [2:51:20<3:20:59, 6.18s/it] {'loss': 0.4652, 'learning_rate': 5.413654063168602e-06, 'epoch': 0.66} + 66%|██████▌ | 3822/5772 [2:51:20<3:20:59, 6.18s/it] {'loss': 0.4652, 'learning_rate': 5.413654063168602e-06, 'epoch': 0.66} + 66%|██████▌ | 3822/5772 [2:51:18<3:20:59, 6.18s/it] 66%|██████▌ | 3823/5772 [2:51:25<3:23:51, 6.28s/it] 66%|██████▌ | 3823/5772 [2:51:27<3:23:51, 6.28s/it] {'loss': 0.4642, 'learning_rate': 5.408667830260603e-06, 'epoch': 0.66} + 66%|██████▌ | 3823/5772 [2:51:27<3:23:51, 6.28s/it] {'loss': 0.4642, 'learning_rate': 5.408667830260603e-06, 'epoch': 0.66} + 66%|██████▌ | 3823/5772 [2:51:25<3:23:51, 6.28s/it] 66%|██████▋ | 3824/5772 [2:51:31<3:22:46, 6.25s/it] 66%|██████▋ | 3824/5772 [2:51:33<3:22:46, 6.25s/it] {'loss': 0.4725, 'learning_rate': 5.403683043367145e-06, 'epoch': 0.66} + 66%|██████▋ | 3824/5772 [2:51:33<3:22:46, 6.25s/it] {'loss': 0.4725, 'learning_rate': 5.403683043367145e-06, 'epoch': 0.66} + 66%|██████▋ | 3824/5772 [2:51:31<3:22:46, 6.25s/it] 66%|██████▋ | 3825/5772 [2:51:37<3:21:22, 6.21s/it] 66%|██████▋ | 3825/5772 [2:51:39<3:21:22, 6.21s/it] {'loss': 0.4654, 'learning_rate': 5.398699704058156e-06, 'epoch': 0.66} + 66%|██████▋ | 3825/5772 [2:51:39<3:21:22, 6.21s/it] {'loss': 0.4654, 'learning_rate': 5.398699704058156e-06, 'epoch': 0.66} + 66%|██████▋ | 3825/5772 [2:51:37<3:21:22, 6.21s/it] 66%|██████▋ | 3826/5772 [2:51:44<3:26:51, 6.38s/it] 66%|██████▋ | 3826/5772 [2:51:46<3:26:51, 6.38s/it] {'loss': 0.4514, 'learning_rate': 5.393717813903112e-06, 'epoch': 0.66} + 66%|██████▋ | 3826/5772 [2:51:46<3:26:51, 6.38s/it] {'loss': 0.4514, 'learning_rate': 5.393717813903112e-06, 'epoch': 0.66} + 66%|██████▋ | 3826/5772 [2:51:44<3:26:51, 6.38s/it] 66%|██████▋ | 3827/5772 [2:51:52<3:23:13, 6.27s/it] 66%|██████▋ | 3827/5772 [2:51:50<3:23:14, 6.27s/it] {'loss': 0.4707, 'learning_rate': 5.388737374471032e-06, 'epoch': 0.66} + 66%|██████▋ | 3827/5772 [2:51:52<3:23:13, 6.27s/it] {'loss': 0.4707, 'learning_rate': 5.388737374471032e-06, 'epoch': 0.66} + 66%|██████▋ | 3827/5772 [2:51:50<3:23:14, 6.27s/it] 66%|██████▋ | 3828/5772 [2:51:56<3:19:53, 6.17s/it] 66%|██████▋ | 3828/5772 [2:51:58<3:19:53, 6.17s/it] {'loss': 0.4746, 'learning_rate': 5.383758387330476e-06, 'epoch': 0.66} + 66%|██████▋ | 3828/5772 [2:51:58<3:19:53, 6.17s/it] {'loss': 0.4746, 'learning_rate': 5.383758387330476e-06, 'epoch': 0.66} + 66%|██████▋ | 3828/5772 [2:51:56<3:19:53, 6.17s/it] 66%|██████▋ | 3829/5772 [2:52:04<3:18:18, 6.12s/it] 66%|██████▋ | 3829/5772 [2:52:02<3:18:18, 6.12s/it] {'loss': 0.4654, 'learning_rate': 5.378780854049553e-06, 'epoch': 0.66} + 66%|██████▋ | 3829/5772 [2:52:04<3:18:18, 6.12s/it] {'loss': 0.4654, 'learning_rate': 5.378780854049553e-06, 'epoch': 0.66} + 66%|██████▋ | 3829/5772 [2:52:02<3:18:18, 6.12s/it] 66%|██████▋ | 3830/5772 [2:52:10<3:17:47, 6.11s/it] 66%|██████▋ | 3830/5772 [2:52:08<3:17:48, 6.11s/it] {'loss': 0.4837, 'learning_rate': 5.373804776195903e-06, 'epoch': 0.66} + 66%|██████▋ | 3830/5772 [2:52:10<3:17:47, 6.11s/it] {'loss': 0.4837, 'learning_rate': 5.373804776195903e-06, 'epoch': 0.66} + 66%|██████▋ | 3830/5772 [2:52:08<3:17:48, 6.11s/it] 66%|██████▋ | 3831/5772 [2:52:16<3:17:46, 6.11s/it] 66%|██████▋ | 3831/5772 [2:52:14<3:17:46, 6.11s/it] {'loss': 0.4546, 'learning_rate': 5.368830155336717e-06, 'epoch': 0.66} + 66%|██████▋ | 3831/5772 [2:52:16<3:17:46, 6.11s/it] {'loss': 0.4546, 'learning_rate': 5.368830155336717e-06, 'epoch': 0.66} + 66%|██████▋ | 3831/5772 [2:52:14<3:17:46, 6.11s/it] 66%|██████▋ | 3832/5772 [2:52:20<3:18:14, 6.13s/it] 66%|██████▋ | 3832/5772 [2:52:22<3:18:14, 6.13s/it] {'loss': 0.4698, 'learning_rate': 5.363856993038725e-06, 'epoch': 0.66} + 66%|██████▋ | 3832/5772 [2:52:22<3:18:14, 6.13s/it] {'loss': 0.4698, 'learning_rate': 5.363856993038725e-06, 'epoch': 0.66} + 66%|██████▋ | 3832/5772 [2:52:20<3:18:14, 6.13s/it] 66%|██████▋ | 3833/5772 [2:52:26<3:18:16, 6.14s/it] 66%|██████▋ | 3833/5772 [2:52:28<3:18:16, 6.14s/it] {'loss': 0.4645, 'learning_rate': 5.358885290868195e-06, 'epoch': 0.66} + 66%|██████▋ | 3833/5772 [2:52:28<3:18:16, 6.14s/it] {'loss': 0.4645, 'learning_rate': 5.358885290868195e-06, 'epoch': 0.66} + 66%|██████▋ | 3833/5772 [2:52:26<3:18:16, 6.14s/it] 66%|██████▋ | 3834/5772 [2:52:32<3:17:46, 6.12s/it] 66%|██████▋ | 3834/5772 [2:52:34<3:17:46, 6.12s/it] {'loss': 0.4748, 'learning_rate': 5.353915050390941e-06, 'epoch': 0.66} + 66%|██████▋ | 3834/5772 [2:52:34<3:17:46, 6.12s/it] {'loss': 0.4748, 'learning_rate': 5.353915050390941e-06, 'epoch': 0.66} + 66%|██████▋ | 3834/5772 [2:52:32<3:17:46, 6.12s/it] 66%|██████▋ | 3835/5772 [2:52:39<3:19:27, 6.18s/it] 66%|██████▋ | 3835/5772 [2:52:41<3:19:27, 6.18s/it] {'loss': 0.4708, 'learning_rate': 5.3489462731723045e-06, 'epoch': 0.66} + 66%|██████▋ | 3835/5772 [2:52:41<3:19:27, 6.18s/it] {'loss': 0.4708, 'learning_rate': 5.3489462731723045e-06, 'epoch': 0.66} + 66%|██████▋ | 3835/5772 [2:52:39<3:19:27, 6.18s/it] 66%|██████▋ | 3836/5772 [2:52:45<3:17:18, 6.12s/it] 66%|██████▋ | 3836/5772 [2:52:47<3:17:18, 6.12s/it] {'loss': 0.4718, 'learning_rate': 5.343978960777184e-06, 'epoch': 0.66} + 66%|██████▋ | 3836/5772 [2:52:47<3:17:18, 6.12s/it] {'loss': 0.4718, 'learning_rate': 5.343978960777184e-06, 'epoch': 0.66} + 66%|██████▋ | 3836/5772 [2:52:45<3:17:18, 6.12s/it] 66%|██████▋ | 3837/5772 [2:52:53<3:19:23, 6.18s/it] 66%|██████▋ | 3837/5772 [2:52:51<3:19:23, 6.18s/it] {'loss': 0.4653, 'learning_rate': 5.3390131147699995e-06, 'epoch': 0.66} + 66%|██████▋ | 3837/5772 [2:52:53<3:19:23, 6.18s/it] {'loss': 0.4653, 'learning_rate': 5.3390131147699995e-06, 'epoch': 0.66} + 66%|██████▋ | 3837/5772 [2:52:51<3:19:23, 6.18s/it] 66%|██████▋ | 3838/5772 [2:52:59<3:19:05, 6.18s/it] 66%|██████▋ | 3838/5772 [2:52:57<3:19:05, 6.18s/it] {'loss': 0.4753, 'learning_rate': 5.3340487367147195e-06, 'epoch': 0.66} + 66%|██████▋ | 3838/5772 [2:52:59<3:19:05, 6.18s/it] {'loss': 0.4753, 'learning_rate': 5.3340487367147195e-06, 'epoch': 0.66} + 66%|██████▋ | 3838/5772 [2:52:57<3:19:05, 6.18s/it] 67%|██████▋ | 3839/5772 [2:53:05<3:18:31, 6.16s/it] 67%|██████▋ | 3839/5772 [2:53:03<3:18:31, 6.16s/it] {'loss': 0.4544, 'learning_rate': 5.329085828174847e-06, 'epoch': 0.67} + 67%|██████▋ | 3839/5772 [2:53:05<3:18:31, 6.16s/it] {'loss': 0.4544, 'learning_rate': 5.329085828174847e-06, 'epoch': 0.67} + 67%|██████▋ | 3839/5772 [2:53:03<3:18:31, 6.16s/it] 67%|██████▋ | 3840/5772 [2:53:09<3:15:56, 6.09s/it] 67%|██████▋ | 3840/5772 [2:53:11<3:15:56, 6.09s/it] {'loss': 0.4679, 'learning_rate': 5.324124390713423e-06, 'epoch': 0.67} + 67%|██████▋ | 3840/5772 [2:53:11<3:15:56, 6.09s/it] {'loss': 0.4679, 'learning_rate': 5.324124390713423e-06, 'epoch': 0.67} + 67%|██████▋ | 3840/5772 [2:53:09<3:15:56, 6.09s/it] 67%|██████▋ | 3841/5772 [2:53:16<3:22:25, 6.29s/it] 67%|██████▋ | 3841/5772 [2:53:18<3:22:25, 6.29s/it] {'loss': 0.4557, 'learning_rate': 5.3191644258930275e-06, 'epoch': 0.67} + 67%|██████▋ | 3841/5772 [2:53:18<3:22:25, 6.29s/it] {'loss': 0.4557, 'learning_rate': 5.3191644258930275e-06, 'epoch': 0.67} + 67%|██████▋ | 3841/5772 [2:53:16<3:22:25, 6.29s/it] 67%|██████▋ | 3842/5772 [2:53:22<3:20:46, 6.24s/it] 67%|██████▋ | 3842/5772 [2:53:24<3:20:46, 6.24s/it] {'loss': 0.4634, 'learning_rate': 5.3142059352757625e-06, 'epoch': 0.67} + 67%|██████▋ | 3842/5772 [2:53:24<3:20:46, 6.24s/it] {'loss': 0.4634, 'learning_rate': 5.3142059352757625e-06, 'epoch': 0.67} + 67%|██████▋ | 3842/5772 [2:53:22<3:20:46, 6.24s/it] 67%|██████▋ | 3843/5772 [2:53:28<3:21:16, 6.26s/it] 67%|██████▋ | 3843/5772 [2:53:30<3:21:16, 6.26s/it] {'loss': 0.4606, 'learning_rate': 5.309248920423293e-06, 'epoch': 0.67} + 67%|██████▋ | 3843/5772 [2:53:30<3:21:16, 6.26s/it] {'loss': 0.4606, 'learning_rate': 5.309248920423293e-06, 'epoch': 0.67} + 67%|██████▋ | 3843/5772 [2:53:28<3:21:16, 6.26s/it] 67%|██████▋ | 3844/5772 [2:53:34<3:15:53, 6.10s/it] 67%|██████▋ | 3844/5772 [2:53:36<3:15:53, 6.10s/it] {'loss': 0.4601, 'learning_rate': 5.304293382896792e-06, 'epoch': 0.67} + 67%|██████▋ | 3844/5772 [2:53:36<3:15:53, 6.10s/it] {'loss': 0.4601, 'learning_rate': 5.304293382896792e-06, 'epoch': 0.67} + 67%|██████▋ | 3844/5772 [2:53:34<3:15:53, 6.10s/it] 67%|██████▋ | 3845/5772 [2:53:40<3:16:52, 6.13s/it] 67%|██████▋ | 3845/5772 [2:53:42<3:16:52, 6.13s/it] {'loss': 0.4662, 'learning_rate': 5.299339324256986e-06, 'epoch': 0.67} + 67%|██████▋ | 3845/5772 [2:53:42<3:16:52, 6.13s/it] {'loss': 0.4662, 'learning_rate': 5.299339324256986e-06, 'epoch': 0.67} + 67%|██████▋ | 3845/5772 [2:53:40<3:16:52, 6.13s/it] 67%|██████▋ | 3846/5772 [2:53:47<3:18:49, 6.19s/it] 67%|██████▋ | 3846/5772 [2:53:49<3:18:49, 6.19s/it] {'loss': 0.4661, 'learning_rate': 5.294386746064115e-06, 'epoch': 0.67} + 67%|██████▋ | 3846/5772 [2:53:49<3:18:49, 6.19s/it] {'loss': 0.4661, 'learning_rate': 5.294386746064115e-06, 'epoch': 0.67} + 67%|██████▋ | 3846/5772 [2:53:47<3:18:49, 6.19s/it] 67%|██████▋ | 3847/5772 [2:53:55<3:18:08, 6.18s/it] 67%|██████▋ | 3847/5772 [2:53:53<3:18:08, 6.18s/it] {'loss': 0.4621, 'learning_rate': 5.28943564987798e-06, 'epoch': 0.67} + 67%|██████▋ | 3847/5772 [2:53:55<3:18:08, 6.18s/it] {'loss': 0.4621, 'learning_rate': 5.28943564987798e-06, 'epoch': 0.67} + 67%|██████▋ | 3847/5772 [2:53:53<3:18:08, 6.18s/it] 67%|██████▋ | 3848/5772 [2:54:01<3:17:45, 6.17s/it] 67%|██████▋ | 3848/5772 [2:53:59<3:17:46, 6.17s/it] {'loss': 0.457, 'learning_rate': 5.2844860372578995e-06, 'epoch': 0.67} + 67%|██████▋ | 3848/5772 [2:54:01<3:17:45, 6.17s/it] {'loss': 0.457, 'learning_rate': 5.2844860372578995e-06, 'epoch': 0.67} + 67%|██████▋ | 3848/5772 [2:53:59<3:17:46, 6.17s/it] 67%|██████▋ | 3849/5772 [2:54:07<3:14:48, 6.08s/it] 67%|██████▋ | 3849/5772 [2:54:05<3:14:48, 6.08s/it] {'loss': 0.4558, 'learning_rate': 5.2795379097627195e-06, 'epoch': 0.67} + 67%|██████▋ | 3849/5772 [2:54:07<3:14:48, 6.08s/it] {'loss': 0.4558, 'learning_rate': 5.2795379097627195e-06, 'epoch': 0.67} + 67%|██████▋ | 3849/5772 [2:54:05<3:14:48, 6.08s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend...11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + +2 0AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend...5 + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...6 +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 67%|██████▋ | 3850/5772 [2:54:13<3:13:34, 6.04s/it] 67%|██████▋ | 3850/5772 [2:54:11<3:13:34, 6.04s/it]7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4629, 'learning_rate': 5.274591268950828e-06, 'epoch': 0.67} + 67%|██████▋ | 3850/5772 [2:54:13<3:13:34, 6.04s/it] {'loss': 0.4629, 'learning_rate': 5.274591268950828e-06, 'epoch': 0.67} + 67%|██████▋ | 3850/5772 [2:54:11<3:13:34, 6.04s/it] 67%|██████▋ | 3851/5772 [2:54:17<3:12:11, 6.00s/it] 67%|██████▋ | 3851/5772 [2:54:19<3:12:11, 6.00s/it] {'loss': 0.4588, 'learning_rate': 5.2696461163801445e-06, 'epoch': 0.67} + 67%|██████▋ | 3851/5772 [2:54:19<3:12:11, 6.00s/it] {'loss': 0.4588, 'learning_rate': 5.2696461163801445e-06, 'epoch': 0.67} + 67%|██████▋ | 3851/5772 [2:54:17<3:12:11, 6.00s/it] 67%|██████▋ | 3852/5772 [2:54:23<3:15:23, 6.11s/it] 67%|██████▋ | 3852/5772 [2:54:25<3:15:23, 6.11s/it] {'loss': 0.4654, 'learning_rate': 5.264702453608119e-06, 'epoch': 0.67} + 67%|██████▋ | 3852/5772 [2:54:25<3:15:23, 6.11s/it] {'loss': 0.4654, 'learning_rate': 5.264702453608119e-06, 'epoch': 0.67} + 67%|██████▋ | 3852/5772 [2:54:23<3:15:23, 6.11s/it] 67%|██████▋ | 3853/5772 [2:54:29<3:14:10, 6.07s/it] 67%|██████▋ | 3853/5772 [2:54:31<3:14:10, 6.07s/it] {'loss': 0.4633, 'learning_rate': 5.2597602821917206e-06, 'epoch': 0.67} + 67%|██████▋ | 3853/5772 [2:54:31<3:14:10, 6.07s/it] {'loss': 0.4633, 'learning_rate': 5.2597602821917206e-06, 'epoch': 0.67} + 67%|██████▋ | 3853/5772 [2:54:29<3:14:10, 6.07s/it] 67%|██████▋ | 3854/5772 [2:54:35<3:14:15, 6.08s/it] 67%|██████▋ | 3854/5772 [2:54:37<3:14:16, 6.08s/it] {'loss': 0.4682, 'learning_rate': 5.254819603687469e-06, 'epoch': 0.67} + 67%|██████▋ | 3854/5772 [2:54:37<3:14:16, 6.08s/it] {'loss': 0.4682, 'learning_rate': 5.254819603687469e-06, 'epoch': 0.67} + 67%|██████▋ | 3854/5772 [2:54:35<3:14:15, 6.08s/it] 67%|██████▋ | 3855/5772 [2:54:41<3:12:04, 6.01s/it] 67%|██████▋ | 3855/5772 [2:54:43<3:12:04, 6.01s/it] {'loss': 0.4468, 'learning_rate': 5.249880419651403e-06, 'epoch': 0.67} + 67%|██████▋ | 3855/5772 [2:54:43<3:12:04, 6.01s/it] {'loss': 0.4468, 'learning_rate': 5.249880419651403e-06, 'epoch': 0.67} + 67%|██████▋ | 3855/5772 [2:54:41<3:12:04, 6.01s/it] 67%|██████▋ | 3856/5772 [2:54:47<3:12:56, 6.04s/it] 67%|██████▋ | 3856/5772 [2:54:49<3:12:56, 6.04s/it] {'loss': 0.4686, 'learning_rate': 5.244942731639084e-06, 'epoch': 0.67} + 67%|██████▋ | 3856/5772 [2:54:49<3:12:56, 6.04s/it] {'loss': 0.4686, 'learning_rate': 5.244942731639084e-06, 'epoch': 0.67} + 67%|██████▋ | 3856/5772 [2:54:47<3:12:56, 6.04s/it] 67%|██████▋ | 3857/5772 [2:54:53<3:13:12, 6.05s/it] 67%|██████▋ | 3857/5772 [2:54:55<3:13:13, 6.05s/it] {'loss': 0.4523, 'learning_rate': 5.2400065412056136e-06, 'epoch': 0.67} + 67%|██████▋ | 3857/5772 [2:54:55<3:13:13, 6.05s/it] {'loss': 0.4523, 'learning_rate': 5.2400065412056136e-06, 'epoch': 0.67} + 67%|██████▋ | 3857/5772 [2:54:53<3:13:12, 6.05s/it] 67%|██████▋ | 3858/5772 [2:55:00<3:15:55, 6.14s/it] 67%|██████▋ | 3858/5772 [2:55:02<3:15:55, 6.14s/it] {'loss': 0.4693, 'learning_rate': 5.235071849905617e-06, 'epoch': 0.67} + 67%|██████▋ | 3858/5772 [2:55:02<3:15:55, 6.14s/it] {'loss': 0.4693, 'learning_rate': 5.235071849905617e-06, 'epoch': 0.67} + 67%|██████▋ | 3858/5772 [2:55:00<3:15:55, 6.14s/it] 67%|██████▋ | 3859/5772 [2:55:06<3:16:33, 6.16s/it] 67%|██████▋ | 3859/5772 [2:55:08<3:16:32, 6.16s/it] {'loss': 0.467, 'learning_rate': 5.230138659293254e-06, 'epoch': 0.67} + 67%|██████▋ | 3859/5772 [2:55:08<3:16:32, 6.16s/it] {'loss': 0.467, 'learning_rate': 5.230138659293254e-06, 'epoch': 0.67} + 67%|██████▋ | 3859/5772 [2:55:06<3:16:33, 6.16s/it] 67%|██████▋ | 3860/5772 [2:55:12<3:12:41, 6.05s/it] 67%|██████▋ | 3860/5772 [2:55:14<3:12:41, 6.05s/it] {'loss': 0.4571, 'learning_rate': 5.2252069709221945e-06, 'epoch': 0.67} + 67%|██████▋ | 3860/5772 [2:55:14<3:12:41, 6.05s/it] {'loss': 0.4571, 'learning_rate': 5.2252069709221945e-06, 'epoch': 0.67} + 67%|██████▋ | 3860/5772 [2:55:12<3:12:41, 6.05s/it] 67%|██████▋ | 3861/5772 [2:55:18<3:15:04, 6.12s/it] 67%|██████▋ | 3861/5772 [2:55:20<3:15:05, 6.13s/it] {'loss': 0.4575, 'learning_rate': 5.220276786345648e-06, 'epoch': 0.67} + 67%|██████▋ | 3861/5772 [2:55:20<3:15:05, 6.13s/it] {'loss': 0.4575, 'learning_rate': 5.220276786345648e-06, 'epoch': 0.67} + 67%|██████▋ | 3861/5772 [2:55:18<3:15:04, 6.12s/it] 67%|██████▋ | 3862/5772 [2:55:24<3:13:48, 6.09s/it] 67%|██████▋ | 3862/5772 [2:55:26<3:13:48, 6.09s/it] {'loss': 0.4704, 'learning_rate': 5.21534810711636e-06, 'epoch': 0.67} + 67%|██████▋ | 3862/5772 [2:55:26<3:13:48, 6.09s/it] {'loss': 0.4704, 'learning_rate': 5.21534810711636e-06, 'epoch': 0.67} + 67%|██████▋ | 3862/5772 [2:55:24<3:13:48, 6.09s/it] 67%|██████▋ | 3863/5772 [2:55:30<3:11:49, 6.03s/it] 67%|██████▋ | 3863/5772 [2:55:32<3:11:50, 6.03s/it] {'loss': 0.4462, 'learning_rate': 5.2104209347865786e-06, 'epoch': 0.67} + 67%|██████▋ | 3863/5772 [2:55:32<3:11:50, 6.03s/it] {'loss': 0.4462, 'learning_rate': 5.2104209347865786e-06, 'epoch': 0.67} + 67%|██████▋ | 3863/5772 [2:55:30<3:11:49, 6.03s/it] 67%|██████▋ | 3864/5772 [2:55:36<3:12:00, 6.04s/it] 67%|██████▋ | 3864/5772 [2:55:38<3:12:00, 6.04s/it] {'loss': 0.4624, 'learning_rate': 5.205495270908094e-06, 'epoch': 0.67} + 67%|██████▋ | 3864/5772 [2:55:38<3:12:00, 6.04s/it] {'loss': 0.4624, 'learning_rate': 5.205495270908094e-06, 'epoch': 0.67} + 67%|██████▋ | 3864/5772 [2:55:36<3:12:00, 6.04s/it] 67%|██████▋ | 3865/5772 [2:55:42<3:12:32, 6.06s/it] 67%|██████▋ | 3865/5772 [2:55:44<3:12:32, 6.06s/it] {'loss': 0.4513, 'learning_rate': 5.200571117032216e-06, 'epoch': 0.67} + 67%|██████▋ | 3865/5772 [2:55:44<3:12:32, 6.06s/it] {'loss': 0.4513, 'learning_rate': 5.200571117032216e-06, 'epoch': 0.67} + 67%|██████▋ | 3865/5772 [2:55:42<3:12:32, 6.06s/it] 67%|██████▋ | 3866/5772 [2:55:48<3:12:48, 6.07s/it] 67%|██████▋ | 3866/5772 [2:55:50<3:12:48, 6.07s/it] {'loss': 0.4586, 'learning_rate': 5.195648474709783e-06, 'epoch': 0.67} + 67%|██████▋ | 3866/5772 [2:55:50<3:12:48, 6.07s/it] {'loss': 0.4586, 'learning_rate': 5.195648474709783e-06, 'epoch': 0.67} + 67%|██████▋ | 3866/5772 [2:55:48<3:12:48, 6.07s/it] 67%|██████▋ | 3867/5772 [2:55:54<3:13:10, 6.08s/it] 67%|██████▋ | 3867/5772 [2:55:56<3:13:09, 6.08s/it] {'loss': 0.4619, 'learning_rate': 5.190727345491149e-06, 'epoch': 0.67} + 67%|██████▋ | 3867/5772 [2:55:56<3:13:09, 6.08s/it] {'loss': 0.4619, 'learning_rate': 5.190727345491149e-06, 'epoch': 0.67} + 67%|██████▋ | 3867/5772 [2:55:54<3:13:10, 6.08s/it] 67%|██████▋ | 3868/5772 [2:56:00<3:14:25, 6.13s/it] 67%|██████▋ | 3868/5772 [2:56:02<3:14:25, 6.13s/it] {'loss': 0.4741, 'learning_rate': 5.185807730926191e-06, 'epoch': 0.67} + 67%|██████▋ | 3868/5772 [2:56:02<3:14:25, 6.13s/it] {'loss': 0.4741, 'learning_rate': 5.185807730926191e-06, 'epoch': 0.67} + 67%|██████▋ | 3868/5772 [2:56:00<3:14:25, 6.13s/it] 67%|██████▋ | 3869/5772 [2:56:06<3:12:13, 6.06s/it] 67%|██████▋ | 3869/5772 [2:56:08<3:12:13, 6.06s/it] {'loss': 0.4711, 'learning_rate': 5.180889632564331e-06, 'epoch': 0.67} + 67%|██████▋ | 3869/5772 [2:56:08<3:12:13, 6.06s/it] {'loss': 0.4711, 'learning_rate': 5.180889632564331e-06, 'epoch': 0.67} + 67%|██████▋ | 3869/5772 [2:56:06<3:12:13, 6.06s/it] 67%|██████▋ | 3870/5772 [2:56:13<3:14:13, 6.13s/it] 67%|██████▋ | 3870/5772 [2:56:15<3:14:13, 6.13s/it] {'loss': 0.4641, 'learning_rate': 5.175973051954482e-06, 'epoch': 0.67} + 67%|██████▋ | 3870/5772 [2:56:15<3:14:13, 6.13s/it] {'loss': 0.4641, 'learning_rate': 5.175973051954482e-06, 'epoch': 0.67} + 67%|██████▋ | 3870/5772 [2:56:13<3:14:13, 6.13s/it] 67%|██████▋ | 3871/5772 [2:56:18<3:11:50, 6.06s/it] 67%|██████▋ | 3871/5772 [2:56:20<3:11:50, 6.06s/it] {'loss': 0.4523, 'learning_rate': 5.171057990645098e-06, 'epoch': 0.67} + 67%|██████▋ | 3871/5772 [2:56:20<3:11:50, 6.06s/it] {'loss': 0.4523, 'learning_rate': 5.171057990645098e-06, 'epoch': 0.67} + 67%|██████▋ | 3871/5772 [2:56:18<3:11:50, 6.06s/it] 67%|██████▋ | 3872/5772 [2:56:25<3:12:30, 6.08s/it] 67%|██████▋ | 3872/5772 [2:56:27<3:12:30, 6.08s/it] {'loss': 0.4696, 'learning_rate': 5.166144450184154e-06, 'epoch': 0.67} + 67%|██████▋ | 3872/5772 [2:56:27<3:12:30, 6.08s/it] {'loss': 0.4696, 'learning_rate': 5.166144450184154e-06, 'epoch': 0.67} + 67%|██████▋ | 3872/5772 [2:56:25<3:12:30, 6.08s/it] 67%|██████▋ | 3873/5772 [2:56:31<3:12:38, 6.09s/it] 67%|██████▋ | 3873/5772 [2:56:33<3:12:38, 6.09s/it] {'loss': 0.4567, 'learning_rate': 5.16123243211914e-06, 'epoch': 0.67} + 67%|██████▋ | 3873/5772 [2:56:33<3:12:38, 6.09s/it] {'loss': 0.4567, 'learning_rate': 5.16123243211914e-06, 'epoch': 0.67} + 67%|██████▋ | 3873/5772 [2:56:31<3:12:38, 6.09s/it] 67%|██████▋ | 3874/5772 [2:56:37<3:10:21, 6.02s/it] 67%|██████▋ | 3874/5772 [2:56:39<3:10:21, 6.02s/it] {'loss': 0.4642, 'learning_rate': 5.156321937997064e-06, 'epoch': 0.67} + 67%|██████▋ | 3874/5772 [2:56:39<3:10:21, 6.02s/it] {'loss': 0.4642, 'learning_rate': 5.156321937997064e-06, 'epoch': 0.67} + 67%|██████▋ | 3874/5772 [2:56:37<3:10:21, 6.02s/it] 67%|██████▋ | 3875/5772 [2:56:45<3:11:17, 6.05s/it] 67%|██████▋ | 3875/5772 [2:56:43<3:11:18, 6.05s/it] {'loss': 0.4562, 'learning_rate': 5.151412969364464e-06, 'epoch': 0.67} + 67%|██████▋ | 3875/5772 [2:56:45<3:11:17, 6.05s/it] {'loss': 0.4562, 'learning_rate': 5.151412969364464e-06, 'epoch': 0.67} + 67%|██████▋ | 3875/5772 [2:56:43<3:11:18, 6.05s/it] 67%|██████▋ | 3876/5772 [2:56:51<3:11:23, 6.06s/it] 67%|██████▋ | 3876/5772 [2:56:49<3:11:23, 6.06s/it] {'loss': 0.4813, 'learning_rate': 5.1465055277673915e-06, 'epoch': 0.67} + 67%|██████▋ | 3876/5772 [2:56:51<3:11:23, 6.06s/it] {'loss': 0.4813, 'learning_rate': 5.1465055277673915e-06, 'epoch': 0.67} + 67%|██████▋ | 3876/5772 [2:56:49<3:11:23, 6.06s/it] 67%|██████▋ | 3877/5772 [2:56:55<3:11:14, 6.06s/it] 67%|██████▋ | 3877/5772 [2:56:57<3:11:14, 6.06s/it] {'loss': 0.4642, 'learning_rate': 5.141599614751416e-06, 'epoch': 0.67} + 67%|██████▋ | 3877/5772 [2:56:57<3:11:14, 6.06s/it] {'loss': 0.4642, 'learning_rate': 5.141599614751416e-06, 'epoch': 0.67} + 67%|██████▋ | 3877/5772 [2:56:55<3:11:14, 6.06s/it] 67%|██████▋ | 3878/5772 [2:57:01<3:13:55, 6.14s/it] 67%|██████▋ | 3878/5772 [2:57:03<3:13:55, 6.14s/it] {'loss': 0.463, 'learning_rate': 5.136695231861633e-06, 'epoch': 0.67} + 67%|██████▋ | 3878/5772 [2:57:03<3:13:55, 6.14s/it] {'loss': 0.463, 'learning_rate': 5.136695231861633e-06, 'epoch': 0.67} + 67%|██████▋ | 3878/5772 [2:57:01<3:13:55, 6.14s/it] 67%|██████▋ | 3879/5772 [2:57:07<3:14:37, 6.17s/it] 67%|██████▋ | 3879/5772 [2:57:09<3:14:37, 6.17s/it] {'loss': 0.4429, 'learning_rate': 5.131792380642639e-06, 'epoch': 0.67} + 67%|██████▋ | 3879/5772 [2:57:09<3:14:37, 6.17s/it] {'loss': 0.4429, 'learning_rate': 5.131792380642639e-06, 'epoch': 0.67} + 67%|██████▋ | 3879/5772 [2:57:07<3:14:37, 6.17s/it] 67%|██████▋ | 3880/5772 [2:57:14<3:17:26, 6.26s/it] 67%|██████▋ | 3880/5772 [2:57:16<3:17:26, 6.26s/it] {'loss': 0.4724, 'learning_rate': 5.126891062638575e-06, 'epoch': 0.67} + 67%|██████▋ | 3880/5772 [2:57:16<3:17:26, 6.26s/it] {'loss': 0.4724, 'learning_rate': 5.126891062638575e-06, 'epoch': 0.67} + 67%|██████▋ | 3880/5772 [2:57:14<3:17:26, 6.26s/it] 67%|██████▋ | 3881/5772 [2:57:20<3:18:18, 6.29s/it] 67%|██████▋ | 3881/5772 [2:57:22<3:18:18, 6.29s/it] {'loss': 0.4663, 'learning_rate': 5.121991279393073e-06, 'epoch': 0.67} + 67%|██████▋ | 3881/5772 [2:57:22<3:18:18, 6.29s/it] {'loss': 0.4663, 'learning_rate': 5.121991279393073e-06, 'epoch': 0.67} + 67%|██████▋ | 3881/5772 [2:57:20<3:18:18, 6.29s/it] 67%|██████▋ | 3882/5772 [2:57:26<3:14:38, 6.18s/it] 67%|██████▋ | 3882/5772 [2:57:28<3:14:38, 6.18s/it] {'loss': 0.4603, 'learning_rate': 5.117093032449297e-06, 'epoch': 0.67} + 67%|██████▋ | 3882/5772 [2:57:28<3:14:38, 6.18s/it] {'loss': 0.4603, 'learning_rate': 5.117093032449297e-06, 'epoch': 0.67} + 67%|██████▋ | 3882/5772 [2:57:26<3:14:38, 6.18s/it] 67%|██████▋ | 3883/5772 [2:57:33<3:20:20, 6.36s/it] 67%|██████▋ | 3883/5772 [2:57:35<3:20:20, 6.36s/it] {'loss': 0.4549, 'learning_rate': 5.112196323349918e-06, 'epoch': 0.67} + 67%|██████▋ | 3883/5772 [2:57:35<3:20:20, 6.36s/it] {'loss': 0.4549, 'learning_rate': 5.112196323349918e-06, 'epoch': 0.67} + 67%|██████▋ | 3883/5772 [2:57:33<3:20:20, 6.36s/it] 67%|██████▋ | 3884/5772 [2:57:39<3:15:46, 6.22s/it] 67%|██████▋ | 3884/5772 [2:57:41<3:15:46, 6.22s/it] {'loss': 0.4709, 'learning_rate': 5.107301153637133e-06, 'epoch': 0.67} + 67%|██████▋ | 3884/5772 [2:57:41<3:15:46, 6.22s/it] {'loss': 0.4709, 'learning_rate': 5.107301153637133e-06, 'epoch': 0.67} + 67%|██████▋ | 3884/5772 [2:57:39<3:15:46, 6.22s/it] 67%|██████▋ | 3885/5772 [2:57:45<3:16:11, 6.24s/it] 67%|██████▋ | 3885/5772 [2:57:47<3:16:11, 6.24s/it] {'loss': 0.4503, 'learning_rate': 5.10240752485265e-06, 'epoch': 0.67} + 67%|██████▋ | 3885/5772 [2:57:47<3:16:11, 6.24s/it] {'loss': 0.4503, 'learning_rate': 5.10240752485265e-06, 'epoch': 0.67} + 67%|██████▋ | 3885/5772 [2:57:45<3:16:11, 6.24s/it] 67%|██████▋ | 3886/5772 [2:57:51<3:14:49, 6.20s/it] 67%|██████▋ | 3886/5772 [2:57:53<3:14:49, 6.20s/it] {'loss': 0.462, 'learning_rate': 5.097515438537678e-06, 'epoch': 0.67} + 67%|██████▋ | 3886/5772 [2:57:53<3:14:49, 6.20s/it] {'loss': 0.462, 'learning_rate': 5.097515438537678e-06, 'epoch': 0.67} + 67%|██████▋ | 3886/5772 [2:57:51<3:14:49, 6.20s/it] 67%|██████▋ | 3887/5772 [2:57:58<3:17:15, 6.28s/it] 67%|██████▋ | 3887/5772 [2:58:00<3:17:15, 6.28s/it] {'loss': 0.4536, 'learning_rate': 5.092624896232969e-06, 'epoch': 0.67} + 67%|██████▋ | 3887/5772 [2:58:00<3:17:15, 6.28s/it] {'loss': 0.4536, 'learning_rate': 5.092624896232969e-06, 'epoch': 0.67} + 67%|██████▋ | 3887/5772 [2:57:58<3:17:15, 6.28s/it] 67%|██████▋ | 3888/5772 [2:58:04<3:17:39, 6.30s/it] 67%|██████▋ | 3888/5772 [2:58:06<3:17:39, 6.30s/it] {'loss': 0.4736, 'learning_rate': 5.087735899478759e-06, 'epoch': 0.67} + 67%|██████▋ | 3888/5772 [2:58:06<3:17:39, 6.30s/it] {'loss': 0.4736, 'learning_rate': 5.087735899478759e-06, 'epoch': 0.67} + 67%|██████▋ | 3888/5772 [2:58:04<3:17:39, 6.30s/it] 67%|██████▋ | 3889/5772 [2:58:10<3:15:57, 6.24s/it] 67%|██████▋ | 3889/5772 [2:58:12<3:15:57, 6.24s/it] {'loss': 0.4562, 'learning_rate': 5.082848449814816e-06, 'epoch': 0.67} + 67%|██████▋ | 3889/5772 [2:58:12<3:15:57, 6.24s/it] {'loss': 0.4562, 'learning_rate': 5.082848449814816e-06, 'epoch': 0.67} + 67%|██████▋ | 3889/5772 [2:58:10<3:15:57, 6.24s/it] 67%|██████▋ | 3890/5772 [2:58:16<3:15:24, 6.23s/it] 67%|██████▋ | 3890/5772 [2:58:18<3:15:24, 6.23s/it] {'loss': 0.4619, 'learning_rate': 5.0779625487804125e-06, 'epoch': 0.67} + 67%|██████▋ | 3890/5772 [2:58:18<3:15:24, 6.23s/it] {'loss': 0.4619, 'learning_rate': 5.0779625487804125e-06, 'epoch': 0.67} + 67%|██████▋ | 3890/5772 [2:58:16<3:15:24, 6.23s/it] 67%|██████▋ | 3891/5772 [2:58:22<3:12:36, 6.14s/it] 67%|██████▋ | 3891/5772 [2:58:24<3:12:36, 6.14s/it] {'loss': 0.467, 'learning_rate': 5.073078197914341e-06, 'epoch': 0.67} + 67%|██████▋ | 3891/5772 [2:58:24<3:12:36, 6.14s/it] {'loss': 0.467, 'learning_rate': 5.073078197914341e-06, 'epoch': 0.67} + 67%|██████▋ | 3891/5772 [2:58:22<3:12:36, 6.14s/it] 67%|██████▋ | 3892/5772 [2:58:28<3:10:58, 6.09s/it] 67%|██████▋ | 3892/5772 [2:58:30<3:10:58, 6.10s/it] {'loss': 0.4736, 'learning_rate': 5.068195398754898e-06, 'epoch': 0.67} + 67%|██████▋ | 3892/5772 [2:58:30<3:10:58, 6.10s/it] {'loss': 0.4736, 'learning_rate': 5.068195398754898e-06, 'epoch': 0.67} + 67%|██████▋ | 3892/5772 [2:58:28<3:10:58, 6.09s/it] 67%|██████▋ | 3893/5772 [2:58:35<3:12:32, 6.15s/it] 67%|██████▋ | 3893/5772 [2:58:36<3:12:32, 6.15s/it] {'loss': 0.4517, 'learning_rate': 5.063314152839891e-06, 'epoch': 0.67} + 67%|██████▋ | 3893/5772 [2:58:36<3:12:32, 6.15s/it] {'loss': 0.4517, 'learning_rate': 5.063314152839891e-06, 'epoch': 0.67} + 67%|██████▋ | 3893/5772 [2:58:35<3:12:32, 6.15s/it] 67%|██████▋ | 3894/5772 [2:58:40<3:08:02, 6.01s/it] 67%|██████▋ | 3894/5772 [2:58:42<3:08:02, 6.01s/it] {'loss': 0.4644, 'learning_rate': 5.058434461706642e-06, 'epoch': 0.67} + 67%|██████▋ | 3894/5772 [2:58:42<3:08:02, 6.01s/it] {'loss': 0.4644, 'learning_rate': 5.058434461706642e-06, 'epoch': 0.67} + 67%|██████▋ | 3894/5772 [2:58:40<3:08:02, 6.01s/it] 67%|██████▋ | 3895/5772 [2:58:47<3:11:01, 6.11s/it] 67%|██████▋ | 3895/5772 [2:58:48<3:11:01, 6.11s/it] {'loss': 0.4685, 'learning_rate': 5.053556326891986e-06, 'epoch': 0.67} + 67%|██████▋ | 3895/5772 [2:58:48<3:11:01, 6.11s/it] {'loss': 0.4685, 'learning_rate': 5.053556326891986e-06, 'epoch': 0.67} + 67%|██████▋ | 3895/5772 [2:58:47<3:11:01, 6.11s/it] 67%|██████▋ | 3896/5772 [2:58:53<3:12:03, 6.14s/it] 67%|██████▋ | 3896/5772 [2:58:55<3:12:03, 6.14s/it] {'loss': 0.4656, 'learning_rate': 5.048679749932261e-06, 'epoch': 0.67} + 67%|██████▋ | 3896/5772 [2:58:55<3:12:03, 6.14s/it] {'loss': 0.4656, 'learning_rate': 5.048679749932261e-06, 'epoch': 0.67} + 67%|██████▋ | 3896/5772 [2:58:53<3:12:03, 6.14s/it] 68%|██████▊ | 3897/5772 [2:58:59<3:10:08, 6.08s/it] 68%|██████▊ | 3897/5772 [2:59:01<3:10:09, 6.08s/it] {'loss': 0.4573, 'learning_rate': 5.043804732363321e-06, 'epoch': 0.68} + 68%|██████▊ | 3897/5772 [2:59:01<3:10:09, 6.08s/it] {'loss': 0.4573, 'learning_rate': 5.043804732363321e-06, 'epoch': 0.68} + 68%|██████▊ | 3897/5772 [2:58:59<3:10:08, 6.08s/it] 68%|██████▊ | 3898/5772 [2:59:05<3:10:03, 6.09s/it] 68%|██████▊ | 3898/5772 [2:59:07<3:10:03, 6.08s/it] {'loss': 0.4655, 'learning_rate': 5.038931275720522e-06, 'epoch': 0.68} + 68%|██████▊ | 3898/5772 [2:59:07<3:10:03, 6.08s/it] {'loss': 0.4655, 'learning_rate': 5.038931275720522e-06, 'epoch': 0.68} + 68%|██████▊ | 3898/5772 [2:59:05<3:10:03, 6.09s/it] 68%|██████▊ | 3899/5772 [2:59:11<3:09:41, 6.08s/it] 68%|██████▊ | 3899/5772 [2:59:13<3:09:40, 6.08s/it] {'loss': 0.4577, 'learning_rate': 5.03405938153874e-06, 'epoch': 0.68} + 68%|██████▊ | 3899/5772 [2:59:13<3:09:40, 6.08s/it] {'loss': 0.4577, 'learning_rate': 5.03405938153874e-06, 'epoch': 0.68} + 68%|██████▊ | 3899/5772 [2:59:11<3:09:41, 6.08s/it]3 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +1115 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 68%|██████▊ | 3900/5772 [2:59:17<3:08:59, 6.06s/it]6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 68%|██████▊ | 3900/5772 [2:59:19<3:08:59, 6.06s/it]2 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4766, 'learning_rate': 5.029189051352339e-06, 'epoch': 0.68} + 68%|██████▊ | 3900/5772 [2:59:19<3:08:59, 6.06s/it] {'loss': 0.4766, 'learning_rate': 5.029189051352339e-06, 'epoch': 0.68} + 68%|██████▊ | 3900/5772 [2:59:17<3:08:59, 6.06s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-3900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 68%|██████▊ | 3901/5772 [2:59:44<6:24:30, 12.33s/it] 68%|██████▊ | 3901/5772 [2:59:46<6:24:30, 12.33s/it] {'loss': 0.4757, 'learning_rate': 5.02432028669521e-06, 'epoch': 0.68} + 68%|██████▊ | 3901/5772 [2:59:46<6:24:30, 12.33s/it] {'loss': 0.4757, 'learning_rate': 5.02432028669521e-06, 'epoch': 0.68} + 68%|██████▊ | 3901/5772 [2:59:44<6:24:30, 12.33s/it] 68%|██████▊ | 3902/5772 [2:59:50<5:26:08, 10.46s/it] 68%|██████▊ | 3902/5772 [2:59:52<5:26:08, 10.46s/it] {'loss': 0.4582, 'learning_rate': 5.0194530891007405e-06, 'epoch': 0.68} + 68%|██████▊ | 3902/5772 [2:59:52<5:26:08, 10.46s/it] {'loss': 0.4582, 'learning_rate': 5.0194530891007405e-06, 'epoch': 0.68} + 68%|██████▊ | 3902/5772 [2:59:50<5:26:08, 10.46s/it] 68%|██████▊ | 3903/5772 [2:59:56<4:44:20, 9.13s/it] 68%|██████▊ | 3903/5772 [2:59:58<4:44:20, 9.13s/it] {'loss': 0.4724, 'learning_rate': 5.01458746010183e-06, 'epoch': 0.68} + 68%|██████▊ | 3903/5772 [2:59:58<4:44:20, 9.13s/it] {'loss': 0.4724, 'learning_rate': 5.01458746010183e-06, 'epoch': 0.68} + 68%|██████▊ | 3903/5772 [2:59:56<4:44:20, 9.13s/it] 68%|██████▊ | 3904/5772 [3:00:02<4:14:36, 8.18s/it] 68%|██████▊ | 3904/5772 [3:00:04<4:14:36, 8.18s/it] {'loss': 0.4504, 'learning_rate': 5.0097234012308836e-06, 'epoch': 0.68} + 68%|██████▊ | 3904/5772 [3:00:04<4:14:36, 8.18s/it] {'loss': 0.4504, 'learning_rate': 5.0097234012308836e-06, 'epoch': 0.68} + 68%|██████▊ | 3904/5772 [3:00:02<4:14:36, 8.18s/it] 68%|██████▊ | 3905/5772 [3:00:08<3:51:41, 7.45s/it] 68%|██████▊ | 3905/5772 [3:00:10<3:51:41, 7.45s/it] {'loss': 0.4703, 'learning_rate': 5.004860914019798e-06, 'epoch': 0.68} + 68%|██████▊ | 3905/5772 [3:00:10<3:51:41, 7.45s/it] {'loss': 0.4703, 'learning_rate': 5.004860914019798e-06, 'epoch': 0.68} + 68%|██████▊ | 3905/5772 [3:00:08<3:51:41, 7.45s/it] 68%|██████▊ | 3906/5772 [3:00:14<3:37:07, 6.98s/it] 68%|██████▊ | 3906/5772 [3:00:16<3:37:07, 6.98s/it] {'loss': 0.4569, 'learning_rate': 5.000000000000003e-06, 'epoch': 0.68} + 68%|██████▊ | 3906/5772 [3:00:16<3:37:07, 6.98s/it] {'loss': 0.4569, 'learning_rate': 5.000000000000003e-06, 'epoch': 0.68} + 68%|██████▊ | 3906/5772 [3:00:14<3:37:07, 6.98s/it] 68%|██████▊ | 3907/5772 [3:00:20<3:29:56, 6.75s/it] 68%|██████▊ | 3907/5772 [3:00:22<3:29:56, 6.75s/it] {'loss': 0.4723, 'learning_rate': 4.9951406607024024e-06, 'epoch': 0.68} + 68%|██████▊ | 3907/5772 [3:00:22<3:29:56, 6.75s/it] {'loss': 0.4723, 'learning_rate': 4.9951406607024024e-06, 'epoch': 0.68} + 68%|██████▊ | 3907/5772 [3:00:20<3:29:56, 6.75s/it] 68%|██████▊ | 3908/5772 [3:00:26<3:21:53, 6.50s/it] 68%|██████▊ | 3908/5772 [3:00:28<3:21:53, 6.50s/it] {'loss': 0.4625, 'learning_rate': 4.990282897657425e-06, 'epoch': 0.68} + 68%|██████▊ | 3908/5772 [3:00:28<3:21:53, 6.50s/it] {'loss': 0.4625, 'learning_rate': 4.990282897657425e-06, 'epoch': 0.68} + 68%|██████▊ | 3908/5772 [3:00:26<3:21:53, 6.50s/it] 68%|██████▊ | 3909/5772 [3:00:34<3:20:08, 6.45s/it] 68%|██████▊ | 3909/5772 [3:00:32<3:20:09, 6.45s/it] {'loss': 0.4656, 'learning_rate': 4.985426712394994e-06, 'epoch': 0.68} + 68%|██████▊ | 3909/5772 [3:00:34<3:20:08, 6.45s/it] {'loss': 0.4656, 'learning_rate': 4.985426712394994e-06, 'epoch': 0.68} + 68%|██████▊ | 3909/5772 [3:00:32<3:20:09, 6.45s/it] 68%|██████▊ | 3910/5772 [3:00:40<3:13:26, 6.23s/it] 68%|██████▊ | 3910/5772 [3:00:38<3:13:27, 6.23s/it] {'loss': 0.4499, 'learning_rate': 4.980572106444539e-06, 'epoch': 0.68} + 68%|██████▊ | 3910/5772 [3:00:40<3:13:26, 6.23s/it] {'loss': 0.4499, 'learning_rate': 4.980572106444539e-06, 'epoch': 0.68} + 68%|██████▊ | 3910/5772 [3:00:38<3:13:27, 6.23s/it] 68%|██████▊ | 3911/5772 [3:00:44<3:14:59, 6.29s/it] 68%|██████▊ | 3911/5772 [3:00:46<3:15:00, 6.29s/it] {'loss': 0.462, 'learning_rate': 4.9757190813349945e-06, 'epoch': 0.68} + 68%|██████▊ | 3911/5772 [3:00:46<3:15:00, 6.29s/it] {'loss': 0.462, 'learning_rate': 4.9757190813349945e-06, 'epoch': 0.68} + 68%|██████▊ | 3911/5772 [3:00:44<3:14:59, 6.29s/it] 68%|██████▊ | 3912/5772 [3:00:51<3:16:10, 6.33s/it] 68%|██████▊ | 3912/5772 [3:00:53<3:16:10, 6.33s/it] {'loss': 0.452, 'learning_rate': 4.970867638594783e-06, 'epoch': 0.68} + 68%|██████▊ | 3912/5772 [3:00:53<3:16:10, 6.33s/it] {'loss': 0.452, 'learning_rate': 4.970867638594783e-06, 'epoch': 0.68} + 68%|██████▊ | 3912/5772 [3:00:51<3:16:10, 6.33s/it] 68%|██████▊ | 3913/5772 [3:00:57<3:14:46, 6.29s/it] 68%|██████▊ | 3913/5772 [3:00:59<3:14:46, 6.29s/it] {'loss': 0.4632, 'learning_rate': 4.966017779751854e-06, 'epoch': 0.68} + 68%|██████▊ | 3913/5772 [3:00:59<3:14:46, 6.29s/it] {'loss': 0.4632, 'learning_rate': 4.966017779751854e-06, 'epoch': 0.68} + 68%|██████▊ | 3913/5772 [3:00:57<3:14:46, 6.29s/it] 68%|██████▊ | 3914/5772 [3:01:03<3:12:54, 6.23s/it] 68%|██████▊ | 3914/5772 [3:01:05<3:12:54, 6.23s/it] {'loss': 0.4513, 'learning_rate': 4.961169506333632e-06, 'epoch': 0.68} + 68%|██████▊ | 3914/5772 [3:01:05<3:12:54, 6.23s/it] {'loss': 0.4513, 'learning_rate': 4.961169506333632e-06, 'epoch': 0.68} + 68%|██████▊ | 3914/5772 [3:01:03<3:12:54, 6.23s/it] 68%|██████▊ | 3915/5772 [3:01:11<3:12:11, 6.21s/it] 68%|██████▊ | 3915/5772 [3:01:09<3:12:11, 6.21s/it] {'loss': 0.4621, 'learning_rate': 4.956322819867059e-06, 'epoch': 0.68} + 68%|██████▊ | 3915/5772 [3:01:11<3:12:11, 6.21s/it] {'loss': 0.4621, 'learning_rate': 4.956322819867059e-06, 'epoch': 0.68} + 68%|██████▊ | 3915/5772 [3:01:09<3:12:11, 6.21s/it] 68%|██████▊ | 3916/5772 [3:01:15<3:13:11, 6.25s/it] 68%|██████▊ | 3916/5772 [3:01:17<3:13:11, 6.25s/it] {'loss': 0.4767, 'learning_rate': 4.9514777218785704e-06, 'epoch': 0.68} + 68%|██████▊ | 3916/5772 [3:01:17<3:13:11, 6.25s/it] {'loss': 0.4767, 'learning_rate': 4.9514777218785704e-06, 'epoch': 0.68} + 68%|██████▊ | 3916/5772 [3:01:15<3:13:11, 6.25s/it] 68%|██████▊ | 3917/5772 [3:01:24<3:18:42, 6.43s/it] 68%|██████▊ | 3917/5772 [3:01:22<3:18:42, 6.43s/it] {'loss': 0.4647, 'learning_rate': 4.946634213894104e-06, 'epoch': 0.68} + 68%|██████▊ | 3917/5772 [3:01:24<3:18:42, 6.43s/it] {'loss': 0.4647, 'learning_rate': 4.946634213894104e-06, 'epoch': 0.68} + 68%|██████▊ | 3917/5772 [3:01:22<3:18:42, 6.43s/it] 68%|██████▊ | 3918/5772 [3:01:28<3:13:28, 6.26s/it] 68%|██████▊ | 3918/5772 [3:01:30<3:13:29, 6.26s/it] {'loss': 0.4511, 'learning_rate': 4.941792297439098e-06, 'epoch': 0.68} + 68%|██████▊ | 3918/5772 [3:01:30<3:13:29, 6.26s/it] {'loss': 0.4511, 'learning_rate': 4.941792297439098e-06, 'epoch': 0.68} + 68%|██████▊ | 3918/5772 [3:01:28<3:13:28, 6.26s/it] 68%|██████▊ | 3919/5772 [3:01:35<3:17:31, 6.40s/it] 68%|██████▊ | 3919/5772 [3:01:37<3:17:31, 6.40s/it] {'loss': 0.4713, 'learning_rate': 4.936951974038481e-06, 'epoch': 0.68} + 68%|██████▊ | 3919/5772 [3:01:37<3:17:31, 6.40s/it] {'loss': 0.4713, 'learning_rate': 4.936951974038481e-06, 'epoch': 0.68} + 68%|██████▊ | 3919/5772 [3:01:35<3:17:31, 6.40s/it] 68%|██████▊ | 3920/5772 [3:01:41<3:12:20, 6.23s/it] 68%|██████▊ | 3920/5772 [3:01:43<3:12:20, 6.23s/it] {'loss': 0.4471, 'learning_rate': 4.932113245216689e-06, 'epoch': 0.68} + 68%|██████▊ | 3920/5772 [3:01:43<3:12:20, 6.23s/it] {'loss': 0.4471, 'learning_rate': 4.932113245216689e-06, 'epoch': 0.68} + 68%|██████▊ | 3920/5772 [3:01:41<3:12:20, 6.23s/it] 68%|██████▊ | 3921/5772 [3:01:47<3:12:33, 6.24s/it] 68%|██████▊ | 3921/5772 [3:01:49<3:12:33, 6.24s/it] {'loss': 0.4602, 'learning_rate': 4.927276112497652e-06, 'epoch': 0.68} + 68%|██████▊ | 3921/5772 [3:01:47<3:12:33, 6.24s/it]{'loss': 0.4602, 'learning_rate': 4.927276112497652e-06, 'epoch': 0.68} + 68%|██████▊ | 3921/5772 [3:01:49<3:12:33, 6.24s/it] 68%|██████▊ | 3922/5772 [3:01:53<3:13:24, 6.27s/it] 68%|██████▊ | 3922/5772 [3:01:55<3:13:24, 6.27s/it] {'loss': 0.4603, 'learning_rate': 4.922440577404804e-06, 'epoch': 0.68} + 68%|██████▊ | 3922/5772 [3:01:55<3:13:24, 6.27s/it] {'loss': 0.4603, 'learning_rate': 4.922440577404804e-06, 'epoch': 0.68} + 68%|██████▊ | 3922/5772 [3:01:53<3:13:24, 6.27s/it] 68%|██████▊ | 3923/5772 [3:02:01<3:11:42, 6.22s/it] 68%|██████▊ | 3923/5772 [3:01:59<3:11:43, 6.22s/it] {'loss': 0.4642, 'learning_rate': 4.917606641461056e-06, 'epoch': 0.68} + 68%|██████▊ | 3923/5772 [3:02:01<3:11:42, 6.22s/it] {'loss': 0.4642, 'learning_rate': 4.917606641461056e-06, 'epoch': 0.68} + 68%|██████▊ | 3923/5772 [3:01:59<3:11:43, 6.22s/it] 68%|██████▊ | 3924/5772 [3:02:05<3:09:30, 6.15s/it] 68%|██████▊ | 3924/5772 [3:02:07<3:09:31, 6.15s/it] {'loss': 0.4735, 'learning_rate': 4.912774306188842e-06, 'epoch': 0.68} + 68%|██████▊ | 3924/5772 [3:02:07<3:09:31, 6.15s/it] {'loss': 0.4735, 'learning_rate': 4.912774306188842e-06, 'epoch': 0.68} + 68%|██████▊ | 3924/5772 [3:02:05<3:09:30, 6.15s/it] 68%|██████▊ | 3925/5772 [3:02:12<3:13:30, 6.29s/it] 68%|██████▊ | 3925/5772 [3:02:14<3:13:30, 6.29s/it] {'loss': 0.4675, 'learning_rate': 4.90794357311008e-06, 'epoch': 0.68} + 68%|██████▊ | 3925/5772 [3:02:14<3:13:30, 6.29s/it] {'loss': 0.4675, 'learning_rate': 4.90794357311008e-06, 'epoch': 0.68} + 68%|██████▊ | 3925/5772 [3:02:12<3:13:30, 6.29s/it] 68%|██████▊ | 3926/5772 [3:02:18<3:09:58, 6.17s/it] 68%|██████▊ | 3926/5772 [3:02:20<3:09:58, 6.17s/it] {'loss': 0.4655, 'learning_rate': 4.903114443746173e-06, 'epoch': 0.68} + 68%|██████▊ | 3926/5772 [3:02:20<3:09:58, 6.17s/it] {'loss': 0.4655, 'learning_rate': 4.903114443746173e-06, 'epoch': 0.68} + 68%|██████▊ | 3926/5772 [3:02:18<3:09:58, 6.17s/it] 68%|██████▊ | 3927/5772 [3:02:24<3:07:59, 6.11s/it] 68%|██████▊ | 3927/5772 [3:02:26<3:08:00, 6.11s/it] {'loss': 0.4552, 'learning_rate': 4.898286919618034e-06, 'epoch': 0.68} + 68%|██████▊ | 3927/5772 [3:02:26<3:08:00, 6.11s/it] {'loss': 0.4552, 'learning_rate': 4.898286919618034e-06, 'epoch': 0.68} + 68%|██████▊ | 3927/5772 [3:02:24<3:07:59, 6.11s/it] 68%|██████▊ | 3928/5772 [3:02:30<3:05:38, 6.04s/it] 68%|██████▊ | 3928/5772 [3:02:32<3:05:39, 6.04s/it] {'loss': 0.4642, 'learning_rate': 4.8934610022460635e-06, 'epoch': 0.68} + 68%|██████▊ | 3928/5772 [3:02:32<3:05:39, 6.04s/it] {'loss': 0.4642, 'learning_rate': 4.8934610022460635e-06, 'epoch': 0.68} + 68%|██████▊ | 3928/5772 [3:02:30<3:05:38, 6.04s/it] 68%|██████▊ | 3929/5772 [3:02:35<3:02:37, 5.95s/it] 68%|██████▊ | 3929/5772 [3:02:37<3:02:37, 5.95s/it] {'loss': 0.4609, 'learning_rate': 4.888636693150161e-06, 'epoch': 0.68} + 68%|██████▊ | 3929/5772 [3:02:37<3:02:37, 5.95s/it] {'loss': 0.4609, 'learning_rate': 4.888636693150161e-06, 'epoch': 0.68} + 68%|██████▊ | 3929/5772 [3:02:35<3:02:37, 5.95s/it] 68%|██████▊ | 3930/5772 [3:02:43<3:02:00, 5.93s/it] 68%|██████▊ | 3930/5772 [3:02:41<3:02:01, 5.93s/it] {'loss': 0.4498, 'learning_rate': 4.883813993849706e-06, 'epoch': 0.68} + 68%|██████▊ | 3930/5772 [3:02:43<3:02:00, 5.93s/it] {'loss': 0.4498, 'learning_rate': 4.883813993849706e-06, 'epoch': 0.68} + 68%|██████▊ | 3930/5772 [3:02:41<3:02:01, 5.93s/it] 68%|██████▊ | 3931/5772 [3:02:47<3:04:14, 6.00s/it] 68%|██████▊ | 3931/5772 [3:02:49<3:04:14, 6.00s/it] {'loss': 0.4568, 'learning_rate': 4.878992905863591e-06, 'epoch': 0.68} + 68%|██████▊ | 3931/5772 [3:02:49<3:04:14, 6.00s/it] {'loss': 0.4568, 'learning_rate': 4.878992905863591e-06, 'epoch': 0.68} + 68%|██████▊ | 3931/5772 [3:02:48<3:04:14, 6.00s/it] 68%|██████▊ | 3932/5772 [3:02:56<3:09:14, 6.17s/it] 68%|██████▊ | 3932/5772 [3:02:54<3:09:15, 6.17s/it] {'loss': 0.4625, 'learning_rate': 4.874173430710192e-06, 'epoch': 0.68} + 68%|██████▊ | 3932/5772 [3:02:56<3:09:14, 6.17s/it] {'loss': 0.4625, 'learning_rate': 4.874173430710192e-06, 'epoch': 0.68} + 68%|██████▊ | 3932/5772 [3:02:54<3:09:15, 6.17s/it] 68%|██████▊ | 3933/5772 [3:03:02<3:07:25, 6.12s/it] 68%|██████▊ | 3933/5772 [3:03:00<3:07:26, 6.12s/it] {'loss': 0.4587, 'learning_rate': 4.869355569907367e-06, 'epoch': 0.68} + 68%|██████▊ | 3933/5772 [3:03:02<3:07:25, 6.12s/it] {'loss': 0.4587, 'learning_rate': 4.869355569907367e-06, 'epoch': 0.68} + 68%|██████▊ | 3933/5772 [3:03:00<3:07:26, 6.12s/it] 68%|██████▊ | 3934/5772 [3:03:06<3:10:33, 6.22s/it] 68%|██████▊ | 3934/5772 [3:03:08<3:10:33, 6.22s/it] {'loss': 0.47, 'learning_rate': 4.864539324972478e-06, 'epoch': 0.68} + 68%|██████▊ | 3934/5772 [3:03:08<3:10:33, 6.22s/it] {'loss': 0.47, 'learning_rate': 4.864539324972478e-06, 'epoch': 0.68} + 68%|██████▊ | 3934/5772 [3:03:06<3:10:33, 6.22s/it] 68%|██████▊ | 3935/5772 [3:03:15<3:31:33, 6.91s/it] 68%|██████▊ | 3935/5772 [3:03:17<3:31:33, 6.91s/it] {'loss': 0.4627, 'learning_rate': 4.859724697422377e-06, 'epoch': 0.68} + 68%|██████▊ | 3935/5772 [3:03:17<3:31:33, 6.91s/it] {'loss': 0.4627, 'learning_rate': 4.859724697422377e-06, 'epoch': 0.68} + 68%|██████▊ | 3935/5772 [3:03:15<3:31:33, 6.91s/it] 68%|██████▊ | 3936/5772 [3:03:22<3:28:56, 6.83s/it] 68%|██████▊ | 3936/5772 [3:03:24<3:28:56, 6.83s/it] {'loss': 0.4528, 'learning_rate': 4.8549116887734045e-06, 'epoch': 0.68} + 68%|██████▊ | 3936/5772 [3:03:24<3:28:56, 6.83s/it] {'loss': 0.4528, 'learning_rate': 4.8549116887734045e-06, 'epoch': 0.68} + 68%|██████▊ | 3936/5772 [3:03:22<3:28:56, 6.83s/it] 68%|██████▊ | 3937/5772 [3:03:28<3:22:48, 6.63s/it] 68%|██████▊ | 3937/5772 [3:03:30<3:22:48, 6.63s/it] {'loss': 0.4505, 'learning_rate': 4.850100300541386e-06, 'epoch': 0.68} + 68%|██████▊ | 3937/5772 [3:03:30<3:22:48, 6.63s/it] {'loss': 0.4505, 'learning_rate': 4.850100300541386e-06, 'epoch': 0.68} + 68%|██████▊ | 3937/5772 [3:03:28<3:22:48, 6.63s/it] 68%|██████▊ | 3938/5772 [3:03:36<3:19:16, 6.52s/it] 68%|██████▊ | 3938/5772 [3:03:34<3:19:16, 6.52s/it] {'loss': 0.4524, 'learning_rate': 4.8452905342416405e-06, 'epoch': 0.68} + 68%|██████▊ | 3938/5772 [3:03:36<3:19:16, 6.52s/it] {'loss': 0.4524, 'learning_rate': 4.8452905342416405e-06, 'epoch': 0.68} + 68%|██████▊ | 3938/5772 [3:03:34<3:19:16, 6.52s/it] 68%|██████▊ | 3939/5772 [3:03:42<3:14:13, 6.36s/it] 68%|██████▊ | 3939/5772 [3:03:40<3:14:13, 6.36s/it] {'loss': 0.4619, 'learning_rate': 4.840482391388988e-06, 'epoch': 0.68} + 68%|██████▊ | 3939/5772 [3:03:42<3:14:13, 6.36s/it] {'loss': 0.4619, 'learning_rate': 4.840482391388988e-06, 'epoch': 0.68} + 68%|██████▊ | 3939/5772 [3:03:40<3:14:13, 6.36s/it] 68%|██████▊ | 3940/5772 [3:03:46<3:10:47, 6.25s/it] 68%|██████▊ | 3940/5772 [3:03:48<3:10:47, 6.25s/it] {'loss': 0.4656, 'learning_rate': 4.835675873497716e-06, 'epoch': 0.68} + {'loss': 0.4656, 'learning_rate': 4.835675873497716e-06, 'epoch': 0.68} 68%|██████▊ | 3940/5772 [3:03:48<3:10:47, 6.25s/it] + 68%|██████▊ | 3940/5772 [3:03:46<3:10:47, 6.25s/it] 68%|██████▊ | 3941/5772 [3:03:52<3:07:05, 6.13s/it] 68%|██████▊ | 3941/5772 [3:03:54<3:07:06, 6.13s/it] {'loss': 0.4612, 'learning_rate': 4.830870982081614e-06, 'epoch': 0.68} + 68%|██████▊ | 3941/5772 [3:03:54<3:07:06, 6.13s/it] {'loss': 0.4612, 'learning_rate': 4.830870982081614e-06, 'epoch': 0.68} + 68%|██████▊ | 3941/5772 [3:03:52<3:07:05, 6.13s/it] 68%|██████▊ | 3942/5772 [3:03:58<3:06:09, 6.10s/it] 68%|██████▊ | 3942/5772 [3:04:00<3:06:09, 6.10s/it] {'loss': 0.4606, 'learning_rate': 4.8260677186539554e-06, 'epoch': 0.68} + 68%|██████▊ | 3942/5772 [3:04:00<3:06:09, 6.10s/it] {'loss': 0.4606, 'learning_rate': 4.8260677186539554e-06, 'epoch': 0.68} + 68%|██████▊ | 3942/5772 [3:03:58<3:06:09, 6.10s/it] 68%|██████▊ | 3943/5772 [3:04:04<3:05:17, 6.08s/it] 68%|██████▊ | 3943/5772 [3:04:06<3:05:17, 6.08s/it] {'loss': 0.4662, 'learning_rate': 4.821266084727505e-06, 'epoch': 0.68} + 68%|██████▊ | 3943/5772 [3:04:06<3:05:17, 6.08s/it] {'loss': 0.4662, 'learning_rate': 4.821266084727505e-06, 'epoch': 0.68} + 68%|██████▊ | 3943/5772 [3:04:04<3:05:17, 6.08s/it] 68%|██████▊ | 3944/5772 [3:04:10<3:04:22, 6.05s/it] 68%|██████▊ | 3944/5772 [3:04:12<3:04:21, 6.05s/it] {'loss': 0.4507, 'learning_rate': 4.816466081814504e-06, 'epoch': 0.68} + 68%|██████▊ | 3944/5772 [3:04:12<3:04:21, 6.05s/it] {'loss': 0.4507, 'learning_rate': 4.816466081814504e-06, 'epoch': 0.68} + 68%|██████▊ | 3944/5772 [3:04:10<3:04:22, 6.05s/it] 68%|██████▊ | 3945/5772 [3:04:18<3:03:16, 6.02s/it] 68%|██████▊ | 3945/5772 [3:04:16<3:03:17, 6.02s/it] {'loss': 0.4686, 'learning_rate': 4.811667711426686e-06, 'epoch': 0.68} + 68%|██████▊ | 3945/5772 [3:04:18<3:03:16, 6.02s/it] {'loss': 0.4686, 'learning_rate': 4.811667711426686e-06, 'epoch': 0.68} + 68%|██████▊ | 3945/5772 [3:04:16<3:03:17, 6.02s/it] 68%|██████▊ | 3946/5772 [3:04:22<3:01:40, 5.97s/it] 68%|██████▊ | 3946/5772 [3:04:24<3:01:40, 5.97s/it] {'loss': 0.4615, 'learning_rate': 4.8068709750752825e-06, 'epoch': 0.68} + 68%|██████▊ | 3946/5772 [3:04:24<3:01:40, 5.97s/it] {'loss': 0.4615, 'learning_rate': 4.8068709750752825e-06, 'epoch': 0.68} + 68%|██████▊ | 3946/5772 [3:04:22<3:01:40, 5.97s/it] 68%|██████▊ | 3947/5772 [3:04:29<2:59:22, 5.90s/it] 68%|██████▊ | 3947/5772 [3:04:27<2:59:22, 5.90s/it] {'loss': 0.4663, 'learning_rate': 4.802075874270988e-06, 'epoch': 0.68} + 68%|██████▊ | 3947/5772 [3:04:29<2:59:22, 5.90s/it] {'loss': 0.4663, 'learning_rate': 4.802075874270988e-06, 'epoch': 0.68} + 68%|██████▊ | 3947/5772 [3:04:27<2:59:22, 5.90s/it] 68%|██████▊ | 3948/5772 [3:04:35<2:59:57, 5.92s/it] 68%|██████▊ | 3948/5772 [3:04:33<2:59:57, 5.92s/it] {'loss': 0.4541, 'learning_rate': 4.797282410523997e-06, 'epoch': 0.68} + 68%|██████▊ | 3948/5772 [3:04:35<2:59:57, 5.92s/it] {'loss': 0.4541, 'learning_rate': 4.797282410523997e-06, 'epoch': 0.68} + 68%|██████▊ | 3948/5772 [3:04:33<2:59:57, 5.92s/it] 68%|██████▊ | 3949/5772 [3:04:42<3:02:54, 6.02s/it] 68%|██████▊ | 3949/5772 [3:04:40<3:02:54, 6.02s/it] {'loss': 0.4682, 'learning_rate': 4.792490585343983e-06, 'epoch': 0.68} + 68%|██████▊ | 3949/5772 [3:04:42<3:02:54, 6.02s/it] {'loss': 0.4682, 'learning_rate': 4.792490585343983e-06, 'epoch': 0.68} + 68%|██████▊ | 3949/5772 [3:04:40<3:02:54, 6.02s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + 4 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...119 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + + 68%|██████▊ | 3950/5772 [3:04:49<3:33:44, 7.04s/it]7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 68%|██████▊ | 3950/5772 [3:04:51<3:33:44, 7.04s/it] {'loss': 0.4638, 'learning_rate': 4.787700400240108e-06, 'epoch': 0.68} + 68%|██████▊ | 3950/5772 [3:04:51<3:33:44, 7.04s/it] {'loss': 0.4638, 'learning_rate': 4.787700400240108e-06, 'epoch': 0.68} + 68%|██████▊ | 3950/5772 [3:04:49<3:33:44, 7.04s/it] 68%|██████▊ | 3951/5772 [3:04:55<3:23:11, 6.69s/it] 68%|██████▊ | 3951/5772 [3:04:57<3:23:11, 6.69s/it] {'loss': 0.4696, 'learning_rate': 4.78291185672101e-06, 'epoch': 0.68} + 68%|██████▊ | 3951/5772 [3:04:57<3:23:11, 6.69s/it] {'loss': 0.4696, 'learning_rate': 4.78291185672101e-06, 'epoch': 0.68} + 68%|██████▊ | 3951/5772 [3:04:55<3:23:11, 6.69s/it] 68%|██████▊ | 3952/5772 [3:05:01<3:17:43, 6.52s/it] 68%|██████▊ | 3952/5772 [3:05:03<3:17:43, 6.52s/it] {'loss': 0.4554, 'learning_rate': 4.7781249562948136e-06, 'epoch': 0.68} + 68%|██████▊ | 3952/5772 [3:05:03<3:17:43, 6.52s/it] {'loss': 0.4554, 'learning_rate': 4.7781249562948136e-06, 'epoch': 0.68} + 68%|██████▊ | 3952/5772 [3:05:01<3:17:43, 6.52s/it] 68%|██████▊ | 3953/5772 [3:05:07<3:13:17, 6.38s/it] 68%|██████▊ | 3953/5772 [3:05:09<3:13:17, 6.38s/it] {'loss': 0.4688, 'learning_rate': 4.773339700469129e-06, 'epoch': 0.68} + 68%|██████▊ | 3953/5772 [3:05:09<3:13:17, 6.38s/it] {'loss': 0.4688, 'learning_rate': 4.773339700469129e-06, 'epoch': 0.68} + 68%|██████▊ | 3953/5772 [3:05:07<3:13:17, 6.38s/it] 69%|██████▊ | 3954/5772 [3:05:13<3:10:58, 6.30s/it] 69%|██████▊ | 3954/5772 [3:05:15<3:10:58, 6.30s/it] {'loss': 0.4552, 'learning_rate': 4.7685560907510465e-06, 'epoch': 0.68} + 69%|██████▊ | 3954/5772 [3:05:15<3:10:58, 6.30s/it] {'loss': 0.4552, 'learning_rate': 4.7685560907510465e-06, 'epoch': 0.68} + 69%|██████▊ | 3954/5772 [3:05:13<3:10:58, 6.30s/it] 69%|██████▊ | 3955/5772 [3:05:21<3:08:12, 6.22s/it] 69%|██████▊ | 3955/5772 [3:05:19<3:08:12, 6.22s/it] {'loss': 0.4667, 'learning_rate': 4.7637741286471385e-06, 'epoch': 0.69} + 69%|██████▊ | 3955/5772 [3:05:21<3:08:12, 6.22s/it] {'loss': 0.4667, 'learning_rate': 4.7637741286471385e-06, 'epoch': 0.69} + 69%|██████▊ | 3955/5772 [3:05:19<3:08:12, 6.22s/it] 69%|██████▊ | 3956/5772 [3:05:26<3:07:58, 6.21s/it] 69%|██████▊ | 3956/5772 [3:05:27<3:07:58, 6.21s/it] {'loss': 0.4608, 'learning_rate': 4.7589938156634485e-06, 'epoch': 0.69} + 69%|██████▊ | 3956/5772 [3:05:27<3:07:58, 6.21s/it] {'loss': 0.4608, 'learning_rate': 4.7589938156634485e-06, 'epoch': 0.69} + 69%|██████▊ | 3956/5772 [3:05:26<3:07:58, 6.21s/it] 69%|██████▊ | 3957/5772 [3:05:34<3:09:50, 6.28s/it] 69%|██████▊ | 3957/5772 [3:05:32<3:09:50, 6.28s/it] {'loss': 0.4648, 'learning_rate': 4.7542151533055235e-06, 'epoch': 0.69} + 69%|██████▊ | 3957/5772 [3:05:34<3:09:50, 6.28s/it] {'loss': 0.4648, 'learning_rate': 4.7542151533055235e-06, 'epoch': 0.69} + 69%|██████▊ | 3957/5772 [3:05:32<3:09:50, 6.28s/it] 69%|██████▊ | 3958/5772 [3:05:38<3:08:14, 6.23s/it] 69%|██████▊ | 3958/5772 [3:05:40<3:08:14, 6.23s/it] {'loss': 0.4578, 'learning_rate': 4.7494381430783656e-06, 'epoch': 0.69} + 69%|██████▊ | 3958/5772 [3:05:40<3:08:14, 6.23s/it] {'loss': 0.4578, 'learning_rate': 4.7494381430783656e-06, 'epoch': 0.69} + 69%|██████▊ | 3958/5772 [3:05:38<3:08:14, 6.23s/it] 69%|██████▊ | 3959/5772 [3:05:44<3:08:17, 6.23s/it] 69%|██████▊ | 3959/5772 [3:05:46<3:08:17, 6.23s/it] {'loss': 0.4605, 'learning_rate': 4.744662786486471e-06, 'epoch': 0.69} + 69%|██████▊ | 3959/5772 [3:05:46<3:08:17, 6.23s/it] {'loss': 0.4605, 'learning_rate': 4.744662786486471e-06, 'epoch': 0.69} + 69%|██████▊ | 3959/5772 [3:05:44<3:08:17, 6.23s/it] 69%|██████▊ | 3960/5772 [3:05:50<3:04:07, 6.10s/it] 69%|██████▊ | 3960/5772 [3:05:52<3:04:07, 6.10s/it] {'loss': 0.458, 'learning_rate': 4.739889085033812e-06, 'epoch': 0.69} + 69%|██████▊ | 3960/5772 [3:05:52<3:04:07, 6.10s/it] {'loss': 0.458, 'learning_rate': 4.739889085033812e-06, 'epoch': 0.69} + 69%|██████▊ | 3960/5772 [3:05:50<3:04:07, 6.10s/it] 69%|██████▊ | 3961/5772 [3:05:59<3:30:09, 6.96s/it] 69%|██████▊ | 3961/5772 [3:06:01<3:30:09, 6.96s/it] {'loss': 0.4681, 'learning_rate': 4.73511704022384e-06, 'epoch': 0.69} + 69%|██████▊ | 3961/5772 [3:06:01<3:30:09, 6.96s/it] {'loss': 0.4681, 'learning_rate': 4.73511704022384e-06, 'epoch': 0.69} + 69%|██████▊ | 3961/5772 [3:05:59<3:30:09, 6.96s/it] 69%|██████▊ | 3962/5772 [3:06:12<4:21:08, 8.66s/it] 69%|██████▊ | 3962/5772 [3:06:14<4:21:08, 8.66s/it] {'loss': 0.4721, 'learning_rate': 4.730346653559486e-06, 'epoch': 0.69} + 69%|██████▊ | 3962/5772 [3:06:14<4:21:08, 8.66s/it] {'loss': 0.4721, 'learning_rate': 4.730346653559486e-06, 'epoch': 0.69} + 69%|██████▊ | 3962/5772 [3:06:12<4:21:08, 8.66s/it] 69%|██████▊ | 3963/5772 [3:06:22<4:22:26, 8.70s/it] 69%|██████▊ | 3963/5772 [3:06:20<4:22:26, 8.70s/it] {'loss': 0.4547, 'learning_rate': 4.725577926543151e-06, 'epoch': 0.69} + 69%|██████▊ | 3963/5772 [3:06:22<4:22:26, 8.70s/it] {'loss': 0.4547, 'learning_rate': 4.725577926543151e-06, 'epoch': 0.69} + 69%|██████▊ | 3963/5772 [3:06:20<4:22:26, 8.70s/it] 69%|██████▊ | 3964/5772 [3:06:29<4:20:41, 8.65s/it] 69%|██████▊ | 3964/5772 [3:06:31<4:20:41, 8.65s/it] {'loss': 0.4637, 'learning_rate': 4.720810860676722e-06, 'epoch': 0.69} + 69%|██████▊ | 3964/5772 [3:06:31<4:20:41, 8.65s/it] {'loss': 0.4637, 'learning_rate': 4.720810860676722e-06, 'epoch': 0.69} + 69%|██████▊ | 3964/5772 [3:06:29<4:20:41, 8.65s/it] 69%|██████▊ | 3965/5772 [3:06:40<4:24:05, 8.77s/it] 69%|██████▊ | 3965/5772 [3:06:38<4:24:05, 8.77s/it] {'loss': 0.472, 'learning_rate': 4.7160454574615596e-06, 'epoch': 0.69} + 69%|██████▊ | 3965/5772 [3:06:40<4:24:05, 8.77s/it] {'loss': 0.472, 'learning_rate': 4.7160454574615596e-06, 'epoch': 0.69} + 69%|██████▊ | 3965/5772 [3:06:38<4:24:05, 8.77s/it] 69%|██████▊ | 3966/5772 [3:06:45<4:03:17, 8.08s/it] 69%|██████▊ | 3966/5772 [3:06:47<4:03:17, 8.08s/it] {'loss': 0.4559, 'learning_rate': 4.711281718398503e-06, 'epoch': 0.69} + 69%|██████▊ | 3966/5772 [3:06:47<4:03:17, 8.08s/it] {'loss': 0.4559, 'learning_rate': 4.711281718398503e-06, 'epoch': 0.69} + 69%|██████▊ | 3966/5772 [3:06:45<4:03:17, 8.08s/it] 69%|██████▊ | 3967/5772 [3:06:54<4:16:07, 8.51s/it] 69%|██████▊ | 3967/5772 [3:06:56<4:16:07, 8.51s/it] {'loss': 0.4531, 'learning_rate': 4.706519644987863e-06, 'epoch': 0.69} + 69%|██████▊ | 3967/5772 [3:06:56<4:16:07, 8.51s/it] {'loss': 0.4531, 'learning_rate': 4.706519644987863e-06, 'epoch': 0.69} + 69%|██████▊ | 3967/5772 [3:06:54<4:16:07, 8.51s/it] 69%|██████▊ | 3968/5772 [3:07:03<4:23:39, 8.77s/it] 69%|██████▊ | 3968/5772 [3:07:05<4:23:39, 8.77s/it] {'loss': 0.4634, 'learning_rate': 4.701759238729428e-06, 'epoch': 0.69} + 69%|██████▊ | 3968/5772 [3:07:05<4:23:39, 8.77s/it] {'loss': 0.4634, 'learning_rate': 4.701759238729428e-06, 'epoch': 0.69} + 69%|██████▊ | 3968/5772 [3:07:03<4:23:39, 8.77s/it] 69%|██████▉ | 3969/5772 [3:07:09<3:57:12, 7.89s/it] 69%|██████▉ | 3969/5772 [3:07:11<3:57:12, 7.89s/it] {'loss': 0.4764, 'learning_rate': 4.697000501122466e-06, 'epoch': 0.69} + 69%|██████▉ | 3969/5772 [3:07:11<3:57:12, 7.89s/it] {'loss': 0.4764, 'learning_rate': 4.697000501122466e-06, 'epoch': 0.69} + 69%|██████▉ | 3969/5772 [3:07:09<3:57:12, 7.89s/it] 69%|██████▉ | 3970/5772 [3:07:22<4:39:46, 9.32s/it] 69%|██████▉ | 3970/5772 [3:07:24<4:39:46, 9.32s/it] {'loss': 0.4595, 'learning_rate': 4.6922434336657095e-06, 'epoch': 0.69} + 69%|██████▉ | 3970/5772 [3:07:24<4:39:46, 9.32s/it] {'loss': 0.4595, 'learning_rate': 4.6922434336657095e-06, 'epoch': 0.69} + 69%|██████▉ | 3970/5772 [3:07:22<4:39:46, 9.32s/it] 69%|██████▉ | 3971/5772 [3:07:30<4:08:21, 8.27s/it] 69%|██████▉ | 3971/5772 [3:07:28<4:08:21, 8.27s/it] {'loss': 0.4585, 'learning_rate': 4.68748803785737e-06, 'epoch': 0.69} + 69%|██████▉ | 3971/5772 [3:07:30<4:08:21, 8.27s/it] {'loss': 0.4585, 'learning_rate': 4.68748803785737e-06, 'epoch': 0.69} + 69%|██████▉ | 3971/5772 [3:07:28<4:08:21, 8.27s/it] 69%|██████▉ | 3972/5772 [3:07:36<3:50:12, 7.67s/it] 69%|██████▉ | 3972/5772 [3:07:34<3:50:12, 7.67s/it] {'loss': 0.4674, 'learning_rate': 4.682734315195138e-06, 'epoch': 0.69} + 69%|██████▉ | 3972/5772 [3:07:36<3:50:12, 7.67s/it] {'loss': 0.4674, 'learning_rate': 4.682734315195138e-06, 'epoch': 0.69} + 69%|██████▉ | 3972/5772 [3:07:34<3:50:12, 7.67s/it] 69%|██████▉ | 3973/5772 [3:07:40<3:33:55, 7.13s/it] 69%|██████▉ | 3973/5772 [3:07:42<3:33:55, 7.13s/it] {'loss': 0.4676, 'learning_rate': 4.677982267176168e-06, 'epoch': 0.69} + 69%|██████▉ | 3973/5772 [3:07:42<3:33:55, 7.13s/it] {'loss': 0.4676, 'learning_rate': 4.677982267176168e-06, 'epoch': 0.69} + 69%|██████▉ | 3973/5772 [3:07:40<3:33:55, 7.13s/it] 69%|██████▉ | 3974/5772 [3:07:48<3:43:57, 7.47s/it] 69%|██████▉ | 3974/5772 [3:07:50<3:43:57, 7.47s/it] {'loss': 0.4559, 'learning_rate': 4.673231895297092e-06, 'epoch': 0.69} + 69%|██████▉ | 3974/5772 [3:07:50<3:43:57, 7.47s/it] {'loss': 0.4559, 'learning_rate': 4.673231895297092e-06, 'epoch': 0.69} + 69%|██████▉ | 3974/5772 [3:07:48<3:43:57, 7.47s/it] 69%|██████▉ | 3975/5772 [3:07:54<3:31:05, 7.05s/it] 69%|██████▉ | 3975/5772 [3:07:56<3:31:05, 7.05s/it] {'loss': 0.476, 'learning_rate': 4.668483201054013e-06, 'epoch': 0.69} + 69%|██████▉ | 3975/5772 [3:07:56<3:31:05, 7.05s/it] {'loss': 0.476, 'learning_rate': 4.668483201054013e-06, 'epoch': 0.69} + 69%|██████▉ | 3975/5772 [3:07:54<3:31:05, 7.05s/it] 69%|██████▉ | 3976/5772 [3:08:01<3:27:01, 6.92s/it] 69%|██████▉ | 3976/5772 [3:08:03<3:27:01, 6.92s/it] {'loss': 0.451, 'learning_rate': 4.663736185942512e-06, 'epoch': 0.69} + 69%|██████▉ | 3976/5772 [3:08:03<3:27:01, 6.92s/it] {'loss': 0.451, 'learning_rate': 4.663736185942512e-06, 'epoch': 0.69} + 69%|██████▉ | 3976/5772 [3:08:01<3:27:01, 6.92s/it] 69%|██████▉ | 3977/5772 [3:08:07<3:18:12, 6.63s/it] 69%|██████▉ | 3977/5772 [3:08:09<3:18:12, 6.63s/it] {'loss': 0.4596, 'learning_rate': 4.658990851457625e-06, 'epoch': 0.69} + 69%|██████▉ | 3977/5772 [3:08:09<3:18:12, 6.63s/it] {'loss': 0.4596, 'learning_rate': 4.658990851457625e-06, 'epoch': 0.69} + 69%|██████▉ | 3977/5772 [3:08:07<3:18:12, 6.63s/it] 69%|██████▉ | 3978/5772 [3:08:13<3:11:42, 6.41s/it] 69%|██████▉ | 3978/5772 [3:08:15<3:11:42, 6.41s/it] {'loss': 0.4557, 'learning_rate': 4.654247199093873e-06, 'epoch': 0.69} + 69%|██████▉ | 3978/5772 [3:08:15<3:11:42, 6.41s/it] {'loss': 0.4557, 'learning_rate': 4.654247199093873e-06, 'epoch': 0.69} + 69%|██████▉ | 3978/5772 [3:08:13<3:11:42, 6.41s/it] 69%|██████▉ | 3979/5772 [3:08:21<3:09:38, 6.35s/it] 69%|██████▉ | 3979/5772 [3:08:19<3:09:38, 6.35s/it] {'loss': 0.4646, 'learning_rate': 4.649505230345244e-06, 'epoch': 0.69} + 69%|██████▉ | 3979/5772 [3:08:21<3:09:38, 6.35s/it] {'loss': 0.4646, 'learning_rate': 4.649505230345244e-06, 'epoch': 0.69} + 69%|██████▉ | 3979/5772 [3:08:19<3:09:38, 6.35s/it] 69%|██████▉ | 3980/5772 [3:08:27<3:05:06, 6.20s/it] 69%|██████▉ | 3980/5772 [3:08:25<3:05:06, 6.20s/it] {'loss': 0.4544, 'learning_rate': 4.644764946705193e-06, 'epoch': 0.69} + 69%|██████▉ | 3980/5772 [3:08:27<3:05:06, 6.20s/it] {'loss': 0.4544, 'learning_rate': 4.644764946705193e-06, 'epoch': 0.69} + 69%|██████▉ | 3980/5772 [3:08:25<3:05:06, 6.20s/it] 69%|██████▉ | 3981/5772 [3:08:33<3:05:06, 6.20s/it] 69%|██████▉ | 3981/5772 [3:08:31<3:05:06, 6.20s/it] {'loss': 0.4833, 'learning_rate': 4.640026349666651e-06, 'epoch': 0.69} + 69%|██████▉ | 3981/5772 [3:08:33<3:05:06, 6.20s/it] {'loss': 0.4833, 'learning_rate': 4.640026349666651e-06, 'epoch': 0.69} + 69%|██████▉ | 3981/5772 [3:08:31<3:05:06, 6.20s/it] 69%|██████▉ | 3982/5772 [3:08:39<3:05:31, 6.22s/it] 69%|██████▉ | 3982/5772 [3:08:37<3:05:31, 6.22s/it] {'loss': 0.4612, 'learning_rate': 4.635289440722001e-06, 'epoch': 0.69} + 69%|██████▉ | 3982/5772 [3:08:39<3:05:31, 6.22s/it] {'loss': 0.4612, 'learning_rate': 4.635289440722001e-06, 'epoch': 0.69} + 69%|██████▉ | 3982/5772 [3:08:37<3:05:31, 6.22s/it] 69%|██████▉ | 3983/5772 [3:08:45<3:03:26, 6.15s/it] 69%|██████▉ | 3983/5772 [3:08:43<3:03:26, 6.15s/it] {'loss': 0.4913, 'learning_rate': 4.6305542213631205e-06, 'epoch': 0.69} + 69%|██████▉ | 3983/5772 [3:08:45<3:03:26, 6.15s/it] {'loss': 0.4913, 'learning_rate': 4.6305542213631205e-06, 'epoch': 0.69} + 69%|██████▉ | 3983/5772 [3:08:43<3:03:26, 6.15s/it] 69%|██████▉ | 3984/5772 [3:08:49<3:03:49, 6.17s/it] 69%|██████▉ | 3984/5772 [3:08:51<3:03:49, 6.17s/it] {'loss': 0.4662, 'learning_rate': 4.625820693081331e-06, 'epoch': 0.69} + 69%|██████▉ | 3984/5772 [3:08:51<3:03:49, 6.17s/it] {'loss': 0.4662, 'learning_rate': 4.625820693081331e-06, 'epoch': 0.69} + 69%|██████▉ | 3984/5772 [3:08:49<3:03:49, 6.17s/it] 69%|██████▉ | 3985/5772 [3:08:57<3:01:53, 6.11s/it] 69%|██████▉ | 3985/5772 [3:08:55<3:01:53, 6.11s/it] {'loss': 0.4635, 'learning_rate': 4.621088857367433e-06, 'epoch': 0.69} + 69%|██████▉ | 3985/5772 [3:08:57<3:01:53, 6.11s/it] {'loss': 0.4635, 'learning_rate': 4.621088857367433e-06, 'epoch': 0.69} + 69%|██████▉ | 3985/5772 [3:08:55<3:01:53, 6.11s/it] 69%|██████▉ | 3986/5772 [3:09:04<3:04:56, 6.21s/it] 69%|██████▉ | 3986/5772 [3:09:02<3:04:55, 6.21s/it] {'loss': 0.4612, 'learning_rate': 4.616358715711693e-06, 'epoch': 0.69} + 69%|██████▉ | 3986/5772 [3:09:04<3:04:56, 6.21s/it] {'loss': 0.4612, 'learning_rate': 4.616358715711693e-06, 'epoch': 0.69} + 69%|██████▉ | 3986/5772 [3:09:02<3:04:55, 6.21s/it] 69%|██████▉ | 3987/5772 [3:09:08<3:05:50, 6.25s/it] 69%|██████▉ | 3987/5772 [3:09:10<3:05:50, 6.25s/it] {'loss': 0.4644, 'learning_rate': 4.611630269603842e-06, 'epoch': 0.69} + 69%|██████▉ | 3987/5772 [3:09:10<3:05:50, 6.25s/it] {'loss': 0.4644, 'learning_rate': 4.611630269603842e-06, 'epoch': 0.69} + 69%|██████▉ | 3987/5772 [3:09:08<3:05:50, 6.25s/it] 69%|██████▉ | 3988/5772 [3:09:16<3:04:28, 6.20s/it] 69%|██████▉ | 3988/5772 [3:09:14<3:04:28, 6.20s/it] {'loss': 0.4544, 'learning_rate': 4.606903520533082e-06, 'epoch': 0.69} + 69%|██████▉ | 3988/5772 [3:09:16<3:04:28, 6.20s/it] {'loss': 0.4544, 'learning_rate': 4.606903520533082e-06, 'epoch': 0.69} + 69%|██████▉ | 3988/5772 [3:09:14<3:04:28, 6.20s/it] 69%|██████▉ | 3989/5772 [3:09:22<3:02:42, 6.15s/it] 69%|██████▉ | 3989/5772 [3:09:20<3:02:42, 6.15s/it] {'loss': 0.4627, 'learning_rate': 4.602178469988064e-06, 'epoch': 0.69} + 69%|██████▉ | 3989/5772 [3:09:22<3:02:42, 6.15s/it] {'loss': 0.4627, 'learning_rate': 4.602178469988064e-06, 'epoch': 0.69} + 69%|██████▉ | 3989/5772 [3:09:20<3:02:42, 6.15s/it] 69%|██████▉ | 3990/5772 [3:09:28<3:00:43, 6.09s/it] 69%|██████▉ | 3990/5772 [3:09:26<3:00:43, 6.09s/it] {'loss': 0.4668, 'learning_rate': 4.5974551194569336e-06, 'epoch': 0.69} + 69%|██████▉ | 3990/5772 [3:09:28<3:00:43, 6.09s/it] {'loss': 0.4668, 'learning_rate': 4.5974551194569336e-06, 'epoch': 0.69} + 69%|██████▉ | 3990/5772 [3:09:26<3:00:43, 6.09s/it] 69%|██████▉ | 3991/5772 [3:09:34<3:01:07, 6.10s/it] 69%|██████▉ | 3991/5772 [3:09:32<3:01:07, 6.10s/it] {'loss': 0.4575, 'learning_rate': 4.592733470427272e-06, 'epoch': 0.69} + 69%|██████▉ | 3991/5772 [3:09:34<3:01:07, 6.10s/it] {'loss': 0.4575, 'learning_rate': 4.592733470427272e-06, 'epoch': 0.69} + 69%|██████▉ | 3991/5772 [3:09:32<3:01:07, 6.10s/it] 69%|██████▉ | 3992/5772 [3:09:40<2:59:38, 6.06s/it] 69%|██████▉ | 3992/5772 [3:09:38<2:59:38, 6.06s/it] {'loss': 0.4608, 'learning_rate': 4.588013524386138e-06, 'epoch': 0.69} + 69%|██████▉ | 3992/5772 [3:09:40<2:59:38, 6.06s/it] {'loss': 0.4608, 'learning_rate': 4.588013524386138e-06, 'epoch': 0.69} + 69%|██████▉ | 3992/5772 [3:09:38<2:59:38, 6.06s/it] 69%|██████▉ | 3993/5772 [3:09:44<2:59:04, 6.04s/it] 69%|██████▉ | 3993/5772 [3:09:46<2:59:04, 6.04s/it] {'loss': 0.4691, 'learning_rate': 4.5832952828200535e-06, 'epoch': 0.69} + 69%|██████▉ | 3993/5772 [3:09:46<2:59:04, 6.04s/it] {'loss': 0.4691, 'learning_rate': 4.5832952828200535e-06, 'epoch': 0.69} + 69%|██████▉ | 3993/5772 [3:09:44<2:59:04, 6.04s/it] 69%|██████▉ | 3994/5772 [3:09:53<3:02:55, 6.17s/it] 69%|██████▉ | 3994/5772 [3:09:51<3:02:55, 6.17s/it] {'loss': 0.4686, 'learning_rate': 4.578578747215003e-06, 'epoch': 0.69} + 69%|██████▉ | 3994/5772 [3:09:53<3:02:55, 6.17s/it] {'loss': 0.4686, 'learning_rate': 4.578578747215003e-06, 'epoch': 0.69} + 69%|██████▉ | 3994/5772 [3:09:51<3:02:55, 6.17s/it] 69%|██████▉ | 3995/5772 [3:09:59<3:02:58, 6.18s/it] 69%|██████▉ | 3995/5772 [3:09:57<3:02:59, 6.18s/it] {'loss': 0.4526, 'learning_rate': 4.573863919056438e-06, 'epoch': 0.69} + 69%|██████▉ | 3995/5772 [3:09:59<3:02:58, 6.18s/it] {'loss': 0.4526, 'learning_rate': 4.573863919056438e-06, 'epoch': 0.69} + 69%|██████▉ | 3995/5772 [3:09:57<3:02:59, 6.18s/it] 69%|██████▉ | 3996/5772 [3:10:03<3:01:51, 6.14s/it] 69%|██████▉ | 3996/5772 [3:10:05<3:01:51, 6.14s/it] {'loss': 0.4591, 'learning_rate': 4.569150799829257e-06, 'epoch': 0.69} + 69%|██████▉ | 3996/5772 [3:10:05<3:01:51, 6.14s/it] {'loss': 0.4591, 'learning_rate': 4.569150799829257e-06, 'epoch': 0.69} + 69%|██████▉ | 3996/5772 [3:10:03<3:01:51, 6.14s/it] 69%|██████▉ | 3997/5772 [3:10:09<3:03:09, 6.19s/it] 69%|██████▉ | 3997/5772 [3:10:11<3:03:09, 6.19s/it] {'loss': 0.4828, 'learning_rate': 4.564439391017836e-06, 'epoch': 0.69} + 69%|██████▉ | 3997/5772 [3:10:11<3:03:09, 6.19s/it] {'loss': 0.4828, 'learning_rate': 4.564439391017836e-06, 'epoch': 0.69} + 69%|██████▉ | 3997/5772 [3:10:09<3:03:09, 6.19s/it] 69%|██████▉ | 3998/5772 [3:10:18<3:03:24, 6.20s/it] 69%|██████▉ | 3998/5772 [3:10:16<3:03:24, 6.20s/it] {'loss': 0.4569, 'learning_rate': 4.559729694106008e-06, 'epoch': 0.69} + 69%|██████▉ | 3998/5772 [3:10:18<3:03:24, 6.20s/it] {'loss': 0.4569, 'learning_rate': 4.559729694106008e-06, 'epoch': 0.69} + 69%|██████▉ | 3998/5772 [3:10:16<3:03:24, 6.20s/it] 69%|██████▉ | 3999/5772 [3:10:24<3:02:42, 6.18s/it] 69%|██████▉ | 3999/5772 [3:10:22<3:02:42, 6.18s/it] {'loss': 0.475, 'learning_rate': 4.555021710577068e-06, 'epoch': 0.69} + 69%|██████▉ | 3999/5772 [3:10:24<3:02:42, 6.18s/it] {'loss': 0.475, 'learning_rate': 4.555021710577068e-06, 'epoch': 0.69} + 69%|██████▉ | 3999/5772 [3:10:22<3:02:42, 6.18s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 5 + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 69%|██████▉ | 4000/5772 [3:10:28<3:03:27, 6.21s/it]6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 69%|██████▉ | 4000/5772 [3:10:30<3:03:27, 6.21s/it]9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4542, 'learning_rate': 4.550315441913759e-06, 'epoch': 0.69} + 69%|██████▉ | 4000/5772 [3:10:30<3:03:27, 6.21s/it] {'loss': 0.4542, 'learning_rate': 4.550315441913759e-06, 'epoch': 0.69} + 69%|██████▉ | 4000/5772 [3:10:28<3:03:27, 6.21s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 69%|██████▉ | 4001/5772 [3:10:51<5:35:01, 11.35s/it] 69%|██████▉ | 4001/5772 [3:10:53<5:35:01, 11.35s/it] {'loss': 0.4579, 'learning_rate': 4.545610889598304e-06, 'epoch': 0.69} + 69%|██████▉ | 4001/5772 [3:10:53<5:35:01, 11.35s/it] {'loss': 0.4579, 'learning_rate': 4.545610889598304e-06, 'epoch': 0.69} + 69%|██████▉ | 4001/5772 [3:10:51<5:35:01, 11.35s/it] 69%|██████▉ | 4002/5772 [3:10:58<4:54:46, 9.99s/it] 69%|██████▉ | 4002/5772 [3:11:00<4:54:46, 9.99s/it] {'loss': 0.4683, 'learning_rate': 4.540908055112378e-06, 'epoch': 0.69} + 69%|██████▉ | 4002/5772 [3:11:00<4:54:46, 9.99s/it] {'loss': 0.4683, 'learning_rate': 4.540908055112378e-06, 'epoch': 0.69} + 69%|██████▉ | 4002/5772 [3:10:58<4:54:46, 9.99s/it] 69%|██████▉ | 4003/5772 [3:11:04<4:17:59, 8.75s/it] 69%|██████▉ | 4003/5772 [3:11:06<4:17:59, 8.75s/it] {'loss': 0.4582, 'learning_rate': 4.536206939937101e-06, 'epoch': 0.69} + 69%|██████▉ | 4003/5772 [3:11:06<4:17:59, 8.75s/it] {'loss': 0.4582, 'learning_rate': 4.536206939937101e-06, 'epoch': 0.69} + 69%|██████▉ | 4003/5772 [3:11:04<4:17:59, 8.75s/it] 69%|██████▉ | 4004/5772 [3:11:10<3:53:11, 7.91s/it] 69%|██████▉ | 4004/5772 [3:11:12<3:53:11, 7.91s/it] {'loss': 0.4596, 'learning_rate': 4.531507545553072e-06, 'epoch': 0.69} + 69%|██████▉ | 4004/5772 [3:11:12<3:53:11, 7.91s/it] {'loss': 0.4596, 'learning_rate': 4.531507545553072e-06, 'epoch': 0.69} + 69%|██████▉ | 4004/5772 [3:11:10<3:53:11, 7.91s/it] 69%|██████▉ | 4005/5772 [3:11:16<3:38:18, 7.41s/it] 69%|██████▉ | 4005/5772 [3:11:18<3:38:18, 7.41s/it] {'loss': 0.4677, 'learning_rate': 4.526809873440335e-06, 'epoch': 0.69} + 69%|██████▉ | 4005/5772 [3:11:18<3:38:18, 7.41s/it] {'loss': 0.4677, 'learning_rate': 4.526809873440335e-06, 'epoch': 0.69} + 69%|██████▉ | 4005/5772 [3:11:16<3:38:18, 7.41s/it] 69%|██████▉ | 4006/5772 [3:11:24<3:24:11, 6.94s/it] 69%|██████▉ | 4006/5772 [3:11:22<3:24:11, 6.94s/it] {'loss': 0.4544, 'learning_rate': 4.522113925078402e-06, 'epoch': 0.69} + 69%|██████▉ | 4006/5772 [3:11:24<3:24:11, 6.94s/it] {'loss': 0.4544, 'learning_rate': 4.522113925078402e-06, 'epoch': 0.69} + 69%|██████▉ | 4006/5772 [3:11:22<3:24:11, 6.94s/it] 69%|██████▉ | 4007/5772 [3:11:30<3:14:36, 6.62s/it] 69%|██████▉ | 4007/5772 [3:11:28<3:14:36, 6.62s/it] {'loss': 0.4599, 'learning_rate': 4.517419701946224e-06, 'epoch': 0.69} + 69%|██████▉ | 4007/5772 [3:11:30<3:14:36, 6.62s/it] {'loss': 0.4599, 'learning_rate': 4.517419701946224e-06, 'epoch': 0.69} + 69%|██████▉ | 4007/5772 [3:11:28<3:14:36, 6.62s/it] 69%|██████▉ | 4008/5772 [3:11:34<3:10:42, 6.49s/it] 69%|██████▉ | 4008/5772 [3:11:36<3:10:42, 6.49s/it] {'loss': 0.4528, 'learning_rate': 4.51272720552223e-06, 'epoch': 0.69} + 69%|██████▉ | 4008/5772 [3:11:36<3:10:42, 6.49s/it] {'loss': 0.4528, 'learning_rate': 4.51272720552223e-06, 'epoch': 0.69} + 69%|██████▉ | 4008/5772 [3:11:34<3:10:42, 6.49s/it] 69%|██████▉ | 4009/5772 [3:11:42<3:04:18, 6.27s/it] 69%|██████▉ | 4009/5772 [3:11:40<3:04:18, 6.27s/it] {'loss': 0.4655, 'learning_rate': 4.508036437284298e-06, 'epoch': 0.69} + 69%|██████▉ | 4009/5772 [3:11:40<3:04:18, 6.27s/it]{'loss': 0.4655, 'learning_rate': 4.508036437284298e-06, 'epoch': 0.69} + 69%|██████▉ | 4009/5772 [3:11:42<3:04:18, 6.27s/it] 69%|██████▉ | 4010/5772 [3:11:46<3:03:31, 6.25s/it] 69%|██████▉ | 4010/5772 [3:11:48<3:03:31, 6.25s/it] {'loss': 0.4552, 'learning_rate': 4.503347398709751e-06, 'epoch': 0.69} + 69%|██████▉ | 4010/5772 [3:11:48<3:03:31, 6.25s/it] {'loss': 0.4552, 'learning_rate': 4.503347398709751e-06, 'epoch': 0.69} + 69%|██████▉ | 4010/5772 [3:11:46<3:03:31, 6.25s/it] 69%|██████▉ | 4011/5772 [3:11:52<2:59:49, 6.13s/it] 69%|██████▉ | 4011/5772 [3:11:54<2:59:50, 6.13s/it] {'loss': 0.4575, 'learning_rate': 4.498660091275379e-06, 'epoch': 0.69} + 69%|██████▉ | 4011/5772 [3:11:54<2:59:50, 6.13s/it] {'loss': 0.4575, 'learning_rate': 4.498660091275379e-06, 'epoch': 0.69} + 69%|██████▉ | 4011/5772 [3:11:52<2:59:49, 6.13s/it] 70%|██████▉ | 4012/5772 [3:11:58<3:01:01, 6.17s/it] 70%|██████▉ | 4012/5772 [3:12:00<3:01:01, 6.17s/it] {'loss': 0.4682, 'learning_rate': 4.493974516457423e-06, 'epoch': 0.7} + 70%|██████▉ | 4012/5772 [3:12:00<3:01:01, 6.17s/it] {'loss': 0.4682, 'learning_rate': 4.493974516457423e-06, 'epoch': 0.7} + 70%|██████▉ | 4012/5772 [3:11:58<3:01:01, 6.17s/it] 70%|██████▉ | 4013/5772 [3:12:04<2:58:53, 6.10s/it] 70%|██████▉ | 4013/5772 [3:12:06<2:58:53, 6.10s/it] {'loss': 0.465, 'learning_rate': 4.489290675731584e-06, 'epoch': 0.7} + 70%|██████▉ | 4013/5772 [3:12:06<2:58:53, 6.10s/it] {'loss': 0.465, 'learning_rate': 4.489290675731584e-06, 'epoch': 0.7} + 70%|██████▉ | 4013/5772 [3:12:04<2:58:53, 6.10s/it] 70%|██████▉ | 4014/5772 [3:12:12<2:57:18, 6.05s/it] 70%|██████▉ | 4014/5772 [3:12:10<2:57:18, 6.05s/it] {'loss': 0.4485, 'learning_rate': 4.484608570573002e-06, 'epoch': 0.7} + 70%|██████▉ | 4014/5772 [3:12:12<2:57:18, 6.05s/it] {'loss': 0.4485, 'learning_rate': 4.484608570573002e-06, 'epoch': 0.7} + 70%|██████▉ | 4014/5772 [3:12:10<2:57:18, 6.05s/it] 70%|██████▉ | 4015/5772 [3:12:18<2:55:55, 6.01s/it] 70%|██████▉ | 4015/5772 [3:12:16<2:55:55, 6.01s/it] {'loss': 0.4514, 'learning_rate': 4.479928202456283e-06, 'epoch': 0.7} + 70%|██████▉ | 4015/5772 [3:12:18<2:55:55, 6.01s/it] {'loss': 0.4514, 'learning_rate': 4.479928202456283e-06, 'epoch': 0.7} + 70%|██████▉ | 4015/5772 [3:12:16<2:55:55, 6.01s/it] 70%|██████▉ | 4016/5772 [3:12:24<2:57:48, 6.08s/it] 70%|██████▉ | 4016/5772 [3:12:22<2:57:48, 6.08s/it] {'loss': 0.4706, 'learning_rate': 4.475249572855492e-06, 'epoch': 0.7} + 70%|██████▉ | 4016/5772 [3:12:24<2:57:48, 6.08s/it] {'loss': 0.4706, 'learning_rate': 4.475249572855492e-06, 'epoch': 0.7} + 70%|██████▉ | 4016/5772 [3:12:22<2:57:48, 6.08s/it] 70%|██████▉ | 4017/5772 [3:12:30<2:58:36, 6.11s/it] 70%|██████▉ | 4017/5772 [3:12:28<2:58:37, 6.11s/it] {'loss': 0.465, 'learning_rate': 4.470572683244127e-06, 'epoch': 0.7} + 70%|██████▉ | 4017/5772 [3:12:30<2:58:36, 6.11s/it] {'loss': 0.465, 'learning_rate': 4.470572683244127e-06, 'epoch': 0.7} + 70%|██████▉ | 4017/5772 [3:12:28<2:58:37, 6.11s/it] 70%|██████▉ | 4018/5772 [3:12:35<3:00:43, 6.18s/it] 70%|██████▉ | 4018/5772 [3:12:37<3:00:44, 6.18s/it] {'loss': 0.4532, 'learning_rate': 4.4658975350951505e-06, 'epoch': 0.7} + 70%|██████▉ | 4018/5772 [3:12:37<3:00:44, 6.18s/it] {'loss': 0.4532, 'learning_rate': 4.4658975350951505e-06, 'epoch': 0.7} + 70%|██████▉ | 4018/5772 [3:12:35<3:00:43, 6.18s/it] 70%|██████▉ | 4019/5772 [3:12:41<2:59:13, 6.13s/it] 70%|██████▉ | 4019/5772 [3:12:43<2:59:13, 6.13s/it] {'loss': 0.4539, 'learning_rate': 4.461224129880976e-06, 'epoch': 0.7} + 70%|██████▉ | 4019/5772 [3:12:43<2:59:13, 6.13s/it] {'loss': 0.4539, 'learning_rate': 4.461224129880976e-06, 'epoch': 0.7} + 70%|██████▉ | 4019/5772 [3:12:41<2:59:13, 6.13s/it] 70%|██████▉ | 4020/5772 [3:12:47<3:01:07, 6.20s/it] 70%|██████▉ | 4020/5772 [3:12:49<3:01:07, 6.20s/it] {'loss': 0.4543, 'learning_rate': 4.45655246907347e-06, 'epoch': 0.7} + 70%|██████▉ | 4020/5772 [3:12:49<3:01:07, 6.20s/it] {'loss': 0.4543, 'learning_rate': 4.45655246907347e-06, 'epoch': 0.7} + 70%|██████▉ | 4020/5772 [3:12:47<3:01:07, 6.20s/it] 70%|██████▉ | 4021/5772 [3:12:55<2:58:04, 6.10s/it] 70%|██████▉ | 4021/5772 [3:12:53<2:58:05, 6.10s/it] {'loss': 0.4633, 'learning_rate': 4.451882554143938e-06, 'epoch': 0.7} + 70%|██████▉ | 4021/5772 [3:12:55<2:58:04, 6.10s/it] {'loss': 0.4633, 'learning_rate': 4.451882554143938e-06, 'epoch': 0.7} + 70%|██████▉ | 4021/5772 [3:12:53<2:58:05, 6.10s/it] 70%|██████▉ | 4022/5772 [3:13:01<2:57:46, 6.10s/it] 70%|██████▉ | 4022/5772 [3:12:59<2:57:46, 6.10s/it] {'loss': 0.4477, 'learning_rate': 4.447214386563145e-06, 'epoch': 0.7} + 70%|██████▉ | 4022/5772 [3:13:01<2:57:46, 6.10s/it] {'loss': 0.4477, 'learning_rate': 4.447214386563145e-06, 'epoch': 0.7} + 70%|██████▉ | 4022/5772 [3:12:59<2:57:46, 6.10s/it] 70%|██████▉ | 4023/5772 [3:13:07<2:56:47, 6.07s/it] 70%|██████▉ | 4023/5772 [3:13:05<2:56:48, 6.07s/it] {'loss': 0.4694, 'learning_rate': 4.442547967801314e-06, 'epoch': 0.7} + 70%|██████▉ | 4023/5772 [3:13:07<2:56:47, 6.07s/it] {'loss': 0.4694, 'learning_rate': 4.442547967801314e-06, 'epoch': 0.7} + 70%|██████▉ | 4023/5772 [3:13:05<2:56:48, 6.07s/it] 70%|██████▉ | 4024/5772 [3:13:13<2:58:09, 6.12s/it] 70%|██████▉ | 4024/5772 [3:13:11<2:58:09, 6.12s/it] {'loss': 0.4691, 'learning_rate': 4.437883299328097e-06, 'epoch': 0.7} + 70%|██████▉ | 4024/5772 [3:13:13<2:58:09, 6.12s/it] {'loss': 0.4691, 'learning_rate': 4.437883299328097e-06, 'epoch': 0.7} + 70%|██████▉ | 4024/5772 [3:13:11<2:58:09, 6.12s/it] 70%|██████▉ | 4025/5772 [3:13:20<3:00:28, 6.20s/it] 70%|██████▉ | 4025/5772 [3:13:18<3:00:28, 6.20s/it] {'loss': 0.4572, 'learning_rate': 4.433220382612614e-06, 'epoch': 0.7} + 70%|██████▉ | 4025/5772 [3:13:20<3:00:28, 6.20s/it] {'loss': 0.4572, 'learning_rate': 4.433220382612614e-06, 'epoch': 0.7} + 70%|██████▉ | 4025/5772 [3:13:18<3:00:28, 6.20s/it] 70%|██████▉ | 4026/5772 [3:13:23<2:56:55, 6.08s/it] 70%|██████▉ | 4026/5772 [3:13:25<2:56:56, 6.08s/it] {'loss': 0.4435, 'learning_rate': 4.4285592191234125e-06, 'epoch': 0.7} + 70%|██████▉ | 4026/5772 [3:13:25<2:56:56, 6.08s/it] {'loss': 0.4435, 'learning_rate': 4.4285592191234125e-06, 'epoch': 0.7} + 70%|██████▉ | 4026/5772 [3:13:23<2:56:55, 6.08s/it] 70%|██████▉ | 4027/5772 [3:13:30<2:57:30, 6.10s/it] 70%|██████▉ | 4027/5772 [3:13:32<2:57:30, 6.10s/it] {'loss': 0.4609, 'learning_rate': 4.423899810328512e-06, 'epoch': 0.7} + {'loss': 0.4609, 'learning_rate': 4.423899810328512e-06, 'epoch': 0.7} 70%|██████▉ | 4027/5772 [3:13:32<2:57:30, 6.10s/it] + 70%|██████▉ | 4027/5772 [3:13:30<2:57:30, 6.10s/it] 70%|██████▉ | 4028/5772 [3:13:36<2:57:20, 6.10s/it] 70%|██████▉ | 4028/5772 [3:13:38<2:57:20, 6.10s/it] {'loss': 0.4541, 'learning_rate': 4.419242157695364e-06, 'epoch': 0.7} + 70%|██████▉ | 4028/5772 [3:13:38<2:57:20, 6.10s/it] {'loss': 0.4541, 'learning_rate': 4.419242157695364e-06, 'epoch': 0.7} + 70%|██████▉ | 4028/5772 [3:13:36<2:57:20, 6.10s/it] 70%|██████▉ | 4029/5772 [3:13:42<2:58:45, 6.15s/it] 70%|██████▉ | 4029/5772 [3:13:44<2:58:45, 6.15s/it] {'loss': 0.4774, 'learning_rate': 4.4145862626908684e-06, 'epoch': 0.7} + 70%|██████▉ | 4029/5772 [3:13:44<2:58:45, 6.15s/it] {'loss': 0.4774, 'learning_rate': 4.4145862626908684e-06, 'epoch': 0.7} + 70%|██████▉ | 4029/5772 [3:13:42<2:58:45, 6.15s/it] 70%|██████▉ | 4030/5772 [3:13:48<2:55:42, 6.05s/it] 70%|██████▉ | 4030/5772 [3:13:50<2:55:43, 6.05s/it] {'loss': 0.4526, 'learning_rate': 4.409932126781373e-06, 'epoch': 0.7} + 70%|██████▉ | 4030/5772 [3:13:50<2:55:43, 6.05s/it] {'loss': 0.4526, 'learning_rate': 4.409932126781373e-06, 'epoch': 0.7} + 70%|██████▉ | 4030/5772 [3:13:48<2:55:42, 6.05s/it] 70%|██████▉ | 4031/5772 [3:13:54<2:52:40, 5.95s/it] 70%|██████▉ | 4031/5772 [3:13:56<2:52:40, 5.95s/it] {'loss': 0.4674, 'learning_rate': 4.405279751432674e-06, 'epoch': 0.7} + 70%|██████▉ | 4031/5772 [3:13:56<2:52:40, 5.95s/it] {'loss': 0.4674, 'learning_rate': 4.405279751432674e-06, 'epoch': 0.7} + 70%|██████▉ | 4031/5772 [3:13:54<2:52:40, 5.95s/it] 70%|██████▉ | 4032/5772 [3:14:00<2:55:19, 6.05s/it] 70%|██████▉ | 4032/5772 [3:14:02<2:55:19, 6.05s/it] {'loss': 0.4585, 'learning_rate': 4.400629138110014e-06, 'epoch': 0.7} + 70%|██████▉ | 4032/5772 [3:14:02<2:55:19, 6.05s/it] {'loss': 0.4585, 'learning_rate': 4.400629138110014e-06, 'epoch': 0.7} + 70%|██████▉ | 4032/5772 [3:14:00<2:55:19, 6.05s/it] 70%|██████▉ | 4033/5772 [3:14:06<2:57:06, 6.11s/it] 70%|██████▉ | 4033/5772 [3:14:08<2:57:06, 6.11s/it] {'loss': 0.4693, 'learning_rate': 4.395980288278067e-06, 'epoch': 0.7} + 70%|██████▉ | 4033/5772 [3:14:08<2:57:06, 6.11s/it] {'loss': 0.4693, 'learning_rate': 4.395980288278067e-06, 'epoch': 0.7} + 70%|██████▉ | 4033/5772 [3:14:06<2:57:06, 6.11s/it] 70%|██████▉ | 4034/5772 [3:14:12<2:57:48, 6.14s/it] 70%|██████▉ | 4034/5772 [3:14:14<2:57:47, 6.14s/it] {'loss': 0.4503, 'learning_rate': 4.391333203400974e-06, 'epoch': 0.7} + 70%|██████▉ | 4034/5772 [3:14:14<2:57:47, 6.14s/it] {'loss': 0.4503, 'learning_rate': 4.391333203400974e-06, 'epoch': 0.7} + 70%|██████▉ | 4034/5772 [3:14:12<2:57:48, 6.14s/it] 70%|██████▉ | 4035/5772 [3:14:18<2:55:24, 6.06s/it] 70%|██████▉ | 4035/5772 [3:14:20<2:55:24, 6.06s/it] {'loss': 0.4735, 'learning_rate': 4.386687884942307e-06, 'epoch': 0.7} + 70%|██████▉ | 4035/5772 [3:14:20<2:55:24, 6.06s/it] {'loss': 0.4735, 'learning_rate': 4.386687884942307e-06, 'epoch': 0.7} + 70%|██████▉ | 4035/5772 [3:14:18<2:55:24, 6.06s/it] 70%|██████▉ | 4036/5772 [3:14:25<2:59:21, 6.20s/it] 70%|██████▉ | 4036/5772 [3:14:27<2:59:21, 6.20s/it] {'loss': 0.4385, 'learning_rate': 4.382044334365078e-06, 'epoch': 0.7} + 70%|██████▉ | 4036/5772 [3:14:27<2:59:21, 6.20s/it] {'loss': 0.4385, 'learning_rate': 4.382044334365078e-06, 'epoch': 0.7} + 70%|██████▉ | 4036/5772 [3:14:25<2:59:21, 6.20s/it] 70%|██████▉ | 4037/5772 [3:14:31<2:58:52, 6.19s/it] 70%|██████▉ | 4037/5772 [3:14:33<2:58:52, 6.19s/it] {'loss': 0.4545, 'learning_rate': 4.3774025531317476e-06, 'epoch': 0.7} + 70%|██████▉ | 4037/5772 [3:14:33<2:58:52, 6.19s/it] {'loss': 0.4545, 'learning_rate': 4.3774025531317476e-06, 'epoch': 0.7} + 70%|██████▉ | 4037/5772 [3:14:31<2:58:52, 6.19s/it] 70%|██████▉ | 4038/5772 [3:14:37<2:55:43, 6.08s/it] 70%|██████▉ | 4038/5772 [3:14:39<2:55:43, 6.08s/it] {'loss': 0.4537, 'learning_rate': 4.372762542704223e-06, 'epoch': 0.7} + 70%|██████▉ | 4038/5772 [3:14:39<2:55:43, 6.08s/it] {'loss': 0.4537, 'learning_rate': 4.372762542704223e-06, 'epoch': 0.7} + 70%|██████▉ | 4038/5772 [3:14:37<2:55:43, 6.08s/it] 70%|██████▉ | 4039/5772 [3:14:43<2:58:21, 6.18s/it] 70%|██████▉ | 4039/5772 [3:14:45<2:58:21, 6.18s/it] {'loss': 0.4692, 'learning_rate': 4.368124304543852e-06, 'epoch': 0.7} + 70%|██████▉ | 4039/5772 [3:14:45<2:58:21, 6.18s/it] {'loss': 0.4692, 'learning_rate': 4.368124304543852e-06, 'epoch': 0.7} + 70%|██████▉ | 4039/5772 [3:14:43<2:58:21, 6.18s/it] 70%|██████▉ | 4040/5772 [3:14:49<2:58:27, 6.18s/it] 70%|██████▉ | 4040/5772 [3:14:51<2:58:27, 6.18s/it] {'loss': 0.4754, 'learning_rate': 4.363487840111413e-06, 'epoch': 0.7} + 70%|██████▉ | 4040/5772 [3:14:51<2:58:27, 6.18s/it] {'loss': 0.4754, 'learning_rate': 4.363487840111413e-06, 'epoch': 0.7} + 70%|██████▉ | 4040/5772 [3:14:49<2:58:27, 6.18s/it] 70%|███████ | 4041/5772 [3:14:55<2:58:23, 6.18s/it] 70%|███████ | 4041/5772 [3:14:57<2:58:23, 6.18s/it] {'loss': 0.4589, 'learning_rate': 4.358853150867137e-06, 'epoch': 0.7} + 70%|███████ | 4041/5772 [3:14:57<2:58:23, 6.18s/it] {'loss': 0.4589, 'learning_rate': 4.358853150867137e-06, 'epoch': 0.7} + 70%|███████ | 4041/5772 [3:14:55<2:58:23, 6.18s/it] 70%|███████ | 4042/5772 [3:15:02<2:57:39, 6.16s/it] 70%|███████ | 4042/5772 [3:15:04<2:57:39, 6.16s/it] {'loss': 0.4623, 'learning_rate': 4.354220238270705e-06, 'epoch': 0.7} + 70%|███████ | 4042/5772 [3:15:04<2:57:39, 6.16s/it] {'loss': 0.4623, 'learning_rate': 4.354220238270705e-06, 'epoch': 0.7} + 70%|███████ | 4042/5772 [3:15:02<2:57:39, 6.16s/it] 70%|███████ | 4043/5772 [3:15:08<2:58:48, 6.21s/it] 70%|███████ | 4043/5772 [3:15:10<2:58:48, 6.21s/it] {'loss': 0.4689, 'learning_rate': 4.349589103781212e-06, 'epoch': 0.7} + 70%|███████ | 4043/5772 [3:15:10<2:58:48, 6.21s/it] {'loss': 0.4689, 'learning_rate': 4.349589103781212e-06, 'epoch': 0.7} + 70%|███████ | 4043/5772 [3:15:08<2:58:48, 6.21s/it] 70%|███████ | 4044/5772 [3:15:14<3:00:54, 6.28s/it] 70%|███████ | 4044/5772 [3:15:16<3:00:54, 6.28s/it] {'loss': 0.4627, 'learning_rate': 4.344959748857215e-06, 'epoch': 0.7} + 70%|███████ | 4044/5772 [3:15:16<3:00:54, 6.28s/it] {'loss': 0.4627, 'learning_rate': 4.344959748857215e-06, 'epoch': 0.7} + 70%|███████ | 4044/5772 [3:15:14<3:00:54, 6.28s/it] 70%|███████ | 4045/5772 [3:15:21<3:04:28, 6.41s/it] 70%|███████ | 4045/5772 [3:15:23<3:04:28, 6.41s/it] {'loss': 0.4543, 'learning_rate': 4.340332174956703e-06, 'epoch': 0.7} + 70%|███████ | 4045/5772 [3:15:23<3:04:28, 6.41s/it] {'loss': 0.4543, 'learning_rate': 4.340332174956703e-06, 'epoch': 0.7} + 70%|███████ | 4045/5772 [3:15:21<3:04:28, 6.41s/it] 70%|███████ | 4046/5772 [3:15:27<3:03:44, 6.39s/it] 70%|███████ | 4046/5772 [3:15:29<3:03:44, 6.39s/it] {'loss': 0.4541, 'learning_rate': 4.335706383537109e-06, 'epoch': 0.7} + 70%|███████ | 4046/5772 [3:15:29<3:03:44, 6.39s/it] {'loss': 0.4541, 'learning_rate': 4.335706383537109e-06, 'epoch': 0.7} + 70%|███████ | 4046/5772 [3:15:27<3:03:44, 6.39s/it] 70%|███████ | 4047/5772 [3:15:35<3:00:33, 6.28s/it] 70%|███████ | 4047/5772 [3:15:33<3:00:33, 6.28s/it] {'loss': 0.4708, 'learning_rate': 4.331082376055292e-06, 'epoch': 0.7} + 70%|███████ | 4047/5772 [3:15:35<3:00:33, 6.28s/it] {'loss': 0.4708, 'learning_rate': 4.331082376055292e-06, 'epoch': 0.7} + 70%|███████ | 4047/5772 [3:15:33<3:00:33, 6.28s/it] 70%|███████ | 4048/5772 [3:15:40<3:01:13, 6.31s/it] 70%|███████ | 4048/5772 [3:15:42<3:01:14, 6.31s/it] {'loss': 0.4497, 'learning_rate': 4.326460153967558e-06, 'epoch': 0.7} + 70%|███████ | 4048/5772 [3:15:42<3:01:14, 6.31s/it] {'loss': 0.4497, 'learning_rate': 4.326460153967558e-06, 'epoch': 0.7} + 70%|███████ | 4048/5772 [3:15:40<3:01:13, 6.31s/it] 70%|███████ | 4049/5772 [3:15:46<2:59:59, 6.27s/it] 70%|███████ | 4049/5772 [3:15:48<2:59:59, 6.27s/it] {'loss': 0.477, 'learning_rate': 4.32183971872966e-06, 'epoch': 0.7} + 70%|███████ | 4049/5772 [3:15:48<2:59:59, 6.27s/it] {'loss': 0.477, 'learning_rate': 4.32183971872966e-06, 'epoch': 0.7} + 70%|███████ | 4049/5772 [3:15:46<2:59:59, 6.27s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 70%|███████ | 4050/5772 [3:15:54<3:00:52, 6.30s/it]6 AutoResumeHook: Checking whether to suspend... + 70%|███████ | 4050/5772 [3:15:52<3:00:52, 6.30s/it]12 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4651, 'learning_rate': 4.317221071796768e-06, 'epoch': 0.7} + 70%|███████ | 4050/5772 [3:15:54<3:00:52, 6.30s/it] {'loss': 0.4651, 'learning_rate': 4.317221071796768e-06, 'epoch': 0.7} + 70%|███████ | 4050/5772 [3:15:52<3:00:52, 6.30s/it] 70%|███████ | 4051/5772 [3:15:58<2:56:56, 6.17s/it] 70%|███████ | 4051/5772 [3:16:00<2:56:56, 6.17s/it] {'loss': 0.4612, 'learning_rate': 4.312604214623504e-06, 'epoch': 0.7} + 70%|███████ | 4051/5772 [3:16:00<2:56:56, 6.17s/it] {'loss': 0.4612, 'learning_rate': 4.312604214623504e-06, 'epoch': 0.7} + 70%|███████ | 4051/5772 [3:15:58<2:56:56, 6.17s/it] 70%|███████ | 4052/5772 [3:16:04<2:57:59, 6.21s/it] 70%|███████ | 4052/5772 [3:16:06<2:57:59, 6.21s/it] {'loss': 0.4646, 'learning_rate': 4.307989148663921e-06, 'epoch': 0.7} + 70%|███████ | 4052/5772 [3:16:06<2:57:59, 6.21s/it] {'loss': 0.4646, 'learning_rate': 4.307989148663921e-06, 'epoch': 0.7} + 70%|███████ | 4052/5772 [3:16:04<2:57:59, 6.21s/it] 70%|███████ | 4053/5772 [3:16:11<2:57:47, 6.21s/it] 70%|███████ | 4053/5772 [3:16:13<2:57:47, 6.21s/it] {'loss': 0.4634, 'learning_rate': 4.3033758753715095e-06, 'epoch': 0.7} + 70%|███████ | 4053/5772 [3:16:13<2:57:47, 6.21s/it] {'loss': 0.4634, 'learning_rate': 4.3033758753715095e-06, 'epoch': 0.7} + 70%|███████ | 4053/5772 [3:16:11<2:57:47, 6.21s/it] 70%|███████ | 4054/5772 [3:16:17<2:54:47, 6.10s/it] 70%|███████ | 4054/5772 [3:16:19<2:54:47, 6.10s/it] {'loss': 0.4535, 'learning_rate': 4.298764396199191e-06, 'epoch': 0.7} + 70%|███████ | 4054/5772 [3:16:19<2:54:47, 6.10s/it] {'loss': 0.4535, 'learning_rate': 4.298764396199191e-06, 'epoch': 0.7} + 70%|███████ | 4054/5772 [3:16:17<2:54:47, 6.10s/it] 70%|███████ | 4055/5772 [3:16:23<2:57:03, 6.19s/it] 70%|███████ | 4055/5772 [3:16:25<2:57:03, 6.19s/it] {'loss': 0.4565, 'learning_rate': 4.294154712599325e-06, 'epoch': 0.7} + 70%|███████ | 4055/5772 [3:16:25<2:57:03, 6.19s/it] {'loss': 0.4565, 'learning_rate': 4.294154712599325e-06, 'epoch': 0.7} + 70%|███████ | 4055/5772 [3:16:23<2:57:03, 6.19s/it] 70%|███████ | 4056/5772 [3:16:31<2:56:29, 6.17s/it] 70%|███████ | 4056/5772 [3:16:29<2:56:30, 6.17s/it] {'loss': 0.4544, 'learning_rate': 4.28954682602371e-06, 'epoch': 0.7} + 70%|███████ | 4056/5772 [3:16:31<2:56:29, 6.17s/it] {'loss': 0.4544, 'learning_rate': 4.28954682602371e-06, 'epoch': 0.7} + 70%|███████ | 4056/5772 [3:16:29<2:56:30, 6.17s/it] 70%|███████ | 4057/5772 [3:16:37<2:57:11, 6.20s/it] 70%|███████ | 4057/5772 [3:16:35<2:57:11, 6.20s/it] {'loss': 0.4611, 'learning_rate': 4.284940737923571e-06, 'epoch': 0.7} + 70%|███████ | 4057/5772 [3:16:37<2:57:11, 6.20s/it] {'loss': 0.4611, 'learning_rate': 4.284940737923571e-06, 'epoch': 0.7} + 70%|███████ | 4057/5772 [3:16:35<2:57:11, 6.20s/it] 70%|███████ | 4058/5772 [3:16:43<2:55:23, 6.14s/it] 70%|███████ | 4058/5772 [3:16:41<2:55:23, 6.14s/it] {'loss': 0.4682, 'learning_rate': 4.280336449749573e-06, 'epoch': 0.7} + 70%|███████ | 4058/5772 [3:16:43<2:55:23, 6.14s/it] {'loss': 0.4682, 'learning_rate': 4.280336449749573e-06, 'epoch': 0.7} + 70%|███████ | 4058/5772 [3:16:41<2:55:23, 6.14s/it] 70%|███████ | 4059/5772 [3:16:50<2:56:20, 6.18s/it] 70%|███████ | 4059/5772 [3:16:48<2:56:20, 6.18s/it] {'loss': 0.472, 'learning_rate': 4.275733962951804e-06, 'epoch': 0.7} + 70%|███████ | 4059/5772 [3:16:50<2:56:20, 6.18s/it] {'loss': 0.472, 'learning_rate': 4.275733962951804e-06, 'epoch': 0.7} + 70%|███████ | 4059/5772 [3:16:48<2:56:20, 6.18s/it] 70%|███████ | 4060/5772 [3:16:56<2:57:32, 6.22s/it] 70%|███████ | 4060/5772 [3:16:54<2:57:32, 6.22s/it] {'loss': 0.461, 'learning_rate': 4.271133278979802e-06, 'epoch': 0.7} + 70%|███████ | 4060/5772 [3:16:56<2:57:32, 6.22s/it] {'loss': 0.461, 'learning_rate': 4.271133278979802e-06, 'epoch': 0.7} + 70%|███████ | 4060/5772 [3:16:54<2:57:32, 6.22s/it] 70%|███████ | 4061/5772 [3:17:02<2:54:06, 6.11s/it] 70%|███████ | 4061/5772 [3:17:00<2:54:06, 6.11s/it] {'loss': 0.4797, 'learning_rate': 4.266534399282517e-06, 'epoch': 0.7} + 70%|███████ | 4061/5772 [3:17:02<2:54:06, 6.11s/it] {'loss': 0.4797, 'learning_rate': 4.266534399282517e-06, 'epoch': 0.7} + 70%|███████ | 4061/5772 [3:17:00<2:54:06, 6.11s/it] 70%|███████ | 4062/5772 [3:17:08<2:52:54, 6.07s/it] 70%|███████ | 4062/5772 [3:17:06<2:52:54, 6.07s/it] {'loss': 0.4607, 'learning_rate': 4.261937325308347e-06, 'epoch': 0.7} + 70%|███████ | 4062/5772 [3:17:08<2:52:54, 6.07s/it] {'loss': 0.4607, 'learning_rate': 4.261937325308347e-06, 'epoch': 0.7} + 70%|███████ | 4062/5772 [3:17:06<2:52:54, 6.07s/it] 70%|███████ | 4063/5772 [3:17:14<2:53:34, 6.09s/it] 70%|███████ | 4063/5772 [3:17:12<2:53:34, 6.09s/it] {'loss': 0.4616, 'learning_rate': 4.257342058505109e-06, 'epoch': 0.7} + 70%|███████ | 4063/5772 [3:17:14<2:53:34, 6.09s/it] {'loss': 0.4616, 'learning_rate': 4.257342058505109e-06, 'epoch': 0.7} + 70%|███████ | 4063/5772 [3:17:12<2:53:34, 6.09s/it] 70%|███████ | 4064/5772 [3:17:20<2:54:01, 6.11s/it] 70%|███████ | 4064/5772 [3:17:18<2:54:01, 6.11s/it] {'loss': 0.4599, 'learning_rate': 4.252748600320063e-06, 'epoch': 0.7} + 70%|███████ | 4064/5772 [3:17:20<2:54:01, 6.11s/it] {'loss': 0.4599, 'learning_rate': 4.252748600320063e-06, 'epoch': 0.7} + 70%|███████ | 4064/5772 [3:17:18<2:54:01, 6.11s/it] 70%|███████ | 4065/5772 [3:17:26<2:54:33, 6.14s/it] 70%|███████ | 4065/5772 [3:17:24<2:54:33, 6.14s/it] {'loss': 0.4682, 'learning_rate': 4.248156952199895e-06, 'epoch': 0.7} + 70%|███████ | 4065/5772 [3:17:26<2:54:33, 6.14s/it] {'loss': 0.4682, 'learning_rate': 4.248156952199895e-06, 'epoch': 0.7} + 70%|███████ | 4065/5772 [3:17:24<2:54:33, 6.14s/it] 70%|███████ | 4066/5772 [3:17:32<2:55:29, 6.17s/it] 70%|███████ | 4066/5772 [3:17:30<2:55:29, 6.17s/it] {'loss': 0.4599, 'learning_rate': 4.243567115590705e-06, 'epoch': 0.7} + 70%|███████ | 4066/5772 [3:17:32<2:55:29, 6.17s/it] {'loss': 0.4599, 'learning_rate': 4.243567115590705e-06, 'epoch': 0.7} + 70%|███████ | 4066/5772 [3:17:30<2:55:29, 6.17s/it] 70%|███████ | 4067/5772 [3:17:39<2:55:13, 6.17s/it] 70%|███████ | 4067/5772 [3:17:37<2:55:13, 6.17s/it] {'loss': 0.4683, 'learning_rate': 4.238979091938054e-06, 'epoch': 0.7} + 70%|███████ | 4067/5772 [3:17:39<2:55:13, 6.17s/it] {'loss': 0.4683, 'learning_rate': 4.238979091938054e-06, 'epoch': 0.7} + 70%|███████ | 4067/5772 [3:17:37<2:55:13, 6.17s/it] 70%|███████ | 4068/5772 [3:17:44<2:52:01, 6.06s/it] 70%|███████ | 4068/5772 [3:17:42<2:52:02, 6.06s/it] {'loss': 0.4566, 'learning_rate': 4.234392882686904e-06, 'epoch': 0.7} + 70%|███████ | 4068/5772 [3:17:44<2:52:01, 6.06s/it] {'loss': 0.4566, 'learning_rate': 4.234392882686904e-06, 'epoch': 0.7} + 70%|███████ | 4068/5772 [3:17:42<2:52:02, 6.06s/it] 70%|███████ | 4069/5772 [3:17:50<2:51:01, 6.03s/it] 70%|███████ | 4069/5772 [3:17:48<2:51:01, 6.03s/it] {'loss': 0.4679, 'learning_rate': 4.2298084892816574e-06, 'epoch': 0.7} + 70%|███████ | 4069/5772 [3:17:50<2:51:01, 6.03s/it] {'loss': 0.4679, 'learning_rate': 4.2298084892816574e-06, 'epoch': 0.7} + 70%|███████ | 4069/5772 [3:17:48<2:51:01, 6.03s/it] 71%|███████ | 4070/5772 [3:17:57<2:52:03, 6.07s/it] 71%|███████ | 4070/5772 [3:17:55<2:52:03, 6.07s/it] {'loss': 0.4593, 'learning_rate': 4.225225913166146e-06, 'epoch': 0.71} + 71%|███████ | 4070/5772 [3:17:57<2:52:03, 6.07s/it] {'loss': 0.4593, 'learning_rate': 4.225225913166146e-06, 'epoch': 0.71} + 71%|███████ | 4070/5772 [3:17:55<2:52:03, 6.07s/it] 71%|███████ | 4071/5772 [3:18:01<2:52:44, 6.09s/it] 71%|███████ | 4071/5772 [3:18:03<2:52:44, 6.09s/it] {'loss': 0.4572, 'learning_rate': 4.2206451557836235e-06, 'epoch': 0.71} + 71%|███████ | 4071/5772 [3:18:03<2:52:44, 6.09s/it] {'loss': 0.4572, 'learning_rate': 4.2206451557836235e-06, 'epoch': 0.71} + 71%|███████ | 4071/5772 [3:18:01<2:52:44, 6.09s/it] 71%|███████ | 4072/5772 [3:18:09<2:55:09, 6.18s/it] 71%|███████ | 4072/5772 [3:18:07<2:55:09, 6.18s/it] {'loss': 0.466, 'learning_rate': 4.2160662185767805e-06, 'epoch': 0.71} + 71%|███████ | 4072/5772 [3:18:09<2:55:09, 6.18s/it] {'loss': 0.466, 'learning_rate': 4.2160662185767805e-06, 'epoch': 0.71} + 71%|███████ | 4072/5772 [3:18:07<2:55:09, 6.18s/it] 71%|███████ | 4073/5772 [3:18:15<2:54:47, 6.17s/it] 71%|███████ | 4073/5772 [3:18:13<2:54:47, 6.17s/it] {'loss': 0.4592, 'learning_rate': 4.21148910298772e-06, 'epoch': 0.71} + 71%|███████ | 4073/5772 [3:18:15<2:54:47, 6.17s/it] {'loss': 0.4592, 'learning_rate': 4.21148910298772e-06, 'epoch': 0.71} + 71%|███████ | 4073/5772 [3:18:13<2:54:47, 6.17s/it] 71%|███████ | 4074/5772 [3:18:21<2:53:56, 6.15s/it] 71%|███████ | 4074/5772 [3:18:19<2:53:56, 6.15s/it] {'loss': 0.4481, 'learning_rate': 4.2069138104579825e-06, 'epoch': 0.71} + 71%|███████ | 4074/5772 [3:18:21<2:53:56, 6.15s/it] {'loss': 0.4481, 'learning_rate': 4.2069138104579825e-06, 'epoch': 0.71} + 71%|███████ | 4074/5772 [3:18:19<2:53:56, 6.15s/it] 71%|███████ | 4075/5772 [3:18:27<2:52:14, 6.09s/it] 71%|███████ | 4075/5772 [3:18:25<2:52:14, 6.09s/it] {'loss': 0.4599, 'learning_rate': 4.202340342428529e-06, 'epoch': 0.71} + 71%|███████ | 4075/5772 [3:18:27<2:52:14, 6.09s/it] {'loss': 0.4599, 'learning_rate': 4.202340342428529e-06, 'epoch': 0.71} + 71%|███████ | 4075/5772 [3:18:25<2:52:14, 6.09s/it] 71%|███████ | 4076/5772 [3:18:33<2:52:31, 6.10s/it] 71%|███████ | 4076/5772 [3:18:31<2:52:30, 6.10s/it] {'loss': 0.4645, 'learning_rate': 4.197768700339752e-06, 'epoch': 0.71} + 71%|███████ | 4076/5772 [3:18:33<2:52:31, 6.10s/it] {'loss': 0.4645, 'learning_rate': 4.197768700339752e-06, 'epoch': 0.71} + 71%|███████ | 4076/5772 [3:18:31<2:52:30, 6.10s/it] 71%|███████ | 4077/5772 [3:18:40<2:54:59, 6.19s/it] 71%|███████ | 4077/5772 [3:18:38<2:54:59, 6.19s/it] {'loss': 0.4528, 'learning_rate': 4.19319888563146e-06, 'epoch': 0.71} + 71%|███████ | 4077/5772 [3:18:40<2:54:59, 6.19s/it] {'loss': 0.4528, 'learning_rate': 4.19319888563146e-06, 'epoch': 0.71} + 71%|███████ | 4077/5772 [3:18:38<2:54:59, 6.19s/it] 71%|███████ | 4078/5772 [3:18:44<2:54:38, 6.19s/it] 71%|███████ | 4078/5772 [3:18:46<2:54:38, 6.19s/it] {'loss': 0.4721, 'learning_rate': 4.188630899742894e-06, 'epoch': 0.71} + 71%|███████ | 4078/5772 [3:18:46<2:54:38, 6.19s/it] {'loss': 0.4721, 'learning_rate': 4.188630899742894e-06, 'epoch': 0.71} + 71%|███████ | 4078/5772 [3:18:44<2:54:38, 6.19s/it] 71%|███████ | 4079/5772 [3:18:52<2:52:56, 6.13s/it] 71%|███████ | 4079/5772 [3:18:50<2:52:56, 6.13s/it] {'loss': 0.4717, 'learning_rate': 4.184064744112718e-06, 'epoch': 0.71} + 71%|███████ | 4079/5772 [3:18:52<2:52:56, 6.13s/it] {'loss': 0.4717, 'learning_rate': 4.184064744112718e-06, 'epoch': 0.71} + 71%|███████ | 4079/5772 [3:18:50<2:52:56, 6.13s/it] 71%|███████ | 4080/5772 [3:18:58<2:52:13, 6.11s/it] 71%|███████ | 4080/5772 [3:18:56<2:52:13, 6.11s/it] {'loss': 0.4654, 'learning_rate': 4.179500420179011e-06, 'epoch': 0.71} + 71%|███████ | 4080/5772 [3:18:58<2:52:13, 6.11s/it] {'loss': 0.4654, 'learning_rate': 4.179500420179011e-06, 'epoch': 0.71} + 71%|███████ | 4080/5772 [3:18:56<2:52:13, 6.11s/it] 71%|███████ | 4081/5772 [3:19:04<2:50:21, 6.04s/it] 71%|███████ | 4081/5772 [3:19:02<2:50:21, 6.04s/it] {'loss': 0.4483, 'learning_rate': 4.174937929379285e-06, 'epoch': 0.71} + 71%|███████ | 4081/5772 [3:19:04<2:50:21, 6.04s/it] {'loss': 0.4483, 'learning_rate': 4.174937929379285e-06, 'epoch': 0.71} + 71%|███████ | 4081/5772 [3:19:02<2:50:21, 6.04s/it] 71%|███████ | 4082/5772 [3:19:10<2:53:19, 6.15s/it] 71%|███████ | 4082/5772 [3:19:08<2:53:19, 6.15s/it] {'loss': 0.4791, 'learning_rate': 4.17037727315047e-06, 'epoch': 0.71} + 71%|███████ | 4082/5772 [3:19:10<2:53:19, 6.15s/it] {'loss': 0.4791, 'learning_rate': 4.17037727315047e-06, 'epoch': 0.71} + 71%|███████ | 4082/5772 [3:19:08<2:53:19, 6.15s/it] 71%|███████ | 4083/5772 [3:19:17<2:55:47, 6.24s/it] 71%|███████ | 4083/5772 [3:19:15<2:55:47, 6.24s/it] {'loss': 0.4593, 'learning_rate': 4.16581845292892e-06, 'epoch': 0.71} + 71%|███████ | 4083/5772 [3:19:17<2:55:47, 6.24s/it] {'loss': 0.4593, 'learning_rate': 4.16581845292892e-06, 'epoch': 0.71} + 71%|███████ | 4083/5772 [3:19:15<2:55:47, 6.24s/it] 71%|███████ | 4084/5772 [3:19:23<2:53:00, 6.15s/it] 71%|███████ | 4084/5772 [3:19:21<2:53:00, 6.15s/it] {'loss': 0.4577, 'learning_rate': 4.161261470150414e-06, 'epoch': 0.71} + 71%|███████ | 4084/5772 [3:19:23<2:53:00, 6.15s/it] {'loss': 0.4577, 'learning_rate': 4.161261470150414e-06, 'epoch': 0.71} + 71%|███████ | 4084/5772 [3:19:21<2:53:00, 6.15s/it] 71%|███████ | 4085/5772 [3:19:27<2:53:17, 6.16s/it] 71%|███████ | 4085/5772 [3:19:29<2:53:17, 6.16s/it] {'loss': 0.4593, 'learning_rate': 4.156706326250137e-06, 'epoch': 0.71} + 71%|███████ | 4085/5772 [3:19:29<2:53:17, 6.16s/it] {'loss': 0.4593, 'learning_rate': 4.156706326250137e-06, 'epoch': 0.71} + 71%|███████ | 4085/5772 [3:19:27<2:53:17, 6.16s/it] 71%|███████ | 4086/5772 [3:19:33<2:53:53, 6.19s/it] 71%|███████ | 4086/5772 [3:19:35<2:53:53, 6.19s/it] {'loss': 0.4555, 'learning_rate': 4.15215302266272e-06, 'epoch': 0.71} + 71%|███████ | 4086/5772 [3:19:35<2:53:53, 6.19s/it] {'loss': 0.4555, 'learning_rate': 4.15215302266272e-06, 'epoch': 0.71} + 71%|███████ | 4086/5772 [3:19:33<2:53:53, 6.19s/it] 71%|███████ | 4087/5772 [3:19:41<2:52:46, 6.15s/it] 71%|███████ | 4087/5772 [3:19:39<2:52:46, 6.15s/it] {'loss': 0.4354, 'learning_rate': 4.147601560822192e-06, 'epoch': 0.71} + 71%|███████ | 4087/5772 [3:19:41<2:52:46, 6.15s/it] {'loss': 0.4354, 'learning_rate': 4.147601560822192e-06, 'epoch': 0.71} + 71%|███████ | 4087/5772 [3:19:39<2:52:46, 6.15s/it] 71%|███████ | 4088/5772 [3:19:47<2:52:22, 6.14s/it] 71%|███████ | 4088/5772 [3:19:45<2:52:22, 6.14s/it] {'loss': 0.475, 'learning_rate': 4.143051942162013e-06, 'epoch': 0.71} + 71%|███████ | 4088/5772 [3:19:47<2:52:22, 6.14s/it] {'loss': 0.475, 'learning_rate': 4.143051942162013e-06, 'epoch': 0.71} + 71%|███████ | 4088/5772 [3:19:45<2:52:22, 6.14s/it] 71%|███████ | 4089/5772 [3:19:54<2:52:42, 6.16s/it] 71%|███████ | 4089/5772 [3:19:52<2:52:42, 6.16s/it] {'loss': 0.451, 'learning_rate': 4.138504168115059e-06, 'epoch': 0.71} + 71%|███████ | 4089/5772 [3:19:54<2:52:42, 6.16s/it] {'loss': 0.451, 'learning_rate': 4.138504168115059e-06, 'epoch': 0.71} + 71%|███████ | 4089/5772 [3:19:52<2:52:42, 6.16s/it] 71%|███████ | 4090/5772 [3:19:58<2:51:52, 6.13s/it] 71%|███████ | 4090/5772 [3:20:00<2:51:53, 6.13s/it] {'loss': 0.461, 'learning_rate': 4.133958240113629e-06, 'epoch': 0.71} + 71%|███████ | 4090/5772 [3:20:00<2:51:53, 6.13s/it] {'loss': 0.461, 'learning_rate': 4.133958240113629e-06, 'epoch': 0.71} + 71%|███████ | 4090/5772 [3:19:58<2:51:52, 6.13s/it] 71%|███████ | 4091/5772 [3:20:04<2:52:30, 6.16s/it] 71%|███████ | 4091/5772 [3:20:06<2:52:30, 6.16s/it] {'loss': 0.4575, 'learning_rate': 4.129414159589438e-06, 'epoch': 0.71} + 71%|███████ | 4091/5772 [3:20:06<2:52:30, 6.16s/it] {'loss': 0.4575, 'learning_rate': 4.129414159589438e-06, 'epoch': 0.71} + 71%|███████ | 4091/5772 [3:20:04<2:52:30, 6.16s/it] 71%|███████ | 4092/5772 [3:20:12<2:52:49, 6.17s/it] 71%|███████ | 4092/5772 [3:20:10<2:52:49, 6.17s/it] {'loss': 0.4669, 'learning_rate': 4.124871927973611e-06, 'epoch': 0.71} + 71%|███████ | 4092/5772 [3:20:12<2:52:49, 6.17s/it] {'loss': 0.4669, 'learning_rate': 4.124871927973611e-06, 'epoch': 0.71} + 71%|███████ | 4092/5772 [3:20:10<2:52:49, 6.17s/it] 71%|███████ | 4093/5772 [3:20:18<2:49:24, 6.05s/it] 71%|███████ | 4093/5772 [3:20:16<2:49:24, 6.05s/it] {'loss': 0.4473, 'learning_rate': 4.120331546696711e-06, 'epoch': 0.71} + 71%|███████ | 4093/5772 [3:20:18<2:49:24, 6.05s/it] {'loss': 0.4473, 'learning_rate': 4.120331546696711e-06, 'epoch': 0.71} + 71%|███████ | 4093/5772 [3:20:16<2:49:24, 6.05s/it] 71%|███████ | 4094/5772 [3:20:24<2:50:01, 6.08s/it] 71%|███████ | 4094/5772 [3:20:22<2:50:01, 6.08s/it] {'loss': 0.4759, 'learning_rate': 4.115793017188695e-06, 'epoch': 0.71} + 71%|███████ | 4094/5772 [3:20:24<2:50:01, 6.08s/it] {'loss': 0.4759, 'learning_rate': 4.115793017188695e-06, 'epoch': 0.71} + 71%|███████ | 4094/5772 [3:20:22<2:50:01, 6.08s/it] 71%|███████ | 4095/5772 [3:20:30<2:51:43, 6.14s/it] 71%|███████ | 4095/5772 [3:20:28<2:51:43, 6.14s/it] {'loss': 0.4549, 'learning_rate': 4.111256340878952e-06, 'epoch': 0.71} + 71%|███████ | 4095/5772 [3:20:30<2:51:43, 6.14s/it] {'loss': 0.4549, 'learning_rate': 4.111256340878952e-06, 'epoch': 0.71} + 71%|███████ | 4095/5772 [3:20:28<2:51:43, 6.14s/it] 71%|███████ | 4096/5772 [3:20:34<2:50:39, 6.11s/it] 71%|███████ | 4096/5772 [3:20:36<2:50:39, 6.11s/it] {'loss': 0.4605, 'learning_rate': 4.106721519196284e-06, 'epoch': 0.71} + 71%|███████ | 4096/5772 [3:20:36<2:50:39, 6.11s/it] {'loss': 0.4605, 'learning_rate': 4.106721519196284e-06, 'epoch': 0.71} + 71%|███████ | 4096/5772 [3:20:34<2:50:39, 6.11s/it] 71%|███████ | 4097/5772 [3:20:42<2:49:22, 6.07s/it] 71%|███████ | 4097/5772 [3:20:40<2:49:22, 6.07s/it] {'loss': 0.4536, 'learning_rate': 4.102188553568905e-06, 'epoch': 0.71} + 71%|███████ | 4097/5772 [3:20:42<2:49:22, 6.07s/it] {'loss': 0.4536, 'learning_rate': 4.102188553568905e-06, 'epoch': 0.71} + 71%|███████ | 4097/5772 [3:20:40<2:49:22, 6.07s/it] 71%|███████ | 4098/5772 [3:20:48<2:48:24, 6.04s/it] 71%|███████ | 4098/5772 [3:20:46<2:48:24, 6.04s/it] {'loss': 0.4647, 'learning_rate': 4.097657445424454e-06, 'epoch': 0.71} + 71%|███████ | 4098/5772 [3:20:48<2:48:24, 6.04s/it] {'loss': 0.4647, 'learning_rate': 4.097657445424454e-06, 'epoch': 0.71} + 71%|███████ | 4098/5772 [3:20:46<2:48:24, 6.04s/it] 71%|███████ | 4099/5772 [3:20:54<2:48:22, 6.04s/it] 71%|███████ | 4099/5772 [3:20:52<2:48:22, 6.04s/it] {'loss': 0.4607, 'learning_rate': 4.093128196189971e-06, 'epoch': 0.71} + 71%|███████ | 4099/5772 [3:20:54<2:48:22, 6.04s/it] {'loss': 0.4607, 'learning_rate': 4.093128196189971e-06, 'epoch': 0.71} + 71%|███████ | 4099/5772 [3:20:52<2:48:22, 6.04s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +71 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend...14 + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +0 71%|███████ | 4100/5772 [3:21:01<2:50:22, 6.11s/it]4 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 71%|███████ | 4100/5772 [3:20:59<2:50:23, 6.11s/it] {'loss': 0.4582, 'learning_rate': 4.088600807291918e-06, 'epoch': 0.71} + 71%|███████ | 4100/5772 [3:21:01<2:50:22, 6.11s/it] {'loss': 0.4582, 'learning_rate': 4.088600807291918e-06, 'epoch': 0.71} + 71%|███████ | 4100/5772 [3:20:59<2:50:23, 6.11s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 71%|███████ | 4101/5772 [3:21:23<5:07:52, 11.06s/it] 71%|███████ | 4101/5772 [3:21:21<5:07:52, 11.06s/it] {'loss': 0.4684, 'learning_rate': 4.084075280156175e-06, 'epoch': 0.71} + 71%|███████ | 4101/5772 [3:21:23<5:07:52, 11.06s/it] {'loss': 0.4684, 'learning_rate': 4.084075280156175e-06, 'epoch': 0.71} + 71%|███████ | 4101/5772 [3:21:21<5:07:52, 11.06s/it] 71%|███████ | 4102/5772 [3:21:29<4:26:28, 9.57s/it] 71%|███████ | 4102/5772 [3:21:27<4:26:28, 9.57s/it] {'loss': 0.4605, 'learning_rate': 4.079551616208032e-06, 'epoch': 0.71} + 71%|███████ | 4102/5772 [3:21:29<4:26:28, 9.57s/it] {'loss': 0.4605, 'learning_rate': 4.079551616208032e-06, 'epoch': 0.71} + 71%|███████ | 4102/5772 [3:21:27<4:26:28, 9.57s/it] 71%|███████ | 4103/5772 [3:21:35<3:58:02, 8.56s/it] 71%|███████ | 4103/5772 [3:21:33<3:58:02, 8.56s/it] {'loss': 0.4753, 'learning_rate': 4.075029816872183e-06, 'epoch': 0.71} + 71%|███████ | 4103/5772 [3:21:35<3:58:02, 8.56s/it] {'loss': 0.4753, 'learning_rate': 4.075029816872183e-06, 'epoch': 0.71} + 71%|███████ | 4103/5772 [3:21:33<3:58:02, 8.56s/it] 71%|███████ | 4104/5772 [3:21:42<3:38:01, 7.84s/it] 71%|███████ | 4104/5772 [3:21:40<3:38:01, 7.84s/it] {'loss': 0.4575, 'learning_rate': 4.070509883572754e-06, 'epoch': 0.71} + 71%|███████ | 4104/5772 [3:21:42<3:38:01, 7.84s/it] {'loss': 0.4575, 'learning_rate': 4.070509883572754e-06, 'epoch': 0.71} + 71%|███████ | 4104/5772 [3:21:40<3:38:01, 7.84s/it] 71%|███████ | 4105/5772 [3:21:46<3:24:09, 7.35s/it] 71%|███████ | 4105/5772 [3:21:48<3:24:09, 7.35s/it] {'loss': 0.4594, 'learning_rate': 4.065991817733272e-06, 'epoch': 0.71} + 71%|███████ | 4105/5772 [3:21:48<3:24:09, 7.35s/it] {'loss': 0.4594, 'learning_rate': 4.065991817733272e-06, 'epoch': 0.71} + 71%|███████ | 4105/5772 [3:21:46<3:24:09, 7.35s/it] 71%|███████ | 4106/5772 [3:21:52<3:12:16, 6.92s/it] 71%|███████ | 4106/5772 [3:21:54<3:12:16, 6.92s/it] {'loss': 0.4531, 'learning_rate': 4.061475620776672e-06, 'epoch': 0.71} + 71%|███████ | 4106/5772 [3:21:54<3:12:16, 6.92s/it] {'loss': 0.4531, 'learning_rate': 4.061475620776672e-06, 'epoch': 0.71} + 71%|███████ | 4106/5772 [3:21:52<3:12:16, 6.92s/it] 71%|███████ | 4107/5772 [3:21:58<3:05:28, 6.68s/it] 71%|███████ | 4107/5772 [3:22:00<3:05:28, 6.68s/it] {'loss': 0.459, 'learning_rate': 4.056961294125305e-06, 'epoch': 0.71} + 71%|███████ | 4107/5772 [3:22:00<3:05:28, 6.68s/it] {'loss': 0.459, 'learning_rate': 4.056961294125305e-06, 'epoch': 0.71} + 71%|███████ | 4107/5772 [3:21:58<3:05:28, 6.68s/it] 71%|███████ | 4108/5772 [3:22:04<3:00:28, 6.51s/it] 71%|███████ | 4108/5772 [3:22:06<3:00:28, 6.51s/it] {'loss': 0.4658, 'learning_rate': 4.052448839200935e-06, 'epoch': 0.71} + 71%|███████ | 4108/5772 [3:22:06<3:00:28, 6.51s/it] {'loss': 0.4658, 'learning_rate': 4.052448839200935e-06, 'epoch': 0.71} + 71%|███████ | 4108/5772 [3:22:04<3:00:28, 6.51s/it] 71%|███████ | 4109/5772 [3:22:10<2:57:17, 6.40s/it] 71%|███████ | 4109/5772 [3:22:12<2:57:18, 6.40s/it] {'loss': 0.46, 'learning_rate': 4.04793825742474e-06, 'epoch': 0.71} + 71%|███████ | 4109/5772 [3:22:12<2:57:18, 6.40s/it] {'loss': 0.46, 'learning_rate': 4.04793825742474e-06, 'epoch': 0.71} + 71%|███████ | 4109/5772 [3:22:10<2:57:17, 6.40s/it] 71%|███████ | 4110/5772 [3:22:16<2:56:23, 6.37s/it] 71%|███████ | 4110/5772 [3:22:18<2:56:23, 6.37s/it] {'loss': 0.4643, 'learning_rate': 4.0434295502172885e-06, 'epoch': 0.71} + 71%|███████ | 4110/5772 [3:22:18<2:56:23, 6.37s/it] {'loss': 0.4643, 'learning_rate': 4.0434295502172885e-06, 'epoch': 0.71} + 71%|███████ | 4110/5772 [3:22:16<2:56:23, 6.37s/it] 71%|███████ | 4111/5772 [3:22:23<2:57:14, 6.40s/it] 71%|███████ | 4111/5772 [3:22:25<2:57:14, 6.40s/it] {'loss': 0.4642, 'learning_rate': 4.038922718998585e-06, 'epoch': 0.71} + 71%|███████ | 4111/5772 [3:22:25<2:57:14, 6.40s/it] {'loss': 0.4642, 'learning_rate': 4.038922718998585e-06, 'epoch': 0.71} + 71%|███████ | 4111/5772 [3:22:23<2:57:14, 6.40s/it] 71%|███████ | 4112/5772 [3:22:29<2:55:59, 6.36s/it] 71%|███████ | 4112/5772 [3:22:31<2:55:59, 6.36s/it] {'loss': 0.464, 'learning_rate': 4.034417765188031e-06, 'epoch': 0.71} + 71%|███████ | 4112/5772 [3:22:31<2:55:59, 6.36s/it] {'loss': 0.464, 'learning_rate': 4.034417765188031e-06, 'epoch': 0.71} + 71%|███████ | 4112/5772 [3:22:29<2:55:59, 6.36s/it] 71%|███████▏ | 4113/5772 [3:22:36<2:56:37, 6.39s/it] 71%|███████▏ | 4113/5772 [3:22:38<2:56:37, 6.39s/it] {'loss': 0.4568, 'learning_rate': 4.0299146902044304e-06, 'epoch': 0.71} + 71%|███████▏ | 4113/5772 [3:22:38<2:56:37, 6.39s/it] {'loss': 0.4568, 'learning_rate': 4.0299146902044304e-06, 'epoch': 0.71} + 71%|███████▏ | 4113/5772 [3:22:36<2:56:37, 6.39s/it] 71%|███████▏ | 4114/5772 [3:22:42<2:53:56, 6.29s/it] 71%|███████▏ | 4114/5772 [3:22:44<2:53:56, 6.29s/it] {'loss': 0.4665, 'learning_rate': 4.025413495466004e-06, 'epoch': 0.71} + 71%|███████▏ | 4114/5772 [3:22:44<2:53:56, 6.29s/it] {'loss': 0.4665, 'learning_rate': 4.025413495466004e-06, 'epoch': 0.71} + 71%|███████▏ | 4114/5772 [3:22:42<2:53:56, 6.29s/it] 71%|███████▏ | 4115/5772 [3:22:48<2:52:55, 6.26s/it] 71%|███████▏ | 4115/5772 [3:22:50<2:52:55, 6.26s/it] {'loss': 0.454, 'learning_rate': 4.020914182390379e-06, 'epoch': 0.71} + 71%|███████▏ | 4115/5772 [3:22:50<2:52:55, 6.26s/it] {'loss': 0.454, 'learning_rate': 4.020914182390379e-06, 'epoch': 0.71} + 71%|███████▏ | 4115/5772 [3:22:48<2:52:55, 6.26s/it] 71%|███████▏ | 4116/5772 [3:22:54<2:52:35, 6.25s/it] 71%|███████▏ | 4116/5772 [3:22:56<2:52:35, 6.25s/it] {'loss': 0.4543, 'learning_rate': 4.016416752394591e-06, 'epoch': 0.71} + 71%|███████▏ | 4116/5772 [3:22:56<2:52:35, 6.25s/it] {'loss': 0.4543, 'learning_rate': 4.016416752394591e-06, 'epoch': 0.71} + 71%|███████▏ | 4116/5772 [3:22:54<2:52:35, 6.25s/it] 71%|███████▏ | 4117/5772 [3:23:00<2:49:26, 6.14s/it] 71%|███████▏ | 4117/5772 [3:23:02<2:49:26, 6.14s/it] {'loss': 0.4674, 'learning_rate': 4.011921206895074e-06, 'epoch': 0.71} + 71%|███████▏ | 4117/5772 [3:23:02<2:49:26, 6.14s/it] {'loss': 0.4674, 'learning_rate': 4.011921206895074e-06, 'epoch': 0.71} + 71%|███████▏ | 4117/5772 [3:23:00<2:49:26, 6.14s/it] 71%|███████▏ | 4118/5772 [3:23:06<2:49:00, 6.13s/it] 71%|███████▏ | 4118/5772 [3:23:08<2:49:00, 6.13s/it] {'loss': 0.4708, 'learning_rate': 4.007427547307676e-06, 'epoch': 0.71} + 71%|███████▏ | 4118/5772 [3:23:08<2:49:00, 6.13s/it] {'loss': 0.4708, 'learning_rate': 4.007427547307676e-06, 'epoch': 0.71} + 71%|███████▏ | 4118/5772 [3:23:06<2:49:00, 6.13s/it] 71%|███████▏ | 4119/5772 [3:23:12<2:46:52, 6.06s/it] 71%|███████▏ | 4119/5772 [3:23:14<2:46:52, 6.06s/it] {'loss': 0.4561, 'learning_rate': 4.00293577504766e-06, 'epoch': 0.71} + 71%|███████▏ | 4119/5772 [3:23:14<2:46:52, 6.06s/it] {'loss': 0.4561, 'learning_rate': 4.00293577504766e-06, 'epoch': 0.71} + 71%|███████▏ | 4119/5772 [3:23:12<2:46:52, 6.06s/it] 71%|███████▏ | 4120/5772 [3:23:18<2:46:02, 6.03s/it] 71%|███████▏ | 4120/5772 [3:23:20<2:46:02, 6.03s/it] {'loss': 0.465, 'learning_rate': 3.998445891529675e-06, 'epoch': 0.71} + 71%|███████▏ | 4120/5772 [3:23:20<2:46:02, 6.03s/it] {'loss': 0.465, 'learning_rate': 3.998445891529675e-06, 'epoch': 0.71} + 71%|███████▏ | 4120/5772 [3:23:18<2:46:02, 6.03s/it] 71%|███████▏ | 4121/5772 [3:23:24<2:44:04, 5.96s/it] 71%|███████▏ | 4121/5772 [3:23:26<2:44:04, 5.96s/it] {'loss': 0.455, 'learning_rate': 3.993957898167788e-06, 'epoch': 0.71} + 71%|███████▏ | 4121/5772 [3:23:26<2:44:04, 5.96s/it] {'loss': 0.455, 'learning_rate': 3.993957898167788e-06, 'epoch': 0.71} + 71%|███████▏ | 4121/5772 [3:23:24<2:44:04, 5.96s/it] 71%|███████▏ | 4122/5772 [3:23:30<2:46:34, 6.06s/it] 71%|███████▏ | 4122/5772 [3:23:32<2:46:34, 6.06s/it] {'loss': 0.4655, 'learning_rate': 3.989471796375466e-06, 'epoch': 0.71} + 71%|███████▏ | 4122/5772 [3:23:32<2:46:34, 6.06s/it] {'loss': 0.4655, 'learning_rate': 3.989471796375466e-06, 'epoch': 0.71} + 71%|███████▏ | 4122/5772 [3:23:30<2:46:34, 6.06s/it] 71%|███████▏ | 4123/5772 [3:23:36<2:48:08, 6.12s/it] 71%|███████▏ | 4123/5772 [3:23:38<2:48:08, 6.12s/it] {'loss': 0.4592, 'learning_rate': 3.9849875875655875e-06, 'epoch': 0.71} + 71%|███████▏ | 4123/5772 [3:23:38<2:48:08, 6.12s/it] {'loss': 0.4592, 'learning_rate': 3.9849875875655875e-06, 'epoch': 0.71} + 71%|███████▏ | 4123/5772 [3:23:36<2:48:08, 6.12s/it] 71%|███████▏ | 4124/5772 [3:23:44<2:46:26, 6.06s/it] 71%|███████▏ | 4124/5772 [3:23:42<2:46:26, 6.06s/it] {'loss': 0.4649, 'learning_rate': 3.980505273150421e-06, 'epoch': 0.71} + 71%|███████▏ | 4124/5772 [3:23:44<2:46:26, 6.06s/it] {'loss': 0.4649, 'learning_rate': 3.980505273150421e-06, 'epoch': 0.71} + 71%|███████▏ | 4124/5772 [3:23:42<2:46:26, 6.06s/it] 71%|███████▏ | 4125/5772 [3:23:48<2:47:06, 6.09s/it] 71%|███████▏ | 4125/5772 [3:23:50<2:47:06, 6.09s/it] {'loss': 0.4585, 'learning_rate': 3.9760248545416465e-06, 'epoch': 0.71} + 71%|███████▏ | 4125/5772 [3:23:50<2:47:06, 6.09s/it] {'loss': 0.4585, 'learning_rate': 3.9760248545416465e-06, 'epoch': 0.71} + 71%|███████▏ | 4125/5772 [3:23:48<2:47:06, 6.09s/it] 71%|███████▏ | 4126/5772 [3:23:55<2:47:40, 6.11s/it] 71%|███████▏ | 4126/5772 [3:23:57<2:47:40, 6.11s/it] {'loss': 0.4722, 'learning_rate': 3.971546333150358e-06, 'epoch': 0.71} + 71%|███████▏ | 4126/5772 [3:23:57<2:47:40, 6.11s/it] {'loss': 0.4722, 'learning_rate': 3.971546333150358e-06, 'epoch': 0.71} + 71%|███████▏ | 4126/5772 [3:23:55<2:47:40, 6.11s/it] 72%|███████▏ | 4127/5772 [3:24:02<2:45:37, 6.04s/it] 72%|███████▏ | 4127/5772 [3:24:00<2:45:37, 6.04s/it] {'loss': 0.4527, 'learning_rate': 3.967069710387029e-06, 'epoch': 0.71} + 72%|███████▏ | 4127/5772 [3:24:02<2:45:37, 6.04s/it] {'loss': 0.4527, 'learning_rate': 3.967069710387029e-06, 'epoch': 0.71} + 72%|███████▏ | 4127/5772 [3:24:00<2:45:37, 6.04s/it] 72%|███████▏ | 4128/5772 [3:24:07<2:46:03, 6.06s/it] 72%|███████▏ | 4128/5772 [3:24:08<2:46:03, 6.06s/it] {'loss': 0.4672, 'learning_rate': 3.962594987661557e-06, 'epoch': 0.72} + 72%|███████▏ | 4128/5772 [3:24:08<2:46:03, 6.06s/it] {'loss': 0.4672, 'learning_rate': 3.962594987661557e-06, 'epoch': 0.72} + 72%|███████▏ | 4128/5772 [3:24:07<2:46:03, 6.06s/it] 72%|███████▏ | 4129/5772 [3:24:13<2:47:01, 6.10s/it] 72%|███████▏ | 4129/5772 [3:24:15<2:47:01, 6.10s/it] {'loss': 0.4467, 'learning_rate': 3.958122166383217e-06, 'epoch': 0.72} + 72%|███████▏ | 4129/5772 [3:24:15<2:47:01, 6.10s/it] {'loss': 0.4467, 'learning_rate': 3.958122166383217e-06, 'epoch': 0.72} + 72%|███████▏ | 4129/5772 [3:24:13<2:47:01, 6.10s/it] 72%|███████▏ | 4130/5772 [3:24:19<2:46:16, 6.08s/it] 72%|███████▏ | 4130/5772 [3:24:21<2:46:18, 6.08s/it] {'loss': 0.4607, 'learning_rate': 3.953651247960715e-06, 'epoch': 0.72} + 72%|███████▏ | 4130/5772 [3:24:21<2:46:18, 6.08s/it] {'loss': 0.4607, 'learning_rate': 3.953651247960715e-06, 'epoch': 0.72} + 72%|███████▏ | 4130/5772 [3:24:19<2:46:16, 6.08s/it] 72%|███████▏ | 4131/5772 [3:24:25<2:45:23, 6.05s/it] 72%|███████▏ | 4131/5772 [3:24:27<2:45:23, 6.05s/it] {'loss': 0.4523, 'learning_rate': 3.949182233802131e-06, 'epoch': 0.72} + 72%|███████▏ | 4131/5772 [3:24:27<2:45:23, 6.05s/it] {'loss': 0.4523, 'learning_rate': 3.949182233802131e-06, 'epoch': 0.72} + 72%|███████▏ | 4131/5772 [3:24:25<2:45:23, 6.05s/it] 72%|███████▏ | 4132/5772 [3:24:31<2:47:31, 6.13s/it] 72%|███████▏ | 4132/5772 [3:24:33<2:47:31, 6.13s/it] {'loss': 0.4652, 'learning_rate': 3.944715125314961e-06, 'epoch': 0.72} + 72%|███████▏ | 4132/5772 [3:24:33<2:47:31, 6.13s/it] {'loss': 0.4652, 'learning_rate': 3.944715125314961e-06, 'epoch': 0.72} + 72%|███████▏ | 4132/5772 [3:24:31<2:47:31, 6.13s/it] 72%|███████▏ | 4133/5772 [3:24:37<2:49:02, 6.19s/it] 72%|███████▏ | 4133/5772 [3:24:39<2:49:02, 6.19s/it] {'loss': 0.456, 'learning_rate': 3.940249923906093e-06, 'epoch': 0.72} + 72%|███████▏ | 4133/5772 [3:24:39<2:49:02, 6.19s/it] {'loss': 0.456, 'learning_rate': 3.940249923906093e-06, 'epoch': 0.72} + 72%|███████▏ | 4133/5772 [3:24:37<2:49:02, 6.19s/it] 72%|███████▏ | 4134/5772 [3:24:43<2:47:10, 6.12s/it] 72%|███████▏ | 4134/5772 [3:24:45<2:47:10, 6.12s/it] {'loss': 0.4622, 'learning_rate': 3.935786630981819e-06, 'epoch': 0.72} + 72%|███████▏ | 4134/5772 [3:24:45<2:47:10, 6.12s/it] {'loss': 0.4622, 'learning_rate': 3.935786630981819e-06, 'epoch': 0.72} + 72%|███████▏ | 4134/5772 [3:24:43<2:47:10, 6.12s/it] 72%|███████▏ | 4135/5772 [3:24:50<2:47:55, 6.15s/it] 72%|███████▏ | 4135/5772 [3:24:52<2:47:55, 6.15s/it] {'loss': 0.4512, 'learning_rate': 3.931325247947834e-06, 'epoch': 0.72} + 72%|███████▏ | 4135/5772 [3:24:52<2:47:55, 6.15s/it] {'loss': 0.4512, 'learning_rate': 3.931325247947834e-06, 'epoch': 0.72} + 72%|███████▏ | 4135/5772 [3:24:50<2:47:55, 6.15s/it] 72%|███████▏ | 4136/5772 [3:24:56<2:48:37, 6.18s/it] 72%|███████▏ | 4136/5772 [3:24:58<2:48:37, 6.18s/it] {'loss': 0.4694, 'learning_rate': 3.926865776209212e-06, 'epoch': 0.72} + 72%|███████▏ | 4136/5772 [3:24:58<2:48:37, 6.18s/it] {'loss': 0.4694, 'learning_rate': 3.926865776209212e-06, 'epoch': 0.72} + 72%|███████▏ | 4136/5772 [3:24:56<2:48:37, 6.18s/it] 72%|███████▏ | 4137/5772 [3:25:02<2:46:08, 6.10s/it] 72%|███████▏ | 4137/5772 [3:25:04<2:46:08, 6.10s/it] {'loss': 0.4577, 'learning_rate': 3.922408217170454e-06, 'epoch': 0.72} + 72%|███████▏ | 4137/5772 [3:25:04<2:46:08, 6.10s/it] {'loss': 0.4577, 'learning_rate': 3.922408217170454e-06, 'epoch': 0.72} + 72%|███████▏ | 4137/5772 [3:25:02<2:46:08, 6.10s/it] 72%|███████▏ | 4138/5772 [3:25:08<2:45:59, 6.10s/it] 72%|███████▏ | 4138/5772 [3:25:10<2:45:59, 6.10s/it] {'loss': 0.4587, 'learning_rate': 3.917952572235433e-06, 'epoch': 0.72} + 72%|███████▏ | 4138/5772 [3:25:10<2:45:59, 6.10s/it] {'loss': 0.4587, 'learning_rate': 3.917952572235433e-06, 'epoch': 0.72} + 72%|███████▏ | 4138/5772 [3:25:08<2:45:59, 6.10s/it] 72%|███████▏ | 4139/5772 [3:25:14<2:44:46, 6.05s/it] 72%|███████▏ | 4139/5772 [3:25:16<2:44:46, 6.05s/it] {'loss': 0.457, 'learning_rate': 3.913498842807433e-06, 'epoch': 0.72} + 72%|███████▏ | 4139/5772 [3:25:16<2:44:46, 6.05s/it] {'loss': 0.457, 'learning_rate': 3.913498842807433e-06, 'epoch': 0.72} + 72%|███████▏ | 4139/5772 [3:25:14<2:44:46, 6.05s/it] 72%|███████▏ | 4140/5772 [3:25:20<2:42:32, 5.98s/it] 72%|███████▏ | 4140/5772 [3:25:22<2:42:32, 5.98s/it] {'loss': 0.4623, 'learning_rate': 3.909047030289131e-06, 'epoch': 0.72} + 72%|███████▏ | 4140/5772 [3:25:22<2:42:32, 5.98s/it] {'loss': 0.4623, 'learning_rate': 3.909047030289131e-06, 'epoch': 0.72} + 72%|███████▏ | 4140/5772 [3:25:20<2:42:32, 5.98s/it] 72%|███████▏ | 4141/5772 [3:25:28<2:45:11, 6.08s/it] 72%|███████▏ | 4141/5772 [3:25:26<2:45:11, 6.08s/it] {'loss': 0.4521, 'learning_rate': 3.9045971360826014e-06, 'epoch': 0.72} + 72%|███████▏ | 4141/5772 [3:25:28<2:45:11, 6.08s/it] {'loss': 0.4521, 'learning_rate': 3.9045971360826014e-06, 'epoch': 0.72} + 72%|███████▏ | 4141/5772 [3:25:26<2:45:11, 6.08s/it] 72%|███████▏ | 4142/5772 [3:25:32<2:49:08, 6.23s/it] 72%|███████▏ | 4142/5772 [3:25:34<2:49:08, 6.23s/it] {'loss': 0.4721, 'learning_rate': 3.900149161589317e-06, 'epoch': 0.72} + 72%|███████▏ | 4142/5772 [3:25:34<2:49:08, 6.23s/it] {'loss': 0.4721, 'learning_rate': 3.900149161589317e-06, 'epoch': 0.72} + 72%|███████▏ | 4142/5772 [3:25:32<2:49:08, 6.23s/it] 72%|███████▏ | 4143/5772 [3:25:39<2:48:57, 6.22s/it] 72%|███████▏ | 4143/5772 [3:25:41<2:48:57, 6.22s/it] {'loss': 0.4549, 'learning_rate': 3.895703108210135e-06, 'epoch': 0.72} + 72%|███████▏ | 4143/5772 [3:25:41<2:48:57, 6.22s/it] {'loss': 0.4549, 'learning_rate': 3.895703108210135e-06, 'epoch': 0.72} + 72%|███████▏ | 4143/5772 [3:25:39<2:48:57, 6.22s/it] 72%|███████▏ | 4144/5772 [3:25:45<2:47:17, 6.17s/it] 72%|███████▏ | 4144/5772 [3:25:47<2:47:17, 6.17s/it] {'loss': 0.4636, 'learning_rate': 3.891258977345319e-06, 'epoch': 0.72} + 72%|███████▏ | 4144/5772 [3:25:47<2:47:17, 6.17s/it] {'loss': 0.4636, 'learning_rate': 3.891258977345319e-06, 'epoch': 0.72} + 72%|███████▏ | 4144/5772 [3:25:45<2:47:17, 6.17s/it] 72%|███████▏ | 4145/5772 [3:25:51<2:45:08, 6.09s/it] 72%|███████▏ | 4145/5772 [3:25:53<2:45:08, 6.09s/it] {'loss': 0.4565, 'learning_rate': 3.886816770394524e-06, 'epoch': 0.72} + 72%|███████▏ | 4145/5772 [3:25:53<2:45:08, 6.09s/it] {'loss': 0.4565, 'learning_rate': 3.886816770394524e-06, 'epoch': 0.72} + 72%|███████▏ | 4145/5772 [3:25:51<2:45:08, 6.09s/it] 72%|███████▏ | 4146/5772 [3:25:57<2:45:32, 6.11s/it] 72%|███████▏ | 4146/5772 [3:25:59<2:45:33, 6.11s/it] {'loss': 0.4719, 'learning_rate': 3.882376488756797e-06, 'epoch': 0.72} + 72%|███████▏ | 4146/5772 [3:25:59<2:45:33, 6.11s/it] {'loss': 0.4719, 'learning_rate': 3.882376488756797e-06, 'epoch': 0.72} + 72%|███████▏ | 4146/5772 [3:25:57<2:45:32, 6.11s/it] 72%|███████▏ | 4147/5772 [3:26:03<2:44:37, 6.08s/it] 72%|███████▏ | 4147/5772 [3:26:05<2:44:36, 6.08s/it] {'loss': 0.4526, 'learning_rate': 3.877938133830581e-06, 'epoch': 0.72} + 72%|███████▏ | 4147/5772 [3:26:05<2:44:36, 6.08s/it] {'loss': 0.4526, 'learning_rate': 3.877938133830581e-06, 'epoch': 0.72} + 72%|███████▏ | 4147/5772 [3:26:03<2:44:37, 6.08s/it] 72%|███████▏ | 4148/5772 [3:26:09<2:45:41, 6.12s/it] 72%|███████▏ | 4148/5772 [3:26:11<2:45:40, 6.12s/it] {'loss': 0.4708, 'learning_rate': 3.873501707013711e-06, 'epoch': 0.72} + 72%|███████▏ | 4148/5772 [3:26:11<2:45:40, 6.12s/it] {'loss': 0.4708, 'learning_rate': 3.873501707013711e-06, 'epoch': 0.72} + 72%|███████▏ | 4148/5772 [3:26:09<2:45:41, 6.12s/it] 72%|███████▏ | 4149/5772 [3:26:15<2:45:57, 6.14s/it] 72%|███████▏ | 4149/5772 [3:26:17<2:45:57, 6.14s/it] {'loss': 0.4645, 'learning_rate': 3.869067209703418e-06, 'epoch': 0.72} + 72%|███████▏ | 4149/5772 [3:26:17<2:45:57, 6.14s/it] {'loss': 0.4645, 'learning_rate': 3.869067209703418e-06, 'epoch': 0.72} + 72%|███████▏ | 4149/5772 [3:26:15<2:45:57, 6.14s/it]3 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 0AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +42 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 72%|███████▏ | 4150/5772 [3:26:21<2:43:06, 6.03s/it]11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 72%|███████▏ | 4150/5772 [3:26:23<2:43:06, 6.03s/it] {'loss': 0.4581, 'learning_rate': 3.8646346432963165e-06, 'epoch': 0.72} + 72%|███████▏ | 4150/5772 [3:26:23<2:43:06, 6.03s/it] {'loss': 0.4581, 'learning_rate': 3.8646346432963165e-06, 'epoch': 0.72} + 72%|███████▏ | 4150/5772 [3:26:21<2:43:06, 6.03s/it] 72%|███████▏ | 4151/5772 [3:26:27<2:46:49, 6.17s/it] 72%|███████▏ | 4151/5772 [3:26:29<2:46:49, 6.17s/it] {'loss': 0.4715, 'learning_rate': 3.860204009188421e-06, 'epoch': 0.72} + 72%|███████▏ | 4151/5772 [3:26:29<2:46:49, 6.17s/it] {'loss': 0.4715, 'learning_rate': 3.860204009188421e-06, 'epoch': 0.72} + 72%|███████▏ | 4151/5772 [3:26:27<2:46:49, 6.17s/it] 72%|███████▏ | 4152/5772 [3:26:33<2:44:16, 6.08s/it] 72%|███████▏ | 4152/5772 [3:26:35<2:44:17, 6.08s/it] {'loss': 0.4606, 'learning_rate': 3.8557753087751345e-06, 'epoch': 0.72} + 72%|███████▏ | 4152/5772 [3:26:35<2:44:17, 6.08s/it] {'loss': 0.4606, 'learning_rate': 3.8557753087751345e-06, 'epoch': 0.72} + 72%|███████▏ | 4152/5772 [3:26:33<2:44:16, 6.08s/it] 72%|███████▏ | 4153/5772 [3:26:39<2:43:15, 6.05s/it] 72%|███████▏ | 4153/5772 [3:26:41<2:43:14, 6.05s/it] {'loss': 0.4533, 'learning_rate': 3.851348543451253e-06, 'epoch': 0.72} + 72%|███████▏ | 4153/5772 [3:26:41<2:43:14, 6.05s/it] {'loss': 0.4533, 'learning_rate': 3.851348543451253e-06, 'epoch': 0.72} + 72%|███████▏ | 4153/5772 [3:26:39<2:43:15, 6.05s/it] 72%|███████▏ | 4154/5772 [3:26:45<2:43:30, 6.06s/it] 72%|███████▏ | 4154/5772 [3:26:47<2:43:30, 6.06s/it] {'loss': 0.4532, 'learning_rate': 3.846923714610962e-06, 'epoch': 0.72} + 72%|███████▏ | 4154/5772 [3:26:47<2:43:30, 6.06s/it] {'loss': 0.4532, 'learning_rate': 3.846923714610962e-06, 'epoch': 0.72} + 72%|███████▏ | 4154/5772 [3:26:45<2:43:30, 6.06s/it] 72%|███████▏ | 4155/5772 [3:26:53<2:43:20, 6.06s/it] 72%|███████▏ | 4155/5772 [3:26:51<2:43:20, 6.06s/it] {'loss': 0.458, 'learning_rate': 3.8425008236478355e-06, 'epoch': 0.72} + 72%|███████▏ | 4155/5772 [3:26:53<2:43:20, 6.06s/it] {'loss': 0.458, 'learning_rate': 3.8425008236478355e-06, 'epoch': 0.72} + 72%|███████▏ | 4155/5772 [3:26:51<2:43:20, 6.06s/it] 72%|███████▏ | 4156/5772 [3:26:57<2:42:03, 6.02s/it] 72%|███████▏ | 4156/5772 [3:26:59<2:42:03, 6.02s/it] {'loss': 0.455, 'learning_rate': 3.838079871954842e-06, 'epoch': 0.72} + 72%|███████▏ | 4156/5772 [3:26:59<2:42:03, 6.02s/it] {'loss': 0.455, 'learning_rate': 3.838079871954842e-06, 'epoch': 0.72} + 72%|███████▏ | 4156/5772 [3:26:57<2:42:03, 6.02s/it] 72%|███████▏ | 4157/5772 [3:27:03<2:42:07, 6.02s/it] 72%|███████▏ | 4157/5772 [3:27:05<2:42:07, 6.02s/it] {'loss': 0.4683, 'learning_rate': 3.833660860924328e-06, 'epoch': 0.72} + 72%|███████▏ | 4157/5772 [3:27:05<2:42:07, 6.02s/it] {'loss': 0.4683, 'learning_rate': 3.833660860924328e-06, 'epoch': 0.72} + 72%|███████▏ | 4157/5772 [3:27:03<2:42:07, 6.02s/it] 72%|███████▏ | 4158/5772 [3:27:10<2:43:15, 6.07s/it] 72%|███████▏ | 4158/5772 [3:27:12<2:43:15, 6.07s/it] {'loss': 0.462, 'learning_rate': 3.829243791948043e-06, 'epoch': 0.72} + 72%|███████▏ | 4158/5772 [3:27:12<2:43:15, 6.07s/it] {'loss': 0.462, 'learning_rate': 3.829243791948043e-06, 'epoch': 0.72} + 72%|███████▏ | 4158/5772 [3:27:10<2:43:15, 6.07s/it] 72%|███████▏ | 4159/5772 [3:27:18<2:42:39, 6.05s/it] 72%|███████▏ | 4159/5772 [3:27:16<2:42:39, 6.05s/it] {'loss': 0.4533, 'learning_rate': 3.824828666417114e-06, 'epoch': 0.72} + 72%|███████▏ | 4159/5772 [3:27:18<2:42:39, 6.05s/it] {'loss': 0.4533, 'learning_rate': 3.824828666417114e-06, 'epoch': 0.72} + 72%|███████▏ | 4159/5772 [3:27:16<2:42:39, 6.05s/it] 72%|███████▏ | 4160/5772 [3:27:24<2:44:13, 6.11s/it] 72%|███████▏ | 4160/5772 [3:27:22<2:44:13, 6.11s/it] {'loss': 0.4669, 'learning_rate': 3.820415485722064e-06, 'epoch': 0.72} + 72%|███████▏ | 4160/5772 [3:27:24<2:44:13, 6.11s/it] {'loss': 0.4669, 'learning_rate': 3.820415485722064e-06, 'epoch': 0.72} + 72%|███████▏ | 4160/5772 [3:27:22<2:44:13, 6.11s/it] 72%|███████▏ | 4161/5772 [3:27:28<2:45:26, 6.16s/it] 72%|███████▏ | 4161/5772 [3:27:30<2:45:26, 6.16s/it] {'loss': 0.4668, 'learning_rate': 3.8160042512528e-06, 'epoch': 0.72} + 72%|███████▏ | 4161/5772 [3:27:30<2:45:26, 6.16s/it] {'loss': 0.4668, 'learning_rate': 3.8160042512528e-06, 'epoch': 0.72} + 72%|███████▏ | 4161/5772 [3:27:28<2:45:26, 6.16s/it] 72%|███████▏ | 4162/5772 [3:27:36<2:43:57, 6.11s/it] 72%|███████▏ | 4162/5772 [3:27:34<2:43:58, 6.11s/it] {'loss': 0.461, 'learning_rate': 3.8115949643986095e-06, 'epoch': 0.72} + 72%|███████▏ | 4162/5772 [3:27:36<2:43:57, 6.11s/it] {'loss': 0.461, 'learning_rate': 3.8115949643986095e-06, 'epoch': 0.72} + 72%|███████▏ | 4162/5772 [3:27:34<2:43:58, 6.11s/it] 72%|███████▏ | 4163/5772 [3:27:40<2:44:43, 6.14s/it] 72%|███████▏ | 4163/5772 [3:27:42<2:44:43, 6.14s/it] {'loss': 0.4602, 'learning_rate': 3.8071876265481823e-06, 'epoch': 0.72} + 72%|███████▏ | 4163/5772 [3:27:42<2:44:43, 6.14s/it] {'loss': 0.4602, 'learning_rate': 3.8071876265481823e-06, 'epoch': 0.72} + 72%|███████▏ | 4163/5772 [3:27:40<2:44:43, 6.14s/it] 72%|███████▏ | 4164/5772 [3:27:46<2:42:49, 6.08s/it] 72%|███████▏ | 4164/5772 [3:27:48<2:42:49, 6.08s/it] {'loss': 0.4562, 'learning_rate': 3.8027822390895774e-06, 'epoch': 0.72} + 72%|███████▏ | 4164/5772 [3:27:48<2:42:49, 6.08s/it] {'loss': 0.4562, 'learning_rate': 3.8027822390895774e-06, 'epoch': 0.72} + 72%|███████▏ | 4164/5772 [3:27:46<2:42:49, 6.08s/it] 72%|███████▏ | 4165/5772 [3:27:54<2:42:26, 6.06s/it] 72%|███████▏ | 4165/5772 [3:27:52<2:42:26, 6.07s/it] {'loss': 0.4569, 'learning_rate': 3.7983788034102488e-06, 'epoch': 0.72} + 72%|███████▏ | 4165/5772 [3:27:54<2:42:26, 6.06s/it] {'loss': 0.4569, 'learning_rate': 3.7983788034102488e-06, 'epoch': 0.72} + 72%|███████▏ | 4165/5772 [3:27:52<2:42:26, 6.07s/it] 72%|███████▏ | 4166/5772 [3:27:59<2:44:00, 6.13s/it] 72%|███████▏ | 4166/5772 [3:28:01<2:44:00, 6.13s/it] {'loss': 0.4684, 'learning_rate': 3.7939773208970353e-06, 'epoch': 0.72} + 72%|███████▏ | 4166/5772 [3:28:01<2:44:00, 6.13s/it] {'loss': 0.4684, 'learning_rate': 3.7939773208970353e-06, 'epoch': 0.72} + 72%|███████▏ | 4166/5772 [3:27:59<2:44:00, 6.13s/it] 72%|███████▏ | 4167/5772 [3:28:05<2:42:53, 6.09s/it] 72%|███████▏ | 4167/5772 [3:28:07<2:42:53, 6.09s/it] {'loss': 0.4488, 'learning_rate': 3.7895777929361586e-06, 'epoch': 0.72} + 72%|███████▏ | 4167/5772 [3:28:07<2:42:53, 6.09s/it] {'loss': 0.4488, 'learning_rate': 3.7895777929361586e-06, 'epoch': 0.72} + 72%|███████▏ | 4167/5772 [3:28:05<2:42:53, 6.09s/it] 72%|███████▏ | 4168/5772 [3:28:11<2:43:06, 6.10s/it] 72%|███████▏ | 4168/5772 [3:28:13<2:43:05, 6.10s/it] {'loss': 0.4698, 'learning_rate': 3.7851802209132303e-06, 'epoch': 0.72} + 72%|███████▏ | 4168/5772 [3:28:13<2:43:05, 6.10s/it] {'loss': 0.4698, 'learning_rate': 3.7851802209132303e-06, 'epoch': 0.72} + 72%|███████▏ | 4168/5772 [3:28:11<2:43:06, 6.10s/it] 72%|███████▏ | 4169/5772 [3:28:19<2:45:29, 6.19s/it] 72%|███████▏ | 4169/5772 [3:28:17<2:45:30, 6.19s/it] {'loss': 0.4496, 'learning_rate': 3.7807846062132293e-06, 'epoch': 0.72} + 72%|███████▏ | 4169/5772 [3:28:19<2:45:29, 6.19s/it] {'loss': 0.4496, 'learning_rate': 3.7807846062132293e-06, 'epoch': 0.72} + 72%|███████▏ | 4169/5772 [3:28:17<2:45:30, 6.19s/it] 72%|███████▏ | 4170/5772 [3:28:25<2:46:30, 6.24s/it] 72%|███████▏ | 4170/5772 [3:28:23<2:46:30, 6.24s/it] {'loss': 0.463, 'learning_rate': 3.776390950220544e-06, 'epoch': 0.72} + 72%|███████▏ | 4170/5772 [3:28:25<2:46:30, 6.24s/it] {'loss': 0.463, 'learning_rate': 3.776390950220544e-06, 'epoch': 0.72} + 72%|███████▏ | 4170/5772 [3:28:23<2:46:30, 6.24s/it] 72%|███████▏ | 4171/5772 [3:28:30<2:49:25, 6.35s/it] 72%|███████▏ | 4171/5772 [3:28:32<2:49:26, 6.35s/it] {'loss': 0.4654, 'learning_rate': 3.7719992543189233e-06, 'epoch': 0.72} + {'loss': 0.4654, 'learning_rate': 3.7719992543189233e-06, 'epoch': 0.72} 72%|███████▏ | 4171/5772 [3:28:32<2:49:26, 6.35s/it] + 72%|███████▏ | 4171/5772 [3:28:30<2:49:25, 6.35s/it] 72%|███████▏ | 4172/5772 [3:28:36<2:47:09, 6.27s/it] 72%|███████▏ | 4172/5772 [3:28:38<2:47:10, 6.27s/it] {'loss': 0.475, 'learning_rate': 3.767609519891513e-06, 'epoch': 0.72} + 72%|███████▏ | 4172/5772 [3:28:38<2:47:10, 6.27s/it] {'loss': 0.475, 'learning_rate': 3.767609519891513e-06, 'epoch': 0.72} + 72%|███████▏ | 4172/5772 [3:28:36<2:47:09, 6.27s/it] 72%|███████▏ | 4173/5772 [3:28:42<2:45:59, 6.23s/it] 72%|███████▏ | 4173/5772 [3:28:44<2:45:59, 6.23s/it] {'loss': 0.4586, 'learning_rate': 3.7632217483208242e-06, 'epoch': 0.72} + 72%|███████▏ | 4173/5772 [3:28:44<2:45:59, 6.23s/it] {'loss': 0.4586, 'learning_rate': 3.7632217483208242e-06, 'epoch': 0.72} + 72%|███████▏ | 4173/5772 [3:28:42<2:45:59, 6.23s/it] 72%|███████▏ | 4174/5772 [3:28:48<2:44:55, 6.19s/it] 72%|███████▏ | 4174/5772 [3:28:50<2:44:55, 6.19s/it] {'loss': 0.4617, 'learning_rate': 3.758835940988773e-06, 'epoch': 0.72} + 72%|███████▏ | 4174/5772 [3:28:50<2:44:55, 6.19s/it] {'loss': 0.4617, 'learning_rate': 3.758835940988773e-06, 'epoch': 0.72} + 72%|███████▏ | 4174/5772 [3:28:48<2:44:55, 6.19s/it] 72%|███████▏ | 4175/5772 [3:28:54<2:43:42, 6.15s/it] 72%|███████▏ | 4175/5772 [3:28:56<2:43:42, 6.15s/it] {'loss': 0.462, 'learning_rate': 3.7544520992766454e-06, 'epoch': 0.72} + 72%|███████▏ | 4175/5772 [3:28:56<2:43:42, 6.15s/it] {'loss': 0.462, 'learning_rate': 3.7544520992766454e-06, 'epoch': 0.72} + 72%|███████▏ | 4175/5772 [3:28:54<2:43:42, 6.15s/it] 72%|███████▏ | 4176/5772 [3:29:01<2:45:49, 6.23s/it] 72%|███████▏ | 4176/5772 [3:29:03<2:45:49, 6.23s/it] {'loss': 0.4621, 'learning_rate': 3.7500702245651e-06, 'epoch': 0.72} + 72%|███████▏ | 4176/5772 [3:29:03<2:45:49, 6.23s/it] {'loss': 0.4621, 'learning_rate': 3.7500702245651e-06, 'epoch': 0.72} + 72%|███████▏ | 4176/5772 [3:29:01<2:45:49, 6.23s/it] 72%|███████▏ | 4177/5772 [3:29:07<2:46:21, 6.26s/it] 72%|███████▏ | 4177/5772 [3:29:09<2:46:21, 6.26s/it] {'loss': 0.4515, 'learning_rate': 3.745690318234186e-06, 'epoch': 0.72} + 72%|███████▏ | 4177/5772 [3:29:09<2:46:21, 6.26s/it] {'loss': 0.4515, 'learning_rate': 3.745690318234186e-06, 'epoch': 0.72} + 72%|███████▏ | 4177/5772 [3:29:07<2:46:21, 6.26s/it] 72%|███████▏ | 4178/5772 [3:29:13<2:45:51, 6.24s/it] 72%|███████▏ | 4178/5772 [3:29:15<2:45:51, 6.24s/it] {'loss': 0.4545, 'learning_rate': 3.7413123816633344e-06, 'epoch': 0.72} + 72%|███████▏ | 4178/5772 [3:29:15<2:45:51, 6.24s/it] {'loss': 0.4545, 'learning_rate': 3.7413123816633344e-06, 'epoch': 0.72} + 72%|███████▏ | 4178/5772 [3:29:13<2:45:51, 6.24s/it] 72%|███████▏ | 4179/5772 [3:29:20<2:49:59, 6.40s/it] 72%|███████▏ | 4179/5772 [3:29:22<2:49:59, 6.40s/it] {'loss': 0.453, 'learning_rate': 3.7369364162313528e-06, 'epoch': 0.72} + 72%|███████▏ | 4179/5772 [3:29:22<2:49:59, 6.40s/it] {'loss': 0.453, 'learning_rate': 3.7369364162313528e-06, 'epoch': 0.72} + 72%|███████▏ | 4179/5772 [3:29:20<2:49:59, 6.40s/it] 72%|███████▏ | 4180/5772 [3:29:26<2:45:03, 6.22s/it] 72%|███████▏ | 4180/5772 [3:29:28<2:45:03, 6.22s/it] {'loss': 0.4785, 'learning_rate': 3.7325624233164157e-06, 'epoch': 0.72} + 72%|███████▏ | 4180/5772 [3:29:28<2:45:03, 6.22s/it] {'loss': 0.4785, 'learning_rate': 3.7325624233164157e-06, 'epoch': 0.72} + 72%|███████▏ | 4180/5772 [3:29:26<2:45:03, 6.22s/it] 72%|███████▏ | 4181/5772 [3:29:32<2:44:38, 6.21s/it] 72%|███████▏ | 4181/5772 [3:29:34<2:44:38, 6.21s/it] {'loss': 0.4586, 'learning_rate': 3.7281904042961016e-06, 'epoch': 0.72} + 72%|███████▏ | 4181/5772 [3:29:34<2:44:38, 6.21s/it] {'loss': 0.4586, 'learning_rate': 3.7281904042961016e-06, 'epoch': 0.72} + 72%|███████▏ | 4181/5772 [3:29:32<2:44:38, 6.21s/it] 72%|███████▏ | 4182/5772 [3:29:38<2:43:56, 6.19s/it] 72%|███████▏ | 4182/5772 [3:29:40<2:43:56, 6.19s/it] {'loss': 0.4756, 'learning_rate': 3.723820360547351e-06, 'epoch': 0.72} + 72%|███████▏ | 4182/5772 [3:29:40<2:43:56, 6.19s/it] {'loss': 0.4756, 'learning_rate': 3.723820360547351e-06, 'epoch': 0.72} + 72%|███████▏ | 4182/5772 [3:29:38<2:43:56, 6.19s/it] 72%|███████▏ | 4183/5772 [3:29:44<2:42:19, 6.13s/it] 72%|███████▏ | 4183/5772 [3:29:46<2:42:19, 6.13s/it] {'loss': 0.4498, 'learning_rate': 3.7194522934464785e-06, 'epoch': 0.72} + 72%|███████▏ | 4183/5772 [3:29:46<2:42:19, 6.13s/it] {'loss': 0.4498, 'learning_rate': 3.7194522934464785e-06, 'epoch': 0.72} + 72%|███████▏ | 4183/5772 [3:29:44<2:42:19, 6.13s/it] 72%|███████▏ | 4184/5772 [3:29:50<2:39:02, 6.01s/it] 72%|███████▏ | 4184/5772 [3:29:52<2:39:02, 6.01s/it] {'loss': 0.4692, 'learning_rate': 3.715086204369186e-06, 'epoch': 0.72} + 72%|███████▏ | 4184/5772 [3:29:52<2:39:02, 6.01s/it] {'loss': 0.4692, 'learning_rate': 3.715086204369186e-06, 'epoch': 0.72} + 72%|███████▏ | 4184/5772 [3:29:50<2:39:02, 6.01s/it] 73%|███████▎ | 4185/5772 [3:29:56<2:41:56, 6.12s/it] 73%|███████▎ | 4185/5772 [3:29:58<2:41:56, 6.12s/it] {'loss': 0.4498, 'learning_rate': 3.7107220946905497e-06, 'epoch': 0.72} + 73%|███████▎ | 4185/5772 [3:29:58<2:41:56, 6.12s/it] {'loss': 0.4498, 'learning_rate': 3.7107220946905497e-06, 'epoch': 0.72} + 73%|███████▎ | 4185/5772 [3:29:56<2:41:56, 6.12s/it] 73%|███████▎ | 4186/5772 [3:30:02<2:41:27, 6.11s/it] 73%|███████▎ | 4186/5772 [3:30:04<2:41:26, 6.11s/it] {'loss': 0.4646, 'learning_rate': 3.7063599657850248e-06, 'epoch': 0.73} + 73%|███████▎ | 4186/5772 [3:30:04<2:41:26, 6.11s/it] {'loss': 0.4646, 'learning_rate': 3.7063599657850248e-06, 'epoch': 0.73} + 73%|███████▎ | 4186/5772 [3:30:02<2:41:27, 6.11s/it] 73%|███████▎ | 4187/5772 [3:30:09<2:43:18, 6.18s/it] 73%|███████▎ | 4187/5772 [3:30:11<2:43:18, 6.18s/it] {'loss': 0.4478, 'learning_rate': 3.701999819026432e-06, 'epoch': 0.73} + 73%|███████▎ | 4187/5772 [3:30:11<2:43:18, 6.18s/it] {'loss': 0.4478, 'learning_rate': 3.701999819026432e-06, 'epoch': 0.73} + 73%|███████▎ | 4187/5772 [3:30:09<2:43:18, 6.18s/it] 73%|███████▎ | 4188/5772 [3:30:15<2:46:11, 6.30s/it] 73%|███████▎ | 4188/5772 [3:30:17<2:46:11, 6.30s/it] {'loss': 0.4595, 'learning_rate': 3.6976416557879757e-06, 'epoch': 0.73} + 73%|███████▎ | 4188/5772 [3:30:17<2:46:11, 6.30s/it] {'loss': 0.4595, 'learning_rate': 3.6976416557879757e-06, 'epoch': 0.73} + 73%|███████▎ | 4188/5772 [3:30:15<2:46:11, 6.30s/it] 73%|███████▎ | 4189/5772 [3:30:21<2:44:02, 6.22s/it] 73%|███████▎ | 4189/5772 [3:30:23<2:44:03, 6.22s/it] {'loss': 0.4676, 'learning_rate': 3.6932854774422457e-06, 'epoch': 0.73} + 73%|███████▎ | 4189/5772 [3:30:23<2:44:03, 6.22s/it] {'loss': 0.4676, 'learning_rate': 3.6932854774422457e-06, 'epoch': 0.73} + 73%|███████▎ | 4189/5772 [3:30:21<2:44:02, 6.22s/it] 73%|███████▎ | 4190/5772 [3:30:28<2:47:07, 6.34s/it] 73%|███████▎ | 4190/5772 [3:30:30<2:47:06, 6.34s/it] {'loss': 0.4629, 'learning_rate': 3.6889312853611857e-06, 'epoch': 0.73} + 73%|███████▎ | 4190/5772 [3:30:30<2:47:06, 6.34s/it] {'loss': 0.4629, 'learning_rate': 3.6889312853611857e-06, 'epoch': 0.73} + 73%|███████▎ | 4190/5772 [3:30:28<2:47:07, 6.34s/it] 73%|███████▎ | 4191/5772 [3:30:34<2:44:04, 6.23s/it] 73%|███████▎ | 4191/5772 [3:30:36<2:44:04, 6.23s/it] {'loss': 0.4552, 'learning_rate': 3.6845790809161273e-06, 'epoch': 0.73} + 73%|███████▎ | 4191/5772 [3:30:36<2:44:04, 6.23s/it] {'loss': 0.4552, 'learning_rate': 3.6845790809161273e-06, 'epoch': 0.73} + 73%|███████▎ | 4191/5772 [3:30:34<2:44:04, 6.23s/it] 73%|███████▎ | 4192/5772 [3:30:40<2:42:33, 6.17s/it] 73%|███████▎ | 4192/5772 [3:30:42<2:42:33, 6.17s/it] {'loss': 0.4722, 'learning_rate': 3.680228865477774e-06, 'epoch': 0.73} + {'loss': 0.4722, 'learning_rate': 3.680228865477774e-06, 'epoch': 0.73} 73%|███████▎ | 4192/5772 [3:30:42<2:42:33, 6.17s/it] + 73%|███████▎ | 4192/5772 [3:30:40<2:42:33, 6.17s/it] 73%|███████▎ | 4193/5772 [3:30:46<2:40:49, 6.11s/it] 73%|███████▎ | 4193/5772 [3:30:48<2:40:49, 6.11s/it] {'loss': 0.4545, 'learning_rate': 3.675880640416205e-06, 'epoch': 0.73} + 73%|███████▎ | 4193/5772 [3:30:48<2:40:49, 6.11s/it] {'loss': 0.4545, 'learning_rate': 3.675880640416205e-06, 'epoch': 0.73} + 73%|███████▎ | 4193/5772 [3:30:46<2:40:49, 6.11s/it] 73%|███████▎ | 4194/5772 [3:30:52<2:39:16, 6.06s/it] 73%|███████▎ | 4194/5772 [3:30:54<2:39:17, 6.06s/it] {'loss': 0.4684, 'learning_rate': 3.671534407100863e-06, 'epoch': 0.73} + 73%|███████▎ | 4194/5772 [3:30:54<2:39:17, 6.06s/it] {'loss': 0.4684, 'learning_rate': 3.671534407100863e-06, 'epoch': 0.73} + 73%|███████▎ | 4194/5772 [3:30:52<2:39:16, 6.06s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (5158 > 4096). Running this sequence through the model will result in indexing errors + 73%|███████▎ | 4195/5772 [3:30:58<2:42:55, 6.20s/it] 73%|███████▎ | 4195/5772 [3:31:00<2:42:55, 6.20s/it] {'loss': 0.4703, 'learning_rate': 3.6671901669005683e-06, 'epoch': 0.73} + 73%|███████▎ | 4195/5772 [3:31:00<2:42:55, 6.20s/it] {'loss': 0.4703, 'learning_rate': 3.6671901669005683e-06, 'epoch': 0.73} + 73%|███████▎ | 4195/5772 [3:30:58<2:42:55, 6.20s/it] 73%|███████▎ | 4196/5772 [3:31:04<2:38:54, 6.05s/it] 73%|███████▎ | 4196/5772 [3:31:06<2:38:53, 6.05s/it] {'loss': 0.4646, 'learning_rate': 3.662847921183528e-06, 'epoch': 0.73} + 73%|███████▎ | 4196/5772 [3:31:06<2:38:53, 6.05s/it] {'loss': 0.4646, 'learning_rate': 3.662847921183528e-06, 'epoch': 0.73} + 73%|███████▎ | 4196/5772 [3:31:04<2:38:54, 6.05s/it] 73%|███████▎ | 4197/5772 [3:31:10<2:39:51, 6.09s/it] 73%|███████▎ | 4197/5772 [3:31:12<2:39:51, 6.09s/it] {'loss': 0.4587, 'learning_rate': 3.658507671317296e-06, 'epoch': 0.73} + 73%|███████▎ | 4197/5772 [3:31:12<2:39:51, 6.09s/it] {'loss': 0.4587, 'learning_rate': 3.658507671317296e-06, 'epoch': 0.73} + 73%|███████▎ | 4197/5772 [3:31:10<2:39:51, 6.09s/it] 73%|███████▎ | 4198/5772 [3:31:16<2:39:51, 6.09s/it] 73%|███████▎ | 4198/5772 [3:31:18<2:39:51, 6.09s/it] {'loss': 0.464, 'learning_rate': 3.654169418668815e-06, 'epoch': 0.73} + 73%|███████▎ | 4198/5772 [3:31:18<2:39:51, 6.09s/it] {'loss': 0.464, 'learning_rate': 3.654169418668815e-06, 'epoch': 0.73} + 73%|███████▎ | 4198/5772 [3:31:16<2:39:51, 6.09s/it] 73%|███████▎ | 4199/5772 [3:31:22<2:37:45, 6.02s/it] 73%|███████▎ | 4199/5772 [3:31:24<2:37:45, 6.02s/it] {'loss': 0.4713, 'learning_rate': 3.6498331646043917e-06, 'epoch': 0.73} + 73%|███████▎ | 4199/5772 [3:31:24<2:37:45, 6.02s/it] {'loss': 0.4713, 'learning_rate': 3.6498331646043917e-06, 'epoch': 0.73} + 73%|███████▎ | 4199/5772 [3:31:22<2:37:45, 6.02s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +2 10 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 9 AutoResumeHook: Checking whether to suspend...11 +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... + 73%|███████▎ | 4200/5772 [3:31:31<2:39:54, 6.10s/it] AutoResumeHook: Checking whether to suspend... + 73%|███████▎ | 4200/5772 [3:31:29<2:39:54, 6.10s/it] {'loss': 0.4587, 'learning_rate': 3.6454989104897097e-06, 'epoch': 0.73} + 73%|███████▎ | 4200/5772 [3:31:31<2:39:54, 6.10s/it] {'loss': 0.4587, 'learning_rate': 3.6454989104897097e-06, 'epoch': 0.73} + 73%|███████▎ | 4200/5772 [3:31:29<2:39:54, 6.10s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 73%|███████▎ | 4201/5772 [3:31:52<4:55:35, 11.29s/it] 73%|███████▎ | 4201/5772 [3:31:54<4:55:36, 11.29s/it] {'loss': 0.4594, 'learning_rate': 3.641166657689812e-06, 'epoch': 0.73} + 73%|███████▎ | 4201/5772 [3:31:54<4:55:36, 11.29s/it] {'loss': 0.4594, 'learning_rate': 3.641166657689812e-06, 'epoch': 0.73} + 73%|███████▎ | 4201/5772 [3:31:52<4:55:35, 11.29s/it] 73%|███████▎ | 4202/5772 [3:31:58<4:13:34, 9.69s/it] 73%|███████▎ | 4202/5772 [3:32:00<4:13:34, 9.69s/it] {'loss': 0.4769, 'learning_rate': 3.63683640756912e-06, 'epoch': 0.73} + 73%|███████▎ | 4202/5772 [3:32:00<4:13:34, 9.69s/it] {'loss': 0.4769, 'learning_rate': 3.63683640756912e-06, 'epoch': 0.73} + 73%|███████▎ | 4202/5772 [3:31:58<4:13:34, 9.69s/it] 73%|███████▎ | 4203/5772 [3:32:04<3:46:04, 8.65s/it] 73%|███████▎ | 4203/5772 [3:32:06<3:46:04, 8.65s/it] {'loss': 0.4718, 'learning_rate': 3.6325081614914216e-06, 'epoch': 0.73} + 73%|███████▎ | 4203/5772 [3:32:06<3:46:04, 8.65s/it] {'loss': 0.4718, 'learning_rate': 3.6325081614914216e-06, 'epoch': 0.73} + 73%|███████▎ | 4203/5772 [3:32:04<3:46:04, 8.65s/it] 73%|███████▎ | 4204/5772 [3:32:10<3:25:08, 7.85s/it] 73%|███████▎ | 4204/5772 [3:32:12<3:25:08, 7.85s/it] {'loss': 0.4522, 'learning_rate': 3.6281819208198744e-06, 'epoch': 0.73} + 73%|███████▎ | 4204/5772 [3:32:12<3:25:08, 7.85s/it] {'loss': 0.4522, 'learning_rate': 3.6281819208198744e-06, 'epoch': 0.73} + 73%|███████▎ | 4204/5772 [3:32:10<3:25:08, 7.85s/it] 73%|███████▎ | 4205/5772 [3:32:16<3:09:14, 7.25s/it] 73%|███████▎ | 4205/5772 [3:32:18<3:09:14, 7.25s/it] {'loss': 0.4739, 'learning_rate': 3.6238576869170074e-06, 'epoch': 0.73} + 73%|███████▎ | 4205/5772 [3:32:18<3:09:14, 7.25s/it] {'loss': 0.4739, 'learning_rate': 3.6238576869170074e-06, 'epoch': 0.73} + 73%|███████▎ | 4205/5772 [3:32:16<3:09:14, 7.25s/it] 73%|███████▎ | 4206/5772 [3:32:22<2:58:16, 6.83s/it] 73%|███████▎ | 4206/5772 [3:32:24<2:58:16, 6.83s/it] {'loss': 0.4603, 'learning_rate': 3.6195354611447033e-06, 'epoch': 0.73} + 73%|███████▎ | 4206/5772 [3:32:24<2:58:16, 6.83s/it] {'loss': 0.4603, 'learning_rate': 3.6195354611447033e-06, 'epoch': 0.73} + 73%|███████▎ | 4206/5772 [3:32:22<2:58:16, 6.83s/it] 73%|███████▎ | 4207/5772 [3:32:28<2:54:41, 6.70s/it] 73%|███████▎ | 4207/5772 [3:32:30<2:54:41, 6.70s/it] {'loss': 0.4557, 'learning_rate': 3.6152152448642374e-06, 'epoch': 0.73} + 73%|███████▎ | 4207/5772 [3:32:30<2:54:41, 6.70s/it] {'loss': 0.4557, 'learning_rate': 3.6152152448642374e-06, 'epoch': 0.73} + 73%|███████▎ | 4207/5772 [3:32:28<2:54:41, 6.70s/it] 73%|███████▎ | 4208/5772 [3:32:34<2:51:06, 6.56s/it] 73%|███████▎ | 4208/5772 [3:32:36<2:51:07, 6.56s/it] {'loss': 0.4721, 'learning_rate': 3.6108970394362274e-06, 'epoch': 0.73} + 73%|███████▎ | 4208/5772 [3:32:36<2:51:07, 6.56s/it] {'loss': 0.4721, 'learning_rate': 3.6108970394362274e-06, 'epoch': 0.73} + 73%|███████▎ | 4208/5772 [3:32:34<2:51:06, 6.56s/it] 73%|███████▎ | 4209/5772 [3:32:41<2:49:20, 6.50s/it] 73%|███████▎ | 4209/5772 [3:32:43<2:49:20, 6.50s/it] {'loss': 0.457, 'learning_rate': 3.606580846220671e-06, 'epoch': 0.73} + 73%|███████▎ | 4209/5772 [3:32:43<2:49:20, 6.50s/it] {'loss': 0.457, 'learning_rate': 3.606580846220671e-06, 'epoch': 0.73} + 73%|███████▎ | 4209/5772 [3:32:41<2:49:20, 6.50s/it] 73%|███████▎ | 4210/5772 [3:32:47<2:47:51, 6.45s/it] 73%|███████▎ | 4210/5772 [3:32:49<2:47:51, 6.45s/it] {'loss': 0.4608, 'learning_rate': 3.602266666576929e-06, 'epoch': 0.73} + 73%|███████▎ | 4210/5772 [3:32:49<2:47:51, 6.45s/it] {'loss': 0.4608, 'learning_rate': 3.602266666576929e-06, 'epoch': 0.73} + 73%|███████▎ | 4210/5772 [3:32:47<2:47:51, 6.45s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 73%|███████▎ | 4211/5772 [3:32:54<2:47:01, 6.42s/it] 73%|███████▎ | 4211/5772 [3:32:55<2:47:01, 6.42s/it] {'loss': 0.4588, 'learning_rate': 3.59795450186373e-06, 'epoch': 0.73} + 73%|███████▎ | 4211/5772 [3:32:55<2:47:01, 6.42s/it] {'loss': 0.4588, 'learning_rate': 3.59795450186373e-06, 'epoch': 0.73} + 73%|███████▎ | 4211/5772 [3:32:54<2:47:01, 6.42s/it] 73%|███████▎ | 4212/5772 [3:33:00<2:44:50, 6.34s/it] 73%|███████▎ | 4212/5772 [3:33:02<2:44:50, 6.34s/it] {'loss': 0.4771, 'learning_rate': 3.5936443534391676e-06, 'epoch': 0.73} + 73%|███████▎ | 4212/5772 [3:33:02<2:44:50, 6.34s/it] {'loss': 0.4771, 'learning_rate': 3.5936443534391676e-06, 'epoch': 0.73} + 73%|███████▎ | 4212/5772 [3:33:00<2:44:50, 6.34s/it] 73%|███████▎ | 4213/5772 [3:33:06<2:41:48, 6.23s/it] 73%|███████▎ | 4213/5772 [3:33:08<2:41:48, 6.23s/it] {'loss': 0.4578, 'learning_rate': 3.58933622266069e-06, 'epoch': 0.73} + 73%|███████▎ | 4213/5772 [3:33:08<2:41:48, 6.23s/it] {'loss': 0.4578, 'learning_rate': 3.58933622266069e-06, 'epoch': 0.73} + 73%|███████▎ | 4213/5772 [3:33:06<2:41:48, 6.23s/it] 73%|███████▎ | 4214/5772 [3:33:11<2:38:22, 6.10s/it] 73%|███████▎ | 4214/5772 [3:33:13<2:38:22, 6.10s/it] {'loss': 0.4705, 'learning_rate': 3.5850301108851326e-06, 'epoch': 0.73} + 73%|███████▎ | 4214/5772 [3:33:13<2:38:22, 6.10s/it] {'loss': 0.4705, 'learning_rate': 3.5850301108851326e-06, 'epoch': 0.73} + 73%|███████▎ | 4214/5772 [3:33:11<2:38:22, 6.10s/it] 73%|███████▎ | 4215/5772 [3:33:18<2:38:27, 6.11s/it] 73%|███████▎ | 4215/5772 [3:33:20<2:38:27, 6.11s/it] {'loss': 0.455, 'learning_rate': 3.580726019468671e-06, 'epoch': 0.73} + 73%|███████▎ | 4215/5772 [3:33:20<2:38:27, 6.11s/it] {'loss': 0.455, 'learning_rate': 3.580726019468671e-06, 'epoch': 0.73} + 73%|███████▎ | 4215/5772 [3:33:18<2:38:27, 6.11s/it] 73%|███████▎ | 4216/5772 [3:33:24<2:38:34, 6.11s/it] 73%|███████▎ | 4216/5772 [3:33:26<2:38:34, 6.11s/it] {'loss': 0.462, 'learning_rate': 3.5764239497668584e-06, 'epoch': 0.73} + 73%|███████▎ | 4216/5772 [3:33:26<2:38:34, 6.11s/it] {'loss': 0.462, 'learning_rate': 3.5764239497668584e-06, 'epoch': 0.73} + 73%|███████▎ | 4216/5772 [3:33:24<2:38:34, 6.11s/it] 73%|███████▎ | 4217/5772 [3:33:30<2:37:40, 6.08s/it] 73%|███████▎ | 4217/5772 [3:33:32<2:37:40, 6.08s/it] {'loss': 0.4613, 'learning_rate': 3.5721239031346067e-06, 'epoch': 0.73} + 73%|███████▎ | 4217/5772 [3:33:32<2:37:40, 6.08s/it] {'loss': 0.4613, 'learning_rate': 3.5721239031346067e-06, 'epoch': 0.73} + 73%|███████▎ | 4217/5772 [3:33:30<2:37:40, 6.08s/it] 73%|███████▎ | 4218/5772 [3:33:36<2:36:19, 6.04s/it] 73%|███████▎ | 4218/5772 [3:33:38<2:36:19, 6.04s/it] {'loss': 0.4637, 'learning_rate': 3.5678258809261935e-06, 'epoch': 0.73} + 73%|███████▎ | 4218/5772 [3:33:38<2:36:19, 6.04s/it] {'loss': 0.4637, 'learning_rate': 3.5678258809261935e-06, 'epoch': 0.73} + 73%|███████▎ | 4218/5772 [3:33:36<2:36:19, 6.04s/it] 73%|███████▎ | 4219/5772 [3:33:42<2:35:58, 6.03s/it] 73%|███████▎ | 4219/5772 [3:33:44<2:35:58, 6.03s/it] {'loss': 0.4509, 'learning_rate': 3.563529884495259e-06, 'epoch': 0.73} + 73%|███████▎ | 4219/5772 [3:33:44<2:35:58, 6.03s/it] {'loss': 0.4509, 'learning_rate': 3.563529884495259e-06, 'epoch': 0.73} + 73%|███████▎ | 4219/5772 [3:33:42<2:35:58, 6.03s/it] 73%|███████▎ | 4220/5772 [3:33:48<2:35:53, 6.03s/it] 73%|███████▎ | 4220/5772 [3:33:50<2:35:53, 6.03s/it] {'loss': 0.4659, 'learning_rate': 3.5592359151947974e-06, 'epoch': 0.73} + 73%|███████▎ | 4220/5772 [3:33:50<2:35:53, 6.03s/it] {'loss': 0.4659, 'learning_rate': 3.5592359151947974e-06, 'epoch': 0.73} + 73%|███████▎ | 4220/5772 [3:33:48<2:35:53, 6.03s/it] 73%|███████▎ | 4221/5772 [3:33:54<2:35:23, 6.01s/it] 73%|███████▎ | 4221/5772 [3:33:56<2:35:23, 6.01s/it] {'loss': 0.4512, 'learning_rate': 3.554943974377174e-06, 'epoch': 0.73} + 73%|███████▎ | 4221/5772 [3:33:56<2:35:23, 6.01s/it] {'loss': 0.4512, 'learning_rate': 3.554943974377174e-06, 'epoch': 0.73} + 73%|███████▎ | 4221/5772 [3:33:54<2:35:23, 6.01s/it] 73%|███████▎ | 4222/5772 [3:34:00<2:38:43, 6.14s/it] 73%|███████▎ | 4222/5772 [3:34:02<2:38:43, 6.14s/it] {'loss': 0.4715, 'learning_rate': 3.55065406339411e-06, 'epoch': 0.73} + 73%|███████▎ | 4222/5772 [3:34:02<2:38:43, 6.14s/it] {'loss': 0.4715, 'learning_rate': 3.55065406339411e-06, 'epoch': 0.73} + 73%|███████▎ | 4222/5772 [3:34:00<2:38:43, 6.14s/it] 73%|███████▎ | 4223/5772 [3:34:06<2:40:28, 6.22s/it] 73%|███████▎ | 4223/5772 [3:34:08<2:40:28, 6.22s/it] {'loss': 0.4478, 'learning_rate': 3.546366183596691e-06, 'epoch': 0.73} + 73%|███████▎ | 4223/5772 [3:34:08<2:40:28, 6.22s/it] {'loss': 0.4478, 'learning_rate': 3.546366183596691e-06, 'epoch': 0.73} + 73%|███████▎ | 4223/5772 [3:34:06<2:40:28, 6.22s/it] 73%|███████▎ | 4224/5772 [3:34:13<2:40:48, 6.23s/it] 73%|███████▎ | 4224/5772 [3:34:15<2:40:48, 6.23s/it] {'loss': 0.463, 'learning_rate': 3.5420803363353604e-06, 'epoch': 0.73} + 73%|███████▎ | 4224/5772 [3:34:15<2:40:48, 6.23s/it] {'loss': 0.463, 'learning_rate': 3.5420803363353604e-06, 'epoch': 0.73} + 73%|███████▎ | 4224/5772 [3:34:13<2:40:48, 6.23s/it] 73%|███████▎ | 4225/5772 [3:34:19<2:37:40, 6.12s/it] 73%|███████▎ | 4225/5772 [3:34:21<2:37:40, 6.12s/it] {'loss': 0.4407, 'learning_rate': 3.537796522959921e-06, 'epoch': 0.73} + 73%|███████▎ | 4225/5772 [3:34:21<2:37:40, 6.12s/it] {'loss': 0.4407, 'learning_rate': 3.537796522959921e-06, 'epoch': 0.73} + 73%|███████▎ | 4225/5772 [3:34:19<2:37:40, 6.12s/it] 73%|███████▎ | 4226/5772 [3:34:24<2:35:51, 6.05s/it] 73%|███████▎ | 4226/5772 [3:34:26<2:35:51, 6.05s/it] {'loss': 0.4692, 'learning_rate': 3.5335147448195406e-06, 'epoch': 0.73} + 73%|███████▎ | 4226/5772 [3:34:26<2:35:51, 6.05s/it] {'loss': 0.4692, 'learning_rate': 3.5335147448195406e-06, 'epoch': 0.73} + 73%|███████▎ | 4226/5772 [3:34:24<2:35:51, 6.05s/it] 73%|███████▎ | 4227/5772 [3:34:31<2:36:11, 6.07s/it] 73%|███████▎ | 4227/5772 [3:34:33<2:36:11, 6.07s/it] {'loss': 0.4446, 'learning_rate': 3.5292350032627344e-06, 'epoch': 0.73} + 73%|███████▎ | 4227/5772 [3:34:33<2:36:11, 6.07s/it] {'loss': 0.4446, 'learning_rate': 3.5292350032627344e-06, 'epoch': 0.73} + 73%|███████▎ | 4227/5772 [3:34:31<2:36:11, 6.07s/it] 73%|███████▎ | 4228/5772 [3:34:37<2:35:32, 6.04s/it] 73%|███████▎ | 4228/5772 [3:34:39<2:35:32, 6.04s/it] {'loss': 0.4563, 'learning_rate': 3.524957299637386e-06, 'epoch': 0.73} + 73%|███████▎ | 4228/5772 [3:34:39<2:35:32, 6.04s/it] {'loss': 0.4563, 'learning_rate': 3.524957299637386e-06, 'epoch': 0.73} + 73%|███████▎ | 4228/5772 [3:34:37<2:35:32, 6.04s/it] 73%|███████▎ | 4229/5772 [3:34:43<2:38:29, 6.16s/it] 73%|███████▎ | 4229/5772 [3:34:45<2:38:29, 6.16s/it] {'loss': 0.4592, 'learning_rate': 3.5206816352907347e-06, 'epoch': 0.73} + 73%|███████▎ | 4229/5772 [3:34:45<2:38:29, 6.16s/it] {'loss': 0.4592, 'learning_rate': 3.5206816352907347e-06, 'epoch': 0.73} + 73%|███████▎ | 4229/5772 [3:34:43<2:38:29, 6.16s/it] 73%|███████▎ | 4230/5772 [3:34:49<2:38:55, 6.18s/it] 73%|███████▎ | 4230/5772 [3:34:51<2:38:55, 6.18s/it] {'loss': 0.471, 'learning_rate': 3.5164080115693767e-06, 'epoch': 0.73} + 73%|███████▎ | 4230/5772 [3:34:51<2:38:55, 6.18s/it] {'loss': 0.471, 'learning_rate': 3.5164080115693767e-06, 'epoch': 0.73} + 73%|███████▎ | 4230/5772 [3:34:49<2:38:55, 6.18s/it] 73%|███████▎ | 4231/5772 [3:34:55<2:37:38, 6.14s/it] 73%|███████▎ | 4231/5772 [3:34:57<2:37:38, 6.14s/it] {'loss': 0.4637, 'learning_rate': 3.5121364298192673e-06, 'epoch': 0.73} + 73%|███████▎ | 4231/5772 [3:34:57<2:37:38, 6.14s/it] {'loss': 0.4637, 'learning_rate': 3.5121364298192673e-06, 'epoch': 0.73} + 73%|███████▎ | 4231/5772 [3:34:55<2:37:38, 6.14s/it] 73%|███████▎ | 4232/5772 [3:35:02<2:40:20, 6.25s/it] 73%|███████▎ | 4232/5772 [3:35:04<2:40:20, 6.25s/it] {'loss': 0.457, 'learning_rate': 3.507866891385716e-06, 'epoch': 0.73} + 73%|███████▎ | 4232/5772 [3:35:04<2:40:20, 6.25s/it] {'loss': 0.457, 'learning_rate': 3.507866891385716e-06, 'epoch': 0.73} + 73%|███████▎ | 4232/5772 [3:35:02<2:40:20, 6.25s/it] 73%|███████▎ | 4233/5772 [3:35:08<2:39:37, 6.22s/it] 73%|███████▎ | 4233/5772 [3:35:10<2:39:37, 6.22s/it] {'loss': 0.4632, 'learning_rate': 3.503599397613394e-06, 'epoch': 0.73} + 73%|███████▎ | 4233/5772 [3:35:10<2:39:37, 6.22s/it] {'loss': 0.4632, 'learning_rate': 3.503599397613394e-06, 'epoch': 0.73} + 73%|███████▎ | 4233/5772 [3:35:08<2:39:37, 6.22s/it] 73%|███████▎ | 4234/5772 [3:35:14<2:40:25, 6.26s/it] 73%|███████▎ | 4234/5772 [3:35:16<2:40:25, 6.26s/it] {'loss': 0.4578, 'learning_rate': 3.4993339498463197e-06, 'epoch': 0.73} + 73%|███████▎ | 4234/5772 [3:35:16<2:40:25, 6.26s/it] {'loss': 0.4578, 'learning_rate': 3.4993339498463197e-06, 'epoch': 0.73} + 73%|███████▎ | 4234/5772 [3:35:14<2:40:25, 6.26s/it] 73%|███████▎ | 4235/5772 [3:35:20<2:37:36, 6.15s/it] 73%|███████▎ | 4235/5772 [3:35:22<2:37:36, 6.15s/it] {'loss': 0.4661, 'learning_rate': 3.4950705494278723e-06, 'epoch': 0.73} + 73%|███████▎ | 4235/5772 [3:35:22<2:37:36, 6.15s/it] {'loss': 0.4661, 'learning_rate': 3.4950705494278723e-06, 'epoch': 0.73} + 73%|███████▎ | 4235/5772 [3:35:20<2:37:36, 6.15s/it] 73%|███████▎ | 4236/5772 [3:35:26<2:35:13, 6.06s/it] 73%|███████▎ | 4236/5772 [3:35:28<2:35:13, 6.06s/it] {'loss': 0.4629, 'learning_rate': 3.4908091977007896e-06, 'epoch': 0.73} + 73%|███████▎ | 4236/5772 [3:35:28<2:35:13, 6.06s/it] {'loss': 0.4629, 'learning_rate': 3.4908091977007896e-06, 'epoch': 0.73} + 73%|███████▎ | 4236/5772 [3:35:26<2:35:13, 6.06s/it] 73%|███████▎ | 4237/5772 [3:35:32<2:33:28, 6.00s/it] 73%|███████▎ | 4237/5772 [3:35:34<2:33:28, 6.00s/it] {'loss': 0.4547, 'learning_rate': 3.4865498960071576e-06, 'epoch': 0.73} + 73%|███████▎ | 4237/5772 [3:35:34<2:33:28, 6.00s/it] {'loss': 0.4547, 'learning_rate': 3.4865498960071576e-06, 'epoch': 0.73} + 73%|███████▎ | 4237/5772 [3:35:32<2:33:28, 6.00s/it] 73%|███████▎ | 4238/5772 [3:35:38<2:32:31, 5.97s/it] 73%|███████▎ | 4238/5772 [3:35:40<2:32:31, 5.97s/it] {'loss': 0.452, 'learning_rate': 3.4822926456884243e-06, 'epoch': 0.73} + 73%|███████▎ | 4238/5772 [3:35:40<2:32:31, 5.97s/it] {'loss': 0.452, 'learning_rate': 3.4822926456884243e-06, 'epoch': 0.73} + 73%|███████▎ | 4238/5772 [3:35:38<2:32:31, 5.97s/it] 73%|███████▎ | 4239/5772 [3:35:44<2:32:25, 5.97s/it] 73%|███████▎ | 4239/5772 [3:35:46<2:32:25, 5.97s/it] {'loss': 0.4657, 'learning_rate': 3.4780374480853774e-06, 'epoch': 0.73} + 73%|███████▎ | 4239/5772 [3:35:46<2:32:25, 5.97s/it] {'loss': 0.4657, 'learning_rate': 3.4780374480853774e-06, 'epoch': 0.73} + 73%|███████▎ | 4239/5772 [3:35:44<2:32:25, 5.97s/it] 73%|███████▎ | 4240/5772 [3:35:50<2:33:30, 6.01s/it] 73%|███████▎ | 4240/5772 [3:35:52<2:33:30, 6.01s/it] {'loss': 0.4537, 'learning_rate': 3.47378430453818e-06, 'epoch': 0.73} + 73%|███████▎ | 4240/5772 [3:35:52<2:33:30, 6.01s/it] {'loss': 0.4537, 'learning_rate': 3.47378430453818e-06, 'epoch': 0.73} + 73%|███████▎ | 4240/5772 [3:35:50<2:33:30, 6.01s/it] 73%|███████▎ | 4241/5772 [3:35:56<2:34:03, 6.04s/it] 73%|███████▎ | 4241/5772 [3:35:58<2:34:03, 6.04s/it] {'loss': 0.4718, 'learning_rate': 3.469533216386328e-06, 'epoch': 0.73} + 73%|███████▎ | 4241/5772 [3:35:58<2:34:03, 6.04s/it] {'loss': 0.4718, 'learning_rate': 3.469533216386328e-06, 'epoch': 0.73} + 73%|███████▎ | 4241/5772 [3:35:56<2:34:03, 6.04s/it] 73%|███████▎ | 4242/5772 [3:36:02<2:37:13, 6.17s/it] 73%|███████▎ | 4242/5772 [3:36:04<2:37:13, 6.17s/it] {'loss': 0.4629, 'learning_rate': 3.465284184968679e-06, 'epoch': 0.73} + 73%|███████▎ | 4242/5772 [3:36:04<2:37:13, 6.17s/it] {'loss': 0.4629, 'learning_rate': 3.465284184968679e-06, 'epoch': 0.73} + 73%|███████▎ | 4242/5772 [3:36:02<2:37:13, 6.17s/it] 74%|███████▎ | 4243/5772 [3:36:08<2:34:15, 6.05s/it] 74%|███████▎ | 4243/5772 [3:36:10<2:34:15, 6.05s/it] {'loss': 0.4636, 'learning_rate': 3.4610372116234425e-06, 'epoch': 0.74} + 74%|███████▎ | 4243/5772 [3:36:10<2:34:15, 6.05s/it] {'loss': 0.4636, 'learning_rate': 3.4610372116234425e-06, 'epoch': 0.74} + 74%|███████▎ | 4243/5772 [3:36:08<2:34:15, 6.05s/it] 74%|███████▎ | 4244/5772 [3:36:15<2:36:41, 6.15s/it] 74%|███████▎ | 4244/5772 [3:36:17<2:36:41, 6.15s/it] {'loss': 0.4467, 'learning_rate': 3.456792297688181e-06, 'epoch': 0.74} + 74%|███████▎ | 4244/5772 [3:36:17<2:36:41, 6.15s/it] {'loss': 0.4467, 'learning_rate': 3.456792297688181e-06, 'epoch': 0.74} + 74%|███████▎ | 4244/5772 [3:36:15<2:36:41, 6.15s/it] 74%|███████▎ | 4245/5772 [3:36:21<2:35:54, 6.13s/it] 74%|███████▎ | 4245/5772 [3:36:23<2:35:55, 6.13s/it] {'loss': 0.4652, 'learning_rate': 3.4525494444998064e-06, 'epoch': 0.74} + 74%|███████▎ | 4245/5772 [3:36:23<2:35:55, 6.13s/it] {'loss': 0.4652, 'learning_rate': 3.4525494444998064e-06, 'epoch': 0.74} + 74%|███████▎ | 4245/5772 [3:36:21<2:35:54, 6.13s/it] 74%|███████▎ | 4246/5772 [3:36:27<2:35:31, 6.11s/it] 74%|███████▎ | 4246/5772 [3:36:29<2:35:30, 6.11s/it] {'loss': 0.4378, 'learning_rate': 3.4483086533945776e-06, 'epoch': 0.74} + 74%|███████▎ | 4246/5772 [3:36:29<2:35:30, 6.11s/it] {'loss': 0.4378, 'learning_rate': 3.4483086533945776e-06, 'epoch': 0.74} + 74%|███████▎ | 4246/5772 [3:36:27<2:35:31, 6.11s/it] 74%|███████▎ | 4247/5772 [3:36:33<2:35:21, 6.11s/it] 74%|███████▎ | 4247/5772 [3:36:35<2:35:21, 6.11s/it] {'loss': 0.4639, 'learning_rate': 3.4440699257081113e-06, 'epoch': 0.74} + 74%|███████▎ | 4247/5772 [3:36:35<2:35:21, 6.11s/it] {'loss': 0.4639, 'learning_rate': 3.4440699257081113e-06, 'epoch': 0.74} + 74%|███████▎ | 4247/5772 [3:36:33<2:35:21, 6.11s/it] 74%|███████▎ | 4248/5772 [3:36:39<2:37:00, 6.18s/it] 74%|███████▎ | 4248/5772 [3:36:41<2:37:00, 6.18s/it] {'loss': 0.4473, 'learning_rate': 3.4398332627753704e-06, 'epoch': 0.74} + 74%|███████▎ | 4248/5772 [3:36:41<2:37:00, 6.18s/it] {'loss': 0.4473, 'learning_rate': 3.4398332627753704e-06, 'epoch': 0.74} + 74%|███████▎ | 4248/5772 [3:36:39<2:37:00, 6.18s/it] 74%|███████▎ | 4249/5772 [3:36:45<2:35:24, 6.12s/it] 74%|███████▎ | 4249/5772 [3:36:47<2:35:24, 6.12s/it] {'loss': 0.4607, 'learning_rate': 3.435598665930672e-06, 'epoch': 0.74} + 74%|███████▎ | 4249/5772 [3:36:47<2:35:24, 6.12s/it] {'loss': 0.4607, 'learning_rate': 3.435598665930672e-06, 'epoch': 0.74} + 74%|███████▎ | 4249/5772 [3:36:45<2:35:24, 6.12s/it]3 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +0 15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 74%|███████▎ | 4250/5772 [3:36:51<2:33:49, 6.06s/it]8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 74%|███████▎ | 4250/5772 [3:36:53<2:33:49, 6.06s/it]12 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4599, 'learning_rate': 3.431366136507669e-06, 'epoch': 0.74} + 74%|███████▎ | 4250/5772 [3:36:53<2:33:49, 6.06s/it] {'loss': 0.4599, 'learning_rate': 3.431366136507669e-06, 'epoch': 0.74} + 74%|███████▎ | 4250/5772 [3:36:51<2:33:49, 6.06s/it] 74%|███████▎ | 4251/5772 [3:36:57<2:32:11, 6.00s/it] 74%|███████▎ | 4251/5772 [3:36:59<2:32:11, 6.00s/it] {'loss': 0.4609, 'learning_rate': 3.4271356758393827e-06, 'epoch': 0.74} + 74%|███████▎ | 4251/5772 [3:36:59<2:32:11, 6.00s/it] {'loss': 0.4609, 'learning_rate': 3.4271356758393827e-06, 'epoch': 0.74} + 74%|███████▎ | 4251/5772 [3:36:57<2:32:11, 6.00s/it] 74%|███████▎ | 4252/5772 [3:37:03<2:33:03, 6.04s/it] 74%|███████▎ | 4252/5772 [3:37:05<2:33:02, 6.04s/it] {'loss': 0.4724, 'learning_rate': 3.4229072852581735e-06, 'epoch': 0.74} + 74%|███████▎ | 4252/5772 [3:37:05<2:33:02, 6.04s/it] {'loss': 0.4724, 'learning_rate': 3.4229072852581735e-06, 'epoch': 0.74} + 74%|███████▎ | 4252/5772 [3:37:03<2:33:03, 6.04s/it] 74%|███████▎ | 4253/5772 [3:37:09<2:33:18, 6.06s/it] 74%|███████▎ | 4253/5772 [3:37:11<2:33:18, 6.06s/it] {'loss': 0.4685, 'learning_rate': 3.4186809660957433e-06, 'epoch': 0.74} + 74%|███████▎ | 4253/5772 [3:37:11<2:33:18, 6.06s/it] {'loss': 0.4685, 'learning_rate': 3.4186809660957433e-06, 'epoch': 0.74} + 74%|███████▎ | 4253/5772 [3:37:09<2:33:18, 6.06s/it] 74%|███████▎ | 4254/5772 [3:37:15<2:32:46, 6.04s/it] 74%|███████▎ | 4254/5772 [3:37:17<2:32:46, 6.04s/it] {'loss': 0.4552, 'learning_rate': 3.41445671968315e-06, 'epoch': 0.74} + 74%|███████▎ | 4254/5772 [3:37:17<2:32:46, 6.04s/it] {'loss': 0.4552, 'learning_rate': 3.41445671968315e-06, 'epoch': 0.74} + 74%|███████▎ | 4254/5772 [3:37:15<2:32:46, 6.04s/it] 74%|███████▎ | 4255/5772 [3:37:21<2:30:35, 5.96s/it] 74%|███████▎ | 4255/5772 [3:37:23<2:30:35, 5.96s/it] {'loss': 0.458, 'learning_rate': 3.410234547350797e-06, 'epoch': 0.74} + 74%|███████▎ | 4255/5772 [3:37:23<2:30:35, 5.96s/it] {'loss': 0.458, 'learning_rate': 3.410234547350797e-06, 'epoch': 0.74} + 74%|███████▎ | 4255/5772 [3:37:21<2:30:35, 5.96s/it] 74%|███████▎ | 4256/5772 [3:37:27<2:34:41, 6.12s/it] 74%|███████▎ | 4256/5772 [3:37:29<2:34:41, 6.12s/it] {'loss': 0.4633, 'learning_rate': 3.4060144504284375e-06, 'epoch': 0.74} + 74%|███████▎ | 4256/5772 [3:37:29<2:34:41, 6.12s/it] {'loss': 0.4633, 'learning_rate': 3.4060144504284375e-06, 'epoch': 0.74} + 74%|███████▎ | 4256/5772 [3:37:27<2:34:41, 6.12s/it] 74%|███████▍ | 4257/5772 [3:37:34<2:34:12, 6.11s/it] 74%|███████▍ | 4257/5772 [3:37:36<2:34:12, 6.11s/it] {'loss': 0.4646, 'learning_rate': 3.4017964302451578e-06, 'epoch': 0.74} + 74%|███████▍ | 4257/5772 [3:37:36<2:34:12, 6.11s/it] {'loss': 0.4646, 'learning_rate': 3.4017964302451578e-06, 'epoch': 0.74} + 74%|███████▍ | 4257/5772 [3:37:34<2:34:12, 6.11s/it] 74%|███████▍ | 4258/5772 [3:37:39<2:32:24, 6.04s/it] 74%|███████▍ | 4258/5772 [3:37:41<2:32:23, 6.04s/it] {'loss': 0.4643, 'learning_rate': 3.3975804881294095e-06, 'epoch': 0.74} + 74%|███████▍ | 4258/5772 [3:37:41<2:32:23, 6.04s/it] {'loss': 0.4643, 'learning_rate': 3.3975804881294095e-06, 'epoch': 0.74} + 74%|███████▍ | 4258/5772 [3:37:39<2:32:24, 6.04s/it] 74%|███████▍ | 4259/5772 [3:37:46<2:33:48, 6.10s/it] 74%|███████▍ | 4259/5772 [3:37:48<2:33:49, 6.10s/it] {'loss': 0.4545, 'learning_rate': 3.393366625408979e-06, 'epoch': 0.74} + 74%|███████▍ | 4259/5772 [3:37:48<2:33:49, 6.10s/it] {'loss': 0.4545, 'learning_rate': 3.393366625408979e-06, 'epoch': 0.74} + 74%|███████▍ | 4259/5772 [3:37:46<2:33:48, 6.10s/it] 74%|███████▍ | 4260/5772 [3:37:52<2:38:14, 6.28s/it] 74%|███████▍ | 4260/5772 [3:37:54<2:38:14, 6.28s/it] {'loss': 0.4573, 'learning_rate': 3.3891548434109942e-06, 'epoch': 0.74} + 74%|███████▍ | 4260/5772 [3:37:54<2:38:14, 6.28s/it] {'loss': 0.4573, 'learning_rate': 3.3891548434109942e-06, 'epoch': 0.74} + 74%|███████▍ | 4260/5772 [3:37:52<2:38:14, 6.28s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (5014 > 4096). Running this sequence through the model will result in indexing errors + 74%|███████▍ | 4261/5772 [3:37:58<2:36:42, 6.22s/it] 74%|███████▍ | 4261/5772 [3:38:00<2:36:42, 6.22s/it] {'loss': 0.4598, 'learning_rate': 3.384945143461936e-06, 'epoch': 0.74} + 74%|███████▍ | 4261/5772 [3:38:00<2:36:42, 6.22s/it] {'loss': 0.4598, 'learning_rate': 3.384945143461936e-06, 'epoch': 0.74} + 74%|███████▍ | 4261/5772 [3:37:58<2:36:42, 6.22s/it] 74%|███████▍ | 4262/5772 [3:38:04<2:32:34, 6.06s/it] 74%|███████▍ | 4262/5772 [3:38:06<2:32:34, 6.06s/it] {'loss': 0.4526, 'learning_rate': 3.380737526887624e-06, 'epoch': 0.74} + 74%|███████▍ | 4262/5772 [3:38:06<2:32:34, 6.06s/it] {'loss': 0.4526, 'learning_rate': 3.380737526887624e-06, 'epoch': 0.74} + 74%|███████▍ | 4262/5772 [3:38:04<2:32:34, 6.06s/it] 74%|███████▍ | 4263/5772 [3:38:10<2:32:29, 6.06s/it] 74%|███████▍ | 4263/5772 [3:38:12<2:32:29, 6.06s/it] {'loss': 0.4548, 'learning_rate': 3.376531995013228e-06, 'epoch': 0.74} + 74%|███████▍ | 4263/5772 [3:38:12<2:32:29, 6.06s/it] {'loss': 0.4548, 'learning_rate': 3.376531995013228e-06, 'epoch': 0.74} + 74%|███████▍ | 4263/5772 [3:38:10<2:32:29, 6.06s/it] 74%|███████▍ | 4264/5772 [3:38:16<2:31:23, 6.02s/it] 74%|███████▍ | 4264/5772 [3:38:18<2:31:23, 6.02s/it] {'loss': 0.4643, 'learning_rate': 3.3723285491632508e-06, 'epoch': 0.74} + 74%|███████▍ | 4264/5772 [3:38:18<2:31:23, 6.02s/it] {'loss': 0.4643, 'learning_rate': 3.3723285491632508e-06, 'epoch': 0.74} + 74%|███████▍ | 4264/5772 [3:38:16<2:31:23, 6.02s/it] 74%|███████▍ | 4265/5772 [3:38:22<2:33:22, 6.11s/it] 74%|███████▍ | 4265/5772 [3:38:24<2:33:22, 6.11s/it] {'loss': 0.4648, 'learning_rate': 3.368127190661543e-06, 'epoch': 0.74} + 74%|███████▍ | 4265/5772 [3:38:24<2:33:22, 6.11s/it] {'loss': 0.4648, 'learning_rate': 3.368127190661543e-06, 'epoch': 0.74} + 74%|███████▍ | 4265/5772 [3:38:22<2:33:22, 6.11s/it] 74%|███████▍ | 4266/5772 [3:38:29<2:33:10, 6.10s/it] 74%|███████▍ | 4266/5772 [3:38:31<2:33:10, 6.10s/it] {'loss': 0.456, 'learning_rate': 3.3639279208313113e-06, 'epoch': 0.74} + 74%|███████▍ | 4266/5772 [3:38:31<2:33:10, 6.10s/it] {'loss': 0.456, 'learning_rate': 3.3639279208313113e-06, 'epoch': 0.74} + 74%|███████▍ | 4266/5772 [3:38:29<2:33:10, 6.10s/it] 74%|███████▍ | 4267/5772 [3:38:35<2:33:33, 6.12s/it] 74%|███████▍ | 4267/5772 [3:38:37<2:33:33, 6.12s/it] {'loss': 0.4522, 'learning_rate': 3.35973074099508e-06, 'epoch': 0.74} + 74%|███████▍ | 4267/5772 [3:38:37<2:33:33, 6.12s/it] {'loss': 0.4522, 'learning_rate': 3.35973074099508e-06, 'epoch': 0.74} + 74%|███████▍ | 4267/5772 [3:38:35<2:33:33, 6.12s/it] 74%|███████▍ | 4268/5772 [3:38:41<2:35:26, 6.20s/it] 74%|███████▍ | 4268/5772 [3:38:43<2:35:26, 6.20s/it] {'loss': 0.4633, 'learning_rate': 3.3555356524747307e-06, 'epoch': 0.74} + 74%|███████▍ | 4268/5772 [3:38:43<2:35:26, 6.20s/it] {'loss': 0.4633, 'learning_rate': 3.3555356524747307e-06, 'epoch': 0.74} + 74%|███████▍ | 4268/5772 [3:38:41<2:35:26, 6.20s/it] 74%|███████▍ | 4269/5772 [3:38:47<2:32:29, 6.09s/it] 74%|███████▍ | 4269/5772 [3:38:49<2:32:29, 6.09s/it] {'loss': 0.4587, 'learning_rate': 3.3513426565914854e-06, 'epoch': 0.74} + 74%|███████▍ | 4269/5772 [3:38:49<2:32:29, 6.09s/it] {'loss': 0.4587, 'learning_rate': 3.3513426565914854e-06, 'epoch': 0.74} + 74%|███████▍ | 4269/5772 [3:38:47<2:32:29, 6.09s/it] 74%|███████▍ | 4270/5772 [3:38:53<2:34:27, 6.17s/it] 74%|███████▍ | 4270/5772 [3:38:55<2:34:27, 6.17s/it] {'loss': 0.4573, 'learning_rate': 3.3471517546659072e-06, 'epoch': 0.74} + 74%|███████▍ | 4270/5772 [3:38:55<2:34:27, 6.17s/it] {'loss': 0.4573, 'learning_rate': 3.3471517546659072e-06, 'epoch': 0.74} + 74%|███████▍ | 4270/5772 [3:38:53<2:34:27, 6.17s/it] 74%|███████▍ | 4271/5772 [3:39:00<2:37:07, 6.28s/it] 74%|███████▍ | 4271/5772 [3:39:02<2:37:07, 6.28s/it] {'loss': 0.4629, 'learning_rate': 3.3429629480178905e-06, 'epoch': 0.74} + 74%|███████▍ | 4271/5772 [3:39:02<2:37:07, 6.28s/it] {'loss': 0.4629, 'learning_rate': 3.3429629480178905e-06, 'epoch': 0.74} + 74%|███████▍ | 4271/5772 [3:39:00<2:37:07, 6.28s/it] 74%|███████▍ | 4272/5772 [3:39:06<2:36:28, 6.26s/it] 74%|███████▍ | 4272/5772 [3:39:08<2:36:28, 6.26s/it] {'loss': 0.4619, 'learning_rate': 3.3387762379666753e-06, 'epoch': 0.74} + 74%|███████▍ | 4272/5772 [3:39:08<2:36:28, 6.26s/it] {'loss': 0.4619, 'learning_rate': 3.3387762379666753e-06, 'epoch': 0.74} + 74%|███████▍ | 4272/5772 [3:39:06<2:36:28, 6.26s/it] 74%|███████▍ | 4273/5772 [3:39:12<2:34:30, 6.18s/it] 74%|███████▍ | 4273/5772 [3:39:14<2:34:30, 6.18s/it] {'loss': 0.4738, 'learning_rate': 3.3345916258308565e-06, 'epoch': 0.74} + 74%|███████▍ | 4273/5772 [3:39:14<2:34:30, 6.18s/it] {'loss': 0.4738, 'learning_rate': 3.3345916258308565e-06, 'epoch': 0.74} + 74%|███████▍ | 4273/5772 [3:39:12<2:34:30, 6.18s/it] 74%|███████▍ | 4274/5772 [3:39:19<2:37:43, 6.32s/it] 74%|███████▍ | 4274/5772 [3:39:21<2:37:43, 6.32s/it] {'loss': 0.4561, 'learning_rate': 3.33040911292834e-06, 'epoch': 0.74} + 74%|███████▍ | 4274/5772 [3:39:21<2:37:43, 6.32s/it] {'loss': 0.4561, 'learning_rate': 3.33040911292834e-06, 'epoch': 0.74} + 74%|███████▍ | 4274/5772 [3:39:19<2:37:43, 6.32s/it] 74%|███████▍ | 4275/5772 [3:39:25<2:36:11, 6.26s/it] 74%|███████▍ | 4275/5772 [3:39:27<2:36:10, 6.26s/it] {'loss': 0.4697, 'learning_rate': 3.3262287005763915e-06, 'epoch': 0.74} + 74%|███████▍ | 4275/5772 [3:39:27<2:36:10, 6.26s/it] {'loss': 0.4697, 'learning_rate': 3.3262287005763915e-06, 'epoch': 0.74} + 74%|███████▍ | 4275/5772 [3:39:25<2:36:11, 6.26s/it] 74%|███████▍ | 4276/5772 [3:39:31<2:33:28, 6.16s/it] 74%|███████▍ | 4276/5772 [3:39:33<2:33:28, 6.16s/it] {'loss': 0.4553, 'learning_rate': 3.3220503900916068e-06, 'epoch': 0.74} + 74%|███████▍ | 4276/5772 [3:39:33<2:33:28, 6.16s/it] {'loss': 0.4553, 'learning_rate': 3.3220503900916068e-06, 'epoch': 0.74} + 74%|███████▍ | 4276/5772 [3:39:31<2:33:28, 6.16s/it] 74%|███████▍ | 4277/5772 [3:39:37<2:33:20, 6.15s/it] 74%|███████▍ | 4277/5772 [3:39:39<2:33:20, 6.15s/it] {'loss': 0.4582, 'learning_rate': 3.3178741827899253e-06, 'epoch': 0.74} + 74%|███████▍ | 4277/5772 [3:39:39<2:33:20, 6.15s/it] {'loss': 0.4582, 'learning_rate': 3.3178741827899253e-06, 'epoch': 0.74} + 74%|███████▍ | 4277/5772 [3:39:37<2:33:20, 6.15s/it] 74%|███████▍ | 4278/5772 [3:39:43<2:34:57, 6.22s/it] 74%|███████▍ | 4278/5772 [3:39:45<2:34:56, 6.22s/it] {'loss': 0.4789, 'learning_rate': 3.3137000799866148e-06, 'epoch': 0.74} + 74%|███████▍ | 4278/5772 [3:39:45<2:34:56, 6.22s/it] {'loss': 0.4789, 'learning_rate': 3.3137000799866148e-06, 'epoch': 0.74} + 74%|███████▍ | 4278/5772 [3:39:43<2:34:57, 6.22s/it] 74%|███████▍ | 4279/5772 [3:39:49<2:33:41, 6.18s/it] 74%|███████▍ | 4279/5772 [3:39:51<2:33:41, 6.18s/it] {'loss': 0.4688, 'learning_rate': 3.309528082996287e-06, 'epoch': 0.74} + 74%|███████▍ | 4279/5772 [3:39:51<2:33:41, 6.18s/it] {'loss': 0.4688, 'learning_rate': 3.309528082996287e-06, 'epoch': 0.74} + 74%|███████▍ | 4279/5772 [3:39:49<2:33:41, 6.18s/it] 74%|███████▍ | 4280/5772 [3:39:56<2:39:05, 6.40s/it] 74%|███████▍ | 4280/5772 [3:39:58<2:39:05, 6.40s/it] {'loss': 0.4618, 'learning_rate': 3.3053581931328914e-06, 'epoch': 0.74} + 74%|███████▍ | 4280/5772 [3:39:58<2:39:05, 6.40s/it] {'loss': 0.4618, 'learning_rate': 3.3053581931328914e-06, 'epoch': 0.74} + 74%|███████▍ | 4280/5772 [3:39:56<2:39:05, 6.40s/it] 74%|███████▍ | 4281/5772 [3:40:03<2:39:42, 6.43s/it] 74%|███████▍ | 4281/5772 [3:40:05<2:39:42, 6.43s/it] {'loss': 0.4571, 'learning_rate': 3.3011904117097093e-06, 'epoch': 0.74} + 74%|███████▍ | 4281/5772 [3:40:05<2:39:42, 6.43s/it] {'loss': 0.4571, 'learning_rate': 3.3011904117097093e-06, 'epoch': 0.74} + 74%|███████▍ | 4281/5772 [3:40:03<2:39:42, 6.43s/it] 74%|███████▍ | 4282/5772 [3:40:09<2:40:58, 6.48s/it] 74%|███████▍ | 4282/5772 [3:40:11<2:40:58, 6.48s/it] {'loss': 0.443, 'learning_rate': 3.297024740039366e-06, 'epoch': 0.74} + 74%|███████▍ | 4282/5772 [3:40:11<2:40:58, 6.48s/it] {'loss': 0.443, 'learning_rate': 3.297024740039366e-06, 'epoch': 0.74} + 74%|███████▍ | 4282/5772 [3:40:09<2:40:58, 6.48s/it] 74%|███████▍ | 4283/5772 [3:40:16<2:44:19, 6.62s/it] 74%|███████▍ | 4283/5772 [3:40:18<2:44:19, 6.62s/it] {'loss': 0.4692, 'learning_rate': 3.292861179433805e-06, 'epoch': 0.74} + 74%|███████▍ | 4283/5772 [3:40:18<2:44:19, 6.62s/it] {'loss': 0.4692, 'learning_rate': 3.292861179433805e-06, 'epoch': 0.74} + 74%|███████▍ | 4283/5772 [3:40:16<2:44:19, 6.62s/it] 74%|███████▍ | 4284/5772 [3:40:22<2:37:06, 6.34s/it] 74%|███████▍ | 4284/5772 [3:40:24<2:37:07, 6.34s/it] {'loss': 0.4474, 'learning_rate': 3.28869973120433e-06, 'epoch': 0.74} + 74%|███████▍ | 4284/5772 [3:40:24<2:37:07, 6.34s/it] {'loss': 0.4474, 'learning_rate': 3.28869973120433e-06, 'epoch': 0.74} + 74%|███████▍ | 4284/5772 [3:40:22<2:37:06, 6.34s/it] 74%|███████▍ | 4285/5772 [3:40:28<2:38:10, 6.38s/it] 74%|███████▍ | 4285/5772 [3:40:30<2:38:10, 6.38s/it] {'loss': 0.4649, 'learning_rate': 3.2845403966615574e-06, 'epoch': 0.74} + 74%|███████▍ | 4285/5772 [3:40:30<2:38:10, 6.38s/it] {'loss': 0.4649, 'learning_rate': 3.2845403966615574e-06, 'epoch': 0.74} + 74%|███████▍ | 4285/5772 [3:40:28<2:38:10, 6.38s/it] 74%|███████▍ | 4286/5772 [3:40:34<2:35:21, 6.27s/it] 74%|███████▍ | 4286/5772 [3:40:36<2:35:21, 6.27s/it] {'loss': 0.4625, 'learning_rate': 3.2803831771154483e-06, 'epoch': 0.74} + 74%|███████▍ | 4286/5772 [3:40:36<2:35:21, 6.27s/it] {'loss': 0.4625, 'learning_rate': 3.2803831771154483e-06, 'epoch': 0.74} + 74%|███████▍ | 4286/5772 [3:40:34<2:35:21, 6.27s/it] 74%|███████▍ | 4287/5772 [3:40:41<2:35:45, 6.29s/it] 74%|███████▍ | 4287/5772 [3:40:43<2:35:45, 6.29s/it] {'loss': 0.4664, 'learning_rate': 3.276228073875296e-06, 'epoch': 0.74} + 74%|███████▍ | 4287/5772 [3:40:43<2:35:45, 6.29s/it] {'loss': 0.4664, 'learning_rate': 3.276228073875296e-06, 'epoch': 0.74} + 74%|███████▍ | 4287/5772 [3:40:41<2:35:45, 6.29s/it] 74%|███████▍ | 4288/5772 [3:40:47<2:33:53, 6.22s/it] 74%|███████▍ | 4288/5772 [3:40:49<2:33:53, 6.22s/it] {'loss': 0.4606, 'learning_rate': 3.2720750882497276e-06, 'epoch': 0.74} + 74%|███████▍ | 4288/5772 [3:40:49<2:33:53, 6.22s/it] {'loss': 0.4606, 'learning_rate': 3.2720750882497276e-06, 'epoch': 0.74} + 74%|███████▍ | 4288/5772 [3:40:47<2:33:53, 6.22s/it] 74%|███████▍ | 4289/5772 [3:40:53<2:30:26, 6.09s/it] 74%|███████▍ | 4289/5772 [3:40:55<2:30:26, 6.09s/it] {'loss': 0.4553, 'learning_rate': 3.2679242215467066e-06, 'epoch': 0.74} + 74%|███████▍ | 4289/5772 [3:40:55<2:30:26, 6.09s/it] {'loss': 0.4553, 'learning_rate': 3.2679242215467066e-06, 'epoch': 0.74} + 74%|███████▍ | 4289/5772 [3:40:53<2:30:26, 6.09s/it] 74%|███████▍ | 4290/5772 [3:40:59<2:30:41, 6.10s/it] 74%|███████▍ | 4290/5772 [3:41:01<2:30:41, 6.10s/it] {'loss': 0.4546, 'learning_rate': 3.263775475073514e-06, 'epoch': 0.74} + 74%|███████▍ | 4290/5772 [3:41:01<2:30:41, 6.10s/it] {'loss': 0.4546, 'learning_rate': 3.263775475073514e-06, 'epoch': 0.74} + 74%|███████▍ | 4290/5772 [3:40:59<2:30:41, 6.10s/it] 74%|███████▍ | 4291/5772 [3:41:05<2:29:48, 6.07s/it] 74%|███████▍ | 4291/5772 [3:41:07<2:29:48, 6.07s/it]{'loss': 0.4592, 'learning_rate': 3.259628850136789e-06, 'epoch': 0.74} + 74%|███████▍ | 4291/5772 [3:41:05<2:29:48, 6.07s/it]{'loss': 0.4592, 'learning_rate': 3.259628850136789e-06, 'epoch': 0.74} + 74%|███████▍ | 4291/5772 [3:41:07<2:29:48, 6.07s/it] 74%|███████▍ | 4292/5772 [3:41:11<2:30:52, 6.12s/it] 74%|███████▍ | 4292/5772 [3:41:13<2:30:53, 6.12s/it] {'loss': 0.464, 'learning_rate': 3.255484348042478e-06, 'epoch': 0.74} + 74%|███████▍ | 4292/5772 [3:41:13<2:30:53, 6.12s/it] {'loss': 0.464, 'learning_rate': 3.255484348042478e-06, 'epoch': 0.74} + 74%|███████▍ | 4292/5772 [3:41:11<2:30:52, 6.12s/it] 74%|███████▍ | 4293/5772 [3:41:17<2:29:51, 6.08s/it] 74%|███████▍ | 4293/5772 [3:41:19<2:29:50, 6.08s/it] {'loss': 0.4555, 'learning_rate': 3.2513419700958715e-06, 'epoch': 0.74} + 74%|███████▍ | 4293/5772 [3:41:19<2:29:50, 6.08s/it] {'loss': 0.4555, 'learning_rate': 3.2513419700958715e-06, 'epoch': 0.74} + 74%|███████▍ | 4293/5772 [3:41:17<2:29:51, 6.08s/it] 74%|███████▍ | 4294/5772 [3:41:23<2:31:25, 6.15s/it] 74%|███████▍ | 4294/5772 [3:41:25<2:31:25, 6.15s/it] {'loss': 0.4587, 'learning_rate': 3.2472017176015893e-06, 'epoch': 0.74} + 74%|███████▍ | 4294/5772 [3:41:25<2:31:25, 6.15s/it] {'loss': 0.4587, 'learning_rate': 3.2472017176015893e-06, 'epoch': 0.74} + 74%|███████▍ | 4294/5772 [3:41:23<2:31:25, 6.15s/it] 74%|███████▍ | 4295/5772 [3:41:29<2:31:16, 6.15s/it] 74%|███████▍ | 4295/5772 [3:41:31<2:31:16, 6.15s/it] {'loss': 0.4514, 'learning_rate': 3.24306359186358e-06, 'epoch': 0.74} + 74%|███████▍ | 4295/5772 [3:41:31<2:31:16, 6.15s/it] {'loss': 0.4514, 'learning_rate': 3.24306359186358e-06, 'epoch': 0.74} + 74%|███████▍ | 4295/5772 [3:41:29<2:31:16, 6.15s/it] 74%|███████▍ | 4296/5772 [3:41:36<2:33:17, 6.23s/it] 74%|███████▍ | 4296/5772 [3:41:38<2:33:17, 6.23s/it] {'loss': 0.4553, 'learning_rate': 3.238927594185127e-06, 'epoch': 0.74} + 74%|███████▍ | 4296/5772 [3:41:38<2:33:17, 6.23s/it] {'loss': 0.4553, 'learning_rate': 3.238927594185127e-06, 'epoch': 0.74} + 74%|███████▍ | 4296/5772 [3:41:36<2:33:17, 6.23s/it] 74%|███████▍ | 4297/5772 [3:41:42<2:33:59, 6.26s/it] 74%|███████▍ | 4297/5772 [3:41:44<2:33:59, 6.26s/it] {'loss': 0.461, 'learning_rate': 3.2347937258688342e-06, 'epoch': 0.74} + 74%|███████▍ | 4297/5772 [3:41:44<2:33:59, 6.26s/it] {'loss': 0.461, 'learning_rate': 3.2347937258688342e-06, 'epoch': 0.74} + 74%|███████▍ | 4297/5772 [3:41:42<2:33:59, 6.26s/it] 74%|███████▍ | 4298/5772 [3:41:48<2:31:19, 6.16s/it] 74%|███████▍ | 4298/5772 [3:41:50<2:31:19, 6.16s/it] {'loss': 0.4527, 'learning_rate': 3.2306619882166414e-06, 'epoch': 0.74} + 74%|███████▍ | 4298/5772 [3:41:50<2:31:19, 6.16s/it] {'loss': 0.4527, 'learning_rate': 3.2306619882166414e-06, 'epoch': 0.74} + 74%|███████▍ | 4298/5772 [3:41:48<2:31:19, 6.16s/it] 74%|███████▍ | 4299/5772 [3:41:54<2:31:17, 6.16s/it] 74%|███████▍ | 4299/5772 [3:41:56<2:31:17, 6.16s/it] {'loss': 0.4625, 'learning_rate': 3.226532382529819e-06, 'epoch': 0.74} + 74%|███████▍ | 4299/5772 [3:41:56<2:31:17, 6.16s/it] {'loss': 0.4625, 'learning_rate': 3.226532382529819e-06, 'epoch': 0.74} + 74%|███████▍ | 4299/5772 [3:41:54<2:31:17, 6.16s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 74%|███████▍ | 4300/5772 [3:42:01<2:33:55, 6.27s/it]8 72 AutoResumeHook: Checking whether to suspend... + 74%|███████▍ | 4300/5772 [3:42:03<2:33:55, 6.27s/it]AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4595, 'learning_rate': 3.2224049101089616e-06, 'epoch': 0.74} + 74%|███████▍ | 4300/5772 [3:42:03<2:33:55, 6.27s/it] {'loss': 0.4595, 'learning_rate': 3.2224049101089616e-06, 'epoch': 0.74} + 74%|███████▍ | 4300/5772 [3:42:01<2:33:55, 6.27s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 75%|███████▍ | 4301/5772 [3:42:22<4:22:35, 10.71s/it] 75%|███████▍ | 4301/5772 [3:42:24<4:22:35, 10.71s/it] {'loss': 0.4629, 'learning_rate': 3.218279572253994e-06, 'epoch': 0.75} + 75%|███████▍ | 4301/5772 [3:42:24<4:22:35, 10.71s/it] {'loss': 0.4629, 'learning_rate': 3.218279572253994e-06, 'epoch': 0.75} + 75%|███████▍ | 4301/5772 [3:42:22<4:22:35, 10.71s/it] 75%|███████▍ | 4302/5772 [3:42:28<3:48:27, 9.32s/it] 75%|███████▍ | 4302/5772 [3:42:30<3:48:27, 9.32s/it] {'loss': 0.4642, 'learning_rate': 3.214156370264169e-06, 'epoch': 0.75} + 75%|███████▍ | 4302/5772 [3:42:30<3:48:27, 9.32s/it] {'loss': 0.4642, 'learning_rate': 3.214156370264169e-06, 'epoch': 0.75} + 75%|███████▍ | 4302/5772 [3:42:28<3:48:27, 9.32s/it] 75%|███████▍ | 4303/5772 [3:42:34<3:27:00, 8.45s/it] 75%|███████▍ | 4303/5772 [3:42:36<3:27:00, 8.45s/it] {'loss': 0.456, 'learning_rate': 3.2100353054380683e-06, 'epoch': 0.75} + 75%|███████▍ | 4303/5772 [3:42:36<3:27:00, 8.45s/it] {'loss': 0.456, 'learning_rate': 3.2100353054380683e-06, 'epoch': 0.75} + 75%|███████▍ | 4303/5772 [3:42:34<3:27:00, 8.45s/it] 75%|███████▍ | 4304/5772 [3:42:40<3:08:16, 7.70s/it] 75%|███████▍ | 4304/5772 [3:42:42<3:08:16, 7.70s/it] {'loss': 0.4647, 'learning_rate': 3.2059163790735927e-06, 'epoch': 0.75} + 75%|███████▍ | 4304/5772 [3:42:42<3:08:16, 7.70s/it] {'loss': 0.4647, 'learning_rate': 3.2059163790735927e-06, 'epoch': 0.75} + 75%|███████▍ | 4304/5772 [3:42:40<3:08:16, 7.70s/it] 75%|███████▍ | 4305/5772 [3:42:47<2:57:21, 7.25s/it] 75%|███████▍ | 4305/5772 [3:42:48<2:57:21, 7.25s/it] {'loss': 0.46, 'learning_rate': 3.201799592467978e-06, 'epoch': 0.75}{'loss': 0.46, 'learning_rate': 3.201799592467978e-06, 'epoch': 0.75} + 75%|███████▍ | 4305/5772 [3:42:48<2:57:21, 7.25s/it] + 75%|███████▍ | 4305/5772 [3:42:47<2:57:21, 7.25s/it] 75%|███████▍ | 4306/5772 [3:42:53<2:47:57, 6.87s/it] 75%|███████▍ | 4306/5772 [3:42:54<2:47:57, 6.87s/it] {'loss': 0.4564, 'learning_rate': 3.197684946917784e-06, 'epoch': 0.75} + 75%|███████▍ | 4306/5772 [3:42:54<2:47:57, 6.87s/it] {'loss': 0.4564, 'learning_rate': 3.197684946917784e-06, 'epoch': 0.75} + 75%|███████▍ | 4306/5772 [3:42:53<2:47:57, 6.87s/it] 75%|███████▍ | 4307/5772 [3:42:59<2:41:34, 6.62s/it] 75%|███████▍ | 4307/5772 [3:43:01<2:41:34, 6.62s/it] {'loss': 0.4499, 'learning_rate': 3.1935724437188954e-06, 'epoch': 0.75} + 75%|███████▍ | 4307/5772 [3:43:01<2:41:34, 6.62s/it] {'loss': 0.4499, 'learning_rate': 3.1935724437188954e-06, 'epoch': 0.75} + 75%|███████▍ | 4307/5772 [3:42:59<2:41:34, 6.62s/it] 75%|███████▍ | 4308/5772 [3:43:05<2:39:52, 6.55s/it] 75%|███████▍ | 4308/5772 [3:43:07<2:39:52, 6.55s/it] {'loss': 0.4549, 'learning_rate': 3.1894620841665248e-06, 'epoch': 0.75} + 75%|███████▍ | 4308/5772 [3:43:07<2:39:52, 6.55s/it] {'loss': 0.4549, 'learning_rate': 3.1894620841665248e-06, 'epoch': 0.75} + 75%|███████▍ | 4308/5772 [3:43:05<2:39:52, 6.55s/it] 75%|███████▍ | 4309/5772 [3:43:11<2:34:06, 6.32s/it] 75%|███████▍ | 4309/5772 [3:43:13<2:34:06, 6.32s/it] {'loss': 0.4534, 'learning_rate': 3.1853538695551965e-06, 'epoch': 0.75} + 75%|███████▍ | 4309/5772 [3:43:13<2:34:06, 6.32s/it] {'loss': 0.4534, 'learning_rate': 3.1853538695551965e-06, 'epoch': 0.75} + 75%|███████▍ | 4309/5772 [3:43:11<2:34:06, 6.32s/it] 75%|███████▍ | 4310/5772 [3:43:17<2:32:07, 6.24s/it] 75%|███████▍ | 4310/5772 [3:43:19<2:32:07, 6.24s/it] {'loss': 0.4736, 'learning_rate': 3.181247801178785e-06, 'epoch': 0.75} + 75%|███████▍ | 4310/5772 [3:43:17<2:32:07, 6.24s/it]{'loss': 0.4736, 'learning_rate': 3.181247801178785e-06, 'epoch': 0.75} + 75%|███████▍ | 4310/5772 [3:43:19<2:32:07, 6.24s/it] 75%|███████▍ | 4311/5772 [3:43:23<2:30:07, 6.17s/it] 75%|███████▍ | 4311/5772 [3:43:25<2:30:07, 6.17s/it] {'loss': 0.4637, 'learning_rate': 3.177143880330463e-06, 'epoch': 0.75} + 75%|███████▍ | 4311/5772 [3:43:25<2:30:07, 6.17s/it] {'loss': 0.4637, 'learning_rate': 3.177143880330463e-06, 'epoch': 0.75} + 75%|███████▍ | 4311/5772 [3:43:23<2:30:07, 6.17s/it] 75%|███████▍ | 4312/5772 [3:43:29<2:30:59, 6.20s/it] 75%|███████▍ | 4312/5772 [3:43:31<2:30:59, 6.21s/it] {'loss': 0.466, 'learning_rate': 3.1730421083027395e-06, 'epoch': 0.75} + 75%|███████▍ | 4312/5772 [3:43:31<2:30:59, 6.21s/it] {'loss': 0.466, 'learning_rate': 3.1730421083027395e-06, 'epoch': 0.75} + 75%|███████▍ | 4312/5772 [3:43:29<2:30:59, 6.20s/it] 75%|███████▍ | 4313/5772 [3:43:35<2:28:15, 6.10s/it] 75%|███████▍ | 4313/5772 [3:43:37<2:28:15, 6.10s/it] {'loss': 0.4582, 'learning_rate': 3.1689424863874473e-06, 'epoch': 0.75} + 75%|███████▍ | 4313/5772 [3:43:37<2:28:15, 6.10s/it] {'loss': 0.4582, 'learning_rate': 3.1689424863874473e-06, 'epoch': 0.75} + 75%|███████▍ | 4313/5772 [3:43:35<2:28:15, 6.10s/it] 75%|███████▍ | 4314/5772 [3:43:41<2:29:46, 6.16s/it] 75%|███████▍ | 4314/5772 [3:43:43<2:29:46, 6.16s/it] {'loss': 0.4527, 'learning_rate': 3.1648450158757373e-06, 'epoch': 0.75} + 75%|███████▍ | 4314/5772 [3:43:43<2:29:46, 6.16s/it] {'loss': 0.4527, 'learning_rate': 3.1648450158757373e-06, 'epoch': 0.75} + 75%|███████▍ | 4314/5772 [3:43:41<2:29:46, 6.16s/it] 75%|███████▍ | 4315/5772 [3:43:47<2:27:00, 6.05s/it] 75%|███████▍ | 4315/5772 [3:43:49<2:27:00, 6.05s/it] {'loss': 0.447, 'learning_rate': 3.1607496980580897e-06, 'epoch': 0.75} + 75%|███████▍ | 4315/5772 [3:43:49<2:27:00, 6.05s/it] {'loss': 0.447, 'learning_rate': 3.1607496980580897e-06, 'epoch': 0.75} + 75%|███████▍ | 4315/5772 [3:43:47<2:27:00, 6.05s/it] 75%|███████▍ | 4316/5772 [3:43:53<2:26:30, 6.04s/it] 75%|███████▍ | 4316/5772 [3:43:55<2:26:30, 6.04s/it] {'loss': 0.4677, 'learning_rate': 3.1566565342242916e-06, 'epoch': 0.75} + 75%|███████▍ | 4316/5772 [3:43:55<2:26:30, 6.04s/it] {'loss': 0.4677, 'learning_rate': 3.1566565342242916e-06, 'epoch': 0.75} + 75%|███████▍ | 4316/5772 [3:43:53<2:26:30, 6.04s/it] 75%|███████▍ | 4317/5772 [3:43:59<2:27:13, 6.07s/it] 75%|███████▍ | 4317/5772 [3:44:01<2:27:13, 6.07s/it] {'loss': 0.4666, 'learning_rate': 3.1525655256634757e-06, 'epoch': 0.75} + 75%|███████▍ | 4317/5772 [3:44:01<2:27:13, 6.07s/it] {'loss': 0.4666, 'learning_rate': 3.1525655256634757e-06, 'epoch': 0.75} + 75%|███████▍ | 4317/5772 [3:43:59<2:27:13, 6.07s/it] 75%|███████▍ | 4318/5772 [3:44:06<2:29:22, 6.16s/it] 75%|███████▍ | 4318/5772 [3:44:08<2:29:22, 6.16s/it] {'loss': 0.4664, 'learning_rate': 3.1484766736640717e-06, 'epoch': 0.75} + 75%|███████▍ | 4318/5772 [3:44:08<2:29:22, 6.16s/it] {'loss': 0.4664, 'learning_rate': 3.1484766736640717e-06, 'epoch': 0.75} + 75%|███████▍ | 4318/5772 [3:44:06<2:29:22, 6.16s/it] 75%|███████▍ | 4319/5772 [3:44:12<2:30:30, 6.22s/it] 75%|███████▍ | 4319/5772 [3:44:14<2:30:30, 6.22s/it] {'loss': 0.4636, 'learning_rate': 3.1443899795138454e-06, 'epoch': 0.75} + 75%|███████▍ | 4319/5772 [3:44:14<2:30:30, 6.22s/it] {'loss': 0.4636, 'learning_rate': 3.1443899795138454e-06, 'epoch': 0.75} + 75%|███████▍ | 4319/5772 [3:44:12<2:30:30, 6.22s/it] 75%|███████▍ | 4320/5772 [3:44:18<2:28:35, 6.14s/it] 75%|███████▍ | 4320/5772 [3:44:20<2:28:35, 6.14s/it] {'loss': 0.4435, 'learning_rate': 3.140305444499877e-06, 'epoch': 0.75} + 75%|███████▍ | 4320/5772 [3:44:20<2:28:35, 6.14s/it] {'loss': 0.4435, 'learning_rate': 3.140305444499877e-06, 'epoch': 0.75} + 75%|███████▍ | 4320/5772 [3:44:18<2:28:35, 6.14s/it] 75%|███████▍ | 4321/5772 [3:44:24<2:28:50, 6.15s/it] 75%|███████▍ | 4321/5772 [3:44:26<2:28:50, 6.15s/it] {'loss': 0.4718, 'learning_rate': 3.1362230699085693e-06, 'epoch': 0.75} + 75%|███████▍ | 4321/5772 [3:44:26<2:28:50, 6.15s/it] {'loss': 0.4718, 'learning_rate': 3.1362230699085693e-06, 'epoch': 0.75} + 75%|███████▍ | 4321/5772 [3:44:24<2:28:50, 6.15s/it] 75%|███████▍ | 4322/5772 [3:44:30<2:27:28, 6.10s/it] 75%|███████▍ | 4322/5772 [3:44:32<2:27:28, 6.10s/it] {'loss': 0.451, 'learning_rate': 3.1321428570256464e-06, 'epoch': 0.75} + 75%|███████▍ | 4322/5772 [3:44:32<2:27:28, 6.10s/it] {'loss': 0.451, 'learning_rate': 3.1321428570256464e-06, 'epoch': 0.75} + 75%|███████▍ | 4322/5772 [3:44:30<2:27:28, 6.10s/it] 75%|███████▍ | 4323/5772 [3:44:36<2:27:11, 6.09s/it] 75%|███████▍ | 4323/5772 [3:44:38<2:27:11, 6.09s/it] {'loss': 0.468, 'learning_rate': 3.128064807136142e-06, 'epoch': 0.75} + 75%|███████▍ | 4323/5772 [3:44:38<2:27:11, 6.09s/it] {'loss': 0.468, 'learning_rate': 3.128064807136142e-06, 'epoch': 0.75} + 75%|███████▍ | 4323/5772 [3:44:36<2:27:11, 6.09s/it] 75%|███████▍ | 4324/5772 [3:44:42<2:28:40, 6.16s/it] 75%|███████▍ | 4324/5772 [3:44:44<2:28:40, 6.16s/it] {'loss': 0.4656, 'learning_rate': 3.123988921524418e-06, 'epoch': 0.75} + 75%|███████▍ | 4324/5772 [3:44:44<2:28:40, 6.16s/it] {'loss': 0.4656, 'learning_rate': 3.123988921524418e-06, 'epoch': 0.75} + 75%|███████▍ | 4324/5772 [3:44:42<2:28:40, 6.16s/it] 75%|███████▍ | 4325/5772 [3:44:48<2:26:45, 6.09s/it] 75%|███████▍ | 4325/5772 [3:44:50<2:26:45, 6.09s/it] {'loss': 0.4552, 'learning_rate': 3.119915201474153e-06, 'epoch': 0.75} + 75%|███████▍ | 4325/5772 [3:44:50<2:26:45, 6.09s/it] {'loss': 0.4552, 'learning_rate': 3.119915201474153e-06, 'epoch': 0.75} + 75%|███████▍ | 4325/5772 [3:44:48<2:26:45, 6.09s/it] 75%|███████▍ | 4326/5772 [3:44:55<2:27:43, 6.13s/it] 75%|███████▍ | 4326/5772 [3:44:57<2:27:43, 6.13s/it] {'loss': 0.4616, 'learning_rate': 3.115843648268344e-06, 'epoch': 0.75} + 75%|███████▍ | 4326/5772 [3:44:57<2:27:43, 6.13s/it] {'loss': 0.4616, 'learning_rate': 3.115843648268344e-06, 'epoch': 0.75} + 75%|███████▍ | 4326/5772 [3:44:55<2:27:43, 6.13s/it] 75%|███████▍ | 4327/5772 [3:45:01<2:26:24, 6.08s/it] 75%|███████▍ | 4327/5772 [3:45:02<2:26:24, 6.08s/it] {'loss': 0.469, 'learning_rate': 3.1117742631892965e-06, 'epoch': 0.75} + 75%|███████▍ | 4327/5772 [3:45:02<2:26:24, 6.08s/it] {'loss': 0.469, 'learning_rate': 3.1117742631892965e-06, 'epoch': 0.75} + 75%|███████▍ | 4327/5772 [3:45:01<2:26:24, 6.08s/it] 75%|███████▍ | 4328/5772 [3:45:07<2:27:16, 6.12s/it] 75%|███████▍ | 4328/5772 [3:45:09<2:27:16, 6.12s/it] {'loss': 0.4706, 'learning_rate': 3.107707047518649e-06, 'epoch': 0.75} + 75%|███████▍ | 4328/5772 [3:45:09<2:27:16, 6.12s/it] {'loss': 0.4706, 'learning_rate': 3.107707047518649e-06, 'epoch': 0.75} + 75%|███████▍ | 4328/5772 [3:45:07<2:27:16, 6.12s/it] 75%|███████▌ | 4329/5772 [3:45:13<2:24:53, 6.02s/it] 75%|███████▌ | 4329/5772 [3:45:14<2:24:53, 6.02s/it] {'loss': 0.4639, 'learning_rate': 3.103642002537349e-06, 'epoch': 0.75} + 75%|███████▌ | 4329/5772 [3:45:15<2:24:53, 6.02s/it] {'loss': 0.4639, 'learning_rate': 3.103642002537349e-06, 'epoch': 0.75} + 75%|███████▌ | 4329/5772 [3:45:13<2:24:53, 6.02s/it] 75%|███████▌ | 4330/5772 [3:45:19<2:24:37, 6.02s/it] 75%|███████▌ | 4330/5772 [3:45:21<2:24:37, 6.02s/it] {'loss': 0.4644, 'learning_rate': 3.099579129525653e-06, 'epoch': 0.75} + 75%|███████▌ | 4330/5772 [3:45:21<2:24:37, 6.02s/it] {'loss': 0.4644, 'learning_rate': 3.099579129525653e-06, 'epoch': 0.75} + 75%|███████▌ | 4330/5772 [3:45:19<2:24:37, 6.02s/it] 75%|███████▌ | 4331/5772 [3:45:24<2:24:03, 6.00s/it] 75%|███████▌ | 4331/5772 [3:45:26<2:24:04, 6.00s/it] {'loss': 0.4657, 'learning_rate': 3.0955184297631437e-06, 'epoch': 0.75} + 75%|███████▌ | 4331/5772 [3:45:26<2:24:04, 6.00s/it] {'loss': 0.4657, 'learning_rate': 3.0955184297631437e-06, 'epoch': 0.75} + 75%|███████▌ | 4331/5772 [3:45:24<2:24:03, 6.00s/it] 75%|███████▌ | 4332/5772 [3:45:31<2:24:31, 6.02s/it] 75%|███████▌ | 4332/5772 [3:45:33<2:24:31, 6.02s/it] {'loss': 0.4497, 'learning_rate': 3.0914599045287165e-06, 'epoch': 0.75} + 75%|███████▌ | 4332/5772 [3:45:33<2:24:31, 6.02s/it] {'loss': 0.4497, 'learning_rate': 3.0914599045287165e-06, 'epoch': 0.75} + 75%|███████▌ | 4332/5772 [3:45:31<2:24:31, 6.02s/it] 75%|███████▌ | 4333/5772 [3:45:37<2:24:18, 6.02s/it] 75%|███████▌ | 4333/5772 [3:45:39<2:24:18, 6.02s/it] {'loss': 0.4677, 'learning_rate': 3.087403555100583e-06, 'epoch': 0.75} + 75%|███████▌ | 4333/5772 [3:45:39<2:24:18, 6.02s/it] {'loss': 0.4677, 'learning_rate': 3.087403555100583e-06, 'epoch': 0.75} + 75%|███████▌ | 4333/5772 [3:45:37<2:24:18, 6.02s/it] 75%|███████▌ | 4334/5772 [3:45:43<2:30:10, 6.27s/it] 75%|███████▌ | 4334/5772 [3:45:45<2:30:10, 6.27s/it] {'loss': 0.4448, 'learning_rate': 3.0833493827562598e-06, 'epoch': 0.75} + 75%|███████▌ | 4334/5772 [3:45:45<2:30:10, 6.27s/it] {'loss': 0.4448, 'learning_rate': 3.0833493827562598e-06, 'epoch': 0.75} + 75%|███████▌ | 4334/5772 [3:45:43<2:30:10, 6.27s/it] 75%|███████▌ | 4335/5772 [3:45:49<2:28:31, 6.20s/it] 75%|███████▌ | 4335/5772 [3:45:51<2:28:31, 6.20s/it] {'loss': 0.4665, 'learning_rate': 3.079297388772595e-06, 'epoch': 0.75} + 75%|███████▌ | 4335/5772 [3:45:51<2:28:31, 6.20s/it] {'loss': 0.4665, 'learning_rate': 3.079297388772595e-06, 'epoch': 0.75} + 75%|███████▌ | 4335/5772 [3:45:49<2:28:31, 6.20s/it] 75%|███████▌ | 4336/5772 [3:45:56<2:27:20, 6.16s/it] 75%|███████▌ | 4336/5772 [3:45:57<2:27:20, 6.16s/it] {'loss': 0.4676, 'learning_rate': 3.0752475744257414e-06, 'epoch': 0.75} + 75%|███████▌ | 4336/5772 [3:45:57<2:27:20, 6.16s/it] {'loss': 0.4676, 'learning_rate': 3.0752475744257414e-06, 'epoch': 0.75} + 75%|███████▌ | 4336/5772 [3:45:56<2:27:20, 6.16s/it] 75%|███████▌ | 4337/5772 [3:46:02<2:28:50, 6.22s/it] 75%|███████▌ | 4337/5772 [3:46:04<2:28:50, 6.22s/it] {'loss': 0.4557, 'learning_rate': 3.0711999409911587e-06, 'epoch': 0.75} + 75%|███████▌ | 4337/5772 [3:46:04<2:28:50, 6.22s/it] {'loss': 0.4557, 'learning_rate': 3.0711999409911587e-06, 'epoch': 0.75} + 75%|███████▌ | 4337/5772 [3:46:02<2:28:50, 6.22s/it] 75%|███████▌ | 4338/5772 [3:46:08<2:27:39, 6.18s/it] 75%|███████▌ | 4338/5772 [3:46:10<2:27:39, 6.18s/it] {'loss': 0.4644, 'learning_rate': 3.067154489743631e-06, 'epoch': 0.75} + 75%|███████▌ | 4338/5772 [3:46:10<2:27:39, 6.18s/it] {'loss': 0.4644, 'learning_rate': 3.067154489743631e-06, 'epoch': 0.75} + 75%|███████▌ | 4338/5772 [3:46:08<2:27:39, 6.18s/it] 75%|███████▌ | 4339/5772 [3:46:14<2:26:47, 6.15s/it] 75%|███████▌ | 4339/5772 [3:46:16<2:26:46, 6.15s/it] {'loss': 0.452, 'learning_rate': 3.06311122195725e-06, 'epoch': 0.75} + 75%|███████▌ | 4339/5772 [3:46:16<2:26:46, 6.15s/it] {'loss': 0.452, 'learning_rate': 3.06311122195725e-06, 'epoch': 0.75} + 75%|███████▌ | 4339/5772 [3:46:14<2:26:47, 6.15s/it] 75%|███████▌ | 4340/5772 [3:46:20<2:24:36, 6.06s/it] 75%|███████▌ | 4340/5772 [3:46:22<2:24:36, 6.06s/it] {'loss': 0.4546, 'learning_rate': 3.0590701389054235e-06, 'epoch': 0.75} + 75%|███████▌ | 4340/5772 [3:46:22<2:24:36, 6.06s/it] {'loss': 0.4546, 'learning_rate': 3.0590701389054235e-06, 'epoch': 0.75} + 75%|███████▌ | 4340/5772 [3:46:20<2:24:36, 6.06s/it] 75%|███████▌ | 4341/5772 [3:46:26<2:27:22, 6.18s/it] 75%|███████▌ | 4341/5772 [3:46:28<2:27:22, 6.18s/it] {'loss': 0.455, 'learning_rate': 3.0550312418608617e-06, 'epoch': 0.75} + 75%|███████▌ | 4341/5772 [3:46:28<2:27:22, 6.18s/it] {'loss': 0.455, 'learning_rate': 3.0550312418608617e-06, 'epoch': 0.75} + 75%|███████▌ | 4341/5772 [3:46:26<2:27:22, 6.18s/it] 75%|███████▌ | 4342/5772 [3:46:33<2:31:34, 6.36s/it] 75%|███████▌ | 4342/5772 [3:46:35<2:31:34, 6.36s/it] {'loss': 0.4618, 'learning_rate': 3.0509945320955925e-06, 'epoch': 0.75} + 75%|███████▌ | 4342/5772 [3:46:35<2:31:34, 6.36s/it] {'loss': 0.4618, 'learning_rate': 3.0509945320955925e-06, 'epoch': 0.75} + 75%|███████▌ | 4342/5772 [3:46:33<2:31:34, 6.36s/it] 75%|███████▌ | 4343/5772 [3:46:39<2:31:08, 6.35s/it] 75%|███████▌ | 4343/5772 [3:46:41<2:31:08, 6.35s/it] {'loss': 0.4677, 'learning_rate': 3.046960010880966e-06, 'epoch': 0.75} + 75%|███████▌ | 4343/5772 [3:46:41<2:31:08, 6.35s/it] {'loss': 0.4677, 'learning_rate': 3.046960010880966e-06, 'epoch': 0.75} + 75%|███████▌ | 4343/5772 [3:46:39<2:31:08, 6.35s/it] 75%|███████▌ | 4344/5772 [3:46:46<2:30:17, 6.31s/it] 75%|███████▌ | 4344/5772 [3:46:48<2:30:17, 6.31s/it] {'loss': 0.4728, 'learning_rate': 3.042927679487622e-06, 'epoch': 0.75} + 75%|███████▌ | 4344/5772 [3:46:48<2:30:17, 6.31s/it] {'loss': 0.4728, 'learning_rate': 3.042927679487622e-06, 'epoch': 0.75} + 75%|███████▌ | 4344/5772 [3:46:46<2:30:17, 6.31s/it] 75%|███████▌ | 4345/5772 [3:46:52<2:28:51, 6.26s/it] 75%|███████▌ | 4345/5772 [3:46:54<2:28:50, 6.26s/it] {'loss': 0.4678, 'learning_rate': 3.0388975391855226e-06, 'epoch': 0.75} + 75%|███████▌ | 4345/5772 [3:46:54<2:28:50, 6.26s/it] {'loss': 0.4678, 'learning_rate': 3.0388975391855226e-06, 'epoch': 0.75} + 75%|███████▌ | 4345/5772 [3:46:52<2:28:51, 6.26s/it] 75%|███████▌ | 4346/5772 [3:46:58<2:26:22, 6.16s/it] 75%|███████▌ | 4346/5772 [3:47:00<2:26:23, 6.16s/it] {'loss': 0.456, 'learning_rate': 3.03486959124394e-06, 'epoch': 0.75} + 75%|███████▌ | 4346/5772 [3:47:00<2:26:23, 6.16s/it] {'loss': 0.456, 'learning_rate': 3.03486959124394e-06, 'epoch': 0.75} + 75%|███████▌ | 4346/5772 [3:46:58<2:26:22, 6.16s/it] 75%|███████▌ | 4347/5772 [3:47:04<2:23:48, 6.06s/it] 75%|███████▌ | 4347/5772 [3:47:06<2:23:48, 6.06s/it] {'loss': 0.4641, 'learning_rate': 3.0308438369314563e-06, 'epoch': 0.75} + 75%|███████▌ | 4347/5772 [3:47:06<2:23:48, 6.06s/it] {'loss': 0.4641, 'learning_rate': 3.0308438369314563e-06, 'epoch': 0.75} + 75%|███████▌ | 4347/5772 [3:47:04<2:23:48, 6.06s/it] 75%|███████▌ | 4348/5772 [3:47:10<2:23:53, 6.06s/it] 75%|███████▌ | 4348/5772 [3:47:12<2:23:53, 6.06s/it] {'loss': 0.4568, 'learning_rate': 3.026820277515955e-06, 'epoch': 0.75} + 75%|███████▌ | 4348/5772 [3:47:12<2:23:53, 6.06s/it] {'loss': 0.4568, 'learning_rate': 3.026820277515955e-06, 'epoch': 0.75} + 75%|███████▌ | 4348/5772 [3:47:10<2:23:53, 6.06s/it] 75%|███████▌ | 4349/5772 [3:47:16<2:23:44, 6.06s/it] 75%|███████▌ | 4349/5772 [3:47:18<2:23:44, 6.06s/it] {'loss': 0.4769, 'learning_rate': 3.022798914264633e-06, 'epoch': 0.75} + 75%|███████▌ | 4349/5772 [3:47:18<2:23:44, 6.06s/it] {'loss': 0.4769, 'learning_rate': 3.022798914264633e-06, 'epoch': 0.75} + 75%|███████▌ | 4349/5772 [3:47:16<2:23:44, 6.06s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +013 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 11 AutoResumeHook: Checking whether to suspend... +4AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +9 75%|███████▌ | 4350/5772 [3:47:22<2:25:17, 6.13s/it]AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 8 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... + 75%|███████▌ | 4350/5772 [3:47:24<2:25:17, 6.13s/it]12 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4551, 'learning_rate': 3.018779748444005e-06, 'epoch': 0.75} + 75%|███████▌ | 4350/5772 [3:47:24<2:25:17, 6.13s/it] {'loss': 0.4551, 'learning_rate': 3.018779748444005e-06, 'epoch': 0.75} + 75%|███████▌ | 4350/5772 [3:47:22<2:25:17, 6.13s/it] 75%|███████▌ | 4351/5772 [3:47:29<2:28:06, 6.25s/it] 75%|███████▌ | 4351/5772 [3:47:31<2:28:07, 6.25s/it] {'loss': 0.4693, 'learning_rate': 3.0147627813198777e-06, 'epoch': 0.75} + 75%|███████▌ | 4351/5772 [3:47:31<2:28:07, 6.25s/it] {'loss': 0.4693, 'learning_rate': 3.0147627813198777e-06, 'epoch': 0.75} + 75%|███████▌ | 4351/5772 [3:47:29<2:28:06, 6.25s/it] 75%|███████▌ | 4352/5772 [3:47:35<2:27:21, 6.23s/it] 75%|███████▌ | 4352/5772 [3:47:37<2:27:21, 6.23s/it] {'loss': 0.46, 'learning_rate': 3.0107480141573763e-06, 'epoch': 0.75} + 75%|███████▌ | 4352/5772 [3:47:37<2:27:21, 6.23s/it] {'loss': 0.46, 'learning_rate': 3.0107480141573763e-06, 'epoch': 0.75} + 75%|███████▌ | 4352/5772 [3:47:35<2:27:21, 6.23s/it] 75%|███████▌ | 4353/5772 [3:47:41<2:26:14, 6.18s/it] 75%|███████▌ | 4353/5772 [3:47:43<2:26:14, 6.18s/it] {'loss': 0.4506, 'learning_rate': 3.006735448220922e-06, 'epoch': 0.75} + 75%|███████▌ | 4353/5772 [3:47:43<2:26:14, 6.18s/it] {'loss': 0.4506, 'learning_rate': 3.006735448220922e-06, 'epoch': 0.75} + 75%|███████▌ | 4353/5772 [3:47:41<2:26:14, 6.18s/it] 75%|███████▌ | 4354/5772 [3:47:47<2:28:34, 6.29s/it] 75%|███████▌ | 4354/5772 [3:47:49<2:28:34, 6.29s/it] {'loss': 0.4529, 'learning_rate': 3.002725084774262e-06, 'epoch': 0.75} + 75%|███████▌ | 4354/5772 [3:47:49<2:28:34, 6.29s/it] {'loss': 0.4529, 'learning_rate': 3.002725084774262e-06, 'epoch': 0.75} + 75%|███████▌ | 4354/5772 [3:47:47<2:28:34, 6.29s/it] 75%|███████▌ | 4355/5772 [3:47:54<2:30:52, 6.39s/it] 75%|███████▌ | 4355/5772 [3:47:56<2:30:52, 6.39s/it] {'loss': 0.4752, 'learning_rate': 2.998716925080427e-06, 'epoch': 0.75} + 75%|███████▌ | 4355/5772 [3:47:56<2:30:52, 6.39s/it] {'loss': 0.4752, 'learning_rate': 2.998716925080427e-06, 'epoch': 0.75} + 75%|███████▌ | 4355/5772 [3:47:54<2:30:52, 6.39s/it] 75%|███████▌ | 4356/5772 [3:48:00<2:31:08, 6.40s/it] 75%|███████▌ | 4356/5772 [3:48:02<2:31:08, 6.40s/it] {'loss': 0.4475, 'learning_rate': 2.9947109704017707e-06, 'epoch': 0.75} + 75%|███████▌ | 4356/5772 [3:48:02<2:31:08, 6.40s/it] {'loss': 0.4475, 'learning_rate': 2.9947109704017707e-06, 'epoch': 0.75} + 75%|███████▌ | 4356/5772 [3:48:00<2:31:08, 6.40s/it] 75%|███████▌ | 4357/5772 [3:48:06<2:28:04, 6.28s/it] 75%|███████▌ | 4357/5772 [3:48:08<2:28:04, 6.28s/it] {'loss': 0.4598, 'learning_rate': 2.9907072219999443e-06, 'epoch': 0.75} + 75%|███████▌ | 4357/5772 [3:48:08<2:28:04, 6.28s/it] {'loss': 0.4598, 'learning_rate': 2.9907072219999443e-06, 'epoch': 0.75} + 75%|███████▌ | 4357/5772 [3:48:06<2:28:04, 6.28s/it] 76%|███████▌ | 4358/5772 [3:48:12<2:25:30, 6.17s/it] 76%|███████▌ | 4358/5772 [3:48:14<2:25:30, 6.17s/it] {'loss': 0.4503, 'learning_rate': 2.9867056811359063e-06, 'epoch': 0.75} + 76%|███████▌ | 4358/5772 [3:48:14<2:25:30, 6.17s/it] {'loss': 0.4503, 'learning_rate': 2.9867056811359063e-06, 'epoch': 0.75} + 76%|███████▌ | 4358/5772 [3:48:12<2:25:30, 6.17s/it] 76%|███████▌ | 4359/5772 [3:48:18<2:23:59, 6.11s/it] 76%|███████▌ | 4359/5772 [3:48:20<2:23:59, 6.11s/it] {'loss': 0.4649, 'learning_rate': 2.9827063490699225e-06, 'epoch': 0.76} + 76%|███████▌ | 4359/5772 [3:48:20<2:23:59, 6.11s/it] {'loss': 0.4649, 'learning_rate': 2.9827063490699225e-06, 'epoch': 0.76} + 76%|███████▌ | 4359/5772 [3:48:18<2:23:59, 6.11s/it] 76%|███████▌ | 4360/5772 [3:48:25<2:25:29, 6.18s/it] 76%|███████▌ | 4360/5772 [3:48:27<2:25:30, 6.18s/it] {'loss': 0.4645, 'learning_rate': 2.9787092270615527e-06, 'epoch': 0.76} + 76%|███████▌ | 4360/5772 [3:48:27<2:25:30, 6.18s/it] {'loss': 0.4645, 'learning_rate': 2.9787092270615527e-06, 'epoch': 0.76} + 76%|███████▌ | 4360/5772 [3:48:25<2:25:29, 6.18s/it] 76%|███████▌ | 4361/5772 [3:48:31<2:26:00, 6.21s/it] 76%|███████▌ | 4361/5772 [3:48:33<2:26:00, 6.21s/it] {'loss': 0.4648, 'learning_rate': 2.974714316369679e-06, 'epoch': 0.76} + 76%|███████▌ | 4361/5772 [3:48:33<2:26:00, 6.21s/it] {'loss': 0.4648, 'learning_rate': 2.974714316369679e-06, 'epoch': 0.76} + 76%|███████▌ | 4361/5772 [3:48:31<2:26:00, 6.21s/it] 76%|███████▌ | 4362/5772 [3:48:37<2:25:07, 6.18s/it] 76%|███████▌ | 4362/5772 [3:48:39<2:25:07, 6.18s/it] {'loss': 0.461, 'learning_rate': 2.9707216182524667e-06, 'epoch': 0.76} + 76%|███████▌ | 4362/5772 [3:48:39<2:25:07, 6.18s/it] {'loss': 0.461, 'learning_rate': 2.9707216182524667e-06, 'epoch': 0.76} + 76%|███████▌ | 4362/5772 [3:48:37<2:25:07, 6.18s/it] 76%|███████▌ | 4363/5772 [3:48:43<2:25:34, 6.20s/it] 76%|███████▌ | 4363/5772 [3:48:45<2:25:34, 6.20s/it] {'loss': 0.4601, 'learning_rate': 2.966731133967399e-06, 'epoch': 0.76} + 76%|███████▌ | 4363/5772 [3:48:45<2:25:34, 6.20s/it] {'loss': 0.4601, 'learning_rate': 2.966731133967399e-06, 'epoch': 0.76} + 76%|███████▌ | 4363/5772 [3:48:43<2:25:34, 6.20s/it] 76%|███████▌ | 4364/5772 [3:48:49<2:23:38, 6.12s/it] 76%|███████▌ | 4364/5772 [3:48:51<2:23:38, 6.12s/it] {'loss': 0.4539, 'learning_rate': 2.9627428647712553e-06, 'epoch': 0.76} + 76%|███████▌ | 4364/5772 [3:48:51<2:23:38, 6.12s/it] {'loss': 0.4539, 'learning_rate': 2.9627428647712553e-06, 'epoch': 0.76} + 76%|███████▌ | 4364/5772 [3:48:49<2:23:38, 6.12s/it] 76%|███████▌ | 4365/5772 [3:48:56<2:27:10, 6.28s/it] 76%|███████▌ | 4365/5772 [3:48:58<2:27:11, 6.28s/it] {'loss': 0.4541, 'learning_rate': 2.9587568119201193e-06, 'epoch': 0.76} + 76%|███████▌ | 4365/5772 [3:48:58<2:27:11, 6.28s/it] {'loss': 0.4541, 'learning_rate': 2.9587568119201193e-06, 'epoch': 0.76} + 76%|███████▌ | 4365/5772 [3:48:56<2:27:10, 6.28s/it] 76%|███████▌ | 4366/5772 [3:49:02<2:27:54, 6.31s/it] 76%|███████▌ | 4366/5772 [3:49:04<2:27:54, 6.31s/it] {'loss': 0.4597, 'learning_rate': 2.954772976669378e-06, 'epoch': 0.76} + 76%|███████▌ | 4366/5772 [3:49:04<2:27:54, 6.31s/it] {'loss': 0.4597, 'learning_rate': 2.954772976669378e-06, 'epoch': 0.76} + 76%|███████▌ | 4366/5772 [3:49:02<2:27:54, 6.31s/it] 76%|███████▌ | 4367/5772 [3:49:08<2:26:43, 6.27s/it] 76%|███████▌ | 4367/5772 [3:49:10<2:26:44, 6.27s/it] {'loss': 0.4622, 'learning_rate': 2.950791360273714e-06, 'epoch': 0.76} + 76%|███████▌ | 4367/5772 [3:49:10<2:26:44, 6.27s/it] {'loss': 0.4622, 'learning_rate': 2.950791360273714e-06, 'epoch': 0.76} + 76%|███████▌ | 4367/5772 [3:49:08<2:26:43, 6.27s/it] 76%|███████▌ | 4368/5772 [3:49:14<2:24:35, 6.18s/it] 76%|███████▌ | 4368/5772 [3:49:16<2:24:35, 6.18s/it] {'loss': 0.4503, 'learning_rate': 2.9468119639871163e-06, 'epoch': 0.76} + 76%|███████▌ | 4368/5772 [3:49:16<2:24:35, 6.18s/it] {'loss': 0.4503, 'learning_rate': 2.9468119639871163e-06, 'epoch': 0.76} + 76%|███████▌ | 4368/5772 [3:49:14<2:24:35, 6.18s/it] 76%|███████▌ | 4369/5772 [3:49:20<2:23:42, 6.15s/it] 76%|███████▌ | 4369/5772 [3:49:22<2:23:42, 6.15s/it] {'loss': 0.4597, 'learning_rate': 2.942834789062876e-06, 'epoch': 0.76} + 76%|███████▌ | 4369/5772 [3:49:22<2:23:42, 6.15s/it] {'loss': 0.4597, 'learning_rate': 2.942834789062876e-06, 'epoch': 0.76} + 76%|███████▌ | 4369/5772 [3:49:20<2:23:42, 6.15s/it] 76%|███████▌ | 4370/5772 [3:49:27<2:24:16, 6.17s/it] 76%|███████▌ | 4370/5772 [3:49:29<2:24:16, 6.17s/it] {'loss': 0.4547, 'learning_rate': 2.9388598367535793e-06, 'epoch': 0.76} + 76%|███████▌ | 4370/5772 [3:49:29<2:24:16, 6.17s/it] {'loss': 0.4547, 'learning_rate': 2.9388598367535793e-06, 'epoch': 0.76} + 76%|███████▌ | 4370/5772 [3:49:27<2:24:16, 6.17s/it] 76%|███████▌ | 4371/5772 [3:49:32<2:21:09, 6.04s/it] 76%|███████▌ | 4371/5772 [3:49:34<2:21:09, 6.05s/it] {'loss': 0.4495, 'learning_rate': 2.9348871083111185e-06, 'epoch': 0.76} + 76%|███████▌ | 4371/5772 [3:49:34<2:21:09, 6.05s/it] {'loss': 0.4495, 'learning_rate': 2.9348871083111185e-06, 'epoch': 0.76} + 76%|███████▌ | 4371/5772 [3:49:32<2:21:09, 6.04s/it] 76%|███████▌ | 4372/5772 [3:49:39<2:24:19, 6.19s/it] 76%|███████▌ | 4372/5772 [3:49:41<2:24:19, 6.19s/it] {'loss': 0.4566, 'learning_rate': 2.93091660498668e-06, 'epoch': 0.76} + 76%|███████▌ | 4372/5772 [3:49:41<2:24:19, 6.19s/it] {'loss': 0.4566, 'learning_rate': 2.93091660498668e-06, 'epoch': 0.76} + 76%|███████▌ | 4372/5772 [3:49:39<2:24:19, 6.19s/it] 76%|███████▌ | 4373/5772 [3:49:45<2:24:02, 6.18s/it] 76%|███████▌ | 4373/5772 [3:49:47<2:24:02, 6.18s/it] {'loss': 0.462, 'learning_rate': 2.926948328030755e-06, 'epoch': 0.76} + 76%|███████▌ | 4373/5772 [3:49:47<2:24:02, 6.18s/it] {'loss': 0.462, 'learning_rate': 2.926948328030755e-06, 'epoch': 0.76} + 76%|███████▌ | 4373/5772 [3:49:45<2:24:02, 6.18s/it] 76%|███████▌ | 4374/5772 [3:49:51<2:24:19, 6.19s/it] 76%|███████▌ | 4374/5772 [3:49:53<2:24:19, 6.19s/it] {'loss': 0.4492, 'learning_rate': 2.9229822786931263e-06, 'epoch': 0.76} + 76%|███████▌ | 4374/5772 [3:49:53<2:24:19, 6.19s/it] {'loss': 0.4492, 'learning_rate': 2.9229822786931263e-06, 'epoch': 0.76} + 76%|███████▌ | 4374/5772 [3:49:51<2:24:19, 6.19s/it] 76%|███████▌ | 4375/5772 [3:49:58<2:28:15, 6.37s/it] 76%|███████▌ | 4375/5772 [3:50:00<2:28:15, 6.37s/it] {'loss': 0.4665, 'learning_rate': 2.9190184582228787e-06, 'epoch': 0.76} + 76%|███████▌ | 4375/5772 [3:50:00<2:28:15, 6.37s/it] {'loss': 0.4665, 'learning_rate': 2.9190184582228787e-06, 'epoch': 0.76} + 76%|███████▌ | 4375/5772 [3:49:58<2:28:15, 6.37s/it] 76%|███████▌ | 4376/5772 [3:50:04<2:26:59, 6.32s/it] 76%|███████▌ | 4376/5772 [3:50:06<2:26:58, 6.32s/it] {'loss': 0.4561, 'learning_rate': 2.9150568678683987e-06, 'epoch': 0.76} + 76%|███████▌ | 4376/5772 [3:50:06<2:26:58, 6.32s/it] {'loss': 0.4561, 'learning_rate': 2.9150568678683987e-06, 'epoch': 0.76} + 76%|███████▌ | 4376/5772 [3:50:04<2:26:59, 6.32s/it] 76%|███████▌ | 4377/5772 [3:50:10<2:25:55, 6.28s/it] 76%|███████▌ | 4377/5772 [3:50:12<2:25:55, 6.28s/it] {'loss': 0.4533, 'learning_rate': 2.911097508877365e-06, 'epoch': 0.76} + 76%|███████▌ | 4377/5772 [3:50:12<2:25:55, 6.28s/it] {'loss': 0.4533, 'learning_rate': 2.911097508877365e-06, 'epoch': 0.76} + 76%|███████▌ | 4377/5772 [3:50:10<2:25:55, 6.28s/it] 76%|███████▌ | 4378/5772 [3:50:16<2:22:55, 6.15s/it] 76%|███████▌ | 4378/5772 [3:50:18<2:22:55, 6.15s/it] {'loss': 0.4547, 'learning_rate': 2.907140382496757e-06, 'epoch': 0.76} + 76%|███████▌ | 4378/5772 [3:50:18<2:22:55, 6.15s/it] {'loss': 0.4547, 'learning_rate': 2.907140382496757e-06, 'epoch': 0.76} + 76%|███████▌ | 4378/5772 [3:50:16<2:22:55, 6.15s/it] 76%|███████▌ | 4379/5772 [3:50:22<2:22:57, 6.16s/it] 76%|███████▌ | 4379/5772 [3:50:24<2:22:57, 6.16s/it] {'loss': 0.4533, 'learning_rate': 2.9031854899728485e-06, 'epoch': 0.76} + 76%|███████▌ | 4379/5772 [3:50:24<2:22:57, 6.16s/it] {'loss': 0.4533, 'learning_rate': 2.9031854899728485e-06, 'epoch': 0.76} + 76%|███████▌ | 4379/5772 [3:50:22<2:22:57, 6.16s/it] 76%|███████▌ | 4380/5772 [3:50:28<2:21:27, 6.10s/it] 76%|███████▌ | 4380/5772 [3:50:30<2:21:26, 6.10s/it] {'loss': 0.4637, 'learning_rate': 2.899232832551214e-06, 'epoch': 0.76} + 76%|███████▌ | 4380/5772 [3:50:30<2:21:26, 6.10s/it] {'loss': 0.4637, 'learning_rate': 2.899232832551214e-06, 'epoch': 0.76} + 76%|███████▌ | 4380/5772 [3:50:28<2:21:27, 6.10s/it] 76%|███████▌ | 4381/5772 [3:50:34<2:21:07, 6.09s/it] 76%|███████▌ | 4381/5772 [3:50:36<2:21:07, 6.09s/it] {'loss': 0.465, 'learning_rate': 2.8952824114767164e-06, 'epoch': 0.76} + 76%|███████▌ | 4381/5772 [3:50:36<2:21:07, 6.09s/it] {'loss': 0.465, 'learning_rate': 2.8952824114767164e-06, 'epoch': 0.76} + 76%|███████▌ | 4381/5772 [3:50:34<2:21:07, 6.09s/it] 76%|███████▌ | 4382/5772 [3:50:41<2:20:51, 6.08s/it] 76%|███████▌ | 4382/5772 [3:50:43<2:20:51, 6.08s/it] {'loss': 0.4696, 'learning_rate': 2.891334227993521e-06, 'epoch': 0.76} + 76%|███████▌ | 4382/5772 [3:50:43<2:20:51, 6.08s/it] {'loss': 0.4696, 'learning_rate': 2.891334227993521e-06, 'epoch': 0.76} + 76%|███████▌ | 4382/5772 [3:50:41<2:20:51, 6.08s/it] 76%|███████▌ | 4383/5772 [3:50:47<2:23:25, 6.20s/it] 76%|███████▌ | 4383/5772 [3:50:49<2:23:25, 6.20s/it] {'loss': 0.4657, 'learning_rate': 2.8873882833450863e-06, 'epoch': 0.76} + 76%|███████▌ | 4383/5772 [3:50:49<2:23:25, 6.20s/it] {'loss': 0.4657, 'learning_rate': 2.8873882833450863e-06, 'epoch': 0.76} + 76%|███████▌ | 4383/5772 [3:50:47<2:23:25, 6.20s/it] 76%|███████▌ | 4384/5772 [3:50:53<2:21:07, 6.10s/it] 76%|███████▌ | 4384/5772 [3:50:55<2:21:07, 6.10s/it] {'loss': 0.4593, 'learning_rate': 2.8834445787741647e-06, 'epoch': 0.76} + 76%|███████▌ | 4384/5772 [3:50:55<2:21:07, 6.10s/it] {'loss': 0.4593, 'learning_rate': 2.8834445787741647e-06, 'epoch': 0.76} + 76%|███████▌ | 4384/5772 [3:50:53<2:21:07, 6.10s/it] 76%|███████▌ | 4385/5772 [3:51:01<2:21:59, 6.14s/it] 76%|███████▌ | 4385/5772 [3:50:59<2:21:59, 6.14s/it] {'loss': 0.4618, 'learning_rate': 2.8795031155228083e-06, 'epoch': 0.76} + 76%|███████▌ | 4385/5772 [3:51:01<2:21:59, 6.14s/it] {'loss': 0.4618, 'learning_rate': 2.8795031155228083e-06, 'epoch': 0.76} + 76%|███████▌ | 4385/5772 [3:50:59<2:21:59, 6.14s/it] 76%|███████▌ | 4386/5772 [3:51:05<2:20:47, 6.09s/it] 76%|███████▌ | 4386/5772 [3:51:07<2:20:47, 6.09s/it] {'loss': 0.4623, 'learning_rate': 2.8755638948323494e-06, 'epoch': 0.76} + 76%|███████▌ | 4386/5772 [3:51:07<2:20:47, 6.09s/it] {'loss': 0.4623, 'learning_rate': 2.8755638948323494e-06, 'epoch': 0.76} + 76%|███████▌ | 4386/5772 [3:51:05<2:20:47, 6.09s/it] 76%|███████▌ | 4387/5772 [3:51:11<2:20:14, 6.08s/it] 76%|███████▌ | 4387/5772 [3:51:13<2:20:14, 6.08s/it] {'loss': 0.4635, 'learning_rate': 2.8716269179434366e-06, 'epoch': 0.76} + 76%|███████▌ | 4387/5772 [3:51:13<2:20:14, 6.08s/it] {'loss': 0.4635, 'learning_rate': 2.8716269179434366e-06, 'epoch': 0.76} + 76%|███████▌ | 4387/5772 [3:51:11<2:20:14, 6.08s/it] 76%|███████▌ | 4388/5772 [3:51:17<2:21:14, 6.12s/it] 76%|███████▌ | 4388/5772 [3:51:19<2:21:13, 6.12s/it] {'loss': 0.4454, 'learning_rate': 2.8676921860959874e-06, 'epoch': 0.76} + 76%|███████▌ | 4388/5772 [3:51:19<2:21:13, 6.12s/it] {'loss': 0.4454, 'learning_rate': 2.8676921860959874e-06, 'epoch': 0.76} + 76%|███████▌ | 4388/5772 [3:51:17<2:21:14, 6.12s/it] 76%|███████▌ | 4389/5772 [3:51:23<2:20:27, 6.09s/it] 76%|███████▌ | 4389/5772 [3:51:25<2:20:28, 6.09s/it] {'loss': 0.4648, 'learning_rate': 2.8637597005292295e-06, 'epoch': 0.76} + 76%|███████▌ | 4389/5772 [3:51:25<2:20:28, 6.09s/it] {'loss': 0.4648, 'learning_rate': 2.8637597005292295e-06, 'epoch': 0.76} + 76%|███████▌ | 4389/5772 [3:51:23<2:20:27, 6.09s/it] 76%|███████▌ | 4390/5772 [3:51:29<2:19:54, 6.07s/it] 76%|███████▌ | 4390/5772 [3:51:31<2:19:54, 6.07s/it] {'loss': 0.4558, 'learning_rate': 2.859829462481676e-06, 'epoch': 0.76} + 76%|███████▌ | 4390/5772 [3:51:31<2:19:54, 6.07s/it] {'loss': 0.4558, 'learning_rate': 2.859829462481676e-06, 'epoch': 0.76} + 76%|███████▌ | 4390/5772 [3:51:29<2:19:54, 6.07s/it] 76%|███████▌ | 4391/5772 [3:51:35<2:19:08, 6.05s/it] 76%|███████▌ | 4391/5772 [3:51:37<2:19:08, 6.05s/it] {'loss': 0.4496, 'learning_rate': 2.855901473191134e-06, 'epoch': 0.76} + 76%|███████▌ | 4391/5772 [3:51:37<2:19:08, 6.05s/it] {'loss': 0.4496, 'learning_rate': 2.855901473191134e-06, 'epoch': 0.76} + 76%|███████▌ | 4391/5772 [3:51:35<2:19:08, 6.05s/it] 76%|███████▌ | 4392/5772 [3:51:42<2:19:54, 6.08s/it] 76%|███████▌ | 4392/5772 [3:51:44<2:19:54, 6.08s/it] {'loss': 0.4724, 'learning_rate': 2.851975733894705e-06, 'epoch': 0.76} + 76%|███████▌ | 4392/5772 [3:51:44<2:19:54, 6.08s/it] {'loss': 0.4724, 'learning_rate': 2.851975733894705e-06, 'epoch': 0.76} + 76%|███████▌ | 4392/5772 [3:51:42<2:19:54, 6.08s/it] 76%|███████▌ | 4393/5772 [3:51:49<2:18:15, 6.02s/it] 76%|███████▌ | 4393/5772 [3:51:47<2:18:16, 6.02s/it] {'loss': 0.4589, 'learning_rate': 2.8480522458287686e-06, 'epoch': 0.76} + 76%|███████▌ | 4393/5772 [3:51:49<2:18:15, 6.02s/it] {'loss': 0.4589, 'learning_rate': 2.8480522458287686e-06, 'epoch': 0.76} + 76%|███████▌ | 4393/5772 [3:51:47<2:18:16, 6.02s/it] 76%|███████▌ | 4394/5772 [3:51:54<2:23:06, 6.23s/it] 76%|███████▌ | 4394/5772 [3:51:56<2:23:07, 6.23s/it] {'loss': 0.4592, 'learning_rate': 2.8441310102290187e-06, 'epoch': 0.76} + 76%|███████▌ | 4394/5772 [3:51:56<2:23:07, 6.23s/it] {'loss': 0.4592, 'learning_rate': 2.8441310102290187e-06, 'epoch': 0.76} + 76%|███████▌ | 4394/5772 [3:51:54<2:23:06, 6.23s/it] 76%|███████▌ | 4395/5772 [3:52:00<2:21:26, 6.16s/it] 76%|███████▌ | 4395/5772 [3:52:02<2:21:26, 6.16s/it] {'loss': 0.4554, 'learning_rate': 2.840212028330418e-06, 'epoch': 0.76} + 76%|███████▌ | 4395/5772 [3:52:02<2:21:26, 6.16s/it] {'loss': 0.4554, 'learning_rate': 2.840212028330418e-06, 'epoch': 0.76} + 76%|███████▌ | 4395/5772 [3:52:00<2:21:26, 6.16s/it] 76%|███████▌ | 4396/5772 [3:52:06<2:20:20, 6.12s/it] 76%|███████▌ | 4396/5772 [3:52:08<2:20:20, 6.12s/it] {'loss': 0.4692, 'learning_rate': 2.8362953013672325e-06, 'epoch': 0.76} + 76%|███████▌ | 4396/5772 [3:52:08<2:20:20, 6.12s/it] {'loss': 0.4692, 'learning_rate': 2.8362953013672325e-06, 'epoch': 0.76} + 76%|███████▌ | 4396/5772 [3:52:06<2:20:20, 6.12s/it] 76%|███████▌ | 4397/5772 [3:52:12<2:20:41, 6.14s/it] 76%|███████▌ | 4397/5772 [3:52:14<2:20:41, 6.14s/it] {'loss': 0.451, 'learning_rate': 2.8323808305730062e-06, 'epoch': 0.76} + 76%|███████▌ | 4397/5772 [3:52:14<2:20:41, 6.14s/it] {'loss': 0.451, 'learning_rate': 2.8323808305730062e-06, 'epoch': 0.76} + 76%|███████▌ | 4397/5772 [3:52:12<2:20:41, 6.14s/it] 76%|███████▌ | 4398/5772 [3:52:19<2:22:16, 6.21s/it] 76%|███████▌ | 4398/5772 [3:52:21<2:22:16, 6.21s/it] {'loss': 0.452, 'learning_rate': 2.8284686171805875e-06, 'epoch': 0.76} + 76%|███████▌ | 4398/5772 [3:52:21<2:22:16, 6.21s/it] {'loss': 0.452, 'learning_rate': 2.8284686171805875e-06, 'epoch': 0.76} + 76%|███████▌ | 4398/5772 [3:52:19<2:22:16, 6.21s/it] 76%|███████▌ | 4399/5772 [3:52:25<2:20:55, 6.16s/it] 76%|███████▌ | 4399/5772 [3:52:27<2:20:55, 6.16s/it] {'loss': 0.4629, 'learning_rate': 2.8245586624221076e-06, 'epoch': 0.76} + 76%|███████▌ | 4399/5772 [3:52:27<2:20:55, 6.16s/it] {'loss': 0.4629, 'learning_rate': 2.8245586624221076e-06, 'epoch': 0.76} + 76%|███████▌ | 4399/5772 [3:52:25<2:20:55, 6.16s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14136 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 76%|███████▌ | 4400/5772 [3:52:33<2:20:49, 6.16s/it]9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + + 76%|███████▌ | 4400/5772 [3:52:31<2:20:49, 6.16s/it]5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4694, 'learning_rate': 2.8206509675289785e-06, 'epoch': 0.76} + 76%|███████▌ | 4400/5772 [3:52:33<2:20:49, 6.16s/it] {'loss': 0.4694, 'learning_rate': 2.8206509675289785e-06, 'epoch': 0.76} + 76%|███████▌ | 4400/5772 [3:52:31<2:20:49, 6.16s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-4400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 76%|███████▌ | 4401/5772 [3:52:53<3:57:56, 10.41s/it] 76%|███████▌ | 4401/5772 [3:52:51<3:57:56, 10.41s/it] {'loss': 0.4702, 'learning_rate': 2.8167455337319084e-06, 'epoch': 0.76} + 76%|███████▌ | 4401/5772 [3:52:53<3:57:56, 10.41s/it] {'loss': 0.4702, 'learning_rate': 2.8167455337319084e-06, 'epoch': 0.76} + 76%|███████▌ | 4401/5772 [3:52:51<3:57:56, 10.41s/it] 76%|███████▋ | 4402/5772 [3:53:00<3:31:50, 9.28s/it] 76%|███████▋ | 4402/5772 [3:52:58<3:31:50, 9.28s/it] {'loss': 0.4631, 'learning_rate': 2.8128423622608947e-06, 'epoch': 0.76} + 76%|███████▋ | 4402/5772 [3:53:00<3:31:50, 9.28s/it] {'loss': 0.4631, 'learning_rate': 2.8128423622608947e-06, 'epoch': 0.76} + 76%|███████▋ | 4402/5772 [3:52:58<3:31:50, 9.28s/it] 76%|███████▋ | 4403/5772 [3:53:04<3:08:21, 8.26s/it] 76%|███████▋ | 4403/5772 [3:53:06<3:08:21, 8.26s/it] {'loss': 0.4536, 'learning_rate': 2.808941454345221e-06, 'epoch': 0.76} + 76%|███████▋ | 4403/5772 [3:53:06<3:08:21, 8.26s/it] {'loss': 0.4536, 'learning_rate': 2.808941454345221e-06, 'epoch': 0.76} + 76%|███████▋ | 4403/5772 [3:53:04<3:08:21, 8.26s/it] 76%|███████▋ | 4404/5772 [3:53:12<2:52:07, 7.55s/it] 76%|███████▋ | 4404/5772 [3:53:10<2:52:07, 7.55s/it] {'loss': 0.4615, 'learning_rate': 2.8050428112134474e-06, 'epoch': 0.76} + 76%|███████▋ | 4404/5772 [3:53:12<2:52:07, 7.55s/it] {'loss': 0.4615, 'learning_rate': 2.8050428112134474e-06, 'epoch': 0.76} + 76%|███████▋ | 4404/5772 [3:53:10<2:52:07, 7.55s/it] 76%|███████▋ | 4405/5772 [3:53:16<2:41:29, 7.09s/it] 76%|███████▋ | 4405/5772 [3:53:18<2:41:29, 7.09s/it] {'loss': 0.4603, 'learning_rate': 2.8011464340934403e-06, 'epoch': 0.76} + 76%|███████▋ | 4405/5772 [3:53:18<2:41:29, 7.09s/it] {'loss': 0.4603, 'learning_rate': 2.8011464340934403e-06, 'epoch': 0.76} + 76%|███████▋ | 4405/5772 [3:53:16<2:41:29, 7.09s/it] 76%|███████▋ | 4406/5772 [3:53:22<2:34:27, 6.78s/it] 76%|███████▋ | 4406/5772 [3:53:24<2:34:27, 6.78s/it] {'loss': 0.4632, 'learning_rate': 2.7972523242123407e-06, 'epoch': 0.76} + 76%|███████▋ | 4406/5772 [3:53:24<2:34:27, 6.78s/it] {'loss': 0.4632, 'learning_rate': 2.7972523242123407e-06, 'epoch': 0.76} + 76%|███████▋ | 4406/5772 [3:53:22<2:34:27, 6.78s/it] 76%|███████▋ | 4407/5772 [3:53:28<2:29:20, 6.56s/it] 76%|███████▋ | 4407/5772 [3:53:30<2:29:20, 6.56s/it] {'loss': 0.4575, 'learning_rate': 2.79336048279657e-06, 'epoch': 0.76} + 76%|███████▋ | 4407/5772 [3:53:30<2:29:20, 6.56s/it] {'loss': 0.4575, 'learning_rate': 2.79336048279657e-06, 'epoch': 0.76} + 76%|███████▋ | 4407/5772 [3:53:28<2:29:20, 6.56s/it] 76%|███████▋ | 4408/5772 [3:53:34<2:26:31, 6.45s/it] 76%|███████▋ | 4408/5772 [3:53:36<2:26:31, 6.45s/it] {'loss': 0.4615, 'learning_rate': 2.7894709110718476e-06, 'epoch': 0.76} + 76%|███████▋ | 4408/5772 [3:53:36<2:26:31, 6.45s/it] {'loss': 0.4615, 'learning_rate': 2.7894709110718476e-06, 'epoch': 0.76} + 76%|███████▋ | 4408/5772 [3:53:34<2:26:31, 6.45s/it] 76%|███████▋ | 4409/5772 [3:53:42<2:25:43, 6.42s/it] 76%|███████▋ | 4409/5772 [3:53:40<2:25:44, 6.42s/it] {'loss': 0.4501, 'learning_rate': 2.7855836102631707e-06, 'epoch': 0.76} + 76%|███████▋ | 4409/5772 [3:53:42<2:25:43, 6.42s/it] {'loss': 0.4501, 'learning_rate': 2.7855836102631707e-06, 'epoch': 0.76} + 76%|███████▋ | 4409/5772 [3:53:40<2:25:44, 6.42s/it] 76%|███████▋ | 4410/5772 [3:53:48<2:23:44, 6.33s/it] 76%|███████▋ | 4410/5772 [3:53:46<2:23:45, 6.33s/it] {'loss': 0.4604, 'learning_rate': 2.781698581594826e-06, 'epoch': 0.76} + 76%|███████▋ | 4410/5772 [3:53:48<2:23:44, 6.33s/it] {'loss': 0.4604, 'learning_rate': 2.781698581594826e-06, 'epoch': 0.76} + 76%|███████▋ | 4410/5772 [3:53:47<2:23:45, 6.33s/it] 76%|███████▋ | 4411/5772 [3:53:55<2:22:32, 6.28s/it] 76%|███████▋ | 4411/5772 [3:53:53<2:22:32, 6.28s/it] {'loss': 0.457, 'learning_rate': 2.7778158262903764e-06, 'epoch': 0.76} + 76%|███████▋ | 4411/5772 [3:53:55<2:22:32, 6.28s/it] {'loss': 0.457, 'learning_rate': 2.7778158262903764e-06, 'epoch': 0.76} + 76%|███████▋ | 4411/5772 [3:53:53<2:22:32, 6.28s/it] 76%|███████▋ | 4412/5772 [3:53:59<2:20:56, 6.22s/it] 76%|███████▋ | 4412/5772 [3:54:01<2:20:56, 6.22s/it] {'loss': 0.4694, 'learning_rate': 2.7739353455726735e-06, 'epoch': 0.76} + 76%|███████▋ | 4412/5772 [3:54:01<2:20:56, 6.22s/it] {'loss': 0.4694, 'learning_rate': 2.7739353455726735e-06, 'epoch': 0.76} + 76%|███████▋ | 4412/5772 [3:53:59<2:20:56, 6.22s/it] 76%|███████▋ | 4413/5772 [3:54:05<2:20:05, 6.18s/it] 76%|███████▋ | 4413/5772 [3:54:07<2:20:05, 6.19s/it] {'loss': 0.4589, 'learning_rate': 2.7700571406638633e-06, 'epoch': 0.76} + 76%|███████▋ | 4413/5772 [3:54:07<2:20:05, 6.19s/it] {'loss': 0.4589, 'learning_rate': 2.7700571406638633e-06, 'epoch': 0.76} + 76%|███████▋ | 4413/5772 [3:54:05<2:20:05, 6.18s/it] 76%|███████▋ | 4414/5772 [3:54:11<2:17:56, 6.09s/it] 76%|███████▋ | 4414/5772 [3:54:13<2:17:56, 6.09s/it] {'loss': 0.4522, 'learning_rate': 2.7661812127853536e-06, 'epoch': 0.76} + 76%|███████▋ | 4414/5772 [3:54:13<2:17:56, 6.09s/it] {'loss': 0.4522, 'learning_rate': 2.7661812127853536e-06, 'epoch': 0.76} + 76%|███████▋ | 4414/5772 [3:54:11<2:17:56, 6.09s/it] 76%|███████▋ | 4415/5772 [3:54:17<2:18:50, 6.14s/it] 76%|███████▋ | 4415/5772 [3:54:19<2:18:50, 6.14s/it] {'loss': 0.4688, 'learning_rate': 2.762307563157852e-06, 'epoch': 0.76} + 76%|███████▋ | 4415/5772 [3:54:19<2:18:50, 6.14s/it] {'loss': 0.4688, 'learning_rate': 2.762307563157852e-06, 'epoch': 0.76} + 76%|███████▋ | 4415/5772 [3:54:17<2:18:50, 6.14s/it] 77%|███████▋ | 4416/5772 [3:54:23<2:16:17, 6.03s/it] 77%|███████▋ | 4416/5772 [3:54:25<2:16:17, 6.03s/it] {'loss': 0.4658, 'learning_rate': 2.7584361930013413e-06, 'epoch': 0.77} + 77%|███████▋ | 4416/5772 [3:54:25<2:16:17, 6.03s/it] {'loss': 0.4658, 'learning_rate': 2.7584361930013413e-06, 'epoch': 0.77} + 77%|███████▋ | 4416/5772 [3:54:23<2:16:17, 6.03s/it] 77%|███████▋ | 4417/5772 [3:54:29<2:17:35, 6.09s/it] 77%|███████▋ | 4417/5772 [3:54:31<2:17:35, 6.09s/it] {'loss': 0.4603, 'learning_rate': 2.7545671035350907e-06, 'epoch': 0.77} + 77%|███████▋ | 4417/5772 [3:54:31<2:17:35, 6.09s/it] {'loss': 0.4603, 'learning_rate': 2.7545671035350907e-06, 'epoch': 0.77} + 77%|███████▋ | 4417/5772 [3:54:29<2:17:35, 6.09s/it] 77%|███████▋ | 4418/5772 [3:54:35<2:17:43, 6.10s/it] 77%|███████▋ | 4418/5772 [3:54:37<2:17:43, 6.10s/it] {'loss': 0.4644, 'learning_rate': 2.7507002959776443e-06, 'epoch': 0.77} + 77%|███████▋ | 4418/5772 [3:54:37<2:17:43, 6.10s/it] {'loss': 0.4644, 'learning_rate': 2.7507002959776443e-06, 'epoch': 0.77} + 77%|███████▋ | 4418/5772 [3:54:35<2:17:43, 6.10s/it] 77%|███████▋ | 4419/5772 [3:54:41<2:16:30, 6.05s/it] 77%|███████▋ | 4419/5772 [3:54:43<2:16:30, 6.05s/it] {'loss': 0.4498, 'learning_rate': 2.7468357715468296e-06, 'epoch': 0.77} + 77%|███████▋ | 4419/5772 [3:54:43<2:16:30, 6.05s/it] {'loss': 0.4498, 'learning_rate': 2.7468357715468296e-06, 'epoch': 0.77} + 77%|███████▋ | 4419/5772 [3:54:41<2:16:30, 6.05s/it] 77%|███████▋ | 4420/5772 [3:54:47<2:14:58, 5.99s/it] 77%|███████▋ | 4420/5772 [3:54:49<2:14:58, 5.99s/it] {'loss': 0.4577, 'learning_rate': 2.742973531459767e-06, 'epoch': 0.77} + 77%|███████▋ | 4420/5772 [3:54:49<2:14:58, 5.99s/it] {'loss': 0.4577, 'learning_rate': 2.742973531459767e-06, 'epoch': 0.77} + 77%|███████▋ | 4420/5772 [3:54:47<2:14:58, 5.99s/it] 77%|███████▋ | 4421/5772 [3:54:53<2:17:41, 6.12s/it] 77%|███████▋ | 4421/5772 [3:54:55<2:17:42, 6.12s/it] {'loss': 0.4691, 'learning_rate': 2.739113576932838e-06, 'epoch': 0.77} + 77%|███████▋ | 4421/5772 [3:54:55<2:17:42, 6.12s/it] {'loss': 0.4691, 'learning_rate': 2.739113576932838e-06, 'epoch': 0.77} + 77%|███████▋ | 4421/5772 [3:54:53<2:17:41, 6.12s/it] 77%|███████▋ | 4422/5772 [3:54:59<2:16:23, 6.06s/it] 77%|███████▋ | 4422/5772 [3:55:01<2:16:23, 6.06s/it] {'loss': 0.4659, 'learning_rate': 2.735255909181719e-06, 'epoch': 0.77} + 77%|███████▋ | 4422/5772 [3:55:01<2:16:23, 6.06s/it] {'loss': 0.4659, 'learning_rate': 2.735255909181719e-06, 'epoch': 0.77} + 77%|███████▋ | 4422/5772 [3:54:59<2:16:23, 6.06s/it] 77%|███████▋ | 4423/5772 [3:55:05<2:16:03, 6.05s/it] 77%|███████▋ | 4423/5772 [3:55:07<2:16:03, 6.05s/it] {'loss': 0.4499, 'learning_rate': 2.7314005294213573e-06, 'epoch': 0.77} + 77%|███████▋ | 4423/5772 [3:55:07<2:16:03, 6.05s/it] {'loss': 0.4499, 'learning_rate': 2.7314005294213573e-06, 'epoch': 0.77} + 77%|███████▋ | 4423/5772 [3:55:05<2:16:03, 6.05s/it] 77%|███████▋ | 4424/5772 [3:55:11<2:14:44, 6.00s/it] 77%|███████▋ | 4424/5772 [3:55:13<2:14:44, 6.00s/it] {'loss': 0.4618, 'learning_rate': 2.7275474388659896e-06, 'epoch': 0.77} + 77%|███████▋ | 4424/5772 [3:55:13<2:14:44, 6.00s/it] {'loss': 0.4618, 'learning_rate': 2.7275474388659896e-06, 'epoch': 0.77} + 77%|███████▋ | 4424/5772 [3:55:11<2:14:44, 6.00s/it] 77%|███████▋ | 4425/5772 [3:55:17<2:15:16, 6.03s/it] 77%|███████▋ | 4425/5772 [3:55:19<2:15:16, 6.03s/it] {'loss': 0.4588, 'learning_rate': 2.7236966387291176e-06, 'epoch': 0.77} + 77%|███████▋ | 4425/5772 [3:55:19<2:15:16, 6.03s/it] {'loss': 0.4588, 'learning_rate': 2.7236966387291176e-06, 'epoch': 0.77} + 77%|███████▋ | 4425/5772 [3:55:17<2:15:16, 6.03s/it] 77%|███████▋ | 4426/5772 [3:55:24<2:17:13, 6.12s/it] 77%|███████▋ | 4426/5772 [3:55:26<2:17:13, 6.12s/it] {'loss': 0.4684, 'learning_rate': 2.7198481302235325e-06, 'epoch': 0.77} + 77%|███████▋ | 4426/5772 [3:55:26<2:17:13, 6.12s/it] {'loss': 0.4684, 'learning_rate': 2.7198481302235325e-06, 'epoch': 0.77} + 77%|███████▋ | 4426/5772 [3:55:24<2:17:13, 6.12s/it] 77%|███████▋ | 4427/5772 [3:55:30<2:18:30, 6.18s/it] 77%|███████▋ | 4427/5772 [3:55:32<2:18:30, 6.18s/it] {'loss': 0.4643, 'learning_rate': 2.7160019145613002e-06, 'epoch': 0.77} + 77%|███████▋ | 4427/5772 [3:55:32<2:18:30, 6.18s/it] {'loss': 0.4643, 'learning_rate': 2.7160019145613002e-06, 'epoch': 0.77} + 77%|███████▋ | 4427/5772 [3:55:30<2:18:30, 6.18s/it] 77%|███████▋ | 4428/5772 [3:55:36<2:15:15, 6.04s/it] 77%|███████▋ | 4428/5772 [3:55:38<2:15:15, 6.04s/it] {'loss': 0.4602, 'learning_rate': 2.7121579929537677e-06, 'epoch': 0.77} + 77%|███████▋ | 4428/5772 [3:55:38<2:15:15, 6.04s/it] {'loss': 0.4602, 'learning_rate': 2.7121579929537677e-06, 'epoch': 0.77} + 77%|███████▋ | 4428/5772 [3:55:36<2:15:15, 6.04s/it] 77%|███████▋ | 4429/5772 [3:55:42<2:14:53, 6.03s/it] 77%|███████▋ | 4429/5772 [3:55:44<2:14:53, 6.03s/it] {'loss': 0.4479, 'learning_rate': 2.7083163666115564e-06, 'epoch': 0.77} + 77%|███████▋ | 4429/5772 [3:55:44<2:14:53, 6.03s/it] {'loss': 0.4479, 'learning_rate': 2.7083163666115564e-06, 'epoch': 0.77} + 77%|███████▋ | 4429/5772 [3:55:42<2:14:53, 6.03s/it] 77%|███████▋ | 4430/5772 [3:55:48<2:19:26, 6.23s/it] 77%|███████▋ | 4430/5772 [3:55:50<2:19:26, 6.23s/it] {'loss': 0.4654, 'learning_rate': 2.7044770367445583e-06, 'epoch': 0.77} + 77%|███████▋ | 4430/5772 [3:55:50<2:19:26, 6.23s/it] {'loss': 0.4654, 'learning_rate': 2.7044770367445583e-06, 'epoch': 0.77} + 77%|███████▋ | 4430/5772 [3:55:48<2:19:26, 6.23s/it] 77%|███████▋ | 4431/5772 [3:55:54<2:18:37, 6.20s/it] 77%|███████▋ | 4431/5772 [3:55:56<2:18:37, 6.20s/it] {'loss': 0.4454, 'learning_rate': 2.7006400045619597e-06, 'epoch': 0.77} + 77%|███████▋ | 4431/5772 [3:55:56<2:18:37, 6.20s/it] {'loss': 0.4454, 'learning_rate': 2.7006400045619597e-06, 'epoch': 0.77} + 77%|███████▋ | 4431/5772 [3:55:54<2:18:37, 6.20s/it] 77%|███████▋ | 4432/5772 [3:56:00<2:17:15, 6.15s/it] 77%|███████▋ | 4432/5772 [3:56:02<2:17:15, 6.15s/it] {'loss': 0.4655, 'learning_rate': 2.6968052712722037e-06, 'epoch': 0.77} + 77%|███████▋ | 4432/5772 [3:56:02<2:17:15, 6.15s/it] {'loss': 0.4655, 'learning_rate': 2.6968052712722037e-06, 'epoch': 0.77} + 77%|███████▋ | 4432/5772 [3:56:00<2:17:15, 6.15s/it] 77%|███████▋ | 4433/5772 [3:56:09<2:19:52, 6.27s/it] 77%|███████▋ | 4433/5772 [3:56:07<2:19:52, 6.27s/it] {'loss': 0.4514, 'learning_rate': 2.692972838083022e-06, 'epoch': 0.77} + 77%|███████▋ | 4433/5772 [3:56:09<2:19:52, 6.27s/it] {'loss': 0.4514, 'learning_rate': 2.692972838083022e-06, 'epoch': 0.77} + 77%|███████▋ | 4433/5772 [3:56:07<2:19:52, 6.27s/it] 77%|███████▋ | 4434/5772 [3:56:13<2:17:09, 6.15s/it] 77%|███████▋ | 4434/5772 [3:56:15<2:17:09, 6.15s/it] {'loss': 0.455, 'learning_rate': 2.6891427062014184e-06, 'epoch': 0.77} + 77%|███████▋ | 4434/5772 [3:56:15<2:17:09, 6.15s/it] {'loss': 0.455, 'learning_rate': 2.6891427062014184e-06, 'epoch': 0.77} + 77%|███████▋ | 4434/5772 [3:56:13<2:17:09, 6.15s/it] 77%|███████▋ | 4435/5772 [3:56:19<2:18:13, 6.20s/it] 77%|███████▋ | 4435/5772 [3:56:21<2:18:13, 6.20s/it] {'loss': 0.4566, 'learning_rate': 2.6853148768336703e-06, 'epoch': 0.77} + 77%|███████▋ | 4435/5772 [3:56:21<2:18:13, 6.20s/it] {'loss': 0.4566, 'learning_rate': 2.6853148768336703e-06, 'epoch': 0.77} + 77%|███████▋ | 4435/5772 [3:56:19<2:18:13, 6.20s/it] 77%|███████▋ | 4436/5772 [3:56:25<2:17:10, 6.16s/it] 77%|███████▋ | 4436/5772 [3:56:27<2:17:10, 6.16s/it] {'loss': 0.4567, 'learning_rate': 2.6814893511853347e-06, 'epoch': 0.77} + 77%|███████▋ | 4436/5772 [3:56:27<2:17:10, 6.16s/it] {'loss': 0.4567, 'learning_rate': 2.6814893511853347e-06, 'epoch': 0.77} + 77%|███████▋ | 4436/5772 [3:56:25<2:17:10, 6.16s/it] 77%|███████▋ | 4437/5772 [3:56:31<2:14:45, 6.06s/it] 77%|███████▋ | 4437/5772 [3:56:33<2:14:45, 6.06s/it] {'loss': 0.4504, 'learning_rate': 2.677666130461232e-06, 'epoch': 0.77} + 77%|███████▋ | 4437/5772 [3:56:33<2:14:45, 6.06s/it] {'loss': 0.4504, 'learning_rate': 2.677666130461232e-06, 'epoch': 0.77} + 77%|███████▋ | 4437/5772 [3:56:31<2:14:45, 6.06s/it] 77%|███████▋ | 4438/5772 [3:56:37<2:16:37, 6.15s/it] 77%|███████▋ | 4438/5772 [3:56:39<2:16:37, 6.15s/it] {'loss': 0.466, 'learning_rate': 2.6738452158654736e-06, 'epoch': 0.77} + 77%|███████▋ | 4438/5772 [3:56:39<2:16:37, 6.15s/it] {'loss': 0.466, 'learning_rate': 2.6738452158654736e-06, 'epoch': 0.77} + 77%|███████▋ | 4438/5772 [3:56:37<2:16:37, 6.15s/it] 77%|███████▋ | 4439/5772 [3:56:44<2:18:29, 6.23s/it] 77%|███████▋ | 4439/5772 [3:56:46<2:18:29, 6.23s/it] {'loss': 0.4617, 'learning_rate': 2.670026608601429e-06, 'epoch': 0.77} + 77%|███████▋ | 4439/5772 [3:56:46<2:18:29, 6.23s/it] {'loss': 0.4617, 'learning_rate': 2.670026608601429e-06, 'epoch': 0.77} + 77%|███████▋ | 4439/5772 [3:56:44<2:18:29, 6.23s/it] 77%|███████▋ | 4440/5772 [3:56:50<2:16:59, 6.17s/it] 77%|███████▋ | 4440/5772 [3:56:52<2:16:59, 6.17s/it] {'loss': 0.4668, 'learning_rate': 2.6662103098717485e-06, 'epoch': 0.77} + 77%|███████▋ | 4440/5772 [3:56:52<2:16:59, 6.17s/it] {'loss': 0.4668, 'learning_rate': 2.6662103098717485e-06, 'epoch': 0.77} + 77%|███████▋ | 4440/5772 [3:56:50<2:16:59, 6.17s/it] 77%|███████▋ | 4441/5772 [3:56:58<2:18:38, 6.25s/it] 77%|███████▋ | 4441/5772 [3:56:56<2:18:38, 6.25s/it] {'loss': 0.4616, 'learning_rate': 2.6623963208783553e-06, 'epoch': 0.77} + 77%|███████▋ | 4441/5772 [3:56:58<2:18:38, 6.25s/it] {'loss': 0.4616, 'learning_rate': 2.6623963208783553e-06, 'epoch': 0.77} + 77%|███████▋ | 4441/5772 [3:56:56<2:18:38, 6.25s/it] 77%|███████▋ | 4442/5772 [3:57:04<2:15:42, 6.12s/it] 77%|███████▋ | 4442/5772 [3:57:02<2:15:42, 6.12s/it] {'loss': 0.4574, 'learning_rate': 2.658584642822444e-06, 'epoch': 0.77} + 77%|███████▋ | 4442/5772 [3:57:04<2:15:42, 6.12s/it] {'loss': 0.4574, 'learning_rate': 2.658584642822444e-06, 'epoch': 0.77} + 77%|███████▋ | 4442/5772 [3:57:02<2:15:42, 6.12s/it] 77%|███████▋ | 4443/5772 [3:57:10<2:16:19, 6.15s/it] 77%|███████▋ | 4443/5772 [3:57:08<2:16:19, 6.15s/it] {'loss': 0.4585, 'learning_rate': 2.654775276904483e-06, 'epoch': 0.77} + 77%|███████▋ | 4443/5772 [3:57:10<2:16:19, 6.15s/it] {'loss': 0.4585, 'learning_rate': 2.654775276904483e-06, 'epoch': 0.77} + 77%|███████▋ | 4443/5772 [3:57:08<2:16:19, 6.15s/it] 77%|███████▋ | 4444/5772 [3:57:14<2:15:54, 6.14s/it] 77%|███████▋ | 4444/5772 [3:57:16<2:15:54, 6.14s/it] {'loss': 0.4679, 'learning_rate': 2.6509682243242074e-06, 'epoch': 0.77} + 77%|███████▋ | 4444/5772 [3:57:16<2:15:54, 6.14s/it] {'loss': 0.4679, 'learning_rate': 2.6509682243242074e-06, 'epoch': 0.77} + 77%|███████▋ | 4444/5772 [3:57:14<2:15:54, 6.14s/it] 77%|███████▋ | 4445/5772 [3:57:20<2:14:12, 6.07s/it] 77%|███████▋ | 4445/5772 [3:57:22<2:14:12, 6.07s/it] {'loss': 0.4717, 'learning_rate': 2.6471634862806272e-06, 'epoch': 0.77} + 77%|███████▋ | 4445/5772 [3:57:22<2:14:12, 6.07s/it] {'loss': 0.4717, 'learning_rate': 2.6471634862806272e-06, 'epoch': 0.77} + 77%|███████▋ | 4445/5772 [3:57:20<2:14:12, 6.07s/it] 77%|███████▋ | 4446/5772 [3:57:26<2:14:11, 6.07s/it] 77%|███████▋ | 4446/5772 [3:57:28<2:14:11, 6.07s/it] {'loss': 0.4692, 'learning_rate': 2.6433610639720265e-06, 'epoch': 0.77} + 77%|███████▋ | 4446/5772 [3:57:28<2:14:11, 6.07s/it] {'loss': 0.4692, 'learning_rate': 2.6433610639720265e-06, 'epoch': 0.77} + 77%|███████▋ | 4446/5772 [3:57:26<2:14:11, 6.07s/it] 77%|███████▋ | 4447/5772 [3:57:35<2:19:07, 6.30s/it] 77%|███████▋ | 4447/5772 [3:57:33<2:19:07, 6.30s/it] {'loss': 0.4644, 'learning_rate': 2.6395609585959547e-06, 'epoch': 0.77} + 77%|███████▋ | 4447/5772 [3:57:33<2:19:07, 6.30s/it]{'loss': 0.4644, 'learning_rate': 2.6395609585959547e-06, 'epoch': 0.77} + 77%|███████▋ | 4447/5772 [3:57:35<2:19:07, 6.30s/it] 77%|███████▋ | 4448/5772 [3:57:41<2:15:49, 6.15s/it] 77%|███████▋ | 4448/5772 [3:57:39<2:15:49, 6.15s/it] {'loss': 0.4487, 'learning_rate': 2.635763171349235e-06, 'epoch': 0.77} + 77%|███████▋ | 4448/5772 [3:57:41<2:15:49, 6.15s/it] {'loss': 0.4487, 'learning_rate': 2.635763171349235e-06, 'epoch': 0.77} + 77%|███████▋ | 4448/5772 [3:57:39<2:15:49, 6.15s/it] 77%|███████▋ | 4449/5772 [3:57:47<2:13:31, 6.06s/it] 77%|███████▋ | 4449/5772 [3:57:45<2:13:31, 6.06s/it] {'loss': 0.4577, 'learning_rate': 2.631967703427959e-06, 'epoch': 0.77} + 77%|███████▋ | 4449/5772 [3:57:47<2:13:31, 6.06s/it] {'loss': 0.4577, 'learning_rate': 2.631967703427959e-06, 'epoch': 0.77} + 77%|███████▋ | 4449/5772 [3:57:45<2:13:31, 6.06s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 11 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 77%|███████▋ | 4450/5772 [3:57:53<2:15:42, 6.16s/it]14 AutoResumeHook: Checking whether to suspend... +059 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +2 + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... + 77%|███████▋ | 4450/5772 [3:57:51<2:15:42, 6.16s/it]12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.464, 'learning_rate': 2.628174556027492e-06, 'epoch': 0.77} + 77%|███████▋ | 4450/5772 [3:57:53<2:15:42, 6.16s/it] {'loss': 0.464, 'learning_rate': 2.628174556027492e-06, 'epoch': 0.77} + 77%|███████▋ | 4450/5772 [3:57:51<2:15:42, 6.16s/it] 77%|███████▋ | 4451/5772 [3:57:59<2:14:36, 6.11s/it] 77%|███████▋ | 4451/5772 [3:57:57<2:14:36, 6.11s/it] {'loss': 0.4476, 'learning_rate': 2.624383730342457e-06, 'epoch': 0.77} + 77%|███████▋ | 4451/5772 [3:57:59<2:14:36, 6.11s/it] {'loss': 0.4476, 'learning_rate': 2.624383730342457e-06, 'epoch': 0.77} + 77%|███████▋ | 4451/5772 [3:57:57<2:14:36, 6.11s/it]Apr 10 05:57:16.902229 1218894 slurmstepd 0x155550ab8700: error: *** STEP 6692876.0 ON batch-block1-2085 CANCELLED AT 2025-04-10T05:57:16 DUE TO TIME LIMIT *** +srun: Job step aborted: Waiting up to 122 seconds for job step to finish. +srun: error: batch-block1-2109: task 1: Terminated +srun: Terminating StepId=6692876.0 +srun: error: batch-block1-2085: task 0: Terminated +srun: job 6716486 queued and waiting for resources +srun: job 6716486 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block5-00641 +JobID: 6716486 | Full list: batch-block5-00641 batch-block5-00553 +NETWORK=Efficient-Large-Model/VILA1.5-3b +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block5-00641 +JobID: 6716486 | Full list: batch-block5-00641 batch-block5-00553 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 12:31:04,366] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,366] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,366] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,367] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,373] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,381] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,381] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:04,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,387] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,395] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,396] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,397] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,401] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:05,403] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,349] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:06,350] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:06,350] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 12:31:07,615] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 12:31:07,615] [INFO] [comm.py:594:init_distributed] cdb=None +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-04-10 12:31:20,548] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 2.70B parameters + Loading checkpoint shards: 0%| | 0/2 [00:00 4096). Running this sequence through the model will result in indexing errors + 90%|████████▉ | 5170/5772 [1:20:38<1:01:12, 6.10s/it] 90%|████████▉ | 5170/5772 [1:20:41<1:01:12, 6.10s/it] {'loss': 0.4809, 'learning_rate': 5.652782134367974e-07, 'epoch': 0.9} + 90%|████████▉ | 5170/5772 [1:20:41<1:01:12, 6.10s/it] {'loss': 0.4809, 'learning_rate': 5.652782134367974e-07, 'epoch': 0.9} + 90%|████████▉ | 5170/5772 [1:20:38<1:01:12, 6.10s/it] 90%|████████▉ | 5171/5772 [1:20:44<1:00:27, 6.04s/it] 90%|████████▉ | 5171/5772 [1:20:47<1:00:27, 6.04s/it] {'loss': 0.4448, 'learning_rate': 5.634195950712939e-07, 'epoch': 0.9} + 90%|████████▉ | 5171/5772 [1:20:47<1:00:27, 6.04s/it] {'loss': 0.4448, 'learning_rate': 5.634195950712939e-07, 'epoch': 0.9} + 90%|████████▉ | 5171/5772 [1:20:44<1:00:27, 6.04s/it] 90%|████████▉ | 5172/5772 [1:20:51<1:01:11, 6.12s/it] 90%|████████▉ | 5172/5772 [1:20:54<1:01:11, 6.12s/it] {'loss': 0.4605, 'learning_rate': 5.615639487043201e-07, 'epoch': 0.9} + 90%|████████▉ | 5172/5772 [1:20:54<1:01:11, 6.12s/it] {'loss': 0.4605, 'learning_rate': 5.615639487043201e-07, 'epoch': 0.9} + 90%|████████▉ | 5172/5772 [1:20:51<1:01:11, 6.12s/it] 90%|████████▉ | 5173/5772 [1:20:59<1:00:01, 6.01s/it] 90%|████████▉ | 5173/5772 [1:20:56<1:00:01, 6.01s/it] {'loss': 0.4666, 'learning_rate': 5.597112749203038e-07, 'epoch': 0.9} + 90%|████████▉ | 5173/5772 [1:20:59<1:00:01, 6.01s/it] {'loss': 0.4666, 'learning_rate': 5.597112749203038e-07, 'epoch': 0.9} + 90%|████████▉ | 5173/5772 [1:20:56<1:00:01, 6.01s/it] 90%|████████▉ | 5174/5772 [1:21:03<1:01:25, 6.16s/it] 90%|████████▉ | 5174/5772 [1:21:06<1:01:25, 6.16s/it] {'loss': 0.4622, 'learning_rate': 5.578615743027338e-07, 'epoch': 0.9} + 90%|████████▉ | 5174/5772 [1:21:06<1:01:25, 6.16s/it] {'loss': 0.4622, 'learning_rate': 5.578615743027338e-07, 'epoch': 0.9} + 90%|████████▉ | 5174/5772 [1:21:03<1:01:25, 6.16s/it] 90%|████████▉ | 5175/5772 [1:21:12<1:01:27, 6.18s/it] 90%|████████▉ | 5175/5772 [1:21:09<1:01:27, 6.18s/it] {'loss': 0.4523, 'learning_rate': 5.56014847434162e-07, 'epoch': 0.9} + 90%|████████▉ | 5175/5772 [1:21:12<1:01:27, 6.18s/it] {'loss': 0.4523, 'learning_rate': 5.56014847434162e-07, 'epoch': 0.9} + 90%|████████▉ | 5175/5772 [1:21:09<1:01:27, 6.18s/it] 90%|████████▉ | 5176/5772 [1:21:18<1:01:21, 6.18s/it] 90%|████████▉ | 5176/5772 [1:21:15<1:01:21, 6.18s/it] {'loss': 0.4649, 'learning_rate': 5.541710948962043e-07, 'epoch': 0.9} + 90%|████████▉ | 5176/5772 [1:21:18<1:01:21, 6.18s/it] {'loss': 0.4649, 'learning_rate': 5.541710948962043e-07, 'epoch': 0.9} + 90%|████████▉ | 5176/5772 [1:21:15<1:01:21, 6.18s/it] 90%|████████▉ | 5177/5772 [1:21:25<1:01:32, 6.21s/it] 90%|████████▉ | 5177/5772 [1:21:22<1:01:32, 6.21s/it] {'loss': 0.457, 'learning_rate': 5.523303172695427e-07, 'epoch': 0.9} + 90%|████████▉ | 5177/5772 [1:21:25<1:01:32, 6.21s/it] {'loss': 0.457, 'learning_rate': 5.523303172695427e-07, 'epoch': 0.9} + 90%|████████▉ | 5177/5772 [1:21:22<1:01:32, 6.21s/it] 90%|████████▉ | 5178/5772 [1:21:30<1:00:00, 6.06s/it] 90%|████████▉ | 5178/5772 [1:21:27<1:00:00, 6.06s/it] {'loss': 0.453, 'learning_rate': 5.504925151339191e-07, 'epoch': 0.9} + 90%|████████▉ | 5178/5772 [1:21:30<1:00:00, 6.06s/it] {'loss': 0.453, 'learning_rate': 5.504925151339191e-07, 'epoch': 0.9} + 90%|████████▉ | 5178/5772 [1:21:27<1:00:00, 6.06s/it] 90%|████████▉ | 5179/5772 [1:21:37<1:02:10, 6.29s/it] 90%|████████▉ | 5179/5772 [1:21:34<1:02:10, 6.29s/it] {'loss': 0.4498, 'learning_rate': 5.48657689068135e-07, 'epoch': 0.9} + 90%|████████▉ | 5179/5772 [1:21:37<1:02:10, 6.29s/it] {'loss': 0.4498, 'learning_rate': 5.48657689068135e-07, 'epoch': 0.9} + 90%|████████▉ | 5179/5772 [1:21:34<1:02:10, 6.29s/it] 90%|████████▉ | 5180/5772 [1:21:43<1:01:28, 6.23s/it] 90%|████████▉ | 5180/5772 [1:21:40<1:01:28, 6.23s/it] {'loss': 0.4503, 'learning_rate': 5.468258396500636e-07, 'epoch': 0.9} + 90%|████████▉ | 5180/5772 [1:21:43<1:01:28, 6.23s/it] {'loss': 0.4503, 'learning_rate': 5.468258396500636e-07, 'epoch': 0.9} + 90%|████████▉ | 5180/5772 [1:21:40<1:01:28, 6.23s/it] 90%|████████▉ | 5181/5772 [1:21:49<1:00:54, 6.18s/it] 90%|████████▉ | 5181/5772 [1:21:46<1:00:54, 6.18s/it] {'loss': 0.4526, 'learning_rate': 5.449969674566369e-07, 'epoch': 0.9} + 90%|████████▉ | 5181/5772 [1:21:49<1:00:54, 6.18s/it] {'loss': 0.4526, 'learning_rate': 5.449969674566369e-07, 'epoch': 0.9} + 90%|████████▉ | 5181/5772 [1:21:46<1:00:54, 6.18s/it] 90%|████████▉ | 5182/5772 [1:21:56<1:01:08, 6.22s/it] 90%|████████▉ | 5182/5772 [1:21:53<1:01:08, 6.22s/it] {'loss': 0.4634, 'learning_rate': 5.431710730638428e-07, 'epoch': 0.9} + 90%|████████▉ | 5182/5772 [1:21:56<1:01:08, 6.22s/it] {'loss': 0.4634, 'learning_rate': 5.431710730638428e-07, 'epoch': 0.9} + 90%|████████▉ | 5182/5772 [1:21:53<1:01:08, 6.22s/it] 90%|████████▉ | 5183/5772 [1:22:02<1:01:19, 6.25s/it] 90%|████████▉ | 5183/5772 [1:21:59<1:01:19, 6.25s/it] {'loss': 0.463, 'learning_rate': 5.413481570467382e-07, 'epoch': 0.9} + 90%|████████▉ | 5183/5772 [1:22:02<1:01:19, 6.25s/it] {'loss': 0.463, 'learning_rate': 5.413481570467382e-07, 'epoch': 0.9} + 90%|████████▉ | 5183/5772 [1:21:59<1:01:19, 6.25s/it] 90%|████████▉ | 5184/5772 [1:22:08<1:00:04, 6.13s/it] 90%|████████▉ | 5184/5772 [1:22:05<1:00:04, 6.13s/it] {'loss': 0.4711, 'learning_rate': 5.395282199794427e-07, 'epoch': 0.9} + 90%|████████▉ | 5184/5772 [1:22:08<1:00:04, 6.13s/it] {'loss': 0.4711, 'learning_rate': 5.395282199794427e-07, 'epoch': 0.9} + 90%|████████▉ | 5184/5772 [1:22:05<1:00:04, 6.13s/it] 90%|████████▉ | 5185/5772 [1:22:11<59:52, 6.12s/it] 90%|████████▉ | 5185/5772 [1:22:14<59:52, 6.12s/it] {'loss': 0.4462, 'learning_rate': 5.377112624351355e-07, 'epoch': 0.9} + 90%|████████▉ | 5185/5772 [1:22:14<59:52, 6.12s/it] {'loss': 0.4462, 'learning_rate': 5.377112624351355e-07, 'epoch': 0.9} + 90%|████████▉ | 5185/5772 [1:22:11<59:52, 6.12s/it] 90%|████████▉ | 5186/5772 [1:22:20<1:00:12, 6.16s/it] 90%|████████▉ | 5186/5772 [1:22:17<1:00:12, 6.16s/it] {'loss': 0.4639, 'learning_rate': 5.358972849860533e-07, 'epoch': 0.9} + 90%|████████▉ | 5186/5772 [1:22:20<1:00:12, 6.16s/it] {'loss': 0.4639, 'learning_rate': 5.358972849860533e-07, 'epoch': 0.9} + 90%|████████▉ | 5186/5772 [1:22:17<1:00:12, 6.16s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 90%|████████▉ | 5187/5772 [1:22:27<1:00:43, 6.23s/it] 90%|████████▉ | 5187/5772 [1:22:23<1:00:43, 6.23s/it] {'loss': 0.4728, 'learning_rate': 5.340862882034992e-07, 'epoch': 0.9} + 90%|████████▉ | 5187/5772 [1:22:27<1:00:43, 6.23s/it] {'loss': 0.4728, 'learning_rate': 5.340862882034992e-07, 'epoch': 0.9} + 90%|████████▉ | 5187/5772 [1:22:23<1:00:43, 6.23s/it] 90%|████████▉ | 5188/5772 [1:22:30<1:00:33, 6.22s/it] 90%|████████▉ | 5188/5772 [1:22:33<1:00:33, 6.22s/it] {'loss': 0.4621, 'learning_rate': 5.322782726578413e-07, 'epoch': 0.9} + 90%|████████▉ | 5188/5772 [1:22:33<1:00:33, 6.22s/it] {'loss': 0.4621, 'learning_rate': 5.322782726578413e-07, 'epoch': 0.9} + 90%|████████▉ | 5188/5772 [1:22:30<1:00:33, 6.22s/it] 90%|████████▉ | 5189/5772 [1:22:39<1:00:43, 6.25s/it] 90%|████████▉ | 5189/5772 [1:22:36<1:00:43, 6.25s/it] {'loss': 0.4525, 'learning_rate': 5.304732389184986e-07, 'epoch': 0.9} + 90%|████████▉ | 5189/5772 [1:22:39<1:00:43, 6.25s/it] {'loss': 0.4525, 'learning_rate': 5.304732389184986e-07, 'epoch': 0.9} + 90%|████████▉ | 5189/5772 [1:22:36<1:00:43, 6.25s/it] 90%|████████▉ | 5190/5772 [1:22:45<58:56, 6.08s/it] 90%|████████▉ | 5190/5772 [1:22:42<58:56, 6.08s/it] {'loss': 0.4578, 'learning_rate': 5.286711875539585e-07, 'epoch': 0.9} + 90%|████████▉ | 5190/5772 [1:22:45<58:56, 6.08s/it] {'loss': 0.4578, 'learning_rate': 5.286711875539585e-07, 'epoch': 0.9} + 90%|████████▉ | 5190/5772 [1:22:42<58:56, 6.08s/it] 90%|████████▉ | 5191/5772 [1:22:51<58:24, 6.03s/it] 90%|████████▉ | 5191/5772 [1:22:48<58:24, 6.03s/it] {'loss': 0.4595, 'learning_rate': 5.268721191317683e-07, 'epoch': 0.9} + 90%|████████▉ | 5191/5772 [1:22:51<58:24, 6.03s/it] {'loss': 0.4595, 'learning_rate': 5.268721191317683e-07, 'epoch': 0.9} + 90%|████████▉ | 5191/5772 [1:22:48<58:24, 6.03s/it] 90%|████████▉ | 5192/5772 [1:22:57<58:43, 6.07s/it] 90%|████████▉ | 5192/5772 [1:22:54<58:43, 6.07s/it] {'loss': 0.4715, 'learning_rate': 5.250760342185335e-07, 'epoch': 0.9} + 90%|████████▉ | 5192/5772 [1:22:57<58:43, 6.07s/it] {'loss': 0.4715, 'learning_rate': 5.250760342185335e-07, 'epoch': 0.9} + 90%|████████▉ | 5192/5772 [1:22:54<58:43, 6.07s/it] 90%|████████▉ | 5193/5772 [1:23:03<59:52, 6.21s/it] 90%|████████▉ | 5193/5772 [1:23:00<59:52, 6.21s/it] {'loss': 0.4446, 'learning_rate': 5.232829333799205e-07, 'epoch': 0.9} + 90%|████████▉ | 5193/5772 [1:23:03<59:52, 6.21s/it] {'loss': 0.4446, 'learning_rate': 5.232829333799205e-07, 'epoch': 0.9} + 90%|████████▉ | 5193/5772 [1:23:00<59:52, 6.21s/it] 90%|████████▉ | 5194/5772 [1:23:10<1:00:50, 6.32s/it] 90%|████████▉ | 5194/5772 [1:23:07<1:00:50, 6.32s/it] {'loss': 0.4671, 'learning_rate': 5.214928171806543e-07, 'epoch': 0.9} + 90%|████████▉ | 5194/5772 [1:23:07<1:00:50, 6.32s/it] {'loss': 0.4671, 'learning_rate': 5.214928171806543e-07, 'epoch': 0.9} + 90%|████████▉ | 5194/5772 [1:23:10<1:00:50, 6.32s/it] 90%|█████████ | 5195/5772 [1:23:16<1:00:07, 6.25s/it] 90%|█████████ | 5195/5772 [1:23:13<1:00:07, 6.25s/it] {'loss': 0.4451, 'learning_rate': 5.197056861845284e-07, 'epoch': 0.9} + 90%|█████████ | 5195/5772 [1:23:16<1:00:07, 6.25s/it] {'loss': 0.4451, 'learning_rate': 5.197056861845284e-07, 'epoch': 0.9} + 90%|█████████ | 5195/5772 [1:23:13<1:00:07, 6.25s/it] 90%|█████████ | 5196/5772 [1:23:22<1:00:06, 6.26s/it] 90%|█████████ | 5196/5772 [1:23:19<1:00:06, 6.26s/it] {'loss': 0.4566, 'learning_rate': 5.179215409543848e-07, 'epoch': 0.9} + 90%|█████████ | 5196/5772 [1:23:22<1:00:06, 6.26s/it] {'loss': 0.4566, 'learning_rate': 5.179215409543848e-07, 'epoch': 0.9} + 90%|█████████ | 5196/5772 [1:23:19<1:00:06, 6.26s/it] 90%|█████████ | 5197/5772 [1:23:28<59:33, 6.21s/it] 90%|█████████ | 5197/5772 [1:23:25<59:33, 6.21s/it] {'loss': 0.453, 'learning_rate': 5.161403820521305e-07, 'epoch': 0.9} + 90%|█████████ | 5197/5772 [1:23:28<59:33, 6.21s/it] {'loss': 0.453, 'learning_rate': 5.161403820521305e-07, 'epoch': 0.9} + 90%|█████████ | 5197/5772 [1:23:25<59:33, 6.21s/it] 90%|█████████ | 5198/5772 [1:23:35<59:42, 6.24s/it] 90%|█████████ | 5198/5772 [1:23:32<59:42, 6.24s/it] {'loss': 0.4703, 'learning_rate': 5.143622100387336e-07, 'epoch': 0.9} + 90%|█████████ | 5198/5772 [1:23:35<59:42, 6.24s/it] {'loss': 0.4703, 'learning_rate': 5.143622100387336e-07, 'epoch': 0.9} + 90%|█████████ | 5198/5772 [1:23:32<59:42, 6.24s/it] 90%|█████████ | 5199/5772 [1:23:41<59:11, 6.20s/it] 90%|█████████ | 5199/5772 [1:23:38<59:11, 6.20s/it] {'loss': 0.4572, 'learning_rate': 5.125870254742182e-07, 'epoch': 0.9} + 90%|█████████ | 5199/5772 [1:23:41<59:11, 6.20s/it] {'loss': 0.4572, 'learning_rate': 5.125870254742182e-07, 'epoch': 0.9} + 90%|█████████ | 5199/5772 [1:23:38<59:11, 6.20s/it]10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 90%|█████████ | 5200/5772 [1:23:47<58:58, 6.19s/it]5 AutoResumeHook: Checking whether to suspend... +7 1211 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 13 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +0 6 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 90%|█████████ | 5200/5772 [1:23:44<58:59, 6.19s/it] {'loss': 0.4512, 'learning_rate': 5.108148289176685e-07, 'epoch': 0.9} + 90%|█████████ | 5200/5772 [1:23:47<58:58, 6.19s/it] {'loss': 0.4512, 'learning_rate': 5.108148289176685e-07, 'epoch': 0.9} + 90%|█████████ | 5200/5772 [1:23:44<58:59, 6.19s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 90%|█████████ | 5201/5772 [1:24:09<1:54:12, 12.00s/it] 90%|█████████ | 5201/5772 [1:24:13<1:54:12, 12.00s/it] {'loss': 0.4503, 'learning_rate': 5.090456209272276e-07, 'epoch': 0.9} + 90%|█████████ | 5201/5772 [1:24:13<1:54:12, 12.00s/it] {'loss': 0.4503, 'learning_rate': 5.090456209272276e-07, 'epoch': 0.9} + 90%|█████████ | 5201/5772 [1:24:09<1:54:12, 12.00s/it] 90%|█████████ | 5202/5772 [1:24:16<1:37:25, 10.26s/it] 90%|█████████ | 5202/5772 [1:24:19<1:37:25, 10.26s/it] {'loss': 0.4579, 'learning_rate': 5.07279402060099e-07, 'epoch': 0.9} + 90%|█████████ | 5202/5772 [1:24:19<1:37:25, 10.26s/it] {'loss': 0.4579, 'learning_rate': 5.07279402060099e-07, 'epoch': 0.9} + 90%|█████████ | 5202/5772 [1:24:16<1:37:25, 10.26s/it] 90%|█████████ | 5203/5772 [1:24:25<1:25:01, 8.97s/it] 90%|█████████ | 5203/5772 [1:24:22<1:25:01, 8.97s/it] {'loss': 0.4633, 'learning_rate': 5.055161728725433e-07, 'epoch': 0.9} + 90%|█████████ | 5203/5772 [1:24:25<1:25:01, 8.97s/it] {'loss': 0.4633, 'learning_rate': 5.055161728725433e-07, 'epoch': 0.9} + 90%|█████████ | 5203/5772 [1:24:22<1:25:01, 8.97s/it] 90%|█████████ | 5204/5772 [1:24:28<1:16:53, 8.12s/it] 90%|█████████ | 5204/5772 [1:24:31<1:16:53, 8.12s/it] {'loss': 0.4608, 'learning_rate': 5.037559339198805e-07, 'epoch': 0.9} + 90%|█████████ | 5204/5772 [1:24:31<1:16:53, 8.12s/it] {'loss': 0.4608, 'learning_rate': 5.037559339198805e-07, 'epoch': 0.9} + 90%|█████████ | 5204/5772 [1:24:28<1:16:53, 8.12s/it] 90%|█████████ | 5205/5772 [1:24:37<1:11:16, 7.54s/it] 90%|█████████ | 5205/5772 [1:24:34<1:11:16, 7.54s/it] {'loss': 0.4749, 'learning_rate': 5.01998685756484e-07, 'epoch': 0.9} + 90%|█████████ | 5205/5772 [1:24:37<1:11:16, 7.54s/it] {'loss': 0.4749, 'learning_rate': 5.01998685756484e-07, 'epoch': 0.9} + 90%|█████████ | 5205/5772 [1:24:34<1:11:16, 7.54s/it] 90%|█████████ | 5206/5772 [1:24:40<1:07:04, 7.11s/it] 90%|█████████ | 5206/5772 [1:24:43<1:07:04, 7.11s/it] {'loss': 0.4498, 'learning_rate': 5.002444289357955e-07, 'epoch': 0.9} + 90%|█████████ | 5206/5772 [1:24:43<1:07:04, 7.11s/it] {'loss': 0.4498, 'learning_rate': 5.002444289357955e-07, 'epoch': 0.9} + 90%|█████████ | 5206/5772 [1:24:40<1:07:04, 7.11s/it] 90%|█████████ | 5207/5772 [1:24:46<1:03:02, 6.69s/it] 90%|█████████ | 5207/5772 [1:24:49<1:03:02, 6.69s/it] {'loss': 0.4603, 'learning_rate': 4.984931640103041e-07, 'epoch': 0.9} + 90%|█████████ | 5207/5772 [1:24:49<1:03:02, 6.69s/it] {'loss': 0.4603, 'learning_rate': 4.984931640103041e-07, 'epoch': 0.9} + 90%|█████████ | 5207/5772 [1:24:46<1:03:02, 6.69s/it] 90%|█████████ | 5208/5772 [1:24:52<1:01:50, 6.58s/it] 90%|█████████ | 5208/5772 [1:24:55<1:01:50, 6.58s/it] {'loss': 0.4564, 'learning_rate': 4.967448915315609e-07, 'epoch': 0.9} + 90%|█████████ | 5208/5772 [1:24:55<1:01:50, 6.58s/it] {'loss': 0.4564, 'learning_rate': 4.967448915315609e-07, 'epoch': 0.9} + 90%|█████████ | 5208/5772 [1:24:52<1:01:50, 6.58s/it] 90%|█████████ | 5209/5772 [1:24:58<1:00:36, 6.46s/it] 90%|█████████ | 5209/5772 [1:25:01<1:00:36, 6.46s/it] {'loss': 0.4628, 'learning_rate': 4.949996120501765e-07, 'epoch': 0.9} + 90%|█████████ | 5209/5772 [1:25:01<1:00:36, 6.46s/it] {'loss': 0.4628, 'learning_rate': 4.949996120501765e-07, 'epoch': 0.9} + 90%|█████████ | 5209/5772 [1:24:58<1:00:36, 6.46s/it] 90%|█████████ | 5210/5772 [1:25:08<59:46, 6.38s/it] 90%|█████████ | 5210/5772 [1:25:04<59:46, 6.38s/it] {'loss': 0.4512, 'learning_rate': 4.932573261158169e-07, 'epoch': 0.9} + 90%|█████████ | 5210/5772 [1:25:08<59:46, 6.38s/it] {'loss': 0.4512, 'learning_rate': 4.932573261158169e-07, 'epoch': 0.9} + 90%|█████████ | 5210/5772 [1:25:04<59:46, 6.38s/it] 90%|█████████ | 5211/5772 [1:25:11<59:06, 6.32s/it] 90%|█████████ | 5211/5772 [1:25:14<59:06, 6.32s/it] {'loss': 0.4598, 'learning_rate': 4.915180342772053e-07, 'epoch': 0.9} + 90%|█████████ | 5211/5772 [1:25:14<59:06, 6.32s/it] {'loss': 0.4598, 'learning_rate': 4.915180342772053e-07, 'epoch': 0.9} + 90%|█████████ | 5211/5772 [1:25:11<59:06, 6.32s/it] 90%|█████████ | 5212/5772 [1:25:17<58:00, 6.22s/it] 90%|█████████ | 5212/5772 [1:25:20<58:00, 6.22s/it] {'loss': 0.4577, 'learning_rate': 4.89781737082119e-07, 'epoch': 0.9} + 90%|█████████ | 5212/5772 [1:25:20<58:00, 6.22s/it] {'loss': 0.4577, 'learning_rate': 4.89781737082119e-07, 'epoch': 0.9} + 90%|█████████ | 5212/5772 [1:25:17<58:00, 6.22s/it] 90%|█████████ | 5213/5772 [1:25:23<57:42, 6.19s/it] 90%|█████████ | 5213/5772 [1:25:26<57:42, 6.19s/it] {'loss': 0.4422, 'learning_rate': 4.880484350774007e-07, 'epoch': 0.9} + 90%|█████████ | 5213/5772 [1:25:26<57:42, 6.19s/it] {'loss': 0.4422, 'learning_rate': 4.880484350774007e-07, 'epoch': 0.9} + 90%|█████████ | 5213/5772 [1:25:23<57:42, 6.19s/it] 90%|█████████ | 5214/5772 [1:25:29<57:48, 6.22s/it] 90%|█████████ | 5214/5772 [1:25:32<57:48, 6.22s/it] {'loss': 0.4424, 'learning_rate': 4.863181288089391e-07, 'epoch': 0.9} + 90%|█████████ | 5214/5772 [1:25:32<57:48, 6.22s/it] {'loss': 0.4424, 'learning_rate': 4.863181288089391e-07, 'epoch': 0.9} + 90%|█████████ | 5214/5772 [1:25:29<57:48, 6.22s/it] 90%|█████████ | 5215/5772 [1:25:35<57:16, 6.17s/it] 90%|█████████ | 5215/5772 [1:25:38<57:16, 6.17s/it] {'loss': 0.4558, 'learning_rate': 4.845908188216874e-07, 'epoch': 0.9} + 90%|█████████ | 5215/5772 [1:25:38<57:16, 6.17s/it]{'loss': 0.4558, 'learning_rate': 4.845908188216874e-07, 'epoch': 0.9} + 90%|█████████ | 5215/5772 [1:25:35<57:16, 6.17s/it] 90%|█████████ | 5216/5772 [1:25:44<56:26, 6.09s/it] {'loss': 0.4684, 'learning_rate': 4.828665056596504e-07, 'epoch': 0.9} + 90%|█████████ | 5216/5772 [1:25:44<56:26, 6.09s/it] 90%|█████████ | 5216/5772 [1:25:41<56:26, 6.09s/it] {'loss': 0.4684, 'learning_rate': 4.828665056596504e-07, 'epoch': 0.9} + 90%|█████████ | 5216/5772 [1:25:41<56:26, 6.09s/it] 90%|█████████ | 5217/5772 [1:25:47<55:55, 6.05s/it] 90%|█████████ | 5217/5772 [1:25:50<55:55, 6.05s/it] {'loss': 0.4609, 'learning_rate': 4.811451898658925e-07, 'epoch': 0.9} + 90%|█████████ | 5217/5772 [1:25:50<55:55, 6.05s/it] {'loss': 0.4609, 'learning_rate': 4.811451898658925e-07, 'epoch': 0.9} + 90%|█████████ | 5217/5772 [1:25:47<55:55, 6.05s/it] 90%|█████████ | 5218/5772 [1:25:53<56:38, 6.13s/it] 90%|█████████ | 5218/5772 [1:25:56<56:38, 6.13s/it] {'loss': 0.4519, 'learning_rate': 4.794268719825334e-07, 'epoch': 0.9} + 90%|█████████ | 5218/5772 [1:25:56<56:38, 6.13s/it] {'loss': 0.4519, 'learning_rate': 4.794268719825334e-07, 'epoch': 0.9} + 90%|█████████ | 5218/5772 [1:25:53<56:38, 6.13s/it] 90%|█████████ | 5219/5772 [1:25:59<56:00, 6.08s/it] 90%|█████████ | 5219/5772 [1:26:02<56:00, 6.08s/it] {'loss': 0.4669, 'learning_rate': 4.777115525507447e-07, 'epoch': 0.9} + 90%|█████████ | 5219/5772 [1:26:02<56:00, 6.08s/it] {'loss': 0.4669, 'learning_rate': 4.777115525507447e-07, 'epoch': 0.9} + 90%|█████████ | 5219/5772 [1:25:59<56:00, 6.08s/it] 90%|█████████ | 5220/5772 [1:26:05<56:10, 6.11s/it] 90%|█████████ | 5220/5772 [1:26:08<56:10, 6.11s/it] {'loss': 0.453, 'learning_rate': 4.759992321107587e-07, 'epoch': 0.9} + 90%|█████████ | 5220/5772 [1:26:08<56:10, 6.11s/it] {'loss': 0.453, 'learning_rate': 4.759992321107587e-07, 'epoch': 0.9} + 90%|█████████ | 5220/5772 [1:26:05<56:10, 6.11s/it] 90%|█████████ | 5221/5772 [1:26:11<55:07, 6.00s/it] 90%|█████████ | 5221/5772 [1:26:14<55:07, 6.00s/it] {'loss': 0.4488, 'learning_rate': 4.7428991120186065e-07, 'epoch': 0.9} + 90%|█████████ | 5221/5772 [1:26:14<55:07, 6.00s/it] {'loss': 0.4488, 'learning_rate': 4.7428991120186065e-07, 'epoch': 0.9} + 90%|█████████ | 5221/5772 [1:26:11<55:07, 6.00s/it] 90%|█████████ | 5222/5772 [1:26:17<55:05, 6.01s/it] 90%|█████████ | 5222/5772 [1:26:20<55:05, 6.01s/it] {'loss': 0.4633, 'learning_rate': 4.725835903623921e-07, 'epoch': 0.9} + 90%|█████████ | 5222/5772 [1:26:20<55:05, 6.01s/it] {'loss': 0.4633, 'learning_rate': 4.725835903623921e-07, 'epoch': 0.9} + 90%|█████████ | 5222/5772 [1:26:17<55:05, 6.01s/it] 90%|█████████ | 5223/5772 [1:26:23<55:25, 6.06s/it] 90%|█████████ | 5223/5772 [1:26:26<55:25, 6.06s/it] {'loss': 0.4644, 'learning_rate': 4.708802701297499e-07, 'epoch': 0.9} + 90%|█████████ | 5223/5772 [1:26:26<55:25, 6.06s/it] {'loss': 0.4644, 'learning_rate': 4.708802701297499e-07, 'epoch': 0.9} + 90%|█████████ | 5223/5772 [1:26:23<55:25, 6.06s/it] 91%|█████████ | 5224/5772 [1:26:29<54:57, 6.02s/it] 91%|█████████ | 5224/5772 [1:26:32<54:57, 6.02s/it]{'loss': 0.4636, 'learning_rate': 4.6917995104038384e-07, 'epoch': 0.9} + {'loss': 0.4636, 'learning_rate': 4.6917995104038384e-07, 'epoch': 0.9} + 91%|█████████ | 5224/5772 [1:26:32<54:57, 6.02s/it] 91%|█████████ | 5224/5772 [1:26:29<54:57, 6.02s/it] 91%|█████████ | 5225/5772 [1:26:35<55:02, 6.04s/it] 91%|█████████ | 5225/5772 [1:26:38<55:02, 6.04s/it] {'loss': 0.466, 'learning_rate': 4.6748263362980105e-07, 'epoch': 0.91} + 91%|█████████ | 5225/5772 [1:26:38<55:02, 6.04s/it] {'loss': 0.466, 'learning_rate': 4.6748263362980105e-07, 'epoch': 0.91} + 91%|█████████ | 5225/5772 [1:26:35<55:02, 6.04s/it] 91%|█████████ | 5226/5772 [1:26:41<55:12, 6.07s/it] 91%|█████████ | 5226/5772 [1:26:45<55:12, 6.07s/it] {'loss': 0.4613, 'learning_rate': 4.6578831843256176e-07, 'epoch': 0.91} + 91%|█████████ | 5226/5772 [1:26:45<55:12, 6.07s/it] {'loss': 0.4613, 'learning_rate': 4.6578831843256176e-07, 'epoch': 0.91} + 91%|█████████ | 5226/5772 [1:26:41<55:12, 6.07s/it] 91%|█████████ | 5227/5772 [1:26:50<54:30, 6.00s/it] 91%|█████████ | 5227/5772 [1:26:47<54:30, 6.00s/it] {'loss': 0.4511, 'learning_rate': 4.6409700598228025e-07, 'epoch': 0.91} + 91%|█████████ | 5227/5772 [1:26:50<54:30, 6.00s/it] {'loss': 0.4511, 'learning_rate': 4.6409700598228025e-07, 'epoch': 0.91} + 91%|█████████ | 5227/5772 [1:26:47<54:30, 6.00s/it] 91%|█████████ | 5228/5772 [1:26:56<54:13, 5.98s/it] 91%|█████████ | 5228/5772 [1:26:53<54:13, 5.98s/it] {'loss': 0.453, 'learning_rate': 4.6240869681162814e-07, 'epoch': 0.91} + 91%|█████████ | 5228/5772 [1:26:56<54:13, 5.98s/it] {'loss': 0.453, 'learning_rate': 4.6240869681162814e-07, 'epoch': 0.91} + 91%|█████████ | 5228/5772 [1:26:53<54:13, 5.98s/it] 91%|█████████ | 5229/5772 [1:26:59<54:14, 5.99s/it] 91%|█████████ | 5229/5772 [1:27:02<54:14, 5.99s/it] {'loss': 0.4489, 'learning_rate': 4.607233914523268e-07, 'epoch': 0.91} + 91%|█████████ | 5229/5772 [1:27:02<54:14, 5.99s/it] {'loss': 0.4489, 'learning_rate': 4.607233914523268e-07, 'epoch': 0.91} + 91%|█████████ | 5229/5772 [1:26:59<54:14, 5.99s/it] 91%|█████████ | 5230/5772 [1:27:05<53:25, 5.91s/it] 91%|█████████ | 5230/5772 [1:27:08<53:25, 5.91s/it] {'loss': 0.4446, 'learning_rate': 4.590410904351561e-07, 'epoch': 0.91} + 91%|█████████ | 5230/5772 [1:27:08<53:25, 5.91s/it] {'loss': 0.4446, 'learning_rate': 4.590410904351561e-07, 'epoch': 0.91} + 91%|█████████ | 5230/5772 [1:27:05<53:25, 5.91s/it] 91%|█████████ | 5231/5772 [1:27:11<53:01, 5.88s/it] 91%|█████████ | 5231/5772 [1:27:14<53:01, 5.88s/it] {'loss': 0.4639, 'learning_rate': 4.573617942899433e-07, 'epoch': 0.91} + 91%|█████████ | 5231/5772 [1:27:14<53:01, 5.88s/it] {'loss': 0.4639, 'learning_rate': 4.573617942899433e-07, 'epoch': 0.91} + 91%|█████████ | 5231/5772 [1:27:11<53:01, 5.88s/it] 91%|█████████ | 5232/5772 [1:27:17<52:53, 5.88s/it] 91%|█████████ | 5232/5772 [1:27:20<52:53, 5.88s/it] {'loss': 0.4521, 'learning_rate': 4.556855035455787e-07, 'epoch': 0.91} +{'loss': 0.4521, 'learning_rate': 4.556855035455787e-07, 'epoch': 0.91} + 91%|█████████ | 5232/5772 [1:27:20<52:53, 5.88s/it] 91%|█████████ | 5232/5772 [1:27:17<52:53, 5.88s/it] 91%|█████████ | 5233/5772 [1:27:23<53:04, 5.91s/it] 91%|█████████ | 5233/5772 [1:27:26<53:04, 5.91s/it] {'loss': 0.4636, 'learning_rate': 4.540122187299978e-07, 'epoch': 0.91} + 91%|█████████ | 5233/5772 [1:27:23<53:04, 5.91s/it]{'loss': 0.4636, 'learning_rate': 4.540122187299978e-07, 'epoch': 0.91} + 91%|█████████ | 5233/5772 [1:27:26<53:04, 5.91s/it] 91%|█████████ | 5234/5772 [1:27:29<53:14, 5.94s/it] 91%|█████████ | 5234/5772 [1:27:32<53:14, 5.94s/it] {'loss': 0.4583, 'learning_rate': 4.523419403701923e-07, 'epoch': 0.91} + 91%|█████████ | 5234/5772 [1:27:32<53:14, 5.94s/it] {'loss': 0.4583, 'learning_rate': 4.523419403701923e-07, 'epoch': 0.91} + 91%|█████████ | 5234/5772 [1:27:29<53:14, 5.94s/it] 91%|█████████ | 5235/5772 [1:27:35<53:56, 6.03s/it] 91%|█████████ | 5235/5772 [1:27:38<53:56, 6.03s/it] {'loss': 0.4539, 'learning_rate': 4.5067466899220703e-07, 'epoch': 0.91} + 91%|█████████ | 5235/5772 [1:27:38<53:56, 6.03s/it] {'loss': 0.4539, 'learning_rate': 4.5067466899220703e-07, 'epoch': 0.91} + 91%|█████████ | 5235/5772 [1:27:35<53:56, 6.03s/it] 91%|█████████ | 5236/5772 [1:27:41<54:07, 6.06s/it] 91%|█████████ | 5236/5772 [1:27:44<54:07, 6.06s/it] {'loss': 0.4485, 'learning_rate': 4.490104051211408e-07, 'epoch': 0.91} + 91%|█████████ | 5236/5772 [1:27:44<54:07, 6.06s/it] {'loss': 0.4485, 'learning_rate': 4.490104051211408e-07, 'epoch': 0.91} + 91%|█████████ | 5236/5772 [1:27:41<54:07, 6.06s/it] 91%|█████████ | 5237/5772 [1:27:47<54:05, 6.07s/it] 91%|█████████ | 5237/5772 [1:27:50<54:05, 6.07s/it] {'loss': 0.447, 'learning_rate': 4.4734914928114435e-07, 'epoch': 0.91} + 91%|█████████ | 5237/5772 [1:27:50<54:05, 6.07s/it] {'loss': 0.447, 'learning_rate': 4.4734914928114435e-07, 'epoch': 0.91} + 91%|█████████ | 5237/5772 [1:27:47<54:05, 6.07s/it] 91%|█████████ | 5238/5772 [1:27:53<53:29, 6.01s/it] 91%|█████████ | 5238/5772 [1:27:56<53:29, 6.01s/it] {'loss': 0.4452, 'learning_rate': 4.456909019954181e-07, 'epoch': 0.91} + 91%|█████████ | 5238/5772 [1:27:56<53:29, 6.01s/it] {'loss': 0.4452, 'learning_rate': 4.456909019954181e-07, 'epoch': 0.91} + 91%|█████████ | 5238/5772 [1:27:53<53:29, 6.01s/it] 91%|█████████ | 5239/5772 [1:27:59<54:06, 6.09s/it] 91%|█████████ | 5239/5772 [1:28:02<54:06, 6.09s/it] {'loss': 0.4675, 'learning_rate': 4.440356637862231e-07, 'epoch': 0.91} + 91%|█████████ | 5239/5772 [1:28:02<54:06, 6.09s/it] {'loss': 0.4675, 'learning_rate': 4.440356637862231e-07, 'epoch': 0.91} + 91%|█████████ | 5239/5772 [1:27:59<54:06, 6.09s/it] 91%|█████████ | 5240/5772 [1:28:05<53:22, 6.02s/it] 91%|█████████ | 5240/5772 [1:28:08<53:22, 6.02s/it] {'loss': 0.4524, 'learning_rate': 4.4238343517486237e-07, 'epoch': 0.91} + 91%|█████████ | 5240/5772 [1:28:08<53:22, 6.02s/it] {'loss': 0.4524, 'learning_rate': 4.4238343517486237e-07, 'epoch': 0.91} + 91%|█████████ | 5240/5772 [1:28:05<53:22, 6.02s/it] 91%|█████████ | 5241/5772 [1:28:11<53:22, 6.03s/it] 91%|█████████ | 5241/5772 [1:28:14<53:22, 6.03s/it]{'loss': 0.4583, 'learning_rate': 4.407342166816997e-07, 'epoch': 0.91} + {'loss': 0.4583, 'learning_rate': 4.407342166816997e-07, 'epoch': 0.91} + 91%|█████████ | 5241/5772 [1:28:14<53:22, 6.03s/it] 91%|█████████ | 5241/5772 [1:28:11<53:22, 6.03s/it] 91%|█████████ | 5242/5772 [1:28:17<53:33, 6.06s/it] 91%|█████████ | 5242/5772 [1:28:20<53:33, 6.06s/it]{'loss': 0.4549, 'learning_rate': 4.3908800882614397e-07, 'epoch': 0.91} + {'loss': 0.4549, 'learning_rate': 4.3908800882614397e-07, 'epoch': 0.91} + 91%|█████████ | 5242/5772 [1:28:20<53:33, 6.06s/it] 91%|█████████ | 5242/5772 [1:28:17<53:33, 6.06s/it] 91%|█████████ | 5243/5772 [1:28:23<53:24, 6.06s/it] 91%|█████████ | 5243/5772 [1:28:26<53:24, 6.06s/it] {'loss': 0.4564, 'learning_rate': 4.3744481212666167e-07, 'epoch': 0.91} + 91%|█████████ | 5243/5772 [1:28:26<53:24, 6.06s/it] {'loss': 0.4564, 'learning_rate': 4.3744481212666167e-07, 'epoch': 0.91} + 91%|█████████ | 5243/5772 [1:28:23<53:24, 6.06s/it] 91%|█████████ | 5244/5772 [1:28:30<53:52, 6.12s/it] 91%|█████████ | 5244/5772 [1:28:33<53:52, 6.12s/it] {'loss': 0.4557, 'learning_rate': 4.358046271007699e-07, 'epoch': 0.91} + 91%|█████████ | 5244/5772 [1:28:33<53:52, 6.12s/it] {'loss': 0.4557, 'learning_rate': 4.358046271007699e-07, 'epoch': 0.91} + 91%|█████████ | 5244/5772 [1:28:30<53:52, 6.12s/it] 91%|█████████ | 5245/5772 [1:28:39<54:42, 6.23s/it] 91%|█████████ | 5245/5772 [1:28:36<54:42, 6.23s/it] {'loss': 0.4698, 'learning_rate': 4.3416745426503095e-07, 'epoch': 0.91} + 91%|█████████ | 5245/5772 [1:28:39<54:42, 6.23s/it] {'loss': 0.4698, 'learning_rate': 4.3416745426503095e-07, 'epoch': 0.91} + 91%|█████████ | 5245/5772 [1:28:36<54:42, 6.23s/it] 91%|█████████ | 5246/5772 [1:28:42<54:45, 6.25s/it] 91%|█████████ | 5246/5772 [1:28:45<54:45, 6.25s/it] {'loss': 0.4578, 'learning_rate': 4.325332941350668e-07, 'epoch': 0.91} + 91%|█████████ | 5246/5772 [1:28:45<54:45, 6.25s/it] {'loss': 0.4578, 'learning_rate': 4.325332941350668e-07, 'epoch': 0.91} + 91%|█████████ | 5246/5772 [1:28:42<54:45, 6.25s/it] 91%|█████████ | 5247/5772 [1:28:48<53:53, 6.16s/it] 91%|█████████ | 5247/5772 [1:28:51<53:53, 6.16s/it] {'loss': 0.4563, 'learning_rate': 4.30902147225547e-07, 'epoch': 0.91} + 91%|█████████ | 5247/5772 [1:28:51<53:53, 6.16s/it] {'loss': 0.4563, 'learning_rate': 4.30902147225547e-07, 'epoch': 0.91} + 91%|█████████ | 5247/5772 [1:28:48<53:53, 6.16s/it] 91%|█████████ | 5248/5772 [1:28:58<54:13, 6.21s/it] 91%|█████████ | 5248/5772 [1:28:55<54:14, 6.21s/it] {'loss': 0.4425, 'learning_rate': 4.2927401405019166e-07, 'epoch': 0.91} + 91%|█████████ | 5248/5772 [1:28:58<54:13, 6.21s/it] {'loss': 0.4425, 'learning_rate': 4.2927401405019166e-07, 'epoch': 0.91} + 91%|█████████ | 5248/5772 [1:28:55<54:14, 6.21s/it] 91%|█████████ | 5249/5772 [1:29:01<54:07, 6.21s/it] 91%|█████████ | 5249/5772 [1:29:04<54:07, 6.21s/it] {'loss': 0.471, 'learning_rate': 4.276488951217705e-07, 'epoch': 0.91} +{'loss': 0.471, 'learning_rate': 4.276488951217705e-07, 'epoch': 0.91} + 91%|█████████ | 5249/5772 [1:29:04<54:07, 6.21s/it] 91%|█████████ | 5249/5772 [1:29:01<54:07, 6.21s/it]2 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10911 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + + 91%|█████████ | 5250/5772 [1:29:10<53:58, 6.20s/it]13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... + 91%|█████████ | 5250/5772 [1:29:07<53:59, 6.21s/it]6 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4542, 'learning_rate': 4.2602679095210766e-07, 'epoch': 0.91} + 91%|█████████ | 5250/5772 [1:29:10<53:58, 6.20s/it] {'loss': 0.4542, 'learning_rate': 4.2602679095210766e-07, 'epoch': 0.91} + 91%|█████████ | 5250/5772 [1:29:07<53:59, 6.21s/it] 91%|█████████ | 5251/5772 [1:29:13<52:37, 6.06s/it] 91%|█████████ | 5251/5772 [1:29:16<52:37, 6.06s/it] {'loss': 0.4553, 'learning_rate': 4.244077020520776e-07, 'epoch': 0.91} + 91%|█████████ | 5251/5772 [1:29:16<52:37, 6.06s/it] {'loss': 0.4553, 'learning_rate': 4.244077020520776e-07, 'epoch': 0.91} + 91%|█████████ | 5251/5772 [1:29:13<52:37, 6.06s/it] 91%|█████████ | 5252/5772 [1:29:19<53:13, 6.14s/it] 91%|█████████ | 5252/5772 [1:29:22<53:13, 6.14s/it] {'loss': 0.4466, 'learning_rate': 4.227916289316003e-07, 'epoch': 0.91} + 91%|█████████ | 5252/5772 [1:29:22<53:13, 6.14s/it] {'loss': 0.4466, 'learning_rate': 4.227916289316003e-07, 'epoch': 0.91} + 91%|█████████ | 5252/5772 [1:29:19<53:13, 6.14s/it] 91%|█████████ | 5253/5772 [1:29:28<53:10, 6.15s/it] 91%|█████████ | 5253/5772 [1:29:25<53:10, 6.15s/it] {'loss': 0.4621, 'learning_rate': 4.2117857209964863e-07, 'epoch': 0.91} + 91%|█████████ | 5253/5772 [1:29:28<53:10, 6.15s/it] {'loss': 0.4621, 'learning_rate': 4.2117857209964863e-07, 'epoch': 0.91} + 91%|█████████ | 5253/5772 [1:29:25<53:10, 6.15s/it] 91%|█████████ | 5254/5772 [1:29:32<53:53, 6.24s/it] 91%|█████████ | 5254/5772 [1:29:35<53:53, 6.24s/it] {'loss': 0.462, 'learning_rate': 4.195685320642484e-07, 'epoch': 0.91} + 91%|█████████ | 5254/5772 [1:29:32<53:53, 6.24s/it] {'loss': 0.462, 'learning_rate': 4.195685320642484e-07, 'epoch': 0.91} + 91%|█████████ | 5254/5772 [1:29:35<53:53, 6.24s/it] 91%|█████████ | 5255/5772 [1:29:38<52:55, 6.14s/it] 91%|█████████ | 5255/5772 [1:29:41<52:55, 6.14s/it] {'loss': 0.4546, 'learning_rate': 4.179615093324729e-07, 'epoch': 0.91}{'loss': 0.4546, 'learning_rate': 4.179615093324729e-07, 'epoch': 0.91} + 91%|█████████ | 5255/5772 [1:29:41<52:55, 6.14s/it] + 91%|█████████ | 5255/5772 [1:29:38<52:55, 6.14s/it] 91%|█████████ | 5256/5772 [1:29:44<52:37, 6.12s/it] 91%|█████████ | 5256/5772 [1:29:47<52:37, 6.12s/it] {'loss': 0.4504, 'learning_rate': 4.1635750441044067e-07, 'epoch': 0.91} + 91%|█████████ | 5256/5772 [1:29:47<52:37, 6.12s/it] {'loss': 0.4504, 'learning_rate': 4.1635750441044067e-07, 'epoch': 0.91} + 91%|█████████ | 5256/5772 [1:29:44<52:37, 6.12s/it] 91%|█████████ | 5257/5772 [1:29:50<52:50, 6.16s/it] 91%|█████████ | 5257/5772 [1:29:53<52:50, 6.16s/it] {'loss': 0.4651, 'learning_rate': 4.147565178033286e-07, 'epoch': 0.91} + 91%|█████████ | 5257/5772 [1:29:53<52:50, 6.16s/it] {'loss': 0.4651, 'learning_rate': 4.147565178033286e-07, 'epoch': 0.91} + 91%|█████████ | 5257/5772 [1:29:50<52:50, 6.16s/it] 91%|█████████ | 5258/5772 [1:29:56<53:11, 6.21s/it] 91%|█████████ | 5258/5772 [1:29:59<53:11, 6.21s/it] {'loss': 0.4507, 'learning_rate': 4.131585500153579e-07, 'epoch': 0.91} + 91%|█████████ | 5258/5772 [1:29:59<53:11, 6.21s/it] {'loss': 0.4507, 'learning_rate': 4.131585500153579e-07, 'epoch': 0.91} + 91%|█████████ | 5258/5772 [1:29:56<53:11, 6.21s/it] 91%|█████████ | 5259/5772 [1:30:02<52:26, 6.13s/it] 91%|█████████ | 5259/5772 [1:30:05<52:26, 6.13s/it] {'loss': 0.4504, 'learning_rate': 4.1156360154979813e-07, 'epoch': 0.91} + 91%|█████████ | 5259/5772 [1:30:05<52:26, 6.13s/it] {'loss': 0.4504, 'learning_rate': 4.1156360154979813e-07, 'epoch': 0.91} + 91%|█████████ | 5259/5772 [1:30:02<52:26, 6.13s/it] 91%|█████████ | 5260/5772 [1:30:09<52:43, 6.18s/it] 91%|█████████ | 5260/5772 [1:30:12<52:43, 6.18s/it] {'loss': 0.4499, 'learning_rate': 4.099716729089698e-07, 'epoch': 0.91} + 91%|█████████ | 5260/5772 [1:30:12<52:43, 6.18s/it] {'loss': 0.4499, 'learning_rate': 4.099716729089698e-07, 'epoch': 0.91} + 91%|█████████ | 5260/5772 [1:30:09<52:43, 6.18s/it] 91%|█████████ | 5261/5772 [1:30:15<52:06, 6.12s/it] 91%|█████████ | 5261/5772 [1:30:18<52:06, 6.12s/it] {'loss': 0.4594, 'learning_rate': 4.083827645942429e-07, 'epoch': 0.91} + 91%|█████████ | 5261/5772 [1:30:18<52:06, 6.12s/it] {'loss': 0.4594, 'learning_rate': 4.083827645942429e-07, 'epoch': 0.91} + 91%|█████████ | 5261/5772 [1:30:15<52:06, 6.12s/it] 91%|█████████ | 5262/5772 [1:30:20<51:33, 6.07s/it] 91%|█████████ | 5262/5772 [1:30:24<51:33, 6.07s/it] {'loss': 0.4668, 'learning_rate': 4.067968771060349e-07, 'epoch': 0.91} + 91%|█████████ | 5262/5772 [1:30:24<51:33, 6.07s/it] {'loss': 0.4668, 'learning_rate': 4.067968771060349e-07, 'epoch': 0.91} + 91%|█████████ | 5262/5772 [1:30:20<51:33, 6.07s/it] 91%|█████████ | 5263/5772 [1:30:29<51:02, 6.02s/it] 91%|█████████ | 5263/5772 [1:30:26<51:03, 6.02s/it] {'loss': 0.4552, 'learning_rate': 4.0521401094381186e-07, 'epoch': 0.91} + 91%|█████████ | 5263/5772 [1:30:29<51:02, 6.02s/it] {'loss': 0.4552, 'learning_rate': 4.0521401094381186e-07, 'epoch': 0.91} + 91%|█████████ | 5263/5772 [1:30:26<51:03, 6.02s/it] 91%|█████████ | 5264/5772 [1:30:33<51:11, 6.05s/it] 91%|█████████ | 5264/5772 [1:30:36<51:11, 6.05s/it] {'loss': 0.4517, 'learning_rate': 4.036341666060872e-07, 'epoch': 0.91} + 91%|█████████ | 5264/5772 [1:30:36<51:11, 6.05s/it] {'loss': 0.4517, 'learning_rate': 4.036341666060872e-07, 'epoch': 0.91} + 91%|█████████ | 5264/5772 [1:30:33<51:11, 6.05s/it] 91%|█████████ | 5265/5772 [1:30:38<50:51, 6.02s/it] 91%|█████████ | 5265/5772 [1:30:42<50:51, 6.02s/it] {'loss': 0.4649, 'learning_rate': 4.0205734459042854e-07, 'epoch': 0.91} + 91%|█████████ | 5265/5772 [1:30:42<50:51, 6.02s/it] {'loss': 0.4649, 'learning_rate': 4.0205734459042854e-07, 'epoch': 0.91} + 91%|█████████ | 5265/5772 [1:30:38<50:51, 6.02s/it] 91%|█████████ | 5266/5772 [1:30:47<50:15, 5.96s/it] 91%|█████████ | 5266/5772 [1:30:44<50:15, 5.96s/it] {'loss': 0.4472, 'learning_rate': 4.004835453934419e-07, 'epoch': 0.91} + 91%|█████████ | 5266/5772 [1:30:47<50:15, 5.96s/it] {'loss': 0.4472, 'learning_rate': 4.004835453934419e-07, 'epoch': 0.91} + 91%|█████████ | 5266/5772 [1:30:44<50:15, 5.96s/it] 91%|█████████▏| 5267/5772 [1:30:50<50:38, 6.02s/it] 91%|█████████▏| 5267/5772 [1:30:53<50:38, 6.02s/it] {'loss': 0.4725, 'learning_rate': 3.9891276951079083e-07, 'epoch': 0.91} + 91%|█████████▏| 5267/5772 [1:30:53<50:38, 6.02s/it] {'loss': 0.4725, 'learning_rate': 3.9891276951079083e-07, 'epoch': 0.91} + 91%|█████████▏| 5267/5772 [1:30:50<50:38, 6.02s/it] 91%|█████████▏| 5268/5772 [1:30:57<51:21, 6.11s/it] 91%|█████████▏| 5268/5772 [1:31:00<51:21, 6.11s/it] {'loss': 0.4478, 'learning_rate': 3.9734501743717956e-07, 'epoch': 0.91} + 91%|█████████▏| 5268/5772 [1:31:00<51:21, 6.11s/it] {'loss': 0.4478, 'learning_rate': 3.9734501743717956e-07, 'epoch': 0.91} + 91%|█████████▏| 5268/5772 [1:30:57<51:21, 6.11s/it] 91%|█████████▏| 5269/5772 [1:31:06<51:32, 6.15s/it] 91%|█████████▏| 5269/5772 [1:31:03<51:32, 6.15s/it] {'loss': 0.4535, 'learning_rate': 3.957802896663665e-07, 'epoch': 0.91} + 91%|█████████▏| 5269/5772 [1:31:06<51:32, 6.15s/it] {'loss': 0.4535, 'learning_rate': 3.957802896663665e-07, 'epoch': 0.91} + 91%|█████████▏| 5269/5772 [1:31:03<51:32, 6.15s/it] 91%|█████████▏| 5270/5772 [1:31:09<51:31, 6.16s/it] 91%|█████████▏| 5270/5772 [1:31:12<51:31, 6.16s/it] {'loss': 0.4643, 'learning_rate': 3.9421858669114966e-07, 'epoch': 0.91} + 91%|█████████▏| 5270/5772 [1:31:12<51:31, 6.16s/it] {'loss': 0.4643, 'learning_rate': 3.9421858669114966e-07, 'epoch': 0.91} + 91%|█████████▏| 5270/5772 [1:31:09<51:31, 6.16s/it] 91%|█████████▏| 5271/5772 [1:31:15<51:13, 6.13s/it] 91%|█████████▏| 5271/5772 [1:31:18<51:13, 6.13s/it] {'loss': 0.4498, 'learning_rate': 3.9265990900337893e-07, 'epoch': 0.91} + 91%|█████████▏| 5271/5772 [1:31:18<51:13, 6.13s/it] {'loss': 0.4498, 'learning_rate': 3.9265990900337893e-07, 'epoch': 0.91} + 91%|█████████▏| 5271/5772 [1:31:15<51:13, 6.13s/it] 91%|█████████▏| 5272/5772 [1:31:21<50:46, 6.09s/it] 91%|█████████▏| 5272/5772 [1:31:24<50:46, 6.09s/it] {'loss': 0.4652, 'learning_rate': 3.9110425709395606e-07, 'epoch': 0.91} + 91%|█████████▏| 5272/5772 [1:31:24<50:46, 6.09s/it] {'loss': 0.4652, 'learning_rate': 3.9110425709395606e-07, 'epoch': 0.91} + 91%|█████████▏| 5272/5772 [1:31:21<50:46, 6.09s/it] 91%|█████████▏| 5273/5772 [1:31:28<51:01, 6.14s/it] 91%|█████████▏| 5273/5772 [1:31:31<51:01, 6.14s/it] {'loss': 0.4634, 'learning_rate': 3.8955163145282024e-07, 'epoch': 0.91} + 91%|█████████▏| 5273/5772 [1:31:31<51:01, 6.14s/it] {'loss': 0.4634, 'learning_rate': 3.8955163145282024e-07, 'epoch': 0.91} + 91%|█████████▏| 5273/5772 [1:31:28<51:01, 6.14s/it] 91%|█████████▏| 5274/5772 [1:31:33<50:08, 6.04s/it] 91%|█████████▏| 5274/5772 [1:31:36<50:08, 6.04s/it] {'loss': 0.4447, 'learning_rate': 3.8800203256896483e-07, 'epoch': 0.91} + 91%|█████████▏| 5274/5772 [1:31:36<50:08, 6.04s/it] {'loss': 0.4447, 'learning_rate': 3.8800203256896483e-07, 'epoch': 0.91} + 91%|█████████▏| 5274/5772 [1:31:33<50:08, 6.04s/it] 91%|█████████▏| 5275/5772 [1:31:39<50:06, 6.05s/it] 91%|█████████▏| 5275/5772 [1:31:42<50:06, 6.05s/it] {'loss': 0.459, 'learning_rate': 3.8645546093042385e-07, 'epoch': 0.91} + 91%|█████████▏| 5275/5772 [1:31:42<50:06, 6.05s/it] {'loss': 0.459, 'learning_rate': 3.8645546093042385e-07, 'epoch': 0.91} + 91%|█████████▏| 5275/5772 [1:31:39<50:06, 6.05s/it] 91%|█████████▏| 5276/5772 [1:31:46<50:29, 6.11s/it] 91%|█████████▏| 5276/5772 [1:31:49<50:29, 6.11s/it] {'loss': 0.4555, 'learning_rate': 3.8491191702428654e-07, 'epoch': 0.91} + 91%|█████████▏| 5276/5772 [1:31:49<50:29, 6.11s/it] {'loss': 0.4555, 'learning_rate': 3.8491191702428654e-07, 'epoch': 0.91} + 91%|█████████▏| 5276/5772 [1:31:46<50:29, 6.11s/it] 91%|█████████▏| 5277/5772 [1:31:52<50:44, 6.15s/it] 91%|█████████▏| 5277/5772 [1:31:55<50:44, 6.15s/it] {'loss': 0.4627, 'learning_rate': 3.833714013366796e-07, 'epoch': 0.91} + 91%|█████████▏| 5277/5772 [1:31:55<50:44, 6.15s/it] {'loss': 0.4627, 'learning_rate': 3.833714013366796e-07, 'epoch': 0.91} + 91%|█████████▏| 5277/5772 [1:31:52<50:44, 6.15s/it] 91%|█████████▏| 5278/5772 [1:31:58<49:48, 6.05s/it] 91%|█████████▏| 5278/5772 [1:32:01<49:48, 6.05s/it] {'loss': 0.4461, 'learning_rate': 3.8183391435278163e-07, 'epoch': 0.91} + 91%|█████████▏| 5278/5772 [1:32:01<49:48, 6.05s/it] {'loss': 0.4461, 'learning_rate': 3.8183391435278163e-07, 'epoch': 0.91} + 91%|█████████▏| 5278/5772 [1:31:58<49:48, 6.05s/it] 91%|█████████▏| 5279/5772 [1:32:04<49:24, 6.01s/it] 91%|█████████▏| 5279/5772 [1:32:07<49:24, 6.01s/it] {'loss': 0.4504, 'learning_rate': 3.802994565568141e-07, 'epoch': 0.91} + 91%|█████████▏| 5279/5772 [1:32:07<49:24, 6.01s/it] {'loss': 0.4504, 'learning_rate': 3.802994565568141e-07, 'epoch': 0.91} + 91%|█████████▏| 5279/5772 [1:32:04<49:24, 6.01s/it] 91%|█████████▏| 5280/5772 [1:32:10<50:01, 6.10s/it] 91%|█████████▏| 5280/5772 [1:32:13<50:01, 6.10s/it] {'loss': 0.461, 'learning_rate': 3.787680284320472e-07, 'epoch': 0.91} + 91%|█████████▏| 5280/5772 [1:32:13<50:01, 6.10s/it] {'loss': 0.461, 'learning_rate': 3.787680284320472e-07, 'epoch': 0.91} + 91%|█████████▏| 5280/5772 [1:32:10<50:01, 6.10s/it] 91%|█████████▏| 5281/5772 [1:32:16<49:39, 6.07s/it] 91%|█████████▏| 5281/5772 [1:32:19<49:39, 6.07s/it] {'loss': 0.4498, 'learning_rate': 3.7723963046079724e-07, 'epoch': 0.91} + 91%|█████████▏| 5281/5772 [1:32:19<49:39, 6.07s/it] {'loss': 0.4498, 'learning_rate': 3.7723963046079724e-07, 'epoch': 0.91} + 91%|█████████▏| 5281/5772 [1:32:16<49:39, 6.07s/it] 92%|█████████▏| 5282/5772 [1:32:22<50:10, 6.14s/it] 92%|█████████▏| 5282/5772 [1:32:25<50:10, 6.14s/it] {'loss': 0.4495, 'learning_rate': 3.757142631244204e-07, 'epoch': 0.92} + 92%|█████████▏| 5282/5772 [1:32:25<50:10, 6.14s/it] {'loss': 0.4495, 'learning_rate': 3.757142631244204e-07, 'epoch': 0.92} + 92%|█████████▏| 5282/5772 [1:32:22<50:10, 6.14s/it] 92%|█████████▏| 5283/5772 [1:32:32<50:42, 6.22s/it] 92%|█████████▏| 5283/5772 [1:32:29<50:42, 6.22s/it] {'loss': 0.4553, 'learning_rate': 3.7419192690332786e-07, 'epoch': 0.92} + 92%|█████████▏| 5283/5772 [1:32:32<50:42, 6.22s/it] {'loss': 0.4553, 'learning_rate': 3.7419192690332786e-07, 'epoch': 0.92} + 92%|█████████▏| 5283/5772 [1:32:29<50:42, 6.22s/it] 92%|█████████▏| 5284/5772 [1:32:35<50:32, 6.21s/it] 92%|█████████▏| 5284/5772 [1:32:38<50:32, 6.22s/it] {'loss': 0.4456, 'learning_rate': 3.726726222769672e-07, 'epoch': 0.92} + 92%|█████████▏| 5284/5772 [1:32:38<50:32, 6.22s/it] {'loss': 0.4456, 'learning_rate': 3.726726222769672e-07, 'epoch': 0.92} + 92%|█████████▏| 5284/5772 [1:32:35<50:32, 6.21s/it] 92%|█████████▏| 5285/5772 [1:32:44<50:09, 6.18s/it] 92%|█████████▏| 5285/5772 [1:32:41<50:10, 6.18s/it] {'loss': 0.4597, 'learning_rate': 3.7115634972383464e-07, 'epoch': 0.92} + 92%|█████████▏| 5285/5772 [1:32:44<50:09, 6.18s/it] {'loss': 0.4597, 'learning_rate': 3.7115634972383464e-07, 'epoch': 0.92} + 92%|█████████▏| 5285/5772 [1:32:41<50:10, 6.18s/it] 92%|█████████▏| 5286/5772 [1:32:48<51:00, 6.30s/it] 92%|█████████▏| 5286/5772 [1:32:51<51:00, 6.30s/it] {'loss': 0.4608, 'learning_rate': 3.696431097214748e-07, 'epoch': 0.92} + 92%|█████████▏| 5286/5772 [1:32:51<51:00, 6.30s/it] {'loss': 0.4608, 'learning_rate': 3.696431097214748e-07, 'epoch': 0.92} + 92%|█████████▏| 5286/5772 [1:32:48<51:00, 6.30s/it] 92%|█████████▏| 5287/5772 [1:32:54<51:02, 6.32s/it] 92%|█████████▏| 5287/5772 [1:32:57<51:02, 6.31s/it] {'loss': 0.4527, 'learning_rate': 3.6813290274647197e-07, 'epoch': 0.92} + 92%|█████████▏| 5287/5772 [1:32:57<51:02, 6.31s/it] {'loss': 0.4527, 'learning_rate': 3.6813290274647197e-07, 'epoch': 0.92} + 92%|█████████▏| 5287/5772 [1:32:54<51:02, 6.32s/it]WARNING: tokenization mismatch: 1 vs. 1440. [[{'from': 'human', 'value': 'ASSISTANT: Du bist ein Personalmarketing-Experte mit den folgenden Kenntnissen und Eigenschaften.\n\nKENNTNISSE: Personalmarketing, Social Media Marketing, Facebook-Marketing, Kampagnenentwicklung, Zielgruppenanalyse, Content-Strategie, Werbeanzeigen, Performance-Messung, Personalgewinnung, Steuerfachangestellte.\n\nEIGENSCHAFTEN: Kreativ, strategisch, analytisch, kommunikativ, zielorientiert, detailverliebt, teamorientiert.\n\nAUFGABE: Entwickle eine erfolgreiche Personalmarketing-Strategie für die Gewinnung von Steuerfachangestellten über Facebook. Erstelle eine Tabelle mit allen Maßnahmen und weise jeder Maßnahme den zuständigen Spezialisten in einer Marketingagentur zu. Achte dabei auf Zielgruppenanalyse, Content-Strategie, Werbeanzeigen und Performance-Messung.\n\nOUTPUT: Eine Tabelle mit allen Maßnahmen für eine erfolgreiche Personalmarketing-Kampagne zur Gewinnung von Steuerfachangestellten auf Facebook, die die Verantwortlichkeiten der Spezialisten in einer Marketingagentur aufzeigt, einschließlich Zielgruppenanalyse, Content-Strategie, Werbeanzeigen und Performance-Messung.'}, {'from': 'gpt', 'value': '| Maßnahme | Beschreibung | Zuständiger Spezialist |\n| --- | --- | --- |\n| Zielgruppenanalyse | Analyse der demografischen Daten, Interessen und Verhaltensweisen der potenziellen Bewerber. | Marktanalyst |\n| Erstellung von Personas | Entwicklung von fiktiven, repräsentativen Bewerberprofilen zur besseren Zielgruppenansprache. | Personalmarketing-Experte |\n| Content-Strategie | Planung von relevantem, ansprechendem und zielgruppenspezifischem Content. | Content-Stratege |\n| Redaktionsplan | Erstellung eines Zeitplans für die Veröffentlichung von Inhalten auf Facebook. | Social Media Manager |\n| Erstellung von Werbeanzeigen | Design und Texten von ansprechenden, zielgruppenspezifischen Facebook-Werbeanzeigen. | Art Director & Copywriter |\n| Auswahl von Targeting-Optionen | Festlegung der Zielgruppenparameter für die Facebook-Werbekampagnen. | Facebook-Marketing-Experte |\n| Budgetierung und Zeitplanung | Festlegung des Budgets und der Laufzeit für die Facebook-Werbekampagnen. | Media Planner |\n| Erstellung von Landing Pages | Gestaltung und Implementierung von spezifischen Landing Pages für die Bewerbung. | Web-Designer |\n| Performance-Messung | Überwachung und Analyse der Kampagnenleistung, einschließlich KPIs wie Klicks und Bewerbungen. | Data Analyst |\n| A/B-Testing | Durchführung von A/B-Tests, um die Effektivität von Anzeigen und Landing Pages zu optimieren. | Conversion-Optimierungsexperte |\n| Community-Management | Interaktion mit der Zielgruppe auf Facebook, Beantwortung von Fragen und Moderation von Diskussionen. | Community Manager |\n| Reporting und Optimierung | Regelmäßige Berichterstattung über die Kampagnenergebnisse und Anpassung der Strategie. | Performance Marketing-Manager |'}, {'from': 'human', 'value': 'Weiter.'}, {'from': 'gpt', 'value': 'Hier sind einige zusätzliche Maßnahmen, die Sie in Ihre Personalmarketing-Strategie für die Gewinnung von Steuerfachangestellten über Facebook einbeziehen können:\n\n| Maßnahme | Beschreibung | Zuständiger Spezialist |\n| --- | --- | --- |\n| Video-Content | Erstellung von Video-Content, wie Interviews mit aktuellen Mitarbeitern oder Berufsalltagsvideos. | Videoproduzent |\n| Influencer-Marketing | Zusammenarbeit mit Influencern aus der Branche, um Ihre Arbeitgebermarke zu stärken. | Influencer-Marketing-Experte |\n| Anpassung von Stellenanzeigen | Anpassung der Stellenanzeigen auf Facebook, um die Zielgruppe besser anzusprechen. | Personalmarketing-Experte |\n| Mitarbeiter-Testimonials | Teilen von Erfahrungsberichten und Zitaten der Mitarbeiter auf Facebook, um Authentizität zu schaffen. | Content-Stratege |\n| Virtuelle Karriere-Events | Durchführung von virtuellen Karriere-Events oder Informationsveranstaltungen auf Facebook. | Event-Manager |\n| Interaktive Umfragen und Quiz | Nutzung von interaktiven Umfragen und Quiz auf Facebook, um das Engagement zu erhöhen. | Social Media Manager |\n| Remarketing-Kampagnen | Einsatz von Remarketing-Anzeigen, um die Personen erneut anzusprechen, die bereits interessiert waren. | Facebook-Marketing-Experte |\n| Monitoring von Wettbewerbern | Beobachtung der Personalmarketing-Aktivitäten der Wettbewerber auf Facebook und Ableitung von Optimierungspotenzialen. | Marktanalyst |\n| Integration von Chatbots | Implementierung von Chatbots auf der Facebook-Seite und der Landing Page zur Automatisierung des Bewerbungsprozesses. | Chatbot-Entwickler |\n| Employer Branding-Kampagne | Entwicklung einer Employer Branding-Kampagne, die die Unternehmenskultur und -werte hervorhebt. | Employer Branding-Experte |\n\nMit diesen erweiterten Maßnahmen können Sie eine noch umfassendere Personalmarketing-Strategie entwickeln, die die Chancen erhöht, qualifizierte Steuerfachangestellte über Facebook zu gewinnen.'}]] (ignored) + 92%|█████████▏| 5288/5772 [1:33:00<50:29, 6.26s/it] 92%|█████████▏| 5288/5772 [1:33:03<50:29, 6.26s/it] {'loss': 0.4614, 'learning_rate': 3.6662572927445907e-07, 'epoch': 0.92} + 92%|█████████▏| 5288/5772 [1:33:03<50:29, 6.26s/it] {'loss': 0.4614, 'learning_rate': 3.6662572927445907e-07, 'epoch': 0.92} + 92%|█████████▏| 5288/5772 [1:33:00<50:29, 6.26s/it] 92%|█████████▏| 5289/5772 [1:33:06<49:29, 6.15s/it] 92%|█████████▏| 5289/5772 [1:33:09<49:29, 6.15s/it] {'loss': 0.453, 'learning_rate': 3.651215897801097e-07, 'epoch': 0.92} + 92%|█████████▏| 5289/5772 [1:33:09<49:29, 6.15s/it] {'loss': 0.453, 'learning_rate': 3.651215897801097e-07, 'epoch': 0.92} + 92%|█████████▏| 5289/5772 [1:33:06<49:29, 6.15s/it] 92%|█████████▏| 5290/5772 [1:33:13<50:51, 6.33s/it] 92%|█████████▏| 5290/5772 [1:33:16<50:51, 6.33s/it] {'loss': 0.4504, 'learning_rate': 3.6362048473714496e-07, 'epoch': 0.92} + 92%|█████████▏| 5290/5772 [1:33:16<50:51, 6.33s/it] {'loss': 0.4504, 'learning_rate': 3.6362048473714496e-07, 'epoch': 0.92} + 92%|█████████▏| 5290/5772 [1:33:13<50:51, 6.33s/it] 92%|█████████▏| 5291/5772 [1:33:19<51:02, 6.37s/it] 92%|█████████▏| 5291/5772 [1:33:22<51:02, 6.37s/it] {'loss': 0.4459, 'learning_rate': 3.6212241461833107e-07, 'epoch': 0.92} + 92%|█████████▏| 5291/5772 [1:33:22<51:02, 6.37s/it] {'loss': 0.4459, 'learning_rate': 3.6212241461833107e-07, 'epoch': 0.92} + 92%|█████████▏| 5291/5772 [1:33:19<51:02, 6.37s/it] 92%|█████████▏| 5292/5772 [1:33:25<49:29, 6.19s/it] 92%|█████████▏| 5292/5772 [1:33:28<49:29, 6.19s/it] {'loss': 0.4535, 'learning_rate': 3.606273798954751e-07, 'epoch': 0.92} + 92%|█████████▏| 5292/5772 [1:33:28<49:29, 6.19s/it] {'loss': 0.4535, 'learning_rate': 3.606273798954751e-07, 'epoch': 0.92} + 92%|█████████▏| 5292/5772 [1:33:25<49:29, 6.19s/it] 92%|█████████▏| 5293/5772 [1:33:31<49:14, 6.17s/it] 92%|█████████▏| 5293/5772 [1:33:34<49:14, 6.17s/it] {'loss': 0.4683, 'learning_rate': 3.5913538103943155e-07, 'epoch': 0.92} + 92%|█████████▏| 5293/5772 [1:33:34<49:14, 6.17s/it] {'loss': 0.4683, 'learning_rate': 3.5913538103943155e-07, 'epoch': 0.92} + 92%|█████████▏| 5293/5772 [1:33:31<49:14, 6.17s/it] 92%|█████████▏| 5294/5772 [1:33:37<48:26, 6.08s/it] 92%|█████████▏| 5294/5772 [1:33:40<48:26, 6.08s/it] {'loss': 0.4617, 'learning_rate': 3.5764641852009565e-07, 'epoch': 0.92} + 92%|█████████▏| 5294/5772 [1:33:40<48:26, 6.08s/it] {'loss': 0.4617, 'learning_rate': 3.5764641852009565e-07, 'epoch': 0.92} + 92%|█████████▏| 5294/5772 [1:33:37<48:26, 6.08s/it] 92%|█████████▏| 5295/5772 [1:33:43<48:20, 6.08s/it] 92%|█████████▏| 5295/5772 [1:33:46<48:20, 6.08s/it] {'loss': 0.4643, 'learning_rate': 3.5616049280640995e-07, 'epoch': 0.92} + 92%|█████████▏| 5295/5772 [1:33:46<48:20, 6.08s/it] {'loss': 0.4643, 'learning_rate': 3.5616049280640995e-07, 'epoch': 0.92} + 92%|█████████▏| 5295/5772 [1:33:43<48:20, 6.08s/it] 92%|█████████▏| 5296/5772 [1:33:49<47:42, 6.01s/it] 92%|█████████▏| 5296/5772 [1:33:52<47:42, 6.01s/it] {'loss': 0.4484, 'learning_rate': 3.5467760436635577e-07, 'epoch': 0.92} + 92%|█████████▏| 5296/5772 [1:33:52<47:42, 6.01s/it] {'loss': 0.4484, 'learning_rate': 3.5467760436635577e-07, 'epoch': 0.92} + 92%|█████████▏| 5296/5772 [1:33:49<47:42, 6.01s/it] 92%|█████████▏| 5297/5772 [1:33:55<47:19, 5.98s/it] 92%|█████████▏| 5297/5772 [1:33:58<47:19, 5.98s/it] {'loss': 0.4557, 'learning_rate': 3.5319775366696175e-07, 'epoch': 0.92} + 92%|█████████▏| 5297/5772 [1:33:58<47:19, 5.98s/it] {'loss': 0.4557, 'learning_rate': 3.5319775366696175e-07, 'epoch': 0.92} + 92%|█████████▏| 5297/5772 [1:33:55<47:19, 5.98s/it] 92%|█████████▏| 5298/5772 [1:34:01<48:04, 6.09s/it] 92%|█████████▏| 5298/5772 [1:34:04<48:04, 6.09s/it] {'loss': 0.4481, 'learning_rate': 3.517209411742994e-07, 'epoch': 0.92} + 92%|█████████▏| 5298/5772 [1:34:04<48:04, 6.09s/it] {'loss': 0.4481, 'learning_rate': 3.517209411742994e-07, 'epoch': 0.92} + 92%|█████████▏| 5298/5772 [1:34:01<48:04, 6.09s/it] 92%|█████████▏| 5299/5772 [1:34:07<47:56, 6.08s/it] 92%|█████████▏| 5299/5772 [1:34:10<47:56, 6.08s/it] {'loss': 0.4605, 'learning_rate': 3.502471673534824e-07, 'epoch': 0.92} + 92%|█████████▏| 5299/5772 [1:34:10<47:56, 6.08s/it] {'loss': 0.4605, 'learning_rate': 3.502471673534824e-07, 'epoch': 0.92} + 92%|█████████▏| 5299/5772 [1:34:07<47:56, 6.08s/it]2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...4 + AutoResumeHook: Checking whether to suspend... + 92%|█████████▏| 5300/5772 [1:34:13<47:52, 6.09s/it]1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4491, 'learning_rate': 3.48776432668666e-07, 'epoch': 0.92} + 92%|█████████▏| 5300/5772 [1:34:13<47:52, 6.09s/it]14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 92%|█████████▏| 5300/5772 [1:34:16<47:52, 6.09s/it]9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4491, 'learning_rate': 3.48776432668666e-07, 'epoch': 0.92} + 92%|█████████▏| 5300/5772 [1:34:16<47:52, 6.09s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 92%|█████████▏| 5301/5772 [1:34:38<1:24:34, 10.77s/it] 92%|█████████▏| 5301/5772 [1:34:35<1:24:35, 10.78s/it] {'loss': 0.4689, 'learning_rate': 3.4730873758305193e-07, 'epoch': 0.92} + 92%|█████████▏| 5301/5772 [1:34:38<1:24:34, 10.77s/it] {'loss': 0.4689, 'learning_rate': 3.4730873758305193e-07, 'epoch': 0.92} + 92%|█████████▏| 5301/5772 [1:34:35<1:24:35, 10.78s/it] 92%|█████████▏| 5302/5772 [1:34:44<1:13:09, 9.34s/it] 92%|█████████▏| 5302/5772 [1:34:41<1:13:10, 9.34s/it] {'loss': 0.457, 'learning_rate': 3.458440825588827e-07, 'epoch': 0.92} + 92%|█████████▏| 5302/5772 [1:34:44<1:13:09, 9.34s/it] {'loss': 0.457, 'learning_rate': 3.458440825588827e-07, 'epoch': 0.92} + 92%|█████████▏| 5302/5772 [1:34:41<1:13:10, 9.34s/it] 92%|█████████▏| 5303/5772 [1:34:50<1:04:43, 8.28s/it] 92%|█████████▏| 5303/5772 [1:34:47<1:04:43, 8.28s/it] {'loss': 0.4624, 'learning_rate': 3.4438246805744034e-07, 'epoch': 0.92} + 92%|█████████▏| 5303/5772 [1:34:47<1:04:43, 8.28s/it]{'loss': 0.4624, 'learning_rate': 3.4438246805744034e-07, 'epoch': 0.92} + 92%|█████████▏| 5303/5772 [1:34:50<1:04:43, 8.28s/it] 92%|█████████▏| 5304/5772 [1:34:56<1:00:25, 7.75s/it] 92%|█████████▏| 5304/5772 [1:34:53<1:00:25, 7.75s/it] {'loss': 0.4581, 'learning_rate': 3.429238945390556e-07, 'epoch': 0.92} + 92%|█████████▏| 5304/5772 [1:34:56<1:00:25, 7.75s/it] {'loss': 0.4581, 'learning_rate': 3.429238945390556e-07, 'epoch': 0.92} + 92%|█████████▏| 5304/5772 [1:34:53<1:00:25, 7.75s/it] 92%|█████████▏| 5305/5772 [1:35:02<56:26, 7.25s/it] 92%|█████████▏| 5305/5772 [1:34:59<56:26, 7.25s/it] {'loss': 0.4609, 'learning_rate': 3.4146836246309656e-07, 'epoch': 0.92} + 92%|█████████▏| 5305/5772 [1:35:02<56:26, 7.25s/it] {'loss': 0.4609, 'learning_rate': 3.4146836246309656e-07, 'epoch': 0.92} + 92%|█████████▏| 5305/5772 [1:34:59<56:26, 7.25s/it] 92%|█████████▏| 5306/5772 [1:35:09<55:20, 7.12s/it] 92%|█████████▏| 5306/5772 [1:35:06<55:20, 7.12s/it] {'loss': 0.4566, 'learning_rate': 3.4001587228797427e-07, 'epoch': 0.92} + 92%|█████████▏| 5306/5772 [1:35:09<55:20, 7.12s/it] {'loss': 0.4566, 'learning_rate': 3.4001587228797427e-07, 'epoch': 0.92} + 92%|█████████▏| 5306/5772 [1:35:06<55:20, 7.12s/it] 92%|█████████▏| 5307/5772 [1:35:15<52:43, 6.80s/it] 92%|█████████▏| 5307/5772 [1:35:12<52:43, 6.80s/it] {'loss': 0.4571, 'learning_rate': 3.385664244711451e-07, 'epoch': 0.92} + 92%|█████████▏| 5307/5772 [1:35:12<52:43, 6.80s/it] {'loss': 0.4571, 'learning_rate': 3.385664244711451e-07, 'epoch': 0.92} + 92%|█████████▏| 5307/5772 [1:35:15<52:43, 6.80s/it] 92%|█████████▏| 5308/5772 [1:35:21<51:05, 6.61s/it] 92%|█████████▏| 5308/5772 [1:35:18<51:05, 6.61s/it] {'loss': 0.4469, 'learning_rate': 3.3712001946910046e-07, 'epoch': 0.92} + 92%|█████████▏| 5308/5772 [1:35:21<51:05, 6.61s/it] {'loss': 0.4469, 'learning_rate': 3.3712001946910046e-07, 'epoch': 0.92} + 92%|█████████▏| 5308/5772 [1:35:18<51:05, 6.61s/it] 92%|█████████▏| 5309/5772 [1:35:27<49:34, 6.42s/it] 92%|█████████▏| 5309/5772 [1:35:24<49:34, 6.42s/it] {'loss': 0.4617, 'learning_rate': 3.3567665773738156e-07, 'epoch': 0.92} + 92%|█████████▏| 5309/5772 [1:35:27<49:34, 6.42s/it] {'loss': 0.4617, 'learning_rate': 3.3567665773738156e-07, 'epoch': 0.92} + 92%|█████████▏| 5309/5772 [1:35:24<49:34, 6.42s/it] 92%|█████████▏| 5310/5772 [1:35:33<47:54, 6.22s/it] 92%|█████████▏| 5310/5772 [1:35:30<47:54, 6.22s/it] {'loss': 0.4555, 'learning_rate': 3.342363397305648e-07, 'epoch': 0.92} + 92%|█████████▏| 5310/5772 [1:35:33<47:54, 6.22s/it] {'loss': 0.4555, 'learning_rate': 3.342363397305648e-07, 'epoch': 0.92} + 92%|█████████▏| 5310/5772 [1:35:30<47:54, 6.22s/it] 92%|█████████▏| 5311/5772 [1:35:39<47:08, 6.14s/it] 92%|█████████▏| 5311/5772 [1:35:36<47:08, 6.14s/it] {'loss': 0.4569, 'learning_rate': 3.327990659022706e-07, 'epoch': 0.92} + 92%|█████████▏| 5311/5772 [1:35:39<47:08, 6.14s/it] {'loss': 0.4569, 'learning_rate': 3.327990659022706e-07, 'epoch': 0.92} + 92%|█████████▏| 5311/5772 [1:35:36<47:08, 6.14s/it] 92%|█████████▏| 5312/5772 [1:35:45<46:33, 6.07s/it] 92%|█████████▏| 5312/5772 [1:35:42<46:33, 6.07s/it] {'loss': 0.4567, 'learning_rate': 3.313648367051614e-07, 'epoch': 0.92} + 92%|█████████▏| 5312/5772 [1:35:45<46:33, 6.07s/it] {'loss': 0.4567, 'learning_rate': 3.313648367051614e-07, 'epoch': 0.92} + 92%|█████████▏| 5312/5772 [1:35:42<46:33, 6.07s/it] 92%|█████████▏| 5313/5772 [1:35:51<47:14, 6.17s/it] 92%|█████████▏| 5313/5772 [1:35:48<47:14, 6.17s/it] {'loss': 0.4615, 'learning_rate': 3.299336525909391e-07, 'epoch': 0.92} + 92%|█████████▏| 5313/5772 [1:35:51<47:14, 6.17s/it] {'loss': 0.4615, 'learning_rate': 3.299336525909391e-07, 'epoch': 0.92} + 92%|█████████▏| 5313/5772 [1:35:48<47:14, 6.17s/it] 92%|█████████▏| 5314/5772 [1:35:57<46:40, 6.11s/it] 92%|█████████▏| 5314/5772 [1:35:54<46:40, 6.11s/it] {'loss': 0.4533, 'learning_rate': 3.2850551401034767e-07, 'epoch': 0.92} + 92%|█████████▏| 5314/5772 [1:35:57<46:40, 6.11s/it] {'loss': 0.4533, 'learning_rate': 3.2850551401034767e-07, 'epoch': 0.92} + 92%|█████████▏| 5314/5772 [1:35:54<46:40, 6.11s/it] 92%|█████████▏| 5315/5772 [1:36:03<46:08, 6.06s/it] 92%|█████████▏| 5315/5772 [1:36:00<46:08, 6.06s/it] {'loss': 0.4612, 'learning_rate': 3.270804214131684e-07, 'epoch': 0.92} + 92%|█████████▏| 5315/5772 [1:36:03<46:08, 6.06s/it] {'loss': 0.4612, 'learning_rate': 3.270804214131684e-07, 'epoch': 0.92} + 92%|█████████▏| 5315/5772 [1:36:00<46:08, 6.06s/it] 92%|█████████▏| 5316/5772 [1:36:09<45:31, 5.99s/it] 92%|█████████▏| 5316/5772 [1:36:06<45:31, 5.99s/it] {'loss': 0.4474, 'learning_rate': 3.2565837524823227e-07, 'epoch': 0.92} + 92%|█████████▏| 5316/5772 [1:36:09<45:31, 5.99s/it] {'loss': 0.4474, 'learning_rate': 3.2565837524823227e-07, 'epoch': 0.92} + 92%|█████████▏| 5316/5772 [1:36:06<45:31, 5.99s/it] 92%|█████████▏| 5317/5772 [1:36:15<45:34, 6.01s/it] 92%|█████████▏| 5317/5772 [1:36:12<45:34, 6.01s/it] {'loss': 0.459, 'learning_rate': 3.242393759633988e-07, 'epoch': 0.92} + 92%|█████████▏| 5317/5772 [1:36:15<45:34, 6.01s/it] {'loss': 0.459, 'learning_rate': 3.242393759633988e-07, 'epoch': 0.92} + 92%|█████████▏| 5317/5772 [1:36:12<45:34, 6.01s/it] 92%|█████████▏| 5318/5772 [1:36:18<45:24, 6.00s/it] 92%|█████████▏| 5318/5772 [1:36:21<45:24, 6.00s/it] {'loss': 0.4459, 'learning_rate': 3.228234240055772e-07, 'epoch': 0.92} + 92%|█████████▏| 5318/5772 [1:36:21<45:24, 6.00s/it] {'loss': 0.4459, 'learning_rate': 3.228234240055772e-07, 'epoch': 0.92} + 92%|█████████▏| 5318/5772 [1:36:18<45:24, 6.00s/it] 92%|█████████▏| 5319/5772 [1:36:27<44:59, 5.96s/it] 92%|█████████▏| 5319/5772 [1:36:24<44:59, 5.96s/it] {'loss': 0.4676, 'learning_rate': 3.2141051982071293e-07, 'epoch': 0.92} + 92%|█████████▏| 5319/5772 [1:36:27<44:59, 5.96s/it] {'loss': 0.4676, 'learning_rate': 3.2141051982071293e-07, 'epoch': 0.92} + 92%|█████████▏| 5319/5772 [1:36:24<44:59, 5.96s/it] 92%|█████████▏| 5320/5772 [1:36:33<45:30, 6.04s/it] 92%|█████████▏| 5320/5772 [1:36:30<45:30, 6.04s/it] {'loss': 0.4667, 'learning_rate': 3.2000066385379225e-07, 'epoch': 0.92} + 92%|█████████▏| 5320/5772 [1:36:33<45:30, 6.04s/it] {'loss': 0.4667, 'learning_rate': 3.2000066385379225e-07, 'epoch': 0.92} + 92%|█████████▏| 5320/5772 [1:36:30<45:30, 6.04s/it] 92%|█████████▏| 5321/5772 [1:36:40<45:53, 6.11s/it] 92%|█████████▏| 5321/5772 [1:36:36<45:53, 6.10s/it] {'loss': 0.4633, 'learning_rate': 3.185938565488422e-07, 'epoch': 0.92} + 92%|█████████▏| 5321/5772 [1:36:40<45:53, 6.11s/it] {'loss': 0.4633, 'learning_rate': 3.185938565488422e-07, 'epoch': 0.92} + 92%|█████████▏| 5321/5772 [1:36:36<45:53, 6.10s/it] 92%|█████████▏| 5322/5772 [1:36:46<46:09, 6.15s/it] 92%|█████████▏| 5322/5772 [1:36:43<46:09, 6.15s/it] {'loss': 0.4517, 'learning_rate': 3.171900983489273e-07, 'epoch': 0.92} + 92%|█████████▏| 5322/5772 [1:36:46<46:09, 6.15s/it] {'loss': 0.4517, 'learning_rate': 3.171900983489273e-07, 'epoch': 0.92} + 92%|█████████▏| 5322/5772 [1:36:43<46:09, 6.15s/it] 92%|█████████▏| 5323/5772 [1:36:52<45:45, 6.12s/it] 92%|█████████▏| 5323/5772 [1:36:49<45:45, 6.12s/it] {'loss': 0.458, 'learning_rate': 3.1578938969615394e-07, 'epoch': 0.92} + 92%|█████████▏| 5323/5772 [1:36:52<45:45, 6.12s/it] {'loss': 0.458, 'learning_rate': 3.1578938969615394e-07, 'epoch': 0.92} + 92%|█████████▏| 5323/5772 [1:36:49<45:45, 6.12s/it] 92%|█████████▏| 5324/5772 [1:36:58<45:13, 6.06s/it] 92%|█████████▏| 5324/5772 [1:36:55<45:13, 6.06s/it] {'loss': 0.4582, 'learning_rate': 3.143917310316691e-07, 'epoch': 0.92} + 92%|█████████▏| 5324/5772 [1:36:58<45:13, 6.06s/it] {'loss': 0.4582, 'learning_rate': 3.143917310316691e-07, 'epoch': 0.92} + 92%|█████████▏| 5324/5772 [1:36:55<45:13, 6.06s/it] 92%|█████████▏| 5325/5772 [1:37:04<45:16, 6.08s/it] 92%|█████████▏| 5325/5772 [1:37:01<45:16, 6.08s/it] {'loss': 0.4636, 'learning_rate': 3.129971227956563e-07, 'epoch': 0.92} + 92%|█████████▏| 5325/5772 [1:37:04<45:16, 6.08s/it] {'loss': 0.4636, 'learning_rate': 3.129971227956563e-07, 'epoch': 0.92} + 92%|█████████▏| 5325/5772 [1:37:01<45:16, 6.08s/it] 92%|█████████▏| 5326/5772 [1:37:10<45:42, 6.15s/it] {'loss': 0.448, 'learning_rate': 3.1160556542733757e-07, 'epoch': 0.92} + 92%|█████████▏| 5326/5772 [1:37:10<45:42, 6.15s/it] 92%|█████████▏| 5326/5772 [1:37:07<45:42, 6.15s/it] {'loss': 0.448, 'learning_rate': 3.1160556542733757e-07, 'epoch': 0.92} + 92%|█████████▏| 5326/5772 [1:37:07<45:42, 6.15s/it] 92%|█████████▏| 5327/5772 [1:37:17<46:32, 6.27s/it] 92%|█████████▏| 5327/5772 [1:37:14<46:31, 6.27s/it] {'loss': 0.4799, 'learning_rate': 3.1021705936498005e-07, 'epoch': 0.92} + 92%|█████████▏| 5327/5772 [1:37:17<46:32, 6.27s/it] {'loss': 0.4799, 'learning_rate': 3.1021705936498005e-07, 'epoch': 0.92} + 92%|█████████▏| 5327/5772 [1:37:14<46:31, 6.27s/it] 92%|█████████▏| 5328/5772 [1:37:23<45:48, 6.19s/it] 92%|█████████▏| 5328/5772 [1:37:20<45:48, 6.19s/it] {'loss': 0.4445, 'learning_rate': 3.0883160504588504e-07, 'epoch': 0.92} + 92%|█████████▏| 5328/5772 [1:37:23<45:48, 6.19s/it] {'loss': 0.4445, 'learning_rate': 3.0883160504588504e-07, 'epoch': 0.92} + 92%|█████████▏| 5328/5772 [1:37:20<45:48, 6.19s/it] 92%|█████████▏| 5329/5772 [1:37:29<45:42, 6.19s/it] 92%|█████████▏| 5329/5772 [1:37:26<45:42, 6.19s/it] {'loss': 0.4554, 'learning_rate': 3.0744920290639247e-07, 'epoch': 0.92} + 92%|█████████▏| 5329/5772 [1:37:29<45:42, 6.19s/it] {'loss': 0.4554, 'learning_rate': 3.0744920290639247e-07, 'epoch': 0.92} + 92%|█████████▏| 5329/5772 [1:37:26<45:42, 6.19s/it] 92%|█████████▏| 5330/5772 [1:37:35<45:51, 6.23s/it] 92%|█████████▏| 5330/5772 [1:37:32<45:51, 6.23s/it] {'loss': 0.4439, 'learning_rate': 3.0606985338188177e-07, 'epoch': 0.92} + 92%|█████████▏| 5330/5772 [1:37:35<45:51, 6.23s/it] {'loss': 0.4439, 'learning_rate': 3.0606985338188177e-07, 'epoch': 0.92} + 92%|█████████▏| 5330/5772 [1:37:32<45:51, 6.23s/it] 92%|█████████▏| 5331/5772 [1:37:41<44:42, 6.08s/it] 92%|█████████▏| 5331/5772 [1:37:38<44:42, 6.08s/it] {'loss': 0.4707, 'learning_rate': 3.0469355690677216e-07, 'epoch': 0.92} + 92%|█████████▏| 5331/5772 [1:37:41<44:42, 6.08s/it] {'loss': 0.4707, 'learning_rate': 3.0469355690677216e-07, 'epoch': 0.92} + 92%|█████████▏| 5331/5772 [1:37:38<44:42, 6.08s/it] 92%|█████████▏| 5332/5772 [1:37:47<44:40, 6.09s/it] 92%|█████████▏| 5332/5772 [1:37:44<44:40, 6.09s/it] {'loss': 0.45, 'learning_rate': 3.0332031391452243e-07, 'epoch': 0.92} + 92%|█████████▏| 5332/5772 [1:37:47<44:40, 6.09s/it] {'loss': 0.45, 'learning_rate': 3.0332031391452243e-07, 'epoch': 0.92} + 92%|█████████▏| 5332/5772 [1:37:44<44:40, 6.09s/it] 92%|█████████▏| 5333/5772 [1:37:53<44:17, 6.05s/it] 92%|█████████▏| 5333/5772 [1:37:50<44:17, 6.05s/it] {'loss': 0.47, 'learning_rate': 3.019501248376244e-07, 'epoch': 0.92} + 92%|█████████▏| 5333/5772 [1:37:53<44:17, 6.05s/it] {'loss': 0.47, 'learning_rate': 3.019501248376244e-07, 'epoch': 0.92} + 92%|█████████▏| 5333/5772 [1:37:50<44:17, 6.05s/it] 92%|█████████▏| 5334/5772 [1:37:59<44:22, 6.08s/it] 92%|█████████▏| 5334/5772 [1:37:56<44:22, 6.08s/it] {'loss': 0.4555, 'learning_rate': 3.0058299010761294e-07, 'epoch': 0.92} + 92%|█████████▏| 5334/5772 [1:37:59<44:22, 6.08s/it] {'loss': 0.4555, 'learning_rate': 3.0058299010761294e-07, 'epoch': 0.92} + 92%|█████████▏| 5334/5772 [1:37:56<44:22, 6.08s/it] 92%|█████████▏| 5335/5772 [1:38:05<43:46, 6.01s/it] 92%|█████████▏| 5335/5772 [1:38:02<43:46, 6.01s/it] {'loss': 0.4653, 'learning_rate': 2.992189101550613e-07, 'epoch': 0.92} + 92%|█████████▏| 5335/5772 [1:38:05<43:46, 6.01s/it] {'loss': 0.4653, 'learning_rate': 2.992189101550613e-07, 'epoch': 0.92} + 92%|█████████▏| 5335/5772 [1:38:02<43:46, 6.01s/it] 92%|█████████▏| 5336/5772 [1:38:11<43:38, 6.01s/it] 92%|█████████▏| 5336/5772 [1:38:08<43:38, 6.01s/it] {'loss': 0.4473, 'learning_rate': 2.9785788540957706e-07, 'epoch': 0.92} + 92%|█████████▏| 5336/5772 [1:38:11<43:38, 6.01s/it] {'loss': 0.4473, 'learning_rate': 2.9785788540957706e-07, 'epoch': 0.92} + 92%|█████████▏| 5336/5772 [1:38:08<43:38, 6.01s/it] 92%|█████████▏| 5337/5772 [1:38:17<43:29, 6.00s/it] 92%|█████████▏| 5337/5772 [1:38:14<43:29, 6.00s/it] {'loss': 0.4552, 'learning_rate': 2.964999162998072e-07, 'epoch': 0.92} + 92%|█████████▏| 5337/5772 [1:38:14<43:29, 6.00s/it] {'loss': 0.4552, 'learning_rate': 2.964999162998072e-07, 'epoch': 0.92} + 92%|█████████▏| 5337/5772 [1:38:17<43:29, 6.00s/it] 92%|█████████▏| 5338/5772 [1:38:23<43:47, 6.05s/it] 92%|█████████▏| 5338/5772 [1:38:20<43:47, 6.05s/it] {'loss': 0.4465, 'learning_rate': 2.951450032534364e-07, 'epoch': 0.92} + 92%|█████████▏| 5338/5772 [1:38:23<43:47, 6.05s/it] {'loss': 0.4465, 'learning_rate': 2.951450032534364e-07, 'epoch': 0.92} + 92%|█████████▏| 5338/5772 [1:38:20<43:47, 6.05s/it] 92%|█████████▏| 5339/5772 [1:38:29<44:06, 6.11s/it] 92%|█████████▏| 5339/5772 [1:38:26<44:06, 6.11s/it] {'loss': 0.4582, 'learning_rate': 2.937931466971888e-07, 'epoch': 0.92} + 92%|█████████▏| 5339/5772 [1:38:29<44:06, 6.11s/it] {'loss': 0.4582, 'learning_rate': 2.937931466971888e-07, 'epoch': 0.92} + 92%|█████████▏| 5339/5772 [1:38:26<44:06, 6.11s/it] 93%|█████████▎| 5340/5772 [1:38:36<44:01, 6.11s/it] 93%|█████████▎| 5340/5772 [1:38:33<44:00, 6.11s/it] {'loss': 0.4413, 'learning_rate': 2.9244434705682276e-07, 'epoch': 0.93} + 93%|█████████▎| 5340/5772 [1:38:36<44:01, 6.11s/it] {'loss': 0.4413, 'learning_rate': 2.9244434705682276e-07, 'epoch': 0.93} + 93%|█████████▎| 5340/5772 [1:38:33<44:00, 6.11s/it] 93%|█████████▎| 5341/5772 [1:38:42<43:45, 6.09s/it] 93%|█████████▎| 5341/5772 [1:38:39<43:45, 6.09s/it] {'loss': 0.4547, 'learning_rate': 2.9109860475713403e-07, 'epoch': 0.93} + 93%|█████████▎| 5341/5772 [1:38:42<43:45, 6.09s/it] {'loss': 0.4547, 'learning_rate': 2.9109860475713403e-07, 'epoch': 0.93} + 93%|█████████▎| 5341/5772 [1:38:39<43:45, 6.09s/it] 93%|█████████▎| 5342/5772 [1:38:48<44:04, 6.15s/it] 93%|█████████▎| 5342/5772 [1:38:45<44:04, 6.15s/it] {'loss': 0.4603, 'learning_rate': 2.897559202219602e-07, 'epoch': 0.93} + 93%|█████████▎| 5342/5772 [1:38:48<44:04, 6.15s/it] {'loss': 0.4603, 'learning_rate': 2.897559202219602e-07, 'epoch': 0.93} + 93%|█████████▎| 5342/5772 [1:38:45<44:04, 6.15s/it] 93%|█████████▎| 5343/5772 [1:38:54<43:57, 6.15s/it] 93%|█████████▎| 5343/5772 [1:38:51<43:57, 6.15s/it] {'loss': 0.4502, 'learning_rate': 2.884162938741686e-07, 'epoch': 0.93} + 93%|█████████▎| 5343/5772 [1:38:54<43:57, 6.15s/it] {'loss': 0.4502, 'learning_rate': 2.884162938741686e-07, 'epoch': 0.93} + 93%|█████████▎| 5343/5772 [1:38:51<43:57, 6.15s/it] 93%|█████████▎| 5344/5772 [1:39:00<43:39, 6.12s/it] 93%|█████████▎| 5344/5772 [1:38:57<43:39, 6.12s/it] {'loss': 0.4443, 'learning_rate': 2.870797261356684e-07, 'epoch': 0.93} + 93%|█████████▎| 5344/5772 [1:39:00<43:39, 6.12s/it] {'loss': 0.4443, 'learning_rate': 2.870797261356684e-07, 'epoch': 0.93} + 93%|█████████▎| 5344/5772 [1:38:57<43:39, 6.12s/it] 93%|█████████▎| 5345/5772 [1:39:06<43:18, 6.08s/it] 93%|█████████▎| 5345/5772 [1:39:03<43:18, 6.08s/it]{'loss': 0.4499, 'learning_rate': 2.8574621742740506e-07, 'epoch': 0.93} + {'loss': 0.4499, 'learning_rate': 2.8574621742740506e-07, 'epoch': 0.93} + 93%|█████████▎| 5345/5772 [1:39:06<43:18, 6.08s/it] 93%|█████████▎| 5345/5772 [1:39:03<43:18, 6.08s/it] 93%|█████████▎| 5346/5772 [1:39:13<44:14, 6.23s/it] 93%|█████████▎| 5346/5772 [1:39:10<44:14, 6.23s/it] {'loss': 0.4647, 'learning_rate': 2.8441576816936043e-07, 'epoch': 0.93} + 93%|█████████▎| 5346/5772 [1:39:13<44:14, 6.23s/it] {'loss': 0.4647, 'learning_rate': 2.8441576816936043e-07, 'epoch': 0.93} + 93%|█████████▎| 5346/5772 [1:39:10<44:14, 6.23s/it] 93%|█████████▎| 5347/5772 [1:39:19<43:23, 6.13s/it] 93%|█████████▎| 5347/5772 [1:39:16<43:23, 6.13s/it] {'loss': 0.4481, 'learning_rate': 2.830883787805494e-07, 'epoch': 0.93} + 93%|█████████▎| 5347/5772 [1:39:19<43:23, 6.13s/it] {'loss': 0.4481, 'learning_rate': 2.830883787805494e-07, 'epoch': 0.93} + 93%|█████████▎| 5347/5772 [1:39:16<43:23, 6.13s/it] 93%|█████████▎| 5348/5772 [1:39:25<43:51, 6.21s/it] 93%|█████████▎| 5348/5772 [1:39:22<43:51, 6.21s/it] {'loss': 0.455, 'learning_rate': 2.817640496790275e-07, 'epoch': 0.93} + 93%|█████████▎| 5348/5772 [1:39:25<43:51, 6.21s/it] {'loss': 0.455, 'learning_rate': 2.817640496790275e-07, 'epoch': 0.93} + 93%|█████████▎| 5348/5772 [1:39:22<43:51, 6.21s/it] 93%|█████████▎| 5349/5772 [1:39:31<43:37, 6.19s/it] 93%|█████████▎| 5349/5772 [1:39:28<43:37, 6.19s/it]{'loss': 0.467, 'learning_rate': 2.8044278128188327e-07, 'epoch': 0.93} + 93%|█████████▎| 5349/5772 [1:39:31<43:37, 6.19s/it] {'loss': 0.467, 'learning_rate': 2.8044278128188327e-07, 'epoch': 0.93} + 93%|█████████▎| 5349/5772 [1:39:28<43:37, 6.19s/it]5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +8114 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +2 12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 93%|█████████▎| 5350/5772 [1:39:37<43:33, 6.19s/it]9 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 93%|█████████▎| 5350/5772 [1:39:34<43:33, 6.19s/it] {'loss': 0.4679, 'learning_rate': 2.791245740052451e-07, 'epoch': 0.93} + 93%|█████████▎| 5350/5772 [1:39:37<43:33, 6.19s/it] {'loss': 0.4679, 'learning_rate': 2.791245740052451e-07, 'epoch': 0.93} + 93%|█████████▎| 5350/5772 [1:39:34<43:33, 6.19s/it] 93%|█████████▎| 5351/5772 [1:39:44<44:00, 6.27s/it] 93%|█████████▎| 5351/5772 [1:39:41<44:00, 6.27s/it] {'loss': 0.461, 'learning_rate': 2.7780942826427514e-07, 'epoch': 0.93} + 93%|█████████▎| 5351/5772 [1:39:44<44:00, 6.27s/it] {'loss': 0.461, 'learning_rate': 2.7780942826427514e-07, 'epoch': 0.93} + 93%|█████████▎| 5351/5772 [1:39:41<44:00, 6.27s/it] 93%|█████████▎| 5352/5772 [1:39:50<43:54, 6.27s/it] 93%|█████████▎| 5352/5772 [1:39:47<43:54, 6.27s/it] {'loss': 0.4574, 'learning_rate': 2.7649734447316777e-07, 'epoch': 0.93} + 93%|█████████▎| 5352/5772 [1:39:47<43:54, 6.27s/it] {'loss': 0.4574, 'learning_rate': 2.7649734447316777e-07, 'epoch': 0.93} + 93%|█████████▎| 5352/5772 [1:39:50<43:54, 6.27s/it] 93%|█████████▎| 5353/5772 [1:39:56<43:25, 6.22s/it] 93%|█████████▎| 5353/5772 [1:39:53<43:25, 6.22s/it] {'loss': 0.4629, 'learning_rate': 2.751883230451613e-07, 'epoch': 0.93} + 93%|█████████▎| 5353/5772 [1:39:56<43:25, 6.22s/it] {'loss': 0.4629, 'learning_rate': 2.751883230451613e-07, 'epoch': 0.93} + 93%|█████████▎| 5353/5772 [1:39:53<43:25, 6.22s/it] 93%|█████████▎| 5354/5772 [1:40:02<43:31, 6.25s/it] 93%|█████████▎| 5354/5772 [1:39:59<43:31, 6.25s/it] {'loss': 0.4565, 'learning_rate': 2.738823643925215e-07, 'epoch': 0.93} + 93%|█████████▎| 5354/5772 [1:40:02<43:31, 6.25s/it] {'loss': 0.4565, 'learning_rate': 2.738823643925215e-07, 'epoch': 0.93} + 93%|█████████▎| 5354/5772 [1:39:59<43:31, 6.25s/it] 93%|█████████▎| 5355/5772 [1:40:08<42:53, 6.17s/it] 93%|█████████▎| 5355/5772 [1:40:05<42:53, 6.17s/it] {'loss': 0.4407, 'learning_rate': 2.725794689265537e-07, 'epoch': 0.93} + 93%|█████████▎| 5355/5772 [1:40:08<42:53, 6.17s/it] {'loss': 0.4407, 'learning_rate': 2.725794689265537e-07, 'epoch': 0.93} + 93%|█████████▎| 5355/5772 [1:40:05<42:53, 6.17s/it] 93%|█████████▎| 5356/5772 [1:40:15<42:48, 6.17s/it] 93%|█████████▎| 5356/5772 [1:40:12<42:48, 6.17s/it] {'loss': 0.459, 'learning_rate': 2.7127963705759653e-07, 'epoch': 0.93} + 93%|█████████▎| 5356/5772 [1:40:15<42:48, 6.17s/it] {'loss': 0.459, 'learning_rate': 2.7127963705759653e-07, 'epoch': 0.93} + 93%|█████████▎| 5356/5772 [1:40:12<42:48, 6.17s/it] 93%|█████████▎| 5357/5772 [1:40:20<41:54, 6.06s/it] 93%|█████████▎| 5357/5772 [1:40:17<41:54, 6.06s/it] {'loss': 0.4583, 'learning_rate': 2.699828691950268e-07, 'epoch': 0.93} + 93%|█████████▎| 5357/5772 [1:40:20<41:54, 6.06s/it] {'loss': 0.4583, 'learning_rate': 2.699828691950268e-07, 'epoch': 0.93} + 93%|█████████▎| 5357/5772 [1:40:17<41:54, 6.06s/it] 93%|█████████▎| 5358/5772 [1:40:26<41:40, 6.04s/it] 93%|█████████▎| 5358/5772 [1:40:23<41:40, 6.04s/it]{'loss': 0.4489, 'learning_rate': 2.6868916574725347e-07, 'epoch': 0.93} + {'loss': 0.4489, 'learning_rate': 2.6868916574725347e-07, 'epoch': 0.93} + 93%|█████████▎| 5358/5772 [1:40:26<41:40, 6.04s/it] 93%|█████████▎| 5358/5772 [1:40:23<41:40, 6.04s/it] 93%|█████████▎| 5359/5772 [1:40:33<43:07, 6.26s/it] 93%|█████████▎| 5359/5772 [1:40:30<43:07, 6.26s/it] {'loss': 0.4553, 'learning_rate': 2.6739852712171946e-07, 'epoch': 0.93} + 93%|█████████▎| 5359/5772 [1:40:33<43:07, 6.26s/it] {'loss': 0.4553, 'learning_rate': 2.6739852712171946e-07, 'epoch': 0.93} + 93%|█████████▎| 5359/5772 [1:40:30<43:07, 6.26s/it] 93%|█████████▎| 5360/5772 [1:40:36<42:47, 6.23s/it] {'loss': 0.4625, 'learning_rate': 2.661109537249085e-07, 'epoch': 0.93} + 93%|█████████▎| 5360/5772 [1:40:36<42:47, 6.23s/it] 93%|█████████▎| 5360/5772 [1:40:39<42:47, 6.23s/it] {'loss': 0.4625, 'learning_rate': 2.661109537249085e-07, 'epoch': 0.93} + 93%|█████████▎| 5360/5772 [1:40:39<42:47, 6.23s/it] 93%|█████████▎| 5361/5772 [1:40:45<42:17, 6.17s/it] 93%|█████████▎| 5361/5772 [1:40:42<42:17, 6.17s/it] {'loss': 0.4665, 'learning_rate': 2.6482644596232953e-07, 'epoch': 0.93} + 93%|█████████▎| 5361/5772 [1:40:45<42:17, 6.17s/it] {'loss': 0.4665, 'learning_rate': 2.6482644596232953e-07, 'epoch': 0.93} + 93%|█████████▎| 5361/5772 [1:40:42<42:17, 6.17s/it] 93%|█████████▎| 5362/5772 [1:40:52<42:08, 6.17s/it] 93%|█████████▎| 5362/5772 [1:40:49<42:08, 6.17s/it] {'loss': 0.452, 'learning_rate': 2.6354500423853457e-07, 'epoch': 0.93} + 93%|█████████▎| 5362/5772 [1:40:52<42:08, 6.17s/it] {'loss': 0.452, 'learning_rate': 2.6354500423853457e-07, 'epoch': 0.93} + 93%|█████████▎| 5362/5772 [1:40:49<42:08, 6.17s/it] 93%|█████████▎| 5363/5772 [1:40:57<41:11, 6.04s/it] {'loss': 0.4487, 'learning_rate': 2.622666289571063e-07, 'epoch': 0.93} + 93%|█████████▎| 5363/5772 [1:40:57<41:11, 6.04s/it] 93%|█████████▎| 5363/5772 [1:40:54<41:11, 6.04s/it] {'loss': 0.4487, 'learning_rate': 2.622666289571063e-07, 'epoch': 0.93} + 93%|█████████▎| 5363/5772 [1:40:54<41:11, 6.04s/it] 93%|█████████▎| 5364/5772 [1:41:04<42:19, 6.22s/it] 93%|█████████▎| 5364/5772 [1:41:01<42:19, 6.22s/it] {'loss': 0.4503, 'learning_rate': 2.6099132052066044e-07, 'epoch': 0.93} + 93%|█████████▎| 5364/5772 [1:41:04<42:19, 6.22s/it] {'loss': 0.4503, 'learning_rate': 2.6099132052066044e-07, 'epoch': 0.93} + 93%|█████████▎| 5364/5772 [1:41:01<42:19, 6.22s/it] 93%|█████████▎| 5365/5772 [1:41:10<41:26, 6.11s/it] 93%|█████████▎| 5365/5772 [1:41:07<41:26, 6.11s/it] {'loss': 0.4621, 'learning_rate': 2.5971907933085016e-07, 'epoch': 0.93} + 93%|█████████▎| 5365/5772 [1:41:10<41:26, 6.11s/it] {'loss': 0.4621, 'learning_rate': 2.5971907933085016e-07, 'epoch': 0.93} + 93%|█████████▎| 5365/5772 [1:41:07<41:26, 6.11s/it] 93%|█████████▎| 5366/5772 [1:41:16<41:39, 6.16s/it] 93%|█████████▎| 5366/5772 [1:41:13<41:39, 6.16s/it] {'loss': 0.4508, 'learning_rate': 2.5844990578835825e-07, 'epoch': 0.93} + 93%|█████████▎| 5366/5772 [1:41:13<41:39, 6.16s/it]{'loss': 0.4508, 'learning_rate': 2.5844990578835825e-07, 'epoch': 0.93} + 93%|█████████▎| 5366/5772 [1:41:16<41:39, 6.16s/it] 93%|█████████▎| 5367/5772 [1:41:22<41:22, 6.13s/it] 93%|█████████▎| 5367/5772 [1:41:19<41:21, 6.13s/it] {'loss': 0.4531, 'learning_rate': 2.571838002929061e-07, 'epoch': 0.93} + 93%|█████████▎| 5367/5772 [1:41:22<41:22, 6.13s/it] {'loss': 0.4531, 'learning_rate': 2.571838002929061e-07, 'epoch': 0.93} + 93%|█████████▎| 5367/5772 [1:41:19<41:21, 6.13s/it] 93%|█████████▎| 5368/5772 [1:41:28<41:08, 6.11s/it] 93%|█████████▎| 5368/5772 [1:41:25<41:08, 6.11s/it] {'loss': 0.4536, 'learning_rate': 2.559207632432448e-07, 'epoch': 0.93} + 93%|█████████▎| 5368/5772 [1:41:28<41:08, 6.11s/it] {'loss': 0.4536, 'learning_rate': 2.559207632432448e-07, 'epoch': 0.93} + 93%|█████████▎| 5368/5772 [1:41:25<41:08, 6.11s/it] 93%|█████████▎| 5369/5772 [1:41:34<40:46, 6.07s/it] 93%|█████████▎| 5369/5772 [1:41:31<40:46, 6.07s/it] {'loss': 0.4642, 'learning_rate': 2.546607950371627e-07, 'epoch': 0.93} + 93%|█████████▎| 5369/5772 [1:41:34<40:46, 6.07s/it] {'loss': 0.4642, 'learning_rate': 2.546607950371627e-07, 'epoch': 0.93} + 93%|█████████▎| 5369/5772 [1:41:31<40:46, 6.07s/it] 93%|█████████▎| 5370/5772 [1:41:40<40:48, 6.09s/it] 93%|█████████▎| 5370/5772 [1:41:37<40:48, 6.09s/it] {'loss': 0.4454, 'learning_rate': 2.534038960714791e-07, 'epoch': 0.93} + 93%|█████████▎| 5370/5772 [1:41:40<40:48, 6.09s/it] {'loss': 0.4454, 'learning_rate': 2.534038960714791e-07, 'epoch': 0.93} + 93%|█████████▎| 5370/5772 [1:41:37<40:48, 6.09s/it] 93%|█████████▎| 5371/5772 [1:41:47<40:57, 6.13s/it] 93%|█████████▎| 5371/5772 [1:41:43<40:57, 6.13s/it] {'loss': 0.4569, 'learning_rate': 2.5215006674204625e-07, 'epoch': 0.93} + 93%|█████████▎| 5371/5772 [1:41:47<40:57, 6.13s/it] {'loss': 0.4569, 'learning_rate': 2.5215006674204625e-07, 'epoch': 0.93} + 93%|█████████▎| 5371/5772 [1:41:43<40:57, 6.13s/it] 93%|█████████▎| 5372/5772 [1:41:53<40:54, 6.14s/it] 93%|█████████▎| 5372/5772 [1:41:50<40:54, 6.14s/it] {'loss': 0.4533, 'learning_rate': 2.508993074437527e-07, 'epoch': 0.93} + 93%|█████████▎| 5372/5772 [1:41:53<40:54, 6.14s/it] {'loss': 0.4533, 'learning_rate': 2.508993074437527e-07, 'epoch': 0.93} + 93%|█████████▎| 5372/5772 [1:41:50<40:54, 6.14s/it] 93%|█████████▎| 5373/5772 [1:41:59<40:31, 6.09s/it] 93%|█████████▎| 5373/5772 [1:41:56<40:31, 6.09s/it] {'loss': 0.4574, 'learning_rate': 2.4965161857051667e-07, 'epoch': 0.93} + 93%|█████████▎| 5373/5772 [1:41:59<40:31, 6.09s/it] {'loss': 0.4574, 'learning_rate': 2.4965161857051667e-07, 'epoch': 0.93} + 93%|█████████▎| 5373/5772 [1:41:56<40:31, 6.09s/it] 93%|█████████▎| 5374/5772 [1:42:05<41:10, 6.21s/it] 93%|█████████▎| 5374/5772 [1:42:02<41:10, 6.21s/it] {'loss': 0.4555, 'learning_rate': 2.4840700051529054e-07, 'epoch': 0.93} + 93%|█████████▎| 5374/5772 [1:42:05<41:10, 6.21s/it] {'loss': 0.4555, 'learning_rate': 2.4840700051529054e-07, 'epoch': 0.93} + 93%|█████████▎| 5374/5772 [1:42:02<41:10, 6.21s/it] 93%|█████████▎| 5375/5772 [1:42:11<40:35, 6.13s/it] 93%|█████████▎| 5375/5772 [1:42:08<40:35, 6.13s/it] {'loss': 0.4636, 'learning_rate': 2.4716545367006186e-07, 'epoch': 0.93} + 93%|█████████▎| 5375/5772 [1:42:11<40:35, 6.13s/it] {'loss': 0.4636, 'learning_rate': 2.4716545367006186e-07, 'epoch': 0.93} + 93%|█████████▎| 5375/5772 [1:42:08<40:35, 6.13s/it] 93%|█████████▎| 5376/5772 [1:42:18<41:22, 6.27s/it] 93%|█████████▎| 5376/5772 [1:42:15<41:22, 6.27s/it] {'loss': 0.4602, 'learning_rate': 2.459269784258467e-07, 'epoch': 0.93} + 93%|█████████▎| 5376/5772 [1:42:18<41:22, 6.27s/it] {'loss': 0.4602, 'learning_rate': 2.459269784258467e-07, 'epoch': 0.93} + 93%|█████████▎| 5376/5772 [1:42:15<41:22, 6.27s/it] 93%|█████████▎| 5377/5772 [1:42:24<41:18, 6.27s/it] 93%|█████████▎| 5377/5772 [1:42:21<41:18, 6.27s/it] {'loss': 0.4591, 'learning_rate': 2.4469157517269636e-07, 'epoch': 0.93} + 93%|█████████▎| 5377/5772 [1:42:24<41:18, 6.27s/it] {'loss': 0.4591, 'learning_rate': 2.4469157517269636e-07, 'epoch': 0.93} + 93%|█████████▎| 5377/5772 [1:42:21<41:18, 6.27s/it] 93%|█████████▎| 5378/5772 [1:42:30<40:13, 6.13s/it] {'loss': 0.4491, 'learning_rate': 2.4345924429969523e-07, 'epoch': 0.93} + 93%|█████████▎| 5378/5772 [1:42:30<40:13, 6.13s/it] 93%|█████████▎| 5378/5772 [1:42:27<40:13, 6.13s/it] {'loss': 0.4491, 'learning_rate': 2.4345924429969523e-07, 'epoch': 0.93} + 93%|█████████▎| 5378/5772 [1:42:27<40:13, 6.13s/it] 93%|█████████▎| 5379/5772 [1:42:36<39:50, 6.08s/it] 93%|█████████▎| 5379/5772 [1:42:33<39:50, 6.08s/it] {'loss': 0.4593, 'learning_rate': 2.4222998619495953e-07, 'epoch': 0.93} + 93%|█████████▎| 5379/5772 [1:42:36<39:50, 6.08s/it] {'loss': 0.4593, 'learning_rate': 2.4222998619495953e-07, 'epoch': 0.93} + 93%|█████████▎| 5379/5772 [1:42:33<39:50, 6.08s/it] 93%|█████████▎| 5380/5772 [1:42:42<39:24, 6.03s/it] 93%|█████████▎| 5380/5772 [1:42:39<39:24, 6.03s/it] {'loss': 0.4663, 'learning_rate': 2.41003801245635e-07, 'epoch': 0.93} + 93%|█████████▎| 5380/5772 [1:42:42<39:24, 6.03s/it] {'loss': 0.4663, 'learning_rate': 2.41003801245635e-07, 'epoch': 0.93} + 93%|█████████▎| 5380/5772 [1:42:39<39:24, 6.03s/it] 93%|█████████▎| 5381/5772 [1:42:48<39:17, 6.03s/it] 93%|█████████▎| 5381/5772 [1:42:45<39:17, 6.03s/it] {'loss': 0.4496, 'learning_rate': 2.3978068983790294e-07, 'epoch': 0.93} + 93%|█████████▎| 5381/5772 [1:42:45<39:17, 6.03s/it] {'loss': 0.4496, 'learning_rate': 2.3978068983790294e-07, 'epoch': 0.93} + 93%|█████████▎| 5381/5772 [1:42:48<39:17, 6.03s/it] 93%|█████████▎| 5382/5772 [1:42:54<39:31, 6.08s/it] 93%|█████████▎| 5382/5772 [1:42:51<39:31, 6.08s/it] {'loss': 0.4602, 'learning_rate': 2.3856065235697613e-07, 'epoch': 0.93} + 93%|█████████▎| 5382/5772 [1:42:54<39:31, 6.08s/it] {'loss': 0.4602, 'learning_rate': 2.3856065235697613e-07, 'epoch': 0.93} + 93%|█████████▎| 5382/5772 [1:42:51<39:31, 6.08s/it] 93%|█████████▎| 5383/5772 [1:43:00<39:54, 6.15s/it] 93%|█████████▎| 5383/5772 [1:42:57<39:54, 6.15s/it] {'loss': 0.4575, 'learning_rate': 2.3734368918709838e-07, 'epoch': 0.93} + 93%|█████████▎| 5383/5772 [1:43:00<39:54, 6.15s/it] {'loss': 0.4575, 'learning_rate': 2.3734368918709838e-07, 'epoch': 0.93} + 93%|█████████▎| 5383/5772 [1:42:57<39:54, 6.15s/it] 93%|█████████▎| 5384/5772 [1:43:06<39:53, 6.17s/it] 93%|█████████▎| 5384/5772 [1:43:03<39:53, 6.17s/it] {'loss': 0.4551, 'learning_rate': 2.3612980071154534e-07, 'epoch': 0.93} + 93%|█████████▎| 5384/5772 [1:43:06<39:53, 6.17s/it] {'loss': 0.4551, 'learning_rate': 2.3612980071154534e-07, 'epoch': 0.93} + 93%|█████████▎| 5384/5772 [1:43:03<39:53, 6.17s/it] 93%|█████████▎| 5385/5772 [1:43:12<39:35, 6.14s/it] 93%|█████████▎| 5385/5772 [1:43:09<39:35, 6.14s/it] {'loss': 0.4377, 'learning_rate': 2.349189873126223e-07, 'epoch': 0.93} + 93%|█████████▎| 5385/5772 [1:43:12<39:35, 6.14s/it] {'loss': 0.4377, 'learning_rate': 2.349189873126223e-07, 'epoch': 0.93} + 93%|█████████▎| 5385/5772 [1:43:09<39:35, 6.14s/it] 93%|█████████▎| 5386/5772 [1:43:18<38:51, 6.04s/it] 93%|█████████▎| 5386/5772 [1:43:15<38:51, 6.04s/it] {'loss': 0.4714, 'learning_rate': 2.33711249371672e-07, 'epoch': 0.93} + 93%|█████████▎| 5386/5772 [1:43:18<38:51, 6.04s/it] {'loss': 0.4714, 'learning_rate': 2.33711249371672e-07, 'epoch': 0.93} + 93%|█████████▎| 5386/5772 [1:43:15<38:51, 6.04s/it] 93%|█████████▎| 5387/5772 [1:43:21<39:01, 6.08s/it] 93%|█████████▎| 5387/5772 [1:43:24<39:01, 6.08s/it] {'loss': 0.4479, 'learning_rate': 2.325065872690624e-07, 'epoch': 0.93} + 93%|█████████▎| 5387/5772 [1:43:21<39:01, 6.08s/it] {'loss': 0.4479, 'learning_rate': 2.325065872690624e-07, 'epoch': 0.93} + 93%|█████████▎| 5387/5772 [1:43:24<39:01, 6.08s/it] 93%|█████████▎| 5388/5772 [1:43:31<39:14, 6.13s/it] 93%|█████████▎| 5388/5772 [1:43:28<39:14, 6.13s/it] {'loss': 0.4634, 'learning_rate': 2.3130500138419553e-07, 'epoch': 0.93} + 93%|█████████▎| 5388/5772 [1:43:31<39:14, 6.13s/it] {'loss': 0.4634, 'learning_rate': 2.3130500138419553e-07, 'epoch': 0.93} + 93%|█████████▎| 5388/5772 [1:43:28<39:14, 6.13s/it] 93%|█████████▎| 5389/5772 [1:43:37<39:25, 6.18s/it] 93%|█████████▎| 5389/5772 [1:43:34<39:25, 6.18s/it] {'loss': 0.4547, 'learning_rate': 2.3010649209550428e-07, 'epoch': 0.93} + 93%|█████████▎| 5389/5772 [1:43:37<39:25, 6.18s/it] {'loss': 0.4547, 'learning_rate': 2.3010649209550428e-07, 'epoch': 0.93} + 93%|█████████▎| 5389/5772 [1:43:34<39:25, 6.18s/it] 93%|█████████▎| 5390/5772 [1:43:43<39:14, 6.16s/it] 93%|█████████▎| 5390/5772 [1:43:40<39:14, 6.16s/it] {'loss': 0.4579, 'learning_rate': 2.2891105978045336e-07, 'epoch': 0.93} + 93%|█████████▎| 5390/5772 [1:43:43<39:14, 6.16s/it] {'loss': 0.4579, 'learning_rate': 2.2891105978045336e-07, 'epoch': 0.93} + 93%|█████████▎| 5390/5772 [1:43:40<39:14, 6.16s/it] 93%|█████████▎| 5391/5772 [1:43:49<39:21, 6.20s/it] 93%|█████████▎| 5391/5772 [1:43:46<39:21, 6.20s/it]{'loss': 0.4419, 'learning_rate': 2.2771870481553715e-07, 'epoch': 0.93} + {'loss': 0.4419, 'learning_rate': 2.2771870481553715e-07, 'epoch': 0.93} + 93%|█████████▎| 5391/5772 [1:43:49<39:21, 6.20s/it] 93%|█████████▎| 5391/5772 [1:43:46<39:21, 6.20s/it] 93%|█████████▎| 5392/5772 [1:43:56<39:12, 6.19s/it] 93%|█████████▎| 5392/5772 [1:43:53<39:12, 6.19s/it] {'loss': 0.4609, 'learning_rate': 2.265294275762786e-07, 'epoch': 0.93} + 93%|█████████▎| 5392/5772 [1:43:56<39:12, 6.19s/it] {'loss': 0.4609, 'learning_rate': 2.265294275762786e-07, 'epoch': 0.93} + 93%|█████████▎| 5392/5772 [1:43:53<39:12, 6.19s/it] 93%|█████████▎| 5393/5772 [1:44:02<39:42, 6.29s/it] 93%|█████████▎| 5393/5772 [1:43:59<39:42, 6.29s/it] {'loss': 0.4552, 'learning_rate': 2.25343228437237e-07, 'epoch': 0.93} + 93%|█████████▎| 5393/5772 [1:44:02<39:42, 6.29s/it] {'loss': 0.4552, 'learning_rate': 2.25343228437237e-07, 'epoch': 0.93} + 93%|█████████▎| 5393/5772 [1:43:59<39:42, 6.29s/it] 93%|█████████▎| 5394/5772 [1:44:08<38:41, 6.14s/it] 93%|█████████▎| 5394/5772 [1:44:05<38:41, 6.14s/it] {'loss': 0.4539, 'learning_rate': 2.2416010777199904e-07, 'epoch': 0.93} + 93%|█████████▎| 5394/5772 [1:44:08<38:41, 6.14s/it] {'loss': 0.4539, 'learning_rate': 2.2416010777199904e-07, 'epoch': 0.93} + 93%|█████████▎| 5394/5772 [1:44:05<38:41, 6.14s/it] 93%|█████████▎| 5395/5772 [1:44:11<38:52, 6.19s/it] 93%|█████████▎| 5395/5772 [1:44:14<38:52, 6.19s/it]{'loss': 0.4517, 'learning_rate': 2.229800659531811e-07, 'epoch': 0.93} + 93%|█████████▎| 5395/5772 [1:44:14<38:52, 6.19s/it] {'loss': 0.4517, 'learning_rate': 2.229800659531811e-07, 'epoch': 0.93} + 93%|█████████▎| 5395/5772 [1:44:11<38:52, 6.19s/it] 93%|█████████▎| 5396/5772 [1:44:20<38:39, 6.17s/it] 93%|█████████▎| 5396/5772 [1:44:17<38:39, 6.17s/it] {'loss': 0.4577, 'learning_rate': 2.218031033524304e-07, 'epoch': 0.93} + 93%|█████████▎| 5396/5772 [1:44:20<38:39, 6.17s/it] {'loss': 0.4577, 'learning_rate': 2.218031033524304e-07, 'epoch': 0.93} + 93%|█████████▎| 5396/5772 [1:44:17<38:39, 6.17s/it] 94%|█████████▎| 5397/5772 [1:44:27<38:56, 6.23s/it] 94%|█████████▎| 5397/5772 [1:44:24<38:56, 6.23s/it] {'loss': 0.4423, 'learning_rate': 2.2062922034042478e-07, 'epoch': 0.93} + 94%|█████████▎| 5397/5772 [1:44:27<38:56, 6.23s/it] {'loss': 0.4423, 'learning_rate': 2.2062922034042478e-07, 'epoch': 0.93} + 94%|█████████▎| 5397/5772 [1:44:24<38:56, 6.23s/it] 94%|█████████▎| 5398/5772 [1:44:33<38:11, 6.13s/it] 94%|█████████▎| 5398/5772 [1:44:30<38:11, 6.13s/it]{'loss': 0.4606, 'learning_rate': 2.194584172868741e-07, 'epoch': 0.94} + 94%|█████████▎| 5398/5772 [1:44:33<38:11, 6.13s/it] {'loss': 0.4606, 'learning_rate': 2.194584172868741e-07, 'epoch': 0.94} + 94%|█████████▎| 5398/5772 [1:44:30<38:11, 6.13s/it] 94%|█████████▎| 5399/5772 [1:44:39<37:50, 6.09s/it] 94%|█████████▎| 5399/5772 [1:44:36<37:50, 6.09s/it] {'loss': 0.4527, 'learning_rate': 2.1829069456051456e-07, 'epoch': 0.94} + 94%|█████████▎| 5399/5772 [1:44:39<37:50, 6.09s/it] {'loss': 0.4527, 'learning_rate': 2.1829069456051456e-07, 'epoch': 0.94} + 94%|█████████▎| 5399/5772 [1:44:36<37:50, 6.09s/it]5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... + +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend...9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 94%|█████████▎| 5400/5772 [1:44:44<37:14, 6.01s/it]1 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 94%|█████████▎| 5400/5772 [1:44:41<37:14, 6.01s/it] {'loss': 0.4753, 'learning_rate': 2.17126052529113e-07, 'epoch': 0.94} + 94%|█████████▎| 5400/5772 [1:44:44<37:14, 6.01s/it] {'loss': 0.4753, 'learning_rate': 2.17126052529113e-07, 'epoch': 0.94} + 94%|█████████▎| 5400/5772 [1:44:41<37:14, 6.01s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 94%|█████████▎| 5401/5772 [1:45:11<1:15:30, 12.21s/it] 94%|█████████▎| 5401/5772 [1:45:08<1:15:30, 12.21s/it] {'loss': 0.4676, 'learning_rate': 2.159644915594694e-07, 'epoch': 0.94} + 94%|█████████▎| 5401/5772 [1:45:11<1:15:30, 12.21s/it] {'loss': 0.4676, 'learning_rate': 2.159644915594694e-07, 'epoch': 0.94} + 94%|█████████▎| 5401/5772 [1:45:08<1:15:30, 12.21s/it] 94%|█████████▎| 5402/5772 [1:45:17<1:04:03, 10.39s/it] 94%|█████████▎| 5402/5772 [1:45:14<1:04:03, 10.39s/it] {'loss': 0.4511, 'learning_rate': 2.1480601201741004e-07, 'epoch': 0.94} + 94%|█████████▎| 5402/5772 [1:45:17<1:04:03, 10.39s/it] {'loss': 0.4511, 'learning_rate': 2.1480601201741004e-07, 'epoch': 0.94} + 94%|█████████▎| 5402/5772 [1:45:14<1:04:03, 10.39s/it] 94%|█████████▎| 5403/5772 [1:45:23<55:50, 9.08s/it] 94%|█████████▎| 5403/5772 [1:45:20<55:50, 9.08s/it] {'loss': 0.4427, 'learning_rate': 2.1365061426778967e-07, 'epoch': 0.94} + 94%|█████████▎| 5403/5772 [1:45:20<55:50, 9.08s/it] {'loss': 0.4427, 'learning_rate': 2.1365061426778967e-07, 'epoch': 0.94} + 94%|█████████▎| 5403/5772 [1:45:23<55:50, 9.08s/it] 94%|█████████▎| 5404/5772 [1:45:29<50:03, 8.16s/it] 94%|█████████▎| 5404/5772 [1:45:26<50:03, 8.16s/it] {'loss': 0.4649, 'learning_rate': 2.1249829867449723e-07, 'epoch': 0.94} + 94%|█████████▎| 5404/5772 [1:45:29<50:03, 8.16s/it] {'loss': 0.4649, 'learning_rate': 2.1249829867449723e-07, 'epoch': 0.94} + 94%|█████████▎| 5404/5772 [1:45:26<50:03, 8.16s/it] 94%|█████████▎| 5405/5772 [1:45:35<45:48, 7.49s/it] 94%|█████████▎| 5405/5772 [1:45:32<45:48, 7.49s/it] {'loss': 0.4483, 'learning_rate': 2.11349065600448e-07, 'epoch': 0.94} + 94%|█████████▎| 5405/5772 [1:45:35<45:48, 7.49s/it] {'loss': 0.4483, 'learning_rate': 2.11349065600448e-07, 'epoch': 0.94} + 94%|█████████▎| 5405/5772 [1:45:32<45:48, 7.49s/it] 94%|█████████▎| 5406/5772 [1:45:41<42:42, 7.00s/it] 94%|█████████▎| 5406/5772 [1:45:38<42:42, 7.00s/it] {'loss': 0.4481, 'learning_rate': 2.1020291540758352e-07, 'epoch': 0.94} + 94%|█████████▎| 5406/5772 [1:45:41<42:42, 7.00s/it] {'loss': 0.4481, 'learning_rate': 2.1020291540758352e-07, 'epoch': 0.94} + 94%|█████████▎| 5406/5772 [1:45:38<42:42, 7.00s/it] 94%|█████████▎| 5407/5772 [1:45:47<41:17, 6.79s/it] 94%|█████████▎| 5407/5772 [1:45:44<41:17, 6.79s/it] {'loss': 0.4496, 'learning_rate': 2.0905984845687954e-07, 'epoch': 0.94} + 94%|█████████▎| 5407/5772 [1:45:47<41:17, 6.79s/it] {'loss': 0.4496, 'learning_rate': 2.0905984845687954e-07, 'epoch': 0.94} + 94%|█████████▎| 5407/5772 [1:45:44<41:17, 6.79s/it] 94%|█████████▎| 5408/5772 [1:45:54<40:25, 6.66s/it] 94%|█████████▎| 5408/5772 [1:45:51<40:25, 6.66s/it] {'loss': 0.4565, 'learning_rate': 2.0791986510833918e-07, 'epoch': 0.94} + 94%|█████████▎| 5408/5772 [1:45:54<40:25, 6.66s/it] {'loss': 0.4565, 'learning_rate': 2.0791986510833918e-07, 'epoch': 0.94} + 94%|█████████▎| 5408/5772 [1:45:51<40:25, 6.66s/it] 94%|█████████▎| 5409/5772 [1:46:01<40:41, 6.72s/it] 94%|█████████▎| 5409/5772 [1:45:58<40:41, 6.72s/it] {'loss': 0.466, 'learning_rate': 2.067829657209941e-07, 'epoch': 0.94} + 94%|█████████▎| 5409/5772 [1:46:01<40:41, 6.72s/it] {'loss': 0.466, 'learning_rate': 2.067829657209941e-07, 'epoch': 0.94} + 94%|█████████▎| 5409/5772 [1:45:58<40:41, 6.72s/it] 94%|█████████▎| 5410/5772 [1:46:07<39:56, 6.62s/it] 94%|█████████▎| 5410/5772 [1:46:04<39:56, 6.62s/it] {'loss': 0.4785, 'learning_rate': 2.0564915065290237e-07, 'epoch': 0.94} + 94%|█████████▎| 5410/5772 [1:46:07<39:56, 6.62s/it] {'loss': 0.4785, 'learning_rate': 2.0564915065290237e-07, 'epoch': 0.94} + 94%|█████████▎| 5410/5772 [1:46:04<39:56, 6.62s/it] 94%|█████████▎| 5411/5772 [1:46:13<38:37, 6.42s/it] 94%|█████████▎| 5411/5772 [1:46:10<38:37, 6.42s/it] {'loss': 0.4514, 'learning_rate': 2.0451842026115277e-07, 'epoch': 0.94} + 94%|█████████▎| 5411/5772 [1:46:13<38:37, 6.42s/it] {'loss': 0.4514, 'learning_rate': 2.0451842026115277e-07, 'epoch': 0.94} + 94%|█████████▎| 5411/5772 [1:46:10<38:37, 6.42s/it] 94%|█████████▍| 5412/5772 [1:46:19<37:41, 6.28s/it] 94%|█████████▍| 5412/5772 [1:46:16<37:41, 6.28s/it] {'loss': 0.4763, 'learning_rate': 2.0339077490186488e-07, 'epoch': 0.94} + 94%|█████████▍| 5412/5772 [1:46:19<37:41, 6.28s/it] {'loss': 0.4763, 'learning_rate': 2.0339077490186488e-07, 'epoch': 0.94} + 94%|█████████▍| 5412/5772 [1:46:16<37:41, 6.28s/it] 94%|█████████▍| 5413/5772 [1:46:25<37:05, 6.20s/it] 94%|█████████▍| 5413/5772 [1:46:22<37:05, 6.20s/it] {'loss': 0.4583, 'learning_rate': 2.022662149301824e-07, 'epoch': 0.94} + 94%|█████████▍| 5413/5772 [1:46:25<37:05, 6.20s/it] {'loss': 0.4583, 'learning_rate': 2.022662149301824e-07, 'epoch': 0.94} + 94%|█████████▍| 5413/5772 [1:46:22<37:05, 6.20s/it] 94%|█████████▍| 5414/5772 [1:46:31<37:09, 6.23s/it] 94%|█████████▍| 5414/5772 [1:46:28<37:09, 6.23s/it] {'loss': 0.4565, 'learning_rate': 2.011447407002809e-07, 'epoch': 0.94} + 94%|█████████▍| 5414/5772 [1:46:31<37:09, 6.23s/it] {'loss': 0.4565, 'learning_rate': 2.011447407002809e-07, 'epoch': 0.94} + 94%|█████████▍| 5414/5772 [1:46:28<37:09, 6.23s/it] 94%|█████████▍| 5415/5772 [1:46:37<36:33, 6.14s/it] 94%|█████████▍| 5415/5772 [1:46:34<36:33, 6.14s/it] {'loss': 0.4539, 'learning_rate': 2.0002635256536008e-07, 'epoch': 0.94} + 94%|█████████▍| 5415/5772 [1:46:37<36:33, 6.14s/it] {'loss': 0.4539, 'learning_rate': 2.0002635256536008e-07, 'epoch': 0.94} + 94%|█████████▍| 5415/5772 [1:46:34<36:33, 6.14s/it] 94%|█████████▍| 5416/5772 [1:46:43<36:33, 6.16s/it] {'loss': 0.4502, 'learning_rate': 1.9891105087765371e-07, 'epoch': 0.94} + 94%|█████████▍| 5416/5772 [1:46:43<36:33, 6.16s/it] 94%|█████████▍| 5416/5772 [1:46:40<36:33, 6.16s/it] {'loss': 0.4502, 'learning_rate': 1.9891105087765371e-07, 'epoch': 0.94} + 94%|█████████▍| 5416/5772 [1:46:40<36:33, 6.16s/it] 94%|█████████▍| 5417/5772 [1:46:49<36:10, 6.11s/it] 94%|█████████▍| 5417/5772 [1:46:46<36:09, 6.11s/it] {'loss': 0.4453, 'learning_rate': 1.977988359884153e-07, 'epoch': 0.94} + 94%|█████████▍| 5417/5772 [1:46:49<36:10, 6.11s/it] {'loss': 0.4453, 'learning_rate': 1.977988359884153e-07, 'epoch': 0.94} + 94%|█████████▍| 5417/5772 [1:46:46<36:09, 6.11s/it] 94%|█████████▍| 5418/5772 [1:46:55<35:48, 6.07s/it] 94%|█████████▍| 5418/5772 [1:46:52<35:48, 6.07s/it] {'loss': 0.4491, 'learning_rate': 1.9668970824793355e-07, 'epoch': 0.94} + 94%|█████████▍| 5418/5772 [1:46:55<35:48, 6.07s/it] {'loss': 0.4491, 'learning_rate': 1.9668970824793355e-07, 'epoch': 0.94} + 94%|█████████▍| 5418/5772 [1:46:52<35:48, 6.07s/it] 94%|█████████▍| 5419/5772 [1:47:01<35:30, 6.04s/it] 94%|█████████▍| 5419/5772 [1:46:58<35:30, 6.04s/it] {'loss': 0.4513, 'learning_rate': 1.955836680055223e-07, 'epoch': 0.94} + 94%|█████████▍| 5419/5772 [1:47:01<35:30, 6.04s/it] {'loss': 0.4513, 'learning_rate': 1.955836680055223e-07, 'epoch': 0.94} + 94%|█████████▍| 5419/5772 [1:46:58<35:30, 6.04s/it] 94%|█████████▍| 5420/5772 [1:47:07<35:36, 6.07s/it] 94%|█████████▍| 5420/5772 [1:47:04<35:36, 6.07s/it] {'loss': 0.4656, 'learning_rate': 1.9448071560952187e-07, 'epoch': 0.94} + 94%|█████████▍| 5420/5772 [1:47:07<35:36, 6.07s/it] {'loss': 0.4656, 'learning_rate': 1.9448071560952187e-07, 'epoch': 0.94} + 94%|█████████▍| 5420/5772 [1:47:04<35:36, 6.07s/it] 94%|█████████▍| 5421/5772 [1:47:14<36:24, 6.22s/it] 94%|█████████▍| 5421/5772 [1:47:11<36:24, 6.22s/it] {'loss': 0.453, 'learning_rate': 1.93380851407301e-07, 'epoch': 0.94} + 94%|█████████▍| 5421/5772 [1:47:14<36:24, 6.22s/it] {'loss': 0.453, 'learning_rate': 1.93380851407301e-07, 'epoch': 0.94} + 94%|█████████▍| 5421/5772 [1:47:11<36:24, 6.22s/it] 94%|█████████▍| 5422/5772 [1:47:20<35:33, 6.09s/it] 94%|█████████▍| 5422/5772 [1:47:17<35:33, 6.09s/it] {'loss': 0.4534, 'learning_rate': 1.92284075745256e-07, 'epoch': 0.94} + 94%|█████████▍| 5422/5772 [1:47:20<35:33, 6.09s/it] {'loss': 0.4534, 'learning_rate': 1.92284075745256e-07, 'epoch': 0.94} + 94%|█████████▍| 5422/5772 [1:47:17<35:33, 6.09s/it] 94%|█████████▍| 5423/5772 [1:47:26<35:58, 6.18s/it] 94%|█████████▍| 5423/5772 [1:47:23<35:58, 6.18s/it] {'loss': 0.4653, 'learning_rate': 1.9119038896880938e-07, 'epoch': 0.94} + 94%|█████████▍| 5423/5772 [1:47:26<35:58, 6.18s/it] {'loss': 0.4653, 'learning_rate': 1.9119038896880938e-07, 'epoch': 0.94} + 94%|█████████▍| 5423/5772 [1:47:23<35:58, 6.18s/it] 94%|█████████▍| 5424/5772 [1:47:32<35:20, 6.09s/it] 94%|█████████▍| 5424/5772 [1:47:29<35:20, 6.09s/it] {'loss': 0.4533, 'learning_rate': 1.9009979142241453e-07, 'epoch': 0.94} + 94%|█████████▍| 5424/5772 [1:47:32<35:20, 6.09s/it] {'loss': 0.4533, 'learning_rate': 1.9009979142241453e-07, 'epoch': 0.94} + 94%|█████████▍| 5424/5772 [1:47:29<35:20, 6.09s/it] 94%|█████████▍| 5425/5772 [1:47:38<35:40, 6.17s/it] 94%|█████████▍| 5425/5772 [1:47:35<35:40, 6.17s/it] {'loss': 0.4484, 'learning_rate': 1.8901228344954558e-07, 'epoch': 0.94} + 94%|█████████▍| 5425/5772 [1:47:38<35:40, 6.17s/it] {'loss': 0.4484, 'learning_rate': 1.8901228344954558e-07, 'epoch': 0.94} + 94%|█████████▍| 5425/5772 [1:47:35<35:40, 6.17s/it] 94%|█████████▍| 5426/5772 [1:47:44<35:08, 6.09s/it] 94%|█████████▍| 5426/5772 [1:47:41<35:08, 6.09s/it] {'loss': 0.4684, 'learning_rate': 1.8792786539270967e-07, 'epoch': 0.94} + 94%|█████████▍| 5426/5772 [1:47:44<35:08, 6.09s/it] {'loss': 0.4684, 'learning_rate': 1.8792786539270967e-07, 'epoch': 0.94} + 94%|█████████▍| 5426/5772 [1:47:41<35:08, 6.09s/it] 94%|█████████▍| 5427/5772 [1:47:50<34:52, 6.06s/it] 94%|█████████▍| 5427/5772 [1:47:47<34:51, 6.06s/it] {'loss': 0.4482, 'learning_rate': 1.8684653759343586e-07, 'epoch': 0.94} + 94%|█████████▍| 5427/5772 [1:47:50<34:52, 6.06s/it] {'loss': 0.4482, 'learning_rate': 1.8684653759343586e-07, 'epoch': 0.94} + 94%|█████████▍| 5427/5772 [1:47:47<34:51, 6.06s/it] 94%|█████████▍| 5428/5772 [1:47:57<35:16, 6.15s/it] 94%|█████████▍| 5428/5772 [1:47:54<35:16, 6.15s/it] {'loss': 0.4614, 'learning_rate': 1.85768300392285e-07, 'epoch': 0.94} + 94%|█████████▍| 5428/5772 [1:47:57<35:16, 6.15s/it] {'loss': 0.4614, 'learning_rate': 1.85768300392285e-07, 'epoch': 0.94} + 94%|█████████▍| 5428/5772 [1:47:54<35:16, 6.15s/it] 94%|█████████▍| 5429/5772 [1:48:03<34:52, 6.10s/it] 94%|█████████▍| 5429/5772 [1:48:00<34:52, 6.10s/it] {'loss': 0.4721, 'learning_rate': 1.8469315412883882e-07, 'epoch': 0.94} + 94%|█████████▍| 5429/5772 [1:48:03<34:52, 6.10s/it] {'loss': 0.4721, 'learning_rate': 1.8469315412883882e-07, 'epoch': 0.94} + 94%|█████████▍| 5429/5772 [1:48:00<34:52, 6.10s/it] 94%|█████████▍| 5430/5772 [1:48:09<34:43, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.83621099141712e-07, 'epoch': 0.94} + 94%|█████████▍| 5430/5772 [1:48:09<34:43, 6.09s/it] 94%|█████████▍| 5430/5772 [1:48:06<34:43, 6.09s/it] {'loss': 0.4699, 'learning_rate': 1.83621099141712e-07, 'epoch': 0.94} + 94%|█████████▍| 5430/5772 [1:48:06<34:43, 6.09s/it] 94%|█████████▍| 5431/5772 [1:48:15<34:43, 6.11s/it] 94%|█████████▍| 5431/5772 [1:48:12<34:43, 6.11s/it] {'loss': 0.4545, 'learning_rate': 1.8255213576854115e-07, 'epoch': 0.94} + 94%|█████████▍| 5431/5772 [1:48:15<34:43, 6.11s/it] {'loss': 0.4545, 'learning_rate': 1.8255213576854115e-07, 'epoch': 0.94} + 94%|█████████▍| 5431/5772 [1:48:12<34:43, 6.11s/it] 94%|█████████▍| 5432/5772 [1:48:21<34:27, 6.08s/it] 94%|█████████▍| 5432/5772 [1:48:18<34:27, 6.08s/it] {'loss': 0.4776, 'learning_rate': 1.8148626434598916e-07, 'epoch': 0.94} + 94%|█████████▍| 5432/5772 [1:48:21<34:27, 6.08s/it] {'loss': 0.4776, 'learning_rate': 1.8148626434598916e-07, 'epoch': 0.94} + 94%|█████████▍| 5432/5772 [1:48:18<34:27, 6.08s/it] 94%|█████████▍| 5433/5772 [1:48:27<34:03, 6.03s/it] 94%|█████████▍| 5433/5772 [1:48:24<34:03, 6.03s/it] {'loss': 0.4429, 'learning_rate': 1.804234852097464e-07, 'epoch': 0.94} + 94%|█████████▍| 5433/5772 [1:48:27<34:03, 6.03s/it] {'loss': 0.4429, 'learning_rate': 1.804234852097464e-07, 'epoch': 0.94} + 94%|█████████▍| 5433/5772 [1:48:24<34:03, 6.03s/it] 94%|█████████▍| 5434/5772 [1:48:33<34:04, 6.05s/it] 94%|█████████▍| 5434/5772 [1:48:30<34:04, 6.05s/it] {'loss': 0.4531, 'learning_rate': 1.793637986945307e-07, 'epoch': 0.94} + 94%|█████████▍| 5434/5772 [1:48:33<34:04, 6.05s/it] {'loss': 0.4531, 'learning_rate': 1.793637986945307e-07, 'epoch': 0.94} + 94%|█████████▍| 5434/5772 [1:48:30<34:04, 6.05s/it] 94%|█████████▍| 5435/5772 [1:48:39<34:22, 6.12s/it] 94%|█████████▍| 5435/5772 [1:48:36<34:22, 6.12s/it] {'loss': 0.4535, 'learning_rate': 1.7830720513408395e-07, 'epoch': 0.94} + 94%|█████████▍| 5435/5772 [1:48:39<34:22, 6.12s/it] {'loss': 0.4535, 'learning_rate': 1.7830720513408395e-07, 'epoch': 0.94} + 94%|█████████▍| 5435/5772 [1:48:36<34:22, 6.12s/it] 94%|█████████▍| 5436/5772 [1:48:46<34:52, 6.23s/it] 94%|█████████▍| 5436/5772 [1:48:43<34:52, 6.23s/it] {'loss': 0.4703, 'learning_rate': 1.7725370486117333e-07, 'epoch': 0.94} + 94%|█████████▍| 5436/5772 [1:48:46<34:52, 6.23s/it] {'loss': 0.4703, 'learning_rate': 1.7725370486117333e-07, 'epoch': 0.94} + 94%|█████████▍| 5436/5772 [1:48:43<34:52, 6.23s/it] 94%|█████████▍| 5437/5772 [1:48:52<35:26, 6.35s/it] 94%|█████████▍| 5437/5772 [1:48:49<35:26, 6.35s/it] {'loss': 0.4514, 'learning_rate': 1.762032982075934e-07, 'epoch': 0.94} + 94%|█████████▍| 5437/5772 [1:48:52<35:26, 6.35s/it] {'loss': 0.4514, 'learning_rate': 1.762032982075934e-07, 'epoch': 0.94} + 94%|█████████▍| 5437/5772 [1:48:49<35:26, 6.35s/it] 94%|█████████▍| 5438/5772 [1:48:58<34:49, 6.26s/it] 94%|█████████▍| 5438/5772 [1:48:55<34:49, 6.26s/it] {'loss': 0.464, 'learning_rate': 1.7515598550416625e-07, 'epoch': 0.94} + 94%|█████████▍| 5438/5772 [1:48:58<34:49, 6.26s/it] {'loss': 0.464, 'learning_rate': 1.7515598550416625e-07, 'epoch': 0.94} + 94%|█████████▍| 5438/5772 [1:48:55<34:49, 6.26s/it] 94%|█████████▍| 5439/5772 [1:49:04<34:35, 6.23s/it] 94%|█████████▍| 5439/5772 [1:49:01<34:35, 6.23s/it] {'loss': 0.4535, 'learning_rate': 1.741117670807335e-07, 'epoch': 0.94} + 94%|█████████▍| 5439/5772 [1:49:05<34:35, 6.23s/it] {'loss': 0.4535, 'learning_rate': 1.741117670807335e-07, 'epoch': 0.94} + 94%|█████████▍| 5439/5772 [1:49:01<34:35, 6.23s/it] 94%|█████████▍| 5440/5772 [1:49:11<34:27, 6.23s/it] 94%|█████████▍| 5440/5772 [1:49:08<34:27, 6.23s/it] {'loss': 0.4614, 'learning_rate': 1.7307064326616775e-07, 'epoch': 0.94} + 94%|█████████▍| 5440/5772 [1:49:11<34:27, 6.23s/it] {'loss': 0.4614, 'learning_rate': 1.7307064326616775e-07, 'epoch': 0.94} + 94%|█████████▍| 5440/5772 [1:49:08<34:27, 6.23s/it] 94%|█████████▍| 5441/5772 [1:49:17<34:31, 6.26s/it] 94%|█████████▍| 5441/5772 [1:49:14<34:31, 6.26s/it] {'loss': 0.4606, 'learning_rate': 1.7203261438836439e-07, 'epoch': 0.94} + 94%|█████████▍| 5441/5772 [1:49:17<34:31, 6.26s/it] {'loss': 0.4606, 'learning_rate': 1.7203261438836439e-07, 'epoch': 0.94} + 94%|█████████▍| 5441/5772 [1:49:14<34:31, 6.26s/it] 94%|█████████▍| 5442/5772 [1:49:23<33:46, 6.14s/it] 94%|█████████▍| 5442/5772 [1:49:20<33:46, 6.14s/it] {'loss': 0.4658, 'learning_rate': 1.709976807742475e-07, 'epoch': 0.94} + 94%|█████████▍| 5442/5772 [1:49:23<33:46, 6.14s/it] {'loss': 0.4658, 'learning_rate': 1.709976807742475e-07, 'epoch': 0.94} + 94%|█████████▍| 5442/5772 [1:49:20<33:46, 6.14s/it] 94%|█████████▍| 5443/5772 [1:49:29<33:27, 6.10s/it] 94%|█████████▍| 5443/5772 [1:49:26<33:27, 6.10s/it] {'loss': 0.4466, 'learning_rate': 1.699658427497597e-07, 'epoch': 0.94} + {'loss': 0.4466, 'learning_rate': 1.699658427497597e-07, 'epoch': 0.94} + 94%|█████████▍| 5443/5772 [1:49:29<33:27, 6.10s/it] 94%|█████████▍| 5443/5772 [1:49:26<33:27, 6.10s/it] 94%|█████████▍| 5444/5772 [1:49:35<32:56, 6.03s/it] 94%|█████████▍| 5444/5772 [1:49:32<32:56, 6.03s/it] {'loss': 0.4664, 'learning_rate': 1.6893710063987433e-07, 'epoch': 0.94} + 94%|█████████▍| 5444/5772 [1:49:35<32:56, 6.03s/it] {'loss': 0.4664, 'learning_rate': 1.6893710063987433e-07, 'epoch': 0.94} + 94%|█████████▍| 5444/5772 [1:49:32<32:56, 6.03s/it] 94%|█████████▍| 5445/5772 [1:49:41<33:15, 6.10s/it] 94%|█████████▍| 5445/5772 [1:49:38<33:15, 6.10s/it] {'loss': 0.4565, 'learning_rate': 1.6791145476858894e-07, 'epoch': 0.94} + 94%|█████████▍| 5445/5772 [1:49:41<33:15, 6.10s/it] {'loss': 0.4565, 'learning_rate': 1.6791145476858894e-07, 'epoch': 0.94} + 94%|█████████▍| 5445/5772 [1:49:38<33:15, 6.10s/it] 94%|█████████▍| 5446/5772 [1:49:47<32:57, 6.06s/it] 94%|█████████▍| 5446/5772 [1:49:44<32:57, 6.06s/it] {'loss': 0.4685, 'learning_rate': 1.66888905458924e-07, 'epoch': 0.94} + 94%|█████████▍| 5446/5772 [1:49:47<32:57, 6.06s/it] {'loss': 0.4685, 'learning_rate': 1.66888905458924e-07, 'epoch': 0.94} + 94%|█████████▍| 5446/5772 [1:49:44<32:57, 6.06s/it] 94%|█████████▍| 5447/5772 [1:49:53<32:34, 6.01s/it] 94%|█████████▍| 5447/5772 [1:49:50<32:34, 6.01s/it] {'loss': 0.4544, 'learning_rate': 1.6586945303292633e-07, 'epoch': 0.94} + 94%|█████████▍| 5447/5772 [1:49:53<32:34, 6.01s/it] {'loss': 0.4544, 'learning_rate': 1.6586945303292633e-07, 'epoch': 0.94} + 94%|█████████▍| 5447/5772 [1:49:50<32:34, 6.01s/it] 94%|█████████▍| 5448/5772 [1:49:59<33:11, 6.15s/it] 94%|█████████▍| 5448/5772 [1:49:56<33:11, 6.15s/it] {'loss': 0.4624, 'learning_rate': 1.648530978116658e-07, 'epoch': 0.94} + 94%|█████████▍| 5448/5772 [1:49:59<33:11, 6.15s/it] {'loss': 0.4624, 'learning_rate': 1.648530978116658e-07, 'epoch': 0.94} + 94%|█████████▍| 5448/5772 [1:49:56<33:11, 6.15s/it] 94%|█████████▍| 5449/5772 [1:50:05<32:39, 6.07s/it] 94%|█████████▍| 5449/5772 [1:50:02<32:39, 6.07s/it] {'loss': 0.4592, 'learning_rate': 1.6383984011523967e-07, 'epoch': 0.94} + 94%|█████████▍| 5449/5772 [1:50:05<32:39, 6.07s/it] {'loss': 0.4592, 'learning_rate': 1.6383984011523967e-07, 'epoch': 0.94} + 94%|█████████▍| 5449/5772 [1:50:02<32:39, 6.07s/it]14 AutoResumeHook: Checking whether to suspend... +1315 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +811 AutoResumeHook: Checking whether to suspend...7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +12 + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 94%|█████████▍| 5450/5772 [1:50:11<32:11, 6.00s/it]32 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 10 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 94%|█████████▍| 5450/5772 [1:50:08<32:11, 6.00s/it] {'loss': 0.4581, 'learning_rate': 1.6282968026276602e-07, 'epoch': 0.94} + 94%|█████████▍| 5450/5772 [1:50:11<32:11, 6.00s/it] {'loss': 0.4581, 'learning_rate': 1.6282968026276602e-07, 'epoch': 0.94} + 94%|█████████▍| 5450/5772 [1:50:08<32:11, 6.00s/it] 94%|█████████▍| 5451/5772 [1:50:17<32:31, 6.08s/it] 94%|█████████▍| 5451/5772 [1:50:14<32:31, 6.08s/it] {'loss': 0.4612, 'learning_rate': 1.6182261857238812e-07, 'epoch': 0.94} + 94%|█████████▍| 5451/5772 [1:50:17<32:31, 6.08s/it] {'loss': 0.4612, 'learning_rate': 1.6182261857238812e-07, 'epoch': 0.94} + 94%|█████████▍| 5451/5772 [1:50:14<32:31, 6.08s/it] 94%|█████████▍| 5452/5772 [1:50:23<32:24, 6.08s/it] 94%|█████████▍| 5452/5772 [1:50:20<32:24, 6.08s/it] {'loss': 0.4542, 'learning_rate': 1.6081865536127895e-07, 'epoch': 0.94} + 94%|█████████▍| 5452/5772 [1:50:23<32:24, 6.08s/it] {'loss': 0.4542, 'learning_rate': 1.6081865536127895e-07, 'epoch': 0.94} + 94%|█████████▍| 5452/5772 [1:50:20<32:24, 6.08s/it] 94%|█████████▍| 5453/5772 [1:50:29<31:56, 6.01s/it] 94%|█████████▍| 5453/5772 [1:50:26<31:56, 6.01s/it] {'loss': 0.4658, 'learning_rate': 1.5981779094562667e-07, 'epoch': 0.94} + 94%|█████████▍| 5453/5772 [1:50:26<31:56, 6.01s/it]{'loss': 0.4658, 'learning_rate': 1.5981779094562667e-07, 'epoch': 0.94} + 94%|█████████▍| 5453/5772 [1:50:29<31:56, 6.01s/it] 94%|█████████▍| 5454/5772 [1:50:35<32:00, 6.04s/it] 94%|█████████▍| 5454/5772 [1:50:32<32:00, 6.04s/it] {'loss': 0.4668, 'learning_rate': 1.5882002564065025e-07, 'epoch': 0.94} + 94%|█████████▍| 5454/5772 [1:50:35<32:00, 6.04s/it] {'loss': 0.4668, 'learning_rate': 1.5882002564065025e-07, 'epoch': 0.94} + 94%|█████████▍| 5454/5772 [1:50:32<32:00, 6.04s/it] 95%|█████████▍| 5455/5772 [1:50:42<32:15, 6.11s/it] 95%|█████████▍| 5455/5772 [1:50:39<32:15, 6.11s/it] {'loss': 0.4458, 'learning_rate': 1.578253597605872e-07, 'epoch': 0.94} + 95%|█████████▍| 5455/5772 [1:50:42<32:15, 6.11s/it] {'loss': 0.4458, 'learning_rate': 1.578253597605872e-07, 'epoch': 0.94} + 95%|█████████▍| 5455/5772 [1:50:39<32:15, 6.11s/it] 95%|█████████▍| 5456/5772 [1:50:49<33:29, 6.36s/it] 95%|█████████▍| 5456/5772 [1:50:46<33:29, 6.36s/it] {'loss': 0.4646, 'learning_rate': 1.56833793618707e-07, 'epoch': 0.95} + 95%|█████████▍| 5456/5772 [1:50:49<33:29, 6.36s/it] {'loss': 0.4646, 'learning_rate': 1.56833793618707e-07, 'epoch': 0.95} + 95%|█████████▍| 5456/5772 [1:50:46<33:29, 6.36s/it] 95%|█████████▍| 5457/5772 [1:50:55<33:36, 6.40s/it] 95%|█████████▍| 5457/5772 [1:50:52<33:36, 6.40s/it] {'loss': 0.4627, 'learning_rate': 1.558453275272942e-07, 'epoch': 0.95} + 95%|█████████▍| 5457/5772 [1:50:55<33:36, 6.40s/it] {'loss': 0.4627, 'learning_rate': 1.558453275272942e-07, 'epoch': 0.95} + 95%|█████████▍| 5457/5772 [1:50:52<33:36, 6.40s/it] 95%|█████████▍| 5458/5772 [1:51:02<33:48, 6.46s/it] 95%|█████████▍| 5458/5772 [1:50:59<33:48, 6.46s/it] {'loss': 0.4641, 'learning_rate': 1.5485996179766206e-07, 'epoch': 0.95} + 95%|█████████▍| 5458/5772 [1:51:02<33:48, 6.46s/it] {'loss': 0.4641, 'learning_rate': 1.5485996179766206e-07, 'epoch': 0.95} + 95%|█████████▍| 5458/5772 [1:50:59<33:48, 6.46s/it] 95%|█████████▍| 5459/5772 [1:51:08<33:18, 6.39s/it] 95%|█████████▍| 5459/5772 [1:51:05<33:18, 6.39s/it] {'loss': 0.4484, 'learning_rate': 1.5387769674014563e-07, 'epoch': 0.95} + 95%|█████████▍| 5459/5772 [1:51:08<33:18, 6.39s/it] {'loss': 0.4484, 'learning_rate': 1.5387769674014563e-07, 'epoch': 0.95} + 95%|█████████▍| 5459/5772 [1:51:05<33:18, 6.39s/it] 95%|█████████▍| 5460/5772 [1:51:14<33:09, 6.38s/it] 95%|█████████▍| 5460/5772 [1:51:11<33:09, 6.38s/it] {'loss': 0.4618, 'learning_rate': 1.5289853266410416e-07, 'epoch': 0.95} + 95%|█████████▍| 5460/5772 [1:51:14<33:09, 6.38s/it] {'loss': 0.4618, 'learning_rate': 1.5289853266410416e-07, 'epoch': 0.95} + 95%|█████████▍| 5460/5772 [1:51:11<33:09, 6.38s/it] 95%|█████████▍| 5461/5772 [1:51:21<32:55, 6.35s/it] {'loss': 0.4555, 'learning_rate': 1.519224698779198e-07, 'epoch': 0.95} + 95%|█████████▍| 5461/5772 [1:51:21<32:55, 6.35s/it] 95%|█████████▍| 5461/5772 [1:51:18<32:55, 6.35s/it] {'loss': 0.4555, 'learning_rate': 1.519224698779198e-07, 'epoch': 0.95} + 95%|█████████▍| 5461/5772 [1:51:18<32:55, 6.35s/it] 95%|█████████▍| 5462/5772 [1:51:27<32:55, 6.37s/it] 95%|█████████▍| 5462/5772 [1:51:24<32:55, 6.37s/it] {'loss': 0.4498, 'learning_rate': 1.5094950868899672e-07, 'epoch': 0.95} + 95%|█████████▍| 5462/5772 [1:51:27<32:55, 6.37s/it] {'loss': 0.4498, 'learning_rate': 1.5094950868899672e-07, 'epoch': 0.95} + 95%|█████████▍| 5462/5772 [1:51:24<32:55, 6.37s/it] 95%|█████████▍| 5463/5772 [1:51:33<32:17, 6.27s/it] 95%|█████████▍| 5463/5772 [1:51:30<32:17, 6.27s/it] {'loss': 0.4479, 'learning_rate': 1.4997964940376752e-07, 'epoch': 0.95} + 95%|█████████▍| 5463/5772 [1:51:33<32:17, 6.27s/it] {'loss': 0.4479, 'learning_rate': 1.4997964940376752e-07, 'epoch': 0.95} + 95%|█████████▍| 5463/5772 [1:51:30<32:17, 6.27s/it] 95%|█████████▍| 5464/5772 [1:51:39<32:24, 6.31s/it] {'loss': 0.4602, 'learning_rate': 1.4901289232767903e-07, 'epoch': 0.95} + 95%|█████████▍| 5464/5772 [1:51:39<32:24, 6.31s/it] 95%|█████████▍| 5464/5772 [1:51:36<32:24, 6.31s/it] {'loss': 0.4602, 'learning_rate': 1.4901289232767903e-07, 'epoch': 0.95} + 95%|█████████▍| 5464/5772 [1:51:36<32:24, 6.31s/it] 95%|█████████▍| 5465/5772 [1:51:45<31:46, 6.21s/it] 95%|█████████▍| 5465/5772 [1:51:42<31:46, 6.21s/it] {'loss': 0.4452, 'learning_rate': 1.4804923776520985e-07, 'epoch': 0.95} + 95%|█████████▍| 5465/5772 [1:51:45<31:46, 6.21s/it] {'loss': 0.4452, 'learning_rate': 1.4804923776520985e-07, 'epoch': 0.95} + 95%|█████████▍| 5465/5772 [1:51:42<31:46, 6.21s/it] 95%|█████████▍| 5466/5772 [1:51:51<31:07, 6.10s/it] 95%|█████████▍| 5466/5772 [1:51:48<31:07, 6.10s/it] {'loss': 0.4645, 'learning_rate': 1.4708868601985503e-07, 'epoch': 0.95} + 95%|█████████▍| 5466/5772 [1:51:51<31:07, 6.10s/it] {'loss': 0.4645, 'learning_rate': 1.4708868601985503e-07, 'epoch': 0.95} + 95%|█████████▍| 5466/5772 [1:51:48<31:07, 6.10s/it] 95%|█████████▍| 5467/5772 [1:51:57<31:06, 6.12s/it] {'loss': 0.4587, 'learning_rate': 1.4613123739413704e-07, 'epoch': 0.95} + 95%|█████████▍| 5467/5772 [1:51:57<31:06, 6.12s/it] 95%|█████████▍| 5467/5772 [1:51:54<31:06, 6.12s/it] {'loss': 0.4587, 'learning_rate': 1.4613123739413704e-07, 'epoch': 0.95} + 95%|█████████▍| 5467/5772 [1:51:54<31:06, 6.12s/it] 95%|█████████▍| 5468/5772 [1:52:03<30:50, 6.09s/it] 95%|█████████▍| 5468/5772 [1:52:00<30:50, 6.09s/it] {'loss': 0.465, 'learning_rate': 1.4517689218959907e-07, 'epoch': 0.95} + 95%|█████████▍| 5468/5772 [1:52:03<30:50, 6.09s/it] {'loss': 0.465, 'learning_rate': 1.4517689218959907e-07, 'epoch': 0.95} + 95%|█████████▍| 5468/5772 [1:52:00<30:50, 6.09s/it] 95%|█████████▍| 5469/5772 [1:52:09<30:43, 6.08s/it] 95%|█████████▍| 5469/5772 [1:52:06<30:43, 6.08s/it] {'loss': 0.4604, 'learning_rate': 1.4422565070680406e-07, 'epoch': 0.95} + 95%|█████████▍| 5469/5772 [1:52:09<30:43, 6.08s/it] {'loss': 0.4604, 'learning_rate': 1.4422565070680406e-07, 'epoch': 0.95} + 95%|█████████▍| 5469/5772 [1:52:06<30:43, 6.08s/it] 95%|█████████▍| 5470/5772 [1:52:16<31:02, 6.17s/it] 95%|█████████▍| 5470/5772 [1:52:13<31:02, 6.17s/it] {'loss': 0.4602, 'learning_rate': 1.4327751324534233e-07, 'epoch': 0.95} + 95%|█████████▍| 5470/5772 [1:52:16<31:02, 6.17s/it] {'loss': 0.4602, 'learning_rate': 1.4327751324534233e-07, 'epoch': 0.95} + 95%|█████████▍| 5470/5772 [1:52:13<31:02, 6.17s/it] 95%|█████████▍| 5471/5772 [1:52:22<30:56, 6.17s/it] 95%|█████████▍| 5471/5772 [1:52:19<30:56, 6.17s/it] {'loss': 0.4438, 'learning_rate': 1.4233248010382506e-07, 'epoch': 0.95} + 95%|█████████▍| 5471/5772 [1:52:22<30:56, 6.17s/it] {'loss': 0.4438, 'learning_rate': 1.4233248010382506e-07, 'epoch': 0.95} + 95%|█████████▍| 5471/5772 [1:52:19<30:56, 6.17s/it] 95%|█████████▍| 5472/5772 [1:52:28<31:11, 6.24s/it] 95%|█████████▍| 5472/5772 [1:52:25<31:11, 6.24s/it] {'loss': 0.4658, 'learning_rate': 1.4139055157988303e-07, 'epoch': 0.95} + 95%|█████████▍| 5472/5772 [1:52:28<31:11, 6.24s/it] {'loss': 0.4658, 'learning_rate': 1.4139055157988303e-07, 'epoch': 0.95} + 95%|█████████▍| 5472/5772 [1:52:25<31:11, 6.24s/it] 95%|█████████▍| 5473/5772 [1:52:35<31:01, 6.23s/it] 95%|█████████▍| 5473/5772 [1:52:32<31:01, 6.23s/it] {'loss': 0.4561, 'learning_rate': 1.4045172797017336e-07, 'epoch': 0.95} + {'loss': 0.4561, 'learning_rate': 1.4045172797017336e-07, 'epoch': 0.95} 95%|█████████▍| 5473/5772 [1:52:35<31:01, 6.23s/it] + 95%|█████████▍| 5473/5772 [1:52:32<31:01, 6.23s/it] 95%|█████████▍| 5474/5772 [1:52:41<30:37, 6.17s/it] 95%|█████████▍| 5474/5772 [1:52:38<30:37, 6.17s/it]{'loss': 0.4513, 'learning_rate': 1.3951600957037292e-07, 'epoch': 0.95} + 95%|█████████▍| 5474/5772 [1:52:41<30:37, 6.17s/it] {'loss': 0.4513, 'learning_rate': 1.3951600957037292e-07, 'epoch': 0.95} + 95%|█████████▍| 5474/5772 [1:52:38<30:37, 6.17s/it] 95%|█████████▍| 5475/5772 [1:52:47<30:14, 6.11s/it] 95%|█████████▍| 5475/5772 [1:52:44<30:14, 6.11s/it] {'loss': 0.458, 'learning_rate': 1.385833966751815e-07, 'epoch': 0.95} + 95%|█████████▍| 5475/5772 [1:52:47<30:14, 6.11s/it] {'loss': 0.458, 'learning_rate': 1.385833966751815e-07, 'epoch': 0.95} + 95%|█████████▍| 5475/5772 [1:52:44<30:14, 6.11s/it] 95%|█████████▍| 5476/5772 [1:52:53<30:04, 6.10s/it] 95%|█████████▍| 5476/5772 [1:52:50<30:04, 6.10s/it] {'loss': 0.4642, 'learning_rate': 1.376538895783186e-07, 'epoch': 0.95} + 95%|█████████▍| 5476/5772 [1:52:53<30:04, 6.10s/it] {'loss': 0.4642, 'learning_rate': 1.376538895783186e-07, 'epoch': 0.95} + 95%|█████████▍| 5476/5772 [1:52:50<30:04, 6.10s/it] 95%|█████████▍| 5477/5772 [1:52:59<30:11, 6.14s/it] 95%|█████████▍| 5477/5772 [1:52:56<30:11, 6.14s/it]{'loss': 0.4616, 'learning_rate': 1.3672748857252783e-07, 'epoch': 0.95} + {'loss': 0.4616, 'learning_rate': 1.3672748857252783e-07, 'epoch': 0.95} + 95%|█████████▍| 5477/5772 [1:52:59<30:11, 6.14s/it] 95%|█████████▍| 5477/5772 [1:52:56<30:11, 6.14s/it] 95%|█████████▍| 5478/5772 [1:53:05<30:21, 6.20s/it] 95%|█████████▍| 5478/5772 [1:53:02<30:21, 6.20s/it] {'loss': 0.4598, 'learning_rate': 1.358041939495758e-07, 'epoch': 0.95} + 95%|█████████▍| 5478/5772 [1:53:05<30:21, 6.20s/it] {'loss': 0.4598, 'learning_rate': 1.358041939495758e-07, 'epoch': 0.95} + 95%|█████████▍| 5478/5772 [1:53:02<30:21, 6.20s/it] 95%|█████████▍| 5479/5772 [1:53:11<30:02, 6.15s/it] 95%|█████████▍| 5479/5772 [1:53:08<30:02, 6.15s/it] {'loss': 0.45, 'learning_rate': 1.3488400600024654e-07, 'epoch': 0.95} + 95%|█████████▍| 5479/5772 [1:53:11<30:02, 6.15s/it] {'loss': 0.45, 'learning_rate': 1.3488400600024654e-07, 'epoch': 0.95} + 95%|█████████▍| 5479/5772 [1:53:08<30:02, 6.15s/it] 95%|█████████▍| 5480/5772 [1:53:17<29:44, 6.11s/it] 95%|█████████▍| 5480/5772 [1:53:14<29:44, 6.11s/it] {'loss': 0.4489, 'learning_rate': 1.339669250143505e-07, 'epoch': 0.95} + 95%|█████████▍| 5480/5772 [1:53:17<29:44, 6.11s/it] {'loss': 0.4489, 'learning_rate': 1.339669250143505e-07, 'epoch': 0.95} + 95%|█████████▍| 5480/5772 [1:53:14<29:44, 6.11s/it] 95%|█████████▍| 5481/5772 [1:53:23<29:38, 6.11s/it] 95%|█████████▍| 5481/5772 [1:53:20<29:38, 6.11s/it] {'loss': 0.4467, 'learning_rate': 1.3305295128071437e-07, 'epoch': 0.95} + 95%|█████████▍| 5481/5772 [1:53:23<29:38, 6.11s/it] {'loss': 0.4467, 'learning_rate': 1.3305295128071437e-07, 'epoch': 0.95} + 95%|█████████▍| 5481/5772 [1:53:20<29:38, 6.11s/it] 95%|█████████▍| 5482/5772 [1:53:30<29:29, 6.10s/it] 95%|█████████▍| 5482/5772 [1:53:26<29:29, 6.10s/it] {'loss': 0.4734, 'learning_rate': 1.321420850871935e-07, 'epoch': 0.95} + 95%|█████████▍| 5482/5772 [1:53:30<29:29, 6.10s/it] {'loss': 0.4734, 'learning_rate': 1.321420850871935e-07, 'epoch': 0.95} + 95%|█████████▍| 5482/5772 [1:53:27<29:29, 6.10s/it] 95%|█████████▍| 5483/5772 [1:53:36<29:21, 6.10s/it] 95%|█████████▍| 5483/5772 [1:53:33<29:21, 6.10s/it] {'loss': 0.4501, 'learning_rate': 1.3123432672065506e-07, 'epoch': 0.95} + 95%|█████████▍| 5483/5772 [1:53:36<29:21, 6.10s/it] {'loss': 0.4501, 'learning_rate': 1.3123432672065506e-07, 'epoch': 0.95} + 95%|█████████▍| 5483/5772 [1:53:33<29:21, 6.10s/it] 95%|█████████▌| 5484/5772 [1:53:42<29:05, 6.06s/it] 95%|█████████▌| 5484/5772 [1:53:39<29:05, 6.06s/it] {'loss': 0.463, 'learning_rate': 1.303296764669959e-07, 'epoch': 0.95} + 95%|█████████▌| 5484/5772 [1:53:42<29:05, 6.06s/it] {'loss': 0.463, 'learning_rate': 1.303296764669959e-07, 'epoch': 0.95} + 95%|█████████▌| 5484/5772 [1:53:39<29:05, 6.06s/it] 95%|█████████▌| 5485/5772 [1:53:48<29:23, 6.15s/it] 95%|█████████▌| 5485/5772 [1:53:45<29:23, 6.15s/it] {'loss': 0.4432, 'learning_rate': 1.2942813461112924e-07, 'epoch': 0.95} + 95%|█████████▌| 5485/5772 [1:53:48<29:23, 6.15s/it] {'loss': 0.4432, 'learning_rate': 1.2942813461112924e-07, 'epoch': 0.95} + 95%|█████████▌| 5485/5772 [1:53:45<29:23, 6.15s/it] 95%|█████████▌| 5486/5772 [1:53:54<29:06, 6.11s/it] 95%|█████████▌| 5486/5772 [1:53:51<29:06, 6.11s/it]{'loss': 0.4676, 'learning_rate': 1.2852970143699129e-07, 'epoch': 0.95} + 95%|█████████▌| 5486/5772 [1:53:54<29:06, 6.11s/it] {'loss': 0.4676, 'learning_rate': 1.2852970143699129e-07, 'epoch': 0.95} + 95%|█████████▌| 5486/5772 [1:53:51<29:06, 6.11s/it] 95%|█████████▌| 5487/5772 [1:54:00<29:37, 6.24s/it] 95%|█████████▌| 5487/5772 [1:53:57<29:37, 6.24s/it] {'loss': 0.453, 'learning_rate': 1.276343772275379e-07, 'epoch': 0.95} + 95%|█████████▌| 5487/5772 [1:54:00<29:37, 6.24s/it] {'loss': 0.453, 'learning_rate': 1.276343772275379e-07, 'epoch': 0.95} + 95%|█████████▌| 5487/5772 [1:53:57<29:37, 6.24s/it] 95%|█████████▌| 5488/5772 [1:54:06<29:04, 6.14s/it] 95%|█████████▌| 5488/5772 [1:54:03<29:04, 6.14s/it] {'loss': 0.4718, 'learning_rate': 1.267421622647469e-07, 'epoch': 0.95} + 95%|█████████▌| 5488/5772 [1:54:06<29:04, 6.14s/it] {'loss': 0.4718, 'learning_rate': 1.267421622647469e-07, 'epoch': 0.95} + 95%|█████████▌| 5488/5772 [1:54:03<29:04, 6.14s/it] 95%|█████████▌| 5489/5772 [1:54:13<29:17, 6.21s/it] 95%|█████████▌| 5489/5772 [1:54:10<29:17, 6.21s/it] {'loss': 0.4574, 'learning_rate': 1.2585305682961679e-07, 'epoch': 0.95} + 95%|█████████▌| 5489/5772 [1:54:13<29:17, 6.21s/it] {'loss': 0.4574, 'learning_rate': 1.2585305682961679e-07, 'epoch': 0.95} + 95%|█████████▌| 5489/5772 [1:54:10<29:17, 6.21s/it] 95%|█████████▌| 5490/5772 [1:54:19<29:38, 6.31s/it] 95%|█████████▌| 5490/5772 [1:54:16<29:38, 6.30s/it] {'loss': 0.4711, 'learning_rate': 1.2496706120216585e-07, 'epoch': 0.95} + 95%|█████████▌| 5490/5772 [1:54:19<29:38, 6.31s/it] {'loss': 0.4711, 'learning_rate': 1.2496706120216585e-07, 'epoch': 0.95} + 95%|█████████▌| 5490/5772 [1:54:16<29:38, 6.30s/it] 95%|█████████▌| 5491/5772 [1:54:26<30:00, 6.41s/it] 95%|█████████▌| 5491/5772 [1:54:23<30:00, 6.41s/it] {'loss': 0.4482, 'learning_rate': 1.2408417566143306e-07, 'epoch': 0.95} + 95%|█████████▌| 5491/5772 [1:54:26<30:00, 6.41s/it] {'loss': 0.4482, 'learning_rate': 1.2408417566143306e-07, 'epoch': 0.95} + 95%|█████████▌| 5491/5772 [1:54:23<30:00, 6.41s/it] 95%|█████████▌| 5492/5772 [1:54:32<29:13, 6.26s/it] 95%|█████████▌| 5492/5772 [1:54:29<29:13, 6.26s/it] {'loss': 0.4482, 'learning_rate': 1.2320440048547933e-07, 'epoch': 0.95} + 95%|█████████▌| 5492/5772 [1:54:32<29:13, 6.26s/it] {'loss': 0.4482, 'learning_rate': 1.2320440048547933e-07, 'epoch': 0.95} + 95%|█████████▌| 5492/5772 [1:54:29<29:13, 6.26s/it] 95%|█████████▌| 5493/5772 [1:54:38<29:08, 6.27s/it] 95%|█████████▌| 5493/5772 [1:54:35<29:08, 6.27s/it] {'loss': 0.4576, 'learning_rate': 1.2232773595138415e-07, 'epoch': 0.95} + 95%|█████████▌| 5493/5772 [1:54:38<29:08, 6.27s/it] {'loss': 0.4576, 'learning_rate': 1.2232773595138415e-07, 'epoch': 0.95} + 95%|█████████▌| 5493/5772 [1:54:35<29:08, 6.27s/it] 95%|█████████▌| 5494/5772 [1:54:44<28:43, 6.20s/it] 95%|█████████▌| 5494/5772 [1:54:41<28:43, 6.20s/it] {'loss': 0.4591, 'learning_rate': 1.2145418233524886e-07, 'epoch': 0.95} + 95%|█████████▌| 5494/5772 [1:54:44<28:43, 6.20s/it] {'loss': 0.4591, 'learning_rate': 1.2145418233524886e-07, 'epoch': 0.95} + 95%|█████████▌| 5494/5772 [1:54:41<28:43, 6.20s/it] 95%|█████████▌| 5495/5772 [1:54:51<28:52, 6.25s/it] 95%|█████████▌| 5495/5772 [1:54:48<28:52, 6.25s/it] {'loss': 0.4595, 'learning_rate': 1.2058373991219341e-07, 'epoch': 0.95} + 95%|█████████▌| 5495/5772 [1:54:51<28:52, 6.25s/it] {'loss': 0.4595, 'learning_rate': 1.2058373991219341e-07, 'epoch': 0.95} + 95%|█████████▌| 5495/5772 [1:54:48<28:52, 6.25s/it] 95%|█████████▌| 5496/5772 [1:54:57<28:58, 6.30s/it] 95%|█████████▌| 5496/5772 [1:54:54<28:58, 6.30s/it] {'loss': 0.4454, 'learning_rate': 1.197164089563596e-07, 'epoch': 0.95} + 95%|█████████▌| 5496/5772 [1:54:57<28:58, 6.30s/it] {'loss': 0.4454, 'learning_rate': 1.197164089563596e-07, 'epoch': 0.95} + 95%|█████████▌| 5496/5772 [1:54:54<28:58, 6.30s/it] 95%|█████████▌| 5497/5772 [1:55:03<28:36, 6.24s/it] 95%|█████████▌| 5497/5772 [1:55:00<28:36, 6.24s/it] {'loss': 0.4451, 'learning_rate': 1.1885218974090895e-07, 'epoch': 0.95} + 95%|█████████▌| 5497/5772 [1:55:03<28:36, 6.24s/it] {'loss': 0.4451, 'learning_rate': 1.1885218974090895e-07, 'epoch': 0.95} + 95%|█████████▌| 5497/5772 [1:55:00<28:36, 6.24s/it] 95%|█████████▌| 5498/5772 [1:55:09<28:14, 6.18s/it] 95%|█████████▌| 5498/5772 [1:55:06<28:14, 6.18s/it] {'loss': 0.4504, 'learning_rate': 1.1799108253802149e-07, 'epoch': 0.95} + 95%|█████████▌| 5498/5772 [1:55:09<28:14, 6.18s/it] {'loss': 0.4504, 'learning_rate': 1.1799108253802149e-07, 'epoch': 0.95} + 95%|█████████▌| 5498/5772 [1:55:06<28:14, 6.18s/it] 95%|█████████▌| 5499/5772 [1:55:15<28:03, 6.17s/it] 95%|█████████▌| 5499/5772 [1:55:12<28:03, 6.17s/it] {'loss': 0.4535, 'learning_rate': 1.1713308761889696e-07, 'epoch': 0.95} + 95%|█████████▌| 5499/5772 [1:55:15<28:03, 6.17s/it] {'loss': 0.4535, 'learning_rate': 1.1713308761889696e-07, 'epoch': 0.95} + 95%|█████████▌| 5499/5772 [1:55:12<28:03, 6.17s/it]9 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +118 AutoResumeHook: Checking whether to suspend... 4 AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 95%|█████████▌| 5500/5772 [1:55:21<27:27, 6.06s/it]12 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 95%|█████████▌| 5500/5772 [1:55:18<27:27, 6.06s/it] {'loss': 0.4496, 'learning_rate': 1.1627820525375811e-07, 'epoch': 0.95} + 95%|█████████▌| 5500/5772 [1:55:21<27:27, 6.06s/it] {'loss': 0.4496, 'learning_rate': 1.1627820525375811e-07, 'epoch': 0.95} + 95%|█████████▌| 5500/5772 [1:55:18<27:27, 6.06s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 95%|█████████▌| 5501/5772 [1:55:45<56:02, 12.41s/it] 95%|█████████▌| 5501/5772 [1:55:48<56:03, 12.41s/it] {'loss': 0.4506, 'learning_rate': 1.1542643571184619e-07, 'epoch': 0.95} + 95%|█████████▌| 5501/5772 [1:55:48<56:03, 12.41s/it] {'loss': 0.4506, 'learning_rate': 1.1542643571184619e-07, 'epoch': 0.95} + 95%|█████████▌| 5501/5772 [1:55:45<56:02, 12.41s/it] 95%|█████████▌| 5502/5772 [1:55:51<47:32, 10.56s/it] 95%|█████████▌| 5502/5772 [1:55:55<47:32, 10.57s/it] {'loss': 0.4528, 'learning_rate': 1.1457777926141889e-07, 'epoch': 0.95} + 95%|█████████▌| 5502/5772 [1:55:55<47:32, 10.57s/it]{'loss': 0.4528, 'learning_rate': 1.1457777926141889e-07, 'epoch': 0.95} + 95%|█████████▌| 5502/5772 [1:55:52<47:32, 10.56s/it] 95%|█████████▌| 5503/5772 [1:55:58<41:33, 9.27s/it] 95%|█████████▌| 5503/5772 [1:56:01<41:33, 9.27s/it] {'loss': 0.454, 'learning_rate': 1.1373223616975681e-07, 'epoch': 0.95} + 95%|█████████▌| 5503/5772 [1:56:01<41:33, 9.27s/it] {'loss': 0.454, 'learning_rate': 1.1373223616975681e-07, 'epoch': 0.95} + 95%|█████████▌| 5503/5772 [1:55:58<41:33, 9.27s/it] 95%|█████████▌| 5504/5772 [1:56:04<37:07, 8.31s/it] 95%|█████████▌| 5504/5772 [1:56:07<37:07, 8.31s/it] {'loss': 0.4688, 'learning_rate': 1.1288980670315918e-07, 'epoch': 0.95} + 95%|█████████▌| 5504/5772 [1:56:07<37:07, 8.31s/it]{'loss': 0.4688, 'learning_rate': 1.1288980670315918e-07, 'epoch': 0.95} + 95%|█████████▌| 5504/5772 [1:56:04<37:07, 8.31s/it] 95%|█████████▌| 5505/5772 [1:56:10<34:15, 7.70s/it] 95%|█████████▌| 5505/5772 [1:56:13<34:16, 7.70s/it] {'loss': 0.4584, 'learning_rate': 1.1205049112694488e-07, 'epoch': 0.95} + 95%|█████████▌| 5505/5772 [1:56:13<34:16, 7.70s/it] {'loss': 0.4584, 'learning_rate': 1.1205049112694488e-07, 'epoch': 0.95} + 95%|█████████▌| 5505/5772 [1:56:10<34:15, 7.70s/it] 95%|█████████▌| 5506/5772 [1:56:16<31:36, 7.13s/it] 95%|█████████▌| 5506/5772 [1:56:19<31:36, 7.13s/it] {'loss': 0.4515, 'learning_rate': 1.1121428970545023e-07, 'epoch': 0.95} + 95%|█████████▌| 5506/5772 [1:56:19<31:36, 7.13s/it] {'loss': 0.4515, 'learning_rate': 1.1121428970545023e-07, 'epoch': 0.95} + 95%|█████████▌| 5506/5772 [1:56:16<31:36, 7.13s/it] 95%|█████████▌| 5507/5772 [1:56:25<30:08, 6.82s/it] {'loss': 0.4583, 'learning_rate': 1.1038120270203345e-07, 'epoch': 0.95} + 95%|█████████▌| 5507/5772 [1:56:25<30:08, 6.82s/it] 95%|█████████▌| 5507/5772 [1:56:22<30:08, 6.82s/it] {'loss': 0.4583, 'learning_rate': 1.1038120270203345e-07, 'epoch': 0.95} + 95%|█████████▌| 5507/5772 [1:56:22<30:08, 6.82s/it] 95%|█████████▌| 5508/5772 [1:56:28<29:02, 6.60s/it] 95%|█████████▌| 5508/5772 [1:56:31<29:03, 6.60s/it] {'loss': 0.4628, 'learning_rate': 1.0955123037907134e-07, 'epoch': 0.95} + 95%|█████████▌| 5508/5772 [1:56:31<29:03, 6.60s/it] {'loss': 0.4628, 'learning_rate': 1.0955123037907134e-07, 'epoch': 0.95} + 95%|█████████▌| 5508/5772 [1:56:28<29:02, 6.60s/it] 95%|█████████▌| 5509/5772 [1:56:34<28:09, 6.43s/it] 95%|█████████▌| 5509/5772 [1:56:37<28:09, 6.43s/it] {'loss': 0.4589, 'learning_rate': 1.0872437299795701e-07, 'epoch': 0.95} + 95%|█████████▌| 5509/5772 [1:56:37<28:09, 6.43s/it] {'loss': 0.4589, 'learning_rate': 1.0872437299795701e-07, 'epoch': 0.95} + 95%|█████████▌| 5509/5772 [1:56:34<28:09, 6.43s/it] 95%|█████████▌| 5510/5772 [1:56:40<27:42, 6.35s/it] 95%|█████████▌| 5510/5772 [1:56:43<27:42, 6.35s/it] {'loss': 0.4698, 'learning_rate': 1.079006308191055e-07, 'epoch': 0.95} +{'loss': 0.4698, 'learning_rate': 1.079006308191055e-07, 'epoch': 0.95} + 95%|█████████▌| 5510/5772 [1:56:43<27:42, 6.35s/it] 95%|█████████▌| 5510/5772 [1:56:40<27:42, 6.35s/it] 95%|█████████▌| 5511/5772 [1:56:50<27:37, 6.35s/it] 95%|█████████▌| 5511/5772 [1:56:47<27:37, 6.35s/it] {'loss': 0.452, 'learning_rate': 1.0708000410195041e-07, 'epoch': 0.95} + 95%|█████████▌| 5511/5772 [1:56:50<27:37, 6.35s/it] {'loss': 0.452, 'learning_rate': 1.0708000410195041e-07, 'epoch': 0.95} + 95%|█████████▌| 5511/5772 [1:56:47<27:37, 6.35s/it] 95%|█████████▌| 5512/5772 [1:56:53<26:54, 6.21s/it] 95%|█████████▌| 5512/5772 [1:56:56<26:54, 6.21s/it] {'loss': 0.4537, 'learning_rate': 1.0626249310494385e-07, 'epoch': 0.95} + 95%|█████████▌| 5512/5772 [1:56:56<26:54, 6.21s/it]{'loss': 0.4537, 'learning_rate': 1.0626249310494385e-07, 'epoch': 0.95} + 95%|█████████▌| 5512/5772 [1:56:53<26:54, 6.21s/it] 96%|█████████▌| 5513/5772 [1:56:59<26:48, 6.21s/it] 96%|█████████▌| 5513/5772 [1:57:02<26:48, 6.21s/it] {'loss': 0.4577, 'learning_rate': 1.0544809808555545e-07, 'epoch': 0.96} + 96%|█████████▌| 5513/5772 [1:57:02<26:48, 6.21s/it] {'loss': 0.4577, 'learning_rate': 1.0544809808555545e-07, 'epoch': 0.96} + 96%|█████████▌| 5513/5772 [1:56:59<26:48, 6.21s/it] 96%|█████████▌| 5514/5772 [1:57:05<26:21, 6.13s/it] 96%|█████████▌| 5514/5772 [1:57:08<26:21, 6.13s/it] {'loss': 0.4576, 'learning_rate': 1.0463681930027336e-07, 'epoch': 0.96} + 96%|█████████▌| 5514/5772 [1:57:08<26:21, 6.13s/it] {'loss': 0.4576, 'learning_rate': 1.0463681930027336e-07, 'epoch': 0.96} + 96%|█████████▌| 5514/5772 [1:57:05<26:21, 6.13s/it] 96%|█████████▌| 5515/5772 [1:57:14<26:11, 6.11s/it] 96%|█████████▌| 5515/5772 [1:57:11<26:11, 6.11s/it] {'loss': 0.4449, 'learning_rate': 1.0382865700460876e-07, 'epoch': 0.96} + 96%|█████████▌| 5515/5772 [1:57:14<26:11, 6.11s/it] {'loss': 0.4449, 'learning_rate': 1.0382865700460876e-07, 'epoch': 0.96} + 96%|█████████▌| 5515/5772 [1:57:11<26:11, 6.11s/it] 96%|█████████▌| 5516/5772 [1:57:17<26:07, 6.12s/it] 96%|█████████▌| 5516/5772 [1:57:20<26:07, 6.12s/it] {'loss': 0.4702, 'learning_rate': 1.030236114530847e-07, 'epoch': 0.96} + 96%|█████████▌| 5516/5772 [1:57:20<26:07, 6.12s/it] {'loss': 0.4702, 'learning_rate': 1.030236114530847e-07, 'epoch': 0.96} + 96%|█████████▌| 5516/5772 [1:57:17<26:07, 6.12s/it] 96%|█████████▌| 5517/5772 [1:57:23<25:57, 6.11s/it] 96%|█████████▌| 5517/5772 [1:57:26<25:57, 6.11s/it] {'loss': 0.4356, 'learning_rate': 1.0222168289924616e-07, 'epoch': 0.96} + 96%|█████████▌| 5517/5772 [1:57:26<25:57, 6.11s/it] {'loss': 0.4356, 'learning_rate': 1.0222168289924616e-07, 'epoch': 0.96} + 96%|█████████▌| 5517/5772 [1:57:23<25:57, 6.11s/it] 96%|█████████▌| 5518/5772 [1:57:29<25:38, 6.06s/it] 96%|█████████▌| 5518/5772 [1:57:32<25:38, 6.06s/it] {'loss': 0.4542, 'learning_rate': 1.0142287159565778e-07, 'epoch': 0.96} + 96%|█████████▌| 5518/5772 [1:57:32<25:38, 6.06s/it] {'loss': 0.4542, 'learning_rate': 1.0142287159565778e-07, 'epoch': 0.96} + 96%|█████████▌| 5518/5772 [1:57:29<25:38, 6.06s/it] 96%|█████████▌| 5519/5772 [1:57:35<25:36, 6.07s/it] 96%|█████████▌| 5519/5772 [1:57:38<25:36, 6.07s/it] {'loss': 0.4467, 'learning_rate': 1.0062717779389942e-07, 'epoch': 0.96} + 96%|█████████▌| 5519/5772 [1:57:38<25:36, 6.07s/it] {'loss': 0.4467, 'learning_rate': 1.0062717779389942e-07, 'epoch': 0.96} + 96%|█████████▌| 5519/5772 [1:57:35<25:36, 6.07s/it] 96%|█████████▌| 5520/5772 [1:57:44<25:31, 6.08s/it] {'loss': 0.4777, 'learning_rate': 9.98346017445706e-08, 'epoch': 0.96} + 96%|█████████▌| 5520/5772 [1:57:44<25:31, 6.08s/it] 96%|█████████▌| 5520/5772 [1:57:41<25:31, 6.08s/it] {'loss': 0.4777, 'learning_rate': 9.98346017445706e-08, 'epoch': 0.96} + 96%|█████████▌| 5520/5772 [1:57:41<25:31, 6.08s/it] 96%|█████████▌| 5521/5772 [1:57:47<25:06, 6.00s/it] 96%|█████████▌| 5521/5772 [1:57:50<25:06, 6.00s/it] {'loss': 0.4518, 'learning_rate': 9.904514369728724e-08, 'epoch': 0.96} + 96%|█████████▌| 5521/5772 [1:57:50<25:06, 6.00s/it] {'loss': 0.4518, 'learning_rate': 9.904514369728724e-08, 'epoch': 0.96} + 96%|█████████▌| 5521/5772 [1:57:47<25:06, 6.00s/it] 96%|█████████▌| 5522/5772 [1:57:56<24:46, 5.95s/it] 96%|█████████▌| 5522/5772 [1:57:53<24:46, 5.95s/it] {'loss': 0.4577, 'learning_rate': 9.82588039006882e-08, 'epoch': 0.96} + 96%|█████████▌| 5522/5772 [1:57:56<24:46, 5.95s/it] {'loss': 0.4577, 'learning_rate': 9.82588039006882e-08, 'epoch': 0.96} + 96%|█████████▌| 5522/5772 [1:57:53<24:46, 5.95s/it] 96%|█████████▌| 5523/5772 [1:58:02<24:57, 6.01s/it] 96%|█████████▌| 5523/5772 [1:57:59<24:57, 6.01s/it] {'loss': 0.462, 'learning_rate': 9.74755826024254e-08, 'epoch': 0.96} + 96%|█████████▌| 5523/5772 [1:58:02<24:57, 6.01s/it] {'loss': 0.462, 'learning_rate': 9.74755826024254e-08, 'epoch': 0.96} + 96%|█████████▌| 5523/5772 [1:57:59<24:57, 6.01s/it] 96%|█████████▌| 5524/5772 [1:58:05<25:17, 6.12s/it] 96%|█████████▌| 5524/5772 [1:58:08<25:17, 6.12s/it] {'loss': 0.47, 'learning_rate': 9.669548004916817e-08, 'epoch': 0.96} + 96%|█████████▌| 5524/5772 [1:58:08<25:17, 6.12s/it] {'loss': 0.47, 'learning_rate': 9.669548004916817e-08, 'epoch': 0.96} + 96%|█████████▌| 5524/5772 [1:58:05<25:17, 6.12s/it] 96%|█████████▌| 5525/5772 [1:58:11<24:51, 6.04s/it] 96%|█████████▌| 5525/5772 [1:58:14<24:51, 6.04s/it] {'loss': 0.4533, 'learning_rate': 9.591849648660779e-08, 'epoch': 0.96} + 96%|█████████▌| 5525/5772 [1:58:14<24:51, 6.04s/it] {'loss': 0.4533, 'learning_rate': 9.591849648660779e-08, 'epoch': 0.96} + 96%|█████████▌| 5525/5772 [1:58:11<24:51, 6.04s/it] 96%|█████████▌| 5526/5772 [1:58:17<24:53, 6.07s/it] 96%|█████████▌| 5526/5772 [1:58:20<24:53, 6.07s/it] {'loss': 0.4538, 'learning_rate': 9.51446321594507e-08, 'epoch': 0.96} + 96%|█████████▌| 5526/5772 [1:58:20<24:53, 6.07s/it] {'loss': 0.4538, 'learning_rate': 9.51446321594507e-08, 'epoch': 0.96} + 96%|█████████▌| 5526/5772 [1:58:17<24:53, 6.07s/it] 96%|█████████▌| 5527/5772 [1:58:23<24:55, 6.10s/it] 96%|█████████▌| 5527/5772 [1:58:27<24:55, 6.10s/it] {'loss': 0.4458, 'learning_rate': 9.437388731141861e-08, 'epoch': 0.96} + 96%|█████████▌| 5527/5772 [1:58:27<24:55, 6.10s/it] {'loss': 0.4458, 'learning_rate': 9.437388731141861e-08, 'epoch': 0.96} + 96%|█████████▌| 5527/5772 [1:58:23<24:55, 6.10s/it] 96%|█████████▌| 5528/5772 [1:58:29<24:36, 6.05s/it] 96%|█████████▌| 5528/5772 [1:58:32<24:36, 6.05s/it] {'loss': 0.4658, 'learning_rate': 9.360626218525625e-08, 'epoch': 0.96} + 96%|█████████▌| 5528/5772 [1:58:32<24:36, 6.05s/it] {'loss': 0.4658, 'learning_rate': 9.360626218525625e-08, 'epoch': 0.96} + 96%|█████████▌| 5528/5772 [1:58:29<24:36, 6.05s/it] 96%|█████████▌| 5529/5772 [1:58:39<25:07, 6.20s/it] 96%|█████████▌| 5529/5772 [1:58:36<25:07, 6.20s/it] {'loss': 0.4435, 'learning_rate': 9.284175702272246e-08, 'epoch': 0.96} + {'loss': 0.4435, 'learning_rate': 9.284175702272246e-08, 'epoch': 0.96} + 96%|█████████▌| 5529/5772 [1:58:39<25:07, 6.20s/it] 96%|█████████▌| 5529/5772 [1:58:36<25:07, 6.20s/it] 96%|█████████▌| 5530/5772 [1:58:42<24:32, 6.08s/it] 96%|█████████▌| 5530/5772 [1:58:45<24:32, 6.08s/it] {'loss': 0.445, 'learning_rate': 9.208037206459242e-08, 'epoch': 0.96} + 96%|█████████▌| 5530/5772 [1:58:45<24:32, 6.08s/it] {'loss': 0.445, 'learning_rate': 9.208037206459242e-08, 'epoch': 0.96} + 96%|█████████▌| 5530/5772 [1:58:42<24:32, 6.08s/it] 96%|█████████▌| 5531/5772 [1:58:48<24:38, 6.13s/it] 96%|█████████▌| 5531/5772 [1:58:51<24:38, 6.13s/it] {'loss': 0.4653, 'learning_rate': 9.132210755066096e-08, 'epoch': 0.96} + 96%|█████████▌| 5531/5772 [1:58:51<24:38, 6.13s/it] {'loss': 0.4653, 'learning_rate': 9.132210755066096e-08, 'epoch': 0.96} + 96%|█████████▌| 5531/5772 [1:58:48<24:38, 6.13s/it] 96%|█████████▌| 5532/5772 [1:58:54<24:52, 6.22s/it] 96%|█████████▌| 5532/5772 [1:58:57<24:52, 6.22s/it] {'loss': 0.4495, 'learning_rate': 9.056696371973928e-08, 'epoch': 0.96} + 96%|█████████▌| 5532/5772 [1:58:57<24:52, 6.22s/it] {'loss': 0.4495, 'learning_rate': 9.056696371973928e-08, 'epoch': 0.96} + 96%|█████████▌| 5532/5772 [1:58:54<24:52, 6.22s/it] 96%|█████████▌| 5533/5772 [1:59:01<24:40, 6.19s/it] 96%|█████████▌| 5533/5772 [1:59:04<24:40, 6.19s/it] {'loss': 0.4581, 'learning_rate': 8.981494080965602e-08, 'epoch': 0.96} + 96%|█████████▌| 5533/5772 [1:59:04<24:40, 6.19s/it] {'loss': 0.4581, 'learning_rate': 8.981494080965602e-08, 'epoch': 0.96} + 96%|█████████▌| 5533/5772 [1:59:01<24:40, 6.19s/it] 96%|█████████▌| 5534/5772 [1:59:07<25:09, 6.34s/it] 96%|█████████▌| 5534/5772 [1:59:10<25:09, 6.34s/it] {'loss': 0.4604, 'learning_rate': 8.906603905725619e-08, 'epoch': 0.96} + 96%|█████████▌| 5534/5772 [1:59:10<25:09, 6.34s/it] {'loss': 0.4604, 'learning_rate': 8.906603905725619e-08, 'epoch': 0.96} + 96%|█████████▌| 5534/5772 [1:59:07<25:09, 6.34s/it] 96%|█████████▌| 5535/5772 [1:59:14<25:02, 6.34s/it] 96%|█████████▌| 5535/5772 [1:59:17<25:02, 6.34s/it] {'loss': 0.4697, 'learning_rate': 8.832025869840222e-08, 'epoch': 0.96} + 96%|█████████▌| 5535/5772 [1:59:17<25:02, 6.34s/it] {'loss': 0.4697, 'learning_rate': 8.832025869840222e-08, 'epoch': 0.96} + 96%|█████████▌| 5535/5772 [1:59:14<25:02, 6.34s/it] 96%|█████████▌| 5536/5772 [1:59:20<24:39, 6.27s/it] 96%|█████████▌| 5536/5772 [1:59:23<24:39, 6.27s/it] {'loss': 0.4452, 'learning_rate': 8.757759996797399e-08, 'epoch': 0.96} + 96%|█████████▌| 5536/5772 [1:59:20<24:39, 6.27s/it] {'loss': 0.4452, 'learning_rate': 8.757759996797399e-08, 'epoch': 0.96} + 96%|█████████▌| 5536/5772 [1:59:23<24:39, 6.27s/it] 96%|█████████▌| 5537/5772 [1:59:25<23:55, 6.11s/it] 96%|█████████▌| 5537/5772 [1:59:28<23:55, 6.11s/it] {'loss': 0.4541, 'learning_rate': 8.683806309986776e-08, 'epoch': 0.96} + 96%|█████████▌| 5537/5772 [1:59:28<23:55, 6.11s/it] {'loss': 0.4541, 'learning_rate': 8.683806309986776e-08, 'epoch': 0.96} + 96%|█████████▌| 5537/5772 [1:59:25<23:55, 6.11s/it] 96%|█████████▌| 5538/5772 [1:59:32<24:00, 6.16s/it] 96%|█████████▌| 5538/5772 [1:59:35<24:00, 6.16s/it] {'loss': 0.4456, 'learning_rate': 8.610164832699608e-08, 'epoch': 0.96} + 96%|█████████▌| 5538/5772 [1:59:32<24:00, 6.16s/it] {'loss': 0.4456, 'learning_rate': 8.610164832699608e-08, 'epoch': 0.96} + 96%|█████████▌| 5538/5772 [1:59:35<24:00, 6.16s/it] 96%|█████████▌| 5539/5772 [1:59:38<23:56, 6.16s/it] 96%|█████████▌| 5539/5772 [1:59:41<23:56, 6.16s/it] {'loss': 0.4716, 'learning_rate': 8.536835588128678e-08, 'epoch': 0.96} + 96%|█████████▌| 5539/5772 [1:59:38<23:56, 6.16s/it]{'loss': 0.4716, 'learning_rate': 8.536835588128678e-08, 'epoch': 0.96} + 96%|█████████▌| 5539/5772 [1:59:41<23:56, 6.16s/it] 96%|█████████▌| 5540/5772 [1:59:44<23:49, 6.16s/it] 96%|█████████▌| 5540/5772 [1:59:47<23:49, 6.16s/it] {'loss': 0.455, 'learning_rate': 8.463818599369067e-08, 'epoch': 0.96} + 96%|█████████▌| 5540/5772 [1:59:47<23:49, 6.16s/it] {'loss': 0.455, 'learning_rate': 8.463818599369067e-08, 'epoch': 0.96} + 96%|█████████▌| 5540/5772 [1:59:44<23:49, 6.16s/it] 96%|█████████▌| 5541/5772 [1:59:50<23:24, 6.08s/it] 96%|█████████▌| 5541/5772 [1:59:53<23:24, 6.08s/it] {'loss': 0.4659, 'learning_rate': 8.391113889416713e-08, 'epoch': 0.96} + 96%|█████████▌| 5541/5772 [1:59:53<23:24, 6.08s/it] {'loss': 0.4659, 'learning_rate': 8.391113889416713e-08, 'epoch': 0.96} + 96%|█████████▌| 5541/5772 [1:59:50<23:24, 6.08s/it] 96%|█████████▌| 5542/5772 [1:59:56<23:37, 6.16s/it] 96%|█████████▌| 5542/5772 [1:59:59<23:37, 6.16s/it] {'loss': 0.45, 'learning_rate': 8.318721481169633e-08, 'epoch': 0.96} + 96%|█████████▌| 5542/5772 [1:59:59<23:37, 6.16s/it] {'loss': 0.45, 'learning_rate': 8.318721481169633e-08, 'epoch': 0.96} + 96%|█████████▌| 5542/5772 [1:59:56<23:37, 6.16s/it] 96%|█████████▌| 5543/5772 [2:00:02<23:27, 6.15s/it] 96%|█████████▌| 5543/5772 [2:00:05<23:27, 6.15s/it] {'loss': 0.4638, 'learning_rate': 8.24664139742759e-08, 'epoch': 0.96} + 96%|█████████▌| 5543/5772 [2:00:05<23:27, 6.15s/it] {'loss': 0.4638, 'learning_rate': 8.24664139742759e-08, 'epoch': 0.96} + 96%|█████████▌| 5543/5772 [2:00:02<23:27, 6.15s/it] 96%|█████████▌| 5544/5772 [2:00:09<23:35, 6.21s/it] 96%|█████████▌| 5544/5772 [2:00:12<23:35, 6.21s/it] {'loss': 0.4472, 'learning_rate': 8.174873660891536e-08, 'epoch': 0.96} + {'loss': 0.4472, 'learning_rate': 8.174873660891536e-08, 'epoch': 0.96} 96%|█████████▌| 5544/5772 [2:00:12<23:35, 6.21s/it] + 96%|█████████▌| 5544/5772 [2:00:09<23:35, 6.21s/it] 96%|█████████▌| 5545/5772 [2:00:15<23:16, 6.15s/it] 96%|█████████▌| 5545/5772 [2:00:18<23:16, 6.15s/it]{'loss': 0.4584, 'learning_rate': 8.103418294164611e-08, 'epoch': 0.96} + {'loss': 0.4584, 'learning_rate': 8.103418294164611e-08, 'epoch': 0.96} + 96%|█████████▌| 5545/5772 [2:00:18<23:16, 6.15s/it] 96%|█████████▌| 5545/5772 [2:00:15<23:16, 6.15s/it] 96%|█████████▌| 5546/5772 [2:00:21<22:58, 6.10s/it] 96%|█████████▌| 5546/5772 [2:00:24<22:58, 6.10s/it] {'loss': 0.4668, 'learning_rate': 8.032275319750926e-08, 'epoch': 0.96} + 96%|█████████▌| 5546/5772 [2:00:24<22:58, 6.10s/it] {'loss': 0.4668, 'learning_rate': 8.032275319750926e-08, 'epoch': 0.96} + 96%|█████████▌| 5546/5772 [2:00:21<22:58, 6.10s/it] 96%|█████████▌| 5547/5772 [2:00:27<22:33, 6.02s/it] 96%|█████████▌| 5547/5772 [2:00:30<22:33, 6.02s/it] {'loss': 0.465, 'learning_rate': 7.96144476005689e-08, 'epoch': 0.96} + 96%|█████████▌| 5547/5772 [2:00:30<22:33, 6.02s/it] {'loss': 0.465, 'learning_rate': 7.96144476005689e-08, 'epoch': 0.96} + 96%|█████████▌| 5547/5772 [2:00:27<22:33, 6.02s/it] 96%|█████████▌| 5548/5772 [2:00:32<22:14, 5.96s/it] 96%|█████████▌| 5548/5772 [2:00:35<22:14, 5.96s/it] {'loss': 0.4387, 'learning_rate': 7.890926637390106e-08, 'epoch': 0.96} + 96%|█████████▌| 5548/5772 [2:00:35<22:14, 5.96s/it]{'loss': 0.4387, 'learning_rate': 7.890926637390106e-08, 'epoch': 0.96} + 96%|█████████▌| 5548/5772 [2:00:32<22:14, 5.96s/it] 96%|█████████▌| 5549/5772 [2:00:39<22:45, 6.12s/it] 96%|█████████▌| 5549/5772 [2:00:42<22:45, 6.12s/it] {'loss': 0.4637, 'learning_rate': 7.820720973959694e-08, 'epoch': 0.96} + 96%|█████████▌| 5549/5772 [2:00:42<22:45, 6.12s/it] {'loss': 0.4637, 'learning_rate': 7.820720973959694e-08, 'epoch': 0.96} + 96%|█████████▌| 5549/5772 [2:00:39<22:45, 6.12s/it]4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 96%|█████████▌| 5550/5772 [2:00:45<22:38, 6.12s/it] 96%|█████████▌| 5550/5772 [2:00:48<22:38, 6.12s/it]15 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...9 AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4694, 'learning_rate': 7.750827791876747e-08, 'epoch': 0.96} + 96%|█████████▌| 5550/5772 [2:00:45<22:38, 6.12s/it]{'loss': 0.4694, 'learning_rate': 7.750827791876747e-08, 'epoch': 0.96} + 96%|█████████▌| 5550/5772 [2:00:48<22:38, 6.12s/it] 96%|█████████▌| 5551/5772 [2:00:51<22:16, 6.05s/it] 96%|█████████▌| 5551/5772 [2:00:54<22:16, 6.05s/it] {'loss': 0.4576, 'learning_rate': 7.681247113153655e-08, 'epoch': 0.96} + 96%|█████████▌| 5551/5772 [2:00:54<22:16, 6.05s/it] {'loss': 0.4576, 'learning_rate': 7.681247113153655e-08, 'epoch': 0.96} + 96%|█████████▌| 5551/5772 [2:00:51<22:16, 6.05s/it] 96%|█████████▌| 5552/5772 [2:00:57<22:10, 6.05s/it] 96%|█████████▌| 5552/5772 [2:01:00<22:10, 6.05s/it] {'loss': 0.4624, 'learning_rate': 7.611978959704558e-08, 'epoch': 0.96} + 96%|█████████▌| 5552/5772 [2:01:00<22:10, 6.05s/it] {'loss': 0.4624, 'learning_rate': 7.611978959704558e-08, 'epoch': 0.96} + 96%|█████████▌| 5552/5772 [2:00:57<22:10, 6.05s/it] 96%|█████████▌| 5553/5772 [2:01:03<22:08, 6.07s/it] 96%|█████████▌| 5553/5772 [2:01:06<22:08, 6.07s/it] {'loss': 0.4565, 'learning_rate': 7.543023353344892e-08, 'epoch': 0.96} + 96%|█████████▌| 5553/5772 [2:01:06<22:08, 6.07s/it] {'loss': 0.4565, 'learning_rate': 7.543023353344892e-08, 'epoch': 0.96} + 96%|█████████▌| 5553/5772 [2:01:03<22:08, 6.07s/it] 96%|█████████▌| 5554/5772 [2:01:13<22:29, 6.19s/it] {'loss': 0.4679, 'learning_rate': 7.474380315791951e-08, 'epoch': 0.96} + 96%|█████████▌| 5554/5772 [2:01:13<22:29, 6.19s/it] 96%|█████████▌| 5554/5772 [2:01:09<22:29, 6.19s/it] {'loss': 0.4679, 'learning_rate': 7.474380315791951e-08, 'epoch': 0.96} + 96%|█████████▌| 5554/5772 [2:01:09<22:29, 6.19s/it] 96%|█████████▌| 5555/5772 [2:01:16<22:19, 6.17s/it] 96%|█████████▌| 5555/5772 [2:01:19<22:19, 6.17s/it] {'loss': 0.4649, 'learning_rate': 7.406049868664445e-08, 'epoch': 0.96} + 96%|█████████▌| 5555/5772 [2:01:19<22:19, 6.17s/it] {'loss': 0.4649, 'learning_rate': 7.406049868664445e-08, 'epoch': 0.96} + 96%|█████████▌| 5555/5772 [2:01:16<22:19, 6.17s/it] 96%|█████████▋| 5556/5772 [2:01:22<21:57, 6.10s/it] 96%|█████████▋| 5556/5772 [2:01:25<21:57, 6.10s/it] {'loss': 0.4597, 'learning_rate': 7.338032033482712e-08, 'epoch': 0.96} + 96%|█████████▋| 5556/5772 [2:01:25<21:57, 6.10s/it] {'loss': 0.4597, 'learning_rate': 7.338032033482712e-08, 'epoch': 0.96} + 96%|█████████▋| 5556/5772 [2:01:22<21:57, 6.10s/it] 96%|█████████▋| 5557/5772 [2:01:27<21:33, 6.02s/it] 96%|█████████▋| 5557/5772 [2:01:30<21:33, 6.02s/it] {'loss': 0.4514, 'learning_rate': 7.270326831668617e-08, 'epoch': 0.96} + 96%|█████████▋| 5557/5772 [2:01:30<21:33, 6.02s/it] {'loss': 0.4514, 'learning_rate': 7.270326831668617e-08, 'epoch': 0.96} + 96%|█████████▋| 5557/5772 [2:01:27<21:33, 6.02s/it] 96%|█████████▋| 5558/5772 [2:01:33<21:29, 6.02s/it] 96%|█████████▋| 5558/5772 [2:01:36<21:29, 6.02s/it] {'loss': 0.4614, 'learning_rate': 7.202934284545438e-08, 'epoch': 0.96} + 96%|█████████▋| 5558/5772 [2:01:36<21:29, 6.02s/it] {'loss': 0.4614, 'learning_rate': 7.202934284545438e-08, 'epoch': 0.96} + 96%|█████████▋| 5558/5772 [2:01:33<21:29, 6.02s/it] 96%|█████████▋| 5559/5772 [2:01:39<21:15, 5.99s/it] 96%|█████████▋| 5559/5772 [2:01:42<21:15, 5.99s/it] {'loss': 0.4515, 'learning_rate': 7.135854413338194e-08, 'epoch': 0.96} + 96%|█████████▋| 5559/5772 [2:01:42<21:15, 5.99s/it] {'loss': 0.4515, 'learning_rate': 7.135854413338194e-08, 'epoch': 0.96} + 96%|█████████▋| 5559/5772 [2:01:39<21:15, 5.99s/it] 96%|█████████▋| 5560/5772 [2:01:45<21:02, 5.95s/it] 96%|█████████▋| 5560/5772 [2:01:48<21:02, 5.95s/it] {'loss': 0.4506, 'learning_rate': 7.069087239173211e-08, 'epoch': 0.96} + 96%|█████████▋| 5560/5772 [2:01:48<21:02, 5.95s/it] {'loss': 0.4506, 'learning_rate': 7.069087239173211e-08, 'epoch': 0.96} + 96%|█████████▋| 5560/5772 [2:01:45<21:02, 5.95s/it] 96%|█████████▋| 5561/5772 [2:01:51<21:13, 6.03s/it] 96%|█████████▋| 5561/5772 [2:01:54<21:13, 6.03s/it] {'loss': 0.4676, 'learning_rate': 7.002632783078445e-08, 'epoch': 0.96} + 96%|█████████▋| 5561/5772 [2:01:54<21:13, 6.03s/it] {'loss': 0.4676, 'learning_rate': 7.002632783078445e-08, 'epoch': 0.96} + 96%|█████████▋| 5561/5772 [2:01:51<21:13, 6.03s/it] 96%|█████████▋| 5562/5772 [2:02:00<21:05, 6.03s/it] {'loss': 0.4422, 'learning_rate': 6.936491065983486e-08, 'epoch': 0.96} + 96%|█████████▋| 5562/5772 [2:02:00<21:05, 6.03s/it] 96%|█████████▋| 5562/5772 [2:01:57<21:05, 6.03s/it] {'loss': 0.4422, 'learning_rate': 6.936491065983486e-08, 'epoch': 0.96} + 96%|█████████▋| 5562/5772 [2:01:57<21:05, 6.03s/it] 96%|█████████▋| 5563/5772 [2:02:03<21:00, 6.03s/it] 96%|█████████▋| 5563/5772 [2:02:07<21:00, 6.03s/it] {'loss': 0.4611, 'learning_rate': 6.870662108719117e-08, 'epoch': 0.96} + 96%|█████████▋| 5563/5772 [2:02:07<21:00, 6.03s/it] {'loss': 0.4611, 'learning_rate': 6.870662108719117e-08, 'epoch': 0.96} + 96%|█████████▋| 5563/5772 [2:02:03<21:00, 6.03s/it] 96%|█████████▋| 5564/5772 [2:02:09<20:41, 5.97s/it] 96%|█████████▋| 5564/5772 [2:02:12<20:41, 5.97s/it] {'loss': 0.4621, 'learning_rate': 6.805145932017975e-08, 'epoch': 0.96} + {'loss': 0.4621, 'learning_rate': 6.805145932017975e-08, 'epoch': 0.96} 96%|█████████▋| 5564/5772 [2:02:12<20:41, 5.97s/it] + 96%|█████████▋| 5564/5772 [2:02:09<20:41, 5.97s/it] 96%|█████████▋| 5565/5772 [2:02:16<21:18, 6.18s/it] 96%|█████████▋| 5565/5772 [2:02:19<21:18, 6.18s/it] {'loss': 0.4611, 'learning_rate': 6.73994255651389e-08, 'epoch': 0.96} + 96%|█████████▋| 5565/5772 [2:02:19<21:18, 6.18s/it] {'loss': 0.4611, 'learning_rate': 6.73994255651389e-08, 'epoch': 0.96} + 96%|█████████▋| 5565/5772 [2:02:16<21:18, 6.18s/it] 96%|█████████▋| 5566/5772 [2:02:22<21:00, 6.12s/it] 96%|█████████▋| 5566/5772 [2:02:25<21:00, 6.12s/it] {'loss': 0.4516, 'learning_rate': 6.675052002742321e-08, 'epoch': 0.96} + 96%|█████████▋| 5566/5772 [2:02:25<21:00, 6.12s/it]{'loss': 0.4516, 'learning_rate': 6.675052002742321e-08, 'epoch': 0.96} + 96%|█████████▋| 5566/5772 [2:02:22<21:00, 6.12s/it] 96%|█████████▋| 5567/5772 [2:02:28<20:45, 6.07s/it] 96%|█████████▋| 5567/5772 [2:02:31<20:45, 6.07s/it] {'loss': 0.4512, 'learning_rate': 6.610474291140257e-08, 'epoch': 0.96} + 96%|█████████▋| 5567/5772 [2:02:31<20:45, 6.07s/it] {'loss': 0.4512, 'learning_rate': 6.610474291140257e-08, 'epoch': 0.96} + 96%|█████████▋| 5567/5772 [2:02:28<20:45, 6.07s/it] 96%|█████████▋| 5568/5772 [2:02:34<20:13, 5.95s/it] 96%|█████████▋| 5568/5772 [2:02:37<20:13, 5.95s/it] {'loss': 0.4645, 'learning_rate': 6.546209442046093e-08, 'epoch': 0.96} + 96%|█████████▋| 5568/5772 [2:02:37<20:13, 5.95s/it] {'loss': 0.4645, 'learning_rate': 6.546209442046093e-08, 'epoch': 0.96} + 96%|█████████▋| 5568/5772 [2:02:34<20:13, 5.95s/it] 96%|█████████▋| 5569/5772 [2:02:39<19:52, 5.88s/it] 96%|█████████▋| 5569/5772 [2:02:42<19:52, 5.88s/it] {'loss': 0.453, 'learning_rate': 6.482257475699526e-08, 'epoch': 0.96} + 96%|█████████▋| 5569/5772 [2:02:42<19:52, 5.88s/it] {'loss': 0.453, 'learning_rate': 6.482257475699526e-08, 'epoch': 0.96} + 96%|█████████▋| 5569/5772 [2:02:39<19:52, 5.88s/it] 97%|█████████▋| 5570/5772 [2:02:46<20:30, 6.09s/it] 97%|█████████▋| 5570/5772 [2:02:49<20:30, 6.09s/it] {'loss': 0.4429, 'learning_rate': 6.418618412242116e-08, 'epoch': 0.96} + 97%|█████████▋| 5570/5772 [2:02:49<20:30, 6.09s/it] {'loss': 0.4429, 'learning_rate': 6.418618412242116e-08, 'epoch': 0.96} + 97%|█████████▋| 5570/5772 [2:02:46<20:30, 6.09s/it] 97%|█████████▋| 5571/5772 [2:02:52<20:12, 6.03s/it] 97%|█████████▋| 5571/5772 [2:02:55<20:12, 6.03s/it] {'loss': 0.4523, 'learning_rate': 6.355292271716495e-08, 'epoch': 0.97} + 97%|█████████▋| 5571/5772 [2:02:55<20:12, 6.03s/it] {'loss': 0.4523, 'learning_rate': 6.355292271716495e-08, 'epoch': 0.97} + 97%|█████████▋| 5571/5772 [2:02:52<20:12, 6.03s/it] 97%|█████████▋| 5572/5772 [2:02:58<20:02, 6.01s/it] 97%|█████████▋| 5572/5772 [2:03:01<20:02, 6.01s/it] {'loss': 0.4651, 'learning_rate': 6.292279074066821e-08, 'epoch': 0.97} + 97%|█████████▋| 5572/5772 [2:03:01<20:02, 6.01s/it] {'loss': 0.4651, 'learning_rate': 6.292279074066821e-08, 'epoch': 0.97} + 97%|█████████▋| 5572/5772 [2:02:58<20:02, 6.01s/it] 97%|█████████▋| 5573/5772 [2:03:04<20:02, 6.04s/it] 97%|█████████▋| 5573/5772 [2:03:07<20:02, 6.04s/it] {'loss': 0.4459, 'learning_rate': 6.229578839138772e-08, 'epoch': 0.97} + 97%|█████████▋| 5573/5772 [2:03:07<20:02, 6.04s/it] {'loss': 0.4459, 'learning_rate': 6.229578839138772e-08, 'epoch': 0.97} + 97%|█████████▋| 5573/5772 [2:03:04<20:02, 6.04s/it] 97%|█████████▋| 5574/5772 [2:03:10<20:08, 6.11s/it] 97%|█████████▋| 5574/5772 [2:03:13<20:08, 6.11s/it] {'loss': 0.4407, 'learning_rate': 6.167191586679556e-08, 'epoch': 0.97} + 97%|█████████▋| 5574/5772 [2:03:13<20:08, 6.11s/it] {'loss': 0.4407, 'learning_rate': 6.167191586679556e-08, 'epoch': 0.97} + 97%|█████████▋| 5574/5772 [2:03:10<20:08, 6.11s/it] 97%|█████████▋| 5575/5772 [2:03:19<19:50, 6.04s/it] 97%|█████████▋| 5575/5772 [2:03:16<19:50, 6.04s/it] {'loss': 0.4617, 'learning_rate': 6.105117336337674e-08, 'epoch': 0.97} + 97%|█████████▋| 5575/5772 [2:03:19<19:50, 6.04s/it] {'loss': 0.4617, 'learning_rate': 6.105117336337674e-08, 'epoch': 0.97} + 97%|█████████▋| 5575/5772 [2:03:16<19:50, 6.04s/it] 97%|█████████▋| 5576/5772 [2:03:26<20:26, 6.26s/it] 97%|█████████▋| 5576/5772 [2:03:23<20:26, 6.26s/it] {'loss': 0.4538, 'learning_rate': 6.043356107662823e-08, 'epoch': 0.97} + 97%|█████████▋| 5576/5772 [2:03:26<20:26, 6.26s/it] {'loss': 0.4538, 'learning_rate': 6.043356107662823e-08, 'epoch': 0.97} + 97%|█████████▋| 5576/5772 [2:03:23<20:26, 6.26s/it] 97%|█████████▋| 5577/5772 [2:03:32<20:21, 6.26s/it] 97%|█████████▋| 5577/5772 [2:03:29<20:21, 6.26s/it] {'loss': 0.4598, 'learning_rate': 5.981907920106667e-08, 'epoch': 0.97} + 97%|█████████▋| 5577/5772 [2:03:32<20:21, 6.26s/it] {'loss': 0.4598, 'learning_rate': 5.981907920106667e-08, 'epoch': 0.97} + 97%|█████████▋| 5577/5772 [2:03:29<20:21, 6.26s/it] 97%|█████████▋| 5578/5772 [2:03:35<20:13, 6.26s/it] 97%|█████████▋| 5578/5772 [2:03:38<20:13, 6.26s/it] {'loss': 0.4442, 'learning_rate': 5.920772793021945e-08, 'epoch': 0.97} + 97%|█████████▋| 5578/5772 [2:03:38<20:13, 6.26s/it] {'loss': 0.4442, 'learning_rate': 5.920772793021945e-08, 'epoch': 0.97} + 97%|█████████▋| 5578/5772 [2:03:35<20:13, 6.26s/it] 97%|█████████▋| 5579/5772 [2:03:41<19:46, 6.15s/it] 97%|█████████▋| 5579/5772 [2:03:44<19:46, 6.15s/it] {'loss': 0.4693, 'learning_rate': 5.8599507456625907e-08, 'epoch': 0.97} + 97%|█████████▋| 5579/5772 [2:03:44<19:46, 6.15s/it]{'loss': 0.4693, 'learning_rate': 5.8599507456625907e-08, 'epoch': 0.97} + 97%|█████████▋| 5579/5772 [2:03:41<19:46, 6.15s/it] 97%|█████████▋| 5580/5772 [2:03:47<19:33, 6.11s/it] 97%|█████████▋| 5580/5772 [2:03:50<19:33, 6.11s/it] {'loss': 0.4507, 'learning_rate': 5.799441797184391e-08, 'epoch': 0.97} + 97%|█████████▋| 5580/5772 [2:03:50<19:33, 6.11s/it] {'loss': 0.4507, 'learning_rate': 5.799441797184391e-08, 'epoch': 0.97} + 97%|█████████▋| 5580/5772 [2:03:47<19:33, 6.11s/it] 97%|█████████▋| 5581/5772 [2:03:53<19:38, 6.17s/it] 97%|█████████▋| 5581/5772 [2:03:57<19:38, 6.17s/it] {'loss': 0.4531, 'learning_rate': 5.739245966644102e-08, 'epoch': 0.97} + 97%|█████████▋| 5581/5772 [2:03:57<19:38, 6.17s/it] {'loss': 0.4531, 'learning_rate': 5.739245966644102e-08, 'epoch': 0.97} + 97%|█████████▋| 5581/5772 [2:03:53<19:38, 6.17s/it] 97%|█████████▋| 5582/5772 [2:04:00<19:50, 6.27s/it] 97%|█████████▋| 5582/5772 [2:04:03<19:50, 6.27s/it] {'loss': 0.4542, 'learning_rate': 5.6793632730003375e-08, 'epoch': 0.97} + 97%|█████████▋| 5582/5772 [2:04:03<19:50, 6.27s/it] {'loss': 0.4542, 'learning_rate': 5.6793632730003375e-08, 'epoch': 0.97} + 97%|█████████▋| 5582/5772 [2:04:00<19:50, 6.27s/it] 97%|█████████▋| 5583/5772 [2:04:10<19:58, 6.34s/it] 97%|█████████▋| 5583/5772 [2:04:07<19:58, 6.34s/it] {'loss': 0.4596, 'learning_rate': 5.6197937351125664e-08, 'epoch': 0.97} + 97%|█████████▋| 5583/5772 [2:04:10<19:58, 6.34s/it] {'loss': 0.4596, 'learning_rate': 5.6197937351125664e-08, 'epoch': 0.97} + 97%|█████████▋| 5583/5772 [2:04:07<19:58, 6.34s/it] 97%|█████████▋| 5584/5772 [2:04:13<19:36, 6.26s/it] 97%|█████████▋| 5584/5772 [2:04:16<19:36, 6.26s/it] {'loss': 0.4511, 'learning_rate': 5.560537371742003e-08, 'epoch': 0.97} + 97%|█████████▋| 5584/5772 [2:04:16<19:36, 6.26s/it] {'loss': 0.4511, 'learning_rate': 5.560537371742003e-08, 'epoch': 0.97} + 97%|█████████▋| 5584/5772 [2:04:13<19:36, 6.26s/it] 97%|█████████▋| 5585/5772 [2:04:19<19:21, 6.21s/it] 97%|█████████▋| 5585/5772 [2:04:22<19:21, 6.21s/it] {'loss': 0.4645, 'learning_rate': 5.501594201551164e-08, 'epoch': 0.97} + 97%|█████████▋| 5585/5772 [2:04:22<19:21, 6.21s/it] {'loss': 0.4645, 'learning_rate': 5.501594201551164e-08, 'epoch': 0.97} + 97%|█████████▋| 5585/5772 [2:04:19<19:21, 6.21s/it] 97%|█████████▋| 5586/5772 [2:04:25<19:25, 6.26s/it] 97%|█████████▋| 5586/5772 [2:04:28<19:25, 6.26s/it] {'loss': 0.4548, 'learning_rate': 5.4429642431036435e-08, 'epoch': 0.97} + 97%|█████████▋| 5586/5772 [2:04:28<19:25, 6.26s/it] {'loss': 0.4548, 'learning_rate': 5.4429642431036435e-08, 'epoch': 0.97} + 97%|█████████▋| 5586/5772 [2:04:25<19:25, 6.26s/it] 97%|█████████▋| 5587/5772 [2:04:31<18:51, 6.11s/it] 97%|█████████▋| 5587/5772 [2:04:34<18:51, 6.11s/it] {'loss': 0.4547, 'learning_rate': 5.3846475148648936e-08, 'epoch': 0.97} + 97%|█████████▋| 5587/5772 [2:04:34<18:51, 6.11s/it] {'loss': 0.4547, 'learning_rate': 5.3846475148648936e-08, 'epoch': 0.97} + 97%|█████████▋| 5587/5772 [2:04:31<18:51, 6.11s/it] 97%|█████████▋| 5588/5772 [2:04:37<18:48, 6.13s/it] 97%|█████████▋| 5588/5772 [2:04:40<18:48, 6.13s/it] {'loss': 0.4662, 'learning_rate': 5.326644035201334e-08, 'epoch': 0.97} + 97%|█████████▋| 5588/5772 [2:04:40<18:48, 6.13s/it] {'loss': 0.4662, 'learning_rate': 5.326644035201334e-08, 'epoch': 0.97} + 97%|█████████▋| 5588/5772 [2:04:37<18:48, 6.13s/it] 97%|█████████▋| 5589/5772 [2:04:43<18:36, 6.10s/it] 97%|█████████▋| 5589/5772 [2:04:46<18:36, 6.10s/it] {'loss': 0.4589, 'learning_rate': 5.268953822380796e-08, 'epoch': 0.97} + 97%|█████████▋| 5589/5772 [2:04:46<18:36, 6.10s/it] {'loss': 0.4589, 'learning_rate': 5.268953822380796e-08, 'epoch': 0.97} + 97%|█████████▋| 5589/5772 [2:04:43<18:36, 6.10s/it] 97%|█████████▋| 5590/5772 [2:04:49<18:13, 6.01s/it] 97%|█████████▋| 5590/5772 [2:04:52<18:13, 6.01s/it] {'loss': 0.4518, 'learning_rate': 5.211576894572523e-08, 'epoch': 0.97} + 97%|█████████▋| 5590/5772 [2:04:52<18:13, 6.01s/it] {'loss': 0.4518, 'learning_rate': 5.211576894572523e-08, 'epoch': 0.97} + 97%|█████████▋| 5590/5772 [2:04:49<18:13, 6.01s/it] 97%|█████████▋| 5591/5772 [2:04:55<17:57, 5.95s/it] 97%|█████████▋| 5591/5772 [2:04:58<17:57, 5.95s/it] {'loss': 0.4601, 'learning_rate': 5.154513269847061e-08, 'epoch': 0.97} + 97%|█████████▋| 5591/5772 [2:04:58<17:57, 5.95s/it] {'loss': 0.4601, 'learning_rate': 5.154513269847061e-08, 'epoch': 0.97} + 97%|█████████▋| 5591/5772 [2:04:55<17:57, 5.95s/it] 97%|█████████▋| 5592/5772 [2:05:01<17:56, 5.98s/it] 97%|█████████▋| 5592/5772 [2:05:04<17:56, 5.98s/it] {'loss': 0.4462, 'learning_rate': 5.097762966176256e-08, 'epoch': 0.97} + 97%|█████████▋| 5592/5772 [2:05:04<17:56, 5.98s/it] {'loss': 0.4462, 'learning_rate': 5.097762966176256e-08, 'epoch': 0.97} + 97%|█████████▋| 5592/5772 [2:05:01<17:56, 5.98s/it] 97%|█████████▋| 5593/5772 [2:05:07<18:02, 6.05s/it] 97%|█████████▋| 5593/5772 [2:05:10<18:02, 6.05s/it] {'loss': 0.464, 'learning_rate': 5.041326001433366e-08, 'epoch': 0.97} + 97%|█████████▋| 5593/5772 [2:05:07<18:02, 6.05s/it] {'loss': 0.464, 'learning_rate': 5.041326001433366e-08, 'epoch': 0.97} + 97%|█████████▋| 5593/5772 [2:05:10<18:02, 6.05s/it] 97%|█████████▋| 5594/5772 [2:05:13<17:55, 6.04s/it] 97%|█████████▋| 5594/5772 [2:05:16<17:55, 6.04s/it] {'loss': 0.4583, 'learning_rate': 4.985202393392841e-08, 'epoch': 0.97} + 97%|█████████▋| 5594/5772 [2:05:16<17:55, 6.04s/it] {'loss': 0.4583, 'learning_rate': 4.985202393392841e-08, 'epoch': 0.97} + 97%|█████████▋| 5594/5772 [2:05:13<17:55, 6.04s/it] 97%|█████████▋| 5595/5772 [2:05:19<17:47, 6.03s/it] 97%|█████████▋| 5595/5772 [2:05:22<17:47, 6.03s/it] {'loss': 0.4461, 'learning_rate': 4.9293921597305396e-08, 'epoch': 0.97} + 97%|█████████▋| 5595/5772 [2:05:22<17:47, 6.03s/it] {'loss': 0.4461, 'learning_rate': 4.9293921597305396e-08, 'epoch': 0.97} + 97%|█████████▋| 5595/5772 [2:05:19<17:47, 6.03s/it] 97%|█████████▋| 5596/5772 [2:05:25<17:45, 6.05s/it] 97%|█████████▋| 5596/5772 [2:05:28<17:45, 6.05s/it] {'loss': 0.4444, 'learning_rate': 4.873895318023625e-08, 'epoch': 0.97} + 97%|█████████▋| 5596/5772 [2:05:28<17:45, 6.05s/it] {'loss': 0.4444, 'learning_rate': 4.873895318023625e-08, 'epoch': 0.97} + 97%|█████████▋| 5596/5772 [2:05:25<17:45, 6.05s/it] 97%|█████████▋| 5597/5772 [2:05:31<17:38, 6.05s/it] 97%|█████████▋| 5597/5772 [2:05:34<17:38, 6.05s/it] {'loss': 0.4643, 'learning_rate': 4.818711885750338e-08, 'epoch': 0.97} + 97%|█████████▋| 5597/5772 [2:05:34<17:38, 6.05s/it] {'loss': 0.4643, 'learning_rate': 4.818711885750338e-08, 'epoch': 0.97} + 97%|█████████▋| 5597/5772 [2:05:31<17:38, 6.05s/it] 97%|█████████▋| 5598/5772 [2:05:37<17:50, 6.15s/it] 97%|█████████▋| 5598/5772 [2:05:41<17:50, 6.15s/it] {'loss': 0.4594, 'learning_rate': 4.763841880290554e-08, 'epoch': 0.97} + 97%|█████████▋| 5598/5772 [2:05:41<17:50, 6.15s/it] {'loss': 0.4594, 'learning_rate': 4.763841880290554e-08, 'epoch': 0.97} + 97%|█████████▋| 5598/5772 [2:05:37<17:50, 6.15s/it] 97%|█████████▋| 5599/5772 [2:05:44<17:42, 6.14s/it] 97%|█████████▋| 5599/5772 [2:05:47<17:42, 6.14s/it] {'loss': 0.4535, 'learning_rate': 4.7092853189252273e-08, 'epoch': 0.97} + 97%|█████████▋| 5599/5772 [2:05:47<17:42, 6.14s/it] {'loss': 0.4535, 'learning_rate': 4.7092853189252273e-08, 'epoch': 0.97} + 97%|█████████▋| 5599/5772 [2:05:44<17:42, 6.14s/it]10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...8 +15AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 97%|█████████▋| 5600/5772 [2:05:50<17:36, 6.14s/it] 97%|█████████▋| 5600/5772 [2:05:53<17:36, 6.14s/it]16 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 4 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + {'loss': 0.4511, 'learning_rate': 4.655042218836725e-08, 'epoch': 0.97} + 97%|█████████▋| 5600/5772 [2:05:53<17:36, 6.14s/it] {'loss': 0.4511, 'learning_rate': 4.655042218836725e-08, 'epoch': 0.97} + 97%|█████████▋| 5600/5772 [2:05:50<17:36, 6.14s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 97%|█████████▋| 5601/5772 [2:06:14<30:01, 10.54s/it] 97%|█████████▋| 5601/5772 [2:06:11<30:01, 10.54s/it] {'loss': 0.4572, 'learning_rate': 4.6011125971084924e-08, 'epoch': 0.97} + 97%|█████████▋| 5601/5772 [2:06:14<30:01, 10.54s/it] {'loss': 0.4572, 'learning_rate': 4.6011125971084924e-08, 'epoch': 0.97} + 97%|█████████▋| 5601/5772 [2:06:11<30:01, 10.54s/it] 97%|█████████▋| 5602/5772 [2:06:20<26:04, 9.20s/it] 97%|█████████▋| 5602/5772 [2:06:17<26:04, 9.20s/it] {'loss': 0.4642, 'learning_rate': 4.547496470725388e-08, 'epoch': 0.97} + 97%|█████████▋| 5602/5772 [2:06:20<26:04, 9.20s/it] {'loss': 0.4642, 'learning_rate': 4.547496470725388e-08, 'epoch': 0.97} + 97%|█████████▋| 5602/5772 [2:06:17<26:04, 9.20s/it] 97%|█████████▋| 5603/5772 [2:06:23<23:14, 8.25s/it] 97%|█████████▋| 5603/5772 [2:06:26<23:14, 8.25s/it] {'loss': 0.4607, 'learning_rate': 4.49419385657357e-08, 'epoch': 0.97} + 97%|█████████▋| 5603/5772 [2:06:26<23:14, 8.25s/it] {'loss': 0.4607, 'learning_rate': 4.49419385657357e-08, 'epoch': 0.97} + 97%|█████████▋| 5603/5772 [2:06:23<23:14, 8.25s/it] 97%|█████████▋| 5604/5772 [2:06:29<21:08, 7.55s/it] 97%|█████████▋| 5604/5772 [2:06:32<21:08, 7.55s/it] {'loss': 0.4552, 'learning_rate': 4.4412047714402774e-08, 'epoch': 0.97} + 97%|█████████▋| 5604/5772 [2:06:32<21:08, 7.55s/it] {'loss': 0.4552, 'learning_rate': 4.4412047714402774e-08, 'epoch': 0.97} + 97%|█████████▋| 5604/5772 [2:06:29<21:08, 7.55s/it] 97%|█████████▋| 5605/5772 [2:06:37<19:33, 7.02s/it] 97%|█████████▋| 5605/5772 [2:06:34<19:33, 7.02s/it] {'loss': 0.457, 'learning_rate': 4.388529232014271e-08, 'epoch': 0.97} + 97%|█████████▋| 5605/5772 [2:06:37<19:33, 7.02s/it] {'loss': 0.457, 'learning_rate': 4.388529232014271e-08, 'epoch': 0.97} + 97%|█████████▋| 5605/5772 [2:06:34<19:33, 7.02s/it] 97%|█████████▋| 5606/5772 [2:06:44<18:42, 6.76s/it] 97%|█████████▋| 5606/5772 [2:06:41<18:42, 6.76s/it] {'loss': 0.4658, 'learning_rate': 4.336167254885393e-08, 'epoch': 0.97} + 97%|█████████▋| 5606/5772 [2:06:44<18:42, 6.76s/it] {'loss': 0.4658, 'learning_rate': 4.336167254885393e-08, 'epoch': 0.97} + 97%|█████████▋| 5606/5772 [2:06:41<18:42, 6.76s/it] 97%|█████████▋| 5607/5772 [2:06:47<18:19, 6.67s/it] 97%|█████████▋| 5607/5772 [2:06:50<18:20, 6.67s/it] {'loss': 0.474, 'learning_rate': 4.2841188565446724e-08, 'epoch': 0.97} + 97%|█████████▋| 5607/5772 [2:06:50<18:20, 6.67s/it] {'loss': 0.474, 'learning_rate': 4.2841188565446724e-08, 'epoch': 0.97} + 97%|█████████▋| 5607/5772 [2:06:47<18:19, 6.67s/it] 97%|█████████▋| 5608/5772 [2:06:56<17:49, 6.52s/it] 97%|█████████▋| 5608/5772 [2:06:53<17:49, 6.52s/it] {'loss': 0.4547, 'learning_rate': 4.232384053384553e-08, 'epoch': 0.97} + 97%|█████████▋| 5608/5772 [2:06:56<17:49, 6.52s/it] {'loss': 0.4547, 'learning_rate': 4.232384053384553e-08, 'epoch': 0.97} + 97%|█████████▋| 5608/5772 [2:06:53<17:49, 6.52s/it] 97%|█████████▋| 5609/5772 [2:06:59<17:09, 6.32s/it] 97%|█████████▋| 5609/5772 [2:07:02<17:09, 6.32s/it] {'loss': 0.454, 'learning_rate': 4.1809628616985564e-08, 'epoch': 0.97} + {'loss': 0.454, 'learning_rate': 4.1809628616985564e-08, 'epoch': 0.97} 97%|█████████▋| 5609/5772 [2:07:02<17:09, 6.32s/it] + 97%|█████████▋| 5609/5772 [2:06:59<17:09, 6.32s/it] 97%|█████████▋| 5610/5772 [2:07:05<17:06, 6.34s/it] 97%|█████████▋| 5610/5772 [2:07:08<17:06, 6.34s/it] {'loss': 0.4624, 'learning_rate': 4.129855297681618e-08, 'epoch': 0.97} + 97%|█████████▋| 5610/5772 [2:07:08<17:06, 6.34s/it] {'loss': 0.4624, 'learning_rate': 4.129855297681618e-08, 'epoch': 0.97} + 97%|█████████▋| 5610/5772 [2:07:05<17:06, 6.34s/it] 97%|█████████▋| 5611/5772 [2:07:11<16:50, 6.27s/it] 97%|█████████▋| 5611/5772 [2:07:15<16:50, 6.27s/it] {'loss': 0.4564, 'learning_rate': 4.0790613774295274e-08, 'epoch': 0.97} + 97%|█████████▋| 5611/5772 [2:07:15<16:50, 6.27s/it] {'loss': 0.4564, 'learning_rate': 4.0790613774295274e-08, 'epoch': 0.97} + 97%|█████████▋| 5611/5772 [2:07:11<16:50, 6.27s/it] 97%|█████████▋| 5612/5772 [2:07:21<16:57, 6.36s/it] 97%|█████████▋| 5612/5772 [2:07:18<16:59, 6.37s/it] {'loss': 0.452, 'learning_rate': 4.028581116939823e-08, 'epoch': 0.97} + 97%|█████████▋| 5612/5772 [2:07:21<16:57, 6.36s/it] {'loss': 0.452, 'learning_rate': 4.028581116939823e-08, 'epoch': 0.97} + 97%|█████████▋| 5612/5772 [2:07:18<16:59, 6.37s/it] 97%|█████████▋| 5613/5772 [2:07:28<16:59, 6.41s/it] 97%|█████████▋| 5613/5772 [2:07:25<16:58, 6.41s/it] {'loss': 0.4578, 'learning_rate': 3.978414532110897e-08, 'epoch': 0.97} + 97%|█████████▋| 5613/5772 [2:07:28<16:59, 6.41s/it] {'loss': 0.4578, 'learning_rate': 3.978414532110897e-08, 'epoch': 0.97} + 97%|█████████▋| 5613/5772 [2:07:25<16:58, 6.41s/it] 97%|█████████▋| 5614/5772 [2:07:34<16:55, 6.43s/it] 97%|█████████▋| 5614/5772 [2:07:31<16:54, 6.42s/it] {'loss': 0.447, 'learning_rate': 3.928561638742334e-08, 'epoch': 0.97} + 97%|█████████▋| 5614/5772 [2:07:34<16:55, 6.43s/it] {'loss': 0.447, 'learning_rate': 3.928561638742334e-08, 'epoch': 0.97} + 97%|█████████▋| 5614/5772 [2:07:31<16:54, 6.42s/it] 97%|█████████▋| 5615/5772 [2:07:37<16:36, 6.35s/it] 97%|█████████▋| 5615/5772 [2:07:40<16:36, 6.35s/it] {'loss': 0.4691, 'learning_rate': 3.8790224525352416e-08, 'epoch': 0.97} + 97%|█████████▋| 5615/5772 [2:07:40<16:36, 6.35s/it] {'loss': 0.4691, 'learning_rate': 3.8790224525352416e-08, 'epoch': 0.97} + 97%|█████████▋| 5615/5772 [2:07:37<16:36, 6.35s/it] 97%|█████████▋| 5616/5772 [2:07:43<16:24, 6.31s/it] 97%|█████████▋| 5616/5772 [2:07:46<16:25, 6.32s/it] {'loss': 0.4469, 'learning_rate': 3.829796989091472e-08, 'epoch': 0.97} + 97%|█████████▋| 5616/5772 [2:07:46<16:25, 6.32s/it] {'loss': 0.4469, 'learning_rate': 3.829796989091472e-08, 'epoch': 0.97} + 97%|█████████▋| 5616/5772 [2:07:43<16:24, 6.31s/it] 97%|█████████▋| 5617/5772 [2:07:53<16:13, 6.28s/it] 97%|█████████▋| 5617/5772 [2:07:50<16:13, 6.28s/it] {'loss': 0.4586, 'learning_rate': 3.780885263914402e-08, 'epoch': 0.97} + 97%|█████████▋| 5617/5772 [2:07:53<16:13, 6.28s/it] {'loss': 0.4586, 'learning_rate': 3.780885263914402e-08, 'epoch': 0.97} + 97%|█████████▋| 5617/5772 [2:07:50<16:13, 6.28s/it] 97%|█████████▋| 5618/5772 [2:07:59<15:47, 6.15s/it] 97%|█████████▋| 5618/5772 [2:07:55<15:47, 6.15s/it] {'loss': 0.4491, 'learning_rate': 3.7322872924084876e-08, 'epoch': 0.97} + 97%|█████████▋| 5618/5772 [2:07:59<15:47, 6.15s/it] {'loss': 0.4491, 'learning_rate': 3.7322872924084876e-08, 'epoch': 0.97} + 97%|█████████▋| 5618/5772 [2:07:56<15:47, 6.15s/it] 97%|█████████▋| 5619/5772 [2:08:05<15:46, 6.18s/it] 97%|█████████▋| 5619/5772 [2:08:02<15:46, 6.18s/it] {'loss': 0.4598, 'learning_rate': 3.684003089879484e-08, 'epoch': 0.97} + 97%|█████████▋| 5619/5772 [2:08:05<15:46, 6.18s/it] {'loss': 0.4598, 'learning_rate': 3.684003089879484e-08, 'epoch': 0.97} + 97%|█████████▋| 5619/5772 [2:08:02<15:46, 6.18s/it] 97%|█████████▋| 5620/5772 [2:08:11<15:37, 6.17s/it] 97%|█████████▋| 5620/5772 [2:08:08<15:37, 6.17s/it] {'loss': 0.4636, 'learning_rate': 3.636032671534229e-08, 'epoch': 0.97} + 97%|█████████▋| 5620/5772 [2:08:11<15:37, 6.17s/it] {'loss': 0.4636, 'learning_rate': 3.636032671534229e-08, 'epoch': 0.97} + 97%|█████████▋| 5620/5772 [2:08:08<15:37, 6.17s/it] 97%|█████████▋| 5621/5772 [2:08:17<15:25, 6.13s/it] 97%|█████████▋| 5621/5772 [2:08:14<15:25, 6.13s/it] {'loss': 0.4654, 'learning_rate': 3.5883760524805244e-08, 'epoch': 0.97} + 97%|█████████▋| 5621/5772 [2:08:17<15:25, 6.13s/it] {'loss': 0.4654, 'learning_rate': 3.5883760524805244e-08, 'epoch': 0.97} + 97%|█████████▋| 5621/5772 [2:08:14<15:25, 6.13s/it] 97%|█████████▋| 5622/5772 [2:08:23<15:07, 6.05s/it] 97%|█████████▋| 5622/5772 [2:08:20<15:07, 6.05s/it]{'loss': 0.4607, 'learning_rate': 3.5410332477278096e-08, 'epoch': 0.97} + {'loss': 0.4607, 'learning_rate': 3.5410332477278096e-08, 'epoch': 0.97} + 97%|█████████▋| 5622/5772 [2:08:23<15:07, 6.05s/it] 97%|█████████▋| 5622/5772 [2:08:20<15:07, 6.05s/it] 97%|█████████▋| 5623/5772 [2:08:26<15:03, 6.07s/it] 97%|█████████▋| 5623/5772 [2:08:29<15:03, 6.07s/it] {'loss': 0.4674, 'learning_rate': 3.49400427218638e-08, 'epoch': 0.97} + 97%|█████████▋| 5623/5772 [2:08:29<15:03, 6.07s/it] {'loss': 0.4674, 'learning_rate': 3.49400427218638e-08, 'epoch': 0.97} + 97%|█████████▋| 5623/5772 [2:08:26<15:03, 6.07s/it] 97%|█████████▋| 5624/5772 [2:08:32<15:07, 6.13s/it] 97%|█████████▋| 5624/5772 [2:08:35<15:07, 6.13s/it] {'loss': 0.4592, 'learning_rate': 3.447289140667609e-08, 'epoch': 0.97} + 97%|█████████▋| 5624/5772 [2:08:35<15:07, 6.13s/it] {'loss': 0.4592, 'learning_rate': 3.447289140667609e-08, 'epoch': 0.97} + 97%|█████████▋| 5624/5772 [2:08:32<15:07, 6.13s/it] 97%|█████████▋| 5625/5772 [2:08:41<15:04, 6.15s/it] 97%|█████████▋| 5625/5772 [2:08:38<15:04, 6.15s/it] {'loss': 0.4642, 'learning_rate': 3.4008878678843946e-08, 'epoch': 0.97} + 97%|█████████▋| 5625/5772 [2:08:41<15:04, 6.15s/it] {'loss': 0.4642, 'learning_rate': 3.4008878678843946e-08, 'epoch': 0.97} + 97%|█████████▋| 5625/5772 [2:08:38<15:04, 6.15s/it] 97%|█████████▋| 5626/5772 [2:08:48<15:20, 6.31s/it] 97%|█████████▋| 5626/5772 [2:08:45<15:20, 6.31s/it] {'loss': 0.4505, 'learning_rate': 3.35480046845027e-08, 'epoch': 0.97} + 97%|█████████▋| 5626/5772 [2:08:48<15:20, 6.31s/it] {'loss': 0.4505, 'learning_rate': 3.35480046845027e-08, 'epoch': 0.97} + 97%|█████████▋| 5626/5772 [2:08:45<15:20, 6.31s/it] 97%|█████████▋| 5627/5772 [2:08:54<15:06, 6.25s/it] 97%|█████████▋| 5627/5772 [2:08:51<15:06, 6.25s/it]{'loss': 0.464, 'learning_rate': 3.309026956880512e-08, 'epoch': 0.97} + {'loss': 0.464, 'learning_rate': 3.309026956880512e-08, 'epoch': 0.97} 97%|█████████▋| 5627/5772 [2:08:54<15:06, 6.25s/it] + 97%|█████████▋| 5627/5772 [2:08:51<15:06, 6.25s/it] 98%|█████████▊| 5628/5772 [2:09:00<14:48, 6.17s/it] 98%|█████████▊| 5628/5772 [2:08:57<14:48, 6.17s/it] {'loss': 0.4551, 'learning_rate': 3.2635673475910345e-08, 'epoch': 0.97} + 98%|█████████▊| 5628/5772 [2:09:00<14:48, 6.17s/it] {'loss': 0.4551, 'learning_rate': 3.2635673475910345e-08, 'epoch': 0.97} + 98%|█████████▊| 5628/5772 [2:08:57<14:48, 6.17s/it] 98%|█████████▊| 5629/5772 [2:09:07<14:50, 6.23s/it] 98%|█████████▊| 5629/5772 [2:09:04<14:50, 6.23s/it] {'loss': 0.4552, 'learning_rate': 3.218421654899162e-08, 'epoch': 0.98} + 98%|█████████▊| 5629/5772 [2:09:07<14:50, 6.23s/it] {'loss': 0.4552, 'learning_rate': 3.218421654899162e-08, 'epoch': 0.98} + 98%|█████████▊| 5629/5772 [2:09:04<14:50, 6.23s/it] 98%|█████████▊| 5630/5772 [2:09:10<14:47, 6.25s/it] 98%|█████████▊| 5630/5772 [2:09:13<14:48, 6.25s/it] {'loss': 0.4475, 'learning_rate': 3.173589893023188e-08, 'epoch': 0.98} + {'loss': 0.4475, 'learning_rate': 3.173589893023188e-08, 'epoch': 0.98} 98%|█████████▊| 5630/5772 [2:09:13<14:48, 6.25s/it] + 98%|█████████▊| 5630/5772 [2:09:10<14:47, 6.25s/it] 98%|█████████▊| 5631/5772 [2:09:19<14:43, 6.27s/it] 98%|█████████▊| 5631/5772 [2:09:16<14:43, 6.27s/it] {'loss': 0.4659, 'learning_rate': 3.1290720760828176e-08, 'epoch': 0.98} + 98%|█████████▊| 5631/5772 [2:09:19<14:43, 6.27s/it] {'loss': 0.4659, 'learning_rate': 3.1290720760828176e-08, 'epoch': 0.98} + 98%|█████████▊| 5631/5772 [2:09:16<14:43, 6.27s/it] 98%|█████████▊| 5632/5772 [2:09:25<14:24, 6.18s/it] 98%|█████████▊| 5632/5772 [2:09:22<14:24, 6.18s/it] {'loss': 0.4481, 'learning_rate': 3.0848682180985025e-08, 'epoch': 0.98} + 98%|█████████▊| 5632/5772 [2:09:25<14:24, 6.18s/it] {'loss': 0.4481, 'learning_rate': 3.0848682180985025e-08, 'epoch': 0.98} + 98%|█████████▊| 5632/5772 [2:09:22<14:24, 6.18s/it] 98%|█████████▊| 5633/5772 [2:09:31<14:15, 6.15s/it] 98%|█████████▊| 5633/5772 [2:09:28<14:15, 6.15s/it] {'loss': 0.4626, 'learning_rate': 3.040978332992106e-08, 'epoch': 0.98} + 98%|█████████▊| 5633/5772 [2:09:31<14:15, 6.15s/it] {'loss': 0.4626, 'learning_rate': 3.040978332992106e-08, 'epoch': 0.98} + 98%|█████████▊| 5633/5772 [2:09:28<14:15, 6.15s/it] 98%|█████████▊| 5634/5772 [2:09:34<14:07, 6.14s/it] 98%|█████████▊| 5634/5772 [2:09:37<14:07, 6.14s/it] {'loss': 0.457, 'learning_rate': 2.9974024345864604e-08, 'epoch': 0.98} + 98%|█████████▊| 5634/5772 [2:09:37<14:07, 6.14s/it] {'loss': 0.457, 'learning_rate': 2.9974024345864604e-08, 'epoch': 0.98} + 98%|█████████▊| 5634/5772 [2:09:34<14:07, 6.14s/it] 98%|█████████▊| 5635/5772 [2:09:44<14:01, 6.14s/it] 98%|█████████▊| 5635/5772 [2:09:40<14:01, 6.14s/it] {'loss': 0.4577, 'learning_rate': 2.9541405366054764e-08, 'epoch': 0.98} + 98%|█████████▊| 5635/5772 [2:09:44<14:01, 6.14s/it] {'loss': 0.4577, 'learning_rate': 2.9541405366054764e-08, 'epoch': 0.98} + 98%|█████████▊| 5635/5772 [2:09:40<14:01, 6.14s/it] 98%|█████████▊| 5636/5772 [2:09:47<13:53, 6.13s/it] 98%|█████████▊| 5636/5772 [2:09:50<13:53, 6.13s/it] {'loss': 0.4611, 'learning_rate': 2.9111926526744772e-08, 'epoch': 0.98} + 98%|█████████▊| 5636/5772 [2:09:50<13:53, 6.13s/it] {'loss': 0.4611, 'learning_rate': 2.9111926526744772e-08, 'epoch': 0.98} + 98%|█████████▊| 5636/5772 [2:09:47<13:53, 6.13s/it] 98%|█████████▊| 5637/5772 [2:09:53<13:53, 6.18s/it] 98%|█████████▊| 5637/5772 [2:09:56<13:53, 6.18s/it] {'loss': 0.4485, 'learning_rate': 2.8685587963194206e-08, 'epoch': 0.98} + 98%|█████████▊| 5637/5772 [2:09:56<13:53, 6.18s/it] {'loss': 0.4485, 'learning_rate': 2.8685587963194206e-08, 'epoch': 0.98} + 98%|█████████▊| 5637/5772 [2:09:53<13:53, 6.18s/it] 98%|█████████▊| 5638/5772 [2:09:59<13:55, 6.23s/it] 98%|█████████▊| 5638/5772 [2:10:02<13:55, 6.23s/it] {'loss': 0.453, 'learning_rate': 2.826238980967788e-08, 'epoch': 0.98} + 98%|█████████▊| 5638/5772 [2:10:02<13:55, 6.23s/it] {'loss': 0.453, 'learning_rate': 2.826238980967788e-08, 'epoch': 0.98} + 98%|█████████▊| 5638/5772 [2:09:59<13:55, 6.23s/it] 98%|█████████▊| 5639/5772 [2:10:05<13:42, 6.18s/it] 98%|█████████▊| 5639/5772 [2:10:08<13:42, 6.18s/it] {'loss': 0.4563, 'learning_rate': 2.7842332199478074e-08, 'epoch': 0.98} + 98%|█████████▊| 5639/5772 [2:10:08<13:42, 6.18s/it] {'loss': 0.4563, 'learning_rate': 2.7842332199478074e-08, 'epoch': 0.98} + 98%|█████████▊| 5639/5772 [2:10:05<13:42, 6.18s/it] 98%|█████████▊| 5640/5772 [2:10:12<13:47, 6.27s/it] 98%|█████████▊| 5640/5772 [2:10:15<13:47, 6.27s/it] {'loss': 0.4569, 'learning_rate': 2.7425415264890065e-08, 'epoch': 0.98} + 98%|█████████▊| 5640/5772 [2:10:15<13:47, 6.27s/it] {'loss': 0.4569, 'learning_rate': 2.7425415264890065e-08, 'epoch': 0.98} + 98%|█████████▊| 5640/5772 [2:10:12<13:47, 6.27s/it] 98%|█████████▊| 5641/5772 [2:10:18<13:30, 6.18s/it] 98%|█████████▊| 5641/5772 [2:10:21<13:30, 6.18s/it] {'loss': 0.4554, 'learning_rate': 2.7011639137221046e-08, 'epoch': 0.98} + 98%|█████████▊| 5641/5772 [2:10:21<13:30, 6.18s/it] {'loss': 0.4554, 'learning_rate': 2.7011639137221046e-08, 'epoch': 0.98} + 98%|█████████▊| 5641/5772 [2:10:18<13:30, 6.18s/it] 98%|█████████▊| 5642/5772 [2:10:24<13:22, 6.17s/it] 98%|█████████▊| 5642/5772 [2:10:27<13:22, 6.17s/it] {'loss': 0.4503, 'learning_rate': 2.6601003946784555e-08, 'epoch': 0.98} + 98%|█████████▊| 5642/5772 [2:10:27<13:22, 6.17s/it] {'loss': 0.4503, 'learning_rate': 2.6601003946784555e-08, 'epoch': 0.98} + 98%|█████████▊| 5642/5772 [2:10:24<13:22, 6.17s/it] 98%|█████████▊| 5643/5772 [2:10:33<13:05, 6.09s/it] 98%|█████████▊| 5643/5772 [2:10:30<13:05, 6.09s/it] {'loss': 0.4741, 'learning_rate': 2.6193509822910466e-08, 'epoch': 0.98} + 98%|█████████▊| 5643/5772 [2:10:30<13:05, 6.09s/it]{'loss': 0.4741, 'learning_rate': 2.6193509822910466e-08, 'epoch': 0.98} + 98%|█████████▊| 5643/5772 [2:10:33<13:05, 6.09s/it] 98%|█████████▊| 5644/5772 [2:10:36<13:11, 6.18s/it] 98%|█████████▊| 5644/5772 [2:10:39<13:11, 6.18s/it] {'loss': 0.4402, 'learning_rate': 2.578915689393613e-08, 'epoch': 0.98} + 98%|█████████▊| 5644/5772 [2:10:39<13:11, 6.18s/it] {'loss': 0.4402, 'learning_rate': 2.578915689393613e-08, 'epoch': 0.98} + 98%|█████████▊| 5644/5772 [2:10:36<13:11, 6.18s/it] 98%|█████████▊| 5645/5772 [2:10:45<12:46, 6.04s/it] 98%|█████████▊| 5645/5772 [2:10:42<12:46, 6.04s/it] {'loss': 0.4572, 'learning_rate': 2.538794528720967e-08, 'epoch': 0.98} + 98%|█████████▊| 5645/5772 [2:10:45<12:46, 6.04s/it] {'loss': 0.4572, 'learning_rate': 2.538794528720967e-08, 'epoch': 0.98} + 98%|█████████▊| 5645/5772 [2:10:42<12:46, 6.04s/it] 98%|█████████▊| 5646/5772 [2:10:48<12:41, 6.05s/it] 98%|█████████▊| 5646/5772 [2:10:51<12:41, 6.05s/it] {'loss': 0.4563, 'learning_rate': 2.4989875129091124e-08, 'epoch': 0.98} {'loss': 0.4563, 'learning_rate': 2.4989875129091124e-08, 'epoch': 0.98} + 98%|█████████▊| 5646/5772 [2:10:51<12:41, 6.05s/it] + 98%|█████████▊| 5646/5772 [2:10:48<12:41, 6.05s/it] 98%|█████████▊| 5647/5772 [2:10:54<12:40, 6.09s/it] 98%|█████████▊| 5647/5772 [2:10:57<12:40, 6.09s/it] {'loss': 0.4601, 'learning_rate': 2.4594946544949094e-08, 'epoch': 0.98} + 98%|█████████▊| 5647/5772 [2:10:57<12:40, 6.09s/it] {'loss': 0.4601, 'learning_rate': 2.4594946544949094e-08, 'epoch': 0.98} + 98%|█████████▊| 5647/5772 [2:10:54<12:40, 6.09s/it] 98%|█████████▊| 5648/5772 [2:11:00<12:41, 6.14s/it] 98%|█████████▊| 5648/5772 [2:11:03<12:41, 6.14s/it] {'loss': 0.463, 'learning_rate': 2.420315965916631e-08, 'epoch': 0.98} + 98%|█████████▊| 5648/5772 [2:11:03<12:41, 6.14s/it] {'loss': 0.463, 'learning_rate': 2.420315965916631e-08, 'epoch': 0.98} + 98%|█████████▊| 5648/5772 [2:11:00<12:41, 6.14s/it] 98%|█████████▊| 5649/5772 [2:11:06<12:29, 6.09s/it] 98%|█████████▊| 5649/5772 [2:11:09<12:29, 6.09s/it] {'loss': 0.4661, 'learning_rate': 2.3814514595132955e-08, 'epoch': 0.98} + {'loss': 0.4661, 'learning_rate': 2.3814514595132955e-08, 'epoch': 0.98} 98%|█████████▊| 5649/5772 [2:11:09<12:29, 6.09s/it] + 98%|█████████▊| 5649/5772 [2:11:06<12:29, 6.09s/it]2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 98%|█████████▊| 5650/5772 [2:11:13<12:28, 6.14s/it] 98%|█████████▊| 5650/5772 [2:11:16<12:28, 6.14s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4664, 'learning_rate': 2.3429011475250008e-08, 'epoch': 0.98} +{'loss': 0.4664, 'learning_rate': 2.3429011475250008e-08, 'epoch': 0.98} + 98%|█████████▊| 5650/5772 [2:11:16<12:28, 6.14s/it] 98%|█████████▊| 5650/5772 [2:11:13<12:28, 6.14s/it] 98%|█████████▊| 5651/5772 [2:11:19<12:19, 6.11s/it] 98%|█████████▊| 5651/5772 [2:11:22<12:19, 6.11s/it] {'loss': 0.4606, 'learning_rate': 2.304665042092924e-08, 'epoch': 0.98} + 98%|█████████▊| 5651/5772 [2:11:22<12:19, 6.11s/it] {'loss': 0.4606, 'learning_rate': 2.304665042092924e-08, 'epoch': 0.98} + 98%|█████████▊| 5651/5772 [2:11:19<12:19, 6.11s/it] 98%|█████████▊| 5652/5772 [2:11:28<12:17, 6.14s/it] 98%|█████████▊| 5652/5772 [2:11:25<12:17, 6.14s/it] {'loss': 0.457, 'learning_rate': 2.266743155259432e-08, 'epoch': 0.98} + 98%|█████████▊| 5652/5772 [2:11:28<12:17, 6.14s/it] {'loss': 0.457, 'learning_rate': 2.266743155259432e-08, 'epoch': 0.98} + 98%|█████████▊| 5652/5772 [2:11:25<12:17, 6.14s/it] 98%|█████████▊| 5653/5772 [2:11:34<12:15, 6.18s/it] 98%|█████████▊| 5653/5772 [2:11:31<12:15, 6.18s/it] {'loss': 0.458, 'learning_rate': 2.2291354989677492e-08, 'epoch': 0.98} + 98%|█████████▊| 5653/5772 [2:11:34<12:15, 6.18s/it] {'loss': 0.458, 'learning_rate': 2.2291354989677492e-08, 'epoch': 0.98} + 98%|█████████▊| 5653/5772 [2:11:31<12:15, 6.18s/it] 98%|█████████▊| 5654/5772 [2:11:37<12:01, 6.11s/it] 98%|█████████▊| 5654/5772 [2:11:40<12:01, 6.11s/it] {'loss': 0.453, 'learning_rate': 2.19184208506229e-08, 'epoch': 0.98} + 98%|█████████▊| 5654/5772 [2:11:37<12:01, 6.11s/it] {'loss': 0.453, 'learning_rate': 2.19184208506229e-08, 'epoch': 0.98} + 98%|█████████▊| 5654/5772 [2:11:40<12:01, 6.11s/it] 98%|█████████▊| 5655/5772 [2:11:43<12:04, 6.20s/it] 98%|█████████▊| 5655/5772 [2:11:47<12:04, 6.20s/it]{'loss': 0.4644, 'learning_rate': 2.154862925288326e-08, 'epoch': 0.98} + 98%|█████████▊| 5655/5772 [2:11:47<12:04, 6.20s/it] {'loss': 0.4644, 'learning_rate': 2.154862925288326e-08, 'epoch': 0.98} + 98%|█████████▊| 5655/5772 [2:11:43<12:04, 6.20s/it] 98%|█████████▊| 5656/5772 [2:11:49<11:42, 6.06s/it] 98%|█████████▊| 5656/5772 [2:11:52<11:42, 6.06s/it] {'loss': 0.4466, 'learning_rate': 2.118198031292207e-08, 'epoch': 0.98} + 98%|█████████▊| 5656/5772 [2:11:52<11:42, 6.06s/it] {'loss': 0.4466, 'learning_rate': 2.118198031292207e-08, 'epoch': 0.98} + 98%|█████████▊| 5656/5772 [2:11:49<11:42, 6.06s/it] 98%|█████████▊| 5657/5772 [2:11:55<11:34, 6.04s/it] 98%|█████████▊| 5657/5772 [2:11:58<11:34, 6.04s/it] {'loss': 0.4588, 'learning_rate': 2.0818474146212518e-08, 'epoch': 0.98} + 98%|█████████▊| 5657/5772 [2:11:58<11:34, 6.04s/it] {'loss': 0.4588, 'learning_rate': 2.0818474146212518e-08, 'epoch': 0.98} + 98%|█████████▊| 5657/5772 [2:11:55<11:34, 6.04s/it] 98%|█████████▊| 5658/5772 [2:12:05<11:35, 6.10s/it] 98%|█████████▊| 5658/5772 [2:12:01<11:35, 6.10s/it] {'loss': 0.4617, 'learning_rate': 2.045811086724192e-08, 'epoch': 0.98} + 98%|█████████▊| 5658/5772 [2:12:05<11:35, 6.10s/it] {'loss': 0.4617, 'learning_rate': 2.045811086724192e-08, 'epoch': 0.98} + 98%|█████████▊| 5658/5772 [2:12:01<11:35, 6.10s/it] 98%|█████████▊| 5659/5772 [2:12:07<11:19, 6.01s/it] 98%|█████████▊| 5659/5772 [2:12:10<11:19, 6.01s/it] {'loss': 0.4629, 'learning_rate': 2.010089058950171e-08, 'epoch': 0.98} + 98%|█████████▊| 5659/5772 [2:12:10<11:19, 6.01s/it] {'loss': 0.4629, 'learning_rate': 2.010089058950171e-08, 'epoch': 0.98} + 98%|█████████▊| 5659/5772 [2:12:07<11:19, 6.01s/it] 98%|█████████▊| 5660/5772 [2:12:13<11:15, 6.03s/it] 98%|█████████▊| 5660/5772 [2:12:16<11:15, 6.03s/it] {'loss': 0.4552, 'learning_rate': 1.9746813425498555e-08, 'epoch': 0.98} + 98%|█████████▊| 5660/5772 [2:12:16<11:15, 6.03s/it] {'loss': 0.4552, 'learning_rate': 1.9746813425498555e-08, 'epoch': 0.98} + 98%|█████████▊| 5660/5772 [2:12:13<11:15, 6.03s/it] 98%|█████████▊| 5661/5772 [2:12:19<11:11, 6.05s/it] 98%|█████████▊| 5661/5772 [2:12:22<11:11, 6.05s/it] {'loss': 0.4621, 'learning_rate': 1.9395879486745483e-08, 'epoch': 0.98} + 98%|█████████▊| 5661/5772 [2:12:22<11:11, 6.05s/it] {'loss': 0.4621, 'learning_rate': 1.9395879486745483e-08, 'epoch': 0.98} + 98%|█████████▊| 5661/5772 [2:12:19<11:11, 6.05s/it] 98%|█████████▊| 5662/5772 [2:12:25<11:02, 6.02s/it] 98%|█████████▊| 5662/5772 [2:12:28<11:02, 6.02s/it] {'loss': 0.453, 'learning_rate': 1.9048088883767414e-08, 'epoch': 0.98} + 98%|█████████▊| 5662/5772 [2:12:28<11:02, 6.02s/it] {'loss': 0.453, 'learning_rate': 1.9048088883767414e-08, 'epoch': 0.98} + 98%|█████████▊| 5662/5772 [2:12:25<11:02, 6.02s/it] 98%|█████████▊| 5663/5772 [2:12:32<11:04, 6.10s/it] 98%|█████████▊| 5663/5772 [2:12:35<11:04, 6.10s/it] {'loss': 0.4623, 'learning_rate': 1.870344172610006e-08, 'epoch': 0.98} + 98%|█████████▊| 5663/5772 [2:12:35<11:04, 6.10s/it] {'loss': 0.4623, 'learning_rate': 1.870344172610006e-08, 'epoch': 0.98} + 98%|█████████▊| 5663/5772 [2:12:32<11:04, 6.10s/it] 98%|█████████▊| 5664/5772 [2:12:38<11:00, 6.11s/it] 98%|█████████▊| 5664/5772 [2:12:41<11:00, 6.11s/it] {'loss': 0.4524, 'learning_rate': 1.8361938122287704e-08, 'epoch': 0.98} + {'loss': 0.4524, 'learning_rate': 1.8361938122287704e-08, 'epoch': 0.98} 98%|█████████▊| 5664/5772 [2:12:41<11:00, 6.11s/it] + 98%|█████████▊| 5664/5772 [2:12:38<11:00, 6.11s/it] 98%|█████████▊| 5665/5772 [2:12:44<10:59, 6.16s/it] 98%|█████████▊| 5665/5772 [2:12:47<10:59, 6.17s/it] {'loss': 0.4618, 'learning_rate': 1.8023578179884315e-08, 'epoch': 0.98} + 98%|█████████▊| 5665/5772 [2:12:47<10:59, 6.17s/it] {'loss': 0.4618, 'learning_rate': 1.8023578179884315e-08, 'epoch': 0.98} + 98%|█████████▊| 5665/5772 [2:12:44<10:59, 6.16s/it] 98%|█████████▊| 5666/5772 [2:12:50<10:52, 6.15s/it] 98%|█████████▊| 5666/5772 [2:12:53<10:52, 6.15s/it] {'loss': 0.4413, 'learning_rate': 1.7688362005454653e-08, 'epoch': 0.98} + 98%|█████████▊| 5666/5772 [2:12:53<10:52, 6.15s/it] {'loss': 0.4413, 'learning_rate': 1.7688362005454653e-08, 'epoch': 0.98} + 98%|█████████▊| 5666/5772 [2:12:50<10:52, 6.15s/it] 98%|█████████▊| 5667/5772 [2:12:56<10:43, 6.12s/it] {'loss': 0.4618, 'learning_rate': 1.7356289704574257e-08, 'epoch': 0.98} + 98%|█████████▊| 5667/5772 [2:12:56<10:43, 6.12s/it] 98%|█████████▊| 5667/5772 [2:12:59<10:43, 6.12s/it] {'loss': 0.4618, 'learning_rate': 1.7356289704574257e-08, 'epoch': 0.98} + 98%|█████████▊| 5667/5772 [2:12:59<10:43, 6.12s/it] 98%|█████████▊| 5668/5772 [2:13:02<10:38, 6.14s/it] 98%|█████████▊| 5668/5772 [2:13:05<10:38, 6.14s/it] {'loss': 0.4502, 'learning_rate': 1.7027361381826147e-08, 'epoch': 0.98} + 98%|█████████▊| 5668/5772 [2:13:05<10:38, 6.14s/it] {'loss': 0.4502, 'learning_rate': 1.7027361381826147e-08, 'epoch': 0.98} + 98%|█████████▊| 5668/5772 [2:13:02<10:38, 6.14s/it] 98%|█████████▊| 5669/5772 [2:13:08<10:26, 6.09s/it] 98%|█████████▊| 5669/5772 [2:13:11<10:26, 6.09s/it] {'loss': 0.4669, 'learning_rate': 1.6701577140805225e-08, 'epoch': 0.98} + 98%|█████████▊| 5669/5772 [2:13:11<10:26, 6.09s/it]{'loss': 0.4669, 'learning_rate': 1.6701577140805225e-08, 'epoch': 0.98} + 98%|█████████▊| 5669/5772 [2:13:08<10:26, 6.09s/it] 98%|█████████▊| 5670/5772 [2:13:14<10:16, 6.05s/it] 98%|█████████▊| 5670/5772 [2:13:17<10:16, 6.05s/it] {'loss': 0.4464, 'learning_rate': 1.6378937084114978e-08, 'epoch': 0.98} + 98%|█████████▊| 5670/5772 [2:13:17<10:16, 6.05s/it] {'loss': 0.4464, 'learning_rate': 1.6378937084114978e-08, 'epoch': 0.98} + 98%|█████████▊| 5670/5772 [2:13:14<10:16, 6.05s/it] 98%|█████████▊| 5671/5772 [2:13:20<10:11, 6.05s/it] 98%|█████████▊| 5671/5772 [2:13:23<10:11, 6.05s/it] {'loss': 0.4608, 'learning_rate': 1.6059441313369672e-08, 'epoch': 0.98} + 98%|█████████▊| 5671/5772 [2:13:23<10:11, 6.05s/it] {'loss': 0.4608, 'learning_rate': 1.6059441313369672e-08, 'epoch': 0.98} + 98%|█████████▊| 5671/5772 [2:13:20<10:11, 6.05s/it] 98%|█████████▊| 5672/5772 [2:13:27<10:14, 6.14s/it] 98%|█████████▊| 5672/5772 [2:13:30<10:14, 6.14s/it] {'loss': 0.4479, 'learning_rate': 1.5743089929193266e-08, 'epoch': 0.98} + 98%|█████████▊| 5672/5772 [2:13:30<10:14, 6.14s/it] {'loss': 0.4479, 'learning_rate': 1.5743089929193266e-08, 'epoch': 0.98} + 98%|█████████▊| 5672/5772 [2:13:27<10:14, 6.14s/it] 98%|█████████▊| 5673/5772 [2:13:33<10:05, 6.12s/it] 98%|█████████▊| 5673/5772 [2:13:36<10:05, 6.12s/it] {'loss': 0.4573, 'learning_rate': 1.5429883031217173e-08, 'epoch': 0.98} + 98%|█████████▊| 5673/5772 [2:13:36<10:05, 6.12s/it] {'loss': 0.4573, 'learning_rate': 1.5429883031217173e-08, 'epoch': 0.98} + 98%|█████████▊| 5673/5772 [2:13:33<10:05, 6.12s/it] 98%|█████████▊| 5674/5772 [2:13:42<09:47, 6.00s/it] {'loss': 0.4414, 'learning_rate': 1.511982071808471e-08, 'epoch': 0.98} + 98%|█████████▊| 5674/5772 [2:13:42<09:47, 6.00s/it] 98%|█████████▊| 5674/5772 [2:13:39<09:47, 6.00s/it] {'loss': 0.4414, 'learning_rate': 1.511982071808471e-08, 'epoch': 0.98} + 98%|█████████▊| 5674/5772 [2:13:39<09:47, 6.00s/it] 98%|█████████▊| 5675/5772 [2:13:45<09:46, 6.05s/it] 98%|█████████▊| 5675/5772 [2:13:48<09:46, 6.05s/it] {'loss': 0.452, 'learning_rate': 1.4812903087448872e-08, 'epoch': 0.98} + 98%|█████████▊| 5675/5772 [2:13:48<09:46, 6.05s/it] {'loss': 0.452, 'learning_rate': 1.4812903087448872e-08, 'epoch': 0.98} + 98%|█████████▊| 5675/5772 [2:13:45<09:46, 6.05s/it] 98%|█████████▊| 5676/5772 [2:13:51<09:38, 6.02s/it] 98%|█████████▊| 5676/5772 [2:13:54<09:38, 6.02s/it]{'loss': 0.4685, 'learning_rate': 1.4509130235971226e-08, 'epoch': 0.98} {'loss': 0.4685, 'learning_rate': 1.4509130235971226e-08, 'epoch': 0.98} + + 98%|█████████▊| 5676/5772 [2:13:54<09:38, 6.02s/it] 98%|█████████▊| 5676/5772 [2:13:51<09:38, 6.02s/it] 98%|█████████▊| 5677/5772 [2:13:57<09:36, 6.07s/it] 98%|█████████▊| 5677/5772 [2:14:00<09:36, 6.07s/it] {'loss': 0.4814, 'learning_rate': 1.420850225932302e-08, 'epoch': 0.98} + 98%|█████████▊| 5677/5772 [2:14:00<09:36, 6.07s/it] {'loss': 0.4814, 'learning_rate': 1.420850225932302e-08, 'epoch': 0.98} + 98%|█████████▊| 5677/5772 [2:13:57<09:36, 6.07s/it] 98%|█████████▊| 5678/5772 [2:14:03<09:42, 6.20s/it] 98%|█████████▊| 5678/5772 [2:14:06<09:42, 6.20s/it] {'loss': 0.4546, 'learning_rate': 1.3911019252187408e-08, 'epoch': 0.98} + 98%|█████████▊| 5678/5772 [2:14:06<09:42, 6.20s/it] {'loss': 0.4546, 'learning_rate': 1.3911019252187408e-08, 'epoch': 0.98} + 98%|█████████▊| 5678/5772 [2:14:03<09:42, 6.20s/it] 98%|█████████▊| 5679/5772 [2:14:10<09:37, 6.21s/it] 98%|█████████▊| 5679/5772 [2:14:13<09:37, 6.21s/it] {'loss': 0.46, 'learning_rate': 1.3616681308251667e-08, 'epoch': 0.98} + 98%|█████████▊| 5679/5772 [2:14:13<09:37, 6.21s/it] {'loss': 0.46, 'learning_rate': 1.3616681308251667e-08, 'epoch': 0.98} + 98%|█████████▊| 5679/5772 [2:14:10<09:37, 6.21s/it] 98%|█████████▊| 5680/5772 [2:14:16<09:32, 6.23s/it] 98%|█████████▊| 5680/5772 [2:14:19<09:32, 6.23s/it] {'loss': 0.4617, 'learning_rate': 1.332548852021831e-08, 'epoch': 0.98} + 98%|█████████▊| 5680/5772 [2:14:19<09:32, 6.23s/it]{'loss': 0.4617, 'learning_rate': 1.332548852021831e-08, 'epoch': 0.98} + 98%|█████████▊| 5680/5772 [2:14:16<09:32, 6.23s/it] 98%|█████████▊| 5681/5772 [2:14:23<09:41, 6.39s/it] 98%|█████████▊| 5681/5772 [2:14:26<09:41, 6.39s/it] {'loss': 0.4526, 'learning_rate': 1.30374409797962e-08, 'epoch': 0.98} + 98%|█████████▊| 5681/5772 [2:14:26<09:41, 6.39s/it] {'loss': 0.4526, 'learning_rate': 1.30374409797962e-08, 'epoch': 0.98} + 98%|█████████▊| 5681/5772 [2:14:23<09:41, 6.39s/it] 98%|█████████▊| 5682/5772 [2:14:29<09:36, 6.41s/it] 98%|█████████▊| 5682/5772 [2:14:32<09:36, 6.41s/it] {'loss': 0.4703, 'learning_rate': 1.2752538777704993e-08, 'epoch': 0.98} + 98%|█████████▊| 5682/5772 [2:14:32<09:36, 6.41s/it] {'loss': 0.4703, 'learning_rate': 1.2752538777704993e-08, 'epoch': 0.98} + 98%|█████████▊| 5682/5772 [2:14:29<09:36, 6.41s/it] 98%|█████████▊| 5683/5772 [2:14:35<09:26, 6.37s/it] 98%|█████████▊| 5683/5772 [2:14:38<09:26, 6.37s/it] {'loss': 0.4595, 'learning_rate': 1.24707820036718e-08, 'epoch': 0.98} + 98%|█████████▊| 5683/5772 [2:14:38<09:26, 6.37s/it] {'loss': 0.4595, 'learning_rate': 1.24707820036718e-08, 'epoch': 0.98} + 98%|█████████▊| 5683/5772 [2:14:35<09:26, 6.37s/it] 98%|█████████▊| 5684/5772 [2:14:41<09:13, 6.28s/it] 98%|█████████▊| 5684/5772 [2:14:45<09:13, 6.28s/it] {'loss': 0.4681, 'learning_rate': 1.2192170746434529e-08, 'epoch': 0.98} + 98%|█████████▊| 5684/5772 [2:14:45<09:13, 6.28s/it] {'loss': 0.4681, 'learning_rate': 1.2192170746434529e-08, 'epoch': 0.98} + 98%|█████████▊| 5684/5772 [2:14:41<09:13, 6.28s/it] 98%|█████████▊| 5685/5772 [2:14:47<08:59, 6.20s/it] 98%|█████████▊| 5685/5772 [2:14:51<08:59, 6.20s/it] {'loss': 0.4605, 'learning_rate': 1.1916705093740766e-08, 'epoch': 0.98} + 98%|█████████▊| 5685/5772 [2:14:51<08:59, 6.20s/it] {'loss': 0.4605, 'learning_rate': 1.1916705093740766e-08, 'epoch': 0.98} + 98%|█████████▊| 5685/5772 [2:14:47<08:59, 6.20s/it] 99%|█████████▊| 5686/5772 [2:14:53<08:46, 6.13s/it] 99%|█████████▊| 5686/5772 [2:14:56<08:46, 6.13s/it] {'loss': 0.4527, 'learning_rate': 1.164438513234667e-08, 'epoch': 0.99} + {'loss': 0.4527, 'learning_rate': 1.164438513234667e-08, 'epoch': 0.99} 99%|█████████▊| 5686/5772 [2:14:56<08:46, 6.13s/it] + 99%|█████████▊| 5686/5772 [2:14:53<08:46, 6.13s/it] 99%|█████████▊| 5687/5772 [2:15:03<08:52, 6.26s/it] 99%|█████████▊| 5687/5772 [2:15:00<08:52, 6.26s/it] {'loss': 0.4723, 'learning_rate': 1.1375210948019188e-08, 'epoch': 0.99} + 99%|█████████▊| 5687/5772 [2:15:03<08:52, 6.26s/it] {'loss': 0.4723, 'learning_rate': 1.1375210948019188e-08, 'epoch': 0.99} + 99%|█████████▊| 5687/5772 [2:15:00<08:52, 6.26s/it] 99%|█████████▊| 5688/5772 [2:15:09<08:47, 6.28s/it] {'loss': 0.4574, 'learning_rate': 1.1109182625531622e-08, 'epoch': 0.99} + 99%|█████████▊| 5688/5772 [2:15:09<08:47, 6.28s/it] 99%|█████████▊| 5688/5772 [2:15:06<08:47, 6.28s/it] {'loss': 0.4574, 'learning_rate': 1.1109182625531622e-08, 'epoch': 0.99} + 99%|█████████▊| 5688/5772 [2:15:06<08:47, 6.28s/it] 99%|█████████▊| 5689/5772 [2:15:13<08:49, 6.38s/it] 99%|█████████▊| 5689/5772 [2:15:16<08:49, 6.38s/it] {'loss': 0.4398, 'learning_rate': 1.0846300248668063e-08, 'epoch': 0.99} + 99%|█████████▊| 5689/5772 [2:15:16<08:49, 6.38s/it] {'loss': 0.4398, 'learning_rate': 1.0846300248668063e-08, 'epoch': 0.99} + 99%|█████████▊| 5689/5772 [2:15:13<08:49, 6.38s/it] 99%|█████████▊| 5690/5772 [2:15:19<08:40, 6.34s/it] 99%|█████████▊| 5690/5772 [2:15:22<08:40, 6.34s/it] {'loss': 0.4609, 'learning_rate': 1.0586563900222279e-08, 'epoch': 0.99} + 99%|█████████▊| 5690/5772 [2:15:22<08:40, 6.34s/it] {'loss': 0.4609, 'learning_rate': 1.0586563900222279e-08, 'epoch': 0.99} + 99%|█████████▊| 5690/5772 [2:15:19<08:40, 6.34s/it] 99%|█████████▊| 5691/5772 [2:15:25<08:27, 6.27s/it] 99%|█████████▊| 5691/5772 [2:15:28<08:27, 6.27s/it] {'loss': 0.4525, 'learning_rate': 1.0329973661996617e-08, 'epoch': 0.99} + 99%|█████████▊| 5691/5772 [2:15:28<08:27, 6.27s/it] {'loss': 0.4525, 'learning_rate': 1.0329973661996617e-08, 'epoch': 0.99} + 99%|█████████▊| 5691/5772 [2:15:25<08:27, 6.27s/it] 99%|█████████▊| 5692/5772 [2:15:31<08:17, 6.22s/it] 99%|█████████▊| 5692/5772 [2:15:34<08:17, 6.22s/it] {'loss': 0.4581, 'learning_rate': 1.0076529614804209e-08, 'epoch': 0.99} + 99%|█████████▊| 5692/5772 [2:15:34<08:17, 6.22s/it] {'loss': 0.4581, 'learning_rate': 1.0076529614804209e-08, 'epoch': 0.99} + 99%|█████████▊| 5692/5772 [2:15:31<08:17, 6.22s/it] 99%|█████████▊| 5693/5772 [2:15:37<08:03, 6.12s/it] 99%|█████████▊| 5693/5772 [2:15:40<08:03, 6.12s/it] {'loss': 0.437, 'learning_rate': 9.82623183846343e-09, 'epoch': 0.99} + 99%|█████████▊| 5693/5772 [2:15:40<08:03, 6.12s/it] {'loss': 0.437, 'learning_rate': 9.82623183846343e-09, 'epoch': 0.99} + 99%|█████████▊| 5693/5772 [2:15:37<08:03, 6.12s/it] 99%|█████████▊| 5694/5772 [2:15:43<07:58, 6.13s/it] 99%|█████████▊| 5694/5772 [2:15:46<07:58, 6.13s/it] {'loss': 0.4612, 'learning_rate': 9.579080411805664e-09, 'epoch': 0.99} + 99%|█████████▊| 5694/5772 [2:15:46<07:58, 6.13s/it] {'loss': 0.4612, 'learning_rate': 9.579080411805664e-09, 'epoch': 0.99} + 99%|█████████▊| 5694/5772 [2:15:43<07:58, 6.13s/it] 99%|█████████▊| 5695/5772 [2:15:50<07:55, 6.18s/it] 99%|█████████▊| 5695/5772 [2:15:53<07:55, 6.18s/it] {'loss': 0.462, 'learning_rate': 9.335075412669758e-09, 'epoch': 0.99} + 99%|█████████▊| 5695/5772 [2:15:53<07:55, 6.18s/it] {'loss': 0.462, 'learning_rate': 9.335075412669758e-09, 'epoch': 0.99} + 99%|█████████▊| 5695/5772 [2:15:50<07:55, 6.18s/it] 99%|█████████▊| 5696/5772 [2:15:56<07:48, 6.16s/it] 99%|█████████▊| 5696/5772 [2:15:59<07:48, 6.16s/it] {'loss': 0.4575, 'learning_rate': 9.094216917903131e-09, 'epoch': 0.99} + 99%|█████████▊| 5696/5772 [2:15:59<07:48, 6.16s/it] {'loss': 0.4575, 'learning_rate': 9.094216917903131e-09, 'epoch': 0.99} + 99%|█████████▊| 5696/5772 [2:15:56<07:48, 6.16s/it] 99%|█████████▊| 5697/5772 [2:16:02<07:39, 6.13s/it] 99%|█████████▊| 5697/5772 [2:16:05<07:39, 6.13s/it] {'loss': 0.4613, 'learning_rate': 8.85650500336288e-09, 'epoch': 0.99} + 99%|█████████▊| 5697/5772 [2:16:05<07:39, 6.13s/it] {'loss': 0.4613, 'learning_rate': 8.85650500336288e-09, 'epoch': 0.99} + 99%|█████████▊| 5697/5772 [2:16:02<07:39, 6.13s/it] 99%|█████████▊| 5698/5772 [2:16:11<07:29, 6.07s/it] 99%|█████████▊| 5698/5772 [2:16:08<07:29, 6.07s/it] {'loss': 0.4579, 'learning_rate': 8.621939743916896e-09, 'epoch': 0.99} + 99%|█████████▊| 5698/5772 [2:16:11<07:29, 6.07s/it] {'loss': 0.4579, 'learning_rate': 8.621939743916896e-09, 'epoch': 0.99} + 99%|█████████▊| 5698/5772 [2:16:08<07:29, 6.07s/it] 99%|█████████▊| 5699/5772 [2:16:14<07:30, 6.17s/it] 99%|█████████▊| 5699/5772 [2:16:17<07:30, 6.17s/it] {'loss': 0.4488, 'learning_rate': 8.390521213437197e-09, 'epoch': 0.99} + 99%|█████████▊| 5699/5772 [2:16:17<07:30, 6.17s/it] {'loss': 0.4488, 'learning_rate': 8.390521213437197e-09, 'epoch': 0.99} + 99%|█████████▊| 5699/5772 [2:16:14<07:30, 6.17s/it]2 AutoResumeHook: Checking whether to suspend... +73 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +139 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... + 8 AutoResumeHook: Checking whether to suspend... +6AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 5700/5772 [2:16:23<07:23, 6.17s/it] AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 5700/5772 [2:16:20<07:23, 6.17s/it]5 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4607, 'learning_rate': 8.162249484809926e-09, 'epoch': 0.99} + 99%|█████████▉| 5700/5772 [2:16:23<07:23, 6.17s/it] {'loss': 0.4607, 'learning_rate': 8.162249484809926e-09, 'epoch': 0.99} + 99%|█████████▉| 5700/5772 [2:16:20<07:23, 6.17s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/tmp-checkpoint-5700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( + 99%|█████████▉| 5701/5772 [2:16:42<12:36, 10.65s/it] 99%|█████████▉| 5701/5772 [2:16:45<12:36, 10.65s/it] {'loss': 0.4537, 'learning_rate': 7.937124629927573e-09, 'epoch': 0.99} + {'loss': 0.4537, 'learning_rate': 7.937124629927573e-09, 'epoch': 0.99} + 99%|█████████▉| 5701/5772 [2:16:45<12:36, 10.65s/it] 99%|█████████▉| 5701/5772 [2:16:42<12:36, 10.65s/it] 99%|█████████▉| 5702/5772 [2:16:51<10:57, 9.39s/it] 99%|█████████▉| 5702/5772 [2:16:48<10:57, 9.39s/it] {'loss': 0.4603, 'learning_rate': 7.715146719691202e-09, 'epoch': 0.99} + 99%|█████████▉| 5702/5772 [2:16:51<10:57, 9.39s/it] {'loss': 0.4603, 'learning_rate': 7.715146719691202e-09, 'epoch': 0.99} + 99%|█████████▉| 5702/5772 [2:16:48<10:57, 9.39s/it] 99%|█████████▉| 5703/5772 [2:16:58<09:49, 8.55s/it] 99%|█████████▉| 5703/5772 [2:16:55<09:50, 8.55s/it] {'loss': 0.4492, 'learning_rate': 7.496315824012667e-09, 'epoch': 0.99} + 99%|█████████▉| 5703/5772 [2:16:58<09:49, 8.55s/it] {'loss': 0.4492, 'learning_rate': 7.496315824012667e-09, 'epoch': 0.99} + 99%|█████████▉| 5703/5772 [2:16:55<09:50, 8.55s/it] 99%|█████████▉| 5704/5772 [2:17:04<08:52, 7.83s/it] 99%|█████████▉| 5704/5772 [2:17:01<08:52, 7.83s/it] {'loss': 0.4596, 'learning_rate': 7.280632011810174e-09, 'epoch': 0.99} + 99%|█████████▉| 5704/5772 [2:17:01<08:52, 7.83s/it]{'loss': 0.4596, 'learning_rate': 7.280632011810174e-09, 'epoch': 0.99} + 99%|█████████▉| 5704/5772 [2:17:04<08:52, 7.83s/it] 99%|█████████▉| 5705/5772 [2:17:10<08:04, 7.22s/it] 99%|█████████▉| 5705/5772 [2:17:07<08:04, 7.22s/it] {'loss': 0.468, 'learning_rate': 7.068095351013826e-09, 'epoch': 0.99} + 99%|█████████▉| 5705/5772 [2:17:10<08:04, 7.22s/it] {'loss': 0.468, 'learning_rate': 7.068095351013826e-09, 'epoch': 0.99} + 99%|█████████▉| 5705/5772 [2:17:07<08:04, 7.22s/it] 99%|█████████▉| 5706/5772 [2:17:16<07:33, 6.87s/it] 99%|█████████▉| 5706/5772 [2:17:13<07:33, 6.87s/it] {'loss': 0.4609, 'learning_rate': 6.858705908560081e-09, 'epoch': 0.99} + 99%|█████████▉| 5706/5772 [2:17:16<07:33, 6.87s/it] {'loss': 0.4609, 'learning_rate': 6.858705908560081e-09, 'epoch': 0.99} + 99%|█████████▉| 5706/5772 [2:17:13<07:33, 6.87s/it] 99%|█████████▉| 5707/5772 [2:17:22<07:10, 6.63s/it] 99%|█████████▉| 5707/5772 [2:17:19<07:10, 6.63s/it] {'loss': 0.4503, 'learning_rate': 6.6524637503939675e-09, 'epoch': 0.99} + 99%|█████████▉| 5707/5772 [2:17:22<07:10, 6.63s/it] {'loss': 0.4503, 'learning_rate': 6.6524637503939675e-09, 'epoch': 0.99} + 99%|█████████▉| 5707/5772 [2:17:19<07:10, 6.63s/it] 99%|█████████▉| 5708/5772 [2:17:28<06:54, 6.48s/it] 99%|█████████▉| 5708/5772 [2:17:25<06:54, 6.48s/it] {'loss': 0.4484, 'learning_rate': 6.449368941471301e-09, 'epoch': 0.99} + 99%|█████████▉| 5708/5772 [2:17:28<06:54, 6.48s/it] {'loss': 0.4484, 'learning_rate': 6.449368941471301e-09, 'epoch': 0.99} + 99%|█████████▉| 5708/5772 [2:17:25<06:54, 6.48s/it] 99%|█████████▉| 5709/5772 [2:17:34<06:37, 6.31s/it] 99%|█████████▉| 5709/5772 [2:17:31<06:37, 6.31s/it] {'loss': 0.4571, 'learning_rate': 6.249421545755363e-09, 'epoch': 0.99} + 99%|█████████▉| 5709/5772 [2:17:34<06:37, 6.31s/it] {'loss': 0.4571, 'learning_rate': 6.249421545755363e-09, 'epoch': 0.99} + 99%|█████████▉| 5709/5772 [2:17:31<06:37, 6.31s/it] 99%|█████████▉| 5710/5772 [2:17:40<06:30, 6.30s/it] 99%|█████████▉| 5710/5772 [2:17:37<06:30, 6.30s/it]{'loss': 0.4479, 'learning_rate': 6.052621626219113e-09, 'epoch': 0.99} + 99%|█████████▉| 5710/5772 [2:17:40<06:30, 6.30s/it] {'loss': 0.4479, 'learning_rate': 6.052621626219113e-09, 'epoch': 0.99} + 99%|█████████▉| 5710/5772 [2:17:37<06:30, 6.30s/it] 99%|█████████▉| 5711/5772 [2:17:46<06:17, 6.18s/it] 99%|█████████▉| 5711/5772 [2:17:43<06:17, 6.18s/it] {'loss': 0.4516, 'learning_rate': 5.858969244842971e-09, 'epoch': 0.99} +{'loss': 0.4516, 'learning_rate': 5.858969244842971e-09, 'epoch': 0.99} + 99%|█████████▉| 5711/5772 [2:17:43<06:17, 6.18s/it] 99%|█████████▉| 5711/5772 [2:17:46<06:17, 6.18s/it] 99%|█████████▉| 5712/5772 [2:17:52<06:03, 6.05s/it] 99%|█████████▉| 5712/5772 [2:17:49<06:03, 6.05s/it] {'loss': 0.4506, 'learning_rate': 5.66846446261704e-09, 'epoch': 0.99} + 99%|█████████▉| 5712/5772 [2:17:52<06:03, 6.05s/it] {'loss': 0.4506, 'learning_rate': 5.66846446261704e-09, 'epoch': 0.99} + 99%|█████████▉| 5712/5772 [2:17:49<06:03, 6.05s/it] 99%|█████████▉| 5713/5772 [2:17:58<05:54, 6.02s/it] 99%|█████████▉| 5713/5772 [2:17:55<05:54, 6.02s/it] {'loss': 0.4515, 'learning_rate': 5.4811073395388824e-09, 'epoch': 0.99} + 99%|█████████▉| 5713/5772 [2:17:58<05:54, 6.02s/it] {'loss': 0.4515, 'learning_rate': 5.4811073395388824e-09, 'epoch': 0.99} + 99%|█████████▉| 5713/5772 [2:17:55<05:54, 6.02s/it] 99%|█████████▉| 5714/5772 [2:18:04<05:55, 6.12s/it] 99%|█████████▉| 5714/5772 [2:18:01<05:55, 6.12s/it] {'loss': 0.4621, 'learning_rate': 5.296897934616852e-09, 'epoch': 0.99} + 99%|█████████▉| 5714/5772 [2:18:01<05:55, 6.12s/it] {'loss': 0.4621, 'learning_rate': 5.296897934616852e-09, 'epoch': 0.99} + 99%|█████████▉| 5714/5772 [2:18:04<05:55, 6.12s/it] 99%|█████████▉| 5715/5772 [2:18:10<05:47, 6.10s/it] 99%|█████████▉| 5715/5772 [2:18:07<05:47, 6.10s/it] {'loss': 0.4417, 'learning_rate': 5.115836305865651e-09, 'epoch': 0.99} + 99%|█████████▉| 5715/5772 [2:18:10<05:47, 6.10s/it] {'loss': 0.4417, 'learning_rate': 5.115836305865651e-09, 'epoch': 0.99} + 99%|█████████▉| 5715/5772 [2:18:07<05:47, 6.10s/it] 99%|█████████▉| 5716/5772 [2:18:16<05:39, 6.06s/it] 99%|█████████▉| 5716/5772 [2:18:13<05:39, 6.06s/it] {'loss': 0.4559, 'learning_rate': 4.937922510310778e-09, 'epoch': 0.99} + 99%|█████████▉| 5716/5772 [2:18:16<05:39, 6.06s/it] {'loss': 0.4559, 'learning_rate': 4.937922510310778e-09, 'epoch': 0.99} + 99%|█████████▉| 5716/5772 [2:18:13<05:39, 6.06s/it] 99%|█████████▉| 5717/5772 [2:18:22<05:29, 5.99s/it] 99%|█████████▉| 5717/5772 [2:18:19<05:29, 5.99s/it] {'loss': 0.4506, 'learning_rate': 4.763156603984076e-09, 'epoch': 0.99} + 99%|█████████▉| 5717/5772 [2:18:22<05:29, 5.99s/it] {'loss': 0.4506, 'learning_rate': 4.763156603984076e-09, 'epoch': 0.99} + 99%|█████████▉| 5717/5772 [2:18:19<05:29, 5.99s/it] 99%|█████████▉| 5718/5772 [2:18:28<05:34, 6.19s/it] 99%|█████████▉| 5718/5772 [2:18:25<05:34, 6.19s/it] {'loss': 0.4551, 'learning_rate': 4.591538641927074e-09, 'epoch': 0.99} + 99%|█████████▉| 5718/5772 [2:18:28<05:34, 6.19s/it] {'loss': 0.4551, 'learning_rate': 4.591538641927074e-09, 'epoch': 0.99} + 99%|█████████▉| 5718/5772 [2:18:25<05:34, 6.19s/it] 99%|█████████▉| 5719/5772 [2:18:35<05:28, 6.20s/it] 99%|█████████▉| 5719/5772 [2:18:32<05:28, 6.20s/it] {'loss': 0.4479, 'learning_rate': 4.423068678189868e-09, 'epoch': 0.99} + 99%|█████████▉| 5719/5772 [2:18:35<05:28, 6.20s/it] {'loss': 0.4479, 'learning_rate': 4.423068678189868e-09, 'epoch': 0.99} + 99%|█████████▉| 5719/5772 [2:18:32<05:28, 6.20s/it] 99%|█████████▉| 5720/5772 [2:18:41<05:22, 6.19s/it] 99%|█████████▉| 5720/5772 [2:18:38<05:22, 6.19s/it] {'loss': 0.4606, 'learning_rate': 4.257746765832238e-09, 'epoch': 0.99} + 99%|█████████▉| 5720/5772 [2:18:38<05:22, 6.19s/it]{'loss': 0.4606, 'learning_rate': 4.257746765832238e-09, 'epoch': 0.99} + 99%|█████████▉| 5720/5772 [2:18:41<05:22, 6.19s/it] 99%|█████████▉| 5721/5772 [2:18:47<05:13, 6.15s/it] 99%|█████████▉| 5721/5772 [2:18:44<05:13, 6.15s/it] {'loss': 0.4519, 'learning_rate': 4.095572956921423e-09, 'epoch': 0.99} + 99%|█████████▉| 5721/5772 [2:18:47<05:13, 6.15s/it] {'loss': 0.4519, 'learning_rate': 4.095572956921423e-09, 'epoch': 0.99} + 99%|█████████▉| 5721/5772 [2:18:44<05:13, 6.15s/it] 99%|█████████▉| 5722/5772 [2:18:53<05:07, 6.14s/it] 99%|█████████▉| 5722/5772 [2:18:50<05:07, 6.14s/it] {'loss': 0.4559, 'learning_rate': 3.9365473025321235e-09, 'epoch': 0.99} + 99%|█████████▉| 5722/5772 [2:18:53<05:07, 6.14s/it] {'loss': 0.4559, 'learning_rate': 3.9365473025321235e-09, 'epoch': 0.99} + 99%|█████████▉| 5722/5772 [2:18:50<05:07, 6.14s/it] 99%|█████████▉| 5723/5772 [2:18:59<04:59, 6.12s/it] 99%|█████████▉| 5723/5772 [2:18:56<04:59, 6.12s/it] {'loss': 0.4548, 'learning_rate': 3.780669852747609e-09, 'epoch': 0.99} + 99%|█████████▉| 5723/5772 [2:18:59<04:59, 6.12s/it] {'loss': 0.4548, 'learning_rate': 3.780669852747609e-09, 'epoch': 0.99} + 99%|█████████▉| 5723/5772 [2:18:56<04:59, 6.12s/it] 99%|█████████▉| 5724/5772 [2:19:05<04:53, 6.11s/it] 99%|█████████▉| 5724/5772 [2:19:02<04:53, 6.11s/it] {'loss': 0.4606, 'learning_rate': 3.6279406566630536e-09, 'epoch': 0.99} + 99%|█████████▉| 5724/5772 [2:19:05<04:53, 6.11s/it] {'loss': 0.4606, 'learning_rate': 3.6279406566630536e-09, 'epoch': 0.99} + 99%|█████████▉| 5724/5772 [2:19:02<04:53, 6.11s/it] 99%|█████████▉| 5725/5772 [2:19:11<04:44, 6.06s/it] 99%|█████████▉| 5725/5772 [2:19:08<04:44, 6.06s/it] {'loss': 0.4356, 'learning_rate': 3.478359762378869e-09, 'epoch': 0.99} + 99%|█████████▉| 5725/5772 [2:19:11<04:44, 6.06s/it] {'loss': 0.4356, 'learning_rate': 3.478359762378869e-09, 'epoch': 0.99} + 99%|█████████▉| 5725/5772 [2:19:08<04:44, 6.06s/it] 99%|█████████▉| 5726/5772 [2:19:17<04:37, 6.04s/it] 99%|█████████▉| 5726/5772 [2:19:14<04:37, 6.04s/it] {'loss': 0.4643, 'learning_rate': 3.3319272170040384e-09, 'epoch': 0.99} + 99%|█████████▉| 5726/5772 [2:19:17<04:37, 6.04s/it] {'loss': 0.4643, 'learning_rate': 3.3319272170040384e-09, 'epoch': 0.99} + 99%|█████████▉| 5726/5772 [2:19:14<04:37, 6.04s/it] 99%|█████████▉| 5727/5772 [2:19:23<04:30, 6.00s/it] 99%|█████████▉| 5727/5772 [2:19:20<04:30, 6.00s/it] {'loss': 0.4449, 'learning_rate': 3.1886430666561163e-09, 'epoch': 0.99} + 99%|█████████▉| 5727/5772 [2:19:23<04:30, 6.00s/it] {'loss': 0.4449, 'learning_rate': 3.1886430666561163e-09, 'epoch': 0.99} + 99%|█████████▉| 5727/5772 [2:19:20<04:30, 6.00s/it] 99%|█████████▉| 5728/5772 [2:19:29<04:28, 6.10s/it] 99%|█████████▉| 5728/5772 [2:19:26<04:28, 6.10s/it] {'loss': 0.4708, 'learning_rate': 3.048507356463448e-09, 'epoch': 0.99} + 99%|█████████▉| 5728/5772 [2:19:29<04:28, 6.10s/it] {'loss': 0.4708, 'learning_rate': 3.048507356463448e-09, 'epoch': 0.99} + 99%|█████████▉| 5728/5772 [2:19:26<04:28, 6.10s/it] 99%|█████████▉| 5729/5772 [2:19:35<04:21, 6.09s/it] 99%|█████████▉| 5729/5772 [2:19:32<04:21, 6.09s/it] {'loss': 0.4509, 'learning_rate': 2.91152013056073e-09, 'epoch': 0.99} + 99%|█████████▉| 5729/5772 [2:19:35<04:21, 6.09s/it] {'loss': 0.4509, 'learning_rate': 2.91152013056073e-09, 'epoch': 0.99} + 99%|█████████▉| 5729/5772 [2:19:32<04:21, 6.09s/it] 99%|█████████▉| 5730/5772 [2:19:42<04:18, 6.16s/it] 99%|█████████▉| 5730/5772 [2:19:39<04:18, 6.16s/it]{'loss': 0.4714, 'learning_rate': 2.777681432090118e-09, 'epoch': 0.99} + {'loss': 0.4714, 'learning_rate': 2.777681432090118e-09, 'epoch': 0.99} + 99%|█████████▉| 5730/5772 [2:19:42<04:18, 6.16s/it] 99%|█████████▉| 5730/5772 [2:19:39<04:18, 6.16s/it] 99%|█████████▉| 5731/5772 [2:19:48<04:16, 6.25s/it] 99%|█████████▉| 5731/5772 [2:19:45<04:16, 6.25s/it] {'loss': 0.4528, 'learning_rate': 2.64699130320345e-09, 'epoch': 0.99} + 99%|█████████▉| 5731/5772 [2:19:48<04:16, 6.25s/it] {'loss': 0.4528, 'learning_rate': 2.64699130320345e-09, 'epoch': 0.99} + 99%|█████████▉| 5731/5772 [2:19:45<04:16, 6.25s/it] 99%|█████████▉| 5732/5772 [2:19:54<04:05, 6.15s/it] 99%|█████████▉| 5732/5772 [2:19:51<04:05, 6.15s/it] {'loss': 0.4606, 'learning_rate': 2.5194497850622447e-09, 'epoch': 0.99} + 99%|█████████▉| 5732/5772 [2:19:54<04:05, 6.15s/it] {'loss': 0.4606, 'learning_rate': 2.5194497850622447e-09, 'epoch': 0.99} + 99%|█████████▉| 5732/5772 [2:19:51<04:05, 6.15s/it] 99%|█████████▉| 5733/5772 [2:20:00<03:57, 6.08s/it] 99%|█████████▉| 5733/5772 [2:19:57<03:57, 6.08s/it] {'loss': 0.4521, 'learning_rate': 2.3950569178332605e-09, 'epoch': 0.99} + 99%|█████████▉| 5733/5772 [2:20:00<03:57, 6.08s/it] {'loss': 0.4521, 'learning_rate': 2.3950569178332605e-09, 'epoch': 0.99} + 99%|█████████▉| 5733/5772 [2:19:57<03:57, 6.08s/it] 99%|█████████▉| 5734/5772 [2:20:06<03:53, 6.13s/it] 99%|█████████▉| 5734/5772 [2:20:03<03:53, 6.13s/it] {'loss': 0.4608, 'learning_rate': 2.2738127406951583e-09, 'epoch': 0.99} + 99%|█████████▉| 5734/5772 [2:20:06<03:53, 6.13s/it] {'loss': 0.4608, 'learning_rate': 2.2738127406951583e-09, 'epoch': 0.99} + 99%|█████████▉| 5734/5772 [2:20:03<03:53, 6.13s/it] 99%|█████████▉| 5735/5772 [2:20:13<03:48, 6.19s/it] 99%|█████████▉| 5735/5772 [2:20:10<03:48, 6.19s/it] {'loss': 0.4646, 'learning_rate': 2.155717291830728e-09, 'epoch': 0.99} + 99%|█████████▉| 5735/5772 [2:20:13<03:48, 6.19s/it] {'loss': 0.4646, 'learning_rate': 2.155717291830728e-09, 'epoch': 0.99} + 99%|█████████▉| 5735/5772 [2:20:10<03:48, 6.19s/it] 99%|█████████▉| 5736/5772 [2:20:18<03:39, 6.10s/it] 99%|█████████▉| 5736/5772 [2:20:15<03:39, 6.10s/it] {'loss': 0.4617, 'learning_rate': 2.0407706084368816e-09, 'epoch': 0.99} + 99%|█████████▉| 5736/5772 [2:20:18<03:39, 6.10s/it] {'loss': 0.4617, 'learning_rate': 2.0407706084368816e-09, 'epoch': 0.99} + 99%|█████████▉| 5736/5772 [2:20:15<03:39, 6.10s/it] 99%|█████████▉| 5737/5772 [2:20:25<03:33, 6.09s/it] 99%|█████████▉| 5737/5772 [2:20:22<03:33, 6.09s/it] {'loss': 0.4423, 'learning_rate': 1.9289727267124416e-09, 'epoch': 0.99} + 99%|█████████▉| 5737/5772 [2:20:25<03:33, 6.09s/it] {'loss': 0.4423, 'learning_rate': 1.9289727267124416e-09, 'epoch': 0.99} + 99%|█████████▉| 5737/5772 [2:20:22<03:33, 6.09s/it] 99%|█████████▉| 5738/5772 [2:20:31<03:30, 6.18s/it] 99%|█████████▉| 5738/5772 [2:20:28<03:30, 6.18s/it] {'loss': 0.4593, 'learning_rate': 1.8203236818681302e-09, 'epoch': 0.99} + 99%|█████████▉| 5738/5772 [2:20:31<03:30, 6.18s/it] {'loss': 0.4593, 'learning_rate': 1.8203236818681302e-09, 'epoch': 0.99} + 99%|█████████▉| 5738/5772 [2:20:28<03:30, 6.18s/it] 99%|█████████▉| 5739/5772 [2:20:34<03:27, 6.28s/it] {'loss': 0.451, 'learning_rate': 1.7148235081232424e-09, 'epoch': 0.99} + 99%|█████████▉| 5739/5772 [2:20:34<03:27, 6.28s/it] 99%|█████████▉| 5739/5772 [2:20:37<03:27, 6.28s/it] {'loss': 0.451, 'learning_rate': 1.7148235081232424e-09, 'epoch': 0.99} + 99%|█████████▉| 5739/5772 [2:20:37<03:27, 6.28s/it] 99%|█████████▉| 5740/5772 [2:20:43<03:16, 6.15s/it] 99%|█████████▉| 5740/5772 [2:20:40<03:16, 6.15s/it] {'loss': 0.4544, 'learning_rate': 1.6124722387034219e-09, 'epoch': 0.99} + 99%|█████████▉| 5740/5772 [2:20:43<03:16, 6.15s/it] {'loss': 0.4544, 'learning_rate': 1.6124722387034219e-09, 'epoch': 0.99} + 99%|█████████▉| 5740/5772 [2:20:40<03:16, 6.15s/it] 99%|█████████▉| 5741/5772 [2:20:49<03:10, 6.15s/it] 99%|█████████▉| 5741/5772 [2:20:46<03:10, 6.15s/it] {'loss': 0.4566, 'learning_rate': 1.513269905845105e-09, 'epoch': 0.99} + 99%|█████████▉| 5741/5772 [2:20:49<03:10, 6.15s/it] {'loss': 0.4566, 'learning_rate': 1.513269905845105e-09, 'epoch': 0.99} + 99%|█████████▉| 5741/5772 [2:20:46<03:10, 6.15s/it] 99%|█████████▉| 5742/5772 [2:20:56<03:03, 6.13s/it] 99%|█████████▉| 5742/5772 [2:20:53<03:03, 6.13s/it] {'loss': 0.4664, 'learning_rate': 1.4172165407899675e-09, 'epoch': 0.99} + 99%|█████████▉| 5742/5772 [2:20:56<03:03, 6.13s/it] {'loss': 0.4664, 'learning_rate': 1.4172165407899675e-09, 'epoch': 0.99} + 99%|█████████▉| 5742/5772 [2:20:53<03:03, 6.13s/it] 99%|█████████▉| 5743/5772 [2:21:01<02:54, 6.02s/it] 99%|█████████▉| 5743/5772 [2:20:58<02:54, 6.02s/it] {'loss': 0.4471, 'learning_rate': 1.3243121737904763e-09, 'epoch': 0.99} + 99%|█████████▉| 5743/5772 [2:21:01<02:54, 6.02s/it] {'loss': 0.4471, 'learning_rate': 1.3243121737904763e-09, 'epoch': 0.99} + 99%|█████████▉| 5743/5772 [2:20:58<02:54, 6.02s/it] 100%|█████████▉| 5744/5772 [2:21:07<02:47, 5.99s/it] 100%|█████████▉| 5744/5772 [2:21:04<02:47, 5.99s/it] {'loss': 0.4498, 'learning_rate': 1.2345568341065595e-09, 'epoch': 1.0} + 100%|█████████▉| 5744/5772 [2:21:07<02:47, 5.99s/it] {'loss': 0.4498, 'learning_rate': 1.2345568341065595e-09, 'epoch': 1.0} + 100%|█████████▉| 5744/5772 [2:21:04<02:47, 5.99s/it] 100%|█████████▉| 5745/5772 [2:21:14<02:45, 6.15s/it] 100%|█████████▉| 5745/5772 [2:21:11<02:45, 6.15s/it] {'loss': 0.4469, 'learning_rate': 1.1479505500044952e-09, 'epoch': 1.0} + 100%|█████████▉| 5745/5772 [2:21:14<02:45, 6.15s/it] {'loss': 0.4469, 'learning_rate': 1.1479505500044952e-09, 'epoch': 1.0} + 100%|█████████▉| 5745/5772 [2:21:11<02:45, 6.15s/it] 100%|█████████▉| 5746/5772 [2:21:20<02:40, 6.19s/it] {'loss': 0.4601, 'learning_rate': 1.064493348762463e-09, 'epoch': 1.0} + 100%|█████████▉| 5746/5772 [2:21:17<02:40, 6.19s/it] 100%|█████████▉| 5746/5772 [2:21:20<02:40, 6.19s/it]{'loss': 0.4601, 'learning_rate': 1.064493348762463e-09, 'epoch': 1.0} + 100%|█████████▉| 5746/5772 [2:21:17<02:40, 6.19s/it] 100%|█████████▉| 5747/5772 [2:21:26<02:32, 6.09s/it] 100%|█████████▉| 5747/5772 [2:21:23<02:32, 6.09s/it] {'loss': 0.4555, 'learning_rate': 9.841852566638833e-10, 'epoch': 1.0} + 100%|█████████▉| 5747/5772 [2:21:26<02:32, 6.09s/it] {'loss': 0.4555, 'learning_rate': 9.841852566638833e-10, 'epoch': 1.0} + 100%|█████████▉| 5747/5772 [2:21:23<02:32, 6.09s/it] 100%|█████████▉| 5748/5772 [2:21:29<02:29, 6.24s/it] 100%|█████████▉| 5748/5772 [2:21:32<02:29, 6.24s/it]{'loss': 0.4535, 'learning_rate': 9.070262990007462e-10, 'epoch': 1.0} + {'loss': 0.4535, 'learning_rate': 9.070262990007462e-10, 'epoch': 1.0} + 100%|█████████▉| 5748/5772 [2:21:32<02:29, 6.24s/it] 100%|█████████▉| 5748/5772 [2:21:29<02:29, 6.24s/it] 100%|█████████▉| 5749/5772 [2:21:39<02:24, 6.27s/it] 100%|█████████▉| 5749/5772 [2:21:36<02:24, 6.27s/it] {'loss': 0.4618, 'learning_rate': 8.330165000758339e-10, 'epoch': 1.0} + 100%|█████████▉| 5749/5772 [2:21:39<02:24, 6.27s/it] {'loss': 0.4618, 'learning_rate': 8.330165000758339e-10, 'epoch': 1.0} + 100%|█████████▉| 5749/5772 [2:21:36<02:24, 6.27s/it]14 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 100%|█████████▉| 5750/5772 [2:21:45<02:19, 6.34s/it]6 AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 100%|█████████▉| 5750/5772 [2:21:42<02:19, 6.34s/it] {'loss': 0.4579, 'learning_rate': 7.621558831949482e-10, 'epoch': 1.0} + 100%|█████████▉| 5750/5772 [2:21:45<02:19, 6.34s/it] {'loss': 0.4579, 'learning_rate': 7.621558831949482e-10, 'epoch': 1.0} + 100%|█████████▉| 5750/5772 [2:21:42<02:19, 6.34s/it] 100%|█████████▉| 5751/5772 [2:21:51<02:11, 6.26s/it] 100%|█████████▉| 5751/5772 [2:21:48<02:11, 6.26s/it]{'loss': 0.4425, 'learning_rate': 6.944444706791231e-10, 'epoch': 1.0} + {'loss': 0.4425, 'learning_rate': 6.944444706791231e-10, 'epoch': 1.0} + 100%|█████████▉| 5751/5772 [2:21:51<02:11, 6.26s/it] 100%|█████████▉| 5751/5772 [2:21:48<02:11, 6.26s/it] 100%|█████████▉| 5752/5772 [2:21:58<02:04, 6.22s/it] 100%|█████████▉| 5752/5772 [2:21:54<02:04, 6.22s/it] {'loss': 0.4656, 'learning_rate': 6.298822838501917e-10, 'epoch': 1.0} + 100%|█████████▉| 5752/5772 [2:21:58<02:04, 6.22s/it] {'loss': 0.4656, 'learning_rate': 6.298822838501917e-10, 'epoch': 1.0} + 100%|█████████▉| 5752/5772 [2:21:54<02:04, 6.22s/it] 100%|█████████▉| 5753/5772 [2:22:04<01:58, 6.25s/it] 100%|█████████▉| 5753/5772 [2:22:01<01:58, 6.25s/it] {'loss': 0.4469, 'learning_rate': 5.684693430429988e-10, 'epoch': 1.0} + 100%|█████████▉| 5753/5772 [2:22:04<01:58, 6.25s/it] {'loss': 0.4469, 'learning_rate': 5.684693430429988e-10, 'epoch': 1.0} + 100%|█████████▉| 5753/5772 [2:22:01<01:58, 6.25s/it] 100%|█████████▉| 5754/5772 [2:22:10<01:52, 6.26s/it] {'loss': 0.4556, 'learning_rate': 5.102056675998501e-10, 'epoch': 1.0} + 100%|█████████▉| 5754/5772 [2:22:10<01:52, 6.26s/it] 100%|█████████▉| 5754/5772 [2:22:07<01:52, 6.26s/it] {'loss': 0.4556, 'learning_rate': 5.102056675998501e-10, 'epoch': 1.0} + 100%|█████████▉| 5754/5772 [2:22:07<01:52, 6.26s/it] 100%|█████████▉| 5755/5772 [2:22:16<01:45, 6.23s/it] 100%|█████████▉| 5755/5772 [2:22:13<01:45, 6.23s/it] {'loss': 0.4572, 'learning_rate': 4.550912758705117e-10, 'epoch': 1.0} + 100%|█████████▉| 5755/5772 [2:22:16<01:45, 6.23s/it] {'loss': 0.4572, 'learning_rate': 4.550912758705117e-10, 'epoch': 1.0} + 100%|█████████▉| 5755/5772 [2:22:13<01:45, 6.23s/it] 100%|█████████▉| 5756/5772 [2:22:22<01:37, 6.11s/it] 100%|█████████▉| 5756/5772 [2:22:19<01:37, 6.11s/it] {'loss': 0.4509, 'learning_rate': 4.0312618521221034e-10, 'epoch': 1.0} + 100%|█████████▉| 5756/5772 [2:22:22<01:37, 6.11s/it] {'loss': 0.4509, 'learning_rate': 4.0312618521221034e-10, 'epoch': 1.0} + 100%|█████████▉| 5756/5772 [2:22:19<01:37, 6.11s/it] 100%|█████████▉| 5757/5772 [2:22:28<01:30, 6.06s/it] 100%|█████████▉| 5757/5772 [2:22:25<01:30, 6.06s/it] {'loss': 0.457, 'learning_rate': 3.543104119907437e-10, 'epoch': 1.0} + 100%|█████████▉| 5757/5772 [2:22:28<01:30, 6.06s/it] {'loss': 0.457, 'learning_rate': 3.543104119907437e-10, 'epoch': 1.0} + 100%|█████████▉| 5757/5772 [2:22:25<01:30, 6.06s/it] 100%|█████████▉| 5758/5772 [2:22:34<01:24, 6.05s/it] 100%|█████████▉| 5758/5772 [2:22:31<01:24, 6.05s/it] {'loss': 0.4623, 'learning_rate': 3.086439715815903e-10, 'epoch': 1.0} + 100%|█████████▉| 5758/5772 [2:22:34<01:24, 6.05s/it] {'loss': 0.4623, 'learning_rate': 3.086439715815903e-10, 'epoch': 1.0} + 100%|█████████▉| 5758/5772 [2:22:31<01:24, 6.05s/it] 100%|█████████▉| 5759/5772 [2:22:40<01:18, 6.06s/it] 100%|█████████▉| 5759/5772 [2:22:37<01:18, 6.06s/it] {'loss': 0.4452, 'learning_rate': 2.6612687836657937e-10, 'epoch': 1.0} + 100%|█████████▉| 5759/5772 [2:22:40<01:18, 6.06s/it] {'loss': 0.4452, 'learning_rate': 2.6612687836657937e-10, 'epoch': 1.0} + 100%|█████████▉| 5759/5772 [2:22:37<01:18, 6.06s/it] 100%|█████████▉| 5760/5772 [2:22:46<01:13, 6.13s/it] 100%|█████████▉| 5760/5772 [2:22:43<01:13, 6.13s/it] {'loss': 0.4547, 'learning_rate': 2.2675914573611068e-10, 'epoch': 1.0} + 100%|█████████▉| 5760/5772 [2:22:46<01:13, 6.13s/it] {'loss': 0.4547, 'learning_rate': 2.2675914573611068e-10, 'epoch': 1.0} + 100%|█████████▉| 5760/5772 [2:22:43<01:13, 6.13s/it] 100%|█████████▉| 5761/5772 [2:22:52<01:06, 6.08s/it] 100%|█████████▉| 5761/5772 [2:22:49<01:06, 6.08s/it] {'loss': 0.4651, 'learning_rate': 1.9054078608804482e-10, 'epoch': 1.0} + 100%|█████████▉| 5761/5772 [2:22:52<01:06, 6.08s/it] {'loss': 0.4651, 'learning_rate': 1.9054078608804482e-10, 'epoch': 1.0} + 100%|█████████▉| 5761/5772 [2:22:49<01:06, 6.08s/it] 100%|█████████▉| 5762/5772 [2:22:59<01:02, 6.20s/it] 100%|█████████▉| 5762/5772 [2:22:56<01:02, 6.20s/it] {'loss': 0.4642, 'learning_rate': 1.5747181083103357e-10, 'epoch': 1.0} + 100%|█████████▉| 5762/5772 [2:22:59<01:02, 6.20s/it] {'loss': 0.4642, 'learning_rate': 1.5747181083103357e-10, 'epoch': 1.0} + 100%|█████████▉| 5762/5772 [2:22:56<01:02, 6.20s/it] 100%|█████████▉| 5763/5772 [2:23:05<00:55, 6.18s/it] 100%|█████████▉| 5763/5772 [2:23:02<00:55, 6.18s/it] {'loss': 0.4387, 'learning_rate': 1.2755223037896892e-10, 'epoch': 1.0} + 100%|█████████▉| 5763/5772 [2:23:05<00:55, 6.18s/it] {'loss': 0.4387, 'learning_rate': 1.2755223037896892e-10, 'epoch': 1.0} + 100%|█████████▉| 5763/5772 [2:23:02<00:55, 6.18s/it] 100%|█████████▉| 5764/5772 [2:23:11<00:48, 6.09s/it] 100%|█████████▉| 5764/5772 [2:23:08<00:48, 6.09s/it] {'loss': 0.4611, 'learning_rate': 1.0078205415431364e-10, 'epoch': 1.0} + 100%|█████████▉| 5764/5772 [2:23:11<00:48, 6.09s/it] {'loss': 0.4611, 'learning_rate': 1.0078205415431364e-10, 'epoch': 1.0} + 100%|█████████▉| 5764/5772 [2:23:08<00:48, 6.09s/it] 100%|█████████▉| 5765/5772 [2:23:17<00:43, 6.16s/it] 100%|█████████▉| 5765/5772 [2:23:14<00:43, 6.16s/it] {'loss': 0.4515, 'learning_rate': 7.716129058921162e-11, 'epoch': 1.0} + 100%|█████████▉| 5765/5772 [2:23:17<00:43, 6.16s/it] {'loss': 0.4515, 'learning_rate': 7.716129058921162e-11, 'epoch': 1.0} + 100%|█████████▉| 5765/5772 [2:23:14<00:43, 6.16s/it] 100%|█████████▉| 5766/5772 [2:23:23<00:36, 6.10s/it] 100%|█████████▉| 5766/5772 [2:23:20<00:36, 6.10s/it] {'loss': 0.4597, 'learning_rate': 5.668994712104692e-11, 'epoch': 1.0} + 100%|█████████▉| 5766/5772 [2:23:23<00:36, 6.10s/it] {'loss': 0.4597, 'learning_rate': 5.668994712104692e-11, 'epoch': 1.0} + 100%|█████████▉| 5766/5772 [2:23:20<00:36, 6.10s/it] 100%|█████████▉| 5767/5772 [2:23:30<00:30, 6.16s/it] 100%|█████████▉| 5767/5772 [2:23:26<00:30, 6.16s/it] {'loss': 0.4535, 'learning_rate': 3.936803020021529e-11, 'epoch': 1.0} + 100%|█████████▉| 5767/5772 [2:23:30<00:30, 6.16s/it] {'loss': 0.4535, 'learning_rate': 3.936803020021529e-11, 'epoch': 1.0} + 100%|█████████▉| 5767/5772 [2:23:26<00:30, 6.16s/it] 100%|█████████▉| 5768/5772 [2:23:36<00:24, 6.17s/it] 100%|█████████▉| 5768/5772 [2:23:33<00:24, 6.17s/it] {'loss': 0.4563, 'learning_rate': 2.5195545279022016e-11, 'epoch': 1.0} + 100%|█████████▉| 5768/5772 [2:23:36<00:24, 6.17s/it] {'loss': 0.4563, 'learning_rate': 2.5195545279022016e-11, 'epoch': 1.0} + 100%|█████████▉| 5768/5772 [2:23:33<00:24, 6.17s/it] 100%|█████████▉| 5769/5772 [2:23:42<00:18, 6.08s/it] 100%|█████████▉| 5769/5772 [2:23:39<00:18, 6.08s/it] {'loss': 0.436, 'learning_rate': 1.4172496823894322e-11, 'epoch': 1.0} + 100%|█████████▉| 5769/5772 [2:23:42<00:18, 6.08s/it] {'loss': 0.436, 'learning_rate': 1.4172496823894322e-11, 'epoch': 1.0} + 100%|█████████▉| 5769/5772 [2:23:39<00:18, 6.08s/it] 100%|█████████▉| 5770/5772 [2:23:48<00:12, 6.22s/it] 100%|█████████▉| 5770/5772 [2:23:45<00:12, 6.22s/it] {'loss': 0.4587, 'learning_rate': 6.298888303168938e-12, 'epoch': 1.0} + 100%|█████████▉| 5770/5772 [2:23:48<00:12, 6.22s/it] {'loss': 0.4587, 'learning_rate': 6.298888303168938e-12, 'epoch': 1.0} + 100%|█████████▉| 5770/5772 [2:23:45<00:12, 6.22s/it] 100%|█████████▉| 5771/5772 [2:23:54<00:06, 6.14s/it] 100%|█████████▉| 5771/5772 [2:23:51<00:06, 6.14s/it] {'loss': 0.4541, 'learning_rate': 1.574722200414769e-12, 'epoch': 1.0} + 100%|█████████▉| 5771/5772 [2:23:54<00:06, 6.14s/it] {'loss': 0.4541, 'learning_rate': 1.574722200414769e-12, 'epoch': 1.0} + 100%|█████████▉| 5771/5772 [2:23:51<00:06, 6.14s/it] 100%|██████████| 5772/5772 [2:24:00<00:00, 6.18s/it] 100%|██████████| 5772/5772 [2:23:57<00:00, 6.18s/it] {'loss': 0.4532, 'learning_rate': 0.0, 'epoch': 1.0} + 100%|██████████| 5772/5772 [2:24:00<00:00, 6.18s/it] {'loss': 0.4532, 'learning_rate': 0.0, 'epoch': 1.0} + 100%|██████████| 5772/5772 [2:23:57<00:00, 6.18s/it] {'train_runtime': 8641.6583, 'train_samples_per_second': 344.271, 'train_steps_per_second': 0.668, 'train_loss': 0.10881091421437776, 'epoch': 1.0} + 100%|██████████| 5772/5772 [2:24:01<00:00, 6.18s/it] {'train_runtime': 8641.6411, 'train_samples_per_second': 344.272, 'train_steps_per_second': 0.668, 'train_loss': 0.10881091421437776, 'epoch': 1.0} + 100%|██████████| 5772/5772 [2:23:58<00:00, 6.18s/it] 100%|██████████| 5772/5772 [2:23:58<00:00, 1.50s/it] + 100%|██████████| 5772/5772 [2:24:01<00:00, 1.50s/it] +saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/mm_projector +wandb: 🚀 View run vila_3b_oxe_sim_path_mask at: https://wandb.ai/memmelma/VILA/runs/4sab5m1l +wandb: Find logs at: ../../../../../../../../fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/wandb/run-20250410_123248-4sab5m1l/logs +srun: job 6724457 queued and waiting for resources +srun: job 6724457 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2008 +JobID: 6724457 | Full list: batch-block1-2008 batch-block1-0005 +NETWORK=Efficient-Large-Model/VILA1.5-3b +MASTER_ADDR=batch-block1-2008 +JobID: 6724457 | Full list: batch-block1-2008 batch-block1-0005 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 18:12:56,539] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,539] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,539] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,539] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,539] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,540] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,540] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,540] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:56,545] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:12:57,478] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:12:57,478] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 18:12:57,478] [INFO] [comm.py:594:init_distributed] cdb=None +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp trainingModels has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training + +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask. Skipp training diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f15c52b882f365766ea9416f3d1b4bf63a32c973 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,34662 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999133824166305, + "eval_steps": 500, + "global_step": 5772, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.1494252873563219e-07, + "loss": 0.8138, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.2988505747126437e-07, + "loss": 0.7974, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.8003, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 4.5977011494252875e-07, + "loss": 0.8151, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 5.747126436781609e-07, + "loss": 0.8152, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 6.896551724137931e-07, + "loss": 0.7962, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 8.045977011494253e-07, + "loss": 0.7987, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 9.195402298850575e-07, + "loss": 0.7904, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.8011, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 1.1494252873563219e-06, + "loss": 0.7733, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.2643678160919542e-06, + "loss": 0.7698, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.779, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 1.4942528735632185e-06, + "loss": 0.7727, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 1.6091954022988506e-06, + "loss": 0.7267, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.724137931034483e-06, + "loss": 0.7231, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 1.839080459770115e-06, + "loss": 0.727, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 1.9540229885057475e-06, + "loss": 0.728, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.6981, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 2.1839080459770117e-06, + "loss": 0.6842, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 2.2988505747126437e-06, + "loss": 0.6777, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.6727, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 2.5287356321839083e-06, + "loss": 0.6708, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 2.6436781609195404e-06, + "loss": 0.6726, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.665, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 2.8735632183908046e-06, + "loss": 0.6452, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 2.988505747126437e-06, + "loss": 0.6388, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 3.103448275862069e-06, + "loss": 0.6562, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 3.2183908045977012e-06, + "loss": 0.6516, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6408, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 3.448275862068966e-06, + "loss": 0.6408, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 3.563218390804598e-06, + "loss": 0.6422, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 3.67816091954023e-06, + "loss": 0.6256, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 3.793103448275862e-06, + "loss": 0.6399, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 3.908045977011495e-06, + "loss": 0.6133, + "step": 34 + }, + { + "epoch": 0.01, + "learning_rate": 4.022988505747127e-06, + "loss": 0.6151, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 4.137931034482759e-06, + "loss": 0.6182, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 4.252873563218391e-06, + "loss": 0.6266, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 4.367816091954023e-06, + "loss": 0.6073, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 4.482758620689656e-06, + "loss": 0.6164, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 4.5977011494252875e-06, + "loss": 0.6078, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 4.71264367816092e-06, + "loss": 0.6099, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.6025, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 4.942528735632184e-06, + "loss": 0.6016, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 5.057471264367817e-06, + "loss": 0.6129, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 5.172413793103449e-06, + "loss": 0.6055, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 5.287356321839081e-06, + "loss": 0.5933, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 5.402298850574713e-06, + "loss": 0.6011, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 5.517241379310345e-06, + "loss": 0.5938, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 5.6321839080459775e-06, + "loss": 0.5975, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 5.747126436781609e-06, + "loss": 0.5881, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 5.862068965517242e-06, + "loss": 0.5889, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 5.977011494252874e-06, + "loss": 0.5824, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 6.091954022988507e-06, + "loss": 0.5805, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 6.206896551724138e-06, + "loss": 0.5791, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 6.321839080459771e-06, + "loss": 0.5897, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 6.4367816091954025e-06, + "loss": 0.5856, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 6.551724137931035e-06, + "loss": 0.5927, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5984, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 6.781609195402299e-06, + "loss": 0.5773, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 6.896551724137932e-06, + "loss": 0.5679, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 7.011494252873564e-06, + "loss": 0.5864, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 7.126436781609196e-06, + "loss": 0.568, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 7.241379310344828e-06, + "loss": 0.5995, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 7.35632183908046e-06, + "loss": 0.5634, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 7.4712643678160925e-06, + "loss": 0.5783, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 7.586206896551724e-06, + "loss": 0.572, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 7.701149425287356e-06, + "loss": 0.5772, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 7.81609195402299e-06, + "loss": 0.5611, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 7.93103448275862e-06, + "loss": 0.5901, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 8.045977011494253e-06, + "loss": 0.5774, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 8.160919540229886e-06, + "loss": 0.5845, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 8.275862068965518e-06, + "loss": 0.5646, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 8.390804597701149e-06, + "loss": 0.5755, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 8.505747126436782e-06, + "loss": 0.5638, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 8.620689655172414e-06, + "loss": 0.5663, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 8.735632183908047e-06, + "loss": 0.5741, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 8.85057471264368e-06, + "loss": 0.5721, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 8.965517241379312e-06, + "loss": 0.5506, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 9.080459770114942e-06, + "loss": 0.5658, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 9.195402298850575e-06, + "loss": 0.5664, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 9.310344827586207e-06, + "loss": 0.5681, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 9.42528735632184e-06, + "loss": 0.5679, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 9.54022988505747e-06, + "loss": 0.571, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 9.655172413793105e-06, + "loss": 0.5493, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 9.770114942528738e-06, + "loss": 0.5607, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 9.885057471264368e-06, + "loss": 0.5613, + "step": 86 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "loss": 0.5624, + "step": 87 + }, + { + "epoch": 0.02, + "learning_rate": 1.0114942528735633e-05, + "loss": 0.5735, + "step": 88 + }, + { + "epoch": 0.02, + "learning_rate": 1.0229885057471264e-05, + "loss": 0.5745, + "step": 89 + }, + { + "epoch": 0.02, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.5663, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 1.0459770114942529e-05, + "loss": 0.5482, + "step": 91 + }, + { + "epoch": 0.02, + "learning_rate": 1.0574712643678162e-05, + "loss": 0.5711, + "step": 92 + }, + { + "epoch": 0.02, + "learning_rate": 1.0689655172413792e-05, + "loss": 0.56, + "step": 93 + }, + { + "epoch": 0.02, + "learning_rate": 1.0804597701149427e-05, + "loss": 0.5514, + "step": 94 + }, + { + "epoch": 0.02, + "learning_rate": 1.091954022988506e-05, + "loss": 0.5783, + "step": 95 + }, + { + "epoch": 0.02, + "learning_rate": 1.103448275862069e-05, + "loss": 0.5573, + "step": 96 + }, + { + "epoch": 0.02, + "learning_rate": 1.1149425287356324e-05, + "loss": 0.5631, + "step": 97 + }, + { + "epoch": 0.02, + "learning_rate": 1.1264367816091955e-05, + "loss": 0.5467, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 1.1379310344827587e-05, + "loss": 0.5686, + "step": 99 + }, + { + "epoch": 0.02, + "learning_rate": 1.1494252873563218e-05, + "loss": 0.5516, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.1609195402298852e-05, + "loss": 0.5591, + "step": 101 + }, + { + "epoch": 0.02, + "learning_rate": 1.1724137931034483e-05, + "loss": 0.5489, + "step": 102 + }, + { + "epoch": 0.02, + "learning_rate": 1.1839080459770116e-05, + "loss": 0.5568, + "step": 103 + }, + { + "epoch": 0.02, + "learning_rate": 1.1954022988505748e-05, + "loss": 0.5457, + "step": 104 + }, + { + "epoch": 0.02, + "learning_rate": 1.206896551724138e-05, + "loss": 0.5429, + "step": 105 + }, + { + "epoch": 0.02, + "learning_rate": 1.2183908045977013e-05, + "loss": 0.5531, + "step": 106 + }, + { + "epoch": 0.02, + "learning_rate": 1.2298850574712644e-05, + "loss": 0.5406, + "step": 107 + }, + { + "epoch": 0.02, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.5414, + "step": 108 + }, + { + "epoch": 0.02, + "learning_rate": 1.2528735632183907e-05, + "loss": 0.5448, + "step": 109 + }, + { + "epoch": 0.02, + "learning_rate": 1.2643678160919542e-05, + "loss": 0.548, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.2758620689655174e-05, + "loss": 0.5415, + "step": 111 + }, + { + "epoch": 0.02, + "learning_rate": 1.2873563218390805e-05, + "loss": 0.5541, + "step": 112 + }, + { + "epoch": 0.02, + "learning_rate": 1.298850574712644e-05, + "loss": 0.5606, + "step": 113 + }, + { + "epoch": 0.02, + "learning_rate": 1.310344827586207e-05, + "loss": 0.542, + "step": 114 + }, + { + "epoch": 0.02, + "learning_rate": 1.3218390804597702e-05, + "loss": 0.5431, + "step": 115 + }, + { + "epoch": 0.02, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.5307, + "step": 116 + }, + { + "epoch": 0.02, + "learning_rate": 1.3448275862068967e-05, + "loss": 0.549, + "step": 117 + }, + { + "epoch": 0.02, + "learning_rate": 1.3563218390804598e-05, + "loss": 0.5463, + "step": 118 + }, + { + "epoch": 0.02, + "learning_rate": 1.367816091954023e-05, + "loss": 0.5432, + "step": 119 + }, + { + "epoch": 0.02, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.5369, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 1.3908045977011496e-05, + "loss": 0.5385, + "step": 121 + }, + { + "epoch": 0.02, + "learning_rate": 1.4022988505747128e-05, + "loss": 0.5413, + "step": 122 + }, + { + "epoch": 0.02, + "learning_rate": 1.4137931034482759e-05, + "loss": 0.5432, + "step": 123 + }, + { + "epoch": 0.02, + "learning_rate": 1.4252873563218392e-05, + "loss": 0.5262, + "step": 124 + }, + { + "epoch": 0.02, + "learning_rate": 1.4367816091954022e-05, + "loss": 0.5489, + "step": 125 + }, + { + "epoch": 0.02, + "learning_rate": 1.4482758620689657e-05, + "loss": 0.5398, + "step": 126 + }, + { + "epoch": 0.02, + "learning_rate": 1.459770114942529e-05, + "loss": 0.5472, + "step": 127 + }, + { + "epoch": 0.02, + "learning_rate": 1.471264367816092e-05, + "loss": 0.5424, + "step": 128 + }, + { + "epoch": 0.02, + "learning_rate": 1.4827586206896554e-05, + "loss": 0.5462, + "step": 129 + }, + { + "epoch": 0.02, + "learning_rate": 1.4942528735632185e-05, + "loss": 0.5408, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 1.5057471264367817e-05, + "loss": 0.5521, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.5362, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 1.528735632183908e-05, + "loss": 0.5475, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 1.540229885057471e-05, + "loss": 0.5244, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 1.5517241379310346e-05, + "loss": 0.5356, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 1.563218390804598e-05, + "loss": 0.5392, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 1.574712643678161e-05, + "loss": 0.5487, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 1.586206896551724e-05, + "loss": 0.5275, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 1.5977011494252876e-05, + "loss": 0.5268, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 1.6091954022988507e-05, + "loss": 0.5286, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.6206896551724137e-05, + "loss": 0.5404, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 1.632183908045977e-05, + "loss": 0.5323, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 1.6436781609195406e-05, + "loss": 0.5345, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.527, + "step": 144 + }, + { + "epoch": 0.03, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.5248, + "step": 145 + }, + { + "epoch": 0.03, + "learning_rate": 1.6781609195402298e-05, + "loss": 0.5292, + "step": 146 + }, + { + "epoch": 0.03, + "learning_rate": 1.6896551724137932e-05, + "loss": 0.5429, + "step": 147 + }, + { + "epoch": 0.03, + "learning_rate": 1.7011494252873563e-05, + "loss": 0.5322, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 1.7126436781609197e-05, + "loss": 0.5303, + "step": 149 + }, + { + "epoch": 0.03, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.5264, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.7356321839080462e-05, + "loss": 0.5303, + "step": 151 + }, + { + "epoch": 0.03, + "learning_rate": 1.7471264367816093e-05, + "loss": 0.5355, + "step": 152 + }, + { + "epoch": 0.03, + "learning_rate": 1.7586206896551724e-05, + "loss": 0.5251, + "step": 153 + }, + { + "epoch": 0.03, + "learning_rate": 1.770114942528736e-05, + "loss": 0.5325, + "step": 154 + }, + { + "epoch": 0.03, + "learning_rate": 1.781609195402299e-05, + "loss": 0.5274, + "step": 155 + }, + { + "epoch": 0.03, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.5273, + "step": 156 + }, + { + "epoch": 0.03, + "learning_rate": 1.8045977011494254e-05, + "loss": 0.5355, + "step": 157 + }, + { + "epoch": 0.03, + "learning_rate": 1.8160919540229885e-05, + "loss": 0.5371, + "step": 158 + }, + { + "epoch": 0.03, + "learning_rate": 1.827586206896552e-05, + "loss": 0.5453, + "step": 159 + }, + { + "epoch": 0.03, + "learning_rate": 1.839080459770115e-05, + "loss": 0.5329, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.8505747126436784e-05, + "loss": 0.5241, + "step": 161 + }, + { + "epoch": 0.03, + "learning_rate": 1.8620689655172415e-05, + "loss": 0.5136, + "step": 162 + }, + { + "epoch": 0.03, + "learning_rate": 1.873563218390805e-05, + "loss": 0.5355, + "step": 163 + }, + { + "epoch": 0.03, + "learning_rate": 1.885057471264368e-05, + "loss": 0.5372, + "step": 164 + }, + { + "epoch": 0.03, + "learning_rate": 1.896551724137931e-05, + "loss": 0.5187, + "step": 165 + }, + { + "epoch": 0.03, + "learning_rate": 1.908045977011494e-05, + "loss": 0.5368, + "step": 166 + }, + { + "epoch": 0.03, + "learning_rate": 1.9195402298850576e-05, + "loss": 0.5113, + "step": 167 + }, + { + "epoch": 0.03, + "learning_rate": 1.931034482758621e-05, + "loss": 0.5277, + "step": 168 + }, + { + "epoch": 0.03, + "learning_rate": 1.942528735632184e-05, + "loss": 0.5277, + "step": 169 + }, + { + "epoch": 0.03, + "learning_rate": 1.9540229885057475e-05, + "loss": 0.5299, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 1.9655172413793106e-05, + "loss": 0.5241, + "step": 171 + }, + { + "epoch": 0.03, + "learning_rate": 1.9770114942528737e-05, + "loss": 0.5478, + "step": 172 + }, + { + "epoch": 0.03, + "learning_rate": 1.9885057471264367e-05, + "loss": 0.5343, + "step": 173 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.5179, + "step": 174 + }, + { + "epoch": 0.03, + "learning_rate": 1.99999984252778e-05, + "loss": 0.5239, + "step": 175 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999993701111697e-05, + "loss": 0.5314, + "step": 176 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999985827503177e-05, + "loss": 0.535, + "step": 177 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999974804454722e-05, + "loss": 0.5273, + "step": 178 + }, + { + "epoch": 0.03, + "learning_rate": 1.99999606319698e-05, + "loss": 0.5348, + "step": 179 + }, + { + "epoch": 0.03, + "learning_rate": 1.999994331005288e-05, + "loss": 0.516, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999922838709414e-05, + "loss": 0.523, + "step": 181 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999899217945845e-05, + "loss": 0.5273, + "step": 182 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999872447769624e-05, + "loss": 0.526, + "step": 183 + }, + { + "epoch": 0.03, + "learning_rate": 1.999984252818917e-05, + "loss": 0.5228, + "step": 184 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999809459213914e-05, + "loss": 0.5195, + "step": 185 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999773240854266e-05, + "loss": 0.5289, + "step": 186 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999733873121638e-05, + "loss": 0.5073, + "step": 187 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999691356028422e-05, + "loss": 0.5311, + "step": 188 + }, + { + "epoch": 0.03, + "learning_rate": 1.999964568958801e-05, + "loss": 0.5266, + "step": 189 + }, + { + "epoch": 0.03, + "learning_rate": 1.999959687381479e-05, + "loss": 0.5216, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.999954490872413e-05, + "loss": 0.5132, + "step": 191 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999489794332404e-05, + "loss": 0.5388, + "step": 192 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999431530656958e-05, + "loss": 0.532, + "step": 193 + }, + { + "epoch": 0.03, + "learning_rate": 1.999937011771615e-05, + "loss": 0.5262, + "step": 194 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999305555529324e-05, + "loss": 0.5378, + "step": 195 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999237844116807e-05, + "loss": 0.5188, + "step": 196 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999166983499923e-05, + "loss": 0.5267, + "step": 197 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999092973701e-05, + "loss": 0.519, + "step": 198 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999015814743337e-05, + "loss": 0.5109, + "step": 199 + }, + { + "epoch": 0.03, + "learning_rate": 1.999893550665124e-05, + "loss": 0.5264, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998852049449998e-05, + "loss": 0.5141, + "step": 201 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998765443165896e-05, + "loss": 0.5398, + "step": 202 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998675687826214e-05, + "loss": 0.517, + "step": 203 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998582783459214e-05, + "loss": 0.5293, + "step": 204 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998486730094157e-05, + "loss": 0.5163, + "step": 205 + }, + { + "epoch": 0.04, + "learning_rate": 1.99983875277613e-05, + "loss": 0.5251, + "step": 206 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998285176491878e-05, + "loss": 0.5183, + "step": 207 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998179676318133e-05, + "loss": 0.528, + "step": 208 + }, + { + "epoch": 0.04, + "learning_rate": 1.999807102727329e-05, + "loss": 0.5123, + "step": 209 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997959229391567e-05, + "loss": 0.5251, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997844282708173e-05, + "loss": 0.5262, + "step": 211 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997726187259307e-05, + "loss": 0.5303, + "step": 212 + }, + { + "epoch": 0.04, + "learning_rate": 1.999760494308217e-05, + "loss": 0.5351, + "step": 213 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997480550214942e-05, + "loss": 0.5298, + "step": 214 + }, + { + "epoch": 0.04, + "learning_rate": 1.99973530086968e-05, + "loss": 0.5313, + "step": 215 + }, + { + "epoch": 0.04, + "learning_rate": 1.999722231856791e-05, + "loss": 0.5232, + "step": 216 + }, + { + "epoch": 0.04, + "learning_rate": 1.999708847986944e-05, + "loss": 0.5183, + "step": 217 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996951492643538e-05, + "loss": 0.5284, + "step": 218 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996811356933346e-05, + "loss": 0.5213, + "step": 219 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996668072783e-05, + "loss": 0.5176, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996521640237624e-05, + "loss": 0.5141, + "step": 221 + }, + { + "epoch": 0.04, + "learning_rate": 1.999637205934334e-05, + "loss": 0.5304, + "step": 222 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996219330147255e-05, + "loss": 0.5064, + "step": 223 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996063452697472e-05, + "loss": 0.5068, + "step": 224 + }, + { + "epoch": 0.04, + "learning_rate": 1.999590442704308e-05, + "loss": 0.5203, + "step": 225 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995742253234168e-05, + "loss": 0.5301, + "step": 226 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995576931321812e-05, + "loss": 0.513, + "step": 227 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995408461358074e-05, + "loss": 0.522, + "step": 228 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995236843396018e-05, + "loss": 0.5117, + "step": 229 + }, + { + "epoch": 0.04, + "learning_rate": 1.999506207748969e-05, + "loss": 0.5231, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 1.999488416369414e-05, + "loss": 0.5127, + "step": 231 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994703102065385e-05, + "loss": 0.521, + "step": 232 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994518892660463e-05, + "loss": 0.5242, + "step": 233 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994331535537385e-05, + "loss": 0.5205, + "step": 234 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994141030755158e-05, + "loss": 0.5007, + "step": 235 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993947378373782e-05, + "loss": 0.5293, + "step": 236 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993750578454248e-05, + "loss": 0.51, + "step": 237 + }, + { + "epoch": 0.04, + "learning_rate": 1.999355063105853e-05, + "loss": 0.5178, + "step": 238 + }, + { + "epoch": 0.04, + "learning_rate": 1.999334753624961e-05, + "loss": 0.5296, + "step": 239 + }, + { + "epoch": 0.04, + "learning_rate": 1.999314129409144e-05, + "loss": 0.5249, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 1.999293190464899e-05, + "loss": 0.5024, + "step": 241 + }, + { + "epoch": 0.04, + "learning_rate": 1.999271936798819e-05, + "loss": 0.5026, + "step": 242 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992503684175986e-05, + "loss": 0.5192, + "step": 243 + }, + { + "epoch": 0.04, + "learning_rate": 1.999228485328031e-05, + "loss": 0.5099, + "step": 244 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992062875370073e-05, + "loss": 0.5017, + "step": 245 + }, + { + "epoch": 0.04, + "learning_rate": 1.999183775051519e-05, + "loss": 0.5169, + "step": 246 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991609478786564e-05, + "loss": 0.5197, + "step": 247 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991378060256084e-05, + "loss": 0.5364, + "step": 248 + }, + { + "epoch": 0.04, + "learning_rate": 1.999114349499664e-05, + "loss": 0.503, + "step": 249 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990905783082098e-05, + "loss": 0.5201, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 1.999066492458733e-05, + "loss": 0.5104, + "step": 251 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990420919588196e-05, + "loss": 0.5157, + "step": 252 + }, + { + "epoch": 0.04, + "learning_rate": 1.999017376816154e-05, + "loss": 0.5122, + "step": 253 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989923470385198e-05, + "loss": 0.5206, + "step": 254 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989670026338002e-05, + "loss": 0.5213, + "step": 255 + }, + { + "epoch": 0.04, + "learning_rate": 1.998941343609978e-05, + "loss": 0.5101, + "step": 256 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989153699751332e-05, + "loss": 0.5015, + "step": 257 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988890817374472e-05, + "loss": 0.515, + "step": 258 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988624789051983e-05, + "loss": 0.5218, + "step": 259 + }, + { + "epoch": 0.05, + "learning_rate": 1.9988355614867654e-05, + "loss": 0.5197, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 1.998808329490626e-05, + "loss": 0.5071, + "step": 261 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987807829253568e-05, + "loss": 0.5263, + "step": 262 + }, + { + "epoch": 0.05, + "learning_rate": 1.998752921799633e-05, + "loss": 0.5148, + "step": 263 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987247461222297e-05, + "loss": 0.5179, + "step": 264 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986962559020203e-05, + "loss": 0.5303, + "step": 265 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986674511479783e-05, + "loss": 0.5189, + "step": 266 + }, + { + "epoch": 0.05, + "learning_rate": 1.998638331869175e-05, + "loss": 0.5192, + "step": 267 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986088980747817e-05, + "loss": 0.5199, + "step": 268 + }, + { + "epoch": 0.05, + "learning_rate": 1.998579149774068e-05, + "loss": 0.5067, + "step": 269 + }, + { + "epoch": 0.05, + "learning_rate": 1.998549086976403e-05, + "loss": 0.5262, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985187096912554e-05, + "loss": 0.5066, + "step": 271 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984880179281917e-05, + "loss": 0.5427, + "step": 272 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984570116968785e-05, + "loss": 0.5037, + "step": 273 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984256910070807e-05, + "loss": 0.5228, + "step": 274 + }, + { + "epoch": 0.05, + "learning_rate": 1.998394055868663e-05, + "loss": 0.5075, + "step": 275 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983621062915886e-05, + "loss": 0.5133, + "step": 276 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983298422859197e-05, + "loss": 0.4988, + "step": 277 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982972638618175e-05, + "loss": 0.5129, + "step": 278 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982643710295428e-05, + "loss": 0.5012, + "step": 279 + }, + { + "epoch": 0.05, + "learning_rate": 1.9982311637994547e-05, + "loss": 0.5289, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981976421820118e-05, + "loss": 0.5061, + "step": 281 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981638061877714e-05, + "loss": 0.5192, + "step": 282 + }, + { + "epoch": 0.05, + "learning_rate": 1.99812965582739e-05, + "loss": 0.5029, + "step": 283 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980951911116234e-05, + "loss": 0.5233, + "step": 284 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980604120513257e-05, + "loss": 0.5109, + "step": 285 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980253186574505e-05, + "loss": 0.5173, + "step": 286 + }, + { + "epoch": 0.05, + "learning_rate": 1.99798991094105e-05, + "loss": 0.4996, + "step": 287 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979541889132758e-05, + "loss": 0.518, + "step": 288 + }, + { + "epoch": 0.05, + "learning_rate": 1.997918152585379e-05, + "loss": 0.5019, + "step": 289 + }, + { + "epoch": 0.05, + "learning_rate": 1.997881801968708e-05, + "loss": 0.5075, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978451370747122e-05, + "loss": 0.5123, + "step": 291 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978081579149378e-05, + "loss": 0.5249, + "step": 292 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977708645010323e-05, + "loss": 0.5011, + "step": 293 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977332568447406e-05, + "loss": 0.4963, + "step": 294 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976953349579073e-05, + "loss": 0.5083, + "step": 295 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976570988524752e-05, + "loss": 0.5123, + "step": 296 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976185485404867e-05, + "loss": 0.5139, + "step": 297 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975796840340837e-05, + "loss": 0.5021, + "step": 298 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975405053455052e-05, + "loss": 0.5138, + "step": 299 + }, + { + "epoch": 0.05, + "learning_rate": 1.997501012487091e-05, + "loss": 0.5121, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 1.9974612054712792e-05, + "loss": 0.4994, + "step": 301 + }, + { + "epoch": 0.05, + "learning_rate": 1.9974210843106065e-05, + "loss": 0.5099, + "step": 302 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973806490177094e-05, + "loss": 0.5195, + "step": 303 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973398996053218e-05, + "loss": 0.5147, + "step": 304 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972988360862782e-05, + "loss": 0.5116, + "step": 305 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972574584735112e-05, + "loss": 0.5153, + "step": 306 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972157667800522e-05, + "loss": 0.5056, + "step": 307 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971737610190326e-05, + "loss": 0.5172, + "step": 308 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971314412036807e-05, + "loss": 0.5146, + "step": 309 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970888073473257e-05, + "loss": 0.5089, + "step": 310 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970458594633947e-05, + "loss": 0.5159, + "step": 311 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970025975654137e-05, + "loss": 0.5144, + "step": 312 + }, + { + "epoch": 0.05, + "learning_rate": 1.996959021667008e-05, + "loss": 0.516, + "step": 313 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969151317819014e-05, + "loss": 0.5195, + "step": 314 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968709279239172e-05, + "loss": 0.5033, + "step": 315 + }, + { + "epoch": 0.05, + "learning_rate": 1.996826410106977e-05, + "loss": 0.5096, + "step": 316 + }, + { + "epoch": 0.05, + "learning_rate": 1.996781578345101e-05, + "loss": 0.5041, + "step": 317 + }, + { + "epoch": 0.06, + "learning_rate": 1.996736432652409e-05, + "loss": 0.5246, + "step": 318 + }, + { + "epoch": 0.06, + "learning_rate": 1.9966909730431196e-05, + "loss": 0.5088, + "step": 319 + }, + { + "epoch": 0.06, + "learning_rate": 1.99664519953155e-05, + "loss": 0.5369, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965991121321156e-05, + "loss": 0.5081, + "step": 321 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965527108593325e-05, + "loss": 0.5189, + "step": 322 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965059957278135e-05, + "loss": 0.5071, + "step": 323 + }, + { + "epoch": 0.06, + "learning_rate": 1.9964589667522724e-05, + "loss": 0.5225, + "step": 324 + }, + { + "epoch": 0.06, + "learning_rate": 1.99641162394752e-05, + "loss": 0.5079, + "step": 325 + }, + { + "epoch": 0.06, + "learning_rate": 1.996363967328466e-05, + "loss": 0.5081, + "step": 326 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963159969101207e-05, + "loss": 0.5068, + "step": 327 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962677127075916e-05, + "loss": 0.5092, + "step": 328 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962191147360855e-05, + "loss": 0.4977, + "step": 329 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961702030109088e-05, + "loss": 0.5134, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 1.996120977547465e-05, + "loss": 0.5065, + "step": 331 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960714383612577e-05, + "loss": 0.5119, + "step": 332 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960215854678894e-05, + "loss": 0.5076, + "step": 333 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959714188830603e-05, + "loss": 0.4984, + "step": 334 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959209386225707e-05, + "loss": 0.5095, + "step": 335 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958701447023188e-05, + "loss": 0.505, + "step": 336 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958190371383016e-05, + "loss": 0.5217, + "step": 337 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957676159466154e-05, + "loss": 0.4897, + "step": 338 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957158811434552e-05, + "loss": 0.5075, + "step": 339 + }, + { + "epoch": 0.06, + "learning_rate": 1.995663832745115e-05, + "loss": 0.5113, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956114707679858e-05, + "loss": 0.5193, + "step": 341 + }, + { + "epoch": 0.06, + "learning_rate": 1.99555879522856e-05, + "loss": 0.5017, + "step": 342 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955058061434266e-05, + "loss": 0.5158, + "step": 343 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954525035292748e-05, + "loss": 0.5055, + "step": 344 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953988874028917e-05, + "loss": 0.5102, + "step": 345 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953449577811635e-05, + "loss": 0.4962, + "step": 346 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952907146810748e-05, + "loss": 0.5254, + "step": 347 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952361581197097e-05, + "loss": 0.5107, + "step": 348 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951812881142497e-05, + "loss": 0.5073, + "step": 349 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951261046819766e-05, + "loss": 0.5069, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 1.9950706078402696e-05, + "loss": 0.4961, + "step": 351 + }, + { + "epoch": 0.06, + "learning_rate": 1.9950147976066073e-05, + "loss": 0.4943, + "step": 352 + }, + { + "epoch": 0.06, + "learning_rate": 1.994958673998567e-05, + "loss": 0.5141, + "step": 353 + }, + { + "epoch": 0.06, + "learning_rate": 1.994902237033824e-05, + "loss": 0.5022, + "step": 354 + }, + { + "epoch": 0.06, + "learning_rate": 1.994845486730153e-05, + "loss": 0.5025, + "step": 355 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947884231054276e-05, + "loss": 0.5108, + "step": 356 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947310461776195e-05, + "loss": 0.5178, + "step": 357 + }, + { + "epoch": 0.06, + "learning_rate": 1.9946733559647987e-05, + "loss": 0.5009, + "step": 358 + }, + { + "epoch": 0.06, + "learning_rate": 1.9946153524851352e-05, + "loss": 0.5157, + "step": 359 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945570357568967e-05, + "loss": 0.5154, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 1.994498405798449e-05, + "loss": 0.527, + "step": 361 + }, + { + "epoch": 0.06, + "learning_rate": 1.994439462628258e-05, + "loss": 0.5034, + "step": 362 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943802062648877e-05, + "loss": 0.5117, + "step": 363 + }, + { + "epoch": 0.06, + "learning_rate": 1.994320636727e-05, + "loss": 0.5125, + "step": 364 + }, + { + "epoch": 0.06, + "learning_rate": 1.994260754033356e-05, + "loss": 0.5067, + "step": 365 + }, + { + "epoch": 0.06, + "learning_rate": 1.994200558202816e-05, + "loss": 0.496, + "step": 366 + }, + { + "epoch": 0.06, + "learning_rate": 1.9941400492543376e-05, + "loss": 0.508, + "step": 367 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940792272069783e-05, + "loss": 0.5069, + "step": 368 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940180920798934e-05, + "loss": 0.5098, + "step": 369 + }, + { + "epoch": 0.06, + "learning_rate": 1.993956643892337e-05, + "loss": 0.5125, + "step": 370 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938948826636625e-05, + "loss": 0.501, + "step": 371 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938328084133206e-05, + "loss": 0.512, + "step": 372 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937704211608615e-05, + "loss": 0.5049, + "step": 373 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937077209259333e-05, + "loss": 0.5034, + "step": 374 + }, + { + "epoch": 0.06, + "learning_rate": 1.993644707728284e-05, + "loss": 0.5152, + "step": 375 + }, + { + "epoch": 0.07, + "learning_rate": 1.993581381587758e-05, + "loss": 0.5045, + "step": 376 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935177425243007e-05, + "loss": 0.5107, + "step": 377 + }, + { + "epoch": 0.07, + "learning_rate": 1.993453790557954e-05, + "loss": 0.5137, + "step": 378 + }, + { + "epoch": 0.07, + "learning_rate": 1.99338952570886e-05, + "loss": 0.5065, + "step": 379 + }, + { + "epoch": 0.07, + "learning_rate": 1.993324947997258e-05, + "loss": 0.5046, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932600574434864e-05, + "loss": 0.5151, + "step": 381 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931948540679822e-05, + "loss": 0.5004, + "step": 382 + }, + { + "epoch": 0.07, + "learning_rate": 1.993129337891281e-05, + "loss": 0.5128, + "step": 383 + }, + { + "epoch": 0.07, + "learning_rate": 1.9930635089340168e-05, + "loss": 0.5013, + "step": 384 + }, + { + "epoch": 0.07, + "learning_rate": 1.992997367216922e-05, + "loss": 0.5062, + "step": 385 + }, + { + "epoch": 0.07, + "learning_rate": 1.992930912760827e-05, + "loss": 0.5075, + "step": 386 + }, + { + "epoch": 0.07, + "learning_rate": 1.992864145586662e-05, + "loss": 0.5185, + "step": 387 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927970657154548e-05, + "loss": 0.486, + "step": 388 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927296731683317e-05, + "loss": 0.5152, + "step": 389 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926619679665175e-05, + "loss": 0.5066, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925939501313358e-05, + "loss": 0.5078, + "step": 391 + }, + { + "epoch": 0.07, + "learning_rate": 1.992525619684208e-05, + "loss": 0.5135, + "step": 392 + }, + { + "epoch": 0.07, + "learning_rate": 1.9924569766466552e-05, + "loss": 0.5143, + "step": 393 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923880210402956e-05, + "loss": 0.503, + "step": 394 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923187528868463e-05, + "loss": 0.5251, + "step": 395 + }, + { + "epoch": 0.07, + "learning_rate": 1.9922491722081235e-05, + "loss": 0.5049, + "step": 396 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921792790260402e-05, + "loss": 0.5091, + "step": 397 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921090733626102e-05, + "loss": 0.4971, + "step": 398 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920385552399434e-05, + "loss": 0.5029, + "step": 399 + }, + { + "epoch": 0.07, + "learning_rate": 1.9919677246802492e-05, + "loss": 0.5008, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918965817058357e-05, + "loss": 0.4976, + "step": 401 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918251263391086e-05, + "loss": 0.5129, + "step": 402 + }, + { + "epoch": 0.07, + "learning_rate": 1.9917533586025725e-05, + "loss": 0.5045, + "step": 403 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916812785188305e-05, + "loss": 0.5052, + "step": 404 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916088861105835e-05, + "loss": 0.5116, + "step": 405 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915361814006312e-05, + "loss": 0.5115, + "step": 406 + }, + { + "epoch": 0.07, + "learning_rate": 1.9914631644118712e-05, + "loss": 0.5029, + "step": 407 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913898351673006e-05, + "loss": 0.504, + "step": 408 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913161936900135e-05, + "loss": 0.5115, + "step": 409 + }, + { + "epoch": 0.07, + "learning_rate": 1.9912422400032027e-05, + "loss": 0.4922, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 1.99116797413016e-05, + "loss": 0.5161, + "step": 411 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910933960942747e-05, + "loss": 0.5055, + "step": 412 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910185059190344e-05, + "loss": 0.5047, + "step": 413 + }, + { + "epoch": 0.07, + "learning_rate": 1.990943303628026e-05, + "loss": 0.5034, + "step": 414 + }, + { + "epoch": 0.07, + "learning_rate": 1.990867789244934e-05, + "loss": 0.5254, + "step": 415 + }, + { + "epoch": 0.07, + "learning_rate": 1.990791962793541e-05, + "loss": 0.4984, + "step": 416 + }, + { + "epoch": 0.07, + "learning_rate": 1.990715824297728e-05, + "loss": 0.5208, + "step": 417 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906393737814748e-05, + "loss": 0.5087, + "step": 418 + }, + { + "epoch": 0.07, + "learning_rate": 1.990562611268858e-05, + "loss": 0.4982, + "step": 419 + }, + { + "epoch": 0.07, + "learning_rate": 1.990485536784055e-05, + "loss": 0.4952, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 1.9904081503513395e-05, + "loss": 0.5156, + "step": 421 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903304519950833e-05, + "loss": 0.5065, + "step": 422 + }, + { + "epoch": 0.07, + "learning_rate": 1.990252441739758e-05, + "loss": 0.5126, + "step": 423 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901741196099313e-05, + "loss": 0.4999, + "step": 424 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900954856302715e-05, + "loss": 0.5116, + "step": 425 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900165398255434e-05, + "loss": 0.4924, + "step": 426 + }, + { + "epoch": 0.07, + "learning_rate": 1.9899372822206105e-05, + "loss": 0.5066, + "step": 427 + }, + { + "epoch": 0.07, + "learning_rate": 1.9898577128404343e-05, + "loss": 0.5091, + "step": 428 + }, + { + "epoch": 0.07, + "learning_rate": 1.9897778317100754e-05, + "loss": 0.4911, + "step": 429 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896976388546915e-05, + "loss": 0.4944, + "step": 430 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896171342995392e-05, + "loss": 0.5091, + "step": 431 + }, + { + "epoch": 0.07, + "learning_rate": 1.989536318069973e-05, + "loss": 0.5162, + "step": 432 + }, + { + "epoch": 0.08, + "learning_rate": 1.9894551901914445e-05, + "loss": 0.5103, + "step": 433 + }, + { + "epoch": 0.08, + "learning_rate": 1.989373750689506e-05, + "loss": 0.4951, + "step": 434 + }, + { + "epoch": 0.08, + "learning_rate": 1.9892919995898052e-05, + "loss": 0.5102, + "step": 435 + }, + { + "epoch": 0.08, + "learning_rate": 1.98920993691809e-05, + "loss": 0.51, + "step": 436 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891275627002043e-05, + "loss": 0.5001, + "step": 437 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890448769620932e-05, + "loss": 0.4947, + "step": 438 + }, + { + "epoch": 0.08, + "learning_rate": 1.988961879729797e-05, + "loss": 0.5059, + "step": 439 + }, + { + "epoch": 0.08, + "learning_rate": 1.9888785710294552e-05, + "loss": 0.5001, + "step": 440 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887949508873058e-05, + "loss": 0.5009, + "step": 441 + }, + { + "epoch": 0.08, + "learning_rate": 1.988711019329684e-05, + "loss": 0.4887, + "step": 442 + }, + { + "epoch": 0.08, + "learning_rate": 1.9886267763830245e-05, + "loss": 0.5005, + "step": 443 + }, + { + "epoch": 0.08, + "learning_rate": 1.9885422220738583e-05, + "loss": 0.4987, + "step": 444 + }, + { + "epoch": 0.08, + "learning_rate": 1.9884573564288154e-05, + "loss": 0.4964, + "step": 445 + }, + { + "epoch": 0.08, + "learning_rate": 1.9883721794746242e-05, + "loss": 0.4959, + "step": 446 + }, + { + "epoch": 0.08, + "learning_rate": 1.9882866912381105e-05, + "loss": 0.5038, + "step": 447 + }, + { + "epoch": 0.08, + "learning_rate": 1.988200891746198e-05, + "loss": 0.5196, + "step": 448 + }, + { + "epoch": 0.08, + "learning_rate": 1.9881147810259094e-05, + "loss": 0.5063, + "step": 449 + }, + { + "epoch": 0.08, + "learning_rate": 1.988028359104364e-05, + "loss": 0.51, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879416260087808e-05, + "loss": 0.5078, + "step": 451 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878545817664752e-05, + "loss": 0.498, + "step": 452 + }, + { + "epoch": 0.08, + "learning_rate": 1.9877672264048618e-05, + "loss": 0.5075, + "step": 453 + }, + { + "epoch": 0.08, + "learning_rate": 1.9876795599514523e-05, + "loss": 0.5062, + "step": 454 + }, + { + "epoch": 0.08, + "learning_rate": 1.987591582433857e-05, + "loss": 0.5133, + "step": 455 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875032938797837e-05, + "loss": 0.4992, + "step": 456 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874146943170386e-05, + "loss": 0.4929, + "step": 457 + }, + { + "epoch": 0.08, + "learning_rate": 1.9873257837735257e-05, + "loss": 0.5224, + "step": 458 + }, + { + "epoch": 0.08, + "learning_rate": 1.9872365622772464e-05, + "loss": 0.5048, + "step": 459 + }, + { + "epoch": 0.08, + "learning_rate": 1.987147029856301e-05, + "loss": 0.4979, + "step": 460 + }, + { + "epoch": 0.08, + "learning_rate": 1.9870571865388873e-05, + "loss": 0.5262, + "step": 461 + }, + { + "epoch": 0.08, + "learning_rate": 1.9869670323533005e-05, + "loss": 0.4916, + "step": 462 + }, + { + "epoch": 0.08, + "learning_rate": 1.9868765673279347e-05, + "loss": 0.4974, + "step": 463 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867857914912808e-05, + "loss": 0.495, + "step": 464 + }, + { + "epoch": 0.08, + "learning_rate": 1.9866947048719285e-05, + "loss": 0.5099, + "step": 465 + }, + { + "epoch": 0.08, + "learning_rate": 1.986603307498565e-05, + "loss": 0.4879, + "step": 466 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865115993999755e-05, + "loss": 0.5051, + "step": 467 + }, + { + "epoch": 0.08, + "learning_rate": 1.9864195806050425e-05, + "loss": 0.4841, + "step": 468 + }, + { + "epoch": 0.08, + "learning_rate": 1.9863272511427475e-05, + "loss": 0.5163, + "step": 469 + }, + { + "epoch": 0.08, + "learning_rate": 1.9862346110421682e-05, + "loss": 0.4991, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 1.986141660332482e-05, + "loss": 0.508, + "step": 471 + }, + { + "epoch": 0.08, + "learning_rate": 1.986048399042963e-05, + "loss": 0.501, + "step": 472 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859548272029828e-05, + "loss": 0.4963, + "step": 473 + }, + { + "epoch": 0.08, + "learning_rate": 1.9858609448420118e-05, + "loss": 0.4996, + "step": 474 + }, + { + "epoch": 0.08, + "learning_rate": 1.9857667519896176e-05, + "loss": 0.4953, + "step": 475 + }, + { + "epoch": 0.08, + "learning_rate": 1.985672248675466e-05, + "loss": 0.4918, + "step": 476 + }, + { + "epoch": 0.08, + "learning_rate": 1.98557743492932e-05, + "loss": 0.5062, + "step": 477 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854823107810402e-05, + "loss": 0.4982, + "step": 478 + }, + { + "epoch": 0.08, + "learning_rate": 1.9853868762605865e-05, + "loss": 0.4986, + "step": 479 + }, + { + "epoch": 0.08, + "learning_rate": 1.9852911313980146e-05, + "loss": 0.4995, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 1.9851950762234794e-05, + "loss": 0.4999, + "step": 481 + }, + { + "epoch": 0.08, + "learning_rate": 1.9850987107672322e-05, + "loss": 0.4976, + "step": 482 + }, + { + "epoch": 0.08, + "learning_rate": 1.9850020350596237e-05, + "loss": 0.5057, + "step": 483 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849050491311005e-05, + "loss": 0.5108, + "step": 484 + }, + { + "epoch": 0.08, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.5035, + "step": 485 + }, + { + "epoch": 0.08, + "learning_rate": 1.9847101467335895e-05, + "loss": 0.4953, + "step": 486 + }, + { + "epoch": 0.08, + "learning_rate": 1.9846122303259855e-05, + "loss": 0.5039, + "step": 487 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845140038202338e-05, + "loss": 0.4933, + "step": 488 + }, + { + "epoch": 0.08, + "learning_rate": 1.9844154672472707e-05, + "loss": 0.4985, + "step": 489 + }, + { + "epoch": 0.08, + "learning_rate": 1.9843166206381296e-05, + "loss": 0.5092, + "step": 490 + }, + { + "epoch": 0.09, + "learning_rate": 1.9842174640239415e-05, + "loss": 0.5042, + "step": 491 + }, + { + "epoch": 0.09, + "learning_rate": 1.984117997435935e-05, + "loss": 0.5104, + "step": 492 + }, + { + "epoch": 0.09, + "learning_rate": 1.9840182209054377e-05, + "loss": 0.4955, + "step": 493 + }, + { + "epoch": 0.09, + "learning_rate": 1.9839181344638722e-05, + "loss": 0.508, + "step": 494 + }, + { + "epoch": 0.09, + "learning_rate": 1.9838177381427613e-05, + "loss": 0.5043, + "step": 495 + }, + { + "epoch": 0.09, + "learning_rate": 1.9837170319737236e-05, + "loss": 0.5014, + "step": 496 + }, + { + "epoch": 0.09, + "learning_rate": 1.9836160159884762e-05, + "loss": 0.4948, + "step": 497 + }, + { + "epoch": 0.09, + "learning_rate": 1.9835146902188336e-05, + "loss": 0.5076, + "step": 498 + }, + { + "epoch": 0.09, + "learning_rate": 1.9834130546967073e-05, + "loss": 0.5062, + "step": 499 + }, + { + "epoch": 0.09, + "learning_rate": 1.983311109454108e-05, + "loss": 0.4908, + "step": 500 + }, + { + "epoch": 0.09, + "learning_rate": 1.983208854523141e-05, + "loss": 0.499, + "step": 501 + }, + { + "epoch": 0.09, + "learning_rate": 1.983106289936013e-05, + "loss": 0.4919, + "step": 502 + }, + { + "epoch": 0.09, + "learning_rate": 1.9830034157250245e-05, + "loss": 0.5054, + "step": 503 + }, + { + "epoch": 0.09, + "learning_rate": 1.9829002319225754e-05, + "loss": 0.4974, + "step": 504 + }, + { + "epoch": 0.09, + "learning_rate": 1.9827967385611638e-05, + "loss": 0.4992, + "step": 505 + }, + { + "epoch": 0.09, + "learning_rate": 1.9826929356733836e-05, + "loss": 0.5083, + "step": 506 + }, + { + "epoch": 0.09, + "learning_rate": 1.9825888232919268e-05, + "loss": 0.5134, + "step": 507 + }, + { + "epoch": 0.09, + "learning_rate": 1.9824844014495835e-05, + "loss": 0.5082, + "step": 508 + }, + { + "epoch": 0.09, + "learning_rate": 1.9823796701792405e-05, + "loss": 0.4943, + "step": 509 + }, + { + "epoch": 0.09, + "learning_rate": 1.9822746295138827e-05, + "loss": 0.5056, + "step": 510 + }, + { + "epoch": 0.09, + "learning_rate": 1.9821692794865918e-05, + "loss": 0.495, + "step": 511 + }, + { + "epoch": 0.09, + "learning_rate": 1.982063620130547e-05, + "loss": 0.4864, + "step": 512 + }, + { + "epoch": 0.09, + "learning_rate": 1.9819576514790254e-05, + "loss": 0.4939, + "step": 513 + }, + { + "epoch": 0.09, + "learning_rate": 1.9818513735654012e-05, + "loss": 0.5137, + "step": 514 + }, + { + "epoch": 0.09, + "learning_rate": 1.981744786423146e-05, + "loss": 0.5134, + "step": 515 + }, + { + "epoch": 0.09, + "learning_rate": 1.9816378900858288e-05, + "loss": 0.4942, + "step": 516 + }, + { + "epoch": 0.09, + "learning_rate": 1.9815306845871163e-05, + "loss": 0.4944, + "step": 517 + }, + { + "epoch": 0.09, + "learning_rate": 1.981423169960772e-05, + "loss": 0.5076, + "step": 518 + }, + { + "epoch": 0.09, + "learning_rate": 1.981315346240657e-05, + "loss": 0.4909, + "step": 519 + }, + { + "epoch": 0.09, + "learning_rate": 1.981207213460729e-05, + "loss": 0.4992, + "step": 520 + }, + { + "epoch": 0.09, + "learning_rate": 1.9810987716550458e-05, + "loss": 0.4944, + "step": 521 + }, + { + "epoch": 0.09, + "learning_rate": 1.9809900208577586e-05, + "loss": 0.4966, + "step": 522 + }, + { + "epoch": 0.09, + "learning_rate": 1.980880961103119e-05, + "loss": 0.4817, + "step": 523 + }, + { + "epoch": 0.09, + "learning_rate": 1.9807715924254743e-05, + "loss": 0.4879, + "step": 524 + }, + { + "epoch": 0.09, + "learning_rate": 1.98066191485927e-05, + "loss": 0.5011, + "step": 525 + }, + { + "epoch": 0.09, + "learning_rate": 1.980551928439048e-05, + "loss": 0.5055, + "step": 526 + }, + { + "epoch": 0.09, + "learning_rate": 1.980441633199448e-05, + "loss": 0.4915, + "step": 527 + }, + { + "epoch": 0.09, + "learning_rate": 1.980331029175207e-05, + "loss": 0.5072, + "step": 528 + }, + { + "epoch": 0.09, + "learning_rate": 1.9802201164011587e-05, + "loss": 0.4988, + "step": 529 + }, + { + "epoch": 0.09, + "learning_rate": 1.980108894912235e-05, + "loss": 0.4962, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 1.979997364743464e-05, + "loss": 0.4929, + "step": 531 + }, + { + "epoch": 0.09, + "learning_rate": 1.979885525929972e-05, + "loss": 0.5002, + "step": 532 + }, + { + "epoch": 0.09, + "learning_rate": 1.979773378506982e-05, + "loss": 0.4916, + "step": 533 + }, + { + "epoch": 0.09, + "learning_rate": 1.9796609225098136e-05, + "loss": 0.5075, + "step": 534 + }, + { + "epoch": 0.09, + "learning_rate": 1.9795481579738848e-05, + "loss": 0.5014, + "step": 535 + }, + { + "epoch": 0.09, + "learning_rate": 1.97943508493471e-05, + "loss": 0.5107, + "step": 536 + }, + { + "epoch": 0.09, + "learning_rate": 1.979321703427901e-05, + "loss": 0.5007, + "step": 537 + }, + { + "epoch": 0.09, + "learning_rate": 1.9792080134891662e-05, + "loss": 0.5064, + "step": 538 + }, + { + "epoch": 0.09, + "learning_rate": 1.9790940151543122e-05, + "loss": 0.5029, + "step": 539 + }, + { + "epoch": 0.09, + "learning_rate": 1.9789797084592418e-05, + "loss": 0.5052, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 1.9788650934399553e-05, + "loss": 0.4913, + "step": 541 + }, + { + "epoch": 0.09, + "learning_rate": 1.9787501701325505e-05, + "loss": 0.502, + "step": 542 + }, + { + "epoch": 0.09, + "learning_rate": 1.9786349385732212e-05, + "loss": 0.4988, + "step": 543 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785193987982593e-05, + "loss": 0.5049, + "step": 544 + }, + { + "epoch": 0.09, + "learning_rate": 1.9784035508440534e-05, + "loss": 0.4889, + "step": 545 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782873947470888e-05, + "loss": 0.499, + "step": 546 + }, + { + "epoch": 0.09, + "learning_rate": 1.9781709305439486e-05, + "loss": 0.5083, + "step": 547 + }, + { + "epoch": 0.09, + "learning_rate": 1.9780541582713128e-05, + "loss": 0.5017, + "step": 548 + }, + { + "epoch": 0.1, + "learning_rate": 1.9779370779659578e-05, + "loss": 0.4985, + "step": 549 + }, + { + "epoch": 0.1, + "learning_rate": 1.9778196896647572e-05, + "loss": 0.4972, + "step": 550 + }, + { + "epoch": 0.1, + "learning_rate": 1.977701993404682e-05, + "loss": 0.5008, + "step": 551 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775839892228004e-05, + "loss": 0.5087, + "step": 552 + }, + { + "epoch": 0.1, + "learning_rate": 1.9774656771562764e-05, + "loss": 0.4935, + "step": 553 + }, + { + "epoch": 0.1, + "learning_rate": 1.977347057242372e-05, + "loss": 0.5055, + "step": 554 + }, + { + "epoch": 0.1, + "learning_rate": 1.9772281295184465e-05, + "loss": 0.5013, + "step": 555 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771088940219546e-05, + "loss": 0.5, + "step": 556 + }, + { + "epoch": 0.1, + "learning_rate": 1.97698935079045e-05, + "loss": 0.4929, + "step": 557 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768694998615805e-05, + "loss": 0.5258, + "step": 558 + }, + { + "epoch": 0.1, + "learning_rate": 1.976749341273094e-05, + "loss": 0.5014, + "step": 559 + }, + { + "epoch": 0.1, + "learning_rate": 1.9766288750628327e-05, + "loss": 0.4991, + "step": 560 + }, + { + "epoch": 0.1, + "learning_rate": 1.976508101268738e-05, + "loss": 0.4859, + "step": 561 + }, + { + "epoch": 0.1, + "learning_rate": 1.976387019928846e-05, + "loss": 0.4919, + "step": 562 + }, + { + "epoch": 0.1, + "learning_rate": 1.97626563108129e-05, + "loss": 0.496, + "step": 563 + }, + { + "epoch": 0.1, + "learning_rate": 1.9761439347643027e-05, + "loss": 0.5063, + "step": 564 + }, + { + "epoch": 0.1, + "learning_rate": 1.97602193101621e-05, + "loss": 0.4906, + "step": 565 + }, + { + "epoch": 0.1, + "learning_rate": 1.9758996198754364e-05, + "loss": 0.4971, + "step": 566 + }, + { + "epoch": 0.1, + "learning_rate": 1.975777001380504e-05, + "loss": 0.5023, + "step": 567 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756540755700308e-05, + "loss": 0.4956, + "step": 568 + }, + { + "epoch": 0.1, + "learning_rate": 1.9755308424827303e-05, + "loss": 0.5023, + "step": 569 + }, + { + "epoch": 0.1, + "learning_rate": 1.9754073021574153e-05, + "loss": 0.4913, + "step": 570 + }, + { + "epoch": 0.1, + "learning_rate": 1.9752834546329944e-05, + "loss": 0.4865, + "step": 571 + }, + { + "epoch": 0.1, + "learning_rate": 1.9751592999484713e-05, + "loss": 0.5035, + "step": 572 + }, + { + "epoch": 0.1, + "learning_rate": 1.9750348381429484e-05, + "loss": 0.4981, + "step": 573 + }, + { + "epoch": 0.1, + "learning_rate": 1.974910069255625e-05, + "loss": 0.5027, + "step": 574 + }, + { + "epoch": 0.1, + "learning_rate": 1.9747849933257955e-05, + "loss": 0.4961, + "step": 575 + }, + { + "epoch": 0.1, + "learning_rate": 1.9746596103928524e-05, + "loss": 0.5046, + "step": 576 + }, + { + "epoch": 0.1, + "learning_rate": 1.974533920496284e-05, + "loss": 0.4902, + "step": 577 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744079236756756e-05, + "loss": 0.5076, + "step": 578 + }, + { + "epoch": 0.1, + "learning_rate": 1.9742816199707096e-05, + "loss": 0.483, + "step": 579 + }, + { + "epoch": 0.1, + "learning_rate": 1.9741550094211647e-05, + "loss": 0.5074, + "step": 580 + }, + { + "epoch": 0.1, + "learning_rate": 1.9740280920669153e-05, + "loss": 0.4936, + "step": 581 + }, + { + "epoch": 0.1, + "learning_rate": 1.973900867947934e-05, + "loss": 0.4946, + "step": 582 + }, + { + "epoch": 0.1, + "learning_rate": 1.9737733371042894e-05, + "loss": 0.4804, + "step": 583 + }, + { + "epoch": 0.1, + "learning_rate": 1.9736454995761468e-05, + "loss": 0.4976, + "step": 584 + }, + { + "epoch": 0.1, + "learning_rate": 1.973517355403767e-05, + "loss": 0.502, + "step": 585 + }, + { + "epoch": 0.1, + "learning_rate": 1.9733889046275095e-05, + "loss": 0.5016, + "step": 586 + }, + { + "epoch": 0.1, + "learning_rate": 1.9732601472878282e-05, + "loss": 0.5068, + "step": 587 + }, + { + "epoch": 0.1, + "learning_rate": 1.9731310834252747e-05, + "loss": 0.5192, + "step": 588 + }, + { + "epoch": 0.1, + "learning_rate": 1.9730017130804976e-05, + "loss": 0.4961, + "step": 589 + }, + { + "epoch": 0.1, + "learning_rate": 1.9728720362942404e-05, + "loss": 0.5111, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 1.9727420531073447e-05, + "loss": 0.5004, + "step": 591 + }, + { + "epoch": 0.1, + "learning_rate": 1.972611763560748e-05, + "loss": 0.4893, + "step": 592 + }, + { + "epoch": 0.1, + "learning_rate": 1.972481167695484e-05, + "loss": 0.4979, + "step": 593 + }, + { + "epoch": 0.1, + "learning_rate": 1.9723502655526832e-05, + "loss": 0.4872, + "step": 594 + }, + { + "epoch": 0.1, + "learning_rate": 1.9722190571735725e-05, + "loss": 0.4964, + "step": 595 + }, + { + "epoch": 0.1, + "learning_rate": 1.9720875425994758e-05, + "loss": 0.5013, + "step": 596 + }, + { + "epoch": 0.1, + "learning_rate": 1.9719557218718116e-05, + "loss": 0.4933, + "step": 597 + }, + { + "epoch": 0.1, + "learning_rate": 1.9718235950320978e-05, + "loss": 0.5033, + "step": 598 + }, + { + "epoch": 0.1, + "learning_rate": 1.9716911621219453e-05, + "loss": 0.5092, + "step": 599 + }, + { + "epoch": 0.1, + "learning_rate": 1.9715584231830642e-05, + "loss": 0.4879, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 1.9714253782572598e-05, + "loss": 0.5023, + "step": 601 + }, + { + "epoch": 0.1, + "learning_rate": 1.9712920273864333e-05, + "loss": 0.4989, + "step": 602 + }, + { + "epoch": 0.1, + "learning_rate": 1.9711583706125835e-05, + "loss": 0.4996, + "step": 603 + }, + { + "epoch": 0.1, + "learning_rate": 1.9710244079778042e-05, + "loss": 0.502, + "step": 604 + }, + { + "epoch": 0.1, + "learning_rate": 1.970890139524287e-05, + "loss": 0.5031, + "step": 605 + }, + { + "epoch": 0.1, + "learning_rate": 1.970755565294318e-05, + "loss": 0.507, + "step": 606 + }, + { + "epoch": 0.11, + "learning_rate": 1.9706206853302815e-05, + "loss": 0.4954, + "step": 607 + }, + { + "epoch": 0.11, + "learning_rate": 1.9704854996746565e-05, + "loss": 0.4885, + "step": 608 + }, + { + "epoch": 0.11, + "learning_rate": 1.9703500083700196e-05, + "loss": 0.4874, + "step": 609 + }, + { + "epoch": 0.11, + "learning_rate": 1.9702142114590426e-05, + "loss": 0.5074, + "step": 610 + }, + { + "epoch": 0.11, + "learning_rate": 1.970078108984494e-05, + "loss": 0.4931, + "step": 611 + }, + { + "epoch": 0.11, + "learning_rate": 1.969941700989239e-05, + "loss": 0.4933, + "step": 612 + }, + { + "epoch": 0.11, + "learning_rate": 1.9698049875162377e-05, + "loss": 0.4932, + "step": 613 + }, + { + "epoch": 0.11, + "learning_rate": 1.969667968608548e-05, + "loss": 0.5098, + "step": 614 + }, + { + "epoch": 0.11, + "learning_rate": 1.969530644309323e-05, + "loss": 0.4902, + "step": 615 + }, + { + "epoch": 0.11, + "learning_rate": 1.969393014661812e-05, + "loss": 0.5083, + "step": 616 + }, + { + "epoch": 0.11, + "learning_rate": 1.969255079709361e-05, + "loss": 0.4884, + "step": 617 + }, + { + "epoch": 0.11, + "learning_rate": 1.9691168394954117e-05, + "loss": 0.488, + "step": 618 + }, + { + "epoch": 0.11, + "learning_rate": 1.968978294063502e-05, + "loss": 0.5094, + "step": 619 + }, + { + "epoch": 0.11, + "learning_rate": 1.9688394434572666e-05, + "loss": 0.5111, + "step": 620 + }, + { + "epoch": 0.11, + "learning_rate": 1.9687002877204347e-05, + "loss": 0.4851, + "step": 621 + }, + { + "epoch": 0.11, + "learning_rate": 1.968560826896833e-05, + "loss": 0.5144, + "step": 622 + }, + { + "epoch": 0.11, + "learning_rate": 1.9684210610303848e-05, + "loss": 0.5036, + "step": 623 + }, + { + "epoch": 0.11, + "learning_rate": 1.9682809901651074e-05, + "loss": 0.5031, + "step": 624 + }, + { + "epoch": 0.11, + "learning_rate": 1.968140614345116e-05, + "loss": 0.5042, + "step": 625 + }, + { + "epoch": 0.11, + "learning_rate": 1.967999933614621e-05, + "loss": 0.4961, + "step": 626 + }, + { + "epoch": 0.11, + "learning_rate": 1.967858948017929e-05, + "loss": 0.4822, + "step": 627 + }, + { + "epoch": 0.11, + "learning_rate": 1.9677176575994425e-05, + "loss": 0.5064, + "step": 628 + }, + { + "epoch": 0.11, + "learning_rate": 1.9675760624036605e-05, + "loss": 0.5004, + "step": 629 + }, + { + "epoch": 0.11, + "learning_rate": 1.967434162475177e-05, + "loss": 0.4946, + "step": 630 + }, + { + "epoch": 0.11, + "learning_rate": 1.9672919578586832e-05, + "loss": 0.4925, + "step": 631 + }, + { + "epoch": 0.11, + "learning_rate": 1.9671494485989656e-05, + "loss": 0.4967, + "step": 632 + }, + { + "epoch": 0.11, + "learning_rate": 1.9670066347409063e-05, + "loss": 0.4867, + "step": 633 + }, + { + "epoch": 0.11, + "learning_rate": 1.966863516329484e-05, + "loss": 0.5025, + "step": 634 + }, + { + "epoch": 0.11, + "learning_rate": 1.966720093409773e-05, + "loss": 0.4966, + "step": 635 + }, + { + "epoch": 0.11, + "learning_rate": 1.9665763660269436e-05, + "loss": 0.4956, + "step": 636 + }, + { + "epoch": 0.11, + "learning_rate": 1.9664323342262623e-05, + "loss": 0.4999, + "step": 637 + }, + { + "epoch": 0.11, + "learning_rate": 1.96628799805309e-05, + "loss": 0.4919, + "step": 638 + }, + { + "epoch": 0.11, + "learning_rate": 1.966143357552886e-05, + "loss": 0.4875, + "step": 639 + }, + { + "epoch": 0.11, + "learning_rate": 1.9659984127712027e-05, + "loss": 0.4946, + "step": 640 + }, + { + "epoch": 0.11, + "learning_rate": 1.9658531637536905e-05, + "loss": 0.4844, + "step": 641 + }, + { + "epoch": 0.11, + "learning_rate": 1.9657076105460945e-05, + "loss": 0.4926, + "step": 642 + }, + { + "epoch": 0.11, + "learning_rate": 1.965561753194256e-05, + "loss": 0.4923, + "step": 643 + }, + { + "epoch": 0.11, + "learning_rate": 1.965415591744112e-05, + "loss": 0.4929, + "step": 644 + }, + { + "epoch": 0.11, + "learning_rate": 1.965269126241695e-05, + "loss": 0.4983, + "step": 645 + }, + { + "epoch": 0.11, + "learning_rate": 1.9651223567331333e-05, + "loss": 0.4932, + "step": 646 + }, + { + "epoch": 0.11, + "learning_rate": 1.964975283264652e-05, + "loss": 0.4953, + "step": 647 + }, + { + "epoch": 0.11, + "learning_rate": 1.9648279058825702e-05, + "loss": 0.493, + "step": 648 + }, + { + "epoch": 0.11, + "learning_rate": 1.964680224633304e-05, + "loss": 0.4916, + "step": 649 + }, + { + "epoch": 0.11, + "learning_rate": 1.9645322395633647e-05, + "loss": 0.4984, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 1.964383950719359e-05, + "loss": 0.4904, + "step": 651 + }, + { + "epoch": 0.11, + "learning_rate": 1.9642353581479904e-05, + "loss": 0.5009, + "step": 652 + }, + { + "epoch": 0.11, + "learning_rate": 1.964086461896057e-05, + "loss": 0.501, + "step": 653 + }, + { + "epoch": 0.11, + "learning_rate": 1.9639372620104527e-05, + "loss": 0.503, + "step": 654 + }, + { + "epoch": 0.11, + "learning_rate": 1.9637877585381672e-05, + "loss": 0.504, + "step": 655 + }, + { + "epoch": 0.11, + "learning_rate": 1.9636379515262857e-05, + "loss": 0.4844, + "step": 656 + }, + { + "epoch": 0.11, + "learning_rate": 1.9634878410219893e-05, + "loss": 0.4932, + "step": 657 + }, + { + "epoch": 0.11, + "learning_rate": 1.9633374270725546e-05, + "loss": 0.5057, + "step": 658 + }, + { + "epoch": 0.11, + "learning_rate": 1.963186709725353e-05, + "loss": 0.5004, + "step": 659 + }, + { + "epoch": 0.11, + "learning_rate": 1.9630356890278527e-05, + "loss": 0.4987, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 1.9628843650276167e-05, + "loss": 0.486, + "step": 661 + }, + { + "epoch": 0.11, + "learning_rate": 1.9627327377723035e-05, + "loss": 0.4997, + "step": 662 + }, + { + "epoch": 0.11, + "learning_rate": 1.9625808073096676e-05, + "loss": 0.5092, + "step": 663 + }, + { + "epoch": 0.12, + "learning_rate": 1.962428573687558e-05, + "loss": 0.4952, + "step": 664 + }, + { + "epoch": 0.12, + "learning_rate": 1.9622760369539206e-05, + "loss": 0.5028, + "step": 665 + }, + { + "epoch": 0.12, + "learning_rate": 1.9621231971567955e-05, + "loss": 0.5023, + "step": 666 + }, + { + "epoch": 0.12, + "learning_rate": 1.9619700543443187e-05, + "loss": 0.4897, + "step": 667 + }, + { + "epoch": 0.12, + "learning_rate": 1.961816608564722e-05, + "loss": 0.4956, + "step": 668 + }, + { + "epoch": 0.12, + "learning_rate": 1.9616628598663322e-05, + "loss": 0.5053, + "step": 669 + }, + { + "epoch": 0.12, + "learning_rate": 1.9615088082975715e-05, + "loss": 0.4903, + "step": 670 + }, + { + "epoch": 0.12, + "learning_rate": 1.9613544539069577e-05, + "loss": 0.4986, + "step": 671 + }, + { + "epoch": 0.12, + "learning_rate": 1.9611997967431037e-05, + "loss": 0.4863, + "step": 672 + }, + { + "epoch": 0.12, + "learning_rate": 1.9610448368547182e-05, + "loss": 0.4977, + "step": 673 + }, + { + "epoch": 0.12, + "learning_rate": 1.9608895742906046e-05, + "loss": 0.4801, + "step": 674 + }, + { + "epoch": 0.12, + "learning_rate": 1.960734009099662e-05, + "loss": 0.4961, + "step": 675 + }, + { + "epoch": 0.12, + "learning_rate": 1.9605781413308852e-05, + "loss": 0.4988, + "step": 676 + }, + { + "epoch": 0.12, + "learning_rate": 1.9604219710333637e-05, + "loss": 0.4955, + "step": 677 + }, + { + "epoch": 0.12, + "learning_rate": 1.9602654982562822e-05, + "loss": 0.4939, + "step": 678 + }, + { + "epoch": 0.12, + "learning_rate": 1.960108723048921e-05, + "loss": 0.5024, + "step": 679 + }, + { + "epoch": 0.12, + "learning_rate": 1.959951645460656e-05, + "loss": 0.4843, + "step": 680 + }, + { + "epoch": 0.12, + "learning_rate": 1.9597942655409574e-05, + "loss": 0.4985, + "step": 681 + }, + { + "epoch": 0.12, + "learning_rate": 1.9596365833393913e-05, + "loss": 0.4966, + "step": 682 + }, + { + "epoch": 0.12, + "learning_rate": 1.959478598905619e-05, + "loss": 0.511, + "step": 683 + }, + { + "epoch": 0.12, + "learning_rate": 1.9593203122893966e-05, + "loss": 0.4879, + "step": 684 + }, + { + "epoch": 0.12, + "learning_rate": 1.959161723540576e-05, + "loss": 0.4948, + "step": 685 + }, + { + "epoch": 0.12, + "learning_rate": 1.959002832709103e-05, + "loss": 0.4896, + "step": 686 + }, + { + "epoch": 0.12, + "learning_rate": 1.9588436398450206e-05, + "loss": 0.505, + "step": 687 + }, + { + "epoch": 0.12, + "learning_rate": 1.9586841449984643e-05, + "loss": 0.4988, + "step": 688 + }, + { + "epoch": 0.12, + "learning_rate": 1.958524348219667e-05, + "loss": 0.5032, + "step": 689 + }, + { + "epoch": 0.12, + "learning_rate": 1.958364249558956e-05, + "loss": 0.4961, + "step": 690 + }, + { + "epoch": 0.12, + "learning_rate": 1.9582038490667532e-05, + "loss": 0.4944, + "step": 691 + }, + { + "epoch": 0.12, + "learning_rate": 1.9580431467935753e-05, + "loss": 0.4867, + "step": 692 + }, + { + "epoch": 0.12, + "learning_rate": 1.957882142790035e-05, + "loss": 0.5004, + "step": 693 + }, + { + "epoch": 0.12, + "learning_rate": 1.95772083710684e-05, + "loss": 0.5042, + "step": 694 + }, + { + "epoch": 0.12, + "learning_rate": 1.9575592297947926e-05, + "loss": 0.4973, + "step": 695 + }, + { + "epoch": 0.12, + "learning_rate": 1.9573973209047893e-05, + "loss": 0.5008, + "step": 696 + }, + { + "epoch": 0.12, + "learning_rate": 1.9572351104878232e-05, + "loss": 0.4941, + "step": 697 + }, + { + "epoch": 0.12, + "learning_rate": 1.957072598594981e-05, + "loss": 0.4974, + "step": 698 + }, + { + "epoch": 0.12, + "learning_rate": 1.9569097852774456e-05, + "loss": 0.4998, + "step": 699 + }, + { + "epoch": 0.12, + "learning_rate": 1.9567466705864934e-05, + "loss": 0.4911, + "step": 700 + }, + { + "epoch": 0.12, + "learning_rate": 1.9565832545734972e-05, + "loss": 0.4998, + "step": 701 + }, + { + "epoch": 0.12, + "learning_rate": 1.9564195372899233e-05, + "loss": 0.4952, + "step": 702 + }, + { + "epoch": 0.12, + "learning_rate": 1.956255518787334e-05, + "loss": 0.5178, + "step": 703 + }, + { + "epoch": 0.12, + "learning_rate": 1.9560911991173856e-05, + "loss": 0.491, + "step": 704 + }, + { + "epoch": 0.12, + "learning_rate": 1.9559265783318304e-05, + "loss": 0.503, + "step": 705 + }, + { + "epoch": 0.12, + "learning_rate": 1.9557616564825138e-05, + "loss": 0.5068, + "step": 706 + }, + { + "epoch": 0.12, + "learning_rate": 1.955596433621378e-05, + "loss": 0.4874, + "step": 707 + }, + { + "epoch": 0.12, + "learning_rate": 1.9554309098004583e-05, + "loss": 0.5021, + "step": 708 + }, + { + "epoch": 0.12, + "learning_rate": 1.955265085071886e-05, + "loss": 0.504, + "step": 709 + }, + { + "epoch": 0.12, + "learning_rate": 1.9550989594878862e-05, + "loss": 0.4918, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 1.9549325331007795e-05, + "loss": 0.5056, + "step": 711 + }, + { + "epoch": 0.12, + "learning_rate": 1.954765805962981e-05, + "loss": 0.4862, + "step": 712 + }, + { + "epoch": 0.12, + "learning_rate": 1.9545987781270007e-05, + "loss": 0.5218, + "step": 713 + }, + { + "epoch": 0.12, + "learning_rate": 1.9544314496454423e-05, + "loss": 0.4962, + "step": 714 + }, + { + "epoch": 0.12, + "learning_rate": 1.9542638205710058e-05, + "loss": 0.5025, + "step": 715 + }, + { + "epoch": 0.12, + "learning_rate": 1.9540958909564846e-05, + "loss": 0.4878, + "step": 716 + }, + { + "epoch": 0.12, + "learning_rate": 1.9539276608547676e-05, + "loss": 0.4929, + "step": 717 + }, + { + "epoch": 0.12, + "learning_rate": 1.9537591303188375e-05, + "loss": 0.4846, + "step": 718 + }, + { + "epoch": 0.12, + "learning_rate": 1.953590299401772e-05, + "loss": 0.4969, + "step": 719 + }, + { + "epoch": 0.12, + "learning_rate": 1.953421168156744e-05, + "loss": 0.4985, + "step": 720 + }, + { + "epoch": 0.12, + "learning_rate": 1.9532517366370203e-05, + "loss": 0.4978, + "step": 721 + }, + { + "epoch": 0.13, + "learning_rate": 1.9530820048959616e-05, + "loss": 0.4921, + "step": 722 + }, + { + "epoch": 0.13, + "learning_rate": 1.9529119729870253e-05, + "loss": 0.5012, + "step": 723 + }, + { + "epoch": 0.13, + "learning_rate": 1.952741640963761e-05, + "loss": 0.4827, + "step": 724 + }, + { + "epoch": 0.13, + "learning_rate": 1.9525710088798142e-05, + "loss": 0.5087, + "step": 725 + }, + { + "epoch": 0.13, + "learning_rate": 1.9524000767889243e-05, + "loss": 0.4864, + "step": 726 + }, + { + "epoch": 0.13, + "learning_rate": 1.952228844744926e-05, + "loss": 0.4965, + "step": 727 + }, + { + "epoch": 0.13, + "learning_rate": 1.9520573128017467e-05, + "loss": 0.4973, + "step": 728 + }, + { + "epoch": 0.13, + "learning_rate": 1.951885481013411e-05, + "loss": 0.4982, + "step": 729 + }, + { + "epoch": 0.13, + "learning_rate": 1.951713349434035e-05, + "loss": 0.4904, + "step": 730 + }, + { + "epoch": 0.13, + "learning_rate": 1.9515409181178315e-05, + "loss": 0.4877, + "step": 731 + }, + { + "epoch": 0.13, + "learning_rate": 1.9513681871191063e-05, + "loss": 0.4991, + "step": 732 + }, + { + "epoch": 0.13, + "learning_rate": 1.95119515649226e-05, + "loss": 0.492, + "step": 733 + }, + { + "epoch": 0.13, + "learning_rate": 1.9510218262917883e-05, + "loss": 0.4919, + "step": 734 + }, + { + "epoch": 0.13, + "learning_rate": 1.9508481965722798e-05, + "loss": 0.4863, + "step": 735 + }, + { + "epoch": 0.13, + "learning_rate": 1.9506742673884186e-05, + "loss": 0.5002, + "step": 736 + }, + { + "epoch": 0.13, + "learning_rate": 1.9505000387949825e-05, + "loss": 0.4839, + "step": 737 + }, + { + "epoch": 0.13, + "learning_rate": 1.950325510846844e-05, + "loss": 0.4935, + "step": 738 + }, + { + "epoch": 0.13, + "learning_rate": 1.95015068359897e-05, + "loss": 0.4995, + "step": 739 + }, + { + "epoch": 0.13, + "learning_rate": 1.949975557106421e-05, + "loss": 0.4957, + "step": 740 + }, + { + "epoch": 0.13, + "learning_rate": 1.949800131424352e-05, + "loss": 0.4914, + "step": 741 + }, + { + "epoch": 0.13, + "learning_rate": 1.9496244066080122e-05, + "loss": 0.4897, + "step": 742 + }, + { + "epoch": 0.13, + "learning_rate": 1.949448382712746e-05, + "loss": 0.501, + "step": 743 + }, + { + "epoch": 0.13, + "learning_rate": 1.9492720597939902e-05, + "loss": 0.4924, + "step": 744 + }, + { + "epoch": 0.13, + "learning_rate": 1.9490954379072775e-05, + "loss": 0.4974, + "step": 745 + }, + { + "epoch": 0.13, + "learning_rate": 1.9489185171082334e-05, + "loss": 0.4968, + "step": 746 + }, + { + "epoch": 0.13, + "learning_rate": 1.9487412974525784e-05, + "loss": 0.4888, + "step": 747 + }, + { + "epoch": 0.13, + "learning_rate": 1.948563778996127e-05, + "loss": 0.4854, + "step": 748 + }, + { + "epoch": 0.13, + "learning_rate": 1.948385961794787e-05, + "loss": 0.5014, + "step": 749 + }, + { + "epoch": 0.13, + "learning_rate": 1.9482078459045617e-05, + "loss": 0.4971, + "step": 750 + }, + { + "epoch": 0.13, + "learning_rate": 1.9480294313815472e-05, + "loss": 0.4993, + "step": 751 + }, + { + "epoch": 0.13, + "learning_rate": 1.9478507182819345e-05, + "loss": 0.4901, + "step": 752 + }, + { + "epoch": 0.13, + "learning_rate": 1.9476717066620082e-05, + "loss": 0.5011, + "step": 753 + }, + { + "epoch": 0.13, + "learning_rate": 1.947492396578147e-05, + "loss": 0.5017, + "step": 754 + }, + { + "epoch": 0.13, + "learning_rate": 1.9473127880868233e-05, + "loss": 0.5016, + "step": 755 + }, + { + "epoch": 0.13, + "learning_rate": 1.9471328812446045e-05, + "loss": 0.4976, + "step": 756 + }, + { + "epoch": 0.13, + "learning_rate": 1.9469526761081504e-05, + "loss": 0.4896, + "step": 757 + }, + { + "epoch": 0.13, + "learning_rate": 1.946772172734216e-05, + "loss": 0.4892, + "step": 758 + }, + { + "epoch": 0.13, + "learning_rate": 1.9465913711796502e-05, + "loss": 0.5003, + "step": 759 + }, + { + "epoch": 0.13, + "learning_rate": 1.946410271501395e-05, + "loss": 0.4832, + "step": 760 + }, + { + "epoch": 0.13, + "learning_rate": 1.946228873756487e-05, + "loss": 0.5087, + "step": 761 + }, + { + "epoch": 0.13, + "learning_rate": 1.946047178002056e-05, + "loss": 0.4891, + "step": 762 + }, + { + "epoch": 0.13, + "learning_rate": 1.9458651842953264e-05, + "loss": 0.5023, + "step": 763 + }, + { + "epoch": 0.13, + "learning_rate": 1.945682892693616e-05, + "loss": 0.4886, + "step": 764 + }, + { + "epoch": 0.13, + "learning_rate": 1.9455003032543366e-05, + "loss": 0.5015, + "step": 765 + }, + { + "epoch": 0.13, + "learning_rate": 1.9453174160349938e-05, + "loss": 0.4845, + "step": 766 + }, + { + "epoch": 0.13, + "learning_rate": 1.9451342310931866e-05, + "loss": 0.4935, + "step": 767 + }, + { + "epoch": 0.13, + "learning_rate": 1.9449507484866084e-05, + "loss": 0.4867, + "step": 768 + }, + { + "epoch": 0.13, + "learning_rate": 1.944766968273046e-05, + "loss": 0.4937, + "step": 769 + }, + { + "epoch": 0.13, + "learning_rate": 1.9445828905103797e-05, + "loss": 0.4944, + "step": 770 + }, + { + "epoch": 0.13, + "learning_rate": 1.944398515256584e-05, + "loss": 0.4985, + "step": 771 + }, + { + "epoch": 0.13, + "learning_rate": 1.944213842569727e-05, + "loss": 0.4973, + "step": 772 + }, + { + "epoch": 0.13, + "learning_rate": 1.94402887250797e-05, + "loss": 0.4921, + "step": 773 + }, + { + "epoch": 0.13, + "learning_rate": 1.943843605129568e-05, + "loss": 0.493, + "step": 774 + }, + { + "epoch": 0.13, + "learning_rate": 1.943658040492871e-05, + "loss": 0.4867, + "step": 775 + }, + { + "epoch": 0.13, + "learning_rate": 1.9434721786563204e-05, + "loss": 0.4912, + "step": 776 + }, + { + "epoch": 0.13, + "learning_rate": 1.9432860196784533e-05, + "loss": 0.4909, + "step": 777 + }, + { + "epoch": 0.13, + "learning_rate": 1.9430995636178986e-05, + "loss": 0.4876, + "step": 778 + }, + { + "epoch": 0.13, + "learning_rate": 1.9429128105333802e-05, + "loss": 0.4934, + "step": 779 + }, + { + "epoch": 0.14, + "learning_rate": 1.9427257604837146e-05, + "loss": 0.4827, + "step": 780 + }, + { + "epoch": 0.14, + "learning_rate": 1.9425384135278126e-05, + "loss": 0.5015, + "step": 781 + }, + { + "epoch": 0.14, + "learning_rate": 1.942350769724678e-05, + "loss": 0.4936, + "step": 782 + }, + { + "epoch": 0.14, + "learning_rate": 1.9421628291334072e-05, + "loss": 0.4904, + "step": 783 + }, + { + "epoch": 0.14, + "learning_rate": 1.941974591813192e-05, + "loss": 0.4869, + "step": 784 + }, + { + "epoch": 0.14, + "learning_rate": 1.941786057823317e-05, + "loss": 0.4918, + "step": 785 + }, + { + "epoch": 0.14, + "learning_rate": 1.941597227223159e-05, + "loss": 0.5043, + "step": 786 + }, + { + "epoch": 0.14, + "learning_rate": 1.9414081000721898e-05, + "loss": 0.5034, + "step": 787 + }, + { + "epoch": 0.14, + "learning_rate": 1.9412186764299738e-05, + "loss": 0.4977, + "step": 788 + }, + { + "epoch": 0.14, + "learning_rate": 1.9410289563561685e-05, + "loss": 0.5031, + "step": 789 + }, + { + "epoch": 0.14, + "learning_rate": 1.9408389399105257e-05, + "loss": 0.4781, + "step": 790 + }, + { + "epoch": 0.14, + "learning_rate": 1.9406486271528896e-05, + "loss": 0.5038, + "step": 791 + }, + { + "epoch": 0.14, + "learning_rate": 1.940458018143199e-05, + "loss": 0.4921, + "step": 792 + }, + { + "epoch": 0.14, + "learning_rate": 1.9402671129414844e-05, + "loss": 0.4901, + "step": 793 + }, + { + "epoch": 0.14, + "learning_rate": 1.9400759116078703e-05, + "loss": 0.4883, + "step": 794 + }, + { + "epoch": 0.14, + "learning_rate": 1.9398844142025746e-05, + "loss": 0.4904, + "step": 795 + }, + { + "epoch": 0.14, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4899, + "step": 796 + }, + { + "epoch": 0.14, + "learning_rate": 1.9395005314182765e-05, + "loss": 0.5099, + "step": 797 + }, + { + "epoch": 0.14, + "learning_rate": 1.9393081461601752e-05, + "loss": 0.4862, + "step": 798 + }, + { + "epoch": 0.14, + "learning_rate": 1.939115465072196e-05, + "loss": 0.4944, + "step": 799 + }, + { + "epoch": 0.14, + "learning_rate": 1.938922488215023e-05, + "loss": 0.4799, + "step": 800 + }, + { + "epoch": 0.14, + "learning_rate": 1.9387292156494326e-05, + "loss": 0.4979, + "step": 801 + }, + { + "epoch": 0.14, + "learning_rate": 1.938535647436295e-05, + "loss": 0.4922, + "step": 802 + }, + { + "epoch": 0.14, + "learning_rate": 1.9383417836365734e-05, + "loss": 0.4974, + "step": 803 + }, + { + "epoch": 0.14, + "learning_rate": 1.9381476243113243e-05, + "loss": 0.4915, + "step": 804 + }, + { + "epoch": 0.14, + "learning_rate": 1.937953169521697e-05, + "loss": 0.4953, + "step": 805 + }, + { + "epoch": 0.14, + "learning_rate": 1.937758419328934e-05, + "loss": 0.4721, + "step": 806 + }, + { + "epoch": 0.14, + "learning_rate": 1.937563373794371e-05, + "loss": 0.4937, + "step": 807 + }, + { + "epoch": 0.14, + "learning_rate": 1.937368032979436e-05, + "loss": 0.4752, + "step": 808 + }, + { + "epoch": 0.14, + "learning_rate": 1.9371723969456512e-05, + "loss": 0.4992, + "step": 809 + }, + { + "epoch": 0.14, + "learning_rate": 1.9369764657546307e-05, + "loss": 0.5051, + "step": 810 + }, + { + "epoch": 0.14, + "learning_rate": 1.9367802394680816e-05, + "loss": 0.4984, + "step": 811 + }, + { + "epoch": 0.14, + "learning_rate": 1.9365837181478044e-05, + "loss": 0.4727, + "step": 812 + }, + { + "epoch": 0.14, + "learning_rate": 1.9363869018556928e-05, + "loss": 0.4883, + "step": 813 + }, + { + "epoch": 0.14, + "learning_rate": 1.936189790653733e-05, + "loss": 0.4926, + "step": 814 + }, + { + "epoch": 0.14, + "learning_rate": 1.9359923846040035e-05, + "loss": 0.4914, + "step": 815 + }, + { + "epoch": 0.14, + "learning_rate": 1.935794683768677e-05, + "loss": 0.5043, + "step": 816 + }, + { + "epoch": 0.14, + "learning_rate": 1.935596688210018e-05, + "loss": 0.4854, + "step": 817 + }, + { + "epoch": 0.14, + "learning_rate": 1.9353983979903836e-05, + "loss": 0.4882, + "step": 818 + }, + { + "epoch": 0.14, + "learning_rate": 1.9351998131722244e-05, + "loss": 0.4945, + "step": 819 + }, + { + "epoch": 0.14, + "learning_rate": 1.9350009338180842e-05, + "loss": 0.492, + "step": 820 + }, + { + "epoch": 0.14, + "learning_rate": 1.9348017599905984e-05, + "loss": 0.493, + "step": 821 + }, + { + "epoch": 0.14, + "learning_rate": 1.9346022917524958e-05, + "loss": 0.503, + "step": 822 + }, + { + "epoch": 0.14, + "learning_rate": 1.9344025291665978e-05, + "loss": 0.4995, + "step": 823 + }, + { + "epoch": 0.14, + "learning_rate": 1.9342024722958187e-05, + "loss": 0.497, + "step": 824 + }, + { + "epoch": 0.14, + "learning_rate": 1.9340021212031647e-05, + "loss": 0.4962, + "step": 825 + }, + { + "epoch": 0.14, + "learning_rate": 1.933801475951736e-05, + "loss": 0.4983, + "step": 826 + }, + { + "epoch": 0.14, + "learning_rate": 1.9336005366047246e-05, + "loss": 0.4809, + "step": 827 + }, + { + "epoch": 0.14, + "learning_rate": 1.933399303225415e-05, + "loss": 0.5052, + "step": 828 + }, + { + "epoch": 0.14, + "learning_rate": 1.933197775877184e-05, + "loss": 0.4871, + "step": 829 + }, + { + "epoch": 0.14, + "learning_rate": 1.9329959546235028e-05, + "loss": 0.4884, + "step": 830 + }, + { + "epoch": 0.14, + "learning_rate": 1.9327938395279325e-05, + "loss": 0.4829, + "step": 831 + }, + { + "epoch": 0.14, + "learning_rate": 1.9325914306541294e-05, + "loss": 0.5091, + "step": 832 + }, + { + "epoch": 0.14, + "learning_rate": 1.93238872806584e-05, + "loss": 0.4914, + "step": 833 + }, + { + "epoch": 0.14, + "learning_rate": 1.932185731826905e-05, + "loss": 0.4964, + "step": 834 + }, + { + "epoch": 0.14, + "learning_rate": 1.9319824420012566e-05, + "loss": 0.4823, + "step": 835 + }, + { + "epoch": 0.14, + "learning_rate": 1.93177885865292e-05, + "loss": 0.501, + "step": 836 + }, + { + "epoch": 0.14, + "learning_rate": 1.9315749818460127e-05, + "loss": 0.4884, + "step": 837 + }, + { + "epoch": 0.15, + "learning_rate": 1.9313708116447446e-05, + "loss": 0.5083, + "step": 838 + }, + { + "epoch": 0.15, + "learning_rate": 1.9311663481134174e-05, + "loss": 0.5016, + "step": 839 + }, + { + "epoch": 0.15, + "learning_rate": 1.9309615913164262e-05, + "loss": 0.5033, + "step": 840 + }, + { + "epoch": 0.15, + "learning_rate": 1.9307565413182582e-05, + "loss": 0.4805, + "step": 841 + }, + { + "epoch": 0.15, + "learning_rate": 1.9305511981834927e-05, + "loss": 0.4876, + "step": 842 + }, + { + "epoch": 0.15, + "learning_rate": 1.9303455619768006e-05, + "loss": 0.4936, + "step": 843 + }, + { + "epoch": 0.15, + "learning_rate": 1.930139632762947e-05, + "loss": 0.5003, + "step": 844 + }, + { + "epoch": 0.15, + "learning_rate": 1.9299334106067874e-05, + "loss": 0.492, + "step": 845 + }, + { + "epoch": 0.15, + "learning_rate": 1.9297268955732707e-05, + "loss": 0.4804, + "step": 846 + }, + { + "epoch": 0.15, + "learning_rate": 1.929520087727438e-05, + "loss": 0.4867, + "step": 847 + }, + { + "epoch": 0.15, + "learning_rate": 1.9293129871344215e-05, + "loss": 0.5015, + "step": 848 + }, + { + "epoch": 0.15, + "learning_rate": 1.9291055938594464e-05, + "loss": 0.4911, + "step": 849 + }, + { + "epoch": 0.15, + "learning_rate": 1.9288979079678306e-05, + "loss": 0.4982, + "step": 850 + }, + { + "epoch": 0.15, + "learning_rate": 1.928689929524983e-05, + "loss": 0.4947, + "step": 851 + }, + { + "epoch": 0.15, + "learning_rate": 1.928481658596406e-05, + "loss": 0.4926, + "step": 852 + }, + { + "epoch": 0.15, + "learning_rate": 1.9282730952476928e-05, + "loss": 0.4768, + "step": 853 + }, + { + "epoch": 0.15, + "learning_rate": 1.9280642395445298e-05, + "loss": 0.4899, + "step": 854 + }, + { + "epoch": 0.15, + "learning_rate": 1.9278550915526947e-05, + "loss": 0.4818, + "step": 855 + }, + { + "epoch": 0.15, + "learning_rate": 1.927645651338057e-05, + "loss": 0.4917, + "step": 856 + }, + { + "epoch": 0.15, + "learning_rate": 1.9274359189665792e-05, + "loss": 0.4928, + "step": 857 + }, + { + "epoch": 0.15, + "learning_rate": 1.9272258945043154e-05, + "loss": 0.4905, + "step": 858 + }, + { + "epoch": 0.15, + "learning_rate": 1.9270155780174113e-05, + "loss": 0.4839, + "step": 859 + }, + { + "epoch": 0.15, + "learning_rate": 1.9268049695721055e-05, + "loss": 0.4801, + "step": 860 + }, + { + "epoch": 0.15, + "learning_rate": 1.9265940692347276e-05, + "loss": 0.4792, + "step": 861 + }, + { + "epoch": 0.15, + "learning_rate": 1.9263828770716993e-05, + "loss": 0.4887, + "step": 862 + }, + { + "epoch": 0.15, + "learning_rate": 1.9261713931495344e-05, + "loss": 0.4897, + "step": 863 + }, + { + "epoch": 0.15, + "learning_rate": 1.925959617534839e-05, + "loss": 0.4902, + "step": 864 + }, + { + "epoch": 0.15, + "learning_rate": 1.92574755029431e-05, + "loss": 0.4804, + "step": 865 + }, + { + "epoch": 0.15, + "learning_rate": 1.925535191494738e-05, + "loss": 0.5123, + "step": 866 + }, + { + "epoch": 0.15, + "learning_rate": 1.9253225412030028e-05, + "loss": 0.4983, + "step": 867 + }, + { + "epoch": 0.15, + "learning_rate": 1.9251095994860782e-05, + "loss": 0.5005, + "step": 868 + }, + { + "epoch": 0.15, + "learning_rate": 1.924896366411029e-05, + "loss": 0.4775, + "step": 869 + }, + { + "epoch": 0.15, + "learning_rate": 1.9246828420450113e-05, + "loss": 0.4955, + "step": 870 + }, + { + "epoch": 0.15, + "learning_rate": 1.9244690264552745e-05, + "loss": 0.4846, + "step": 871 + }, + { + "epoch": 0.15, + "learning_rate": 1.924254919709157e-05, + "loss": 0.4919, + "step": 872 + }, + { + "epoch": 0.15, + "learning_rate": 1.924040521874092e-05, + "loss": 0.4945, + "step": 873 + }, + { + "epoch": 0.15, + "learning_rate": 1.923825833017602e-05, + "loss": 0.4845, + "step": 874 + }, + { + "epoch": 0.15, + "learning_rate": 1.9236108532073025e-05, + "loss": 0.4783, + "step": 875 + }, + { + "epoch": 0.15, + "learning_rate": 1.9233955825109e-05, + "loss": 0.4993, + "step": 876 + }, + { + "epoch": 0.15, + "learning_rate": 1.9231800209961932e-05, + "loss": 0.4976, + "step": 877 + }, + { + "epoch": 0.15, + "learning_rate": 1.9229641687310714e-05, + "loss": 0.4995, + "step": 878 + }, + { + "epoch": 0.15, + "learning_rate": 1.9227480257835163e-05, + "loss": 0.4897, + "step": 879 + }, + { + "epoch": 0.15, + "learning_rate": 1.922531592221601e-05, + "loss": 0.4904, + "step": 880 + }, + { + "epoch": 0.15, + "learning_rate": 1.92231486811349e-05, + "loss": 0.4959, + "step": 881 + }, + { + "epoch": 0.15, + "learning_rate": 1.9220978535274398e-05, + "loss": 0.4898, + "step": 882 + }, + { + "epoch": 0.15, + "learning_rate": 1.9218805485317973e-05, + "loss": 0.4901, + "step": 883 + }, + { + "epoch": 0.15, + "learning_rate": 1.9216629531950014e-05, + "loss": 0.4828, + "step": 884 + }, + { + "epoch": 0.15, + "learning_rate": 1.9214450675855832e-05, + "loss": 0.4791, + "step": 885 + }, + { + "epoch": 0.15, + "learning_rate": 1.9212268917721643e-05, + "loss": 0.4885, + "step": 886 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210084258234576e-05, + "loss": 0.4964, + "step": 887 + }, + { + "epoch": 0.15, + "learning_rate": 1.920789669808268e-05, + "loss": 0.5053, + "step": 888 + }, + { + "epoch": 0.15, + "learning_rate": 1.9205706237954914e-05, + "loss": 0.491, + "step": 889 + }, + { + "epoch": 0.15, + "learning_rate": 1.9203512878541156e-05, + "loss": 0.4959, + "step": 890 + }, + { + "epoch": 0.15, + "learning_rate": 1.9201316620532186e-05, + "loss": 0.4881, + "step": 891 + }, + { + "epoch": 0.15, + "learning_rate": 1.919911746461971e-05, + "loss": 0.4935, + "step": 892 + }, + { + "epoch": 0.15, + "learning_rate": 1.919691541149633e-05, + "loss": 0.4887, + "step": 893 + }, + { + "epoch": 0.15, + "learning_rate": 1.919471046185558e-05, + "loss": 0.4885, + "step": 894 + }, + { + "epoch": 0.16, + "learning_rate": 1.919250261639189e-05, + "loss": 0.4915, + "step": 895 + }, + { + "epoch": 0.16, + "learning_rate": 1.9190291875800616e-05, + "loss": 0.4944, + "step": 896 + }, + { + "epoch": 0.16, + "learning_rate": 1.918807824077801e-05, + "loss": 0.4916, + "step": 897 + }, + { + "epoch": 0.16, + "learning_rate": 1.918586171202125e-05, + "loss": 0.4934, + "step": 898 + }, + { + "epoch": 0.16, + "learning_rate": 1.9183642290228415e-05, + "loss": 0.4836, + "step": 899 + }, + { + "epoch": 0.16, + "learning_rate": 1.9181419976098503e-05, + "loss": 0.5045, + "step": 900 + }, + { + "epoch": 0.16, + "learning_rate": 1.9179194770331418e-05, + "loss": 0.4787, + "step": 901 + }, + { + "epoch": 0.16, + "learning_rate": 1.917696667362798e-05, + "loss": 0.4937, + "step": 902 + }, + { + "epoch": 0.16, + "learning_rate": 1.917473568668991e-05, + "loss": 0.4817, + "step": 903 + }, + { + "epoch": 0.16, + "learning_rate": 1.9172501810219844e-05, + "loss": 0.4836, + "step": 904 + }, + { + "epoch": 0.16, + "learning_rate": 1.9170265044921338e-05, + "loss": 0.472, + "step": 905 + }, + { + "epoch": 0.16, + "learning_rate": 1.9168025391498837e-05, + "loss": 0.5059, + "step": 906 + }, + { + "epoch": 0.16, + "learning_rate": 1.9165782850657716e-05, + "loss": 0.4813, + "step": 907 + }, + { + "epoch": 0.16, + "learning_rate": 1.916353742310425e-05, + "loss": 0.496, + "step": 908 + }, + { + "epoch": 0.16, + "learning_rate": 1.916128910954562e-05, + "loss": 0.4805, + "step": 909 + }, + { + "epoch": 0.16, + "learning_rate": 1.9159037910689925e-05, + "loss": 0.4943, + "step": 910 + }, + { + "epoch": 0.16, + "learning_rate": 1.915678382724616e-05, + "loss": 0.4927, + "step": 911 + }, + { + "epoch": 0.16, + "learning_rate": 1.9154526859924242e-05, + "loss": 0.4999, + "step": 912 + }, + { + "epoch": 0.16, + "learning_rate": 1.915226700943499e-05, + "loss": 0.487, + "step": 913 + }, + { + "epoch": 0.16, + "learning_rate": 1.915000427649013e-05, + "loss": 0.4971, + "step": 914 + }, + { + "epoch": 0.16, + "learning_rate": 1.9147738661802295e-05, + "loss": 0.4923, + "step": 915 + }, + { + "epoch": 0.16, + "learning_rate": 1.9145470166085034e-05, + "loss": 0.4784, + "step": 916 + }, + { + "epoch": 0.16, + "learning_rate": 1.9143198790052788e-05, + "loss": 0.4825, + "step": 917 + }, + { + "epoch": 0.16, + "learning_rate": 1.9140924534420924e-05, + "loss": 0.4898, + "step": 918 + }, + { + "epoch": 0.16, + "learning_rate": 1.91386473999057e-05, + "loss": 0.4846, + "step": 919 + }, + { + "epoch": 0.16, + "learning_rate": 1.9136367387224288e-05, + "loss": 0.4944, + "step": 920 + }, + { + "epoch": 0.16, + "learning_rate": 1.9134084497094766e-05, + "loss": 0.4891, + "step": 921 + }, + { + "epoch": 0.16, + "learning_rate": 1.9131798730236116e-05, + "loss": 0.4883, + "step": 922 + }, + { + "epoch": 0.16, + "learning_rate": 1.9129510087368234e-05, + "loss": 0.488, + "step": 923 + }, + { + "epoch": 0.16, + "learning_rate": 1.9127218569211905e-05, + "loss": 0.4985, + "step": 924 + }, + { + "epoch": 0.16, + "learning_rate": 1.9124924176488838e-05, + "loss": 0.4915, + "step": 925 + }, + { + "epoch": 0.16, + "learning_rate": 1.9122626909921637e-05, + "loss": 0.4934, + "step": 926 + }, + { + "epoch": 0.16, + "learning_rate": 1.912032677023381e-05, + "loss": 0.4974, + "step": 927 + }, + { + "epoch": 0.16, + "learning_rate": 1.9118023758149777e-05, + "loss": 0.4957, + "step": 928 + }, + { + "epoch": 0.16, + "learning_rate": 1.9115717874394856e-05, + "loss": 0.4833, + "step": 929 + }, + { + "epoch": 0.16, + "learning_rate": 1.9113409119695276e-05, + "loss": 0.4893, + "step": 930 + }, + { + "epoch": 0.16, + "learning_rate": 1.9111097494778164e-05, + "loss": 0.4852, + "step": 931 + }, + { + "epoch": 0.16, + "learning_rate": 1.9108783000371555e-05, + "loss": 0.488, + "step": 932 + }, + { + "epoch": 0.16, + "learning_rate": 1.910646563720439e-05, + "loss": 0.4887, + "step": 933 + }, + { + "epoch": 0.16, + "learning_rate": 1.9104145406006495e-05, + "loss": 0.4851, + "step": 934 + }, + { + "epoch": 0.16, + "learning_rate": 1.9101822307508628e-05, + "loss": 0.4894, + "step": 935 + }, + { + "epoch": 0.16, + "learning_rate": 1.9099496342442432e-05, + "loss": 0.4945, + "step": 936 + }, + { + "epoch": 0.16, + "learning_rate": 1.9097167511540453e-05, + "loss": 0.4975, + "step": 937 + }, + { + "epoch": 0.16, + "learning_rate": 1.909483581553615e-05, + "loss": 0.4941, + "step": 938 + }, + { + "epoch": 0.16, + "learning_rate": 1.9092501255163874e-05, + "loss": 0.4986, + "step": 939 + }, + { + "epoch": 0.16, + "learning_rate": 1.9090163831158883e-05, + "loss": 0.4856, + "step": 940 + }, + { + "epoch": 0.16, + "learning_rate": 1.9087823544257334e-05, + "loss": 0.4816, + "step": 941 + }, + { + "epoch": 0.16, + "learning_rate": 1.9085480395196287e-05, + "loss": 0.5001, + "step": 942 + }, + { + "epoch": 0.16, + "learning_rate": 1.9083134384713708e-05, + "loss": 0.4829, + "step": 943 + }, + { + "epoch": 0.16, + "learning_rate": 1.9080785513548454e-05, + "loss": 0.5031, + "step": 944 + }, + { + "epoch": 0.16, + "learning_rate": 1.9078433782440292e-05, + "loss": 0.4799, + "step": 945 + }, + { + "epoch": 0.16, + "learning_rate": 1.9076079192129886e-05, + "loss": 0.5045, + "step": 946 + }, + { + "epoch": 0.16, + "learning_rate": 1.9073721743358805e-05, + "loss": 0.4859, + "step": 947 + }, + { + "epoch": 0.16, + "learning_rate": 1.907136143686951e-05, + "loss": 0.4885, + "step": 948 + }, + { + "epoch": 0.16, + "learning_rate": 1.9068998273405364e-05, + "loss": 0.5062, + "step": 949 + }, + { + "epoch": 0.16, + "learning_rate": 1.9066632253710636e-05, + "loss": 0.4889, + "step": 950 + }, + { + "epoch": 0.16, + "learning_rate": 1.9064263378530495e-05, + "loss": 0.4893, + "step": 951 + }, + { + "epoch": 0.16, + "learning_rate": 1.9061891648610997e-05, + "loss": 0.492, + "step": 952 + }, + { + "epoch": 0.17, + "learning_rate": 1.905951706469911e-05, + "loss": 0.4924, + "step": 953 + }, + { + "epoch": 0.17, + "learning_rate": 1.9057139627542693e-05, + "loss": 0.5003, + "step": 954 + }, + { + "epoch": 0.17, + "learning_rate": 1.905475933789051e-05, + "loss": 0.4777, + "step": 955 + }, + { + "epoch": 0.17, + "learning_rate": 1.9052376196492218e-05, + "loss": 0.4971, + "step": 956 + }, + { + "epoch": 0.17, + "learning_rate": 1.904999020409837e-05, + "loss": 0.4724, + "step": 957 + }, + { + "epoch": 0.17, + "learning_rate": 1.904760136146043e-05, + "loss": 0.4922, + "step": 958 + }, + { + "epoch": 0.17, + "learning_rate": 1.9045209669330747e-05, + "loss": 0.4811, + "step": 959 + }, + { + "epoch": 0.17, + "learning_rate": 1.904281512846257e-05, + "loss": 0.4858, + "step": 960 + }, + { + "epoch": 0.17, + "learning_rate": 1.904041773961004e-05, + "loss": 0.4897, + "step": 961 + }, + { + "epoch": 0.17, + "learning_rate": 1.9038017503528215e-05, + "loss": 0.4933, + "step": 962 + }, + { + "epoch": 0.17, + "learning_rate": 1.9035614420973026e-05, + "loss": 0.4866, + "step": 963 + }, + { + "epoch": 0.17, + "learning_rate": 1.9033208492701316e-05, + "loss": 0.4981, + "step": 964 + }, + { + "epoch": 0.17, + "learning_rate": 1.903079971947081e-05, + "loss": 0.4943, + "step": 965 + }, + { + "epoch": 0.17, + "learning_rate": 1.902838810204015e-05, + "loss": 0.4872, + "step": 966 + }, + { + "epoch": 0.17, + "learning_rate": 1.9025973641168854e-05, + "loss": 0.4959, + "step": 967 + }, + { + "epoch": 0.17, + "learning_rate": 1.9023556337617343e-05, + "loss": 0.4882, + "step": 968 + }, + { + "epoch": 0.17, + "learning_rate": 1.9021136192146936e-05, + "loss": 0.4891, + "step": 969 + }, + { + "epoch": 0.17, + "learning_rate": 1.901871320551984e-05, + "loss": 0.4902, + "step": 970 + }, + { + "epoch": 0.17, + "learning_rate": 1.9016287378499167e-05, + "loss": 0.4823, + "step": 971 + }, + { + "epoch": 0.17, + "learning_rate": 1.9013858711848914e-05, + "loss": 0.4887, + "step": 972 + }, + { + "epoch": 0.17, + "learning_rate": 1.9011427206333976e-05, + "loss": 0.4975, + "step": 973 + }, + { + "epoch": 0.17, + "learning_rate": 1.9008992862720145e-05, + "loss": 0.4885, + "step": 974 + }, + { + "epoch": 0.17, + "learning_rate": 1.90065556817741e-05, + "loss": 0.4819, + "step": 975 + }, + { + "epoch": 0.17, + "learning_rate": 1.900411566426342e-05, + "loss": 0.4903, + "step": 976 + }, + { + "epoch": 0.17, + "learning_rate": 1.9001672810956575e-05, + "loss": 0.4949, + "step": 977 + }, + { + "epoch": 0.17, + "learning_rate": 1.899922712262293e-05, + "loss": 0.4802, + "step": 978 + }, + { + "epoch": 0.17, + "learning_rate": 1.8996778600032736e-05, + "loss": 0.488, + "step": 979 + }, + { + "epoch": 0.17, + "learning_rate": 1.8994327243957143e-05, + "loss": 0.482, + "step": 980 + }, + { + "epoch": 0.17, + "learning_rate": 1.8991873055168194e-05, + "loss": 0.4865, + "step": 981 + }, + { + "epoch": 0.17, + "learning_rate": 1.8989416034438823e-05, + "loss": 0.4803, + "step": 982 + }, + { + "epoch": 0.17, + "learning_rate": 1.8986956182542853e-05, + "loss": 0.502, + "step": 983 + }, + { + "epoch": 0.17, + "learning_rate": 1.8984493500255e-05, + "loss": 0.4831, + "step": 984 + }, + { + "epoch": 0.17, + "learning_rate": 1.8982027988350877e-05, + "loss": 0.4882, + "step": 985 + }, + { + "epoch": 0.17, + "learning_rate": 1.8979559647606973e-05, + "loss": 0.4845, + "step": 986 + }, + { + "epoch": 0.17, + "learning_rate": 1.8977088478800687e-05, + "loss": 0.4836, + "step": 987 + }, + { + "epoch": 0.17, + "learning_rate": 1.89746144827103e-05, + "loss": 0.4937, + "step": 988 + }, + { + "epoch": 0.17, + "learning_rate": 1.8972137660114977e-05, + "loss": 0.5059, + "step": 989 + }, + { + "epoch": 0.17, + "learning_rate": 1.8969658011794785e-05, + "loss": 0.4897, + "step": 990 + }, + { + "epoch": 0.17, + "learning_rate": 1.8967175538530675e-05, + "loss": 0.4968, + "step": 991 + }, + { + "epoch": 0.17, + "learning_rate": 1.8964690241104484e-05, + "loss": 0.5004, + "step": 992 + }, + { + "epoch": 0.17, + "learning_rate": 1.8962202120298948e-05, + "loss": 0.4861, + "step": 993 + }, + { + "epoch": 0.17, + "learning_rate": 1.8959711176897682e-05, + "loss": 0.4945, + "step": 994 + }, + { + "epoch": 0.17, + "learning_rate": 1.8957217411685197e-05, + "loss": 0.4937, + "step": 995 + }, + { + "epoch": 0.17, + "learning_rate": 1.8954720825446893e-05, + "loss": 0.4894, + "step": 996 + }, + { + "epoch": 0.17, + "learning_rate": 1.895222141896905e-05, + "loss": 0.4847, + "step": 997 + }, + { + "epoch": 0.17, + "learning_rate": 1.8949719193038847e-05, + "loss": 0.4885, + "step": 998 + }, + { + "epoch": 0.17, + "learning_rate": 1.8947214148444346e-05, + "loss": 0.4816, + "step": 999 + }, + { + "epoch": 0.17, + "learning_rate": 1.8944706285974496e-05, + "loss": 0.4887, + "step": 1000 + }, + { + "epoch": 0.17, + "learning_rate": 1.8942195606419133e-05, + "loss": 0.492, + "step": 1001 + }, + { + "epoch": 0.17, + "learning_rate": 1.8939682110568982e-05, + "loss": 0.493, + "step": 1002 + }, + { + "epoch": 0.17, + "learning_rate": 1.8937165799215657e-05, + "loss": 0.5016, + "step": 1003 + }, + { + "epoch": 0.17, + "learning_rate": 1.8934646673151655e-05, + "loss": 0.4833, + "step": 1004 + }, + { + "epoch": 0.17, + "learning_rate": 1.8932124733170357e-05, + "loss": 0.5154, + "step": 1005 + }, + { + "epoch": 0.17, + "learning_rate": 1.8929599980066034e-05, + "loss": 0.4956, + "step": 1006 + }, + { + "epoch": 0.17, + "learning_rate": 1.892707241463385e-05, + "loss": 0.5025, + "step": 1007 + }, + { + "epoch": 0.17, + "learning_rate": 1.8924542037669845e-05, + "loss": 0.4741, + "step": 1008 + }, + { + "epoch": 0.17, + "learning_rate": 1.8922008849970947e-05, + "loss": 0.49, + "step": 1009 + }, + { + "epoch": 0.17, + "learning_rate": 1.8919472852334964e-05, + "loss": 0.4839, + "step": 1010 + }, + { + "epoch": 0.18, + "learning_rate": 1.8916934045560603e-05, + "loss": 0.4931, + "step": 1011 + }, + { + "epoch": 0.18, + "learning_rate": 1.891439243044744e-05, + "loss": 0.4864, + "step": 1012 + }, + { + "epoch": 0.18, + "learning_rate": 1.8911848007795944e-05, + "loss": 0.497, + "step": 1013 + }, + { + "epoch": 0.18, + "learning_rate": 1.890930077840747e-05, + "loss": 0.4826, + "step": 1014 + }, + { + "epoch": 0.18, + "learning_rate": 1.890675074308425e-05, + "loss": 0.4936, + "step": 1015 + }, + { + "epoch": 0.18, + "learning_rate": 1.8904197902629408e-05, + "loss": 0.4765, + "step": 1016 + }, + { + "epoch": 0.18, + "learning_rate": 1.8901642257846943e-05, + "loss": 0.4886, + "step": 1017 + }, + { + "epoch": 0.18, + "learning_rate": 1.889908380954174e-05, + "loss": 0.4853, + "step": 1018 + }, + { + "epoch": 0.18, + "learning_rate": 1.8896522558519574e-05, + "loss": 0.5038, + "step": 1019 + }, + { + "epoch": 0.18, + "learning_rate": 1.8893958505587093e-05, + "loss": 0.4756, + "step": 1020 + }, + { + "epoch": 0.18, + "learning_rate": 1.8891391651551826e-05, + "loss": 0.4978, + "step": 1021 + }, + { + "epoch": 0.18, + "learning_rate": 1.88888219972222e-05, + "loss": 0.4758, + "step": 1022 + }, + { + "epoch": 0.18, + "learning_rate": 1.8886249543407505e-05, + "loss": 0.5042, + "step": 1023 + }, + { + "epoch": 0.18, + "learning_rate": 1.8883674290917927e-05, + "loss": 0.4856, + "step": 1024 + }, + { + "epoch": 0.18, + "learning_rate": 1.8881096240564523e-05, + "loss": 0.4897, + "step": 1025 + }, + { + "epoch": 0.18, + "learning_rate": 1.8878515393159236e-05, + "loss": 0.4896, + "step": 1026 + }, + { + "epoch": 0.18, + "learning_rate": 1.8875931749514893e-05, + "loss": 0.4969, + "step": 1027 + }, + { + "epoch": 0.18, + "learning_rate": 1.8873345310445193e-05, + "loss": 0.4821, + "step": 1028 + }, + { + "epoch": 0.18, + "learning_rate": 1.8870756076764728e-05, + "loss": 0.4879, + "step": 1029 + }, + { + "epoch": 0.18, + "learning_rate": 1.8868164049288954e-05, + "loss": 0.4839, + "step": 1030 + }, + { + "epoch": 0.18, + "learning_rate": 1.886556922883422e-05, + "loss": 0.5094, + "step": 1031 + }, + { + "epoch": 0.18, + "learning_rate": 1.8862971616217753e-05, + "loss": 0.487, + "step": 1032 + }, + { + "epoch": 0.18, + "learning_rate": 1.8860371212257648e-05, + "loss": 0.485, + "step": 1033 + }, + { + "epoch": 0.18, + "learning_rate": 1.88577680177729e-05, + "loss": 0.4799, + "step": 1034 + }, + { + "epoch": 0.18, + "learning_rate": 1.885516203358336e-05, + "loss": 0.4906, + "step": 1035 + }, + { + "epoch": 0.18, + "learning_rate": 1.8852553260509775e-05, + "loss": 0.4869, + "step": 1036 + }, + { + "epoch": 0.18, + "learning_rate": 1.884994169937376e-05, + "loss": 0.4998, + "step": 1037 + }, + { + "epoch": 0.18, + "learning_rate": 1.8847327350997814e-05, + "loss": 0.4949, + "step": 1038 + }, + { + "epoch": 0.18, + "learning_rate": 1.8844710216205306e-05, + "loss": 0.4873, + "step": 1039 + }, + { + "epoch": 0.18, + "learning_rate": 1.8842090295820497e-05, + "loss": 0.4882, + "step": 1040 + }, + { + "epoch": 0.18, + "learning_rate": 1.8839467590668507e-05, + "loss": 0.4983, + "step": 1041 + }, + { + "epoch": 0.18, + "learning_rate": 1.883684210157535e-05, + "loss": 0.4784, + "step": 1042 + }, + { + "epoch": 0.18, + "learning_rate": 1.8834213829367908e-05, + "loss": 0.4906, + "step": 1043 + }, + { + "epoch": 0.18, + "learning_rate": 1.8831582774873935e-05, + "loss": 0.4817, + "step": 1044 + }, + { + "epoch": 0.18, + "learning_rate": 1.8828948938922073e-05, + "loss": 0.4962, + "step": 1045 + }, + { + "epoch": 0.18, + "learning_rate": 1.882631232234183e-05, + "loss": 0.4835, + "step": 1046 + }, + { + "epoch": 0.18, + "learning_rate": 1.8823672925963598e-05, + "loss": 0.5017, + "step": 1047 + }, + { + "epoch": 0.18, + "learning_rate": 1.8821030750618633e-05, + "loss": 0.4919, + "step": 1048 + }, + { + "epoch": 0.18, + "learning_rate": 1.8818385797139083e-05, + "loss": 0.4901, + "step": 1049 + }, + { + "epoch": 0.18, + "learning_rate": 1.8815738066357954e-05, + "loss": 0.4795, + "step": 1050 + }, + { + "epoch": 0.18, + "learning_rate": 1.8813087559109137e-05, + "loss": 0.4837, + "step": 1051 + }, + { + "epoch": 0.18, + "learning_rate": 1.8810434276227397e-05, + "loss": 0.4824, + "step": 1052 + }, + { + "epoch": 0.18, + "learning_rate": 1.8807778218548364e-05, + "loss": 0.4876, + "step": 1053 + }, + { + "epoch": 0.18, + "learning_rate": 1.8805119386908556e-05, + "loss": 0.4757, + "step": 1054 + }, + { + "epoch": 0.18, + "learning_rate": 1.8802457782145352e-05, + "loss": 0.4832, + "step": 1055 + }, + { + "epoch": 0.18, + "learning_rate": 1.879979340509701e-05, + "loss": 0.4731, + "step": 1056 + }, + { + "epoch": 0.18, + "learning_rate": 1.8797126256602666e-05, + "loss": 0.5019, + "step": 1057 + }, + { + "epoch": 0.18, + "learning_rate": 1.8794456337502318e-05, + "loss": 0.4823, + "step": 1058 + }, + { + "epoch": 0.18, + "learning_rate": 1.8791783648636844e-05, + "loss": 0.494, + "step": 1059 + }, + { + "epoch": 0.18, + "learning_rate": 1.878910819084799e-05, + "loss": 0.4827, + "step": 1060 + }, + { + "epoch": 0.18, + "learning_rate": 1.878642996497838e-05, + "loss": 0.4991, + "step": 1061 + }, + { + "epoch": 0.18, + "learning_rate": 1.8783748971871508e-05, + "loss": 0.4858, + "step": 1062 + }, + { + "epoch": 0.18, + "learning_rate": 1.8781065212371732e-05, + "loss": 0.5024, + "step": 1063 + }, + { + "epoch": 0.18, + "learning_rate": 1.877837868732429e-05, + "loss": 0.492, + "step": 1064 + }, + { + "epoch": 0.18, + "learning_rate": 1.877568939757529e-05, + "loss": 0.5046, + "step": 1065 + }, + { + "epoch": 0.18, + "learning_rate": 1.8772997343971708e-05, + "loss": 0.4811, + "step": 1066 + }, + { + "epoch": 0.18, + "learning_rate": 1.877030252736139e-05, + "loss": 0.4876, + "step": 1067 + }, + { + "epoch": 0.19, + "learning_rate": 1.8767604948593052e-05, + "loss": 0.4762, + "step": 1068 + }, + { + "epoch": 0.19, + "learning_rate": 1.8764904608516287e-05, + "loss": 0.4956, + "step": 1069 + }, + { + "epoch": 0.19, + "learning_rate": 1.8762201507981546e-05, + "loss": 0.4916, + "step": 1070 + }, + { + "epoch": 0.19, + "learning_rate": 1.8759495647840158e-05, + "loss": 0.4886, + "step": 1071 + }, + { + "epoch": 0.19, + "learning_rate": 1.875678702894432e-05, + "loss": 0.5012, + "step": 1072 + }, + { + "epoch": 0.19, + "learning_rate": 1.8754075652147094e-05, + "loss": 0.4869, + "step": 1073 + }, + { + "epoch": 0.19, + "learning_rate": 1.8751361518302413e-05, + "loss": 0.4958, + "step": 1074 + }, + { + "epoch": 0.19, + "learning_rate": 1.8748644628265085e-05, + "loss": 0.4942, + "step": 1075 + }, + { + "epoch": 0.19, + "learning_rate": 1.874592498289077e-05, + "loss": 0.4724, + "step": 1076 + }, + { + "epoch": 0.19, + "learning_rate": 1.874320258303601e-05, + "loss": 0.4914, + "step": 1077 + }, + { + "epoch": 0.19, + "learning_rate": 1.8740477429558205e-05, + "loss": 0.4907, + "step": 1078 + }, + { + "epoch": 0.19, + "learning_rate": 1.8737749523315636e-05, + "loss": 0.4783, + "step": 1079 + }, + { + "epoch": 0.19, + "learning_rate": 1.8735018865167433e-05, + "loss": 0.4902, + "step": 1080 + }, + { + "epoch": 0.19, + "learning_rate": 1.873228545597361e-05, + "loss": 0.4966, + "step": 1081 + }, + { + "epoch": 0.19, + "learning_rate": 1.872954929659503e-05, + "loss": 0.4806, + "step": 1082 + }, + { + "epoch": 0.19, + "learning_rate": 1.8726810387893438e-05, + "loss": 0.4954, + "step": 1083 + }, + { + "epoch": 0.19, + "learning_rate": 1.8724068730731436e-05, + "loss": 0.4813, + "step": 1084 + }, + { + "epoch": 0.19, + "learning_rate": 1.872132432597249e-05, + "loss": 0.4887, + "step": 1085 + }, + { + "epoch": 0.19, + "learning_rate": 1.8718577174480938e-05, + "loss": 0.4834, + "step": 1086 + }, + { + "epoch": 0.19, + "learning_rate": 1.8715827277121982e-05, + "loss": 0.4913, + "step": 1087 + }, + { + "epoch": 0.19, + "learning_rate": 1.8713074634761687e-05, + "loss": 0.4868, + "step": 1088 + }, + { + "epoch": 0.19, + "learning_rate": 1.8710319248266978e-05, + "loss": 0.4912, + "step": 1089 + }, + { + "epoch": 0.19, + "learning_rate": 1.8707561118505656e-05, + "loss": 0.4802, + "step": 1090 + }, + { + "epoch": 0.19, + "learning_rate": 1.8704800246346367e-05, + "loss": 0.4976, + "step": 1091 + }, + { + "epoch": 0.19, + "learning_rate": 1.8702036632658646e-05, + "loss": 0.4892, + "step": 1092 + }, + { + "epoch": 0.19, + "learning_rate": 1.869927027831287e-05, + "loss": 0.4902, + "step": 1093 + }, + { + "epoch": 0.19, + "learning_rate": 1.8696501184180283e-05, + "loss": 0.4751, + "step": 1094 + }, + { + "epoch": 0.19, + "learning_rate": 1.8693729351133005e-05, + "loss": 0.4891, + "step": 1095 + }, + { + "epoch": 0.19, + "learning_rate": 1.8690954780044004e-05, + "loss": 0.4905, + "step": 1096 + }, + { + "epoch": 0.19, + "learning_rate": 1.8688177471787118e-05, + "loss": 0.4763, + "step": 1097 + }, + { + "epoch": 0.19, + "learning_rate": 1.8685397427237043e-05, + "loss": 0.4752, + "step": 1098 + }, + { + "epoch": 0.19, + "learning_rate": 1.868261464726934e-05, + "loss": 0.4936, + "step": 1099 + }, + { + "epoch": 0.19, + "learning_rate": 1.8679829132760427e-05, + "loss": 0.487, + "step": 1100 + }, + { + "epoch": 0.19, + "learning_rate": 1.867704088458759e-05, + "loss": 0.4899, + "step": 1101 + }, + { + "epoch": 0.19, + "learning_rate": 1.867424990362897e-05, + "loss": 0.5005, + "step": 1102 + }, + { + "epoch": 0.19, + "learning_rate": 1.8671456190763572e-05, + "loss": 0.5067, + "step": 1103 + }, + { + "epoch": 0.19, + "learning_rate": 1.866865974687126e-05, + "loss": 0.4883, + "step": 1104 + }, + { + "epoch": 0.19, + "learning_rate": 1.866586057283276e-05, + "loss": 0.4883, + "step": 1105 + }, + { + "epoch": 0.19, + "learning_rate": 1.8663058669529654e-05, + "loss": 0.482, + "step": 1106 + }, + { + "epoch": 0.19, + "learning_rate": 1.866025403784439e-05, + "loss": 0.5132, + "step": 1107 + }, + { + "epoch": 0.19, + "learning_rate": 1.8657446678660264e-05, + "loss": 0.4922, + "step": 1108 + }, + { + "epoch": 0.19, + "learning_rate": 1.865463659286144e-05, + "loss": 0.4843, + "step": 1109 + }, + { + "epoch": 0.19, + "learning_rate": 1.8651823781332948e-05, + "loss": 0.4855, + "step": 1110 + }, + { + "epoch": 0.19, + "learning_rate": 1.8649008244960657e-05, + "loss": 0.4881, + "step": 1111 + }, + { + "epoch": 0.19, + "learning_rate": 1.8646189984631306e-05, + "loss": 0.4867, + "step": 1112 + }, + { + "epoch": 0.19, + "learning_rate": 1.8643369001232498e-05, + "loss": 0.4921, + "step": 1113 + }, + { + "epoch": 0.19, + "learning_rate": 1.864054529565267e-05, + "loss": 0.4784, + "step": 1114 + }, + { + "epoch": 0.19, + "learning_rate": 1.8637718868781154e-05, + "loss": 0.4994, + "step": 1115 + }, + { + "epoch": 0.19, + "learning_rate": 1.86348897215081e-05, + "loss": 0.4842, + "step": 1116 + }, + { + "epoch": 0.19, + "learning_rate": 1.863205785472454e-05, + "loss": 0.4868, + "step": 1117 + }, + { + "epoch": 0.19, + "learning_rate": 1.8629223269322353e-05, + "loss": 0.4885, + "step": 1118 + }, + { + "epoch": 0.19, + "learning_rate": 1.8626385966194275e-05, + "loss": 0.488, + "step": 1119 + }, + { + "epoch": 0.19, + "learning_rate": 1.86235459462339e-05, + "loss": 0.4855, + "step": 1120 + }, + { + "epoch": 0.19, + "learning_rate": 1.862070321033568e-05, + "loss": 0.4894, + "step": 1121 + }, + { + "epoch": 0.19, + "learning_rate": 1.8617857759394913e-05, + "loss": 0.478, + "step": 1122 + }, + { + "epoch": 0.19, + "learning_rate": 1.8615009594307757e-05, + "loss": 0.4825, + "step": 1123 + }, + { + "epoch": 0.19, + "learning_rate": 1.861215871597123e-05, + "loss": 0.4743, + "step": 1124 + }, + { + "epoch": 0.19, + "learning_rate": 1.8609305125283202e-05, + "loss": 0.4972, + "step": 1125 + }, + { + "epoch": 0.2, + "learning_rate": 1.860644882314239e-05, + "loss": 0.4808, + "step": 1126 + }, + { + "epoch": 0.2, + "learning_rate": 1.8603589810448377e-05, + "loss": 0.4905, + "step": 1127 + }, + { + "epoch": 0.2, + "learning_rate": 1.8600728088101587e-05, + "loss": 0.4849, + "step": 1128 + }, + { + "epoch": 0.2, + "learning_rate": 1.8597863657003303e-05, + "loss": 0.4904, + "step": 1129 + }, + { + "epoch": 0.2, + "learning_rate": 1.859499651805567e-05, + "loss": 0.4808, + "step": 1130 + }, + { + "epoch": 0.2, + "learning_rate": 1.859212667216167e-05, + "loss": 0.4904, + "step": 1131 + }, + { + "epoch": 0.2, + "learning_rate": 1.8589254120225145e-05, + "loss": 0.4728, + "step": 1132 + }, + { + "epoch": 0.2, + "learning_rate": 1.858637886315079e-05, + "loss": 0.4987, + "step": 1133 + }, + { + "epoch": 0.2, + "learning_rate": 1.8583500901844157e-05, + "loss": 0.4776, + "step": 1134 + }, + { + "epoch": 0.2, + "learning_rate": 1.858062023721164e-05, + "loss": 0.4862, + "step": 1135 + }, + { + "epoch": 0.2, + "learning_rate": 1.8577736870160482e-05, + "loss": 0.488, + "step": 1136 + }, + { + "epoch": 0.2, + "learning_rate": 1.857485080159879e-05, + "loss": 0.4913, + "step": 1137 + }, + { + "epoch": 0.2, + "learning_rate": 1.857196203243552e-05, + "loss": 0.4903, + "step": 1138 + }, + { + "epoch": 0.2, + "learning_rate": 1.8569070563580466e-05, + "loss": 0.4746, + "step": 1139 + }, + { + "epoch": 0.2, + "learning_rate": 1.8566176395944277e-05, + "loss": 0.4835, + "step": 1140 + }, + { + "epoch": 0.2, + "learning_rate": 1.8563279530438464e-05, + "loss": 0.4722, + "step": 1141 + }, + { + "epoch": 0.2, + "learning_rate": 1.8560379967975376e-05, + "loss": 0.4798, + "step": 1142 + }, + { + "epoch": 0.2, + "learning_rate": 1.8557477709468214e-05, + "loss": 0.4928, + "step": 1143 + }, + { + "epoch": 0.2, + "learning_rate": 1.8554572755831026e-05, + "loss": 0.4819, + "step": 1144 + }, + { + "epoch": 0.2, + "learning_rate": 1.8551665107978708e-05, + "loss": 0.4879, + "step": 1145 + }, + { + "epoch": 0.2, + "learning_rate": 1.8548754766827016e-05, + "loss": 0.4904, + "step": 1146 + }, + { + "epoch": 0.2, + "learning_rate": 1.8545841733292543e-05, + "loss": 0.4838, + "step": 1147 + }, + { + "epoch": 0.2, + "learning_rate": 1.8542926008292726e-05, + "loss": 0.4819, + "step": 1148 + }, + { + "epoch": 0.2, + "learning_rate": 1.8540007592745865e-05, + "loss": 0.4732, + "step": 1149 + }, + { + "epoch": 0.2, + "learning_rate": 1.8537086487571095e-05, + "loss": 0.4843, + "step": 1150 + }, + { + "epoch": 0.2, + "learning_rate": 1.85341626936884e-05, + "loss": 0.4837, + "step": 1151 + }, + { + "epoch": 0.2, + "learning_rate": 1.8531236212018616e-05, + "loss": 0.4972, + "step": 1152 + }, + { + "epoch": 0.2, + "learning_rate": 1.8528307043483425e-05, + "loss": 0.4893, + "step": 1153 + }, + { + "epoch": 0.2, + "learning_rate": 1.8525375189005345e-05, + "loss": 0.4974, + "step": 1154 + }, + { + "epoch": 0.2, + "learning_rate": 1.852244064950775e-05, + "loss": 0.4873, + "step": 1155 + }, + { + "epoch": 0.2, + "learning_rate": 1.8519503425914857e-05, + "loss": 0.5025, + "step": 1156 + }, + { + "epoch": 0.2, + "learning_rate": 1.851656351915173e-05, + "loss": 0.4822, + "step": 1157 + }, + { + "epoch": 0.2, + "learning_rate": 1.851362093014427e-05, + "loss": 0.4985, + "step": 1158 + }, + { + "epoch": 0.2, + "learning_rate": 1.851067565981924e-05, + "loss": 0.4862, + "step": 1159 + }, + { + "epoch": 0.2, + "learning_rate": 1.850772770910423e-05, + "loss": 0.4824, + "step": 1160 + }, + { + "epoch": 0.2, + "learning_rate": 1.850477707892768e-05, + "loss": 0.471, + "step": 1161 + }, + { + "epoch": 0.2, + "learning_rate": 1.8501823770218873e-05, + "loss": 0.4918, + "step": 1162 + }, + { + "epoch": 0.2, + "learning_rate": 1.8498867783907942e-05, + "loss": 0.499, + "step": 1163 + }, + { + "epoch": 0.2, + "learning_rate": 1.8495909120925857e-05, + "loss": 0.5001, + "step": 1164 + }, + { + "epoch": 0.2, + "learning_rate": 1.849294778220443e-05, + "loss": 0.4884, + "step": 1165 + }, + { + "epoch": 0.2, + "learning_rate": 1.8489983768676322e-05, + "loss": 0.4888, + "step": 1166 + }, + { + "epoch": 0.2, + "learning_rate": 1.8487017081275028e-05, + "loss": 0.4697, + "step": 1167 + }, + { + "epoch": 0.2, + "learning_rate": 1.8484047720934898e-05, + "loss": 0.4897, + "step": 1168 + }, + { + "epoch": 0.2, + "learning_rate": 1.8481075688591104e-05, + "loss": 0.4859, + "step": 1169 + }, + { + "epoch": 0.2, + "learning_rate": 1.8478100985179676e-05, + "loss": 0.4838, + "step": 1170 + }, + { + "epoch": 0.2, + "learning_rate": 1.8475123611637485e-05, + "loss": 0.4881, + "step": 1171 + }, + { + "epoch": 0.2, + "learning_rate": 1.8472143568902235e-05, + "loss": 0.4959, + "step": 1172 + }, + { + "epoch": 0.2, + "learning_rate": 1.846916085791247e-05, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.2, + "learning_rate": 1.8466175479607583e-05, + "loss": 0.4869, + "step": 1174 + }, + { + "epoch": 0.2, + "learning_rate": 1.84631874349278e-05, + "loss": 0.4968, + "step": 1175 + }, + { + "epoch": 0.2, + "learning_rate": 1.8460196724814193e-05, + "loss": 0.4881, + "step": 1176 + }, + { + "epoch": 0.2, + "learning_rate": 1.8457203350208664e-05, + "loss": 0.4959, + "step": 1177 + }, + { + "epoch": 0.2, + "learning_rate": 1.845420731205396e-05, + "loss": 0.4905, + "step": 1178 + }, + { + "epoch": 0.2, + "learning_rate": 1.8451208611293672e-05, + "loss": 0.4918, + "step": 1179 + }, + { + "epoch": 0.2, + "learning_rate": 1.844820724887222e-05, + "loss": 0.4899, + "step": 1180 + }, + { + "epoch": 0.2, + "learning_rate": 1.8445203225734866e-05, + "loss": 0.4855, + "step": 1181 + }, + { + "epoch": 0.2, + "learning_rate": 1.8442196542827712e-05, + "loss": 0.4858, + "step": 1182 + }, + { + "epoch": 0.2, + "learning_rate": 1.8439187201097696e-05, + "loss": 0.4913, + "step": 1183 + }, + { + "epoch": 0.21, + "learning_rate": 1.8436175201492594e-05, + "loss": 0.4818, + "step": 1184 + }, + { + "epoch": 0.21, + "learning_rate": 1.8433160544961017e-05, + "loss": 0.4775, + "step": 1185 + }, + { + "epoch": 0.21, + "learning_rate": 1.8430143232452413e-05, + "loss": 0.4777, + "step": 1186 + }, + { + "epoch": 0.21, + "learning_rate": 1.8427123264917074e-05, + "loss": 0.4813, + "step": 1187 + }, + { + "epoch": 0.21, + "learning_rate": 1.8424100643306113e-05, + "loss": 0.4798, + "step": 1188 + }, + { + "epoch": 0.21, + "learning_rate": 1.8421075368571493e-05, + "loss": 0.4814, + "step": 1189 + }, + { + "epoch": 0.21, + "learning_rate": 1.8418047441666012e-05, + "loss": 0.4834, + "step": 1190 + }, + { + "epoch": 0.21, + "learning_rate": 1.8415016863543286e-05, + "loss": 0.4783, + "step": 1191 + }, + { + "epoch": 0.21, + "learning_rate": 1.8411983635157792e-05, + "loss": 0.4856, + "step": 1192 + }, + { + "epoch": 0.21, + "learning_rate": 1.8408947757464825e-05, + "loss": 0.4851, + "step": 1193 + }, + { + "epoch": 0.21, + "learning_rate": 1.840590923142051e-05, + "loss": 0.484, + "step": 1194 + }, + { + "epoch": 0.21, + "learning_rate": 1.8402868057981823e-05, + "loss": 0.4843, + "step": 1195 + }, + { + "epoch": 0.21, + "learning_rate": 1.839982423810656e-05, + "loss": 0.4877, + "step": 1196 + }, + { + "epoch": 0.21, + "learning_rate": 1.8396777772753355e-05, + "loss": 0.488, + "step": 1197 + }, + { + "epoch": 0.21, + "learning_rate": 1.839372866288168e-05, + "loss": 0.4934, + "step": 1198 + }, + { + "epoch": 0.21, + "learning_rate": 1.839067690945183e-05, + "loss": 0.4813, + "step": 1199 + }, + { + "epoch": 0.21, + "learning_rate": 1.8387622513424942e-05, + "loss": 0.4897, + "step": 1200 + }, + { + "epoch": 0.21, + "learning_rate": 1.8384565475762978e-05, + "loss": 0.4739, + "step": 1201 + }, + { + "epoch": 0.21, + "learning_rate": 1.838150579742873e-05, + "loss": 0.5024, + "step": 1202 + }, + { + "epoch": 0.21, + "learning_rate": 1.837844347938584e-05, + "loss": 0.4796, + "step": 1203 + }, + { + "epoch": 0.21, + "learning_rate": 1.8375378522598756e-05, + "loss": 0.4995, + "step": 1204 + }, + { + "epoch": 0.21, + "learning_rate": 1.8372310928032774e-05, + "loss": 0.4788, + "step": 1205 + }, + { + "epoch": 0.21, + "learning_rate": 1.8369240696654017e-05, + "loss": 0.4866, + "step": 1206 + }, + { + "epoch": 0.21, + "learning_rate": 1.8366167829429434e-05, + "loss": 0.4749, + "step": 1207 + }, + { + "epoch": 0.21, + "learning_rate": 1.836309232732681e-05, + "loss": 0.4948, + "step": 1208 + }, + { + "epoch": 0.21, + "learning_rate": 1.836001419131476e-05, + "loss": 0.4762, + "step": 1209 + }, + { + "epoch": 0.21, + "learning_rate": 1.835693342236272e-05, + "loss": 0.5067, + "step": 1210 + }, + { + "epoch": 0.21, + "learning_rate": 1.8353850021440962e-05, + "loss": 0.4815, + "step": 1211 + }, + { + "epoch": 0.21, + "learning_rate": 1.835076398952059e-05, + "loss": 0.4806, + "step": 1212 + }, + { + "epoch": 0.21, + "learning_rate": 1.8347675327573525e-05, + "loss": 0.4874, + "step": 1213 + }, + { + "epoch": 0.21, + "learning_rate": 1.834458403657253e-05, + "loss": 0.4903, + "step": 1214 + }, + { + "epoch": 0.21, + "learning_rate": 1.834149011749119e-05, + "loss": 0.4731, + "step": 1215 + }, + { + "epoch": 0.21, + "learning_rate": 1.8338393571303917e-05, + "loss": 0.4878, + "step": 1216 + }, + { + "epoch": 0.21, + "learning_rate": 1.8335294398985953e-05, + "loss": 0.4938, + "step": 1217 + }, + { + "epoch": 0.21, + "learning_rate": 1.8332192601513358e-05, + "loss": 0.4939, + "step": 1218 + }, + { + "epoch": 0.21, + "learning_rate": 1.8329088179863033e-05, + "loss": 0.4756, + "step": 1219 + }, + { + "epoch": 0.21, + "learning_rate": 1.8325981135012693e-05, + "loss": 0.4886, + "step": 1220 + }, + { + "epoch": 0.21, + "learning_rate": 1.832287146794089e-05, + "loss": 0.4813, + "step": 1221 + }, + { + "epoch": 0.21, + "learning_rate": 1.8319759179626992e-05, + "loss": 0.495, + "step": 1222 + }, + { + "epoch": 0.21, + "learning_rate": 1.8316644271051197e-05, + "loss": 0.4832, + "step": 1223 + }, + { + "epoch": 0.21, + "learning_rate": 1.8313526743194536e-05, + "loss": 0.483, + "step": 1224 + }, + { + "epoch": 0.21, + "learning_rate": 1.8310406597038843e-05, + "loss": 0.4954, + "step": 1225 + }, + { + "epoch": 0.21, + "learning_rate": 1.83072838335668e-05, + "loss": 0.4769, + "step": 1226 + }, + { + "epoch": 0.21, + "learning_rate": 1.8304158453761904e-05, + "loss": 0.4896, + "step": 1227 + }, + { + "epoch": 0.21, + "learning_rate": 1.8301030458608475e-05, + "loss": 0.4876, + "step": 1228 + }, + { + "epoch": 0.21, + "learning_rate": 1.8297899849091654e-05, + "loss": 0.4811, + "step": 1229 + }, + { + "epoch": 0.21, + "learning_rate": 1.8294766626197414e-05, + "loss": 0.4938, + "step": 1230 + }, + { + "epoch": 0.21, + "learning_rate": 1.8291630790912544e-05, + "loss": 0.4829, + "step": 1231 + }, + { + "epoch": 0.21, + "learning_rate": 1.828849234422466e-05, + "loss": 0.4799, + "step": 1232 + }, + { + "epoch": 0.21, + "learning_rate": 1.8285351287122192e-05, + "loss": 0.475, + "step": 1233 + }, + { + "epoch": 0.21, + "learning_rate": 1.8282207620594405e-05, + "loss": 0.4934, + "step": 1234 + }, + { + "epoch": 0.21, + "learning_rate": 1.827906134563138e-05, + "loss": 0.4791, + "step": 1235 + }, + { + "epoch": 0.21, + "learning_rate": 1.827591246322401e-05, + "loss": 0.4891, + "step": 1236 + }, + { + "epoch": 0.21, + "learning_rate": 1.827276097436403e-05, + "loss": 0.4866, + "step": 1237 + }, + { + "epoch": 0.21, + "learning_rate": 1.8269606880043974e-05, + "loss": 0.4862, + "step": 1238 + }, + { + "epoch": 0.21, + "learning_rate": 1.8266450181257213e-05, + "loss": 0.4986, + "step": 1239 + }, + { + "epoch": 0.21, + "learning_rate": 1.826329087899793e-05, + "loss": 0.4878, + "step": 1240 + }, + { + "epoch": 0.21, + "learning_rate": 1.8260128974261123e-05, + "loss": 0.4904, + "step": 1241 + }, + { + "epoch": 0.22, + "learning_rate": 1.8256964468042624e-05, + "loss": 0.4943, + "step": 1242 + }, + { + "epoch": 0.22, + "learning_rate": 1.8253797361339075e-05, + "loss": 0.4712, + "step": 1243 + }, + { + "epoch": 0.22, + "learning_rate": 1.825062765514794e-05, + "loss": 0.4856, + "step": 1244 + }, + { + "epoch": 0.22, + "learning_rate": 1.8247455350467496e-05, + "loss": 0.4751, + "step": 1245 + }, + { + "epoch": 0.22, + "learning_rate": 1.8244280448296852e-05, + "loss": 0.493, + "step": 1246 + }, + { + "epoch": 0.22, + "learning_rate": 1.824110294963591e-05, + "loss": 0.4863, + "step": 1247 + }, + { + "epoch": 0.22, + "learning_rate": 1.8237922855485422e-05, + "loss": 0.4936, + "step": 1248 + }, + { + "epoch": 0.22, + "learning_rate": 1.823474016684693e-05, + "loss": 0.4764, + "step": 1249 + }, + { + "epoch": 0.22, + "learning_rate": 1.8231554884722807e-05, + "loss": 0.4865, + "step": 1250 + }, + { + "epoch": 0.22, + "learning_rate": 1.8228367010116246e-05, + "loss": 0.4816, + "step": 1251 + }, + { + "epoch": 0.22, + "learning_rate": 1.822517654403124e-05, + "loss": 0.4873, + "step": 1252 + }, + { + "epoch": 0.22, + "learning_rate": 1.8221983487472617e-05, + "loss": 0.4851, + "step": 1253 + }, + { + "epoch": 0.22, + "learning_rate": 1.8218787841446003e-05, + "loss": 0.4765, + "step": 1254 + }, + { + "epoch": 0.22, + "learning_rate": 1.8215589606957862e-05, + "loss": 0.4912, + "step": 1255 + }, + { + "epoch": 0.22, + "learning_rate": 1.821238878501545e-05, + "loss": 0.4789, + "step": 1256 + }, + { + "epoch": 0.22, + "learning_rate": 1.820918537662685e-05, + "loss": 0.4815, + "step": 1257 + }, + { + "epoch": 0.22, + "learning_rate": 1.8205979382800963e-05, + "loss": 0.4828, + "step": 1258 + }, + { + "epoch": 0.22, + "learning_rate": 1.820277080454749e-05, + "loss": 0.4934, + "step": 1259 + }, + { + "epoch": 0.22, + "learning_rate": 1.8199559642876962e-05, + "loss": 0.4828, + "step": 1260 + }, + { + "epoch": 0.22, + "learning_rate": 1.8196345898800715e-05, + "loss": 0.4804, + "step": 1261 + }, + { + "epoch": 0.22, + "learning_rate": 1.8193129573330896e-05, + "loss": 0.4816, + "step": 1262 + }, + { + "epoch": 0.22, + "learning_rate": 1.8189910667480476e-05, + "loss": 0.4906, + "step": 1263 + }, + { + "epoch": 0.22, + "learning_rate": 1.8186689182263225e-05, + "loss": 0.4922, + "step": 1264 + }, + { + "epoch": 0.22, + "learning_rate": 1.818346511869373e-05, + "loss": 0.4915, + "step": 1265 + }, + { + "epoch": 0.22, + "learning_rate": 1.8180238477787406e-05, + "loss": 0.4774, + "step": 1266 + }, + { + "epoch": 0.22, + "learning_rate": 1.8177009260560447e-05, + "loss": 0.479, + "step": 1267 + }, + { + "epoch": 0.22, + "learning_rate": 1.817377746802989e-05, + "loss": 0.4945, + "step": 1268 + }, + { + "epoch": 0.22, + "learning_rate": 1.8170543101213565e-05, + "loss": 0.49, + "step": 1269 + }, + { + "epoch": 0.22, + "learning_rate": 1.816730616113012e-05, + "loss": 0.4786, + "step": 1270 + }, + { + "epoch": 0.22, + "learning_rate": 1.816406664879901e-05, + "loss": 0.4775, + "step": 1271 + }, + { + "epoch": 0.22, + "learning_rate": 1.8160824565240495e-05, + "loss": 0.4895, + "step": 1272 + }, + { + "epoch": 0.22, + "learning_rate": 1.8157579911475664e-05, + "loss": 0.4846, + "step": 1273 + }, + { + "epoch": 0.22, + "learning_rate": 1.8154332688526395e-05, + "loss": 0.4966, + "step": 1274 + }, + { + "epoch": 0.22, + "learning_rate": 1.8151082897415386e-05, + "loss": 0.4857, + "step": 1275 + }, + { + "epoch": 0.22, + "learning_rate": 1.8147830539166132e-05, + "loss": 0.4779, + "step": 1276 + }, + { + "epoch": 0.22, + "learning_rate": 1.8144575614802958e-05, + "loss": 0.4868, + "step": 1277 + }, + { + "epoch": 0.22, + "learning_rate": 1.8141318125350974e-05, + "loss": 0.4888, + "step": 1278 + }, + { + "epoch": 0.22, + "learning_rate": 1.8138058071836116e-05, + "loss": 0.4667, + "step": 1279 + }, + { + "epoch": 0.22, + "learning_rate": 1.8134795455285116e-05, + "loss": 0.5037, + "step": 1280 + }, + { + "epoch": 0.22, + "learning_rate": 1.8131530276725514e-05, + "loss": 0.4845, + "step": 1281 + }, + { + "epoch": 0.22, + "learning_rate": 1.8128262537185663e-05, + "loss": 0.4904, + "step": 1282 + }, + { + "epoch": 0.22, + "learning_rate": 1.812499223769472e-05, + "loss": 0.4663, + "step": 1283 + }, + { + "epoch": 0.22, + "learning_rate": 1.8121719379282646e-05, + "loss": 0.4887, + "step": 1284 + }, + { + "epoch": 0.22, + "learning_rate": 1.8118443962980215e-05, + "loss": 0.4854, + "step": 1285 + }, + { + "epoch": 0.22, + "learning_rate": 1.8115165989818992e-05, + "loss": 0.4785, + "step": 1286 + }, + { + "epoch": 0.22, + "learning_rate": 1.8111885460831362e-05, + "loss": 0.4927, + "step": 1287 + }, + { + "epoch": 0.22, + "learning_rate": 1.810860237705051e-05, + "loss": 0.4945, + "step": 1288 + }, + { + "epoch": 0.22, + "learning_rate": 1.8105316739510424e-05, + "loss": 0.4768, + "step": 1289 + }, + { + "epoch": 0.22, + "learning_rate": 1.8102028549245894e-05, + "loss": 0.5007, + "step": 1290 + }, + { + "epoch": 0.22, + "learning_rate": 1.8098737807292517e-05, + "loss": 0.4795, + "step": 1291 + }, + { + "epoch": 0.22, + "learning_rate": 1.8095444514686702e-05, + "loss": 0.4891, + "step": 1292 + }, + { + "epoch": 0.22, + "learning_rate": 1.8092148672465647e-05, + "loss": 0.4903, + "step": 1293 + }, + { + "epoch": 0.22, + "learning_rate": 1.8088850281667358e-05, + "loss": 0.4997, + "step": 1294 + }, + { + "epoch": 0.22, + "learning_rate": 1.808554934333065e-05, + "loss": 0.4868, + "step": 1295 + }, + { + "epoch": 0.22, + "learning_rate": 1.808224585849513e-05, + "loss": 0.5118, + "step": 1296 + }, + { + "epoch": 0.22, + "learning_rate": 1.8078939828201213e-05, + "loss": 0.4826, + "step": 1297 + }, + { + "epoch": 0.22, + "learning_rate": 1.807563125349012e-05, + "loss": 0.4833, + "step": 1298 + }, + { + "epoch": 0.23, + "learning_rate": 1.8072320135403862e-05, + "loss": 0.4942, + "step": 1299 + }, + { + "epoch": 0.23, + "learning_rate": 1.806900647498526e-05, + "loss": 0.4836, + "step": 1300 + }, + { + "epoch": 0.23, + "learning_rate": 1.8065690273277933e-05, + "loss": 0.4759, + "step": 1301 + }, + { + "epoch": 0.23, + "learning_rate": 1.8062371531326298e-05, + "loss": 0.4762, + "step": 1302 + }, + { + "epoch": 0.23, + "learning_rate": 1.8059050250175577e-05, + "loss": 0.4876, + "step": 1303 + }, + { + "epoch": 0.23, + "learning_rate": 1.805572643087179e-05, + "loss": 0.4878, + "step": 1304 + }, + { + "epoch": 0.23, + "learning_rate": 1.8052400074461752e-05, + "loss": 0.4943, + "step": 1305 + }, + { + "epoch": 0.23, + "learning_rate": 1.8049071181993083e-05, + "loss": 0.4735, + "step": 1306 + }, + { + "epoch": 0.23, + "learning_rate": 1.8045739754514197e-05, + "loss": 0.4954, + "step": 1307 + }, + { + "epoch": 0.23, + "learning_rate": 1.804240579307431e-05, + "loss": 0.4753, + "step": 1308 + }, + { + "epoch": 0.23, + "learning_rate": 1.8039069298723438e-05, + "loss": 0.4947, + "step": 1309 + }, + { + "epoch": 0.23, + "learning_rate": 1.8035730272512383e-05, + "loss": 0.4878, + "step": 1310 + }, + { + "epoch": 0.23, + "learning_rate": 1.803238871549276e-05, + "loss": 0.4959, + "step": 1311 + }, + { + "epoch": 0.23, + "learning_rate": 1.802904462871697e-05, + "loss": 0.4842, + "step": 1312 + }, + { + "epoch": 0.23, + "learning_rate": 1.8025698013238217e-05, + "loss": 0.479, + "step": 1313 + }, + { + "epoch": 0.23, + "learning_rate": 1.8022348870110495e-05, + "loss": 0.4788, + "step": 1314 + }, + { + "epoch": 0.23, + "learning_rate": 1.8018997200388605e-05, + "loss": 0.4837, + "step": 1315 + }, + { + "epoch": 0.23, + "learning_rate": 1.801564300512813e-05, + "loss": 0.4876, + "step": 1316 + }, + { + "epoch": 0.23, + "learning_rate": 1.8012286285385456e-05, + "loss": 0.4842, + "step": 1317 + }, + { + "epoch": 0.23, + "learning_rate": 1.800892704221777e-05, + "loss": 0.4833, + "step": 1318 + }, + { + "epoch": 0.23, + "learning_rate": 1.8005565276683038e-05, + "loss": 0.4918, + "step": 1319 + }, + { + "epoch": 0.23, + "learning_rate": 1.8002200989840034e-05, + "loss": 0.4897, + "step": 1320 + }, + { + "epoch": 0.23, + "learning_rate": 1.7998834182748318e-05, + "loss": 0.4864, + "step": 1321 + }, + { + "epoch": 0.23, + "learning_rate": 1.7995464856468253e-05, + "loss": 0.4754, + "step": 1322 + }, + { + "epoch": 0.23, + "learning_rate": 1.7992093012060988e-05, + "loss": 0.4875, + "step": 1323 + }, + { + "epoch": 0.23, + "learning_rate": 1.798871865058846e-05, + "loss": 0.4805, + "step": 1324 + }, + { + "epoch": 0.23, + "learning_rate": 1.7985341773113416e-05, + "loss": 0.4966, + "step": 1325 + }, + { + "epoch": 0.23, + "learning_rate": 1.7981962380699376e-05, + "loss": 0.4825, + "step": 1326 + }, + { + "epoch": 0.23, + "learning_rate": 1.7978580474410665e-05, + "loss": 0.4848, + "step": 1327 + }, + { + "epoch": 0.23, + "learning_rate": 1.7975196055312393e-05, + "loss": 0.4776, + "step": 1328 + }, + { + "epoch": 0.23, + "learning_rate": 1.797180912447047e-05, + "loss": 0.4793, + "step": 1329 + }, + { + "epoch": 0.23, + "learning_rate": 1.7968419682951584e-05, + "loss": 0.4984, + "step": 1330 + }, + { + "epoch": 0.23, + "learning_rate": 1.796502773182322e-05, + "loss": 0.4848, + "step": 1331 + }, + { + "epoch": 0.23, + "learning_rate": 1.7961633272153662e-05, + "loss": 0.4775, + "step": 1332 + }, + { + "epoch": 0.23, + "learning_rate": 1.7958236305011972e-05, + "loss": 0.5025, + "step": 1333 + }, + { + "epoch": 0.23, + "learning_rate": 1.7954836831468007e-05, + "loss": 0.4843, + "step": 1334 + }, + { + "epoch": 0.23, + "learning_rate": 1.7951434852592406e-05, + "loss": 0.4804, + "step": 1335 + }, + { + "epoch": 0.23, + "learning_rate": 1.794803036945661e-05, + "loss": 0.4761, + "step": 1336 + }, + { + "epoch": 0.23, + "learning_rate": 1.794462338313284e-05, + "loss": 0.4893, + "step": 1337 + }, + { + "epoch": 0.23, + "learning_rate": 1.7941213894694108e-05, + "loss": 0.4781, + "step": 1338 + }, + { + "epoch": 0.23, + "learning_rate": 1.7937801905214213e-05, + "loss": 0.4984, + "step": 1339 + }, + { + "epoch": 0.23, + "learning_rate": 1.7934387415767745e-05, + "loss": 0.4858, + "step": 1340 + }, + { + "epoch": 0.23, + "learning_rate": 1.7930970427430074e-05, + "loss": 0.4893, + "step": 1341 + }, + { + "epoch": 0.23, + "learning_rate": 1.7927550941277364e-05, + "loss": 0.4732, + "step": 1342 + }, + { + "epoch": 0.23, + "learning_rate": 1.7924128958386558e-05, + "loss": 0.4996, + "step": 1343 + }, + { + "epoch": 0.23, + "learning_rate": 1.79207044798354e-05, + "loss": 0.4775, + "step": 1344 + }, + { + "epoch": 0.23, + "learning_rate": 1.7917277506702406e-05, + "loss": 0.4816, + "step": 1345 + }, + { + "epoch": 0.23, + "learning_rate": 1.791384804006688e-05, + "loss": 0.4786, + "step": 1346 + }, + { + "epoch": 0.23, + "learning_rate": 1.7910416081008914e-05, + "loss": 0.4925, + "step": 1347 + }, + { + "epoch": 0.23, + "learning_rate": 1.7906981630609383e-05, + "loss": 0.4695, + "step": 1348 + }, + { + "epoch": 0.23, + "learning_rate": 1.7903544689949955e-05, + "loss": 0.4898, + "step": 1349 + }, + { + "epoch": 0.23, + "learning_rate": 1.7900105260113066e-05, + "loss": 0.4902, + "step": 1350 + }, + { + "epoch": 0.23, + "learning_rate": 1.7896663342181954e-05, + "loss": 0.4999, + "step": 1351 + }, + { + "epoch": 0.23, + "learning_rate": 1.7893218937240627e-05, + "loss": 0.4797, + "step": 1352 + }, + { + "epoch": 0.23, + "learning_rate": 1.788977204637388e-05, + "loss": 0.5008, + "step": 1353 + }, + { + "epoch": 0.23, + "learning_rate": 1.78863226706673e-05, + "loss": 0.4776, + "step": 1354 + }, + { + "epoch": 0.23, + "learning_rate": 1.788287081120724e-05, + "loss": 0.4872, + "step": 1355 + }, + { + "epoch": 0.23, + "learning_rate": 1.7879416469080847e-05, + "loss": 0.4703, + "step": 1356 + }, + { + "epoch": 0.24, + "learning_rate": 1.7875959645376043e-05, + "loss": 0.4942, + "step": 1357 + }, + { + "epoch": 0.24, + "learning_rate": 1.7872500341181546e-05, + "loss": 0.4934, + "step": 1358 + }, + { + "epoch": 0.24, + "learning_rate": 1.7869038557586832e-05, + "loss": 0.4881, + "step": 1359 + }, + { + "epoch": 0.24, + "learning_rate": 1.786557429568218e-05, + "loss": 0.4758, + "step": 1360 + }, + { + "epoch": 0.24, + "learning_rate": 1.7862107556558633e-05, + "loss": 0.4827, + "step": 1361 + }, + { + "epoch": 0.24, + "learning_rate": 1.7858638341308026e-05, + "loss": 0.4804, + "step": 1362 + }, + { + "epoch": 0.24, + "learning_rate": 1.785516665102297e-05, + "loss": 0.4954, + "step": 1363 + }, + { + "epoch": 0.24, + "learning_rate": 1.7851692486796847e-05, + "loss": 0.4928, + "step": 1364 + }, + { + "epoch": 0.24, + "learning_rate": 1.7848215849723836e-05, + "loss": 0.4783, + "step": 1365 + }, + { + "epoch": 0.24, + "learning_rate": 1.7844736740898876e-05, + "loss": 0.4845, + "step": 1366 + }, + { + "epoch": 0.24, + "learning_rate": 1.7841255161417698e-05, + "loss": 0.4874, + "step": 1367 + }, + { + "epoch": 0.24, + "learning_rate": 1.7837771112376804e-05, + "loss": 0.4759, + "step": 1368 + }, + { + "epoch": 0.24, + "learning_rate": 1.7834284594873478e-05, + "loss": 0.4842, + "step": 1369 + }, + { + "epoch": 0.24, + "learning_rate": 1.7830795610005775e-05, + "loss": 0.4797, + "step": 1370 + }, + { + "epoch": 0.24, + "learning_rate": 1.7827304158872538e-05, + "loss": 0.5033, + "step": 1371 + }, + { + "epoch": 0.24, + "learning_rate": 1.782381024257337e-05, + "loss": 0.4867, + "step": 1372 + }, + { + "epoch": 0.24, + "learning_rate": 1.782031386220867e-05, + "loss": 0.486, + "step": 1373 + }, + { + "epoch": 0.24, + "learning_rate": 1.78168150188796e-05, + "loss": 0.4841, + "step": 1374 + }, + { + "epoch": 0.24, + "learning_rate": 1.78133137136881e-05, + "loss": 0.4853, + "step": 1375 + }, + { + "epoch": 0.24, + "learning_rate": 1.7809809947736892e-05, + "loss": 0.4927, + "step": 1376 + }, + { + "epoch": 0.24, + "learning_rate": 1.780630372212946e-05, + "loss": 0.4803, + "step": 1377 + }, + { + "epoch": 0.24, + "learning_rate": 1.7802795037970076e-05, + "loss": 0.4791, + "step": 1378 + }, + { + "epoch": 0.24, + "learning_rate": 1.7799283896363778e-05, + "loss": 0.4781, + "step": 1379 + }, + { + "epoch": 0.24, + "learning_rate": 1.779577029841638e-05, + "loss": 0.4866, + "step": 1380 + }, + { + "epoch": 0.24, + "learning_rate": 1.779225424523447e-05, + "loss": 0.4822, + "step": 1381 + }, + { + "epoch": 0.24, + "learning_rate": 1.7788735737925414e-05, + "loss": 0.4758, + "step": 1382 + }, + { + "epoch": 0.24, + "learning_rate": 1.7785214777597342e-05, + "loss": 0.4866, + "step": 1383 + }, + { + "epoch": 0.24, + "learning_rate": 1.778169136535916e-05, + "loss": 0.4759, + "step": 1384 + }, + { + "epoch": 0.24, + "learning_rate": 1.777816550232055e-05, + "loss": 0.4994, + "step": 1385 + }, + { + "epoch": 0.24, + "learning_rate": 1.7774637189591963e-05, + "loss": 0.4736, + "step": 1386 + }, + { + "epoch": 0.24, + "learning_rate": 1.777110642828462e-05, + "loss": 0.4879, + "step": 1387 + }, + { + "epoch": 0.24, + "learning_rate": 1.776757321951051e-05, + "loss": 0.471, + "step": 1388 + }, + { + "epoch": 0.24, + "learning_rate": 1.776403756438241e-05, + "loss": 0.4798, + "step": 1389 + }, + { + "epoch": 0.24, + "learning_rate": 1.776049946401384e-05, + "loss": 0.4738, + "step": 1390 + }, + { + "epoch": 0.24, + "learning_rate": 1.7756958919519118e-05, + "loss": 0.4954, + "step": 1391 + }, + { + "epoch": 0.24, + "learning_rate": 1.7753415932013313e-05, + "loss": 0.4746, + "step": 1392 + }, + { + "epoch": 0.24, + "learning_rate": 1.7749870502612267e-05, + "loss": 0.4881, + "step": 1393 + }, + { + "epoch": 0.24, + "learning_rate": 1.7746322632432593e-05, + "loss": 0.48, + "step": 1394 + }, + { + "epoch": 0.24, + "learning_rate": 1.774277232259168e-05, + "loss": 0.4918, + "step": 1395 + }, + { + "epoch": 0.24, + "learning_rate": 1.7739219574207673e-05, + "loss": 0.4842, + "step": 1396 + }, + { + "epoch": 0.24, + "learning_rate": 1.7735664388399492e-05, + "loss": 0.4949, + "step": 1397 + }, + { + "epoch": 0.24, + "learning_rate": 1.773210676628682e-05, + "loss": 0.488, + "step": 1398 + }, + { + "epoch": 0.24, + "learning_rate": 1.772854670899011e-05, + "loss": 0.4911, + "step": 1399 + }, + { + "epoch": 0.24, + "learning_rate": 1.7724984217630594e-05, + "loss": 0.4925, + "step": 1400 + }, + { + "epoch": 0.24, + "learning_rate": 1.7721419293330245e-05, + "loss": 0.4714, + "step": 1401 + }, + { + "epoch": 0.24, + "learning_rate": 1.771785193721182e-05, + "loss": 0.4858, + "step": 1402 + }, + { + "epoch": 0.24, + "learning_rate": 1.771428215039884e-05, + "loss": 0.4903, + "step": 1403 + }, + { + "epoch": 0.24, + "learning_rate": 1.7710709934015585e-05, + "loss": 0.482, + "step": 1404 + }, + { + "epoch": 0.24, + "learning_rate": 1.7707135289187115e-05, + "loss": 0.4867, + "step": 1405 + }, + { + "epoch": 0.24, + "learning_rate": 1.7703558217039233e-05, + "loss": 0.4799, + "step": 1406 + }, + { + "epoch": 0.24, + "learning_rate": 1.769997871869852e-05, + "loss": 0.4765, + "step": 1407 + }, + { + "epoch": 0.24, + "learning_rate": 1.7696396795292324e-05, + "loss": 0.4905, + "step": 1408 + }, + { + "epoch": 0.24, + "learning_rate": 1.769281244794875e-05, + "loss": 0.4839, + "step": 1409 + }, + { + "epoch": 0.24, + "learning_rate": 1.7689225677796667e-05, + "loss": 0.4807, + "step": 1410 + }, + { + "epoch": 0.24, + "learning_rate": 1.7685636485965713e-05, + "loss": 0.4713, + "step": 1411 + }, + { + "epoch": 0.24, + "learning_rate": 1.7682044873586273e-05, + "loss": 0.4737, + "step": 1412 + }, + { + "epoch": 0.24, + "learning_rate": 1.7678450841789515e-05, + "loss": 0.4845, + "step": 1413 + }, + { + "epoch": 0.24, + "learning_rate": 1.7674854391707357e-05, + "loss": 0.48, + "step": 1414 + }, + { + "epoch": 0.25, + "learning_rate": 1.7671255524472482e-05, + "loss": 0.4823, + "step": 1415 + }, + { + "epoch": 0.25, + "learning_rate": 1.7667654241218332e-05, + "loss": 0.4718, + "step": 1416 + }, + { + "epoch": 0.25, + "learning_rate": 1.766405054307911e-05, + "loss": 0.4727, + "step": 1417 + }, + { + "epoch": 0.25, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4721, + "step": 1418 + }, + { + "epoch": 0.25, + "learning_rate": 1.7656835906686072e-05, + "loss": 0.4903, + "step": 1419 + }, + { + "epoch": 0.25, + "learning_rate": 1.7653224970704465e-05, + "loss": 0.4977, + "step": 1420 + }, + { + "epoch": 0.25, + "learning_rate": 1.7649611624382207e-05, + "loss": 0.4877, + "step": 1421 + }, + { + "epoch": 0.25, + "learning_rate": 1.76459958688573e-05, + "loss": 0.4823, + "step": 1422 + }, + { + "epoch": 0.25, + "learning_rate": 1.7642377705268505e-05, + "loss": 0.4861, + "step": 1423 + }, + { + "epoch": 0.25, + "learning_rate": 1.7638757134755346e-05, + "loss": 0.4747, + "step": 1424 + }, + { + "epoch": 0.25, + "learning_rate": 1.7635134158458095e-05, + "loss": 0.4974, + "step": 1425 + }, + { + "epoch": 0.25, + "learning_rate": 1.7631508777517794e-05, + "loss": 0.4708, + "step": 1426 + }, + { + "epoch": 0.25, + "learning_rate": 1.7627880993076237e-05, + "loss": 0.4903, + "step": 1427 + }, + { + "epoch": 0.25, + "learning_rate": 1.762425080627597e-05, + "loss": 0.461, + "step": 1428 + }, + { + "epoch": 0.25, + "learning_rate": 1.76206182182603e-05, + "loss": 0.487, + "step": 1429 + }, + { + "epoch": 0.25, + "learning_rate": 1.76169832301733e-05, + "loss": 0.4694, + "step": 1430 + }, + { + "epoch": 0.25, + "learning_rate": 1.7613345843159777e-05, + "loss": 0.4955, + "step": 1431 + }, + { + "epoch": 0.25, + "learning_rate": 1.760970605836531e-05, + "loss": 0.4795, + "step": 1432 + }, + { + "epoch": 0.25, + "learning_rate": 1.7606063876936235e-05, + "loss": 0.4889, + "step": 1433 + }, + { + "epoch": 0.25, + "learning_rate": 1.7602419300019627e-05, + "loss": 0.4722, + "step": 1434 + }, + { + "epoch": 0.25, + "learning_rate": 1.7598772328763335e-05, + "loss": 0.4881, + "step": 1435 + }, + { + "epoch": 0.25, + "learning_rate": 1.7595122964315945e-05, + "loss": 0.4842, + "step": 1436 + }, + { + "epoch": 0.25, + "learning_rate": 1.7591471207826804e-05, + "loss": 0.4988, + "step": 1437 + }, + { + "epoch": 0.25, + "learning_rate": 1.758781706044602e-05, + "loss": 0.4792, + "step": 1438 + }, + { + "epoch": 0.25, + "learning_rate": 1.7584160523324437e-05, + "loss": 0.4915, + "step": 1439 + }, + { + "epoch": 0.25, + "learning_rate": 1.7580501597613665e-05, + "loss": 0.4851, + "step": 1440 + }, + { + "epoch": 0.25, + "learning_rate": 1.7576840284466065e-05, + "loss": 0.4947, + "step": 1441 + }, + { + "epoch": 0.25, + "learning_rate": 1.7573176585034744e-05, + "loss": 0.4741, + "step": 1442 + }, + { + "epoch": 0.25, + "learning_rate": 1.7569510500473566e-05, + "loss": 0.4847, + "step": 1443 + }, + { + "epoch": 0.25, + "learning_rate": 1.756584203193714e-05, + "loss": 0.4885, + "step": 1444 + }, + { + "epoch": 0.25, + "learning_rate": 1.7562171180580834e-05, + "loss": 0.4767, + "step": 1445 + }, + { + "epoch": 0.25, + "learning_rate": 1.755849794756076e-05, + "loss": 0.4881, + "step": 1446 + }, + { + "epoch": 0.25, + "learning_rate": 1.7554822334033782e-05, + "loss": 0.4759, + "step": 1447 + }, + { + "epoch": 0.25, + "learning_rate": 1.7551144341157514e-05, + "loss": 0.4845, + "step": 1448 + }, + { + "epoch": 0.25, + "learning_rate": 1.7547463970090324e-05, + "loss": 0.4891, + "step": 1449 + }, + { + "epoch": 0.25, + "learning_rate": 1.7543781221991317e-05, + "loss": 0.4907, + "step": 1450 + }, + { + "epoch": 0.25, + "learning_rate": 1.7540096098020358e-05, + "loss": 0.5027, + "step": 1451 + }, + { + "epoch": 0.25, + "learning_rate": 1.753640859933806e-05, + "loss": 0.482, + "step": 1452 + }, + { + "epoch": 0.25, + "learning_rate": 1.7532718727105772e-05, + "loss": 0.4841, + "step": 1453 + }, + { + "epoch": 0.25, + "learning_rate": 1.7529026482485605e-05, + "loss": 0.4958, + "step": 1454 + }, + { + "epoch": 0.25, + "learning_rate": 1.7525331866640406e-05, + "loss": 0.4702, + "step": 1455 + }, + { + "epoch": 0.25, + "learning_rate": 1.752163488073378e-05, + "loss": 0.479, + "step": 1456 + }, + { + "epoch": 0.25, + "learning_rate": 1.7517935525930068e-05, + "loss": 0.4984, + "step": 1457 + }, + { + "epoch": 0.25, + "learning_rate": 1.751423380339436e-05, + "loss": 0.4763, + "step": 1458 + }, + { + "epoch": 0.25, + "learning_rate": 1.7510529714292497e-05, + "loss": 0.4875, + "step": 1459 + }, + { + "epoch": 0.25, + "learning_rate": 1.750682325979106e-05, + "loss": 0.4765, + "step": 1460 + }, + { + "epoch": 0.25, + "learning_rate": 1.7503114441057374e-05, + "loss": 0.486, + "step": 1461 + }, + { + "epoch": 0.25, + "learning_rate": 1.7499403259259515e-05, + "loss": 0.471, + "step": 1462 + }, + { + "epoch": 0.25, + "learning_rate": 1.749568971556629e-05, + "loss": 0.4966, + "step": 1463 + }, + { + "epoch": 0.25, + "learning_rate": 1.749197381114727e-05, + "loss": 0.4799, + "step": 1464 + }, + { + "epoch": 0.25, + "learning_rate": 1.7488255547172754e-05, + "loss": 0.4838, + "step": 1465 + }, + { + "epoch": 0.25, + "learning_rate": 1.7484534924813785e-05, + "loss": 0.4895, + "step": 1466 + }, + { + "epoch": 0.25, + "learning_rate": 1.748081194524216e-05, + "loss": 0.4809, + "step": 1467 + }, + { + "epoch": 0.25, + "learning_rate": 1.7477086609630403e-05, + "loss": 0.4868, + "step": 1468 + }, + { + "epoch": 0.25, + "learning_rate": 1.7473358919151792e-05, + "loss": 0.4892, + "step": 1469 + }, + { + "epoch": 0.25, + "learning_rate": 1.746962887498034e-05, + "loss": 0.4867, + "step": 1470 + }, + { + "epoch": 0.25, + "learning_rate": 1.746589647829081e-05, + "loss": 0.4791, + "step": 1471 + }, + { + "epoch": 0.26, + "learning_rate": 1.7462161730258688e-05, + "loss": 0.4751, + "step": 1472 + }, + { + "epoch": 0.26, + "learning_rate": 1.745842463206022e-05, + "loss": 0.4752, + "step": 1473 + }, + { + "epoch": 0.26, + "learning_rate": 1.7454685184872388e-05, + "loss": 0.4848, + "step": 1474 + }, + { + "epoch": 0.26, + "learning_rate": 1.74509433898729e-05, + "loss": 0.4824, + "step": 1475 + }, + { + "epoch": 0.26, + "learning_rate": 1.7447199248240222e-05, + "loss": 0.4817, + "step": 1476 + }, + { + "epoch": 0.26, + "learning_rate": 1.7443452761153546e-05, + "loss": 0.4745, + "step": 1477 + }, + { + "epoch": 0.26, + "learning_rate": 1.743970392979281e-05, + "loss": 0.4781, + "step": 1478 + }, + { + "epoch": 0.26, + "learning_rate": 1.743595275533869e-05, + "loss": 0.4853, + "step": 1479 + }, + { + "epoch": 0.26, + "learning_rate": 1.743219923897259e-05, + "loss": 0.4854, + "step": 1480 + }, + { + "epoch": 0.26, + "learning_rate": 1.7428443381876657e-05, + "loss": 0.4923, + "step": 1481 + }, + { + "epoch": 0.26, + "learning_rate": 1.7424685185233788e-05, + "loss": 0.4898, + "step": 1482 + }, + { + "epoch": 0.26, + "learning_rate": 1.7420924650227603e-05, + "loss": 0.4776, + "step": 1483 + }, + { + "epoch": 0.26, + "learning_rate": 1.7417161778042456e-05, + "loss": 0.4953, + "step": 1484 + }, + { + "epoch": 0.26, + "learning_rate": 1.741339656986345e-05, + "loss": 0.4786, + "step": 1485 + }, + { + "epoch": 0.26, + "learning_rate": 1.7409629026876412e-05, + "loss": 0.4889, + "step": 1486 + }, + { + "epoch": 0.26, + "learning_rate": 1.740585915026791e-05, + "loss": 0.4938, + "step": 1487 + }, + { + "epoch": 0.26, + "learning_rate": 1.7402086941225246e-05, + "loss": 0.4819, + "step": 1488 + }, + { + "epoch": 0.26, + "learning_rate": 1.739831240093645e-05, + "loss": 0.4728, + "step": 1489 + }, + { + "epoch": 0.26, + "learning_rate": 1.7394535530590305e-05, + "loss": 0.4836, + "step": 1490 + }, + { + "epoch": 0.26, + "learning_rate": 1.7390756331376307e-05, + "loss": 0.4717, + "step": 1491 + }, + { + "epoch": 0.26, + "learning_rate": 1.7386974804484694e-05, + "loss": 0.4874, + "step": 1492 + }, + { + "epoch": 0.26, + "learning_rate": 1.738319095110644e-05, + "loss": 0.4797, + "step": 1493 + }, + { + "epoch": 0.26, + "learning_rate": 1.7379404772433247e-05, + "loss": 0.4836, + "step": 1494 + }, + { + "epoch": 0.26, + "learning_rate": 1.7375616269657544e-05, + "loss": 0.4834, + "step": 1495 + }, + { + "epoch": 0.26, + "learning_rate": 1.7371825443972513e-05, + "loss": 0.4895, + "step": 1496 + }, + { + "epoch": 0.26, + "learning_rate": 1.736803229657204e-05, + "loss": 0.4899, + "step": 1497 + }, + { + "epoch": 0.26, + "learning_rate": 1.7364236828650768e-05, + "loss": 0.4932, + "step": 1498 + }, + { + "epoch": 0.26, + "learning_rate": 1.736043904140405e-05, + "loss": 0.4728, + "step": 1499 + }, + { + "epoch": 0.26, + "learning_rate": 1.7356638936027975e-05, + "loss": 0.4743, + "step": 1500 + }, + { + "epoch": 0.26, + "learning_rate": 1.7352836513719377e-05, + "loss": 0.4668, + "step": 1501 + }, + { + "epoch": 0.26, + "learning_rate": 1.7349031775675796e-05, + "loss": 0.4973, + "step": 1502 + }, + { + "epoch": 0.26, + "learning_rate": 1.734522472309552e-05, + "loss": 0.4741, + "step": 1503 + }, + { + "epoch": 0.26, + "learning_rate": 1.734141535717756e-05, + "loss": 0.4871, + "step": 1504 + }, + { + "epoch": 0.26, + "learning_rate": 1.7337603679121645e-05, + "loss": 0.4767, + "step": 1505 + }, + { + "epoch": 0.26, + "learning_rate": 1.7333789690128252e-05, + "loss": 0.4918, + "step": 1506 + }, + { + "epoch": 0.26, + "learning_rate": 1.7329973391398575e-05, + "loss": 0.4799, + "step": 1507 + }, + { + "epoch": 0.26, + "learning_rate": 1.732615478413453e-05, + "loss": 0.5018, + "step": 1508 + }, + { + "epoch": 0.26, + "learning_rate": 1.732233386953877e-05, + "loss": 0.465, + "step": 1509 + }, + { + "epoch": 0.26, + "learning_rate": 1.731851064881467e-05, + "loss": 0.4904, + "step": 1510 + }, + { + "epoch": 0.26, + "learning_rate": 1.7314685123166333e-05, + "loss": 0.4751, + "step": 1511 + }, + { + "epoch": 0.26, + "learning_rate": 1.7310857293798585e-05, + "loss": 0.4875, + "step": 1512 + }, + { + "epoch": 0.26, + "learning_rate": 1.730702716191698e-05, + "loss": 0.4971, + "step": 1513 + }, + { + "epoch": 0.26, + "learning_rate": 1.73031947287278e-05, + "loss": 0.4836, + "step": 1514 + }, + { + "epoch": 0.26, + "learning_rate": 1.7299359995438046e-05, + "loss": 0.4697, + "step": 1515 + }, + { + "epoch": 0.26, + "learning_rate": 1.7295522963255443e-05, + "loss": 0.4878, + "step": 1516 + }, + { + "epoch": 0.26, + "learning_rate": 1.729168363338845e-05, + "loss": 0.4813, + "step": 1517 + }, + { + "epoch": 0.26, + "learning_rate": 1.7287842007046232e-05, + "loss": 0.4766, + "step": 1518 + }, + { + "epoch": 0.26, + "learning_rate": 1.7283998085438703e-05, + "loss": 0.4687, + "step": 1519 + }, + { + "epoch": 0.26, + "learning_rate": 1.728015186977647e-05, + "loss": 0.4789, + "step": 1520 + }, + { + "epoch": 0.26, + "learning_rate": 1.7276303361270886e-05, + "loss": 0.472, + "step": 1521 + }, + { + "epoch": 0.26, + "learning_rate": 1.7272452561134015e-05, + "loss": 0.4761, + "step": 1522 + }, + { + "epoch": 0.26, + "learning_rate": 1.7268599470578644e-05, + "loss": 0.4844, + "step": 1523 + }, + { + "epoch": 0.26, + "learning_rate": 1.7264744090818284e-05, + "loss": 0.4951, + "step": 1524 + }, + { + "epoch": 0.26, + "learning_rate": 1.726088642306716e-05, + "loss": 0.474, + "step": 1525 + }, + { + "epoch": 0.26, + "learning_rate": 1.7257026468540238e-05, + "loss": 0.4939, + "step": 1526 + }, + { + "epoch": 0.26, + "learning_rate": 1.725316422845317e-05, + "loss": 0.4736, + "step": 1527 + }, + { + "epoch": 0.26, + "learning_rate": 1.724929970402236e-05, + "loss": 0.4868, + "step": 1528 + }, + { + "epoch": 0.26, + "learning_rate": 1.7245432896464913e-05, + "loss": 0.4686, + "step": 1529 + }, + { + "epoch": 0.27, + "learning_rate": 1.724156380699866e-05, + "loss": 0.4903, + "step": 1530 + }, + { + "epoch": 0.27, + "learning_rate": 1.723769243684215e-05, + "loss": 0.4675, + "step": 1531 + }, + { + "epoch": 0.27, + "learning_rate": 1.723381878721465e-05, + "loss": 0.4902, + "step": 1532 + }, + { + "epoch": 0.27, + "learning_rate": 1.7229942859336142e-05, + "loss": 0.4852, + "step": 1533 + }, + { + "epoch": 0.27, + "learning_rate": 1.7226064654427327e-05, + "loss": 0.4787, + "step": 1534 + }, + { + "epoch": 0.27, + "learning_rate": 1.7222184173709627e-05, + "loss": 0.4813, + "step": 1535 + }, + { + "epoch": 0.27, + "learning_rate": 1.721830141840518e-05, + "loss": 0.4895, + "step": 1536 + }, + { + "epoch": 0.27, + "learning_rate": 1.721441638973683e-05, + "loss": 0.4857, + "step": 1537 + }, + { + "epoch": 0.27, + "learning_rate": 1.7210529088928156e-05, + "loss": 0.4911, + "step": 1538 + }, + { + "epoch": 0.27, + "learning_rate": 1.7206639517203433e-05, + "loss": 0.4721, + "step": 1539 + }, + { + "epoch": 0.27, + "learning_rate": 1.7202747675787662e-05, + "loss": 0.481, + "step": 1540 + }, + { + "epoch": 0.27, + "learning_rate": 1.7198853565906558e-05, + "loss": 0.4851, + "step": 1541 + }, + { + "epoch": 0.27, + "learning_rate": 1.719495718878655e-05, + "loss": 0.4899, + "step": 1542 + }, + { + "epoch": 0.27, + "learning_rate": 1.7191058545654783e-05, + "loss": 0.4861, + "step": 1543 + }, + { + "epoch": 0.27, + "learning_rate": 1.7187157637739108e-05, + "loss": 0.4907, + "step": 1544 + }, + { + "epoch": 0.27, + "learning_rate": 1.7183254466268093e-05, + "loss": 0.4905, + "step": 1545 + }, + { + "epoch": 0.27, + "learning_rate": 1.7179349032471026e-05, + "loss": 0.492, + "step": 1546 + }, + { + "epoch": 0.27, + "learning_rate": 1.7175441337577897e-05, + "loss": 0.4805, + "step": 1547 + }, + { + "epoch": 0.27, + "learning_rate": 1.717153138281941e-05, + "loss": 0.4872, + "step": 1548 + }, + { + "epoch": 0.27, + "learning_rate": 1.7167619169426996e-05, + "loss": 0.4735, + "step": 1549 + }, + { + "epoch": 0.27, + "learning_rate": 1.7163704698632772e-05, + "loss": 0.4869, + "step": 1550 + }, + { + "epoch": 0.27, + "learning_rate": 1.7159787971669586e-05, + "loss": 0.4725, + "step": 1551 + }, + { + "epoch": 0.27, + "learning_rate": 1.7155868989770984e-05, + "loss": 0.4903, + "step": 1552 + }, + { + "epoch": 0.27, + "learning_rate": 1.715194775417123e-05, + "loss": 0.4845, + "step": 1553 + }, + { + "epoch": 0.27, + "learning_rate": 1.71480242661053e-05, + "loss": 0.4637, + "step": 1554 + }, + { + "epoch": 0.27, + "learning_rate": 1.7144098526808867e-05, + "loss": 0.4855, + "step": 1555 + }, + { + "epoch": 0.27, + "learning_rate": 1.7140170537518327e-05, + "loss": 0.474, + "step": 1556 + }, + { + "epoch": 0.27, + "learning_rate": 1.7136240299470772e-05, + "loss": 0.4753, + "step": 1557 + }, + { + "epoch": 0.27, + "learning_rate": 1.7132307813904016e-05, + "loss": 0.4766, + "step": 1558 + }, + { + "epoch": 0.27, + "learning_rate": 1.7128373082056567e-05, + "loss": 0.4801, + "step": 1559 + }, + { + "epoch": 0.27, + "learning_rate": 1.712443610516765e-05, + "loss": 0.472, + "step": 1560 + }, + { + "epoch": 0.27, + "learning_rate": 1.7120496884477196e-05, + "loss": 0.4802, + "step": 1561 + }, + { + "epoch": 0.27, + "learning_rate": 1.7116555421225837e-05, + "loss": 0.4882, + "step": 1562 + }, + { + "epoch": 0.27, + "learning_rate": 1.7112611716654918e-05, + "loss": 0.4937, + "step": 1563 + }, + { + "epoch": 0.27, + "learning_rate": 1.710866577200648e-05, + "loss": 0.486, + "step": 1564 + }, + { + "epoch": 0.27, + "learning_rate": 1.7104717588523285e-05, + "loss": 0.4797, + "step": 1565 + }, + { + "epoch": 0.27, + "learning_rate": 1.710076716744879e-05, + "loss": 0.4787, + "step": 1566 + }, + { + "epoch": 0.27, + "learning_rate": 1.709681451002715e-05, + "loss": 0.4634, + "step": 1567 + }, + { + "epoch": 0.27, + "learning_rate": 1.7092859617503242e-05, + "loss": 0.495, + "step": 1568 + }, + { + "epoch": 0.27, + "learning_rate": 1.7088902491122636e-05, + "loss": 0.4658, + "step": 1569 + }, + { + "epoch": 0.27, + "learning_rate": 1.7084943132131604e-05, + "loss": 0.4963, + "step": 1570 + }, + { + "epoch": 0.27, + "learning_rate": 1.7080981541777123e-05, + "loss": 0.4947, + "step": 1571 + }, + { + "epoch": 0.27, + "learning_rate": 1.7077017721306877e-05, + "loss": 0.488, + "step": 1572 + }, + { + "epoch": 0.27, + "learning_rate": 1.707305167196925e-05, + "loss": 0.487, + "step": 1573 + }, + { + "epoch": 0.27, + "learning_rate": 1.7069083395013323e-05, + "loss": 0.4934, + "step": 1574 + }, + { + "epoch": 0.27, + "learning_rate": 1.7065112891688883e-05, + "loss": 0.4877, + "step": 1575 + }, + { + "epoch": 0.27, + "learning_rate": 1.706114016324642e-05, + "loss": 0.4946, + "step": 1576 + }, + { + "epoch": 0.27, + "learning_rate": 1.7057165210937124e-05, + "loss": 0.4809, + "step": 1577 + }, + { + "epoch": 0.27, + "learning_rate": 1.7053188036012885e-05, + "loss": 0.4978, + "step": 1578 + }, + { + "epoch": 0.27, + "learning_rate": 1.704920863972629e-05, + "loss": 0.4744, + "step": 1579 + }, + { + "epoch": 0.27, + "learning_rate": 1.704522702333063e-05, + "loss": 0.4862, + "step": 1580 + }, + { + "epoch": 0.27, + "learning_rate": 1.7041243188079884e-05, + "loss": 0.4763, + "step": 1581 + }, + { + "epoch": 0.27, + "learning_rate": 1.7037257135228745e-05, + "loss": 0.4792, + "step": 1582 + }, + { + "epoch": 0.27, + "learning_rate": 1.7033268866032605e-05, + "loss": 0.4759, + "step": 1583 + }, + { + "epoch": 0.27, + "learning_rate": 1.7029278381747537e-05, + "loss": 0.4778, + "step": 1584 + }, + { + "epoch": 0.27, + "learning_rate": 1.7025285683630324e-05, + "loss": 0.4713, + "step": 1585 + }, + { + "epoch": 0.27, + "learning_rate": 1.7021290772938447e-05, + "loss": 0.4769, + "step": 1586 + }, + { + "epoch": 0.27, + "learning_rate": 1.7017293650930083e-05, + "loss": 0.4747, + "step": 1587 + }, + { + "epoch": 0.28, + "learning_rate": 1.7013294318864095e-05, + "loss": 0.4903, + "step": 1588 + }, + { + "epoch": 0.28, + "learning_rate": 1.7009292778000058e-05, + "loss": 0.4922, + "step": 1589 + }, + { + "epoch": 0.28, + "learning_rate": 1.7005289029598233e-05, + "loss": 0.4684, + "step": 1590 + }, + { + "epoch": 0.28, + "learning_rate": 1.7001283074919576e-05, + "loss": 0.4811, + "step": 1591 + }, + { + "epoch": 0.28, + "learning_rate": 1.699727491522574e-05, + "loss": 0.4888, + "step": 1592 + }, + { + "epoch": 0.28, + "learning_rate": 1.699326455177908e-05, + "loss": 0.4798, + "step": 1593 + }, + { + "epoch": 0.28, + "learning_rate": 1.698925198584263e-05, + "loss": 0.485, + "step": 1594 + }, + { + "epoch": 0.28, + "learning_rate": 1.6985237218680125e-05, + "loss": 0.4776, + "step": 1595 + }, + { + "epoch": 0.28, + "learning_rate": 1.6981220251555996e-05, + "loss": 0.4769, + "step": 1596 + }, + { + "epoch": 0.28, + "learning_rate": 1.6977201085735367e-05, + "loss": 0.477, + "step": 1597 + }, + { + "epoch": 0.28, + "learning_rate": 1.6973179722484048e-05, + "loss": 0.4779, + "step": 1598 + }, + { + "epoch": 0.28, + "learning_rate": 1.6969156163068547e-05, + "loss": 0.4849, + "step": 1599 + }, + { + "epoch": 0.28, + "learning_rate": 1.696513040875606e-05, + "loss": 0.4938, + "step": 1600 + }, + { + "epoch": 0.28, + "learning_rate": 1.696110246081448e-05, + "loss": 0.486, + "step": 1601 + }, + { + "epoch": 0.28, + "learning_rate": 1.695707232051238e-05, + "loss": 0.479, + "step": 1602 + }, + { + "epoch": 0.28, + "learning_rate": 1.6953039989119036e-05, + "loss": 0.488, + "step": 1603 + }, + { + "epoch": 0.28, + "learning_rate": 1.6949005467904405e-05, + "loss": 0.4918, + "step": 1604 + }, + { + "epoch": 0.28, + "learning_rate": 1.6944968758139144e-05, + "loss": 0.4734, + "step": 1605 + }, + { + "epoch": 0.28, + "learning_rate": 1.694092986109458e-05, + "loss": 0.4876, + "step": 1606 + }, + { + "epoch": 0.28, + "learning_rate": 1.693688877804275e-05, + "loss": 0.4866, + "step": 1607 + }, + { + "epoch": 0.28, + "learning_rate": 1.693284551025637e-05, + "loss": 0.4785, + "step": 1608 + }, + { + "epoch": 0.28, + "learning_rate": 1.6928800059008845e-05, + "loss": 0.4798, + "step": 1609 + }, + { + "epoch": 0.28, + "learning_rate": 1.6924752425574262e-05, + "loss": 0.4768, + "step": 1610 + }, + { + "epoch": 0.28, + "learning_rate": 1.6920702611227405e-05, + "loss": 0.4744, + "step": 1611 + }, + { + "epoch": 0.28, + "learning_rate": 1.691665061724374e-05, + "loss": 0.4773, + "step": 1612 + }, + { + "epoch": 0.28, + "learning_rate": 1.691259644489942e-05, + "loss": 0.4709, + "step": 1613 + }, + { + "epoch": 0.28, + "learning_rate": 1.6908540095471288e-05, + "loss": 0.4761, + "step": 1614 + }, + { + "epoch": 0.28, + "learning_rate": 1.690448157023686e-05, + "loss": 0.493, + "step": 1615 + }, + { + "epoch": 0.28, + "learning_rate": 1.6900420870474347e-05, + "loss": 0.4955, + "step": 1616 + }, + { + "epoch": 0.28, + "learning_rate": 1.6896357997462653e-05, + "loss": 0.4714, + "step": 1617 + }, + { + "epoch": 0.28, + "learning_rate": 1.6892292952481352e-05, + "loss": 0.4854, + "step": 1618 + }, + { + "epoch": 0.28, + "learning_rate": 1.6888225736810705e-05, + "loss": 0.472, + "step": 1619 + }, + { + "epoch": 0.28, + "learning_rate": 1.688415635173166e-05, + "loss": 0.4962, + "step": 1620 + }, + { + "epoch": 0.28, + "learning_rate": 1.6880084798525848e-05, + "loss": 0.4751, + "step": 1621 + }, + { + "epoch": 0.28, + "learning_rate": 1.6876011078475586e-05, + "loss": 0.4819, + "step": 1622 + }, + { + "epoch": 0.28, + "learning_rate": 1.6871935192863862e-05, + "loss": 0.4739, + "step": 1623 + }, + { + "epoch": 0.28, + "learning_rate": 1.6867857142974354e-05, + "loss": 0.4955, + "step": 1624 + }, + { + "epoch": 0.28, + "learning_rate": 1.686377693009143e-05, + "loss": 0.4622, + "step": 1625 + }, + { + "epoch": 0.28, + "learning_rate": 1.6859694555500125e-05, + "loss": 0.4855, + "step": 1626 + }, + { + "epoch": 0.28, + "learning_rate": 1.685561002048616e-05, + "loss": 0.4688, + "step": 1627 + }, + { + "epoch": 0.28, + "learning_rate": 1.6851523326335932e-05, + "loss": 0.4835, + "step": 1628 + }, + { + "epoch": 0.28, + "learning_rate": 1.684743447433653e-05, + "loss": 0.4752, + "step": 1629 + }, + { + "epoch": 0.28, + "learning_rate": 1.684334346577571e-05, + "loss": 0.4714, + "step": 1630 + }, + { + "epoch": 0.28, + "learning_rate": 1.6839250301941912e-05, + "loss": 0.494, + "step": 1631 + }, + { + "epoch": 0.28, + "learning_rate": 1.6835154984124266e-05, + "loss": 0.4731, + "step": 1632 + }, + { + "epoch": 0.28, + "learning_rate": 1.6831057513612554e-05, + "loss": 0.4931, + "step": 1633 + }, + { + "epoch": 0.28, + "learning_rate": 1.682695789169726e-05, + "loss": 0.4652, + "step": 1634 + }, + { + "epoch": 0.28, + "learning_rate": 1.682285611966954e-05, + "loss": 0.4901, + "step": 1635 + }, + { + "epoch": 0.28, + "learning_rate": 1.681875219882122e-05, + "loss": 0.4747, + "step": 1636 + }, + { + "epoch": 0.28, + "learning_rate": 1.6814646130444804e-05, + "loss": 0.4875, + "step": 1637 + }, + { + "epoch": 0.28, + "learning_rate": 1.681053791583348e-05, + "loss": 0.4651, + "step": 1638 + }, + { + "epoch": 0.28, + "learning_rate": 1.6806427556281105e-05, + "loss": 0.493, + "step": 1639 + }, + { + "epoch": 0.28, + "learning_rate": 1.6802315053082218e-05, + "loss": 0.4864, + "step": 1640 + }, + { + "epoch": 0.28, + "learning_rate": 1.6798200407532025e-05, + "loss": 0.4772, + "step": 1641 + }, + { + "epoch": 0.28, + "learning_rate": 1.6794083620926412e-05, + "loss": 0.4849, + "step": 1642 + }, + { + "epoch": 0.28, + "learning_rate": 1.6789964694561936e-05, + "loss": 0.4909, + "step": 1643 + }, + { + "epoch": 0.28, + "learning_rate": 1.6785843629735832e-05, + "loss": 0.4645, + "step": 1644 + }, + { + "epoch": 0.28, + "learning_rate": 1.6781720427746008e-05, + "loss": 0.488, + "step": 1645 + }, + { + "epoch": 0.29, + "learning_rate": 1.677759508989104e-05, + "loss": 0.4667, + "step": 1646 + }, + { + "epoch": 0.29, + "learning_rate": 1.6773467617470184e-05, + "loss": 0.4853, + "step": 1647 + }, + { + "epoch": 0.29, + "learning_rate": 1.6769338011783363e-05, + "loss": 0.478, + "step": 1648 + }, + { + "epoch": 0.29, + "learning_rate": 1.676520627413117e-05, + "loss": 0.4832, + "step": 1649 + }, + { + "epoch": 0.29, + "learning_rate": 1.676107240581488e-05, + "loss": 0.4763, + "step": 1650 + }, + { + "epoch": 0.29, + "learning_rate": 1.6756936408136423e-05, + "loss": 0.488, + "step": 1651 + }, + { + "epoch": 0.29, + "learning_rate": 1.6752798282398414e-05, + "loss": 0.472, + "step": 1652 + }, + { + "epoch": 0.29, + "learning_rate": 1.6748658029904132e-05, + "loss": 0.4785, + "step": 1653 + }, + { + "epoch": 0.29, + "learning_rate": 1.6744515651957525e-05, + "loss": 0.4887, + "step": 1654 + }, + { + "epoch": 0.29, + "learning_rate": 1.6740371149863212e-05, + "loss": 0.4847, + "step": 1655 + }, + { + "epoch": 0.29, + "learning_rate": 1.6736224524926487e-05, + "loss": 0.4734, + "step": 1656 + }, + { + "epoch": 0.29, + "learning_rate": 1.6732075778453298e-05, + "loss": 0.491, + "step": 1657 + }, + { + "epoch": 0.29, + "learning_rate": 1.6727924911750274e-05, + "loss": 0.4717, + "step": 1658 + }, + { + "epoch": 0.29, + "learning_rate": 1.6723771926124704e-05, + "loss": 0.4911, + "step": 1659 + }, + { + "epoch": 0.29, + "learning_rate": 1.6719616822884555e-05, + "loss": 0.4733, + "step": 1660 + }, + { + "epoch": 0.29, + "learning_rate": 1.6715459603338445e-05, + "loss": 0.4723, + "step": 1661 + }, + { + "epoch": 0.29, + "learning_rate": 1.6711300268795674e-05, + "loss": 0.4768, + "step": 1662 + }, + { + "epoch": 0.29, + "learning_rate": 1.6707138820566195e-05, + "loss": 0.4821, + "step": 1663 + }, + { + "epoch": 0.29, + "learning_rate": 1.670297525996064e-05, + "loss": 0.4755, + "step": 1664 + }, + { + "epoch": 0.29, + "learning_rate": 1.6698809588290292e-05, + "loss": 0.4733, + "step": 1665 + }, + { + "epoch": 0.29, + "learning_rate": 1.6694641806867112e-05, + "loss": 0.489, + "step": 1666 + }, + { + "epoch": 0.29, + "learning_rate": 1.6690471917003716e-05, + "loss": 0.4801, + "step": 1667 + }, + { + "epoch": 0.29, + "learning_rate": 1.6686299920013388e-05, + "loss": 0.484, + "step": 1668 + }, + { + "epoch": 0.29, + "learning_rate": 1.668212581721008e-05, + "loss": 0.4744, + "step": 1669 + }, + { + "epoch": 0.29, + "learning_rate": 1.6677949609908394e-05, + "loss": 0.4864, + "step": 1670 + }, + { + "epoch": 0.29, + "learning_rate": 1.6673771299423613e-05, + "loss": 0.4845, + "step": 1671 + }, + { + "epoch": 0.29, + "learning_rate": 1.666959088707166e-05, + "loss": 0.4727, + "step": 1672 + }, + { + "epoch": 0.29, + "learning_rate": 1.6665408374169144e-05, + "loss": 0.5003, + "step": 1673 + }, + { + "epoch": 0.29, + "learning_rate": 1.666122376203332e-05, + "loss": 0.4625, + "step": 1674 + }, + { + "epoch": 0.29, + "learning_rate": 1.665703705198211e-05, + "loss": 0.4741, + "step": 1675 + }, + { + "epoch": 0.29, + "learning_rate": 1.6652848245334097e-05, + "loss": 0.473, + "step": 1676 + }, + { + "epoch": 0.29, + "learning_rate": 1.6648657343408517e-05, + "loss": 0.4817, + "step": 1677 + }, + { + "epoch": 0.29, + "learning_rate": 1.6644464347525273e-05, + "loss": 0.4679, + "step": 1678 + }, + { + "epoch": 0.29, + "learning_rate": 1.664026925900492e-05, + "loss": 0.491, + "step": 1679 + }, + { + "epoch": 0.29, + "learning_rate": 1.663607207916869e-05, + "loss": 0.4781, + "step": 1680 + }, + { + "epoch": 0.29, + "learning_rate": 1.6631872809338456e-05, + "loss": 0.4882, + "step": 1681 + }, + { + "epoch": 0.29, + "learning_rate": 1.6627671450836753e-05, + "loss": 0.4775, + "step": 1682 + }, + { + "epoch": 0.29, + "learning_rate": 1.6623468004986774e-05, + "loss": 0.4884, + "step": 1683 + }, + { + "epoch": 0.29, + "learning_rate": 1.661926247311238e-05, + "loss": 0.4808, + "step": 1684 + }, + { + "epoch": 0.29, + "learning_rate": 1.6615054856538067e-05, + "loss": 0.4874, + "step": 1685 + }, + { + "epoch": 0.29, + "learning_rate": 1.661084515658901e-05, + "loss": 0.4707, + "step": 1686 + }, + { + "epoch": 0.29, + "learning_rate": 1.6606633374591022e-05, + "loss": 0.4841, + "step": 1687 + }, + { + "epoch": 0.29, + "learning_rate": 1.660241951187059e-05, + "loss": 0.4763, + "step": 1688 + }, + { + "epoch": 0.29, + "learning_rate": 1.6598203569754843e-05, + "loss": 0.4839, + "step": 1689 + }, + { + "epoch": 0.29, + "learning_rate": 1.6593985549571568e-05, + "loss": 0.4783, + "step": 1690 + }, + { + "epoch": 0.29, + "learning_rate": 1.6589765452649205e-05, + "loss": 0.4928, + "step": 1691 + }, + { + "epoch": 0.29, + "learning_rate": 1.6585543280316853e-05, + "loss": 0.4722, + "step": 1692 + }, + { + "epoch": 0.29, + "learning_rate": 1.658131903390426e-05, + "loss": 0.4848, + "step": 1693 + }, + { + "epoch": 0.29, + "learning_rate": 1.657709271474183e-05, + "loss": 0.4727, + "step": 1694 + }, + { + "epoch": 0.29, + "learning_rate": 1.6572864324160617e-05, + "loss": 0.4902, + "step": 1695 + }, + { + "epoch": 0.29, + "learning_rate": 1.6568633863492332e-05, + "loss": 0.4872, + "step": 1696 + }, + { + "epoch": 0.29, + "learning_rate": 1.6564401334069333e-05, + "loss": 0.4794, + "step": 1697 + }, + { + "epoch": 0.29, + "learning_rate": 1.656016673722463e-05, + "loss": 0.4731, + "step": 1698 + }, + { + "epoch": 0.29, + "learning_rate": 1.655593007429189e-05, + "loss": 0.4825, + "step": 1699 + }, + { + "epoch": 0.29, + "learning_rate": 1.6551691346605426e-05, + "loss": 0.4699, + "step": 1700 + }, + { + "epoch": 0.29, + "learning_rate": 1.65474505555002e-05, + "loss": 0.4846, + "step": 1701 + }, + { + "epoch": 0.29, + "learning_rate": 1.6543207702311822e-05, + "loss": 0.4806, + "step": 1702 + }, + { + "epoch": 0.3, + "learning_rate": 1.6538962788376557e-05, + "loss": 0.4719, + "step": 1703 + }, + { + "epoch": 0.3, + "learning_rate": 1.6534715815031325e-05, + "loss": 0.4718, + "step": 1704 + }, + { + "epoch": 0.3, + "learning_rate": 1.6530466783613674e-05, + "loss": 0.4824, + "step": 1705 + }, + { + "epoch": 0.3, + "learning_rate": 1.652621569546182e-05, + "loss": 0.4716, + "step": 1706 + }, + { + "epoch": 0.3, + "learning_rate": 1.652196255191462e-05, + "loss": 0.4657, + "step": 1707 + }, + { + "epoch": 0.3, + "learning_rate": 1.651770735431158e-05, + "loss": 0.482, + "step": 1708 + }, + { + "epoch": 0.3, + "learning_rate": 1.6513450103992844e-05, + "loss": 0.4808, + "step": 1709 + }, + { + "epoch": 0.3, + "learning_rate": 1.650919080229921e-05, + "loss": 0.4923, + "step": 1710 + }, + { + "epoch": 0.3, + "learning_rate": 1.650492945057213e-05, + "loss": 0.495, + "step": 1711 + }, + { + "epoch": 0.3, + "learning_rate": 1.6500666050153685e-05, + "loss": 0.4701, + "step": 1712 + }, + { + "epoch": 0.3, + "learning_rate": 1.649640060238661e-05, + "loss": 0.4815, + "step": 1713 + }, + { + "epoch": 0.3, + "learning_rate": 1.6492133108614284e-05, + "loss": 0.472, + "step": 1714 + }, + { + "epoch": 0.3, + "learning_rate": 1.6487863570180734e-05, + "loss": 0.4886, + "step": 1715 + }, + { + "epoch": 0.3, + "learning_rate": 1.6483591988430625e-05, + "loss": 0.4786, + "step": 1716 + }, + { + "epoch": 0.3, + "learning_rate": 1.6479318364709266e-05, + "loss": 0.4679, + "step": 1717 + }, + { + "epoch": 0.3, + "learning_rate": 1.647504270036262e-05, + "loss": 0.4775, + "step": 1718 + }, + { + "epoch": 0.3, + "learning_rate": 1.647076499673727e-05, + "loss": 0.4839, + "step": 1719 + }, + { + "epoch": 0.3, + "learning_rate": 1.6466485255180464e-05, + "loss": 0.4751, + "step": 1720 + }, + { + "epoch": 0.3, + "learning_rate": 1.646220347704008e-05, + "loss": 0.4819, + "step": 1721 + }, + { + "epoch": 0.3, + "learning_rate": 1.645791966366464e-05, + "loss": 0.4794, + "step": 1722 + }, + { + "epoch": 0.3, + "learning_rate": 1.6453633816403312e-05, + "loss": 0.481, + "step": 1723 + }, + { + "epoch": 0.3, + "learning_rate": 1.6449345936605894e-05, + "loss": 0.4795, + "step": 1724 + }, + { + "epoch": 0.3, + "learning_rate": 1.644505602562283e-05, + "loss": 0.4851, + "step": 1725 + }, + { + "epoch": 0.3, + "learning_rate": 1.6440764084805208e-05, + "loss": 0.4598, + "step": 1726 + }, + { + "epoch": 0.3, + "learning_rate": 1.6436470115504745e-05, + "loss": 0.4824, + "step": 1727 + }, + { + "epoch": 0.3, + "learning_rate": 1.643217411907381e-05, + "loss": 0.4777, + "step": 1728 + }, + { + "epoch": 0.3, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.4871, + "step": 1729 + }, + { + "epoch": 0.3, + "learning_rate": 1.6423576050233144e-05, + "loss": 0.4854, + "step": 1730 + }, + { + "epoch": 0.3, + "learning_rate": 1.6419273980531333e-05, + "loss": 0.4809, + "step": 1731 + }, + { + "epoch": 0.3, + "learning_rate": 1.6414969889114872e-05, + "loss": 0.4785, + "step": 1732 + }, + { + "epoch": 0.3, + "learning_rate": 1.641066377733931e-05, + "loss": 0.4757, + "step": 1733 + }, + { + "epoch": 0.3, + "learning_rate": 1.6406355646560838e-05, + "loss": 0.4696, + "step": 1734 + }, + { + "epoch": 0.3, + "learning_rate": 1.640204549813627e-05, + "loss": 0.493, + "step": 1735 + }, + { + "epoch": 0.3, + "learning_rate": 1.6397733333423072e-05, + "loss": 0.4724, + "step": 1736 + }, + { + "epoch": 0.3, + "learning_rate": 1.639341915377933e-05, + "loss": 0.4714, + "step": 1737 + }, + { + "epoch": 0.3, + "learning_rate": 1.6389102960563776e-05, + "loss": 0.4763, + "step": 1738 + }, + { + "epoch": 0.3, + "learning_rate": 1.6384784755135767e-05, + "loss": 0.4857, + "step": 1739 + }, + { + "epoch": 0.3, + "learning_rate": 1.63804645388553e-05, + "loss": 0.4857, + "step": 1740 + }, + { + "epoch": 0.3, + "learning_rate": 1.6376142313083e-05, + "loss": 0.4676, + "step": 1741 + }, + { + "epoch": 0.3, + "learning_rate": 1.6371818079180126e-05, + "loss": 0.4663, + "step": 1742 + }, + { + "epoch": 0.3, + "learning_rate": 1.636749183850858e-05, + "loss": 0.4854, + "step": 1743 + }, + { + "epoch": 0.3, + "learning_rate": 1.636316359243088e-05, + "loss": 0.475, + "step": 1744 + }, + { + "epoch": 0.3, + "learning_rate": 1.6358833342310192e-05, + "loss": 0.4826, + "step": 1745 + }, + { + "epoch": 0.3, + "learning_rate": 1.635450108951029e-05, + "loss": 0.4843, + "step": 1746 + }, + { + "epoch": 0.3, + "learning_rate": 1.6350166835395607e-05, + "loss": 0.4876, + "step": 1747 + }, + { + "epoch": 0.3, + "learning_rate": 1.6345830581331187e-05, + "loss": 0.4754, + "step": 1748 + }, + { + "epoch": 0.3, + "learning_rate": 1.6341492328682703e-05, + "loss": 0.4711, + "step": 1749 + }, + { + "epoch": 0.3, + "learning_rate": 1.6337152078816475e-05, + "loss": 0.4811, + "step": 1750 + }, + { + "epoch": 0.3, + "learning_rate": 1.633280983309943e-05, + "loss": 0.4787, + "step": 1751 + }, + { + "epoch": 0.3, + "learning_rate": 1.6328465592899142e-05, + "loss": 0.489, + "step": 1752 + }, + { + "epoch": 0.3, + "learning_rate": 1.63241193595838e-05, + "loss": 0.4747, + "step": 1753 + }, + { + "epoch": 0.3, + "learning_rate": 1.631977113452223e-05, + "loss": 0.4961, + "step": 1754 + }, + { + "epoch": 0.3, + "learning_rate": 1.631542091908387e-05, + "loss": 0.4681, + "step": 1755 + }, + { + "epoch": 0.3, + "learning_rate": 1.6311068714638817e-05, + "loss": 0.4746, + "step": 1756 + }, + { + "epoch": 0.3, + "learning_rate": 1.6306714522557755e-05, + "loss": 0.4848, + "step": 1757 + }, + { + "epoch": 0.3, + "learning_rate": 1.6302358344212025e-05, + "loss": 0.4704, + "step": 1758 + }, + { + "epoch": 0.3, + "learning_rate": 1.6298000180973572e-05, + "loss": 0.5019, + "step": 1759 + }, + { + "epoch": 0.3, + "learning_rate": 1.629364003421498e-05, + "loss": 0.4716, + "step": 1760 + }, + { + "epoch": 0.31, + "learning_rate": 1.628927790530945e-05, + "loss": 0.4922, + "step": 1761 + }, + { + "epoch": 0.31, + "learning_rate": 1.6284913795630813e-05, + "loss": 0.4904, + "step": 1762 + }, + { + "epoch": 0.31, + "learning_rate": 1.6280547706553525e-05, + "loss": 0.4808, + "step": 1763 + }, + { + "epoch": 0.31, + "learning_rate": 1.6276179639452654e-05, + "loss": 0.4619, + "step": 1764 + }, + { + "epoch": 0.31, + "learning_rate": 1.62718095957039e-05, + "loss": 0.4895, + "step": 1765 + }, + { + "epoch": 0.31, + "learning_rate": 1.6267437576683585e-05, + "loss": 0.4717, + "step": 1766 + }, + { + "epoch": 0.31, + "learning_rate": 1.6263063583768652e-05, + "loss": 0.4779, + "step": 1767 + }, + { + "epoch": 0.31, + "learning_rate": 1.625868761833667e-05, + "loss": 0.4694, + "step": 1768 + }, + { + "epoch": 0.31, + "learning_rate": 1.6254309681765814e-05, + "loss": 0.4825, + "step": 1769 + }, + { + "epoch": 0.31, + "learning_rate": 1.6249929775434903e-05, + "loss": 0.4637, + "step": 1770 + }, + { + "epoch": 0.31, + "learning_rate": 1.624554790072336e-05, + "loss": 0.4858, + "step": 1771 + }, + { + "epoch": 0.31, + "learning_rate": 1.6241164059011228e-05, + "loss": 0.4688, + "step": 1772 + }, + { + "epoch": 0.31, + "learning_rate": 1.6236778251679177e-05, + "loss": 0.4832, + "step": 1773 + }, + { + "epoch": 0.31, + "learning_rate": 1.6232390480108493e-05, + "loss": 0.4774, + "step": 1774 + }, + { + "epoch": 0.31, + "learning_rate": 1.6228000745681082e-05, + "loss": 0.4782, + "step": 1775 + }, + { + "epoch": 0.31, + "learning_rate": 1.622360904977946e-05, + "loss": 0.4663, + "step": 1776 + }, + { + "epoch": 0.31, + "learning_rate": 1.6219215393786772e-05, + "loss": 0.4862, + "step": 1777 + }, + { + "epoch": 0.31, + "learning_rate": 1.6214819779086774e-05, + "loss": 0.4758, + "step": 1778 + }, + { + "epoch": 0.31, + "learning_rate": 1.621042220706384e-05, + "loss": 0.4686, + "step": 1779 + }, + { + "epoch": 0.31, + "learning_rate": 1.6206022679102967e-05, + "loss": 0.4765, + "step": 1780 + }, + { + "epoch": 0.31, + "learning_rate": 1.6201621196589755e-05, + "loss": 0.4844, + "step": 1781 + }, + { + "epoch": 0.31, + "learning_rate": 1.6197217760910426e-05, + "loss": 0.4676, + "step": 1782 + }, + { + "epoch": 0.31, + "learning_rate": 1.619281237345182e-05, + "loss": 0.4866, + "step": 1783 + }, + { + "epoch": 0.31, + "learning_rate": 1.618840503560139e-05, + "loss": 0.4763, + "step": 1784 + }, + { + "epoch": 0.31, + "learning_rate": 1.6183995748747204e-05, + "loss": 0.4792, + "step": 1785 + }, + { + "epoch": 0.31, + "learning_rate": 1.6179584514277937e-05, + "loss": 0.4864, + "step": 1786 + }, + { + "epoch": 0.31, + "learning_rate": 1.6175171333582887e-05, + "loss": 0.4832, + "step": 1787 + }, + { + "epoch": 0.31, + "learning_rate": 1.617075620805196e-05, + "loss": 0.4923, + "step": 1788 + }, + { + "epoch": 0.31, + "learning_rate": 1.6166339139075676e-05, + "loss": 0.461, + "step": 1789 + }, + { + "epoch": 0.31, + "learning_rate": 1.616192012804516e-05, + "loss": 0.4977, + "step": 1790 + }, + { + "epoch": 0.31, + "learning_rate": 1.6157499176352164e-05, + "loss": 0.4786, + "step": 1791 + }, + { + "epoch": 0.31, + "learning_rate": 1.6153076285389036e-05, + "loss": 0.47, + "step": 1792 + }, + { + "epoch": 0.31, + "learning_rate": 1.614865145654875e-05, + "loss": 0.4768, + "step": 1793 + }, + { + "epoch": 0.31, + "learning_rate": 1.6144224691224868e-05, + "loss": 0.4867, + "step": 1794 + }, + { + "epoch": 0.31, + "learning_rate": 1.6139795990811583e-05, + "loss": 0.4725, + "step": 1795 + }, + { + "epoch": 0.31, + "learning_rate": 1.613536535670369e-05, + "loss": 0.4866, + "step": 1796 + }, + { + "epoch": 0.31, + "learning_rate": 1.6130932790296586e-05, + "loss": 0.475, + "step": 1797 + }, + { + "epoch": 0.31, + "learning_rate": 1.612649829298629e-05, + "loss": 0.4841, + "step": 1798 + }, + { + "epoch": 0.31, + "learning_rate": 1.612206186616942e-05, + "loss": 0.4907, + "step": 1799 + }, + { + "epoch": 0.31, + "learning_rate": 1.6117623511243204e-05, + "loss": 0.4836, + "step": 1800 + }, + { + "epoch": 0.31, + "learning_rate": 1.611318322960548e-05, + "loss": 0.4662, + "step": 1801 + }, + { + "epoch": 0.31, + "learning_rate": 1.6108741022654685e-05, + "loss": 0.4805, + "step": 1802 + }, + { + "epoch": 0.31, + "learning_rate": 1.6104296891789867e-05, + "loss": 0.4682, + "step": 1803 + }, + { + "epoch": 0.31, + "learning_rate": 1.6099850838410685e-05, + "loss": 0.4861, + "step": 1804 + }, + { + "epoch": 0.31, + "learning_rate": 1.6095402863917398e-05, + "loss": 0.4799, + "step": 1805 + }, + { + "epoch": 0.31, + "learning_rate": 1.6090952969710868e-05, + "loss": 0.4819, + "step": 1806 + }, + { + "epoch": 0.31, + "learning_rate": 1.608650115719257e-05, + "loss": 0.4732, + "step": 1807 + }, + { + "epoch": 0.31, + "learning_rate": 1.6082047427764572e-05, + "loss": 0.4735, + "step": 1808 + }, + { + "epoch": 0.31, + "learning_rate": 1.607759178282955e-05, + "loss": 0.4878, + "step": 1809 + }, + { + "epoch": 0.31, + "learning_rate": 1.607313422379079e-05, + "loss": 0.4873, + "step": 1810 + }, + { + "epoch": 0.31, + "learning_rate": 1.6068674752052168e-05, + "loss": 0.4742, + "step": 1811 + }, + { + "epoch": 0.31, + "learning_rate": 1.606421336901818e-05, + "loss": 0.4871, + "step": 1812 + }, + { + "epoch": 0.31, + "learning_rate": 1.605975007609391e-05, + "loss": 0.4788, + "step": 1813 + }, + { + "epoch": 0.31, + "learning_rate": 1.605528487468504e-05, + "loss": 0.4738, + "step": 1814 + }, + { + "epoch": 0.31, + "learning_rate": 1.605081776619787e-05, + "loss": 0.4719, + "step": 1815 + }, + { + "epoch": 0.31, + "learning_rate": 1.604634875203929e-05, + "loss": 0.4735, + "step": 1816 + }, + { + "epoch": 0.31, + "learning_rate": 1.6041877833616782e-05, + "loss": 0.4733, + "step": 1817 + }, + { + "epoch": 0.31, + "learning_rate": 1.6037405012338448e-05, + "loss": 0.4873, + "step": 1818 + }, + { + "epoch": 0.32, + "learning_rate": 1.6032930289612974e-05, + "loss": 0.4684, + "step": 1819 + }, + { + "epoch": 0.32, + "learning_rate": 1.6028453666849645e-05, + "loss": 0.4884, + "step": 1820 + }, + { + "epoch": 0.32, + "learning_rate": 1.6023975145458352e-05, + "loss": 0.4753, + "step": 1821 + }, + { + "epoch": 0.32, + "learning_rate": 1.6019494726849582e-05, + "loss": 0.4745, + "step": 1822 + }, + { + "epoch": 0.32, + "learning_rate": 1.6015012412434417e-05, + "loss": 0.4688, + "step": 1823 + }, + { + "epoch": 0.32, + "learning_rate": 1.6010528203624537e-05, + "loss": 0.4771, + "step": 1824 + }, + { + "epoch": 0.32, + "learning_rate": 1.6006042101832212e-05, + "loss": 0.4708, + "step": 1825 + }, + { + "epoch": 0.32, + "learning_rate": 1.6001554108470325e-05, + "loss": 0.4742, + "step": 1826 + }, + { + "epoch": 0.32, + "learning_rate": 1.5997064224952345e-05, + "loss": 0.4749, + "step": 1827 + }, + { + "epoch": 0.32, + "learning_rate": 1.5992572452692324e-05, + "loss": 0.4973, + "step": 1828 + }, + { + "epoch": 0.32, + "learning_rate": 1.598807879310493e-05, + "loss": 0.4844, + "step": 1829 + }, + { + "epoch": 0.32, + "learning_rate": 1.5983583247605414e-05, + "loss": 0.4757, + "step": 1830 + }, + { + "epoch": 0.32, + "learning_rate": 1.5979085817609625e-05, + "loss": 0.4706, + "step": 1831 + }, + { + "epoch": 0.32, + "learning_rate": 1.5974586504534e-05, + "loss": 0.4778, + "step": 1832 + }, + { + "epoch": 0.32, + "learning_rate": 1.5970085309795572e-05, + "loss": 0.4701, + "step": 1833 + }, + { + "epoch": 0.32, + "learning_rate": 1.5965582234811972e-05, + "loss": 0.4736, + "step": 1834 + }, + { + "epoch": 0.32, + "learning_rate": 1.5961077281001418e-05, + "loss": 0.4801, + "step": 1835 + }, + { + "epoch": 0.32, + "learning_rate": 1.5956570449782715e-05, + "loss": 0.4773, + "step": 1836 + }, + { + "epoch": 0.32, + "learning_rate": 1.5952061742575268e-05, + "loss": 0.463, + "step": 1837 + }, + { + "epoch": 0.32, + "learning_rate": 1.594755116079907e-05, + "loss": 0.4813, + "step": 1838 + }, + { + "epoch": 0.32, + "learning_rate": 1.5943038705874697e-05, + "loss": 0.4776, + "step": 1839 + }, + { + "epoch": 0.32, + "learning_rate": 1.593852437922333e-05, + "loss": 0.4953, + "step": 1840 + }, + { + "epoch": 0.32, + "learning_rate": 1.593400818226673e-05, + "loss": 0.4605, + "step": 1841 + }, + { + "epoch": 0.32, + "learning_rate": 1.5929490116427247e-05, + "loss": 0.4966, + "step": 1842 + }, + { + "epoch": 0.32, + "learning_rate": 1.592497018312782e-05, + "loss": 0.4885, + "step": 1843 + }, + { + "epoch": 0.32, + "learning_rate": 1.5920448383791972e-05, + "loss": 0.4816, + "step": 1844 + }, + { + "epoch": 0.32, + "learning_rate": 1.591592471984383e-05, + "loss": 0.491, + "step": 1845 + }, + { + "epoch": 0.32, + "learning_rate": 1.5911399192708085e-05, + "loss": 0.4932, + "step": 1846 + }, + { + "epoch": 0.32, + "learning_rate": 1.590687180381003e-05, + "loss": 0.4819, + "step": 1847 + }, + { + "epoch": 0.32, + "learning_rate": 1.590234255457555e-05, + "loss": 0.4898, + "step": 1848 + }, + { + "epoch": 0.32, + "learning_rate": 1.5897811446431096e-05, + "loss": 0.478, + "step": 1849 + }, + { + "epoch": 0.32, + "learning_rate": 1.5893278480803716e-05, + "loss": 0.4747, + "step": 1850 + }, + { + "epoch": 0.32, + "learning_rate": 1.588874365912105e-05, + "loss": 0.4751, + "step": 1851 + }, + { + "epoch": 0.32, + "learning_rate": 1.588420698281131e-05, + "loss": 0.482, + "step": 1852 + }, + { + "epoch": 0.32, + "learning_rate": 1.587966845330329e-05, + "loss": 0.4772, + "step": 1853 + }, + { + "epoch": 0.32, + "learning_rate": 1.587512807202639e-05, + "loss": 0.4687, + "step": 1854 + }, + { + "epoch": 0.32, + "learning_rate": 1.5870585840410565e-05, + "loss": 0.4734, + "step": 1855 + }, + { + "epoch": 0.32, + "learning_rate": 1.586604175988637e-05, + "loss": 0.4748, + "step": 1856 + }, + { + "epoch": 0.32, + "learning_rate": 1.5861495831884942e-05, + "loss": 0.4822, + "step": 1857 + }, + { + "epoch": 0.32, + "learning_rate": 1.585694805783799e-05, + "loss": 0.4839, + "step": 1858 + }, + { + "epoch": 0.32, + "learning_rate": 1.5852398439177813e-05, + "loss": 0.4657, + "step": 1859 + }, + { + "epoch": 0.32, + "learning_rate": 1.584784697733728e-05, + "loss": 0.4751, + "step": 1860 + }, + { + "epoch": 0.32, + "learning_rate": 1.5843293673749863e-05, + "loss": 0.4783, + "step": 1861 + }, + { + "epoch": 0.32, + "learning_rate": 1.583873852984959e-05, + "loss": 0.4855, + "step": 1862 + }, + { + "epoch": 0.32, + "learning_rate": 1.5834181547071082e-05, + "loss": 0.4744, + "step": 1863 + }, + { + "epoch": 0.32, + "learning_rate": 1.582962272684953e-05, + "loss": 0.4675, + "step": 1864 + }, + { + "epoch": 0.32, + "learning_rate": 1.582506207062072e-05, + "loss": 0.4887, + "step": 1865 + }, + { + "epoch": 0.32, + "learning_rate": 1.582049957982099e-05, + "loss": 0.4932, + "step": 1866 + }, + { + "epoch": 0.32, + "learning_rate": 1.5815935255887286e-05, + "loss": 0.4717, + "step": 1867 + }, + { + "epoch": 0.32, + "learning_rate": 1.5811369100257104e-05, + "loss": 0.4868, + "step": 1868 + }, + { + "epoch": 0.32, + "learning_rate": 1.5806801114368542e-05, + "loss": 0.4721, + "step": 1869 + }, + { + "epoch": 0.32, + "learning_rate": 1.580223129966025e-05, + "loss": 0.4768, + "step": 1870 + }, + { + "epoch": 0.32, + "learning_rate": 1.5797659657571475e-05, + "loss": 0.4676, + "step": 1871 + }, + { + "epoch": 0.32, + "learning_rate": 1.579308618954202e-05, + "loss": 0.4818, + "step": 1872 + }, + { + "epoch": 0.32, + "learning_rate": 1.5788510897012286e-05, + "loss": 0.4774, + "step": 1873 + }, + { + "epoch": 0.32, + "learning_rate": 1.5783933781423222e-05, + "loss": 0.4734, + "step": 1874 + }, + { + "epoch": 0.32, + "learning_rate": 1.5779354844216377e-05, + "loss": 0.4675, + "step": 1875 + }, + { + "epoch": 0.32, + "learning_rate": 1.5774774086833856e-05, + "loss": 0.4822, + "step": 1876 + }, + { + "epoch": 0.33, + "learning_rate": 1.577019151071835e-05, + "loss": 0.4683, + "step": 1877 + }, + { + "epoch": 0.33, + "learning_rate": 1.5765607117313097e-05, + "loss": 0.4903, + "step": 1878 + }, + { + "epoch": 0.33, + "learning_rate": 1.5761020908061947e-05, + "loss": 0.4545, + "step": 1879 + }, + { + "epoch": 0.33, + "learning_rate": 1.5756432884409297e-05, + "loss": 0.4954, + "step": 1880 + }, + { + "epoch": 0.33, + "learning_rate": 1.5751843047800107e-05, + "loss": 0.4833, + "step": 1881 + }, + { + "epoch": 0.33, + "learning_rate": 1.5747251399679937e-05, + "loss": 0.478, + "step": 1882 + }, + { + "epoch": 0.33, + "learning_rate": 1.574265794149489e-05, + "loss": 0.4749, + "step": 1883 + }, + { + "epoch": 0.33, + "learning_rate": 1.5738062674691657e-05, + "loss": 0.4749, + "step": 1884 + }, + { + "epoch": 0.33, + "learning_rate": 1.5733465600717486e-05, + "loss": 0.5003, + "step": 1885 + }, + { + "epoch": 0.33, + "learning_rate": 1.5728866721020203e-05, + "loss": 0.4891, + "step": 1886 + }, + { + "epoch": 0.33, + "learning_rate": 1.5724266037048196e-05, + "loss": 0.4746, + "step": 1887 + }, + { + "epoch": 0.33, + "learning_rate": 1.571966355025043e-05, + "loss": 0.4751, + "step": 1888 + }, + { + "epoch": 0.33, + "learning_rate": 1.571505926207643e-05, + "loss": 0.4775, + "step": 1889 + }, + { + "epoch": 0.33, + "learning_rate": 1.571045317397629e-05, + "loss": 0.4793, + "step": 1890 + }, + { + "epoch": 0.33, + "learning_rate": 1.5705845287400675e-05, + "loss": 0.4811, + "step": 1891 + }, + { + "epoch": 0.33, + "learning_rate": 1.5701235603800813e-05, + "loss": 0.4856, + "step": 1892 + }, + { + "epoch": 0.33, + "learning_rate": 1.5696624124628495e-05, + "loss": 0.4813, + "step": 1893 + }, + { + "epoch": 0.33, + "learning_rate": 1.569201085133608e-05, + "loss": 0.4919, + "step": 1894 + }, + { + "epoch": 0.33, + "learning_rate": 1.56873957853765e-05, + "loss": 0.4731, + "step": 1895 + }, + { + "epoch": 0.33, + "learning_rate": 1.5682778928203232e-05, + "loss": 0.4643, + "step": 1896 + }, + { + "epoch": 0.33, + "learning_rate": 1.5678160281270344e-05, + "loss": 0.4742, + "step": 1897 + }, + { + "epoch": 0.33, + "learning_rate": 1.567353984603244e-05, + "loss": 0.4752, + "step": 1898 + }, + { + "epoch": 0.33, + "learning_rate": 1.566891762394471e-05, + "loss": 0.4819, + "step": 1899 + }, + { + "epoch": 0.33, + "learning_rate": 1.5664293616462894e-05, + "loss": 0.4824, + "step": 1900 + }, + { + "epoch": 0.33, + "learning_rate": 1.56596678250433e-05, + "loss": 0.4747, + "step": 1901 + }, + { + "epoch": 0.33, + "learning_rate": 1.5655040251142787e-05, + "loss": 0.4805, + "step": 1902 + }, + { + "epoch": 0.33, + "learning_rate": 1.5650410896218788e-05, + "loss": 0.4683, + "step": 1903 + }, + { + "epoch": 0.33, + "learning_rate": 1.5645779761729297e-05, + "loss": 0.4902, + "step": 1904 + }, + { + "epoch": 0.33, + "learning_rate": 1.564114684913286e-05, + "loss": 0.4837, + "step": 1905 + }, + { + "epoch": 0.33, + "learning_rate": 1.563651215988859e-05, + "loss": 0.4874, + "step": 1906 + }, + { + "epoch": 0.33, + "learning_rate": 1.5631875695456154e-05, + "loss": 0.4736, + "step": 1907 + }, + { + "epoch": 0.33, + "learning_rate": 1.5627237457295778e-05, + "loss": 0.4728, + "step": 1908 + }, + { + "epoch": 0.33, + "learning_rate": 1.5622597446868254e-05, + "loss": 0.4907, + "step": 1909 + }, + { + "epoch": 0.33, + "learning_rate": 1.5617955665634925e-05, + "loss": 0.4877, + "step": 1910 + }, + { + "epoch": 0.33, + "learning_rate": 1.5613312115057697e-05, + "loss": 0.4736, + "step": 1911 + }, + { + "epoch": 0.33, + "learning_rate": 1.5608666796599026e-05, + "loss": 0.4774, + "step": 1912 + }, + { + "epoch": 0.33, + "learning_rate": 1.5604019711721935e-05, + "loss": 0.4786, + "step": 1913 + }, + { + "epoch": 0.33, + "learning_rate": 1.559937086188999e-05, + "loss": 0.4809, + "step": 1914 + }, + { + "epoch": 0.33, + "learning_rate": 1.5594720248567327e-05, + "loss": 0.4688, + "step": 1915 + }, + { + "epoch": 0.33, + "learning_rate": 1.5590067873218627e-05, + "loss": 0.4923, + "step": 1916 + }, + { + "epoch": 0.33, + "learning_rate": 1.5585413737309133e-05, + "loss": 0.4631, + "step": 1917 + }, + { + "epoch": 0.33, + "learning_rate": 1.558075784230464e-05, + "loss": 0.4852, + "step": 1918 + }, + { + "epoch": 0.33, + "learning_rate": 1.557610018967149e-05, + "loss": 0.4703, + "step": 1919 + }, + { + "epoch": 0.33, + "learning_rate": 1.5571440780876588e-05, + "loss": 0.4803, + "step": 1920 + }, + { + "epoch": 0.33, + "learning_rate": 1.556677961738739e-05, + "loss": 0.4872, + "step": 1921 + }, + { + "epoch": 0.33, + "learning_rate": 1.5562116700671907e-05, + "loss": 0.486, + "step": 1922 + }, + { + "epoch": 0.33, + "learning_rate": 1.555745203219869e-05, + "loss": 0.4732, + "step": 1923 + }, + { + "epoch": 0.33, + "learning_rate": 1.5552785613436853e-05, + "loss": 0.4689, + "step": 1924 + }, + { + "epoch": 0.33, + "learning_rate": 1.5548117445856067e-05, + "loss": 0.4753, + "step": 1925 + }, + { + "epoch": 0.33, + "learning_rate": 1.5543447530926536e-05, + "loss": 0.4887, + "step": 1926 + }, + { + "epoch": 0.33, + "learning_rate": 1.5538775870119026e-05, + "loss": 0.4671, + "step": 1927 + }, + { + "epoch": 0.33, + "learning_rate": 1.553410246490485e-05, + "loss": 0.4737, + "step": 1928 + }, + { + "epoch": 0.33, + "learning_rate": 1.5529427316755876e-05, + "loss": 0.4791, + "step": 1929 + }, + { + "epoch": 0.33, + "learning_rate": 1.552475042714451e-05, + "loss": 0.4842, + "step": 1930 + }, + { + "epoch": 0.33, + "learning_rate": 1.5520071797543717e-05, + "loss": 0.4761, + "step": 1931 + }, + { + "epoch": 0.33, + "learning_rate": 1.5515391429427e-05, + "loss": 0.4845, + "step": 1932 + }, + { + "epoch": 0.33, + "learning_rate": 1.5510709324268422e-05, + "loss": 0.4711, + "step": 1933 + }, + { + "epoch": 0.34, + "learning_rate": 1.5506025483542577e-05, + "loss": 0.472, + "step": 1934 + }, + { + "epoch": 0.34, + "learning_rate": 1.5501339908724624e-05, + "loss": 0.4945, + "step": 1935 + }, + { + "epoch": 0.34, + "learning_rate": 1.5496652601290253e-05, + "loss": 0.4747, + "step": 1936 + }, + { + "epoch": 0.34, + "learning_rate": 1.5491963562715705e-05, + "loss": 0.4855, + "step": 1937 + }, + { + "epoch": 0.34, + "learning_rate": 1.548727279447777e-05, + "loss": 0.4706, + "step": 1938 + }, + { + "epoch": 0.34, + "learning_rate": 1.548258029805378e-05, + "loss": 0.4798, + "step": 1939 + }, + { + "epoch": 0.34, + "learning_rate": 1.5477886074921604e-05, + "loss": 0.4732, + "step": 1940 + }, + { + "epoch": 0.34, + "learning_rate": 1.5473190126559667e-05, + "loss": 0.4786, + "step": 1941 + }, + { + "epoch": 0.34, + "learning_rate": 1.546849245444693e-05, + "loss": 0.4803, + "step": 1942 + }, + { + "epoch": 0.34, + "learning_rate": 1.5463793060062903e-05, + "loss": 0.478, + "step": 1943 + }, + { + "epoch": 0.34, + "learning_rate": 1.5459091944887626e-05, + "loss": 0.489, + "step": 1944 + }, + { + "epoch": 0.34, + "learning_rate": 1.5454389110401694e-05, + "loss": 0.4699, + "step": 1945 + }, + { + "epoch": 0.34, + "learning_rate": 1.5449684558086243e-05, + "loss": 0.4861, + "step": 1946 + }, + { + "epoch": 0.34, + "learning_rate": 1.5444978289422937e-05, + "loss": 0.4789, + "step": 1947 + }, + { + "epoch": 0.34, + "learning_rate": 1.5440270305893995e-05, + "loss": 0.479, + "step": 1948 + }, + { + "epoch": 0.34, + "learning_rate": 1.5435560608982166e-05, + "loss": 0.4813, + "step": 1949 + }, + { + "epoch": 0.34, + "learning_rate": 1.5430849200170747e-05, + "loss": 0.4849, + "step": 1950 + }, + { + "epoch": 0.34, + "learning_rate": 1.5426136080943566e-05, + "loss": 0.4782, + "step": 1951 + }, + { + "epoch": 0.34, + "learning_rate": 1.5421421252784998e-05, + "loss": 0.4741, + "step": 1952 + }, + { + "epoch": 0.34, + "learning_rate": 1.541670471717995e-05, + "loss": 0.4812, + "step": 1953 + }, + { + "epoch": 0.34, + "learning_rate": 1.5411986475613864e-05, + "loss": 0.4782, + "step": 1954 + }, + { + "epoch": 0.34, + "learning_rate": 1.540726652957273e-05, + "loss": 0.4738, + "step": 1955 + }, + { + "epoch": 0.34, + "learning_rate": 1.540254488054307e-05, + "loss": 0.472, + "step": 1956 + }, + { + "epoch": 0.34, + "learning_rate": 1.5397821530011935e-05, + "loss": 0.4885, + "step": 1957 + }, + { + "epoch": 0.34, + "learning_rate": 1.5393096479466922e-05, + "loss": 0.4813, + "step": 1958 + }, + { + "epoch": 0.34, + "learning_rate": 1.538836973039616e-05, + "loss": 0.4777, + "step": 1959 + }, + { + "epoch": 0.34, + "learning_rate": 1.5383641284288308e-05, + "loss": 0.4748, + "step": 1960 + }, + { + "epoch": 0.34, + "learning_rate": 1.537891114263257e-05, + "loss": 0.4864, + "step": 1961 + }, + { + "epoch": 0.34, + "learning_rate": 1.5374179306918674e-05, + "loss": 0.4671, + "step": 1962 + }, + { + "epoch": 0.34, + "learning_rate": 1.5369445778636885e-05, + "loss": 0.4594, + "step": 1963 + }, + { + "epoch": 0.34, + "learning_rate": 1.5364710559278e-05, + "loss": 0.4797, + "step": 1964 + }, + { + "epoch": 0.34, + "learning_rate": 1.5359973650333352e-05, + "loss": 0.4878, + "step": 1965 + }, + { + "epoch": 0.34, + "learning_rate": 1.535523505329481e-05, + "loss": 0.463, + "step": 1966 + }, + { + "epoch": 0.34, + "learning_rate": 1.535049476965476e-05, + "loss": 0.4725, + "step": 1967 + }, + { + "epoch": 0.34, + "learning_rate": 1.5345752800906128e-05, + "loss": 0.4716, + "step": 1968 + }, + { + "epoch": 0.34, + "learning_rate": 1.5341009148542378e-05, + "loss": 0.4837, + "step": 1969 + }, + { + "epoch": 0.34, + "learning_rate": 1.5336263814057493e-05, + "loss": 0.4715, + "step": 1970 + }, + { + "epoch": 0.34, + "learning_rate": 1.5331516798945987e-05, + "loss": 0.4911, + "step": 1971 + }, + { + "epoch": 0.34, + "learning_rate": 1.532676810470291e-05, + "loss": 0.4739, + "step": 1972 + }, + { + "epoch": 0.34, + "learning_rate": 1.5322017732823836e-05, + "loss": 0.4929, + "step": 1973 + }, + { + "epoch": 0.34, + "learning_rate": 1.5317265684804865e-05, + "loss": 0.4621, + "step": 1974 + }, + { + "epoch": 0.34, + "learning_rate": 1.5312511962142634e-05, + "loss": 0.4749, + "step": 1975 + }, + { + "epoch": 0.34, + "learning_rate": 1.5307756566334295e-05, + "loss": 0.467, + "step": 1976 + }, + { + "epoch": 0.34, + "learning_rate": 1.5302999498877537e-05, + "loss": 0.4822, + "step": 1977 + }, + { + "epoch": 0.34, + "learning_rate": 1.5298240761270575e-05, + "loss": 0.4682, + "step": 1978 + }, + { + "epoch": 0.34, + "learning_rate": 1.529348035501214e-05, + "loss": 0.4744, + "step": 1979 + }, + { + "epoch": 0.34, + "learning_rate": 1.52887182816015e-05, + "loss": 0.4743, + "step": 1980 + }, + { + "epoch": 0.34, + "learning_rate": 1.5283954542538442e-05, + "loss": 0.4805, + "step": 1981 + }, + { + "epoch": 0.34, + "learning_rate": 1.5279189139323284e-05, + "loss": 0.4704, + "step": 1982 + }, + { + "epoch": 0.34, + "learning_rate": 1.5274422073456853e-05, + "loss": 0.5015, + "step": 1983 + }, + { + "epoch": 0.34, + "learning_rate": 1.526965334644052e-05, + "loss": 0.4753, + "step": 1984 + }, + { + "epoch": 0.34, + "learning_rate": 1.5264882959776164e-05, + "loss": 0.4768, + "step": 1985 + }, + { + "epoch": 0.34, + "learning_rate": 1.526011091496619e-05, + "loss": 0.4517, + "step": 1986 + }, + { + "epoch": 0.34, + "learning_rate": 1.5255337213513532e-05, + "loss": 0.4776, + "step": 1987 + }, + { + "epoch": 0.34, + "learning_rate": 1.5250561856921638e-05, + "loss": 0.4706, + "step": 1988 + }, + { + "epoch": 0.34, + "learning_rate": 1.5245784846694483e-05, + "loss": 0.4815, + "step": 1989 + }, + { + "epoch": 0.34, + "learning_rate": 1.5241006184336553e-05, + "loss": 0.4739, + "step": 1990 + }, + { + "epoch": 0.34, + "learning_rate": 1.5236225871352867e-05, + "loss": 0.4761, + "step": 1991 + }, + { + "epoch": 0.35, + "learning_rate": 1.5231443909248956e-05, + "loss": 0.4844, + "step": 1992 + }, + { + "epoch": 0.35, + "learning_rate": 1.5226660299530874e-05, + "loss": 0.4789, + "step": 1993 + }, + { + "epoch": 0.35, + "learning_rate": 1.522187504370519e-05, + "loss": 0.4887, + "step": 1994 + }, + { + "epoch": 0.35, + "learning_rate": 1.5217088143278995e-05, + "loss": 0.4777, + "step": 1995 + }, + { + "epoch": 0.35, + "learning_rate": 1.5212299599759894e-05, + "loss": 0.4814, + "step": 1996 + }, + { + "epoch": 0.35, + "learning_rate": 1.5207509414656017e-05, + "loss": 0.4742, + "step": 1997 + }, + { + "epoch": 0.35, + "learning_rate": 1.5202717589476006e-05, + "loss": 0.4831, + "step": 1998 + }, + { + "epoch": 0.35, + "learning_rate": 1.5197924125729015e-05, + "loss": 0.4824, + "step": 1999 + }, + { + "epoch": 0.35, + "learning_rate": 1.519312902492472e-05, + "loss": 0.4619, + "step": 2000 + }, + { + "epoch": 0.35, + "learning_rate": 1.5188332288573313e-05, + "loss": 0.4846, + "step": 2001 + }, + { + "epoch": 0.35, + "learning_rate": 1.51835339181855e-05, + "loss": 0.4707, + "step": 2002 + }, + { + "epoch": 0.35, + "learning_rate": 1.5178733915272501e-05, + "loss": 0.4717, + "step": 2003 + }, + { + "epoch": 0.35, + "learning_rate": 1.5173932281346049e-05, + "loss": 0.473, + "step": 2004 + }, + { + "epoch": 0.35, + "learning_rate": 1.5169129017918389e-05, + "loss": 0.4676, + "step": 2005 + }, + { + "epoch": 0.35, + "learning_rate": 1.5164324126502287e-05, + "loss": 0.4798, + "step": 2006 + }, + { + "epoch": 0.35, + "learning_rate": 1.5159517608611015e-05, + "loss": 0.4926, + "step": 2007 + }, + { + "epoch": 0.35, + "learning_rate": 1.515470946575836e-05, + "loss": 0.4698, + "step": 2008 + }, + { + "epoch": 0.35, + "learning_rate": 1.514989969945862e-05, + "loss": 0.4893, + "step": 2009 + }, + { + "epoch": 0.35, + "learning_rate": 1.5145088311226599e-05, + "loss": 0.4794, + "step": 2010 + }, + { + "epoch": 0.35, + "learning_rate": 1.5140275302577627e-05, + "loss": 0.4828, + "step": 2011 + }, + { + "epoch": 0.35, + "learning_rate": 1.5135460675027525e-05, + "loss": 0.4627, + "step": 2012 + }, + { + "epoch": 0.35, + "learning_rate": 1.5130644430092638e-05, + "loss": 0.4708, + "step": 2013 + }, + { + "epoch": 0.35, + "learning_rate": 1.5125826569289812e-05, + "loss": 0.475, + "step": 2014 + }, + { + "epoch": 0.35, + "learning_rate": 1.512100709413641e-05, + "loss": 0.4832, + "step": 2015 + }, + { + "epoch": 0.35, + "learning_rate": 1.5116186006150294e-05, + "loss": 0.4714, + "step": 2016 + }, + { + "epoch": 0.35, + "learning_rate": 1.5111363306849845e-05, + "loss": 0.4816, + "step": 2017 + }, + { + "epoch": 0.35, + "learning_rate": 1.5106538997753938e-05, + "loss": 0.4778, + "step": 2018 + }, + { + "epoch": 0.35, + "learning_rate": 1.510171308038197e-05, + "loss": 0.467, + "step": 2019 + }, + { + "epoch": 0.35, + "learning_rate": 1.5096885556253833e-05, + "loss": 0.4623, + "step": 2020 + }, + { + "epoch": 0.35, + "learning_rate": 1.5092056426889923e-05, + "loss": 0.4858, + "step": 2021 + }, + { + "epoch": 0.35, + "learning_rate": 1.5087225693811159e-05, + "loss": 0.4515, + "step": 2022 + }, + { + "epoch": 0.35, + "learning_rate": 1.5082393358538946e-05, + "loss": 0.4879, + "step": 2023 + }, + { + "epoch": 0.35, + "learning_rate": 1.5077559422595202e-05, + "loss": 0.4633, + "step": 2024 + }, + { + "epoch": 0.35, + "learning_rate": 1.5072723887502352e-05, + "loss": 0.4859, + "step": 2025 + }, + { + "epoch": 0.35, + "learning_rate": 1.5067886754783316e-05, + "loss": 0.4942, + "step": 2026 + }, + { + "epoch": 0.35, + "learning_rate": 1.5063048025961523e-05, + "loss": 0.4801, + "step": 2027 + }, + { + "epoch": 0.35, + "learning_rate": 1.5058207702560907e-05, + "loss": 0.4776, + "step": 2028 + }, + { + "epoch": 0.35, + "learning_rate": 1.5053365786105898e-05, + "loss": 0.4837, + "step": 2029 + }, + { + "epoch": 0.35, + "learning_rate": 1.5048522278121432e-05, + "loss": 0.4721, + "step": 2030 + }, + { + "epoch": 0.35, + "learning_rate": 1.5043677180132946e-05, + "loss": 0.4912, + "step": 2031 + }, + { + "epoch": 0.35, + "learning_rate": 1.5038830493666371e-05, + "loss": 0.4744, + "step": 2032 + }, + { + "epoch": 0.35, + "learning_rate": 1.5033982220248151e-05, + "loss": 0.481, + "step": 2033 + }, + { + "epoch": 0.35, + "learning_rate": 1.5029132361405219e-05, + "loss": 0.4795, + "step": 2034 + }, + { + "epoch": 0.35, + "learning_rate": 1.502428091866501e-05, + "loss": 0.4778, + "step": 2035 + }, + { + "epoch": 0.35, + "learning_rate": 1.5019427893555462e-05, + "loss": 0.474, + "step": 2036 + }, + { + "epoch": 0.35, + "learning_rate": 1.501457328760501e-05, + "loss": 0.4732, + "step": 2037 + }, + { + "epoch": 0.35, + "learning_rate": 1.5009717102342577e-05, + "loss": 0.4752, + "step": 2038 + }, + { + "epoch": 0.35, + "learning_rate": 1.5004859339297601e-05, + "loss": 0.4855, + "step": 2039 + }, + { + "epoch": 0.35, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4876, + "step": 2040 + }, + { + "epoch": 0.35, + "learning_rate": 1.4995139085980203e-05, + "loss": 0.4735, + "step": 2041 + }, + { + "epoch": 0.35, + "learning_rate": 1.499027659876912e-05, + "loss": 0.4773, + "step": 2042 + }, + { + "epoch": 0.35, + "learning_rate": 1.498541253989817e-05, + "loss": 0.4759, + "step": 2043 + }, + { + "epoch": 0.35, + "learning_rate": 1.4980546910899261e-05, + "loss": 0.4661, + "step": 2044 + }, + { + "epoch": 0.35, + "learning_rate": 1.4975679713304794e-05, + "loss": 0.4859, + "step": 2045 + }, + { + "epoch": 0.35, + "learning_rate": 1.4970810948647664e-05, + "loss": 0.4868, + "step": 2046 + }, + { + "epoch": 0.35, + "learning_rate": 1.4965940618461265e-05, + "loss": 0.4837, + "step": 2047 + }, + { + "epoch": 0.35, + "learning_rate": 1.496106872427948e-05, + "loss": 0.4805, + "step": 2048 + }, + { + "epoch": 0.35, + "learning_rate": 1.4956195267636679e-05, + "loss": 0.4798, + "step": 2049 + }, + { + "epoch": 0.36, + "learning_rate": 1.495132025006774e-05, + "loss": 0.4697, + "step": 2050 + }, + { + "epoch": 0.36, + "learning_rate": 1.4946443673108015e-05, + "loss": 0.4797, + "step": 2051 + }, + { + "epoch": 0.36, + "learning_rate": 1.4941565538293358e-05, + "loss": 0.4784, + "step": 2052 + }, + { + "epoch": 0.36, + "learning_rate": 1.4936685847160113e-05, + "loss": 0.4813, + "step": 2053 + }, + { + "epoch": 0.36, + "learning_rate": 1.4931804601245105e-05, + "loss": 0.4836, + "step": 2054 + }, + { + "epoch": 0.36, + "learning_rate": 1.4926921802085662e-05, + "loss": 0.4865, + "step": 2055 + }, + { + "epoch": 0.36, + "learning_rate": 1.4922037451219586e-05, + "loss": 0.462, + "step": 2056 + }, + { + "epoch": 0.36, + "learning_rate": 1.4917151550185187e-05, + "loss": 0.4853, + "step": 2057 + }, + { + "epoch": 0.36, + "learning_rate": 1.4912264100521243e-05, + "loss": 0.4677, + "step": 2058 + }, + { + "epoch": 0.36, + "learning_rate": 1.4907375103767037e-05, + "loss": 0.4763, + "step": 2059 + }, + { + "epoch": 0.36, + "learning_rate": 1.4902484561462323e-05, + "loss": 0.4692, + "step": 2060 + }, + { + "epoch": 0.36, + "learning_rate": 1.4897592475147356e-05, + "loss": 0.4795, + "step": 2061 + }, + { + "epoch": 0.36, + "learning_rate": 1.489269884636287e-05, + "loss": 0.4737, + "step": 2062 + }, + { + "epoch": 0.36, + "learning_rate": 1.4887803676650083e-05, + "loss": 0.4805, + "step": 2063 + }, + { + "epoch": 0.36, + "learning_rate": 1.4882906967550708e-05, + "loss": 0.4819, + "step": 2064 + }, + { + "epoch": 0.36, + "learning_rate": 1.487800872060693e-05, + "loss": 0.479, + "step": 2065 + }, + { + "epoch": 0.36, + "learning_rate": 1.4873108937361429e-05, + "loss": 0.463, + "step": 2066 + }, + { + "epoch": 0.36, + "learning_rate": 1.4868207619357362e-05, + "loss": 0.4814, + "step": 2067 + }, + { + "epoch": 0.36, + "learning_rate": 1.4863304768138374e-05, + "loss": 0.4654, + "step": 2068 + }, + { + "epoch": 0.36, + "learning_rate": 1.4858400385248585e-05, + "loss": 0.4851, + "step": 2069 + }, + { + "epoch": 0.36, + "learning_rate": 1.4853494472232613e-05, + "loss": 0.479, + "step": 2070 + }, + { + "epoch": 0.36, + "learning_rate": 1.4848587030635537e-05, + "loss": 0.4832, + "step": 2071 + }, + { + "epoch": 0.36, + "learning_rate": 1.484367806200294e-05, + "loss": 0.4865, + "step": 2072 + }, + { + "epoch": 0.36, + "learning_rate": 1.4838767567880865e-05, + "loss": 0.475, + "step": 2073 + }, + { + "epoch": 0.36, + "learning_rate": 1.4833855549815848e-05, + "loss": 0.4734, + "step": 2074 + }, + { + "epoch": 0.36, + "learning_rate": 1.4828942009354902e-05, + "loss": 0.4751, + "step": 2075 + }, + { + "epoch": 0.36, + "learning_rate": 1.482402694804552e-05, + "loss": 0.4698, + "step": 2076 + }, + { + "epoch": 0.36, + "learning_rate": 1.4819110367435672e-05, + "loss": 0.4898, + "step": 2077 + }, + { + "epoch": 0.36, + "learning_rate": 1.4814192269073808e-05, + "loss": 0.4844, + "step": 2078 + }, + { + "epoch": 0.36, + "learning_rate": 1.4809272654508858e-05, + "loss": 0.4787, + "step": 2079 + }, + { + "epoch": 0.36, + "learning_rate": 1.4804351525290221e-05, + "loss": 0.4723, + "step": 2080 + }, + { + "epoch": 0.36, + "learning_rate": 1.4799428882967787e-05, + "loss": 0.489, + "step": 2081 + }, + { + "epoch": 0.36, + "learning_rate": 1.479450472909191e-05, + "loss": 0.473, + "step": 2082 + }, + { + "epoch": 0.36, + "learning_rate": 1.4789579065213425e-05, + "loss": 0.4792, + "step": 2083 + }, + { + "epoch": 0.36, + "learning_rate": 1.4784651892883644e-05, + "loss": 0.488, + "step": 2084 + }, + { + "epoch": 0.36, + "learning_rate": 1.4779723213654354e-05, + "loss": 0.4646, + "step": 2085 + }, + { + "epoch": 0.36, + "learning_rate": 1.477479302907781e-05, + "loss": 0.4817, + "step": 2086 + }, + { + "epoch": 0.36, + "learning_rate": 1.476986134070675e-05, + "loss": 0.4806, + "step": 2087 + }, + { + "epoch": 0.36, + "learning_rate": 1.4764928150094384e-05, + "loss": 0.4804, + "step": 2088 + }, + { + "epoch": 0.36, + "learning_rate": 1.4759993458794388e-05, + "loss": 0.4801, + "step": 2089 + }, + { + "epoch": 0.36, + "learning_rate": 1.475505726836092e-05, + "loss": 0.4725, + "step": 2090 + }, + { + "epoch": 0.36, + "learning_rate": 1.4750119580348601e-05, + "loss": 0.4812, + "step": 2091 + }, + { + "epoch": 0.36, + "learning_rate": 1.4745180396312533e-05, + "loss": 0.4754, + "step": 2092 + }, + { + "epoch": 0.36, + "learning_rate": 1.474023971780828e-05, + "loss": 0.4702, + "step": 2093 + }, + { + "epoch": 0.36, + "learning_rate": 1.4735297546391887e-05, + "loss": 0.4676, + "step": 2094 + }, + { + "epoch": 0.36, + "learning_rate": 1.4730353883619856e-05, + "loss": 0.4733, + "step": 2095 + }, + { + "epoch": 0.36, + "learning_rate": 1.4725408731049173e-05, + "loss": 0.4712, + "step": 2096 + }, + { + "epoch": 0.36, + "learning_rate": 1.4720462090237285e-05, + "loss": 0.4859, + "step": 2097 + }, + { + "epoch": 0.36, + "learning_rate": 1.4715513962742102e-05, + "loss": 0.4719, + "step": 2098 + }, + { + "epoch": 0.36, + "learning_rate": 1.471056435012202e-05, + "loss": 0.4891, + "step": 2099 + }, + { + "epoch": 0.36, + "learning_rate": 1.4705613253935886e-05, + "loss": 0.475, + "step": 2100 + }, + { + "epoch": 0.36, + "learning_rate": 1.4700660675743021e-05, + "loss": 0.4895, + "step": 2101 + }, + { + "epoch": 0.36, + "learning_rate": 1.469570661710321e-05, + "loss": 0.4623, + "step": 2102 + }, + { + "epoch": 0.36, + "learning_rate": 1.469075107957671e-05, + "loss": 0.4755, + "step": 2103 + }, + { + "epoch": 0.36, + "learning_rate": 1.4685794064724235e-05, + "loss": 0.4781, + "step": 2104 + }, + { + "epoch": 0.36, + "learning_rate": 1.4680835574106977e-05, + "loss": 0.4876, + "step": 2105 + }, + { + "epoch": 0.36, + "learning_rate": 1.4675875609286579e-05, + "loss": 0.491, + "step": 2106 + }, + { + "epoch": 0.37, + "learning_rate": 1.4670914171825157e-05, + "loss": 0.488, + "step": 2107 + }, + { + "epoch": 0.37, + "learning_rate": 1.4665951263285283e-05, + "loss": 0.477, + "step": 2108 + }, + { + "epoch": 0.37, + "learning_rate": 1.4660986885230002e-05, + "loss": 0.4932, + "step": 2109 + }, + { + "epoch": 0.37, + "learning_rate": 1.465602103922282e-05, + "loss": 0.472, + "step": 2110 + }, + { + "epoch": 0.37, + "learning_rate": 1.4651053726827695e-05, + "loss": 0.4847, + "step": 2111 + }, + { + "epoch": 0.37, + "learning_rate": 1.4646084949609062e-05, + "loss": 0.4797, + "step": 2112 + }, + { + "epoch": 0.37, + "learning_rate": 1.4641114709131805e-05, + "loss": 0.4709, + "step": 2113 + }, + { + "epoch": 0.37, + "learning_rate": 1.4636143006961279e-05, + "loss": 0.4815, + "step": 2114 + }, + { + "epoch": 0.37, + "learning_rate": 1.4631169844663284e-05, + "loss": 0.4732, + "step": 2115 + }, + { + "epoch": 0.37, + "learning_rate": 1.4626195223804101e-05, + "loss": 0.4884, + "step": 2116 + }, + { + "epoch": 0.37, + "learning_rate": 1.4621219145950452e-05, + "loss": 0.4591, + "step": 2117 + }, + { + "epoch": 0.37, + "learning_rate": 1.4616241612669523e-05, + "loss": 0.4718, + "step": 2118 + }, + { + "epoch": 0.37, + "learning_rate": 1.461126262552897e-05, + "loss": 0.4565, + "step": 2119 + }, + { + "epoch": 0.37, + "learning_rate": 1.460628218609689e-05, + "loss": 0.4789, + "step": 2120 + }, + { + "epoch": 0.37, + "learning_rate": 1.4601300295941847e-05, + "loss": 0.476, + "step": 2121 + }, + { + "epoch": 0.37, + "learning_rate": 1.4596316956632856e-05, + "loss": 0.4833, + "step": 2122 + }, + { + "epoch": 0.37, + "learning_rate": 1.45913321697394e-05, + "loss": 0.4711, + "step": 2123 + }, + { + "epoch": 0.37, + "learning_rate": 1.4586345936831404e-05, + "loss": 0.4863, + "step": 2124 + }, + { + "epoch": 0.37, + "learning_rate": 1.4581358259479252e-05, + "loss": 0.4724, + "step": 2125 + }, + { + "epoch": 0.37, + "learning_rate": 1.457636913925379e-05, + "loss": 0.4782, + "step": 2126 + }, + { + "epoch": 0.37, + "learning_rate": 1.4571378577726317e-05, + "loss": 0.4747, + "step": 2127 + }, + { + "epoch": 0.37, + "learning_rate": 1.4566386576468572e-05, + "loss": 0.4751, + "step": 2128 + }, + { + "epoch": 0.37, + "learning_rate": 1.4561393137052767e-05, + "loss": 0.4757, + "step": 2129 + }, + { + "epoch": 0.37, + "learning_rate": 1.4556398261051553e-05, + "loss": 0.4747, + "step": 2130 + }, + { + "epoch": 0.37, + "learning_rate": 1.455140195003804e-05, + "loss": 0.4643, + "step": 2131 + }, + { + "epoch": 0.37, + "learning_rate": 1.4546404205585789e-05, + "loss": 0.4826, + "step": 2132 + }, + { + "epoch": 0.37, + "learning_rate": 1.4541405029268813e-05, + "loss": 0.475, + "step": 2133 + }, + { + "epoch": 0.37, + "learning_rate": 1.4536404422661575e-05, + "loss": 0.4854, + "step": 2134 + }, + { + "epoch": 0.37, + "learning_rate": 1.4531402387338982e-05, + "loss": 0.4695, + "step": 2135 + }, + { + "epoch": 0.37, + "learning_rate": 1.4526398924876407e-05, + "loss": 0.4703, + "step": 2136 + }, + { + "epoch": 0.37, + "learning_rate": 1.4521394036849652e-05, + "loss": 0.4847, + "step": 2137 + }, + { + "epoch": 0.37, + "learning_rate": 1.4516387724834989e-05, + "loss": 0.4657, + "step": 2138 + }, + { + "epoch": 0.37, + "learning_rate": 1.4511379990409119e-05, + "loss": 0.4667, + "step": 2139 + }, + { + "epoch": 0.37, + "learning_rate": 1.4506370835149209e-05, + "loss": 0.471, + "step": 2140 + }, + { + "epoch": 0.37, + "learning_rate": 1.4501360260632855e-05, + "loss": 0.4593, + "step": 2141 + }, + { + "epoch": 0.37, + "learning_rate": 1.4496348268438116e-05, + "loss": 0.4654, + "step": 2142 + }, + { + "epoch": 0.37, + "learning_rate": 1.4491334860143494e-05, + "loss": 0.4815, + "step": 2143 + }, + { + "epoch": 0.37, + "learning_rate": 1.4486320037327924e-05, + "loss": 0.4863, + "step": 2144 + }, + { + "epoch": 0.37, + "learning_rate": 1.4481303801570805e-05, + "loss": 0.4664, + "step": 2145 + }, + { + "epoch": 0.37, + "learning_rate": 1.4476286154451968e-05, + "loss": 0.4875, + "step": 2146 + }, + { + "epoch": 0.37, + "learning_rate": 1.4471267097551698e-05, + "loss": 0.4843, + "step": 2147 + }, + { + "epoch": 0.37, + "learning_rate": 1.4466246632450714e-05, + "loss": 0.4747, + "step": 2148 + }, + { + "epoch": 0.37, + "learning_rate": 1.4461224760730189e-05, + "loss": 0.471, + "step": 2149 + }, + { + "epoch": 0.37, + "learning_rate": 1.4456201483971724e-05, + "loss": 0.4707, + "step": 2150 + }, + { + "epoch": 0.37, + "learning_rate": 1.4451176803757383e-05, + "loss": 0.4741, + "step": 2151 + }, + { + "epoch": 0.37, + "learning_rate": 1.4446150721669654e-05, + "loss": 0.4866, + "step": 2152 + }, + { + "epoch": 0.37, + "learning_rate": 1.4441123239291477e-05, + "loss": 0.467, + "step": 2153 + }, + { + "epoch": 0.37, + "learning_rate": 1.4436094358206224e-05, + "loss": 0.4785, + "step": 2154 + }, + { + "epoch": 0.37, + "learning_rate": 1.4431064079997723e-05, + "loss": 0.4608, + "step": 2155 + }, + { + "epoch": 0.37, + "learning_rate": 1.4426032406250228e-05, + "loss": 0.4788, + "step": 2156 + }, + { + "epoch": 0.37, + "learning_rate": 1.4420999338548432e-05, + "loss": 0.4773, + "step": 2157 + }, + { + "epoch": 0.37, + "learning_rate": 1.4415964878477477e-05, + "loss": 0.4902, + "step": 2158 + }, + { + "epoch": 0.37, + "learning_rate": 1.4410929027622932e-05, + "loss": 0.4673, + "step": 2159 + }, + { + "epoch": 0.37, + "learning_rate": 1.440589178757082e-05, + "loss": 0.4779, + "step": 2160 + }, + { + "epoch": 0.37, + "learning_rate": 1.4400853159907584e-05, + "loss": 0.4775, + "step": 2161 + }, + { + "epoch": 0.37, + "learning_rate": 1.4395813146220117e-05, + "loss": 0.4901, + "step": 2162 + }, + { + "epoch": 0.37, + "learning_rate": 1.4390771748095735e-05, + "loss": 0.4659, + "step": 2163 + }, + { + "epoch": 0.37, + "learning_rate": 1.4385728967122207e-05, + "loss": 0.4782, + "step": 2164 + }, + { + "epoch": 0.38, + "learning_rate": 1.4380684804887726e-05, + "loss": 0.482, + "step": 2165 + }, + { + "epoch": 0.38, + "learning_rate": 1.4375639262980921e-05, + "loss": 0.4734, + "step": 2166 + }, + { + "epoch": 0.38, + "learning_rate": 1.437059234299086e-05, + "loss": 0.4619, + "step": 2167 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365544046507039e-05, + "loss": 0.489, + "step": 2168 + }, + { + "epoch": 0.38, + "learning_rate": 1.4360494375119396e-05, + "loss": 0.4819, + "step": 2169 + }, + { + "epoch": 0.38, + "learning_rate": 1.4355443330418292e-05, + "loss": 0.4827, + "step": 2170 + }, + { + "epoch": 0.38, + "learning_rate": 1.435039091399453e-05, + "loss": 0.4666, + "step": 2171 + }, + { + "epoch": 0.38, + "learning_rate": 1.4345337127439333e-05, + "loss": 0.4795, + "step": 2172 + }, + { + "epoch": 0.38, + "learning_rate": 1.4340281972344374e-05, + "loss": 0.4665, + "step": 2173 + }, + { + "epoch": 0.38, + "learning_rate": 1.4335225450301735e-05, + "loss": 0.4774, + "step": 2174 + }, + { + "epoch": 0.38, + "learning_rate": 1.4330167562903948e-05, + "loss": 0.4776, + "step": 2175 + }, + { + "epoch": 0.38, + "learning_rate": 1.4325108311743959e-05, + "loss": 0.4809, + "step": 2176 + }, + { + "epoch": 0.38, + "learning_rate": 1.4320047698415156e-05, + "loss": 0.4686, + "step": 2177 + }, + { + "epoch": 0.38, + "learning_rate": 1.4314985724511353e-05, + "loss": 0.4643, + "step": 2178 + }, + { + "epoch": 0.38, + "learning_rate": 1.4309922391626784e-05, + "loss": 0.4634, + "step": 2179 + }, + { + "epoch": 0.38, + "learning_rate": 1.4304857701356123e-05, + "loss": 0.4828, + "step": 2180 + }, + { + "epoch": 0.38, + "learning_rate": 1.4299791655294461e-05, + "loss": 0.48, + "step": 2181 + }, + { + "epoch": 0.38, + "learning_rate": 1.4294724255037329e-05, + "loss": 0.4784, + "step": 2182 + }, + { + "epoch": 0.38, + "learning_rate": 1.4289655502180667e-05, + "loss": 0.4829, + "step": 2183 + }, + { + "epoch": 0.38, + "learning_rate": 1.428458539832086e-05, + "loss": 0.4745, + "step": 2184 + }, + { + "epoch": 0.38, + "learning_rate": 1.42795139450547e-05, + "loss": 0.4731, + "step": 2185 + }, + { + "epoch": 0.38, + "learning_rate": 1.4274441143979418e-05, + "loss": 0.4751, + "step": 2186 + }, + { + "epoch": 0.38, + "learning_rate": 1.4269366996692666e-05, + "loss": 0.4652, + "step": 2187 + }, + { + "epoch": 0.38, + "learning_rate": 1.4264291504792514e-05, + "loss": 0.468, + "step": 2188 + }, + { + "epoch": 0.38, + "learning_rate": 1.4259214669877462e-05, + "loss": 0.4812, + "step": 2189 + }, + { + "epoch": 0.38, + "learning_rate": 1.4254136493546432e-05, + "loss": 0.4743, + "step": 2190 + }, + { + "epoch": 0.38, + "learning_rate": 1.4249056977398767e-05, + "loss": 0.4736, + "step": 2191 + }, + { + "epoch": 0.38, + "learning_rate": 1.4243976123034231e-05, + "loss": 0.4811, + "step": 2192 + }, + { + "epoch": 0.38, + "learning_rate": 1.4238893932053013e-05, + "loss": 0.4814, + "step": 2193 + }, + { + "epoch": 0.38, + "learning_rate": 1.4233810406055718e-05, + "loss": 0.4729, + "step": 2194 + }, + { + "epoch": 0.38, + "learning_rate": 1.4228725546643373e-05, + "loss": 0.4779, + "step": 2195 + }, + { + "epoch": 0.38, + "learning_rate": 1.422363935541743e-05, + "loss": 0.4712, + "step": 2196 + }, + { + "epoch": 0.38, + "learning_rate": 1.4218551833979759e-05, + "loss": 0.467, + "step": 2197 + }, + { + "epoch": 0.38, + "learning_rate": 1.4213462983932641e-05, + "loss": 0.4695, + "step": 2198 + }, + { + "epoch": 0.38, + "learning_rate": 1.4208372806878782e-05, + "loss": 0.4685, + "step": 2199 + }, + { + "epoch": 0.38, + "learning_rate": 1.420328130442131e-05, + "loss": 0.4762, + "step": 2200 + }, + { + "epoch": 0.38, + "learning_rate": 1.419818847816376e-05, + "loss": 0.4696, + "step": 2201 + }, + { + "epoch": 0.38, + "learning_rate": 1.4193094329710089e-05, + "loss": 0.4629, + "step": 2202 + }, + { + "epoch": 0.38, + "learning_rate": 1.4187998860664672e-05, + "loss": 0.4587, + "step": 2203 + }, + { + "epoch": 0.38, + "learning_rate": 1.4182902072632301e-05, + "loss": 0.4809, + "step": 2204 + }, + { + "epoch": 0.38, + "learning_rate": 1.4177803967218178e-05, + "loss": 0.4734, + "step": 2205 + }, + { + "epoch": 0.38, + "learning_rate": 1.4172704546027926e-05, + "loss": 0.4799, + "step": 2206 + }, + { + "epoch": 0.38, + "learning_rate": 1.4167603810667578e-05, + "loss": 0.4761, + "step": 2207 + }, + { + "epoch": 0.38, + "learning_rate": 1.4162501762743579e-05, + "loss": 0.4763, + "step": 2208 + }, + { + "epoch": 0.38, + "learning_rate": 1.4157398403862794e-05, + "loss": 0.4727, + "step": 2209 + }, + { + "epoch": 0.38, + "learning_rate": 1.4152293735632498e-05, + "loss": 0.4846, + "step": 2210 + }, + { + "epoch": 0.38, + "learning_rate": 1.4147187759660377e-05, + "loss": 0.4676, + "step": 2211 + }, + { + "epoch": 0.38, + "learning_rate": 1.414208047755453e-05, + "loss": 0.4847, + "step": 2212 + }, + { + "epoch": 0.38, + "learning_rate": 1.4136971890923465e-05, + "loss": 0.4629, + "step": 2213 + }, + { + "epoch": 0.38, + "learning_rate": 1.4131862001376107e-05, + "loss": 0.4789, + "step": 2214 + }, + { + "epoch": 0.38, + "learning_rate": 1.4126750810521783e-05, + "loss": 0.4673, + "step": 2215 + }, + { + "epoch": 0.38, + "learning_rate": 1.4121638319970234e-05, + "loss": 0.4909, + "step": 2216 + }, + { + "epoch": 0.38, + "learning_rate": 1.4116524531331616e-05, + "loss": 0.4755, + "step": 2217 + }, + { + "epoch": 0.38, + "learning_rate": 1.4111409446216482e-05, + "loss": 0.4735, + "step": 2218 + }, + { + "epoch": 0.38, + "learning_rate": 1.4106293066235806e-05, + "loss": 0.4708, + "step": 2219 + }, + { + "epoch": 0.38, + "learning_rate": 1.410117539300096e-05, + "loss": 0.4804, + "step": 2220 + }, + { + "epoch": 0.38, + "learning_rate": 1.4096056428123721e-05, + "loss": 0.4605, + "step": 2221 + }, + { + "epoch": 0.38, + "learning_rate": 1.4090936173216289e-05, + "loss": 0.4638, + "step": 2222 + }, + { + "epoch": 0.39, + "learning_rate": 1.4085814629891252e-05, + "loss": 0.4724, + "step": 2223 + }, + { + "epoch": 0.39, + "learning_rate": 1.4080691799761618e-05, + "loss": 0.4739, + "step": 2224 + }, + { + "epoch": 0.39, + "learning_rate": 1.4075567684440788e-05, + "loss": 0.4719, + "step": 2225 + }, + { + "epoch": 0.39, + "learning_rate": 1.4070442285542579e-05, + "loss": 0.4873, + "step": 2226 + }, + { + "epoch": 0.39, + "learning_rate": 1.4065315604681198e-05, + "loss": 0.4739, + "step": 2227 + }, + { + "epoch": 0.39, + "learning_rate": 1.4060187643471276e-05, + "loss": 0.4852, + "step": 2228 + }, + { + "epoch": 0.39, + "learning_rate": 1.4055058403527828e-05, + "loss": 0.472, + "step": 2229 + }, + { + "epoch": 0.39, + "learning_rate": 1.4049927886466281e-05, + "loss": 0.4794, + "step": 2230 + }, + { + "epoch": 0.39, + "learning_rate": 1.4044796093902466e-05, + "loss": 0.4655, + "step": 2231 + }, + { + "epoch": 0.39, + "learning_rate": 1.403966302745261e-05, + "loss": 0.4765, + "step": 2232 + }, + { + "epoch": 0.39, + "learning_rate": 1.4034528688733344e-05, + "loss": 0.459, + "step": 2233 + }, + { + "epoch": 0.39, + "learning_rate": 1.4029393079361699e-05, + "loss": 0.4716, + "step": 2234 + }, + { + "epoch": 0.39, + "learning_rate": 1.402425620095511e-05, + "loss": 0.4742, + "step": 2235 + }, + { + "epoch": 0.39, + "learning_rate": 1.40191180551314e-05, + "loss": 0.4857, + "step": 2236 + }, + { + "epoch": 0.39, + "learning_rate": 1.4013978643508807e-05, + "loss": 0.4715, + "step": 2237 + }, + { + "epoch": 0.39, + "learning_rate": 1.4008837967705959e-05, + "loss": 0.4794, + "step": 2238 + }, + { + "epoch": 0.39, + "learning_rate": 1.4003696029341884e-05, + "loss": 0.4615, + "step": 2239 + }, + { + "epoch": 0.39, + "learning_rate": 1.3998552830036003e-05, + "loss": 0.4667, + "step": 2240 + }, + { + "epoch": 0.39, + "learning_rate": 1.399340837140814e-05, + "loss": 0.4716, + "step": 2241 + }, + { + "epoch": 0.39, + "learning_rate": 1.3988262655078514e-05, + "loss": 0.468, + "step": 2242 + }, + { + "epoch": 0.39, + "learning_rate": 1.3983115682667743e-05, + "loss": 0.4775, + "step": 2243 + }, + { + "epoch": 0.39, + "learning_rate": 1.3977967455796828e-05, + "loss": 0.4658, + "step": 2244 + }, + { + "epoch": 0.39, + "learning_rate": 1.3972817976087183e-05, + "loss": 0.459, + "step": 2245 + }, + { + "epoch": 0.39, + "learning_rate": 1.3967667245160608e-05, + "loss": 0.4804, + "step": 2246 + }, + { + "epoch": 0.39, + "learning_rate": 1.3962515264639291e-05, + "loss": 0.4736, + "step": 2247 + }, + { + "epoch": 0.39, + "learning_rate": 1.3957362036145826e-05, + "loss": 0.4788, + "step": 2248 + }, + { + "epoch": 0.39, + "learning_rate": 1.3952207561303188e-05, + "loss": 0.4674, + "step": 2249 + }, + { + "epoch": 0.39, + "learning_rate": 1.3947051841734756e-05, + "loss": 0.4654, + "step": 2250 + }, + { + "epoch": 0.39, + "learning_rate": 1.3941894879064289e-05, + "loss": 0.4666, + "step": 2251 + }, + { + "epoch": 0.39, + "learning_rate": 1.3936736674915947e-05, + "loss": 0.4889, + "step": 2252 + }, + { + "epoch": 0.39, + "learning_rate": 1.393157723091428e-05, + "loss": 0.4643, + "step": 2253 + }, + { + "epoch": 0.39, + "learning_rate": 1.3926416548684221e-05, + "loss": 0.4876, + "step": 2254 + }, + { + "epoch": 0.39, + "learning_rate": 1.3921254629851103e-05, + "loss": 0.4603, + "step": 2255 + }, + { + "epoch": 0.39, + "learning_rate": 1.391609147604064e-05, + "loss": 0.4882, + "step": 2256 + }, + { + "epoch": 0.39, + "learning_rate": 1.3910927088878943e-05, + "loss": 0.4752, + "step": 2257 + }, + { + "epoch": 0.39, + "learning_rate": 1.39057614699925e-05, + "loss": 0.4832, + "step": 2258 + }, + { + "epoch": 0.39, + "learning_rate": 1.3900594621008201e-05, + "loss": 0.4699, + "step": 2259 + }, + { + "epoch": 0.39, + "learning_rate": 1.3895426543553313e-05, + "loss": 0.4706, + "step": 2260 + }, + { + "epoch": 0.39, + "learning_rate": 1.3890257239255495e-05, + "loss": 0.4842, + "step": 2261 + }, + { + "epoch": 0.39, + "learning_rate": 1.3885086709742788e-05, + "loss": 0.4819, + "step": 2262 + }, + { + "epoch": 0.39, + "learning_rate": 1.3879914956643623e-05, + "loss": 0.4778, + "step": 2263 + }, + { + "epoch": 0.39, + "learning_rate": 1.3874741981586814e-05, + "loss": 0.4678, + "step": 2264 + }, + { + "epoch": 0.39, + "learning_rate": 1.3869567786201562e-05, + "loss": 0.4702, + "step": 2265 + }, + { + "epoch": 0.39, + "learning_rate": 1.386439237211745e-05, + "loss": 0.483, + "step": 2266 + }, + { + "epoch": 0.39, + "learning_rate": 1.3859215740964446e-05, + "loss": 0.4682, + "step": 2267 + }, + { + "epoch": 0.39, + "learning_rate": 1.3854037894372905e-05, + "loss": 0.4867, + "step": 2268 + }, + { + "epoch": 0.39, + "learning_rate": 1.3848858833973555e-05, + "loss": 0.4761, + "step": 2269 + }, + { + "epoch": 0.39, + "learning_rate": 1.3843678561397517e-05, + "loss": 0.4568, + "step": 2270 + }, + { + "epoch": 0.39, + "learning_rate": 1.3838497078276288e-05, + "loss": 0.4671, + "step": 2271 + }, + { + "epoch": 0.39, + "learning_rate": 1.3833314386241744e-05, + "loss": 0.4686, + "step": 2272 + }, + { + "epoch": 0.39, + "learning_rate": 1.3828130486926145e-05, + "loss": 0.468, + "step": 2273 + }, + { + "epoch": 0.39, + "learning_rate": 1.382294538196214e-05, + "loss": 0.4744, + "step": 2274 + }, + { + "epoch": 0.39, + "learning_rate": 1.3817759072982737e-05, + "loss": 0.4769, + "step": 2275 + }, + { + "epoch": 0.39, + "learning_rate": 1.3812571561621341e-05, + "loss": 0.4793, + "step": 2276 + }, + { + "epoch": 0.39, + "learning_rate": 1.3807382849511732e-05, + "loss": 0.4674, + "step": 2277 + }, + { + "epoch": 0.39, + "learning_rate": 1.3802192938288063e-05, + "loss": 0.4612, + "step": 2278 + }, + { + "epoch": 0.39, + "learning_rate": 1.3797001829584868e-05, + "loss": 0.4749, + "step": 2279 + }, + { + "epoch": 0.39, + "learning_rate": 1.3791809525037057e-05, + "loss": 0.4634, + "step": 2280 + }, + { + "epoch": 0.4, + "learning_rate": 1.3786616026279922e-05, + "loss": 0.4755, + "step": 2281 + }, + { + "epoch": 0.4, + "learning_rate": 1.378142133494912e-05, + "loss": 0.475, + "step": 2282 + }, + { + "epoch": 0.4, + "learning_rate": 1.3776225452680696e-05, + "loss": 0.4809, + "step": 2283 + }, + { + "epoch": 0.4, + "learning_rate": 1.377102838111106e-05, + "loss": 0.4685, + "step": 2284 + }, + { + "epoch": 0.4, + "learning_rate": 1.3765830121877004e-05, + "loss": 0.4739, + "step": 2285 + }, + { + "epoch": 0.4, + "learning_rate": 1.3760630676615685e-05, + "loss": 0.4748, + "step": 2286 + }, + { + "epoch": 0.4, + "learning_rate": 1.3755430046964649e-05, + "loss": 0.4793, + "step": 2287 + }, + { + "epoch": 0.4, + "learning_rate": 1.3750228234561796e-05, + "loss": 0.4773, + "step": 2288 + }, + { + "epoch": 0.4, + "learning_rate": 1.3745025241045414e-05, + "loss": 0.4722, + "step": 2289 + }, + { + "epoch": 0.4, + "learning_rate": 1.3739821068054153e-05, + "loss": 0.4733, + "step": 2290 + }, + { + "epoch": 0.4, + "learning_rate": 1.373461571722704e-05, + "loss": 0.473, + "step": 2291 + }, + { + "epoch": 0.4, + "learning_rate": 1.3729409190203475e-05, + "loss": 0.4796, + "step": 2292 + }, + { + "epoch": 0.4, + "learning_rate": 1.3724201488623216e-05, + "loss": 0.4749, + "step": 2293 + }, + { + "epoch": 0.4, + "learning_rate": 1.371899261412641e-05, + "loss": 0.4794, + "step": 2294 + }, + { + "epoch": 0.4, + "learning_rate": 1.3713782568353553e-05, + "loss": 0.4751, + "step": 2295 + }, + { + "epoch": 0.4, + "learning_rate": 1.3708571352945527e-05, + "loss": 0.48, + "step": 2296 + }, + { + "epoch": 0.4, + "learning_rate": 1.3703358969543575e-05, + "loss": 0.4806, + "step": 2297 + }, + { + "epoch": 0.4, + "learning_rate": 1.3698145419789302e-05, + "loss": 0.4642, + "step": 2298 + }, + { + "epoch": 0.4, + "learning_rate": 1.3692930705324697e-05, + "loss": 0.4907, + "step": 2299 + }, + { + "epoch": 0.4, + "learning_rate": 1.3687714827792093e-05, + "loss": 0.4663, + "step": 2300 + }, + { + "epoch": 0.4, + "learning_rate": 1.368249778883421e-05, + "loss": 0.476, + "step": 2301 + }, + { + "epoch": 0.4, + "learning_rate": 1.3677279590094123e-05, + "loss": 0.47, + "step": 2302 + }, + { + "epoch": 0.4, + "learning_rate": 1.3672060233215277e-05, + "loss": 0.4694, + "step": 2303 + }, + { + "epoch": 0.4, + "learning_rate": 1.3666839719841473e-05, + "loss": 0.4755, + "step": 2304 + }, + { + "epoch": 0.4, + "learning_rate": 1.3661618051616893e-05, + "loss": 0.4808, + "step": 2305 + }, + { + "epoch": 0.4, + "learning_rate": 1.3656395230186062e-05, + "loss": 0.4644, + "step": 2306 + }, + { + "epoch": 0.4, + "learning_rate": 1.3651171257193883e-05, + "loss": 0.4786, + "step": 2307 + }, + { + "epoch": 0.4, + "learning_rate": 1.3645946134285617e-05, + "loss": 0.4711, + "step": 2308 + }, + { + "epoch": 0.4, + "learning_rate": 1.3640719863106888e-05, + "loss": 0.4827, + "step": 2309 + }, + { + "epoch": 0.4, + "learning_rate": 1.3635492445303679e-05, + "loss": 0.4798, + "step": 2310 + }, + { + "epoch": 0.4, + "learning_rate": 1.3630263882522341e-05, + "loss": 0.4772, + "step": 2311 + }, + { + "epoch": 0.4, + "learning_rate": 1.3625034176409577e-05, + "loss": 0.4721, + "step": 2312 + }, + { + "epoch": 0.4, + "learning_rate": 1.3619803328612454e-05, + "loss": 0.49, + "step": 2313 + }, + { + "epoch": 0.4, + "learning_rate": 1.3614571340778398e-05, + "loss": 0.4814, + "step": 2314 + }, + { + "epoch": 0.4, + "learning_rate": 1.3609338214555195e-05, + "loss": 0.4737, + "step": 2315 + }, + { + "epoch": 0.4, + "learning_rate": 1.3604103951590993e-05, + "loss": 0.464, + "step": 2316 + }, + { + "epoch": 0.4, + "learning_rate": 1.3598868553534286e-05, + "loss": 0.4787, + "step": 2317 + }, + { + "epoch": 0.4, + "learning_rate": 1.359363202203394e-05, + "loss": 0.4692, + "step": 2318 + }, + { + "epoch": 0.4, + "learning_rate": 1.3588394358739167e-05, + "loss": 0.4746, + "step": 2319 + }, + { + "epoch": 0.4, + "learning_rate": 1.3583155565299544e-05, + "loss": 0.4711, + "step": 2320 + }, + { + "epoch": 0.4, + "learning_rate": 1.3577915643364997e-05, + "loss": 0.4707, + "step": 2321 + }, + { + "epoch": 0.4, + "learning_rate": 1.3572674594585813e-05, + "loss": 0.4784, + "step": 2322 + }, + { + "epoch": 0.4, + "learning_rate": 1.356743242061263e-05, + "loss": 0.4633, + "step": 2323 + }, + { + "epoch": 0.4, + "learning_rate": 1.3562189123096439e-05, + "loss": 0.4693, + "step": 2324 + }, + { + "epoch": 0.4, + "learning_rate": 1.3556944703688592e-05, + "loss": 0.4733, + "step": 2325 + }, + { + "epoch": 0.4, + "learning_rate": 1.3551699164040786e-05, + "loss": 0.4557, + "step": 2326 + }, + { + "epoch": 0.4, + "learning_rate": 1.3546452505805076e-05, + "loss": 0.4625, + "step": 2327 + }, + { + "epoch": 0.4, + "learning_rate": 1.3541204730633864e-05, + "loss": 0.473, + "step": 2328 + }, + { + "epoch": 0.4, + "learning_rate": 1.3535955840179918e-05, + "loss": 0.4798, + "step": 2329 + }, + { + "epoch": 0.4, + "learning_rate": 1.3530705836096333e-05, + "loss": 0.4713, + "step": 2330 + }, + { + "epoch": 0.4, + "learning_rate": 1.3525454720036581e-05, + "loss": 0.4861, + "step": 2331 + }, + { + "epoch": 0.4, + "learning_rate": 1.3520202493654466e-05, + "loss": 0.4849, + "step": 2332 + }, + { + "epoch": 0.4, + "learning_rate": 1.3514949158604147e-05, + "loss": 0.4749, + "step": 2333 + }, + { + "epoch": 0.4, + "learning_rate": 1.3509694716540135e-05, + "loss": 0.4686, + "step": 2334 + }, + { + "epoch": 0.4, + "learning_rate": 1.3504439169117283e-05, + "loss": 0.4771, + "step": 2335 + }, + { + "epoch": 0.4, + "learning_rate": 1.34991825179908e-05, + "loss": 0.4818, + "step": 2336 + }, + { + "epoch": 0.4, + "learning_rate": 1.349392476481624e-05, + "loss": 0.4784, + "step": 2337 + }, + { + "epoch": 0.41, + "learning_rate": 1.3488665911249503e-05, + "loss": 0.482, + "step": 2338 + }, + { + "epoch": 0.41, + "learning_rate": 1.348340595894683e-05, + "loss": 0.4707, + "step": 2339 + }, + { + "epoch": 0.41, + "learning_rate": 1.3478144909564824e-05, + "loss": 0.4692, + "step": 2340 + }, + { + "epoch": 0.41, + "learning_rate": 1.3472882764760414e-05, + "loss": 0.4772, + "step": 2341 + }, + { + "epoch": 0.41, + "learning_rate": 1.3467619526190885e-05, + "loss": 0.4598, + "step": 2342 + }, + { + "epoch": 0.41, + "learning_rate": 1.3462355195513868e-05, + "loss": 0.4806, + "step": 2343 + }, + { + "epoch": 0.41, + "learning_rate": 1.3457089774387333e-05, + "loss": 0.4645, + "step": 2344 + }, + { + "epoch": 0.41, + "learning_rate": 1.3451823264469595e-05, + "loss": 0.4834, + "step": 2345 + }, + { + "epoch": 0.41, + "learning_rate": 1.344655566741931e-05, + "loss": 0.4737, + "step": 2346 + }, + { + "epoch": 0.41, + "learning_rate": 1.3441286984895486e-05, + "loss": 0.4835, + "step": 2347 + }, + { + "epoch": 0.41, + "learning_rate": 1.3436017218557453e-05, + "loss": 0.4712, + "step": 2348 + }, + { + "epoch": 0.41, + "learning_rate": 1.3430746370064904e-05, + "loss": 0.461, + "step": 2349 + }, + { + "epoch": 0.41, + "learning_rate": 1.342547444107786e-05, + "loss": 0.4763, + "step": 2350 + }, + { + "epoch": 0.41, + "learning_rate": 1.342020143325669e-05, + "loss": 0.4925, + "step": 2351 + }, + { + "epoch": 0.41, + "learning_rate": 1.341492734826209e-05, + "loss": 0.4636, + "step": 2352 + }, + { + "epoch": 0.41, + "learning_rate": 1.3409652187755114e-05, + "loss": 0.4786, + "step": 2353 + }, + { + "epoch": 0.41, + "learning_rate": 1.3404375953397136e-05, + "loss": 0.4604, + "step": 2354 + }, + { + "epoch": 0.41, + "learning_rate": 1.339909864684988e-05, + "loss": 0.48, + "step": 2355 + }, + { + "epoch": 0.41, + "learning_rate": 1.3393820269775405e-05, + "loss": 0.471, + "step": 2356 + }, + { + "epoch": 0.41, + "learning_rate": 1.3388540823836103e-05, + "loss": 0.4639, + "step": 2357 + }, + { + "epoch": 0.41, + "learning_rate": 1.3383260310694712e-05, + "loss": 0.4765, + "step": 2358 + }, + { + "epoch": 0.41, + "learning_rate": 1.3377978732014295e-05, + "loss": 0.4856, + "step": 2359 + }, + { + "epoch": 0.41, + "learning_rate": 1.3372696089458264e-05, + "loss": 0.4787, + "step": 2360 + }, + { + "epoch": 0.41, + "learning_rate": 1.3367412384690346e-05, + "loss": 0.4897, + "step": 2361 + }, + { + "epoch": 0.41, + "learning_rate": 1.3362127619374622e-05, + "loss": 0.4831, + "step": 2362 + }, + { + "epoch": 0.41, + "learning_rate": 1.3356841795175494e-05, + "loss": 0.4737, + "step": 2363 + }, + { + "epoch": 0.41, + "learning_rate": 1.3351554913757712e-05, + "loss": 0.4593, + "step": 2364 + }, + { + "epoch": 0.41, + "learning_rate": 1.3346266976786341e-05, + "loss": 0.4735, + "step": 2365 + }, + { + "epoch": 0.41, + "learning_rate": 1.3340977985926793e-05, + "loss": 0.4674, + "step": 2366 + }, + { + "epoch": 0.41, + "learning_rate": 1.3335687942844806e-05, + "loss": 0.4677, + "step": 2367 + }, + { + "epoch": 0.41, + "learning_rate": 1.3330396849206447e-05, + "loss": 0.4696, + "step": 2368 + }, + { + "epoch": 0.41, + "learning_rate": 1.3325104706678116e-05, + "loss": 0.475, + "step": 2369 + }, + { + "epoch": 0.41, + "learning_rate": 1.3319811516926541e-05, + "loss": 0.4709, + "step": 2370 + }, + { + "epoch": 0.41, + "learning_rate": 1.3314517281618794e-05, + "loss": 0.4784, + "step": 2371 + }, + { + "epoch": 0.41, + "learning_rate": 1.3309222002422255e-05, + "loss": 0.4635, + "step": 2372 + }, + { + "epoch": 0.41, + "learning_rate": 1.3303925681004649e-05, + "loss": 0.4752, + "step": 2373 + }, + { + "epoch": 0.41, + "learning_rate": 1.3298628319034014e-05, + "loss": 0.4705, + "step": 2374 + }, + { + "epoch": 0.41, + "learning_rate": 1.3293329918178737e-05, + "loss": 0.4671, + "step": 2375 + }, + { + "epoch": 0.41, + "learning_rate": 1.3288030480107508e-05, + "loss": 0.4841, + "step": 2376 + }, + { + "epoch": 0.41, + "learning_rate": 1.3282730006489361e-05, + "loss": 0.4869, + "step": 2377 + }, + { + "epoch": 0.41, + "learning_rate": 1.327742849899365e-05, + "loss": 0.4633, + "step": 2378 + }, + { + "epoch": 0.41, + "learning_rate": 1.3272125959290059e-05, + "loss": 0.478, + "step": 2379 + }, + { + "epoch": 0.41, + "learning_rate": 1.326682238904859e-05, + "loss": 0.4815, + "step": 2380 + }, + { + "epoch": 0.41, + "learning_rate": 1.326151778993957e-05, + "loss": 0.4784, + "step": 2381 + }, + { + "epoch": 0.41, + "learning_rate": 1.325621216363366e-05, + "loss": 0.4718, + "step": 2382 + }, + { + "epoch": 0.41, + "learning_rate": 1.3250905511801831e-05, + "loss": 0.472, + "step": 2383 + }, + { + "epoch": 0.41, + "learning_rate": 1.3245597836115386e-05, + "loss": 0.4734, + "step": 2384 + }, + { + "epoch": 0.41, + "learning_rate": 1.3240289138245949e-05, + "loss": 0.4684, + "step": 2385 + }, + { + "epoch": 0.41, + "learning_rate": 1.3234979419865466e-05, + "loss": 0.4808, + "step": 2386 + }, + { + "epoch": 0.41, + "learning_rate": 1.3229668682646197e-05, + "loss": 0.4775, + "step": 2387 + }, + { + "epoch": 0.41, + "learning_rate": 1.3224356928260735e-05, + "loss": 0.467, + "step": 2388 + }, + { + "epoch": 0.41, + "learning_rate": 1.3219044158381988e-05, + "loss": 0.4793, + "step": 2389 + }, + { + "epoch": 0.41, + "learning_rate": 1.321373037468318e-05, + "loss": 0.4763, + "step": 2390 + }, + { + "epoch": 0.41, + "learning_rate": 1.3208415578837859e-05, + "loss": 0.4783, + "step": 2391 + }, + { + "epoch": 0.41, + "learning_rate": 1.3203099772519889e-05, + "loss": 0.4776, + "step": 2392 + }, + { + "epoch": 0.41, + "learning_rate": 1.3197782957403458e-05, + "loss": 0.4723, + "step": 2393 + }, + { + "epoch": 0.41, + "learning_rate": 1.3192465135163062e-05, + "loss": 0.4635, + "step": 2394 + }, + { + "epoch": 0.41, + "learning_rate": 1.3187146307473521e-05, + "loss": 0.4769, + "step": 2395 + }, + { + "epoch": 0.42, + "learning_rate": 1.3181826476009974e-05, + "loss": 0.4628, + "step": 2396 + }, + { + "epoch": 0.42, + "learning_rate": 1.317650564244787e-05, + "loss": 0.4688, + "step": 2397 + }, + { + "epoch": 0.42, + "learning_rate": 1.3171183808462969e-05, + "loss": 0.4708, + "step": 2398 + }, + { + "epoch": 0.42, + "learning_rate": 1.3165860975731363e-05, + "loss": 0.4826, + "step": 2399 + }, + { + "epoch": 0.42, + "learning_rate": 1.3160537145929447e-05, + "loss": 0.4562, + "step": 2400 + }, + { + "epoch": 0.42, + "learning_rate": 1.3155212320733925e-05, + "loss": 0.4821, + "step": 2401 + }, + { + "epoch": 0.42, + "learning_rate": 1.3149886501821831e-05, + "loss": 0.4732, + "step": 2402 + }, + { + "epoch": 0.42, + "learning_rate": 1.3144559690870494e-05, + "loss": 0.4798, + "step": 2403 + }, + { + "epoch": 0.42, + "learning_rate": 1.3139231889557568e-05, + "loss": 0.4572, + "step": 2404 + }, + { + "epoch": 0.42, + "learning_rate": 1.313390309956101e-05, + "loss": 0.4702, + "step": 2405 + }, + { + "epoch": 0.42, + "learning_rate": 1.3128573322559097e-05, + "loss": 0.4806, + "step": 2406 + }, + { + "epoch": 0.42, + "learning_rate": 1.3123242560230408e-05, + "loss": 0.4686, + "step": 2407 + }, + { + "epoch": 0.42, + "learning_rate": 1.3117910814253845e-05, + "loss": 0.4713, + "step": 2408 + }, + { + "epoch": 0.42, + "learning_rate": 1.3112578086308602e-05, + "loss": 0.4625, + "step": 2409 + }, + { + "epoch": 0.42, + "learning_rate": 1.3107244378074197e-05, + "loss": 0.4679, + "step": 2410 + }, + { + "epoch": 0.42, + "learning_rate": 1.3101909691230456e-05, + "loss": 0.4695, + "step": 2411 + }, + { + "epoch": 0.42, + "learning_rate": 1.3096574027457503e-05, + "loss": 0.4692, + "step": 2412 + }, + { + "epoch": 0.42, + "learning_rate": 1.3091237388435773e-05, + "loss": 0.4807, + "step": 2413 + }, + { + "epoch": 0.42, + "learning_rate": 1.3085899775846018e-05, + "loss": 0.4654, + "step": 2414 + }, + { + "epoch": 0.42, + "learning_rate": 1.3080561191369286e-05, + "loss": 0.4757, + "step": 2415 + }, + { + "epoch": 0.42, + "learning_rate": 1.3075221636686935e-05, + "loss": 0.465, + "step": 2416 + }, + { + "epoch": 0.42, + "learning_rate": 1.3069881113480629e-05, + "loss": 0.4844, + "step": 2417 + }, + { + "epoch": 0.42, + "learning_rate": 1.3064539623432331e-05, + "loss": 0.4861, + "step": 2418 + }, + { + "epoch": 0.42, + "learning_rate": 1.305919716822432e-05, + "loss": 0.4714, + "step": 2419 + }, + { + "epoch": 0.42, + "learning_rate": 1.3053853749539169e-05, + "loss": 0.4671, + "step": 2420 + }, + { + "epoch": 0.42, + "learning_rate": 1.3048509369059762e-05, + "loss": 0.4642, + "step": 2421 + }, + { + "epoch": 0.42, + "learning_rate": 1.3043164028469274e-05, + "loss": 0.4752, + "step": 2422 + }, + { + "epoch": 0.42, + "learning_rate": 1.3037817729451199e-05, + "loss": 0.4585, + "step": 2423 + }, + { + "epoch": 0.42, + "learning_rate": 1.3032470473689322e-05, + "loss": 0.4709, + "step": 2424 + }, + { + "epoch": 0.42, + "learning_rate": 1.3027122262867727e-05, + "loss": 0.4715, + "step": 2425 + }, + { + "epoch": 0.42, + "learning_rate": 1.3021773098670804e-05, + "loss": 0.4692, + "step": 2426 + }, + { + "epoch": 0.42, + "learning_rate": 1.301642298278325e-05, + "loss": 0.4687, + "step": 2427 + }, + { + "epoch": 0.42, + "learning_rate": 1.3011071916890049e-05, + "loss": 0.479, + "step": 2428 + }, + { + "epoch": 0.42, + "learning_rate": 1.3005719902676483e-05, + "loss": 0.4872, + "step": 2429 + }, + { + "epoch": 0.42, + "learning_rate": 1.300036694182815e-05, + "loss": 0.4764, + "step": 2430 + }, + { + "epoch": 0.42, + "learning_rate": 1.2995013036030932e-05, + "loss": 0.4736, + "step": 2431 + }, + { + "epoch": 0.42, + "learning_rate": 1.2989658186971007e-05, + "loss": 0.4796, + "step": 2432 + }, + { + "epoch": 0.42, + "learning_rate": 1.298430239633486e-05, + "loss": 0.4766, + "step": 2433 + }, + { + "epoch": 0.42, + "learning_rate": 1.2978945665809267e-05, + "loss": 0.4731, + "step": 2434 + }, + { + "epoch": 0.42, + "learning_rate": 1.2973587997081298e-05, + "loss": 0.4743, + "step": 2435 + }, + { + "epoch": 0.42, + "learning_rate": 1.2968229391838322e-05, + "loss": 0.4577, + "step": 2436 + }, + { + "epoch": 0.42, + "learning_rate": 1.2962869851768008e-05, + "loss": 0.4817, + "step": 2437 + }, + { + "epoch": 0.42, + "learning_rate": 1.2957509378558301e-05, + "loss": 0.4717, + "step": 2438 + }, + { + "epoch": 0.42, + "learning_rate": 1.2952147973897464e-05, + "loss": 0.475, + "step": 2439 + }, + { + "epoch": 0.42, + "learning_rate": 1.2946785639474034e-05, + "loss": 0.4607, + "step": 2440 + }, + { + "epoch": 0.42, + "learning_rate": 1.2941422376976851e-05, + "loss": 0.4735, + "step": 2441 + }, + { + "epoch": 0.42, + "learning_rate": 1.2936058188095045e-05, + "loss": 0.4658, + "step": 2442 + }, + { + "epoch": 0.42, + "learning_rate": 1.2930693074518038e-05, + "loss": 0.4786, + "step": 2443 + }, + { + "epoch": 0.42, + "learning_rate": 1.292532703793554e-05, + "loss": 0.4759, + "step": 2444 + }, + { + "epoch": 0.42, + "learning_rate": 1.2919960080037557e-05, + "loss": 0.4765, + "step": 2445 + }, + { + "epoch": 0.42, + "learning_rate": 1.2914592202514385e-05, + "loss": 0.4822, + "step": 2446 + }, + { + "epoch": 0.42, + "learning_rate": 1.2909223407056599e-05, + "loss": 0.4684, + "step": 2447 + }, + { + "epoch": 0.42, + "learning_rate": 1.290385369535508e-05, + "loss": 0.4725, + "step": 2448 + }, + { + "epoch": 0.42, + "learning_rate": 1.2898483069100982e-05, + "loss": 0.4593, + "step": 2449 + }, + { + "epoch": 0.42, + "learning_rate": 1.2893111529985761e-05, + "loss": 0.4803, + "step": 2450 + }, + { + "epoch": 0.42, + "learning_rate": 1.2887739079701147e-05, + "loss": 0.4645, + "step": 2451 + }, + { + "epoch": 0.42, + "learning_rate": 1.2882365719939167e-05, + "loss": 0.4601, + "step": 2452 + }, + { + "epoch": 0.42, + "learning_rate": 1.2876991452392124e-05, + "loss": 0.4691, + "step": 2453 + }, + { + "epoch": 0.43, + "learning_rate": 1.2871616278752628e-05, + "loss": 0.482, + "step": 2454 + }, + { + "epoch": 0.43, + "learning_rate": 1.2866240200713544e-05, + "loss": 0.4746, + "step": 2455 + }, + { + "epoch": 0.43, + "learning_rate": 1.2860863219968049e-05, + "loss": 0.4694, + "step": 2456 + }, + { + "epoch": 0.43, + "learning_rate": 1.285548533820959e-05, + "loss": 0.4724, + "step": 2457 + }, + { + "epoch": 0.43, + "learning_rate": 1.2850106557131898e-05, + "loss": 0.4836, + "step": 2458 + }, + { + "epoch": 0.43, + "learning_rate": 1.2844726878428993e-05, + "loss": 0.4568, + "step": 2459 + }, + { + "epoch": 0.43, + "learning_rate": 1.2839346303795173e-05, + "loss": 0.4723, + "step": 2460 + }, + { + "epoch": 0.43, + "learning_rate": 1.2833964834925024e-05, + "loss": 0.4683, + "step": 2461 + }, + { + "epoch": 0.43, + "learning_rate": 1.2828582473513405e-05, + "loss": 0.4734, + "step": 2462 + }, + { + "epoch": 0.43, + "learning_rate": 1.2823199221255467e-05, + "loss": 0.4639, + "step": 2463 + }, + { + "epoch": 0.43, + "learning_rate": 1.2817815079846627e-05, + "loss": 0.4841, + "step": 2464 + }, + { + "epoch": 0.43, + "learning_rate": 1.2812430050982596e-05, + "loss": 0.4697, + "step": 2465 + }, + { + "epoch": 0.43, + "learning_rate": 1.2807044136359358e-05, + "loss": 0.4852, + "step": 2466 + }, + { + "epoch": 0.43, + "learning_rate": 1.2801657337673176e-05, + "loss": 0.4572, + "step": 2467 + }, + { + "epoch": 0.43, + "learning_rate": 1.2796269656620593e-05, + "loss": 0.4676, + "step": 2468 + }, + { + "epoch": 0.43, + "learning_rate": 1.2790881094898428e-05, + "loss": 0.4739, + "step": 2469 + }, + { + "epoch": 0.43, + "learning_rate": 1.2785491654203781e-05, + "loss": 0.465, + "step": 2470 + }, + { + "epoch": 0.43, + "learning_rate": 1.2780101336234024e-05, + "loss": 0.4632, + "step": 2471 + }, + { + "epoch": 0.43, + "learning_rate": 1.277471014268681e-05, + "loss": 0.4684, + "step": 2472 + }, + { + "epoch": 0.43, + "learning_rate": 1.2769318075260064e-05, + "loss": 0.466, + "step": 2473 + }, + { + "epoch": 0.43, + "learning_rate": 1.2763925135651984e-05, + "loss": 0.4665, + "step": 2474 + }, + { + "epoch": 0.43, + "learning_rate": 1.2758531325561055e-05, + "loss": 0.4725, + "step": 2475 + }, + { + "epoch": 0.43, + "learning_rate": 1.275313664668602e-05, + "loss": 0.4775, + "step": 2476 + }, + { + "epoch": 0.43, + "learning_rate": 1.2747741100725906e-05, + "loss": 0.4611, + "step": 2477 + }, + { + "epoch": 0.43, + "learning_rate": 1.274234468938001e-05, + "loss": 0.4672, + "step": 2478 + }, + { + "epoch": 0.43, + "learning_rate": 1.27369474143479e-05, + "loss": 0.4659, + "step": 2479 + }, + { + "epoch": 0.43, + "learning_rate": 1.273154927732942e-05, + "loss": 0.4725, + "step": 2480 + }, + { + "epoch": 0.43, + "learning_rate": 1.2726150280024683e-05, + "loss": 0.4698, + "step": 2481 + }, + { + "epoch": 0.43, + "learning_rate": 1.2720750424134073e-05, + "loss": 0.4636, + "step": 2482 + }, + { + "epoch": 0.43, + "learning_rate": 1.2715349711358245e-05, + "loss": 0.4809, + "step": 2483 + }, + { + "epoch": 0.43, + "learning_rate": 1.270994814339812e-05, + "loss": 0.4763, + "step": 2484 + }, + { + "epoch": 0.43, + "learning_rate": 1.27045457219549e-05, + "loss": 0.476, + "step": 2485 + }, + { + "epoch": 0.43, + "learning_rate": 1.2699142448730037e-05, + "loss": 0.4695, + "step": 2486 + }, + { + "epoch": 0.43, + "learning_rate": 1.2693738325425272e-05, + "loss": 0.4781, + "step": 2487 + }, + { + "epoch": 0.43, + "learning_rate": 1.268833335374259e-05, + "loss": 0.4728, + "step": 2488 + }, + { + "epoch": 0.43, + "learning_rate": 1.2682927535384273e-05, + "loss": 0.4731, + "step": 2489 + }, + { + "epoch": 0.43, + "learning_rate": 1.2677520872052843e-05, + "loss": 0.4733, + "step": 2490 + }, + { + "epoch": 0.43, + "learning_rate": 1.2672113365451102e-05, + "loss": 0.4683, + "step": 2491 + }, + { + "epoch": 0.43, + "learning_rate": 1.2666705017282115e-05, + "loss": 0.4621, + "step": 2492 + }, + { + "epoch": 0.43, + "learning_rate": 1.2661295829249207e-05, + "loss": 0.4603, + "step": 2493 + }, + { + "epoch": 0.43, + "learning_rate": 1.2655885803055978e-05, + "loss": 0.4765, + "step": 2494 + }, + { + "epoch": 0.43, + "learning_rate": 1.2650474940406279e-05, + "loss": 0.465, + "step": 2495 + }, + { + "epoch": 0.43, + "learning_rate": 1.2645063243004236e-05, + "loss": 0.4827, + "step": 2496 + }, + { + "epoch": 0.43, + "learning_rate": 1.263965071255423e-05, + "loss": 0.4617, + "step": 2497 + }, + { + "epoch": 0.43, + "learning_rate": 1.2634237350760912e-05, + "loss": 0.479, + "step": 2498 + }, + { + "epoch": 0.43, + "learning_rate": 1.2628823159329182e-05, + "loss": 0.4593, + "step": 2499 + }, + { + "epoch": 0.43, + "learning_rate": 1.2623408139964216e-05, + "loss": 0.47, + "step": 2500 + }, + { + "epoch": 0.43, + "learning_rate": 1.2617992294371444e-05, + "loss": 0.4627, + "step": 2501 + }, + { + "epoch": 0.43, + "learning_rate": 1.2612575624256552e-05, + "loss": 0.4814, + "step": 2502 + }, + { + "epoch": 0.43, + "learning_rate": 1.2607158131325494e-05, + "loss": 0.4612, + "step": 2503 + }, + { + "epoch": 0.43, + "learning_rate": 1.260173981728448e-05, + "loss": 0.4736, + "step": 2504 + }, + { + "epoch": 0.43, + "learning_rate": 1.2596320683839976e-05, + "loss": 0.4656, + "step": 2505 + }, + { + "epoch": 0.43, + "learning_rate": 1.2590900732698707e-05, + "loss": 0.4845, + "step": 2506 + }, + { + "epoch": 0.43, + "learning_rate": 1.258547996556766e-05, + "loss": 0.4808, + "step": 2507 + }, + { + "epoch": 0.43, + "learning_rate": 1.258005838415407e-05, + "loss": 0.4772, + "step": 2508 + }, + { + "epoch": 0.43, + "learning_rate": 1.2574635990165438e-05, + "loss": 0.4645, + "step": 2509 + }, + { + "epoch": 0.43, + "learning_rate": 1.2569212785309517e-05, + "loss": 0.4762, + "step": 2510 + }, + { + "epoch": 0.43, + "learning_rate": 1.2563788771294316e-05, + "loss": 0.4611, + "step": 2511 + }, + { + "epoch": 0.44, + "learning_rate": 1.2558363949828092e-05, + "loss": 0.4709, + "step": 2512 + }, + { + "epoch": 0.44, + "learning_rate": 1.2552938322619368e-05, + "loss": 0.4674, + "step": 2513 + }, + { + "epoch": 0.44, + "learning_rate": 1.2547511891376916e-05, + "loss": 0.4781, + "step": 2514 + }, + { + "epoch": 0.44, + "learning_rate": 1.2542084657809754e-05, + "loss": 0.471, + "step": 2515 + }, + { + "epoch": 0.44, + "learning_rate": 1.2536656623627167e-05, + "loss": 0.4734, + "step": 2516 + }, + { + "epoch": 0.44, + "learning_rate": 1.2531227790538675e-05, + "loss": 0.4711, + "step": 2517 + }, + { + "epoch": 0.44, + "learning_rate": 1.252579816025407e-05, + "loss": 0.4753, + "step": 2518 + }, + { + "epoch": 0.44, + "learning_rate": 1.2520367734483376e-05, + "loss": 0.4738, + "step": 2519 + }, + { + "epoch": 0.44, + "learning_rate": 1.2514936514936878e-05, + "loss": 0.4843, + "step": 2520 + }, + { + "epoch": 0.44, + "learning_rate": 1.2509504503325106e-05, + "loss": 0.4699, + "step": 2521 + }, + { + "epoch": 0.44, + "learning_rate": 1.2504071701358842e-05, + "loss": 0.464, + "step": 2522 + }, + { + "epoch": 0.44, + "learning_rate": 1.2498638110749122e-05, + "loss": 0.4696, + "step": 2523 + }, + { + "epoch": 0.44, + "learning_rate": 1.2493203733207219e-05, + "loss": 0.4725, + "step": 2524 + }, + { + "epoch": 0.44, + "learning_rate": 1.2487768570444665e-05, + "loss": 0.4738, + "step": 2525 + }, + { + "epoch": 0.44, + "learning_rate": 1.2482332624173227e-05, + "loss": 0.4762, + "step": 2526 + }, + { + "epoch": 0.44, + "learning_rate": 1.2476895896104937e-05, + "loss": 0.4761, + "step": 2527 + }, + { + "epoch": 0.44, + "learning_rate": 1.2471458387952053e-05, + "loss": 0.4705, + "step": 2528 + }, + { + "epoch": 0.44, + "learning_rate": 1.2466020101427092e-05, + "loss": 0.4712, + "step": 2529 + }, + { + "epoch": 0.44, + "learning_rate": 1.246058103824281e-05, + "loss": 0.4657, + "step": 2530 + }, + { + "epoch": 0.44, + "learning_rate": 1.245514120011221e-05, + "loss": 0.4795, + "step": 2531 + }, + { + "epoch": 0.44, + "learning_rate": 1.2449700588748541e-05, + "loss": 0.4813, + "step": 2532 + }, + { + "epoch": 0.44, + "learning_rate": 1.2444259205865295e-05, + "loss": 0.4658, + "step": 2533 + }, + { + "epoch": 0.44, + "learning_rate": 1.2438817053176198e-05, + "loss": 0.4812, + "step": 2534 + }, + { + "epoch": 0.44, + "learning_rate": 1.243337413239523e-05, + "loss": 0.4641, + "step": 2535 + }, + { + "epoch": 0.44, + "learning_rate": 1.2427930445236611e-05, + "loss": 0.4854, + "step": 2536 + }, + { + "epoch": 0.44, + "learning_rate": 1.2422485993414795e-05, + "loss": 0.4503, + "step": 2537 + }, + { + "epoch": 0.44, + "learning_rate": 1.2417040778644487e-05, + "loss": 0.4716, + "step": 2538 + }, + { + "epoch": 0.44, + "learning_rate": 1.2411594802640621e-05, + "loss": 0.4662, + "step": 2539 + }, + { + "epoch": 0.44, + "learning_rate": 1.2406148067118387e-05, + "loss": 0.4983, + "step": 2540 + }, + { + "epoch": 0.44, + "learning_rate": 1.2400700573793191e-05, + "loss": 0.4679, + "step": 2541 + }, + { + "epoch": 0.44, + "learning_rate": 1.2395252324380701e-05, + "loss": 0.4713, + "step": 2542 + }, + { + "epoch": 0.44, + "learning_rate": 1.2389803320596806e-05, + "loss": 0.4604, + "step": 2543 + }, + { + "epoch": 0.44, + "learning_rate": 1.2384353564157646e-05, + "loss": 0.4689, + "step": 2544 + }, + { + "epoch": 0.44, + "learning_rate": 1.2378903056779584e-05, + "loss": 0.4677, + "step": 2545 + }, + { + "epoch": 0.44, + "learning_rate": 1.2373451800179235e-05, + "loss": 0.4716, + "step": 2546 + }, + { + "epoch": 0.44, + "learning_rate": 1.2367999796073436e-05, + "loss": 0.4647, + "step": 2547 + }, + { + "epoch": 0.44, + "learning_rate": 1.2362547046179265e-05, + "loss": 0.4787, + "step": 2548 + }, + { + "epoch": 0.44, + "learning_rate": 1.2357093552214043e-05, + "loss": 0.4702, + "step": 2549 + }, + { + "epoch": 0.44, + "learning_rate": 1.2351639315895309e-05, + "loss": 0.4917, + "step": 2550 + }, + { + "epoch": 0.44, + "learning_rate": 1.2346184338940847e-05, + "loss": 0.4759, + "step": 2551 + }, + { + "epoch": 0.44, + "learning_rate": 1.2340728623068671e-05, + "loss": 0.4709, + "step": 2552 + }, + { + "epoch": 0.44, + "learning_rate": 1.2335272169997034e-05, + "loss": 0.4675, + "step": 2553 + }, + { + "epoch": 0.44, + "learning_rate": 1.232981498144441e-05, + "loss": 0.4636, + "step": 2554 + }, + { + "epoch": 0.44, + "learning_rate": 1.2324357059129512e-05, + "loss": 0.4759, + "step": 2555 + }, + { + "epoch": 0.44, + "learning_rate": 1.231889840477128e-05, + "loss": 0.4661, + "step": 2556 + }, + { + "epoch": 0.44, + "learning_rate": 1.2313439020088889e-05, + "loss": 0.4721, + "step": 2557 + }, + { + "epoch": 0.44, + "learning_rate": 1.2307978906801738e-05, + "loss": 0.4802, + "step": 2558 + }, + { + "epoch": 0.44, + "learning_rate": 1.2302518066629467e-05, + "loss": 0.4725, + "step": 2559 + }, + { + "epoch": 0.44, + "learning_rate": 1.2297056501291932e-05, + "loss": 0.4564, + "step": 2560 + }, + { + "epoch": 0.44, + "learning_rate": 1.2291594212509224e-05, + "loss": 0.4795, + "step": 2561 + }, + { + "epoch": 0.44, + "learning_rate": 1.2286131202001661e-05, + "loss": 0.4759, + "step": 2562 + }, + { + "epoch": 0.44, + "learning_rate": 1.2280667471489784e-05, + "loss": 0.4716, + "step": 2563 + }, + { + "epoch": 0.44, + "learning_rate": 1.2275203022694371e-05, + "loss": 0.4811, + "step": 2564 + }, + { + "epoch": 0.44, + "learning_rate": 1.2269737857336412e-05, + "loss": 0.4767, + "step": 2565 + }, + { + "epoch": 0.44, + "learning_rate": 1.2264271977137136e-05, + "loss": 0.489, + "step": 2566 + }, + { + "epoch": 0.44, + "learning_rate": 1.2258805383817992e-05, + "loss": 0.4769, + "step": 2567 + }, + { + "epoch": 0.44, + "learning_rate": 1.2253338079100652e-05, + "loss": 0.4674, + "step": 2568 + }, + { + "epoch": 0.45, + "learning_rate": 1.224787006470701e-05, + "loss": 0.4674, + "step": 2569 + }, + { + "epoch": 0.45, + "learning_rate": 1.2242401342359188e-05, + "loss": 0.4662, + "step": 2570 + }, + { + "epoch": 0.45, + "learning_rate": 1.2236931913779534e-05, + "loss": 0.4616, + "step": 2571 + }, + { + "epoch": 0.45, + "learning_rate": 1.223146178069061e-05, + "loss": 0.4749, + "step": 2572 + }, + { + "epoch": 0.45, + "learning_rate": 1.2225990944815207e-05, + "loss": 0.4771, + "step": 2573 + }, + { + "epoch": 0.45, + "learning_rate": 1.222051940787633e-05, + "loss": 0.4713, + "step": 2574 + }, + { + "epoch": 0.45, + "learning_rate": 1.2215047171597214e-05, + "loss": 0.4797, + "step": 2575 + }, + { + "epoch": 0.45, + "learning_rate": 1.2209574237701306e-05, + "loss": 0.473, + "step": 2576 + }, + { + "epoch": 0.45, + "learning_rate": 1.2204100607912277e-05, + "loss": 0.472, + "step": 2577 + }, + { + "epoch": 0.45, + "learning_rate": 1.2198626283954016e-05, + "loss": 0.4965, + "step": 2578 + }, + { + "epoch": 0.45, + "learning_rate": 1.2193151267550631e-05, + "loss": 0.4688, + "step": 2579 + }, + { + "epoch": 0.45, + "learning_rate": 1.2187675560426448e-05, + "loss": 0.4699, + "step": 2580 + }, + { + "epoch": 0.45, + "learning_rate": 1.218219916430601e-05, + "loss": 0.4662, + "step": 2581 + }, + { + "epoch": 0.45, + "learning_rate": 1.2176722080914081e-05, + "loss": 0.4808, + "step": 2582 + }, + { + "epoch": 0.45, + "learning_rate": 1.2171244311975635e-05, + "loss": 0.4722, + "step": 2583 + }, + { + "epoch": 0.45, + "learning_rate": 1.2165765859215863e-05, + "loss": 0.477, + "step": 2584 + }, + { + "epoch": 0.45, + "learning_rate": 1.2160286724360177e-05, + "loss": 0.4682, + "step": 2585 + }, + { + "epoch": 0.45, + "learning_rate": 1.2154806909134198e-05, + "loss": 0.4701, + "step": 2586 + }, + { + "epoch": 0.45, + "learning_rate": 1.2149326415263762e-05, + "loss": 0.4716, + "step": 2587 + }, + { + "epoch": 0.45, + "learning_rate": 1.2143845244474925e-05, + "loss": 0.4747, + "step": 2588 + }, + { + "epoch": 0.45, + "learning_rate": 1.2138363398493946e-05, + "loss": 0.4615, + "step": 2589 + }, + { + "epoch": 0.45, + "learning_rate": 1.2132880879047307e-05, + "loss": 0.4785, + "step": 2590 + }, + { + "epoch": 0.45, + "learning_rate": 1.212739768786169e-05, + "loss": 0.4696, + "step": 2591 + }, + { + "epoch": 0.45, + "learning_rate": 1.2121913826664001e-05, + "loss": 0.4786, + "step": 2592 + }, + { + "epoch": 0.45, + "learning_rate": 1.211642929718135e-05, + "loss": 0.4572, + "step": 2593 + }, + { + "epoch": 0.45, + "learning_rate": 1.2110944101141058e-05, + "loss": 0.4768, + "step": 2594 + }, + { + "epoch": 0.45, + "learning_rate": 1.210545824027066e-05, + "loss": 0.48, + "step": 2595 + }, + { + "epoch": 0.45, + "learning_rate": 1.2099971716297896e-05, + "loss": 0.4893, + "step": 2596 + }, + { + "epoch": 0.45, + "learning_rate": 1.2094484530950714e-05, + "loss": 0.4605, + "step": 2597 + }, + { + "epoch": 0.45, + "learning_rate": 1.2088996685957277e-05, + "loss": 0.4703, + "step": 2598 + }, + { + "epoch": 0.45, + "learning_rate": 1.2083508183045947e-05, + "loss": 0.4669, + "step": 2599 + }, + { + "epoch": 0.45, + "learning_rate": 1.2078019023945298e-05, + "loss": 0.4827, + "step": 2600 + }, + { + "epoch": 0.45, + "learning_rate": 1.2072529210384113e-05, + "loss": 0.4603, + "step": 2601 + }, + { + "epoch": 0.45, + "learning_rate": 1.2067038744091375e-05, + "loss": 0.4719, + "step": 2602 + }, + { + "epoch": 0.45, + "learning_rate": 1.2061547626796276e-05, + "loss": 0.492, + "step": 2603 + }, + { + "epoch": 0.45, + "learning_rate": 1.205605586022822e-05, + "loss": 0.4637, + "step": 2604 + }, + { + "epoch": 0.45, + "learning_rate": 1.2050563446116798e-05, + "loss": 0.4739, + "step": 2605 + }, + { + "epoch": 0.45, + "learning_rate": 1.2045070386191822e-05, + "loss": 0.4676, + "step": 2606 + }, + { + "epoch": 0.45, + "learning_rate": 1.2039576682183295e-05, + "loss": 0.4634, + "step": 2607 + }, + { + "epoch": 0.45, + "learning_rate": 1.2034082335821436e-05, + "loss": 0.4728, + "step": 2608 + }, + { + "epoch": 0.45, + "learning_rate": 1.2028587348836653e-05, + "loss": 0.491, + "step": 2609 + }, + { + "epoch": 0.45, + "learning_rate": 1.2023091722959565e-05, + "loss": 0.4605, + "step": 2610 + }, + { + "epoch": 0.45, + "learning_rate": 1.2017595459920985e-05, + "loss": 0.4839, + "step": 2611 + }, + { + "epoch": 0.45, + "learning_rate": 1.2012098561451933e-05, + "loss": 0.4614, + "step": 2612 + }, + { + "epoch": 0.45, + "learning_rate": 1.2006601029283629e-05, + "loss": 0.4768, + "step": 2613 + }, + { + "epoch": 0.45, + "learning_rate": 1.2001102865147485e-05, + "loss": 0.4662, + "step": 2614 + }, + { + "epoch": 0.45, + "learning_rate": 1.199560407077512e-05, + "loss": 0.4783, + "step": 2615 + }, + { + "epoch": 0.45, + "learning_rate": 1.1990104647898349e-05, + "loss": 0.4689, + "step": 2616 + }, + { + "epoch": 0.45, + "learning_rate": 1.1984604598249186e-05, + "loss": 0.4901, + "step": 2617 + }, + { + "epoch": 0.45, + "learning_rate": 1.1979103923559836e-05, + "loss": 0.4675, + "step": 2618 + }, + { + "epoch": 0.45, + "learning_rate": 1.1973602625562712e-05, + "loss": 0.4807, + "step": 2619 + }, + { + "epoch": 0.45, + "learning_rate": 1.1968100705990411e-05, + "loss": 0.4636, + "step": 2620 + }, + { + "epoch": 0.45, + "learning_rate": 1.1962598166575737e-05, + "loss": 0.4763, + "step": 2621 + }, + { + "epoch": 0.45, + "learning_rate": 1.1957095009051683e-05, + "loss": 0.4741, + "step": 2622 + }, + { + "epoch": 0.45, + "learning_rate": 1.1951591235151438e-05, + "loss": 0.466, + "step": 2623 + }, + { + "epoch": 0.45, + "learning_rate": 1.1946086846608383e-05, + "loss": 0.4722, + "step": 2624 + }, + { + "epoch": 0.45, + "learning_rate": 1.1940581845156097e-05, + "loss": 0.4787, + "step": 2625 + }, + { + "epoch": 0.45, + "learning_rate": 1.1935076232528348e-05, + "loss": 0.4593, + "step": 2626 + }, + { + "epoch": 0.46, + "learning_rate": 1.1929570010459096e-05, + "loss": 0.4751, + "step": 2627 + }, + { + "epoch": 0.46, + "learning_rate": 1.19240631806825e-05, + "loss": 0.4698, + "step": 2628 + }, + { + "epoch": 0.46, + "learning_rate": 1.1918555744932905e-05, + "loss": 0.4768, + "step": 2629 + }, + { + "epoch": 0.46, + "learning_rate": 1.1913047704944845e-05, + "loss": 0.477, + "step": 2630 + }, + { + "epoch": 0.46, + "learning_rate": 1.1907539062453044e-05, + "loss": 0.4555, + "step": 2631 + }, + { + "epoch": 0.46, + "learning_rate": 1.1902029819192424e-05, + "loss": 0.4759, + "step": 2632 + }, + { + "epoch": 0.46, + "learning_rate": 1.1896519976898086e-05, + "loss": 0.4771, + "step": 2633 + }, + { + "epoch": 0.46, + "learning_rate": 1.1891009537305326e-05, + "loss": 0.4672, + "step": 2634 + }, + { + "epoch": 0.46, + "learning_rate": 1.1885498502149626e-05, + "loss": 0.474, + "step": 2635 + }, + { + "epoch": 0.46, + "learning_rate": 1.187998687316666e-05, + "loss": 0.4618, + "step": 2636 + }, + { + "epoch": 0.46, + "learning_rate": 1.1874474652092279e-05, + "loss": 0.474, + "step": 2637 + }, + { + "epoch": 0.46, + "learning_rate": 1.1868961840662525e-05, + "loss": 0.4633, + "step": 2638 + }, + { + "epoch": 0.46, + "learning_rate": 1.1863448440613634e-05, + "loss": 0.4843, + "step": 2639 + }, + { + "epoch": 0.46, + "learning_rate": 1.1857934453682016e-05, + "loss": 0.478, + "step": 2640 + }, + { + "epoch": 0.46, + "learning_rate": 1.1852419881604276e-05, + "loss": 0.475, + "step": 2641 + }, + { + "epoch": 0.46, + "learning_rate": 1.1846904726117187e-05, + "loss": 0.4687, + "step": 2642 + }, + { + "epoch": 0.46, + "learning_rate": 1.1841388988957728e-05, + "loss": 0.4864, + "step": 2643 + }, + { + "epoch": 0.46, + "learning_rate": 1.1835872671863042e-05, + "loss": 0.4612, + "step": 2644 + }, + { + "epoch": 0.46, + "learning_rate": 1.183035577657047e-05, + "loss": 0.4755, + "step": 2645 + }, + { + "epoch": 0.46, + "learning_rate": 1.1824838304817521e-05, + "loss": 0.4553, + "step": 2646 + }, + { + "epoch": 0.46, + "learning_rate": 1.1819320258341891e-05, + "loss": 0.494, + "step": 2647 + }, + { + "epoch": 0.46, + "learning_rate": 1.1813801638881466e-05, + "loss": 0.4708, + "step": 2648 + }, + { + "epoch": 0.46, + "learning_rate": 1.1808282448174295e-05, + "loss": 0.479, + "step": 2649 + }, + { + "epoch": 0.46, + "learning_rate": 1.1802762687958624e-05, + "loss": 0.4585, + "step": 2650 + }, + { + "epoch": 0.46, + "learning_rate": 1.1797242359972868e-05, + "loss": 0.4832, + "step": 2651 + }, + { + "epoch": 0.46, + "learning_rate": 1.1791721465955621e-05, + "loss": 0.4615, + "step": 2652 + }, + { + "epoch": 0.46, + "learning_rate": 1.1786200007645662e-05, + "loss": 0.4823, + "step": 2653 + }, + { + "epoch": 0.46, + "learning_rate": 1.1780677986781943e-05, + "loss": 0.4647, + "step": 2654 + }, + { + "epoch": 0.46, + "learning_rate": 1.177515540510359e-05, + "loss": 0.4725, + "step": 2655 + }, + { + "epoch": 0.46, + "learning_rate": 1.1769632264349914e-05, + "loss": 0.461, + "step": 2656 + }, + { + "epoch": 0.46, + "learning_rate": 1.1764108566260392e-05, + "loss": 0.478, + "step": 2657 + }, + { + "epoch": 0.46, + "learning_rate": 1.1758584312574693e-05, + "loss": 0.4814, + "step": 2658 + }, + { + "epoch": 0.46, + "learning_rate": 1.1753059505032636e-05, + "loss": 0.4764, + "step": 2659 + }, + { + "epoch": 0.46, + "learning_rate": 1.1747534145374236e-05, + "loss": 0.4802, + "step": 2660 + }, + { + "epoch": 0.46, + "learning_rate": 1.1742008235339677e-05, + "loss": 0.4746, + "step": 2661 + }, + { + "epoch": 0.46, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.465, + "step": 2662 + }, + { + "epoch": 0.46, + "learning_rate": 1.1730954771103653e-05, + "loss": 0.4773, + "step": 2663 + }, + { + "epoch": 0.46, + "learning_rate": 1.1725427220383421e-05, + "loss": 0.482, + "step": 2664 + }, + { + "epoch": 0.46, + "learning_rate": 1.1719899126249482e-05, + "loss": 0.4719, + "step": 2665 + }, + { + "epoch": 0.46, + "learning_rate": 1.1714370490442872e-05, + "loss": 0.4558, + "step": 2666 + }, + { + "epoch": 0.46, + "learning_rate": 1.1708841314704811e-05, + "loss": 0.4749, + "step": 2667 + }, + { + "epoch": 0.46, + "learning_rate": 1.1703311600776677e-05, + "loss": 0.4726, + "step": 2668 + }, + { + "epoch": 0.46, + "learning_rate": 1.1697781350400025e-05, + "loss": 0.4765, + "step": 2669 + }, + { + "epoch": 0.46, + "learning_rate": 1.1692250565316577e-05, + "loss": 0.4621, + "step": 2670 + }, + { + "epoch": 0.46, + "learning_rate": 1.1686719247268221e-05, + "loss": 0.477, + "step": 2671 + }, + { + "epoch": 0.46, + "learning_rate": 1.1681187397997018e-05, + "loss": 0.4871, + "step": 2672 + }, + { + "epoch": 0.46, + "learning_rate": 1.1675655019245191e-05, + "loss": 0.4632, + "step": 2673 + }, + { + "epoch": 0.46, + "learning_rate": 1.1670122112755134e-05, + "loss": 0.4695, + "step": 2674 + }, + { + "epoch": 0.46, + "learning_rate": 1.1664588680269403e-05, + "loss": 0.4929, + "step": 2675 + }, + { + "epoch": 0.46, + "learning_rate": 1.1659054723530721e-05, + "loss": 0.4679, + "step": 2676 + }, + { + "epoch": 0.46, + "learning_rate": 1.1653520244281975e-05, + "loss": 0.4802, + "step": 2677 + }, + { + "epoch": 0.46, + "learning_rate": 1.1647985244266226e-05, + "loss": 0.4704, + "step": 2678 + }, + { + "epoch": 0.46, + "learning_rate": 1.1642449725226685e-05, + "loss": 0.4741, + "step": 2679 + }, + { + "epoch": 0.46, + "learning_rate": 1.1636913688906739e-05, + "loss": 0.4736, + "step": 2680 + }, + { + "epoch": 0.46, + "learning_rate": 1.1631377137049925e-05, + "loss": 0.4716, + "step": 2681 + }, + { + "epoch": 0.46, + "learning_rate": 1.1625840071399952e-05, + "loss": 0.4609, + "step": 2682 + }, + { + "epoch": 0.46, + "learning_rate": 1.1620302493700689e-05, + "loss": 0.479, + "step": 2683 + }, + { + "epoch": 0.46, + "learning_rate": 1.1614764405696162e-05, + "loss": 0.4727, + "step": 2684 + }, + { + "epoch": 0.47, + "learning_rate": 1.1609225809130566e-05, + "loss": 0.4828, + "step": 2685 + }, + { + "epoch": 0.47, + "learning_rate": 1.1603686705748247e-05, + "loss": 0.4744, + "step": 2686 + }, + { + "epoch": 0.47, + "learning_rate": 1.1598147097293721e-05, + "loss": 0.4803, + "step": 2687 + }, + { + "epoch": 0.47, + "learning_rate": 1.1592606985511648e-05, + "loss": 0.4444, + "step": 2688 + }, + { + "epoch": 0.47, + "learning_rate": 1.1587066372146863e-05, + "loss": 0.4755, + "step": 2689 + }, + { + "epoch": 0.47, + "learning_rate": 1.1581525258944346e-05, + "loss": 0.4771, + "step": 2690 + }, + { + "epoch": 0.47, + "learning_rate": 1.1575983647649243e-05, + "loss": 0.4714, + "step": 2691 + }, + { + "epoch": 0.47, + "learning_rate": 1.1570441540006849e-05, + "loss": 0.4707, + "step": 2692 + }, + { + "epoch": 0.47, + "learning_rate": 1.1564898937762627e-05, + "loss": 0.4846, + "step": 2693 + }, + { + "epoch": 0.47, + "learning_rate": 1.1559355842662188e-05, + "loss": 0.4755, + "step": 2694 + }, + { + "epoch": 0.47, + "learning_rate": 1.155381225645129e-05, + "loss": 0.4822, + "step": 2695 + }, + { + "epoch": 0.47, + "learning_rate": 1.1548268180875868e-05, + "loss": 0.4682, + "step": 2696 + }, + { + "epoch": 0.47, + "learning_rate": 1.1542723617681989e-05, + "loss": 0.4668, + "step": 2697 + }, + { + "epoch": 0.47, + "learning_rate": 1.1537178568615879e-05, + "loss": 0.4676, + "step": 2698 + }, + { + "epoch": 0.47, + "learning_rate": 1.1531633035423931e-05, + "loss": 0.4818, + "step": 2699 + }, + { + "epoch": 0.47, + "learning_rate": 1.1526087019852673e-05, + "loss": 0.4701, + "step": 2700 + }, + { + "epoch": 0.47, + "learning_rate": 1.152054052364879e-05, + "loss": 0.4807, + "step": 2701 + }, + { + "epoch": 0.47, + "learning_rate": 1.151499354855913e-05, + "loss": 0.4669, + "step": 2702 + }, + { + "epoch": 0.47, + "learning_rate": 1.150944609633067e-05, + "loss": 0.469, + "step": 2703 + }, + { + "epoch": 0.47, + "learning_rate": 1.1503898168710555e-05, + "loss": 0.4634, + "step": 2704 + }, + { + "epoch": 0.47, + "learning_rate": 1.1498349767446072e-05, + "loss": 0.4728, + "step": 2705 + }, + { + "epoch": 0.47, + "learning_rate": 1.1492800894284664e-05, + "loss": 0.4676, + "step": 2706 + }, + { + "epoch": 0.47, + "learning_rate": 1.1487251550973914e-05, + "loss": 0.4762, + "step": 2707 + }, + { + "epoch": 0.47, + "learning_rate": 1.1481701739261557e-05, + "loss": 0.4664, + "step": 2708 + }, + { + "epoch": 0.47, + "learning_rate": 1.1476151460895476e-05, + "loss": 0.4752, + "step": 2709 + }, + { + "epoch": 0.47, + "learning_rate": 1.1470600717623699e-05, + "loss": 0.4611, + "step": 2710 + }, + { + "epoch": 0.47, + "learning_rate": 1.1465049511194404e-05, + "loss": 0.4794, + "step": 2711 + }, + { + "epoch": 0.47, + "learning_rate": 1.1459497843355907e-05, + "loss": 0.473, + "step": 2712 + }, + { + "epoch": 0.47, + "learning_rate": 1.1453945715856682e-05, + "loss": 0.4751, + "step": 2713 + }, + { + "epoch": 0.47, + "learning_rate": 1.1448393130445337e-05, + "loss": 0.4613, + "step": 2714 + }, + { + "epoch": 0.47, + "learning_rate": 1.1442840088870628e-05, + "loss": 0.4664, + "step": 2715 + }, + { + "epoch": 0.47, + "learning_rate": 1.1437286592881458e-05, + "loss": 0.4638, + "step": 2716 + }, + { + "epoch": 0.47, + "learning_rate": 1.1431732644226865e-05, + "loss": 0.4743, + "step": 2717 + }, + { + "epoch": 0.47, + "learning_rate": 1.1426178244656038e-05, + "loss": 0.4629, + "step": 2718 + }, + { + "epoch": 0.47, + "learning_rate": 1.1420623395918297e-05, + "loss": 0.4804, + "step": 2719 + }, + { + "epoch": 0.47, + "learning_rate": 1.1415068099763123e-05, + "loss": 0.4752, + "step": 2720 + }, + { + "epoch": 0.47, + "learning_rate": 1.1409512357940114e-05, + "loss": 0.4806, + "step": 2721 + }, + { + "epoch": 0.47, + "learning_rate": 1.140395617219903e-05, + "loss": 0.46, + "step": 2722 + }, + { + "epoch": 0.47, + "learning_rate": 1.1398399544289751e-05, + "loss": 0.4777, + "step": 2723 + }, + { + "epoch": 0.47, + "learning_rate": 1.1392842475962311e-05, + "loss": 0.4726, + "step": 2724 + }, + { + "epoch": 0.47, + "learning_rate": 1.1387284968966879e-05, + "loss": 0.4657, + "step": 2725 + }, + { + "epoch": 0.47, + "learning_rate": 1.1381727025053758e-05, + "loss": 0.4499, + "step": 2726 + }, + { + "epoch": 0.47, + "learning_rate": 1.1376168645973393e-05, + "loss": 0.4751, + "step": 2727 + }, + { + "epoch": 0.47, + "learning_rate": 1.1370609833476365e-05, + "loss": 0.46, + "step": 2728 + }, + { + "epoch": 0.47, + "learning_rate": 1.136505058931339e-05, + "loss": 0.47, + "step": 2729 + }, + { + "epoch": 0.47, + "learning_rate": 1.1359490915235323e-05, + "loss": 0.462, + "step": 2730 + }, + { + "epoch": 0.47, + "learning_rate": 1.135393081299315e-05, + "loss": 0.4797, + "step": 2731 + }, + { + "epoch": 0.47, + "learning_rate": 1.1348370284337996e-05, + "loss": 0.4655, + "step": 2732 + }, + { + "epoch": 0.47, + "learning_rate": 1.1342809331021117e-05, + "loss": 0.4653, + "step": 2733 + }, + { + "epoch": 0.47, + "learning_rate": 1.1337247954793904e-05, + "loss": 0.4585, + "step": 2734 + }, + { + "epoch": 0.47, + "learning_rate": 1.1331686157407887e-05, + "loss": 0.4755, + "step": 2735 + }, + { + "epoch": 0.47, + "learning_rate": 1.1326123940614715e-05, + "loss": 0.4748, + "step": 2736 + }, + { + "epoch": 0.47, + "learning_rate": 1.1320561306166182e-05, + "loss": 0.4718, + "step": 2737 + }, + { + "epoch": 0.47, + "learning_rate": 1.131499825581421e-05, + "loss": 0.4608, + "step": 2738 + }, + { + "epoch": 0.47, + "learning_rate": 1.1309434791310848e-05, + "loss": 0.4723, + "step": 2739 + }, + { + "epoch": 0.47, + "learning_rate": 1.1303870914408277e-05, + "loss": 0.4694, + "step": 2740 + }, + { + "epoch": 0.47, + "learning_rate": 1.1298306626858811e-05, + "loss": 0.4756, + "step": 2741 + }, + { + "epoch": 0.48, + "learning_rate": 1.1292741930414894e-05, + "loss": 0.471, + "step": 2742 + }, + { + "epoch": 0.48, + "learning_rate": 1.128717682682909e-05, + "loss": 0.4806, + "step": 2743 + }, + { + "epoch": 0.48, + "learning_rate": 1.1281611317854107e-05, + "loss": 0.4507, + "step": 2744 + }, + { + "epoch": 0.48, + "learning_rate": 1.1276045405242761e-05, + "loss": 0.4736, + "step": 2745 + }, + { + "epoch": 0.48, + "learning_rate": 1.127047909074801e-05, + "loss": 0.462, + "step": 2746 + }, + { + "epoch": 0.48, + "learning_rate": 1.1264912376122931e-05, + "loss": 0.471, + "step": 2747 + }, + { + "epoch": 0.48, + "learning_rate": 1.1259345263120738e-05, + "loss": 0.4628, + "step": 2748 + }, + { + "epoch": 0.48, + "learning_rate": 1.1253777753494753e-05, + "loss": 0.4649, + "step": 2749 + }, + { + "epoch": 0.48, + "learning_rate": 1.1248209848998433e-05, + "loss": 0.4619, + "step": 2750 + }, + { + "epoch": 0.48, + "learning_rate": 1.1242641551385365e-05, + "loss": 0.4657, + "step": 2751 + }, + { + "epoch": 0.48, + "learning_rate": 1.1237072862409248e-05, + "loss": 0.4603, + "step": 2752 + }, + { + "epoch": 0.48, + "learning_rate": 1.1231503783823914e-05, + "loss": 0.4749, + "step": 2753 + }, + { + "epoch": 0.48, + "learning_rate": 1.1225934317383305e-05, + "loss": 0.464, + "step": 2754 + }, + { + "epoch": 0.48, + "learning_rate": 1.1220364464841502e-05, + "loss": 0.4737, + "step": 2755 + }, + { + "epoch": 0.48, + "learning_rate": 1.1214794227952694e-05, + "loss": 0.4646, + "step": 2756 + }, + { + "epoch": 0.48, + "learning_rate": 1.1209223608471202e-05, + "loss": 0.4833, + "step": 2757 + }, + { + "epoch": 0.48, + "learning_rate": 1.1203652608151456e-05, + "loss": 0.4797, + "step": 2758 + }, + { + "epoch": 0.48, + "learning_rate": 1.1198081228748012e-05, + "loss": 0.4794, + "step": 2759 + }, + { + "epoch": 0.48, + "learning_rate": 1.1192509472015549e-05, + "loss": 0.473, + "step": 2760 + }, + { + "epoch": 0.48, + "learning_rate": 1.1186937339708856e-05, + "loss": 0.4608, + "step": 2761 + }, + { + "epoch": 0.48, + "learning_rate": 1.1181364833582848e-05, + "loss": 0.472, + "step": 2762 + }, + { + "epoch": 0.48, + "learning_rate": 1.1175791955392552e-05, + "loss": 0.477, + "step": 2763 + }, + { + "epoch": 0.48, + "learning_rate": 1.1170218706893121e-05, + "loss": 0.4608, + "step": 2764 + }, + { + "epoch": 0.48, + "learning_rate": 1.1164645089839812e-05, + "loss": 0.4727, + "step": 2765 + }, + { + "epoch": 0.48, + "learning_rate": 1.1159071105988012e-05, + "loss": 0.4593, + "step": 2766 + }, + { + "epoch": 0.48, + "learning_rate": 1.1153496757093205e-05, + "loss": 0.4681, + "step": 2767 + }, + { + "epoch": 0.48, + "learning_rate": 1.114792204491101e-05, + "loss": 0.4665, + "step": 2768 + }, + { + "epoch": 0.48, + "learning_rate": 1.1142346971197151e-05, + "loss": 0.4654, + "step": 2769 + }, + { + "epoch": 0.48, + "learning_rate": 1.1136771537707464e-05, + "loss": 0.4698, + "step": 2770 + }, + { + "epoch": 0.48, + "learning_rate": 1.1131195746197902e-05, + "loss": 0.4699, + "step": 2771 + }, + { + "epoch": 0.48, + "learning_rate": 1.1125619598424528e-05, + "loss": 0.482, + "step": 2772 + }, + { + "epoch": 0.48, + "learning_rate": 1.1120043096143523e-05, + "loss": 0.48, + "step": 2773 + }, + { + "epoch": 0.48, + "learning_rate": 1.1114466241111168e-05, + "loss": 0.4747, + "step": 2774 + }, + { + "epoch": 0.48, + "learning_rate": 1.110888903508387e-05, + "loss": 0.4621, + "step": 2775 + }, + { + "epoch": 0.48, + "learning_rate": 1.1103311479818133e-05, + "loss": 0.4847, + "step": 2776 + }, + { + "epoch": 0.48, + "learning_rate": 1.1097733577070585e-05, + "loss": 0.4682, + "step": 2777 + }, + { + "epoch": 0.48, + "learning_rate": 1.1092155328597945e-05, + "loss": 0.4803, + "step": 2778 + }, + { + "epoch": 0.48, + "learning_rate": 1.108657673615706e-05, + "loss": 0.4698, + "step": 2779 + }, + { + "epoch": 0.48, + "learning_rate": 1.1080997801504872e-05, + "loss": 0.4759, + "step": 2780 + }, + { + "epoch": 0.48, + "learning_rate": 1.1075418526398435e-05, + "loss": 0.4704, + "step": 2781 + }, + { + "epoch": 0.48, + "learning_rate": 1.1069838912594914e-05, + "loss": 0.4643, + "step": 2782 + }, + { + "epoch": 0.48, + "learning_rate": 1.1064258961851575e-05, + "loss": 0.4732, + "step": 2783 + }, + { + "epoch": 0.48, + "learning_rate": 1.1058678675925796e-05, + "loss": 0.4784, + "step": 2784 + }, + { + "epoch": 0.48, + "learning_rate": 1.105309805657505e-05, + "loss": 0.4694, + "step": 2785 + }, + { + "epoch": 0.48, + "learning_rate": 1.1047517105556933e-05, + "loss": 0.4724, + "step": 2786 + }, + { + "epoch": 0.48, + "learning_rate": 1.1041935824629121e-05, + "loss": 0.4654, + "step": 2787 + }, + { + "epoch": 0.48, + "learning_rate": 1.1036354215549422e-05, + "loss": 0.4745, + "step": 2788 + }, + { + "epoch": 0.48, + "learning_rate": 1.1030772280075714e-05, + "loss": 0.4658, + "step": 2789 + }, + { + "epoch": 0.48, + "learning_rate": 1.1025190019966017e-05, + "loss": 0.4662, + "step": 2790 + }, + { + "epoch": 0.48, + "learning_rate": 1.1019607436978419e-05, + "loss": 0.461, + "step": 2791 + }, + { + "epoch": 0.48, + "learning_rate": 1.1014024532871128e-05, + "loss": 0.473, + "step": 2792 + }, + { + "epoch": 0.48, + "learning_rate": 1.1008441309402448e-05, + "loss": 0.4741, + "step": 2793 + }, + { + "epoch": 0.48, + "learning_rate": 1.1002857768330786e-05, + "loss": 0.4655, + "step": 2794 + }, + { + "epoch": 0.48, + "learning_rate": 1.0997273911414648e-05, + "loss": 0.4662, + "step": 2795 + }, + { + "epoch": 0.48, + "learning_rate": 1.099168974041263e-05, + "loss": 0.4602, + "step": 2796 + }, + { + "epoch": 0.48, + "learning_rate": 1.0986105257083446e-05, + "loss": 0.4546, + "step": 2797 + }, + { + "epoch": 0.48, + "learning_rate": 1.0980520463185894e-05, + "loss": 0.4846, + "step": 2798 + }, + { + "epoch": 0.48, + "learning_rate": 1.0974935360478875e-05, + "loss": 0.4559, + "step": 2799 + }, + { + "epoch": 0.49, + "learning_rate": 1.0969349950721382e-05, + "loss": 0.4704, + "step": 2800 + }, + { + "epoch": 0.49, + "learning_rate": 1.0963764235672516e-05, + "loss": 0.4658, + "step": 2801 + }, + { + "epoch": 0.49, + "learning_rate": 1.0958178217091455e-05, + "loss": 0.4947, + "step": 2802 + }, + { + "epoch": 0.49, + "learning_rate": 1.0952591896737499e-05, + "loss": 0.4545, + "step": 2803 + }, + { + "epoch": 0.49, + "learning_rate": 1.094700527637002e-05, + "loss": 0.4725, + "step": 2804 + }, + { + "epoch": 0.49, + "learning_rate": 1.0941418357748493e-05, + "loss": 0.4617, + "step": 2805 + }, + { + "epoch": 0.49, + "learning_rate": 1.0935831142632489e-05, + "loss": 0.4739, + "step": 2806 + }, + { + "epoch": 0.49, + "learning_rate": 1.0930243632781669e-05, + "loss": 0.4565, + "step": 2807 + }, + { + "epoch": 0.49, + "learning_rate": 1.0924655829955793e-05, + "loss": 0.472, + "step": 2808 + }, + { + "epoch": 0.49, + "learning_rate": 1.09190677359147e-05, + "loss": 0.4593, + "step": 2809 + }, + { + "epoch": 0.49, + "learning_rate": 1.0913479352418336e-05, + "loss": 0.4592, + "step": 2810 + }, + { + "epoch": 0.49, + "learning_rate": 1.0907890681226728e-05, + "loss": 0.474, + "step": 2811 + }, + { + "epoch": 0.49, + "learning_rate": 1.09023017241e-05, + "loss": 0.4666, + "step": 2812 + }, + { + "epoch": 0.49, + "learning_rate": 1.0896712482798358e-05, + "loss": 0.4696, + "step": 2813 + }, + { + "epoch": 0.49, + "learning_rate": 1.0891122959082108e-05, + "loss": 0.4647, + "step": 2814 + }, + { + "epoch": 0.49, + "learning_rate": 1.0885533154711633e-05, + "loss": 0.4632, + "step": 2815 + }, + { + "epoch": 0.49, + "learning_rate": 1.0879943071447417e-05, + "loss": 0.4635, + "step": 2816 + }, + { + "epoch": 0.49, + "learning_rate": 1.087435271105002e-05, + "loss": 0.4699, + "step": 2817 + }, + { + "epoch": 0.49, + "learning_rate": 1.0868762075280102e-05, + "loss": 0.4664, + "step": 2818 + }, + { + "epoch": 0.49, + "learning_rate": 1.0863171165898398e-05, + "loss": 0.4585, + "step": 2819 + }, + { + "epoch": 0.49, + "learning_rate": 1.0857579984665733e-05, + "loss": 0.4537, + "step": 2820 + }, + { + "epoch": 0.49, + "learning_rate": 1.0851988533343022e-05, + "loss": 0.4682, + "step": 2821 + }, + { + "epoch": 0.49, + "learning_rate": 1.0846396813691258e-05, + "loss": 0.4823, + "step": 2822 + }, + { + "epoch": 0.49, + "learning_rate": 1.0840804827471523e-05, + "loss": 0.4675, + "step": 2823 + }, + { + "epoch": 0.49, + "learning_rate": 1.0835212576444983e-05, + "loss": 0.4673, + "step": 2824 + }, + { + "epoch": 0.49, + "learning_rate": 1.0829620062372887e-05, + "loss": 0.4716, + "step": 2825 + }, + { + "epoch": 0.49, + "learning_rate": 1.0824027287016566e-05, + "loss": 0.4714, + "step": 2826 + }, + { + "epoch": 0.49, + "learning_rate": 1.0818434252137428e-05, + "loss": 0.4608, + "step": 2827 + }, + { + "epoch": 0.49, + "learning_rate": 1.0812840959496978e-05, + "loss": 0.4751, + "step": 2828 + }, + { + "epoch": 0.49, + "learning_rate": 1.0807247410856783e-05, + "loss": 0.4667, + "step": 2829 + }, + { + "epoch": 0.49, + "learning_rate": 1.0801653607978506e-05, + "loss": 0.4661, + "step": 2830 + }, + { + "epoch": 0.49, + "learning_rate": 1.0796059552623881e-05, + "loss": 0.4628, + "step": 2831 + }, + { + "epoch": 0.49, + "learning_rate": 1.0790465246554728e-05, + "loss": 0.48, + "step": 2832 + }, + { + "epoch": 0.49, + "learning_rate": 1.078487069153294e-05, + "loss": 0.4635, + "step": 2833 + }, + { + "epoch": 0.49, + "learning_rate": 1.0779275889320495e-05, + "loss": 0.4781, + "step": 2834 + }, + { + "epoch": 0.49, + "learning_rate": 1.0773680841679437e-05, + "loss": 0.4668, + "step": 2835 + }, + { + "epoch": 0.49, + "learning_rate": 1.0768085550371902e-05, + "loss": 0.4714, + "step": 2836 + }, + { + "epoch": 0.49, + "learning_rate": 1.076249001716009e-05, + "loss": 0.459, + "step": 2837 + }, + { + "epoch": 0.49, + "learning_rate": 1.0756894243806291e-05, + "loss": 0.4767, + "step": 2838 + }, + { + "epoch": 0.49, + "learning_rate": 1.0751298232072856e-05, + "loss": 0.4708, + "step": 2839 + }, + { + "epoch": 0.49, + "learning_rate": 1.0745701983722219e-05, + "loss": 0.4797, + "step": 2840 + }, + { + "epoch": 0.49, + "learning_rate": 1.0740105500516889e-05, + "loss": 0.4657, + "step": 2841 + }, + { + "epoch": 0.49, + "learning_rate": 1.0734508784219446e-05, + "loss": 0.4839, + "step": 2842 + }, + { + "epoch": 0.49, + "learning_rate": 1.0728911836592548e-05, + "loss": 0.4604, + "step": 2843 + }, + { + "epoch": 0.49, + "learning_rate": 1.0723314659398916e-05, + "loss": 0.4856, + "step": 2844 + }, + { + "epoch": 0.49, + "learning_rate": 1.0717717254401356e-05, + "loss": 0.4612, + "step": 2845 + }, + { + "epoch": 0.49, + "learning_rate": 1.0712119623362738e-05, + "loss": 0.4742, + "step": 2846 + }, + { + "epoch": 0.49, + "learning_rate": 1.0706521768046006e-05, + "loss": 0.4681, + "step": 2847 + }, + { + "epoch": 0.49, + "learning_rate": 1.0700923690214166e-05, + "loss": 0.4755, + "step": 2848 + }, + { + "epoch": 0.49, + "learning_rate": 1.0695325391630309e-05, + "loss": 0.4642, + "step": 2849 + }, + { + "epoch": 0.49, + "learning_rate": 1.0689726874057589e-05, + "loss": 0.4708, + "step": 2850 + }, + { + "epoch": 0.49, + "learning_rate": 1.068412813925922e-05, + "loss": 0.4728, + "step": 2851 + }, + { + "epoch": 0.49, + "learning_rate": 1.06785291889985e-05, + "loss": 0.4745, + "step": 2852 + }, + { + "epoch": 0.49, + "learning_rate": 1.0672930025038783e-05, + "loss": 0.4644, + "step": 2853 + }, + { + "epoch": 0.49, + "learning_rate": 1.0667330649143498e-05, + "loss": 0.4762, + "step": 2854 + }, + { + "epoch": 0.49, + "learning_rate": 1.0661731063076134e-05, + "loss": 0.4733, + "step": 2855 + }, + { + "epoch": 0.49, + "learning_rate": 1.0656131268600254e-05, + "loss": 0.4657, + "step": 2856 + }, + { + "epoch": 0.49, + "learning_rate": 1.0650531267479477e-05, + "loss": 0.4802, + "step": 2857 + }, + { + "epoch": 0.5, + "learning_rate": 1.0644931061477492e-05, + "loss": 0.4758, + "step": 2858 + }, + { + "epoch": 0.5, + "learning_rate": 1.0639330652358058e-05, + "loss": 0.471, + "step": 2859 + }, + { + "epoch": 0.5, + "learning_rate": 1.0633730041884988e-05, + "loss": 0.4702, + "step": 2860 + }, + { + "epoch": 0.5, + "learning_rate": 1.0628129231822166e-05, + "loss": 0.4673, + "step": 2861 + }, + { + "epoch": 0.5, + "learning_rate": 1.0622528223933534e-05, + "loss": 0.4733, + "step": 2862 + }, + { + "epoch": 0.5, + "learning_rate": 1.06169270199831e-05, + "loss": 0.4704, + "step": 2863 + }, + { + "epoch": 0.5, + "learning_rate": 1.061132562173493e-05, + "loss": 0.4627, + "step": 2864 + }, + { + "epoch": 0.5, + "learning_rate": 1.0605724030953155e-05, + "loss": 0.4642, + "step": 2865 + }, + { + "epoch": 0.5, + "learning_rate": 1.0600122249401965e-05, + "loss": 0.4826, + "step": 2866 + }, + { + "epoch": 0.5, + "learning_rate": 1.059452027884561e-05, + "loss": 0.4649, + "step": 2867 + }, + { + "epoch": 0.5, + "learning_rate": 1.0588918121048396e-05, + "loss": 0.473, + "step": 2868 + }, + { + "epoch": 0.5, + "learning_rate": 1.0583315777774697e-05, + "loss": 0.4647, + "step": 2869 + }, + { + "epoch": 0.5, + "learning_rate": 1.0577713250788935e-05, + "loss": 0.4806, + "step": 2870 + }, + { + "epoch": 0.5, + "learning_rate": 1.0572110541855596e-05, + "loss": 0.4572, + "step": 2871 + }, + { + "epoch": 0.5, + "learning_rate": 1.0566507652739224e-05, + "loss": 0.4676, + "step": 2872 + }, + { + "epoch": 0.5, + "learning_rate": 1.056090458520442e-05, + "loss": 0.4635, + "step": 2873 + }, + { + "epoch": 0.5, + "learning_rate": 1.0555301341015832e-05, + "loss": 0.4789, + "step": 2874 + }, + { + "epoch": 0.5, + "learning_rate": 1.0549697921938172e-05, + "loss": 0.4621, + "step": 2875 + }, + { + "epoch": 0.5, + "learning_rate": 1.0544094329736213e-05, + "loss": 0.4799, + "step": 2876 + }, + { + "epoch": 0.5, + "learning_rate": 1.0538490566174766e-05, + "loss": 0.4554, + "step": 2877 + }, + { + "epoch": 0.5, + "learning_rate": 1.0532886633018711e-05, + "loss": 0.478, + "step": 2878 + }, + { + "epoch": 0.5, + "learning_rate": 1.052728253203297e-05, + "loss": 0.4675, + "step": 2879 + }, + { + "epoch": 0.5, + "learning_rate": 1.0521678264982534e-05, + "loss": 0.4751, + "step": 2880 + }, + { + "epoch": 0.5, + "learning_rate": 1.0516073833632424e-05, + "loss": 0.4759, + "step": 2881 + }, + { + "epoch": 0.5, + "learning_rate": 1.0510469239747731e-05, + "loss": 0.4729, + "step": 2882 + }, + { + "epoch": 0.5, + "learning_rate": 1.0504864485093588e-05, + "loss": 0.4586, + "step": 2883 + }, + { + "epoch": 0.5, + "learning_rate": 1.0499259571435185e-05, + "loss": 0.4834, + "step": 2884 + }, + { + "epoch": 0.5, + "learning_rate": 1.0493654500537756e-05, + "loss": 0.4687, + "step": 2885 + }, + { + "epoch": 0.5, + "learning_rate": 1.0488049274166583e-05, + "loss": 0.458, + "step": 2886 + }, + { + "epoch": 0.5, + "learning_rate": 1.0482443894087007e-05, + "loss": 0.4751, + "step": 2887 + }, + { + "epoch": 0.5, + "learning_rate": 1.0476838362064408e-05, + "loss": 0.4803, + "step": 2888 + }, + { + "epoch": 0.5, + "learning_rate": 1.047123267986422e-05, + "loss": 0.4636, + "step": 2889 + }, + { + "epoch": 0.5, + "learning_rate": 1.0465626849251919e-05, + "loss": 0.4917, + "step": 2890 + }, + { + "epoch": 0.5, + "learning_rate": 1.046002087199303e-05, + "loss": 0.4683, + "step": 2891 + }, + { + "epoch": 0.5, + "learning_rate": 1.0454414749853126e-05, + "loss": 0.4804, + "step": 2892 + }, + { + "epoch": 0.5, + "learning_rate": 1.0448808484597821e-05, + "loss": 0.4634, + "step": 2893 + }, + { + "epoch": 0.5, + "learning_rate": 1.044320207799278e-05, + "loss": 0.4697, + "step": 2894 + }, + { + "epoch": 0.5, + "learning_rate": 1.0437595531803713e-05, + "loss": 0.4658, + "step": 2895 + }, + { + "epoch": 0.5, + "learning_rate": 1.0431988847796361e-05, + "loss": 0.4785, + "step": 2896 + }, + { + "epoch": 0.5, + "learning_rate": 1.0426382027736524e-05, + "loss": 0.4607, + "step": 2897 + }, + { + "epoch": 0.5, + "learning_rate": 1.042077507339004e-05, + "loss": 0.4539, + "step": 2898 + }, + { + "epoch": 0.5, + "learning_rate": 1.0415167986522785e-05, + "loss": 0.4563, + "step": 2899 + }, + { + "epoch": 0.5, + "learning_rate": 1.040956076890068e-05, + "loss": 0.4754, + "step": 2900 + }, + { + "epoch": 0.5, + "learning_rate": 1.0403953422289687e-05, + "loss": 0.4598, + "step": 2901 + }, + { + "epoch": 0.5, + "learning_rate": 1.0398345948455815e-05, + "loss": 0.4637, + "step": 2902 + }, + { + "epoch": 0.5, + "learning_rate": 1.0392738349165097e-05, + "loss": 0.4655, + "step": 2903 + }, + { + "epoch": 0.5, + "learning_rate": 1.038713062618362e-05, + "loss": 0.4774, + "step": 2904 + }, + { + "epoch": 0.5, + "learning_rate": 1.0381522781277506e-05, + "loss": 0.4659, + "step": 2905 + }, + { + "epoch": 0.5, + "learning_rate": 1.0375914816212913e-05, + "loss": 0.4837, + "step": 2906 + }, + { + "epoch": 0.5, + "learning_rate": 1.0370306732756037e-05, + "loss": 0.4699, + "step": 2907 + }, + { + "epoch": 0.5, + "learning_rate": 1.0364698532673117e-05, + "loss": 0.4766, + "step": 2908 + }, + { + "epoch": 0.5, + "learning_rate": 1.035909021773042e-05, + "loss": 0.4821, + "step": 2909 + }, + { + "epoch": 0.5, + "learning_rate": 1.0353481789694258e-05, + "loss": 0.4764, + "step": 2910 + }, + { + "epoch": 0.5, + "learning_rate": 1.0347873250330971e-05, + "loss": 0.4531, + "step": 2911 + }, + { + "epoch": 0.5, + "learning_rate": 1.0342264601406936e-05, + "loss": 0.4722, + "step": 2912 + }, + { + "epoch": 0.5, + "learning_rate": 1.0336655844688571e-05, + "loss": 0.4608, + "step": 2913 + }, + { + "epoch": 0.5, + "learning_rate": 1.0331046981942311e-05, + "loss": 0.4781, + "step": 2914 + }, + { + "epoch": 0.5, + "learning_rate": 1.0325438014934655e-05, + "loss": 0.46, + "step": 2915 + }, + { + "epoch": 0.51, + "learning_rate": 1.03198289454321e-05, + "loss": 0.4754, + "step": 2916 + }, + { + "epoch": 0.51, + "learning_rate": 1.0314219775201198e-05, + "loss": 0.4663, + "step": 2917 + }, + { + "epoch": 0.51, + "learning_rate": 1.0308610506008527e-05, + "loss": 0.4763, + "step": 2918 + }, + { + "epoch": 0.51, + "learning_rate": 1.030300113962069e-05, + "loss": 0.4618, + "step": 2919 + }, + { + "epoch": 0.51, + "learning_rate": 1.029739167780433e-05, + "loss": 0.4673, + "step": 2920 + }, + { + "epoch": 0.51, + "learning_rate": 1.0291782122326112e-05, + "loss": 0.4567, + "step": 2921 + }, + { + "epoch": 0.51, + "learning_rate": 1.0286172474952742e-05, + "loss": 0.4812, + "step": 2922 + }, + { + "epoch": 0.51, + "learning_rate": 1.0280562737450938e-05, + "loss": 0.4611, + "step": 2923 + }, + { + "epoch": 0.51, + "learning_rate": 1.0274952911587464e-05, + "loss": 0.4646, + "step": 2924 + }, + { + "epoch": 0.51, + "learning_rate": 1.0269342999129097e-05, + "loss": 0.4584, + "step": 2925 + }, + { + "epoch": 0.51, + "learning_rate": 1.026373300184265e-05, + "loss": 0.4704, + "step": 2926 + }, + { + "epoch": 0.51, + "learning_rate": 1.025812292149496e-05, + "loss": 0.4644, + "step": 2927 + }, + { + "epoch": 0.51, + "learning_rate": 1.0252512759852891e-05, + "loss": 0.4719, + "step": 2928 + }, + { + "epoch": 0.51, + "learning_rate": 1.0246902518683331e-05, + "loss": 0.485, + "step": 2929 + }, + { + "epoch": 0.51, + "learning_rate": 1.0241292199753196e-05, + "loss": 0.461, + "step": 2930 + }, + { + "epoch": 0.51, + "learning_rate": 1.0235681804829426e-05, + "loss": 0.4711, + "step": 2931 + }, + { + "epoch": 0.51, + "learning_rate": 1.0230071335678982e-05, + "loss": 0.4688, + "step": 2932 + }, + { + "epoch": 0.51, + "learning_rate": 1.0224460794068849e-05, + "loss": 0.4713, + "step": 2933 + }, + { + "epoch": 0.51, + "learning_rate": 1.0218850181766038e-05, + "loss": 0.458, + "step": 2934 + }, + { + "epoch": 0.51, + "learning_rate": 1.0213239500537577e-05, + "loss": 0.4666, + "step": 2935 + }, + { + "epoch": 0.51, + "learning_rate": 1.020762875215052e-05, + "loss": 0.4552, + "step": 2936 + }, + { + "epoch": 0.51, + "learning_rate": 1.0202017938371947e-05, + "loss": 0.4701, + "step": 2937 + }, + { + "epoch": 0.51, + "learning_rate": 1.0196407060968942e-05, + "loss": 0.4638, + "step": 2938 + }, + { + "epoch": 0.51, + "learning_rate": 1.0190796121708627e-05, + "loss": 0.4686, + "step": 2939 + }, + { + "epoch": 0.51, + "learning_rate": 1.0185185122358139e-05, + "loss": 0.4612, + "step": 2940 + }, + { + "epoch": 0.51, + "learning_rate": 1.017957406468462e-05, + "loss": 0.4647, + "step": 2941 + }, + { + "epoch": 0.51, + "learning_rate": 1.0173962950455249e-05, + "loss": 0.4582, + "step": 2942 + }, + { + "epoch": 0.51, + "learning_rate": 1.0168351781437215e-05, + "loss": 0.4753, + "step": 2943 + }, + { + "epoch": 0.51, + "learning_rate": 1.0162740559397726e-05, + "loss": 0.465, + "step": 2944 + }, + { + "epoch": 0.51, + "learning_rate": 1.0157129286104e-05, + "loss": 0.461, + "step": 2945 + }, + { + "epoch": 0.51, + "learning_rate": 1.015151796332328e-05, + "loss": 0.4559, + "step": 2946 + }, + { + "epoch": 0.51, + "learning_rate": 1.0145906592822819e-05, + "loss": 0.4736, + "step": 2947 + }, + { + "epoch": 0.51, + "learning_rate": 1.014029517636989e-05, + "loss": 0.4715, + "step": 2948 + }, + { + "epoch": 0.51, + "learning_rate": 1.013468371573177e-05, + "loss": 0.4753, + "step": 2949 + }, + { + "epoch": 0.51, + "learning_rate": 1.0129072212675766e-05, + "loss": 0.467, + "step": 2950 + }, + { + "epoch": 0.51, + "learning_rate": 1.0123460668969184e-05, + "loss": 0.4833, + "step": 2951 + }, + { + "epoch": 0.51, + "learning_rate": 1.0117849086379355e-05, + "loss": 0.4622, + "step": 2952 + }, + { + "epoch": 0.51, + "learning_rate": 1.011223746667361e-05, + "loss": 0.4684, + "step": 2953 + }, + { + "epoch": 0.51, + "learning_rate": 1.0106625811619297e-05, + "loss": 0.4609, + "step": 2954 + }, + { + "epoch": 0.51, + "learning_rate": 1.010101412298378e-05, + "loss": 0.4705, + "step": 2955 + }, + { + "epoch": 0.51, + "learning_rate": 1.0095402402534423e-05, + "loss": 0.4534, + "step": 2956 + }, + { + "epoch": 0.51, + "learning_rate": 1.0089790652038613e-05, + "loss": 0.4711, + "step": 2957 + }, + { + "epoch": 0.51, + "learning_rate": 1.0084178873263735e-05, + "loss": 0.4686, + "step": 2958 + }, + { + "epoch": 0.51, + "learning_rate": 1.0078567067977193e-05, + "loss": 0.4894, + "step": 2959 + }, + { + "epoch": 0.51, + "learning_rate": 1.0072955237946383e-05, + "loss": 0.474, + "step": 2960 + }, + { + "epoch": 0.51, + "learning_rate": 1.0067343384938731e-05, + "loss": 0.4594, + "step": 2961 + }, + { + "epoch": 0.51, + "learning_rate": 1.0061731510721653e-05, + "loss": 0.4644, + "step": 2962 + }, + { + "epoch": 0.51, + "learning_rate": 1.005611961706258e-05, + "loss": 0.4748, + "step": 2963 + }, + { + "epoch": 0.51, + "learning_rate": 1.0050507705728943e-05, + "loss": 0.4724, + "step": 2964 + }, + { + "epoch": 0.51, + "learning_rate": 1.0044895778488184e-05, + "loss": 0.4678, + "step": 2965 + }, + { + "epoch": 0.51, + "learning_rate": 1.0039283837107753e-05, + "loss": 0.469, + "step": 2966 + }, + { + "epoch": 0.51, + "learning_rate": 1.0033671883355093e-05, + "loss": 0.4649, + "step": 2967 + }, + { + "epoch": 0.51, + "learning_rate": 1.0028059918997664e-05, + "loss": 0.4727, + "step": 2968 + }, + { + "epoch": 0.51, + "learning_rate": 1.0022447945802917e-05, + "loss": 0.4778, + "step": 2969 + }, + { + "epoch": 0.51, + "learning_rate": 1.0016835965538314e-05, + "loss": 0.4505, + "step": 2970 + }, + { + "epoch": 0.51, + "learning_rate": 1.0011223979971319e-05, + "loss": 0.4661, + "step": 2971 + }, + { + "epoch": 0.51, + "learning_rate": 1.0005611990869392e-05, + "loss": 0.472, + "step": 2972 + }, + { + "epoch": 0.52, + "learning_rate": 1e-05, + "loss": 0.4793, + "step": 2973 + }, + { + "epoch": 0.52, + "learning_rate": 9.99438800913061e-06, + "loss": 0.463, + "step": 2974 + }, + { + "epoch": 0.52, + "learning_rate": 9.988776020028685e-06, + "loss": 0.4748, + "step": 2975 + }, + { + "epoch": 0.52, + "learning_rate": 9.983164034461686e-06, + "loss": 0.4596, + "step": 2976 + }, + { + "epoch": 0.52, + "learning_rate": 9.977552054197088e-06, + "loss": 0.4713, + "step": 2977 + }, + { + "epoch": 0.52, + "learning_rate": 9.971940081002338e-06, + "loss": 0.4618, + "step": 2978 + }, + { + "epoch": 0.52, + "learning_rate": 9.96632811664491e-06, + "loss": 0.4759, + "step": 2979 + }, + { + "epoch": 0.52, + "learning_rate": 9.96071616289225e-06, + "loss": 0.4563, + "step": 2980 + }, + { + "epoch": 0.52, + "learning_rate": 9.955104221511816e-06, + "loss": 0.4776, + "step": 2981 + }, + { + "epoch": 0.52, + "learning_rate": 9.949492294271062e-06, + "loss": 0.4688, + "step": 2982 + }, + { + "epoch": 0.52, + "learning_rate": 9.943880382937426e-06, + "loss": 0.481, + "step": 2983 + }, + { + "epoch": 0.52, + "learning_rate": 9.938268489278352e-06, + "loss": 0.4643, + "step": 2984 + }, + { + "epoch": 0.52, + "learning_rate": 9.932656615061274e-06, + "loss": 0.4796, + "step": 2985 + }, + { + "epoch": 0.52, + "learning_rate": 9.927044762053622e-06, + "loss": 0.4643, + "step": 2986 + }, + { + "epoch": 0.52, + "learning_rate": 9.921432932022812e-06, + "loss": 0.4787, + "step": 2987 + }, + { + "epoch": 0.52, + "learning_rate": 9.915821126736266e-06, + "loss": 0.4675, + "step": 2988 + }, + { + "epoch": 0.52, + "learning_rate": 9.910209347961389e-06, + "loss": 0.4763, + "step": 2989 + }, + { + "epoch": 0.52, + "learning_rate": 9.904597597465577e-06, + "loss": 0.4723, + "step": 2990 + }, + { + "epoch": 0.52, + "learning_rate": 9.898985877016225e-06, + "loss": 0.4615, + "step": 2991 + }, + { + "epoch": 0.52, + "learning_rate": 9.893374188380705e-06, + "loss": 0.4593, + "step": 2992 + }, + { + "epoch": 0.52, + "learning_rate": 9.887762533326396e-06, + "loss": 0.4683, + "step": 2993 + }, + { + "epoch": 0.52, + "learning_rate": 9.882150913620648e-06, + "loss": 0.4717, + "step": 2994 + }, + { + "epoch": 0.52, + "learning_rate": 9.876539331030814e-06, + "loss": 0.4672, + "step": 2995 + }, + { + "epoch": 0.52, + "learning_rate": 9.870927787324236e-06, + "loss": 0.4518, + "step": 2996 + }, + { + "epoch": 0.52, + "learning_rate": 9.865316284268232e-06, + "loss": 0.4798, + "step": 2997 + }, + { + "epoch": 0.52, + "learning_rate": 9.859704823630115e-06, + "loss": 0.4739, + "step": 2998 + }, + { + "epoch": 0.52, + "learning_rate": 9.854093407177185e-06, + "loss": 0.4692, + "step": 2999 + }, + { + "epoch": 0.52, + "learning_rate": 9.848482036676725e-06, + "loss": 0.4533, + "step": 3000 + }, + { + "epoch": 0.52, + "learning_rate": 9.842870713896004e-06, + "loss": 0.4729, + "step": 3001 + }, + { + "epoch": 0.52, + "learning_rate": 9.837259440602274e-06, + "loss": 0.4635, + "step": 3002 + }, + { + "epoch": 0.52, + "learning_rate": 9.831648218562787e-06, + "loss": 0.4655, + "step": 3003 + }, + { + "epoch": 0.52, + "learning_rate": 9.82603704954475e-06, + "loss": 0.4707, + "step": 3004 + }, + { + "epoch": 0.52, + "learning_rate": 9.820425935315381e-06, + "loss": 0.4678, + "step": 3005 + }, + { + "epoch": 0.52, + "learning_rate": 9.814814877641865e-06, + "loss": 0.4674, + "step": 3006 + }, + { + "epoch": 0.52, + "learning_rate": 9.809203878291374e-06, + "loss": 0.4597, + "step": 3007 + }, + { + "epoch": 0.52, + "learning_rate": 9.80359293903106e-06, + "loss": 0.4624, + "step": 3008 + }, + { + "epoch": 0.52, + "learning_rate": 9.797982061628056e-06, + "loss": 0.4711, + "step": 3009 + }, + { + "epoch": 0.52, + "learning_rate": 9.792371247849481e-06, + "loss": 0.4489, + "step": 3010 + }, + { + "epoch": 0.52, + "learning_rate": 9.786760499462425e-06, + "loss": 0.4755, + "step": 3011 + }, + { + "epoch": 0.52, + "learning_rate": 9.781149818233969e-06, + "loss": 0.4623, + "step": 3012 + }, + { + "epoch": 0.52, + "learning_rate": 9.775539205931153e-06, + "loss": 0.4762, + "step": 3013 + }, + { + "epoch": 0.52, + "learning_rate": 9.769928664321021e-06, + "loss": 0.4694, + "step": 3014 + }, + { + "epoch": 0.52, + "learning_rate": 9.764318195170575e-06, + "loss": 0.4562, + "step": 3015 + }, + { + "epoch": 0.52, + "learning_rate": 9.758707800246806e-06, + "loss": 0.4686, + "step": 3016 + }, + { + "epoch": 0.52, + "learning_rate": 9.753097481316672e-06, + "loss": 0.4821, + "step": 3017 + }, + { + "epoch": 0.52, + "learning_rate": 9.747487240147112e-06, + "loss": 0.4611, + "step": 3018 + }, + { + "epoch": 0.52, + "learning_rate": 9.741877078505046e-06, + "loss": 0.4765, + "step": 3019 + }, + { + "epoch": 0.52, + "learning_rate": 9.736266998157353e-06, + "loss": 0.4685, + "step": 3020 + }, + { + "epoch": 0.52, + "learning_rate": 9.73065700087091e-06, + "loss": 0.4648, + "step": 3021 + }, + { + "epoch": 0.52, + "learning_rate": 9.725047088412538e-06, + "loss": 0.4586, + "step": 3022 + }, + { + "epoch": 0.52, + "learning_rate": 9.719437262549061e-06, + "loss": 0.4572, + "step": 3023 + }, + { + "epoch": 0.52, + "learning_rate": 9.713827525047261e-06, + "loss": 0.4636, + "step": 3024 + }, + { + "epoch": 0.52, + "learning_rate": 9.708217877673888e-06, + "loss": 0.4712, + "step": 3025 + }, + { + "epoch": 0.52, + "learning_rate": 9.702608322195674e-06, + "loss": 0.4651, + "step": 3026 + }, + { + "epoch": 0.52, + "learning_rate": 9.696998860379313e-06, + "loss": 0.4669, + "step": 3027 + }, + { + "epoch": 0.52, + "learning_rate": 9.691389493991478e-06, + "loss": 0.4548, + "step": 3028 + }, + { + "epoch": 0.52, + "learning_rate": 9.685780224798805e-06, + "loss": 0.4726, + "step": 3029 + }, + { + "epoch": 0.52, + "learning_rate": 9.6801710545679e-06, + "loss": 0.4694, + "step": 3030 + }, + { + "epoch": 0.53, + "learning_rate": 9.674561985065349e-06, + "loss": 0.4704, + "step": 3031 + }, + { + "epoch": 0.53, + "learning_rate": 9.668953018057687e-06, + "loss": 0.4679, + "step": 3032 + }, + { + "epoch": 0.53, + "learning_rate": 9.663344155311436e-06, + "loss": 0.4752, + "step": 3033 + }, + { + "epoch": 0.53, + "learning_rate": 9.657735398593068e-06, + "loss": 0.4705, + "step": 3034 + }, + { + "epoch": 0.53, + "learning_rate": 9.652126749669036e-06, + "loss": 0.4681, + "step": 3035 + }, + { + "epoch": 0.53, + "learning_rate": 9.646518210305747e-06, + "loss": 0.4661, + "step": 3036 + }, + { + "epoch": 0.53, + "learning_rate": 9.64090978226958e-06, + "loss": 0.4631, + "step": 3037 + }, + { + "epoch": 0.53, + "learning_rate": 9.635301467326888e-06, + "loss": 0.4626, + "step": 3038 + }, + { + "epoch": 0.53, + "learning_rate": 9.629693267243963e-06, + "loss": 0.4856, + "step": 3039 + }, + { + "epoch": 0.53, + "learning_rate": 9.62408518378709e-06, + "loss": 0.4672, + "step": 3040 + }, + { + "epoch": 0.53, + "learning_rate": 9.618477218722496e-06, + "loss": 0.4745, + "step": 3041 + }, + { + "epoch": 0.53, + "learning_rate": 9.612869373816383e-06, + "loss": 0.4665, + "step": 3042 + }, + { + "epoch": 0.53, + "learning_rate": 9.607261650834906e-06, + "loss": 0.4643, + "step": 3043 + }, + { + "epoch": 0.53, + "learning_rate": 9.601654051544188e-06, + "loss": 0.4712, + "step": 3044 + }, + { + "epoch": 0.53, + "learning_rate": 9.596046577710314e-06, + "loss": 0.4799, + "step": 3045 + }, + { + "epoch": 0.53, + "learning_rate": 9.59043923109932e-06, + "loss": 0.4698, + "step": 3046 + }, + { + "epoch": 0.53, + "learning_rate": 9.58483201347722e-06, + "loss": 0.4743, + "step": 3047 + }, + { + "epoch": 0.53, + "learning_rate": 9.579224926609962e-06, + "loss": 0.4615, + "step": 3048 + }, + { + "epoch": 0.53, + "learning_rate": 9.57361797226348e-06, + "loss": 0.4528, + "step": 3049 + }, + { + "epoch": 0.53, + "learning_rate": 9.568011152203642e-06, + "loss": 0.4662, + "step": 3050 + }, + { + "epoch": 0.53, + "learning_rate": 9.562404468196292e-06, + "loss": 0.4747, + "step": 3051 + }, + { + "epoch": 0.53, + "learning_rate": 9.556797922007221e-06, + "loss": 0.464, + "step": 3052 + }, + { + "epoch": 0.53, + "learning_rate": 9.55119151540218e-06, + "loss": 0.4595, + "step": 3053 + }, + { + "epoch": 0.53, + "learning_rate": 9.545585250146879e-06, + "loss": 0.4652, + "step": 3054 + }, + { + "epoch": 0.53, + "learning_rate": 9.539979128006971e-06, + "loss": 0.4774, + "step": 3055 + }, + { + "epoch": 0.53, + "learning_rate": 9.534373150748086e-06, + "loss": 0.4644, + "step": 3056 + }, + { + "epoch": 0.53, + "learning_rate": 9.528767320135783e-06, + "loss": 0.469, + "step": 3057 + }, + { + "epoch": 0.53, + "learning_rate": 9.523161637935592e-06, + "loss": 0.4634, + "step": 3058 + }, + { + "epoch": 0.53, + "learning_rate": 9.517556105912994e-06, + "loss": 0.4786, + "step": 3059 + }, + { + "epoch": 0.53, + "learning_rate": 9.511950725833418e-06, + "loss": 0.4586, + "step": 3060 + }, + { + "epoch": 0.53, + "learning_rate": 9.50634549946225e-06, + "loss": 0.4675, + "step": 3061 + }, + { + "epoch": 0.53, + "learning_rate": 9.500740428564819e-06, + "loss": 0.4654, + "step": 3062 + }, + { + "epoch": 0.53, + "learning_rate": 9.495135514906415e-06, + "loss": 0.4633, + "step": 3063 + }, + { + "epoch": 0.53, + "learning_rate": 9.489530760252272e-06, + "loss": 0.4626, + "step": 3064 + }, + { + "epoch": 0.53, + "learning_rate": 9.483926166367578e-06, + "loss": 0.4748, + "step": 3065 + }, + { + "epoch": 0.53, + "learning_rate": 9.478321735017471e-06, + "loss": 0.4612, + "step": 3066 + }, + { + "epoch": 0.53, + "learning_rate": 9.47271746796703e-06, + "loss": 0.4688, + "step": 3067 + }, + { + "epoch": 0.53, + "learning_rate": 9.467113366981294e-06, + "loss": 0.4579, + "step": 3068 + }, + { + "epoch": 0.53, + "learning_rate": 9.461509433825238e-06, + "loss": 0.4761, + "step": 3069 + }, + { + "epoch": 0.53, + "learning_rate": 9.455905670263792e-06, + "loss": 0.4584, + "step": 3070 + }, + { + "epoch": 0.53, + "learning_rate": 9.45030207806183e-06, + "loss": 0.4614, + "step": 3071 + }, + { + "epoch": 0.53, + "learning_rate": 9.44469865898417e-06, + "loss": 0.4731, + "step": 3072 + }, + { + "epoch": 0.53, + "learning_rate": 9.439095414795584e-06, + "loss": 0.4735, + "step": 3073 + }, + { + "epoch": 0.53, + "learning_rate": 9.433492347260776e-06, + "loss": 0.4629, + "step": 3074 + }, + { + "epoch": 0.53, + "learning_rate": 9.427889458144405e-06, + "loss": 0.478, + "step": 3075 + }, + { + "epoch": 0.53, + "learning_rate": 9.422286749211068e-06, + "loss": 0.4597, + "step": 3076 + }, + { + "epoch": 0.53, + "learning_rate": 9.416684222225308e-06, + "loss": 0.4684, + "step": 3077 + }, + { + "epoch": 0.53, + "learning_rate": 9.411081878951607e-06, + "loss": 0.4601, + "step": 3078 + }, + { + "epoch": 0.53, + "learning_rate": 9.40547972115439e-06, + "loss": 0.481, + "step": 3079 + }, + { + "epoch": 0.53, + "learning_rate": 9.39987775059804e-06, + "loss": 0.4723, + "step": 3080 + }, + { + "epoch": 0.53, + "learning_rate": 9.394275969046845e-06, + "loss": 0.4726, + "step": 3081 + }, + { + "epoch": 0.53, + "learning_rate": 9.388674378265074e-06, + "loss": 0.4632, + "step": 3082 + }, + { + "epoch": 0.53, + "learning_rate": 9.383072980016902e-06, + "loss": 0.4645, + "step": 3083 + }, + { + "epoch": 0.53, + "learning_rate": 9.377471776066469e-06, + "loss": 0.477, + "step": 3084 + }, + { + "epoch": 0.53, + "learning_rate": 9.371870768177836e-06, + "loss": 0.4647, + "step": 3085 + }, + { + "epoch": 0.53, + "learning_rate": 9.366269958115014e-06, + "loss": 0.4684, + "step": 3086 + }, + { + "epoch": 0.53, + "learning_rate": 9.360669347641946e-06, + "loss": 0.4689, + "step": 3087 + }, + { + "epoch": 0.53, + "learning_rate": 9.355068938522508e-06, + "loss": 0.4689, + "step": 3088 + }, + { + "epoch": 0.54, + "learning_rate": 9.349468732520529e-06, + "loss": 0.4681, + "step": 3089 + }, + { + "epoch": 0.54, + "learning_rate": 9.34386873139975e-06, + "loss": 0.4613, + "step": 3090 + }, + { + "epoch": 0.54, + "learning_rate": 9.33826893692387e-06, + "loss": 0.4658, + "step": 3091 + }, + { + "epoch": 0.54, + "learning_rate": 9.332669350856503e-06, + "loss": 0.4719, + "step": 3092 + }, + { + "epoch": 0.54, + "learning_rate": 9.327069974961219e-06, + "loss": 0.4618, + "step": 3093 + }, + { + "epoch": 0.54, + "learning_rate": 9.321470811001502e-06, + "loss": 0.4697, + "step": 3094 + }, + { + "epoch": 0.54, + "learning_rate": 9.315871860740782e-06, + "loss": 0.462, + "step": 3095 + }, + { + "epoch": 0.54, + "learning_rate": 9.310273125942418e-06, + "loss": 0.4703, + "step": 3096 + }, + { + "epoch": 0.54, + "learning_rate": 9.304674608369695e-06, + "loss": 0.4592, + "step": 3097 + }, + { + "epoch": 0.54, + "learning_rate": 9.299076309785839e-06, + "loss": 0.4773, + "step": 3098 + }, + { + "epoch": 0.54, + "learning_rate": 9.293478231954e-06, + "loss": 0.4705, + "step": 3099 + }, + { + "epoch": 0.54, + "learning_rate": 9.287880376637262e-06, + "loss": 0.476, + "step": 3100 + }, + { + "epoch": 0.54, + "learning_rate": 9.282282745598646e-06, + "loss": 0.4617, + "step": 3101 + }, + { + "epoch": 0.54, + "learning_rate": 9.276685340601085e-06, + "loss": 0.4752, + "step": 3102 + }, + { + "epoch": 0.54, + "learning_rate": 9.271088163407455e-06, + "loss": 0.463, + "step": 3103 + }, + { + "epoch": 0.54, + "learning_rate": 9.265491215780556e-06, + "loss": 0.4675, + "step": 3104 + }, + { + "epoch": 0.54, + "learning_rate": 9.259894499483116e-06, + "loss": 0.4748, + "step": 3105 + }, + { + "epoch": 0.54, + "learning_rate": 9.254298016277785e-06, + "loss": 0.4773, + "step": 3106 + }, + { + "epoch": 0.54, + "learning_rate": 9.248701767927146e-06, + "loss": 0.463, + "step": 3107 + }, + { + "epoch": 0.54, + "learning_rate": 9.243105756193714e-06, + "loss": 0.4689, + "step": 3108 + }, + { + "epoch": 0.54, + "learning_rate": 9.23750998283991e-06, + "loss": 0.4777, + "step": 3109 + }, + { + "epoch": 0.54, + "learning_rate": 9.231914449628103e-06, + "loss": 0.4587, + "step": 3110 + }, + { + "epoch": 0.54, + "learning_rate": 9.226319158320565e-06, + "loss": 0.4722, + "step": 3111 + }, + { + "epoch": 0.54, + "learning_rate": 9.22072411067951e-06, + "loss": 0.4691, + "step": 3112 + }, + { + "epoch": 0.54, + "learning_rate": 9.215129308467062e-06, + "loss": 0.4654, + "step": 3113 + }, + { + "epoch": 0.54, + "learning_rate": 9.20953475344527e-06, + "loss": 0.4789, + "step": 3114 + }, + { + "epoch": 0.54, + "learning_rate": 9.20394044737612e-06, + "loss": 0.4623, + "step": 3115 + }, + { + "epoch": 0.54, + "learning_rate": 9.198346392021494e-06, + "loss": 0.469, + "step": 3116 + }, + { + "epoch": 0.54, + "learning_rate": 9.192752589143219e-06, + "loss": 0.4741, + "step": 3117 + }, + { + "epoch": 0.54, + "learning_rate": 9.187159040503025e-06, + "loss": 0.4653, + "step": 3118 + }, + { + "epoch": 0.54, + "learning_rate": 9.181565747862575e-06, + "loss": 0.4558, + "step": 3119 + }, + { + "epoch": 0.54, + "learning_rate": 9.175972712983439e-06, + "loss": 0.4667, + "step": 3120 + }, + { + "epoch": 0.54, + "learning_rate": 9.170379937627116e-06, + "loss": 0.4752, + "step": 3121 + }, + { + "epoch": 0.54, + "learning_rate": 9.16478742355502e-06, + "loss": 0.4638, + "step": 3122 + }, + { + "epoch": 0.54, + "learning_rate": 9.159195172528478e-06, + "loss": 0.4571, + "step": 3123 + }, + { + "epoch": 0.54, + "learning_rate": 9.153603186308747e-06, + "loss": 0.4727, + "step": 3124 + }, + { + "epoch": 0.54, + "learning_rate": 9.148011466656981e-06, + "loss": 0.465, + "step": 3125 + }, + { + "epoch": 0.54, + "learning_rate": 9.14242001533427e-06, + "loss": 0.4761, + "step": 3126 + }, + { + "epoch": 0.54, + "learning_rate": 9.136828834101606e-06, + "loss": 0.4711, + "step": 3127 + }, + { + "epoch": 0.54, + "learning_rate": 9.1312379247199e-06, + "loss": 0.4669, + "step": 3128 + }, + { + "epoch": 0.54, + "learning_rate": 9.125647288949982e-06, + "loss": 0.4521, + "step": 3129 + }, + { + "epoch": 0.54, + "learning_rate": 9.120056928552586e-06, + "loss": 0.4658, + "step": 3130 + }, + { + "epoch": 0.54, + "learning_rate": 9.114466845288372e-06, + "loss": 0.463, + "step": 3131 + }, + { + "epoch": 0.54, + "learning_rate": 9.108877040917896e-06, + "loss": 0.4682, + "step": 3132 + }, + { + "epoch": 0.54, + "learning_rate": 9.103287517201647e-06, + "loss": 0.4614, + "step": 3133 + }, + { + "epoch": 0.54, + "learning_rate": 9.097698275900004e-06, + "loss": 0.4742, + "step": 3134 + }, + { + "epoch": 0.54, + "learning_rate": 9.092109318773274e-06, + "loss": 0.4581, + "step": 3135 + }, + { + "epoch": 0.54, + "learning_rate": 9.086520647581667e-06, + "loss": 0.4641, + "step": 3136 + }, + { + "epoch": 0.54, + "learning_rate": 9.080932264085302e-06, + "loss": 0.451, + "step": 3137 + }, + { + "epoch": 0.54, + "learning_rate": 9.075344170044212e-06, + "loss": 0.4747, + "step": 3138 + }, + { + "epoch": 0.54, + "learning_rate": 9.069756367218333e-06, + "loss": 0.4549, + "step": 3139 + }, + { + "epoch": 0.54, + "learning_rate": 9.064168857367514e-06, + "loss": 0.476, + "step": 3140 + }, + { + "epoch": 0.54, + "learning_rate": 9.05858164225151e-06, + "loss": 0.4573, + "step": 3141 + }, + { + "epoch": 0.54, + "learning_rate": 9.052994723629982e-06, + "loss": 0.4651, + "step": 3142 + }, + { + "epoch": 0.54, + "learning_rate": 9.047408103262503e-06, + "loss": 0.4638, + "step": 3143 + }, + { + "epoch": 0.54, + "learning_rate": 9.041821782908544e-06, + "loss": 0.4693, + "step": 3144 + }, + { + "epoch": 0.54, + "learning_rate": 9.03623576432749e-06, + "loss": 0.4805, + "step": 3145 + }, + { + "epoch": 0.54, + "learning_rate": 9.03065004927862e-06, + "loss": 0.4691, + "step": 3146 + }, + { + "epoch": 0.55, + "learning_rate": 9.02506463952113e-06, + "loss": 0.4667, + "step": 3147 + }, + { + "epoch": 0.55, + "learning_rate": 9.019479536814108e-06, + "loss": 0.4706, + "step": 3148 + }, + { + "epoch": 0.55, + "learning_rate": 9.013894742916554e-06, + "loss": 0.455, + "step": 3149 + }, + { + "epoch": 0.55, + "learning_rate": 9.008310259587374e-06, + "loss": 0.4606, + "step": 3150 + }, + { + "epoch": 0.55, + "learning_rate": 9.002726088585356e-06, + "loss": 0.4682, + "step": 3151 + }, + { + "epoch": 0.55, + "learning_rate": 8.997142231669217e-06, + "loss": 0.4523, + "step": 3152 + }, + { + "epoch": 0.55, + "learning_rate": 8.991558690597553e-06, + "loss": 0.473, + "step": 3153 + }, + { + "epoch": 0.55, + "learning_rate": 8.985975467128875e-06, + "loss": 0.4886, + "step": 3154 + }, + { + "epoch": 0.55, + "learning_rate": 8.980392563021585e-06, + "loss": 0.4663, + "step": 3155 + }, + { + "epoch": 0.55, + "learning_rate": 8.974809980033987e-06, + "loss": 0.4706, + "step": 3156 + }, + { + "epoch": 0.55, + "learning_rate": 8.969227719924289e-06, + "loss": 0.4569, + "step": 3157 + }, + { + "epoch": 0.55, + "learning_rate": 8.963645784450584e-06, + "loss": 0.4681, + "step": 3158 + }, + { + "epoch": 0.55, + "learning_rate": 8.958064175370884e-06, + "loss": 0.4673, + "step": 3159 + }, + { + "epoch": 0.55, + "learning_rate": 8.95248289444307e-06, + "loss": 0.4689, + "step": 3160 + }, + { + "epoch": 0.55, + "learning_rate": 8.946901943424951e-06, + "loss": 0.4577, + "step": 3161 + }, + { + "epoch": 0.55, + "learning_rate": 8.941321324074207e-06, + "loss": 0.4649, + "step": 3162 + }, + { + "epoch": 0.55, + "learning_rate": 8.935741038148426e-06, + "loss": 0.4548, + "step": 3163 + }, + { + "epoch": 0.55, + "learning_rate": 8.930161087405089e-06, + "loss": 0.476, + "step": 3164 + }, + { + "epoch": 0.55, + "learning_rate": 8.924581473601568e-06, + "loss": 0.4596, + "step": 3165 + }, + { + "epoch": 0.55, + "learning_rate": 8.919002198495135e-06, + "loss": 0.4701, + "step": 3166 + }, + { + "epoch": 0.55, + "learning_rate": 8.913423263842943e-06, + "loss": 0.4719, + "step": 3167 + }, + { + "epoch": 0.55, + "learning_rate": 8.90784467140206e-06, + "loss": 0.4662, + "step": 3168 + }, + { + "epoch": 0.55, + "learning_rate": 8.90226642292942e-06, + "loss": 0.4529, + "step": 3169 + }, + { + "epoch": 0.55, + "learning_rate": 8.896688520181867e-06, + "loss": 0.4792, + "step": 3170 + }, + { + "epoch": 0.55, + "learning_rate": 8.891110964916135e-06, + "loss": 0.4617, + "step": 3171 + }, + { + "epoch": 0.55, + "learning_rate": 8.885533758888835e-06, + "loss": 0.4782, + "step": 3172 + }, + { + "epoch": 0.55, + "learning_rate": 8.879956903856484e-06, + "loss": 0.4593, + "step": 3173 + }, + { + "epoch": 0.55, + "learning_rate": 8.874380401575476e-06, + "loss": 0.4656, + "step": 3174 + }, + { + "epoch": 0.55, + "learning_rate": 8.868804253802103e-06, + "loss": 0.4525, + "step": 3175 + }, + { + "epoch": 0.55, + "learning_rate": 8.863228462292537e-06, + "loss": 0.4732, + "step": 3176 + }, + { + "epoch": 0.55, + "learning_rate": 8.85765302880285e-06, + "loss": 0.4595, + "step": 3177 + }, + { + "epoch": 0.55, + "learning_rate": 8.852077955088993e-06, + "loss": 0.459, + "step": 3178 + }, + { + "epoch": 0.55, + "learning_rate": 8.846503242906798e-06, + "loss": 0.4555, + "step": 3179 + }, + { + "epoch": 0.55, + "learning_rate": 8.840928894011995e-06, + "loss": 0.4713, + "step": 3180 + }, + { + "epoch": 0.55, + "learning_rate": 8.83535491016019e-06, + "loss": 0.4572, + "step": 3181 + }, + { + "epoch": 0.55, + "learning_rate": 8.829781293106884e-06, + "loss": 0.4592, + "step": 3182 + }, + { + "epoch": 0.55, + "learning_rate": 8.82420804460745e-06, + "loss": 0.4783, + "step": 3183 + }, + { + "epoch": 0.55, + "learning_rate": 8.818635166417154e-06, + "loss": 0.4731, + "step": 3184 + }, + { + "epoch": 0.55, + "learning_rate": 8.813062660291146e-06, + "loss": 0.4696, + "step": 3185 + }, + { + "epoch": 0.55, + "learning_rate": 8.807490527984453e-06, + "loss": 0.4599, + "step": 3186 + }, + { + "epoch": 0.55, + "learning_rate": 8.80191877125199e-06, + "loss": 0.4681, + "step": 3187 + }, + { + "epoch": 0.55, + "learning_rate": 8.796347391848547e-06, + "loss": 0.476, + "step": 3188 + }, + { + "epoch": 0.55, + "learning_rate": 8.790776391528803e-06, + "loss": 0.4761, + "step": 3189 + }, + { + "epoch": 0.55, + "learning_rate": 8.785205772047308e-06, + "loss": 0.4663, + "step": 3190 + }, + { + "epoch": 0.55, + "learning_rate": 8.779635535158498e-06, + "loss": 0.4719, + "step": 3191 + }, + { + "epoch": 0.55, + "learning_rate": 8.774065682616699e-06, + "loss": 0.4611, + "step": 3192 + }, + { + "epoch": 0.55, + "learning_rate": 8.76849621617609e-06, + "loss": 0.4583, + "step": 3193 + }, + { + "epoch": 0.55, + "learning_rate": 8.762927137590757e-06, + "loss": 0.477, + "step": 3194 + }, + { + "epoch": 0.55, + "learning_rate": 8.757358448614636e-06, + "loss": 0.4614, + "step": 3195 + }, + { + "epoch": 0.55, + "learning_rate": 8.751790151001569e-06, + "loss": 0.4577, + "step": 3196 + }, + { + "epoch": 0.55, + "learning_rate": 8.74622224650525e-06, + "loss": 0.4662, + "step": 3197 + }, + { + "epoch": 0.55, + "learning_rate": 8.740654736879265e-06, + "loss": 0.4608, + "step": 3198 + }, + { + "epoch": 0.55, + "learning_rate": 8.73508762387707e-06, + "loss": 0.4625, + "step": 3199 + }, + { + "epoch": 0.55, + "learning_rate": 8.729520909251994e-06, + "loss": 0.4822, + "step": 3200 + }, + { + "epoch": 0.55, + "learning_rate": 8.723954594757244e-06, + "loss": 0.4694, + "step": 3201 + }, + { + "epoch": 0.55, + "learning_rate": 8.718388682145897e-06, + "loss": 0.4683, + "step": 3202 + }, + { + "epoch": 0.55, + "learning_rate": 8.712823173170914e-06, + "loss": 0.4658, + "step": 3203 + }, + { + "epoch": 0.56, + "learning_rate": 8.707258069585109e-06, + "loss": 0.4694, + "step": 3204 + }, + { + "epoch": 0.56, + "learning_rate": 8.70169337314119e-06, + "loss": 0.4782, + "step": 3205 + }, + { + "epoch": 0.56, + "learning_rate": 8.696129085591726e-06, + "loss": 0.4704, + "step": 3206 + }, + { + "epoch": 0.56, + "learning_rate": 8.690565208689157e-06, + "loss": 0.4619, + "step": 3207 + }, + { + "epoch": 0.56, + "learning_rate": 8.685001744185795e-06, + "loss": 0.477, + "step": 3208 + }, + { + "epoch": 0.56, + "learning_rate": 8.679438693833821e-06, + "loss": 0.4601, + "step": 3209 + }, + { + "epoch": 0.56, + "learning_rate": 8.67387605938529e-06, + "loss": 0.4735, + "step": 3210 + }, + { + "epoch": 0.56, + "learning_rate": 8.668313842592116e-06, + "loss": 0.4636, + "step": 3211 + }, + { + "epoch": 0.56, + "learning_rate": 8.662752045206096e-06, + "loss": 0.4682, + "step": 3212 + }, + { + "epoch": 0.56, + "learning_rate": 8.657190668978887e-06, + "loss": 0.4597, + "step": 3213 + }, + { + "epoch": 0.56, + "learning_rate": 8.651629715662006e-06, + "loss": 0.4815, + "step": 3214 + }, + { + "epoch": 0.56, + "learning_rate": 8.646069187006854e-06, + "loss": 0.4489, + "step": 3215 + }, + { + "epoch": 0.56, + "learning_rate": 8.640509084764682e-06, + "loss": 0.4722, + "step": 3216 + }, + { + "epoch": 0.56, + "learning_rate": 8.634949410686615e-06, + "loss": 0.4673, + "step": 3217 + }, + { + "epoch": 0.56, + "learning_rate": 8.629390166523638e-06, + "loss": 0.4812, + "step": 3218 + }, + { + "epoch": 0.56, + "learning_rate": 8.623831354026609e-06, + "loss": 0.4494, + "step": 3219 + }, + { + "epoch": 0.56, + "learning_rate": 8.618272974946244e-06, + "loss": 0.4666, + "step": 3220 + }, + { + "epoch": 0.56, + "learning_rate": 8.612715031033125e-06, + "loss": 0.4671, + "step": 3221 + }, + { + "epoch": 0.56, + "learning_rate": 8.607157524037692e-06, + "loss": 0.4623, + "step": 3222 + }, + { + "epoch": 0.56, + "learning_rate": 8.601600455710254e-06, + "loss": 0.4647, + "step": 3223 + }, + { + "epoch": 0.56, + "learning_rate": 8.596043827800976e-06, + "loss": 0.4815, + "step": 3224 + }, + { + "epoch": 0.56, + "learning_rate": 8.590487642059888e-06, + "loss": 0.4712, + "step": 3225 + }, + { + "epoch": 0.56, + "learning_rate": 8.584931900236879e-06, + "loss": 0.4617, + "step": 3226 + }, + { + "epoch": 0.56, + "learning_rate": 8.579376604081705e-06, + "loss": 0.4552, + "step": 3227 + }, + { + "epoch": 0.56, + "learning_rate": 8.573821755343965e-06, + "loss": 0.473, + "step": 3228 + }, + { + "epoch": 0.56, + "learning_rate": 8.568267355773137e-06, + "loss": 0.4719, + "step": 3229 + }, + { + "epoch": 0.56, + "learning_rate": 8.562713407118543e-06, + "loss": 0.4709, + "step": 3230 + }, + { + "epoch": 0.56, + "learning_rate": 8.557159911129373e-06, + "loss": 0.4589, + "step": 3231 + }, + { + "epoch": 0.56, + "learning_rate": 8.551606869554665e-06, + "loss": 0.4652, + "step": 3232 + }, + { + "epoch": 0.56, + "learning_rate": 8.54605428414332e-06, + "loss": 0.471, + "step": 3233 + }, + { + "epoch": 0.56, + "learning_rate": 8.540502156644096e-06, + "loss": 0.4634, + "step": 3234 + }, + { + "epoch": 0.56, + "learning_rate": 8.534950488805599e-06, + "loss": 0.4661, + "step": 3235 + }, + { + "epoch": 0.56, + "learning_rate": 8.529399282376306e-06, + "loss": 0.467, + "step": 3236 + }, + { + "epoch": 0.56, + "learning_rate": 8.523848539104527e-06, + "loss": 0.4558, + "step": 3237 + }, + { + "epoch": 0.56, + "learning_rate": 8.518298260738448e-06, + "loss": 0.4654, + "step": 3238 + }, + { + "epoch": 0.56, + "learning_rate": 8.512748449026087e-06, + "loss": 0.4705, + "step": 3239 + }, + { + "epoch": 0.56, + "learning_rate": 8.507199105715336e-06, + "loss": 0.4738, + "step": 3240 + }, + { + "epoch": 0.56, + "learning_rate": 8.50165023255393e-06, + "loss": 0.453, + "step": 3241 + }, + { + "epoch": 0.56, + "learning_rate": 8.496101831289447e-06, + "loss": 0.4738, + "step": 3242 + }, + { + "epoch": 0.56, + "learning_rate": 8.490553903669335e-06, + "loss": 0.4652, + "step": 3243 + }, + { + "epoch": 0.56, + "learning_rate": 8.485006451440874e-06, + "loss": 0.471, + "step": 3244 + }, + { + "epoch": 0.56, + "learning_rate": 8.479459476351213e-06, + "loss": 0.4559, + "step": 3245 + }, + { + "epoch": 0.56, + "learning_rate": 8.473912980147329e-06, + "loss": 0.4811, + "step": 3246 + }, + { + "epoch": 0.56, + "learning_rate": 8.46836696457607e-06, + "loss": 0.4489, + "step": 3247 + }, + { + "epoch": 0.56, + "learning_rate": 8.462821431384123e-06, + "loss": 0.4736, + "step": 3248 + }, + { + "epoch": 0.56, + "learning_rate": 8.457276382318016e-06, + "loss": 0.4585, + "step": 3249 + }, + { + "epoch": 0.56, + "learning_rate": 8.451731819124137e-06, + "loss": 0.4688, + "step": 3250 + }, + { + "epoch": 0.56, + "learning_rate": 8.446187743548711e-06, + "loss": 0.4674, + "step": 3251 + }, + { + "epoch": 0.56, + "learning_rate": 8.440644157337819e-06, + "loss": 0.4713, + "step": 3252 + }, + { + "epoch": 0.56, + "learning_rate": 8.435101062237377e-06, + "loss": 0.4648, + "step": 3253 + }, + { + "epoch": 0.56, + "learning_rate": 8.42955845999315e-06, + "loss": 0.4804, + "step": 3254 + }, + { + "epoch": 0.56, + "learning_rate": 8.42401635235076e-06, + "loss": 0.4546, + "step": 3255 + }, + { + "epoch": 0.56, + "learning_rate": 8.418474741055657e-06, + "loss": 0.4593, + "step": 3256 + }, + { + "epoch": 0.56, + "learning_rate": 8.412933627853142e-06, + "loss": 0.4624, + "step": 3257 + }, + { + "epoch": 0.56, + "learning_rate": 8.407393014488354e-06, + "loss": 0.4556, + "step": 3258 + }, + { + "epoch": 0.56, + "learning_rate": 8.401852902706285e-06, + "loss": 0.4685, + "step": 3259 + }, + { + "epoch": 0.56, + "learning_rate": 8.396313294251755e-06, + "loss": 0.4548, + "step": 3260 + }, + { + "epoch": 0.56, + "learning_rate": 8.390774190869434e-06, + "loss": 0.47, + "step": 3261 + }, + { + "epoch": 0.57, + "learning_rate": 8.385235594303842e-06, + "loss": 0.4616, + "step": 3262 + }, + { + "epoch": 0.57, + "learning_rate": 8.379697506299313e-06, + "loss": 0.4621, + "step": 3263 + }, + { + "epoch": 0.57, + "learning_rate": 8.374159928600051e-06, + "loss": 0.4602, + "step": 3264 + }, + { + "epoch": 0.57, + "learning_rate": 8.368622862950079e-06, + "loss": 0.4845, + "step": 3265 + }, + { + "epoch": 0.57, + "learning_rate": 8.363086311093266e-06, + "loss": 0.4663, + "step": 3266 + }, + { + "epoch": 0.57, + "learning_rate": 8.357550274773317e-06, + "loss": 0.4665, + "step": 3267 + }, + { + "epoch": 0.57, + "learning_rate": 8.352014755733775e-06, + "loss": 0.458, + "step": 3268 + }, + { + "epoch": 0.57, + "learning_rate": 8.346479755718028e-06, + "loss": 0.4712, + "step": 3269 + }, + { + "epoch": 0.57, + "learning_rate": 8.340945276469282e-06, + "loss": 0.4693, + "step": 3270 + }, + { + "epoch": 0.57, + "learning_rate": 8.335411319730604e-06, + "loss": 0.4677, + "step": 3271 + }, + { + "epoch": 0.57, + "learning_rate": 8.329877887244867e-06, + "loss": 0.4688, + "step": 3272 + }, + { + "epoch": 0.57, + "learning_rate": 8.32434498075481e-06, + "loss": 0.4869, + "step": 3273 + }, + { + "epoch": 0.57, + "learning_rate": 8.318812602002984e-06, + "loss": 0.4563, + "step": 3274 + }, + { + "epoch": 0.57, + "learning_rate": 8.313280752731779e-06, + "loss": 0.4772, + "step": 3275 + }, + { + "epoch": 0.57, + "learning_rate": 8.307749434683426e-06, + "loss": 0.4574, + "step": 3276 + }, + { + "epoch": 0.57, + "learning_rate": 8.302218649599978e-06, + "loss": 0.4658, + "step": 3277 + }, + { + "epoch": 0.57, + "learning_rate": 8.296688399223327e-06, + "loss": 0.4539, + "step": 3278 + }, + { + "epoch": 0.57, + "learning_rate": 8.29115868529519e-06, + "loss": 0.4621, + "step": 3279 + }, + { + "epoch": 0.57, + "learning_rate": 8.285629509557132e-06, + "loss": 0.458, + "step": 3280 + }, + { + "epoch": 0.57, + "learning_rate": 8.28010087375052e-06, + "loss": 0.4663, + "step": 3281 + }, + { + "epoch": 0.57, + "learning_rate": 8.274572779616579e-06, + "loss": 0.4599, + "step": 3282 + }, + { + "epoch": 0.57, + "learning_rate": 8.269045228896349e-06, + "loss": 0.4721, + "step": 3283 + }, + { + "epoch": 0.57, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4592, + "step": 3284 + }, + { + "epoch": 0.57, + "learning_rate": 8.25799176466033e-06, + "loss": 0.4667, + "step": 3285 + }, + { + "epoch": 0.57, + "learning_rate": 8.252465854625766e-06, + "loss": 0.4715, + "step": 3286 + }, + { + "epoch": 0.57, + "learning_rate": 8.246940494967369e-06, + "loss": 0.4611, + "step": 3287 + }, + { + "epoch": 0.57, + "learning_rate": 8.24141568742531e-06, + "loss": 0.4716, + "step": 3288 + }, + { + "epoch": 0.57, + "learning_rate": 8.235891433739606e-06, + "loss": 0.4684, + "step": 3289 + }, + { + "epoch": 0.57, + "learning_rate": 8.230367735650088e-06, + "loss": 0.4594, + "step": 3290 + }, + { + "epoch": 0.57, + "learning_rate": 8.224844594896411e-06, + "loss": 0.4731, + "step": 3291 + }, + { + "epoch": 0.57, + "learning_rate": 8.219322013218062e-06, + "loss": 0.4606, + "step": 3292 + }, + { + "epoch": 0.57, + "learning_rate": 8.213799992354341e-06, + "loss": 0.4584, + "step": 3293 + }, + { + "epoch": 0.57, + "learning_rate": 8.208278534044382e-06, + "loss": 0.465, + "step": 3294 + }, + { + "epoch": 0.57, + "learning_rate": 8.202757640027137e-06, + "loss": 0.4589, + "step": 3295 + }, + { + "epoch": 0.57, + "learning_rate": 8.197237312041377e-06, + "loss": 0.4627, + "step": 3296 + }, + { + "epoch": 0.57, + "learning_rate": 8.191717551825707e-06, + "loss": 0.4657, + "step": 3297 + }, + { + "epoch": 0.57, + "learning_rate": 8.186198361118537e-06, + "loss": 0.4591, + "step": 3298 + }, + { + "epoch": 0.57, + "learning_rate": 8.18067974165811e-06, + "loss": 0.4741, + "step": 3299 + }, + { + "epoch": 0.57, + "learning_rate": 8.175161695182484e-06, + "loss": 0.4651, + "step": 3300 + }, + { + "epoch": 0.57, + "learning_rate": 8.169644223429535e-06, + "loss": 0.461, + "step": 3301 + }, + { + "epoch": 0.57, + "learning_rate": 8.16412732813696e-06, + "loss": 0.4654, + "step": 3302 + }, + { + "epoch": 0.57, + "learning_rate": 8.158611011042272e-06, + "loss": 0.4785, + "step": 3303 + }, + { + "epoch": 0.57, + "learning_rate": 8.153095273882816e-06, + "loss": 0.4581, + "step": 3304 + }, + { + "epoch": 0.57, + "learning_rate": 8.147580118395728e-06, + "loss": 0.4684, + "step": 3305 + }, + { + "epoch": 0.57, + "learning_rate": 8.142065546317988e-06, + "loss": 0.4578, + "step": 3306 + }, + { + "epoch": 0.57, + "learning_rate": 8.136551559386368e-06, + "loss": 0.4739, + "step": 3307 + }, + { + "epoch": 0.57, + "learning_rate": 8.131038159337478e-06, + "loss": 0.4797, + "step": 3308 + }, + { + "epoch": 0.57, + "learning_rate": 8.125525347907726e-06, + "loss": 0.4735, + "step": 3309 + }, + { + "epoch": 0.57, + "learning_rate": 8.120013126833344e-06, + "loss": 0.4692, + "step": 3310 + }, + { + "epoch": 0.57, + "learning_rate": 8.114501497850375e-06, + "loss": 0.462, + "step": 3311 + }, + { + "epoch": 0.57, + "learning_rate": 8.108990462694676e-06, + "loss": 0.4587, + "step": 3312 + }, + { + "epoch": 0.57, + "learning_rate": 8.103480023101919e-06, + "loss": 0.4697, + "step": 3313 + }, + { + "epoch": 0.57, + "learning_rate": 8.097970180807577e-06, + "loss": 0.4634, + "step": 3314 + }, + { + "epoch": 0.57, + "learning_rate": 8.09246093754696e-06, + "loss": 0.4669, + "step": 3315 + }, + { + "epoch": 0.57, + "learning_rate": 8.086952295055158e-06, + "loss": 0.4627, + "step": 3316 + }, + { + "epoch": 0.57, + "learning_rate": 8.081444255067096e-06, + "loss": 0.4747, + "step": 3317 + }, + { + "epoch": 0.57, + "learning_rate": 8.075936819317501e-06, + "loss": 0.4707, + "step": 3318 + }, + { + "epoch": 0.57, + "learning_rate": 8.070429989540905e-06, + "loss": 0.4572, + "step": 3319 + }, + { + "epoch": 0.58, + "learning_rate": 8.064923767471657e-06, + "loss": 0.4639, + "step": 3320 + }, + { + "epoch": 0.58, + "learning_rate": 8.059418154843908e-06, + "loss": 0.4774, + "step": 3321 + }, + { + "epoch": 0.58, + "learning_rate": 8.053913153391622e-06, + "loss": 0.4606, + "step": 3322 + }, + { + "epoch": 0.58, + "learning_rate": 8.048408764848565e-06, + "loss": 0.4735, + "step": 3323 + }, + { + "epoch": 0.58, + "learning_rate": 8.042904990948319e-06, + "loss": 0.4653, + "step": 3324 + }, + { + "epoch": 0.58, + "learning_rate": 8.037401833424265e-06, + "loss": 0.4701, + "step": 3325 + }, + { + "epoch": 0.58, + "learning_rate": 8.03189929400959e-06, + "loss": 0.4588, + "step": 3326 + }, + { + "epoch": 0.58, + "learning_rate": 8.026397374437294e-06, + "loss": 0.471, + "step": 3327 + }, + { + "epoch": 0.58, + "learning_rate": 8.020896076440169e-06, + "loss": 0.4658, + "step": 3328 + }, + { + "epoch": 0.58, + "learning_rate": 8.015395401750816e-06, + "loss": 0.468, + "step": 3329 + }, + { + "epoch": 0.58, + "learning_rate": 8.009895352101656e-06, + "loss": 0.4561, + "step": 3330 + }, + { + "epoch": 0.58, + "learning_rate": 8.004395929224881e-06, + "loss": 0.4632, + "step": 3331 + }, + { + "epoch": 0.58, + "learning_rate": 7.998897134852518e-06, + "loss": 0.462, + "step": 3332 + }, + { + "epoch": 0.58, + "learning_rate": 7.993398970716375e-06, + "loss": 0.4553, + "step": 3333 + }, + { + "epoch": 0.58, + "learning_rate": 7.987901438548069e-06, + "loss": 0.4636, + "step": 3334 + }, + { + "epoch": 0.58, + "learning_rate": 7.982404540079018e-06, + "loss": 0.4721, + "step": 3335 + }, + { + "epoch": 0.58, + "learning_rate": 7.976908277040438e-06, + "loss": 0.4641, + "step": 3336 + }, + { + "epoch": 0.58, + "learning_rate": 7.97141265116335e-06, + "loss": 0.4606, + "step": 3337 + }, + { + "epoch": 0.58, + "learning_rate": 7.965917664178564e-06, + "loss": 0.4607, + "step": 3338 + }, + { + "epoch": 0.58, + "learning_rate": 7.960423317816708e-06, + "loss": 0.4664, + "step": 3339 + }, + { + "epoch": 0.58, + "learning_rate": 7.95492961380818e-06, + "loss": 0.4586, + "step": 3340 + }, + { + "epoch": 0.58, + "learning_rate": 7.949436553883203e-06, + "loss": 0.4743, + "step": 3341 + }, + { + "epoch": 0.58, + "learning_rate": 7.943944139771784e-06, + "loss": 0.4768, + "step": 3342 + }, + { + "epoch": 0.58, + "learning_rate": 7.938452373203722e-06, + "loss": 0.4768, + "step": 3343 + }, + { + "epoch": 0.58, + "learning_rate": 7.932961255908628e-06, + "loss": 0.4606, + "step": 3344 + }, + { + "epoch": 0.58, + "learning_rate": 7.92747078961589e-06, + "loss": 0.4795, + "step": 3345 + }, + { + "epoch": 0.58, + "learning_rate": 7.921980976054707e-06, + "loss": 0.4605, + "step": 3346 + }, + { + "epoch": 0.58, + "learning_rate": 7.916491816954055e-06, + "loss": 0.4802, + "step": 3347 + }, + { + "epoch": 0.58, + "learning_rate": 7.91100331404273e-06, + "loss": 0.4603, + "step": 3348 + }, + { + "epoch": 0.58, + "learning_rate": 7.905515469049287e-06, + "loss": 0.4667, + "step": 3349 + }, + { + "epoch": 0.58, + "learning_rate": 7.900028283702106e-06, + "loss": 0.4751, + "step": 3350 + }, + { + "epoch": 0.58, + "learning_rate": 7.894541759729344e-06, + "loss": 0.4701, + "step": 3351 + }, + { + "epoch": 0.58, + "learning_rate": 7.889055898858943e-06, + "loss": 0.4495, + "step": 3352 + }, + { + "epoch": 0.58, + "learning_rate": 7.883570702818654e-06, + "loss": 0.4592, + "step": 3353 + }, + { + "epoch": 0.58, + "learning_rate": 7.878086173336004e-06, + "loss": 0.4667, + "step": 3354 + }, + { + "epoch": 0.58, + "learning_rate": 7.872602312138316e-06, + "loss": 0.4748, + "step": 3355 + }, + { + "epoch": 0.58, + "learning_rate": 7.867119120952698e-06, + "loss": 0.464, + "step": 3356 + }, + { + "epoch": 0.58, + "learning_rate": 7.861636601506056e-06, + "loss": 0.4657, + "step": 3357 + }, + { + "epoch": 0.58, + "learning_rate": 7.856154755525078e-06, + "loss": 0.461, + "step": 3358 + }, + { + "epoch": 0.58, + "learning_rate": 7.85067358473624e-06, + "loss": 0.4753, + "step": 3359 + }, + { + "epoch": 0.58, + "learning_rate": 7.845193090865807e-06, + "loss": 0.468, + "step": 3360 + }, + { + "epoch": 0.58, + "learning_rate": 7.839713275639826e-06, + "loss": 0.4724, + "step": 3361 + }, + { + "epoch": 0.58, + "learning_rate": 7.83423414078414e-06, + "loss": 0.4642, + "step": 3362 + }, + { + "epoch": 0.58, + "learning_rate": 7.828755688024369e-06, + "loss": 0.4805, + "step": 3363 + }, + { + "epoch": 0.58, + "learning_rate": 7.823277919085919e-06, + "loss": 0.465, + "step": 3364 + }, + { + "epoch": 0.58, + "learning_rate": 7.817800835693993e-06, + "loss": 0.475, + "step": 3365 + }, + { + "epoch": 0.58, + "learning_rate": 7.812324439573554e-06, + "loss": 0.4709, + "step": 3366 + }, + { + "epoch": 0.58, + "learning_rate": 7.806848732449372e-06, + "loss": 0.4754, + "step": 3367 + }, + { + "epoch": 0.58, + "learning_rate": 7.801373716045987e-06, + "loss": 0.4573, + "step": 3368 + }, + { + "epoch": 0.58, + "learning_rate": 7.795899392087728e-06, + "loss": 0.4714, + "step": 3369 + }, + { + "epoch": 0.58, + "learning_rate": 7.790425762298698e-06, + "loss": 0.4527, + "step": 3370 + }, + { + "epoch": 0.58, + "learning_rate": 7.784952828402789e-06, + "loss": 0.464, + "step": 3371 + }, + { + "epoch": 0.58, + "learning_rate": 7.779480592123673e-06, + "loss": 0.4656, + "step": 3372 + }, + { + "epoch": 0.58, + "learning_rate": 7.774009055184795e-06, + "loss": 0.4687, + "step": 3373 + }, + { + "epoch": 0.58, + "learning_rate": 7.768538219309392e-06, + "loss": 0.4685, + "step": 3374 + }, + { + "epoch": 0.58, + "learning_rate": 7.763068086220467e-06, + "loss": 0.4703, + "step": 3375 + }, + { + "epoch": 0.58, + "learning_rate": 7.757598657640813e-06, + "loss": 0.4587, + "step": 3376 + }, + { + "epoch": 0.59, + "learning_rate": 7.752129935292993e-06, + "loss": 0.4714, + "step": 3377 + }, + { + "epoch": 0.59, + "learning_rate": 7.746661920899351e-06, + "loss": 0.4716, + "step": 3378 + }, + { + "epoch": 0.59, + "learning_rate": 7.74119461618201e-06, + "loss": 0.4672, + "step": 3379 + }, + { + "epoch": 0.59, + "learning_rate": 7.735728022862865e-06, + "loss": 0.4719, + "step": 3380 + }, + { + "epoch": 0.59, + "learning_rate": 7.730262142663591e-06, + "loss": 0.4781, + "step": 3381 + }, + { + "epoch": 0.59, + "learning_rate": 7.724796977305632e-06, + "loss": 0.4406, + "step": 3382 + }, + { + "epoch": 0.59, + "learning_rate": 7.71933252851022e-06, + "loss": 0.4652, + "step": 3383 + }, + { + "epoch": 0.59, + "learning_rate": 7.713868797998342e-06, + "loss": 0.4732, + "step": 3384 + }, + { + "epoch": 0.59, + "learning_rate": 7.708405787490777e-06, + "loss": 0.4693, + "step": 3385 + }, + { + "epoch": 0.59, + "learning_rate": 7.702943498708069e-06, + "loss": 0.4669, + "step": 3386 + }, + { + "epoch": 0.59, + "learning_rate": 7.697481933370535e-06, + "loss": 0.4611, + "step": 3387 + }, + { + "epoch": 0.59, + "learning_rate": 7.692021093198264e-06, + "loss": 0.4602, + "step": 3388 + }, + { + "epoch": 0.59, + "learning_rate": 7.686560979911115e-06, + "loss": 0.4677, + "step": 3389 + }, + { + "epoch": 0.59, + "learning_rate": 7.681101595228727e-06, + "loss": 0.463, + "step": 3390 + }, + { + "epoch": 0.59, + "learning_rate": 7.67564294087049e-06, + "loss": 0.4554, + "step": 3391 + }, + { + "epoch": 0.59, + "learning_rate": 7.670185018555592e-06, + "loss": 0.4666, + "step": 3392 + }, + { + "epoch": 0.59, + "learning_rate": 7.664727830002967e-06, + "loss": 0.4804, + "step": 3393 + }, + { + "epoch": 0.59, + "learning_rate": 7.659271376931327e-06, + "loss": 0.4709, + "step": 3394 + }, + { + "epoch": 0.59, + "learning_rate": 7.653815661059156e-06, + "loss": 0.4663, + "step": 3395 + }, + { + "epoch": 0.59, + "learning_rate": 7.648360684104695e-06, + "loss": 0.4557, + "step": 3396 + }, + { + "epoch": 0.59, + "learning_rate": 7.642906447785962e-06, + "loss": 0.4693, + "step": 3397 + }, + { + "epoch": 0.59, + "learning_rate": 7.637452953820737e-06, + "loss": 0.4672, + "step": 3398 + }, + { + "epoch": 0.59, + "learning_rate": 7.632000203926564e-06, + "loss": 0.4754, + "step": 3399 + }, + { + "epoch": 0.59, + "learning_rate": 7.626548199820768e-06, + "loss": 0.4541, + "step": 3400 + }, + { + "epoch": 0.59, + "learning_rate": 7.621096943220416e-06, + "loss": 0.483, + "step": 3401 + }, + { + "epoch": 0.59, + "learning_rate": 7.6156464358423586e-06, + "loss": 0.4626, + "step": 3402 + }, + { + "epoch": 0.59, + "learning_rate": 7.610196679403195e-06, + "loss": 0.4719, + "step": 3403 + }, + { + "epoch": 0.59, + "learning_rate": 7.6047476756193035e-06, + "loss": 0.4599, + "step": 3404 + }, + { + "epoch": 0.59, + "learning_rate": 7.599299426206812e-06, + "loss": 0.4793, + "step": 3405 + }, + { + "epoch": 0.59, + "learning_rate": 7.5938519328816156e-06, + "loss": 0.4594, + "step": 3406 + }, + { + "epoch": 0.59, + "learning_rate": 7.588405197359381e-06, + "loss": 0.4724, + "step": 3407 + }, + { + "epoch": 0.59, + "learning_rate": 7.582959221355514e-06, + "loss": 0.4478, + "step": 3408 + }, + { + "epoch": 0.59, + "learning_rate": 7.577514006585209e-06, + "loss": 0.4568, + "step": 3409 + }, + { + "epoch": 0.59, + "learning_rate": 7.572069554763391e-06, + "loss": 0.4599, + "step": 3410 + }, + { + "epoch": 0.59, + "learning_rate": 7.5666258676047735e-06, + "loss": 0.468, + "step": 3411 + }, + { + "epoch": 0.59, + "learning_rate": 7.561182946823805e-06, + "loss": 0.4635, + "step": 3412 + }, + { + "epoch": 0.59, + "learning_rate": 7.5557407941347095e-06, + "loss": 0.4705, + "step": 3413 + }, + { + "epoch": 0.59, + "learning_rate": 7.550299411251461e-06, + "loss": 0.469, + "step": 3414 + }, + { + "epoch": 0.59, + "learning_rate": 7.54485879988779e-06, + "loss": 0.4667, + "step": 3415 + }, + { + "epoch": 0.59, + "learning_rate": 7.539418961757195e-06, + "loss": 0.464, + "step": 3416 + }, + { + "epoch": 0.59, + "learning_rate": 7.533979898572909e-06, + "loss": 0.4575, + "step": 3417 + }, + { + "epoch": 0.59, + "learning_rate": 7.528541612047953e-06, + "loss": 0.4699, + "step": 3418 + }, + { + "epoch": 0.59, + "learning_rate": 7.523104103895066e-06, + "loss": 0.4639, + "step": 3419 + }, + { + "epoch": 0.59, + "learning_rate": 7.517667375826772e-06, + "loss": 0.4799, + "step": 3420 + }, + { + "epoch": 0.59, + "learning_rate": 7.512231429555339e-06, + "loss": 0.4692, + "step": 3421 + }, + { + "epoch": 0.59, + "learning_rate": 7.506796266792782e-06, + "loss": 0.4812, + "step": 3422 + }, + { + "epoch": 0.59, + "learning_rate": 7.501361889250882e-06, + "loss": 0.4523, + "step": 3423 + }, + { + "epoch": 0.59, + "learning_rate": 7.4959282986411595e-06, + "loss": 0.4693, + "step": 3424 + }, + { + "epoch": 0.59, + "learning_rate": 7.490495496674899e-06, + "loss": 0.4711, + "step": 3425 + }, + { + "epoch": 0.59, + "learning_rate": 7.485063485063124e-06, + "loss": 0.4732, + "step": 3426 + }, + { + "epoch": 0.59, + "learning_rate": 7.479632265516626e-06, + "loss": 0.4637, + "step": 3427 + }, + { + "epoch": 0.59, + "learning_rate": 7.474201839745932e-06, + "loss": 0.4777, + "step": 3428 + }, + { + "epoch": 0.59, + "learning_rate": 7.468772209461324e-06, + "loss": 0.4588, + "step": 3429 + }, + { + "epoch": 0.59, + "learning_rate": 7.463343376372837e-06, + "loss": 0.4633, + "step": 3430 + }, + { + "epoch": 0.59, + "learning_rate": 7.457915342190247e-06, + "loss": 0.4508, + "step": 3431 + }, + { + "epoch": 0.59, + "learning_rate": 7.452488108623089e-06, + "loss": 0.4695, + "step": 3432 + }, + { + "epoch": 0.59, + "learning_rate": 7.447061677380635e-06, + "loss": 0.4594, + "step": 3433 + }, + { + "epoch": 0.59, + "learning_rate": 7.441636050171909e-06, + "loss": 0.4679, + "step": 3434 + }, + { + "epoch": 0.6, + "learning_rate": 7.436211228705687e-06, + "loss": 0.4748, + "step": 3435 + }, + { + "epoch": 0.6, + "learning_rate": 7.430787214690485e-06, + "loss": 0.4642, + "step": 3436 + }, + { + "epoch": 0.6, + "learning_rate": 7.425364009834563e-06, + "loss": 0.4695, + "step": 3437 + }, + { + "epoch": 0.6, + "learning_rate": 7.4199416158459316e-06, + "loss": 0.4621, + "step": 3438 + }, + { + "epoch": 0.6, + "learning_rate": 7.414520034432345e-06, + "loss": 0.465, + "step": 3439 + }, + { + "epoch": 0.6, + "learning_rate": 7.409099267301296e-06, + "loss": 0.4635, + "step": 3440 + }, + { + "epoch": 0.6, + "learning_rate": 7.403679316160024e-06, + "loss": 0.4551, + "step": 3441 + }, + { + "epoch": 0.6, + "learning_rate": 7.398260182715524e-06, + "loss": 0.4667, + "step": 3442 + }, + { + "epoch": 0.6, + "learning_rate": 7.392841868674506e-06, + "loss": 0.4818, + "step": 3443 + }, + { + "epoch": 0.6, + "learning_rate": 7.387424375743451e-06, + "loss": 0.4771, + "step": 3444 + }, + { + "epoch": 0.6, + "learning_rate": 7.3820077056285595e-06, + "loss": 0.4621, + "step": 3445 + }, + { + "epoch": 0.6, + "learning_rate": 7.3765918600357875e-06, + "loss": 0.4668, + "step": 3446 + }, + { + "epoch": 0.6, + "learning_rate": 7.371176840670822e-06, + "loss": 0.4691, + "step": 3447 + }, + { + "epoch": 0.6, + "learning_rate": 7.365762649239092e-06, + "loss": 0.4661, + "step": 3448 + }, + { + "epoch": 0.6, + "learning_rate": 7.360349287445774e-06, + "loss": 0.464, + "step": 3449 + }, + { + "epoch": 0.6, + "learning_rate": 7.354936756995766e-06, + "loss": 0.4699, + "step": 3450 + }, + { + "epoch": 0.6, + "learning_rate": 7.349525059593725e-06, + "loss": 0.4499, + "step": 3451 + }, + { + "epoch": 0.6, + "learning_rate": 7.344114196944023e-06, + "loss": 0.4578, + "step": 3452 + }, + { + "epoch": 0.6, + "learning_rate": 7.338704170750794e-06, + "loss": 0.4681, + "step": 3453 + }, + { + "epoch": 0.6, + "learning_rate": 7.333294982717887e-06, + "loss": 0.4698, + "step": 3454 + }, + { + "epoch": 0.6, + "learning_rate": 7.327886634548899e-06, + "loss": 0.4616, + "step": 3455 + }, + { + "epoch": 0.6, + "learning_rate": 7.32247912794716e-06, + "loss": 0.4617, + "step": 3456 + }, + { + "epoch": 0.6, + "learning_rate": 7.3170724646157284e-06, + "loss": 0.4624, + "step": 3457 + }, + { + "epoch": 0.6, + "learning_rate": 7.311666646257412e-06, + "loss": 0.4685, + "step": 3458 + }, + { + "epoch": 0.6, + "learning_rate": 7.3062616745747325e-06, + "loss": 0.4553, + "step": 3459 + }, + { + "epoch": 0.6, + "learning_rate": 7.300857551269969e-06, + "loss": 0.4675, + "step": 3460 + }, + { + "epoch": 0.6, + "learning_rate": 7.295454278045104e-06, + "loss": 0.4565, + "step": 3461 + }, + { + "epoch": 0.6, + "learning_rate": 7.290051856601879e-06, + "loss": 0.4733, + "step": 3462 + }, + { + "epoch": 0.6, + "learning_rate": 7.28465028864176e-06, + "loss": 0.45, + "step": 3463 + }, + { + "epoch": 0.6, + "learning_rate": 7.279249575865929e-06, + "loss": 0.4632, + "step": 3464 + }, + { + "epoch": 0.6, + "learning_rate": 7.27384971997532e-06, + "loss": 0.4666, + "step": 3465 + }, + { + "epoch": 0.6, + "learning_rate": 7.268450722670582e-06, + "loss": 0.4735, + "step": 3466 + }, + { + "epoch": 0.6, + "learning_rate": 7.263052585652104e-06, + "loss": 0.4551, + "step": 3467 + }, + { + "epoch": 0.6, + "learning_rate": 7.257655310619996e-06, + "loss": 0.473, + "step": 3468 + }, + { + "epoch": 0.6, + "learning_rate": 7.252258899274096e-06, + "loss": 0.4582, + "step": 3469 + }, + { + "epoch": 0.6, + "learning_rate": 7.246863353313983e-06, + "loss": 0.4638, + "step": 3470 + }, + { + "epoch": 0.6, + "learning_rate": 7.241468674438947e-06, + "loss": 0.4695, + "step": 3471 + }, + { + "epoch": 0.6, + "learning_rate": 7.236074864348017e-06, + "loss": 0.4695, + "step": 3472 + }, + { + "epoch": 0.6, + "learning_rate": 7.230681924739939e-06, + "loss": 0.4592, + "step": 3473 + }, + { + "epoch": 0.6, + "learning_rate": 7.225289857313194e-06, + "loss": 0.4716, + "step": 3474 + }, + { + "epoch": 0.6, + "learning_rate": 7.219898663765979e-06, + "loss": 0.477, + "step": 3475 + }, + { + "epoch": 0.6, + "learning_rate": 7.214508345796218e-06, + "loss": 0.464, + "step": 3476 + }, + { + "epoch": 0.6, + "learning_rate": 7.209118905101575e-06, + "loss": 0.4659, + "step": 3477 + }, + { + "epoch": 0.6, + "learning_rate": 7.203730343379408e-06, + "loss": 0.4538, + "step": 3478 + }, + { + "epoch": 0.6, + "learning_rate": 7.198342662326827e-06, + "loss": 0.4664, + "step": 3479 + }, + { + "epoch": 0.6, + "learning_rate": 7.192955863640645e-06, + "loss": 0.4818, + "step": 3480 + }, + { + "epoch": 0.6, + "learning_rate": 7.187569949017408e-06, + "loss": 0.4638, + "step": 3481 + }, + { + "epoch": 0.6, + "learning_rate": 7.1821849201533765e-06, + "loss": 0.4564, + "step": 3482 + }, + { + "epoch": 0.6, + "learning_rate": 7.176800778744537e-06, + "loss": 0.4707, + "step": 3483 + }, + { + "epoch": 0.6, + "learning_rate": 7.1714175264865975e-06, + "loss": 0.4766, + "step": 3484 + }, + { + "epoch": 0.6, + "learning_rate": 7.166035165074976e-06, + "loss": 0.4582, + "step": 3485 + }, + { + "epoch": 0.6, + "learning_rate": 7.16065369620483e-06, + "loss": 0.4723, + "step": 3486 + }, + { + "epoch": 0.6, + "learning_rate": 7.155273121571009e-06, + "loss": 0.4649, + "step": 3487 + }, + { + "epoch": 0.6, + "learning_rate": 7.149893442868105e-06, + "loss": 0.4558, + "step": 3488 + }, + { + "epoch": 0.6, + "learning_rate": 7.1445146617904135e-06, + "loss": 0.4584, + "step": 3489 + }, + { + "epoch": 0.6, + "learning_rate": 7.139136780031953e-06, + "loss": 0.4733, + "step": 3490 + }, + { + "epoch": 0.6, + "learning_rate": 7.133759799286458e-06, + "loss": 0.4657, + "step": 3491 + }, + { + "epoch": 0.6, + "learning_rate": 7.128383721247376e-06, + "loss": 0.4882, + "step": 3492 + }, + { + "epoch": 0.61, + "learning_rate": 7.123008547607877e-06, + "loss": 0.4711, + "step": 3493 + }, + { + "epoch": 0.61, + "learning_rate": 7.1176342800608365e-06, + "loss": 0.4656, + "step": 3494 + }, + { + "epoch": 0.61, + "learning_rate": 7.112260920298859e-06, + "loss": 0.4632, + "step": 3495 + }, + { + "epoch": 0.61, + "learning_rate": 7.1068884700142416e-06, + "loss": 0.4702, + "step": 3496 + }, + { + "epoch": 0.61, + "learning_rate": 7.101516930899019e-06, + "loss": 0.4586, + "step": 3497 + }, + { + "epoch": 0.61, + "learning_rate": 7.096146304644924e-06, + "loss": 0.475, + "step": 3498 + }, + { + "epoch": 0.61, + "learning_rate": 7.090776592943402e-06, + "loss": 0.456, + "step": 3499 + }, + { + "epoch": 0.61, + "learning_rate": 7.08540779748562e-06, + "loss": 0.4742, + "step": 3500 + }, + { + "epoch": 0.61, + "learning_rate": 7.080039919962445e-06, + "loss": 0.4584, + "step": 3501 + }, + { + "epoch": 0.61, + "learning_rate": 7.074672962064464e-06, + "loss": 0.4636, + "step": 3502 + }, + { + "epoch": 0.61, + "learning_rate": 7.069306925481965e-06, + "loss": 0.4546, + "step": 3503 + }, + { + "epoch": 0.61, + "learning_rate": 7.063941811904956e-06, + "loss": 0.4525, + "step": 3504 + }, + { + "epoch": 0.61, + "learning_rate": 7.058577623023153e-06, + "loss": 0.4522, + "step": 3505 + }, + { + "epoch": 0.61, + "learning_rate": 7.0532143605259686e-06, + "loss": 0.4889, + "step": 3506 + }, + { + "epoch": 0.61, + "learning_rate": 7.047852026102541e-06, + "loss": 0.4739, + "step": 3507 + }, + { + "epoch": 0.61, + "learning_rate": 7.042490621441701e-06, + "loss": 0.4732, + "step": 3508 + }, + { + "epoch": 0.61, + "learning_rate": 7.0371301482319985e-06, + "loss": 0.4569, + "step": 3509 + }, + { + "epoch": 0.61, + "learning_rate": 7.0317706081616785e-06, + "loss": 0.4685, + "step": 3510 + }, + { + "epoch": 0.61, + "learning_rate": 7.026412002918701e-06, + "loss": 0.4413, + "step": 3511 + }, + { + "epoch": 0.61, + "learning_rate": 7.021054334190736e-06, + "loss": 0.4756, + "step": 3512 + }, + { + "epoch": 0.61, + "learning_rate": 7.015697603665141e-06, + "loss": 0.456, + "step": 3513 + }, + { + "epoch": 0.61, + "learning_rate": 7.010341813028996e-06, + "loss": 0.4734, + "step": 3514 + }, + { + "epoch": 0.61, + "learning_rate": 7.004986963969072e-06, + "loss": 0.457, + "step": 3515 + }, + { + "epoch": 0.61, + "learning_rate": 6.999633058171853e-06, + "loss": 0.4699, + "step": 3516 + }, + { + "epoch": 0.61, + "learning_rate": 6.994280097323519e-06, + "loss": 0.469, + "step": 3517 + }, + { + "epoch": 0.61, + "learning_rate": 6.988928083109954e-06, + "loss": 0.4659, + "step": 3518 + }, + { + "epoch": 0.61, + "learning_rate": 6.9835770172167535e-06, + "loss": 0.4597, + "step": 3519 + }, + { + "epoch": 0.61, + "learning_rate": 6.978226901329195e-06, + "loss": 0.4713, + "step": 3520 + }, + { + "epoch": 0.61, + "learning_rate": 6.9728777371322775e-06, + "loss": 0.4692, + "step": 3521 + }, + { + "epoch": 0.61, + "learning_rate": 6.967529526310681e-06, + "loss": 0.4704, + "step": 3522 + }, + { + "epoch": 0.61, + "learning_rate": 6.962182270548803e-06, + "loss": 0.462, + "step": 3523 + }, + { + "epoch": 0.61, + "learning_rate": 6.9568359715307265e-06, + "loss": 0.4608, + "step": 3524 + }, + { + "epoch": 0.61, + "learning_rate": 6.951490630940241e-06, + "loss": 0.4685, + "step": 3525 + }, + { + "epoch": 0.61, + "learning_rate": 6.9461462504608335e-06, + "loss": 0.4449, + "step": 3526 + }, + { + "epoch": 0.61, + "learning_rate": 6.94080283177568e-06, + "loss": 0.4617, + "step": 3527 + }, + { + "epoch": 0.61, + "learning_rate": 6.935460376567673e-06, + "loss": 0.484, + "step": 3528 + }, + { + "epoch": 0.61, + "learning_rate": 6.930118886519374e-06, + "loss": 0.4661, + "step": 3529 + }, + { + "epoch": 0.61, + "learning_rate": 6.924778363313071e-06, + "loss": 0.4587, + "step": 3530 + }, + { + "epoch": 0.61, + "learning_rate": 6.919438808630716e-06, + "loss": 0.4628, + "step": 3531 + }, + { + "epoch": 0.61, + "learning_rate": 6.914100224153983e-06, + "loss": 0.471, + "step": 3532 + }, + { + "epoch": 0.61, + "learning_rate": 6.90876261156423e-06, + "loss": 0.4574, + "step": 3533 + }, + { + "epoch": 0.61, + "learning_rate": 6.903425972542501e-06, + "loss": 0.465, + "step": 3534 + }, + { + "epoch": 0.61, + "learning_rate": 6.898090308769548e-06, + "loss": 0.4648, + "step": 3535 + }, + { + "epoch": 0.61, + "learning_rate": 6.892755621925804e-06, + "loss": 0.4702, + "step": 3536 + }, + { + "epoch": 0.61, + "learning_rate": 6.887421913691402e-06, + "loss": 0.4554, + "step": 3537 + }, + { + "epoch": 0.61, + "learning_rate": 6.882089185746158e-06, + "loss": 0.4733, + "step": 3538 + }, + { + "epoch": 0.61, + "learning_rate": 6.876757439769592e-06, + "loss": 0.4595, + "step": 3539 + }, + { + "epoch": 0.61, + "learning_rate": 6.871426677440907e-06, + "loss": 0.4607, + "step": 3540 + }, + { + "epoch": 0.61, + "learning_rate": 6.866096900438992e-06, + "loss": 0.4549, + "step": 3541 + }, + { + "epoch": 0.61, + "learning_rate": 6.860768110442438e-06, + "loss": 0.4729, + "step": 3542 + }, + { + "epoch": 0.61, + "learning_rate": 6.855440309129509e-06, + "loss": 0.4623, + "step": 3543 + }, + { + "epoch": 0.61, + "learning_rate": 6.850113498178173e-06, + "loss": 0.4746, + "step": 3544 + }, + { + "epoch": 0.61, + "learning_rate": 6.844787679266076e-06, + "loss": 0.4501, + "step": 3545 + }, + { + "epoch": 0.61, + "learning_rate": 6.839462854070554e-06, + "loss": 0.4743, + "step": 3546 + }, + { + "epoch": 0.61, + "learning_rate": 6.834139024268638e-06, + "loss": 0.4688, + "step": 3547 + }, + { + "epoch": 0.61, + "learning_rate": 6.828816191537032e-06, + "loss": 0.4658, + "step": 3548 + }, + { + "epoch": 0.61, + "learning_rate": 6.8234943575521365e-06, + "loss": 0.4566, + "step": 3549 + }, + { + "epoch": 0.61, + "learning_rate": 6.818173523990029e-06, + "loss": 0.4841, + "step": 3550 + }, + { + "epoch": 0.62, + "learning_rate": 6.812853692526482e-06, + "loss": 0.4527, + "step": 3551 + }, + { + "epoch": 0.62, + "learning_rate": 6.807534864836942e-06, + "loss": 0.4757, + "step": 3552 + }, + { + "epoch": 0.62, + "learning_rate": 6.802217042596544e-06, + "loss": 0.4675, + "step": 3553 + }, + { + "epoch": 0.62, + "learning_rate": 6.7969002274801145e-06, + "loss": 0.4678, + "step": 3554 + }, + { + "epoch": 0.62, + "learning_rate": 6.791584421162143e-06, + "loss": 0.4627, + "step": 3555 + }, + { + "epoch": 0.62, + "learning_rate": 6.7862696253168225e-06, + "loss": 0.4719, + "step": 3556 + }, + { + "epoch": 0.62, + "learning_rate": 6.780955841618013e-06, + "loss": 0.4641, + "step": 3557 + }, + { + "epoch": 0.62, + "learning_rate": 6.775643071739267e-06, + "loss": 0.4674, + "step": 3558 + }, + { + "epoch": 0.62, + "learning_rate": 6.770331317353804e-06, + "loss": 0.4595, + "step": 3559 + }, + { + "epoch": 0.62, + "learning_rate": 6.765020580134538e-06, + "loss": 0.4496, + "step": 3560 + }, + { + "epoch": 0.62, + "learning_rate": 6.759710861754054e-06, + "loss": 0.4621, + "step": 3561 + }, + { + "epoch": 0.62, + "learning_rate": 6.7544021638846145e-06, + "loss": 0.4679, + "step": 3562 + }, + { + "epoch": 0.62, + "learning_rate": 6.749094488198173e-06, + "loss": 0.4621, + "step": 3563 + }, + { + "epoch": 0.62, + "learning_rate": 6.743787836366343e-06, + "loss": 0.4696, + "step": 3564 + }, + { + "epoch": 0.62, + "learning_rate": 6.738482210060433e-06, + "loss": 0.4578, + "step": 3565 + }, + { + "epoch": 0.62, + "learning_rate": 6.733177610951414e-06, + "loss": 0.4576, + "step": 3566 + }, + { + "epoch": 0.62, + "learning_rate": 6.727874040709943e-06, + "loss": 0.459, + "step": 3567 + }, + { + "epoch": 0.62, + "learning_rate": 6.7225715010063516e-06, + "loss": 0.47, + "step": 3568 + }, + { + "epoch": 0.62, + "learning_rate": 6.717269993510642e-06, + "loss": 0.4695, + "step": 3569 + }, + { + "epoch": 0.62, + "learning_rate": 6.711969519892499e-06, + "loss": 0.4753, + "step": 3570 + }, + { + "epoch": 0.62, + "learning_rate": 6.706670081821267e-06, + "loss": 0.4626, + "step": 3571 + }, + { + "epoch": 0.62, + "learning_rate": 6.70137168096599e-06, + "loss": 0.4655, + "step": 3572 + }, + { + "epoch": 0.62, + "learning_rate": 6.696074318995355e-06, + "loss": 0.457, + "step": 3573 + }, + { + "epoch": 0.62, + "learning_rate": 6.690777997577745e-06, + "loss": 0.4677, + "step": 3574 + }, + { + "epoch": 0.62, + "learning_rate": 6.685482718381209e-06, + "loss": 0.4578, + "step": 3575 + }, + { + "epoch": 0.62, + "learning_rate": 6.680188483073458e-06, + "loss": 0.4644, + "step": 3576 + }, + { + "epoch": 0.62, + "learning_rate": 6.6748952933218895e-06, + "loss": 0.4594, + "step": 3577 + }, + { + "epoch": 0.62, + "learning_rate": 6.6696031507935575e-06, + "loss": 0.4721, + "step": 3578 + }, + { + "epoch": 0.62, + "learning_rate": 6.664312057155199e-06, + "loss": 0.4788, + "step": 3579 + }, + { + "epoch": 0.62, + "learning_rate": 6.659022014073209e-06, + "loss": 0.4659, + "step": 3580 + }, + { + "epoch": 0.62, + "learning_rate": 6.653733023213658e-06, + "loss": 0.4614, + "step": 3581 + }, + { + "epoch": 0.62, + "learning_rate": 6.64844508624229e-06, + "loss": 0.4639, + "step": 3582 + }, + { + "epoch": 0.62, + "learning_rate": 6.643158204824506e-06, + "loss": 0.4665, + "step": 3583 + }, + { + "epoch": 0.62, + "learning_rate": 6.637872380625383e-06, + "loss": 0.4654, + "step": 3584 + }, + { + "epoch": 0.62, + "learning_rate": 6.632587615309658e-06, + "loss": 0.4663, + "step": 3585 + }, + { + "epoch": 0.62, + "learning_rate": 6.627303910541743e-06, + "loss": 0.4588, + "step": 3586 + }, + { + "epoch": 0.62, + "learning_rate": 6.622021267985705e-06, + "loss": 0.4555, + "step": 3587 + }, + { + "epoch": 0.62, + "learning_rate": 6.616739689305287e-06, + "loss": 0.4616, + "step": 3588 + }, + { + "epoch": 0.62, + "learning_rate": 6.6114591761638995e-06, + "loss": 0.4486, + "step": 3589 + }, + { + "epoch": 0.62, + "learning_rate": 6.606179730224598e-06, + "loss": 0.4681, + "step": 3590 + }, + { + "epoch": 0.62, + "learning_rate": 6.600901353150123e-06, + "loss": 0.468, + "step": 3591 + }, + { + "epoch": 0.62, + "learning_rate": 6.595624046602867e-06, + "loss": 0.4731, + "step": 3592 + }, + { + "epoch": 0.62, + "learning_rate": 6.59034781224489e-06, + "loss": 0.453, + "step": 3593 + }, + { + "epoch": 0.62, + "learning_rate": 6.585072651737911e-06, + "loss": 0.4699, + "step": 3594 + }, + { + "epoch": 0.62, + "learning_rate": 6.579798566743314e-06, + "loss": 0.4721, + "step": 3595 + }, + { + "epoch": 0.62, + "learning_rate": 6.574525558922142e-06, + "loss": 0.4519, + "step": 3596 + }, + { + "epoch": 0.62, + "learning_rate": 6.5692536299350974e-06, + "loss": 0.4623, + "step": 3597 + }, + { + "epoch": 0.62, + "learning_rate": 6.563982781442551e-06, + "loss": 0.4699, + "step": 3598 + }, + { + "epoch": 0.62, + "learning_rate": 6.558713015104519e-06, + "loss": 0.4544, + "step": 3599 + }, + { + "epoch": 0.62, + "learning_rate": 6.553444332580692e-06, + "loss": 0.4631, + "step": 3600 + }, + { + "epoch": 0.62, + "learning_rate": 6.54817673553041e-06, + "loss": 0.4713, + "step": 3601 + }, + { + "epoch": 0.62, + "learning_rate": 6.54291022561267e-06, + "loss": 0.461, + "step": 3602 + }, + { + "epoch": 0.62, + "learning_rate": 6.537644804486136e-06, + "loss": 0.4755, + "step": 3603 + }, + { + "epoch": 0.62, + "learning_rate": 6.532380473809118e-06, + "loss": 0.452, + "step": 3604 + }, + { + "epoch": 0.62, + "learning_rate": 6.527117235239591e-06, + "loss": 0.471, + "step": 3605 + }, + { + "epoch": 0.62, + "learning_rate": 6.521855090435178e-06, + "loss": 0.4683, + "step": 3606 + }, + { + "epoch": 0.62, + "learning_rate": 6.516594041053173e-06, + "loss": 0.4794, + "step": 3607 + }, + { + "epoch": 0.63, + "learning_rate": 6.511334088750501e-06, + "loss": 0.4724, + "step": 3608 + }, + { + "epoch": 0.63, + "learning_rate": 6.50607523518376e-06, + "loss": 0.4787, + "step": 3609 + }, + { + "epoch": 0.63, + "learning_rate": 6.500817482009201e-06, + "loss": 0.4538, + "step": 3610 + }, + { + "epoch": 0.63, + "learning_rate": 6.495560830882719e-06, + "loss": 0.4778, + "step": 3611 + }, + { + "epoch": 0.63, + "learning_rate": 6.49030528345987e-06, + "loss": 0.4512, + "step": 3612 + }, + { + "epoch": 0.63, + "learning_rate": 6.4850508413958564e-06, + "loss": 0.4702, + "step": 3613 + }, + { + "epoch": 0.63, + "learning_rate": 6.479797506345539e-06, + "loss": 0.4632, + "step": 3614 + }, + { + "epoch": 0.63, + "learning_rate": 6.47454527996342e-06, + "loss": 0.4605, + "step": 3615 + }, + { + "epoch": 0.63, + "learning_rate": 6.469294163903666e-06, + "loss": 0.4625, + "step": 3616 + }, + { + "epoch": 0.63, + "learning_rate": 6.464044159820086e-06, + "loss": 0.4677, + "step": 3617 + }, + { + "epoch": 0.63, + "learning_rate": 6.458795269366136e-06, + "loss": 0.4603, + "step": 3618 + }, + { + "epoch": 0.63, + "learning_rate": 6.453547494194929e-06, + "loss": 0.4607, + "step": 3619 + }, + { + "epoch": 0.63, + "learning_rate": 6.448300835959218e-06, + "loss": 0.4366, + "step": 3620 + }, + { + "epoch": 0.63, + "learning_rate": 6.443055296311413e-06, + "loss": 0.4668, + "step": 3621 + }, + { + "epoch": 0.63, + "learning_rate": 6.4378108769035644e-06, + "loss": 0.452, + "step": 3622 + }, + { + "epoch": 0.63, + "learning_rate": 6.432567579387372e-06, + "loss": 0.4678, + "step": 3623 + }, + { + "epoch": 0.63, + "learning_rate": 6.427325405414189e-06, + "loss": 0.4571, + "step": 3624 + }, + { + "epoch": 0.63, + "learning_rate": 6.422084356635003e-06, + "loss": 0.4564, + "step": 3625 + }, + { + "epoch": 0.63, + "learning_rate": 6.41684443470046e-06, + "loss": 0.4556, + "step": 3626 + }, + { + "epoch": 0.63, + "learning_rate": 6.4116056412608355e-06, + "loss": 0.4651, + "step": 3627 + }, + { + "epoch": 0.63, + "learning_rate": 6.406367977966066e-06, + "loss": 0.4597, + "step": 3628 + }, + { + "epoch": 0.63, + "learning_rate": 6.4011314464657186e-06, + "loss": 0.4807, + "step": 3629 + }, + { + "epoch": 0.63, + "learning_rate": 6.3958960484090094e-06, + "loss": 0.4596, + "step": 3630 + }, + { + "epoch": 0.63, + "learning_rate": 6.390661785444809e-06, + "loss": 0.4634, + "step": 3631 + }, + { + "epoch": 0.63, + "learning_rate": 6.385428659221604e-06, + "loss": 0.4588, + "step": 3632 + }, + { + "epoch": 0.63, + "learning_rate": 6.38019667138755e-06, + "loss": 0.477, + "step": 3633 + }, + { + "epoch": 0.63, + "learning_rate": 6.374965823590425e-06, + "loss": 0.4646, + "step": 3634 + }, + { + "epoch": 0.63, + "learning_rate": 6.369736117477662e-06, + "loss": 0.4688, + "step": 3635 + }, + { + "epoch": 0.63, + "learning_rate": 6.364507554696322e-06, + "loss": 0.4649, + "step": 3636 + }, + { + "epoch": 0.63, + "learning_rate": 6.3592801368931134e-06, + "loss": 0.4653, + "step": 3637 + }, + { + "epoch": 0.63, + "learning_rate": 6.354053865714387e-06, + "loss": 0.4684, + "step": 3638 + }, + { + "epoch": 0.63, + "learning_rate": 6.348828742806122e-06, + "loss": 0.4755, + "step": 3639 + }, + { + "epoch": 0.63, + "learning_rate": 6.343604769813945e-06, + "loss": 0.468, + "step": 3640 + }, + { + "epoch": 0.63, + "learning_rate": 6.338381948383111e-06, + "loss": 0.4654, + "step": 3641 + }, + { + "epoch": 0.63, + "learning_rate": 6.33316028015853e-06, + "loss": 0.4646, + "step": 3642 + }, + { + "epoch": 0.63, + "learning_rate": 6.3279397667847265e-06, + "loss": 0.4656, + "step": 3643 + }, + { + "epoch": 0.63, + "learning_rate": 6.322720409905878e-06, + "loss": 0.4594, + "step": 3644 + }, + { + "epoch": 0.63, + "learning_rate": 6.317502211165794e-06, + "loss": 0.4647, + "step": 3645 + }, + { + "epoch": 0.63, + "learning_rate": 6.312285172207909e-06, + "loss": 0.4608, + "step": 3646 + }, + { + "epoch": 0.63, + "learning_rate": 6.30706929467531e-06, + "loss": 0.4775, + "step": 3647 + }, + { + "epoch": 0.63, + "learning_rate": 6.3018545802107e-06, + "loss": 0.4598, + "step": 3648 + }, + { + "epoch": 0.63, + "learning_rate": 6.2966410304564304e-06, + "loss": 0.4789, + "step": 3649 + }, + { + "epoch": 0.63, + "learning_rate": 6.291428647054474e-06, + "loss": 0.4667, + "step": 3650 + }, + { + "epoch": 0.63, + "learning_rate": 6.286217431646447e-06, + "loss": 0.4702, + "step": 3651 + }, + { + "epoch": 0.63, + "learning_rate": 6.281007385873594e-06, + "loss": 0.454, + "step": 3652 + }, + { + "epoch": 0.63, + "learning_rate": 6.275798511376785e-06, + "loss": 0.4691, + "step": 3653 + }, + { + "epoch": 0.63, + "learning_rate": 6.270590809796531e-06, + "loss": 0.4717, + "step": 3654 + }, + { + "epoch": 0.63, + "learning_rate": 6.265384282772961e-06, + "loss": 0.4571, + "step": 3655 + }, + { + "epoch": 0.63, + "learning_rate": 6.260178931945852e-06, + "loss": 0.4604, + "step": 3656 + }, + { + "epoch": 0.63, + "learning_rate": 6.25497475895459e-06, + "loss": 0.4663, + "step": 3657 + }, + { + "epoch": 0.63, + "learning_rate": 6.249771765438205e-06, + "loss": 0.4655, + "step": 3658 + }, + { + "epoch": 0.63, + "learning_rate": 6.244569953035355e-06, + "loss": 0.4662, + "step": 3659 + }, + { + "epoch": 0.63, + "learning_rate": 6.2393693233843155e-06, + "loss": 0.4591, + "step": 3660 + }, + { + "epoch": 0.63, + "learning_rate": 6.234169878123001e-06, + "loss": 0.475, + "step": 3661 + }, + { + "epoch": 0.63, + "learning_rate": 6.228971618888943e-06, + "loss": 0.466, + "step": 3662 + }, + { + "epoch": 0.63, + "learning_rate": 6.223774547319308e-06, + "loss": 0.4608, + "step": 3663 + }, + { + "epoch": 0.63, + "learning_rate": 6.218578665050883e-06, + "loss": 0.4663, + "step": 3664 + }, + { + "epoch": 0.63, + "learning_rate": 6.2133839737200795e-06, + "loss": 0.4641, + "step": 3665 + }, + { + "epoch": 0.64, + "learning_rate": 6.208190474962945e-06, + "loss": 0.4789, + "step": 3666 + }, + { + "epoch": 0.64, + "learning_rate": 6.202998170415133e-06, + "loss": 0.4632, + "step": 3667 + }, + { + "epoch": 0.64, + "learning_rate": 6.19780706171194e-06, + "loss": 0.4625, + "step": 3668 + }, + { + "epoch": 0.64, + "learning_rate": 6.19261715048827e-06, + "loss": 0.4738, + "step": 3669 + }, + { + "epoch": 0.64, + "learning_rate": 6.187428438378662e-06, + "loss": 0.4628, + "step": 3670 + }, + { + "epoch": 0.64, + "learning_rate": 6.1822409270172665e-06, + "loss": 0.4553, + "step": 3671 + }, + { + "epoch": 0.64, + "learning_rate": 6.177054618037866e-06, + "loss": 0.4724, + "step": 3672 + }, + { + "epoch": 0.64, + "learning_rate": 6.171869513073858e-06, + "loss": 0.4736, + "step": 3673 + }, + { + "epoch": 0.64, + "learning_rate": 6.166685613758259e-06, + "loss": 0.4536, + "step": 3674 + }, + { + "epoch": 0.64, + "learning_rate": 6.161502921723719e-06, + "loss": 0.4635, + "step": 3675 + }, + { + "epoch": 0.64, + "learning_rate": 6.156321438602484e-06, + "loss": 0.4506, + "step": 3676 + }, + { + "epoch": 0.64, + "learning_rate": 6.1511411660264485e-06, + "loss": 0.4714, + "step": 3677 + }, + { + "epoch": 0.64, + "learning_rate": 6.145962105627097e-06, + "loss": 0.4637, + "step": 3678 + }, + { + "epoch": 0.64, + "learning_rate": 6.140784259035553e-06, + "loss": 0.4725, + "step": 3679 + }, + { + "epoch": 0.64, + "learning_rate": 6.1356076278825516e-06, + "loss": 0.4605, + "step": 3680 + }, + { + "epoch": 0.64, + "learning_rate": 6.130432213798441e-06, + "loss": 0.4652, + "step": 3681 + }, + { + "epoch": 0.64, + "learning_rate": 6.125258018413191e-06, + "loss": 0.4574, + "step": 3682 + }, + { + "epoch": 0.64, + "learning_rate": 6.120085043356378e-06, + "loss": 0.4687, + "step": 3683 + }, + { + "epoch": 0.64, + "learning_rate": 6.114913290257219e-06, + "loss": 0.4612, + "step": 3684 + }, + { + "epoch": 0.64, + "learning_rate": 6.109742760744508e-06, + "loss": 0.4657, + "step": 3685 + }, + { + "epoch": 0.64, + "learning_rate": 6.104573456446687e-06, + "loss": 0.4554, + "step": 3686 + }, + { + "epoch": 0.64, + "learning_rate": 6.0994053789918004e-06, + "loss": 0.473, + "step": 3687 + }, + { + "epoch": 0.64, + "learning_rate": 6.094238530007501e-06, + "loss": 0.4553, + "step": 3688 + }, + { + "epoch": 0.64, + "learning_rate": 6.089072911121061e-06, + "loss": 0.4625, + "step": 3689 + }, + { + "epoch": 0.64, + "learning_rate": 6.083908523959362e-06, + "loss": 0.4584, + "step": 3690 + }, + { + "epoch": 0.64, + "learning_rate": 6.078745370148902e-06, + "loss": 0.4723, + "step": 3691 + }, + { + "epoch": 0.64, + "learning_rate": 6.073583451315782e-06, + "loss": 0.4441, + "step": 3692 + }, + { + "epoch": 0.64, + "learning_rate": 6.068422769085722e-06, + "loss": 0.4579, + "step": 3693 + }, + { + "epoch": 0.64, + "learning_rate": 6.063263325084054e-06, + "loss": 0.4471, + "step": 3694 + }, + { + "epoch": 0.64, + "learning_rate": 6.0581051209357135e-06, + "loss": 0.4524, + "step": 3695 + }, + { + "epoch": 0.64, + "learning_rate": 6.052948158265248e-06, + "loss": 0.4682, + "step": 3696 + }, + { + "epoch": 0.64, + "learning_rate": 6.047792438696813e-06, + "loss": 0.4705, + "step": 3697 + }, + { + "epoch": 0.64, + "learning_rate": 6.042637963854179e-06, + "loss": 0.4437, + "step": 3698 + }, + { + "epoch": 0.64, + "learning_rate": 6.037484735360711e-06, + "loss": 0.4598, + "step": 3699 + }, + { + "epoch": 0.64, + "learning_rate": 6.0323327548393926e-06, + "loss": 0.4514, + "step": 3700 + }, + { + "epoch": 0.64, + "learning_rate": 6.027182023912819e-06, + "loss": 0.4683, + "step": 3701 + }, + { + "epoch": 0.64, + "learning_rate": 6.0220325442031714e-06, + "loss": 0.4591, + "step": 3702 + }, + { + "epoch": 0.64, + "learning_rate": 6.016884317332261e-06, + "loss": 0.459, + "step": 3703 + }, + { + "epoch": 0.64, + "learning_rate": 6.011737344921487e-06, + "loss": 0.4688, + "step": 3704 + }, + { + "epoch": 0.64, + "learning_rate": 6.0065916285918625e-06, + "loss": 0.4723, + "step": 3705 + }, + { + "epoch": 0.64, + "learning_rate": 6.001447169964e-06, + "loss": 0.4551, + "step": 3706 + }, + { + "epoch": 0.64, + "learning_rate": 5.996303970658119e-06, + "loss": 0.472, + "step": 3707 + }, + { + "epoch": 0.64, + "learning_rate": 5.991162032294042e-06, + "loss": 0.475, + "step": 3708 + }, + { + "epoch": 0.64, + "learning_rate": 5.986021356491192e-06, + "loss": 0.4661, + "step": 3709 + }, + { + "epoch": 0.64, + "learning_rate": 5.980881944868604e-06, + "loss": 0.4583, + "step": 3710 + }, + { + "epoch": 0.64, + "learning_rate": 5.975743799044894e-06, + "loss": 0.4576, + "step": 3711 + }, + { + "epoch": 0.64, + "learning_rate": 5.970606920638304e-06, + "loss": 0.4629, + "step": 3712 + }, + { + "epoch": 0.64, + "learning_rate": 5.965471311266658e-06, + "loss": 0.4794, + "step": 3713 + }, + { + "epoch": 0.64, + "learning_rate": 5.960336972547391e-06, + "loss": 0.4629, + "step": 3714 + }, + { + "epoch": 0.64, + "learning_rate": 5.955203906097537e-06, + "loss": 0.4614, + "step": 3715 + }, + { + "epoch": 0.64, + "learning_rate": 5.9500721135337205e-06, + "loss": 0.4714, + "step": 3716 + }, + { + "epoch": 0.64, + "learning_rate": 5.944941596472176e-06, + "loss": 0.4577, + "step": 3717 + }, + { + "epoch": 0.64, + "learning_rate": 5.939812356528727e-06, + "loss": 0.4576, + "step": 3718 + }, + { + "epoch": 0.64, + "learning_rate": 5.934684395318806e-06, + "loss": 0.4625, + "step": 3719 + }, + { + "epoch": 0.64, + "learning_rate": 5.929557714457425e-06, + "loss": 0.4628, + "step": 3720 + }, + { + "epoch": 0.64, + "learning_rate": 5.924432315559213e-06, + "loss": 0.4631, + "step": 3721 + }, + { + "epoch": 0.64, + "learning_rate": 5.919308200238385e-06, + "loss": 0.4518, + "step": 3722 + }, + { + "epoch": 0.64, + "learning_rate": 5.914185370108749e-06, + "loss": 0.4709, + "step": 3723 + }, + { + "epoch": 0.65, + "learning_rate": 5.9090638267837144e-06, + "loss": 0.4643, + "step": 3724 + }, + { + "epoch": 0.65, + "learning_rate": 5.90394357187628e-06, + "loss": 0.4795, + "step": 3725 + }, + { + "epoch": 0.65, + "learning_rate": 5.898824606999047e-06, + "loss": 0.444, + "step": 3726 + }, + { + "epoch": 0.65, + "learning_rate": 5.893706933764196e-06, + "loss": 0.4698, + "step": 3727 + }, + { + "epoch": 0.65, + "learning_rate": 5.888590553783517e-06, + "loss": 0.4571, + "step": 3728 + }, + { + "epoch": 0.65, + "learning_rate": 5.883475468668387e-06, + "loss": 0.4757, + "step": 3729 + }, + { + "epoch": 0.65, + "learning_rate": 5.8783616800297675e-06, + "loss": 0.4632, + "step": 3730 + }, + { + "epoch": 0.65, + "learning_rate": 5.873249189478221e-06, + "loss": 0.4665, + "step": 3731 + }, + { + "epoch": 0.65, + "learning_rate": 5.868137998623897e-06, + "loss": 0.4642, + "step": 3732 + }, + { + "epoch": 0.65, + "learning_rate": 5.8630281090765386e-06, + "loss": 0.47, + "step": 3733 + }, + { + "epoch": 0.65, + "learning_rate": 5.857919522445475e-06, + "loss": 0.461, + "step": 3734 + }, + { + "epoch": 0.65, + "learning_rate": 5.8528122403396226e-06, + "loss": 0.4655, + "step": 3735 + }, + { + "epoch": 0.65, + "learning_rate": 5.847706264367503e-06, + "loss": 0.4718, + "step": 3736 + }, + { + "epoch": 0.65, + "learning_rate": 5.842601596137206e-06, + "loss": 0.4683, + "step": 3737 + }, + { + "epoch": 0.65, + "learning_rate": 5.8374982372564255e-06, + "loss": 0.4635, + "step": 3738 + }, + { + "epoch": 0.65, + "learning_rate": 5.832396189332423e-06, + "loss": 0.464, + "step": 3739 + }, + { + "epoch": 0.65, + "learning_rate": 5.8272954539720775e-06, + "loss": 0.4756, + "step": 3740 + }, + { + "epoch": 0.65, + "learning_rate": 5.822196032781824e-06, + "loss": 0.4625, + "step": 3741 + }, + { + "epoch": 0.65, + "learning_rate": 5.817097927367701e-06, + "loss": 0.4692, + "step": 3742 + }, + { + "epoch": 0.65, + "learning_rate": 5.812001139335329e-06, + "loss": 0.4654, + "step": 3743 + }, + { + "epoch": 0.65, + "learning_rate": 5.806905670289913e-06, + "loss": 0.4728, + "step": 3744 + }, + { + "epoch": 0.65, + "learning_rate": 5.801811521836246e-06, + "loss": 0.4626, + "step": 3745 + }, + { + "epoch": 0.65, + "learning_rate": 5.796718695578695e-06, + "loss": 0.4652, + "step": 3746 + }, + { + "epoch": 0.65, + "learning_rate": 5.7916271931212185e-06, + "loss": 0.4602, + "step": 3747 + }, + { + "epoch": 0.65, + "learning_rate": 5.786537016067362e-06, + "loss": 0.4601, + "step": 3748 + }, + { + "epoch": 0.65, + "learning_rate": 5.781448166020242e-06, + "loss": 0.465, + "step": 3749 + }, + { + "epoch": 0.65, + "learning_rate": 5.776360644582569e-06, + "loss": 0.4587, + "step": 3750 + }, + { + "epoch": 0.65, + "learning_rate": 5.771274453356628e-06, + "loss": 0.467, + "step": 3751 + }, + { + "epoch": 0.65, + "learning_rate": 5.766189593944289e-06, + "loss": 0.4571, + "step": 3752 + }, + { + "epoch": 0.65, + "learning_rate": 5.761106067946993e-06, + "loss": 0.4588, + "step": 3753 + }, + { + "epoch": 0.65, + "learning_rate": 5.756023876965773e-06, + "loss": 0.4725, + "step": 3754 + }, + { + "epoch": 0.65, + "learning_rate": 5.7509430226012365e-06, + "loss": 0.4601, + "step": 3755 + }, + { + "epoch": 0.65, + "learning_rate": 5.745863506453569e-06, + "loss": 0.4668, + "step": 3756 + }, + { + "epoch": 0.65, + "learning_rate": 5.740785330122542e-06, + "loss": 0.4774, + "step": 3757 + }, + { + "epoch": 0.65, + "learning_rate": 5.735708495207486e-06, + "loss": 0.4693, + "step": 3758 + }, + { + "epoch": 0.65, + "learning_rate": 5.730633003307338e-06, + "loss": 0.4639, + "step": 3759 + }, + { + "epoch": 0.65, + "learning_rate": 5.725558856020584e-06, + "loss": 0.4714, + "step": 3760 + }, + { + "epoch": 0.65, + "learning_rate": 5.7204860549453025e-06, + "loss": 0.4505, + "step": 3761 + }, + { + "epoch": 0.65, + "learning_rate": 5.715414601679144e-06, + "loss": 0.4769, + "step": 3762 + }, + { + "epoch": 0.65, + "learning_rate": 5.710344497819333e-06, + "loss": 0.4562, + "step": 3763 + }, + { + "epoch": 0.65, + "learning_rate": 5.705275744962676e-06, + "loss": 0.4739, + "step": 3764 + }, + { + "epoch": 0.65, + "learning_rate": 5.700208344705537e-06, + "loss": 0.4644, + "step": 3765 + }, + { + "epoch": 0.65, + "learning_rate": 5.695142298643881e-06, + "loss": 0.4559, + "step": 3766 + }, + { + "epoch": 0.65, + "learning_rate": 5.690077608373219e-06, + "loss": 0.4567, + "step": 3767 + }, + { + "epoch": 0.65, + "learning_rate": 5.685014275488649e-06, + "loss": 0.4767, + "step": 3768 + }, + { + "epoch": 0.65, + "learning_rate": 5.679952301584844e-06, + "loss": 0.4589, + "step": 3769 + }, + { + "epoch": 0.65, + "learning_rate": 5.674891688256041e-06, + "loss": 0.4607, + "step": 3770 + }, + { + "epoch": 0.65, + "learning_rate": 5.669832437096058e-06, + "loss": 0.4569, + "step": 3771 + }, + { + "epoch": 0.65, + "learning_rate": 5.664774549698269e-06, + "loss": 0.4761, + "step": 3772 + }, + { + "epoch": 0.65, + "learning_rate": 5.659718027655631e-06, + "loss": 0.4579, + "step": 3773 + }, + { + "epoch": 0.65, + "learning_rate": 5.6546628725606675e-06, + "loss": 0.4528, + "step": 3774 + }, + { + "epoch": 0.65, + "learning_rate": 5.649609086005476e-06, + "loss": 0.458, + "step": 3775 + }, + { + "epoch": 0.65, + "learning_rate": 5.644556669581709e-06, + "loss": 0.4681, + "step": 3776 + }, + { + "epoch": 0.65, + "learning_rate": 5.639505624880604e-06, + "loss": 0.4688, + "step": 3777 + }, + { + "epoch": 0.65, + "learning_rate": 5.634455953492964e-06, + "loss": 0.4686, + "step": 3778 + }, + { + "epoch": 0.65, + "learning_rate": 5.629407657009143e-06, + "loss": 0.4591, + "step": 3779 + }, + { + "epoch": 0.65, + "learning_rate": 5.624360737019081e-06, + "loss": 0.4553, + "step": 3780 + }, + { + "epoch": 0.66, + "learning_rate": 5.619315195112276e-06, + "loss": 0.4691, + "step": 3781 + }, + { + "epoch": 0.66, + "learning_rate": 5.614271032877799e-06, + "loss": 0.4607, + "step": 3782 + }, + { + "epoch": 0.66, + "learning_rate": 5.609228251904265e-06, + "loss": 0.4665, + "step": 3783 + }, + { + "epoch": 0.66, + "learning_rate": 5.6041868537798845e-06, + "loss": 0.4743, + "step": 3784 + }, + { + "epoch": 0.66, + "learning_rate": 5.59914684009242e-06, + "loss": 0.4555, + "step": 3785 + }, + { + "epoch": 0.66, + "learning_rate": 5.594108212429183e-06, + "loss": 0.4663, + "step": 3786 + }, + { + "epoch": 0.66, + "learning_rate": 5.589070972377068e-06, + "loss": 0.458, + "step": 3787 + }, + { + "epoch": 0.66, + "learning_rate": 5.584035121522526e-06, + "loss": 0.4612, + "step": 3788 + }, + { + "epoch": 0.66, + "learning_rate": 5.579000661451574e-06, + "loss": 0.4473, + "step": 3789 + }, + { + "epoch": 0.66, + "learning_rate": 5.573967593749778e-06, + "loss": 0.456, + "step": 3790 + }, + { + "epoch": 0.66, + "learning_rate": 5.568935920002276e-06, + "loss": 0.4696, + "step": 3791 + }, + { + "epoch": 0.66, + "learning_rate": 5.563905641793776e-06, + "loss": 0.4693, + "step": 3792 + }, + { + "epoch": 0.66, + "learning_rate": 5.558876760708527e-06, + "loss": 0.4759, + "step": 3793 + }, + { + "epoch": 0.66, + "learning_rate": 5.553849278330349e-06, + "loss": 0.4616, + "step": 3794 + }, + { + "epoch": 0.66, + "learning_rate": 5.54882319624262e-06, + "loss": 0.4664, + "step": 3795 + }, + { + "epoch": 0.66, + "learning_rate": 5.54379851602828e-06, + "loss": 0.4681, + "step": 3796 + }, + { + "epoch": 0.66, + "learning_rate": 5.538775239269818e-06, + "loss": 0.4507, + "step": 3797 + }, + { + "epoch": 0.66, + "learning_rate": 5.533753367549285e-06, + "loss": 0.4625, + "step": 3798 + }, + { + "epoch": 0.66, + "learning_rate": 5.528732902448305e-06, + "loss": 0.4612, + "step": 3799 + }, + { + "epoch": 0.66, + "learning_rate": 5.523713845548033e-06, + "loss": 0.4672, + "step": 3800 + }, + { + "epoch": 0.66, + "learning_rate": 5.518696198429201e-06, + "loss": 0.4616, + "step": 3801 + }, + { + "epoch": 0.66, + "learning_rate": 5.513679962672076e-06, + "loss": 0.4722, + "step": 3802 + }, + { + "epoch": 0.66, + "learning_rate": 5.508665139856513e-06, + "loss": 0.4661, + "step": 3803 + }, + { + "epoch": 0.66, + "learning_rate": 5.503651731561887e-06, + "loss": 0.4585, + "step": 3804 + }, + { + "epoch": 0.66, + "learning_rate": 5.498639739367148e-06, + "loss": 0.4713, + "step": 3805 + }, + { + "epoch": 0.66, + "learning_rate": 5.493629164850795e-06, + "loss": 0.4669, + "step": 3806 + }, + { + "epoch": 0.66, + "learning_rate": 5.488620009590881e-06, + "loss": 0.4646, + "step": 3807 + }, + { + "epoch": 0.66, + "learning_rate": 5.483612275165018e-06, + "loss": 0.464, + "step": 3808 + }, + { + "epoch": 0.66, + "learning_rate": 5.478605963150348e-06, + "loss": 0.442, + "step": 3809 + }, + { + "epoch": 0.66, + "learning_rate": 5.473601075123599e-06, + "loss": 0.4687, + "step": 3810 + }, + { + "epoch": 0.66, + "learning_rate": 5.468597612661021e-06, + "loss": 0.4591, + "step": 3811 + }, + { + "epoch": 0.66, + "learning_rate": 5.4635955773384295e-06, + "loss": 0.4737, + "step": 3812 + }, + { + "epoch": 0.66, + "learning_rate": 5.458594970731188e-06, + "loss": 0.4548, + "step": 3813 + }, + { + "epoch": 0.66, + "learning_rate": 5.453595794414211e-06, + "loss": 0.4619, + "step": 3814 + }, + { + "epoch": 0.66, + "learning_rate": 5.448598049961964e-06, + "loss": 0.4589, + "step": 3815 + }, + { + "epoch": 0.66, + "learning_rate": 5.443601738948452e-06, + "loss": 0.462, + "step": 3816 + }, + { + "epoch": 0.66, + "learning_rate": 5.438606862947237e-06, + "loss": 0.4483, + "step": 3817 + }, + { + "epoch": 0.66, + "learning_rate": 5.433613423531432e-06, + "loss": 0.4479, + "step": 3818 + }, + { + "epoch": 0.66, + "learning_rate": 5.428621422273687e-06, + "loss": 0.4714, + "step": 3819 + }, + { + "epoch": 0.66, + "learning_rate": 5.4236308607462095e-06, + "loss": 0.4632, + "step": 3820 + }, + { + "epoch": 0.66, + "learning_rate": 5.418641740520748e-06, + "loss": 0.4571, + "step": 3821 + }, + { + "epoch": 0.66, + "learning_rate": 5.413654063168602e-06, + "loss": 0.4652, + "step": 3822 + }, + { + "epoch": 0.66, + "learning_rate": 5.408667830260603e-06, + "loss": 0.4642, + "step": 3823 + }, + { + "epoch": 0.66, + "learning_rate": 5.403683043367145e-06, + "loss": 0.4725, + "step": 3824 + }, + { + "epoch": 0.66, + "learning_rate": 5.398699704058156e-06, + "loss": 0.4654, + "step": 3825 + }, + { + "epoch": 0.66, + "learning_rate": 5.393717813903112e-06, + "loss": 0.4514, + "step": 3826 + }, + { + "epoch": 0.66, + "learning_rate": 5.388737374471032e-06, + "loss": 0.4707, + "step": 3827 + }, + { + "epoch": 0.66, + "learning_rate": 5.383758387330476e-06, + "loss": 0.4746, + "step": 3828 + }, + { + "epoch": 0.66, + "learning_rate": 5.378780854049553e-06, + "loss": 0.4654, + "step": 3829 + }, + { + "epoch": 0.66, + "learning_rate": 5.373804776195903e-06, + "loss": 0.4837, + "step": 3830 + }, + { + "epoch": 0.66, + "learning_rate": 5.368830155336717e-06, + "loss": 0.4546, + "step": 3831 + }, + { + "epoch": 0.66, + "learning_rate": 5.363856993038725e-06, + "loss": 0.4698, + "step": 3832 + }, + { + "epoch": 0.66, + "learning_rate": 5.358885290868195e-06, + "loss": 0.4645, + "step": 3833 + }, + { + "epoch": 0.66, + "learning_rate": 5.353915050390941e-06, + "loss": 0.4748, + "step": 3834 + }, + { + "epoch": 0.66, + "learning_rate": 5.3489462731723045e-06, + "loss": 0.4708, + "step": 3835 + }, + { + "epoch": 0.66, + "learning_rate": 5.343978960777184e-06, + "loss": 0.4718, + "step": 3836 + }, + { + "epoch": 0.66, + "learning_rate": 5.3390131147699995e-06, + "loss": 0.4653, + "step": 3837 + }, + { + "epoch": 0.66, + "learning_rate": 5.3340487367147195e-06, + "loss": 0.4753, + "step": 3838 + }, + { + "epoch": 0.67, + "learning_rate": 5.329085828174847e-06, + "loss": 0.4544, + "step": 3839 + }, + { + "epoch": 0.67, + "learning_rate": 5.324124390713423e-06, + "loss": 0.4679, + "step": 3840 + }, + { + "epoch": 0.67, + "learning_rate": 5.3191644258930275e-06, + "loss": 0.4557, + "step": 3841 + }, + { + "epoch": 0.67, + "learning_rate": 5.3142059352757625e-06, + "loss": 0.4634, + "step": 3842 + }, + { + "epoch": 0.67, + "learning_rate": 5.309248920423293e-06, + "loss": 0.4606, + "step": 3843 + }, + { + "epoch": 0.67, + "learning_rate": 5.304293382896792e-06, + "loss": 0.4601, + "step": 3844 + }, + { + "epoch": 0.67, + "learning_rate": 5.299339324256986e-06, + "loss": 0.4662, + "step": 3845 + }, + { + "epoch": 0.67, + "learning_rate": 5.294386746064115e-06, + "loss": 0.4661, + "step": 3846 + }, + { + "epoch": 0.67, + "learning_rate": 5.28943564987798e-06, + "loss": 0.4621, + "step": 3847 + }, + { + "epoch": 0.67, + "learning_rate": 5.2844860372578995e-06, + "loss": 0.457, + "step": 3848 + }, + { + "epoch": 0.67, + "learning_rate": 5.2795379097627195e-06, + "loss": 0.4558, + "step": 3849 + }, + { + "epoch": 0.67, + "learning_rate": 5.274591268950828e-06, + "loss": 0.4629, + "step": 3850 + }, + { + "epoch": 0.67, + "learning_rate": 5.2696461163801445e-06, + "loss": 0.4588, + "step": 3851 + }, + { + "epoch": 0.67, + "learning_rate": 5.264702453608119e-06, + "loss": 0.4654, + "step": 3852 + }, + { + "epoch": 0.67, + "learning_rate": 5.2597602821917206e-06, + "loss": 0.4633, + "step": 3853 + }, + { + "epoch": 0.67, + "learning_rate": 5.254819603687469e-06, + "loss": 0.4682, + "step": 3854 + }, + { + "epoch": 0.67, + "learning_rate": 5.249880419651403e-06, + "loss": 0.4468, + "step": 3855 + }, + { + "epoch": 0.67, + "learning_rate": 5.244942731639084e-06, + "loss": 0.4686, + "step": 3856 + }, + { + "epoch": 0.67, + "learning_rate": 5.2400065412056136e-06, + "loss": 0.4523, + "step": 3857 + }, + { + "epoch": 0.67, + "learning_rate": 5.235071849905617e-06, + "loss": 0.4693, + "step": 3858 + }, + { + "epoch": 0.67, + "learning_rate": 5.230138659293254e-06, + "loss": 0.467, + "step": 3859 + }, + { + "epoch": 0.67, + "learning_rate": 5.2252069709221945e-06, + "loss": 0.4571, + "step": 3860 + }, + { + "epoch": 0.67, + "learning_rate": 5.220276786345648e-06, + "loss": 0.4575, + "step": 3861 + }, + { + "epoch": 0.67, + "learning_rate": 5.21534810711636e-06, + "loss": 0.4704, + "step": 3862 + }, + { + "epoch": 0.67, + "learning_rate": 5.2104209347865786e-06, + "loss": 0.4462, + "step": 3863 + }, + { + "epoch": 0.67, + "learning_rate": 5.205495270908094e-06, + "loss": 0.4624, + "step": 3864 + }, + { + "epoch": 0.67, + "learning_rate": 5.200571117032216e-06, + "loss": 0.4513, + "step": 3865 + }, + { + "epoch": 0.67, + "learning_rate": 5.195648474709783e-06, + "loss": 0.4586, + "step": 3866 + }, + { + "epoch": 0.67, + "learning_rate": 5.190727345491149e-06, + "loss": 0.4619, + "step": 3867 + }, + { + "epoch": 0.67, + "learning_rate": 5.185807730926191e-06, + "loss": 0.4741, + "step": 3868 + }, + { + "epoch": 0.67, + "learning_rate": 5.180889632564331e-06, + "loss": 0.4711, + "step": 3869 + }, + { + "epoch": 0.67, + "learning_rate": 5.175973051954482e-06, + "loss": 0.4641, + "step": 3870 + }, + { + "epoch": 0.67, + "learning_rate": 5.171057990645098e-06, + "loss": 0.4523, + "step": 3871 + }, + { + "epoch": 0.67, + "learning_rate": 5.166144450184154e-06, + "loss": 0.4696, + "step": 3872 + }, + { + "epoch": 0.67, + "learning_rate": 5.16123243211914e-06, + "loss": 0.4567, + "step": 3873 + }, + { + "epoch": 0.67, + "learning_rate": 5.156321937997064e-06, + "loss": 0.4642, + "step": 3874 + }, + { + "epoch": 0.67, + "learning_rate": 5.151412969364464e-06, + "loss": 0.4562, + "step": 3875 + }, + { + "epoch": 0.67, + "learning_rate": 5.1465055277673915e-06, + "loss": 0.4813, + "step": 3876 + }, + { + "epoch": 0.67, + "learning_rate": 5.141599614751416e-06, + "loss": 0.4642, + "step": 3877 + }, + { + "epoch": 0.67, + "learning_rate": 5.136695231861633e-06, + "loss": 0.463, + "step": 3878 + }, + { + "epoch": 0.67, + "learning_rate": 5.131792380642639e-06, + "loss": 0.4429, + "step": 3879 + }, + { + "epoch": 0.67, + "learning_rate": 5.126891062638575e-06, + "loss": 0.4724, + "step": 3880 + }, + { + "epoch": 0.67, + "learning_rate": 5.121991279393073e-06, + "loss": 0.4663, + "step": 3881 + }, + { + "epoch": 0.67, + "learning_rate": 5.117093032449297e-06, + "loss": 0.4603, + "step": 3882 + }, + { + "epoch": 0.67, + "learning_rate": 5.112196323349918e-06, + "loss": 0.4549, + "step": 3883 + }, + { + "epoch": 0.67, + "learning_rate": 5.107301153637133e-06, + "loss": 0.4709, + "step": 3884 + }, + { + "epoch": 0.67, + "learning_rate": 5.10240752485265e-06, + "loss": 0.4503, + "step": 3885 + }, + { + "epoch": 0.67, + "learning_rate": 5.097515438537678e-06, + "loss": 0.462, + "step": 3886 + }, + { + "epoch": 0.67, + "learning_rate": 5.092624896232969e-06, + "loss": 0.4536, + "step": 3887 + }, + { + "epoch": 0.67, + "learning_rate": 5.087735899478759e-06, + "loss": 0.4736, + "step": 3888 + }, + { + "epoch": 0.67, + "learning_rate": 5.082848449814816e-06, + "loss": 0.4562, + "step": 3889 + }, + { + "epoch": 0.67, + "learning_rate": 5.0779625487804125e-06, + "loss": 0.4619, + "step": 3890 + }, + { + "epoch": 0.67, + "learning_rate": 5.073078197914341e-06, + "loss": 0.467, + "step": 3891 + }, + { + "epoch": 0.67, + "learning_rate": 5.068195398754898e-06, + "loss": 0.4736, + "step": 3892 + }, + { + "epoch": 0.67, + "learning_rate": 5.063314152839891e-06, + "loss": 0.4517, + "step": 3893 + }, + { + "epoch": 0.67, + "learning_rate": 5.058434461706642e-06, + "loss": 0.4644, + "step": 3894 + }, + { + "epoch": 0.67, + "learning_rate": 5.053556326891986e-06, + "loss": 0.4685, + "step": 3895 + }, + { + "epoch": 0.67, + "learning_rate": 5.048679749932261e-06, + "loss": 0.4656, + "step": 3896 + }, + { + "epoch": 0.68, + "learning_rate": 5.043804732363321e-06, + "loss": 0.4573, + "step": 3897 + }, + { + "epoch": 0.68, + "learning_rate": 5.038931275720522e-06, + "loss": 0.4655, + "step": 3898 + }, + { + "epoch": 0.68, + "learning_rate": 5.03405938153874e-06, + "loss": 0.4577, + "step": 3899 + }, + { + "epoch": 0.68, + "learning_rate": 5.029189051352339e-06, + "loss": 0.4766, + "step": 3900 + }, + { + "epoch": 0.68, + "learning_rate": 5.02432028669521e-06, + "loss": 0.4757, + "step": 3901 + }, + { + "epoch": 0.68, + "learning_rate": 5.0194530891007405e-06, + "loss": 0.4582, + "step": 3902 + }, + { + "epoch": 0.68, + "learning_rate": 5.01458746010183e-06, + "loss": 0.4724, + "step": 3903 + }, + { + "epoch": 0.68, + "learning_rate": 5.0097234012308836e-06, + "loss": 0.4504, + "step": 3904 + }, + { + "epoch": 0.68, + "learning_rate": 5.004860914019798e-06, + "loss": 0.4703, + "step": 3905 + }, + { + "epoch": 0.68, + "learning_rate": 5.000000000000003e-06, + "loss": 0.4569, + "step": 3906 + }, + { + "epoch": 0.68, + "learning_rate": 4.9951406607024024e-06, + "loss": 0.4723, + "step": 3907 + }, + { + "epoch": 0.68, + "learning_rate": 4.990282897657425e-06, + "loss": 0.4625, + "step": 3908 + }, + { + "epoch": 0.68, + "learning_rate": 4.985426712394994e-06, + "loss": 0.4656, + "step": 3909 + }, + { + "epoch": 0.68, + "learning_rate": 4.980572106444539e-06, + "loss": 0.4499, + "step": 3910 + }, + { + "epoch": 0.68, + "learning_rate": 4.9757190813349945e-06, + "loss": 0.462, + "step": 3911 + }, + { + "epoch": 0.68, + "learning_rate": 4.970867638594783e-06, + "loss": 0.452, + "step": 3912 + }, + { + "epoch": 0.68, + "learning_rate": 4.966017779751854e-06, + "loss": 0.4632, + "step": 3913 + }, + { + "epoch": 0.68, + "learning_rate": 4.961169506333632e-06, + "loss": 0.4513, + "step": 3914 + }, + { + "epoch": 0.68, + "learning_rate": 4.956322819867059e-06, + "loss": 0.4621, + "step": 3915 + }, + { + "epoch": 0.68, + "learning_rate": 4.9514777218785704e-06, + "loss": 0.4767, + "step": 3916 + }, + { + "epoch": 0.68, + "learning_rate": 4.946634213894104e-06, + "loss": 0.4647, + "step": 3917 + }, + { + "epoch": 0.68, + "learning_rate": 4.941792297439098e-06, + "loss": 0.4511, + "step": 3918 + }, + { + "epoch": 0.68, + "learning_rate": 4.936951974038481e-06, + "loss": 0.4713, + "step": 3919 + }, + { + "epoch": 0.68, + "learning_rate": 4.932113245216689e-06, + "loss": 0.4471, + "step": 3920 + }, + { + "epoch": 0.68, + "learning_rate": 4.927276112497652e-06, + "loss": 0.4602, + "step": 3921 + }, + { + "epoch": 0.68, + "learning_rate": 4.922440577404804e-06, + "loss": 0.4603, + "step": 3922 + }, + { + "epoch": 0.68, + "learning_rate": 4.917606641461056e-06, + "loss": 0.4642, + "step": 3923 + }, + { + "epoch": 0.68, + "learning_rate": 4.912774306188842e-06, + "loss": 0.4735, + "step": 3924 + }, + { + "epoch": 0.68, + "learning_rate": 4.90794357311008e-06, + "loss": 0.4675, + "step": 3925 + }, + { + "epoch": 0.68, + "learning_rate": 4.903114443746173e-06, + "loss": 0.4655, + "step": 3926 + }, + { + "epoch": 0.68, + "learning_rate": 4.898286919618034e-06, + "loss": 0.4552, + "step": 3927 + }, + { + "epoch": 0.68, + "learning_rate": 4.8934610022460635e-06, + "loss": 0.4642, + "step": 3928 + }, + { + "epoch": 0.68, + "learning_rate": 4.888636693150161e-06, + "loss": 0.4609, + "step": 3929 + }, + { + "epoch": 0.68, + "learning_rate": 4.883813993849706e-06, + "loss": 0.4498, + "step": 3930 + }, + { + "epoch": 0.68, + "learning_rate": 4.878992905863591e-06, + "loss": 0.4568, + "step": 3931 + }, + { + "epoch": 0.68, + "learning_rate": 4.874173430710192e-06, + "loss": 0.4625, + "step": 3932 + }, + { + "epoch": 0.68, + "learning_rate": 4.869355569907367e-06, + "loss": 0.4587, + "step": 3933 + }, + { + "epoch": 0.68, + "learning_rate": 4.864539324972478e-06, + "loss": 0.47, + "step": 3934 + }, + { + "epoch": 0.68, + "learning_rate": 4.859724697422377e-06, + "loss": 0.4627, + "step": 3935 + }, + { + "epoch": 0.68, + "learning_rate": 4.8549116887734045e-06, + "loss": 0.4528, + "step": 3936 + }, + { + "epoch": 0.68, + "learning_rate": 4.850100300541386e-06, + "loss": 0.4505, + "step": 3937 + }, + { + "epoch": 0.68, + "learning_rate": 4.8452905342416405e-06, + "loss": 0.4524, + "step": 3938 + }, + { + "epoch": 0.68, + "learning_rate": 4.840482391388988e-06, + "loss": 0.4619, + "step": 3939 + }, + { + "epoch": 0.68, + "learning_rate": 4.835675873497716e-06, + "loss": 0.4656, + "step": 3940 + }, + { + "epoch": 0.68, + "learning_rate": 4.830870982081614e-06, + "loss": 0.4612, + "step": 3941 + }, + { + "epoch": 0.68, + "learning_rate": 4.8260677186539554e-06, + "loss": 0.4606, + "step": 3942 + }, + { + "epoch": 0.68, + "learning_rate": 4.821266084727505e-06, + "loss": 0.4662, + "step": 3943 + }, + { + "epoch": 0.68, + "learning_rate": 4.816466081814504e-06, + "loss": 0.4507, + "step": 3944 + }, + { + "epoch": 0.68, + "learning_rate": 4.811667711426686e-06, + "loss": 0.4686, + "step": 3945 + }, + { + "epoch": 0.68, + "learning_rate": 4.8068709750752825e-06, + "loss": 0.4615, + "step": 3946 + }, + { + "epoch": 0.68, + "learning_rate": 4.802075874270988e-06, + "loss": 0.4663, + "step": 3947 + }, + { + "epoch": 0.68, + "learning_rate": 4.797282410523997e-06, + "loss": 0.4541, + "step": 3948 + }, + { + "epoch": 0.68, + "learning_rate": 4.792490585343983e-06, + "loss": 0.4682, + "step": 3949 + }, + { + "epoch": 0.68, + "learning_rate": 4.787700400240108e-06, + "loss": 0.4638, + "step": 3950 + }, + { + "epoch": 0.68, + "learning_rate": 4.78291185672101e-06, + "loss": 0.4696, + "step": 3951 + }, + { + "epoch": 0.68, + "learning_rate": 4.7781249562948136e-06, + "loss": 0.4554, + "step": 3952 + }, + { + "epoch": 0.68, + "learning_rate": 4.773339700469129e-06, + "loss": 0.4688, + "step": 3953 + }, + { + "epoch": 0.68, + "learning_rate": 4.7685560907510465e-06, + "loss": 0.4552, + "step": 3954 + }, + { + "epoch": 0.69, + "learning_rate": 4.7637741286471385e-06, + "loss": 0.4667, + "step": 3955 + }, + { + "epoch": 0.69, + "learning_rate": 4.7589938156634485e-06, + "loss": 0.4608, + "step": 3956 + }, + { + "epoch": 0.69, + "learning_rate": 4.7542151533055235e-06, + "loss": 0.4648, + "step": 3957 + }, + { + "epoch": 0.69, + "learning_rate": 4.7494381430783656e-06, + "loss": 0.4578, + "step": 3958 + }, + { + "epoch": 0.69, + "learning_rate": 4.744662786486471e-06, + "loss": 0.4605, + "step": 3959 + }, + { + "epoch": 0.69, + "learning_rate": 4.739889085033812e-06, + "loss": 0.458, + "step": 3960 + }, + { + "epoch": 0.69, + "learning_rate": 4.73511704022384e-06, + "loss": 0.4681, + "step": 3961 + }, + { + "epoch": 0.69, + "learning_rate": 4.730346653559486e-06, + "loss": 0.4721, + "step": 3962 + }, + { + "epoch": 0.69, + "learning_rate": 4.725577926543151e-06, + "loss": 0.4547, + "step": 3963 + }, + { + "epoch": 0.69, + "learning_rate": 4.720810860676722e-06, + "loss": 0.4637, + "step": 3964 + }, + { + "epoch": 0.69, + "learning_rate": 4.7160454574615596e-06, + "loss": 0.472, + "step": 3965 + }, + { + "epoch": 0.69, + "learning_rate": 4.711281718398503e-06, + "loss": 0.4559, + "step": 3966 + }, + { + "epoch": 0.69, + "learning_rate": 4.706519644987863e-06, + "loss": 0.4531, + "step": 3967 + }, + { + "epoch": 0.69, + "learning_rate": 4.701759238729428e-06, + "loss": 0.4634, + "step": 3968 + }, + { + "epoch": 0.69, + "learning_rate": 4.697000501122466e-06, + "loss": 0.4764, + "step": 3969 + }, + { + "epoch": 0.69, + "learning_rate": 4.6922434336657095e-06, + "loss": 0.4595, + "step": 3970 + }, + { + "epoch": 0.69, + "learning_rate": 4.68748803785737e-06, + "loss": 0.4585, + "step": 3971 + }, + { + "epoch": 0.69, + "learning_rate": 4.682734315195138e-06, + "loss": 0.4674, + "step": 3972 + }, + { + "epoch": 0.69, + "learning_rate": 4.677982267176168e-06, + "loss": 0.4676, + "step": 3973 + }, + { + "epoch": 0.69, + "learning_rate": 4.673231895297092e-06, + "loss": 0.4559, + "step": 3974 + }, + { + "epoch": 0.69, + "learning_rate": 4.668483201054013e-06, + "loss": 0.476, + "step": 3975 + }, + { + "epoch": 0.69, + "learning_rate": 4.663736185942512e-06, + "loss": 0.451, + "step": 3976 + }, + { + "epoch": 0.69, + "learning_rate": 4.658990851457625e-06, + "loss": 0.4596, + "step": 3977 + }, + { + "epoch": 0.69, + "learning_rate": 4.654247199093873e-06, + "loss": 0.4557, + "step": 3978 + }, + { + "epoch": 0.69, + "learning_rate": 4.649505230345244e-06, + "loss": 0.4646, + "step": 3979 + }, + { + "epoch": 0.69, + "learning_rate": 4.644764946705193e-06, + "loss": 0.4544, + "step": 3980 + }, + { + "epoch": 0.69, + "learning_rate": 4.640026349666651e-06, + "loss": 0.4833, + "step": 3981 + }, + { + "epoch": 0.69, + "learning_rate": 4.635289440722001e-06, + "loss": 0.4612, + "step": 3982 + }, + { + "epoch": 0.69, + "learning_rate": 4.6305542213631205e-06, + "loss": 0.4913, + "step": 3983 + }, + { + "epoch": 0.69, + "learning_rate": 4.625820693081331e-06, + "loss": 0.4662, + "step": 3984 + }, + { + "epoch": 0.69, + "learning_rate": 4.621088857367433e-06, + "loss": 0.4635, + "step": 3985 + }, + { + "epoch": 0.69, + "learning_rate": 4.616358715711693e-06, + "loss": 0.4612, + "step": 3986 + }, + { + "epoch": 0.69, + "learning_rate": 4.611630269603842e-06, + "loss": 0.4644, + "step": 3987 + }, + { + "epoch": 0.69, + "learning_rate": 4.606903520533082e-06, + "loss": 0.4544, + "step": 3988 + }, + { + "epoch": 0.69, + "learning_rate": 4.602178469988064e-06, + "loss": 0.4627, + "step": 3989 + }, + { + "epoch": 0.69, + "learning_rate": 4.5974551194569336e-06, + "loss": 0.4668, + "step": 3990 + }, + { + "epoch": 0.69, + "learning_rate": 4.592733470427272e-06, + "loss": 0.4575, + "step": 3991 + }, + { + "epoch": 0.69, + "learning_rate": 4.588013524386138e-06, + "loss": 0.4608, + "step": 3992 + }, + { + "epoch": 0.69, + "learning_rate": 4.5832952828200535e-06, + "loss": 0.4691, + "step": 3993 + }, + { + "epoch": 0.69, + "learning_rate": 4.578578747215003e-06, + "loss": 0.4686, + "step": 3994 + }, + { + "epoch": 0.69, + "learning_rate": 4.573863919056438e-06, + "loss": 0.4526, + "step": 3995 + }, + { + "epoch": 0.69, + "learning_rate": 4.569150799829257e-06, + "loss": 0.4591, + "step": 3996 + }, + { + "epoch": 0.69, + "learning_rate": 4.564439391017836e-06, + "loss": 0.4828, + "step": 3997 + }, + { + "epoch": 0.69, + "learning_rate": 4.559729694106008e-06, + "loss": 0.4569, + "step": 3998 + }, + { + "epoch": 0.69, + "learning_rate": 4.555021710577068e-06, + "loss": 0.475, + "step": 3999 + }, + { + "epoch": 0.69, + "learning_rate": 4.550315441913759e-06, + "loss": 0.4542, + "step": 4000 + }, + { + "epoch": 0.69, + "learning_rate": 4.545610889598304e-06, + "loss": 0.4579, + "step": 4001 + }, + { + "epoch": 0.69, + "learning_rate": 4.540908055112378e-06, + "loss": 0.4683, + "step": 4002 + }, + { + "epoch": 0.69, + "learning_rate": 4.536206939937101e-06, + "loss": 0.4582, + "step": 4003 + }, + { + "epoch": 0.69, + "learning_rate": 4.531507545553072e-06, + "loss": 0.4596, + "step": 4004 + }, + { + "epoch": 0.69, + "learning_rate": 4.526809873440335e-06, + "loss": 0.4677, + "step": 4005 + }, + { + "epoch": 0.69, + "learning_rate": 4.522113925078402e-06, + "loss": 0.4544, + "step": 4006 + }, + { + "epoch": 0.69, + "learning_rate": 4.517419701946224e-06, + "loss": 0.4599, + "step": 4007 + }, + { + "epoch": 0.69, + "learning_rate": 4.51272720552223e-06, + "loss": 0.4528, + "step": 4008 + }, + { + "epoch": 0.69, + "learning_rate": 4.508036437284298e-06, + "loss": 0.4655, + "step": 4009 + }, + { + "epoch": 0.69, + "learning_rate": 4.503347398709751e-06, + "loss": 0.4552, + "step": 4010 + }, + { + "epoch": 0.69, + "learning_rate": 4.498660091275379e-06, + "loss": 0.4575, + "step": 4011 + }, + { + "epoch": 0.7, + "learning_rate": 4.493974516457423e-06, + "loss": 0.4682, + "step": 4012 + }, + { + "epoch": 0.7, + "learning_rate": 4.489290675731584e-06, + "loss": 0.465, + "step": 4013 + }, + { + "epoch": 0.7, + "learning_rate": 4.484608570573002e-06, + "loss": 0.4485, + "step": 4014 + }, + { + "epoch": 0.7, + "learning_rate": 4.479928202456283e-06, + "loss": 0.4514, + "step": 4015 + }, + { + "epoch": 0.7, + "learning_rate": 4.475249572855492e-06, + "loss": 0.4706, + "step": 4016 + }, + { + "epoch": 0.7, + "learning_rate": 4.470572683244127e-06, + "loss": 0.465, + "step": 4017 + }, + { + "epoch": 0.7, + "learning_rate": 4.4658975350951505e-06, + "loss": 0.4532, + "step": 4018 + }, + { + "epoch": 0.7, + "learning_rate": 4.461224129880976e-06, + "loss": 0.4539, + "step": 4019 + }, + { + "epoch": 0.7, + "learning_rate": 4.45655246907347e-06, + "loss": 0.4543, + "step": 4020 + }, + { + "epoch": 0.7, + "learning_rate": 4.451882554143938e-06, + "loss": 0.4633, + "step": 4021 + }, + { + "epoch": 0.7, + "learning_rate": 4.447214386563145e-06, + "loss": 0.4477, + "step": 4022 + }, + { + "epoch": 0.7, + "learning_rate": 4.442547967801314e-06, + "loss": 0.4694, + "step": 4023 + }, + { + "epoch": 0.7, + "learning_rate": 4.437883299328097e-06, + "loss": 0.4691, + "step": 4024 + }, + { + "epoch": 0.7, + "learning_rate": 4.433220382612614e-06, + "loss": 0.4572, + "step": 4025 + }, + { + "epoch": 0.7, + "learning_rate": 4.4285592191234125e-06, + "loss": 0.4435, + "step": 4026 + }, + { + "epoch": 0.7, + "learning_rate": 4.423899810328512e-06, + "loss": 0.4609, + "step": 4027 + }, + { + "epoch": 0.7, + "learning_rate": 4.419242157695364e-06, + "loss": 0.4541, + "step": 4028 + }, + { + "epoch": 0.7, + "learning_rate": 4.4145862626908684e-06, + "loss": 0.4774, + "step": 4029 + }, + { + "epoch": 0.7, + "learning_rate": 4.409932126781373e-06, + "loss": 0.4526, + "step": 4030 + }, + { + "epoch": 0.7, + "learning_rate": 4.405279751432674e-06, + "loss": 0.4674, + "step": 4031 + }, + { + "epoch": 0.7, + "learning_rate": 4.400629138110014e-06, + "loss": 0.4585, + "step": 4032 + }, + { + "epoch": 0.7, + "learning_rate": 4.395980288278067e-06, + "loss": 0.4693, + "step": 4033 + }, + { + "epoch": 0.7, + "learning_rate": 4.391333203400974e-06, + "loss": 0.4503, + "step": 4034 + }, + { + "epoch": 0.7, + "learning_rate": 4.386687884942307e-06, + "loss": 0.4735, + "step": 4035 + }, + { + "epoch": 0.7, + "learning_rate": 4.382044334365078e-06, + "loss": 0.4385, + "step": 4036 + }, + { + "epoch": 0.7, + "learning_rate": 4.3774025531317476e-06, + "loss": 0.4545, + "step": 4037 + }, + { + "epoch": 0.7, + "learning_rate": 4.372762542704223e-06, + "loss": 0.4537, + "step": 4038 + }, + { + "epoch": 0.7, + "learning_rate": 4.368124304543852e-06, + "loss": 0.4692, + "step": 4039 + }, + { + "epoch": 0.7, + "learning_rate": 4.363487840111413e-06, + "loss": 0.4754, + "step": 4040 + }, + { + "epoch": 0.7, + "learning_rate": 4.358853150867137e-06, + "loss": 0.4589, + "step": 4041 + }, + { + "epoch": 0.7, + "learning_rate": 4.354220238270705e-06, + "loss": 0.4623, + "step": 4042 + }, + { + "epoch": 0.7, + "learning_rate": 4.349589103781212e-06, + "loss": 0.4689, + "step": 4043 + }, + { + "epoch": 0.7, + "learning_rate": 4.344959748857215e-06, + "loss": 0.4627, + "step": 4044 + }, + { + "epoch": 0.7, + "learning_rate": 4.340332174956703e-06, + "loss": 0.4543, + "step": 4045 + }, + { + "epoch": 0.7, + "learning_rate": 4.335706383537109e-06, + "loss": 0.4541, + "step": 4046 + }, + { + "epoch": 0.7, + "learning_rate": 4.331082376055292e-06, + "loss": 0.4708, + "step": 4047 + }, + { + "epoch": 0.7, + "learning_rate": 4.326460153967558e-06, + "loss": 0.4497, + "step": 4048 + }, + { + "epoch": 0.7, + "learning_rate": 4.32183971872966e-06, + "loss": 0.477, + "step": 4049 + }, + { + "epoch": 0.7, + "learning_rate": 4.317221071796768e-06, + "loss": 0.4651, + "step": 4050 + }, + { + "epoch": 0.7, + "learning_rate": 4.312604214623504e-06, + "loss": 0.4612, + "step": 4051 + }, + { + "epoch": 0.7, + "learning_rate": 4.307989148663921e-06, + "loss": 0.4646, + "step": 4052 + }, + { + "epoch": 0.7, + "learning_rate": 4.3033758753715095e-06, + "loss": 0.4634, + "step": 4053 + }, + { + "epoch": 0.7, + "learning_rate": 4.298764396199191e-06, + "loss": 0.4535, + "step": 4054 + }, + { + "epoch": 0.7, + "learning_rate": 4.294154712599325e-06, + "loss": 0.4565, + "step": 4055 + }, + { + "epoch": 0.7, + "learning_rate": 4.28954682602371e-06, + "loss": 0.4544, + "step": 4056 + }, + { + "epoch": 0.7, + "learning_rate": 4.284940737923571e-06, + "loss": 0.4611, + "step": 4057 + }, + { + "epoch": 0.7, + "learning_rate": 4.280336449749573e-06, + "loss": 0.4682, + "step": 4058 + }, + { + "epoch": 0.7, + "learning_rate": 4.275733962951804e-06, + "loss": 0.472, + "step": 4059 + }, + { + "epoch": 0.7, + "learning_rate": 4.271133278979802e-06, + "loss": 0.461, + "step": 4060 + }, + { + "epoch": 0.7, + "learning_rate": 4.266534399282517e-06, + "loss": 0.4797, + "step": 4061 + }, + { + "epoch": 0.7, + "learning_rate": 4.261937325308347e-06, + "loss": 0.4607, + "step": 4062 + }, + { + "epoch": 0.7, + "learning_rate": 4.257342058505109e-06, + "loss": 0.4616, + "step": 4063 + }, + { + "epoch": 0.7, + "learning_rate": 4.252748600320063e-06, + "loss": 0.4599, + "step": 4064 + }, + { + "epoch": 0.7, + "learning_rate": 4.248156952199895e-06, + "loss": 0.4682, + "step": 4065 + }, + { + "epoch": 0.7, + "learning_rate": 4.243567115590705e-06, + "loss": 0.4599, + "step": 4066 + }, + { + "epoch": 0.7, + "learning_rate": 4.238979091938054e-06, + "loss": 0.4683, + "step": 4067 + }, + { + "epoch": 0.7, + "learning_rate": 4.234392882686904e-06, + "loss": 0.4566, + "step": 4068 + }, + { + "epoch": 0.7, + "learning_rate": 4.2298084892816574e-06, + "loss": 0.4679, + "step": 4069 + }, + { + "epoch": 0.71, + "learning_rate": 4.225225913166146e-06, + "loss": 0.4593, + "step": 4070 + }, + { + "epoch": 0.71, + "learning_rate": 4.2206451557836235e-06, + "loss": 0.4572, + "step": 4071 + }, + { + "epoch": 0.71, + "learning_rate": 4.2160662185767805e-06, + "loss": 0.466, + "step": 4072 + }, + { + "epoch": 0.71, + "learning_rate": 4.21148910298772e-06, + "loss": 0.4592, + "step": 4073 + }, + { + "epoch": 0.71, + "learning_rate": 4.2069138104579825e-06, + "loss": 0.4481, + "step": 4074 + }, + { + "epoch": 0.71, + "learning_rate": 4.202340342428529e-06, + "loss": 0.4599, + "step": 4075 + }, + { + "epoch": 0.71, + "learning_rate": 4.197768700339752e-06, + "loss": 0.4645, + "step": 4076 + }, + { + "epoch": 0.71, + "learning_rate": 4.19319888563146e-06, + "loss": 0.4528, + "step": 4077 + }, + { + "epoch": 0.71, + "learning_rate": 4.188630899742894e-06, + "loss": 0.4721, + "step": 4078 + }, + { + "epoch": 0.71, + "learning_rate": 4.184064744112718e-06, + "loss": 0.4717, + "step": 4079 + }, + { + "epoch": 0.71, + "learning_rate": 4.179500420179011e-06, + "loss": 0.4654, + "step": 4080 + }, + { + "epoch": 0.71, + "learning_rate": 4.174937929379285e-06, + "loss": 0.4483, + "step": 4081 + }, + { + "epoch": 0.71, + "learning_rate": 4.17037727315047e-06, + "loss": 0.4791, + "step": 4082 + }, + { + "epoch": 0.71, + "learning_rate": 4.16581845292892e-06, + "loss": 0.4593, + "step": 4083 + }, + { + "epoch": 0.71, + "learning_rate": 4.161261470150414e-06, + "loss": 0.4577, + "step": 4084 + }, + { + "epoch": 0.71, + "learning_rate": 4.156706326250137e-06, + "loss": 0.4593, + "step": 4085 + }, + { + "epoch": 0.71, + "learning_rate": 4.15215302266272e-06, + "loss": 0.4555, + "step": 4086 + }, + { + "epoch": 0.71, + "learning_rate": 4.147601560822192e-06, + "loss": 0.4354, + "step": 4087 + }, + { + "epoch": 0.71, + "learning_rate": 4.143051942162013e-06, + "loss": 0.475, + "step": 4088 + }, + { + "epoch": 0.71, + "learning_rate": 4.138504168115059e-06, + "loss": 0.451, + "step": 4089 + }, + { + "epoch": 0.71, + "learning_rate": 4.133958240113629e-06, + "loss": 0.461, + "step": 4090 + }, + { + "epoch": 0.71, + "learning_rate": 4.129414159589438e-06, + "loss": 0.4575, + "step": 4091 + }, + { + "epoch": 0.71, + "learning_rate": 4.124871927973611e-06, + "loss": 0.4669, + "step": 4092 + }, + { + "epoch": 0.71, + "learning_rate": 4.120331546696711e-06, + "loss": 0.4473, + "step": 4093 + }, + { + "epoch": 0.71, + "learning_rate": 4.115793017188695e-06, + "loss": 0.4759, + "step": 4094 + }, + { + "epoch": 0.71, + "learning_rate": 4.111256340878952e-06, + "loss": 0.4549, + "step": 4095 + }, + { + "epoch": 0.71, + "learning_rate": 4.106721519196284e-06, + "loss": 0.4605, + "step": 4096 + }, + { + "epoch": 0.71, + "learning_rate": 4.102188553568905e-06, + "loss": 0.4536, + "step": 4097 + }, + { + "epoch": 0.71, + "learning_rate": 4.097657445424454e-06, + "loss": 0.4647, + "step": 4098 + }, + { + "epoch": 0.71, + "learning_rate": 4.093128196189971e-06, + "loss": 0.4607, + "step": 4099 + }, + { + "epoch": 0.71, + "learning_rate": 4.088600807291918e-06, + "loss": 0.4582, + "step": 4100 + }, + { + "epoch": 0.71, + "learning_rate": 4.084075280156175e-06, + "loss": 0.4684, + "step": 4101 + }, + { + "epoch": 0.71, + "learning_rate": 4.079551616208032e-06, + "loss": 0.4605, + "step": 4102 + }, + { + "epoch": 0.71, + "learning_rate": 4.075029816872183e-06, + "loss": 0.4753, + "step": 4103 + }, + { + "epoch": 0.71, + "learning_rate": 4.070509883572754e-06, + "loss": 0.4575, + "step": 4104 + }, + { + "epoch": 0.71, + "learning_rate": 4.065991817733272e-06, + "loss": 0.4594, + "step": 4105 + }, + { + "epoch": 0.71, + "learning_rate": 4.061475620776672e-06, + "loss": 0.4531, + "step": 4106 + }, + { + "epoch": 0.71, + "learning_rate": 4.056961294125305e-06, + "loss": 0.459, + "step": 4107 + }, + { + "epoch": 0.71, + "learning_rate": 4.052448839200935e-06, + "loss": 0.4658, + "step": 4108 + }, + { + "epoch": 0.71, + "learning_rate": 4.04793825742474e-06, + "loss": 0.46, + "step": 4109 + }, + { + "epoch": 0.71, + "learning_rate": 4.0434295502172885e-06, + "loss": 0.4643, + "step": 4110 + }, + { + "epoch": 0.71, + "learning_rate": 4.038922718998585e-06, + "loss": 0.4642, + "step": 4111 + }, + { + "epoch": 0.71, + "learning_rate": 4.034417765188031e-06, + "loss": 0.464, + "step": 4112 + }, + { + "epoch": 0.71, + "learning_rate": 4.0299146902044304e-06, + "loss": 0.4568, + "step": 4113 + }, + { + "epoch": 0.71, + "learning_rate": 4.025413495466004e-06, + "loss": 0.4665, + "step": 4114 + }, + { + "epoch": 0.71, + "learning_rate": 4.020914182390379e-06, + "loss": 0.454, + "step": 4115 + }, + { + "epoch": 0.71, + "learning_rate": 4.016416752394591e-06, + "loss": 0.4543, + "step": 4116 + }, + { + "epoch": 0.71, + "learning_rate": 4.011921206895074e-06, + "loss": 0.4674, + "step": 4117 + }, + { + "epoch": 0.71, + "learning_rate": 4.007427547307676e-06, + "loss": 0.4708, + "step": 4118 + }, + { + "epoch": 0.71, + "learning_rate": 4.00293577504766e-06, + "loss": 0.4561, + "step": 4119 + }, + { + "epoch": 0.71, + "learning_rate": 3.998445891529675e-06, + "loss": 0.465, + "step": 4120 + }, + { + "epoch": 0.71, + "learning_rate": 3.993957898167788e-06, + "loss": 0.455, + "step": 4121 + }, + { + "epoch": 0.71, + "learning_rate": 3.989471796375466e-06, + "loss": 0.4655, + "step": 4122 + }, + { + "epoch": 0.71, + "learning_rate": 3.9849875875655875e-06, + "loss": 0.4592, + "step": 4123 + }, + { + "epoch": 0.71, + "learning_rate": 3.980505273150421e-06, + "loss": 0.4649, + "step": 4124 + }, + { + "epoch": 0.71, + "learning_rate": 3.9760248545416465e-06, + "loss": 0.4585, + "step": 4125 + }, + { + "epoch": 0.71, + "learning_rate": 3.971546333150358e-06, + "loss": 0.4722, + "step": 4126 + }, + { + "epoch": 0.71, + "learning_rate": 3.967069710387029e-06, + "loss": 0.4527, + "step": 4127 + }, + { + "epoch": 0.72, + "learning_rate": 3.962594987661557e-06, + "loss": 0.4672, + "step": 4128 + }, + { + "epoch": 0.72, + "learning_rate": 3.958122166383217e-06, + "loss": 0.4467, + "step": 4129 + }, + { + "epoch": 0.72, + "learning_rate": 3.953651247960715e-06, + "loss": 0.4607, + "step": 4130 + }, + { + "epoch": 0.72, + "learning_rate": 3.949182233802131e-06, + "loss": 0.4523, + "step": 4131 + }, + { + "epoch": 0.72, + "learning_rate": 3.944715125314961e-06, + "loss": 0.4652, + "step": 4132 + }, + { + "epoch": 0.72, + "learning_rate": 3.940249923906093e-06, + "loss": 0.456, + "step": 4133 + }, + { + "epoch": 0.72, + "learning_rate": 3.935786630981819e-06, + "loss": 0.4622, + "step": 4134 + }, + { + "epoch": 0.72, + "learning_rate": 3.931325247947834e-06, + "loss": 0.4512, + "step": 4135 + }, + { + "epoch": 0.72, + "learning_rate": 3.926865776209212e-06, + "loss": 0.4694, + "step": 4136 + }, + { + "epoch": 0.72, + "learning_rate": 3.922408217170454e-06, + "loss": 0.4577, + "step": 4137 + }, + { + "epoch": 0.72, + "learning_rate": 3.917952572235433e-06, + "loss": 0.4587, + "step": 4138 + }, + { + "epoch": 0.72, + "learning_rate": 3.913498842807433e-06, + "loss": 0.457, + "step": 4139 + }, + { + "epoch": 0.72, + "learning_rate": 3.909047030289131e-06, + "loss": 0.4623, + "step": 4140 + }, + { + "epoch": 0.72, + "learning_rate": 3.9045971360826014e-06, + "loss": 0.4521, + "step": 4141 + }, + { + "epoch": 0.72, + "learning_rate": 3.900149161589317e-06, + "loss": 0.4721, + "step": 4142 + }, + { + "epoch": 0.72, + "learning_rate": 3.895703108210135e-06, + "loss": 0.4549, + "step": 4143 + }, + { + "epoch": 0.72, + "learning_rate": 3.891258977345319e-06, + "loss": 0.4636, + "step": 4144 + }, + { + "epoch": 0.72, + "learning_rate": 3.886816770394524e-06, + "loss": 0.4565, + "step": 4145 + }, + { + "epoch": 0.72, + "learning_rate": 3.882376488756797e-06, + "loss": 0.4719, + "step": 4146 + }, + { + "epoch": 0.72, + "learning_rate": 3.877938133830581e-06, + "loss": 0.4526, + "step": 4147 + }, + { + "epoch": 0.72, + "learning_rate": 3.873501707013711e-06, + "loss": 0.4708, + "step": 4148 + }, + { + "epoch": 0.72, + "learning_rate": 3.869067209703418e-06, + "loss": 0.4645, + "step": 4149 + }, + { + "epoch": 0.72, + "learning_rate": 3.8646346432963165e-06, + "loss": 0.4581, + "step": 4150 + }, + { + "epoch": 0.72, + "learning_rate": 3.860204009188421e-06, + "loss": 0.4715, + "step": 4151 + }, + { + "epoch": 0.72, + "learning_rate": 3.8557753087751345e-06, + "loss": 0.4606, + "step": 4152 + }, + { + "epoch": 0.72, + "learning_rate": 3.851348543451253e-06, + "loss": 0.4533, + "step": 4153 + }, + { + "epoch": 0.72, + "learning_rate": 3.846923714610962e-06, + "loss": 0.4532, + "step": 4154 + }, + { + "epoch": 0.72, + "learning_rate": 3.8425008236478355e-06, + "loss": 0.458, + "step": 4155 + }, + { + "epoch": 0.72, + "learning_rate": 3.838079871954842e-06, + "loss": 0.455, + "step": 4156 + }, + { + "epoch": 0.72, + "learning_rate": 3.833660860924328e-06, + "loss": 0.4683, + "step": 4157 + }, + { + "epoch": 0.72, + "learning_rate": 3.829243791948043e-06, + "loss": 0.462, + "step": 4158 + }, + { + "epoch": 0.72, + "learning_rate": 3.824828666417114e-06, + "loss": 0.4533, + "step": 4159 + }, + { + "epoch": 0.72, + "learning_rate": 3.820415485722064e-06, + "loss": 0.4669, + "step": 4160 + }, + { + "epoch": 0.72, + "learning_rate": 3.8160042512528e-06, + "loss": 0.4668, + "step": 4161 + }, + { + "epoch": 0.72, + "learning_rate": 3.8115949643986095e-06, + "loss": 0.461, + "step": 4162 + }, + { + "epoch": 0.72, + "learning_rate": 3.8071876265481823e-06, + "loss": 0.4602, + "step": 4163 + }, + { + "epoch": 0.72, + "learning_rate": 3.8027822390895774e-06, + "loss": 0.4562, + "step": 4164 + }, + { + "epoch": 0.72, + "learning_rate": 3.7983788034102488e-06, + "loss": 0.4569, + "step": 4165 + }, + { + "epoch": 0.72, + "learning_rate": 3.7939773208970353e-06, + "loss": 0.4684, + "step": 4166 + }, + { + "epoch": 0.72, + "learning_rate": 3.7895777929361586e-06, + "loss": 0.4488, + "step": 4167 + }, + { + "epoch": 0.72, + "learning_rate": 3.7851802209132303e-06, + "loss": 0.4698, + "step": 4168 + }, + { + "epoch": 0.72, + "learning_rate": 3.7807846062132293e-06, + "loss": 0.4496, + "step": 4169 + }, + { + "epoch": 0.72, + "learning_rate": 3.776390950220544e-06, + "loss": 0.463, + "step": 4170 + }, + { + "epoch": 0.72, + "learning_rate": 3.7719992543189233e-06, + "loss": 0.4654, + "step": 4171 + }, + { + "epoch": 0.72, + "learning_rate": 3.767609519891513e-06, + "loss": 0.475, + "step": 4172 + }, + { + "epoch": 0.72, + "learning_rate": 3.7632217483208242e-06, + "loss": 0.4586, + "step": 4173 + }, + { + "epoch": 0.72, + "learning_rate": 3.758835940988773e-06, + "loss": 0.4617, + "step": 4174 + }, + { + "epoch": 0.72, + "learning_rate": 3.7544520992766454e-06, + "loss": 0.462, + "step": 4175 + }, + { + "epoch": 0.72, + "learning_rate": 3.7500702245651e-06, + "loss": 0.4621, + "step": 4176 + }, + { + "epoch": 0.72, + "learning_rate": 3.745690318234186e-06, + "loss": 0.4515, + "step": 4177 + }, + { + "epoch": 0.72, + "learning_rate": 3.7413123816633344e-06, + "loss": 0.4545, + "step": 4178 + }, + { + "epoch": 0.72, + "learning_rate": 3.7369364162313528e-06, + "loss": 0.453, + "step": 4179 + }, + { + "epoch": 0.72, + "learning_rate": 3.7325624233164157e-06, + "loss": 0.4785, + "step": 4180 + }, + { + "epoch": 0.72, + "learning_rate": 3.7281904042961016e-06, + "loss": 0.4586, + "step": 4181 + }, + { + "epoch": 0.72, + "learning_rate": 3.723820360547351e-06, + "loss": 0.4756, + "step": 4182 + }, + { + "epoch": 0.72, + "learning_rate": 3.7194522934464785e-06, + "loss": 0.4498, + "step": 4183 + }, + { + "epoch": 0.72, + "learning_rate": 3.715086204369186e-06, + "loss": 0.4692, + "step": 4184 + }, + { + "epoch": 0.72, + "learning_rate": 3.7107220946905497e-06, + "loss": 0.4498, + "step": 4185 + }, + { + "epoch": 0.73, + "learning_rate": 3.7063599657850248e-06, + "loss": 0.4646, + "step": 4186 + }, + { + "epoch": 0.73, + "learning_rate": 3.701999819026432e-06, + "loss": 0.4478, + "step": 4187 + }, + { + "epoch": 0.73, + "learning_rate": 3.6976416557879757e-06, + "loss": 0.4595, + "step": 4188 + }, + { + "epoch": 0.73, + "learning_rate": 3.6932854774422457e-06, + "loss": 0.4676, + "step": 4189 + }, + { + "epoch": 0.73, + "learning_rate": 3.6889312853611857e-06, + "loss": 0.4629, + "step": 4190 + }, + { + "epoch": 0.73, + "learning_rate": 3.6845790809161273e-06, + "loss": 0.4552, + "step": 4191 + }, + { + "epoch": 0.73, + "learning_rate": 3.680228865477774e-06, + "loss": 0.4722, + "step": 4192 + }, + { + "epoch": 0.73, + "learning_rate": 3.675880640416205e-06, + "loss": 0.4545, + "step": 4193 + }, + { + "epoch": 0.73, + "learning_rate": 3.671534407100863e-06, + "loss": 0.4684, + "step": 4194 + }, + { + "epoch": 0.73, + "learning_rate": 3.6671901669005683e-06, + "loss": 0.4703, + "step": 4195 + }, + { + "epoch": 0.73, + "learning_rate": 3.662847921183528e-06, + "loss": 0.4646, + "step": 4196 + }, + { + "epoch": 0.73, + "learning_rate": 3.658507671317296e-06, + "loss": 0.4587, + "step": 4197 + }, + { + "epoch": 0.73, + "learning_rate": 3.654169418668815e-06, + "loss": 0.464, + "step": 4198 + }, + { + "epoch": 0.73, + "learning_rate": 3.6498331646043917e-06, + "loss": 0.4713, + "step": 4199 + }, + { + "epoch": 0.73, + "learning_rate": 3.6454989104897097e-06, + "loss": 0.4587, + "step": 4200 + }, + { + "epoch": 0.73, + "learning_rate": 3.641166657689812e-06, + "loss": 0.4594, + "step": 4201 + }, + { + "epoch": 0.73, + "learning_rate": 3.63683640756912e-06, + "loss": 0.4769, + "step": 4202 + }, + { + "epoch": 0.73, + "learning_rate": 3.6325081614914216e-06, + "loss": 0.4718, + "step": 4203 + }, + { + "epoch": 0.73, + "learning_rate": 3.6281819208198744e-06, + "loss": 0.4522, + "step": 4204 + }, + { + "epoch": 0.73, + "learning_rate": 3.6238576869170074e-06, + "loss": 0.4739, + "step": 4205 + }, + { + "epoch": 0.73, + "learning_rate": 3.6195354611447033e-06, + "loss": 0.4603, + "step": 4206 + }, + { + "epoch": 0.73, + "learning_rate": 3.6152152448642374e-06, + "loss": 0.4557, + "step": 4207 + }, + { + "epoch": 0.73, + "learning_rate": 3.6108970394362274e-06, + "loss": 0.4721, + "step": 4208 + }, + { + "epoch": 0.73, + "learning_rate": 3.606580846220671e-06, + "loss": 0.457, + "step": 4209 + }, + { + "epoch": 0.73, + "learning_rate": 3.602266666576929e-06, + "loss": 0.4608, + "step": 4210 + }, + { + "epoch": 0.73, + "learning_rate": 3.59795450186373e-06, + "loss": 0.4588, + "step": 4211 + }, + { + "epoch": 0.73, + "learning_rate": 3.5936443534391676e-06, + "loss": 0.4771, + "step": 4212 + }, + { + "epoch": 0.73, + "learning_rate": 3.58933622266069e-06, + "loss": 0.4578, + "step": 4213 + }, + { + "epoch": 0.73, + "learning_rate": 3.5850301108851326e-06, + "loss": 0.4705, + "step": 4214 + }, + { + "epoch": 0.73, + "learning_rate": 3.580726019468671e-06, + "loss": 0.455, + "step": 4215 + }, + { + "epoch": 0.73, + "learning_rate": 3.5764239497668584e-06, + "loss": 0.462, + "step": 4216 + }, + { + "epoch": 0.73, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.4613, + "step": 4217 + }, + { + "epoch": 0.73, + "learning_rate": 3.5678258809261935e-06, + "loss": 0.4637, + "step": 4218 + }, + { + "epoch": 0.73, + "learning_rate": 3.563529884495259e-06, + "loss": 0.4509, + "step": 4219 + }, + { + "epoch": 0.73, + "learning_rate": 3.5592359151947974e-06, + "loss": 0.4659, + "step": 4220 + }, + { + "epoch": 0.73, + "learning_rate": 3.554943974377174e-06, + "loss": 0.4512, + "step": 4221 + }, + { + "epoch": 0.73, + "learning_rate": 3.55065406339411e-06, + "loss": 0.4715, + "step": 4222 + }, + { + "epoch": 0.73, + "learning_rate": 3.546366183596691e-06, + "loss": 0.4478, + "step": 4223 + }, + { + "epoch": 0.73, + "learning_rate": 3.5420803363353604e-06, + "loss": 0.463, + "step": 4224 + }, + { + "epoch": 0.73, + "learning_rate": 3.537796522959921e-06, + "loss": 0.4407, + "step": 4225 + }, + { + "epoch": 0.73, + "learning_rate": 3.5335147448195406e-06, + "loss": 0.4692, + "step": 4226 + }, + { + "epoch": 0.73, + "learning_rate": 3.5292350032627344e-06, + "loss": 0.4446, + "step": 4227 + }, + { + "epoch": 0.73, + "learning_rate": 3.524957299637386e-06, + "loss": 0.4563, + "step": 4228 + }, + { + "epoch": 0.73, + "learning_rate": 3.5206816352907347e-06, + "loss": 0.4592, + "step": 4229 + }, + { + "epoch": 0.73, + "learning_rate": 3.5164080115693767e-06, + "loss": 0.471, + "step": 4230 + }, + { + "epoch": 0.73, + "learning_rate": 3.5121364298192673e-06, + "loss": 0.4637, + "step": 4231 + }, + { + "epoch": 0.73, + "learning_rate": 3.507866891385716e-06, + "loss": 0.457, + "step": 4232 + }, + { + "epoch": 0.73, + "learning_rate": 3.503599397613394e-06, + "loss": 0.4632, + "step": 4233 + }, + { + "epoch": 0.73, + "learning_rate": 3.4993339498463197e-06, + "loss": 0.4578, + "step": 4234 + }, + { + "epoch": 0.73, + "learning_rate": 3.4950705494278723e-06, + "loss": 0.4661, + "step": 4235 + }, + { + "epoch": 0.73, + "learning_rate": 3.4908091977007896e-06, + "loss": 0.4629, + "step": 4236 + }, + { + "epoch": 0.73, + "learning_rate": 3.4865498960071576e-06, + "loss": 0.4547, + "step": 4237 + }, + { + "epoch": 0.73, + "learning_rate": 3.4822926456884243e-06, + "loss": 0.452, + "step": 4238 + }, + { + "epoch": 0.73, + "learning_rate": 3.4780374480853774e-06, + "loss": 0.4657, + "step": 4239 + }, + { + "epoch": 0.73, + "learning_rate": 3.47378430453818e-06, + "loss": 0.4537, + "step": 4240 + }, + { + "epoch": 0.73, + "learning_rate": 3.469533216386328e-06, + "loss": 0.4718, + "step": 4241 + }, + { + "epoch": 0.73, + "learning_rate": 3.465284184968679e-06, + "loss": 0.4629, + "step": 4242 + }, + { + "epoch": 0.74, + "learning_rate": 3.4610372116234425e-06, + "loss": 0.4636, + "step": 4243 + }, + { + "epoch": 0.74, + "learning_rate": 3.456792297688181e-06, + "loss": 0.4467, + "step": 4244 + }, + { + "epoch": 0.74, + "learning_rate": 3.4525494444998064e-06, + "loss": 0.4652, + "step": 4245 + }, + { + "epoch": 0.74, + "learning_rate": 3.4483086533945776e-06, + "loss": 0.4378, + "step": 4246 + }, + { + "epoch": 0.74, + "learning_rate": 3.4440699257081113e-06, + "loss": 0.4639, + "step": 4247 + }, + { + "epoch": 0.74, + "learning_rate": 3.4398332627753704e-06, + "loss": 0.4473, + "step": 4248 + }, + { + "epoch": 0.74, + "learning_rate": 3.435598665930672e-06, + "loss": 0.4607, + "step": 4249 + }, + { + "epoch": 0.74, + "learning_rate": 3.431366136507669e-06, + "loss": 0.4599, + "step": 4250 + }, + { + "epoch": 0.74, + "learning_rate": 3.4271356758393827e-06, + "loss": 0.4609, + "step": 4251 + }, + { + "epoch": 0.74, + "learning_rate": 3.4229072852581735e-06, + "loss": 0.4724, + "step": 4252 + }, + { + "epoch": 0.74, + "learning_rate": 3.4186809660957433e-06, + "loss": 0.4685, + "step": 4253 + }, + { + "epoch": 0.74, + "learning_rate": 3.41445671968315e-06, + "loss": 0.4552, + "step": 4254 + }, + { + "epoch": 0.74, + "learning_rate": 3.410234547350797e-06, + "loss": 0.458, + "step": 4255 + }, + { + "epoch": 0.74, + "learning_rate": 3.4060144504284375e-06, + "loss": 0.4633, + "step": 4256 + }, + { + "epoch": 0.74, + "learning_rate": 3.4017964302451578e-06, + "loss": 0.4646, + "step": 4257 + }, + { + "epoch": 0.74, + "learning_rate": 3.3975804881294095e-06, + "loss": 0.4643, + "step": 4258 + }, + { + "epoch": 0.74, + "learning_rate": 3.393366625408979e-06, + "loss": 0.4545, + "step": 4259 + }, + { + "epoch": 0.74, + "learning_rate": 3.3891548434109942e-06, + "loss": 0.4573, + "step": 4260 + }, + { + "epoch": 0.74, + "learning_rate": 3.384945143461936e-06, + "loss": 0.4598, + "step": 4261 + }, + { + "epoch": 0.74, + "learning_rate": 3.380737526887624e-06, + "loss": 0.4526, + "step": 4262 + }, + { + "epoch": 0.74, + "learning_rate": 3.376531995013228e-06, + "loss": 0.4548, + "step": 4263 + }, + { + "epoch": 0.74, + "learning_rate": 3.3723285491632508e-06, + "loss": 0.4643, + "step": 4264 + }, + { + "epoch": 0.74, + "learning_rate": 3.368127190661543e-06, + "loss": 0.4648, + "step": 4265 + }, + { + "epoch": 0.74, + "learning_rate": 3.3639279208313113e-06, + "loss": 0.456, + "step": 4266 + }, + { + "epoch": 0.74, + "learning_rate": 3.35973074099508e-06, + "loss": 0.4522, + "step": 4267 + }, + { + "epoch": 0.74, + "learning_rate": 3.3555356524747307e-06, + "loss": 0.4633, + "step": 4268 + }, + { + "epoch": 0.74, + "learning_rate": 3.3513426565914854e-06, + "loss": 0.4587, + "step": 4269 + }, + { + "epoch": 0.74, + "learning_rate": 3.3471517546659072e-06, + "loss": 0.4573, + "step": 4270 + }, + { + "epoch": 0.74, + "learning_rate": 3.3429629480178905e-06, + "loss": 0.4629, + "step": 4271 + }, + { + "epoch": 0.74, + "learning_rate": 3.3387762379666753e-06, + "loss": 0.4619, + "step": 4272 + }, + { + "epoch": 0.74, + "learning_rate": 3.3345916258308565e-06, + "loss": 0.4738, + "step": 4273 + }, + { + "epoch": 0.74, + "learning_rate": 3.33040911292834e-06, + "loss": 0.4561, + "step": 4274 + }, + { + "epoch": 0.74, + "learning_rate": 3.3262287005763915e-06, + "loss": 0.4697, + "step": 4275 + }, + { + "epoch": 0.74, + "learning_rate": 3.3220503900916068e-06, + "loss": 0.4553, + "step": 4276 + }, + { + "epoch": 0.74, + "learning_rate": 3.3178741827899253e-06, + "loss": 0.4582, + "step": 4277 + }, + { + "epoch": 0.74, + "learning_rate": 3.3137000799866148e-06, + "loss": 0.4789, + "step": 4278 + }, + { + "epoch": 0.74, + "learning_rate": 3.309528082996287e-06, + "loss": 0.4688, + "step": 4279 + }, + { + "epoch": 0.74, + "learning_rate": 3.3053581931328914e-06, + "loss": 0.4618, + "step": 4280 + }, + { + "epoch": 0.74, + "learning_rate": 3.3011904117097093e-06, + "loss": 0.4571, + "step": 4281 + }, + { + "epoch": 0.74, + "learning_rate": 3.297024740039366e-06, + "loss": 0.443, + "step": 4282 + }, + { + "epoch": 0.74, + "learning_rate": 3.292861179433805e-06, + "loss": 0.4692, + "step": 4283 + }, + { + "epoch": 0.74, + "learning_rate": 3.28869973120433e-06, + "loss": 0.4474, + "step": 4284 + }, + { + "epoch": 0.74, + "learning_rate": 3.2845403966615574e-06, + "loss": 0.4649, + "step": 4285 + }, + { + "epoch": 0.74, + "learning_rate": 3.2803831771154483e-06, + "loss": 0.4625, + "step": 4286 + }, + { + "epoch": 0.74, + "learning_rate": 3.276228073875296e-06, + "loss": 0.4664, + "step": 4287 + }, + { + "epoch": 0.74, + "learning_rate": 3.2720750882497276e-06, + "loss": 0.4606, + "step": 4288 + }, + { + "epoch": 0.74, + "learning_rate": 3.2679242215467066e-06, + "loss": 0.4553, + "step": 4289 + }, + { + "epoch": 0.74, + "learning_rate": 3.263775475073514e-06, + "loss": 0.4546, + "step": 4290 + }, + { + "epoch": 0.74, + "learning_rate": 3.259628850136789e-06, + "loss": 0.4592, + "step": 4291 + }, + { + "epoch": 0.74, + "learning_rate": 3.255484348042478e-06, + "loss": 0.464, + "step": 4292 + }, + { + "epoch": 0.74, + "learning_rate": 3.2513419700958715e-06, + "loss": 0.4555, + "step": 4293 + }, + { + "epoch": 0.74, + "learning_rate": 3.2472017176015893e-06, + "loss": 0.4587, + "step": 4294 + }, + { + "epoch": 0.74, + "learning_rate": 3.24306359186358e-06, + "loss": 0.4514, + "step": 4295 + }, + { + "epoch": 0.74, + "learning_rate": 3.238927594185127e-06, + "loss": 0.4553, + "step": 4296 + }, + { + "epoch": 0.74, + "learning_rate": 3.2347937258688342e-06, + "loss": 0.461, + "step": 4297 + }, + { + "epoch": 0.74, + "learning_rate": 3.2306619882166414e-06, + "loss": 0.4527, + "step": 4298 + }, + { + "epoch": 0.74, + "learning_rate": 3.226532382529819e-06, + "loss": 0.4625, + "step": 4299 + }, + { + "epoch": 0.74, + "learning_rate": 3.2224049101089616e-06, + "loss": 0.4595, + "step": 4300 + }, + { + "epoch": 0.75, + "learning_rate": 3.218279572253994e-06, + "loss": 0.4629, + "step": 4301 + }, + { + "epoch": 0.75, + "learning_rate": 3.214156370264169e-06, + "loss": 0.4642, + "step": 4302 + }, + { + "epoch": 0.75, + "learning_rate": 3.2100353054380683e-06, + "loss": 0.456, + "step": 4303 + }, + { + "epoch": 0.75, + "learning_rate": 3.2059163790735927e-06, + "loss": 0.4647, + "step": 4304 + }, + { + "epoch": 0.75, + "learning_rate": 3.201799592467978e-06, + "loss": 0.46, + "step": 4305 + }, + { + "epoch": 0.75, + "learning_rate": 3.197684946917784e-06, + "loss": 0.4564, + "step": 4306 + }, + { + "epoch": 0.75, + "learning_rate": 3.1935724437188954e-06, + "loss": 0.4499, + "step": 4307 + }, + { + "epoch": 0.75, + "learning_rate": 3.1894620841665248e-06, + "loss": 0.4549, + "step": 4308 + }, + { + "epoch": 0.75, + "learning_rate": 3.1853538695551965e-06, + "loss": 0.4534, + "step": 4309 + }, + { + "epoch": 0.75, + "learning_rate": 3.181247801178785e-06, + "loss": 0.4736, + "step": 4310 + }, + { + "epoch": 0.75, + "learning_rate": 3.177143880330463e-06, + "loss": 0.4637, + "step": 4311 + }, + { + "epoch": 0.75, + "learning_rate": 3.1730421083027395e-06, + "loss": 0.466, + "step": 4312 + }, + { + "epoch": 0.75, + "learning_rate": 3.1689424863874473e-06, + "loss": 0.4582, + "step": 4313 + }, + { + "epoch": 0.75, + "learning_rate": 3.1648450158757373e-06, + "loss": 0.4527, + "step": 4314 + }, + { + "epoch": 0.75, + "learning_rate": 3.1607496980580897e-06, + "loss": 0.447, + "step": 4315 + }, + { + "epoch": 0.75, + "learning_rate": 3.1566565342242916e-06, + "loss": 0.4677, + "step": 4316 + }, + { + "epoch": 0.75, + "learning_rate": 3.1525655256634757e-06, + "loss": 0.4666, + "step": 4317 + }, + { + "epoch": 0.75, + "learning_rate": 3.1484766736640717e-06, + "loss": 0.4664, + "step": 4318 + }, + { + "epoch": 0.75, + "learning_rate": 3.1443899795138454e-06, + "loss": 0.4636, + "step": 4319 + }, + { + "epoch": 0.75, + "learning_rate": 3.140305444499877e-06, + "loss": 0.4435, + "step": 4320 + }, + { + "epoch": 0.75, + "learning_rate": 3.1362230699085693e-06, + "loss": 0.4718, + "step": 4321 + }, + { + "epoch": 0.75, + "learning_rate": 3.1321428570256464e-06, + "loss": 0.451, + "step": 4322 + }, + { + "epoch": 0.75, + "learning_rate": 3.128064807136142e-06, + "loss": 0.468, + "step": 4323 + }, + { + "epoch": 0.75, + "learning_rate": 3.123988921524418e-06, + "loss": 0.4656, + "step": 4324 + }, + { + "epoch": 0.75, + "learning_rate": 3.119915201474153e-06, + "loss": 0.4552, + "step": 4325 + }, + { + "epoch": 0.75, + "learning_rate": 3.115843648268344e-06, + "loss": 0.4616, + "step": 4326 + }, + { + "epoch": 0.75, + "learning_rate": 3.1117742631892965e-06, + "loss": 0.469, + "step": 4327 + }, + { + "epoch": 0.75, + "learning_rate": 3.107707047518649e-06, + "loss": 0.4706, + "step": 4328 + }, + { + "epoch": 0.75, + "learning_rate": 3.103642002537349e-06, + "loss": 0.4639, + "step": 4329 + }, + { + "epoch": 0.75, + "learning_rate": 3.099579129525653e-06, + "loss": 0.4644, + "step": 4330 + }, + { + "epoch": 0.75, + "learning_rate": 3.0955184297631437e-06, + "loss": 0.4657, + "step": 4331 + }, + { + "epoch": 0.75, + "learning_rate": 3.0914599045287165e-06, + "loss": 0.4497, + "step": 4332 + }, + { + "epoch": 0.75, + "learning_rate": 3.087403555100583e-06, + "loss": 0.4677, + "step": 4333 + }, + { + "epoch": 0.75, + "learning_rate": 3.0833493827562598e-06, + "loss": 0.4448, + "step": 4334 + }, + { + "epoch": 0.75, + "learning_rate": 3.079297388772595e-06, + "loss": 0.4665, + "step": 4335 + }, + { + "epoch": 0.75, + "learning_rate": 3.0752475744257414e-06, + "loss": 0.4676, + "step": 4336 + }, + { + "epoch": 0.75, + "learning_rate": 3.0711999409911587e-06, + "loss": 0.4557, + "step": 4337 + }, + { + "epoch": 0.75, + "learning_rate": 3.067154489743631e-06, + "loss": 0.4644, + "step": 4338 + }, + { + "epoch": 0.75, + "learning_rate": 3.06311122195725e-06, + "loss": 0.452, + "step": 4339 + }, + { + "epoch": 0.75, + "learning_rate": 3.0590701389054235e-06, + "loss": 0.4546, + "step": 4340 + }, + { + "epoch": 0.75, + "learning_rate": 3.0550312418608617e-06, + "loss": 0.455, + "step": 4341 + }, + { + "epoch": 0.75, + "learning_rate": 3.0509945320955925e-06, + "loss": 0.4618, + "step": 4342 + }, + { + "epoch": 0.75, + "learning_rate": 3.046960010880966e-06, + "loss": 0.4677, + "step": 4343 + }, + { + "epoch": 0.75, + "learning_rate": 3.042927679487622e-06, + "loss": 0.4728, + "step": 4344 + }, + { + "epoch": 0.75, + "learning_rate": 3.0388975391855226e-06, + "loss": 0.4678, + "step": 4345 + }, + { + "epoch": 0.75, + "learning_rate": 3.03486959124394e-06, + "loss": 0.456, + "step": 4346 + }, + { + "epoch": 0.75, + "learning_rate": 3.0308438369314563e-06, + "loss": 0.4641, + "step": 4347 + }, + { + "epoch": 0.75, + "learning_rate": 3.026820277515955e-06, + "loss": 0.4568, + "step": 4348 + }, + { + "epoch": 0.75, + "learning_rate": 3.022798914264633e-06, + "loss": 0.4769, + "step": 4349 + }, + { + "epoch": 0.75, + "learning_rate": 3.018779748444005e-06, + "loss": 0.4551, + "step": 4350 + }, + { + "epoch": 0.75, + "learning_rate": 3.0147627813198777e-06, + "loss": 0.4693, + "step": 4351 + }, + { + "epoch": 0.75, + "learning_rate": 3.0107480141573763e-06, + "loss": 0.46, + "step": 4352 + }, + { + "epoch": 0.75, + "learning_rate": 3.006735448220922e-06, + "loss": 0.4506, + "step": 4353 + }, + { + "epoch": 0.75, + "learning_rate": 3.002725084774262e-06, + "loss": 0.4529, + "step": 4354 + }, + { + "epoch": 0.75, + "learning_rate": 2.998716925080427e-06, + "loss": 0.4752, + "step": 4355 + }, + { + "epoch": 0.75, + "learning_rate": 2.9947109704017707e-06, + "loss": 0.4475, + "step": 4356 + }, + { + "epoch": 0.75, + "learning_rate": 2.9907072219999443e-06, + "loss": 0.4598, + "step": 4357 + }, + { + "epoch": 0.75, + "learning_rate": 2.9867056811359063e-06, + "loss": 0.4503, + "step": 4358 + }, + { + "epoch": 0.76, + "learning_rate": 2.9827063490699225e-06, + "loss": 0.4649, + "step": 4359 + }, + { + "epoch": 0.76, + "learning_rate": 2.9787092270615527e-06, + "loss": 0.4645, + "step": 4360 + }, + { + "epoch": 0.76, + "learning_rate": 2.974714316369679e-06, + "loss": 0.4648, + "step": 4361 + }, + { + "epoch": 0.76, + "learning_rate": 2.9707216182524667e-06, + "loss": 0.461, + "step": 4362 + }, + { + "epoch": 0.76, + "learning_rate": 2.966731133967399e-06, + "loss": 0.4601, + "step": 4363 + }, + { + "epoch": 0.76, + "learning_rate": 2.9627428647712553e-06, + "loss": 0.4539, + "step": 4364 + }, + { + "epoch": 0.76, + "learning_rate": 2.9587568119201193e-06, + "loss": 0.4541, + "step": 4365 + }, + { + "epoch": 0.76, + "learning_rate": 2.954772976669378e-06, + "loss": 0.4597, + "step": 4366 + }, + { + "epoch": 0.76, + "learning_rate": 2.950791360273714e-06, + "loss": 0.4622, + "step": 4367 + }, + { + "epoch": 0.76, + "learning_rate": 2.9468119639871163e-06, + "loss": 0.4503, + "step": 4368 + }, + { + "epoch": 0.76, + "learning_rate": 2.942834789062876e-06, + "loss": 0.4597, + "step": 4369 + }, + { + "epoch": 0.76, + "learning_rate": 2.9388598367535793e-06, + "loss": 0.4547, + "step": 4370 + }, + { + "epoch": 0.76, + "learning_rate": 2.9348871083111185e-06, + "loss": 0.4495, + "step": 4371 + }, + { + "epoch": 0.76, + "learning_rate": 2.93091660498668e-06, + "loss": 0.4566, + "step": 4372 + }, + { + "epoch": 0.76, + "learning_rate": 2.926948328030755e-06, + "loss": 0.462, + "step": 4373 + }, + { + "epoch": 0.76, + "learning_rate": 2.9229822786931263e-06, + "loss": 0.4492, + "step": 4374 + }, + { + "epoch": 0.76, + "learning_rate": 2.9190184582228787e-06, + "loss": 0.4665, + "step": 4375 + }, + { + "epoch": 0.76, + "learning_rate": 2.9150568678683987e-06, + "loss": 0.4561, + "step": 4376 + }, + { + "epoch": 0.76, + "learning_rate": 2.911097508877365e-06, + "loss": 0.4533, + "step": 4377 + }, + { + "epoch": 0.76, + "learning_rate": 2.907140382496757e-06, + "loss": 0.4547, + "step": 4378 + }, + { + "epoch": 0.76, + "learning_rate": 2.9031854899728485e-06, + "loss": 0.4533, + "step": 4379 + }, + { + "epoch": 0.76, + "learning_rate": 2.899232832551214e-06, + "loss": 0.4637, + "step": 4380 + }, + { + "epoch": 0.76, + "learning_rate": 2.8952824114767164e-06, + "loss": 0.465, + "step": 4381 + }, + { + "epoch": 0.76, + "learning_rate": 2.891334227993521e-06, + "loss": 0.4696, + "step": 4382 + }, + { + "epoch": 0.76, + "learning_rate": 2.8873882833450863e-06, + "loss": 0.4657, + "step": 4383 + }, + { + "epoch": 0.76, + "learning_rate": 2.8834445787741647e-06, + "loss": 0.4593, + "step": 4384 + }, + { + "epoch": 0.76, + "learning_rate": 2.8795031155228083e-06, + "loss": 0.4618, + "step": 4385 + }, + { + "epoch": 0.76, + "learning_rate": 2.8755638948323494e-06, + "loss": 0.4623, + "step": 4386 + }, + { + "epoch": 0.76, + "learning_rate": 2.8716269179434366e-06, + "loss": 0.4635, + "step": 4387 + }, + { + "epoch": 0.76, + "learning_rate": 2.8676921860959874e-06, + "loss": 0.4454, + "step": 4388 + }, + { + "epoch": 0.76, + "learning_rate": 2.8637597005292295e-06, + "loss": 0.4648, + "step": 4389 + }, + { + "epoch": 0.76, + "learning_rate": 2.859829462481676e-06, + "loss": 0.4558, + "step": 4390 + }, + { + "epoch": 0.76, + "learning_rate": 2.855901473191134e-06, + "loss": 0.4496, + "step": 4391 + }, + { + "epoch": 0.76, + "learning_rate": 2.851975733894705e-06, + "loss": 0.4724, + "step": 4392 + }, + { + "epoch": 0.76, + "learning_rate": 2.8480522458287686e-06, + "loss": 0.4589, + "step": 4393 + }, + { + "epoch": 0.76, + "learning_rate": 2.8441310102290187e-06, + "loss": 0.4592, + "step": 4394 + }, + { + "epoch": 0.76, + "learning_rate": 2.840212028330418e-06, + "loss": 0.4554, + "step": 4395 + }, + { + "epoch": 0.76, + "learning_rate": 2.8362953013672325e-06, + "loss": 0.4692, + "step": 4396 + }, + { + "epoch": 0.76, + "learning_rate": 2.8323808305730062e-06, + "loss": 0.451, + "step": 4397 + }, + { + "epoch": 0.76, + "learning_rate": 2.8284686171805875e-06, + "loss": 0.452, + "step": 4398 + }, + { + "epoch": 0.76, + "learning_rate": 2.8245586624221076e-06, + "loss": 0.4629, + "step": 4399 + }, + { + "epoch": 0.76, + "learning_rate": 2.8206509675289785e-06, + "loss": 0.4694, + "step": 4400 + }, + { + "epoch": 0.76, + "learning_rate": 2.8167455337319084e-06, + "loss": 0.4702, + "step": 4401 + }, + { + "epoch": 0.76, + "learning_rate": 2.8128423622608947e-06, + "loss": 0.4631, + "step": 4402 + }, + { + "epoch": 0.76, + "learning_rate": 2.808941454345221e-06, + "loss": 0.4537, + "step": 4403 + }, + { + "epoch": 0.76, + "learning_rate": 2.8050428112134474e-06, + "loss": 0.4615, + "step": 4404 + }, + { + "epoch": 0.76, + "learning_rate": 2.8011464340934403e-06, + "loss": 0.4602, + "step": 4405 + }, + { + "epoch": 0.76, + "learning_rate": 2.7972523242123407e-06, + "loss": 0.4631, + "step": 4406 + }, + { + "epoch": 0.76, + "learning_rate": 2.79336048279657e-06, + "loss": 0.4574, + "step": 4407 + }, + { + "epoch": 0.76, + "learning_rate": 2.7894709110718476e-06, + "loss": 0.4615, + "step": 4408 + }, + { + "epoch": 0.76, + "learning_rate": 2.7855836102631707e-06, + "loss": 0.4502, + "step": 4409 + }, + { + "epoch": 0.76, + "learning_rate": 2.781698581594826e-06, + "loss": 0.4604, + "step": 4410 + }, + { + "epoch": 0.76, + "learning_rate": 2.7778158262903764e-06, + "loss": 0.4571, + "step": 4411 + }, + { + "epoch": 0.76, + "learning_rate": 2.7739353455726735e-06, + "loss": 0.4692, + "step": 4412 + }, + { + "epoch": 0.76, + "learning_rate": 2.7700571406638633e-06, + "loss": 0.4588, + "step": 4413 + }, + { + "epoch": 0.76, + "learning_rate": 2.7661812127853536e-06, + "loss": 0.4522, + "step": 4414 + }, + { + "epoch": 0.76, + "learning_rate": 2.762307563157852e-06, + "loss": 0.4688, + "step": 4415 + }, + { + "epoch": 0.77, + "learning_rate": 2.7584361930013413e-06, + "loss": 0.4656, + "step": 4416 + }, + { + "epoch": 0.77, + "learning_rate": 2.7545671035350907e-06, + "loss": 0.4602, + "step": 4417 + }, + { + "epoch": 0.77, + "learning_rate": 2.7507002959776443e-06, + "loss": 0.4645, + "step": 4418 + }, + { + "epoch": 0.77, + "learning_rate": 2.7468357715468296e-06, + "loss": 0.4498, + "step": 4419 + }, + { + "epoch": 0.77, + "learning_rate": 2.742973531459767e-06, + "loss": 0.4576, + "step": 4420 + }, + { + "epoch": 0.77, + "learning_rate": 2.739113576932838e-06, + "loss": 0.4691, + "step": 4421 + }, + { + "epoch": 0.77, + "learning_rate": 2.735255909181719e-06, + "loss": 0.466, + "step": 4422 + }, + { + "epoch": 0.77, + "learning_rate": 2.7314005294213573e-06, + "loss": 0.4498, + "step": 4423 + }, + { + "epoch": 0.77, + "learning_rate": 2.7275474388659896e-06, + "loss": 0.4618, + "step": 4424 + }, + { + "epoch": 0.77, + "learning_rate": 2.7236966387291176e-06, + "loss": 0.4587, + "step": 4425 + }, + { + "epoch": 0.77, + "learning_rate": 2.7198481302235325e-06, + "loss": 0.4686, + "step": 4426 + }, + { + "epoch": 0.77, + "learning_rate": 2.7160019145613002e-06, + "loss": 0.4644, + "step": 4427 + }, + { + "epoch": 0.77, + "learning_rate": 2.7121579929537677e-06, + "loss": 0.4603, + "step": 4428 + }, + { + "epoch": 0.77, + "learning_rate": 2.7083163666115564e-06, + "loss": 0.4479, + "step": 4429 + }, + { + "epoch": 0.77, + "learning_rate": 2.7044770367445583e-06, + "loss": 0.4653, + "step": 4430 + }, + { + "epoch": 0.77, + "learning_rate": 2.7006400045619597e-06, + "loss": 0.4453, + "step": 4431 + }, + { + "epoch": 0.77, + "learning_rate": 2.6968052712722037e-06, + "loss": 0.4655, + "step": 4432 + }, + { + "epoch": 0.77, + "learning_rate": 2.692972838083022e-06, + "loss": 0.4513, + "step": 4433 + }, + { + "epoch": 0.77, + "learning_rate": 2.6891427062014184e-06, + "loss": 0.4551, + "step": 4434 + }, + { + "epoch": 0.77, + "learning_rate": 2.6853148768336703e-06, + "loss": 0.4565, + "step": 4435 + }, + { + "epoch": 0.77, + "learning_rate": 2.6814893511853347e-06, + "loss": 0.4567, + "step": 4436 + }, + { + "epoch": 0.77, + "learning_rate": 2.677666130461232e-06, + "loss": 0.4506, + "step": 4437 + }, + { + "epoch": 0.77, + "learning_rate": 2.6738452158654736e-06, + "loss": 0.4658, + "step": 4438 + }, + { + "epoch": 0.77, + "learning_rate": 2.670026608601429e-06, + "loss": 0.4617, + "step": 4439 + }, + { + "epoch": 0.77, + "learning_rate": 2.6662103098717485e-06, + "loss": 0.4667, + "step": 4440 + }, + { + "epoch": 0.77, + "learning_rate": 2.6623963208783553e-06, + "loss": 0.4617, + "step": 4441 + }, + { + "epoch": 0.77, + "learning_rate": 2.658584642822444e-06, + "loss": 0.4574, + "step": 4442 + }, + { + "epoch": 0.77, + "learning_rate": 2.654775276904483e-06, + "loss": 0.4585, + "step": 4443 + }, + { + "epoch": 0.77, + "learning_rate": 2.6509682243242074e-06, + "loss": 0.4679, + "step": 4444 + }, + { + "epoch": 0.77, + "learning_rate": 2.6471634862806272e-06, + "loss": 0.4718, + "step": 4445 + }, + { + "epoch": 0.77, + "learning_rate": 2.6433610639720265e-06, + "loss": 0.4694, + "step": 4446 + }, + { + "epoch": 0.77, + "learning_rate": 2.6395609585959547e-06, + "loss": 0.4645, + "step": 4447 + }, + { + "epoch": 0.77, + "learning_rate": 2.635763171349235e-06, + "loss": 0.4486, + "step": 4448 + }, + { + "epoch": 0.77, + "learning_rate": 2.631967703427959e-06, + "loss": 0.4579, + "step": 4449 + }, + { + "epoch": 0.77, + "learning_rate": 2.628174556027492e-06, + "loss": 0.4639, + "step": 4450 + }, + { + "epoch": 0.77, + "learning_rate": 2.624383730342457e-06, + "loss": 0.4476, + "step": 4451 + }, + { + "epoch": 0.77, + "learning_rate": 2.620595227566758e-06, + "loss": 0.4616, + "step": 4452 + }, + { + "epoch": 0.77, + "learning_rate": 2.616809048893563e-06, + "loss": 0.4561, + "step": 4453 + }, + { + "epoch": 0.77, + "learning_rate": 2.6130251955153063e-06, + "loss": 0.4682, + "step": 4454 + }, + { + "epoch": 0.77, + "learning_rate": 2.6092436686236966e-06, + "loss": 0.4561, + "step": 4455 + }, + { + "epoch": 0.77, + "learning_rate": 2.6054644694096942e-06, + "loss": 0.4541, + "step": 4456 + }, + { + "epoch": 0.77, + "learning_rate": 2.601687599063549e-06, + "loss": 0.471, + "step": 4457 + }, + { + "epoch": 0.77, + "learning_rate": 2.597913058774758e-06, + "loss": 0.4713, + "step": 4458 + }, + { + "epoch": 0.77, + "learning_rate": 2.594140849732092e-06, + "loss": 0.4573, + "step": 4459 + }, + { + "epoch": 0.77, + "learning_rate": 2.590370973123589e-06, + "loss": 0.4741, + "step": 4460 + }, + { + "epoch": 0.77, + "learning_rate": 2.5866034301365505e-06, + "loss": 0.4652, + "step": 4461 + }, + { + "epoch": 0.77, + "learning_rate": 2.5828382219575467e-06, + "loss": 0.4584, + "step": 4462 + }, + { + "epoch": 0.77, + "learning_rate": 2.5790753497723986e-06, + "loss": 0.4553, + "step": 4463 + }, + { + "epoch": 0.77, + "learning_rate": 2.5753148147662145e-06, + "loss": 0.4543, + "step": 4464 + }, + { + "epoch": 0.77, + "learning_rate": 2.5715566181233454e-06, + "loss": 0.4604, + "step": 4465 + }, + { + "epoch": 0.77, + "learning_rate": 2.567800761027417e-06, + "loss": 0.4679, + "step": 4466 + }, + { + "epoch": 0.77, + "learning_rate": 2.564047244661316e-06, + "loss": 0.4518, + "step": 4467 + }, + { + "epoch": 0.77, + "learning_rate": 2.5602960702071913e-06, + "loss": 0.4688, + "step": 4468 + }, + { + "epoch": 0.77, + "learning_rate": 2.556547238846456e-06, + "loss": 0.4524, + "step": 4469 + }, + { + "epoch": 0.77, + "learning_rate": 2.5528007517597807e-06, + "loss": 0.4525, + "step": 4470 + }, + { + "epoch": 0.77, + "learning_rate": 2.549056610127101e-06, + "loss": 0.4516, + "step": 4471 + }, + { + "epoch": 0.77, + "learning_rate": 2.5453148151276153e-06, + "loss": 0.4685, + "step": 4472 + }, + { + "epoch": 0.77, + "learning_rate": 2.5415753679397827e-06, + "loss": 0.457, + "step": 4473 + }, + { + "epoch": 0.78, + "learning_rate": 2.537838269741314e-06, + "loss": 0.4706, + "step": 4474 + }, + { + "epoch": 0.78, + "learning_rate": 2.534103521709195e-06, + "loss": 0.4706, + "step": 4475 + }, + { + "epoch": 0.78, + "learning_rate": 2.530371125019664e-06, + "loss": 0.4684, + "step": 4476 + }, + { + "epoch": 0.78, + "learning_rate": 2.526641080848212e-06, + "loss": 0.4625, + "step": 4477 + }, + { + "epoch": 0.78, + "learning_rate": 2.5229133903696012e-06, + "loss": 0.4558, + "step": 4478 + }, + { + "epoch": 0.78, + "learning_rate": 2.519188054757844e-06, + "loss": 0.4569, + "step": 4479 + }, + { + "epoch": 0.78, + "learning_rate": 2.5154650751862197e-06, + "loss": 0.4576, + "step": 4480 + }, + { + "epoch": 0.78, + "learning_rate": 2.5117444528272496e-06, + "loss": 0.4697, + "step": 4481 + }, + { + "epoch": 0.78, + "learning_rate": 2.5080261888527314e-06, + "loss": 0.4654, + "step": 4482 + }, + { + "epoch": 0.78, + "learning_rate": 2.504310284433713e-06, + "loss": 0.4708, + "step": 4483 + }, + { + "epoch": 0.78, + "learning_rate": 2.500596740740491e-06, + "loss": 0.4518, + "step": 4484 + }, + { + "epoch": 0.78, + "learning_rate": 2.4968855589426288e-06, + "loss": 0.4566, + "step": 4485 + }, + { + "epoch": 0.78, + "learning_rate": 2.4931767402089423e-06, + "loss": 0.4543, + "step": 4486 + }, + { + "epoch": 0.78, + "learning_rate": 2.489470285707507e-06, + "loss": 0.4584, + "step": 4487 + }, + { + "epoch": 0.78, + "learning_rate": 2.4857661966056423e-06, + "loss": 0.452, + "step": 4488 + }, + { + "epoch": 0.78, + "learning_rate": 2.4820644740699327e-06, + "loss": 0.4568, + "step": 4489 + }, + { + "epoch": 0.78, + "learning_rate": 2.478365119266223e-06, + "loss": 0.4539, + "step": 4490 + }, + { + "epoch": 0.78, + "learning_rate": 2.4746681333595957e-06, + "loss": 0.4531, + "step": 4491 + }, + { + "epoch": 0.78, + "learning_rate": 2.4709735175143977e-06, + "loss": 0.4556, + "step": 4492 + }, + { + "epoch": 0.78, + "learning_rate": 2.4672812728942295e-06, + "loss": 0.4429, + "step": 4493 + }, + { + "epoch": 0.78, + "learning_rate": 2.4635914006619454e-06, + "loss": 0.4626, + "step": 4494 + }, + { + "epoch": 0.78, + "learning_rate": 2.4599039019796444e-06, + "loss": 0.4629, + "step": 4495 + }, + { + "epoch": 0.78, + "learning_rate": 2.4562187780086834e-06, + "loss": 0.4611, + "step": 4496 + }, + { + "epoch": 0.78, + "learning_rate": 2.45253602990968e-06, + "loss": 0.4483, + "step": 4497 + }, + { + "epoch": 0.78, + "learning_rate": 2.448855658842487e-06, + "loss": 0.4526, + "step": 4498 + }, + { + "epoch": 0.78, + "learning_rate": 2.4451776659662207e-06, + "loss": 0.4572, + "step": 4499 + }, + { + "epoch": 0.78, + "learning_rate": 2.441502052439243e-06, + "loss": 0.4578, + "step": 4500 + }, + { + "epoch": 0.78, + "learning_rate": 2.4378288194191714e-06, + "loss": 0.4605, + "step": 4501 + }, + { + "epoch": 0.78, + "learning_rate": 2.4341579680628637e-06, + "loss": 0.467, + "step": 4502 + }, + { + "epoch": 0.78, + "learning_rate": 2.430489499526438e-06, + "loss": 0.4525, + "step": 4503 + }, + { + "epoch": 0.78, + "learning_rate": 2.4268234149652582e-06, + "loss": 0.4609, + "step": 4504 + }, + { + "epoch": 0.78, + "learning_rate": 2.423159715533937e-06, + "loss": 0.4632, + "step": 4505 + }, + { + "epoch": 0.78, + "learning_rate": 2.419498402386338e-06, + "loss": 0.4622, + "step": 4506 + }, + { + "epoch": 0.78, + "learning_rate": 2.4158394766755645e-06, + "loss": 0.4534, + "step": 4507 + }, + { + "epoch": 0.78, + "learning_rate": 2.4121829395539854e-06, + "loss": 0.4598, + "step": 4508 + }, + { + "epoch": 0.78, + "learning_rate": 2.4085287921731972e-06, + "loss": 0.4521, + "step": 4509 + }, + { + "epoch": 0.78, + "learning_rate": 2.4048770356840577e-06, + "loss": 0.4724, + "step": 4510 + }, + { + "epoch": 0.78, + "learning_rate": 2.401227671236668e-06, + "loss": 0.4629, + "step": 4511 + }, + { + "epoch": 0.78, + "learning_rate": 2.3975806999803717e-06, + "loss": 0.464, + "step": 4512 + }, + { + "epoch": 0.78, + "learning_rate": 2.3939361230637692e-06, + "loss": 0.4545, + "step": 4513 + }, + { + "epoch": 0.78, + "learning_rate": 2.3902939416346917e-06, + "loss": 0.4662, + "step": 4514 + }, + { + "epoch": 0.78, + "learning_rate": 2.386654156840226e-06, + "loss": 0.4666, + "step": 4515 + }, + { + "epoch": 0.78, + "learning_rate": 2.3830167698267038e-06, + "loss": 0.47, + "step": 4516 + }, + { + "epoch": 0.78, + "learning_rate": 2.379381781739699e-06, + "loss": 0.4547, + "step": 4517 + }, + { + "epoch": 0.78, + "learning_rate": 2.375749193724032e-06, + "loss": 0.467, + "step": 4518 + }, + { + "epoch": 0.78, + "learning_rate": 2.3721190069237655e-06, + "loss": 0.4426, + "step": 4519 + }, + { + "epoch": 0.78, + "learning_rate": 2.3684912224822086e-06, + "loss": 0.4621, + "step": 4520 + }, + { + "epoch": 0.78, + "learning_rate": 2.364865841541908e-06, + "loss": 0.4545, + "step": 4521 + }, + { + "epoch": 0.78, + "learning_rate": 2.3612428652446586e-06, + "loss": 0.4635, + "step": 4522 + }, + { + "epoch": 0.78, + "learning_rate": 2.3576222947314962e-06, + "loss": 0.4416, + "step": 4523 + }, + { + "epoch": 0.78, + "learning_rate": 2.354004131142702e-06, + "loss": 0.4648, + "step": 4524 + }, + { + "epoch": 0.78, + "learning_rate": 2.3503883756177935e-06, + "loss": 0.4496, + "step": 4525 + }, + { + "epoch": 0.78, + "learning_rate": 2.346775029295535e-06, + "loss": 0.4434, + "step": 4526 + }, + { + "epoch": 0.78, + "learning_rate": 2.343164093313931e-06, + "loss": 0.4577, + "step": 4527 + }, + { + "epoch": 0.78, + "learning_rate": 2.339555568810221e-06, + "loss": 0.4519, + "step": 4528 + }, + { + "epoch": 0.78, + "learning_rate": 2.3359494569208927e-06, + "loss": 0.4616, + "step": 4529 + }, + { + "epoch": 0.78, + "learning_rate": 2.33234575878167e-06, + "loss": 0.4537, + "step": 4530 + }, + { + "epoch": 0.78, + "learning_rate": 2.328744475527519e-06, + "loss": 0.4601, + "step": 4531 + }, + { + "epoch": 0.79, + "learning_rate": 2.325145608292646e-06, + "loss": 0.4775, + "step": 4532 + }, + { + "epoch": 0.79, + "learning_rate": 2.3215491582104855e-06, + "loss": 0.4496, + "step": 4533 + }, + { + "epoch": 0.79, + "learning_rate": 2.31795512641373e-06, + "loss": 0.464, + "step": 4534 + }, + { + "epoch": 0.79, + "learning_rate": 2.3143635140342936e-06, + "loss": 0.4599, + "step": 4535 + }, + { + "epoch": 0.79, + "learning_rate": 2.310774322203335e-06, + "loss": 0.4569, + "step": 4536 + }, + { + "epoch": 0.79, + "learning_rate": 2.307187552051252e-06, + "loss": 0.4522, + "step": 4537 + }, + { + "epoch": 0.79, + "learning_rate": 2.3036032047076774e-06, + "loss": 0.4585, + "step": 4538 + }, + { + "epoch": 0.79, + "learning_rate": 2.300021281301483e-06, + "loss": 0.4488, + "step": 4539 + }, + { + "epoch": 0.79, + "learning_rate": 2.29644178296077e-06, + "loss": 0.4625, + "step": 4540 + }, + { + "epoch": 0.79, + "learning_rate": 2.292864710812891e-06, + "loss": 0.4622, + "step": 4541 + }, + { + "epoch": 0.79, + "learning_rate": 2.2892900659844154e-06, + "loss": 0.4641, + "step": 4542 + }, + { + "epoch": 0.79, + "learning_rate": 2.2857178496011633e-06, + "loss": 0.4577, + "step": 4543 + }, + { + "epoch": 0.79, + "learning_rate": 2.282148062788182e-06, + "loss": 0.4583, + "step": 4544 + }, + { + "epoch": 0.79, + "learning_rate": 2.278580706669757e-06, + "loss": 0.4488, + "step": 4545 + }, + { + "epoch": 0.79, + "learning_rate": 2.27501578236941e-06, + "loss": 0.475, + "step": 4546 + }, + { + "epoch": 0.79, + "learning_rate": 2.2714532910098885e-06, + "loss": 0.4611, + "step": 4547 + }, + { + "epoch": 0.79, + "learning_rate": 2.267893233713182e-06, + "loss": 0.4476, + "step": 4548 + }, + { + "epoch": 0.79, + "learning_rate": 2.264335611600511e-06, + "loss": 0.4498, + "step": 4549 + }, + { + "epoch": 0.79, + "learning_rate": 2.2607804257923316e-06, + "loss": 0.4632, + "step": 4550 + }, + { + "epoch": 0.79, + "learning_rate": 2.2572276774083212e-06, + "loss": 0.459, + "step": 4551 + }, + { + "epoch": 0.79, + "learning_rate": 2.253677367567406e-06, + "loss": 0.4535, + "step": 4552 + }, + { + "epoch": 0.79, + "learning_rate": 2.2501294973877374e-06, + "loss": 0.4536, + "step": 4553 + }, + { + "epoch": 0.79, + "learning_rate": 2.2465840679866923e-06, + "loss": 0.4461, + "step": 4554 + }, + { + "epoch": 0.79, + "learning_rate": 2.2430410804808842e-06, + "loss": 0.4591, + "step": 4555 + }, + { + "epoch": 0.79, + "learning_rate": 2.23950053598616e-06, + "loss": 0.4658, + "step": 4556 + }, + { + "epoch": 0.79, + "learning_rate": 2.235962435617596e-06, + "loss": 0.4751, + "step": 4557 + }, + { + "epoch": 0.79, + "learning_rate": 2.2324267804894895e-06, + "loss": 0.4686, + "step": 4558 + }, + { + "epoch": 0.79, + "learning_rate": 2.2288935717153825e-06, + "loss": 0.4607, + "step": 4559 + }, + { + "epoch": 0.79, + "learning_rate": 2.2253628104080415e-06, + "loss": 0.4609, + "step": 4560 + }, + { + "epoch": 0.79, + "learning_rate": 2.2218344976794527e-06, + "loss": 0.4695, + "step": 4561 + }, + { + "epoch": 0.79, + "learning_rate": 2.218308634640842e-06, + "loss": 0.4632, + "step": 4562 + }, + { + "epoch": 0.79, + "learning_rate": 2.214785222402661e-06, + "loss": 0.4513, + "step": 4563 + }, + { + "epoch": 0.79, + "learning_rate": 2.2112642620745906e-06, + "loss": 0.4658, + "step": 4564 + }, + { + "epoch": 0.79, + "learning_rate": 2.2077457547655325e-06, + "loss": 0.469, + "step": 4565 + }, + { + "epoch": 0.79, + "learning_rate": 2.204229701583621e-06, + "loss": 0.4621, + "step": 4566 + }, + { + "epoch": 0.79, + "learning_rate": 2.2007161036362255e-06, + "loss": 0.4633, + "step": 4567 + }, + { + "epoch": 0.79, + "learning_rate": 2.1972049620299273e-06, + "loss": 0.4548, + "step": 4568 + }, + { + "epoch": 0.79, + "learning_rate": 2.1936962778705417e-06, + "loss": 0.4551, + "step": 4569 + }, + { + "epoch": 0.79, + "learning_rate": 2.1901900522631114e-06, + "loss": 0.4452, + "step": 4570 + }, + { + "epoch": 0.79, + "learning_rate": 2.186686286311903e-06, + "loss": 0.4726, + "step": 4571 + }, + { + "epoch": 0.79, + "learning_rate": 2.183184981120404e-06, + "loss": 0.4621, + "step": 4572 + }, + { + "epoch": 0.79, + "learning_rate": 2.1796861377913304e-06, + "loss": 0.4636, + "step": 4573 + }, + { + "epoch": 0.79, + "learning_rate": 2.176189757426633e-06, + "loss": 0.4523, + "step": 4574 + }, + { + "epoch": 0.79, + "learning_rate": 2.172695841127468e-06, + "loss": 0.4625, + "step": 4575 + }, + { + "epoch": 0.79, + "learning_rate": 2.1692043899942304e-06, + "loss": 0.4616, + "step": 4576 + }, + { + "epoch": 0.79, + "learning_rate": 2.165715405126525e-06, + "loss": 0.4617, + "step": 4577 + }, + { + "epoch": 0.79, + "learning_rate": 2.1622288876232e-06, + "loss": 0.4576, + "step": 4578 + }, + { + "epoch": 0.79, + "learning_rate": 2.158744838582305e-06, + "loss": 0.4586, + "step": 4579 + }, + { + "epoch": 0.79, + "learning_rate": 2.155263259101127e-06, + "loss": 0.455, + "step": 4580 + }, + { + "epoch": 0.79, + "learning_rate": 2.1517841502761672e-06, + "loss": 0.4683, + "step": 4581 + }, + { + "epoch": 0.79, + "learning_rate": 2.148307513203154e-06, + "loss": 0.4508, + "step": 4582 + }, + { + "epoch": 0.79, + "learning_rate": 2.144833348977037e-06, + "loss": 0.4752, + "step": 4583 + }, + { + "epoch": 0.79, + "learning_rate": 2.141361658691975e-06, + "loss": 0.451, + "step": 4584 + }, + { + "epoch": 0.79, + "learning_rate": 2.1378924434413708e-06, + "loss": 0.4566, + "step": 4585 + }, + { + "epoch": 0.79, + "learning_rate": 2.1344257043178253e-06, + "loss": 0.4511, + "step": 4586 + }, + { + "epoch": 0.79, + "learning_rate": 2.130961442413171e-06, + "loss": 0.4548, + "step": 4587 + }, + { + "epoch": 0.79, + "learning_rate": 2.127499658818458e-06, + "loss": 0.4382, + "step": 4588 + }, + { + "epoch": 0.79, + "learning_rate": 2.1240403546239575e-06, + "loss": 0.4616, + "step": 4589 + }, + { + "epoch": 0.8, + "learning_rate": 2.1205835309191593e-06, + "loss": 0.4636, + "step": 4590 + }, + { + "epoch": 0.8, + "learning_rate": 2.117129188792765e-06, + "loss": 0.4684, + "step": 4591 + }, + { + "epoch": 0.8, + "learning_rate": 2.113677329332704e-06, + "loss": 0.4547, + "step": 4592 + }, + { + "epoch": 0.8, + "learning_rate": 2.1102279536261193e-06, + "loss": 0.4541, + "step": 4593 + }, + { + "epoch": 0.8, + "learning_rate": 2.1067810627593744e-06, + "loss": 0.466, + "step": 4594 + }, + { + "epoch": 0.8, + "learning_rate": 2.1033366578180468e-06, + "loss": 0.4506, + "step": 4595 + }, + { + "epoch": 0.8, + "learning_rate": 2.099894739886933e-06, + "loss": 0.4586, + "step": 4596 + }, + { + "epoch": 0.8, + "learning_rate": 2.0964553100500495e-06, + "loss": 0.4676, + "step": 4597 + }, + { + "epoch": 0.8, + "learning_rate": 2.093018369390619e-06, + "loss": 0.4577, + "step": 4598 + }, + { + "epoch": 0.8, + "learning_rate": 2.0895839189910906e-06, + "loss": 0.4736, + "step": 4599 + }, + { + "epoch": 0.8, + "learning_rate": 2.0861519599331236e-06, + "loss": 0.4652, + "step": 4600 + }, + { + "epoch": 0.8, + "learning_rate": 2.0827224932975963e-06, + "loss": 0.4546, + "step": 4601 + }, + { + "epoch": 0.8, + "learning_rate": 2.0792955201646005e-06, + "loss": 0.4617, + "step": 4602 + }, + { + "epoch": 0.8, + "learning_rate": 2.075871041613441e-06, + "loss": 0.4547, + "step": 4603 + }, + { + "epoch": 0.8, + "learning_rate": 2.07244905872264e-06, + "loss": 0.452, + "step": 4604 + }, + { + "epoch": 0.8, + "learning_rate": 2.0690295725699292e-06, + "loss": 0.4653, + "step": 4605 + }, + { + "epoch": 0.8, + "learning_rate": 2.0656125842322574e-06, + "loss": 0.4589, + "step": 4606 + }, + { + "epoch": 0.8, + "learning_rate": 2.0621980947857865e-06, + "loss": 0.4498, + "step": 4607 + }, + { + "epoch": 0.8, + "learning_rate": 2.0587861053058924e-06, + "loss": 0.4534, + "step": 4608 + }, + { + "epoch": 0.8, + "learning_rate": 2.055376616867164e-06, + "loss": 0.4682, + "step": 4609 + }, + { + "epoch": 0.8, + "learning_rate": 2.0519696305433913e-06, + "loss": 0.4458, + "step": 4610 + }, + { + "epoch": 0.8, + "learning_rate": 2.0485651474075987e-06, + "loss": 0.4538, + "step": 4611 + }, + { + "epoch": 0.8, + "learning_rate": 2.0451631685319995e-06, + "loss": 0.4492, + "step": 4612 + }, + { + "epoch": 0.8, + "learning_rate": 2.0417636949880316e-06, + "loss": 0.468, + "step": 4613 + }, + { + "epoch": 0.8, + "learning_rate": 2.038366727846339e-06, + "loss": 0.4535, + "step": 4614 + }, + { + "epoch": 0.8, + "learning_rate": 2.0349722681767794e-06, + "loss": 0.4561, + "step": 4615 + }, + { + "epoch": 0.8, + "learning_rate": 2.0315803170484204e-06, + "loss": 0.4518, + "step": 4616 + }, + { + "epoch": 0.8, + "learning_rate": 2.028190875529532e-06, + "loss": 0.4601, + "step": 4617 + }, + { + "epoch": 0.8, + "learning_rate": 2.0248039446876078e-06, + "loss": 0.4445, + "step": 4618 + }, + { + "epoch": 0.8, + "learning_rate": 2.0214195255893365e-06, + "loss": 0.466, + "step": 4619 + }, + { + "epoch": 0.8, + "learning_rate": 2.018037619300628e-06, + "loss": 0.4525, + "step": 4620 + }, + { + "epoch": 0.8, + "learning_rate": 2.0146582268865854e-06, + "loss": 0.4705, + "step": 4621 + }, + { + "epoch": 0.8, + "learning_rate": 2.011281349411539e-06, + "loss": 0.4509, + "step": 4622 + }, + { + "epoch": 0.8, + "learning_rate": 2.0079069879390156e-06, + "loss": 0.4678, + "step": 4623 + }, + { + "epoch": 0.8, + "learning_rate": 2.0045351435317484e-06, + "loss": 0.4413, + "step": 4624 + }, + { + "epoch": 0.8, + "learning_rate": 2.0011658172516823e-06, + "loss": 0.4621, + "step": 4625 + }, + { + "epoch": 0.8, + "learning_rate": 1.9977990101599687e-06, + "loss": 0.4394, + "step": 4626 + }, + { + "epoch": 0.8, + "learning_rate": 1.994434723316967e-06, + "loss": 0.4719, + "step": 4627 + }, + { + "epoch": 0.8, + "learning_rate": 1.991072957782233e-06, + "loss": 0.4662, + "step": 4628 + }, + { + "epoch": 0.8, + "learning_rate": 1.987713714614543e-06, + "loss": 0.4462, + "step": 4629 + }, + { + "epoch": 0.8, + "learning_rate": 1.9843569948718744e-06, + "loss": 0.4586, + "step": 4630 + }, + { + "epoch": 0.8, + "learning_rate": 1.981002799611399e-06, + "loss": 0.4606, + "step": 4631 + }, + { + "epoch": 0.8, + "learning_rate": 1.9776511298895064e-06, + "loss": 0.4556, + "step": 4632 + }, + { + "epoch": 0.8, + "learning_rate": 1.9743019867617864e-06, + "loss": 0.4375, + "step": 4633 + }, + { + "epoch": 0.8, + "learning_rate": 1.970955371283034e-06, + "loss": 0.4534, + "step": 4634 + }, + { + "epoch": 0.8, + "learning_rate": 1.9676112845072447e-06, + "loss": 0.4627, + "step": 4635 + }, + { + "epoch": 0.8, + "learning_rate": 1.9642697274876178e-06, + "loss": 0.458, + "step": 4636 + }, + { + "epoch": 0.8, + "learning_rate": 1.9609307012765664e-06, + "loss": 0.4652, + "step": 4637 + }, + { + "epoch": 0.8, + "learning_rate": 1.9575942069256914e-06, + "loss": 0.4623, + "step": 4638 + }, + { + "epoch": 0.8, + "learning_rate": 1.954260245485804e-06, + "loss": 0.4648, + "step": 4639 + }, + { + "epoch": 0.8, + "learning_rate": 1.9509288180069185e-06, + "loss": 0.4569, + "step": 4640 + }, + { + "epoch": 0.8, + "learning_rate": 1.9475999255382516e-06, + "loss": 0.4541, + "step": 4641 + }, + { + "epoch": 0.8, + "learning_rate": 1.944273569128213e-06, + "loss": 0.4501, + "step": 4642 + }, + { + "epoch": 0.8, + "learning_rate": 1.940949749824422e-06, + "loss": 0.4718, + "step": 4643 + }, + { + "epoch": 0.8, + "learning_rate": 1.9376284686737036e-06, + "loss": 0.4484, + "step": 4644 + }, + { + "epoch": 0.8, + "learning_rate": 1.93430972672207e-06, + "loss": 0.4633, + "step": 4645 + }, + { + "epoch": 0.8, + "learning_rate": 1.9309935250147417e-06, + "loss": 0.4523, + "step": 4646 + }, + { + "epoch": 0.81, + "learning_rate": 1.9276798645961392e-06, + "loss": 0.472, + "step": 4647 + }, + { + "epoch": 0.81, + "learning_rate": 1.924368746509884e-06, + "loss": 0.4423, + "step": 4648 + }, + { + "epoch": 0.81, + "learning_rate": 1.9210601717987887e-06, + "loss": 0.4608, + "step": 4649 + }, + { + "epoch": 0.81, + "learning_rate": 1.9177541415048728e-06, + "loss": 0.4644, + "step": 4650 + }, + { + "epoch": 0.81, + "learning_rate": 1.914450656669353e-06, + "loss": 0.4609, + "step": 4651 + }, + { + "epoch": 0.81, + "learning_rate": 1.9111497183326433e-06, + "loss": 0.4549, + "step": 4652 + }, + { + "epoch": 0.81, + "learning_rate": 1.907851327534358e-06, + "loss": 0.4677, + "step": 4653 + }, + { + "epoch": 0.81, + "learning_rate": 1.9045554853132986e-06, + "loss": 0.4447, + "step": 4654 + }, + { + "epoch": 0.81, + "learning_rate": 1.9012621927074849e-06, + "loss": 0.47, + "step": 4655 + }, + { + "epoch": 0.81, + "learning_rate": 1.8979714507541103e-06, + "loss": 0.4524, + "step": 4656 + }, + { + "epoch": 0.81, + "learning_rate": 1.8946832604895805e-06, + "loss": 0.4636, + "step": 4657 + }, + { + "epoch": 0.81, + "learning_rate": 1.8913976229494924e-06, + "loss": 0.4436, + "step": 4658 + }, + { + "epoch": 0.81, + "learning_rate": 1.8881145391686384e-06, + "loss": 0.46, + "step": 4659 + }, + { + "epoch": 0.81, + "learning_rate": 1.8848340101810114e-06, + "loss": 0.4482, + "step": 4660 + }, + { + "epoch": 0.81, + "learning_rate": 1.881556037019787e-06, + "loss": 0.462, + "step": 4661 + }, + { + "epoch": 0.81, + "learning_rate": 1.8782806207173542e-06, + "loss": 0.4598, + "step": 4662 + }, + { + "epoch": 0.81, + "learning_rate": 1.875007762305282e-06, + "loss": 0.4697, + "step": 4663 + }, + { + "epoch": 0.81, + "learning_rate": 1.8717374628143391e-06, + "loss": 0.4405, + "step": 4664 + }, + { + "epoch": 0.81, + "learning_rate": 1.8684697232744886e-06, + "loss": 0.4616, + "step": 4665 + }, + { + "epoch": 0.81, + "learning_rate": 1.865204544714888e-06, + "loss": 0.4561, + "step": 4666 + }, + { + "epoch": 0.81, + "learning_rate": 1.8619419281638883e-06, + "loss": 0.4652, + "step": 4667 + }, + { + "epoch": 0.81, + "learning_rate": 1.8586818746490288e-06, + "loss": 0.4573, + "step": 4668 + }, + { + "epoch": 0.81, + "learning_rate": 1.8554243851970466e-06, + "loss": 0.4537, + "step": 4669 + }, + { + "epoch": 0.81, + "learning_rate": 1.8521694608338692e-06, + "loss": 0.4525, + "step": 4670 + }, + { + "epoch": 0.81, + "learning_rate": 1.8489171025846198e-06, + "loss": 0.4616, + "step": 4671 + }, + { + "epoch": 0.81, + "learning_rate": 1.845667311473608e-06, + "loss": 0.4585, + "step": 4672 + }, + { + "epoch": 0.81, + "learning_rate": 1.842420088524337e-06, + "loss": 0.4477, + "step": 4673 + }, + { + "epoch": 0.81, + "learning_rate": 1.839175434759507e-06, + "loss": 0.4571, + "step": 4674 + }, + { + "epoch": 0.81, + "learning_rate": 1.8359333512009959e-06, + "loss": 0.4569, + "step": 4675 + }, + { + "epoch": 0.81, + "learning_rate": 1.8326938388698846e-06, + "loss": 0.4577, + "step": 4676 + }, + { + "epoch": 0.81, + "learning_rate": 1.829456898786437e-06, + "loss": 0.4595, + "step": 4677 + }, + { + "epoch": 0.81, + "learning_rate": 1.8262225319701122e-06, + "loss": 0.46, + "step": 4678 + }, + { + "epoch": 0.81, + "learning_rate": 1.8229907394395562e-06, + "loss": 0.4578, + "step": 4679 + }, + { + "epoch": 0.81, + "learning_rate": 1.8197615222125975e-06, + "loss": 0.4488, + "step": 4680 + }, + { + "epoch": 0.81, + "learning_rate": 1.81653488130627e-06, + "loss": 0.4586, + "step": 4681 + }, + { + "epoch": 0.81, + "learning_rate": 1.8133108177367787e-06, + "loss": 0.4556, + "step": 4682 + }, + { + "epoch": 0.81, + "learning_rate": 1.810089332519528e-06, + "loss": 0.4613, + "step": 4683 + }, + { + "epoch": 0.81, + "learning_rate": 1.806870426669105e-06, + "loss": 0.4579, + "step": 4684 + }, + { + "epoch": 0.81, + "learning_rate": 1.803654101199288e-06, + "loss": 0.4692, + "step": 4685 + }, + { + "epoch": 0.81, + "learning_rate": 1.8004403571230422e-06, + "loss": 0.4557, + "step": 4686 + }, + { + "epoch": 0.81, + "learning_rate": 1.79722919545251e-06, + "loss": 0.4528, + "step": 4687 + }, + { + "epoch": 0.81, + "learning_rate": 1.7940206171990416e-06, + "loss": 0.4531, + "step": 4688 + }, + { + "epoch": 0.81, + "learning_rate": 1.7908146233731515e-06, + "loss": 0.4565, + "step": 4689 + }, + { + "epoch": 0.81, + "learning_rate": 1.7876112149845526e-06, + "loss": 0.4536, + "step": 4690 + }, + { + "epoch": 0.81, + "learning_rate": 1.7844103930421409e-06, + "loss": 0.474, + "step": 4691 + }, + { + "epoch": 0.81, + "learning_rate": 1.7812121585539964e-06, + "loss": 0.4487, + "step": 4692 + }, + { + "epoch": 0.81, + "learning_rate": 1.7780165125273885e-06, + "loss": 0.467, + "step": 4693 + }, + { + "epoch": 0.81, + "learning_rate": 1.7748234559687628e-06, + "loss": 0.4548, + "step": 4694 + }, + { + "epoch": 0.81, + "learning_rate": 1.771632989883758e-06, + "loss": 0.466, + "step": 4695 + }, + { + "epoch": 0.81, + "learning_rate": 1.7684451152771932e-06, + "loss": 0.4491, + "step": 4696 + }, + { + "epoch": 0.81, + "learning_rate": 1.7652598331530734e-06, + "loss": 0.4749, + "step": 4697 + }, + { + "epoch": 0.81, + "learning_rate": 1.76207714451458e-06, + "loss": 0.4563, + "step": 4698 + }, + { + "epoch": 0.81, + "learning_rate": 1.758897050364089e-06, + "loss": 0.4695, + "step": 4699 + }, + { + "epoch": 0.81, + "learning_rate": 1.7557195517031532e-06, + "loss": 0.4566, + "step": 4700 + }, + { + "epoch": 0.81, + "learning_rate": 1.7525446495325038e-06, + "loss": 0.4568, + "step": 4701 + }, + { + "epoch": 0.81, + "learning_rate": 1.7493723448520616e-06, + "loss": 0.4316, + "step": 4702 + }, + { + "epoch": 0.81, + "learning_rate": 1.7462026386609253e-06, + "loss": 0.4566, + "step": 4703 + }, + { + "epoch": 0.81, + "learning_rate": 1.7430355319573799e-06, + "loss": 0.4596, + "step": 4704 + }, + { + "epoch": 0.82, + "learning_rate": 1.7398710257388784e-06, + "loss": 0.4623, + "step": 4705 + }, + { + "epoch": 0.82, + "learning_rate": 1.7367091210020748e-06, + "loss": 0.4465, + "step": 4706 + }, + { + "epoch": 0.82, + "learning_rate": 1.7335498187427912e-06, + "loss": 0.4636, + "step": 4707 + }, + { + "epoch": 0.82, + "learning_rate": 1.7303931199560286e-06, + "loss": 0.4593, + "step": 4708 + }, + { + "epoch": 0.82, + "learning_rate": 1.7272390256359728e-06, + "loss": 0.4531, + "step": 4709 + }, + { + "epoch": 0.82, + "learning_rate": 1.7240875367759902e-06, + "loss": 0.4527, + "step": 4710 + }, + { + "epoch": 0.82, + "learning_rate": 1.7209386543686247e-06, + "loss": 0.4652, + "step": 4711 + }, + { + "epoch": 0.82, + "learning_rate": 1.7177923794055974e-06, + "loss": 0.4445, + "step": 4712 + }, + { + "epoch": 0.82, + "learning_rate": 1.7146487128778077e-06, + "loss": 0.4478, + "step": 4713 + }, + { + "epoch": 0.82, + "learning_rate": 1.711507655775344e-06, + "loss": 0.4577, + "step": 4714 + }, + { + "epoch": 0.82, + "learning_rate": 1.708369209087457e-06, + "loss": 0.469, + "step": 4715 + }, + { + "epoch": 0.82, + "learning_rate": 1.7052333738025873e-06, + "loss": 0.4628, + "step": 4716 + }, + { + "epoch": 0.82, + "learning_rate": 1.7021001509083457e-06, + "loss": 0.4574, + "step": 4717 + }, + { + "epoch": 0.82, + "learning_rate": 1.6989695413915286e-06, + "loss": 0.4702, + "step": 4718 + }, + { + "epoch": 0.82, + "learning_rate": 1.6958415462380983e-06, + "loss": 0.4528, + "step": 4719 + }, + { + "epoch": 0.82, + "learning_rate": 1.6927161664331992e-06, + "loss": 0.4627, + "step": 4720 + }, + { + "epoch": 0.82, + "learning_rate": 1.6895934029611593e-06, + "loss": 0.4677, + "step": 4721 + }, + { + "epoch": 0.82, + "learning_rate": 1.6864732568054687e-06, + "loss": 0.4795, + "step": 4722 + }, + { + "epoch": 0.82, + "learning_rate": 1.6833557289488046e-06, + "loss": 0.4558, + "step": 4723 + }, + { + "epoch": 0.82, + "learning_rate": 1.6802408203730092e-06, + "loss": 0.4652, + "step": 4724 + }, + { + "epoch": 0.82, + "learning_rate": 1.6771285320591112e-06, + "loss": 0.4564, + "step": 4725 + }, + { + "epoch": 0.82, + "learning_rate": 1.674018864987309e-06, + "loss": 0.4535, + "step": 4726 + }, + { + "epoch": 0.82, + "learning_rate": 1.6709118201369702e-06, + "loss": 0.4631, + "step": 4727 + }, + { + "epoch": 0.82, + "learning_rate": 1.6678073984866438e-06, + "loss": 0.4622, + "step": 4728 + }, + { + "epoch": 0.82, + "learning_rate": 1.6647056010140495e-06, + "loss": 0.4565, + "step": 4729 + }, + { + "epoch": 0.82, + "learning_rate": 1.6616064286960852e-06, + "loss": 0.4612, + "step": 4730 + }, + { + "epoch": 0.82, + "learning_rate": 1.6585098825088086e-06, + "loss": 0.4598, + "step": 4731 + }, + { + "epoch": 0.82, + "learning_rate": 1.6554159634274692e-06, + "loss": 0.4607, + "step": 4732 + }, + { + "epoch": 0.82, + "learning_rate": 1.6523246724264775e-06, + "loss": 0.4438, + "step": 4733 + }, + { + "epoch": 0.82, + "learning_rate": 1.6492360104794158e-06, + "loss": 0.4751, + "step": 4734 + }, + { + "epoch": 0.82, + "learning_rate": 1.6461499785590407e-06, + "loss": 0.458, + "step": 4735 + }, + { + "epoch": 0.82, + "learning_rate": 1.6430665776372834e-06, + "loss": 0.4495, + "step": 4736 + }, + { + "epoch": 0.82, + "learning_rate": 1.639985808685245e-06, + "loss": 0.4514, + "step": 4737 + }, + { + "epoch": 0.82, + "learning_rate": 1.6369076726731913e-06, + "loss": 0.4569, + "step": 4738 + }, + { + "epoch": 0.82, + "learning_rate": 1.6338321705705651e-06, + "loss": 0.465, + "step": 4739 + }, + { + "epoch": 0.82, + "learning_rate": 1.6307593033459856e-06, + "loss": 0.4623, + "step": 4740 + }, + { + "epoch": 0.82, + "learning_rate": 1.6276890719672277e-06, + "loss": 0.4724, + "step": 4741 + }, + { + "epoch": 0.82, + "learning_rate": 1.6246214774012458e-06, + "loss": 0.4702, + "step": 4742 + }, + { + "epoch": 0.82, + "learning_rate": 1.6215565206141638e-06, + "loss": 0.4578, + "step": 4743 + }, + { + "epoch": 0.82, + "learning_rate": 1.6184942025712725e-06, + "loss": 0.4601, + "step": 4744 + }, + { + "epoch": 0.82, + "learning_rate": 1.6154345242370283e-06, + "loss": 0.4708, + "step": 4745 + }, + { + "epoch": 0.82, + "learning_rate": 1.6123774865750607e-06, + "loss": 0.4546, + "step": 4746 + }, + { + "epoch": 0.82, + "learning_rate": 1.6093230905481727e-06, + "loss": 0.448, + "step": 4747 + }, + { + "epoch": 0.82, + "learning_rate": 1.6062713371183225e-06, + "loss": 0.4579, + "step": 4748 + }, + { + "epoch": 0.82, + "learning_rate": 1.6032222272466458e-06, + "loss": 0.455, + "step": 4749 + }, + { + "epoch": 0.82, + "learning_rate": 1.600175761893442e-06, + "loss": 0.4588, + "step": 4750 + }, + { + "epoch": 0.82, + "learning_rate": 1.597131942018182e-06, + "loss": 0.4521, + "step": 4751 + }, + { + "epoch": 0.82, + "learning_rate": 1.5940907685794926e-06, + "loss": 0.4582, + "step": 4752 + }, + { + "epoch": 0.82, + "learning_rate": 1.5910522425351805e-06, + "loss": 0.4698, + "step": 4753 + }, + { + "epoch": 0.82, + "learning_rate": 1.5880163648422099e-06, + "loss": 0.459, + "step": 4754 + }, + { + "epoch": 0.82, + "learning_rate": 1.5849831364567137e-06, + "loss": 0.4697, + "step": 4755 + }, + { + "epoch": 0.82, + "learning_rate": 1.5819525583339945e-06, + "loss": 0.4598, + "step": 4756 + }, + { + "epoch": 0.82, + "learning_rate": 1.5789246314285067e-06, + "loss": 0.4548, + "step": 4757 + }, + { + "epoch": 0.82, + "learning_rate": 1.5758993566938907e-06, + "loss": 0.4695, + "step": 4758 + }, + { + "epoch": 0.82, + "learning_rate": 1.572876735082931e-06, + "loss": 0.4452, + "step": 4759 + }, + { + "epoch": 0.82, + "learning_rate": 1.5698567675475884e-06, + "loss": 0.4694, + "step": 4760 + }, + { + "epoch": 0.82, + "learning_rate": 1.5668394550389854e-06, + "loss": 0.4591, + "step": 4761 + }, + { + "epoch": 0.82, + "learning_rate": 1.5638247985074084e-06, + "loss": 0.4627, + "step": 4762 + }, + { + "epoch": 0.83, + "learning_rate": 1.560812798902307e-06, + "loss": 0.4537, + "step": 4763 + }, + { + "epoch": 0.83, + "learning_rate": 1.5578034571722879e-06, + "loss": 0.4618, + "step": 4764 + }, + { + "epoch": 0.83, + "learning_rate": 1.554796774265137e-06, + "loss": 0.4487, + "step": 4765 + }, + { + "epoch": 0.83, + "learning_rate": 1.5517927511277832e-06, + "loss": 0.473, + "step": 4766 + }, + { + "epoch": 0.83, + "learning_rate": 1.54879138870633e-06, + "loss": 0.449, + "step": 4767 + }, + { + "epoch": 0.83, + "learning_rate": 1.5457926879460404e-06, + "loss": 0.4628, + "step": 4768 + }, + { + "epoch": 0.83, + "learning_rate": 1.5427966497913383e-06, + "loss": 0.4592, + "step": 4769 + }, + { + "epoch": 0.83, + "learning_rate": 1.5398032751858117e-06, + "loss": 0.4573, + "step": 4770 + }, + { + "epoch": 0.83, + "learning_rate": 1.5368125650722021e-06, + "loss": 0.4605, + "step": 4771 + }, + { + "epoch": 0.83, + "learning_rate": 1.5338245203924196e-06, + "loss": 0.4631, + "step": 4772 + }, + { + "epoch": 0.83, + "learning_rate": 1.5308391420875312e-06, + "loss": 0.4626, + "step": 4773 + }, + { + "epoch": 0.83, + "learning_rate": 1.5278564310977673e-06, + "loss": 0.4625, + "step": 4774 + }, + { + "epoch": 0.83, + "learning_rate": 1.5248763883625162e-06, + "loss": 0.4395, + "step": 4775 + }, + { + "epoch": 0.83, + "learning_rate": 1.5218990148203228e-06, + "loss": 0.4601, + "step": 4776 + }, + { + "epoch": 0.83, + "learning_rate": 1.5189243114089003e-06, + "loss": 0.4473, + "step": 4777 + }, + { + "epoch": 0.83, + "learning_rate": 1.5159522790651072e-06, + "loss": 0.4738, + "step": 4778 + }, + { + "epoch": 0.83, + "learning_rate": 1.5129829187249734e-06, + "loss": 0.4613, + "step": 4779 + }, + { + "epoch": 0.83, + "learning_rate": 1.51001623132368e-06, + "loss": 0.4755, + "step": 4780 + }, + { + "epoch": 0.83, + "learning_rate": 1.5070522177955716e-06, + "loss": 0.4657, + "step": 4781 + }, + { + "epoch": 0.83, + "learning_rate": 1.5040908790741448e-06, + "loss": 0.4627, + "step": 4782 + }, + { + "epoch": 0.83, + "learning_rate": 1.5011322160920594e-06, + "loss": 0.4536, + "step": 4783 + }, + { + "epoch": 0.83, + "learning_rate": 1.4981762297811308e-06, + "loss": 0.4732, + "step": 4784 + }, + { + "epoch": 0.83, + "learning_rate": 1.4952229210723257e-06, + "loss": 0.4543, + "step": 4785 + }, + { + "epoch": 0.83, + "learning_rate": 1.4922722908957743e-06, + "loss": 0.4656, + "step": 4786 + }, + { + "epoch": 0.83, + "learning_rate": 1.4893243401807622e-06, + "loss": 0.4625, + "step": 4787 + }, + { + "epoch": 0.83, + "learning_rate": 1.4863790698557301e-06, + "loss": 0.4676, + "step": 4788 + }, + { + "epoch": 0.83, + "learning_rate": 1.483436480848276e-06, + "loss": 0.4544, + "step": 4789 + }, + { + "epoch": 0.83, + "learning_rate": 1.4804965740851451e-06, + "loss": 0.4508, + "step": 4790 + }, + { + "epoch": 0.83, + "learning_rate": 1.4775593504922547e-06, + "loss": 0.4606, + "step": 4791 + }, + { + "epoch": 0.83, + "learning_rate": 1.4746248109946592e-06, + "loss": 0.4709, + "step": 4792 + }, + { + "epoch": 0.83, + "learning_rate": 1.4716929565165784e-06, + "loss": 0.4516, + "step": 4793 + }, + { + "epoch": 0.83, + "learning_rate": 1.4687637879813832e-06, + "loss": 0.4483, + "step": 4794 + }, + { + "epoch": 0.83, + "learning_rate": 1.4658373063115993e-06, + "loss": 0.4591, + "step": 4795 + }, + { + "epoch": 0.83, + "learning_rate": 1.4629135124289084e-06, + "loss": 0.4539, + "step": 4796 + }, + { + "epoch": 0.83, + "learning_rate": 1.459992407254137e-06, + "loss": 0.4629, + "step": 4797 + }, + { + "epoch": 0.83, + "learning_rate": 1.4570739917072752e-06, + "loss": 0.467, + "step": 4798 + }, + { + "epoch": 0.83, + "learning_rate": 1.4541582667074606e-06, + "loss": 0.4629, + "step": 4799 + }, + { + "epoch": 0.83, + "learning_rate": 1.4512452331729864e-06, + "loss": 0.449, + "step": 4800 + }, + { + "epoch": 0.83, + "learning_rate": 1.4483348920212913e-06, + "loss": 0.4625, + "step": 4801 + }, + { + "epoch": 0.83, + "learning_rate": 1.4454272441689764e-06, + "loss": 0.4711, + "step": 4802 + }, + { + "epoch": 0.83, + "learning_rate": 1.4425222905317892e-06, + "loss": 0.4425, + "step": 4803 + }, + { + "epoch": 0.83, + "learning_rate": 1.4396200320246256e-06, + "loss": 0.4623, + "step": 4804 + }, + { + "epoch": 0.83, + "learning_rate": 1.4367204695615367e-06, + "loss": 0.4434, + "step": 4805 + }, + { + "epoch": 0.83, + "learning_rate": 1.433823604055723e-06, + "loss": 0.4681, + "step": 4806 + }, + { + "epoch": 0.83, + "learning_rate": 1.4309294364195403e-06, + "loss": 0.4519, + "step": 4807 + }, + { + "epoch": 0.83, + "learning_rate": 1.4280379675644817e-06, + "loss": 0.4702, + "step": 4808 + }, + { + "epoch": 0.83, + "learning_rate": 1.4251491984012089e-06, + "loss": 0.4474, + "step": 4809 + }, + { + "epoch": 0.83, + "learning_rate": 1.4222631298395207e-06, + "loss": 0.4577, + "step": 4810 + }, + { + "epoch": 0.83, + "learning_rate": 1.4193797627883655e-06, + "loss": 0.4602, + "step": 4811 + }, + { + "epoch": 0.83, + "learning_rate": 1.4164990981558458e-06, + "loss": 0.4659, + "step": 4812 + }, + { + "epoch": 0.83, + "learning_rate": 1.4136211368492104e-06, + "loss": 0.4534, + "step": 4813 + }, + { + "epoch": 0.83, + "learning_rate": 1.4107458797748596e-06, + "loss": 0.4612, + "step": 4814 + }, + { + "epoch": 0.83, + "learning_rate": 1.407873327838335e-06, + "loss": 0.4476, + "step": 4815 + }, + { + "epoch": 0.83, + "learning_rate": 1.4050034819443315e-06, + "loss": 0.4596, + "step": 4816 + }, + { + "epoch": 0.83, + "learning_rate": 1.4021363429966984e-06, + "loss": 0.4659, + "step": 4817 + }, + { + "epoch": 0.83, + "learning_rate": 1.3992719118984167e-06, + "loss": 0.4677, + "step": 4818 + }, + { + "epoch": 0.83, + "learning_rate": 1.3964101895516259e-06, + "loss": 0.4455, + "step": 4819 + }, + { + "epoch": 0.83, + "learning_rate": 1.3935511768576092e-06, + "loss": 0.4581, + "step": 4820 + }, + { + "epoch": 0.84, + "learning_rate": 1.3906948747168003e-06, + "loss": 0.4482, + "step": 4821 + }, + { + "epoch": 0.84, + "learning_rate": 1.3878412840287713e-06, + "loss": 0.4671, + "step": 4822 + }, + { + "epoch": 0.84, + "learning_rate": 1.3849904056922424e-06, + "loss": 0.4556, + "step": 4823 + }, + { + "epoch": 0.84, + "learning_rate": 1.382142240605091e-06, + "loss": 0.4594, + "step": 4824 + }, + { + "epoch": 0.84, + "learning_rate": 1.3792967896643228e-06, + "loss": 0.4634, + "step": 4825 + }, + { + "epoch": 0.84, + "learning_rate": 1.3764540537660997e-06, + "loss": 0.4604, + "step": 4826 + }, + { + "epoch": 0.84, + "learning_rate": 1.3736140338057247e-06, + "loss": 0.461, + "step": 4827 + }, + { + "epoch": 0.84, + "learning_rate": 1.3707767306776498e-06, + "loss": 0.464, + "step": 4828 + }, + { + "epoch": 0.84, + "learning_rate": 1.3679421452754627e-06, + "loss": 0.4531, + "step": 4829 + }, + { + "epoch": 0.84, + "learning_rate": 1.3651102784919024e-06, + "loss": 0.4397, + "step": 4830 + }, + { + "epoch": 0.84, + "learning_rate": 1.3622811312188489e-06, + "loss": 0.4567, + "step": 4831 + }, + { + "epoch": 0.84, + "learning_rate": 1.3594547043473283e-06, + "loss": 0.4537, + "step": 4832 + }, + { + "epoch": 0.84, + "learning_rate": 1.3566309987675087e-06, + "loss": 0.4479, + "step": 4833 + }, + { + "epoch": 0.84, + "learning_rate": 1.353810015368694e-06, + "loss": 0.4678, + "step": 4834 + }, + { + "epoch": 0.84, + "learning_rate": 1.350991755039347e-06, + "loss": 0.4594, + "step": 4835 + }, + { + "epoch": 0.84, + "learning_rate": 1.3481762186670556e-06, + "loss": 0.4674, + "step": 4836 + }, + { + "epoch": 0.84, + "learning_rate": 1.3453634071385591e-06, + "loss": 0.4529, + "step": 4837 + }, + { + "epoch": 0.84, + "learning_rate": 1.342553321339738e-06, + "loss": 0.4667, + "step": 4838 + }, + { + "epoch": 0.84, + "learning_rate": 1.339745962155613e-06, + "loss": 0.4576, + "step": 4839 + }, + { + "epoch": 0.84, + "learning_rate": 1.3369413304703481e-06, + "loss": 0.4639, + "step": 4840 + }, + { + "epoch": 0.84, + "learning_rate": 1.3341394271672403e-06, + "loss": 0.4409, + "step": 4841 + }, + { + "epoch": 0.84, + "learning_rate": 1.3313402531287423e-06, + "loss": 0.4684, + "step": 4842 + }, + { + "epoch": 0.84, + "learning_rate": 1.328543809236431e-06, + "loss": 0.4511, + "step": 4843 + }, + { + "epoch": 0.84, + "learning_rate": 1.3257500963710336e-06, + "loss": 0.4654, + "step": 4844 + }, + { + "epoch": 0.84, + "learning_rate": 1.3229591154124132e-06, + "loss": 0.4546, + "step": 4845 + }, + { + "epoch": 0.84, + "learning_rate": 1.3201708672395762e-06, + "loss": 0.4644, + "step": 4846 + }, + { + "epoch": 0.84, + "learning_rate": 1.3173853527306658e-06, + "loss": 0.449, + "step": 4847 + }, + { + "epoch": 0.84, + "learning_rate": 1.3146025727629618e-06, + "loss": 0.4428, + "step": 4848 + }, + { + "epoch": 0.84, + "learning_rate": 1.3118225282128861e-06, + "loss": 0.457, + "step": 4849 + }, + { + "epoch": 0.84, + "learning_rate": 1.3090452199559988e-06, + "loss": 0.4526, + "step": 4850 + }, + { + "epoch": 0.84, + "learning_rate": 1.3062706488669974e-06, + "loss": 0.4526, + "step": 4851 + }, + { + "epoch": 0.84, + "learning_rate": 1.3034988158197171e-06, + "loss": 0.4551, + "step": 4852 + }, + { + "epoch": 0.84, + "learning_rate": 1.3007297216871328e-06, + "loss": 0.4581, + "step": 4853 + }, + { + "epoch": 0.84, + "learning_rate": 1.2979633673413571e-06, + "loss": 0.4569, + "step": 4854 + }, + { + "epoch": 0.84, + "learning_rate": 1.295199753653633e-06, + "loss": 0.4456, + "step": 4855 + }, + { + "epoch": 0.84, + "learning_rate": 1.2924388814943467e-06, + "loss": 0.4659, + "step": 4856 + }, + { + "epoch": 0.84, + "learning_rate": 1.2896807517330211e-06, + "loss": 0.4537, + "step": 4857 + }, + { + "epoch": 0.84, + "learning_rate": 1.2869253652383141e-06, + "loss": 0.4776, + "step": 4858 + }, + { + "epoch": 0.84, + "learning_rate": 1.2841727228780188e-06, + "loss": 0.4571, + "step": 4859 + }, + { + "epoch": 0.84, + "learning_rate": 1.2814228255190608e-06, + "loss": 0.4624, + "step": 4860 + }, + { + "epoch": 0.84, + "learning_rate": 1.278675674027513e-06, + "loss": 0.4688, + "step": 4861 + }, + { + "epoch": 0.84, + "learning_rate": 1.275931269268569e-06, + "loss": 0.4514, + "step": 4862 + }, + { + "epoch": 0.84, + "learning_rate": 1.2731896121065645e-06, + "loss": 0.4537, + "step": 4863 + }, + { + "epoch": 0.84, + "learning_rate": 1.2704507034049717e-06, + "loss": 0.4614, + "step": 4864 + }, + { + "epoch": 0.84, + "learning_rate": 1.2677145440263927e-06, + "loss": 0.4525, + "step": 4865 + }, + { + "epoch": 0.84, + "learning_rate": 1.2649811348325691e-06, + "loss": 0.4668, + "step": 4866 + }, + { + "epoch": 0.84, + "learning_rate": 1.2622504766843657e-06, + "loss": 0.4593, + "step": 4867 + }, + { + "epoch": 0.84, + "learning_rate": 1.2595225704417958e-06, + "loss": 0.4559, + "step": 4868 + }, + { + "epoch": 0.84, + "learning_rate": 1.2567974169639941e-06, + "loss": 0.455, + "step": 4869 + }, + { + "epoch": 0.84, + "learning_rate": 1.254075017109233e-06, + "loss": 0.4632, + "step": 4870 + }, + { + "epoch": 0.84, + "learning_rate": 1.251355371734918e-06, + "loss": 0.4641, + "step": 4871 + }, + { + "epoch": 0.84, + "learning_rate": 1.248638481697586e-06, + "loss": 0.4654, + "step": 4872 + }, + { + "epoch": 0.84, + "learning_rate": 1.2459243478529094e-06, + "loss": 0.463, + "step": 4873 + }, + { + "epoch": 0.84, + "learning_rate": 1.2432129710556828e-06, + "loss": 0.4561, + "step": 4874 + }, + { + "epoch": 0.84, + "learning_rate": 1.2405043521598448e-06, + "loss": 0.4463, + "step": 4875 + }, + { + "epoch": 0.84, + "learning_rate": 1.2377984920184571e-06, + "loss": 0.4678, + "step": 4876 + }, + { + "epoch": 0.84, + "learning_rate": 1.2350953914837182e-06, + "loss": 0.4545, + "step": 4877 + }, + { + "epoch": 0.85, + "learning_rate": 1.2323950514069483e-06, + "loss": 0.449, + "step": 4878 + }, + { + "epoch": 0.85, + "learning_rate": 1.2296974726386124e-06, + "loss": 0.4515, + "step": 4879 + }, + { + "epoch": 0.85, + "learning_rate": 1.2270026560282955e-06, + "loss": 0.4654, + "step": 4880 + }, + { + "epoch": 0.85, + "learning_rate": 1.224310602424712e-06, + "loss": 0.4686, + "step": 4881 + }, + { + "epoch": 0.85, + "learning_rate": 1.2216213126757115e-06, + "loss": 0.4766, + "step": 4882 + }, + { + "epoch": 0.85, + "learning_rate": 1.2189347876282697e-06, + "loss": 0.4535, + "step": 4883 + }, + { + "epoch": 0.85, + "learning_rate": 1.2162510281284967e-06, + "loss": 0.4661, + "step": 4884 + }, + { + "epoch": 0.85, + "learning_rate": 1.2135700350216207e-06, + "loss": 0.4507, + "step": 4885 + }, + { + "epoch": 0.85, + "learning_rate": 1.2108918091520106e-06, + "loss": 0.465, + "step": 4886 + }, + { + "epoch": 0.85, + "learning_rate": 1.2082163513631595e-06, + "loss": 0.4533, + "step": 4887 + }, + { + "epoch": 0.85, + "learning_rate": 1.2055436624976847e-06, + "loss": 0.4669, + "step": 4888 + }, + { + "epoch": 0.85, + "learning_rate": 1.2028737433973358e-06, + "loss": 0.4558, + "step": 4889 + }, + { + "epoch": 0.85, + "learning_rate": 1.2002065949029896e-06, + "loss": 0.4458, + "step": 4890 + }, + { + "epoch": 0.85, + "learning_rate": 1.1975422178546502e-06, + "loss": 0.4476, + "step": 4891 + }, + { + "epoch": 0.85, + "learning_rate": 1.194880613091447e-06, + "loss": 0.4452, + "step": 4892 + }, + { + "epoch": 0.85, + "learning_rate": 1.1922217814516345e-06, + "loss": 0.4614, + "step": 4893 + }, + { + "epoch": 0.85, + "learning_rate": 1.1895657237726055e-06, + "loss": 0.4551, + "step": 4894 + }, + { + "epoch": 0.85, + "learning_rate": 1.1869124408908627e-06, + "loss": 0.4591, + "step": 4895 + }, + { + "epoch": 0.85, + "learning_rate": 1.1842619336420469e-06, + "loss": 0.4667, + "step": 4896 + }, + { + "epoch": 0.85, + "learning_rate": 1.1816142028609189e-06, + "loss": 0.4559, + "step": 4897 + }, + { + "epoch": 0.85, + "learning_rate": 1.1789692493813688e-06, + "loss": 0.4649, + "step": 4898 + }, + { + "epoch": 0.85, + "learning_rate": 1.1763270740364074e-06, + "loss": 0.454, + "step": 4899 + }, + { + "epoch": 0.85, + "learning_rate": 1.1736876776581706e-06, + "loss": 0.4562, + "step": 4900 + }, + { + "epoch": 0.85, + "learning_rate": 1.1710510610779314e-06, + "loss": 0.4628, + "step": 4901 + }, + { + "epoch": 0.85, + "learning_rate": 1.1684172251260684e-06, + "loss": 0.46, + "step": 4902 + }, + { + "epoch": 0.85, + "learning_rate": 1.1657861706320983e-06, + "loss": 0.4451, + "step": 4903 + }, + { + "epoch": 0.85, + "learning_rate": 1.1631578984246516e-06, + "loss": 0.4541, + "step": 4904 + }, + { + "epoch": 0.85, + "learning_rate": 1.1605324093314951e-06, + "loss": 0.4652, + "step": 4905 + }, + { + "epoch": 0.85, + "learning_rate": 1.1579097041795073e-06, + "loss": 0.4575, + "step": 4906 + }, + { + "epoch": 0.85, + "learning_rate": 1.1552897837946963e-06, + "loss": 0.4577, + "step": 4907 + }, + { + "epoch": 0.85, + "learning_rate": 1.15267264900219e-06, + "loss": 0.4464, + "step": 4908 + }, + { + "epoch": 0.85, + "learning_rate": 1.1500583006262423e-06, + "loss": 0.4721, + "step": 4909 + }, + { + "epoch": 0.85, + "learning_rate": 1.1474467394902288e-06, + "loss": 0.4498, + "step": 4910 + }, + { + "epoch": 0.85, + "learning_rate": 1.144837966416641e-06, + "loss": 0.4546, + "step": 4911 + }, + { + "epoch": 0.85, + "learning_rate": 1.1422319822271044e-06, + "loss": 0.455, + "step": 4912 + }, + { + "epoch": 0.85, + "learning_rate": 1.1396287877423528e-06, + "loss": 0.4723, + "step": 4913 + }, + { + "epoch": 0.85, + "learning_rate": 1.1370283837822515e-06, + "loss": 0.4466, + "step": 4914 + }, + { + "epoch": 0.85, + "learning_rate": 1.134430771165782e-06, + "loss": 0.4503, + "step": 4915 + }, + { + "epoch": 0.85, + "learning_rate": 1.1318359507110489e-06, + "loss": 0.4498, + "step": 4916 + }, + { + "epoch": 0.85, + "learning_rate": 1.1292439232352781e-06, + "loss": 0.4733, + "step": 4917 + }, + { + "epoch": 0.85, + "learning_rate": 1.1266546895548091e-06, + "loss": 0.4523, + "step": 4918 + }, + { + "epoch": 0.85, + "learning_rate": 1.1240682504851108e-06, + "loss": 0.4667, + "step": 4919 + }, + { + "epoch": 0.85, + "learning_rate": 1.1214846068407658e-06, + "loss": 0.4549, + "step": 4920 + }, + { + "epoch": 0.85, + "learning_rate": 1.118903759435479e-06, + "loss": 0.472, + "step": 4921 + }, + { + "epoch": 0.85, + "learning_rate": 1.1163257090820745e-06, + "loss": 0.4643, + "step": 4922 + }, + { + "epoch": 0.85, + "learning_rate": 1.113750456592494e-06, + "loss": 0.4533, + "step": 4923 + }, + { + "epoch": 0.85, + "learning_rate": 1.1111780027778019e-06, + "loss": 0.4689, + "step": 4924 + }, + { + "epoch": 0.85, + "learning_rate": 1.1086083484481735e-06, + "loss": 0.456, + "step": 4925 + }, + { + "epoch": 0.85, + "learning_rate": 1.1060414944129106e-06, + "loss": 0.4549, + "step": 4926 + }, + { + "epoch": 0.85, + "learning_rate": 1.1034774414804273e-06, + "loss": 0.4662, + "step": 4927 + }, + { + "epoch": 0.85, + "learning_rate": 1.1009161904582588e-06, + "loss": 0.4497, + "step": 4928 + }, + { + "epoch": 0.85, + "learning_rate": 1.0983577421530578e-06, + "loss": 0.4653, + "step": 4929 + }, + { + "epoch": 0.85, + "learning_rate": 1.0958020973705918e-06, + "loss": 0.4561, + "step": 4930 + }, + { + "epoch": 0.85, + "learning_rate": 1.0932492569157505e-06, + "loss": 0.4618, + "step": 4931 + }, + { + "epoch": 0.85, + "learning_rate": 1.0906992215925315e-06, + "loss": 0.4698, + "step": 4932 + }, + { + "epoch": 0.85, + "learning_rate": 1.0881519922040574e-06, + "loss": 0.4597, + "step": 4933 + }, + { + "epoch": 0.85, + "learning_rate": 1.0856075695525624e-06, + "loss": 0.4554, + "step": 4934 + }, + { + "epoch": 0.85, + "learning_rate": 1.0830659544393996e-06, + "loss": 0.4674, + "step": 4935 + }, + { + "epoch": 0.86, + "learning_rate": 1.0805271476650382e-06, + "loss": 0.4485, + "step": 4936 + }, + { + "epoch": 0.86, + "learning_rate": 1.077991150029054e-06, + "loss": 0.4717, + "step": 4937 + }, + { + "epoch": 0.86, + "learning_rate": 1.0754579623301564e-06, + "loss": 0.4524, + "step": 4938 + }, + { + "epoch": 0.86, + "learning_rate": 1.0729275853661503e-06, + "loss": 0.4571, + "step": 4939 + }, + { + "epoch": 0.86, + "learning_rate": 1.070400019933966e-06, + "loss": 0.4484, + "step": 4940 + }, + { + "epoch": 0.86, + "learning_rate": 1.0678752668296466e-06, + "loss": 0.45, + "step": 4941 + }, + { + "epoch": 0.86, + "learning_rate": 1.0653533268483495e-06, + "loss": 0.4516, + "step": 4942 + }, + { + "epoch": 0.86, + "learning_rate": 1.0628342007843472e-06, + "loss": 0.4651, + "step": 4943 + }, + { + "epoch": 0.86, + "learning_rate": 1.0603178894310185e-06, + "loss": 0.4505, + "step": 4944 + }, + { + "epoch": 0.86, + "learning_rate": 1.0578043935808702e-06, + "loss": 0.4601, + "step": 4945 + }, + { + "epoch": 0.86, + "learning_rate": 1.055293714025506e-06, + "loss": 0.4572, + "step": 4946 + }, + { + "epoch": 0.86, + "learning_rate": 1.0527858515556565e-06, + "loss": 0.4456, + "step": 4947 + }, + { + "epoch": 0.86, + "learning_rate": 1.0502808069611537e-06, + "loss": 0.4486, + "step": 4948 + }, + { + "epoch": 0.86, + "learning_rate": 1.0477785810309504e-06, + "loss": 0.4663, + "step": 4949 + }, + { + "epoch": 0.86, + "learning_rate": 1.045279174553111e-06, + "loss": 0.4539, + "step": 4950 + }, + { + "epoch": 0.86, + "learning_rate": 1.0427825883148057e-06, + "loss": 0.4528, + "step": 4951 + }, + { + "epoch": 0.86, + "learning_rate": 1.0402888231023212e-06, + "loss": 0.4556, + "step": 4952 + }, + { + "epoch": 0.86, + "learning_rate": 1.0377978797010558e-06, + "loss": 0.4717, + "step": 4953 + }, + { + "epoch": 0.86, + "learning_rate": 1.0353097588955198e-06, + "loss": 0.4627, + "step": 4954 + }, + { + "epoch": 0.86, + "learning_rate": 1.0328244614693285e-06, + "loss": 0.4615, + "step": 4955 + }, + { + "epoch": 0.86, + "learning_rate": 1.0303419882052157e-06, + "loss": 0.4536, + "step": 4956 + }, + { + "epoch": 0.86, + "learning_rate": 1.0278623398850251e-06, + "loss": 0.4785, + "step": 4957 + }, + { + "epoch": 0.86, + "learning_rate": 1.025385517289703e-06, + "loss": 0.4565, + "step": 4958 + }, + { + "epoch": 0.86, + "learning_rate": 1.0229115211993146e-06, + "loss": 0.4651, + "step": 4959 + }, + { + "epoch": 0.86, + "learning_rate": 1.0204403523930284e-06, + "loss": 0.4497, + "step": 4960 + }, + { + "epoch": 0.86, + "learning_rate": 1.0179720116491288e-06, + "loss": 0.4652, + "step": 4961 + }, + { + "epoch": 0.86, + "learning_rate": 1.0155064997450026e-06, + "loss": 0.4497, + "step": 4962 + }, + { + "epoch": 0.86, + "learning_rate": 1.0130438174571478e-06, + "loss": 0.4602, + "step": 4963 + }, + { + "epoch": 0.86, + "learning_rate": 1.0105839655611783e-06, + "loss": 0.4488, + "step": 4964 + }, + { + "epoch": 0.86, + "learning_rate": 1.0081269448318065e-06, + "loss": 0.4584, + "step": 4965 + }, + { + "epoch": 0.86, + "learning_rate": 1.005672756042858e-06, + "loss": 0.4623, + "step": 4966 + }, + { + "epoch": 0.86, + "learning_rate": 1.003221399967267e-06, + "loss": 0.4594, + "step": 4967 + }, + { + "epoch": 0.86, + "learning_rate": 1.0007728773770753e-06, + "loss": 0.4573, + "step": 4968 + }, + { + "epoch": 0.86, + "learning_rate": 9.983271890434277e-07, + "loss": 0.4533, + "step": 4969 + }, + { + "epoch": 0.86, + "learning_rate": 9.95884335736581e-07, + "loss": 0.4615, + "step": 4970 + }, + { + "epoch": 0.86, + "learning_rate": 9.934443182259023e-07, + "loss": 0.454, + "step": 4971 + }, + { + "epoch": 0.86, + "learning_rate": 9.91007137279858e-07, + "loss": 0.465, + "step": 4972 + }, + { + "epoch": 0.86, + "learning_rate": 9.88572793666026e-07, + "loss": 0.4574, + "step": 4973 + }, + { + "epoch": 0.86, + "learning_rate": 9.861412881510891e-07, + "loss": 0.4652, + "step": 4974 + }, + { + "epoch": 0.86, + "learning_rate": 9.837126215008374e-07, + "loss": 0.462, + "step": 4975 + }, + { + "epoch": 0.86, + "learning_rate": 9.81286794480163e-07, + "loss": 0.4675, + "step": 4976 + }, + { + "epoch": 0.86, + "learning_rate": 9.788638078530689e-07, + "loss": 0.4563, + "step": 4977 + }, + { + "epoch": 0.86, + "learning_rate": 9.764436623826601e-07, + "loss": 0.4531, + "step": 4978 + }, + { + "epoch": 0.86, + "learning_rate": 9.740263588311483e-07, + "loss": 0.4551, + "step": 4979 + }, + { + "epoch": 0.86, + "learning_rate": 9.716118979598533e-07, + "loss": 0.4514, + "step": 4980 + }, + { + "epoch": 0.86, + "learning_rate": 9.692002805291888e-07, + "loss": 0.4601, + "step": 4981 + }, + { + "epoch": 0.86, + "learning_rate": 9.667915072986877e-07, + "loss": 0.4578, + "step": 4982 + }, + { + "epoch": 0.86, + "learning_rate": 9.643855790269752e-07, + "loss": 0.4563, + "step": 4983 + }, + { + "epoch": 0.86, + "learning_rate": 9.619824964717873e-07, + "loss": 0.4612, + "step": 4984 + }, + { + "epoch": 0.86, + "learning_rate": 9.595822603899584e-07, + "loss": 0.4669, + "step": 4985 + }, + { + "epoch": 0.86, + "learning_rate": 9.571848715374333e-07, + "loss": 0.4589, + "step": 4986 + }, + { + "epoch": 0.86, + "learning_rate": 9.547903306692562e-07, + "loss": 0.4595, + "step": 4987 + }, + { + "epoch": 0.86, + "learning_rate": 9.523986385395689e-07, + "loss": 0.4563, + "step": 4988 + }, + { + "epoch": 0.86, + "learning_rate": 9.500097959016297e-07, + "loss": 0.4678, + "step": 4989 + }, + { + "epoch": 0.86, + "learning_rate": 9.476238035077855e-07, + "loss": 0.4641, + "step": 4990 + }, + { + "epoch": 0.86, + "learning_rate": 9.452406621094923e-07, + "loss": 0.4592, + "step": 4991 + }, + { + "epoch": 0.86, + "learning_rate": 9.428603724573083e-07, + "loss": 0.4424, + "step": 4992 + }, + { + "epoch": 0.86, + "learning_rate": 9.404829353008915e-07, + "loss": 0.468, + "step": 4993 + }, + { + "epoch": 0.87, + "learning_rate": 9.381083513890055e-07, + "loss": 0.4511, + "step": 4994 + }, + { + "epoch": 0.87, + "learning_rate": 9.357366214695074e-07, + "loss": 0.4791, + "step": 4995 + }, + { + "epoch": 0.87, + "learning_rate": 9.333677462893643e-07, + "loss": 0.4511, + "step": 4996 + }, + { + "epoch": 0.87, + "learning_rate": 9.310017265946381e-07, + "loss": 0.4723, + "step": 4997 + }, + { + "epoch": 0.87, + "learning_rate": 9.286385631304939e-07, + "loss": 0.4526, + "step": 4998 + }, + { + "epoch": 0.87, + "learning_rate": 9.262782566411976e-07, + "loss": 0.4586, + "step": 4999 + }, + { + "epoch": 0.87, + "learning_rate": 9.239208078701145e-07, + "loss": 0.4481, + "step": 5000 + }, + { + "epoch": 0.87, + "learning_rate": 9.215662175597106e-07, + "loss": 0.4651, + "step": 5001 + }, + { + "epoch": 0.87, + "learning_rate": 9.192144864515495e-07, + "loss": 0.4539, + "step": 5002 + }, + { + "epoch": 0.87, + "learning_rate": 9.168656152862965e-07, + "loss": 0.4531, + "step": 5003 + }, + { + "epoch": 0.87, + "learning_rate": 9.145196048037142e-07, + "loss": 0.4536, + "step": 5004 + }, + { + "epoch": 0.87, + "learning_rate": 9.121764557426682e-07, + "loss": 0.4593, + "step": 5005 + }, + { + "epoch": 0.87, + "learning_rate": 9.098361688411206e-07, + "loss": 0.4562, + "step": 5006 + }, + { + "epoch": 0.87, + "learning_rate": 9.074987448361261e-07, + "loss": 0.4564, + "step": 5007 + }, + { + "epoch": 0.87, + "learning_rate": 9.051641844638515e-07, + "loss": 0.4582, + "step": 5008 + }, + { + "epoch": 0.87, + "learning_rate": 9.028324884595474e-07, + "loss": 0.4658, + "step": 5009 + }, + { + "epoch": 0.87, + "learning_rate": 9.00503657557571e-07, + "loss": 0.4541, + "step": 5010 + }, + { + "epoch": 0.87, + "learning_rate": 8.981776924913743e-07, + "loss": 0.4616, + "step": 5011 + }, + { + "epoch": 0.87, + "learning_rate": 8.958545939935059e-07, + "loss": 0.473, + "step": 5012 + }, + { + "epoch": 0.87, + "learning_rate": 8.935343627956172e-07, + "loss": 0.4581, + "step": 5013 + }, + { + "epoch": 0.87, + "learning_rate": 8.912169996284447e-07, + "loss": 0.4719, + "step": 5014 + }, + { + "epoch": 0.87, + "learning_rate": 8.889025052218359e-07, + "loss": 0.4643, + "step": 5015 + }, + { + "epoch": 0.87, + "learning_rate": 8.865908803047241e-07, + "loss": 0.4535, + "step": 5016 + }, + { + "epoch": 0.87, + "learning_rate": 8.842821256051437e-07, + "loss": 0.4527, + "step": 5017 + }, + { + "epoch": 0.87, + "learning_rate": 8.819762418502242e-07, + "loss": 0.4523, + "step": 5018 + }, + { + "epoch": 0.87, + "learning_rate": 8.796732297661914e-07, + "loss": 0.4632, + "step": 5019 + }, + { + "epoch": 0.87, + "learning_rate": 8.773730900783672e-07, + "loss": 0.46, + "step": 5020 + }, + { + "epoch": 0.87, + "learning_rate": 8.750758235111644e-07, + "loss": 0.4641, + "step": 5021 + }, + { + "epoch": 0.87, + "learning_rate": 8.727814307880956e-07, + "loss": 0.4665, + "step": 5022 + }, + { + "epoch": 0.87, + "learning_rate": 8.70489912631769e-07, + "loss": 0.4578, + "step": 5023 + }, + { + "epoch": 0.87, + "learning_rate": 8.682012697638842e-07, + "loss": 0.4444, + "step": 5024 + }, + { + "epoch": 0.87, + "learning_rate": 8.659155029052346e-07, + "loss": 0.4695, + "step": 5025 + }, + { + "epoch": 0.87, + "learning_rate": 8.636326127757121e-07, + "loss": 0.4478, + "step": 5026 + }, + { + "epoch": 0.87, + "learning_rate": 8.613526000943029e-07, + "loss": 0.4595, + "step": 5027 + }, + { + "epoch": 0.87, + "learning_rate": 8.590754655790779e-07, + "loss": 0.4503, + "step": 5028 + }, + { + "epoch": 0.87, + "learning_rate": 8.568012099472123e-07, + "loss": 0.4582, + "step": 5029 + }, + { + "epoch": 0.87, + "learning_rate": 8.545298339149699e-07, + "loss": 0.4453, + "step": 5030 + }, + { + "epoch": 0.87, + "learning_rate": 8.522613381977074e-07, + "loss": 0.4529, + "step": 5031 + }, + { + "epoch": 0.87, + "learning_rate": 8.499957235098722e-07, + "loss": 0.4622, + "step": 5032 + }, + { + "epoch": 0.87, + "learning_rate": 8.477329905650111e-07, + "loss": 0.4481, + "step": 5033 + }, + { + "epoch": 0.87, + "learning_rate": 8.454731400757599e-07, + "loss": 0.4599, + "step": 5034 + }, + { + "epoch": 0.87, + "learning_rate": 8.432161727538424e-07, + "loss": 0.4695, + "step": 5035 + }, + { + "epoch": 0.87, + "learning_rate": 8.40962089310079e-07, + "loss": 0.458, + "step": 5036 + }, + { + "epoch": 0.87, + "learning_rate": 8.387108904543817e-07, + "loss": 0.4724, + "step": 5037 + }, + { + "epoch": 0.87, + "learning_rate": 8.364625768957535e-07, + "loss": 0.4504, + "step": 5038 + }, + { + "epoch": 0.87, + "learning_rate": 8.342171493422857e-07, + "loss": 0.4565, + "step": 5039 + }, + { + "epoch": 0.87, + "learning_rate": 8.319746085011627e-07, + "loss": 0.4611, + "step": 5040 + }, + { + "epoch": 0.87, + "learning_rate": 8.29734955078666e-07, + "loss": 0.4466, + "step": 5041 + }, + { + "epoch": 0.87, + "learning_rate": 8.274981897801571e-07, + "loss": 0.4624, + "step": 5042 + }, + { + "epoch": 0.87, + "learning_rate": 8.252643133100935e-07, + "loss": 0.4657, + "step": 5043 + }, + { + "epoch": 0.87, + "learning_rate": 8.230333263720225e-07, + "loss": 0.4603, + "step": 5044 + }, + { + "epoch": 0.87, + "learning_rate": 8.208052296685842e-07, + "loss": 0.4567, + "step": 5045 + }, + { + "epoch": 0.87, + "learning_rate": 8.185800239014996e-07, + "loss": 0.4677, + "step": 5046 + }, + { + "epoch": 0.87, + "learning_rate": 8.163577097715858e-07, + "loss": 0.4488, + "step": 5047 + }, + { + "epoch": 0.87, + "learning_rate": 8.14138287978754e-07, + "loss": 0.4595, + "step": 5048 + }, + { + "epoch": 0.87, + "learning_rate": 8.119217592219919e-07, + "loss": 0.4542, + "step": 5049 + }, + { + "epoch": 0.87, + "learning_rate": 8.097081241993865e-07, + "loss": 0.4716, + "step": 5050 + }, + { + "epoch": 0.88, + "learning_rate": 8.074973836081102e-07, + "loss": 0.4672, + "step": 5051 + }, + { + "epoch": 0.88, + "learning_rate": 8.052895381444226e-07, + "loss": 0.466, + "step": 5052 + }, + { + "epoch": 0.88, + "learning_rate": 8.03084588503672e-07, + "loss": 0.4464, + "step": 5053 + }, + { + "epoch": 0.88, + "learning_rate": 8.008825353802941e-07, + "loss": 0.4606, + "step": 5054 + }, + { + "epoch": 0.88, + "learning_rate": 7.986833794678139e-07, + "loss": 0.4489, + "step": 5055 + }, + { + "epoch": 0.88, + "learning_rate": 7.964871214588455e-07, + "loss": 0.4624, + "step": 5056 + }, + { + "epoch": 0.88, + "learning_rate": 7.942937620450864e-07, + "loss": 0.4473, + "step": 5057 + }, + { + "epoch": 0.88, + "learning_rate": 7.921033019173208e-07, + "loss": 0.4586, + "step": 5058 + }, + { + "epoch": 0.88, + "learning_rate": 7.899157417654268e-07, + "loss": 0.438, + "step": 5059 + }, + { + "epoch": 0.88, + "learning_rate": 7.877310822783613e-07, + "loss": 0.4699, + "step": 5060 + }, + { + "epoch": 0.88, + "learning_rate": 7.855493241441692e-07, + "loss": 0.4454, + "step": 5061 + }, + { + "epoch": 0.88, + "learning_rate": 7.833704680499865e-07, + "loss": 0.4564, + "step": 5062 + }, + { + "epoch": 0.88, + "learning_rate": 7.811945146820299e-07, + "loss": 0.4445, + "step": 5063 + }, + { + "epoch": 0.88, + "learning_rate": 7.790214647256044e-07, + "loss": 0.4501, + "step": 5064 + }, + { + "epoch": 0.88, + "learning_rate": 7.768513188650995e-07, + "loss": 0.4467, + "step": 5065 + }, + { + "epoch": 0.88, + "learning_rate": 7.746840777839903e-07, + "loss": 0.4536, + "step": 5066 + }, + { + "epoch": 0.88, + "learning_rate": 7.725197421648389e-07, + "loss": 0.4573, + "step": 5067 + }, + { + "epoch": 0.88, + "learning_rate": 7.703583126892889e-07, + "loss": 0.4523, + "step": 5068 + }, + { + "epoch": 0.88, + "learning_rate": 7.68199790038071e-07, + "loss": 0.4583, + "step": 5069 + }, + { + "epoch": 0.88, + "learning_rate": 7.660441748909997e-07, + "loss": 0.4582, + "step": 5070 + }, + { + "epoch": 0.88, + "learning_rate": 7.638914679269772e-07, + "loss": 0.4713, + "step": 5071 + }, + { + "epoch": 0.88, + "learning_rate": 7.617416698239821e-07, + "loss": 0.457, + "step": 5072 + }, + { + "epoch": 0.88, + "learning_rate": 7.595947812590832e-07, + "loss": 0.4698, + "step": 5073 + }, + { + "epoch": 0.88, + "learning_rate": 7.574508029084315e-07, + "loss": 0.4464, + "step": 5074 + }, + { + "epoch": 0.88, + "learning_rate": 7.553097354472594e-07, + "loss": 0.4571, + "step": 5075 + }, + { + "epoch": 0.88, + "learning_rate": 7.531715795498861e-07, + "loss": 0.4684, + "step": 5076 + }, + { + "epoch": 0.88, + "learning_rate": 7.510363358897122e-07, + "loss": 0.4622, + "step": 5077 + }, + { + "epoch": 0.88, + "learning_rate": 7.489040051392204e-07, + "loss": 0.4578, + "step": 5078 + }, + { + "epoch": 0.88, + "learning_rate": 7.46774587969975e-07, + "loss": 0.4585, + "step": 5079 + }, + { + "epoch": 0.88, + "learning_rate": 7.446480850526239e-07, + "loss": 0.4705, + "step": 5080 + }, + { + "epoch": 0.88, + "learning_rate": 7.42524497056899e-07, + "loss": 0.4539, + "step": 5081 + }, + { + "epoch": 0.88, + "learning_rate": 7.404038246516121e-07, + "loss": 0.4642, + "step": 5082 + }, + { + "epoch": 0.88, + "learning_rate": 7.382860685046589e-07, + "loss": 0.4536, + "step": 5083 + }, + { + "epoch": 0.88, + "learning_rate": 7.361712292830092e-07, + "loss": 0.4567, + "step": 5084 + }, + { + "epoch": 0.88, + "learning_rate": 7.34059307652728e-07, + "loss": 0.445, + "step": 5085 + }, + { + "epoch": 0.88, + "learning_rate": 7.319503042789467e-07, + "loss": 0.4614, + "step": 5086 + }, + { + "epoch": 0.88, + "learning_rate": 7.298442198258871e-07, + "loss": 0.4591, + "step": 5087 + }, + { + "epoch": 0.88, + "learning_rate": 7.277410549568476e-07, + "loss": 0.4633, + "step": 5088 + }, + { + "epoch": 0.88, + "learning_rate": 7.256408103342095e-07, + "loss": 0.4729, + "step": 5089 + }, + { + "epoch": 0.88, + "learning_rate": 7.235434866194335e-07, + "loss": 0.4619, + "step": 5090 + }, + { + "epoch": 0.88, + "learning_rate": 7.21449084473056e-07, + "loss": 0.4475, + "step": 5091 + }, + { + "epoch": 0.88, + "learning_rate": 7.193576045547034e-07, + "loss": 0.4671, + "step": 5092 + }, + { + "epoch": 0.88, + "learning_rate": 7.172690475230715e-07, + "loss": 0.4508, + "step": 5093 + }, + { + "epoch": 0.88, + "learning_rate": 7.151834140359404e-07, + "loss": 0.4615, + "step": 5094 + }, + { + "epoch": 0.88, + "learning_rate": 7.131007047501703e-07, + "loss": 0.4544, + "step": 5095 + }, + { + "epoch": 0.88, + "learning_rate": 7.110209203216967e-07, + "loss": 0.4578, + "step": 5096 + }, + { + "epoch": 0.88, + "learning_rate": 7.089440614055398e-07, + "loss": 0.4484, + "step": 5097 + }, + { + "epoch": 0.88, + "learning_rate": 7.0687012865579e-07, + "loss": 0.469, + "step": 5098 + }, + { + "epoch": 0.88, + "learning_rate": 7.047991227256235e-07, + "loss": 0.4572, + "step": 5099 + }, + { + "epoch": 0.88, + "learning_rate": 7.027310442672919e-07, + "loss": 0.4567, + "step": 5100 + }, + { + "epoch": 0.88, + "learning_rate": 7.006658939321265e-07, + "loss": 0.4512, + "step": 5101 + }, + { + "epoch": 0.88, + "learning_rate": 6.986036723705303e-07, + "loss": 0.4632, + "step": 5102 + }, + { + "epoch": 0.88, + "learning_rate": 6.965443802319927e-07, + "loss": 0.4554, + "step": 5103 + }, + { + "epoch": 0.88, + "learning_rate": 6.94488018165077e-07, + "loss": 0.4726, + "step": 5104 + }, + { + "epoch": 0.88, + "learning_rate": 6.924345868174187e-07, + "loss": 0.4646, + "step": 5105 + }, + { + "epoch": 0.88, + "learning_rate": 6.903840868357382e-07, + "loss": 0.4524, + "step": 5106 + }, + { + "epoch": 0.88, + "learning_rate": 6.883365188658275e-07, + "loss": 0.4595, + "step": 5107 + }, + { + "epoch": 0.88, + "learning_rate": 6.86291883552559e-07, + "loss": 0.4705, + "step": 5108 + }, + { + "epoch": 0.89, + "learning_rate": 6.842501815398739e-07, + "loss": 0.4498, + "step": 5109 + }, + { + "epoch": 0.89, + "learning_rate": 6.822114134707991e-07, + "loss": 0.4548, + "step": 5110 + }, + { + "epoch": 0.89, + "learning_rate": 6.801755799874354e-07, + "loss": 0.4446, + "step": 5111 + }, + { + "epoch": 0.89, + "learning_rate": 6.781426817309522e-07, + "loss": 0.468, + "step": 5112 + }, + { + "epoch": 0.89, + "learning_rate": 6.761127193416018e-07, + "loss": 0.4473, + "step": 5113 + }, + { + "epoch": 0.89, + "learning_rate": 6.740856934587092e-07, + "loss": 0.4591, + "step": 5114 + }, + { + "epoch": 0.89, + "learning_rate": 6.720616047206774e-07, + "loss": 0.4511, + "step": 5115 + }, + { + "epoch": 0.89, + "learning_rate": 6.700404537649774e-07, + "loss": 0.4592, + "step": 5116 + }, + { + "epoch": 0.89, + "learning_rate": 6.680222412281601e-07, + "loss": 0.4624, + "step": 5117 + }, + { + "epoch": 0.89, + "learning_rate": 6.660069677458558e-07, + "loss": 0.4514, + "step": 5118 + }, + { + "epoch": 0.89, + "learning_rate": 6.63994633952757e-07, + "loss": 0.4521, + "step": 5119 + }, + { + "epoch": 0.89, + "learning_rate": 6.61985240482641e-07, + "loss": 0.4507, + "step": 5120 + }, + { + "epoch": 0.89, + "learning_rate": 6.59978787968354e-07, + "loss": 0.444, + "step": 5121 + }, + { + "epoch": 0.89, + "learning_rate": 6.57975277041818e-07, + "loss": 0.4518, + "step": 5122 + }, + { + "epoch": 0.89, + "learning_rate": 6.559747083340251e-07, + "loss": 0.4569, + "step": 5123 + }, + { + "epoch": 0.89, + "learning_rate": 6.539770824750447e-07, + "loss": 0.4668, + "step": 5124 + }, + { + "epoch": 0.89, + "learning_rate": 6.519824000940178e-07, + "loss": 0.4592, + "step": 5125 + }, + { + "epoch": 0.89, + "learning_rate": 6.499906618191598e-07, + "loss": 0.454, + "step": 5126 + }, + { + "epoch": 0.89, + "learning_rate": 6.480018682777578e-07, + "loss": 0.4523, + "step": 5127 + }, + { + "epoch": 0.89, + "learning_rate": 6.460160200961662e-07, + "loss": 0.4587, + "step": 5128 + }, + { + "epoch": 0.89, + "learning_rate": 6.44033117899825e-07, + "loss": 0.453, + "step": 5129 + }, + { + "epoch": 0.89, + "learning_rate": 6.42053162313232e-07, + "loss": 0.4617, + "step": 5130 + }, + { + "epoch": 0.89, + "learning_rate": 6.400761539599653e-07, + "loss": 0.4639, + "step": 5131 + }, + { + "epoch": 0.89, + "learning_rate": 6.381020934626725e-07, + "loss": 0.4716, + "step": 5132 + }, + { + "epoch": 0.89, + "learning_rate": 6.361309814430727e-07, + "loss": 0.4593, + "step": 5133 + }, + { + "epoch": 0.89, + "learning_rate": 6.341628185219583e-07, + "loss": 0.4644, + "step": 5134 + }, + { + "epoch": 0.89, + "learning_rate": 6.32197605319187e-07, + "loss": 0.4514, + "step": 5135 + }, + { + "epoch": 0.89, + "learning_rate": 6.302353424536977e-07, + "loss": 0.463, + "step": 5136 + }, + { + "epoch": 0.89, + "learning_rate": 6.282760305434899e-07, + "loss": 0.4533, + "step": 5137 + }, + { + "epoch": 0.89, + "learning_rate": 6.263196702056395e-07, + "loss": 0.4631, + "step": 5138 + }, + { + "epoch": 0.89, + "learning_rate": 6.243662620562918e-07, + "loss": 0.4516, + "step": 5139 + }, + { + "epoch": 0.89, + "learning_rate": 6.2241580671066e-07, + "loss": 0.4672, + "step": 5140 + }, + { + "epoch": 0.89, + "learning_rate": 6.204683047830318e-07, + "loss": 0.4477, + "step": 5141 + }, + { + "epoch": 0.89, + "learning_rate": 6.185237568867597e-07, + "loss": 0.4681, + "step": 5142 + }, + { + "epoch": 0.89, + "learning_rate": 6.165821636342684e-07, + "loss": 0.46, + "step": 5143 + }, + { + "epoch": 0.89, + "learning_rate": 6.146435256370531e-07, + "loss": 0.4561, + "step": 5144 + }, + { + "epoch": 0.89, + "learning_rate": 6.127078435056766e-07, + "loss": 0.4513, + "step": 5145 + }, + { + "epoch": 0.89, + "learning_rate": 6.107751178497722e-07, + "loss": 0.4697, + "step": 5146 + }, + { + "epoch": 0.89, + "learning_rate": 6.088453492780388e-07, + "loss": 0.4588, + "step": 5147 + }, + { + "epoch": 0.89, + "learning_rate": 6.069185383982501e-07, + "loss": 0.4566, + "step": 5148 + }, + { + "epoch": 0.89, + "learning_rate": 6.049946858172395e-07, + "loss": 0.4567, + "step": 5149 + }, + { + "epoch": 0.89, + "learning_rate": 6.030737921409169e-07, + "loss": 0.4653, + "step": 5150 + }, + { + "epoch": 0.89, + "learning_rate": 6.01155857974256e-07, + "loss": 0.4675, + "step": 5151 + }, + { + "epoch": 0.89, + "learning_rate": 5.992408839213005e-07, + "loss": 0.4616, + "step": 5152 + }, + { + "epoch": 0.89, + "learning_rate": 5.973288705851587e-07, + "loss": 0.4532, + "step": 5153 + }, + { + "epoch": 0.89, + "learning_rate": 5.954198185680116e-07, + "loss": 0.461, + "step": 5154 + }, + { + "epoch": 0.89, + "learning_rate": 5.935137284711035e-07, + "loss": 0.4463, + "step": 5155 + }, + { + "epoch": 0.89, + "learning_rate": 5.916106008947454e-07, + "loss": 0.4567, + "step": 5156 + }, + { + "epoch": 0.89, + "learning_rate": 5.897104364383177e-07, + "loss": 0.4475, + "step": 5157 + }, + { + "epoch": 0.89, + "learning_rate": 5.878132357002663e-07, + "loss": 0.46, + "step": 5158 + }, + { + "epoch": 0.89, + "learning_rate": 5.859189992781045e-07, + "loss": 0.461, + "step": 5159 + }, + { + "epoch": 0.89, + "learning_rate": 5.840277277684136e-07, + "loss": 0.4669, + "step": 5160 + }, + { + "epoch": 0.89, + "learning_rate": 5.821394217668331e-07, + "loss": 0.4558, + "step": 5161 + }, + { + "epoch": 0.89, + "learning_rate": 5.802540818680814e-07, + "loss": 0.4566, + "step": 5162 + }, + { + "epoch": 0.89, + "learning_rate": 5.783717086659302e-07, + "loss": 0.4571, + "step": 5163 + }, + { + "epoch": 0.89, + "learning_rate": 5.764923027532265e-07, + "loss": 0.4475, + "step": 5164 + }, + { + "epoch": 0.89, + "learning_rate": 5.746158647218758e-07, + "loss": 0.4604, + "step": 5165 + }, + { + "epoch": 0.89, + "learning_rate": 5.727423951628541e-07, + "loss": 0.4634, + "step": 5166 + }, + { + "epoch": 0.9, + "learning_rate": 5.708718946662006e-07, + "loss": 0.4448, + "step": 5167 + }, + { + "epoch": 0.9, + "learning_rate": 5.690043638210141e-07, + "loss": 0.4579, + "step": 5168 + }, + { + "epoch": 0.9, + "learning_rate": 5.671398032154707e-07, + "loss": 0.4518, + "step": 5169 + }, + { + "epoch": 0.9, + "learning_rate": 5.652782134367974e-07, + "loss": 0.4809, + "step": 5170 + }, + { + "epoch": 0.9, + "learning_rate": 5.634195950712939e-07, + "loss": 0.4448, + "step": 5171 + }, + { + "epoch": 0.9, + "learning_rate": 5.615639487043201e-07, + "loss": 0.4605, + "step": 5172 + }, + { + "epoch": 0.9, + "learning_rate": 5.597112749203038e-07, + "loss": 0.4666, + "step": 5173 + }, + { + "epoch": 0.9, + "learning_rate": 5.578615743027338e-07, + "loss": 0.4622, + "step": 5174 + }, + { + "epoch": 0.9, + "learning_rate": 5.56014847434162e-07, + "loss": 0.4523, + "step": 5175 + }, + { + "epoch": 0.9, + "learning_rate": 5.541710948962043e-07, + "loss": 0.4649, + "step": 5176 + }, + { + "epoch": 0.9, + "learning_rate": 5.523303172695427e-07, + "loss": 0.457, + "step": 5177 + }, + { + "epoch": 0.9, + "learning_rate": 5.504925151339191e-07, + "loss": 0.453, + "step": 5178 + }, + { + "epoch": 0.9, + "learning_rate": 5.48657689068135e-07, + "loss": 0.4498, + "step": 5179 + }, + { + "epoch": 0.9, + "learning_rate": 5.468258396500636e-07, + "loss": 0.4503, + "step": 5180 + }, + { + "epoch": 0.9, + "learning_rate": 5.449969674566369e-07, + "loss": 0.4526, + "step": 5181 + }, + { + "epoch": 0.9, + "learning_rate": 5.431710730638428e-07, + "loss": 0.4634, + "step": 5182 + }, + { + "epoch": 0.9, + "learning_rate": 5.413481570467382e-07, + "loss": 0.463, + "step": 5183 + }, + { + "epoch": 0.9, + "learning_rate": 5.395282199794427e-07, + "loss": 0.4711, + "step": 5184 + }, + { + "epoch": 0.9, + "learning_rate": 5.377112624351355e-07, + "loss": 0.4462, + "step": 5185 + }, + { + "epoch": 0.9, + "learning_rate": 5.358972849860533e-07, + "loss": 0.4639, + "step": 5186 + }, + { + "epoch": 0.9, + "learning_rate": 5.340862882034992e-07, + "loss": 0.4728, + "step": 5187 + }, + { + "epoch": 0.9, + "learning_rate": 5.322782726578413e-07, + "loss": 0.4621, + "step": 5188 + }, + { + "epoch": 0.9, + "learning_rate": 5.304732389184986e-07, + "loss": 0.4525, + "step": 5189 + }, + { + "epoch": 0.9, + "learning_rate": 5.286711875539585e-07, + "loss": 0.4578, + "step": 5190 + }, + { + "epoch": 0.9, + "learning_rate": 5.268721191317683e-07, + "loss": 0.4595, + "step": 5191 + }, + { + "epoch": 0.9, + "learning_rate": 5.250760342185335e-07, + "loss": 0.4715, + "step": 5192 + }, + { + "epoch": 0.9, + "learning_rate": 5.232829333799205e-07, + "loss": 0.4446, + "step": 5193 + }, + { + "epoch": 0.9, + "learning_rate": 5.214928171806543e-07, + "loss": 0.4671, + "step": 5194 + }, + { + "epoch": 0.9, + "learning_rate": 5.197056861845284e-07, + "loss": 0.4451, + "step": 5195 + }, + { + "epoch": 0.9, + "learning_rate": 5.179215409543848e-07, + "loss": 0.4566, + "step": 5196 + }, + { + "epoch": 0.9, + "learning_rate": 5.161403820521305e-07, + "loss": 0.453, + "step": 5197 + }, + { + "epoch": 0.9, + "learning_rate": 5.143622100387336e-07, + "loss": 0.4703, + "step": 5198 + }, + { + "epoch": 0.9, + "learning_rate": 5.125870254742182e-07, + "loss": 0.4572, + "step": 5199 + }, + { + "epoch": 0.9, + "learning_rate": 5.108148289176685e-07, + "loss": 0.4512, + "step": 5200 + }, + { + "epoch": 0.9, + "learning_rate": 5.090456209272276e-07, + "loss": 0.4503, + "step": 5201 + }, + { + "epoch": 0.9, + "learning_rate": 5.07279402060099e-07, + "loss": 0.4579, + "step": 5202 + }, + { + "epoch": 0.9, + "learning_rate": 5.055161728725433e-07, + "loss": 0.4633, + "step": 5203 + }, + { + "epoch": 0.9, + "learning_rate": 5.037559339198805e-07, + "loss": 0.4608, + "step": 5204 + }, + { + "epoch": 0.9, + "learning_rate": 5.01998685756484e-07, + "loss": 0.4749, + "step": 5205 + }, + { + "epoch": 0.9, + "learning_rate": 5.002444289357955e-07, + "loss": 0.4498, + "step": 5206 + }, + { + "epoch": 0.9, + "learning_rate": 4.984931640103041e-07, + "loss": 0.4603, + "step": 5207 + }, + { + "epoch": 0.9, + "learning_rate": 4.967448915315609e-07, + "loss": 0.4564, + "step": 5208 + }, + { + "epoch": 0.9, + "learning_rate": 4.949996120501765e-07, + "loss": 0.4628, + "step": 5209 + }, + { + "epoch": 0.9, + "learning_rate": 4.932573261158169e-07, + "loss": 0.4512, + "step": 5210 + }, + { + "epoch": 0.9, + "learning_rate": 4.915180342772053e-07, + "loss": 0.4598, + "step": 5211 + }, + { + "epoch": 0.9, + "learning_rate": 4.89781737082119e-07, + "loss": 0.4577, + "step": 5212 + }, + { + "epoch": 0.9, + "learning_rate": 4.880484350774007e-07, + "loss": 0.4422, + "step": 5213 + }, + { + "epoch": 0.9, + "learning_rate": 4.863181288089391e-07, + "loss": 0.4424, + "step": 5214 + }, + { + "epoch": 0.9, + "learning_rate": 4.845908188216874e-07, + "loss": 0.4558, + "step": 5215 + }, + { + "epoch": 0.9, + "learning_rate": 4.828665056596504e-07, + "loss": 0.4684, + "step": 5216 + }, + { + "epoch": 0.9, + "learning_rate": 4.811451898658925e-07, + "loss": 0.4609, + "step": 5217 + }, + { + "epoch": 0.9, + "learning_rate": 4.794268719825334e-07, + "loss": 0.4519, + "step": 5218 + }, + { + "epoch": 0.9, + "learning_rate": 4.777115525507447e-07, + "loss": 0.4669, + "step": 5219 + }, + { + "epoch": 0.9, + "learning_rate": 4.759992321107587e-07, + "loss": 0.453, + "step": 5220 + }, + { + "epoch": 0.9, + "learning_rate": 4.7428991120186065e-07, + "loss": 0.4488, + "step": 5221 + }, + { + "epoch": 0.9, + "learning_rate": 4.725835903623921e-07, + "loss": 0.4633, + "step": 5222 + }, + { + "epoch": 0.9, + "learning_rate": 4.708802701297499e-07, + "loss": 0.4644, + "step": 5223 + }, + { + "epoch": 0.9, + "learning_rate": 4.6917995104038384e-07, + "loss": 0.4636, + "step": 5224 + }, + { + "epoch": 0.91, + "learning_rate": 4.6748263362980105e-07, + "loss": 0.466, + "step": 5225 + }, + { + "epoch": 0.91, + "learning_rate": 4.6578831843256176e-07, + "loss": 0.4613, + "step": 5226 + }, + { + "epoch": 0.91, + "learning_rate": 4.6409700598228025e-07, + "loss": 0.4511, + "step": 5227 + }, + { + "epoch": 0.91, + "learning_rate": 4.6240869681162814e-07, + "loss": 0.453, + "step": 5228 + }, + { + "epoch": 0.91, + "learning_rate": 4.607233914523268e-07, + "loss": 0.4489, + "step": 5229 + }, + { + "epoch": 0.91, + "learning_rate": 4.590410904351561e-07, + "loss": 0.4446, + "step": 5230 + }, + { + "epoch": 0.91, + "learning_rate": 4.573617942899433e-07, + "loss": 0.4639, + "step": 5231 + }, + { + "epoch": 0.91, + "learning_rate": 4.556855035455787e-07, + "loss": 0.4521, + "step": 5232 + }, + { + "epoch": 0.91, + "learning_rate": 4.540122187299978e-07, + "loss": 0.4636, + "step": 5233 + }, + { + "epoch": 0.91, + "learning_rate": 4.523419403701923e-07, + "loss": 0.4583, + "step": 5234 + }, + { + "epoch": 0.91, + "learning_rate": 4.5067466899220703e-07, + "loss": 0.4539, + "step": 5235 + }, + { + "epoch": 0.91, + "learning_rate": 4.490104051211408e-07, + "loss": 0.4485, + "step": 5236 + }, + { + "epoch": 0.91, + "learning_rate": 4.4734914928114435e-07, + "loss": 0.447, + "step": 5237 + }, + { + "epoch": 0.91, + "learning_rate": 4.456909019954181e-07, + "loss": 0.4452, + "step": 5238 + }, + { + "epoch": 0.91, + "learning_rate": 4.440356637862231e-07, + "loss": 0.4675, + "step": 5239 + }, + { + "epoch": 0.91, + "learning_rate": 4.4238343517486237e-07, + "loss": 0.4524, + "step": 5240 + }, + { + "epoch": 0.91, + "learning_rate": 4.407342166816997e-07, + "loss": 0.4583, + "step": 5241 + }, + { + "epoch": 0.91, + "learning_rate": 4.3908800882614397e-07, + "loss": 0.4549, + "step": 5242 + }, + { + "epoch": 0.91, + "learning_rate": 4.3744481212666167e-07, + "loss": 0.4564, + "step": 5243 + }, + { + "epoch": 0.91, + "learning_rate": 4.358046271007699e-07, + "loss": 0.4557, + "step": 5244 + }, + { + "epoch": 0.91, + "learning_rate": 4.3416745426503095e-07, + "loss": 0.4698, + "step": 5245 + }, + { + "epoch": 0.91, + "learning_rate": 4.325332941350668e-07, + "loss": 0.4578, + "step": 5246 + }, + { + "epoch": 0.91, + "learning_rate": 4.30902147225547e-07, + "loss": 0.4563, + "step": 5247 + }, + { + "epoch": 0.91, + "learning_rate": 4.2927401405019166e-07, + "loss": 0.4425, + "step": 5248 + }, + { + "epoch": 0.91, + "learning_rate": 4.276488951217705e-07, + "loss": 0.471, + "step": 5249 + }, + { + "epoch": 0.91, + "learning_rate": 4.2602679095210766e-07, + "loss": 0.4542, + "step": 5250 + }, + { + "epoch": 0.91, + "learning_rate": 4.244077020520776e-07, + "loss": 0.4553, + "step": 5251 + }, + { + "epoch": 0.91, + "learning_rate": 4.227916289316003e-07, + "loss": 0.4466, + "step": 5252 + }, + { + "epoch": 0.91, + "learning_rate": 4.2117857209964863e-07, + "loss": 0.4621, + "step": 5253 + }, + { + "epoch": 0.91, + "learning_rate": 4.195685320642484e-07, + "loss": 0.462, + "step": 5254 + }, + { + "epoch": 0.91, + "learning_rate": 4.179615093324729e-07, + "loss": 0.4546, + "step": 5255 + }, + { + "epoch": 0.91, + "learning_rate": 4.1635750441044067e-07, + "loss": 0.4504, + "step": 5256 + }, + { + "epoch": 0.91, + "learning_rate": 4.147565178033286e-07, + "loss": 0.4651, + "step": 5257 + }, + { + "epoch": 0.91, + "learning_rate": 4.131585500153579e-07, + "loss": 0.4507, + "step": 5258 + }, + { + "epoch": 0.91, + "learning_rate": 4.1156360154979813e-07, + "loss": 0.4504, + "step": 5259 + }, + { + "epoch": 0.91, + "learning_rate": 4.099716729089698e-07, + "loss": 0.4499, + "step": 5260 + }, + { + "epoch": 0.91, + "learning_rate": 4.083827645942429e-07, + "loss": 0.4594, + "step": 5261 + }, + { + "epoch": 0.91, + "learning_rate": 4.067968771060349e-07, + "loss": 0.4668, + "step": 5262 + }, + { + "epoch": 0.91, + "learning_rate": 4.0521401094381186e-07, + "loss": 0.4552, + "step": 5263 + }, + { + "epoch": 0.91, + "learning_rate": 4.036341666060872e-07, + "loss": 0.4517, + "step": 5264 + }, + { + "epoch": 0.91, + "learning_rate": 4.0205734459042854e-07, + "loss": 0.4649, + "step": 5265 + }, + { + "epoch": 0.91, + "learning_rate": 4.004835453934419e-07, + "loss": 0.4472, + "step": 5266 + }, + { + "epoch": 0.91, + "learning_rate": 3.9891276951079083e-07, + "loss": 0.4725, + "step": 5267 + }, + { + "epoch": 0.91, + "learning_rate": 3.9734501743717956e-07, + "loss": 0.4478, + "step": 5268 + }, + { + "epoch": 0.91, + "learning_rate": 3.957802896663665e-07, + "loss": 0.4535, + "step": 5269 + }, + { + "epoch": 0.91, + "learning_rate": 3.9421858669114966e-07, + "loss": 0.4643, + "step": 5270 + }, + { + "epoch": 0.91, + "learning_rate": 3.9265990900337893e-07, + "loss": 0.4498, + "step": 5271 + }, + { + "epoch": 0.91, + "learning_rate": 3.9110425709395606e-07, + "loss": 0.4652, + "step": 5272 + }, + { + "epoch": 0.91, + "learning_rate": 3.8955163145282024e-07, + "loss": 0.4634, + "step": 5273 + }, + { + "epoch": 0.91, + "learning_rate": 3.8800203256896483e-07, + "loss": 0.4447, + "step": 5274 + }, + { + "epoch": 0.91, + "learning_rate": 3.8645546093042385e-07, + "loss": 0.459, + "step": 5275 + }, + { + "epoch": 0.91, + "learning_rate": 3.8491191702428654e-07, + "loss": 0.4555, + "step": 5276 + }, + { + "epoch": 0.91, + "learning_rate": 3.833714013366796e-07, + "loss": 0.4627, + "step": 5277 + }, + { + "epoch": 0.91, + "learning_rate": 3.8183391435278163e-07, + "loss": 0.4461, + "step": 5278 + }, + { + "epoch": 0.91, + "learning_rate": 3.802994565568141e-07, + "loss": 0.4504, + "step": 5279 + }, + { + "epoch": 0.91, + "learning_rate": 3.787680284320472e-07, + "loss": 0.461, + "step": 5280 + }, + { + "epoch": 0.91, + "learning_rate": 3.7723963046079724e-07, + "loss": 0.4498, + "step": 5281 + }, + { + "epoch": 0.92, + "learning_rate": 3.757142631244204e-07, + "loss": 0.4495, + "step": 5282 + }, + { + "epoch": 0.92, + "learning_rate": 3.7419192690332786e-07, + "loss": 0.4553, + "step": 5283 + }, + { + "epoch": 0.92, + "learning_rate": 3.726726222769672e-07, + "loss": 0.4456, + "step": 5284 + }, + { + "epoch": 0.92, + "learning_rate": 3.7115634972383464e-07, + "loss": 0.4597, + "step": 5285 + }, + { + "epoch": 0.92, + "learning_rate": 3.696431097214748e-07, + "loss": 0.4608, + "step": 5286 + }, + { + "epoch": 0.92, + "learning_rate": 3.6813290274647197e-07, + "loss": 0.4527, + "step": 5287 + }, + { + "epoch": 0.92, + "learning_rate": 3.6662572927445907e-07, + "loss": 0.4614, + "step": 5288 + }, + { + "epoch": 0.92, + "learning_rate": 3.651215897801097e-07, + "loss": 0.453, + "step": 5289 + }, + { + "epoch": 0.92, + "learning_rate": 3.6362048473714496e-07, + "loss": 0.4504, + "step": 5290 + }, + { + "epoch": 0.92, + "learning_rate": 3.6212241461833107e-07, + "loss": 0.4459, + "step": 5291 + }, + { + "epoch": 0.92, + "learning_rate": 3.606273798954751e-07, + "loss": 0.4535, + "step": 5292 + }, + { + "epoch": 0.92, + "learning_rate": 3.5913538103943155e-07, + "loss": 0.4683, + "step": 5293 + }, + { + "epoch": 0.92, + "learning_rate": 3.5764641852009565e-07, + "loss": 0.4617, + "step": 5294 + }, + { + "epoch": 0.92, + "learning_rate": 3.5616049280640995e-07, + "loss": 0.4643, + "step": 5295 + }, + { + "epoch": 0.92, + "learning_rate": 3.5467760436635577e-07, + "loss": 0.4484, + "step": 5296 + }, + { + "epoch": 0.92, + "learning_rate": 3.5319775366696175e-07, + "loss": 0.4557, + "step": 5297 + }, + { + "epoch": 0.92, + "learning_rate": 3.517209411742994e-07, + "loss": 0.4481, + "step": 5298 + }, + { + "epoch": 0.92, + "learning_rate": 3.502471673534824e-07, + "loss": 0.4605, + "step": 5299 + }, + { + "epoch": 0.92, + "learning_rate": 3.48776432668666e-07, + "loss": 0.4491, + "step": 5300 + }, + { + "epoch": 0.92, + "learning_rate": 3.4730873758305193e-07, + "loss": 0.4689, + "step": 5301 + }, + { + "epoch": 0.92, + "learning_rate": 3.458440825588827e-07, + "loss": 0.457, + "step": 5302 + }, + { + "epoch": 0.92, + "learning_rate": 3.4438246805744034e-07, + "loss": 0.4624, + "step": 5303 + }, + { + "epoch": 0.92, + "learning_rate": 3.429238945390556e-07, + "loss": 0.4581, + "step": 5304 + }, + { + "epoch": 0.92, + "learning_rate": 3.4146836246309656e-07, + "loss": 0.4609, + "step": 5305 + }, + { + "epoch": 0.92, + "learning_rate": 3.4001587228797427e-07, + "loss": 0.4566, + "step": 5306 + }, + { + "epoch": 0.92, + "learning_rate": 3.385664244711451e-07, + "loss": 0.4571, + "step": 5307 + }, + { + "epoch": 0.92, + "learning_rate": 3.3712001946910046e-07, + "loss": 0.4469, + "step": 5308 + }, + { + "epoch": 0.92, + "learning_rate": 3.3567665773738156e-07, + "loss": 0.4617, + "step": 5309 + }, + { + "epoch": 0.92, + "learning_rate": 3.342363397305648e-07, + "loss": 0.4555, + "step": 5310 + }, + { + "epoch": 0.92, + "learning_rate": 3.327990659022706e-07, + "loss": 0.4569, + "step": 5311 + }, + { + "epoch": 0.92, + "learning_rate": 3.313648367051614e-07, + "loss": 0.4567, + "step": 5312 + }, + { + "epoch": 0.92, + "learning_rate": 3.299336525909391e-07, + "loss": 0.4615, + "step": 5313 + }, + { + "epoch": 0.92, + "learning_rate": 3.2850551401034767e-07, + "loss": 0.4533, + "step": 5314 + }, + { + "epoch": 0.92, + "learning_rate": 3.270804214131684e-07, + "loss": 0.4612, + "step": 5315 + }, + { + "epoch": 0.92, + "learning_rate": 3.2565837524823227e-07, + "loss": 0.4474, + "step": 5316 + }, + { + "epoch": 0.92, + "learning_rate": 3.242393759633988e-07, + "loss": 0.459, + "step": 5317 + }, + { + "epoch": 0.92, + "learning_rate": 3.228234240055772e-07, + "loss": 0.4459, + "step": 5318 + }, + { + "epoch": 0.92, + "learning_rate": 3.2141051982071293e-07, + "loss": 0.4676, + "step": 5319 + }, + { + "epoch": 0.92, + "learning_rate": 3.2000066385379225e-07, + "loss": 0.4667, + "step": 5320 + }, + { + "epoch": 0.92, + "learning_rate": 3.185938565488422e-07, + "loss": 0.4633, + "step": 5321 + }, + { + "epoch": 0.92, + "learning_rate": 3.171900983489273e-07, + "loss": 0.4517, + "step": 5322 + }, + { + "epoch": 0.92, + "learning_rate": 3.1578938969615394e-07, + "loss": 0.458, + "step": 5323 + }, + { + "epoch": 0.92, + "learning_rate": 3.143917310316691e-07, + "loss": 0.4582, + "step": 5324 + }, + { + "epoch": 0.92, + "learning_rate": 3.129971227956563e-07, + "loss": 0.4636, + "step": 5325 + }, + { + "epoch": 0.92, + "learning_rate": 3.1160556542733757e-07, + "loss": 0.448, + "step": 5326 + }, + { + "epoch": 0.92, + "learning_rate": 3.1021705936498005e-07, + "loss": 0.4799, + "step": 5327 + }, + { + "epoch": 0.92, + "learning_rate": 3.0883160504588504e-07, + "loss": 0.4445, + "step": 5328 + }, + { + "epoch": 0.92, + "learning_rate": 3.0744920290639247e-07, + "loss": 0.4554, + "step": 5329 + }, + { + "epoch": 0.92, + "learning_rate": 3.0606985338188177e-07, + "loss": 0.4439, + "step": 5330 + }, + { + "epoch": 0.92, + "learning_rate": 3.0469355690677216e-07, + "loss": 0.4707, + "step": 5331 + }, + { + "epoch": 0.92, + "learning_rate": 3.0332031391452243e-07, + "loss": 0.45, + "step": 5332 + }, + { + "epoch": 0.92, + "learning_rate": 3.019501248376244e-07, + "loss": 0.47, + "step": 5333 + }, + { + "epoch": 0.92, + "learning_rate": 3.0058299010761294e-07, + "loss": 0.4555, + "step": 5334 + }, + { + "epoch": 0.92, + "learning_rate": 2.992189101550613e-07, + "loss": 0.4653, + "step": 5335 + }, + { + "epoch": 0.92, + "learning_rate": 2.9785788540957706e-07, + "loss": 0.4473, + "step": 5336 + }, + { + "epoch": 0.92, + "learning_rate": 2.964999162998072e-07, + "loss": 0.4552, + "step": 5337 + }, + { + "epoch": 0.92, + "learning_rate": 2.951450032534364e-07, + "loss": 0.4465, + "step": 5338 + }, + { + "epoch": 0.92, + "learning_rate": 2.937931466971888e-07, + "loss": 0.4582, + "step": 5339 + }, + { + "epoch": 0.93, + "learning_rate": 2.9244434705682276e-07, + "loss": 0.4413, + "step": 5340 + }, + { + "epoch": 0.93, + "learning_rate": 2.9109860475713403e-07, + "loss": 0.4547, + "step": 5341 + }, + { + "epoch": 0.93, + "learning_rate": 2.897559202219602e-07, + "loss": 0.4603, + "step": 5342 + }, + { + "epoch": 0.93, + "learning_rate": 2.884162938741686e-07, + "loss": 0.4502, + "step": 5343 + }, + { + "epoch": 0.93, + "learning_rate": 2.870797261356684e-07, + "loss": 0.4443, + "step": 5344 + }, + { + "epoch": 0.93, + "learning_rate": 2.8574621742740506e-07, + "loss": 0.4499, + "step": 5345 + }, + { + "epoch": 0.93, + "learning_rate": 2.8441576816936043e-07, + "loss": 0.4647, + "step": 5346 + }, + { + "epoch": 0.93, + "learning_rate": 2.830883787805494e-07, + "loss": 0.4481, + "step": 5347 + }, + { + "epoch": 0.93, + "learning_rate": 2.817640496790275e-07, + "loss": 0.455, + "step": 5348 + }, + { + "epoch": 0.93, + "learning_rate": 2.8044278128188327e-07, + "loss": 0.467, + "step": 5349 + }, + { + "epoch": 0.93, + "learning_rate": 2.791245740052451e-07, + "loss": 0.4679, + "step": 5350 + }, + { + "epoch": 0.93, + "learning_rate": 2.7780942826427514e-07, + "loss": 0.461, + "step": 5351 + }, + { + "epoch": 0.93, + "learning_rate": 2.7649734447316777e-07, + "loss": 0.4574, + "step": 5352 + }, + { + "epoch": 0.93, + "learning_rate": 2.751883230451613e-07, + "loss": 0.4629, + "step": 5353 + }, + { + "epoch": 0.93, + "learning_rate": 2.738823643925215e-07, + "loss": 0.4565, + "step": 5354 + }, + { + "epoch": 0.93, + "learning_rate": 2.725794689265537e-07, + "loss": 0.4407, + "step": 5355 + }, + { + "epoch": 0.93, + "learning_rate": 2.7127963705759653e-07, + "loss": 0.459, + "step": 5356 + }, + { + "epoch": 0.93, + "learning_rate": 2.699828691950268e-07, + "loss": 0.4583, + "step": 5357 + }, + { + "epoch": 0.93, + "learning_rate": 2.6868916574725347e-07, + "loss": 0.4489, + "step": 5358 + }, + { + "epoch": 0.93, + "learning_rate": 2.6739852712171946e-07, + "loss": 0.4553, + "step": 5359 + }, + { + "epoch": 0.93, + "learning_rate": 2.661109537249085e-07, + "loss": 0.4625, + "step": 5360 + }, + { + "epoch": 0.93, + "learning_rate": 2.6482644596232953e-07, + "loss": 0.4665, + "step": 5361 + }, + { + "epoch": 0.93, + "learning_rate": 2.6354500423853457e-07, + "loss": 0.452, + "step": 5362 + }, + { + "epoch": 0.93, + "learning_rate": 2.622666289571063e-07, + "loss": 0.4487, + "step": 5363 + }, + { + "epoch": 0.93, + "learning_rate": 2.6099132052066044e-07, + "loss": 0.4503, + "step": 5364 + }, + { + "epoch": 0.93, + "learning_rate": 2.5971907933085016e-07, + "loss": 0.4621, + "step": 5365 + }, + { + "epoch": 0.93, + "learning_rate": 2.5844990578835825e-07, + "loss": 0.4508, + "step": 5366 + }, + { + "epoch": 0.93, + "learning_rate": 2.571838002929061e-07, + "loss": 0.4531, + "step": 5367 + }, + { + "epoch": 0.93, + "learning_rate": 2.559207632432448e-07, + "loss": 0.4536, + "step": 5368 + }, + { + "epoch": 0.93, + "learning_rate": 2.546607950371627e-07, + "loss": 0.4642, + "step": 5369 + }, + { + "epoch": 0.93, + "learning_rate": 2.534038960714791e-07, + "loss": 0.4454, + "step": 5370 + }, + { + "epoch": 0.93, + "learning_rate": 2.5215006674204625e-07, + "loss": 0.4569, + "step": 5371 + }, + { + "epoch": 0.93, + "learning_rate": 2.508993074437527e-07, + "loss": 0.4533, + "step": 5372 + }, + { + "epoch": 0.93, + "learning_rate": 2.4965161857051667e-07, + "loss": 0.4574, + "step": 5373 + }, + { + "epoch": 0.93, + "learning_rate": 2.4840700051529054e-07, + "loss": 0.4555, + "step": 5374 + }, + { + "epoch": 0.93, + "learning_rate": 2.4716545367006186e-07, + "loss": 0.4636, + "step": 5375 + }, + { + "epoch": 0.93, + "learning_rate": 2.459269784258467e-07, + "loss": 0.4602, + "step": 5376 + }, + { + "epoch": 0.93, + "learning_rate": 2.4469157517269636e-07, + "loss": 0.4591, + "step": 5377 + }, + { + "epoch": 0.93, + "learning_rate": 2.4345924429969523e-07, + "loss": 0.4491, + "step": 5378 + }, + { + "epoch": 0.93, + "learning_rate": 2.4222998619495953e-07, + "loss": 0.4593, + "step": 5379 + }, + { + "epoch": 0.93, + "learning_rate": 2.41003801245635e-07, + "loss": 0.4663, + "step": 5380 + }, + { + "epoch": 0.93, + "learning_rate": 2.3978068983790294e-07, + "loss": 0.4496, + "step": 5381 + }, + { + "epoch": 0.93, + "learning_rate": 2.3856065235697613e-07, + "loss": 0.4602, + "step": 5382 + }, + { + "epoch": 0.93, + "learning_rate": 2.3734368918709838e-07, + "loss": 0.4575, + "step": 5383 + }, + { + "epoch": 0.93, + "learning_rate": 2.3612980071154534e-07, + "loss": 0.4551, + "step": 5384 + }, + { + "epoch": 0.93, + "learning_rate": 2.349189873126223e-07, + "loss": 0.4377, + "step": 5385 + }, + { + "epoch": 0.93, + "learning_rate": 2.33711249371672e-07, + "loss": 0.4714, + "step": 5386 + }, + { + "epoch": 0.93, + "learning_rate": 2.325065872690624e-07, + "loss": 0.4479, + "step": 5387 + }, + { + "epoch": 0.93, + "learning_rate": 2.3130500138419553e-07, + "loss": 0.4634, + "step": 5388 + }, + { + "epoch": 0.93, + "learning_rate": 2.3010649209550428e-07, + "loss": 0.4547, + "step": 5389 + }, + { + "epoch": 0.93, + "learning_rate": 2.2891105978045336e-07, + "loss": 0.4579, + "step": 5390 + }, + { + "epoch": 0.93, + "learning_rate": 2.2771870481553715e-07, + "loss": 0.4419, + "step": 5391 + }, + { + "epoch": 0.93, + "learning_rate": 2.265294275762786e-07, + "loss": 0.4609, + "step": 5392 + }, + { + "epoch": 0.93, + "learning_rate": 2.25343228437237e-07, + "loss": 0.4552, + "step": 5393 + }, + { + "epoch": 0.93, + "learning_rate": 2.2416010777199904e-07, + "loss": 0.4539, + "step": 5394 + }, + { + "epoch": 0.93, + "learning_rate": 2.229800659531811e-07, + "loss": 0.4517, + "step": 5395 + }, + { + "epoch": 0.93, + "learning_rate": 2.218031033524304e-07, + "loss": 0.4577, + "step": 5396 + }, + { + "epoch": 0.93, + "learning_rate": 2.2062922034042478e-07, + "loss": 0.4423, + "step": 5397 + }, + { + "epoch": 0.94, + "learning_rate": 2.194584172868741e-07, + "loss": 0.4606, + "step": 5398 + }, + { + "epoch": 0.94, + "learning_rate": 2.1829069456051456e-07, + "loss": 0.4527, + "step": 5399 + }, + { + "epoch": 0.94, + "learning_rate": 2.17126052529113e-07, + "loss": 0.4753, + "step": 5400 + }, + { + "epoch": 0.94, + "learning_rate": 2.159644915594694e-07, + "loss": 0.4676, + "step": 5401 + }, + { + "epoch": 0.94, + "learning_rate": 2.1480601201741004e-07, + "loss": 0.4511, + "step": 5402 + }, + { + "epoch": 0.94, + "learning_rate": 2.1365061426778967e-07, + "loss": 0.4427, + "step": 5403 + }, + { + "epoch": 0.94, + "learning_rate": 2.1249829867449723e-07, + "loss": 0.4649, + "step": 5404 + }, + { + "epoch": 0.94, + "learning_rate": 2.11349065600448e-07, + "loss": 0.4483, + "step": 5405 + }, + { + "epoch": 0.94, + "learning_rate": 2.1020291540758352e-07, + "loss": 0.4481, + "step": 5406 + }, + { + "epoch": 0.94, + "learning_rate": 2.0905984845687954e-07, + "loss": 0.4496, + "step": 5407 + }, + { + "epoch": 0.94, + "learning_rate": 2.0791986510833918e-07, + "loss": 0.4565, + "step": 5408 + }, + { + "epoch": 0.94, + "learning_rate": 2.067829657209941e-07, + "loss": 0.466, + "step": 5409 + }, + { + "epoch": 0.94, + "learning_rate": 2.0564915065290237e-07, + "loss": 0.4785, + "step": 5410 + }, + { + "epoch": 0.94, + "learning_rate": 2.0451842026115277e-07, + "loss": 0.4514, + "step": 5411 + }, + { + "epoch": 0.94, + "learning_rate": 2.0339077490186488e-07, + "loss": 0.4763, + "step": 5412 + }, + { + "epoch": 0.94, + "learning_rate": 2.022662149301824e-07, + "loss": 0.4583, + "step": 5413 + }, + { + "epoch": 0.94, + "learning_rate": 2.011447407002809e-07, + "loss": 0.4565, + "step": 5414 + }, + { + "epoch": 0.94, + "learning_rate": 2.0002635256536008e-07, + "loss": 0.4539, + "step": 5415 + }, + { + "epoch": 0.94, + "learning_rate": 1.9891105087765371e-07, + "loss": 0.4502, + "step": 5416 + }, + { + "epoch": 0.94, + "learning_rate": 1.977988359884153e-07, + "loss": 0.4453, + "step": 5417 + }, + { + "epoch": 0.94, + "learning_rate": 1.9668970824793355e-07, + "loss": 0.4491, + "step": 5418 + }, + { + "epoch": 0.94, + "learning_rate": 1.955836680055223e-07, + "loss": 0.4513, + "step": 5419 + }, + { + "epoch": 0.94, + "learning_rate": 1.9448071560952187e-07, + "loss": 0.4656, + "step": 5420 + }, + { + "epoch": 0.94, + "learning_rate": 1.93380851407301e-07, + "loss": 0.453, + "step": 5421 + }, + { + "epoch": 0.94, + "learning_rate": 1.92284075745256e-07, + "loss": 0.4534, + "step": 5422 + }, + { + "epoch": 0.94, + "learning_rate": 1.9119038896880938e-07, + "loss": 0.4653, + "step": 5423 + }, + { + "epoch": 0.94, + "learning_rate": 1.9009979142241453e-07, + "loss": 0.4533, + "step": 5424 + }, + { + "epoch": 0.94, + "learning_rate": 1.8901228344954558e-07, + "loss": 0.4484, + "step": 5425 + }, + { + "epoch": 0.94, + "learning_rate": 1.8792786539270967e-07, + "loss": 0.4684, + "step": 5426 + }, + { + "epoch": 0.94, + "learning_rate": 1.8684653759343586e-07, + "loss": 0.4482, + "step": 5427 + }, + { + "epoch": 0.94, + "learning_rate": 1.85768300392285e-07, + "loss": 0.4614, + "step": 5428 + }, + { + "epoch": 0.94, + "learning_rate": 1.8469315412883882e-07, + "loss": 0.4721, + "step": 5429 + }, + { + "epoch": 0.94, + "learning_rate": 1.83621099141712e-07, + "loss": 0.4699, + "step": 5430 + }, + { + "epoch": 0.94, + "learning_rate": 1.8255213576854115e-07, + "loss": 0.4545, + "step": 5431 + }, + { + "epoch": 0.94, + "learning_rate": 1.8148626434598916e-07, + "loss": 0.4776, + "step": 5432 + }, + { + "epoch": 0.94, + "learning_rate": 1.804234852097464e-07, + "loss": 0.4429, + "step": 5433 + }, + { + "epoch": 0.94, + "learning_rate": 1.793637986945307e-07, + "loss": 0.4531, + "step": 5434 + }, + { + "epoch": 0.94, + "learning_rate": 1.7830720513408395e-07, + "loss": 0.4535, + "step": 5435 + }, + { + "epoch": 0.94, + "learning_rate": 1.7725370486117333e-07, + "loss": 0.4703, + "step": 5436 + }, + { + "epoch": 0.94, + "learning_rate": 1.762032982075934e-07, + "loss": 0.4514, + "step": 5437 + }, + { + "epoch": 0.94, + "learning_rate": 1.7515598550416625e-07, + "loss": 0.464, + "step": 5438 + }, + { + "epoch": 0.94, + "learning_rate": 1.741117670807335e-07, + "loss": 0.4535, + "step": 5439 + }, + { + "epoch": 0.94, + "learning_rate": 1.7307064326616775e-07, + "loss": 0.4614, + "step": 5440 + }, + { + "epoch": 0.94, + "learning_rate": 1.7203261438836439e-07, + "loss": 0.4606, + "step": 5441 + }, + { + "epoch": 0.94, + "learning_rate": 1.709976807742475e-07, + "loss": 0.4658, + "step": 5442 + }, + { + "epoch": 0.94, + "learning_rate": 1.699658427497597e-07, + "loss": 0.4466, + "step": 5443 + }, + { + "epoch": 0.94, + "learning_rate": 1.6893710063987433e-07, + "loss": 0.4664, + "step": 5444 + }, + { + "epoch": 0.94, + "learning_rate": 1.6791145476858894e-07, + "loss": 0.4565, + "step": 5445 + }, + { + "epoch": 0.94, + "learning_rate": 1.66888905458924e-07, + "loss": 0.4685, + "step": 5446 + }, + { + "epoch": 0.94, + "learning_rate": 1.6586945303292633e-07, + "loss": 0.4544, + "step": 5447 + }, + { + "epoch": 0.94, + "learning_rate": 1.648530978116658e-07, + "loss": 0.4624, + "step": 5448 + }, + { + "epoch": 0.94, + "learning_rate": 1.6383984011523967e-07, + "loss": 0.4592, + "step": 5449 + }, + { + "epoch": 0.94, + "learning_rate": 1.6282968026276602e-07, + "loss": 0.4581, + "step": 5450 + }, + { + "epoch": 0.94, + "learning_rate": 1.6182261857238812e-07, + "loss": 0.4612, + "step": 5451 + }, + { + "epoch": 0.94, + "learning_rate": 1.6081865536127895e-07, + "loss": 0.4542, + "step": 5452 + }, + { + "epoch": 0.94, + "learning_rate": 1.5981779094562667e-07, + "loss": 0.4658, + "step": 5453 + }, + { + "epoch": 0.94, + "learning_rate": 1.5882002564065025e-07, + "loss": 0.4668, + "step": 5454 + }, + { + "epoch": 0.94, + "learning_rate": 1.578253597605872e-07, + "loss": 0.4458, + "step": 5455 + }, + { + "epoch": 0.95, + "learning_rate": 1.56833793618707e-07, + "loss": 0.4646, + "step": 5456 + }, + { + "epoch": 0.95, + "learning_rate": 1.558453275272942e-07, + "loss": 0.4627, + "step": 5457 + }, + { + "epoch": 0.95, + "learning_rate": 1.5485996179766206e-07, + "loss": 0.4641, + "step": 5458 + }, + { + "epoch": 0.95, + "learning_rate": 1.5387769674014563e-07, + "loss": 0.4484, + "step": 5459 + }, + { + "epoch": 0.95, + "learning_rate": 1.5289853266410416e-07, + "loss": 0.4618, + "step": 5460 + }, + { + "epoch": 0.95, + "learning_rate": 1.519224698779198e-07, + "loss": 0.4555, + "step": 5461 + }, + { + "epoch": 0.95, + "learning_rate": 1.5094950868899672e-07, + "loss": 0.4498, + "step": 5462 + }, + { + "epoch": 0.95, + "learning_rate": 1.4997964940376752e-07, + "loss": 0.4479, + "step": 5463 + }, + { + "epoch": 0.95, + "learning_rate": 1.4901289232767903e-07, + "loss": 0.4602, + "step": 5464 + }, + { + "epoch": 0.95, + "learning_rate": 1.4804923776520985e-07, + "loss": 0.4452, + "step": 5465 + }, + { + "epoch": 0.95, + "learning_rate": 1.4708868601985503e-07, + "loss": 0.4645, + "step": 5466 + }, + { + "epoch": 0.95, + "learning_rate": 1.4613123739413704e-07, + "loss": 0.4587, + "step": 5467 + }, + { + "epoch": 0.95, + "learning_rate": 1.4517689218959907e-07, + "loss": 0.465, + "step": 5468 + }, + { + "epoch": 0.95, + "learning_rate": 1.4422565070680406e-07, + "loss": 0.4604, + "step": 5469 + }, + { + "epoch": 0.95, + "learning_rate": 1.4327751324534233e-07, + "loss": 0.4602, + "step": 5470 + }, + { + "epoch": 0.95, + "learning_rate": 1.4233248010382506e-07, + "loss": 0.4438, + "step": 5471 + }, + { + "epoch": 0.95, + "learning_rate": 1.4139055157988303e-07, + "loss": 0.4658, + "step": 5472 + }, + { + "epoch": 0.95, + "learning_rate": 1.4045172797017336e-07, + "loss": 0.4561, + "step": 5473 + }, + { + "epoch": 0.95, + "learning_rate": 1.3951600957037292e-07, + "loss": 0.4513, + "step": 5474 + }, + { + "epoch": 0.95, + "learning_rate": 1.385833966751815e-07, + "loss": 0.458, + "step": 5475 + }, + { + "epoch": 0.95, + "learning_rate": 1.376538895783186e-07, + "loss": 0.4642, + "step": 5476 + }, + { + "epoch": 0.95, + "learning_rate": 1.3672748857252783e-07, + "loss": 0.4616, + "step": 5477 + }, + { + "epoch": 0.95, + "learning_rate": 1.358041939495758e-07, + "loss": 0.4598, + "step": 5478 + }, + { + "epoch": 0.95, + "learning_rate": 1.3488400600024654e-07, + "loss": 0.45, + "step": 5479 + }, + { + "epoch": 0.95, + "learning_rate": 1.339669250143505e-07, + "loss": 0.4489, + "step": 5480 + }, + { + "epoch": 0.95, + "learning_rate": 1.3305295128071437e-07, + "loss": 0.4467, + "step": 5481 + }, + { + "epoch": 0.95, + "learning_rate": 1.321420850871935e-07, + "loss": 0.4734, + "step": 5482 + }, + { + "epoch": 0.95, + "learning_rate": 1.3123432672065506e-07, + "loss": 0.4501, + "step": 5483 + }, + { + "epoch": 0.95, + "learning_rate": 1.303296764669959e-07, + "loss": 0.463, + "step": 5484 + }, + { + "epoch": 0.95, + "learning_rate": 1.2942813461112924e-07, + "loss": 0.4432, + "step": 5485 + }, + { + "epoch": 0.95, + "learning_rate": 1.2852970143699129e-07, + "loss": 0.4676, + "step": 5486 + }, + { + "epoch": 0.95, + "learning_rate": 1.276343772275379e-07, + "loss": 0.453, + "step": 5487 + }, + { + "epoch": 0.95, + "learning_rate": 1.267421622647469e-07, + "loss": 0.4718, + "step": 5488 + }, + { + "epoch": 0.95, + "learning_rate": 1.2585305682961679e-07, + "loss": 0.4574, + "step": 5489 + }, + { + "epoch": 0.95, + "learning_rate": 1.2496706120216585e-07, + "loss": 0.4711, + "step": 5490 + }, + { + "epoch": 0.95, + "learning_rate": 1.2408417566143306e-07, + "loss": 0.4482, + "step": 5491 + }, + { + "epoch": 0.95, + "learning_rate": 1.2320440048547933e-07, + "loss": 0.4482, + "step": 5492 + }, + { + "epoch": 0.95, + "learning_rate": 1.2232773595138415e-07, + "loss": 0.4576, + "step": 5493 + }, + { + "epoch": 0.95, + "learning_rate": 1.2145418233524886e-07, + "loss": 0.4591, + "step": 5494 + }, + { + "epoch": 0.95, + "learning_rate": 1.2058373991219341e-07, + "loss": 0.4595, + "step": 5495 + }, + { + "epoch": 0.95, + "learning_rate": 1.197164089563596e-07, + "loss": 0.4454, + "step": 5496 + }, + { + "epoch": 0.95, + "learning_rate": 1.1885218974090895e-07, + "loss": 0.4451, + "step": 5497 + }, + { + "epoch": 0.95, + "learning_rate": 1.1799108253802149e-07, + "loss": 0.4504, + "step": 5498 + }, + { + "epoch": 0.95, + "learning_rate": 1.1713308761889696e-07, + "loss": 0.4535, + "step": 5499 + }, + { + "epoch": 0.95, + "learning_rate": 1.1627820525375811e-07, + "loss": 0.4496, + "step": 5500 + }, + { + "epoch": 0.95, + "learning_rate": 1.1542643571184619e-07, + "loss": 0.4506, + "step": 5501 + }, + { + "epoch": 0.95, + "learning_rate": 1.1457777926141889e-07, + "loss": 0.4528, + "step": 5502 + }, + { + "epoch": 0.95, + "learning_rate": 1.1373223616975681e-07, + "loss": 0.454, + "step": 5503 + }, + { + "epoch": 0.95, + "learning_rate": 1.1288980670315918e-07, + "loss": 0.4688, + "step": 5504 + }, + { + "epoch": 0.95, + "learning_rate": 1.1205049112694488e-07, + "loss": 0.4584, + "step": 5505 + }, + { + "epoch": 0.95, + "learning_rate": 1.1121428970545023e-07, + "loss": 0.4515, + "step": 5506 + }, + { + "epoch": 0.95, + "learning_rate": 1.1038120270203345e-07, + "loss": 0.4583, + "step": 5507 + }, + { + "epoch": 0.95, + "learning_rate": 1.0955123037907134e-07, + "loss": 0.4628, + "step": 5508 + }, + { + "epoch": 0.95, + "learning_rate": 1.0872437299795701e-07, + "loss": 0.4589, + "step": 5509 + }, + { + "epoch": 0.95, + "learning_rate": 1.079006308191055e-07, + "loss": 0.4698, + "step": 5510 + }, + { + "epoch": 0.95, + "learning_rate": 1.0708000410195041e-07, + "loss": 0.452, + "step": 5511 + }, + { + "epoch": 0.95, + "learning_rate": 1.0626249310494385e-07, + "loss": 0.4537, + "step": 5512 + }, + { + "epoch": 0.96, + "learning_rate": 1.0544809808555545e-07, + "loss": 0.4577, + "step": 5513 + }, + { + "epoch": 0.96, + "learning_rate": 1.0463681930027336e-07, + "loss": 0.4576, + "step": 5514 + }, + { + "epoch": 0.96, + "learning_rate": 1.0382865700460876e-07, + "loss": 0.4449, + "step": 5515 + }, + { + "epoch": 0.96, + "learning_rate": 1.030236114530847e-07, + "loss": 0.4702, + "step": 5516 + }, + { + "epoch": 0.96, + "learning_rate": 1.0222168289924616e-07, + "loss": 0.4356, + "step": 5517 + }, + { + "epoch": 0.96, + "learning_rate": 1.0142287159565778e-07, + "loss": 0.4542, + "step": 5518 + }, + { + "epoch": 0.96, + "learning_rate": 1.0062717779389942e-07, + "loss": 0.4467, + "step": 5519 + }, + { + "epoch": 0.96, + "learning_rate": 9.98346017445706e-08, + "loss": 0.4777, + "step": 5520 + }, + { + "epoch": 0.96, + "learning_rate": 9.904514369728724e-08, + "loss": 0.4518, + "step": 5521 + }, + { + "epoch": 0.96, + "learning_rate": 9.82588039006882e-08, + "loss": 0.4577, + "step": 5522 + }, + { + "epoch": 0.96, + "learning_rate": 9.74755826024254e-08, + "loss": 0.462, + "step": 5523 + }, + { + "epoch": 0.96, + "learning_rate": 9.669548004916817e-08, + "loss": 0.47, + "step": 5524 + }, + { + "epoch": 0.96, + "learning_rate": 9.591849648660779e-08, + "loss": 0.4533, + "step": 5525 + }, + { + "epoch": 0.96, + "learning_rate": 9.51446321594507e-08, + "loss": 0.4538, + "step": 5526 + }, + { + "epoch": 0.96, + "learning_rate": 9.437388731141861e-08, + "loss": 0.4458, + "step": 5527 + }, + { + "epoch": 0.96, + "learning_rate": 9.360626218525625e-08, + "loss": 0.4658, + "step": 5528 + }, + { + "epoch": 0.96, + "learning_rate": 9.284175702272246e-08, + "loss": 0.4435, + "step": 5529 + }, + { + "epoch": 0.96, + "learning_rate": 9.208037206459242e-08, + "loss": 0.445, + "step": 5530 + }, + { + "epoch": 0.96, + "learning_rate": 9.132210755066096e-08, + "loss": 0.4653, + "step": 5531 + }, + { + "epoch": 0.96, + "learning_rate": 9.056696371973928e-08, + "loss": 0.4495, + "step": 5532 + }, + { + "epoch": 0.96, + "learning_rate": 8.981494080965602e-08, + "loss": 0.4581, + "step": 5533 + }, + { + "epoch": 0.96, + "learning_rate": 8.906603905725619e-08, + "loss": 0.4604, + "step": 5534 + }, + { + "epoch": 0.96, + "learning_rate": 8.832025869840222e-08, + "loss": 0.4697, + "step": 5535 + }, + { + "epoch": 0.96, + "learning_rate": 8.757759996797399e-08, + "loss": 0.4452, + "step": 5536 + }, + { + "epoch": 0.96, + "learning_rate": 8.683806309986776e-08, + "loss": 0.4541, + "step": 5537 + }, + { + "epoch": 0.96, + "learning_rate": 8.610164832699608e-08, + "loss": 0.4456, + "step": 5538 + }, + { + "epoch": 0.96, + "learning_rate": 8.536835588128678e-08, + "loss": 0.4716, + "step": 5539 + }, + { + "epoch": 0.96, + "learning_rate": 8.463818599369067e-08, + "loss": 0.455, + "step": 5540 + }, + { + "epoch": 0.96, + "learning_rate": 8.391113889416713e-08, + "loss": 0.4659, + "step": 5541 + }, + { + "epoch": 0.96, + "learning_rate": 8.318721481169633e-08, + "loss": 0.45, + "step": 5542 + }, + { + "epoch": 0.96, + "learning_rate": 8.24664139742759e-08, + "loss": 0.4638, + "step": 5543 + }, + { + "epoch": 0.96, + "learning_rate": 8.174873660891536e-08, + "loss": 0.4472, + "step": 5544 + }, + { + "epoch": 0.96, + "learning_rate": 8.103418294164611e-08, + "loss": 0.4584, + "step": 5545 + }, + { + "epoch": 0.96, + "learning_rate": 8.032275319750926e-08, + "loss": 0.4668, + "step": 5546 + }, + { + "epoch": 0.96, + "learning_rate": 7.96144476005689e-08, + "loss": 0.465, + "step": 5547 + }, + { + "epoch": 0.96, + "learning_rate": 7.890926637390106e-08, + "loss": 0.4387, + "step": 5548 + }, + { + "epoch": 0.96, + "learning_rate": 7.820720973959694e-08, + "loss": 0.4637, + "step": 5549 + }, + { + "epoch": 0.96, + "learning_rate": 7.750827791876747e-08, + "loss": 0.4694, + "step": 5550 + }, + { + "epoch": 0.96, + "learning_rate": 7.681247113153655e-08, + "loss": 0.4576, + "step": 5551 + }, + { + "epoch": 0.96, + "learning_rate": 7.611978959704558e-08, + "loss": 0.4624, + "step": 5552 + }, + { + "epoch": 0.96, + "learning_rate": 7.543023353344892e-08, + "loss": 0.4565, + "step": 5553 + }, + { + "epoch": 0.96, + "learning_rate": 7.474380315791951e-08, + "loss": 0.4679, + "step": 5554 + }, + { + "epoch": 0.96, + "learning_rate": 7.406049868664445e-08, + "loss": 0.4649, + "step": 5555 + }, + { + "epoch": 0.96, + "learning_rate": 7.338032033482712e-08, + "loss": 0.4597, + "step": 5556 + }, + { + "epoch": 0.96, + "learning_rate": 7.270326831668617e-08, + "loss": 0.4514, + "step": 5557 + }, + { + "epoch": 0.96, + "learning_rate": 7.202934284545438e-08, + "loss": 0.4614, + "step": 5558 + }, + { + "epoch": 0.96, + "learning_rate": 7.135854413338194e-08, + "loss": 0.4515, + "step": 5559 + }, + { + "epoch": 0.96, + "learning_rate": 7.069087239173211e-08, + "loss": 0.4506, + "step": 5560 + }, + { + "epoch": 0.96, + "learning_rate": 7.002632783078445e-08, + "loss": 0.4676, + "step": 5561 + }, + { + "epoch": 0.96, + "learning_rate": 6.936491065983486e-08, + "loss": 0.4422, + "step": 5562 + }, + { + "epoch": 0.96, + "learning_rate": 6.870662108719117e-08, + "loss": 0.4611, + "step": 5563 + }, + { + "epoch": 0.96, + "learning_rate": 6.805145932017975e-08, + "loss": 0.4621, + "step": 5564 + }, + { + "epoch": 0.96, + "learning_rate": 6.73994255651389e-08, + "loss": 0.4611, + "step": 5565 + }, + { + "epoch": 0.96, + "learning_rate": 6.675052002742321e-08, + "loss": 0.4516, + "step": 5566 + }, + { + "epoch": 0.96, + "learning_rate": 6.610474291140257e-08, + "loss": 0.4512, + "step": 5567 + }, + { + "epoch": 0.96, + "learning_rate": 6.546209442046093e-08, + "loss": 0.4645, + "step": 5568 + }, + { + "epoch": 0.96, + "learning_rate": 6.482257475699526e-08, + "loss": 0.453, + "step": 5569 + }, + { + "epoch": 0.96, + "learning_rate": 6.418618412242116e-08, + "loss": 0.4429, + "step": 5570 + }, + { + "epoch": 0.97, + "learning_rate": 6.355292271716495e-08, + "loss": 0.4523, + "step": 5571 + }, + { + "epoch": 0.97, + "learning_rate": 6.292279074066821e-08, + "loss": 0.4651, + "step": 5572 + }, + { + "epoch": 0.97, + "learning_rate": 6.229578839138772e-08, + "loss": 0.4459, + "step": 5573 + }, + { + "epoch": 0.97, + "learning_rate": 6.167191586679556e-08, + "loss": 0.4407, + "step": 5574 + }, + { + "epoch": 0.97, + "learning_rate": 6.105117336337674e-08, + "loss": 0.4617, + "step": 5575 + }, + { + "epoch": 0.97, + "learning_rate": 6.043356107662823e-08, + "loss": 0.4538, + "step": 5576 + }, + { + "epoch": 0.97, + "learning_rate": 5.981907920106667e-08, + "loss": 0.4598, + "step": 5577 + }, + { + "epoch": 0.97, + "learning_rate": 5.920772793021945e-08, + "loss": 0.4442, + "step": 5578 + }, + { + "epoch": 0.97, + "learning_rate": 5.8599507456625907e-08, + "loss": 0.4693, + "step": 5579 + }, + { + "epoch": 0.97, + "learning_rate": 5.799441797184391e-08, + "loss": 0.4507, + "step": 5580 + }, + { + "epoch": 0.97, + "learning_rate": 5.739245966644102e-08, + "loss": 0.4531, + "step": 5581 + }, + { + "epoch": 0.97, + "learning_rate": 5.6793632730003375e-08, + "loss": 0.4542, + "step": 5582 + }, + { + "epoch": 0.97, + "learning_rate": 5.6197937351125664e-08, + "loss": 0.4596, + "step": 5583 + }, + { + "epoch": 0.97, + "learning_rate": 5.560537371742003e-08, + "loss": 0.4511, + "step": 5584 + }, + { + "epoch": 0.97, + "learning_rate": 5.501594201551164e-08, + "loss": 0.4645, + "step": 5585 + }, + { + "epoch": 0.97, + "learning_rate": 5.4429642431036435e-08, + "loss": 0.4548, + "step": 5586 + }, + { + "epoch": 0.97, + "learning_rate": 5.3846475148648936e-08, + "loss": 0.4547, + "step": 5587 + }, + { + "epoch": 0.97, + "learning_rate": 5.326644035201334e-08, + "loss": 0.4662, + "step": 5588 + }, + { + "epoch": 0.97, + "learning_rate": 5.268953822380796e-08, + "loss": 0.4589, + "step": 5589 + }, + { + "epoch": 0.97, + "learning_rate": 5.211576894572523e-08, + "loss": 0.4518, + "step": 5590 + }, + { + "epoch": 0.97, + "learning_rate": 5.154513269847061e-08, + "loss": 0.4601, + "step": 5591 + }, + { + "epoch": 0.97, + "learning_rate": 5.097762966176256e-08, + "loss": 0.4462, + "step": 5592 + }, + { + "epoch": 0.97, + "learning_rate": 5.041326001433366e-08, + "loss": 0.464, + "step": 5593 + }, + { + "epoch": 0.97, + "learning_rate": 4.985202393392841e-08, + "loss": 0.4583, + "step": 5594 + }, + { + "epoch": 0.97, + "learning_rate": 4.9293921597305396e-08, + "loss": 0.4461, + "step": 5595 + }, + { + "epoch": 0.97, + "learning_rate": 4.873895318023625e-08, + "loss": 0.4444, + "step": 5596 + }, + { + "epoch": 0.97, + "learning_rate": 4.818711885750338e-08, + "loss": 0.4643, + "step": 5597 + }, + { + "epoch": 0.97, + "learning_rate": 4.763841880290554e-08, + "loss": 0.4594, + "step": 5598 + }, + { + "epoch": 0.97, + "learning_rate": 4.7092853189252273e-08, + "loss": 0.4535, + "step": 5599 + }, + { + "epoch": 0.97, + "learning_rate": 4.655042218836725e-08, + "loss": 0.4511, + "step": 5600 + }, + { + "epoch": 0.97, + "learning_rate": 4.6011125971084924e-08, + "loss": 0.4572, + "step": 5601 + }, + { + "epoch": 0.97, + "learning_rate": 4.547496470725388e-08, + "loss": 0.4642, + "step": 5602 + }, + { + "epoch": 0.97, + "learning_rate": 4.49419385657357e-08, + "loss": 0.4607, + "step": 5603 + }, + { + "epoch": 0.97, + "learning_rate": 4.4412047714402774e-08, + "loss": 0.4552, + "step": 5604 + }, + { + "epoch": 0.97, + "learning_rate": 4.388529232014271e-08, + "loss": 0.457, + "step": 5605 + }, + { + "epoch": 0.97, + "learning_rate": 4.336167254885393e-08, + "loss": 0.4658, + "step": 5606 + }, + { + "epoch": 0.97, + "learning_rate": 4.2841188565446724e-08, + "loss": 0.474, + "step": 5607 + }, + { + "epoch": 0.97, + "learning_rate": 4.232384053384553e-08, + "loss": 0.4547, + "step": 5608 + }, + { + "epoch": 0.97, + "learning_rate": 4.1809628616985564e-08, + "loss": 0.454, + "step": 5609 + }, + { + "epoch": 0.97, + "learning_rate": 4.129855297681618e-08, + "loss": 0.4624, + "step": 5610 + }, + { + "epoch": 0.97, + "learning_rate": 4.0790613774295274e-08, + "loss": 0.4564, + "step": 5611 + }, + { + "epoch": 0.97, + "learning_rate": 4.028581116939823e-08, + "loss": 0.452, + "step": 5612 + }, + { + "epoch": 0.97, + "learning_rate": 3.978414532110897e-08, + "loss": 0.4578, + "step": 5613 + }, + { + "epoch": 0.97, + "learning_rate": 3.928561638742334e-08, + "loss": 0.447, + "step": 5614 + }, + { + "epoch": 0.97, + "learning_rate": 3.8790224525352416e-08, + "loss": 0.4691, + "step": 5615 + }, + { + "epoch": 0.97, + "learning_rate": 3.829796989091472e-08, + "loss": 0.4469, + "step": 5616 + }, + { + "epoch": 0.97, + "learning_rate": 3.780885263914402e-08, + "loss": 0.4586, + "step": 5617 + }, + { + "epoch": 0.97, + "learning_rate": 3.7322872924084876e-08, + "loss": 0.4491, + "step": 5618 + }, + { + "epoch": 0.97, + "learning_rate": 3.684003089879484e-08, + "loss": 0.4598, + "step": 5619 + }, + { + "epoch": 0.97, + "learning_rate": 3.636032671534229e-08, + "loss": 0.4636, + "step": 5620 + }, + { + "epoch": 0.97, + "learning_rate": 3.5883760524805244e-08, + "loss": 0.4654, + "step": 5621 + }, + { + "epoch": 0.97, + "learning_rate": 3.5410332477278096e-08, + "loss": 0.4607, + "step": 5622 + }, + { + "epoch": 0.97, + "learning_rate": 3.49400427218638e-08, + "loss": 0.4674, + "step": 5623 + }, + { + "epoch": 0.97, + "learning_rate": 3.447289140667609e-08, + "loss": 0.4592, + "step": 5624 + }, + { + "epoch": 0.97, + "learning_rate": 3.4008878678843946e-08, + "loss": 0.4642, + "step": 5625 + }, + { + "epoch": 0.97, + "learning_rate": 3.35480046845027e-08, + "loss": 0.4505, + "step": 5626 + }, + { + "epoch": 0.97, + "learning_rate": 3.309026956880512e-08, + "loss": 0.464, + "step": 5627 + }, + { + "epoch": 0.97, + "learning_rate": 3.2635673475910345e-08, + "loss": 0.4551, + "step": 5628 + }, + { + "epoch": 0.98, + "learning_rate": 3.218421654899162e-08, + "loss": 0.4552, + "step": 5629 + }, + { + "epoch": 0.98, + "learning_rate": 3.173589893023188e-08, + "loss": 0.4475, + "step": 5630 + }, + { + "epoch": 0.98, + "learning_rate": 3.1290720760828176e-08, + "loss": 0.4659, + "step": 5631 + }, + { + "epoch": 0.98, + "learning_rate": 3.0848682180985025e-08, + "loss": 0.4481, + "step": 5632 + }, + { + "epoch": 0.98, + "learning_rate": 3.040978332992106e-08, + "loss": 0.4626, + "step": 5633 + }, + { + "epoch": 0.98, + "learning_rate": 2.9974024345864604e-08, + "loss": 0.457, + "step": 5634 + }, + { + "epoch": 0.98, + "learning_rate": 2.9541405366054764e-08, + "loss": 0.4577, + "step": 5635 + }, + { + "epoch": 0.98, + "learning_rate": 2.9111926526744772e-08, + "loss": 0.4611, + "step": 5636 + }, + { + "epoch": 0.98, + "learning_rate": 2.8685587963194206e-08, + "loss": 0.4485, + "step": 5637 + }, + { + "epoch": 0.98, + "learning_rate": 2.826238980967788e-08, + "loss": 0.453, + "step": 5638 + }, + { + "epoch": 0.98, + "learning_rate": 2.7842332199478074e-08, + "loss": 0.4563, + "step": 5639 + }, + { + "epoch": 0.98, + "learning_rate": 2.7425415264890065e-08, + "loss": 0.4569, + "step": 5640 + }, + { + "epoch": 0.98, + "learning_rate": 2.7011639137221046e-08, + "loss": 0.4554, + "step": 5641 + }, + { + "epoch": 0.98, + "learning_rate": 2.6601003946784555e-08, + "loss": 0.4503, + "step": 5642 + }, + { + "epoch": 0.98, + "learning_rate": 2.6193509822910466e-08, + "loss": 0.4741, + "step": 5643 + }, + { + "epoch": 0.98, + "learning_rate": 2.578915689393613e-08, + "loss": 0.4402, + "step": 5644 + }, + { + "epoch": 0.98, + "learning_rate": 2.538794528720967e-08, + "loss": 0.4572, + "step": 5645 + }, + { + "epoch": 0.98, + "learning_rate": 2.4989875129091124e-08, + "loss": 0.4563, + "step": 5646 + }, + { + "epoch": 0.98, + "learning_rate": 2.4594946544949094e-08, + "loss": 0.4601, + "step": 5647 + }, + { + "epoch": 0.98, + "learning_rate": 2.420315965916631e-08, + "loss": 0.463, + "step": 5648 + }, + { + "epoch": 0.98, + "learning_rate": 2.3814514595132955e-08, + "loss": 0.4661, + "step": 5649 + }, + { + "epoch": 0.98, + "learning_rate": 2.3429011475250008e-08, + "loss": 0.4664, + "step": 5650 + }, + { + "epoch": 0.98, + "learning_rate": 2.304665042092924e-08, + "loss": 0.4606, + "step": 5651 + }, + { + "epoch": 0.98, + "learning_rate": 2.266743155259432e-08, + "loss": 0.457, + "step": 5652 + }, + { + "epoch": 0.98, + "learning_rate": 2.2291354989677492e-08, + "loss": 0.458, + "step": 5653 + }, + { + "epoch": 0.98, + "learning_rate": 2.19184208506229e-08, + "loss": 0.453, + "step": 5654 + }, + { + "epoch": 0.98, + "learning_rate": 2.154862925288326e-08, + "loss": 0.4644, + "step": 5655 + }, + { + "epoch": 0.98, + "learning_rate": 2.118198031292207e-08, + "loss": 0.4466, + "step": 5656 + }, + { + "epoch": 0.98, + "learning_rate": 2.0818474146212518e-08, + "loss": 0.4588, + "step": 5657 + }, + { + "epoch": 0.98, + "learning_rate": 2.045811086724192e-08, + "loss": 0.4617, + "step": 5658 + }, + { + "epoch": 0.98, + "learning_rate": 2.010089058950171e-08, + "loss": 0.4629, + "step": 5659 + }, + { + "epoch": 0.98, + "learning_rate": 1.9746813425498555e-08, + "loss": 0.4552, + "step": 5660 + }, + { + "epoch": 0.98, + "learning_rate": 1.9395879486745483e-08, + "loss": 0.4621, + "step": 5661 + }, + { + "epoch": 0.98, + "learning_rate": 1.9048088883767414e-08, + "loss": 0.453, + "step": 5662 + }, + { + "epoch": 0.98, + "learning_rate": 1.870344172610006e-08, + "loss": 0.4623, + "step": 5663 + }, + { + "epoch": 0.98, + "learning_rate": 1.8361938122287704e-08, + "loss": 0.4524, + "step": 5664 + }, + { + "epoch": 0.98, + "learning_rate": 1.8023578179884315e-08, + "loss": 0.4618, + "step": 5665 + }, + { + "epoch": 0.98, + "learning_rate": 1.7688362005454653e-08, + "loss": 0.4413, + "step": 5666 + }, + { + "epoch": 0.98, + "learning_rate": 1.7356289704574257e-08, + "loss": 0.4618, + "step": 5667 + }, + { + "epoch": 0.98, + "learning_rate": 1.7027361381826147e-08, + "loss": 0.4502, + "step": 5668 + }, + { + "epoch": 0.98, + "learning_rate": 1.6701577140805225e-08, + "loss": 0.4669, + "step": 5669 + }, + { + "epoch": 0.98, + "learning_rate": 1.6378937084114978e-08, + "loss": 0.4464, + "step": 5670 + }, + { + "epoch": 0.98, + "learning_rate": 1.6059441313369672e-08, + "loss": 0.4608, + "step": 5671 + }, + { + "epoch": 0.98, + "learning_rate": 1.5743089929193266e-08, + "loss": 0.4479, + "step": 5672 + }, + { + "epoch": 0.98, + "learning_rate": 1.5429883031217173e-08, + "loss": 0.4573, + "step": 5673 + }, + { + "epoch": 0.98, + "learning_rate": 1.511982071808471e-08, + "loss": 0.4414, + "step": 5674 + }, + { + "epoch": 0.98, + "learning_rate": 1.4812903087448872e-08, + "loss": 0.452, + "step": 5675 + }, + { + "epoch": 0.98, + "learning_rate": 1.4509130235971226e-08, + "loss": 0.4685, + "step": 5676 + }, + { + "epoch": 0.98, + "learning_rate": 1.420850225932302e-08, + "loss": 0.4814, + "step": 5677 + }, + { + "epoch": 0.98, + "learning_rate": 1.3911019252187408e-08, + "loss": 0.4546, + "step": 5678 + }, + { + "epoch": 0.98, + "learning_rate": 1.3616681308251667e-08, + "loss": 0.46, + "step": 5679 + }, + { + "epoch": 0.98, + "learning_rate": 1.332548852021831e-08, + "loss": 0.4617, + "step": 5680 + }, + { + "epoch": 0.98, + "learning_rate": 1.30374409797962e-08, + "loss": 0.4526, + "step": 5681 + }, + { + "epoch": 0.98, + "learning_rate": 1.2752538777704993e-08, + "loss": 0.4703, + "step": 5682 + }, + { + "epoch": 0.98, + "learning_rate": 1.24707820036718e-08, + "loss": 0.4595, + "step": 5683 + }, + { + "epoch": 0.98, + "learning_rate": 1.2192170746434529e-08, + "loss": 0.4681, + "step": 5684 + }, + { + "epoch": 0.98, + "learning_rate": 1.1916705093740766e-08, + "loss": 0.4605, + "step": 5685 + }, + { + "epoch": 0.99, + "learning_rate": 1.164438513234667e-08, + "loss": 0.4527, + "step": 5686 + }, + { + "epoch": 0.99, + "learning_rate": 1.1375210948019188e-08, + "loss": 0.4723, + "step": 5687 + }, + { + "epoch": 0.99, + "learning_rate": 1.1109182625531622e-08, + "loss": 0.4574, + "step": 5688 + }, + { + "epoch": 0.99, + "learning_rate": 1.0846300248668063e-08, + "loss": 0.4398, + "step": 5689 + }, + { + "epoch": 0.99, + "learning_rate": 1.0586563900222279e-08, + "loss": 0.4609, + "step": 5690 + }, + { + "epoch": 0.99, + "learning_rate": 1.0329973661996617e-08, + "loss": 0.4525, + "step": 5691 + }, + { + "epoch": 0.99, + "learning_rate": 1.0076529614804209e-08, + "loss": 0.4581, + "step": 5692 + }, + { + "epoch": 0.99, + "learning_rate": 9.82623183846343e-09, + "loss": 0.437, + "step": 5693 + }, + { + "epoch": 0.99, + "learning_rate": 9.579080411805664e-09, + "loss": 0.4612, + "step": 5694 + }, + { + "epoch": 0.99, + "learning_rate": 9.335075412669758e-09, + "loss": 0.462, + "step": 5695 + }, + { + "epoch": 0.99, + "learning_rate": 9.094216917903131e-09, + "loss": 0.4575, + "step": 5696 + }, + { + "epoch": 0.99, + "learning_rate": 8.85650500336288e-09, + "loss": 0.4613, + "step": 5697 + }, + { + "epoch": 0.99, + "learning_rate": 8.621939743916896e-09, + "loss": 0.4579, + "step": 5698 + }, + { + "epoch": 0.99, + "learning_rate": 8.390521213437197e-09, + "loss": 0.4488, + "step": 5699 + }, + { + "epoch": 0.99, + "learning_rate": 8.162249484809926e-09, + "loss": 0.4607, + "step": 5700 + }, + { + "epoch": 0.99, + "learning_rate": 7.937124629927573e-09, + "loss": 0.4537, + "step": 5701 + }, + { + "epoch": 0.99, + "learning_rate": 7.715146719691202e-09, + "loss": 0.4603, + "step": 5702 + }, + { + "epoch": 0.99, + "learning_rate": 7.496315824012667e-09, + "loss": 0.4492, + "step": 5703 + }, + { + "epoch": 0.99, + "learning_rate": 7.280632011810174e-09, + "loss": 0.4596, + "step": 5704 + }, + { + "epoch": 0.99, + "learning_rate": 7.068095351013826e-09, + "loss": 0.468, + "step": 5705 + }, + { + "epoch": 0.99, + "learning_rate": 6.858705908560081e-09, + "loss": 0.4609, + "step": 5706 + }, + { + "epoch": 0.99, + "learning_rate": 6.6524637503939675e-09, + "loss": 0.4503, + "step": 5707 + }, + { + "epoch": 0.99, + "learning_rate": 6.449368941471301e-09, + "loss": 0.4484, + "step": 5708 + }, + { + "epoch": 0.99, + "learning_rate": 6.249421545755363e-09, + "loss": 0.4571, + "step": 5709 + }, + { + "epoch": 0.99, + "learning_rate": 6.052621626219113e-09, + "loss": 0.4479, + "step": 5710 + }, + { + "epoch": 0.99, + "learning_rate": 5.858969244842971e-09, + "loss": 0.4516, + "step": 5711 + }, + { + "epoch": 0.99, + "learning_rate": 5.66846446261704e-09, + "loss": 0.4506, + "step": 5712 + }, + { + "epoch": 0.99, + "learning_rate": 5.4811073395388824e-09, + "loss": 0.4515, + "step": 5713 + }, + { + "epoch": 0.99, + "learning_rate": 5.296897934616852e-09, + "loss": 0.4621, + "step": 5714 + }, + { + "epoch": 0.99, + "learning_rate": 5.115836305865651e-09, + "loss": 0.4417, + "step": 5715 + }, + { + "epoch": 0.99, + "learning_rate": 4.937922510310778e-09, + "loss": 0.4559, + "step": 5716 + }, + { + "epoch": 0.99, + "learning_rate": 4.763156603984076e-09, + "loss": 0.4506, + "step": 5717 + }, + { + "epoch": 0.99, + "learning_rate": 4.591538641927074e-09, + "loss": 0.4551, + "step": 5718 + }, + { + "epoch": 0.99, + "learning_rate": 4.423068678189868e-09, + "loss": 0.4479, + "step": 5719 + }, + { + "epoch": 0.99, + "learning_rate": 4.257746765832238e-09, + "loss": 0.4606, + "step": 5720 + }, + { + "epoch": 0.99, + "learning_rate": 4.095572956921423e-09, + "loss": 0.4519, + "step": 5721 + }, + { + "epoch": 0.99, + "learning_rate": 3.9365473025321235e-09, + "loss": 0.4559, + "step": 5722 + }, + { + "epoch": 0.99, + "learning_rate": 3.780669852747609e-09, + "loss": 0.4548, + "step": 5723 + }, + { + "epoch": 0.99, + "learning_rate": 3.6279406566630536e-09, + "loss": 0.4606, + "step": 5724 + }, + { + "epoch": 0.99, + "learning_rate": 3.478359762378869e-09, + "loss": 0.4356, + "step": 5725 + }, + { + "epoch": 0.99, + "learning_rate": 3.3319272170040384e-09, + "loss": 0.4643, + "step": 5726 + }, + { + "epoch": 0.99, + "learning_rate": 3.1886430666561163e-09, + "loss": 0.4449, + "step": 5727 + }, + { + "epoch": 0.99, + "learning_rate": 3.048507356463448e-09, + "loss": 0.4708, + "step": 5728 + }, + { + "epoch": 0.99, + "learning_rate": 2.91152013056073e-09, + "loss": 0.4509, + "step": 5729 + }, + { + "epoch": 0.99, + "learning_rate": 2.777681432090118e-09, + "loss": 0.4714, + "step": 5730 + }, + { + "epoch": 0.99, + "learning_rate": 2.64699130320345e-09, + "loss": 0.4528, + "step": 5731 + }, + { + "epoch": 0.99, + "learning_rate": 2.5194497850622447e-09, + "loss": 0.4606, + "step": 5732 + }, + { + "epoch": 0.99, + "learning_rate": 2.3950569178332605e-09, + "loss": 0.4521, + "step": 5733 + }, + { + "epoch": 0.99, + "learning_rate": 2.2738127406951583e-09, + "loss": 0.4608, + "step": 5734 + }, + { + "epoch": 0.99, + "learning_rate": 2.155717291830728e-09, + "loss": 0.4646, + "step": 5735 + }, + { + "epoch": 0.99, + "learning_rate": 2.0407706084368816e-09, + "loss": 0.4617, + "step": 5736 + }, + { + "epoch": 0.99, + "learning_rate": 1.9289727267124416e-09, + "loss": 0.4423, + "step": 5737 + }, + { + "epoch": 0.99, + "learning_rate": 1.8203236818681302e-09, + "loss": 0.4593, + "step": 5738 + }, + { + "epoch": 0.99, + "learning_rate": 1.7148235081232424e-09, + "loss": 0.451, + "step": 5739 + }, + { + "epoch": 0.99, + "learning_rate": 1.6124722387034219e-09, + "loss": 0.4544, + "step": 5740 + }, + { + "epoch": 0.99, + "learning_rate": 1.513269905845105e-09, + "loss": 0.4566, + "step": 5741 + }, + { + "epoch": 0.99, + "learning_rate": 1.4172165407899675e-09, + "loss": 0.4664, + "step": 5742 + }, + { + "epoch": 0.99, + "learning_rate": 1.3243121737904763e-09, + "loss": 0.4471, + "step": 5743 + }, + { + "epoch": 1.0, + "learning_rate": 1.2345568341065595e-09, + "loss": 0.4498, + "step": 5744 + }, + { + "epoch": 1.0, + "learning_rate": 1.1479505500044952e-09, + "loss": 0.4469, + "step": 5745 + }, + { + "epoch": 1.0, + "learning_rate": 1.064493348762463e-09, + "loss": 0.4601, + "step": 5746 + }, + { + "epoch": 1.0, + "learning_rate": 9.841852566638833e-10, + "loss": 0.4555, + "step": 5747 + }, + { + "epoch": 1.0, + "learning_rate": 9.070262990007462e-10, + "loss": 0.4535, + "step": 5748 + }, + { + "epoch": 1.0, + "learning_rate": 8.330165000758339e-10, + "loss": 0.4618, + "step": 5749 + }, + { + "epoch": 1.0, + "learning_rate": 7.621558831949482e-10, + "loss": 0.4579, + "step": 5750 + }, + { + "epoch": 1.0, + "learning_rate": 6.944444706791231e-10, + "loss": 0.4425, + "step": 5751 + }, + { + "epoch": 1.0, + "learning_rate": 6.298822838501917e-10, + "loss": 0.4656, + "step": 5752 + }, + { + "epoch": 1.0, + "learning_rate": 5.684693430429988e-10, + "loss": 0.4469, + "step": 5753 + }, + { + "epoch": 1.0, + "learning_rate": 5.102056675998501e-10, + "loss": 0.4556, + "step": 5754 + }, + { + "epoch": 1.0, + "learning_rate": 4.550912758705117e-10, + "loss": 0.4572, + "step": 5755 + }, + { + "epoch": 1.0, + "learning_rate": 4.0312618521221034e-10, + "loss": 0.4509, + "step": 5756 + }, + { + "epoch": 1.0, + "learning_rate": 3.543104119907437e-10, + "loss": 0.457, + "step": 5757 + }, + { + "epoch": 1.0, + "learning_rate": 3.086439715815903e-10, + "loss": 0.4623, + "step": 5758 + }, + { + "epoch": 1.0, + "learning_rate": 2.6612687836657937e-10, + "loss": 0.4452, + "step": 5759 + }, + { + "epoch": 1.0, + "learning_rate": 2.2675914573611068e-10, + "loss": 0.4547, + "step": 5760 + }, + { + "epoch": 1.0, + "learning_rate": 1.9054078608804482e-10, + "loss": 0.4651, + "step": 5761 + }, + { + "epoch": 1.0, + "learning_rate": 1.5747181083103357e-10, + "loss": 0.4642, + "step": 5762 + }, + { + "epoch": 1.0, + "learning_rate": 1.2755223037896892e-10, + "loss": 0.4387, + "step": 5763 + }, + { + "epoch": 1.0, + "learning_rate": 1.0078205415431364e-10, + "loss": 0.4611, + "step": 5764 + }, + { + "epoch": 1.0, + "learning_rate": 7.716129058921162e-11, + "loss": 0.4515, + "step": 5765 + }, + { + "epoch": 1.0, + "learning_rate": 5.668994712104692e-11, + "loss": 0.4597, + "step": 5766 + }, + { + "epoch": 1.0, + "learning_rate": 3.936803020021529e-11, + "loss": 0.4535, + "step": 5767 + }, + { + "epoch": 1.0, + "learning_rate": 2.5195545279022016e-11, + "loss": 0.4563, + "step": 5768 + }, + { + "epoch": 1.0, + "learning_rate": 1.4172496823894322e-11, + "loss": 0.436, + "step": 5769 + }, + { + "epoch": 1.0, + "learning_rate": 6.298888303168938e-12, + "loss": 0.4587, + "step": 5770 + }, + { + "epoch": 1.0, + "learning_rate": 1.574722200414769e-12, + "loss": 0.4541, + "step": 5771 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "loss": 0.4532, + "step": 5772 + }, + { + "epoch": 1.0, + "step": 5772, + "total_flos": 0.0, + "train_loss": 0.10881091421437776, + "train_runtime": 8641.6411, + "train_samples_per_second": 344.272, + "train_steps_per_second": 0.668 + } + ], + "logging_steps": 1.0, + "max_steps": 5772, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/vision_tower/config.json b/vision_tower/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f4fe9b42cb69f3068b403fd430ff0439784abad --- /dev/null +++ b/vision_tower/config.json @@ -0,0 +1,19 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_path_mask/vision_tower", + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "patch_size": 14, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/vision_tower/model.safetensors b/vision_tower/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9adf768e8289c94233258f4672813cc1c0ab1825 --- /dev/null +++ b/vision_tower/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c333d82ccac00f9786f5bdefaa86cf133042db0f96caa300d407866281bc5bd +size 856506120 diff --git a/vision_tower/preprocessor_config.json b/vision_tower/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f13134ed29056f82f3ab7e0246f0ab973e7ecf3 --- /dev/null +++ b/vision_tower/preprocessor_config.json @@ -0,0 +1,24 @@ +{ + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "SiglipImageProcessor", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "SiglipProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } +}