diff --git a/checkpoint-8700/config.json b/checkpoint-8700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..820a591453c57ae542b8bc08b9f7eea6898b7395 --- /dev/null +++ b/checkpoint-8700/config.json @@ -0,0 +1,253 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700", + "architectures": [ + "LlavaLlamaModel" + ], + "drop_path_rate": 0.0, + "hidden_size": 2560, + "image_aspect_ratio": "resize", + "interpolate_mode": "linear", + "llm_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/llm", + "add_cross_attention": false, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2560, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6912, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 4096, + "min_length": 0, + "model_max_length": 4096, + "model_type": "llama", + "no_repeat_ngram_size": 0, + "num_attention_heads": 20, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": 
false, + "pad_token_id": 0, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": false, + "vocab_size": 32000 + }, + "mm_hidden_size": 1152, + "mm_projector_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/mm_projector", + "add_cross_attention": false, + "architectures": [ + "MultimodalProjector" + ], + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": 
false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "mm_projector_lr": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "cls_patch", + "mm_vision_select_layer": -2, + "model_dtype": "torch.bfloat16", + "model_type": "llava_llama", + "num_video_frames": 8, + "resume_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/checkpoint-7000", + "s2": false, + "s2_max_split_size": 336, + "s2_scales": "336,672,1008", + "transformers_version": "4.36.2", + "tune_language_model": true, + "tune_mm_projector": true, + "tune_vision_tower": true, + "vision_resolution": -1, + "vision_tower_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/vision_tower", + "add_cross_attention": false, + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": 
"gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 384, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + } +} diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2835b1b01c5b7225192df488dfd4d7d38e615674 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78cf2093bbaab2171e7bc9a904c72eacfdb8e57165dc45588c954f194f4720bd +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b1d55fcd6726909305b9959d443d5a1ac1185a05 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b1eb14bd71544adf8fbd7d58ceba441e6f41e027a483e5f94d31526fd09c436 +size 2361117185 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83404694c729bba9d9fe338b23d303f968cc53fe --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aadce2d9804841cb74193dab63e4875231e65f53c9901467abf3ef79d5f2e0e4 +size 2361117185 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ccd7e124fdfc9eb901bcc687c88bf74111973d5 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd60dd63746253a89578b12be8e1478d70915e8af4897c35f7353b9af227ba2 +size 2361117185 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7fb848857ee5fda059b1dec8e2652d2d005acba --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec2cf28aa6764be766a04d525a06c084f8909e7f39030586c04fe4d1be978be +size 2361117185 diff --git 
a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af4d58d87d790d16f93260a6798e8c9f912711d4 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:837d8cb41a3a8152201887325998253becfca186acd5f3f4296bf10bd64fa760 +size 2361117185 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..718a1fc3acd02e65c5554a301a3578027464136a --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16de79772c9f510c95ed0708862fa47361428370981bc25db36c541f3f5831da +size 2361117185 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59b1b18714e32f397db4075ef50a919ac6ae9356 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9146bcde0640dbe48dab01efa1b9438eaadd38eaa090e283d3c75c647f8b88e +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2bdf2bef319c16ebd8919abfedd818bc21cbda6 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876509abf46748c0725423f4a190e783d12e1afd5645e596feabcfa94a09eb58 +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7db8f4ea98bcedbd913db3aaca3e4bc4bd98a958 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb61beecb7c7ca378965bf2e9fa626fc617b7aa7e269dd6d1000b037698d0dc4 +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95dd3b8d814ea7d50d2348587ec57b08e32b8929 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc41738696670e89427b97a2783f961fdf8c501ce04d0f7ce3238d7ba177daf +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..036223f411fc6ecf8b73f001e015e75ecd4cbcc7 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:832e2564f0f49db4b7efc5fa5149bdc2192baab4b018041bb8876d6fb4d5b3db +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b9de6fa795a7ae6a8d25cfe345dc0b3b950951e2 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf30c5ca64b82e24298ef493690a62e24cdd2d689cc56115e05bda0d1483cde +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fcdbd01d2c019def122fa56db2a8066dd87698c --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe95762c71f57f4cf40f3264760422f95277bfb655fe9d1314736514fae495e +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db97ab99e57662d3c21f4d7ed8b9fcfd5ec33459 --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63c5a14ac01bc36389117a7a0c97cf5095c728f41fb3608ea0b02291385cd37b +size 2361117175 diff --git a/checkpoint-8700/global_step8700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8abeba9f1fda5361ee019356c2a9ff54ae78a65e --- /dev/null +++ b/checkpoint-8700/global_step8700/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ac71527e85f9778c00f4c0b5bd48b5486fc0a15f26ae12cb6e58452ebc3444 +size 2361117175 diff --git 
a/checkpoint-8700/global_step8700/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7548514e4ea1728ae22b366a413a9bdf5fd45acc --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c392ae1cf60ca191375c128709d5ac489ae36ad649ad72f0dfa144acc5c8f2a6 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_10_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_10_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..297cf538fafbf30d91a73173b7e8bf2d51152481 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_10_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3da7aeb2a742b4a7c82786e7e9204e6b5a3dcb34a89c217c5ffcee2f464701b +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_11_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_11_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89e0b9f6b6a167a0b428d52a6bdfd983df9bedf0 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_11_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c392bb724876fd6881905f78053d0825183854893c764028012db92f3b7326a5 +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_12_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_12_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cd33f7c071d8e7745e172acf48ba72a78b19292 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_12_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4a27093f96aff3b0576e01ccfca437a718b929ebd7f80bab16ae4994a87076c +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_13_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_13_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38afe6c99501010f90b163be9f7ebf716ca352dd --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_13_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d33b54a7200f44c695e7f9d5ee56a7267fa4983099007794032d7b17caf8fab2 +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_14_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_14_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..432838e9886e1f4fafdf09cb5c656367c1491fd2 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_14_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9681082546c6199d1f86e5e20462f019dd28937c1305b1c3c90a3356a756769c +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_15_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_15_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1922e8dedff05c28a311d2679ad54ae10b4b2be5 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_15_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589e5894a60b42bf0c4d2dbe731ee2c56550dc20c4812cb2e97c8ab79b3e847c +size 414735 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23eb563b05b82aefdc3b0dd1b39c6712d6c5f82f --- /dev/null +++ 
b/checkpoint-8700/global_step8700/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00efd683197584fead835a124592caa3d77c9ec371a9640691eaa713f4d3e852 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fecab6719acb5cae382c0a6c7f08fbeb24aacadd --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8110cf34f702ac6c2d547f943555ec81c687f49be9b2c698e0d61fd0e753e434 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61ce6392a8343a064493f79f08f2df87d240e549 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9415ee6e8d00f8637ae90fb84fce6eb0b52199a197746cba20bc5afe1221b331 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11206e1aab1ac876fb9976b1a277ee0bd7d32a85 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93bdb24f247533420940f038fb38cce85e836556870eecde567b2f40185f704 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ac615862c82f2b09ae4a0b1be8bb3a69478c19b8 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e04929e47dc577b42fbbf355fbf152a4f49c69d3d070719fb65d99422b9761 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1d3267fd120e6cc83058ccd752f109b9a11f44 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d949b3c042c094cd5da5c19fb55222d9204dc5f49588bf0fb36b996f5d8d8bb8 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..358a8cc29b7fb58a2e6028d5e3eb6927808fbdac --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276822847b9858d17e272ec4b7749678280adb032a2a16a71098df5cc0987565 +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-8700/global_step8700/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3d342a07199bd7d7a0e36c516bcddc1bee4df3e --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed41c972fccc697c200ab811a95c7d178ca4e0894adcae8587c96e8b6ebeabaa +size 413988 diff --git a/checkpoint-8700/global_step8700/zero_pp_rank_9_mp_rank_00_model_states.pt 
b/checkpoint-8700/global_step8700/zero_pp_rank_9_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1918bd8278fac24e031f7c94dcba2fcda65ac76 --- /dev/null +++ b/checkpoint-8700/global_step8700/zero_pp_rank_9_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d9d4648064af665b895c90abdd26cf846b31098667f11289ffca01717250b8 +size 413988 diff --git a/checkpoint-8700/latest b/checkpoint-8700/latest new file mode 100644 index 0000000000000000000000000000000000000000..3eb3c503e73bb996d41443e5ef2b09ab922c9efe --- /dev/null +++ b/checkpoint-8700/latest @@ -0,0 +1 @@ +global_step8700 \ No newline at end of file diff --git a/checkpoint-8700/llm/config.json b/checkpoint-8700/llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ea31ae2ad5c8b131784647e2615fadbdafa042 --- /dev/null +++ b/checkpoint-8700/llm/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/llm", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 4096, + "model_max_length": 4096, + "model_type": "llama", + "num_attention_heads": 20, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/checkpoint-8700/llm/generation_config.json b/checkpoint-8700/llm/generation_config.json new file mode 100644 
index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f --- /dev/null +++ b/checkpoint-8700/llm/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.36.2" +} diff --git a/checkpoint-8700/llm/model-00001-of-00002.safetensors b/checkpoint-8700/llm/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c2bf87a0d376f6068c5a5261109ae872a3cd6ff --- /dev/null +++ b/checkpoint-8700/llm/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a3881c7d15f91b15b0dbd0e42f3f64082b9400bfe43781ae5f17a333417390b +size 4974521464 diff --git a/checkpoint-8700/llm/model-00002-of-00002.safetensors b/checkpoint-8700/llm/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ceae00ccdd17b7161c81829a481c0ee2b593dc5 --- /dev/null +++ b/checkpoint-8700/llm/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ac1e6ed87765e83ca1e6e75e5b4044d4b8e9c7428cf9fe2eb6578e6491e3e6 +size 428632856 diff --git a/checkpoint-8700/llm/model.safetensors.index.json b/checkpoint-8700/llm/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b173c9ac8194749df58c92051618c0ff74c4c20 --- /dev/null +++ b/checkpoint-8700/llm/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 5403120640 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-8700/llm/special_tokens_map.json b/checkpoint-8700/llm/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-8700/llm/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-8700/llm/tokenizer.model b/checkpoint-8700/llm/tokenizer.model new file mode 100644 index 
0000000000000000000000000000000000000000..3b7eab905db502ae7629c8a3c1f8412a3178c4c2 --- /dev/null +++ b/checkpoint-8700/llm/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aedb3582ecda9fa99ee9242c17a9658f6744db083ee6ebdc8fb14857f84d220 +size 499723 diff --git a/checkpoint-8700/llm/tokenizer_config.json b/checkpoint-8700/llm/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..47ab96cd62cc374653a0ea0fb77f9457e0f53481 --- /dev/null +++ b/checkpoint-8700/llm/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 4096, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-8700/mm_projector/config.json b/checkpoint-8700/mm_projector/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2817811890a44c58170492eb8710c2523b70a74c --- /dev/null +++ b/checkpoint-8700/mm_projector/config.json @@ -0,0 +1,10 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/mm_projector", + "architectures": [ + "MultimodalProjector" + ], + "mm_projector_type": "mlp_downsample", + "model_type": 
"v2l_projector", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/checkpoint-8700/mm_projector/model.safetensors b/checkpoint-8700/mm_projector/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c568243efc41a8837c77d7c9ab9d40412881a0b2 --- /dev/null +++ b/checkpoint-8700/mm_projector/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c03fb1f3b76192c22eb31a30b72b613121bbe3bd334bee0aa1851b6c6117d4a +size 36729360 diff --git a/checkpoint-8700/rng_state_0.pth b/checkpoint-8700/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7357683b224bd50c6799b1bb1148fde24ec31fbb --- /dev/null +++ b/checkpoint-8700/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d427f50a4b918abd1b29c08d181c3049d53c751f7d62ab8b589e9737cdeb3c89 +size 21687 diff --git a/checkpoint-8700/rng_state_1.pth b/checkpoint-8700/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..49ad9e43aec674d3c18b519d2740d5dff68cd149 --- /dev/null +++ b/checkpoint-8700/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4f325e5dfaf9edb6ba115f90d6556217315523f097f447331e9dc48b74c1307 +size 21687 diff --git a/checkpoint-8700/rng_state_10.pth b/checkpoint-8700/rng_state_10.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca3e40f507002de0adc16c89128af212cc563582 --- /dev/null +++ b/checkpoint-8700/rng_state_10.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ac5d961aa2a73a29239939231d8b8141771abfcb7500a5de06838964fd971b +size 21698 diff --git a/checkpoint-8700/rng_state_11.pth b/checkpoint-8700/rng_state_11.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d50a25b910824990eb0be0864a8ac894ed7658c --- /dev/null +++ b/checkpoint-8700/rng_state_11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:a01ff5444331cc8871752d42550373325f51424611622467ed0e9ee1aa22f87d +size 21698 diff --git a/checkpoint-8700/rng_state_12.pth b/checkpoint-8700/rng_state_12.pth new file mode 100644 index 0000000000000000000000000000000000000000..61f70194329a4ed9aff159182fb6e3b2b392a677 --- /dev/null +++ b/checkpoint-8700/rng_state_12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:111bbdacabcb61556c1b120e649265d3ebcd5f731b84556a1721f16a895acaf7 +size 21698 diff --git a/checkpoint-8700/rng_state_13.pth b/checkpoint-8700/rng_state_13.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbd4d0b90613ec4125ba805509483db8cbbd3f40 --- /dev/null +++ b/checkpoint-8700/rng_state_13.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20517a44d2277a5d5b1fe5ab6b227bcfd4e753cd4be36f0b0034f58550f6b2ed +size 21698 diff --git a/checkpoint-8700/rng_state_14.pth b/checkpoint-8700/rng_state_14.pth new file mode 100644 index 0000000000000000000000000000000000000000..f5f0c72bd0cc100d50bbec3b44c6549f203cb433 --- /dev/null +++ b/checkpoint-8700/rng_state_14.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c66bebd40c0f394c921303f3b990fa506066ff9bb7ed0d144d1279eec0f731 +size 21698 diff --git a/checkpoint-8700/rng_state_15.pth b/checkpoint-8700/rng_state_15.pth new file mode 100644 index 0000000000000000000000000000000000000000..966589c93f261f987716f75844c703c0e24c97fe --- /dev/null +++ b/checkpoint-8700/rng_state_15.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3d7a2ec2fb05adcf8664ae6f7b703554a749c14cb25a624a5f9df262dedab6 +size 21698 diff --git a/checkpoint-8700/rng_state_2.pth b/checkpoint-8700/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e21be780e54e51059480574390edc58d5999b59 --- /dev/null +++ b/checkpoint-8700/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:61e8b1a9d6d5a705eb7b6e95b4fa642f55ec6b936a0b40ae1f905fb7a423d457 +size 21687 diff --git a/checkpoint-8700/rng_state_3.pth b/checkpoint-8700/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..58f04d0066baa34677058666a9a8f9ced4779253 --- /dev/null +++ b/checkpoint-8700/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35d4f44396a4d158d487efb069b1730120fd2fe73e51c27c3e0ef4140b980cc2 +size 21687 diff --git a/checkpoint-8700/rng_state_4.pth b/checkpoint-8700/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9842eca0701a872930c0b054d155df7f7c957064 --- /dev/null +++ b/checkpoint-8700/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85309093672fdc00ecc5bd04d2c9ba991f69a8eacc779a1115f4c8e42be6e587 +size 21687 diff --git a/checkpoint-8700/rng_state_5.pth b/checkpoint-8700/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c3dfcdafa5d6a8f5679a1edcf7e18f44f58976e --- /dev/null +++ b/checkpoint-8700/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8386870a115c3c49653ba8f3afdd005246b42cd93080bc934bbf0e9e5c2d8bf8 +size 21687 diff --git a/checkpoint-8700/rng_state_6.pth b/checkpoint-8700/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..029d7aee69bd7e66668863f46f8b93cab5777b69 --- /dev/null +++ b/checkpoint-8700/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd3ea98950fb5c8934736f7e25c900e3750aa9e89de4c3faf6a89a3ceb99e887 +size 21687 diff --git a/checkpoint-8700/rng_state_7.pth b/checkpoint-8700/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d1bf055f7d2c45ffb40e3ad2dcfe00a13a0f430 --- /dev/null +++ b/checkpoint-8700/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b9bd71ea40adc6512d7fdefcafc414e2db022aa983a997d92658578f6e64750f +size 21687 diff --git a/checkpoint-8700/rng_state_8.pth b/checkpoint-8700/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd73cd1f3677bf7fae10e3eff6f4683a7a89bf53 --- /dev/null +++ b/checkpoint-8700/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:002a5f6750e8050785688d49c27945a76f04bd95d50612e05438098a05ce2c27 +size 21687 diff --git a/checkpoint-8700/rng_state_9.pth b/checkpoint-8700/rng_state_9.pth new file mode 100644 index 0000000000000000000000000000000000000000..33f7b52fc83c4a295ff7019ecfc65de9ec5b28e5 --- /dev/null +++ b/checkpoint-8700/rng_state_9.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0838b9ced648ee98ac59be84ebd8ab1a9f45e0915c1b4419d1d6a7f9fe75279 +size 21687 diff --git a/checkpoint-8700/scheduler.pt b/checkpoint-8700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d8f92f0275f580f3ed0b7aa80a5a1fa37539954 --- /dev/null +++ b/checkpoint-8700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6889f97e3d1700599a04c1a1578c3e8d7766ad3fba9e1646d63a3c71db6ae75d +size 627 diff --git a/checkpoint-8700/trainer_state.json b/checkpoint-8700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..980a226502d3532e8764b3afdf0556313111db46 --- /dev/null +++ b/checkpoint-8700/trainer_state.json @@ -0,0 +1,52221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9942857142857143, + "eval_steps": 500, + "global_step": 8700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.604562737642586e-08, + "loss": 0.8388, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.5209125475285173e-07, + "loss": 0.8142, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 
2.281368821292776e-07, + "loss": 0.8402, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 3.0418250950570346e-07, + "loss": 0.808, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 3.802281368821293e-07, + "loss": 0.8206, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 4.562737642585552e-07, + "loss": 0.8594, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 5.32319391634981e-07, + "loss": 0.8293, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 6.083650190114069e-07, + "loss": 0.8594, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 6.844106463878328e-07, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 7.604562737642586e-07, + "loss": 0.8156, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 8.365019011406844e-07, + "loss": 0.8078, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 9.125475285171104e-07, + "loss": 0.7722, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 9.885931558935361e-07, + "loss": 0.799, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 1.064638783269962e-06, + "loss": 0.8171, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.140684410646388e-06, + "loss": 0.7761, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 1.2167300380228138e-06, + "loss": 0.7554, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 1.2927756653992395e-06, + "loss": 0.7353, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 1.3688212927756656e-06, + "loss": 0.7562, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 1.4448669201520913e-06, + "loss": 0.7364, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 1.5209125475285172e-06, + "loss": 0.7192, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.5969581749049431e-06, + "loss": 0.7345, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 1.6730038022813688e-06, + "loss": 0.704, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 1.7490494296577947e-06, + "loss": 0.6885, 
+ "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 1.8250950570342208e-06, + "loss": 0.6952, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 1.9011406844106463e-06, + "loss": 0.679, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 1.9771863117870722e-06, + "loss": 0.6861, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 2.053231939163498e-06, + "loss": 0.6799, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 2.129277566539924e-06, + "loss": 0.669, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 2.20532319391635e-06, + "loss": 0.6709, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 2.281368821292776e-06, + "loss": 0.6729, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 2.3574144486692017e-06, + "loss": 0.6522, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 2.4334600760456276e-06, + "loss": 0.6696, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 2.509505703422053e-06, + "loss": 0.6507, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 2.585551330798479e-06, + "loss": 0.6338, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 2.6615969581749054e-06, + "loss": 0.6667, + "step": 35 + }, + { + "epoch": 0.0, + "learning_rate": 2.7376425855513313e-06, + "loss": 0.6551, + "step": 36 + }, + { + "epoch": 0.0, + "learning_rate": 2.813688212927757e-06, + "loss": 0.6158, + "step": 37 + }, + { + "epoch": 0.0, + "learning_rate": 2.8897338403041826e-06, + "loss": 0.6222, + "step": 38 + }, + { + "epoch": 0.0, + "learning_rate": 2.9657794676806085e-06, + "loss": 0.628, + "step": 39 + }, + { + "epoch": 0.0, + "learning_rate": 3.0418250950570345e-06, + "loss": 0.6507, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 3.1178707224334604e-06, + "loss": 0.6518, + "step": 41 + }, + { + "epoch": 0.0, + "learning_rate": 3.1939163498098863e-06, + "loss": 0.6297, + "step": 42 + }, + { + "epoch": 0.0, + "learning_rate": 3.269961977186312e-06, + "loss": 0.6228, + "step": 43 + }, + { + 
"epoch": 0.01, + "learning_rate": 3.3460076045627376e-06, + "loss": 0.6278, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 3.4220532319391635e-06, + "loss": 0.6226, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 3.4980988593155894e-06, + "loss": 0.6057, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 3.5741444866920154e-06, + "loss": 0.6189, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 3.6501901140684417e-06, + "loss": 0.6293, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 3.7262357414448676e-06, + "loss": 0.6231, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 3.8022813688212926e-06, + "loss": 0.629, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.8783269961977185e-06, + "loss": 0.6011, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 3.9543726235741444e-06, + "loss": 0.6084, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 4.03041825095057e-06, + "loss": 0.5854, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 4.106463878326996e-06, + "loss": 0.6004, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 4.182509505703423e-06, + "loss": 0.5955, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 4.258555133079848e-06, + "loss": 0.6117, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 4.334600760456274e-06, + "loss": 0.5993, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 4.4106463878327e-06, + "loss": 0.6063, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 4.486692015209126e-06, + "loss": 0.5858, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 4.562737642585552e-06, + "loss": 0.6101, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.638783269961978e-06, + "loss": 0.5843, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 4.7148288973384035e-06, + "loss": 0.5875, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 4.790874524714829e-06, + "loss": 0.5833, + "step": 63 + }, + { + "epoch": 
0.01, + "learning_rate": 4.866920152091255e-06, + "loss": 0.5843, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 4.942965779467681e-06, + "loss": 0.5836, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 5.019011406844106e-06, + "loss": 0.6352, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 5.095057034220533e-06, + "loss": 0.5787, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 5.171102661596958e-06, + "loss": 0.59, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 5.247148288973385e-06, + "loss": 0.5917, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 5.323193916349811e-06, + "loss": 0.5731, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.399239543726236e-06, + "loss": 0.5789, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 5.4752851711026625e-06, + "loss": 0.581, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 5.5513307984790876e-06, + "loss": 0.5857, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 5.627376425855514e-06, + "loss": 0.5863, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 5.703422053231939e-06, + "loss": 0.5802, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 5.779467680608365e-06, + "loss": 0.5812, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 5.855513307984791e-06, + "loss": 0.5775, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 5.931558935361217e-06, + "loss": 0.5732, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 6.007604562737643e-06, + "loss": 0.5573, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 6.083650190114069e-06, + "loss": 0.5563, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 6.159695817490496e-06, + "loss": 0.5797, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 6.235741444866921e-06, + "loss": 0.5945, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 6.311787072243346e-06, + "loss": 0.5757, + "step": 83 + }, + { + "epoch": 0.01, + 
"learning_rate": 6.3878326996197725e-06, + "loss": 0.569, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 6.4638783269961976e-06, + "loss": 0.5691, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 6.539923954372624e-06, + "loss": 0.5818, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 6.61596958174905e-06, + "loss": 0.5869, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 6.692015209125475e-06, + "loss": 0.5624, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 6.768060836501902e-06, + "loss": 0.561, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 6.844106463878327e-06, + "loss": 0.5586, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.920152091254754e-06, + "loss": 0.585, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 6.996197718631179e-06, + "loss": 0.575, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 7.072243346007606e-06, + "loss": 0.5674, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 7.148288973384031e-06, + "loss": 0.5803, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 7.224334600760457e-06, + "loss": 0.5745, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 7.300380228136883e-06, + "loss": 0.5678, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 7.376425855513308e-06, + "loss": 0.5495, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 7.452471482889735e-06, + "loss": 0.5583, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 7.52851711026616e-06, + "loss": 0.5921, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 7.604562737642585e-06, + "loss": 0.564, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.680608365019012e-06, + "loss": 0.5552, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 7.756653992395437e-06, + "loss": 0.5649, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 7.832699619771864e-06, + "loss": 0.5691, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 
7.908745247148289e-06, + "loss": 0.5722, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 7.984790874524716e-06, + "loss": 0.5691, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 8.06083650190114e-06, + "loss": 0.5728, + "step": 106 + }, + { + "epoch": 0.01, + "learning_rate": 8.136882129277567e-06, + "loss": 0.5799, + "step": 107 + }, + { + "epoch": 0.01, + "learning_rate": 8.212927756653993e-06, + "loss": 0.5636, + "step": 108 + }, + { + "epoch": 0.01, + "learning_rate": 8.28897338403042e-06, + "loss": 0.5666, + "step": 109 + }, + { + "epoch": 0.01, + "learning_rate": 8.365019011406846e-06, + "loss": 0.5627, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 8.441064638783271e-06, + "loss": 0.5494, + "step": 111 + }, + { + "epoch": 0.01, + "learning_rate": 8.517110266159696e-06, + "loss": 0.5716, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 8.593155893536123e-06, + "loss": 0.5442, + "step": 113 + }, + { + "epoch": 0.01, + "learning_rate": 8.669201520912548e-06, + "loss": 0.5698, + "step": 114 + }, + { + "epoch": 0.01, + "learning_rate": 8.745247148288975e-06, + "loss": 0.5592, + "step": 115 + }, + { + "epoch": 0.01, + "learning_rate": 8.8212927756654e-06, + "loss": 0.5573, + "step": 116 + }, + { + "epoch": 0.01, + "learning_rate": 8.897338403041825e-06, + "loss": 0.5634, + "step": 117 + }, + { + "epoch": 0.01, + "learning_rate": 8.973384030418252e-06, + "loss": 0.5451, + "step": 118 + }, + { + "epoch": 0.01, + "learning_rate": 9.049429657794677e-06, + "loss": 0.5629, + "step": 119 + }, + { + "epoch": 0.01, + "learning_rate": 9.125475285171103e-06, + "loss": 0.5551, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 9.201520912547528e-06, + "loss": 0.547, + "step": 121 + }, + { + "epoch": 0.01, + "learning_rate": 9.277566539923955e-06, + "loss": 0.5649, + "step": 122 + }, + { + "epoch": 0.01, + "learning_rate": 9.35361216730038e-06, + "loss": 0.5516, + "step": 123 + }, + { + "epoch": 0.01, + "learning_rate": 
9.429657794676807e-06, + "loss": 0.5661, + "step": 124 + }, + { + "epoch": 0.01, + "learning_rate": 9.505703422053234e-06, + "loss": 0.5619, + "step": 125 + }, + { + "epoch": 0.01, + "learning_rate": 9.581749049429659e-06, + "loss": 0.5714, + "step": 126 + }, + { + "epoch": 0.01, + "learning_rate": 9.657794676806086e-06, + "loss": 0.5372, + "step": 127 + }, + { + "epoch": 0.01, + "learning_rate": 9.73384030418251e-06, + "loss": 0.5492, + "step": 128 + }, + { + "epoch": 0.01, + "learning_rate": 9.809885931558936e-06, + "loss": 0.5402, + "step": 129 + }, + { + "epoch": 0.01, + "learning_rate": 9.885931558935362e-06, + "loss": 0.5687, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 9.961977186311787e-06, + "loss": 0.5433, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 1.0038022813688212e-05, + "loss": 0.5584, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 1.011406844106464e-05, + "loss": 0.5561, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 1.0190114068441066e-05, + "loss": 0.5563, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 1.0266159695817491e-05, + "loss": 0.5626, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 1.0342205323193916e-05, + "loss": 0.5659, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 1.0418250950570343e-05, + "loss": 0.516, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 1.049429657794677e-05, + "loss": 0.5602, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 1.0570342205323195e-05, + "loss": 0.553, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 1.0646387832699621e-05, + "loss": 0.5562, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.0722433460076046e-05, + "loss": 0.5302, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 1.0798479087452472e-05, + "loss": 0.561, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 1.0874524714828898e-05, + "loss": 0.5463, + "step": 143 + }, + { + "epoch": 0.02, + 
"learning_rate": 1.0950570342205325e-05, + "loss": 0.5567, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 1.1026615969581752e-05, + "loss": 0.5301, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 1.1102661596958175e-05, + "loss": 0.5499, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 1.1178707224334602e-05, + "loss": 0.5371, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 1.1254752851711029e-05, + "loss": 0.5385, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 1.1330798479087452e-05, + "loss": 0.5438, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 1.1406844106463879e-05, + "loss": 0.5507, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.1482889733840306e-05, + "loss": 0.5496, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 1.155893536121673e-05, + "loss": 0.5411, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 1.1634980988593156e-05, + "loss": 0.5492, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 1.1711026615969582e-05, + "loss": 0.5384, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 1.1787072243346009e-05, + "loss": 0.5209, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 1.1863117870722434e-05, + "loss": 0.5404, + "step": 156 + }, + { + "epoch": 0.02, + "learning_rate": 1.1939163498098861e-05, + "loss": 0.5511, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 1.2015209125475286e-05, + "loss": 0.53, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 1.2091254752851711e-05, + "loss": 0.5369, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 1.2167300380228138e-05, + "loss": 0.5446, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.2243346007604565e-05, + "loss": 0.5408, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 1.2319391634980991e-05, + "loss": 0.5411, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 1.2395437262357415e-05, + "loss": 0.5167, + "step": 163 + }, + { + 
"epoch": 0.02, + "learning_rate": 1.2471482889733841e-05, + "loss": 0.5447, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 1.2547528517110268e-05, + "loss": 0.5221, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 1.2623574144486692e-05, + "loss": 0.536, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 1.2699619771863118e-05, + "loss": 0.5453, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 1.2775665399239545e-05, + "loss": 0.531, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 1.2851711026615972e-05, + "loss": 0.5511, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 1.2927756653992395e-05, + "loss": 0.5444, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.3003802281368822e-05, + "loss": 0.5562, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 1.3079847908745249e-05, + "loss": 0.5343, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 1.3155893536121674e-05, + "loss": 0.5298, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 1.32319391634981e-05, + "loss": 0.5133, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 1.3307984790874526e-05, + "loss": 0.5407, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 1.338403041825095e-05, + "loss": 0.5446, + "step": 176 + }, + { + "epoch": 0.02, + "learning_rate": 1.3460076045627377e-05, + "loss": 0.5178, + "step": 177 + }, + { + "epoch": 0.02, + "learning_rate": 1.3536121673003804e-05, + "loss": 0.5481, + "step": 178 + }, + { + "epoch": 0.02, + "learning_rate": 1.361216730038023e-05, + "loss": 0.5537, + "step": 179 + }, + { + "epoch": 0.02, + "learning_rate": 1.3688212927756654e-05, + "loss": 0.5121, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.3764258555133081e-05, + "loss": 0.5132, + "step": 181 + }, + { + "epoch": 0.02, + "learning_rate": 1.3840304182509508e-05, + "loss": 0.5489, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 1.3916349809885931e-05, + "loss": 0.5299, + "step": 
183 + }, + { + "epoch": 0.02, + "learning_rate": 1.3992395437262358e-05, + "loss": 0.5523, + "step": 184 + }, + { + "epoch": 0.02, + "learning_rate": 1.4068441064638785e-05, + "loss": 0.5257, + "step": 185 + }, + { + "epoch": 0.02, + "learning_rate": 1.4144486692015211e-05, + "loss": 0.5232, + "step": 186 + }, + { + "epoch": 0.02, + "learning_rate": 1.4220532319391636e-05, + "loss": 0.5423, + "step": 187 + }, + { + "epoch": 0.02, + "learning_rate": 1.4296577946768061e-05, + "loss": 0.528, + "step": 188 + }, + { + "epoch": 0.02, + "learning_rate": 1.4372623574144488e-05, + "loss": 0.527, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 1.4448669201520913e-05, + "loss": 0.5425, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 1.452471482889734e-05, + "loss": 0.5182, + "step": 191 + }, + { + "epoch": 0.02, + "learning_rate": 1.4600760456273767e-05, + "loss": 0.5552, + "step": 192 + }, + { + "epoch": 0.02, + "learning_rate": 1.467680608365019e-05, + "loss": 0.5517, + "step": 193 + }, + { + "epoch": 0.02, + "learning_rate": 1.4752851711026617e-05, + "loss": 0.5179, + "step": 194 + }, + { + "epoch": 0.02, + "learning_rate": 1.4828897338403044e-05, + "loss": 0.5453, + "step": 195 + }, + { + "epoch": 0.02, + "learning_rate": 1.490494296577947e-05, + "loss": 0.5325, + "step": 196 + }, + { + "epoch": 0.02, + "learning_rate": 1.4980988593155894e-05, + "loss": 0.5305, + "step": 197 + }, + { + "epoch": 0.02, + "learning_rate": 1.505703422053232e-05, + "loss": 0.5312, + "step": 198 + }, + { + "epoch": 0.02, + "learning_rate": 1.5133079847908747e-05, + "loss": 0.5371, + "step": 199 + }, + { + "epoch": 0.02, + "learning_rate": 1.520912547528517e-05, + "loss": 0.5374, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 1.5285171102661597e-05, + "loss": 0.537, + "step": 201 + }, + { + "epoch": 0.02, + "learning_rate": 1.5361216730038024e-05, + "loss": 0.5428, + "step": 202 + }, + { + "epoch": 0.02, + "learning_rate": 1.543726235741445e-05, + "loss": 
0.5202, + "step": 203 + }, + { + "epoch": 0.02, + "learning_rate": 1.5513307984790874e-05, + "loss": 0.5168, + "step": 204 + }, + { + "epoch": 0.02, + "learning_rate": 1.55893536121673e-05, + "loss": 0.5479, + "step": 205 + }, + { + "epoch": 0.02, + "learning_rate": 1.5665399239543728e-05, + "loss": 0.5294, + "step": 206 + }, + { + "epoch": 0.02, + "learning_rate": 1.574144486692015e-05, + "loss": 0.535, + "step": 207 + }, + { + "epoch": 0.02, + "learning_rate": 1.5817490494296578e-05, + "loss": 0.5417, + "step": 208 + }, + { + "epoch": 0.02, + "learning_rate": 1.5893536121673005e-05, + "loss": 0.5564, + "step": 209 + }, + { + "epoch": 0.02, + "learning_rate": 1.596958174904943e-05, + "loss": 0.5247, + "step": 210 + }, + { + "epoch": 0.02, + "learning_rate": 1.6045627376425855e-05, + "loss": 0.5157, + "step": 211 + }, + { + "epoch": 0.02, + "learning_rate": 1.612167300380228e-05, + "loss": 0.5206, + "step": 212 + }, + { + "epoch": 0.02, + "learning_rate": 1.6197718631178708e-05, + "loss": 0.5466, + "step": 213 + }, + { + "epoch": 0.02, + "learning_rate": 1.6273764258555135e-05, + "loss": 0.5188, + "step": 214 + }, + { + "epoch": 0.02, + "learning_rate": 1.634980988593156e-05, + "loss": 0.5301, + "step": 215 + }, + { + "epoch": 0.02, + "learning_rate": 1.6425855513307985e-05, + "loss": 0.5417, + "step": 216 + }, + { + "epoch": 0.02, + "learning_rate": 1.6501901140684412e-05, + "loss": 0.5318, + "step": 217 + }, + { + "epoch": 0.02, + "learning_rate": 1.657794676806084e-05, + "loss": 0.5161, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 1.6653992395437265e-05, + "loss": 0.5379, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 1.6730038022813692e-05, + "loss": 0.5216, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.6806083650190115e-05, + "loss": 0.5264, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 1.6882129277566542e-05, + "loss": 0.5262, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 
1.695817490494297e-05, + "loss": 0.5251, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 1.7034220532319392e-05, + "loss": 0.5259, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 1.711026615969582e-05, + "loss": 0.5346, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 1.7186311787072246e-05, + "loss": 0.5362, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 1.7262357414448672e-05, + "loss": 0.5164, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 1.7338403041825096e-05, + "loss": 0.5233, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 1.7414448669201523e-05, + "loss": 0.5171, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 1.749049429657795e-05, + "loss": 0.521, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.7566539923954373e-05, + "loss": 0.5227, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 1.76425855513308e-05, + "loss": 0.518, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 1.7718631178707226e-05, + "loss": 0.5416, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 1.779467680608365e-05, + "loss": 0.521, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 1.7870722433460076e-05, + "loss": 0.5199, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 1.7946768060836503e-05, + "loss": 0.5327, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 1.802281368821293e-05, + "loss": 0.52, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 1.8098859315589353e-05, + "loss": 0.519, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 1.817490494296578e-05, + "loss": 0.5309, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 1.8250950570342207e-05, + "loss": 0.5213, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.832699619771863e-05, + "loss": 0.5349, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 1.8403041825095057e-05, + "loss": 0.5415, + "step": 242 + }, + { + "epoch": 0.03, + 
"learning_rate": 1.8479087452471484e-05, + "loss": 0.5163, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 1.855513307984791e-05, + "loss": 0.5141, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 1.8631178707224337e-05, + "loss": 0.5208, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 1.870722433460076e-05, + "loss": 0.5091, + "step": 246 + }, + { + "epoch": 0.03, + "learning_rate": 1.8783269961977187e-05, + "loss": 0.5496, + "step": 247 + }, + { + "epoch": 0.03, + "learning_rate": 1.8859315589353614e-05, + "loss": 0.518, + "step": 248 + }, + { + "epoch": 0.03, + "learning_rate": 1.893536121673004e-05, + "loss": 0.5337, + "step": 249 + }, + { + "epoch": 0.03, + "learning_rate": 1.9011406844106467e-05, + "loss": 0.5099, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.908745247148289e-05, + "loss": 0.5258, + "step": 251 + }, + { + "epoch": 0.03, + "learning_rate": 1.9163498098859318e-05, + "loss": 0.5225, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 1.9239543726235744e-05, + "loss": 0.5185, + "step": 253 + }, + { + "epoch": 0.03, + "learning_rate": 1.931558935361217e-05, + "loss": 0.527, + "step": 254 + }, + { + "epoch": 0.03, + "learning_rate": 1.9391634980988594e-05, + "loss": 0.5155, + "step": 255 + }, + { + "epoch": 0.03, + "learning_rate": 1.946768060836502e-05, + "loss": 0.5138, + "step": 256 + }, + { + "epoch": 0.03, + "learning_rate": 1.9543726235741448e-05, + "loss": 0.5309, + "step": 257 + }, + { + "epoch": 0.03, + "learning_rate": 1.961977186311787e-05, + "loss": 0.527, + "step": 258 + }, + { + "epoch": 0.03, + "learning_rate": 1.9695817490494298e-05, + "loss": 0.5223, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 1.9771863117870725e-05, + "loss": 0.5452, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 1.984790874524715e-05, + "loss": 0.527, + "step": 261 + }, + { + "epoch": 0.03, + "learning_rate": 1.9923954372623575e-05, + "loss": 0.535, + "step": 262 + }, + { + "epoch": 
0.03, + "learning_rate": 2e-05, + "loss": 0.5283, + "step": 263 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999999314888373e-05, + "loss": 0.5108, + "step": 264 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999997259553572e-05, + "loss": 0.5212, + "step": 265 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999993833995886e-05, + "loss": 0.5039, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999989038215787e-05, + "loss": 0.5497, + "step": 267 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999982872213925e-05, + "loss": 0.5265, + "step": 268 + }, + { + "epoch": 0.03, + "learning_rate": 1.999997533599115e-05, + "loss": 0.5201, + "step": 269 + }, + { + "epoch": 0.03, + "learning_rate": 1.999996642954849e-05, + "loss": 0.5279, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 1.999995615288717e-05, + "loss": 0.5313, + "step": 271 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999944506008594e-05, + "loss": 0.5108, + "step": 272 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999931488914366e-05, + "loss": 0.5274, + "step": 273 + }, + { + "epoch": 0.03, + "learning_rate": 1.999991710160626e-05, + "loss": 0.5071, + "step": 274 + }, + { + "epoch": 0.03, + "learning_rate": 1.999990134408625e-05, + "loss": 0.5226, + "step": 275 + }, + { + "epoch": 0.03, + "learning_rate": 1.99998842163565e-05, + "loss": 0.5201, + "step": 276 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999865718419352e-05, + "loss": 0.5188, + "step": 277 + }, + { + "epoch": 0.03, + "learning_rate": 1.999984585027734e-05, + "loss": 0.5578, + "step": 278 + }, + { + "epoch": 0.03, + "learning_rate": 1.999982461193319e-05, + "loss": 0.5245, + "step": 279 + }, + { + "epoch": 0.03, + "learning_rate": 1.999980200338981e-05, + "loss": 0.5092, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999778024650296e-05, + "loss": 0.5174, + "step": 281 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999752675717938e-05, + "loss": 0.524, + "step": 282 + }, + { + "epoch": 0.03, + 
"learning_rate": 1.9999725956596204e-05, + "loss": 0.5193, + "step": 283 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999697867288764e-05, + "loss": 0.5026, + "step": 284 + }, + { + "epoch": 0.03, + "learning_rate": 1.999966840779946e-05, + "loss": 0.527, + "step": 285 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999637578132328e-05, + "loss": 0.5295, + "step": 286 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999605378291593e-05, + "loss": 0.5041, + "step": 287 + }, + { + "epoch": 0.03, + "learning_rate": 1.999957180828167e-05, + "loss": 0.5232, + "step": 288 + }, + { + "epoch": 0.03, + "learning_rate": 1.999953686810716e-05, + "loss": 0.5181, + "step": 289 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999500557772843e-05, + "loss": 0.5092, + "step": 290 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999462877283702e-05, + "loss": 0.5373, + "step": 291 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999423826644895e-05, + "loss": 0.5129, + "step": 292 + }, + { + "epoch": 0.03, + "learning_rate": 1.999938340586178e-05, + "loss": 0.5435, + "step": 293 + }, + { + "epoch": 0.03, + "learning_rate": 1.999934161493988e-05, + "loss": 0.5212, + "step": 294 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999298453884944e-05, + "loss": 0.5404, + "step": 295 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999253922702868e-05, + "loss": 0.5253, + "step": 296 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999208021399757e-05, + "loss": 0.5073, + "step": 297 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999160749981908e-05, + "loss": 0.507, + "step": 298 + }, + { + "epoch": 0.03, + "learning_rate": 1.999911210845579e-05, + "loss": 0.5219, + "step": 299 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999062096828072e-05, + "loss": 0.5168, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999010715105608e-05, + "loss": 0.5167, + "step": 301 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998957963295434e-05, + "loss": 0.5081, + "step": 302 + }, + { + 
"epoch": 0.03, + "learning_rate": 1.999890384140478e-05, + "loss": 0.5134, + "step": 303 + }, + { + "epoch": 0.03, + "learning_rate": 1.999884834944106e-05, + "loss": 0.5205, + "step": 304 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998791487411887e-05, + "loss": 0.5281, + "step": 305 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998733255325043e-05, + "loss": 0.5184, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 1.999867365318851e-05, + "loss": 0.509, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998612681010452e-05, + "loss": 0.5131, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 1.999855033879923e-05, + "loss": 0.5204, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998486626563376e-05, + "loss": 0.5085, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 1.999842154431163e-05, + "loss": 0.5362, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998355092052906e-05, + "loss": 0.5136, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998287269796313e-05, + "loss": 0.5057, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998218077551135e-05, + "loss": 0.5129, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998147515326862e-05, + "loss": 0.5301, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998075583133157e-05, + "loss": 0.4895, + "step": 316 + }, + { + "epoch": 0.04, + "learning_rate": 1.999800228097988e-05, + "loss": 0.5259, + "step": 317 + }, + { + "epoch": 0.04, + "learning_rate": 1.999792760887707e-05, + "loss": 0.5194, + "step": 318 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997851566834966e-05, + "loss": 0.5316, + "step": 319 + }, + { + "epoch": 0.04, + "learning_rate": 1.999777415486398e-05, + "loss": 0.517, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997695372974725e-05, + "loss": 0.5055, + "step": 321 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997615221177996e-05, + "loss": 0.543, + "step": 322 + 
}, + { + "epoch": 0.04, + "learning_rate": 1.999753369948477e-05, + "loss": 0.5334, + "step": 323 + }, + { + "epoch": 0.04, + "learning_rate": 1.999745080790622e-05, + "loss": 0.5019, + "step": 324 + }, + { + "epoch": 0.04, + "learning_rate": 1.999736654645371e-05, + "loss": 0.5284, + "step": 325 + }, + { + "epoch": 0.04, + "learning_rate": 1.999728091513877e-05, + "loss": 0.5158, + "step": 326 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997193913973154e-05, + "loss": 0.5483, + "step": 327 + }, + { + "epoch": 0.04, + "learning_rate": 1.999710554296877e-05, + "loss": 0.5111, + "step": 328 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997015802137727e-05, + "loss": 0.5069, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996924691492325e-05, + "loss": 0.4987, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996832211045048e-05, + "loss": 0.542, + "step": 331 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996738360808566e-05, + "loss": 0.5133, + "step": 332 + }, + { + "epoch": 0.04, + "learning_rate": 1.999664314079574e-05, + "loss": 0.5189, + "step": 333 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996546551019618e-05, + "loss": 0.4968, + "step": 334 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996448591493433e-05, + "loss": 0.5309, + "step": 335 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996349262230607e-05, + "loss": 0.5144, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996248563244755e-05, + "loss": 0.5217, + "step": 337 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996146494549672e-05, + "loss": 0.4983, + "step": 338 + }, + { + "epoch": 0.04, + "learning_rate": 1.999604305615934e-05, + "loss": 0.4968, + "step": 339 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995938248087937e-05, + "loss": 0.5185, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995832070349827e-05, + "loss": 0.4945, + "step": 341 + }, + { + "epoch": 0.04, + "learning_rate": 1.999572452295955e-05, + "loss": 0.5324, + 
"step": 342 + }, + { + "epoch": 0.04, + "learning_rate": 1.999561560593185e-05, + "loss": 0.5058, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995505319281645e-05, + "loss": 0.5145, + "step": 344 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995393663024054e-05, + "loss": 0.5248, + "step": 345 + }, + { + "epoch": 0.04, + "learning_rate": 1.999528063717437e-05, + "loss": 0.515, + "step": 346 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995166241748084e-05, + "loss": 0.5016, + "step": 347 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995050476760864e-05, + "loss": 0.5052, + "step": 348 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994933342228583e-05, + "loss": 0.5221, + "step": 349 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994814838167286e-05, + "loss": 0.5006, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994694964593206e-05, + "loss": 0.513, + "step": 351 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994573721522776e-05, + "loss": 0.5107, + "step": 352 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994451108972604e-05, + "loss": 0.5247, + "step": 353 + }, + { + "epoch": 0.04, + "learning_rate": 1.999432712695949e-05, + "loss": 0.5054, + "step": 354 + }, + { + "epoch": 0.04, + "learning_rate": 1.999420177550043e-05, + "loss": 0.5129, + "step": 355 + }, + { + "epoch": 0.04, + "learning_rate": 1.999407505461259e-05, + "loss": 0.4971, + "step": 356 + }, + { + "epoch": 0.04, + "learning_rate": 1.999394696431334e-05, + "loss": 0.4876, + "step": 357 + }, + { + "epoch": 0.04, + "learning_rate": 1.999381750462023e-05, + "loss": 0.528, + "step": 358 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993686675550998e-05, + "loss": 0.5143, + "step": 359 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993554477123568e-05, + "loss": 0.5101, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993420909356058e-05, + "loss": 0.5107, + "step": 361 + }, + { + "epoch": 0.04, + "learning_rate": 1.999328597226677e-05, + "loss": 
0.5321, + "step": 362 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993149665874193e-05, + "loss": 0.5402, + "step": 363 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993011990197e-05, + "loss": 0.5014, + "step": 364 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992872945254064e-05, + "loss": 0.4866, + "step": 365 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992732531064427e-05, + "loss": 0.5061, + "step": 366 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992590747647334e-05, + "loss": 0.5053, + "step": 367 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992447595022214e-05, + "loss": 0.5216, + "step": 368 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992303073208678e-05, + "loss": 0.5027, + "step": 369 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992157182226535e-05, + "loss": 0.4829, + "step": 370 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992009922095766e-05, + "loss": 0.5256, + "step": 371 + }, + { + "epoch": 0.04, + "learning_rate": 1.999186129283656e-05, + "loss": 0.5018, + "step": 372 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991711294469273e-05, + "loss": 0.509, + "step": 373 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991559927014465e-05, + "loss": 0.5046, + "step": 374 + }, + { + "epoch": 0.04, + "learning_rate": 1.999140719049287e-05, + "loss": 0.5319, + "step": 375 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991253084925425e-05, + "loss": 0.52, + "step": 376 + }, + { + "epoch": 0.04, + "learning_rate": 1.999109761033324e-05, + "loss": 0.5033, + "step": 377 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990940766737617e-05, + "loss": 0.4969, + "step": 378 + }, + { + "epoch": 0.04, + "learning_rate": 1.999078255416005e-05, + "loss": 0.5246, + "step": 379 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990622972622216e-05, + "loss": 0.4919, + "step": 380 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990462022145985e-05, + "loss": 0.5271, + "step": 381 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990299702753405e-05, 
+ "loss": 0.5046, + "step": 382 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990136014466722e-05, + "loss": 0.5027, + "step": 383 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989970957308364e-05, + "loss": 0.5148, + "step": 384 + }, + { + "epoch": 0.04, + "learning_rate": 1.998980453130095e-05, + "loss": 0.529, + "step": 385 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989636736467278e-05, + "loss": 0.5077, + "step": 386 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989467572830342e-05, + "loss": 0.5123, + "step": 387 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989297040413325e-05, + "loss": 0.5002, + "step": 388 + }, + { + "epoch": 0.04, + "learning_rate": 1.998912513923959e-05, + "loss": 0.5295, + "step": 389 + }, + { + "epoch": 0.04, + "learning_rate": 1.998895186933269e-05, + "loss": 0.4975, + "step": 390 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988777230716367e-05, + "loss": 0.5053, + "step": 391 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988601223414555e-05, + "loss": 0.5067, + "step": 392 + }, + { + "epoch": 0.04, + "learning_rate": 1.998842384745137e-05, + "loss": 0.5156, + "step": 393 + }, + { + "epoch": 0.05, + "learning_rate": 1.998824510285111e-05, + "loss": 0.5218, + "step": 394 + }, + { + "epoch": 0.05, + "learning_rate": 1.998806498963828e-05, + "loss": 0.5057, + "step": 395 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987883507837545e-05, + "loss": 0.5167, + "step": 396 + }, + { + "epoch": 0.05, + "learning_rate": 1.998770065747378e-05, + "loss": 0.5119, + "step": 397 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987516438572035e-05, + "loss": 0.5095, + "step": 398 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987330851157557e-05, + "loss": 0.526, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987143895255774e-05, + "loss": 0.5022, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986955570892302e-05, + "loss": 0.5204, + "step": 401 + }, + { + "epoch": 0.05, + "learning_rate": 
1.9986765878092945e-05, + "loss": 0.5211, + "step": 402 + }, + { + "epoch": 0.05, + "learning_rate": 1.99865748168837e-05, + "loss": 0.5107, + "step": 403 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986382387290738e-05, + "loss": 0.5242, + "step": 404 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986188589340435e-05, + "loss": 0.5001, + "step": 405 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985993423059342e-05, + "loss": 0.4907, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 1.99857968884742e-05, + "loss": 0.5123, + "step": 407 + }, + { + "epoch": 0.05, + "learning_rate": 1.998559898561194e-05, + "loss": 0.508, + "step": 408 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985399714499678e-05, + "loss": 0.4923, + "step": 409 + }, + { + "epoch": 0.05, + "learning_rate": 1.998519907516472e-05, + "loss": 0.5473, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 1.998499706763456e-05, + "loss": 0.5052, + "step": 411 + }, + { + "epoch": 0.05, + "learning_rate": 1.998479369193687e-05, + "loss": 0.5304, + "step": 412 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984588948099528e-05, + "loss": 0.4969, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 1.998438283615058e-05, + "loss": 0.5129, + "step": 414 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984175356118268e-05, + "loss": 0.4953, + "step": 415 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983966508031026e-05, + "loss": 0.5145, + "step": 416 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983756291917467e-05, + "loss": 0.5019, + "step": 417 + }, + { + "epoch": 0.05, + "learning_rate": 1.99835447078064e-05, + "loss": 0.5282, + "step": 418 + }, + { + "epoch": 0.05, + "learning_rate": 1.998333175572681e-05, + "loss": 0.4842, + "step": 419 + }, + { + "epoch": 0.05, + "learning_rate": 1.998311743570788e-05, + "loss": 0.5177, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 1.998290174777898e-05, + "loss": 0.5039, + "step": 421 + }, + { + "epoch": 0.05, + 
"learning_rate": 1.998268469196966e-05, + "loss": 0.5227, + "step": 422 + }, + { + "epoch": 0.05, + "learning_rate": 1.998246626830966e-05, + "loss": 0.4926, + "step": 423 + }, + { + "epoch": 0.05, + "learning_rate": 1.998224647682891e-05, + "loss": 0.5044, + "step": 424 + }, + { + "epoch": 0.05, + "learning_rate": 1.998202531755753e-05, + "loss": 0.5244, + "step": 425 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981802790525822e-05, + "loss": 0.4932, + "step": 426 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981578895764272e-05, + "loss": 0.5055, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 1.998135363330357e-05, + "loss": 0.537, + "step": 428 + }, + { + "epoch": 0.05, + "learning_rate": 1.998112700317457e-05, + "loss": 0.4919, + "step": 429 + }, + { + "epoch": 0.05, + "learning_rate": 1.998089900540833e-05, + "loss": 0.5127, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 1.998066964003609e-05, + "loss": 0.5092, + "step": 431 + }, + { + "epoch": 0.05, + "learning_rate": 1.998043890708928e-05, + "loss": 0.5153, + "step": 432 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980206806599516e-05, + "loss": 0.508, + "step": 433 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979973338598603e-05, + "loss": 0.5059, + "step": 434 + }, + { + "epoch": 0.05, + "learning_rate": 1.997973850311852e-05, + "loss": 0.4842, + "step": 435 + }, + { + "epoch": 0.05, + "learning_rate": 1.997950230019146e-05, + "loss": 0.5241, + "step": 436 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979264729849776e-05, + "loss": 0.5035, + "step": 437 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979025792126027e-05, + "loss": 0.4893, + "step": 438 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978785487052952e-05, + "loss": 0.5123, + "step": 439 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978543814663478e-05, + "loss": 0.5177, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978300774990716e-05, + "loss": 0.4992, + "step": 441 + }, + { + "epoch": 
0.05, + "learning_rate": 1.9978056368067973e-05, + "loss": 0.5116, + "step": 442 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977810593928736e-05, + "loss": 0.5017, + "step": 443 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977563452606677e-05, + "loss": 0.5114, + "step": 444 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977314944135667e-05, + "loss": 0.4902, + "step": 445 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977065068549756e-05, + "loss": 0.5134, + "step": 446 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976813825883182e-05, + "loss": 0.4954, + "step": 447 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976561216170368e-05, + "loss": 0.5045, + "step": 448 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976307239445924e-05, + "loss": 0.4949, + "step": 449 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976051895744663e-05, + "loss": 0.5228, + "step": 450 + }, + { + "epoch": 0.05, + "learning_rate": 1.997579518510156e-05, + "loss": 0.5035, + "step": 451 + }, + { + "epoch": 0.05, + "learning_rate": 1.99755371075518e-05, + "loss": 0.5046, + "step": 452 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975277663130736e-05, + "loss": 0.5041, + "step": 453 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975016851873925e-05, + "loss": 0.5142, + "step": 454 + }, + { + "epoch": 0.05, + "learning_rate": 1.99747546738171e-05, + "loss": 0.4906, + "step": 455 + }, + { + "epoch": 0.05, + "learning_rate": 1.997449112899619e-05, + "loss": 0.5018, + "step": 456 + }, + { + "epoch": 0.05, + "learning_rate": 1.99742262174473e-05, + "loss": 0.5064, + "step": 457 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973959939206734e-05, + "loss": 0.4996, + "step": 458 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973692294310972e-05, + "loss": 0.5149, + "step": 459 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973423282796695e-05, + "loss": 0.4956, + "step": 460 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973152904700762e-05, + "loss": 0.5125, + "step": 461 + }, + 
{ + "epoch": 0.05, + "learning_rate": 1.9972881160060216e-05, + "loss": 0.5385, + "step": 462 + }, + { + "epoch": 0.05, + "learning_rate": 1.997260804891229e-05, + "loss": 0.485, + "step": 463 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972333571294418e-05, + "loss": 0.5028, + "step": 464 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972057727244203e-05, + "loss": 0.4887, + "step": 465 + }, + { + "epoch": 0.05, + "learning_rate": 1.997178051679944e-05, + "loss": 0.5229, + "step": 466 + }, + { + "epoch": 0.05, + "learning_rate": 1.997150193999811e-05, + "loss": 0.4953, + "step": 467 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971221996878395e-05, + "loss": 0.5123, + "step": 468 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970940687478643e-05, + "loss": 0.5256, + "step": 469 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970658011837404e-05, + "loss": 0.501, + "step": 470 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970373969993414e-05, + "loss": 0.5173, + "step": 471 + }, + { + "epoch": 0.05, + "learning_rate": 1.997008856198559e-05, + "loss": 0.5047, + "step": 472 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969801787853035e-05, + "loss": 0.5107, + "step": 473 + }, + { + "epoch": 0.05, + "learning_rate": 1.996951364763505e-05, + "loss": 0.5104, + "step": 474 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969224141371114e-05, + "loss": 0.5097, + "step": 475 + }, + { + "epoch": 0.05, + "learning_rate": 1.99689332691009e-05, + "loss": 0.5093, + "step": 476 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968641030864254e-05, + "loss": 0.4957, + "step": 477 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968347426701228e-05, + "loss": 0.5105, + "step": 478 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968052456652048e-05, + "loss": 0.504, + "step": 479 + }, + { + "epoch": 0.05, + "learning_rate": 1.9967756120757132e-05, + "loss": 0.5008, + "step": 480 + }, + { + "epoch": 0.05, + "learning_rate": 1.9967458419057092e-05, + "loss": 0.5101, + "step": 
481 + }, + { + "epoch": 0.06, + "learning_rate": 1.9967159351592706e-05, + "loss": 0.5092, + "step": 482 + }, + { + "epoch": 0.06, + "learning_rate": 1.9966858918404965e-05, + "loss": 0.5023, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 1.996655711953503e-05, + "loss": 0.5103, + "step": 484 + }, + { + "epoch": 0.06, + "learning_rate": 1.996625395502425e-05, + "loss": 0.5325, + "step": 485 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965949424914175e-05, + "loss": 0.5126, + "step": 486 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965643529246526e-05, + "loss": 0.5017, + "step": 487 + }, + { + "epoch": 0.06, + "learning_rate": 1.996533626806322e-05, + "loss": 0.5367, + "step": 488 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965027641406355e-05, + "loss": 0.4925, + "step": 489 + }, + { + "epoch": 0.06, + "learning_rate": 1.996471764931822e-05, + "loss": 0.5153, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 1.99644062918413e-05, + "loss": 0.5196, + "step": 491 + }, + { + "epoch": 0.06, + "learning_rate": 1.9964093569018247e-05, + "loss": 0.4936, + "step": 492 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963779480891917e-05, + "loss": 0.5085, + "step": 493 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963464027505343e-05, + "loss": 0.5153, + "step": 494 + }, + { + "epoch": 0.06, + "learning_rate": 1.996314720890175e-05, + "loss": 0.5228, + "step": 495 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962829025124553e-05, + "loss": 0.499, + "step": 496 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962509476217348e-05, + "loss": 0.512, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962188562223916e-05, + "loss": 0.4839, + "step": 498 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961866283188237e-05, + "loss": 0.4876, + "step": 499 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961542639154467e-05, + "loss": 0.5057, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961217630166954e-05, + "loss": 
0.5144, + "step": 501 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960891256270224e-05, + "loss": 0.5042, + "step": 502 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960563517509008e-05, + "loss": 0.5188, + "step": 503 + }, + { + "epoch": 0.06, + "learning_rate": 1.996023441392821e-05, + "loss": 0.4886, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959903945572918e-05, + "loss": 0.4883, + "step": 505 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959572112488423e-05, + "loss": 0.5606, + "step": 506 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959238914720188e-05, + "loss": 0.492, + "step": 507 + }, + { + "epoch": 0.06, + "learning_rate": 1.995890435231387e-05, + "loss": 0.4934, + "step": 508 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958568425315316e-05, + "loss": 0.5088, + "step": 509 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958231133770548e-05, + "loss": 0.5085, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 1.995789247772578e-05, + "loss": 0.5067, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957552457227428e-05, + "loss": 0.5029, + "step": 512 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957211072322075e-05, + "loss": 0.4727, + "step": 513 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956868323056494e-05, + "loss": 0.5176, + "step": 514 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956524209477658e-05, + "loss": 0.5033, + "step": 515 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956178731632715e-05, + "loss": 0.513, + "step": 516 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955831889568998e-05, + "loss": 0.5041, + "step": 517 + }, + { + "epoch": 0.06, + "learning_rate": 1.995548368333404e-05, + "loss": 0.4975, + "step": 518 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955134112975548e-05, + "loss": 0.5129, + "step": 519 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954783178541424e-05, + "loss": 0.505, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 
1.995443088007975e-05, + "loss": 0.5008, + "step": 521 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954077217638807e-05, + "loss": 0.5171, + "step": 522 + }, + { + "epoch": 0.06, + "learning_rate": 1.995372219126704e-05, + "loss": 0.4896, + "step": 523 + }, + { + "epoch": 0.06, + "learning_rate": 1.995336580101311e-05, + "loss": 0.521, + "step": 524 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953008046925844e-05, + "loss": 0.5038, + "step": 525 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952648929054262e-05, + "loss": 0.5123, + "step": 526 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952288447447573e-05, + "loss": 0.5118, + "step": 527 + }, + { + "epoch": 0.06, + "learning_rate": 1.995192660215517e-05, + "loss": 0.5144, + "step": 528 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951563393226632e-05, + "loss": 0.5075, + "step": 529 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951198820711735e-05, + "loss": 0.497, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 1.995083288466042e-05, + "loss": 0.4836, + "step": 531 + }, + { + "epoch": 0.06, + "learning_rate": 1.995046558512284e-05, + "loss": 0.5177, + "step": 532 + }, + { + "epoch": 0.06, + "learning_rate": 1.995009692214932e-05, + "loss": 0.4862, + "step": 533 + }, + { + "epoch": 0.06, + "learning_rate": 1.994972689579037e-05, + "loss": 0.5107, + "step": 534 + }, + { + "epoch": 0.06, + "learning_rate": 1.99493555060967e-05, + "loss": 0.4924, + "step": 535 + }, + { + "epoch": 0.06, + "learning_rate": 1.994898275311919e-05, + "loss": 0.4966, + "step": 536 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948608636908928e-05, + "loss": 0.5062, + "step": 537 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948233157517164e-05, + "loss": 0.5026, + "step": 538 + }, + { + "epoch": 0.06, + "learning_rate": 1.994785631499535e-05, + "loss": 0.5129, + "step": 539 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947478109395123e-05, + "loss": 0.5277, + "step": 540 + }, + { + "epoch": 0.06, + 
"learning_rate": 1.9947098540768306e-05, + "loss": 0.5, + "step": 541 + }, + { + "epoch": 0.06, + "learning_rate": 1.994671760916691e-05, + "loss": 0.5134, + "step": 542 + }, + { + "epoch": 0.06, + "learning_rate": 1.994633531464313e-05, + "loss": 0.5056, + "step": 543 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945951657249348e-05, + "loss": 0.5002, + "step": 544 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945566637038133e-05, + "loss": 0.509, + "step": 545 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945180254062242e-05, + "loss": 0.5243, + "step": 546 + }, + { + "epoch": 0.06, + "learning_rate": 1.994479250837462e-05, + "loss": 0.5495, + "step": 547 + }, + { + "epoch": 0.06, + "learning_rate": 1.9944403400028392e-05, + "loss": 0.4995, + "step": 548 + }, + { + "epoch": 0.06, + "learning_rate": 1.9944012929076884e-05, + "loss": 0.4847, + "step": 549 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943621095573588e-05, + "loss": 0.5106, + "step": 550 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943227899572198e-05, + "loss": 0.5006, + "step": 551 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942833341126597e-05, + "loss": 0.5108, + "step": 552 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942437420290835e-05, + "loss": 0.4873, + "step": 553 + }, + { + "epoch": 0.06, + "learning_rate": 1.994204013711918e-05, + "loss": 0.5005, + "step": 554 + }, + { + "epoch": 0.06, + "learning_rate": 1.9941641491666052e-05, + "loss": 0.5119, + "step": 555 + }, + { + "epoch": 0.06, + "learning_rate": 1.994124148398608e-05, + "loss": 0.5057, + "step": 556 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940840114134078e-05, + "loss": 0.4932, + "step": 557 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940437382165038e-05, + "loss": 0.505, + "step": 558 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940033288134143e-05, + "loss": 0.487, + "step": 559 + }, + { + "epoch": 0.06, + "learning_rate": 1.993962783209677e-05, + "loss": 0.5201, + "step": 560 + }, + { + "epoch": 
0.06, + "learning_rate": 1.9939221014108467e-05, + "loss": 0.5063, + "step": 561 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938812834224978e-05, + "loss": 0.5005, + "step": 562 + }, + { + "epoch": 0.06, + "learning_rate": 1.993840329250224e-05, + "loss": 0.5017, + "step": 563 + }, + { + "epoch": 0.06, + "learning_rate": 1.993799238899636e-05, + "loss": 0.519, + "step": 564 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937580123763645e-05, + "loss": 0.4934, + "step": 565 + }, + { + "epoch": 0.06, + "learning_rate": 1.993716649686059e-05, + "loss": 0.5086, + "step": 566 + }, + { + "epoch": 0.06, + "learning_rate": 1.993675150834386e-05, + "loss": 0.4863, + "step": 567 + }, + { + "epoch": 0.06, + "learning_rate": 1.993633515827033e-05, + "loss": 0.5089, + "step": 568 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935917446697038e-05, + "loss": 0.5077, + "step": 569 + }, + { + "epoch": 0.07, + "learning_rate": 1.993549837368123e-05, + "loss": 0.4964, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935077939280316e-05, + "loss": 0.5055, + "step": 571 + }, + { + "epoch": 0.07, + "learning_rate": 1.993465614355192e-05, + "loss": 0.503, + "step": 572 + }, + { + "epoch": 0.07, + "learning_rate": 1.9934232986553823e-05, + "loss": 0.5179, + "step": 573 + }, + { + "epoch": 0.07, + "learning_rate": 1.9933808468344016e-05, + "loss": 0.4953, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 1.9933382588980665e-05, + "loss": 0.4912, + "step": 575 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932955348522125e-05, + "loss": 0.4973, + "step": 576 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932526747026936e-05, + "loss": 0.5004, + "step": 577 + }, + { + "epoch": 0.07, + "learning_rate": 1.993209678455383e-05, + "loss": 0.5175, + "step": 578 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931665461161716e-05, + "loss": 0.5021, + "step": 579 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931232776909703e-05, + "loss": 0.5096, + "step": 580 + }, + { 
+ "epoch": 0.07, + "learning_rate": 1.993079873185707e-05, + "loss": 0.4768, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 1.993036332606329e-05, + "loss": 0.5109, + "step": 582 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929926559588032e-05, + "loss": 0.4972, + "step": 583 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929488432491137e-05, + "loss": 0.4919, + "step": 584 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929048944832638e-05, + "loss": 0.4959, + "step": 585 + }, + { + "epoch": 0.07, + "learning_rate": 1.9928608096672757e-05, + "loss": 0.5132, + "step": 586 + }, + { + "epoch": 0.07, + "learning_rate": 1.99281658880719e-05, + "loss": 0.512, + "step": 587 + }, + { + "epoch": 0.07, + "learning_rate": 1.992772231909066e-05, + "loss": 0.4972, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927277389789812e-05, + "loss": 0.5027, + "step": 589 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926831100230322e-05, + "loss": 0.4921, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926383450473344e-05, + "loss": 0.5223, + "step": 591 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925934440580218e-05, + "loss": 0.496, + "step": 592 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925484070612465e-05, + "loss": 0.5087, + "step": 593 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925032340631793e-05, + "loss": 0.5022, + "step": 594 + }, + { + "epoch": 0.07, + "learning_rate": 1.9924579250700104e-05, + "loss": 0.511, + "step": 595 + }, + { + "epoch": 0.07, + "learning_rate": 1.992412480087948e-05, + "loss": 0.509, + "step": 596 + }, + { + "epoch": 0.07, + "learning_rate": 1.992366899123219e-05, + "loss": 0.513, + "step": 597 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923211821820692e-05, + "loss": 0.5128, + "step": 598 + }, + { + "epoch": 0.07, + "learning_rate": 1.9922753292707627e-05, + "loss": 0.5197, + "step": 599 + }, + { + "epoch": 0.07, + "learning_rate": 1.992229340395582e-05, + "loss": 0.4962, + "step": 600 + 
}, + { + "epoch": 0.07, + "learning_rate": 1.9921832155628295e-05, + "loss": 0.4956, + "step": 601 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921369547788246e-05, + "loss": 0.4905, + "step": 602 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920905580499063e-05, + "loss": 0.5012, + "step": 603 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920440253824318e-05, + "loss": 0.4991, + "step": 604 + }, + { + "epoch": 0.07, + "learning_rate": 1.9919973567827776e-05, + "loss": 0.5092, + "step": 605 + }, + { + "epoch": 0.07, + "learning_rate": 1.991950552257338e-05, + "loss": 0.4813, + "step": 606 + }, + { + "epoch": 0.07, + "learning_rate": 1.991903611812526e-05, + "loss": 0.4992, + "step": 607 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918565354547738e-05, + "loss": 0.4908, + "step": 608 + }, + { + "epoch": 0.07, + "learning_rate": 1.991809323190532e-05, + "loss": 0.505, + "step": 609 + }, + { + "epoch": 0.07, + "learning_rate": 1.99176197502627e-05, + "loss": 0.4865, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 1.9917144909684745e-05, + "loss": 0.4982, + "step": 611 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916668710236528e-05, + "loss": 0.5175, + "step": 612 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916191151983297e-05, + "loss": 0.529, + "step": 613 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915712234990486e-05, + "loss": 0.4916, + "step": 614 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915231959323722e-05, + "loss": 0.5154, + "step": 615 + }, + { + "epoch": 0.07, + "learning_rate": 1.991475032504881e-05, + "loss": 0.4903, + "step": 616 + }, + { + "epoch": 0.07, + "learning_rate": 1.9914267332231746e-05, + "loss": 0.4984, + "step": 617 + }, + { + "epoch": 0.07, + "learning_rate": 1.991378298093871e-05, + "loss": 0.5179, + "step": 618 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913297271236063e-05, + "loss": 0.4865, + "step": 619 + }, + { + "epoch": 0.07, + "learning_rate": 1.9912810203190367e-05, + "loss": 0.5074, + 
"step": 620 + }, + { + "epoch": 0.07, + "learning_rate": 1.991232177686836e-05, + "loss": 0.5076, + "step": 621 + }, + { + "epoch": 0.07, + "learning_rate": 1.9911831992336963e-05, + "loss": 0.5042, + "step": 622 + }, + { + "epoch": 0.07, + "learning_rate": 1.9911340849663293e-05, + "loss": 0.5021, + "step": 623 + }, + { + "epoch": 0.07, + "learning_rate": 1.991084834891464e-05, + "loss": 0.5062, + "step": 624 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910354490158498e-05, + "loss": 0.4975, + "step": 625 + }, + { + "epoch": 0.07, + "learning_rate": 1.9909859273462525e-05, + "loss": 0.4892, + "step": 626 + }, + { + "epoch": 0.07, + "learning_rate": 1.9909362698894585e-05, + "loss": 0.4962, + "step": 627 + }, + { + "epoch": 0.07, + "learning_rate": 1.9908864766522716e-05, + "loss": 0.5167, + "step": 628 + }, + { + "epoch": 0.07, + "learning_rate": 1.9908365476415146e-05, + "loss": 0.5168, + "step": 629 + }, + { + "epoch": 0.07, + "learning_rate": 1.9907864828640292e-05, + "loss": 0.502, + "step": 630 + }, + { + "epoch": 0.07, + "learning_rate": 1.9907362823266752e-05, + "loss": 0.5143, + "step": 631 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906859460363307e-05, + "loss": 0.5045, + "step": 632 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906354739998937e-05, + "loss": 0.5051, + "step": 633 + }, + { + "epoch": 0.07, + "learning_rate": 1.99058486622428e-05, + "loss": 0.4971, + "step": 634 + }, + { + "epoch": 0.07, + "learning_rate": 1.990534122716423e-05, + "loss": 0.4839, + "step": 635 + }, + { + "epoch": 0.07, + "learning_rate": 1.990483243483277e-05, + "loss": 0.5059, + "step": 636 + }, + { + "epoch": 0.07, + "learning_rate": 1.990432228531813e-05, + "loss": 0.504, + "step": 637 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903810778690204e-05, + "loss": 0.5081, + "step": 638 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903297915019093e-05, + "loss": 0.5, + "step": 639 + }, + { + "epoch": 0.07, + "learning_rate": 1.9902783694375064e-05, + "loss": 
0.5103, + "step": 640 + }, + { + "epoch": 0.07, + "learning_rate": 1.9902268116828578e-05, + "loss": 0.5111, + "step": 641 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901751182450276e-05, + "loss": 0.4893, + "step": 642 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901232891310998e-05, + "loss": 0.4861, + "step": 643 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900713243481758e-05, + "loss": 0.5005, + "step": 644 + }, + { + "epoch": 0.07, + "learning_rate": 1.990019223903376e-05, + "loss": 0.4887, + "step": 645 + }, + { + "epoch": 0.07, + "learning_rate": 1.9899669878038382e-05, + "loss": 0.5158, + "step": 646 + }, + { + "epoch": 0.07, + "learning_rate": 1.989914616056722e-05, + "loss": 0.4871, + "step": 647 + }, + { + "epoch": 0.07, + "learning_rate": 1.9898621086692017e-05, + "loss": 0.5133, + "step": 648 + }, + { + "epoch": 0.07, + "learning_rate": 1.989809465648473e-05, + "loss": 0.4984, + "step": 649 + }, + { + "epoch": 0.07, + "learning_rate": 1.989756687001749e-05, + "loss": 0.4881, + "step": 650 + }, + { + "epoch": 0.07, + "learning_rate": 1.9897037727362612e-05, + "loss": 0.4802, + "step": 651 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896507228592604e-05, + "loss": 0.5036, + "step": 652 + }, + { + "epoch": 0.07, + "learning_rate": 1.989597537378015e-05, + "loss": 0.4942, + "step": 653 + }, + { + "epoch": 0.07, + "learning_rate": 1.9895442162998136e-05, + "loss": 0.5228, + "step": 654 + }, + { + "epoch": 0.07, + "learning_rate": 1.9894907596319615e-05, + "loss": 0.4976, + "step": 655 + }, + { + "epoch": 0.07, + "learning_rate": 1.989437167381784e-05, + "loss": 0.5151, + "step": 656 + }, + { + "epoch": 0.08, + "learning_rate": 1.9893834395566242e-05, + "loss": 0.5039, + "step": 657 + }, + { + "epoch": 0.08, + "learning_rate": 1.989329576163844e-05, + "loss": 0.4843, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 1.989275577210824e-05, + "loss": 0.5066, + "step": 659 + }, + { + "epoch": 0.08, + "learning_rate": 
1.989221442704963e-05, + "loss": 0.497, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891671726536787e-05, + "loss": 0.4965, + "step": 661 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891127670644076e-05, + "loss": 0.4853, + "step": 662 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890582259446046e-05, + "loss": 0.5132, + "step": 663 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890035493017424e-05, + "loss": 0.4881, + "step": 664 + }, + { + "epoch": 0.08, + "learning_rate": 1.9889487371433134e-05, + "loss": 0.5049, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 1.988893789476828e-05, + "loss": 0.4886, + "step": 666 + }, + { + "epoch": 0.08, + "learning_rate": 1.9888387063098153e-05, + "loss": 0.5109, + "step": 667 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887834876498228e-05, + "loss": 0.4744, + "step": 668 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887281335044167e-05, + "loss": 0.4952, + "step": 669 + }, + { + "epoch": 0.08, + "learning_rate": 1.988672643881182e-05, + "loss": 0.4774, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 1.9886170187877214e-05, + "loss": 0.509, + "step": 671 + }, + { + "epoch": 0.08, + "learning_rate": 1.9885612582316575e-05, + "loss": 0.4924, + "step": 672 + }, + { + "epoch": 0.08, + "learning_rate": 1.9885053622206305e-05, + "loss": 0.4924, + "step": 673 + }, + { + "epoch": 0.08, + "learning_rate": 1.9884493307622993e-05, + "loss": 0.5126, + "step": 674 + }, + { + "epoch": 0.08, + "learning_rate": 1.988393163864341e-05, + "loss": 0.4876, + "step": 675 + }, + { + "epoch": 0.08, + "learning_rate": 1.9883368615344526e-05, + "loss": 0.4895, + "step": 676 + }, + { + "epoch": 0.08, + "learning_rate": 1.9882804237803487e-05, + "loss": 0.5074, + "step": 677 + }, + { + "epoch": 0.08, + "learning_rate": 1.988223850609762e-05, + "loss": 0.4826, + "step": 678 + }, + { + "epoch": 0.08, + "learning_rate": 1.9881671420304444e-05, + "loss": 0.5268, + "step": 679 + }, + { + "epoch": 0.08, + 
"learning_rate": 1.9881102980501664e-05, + "loss": 0.4995, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 1.988053318676717e-05, + "loss": 0.5015, + "step": 681 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879962039179033e-05, + "loss": 0.5134, + "step": 682 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879389537815514e-05, + "loss": 0.4966, + "step": 683 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878815682755062e-05, + "loss": 0.4792, + "step": 684 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878240474076306e-05, + "loss": 0.523, + "step": 685 + }, + { + "epoch": 0.08, + "learning_rate": 1.987766391185806e-05, + "loss": 0.492, + "step": 686 + }, + { + "epoch": 0.08, + "learning_rate": 1.9877085996179327e-05, + "loss": 0.5097, + "step": 687 + }, + { + "epoch": 0.08, + "learning_rate": 1.9876506727119294e-05, + "loss": 0.4948, + "step": 688 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875926104757337e-05, + "loss": 0.5193, + "step": 689 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875344129173012e-05, + "loss": 0.5, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874760800446063e-05, + "loss": 0.4983, + "step": 691 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874176118656415e-05, + "loss": 0.4759, + "step": 692 + }, + { + "epoch": 0.08, + "learning_rate": 1.9873590083884192e-05, + "loss": 0.5069, + "step": 693 + }, + { + "epoch": 0.08, + "learning_rate": 1.9873002696209688e-05, + "loss": 0.5129, + "step": 694 + }, + { + "epoch": 0.08, + "learning_rate": 1.9872413955713382e-05, + "loss": 0.508, + "step": 695 + }, + { + "epoch": 0.08, + "learning_rate": 1.9871823862475955e-05, + "loss": 0.4963, + "step": 696 + }, + { + "epoch": 0.08, + "learning_rate": 1.987123241657826e-05, + "loss": 0.5074, + "step": 697 + }, + { + "epoch": 0.08, + "learning_rate": 1.9870639618101333e-05, + "loss": 0.5238, + "step": 698 + }, + { + "epoch": 0.08, + "learning_rate": 1.987004546712641e-05, + "loss": 0.4916, + "step": 699 + }, + { + 
"epoch": 0.08, + "learning_rate": 1.9869449963734894e-05, + "loss": 0.4913, + "step": 700 + }, + { + "epoch": 0.08, + "learning_rate": 1.9868853108008387e-05, + "loss": 0.4755, + "step": 701 + }, + { + "epoch": 0.08, + "learning_rate": 1.986825490002867e-05, + "loss": 0.4836, + "step": 702 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867655339877713e-05, + "loss": 0.4985, + "step": 703 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867054427637667e-05, + "loss": 0.4972, + "step": 704 + }, + { + "epoch": 0.08, + "learning_rate": 1.986645216339087e-05, + "loss": 0.5101, + "step": 705 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865848547219845e-05, + "loss": 0.4929, + "step": 706 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865243579207304e-05, + "loss": 0.4964, + "step": 707 + }, + { + "epoch": 0.08, + "learning_rate": 1.986463725943614e-05, + "loss": 0.5103, + "step": 708 + }, + { + "epoch": 0.08, + "learning_rate": 1.9864029587989432e-05, + "loss": 0.481, + "step": 709 + }, + { + "epoch": 0.08, + "learning_rate": 1.9863420564950445e-05, + "loss": 0.4843, + "step": 710 + }, + { + "epoch": 0.08, + "learning_rate": 1.986281019040263e-05, + "loss": 0.5253, + "step": 711 + }, + { + "epoch": 0.08, + "learning_rate": 1.9862198464429614e-05, + "loss": 0.4945, + "step": 712 + }, + { + "epoch": 0.08, + "learning_rate": 1.9861585387115228e-05, + "loss": 0.4945, + "step": 713 + }, + { + "epoch": 0.08, + "learning_rate": 1.986097095854347e-05, + "loss": 0.4998, + "step": 714 + }, + { + "epoch": 0.08, + "learning_rate": 1.9860355178798536e-05, + "loss": 0.4981, + "step": 715 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859738047964795e-05, + "loss": 0.5039, + "step": 716 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859119566126813e-05, + "loss": 0.4968, + "step": 717 + }, + { + "epoch": 0.08, + "learning_rate": 1.9858499733369335e-05, + "loss": 0.4974, + "step": 718 + }, + { + "epoch": 0.08, + "learning_rate": 1.985787854977729e-05, + "loss": 0.4996, + "step": 
719 + }, + { + "epoch": 0.08, + "learning_rate": 1.9857256015435797e-05, + "loss": 0.4793, + "step": 720 + }, + { + "epoch": 0.08, + "learning_rate": 1.985663213043015e-05, + "loss": 0.4923, + "step": 721 + }, + { + "epoch": 0.08, + "learning_rate": 1.9856006894845844e-05, + "loss": 0.4878, + "step": 722 + }, + { + "epoch": 0.08, + "learning_rate": 1.9855380308768546e-05, + "loss": 0.5218, + "step": 723 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854752372284113e-05, + "loss": 0.4872, + "step": 724 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854123085478587e-05, + "loss": 0.4902, + "step": 725 + }, + { + "epoch": 0.08, + "learning_rate": 1.9853492448438192e-05, + "loss": 0.5086, + "step": 726 + }, + { + "epoch": 0.08, + "learning_rate": 1.985286046124934e-05, + "loss": 0.4679, + "step": 727 + }, + { + "epoch": 0.08, + "learning_rate": 1.985222712399863e-05, + "loss": 0.4948, + "step": 728 + }, + { + "epoch": 0.08, + "learning_rate": 1.985159243677284e-05, + "loss": 0.4955, + "step": 729 + }, + { + "epoch": 0.08, + "learning_rate": 1.985095639965894e-05, + "loss": 0.4996, + "step": 730 + }, + { + "epoch": 0.08, + "learning_rate": 1.985031901274408e-05, + "loss": 0.512, + "step": 731 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849680276115593e-05, + "loss": 0.492, + "step": 732 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849040189861004e-05, + "loss": 0.4928, + "step": 733 + }, + { + "epoch": 0.08, + "learning_rate": 1.9848398754068018e-05, + "loss": 0.5268, + "step": 734 + }, + { + "epoch": 0.08, + "learning_rate": 1.984775596882452e-05, + "loss": 0.4822, + "step": 735 + }, + { + "epoch": 0.08, + "learning_rate": 1.98471118342186e-05, + "loss": 0.487, + "step": 736 + }, + { + "epoch": 0.08, + "learning_rate": 1.9846466350338506e-05, + "loss": 0.5087, + "step": 737 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845819517272688e-05, + "loss": 0.4785, + "step": 738 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845171335109776e-05, + "loss": 0.523, + 
"step": 739 + }, + { + "epoch": 0.08, + "learning_rate": 1.9844521803938588e-05, + "loss": 0.4755, + "step": 740 + }, + { + "epoch": 0.08, + "learning_rate": 1.9843870923848122e-05, + "loss": 0.4949, + "step": 741 + }, + { + "epoch": 0.08, + "learning_rate": 1.984321869492756e-05, + "loss": 0.5024, + "step": 742 + }, + { + "epoch": 0.08, + "learning_rate": 1.984256511726628e-05, + "loss": 0.502, + "step": 743 + }, + { + "epoch": 0.09, + "learning_rate": 1.984191019095383e-05, + "loss": 0.4899, + "step": 744 + }, + { + "epoch": 0.09, + "learning_rate": 1.9841253916079953e-05, + "loss": 0.5067, + "step": 745 + }, + { + "epoch": 0.09, + "learning_rate": 1.9840596292734573e-05, + "loss": 0.4877, + "step": 746 + }, + { + "epoch": 0.09, + "learning_rate": 1.9839937321007795e-05, + "loss": 0.5142, + "step": 747 + }, + { + "epoch": 0.09, + "learning_rate": 1.983927700098992e-05, + "loss": 0.488, + "step": 748 + }, + { + "epoch": 0.09, + "learning_rate": 1.983861533277142e-05, + "loss": 0.501, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 1.983795231644296e-05, + "loss": 0.4959, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 1.983728795209539e-05, + "loss": 0.5074, + "step": 751 + }, + { + "epoch": 0.09, + "learning_rate": 1.9836622239819743e-05, + "loss": 0.4955, + "step": 752 + }, + { + "epoch": 0.09, + "learning_rate": 1.983595517970723e-05, + "loss": 0.4876, + "step": 753 + }, + { + "epoch": 0.09, + "learning_rate": 1.9835286771849264e-05, + "loss": 0.4906, + "step": 754 + }, + { + "epoch": 0.09, + "learning_rate": 1.9834617016337424e-05, + "loss": 0.5096, + "step": 755 + }, + { + "epoch": 0.09, + "learning_rate": 1.9833945913263483e-05, + "loss": 0.513, + "step": 756 + }, + { + "epoch": 0.09, + "learning_rate": 1.9833273462719396e-05, + "loss": 0.5042, + "step": 757 + }, + { + "epoch": 0.09, + "learning_rate": 1.9832599664797306e-05, + "loss": 0.4622, + "step": 758 + }, + { + "epoch": 0.09, + "learning_rate": 1.9831924519589537e-05, + "loss": 
0.4877, + "step": 759 + }, + { + "epoch": 0.09, + "learning_rate": 1.9831248027188604e-05, + "loss": 0.5123, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 1.983057018768719e-05, + "loss": 0.4812, + "step": 761 + }, + { + "epoch": 0.09, + "learning_rate": 1.982989100117819e-05, + "loss": 0.4839, + "step": 762 + }, + { + "epoch": 0.09, + "learning_rate": 1.9829210467754654e-05, + "loss": 0.5185, + "step": 763 + }, + { + "epoch": 0.09, + "learning_rate": 1.9828528587509836e-05, + "loss": 0.4878, + "step": 764 + }, + { + "epoch": 0.09, + "learning_rate": 1.982784536053717e-05, + "loss": 0.4959, + "step": 765 + }, + { + "epoch": 0.09, + "learning_rate": 1.9827160786930267e-05, + "loss": 0.5101, + "step": 766 + }, + { + "epoch": 0.09, + "learning_rate": 1.9826474866782933e-05, + "loss": 0.4955, + "step": 767 + }, + { + "epoch": 0.09, + "learning_rate": 1.9825787600189163e-05, + "loss": 0.5152, + "step": 768 + }, + { + "epoch": 0.09, + "learning_rate": 1.982509898724311e-05, + "loss": 0.4909, + "step": 769 + }, + { + "epoch": 0.09, + "learning_rate": 1.9824409028039143e-05, + "loss": 0.5013, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 1.9823717722671798e-05, + "loss": 0.4928, + "step": 771 + }, + { + "epoch": 0.09, + "learning_rate": 1.98230250712358e-05, + "loss": 0.4885, + "step": 772 + }, + { + "epoch": 0.09, + "learning_rate": 1.9822331073826056e-05, + "loss": 0.5048, + "step": 773 + }, + { + "epoch": 0.09, + "learning_rate": 1.982163573053766e-05, + "loss": 0.4921, + "step": 774 + }, + { + "epoch": 0.09, + "learning_rate": 1.9820939041465887e-05, + "loss": 0.4884, + "step": 775 + }, + { + "epoch": 0.09, + "learning_rate": 1.9820241006706203e-05, + "loss": 0.4746, + "step": 776 + }, + { + "epoch": 0.09, + "learning_rate": 1.9819541626354252e-05, + "loss": 0.4968, + "step": 777 + }, + { + "epoch": 0.09, + "learning_rate": 1.9818840900505866e-05, + "loss": 0.4767, + "step": 778 + }, + { + "epoch": 0.09, + "learning_rate": 
1.9818138829257063e-05, + "loss": 0.5091, + "step": 779 + }, + { + "epoch": 0.09, + "learning_rate": 1.9817435412704037e-05, + "loss": 0.5013, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 1.981673065094317e-05, + "loss": 0.525, + "step": 781 + }, + { + "epoch": 0.09, + "learning_rate": 1.9816024544071038e-05, + "loss": 0.4728, + "step": 782 + }, + { + "epoch": 0.09, + "learning_rate": 1.9815317092184388e-05, + "loss": 0.505, + "step": 783 + }, + { + "epoch": 0.09, + "learning_rate": 1.9814608295380155e-05, + "loss": 0.4836, + "step": 784 + }, + { + "epoch": 0.09, + "learning_rate": 1.9813898153755465e-05, + "loss": 0.4904, + "step": 785 + }, + { + "epoch": 0.09, + "learning_rate": 1.9813186667407624e-05, + "loss": 0.4916, + "step": 786 + }, + { + "epoch": 0.09, + "learning_rate": 1.9812473836434115e-05, + "loss": 0.4835, + "step": 787 + }, + { + "epoch": 0.09, + "learning_rate": 1.981175966093262e-05, + "loss": 0.5016, + "step": 788 + }, + { + "epoch": 0.09, + "learning_rate": 1.9811044141000985e-05, + "loss": 0.507, + "step": 789 + }, + { + "epoch": 0.09, + "learning_rate": 1.9810327276737268e-05, + "loss": 0.4745, + "step": 790 + }, + { + "epoch": 0.09, + "learning_rate": 1.980960906823968e-05, + "loss": 0.5002, + "step": 791 + }, + { + "epoch": 0.09, + "learning_rate": 1.9808889515606644e-05, + "loss": 0.4881, + "step": 792 + }, + { + "epoch": 0.09, + "learning_rate": 1.9808168618936746e-05, + "loss": 0.4806, + "step": 793 + }, + { + "epoch": 0.09, + "learning_rate": 1.980744637832877e-05, + "loss": 0.4801, + "step": 794 + }, + { + "epoch": 0.09, + "learning_rate": 1.9806722793881675e-05, + "loss": 0.4944, + "step": 795 + }, + { + "epoch": 0.09, + "learning_rate": 1.9805997865694616e-05, + "loss": 0.499, + "step": 796 + }, + { + "epoch": 0.09, + "learning_rate": 1.9805271593866914e-05, + "loss": 0.5009, + "step": 797 + }, + { + "epoch": 0.09, + "learning_rate": 1.9804543978498093e-05, + "loss": 0.4867, + "step": 798 + }, + { + "epoch": 0.09, + 
"learning_rate": 1.9803815019687844e-05, + "loss": 0.5012, + "step": 799 + }, + { + "epoch": 0.09, + "learning_rate": 1.980308471753606e-05, + "loss": 0.5014, + "step": 800 + }, + { + "epoch": 0.09, + "learning_rate": 1.9802353072142802e-05, + "loss": 0.5037, + "step": 801 + }, + { + "epoch": 0.09, + "learning_rate": 1.9801620083608327e-05, + "loss": 0.4848, + "step": 802 + }, + { + "epoch": 0.09, + "learning_rate": 1.9800885752033067e-05, + "loss": 0.4847, + "step": 803 + }, + { + "epoch": 0.09, + "learning_rate": 1.980015007751764e-05, + "loss": 0.4958, + "step": 804 + }, + { + "epoch": 0.09, + "learning_rate": 1.9799413060162854e-05, + "loss": 0.4871, + "step": 805 + }, + { + "epoch": 0.09, + "learning_rate": 1.9798674700069698e-05, + "loss": 0.4962, + "step": 806 + }, + { + "epoch": 0.09, + "learning_rate": 1.979793499733934e-05, + "loss": 0.5009, + "step": 807 + }, + { + "epoch": 0.09, + "learning_rate": 1.9797193952073135e-05, + "loss": 0.4764, + "step": 808 + }, + { + "epoch": 0.09, + "learning_rate": 1.9796451564372624e-05, + "loss": 0.5013, + "step": 809 + }, + { + "epoch": 0.09, + "learning_rate": 1.979570783433954e-05, + "loss": 0.5022, + "step": 810 + }, + { + "epoch": 0.09, + "learning_rate": 1.9794962762075772e-05, + "loss": 0.4912, + "step": 811 + }, + { + "epoch": 0.09, + "learning_rate": 1.9794216347683425e-05, + "loss": 0.5052, + "step": 812 + }, + { + "epoch": 0.09, + "learning_rate": 1.979346859126477e-05, + "loss": 0.4961, + "step": 813 + }, + { + "epoch": 0.09, + "learning_rate": 1.979271949292227e-05, + "loss": 0.5019, + "step": 814 + }, + { + "epoch": 0.09, + "learning_rate": 1.9791969052758563e-05, + "loss": 0.497, + "step": 815 + }, + { + "epoch": 0.09, + "learning_rate": 1.979121727087648e-05, + "loss": 0.4944, + "step": 816 + }, + { + "epoch": 0.09, + "learning_rate": 1.979046414737903e-05, + "loss": 0.4989, + "step": 817 + }, + { + "epoch": 0.09, + "learning_rate": 1.978970968236941e-05, + "loss": 0.4955, + "step": 818 + }, + { + 
"epoch": 0.09, + "learning_rate": 1.9788953875950992e-05, + "loss": 0.4843, + "step": 819 + }, + { + "epoch": 0.09, + "learning_rate": 1.9788196728227348e-05, + "loss": 0.4918, + "step": 820 + }, + { + "epoch": 0.09, + "learning_rate": 1.9787438239302217e-05, + "loss": 0.4815, + "step": 821 + }, + { + "epoch": 0.09, + "learning_rate": 1.9786678409279535e-05, + "loss": 0.4935, + "step": 822 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785917238263405e-05, + "loss": 0.4966, + "step": 823 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785154726358134e-05, + "loss": 0.514, + "step": 824 + }, + { + "epoch": 0.09, + "learning_rate": 1.9784390873668206e-05, + "loss": 0.517, + "step": 825 + }, + { + "epoch": 0.09, + "learning_rate": 1.9783625680298276e-05, + "loss": 0.4887, + "step": 826 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782859146353196e-05, + "loss": 0.4793, + "step": 827 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782091271938e-05, + "loss": 0.4941, + "step": 828 + }, + { + "epoch": 0.09, + "learning_rate": 1.9781322057157902e-05, + "loss": 0.5018, + "step": 829 + }, + { + "epoch": 0.09, + "learning_rate": 1.9780551502118306e-05, + "loss": 0.4994, + "step": 830 + }, + { + "epoch": 0.09, + "learning_rate": 1.9779779606924788e-05, + "loss": 0.491, + "step": 831 + }, + { + "epoch": 0.1, + "learning_rate": 1.977900637168312e-05, + "loss": 0.4961, + "step": 832 + }, + { + "epoch": 0.1, + "learning_rate": 1.9778231796499254e-05, + "loss": 0.4925, + "step": 833 + }, + { + "epoch": 0.1, + "learning_rate": 1.977745588147932e-05, + "loss": 0.4914, + "step": 834 + }, + { + "epoch": 0.1, + "learning_rate": 1.977667862672964e-05, + "loss": 0.4989, + "step": 835 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775900032356704e-05, + "loss": 0.4993, + "step": 836 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775120098467212e-05, + "loss": 0.4878, + "step": 837 + }, + { + "epoch": 0.1, + "learning_rate": 1.9774338825168024e-05, + "loss": 0.5092, + "step": 838 + }, + 
{ + "epoch": 0.1, + "learning_rate": 1.977355621256619e-05, + "loss": 0.4944, + "step": 839 + }, + { + "epoch": 0.1, + "learning_rate": 1.9772772260768954e-05, + "loss": 0.5022, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771986969883727e-05, + "loss": 0.5086, + "step": 841 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771200340018115e-05, + "loss": 0.4895, + "step": 842 + }, + { + "epoch": 0.1, + "learning_rate": 1.97704123712799e-05, + "loss": 0.4911, + "step": 843 + }, + { + "epoch": 0.1, + "learning_rate": 1.976962306377706e-05, + "loss": 0.5018, + "step": 844 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768832417617737e-05, + "loss": 0.4837, + "step": 845 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768040432910276e-05, + "loss": 0.5251, + "step": 846 + }, + { + "epoch": 0.1, + "learning_rate": 1.976724710976319e-05, + "loss": 0.5045, + "step": 847 + }, + { + "epoch": 0.1, + "learning_rate": 1.9766452448285184e-05, + "loss": 0.5115, + "step": 848 + }, + { + "epoch": 0.1, + "learning_rate": 1.9765656448585148e-05, + "loss": 0.4769, + "step": 849 + }, + { + "epoch": 0.1, + "learning_rate": 1.976485911077215e-05, + "loss": 0.4936, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 1.9764060434955437e-05, + "loss": 0.4867, + "step": 851 + }, + { + "epoch": 0.1, + "learning_rate": 1.9763260421244455e-05, + "loss": 0.4924, + "step": 852 + }, + { + "epoch": 0.1, + "learning_rate": 1.9762459069748817e-05, + "loss": 0.4903, + "step": 853 + }, + { + "epoch": 0.1, + "learning_rate": 1.9761656380578328e-05, + "loss": 0.4812, + "step": 854 + }, + { + "epoch": 0.1, + "learning_rate": 1.9760852353842973e-05, + "loss": 0.4974, + "step": 855 + }, + { + "epoch": 0.1, + "learning_rate": 1.9760046989652926e-05, + "loss": 0.5047, + "step": 856 + }, + { + "epoch": 0.1, + "learning_rate": 1.9759240288118536e-05, + "loss": 0.4837, + "step": 857 + }, + { + "epoch": 0.1, + "learning_rate": 1.975843224935034e-05, + "loss": 0.4966, + "step": 858 + }, + { + 
"epoch": 0.1, + "learning_rate": 1.9757622873459056e-05, + "loss": 0.505, + "step": 859 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756812160555586e-05, + "loss": 0.4818, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756000110751023e-05, + "loss": 0.4965, + "step": 861 + }, + { + "epoch": 0.1, + "learning_rate": 1.975518672415663e-05, + "loss": 0.4987, + "step": 862 + }, + { + "epoch": 0.1, + "learning_rate": 1.975437200088386e-05, + "loss": 0.4718, + "step": 863 + }, + { + "epoch": 0.1, + "learning_rate": 1.9753555941044345e-05, + "loss": 0.5024, + "step": 864 + }, + { + "epoch": 0.1, + "learning_rate": 1.9752738544749906e-05, + "loss": 0.4937, + "step": 865 + }, + { + "epoch": 0.1, + "learning_rate": 1.975191981211255e-05, + "loss": 0.4968, + "step": 866 + }, + { + "epoch": 0.1, + "learning_rate": 1.9751099743244454e-05, + "loss": 0.4785, + "step": 867 + }, + { + "epoch": 0.1, + "learning_rate": 1.9750278338257985e-05, + "loss": 0.5002, + "step": 868 + }, + { + "epoch": 0.1, + "learning_rate": 1.9749455597265704e-05, + "loss": 0.4948, + "step": 869 + }, + { + "epoch": 0.1, + "learning_rate": 1.9748631520380333e-05, + "loss": 0.4887, + "step": 870 + }, + { + "epoch": 0.1, + "learning_rate": 1.97478061077148e-05, + "loss": 0.4751, + "step": 871 + }, + { + "epoch": 0.1, + "learning_rate": 1.9746979359382193e-05, + "loss": 0.4976, + "step": 872 + }, + { + "epoch": 0.1, + "learning_rate": 1.9746151275495803e-05, + "loss": 0.5071, + "step": 873 + }, + { + "epoch": 0.1, + "learning_rate": 1.974532185616909e-05, + "loss": 0.5251, + "step": 874 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744491101515715e-05, + "loss": 0.484, + "step": 875 + }, + { + "epoch": 0.1, + "learning_rate": 1.9743659011649495e-05, + "loss": 0.4894, + "step": 876 + }, + { + "epoch": 0.1, + "learning_rate": 1.9742825586684457e-05, + "loss": 0.4815, + "step": 877 + }, + { + "epoch": 0.1, + "learning_rate": 1.9741990826734793e-05, + "loss": 0.5014, + "step": 878 + }, + { + 
"epoch": 0.1, + "learning_rate": 1.9741154731914882e-05, + "loss": 0.4918, + "step": 879 + }, + { + "epoch": 0.1, + "learning_rate": 1.974031730233929e-05, + "loss": 0.4868, + "step": 880 + }, + { + "epoch": 0.1, + "learning_rate": 1.9739478538122765e-05, + "loss": 0.4782, + "step": 881 + }, + { + "epoch": 0.1, + "learning_rate": 1.9738638439380237e-05, + "loss": 0.5087, + "step": 882 + }, + { + "epoch": 0.1, + "learning_rate": 1.9737797006226815e-05, + "loss": 0.4789, + "step": 883 + }, + { + "epoch": 0.1, + "learning_rate": 1.9736954238777793e-05, + "loss": 0.4782, + "step": 884 + }, + { + "epoch": 0.1, + "learning_rate": 1.973611013714865e-05, + "loss": 0.4946, + "step": 885 + }, + { + "epoch": 0.1, + "learning_rate": 1.9735264701455054e-05, + "loss": 0.5054, + "step": 886 + }, + { + "epoch": 0.1, + "learning_rate": 1.973441793181284e-05, + "loss": 0.4952, + "step": 887 + }, + { + "epoch": 0.1, + "learning_rate": 1.9733569828338038e-05, + "loss": 0.4816, + "step": 888 + }, + { + "epoch": 0.1, + "learning_rate": 1.9732720391146852e-05, + "loss": 0.5189, + "step": 889 + }, + { + "epoch": 0.1, + "learning_rate": 1.973186962035568e-05, + "loss": 0.4958, + "step": 890 + }, + { + "epoch": 0.1, + "learning_rate": 1.97310175160811e-05, + "loss": 0.5128, + "step": 891 + }, + { + "epoch": 0.1, + "learning_rate": 1.9730164078439857e-05, + "loss": 0.5123, + "step": 892 + }, + { + "epoch": 0.1, + "learning_rate": 1.97293093075489e-05, + "loss": 0.4817, + "step": 893 + }, + { + "epoch": 0.1, + "learning_rate": 1.9728453203525352e-05, + "loss": 0.5027, + "step": 894 + }, + { + "epoch": 0.1, + "learning_rate": 1.9727595766486514e-05, + "loss": 0.5033, + "step": 895 + }, + { + "epoch": 0.1, + "learning_rate": 1.972673699654988e-05, + "loss": 0.4904, + "step": 896 + }, + { + "epoch": 0.1, + "learning_rate": 1.9725876893833108e-05, + "loss": 0.4848, + "step": 897 + }, + { + "epoch": 0.1, + "learning_rate": 1.9725015458454068e-05, + "loss": 0.4898, + "step": 898 + }, + { + "epoch": 
0.1, + "learning_rate": 1.9724152690530785e-05, + "loss": 0.5056, + "step": 899 + }, + { + "epoch": 0.1, + "learning_rate": 1.972328859018148e-05, + "loss": 0.4908, + "step": 900 + }, + { + "epoch": 0.1, + "learning_rate": 1.9722423157524553e-05, + "loss": 0.4829, + "step": 901 + }, + { + "epoch": 0.1, + "learning_rate": 1.972155639267859e-05, + "loss": 0.4885, + "step": 902 + }, + { + "epoch": 0.1, + "learning_rate": 1.972068829576236e-05, + "loss": 0.4938, + "step": 903 + }, + { + "epoch": 0.1, + "learning_rate": 1.9719818866894802e-05, + "loss": 0.4802, + "step": 904 + }, + { + "epoch": 0.1, + "learning_rate": 1.9718948106195055e-05, + "loss": 0.5079, + "step": 905 + }, + { + "epoch": 0.1, + "learning_rate": 1.971807601378243e-05, + "loss": 0.4982, + "step": 906 + }, + { + "epoch": 0.1, + "learning_rate": 1.9717202589776424e-05, + "loss": 0.4993, + "step": 907 + }, + { + "epoch": 0.1, + "learning_rate": 1.971632783429672e-05, + "loss": 0.5026, + "step": 908 + }, + { + "epoch": 0.1, + "learning_rate": 1.9715451747463168e-05, + "loss": 0.4881, + "step": 909 + }, + { + "epoch": 0.1, + "learning_rate": 1.971457432939582e-05, + "loss": 0.5166, + "step": 910 + }, + { + "epoch": 0.1, + "learning_rate": 1.97136955802149e-05, + "loss": 0.4888, + "step": 911 + }, + { + "epoch": 0.1, + "learning_rate": 1.9712815500040815e-05, + "loss": 0.4803, + "step": 912 + }, + { + "epoch": 0.1, + "learning_rate": 1.9711934088994157e-05, + "loss": 0.4939, + "step": 913 + }, + { + "epoch": 0.1, + "learning_rate": 1.97110513471957e-05, + "loss": 0.4948, + "step": 914 + }, + { + "epoch": 0.1, + "learning_rate": 1.9710167274766395e-05, + "loss": 0.4926, + "step": 915 + }, + { + "epoch": 0.1, + "learning_rate": 1.9709281871827386e-05, + "loss": 0.4969, + "step": 916 + }, + { + "epoch": 0.1, + "learning_rate": 1.9708395138499986e-05, + "loss": 0.4836, + "step": 917 + }, + { + "epoch": 0.1, + "learning_rate": 1.97075070749057e-05, + "loss": 0.5263, + "step": 918 + }, + { + "epoch": 0.11, + 
"learning_rate": 1.970661768116622e-05, + "loss": 0.4922, + "step": 919 + }, + { + "epoch": 0.11, + "learning_rate": 1.9705726957403398e-05, + "loss": 0.4912, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 1.9704834903739297e-05, + "loss": 0.4587, + "step": 921 + }, + { + "epoch": 0.11, + "learning_rate": 1.970394152029614e-05, + "loss": 0.5059, + "step": 922 + }, + { + "epoch": 0.11, + "learning_rate": 1.970304680719634e-05, + "loss": 0.4826, + "step": 923 + }, + { + "epoch": 0.11, + "learning_rate": 1.9702150764562498e-05, + "loss": 0.5044, + "step": 924 + }, + { + "epoch": 0.11, + "learning_rate": 1.970125339251739e-05, + "loss": 0.4838, + "step": 925 + }, + { + "epoch": 0.11, + "learning_rate": 1.9700354691183977e-05, + "loss": 0.5082, + "step": 926 + }, + { + "epoch": 0.11, + "learning_rate": 1.9699454660685398e-05, + "loss": 0.4833, + "step": 927 + }, + { + "epoch": 0.11, + "learning_rate": 1.969855330114498e-05, + "loss": 0.4837, + "step": 928 + }, + { + "epoch": 0.11, + "learning_rate": 1.9697650612686228e-05, + "loss": 0.4915, + "step": 929 + }, + { + "epoch": 0.11, + "learning_rate": 1.9696746595432828e-05, + "loss": 0.4941, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 1.9695841249508656e-05, + "loss": 0.5001, + "step": 931 + }, + { + "epoch": 0.11, + "learning_rate": 1.9694934575037762e-05, + "loss": 0.514, + "step": 932 + }, + { + "epoch": 0.11, + "learning_rate": 1.969402657214438e-05, + "loss": 0.4917, + "step": 933 + }, + { + "epoch": 0.11, + "learning_rate": 1.9693117240952928e-05, + "loss": 0.4981, + "step": 934 + }, + { + "epoch": 0.11, + "learning_rate": 1.9692206581588e-05, + "loss": 0.4806, + "step": 935 + }, + { + "epoch": 0.11, + "learning_rate": 1.969129459417438e-05, + "loss": 0.496, + "step": 936 + }, + { + "epoch": 0.11, + "learning_rate": 1.9690381278837038e-05, + "loss": 0.4817, + "step": 937 + }, + { + "epoch": 0.11, + "learning_rate": 1.9689466635701106e-05, + "loss": 0.5036, + "step": 938 + }, + { + "epoch": 
0.11, + "learning_rate": 1.9688550664891915e-05, + "loss": 0.5118, + "step": 939 + }, + { + "epoch": 0.11, + "learning_rate": 1.968763336653498e-05, + "loss": 0.5007, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 1.968671474075598e-05, + "loss": 0.4755, + "step": 941 + }, + { + "epoch": 0.11, + "learning_rate": 1.96857947876808e-05, + "loss": 0.51, + "step": 942 + }, + { + "epoch": 0.11, + "learning_rate": 1.968487350743548e-05, + "loss": 0.4993, + "step": 943 + }, + { + "epoch": 0.11, + "learning_rate": 1.968395090014627e-05, + "loss": 0.5162, + "step": 944 + }, + { + "epoch": 0.11, + "learning_rate": 1.968302696593958e-05, + "loss": 0.4912, + "step": 945 + }, + { + "epoch": 0.11, + "learning_rate": 1.968210170494201e-05, + "loss": 0.4905, + "step": 946 + }, + { + "epoch": 0.11, + "learning_rate": 1.9681175117280343e-05, + "loss": 0.4988, + "step": 947 + }, + { + "epoch": 0.11, + "learning_rate": 1.9680247203081537e-05, + "loss": 0.4947, + "step": 948 + }, + { + "epoch": 0.11, + "learning_rate": 1.9679317962472746e-05, + "loss": 0.4997, + "step": 949 + }, + { + "epoch": 0.11, + "learning_rate": 1.9678387395581292e-05, + "loss": 0.4728, + "step": 950 + }, + { + "epoch": 0.11, + "learning_rate": 1.967745550253468e-05, + "loss": 0.4971, + "step": 951 + }, + { + "epoch": 0.11, + "learning_rate": 1.9676522283460606e-05, + "loss": 0.488, + "step": 952 + }, + { + "epoch": 0.11, + "learning_rate": 1.9675587738486935e-05, + "loss": 0.4897, + "step": 953 + }, + { + "epoch": 0.11, + "learning_rate": 1.9674651867741733e-05, + "loss": 0.4924, + "step": 954 + }, + { + "epoch": 0.11, + "learning_rate": 1.967371467135322e-05, + "loss": 0.489, + "step": 955 + }, + { + "epoch": 0.11, + "learning_rate": 1.9672776149449826e-05, + "loss": 0.4761, + "step": 956 + }, + { + "epoch": 0.11, + "learning_rate": 1.967183630216014e-05, + "loss": 0.513, + "step": 957 + }, + { + "epoch": 0.11, + "learning_rate": 1.9670895129612946e-05, + "loss": 0.4968, + "step": 958 + }, + { + 
"epoch": 0.11, + "learning_rate": 1.9669952631937206e-05, + "loss": 0.4754, + "step": 959 + }, + { + "epoch": 0.11, + "learning_rate": 1.9669008809262064e-05, + "loss": 0.4952, + "step": 960 + }, + { + "epoch": 0.11, + "learning_rate": 1.9668063661716837e-05, + "loss": 0.4881, + "step": 961 + }, + { + "epoch": 0.11, + "learning_rate": 1.9667117189431045e-05, + "loss": 0.4988, + "step": 962 + }, + { + "epoch": 0.11, + "learning_rate": 1.9666169392534363e-05, + "loss": 0.4809, + "step": 963 + }, + { + "epoch": 0.11, + "learning_rate": 1.966522027115667e-05, + "loss": 0.4772, + "step": 964 + }, + { + "epoch": 0.11, + "learning_rate": 1.966426982542801e-05, + "loss": 0.4936, + "step": 965 + }, + { + "epoch": 0.11, + "learning_rate": 1.9663318055478616e-05, + "loss": 0.4872, + "step": 966 + }, + { + "epoch": 0.11, + "learning_rate": 1.9662364961438907e-05, + "loss": 0.5017, + "step": 967 + }, + { + "epoch": 0.11, + "learning_rate": 1.966141054343947e-05, + "loss": 0.4938, + "step": 968 + }, + { + "epoch": 0.11, + "learning_rate": 1.9660454801611094e-05, + "loss": 0.4742, + "step": 969 + }, + { + "epoch": 0.11, + "learning_rate": 1.9659497736084722e-05, + "loss": 0.4818, + "step": 970 + }, + { + "epoch": 0.11, + "learning_rate": 1.9658539346991504e-05, + "loss": 0.4862, + "step": 971 + }, + { + "epoch": 0.11, + "learning_rate": 1.9657579634462757e-05, + "loss": 0.4894, + "step": 972 + }, + { + "epoch": 0.11, + "learning_rate": 1.9656618598629985e-05, + "loss": 0.4805, + "step": 973 + }, + { + "epoch": 0.11, + "learning_rate": 1.9655656239624864e-05, + "loss": 0.4846, + "step": 974 + }, + { + "epoch": 0.11, + "learning_rate": 1.965469255757927e-05, + "loss": 0.5027, + "step": 975 + }, + { + "epoch": 0.11, + "learning_rate": 1.9653727552625242e-05, + "loss": 0.4863, + "step": 976 + }, + { + "epoch": 0.11, + "learning_rate": 1.9652761224895006e-05, + "loss": 0.4852, + "step": 977 + }, + { + "epoch": 0.11, + "learning_rate": 1.9651793574520975e-05, + "loss": 0.4887, + 
"step": 978 + }, + { + "epoch": 0.11, + "learning_rate": 1.965082460163574e-05, + "loss": 0.4766, + "step": 979 + }, + { + "epoch": 0.11, + "learning_rate": 1.9649854306372065e-05, + "loss": 0.5133, + "step": 980 + }, + { + "epoch": 0.11, + "learning_rate": 1.9648882688862905e-05, + "loss": 0.4669, + "step": 981 + }, + { + "epoch": 0.11, + "learning_rate": 1.9647909749241394e-05, + "loss": 0.4821, + "step": 982 + }, + { + "epoch": 0.11, + "learning_rate": 1.9646935487640848e-05, + "loss": 0.4946, + "step": 983 + }, + { + "epoch": 0.11, + "learning_rate": 1.964595990419476e-05, + "loss": 0.5043, + "step": 984 + }, + { + "epoch": 0.11, + "learning_rate": 1.964498299903681e-05, + "loss": 0.4751, + "step": 985 + }, + { + "epoch": 0.11, + "learning_rate": 1.964400477230085e-05, + "loss": 0.5033, + "step": 986 + }, + { + "epoch": 0.11, + "learning_rate": 1.9643025224120923e-05, + "loss": 0.4757, + "step": 987 + }, + { + "epoch": 0.11, + "learning_rate": 1.9642044354631255e-05, + "loss": 0.4983, + "step": 988 + }, + { + "epoch": 0.11, + "learning_rate": 1.9641062163966232e-05, + "loss": 0.49, + "step": 989 + }, + { + "epoch": 0.11, + "learning_rate": 1.9640078652260447e-05, + "loss": 0.4855, + "step": 990 + }, + { + "epoch": 0.11, + "learning_rate": 1.9639093819648664e-05, + "loss": 0.5073, + "step": 991 + }, + { + "epoch": 0.11, + "learning_rate": 1.963810766626582e-05, + "loss": 0.5177, + "step": 992 + }, + { + "epoch": 0.11, + "learning_rate": 1.9637120192247046e-05, + "loss": 0.4823, + "step": 993 + }, + { + "epoch": 0.11, + "learning_rate": 1.9636131397727646e-05, + "loss": 0.4998, + "step": 994 + }, + { + "epoch": 0.11, + "learning_rate": 1.9635141282843105e-05, + "loss": 0.4702, + "step": 995 + }, + { + "epoch": 0.11, + "learning_rate": 1.9634149847729093e-05, + "loss": 0.4997, + "step": 996 + }, + { + "epoch": 0.11, + "learning_rate": 1.963315709252146e-05, + "loss": 0.4942, + "step": 997 + }, + { + "epoch": 0.11, + "learning_rate": 1.963216301735623e-05, + 
"loss": 0.5002, + "step": 998 + }, + { + "epoch": 0.11, + "learning_rate": 1.9631167622369617e-05, + "loss": 0.5039, + "step": 999 + }, + { + "epoch": 0.11, + "learning_rate": 1.9630170907698015e-05, + "loss": 0.497, + "step": 1000 + }, + { + "epoch": 0.11, + "learning_rate": 1.9629172873477995e-05, + "loss": 0.4884, + "step": 1001 + }, + { + "epoch": 0.11, + "learning_rate": 1.9628173519846308e-05, + "loss": 0.4728, + "step": 1002 + }, + { + "epoch": 0.11, + "learning_rate": 1.9627172846939886e-05, + "loss": 0.5004, + "step": 1003 + }, + { + "epoch": 0.11, + "learning_rate": 1.962617085489585e-05, + "loss": 0.4989, + "step": 1004 + }, + { + "epoch": 0.11, + "learning_rate": 1.962516754385149e-05, + "loss": 0.4877, + "step": 1005 + }, + { + "epoch": 0.11, + "learning_rate": 1.962416291394428e-05, + "loss": 0.4992, + "step": 1006 + }, + { + "epoch": 0.12, + "learning_rate": 1.9623156965311884e-05, + "loss": 0.4895, + "step": 1007 + }, + { + "epoch": 0.12, + "learning_rate": 1.9622149698092135e-05, + "loss": 0.4922, + "step": 1008 + }, + { + "epoch": 0.12, + "learning_rate": 1.962114111242305e-05, + "loss": 0.5085, + "step": 1009 + }, + { + "epoch": 0.12, + "learning_rate": 1.962013120844283e-05, + "loss": 0.473, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 1.9619119986289855e-05, + "loss": 0.4942, + "step": 1011 + }, + { + "epoch": 0.12, + "learning_rate": 1.9618107446102682e-05, + "loss": 0.4962, + "step": 1012 + }, + { + "epoch": 0.12, + "learning_rate": 1.9617093588020057e-05, + "loss": 0.4859, + "step": 1013 + }, + { + "epoch": 0.12, + "learning_rate": 1.9616078412180896e-05, + "loss": 0.4819, + "step": 1014 + }, + { + "epoch": 0.12, + "learning_rate": 1.96150619187243e-05, + "loss": 0.4826, + "step": 1015 + }, + { + "epoch": 0.12, + "learning_rate": 1.9614044107789553e-05, + "loss": 0.5166, + "step": 1016 + }, + { + "epoch": 0.12, + "learning_rate": 1.9613024979516123e-05, + "loss": 0.4963, + "step": 1017 + }, + { + "epoch": 0.12, + 
"learning_rate": 1.9612004534043644e-05, + "loss": 0.4796, + "step": 1018 + }, + { + "epoch": 0.12, + "learning_rate": 1.9610982771511947e-05, + "loss": 0.4787, + "step": 1019 + }, + { + "epoch": 0.12, + "learning_rate": 1.9609959692061037e-05, + "loss": 0.4938, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 1.9608935295831092e-05, + "loss": 0.48, + "step": 1021 + }, + { + "epoch": 0.12, + "learning_rate": 1.9607909582962478e-05, + "loss": 0.4807, + "step": 1022 + }, + { + "epoch": 0.12, + "learning_rate": 1.9606882553595748e-05, + "loss": 0.4893, + "step": 1023 + }, + { + "epoch": 0.12, + "learning_rate": 1.960585420787162e-05, + "loss": 0.4854, + "step": 1024 + }, + { + "epoch": 0.12, + "learning_rate": 1.9604824545931005e-05, + "loss": 0.5115, + "step": 1025 + }, + { + "epoch": 0.12, + "learning_rate": 1.960379356791499e-05, + "loss": 0.4804, + "step": 1026 + }, + { + "epoch": 0.12, + "learning_rate": 1.960276127396484e-05, + "loss": 0.4954, + "step": 1027 + }, + { + "epoch": 0.12, + "learning_rate": 1.9601727664222e-05, + "loss": 0.4761, + "step": 1028 + }, + { + "epoch": 0.12, + "learning_rate": 1.96006927388281e-05, + "loss": 0.5093, + "step": 1029 + }, + { + "epoch": 0.12, + "learning_rate": 1.959965649792495e-05, + "loss": 0.4766, + "step": 1030 + }, + { + "epoch": 0.12, + "learning_rate": 1.9598618941654535e-05, + "loss": 0.4788, + "step": 1031 + }, + { + "epoch": 0.12, + "learning_rate": 1.9597580070159026e-05, + "loss": 0.4934, + "step": 1032 + }, + { + "epoch": 0.12, + "learning_rate": 1.9596539883580773e-05, + "loss": 0.4993, + "step": 1033 + }, + { + "epoch": 0.12, + "learning_rate": 1.9595498382062295e-05, + "loss": 0.4967, + "step": 1034 + }, + { + "epoch": 0.12, + "learning_rate": 1.9594455565746313e-05, + "loss": 0.5108, + "step": 1035 + }, + { + "epoch": 0.12, + "learning_rate": 1.959341143477571e-05, + "loss": 0.482, + "step": 1036 + }, + { + "epoch": 0.12, + "learning_rate": 1.9592365989293557e-05, + "loss": 0.4744, + "step": 1037 
+ }, + { + "epoch": 0.12, + "learning_rate": 1.95913192294431e-05, + "loss": 0.5033, + "step": 1038 + }, + { + "epoch": 0.12, + "learning_rate": 1.9590271155367776e-05, + "loss": 0.4749, + "step": 1039 + }, + { + "epoch": 0.12, + "learning_rate": 1.9589221767211188e-05, + "loss": 0.489, + "step": 1040 + }, + { + "epoch": 0.12, + "learning_rate": 1.9588171065117122e-05, + "loss": 0.4956, + "step": 1041 + }, + { + "epoch": 0.12, + "learning_rate": 1.9587119049229558e-05, + "loss": 0.4975, + "step": 1042 + }, + { + "epoch": 0.12, + "learning_rate": 1.9586065719692636e-05, + "loss": 0.5006, + "step": 1043 + }, + { + "epoch": 0.12, + "learning_rate": 1.9585011076650695e-05, + "loss": 0.5086, + "step": 1044 + }, + { + "epoch": 0.12, + "learning_rate": 1.958395512024824e-05, + "loss": 0.4902, + "step": 1045 + }, + { + "epoch": 0.12, + "learning_rate": 1.9582897850629958e-05, + "loss": 0.4795, + "step": 1046 + }, + { + "epoch": 0.12, + "learning_rate": 1.9581839267940722e-05, + "loss": 0.4852, + "step": 1047 + }, + { + "epoch": 0.12, + "learning_rate": 1.9580779372325583e-05, + "loss": 0.4886, + "step": 1048 + }, + { + "epoch": 0.12, + "learning_rate": 1.9579718163929767e-05, + "loss": 0.4913, + "step": 1049 + }, + { + "epoch": 0.12, + "learning_rate": 1.957865564289868e-05, + "loss": 0.5032, + "step": 1050 + }, + { + "epoch": 0.12, + "learning_rate": 1.9577591809377917e-05, + "loss": 0.4787, + "step": 1051 + }, + { + "epoch": 0.12, + "learning_rate": 1.957652666351325e-05, + "loss": 0.5013, + "step": 1052 + }, + { + "epoch": 0.12, + "learning_rate": 1.9575460205450616e-05, + "loss": 0.5499, + "step": 1053 + }, + { + "epoch": 0.12, + "learning_rate": 1.9574392435336156e-05, + "loss": 0.4768, + "step": 1054 + }, + { + "epoch": 0.12, + "learning_rate": 1.957332335331617e-05, + "loss": 0.4963, + "step": 1055 + }, + { + "epoch": 0.12, + "learning_rate": 1.957225295953715e-05, + "loss": 0.4937, + "step": 1056 + }, + { + "epoch": 0.12, + "learning_rate": 1.9571181254145762e-05, 
+ "loss": 0.4775, + "step": 1057 + }, + { + "epoch": 0.12, + "learning_rate": 1.9570108237288853e-05, + "loss": 0.5146, + "step": 1058 + }, + { + "epoch": 0.12, + "learning_rate": 1.9569033909113454e-05, + "loss": 0.4953, + "step": 1059 + }, + { + "epoch": 0.12, + "learning_rate": 1.9567958269766768e-05, + "loss": 0.4758, + "step": 1060 + }, + { + "epoch": 0.12, + "learning_rate": 1.9566881319396184e-05, + "loss": 0.5177, + "step": 1061 + }, + { + "epoch": 0.12, + "learning_rate": 1.956580305814927e-05, + "loss": 0.4715, + "step": 1062 + }, + { + "epoch": 0.12, + "learning_rate": 1.9564723486173766e-05, + "loss": 0.4987, + "step": 1063 + }, + { + "epoch": 0.12, + "learning_rate": 1.95636426036176e-05, + "loss": 0.4819, + "step": 1064 + }, + { + "epoch": 0.12, + "learning_rate": 1.9562560410628883e-05, + "loss": 0.5034, + "step": 1065 + }, + { + "epoch": 0.12, + "learning_rate": 1.9561476907355886e-05, + "loss": 0.5026, + "step": 1066 + }, + { + "epoch": 0.12, + "learning_rate": 1.956039209394709e-05, + "loss": 0.5056, + "step": 1067 + }, + { + "epoch": 0.12, + "learning_rate": 1.9559305970551125e-05, + "loss": 0.4825, + "step": 1068 + }, + { + "epoch": 0.12, + "learning_rate": 1.955821853731682e-05, + "loss": 0.515, + "step": 1069 + }, + { + "epoch": 0.12, + "learning_rate": 1.955712979439318e-05, + "loss": 0.4992, + "step": 1070 + }, + { + "epoch": 0.12, + "learning_rate": 1.955603974192938e-05, + "loss": 0.483, + "step": 1071 + }, + { + "epoch": 0.12, + "learning_rate": 1.955494838007479e-05, + "loss": 0.493, + "step": 1072 + }, + { + "epoch": 0.12, + "learning_rate": 1.9553855708978943e-05, + "loss": 0.4969, + "step": 1073 + }, + { + "epoch": 0.12, + "learning_rate": 1.9552761728791563e-05, + "loss": 0.506, + "step": 1074 + }, + { + "epoch": 0.12, + "learning_rate": 1.955166643966255e-05, + "loss": 0.4959, + "step": 1075 + }, + { + "epoch": 0.12, + "learning_rate": 1.9550569841741984e-05, + "loss": 0.4879, + "step": 1076 + }, + { + "epoch": 0.12, + 
"learning_rate": 1.9549471935180123e-05, + "loss": 0.4908, + "step": 1077 + }, + { + "epoch": 0.12, + "learning_rate": 1.95483727201274e-05, + "loss": 0.4847, + "step": 1078 + }, + { + "epoch": 0.12, + "learning_rate": 1.9547272196734436e-05, + "loss": 0.5054, + "step": 1079 + }, + { + "epoch": 0.12, + "learning_rate": 1.954617036515203e-05, + "loss": 0.4997, + "step": 1080 + }, + { + "epoch": 0.12, + "learning_rate": 1.9545067225531155e-05, + "loss": 0.5026, + "step": 1081 + }, + { + "epoch": 0.12, + "learning_rate": 1.954396277802296e-05, + "loss": 0.5015, + "step": 1082 + }, + { + "epoch": 0.12, + "learning_rate": 1.954285702277879e-05, + "loss": 0.4918, + "step": 1083 + }, + { + "epoch": 0.12, + "learning_rate": 1.954174995995015e-05, + "loss": 0.4966, + "step": 1084 + }, + { + "epoch": 0.12, + "learning_rate": 1.9540641589688735e-05, + "loss": 0.4972, + "step": 1085 + }, + { + "epoch": 0.12, + "learning_rate": 1.953953191214642e-05, + "loss": 0.4849, + "step": 1086 + }, + { + "epoch": 0.12, + "learning_rate": 1.9538420927475247e-05, + "loss": 0.5057, + "step": 1087 + }, + { + "epoch": 0.12, + "learning_rate": 1.953730863582745e-05, + "loss": 0.4687, + "step": 1088 + }, + { + "epoch": 0.12, + "learning_rate": 1.9536195037355438e-05, + "loss": 0.4987, + "step": 1089 + }, + { + "epoch": 0.12, + "learning_rate": 1.9535080132211805e-05, + "loss": 0.4879, + "step": 1090 + }, + { + "epoch": 0.12, + "learning_rate": 1.9533963920549307e-05, + "loss": 0.4896, + "step": 1091 + }, + { + "epoch": 0.12, + "learning_rate": 1.9532846402520898e-05, + "loss": 0.4914, + "step": 1092 + }, + { + "epoch": 0.12, + "learning_rate": 1.95317275782797e-05, + "loss": 0.4981, + "step": 1093 + }, + { + "epoch": 0.13, + "learning_rate": 1.953060744797901e-05, + "loss": 0.5114, + "step": 1094 + }, + { + "epoch": 0.13, + "learning_rate": 1.9529486011772326e-05, + "loss": 0.4893, + "step": 1095 + }, + { + "epoch": 0.13, + "learning_rate": 1.95283632698133e-05, + "loss": 0.4894, + "step": 1096 
+ }, + { + "epoch": 0.13, + "learning_rate": 1.952723922225577e-05, + "loss": 0.4872, + "step": 1097 + }, + { + "epoch": 0.13, + "learning_rate": 1.952611386925376e-05, + "loss": 0.4907, + "step": 1098 + }, + { + "epoch": 0.13, + "learning_rate": 1.952498721096147e-05, + "loss": 0.4816, + "step": 1099 + }, + { + "epoch": 0.13, + "learning_rate": 1.952385924753328e-05, + "loss": 0.4909, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 1.9522729979123735e-05, + "loss": 0.493, + "step": 1101 + }, + { + "epoch": 0.13, + "learning_rate": 1.952159940588758e-05, + "loss": 0.5024, + "step": 1102 + }, + { + "epoch": 0.13, + "learning_rate": 1.9520467527979726e-05, + "loss": 0.4942, + "step": 1103 + }, + { + "epoch": 0.13, + "learning_rate": 1.9519334345555264e-05, + "loss": 0.484, + "step": 1104 + }, + { + "epoch": 0.13, + "learning_rate": 1.9518199858769466e-05, + "loss": 0.4942, + "step": 1105 + }, + { + "epoch": 0.13, + "learning_rate": 1.9517064067777786e-05, + "loss": 0.469, + "step": 1106 + }, + { + "epoch": 0.13, + "learning_rate": 1.9515926972735847e-05, + "loss": 0.4857, + "step": 1107 + }, + { + "epoch": 0.13, + "learning_rate": 1.9514788573799457e-05, + "loss": 0.4872, + "step": 1108 + }, + { + "epoch": 0.13, + "learning_rate": 1.9513648871124604e-05, + "loss": 0.5115, + "step": 1109 + }, + { + "epoch": 0.13, + "learning_rate": 1.9512507864867452e-05, + "loss": 0.4947, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 1.9511365555184345e-05, + "loss": 0.4913, + "step": 1111 + }, + { + "epoch": 0.13, + "learning_rate": 1.9510221942231803e-05, + "loss": 0.4762, + "step": 1112 + }, + { + "epoch": 0.13, + "learning_rate": 1.950907702616653e-05, + "loss": 0.4975, + "step": 1113 + }, + { + "epoch": 0.13, + "learning_rate": 1.9507930807145406e-05, + "loss": 0.4734, + "step": 1114 + }, + { + "epoch": 0.13, + "learning_rate": 1.9506783285325482e-05, + "loss": 0.4966, + "step": 1115 + }, + { + "epoch": 0.13, + "learning_rate": 1.9505634460863997e-05, + 
"loss": 0.4885, + "step": 1116 + }, + { + "epoch": 0.13, + "learning_rate": 1.950448433391837e-05, + "loss": 0.4667, + "step": 1117 + }, + { + "epoch": 0.13, + "learning_rate": 1.9503332904646188e-05, + "loss": 0.5057, + "step": 1118 + }, + { + "epoch": 0.13, + "learning_rate": 1.9502180173205227e-05, + "loss": 0.5029, + "step": 1119 + }, + { + "epoch": 0.13, + "learning_rate": 1.9501026139753433e-05, + "loss": 0.4856, + "step": 1120 + }, + { + "epoch": 0.13, + "learning_rate": 1.9499870804448936e-05, + "loss": 0.4836, + "step": 1121 + }, + { + "epoch": 0.13, + "learning_rate": 1.9498714167450047e-05, + "loss": 0.4915, + "step": 1122 + }, + { + "epoch": 0.13, + "learning_rate": 1.9497556228915246e-05, + "loss": 0.4904, + "step": 1123 + }, + { + "epoch": 0.13, + "learning_rate": 1.9496396989003195e-05, + "loss": 0.4843, + "step": 1124 + }, + { + "epoch": 0.13, + "learning_rate": 1.9495236447872738e-05, + "loss": 0.481, + "step": 1125 + }, + { + "epoch": 0.13, + "learning_rate": 1.94940746056829e-05, + "loss": 0.5034, + "step": 1126 + }, + { + "epoch": 0.13, + "learning_rate": 1.949291146259287e-05, + "loss": 0.4913, + "step": 1127 + }, + { + "epoch": 0.13, + "learning_rate": 1.949174701876203e-05, + "loss": 0.4805, + "step": 1128 + }, + { + "epoch": 0.13, + "learning_rate": 1.9490581274349934e-05, + "loss": 0.4944, + "step": 1129 + }, + { + "epoch": 0.13, + "learning_rate": 1.9489414229516318e-05, + "loss": 0.4868, + "step": 1130 + }, + { + "epoch": 0.13, + "learning_rate": 1.9488245884421087e-05, + "loss": 0.4768, + "step": 1131 + }, + { + "epoch": 0.13, + "learning_rate": 1.9487076239224337e-05, + "loss": 0.4823, + "step": 1132 + }, + { + "epoch": 0.13, + "learning_rate": 1.948590529408633e-05, + "loss": 0.4853, + "step": 1133 + }, + { + "epoch": 0.13, + "learning_rate": 1.948473304916751e-05, + "loss": 0.4846, + "step": 1134 + }, + { + "epoch": 0.13, + "learning_rate": 1.948355950462851e-05, + "loss": 0.5139, + "step": 1135 + }, + { + "epoch": 0.13, + 
"learning_rate": 1.9482384660630125e-05, + "loss": 0.4903, + "step": 1136 + }, + { + "epoch": 0.13, + "learning_rate": 1.9481208517333336e-05, + "loss": 0.5032, + "step": 1137 + }, + { + "epoch": 0.13, + "learning_rate": 1.9480031074899303e-05, + "loss": 0.4933, + "step": 1138 + }, + { + "epoch": 0.13, + "learning_rate": 1.9478852333489356e-05, + "loss": 0.4865, + "step": 1139 + }, + { + "epoch": 0.13, + "learning_rate": 1.9477672293265014e-05, + "loss": 0.4838, + "step": 1140 + }, + { + "epoch": 0.13, + "learning_rate": 1.9476490954387968e-05, + "loss": 0.4925, + "step": 1141 + }, + { + "epoch": 0.13, + "learning_rate": 1.947530831702009e-05, + "loss": 0.5201, + "step": 1142 + }, + { + "epoch": 0.13, + "learning_rate": 1.9474124381323424e-05, + "loss": 0.4918, + "step": 1143 + }, + { + "epoch": 0.13, + "learning_rate": 1.9472939147460194e-05, + "loss": 0.4993, + "step": 1144 + }, + { + "epoch": 0.13, + "learning_rate": 1.947175261559281e-05, + "loss": 0.4926, + "step": 1145 + }, + { + "epoch": 0.13, + "learning_rate": 1.9470564785883848e-05, + "loss": 0.5156, + "step": 1146 + }, + { + "epoch": 0.13, + "learning_rate": 1.9469375658496066e-05, + "loss": 0.4835, + "step": 1147 + }, + { + "epoch": 0.13, + "learning_rate": 1.946818523359241e-05, + "loss": 0.4778, + "step": 1148 + }, + { + "epoch": 0.13, + "learning_rate": 1.9466993511335985e-05, + "loss": 0.4853, + "step": 1149 + }, + { + "epoch": 0.13, + "learning_rate": 1.9465800491890087e-05, + "loss": 0.4952, + "step": 1150 + }, + { + "epoch": 0.13, + "learning_rate": 1.946460617541819e-05, + "loss": 0.4911, + "step": 1151 + }, + { + "epoch": 0.13, + "learning_rate": 1.9463410562083937e-05, + "loss": 0.4908, + "step": 1152 + }, + { + "epoch": 0.13, + "learning_rate": 1.946221365205115e-05, + "loss": 0.4897, + "step": 1153 + }, + { + "epoch": 0.13, + "learning_rate": 1.9461015445483843e-05, + "loss": 0.4868, + "step": 1154 + }, + { + "epoch": 0.13, + "learning_rate": 1.9459815942546192e-05, + "loss": 0.497, + 
"step": 1155 + }, + { + "epoch": 0.13, + "learning_rate": 1.9458615143402554e-05, + "loss": 0.4966, + "step": 1156 + }, + { + "epoch": 0.13, + "learning_rate": 1.9457413048217466e-05, + "loss": 0.4687, + "step": 1157 + }, + { + "epoch": 0.13, + "learning_rate": 1.9456209657155645e-05, + "loss": 0.4876, + "step": 1158 + }, + { + "epoch": 0.13, + "learning_rate": 1.9455004970381978e-05, + "loss": 0.5257, + "step": 1159 + }, + { + "epoch": 0.13, + "learning_rate": 1.9453798988061535e-05, + "loss": 0.4873, + "step": 1160 + }, + { + "epoch": 0.13, + "learning_rate": 1.9452591710359566e-05, + "loss": 0.4734, + "step": 1161 + }, + { + "epoch": 0.13, + "learning_rate": 1.9451383137441492e-05, + "loss": 0.485, + "step": 1162 + }, + { + "epoch": 0.13, + "learning_rate": 1.9450173269472915e-05, + "loss": 0.4731, + "step": 1163 + }, + { + "epoch": 0.13, + "learning_rate": 1.9448962106619614e-05, + "loss": 0.4943, + "step": 1164 + }, + { + "epoch": 0.13, + "learning_rate": 1.944774964904754e-05, + "loss": 0.4884, + "step": 1165 + }, + { + "epoch": 0.13, + "learning_rate": 1.944653589692284e-05, + "loss": 0.48, + "step": 1166 + }, + { + "epoch": 0.13, + "learning_rate": 1.9445320850411816e-05, + "loss": 0.5034, + "step": 1167 + }, + { + "epoch": 0.13, + "learning_rate": 1.9444104509680954e-05, + "loss": 0.5059, + "step": 1168 + }, + { + "epoch": 0.13, + "learning_rate": 1.9442886874896924e-05, + "loss": 0.4864, + "step": 1169 + }, + { + "epoch": 0.13, + "learning_rate": 1.944166794622657e-05, + "loss": 0.491, + "step": 1170 + }, + { + "epoch": 0.13, + "learning_rate": 1.9440447723836914e-05, + "loss": 0.4786, + "step": 1171 + }, + { + "epoch": 0.13, + "learning_rate": 1.9439226207895143e-05, + "loss": 0.4786, + "step": 1172 + }, + { + "epoch": 0.13, + "learning_rate": 1.9438003398568647e-05, + "loss": 0.4986, + "step": 1173 + }, + { + "epoch": 0.13, + "learning_rate": 1.9436779296024967e-05, + "loss": 0.4635, + "step": 1174 + }, + { + "epoch": 0.13, + "learning_rate": 
1.9435553900431838e-05, + "loss": 0.5228, + "step": 1175 + }, + { + "epoch": 0.13, + "learning_rate": 1.9434327211957166e-05, + "loss": 0.4745, + "step": 1176 + }, + { + "epoch": 0.13, + "learning_rate": 1.943309923076903e-05, + "loss": 0.4875, + "step": 1177 + }, + { + "epoch": 0.13, + "learning_rate": 1.9431869957035698e-05, + "loss": 0.479, + "step": 1178 + }, + { + "epoch": 0.13, + "learning_rate": 1.9430639390925604e-05, + "loss": 0.4843, + "step": 1179 + }, + { + "epoch": 0.13, + "learning_rate": 1.942940753260736e-05, + "loss": 0.4825, + "step": 1180 + }, + { + "epoch": 0.13, + "learning_rate": 1.9428174382249764e-05, + "loss": 0.4885, + "step": 1181 + }, + { + "epoch": 0.14, + "learning_rate": 1.942693994002178e-05, + "loss": 0.4608, + "step": 1182 + }, + { + "epoch": 0.14, + "learning_rate": 1.9425704206092562e-05, + "loss": 0.4976, + "step": 1183 + }, + { + "epoch": 0.14, + "learning_rate": 1.9424467180631422e-05, + "loss": 0.4948, + "step": 1184 + }, + { + "epoch": 0.14, + "learning_rate": 1.942322886380787e-05, + "loss": 0.5119, + "step": 1185 + }, + { + "epoch": 0.14, + "learning_rate": 1.942198925579158e-05, + "loss": 0.4797, + "step": 1186 + }, + { + "epoch": 0.14, + "learning_rate": 1.9420748356752405e-05, + "loss": 0.4915, + "step": 1187 + }, + { + "epoch": 0.14, + "learning_rate": 1.9419506166860374e-05, + "loss": 0.4795, + "step": 1188 + }, + { + "epoch": 0.14, + "learning_rate": 1.9418262686285697e-05, + "loss": 0.4857, + "step": 1189 + }, + { + "epoch": 0.14, + "learning_rate": 1.9417017915198758e-05, + "loss": 0.4809, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 1.9415771853770117e-05, + "loss": 0.4958, + "step": 1191 + }, + { + "epoch": 0.14, + "learning_rate": 1.9414524502170514e-05, + "loss": 0.49, + "step": 1192 + }, + { + "epoch": 0.14, + "learning_rate": 1.941327586057087e-05, + "loss": 0.5236, + "step": 1193 + }, + { + "epoch": 0.14, + "learning_rate": 1.9412025929142263e-05, + "loss": 0.4767, + "step": 1194 + }, + { + 
"epoch": 0.14, + "learning_rate": 1.9410774708055972e-05, + "loss": 0.5111, + "step": 1195 + }, + { + "epoch": 0.14, + "learning_rate": 1.940952219748344e-05, + "loss": 0.4883, + "step": 1196 + }, + { + "epoch": 0.14, + "learning_rate": 1.9408268397596287e-05, + "loss": 0.4899, + "step": 1197 + }, + { + "epoch": 0.14, + "learning_rate": 1.9407013308566315e-05, + "loss": 0.4783, + "step": 1198 + }, + { + "epoch": 0.14, + "learning_rate": 1.9405756930565496e-05, + "loss": 0.5071, + "step": 1199 + }, + { + "epoch": 0.14, + "learning_rate": 1.9404499263765983e-05, + "loss": 0.4763, + "step": 1200 + }, + { + "epoch": 0.14, + "learning_rate": 1.9403240308340105e-05, + "loss": 0.4966, + "step": 1201 + }, + { + "epoch": 0.14, + "learning_rate": 1.940198006446037e-05, + "loss": 0.4701, + "step": 1202 + }, + { + "epoch": 0.14, + "learning_rate": 1.940071853229945e-05, + "loss": 0.5047, + "step": 1203 + }, + { + "epoch": 0.14, + "learning_rate": 1.939945571203021e-05, + "loss": 0.4907, + "step": 1204 + }, + { + "epoch": 0.14, + "learning_rate": 1.9398191603825687e-05, + "loss": 0.4777, + "step": 1205 + }, + { + "epoch": 0.14, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4816, + "step": 1206 + }, + { + "epoch": 0.14, + "learning_rate": 1.9395659524303795e-05, + "loss": 0.492, + "step": 1207 + }, + { + "epoch": 0.14, + "learning_rate": 1.9394391553333384e-05, + "loss": 0.4973, + "step": 1208 + }, + { + "epoch": 0.14, + "learning_rate": 1.939312229512159e-05, + "loss": 0.501, + "step": 1209 + }, + { + "epoch": 0.14, + "learning_rate": 1.9391851749842326e-05, + "loss": 0.4835, + "step": 1210 + }, + { + "epoch": 0.14, + "learning_rate": 1.939057991766969e-05, + "loss": 0.4839, + "step": 1211 + }, + { + "epoch": 0.14, + "learning_rate": 1.938930679877795e-05, + "loss": 0.4796, + "step": 1212 + }, + { + "epoch": 0.14, + "learning_rate": 1.938803239334155e-05, + "loss": 0.4854, + "step": 1213 + }, + { + "epoch": 0.14, + "learning_rate": 1.9386756701535115e-05, + "loss": 
0.5023, + "step": 1214 + }, + { + "epoch": 0.14, + "learning_rate": 1.938547972353344e-05, + "loss": 0.476, + "step": 1215 + }, + { + "epoch": 0.14, + "learning_rate": 1.93842014595115e-05, + "loss": 0.4784, + "step": 1216 + }, + { + "epoch": 0.14, + "learning_rate": 1.9382921909644448e-05, + "loss": 0.5077, + "step": 1217 + }, + { + "epoch": 0.14, + "learning_rate": 1.938164107410761e-05, + "loss": 0.5086, + "step": 1218 + }, + { + "epoch": 0.14, + "learning_rate": 1.938035895307649e-05, + "loss": 0.4828, + "step": 1219 + }, + { + "epoch": 0.14, + "learning_rate": 1.9379075546726764e-05, + "loss": 0.4738, + "step": 1220 + }, + { + "epoch": 0.14, + "learning_rate": 1.9377790855234288e-05, + "loss": 0.484, + "step": 1221 + }, + { + "epoch": 0.14, + "learning_rate": 1.9376504878775098e-05, + "loss": 0.4836, + "step": 1222 + }, + { + "epoch": 0.14, + "learning_rate": 1.9375217617525396e-05, + "loss": 0.4859, + "step": 1223 + }, + { + "epoch": 0.14, + "learning_rate": 1.937392907166157e-05, + "loss": 0.4851, + "step": 1224 + }, + { + "epoch": 0.14, + "learning_rate": 1.9372639241360173e-05, + "loss": 0.4709, + "step": 1225 + }, + { + "epoch": 0.14, + "learning_rate": 1.937134812679795e-05, + "loss": 0.5075, + "step": 1226 + }, + { + "epoch": 0.14, + "learning_rate": 1.9370055728151805e-05, + "loss": 0.5, + "step": 1227 + }, + { + "epoch": 0.14, + "learning_rate": 1.936876204559883e-05, + "loss": 0.4768, + "step": 1228 + }, + { + "epoch": 0.14, + "learning_rate": 1.936746707931628e-05, + "loss": 0.5111, + "step": 1229 + }, + { + "epoch": 0.14, + "learning_rate": 1.9366170829481607e-05, + "loss": 0.4642, + "step": 1230 + }, + { + "epoch": 0.14, + "learning_rate": 1.9364873296272414e-05, + "loss": 0.4755, + "step": 1231 + }, + { + "epoch": 0.14, + "learning_rate": 1.9363574479866504e-05, + "loss": 0.4973, + "step": 1232 + }, + { + "epoch": 0.14, + "learning_rate": 1.936227438044183e-05, + "loss": 0.4738, + "step": 1233 + }, + { + "epoch": 0.14, + "learning_rate": 
1.9360972998176547e-05, + "loss": 0.4883, + "step": 1234 + }, + { + "epoch": 0.14, + "learning_rate": 1.9359670333248967e-05, + "loss": 0.4855, + "step": 1235 + }, + { + "epoch": 0.14, + "learning_rate": 1.935836638583759e-05, + "loss": 0.503, + "step": 1236 + }, + { + "epoch": 0.14, + "learning_rate": 1.935706115612108e-05, + "loss": 0.4941, + "step": 1237 + }, + { + "epoch": 0.14, + "learning_rate": 1.935575464427828e-05, + "loss": 0.4675, + "step": 1238 + }, + { + "epoch": 0.14, + "learning_rate": 1.9354446850488216e-05, + "loss": 0.468, + "step": 1239 + }, + { + "epoch": 0.14, + "learning_rate": 1.9353137774930085e-05, + "loss": 0.5015, + "step": 1240 + }, + { + "epoch": 0.14, + "learning_rate": 1.935182741778326e-05, + "loss": 0.4915, + "step": 1241 + }, + { + "epoch": 0.14, + "learning_rate": 1.9350515779227294e-05, + "loss": 0.4781, + "step": 1242 + }, + { + "epoch": 0.14, + "learning_rate": 1.93492028594419e-05, + "loss": 0.4798, + "step": 1243 + }, + { + "epoch": 0.14, + "learning_rate": 1.934788865860698e-05, + "loss": 0.4966, + "step": 1244 + }, + { + "epoch": 0.14, + "learning_rate": 1.9346573176902616e-05, + "loss": 0.4916, + "step": 1245 + }, + { + "epoch": 0.14, + "learning_rate": 1.934525641450905e-05, + "loss": 0.4928, + "step": 1246 + }, + { + "epoch": 0.14, + "learning_rate": 1.9343938371606714e-05, + "loss": 0.4975, + "step": 1247 + }, + { + "epoch": 0.14, + "learning_rate": 1.9342619048376202e-05, + "loss": 0.4795, + "step": 1248 + }, + { + "epoch": 0.14, + "learning_rate": 1.93412984449983e-05, + "loss": 0.4847, + "step": 1249 + }, + { + "epoch": 0.14, + "learning_rate": 1.9339976561653956e-05, + "loss": 0.4977, + "step": 1250 + }, + { + "epoch": 0.14, + "learning_rate": 1.9338653398524295e-05, + "loss": 0.4751, + "step": 1251 + }, + { + "epoch": 0.14, + "learning_rate": 1.933732895579062e-05, + "loss": 0.4929, + "step": 1252 + }, + { + "epoch": 0.14, + "learning_rate": 1.933600323363442e-05, + "loss": 0.4971, + "step": 1253 + }, + { + 
"epoch": 0.14, + "learning_rate": 1.933467623223733e-05, + "loss": 0.4927, + "step": 1254 + }, + { + "epoch": 0.14, + "learning_rate": 1.9333347951781194e-05, + "loss": 0.4969, + "step": 1255 + }, + { + "epoch": 0.14, + "learning_rate": 1.933201839244801e-05, + "loss": 0.4833, + "step": 1256 + }, + { + "epoch": 0.14, + "learning_rate": 1.9330687554419956e-05, + "loss": 0.4656, + "step": 1257 + }, + { + "epoch": 0.14, + "learning_rate": 1.932935543787939e-05, + "loss": 0.496, + "step": 1258 + }, + { + "epoch": 0.14, + "learning_rate": 1.9328022043008842e-05, + "loss": 0.4697, + "step": 1259 + }, + { + "epoch": 0.14, + "learning_rate": 1.9326687369991012e-05, + "loss": 0.4901, + "step": 1260 + }, + { + "epoch": 0.14, + "learning_rate": 1.9325351419008783e-05, + "loss": 0.4935, + "step": 1261 + }, + { + "epoch": 0.14, + "learning_rate": 1.932401419024521e-05, + "loss": 0.5023, + "step": 1262 + }, + { + "epoch": 0.14, + "learning_rate": 1.9322675683883528e-05, + "loss": 0.4864, + "step": 1263 + }, + { + "epoch": 0.14, + "learning_rate": 1.9321335900107134e-05, + "loss": 0.4949, + "step": 1264 + }, + { + "epoch": 0.14, + "learning_rate": 1.931999483909961e-05, + "loss": 0.4694, + "step": 1265 + }, + { + "epoch": 0.14, + "learning_rate": 1.9318652501044715e-05, + "loss": 0.4915, + "step": 1266 + }, + { + "epoch": 0.14, + "learning_rate": 1.931730888612638e-05, + "loss": 0.492, + "step": 1267 + }, + { + "epoch": 0.14, + "learning_rate": 1.9315963994528707e-05, + "loss": 0.4826, + "step": 1268 + }, + { + "epoch": 0.15, + "learning_rate": 1.931461782643598e-05, + "loss": 0.4878, + "step": 1269 + }, + { + "epoch": 0.15, + "learning_rate": 1.9313270382032644e-05, + "loss": 0.5038, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 1.9311921661503338e-05, + "loss": 0.4938, + "step": 1271 + }, + { + "epoch": 0.15, + "learning_rate": 1.9310571665032867e-05, + "loss": 0.487, + "step": 1272 + }, + { + "epoch": 0.15, + "learning_rate": 1.9309220392806206e-05, + "loss": 
0.4885, + "step": 1273 + }, + { + "epoch": 0.15, + "learning_rate": 1.9307867845008513e-05, + "loss": 0.4796, + "step": 1274 + }, + { + "epoch": 0.15, + "learning_rate": 1.930651402182512e-05, + "loss": 0.4803, + "step": 1275 + }, + { + "epoch": 0.15, + "learning_rate": 1.9305158923441524e-05, + "loss": 0.4871, + "step": 1276 + }, + { + "epoch": 0.15, + "learning_rate": 1.9303802550043404e-05, + "loss": 0.4888, + "step": 1277 + }, + { + "epoch": 0.15, + "learning_rate": 1.930244490181662e-05, + "loss": 0.4926, + "step": 1278 + }, + { + "epoch": 0.15, + "learning_rate": 1.9301085978947195e-05, + "loss": 0.4892, + "step": 1279 + }, + { + "epoch": 0.15, + "learning_rate": 1.9299725781621335e-05, + "loss": 0.4975, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 1.9298364310025412e-05, + "loss": 0.4898, + "step": 1281 + }, + { + "epoch": 0.15, + "learning_rate": 1.929700156434599e-05, + "loss": 0.472, + "step": 1282 + }, + { + "epoch": 0.15, + "learning_rate": 1.929563754476978e-05, + "loss": 0.4749, + "step": 1283 + }, + { + "epoch": 0.15, + "learning_rate": 1.929427225148369e-05, + "loss": 0.4967, + "step": 1284 + }, + { + "epoch": 0.15, + "learning_rate": 1.92929056846748e-05, + "loss": 0.4912, + "step": 1285 + }, + { + "epoch": 0.15, + "learning_rate": 1.9291537844530352e-05, + "loss": 0.4963, + "step": 1286 + }, + { + "epoch": 0.15, + "learning_rate": 1.9290168731237776e-05, + "loss": 0.5038, + "step": 1287 + }, + { + "epoch": 0.15, + "learning_rate": 1.9288798344984673e-05, + "loss": 0.4706, + "step": 1288 + }, + { + "epoch": 0.15, + "learning_rate": 1.928742668595881e-05, + "loss": 0.498, + "step": 1289 + }, + { + "epoch": 0.15, + "learning_rate": 1.9286053754348142e-05, + "loss": 0.488, + "step": 1290 + }, + { + "epoch": 0.15, + "learning_rate": 1.9284679550340783e-05, + "loss": 0.4734, + "step": 1291 + }, + { + "epoch": 0.15, + "learning_rate": 1.928330407412504e-05, + "loss": 0.4883, + "step": 1292 + }, + { + "epoch": 0.15, + "learning_rate": 
1.9281927325889373e-05, + "loss": 0.4742, + "step": 1293 + }, + { + "epoch": 0.15, + "learning_rate": 1.9280549305822435e-05, + "loss": 0.499, + "step": 1294 + }, + { + "epoch": 0.15, + "learning_rate": 1.927917001411304e-05, + "loss": 0.4847, + "step": 1295 + }, + { + "epoch": 0.15, + "learning_rate": 1.9277789450950187e-05, + "loss": 0.4661, + "step": 1296 + }, + { + "epoch": 0.15, + "learning_rate": 1.9276407616523044e-05, + "loss": 0.4792, + "step": 1297 + }, + { + "epoch": 0.15, + "learning_rate": 1.927502451102095e-05, + "loss": 0.4986, + "step": 1298 + }, + { + "epoch": 0.15, + "learning_rate": 1.927364013463342e-05, + "loss": 0.4912, + "step": 1299 + }, + { + "epoch": 0.15, + "learning_rate": 1.9272254487550144e-05, + "loss": 0.4729, + "step": 1300 + }, + { + "epoch": 0.15, + "learning_rate": 1.9270867569960994e-05, + "loss": 0.4785, + "step": 1301 + }, + { + "epoch": 0.15, + "learning_rate": 1.9269479382056008e-05, + "loss": 0.5064, + "step": 1302 + }, + { + "epoch": 0.15, + "learning_rate": 1.926808992402539e-05, + "loss": 0.4841, + "step": 1303 + }, + { + "epoch": 0.15, + "learning_rate": 1.926669919605953e-05, + "loss": 0.4717, + "step": 1304 + }, + { + "epoch": 0.15, + "learning_rate": 1.926530719834899e-05, + "loss": 0.4853, + "step": 1305 + }, + { + "epoch": 0.15, + "learning_rate": 1.9263913931084507e-05, + "loss": 0.4704, + "step": 1306 + }, + { + "epoch": 0.15, + "learning_rate": 1.9262519394456985e-05, + "loss": 0.5001, + "step": 1307 + }, + { + "epoch": 0.15, + "learning_rate": 1.9261123588657514e-05, + "loss": 0.4755, + "step": 1308 + }, + { + "epoch": 0.15, + "learning_rate": 1.925972651387734e-05, + "loss": 0.4841, + "step": 1309 + }, + { + "epoch": 0.15, + "learning_rate": 1.9258328170307905e-05, + "loss": 0.5065, + "step": 1310 + }, + { + "epoch": 0.15, + "learning_rate": 1.9256928558140806e-05, + "loss": 0.4867, + "step": 1311 + }, + { + "epoch": 0.15, + "learning_rate": 1.925552767756782e-05, + "loss": 0.4879, + "step": 1312 + }, + { + 
"epoch": 0.15, + "learning_rate": 1.9254125528780908e-05, + "loss": 0.4927, + "step": 1313 + }, + { + "epoch": 0.15, + "learning_rate": 1.9252722111972182e-05, + "loss": 0.5016, + "step": 1314 + }, + { + "epoch": 0.15, + "learning_rate": 1.9251317427333953e-05, + "loss": 0.5064, + "step": 1315 + }, + { + "epoch": 0.15, + "learning_rate": 1.924991147505869e-05, + "loss": 0.4779, + "step": 1316 + }, + { + "epoch": 0.15, + "learning_rate": 1.924850425533904e-05, + "loss": 0.4678, + "step": 1317 + }, + { + "epoch": 0.15, + "learning_rate": 1.9247095768367822e-05, + "loss": 0.4932, + "step": 1318 + }, + { + "epoch": 0.15, + "learning_rate": 1.924568601433803e-05, + "loss": 0.4863, + "step": 1319 + }, + { + "epoch": 0.15, + "learning_rate": 1.9244274993442836e-05, + "loss": 0.484, + "step": 1320 + }, + { + "epoch": 0.15, + "learning_rate": 1.924286270587558e-05, + "loss": 0.4773, + "step": 1321 + }, + { + "epoch": 0.15, + "learning_rate": 1.924144915182977e-05, + "loss": 0.4771, + "step": 1322 + }, + { + "epoch": 0.15, + "learning_rate": 1.9240034331499105e-05, + "loss": 0.4836, + "step": 1323 + }, + { + "epoch": 0.15, + "learning_rate": 1.923861824507744e-05, + "loss": 0.4933, + "step": 1324 + }, + { + "epoch": 0.15, + "learning_rate": 1.9237200892758814e-05, + "loss": 0.4814, + "step": 1325 + }, + { + "epoch": 0.15, + "learning_rate": 1.923578227473743e-05, + "loss": 0.477, + "step": 1326 + }, + { + "epoch": 0.15, + "learning_rate": 1.923436239120768e-05, + "loss": 0.4851, + "step": 1327 + }, + { + "epoch": 0.15, + "learning_rate": 1.9232941242364114e-05, + "loss": 0.4971, + "step": 1328 + }, + { + "epoch": 0.15, + "learning_rate": 1.9231518828401458e-05, + "loss": 0.488, + "step": 1329 + }, + { + "epoch": 0.15, + "learning_rate": 1.923009514951462e-05, + "loss": 0.4949, + "step": 1330 + }, + { + "epoch": 0.15, + "learning_rate": 1.9228670205898675e-05, + "loss": 0.4887, + "step": 1331 + }, + { + "epoch": 0.15, + "learning_rate": 1.922724399774887e-05, + "loss": 
0.4823, + "step": 1332 + }, + { + "epoch": 0.15, + "learning_rate": 1.9225816525260626e-05, + "loss": 0.4923, + "step": 1333 + }, + { + "epoch": 0.15, + "learning_rate": 1.9224387788629547e-05, + "loss": 0.4817, + "step": 1334 + }, + { + "epoch": 0.15, + "learning_rate": 1.922295778805139e-05, + "loss": 0.4834, + "step": 1335 + }, + { + "epoch": 0.15, + "learning_rate": 1.9221526523722104e-05, + "loss": 0.5067, + "step": 1336 + }, + { + "epoch": 0.15, + "learning_rate": 1.9220093995837805e-05, + "loss": 0.4721, + "step": 1337 + }, + { + "epoch": 0.15, + "learning_rate": 1.9218660204594778e-05, + "loss": 0.5119, + "step": 1338 + }, + { + "epoch": 0.15, + "learning_rate": 1.9217225150189483e-05, + "loss": 0.469, + "step": 1339 + }, + { + "epoch": 0.15, + "learning_rate": 1.921578883281856e-05, + "loss": 0.4769, + "step": 1340 + }, + { + "epoch": 0.15, + "learning_rate": 1.9214351252678815e-05, + "loss": 0.4706, + "step": 1341 + }, + { + "epoch": 0.15, + "learning_rate": 1.9212912409967223e-05, + "loss": 0.4812, + "step": 1342 + }, + { + "epoch": 0.15, + "learning_rate": 1.9211472304880945e-05, + "loss": 0.4874, + "step": 1343 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210030937617303e-05, + "loss": 0.5001, + "step": 1344 + }, + { + "epoch": 0.15, + "learning_rate": 1.9208588308373798e-05, + "loss": 0.477, + "step": 1345 + }, + { + "epoch": 0.15, + "learning_rate": 1.9207144417348103e-05, + "loss": 0.5098, + "step": 1346 + }, + { + "epoch": 0.15, + "learning_rate": 1.9205699264738063e-05, + "loss": 0.4901, + "step": 1347 + }, + { + "epoch": 0.15, + "learning_rate": 1.9204252850741695e-05, + "loss": 0.4818, + "step": 1348 + }, + { + "epoch": 0.15, + "learning_rate": 1.920280517555719e-05, + "loss": 0.4806, + "step": 1349 + }, + { + "epoch": 0.15, + "learning_rate": 1.9201356239382914e-05, + "loss": 0.4831, + "step": 1350 + }, + { + "epoch": 0.15, + "learning_rate": 1.9199906042417403e-05, + "loss": 0.4894, + "step": 1351 + }, + { + "epoch": 0.15, + 
"learning_rate": 1.919845458485936e-05, + "loss": 0.4902, + "step": 1352 + }, + { + "epoch": 0.15, + "learning_rate": 1.9197001866907676e-05, + "loss": 0.4933, + "step": 1353 + }, + { + "epoch": 0.15, + "learning_rate": 1.9195547888761403e-05, + "loss": 0.4747, + "step": 1354 + }, + { + "epoch": 0.15, + "learning_rate": 1.9194092650619767e-05, + "loss": 0.4754, + "step": 1355 + }, + { + "epoch": 0.15, + "learning_rate": 1.9192636152682173e-05, + "loss": 0.5062, + "step": 1356 + }, + { + "epoch": 0.16, + "learning_rate": 1.9191178395148188e-05, + "loss": 0.4714, + "step": 1357 + }, + { + "epoch": 0.16, + "learning_rate": 1.9189719378217554e-05, + "loss": 0.4884, + "step": 1358 + }, + { + "epoch": 0.16, + "learning_rate": 1.91882591020902e-05, + "loss": 0.4867, + "step": 1359 + }, + { + "epoch": 0.16, + "learning_rate": 1.9186797566966205e-05, + "loss": 0.5001, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 1.918533477304584e-05, + "loss": 0.4858, + "step": 1361 + }, + { + "epoch": 0.16, + "learning_rate": 1.918387072052954e-05, + "loss": 0.4954, + "step": 1362 + }, + { + "epoch": 0.16, + "learning_rate": 1.918240540961791e-05, + "loss": 0.4623, + "step": 1363 + }, + { + "epoch": 0.16, + "learning_rate": 1.9180938840511727e-05, + "loss": 0.5021, + "step": 1364 + }, + { + "epoch": 0.16, + "learning_rate": 1.917947101341195e-05, + "loss": 0.4912, + "step": 1365 + }, + { + "epoch": 0.16, + "learning_rate": 1.9178001928519703e-05, + "loss": 0.4741, + "step": 1366 + }, + { + "epoch": 0.16, + "learning_rate": 1.9176531586036282e-05, + "loss": 0.4754, + "step": 1367 + }, + { + "epoch": 0.16, + "learning_rate": 1.9175059986163157e-05, + "loss": 0.4776, + "step": 1368 + }, + { + "epoch": 0.16, + "learning_rate": 1.9173587129101967e-05, + "loss": 0.4847, + "step": 1369 + }, + { + "epoch": 0.16, + "learning_rate": 1.917211301505453e-05, + "loss": 0.4818, + "step": 1370 + }, + { + "epoch": 0.16, + "learning_rate": 1.9170637644222835e-05, + "loss": 0.465, + "step": 
1371 + }, + { + "epoch": 0.16, + "learning_rate": 1.9169161016809036e-05, + "loss": 0.516, + "step": 1372 + }, + { + "epoch": 0.16, + "learning_rate": 1.9167683133015465e-05, + "loss": 0.4855, + "step": 1373 + }, + { + "epoch": 0.16, + "learning_rate": 1.9166203993044627e-05, + "loss": 0.4876, + "step": 1374 + }, + { + "epoch": 0.16, + "learning_rate": 1.9164723597099192e-05, + "loss": 0.4911, + "step": 1375 + }, + { + "epoch": 0.16, + "learning_rate": 1.9163241945382012e-05, + "loss": 0.4916, + "step": 1376 + }, + { + "epoch": 0.16, + "learning_rate": 1.9161759038096108e-05, + "loss": 0.4721, + "step": 1377 + }, + { + "epoch": 0.16, + "learning_rate": 1.9160274875444668e-05, + "loss": 0.4814, + "step": 1378 + }, + { + "epoch": 0.16, + "learning_rate": 1.9158789457631054e-05, + "loss": 0.4883, + "step": 1379 + }, + { + "epoch": 0.16, + "learning_rate": 1.9157302784858807e-05, + "loss": 0.4865, + "step": 1380 + }, + { + "epoch": 0.16, + "learning_rate": 1.915581485733163e-05, + "loss": 0.4839, + "step": 1381 + }, + { + "epoch": 0.16, + "learning_rate": 1.91543256752534e-05, + "loss": 0.4959, + "step": 1382 + }, + { + "epoch": 0.16, + "learning_rate": 1.915283523882818e-05, + "loss": 0.4822, + "step": 1383 + }, + { + "epoch": 0.16, + "learning_rate": 1.9151343548260176e-05, + "loss": 0.4907, + "step": 1384 + }, + { + "epoch": 0.16, + "learning_rate": 1.9149850603753793e-05, + "loss": 0.4936, + "step": 1385 + }, + { + "epoch": 0.16, + "learning_rate": 1.91483564055136e-05, + "loss": 0.4744, + "step": 1386 + }, + { + "epoch": 0.16, + "learning_rate": 1.9146860953744325e-05, + "loss": 0.4955, + "step": 1387 + }, + { + "epoch": 0.16, + "learning_rate": 1.9145364248650892e-05, + "loss": 0.4918, + "step": 1388 + }, + { + "epoch": 0.16, + "learning_rate": 1.914386629043837e-05, + "loss": 0.4582, + "step": 1389 + }, + { + "epoch": 0.16, + "learning_rate": 1.9142367079312023e-05, + "loss": 0.4987, + "step": 1390 + }, + { + "epoch": 0.16, + "learning_rate": 
1.9140866615477272e-05, + "loss": 0.4701, + "step": 1391 + }, + { + "epoch": 0.16, + "learning_rate": 1.913936489913971e-05, + "loss": 0.4822, + "step": 1392 + }, + { + "epoch": 0.16, + "learning_rate": 1.9137861930505112e-05, + "loss": 0.4738, + "step": 1393 + }, + { + "epoch": 0.16, + "learning_rate": 1.9136357709779418e-05, + "loss": 0.4974, + "step": 1394 + }, + { + "epoch": 0.16, + "learning_rate": 1.9134852237168738e-05, + "loss": 0.4939, + "step": 1395 + }, + { + "epoch": 0.16, + "learning_rate": 1.9133345512879353e-05, + "loss": 0.4838, + "step": 1396 + }, + { + "epoch": 0.16, + "learning_rate": 1.9131837537117724e-05, + "loss": 0.4822, + "step": 1397 + }, + { + "epoch": 0.16, + "learning_rate": 1.913032831009047e-05, + "loss": 0.5028, + "step": 1398 + }, + { + "epoch": 0.16, + "learning_rate": 1.9128817832004393e-05, + "loss": 0.4745, + "step": 1399 + }, + { + "epoch": 0.16, + "learning_rate": 1.912730610306646e-05, + "loss": 0.4764, + "step": 1400 + }, + { + "epoch": 0.16, + "learning_rate": 1.9125793123483814e-05, + "loss": 0.4963, + "step": 1401 + }, + { + "epoch": 0.16, + "learning_rate": 1.912427889346377e-05, + "loss": 0.4879, + "step": 1402 + }, + { + "epoch": 0.16, + "learning_rate": 1.91227634132138e-05, + "loss": 0.4866, + "step": 1403 + }, + { + "epoch": 0.16, + "learning_rate": 1.912124668294157e-05, + "loss": 0.4828, + "step": 1404 + }, + { + "epoch": 0.16, + "learning_rate": 1.91197287028549e-05, + "loss": 0.5033, + "step": 1405 + }, + { + "epoch": 0.16, + "learning_rate": 1.9118209473161794e-05, + "loss": 0.4922, + "step": 1406 + }, + { + "epoch": 0.16, + "learning_rate": 1.9116688994070413e-05, + "loss": 0.4738, + "step": 1407 + }, + { + "epoch": 0.16, + "learning_rate": 1.9115167265789096e-05, + "loss": 0.4901, + "step": 1408 + }, + { + "epoch": 0.16, + "learning_rate": 1.911364428852636e-05, + "loss": 0.4672, + "step": 1409 + }, + { + "epoch": 0.16, + "learning_rate": 1.9112120062490883e-05, + "loss": 0.4893, + "step": 1410 + }, + { + 
"epoch": 0.16, + "learning_rate": 1.911059458789152e-05, + "loss": 0.4717, + "step": 1411 + }, + { + "epoch": 0.16, + "learning_rate": 1.9109067864937292e-05, + "loss": 0.4992, + "step": 1412 + }, + { + "epoch": 0.16, + "learning_rate": 1.9107539893837396e-05, + "loss": 0.4814, + "step": 1413 + }, + { + "epoch": 0.16, + "learning_rate": 1.91060106748012e-05, + "loss": 0.4832, + "step": 1414 + }, + { + "epoch": 0.16, + "learning_rate": 1.9104480208038236e-05, + "loss": 0.4766, + "step": 1415 + }, + { + "epoch": 0.16, + "learning_rate": 1.9102948493758217e-05, + "loss": 0.4931, + "step": 1416 + }, + { + "epoch": 0.16, + "learning_rate": 1.9101415532171018e-05, + "loss": 0.4702, + "step": 1417 + }, + { + "epoch": 0.16, + "learning_rate": 1.90998813234867e-05, + "loss": 0.4695, + "step": 1418 + }, + { + "epoch": 0.16, + "learning_rate": 1.9098345867915467e-05, + "loss": 0.5036, + "step": 1419 + }, + { + "epoch": 0.16, + "learning_rate": 1.909680916566772e-05, + "loss": 0.5027, + "step": 1420 + }, + { + "epoch": 0.16, + "learning_rate": 1.9095271216954022e-05, + "loss": 0.4953, + "step": 1421 + }, + { + "epoch": 0.16, + "learning_rate": 1.9093732021985103e-05, + "loss": 0.4851, + "step": 1422 + }, + { + "epoch": 0.16, + "learning_rate": 1.909219158097187e-05, + "loss": 0.4972, + "step": 1423 + }, + { + "epoch": 0.16, + "learning_rate": 1.9090649894125395e-05, + "loss": 0.4746, + "step": 1424 + }, + { + "epoch": 0.16, + "learning_rate": 1.908910696165693e-05, + "loss": 0.4911, + "step": 1425 + }, + { + "epoch": 0.16, + "learning_rate": 1.908756278377788e-05, + "loss": 0.4701, + "step": 1426 + }, + { + "epoch": 0.16, + "learning_rate": 1.9086017360699843e-05, + "loss": 0.4793, + "step": 1427 + }, + { + "epoch": 0.16, + "learning_rate": 1.9084470692634567e-05, + "loss": 0.4962, + "step": 1428 + }, + { + "epoch": 0.16, + "learning_rate": 1.9082922779793988e-05, + "loss": 0.4917, + "step": 1429 + }, + { + "epoch": 0.16, + "learning_rate": 1.9081373622390204e-05, + "loss": 
0.5028, + "step": 1430 + }, + { + "epoch": 0.16, + "learning_rate": 1.9079823220635477e-05, + "loss": 0.471, + "step": 1431 + }, + { + "epoch": 0.16, + "learning_rate": 1.907827157474225e-05, + "loss": 0.4818, + "step": 1432 + }, + { + "epoch": 0.16, + "learning_rate": 1.9076718684923136e-05, + "loss": 0.4884, + "step": 1433 + }, + { + "epoch": 0.16, + "learning_rate": 1.9075164551390918e-05, + "loss": 0.4989, + "step": 1434 + }, + { + "epoch": 0.16, + "learning_rate": 1.9073609174358535e-05, + "loss": 0.4663, + "step": 1435 + }, + { + "epoch": 0.16, + "learning_rate": 1.9072052554039123e-05, + "loss": 0.5005, + "step": 1436 + }, + { + "epoch": 0.16, + "learning_rate": 1.9070494690645966e-05, + "loss": 0.4952, + "step": 1437 + }, + { + "epoch": 0.16, + "learning_rate": 1.9068935584392522e-05, + "loss": 0.4959, + "step": 1438 + }, + { + "epoch": 0.16, + "learning_rate": 1.906737523549243e-05, + "loss": 0.4891, + "step": 1439 + }, + { + "epoch": 0.16, + "learning_rate": 1.9065813644159495e-05, + "loss": 0.4844, + "step": 1440 + }, + { + "epoch": 0.16, + "learning_rate": 1.906425081060768e-05, + "loss": 0.4942, + "step": 1441 + }, + { + "epoch": 0.16, + "learning_rate": 1.906268673505114e-05, + "loss": 0.4822, + "step": 1442 + }, + { + "epoch": 0.16, + "learning_rate": 1.906112141770418e-05, + "loss": 0.4636, + "step": 1443 + }, + { + "epoch": 0.17, + "learning_rate": 1.9059554858781285e-05, + "loss": 0.488, + "step": 1444 + }, + { + "epoch": 0.17, + "learning_rate": 1.9057987058497106e-05, + "loss": 0.5003, + "step": 1445 + }, + { + "epoch": 0.17, + "learning_rate": 1.9056418017066476e-05, + "loss": 0.503, + "step": 1446 + }, + { + "epoch": 0.17, + "learning_rate": 1.905484773470438e-05, + "loss": 0.4837, + "step": 1447 + }, + { + "epoch": 0.17, + "learning_rate": 1.905327621162598e-05, + "loss": 0.4655, + "step": 1448 + }, + { + "epoch": 0.17, + "learning_rate": 1.905170344804662e-05, + "loss": 0.4768, + "step": 1449 + }, + { + "epoch": 0.17, + "learning_rate": 
1.90501294441818e-05, + "loss": 0.4872, + "step": 1450 + }, + { + "epoch": 0.17, + "learning_rate": 1.9048554200247184e-05, + "loss": 0.4736, + "step": 1451 + }, + { + "epoch": 0.17, + "learning_rate": 1.9046977716458627e-05, + "loss": 0.4808, + "step": 1452 + }, + { + "epoch": 0.17, + "learning_rate": 1.904539999303214e-05, + "loss": 0.4925, + "step": 1453 + }, + { + "epoch": 0.17, + "learning_rate": 1.90438210301839e-05, + "loss": 0.4836, + "step": 1454 + }, + { + "epoch": 0.17, + "learning_rate": 1.9042240828130267e-05, + "loss": 0.4678, + "step": 1455 + }, + { + "epoch": 0.17, + "learning_rate": 1.9040659387087762e-05, + "loss": 0.4835, + "step": 1456 + }, + { + "epoch": 0.17, + "learning_rate": 1.903907670727308e-05, + "loss": 0.4949, + "step": 1457 + }, + { + "epoch": 0.17, + "learning_rate": 1.903749278890308e-05, + "loss": 0.4879, + "step": 1458 + }, + { + "epoch": 0.17, + "learning_rate": 1.903590763219479e-05, + "loss": 0.5004, + "step": 1459 + }, + { + "epoch": 0.17, + "learning_rate": 1.9034321237365424e-05, + "loss": 0.4882, + "step": 1460 + }, + { + "epoch": 0.17, + "learning_rate": 1.9032733604632347e-05, + "loss": 0.4818, + "step": 1461 + }, + { + "epoch": 0.17, + "learning_rate": 1.9031144734213097e-05, + "loss": 0.5016, + "step": 1462 + }, + { + "epoch": 0.17, + "learning_rate": 1.9029554626325386e-05, + "loss": 0.5071, + "step": 1463 + }, + { + "epoch": 0.17, + "learning_rate": 1.90279632811871e-05, + "loss": 0.4721, + "step": 1464 + }, + { + "epoch": 0.17, + "learning_rate": 1.902637069901628e-05, + "loss": 0.4908, + "step": 1465 + }, + { + "epoch": 0.17, + "learning_rate": 1.9024776880031154e-05, + "loss": 0.4762, + "step": 1466 + }, + { + "epoch": 0.17, + "learning_rate": 1.9023181824450106e-05, + "loss": 0.4817, + "step": 1467 + }, + { + "epoch": 0.17, + "learning_rate": 1.9021585532491694e-05, + "loss": 0.4806, + "step": 1468 + }, + { + "epoch": 0.17, + "learning_rate": 1.9019988004374645e-05, + "loss": 0.4877, + "step": 1469 + }, + { + 
"epoch": 0.17, + "learning_rate": 1.901838924031786e-05, + "loss": 0.4813, + "step": 1470 + }, + { + "epoch": 0.17, + "learning_rate": 1.90167892405404e-05, + "loss": 0.4945, + "step": 1471 + }, + { + "epoch": 0.17, + "learning_rate": 1.9015188005261505e-05, + "loss": 0.4863, + "step": 1472 + }, + { + "epoch": 0.17, + "learning_rate": 1.9013585534700582e-05, + "loss": 0.4841, + "step": 1473 + }, + { + "epoch": 0.17, + "learning_rate": 1.90119818290772e-05, + "loss": 0.4733, + "step": 1474 + }, + { + "epoch": 0.17, + "learning_rate": 1.9010376888611106e-05, + "loss": 0.4872, + "step": 1475 + }, + { + "epoch": 0.17, + "learning_rate": 1.9008770713522206e-05, + "loss": 0.4819, + "step": 1476 + }, + { + "epoch": 0.17, + "learning_rate": 1.9007163304030593e-05, + "loss": 0.4832, + "step": 1477 + }, + { + "epoch": 0.17, + "learning_rate": 1.9005554660356505e-05, + "loss": 0.478, + "step": 1478 + }, + { + "epoch": 0.17, + "learning_rate": 1.9003944782720375e-05, + "loss": 0.4873, + "step": 1479 + }, + { + "epoch": 0.17, + "learning_rate": 1.9002333671342782e-05, + "loss": 0.4852, + "step": 1480 + }, + { + "epoch": 0.17, + "learning_rate": 1.9000721326444492e-05, + "loss": 0.4965, + "step": 1481 + }, + { + "epoch": 0.17, + "learning_rate": 1.8999107748246427e-05, + "loss": 0.4757, + "step": 1482 + }, + { + "epoch": 0.17, + "learning_rate": 1.8997492936969686e-05, + "loss": 0.4775, + "step": 1483 + }, + { + "epoch": 0.17, + "learning_rate": 1.899587689283553e-05, + "loss": 0.4777, + "step": 1484 + }, + { + "epoch": 0.17, + "learning_rate": 1.89942596160654e-05, + "loss": 0.4887, + "step": 1485 + }, + { + "epoch": 0.17, + "learning_rate": 1.899264110688089e-05, + "loss": 0.4752, + "step": 1486 + }, + { + "epoch": 0.17, + "learning_rate": 1.8991021365503782e-05, + "loss": 0.4975, + "step": 1487 + }, + { + "epoch": 0.17, + "learning_rate": 1.8989400392156012e-05, + "loss": 0.4667, + "step": 1488 + }, + { + "epoch": 0.17, + "learning_rate": 1.898777818705969e-05, + "loss": 
0.5035, + "step": 1489 + }, + { + "epoch": 0.17, + "learning_rate": 1.898615475043709e-05, + "loss": 0.4994, + "step": 1490 + }, + { + "epoch": 0.17, + "learning_rate": 1.8984530082510665e-05, + "loss": 0.4835, + "step": 1491 + }, + { + "epoch": 0.17, + "learning_rate": 1.898290418350303e-05, + "loss": 0.4679, + "step": 1492 + }, + { + "epoch": 0.17, + "learning_rate": 1.8981277053636963e-05, + "loss": 0.4847, + "step": 1493 + }, + { + "epoch": 0.17, + "learning_rate": 1.8979648693135428e-05, + "loss": 0.4753, + "step": 1494 + }, + { + "epoch": 0.17, + "learning_rate": 1.8978019102221538e-05, + "loss": 0.5043, + "step": 1495 + }, + { + "epoch": 0.17, + "learning_rate": 1.8976388281118584e-05, + "loss": 0.4708, + "step": 1496 + }, + { + "epoch": 0.17, + "learning_rate": 1.8974756230050028e-05, + "loss": 0.4852, + "step": 1497 + }, + { + "epoch": 0.17, + "learning_rate": 1.8973122949239497e-05, + "loss": 0.4888, + "step": 1498 + }, + { + "epoch": 0.17, + "learning_rate": 1.897148843891079e-05, + "loss": 0.4918, + "step": 1499 + }, + { + "epoch": 0.17, + "learning_rate": 1.8969852699287865e-05, + "loss": 0.4865, + "step": 1500 + }, + { + "epoch": 0.17, + "learning_rate": 1.896821573059486e-05, + "loss": 0.4742, + "step": 1501 + }, + { + "epoch": 0.17, + "learning_rate": 1.896657753305607e-05, + "loss": 0.4891, + "step": 1502 + }, + { + "epoch": 0.17, + "learning_rate": 1.896493810689597e-05, + "loss": 0.5085, + "step": 1503 + }, + { + "epoch": 0.17, + "learning_rate": 1.89632974523392e-05, + "loss": 0.5098, + "step": 1504 + }, + { + "epoch": 0.17, + "learning_rate": 1.8961655569610557e-05, + "loss": 0.4616, + "step": 1505 + }, + { + "epoch": 0.17, + "learning_rate": 1.8960012458935025e-05, + "loss": 0.4735, + "step": 1506 + }, + { + "epoch": 0.17, + "learning_rate": 1.8958368120537746e-05, + "loss": 0.5024, + "step": 1507 + }, + { + "epoch": 0.17, + "learning_rate": 1.8956722554644026e-05, + "loss": 0.4795, + "step": 1508 + }, + { + "epoch": 0.17, + "learning_rate": 
1.8955075761479342e-05, + "loss": 0.4802, + "step": 1509 + }, + { + "epoch": 0.17, + "learning_rate": 1.895342774126935e-05, + "loss": 0.4744, + "step": 1510 + }, + { + "epoch": 0.17, + "learning_rate": 1.8951778494239862e-05, + "loss": 0.4734, + "step": 1511 + }, + { + "epoch": 0.17, + "learning_rate": 1.8950128020616863e-05, + "loss": 0.4954, + "step": 1512 + }, + { + "epoch": 0.17, + "learning_rate": 1.89484763206265e-05, + "loss": 0.4863, + "step": 1513 + }, + { + "epoch": 0.17, + "learning_rate": 1.89468233944951e-05, + "loss": 0.4804, + "step": 1514 + }, + { + "epoch": 0.17, + "learning_rate": 1.8945169242449145e-05, + "loss": 0.5027, + "step": 1515 + }, + { + "epoch": 0.17, + "learning_rate": 1.894351386471529e-05, + "loss": 0.4762, + "step": 1516 + }, + { + "epoch": 0.17, + "learning_rate": 1.8941857261520363e-05, + "loss": 0.4789, + "step": 1517 + }, + { + "epoch": 0.17, + "learning_rate": 1.8940199433091354e-05, + "loss": 0.4867, + "step": 1518 + }, + { + "epoch": 0.17, + "learning_rate": 1.893854037965542e-05, + "loss": 0.4938, + "step": 1519 + }, + { + "epoch": 0.17, + "learning_rate": 1.8936880101439893e-05, + "loss": 0.4817, + "step": 1520 + }, + { + "epoch": 0.17, + "learning_rate": 1.8935218598672266e-05, + "loss": 0.5014, + "step": 1521 + }, + { + "epoch": 0.17, + "learning_rate": 1.8933555871580204e-05, + "loss": 0.4909, + "step": 1522 + }, + { + "epoch": 0.17, + "learning_rate": 1.8931891920391533e-05, + "loss": 0.5019, + "step": 1523 + }, + { + "epoch": 0.17, + "learning_rate": 1.893022674533425e-05, + "loss": 0.482, + "step": 1524 + }, + { + "epoch": 0.17, + "learning_rate": 1.8928560346636532e-05, + "loss": 0.507, + "step": 1525 + }, + { + "epoch": 0.17, + "learning_rate": 1.89268927245267e-05, + "loss": 0.4968, + "step": 1526 + }, + { + "epoch": 0.17, + "learning_rate": 1.8925223879233267e-05, + "loss": 0.4785, + "step": 1527 + }, + { + "epoch": 0.17, + "learning_rate": 1.8923553810984893e-05, + "loss": 0.4749, + "step": 1528 + }, + { + 
"epoch": 0.17, + "learning_rate": 1.8921882520010416e-05, + "loss": 0.4744, + "step": 1529 + }, + { + "epoch": 0.17, + "learning_rate": 1.8920210006538843e-05, + "loss": 0.5001, + "step": 1530 + }, + { + "epoch": 0.17, + "learning_rate": 1.891853627079935e-05, + "loss": 0.4816, + "step": 1531 + }, + { + "epoch": 0.18, + "learning_rate": 1.8916861313021268e-05, + "loss": 0.4662, + "step": 1532 + }, + { + "epoch": 0.18, + "learning_rate": 1.8915185133434107e-05, + "loss": 0.4976, + "step": 1533 + }, + { + "epoch": 0.18, + "learning_rate": 1.891350773226754e-05, + "loss": 0.48, + "step": 1534 + }, + { + "epoch": 0.18, + "learning_rate": 1.891182910975141e-05, + "loss": 0.4784, + "step": 1535 + }, + { + "epoch": 0.18, + "learning_rate": 1.8910149266115724e-05, + "loss": 0.4776, + "step": 1536 + }, + { + "epoch": 0.18, + "learning_rate": 1.890846820159066e-05, + "loss": 0.4834, + "step": 1537 + }, + { + "epoch": 0.18, + "learning_rate": 1.890678591640656e-05, + "loss": 0.4873, + "step": 1538 + }, + { + "epoch": 0.18, + "learning_rate": 1.8905102410793936e-05, + "loss": 0.5074, + "step": 1539 + }, + { + "epoch": 0.18, + "learning_rate": 1.8903417684983465e-05, + "loss": 0.478, + "step": 1540 + }, + { + "epoch": 0.18, + "learning_rate": 1.8901731739205992e-05, + "loss": 0.483, + "step": 1541 + }, + { + "epoch": 0.18, + "learning_rate": 1.8900044573692527e-05, + "loss": 0.4688, + "step": 1542 + }, + { + "epoch": 0.18, + "learning_rate": 1.8898356188674253e-05, + "loss": 0.487, + "step": 1543 + }, + { + "epoch": 0.18, + "learning_rate": 1.8896666584382516e-05, + "loss": 0.489, + "step": 1544 + }, + { + "epoch": 0.18, + "learning_rate": 1.8894975761048826e-05, + "loss": 0.4841, + "step": 1545 + }, + { + "epoch": 0.18, + "learning_rate": 1.8893283718904866e-05, + "loss": 0.4876, + "step": 1546 + }, + { + "epoch": 0.18, + "learning_rate": 1.8891590458182486e-05, + "loss": 0.4965, + "step": 1547 + }, + { + "epoch": 0.18, + "learning_rate": 1.8889895979113698e-05, + "loss": 
0.4915, + "step": 1548 + }, + { + "epoch": 0.18, + "learning_rate": 1.888820028193068e-05, + "loss": 0.4695, + "step": 1549 + }, + { + "epoch": 0.18, + "learning_rate": 1.8886503366865786e-05, + "loss": 0.4902, + "step": 1550 + }, + { + "epoch": 0.18, + "learning_rate": 1.888480523415153e-05, + "loss": 0.4967, + "step": 1551 + }, + { + "epoch": 0.18, + "learning_rate": 1.8883105884020595e-05, + "loss": 0.4639, + "step": 1552 + }, + { + "epoch": 0.18, + "learning_rate": 1.8881405316705824e-05, + "loss": 0.4954, + "step": 1553 + }, + { + "epoch": 0.18, + "learning_rate": 1.887970353244024e-05, + "loss": 0.4777, + "step": 1554 + }, + { + "epoch": 0.18, + "learning_rate": 1.887800053145702e-05, + "loss": 0.4929, + "step": 1555 + }, + { + "epoch": 0.18, + "learning_rate": 1.8876296313989516e-05, + "loss": 0.5071, + "step": 1556 + }, + { + "epoch": 0.18, + "learning_rate": 1.8874590880271245e-05, + "loss": 0.4704, + "step": 1557 + }, + { + "epoch": 0.18, + "learning_rate": 1.8872884230535886e-05, + "loss": 0.468, + "step": 1558 + }, + { + "epoch": 0.18, + "learning_rate": 1.8871176365017293e-05, + "loss": 0.4986, + "step": 1559 + }, + { + "epoch": 0.18, + "learning_rate": 1.8869467283949475e-05, + "loss": 0.4718, + "step": 1560 + }, + { + "epoch": 0.18, + "learning_rate": 1.8867756987566615e-05, + "loss": 0.4777, + "step": 1561 + }, + { + "epoch": 0.18, + "learning_rate": 1.8866045476103073e-05, + "loss": 0.492, + "step": 1562 + }, + { + "epoch": 0.18, + "learning_rate": 1.886433274979335e-05, + "loss": 0.5308, + "step": 1563 + }, + { + "epoch": 0.18, + "learning_rate": 1.8862618808872138e-05, + "loss": 0.4872, + "step": 1564 + }, + { + "epoch": 0.18, + "learning_rate": 1.8860903653574277e-05, + "loss": 0.4714, + "step": 1565 + }, + { + "epoch": 0.18, + "learning_rate": 1.8859187284134785e-05, + "loss": 0.4829, + "step": 1566 + }, + { + "epoch": 0.18, + "learning_rate": 1.8857469700788845e-05, + "loss": 0.4899, + "step": 1567 + }, + { + "epoch": 0.18, + "learning_rate": 
1.8855750903771805e-05, + "loss": 0.4854, + "step": 1568 + }, + { + "epoch": 0.18, + "learning_rate": 1.8854030893319173e-05, + "loss": 0.4738, + "step": 1569 + }, + { + "epoch": 0.18, + "learning_rate": 1.8852309669666634e-05, + "loss": 0.4797, + "step": 1570 + }, + { + "epoch": 0.18, + "learning_rate": 1.885058723305003e-05, + "loss": 0.4985, + "step": 1571 + }, + { + "epoch": 0.18, + "learning_rate": 1.8848863583705373e-05, + "loss": 0.4893, + "step": 1572 + }, + { + "epoch": 0.18, + "learning_rate": 1.884713872186885e-05, + "loss": 0.5035, + "step": 1573 + }, + { + "epoch": 0.18, + "learning_rate": 1.8845412647776795e-05, + "loss": 0.4932, + "step": 1574 + }, + { + "epoch": 0.18, + "learning_rate": 1.8843685361665724e-05, + "loss": 0.4753, + "step": 1575 + }, + { + "epoch": 0.18, + "learning_rate": 1.8841956863772314e-05, + "loss": 0.4796, + "step": 1576 + }, + { + "epoch": 0.18, + "learning_rate": 1.8840227154333405e-05, + "loss": 0.4888, + "step": 1577 + }, + { + "epoch": 0.18, + "learning_rate": 1.883849623358601e-05, + "loss": 0.4732, + "step": 1578 + }, + { + "epoch": 0.18, + "learning_rate": 1.88367641017673e-05, + "loss": 0.4919, + "step": 1579 + }, + { + "epoch": 0.18, + "learning_rate": 1.8835030759114617e-05, + "loss": 0.4883, + "step": 1580 + }, + { + "epoch": 0.18, + "learning_rate": 1.8833296205865466e-05, + "loss": 0.4923, + "step": 1581 + }, + { + "epoch": 0.18, + "learning_rate": 1.8831560442257523e-05, + "loss": 0.477, + "step": 1582 + }, + { + "epoch": 0.18, + "learning_rate": 1.8829823468528624e-05, + "loss": 0.4749, + "step": 1583 + }, + { + "epoch": 0.18, + "learning_rate": 1.8828085284916777e-05, + "loss": 0.4813, + "step": 1584 + }, + { + "epoch": 0.18, + "learning_rate": 1.882634589166014e-05, + "loss": 0.5162, + "step": 1585 + }, + { + "epoch": 0.18, + "learning_rate": 1.8824605288997064e-05, + "loss": 0.4578, + "step": 1586 + }, + { + "epoch": 0.18, + "learning_rate": 1.882286347716604e-05, + "loss": 0.5004, + "step": 1587 + }, + { + 
"epoch": 0.18, + "learning_rate": 1.8821120456405743e-05, + "loss": 0.4933, + "step": 1588 + }, + { + "epoch": 0.18, + "learning_rate": 1.8819376226955e-05, + "loss": 0.5036, + "step": 1589 + }, + { + "epoch": 0.18, + "learning_rate": 1.8817630789052813e-05, + "loss": 0.4774, + "step": 1590 + }, + { + "epoch": 0.18, + "learning_rate": 1.881588414293834e-05, + "loss": 0.4817, + "step": 1591 + }, + { + "epoch": 0.18, + "learning_rate": 1.881413628885092e-05, + "loss": 0.4606, + "step": 1592 + }, + { + "epoch": 0.18, + "learning_rate": 1.8812387227030035e-05, + "loss": 0.4866, + "step": 1593 + }, + { + "epoch": 0.18, + "learning_rate": 1.8810636957715357e-05, + "loss": 0.4622, + "step": 1594 + }, + { + "epoch": 0.18, + "learning_rate": 1.880888548114671e-05, + "loss": 0.4884, + "step": 1595 + }, + { + "epoch": 0.18, + "learning_rate": 1.880713279756408e-05, + "loss": 0.479, + "step": 1596 + }, + { + "epoch": 0.18, + "learning_rate": 1.880537890720763e-05, + "loss": 0.4996, + "step": 1597 + }, + { + "epoch": 0.18, + "learning_rate": 1.8803623810317678e-05, + "loss": 0.4693, + "step": 1598 + }, + { + "epoch": 0.18, + "learning_rate": 1.8801867507134712e-05, + "loss": 0.4771, + "step": 1599 + }, + { + "epoch": 0.18, + "learning_rate": 1.8800109997899386e-05, + "loss": 0.4914, + "step": 1600 + }, + { + "epoch": 0.18, + "learning_rate": 1.879835128285252e-05, + "loss": 0.4782, + "step": 1601 + }, + { + "epoch": 0.18, + "learning_rate": 1.879659136223509e-05, + "loss": 0.4953, + "step": 1602 + }, + { + "epoch": 0.18, + "learning_rate": 1.8794830236288254e-05, + "loss": 0.4904, + "step": 1603 + }, + { + "epoch": 0.18, + "learning_rate": 1.8793067905253318e-05, + "loss": 0.4721, + "step": 1604 + }, + { + "epoch": 0.18, + "learning_rate": 1.8791304369371765e-05, + "loss": 0.5035, + "step": 1605 + }, + { + "epoch": 0.18, + "learning_rate": 1.8789539628885233e-05, + "loss": 0.4752, + "step": 1606 + }, + { + "epoch": 0.18, + "learning_rate": 1.878777368403554e-05, + "loss": 
0.4852, + "step": 1607 + }, + { + "epoch": 0.18, + "learning_rate": 1.8786006535064654e-05, + "loss": 0.496, + "step": 1608 + }, + { + "epoch": 0.18, + "learning_rate": 1.8784238182214713e-05, + "loss": 0.4785, + "step": 1609 + }, + { + "epoch": 0.18, + "learning_rate": 1.8782468625728027e-05, + "loss": 0.4813, + "step": 1610 + }, + { + "epoch": 0.18, + "learning_rate": 1.8780697865847056e-05, + "loss": 0.5088, + "step": 1611 + }, + { + "epoch": 0.18, + "learning_rate": 1.877892590281444e-05, + "loss": 0.4805, + "step": 1612 + }, + { + "epoch": 0.18, + "learning_rate": 1.877715273687297e-05, + "loss": 0.5001, + "step": 1613 + }, + { + "epoch": 0.18, + "learning_rate": 1.8775378368265622e-05, + "loss": 0.4999, + "step": 1614 + }, + { + "epoch": 0.18, + "learning_rate": 1.8773602797235516e-05, + "loss": 0.4834, + "step": 1615 + }, + { + "epoch": 0.18, + "learning_rate": 1.8771826024025944e-05, + "loss": 0.4797, + "step": 1616 + }, + { + "epoch": 0.18, + "learning_rate": 1.8770048048880367e-05, + "loss": 0.488, + "step": 1617 + }, + { + "epoch": 0.18, + "learning_rate": 1.8768268872042402e-05, + "loss": 0.4786, + "step": 1618 + }, + { + "epoch": 0.19, + "learning_rate": 1.8766488493755845e-05, + "loss": 0.4776, + "step": 1619 + }, + { + "epoch": 0.19, + "learning_rate": 1.8764706914264636e-05, + "loss": 0.4899, + "step": 1620 + }, + { + "epoch": 0.19, + "learning_rate": 1.8762924133812905e-05, + "loss": 0.4933, + "step": 1621 + }, + { + "epoch": 0.19, + "learning_rate": 1.876114015264492e-05, + "loss": 0.4851, + "step": 1622 + }, + { + "epoch": 0.19, + "learning_rate": 1.8759354971005133e-05, + "loss": 0.4766, + "step": 1623 + }, + { + "epoch": 0.19, + "learning_rate": 1.875756858913815e-05, + "loss": 0.5006, + "step": 1624 + }, + { + "epoch": 0.19, + "learning_rate": 1.875578100728875e-05, + "loss": 0.4958, + "step": 1625 + }, + { + "epoch": 0.19, + "learning_rate": 1.8753992225701868e-05, + "loss": 0.482, + "step": 1626 + }, + { + "epoch": 0.19, + "learning_rate": 
1.875220224462261e-05, + "loss": 0.4894, + "step": 1627 + }, + { + "epoch": 0.19, + "learning_rate": 1.8750411064296237e-05, + "loss": 0.474, + "step": 1628 + }, + { + "epoch": 0.19, + "learning_rate": 1.8748618684968187e-05, + "loss": 0.507, + "step": 1629 + }, + { + "epoch": 0.19, + "learning_rate": 1.8746825106884055e-05, + "loss": 0.4657, + "step": 1630 + }, + { + "epoch": 0.19, + "learning_rate": 1.87450303302896e-05, + "loss": 0.484, + "step": 1631 + }, + { + "epoch": 0.19, + "learning_rate": 1.8743234355430746e-05, + "loss": 0.479, + "step": 1632 + }, + { + "epoch": 0.19, + "learning_rate": 1.8741437182553582e-05, + "loss": 0.4876, + "step": 1633 + }, + { + "epoch": 0.19, + "learning_rate": 1.8739638811904363e-05, + "loss": 0.5082, + "step": 1634 + }, + { + "epoch": 0.19, + "learning_rate": 1.8737839243729504e-05, + "loss": 0.4617, + "step": 1635 + }, + { + "epoch": 0.19, + "learning_rate": 1.8736038478275584e-05, + "loss": 0.4841, + "step": 1636 + }, + { + "epoch": 0.19, + "learning_rate": 1.873423651578935e-05, + "loss": 0.4825, + "step": 1637 + }, + { + "epoch": 0.19, + "learning_rate": 1.8732433356517713e-05, + "loss": 0.4856, + "step": 1638 + }, + { + "epoch": 0.19, + "learning_rate": 1.8730629000707746e-05, + "loss": 0.4894, + "step": 1639 + }, + { + "epoch": 0.19, + "learning_rate": 1.872882344860668e-05, + "loss": 0.4878, + "step": 1640 + }, + { + "epoch": 0.19, + "learning_rate": 1.872701670046192e-05, + "loss": 0.5066, + "step": 1641 + }, + { + "epoch": 0.19, + "learning_rate": 1.8725208756521036e-05, + "loss": 0.4937, + "step": 1642 + }, + { + "epoch": 0.19, + "learning_rate": 1.8723399617031754e-05, + "loss": 0.4747, + "step": 1643 + }, + { + "epoch": 0.19, + "learning_rate": 1.8721589282241956e-05, + "loss": 0.4769, + "step": 1644 + }, + { + "epoch": 0.19, + "learning_rate": 1.8719777752399713e-05, + "loss": 0.4826, + "step": 1645 + }, + { + "epoch": 0.19, + "learning_rate": 1.8717965027753235e-05, + "loss": 0.4781, + "step": 1646 + }, + { + 
"epoch": 0.19, + "learning_rate": 1.8716151108550912e-05, + "loss": 0.4992, + "step": 1647 + }, + { + "epoch": 0.19, + "learning_rate": 1.871433599504129e-05, + "loss": 0.4906, + "step": 1648 + }, + { + "epoch": 0.19, + "learning_rate": 1.8712519687473075e-05, + "loss": 0.4832, + "step": 1649 + }, + { + "epoch": 0.19, + "learning_rate": 1.8710702186095147e-05, + "loss": 0.4871, + "step": 1650 + }, + { + "epoch": 0.19, + "learning_rate": 1.8708883491156544e-05, + "loss": 0.4787, + "step": 1651 + }, + { + "epoch": 0.19, + "learning_rate": 1.8707063602906466e-05, + "loss": 0.4772, + "step": 1652 + }, + { + "epoch": 0.19, + "learning_rate": 1.8705242521594276e-05, + "loss": 0.4791, + "step": 1653 + }, + { + "epoch": 0.19, + "learning_rate": 1.870342024746951e-05, + "loss": 0.4847, + "step": 1654 + }, + { + "epoch": 0.19, + "learning_rate": 1.8701596780781855e-05, + "loss": 0.4919, + "step": 1655 + }, + { + "epoch": 0.19, + "learning_rate": 1.869977212178117e-05, + "loss": 0.4897, + "step": 1656 + }, + { + "epoch": 0.19, + "learning_rate": 1.8697946270717468e-05, + "loss": 0.4652, + "step": 1657 + }, + { + "epoch": 0.19, + "learning_rate": 1.8696119227840937e-05, + "loss": 0.493, + "step": 1658 + }, + { + "epoch": 0.19, + "learning_rate": 1.869429099340192e-05, + "loss": 0.46, + "step": 1659 + }, + { + "epoch": 0.19, + "learning_rate": 1.8692461567650925e-05, + "loss": 0.5069, + "step": 1660 + }, + { + "epoch": 0.19, + "learning_rate": 1.869063095083863e-05, + "loss": 0.4855, + "step": 1661 + }, + { + "epoch": 0.19, + "learning_rate": 1.8688799143215863e-05, + "loss": 0.4678, + "step": 1662 + }, + { + "epoch": 0.19, + "learning_rate": 1.8686966145033626e-05, + "loss": 0.4733, + "step": 1663 + }, + { + "epoch": 0.19, + "learning_rate": 1.8685131956543082e-05, + "loss": 0.4652, + "step": 1664 + }, + { + "epoch": 0.19, + "learning_rate": 1.8683296577995554e-05, + "loss": 0.4923, + "step": 1665 + }, + { + "epoch": 0.19, + "learning_rate": 1.8681460009642533e-05, + "loss": 
0.4963, + "step": 1666 + }, + { + "epoch": 0.19, + "learning_rate": 1.867962225173566e-05, + "loss": 0.4617, + "step": 1667 + }, + { + "epoch": 0.19, + "learning_rate": 1.867778330452676e-05, + "loss": 0.4967, + "step": 1668 + }, + { + "epoch": 0.19, + "learning_rate": 1.8675943168267804e-05, + "loss": 0.4924, + "step": 1669 + }, + { + "epoch": 0.19, + "learning_rate": 1.8674101843210935e-05, + "loss": 0.4953, + "step": 1670 + }, + { + "epoch": 0.19, + "learning_rate": 1.8672259329608457e-05, + "loss": 0.4848, + "step": 1671 + }, + { + "epoch": 0.19, + "learning_rate": 1.8670415627712825e-05, + "loss": 0.4905, + "step": 1672 + }, + { + "epoch": 0.19, + "learning_rate": 1.866857073777668e-05, + "loss": 0.4961, + "step": 1673 + }, + { + "epoch": 0.19, + "learning_rate": 1.8666724660052807e-05, + "loss": 0.4816, + "step": 1674 + }, + { + "epoch": 0.19, + "learning_rate": 1.8664877394794158e-05, + "loss": 0.4707, + "step": 1675 + }, + { + "epoch": 0.19, + "learning_rate": 1.8663028942253854e-05, + "loss": 0.4883, + "step": 1676 + }, + { + "epoch": 0.19, + "learning_rate": 1.8661179302685177e-05, + "loss": 0.4905, + "step": 1677 + }, + { + "epoch": 0.19, + "learning_rate": 1.8659328476341557e-05, + "loss": 0.4828, + "step": 1678 + }, + { + "epoch": 0.19, + "learning_rate": 1.865747646347661e-05, + "loss": 0.4787, + "step": 1679 + }, + { + "epoch": 0.19, + "learning_rate": 1.8655623264344103e-05, + "loss": 0.478, + "step": 1680 + }, + { + "epoch": 0.19, + "learning_rate": 1.8653768879197956e-05, + "loss": 0.5098, + "step": 1681 + }, + { + "epoch": 0.19, + "learning_rate": 1.865191330829227e-05, + "loss": 0.4905, + "step": 1682 + }, + { + "epoch": 0.19, + "learning_rate": 1.8650056551881297e-05, + "loss": 0.4881, + "step": 1683 + }, + { + "epoch": 0.19, + "learning_rate": 1.8648198610219452e-05, + "loss": 0.4741, + "step": 1684 + }, + { + "epoch": 0.19, + "learning_rate": 1.864633948356132e-05, + "loss": 0.4758, + "step": 1685 + }, + { + "epoch": 0.19, + "learning_rate": 
1.8644479172161635e-05, + "loss": 0.4836, + "step": 1686 + }, + { + "epoch": 0.19, + "learning_rate": 1.8642617676275306e-05, + "loss": 0.4902, + "step": 1687 + }, + { + "epoch": 0.19, + "learning_rate": 1.8640754996157397e-05, + "loss": 0.4673, + "step": 1688 + }, + { + "epoch": 0.19, + "learning_rate": 1.863889113206314e-05, + "loss": 0.5032, + "step": 1689 + }, + { + "epoch": 0.19, + "learning_rate": 1.863702608424793e-05, + "loss": 0.4906, + "step": 1690 + }, + { + "epoch": 0.19, + "learning_rate": 1.863515985296731e-05, + "loss": 0.4808, + "step": 1691 + }, + { + "epoch": 0.19, + "learning_rate": 1.8633292438476998e-05, + "loss": 0.4903, + "step": 1692 + }, + { + "epoch": 0.19, + "learning_rate": 1.8631423841032876e-05, + "loss": 0.4751, + "step": 1693 + }, + { + "epoch": 0.19, + "learning_rate": 1.8629554060890982e-05, + "loss": 0.4874, + "step": 1694 + }, + { + "epoch": 0.19, + "learning_rate": 1.8627683098307516e-05, + "loss": 0.4805, + "step": 1695 + }, + { + "epoch": 0.19, + "learning_rate": 1.862581095353884e-05, + "loss": 0.4862, + "step": 1696 + }, + { + "epoch": 0.19, + "learning_rate": 1.8623937626841485e-05, + "loss": 0.4728, + "step": 1697 + }, + { + "epoch": 0.19, + "learning_rate": 1.8622063118472135e-05, + "loss": 0.4992, + "step": 1698 + }, + { + "epoch": 0.19, + "learning_rate": 1.8620187428687643e-05, + "loss": 0.4798, + "step": 1699 + }, + { + "epoch": 0.19, + "learning_rate": 1.861831055774501e-05, + "loss": 0.4886, + "step": 1700 + }, + { + "epoch": 0.19, + "learning_rate": 1.8616432505901427e-05, + "loss": 0.4727, + "step": 1701 + }, + { + "epoch": 0.19, + "learning_rate": 1.861455327341421e-05, + "loss": 0.4862, + "step": 1702 + }, + { + "epoch": 0.19, + "learning_rate": 1.8612672860540865e-05, + "loss": 0.4721, + "step": 1703 + }, + { + "epoch": 0.19, + "learning_rate": 1.8610791267539053e-05, + "loss": 0.4631, + "step": 1704 + }, + { + "epoch": 0.19, + "learning_rate": 1.8608908494666593e-05, + "loss": 0.4935, + "step": 1705 + }, + { + 
"epoch": 0.19, + "learning_rate": 1.8607024542181465e-05, + "loss": 0.4937, + "step": 1706 + }, + { + "epoch": 0.2, + "learning_rate": 1.860513941034181e-05, + "loss": 0.474, + "step": 1707 + }, + { + "epoch": 0.2, + "learning_rate": 1.8603253099405937e-05, + "loss": 0.4716, + "step": 1708 + }, + { + "epoch": 0.2, + "learning_rate": 1.8601365609632315e-05, + "loss": 0.5024, + "step": 1709 + }, + { + "epoch": 0.2, + "learning_rate": 1.859947694127956e-05, + "loss": 0.4774, + "step": 1710 + }, + { + "epoch": 0.2, + "learning_rate": 1.859758709460648e-05, + "loss": 0.4771, + "step": 1711 + }, + { + "epoch": 0.2, + "learning_rate": 1.8595696069872013e-05, + "loss": 0.4811, + "step": 1712 + }, + { + "epoch": 0.2, + "learning_rate": 1.8593803867335276e-05, + "loss": 0.476, + "step": 1713 + }, + { + "epoch": 0.2, + "learning_rate": 1.859191048725554e-05, + "loss": 0.4997, + "step": 1714 + }, + { + "epoch": 0.2, + "learning_rate": 1.8590015929892245e-05, + "loss": 0.4814, + "step": 1715 + }, + { + "epoch": 0.2, + "learning_rate": 1.858812019550499e-05, + "loss": 0.4644, + "step": 1716 + }, + { + "epoch": 0.2, + "learning_rate": 1.8586223284353522e-05, + "loss": 0.4897, + "step": 1717 + }, + { + "epoch": 0.2, + "learning_rate": 1.8584325196697767e-05, + "loss": 0.4907, + "step": 1718 + }, + { + "epoch": 0.2, + "learning_rate": 1.8582425932797807e-05, + "loss": 0.475, + "step": 1719 + }, + { + "epoch": 0.2, + "learning_rate": 1.8580525492913884e-05, + "loss": 0.4799, + "step": 1720 + }, + { + "epoch": 0.2, + "learning_rate": 1.8578623877306394e-05, + "loss": 0.4759, + "step": 1721 + }, + { + "epoch": 0.2, + "learning_rate": 1.8576721086235908e-05, + "loss": 0.509, + "step": 1722 + }, + { + "epoch": 0.2, + "learning_rate": 1.8574817119963145e-05, + "loss": 0.4782, + "step": 1723 + }, + { + "epoch": 0.2, + "learning_rate": 1.8572911978748993e-05, + "loss": 0.4763, + "step": 1724 + }, + { + "epoch": 0.2, + "learning_rate": 1.8571005662854502e-05, + "loss": 0.5017, + "step": 
1725 + }, + { + "epoch": 0.2, + "learning_rate": 1.8569098172540875e-05, + "loss": 0.4736, + "step": 1726 + }, + { + "epoch": 0.2, + "learning_rate": 1.856718950806949e-05, + "loss": 0.4701, + "step": 1727 + }, + { + "epoch": 0.2, + "learning_rate": 1.8565279669701862e-05, + "loss": 0.4726, + "step": 1728 + }, + { + "epoch": 0.2, + "learning_rate": 1.8563368657699693e-05, + "loss": 0.4578, + "step": 1729 + }, + { + "epoch": 0.2, + "learning_rate": 1.856145647232483e-05, + "loss": 0.4809, + "step": 1730 + }, + { + "epoch": 0.2, + "learning_rate": 1.8559543113839285e-05, + "loss": 0.4705, + "step": 1731 + }, + { + "epoch": 0.2, + "learning_rate": 1.8557628582505235e-05, + "loss": 0.5124, + "step": 1732 + }, + { + "epoch": 0.2, + "learning_rate": 1.8555712878585005e-05, + "loss": 0.473, + "step": 1733 + }, + { + "epoch": 0.2, + "learning_rate": 1.8553796002341098e-05, + "loss": 0.4715, + "step": 1734 + }, + { + "epoch": 0.2, + "learning_rate": 1.8551877954036165e-05, + "loss": 0.4905, + "step": 1735 + }, + { + "epoch": 0.2, + "learning_rate": 1.854995873393302e-05, + "loss": 0.4924, + "step": 1736 + }, + { + "epoch": 0.2, + "learning_rate": 1.854803834229464e-05, + "loss": 0.4684, + "step": 1737 + }, + { + "epoch": 0.2, + "learning_rate": 1.8546116779384165e-05, + "loss": 0.4869, + "step": 1738 + }, + { + "epoch": 0.2, + "learning_rate": 1.8544194045464888e-05, + "loss": 0.47, + "step": 1739 + }, + { + "epoch": 0.2, + "learning_rate": 1.8542270140800266e-05, + "loss": 0.4872, + "step": 1740 + }, + { + "epoch": 0.2, + "learning_rate": 1.854034506565392e-05, + "loss": 0.4664, + "step": 1741 + }, + { + "epoch": 0.2, + "learning_rate": 1.8538418820289628e-05, + "loss": 0.4862, + "step": 1742 + }, + { + "epoch": 0.2, + "learning_rate": 1.8536491404971327e-05, + "loss": 0.466, + "step": 1743 + }, + { + "epoch": 0.2, + "learning_rate": 1.8534562819963112e-05, + "loss": 0.4869, + "step": 1744 + }, + { + "epoch": 0.2, + "learning_rate": 1.853263306552925e-05, + "loss": 0.477, 
+ "step": 1745 + }, + { + "epoch": 0.2, + "learning_rate": 1.8530702141934157e-05, + "loss": 0.4889, + "step": 1746 + }, + { + "epoch": 0.2, + "learning_rate": 1.8528770049442413e-05, + "loss": 0.4812, + "step": 1747 + }, + { + "epoch": 0.2, + "learning_rate": 1.852683678831876e-05, + "loss": 0.5022, + "step": 1748 + }, + { + "epoch": 0.2, + "learning_rate": 1.852490235882809e-05, + "loss": 0.477, + "step": 1749 + }, + { + "epoch": 0.2, + "learning_rate": 1.852296676123547e-05, + "loss": 0.4985, + "step": 1750 + }, + { + "epoch": 0.2, + "learning_rate": 1.8521029995806123e-05, + "loss": 0.489, + "step": 1751 + }, + { + "epoch": 0.2, + "learning_rate": 1.851909206280542e-05, + "loss": 0.4941, + "step": 1752 + }, + { + "epoch": 0.2, + "learning_rate": 1.8517152962498908e-05, + "loss": 0.4833, + "step": 1753 + }, + { + "epoch": 0.2, + "learning_rate": 1.8515212695152284e-05, + "loss": 0.4874, + "step": 1754 + }, + { + "epoch": 0.2, + "learning_rate": 1.8513271261031406e-05, + "loss": 0.4818, + "step": 1755 + }, + { + "epoch": 0.2, + "learning_rate": 1.8511328660402302e-05, + "loss": 0.4875, + "step": 1756 + }, + { + "epoch": 0.2, + "learning_rate": 1.850938489353114e-05, + "loss": 0.4796, + "step": 1757 + }, + { + "epoch": 0.2, + "learning_rate": 1.850743996068427e-05, + "loss": 0.4783, + "step": 1758 + }, + { + "epoch": 0.2, + "learning_rate": 1.8505493862128187e-05, + "loss": 0.4777, + "step": 1759 + }, + { + "epoch": 0.2, + "learning_rate": 1.8503546598129547e-05, + "loss": 0.478, + "step": 1760 + }, + { + "epoch": 0.2, + "learning_rate": 1.8501598168955172e-05, + "loss": 0.4774, + "step": 1761 + }, + { + "epoch": 0.2, + "learning_rate": 1.8499648574872042e-05, + "loss": 0.4938, + "step": 1762 + }, + { + "epoch": 0.2, + "learning_rate": 1.849769781614729e-05, + "loss": 0.4838, + "step": 1763 + }, + { + "epoch": 0.2, + "learning_rate": 1.849574589304822e-05, + "loss": 0.4955, + "step": 1764 + }, + { + "epoch": 0.2, + "learning_rate": 1.8493792805842278e-05, + 
"loss": 0.4886, + "step": 1765 + }, + { + "epoch": 0.2, + "learning_rate": 1.8491838554797096e-05, + "loss": 0.4945, + "step": 1766 + }, + { + "epoch": 0.2, + "learning_rate": 1.8489883140180437e-05, + "loss": 0.4612, + "step": 1767 + }, + { + "epoch": 0.2, + "learning_rate": 1.848792656226024e-05, + "loss": 0.492, + "step": 1768 + }, + { + "epoch": 0.2, + "learning_rate": 1.8485968821304604e-05, + "loss": 0.4717, + "step": 1769 + }, + { + "epoch": 0.2, + "learning_rate": 1.848400991758178e-05, + "loss": 0.4814, + "step": 1770 + }, + { + "epoch": 0.2, + "learning_rate": 1.8482049851360182e-05, + "loss": 0.48, + "step": 1771 + }, + { + "epoch": 0.2, + "learning_rate": 1.8480088622908382e-05, + "loss": 0.4792, + "step": 1772 + }, + { + "epoch": 0.2, + "learning_rate": 1.8478126232495114e-05, + "loss": 0.4839, + "step": 1773 + }, + { + "epoch": 0.2, + "learning_rate": 1.8476162680389268e-05, + "loss": 0.4825, + "step": 1774 + }, + { + "epoch": 0.2, + "learning_rate": 1.847419796685989e-05, + "loss": 0.4821, + "step": 1775 + }, + { + "epoch": 0.2, + "learning_rate": 1.84722320921762e-05, + "loss": 0.5115, + "step": 1776 + }, + { + "epoch": 0.2, + "learning_rate": 1.8470265056607557e-05, + "loss": 0.46, + "step": 1777 + }, + { + "epoch": 0.2, + "learning_rate": 1.8468296860423494e-05, + "loss": 0.4905, + "step": 1778 + }, + { + "epoch": 0.2, + "learning_rate": 1.8466327503893697e-05, + "loss": 0.4867, + "step": 1779 + }, + { + "epoch": 0.2, + "learning_rate": 1.8464356987288012e-05, + "loss": 0.4755, + "step": 1780 + }, + { + "epoch": 0.2, + "learning_rate": 1.8462385310876444e-05, + "loss": 0.4933, + "step": 1781 + }, + { + "epoch": 0.2, + "learning_rate": 1.8460412474929154e-05, + "loss": 0.4886, + "step": 1782 + }, + { + "epoch": 0.2, + "learning_rate": 1.8458438479716466e-05, + "loss": 0.4768, + "step": 1783 + }, + { + "epoch": 0.2, + "learning_rate": 1.845646332550886e-05, + "loss": 0.4779, + "step": 1784 + }, + { + "epoch": 0.2, + "learning_rate": 
1.845448701257698e-05, + "loss": 0.4866, + "step": 1785 + }, + { + "epoch": 0.2, + "learning_rate": 1.8452509541191625e-05, + "loss": 0.4603, + "step": 1786 + }, + { + "epoch": 0.2, + "learning_rate": 1.8450530911623747e-05, + "loss": 0.5003, + "step": 1787 + }, + { + "epoch": 0.2, + "learning_rate": 1.8448551124144467e-05, + "loss": 0.4798, + "step": 1788 + }, + { + "epoch": 0.2, + "learning_rate": 1.844657017902506e-05, + "loss": 0.4767, + "step": 1789 + }, + { + "epoch": 0.2, + "learning_rate": 1.844458807653696e-05, + "loss": 0.4962, + "step": 1790 + }, + { + "epoch": 0.2, + "learning_rate": 1.8442604816951757e-05, + "loss": 0.4689, + "step": 1791 + }, + { + "epoch": 0.2, + "learning_rate": 1.8440620400541202e-05, + "loss": 0.5121, + "step": 1792 + }, + { + "epoch": 0.2, + "learning_rate": 1.843863482757721e-05, + "loss": 0.4729, + "step": 1793 + }, + { + "epoch": 0.21, + "learning_rate": 1.8436648098331838e-05, + "loss": 0.4883, + "step": 1794 + }, + { + "epoch": 0.21, + "learning_rate": 1.843466021307732e-05, + "loss": 0.4687, + "step": 1795 + }, + { + "epoch": 0.21, + "learning_rate": 1.8432671172086044e-05, + "loss": 0.4604, + "step": 1796 + }, + { + "epoch": 0.21, + "learning_rate": 1.8430680975630545e-05, + "loss": 0.4798, + "step": 1797 + }, + { + "epoch": 0.21, + "learning_rate": 1.8428689623983526e-05, + "loss": 0.468, + "step": 1798 + }, + { + "epoch": 0.21, + "learning_rate": 1.8426697117417848e-05, + "loss": 0.4735, + "step": 1799 + }, + { + "epoch": 0.21, + "learning_rate": 1.8424703456206533e-05, + "loss": 0.4889, + "step": 1800 + }, + { + "epoch": 0.21, + "learning_rate": 1.842270864062275e-05, + "loss": 0.4724, + "step": 1801 + }, + { + "epoch": 0.21, + "learning_rate": 1.8420712670939837e-05, + "loss": 0.4837, + "step": 1802 + }, + { + "epoch": 0.21, + "learning_rate": 1.8418715547431283e-05, + "loss": 0.4789, + "step": 1803 + }, + { + "epoch": 0.21, + "learning_rate": 1.8416717270370744e-05, + "loss": 0.4703, + "step": 1804 + }, + { + "epoch": 
0.21, + "learning_rate": 1.841471784003203e-05, + "loss": 0.4746, + "step": 1805 + }, + { + "epoch": 0.21, + "learning_rate": 1.84127172566891e-05, + "loss": 0.4867, + "step": 1806 + }, + { + "epoch": 0.21, + "learning_rate": 1.841071552061608e-05, + "loss": 0.4663, + "step": 1807 + }, + { + "epoch": 0.21, + "learning_rate": 1.8408712632087256e-05, + "loss": 0.5056, + "step": 1808 + }, + { + "epoch": 0.21, + "learning_rate": 1.840670859137707e-05, + "loss": 0.4712, + "step": 1809 + }, + { + "epoch": 0.21, + "learning_rate": 1.840470339876011e-05, + "loss": 0.4915, + "step": 1810 + }, + { + "epoch": 0.21, + "learning_rate": 1.8402697054511145e-05, + "loss": 0.4752, + "step": 1811 + }, + { + "epoch": 0.21, + "learning_rate": 1.8400689558905083e-05, + "loss": 0.4876, + "step": 1812 + }, + { + "epoch": 0.21, + "learning_rate": 1.8398680912216997e-05, + "loss": 0.4676, + "step": 1813 + }, + { + "epoch": 0.21, + "learning_rate": 1.8396671114722112e-05, + "loss": 0.497, + "step": 1814 + }, + { + "epoch": 0.21, + "learning_rate": 1.8394660166695822e-05, + "loss": 0.4645, + "step": 1815 + }, + { + "epoch": 0.21, + "learning_rate": 1.8392648068413667e-05, + "loss": 0.5071, + "step": 1816 + }, + { + "epoch": 0.21, + "learning_rate": 1.8390634820151353e-05, + "loss": 0.4908, + "step": 1817 + }, + { + "epoch": 0.21, + "learning_rate": 1.8388620422184738e-05, + "loss": 0.4663, + "step": 1818 + }, + { + "epoch": 0.21, + "learning_rate": 1.8386604874789836e-05, + "loss": 0.4705, + "step": 1819 + }, + { + "epoch": 0.21, + "learning_rate": 1.8384588178242828e-05, + "loss": 0.4809, + "step": 1820 + }, + { + "epoch": 0.21, + "learning_rate": 1.8382570332820045e-05, + "loss": 0.5041, + "step": 1821 + }, + { + "epoch": 0.21, + "learning_rate": 1.8380551338797974e-05, + "loss": 0.472, + "step": 1822 + }, + { + "epoch": 0.21, + "learning_rate": 1.8378531196453265e-05, + "loss": 0.4672, + "step": 1823 + }, + { + "epoch": 0.21, + "learning_rate": 1.837650990606272e-05, + "loss": 0.498, + 
"step": 1824 + }, + { + "epoch": 0.21, + "learning_rate": 1.8374487467903303e-05, + "loss": 0.4822, + "step": 1825 + }, + { + "epoch": 0.21, + "learning_rate": 1.8372463882252133e-05, + "loss": 0.5021, + "step": 1826 + }, + { + "epoch": 0.21, + "learning_rate": 1.8370439149386484e-05, + "loss": 0.4562, + "step": 1827 + }, + { + "epoch": 0.21, + "learning_rate": 1.8368413269583795e-05, + "loss": 0.4936, + "step": 1828 + }, + { + "epoch": 0.21, + "learning_rate": 1.8366386243121654e-05, + "loss": 0.4606, + "step": 1829 + }, + { + "epoch": 0.21, + "learning_rate": 1.8364358070277807e-05, + "loss": 0.4959, + "step": 1830 + }, + { + "epoch": 0.21, + "learning_rate": 1.836232875133016e-05, + "loss": 0.4866, + "step": 1831 + }, + { + "epoch": 0.21, + "learning_rate": 1.8360298286556774e-05, + "loss": 0.4869, + "step": 1832 + }, + { + "epoch": 0.21, + "learning_rate": 1.8358266676235872e-05, + "loss": 0.4695, + "step": 1833 + }, + { + "epoch": 0.21, + "learning_rate": 1.8356233920645822e-05, + "loss": 0.5119, + "step": 1834 + }, + { + "epoch": 0.21, + "learning_rate": 1.8354200020065168e-05, + "loss": 0.4823, + "step": 1835 + }, + { + "epoch": 0.21, + "learning_rate": 1.8352164974772592e-05, + "loss": 0.4872, + "step": 1836 + }, + { + "epoch": 0.21, + "learning_rate": 1.8350128785046943e-05, + "loss": 0.4604, + "step": 1837 + }, + { + "epoch": 0.21, + "learning_rate": 1.8348091451167224e-05, + "loss": 0.4571, + "step": 1838 + }, + { + "epoch": 0.21, + "learning_rate": 1.8346052973412593e-05, + "loss": 0.5112, + "step": 1839 + }, + { + "epoch": 0.21, + "learning_rate": 1.834401335206237e-05, + "loss": 0.4873, + "step": 1840 + }, + { + "epoch": 0.21, + "learning_rate": 1.8341972587396032e-05, + "loss": 0.4778, + "step": 1841 + }, + { + "epoch": 0.21, + "learning_rate": 1.8339930679693202e-05, + "loss": 0.475, + "step": 1842 + }, + { + "epoch": 0.21, + "learning_rate": 1.8337887629233672e-05, + "loss": 0.4649, + "step": 1843 + }, + { + "epoch": 0.21, + "learning_rate": 
1.833584343629738e-05, + "loss": 0.5096, + "step": 1844 + }, + { + "epoch": 0.21, + "learning_rate": 1.8333798101164433e-05, + "loss": 0.4945, + "step": 1845 + }, + { + "epoch": 0.21, + "learning_rate": 1.833175162411508e-05, + "loss": 0.4645, + "step": 1846 + }, + { + "epoch": 0.21, + "learning_rate": 1.8329704005429745e-05, + "loss": 0.4822, + "step": 1847 + }, + { + "epoch": 0.21, + "learning_rate": 1.8327655245388986e-05, + "loss": 0.4826, + "step": 1848 + }, + { + "epoch": 0.21, + "learning_rate": 1.8325605344273536e-05, + "loss": 0.4994, + "step": 1849 + }, + { + "epoch": 0.21, + "learning_rate": 1.8323554302364273e-05, + "loss": 0.4873, + "step": 1850 + }, + { + "epoch": 0.21, + "learning_rate": 1.832150211994224e-05, + "loss": 0.4714, + "step": 1851 + }, + { + "epoch": 0.21, + "learning_rate": 1.8319448797288628e-05, + "loss": 0.4985, + "step": 1852 + }, + { + "epoch": 0.21, + "learning_rate": 1.831739433468479e-05, + "loss": 0.4863, + "step": 1853 + }, + { + "epoch": 0.21, + "learning_rate": 1.831533873241223e-05, + "loss": 0.4833, + "step": 1854 + }, + { + "epoch": 0.21, + "learning_rate": 1.831328199075262e-05, + "loss": 0.4667, + "step": 1855 + }, + { + "epoch": 0.21, + "learning_rate": 1.8311224109987768e-05, + "loss": 0.476, + "step": 1856 + }, + { + "epoch": 0.21, + "learning_rate": 1.8309165090399657e-05, + "loss": 0.516, + "step": 1857 + }, + { + "epoch": 0.21, + "learning_rate": 1.8307104932270415e-05, + "loss": 0.4699, + "step": 1858 + }, + { + "epoch": 0.21, + "learning_rate": 1.8305043635882334e-05, + "loss": 0.4787, + "step": 1859 + }, + { + "epoch": 0.21, + "learning_rate": 1.830298120151785e-05, + "loss": 0.4944, + "step": 1860 + }, + { + "epoch": 0.21, + "learning_rate": 1.8300917629459575e-05, + "loss": 0.494, + "step": 1861 + }, + { + "epoch": 0.21, + "learning_rate": 1.8298852919990254e-05, + "loss": 0.4809, + "step": 1862 + }, + { + "epoch": 0.21, + "learning_rate": 1.82967870733928e-05, + "loss": 0.4735, + "step": 1863 + }, + { + 
"epoch": 0.21, + "learning_rate": 1.8294720089950282e-05, + "loss": 0.4649, + "step": 1864 + }, + { + "epoch": 0.21, + "learning_rate": 1.8292651969945923e-05, + "loss": 0.491, + "step": 1865 + }, + { + "epoch": 0.21, + "learning_rate": 1.82905827136631e-05, + "loss": 0.4947, + "step": 1866 + }, + { + "epoch": 0.21, + "learning_rate": 1.828851232138535e-05, + "loss": 0.4686, + "step": 1867 + }, + { + "epoch": 0.21, + "learning_rate": 1.828644079339636e-05, + "loss": 0.4651, + "step": 1868 + }, + { + "epoch": 0.21, + "learning_rate": 1.828436812997998e-05, + "loss": 0.4662, + "step": 1869 + }, + { + "epoch": 0.21, + "learning_rate": 1.8282294331420204e-05, + "loss": 0.4872, + "step": 1870 + }, + { + "epoch": 0.21, + "learning_rate": 1.8280219398001192e-05, + "loss": 0.4907, + "step": 1871 + }, + { + "epoch": 0.21, + "learning_rate": 1.827814333000726e-05, + "loss": 0.4671, + "step": 1872 + }, + { + "epoch": 0.21, + "learning_rate": 1.827606612772287e-05, + "loss": 0.474, + "step": 1873 + }, + { + "epoch": 0.21, + "learning_rate": 1.827398779143265e-05, + "loss": 0.4831, + "step": 1874 + }, + { + "epoch": 0.21, + "learning_rate": 1.8271908321421376e-05, + "loss": 0.4973, + "step": 1875 + }, + { + "epoch": 0.21, + "learning_rate": 1.8269827717973982e-05, + "loss": 0.4786, + "step": 1876 + }, + { + "epoch": 0.21, + "learning_rate": 1.8267745981375555e-05, + "loss": 0.4745, + "step": 1877 + }, + { + "epoch": 0.21, + "learning_rate": 1.8265663111911344e-05, + "loss": 0.5025, + "step": 1878 + }, + { + "epoch": 0.21, + "learning_rate": 1.8263579109866745e-05, + "loss": 0.4768, + "step": 1879 + }, + { + "epoch": 0.21, + "learning_rate": 1.8261493975527312e-05, + "loss": 0.4807, + "step": 1880 + }, + { + "epoch": 0.21, + "learning_rate": 1.8259407709178758e-05, + "loss": 0.4922, + "step": 1881 + }, + { + "epoch": 0.22, + "learning_rate": 1.8257320311106948e-05, + "loss": 0.4806, + "step": 1882 + }, + { + "epoch": 0.22, + "learning_rate": 1.82552317815979e-05, + "loss": 
0.491, + "step": 1883 + }, + { + "epoch": 0.22, + "learning_rate": 1.825314212093779e-05, + "loss": 0.4727, + "step": 1884 + }, + { + "epoch": 0.22, + "learning_rate": 1.825105132941295e-05, + "loss": 0.4835, + "step": 1885 + }, + { + "epoch": 0.22, + "learning_rate": 1.8248959407309862e-05, + "loss": 0.4773, + "step": 1886 + }, + { + "epoch": 0.22, + "learning_rate": 1.824686635491517e-05, + "loss": 0.4654, + "step": 1887 + }, + { + "epoch": 0.22, + "learning_rate": 1.824477217251566e-05, + "loss": 0.4921, + "step": 1888 + }, + { + "epoch": 0.22, + "learning_rate": 1.8242676860398295e-05, + "loss": 0.5063, + "step": 1889 + }, + { + "epoch": 0.22, + "learning_rate": 1.824058041885017e-05, + "loss": 0.4779, + "step": 1890 + }, + { + "epoch": 0.22, + "learning_rate": 1.8238482848158548e-05, + "loss": 0.4864, + "step": 1891 + }, + { + "epoch": 0.22, + "learning_rate": 1.8236384148610843e-05, + "loss": 0.4714, + "step": 1892 + }, + { + "epoch": 0.22, + "learning_rate": 1.823428432049462e-05, + "loss": 0.4911, + "step": 1893 + }, + { + "epoch": 0.22, + "learning_rate": 1.8232183364097605e-05, + "loss": 0.4711, + "step": 1894 + }, + { + "epoch": 0.22, + "learning_rate": 1.8230081279707675e-05, + "loss": 0.4749, + "step": 1895 + }, + { + "epoch": 0.22, + "learning_rate": 1.822797806761287e-05, + "loss": 0.4695, + "step": 1896 + }, + { + "epoch": 0.22, + "learning_rate": 1.8225873728101367e-05, + "loss": 0.4946, + "step": 1897 + }, + { + "epoch": 0.22, + "learning_rate": 1.822376826146151e-05, + "loss": 0.4869, + "step": 1898 + }, + { + "epoch": 0.22, + "learning_rate": 1.8221661667981795e-05, + "loss": 0.4795, + "step": 1899 + }, + { + "epoch": 0.22, + "learning_rate": 1.8219553947950874e-05, + "loss": 0.4721, + "step": 1900 + }, + { + "epoch": 0.22, + "learning_rate": 1.8217445101657553e-05, + "loss": 0.4663, + "step": 1901 + }, + { + "epoch": 0.22, + "learning_rate": 1.8215335129390785e-05, + "loss": 0.4833, + "step": 1902 + }, + { + "epoch": 0.22, + "learning_rate": 
1.821322403143969e-05, + "loss": 0.481, + "step": 1903 + }, + { + "epoch": 0.22, + "learning_rate": 1.8211111808093534e-05, + "loss": 0.473, + "step": 1904 + }, + { + "epoch": 0.22, + "learning_rate": 1.8208998459641737e-05, + "loss": 0.4881, + "step": 1905 + }, + { + "epoch": 0.22, + "learning_rate": 1.8206883986373872e-05, + "loss": 0.4802, + "step": 1906 + }, + { + "epoch": 0.22, + "learning_rate": 1.820476838857968e-05, + "loss": 0.4852, + "step": 1907 + }, + { + "epoch": 0.22, + "learning_rate": 1.820265166654903e-05, + "loss": 0.4831, + "step": 1908 + }, + { + "epoch": 0.22, + "learning_rate": 1.8200533820571973e-05, + "loss": 0.497, + "step": 1909 + }, + { + "epoch": 0.22, + "learning_rate": 1.8198414850938694e-05, + "loss": 0.4615, + "step": 1910 + }, + { + "epoch": 0.22, + "learning_rate": 1.8196294757939543e-05, + "loss": 0.4956, + "step": 1911 + }, + { + "epoch": 0.22, + "learning_rate": 1.8194173541865014e-05, + "loss": 0.4716, + "step": 1912 + }, + { + "epoch": 0.22, + "learning_rate": 1.8192051203005768e-05, + "loss": 0.4638, + "step": 1913 + }, + { + "epoch": 0.22, + "learning_rate": 1.818992774165261e-05, + "loss": 0.4879, + "step": 1914 + }, + { + "epoch": 0.22, + "learning_rate": 1.81878031580965e-05, + "loss": 0.4875, + "step": 1915 + }, + { + "epoch": 0.22, + "learning_rate": 1.8185677452628557e-05, + "loss": 0.4882, + "step": 1916 + }, + { + "epoch": 0.22, + "learning_rate": 1.818355062554005e-05, + "loss": 0.4997, + "step": 1917 + }, + { + "epoch": 0.22, + "learning_rate": 1.81814226771224e-05, + "loss": 0.4726, + "step": 1918 + }, + { + "epoch": 0.22, + "learning_rate": 1.8179293607667177e-05, + "loss": 0.4946, + "step": 1919 + }, + { + "epoch": 0.22, + "learning_rate": 1.8177163417466122e-05, + "loss": 0.4876, + "step": 1920 + }, + { + "epoch": 0.22, + "learning_rate": 1.8175032106811114e-05, + "loss": 0.4709, + "step": 1921 + }, + { + "epoch": 0.22, + "learning_rate": 1.817289967599419e-05, + "loss": 0.4999, + "step": 1922 + }, + { + 
"epoch": 0.22, + "learning_rate": 1.8170766125307543e-05, + "loss": 0.4864, + "step": 1923 + }, + { + "epoch": 0.22, + "learning_rate": 1.816863145504351e-05, + "loss": 0.4812, + "step": 1924 + }, + { + "epoch": 0.22, + "learning_rate": 1.81664956654946e-05, + "loss": 0.4748, + "step": 1925 + }, + { + "epoch": 0.22, + "learning_rate": 1.816435875695345e-05, + "loss": 0.4671, + "step": 1926 + }, + { + "epoch": 0.22, + "learning_rate": 1.8162220729712875e-05, + "loss": 0.4808, + "step": 1927 + }, + { + "epoch": 0.22, + "learning_rate": 1.8160081584065833e-05, + "loss": 0.4736, + "step": 1928 + }, + { + "epoch": 0.22, + "learning_rate": 1.8157941320305424e-05, + "loss": 0.4868, + "step": 1929 + }, + { + "epoch": 0.22, + "learning_rate": 1.815579993872492e-05, + "loss": 0.4725, + "step": 1930 + }, + { + "epoch": 0.22, + "learning_rate": 1.8153657439617738e-05, + "loss": 0.491, + "step": 1931 + }, + { + "epoch": 0.22, + "learning_rate": 1.8151513823277447e-05, + "loss": 0.4761, + "step": 1932 + }, + { + "epoch": 0.22, + "learning_rate": 1.8149369089997767e-05, + "loss": 0.5125, + "step": 1933 + }, + { + "epoch": 0.22, + "learning_rate": 1.814722324007258e-05, + "loss": 0.4676, + "step": 1934 + }, + { + "epoch": 0.22, + "learning_rate": 1.8145076273795914e-05, + "loss": 0.4984, + "step": 1935 + }, + { + "epoch": 0.22, + "learning_rate": 1.814292819146195e-05, + "loss": 0.47, + "step": 1936 + }, + { + "epoch": 0.22, + "learning_rate": 1.814077899336502e-05, + "loss": 0.4997, + "step": 1937 + }, + { + "epoch": 0.22, + "learning_rate": 1.813862867979962e-05, + "loss": 0.4623, + "step": 1938 + }, + { + "epoch": 0.22, + "learning_rate": 1.8136477251060385e-05, + "loss": 0.463, + "step": 1939 + }, + { + "epoch": 0.22, + "learning_rate": 1.813432470744211e-05, + "loss": 0.4888, + "step": 1940 + }, + { + "epoch": 0.22, + "learning_rate": 1.813217104923974e-05, + "loss": 0.4848, + "step": 1941 + }, + { + "epoch": 0.22, + "learning_rate": 1.813001627674838e-05, + "loss": 0.5043, + 
"step": 1942 + }, + { + "epoch": 0.22, + "learning_rate": 1.8127860390263275e-05, + "loss": 0.4762, + "step": 1943 + }, + { + "epoch": 0.22, + "learning_rate": 1.812570339007983e-05, + "loss": 0.4681, + "step": 1944 + }, + { + "epoch": 0.22, + "learning_rate": 1.8123545276493607e-05, + "loss": 0.4824, + "step": 1945 + }, + { + "epoch": 0.22, + "learning_rate": 1.8121386049800317e-05, + "loss": 0.4819, + "step": 1946 + }, + { + "epoch": 0.22, + "learning_rate": 1.8119225710295815e-05, + "loss": 0.4734, + "step": 1947 + }, + { + "epoch": 0.22, + "learning_rate": 1.811706425827612e-05, + "loss": 0.4641, + "step": 1948 + }, + { + "epoch": 0.22, + "learning_rate": 1.8114901694037402e-05, + "loss": 0.4786, + "step": 1949 + }, + { + "epoch": 0.22, + "learning_rate": 1.8112738017875974e-05, + "loss": 0.484, + "step": 1950 + }, + { + "epoch": 0.22, + "learning_rate": 1.811057323008831e-05, + "loss": 0.5028, + "step": 1951 + }, + { + "epoch": 0.22, + "learning_rate": 1.810840733097104e-05, + "loss": 0.481, + "step": 1952 + }, + { + "epoch": 0.22, + "learning_rate": 1.8106240320820928e-05, + "loss": 0.4853, + "step": 1953 + }, + { + "epoch": 0.22, + "learning_rate": 1.8104072199934916e-05, + "loss": 0.468, + "step": 1954 + }, + { + "epoch": 0.22, + "learning_rate": 1.8101902968610082e-05, + "loss": 0.4989, + "step": 1955 + }, + { + "epoch": 0.22, + "learning_rate": 1.8099732627143655e-05, + "loss": 0.4905, + "step": 1956 + }, + { + "epoch": 0.22, + "learning_rate": 1.809756117583302e-05, + "loss": 0.464, + "step": 1957 + }, + { + "epoch": 0.22, + "learning_rate": 1.809538861497572e-05, + "loss": 0.4789, + "step": 1958 + }, + { + "epoch": 0.22, + "learning_rate": 1.8093214944869437e-05, + "loss": 0.4867, + "step": 1959 + }, + { + "epoch": 0.22, + "learning_rate": 1.8091040165812018e-05, + "loss": 0.4939, + "step": 1960 + }, + { + "epoch": 0.22, + "learning_rate": 1.8088864278101452e-05, + "loss": 0.4874, + "step": 1961 + }, + { + "epoch": 0.22, + "learning_rate": 
1.808668728203589e-05, + "loss": 0.4741, + "step": 1962 + }, + { + "epoch": 0.22, + "learning_rate": 1.8084509177913623e-05, + "loss": 0.4968, + "step": 1963 + }, + { + "epoch": 0.22, + "learning_rate": 1.8082329966033105e-05, + "loss": 0.4904, + "step": 1964 + }, + { + "epoch": 0.22, + "learning_rate": 1.8080149646692932e-05, + "loss": 0.4782, + "step": 1965 + }, + { + "epoch": 0.22, + "learning_rate": 1.807796822019186e-05, + "loss": 0.4839, + "step": 1966 + }, + { + "epoch": 0.22, + "learning_rate": 1.807578568682879e-05, + "loss": 0.4976, + "step": 1967 + }, + { + "epoch": 0.22, + "learning_rate": 1.8073602046902784e-05, + "loss": 0.4823, + "step": 1968 + }, + { + "epoch": 0.23, + "learning_rate": 1.8071417300713038e-05, + "loss": 0.4928, + "step": 1969 + }, + { + "epoch": 0.23, + "learning_rate": 1.8069231448558923e-05, + "loss": 0.4787, + "step": 1970 + }, + { + "epoch": 0.23, + "learning_rate": 1.806704449073994e-05, + "loss": 0.4659, + "step": 1971 + }, + { + "epoch": 0.23, + "learning_rate": 1.806485642755576e-05, + "loss": 0.477, + "step": 1972 + }, + { + "epoch": 0.23, + "learning_rate": 1.8062667259306193e-05, + "loss": 0.4825, + "step": 1973 + }, + { + "epoch": 0.23, + "learning_rate": 1.80604769862912e-05, + "loss": 0.4756, + "step": 1974 + }, + { + "epoch": 0.23, + "learning_rate": 1.8058285608810903e-05, + "loss": 0.4778, + "step": 1975 + }, + { + "epoch": 0.23, + "learning_rate": 1.8056093127165564e-05, + "loss": 0.5072, + "step": 1976 + }, + { + "epoch": 0.23, + "learning_rate": 1.8053899541655605e-05, + "loss": 0.4728, + "step": 1977 + }, + { + "epoch": 0.23, + "learning_rate": 1.8051704852581595e-05, + "loss": 0.4835, + "step": 1978 + }, + { + "epoch": 0.23, + "learning_rate": 1.804950906024426e-05, + "loss": 0.4829, + "step": 1979 + }, + { + "epoch": 0.23, + "learning_rate": 1.804731216494447e-05, + "loss": 0.4627, + "step": 1980 + }, + { + "epoch": 0.23, + "learning_rate": 1.804511416698324e-05, + "loss": 0.4922, + "step": 1981 + }, + { + 
"epoch": 0.23, + "learning_rate": 1.804291506666176e-05, + "loss": 0.4687, + "step": 1982 + }, + { + "epoch": 0.23, + "learning_rate": 1.8040714864281347e-05, + "loss": 0.5074, + "step": 1983 + }, + { + "epoch": 0.23, + "learning_rate": 1.8038513560143477e-05, + "loss": 0.479, + "step": 1984 + }, + { + "epoch": 0.23, + "learning_rate": 1.8036311154549783e-05, + "loss": 0.4841, + "step": 1985 + }, + { + "epoch": 0.23, + "learning_rate": 1.803410764780204e-05, + "loss": 0.4791, + "step": 1986 + }, + { + "epoch": 0.23, + "learning_rate": 1.803190304020218e-05, + "loss": 0.5005, + "step": 1987 + }, + { + "epoch": 0.23, + "learning_rate": 1.8029697332052277e-05, + "loss": 0.4771, + "step": 1988 + }, + { + "epoch": 0.23, + "learning_rate": 1.8027490523654568e-05, + "loss": 0.4959, + "step": 1989 + }, + { + "epoch": 0.23, + "learning_rate": 1.8025282615311437e-05, + "loss": 0.4576, + "step": 1990 + }, + { + "epoch": 0.23, + "learning_rate": 1.802307360732541e-05, + "loss": 0.4692, + "step": 1991 + }, + { + "epoch": 0.23, + "learning_rate": 1.8020863499999182e-05, + "loss": 0.4766, + "step": 1992 + }, + { + "epoch": 0.23, + "learning_rate": 1.801865229363557e-05, + "loss": 0.4822, + "step": 1993 + }, + { + "epoch": 0.23, + "learning_rate": 1.8016439988537576e-05, + "loss": 0.4648, + "step": 1994 + }, + { + "epoch": 0.23, + "learning_rate": 1.8014226585008322e-05, + "loss": 0.505, + "step": 1995 + }, + { + "epoch": 0.23, + "learning_rate": 1.80120120833511e-05, + "loss": 0.4659, + "step": 1996 + }, + { + "epoch": 0.23, + "learning_rate": 1.8009796483869347e-05, + "loss": 0.485, + "step": 1997 + }, + { + "epoch": 0.23, + "learning_rate": 1.8007579786866648e-05, + "loss": 0.4654, + "step": 1998 + }, + { + "epoch": 0.23, + "learning_rate": 1.8005361992646736e-05, + "loss": 0.4911, + "step": 1999 + }, + { + "epoch": 0.23, + "learning_rate": 1.8003143101513502e-05, + "loss": 0.5044, + "step": 2000 + }, + { + "epoch": 0.23, + "learning_rate": 1.8000923113770987e-05, + "loss": 
0.4838, + "step": 2001 + }, + { + "epoch": 0.23, + "learning_rate": 1.7998702029723372e-05, + "loss": 0.454, + "step": 2002 + }, + { + "epoch": 0.23, + "learning_rate": 1.7996479849675e-05, + "loss": 0.4945, + "step": 2003 + }, + { + "epoch": 0.23, + "learning_rate": 1.799425657393036e-05, + "loss": 0.4773, + "step": 2004 + }, + { + "epoch": 0.23, + "learning_rate": 1.7992032202794084e-05, + "loss": 0.4931, + "step": 2005 + }, + { + "epoch": 0.23, + "learning_rate": 1.798980673657097e-05, + "loss": 0.4624, + "step": 2006 + }, + { + "epoch": 0.23, + "learning_rate": 1.7987580175565948e-05, + "loss": 0.481, + "step": 2007 + }, + { + "epoch": 0.23, + "learning_rate": 1.798535252008411e-05, + "loss": 0.4772, + "step": 2008 + }, + { + "epoch": 0.23, + "learning_rate": 1.7983123770430696e-05, + "loss": 0.4951, + "step": 2009 + }, + { + "epoch": 0.23, + "learning_rate": 1.7980893926911092e-05, + "loss": 0.4809, + "step": 2010 + }, + { + "epoch": 0.23, + "learning_rate": 1.7978662989830834e-05, + "loss": 0.4861, + "step": 2011 + }, + { + "epoch": 0.23, + "learning_rate": 1.7976430959495617e-05, + "loss": 0.4623, + "step": 2012 + }, + { + "epoch": 0.23, + "learning_rate": 1.7974197836211275e-05, + "loss": 0.4759, + "step": 2013 + }, + { + "epoch": 0.23, + "learning_rate": 1.7971963620283795e-05, + "loss": 0.4842, + "step": 2014 + }, + { + "epoch": 0.23, + "learning_rate": 1.7969728312019316e-05, + "loss": 0.4847, + "step": 2015 + }, + { + "epoch": 0.23, + "learning_rate": 1.7967491911724125e-05, + "loss": 0.484, + "step": 2016 + }, + { + "epoch": 0.23, + "learning_rate": 1.796525441970466e-05, + "loss": 0.4841, + "step": 2017 + }, + { + "epoch": 0.23, + "learning_rate": 1.7963015836267502e-05, + "loss": 0.4862, + "step": 2018 + }, + { + "epoch": 0.23, + "learning_rate": 1.7960776161719396e-05, + "loss": 0.4713, + "step": 2019 + }, + { + "epoch": 0.23, + "learning_rate": 1.7958535396367218e-05, + "loss": 0.4855, + "step": 2020 + }, + { + "epoch": 0.23, + "learning_rate": 
1.795629354051801e-05, + "loss": 0.4947, + "step": 2021 + }, + { + "epoch": 0.23, + "learning_rate": 1.7954050594478952e-05, + "loss": 0.4709, + "step": 2022 + }, + { + "epoch": 0.23, + "learning_rate": 1.795180655855738e-05, + "loss": 0.4753, + "step": 2023 + }, + { + "epoch": 0.23, + "learning_rate": 1.7949561433060775e-05, + "loss": 0.4592, + "step": 2024 + }, + { + "epoch": 0.23, + "learning_rate": 1.794731521829677e-05, + "loss": 0.479, + "step": 2025 + }, + { + "epoch": 0.23, + "learning_rate": 1.7945067914573147e-05, + "loss": 0.4769, + "step": 2026 + }, + { + "epoch": 0.23, + "learning_rate": 1.7942819522197837e-05, + "loss": 0.4883, + "step": 2027 + }, + { + "epoch": 0.23, + "learning_rate": 1.794057004147892e-05, + "loss": 0.4726, + "step": 2028 + }, + { + "epoch": 0.23, + "learning_rate": 1.793831947272463e-05, + "loss": 0.4931, + "step": 2029 + }, + { + "epoch": 0.23, + "learning_rate": 1.793606781624333e-05, + "loss": 0.4699, + "step": 2030 + }, + { + "epoch": 0.23, + "learning_rate": 1.7933815072343565e-05, + "loss": 0.4965, + "step": 2031 + }, + { + "epoch": 0.23, + "learning_rate": 1.7931561241333998e-05, + "loss": 0.4694, + "step": 2032 + }, + { + "epoch": 0.23, + "learning_rate": 1.7929306323523463e-05, + "loss": 0.4896, + "step": 2033 + }, + { + "epoch": 0.23, + "learning_rate": 1.792705031922093e-05, + "loss": 0.4737, + "step": 2034 + }, + { + "epoch": 0.23, + "learning_rate": 1.792479322873552e-05, + "loss": 0.4702, + "step": 2035 + }, + { + "epoch": 0.23, + "learning_rate": 1.792253505237651e-05, + "loss": 0.5163, + "step": 2036 + }, + { + "epoch": 0.23, + "learning_rate": 1.7920275790453318e-05, + "loss": 0.4697, + "step": 2037 + }, + { + "epoch": 0.23, + "learning_rate": 1.7918015443275517e-05, + "loss": 0.4757, + "step": 2038 + }, + { + "epoch": 0.23, + "learning_rate": 1.7915754011152815e-05, + "loss": 0.4824, + "step": 2039 + }, + { + "epoch": 0.23, + "learning_rate": 1.791349149439509e-05, + "loss": 0.4852, + "step": 2040 + }, + { + 
"epoch": 0.23, + "learning_rate": 1.7911227893312347e-05, + "loss": 0.4749, + "step": 2041 + }, + { + "epoch": 0.23, + "learning_rate": 1.790896320821476e-05, + "loss": 0.485, + "step": 2042 + }, + { + "epoch": 0.23, + "learning_rate": 1.7906697439412634e-05, + "loss": 0.4604, + "step": 2043 + }, + { + "epoch": 0.23, + "learning_rate": 1.790443058721643e-05, + "loss": 0.4646, + "step": 2044 + }, + { + "epoch": 0.23, + "learning_rate": 1.7902162651936766e-05, + "loss": 0.4981, + "step": 2045 + }, + { + "epoch": 0.23, + "learning_rate": 1.789989363388439e-05, + "loss": 0.4722, + "step": 2046 + }, + { + "epoch": 0.23, + "learning_rate": 1.7897623533370212e-05, + "loss": 0.4707, + "step": 2047 + }, + { + "epoch": 0.23, + "learning_rate": 1.7895352350705288e-05, + "loss": 0.4786, + "step": 2048 + }, + { + "epoch": 0.23, + "learning_rate": 1.7893080086200817e-05, + "loss": 0.4885, + "step": 2049 + }, + { + "epoch": 0.23, + "learning_rate": 1.789080674016815e-05, + "loss": 0.4913, + "step": 2050 + }, + { + "epoch": 0.23, + "learning_rate": 1.7888532312918793e-05, + "loss": 0.498, + "step": 2051 + }, + { + "epoch": 0.23, + "learning_rate": 1.7886256804764385e-05, + "loss": 0.4651, + "step": 2052 + }, + { + "epoch": 0.23, + "learning_rate": 1.7883980216016724e-05, + "loss": 0.4745, + "step": 2053 + }, + { + "epoch": 0.23, + "learning_rate": 1.788170254698776e-05, + "loss": 0.4867, + "step": 2054 + }, + { + "epoch": 0.23, + "learning_rate": 1.7879423797989573e-05, + "loss": 0.455, + "step": 2055 + }, + { + "epoch": 0.23, + "learning_rate": 1.787714396933441e-05, + "loss": 0.4727, + "step": 2056 + }, + { + "epoch": 0.24, + "learning_rate": 1.7874863061334658e-05, + "loss": 0.4833, + "step": 2057 + }, + { + "epoch": 0.24, + "learning_rate": 1.7872581074302852e-05, + "loss": 0.494, + "step": 2058 + }, + { + "epoch": 0.24, + "learning_rate": 1.7870298008551674e-05, + "loss": 0.4881, + "step": 2059 + }, + { + "epoch": 0.24, + "learning_rate": 1.786801386439395e-05, + "loss": 
0.4843, + "step": 2060 + }, + { + "epoch": 0.24, + "learning_rate": 1.7865728642142668e-05, + "loss": 0.4787, + "step": 2061 + }, + { + "epoch": 0.24, + "learning_rate": 1.786344234211095e-05, + "loss": 0.4876, + "step": 2062 + }, + { + "epoch": 0.24, + "learning_rate": 1.786115496461207e-05, + "loss": 0.4639, + "step": 2063 + }, + { + "epoch": 0.24, + "learning_rate": 1.7858866509959455e-05, + "loss": 0.4657, + "step": 2064 + }, + { + "epoch": 0.24, + "learning_rate": 1.7856576978466666e-05, + "loss": 0.4768, + "step": 2065 + }, + { + "epoch": 0.24, + "learning_rate": 1.785428637044742e-05, + "loss": 0.4837, + "step": 2066 + }, + { + "epoch": 0.24, + "learning_rate": 1.7851994686215592e-05, + "loss": 0.498, + "step": 2067 + }, + { + "epoch": 0.24, + "learning_rate": 1.7849701926085183e-05, + "loss": 0.4789, + "step": 2068 + }, + { + "epoch": 0.24, + "learning_rate": 1.7847408090370355e-05, + "loss": 0.47, + "step": 2069 + }, + { + "epoch": 0.24, + "learning_rate": 1.784511317938542e-05, + "loss": 0.4868, + "step": 2070 + }, + { + "epoch": 0.24, + "learning_rate": 1.7842817193444823e-05, + "loss": 0.4803, + "step": 2071 + }, + { + "epoch": 0.24, + "learning_rate": 1.7840520132863173e-05, + "loss": 0.4863, + "step": 2072 + }, + { + "epoch": 0.24, + "learning_rate": 1.783822199795522e-05, + "loss": 0.4667, + "step": 2073 + }, + { + "epoch": 0.24, + "learning_rate": 1.7835922789035853e-05, + "loss": 0.4738, + "step": 2074 + }, + { + "epoch": 0.24, + "learning_rate": 1.7833622506420116e-05, + "loss": 0.454, + "step": 2075 + }, + { + "epoch": 0.24, + "learning_rate": 1.7831321150423203e-05, + "loss": 0.5017, + "step": 2076 + }, + { + "epoch": 0.24, + "learning_rate": 1.782901872136045e-05, + "loss": 0.4968, + "step": 2077 + }, + { + "epoch": 0.24, + "learning_rate": 1.7826715219547336e-05, + "loss": 0.4852, + "step": 2078 + }, + { + "epoch": 0.24, + "learning_rate": 1.78244106452995e-05, + "loss": 0.4827, + "step": 2079 + }, + { + "epoch": 0.24, + "learning_rate": 
1.7822104998932715e-05, + "loss": 0.4838, + "step": 2080 + }, + { + "epoch": 0.24, + "learning_rate": 1.7819798280762907e-05, + "loss": 0.4729, + "step": 2081 + }, + { + "epoch": 0.24, + "learning_rate": 1.7817490491106148e-05, + "loss": 0.4647, + "step": 2082 + }, + { + "epoch": 0.24, + "learning_rate": 1.7815181630278656e-05, + "loss": 0.4783, + "step": 2083 + }, + { + "epoch": 0.24, + "learning_rate": 1.78128716985968e-05, + "loss": 0.4797, + "step": 2084 + }, + { + "epoch": 0.24, + "learning_rate": 1.781056069637709e-05, + "loss": 0.5009, + "step": 2085 + }, + { + "epoch": 0.24, + "learning_rate": 1.7808248623936183e-05, + "loss": 0.5092, + "step": 2086 + }, + { + "epoch": 0.24, + "learning_rate": 1.780593548159089e-05, + "loss": 0.4692, + "step": 2087 + }, + { + "epoch": 0.24, + "learning_rate": 1.7803621269658154e-05, + "loss": 0.4904, + "step": 2088 + }, + { + "epoch": 0.24, + "learning_rate": 1.7801305988455085e-05, + "loss": 0.4693, + "step": 2089 + }, + { + "epoch": 0.24, + "learning_rate": 1.779898963829892e-05, + "loss": 0.4656, + "step": 2090 + }, + { + "epoch": 0.24, + "learning_rate": 1.779667221950705e-05, + "loss": 0.4907, + "step": 2091 + }, + { + "epoch": 0.24, + "learning_rate": 1.7794353732397018e-05, + "loss": 0.47, + "step": 2092 + }, + { + "epoch": 0.24, + "learning_rate": 1.7792034177286508e-05, + "loss": 0.4755, + "step": 2093 + }, + { + "epoch": 0.24, + "learning_rate": 1.778971355449335e-05, + "loss": 0.4875, + "step": 2094 + }, + { + "epoch": 0.24, + "learning_rate": 1.7787391864335517e-05, + "loss": 0.4787, + "step": 2095 + }, + { + "epoch": 0.24, + "learning_rate": 1.778506910713114e-05, + "loss": 0.4721, + "step": 2096 + }, + { + "epoch": 0.24, + "learning_rate": 1.778274528319848e-05, + "loss": 0.4827, + "step": 2097 + }, + { + "epoch": 0.24, + "learning_rate": 1.778042039285596e-05, + "loss": 0.462, + "step": 2098 + }, + { + "epoch": 0.24, + "learning_rate": 1.777809443642214e-05, + "loss": 0.4773, + "step": 2099 + }, + { + 
"epoch": 0.24, + "learning_rate": 1.7775767414215726e-05, + "loss": 0.4847, + "step": 2100 + }, + { + "epoch": 0.24, + "learning_rate": 1.7773439326555574e-05, + "loss": 0.4716, + "step": 2101 + }, + { + "epoch": 0.24, + "learning_rate": 1.777111017376068e-05, + "loss": 0.4797, + "step": 2102 + }, + { + "epoch": 0.24, + "learning_rate": 1.7768779956150196e-05, + "loss": 0.4778, + "step": 2103 + }, + { + "epoch": 0.24, + "learning_rate": 1.776644867404341e-05, + "loss": 0.4625, + "step": 2104 + }, + { + "epoch": 0.24, + "learning_rate": 1.776411632775976e-05, + "loss": 0.46, + "step": 2105 + }, + { + "epoch": 0.24, + "learning_rate": 1.7761782917618836e-05, + "loss": 0.4934, + "step": 2106 + }, + { + "epoch": 0.24, + "learning_rate": 1.7759448443940355e-05, + "loss": 0.459, + "step": 2107 + }, + { + "epoch": 0.24, + "learning_rate": 1.77571129070442e-05, + "loss": 0.4785, + "step": 2108 + }, + { + "epoch": 0.24, + "learning_rate": 1.775477630725039e-05, + "loss": 0.4648, + "step": 2109 + }, + { + "epoch": 0.24, + "learning_rate": 1.7752438644879092e-05, + "loss": 0.5045, + "step": 2110 + }, + { + "epoch": 0.24, + "learning_rate": 1.7750099920250616e-05, + "loss": 0.4848, + "step": 2111 + }, + { + "epoch": 0.24, + "learning_rate": 1.774776013368542e-05, + "loss": 0.4724, + "step": 2112 + }, + { + "epoch": 0.24, + "learning_rate": 1.774541928550411e-05, + "loss": 0.4749, + "step": 2113 + }, + { + "epoch": 0.24, + "learning_rate": 1.7743077376027433e-05, + "loss": 0.4872, + "step": 2114 + }, + { + "epoch": 0.24, + "learning_rate": 1.7740734405576283e-05, + "loss": 0.4778, + "step": 2115 + }, + { + "epoch": 0.24, + "learning_rate": 1.7738390374471696e-05, + "loss": 0.4904, + "step": 2116 + }, + { + "epoch": 0.24, + "learning_rate": 1.773604528303486e-05, + "loss": 0.4892, + "step": 2117 + }, + { + "epoch": 0.24, + "learning_rate": 1.7733699131587104e-05, + "loss": 0.4918, + "step": 2118 + }, + { + "epoch": 0.24, + "learning_rate": 1.77313519204499e-05, + "loss": 0.4776, 
+ "step": 2119 + }, + { + "epoch": 0.24, + "learning_rate": 1.7729003649944878e-05, + "loss": 0.4777, + "step": 2120 + }, + { + "epoch": 0.24, + "learning_rate": 1.7726654320393795e-05, + "loss": 0.4935, + "step": 2121 + }, + { + "epoch": 0.24, + "learning_rate": 1.772430393211856e-05, + "loss": 0.482, + "step": 2122 + }, + { + "epoch": 0.24, + "learning_rate": 1.7721952485441232e-05, + "loss": 0.4718, + "step": 2123 + }, + { + "epoch": 0.24, + "learning_rate": 1.7719599980684016e-05, + "loss": 0.4591, + "step": 2124 + }, + { + "epoch": 0.24, + "learning_rate": 1.7717246418169252e-05, + "loss": 0.481, + "step": 2125 + }, + { + "epoch": 0.24, + "learning_rate": 1.7714891798219432e-05, + "loss": 0.4851, + "step": 2126 + }, + { + "epoch": 0.24, + "learning_rate": 1.771253612115719e-05, + "loss": 0.5005, + "step": 2127 + }, + { + "epoch": 0.24, + "learning_rate": 1.7710179387305308e-05, + "loss": 0.4734, + "step": 2128 + }, + { + "epoch": 0.24, + "learning_rate": 1.7707821596986715e-05, + "loss": 0.4805, + "step": 2129 + }, + { + "epoch": 0.24, + "learning_rate": 1.7705462750524474e-05, + "loss": 0.4809, + "step": 2130 + }, + { + "epoch": 0.24, + "learning_rate": 1.77031028482418e-05, + "loss": 0.4728, + "step": 2131 + }, + { + "epoch": 0.24, + "learning_rate": 1.770074189046206e-05, + "loss": 0.4759, + "step": 2132 + }, + { + "epoch": 0.24, + "learning_rate": 1.7698379877508755e-05, + "loss": 0.4709, + "step": 2133 + }, + { + "epoch": 0.24, + "learning_rate": 1.7696016809705525e-05, + "loss": 0.4993, + "step": 2134 + }, + { + "epoch": 0.24, + "learning_rate": 1.7693652687376173e-05, + "loss": 0.4765, + "step": 2135 + }, + { + "epoch": 0.24, + "learning_rate": 1.769128751084463e-05, + "loss": 0.4845, + "step": 2136 + }, + { + "epoch": 0.24, + "learning_rate": 1.7688921280434984e-05, + "loss": 0.4776, + "step": 2137 + }, + { + "epoch": 0.24, + "learning_rate": 1.768655399647146e-05, + "loss": 0.4779, + "step": 2138 + }, + { + "epoch": 0.24, + "learning_rate": 
1.7684185659278423e-05, + "loss": 0.4532, + "step": 2139 + }, + { + "epoch": 0.24, + "learning_rate": 1.7681816269180394e-05, + "loss": 0.4901, + "step": 2140 + }, + { + "epoch": 0.24, + "learning_rate": 1.7679445826502033e-05, + "loss": 0.4582, + "step": 2141 + }, + { + "epoch": 0.24, + "learning_rate": 1.767707433156814e-05, + "loss": 0.4821, + "step": 2142 + }, + { + "epoch": 0.24, + "learning_rate": 1.767470178470366e-05, + "loss": 0.4743, + "step": 2143 + }, + { + "epoch": 0.25, + "learning_rate": 1.7672328186233692e-05, + "loss": 0.5039, + "step": 2144 + }, + { + "epoch": 0.25, + "learning_rate": 1.766995353648347e-05, + "loss": 0.4517, + "step": 2145 + }, + { + "epoch": 0.25, + "learning_rate": 1.766757783577837e-05, + "loss": 0.4928, + "step": 2146 + }, + { + "epoch": 0.25, + "learning_rate": 1.766520108444392e-05, + "loss": 0.4828, + "step": 2147 + }, + { + "epoch": 0.25, + "learning_rate": 1.7662823282805788e-05, + "loss": 0.4716, + "step": 2148 + }, + { + "epoch": 0.25, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4756, + "step": 2149 + }, + { + "epoch": 0.25, + "learning_rate": 1.765806452992186e-05, + "loss": 0.4674, + "step": 2150 + }, + { + "epoch": 0.25, + "learning_rate": 1.7655683579328128e-05, + "loss": 0.4787, + "step": 2151 + }, + { + "epoch": 0.25, + "learning_rate": 1.765330157973482e-05, + "loss": 0.5245, + "step": 2152 + }, + { + "epoch": 0.25, + "learning_rate": 1.7650918531468326e-05, + "loss": 0.4759, + "step": 2153 + }, + { + "epoch": 0.25, + "learning_rate": 1.7648534434855177e-05, + "loss": 0.4719, + "step": 2154 + }, + { + "epoch": 0.25, + "learning_rate": 1.764614929022205e-05, + "loss": 0.4807, + "step": 2155 + }, + { + "epoch": 0.25, + "learning_rate": 1.764376309789576e-05, + "loss": 0.4773, + "step": 2156 + }, + { + "epoch": 0.25, + "learning_rate": 1.764137585820327e-05, + "loss": 0.4787, + "step": 2157 + }, + { + "epoch": 0.25, + "learning_rate": 1.7638987571471685e-05, + "loss": 0.4781, + "step": 2158 + }, + { + 
"epoch": 0.25, + "learning_rate": 1.7636598238028253e-05, + "loss": 0.4679, + "step": 2159 + }, + { + "epoch": 0.25, + "learning_rate": 1.7634207858200366e-05, + "loss": 0.4922, + "step": 2160 + }, + { + "epoch": 0.25, + "learning_rate": 1.763181643231556e-05, + "loss": 0.491, + "step": 2161 + }, + { + "epoch": 0.25, + "learning_rate": 1.7629423960701513e-05, + "loss": 0.4677, + "step": 2162 + }, + { + "epoch": 0.25, + "learning_rate": 1.7627030443686047e-05, + "loss": 0.4873, + "step": 2163 + }, + { + "epoch": 0.25, + "learning_rate": 1.762463588159713e-05, + "loss": 0.4677, + "step": 2164 + }, + { + "epoch": 0.25, + "learning_rate": 1.762224027476287e-05, + "loss": 0.4581, + "step": 2165 + }, + { + "epoch": 0.25, + "learning_rate": 1.761984362351151e-05, + "loss": 0.5085, + "step": 2166 + }, + { + "epoch": 0.25, + "learning_rate": 1.7617445928171458e-05, + "loss": 0.4582, + "step": 2167 + }, + { + "epoch": 0.25, + "learning_rate": 1.761504718907124e-05, + "loss": 0.499, + "step": 2168 + }, + { + "epoch": 0.25, + "learning_rate": 1.7612647406539548e-05, + "loss": 0.4789, + "step": 2169 + }, + { + "epoch": 0.25, + "learning_rate": 1.76102465809052e-05, + "loss": 0.4757, + "step": 2170 + }, + { + "epoch": 0.25, + "learning_rate": 1.760784471249716e-05, + "loss": 0.4804, + "step": 2171 + }, + { + "epoch": 0.25, + "learning_rate": 1.760544180164454e-05, + "loss": 0.4877, + "step": 2172 + }, + { + "epoch": 0.25, + "learning_rate": 1.7603037848676593e-05, + "loss": 0.4686, + "step": 2173 + }, + { + "epoch": 0.25, + "learning_rate": 1.7600632853922713e-05, + "loss": 0.4784, + "step": 2174 + }, + { + "epoch": 0.25, + "learning_rate": 1.7598226817712442e-05, + "loss": 0.4851, + "step": 2175 + }, + { + "epoch": 0.25, + "learning_rate": 1.7595819740375457e-05, + "loss": 0.4848, + "step": 2176 + }, + { + "epoch": 0.25, + "learning_rate": 1.7593411622241584e-05, + "loss": 0.4683, + "step": 2177 + }, + { + "epoch": 0.25, + "learning_rate": 1.7591002463640784e-05, + "loss": 
0.4815, + "step": 2178 + }, + { + "epoch": 0.25, + "learning_rate": 1.758859226490317e-05, + "loss": 0.5039, + "step": 2179 + }, + { + "epoch": 0.25, + "learning_rate": 1.7586181026358987e-05, + "loss": 0.4845, + "step": 2180 + }, + { + "epoch": 0.25, + "learning_rate": 1.758376874833864e-05, + "loss": 0.4929, + "step": 2181 + }, + { + "epoch": 0.25, + "learning_rate": 1.7581355431172653e-05, + "loss": 0.4848, + "step": 2182 + }, + { + "epoch": 0.25, + "learning_rate": 1.757894107519171e-05, + "loss": 0.4796, + "step": 2183 + }, + { + "epoch": 0.25, + "learning_rate": 1.757652568072663e-05, + "loss": 0.4889, + "step": 2184 + }, + { + "epoch": 0.25, + "learning_rate": 1.757410924810838e-05, + "loss": 0.4622, + "step": 2185 + }, + { + "epoch": 0.25, + "learning_rate": 1.757169177766806e-05, + "loss": 0.4766, + "step": 2186 + }, + { + "epoch": 0.25, + "learning_rate": 1.7569273269736918e-05, + "loss": 0.4708, + "step": 2187 + }, + { + "epoch": 0.25, + "learning_rate": 1.756685372464635e-05, + "loss": 0.537, + "step": 2188 + }, + { + "epoch": 0.25, + "learning_rate": 1.7564433142727882e-05, + "loss": 0.4926, + "step": 2189 + }, + { + "epoch": 0.25, + "learning_rate": 1.7562011524313187e-05, + "loss": 0.4633, + "step": 2190 + }, + { + "epoch": 0.25, + "learning_rate": 1.755958886973408e-05, + "loss": 0.4688, + "step": 2191 + }, + { + "epoch": 0.25, + "learning_rate": 1.7557165179322522e-05, + "loss": 0.4958, + "step": 2192 + }, + { + "epoch": 0.25, + "learning_rate": 1.7554740453410617e-05, + "loss": 0.4528, + "step": 2193 + }, + { + "epoch": 0.25, + "learning_rate": 1.75523146923306e-05, + "loss": 0.4919, + "step": 2194 + }, + { + "epoch": 0.25, + "learning_rate": 1.7549887896414853e-05, + "loss": 0.4762, + "step": 2195 + }, + { + "epoch": 0.25, + "learning_rate": 1.7547460065995903e-05, + "loss": 0.4834, + "step": 2196 + }, + { + "epoch": 0.25, + "learning_rate": 1.754503120140642e-05, + "loss": 0.5091, + "step": 2197 + }, + { + "epoch": 0.25, + "learning_rate": 
1.7542601302979213e-05, + "loss": 0.4773, + "step": 2198 + }, + { + "epoch": 0.25, + "learning_rate": 1.7540170371047228e-05, + "loss": 0.4704, + "step": 2199 + }, + { + "epoch": 0.25, + "learning_rate": 1.753773840594356e-05, + "loss": 0.4855, + "step": 2200 + }, + { + "epoch": 0.25, + "learning_rate": 1.753530540800144e-05, + "loss": 0.48, + "step": 2201 + }, + { + "epoch": 0.25, + "learning_rate": 1.7532871377554243e-05, + "loss": 0.4906, + "step": 2202 + }, + { + "epoch": 0.25, + "learning_rate": 1.7530436314935492e-05, + "loss": 0.4709, + "step": 2203 + }, + { + "epoch": 0.25, + "learning_rate": 1.7528000220478836e-05, + "loss": 0.4907, + "step": 2204 + }, + { + "epoch": 0.25, + "learning_rate": 1.7525563094518078e-05, + "loss": 0.4629, + "step": 2205 + }, + { + "epoch": 0.25, + "learning_rate": 1.7523124937387162e-05, + "loss": 0.4857, + "step": 2206 + }, + { + "epoch": 0.25, + "learning_rate": 1.7520685749420164e-05, + "loss": 0.4732, + "step": 2207 + }, + { + "epoch": 0.25, + "learning_rate": 1.7518245530951315e-05, + "loss": 0.4853, + "step": 2208 + }, + { + "epoch": 0.25, + "learning_rate": 1.7515804282314974e-05, + "loss": 0.4757, + "step": 2209 + }, + { + "epoch": 0.25, + "learning_rate": 1.751336200384564e-05, + "loss": 0.48, + "step": 2210 + }, + { + "epoch": 0.25, + "learning_rate": 1.751091869587797e-05, + "loss": 0.4729, + "step": 2211 + }, + { + "epoch": 0.25, + "learning_rate": 1.7508474358746753e-05, + "loss": 0.4996, + "step": 2212 + }, + { + "epoch": 0.25, + "learning_rate": 1.7506028992786912e-05, + "loss": 0.4631, + "step": 2213 + }, + { + "epoch": 0.25, + "learning_rate": 1.7503582598333517e-05, + "loss": 0.4687, + "step": 2214 + }, + { + "epoch": 0.25, + "learning_rate": 1.750113517572178e-05, + "loss": 0.4873, + "step": 2215 + }, + { + "epoch": 0.25, + "learning_rate": 1.749868672528705e-05, + "loss": 0.474, + "step": 2216 + }, + { + "epoch": 0.25, + "learning_rate": 1.7496237247364827e-05, + "loss": 0.4696, + "step": 2217 + }, + { + 
"epoch": 0.25, + "learning_rate": 1.7493786742290734e-05, + "loss": 0.4869, + "step": 2218 + }, + { + "epoch": 0.25, + "learning_rate": 1.7491335210400554e-05, + "loss": 0.4751, + "step": 2219 + }, + { + "epoch": 0.25, + "learning_rate": 1.7488882652030193e-05, + "loss": 0.5086, + "step": 2220 + }, + { + "epoch": 0.25, + "learning_rate": 1.748642906751571e-05, + "loss": 0.4697, + "step": 2221 + }, + { + "epoch": 0.25, + "learning_rate": 1.7483974457193307e-05, + "loss": 0.4575, + "step": 2222 + }, + { + "epoch": 0.25, + "learning_rate": 1.748151882139931e-05, + "loss": 0.482, + "step": 2223 + }, + { + "epoch": 0.25, + "learning_rate": 1.7479062160470205e-05, + "loss": 0.4892, + "step": 2224 + }, + { + "epoch": 0.25, + "learning_rate": 1.74766044747426e-05, + "loss": 0.4507, + "step": 2225 + }, + { + "epoch": 0.25, + "learning_rate": 1.7474145764553262e-05, + "loss": 0.4824, + "step": 2226 + }, + { + "epoch": 0.25, + "learning_rate": 1.7471686030239082e-05, + "loss": 0.4685, + "step": 2227 + }, + { + "epoch": 0.25, + "learning_rate": 1.7469225272137104e-05, + "loss": 0.5125, + "step": 2228 + }, + { + "epoch": 0.25, + "learning_rate": 1.7466763490584504e-05, + "loss": 0.4813, + "step": 2229 + }, + { + "epoch": 0.25, + "learning_rate": 1.7464300685918602e-05, + "loss": 0.4821, + "step": 2230 + }, + { + "epoch": 0.25, + "learning_rate": 1.7461836858476858e-05, + "loss": 0.4607, + "step": 2231 + }, + { + "epoch": 0.26, + "learning_rate": 1.745937200859687e-05, + "loss": 0.4742, + "step": 2232 + }, + { + "epoch": 0.26, + "learning_rate": 1.7456906136616374e-05, + "loss": 0.47, + "step": 2233 + }, + { + "epoch": 0.26, + "learning_rate": 1.7454439242873257e-05, + "loss": 0.4881, + "step": 2234 + }, + { + "epoch": 0.26, + "learning_rate": 1.745197132770553e-05, + "loss": 0.4716, + "step": 2235 + }, + { + "epoch": 0.26, + "learning_rate": 1.7449502391451362e-05, + "loss": 0.5046, + "step": 2236 + }, + { + "epoch": 0.26, + "learning_rate": 1.7447032434449045e-05, + "loss": 
0.48, + "step": 2237 + }, + { + "epoch": 0.26, + "learning_rate": 1.7444561457037022e-05, + "loss": 0.4854, + "step": 2238 + }, + { + "epoch": 0.26, + "learning_rate": 1.744208945955387e-05, + "loss": 0.4625, + "step": 2239 + }, + { + "epoch": 0.26, + "learning_rate": 1.743961644233831e-05, + "loss": 0.4795, + "step": 2240 + }, + { + "epoch": 0.26, + "learning_rate": 1.7437142405729196e-05, + "loss": 0.4831, + "step": 2241 + }, + { + "epoch": 0.26, + "learning_rate": 1.743466735006553e-05, + "loss": 0.4779, + "step": 2242 + }, + { + "epoch": 0.26, + "learning_rate": 1.7432191275686454e-05, + "loss": 0.4721, + "step": 2243 + }, + { + "epoch": 0.26, + "learning_rate": 1.7429714182931238e-05, + "loss": 0.4879, + "step": 2244 + }, + { + "epoch": 0.26, + "learning_rate": 1.7427236072139306e-05, + "loss": 0.4855, + "step": 2245 + }, + { + "epoch": 0.26, + "learning_rate": 1.7424756943650203e-05, + "loss": 0.4936, + "step": 2246 + }, + { + "epoch": 0.26, + "learning_rate": 1.7422276797803638e-05, + "loss": 0.4762, + "step": 2247 + }, + { + "epoch": 0.26, + "learning_rate": 1.741979563493944e-05, + "loss": 0.4638, + "step": 2248 + }, + { + "epoch": 0.26, + "learning_rate": 1.741731345539758e-05, + "loss": 0.4827, + "step": 2249 + }, + { + "epoch": 0.26, + "learning_rate": 1.741483025951818e-05, + "loss": 0.4847, + "step": 2250 + }, + { + "epoch": 0.26, + "learning_rate": 1.7412346047641485e-05, + "loss": 0.4682, + "step": 2251 + }, + { + "epoch": 0.26, + "learning_rate": 1.74098608201079e-05, + "loss": 0.4909, + "step": 2252 + }, + { + "epoch": 0.26, + "learning_rate": 1.7407374577257945e-05, + "loss": 0.479, + "step": 2253 + }, + { + "epoch": 0.26, + "learning_rate": 1.7404887319432294e-05, + "loss": 0.5104, + "step": 2254 + }, + { + "epoch": 0.26, + "learning_rate": 1.740239904697176e-05, + "loss": 0.4657, + "step": 2255 + }, + { + "epoch": 0.26, + "learning_rate": 1.7399909760217285e-05, + "loss": 0.4881, + "step": 2256 + }, + { + "epoch": 0.26, + "learning_rate": 
1.7397419459509962e-05, + "loss": 0.4684, + "step": 2257 + }, + { + "epoch": 0.26, + "learning_rate": 1.739492814519102e-05, + "loss": 0.4574, + "step": 2258 + }, + { + "epoch": 0.26, + "learning_rate": 1.739243581760182e-05, + "loss": 0.4794, + "step": 2259 + }, + { + "epoch": 0.26, + "learning_rate": 1.738994247708387e-05, + "loss": 0.4778, + "step": 2260 + }, + { + "epoch": 0.26, + "learning_rate": 1.7387448123978813e-05, + "loss": 0.489, + "step": 2261 + }, + { + "epoch": 0.26, + "learning_rate": 1.7384952758628423e-05, + "loss": 0.4871, + "step": 2262 + }, + { + "epoch": 0.26, + "learning_rate": 1.738245638137463e-05, + "loss": 0.4855, + "step": 2263 + }, + { + "epoch": 0.26, + "learning_rate": 1.7379958992559494e-05, + "loss": 0.471, + "step": 2264 + }, + { + "epoch": 0.26, + "learning_rate": 1.737746059252521e-05, + "loss": 0.4735, + "step": 2265 + }, + { + "epoch": 0.26, + "learning_rate": 1.737496118161411e-05, + "loss": 0.4721, + "step": 2266 + }, + { + "epoch": 0.26, + "learning_rate": 1.7372460760168676e-05, + "loss": 0.492, + "step": 2267 + }, + { + "epoch": 0.26, + "learning_rate": 1.736995932853152e-05, + "loss": 0.4756, + "step": 2268 + }, + { + "epoch": 0.26, + "learning_rate": 1.736745688704539e-05, + "loss": 0.5099, + "step": 2269 + }, + { + "epoch": 0.26, + "learning_rate": 1.736495343605318e-05, + "loss": 0.476, + "step": 2270 + }, + { + "epoch": 0.26, + "learning_rate": 1.736244897589792e-05, + "loss": 0.4761, + "step": 2271 + }, + { + "epoch": 0.26, + "learning_rate": 1.7359943506922775e-05, + "loss": 0.4725, + "step": 2272 + }, + { + "epoch": 0.26, + "learning_rate": 1.735743702947105e-05, + "loss": 0.477, + "step": 2273 + }, + { + "epoch": 0.26, + "learning_rate": 1.7354929543886186e-05, + "loss": 0.4682, + "step": 2274 + }, + { + "epoch": 0.26, + "learning_rate": 1.7352421050511767e-05, + "loss": 0.4591, + "step": 2275 + }, + { + "epoch": 0.26, + "learning_rate": 1.734991154969152e-05, + "loss": 0.4755, + "step": 2276 + }, + { + "epoch": 
0.26, + "learning_rate": 1.7347401041769284e-05, + "loss": 0.4902, + "step": 2277 + }, + { + "epoch": 0.26, + "learning_rate": 1.7344889527089074e-05, + "loss": 0.4777, + "step": 2278 + }, + { + "epoch": 0.26, + "learning_rate": 1.7342377005995014e-05, + "loss": 0.4873, + "step": 2279 + }, + { + "epoch": 0.26, + "learning_rate": 1.733986347883138e-05, + "loss": 0.4777, + "step": 2280 + }, + { + "epoch": 0.26, + "learning_rate": 1.7337348945942572e-05, + "loss": 0.4675, + "step": 2281 + }, + { + "epoch": 0.26, + "learning_rate": 1.7334833407673145e-05, + "loss": 0.4943, + "step": 2282 + }, + { + "epoch": 0.26, + "learning_rate": 1.7332316864367785e-05, + "loss": 0.4708, + "step": 2283 + }, + { + "epoch": 0.26, + "learning_rate": 1.732979931637131e-05, + "loss": 0.4695, + "step": 2284 + }, + { + "epoch": 0.26, + "learning_rate": 1.7327280764028683e-05, + "loss": 0.488, + "step": 2285 + }, + { + "epoch": 0.26, + "learning_rate": 1.7324761207685005e-05, + "loss": 0.5035, + "step": 2286 + }, + { + "epoch": 0.26, + "learning_rate": 1.7322240647685503e-05, + "loss": 0.4726, + "step": 2287 + }, + { + "epoch": 0.26, + "learning_rate": 1.7319719084375556e-05, + "loss": 0.4609, + "step": 2288 + }, + { + "epoch": 0.26, + "learning_rate": 1.7317196518100672e-05, + "loss": 0.488, + "step": 2289 + }, + { + "epoch": 0.26, + "learning_rate": 1.7314672949206502e-05, + "loss": 0.474, + "step": 2290 + }, + { + "epoch": 0.26, + "learning_rate": 1.731214837803883e-05, + "loss": 0.4646, + "step": 2291 + }, + { + "epoch": 0.26, + "learning_rate": 1.7309622804943573e-05, + "loss": 0.4767, + "step": 2292 + }, + { + "epoch": 0.26, + "learning_rate": 1.73070962302668e-05, + "loss": 0.5126, + "step": 2293 + }, + { + "epoch": 0.26, + "learning_rate": 1.7304568654354703e-05, + "loss": 0.4779, + "step": 2294 + }, + { + "epoch": 0.26, + "learning_rate": 1.7302040077553616e-05, + "loss": 0.4769, + "step": 2295 + }, + { + "epoch": 0.26, + "learning_rate": 1.7299510500210015e-05, + "loss": 0.4764, + 
"step": 2296 + }, + { + "epoch": 0.26, + "learning_rate": 1.7296979922670502e-05, + "loss": 0.4786, + "step": 2297 + }, + { + "epoch": 0.26, + "learning_rate": 1.729444834528183e-05, + "loss": 0.4735, + "step": 2298 + }, + { + "epoch": 0.26, + "learning_rate": 1.7291915768390875e-05, + "loss": 0.4773, + "step": 2299 + }, + { + "epoch": 0.26, + "learning_rate": 1.728938219234466e-05, + "loss": 0.4762, + "step": 2300 + }, + { + "epoch": 0.26, + "learning_rate": 1.728684761749034e-05, + "loss": 0.4629, + "step": 2301 + }, + { + "epoch": 0.26, + "learning_rate": 1.728431204417521e-05, + "loss": 0.4624, + "step": 2302 + }, + { + "epoch": 0.26, + "learning_rate": 1.7281775472746695e-05, + "loss": 0.4793, + "step": 2303 + }, + { + "epoch": 0.26, + "learning_rate": 1.727923790355237e-05, + "loss": 0.4707, + "step": 2304 + }, + { + "epoch": 0.26, + "learning_rate": 1.7276699336939936e-05, + "loss": 0.4638, + "step": 2305 + }, + { + "epoch": 0.26, + "learning_rate": 1.7274159773257227e-05, + "loss": 0.492, + "step": 2306 + }, + { + "epoch": 0.26, + "learning_rate": 1.7271619212852232e-05, + "loss": 0.4632, + "step": 2307 + }, + { + "epoch": 0.26, + "learning_rate": 1.726907765607305e-05, + "loss": 0.479, + "step": 2308 + }, + { + "epoch": 0.26, + "learning_rate": 1.7266535103267943e-05, + "loss": 0.4955, + "step": 2309 + }, + { + "epoch": 0.26, + "learning_rate": 1.726399155478529e-05, + "loss": 0.471, + "step": 2310 + }, + { + "epoch": 0.26, + "learning_rate": 1.7261447010973623e-05, + "loss": 0.4869, + "step": 2311 + }, + { + "epoch": 0.26, + "learning_rate": 1.7258901472181587e-05, + "loss": 0.4765, + "step": 2312 + }, + { + "epoch": 0.26, + "learning_rate": 1.725635493875799e-05, + "loss": 0.4795, + "step": 2313 + }, + { + "epoch": 0.26, + "learning_rate": 1.725380741105176e-05, + "loss": 0.4612, + "step": 2314 + }, + { + "epoch": 0.26, + "learning_rate": 1.7251258889411964e-05, + "loss": 0.48, + "step": 2315 + }, + { + "epoch": 0.26, + "learning_rate": 
1.724870937418781e-05, + "loss": 0.483, + "step": 2316 + }, + { + "epoch": 0.26, + "learning_rate": 1.7246158865728634e-05, + "loss": 0.4708, + "step": 2317 + }, + { + "epoch": 0.26, + "learning_rate": 1.7243607364383916e-05, + "loss": 0.4752, + "step": 2318 + }, + { + "epoch": 0.27, + "learning_rate": 1.7241054870503262e-05, + "loss": 0.4801, + "step": 2319 + }, + { + "epoch": 0.27, + "learning_rate": 1.723850138443643e-05, + "loss": 0.4739, + "step": 2320 + }, + { + "epoch": 0.27, + "learning_rate": 1.72359469065333e-05, + "loss": 0.4771, + "step": 2321 + }, + { + "epoch": 0.27, + "learning_rate": 1.723339143714389e-05, + "loss": 0.4811, + "step": 2322 + }, + { + "epoch": 0.27, + "learning_rate": 1.7230834976618364e-05, + "loss": 0.5013, + "step": 2323 + }, + { + "epoch": 0.27, + "learning_rate": 1.7228277525307007e-05, + "loss": 0.4576, + "step": 2324 + }, + { + "epoch": 0.27, + "learning_rate": 1.7225719083560246e-05, + "loss": 0.4736, + "step": 2325 + }, + { + "epoch": 0.27, + "learning_rate": 1.7223159651728653e-05, + "loss": 0.4784, + "step": 2326 + }, + { + "epoch": 0.27, + "learning_rate": 1.7220599230162917e-05, + "loss": 0.4742, + "step": 2327 + }, + { + "epoch": 0.27, + "learning_rate": 1.7218037819213883e-05, + "loss": 0.4799, + "step": 2328 + }, + { + "epoch": 0.27, + "learning_rate": 1.7215475419232516e-05, + "loss": 0.4987, + "step": 2329 + }, + { + "epoch": 0.27, + "learning_rate": 1.7212912030569923e-05, + "loss": 0.4768, + "step": 2330 + }, + { + "epoch": 0.27, + "learning_rate": 1.7210347653577343e-05, + "loss": 0.4819, + "step": 2331 + }, + { + "epoch": 0.27, + "learning_rate": 1.7207782288606154e-05, + "loss": 0.4671, + "step": 2332 + }, + { + "epoch": 0.27, + "learning_rate": 1.720521593600787e-05, + "loss": 0.4661, + "step": 2333 + }, + { + "epoch": 0.27, + "learning_rate": 1.7202648596134143e-05, + "loss": 0.4803, + "step": 2334 + }, + { + "epoch": 0.27, + "learning_rate": 1.7200080269336745e-05, + "loss": 0.4671, + "step": 2335 + }, + { + 
"epoch": 0.27, + "learning_rate": 1.71975109559676e-05, + "loss": 0.4705, + "step": 2336 + }, + { + "epoch": 0.27, + "learning_rate": 1.7194940656378763e-05, + "loss": 0.4898, + "step": 2337 + }, + { + "epoch": 0.27, + "learning_rate": 1.7192369370922423e-05, + "loss": 0.4762, + "step": 2338 + }, + { + "epoch": 0.27, + "learning_rate": 1.7189797099950895e-05, + "loss": 0.4991, + "step": 2339 + }, + { + "epoch": 0.27, + "learning_rate": 1.7187223843816648e-05, + "loss": 0.4784, + "step": 2340 + }, + { + "epoch": 0.27, + "learning_rate": 1.7184649602872274e-05, + "loss": 0.4749, + "step": 2341 + }, + { + "epoch": 0.27, + "learning_rate": 1.7182074377470494e-05, + "loss": 0.4777, + "step": 2342 + }, + { + "epoch": 0.27, + "learning_rate": 1.717949816796418e-05, + "loss": 0.4733, + "step": 2343 + }, + { + "epoch": 0.27, + "learning_rate": 1.7176920974706318e-05, + "loss": 0.4914, + "step": 2344 + }, + { + "epoch": 0.27, + "learning_rate": 1.7174342798050056e-05, + "loss": 0.4806, + "step": 2345 + }, + { + "epoch": 0.27, + "learning_rate": 1.7171763638348653e-05, + "loss": 0.4793, + "step": 2346 + }, + { + "epoch": 0.27, + "learning_rate": 1.7169183495955516e-05, + "loss": 0.4827, + "step": 2347 + }, + { + "epoch": 0.27, + "learning_rate": 1.7166602371224178e-05, + "loss": 0.4725, + "step": 2348 + }, + { + "epoch": 0.27, + "learning_rate": 1.716402026450831e-05, + "loss": 0.4927, + "step": 2349 + }, + { + "epoch": 0.27, + "learning_rate": 1.7161437176161727e-05, + "loss": 0.4618, + "step": 2350 + }, + { + "epoch": 0.27, + "learning_rate": 1.7158853106538358e-05, + "loss": 0.4854, + "step": 2351 + }, + { + "epoch": 0.27, + "learning_rate": 1.7156268055992286e-05, + "loss": 0.4853, + "step": 2352 + }, + { + "epoch": 0.27, + "learning_rate": 1.7153682024877716e-05, + "loss": 0.4795, + "step": 2353 + }, + { + "epoch": 0.27, + "learning_rate": 1.7151095013548996e-05, + "loss": 0.4838, + "step": 2354 + }, + { + "epoch": 0.27, + "learning_rate": 1.7148507022360602e-05, + 
"loss": 0.4743, + "step": 2355 + }, + { + "epoch": 0.27, + "learning_rate": 1.7145918051667147e-05, + "loss": 0.4669, + "step": 2356 + }, + { + "epoch": 0.27, + "learning_rate": 1.714332810182338e-05, + "loss": 0.4712, + "step": 2357 + }, + { + "epoch": 0.27, + "learning_rate": 1.7140737173184178e-05, + "loss": 0.4703, + "step": 2358 + }, + { + "epoch": 0.27, + "learning_rate": 1.713814526610456e-05, + "loss": 0.48, + "step": 2359 + }, + { + "epoch": 0.27, + "learning_rate": 1.713555238093967e-05, + "loss": 0.4717, + "step": 2360 + }, + { + "epoch": 0.27, + "learning_rate": 1.7132958518044797e-05, + "loss": 0.4669, + "step": 2361 + }, + { + "epoch": 0.27, + "learning_rate": 1.713036367777535e-05, + "loss": 0.4762, + "step": 2362 + }, + { + "epoch": 0.27, + "learning_rate": 1.7127767860486892e-05, + "loss": 0.4835, + "step": 2363 + }, + { + "epoch": 0.27, + "learning_rate": 1.71251710665351e-05, + "loss": 0.4753, + "step": 2364 + }, + { + "epoch": 0.27, + "learning_rate": 1.7122573296275788e-05, + "loss": 0.4621, + "step": 2365 + }, + { + "epoch": 0.27, + "learning_rate": 1.711997455006492e-05, + "loss": 0.4649, + "step": 2366 + }, + { + "epoch": 0.27, + "learning_rate": 1.711737482825858e-05, + "loss": 0.4877, + "step": 2367 + }, + { + "epoch": 0.27, + "learning_rate": 1.7114774131212983e-05, + "loss": 0.4709, + "step": 2368 + }, + { + "epoch": 0.27, + "learning_rate": 1.7112172459284478e-05, + "loss": 0.4921, + "step": 2369 + }, + { + "epoch": 0.27, + "learning_rate": 1.7109569812829565e-05, + "loss": 0.4739, + "step": 2370 + }, + { + "epoch": 0.27, + "learning_rate": 1.710696619220486e-05, + "loss": 0.4971, + "step": 2371 + }, + { + "epoch": 0.27, + "learning_rate": 1.7104361597767107e-05, + "loss": 0.4782, + "step": 2372 + }, + { + "epoch": 0.27, + "learning_rate": 1.7101756029873208e-05, + "loss": 0.4725, + "step": 2373 + }, + { + "epoch": 0.27, + "learning_rate": 1.7099149488880174e-05, + "loss": 0.4709, + "step": 2374 + }, + { + "epoch": 0.27, + 
"learning_rate": 1.709654197514517e-05, + "loss": 0.4594, + "step": 2375 + }, + { + "epoch": 0.27, + "learning_rate": 1.709393348902547e-05, + "loss": 0.4697, + "step": 2376 + }, + { + "epoch": 0.27, + "learning_rate": 1.7091324030878504e-05, + "loss": 0.4889, + "step": 2377 + }, + { + "epoch": 0.27, + "learning_rate": 1.7088713601061823e-05, + "loss": 0.4615, + "step": 2378 + }, + { + "epoch": 0.27, + "learning_rate": 1.7086102199933116e-05, + "loss": 0.4891, + "step": 2379 + }, + { + "epoch": 0.27, + "learning_rate": 1.7083489827850202e-05, + "loss": 0.4881, + "step": 2380 + }, + { + "epoch": 0.27, + "learning_rate": 1.7080876485171035e-05, + "loss": 0.4834, + "step": 2381 + }, + { + "epoch": 0.27, + "learning_rate": 1.70782621722537e-05, + "loss": 0.4731, + "step": 2382 + }, + { + "epoch": 0.27, + "learning_rate": 1.7075646889456415e-05, + "loss": 0.5067, + "step": 2383 + }, + { + "epoch": 0.27, + "learning_rate": 1.7073030637137535e-05, + "loss": 0.4691, + "step": 2384 + }, + { + "epoch": 0.27, + "learning_rate": 1.7070413415655548e-05, + "loss": 0.4868, + "step": 2385 + }, + { + "epoch": 0.27, + "learning_rate": 1.7067795225369063e-05, + "loss": 0.4821, + "step": 2386 + }, + { + "epoch": 0.27, + "learning_rate": 1.7065176066636836e-05, + "loss": 0.4938, + "step": 2387 + }, + { + "epoch": 0.27, + "learning_rate": 1.706255593981775e-05, + "loss": 0.4777, + "step": 2388 + }, + { + "epoch": 0.27, + "learning_rate": 1.7059934845270826e-05, + "loss": 0.5058, + "step": 2389 + }, + { + "epoch": 0.27, + "learning_rate": 1.70573127833552e-05, + "loss": 0.4644, + "step": 2390 + }, + { + "epoch": 0.27, + "learning_rate": 1.705468975443016e-05, + "loss": 0.4899, + "step": 2391 + }, + { + "epoch": 0.27, + "learning_rate": 1.7052065758855123e-05, + "loss": 0.4905, + "step": 2392 + }, + { + "epoch": 0.27, + "learning_rate": 1.704944079698963e-05, + "loss": 0.4652, + "step": 2393 + }, + { + "epoch": 0.27, + "learning_rate": 1.704681486919336e-05, + "loss": 0.4839, + "step": 
2394 + }, + { + "epoch": 0.27, + "learning_rate": 1.7044187975826126e-05, + "loss": 0.4743, + "step": 2395 + }, + { + "epoch": 0.27, + "learning_rate": 1.704156011724787e-05, + "loss": 0.4723, + "step": 2396 + }, + { + "epoch": 0.27, + "learning_rate": 1.7038931293818665e-05, + "loss": 0.4885, + "step": 2397 + }, + { + "epoch": 0.27, + "learning_rate": 1.703630150589872e-05, + "loss": 0.471, + "step": 2398 + }, + { + "epoch": 0.27, + "learning_rate": 1.7033670753848373e-05, + "loss": 0.4791, + "step": 2399 + }, + { + "epoch": 0.27, + "learning_rate": 1.7031039038028103e-05, + "loss": 0.4562, + "step": 2400 + }, + { + "epoch": 0.27, + "learning_rate": 1.7028406358798505e-05, + "loss": 0.4891, + "step": 2401 + }, + { + "epoch": 0.27, + "learning_rate": 1.7025772716520324e-05, + "loss": 0.4637, + "step": 2402 + }, + { + "epoch": 0.27, + "learning_rate": 1.7023138111554412e-05, + "loss": 0.4753, + "step": 2403 + }, + { + "epoch": 0.27, + "learning_rate": 1.702050254426179e-05, + "loss": 0.4727, + "step": 2404 + }, + { + "epoch": 0.27, + "learning_rate": 1.701786601500357e-05, + "loss": 0.4795, + "step": 2405 + }, + { + "epoch": 0.27, + "learning_rate": 1.701522852414103e-05, + "loss": 0.4705, + "step": 2406 + }, + { + "epoch": 0.28, + "learning_rate": 1.7012590072035554e-05, + "loss": 0.5054, + "step": 2407 + }, + { + "epoch": 0.28, + "learning_rate": 1.7009950659048677e-05, + "loss": 0.4825, + "step": 2408 + }, + { + "epoch": 0.28, + "learning_rate": 1.7007310285542057e-05, + "loss": 0.4799, + "step": 2409 + }, + { + "epoch": 0.28, + "learning_rate": 1.7004668951877475e-05, + "loss": 0.4526, + "step": 2410 + }, + { + "epoch": 0.28, + "learning_rate": 1.7002026658416862e-05, + "loss": 0.4672, + "step": 2411 + }, + { + "epoch": 0.28, + "learning_rate": 1.699938340552227e-05, + "loss": 0.4956, + "step": 2412 + }, + { + "epoch": 0.28, + "learning_rate": 1.699673919355588e-05, + "loss": 0.4727, + "step": 2413 + }, + { + "epoch": 0.28, + "learning_rate": 
1.699409402288001e-05, + "loss": 0.4746, + "step": 2414 + }, + { + "epoch": 0.28, + "learning_rate": 1.699144789385711e-05, + "loss": 0.4814, + "step": 2415 + }, + { + "epoch": 0.28, + "learning_rate": 1.6988800806849754e-05, + "loss": 0.4645, + "step": 2416 + }, + { + "epoch": 0.28, + "learning_rate": 1.6986152762220655e-05, + "loss": 0.4808, + "step": 2417 + }, + { + "epoch": 0.28, + "learning_rate": 1.6983503760332653e-05, + "loss": 0.4715, + "step": 2418 + }, + { + "epoch": 0.28, + "learning_rate": 1.698085380154872e-05, + "loss": 0.4588, + "step": 2419 + }, + { + "epoch": 0.28, + "learning_rate": 1.6978202886231963e-05, + "loss": 0.4597, + "step": 2420 + }, + { + "epoch": 0.28, + "learning_rate": 1.6975551014745614e-05, + "loss": 0.5044, + "step": 2421 + }, + { + "epoch": 0.28, + "learning_rate": 1.697289818745304e-05, + "loss": 0.463, + "step": 2422 + }, + { + "epoch": 0.28, + "learning_rate": 1.6970244404717732e-05, + "loss": 0.481, + "step": 2423 + }, + { + "epoch": 0.28, + "learning_rate": 1.6967589666903324e-05, + "loss": 0.4816, + "step": 2424 + }, + { + "epoch": 0.28, + "learning_rate": 1.696493397437357e-05, + "loss": 0.4928, + "step": 2425 + }, + { + "epoch": 0.28, + "learning_rate": 1.6962277327492366e-05, + "loss": 0.4689, + "step": 2426 + }, + { + "epoch": 0.28, + "learning_rate": 1.6959619726623722e-05, + "loss": 0.4803, + "step": 2427 + }, + { + "epoch": 0.28, + "learning_rate": 1.6956961172131796e-05, + "loss": 0.4684, + "step": 2428 + }, + { + "epoch": 0.28, + "learning_rate": 1.6954301664380867e-05, + "loss": 0.4796, + "step": 2429 + }, + { + "epoch": 0.28, + "learning_rate": 1.6951641203735345e-05, + "loss": 0.4839, + "step": 2430 + }, + { + "epoch": 0.28, + "learning_rate": 1.694897979055978e-05, + "loss": 0.4781, + "step": 2431 + }, + { + "epoch": 0.28, + "learning_rate": 1.6946317425218834e-05, + "loss": 0.4846, + "step": 2432 + }, + { + "epoch": 0.28, + "learning_rate": 1.6943654108077317e-05, + "loss": 0.4773, + "step": 2433 + }, + { + 
"epoch": 0.28, + "learning_rate": 1.6940989839500167e-05, + "loss": 0.4712, + "step": 2434 + }, + { + "epoch": 0.28, + "learning_rate": 1.6938324619852435e-05, + "loss": 0.4936, + "step": 2435 + }, + { + "epoch": 0.28, + "learning_rate": 1.693565844949933e-05, + "loss": 0.4735, + "step": 2436 + }, + { + "epoch": 0.28, + "learning_rate": 1.693299132880617e-05, + "loss": 0.4663, + "step": 2437 + }, + { + "epoch": 0.28, + "learning_rate": 1.693032325813841e-05, + "loss": 0.488, + "step": 2438 + }, + { + "epoch": 0.28, + "learning_rate": 1.6927654237861635e-05, + "loss": 0.4665, + "step": 2439 + }, + { + "epoch": 0.28, + "learning_rate": 1.6924984268341563e-05, + "loss": 0.4767, + "step": 2440 + }, + { + "epoch": 0.28, + "learning_rate": 1.6922313349944037e-05, + "loss": 0.4821, + "step": 2441 + }, + { + "epoch": 0.28, + "learning_rate": 1.6919641483035035e-05, + "loss": 0.4513, + "step": 2442 + }, + { + "epoch": 0.28, + "learning_rate": 1.6916968667980658e-05, + "loss": 0.4734, + "step": 2443 + }, + { + "epoch": 0.28, + "learning_rate": 1.6914294905147144e-05, + "loss": 0.4782, + "step": 2444 + }, + { + "epoch": 0.28, + "learning_rate": 1.6911620194900862e-05, + "loss": 0.4684, + "step": 2445 + }, + { + "epoch": 0.28, + "learning_rate": 1.6908944537608302e-05, + "loss": 0.4643, + "step": 2446 + }, + { + "epoch": 0.28, + "learning_rate": 1.6906267933636087e-05, + "loss": 0.4868, + "step": 2447 + }, + { + "epoch": 0.28, + "learning_rate": 1.6903590383350975e-05, + "loss": 0.4797, + "step": 2448 + }, + { + "epoch": 0.28, + "learning_rate": 1.690091188711985e-05, + "loss": 0.4894, + "step": 2449 + }, + { + "epoch": 0.28, + "learning_rate": 1.689823244530973e-05, + "loss": 0.4657, + "step": 2450 + }, + { + "epoch": 0.28, + "learning_rate": 1.6895552058287752e-05, + "loss": 0.4817, + "step": 2451 + }, + { + "epoch": 0.28, + "learning_rate": 1.689287072642119e-05, + "loss": 0.4785, + "step": 2452 + }, + { + "epoch": 0.28, + "learning_rate": 1.6890188450077445e-05, + "loss": 
0.4668, + "step": 2453 + }, + { + "epoch": 0.28, + "learning_rate": 1.6887505229624053e-05, + "loss": 0.4684, + "step": 2454 + }, + { + "epoch": 0.28, + "learning_rate": 1.6884821065428673e-05, + "loss": 0.4807, + "step": 2455 + }, + { + "epoch": 0.28, + "learning_rate": 1.6882135957859095e-05, + "loss": 0.4894, + "step": 2456 + }, + { + "epoch": 0.28, + "learning_rate": 1.6879449907283238e-05, + "loss": 0.4591, + "step": 2457 + }, + { + "epoch": 0.28, + "learning_rate": 1.6876762914069154e-05, + "loss": 0.4734, + "step": 2458 + }, + { + "epoch": 0.28, + "learning_rate": 1.6874074978585018e-05, + "loss": 0.4596, + "step": 2459 + }, + { + "epoch": 0.28, + "learning_rate": 1.687138610119914e-05, + "loss": 0.4737, + "step": 2460 + }, + { + "epoch": 0.28, + "learning_rate": 1.686869628227995e-05, + "loss": 0.5029, + "step": 2461 + }, + { + "epoch": 0.28, + "learning_rate": 1.686600552219602e-05, + "loss": 0.4675, + "step": 2462 + }, + { + "epoch": 0.28, + "learning_rate": 1.686331382131604e-05, + "loss": 0.4601, + "step": 2463 + }, + { + "epoch": 0.28, + "learning_rate": 1.686062118000884e-05, + "loss": 0.4806, + "step": 2464 + }, + { + "epoch": 0.28, + "learning_rate": 1.6857927598643362e-05, + "loss": 0.4758, + "step": 2465 + }, + { + "epoch": 0.28, + "learning_rate": 1.6855233077588697e-05, + "loss": 0.503, + "step": 2466 + }, + { + "epoch": 0.28, + "learning_rate": 1.6852537617214043e-05, + "loss": 0.4894, + "step": 2467 + }, + { + "epoch": 0.28, + "learning_rate": 1.6849841217888748e-05, + "loss": 0.4551, + "step": 2468 + }, + { + "epoch": 0.28, + "learning_rate": 1.6847143879982276e-05, + "loss": 0.4697, + "step": 2469 + }, + { + "epoch": 0.28, + "learning_rate": 1.684444560386422e-05, + "loss": 0.4702, + "step": 2470 + }, + { + "epoch": 0.28, + "learning_rate": 1.6841746389904306e-05, + "loss": 0.4724, + "step": 2471 + }, + { + "epoch": 0.28, + "learning_rate": 1.6839046238472387e-05, + "loss": 0.49, + "step": 2472 + }, + { + "epoch": 0.28, + "learning_rate": 
1.6836345149938445e-05, + "loss": 0.4787, + "step": 2473 + }, + { + "epoch": 0.28, + "learning_rate": 1.6833643124672586e-05, + "loss": 0.4715, + "step": 2474 + }, + { + "epoch": 0.28, + "learning_rate": 1.683094016304505e-05, + "loss": 0.4781, + "step": 2475 + }, + { + "epoch": 0.28, + "learning_rate": 1.6828236265426205e-05, + "loss": 0.4622, + "step": 2476 + }, + { + "epoch": 0.28, + "learning_rate": 1.6825531432186545e-05, + "loss": 0.4672, + "step": 2477 + }, + { + "epoch": 0.28, + "learning_rate": 1.6822825663696683e-05, + "loss": 0.4731, + "step": 2478 + }, + { + "epoch": 0.28, + "learning_rate": 1.6820118960327386e-05, + "loss": 0.4835, + "step": 2479 + }, + { + "epoch": 0.28, + "learning_rate": 1.681741132244952e-05, + "loss": 0.4742, + "step": 2480 + }, + { + "epoch": 0.28, + "learning_rate": 1.6814702750434097e-05, + "loss": 0.4779, + "step": 2481 + }, + { + "epoch": 0.28, + "learning_rate": 1.6811993244652248e-05, + "loss": 0.4848, + "step": 2482 + }, + { + "epoch": 0.28, + "learning_rate": 1.6809282805475243e-05, + "loss": 0.4667, + "step": 2483 + }, + { + "epoch": 0.28, + "learning_rate": 1.680657143327447e-05, + "loss": 0.479, + "step": 2484 + }, + { + "epoch": 0.28, + "learning_rate": 1.680385912842144e-05, + "loss": 0.466, + "step": 2485 + }, + { + "epoch": 0.28, + "learning_rate": 1.680114589128781e-05, + "loss": 0.4768, + "step": 2486 + }, + { + "epoch": 0.28, + "learning_rate": 1.6798431722245346e-05, + "loss": 0.4632, + "step": 2487 + }, + { + "epoch": 0.28, + "learning_rate": 1.6795716621665957e-05, + "loss": 0.5026, + "step": 2488 + }, + { + "epoch": 0.28, + "learning_rate": 1.6793000589921666e-05, + "loss": 0.4801, + "step": 2489 + }, + { + "epoch": 0.28, + "learning_rate": 1.6790283627384633e-05, + "loss": 0.4888, + "step": 2490 + }, + { + "epoch": 0.28, + "learning_rate": 1.6787565734427143e-05, + "loss": 0.4666, + "step": 2491 + }, + { + "epoch": 0.28, + "learning_rate": 1.6784846911421605e-05, + "loss": 0.4721, + "step": 2492 + }, + { + 
"epoch": 0.28, + "learning_rate": 1.678212715874056e-05, + "loss": 0.4713, + "step": 2493 + }, + { + "epoch": 0.29, + "learning_rate": 1.677940647675668e-05, + "loss": 0.4702, + "step": 2494 + }, + { + "epoch": 0.29, + "learning_rate": 1.6776684865842748e-05, + "loss": 0.4716, + "step": 2495 + }, + { + "epoch": 0.29, + "learning_rate": 1.6773962326371696e-05, + "loss": 0.4623, + "step": 2496 + }, + { + "epoch": 0.29, + "learning_rate": 1.677123885871657e-05, + "loss": 0.4892, + "step": 2497 + }, + { + "epoch": 0.29, + "learning_rate": 1.6768514463250544e-05, + "loss": 0.4916, + "step": 2498 + }, + { + "epoch": 0.29, + "learning_rate": 1.6765789140346916e-05, + "loss": 0.4808, + "step": 2499 + }, + { + "epoch": 0.29, + "learning_rate": 1.6763062890379128e-05, + "loss": 0.4719, + "step": 2500 + }, + { + "epoch": 0.29, + "learning_rate": 1.6760335713720727e-05, + "loss": 0.4711, + "step": 2501 + }, + { + "epoch": 0.29, + "learning_rate": 1.6757607610745405e-05, + "loss": 0.4837, + "step": 2502 + }, + { + "epoch": 0.29, + "learning_rate": 1.6754878581826966e-05, + "loss": 0.4707, + "step": 2503 + }, + { + "epoch": 0.29, + "learning_rate": 1.675214862733935e-05, + "loss": 0.4648, + "step": 2504 + }, + { + "epoch": 0.29, + "learning_rate": 1.674941774765662e-05, + "loss": 0.4742, + "step": 2505 + }, + { + "epoch": 0.29, + "learning_rate": 1.6746685943152975e-05, + "loss": 0.4896, + "step": 2506 + }, + { + "epoch": 0.29, + "learning_rate": 1.674395321420273e-05, + "loss": 0.4759, + "step": 2507 + }, + { + "epoch": 0.29, + "learning_rate": 1.674121956118032e-05, + "loss": 0.4774, + "step": 2508 + }, + { + "epoch": 0.29, + "learning_rate": 1.673848498446033e-05, + "loss": 0.4727, + "step": 2509 + }, + { + "epoch": 0.29, + "learning_rate": 1.6735749484417452e-05, + "loss": 0.4816, + "step": 2510 + }, + { + "epoch": 0.29, + "learning_rate": 1.673301306142651e-05, + "loss": 0.475, + "step": 2511 + }, + { + "epoch": 0.29, + "learning_rate": 1.6730275715862455e-05, + "loss": 
0.4677, + "step": 2512 + }, + { + "epoch": 0.29, + "learning_rate": 1.672753744810037e-05, + "loss": 0.4701, + "step": 2513 + }, + { + "epoch": 0.29, + "learning_rate": 1.6724798258515452e-05, + "loss": 0.4852, + "step": 2514 + }, + { + "epoch": 0.29, + "learning_rate": 1.6722058147483034e-05, + "loss": 0.49, + "step": 2515 + }, + { + "epoch": 0.29, + "learning_rate": 1.671931711537857e-05, + "loss": 0.4704, + "step": 2516 + }, + { + "epoch": 0.29, + "learning_rate": 1.6716575162577647e-05, + "loss": 0.4733, + "step": 2517 + }, + { + "epoch": 0.29, + "learning_rate": 1.671383228945597e-05, + "loss": 0.4668, + "step": 2518 + }, + { + "epoch": 0.29, + "learning_rate": 1.6711088496389375e-05, + "loss": 0.4812, + "step": 2519 + }, + { + "epoch": 0.29, + "learning_rate": 1.6708343783753824e-05, + "loss": 0.4573, + "step": 2520 + }, + { + "epoch": 0.29, + "learning_rate": 1.67055981519254e-05, + "loss": 0.485, + "step": 2521 + }, + { + "epoch": 0.29, + "learning_rate": 1.6702851601280322e-05, + "loss": 0.4801, + "step": 2522 + }, + { + "epoch": 0.29, + "learning_rate": 1.6700104132194925e-05, + "loss": 0.4627, + "step": 2523 + }, + { + "epoch": 0.29, + "learning_rate": 1.6697355745045678e-05, + "loss": 0.479, + "step": 2524 + }, + { + "epoch": 0.29, + "learning_rate": 1.6694606440209163e-05, + "loss": 0.4802, + "step": 2525 + }, + { + "epoch": 0.29, + "learning_rate": 1.6691856218062105e-05, + "loss": 0.4808, + "step": 2526 + }, + { + "epoch": 0.29, + "learning_rate": 1.6689105078981333e-05, + "loss": 0.4646, + "step": 2527 + }, + { + "epoch": 0.29, + "learning_rate": 1.668635302334383e-05, + "loss": 0.48, + "step": 2528 + }, + { + "epoch": 0.29, + "learning_rate": 1.6683600051526682e-05, + "loss": 0.4655, + "step": 2529 + }, + { + "epoch": 0.29, + "learning_rate": 1.6680846163907107e-05, + "loss": 0.4952, + "step": 2530 + }, + { + "epoch": 0.29, + "learning_rate": 1.6678091360862447e-05, + "loss": 0.4787, + "step": 2531 + }, + { + "epoch": 0.29, + "learning_rate": 
1.6675335642770178e-05, + "loss": 0.4895, + "step": 2532 + }, + { + "epoch": 0.29, + "learning_rate": 1.667257901000789e-05, + "loss": 0.4627, + "step": 2533 + }, + { + "epoch": 0.29, + "learning_rate": 1.6669821462953303e-05, + "loss": 0.4718, + "step": 2534 + }, + { + "epoch": 0.29, + "learning_rate": 1.6667063001984267e-05, + "loss": 0.4801, + "step": 2535 + }, + { + "epoch": 0.29, + "learning_rate": 1.6664303627478745e-05, + "loss": 0.4728, + "step": 2536 + }, + { + "epoch": 0.29, + "learning_rate": 1.6661543339814847e-05, + "loss": 0.4682, + "step": 2537 + }, + { + "epoch": 0.29, + "learning_rate": 1.6658782139370775e-05, + "loss": 0.4591, + "step": 2538 + }, + { + "epoch": 0.29, + "learning_rate": 1.6656020026524887e-05, + "loss": 0.4752, + "step": 2539 + }, + { + "epoch": 0.29, + "learning_rate": 1.6653257001655652e-05, + "loss": 0.4631, + "step": 2540 + }, + { + "epoch": 0.29, + "learning_rate": 1.6650493065141672e-05, + "loss": 0.4763, + "step": 2541 + }, + { + "epoch": 0.29, + "learning_rate": 1.6647728217361658e-05, + "loss": 0.4713, + "step": 2542 + }, + { + "epoch": 0.29, + "learning_rate": 1.6644962458694457e-05, + "loss": 0.4599, + "step": 2543 + }, + { + "epoch": 0.29, + "learning_rate": 1.6642195789519045e-05, + "loss": 0.4702, + "step": 2544 + }, + { + "epoch": 0.29, + "learning_rate": 1.6639428210214514e-05, + "loss": 0.4738, + "step": 2545 + }, + { + "epoch": 0.29, + "learning_rate": 1.6636659721160088e-05, + "loss": 0.4827, + "step": 2546 + }, + { + "epoch": 0.29, + "learning_rate": 1.6633890322735107e-05, + "loss": 0.4857, + "step": 2547 + }, + { + "epoch": 0.29, + "learning_rate": 1.6631120015319044e-05, + "loss": 0.4765, + "step": 2548 + }, + { + "epoch": 0.29, + "learning_rate": 1.662834879929149e-05, + "loss": 0.4676, + "step": 2549 + }, + { + "epoch": 0.29, + "learning_rate": 1.6625576675032163e-05, + "loss": 0.4788, + "step": 2550 + }, + { + "epoch": 0.29, + "learning_rate": 1.6622803642920912e-05, + "loss": 0.4827, + "step": 2551 + }, + 
{ + "epoch": 0.29, + "learning_rate": 1.6620029703337697e-05, + "loss": 0.469, + "step": 2552 + }, + { + "epoch": 0.29, + "learning_rate": 1.6617254856662613e-05, + "loss": 0.4874, + "step": 2553 + }, + { + "epoch": 0.29, + "learning_rate": 1.6614479103275875e-05, + "loss": 0.4735, + "step": 2554 + }, + { + "epoch": 0.29, + "learning_rate": 1.6611702443557826e-05, + "loss": 0.4696, + "step": 2555 + }, + { + "epoch": 0.29, + "learning_rate": 1.6608924877888926e-05, + "loss": 0.4801, + "step": 2556 + }, + { + "epoch": 0.29, + "learning_rate": 1.6606146406649767e-05, + "loss": 0.471, + "step": 2557 + }, + { + "epoch": 0.29, + "learning_rate": 1.660336703022106e-05, + "loss": 0.4862, + "step": 2558 + }, + { + "epoch": 0.29, + "learning_rate": 1.6600586748983642e-05, + "loss": 0.4797, + "step": 2559 + }, + { + "epoch": 0.29, + "learning_rate": 1.6597805563318475e-05, + "loss": 0.4754, + "step": 2560 + }, + { + "epoch": 0.29, + "learning_rate": 1.659502347360664e-05, + "loss": 0.4828, + "step": 2561 + }, + { + "epoch": 0.29, + "learning_rate": 1.659224048022935e-05, + "loss": 0.4729, + "step": 2562 + }, + { + "epoch": 0.29, + "learning_rate": 1.6589456583567934e-05, + "loss": 0.4708, + "step": 2563 + }, + { + "epoch": 0.29, + "learning_rate": 1.6586671784003846e-05, + "loss": 0.4831, + "step": 2564 + }, + { + "epoch": 0.29, + "learning_rate": 1.658388608191867e-05, + "loss": 0.4759, + "step": 2565 + }, + { + "epoch": 0.29, + "learning_rate": 1.658109947769411e-05, + "loss": 0.4712, + "step": 2566 + }, + { + "epoch": 0.29, + "learning_rate": 1.657831197171199e-05, + "loss": 0.4785, + "step": 2567 + }, + { + "epoch": 0.29, + "learning_rate": 1.657552356435426e-05, + "loss": 0.4917, + "step": 2568 + }, + { + "epoch": 0.29, + "learning_rate": 1.6572734256002997e-05, + "loss": 0.4723, + "step": 2569 + }, + { + "epoch": 0.29, + "learning_rate": 1.6569944047040394e-05, + "loss": 0.4645, + "step": 2570 + }, + { + "epoch": 0.29, + "learning_rate": 1.6567152937848776e-05, + 
"loss": 0.483, + "step": 2571 + }, + { + "epoch": 0.29, + "learning_rate": 1.6564360928810588e-05, + "loss": 0.4732, + "step": 2572 + }, + { + "epoch": 0.29, + "learning_rate": 1.6561568020308397e-05, + "loss": 0.48, + "step": 2573 + }, + { + "epoch": 0.29, + "learning_rate": 1.6558774212724888e-05, + "loss": 0.48, + "step": 2574 + }, + { + "epoch": 0.29, + "learning_rate": 1.655597950644288e-05, + "loss": 0.4795, + "step": 2575 + }, + { + "epoch": 0.29, + "learning_rate": 1.6553183901845313e-05, + "loss": 0.475, + "step": 2576 + }, + { + "epoch": 0.29, + "learning_rate": 1.6550387399315246e-05, + "loss": 0.4733, + "step": 2577 + }, + { + "epoch": 0.29, + "learning_rate": 1.6547589999235854e-05, + "loss": 0.4748, + "step": 2578 + }, + { + "epoch": 0.29, + "learning_rate": 1.654479170199045e-05, + "loss": 0.467, + "step": 2579 + }, + { + "epoch": 0.29, + "learning_rate": 1.6541992507962467e-05, + "loss": 0.4737, + "step": 2580 + }, + { + "epoch": 0.29, + "learning_rate": 1.653919241753545e-05, + "loss": 0.4803, + "step": 2581 + }, + { + "epoch": 0.3, + "learning_rate": 1.6536391431093077e-05, + "loss": 0.4576, + "step": 2582 + }, + { + "epoch": 0.3, + "learning_rate": 1.6533589549019147e-05, + "loss": 0.4855, + "step": 2583 + }, + { + "epoch": 0.3, + "learning_rate": 1.6530786771697575e-05, + "loss": 0.4809, + "step": 2584 + }, + { + "epoch": 0.3, + "learning_rate": 1.6527983099512414e-05, + "loss": 0.4582, + "step": 2585 + }, + { + "epoch": 0.3, + "learning_rate": 1.6525178532847816e-05, + "loss": 0.4754, + "step": 2586 + }, + { + "epoch": 0.3, + "learning_rate": 1.6522373072088083e-05, + "loss": 0.4623, + "step": 2587 + }, + { + "epoch": 0.3, + "learning_rate": 1.6519566717617616e-05, + "loss": 0.4678, + "step": 2588 + }, + { + "epoch": 0.3, + "learning_rate": 1.6516759469820955e-05, + "loss": 0.4837, + "step": 2589 + }, + { + "epoch": 0.3, + "learning_rate": 1.6513951329082746e-05, + "loss": 0.4858, + "step": 2590 + }, + { + "epoch": 0.3, + "learning_rate": 
1.651114229578778e-05, + "loss": 0.4732, + "step": 2591 + }, + { + "epoch": 0.3, + "learning_rate": 1.6508332370320948e-05, + "loss": 0.4903, + "step": 2592 + }, + { + "epoch": 0.3, + "learning_rate": 1.6505521553067273e-05, + "loss": 0.4576, + "step": 2593 + }, + { + "epoch": 0.3, + "learning_rate": 1.6502709844411907e-05, + "loss": 0.4749, + "step": 2594 + }, + { + "epoch": 0.3, + "learning_rate": 1.6499897244740107e-05, + "loss": 0.4708, + "step": 2595 + }, + { + "epoch": 0.3, + "learning_rate": 1.649708375443727e-05, + "loss": 0.4769, + "step": 2596 + }, + { + "epoch": 0.3, + "learning_rate": 1.6494269373888902e-05, + "loss": 0.4642, + "step": 2597 + }, + { + "epoch": 0.3, + "learning_rate": 1.6491454103480637e-05, + "loss": 0.4851, + "step": 2598 + }, + { + "epoch": 0.3, + "learning_rate": 1.6488637943598235e-05, + "loss": 0.4549, + "step": 2599 + }, + { + "epoch": 0.3, + "learning_rate": 1.648582089462756e-05, + "loss": 0.5042, + "step": 2600 + }, + { + "epoch": 0.3, + "learning_rate": 1.6483002956954622e-05, + "loss": 0.4748, + "step": 2601 + }, + { + "epoch": 0.3, + "learning_rate": 1.6480184130965542e-05, + "loss": 0.4497, + "step": 2602 + }, + { + "epoch": 0.3, + "learning_rate": 1.647736441704656e-05, + "loss": 0.4645, + "step": 2603 + }, + { + "epoch": 0.3, + "learning_rate": 1.647454381558403e-05, + "loss": 0.4807, + "step": 2604 + }, + { + "epoch": 0.3, + "learning_rate": 1.647172232696445e-05, + "loss": 0.4787, + "step": 2605 + }, + { + "epoch": 0.3, + "learning_rate": 1.6468899951574423e-05, + "loss": 0.4748, + "step": 2606 + }, + { + "epoch": 0.3, + "learning_rate": 1.6466076689800677e-05, + "loss": 0.4747, + "step": 2607 + }, + { + "epoch": 0.3, + "learning_rate": 1.6463252542030058e-05, + "loss": 0.4748, + "step": 2608 + }, + { + "epoch": 0.3, + "learning_rate": 1.6460427508649546e-05, + "loss": 0.4958, + "step": 2609 + }, + { + "epoch": 0.3, + "learning_rate": 1.6457601590046227e-05, + "loss": 0.454, + "step": 2610 + }, + { + "epoch": 0.3, + 
"learning_rate": 1.6454774786607317e-05, + "loss": 0.4737, + "step": 2611 + }, + { + "epoch": 0.3, + "learning_rate": 1.6451947098720148e-05, + "loss": 0.4823, + "step": 2612 + }, + { + "epoch": 0.3, + "learning_rate": 1.6449118526772183e-05, + "loss": 0.4601, + "step": 2613 + }, + { + "epoch": 0.3, + "learning_rate": 1.6446289071150993e-05, + "loss": 0.4766, + "step": 2614 + }, + { + "epoch": 0.3, + "learning_rate": 1.644345873224428e-05, + "loss": 0.4621, + "step": 2615 + }, + { + "epoch": 0.3, + "learning_rate": 1.6440627510439862e-05, + "loss": 0.4784, + "step": 2616 + }, + { + "epoch": 0.3, + "learning_rate": 1.6437795406125684e-05, + "loss": 0.4711, + "step": 2617 + }, + { + "epoch": 0.3, + "learning_rate": 1.6434962419689803e-05, + "loss": 0.4825, + "step": 2618 + }, + { + "epoch": 0.3, + "learning_rate": 1.64321285515204e-05, + "loss": 0.4636, + "step": 2619 + }, + { + "epoch": 0.3, + "learning_rate": 1.6429293802005783e-05, + "loss": 0.4749, + "step": 2620 + }, + { + "epoch": 0.3, + "learning_rate": 1.642645817153437e-05, + "loss": 0.4664, + "step": 2621 + }, + { + "epoch": 0.3, + "learning_rate": 1.6423621660494714e-05, + "loss": 0.4854, + "step": 2622 + }, + { + "epoch": 0.3, + "learning_rate": 1.6420784269275474e-05, + "loss": 0.4695, + "step": 2623 + }, + { + "epoch": 0.3, + "learning_rate": 1.6417945998265436e-05, + "loss": 0.4813, + "step": 2624 + }, + { + "epoch": 0.3, + "learning_rate": 1.641510684785351e-05, + "loss": 0.464, + "step": 2625 + }, + { + "epoch": 0.3, + "learning_rate": 1.641226681842872e-05, + "loss": 0.4837, + "step": 2626 + }, + { + "epoch": 0.3, + "learning_rate": 1.6409425910380215e-05, + "loss": 0.4592, + "step": 2627 + }, + { + "epoch": 0.3, + "learning_rate": 1.640658412409726e-05, + "loss": 0.4575, + "step": 2628 + }, + { + "epoch": 0.3, + "learning_rate": 1.640374145996925e-05, + "loss": 0.4661, + "step": 2629 + }, + { + "epoch": 0.3, + "learning_rate": 1.6400897918385687e-05, + "loss": 0.501, + "step": 2630 + }, + { + 
"epoch": 0.3, + "learning_rate": 1.63980534997362e-05, + "loss": 0.4592, + "step": 2631 + }, + { + "epoch": 0.3, + "learning_rate": 1.639520820441054e-05, + "loss": 0.4907, + "step": 2632 + }, + { + "epoch": 0.3, + "learning_rate": 1.6392362032798578e-05, + "loss": 0.4751, + "step": 2633 + }, + { + "epoch": 0.3, + "learning_rate": 1.63895149852903e-05, + "loss": 0.4735, + "step": 2634 + }, + { + "epoch": 0.3, + "learning_rate": 1.6386667062275817e-05, + "loss": 0.4791, + "step": 2635 + }, + { + "epoch": 0.3, + "learning_rate": 1.638381826414535e-05, + "loss": 0.476, + "step": 2636 + }, + { + "epoch": 0.3, + "learning_rate": 1.638096859128926e-05, + "loss": 0.453, + "step": 2637 + }, + { + "epoch": 0.3, + "learning_rate": 1.637811804409801e-05, + "loss": 0.4987, + "step": 2638 + }, + { + "epoch": 0.3, + "learning_rate": 1.6375266622962188e-05, + "loss": 0.4523, + "step": 2639 + }, + { + "epoch": 0.3, + "learning_rate": 1.6372414328272502e-05, + "loss": 0.4917, + "step": 2640 + }, + { + "epoch": 0.3, + "learning_rate": 1.6369561160419783e-05, + "loss": 0.4654, + "step": 2641 + }, + { + "epoch": 0.3, + "learning_rate": 1.6366707119794978e-05, + "loss": 0.4869, + "step": 2642 + }, + { + "epoch": 0.3, + "learning_rate": 1.6363852206789155e-05, + "loss": 0.4597, + "step": 2643 + }, + { + "epoch": 0.3, + "learning_rate": 1.6360996421793497e-05, + "loss": 0.4904, + "step": 2644 + }, + { + "epoch": 0.3, + "learning_rate": 1.635813976519931e-05, + "loss": 0.4572, + "step": 2645 + }, + { + "epoch": 0.3, + "learning_rate": 1.6355282237398026e-05, + "loss": 0.4793, + "step": 2646 + }, + { + "epoch": 0.3, + "learning_rate": 1.6352423838781185e-05, + "loss": 0.4746, + "step": 2647 + }, + { + "epoch": 0.3, + "learning_rate": 1.6349564569740454e-05, + "loss": 0.4728, + "step": 2648 + }, + { + "epoch": 0.3, + "learning_rate": 1.6346704430667612e-05, + "loss": 0.477, + "step": 2649 + }, + { + "epoch": 0.3, + "learning_rate": 1.6343843421954567e-05, + "loss": 0.4799, + "step": 2650 + 
}, + { + "epoch": 0.3, + "learning_rate": 1.634098154399334e-05, + "loss": 0.4626, + "step": 2651 + }, + { + "epoch": 0.3, + "learning_rate": 1.6338118797176074e-05, + "loss": 0.4803, + "step": 2652 + }, + { + "epoch": 0.3, + "learning_rate": 1.6335255181895026e-05, + "loss": 0.482, + "step": 2653 + }, + { + "epoch": 0.3, + "learning_rate": 1.633239069854257e-05, + "loss": 0.4549, + "step": 2654 + }, + { + "epoch": 0.3, + "learning_rate": 1.632952534751122e-05, + "loss": 0.4724, + "step": 2655 + }, + { + "epoch": 0.3, + "learning_rate": 1.6326659129193577e-05, + "loss": 0.4907, + "step": 2656 + }, + { + "epoch": 0.3, + "learning_rate": 1.632379204398238e-05, + "loss": 0.4783, + "step": 2657 + }, + { + "epoch": 0.3, + "learning_rate": 1.6320924092270494e-05, + "loss": 0.5081, + "step": 2658 + }, + { + "epoch": 0.3, + "learning_rate": 1.6318055274450885e-05, + "loss": 0.4605, + "step": 2659 + }, + { + "epoch": 0.3, + "learning_rate": 1.6315185590916644e-05, + "loss": 0.4626, + "step": 2660 + }, + { + "epoch": 0.3, + "learning_rate": 1.6312315042060984e-05, + "loss": 0.4659, + "step": 2661 + }, + { + "epoch": 0.3, + "learning_rate": 1.630944362827723e-05, + "loss": 0.4902, + "step": 2662 + }, + { + "epoch": 0.3, + "learning_rate": 1.6306571349958833e-05, + "loss": 0.4671, + "step": 2663 + }, + { + "epoch": 0.3, + "learning_rate": 1.6303698207499364e-05, + "loss": 0.4744, + "step": 2664 + }, + { + "epoch": 0.3, + "learning_rate": 1.63008242012925e-05, + "loss": 0.4611, + "step": 2665 + }, + { + "epoch": 0.3, + "learning_rate": 1.6297949331732047e-05, + "loss": 0.5038, + "step": 2666 + }, + { + "epoch": 0.3, + "learning_rate": 1.629507359921193e-05, + "loss": 0.4835, + "step": 2667 + }, + { + "epoch": 0.3, + "learning_rate": 1.6292197004126184e-05, + "loss": 0.4742, + "step": 2668 + }, + { + "epoch": 0.31, + "learning_rate": 1.6289319546868966e-05, + "loss": 0.4685, + "step": 2669 + }, + { + "epoch": 0.31, + "learning_rate": 1.6286441227834552e-05, + "loss": 0.4954, + 
"step": 2670 + }, + { + "epoch": 0.31, + "learning_rate": 1.6283562047417342e-05, + "loss": 0.4682, + "step": 2671 + }, + { + "epoch": 0.31, + "learning_rate": 1.628068200601184e-05, + "loss": 0.4653, + "step": 2672 + }, + { + "epoch": 0.31, + "learning_rate": 1.627780110401268e-05, + "loss": 0.4799, + "step": 2673 + }, + { + "epoch": 0.31, + "learning_rate": 1.6274919341814607e-05, + "loss": 0.4749, + "step": 2674 + }, + { + "epoch": 0.31, + "learning_rate": 1.6272036719812496e-05, + "loss": 0.4631, + "step": 2675 + }, + { + "epoch": 0.31, + "learning_rate": 1.6269153238401317e-05, + "loss": 0.4921, + "step": 2676 + }, + { + "epoch": 0.31, + "learning_rate": 1.626626889797618e-05, + "loss": 0.4678, + "step": 2677 + }, + { + "epoch": 0.31, + "learning_rate": 1.6263383698932307e-05, + "loss": 0.4558, + "step": 2678 + }, + { + "epoch": 0.31, + "learning_rate": 1.6260497641665028e-05, + "loss": 0.4778, + "step": 2679 + }, + { + "epoch": 0.31, + "learning_rate": 1.6257610726569798e-05, + "loss": 0.4876, + "step": 2680 + }, + { + "epoch": 0.31, + "learning_rate": 1.625472295404219e-05, + "loss": 0.4687, + "step": 2681 + }, + { + "epoch": 0.31, + "learning_rate": 1.625183432447789e-05, + "loss": 0.4772, + "step": 2682 + }, + { + "epoch": 0.31, + "learning_rate": 1.6248944838272712e-05, + "loss": 0.46, + "step": 2683 + }, + { + "epoch": 0.31, + "learning_rate": 1.6246054495822575e-05, + "loss": 0.4925, + "step": 2684 + }, + { + "epoch": 0.31, + "learning_rate": 1.6243163297523524e-05, + "loss": 0.4656, + "step": 2685 + }, + { + "epoch": 0.31, + "learning_rate": 1.6240271243771713e-05, + "loss": 0.4525, + "step": 2686 + }, + { + "epoch": 0.31, + "learning_rate": 1.6237378334963422e-05, + "loss": 0.4818, + "step": 2687 + }, + { + "epoch": 0.31, + "learning_rate": 1.623448457149504e-05, + "loss": 0.4646, + "step": 2688 + }, + { + "epoch": 0.31, + "learning_rate": 1.623158995376308e-05, + "loss": 0.4739, + "step": 2689 + }, + { + "epoch": 0.31, + "learning_rate": 
1.6228694482164167e-05, + "loss": 0.4613, + "step": 2690 + }, + { + "epoch": 0.31, + "learning_rate": 1.622579815709505e-05, + "loss": 0.5051, + "step": 2691 + }, + { + "epoch": 0.31, + "learning_rate": 1.6222900978952586e-05, + "loss": 0.4472, + "step": 2692 + }, + { + "epoch": 0.31, + "learning_rate": 1.6220002948133756e-05, + "loss": 0.5041, + "step": 2693 + }, + { + "epoch": 0.31, + "learning_rate": 1.6217104065035652e-05, + "loss": 0.4845, + "step": 2694 + }, + { + "epoch": 0.31, + "learning_rate": 1.6214204330055484e-05, + "loss": 0.4754, + "step": 2695 + }, + { + "epoch": 0.31, + "learning_rate": 1.621130374359059e-05, + "loss": 0.4589, + "step": 2696 + }, + { + "epoch": 0.31, + "learning_rate": 1.6208402306038406e-05, + "loss": 0.4649, + "step": 2697 + }, + { + "epoch": 0.31, + "learning_rate": 1.620550001779649e-05, + "loss": 0.4751, + "step": 2698 + }, + { + "epoch": 0.31, + "learning_rate": 1.6202596879262536e-05, + "loss": 0.4805, + "step": 2699 + }, + { + "epoch": 0.31, + "learning_rate": 1.6199692890834324e-05, + "loss": 0.46, + "step": 2700 + }, + { + "epoch": 0.31, + "learning_rate": 1.6196788052909772e-05, + "loss": 0.477, + "step": 2701 + }, + { + "epoch": 0.31, + "learning_rate": 1.6193882365886905e-05, + "loss": 0.4757, + "step": 2702 + }, + { + "epoch": 0.31, + "learning_rate": 1.6190975830163872e-05, + "loss": 0.4712, + "step": 2703 + }, + { + "epoch": 0.31, + "learning_rate": 1.6188068446138925e-05, + "loss": 0.4701, + "step": 2704 + }, + { + "epoch": 0.31, + "learning_rate": 1.6185160214210447e-05, + "loss": 0.4706, + "step": 2705 + }, + { + "epoch": 0.31, + "learning_rate": 1.6182251134776927e-05, + "loss": 0.4659, + "step": 2706 + }, + { + "epoch": 0.31, + "learning_rate": 1.6179341208236977e-05, + "loss": 0.4882, + "step": 2707 + }, + { + "epoch": 0.31, + "learning_rate": 1.617643043498932e-05, + "loss": 0.4824, + "step": 2708 + }, + { + "epoch": 0.31, + "learning_rate": 1.6173518815432797e-05, + "loss": 0.4679, + "step": 2709 + }, + { + 
"epoch": 0.31, + "learning_rate": 1.6170606349966367e-05, + "loss": 0.4932, + "step": 2710 + }, + { + "epoch": 0.31, + "learning_rate": 1.6167693038989098e-05, + "loss": 0.485, + "step": 2711 + }, + { + "epoch": 0.31, + "learning_rate": 1.6164778882900186e-05, + "loss": 0.4555, + "step": 2712 + }, + { + "epoch": 0.31, + "learning_rate": 1.6161863882098926e-05, + "loss": 0.5035, + "step": 2713 + }, + { + "epoch": 0.31, + "learning_rate": 1.615894803698475e-05, + "loss": 0.4557, + "step": 2714 + }, + { + "epoch": 0.31, + "learning_rate": 1.615603134795718e-05, + "loss": 0.4986, + "step": 2715 + }, + { + "epoch": 0.31, + "learning_rate": 1.615311381541588e-05, + "loss": 0.4746, + "step": 2716 + }, + { + "epoch": 0.31, + "learning_rate": 1.615019543976061e-05, + "loss": 0.4601, + "step": 2717 + }, + { + "epoch": 0.31, + "learning_rate": 1.6147276221391256e-05, + "loss": 0.4762, + "step": 2718 + }, + { + "epoch": 0.31, + "learning_rate": 1.614435616070781e-05, + "loss": 0.483, + "step": 2719 + }, + { + "epoch": 0.31, + "learning_rate": 1.6141435258110397e-05, + "loss": 0.4747, + "step": 2720 + }, + { + "epoch": 0.31, + "learning_rate": 1.6138513513999234e-05, + "loss": 0.4668, + "step": 2721 + }, + { + "epoch": 0.31, + "learning_rate": 1.613559092877467e-05, + "loss": 0.4704, + "step": 2722 + }, + { + "epoch": 0.31, + "learning_rate": 1.6132667502837164e-05, + "loss": 0.4809, + "step": 2723 + }, + { + "epoch": 0.31, + "learning_rate": 1.6129743236587293e-05, + "loss": 0.4661, + "step": 2724 + }, + { + "epoch": 0.31, + "learning_rate": 1.6126818130425746e-05, + "loss": 0.48, + "step": 2725 + }, + { + "epoch": 0.31, + "learning_rate": 1.6123892184753324e-05, + "loss": 0.4916, + "step": 2726 + }, + { + "epoch": 0.31, + "learning_rate": 1.612096539997095e-05, + "loss": 0.4791, + "step": 2727 + }, + { + "epoch": 0.31, + "learning_rate": 1.611803777647966e-05, + "loss": 0.4851, + "step": 2728 + }, + { + "epoch": 0.31, + "learning_rate": 1.6115109314680603e-05, + "loss": 
0.4755, + "step": 2729 + }, + { + "epoch": 0.31, + "learning_rate": 1.611218001497504e-05, + "loss": 0.4662, + "step": 2730 + }, + { + "epoch": 0.31, + "learning_rate": 1.610924987776436e-05, + "loss": 0.4743, + "step": 2731 + }, + { + "epoch": 0.31, + "learning_rate": 1.6106318903450042e-05, + "loss": 0.446, + "step": 2732 + }, + { + "epoch": 0.31, + "learning_rate": 1.6103387092433704e-05, + "loss": 0.48, + "step": 2733 + }, + { + "epoch": 0.31, + "learning_rate": 1.6100454445117074e-05, + "loss": 0.4846, + "step": 2734 + }, + { + "epoch": 0.31, + "learning_rate": 1.6097520961901983e-05, + "loss": 0.4946, + "step": 2735 + }, + { + "epoch": 0.31, + "learning_rate": 1.6094586643190388e-05, + "loss": 0.4798, + "step": 2736 + }, + { + "epoch": 0.31, + "learning_rate": 1.609165148938435e-05, + "loss": 0.4737, + "step": 2737 + }, + { + "epoch": 0.31, + "learning_rate": 1.608871550088606e-05, + "loss": 0.4634, + "step": 2738 + }, + { + "epoch": 0.31, + "learning_rate": 1.6085778678097804e-05, + "loss": 0.4716, + "step": 2739 + }, + { + "epoch": 0.31, + "learning_rate": 1.6082841021422e-05, + "loss": 0.4621, + "step": 2740 + }, + { + "epoch": 0.31, + "learning_rate": 1.607990253126117e-05, + "loss": 0.4844, + "step": 2741 + }, + { + "epoch": 0.31, + "learning_rate": 1.607696320801795e-05, + "loss": 0.479, + "step": 2742 + }, + { + "epoch": 0.31, + "learning_rate": 1.6074023052095096e-05, + "loss": 0.4718, + "step": 2743 + }, + { + "epoch": 0.31, + "learning_rate": 1.6071082063895476e-05, + "loss": 0.4843, + "step": 2744 + }, + { + "epoch": 0.31, + "learning_rate": 1.6068140243822065e-05, + "loss": 0.4677, + "step": 2745 + }, + { + "epoch": 0.31, + "learning_rate": 1.6065197592277965e-05, + "loss": 0.4666, + "step": 2746 + }, + { + "epoch": 0.31, + "learning_rate": 1.6062254109666383e-05, + "loss": 0.4853, + "step": 2747 + }, + { + "epoch": 0.31, + "learning_rate": 1.6059309796390638e-05, + "loss": 0.4735, + "step": 2748 + }, + { + "epoch": 0.31, + "learning_rate": 
1.6056364652854174e-05, + "loss": 0.4768, + "step": 2749 + }, + { + "epoch": 0.31, + "learning_rate": 1.6053418679460534e-05, + "loss": 0.4528, + "step": 2750 + }, + { + "epoch": 0.31, + "learning_rate": 1.6050471876613386e-05, + "loss": 0.4702, + "step": 2751 + }, + { + "epoch": 0.31, + "learning_rate": 1.6047524244716506e-05, + "loss": 0.4713, + "step": 2752 + }, + { + "epoch": 0.31, + "learning_rate": 1.604457578417379e-05, + "loss": 0.4758, + "step": 2753 + }, + { + "epoch": 0.31, + "learning_rate": 1.6041626495389235e-05, + "loss": 0.4742, + "step": 2754 + }, + { + "epoch": 0.31, + "learning_rate": 1.6038676378766968e-05, + "loss": 0.4639, + "step": 2755 + }, + { + "epoch": 0.31, + "learning_rate": 1.603572543471121e-05, + "loss": 0.4743, + "step": 2756 + }, + { + "epoch": 0.32, + "learning_rate": 1.603277366362632e-05, + "loss": 0.4833, + "step": 2757 + }, + { + "epoch": 0.32, + "learning_rate": 1.6029821065916745e-05, + "loss": 0.4669, + "step": 2758 + }, + { + "epoch": 0.32, + "learning_rate": 1.602686764198706e-05, + "loss": 0.4883, + "step": 2759 + }, + { + "epoch": 0.32, + "learning_rate": 1.602391339224196e-05, + "loss": 0.4739, + "step": 2760 + }, + { + "epoch": 0.32, + "learning_rate": 1.6020958317086224e-05, + "loss": 0.466, + "step": 2761 + }, + { + "epoch": 0.32, + "learning_rate": 1.601800241692478e-05, + "loss": 0.4624, + "step": 2762 + }, + { + "epoch": 0.32, + "learning_rate": 1.6015045692162644e-05, + "loss": 0.4698, + "step": 2763 + }, + { + "epoch": 0.32, + "learning_rate": 1.6012088143204953e-05, + "loss": 0.4567, + "step": 2764 + }, + { + "epoch": 0.32, + "learning_rate": 1.6009129770456962e-05, + "loss": 0.4719, + "step": 2765 + }, + { + "epoch": 0.32, + "learning_rate": 1.6006170574324033e-05, + "loss": 0.4681, + "step": 2766 + }, + { + "epoch": 0.32, + "learning_rate": 1.6003210555211635e-05, + "loss": 0.4753, + "step": 2767 + }, + { + "epoch": 0.32, + "learning_rate": 1.6000249713525366e-05, + "loss": 0.4674, + "step": 2768 + }, + { + 
"epoch": 0.32, + "learning_rate": 1.5997288049670924e-05, + "loss": 0.4563, + "step": 2769 + }, + { + "epoch": 0.32, + "learning_rate": 1.5994325564054122e-05, + "loss": 0.5064, + "step": 2770 + }, + { + "epoch": 0.32, + "learning_rate": 1.599136225708089e-05, + "loss": 0.4761, + "step": 2771 + }, + { + "epoch": 0.32, + "learning_rate": 1.598839812915726e-05, + "loss": 0.4571, + "step": 2772 + }, + { + "epoch": 0.32, + "learning_rate": 1.598543318068939e-05, + "loss": 0.4861, + "step": 2773 + }, + { + "epoch": 0.32, + "learning_rate": 1.5982467412083543e-05, + "loss": 0.4579, + "step": 2774 + }, + { + "epoch": 0.32, + "learning_rate": 1.5979500823746096e-05, + "loss": 0.4937, + "step": 2775 + }, + { + "epoch": 0.32, + "learning_rate": 1.5976533416083535e-05, + "loss": 0.4646, + "step": 2776 + }, + { + "epoch": 0.32, + "learning_rate": 1.5973565189502463e-05, + "loss": 0.4636, + "step": 2777 + }, + { + "epoch": 0.32, + "learning_rate": 1.5970596144409595e-05, + "loss": 0.4741, + "step": 2778 + }, + { + "epoch": 0.32, + "learning_rate": 1.5967626281211754e-05, + "loss": 0.4716, + "step": 2779 + }, + { + "epoch": 0.32, + "learning_rate": 1.596465560031588e-05, + "loss": 0.4568, + "step": 2780 + }, + { + "epoch": 0.32, + "learning_rate": 1.5961684102129015e-05, + "loss": 0.4772, + "step": 2781 + }, + { + "epoch": 0.32, + "learning_rate": 1.5958711787058332e-05, + "loss": 0.4642, + "step": 2782 + }, + { + "epoch": 0.32, + "learning_rate": 1.5955738655511094e-05, + "loss": 0.4938, + "step": 2783 + }, + { + "epoch": 0.32, + "learning_rate": 1.5952764707894696e-05, + "loss": 0.4634, + "step": 2784 + }, + { + "epoch": 0.32, + "learning_rate": 1.594978994461663e-05, + "loss": 0.4767, + "step": 2785 + }, + { + "epoch": 0.32, + "learning_rate": 1.5946814366084505e-05, + "loss": 0.4734, + "step": 2786 + }, + { + "epoch": 0.32, + "learning_rate": 1.594383797270604e-05, + "loss": 0.4918, + "step": 2787 + }, + { + "epoch": 0.32, + "learning_rate": 1.5940860764889073e-05, + "loss": 
0.4859, + "step": 2788 + }, + { + "epoch": 0.32, + "learning_rate": 1.5937882743041543e-05, + "loss": 0.462, + "step": 2789 + }, + { + "epoch": 0.32, + "learning_rate": 1.5934903907571507e-05, + "loss": 0.4686, + "step": 2790 + }, + { + "epoch": 0.32, + "learning_rate": 1.593192425888713e-05, + "loss": 0.4752, + "step": 2791 + }, + { + "epoch": 0.32, + "learning_rate": 1.5928943797396695e-05, + "loss": 0.4988, + "step": 2792 + }, + { + "epoch": 0.32, + "learning_rate": 1.592596252350859e-05, + "loss": 0.4771, + "step": 2793 + }, + { + "epoch": 0.32, + "learning_rate": 1.5922980437631314e-05, + "loss": 0.4763, + "step": 2794 + }, + { + "epoch": 0.32, + "learning_rate": 1.591999754017348e-05, + "loss": 0.4701, + "step": 2795 + }, + { + "epoch": 0.32, + "learning_rate": 1.5917013831543814e-05, + "loss": 0.4927, + "step": 2796 + }, + { + "epoch": 0.32, + "learning_rate": 1.5914029312151146e-05, + "loss": 0.4832, + "step": 2797 + }, + { + "epoch": 0.32, + "learning_rate": 1.5911043982404426e-05, + "loss": 0.475, + "step": 2798 + }, + { + "epoch": 0.32, + "learning_rate": 1.590805784271271e-05, + "loss": 0.4782, + "step": 2799 + }, + { + "epoch": 0.32, + "learning_rate": 1.5905070893485165e-05, + "loss": 0.4783, + "step": 2800 + }, + { + "epoch": 0.32, + "learning_rate": 1.5902083135131067e-05, + "loss": 0.514, + "step": 2801 + }, + { + "epoch": 0.32, + "learning_rate": 1.5899094568059812e-05, + "loss": 0.4526, + "step": 2802 + }, + { + "epoch": 0.32, + "learning_rate": 1.58961051926809e-05, + "loss": 0.4837, + "step": 2803 + }, + { + "epoch": 0.32, + "learning_rate": 1.5893115009403932e-05, + "loss": 0.4742, + "step": 2804 + }, + { + "epoch": 0.32, + "learning_rate": 1.589012401863864e-05, + "loss": 0.4731, + "step": 2805 + }, + { + "epoch": 0.32, + "learning_rate": 1.5887132220794855e-05, + "loss": 0.4697, + "step": 2806 + }, + { + "epoch": 0.32, + "learning_rate": 1.5884139616282517e-05, + "loss": 0.4692, + "step": 2807 + }, + { + "epoch": 0.32, + "learning_rate": 
1.5881146205511683e-05, + "loss": 0.4793, + "step": 2808 + }, + { + "epoch": 0.32, + "learning_rate": 1.5878151988892513e-05, + "loss": 0.4826, + "step": 2809 + }, + { + "epoch": 0.32, + "learning_rate": 1.5875156966835285e-05, + "loss": 0.4683, + "step": 2810 + }, + { + "epoch": 0.32, + "learning_rate": 1.5872161139750384e-05, + "loss": 0.4728, + "step": 2811 + }, + { + "epoch": 0.32, + "learning_rate": 1.5869164508048304e-05, + "loss": 0.4519, + "step": 2812 + }, + { + "epoch": 0.32, + "learning_rate": 1.5866167072139645e-05, + "loss": 0.4566, + "step": 2813 + }, + { + "epoch": 0.32, + "learning_rate": 1.5863168832435137e-05, + "loss": 0.4898, + "step": 2814 + }, + { + "epoch": 0.32, + "learning_rate": 1.5860169789345592e-05, + "loss": 0.472, + "step": 2815 + }, + { + "epoch": 0.32, + "learning_rate": 1.5857169943281948e-05, + "loss": 0.4816, + "step": 2816 + }, + { + "epoch": 0.32, + "learning_rate": 1.585416929465526e-05, + "loss": 0.4718, + "step": 2817 + }, + { + "epoch": 0.32, + "learning_rate": 1.585116784387667e-05, + "loss": 0.4443, + "step": 2818 + }, + { + "epoch": 0.32, + "learning_rate": 1.5848165591357458e-05, + "loss": 0.4654, + "step": 2819 + }, + { + "epoch": 0.32, + "learning_rate": 1.584516253750899e-05, + "loss": 0.4793, + "step": 2820 + }, + { + "epoch": 0.32, + "learning_rate": 1.5842158682742756e-05, + "loss": 0.4703, + "step": 2821 + }, + { + "epoch": 0.32, + "learning_rate": 1.5839154027470346e-05, + "loss": 0.4631, + "step": 2822 + }, + { + "epoch": 0.32, + "learning_rate": 1.583614857210347e-05, + "loss": 0.4858, + "step": 2823 + }, + { + "epoch": 0.32, + "learning_rate": 1.5833142317053943e-05, + "loss": 0.4655, + "step": 2824 + }, + { + "epoch": 0.32, + "learning_rate": 1.5830135262733684e-05, + "loss": 0.4708, + "step": 2825 + }, + { + "epoch": 0.32, + "learning_rate": 1.582712740955473e-05, + "loss": 0.477, + "step": 2826 + }, + { + "epoch": 0.32, + "learning_rate": 1.5824118757929224e-05, + "loss": 0.4869, + "step": 2827 + }, + { + 
"epoch": 0.32, + "learning_rate": 1.5821109308269416e-05, + "loss": 0.4708, + "step": 2828 + }, + { + "epoch": 0.32, + "learning_rate": 1.581809906098767e-05, + "loss": 0.4834, + "step": 2829 + }, + { + "epoch": 0.32, + "learning_rate": 1.581508801649646e-05, + "loss": 0.4669, + "step": 2830 + }, + { + "epoch": 0.32, + "learning_rate": 1.581207617520836e-05, + "loss": 0.4761, + "step": 2831 + }, + { + "epoch": 0.32, + "learning_rate": 1.5809063537536066e-05, + "loss": 0.4765, + "step": 2832 + }, + { + "epoch": 0.32, + "learning_rate": 1.580605010389237e-05, + "loss": 0.4775, + "step": 2833 + }, + { + "epoch": 0.32, + "learning_rate": 1.5803035874690186e-05, + "loss": 0.4697, + "step": 2834 + }, + { + "epoch": 0.32, + "learning_rate": 1.5800020850342524e-05, + "loss": 0.4574, + "step": 2835 + }, + { + "epoch": 0.32, + "learning_rate": 1.5797005031262514e-05, + "loss": 0.4732, + "step": 2836 + }, + { + "epoch": 0.32, + "learning_rate": 1.579398841786339e-05, + "loss": 0.492, + "step": 2837 + }, + { + "epoch": 0.32, + "learning_rate": 1.57909710105585e-05, + "loss": 0.4496, + "step": 2838 + }, + { + "epoch": 0.32, + "learning_rate": 1.5787952809761286e-05, + "loss": 0.478, + "step": 2839 + }, + { + "epoch": 0.32, + "learning_rate": 1.5784933815885315e-05, + "loss": 0.4775, + "step": 2840 + }, + { + "epoch": 0.32, + "learning_rate": 1.5781914029344254e-05, + "loss": 0.4648, + "step": 2841 + }, + { + "epoch": 0.32, + "learning_rate": 1.5778893450551888e-05, + "loss": 0.4642, + "step": 2842 + }, + { + "epoch": 0.32, + "learning_rate": 1.5775872079922098e-05, + "loss": 0.4863, + "step": 2843 + }, + { + "epoch": 0.33, + "learning_rate": 1.5772849917868876e-05, + "loss": 0.4598, + "step": 2844 + }, + { + "epoch": 0.33, + "learning_rate": 1.576982696480633e-05, + "loss": 0.478, + "step": 2845 + }, + { + "epoch": 0.33, + "learning_rate": 1.5766803221148676e-05, + "loss": 0.4735, + "step": 2846 + }, + { + "epoch": 0.33, + "learning_rate": 1.5763778687310224e-05, + "loss": 
0.4723, + "step": 2847 + }, + { + "epoch": 0.33, + "learning_rate": 1.5760753363705412e-05, + "loss": 0.4484, + "step": 2848 + }, + { + "epoch": 0.33, + "learning_rate": 1.5757727250748773e-05, + "loss": 0.4742, + "step": 2849 + }, + { + "epoch": 0.33, + "learning_rate": 1.5754700348854955e-05, + "loss": 0.4912, + "step": 2850 + }, + { + "epoch": 0.33, + "learning_rate": 1.5751672658438707e-05, + "loss": 0.4788, + "step": 2851 + }, + { + "epoch": 0.33, + "learning_rate": 1.574864417991489e-05, + "loss": 0.4717, + "step": 2852 + }, + { + "epoch": 0.33, + "learning_rate": 1.5745614913698478e-05, + "loss": 0.5, + "step": 2853 + }, + { + "epoch": 0.33, + "learning_rate": 1.5742584860204547e-05, + "loss": 0.4665, + "step": 2854 + }, + { + "epoch": 0.33, + "learning_rate": 1.5739554019848274e-05, + "loss": 0.471, + "step": 2855 + }, + { + "epoch": 0.33, + "learning_rate": 1.5736522393044962e-05, + "loss": 0.4684, + "step": 2856 + }, + { + "epoch": 0.33, + "learning_rate": 1.5733489980210007e-05, + "loss": 0.4837, + "step": 2857 + }, + { + "epoch": 0.33, + "learning_rate": 1.573045678175892e-05, + "loss": 0.4908, + "step": 2858 + }, + { + "epoch": 0.33, + "learning_rate": 1.5727422798107313e-05, + "loss": 0.4824, + "step": 2859 + }, + { + "epoch": 0.33, + "learning_rate": 1.5724388029670912e-05, + "loss": 0.4636, + "step": 2860 + }, + { + "epoch": 0.33, + "learning_rate": 1.5721352476865546e-05, + "loss": 0.4617, + "step": 2861 + }, + { + "epoch": 0.33, + "learning_rate": 1.5718316140107156e-05, + "loss": 0.4884, + "step": 2862 + }, + { + "epoch": 0.33, + "learning_rate": 1.5715279019811783e-05, + "loss": 0.4766, + "step": 2863 + }, + { + "epoch": 0.33, + "learning_rate": 1.571224111639559e-05, + "loss": 0.4708, + "step": 2864 + }, + { + "epoch": 0.33, + "learning_rate": 1.570920243027483e-05, + "loss": 0.4695, + "step": 2865 + }, + { + "epoch": 0.33, + "learning_rate": 1.5706162961865866e-05, + "loss": 0.4738, + "step": 2866 + }, + { + "epoch": 0.33, + "learning_rate": 
1.570312271158519e-05, + "loss": 0.4878, + "step": 2867 + }, + { + "epoch": 0.33, + "learning_rate": 1.5700081679849362e-05, + "loss": 0.4698, + "step": 2868 + }, + { + "epoch": 0.33, + "learning_rate": 1.569703986707509e-05, + "loss": 0.4732, + "step": 2869 + }, + { + "epoch": 0.33, + "learning_rate": 1.5693997273679165e-05, + "loss": 0.5003, + "step": 2870 + }, + { + "epoch": 0.33, + "learning_rate": 1.5690953900078485e-05, + "loss": 0.4552, + "step": 2871 + }, + { + "epoch": 0.33, + "learning_rate": 1.5687909746690064e-05, + "loss": 0.472, + "step": 2872 + }, + { + "epoch": 0.33, + "learning_rate": 1.568486481393102e-05, + "loss": 0.4709, + "step": 2873 + }, + { + "epoch": 0.33, + "learning_rate": 1.5681819102218572e-05, + "loss": 0.4626, + "step": 2874 + }, + { + "epoch": 0.33, + "learning_rate": 1.5678772611970056e-05, + "loss": 0.4704, + "step": 2875 + }, + { + "epoch": 0.33, + "learning_rate": 1.5675725343602904e-05, + "loss": 0.4631, + "step": 2876 + }, + { + "epoch": 0.33, + "learning_rate": 1.5672677297534665e-05, + "loss": 0.4735, + "step": 2877 + }, + { + "epoch": 0.33, + "learning_rate": 1.566962847418299e-05, + "loss": 0.4643, + "step": 2878 + }, + { + "epoch": 0.33, + "learning_rate": 1.5666578873965627e-05, + "loss": 0.4823, + "step": 2879 + }, + { + "epoch": 0.33, + "learning_rate": 1.566352849730045e-05, + "loss": 0.4623, + "step": 2880 + }, + { + "epoch": 0.33, + "learning_rate": 1.566047734460542e-05, + "loss": 0.4846, + "step": 2881 + }, + { + "epoch": 0.33, + "learning_rate": 1.5657425416298623e-05, + "loss": 0.4666, + "step": 2882 + }, + { + "epoch": 0.33, + "learning_rate": 1.565437271279823e-05, + "loss": 0.4601, + "step": 2883 + }, + { + "epoch": 0.33, + "learning_rate": 1.5651319234522538e-05, + "loss": 0.4834, + "step": 2884 + }, + { + "epoch": 0.33, + "learning_rate": 1.5648264981889936e-05, + "loss": 0.4699, + "step": 2885 + }, + { + "epoch": 0.33, + "learning_rate": 1.564520995531893e-05, + "loss": 0.4772, + "step": 2886 + }, + { + 
"epoch": 0.33, + "learning_rate": 1.5642154155228124e-05, + "loss": 0.4867, + "step": 2887 + }, + { + "epoch": 0.33, + "learning_rate": 1.5639097582036226e-05, + "loss": 0.4585, + "step": 2888 + }, + { + "epoch": 0.33, + "learning_rate": 1.5636040236162066e-05, + "loss": 0.4807, + "step": 2889 + }, + { + "epoch": 0.33, + "learning_rate": 1.5632982118024556e-05, + "loss": 0.4746, + "step": 2890 + }, + { + "epoch": 0.33, + "learning_rate": 1.562992322804274e-05, + "loss": 0.4685, + "step": 2891 + }, + { + "epoch": 0.33, + "learning_rate": 1.5626863566635744e-05, + "loss": 0.4727, + "step": 2892 + }, + { + "epoch": 0.33, + "learning_rate": 1.5623803134222812e-05, + "loss": 0.4873, + "step": 2893 + }, + { + "epoch": 0.33, + "learning_rate": 1.5620741931223292e-05, + "loss": 0.4863, + "step": 2894 + }, + { + "epoch": 0.33, + "learning_rate": 1.5617679958056643e-05, + "loss": 0.4903, + "step": 2895 + }, + { + "epoch": 0.33, + "learning_rate": 1.5614617215142412e-05, + "loss": 0.4736, + "step": 2896 + }, + { + "epoch": 0.33, + "learning_rate": 1.5611553702900275e-05, + "loss": 0.4637, + "step": 2897 + }, + { + "epoch": 0.33, + "learning_rate": 1.5608489421749995e-05, + "loss": 0.4758, + "step": 2898 + }, + { + "epoch": 0.33, + "learning_rate": 1.5605424372111447e-05, + "loss": 0.4812, + "step": 2899 + }, + { + "epoch": 0.33, + "learning_rate": 1.5602358554404613e-05, + "loss": 0.4662, + "step": 2900 + }, + { + "epoch": 0.33, + "learning_rate": 1.5599291969049575e-05, + "loss": 0.4708, + "step": 2901 + }, + { + "epoch": 0.33, + "learning_rate": 1.5596224616466527e-05, + "loss": 0.469, + "step": 2902 + }, + { + "epoch": 0.33, + "learning_rate": 1.5593156497075767e-05, + "loss": 0.4772, + "step": 2903 + }, + { + "epoch": 0.33, + "learning_rate": 1.5590087611297694e-05, + "loss": 0.484, + "step": 2904 + }, + { + "epoch": 0.33, + "learning_rate": 1.558701795955281e-05, + "loss": 0.468, + "step": 2905 + }, + { + "epoch": 0.33, + "learning_rate": 1.558394754226173e-05, + "loss": 
0.4638, + "step": 2906 + }, + { + "epoch": 0.33, + "learning_rate": 1.5580876359845166e-05, + "loss": 0.4748, + "step": 2907 + }, + { + "epoch": 0.33, + "learning_rate": 1.557780441272395e-05, + "loss": 0.4646, + "step": 2908 + }, + { + "epoch": 0.33, + "learning_rate": 1.5574731701318987e-05, + "loss": 0.4657, + "step": 2909 + }, + { + "epoch": 0.33, + "learning_rate": 1.5571658226051325e-05, + "loss": 0.5036, + "step": 2910 + }, + { + "epoch": 0.33, + "learning_rate": 1.556858398734209e-05, + "loss": 0.478, + "step": 2911 + }, + { + "epoch": 0.33, + "learning_rate": 1.5565508985612525e-05, + "loss": 0.4641, + "step": 2912 + }, + { + "epoch": 0.33, + "learning_rate": 1.556243322128397e-05, + "loss": 0.4988, + "step": 2913 + }, + { + "epoch": 0.33, + "learning_rate": 1.5559356694777882e-05, + "loss": 0.4682, + "step": 2914 + }, + { + "epoch": 0.33, + "learning_rate": 1.5556279406515802e-05, + "loss": 0.4605, + "step": 2915 + }, + { + "epoch": 0.33, + "learning_rate": 1.5553201356919394e-05, + "loss": 0.4572, + "step": 2916 + }, + { + "epoch": 0.33, + "learning_rate": 1.555012254641042e-05, + "loss": 0.4649, + "step": 2917 + }, + { + "epoch": 0.33, + "learning_rate": 1.554704297541074e-05, + "loss": 0.4938, + "step": 2918 + }, + { + "epoch": 0.33, + "learning_rate": 1.5543962644342335e-05, + "loss": 0.4751, + "step": 2919 + }, + { + "epoch": 0.33, + "learning_rate": 1.5540881553627264e-05, + "loss": 0.4628, + "step": 2920 + }, + { + "epoch": 0.33, + "learning_rate": 1.553779970368772e-05, + "loss": 0.4738, + "step": 2921 + }, + { + "epoch": 0.33, + "learning_rate": 1.553471709494598e-05, + "loss": 0.4807, + "step": 2922 + }, + { + "epoch": 0.33, + "learning_rate": 1.5531633727824423e-05, + "loss": 0.456, + "step": 2923 + }, + { + "epoch": 0.33, + "learning_rate": 1.5528549602745545e-05, + "loss": 0.4865, + "step": 2924 + }, + { + "epoch": 0.33, + "learning_rate": 1.5525464720131945e-05, + "loss": 0.4608, + "step": 2925 + }, + { + "epoch": 0.33, + "learning_rate": 
1.5522379080406315e-05, + "loss": 0.4985, + "step": 2926 + }, + { + "epoch": 0.33, + "learning_rate": 1.5519292683991455e-05, + "loss": 0.4733, + "step": 2927 + }, + { + "epoch": 0.33, + "learning_rate": 1.5516205531310272e-05, + "loss": 0.4852, + "step": 2928 + }, + { + "epoch": 0.33, + "learning_rate": 1.5513117622785778e-05, + "loss": 0.4641, + "step": 2929 + }, + { + "epoch": 0.33, + "learning_rate": 1.5510028958841085e-05, + "loss": 0.4682, + "step": 2930 + }, + { + "epoch": 0.33, + "learning_rate": 1.5506939539899403e-05, + "loss": 0.4544, + "step": 2931 + }, + { + "epoch": 0.34, + "learning_rate": 1.5503849366384053e-05, + "loss": 0.4609, + "step": 2932 + }, + { + "epoch": 0.34, + "learning_rate": 1.5500758438718463e-05, + "loss": 0.4947, + "step": 2933 + }, + { + "epoch": 0.34, + "learning_rate": 1.5497666757326157e-05, + "loss": 0.4783, + "step": 2934 + }, + { + "epoch": 0.34, + "learning_rate": 1.5494574322630765e-05, + "loss": 0.4702, + "step": 2935 + }, + { + "epoch": 0.34, + "learning_rate": 1.5491481135056012e-05, + "loss": 0.5003, + "step": 2936 + }, + { + "epoch": 0.34, + "learning_rate": 1.5488387195025745e-05, + "loss": 0.4549, + "step": 2937 + }, + { + "epoch": 0.34, + "learning_rate": 1.5485292502963892e-05, + "loss": 0.4735, + "step": 2938 + }, + { + "epoch": 0.34, + "learning_rate": 1.548219705929451e-05, + "loss": 0.4817, + "step": 2939 + }, + { + "epoch": 0.34, + "learning_rate": 1.5479100864441726e-05, + "loss": 0.4679, + "step": 2940 + }, + { + "epoch": 0.34, + "learning_rate": 1.54760039188298e-05, + "loss": 0.4699, + "step": 2941 + }, + { + "epoch": 0.34, + "learning_rate": 1.5472906222883075e-05, + "loss": 0.4791, + "step": 2942 + }, + { + "epoch": 0.34, + "learning_rate": 1.5469807777026014e-05, + "loss": 0.4749, + "step": 2943 + }, + { + "epoch": 0.34, + "learning_rate": 1.5466708581683164e-05, + "loss": 0.4845, + "step": 2944 + }, + { + "epoch": 0.34, + "learning_rate": 1.546360863727919e-05, + "loss": 0.4682, + "step": 2945 + }, + { 
+ "epoch": 0.34, + "learning_rate": 1.546050794423885e-05, + "loss": 0.4739, + "step": 2946 + }, + { + "epoch": 0.34, + "learning_rate": 1.5457406502987007e-05, + "loss": 0.4703, + "step": 2947 + }, + { + "epoch": 0.34, + "learning_rate": 1.5454304313948635e-05, + "loss": 0.486, + "step": 2948 + }, + { + "epoch": 0.34, + "learning_rate": 1.5451201377548793e-05, + "loss": 0.4633, + "step": 2949 + }, + { + "epoch": 0.34, + "learning_rate": 1.5448097694212663e-05, + "loss": 0.4788, + "step": 2950 + }, + { + "epoch": 0.34, + "learning_rate": 1.544499326436551e-05, + "loss": 0.4686, + "step": 2951 + }, + { + "epoch": 0.34, + "learning_rate": 1.5441888088432716e-05, + "loss": 0.482, + "step": 2952 + }, + { + "epoch": 0.34, + "learning_rate": 1.5438782166839757e-05, + "loss": 0.4666, + "step": 2953 + }, + { + "epoch": 0.34, + "learning_rate": 1.5435675500012212e-05, + "loss": 0.4929, + "step": 2954 + }, + { + "epoch": 0.34, + "learning_rate": 1.5432568088375766e-05, + "loss": 0.4673, + "step": 2955 + }, + { + "epoch": 0.34, + "learning_rate": 1.542945993235621e-05, + "loss": 0.4798, + "step": 2956 + }, + { + "epoch": 0.34, + "learning_rate": 1.5426351032379418e-05, + "loss": 0.4764, + "step": 2957 + }, + { + "epoch": 0.34, + "learning_rate": 1.5423241388871383e-05, + "loss": 0.4605, + "step": 2958 + }, + { + "epoch": 0.34, + "learning_rate": 1.54201310022582e-05, + "loss": 0.4917, + "step": 2959 + }, + { + "epoch": 0.34, + "learning_rate": 1.541701987296606e-05, + "loss": 0.468, + "step": 2960 + }, + { + "epoch": 0.34, + "learning_rate": 1.5413908001421257e-05, + "loss": 0.4596, + "step": 2961 + }, + { + "epoch": 0.34, + "learning_rate": 1.5410795388050182e-05, + "loss": 0.488, + "step": 2962 + }, + { + "epoch": 0.34, + "learning_rate": 1.540768203327934e-05, + "loss": 0.4573, + "step": 2963 + }, + { + "epoch": 0.34, + "learning_rate": 1.5404567937535326e-05, + "loss": 0.4672, + "step": 2964 + }, + { + "epoch": 0.34, + "learning_rate": 1.540145310124484e-05, + "loss": 
0.4718, + "step": 2965 + }, + { + "epoch": 0.34, + "learning_rate": 1.5398337524834688e-05, + "loss": 0.4789, + "step": 2966 + }, + { + "epoch": 0.34, + "learning_rate": 1.5395221208731766e-05, + "loss": 0.4671, + "step": 2967 + }, + { + "epoch": 0.34, + "learning_rate": 1.5392104153363086e-05, + "loss": 0.4992, + "step": 2968 + }, + { + "epoch": 0.34, + "learning_rate": 1.538898635915576e-05, + "loss": 0.4666, + "step": 2969 + }, + { + "epoch": 0.34, + "learning_rate": 1.5385867826536977e-05, + "loss": 0.486, + "step": 2970 + }, + { + "epoch": 0.34, + "learning_rate": 1.5382748555934058e-05, + "loss": 0.4593, + "step": 2971 + }, + { + "epoch": 0.34, + "learning_rate": 1.5379628547774412e-05, + "loss": 0.4736, + "step": 2972 + }, + { + "epoch": 0.34, + "learning_rate": 1.5376507802485547e-05, + "loss": 0.4685, + "step": 2973 + }, + { + "epoch": 0.34, + "learning_rate": 1.537338632049508e-05, + "loss": 0.4572, + "step": 2974 + }, + { + "epoch": 0.34, + "learning_rate": 1.5370264102230716e-05, + "loss": 0.4599, + "step": 2975 + }, + { + "epoch": 0.34, + "learning_rate": 1.5367141148120275e-05, + "loss": 0.4735, + "step": 2976 + }, + { + "epoch": 0.34, + "learning_rate": 1.5364017458591668e-05, + "loss": 0.476, + "step": 2977 + }, + { + "epoch": 0.34, + "learning_rate": 1.536089303407291e-05, + "loss": 0.4663, + "step": 2978 + }, + { + "epoch": 0.34, + "learning_rate": 1.535776787499212e-05, + "loss": 0.4701, + "step": 2979 + }, + { + "epoch": 0.34, + "learning_rate": 1.5354641981777514e-05, + "loss": 0.483, + "step": 2980 + }, + { + "epoch": 0.34, + "learning_rate": 1.5351515354857404e-05, + "loss": 0.4585, + "step": 2981 + }, + { + "epoch": 0.34, + "learning_rate": 1.5348387994660214e-05, + "loss": 0.4679, + "step": 2982 + }, + { + "epoch": 0.34, + "learning_rate": 1.534525990161446e-05, + "loss": 0.4726, + "step": 2983 + }, + { + "epoch": 0.34, + "learning_rate": 1.534213107614876e-05, + "loss": 0.4763, + "step": 2984 + }, + { + "epoch": 0.34, + "learning_rate": 
1.5339001518691833e-05, + "loss": 0.4865, + "step": 2985 + }, + { + "epoch": 0.34, + "learning_rate": 1.5335871229672496e-05, + "loss": 0.4598, + "step": 2986 + }, + { + "epoch": 0.34, + "learning_rate": 1.5332740209519674e-05, + "loss": 0.4754, + "step": 2987 + }, + { + "epoch": 0.34, + "learning_rate": 1.5329608458662383e-05, + "loss": 0.4915, + "step": 2988 + }, + { + "epoch": 0.34, + "learning_rate": 1.5326475977529745e-05, + "loss": 0.4633, + "step": 2989 + }, + { + "epoch": 0.34, + "learning_rate": 1.5323342766550978e-05, + "loss": 0.4808, + "step": 2990 + }, + { + "epoch": 0.34, + "learning_rate": 1.53202088261554e-05, + "loss": 0.4753, + "step": 2991 + }, + { + "epoch": 0.34, + "learning_rate": 1.5317074156772434e-05, + "loss": 0.4628, + "step": 2992 + }, + { + "epoch": 0.34, + "learning_rate": 1.5313938758831596e-05, + "loss": 0.4698, + "step": 2993 + }, + { + "epoch": 0.34, + "learning_rate": 1.531080263276251e-05, + "loss": 0.4773, + "step": 2994 + }, + { + "epoch": 0.34, + "learning_rate": 1.5307665778994897e-05, + "loss": 0.4576, + "step": 2995 + }, + { + "epoch": 0.34, + "learning_rate": 1.5304528197958565e-05, + "loss": 0.4724, + "step": 2996 + }, + { + "epoch": 0.34, + "learning_rate": 1.5301389890083446e-05, + "loss": 0.4608, + "step": 2997 + }, + { + "epoch": 0.34, + "learning_rate": 1.529825085579955e-05, + "loss": 0.4734, + "step": 2998 + }, + { + "epoch": 0.34, + "learning_rate": 1.5295111095536997e-05, + "loss": 0.4648, + "step": 2999 + }, + { + "epoch": 0.34, + "learning_rate": 1.5291970609726008e-05, + "loss": 0.454, + "step": 3000 + }, + { + "epoch": 0.34, + "learning_rate": 1.5288829398796892e-05, + "loss": 0.4805, + "step": 3001 + }, + { + "epoch": 0.34, + "learning_rate": 1.528568746318007e-05, + "loss": 0.4806, + "step": 3002 + }, + { + "epoch": 0.34, + "learning_rate": 1.5282544803306056e-05, + "loss": 0.4626, + "step": 3003 + }, + { + "epoch": 0.34, + "learning_rate": 1.5279401419605466e-05, + "loss": 0.4611, + "step": 3004 + }, + { + 
"epoch": 0.34, + "learning_rate": 1.527625731250901e-05, + "loss": 0.4811, + "step": 3005 + }, + { + "epoch": 0.34, + "learning_rate": 1.527311248244751e-05, + "loss": 0.4894, + "step": 3006 + }, + { + "epoch": 0.34, + "learning_rate": 1.5269966929851866e-05, + "loss": 0.4652, + "step": 3007 + }, + { + "epoch": 0.34, + "learning_rate": 1.52668206551531e-05, + "loss": 0.4701, + "step": 3008 + }, + { + "epoch": 0.34, + "learning_rate": 1.526367365878231e-05, + "loss": 0.4649, + "step": 3009 + }, + { + "epoch": 0.34, + "learning_rate": 1.526052594117071e-05, + "loss": 0.4541, + "step": 3010 + }, + { + "epoch": 0.34, + "learning_rate": 1.5257377502749614e-05, + "loss": 0.4898, + "step": 3011 + }, + { + "epoch": 0.34, + "learning_rate": 1.525422834395042e-05, + "loss": 0.4587, + "step": 3012 + }, + { + "epoch": 0.34, + "learning_rate": 1.525107846520464e-05, + "loss": 0.4891, + "step": 3013 + }, + { + "epoch": 0.34, + "learning_rate": 1.5247927866943869e-05, + "loss": 0.4719, + "step": 3014 + }, + { + "epoch": 0.34, + "learning_rate": 1.5244776549599816e-05, + "loss": 0.4638, + "step": 3015 + }, + { + "epoch": 0.34, + "learning_rate": 1.5241624513604281e-05, + "loss": 0.4802, + "step": 3016 + }, + { + "epoch": 0.34, + "learning_rate": 1.523847175938916e-05, + "loss": 0.4526, + "step": 3017 + }, + { + "epoch": 0.34, + "learning_rate": 1.5235318287386455e-05, + "loss": 0.4785, + "step": 3018 + }, + { + "epoch": 0.35, + "learning_rate": 1.5232164098028257e-05, + "loss": 0.4786, + "step": 3019 + }, + { + "epoch": 0.35, + "learning_rate": 1.5229009191746769e-05, + "loss": 0.4906, + "step": 3020 + }, + { + "epoch": 0.35, + "learning_rate": 1.5225853568974271e-05, + "loss": 0.4532, + "step": 3021 + }, + { + "epoch": 0.35, + "learning_rate": 1.5222697230143166e-05, + "loss": 0.4748, + "step": 3022 + }, + { + "epoch": 0.35, + "learning_rate": 1.5219540175685938e-05, + "loss": 0.4803, + "step": 3023 + }, + { + "epoch": 0.35, + "learning_rate": 1.521638240603517e-05, + "loss": 
0.4751, + "step": 3024 + }, + { + "epoch": 0.35, + "learning_rate": 1.5213223921623553e-05, + "loss": 0.4679, + "step": 3025 + }, + { + "epoch": 0.35, + "learning_rate": 1.5210064722883865e-05, + "loss": 0.4836, + "step": 3026 + }, + { + "epoch": 0.35, + "learning_rate": 1.5206904810248992e-05, + "loss": 0.4656, + "step": 3027 + }, + { + "epoch": 0.35, + "learning_rate": 1.5203744184151907e-05, + "loss": 0.4757, + "step": 3028 + }, + { + "epoch": 0.35, + "learning_rate": 1.5200582845025688e-05, + "loss": 0.4992, + "step": 3029 + }, + { + "epoch": 0.35, + "learning_rate": 1.5197420793303514e-05, + "loss": 0.4701, + "step": 3030 + }, + { + "epoch": 0.35, + "learning_rate": 1.5194258029418657e-05, + "loss": 0.4565, + "step": 3031 + }, + { + "epoch": 0.35, + "learning_rate": 1.5191094553804476e-05, + "loss": 0.4628, + "step": 3032 + }, + { + "epoch": 0.35, + "learning_rate": 1.5187930366894442e-05, + "loss": 0.4777, + "step": 3033 + }, + { + "epoch": 0.35, + "learning_rate": 1.5184765469122122e-05, + "loss": 0.4835, + "step": 3034 + }, + { + "epoch": 0.35, + "learning_rate": 1.5181599860921182e-05, + "loss": 0.4564, + "step": 3035 + }, + { + "epoch": 0.35, + "learning_rate": 1.517843354272537e-05, + "loss": 0.4759, + "step": 3036 + }, + { + "epoch": 0.35, + "learning_rate": 1.517526651496855e-05, + "loss": 0.4583, + "step": 3037 + }, + { + "epoch": 0.35, + "learning_rate": 1.5172098778084672e-05, + "loss": 0.4962, + "step": 3038 + }, + { + "epoch": 0.35, + "learning_rate": 1.5168930332507791e-05, + "loss": 0.4479, + "step": 3039 + }, + { + "epoch": 0.35, + "learning_rate": 1.5165761178672052e-05, + "loss": 0.4767, + "step": 3040 + }, + { + "epoch": 0.35, + "learning_rate": 1.51625913170117e-05, + "loss": 0.4743, + "step": 3041 + }, + { + "epoch": 0.35, + "learning_rate": 1.5159420747961076e-05, + "loss": 0.4861, + "step": 3042 + }, + { + "epoch": 0.35, + "learning_rate": 1.5156249471954617e-05, + "loss": 0.46, + "step": 3043 + }, + { + "epoch": 0.35, + "learning_rate": 
1.5153077489426865e-05, + "loss": 0.468, + "step": 3044 + }, + { + "epoch": 0.35, + "learning_rate": 1.5149904800812448e-05, + "loss": 0.4918, + "step": 3045 + }, + { + "epoch": 0.35, + "learning_rate": 1.514673140654609e-05, + "loss": 0.4897, + "step": 3046 + }, + { + "epoch": 0.35, + "learning_rate": 1.514355730706263e-05, + "loss": 0.4717, + "step": 3047 + }, + { + "epoch": 0.35, + "learning_rate": 1.5140382502796978e-05, + "loss": 0.4652, + "step": 3048 + }, + { + "epoch": 0.35, + "learning_rate": 1.5137206994184159e-05, + "loss": 0.4705, + "step": 3049 + }, + { + "epoch": 0.35, + "learning_rate": 1.5134030781659288e-05, + "loss": 0.4599, + "step": 3050 + }, + { + "epoch": 0.35, + "learning_rate": 1.513085386565758e-05, + "loss": 0.4557, + "step": 3051 + }, + { + "epoch": 0.35, + "learning_rate": 1.5127676246614336e-05, + "loss": 0.4788, + "step": 3052 + }, + { + "epoch": 0.35, + "learning_rate": 1.5124497924964966e-05, + "loss": 0.4772, + "step": 3053 + }, + { + "epoch": 0.35, + "learning_rate": 1.512131890114497e-05, + "loss": 0.473, + "step": 3054 + }, + { + "epoch": 0.35, + "learning_rate": 1.5118139175589944e-05, + "loss": 0.4603, + "step": 3055 + }, + { + "epoch": 0.35, + "learning_rate": 1.5114958748735584e-05, + "loss": 0.4807, + "step": 3056 + }, + { + "epoch": 0.35, + "learning_rate": 1.5111777621017677e-05, + "loss": 0.4671, + "step": 3057 + }, + { + "epoch": 0.35, + "learning_rate": 1.5108595792872112e-05, + "loss": 0.4546, + "step": 3058 + }, + { + "epoch": 0.35, + "learning_rate": 1.5105413264734866e-05, + "loss": 0.4701, + "step": 3059 + }, + { + "epoch": 0.35, + "learning_rate": 1.5102230037042018e-05, + "loss": 0.4811, + "step": 3060 + }, + { + "epoch": 0.35, + "learning_rate": 1.5099046110229742e-05, + "loss": 0.4597, + "step": 3061 + }, + { + "epoch": 0.35, + "learning_rate": 1.5095861484734307e-05, + "loss": 0.4749, + "step": 3062 + }, + { + "epoch": 0.35, + "learning_rate": 1.5092676160992077e-05, + "loss": 0.4709, + "step": 3063 + }, + { + 
"epoch": 0.35, + "learning_rate": 1.5089490139439514e-05, + "loss": 0.4601, + "step": 3064 + }, + { + "epoch": 0.35, + "learning_rate": 1.508630342051317e-05, + "loss": 0.4482, + "step": 3065 + }, + { + "epoch": 0.35, + "learning_rate": 1.5083116004649703e-05, + "loss": 0.4873, + "step": 3066 + }, + { + "epoch": 0.35, + "learning_rate": 1.5079927892285855e-05, + "loss": 0.4526, + "step": 3067 + }, + { + "epoch": 0.35, + "learning_rate": 1.5076739083858472e-05, + "loss": 0.4646, + "step": 3068 + }, + { + "epoch": 0.35, + "learning_rate": 1.5073549579804493e-05, + "loss": 0.4813, + "step": 3069 + }, + { + "epoch": 0.35, + "learning_rate": 1.5070359380560944e-05, + "loss": 0.4693, + "step": 3070 + }, + { + "epoch": 0.35, + "learning_rate": 1.5067168486564959e-05, + "loss": 0.4967, + "step": 3071 + }, + { + "epoch": 0.35, + "learning_rate": 1.5063976898253763e-05, + "loss": 0.4671, + "step": 3072 + }, + { + "epoch": 0.35, + "learning_rate": 1.506078461606467e-05, + "loss": 0.4815, + "step": 3073 + }, + { + "epoch": 0.35, + "learning_rate": 1.5057591640435098e-05, + "loss": 0.471, + "step": 3074 + }, + { + "epoch": 0.35, + "learning_rate": 1.5054397971802557e-05, + "loss": 0.4657, + "step": 3075 + }, + { + "epoch": 0.35, + "learning_rate": 1.5051203610604643e-05, + "loss": 0.4786, + "step": 3076 + }, + { + "epoch": 0.35, + "learning_rate": 1.5048008557279064e-05, + "loss": 0.4743, + "step": 3077 + }, + { + "epoch": 0.35, + "learning_rate": 1.504481281226361e-05, + "loss": 0.4958, + "step": 3078 + }, + { + "epoch": 0.35, + "learning_rate": 1.504161637599617e-05, + "loss": 0.4849, + "step": 3079 + }, + { + "epoch": 0.35, + "learning_rate": 1.5038419248914725e-05, + "loss": 0.4544, + "step": 3080 + }, + { + "epoch": 0.35, + "learning_rate": 1.5035221431457352e-05, + "loss": 0.478, + "step": 3081 + }, + { + "epoch": 0.35, + "learning_rate": 1.5032022924062228e-05, + "loss": 0.4758, + "step": 3082 + }, + { + "epoch": 0.35, + "learning_rate": 1.5028823727167621e-05, + "loss": 
0.4713, + "step": 3083 + }, + { + "epoch": 0.35, + "learning_rate": 1.5025623841211885e-05, + "loss": 0.4591, + "step": 3084 + }, + { + "epoch": 0.35, + "learning_rate": 1.502242326663348e-05, + "loss": 0.457, + "step": 3085 + }, + { + "epoch": 0.35, + "learning_rate": 1.5019222003870954e-05, + "loss": 0.4923, + "step": 3086 + }, + { + "epoch": 0.35, + "learning_rate": 1.501602005336296e-05, + "loss": 0.4548, + "step": 3087 + }, + { + "epoch": 0.35, + "learning_rate": 1.5012817415548226e-05, + "loss": 0.4693, + "step": 3088 + }, + { + "epoch": 0.35, + "learning_rate": 1.500961409086559e-05, + "loss": 0.4745, + "step": 3089 + }, + { + "epoch": 0.35, + "learning_rate": 1.5006410079753974e-05, + "loss": 0.4642, + "step": 3090 + }, + { + "epoch": 0.35, + "learning_rate": 1.5003205382652409e-05, + "loss": 0.4622, + "step": 3091 + }, + { + "epoch": 0.35, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4943, + "step": 3092 + }, + { + "epoch": 0.35, + "learning_rate": 1.4996793932235965e-05, + "loss": 0.4657, + "step": 3093 + }, + { + "epoch": 0.35, + "learning_rate": 1.4993587179799598e-05, + "loss": 0.4849, + "step": 3094 + }, + { + "epoch": 0.35, + "learning_rate": 1.49903797431303e-05, + "loss": 0.4615, + "step": 3095 + }, + { + "epoch": 0.35, + "learning_rate": 1.4987171622667562e-05, + "loss": 0.4694, + "step": 3096 + }, + { + "epoch": 0.35, + "learning_rate": 1.4983962818850967e-05, + "loss": 0.4771, + "step": 3097 + }, + { + "epoch": 0.35, + "learning_rate": 1.4980753332120193e-05, + "loss": 0.4754, + "step": 3098 + }, + { + "epoch": 0.35, + "learning_rate": 1.4977543162915011e-05, + "loss": 0.4576, + "step": 3099 + }, + { + "epoch": 0.35, + "learning_rate": 1.4974332311675286e-05, + "loss": 0.4701, + "step": 3100 + }, + { + "epoch": 0.35, + "learning_rate": 1.497112077884098e-05, + "loss": 0.4787, + "step": 3101 + }, + { + "epoch": 0.35, + "learning_rate": 1.4967908564852137e-05, + "loss": 0.5031, + "step": 3102 + }, + { + "epoch": 0.35, + "learning_rate": 
1.4964695670148907e-05, + "loss": 0.47, + "step": 3103 + }, + { + "epoch": 0.35, + "learning_rate": 1.4961482095171529e-05, + "loss": 0.4716, + "step": 3104 + }, + { + "epoch": 0.35, + "learning_rate": 1.4958267840360332e-05, + "loss": 0.4739, + "step": 3105 + }, + { + "epoch": 0.35, + "learning_rate": 1.495505290615574e-05, + "loss": 0.4693, + "step": 3106 + }, + { + "epoch": 0.36, + "learning_rate": 1.4951837292998277e-05, + "loss": 0.4659, + "step": 3107 + }, + { + "epoch": 0.36, + "learning_rate": 1.4948621001328544e-05, + "loss": 0.4686, + "step": 3108 + }, + { + "epoch": 0.36, + "learning_rate": 1.4945404031587255e-05, + "loss": 0.4645, + "step": 3109 + }, + { + "epoch": 0.36, + "learning_rate": 1.4942186384215198e-05, + "loss": 0.4806, + "step": 3110 + }, + { + "epoch": 0.36, + "learning_rate": 1.4938968059653269e-05, + "loss": 0.4747, + "step": 3111 + }, + { + "epoch": 0.36, + "learning_rate": 1.4935749058342446e-05, + "loss": 0.4749, + "step": 3112 + }, + { + "epoch": 0.36, + "learning_rate": 1.4932529380723806e-05, + "loss": 0.4757, + "step": 3113 + }, + { + "epoch": 0.36, + "learning_rate": 1.4929309027238517e-05, + "loss": 0.4718, + "step": 3114 + }, + { + "epoch": 0.36, + "learning_rate": 1.4926087998327838e-05, + "loss": 0.4873, + "step": 3115 + }, + { + "epoch": 0.36, + "learning_rate": 1.4922866294433122e-05, + "loss": 0.458, + "step": 3116 + }, + { + "epoch": 0.36, + "learning_rate": 1.4919643915995816e-05, + "loss": 0.4518, + "step": 3117 + }, + { + "epoch": 0.36, + "learning_rate": 1.4916420863457456e-05, + "loss": 0.484, + "step": 3118 + }, + { + "epoch": 0.36, + "learning_rate": 1.4913197137259675e-05, + "loss": 0.4704, + "step": 3119 + }, + { + "epoch": 0.36, + "learning_rate": 1.490997273784419e-05, + "loss": 0.4703, + "step": 3120 + }, + { + "epoch": 0.36, + "learning_rate": 1.4906747665652821e-05, + "loss": 0.472, + "step": 3121 + }, + { + "epoch": 0.36, + "learning_rate": 1.4903521921127472e-05, + "loss": 0.4628, + "step": 3122 + }, + { + 
"epoch": 0.36, + "learning_rate": 1.4900295504710143e-05, + "loss": 0.4817, + "step": 3123 + }, + { + "epoch": 0.36, + "learning_rate": 1.4897068416842926e-05, + "loss": 0.464, + "step": 3124 + }, + { + "epoch": 0.36, + "learning_rate": 1.4893840657968001e-05, + "loss": 0.4822, + "step": 3125 + }, + { + "epoch": 0.36, + "learning_rate": 1.4890612228527648e-05, + "loss": 0.4618, + "step": 3126 + }, + { + "epoch": 0.36, + "learning_rate": 1.4887383128964232e-05, + "loss": 0.4776, + "step": 3127 + }, + { + "epoch": 0.36, + "learning_rate": 1.4884153359720205e-05, + "loss": 0.4775, + "step": 3128 + }, + { + "epoch": 0.36, + "learning_rate": 1.4880922921238128e-05, + "loss": 0.4653, + "step": 3129 + }, + { + "epoch": 0.36, + "learning_rate": 1.4877691813960638e-05, + "loss": 0.4933, + "step": 3130 + }, + { + "epoch": 0.36, + "learning_rate": 1.4874460038330469e-05, + "loss": 0.465, + "step": 3131 + }, + { + "epoch": 0.36, + "learning_rate": 1.4871227594790447e-05, + "loss": 0.4592, + "step": 3132 + }, + { + "epoch": 0.36, + "learning_rate": 1.4867994483783485e-05, + "loss": 0.4704, + "step": 3133 + }, + { + "epoch": 0.36, + "learning_rate": 1.48647607057526e-05, + "loss": 0.4597, + "step": 3134 + }, + { + "epoch": 0.36, + "learning_rate": 1.4861526261140886e-05, + "loss": 0.4649, + "step": 3135 + }, + { + "epoch": 0.36, + "learning_rate": 1.4858291150391533e-05, + "loss": 0.4881, + "step": 3136 + }, + { + "epoch": 0.36, + "learning_rate": 1.4855055373947829e-05, + "loss": 0.4686, + "step": 3137 + }, + { + "epoch": 0.36, + "learning_rate": 1.4851818932253137e-05, + "loss": 0.4904, + "step": 3138 + }, + { + "epoch": 0.36, + "learning_rate": 1.4848581825750935e-05, + "loss": 0.4775, + "step": 3139 + }, + { + "epoch": 0.36, + "learning_rate": 1.4845344054884772e-05, + "loss": 0.456, + "step": 3140 + }, + { + "epoch": 0.36, + "learning_rate": 1.4842105620098292e-05, + "loss": 0.4727, + "step": 3141 + }, + { + "epoch": 0.36, + "learning_rate": 1.4838866521835238e-05, + 
"loss": 0.4698, + "step": 3142 + }, + { + "epoch": 0.36, + "learning_rate": 1.4835626760539437e-05, + "loss": 0.4501, + "step": 3143 + }, + { + "epoch": 0.36, + "learning_rate": 1.483238633665481e-05, + "loss": 0.4737, + "step": 3144 + }, + { + "epoch": 0.36, + "learning_rate": 1.4829145250625368e-05, + "loss": 0.476, + "step": 3145 + }, + { + "epoch": 0.36, + "learning_rate": 1.4825903502895207e-05, + "loss": 0.4562, + "step": 3146 + }, + { + "epoch": 0.36, + "learning_rate": 1.4822661093908521e-05, + "loss": 0.4805, + "step": 3147 + }, + { + "epoch": 0.36, + "learning_rate": 1.4819418024109595e-05, + "loss": 0.4816, + "step": 3148 + }, + { + "epoch": 0.36, + "learning_rate": 1.4816174293942804e-05, + "loss": 0.4696, + "step": 3149 + }, + { + "epoch": 0.36, + "learning_rate": 1.4812929903852606e-05, + "loss": 0.4819, + "step": 3150 + }, + { + "epoch": 0.36, + "learning_rate": 1.4809684854283557e-05, + "loss": 0.4685, + "step": 3151 + }, + { + "epoch": 0.36, + "learning_rate": 1.4806439145680298e-05, + "loss": 0.4632, + "step": 3152 + }, + { + "epoch": 0.36, + "learning_rate": 1.4803192778487569e-05, + "loss": 0.4622, + "step": 3153 + }, + { + "epoch": 0.36, + "learning_rate": 1.4799945753150194e-05, + "loss": 0.508, + "step": 3154 + }, + { + "epoch": 0.36, + "learning_rate": 1.4796698070113084e-05, + "loss": 0.4752, + "step": 3155 + }, + { + "epoch": 0.36, + "learning_rate": 1.4793449729821248e-05, + "loss": 0.4801, + "step": 3156 + }, + { + "epoch": 0.36, + "learning_rate": 1.4790200732719779e-05, + "loss": 0.4513, + "step": 3157 + }, + { + "epoch": 0.36, + "learning_rate": 1.4786951079253861e-05, + "loss": 0.4687, + "step": 3158 + }, + { + "epoch": 0.36, + "learning_rate": 1.4783700769868775e-05, + "loss": 0.5047, + "step": 3159 + }, + { + "epoch": 0.36, + "learning_rate": 1.4780449805009878e-05, + "loss": 0.4439, + "step": 3160 + }, + { + "epoch": 0.36, + "learning_rate": 1.477719818512263e-05, + "loss": 0.481, + "step": 3161 + }, + { + "epoch": 0.36, + 
"learning_rate": 1.4773945910652576e-05, + "loss": 0.4716, + "step": 3162 + }, + { + "epoch": 0.36, + "learning_rate": 1.4770692982045344e-05, + "loss": 0.4807, + "step": 3163 + }, + { + "epoch": 0.36, + "learning_rate": 1.4767439399746666e-05, + "loss": 0.476, + "step": 3164 + }, + { + "epoch": 0.36, + "learning_rate": 1.4764185164202349e-05, + "loss": 0.4745, + "step": 3165 + }, + { + "epoch": 0.36, + "learning_rate": 1.47609302758583e-05, + "loss": 0.4644, + "step": 3166 + }, + { + "epoch": 0.36, + "learning_rate": 1.4757674735160512e-05, + "loss": 0.4823, + "step": 3167 + }, + { + "epoch": 0.36, + "learning_rate": 1.475441854255506e-05, + "loss": 0.464, + "step": 3168 + }, + { + "epoch": 0.36, + "learning_rate": 1.4751161698488124e-05, + "loss": 0.4667, + "step": 3169 + }, + { + "epoch": 0.36, + "learning_rate": 1.4747904203405959e-05, + "loss": 0.4758, + "step": 3170 + }, + { + "epoch": 0.36, + "learning_rate": 1.4744646057754913e-05, + "loss": 0.4825, + "step": 3171 + }, + { + "epoch": 0.36, + "learning_rate": 1.4741387261981428e-05, + "loss": 0.447, + "step": 3172 + }, + { + "epoch": 0.36, + "learning_rate": 1.4738127816532034e-05, + "loss": 0.4674, + "step": 3173 + }, + { + "epoch": 0.36, + "learning_rate": 1.4734867721853341e-05, + "loss": 0.4779, + "step": 3174 + }, + { + "epoch": 0.36, + "learning_rate": 1.4731606978392061e-05, + "loss": 0.4519, + "step": 3175 + }, + { + "epoch": 0.36, + "learning_rate": 1.4728345586594986e-05, + "loss": 0.4539, + "step": 3176 + }, + { + "epoch": 0.36, + "learning_rate": 1.4725083546909e-05, + "loss": 0.474, + "step": 3177 + }, + { + "epoch": 0.36, + "learning_rate": 1.4721820859781076e-05, + "loss": 0.479, + "step": 3178 + }, + { + "epoch": 0.36, + "learning_rate": 1.4718557525658272e-05, + "loss": 0.477, + "step": 3179 + }, + { + "epoch": 0.36, + "learning_rate": 1.471529354498774e-05, + "loss": 0.4798, + "step": 3180 + }, + { + "epoch": 0.36, + "learning_rate": 1.471202891821672e-05, + "loss": 0.4852, + "step": 3181 + 
}, + { + "epoch": 0.36, + "learning_rate": 1.4708763645792531e-05, + "loss": 0.4692, + "step": 3182 + }, + { + "epoch": 0.36, + "learning_rate": 1.4705497728162602e-05, + "loss": 0.4714, + "step": 3183 + }, + { + "epoch": 0.36, + "learning_rate": 1.4702231165774423e-05, + "loss": 0.4565, + "step": 3184 + }, + { + "epoch": 0.36, + "learning_rate": 1.4698963959075592e-05, + "loss": 0.494, + "step": 3185 + }, + { + "epoch": 0.36, + "learning_rate": 1.469569610851379e-05, + "loss": 0.458, + "step": 3186 + }, + { + "epoch": 0.36, + "learning_rate": 1.4692427614536783e-05, + "loss": 0.4748, + "step": 3187 + }, + { + "epoch": 0.36, + "learning_rate": 1.4689158477592433e-05, + "loss": 0.4661, + "step": 3188 + }, + { + "epoch": 0.36, + "learning_rate": 1.4685888698128677e-05, + "loss": 0.4881, + "step": 3189 + }, + { + "epoch": 0.36, + "learning_rate": 1.468261827659355e-05, + "loss": 0.4749, + "step": 3190 + }, + { + "epoch": 0.36, + "learning_rate": 1.4679347213435176e-05, + "loss": 0.4825, + "step": 3191 + }, + { + "epoch": 0.36, + "learning_rate": 1.4676075509101763e-05, + "loss": 0.4861, + "step": 3192 + }, + { + "epoch": 0.36, + "learning_rate": 1.4672803164041604e-05, + "loss": 0.4816, + "step": 3193 + }, + { + "epoch": 0.37, + "learning_rate": 1.4669530178703089e-05, + "loss": 0.4642, + "step": 3194 + }, + { + "epoch": 0.37, + "learning_rate": 1.4666256553534681e-05, + "loss": 0.4829, + "step": 3195 + }, + { + "epoch": 0.37, + "learning_rate": 1.466298228898495e-05, + "loss": 0.4793, + "step": 3196 + }, + { + "epoch": 0.37, + "learning_rate": 1.465970738550254e-05, + "loss": 0.4768, + "step": 3197 + }, + { + "epoch": 0.37, + "learning_rate": 1.4656431843536182e-05, + "loss": 0.4439, + "step": 3198 + }, + { + "epoch": 0.37, + "learning_rate": 1.4653155663534702e-05, + "loss": 0.4805, + "step": 3199 + }, + { + "epoch": 0.37, + "learning_rate": 1.464987884594701e-05, + "loss": 0.4833, + "step": 3200 + }, + { + "epoch": 0.37, + "learning_rate": 1.4646601391222102e-05, + 
"loss": 0.4571, + "step": 3201 + }, + { + "epoch": 0.37, + "learning_rate": 1.464332329980906e-05, + "loss": 0.4729, + "step": 3202 + }, + { + "epoch": 0.37, + "learning_rate": 1.4640044572157062e-05, + "loss": 0.4818, + "step": 3203 + }, + { + "epoch": 0.37, + "learning_rate": 1.4636765208715358e-05, + "loss": 0.4877, + "step": 3204 + }, + { + "epoch": 0.37, + "learning_rate": 1.4633485209933305e-05, + "loss": 0.4683, + "step": 3205 + }, + { + "epoch": 0.37, + "learning_rate": 1.4630204576260328e-05, + "loss": 0.4616, + "step": 3206 + }, + { + "epoch": 0.37, + "learning_rate": 1.4626923308145948e-05, + "loss": 0.4905, + "step": 3207 + }, + { + "epoch": 0.37, + "learning_rate": 1.4623641406039776e-05, + "loss": 0.4609, + "step": 3208 + }, + { + "epoch": 0.37, + "learning_rate": 1.46203588703915e-05, + "loss": 0.4601, + "step": 3209 + }, + { + "epoch": 0.37, + "learning_rate": 1.4617075701650907e-05, + "loss": 0.4575, + "step": 3210 + }, + { + "epoch": 0.37, + "learning_rate": 1.461379190026786e-05, + "loss": 0.4594, + "step": 3211 + }, + { + "epoch": 0.37, + "learning_rate": 1.4610507466692312e-05, + "loss": 0.4681, + "step": 3212 + }, + { + "epoch": 0.37, + "learning_rate": 1.460722240137431e-05, + "loss": 0.48, + "step": 3213 + }, + { + "epoch": 0.37, + "learning_rate": 1.4603936704763975e-05, + "loss": 0.4612, + "step": 3214 + }, + { + "epoch": 0.37, + "learning_rate": 1.4600650377311523e-05, + "loss": 0.4754, + "step": 3215 + }, + { + "epoch": 0.37, + "learning_rate": 1.4597363419467257e-05, + "loss": 0.4533, + "step": 3216 + }, + { + "epoch": 0.37, + "learning_rate": 1.4594075831681557e-05, + "loss": 0.4847, + "step": 3217 + }, + { + "epoch": 0.37, + "learning_rate": 1.4590787614404902e-05, + "loss": 0.4619, + "step": 3218 + }, + { + "epoch": 0.37, + "learning_rate": 1.4587498768087849e-05, + "loss": 0.4724, + "step": 3219 + }, + { + "epoch": 0.37, + "learning_rate": 1.4584209293181044e-05, + "loss": 0.4591, + "step": 3220 + }, + { + "epoch": 0.37, + 
"learning_rate": 1.4580919190135219e-05, + "loss": 0.4992, + "step": 3221 + }, + { + "epoch": 0.37, + "learning_rate": 1.4577628459401188e-05, + "loss": 0.4668, + "step": 3222 + }, + { + "epoch": 0.37, + "learning_rate": 1.457433710142986e-05, + "loss": 0.4642, + "step": 3223 + }, + { + "epoch": 0.37, + "learning_rate": 1.4571045116672219e-05, + "loss": 0.4759, + "step": 3224 + }, + { + "epoch": 0.37, + "learning_rate": 1.4567752505579345e-05, + "loss": 0.4752, + "step": 3225 + }, + { + "epoch": 0.37, + "learning_rate": 1.4564459268602396e-05, + "loss": 0.4603, + "step": 3226 + }, + { + "epoch": 0.37, + "learning_rate": 1.4561165406192622e-05, + "loss": 0.4835, + "step": 3227 + }, + { + "epoch": 0.37, + "learning_rate": 1.455787091880135e-05, + "loss": 0.466, + "step": 3228 + }, + { + "epoch": 0.37, + "learning_rate": 1.4554575806880005e-05, + "loss": 0.4776, + "step": 3229 + }, + { + "epoch": 0.37, + "learning_rate": 1.4551280070880089e-05, + "loss": 0.4615, + "step": 3230 + }, + { + "epoch": 0.37, + "learning_rate": 1.454798371125319e-05, + "loss": 0.4812, + "step": 3231 + }, + { + "epoch": 0.37, + "learning_rate": 1.4544686728450982e-05, + "loss": 0.4703, + "step": 3232 + }, + { + "epoch": 0.37, + "learning_rate": 1.4541389122925229e-05, + "loss": 0.4607, + "step": 3233 + }, + { + "epoch": 0.37, + "learning_rate": 1.4538090895127774e-05, + "loss": 0.466, + "step": 3234 + }, + { + "epoch": 0.37, + "learning_rate": 1.4534792045510548e-05, + "loss": 0.4682, + "step": 3235 + }, + { + "epoch": 0.37, + "learning_rate": 1.453149257452557e-05, + "loss": 0.4742, + "step": 3236 + }, + { + "epoch": 0.37, + "learning_rate": 1.4528192482624932e-05, + "loss": 0.4683, + "step": 3237 + }, + { + "epoch": 0.37, + "learning_rate": 1.4524891770260831e-05, + "loss": 0.4715, + "step": 3238 + }, + { + "epoch": 0.37, + "learning_rate": 1.4521590437885533e-05, + "loss": 0.4758, + "step": 3239 + }, + { + "epoch": 0.37, + "learning_rate": 1.4518288485951398e-05, + "loss": 0.4706, + 
"step": 3240 + }, + { + "epoch": 0.37, + "learning_rate": 1.4514985914910862e-05, + "loss": 0.4904, + "step": 3241 + }, + { + "epoch": 0.37, + "learning_rate": 1.451168272521645e-05, + "loss": 0.4547, + "step": 3242 + }, + { + "epoch": 0.37, + "learning_rate": 1.450837891732078e-05, + "loss": 0.4668, + "step": 3243 + }, + { + "epoch": 0.37, + "learning_rate": 1.4505074491676542e-05, + "loss": 0.457, + "step": 3244 + }, + { + "epoch": 0.37, + "learning_rate": 1.450176944873652e-05, + "loss": 0.4586, + "step": 3245 + }, + { + "epoch": 0.37, + "learning_rate": 1.4498463788953574e-05, + "loss": 0.4766, + "step": 3246 + }, + { + "epoch": 0.37, + "learning_rate": 1.4495157512780655e-05, + "loss": 0.4627, + "step": 3247 + }, + { + "epoch": 0.37, + "learning_rate": 1.4491850620670798e-05, + "loss": 0.487, + "step": 3248 + }, + { + "epoch": 0.37, + "learning_rate": 1.4488543113077121e-05, + "loss": 0.4664, + "step": 3249 + }, + { + "epoch": 0.37, + "learning_rate": 1.4485234990452826e-05, + "loss": 0.4712, + "step": 3250 + }, + { + "epoch": 0.37, + "learning_rate": 1.4481926253251197e-05, + "loss": 0.4645, + "step": 3251 + }, + { + "epoch": 0.37, + "learning_rate": 1.4478616901925606e-05, + "loss": 0.4795, + "step": 3252 + }, + { + "epoch": 0.37, + "learning_rate": 1.4475306936929513e-05, + "loss": 0.4754, + "step": 3253 + }, + { + "epoch": 0.37, + "learning_rate": 1.4471996358716451e-05, + "loss": 0.4654, + "step": 3254 + }, + { + "epoch": 0.37, + "learning_rate": 1.4468685167740044e-05, + "loss": 0.4913, + "step": 3255 + }, + { + "epoch": 0.37, + "learning_rate": 1.4465373364454001e-05, + "loss": 0.4487, + "step": 3256 + }, + { + "epoch": 0.37, + "learning_rate": 1.4462060949312114e-05, + "loss": 0.4955, + "step": 3257 + }, + { + "epoch": 0.37, + "learning_rate": 1.4458747922768256e-05, + "loss": 0.4717, + "step": 3258 + }, + { + "epoch": 0.37, + "learning_rate": 1.4455434285276385e-05, + "loss": 0.4566, + "step": 3259 + }, + { + "epoch": 0.37, + "learning_rate": 
1.4452120037290547e-05, + "loss": 0.4784, + "step": 3260 + }, + { + "epoch": 0.37, + "learning_rate": 1.444880517926486e-05, + "loss": 0.4602, + "step": 3261 + }, + { + "epoch": 0.37, + "learning_rate": 1.4445489711653542e-05, + "loss": 0.4926, + "step": 3262 + }, + { + "epoch": 0.37, + "learning_rate": 1.4442173634910881e-05, + "loss": 0.4705, + "step": 3263 + }, + { + "epoch": 0.37, + "learning_rate": 1.4438856949491258e-05, + "loss": 0.4573, + "step": 3264 + }, + { + "epoch": 0.37, + "learning_rate": 1.4435539655849126e-05, + "loss": 0.4668, + "step": 3265 + }, + { + "epoch": 0.37, + "learning_rate": 1.4432221754439037e-05, + "loss": 0.4646, + "step": 3266 + }, + { + "epoch": 0.37, + "learning_rate": 1.4428903245715611e-05, + "loss": 0.4654, + "step": 3267 + }, + { + "epoch": 0.37, + "learning_rate": 1.442558413013356e-05, + "loss": 0.4748, + "step": 3268 + }, + { + "epoch": 0.37, + "learning_rate": 1.4422264408147676e-05, + "loss": 0.4817, + "step": 3269 + }, + { + "epoch": 0.37, + "learning_rate": 1.4418944080212838e-05, + "loss": 0.4665, + "step": 3270 + }, + { + "epoch": 0.37, + "learning_rate": 1.4415623146784e-05, + "loss": 0.4809, + "step": 3271 + }, + { + "epoch": 0.37, + "learning_rate": 1.441230160831621e-05, + "loss": 0.4672, + "step": 3272 + }, + { + "epoch": 0.37, + "learning_rate": 1.4408979465264588e-05, + "loss": 0.4713, + "step": 3273 + }, + { + "epoch": 0.37, + "learning_rate": 1.4405656718084344e-05, + "loss": 0.4615, + "step": 3274 + }, + { + "epoch": 0.37, + "learning_rate": 1.440233336723077e-05, + "loss": 0.5039, + "step": 3275 + }, + { + "epoch": 0.37, + "learning_rate": 1.4399009413159234e-05, + "loss": 0.4667, + "step": 3276 + }, + { + "epoch": 0.37, + "learning_rate": 1.4395684856325198e-05, + "loss": 0.4888, + "step": 3277 + }, + { + "epoch": 0.37, + "learning_rate": 1.4392359697184197e-05, + "loss": 0.4571, + "step": 3278 + }, + { + "epoch": 0.37, + "learning_rate": 1.4389033936191851e-05, + "loss": 0.4598, + "step": 3279 + }, + { + 
"epoch": 0.37, + "learning_rate": 1.4385707573803869e-05, + "loss": 0.4715, + "step": 3280 + }, + { + "epoch": 0.37, + "learning_rate": 1.4382380610476032e-05, + "loss": 0.5006, + "step": 3281 + }, + { + "epoch": 0.38, + "learning_rate": 1.4379053046664208e-05, + "loss": 0.4744, + "step": 3282 + }, + { + "epoch": 0.38, + "learning_rate": 1.437572488282435e-05, + "loss": 0.4742, + "step": 3283 + }, + { + "epoch": 0.38, + "learning_rate": 1.4372396119412493e-05, + "loss": 0.4635, + "step": 3284 + }, + { + "epoch": 0.38, + "learning_rate": 1.4369066756884745e-05, + "loss": 0.4539, + "step": 3285 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365736795697306e-05, + "loss": 0.4807, + "step": 3286 + }, + { + "epoch": 0.38, + "learning_rate": 1.436240623630646e-05, + "loss": 0.486, + "step": 3287 + }, + { + "epoch": 0.38, + "learning_rate": 1.4359075079168562e-05, + "loss": 0.4795, + "step": 3288 + }, + { + "epoch": 0.38, + "learning_rate": 1.4355743324740055e-05, + "loss": 0.4836, + "step": 3289 + }, + { + "epoch": 0.38, + "learning_rate": 1.4352410973477466e-05, + "loss": 0.4509, + "step": 3290 + }, + { + "epoch": 0.38, + "learning_rate": 1.4349078025837401e-05, + "loss": 0.4845, + "step": 3291 + }, + { + "epoch": 0.38, + "learning_rate": 1.4345744482276551e-05, + "loss": 0.4521, + "step": 3292 + }, + { + "epoch": 0.38, + "learning_rate": 1.4342410343251683e-05, + "loss": 0.461, + "step": 3293 + }, + { + "epoch": 0.38, + "learning_rate": 1.4339075609219645e-05, + "loss": 0.4714, + "step": 3294 + }, + { + "epoch": 0.38, + "learning_rate": 1.4335740280637374e-05, + "loss": 0.4697, + "step": 3295 + }, + { + "epoch": 0.38, + "learning_rate": 1.4332404357961884e-05, + "loss": 0.4775, + "step": 3296 + }, + { + "epoch": 0.38, + "learning_rate": 1.4329067841650274e-05, + "loss": 0.4835, + "step": 3297 + }, + { + "epoch": 0.38, + "learning_rate": 1.4325730732159717e-05, + "loss": 0.4689, + "step": 3298 + }, + { + "epoch": 0.38, + "learning_rate": 1.432239302994747e-05, + 
"loss": 0.4673, + "step": 3299 + }, + { + "epoch": 0.38, + "learning_rate": 1.4319054735470879e-05, + "loss": 0.462, + "step": 3300 + }, + { + "epoch": 0.38, + "learning_rate": 1.4315715849187362e-05, + "loss": 0.4648, + "step": 3301 + }, + { + "epoch": 0.38, + "learning_rate": 1.4312376371554417e-05, + "loss": 0.4644, + "step": 3302 + }, + { + "epoch": 0.38, + "learning_rate": 1.4309036303029632e-05, + "loss": 0.4579, + "step": 3303 + }, + { + "epoch": 0.38, + "learning_rate": 1.4305695644070665e-05, + "loss": 0.4781, + "step": 3304 + }, + { + "epoch": 0.38, + "learning_rate": 1.4302354395135269e-05, + "loss": 0.4719, + "step": 3305 + }, + { + "epoch": 0.38, + "learning_rate": 1.4299012556681269e-05, + "loss": 0.4776, + "step": 3306 + }, + { + "epoch": 0.38, + "learning_rate": 1.4295670129166564e-05, + "loss": 0.4732, + "step": 3307 + }, + { + "epoch": 0.38, + "learning_rate": 1.4292327113049145e-05, + "loss": 0.4742, + "step": 3308 + }, + { + "epoch": 0.38, + "learning_rate": 1.428898350878708e-05, + "loss": 0.4832, + "step": 3309 + }, + { + "epoch": 0.38, + "learning_rate": 1.428563931683852e-05, + "loss": 0.4557, + "step": 3310 + }, + { + "epoch": 0.38, + "learning_rate": 1.4282294537661692e-05, + "loss": 0.4736, + "step": 3311 + }, + { + "epoch": 0.38, + "learning_rate": 1.4278949171714904e-05, + "loss": 0.4591, + "step": 3312 + }, + { + "epoch": 0.38, + "learning_rate": 1.4275603219456544e-05, + "loss": 0.4673, + "step": 3313 + }, + { + "epoch": 0.38, + "learning_rate": 1.4272256681345087e-05, + "loss": 0.4759, + "step": 3314 + }, + { + "epoch": 0.38, + "learning_rate": 1.4268909557839085e-05, + "loss": 0.4635, + "step": 3315 + }, + { + "epoch": 0.38, + "learning_rate": 1.4265561849397163e-05, + "loss": 0.4533, + "step": 3316 + }, + { + "epoch": 0.38, + "learning_rate": 1.4262213556478033e-05, + "loss": 0.4715, + "step": 3317 + }, + { + "epoch": 0.38, + "learning_rate": 1.4258864679540488e-05, + "loss": 0.4616, + "step": 3318 + }, + { + "epoch": 0.38, + 
"learning_rate": 1.4255515219043398e-05, + "loss": 0.4528, + "step": 3319 + }, + { + "epoch": 0.38, + "learning_rate": 1.425216517544571e-05, + "loss": 0.4803, + "step": 3320 + }, + { + "epoch": 0.38, + "learning_rate": 1.4248814549206464e-05, + "loss": 0.462, + "step": 3321 + }, + { + "epoch": 0.38, + "learning_rate": 1.4245463340784761e-05, + "loss": 0.4962, + "step": 3322 + }, + { + "epoch": 0.38, + "learning_rate": 1.4242111550639797e-05, + "loss": 0.4631, + "step": 3323 + }, + { + "epoch": 0.38, + "learning_rate": 1.4238759179230841e-05, + "loss": 0.4858, + "step": 3324 + }, + { + "epoch": 0.38, + "learning_rate": 1.4235406227017241e-05, + "loss": 0.4673, + "step": 3325 + }, + { + "epoch": 0.38, + "learning_rate": 1.423205269445843e-05, + "loss": 0.4831, + "step": 3326 + }, + { + "epoch": 0.38, + "learning_rate": 1.4228698582013908e-05, + "loss": 0.4592, + "step": 3327 + }, + { + "epoch": 0.38, + "learning_rate": 1.4225343890143275e-05, + "loss": 0.4626, + "step": 3328 + }, + { + "epoch": 0.38, + "learning_rate": 1.4221988619306192e-05, + "loss": 0.4664, + "step": 3329 + }, + { + "epoch": 0.38, + "learning_rate": 1.4218632769962408e-05, + "loss": 0.4782, + "step": 3330 + }, + { + "epoch": 0.38, + "learning_rate": 1.4215276342571749e-05, + "loss": 0.4562, + "step": 3331 + }, + { + "epoch": 0.38, + "learning_rate": 1.4211919337594118e-05, + "loss": 0.4621, + "step": 3332 + }, + { + "epoch": 0.38, + "learning_rate": 1.4208561755489502e-05, + "loss": 0.4751, + "step": 3333 + }, + { + "epoch": 0.38, + "learning_rate": 1.4205203596717966e-05, + "loss": 0.4712, + "step": 3334 + }, + { + "epoch": 0.38, + "learning_rate": 1.420184486173965e-05, + "loss": 0.4627, + "step": 3335 + }, + { + "epoch": 0.38, + "learning_rate": 1.4198485551014778e-05, + "loss": 0.4534, + "step": 3336 + }, + { + "epoch": 0.38, + "learning_rate": 1.4195125665003648e-05, + "loss": 0.4545, + "step": 3337 + }, + { + "epoch": 0.38, + "learning_rate": 1.4191765204166643e-05, + "loss": 0.4793, + 
"step": 3338 + }, + { + "epoch": 0.38, + "learning_rate": 1.4188404168964219e-05, + "loss": 0.4492, + "step": 3339 + }, + { + "epoch": 0.38, + "learning_rate": 1.418504255985691e-05, + "loss": 0.4563, + "step": 3340 + }, + { + "epoch": 0.38, + "learning_rate": 1.4181680377305336e-05, + "loss": 0.4673, + "step": 3341 + }, + { + "epoch": 0.38, + "learning_rate": 1.4178317621770187e-05, + "loss": 0.4813, + "step": 3342 + }, + { + "epoch": 0.38, + "learning_rate": 1.4174954293712242e-05, + "loss": 0.4651, + "step": 3343 + }, + { + "epoch": 0.38, + "learning_rate": 1.4171590393592346e-05, + "loss": 0.469, + "step": 3344 + }, + { + "epoch": 0.38, + "learning_rate": 1.4168225921871433e-05, + "loss": 0.4549, + "step": 3345 + }, + { + "epoch": 0.38, + "learning_rate": 1.4164860879010502e-05, + "loss": 0.4827, + "step": 3346 + }, + { + "epoch": 0.38, + "learning_rate": 1.4161495265470649e-05, + "loss": 0.4757, + "step": 3347 + }, + { + "epoch": 0.38, + "learning_rate": 1.4158129081713035e-05, + "loss": 0.4788, + "step": 3348 + }, + { + "epoch": 0.38, + "learning_rate": 1.41547623281989e-05, + "loss": 0.4634, + "step": 3349 + }, + { + "epoch": 0.38, + "learning_rate": 1.415139500538957e-05, + "loss": 0.4736, + "step": 3350 + }, + { + "epoch": 0.38, + "learning_rate": 1.4148027113746435e-05, + "loss": 0.4737, + "step": 3351 + }, + { + "epoch": 0.38, + "learning_rate": 1.4144658653730976e-05, + "loss": 0.4693, + "step": 3352 + }, + { + "epoch": 0.38, + "learning_rate": 1.4141289625804748e-05, + "loss": 0.4686, + "step": 3353 + }, + { + "epoch": 0.38, + "learning_rate": 1.4137920030429386e-05, + "loss": 0.4651, + "step": 3354 + }, + { + "epoch": 0.38, + "learning_rate": 1.4134549868066594e-05, + "loss": 0.4646, + "step": 3355 + }, + { + "epoch": 0.38, + "learning_rate": 1.4131179139178157e-05, + "loss": 0.4861, + "step": 3356 + }, + { + "epoch": 0.38, + "learning_rate": 1.4127807844225947e-05, + "loss": 0.4695, + "step": 3357 + }, + { + "epoch": 0.38, + "learning_rate": 
1.4124435983671907e-05, + "loss": 0.4706, + "step": 3358 + }, + { + "epoch": 0.38, + "learning_rate": 1.4121063557978051e-05, + "loss": 0.4766, + "step": 3359 + }, + { + "epoch": 0.38, + "learning_rate": 1.4117690567606483e-05, + "loss": 0.4701, + "step": 3360 + }, + { + "epoch": 0.38, + "learning_rate": 1.411431701301937e-05, + "loss": 0.4642, + "step": 3361 + }, + { + "epoch": 0.38, + "learning_rate": 1.4110942894678971e-05, + "loss": 0.4676, + "step": 3362 + }, + { + "epoch": 0.38, + "learning_rate": 1.410756821304762e-05, + "loss": 0.4786, + "step": 3363 + }, + { + "epoch": 0.38, + "learning_rate": 1.410419296858771e-05, + "loss": 0.47, + "step": 3364 + }, + { + "epoch": 0.38, + "learning_rate": 1.4100817161761738e-05, + "loss": 0.4691, + "step": 3365 + }, + { + "epoch": 0.38, + "learning_rate": 1.4097440793032253e-05, + "loss": 0.4569, + "step": 3366 + }, + { + "epoch": 0.38, + "learning_rate": 1.4094063862861904e-05, + "loss": 0.4553, + "step": 3367 + }, + { + "epoch": 0.38, + "learning_rate": 1.4090686371713403e-05, + "loss": 0.4536, + "step": 3368 + }, + { + "epoch": 0.39, + "learning_rate": 1.4087308320049536e-05, + "loss": 0.452, + "step": 3369 + }, + { + "epoch": 0.39, + "learning_rate": 1.4083929708333173e-05, + "loss": 0.4892, + "step": 3370 + }, + { + "epoch": 0.39, + "learning_rate": 1.4080550537027264e-05, + "loss": 0.4583, + "step": 3371 + }, + { + "epoch": 0.39, + "learning_rate": 1.4077170806594831e-05, + "loss": 0.4813, + "step": 3372 + }, + { + "epoch": 0.39, + "learning_rate": 1.4073790517498967e-05, + "loss": 0.4754, + "step": 3373 + }, + { + "epoch": 0.39, + "learning_rate": 1.4070409670202849e-05, + "loss": 0.4719, + "step": 3374 + }, + { + "epoch": 0.39, + "learning_rate": 1.4067028265169728e-05, + "loss": 0.4677, + "step": 3375 + }, + { + "epoch": 0.39, + "learning_rate": 1.4063646302862938e-05, + "loss": 0.4727, + "step": 3376 + }, + { + "epoch": 0.39, + "learning_rate": 1.406026378374588e-05, + "loss": 0.4658, + "step": 3377 + }, + { + 
"epoch": 0.39, + "learning_rate": 1.405688070828203e-05, + "loss": 0.4719, + "step": 3378 + }, + { + "epoch": 0.39, + "learning_rate": 1.4053497076934948e-05, + "loss": 0.4711, + "step": 3379 + }, + { + "epoch": 0.39, + "learning_rate": 1.405011289016827e-05, + "loss": 0.4865, + "step": 3380 + }, + { + "epoch": 0.39, + "learning_rate": 1.4046728148445701e-05, + "loss": 0.4585, + "step": 3381 + }, + { + "epoch": 0.39, + "learning_rate": 1.4043342852231027e-05, + "loss": 0.4705, + "step": 3382 + }, + { + "epoch": 0.39, + "learning_rate": 1.4039957001988112e-05, + "loss": 0.4612, + "step": 3383 + }, + { + "epoch": 0.39, + "learning_rate": 1.4036570598180888e-05, + "loss": 0.4656, + "step": 3384 + }, + { + "epoch": 0.39, + "learning_rate": 1.4033183641273374e-05, + "loss": 0.4502, + "step": 3385 + }, + { + "epoch": 0.39, + "learning_rate": 1.4029796131729652e-05, + "loss": 0.4727, + "step": 3386 + }, + { + "epoch": 0.39, + "learning_rate": 1.4026408070013892e-05, + "loss": 0.4752, + "step": 3387 + }, + { + "epoch": 0.39, + "learning_rate": 1.4023019456590335e-05, + "loss": 0.4736, + "step": 3388 + }, + { + "epoch": 0.39, + "learning_rate": 1.4019630291923289e-05, + "loss": 0.4678, + "step": 3389 + }, + { + "epoch": 0.39, + "learning_rate": 1.4016240576477152e-05, + "loss": 0.483, + "step": 3390 + }, + { + "epoch": 0.39, + "learning_rate": 1.401285031071639e-05, + "loss": 0.4638, + "step": 3391 + }, + { + "epoch": 0.39, + "learning_rate": 1.4009459495105542e-05, + "loss": 0.4708, + "step": 3392 + }, + { + "epoch": 0.39, + "learning_rate": 1.400606813010923e-05, + "loss": 0.4631, + "step": 3393 + }, + { + "epoch": 0.39, + "learning_rate": 1.4002676216192141e-05, + "loss": 0.4596, + "step": 3394 + }, + { + "epoch": 0.39, + "learning_rate": 1.3999283753819047e-05, + "loss": 0.4453, + "step": 3395 + }, + { + "epoch": 0.39, + "learning_rate": 1.3995890743454789e-05, + "loss": 0.4751, + "step": 3396 + }, + { + "epoch": 0.39, + "learning_rate": 1.3992497185564289e-05, + 
"loss": 0.4807, + "step": 3397 + }, + { + "epoch": 0.39, + "learning_rate": 1.3989103080612533e-05, + "loss": 0.4586, + "step": 3398 + }, + { + "epoch": 0.39, + "learning_rate": 1.3985708429064598e-05, + "loss": 0.4707, + "step": 3399 + }, + { + "epoch": 0.39, + "learning_rate": 1.3982313231385622e-05, + "loss": 0.4871, + "step": 3400 + }, + { + "epoch": 0.39, + "learning_rate": 1.3978917488040822e-05, + "loss": 0.4575, + "step": 3401 + }, + { + "epoch": 0.39, + "learning_rate": 1.3975521199495495e-05, + "loss": 0.446, + "step": 3402 + }, + { + "epoch": 0.39, + "learning_rate": 1.3972124366215002e-05, + "loss": 0.4673, + "step": 3403 + }, + { + "epoch": 0.39, + "learning_rate": 1.3968726988664788e-05, + "loss": 0.4686, + "step": 3404 + }, + { + "epoch": 0.39, + "learning_rate": 1.3965329067310372e-05, + "loss": 0.4776, + "step": 3405 + }, + { + "epoch": 0.39, + "learning_rate": 1.3961930602617345e-05, + "loss": 0.4751, + "step": 3406 + }, + { + "epoch": 0.39, + "learning_rate": 1.3958531595051367e-05, + "loss": 0.4603, + "step": 3407 + }, + { + "epoch": 0.39, + "learning_rate": 1.395513204507818e-05, + "loss": 0.4653, + "step": 3408 + }, + { + "epoch": 0.39, + "learning_rate": 1.3951731953163606e-05, + "loss": 0.4896, + "step": 3409 + }, + { + "epoch": 0.39, + "learning_rate": 1.3948331319773525e-05, + "loss": 0.4479, + "step": 3410 + }, + { + "epoch": 0.39, + "learning_rate": 1.3944930145373903e-05, + "loss": 0.47, + "step": 3411 + }, + { + "epoch": 0.39, + "learning_rate": 1.3941528430430773e-05, + "loss": 0.4531, + "step": 3412 + }, + { + "epoch": 0.39, + "learning_rate": 1.393812617541025e-05, + "loss": 0.4815, + "step": 3413 + }, + { + "epoch": 0.39, + "learning_rate": 1.3934723380778517e-05, + "loss": 0.4752, + "step": 3414 + }, + { + "epoch": 0.39, + "learning_rate": 1.3931320047001838e-05, + "loss": 0.4631, + "step": 3415 + }, + { + "epoch": 0.39, + "learning_rate": 1.3927916174546536e-05, + "loss": 0.4979, + "step": 3416 + }, + { + "epoch": 0.39, + 
"learning_rate": 1.3924511763879025e-05, + "loss": 0.4603, + "step": 3417 + }, + { + "epoch": 0.39, + "learning_rate": 1.3921106815465782e-05, + "loss": 0.4488, + "step": 3418 + }, + { + "epoch": 0.39, + "learning_rate": 1.3917701329773364e-05, + "loss": 0.4639, + "step": 3419 + }, + { + "epoch": 0.39, + "learning_rate": 1.3914295307268396e-05, + "loss": 0.49, + "step": 3420 + }, + { + "epoch": 0.39, + "learning_rate": 1.3910888748417577e-05, + "loss": 0.4681, + "step": 3421 + }, + { + "epoch": 0.39, + "learning_rate": 1.3907481653687687e-05, + "loss": 0.4684, + "step": 3422 + }, + { + "epoch": 0.39, + "learning_rate": 1.3904074023545566e-05, + "loss": 0.4867, + "step": 3423 + }, + { + "epoch": 0.39, + "learning_rate": 1.390066585845815e-05, + "loss": 0.4704, + "step": 3424 + }, + { + "epoch": 0.39, + "learning_rate": 1.389725715889242e-05, + "loss": 0.4487, + "step": 3425 + }, + { + "epoch": 0.39, + "learning_rate": 1.3893847925315447e-05, + "loss": 0.4687, + "step": 3426 + }, + { + "epoch": 0.39, + "learning_rate": 1.3890438158194374e-05, + "loss": 0.4856, + "step": 3427 + }, + { + "epoch": 0.39, + "learning_rate": 1.3887027857996416e-05, + "loss": 0.4726, + "step": 3428 + }, + { + "epoch": 0.39, + "learning_rate": 1.3883617025188858e-05, + "loss": 0.4639, + "step": 3429 + }, + { + "epoch": 0.39, + "learning_rate": 1.3880205660239062e-05, + "loss": 0.4716, + "step": 3430 + }, + { + "epoch": 0.39, + "learning_rate": 1.387679376361446e-05, + "loss": 0.4622, + "step": 3431 + }, + { + "epoch": 0.39, + "learning_rate": 1.3873381335782559e-05, + "loss": 0.4813, + "step": 3432 + }, + { + "epoch": 0.39, + "learning_rate": 1.3869968377210936e-05, + "loss": 0.4577, + "step": 3433 + }, + { + "epoch": 0.39, + "learning_rate": 1.3866554888367243e-05, + "loss": 0.4814, + "step": 3434 + }, + { + "epoch": 0.39, + "learning_rate": 1.3863140869719207e-05, + "loss": 0.4519, + "step": 3435 + }, + { + "epoch": 0.39, + "learning_rate": 1.3859726321734623e-05, + "loss": 0.4714, + 
"step": 3436 + }, + { + "epoch": 0.39, + "learning_rate": 1.385631124488136e-05, + "loss": 0.4725, + "step": 3437 + }, + { + "epoch": 0.39, + "learning_rate": 1.3852895639627357e-05, + "loss": 0.476, + "step": 3438 + }, + { + "epoch": 0.39, + "learning_rate": 1.3849479506440633e-05, + "loss": 0.4709, + "step": 3439 + }, + { + "epoch": 0.39, + "learning_rate": 1.3846062845789275e-05, + "loss": 0.4698, + "step": 3440 + }, + { + "epoch": 0.39, + "learning_rate": 1.3842645658141436e-05, + "loss": 0.4537, + "step": 3441 + }, + { + "epoch": 0.39, + "learning_rate": 1.383922794396535e-05, + "loss": 0.4755, + "step": 3442 + }, + { + "epoch": 0.39, + "learning_rate": 1.3835809703729322e-05, + "loss": 0.4526, + "step": 3443 + }, + { + "epoch": 0.39, + "learning_rate": 1.3832390937901723e-05, + "loss": 0.4573, + "step": 3444 + }, + { + "epoch": 0.39, + "learning_rate": 1.3828971646951005e-05, + "loss": 0.4754, + "step": 3445 + }, + { + "epoch": 0.39, + "learning_rate": 1.3825551831345685e-05, + "loss": 0.4687, + "step": 3446 + }, + { + "epoch": 0.39, + "learning_rate": 1.3822131491554355e-05, + "loss": 0.4574, + "step": 3447 + }, + { + "epoch": 0.39, + "learning_rate": 1.3818710628045677e-05, + "loss": 0.4768, + "step": 3448 + }, + { + "epoch": 0.39, + "learning_rate": 1.3815289241288383e-05, + "loss": 0.473, + "step": 3449 + }, + { + "epoch": 0.39, + "learning_rate": 1.3811867331751286e-05, + "loss": 0.4808, + "step": 3450 + }, + { + "epoch": 0.39, + "learning_rate": 1.380844489990326e-05, + "loss": 0.4514, + "step": 3451 + }, + { + "epoch": 0.39, + "learning_rate": 1.3805021946213251e-05, + "loss": 0.4766, + "step": 3452 + }, + { + "epoch": 0.39, + "learning_rate": 1.3801598471150286e-05, + "loss": 0.4523, + "step": 3453 + }, + { + "epoch": 0.39, + "learning_rate": 1.3798174475183457e-05, + "loss": 0.4715, + "step": 3454 + }, + { + "epoch": 0.39, + "learning_rate": 1.3794749958781924e-05, + "loss": 0.4447, + "step": 3455 + }, + { + "epoch": 0.39, + "learning_rate": 
1.3791324922414924e-05, + "loss": 0.471, + "step": 3456 + }, + { + "epoch": 0.4, + "learning_rate": 1.3787899366551764e-05, + "loss": 0.4725, + "step": 3457 + }, + { + "epoch": 0.4, + "learning_rate": 1.3784473291661824e-05, + "loss": 0.4766, + "step": 3458 + }, + { + "epoch": 0.4, + "learning_rate": 1.3781046698214549e-05, + "loss": 0.4553, + "step": 3459 + }, + { + "epoch": 0.4, + "learning_rate": 1.3777619586679458e-05, + "loss": 0.4858, + "step": 3460 + }, + { + "epoch": 0.4, + "learning_rate": 1.3774191957526144e-05, + "loss": 0.4454, + "step": 3461 + }, + { + "epoch": 0.4, + "learning_rate": 1.3770763811224273e-05, + "loss": 0.4684, + "step": 3462 + }, + { + "epoch": 0.4, + "learning_rate": 1.376733514824357e-05, + "loss": 0.4524, + "step": 3463 + }, + { + "epoch": 0.4, + "learning_rate": 1.3763905969053841e-05, + "loss": 0.4756, + "step": 3464 + }, + { + "epoch": 0.4, + "learning_rate": 1.376047627412496e-05, + "loss": 0.4817, + "step": 3465 + }, + { + "epoch": 0.4, + "learning_rate": 1.3757046063926876e-05, + "loss": 0.4717, + "step": 3466 + }, + { + "epoch": 0.4, + "learning_rate": 1.3753615338929598e-05, + "loss": 0.4687, + "step": 3467 + }, + { + "epoch": 0.4, + "learning_rate": 1.3750184099603216e-05, + "loss": 0.4765, + "step": 3468 + }, + { + "epoch": 0.4, + "learning_rate": 1.3746752346417884e-05, + "loss": 0.4669, + "step": 3469 + }, + { + "epoch": 0.4, + "learning_rate": 1.3743320079843828e-05, + "loss": 0.4586, + "step": 3470 + }, + { + "epoch": 0.4, + "learning_rate": 1.3739887300351349e-05, + "loss": 0.4725, + "step": 3471 + }, + { + "epoch": 0.4, + "learning_rate": 1.3736454008410816e-05, + "loss": 0.4761, + "step": 3472 + }, + { + "epoch": 0.4, + "learning_rate": 1.373302020449266e-05, + "loss": 0.4666, + "step": 3473 + }, + { + "epoch": 0.4, + "learning_rate": 1.3729585889067391e-05, + "loss": 0.4902, + "step": 3474 + }, + { + "epoch": 0.4, + "learning_rate": 1.3726151062605588e-05, + "loss": 0.461, + "step": 3475 + }, + { + "epoch": 0.4, + 
"learning_rate": 1.3722715725577902e-05, + "loss": 0.4729, + "step": 3476 + }, + { + "epoch": 0.4, + "learning_rate": 1.3719279878455046e-05, + "loss": 0.4725, + "step": 3477 + }, + { + "epoch": 0.4, + "learning_rate": 1.3715843521707805e-05, + "loss": 0.4572, + "step": 3478 + }, + { + "epoch": 0.4, + "learning_rate": 1.3712406655807047e-05, + "loss": 0.4482, + "step": 3479 + }, + { + "epoch": 0.4, + "learning_rate": 1.3708969281223687e-05, + "loss": 0.4834, + "step": 3480 + }, + { + "epoch": 0.4, + "learning_rate": 1.3705531398428736e-05, + "loss": 0.4743, + "step": 3481 + }, + { + "epoch": 0.4, + "learning_rate": 1.3702093007893249e-05, + "loss": 0.4896, + "step": 3482 + }, + { + "epoch": 0.4, + "learning_rate": 1.3698654110088365e-05, + "loss": 0.4706, + "step": 3483 + }, + { + "epoch": 0.4, + "learning_rate": 1.3695214705485294e-05, + "loss": 0.4695, + "step": 3484 + }, + { + "epoch": 0.4, + "learning_rate": 1.3691774794555306e-05, + "loss": 0.4616, + "step": 3485 + }, + { + "epoch": 0.4, + "learning_rate": 1.368833437776975e-05, + "loss": 0.4823, + "step": 3486 + }, + { + "epoch": 0.4, + "learning_rate": 1.3684893455600036e-05, + "loss": 0.4539, + "step": 3487 + }, + { + "epoch": 0.4, + "learning_rate": 1.368145202851765e-05, + "loss": 0.4737, + "step": 3488 + }, + { + "epoch": 0.4, + "learning_rate": 1.3678010096994143e-05, + "loss": 0.4613, + "step": 3489 + }, + { + "epoch": 0.4, + "learning_rate": 1.3674567661501138e-05, + "loss": 0.485, + "step": 3490 + }, + { + "epoch": 0.4, + "learning_rate": 1.3671124722510325e-05, + "loss": 0.4766, + "step": 3491 + }, + { + "epoch": 0.4, + "learning_rate": 1.366768128049346e-05, + "loss": 0.4763, + "step": 3492 + }, + { + "epoch": 0.4, + "learning_rate": 1.3664237335922377e-05, + "loss": 0.4588, + "step": 3493 + }, + { + "epoch": 0.4, + "learning_rate": 1.3660792889268967e-05, + "loss": 0.4883, + "step": 3494 + }, + { + "epoch": 0.4, + "learning_rate": 1.3657347941005204e-05, + "loss": 0.4671, + "step": 3495 + }, + { + 
"epoch": 0.4, + "learning_rate": 1.3653902491603117e-05, + "loss": 0.464, + "step": 3496 + }, + { + "epoch": 0.4, + "learning_rate": 1.3650456541534811e-05, + "loss": 0.4619, + "step": 3497 + }, + { + "epoch": 0.4, + "learning_rate": 1.3647010091272456e-05, + "loss": 0.4806, + "step": 3498 + }, + { + "epoch": 0.4, + "learning_rate": 1.3643563141288297e-05, + "loss": 0.4832, + "step": 3499 + }, + { + "epoch": 0.4, + "learning_rate": 1.364011569205464e-05, + "loss": 0.476, + "step": 3500 + }, + { + "epoch": 0.4, + "learning_rate": 1.3636667744043864e-05, + "loss": 0.4768, + "step": 3501 + }, + { + "epoch": 0.4, + "learning_rate": 1.3633219297728415e-05, + "loss": 0.4722, + "step": 3502 + }, + { + "epoch": 0.4, + "learning_rate": 1.3629770353580804e-05, + "loss": 0.4721, + "step": 3503 + }, + { + "epoch": 0.4, + "learning_rate": 1.3626320912073616e-05, + "loss": 0.4715, + "step": 3504 + }, + { + "epoch": 0.4, + "learning_rate": 1.3622870973679503e-05, + "loss": 0.4711, + "step": 3505 + }, + { + "epoch": 0.4, + "learning_rate": 1.361942053887118e-05, + "loss": 0.4722, + "step": 3506 + }, + { + "epoch": 0.4, + "learning_rate": 1.3615969608121438e-05, + "loss": 0.4726, + "step": 3507 + }, + { + "epoch": 0.4, + "learning_rate": 1.3612518181903127e-05, + "loss": 0.465, + "step": 3508 + }, + { + "epoch": 0.4, + "learning_rate": 1.360906626068917e-05, + "loss": 0.4693, + "step": 3509 + }, + { + "epoch": 0.4, + "learning_rate": 1.3605613844952561e-05, + "loss": 0.4564, + "step": 3510 + }, + { + "epoch": 0.4, + "learning_rate": 1.3602160935166357e-05, + "loss": 0.4639, + "step": 3511 + }, + { + "epoch": 0.4, + "learning_rate": 1.359870753180368e-05, + "loss": 0.4704, + "step": 3512 + }, + { + "epoch": 0.4, + "learning_rate": 1.3595253635337724e-05, + "loss": 0.4511, + "step": 3513 + }, + { + "epoch": 0.4, + "learning_rate": 1.3591799246241753e-05, + "loss": 0.4738, + "step": 3514 + }, + { + "epoch": 0.4, + "learning_rate": 1.3588344364989096e-05, + "loss": 0.478, + "step": 
3515 + }, + { + "epoch": 0.4, + "learning_rate": 1.3584888992053146e-05, + "loss": 0.4549, + "step": 3516 + }, + { + "epoch": 0.4, + "learning_rate": 1.3581433127907366e-05, + "loss": 0.4767, + "step": 3517 + }, + { + "epoch": 0.4, + "learning_rate": 1.357797677302529e-05, + "loss": 0.4492, + "step": 3518 + }, + { + "epoch": 0.4, + "learning_rate": 1.3574519927880511e-05, + "loss": 0.4672, + "step": 3519 + }, + { + "epoch": 0.4, + "learning_rate": 1.3571062592946703e-05, + "loss": 0.473, + "step": 3520 + }, + { + "epoch": 0.4, + "learning_rate": 1.3567604768697585e-05, + "loss": 0.4583, + "step": 3521 + }, + { + "epoch": 0.4, + "learning_rate": 1.3564146455606961e-05, + "loss": 0.4575, + "step": 3522 + }, + { + "epoch": 0.4, + "learning_rate": 1.3560687654148703e-05, + "loss": 0.4642, + "step": 3523 + }, + { + "epoch": 0.4, + "learning_rate": 1.3557228364796742e-05, + "loss": 0.4688, + "step": 3524 + }, + { + "epoch": 0.4, + "learning_rate": 1.3553768588025073e-05, + "loss": 0.4582, + "step": 3525 + }, + { + "epoch": 0.4, + "learning_rate": 1.3550308324307767e-05, + "loss": 0.4701, + "step": 3526 + }, + { + "epoch": 0.4, + "learning_rate": 1.3546847574118951e-05, + "loss": 0.4618, + "step": 3527 + }, + { + "epoch": 0.4, + "learning_rate": 1.3543386337932834e-05, + "loss": 0.4574, + "step": 3528 + }, + { + "epoch": 0.4, + "learning_rate": 1.3539924616223679e-05, + "loss": 0.4567, + "step": 3529 + }, + { + "epoch": 0.4, + "learning_rate": 1.3536462409465816e-05, + "loss": 0.4813, + "step": 3530 + }, + { + "epoch": 0.4, + "learning_rate": 1.3532999718133648e-05, + "loss": 0.4813, + "step": 3531 + }, + { + "epoch": 0.4, + "learning_rate": 1.3529536542701638e-05, + "loss": 0.4799, + "step": 3532 + }, + { + "epoch": 0.4, + "learning_rate": 1.3526072883644326e-05, + "loss": 0.4817, + "step": 3533 + }, + { + "epoch": 0.4, + "learning_rate": 1.3522608741436303e-05, + "loss": 0.4664, + "step": 3534 + }, + { + "epoch": 0.4, + "learning_rate": 1.3519144116552236e-05, + "loss": 
0.4972, + "step": 3535 + }, + { + "epoch": 0.4, + "learning_rate": 1.3515679009466856e-05, + "loss": 0.4505, + "step": 3536 + }, + { + "epoch": 0.4, + "learning_rate": 1.3512213420654959e-05, + "loss": 0.4666, + "step": 3537 + }, + { + "epoch": 0.4, + "learning_rate": 1.350874735059141e-05, + "loss": 0.4746, + "step": 3538 + }, + { + "epoch": 0.4, + "learning_rate": 1.3505280799751134e-05, + "loss": 0.4741, + "step": 3539 + }, + { + "epoch": 0.4, + "learning_rate": 1.3501813768609134e-05, + "loss": 0.4802, + "step": 3540 + }, + { + "epoch": 0.4, + "learning_rate": 1.3498346257640461e-05, + "loss": 0.4697, + "step": 3541 + }, + { + "epoch": 0.4, + "learning_rate": 1.349487826732025e-05, + "loss": 0.4593, + "step": 3542 + }, + { + "epoch": 0.4, + "learning_rate": 1.3491409798123687e-05, + "loss": 0.4985, + "step": 3543 + }, + { + "epoch": 0.41, + "learning_rate": 1.3487940850526033e-05, + "loss": 0.4663, + "step": 3544 + }, + { + "epoch": 0.41, + "learning_rate": 1.348447142500261e-05, + "loss": 0.4558, + "step": 3545 + }, + { + "epoch": 0.41, + "learning_rate": 1.3481001522028807e-05, + "loss": 0.4566, + "step": 3546 + }, + { + "epoch": 0.41, + "learning_rate": 1.3477531142080076e-05, + "loss": 0.4626, + "step": 3547 + }, + { + "epoch": 0.41, + "learning_rate": 1.347406028563194e-05, + "loss": 0.487, + "step": 3548 + }, + { + "epoch": 0.41, + "learning_rate": 1.3470588953159982e-05, + "loss": 0.4712, + "step": 3549 + }, + { + "epoch": 0.41, + "learning_rate": 1.3467117145139854e-05, + "loss": 0.4427, + "step": 3550 + }, + { + "epoch": 0.41, + "learning_rate": 1.3463644862047267e-05, + "loss": 0.4693, + "step": 3551 + }, + { + "epoch": 0.41, + "learning_rate": 1.3460172104358007e-05, + "loss": 0.4581, + "step": 3552 + }, + { + "epoch": 0.41, + "learning_rate": 1.3456698872547915e-05, + "loss": 0.4775, + "step": 3553 + }, + { + "epoch": 0.41, + "learning_rate": 1.3453225167092902e-05, + "loss": 0.4602, + "step": 3554 + }, + { + "epoch": 0.41, + "learning_rate": 
1.3449750988468943e-05, + "loss": 0.4855, + "step": 3555 + }, + { + "epoch": 0.41, + "learning_rate": 1.344627633715208e-05, + "loss": 0.4711, + "step": 3556 + }, + { + "epoch": 0.41, + "learning_rate": 1.3442801213618417e-05, + "loss": 0.4648, + "step": 3557 + }, + { + "epoch": 0.41, + "learning_rate": 1.3439325618344123e-05, + "loss": 0.4784, + "step": 3558 + }, + { + "epoch": 0.41, + "learning_rate": 1.3435849551805436e-05, + "loss": 0.4907, + "step": 3559 + }, + { + "epoch": 0.41, + "learning_rate": 1.3432373014478644e-05, + "loss": 0.4504, + "step": 3560 + }, + { + "epoch": 0.41, + "learning_rate": 1.3428896006840122e-05, + "loss": 0.4856, + "step": 3561 + }, + { + "epoch": 0.41, + "learning_rate": 1.3425418529366293e-05, + "loss": 0.4712, + "step": 3562 + }, + { + "epoch": 0.41, + "learning_rate": 1.3421940582533645e-05, + "loss": 0.4771, + "step": 3563 + }, + { + "epoch": 0.41, + "learning_rate": 1.3418462166818743e-05, + "loss": 0.4686, + "step": 3564 + }, + { + "epoch": 0.41, + "learning_rate": 1.34149832826982e-05, + "loss": 0.4561, + "step": 3565 + }, + { + "epoch": 0.41, + "learning_rate": 1.3411503930648704e-05, + "loss": 0.4762, + "step": 3566 + }, + { + "epoch": 0.41, + "learning_rate": 1.3408024111147004e-05, + "loss": 0.477, + "step": 3567 + }, + { + "epoch": 0.41, + "learning_rate": 1.3404543824669915e-05, + "loss": 0.4383, + "step": 3568 + }, + { + "epoch": 0.41, + "learning_rate": 1.3401063071694309e-05, + "loss": 0.4824, + "step": 3569 + }, + { + "epoch": 0.41, + "learning_rate": 1.3397581852697128e-05, + "loss": 0.4664, + "step": 3570 + }, + { + "epoch": 0.41, + "learning_rate": 1.3394100168155382e-05, + "loss": 0.4723, + "step": 3571 + }, + { + "epoch": 0.41, + "learning_rate": 1.3390618018546135e-05, + "loss": 0.4462, + "step": 3572 + }, + { + "epoch": 0.41, + "learning_rate": 1.3387135404346519e-05, + "loss": 0.4555, + "step": 3573 + }, + { + "epoch": 0.41, + "learning_rate": 1.338365232603373e-05, + "loss": 0.4666, + "step": 3574 + }, + { 
+ "epoch": 0.41, + "learning_rate": 1.3380168784085028e-05, + "loss": 0.4895, + "step": 3575 + }, + { + "epoch": 0.41, + "learning_rate": 1.3376684778977738e-05, + "loss": 0.4813, + "step": 3576 + }, + { + "epoch": 0.41, + "learning_rate": 1.3373200311189245e-05, + "loss": 0.485, + "step": 3577 + }, + { + "epoch": 0.41, + "learning_rate": 1.3369715381197e-05, + "loss": 0.4799, + "step": 3578 + }, + { + "epoch": 0.41, + "learning_rate": 1.336622998947851e-05, + "loss": 0.4667, + "step": 3579 + }, + { + "epoch": 0.41, + "learning_rate": 1.336274413651136e-05, + "loss": 0.4623, + "step": 3580 + }, + { + "epoch": 0.41, + "learning_rate": 1.3359257822773187e-05, + "loss": 0.4914, + "step": 3581 + }, + { + "epoch": 0.41, + "learning_rate": 1.3355771048741692e-05, + "loss": 0.4591, + "step": 3582 + }, + { + "epoch": 0.41, + "learning_rate": 1.335228381489464e-05, + "loss": 0.4642, + "step": 3583 + }, + { + "epoch": 0.41, + "learning_rate": 1.3348796121709862e-05, + "loss": 0.4682, + "step": 3584 + }, + { + "epoch": 0.41, + "learning_rate": 1.3345307969665252e-05, + "loss": 0.4726, + "step": 3585 + }, + { + "epoch": 0.41, + "learning_rate": 1.3341819359238762e-05, + "loss": 0.4532, + "step": 3586 + }, + { + "epoch": 0.41, + "learning_rate": 1.3338330290908408e-05, + "loss": 0.4763, + "step": 3587 + }, + { + "epoch": 0.41, + "learning_rate": 1.3334840765152272e-05, + "loss": 0.4579, + "step": 3588 + }, + { + "epoch": 0.41, + "learning_rate": 1.3331350782448495e-05, + "loss": 0.4719, + "step": 3589 + }, + { + "epoch": 0.41, + "learning_rate": 1.332786034327529e-05, + "loss": 0.4579, + "step": 3590 + }, + { + "epoch": 0.41, + "learning_rate": 1.3324369448110916e-05, + "loss": 0.4598, + "step": 3591 + }, + { + "epoch": 0.41, + "learning_rate": 1.3320878097433707e-05, + "loss": 0.4768, + "step": 3592 + }, + { + "epoch": 0.41, + "learning_rate": 1.331738629172206e-05, + "loss": 0.4901, + "step": 3593 + }, + { + "epoch": 0.41, + "learning_rate": 1.3313894031454421e-05, + "loss": 
0.4635, + "step": 3594 + }, + { + "epoch": 0.41, + "learning_rate": 1.3310401317109316e-05, + "loss": 0.4515, + "step": 3595 + }, + { + "epoch": 0.41, + "learning_rate": 1.330690814916532e-05, + "loss": 0.4682, + "step": 3596 + }, + { + "epoch": 0.41, + "learning_rate": 1.330341452810108e-05, + "loss": 0.4734, + "step": 3597 + }, + { + "epoch": 0.41, + "learning_rate": 1.3299920454395296e-05, + "loss": 0.4524, + "step": 3598 + }, + { + "epoch": 0.41, + "learning_rate": 1.3296425928526735e-05, + "loss": 0.4722, + "step": 3599 + }, + { + "epoch": 0.41, + "learning_rate": 1.3292930950974223e-05, + "loss": 0.4714, + "step": 3600 + }, + { + "epoch": 0.41, + "learning_rate": 1.3289435522216657e-05, + "loss": 0.4808, + "step": 3601 + }, + { + "epoch": 0.41, + "learning_rate": 1.3285939642732979e-05, + "loss": 0.4508, + "step": 3602 + }, + { + "epoch": 0.41, + "learning_rate": 1.3282443313002209e-05, + "loss": 0.5019, + "step": 3603 + }, + { + "epoch": 0.41, + "learning_rate": 1.3278946533503422e-05, + "loss": 0.4552, + "step": 3604 + }, + { + "epoch": 0.41, + "learning_rate": 1.3275449304715753e-05, + "loss": 0.4557, + "step": 3605 + }, + { + "epoch": 0.41, + "learning_rate": 1.3271951627118402e-05, + "loss": 0.4747, + "step": 3606 + }, + { + "epoch": 0.41, + "learning_rate": 1.3268453501190628e-05, + "loss": 0.4785, + "step": 3607 + }, + { + "epoch": 0.41, + "learning_rate": 1.3264954927411751e-05, + "loss": 0.4874, + "step": 3608 + }, + { + "epoch": 0.41, + "learning_rate": 1.3261455906261154e-05, + "loss": 0.4692, + "step": 3609 + }, + { + "epoch": 0.41, + "learning_rate": 1.3257956438218283e-05, + "loss": 0.472, + "step": 3610 + }, + { + "epoch": 0.41, + "learning_rate": 1.3254456523762643e-05, + "loss": 0.4603, + "step": 3611 + }, + { + "epoch": 0.41, + "learning_rate": 1.3250956163373801e-05, + "loss": 0.476, + "step": 3612 + }, + { + "epoch": 0.41, + "learning_rate": 1.324745535753138e-05, + "loss": 0.469, + "step": 3613 + }, + { + "epoch": 0.41, + "learning_rate": 
1.3243954106715074e-05, + "loss": 0.4551, + "step": 3614 + }, + { + "epoch": 0.41, + "learning_rate": 1.3240452411404628e-05, + "loss": 0.4715, + "step": 3615 + }, + { + "epoch": 0.41, + "learning_rate": 1.3236950272079858e-05, + "loss": 0.4817, + "step": 3616 + }, + { + "epoch": 0.41, + "learning_rate": 1.3233447689220629e-05, + "loss": 0.4714, + "step": 3617 + }, + { + "epoch": 0.41, + "learning_rate": 1.3229944663306877e-05, + "loss": 0.4745, + "step": 3618 + }, + { + "epoch": 0.41, + "learning_rate": 1.3226441194818596e-05, + "loss": 0.4591, + "step": 3619 + }, + { + "epoch": 0.41, + "learning_rate": 1.3222937284235835e-05, + "loss": 0.4703, + "step": 3620 + }, + { + "epoch": 0.41, + "learning_rate": 1.3219432932038712e-05, + "loss": 0.4754, + "step": 3621 + }, + { + "epoch": 0.41, + "learning_rate": 1.3215928138707396e-05, + "loss": 0.4626, + "step": 3622 + }, + { + "epoch": 0.41, + "learning_rate": 1.321242290472213e-05, + "loss": 0.4779, + "step": 3623 + }, + { + "epoch": 0.41, + "learning_rate": 1.3208917230563201e-05, + "loss": 0.4725, + "step": 3624 + }, + { + "epoch": 0.41, + "learning_rate": 1.3205411116710973e-05, + "loss": 0.4656, + "step": 3625 + }, + { + "epoch": 0.41, + "learning_rate": 1.3201904563645853e-05, + "loss": 0.486, + "step": 3626 + }, + { + "epoch": 0.41, + "learning_rate": 1.3198397571848323e-05, + "loss": 0.4626, + "step": 3627 + }, + { + "epoch": 0.41, + "learning_rate": 1.319489014179892e-05, + "loss": 0.4682, + "step": 3628 + }, + { + "epoch": 0.41, + "learning_rate": 1.3191382273978237e-05, + "loss": 0.4631, + "step": 3629 + }, + { + "epoch": 0.41, + "learning_rate": 1.3187873968866928e-05, + "loss": 0.4509, + "step": 3630 + }, + { + "epoch": 0.41, + "learning_rate": 1.3184365226945715e-05, + "loss": 0.4576, + "step": 3631 + }, + { + "epoch": 0.42, + "learning_rate": 1.318085604869537e-05, + "loss": 0.4649, + "step": 3632 + }, + { + "epoch": 0.42, + "learning_rate": 1.3177346434596734e-05, + "loss": 0.4934, + "step": 3633 + }, + { 
+ "epoch": 0.42, + "learning_rate": 1.3173836385130693e-05, + "loss": 0.4483, + "step": 3634 + }, + { + "epoch": 0.42, + "learning_rate": 1.3170325900778211e-05, + "loss": 0.4704, + "step": 3635 + }, + { + "epoch": 0.42, + "learning_rate": 1.3166814982020298e-05, + "loss": 0.4706, + "step": 3636 + }, + { + "epoch": 0.42, + "learning_rate": 1.3163303629338029e-05, + "loss": 0.4671, + "step": 3637 + }, + { + "epoch": 0.42, + "learning_rate": 1.3159791843212542e-05, + "loss": 0.4487, + "step": 3638 + }, + { + "epoch": 0.42, + "learning_rate": 1.3156279624125023e-05, + "loss": 0.4742, + "step": 3639 + }, + { + "epoch": 0.42, + "learning_rate": 1.3152766972556727e-05, + "loss": 0.4657, + "step": 3640 + }, + { + "epoch": 0.42, + "learning_rate": 1.3149253888988967e-05, + "loss": 0.4887, + "step": 3641 + }, + { + "epoch": 0.42, + "learning_rate": 1.3145740373903118e-05, + "loss": 0.4618, + "step": 3642 + }, + { + "epoch": 0.42, + "learning_rate": 1.31422264277806e-05, + "loss": 0.4543, + "step": 3643 + }, + { + "epoch": 0.42, + "learning_rate": 1.3138712051102908e-05, + "loss": 0.4653, + "step": 3644 + }, + { + "epoch": 0.42, + "learning_rate": 1.3135197244351595e-05, + "loss": 0.4668, + "step": 3645 + }, + { + "epoch": 0.42, + "learning_rate": 1.3131682008008255e-05, + "loss": 0.4705, + "step": 3646 + }, + { + "epoch": 0.42, + "learning_rate": 1.3128166342554567e-05, + "loss": 0.4697, + "step": 3647 + }, + { + "epoch": 0.42, + "learning_rate": 1.3124650248472248e-05, + "loss": 0.4545, + "step": 3648 + }, + { + "epoch": 0.42, + "learning_rate": 1.3121133726243083e-05, + "loss": 0.4855, + "step": 3649 + }, + { + "epoch": 0.42, + "learning_rate": 1.3117616776348915e-05, + "loss": 0.4607, + "step": 3650 + }, + { + "epoch": 0.42, + "learning_rate": 1.3114099399271646e-05, + "loss": 0.4721, + "step": 3651 + }, + { + "epoch": 0.42, + "learning_rate": 1.311058159549323e-05, + "loss": 0.4476, + "step": 3652 + }, + { + "epoch": 0.42, + "learning_rate": 1.3107063365495692e-05, + 
"loss": 0.4616, + "step": 3653 + }, + { + "epoch": 0.42, + "learning_rate": 1.31035447097611e-05, + "loss": 0.454, + "step": 3654 + }, + { + "epoch": 0.42, + "learning_rate": 1.3100025628771595e-05, + "loss": 0.4719, + "step": 3655 + }, + { + "epoch": 0.42, + "learning_rate": 1.3096506123009368e-05, + "loss": 0.4565, + "step": 3656 + }, + { + "epoch": 0.42, + "learning_rate": 1.3092986192956665e-05, + "loss": 0.478, + "step": 3657 + }, + { + "epoch": 0.42, + "learning_rate": 1.3089465839095803e-05, + "loss": 0.4596, + "step": 3658 + }, + { + "epoch": 0.42, + "learning_rate": 1.3085945061909144e-05, + "loss": 0.4778, + "step": 3659 + }, + { + "epoch": 0.42, + "learning_rate": 1.3082423861879114e-05, + "loss": 0.4706, + "step": 3660 + }, + { + "epoch": 0.42, + "learning_rate": 1.3078902239488196e-05, + "loss": 0.4497, + "step": 3661 + }, + { + "epoch": 0.42, + "learning_rate": 1.3075380195218931e-05, + "loss": 0.4668, + "step": 3662 + }, + { + "epoch": 0.42, + "learning_rate": 1.3071857729553918e-05, + "loss": 0.469, + "step": 3663 + }, + { + "epoch": 0.42, + "learning_rate": 1.3068334842975813e-05, + "loss": 0.4776, + "step": 3664 + }, + { + "epoch": 0.42, + "learning_rate": 1.306481153596733e-05, + "loss": 0.4726, + "step": 3665 + }, + { + "epoch": 0.42, + "learning_rate": 1.3061287809011243e-05, + "loss": 0.4661, + "step": 3666 + }, + { + "epoch": 0.42, + "learning_rate": 1.3057763662590377e-05, + "loss": 0.4822, + "step": 3667 + }, + { + "epoch": 0.42, + "learning_rate": 1.3054239097187625e-05, + "loss": 0.4588, + "step": 3668 + }, + { + "epoch": 0.42, + "learning_rate": 1.3050714113285922e-05, + "loss": 0.4591, + "step": 3669 + }, + { + "epoch": 0.42, + "learning_rate": 1.3047188711368278e-05, + "loss": 0.4658, + "step": 3670 + }, + { + "epoch": 0.42, + "learning_rate": 1.3043662891917748e-05, + "loss": 0.4681, + "step": 3671 + }, + { + "epoch": 0.42, + "learning_rate": 1.3040136655417448e-05, + "loss": 0.4674, + "step": 3672 + }, + { + "epoch": 0.42, + 
"learning_rate": 1.303661000235055e-05, + "loss": 0.4597, + "step": 3673 + }, + { + "epoch": 0.42, + "learning_rate": 1.3033082933200287e-05, + "loss": 0.4537, + "step": 3674 + }, + { + "epoch": 0.42, + "learning_rate": 1.3029555448449947e-05, + "loss": 0.4705, + "step": 3675 + }, + { + "epoch": 0.42, + "learning_rate": 1.302602754858287e-05, + "loss": 0.4701, + "step": 3676 + }, + { + "epoch": 0.42, + "learning_rate": 1.3022499234082463e-05, + "loss": 0.4851, + "step": 3677 + }, + { + "epoch": 0.42, + "learning_rate": 1.3018970505432176e-05, + "loss": 0.4751, + "step": 3678 + }, + { + "epoch": 0.42, + "learning_rate": 1.3015441363115526e-05, + "loss": 0.4522, + "step": 3679 + }, + { + "epoch": 0.42, + "learning_rate": 1.3011911807616091e-05, + "loss": 0.4703, + "step": 3680 + }, + { + "epoch": 0.42, + "learning_rate": 1.3008381839417493e-05, + "loss": 0.4713, + "step": 3681 + }, + { + "epoch": 0.42, + "learning_rate": 1.3004851459003416e-05, + "loss": 0.4806, + "step": 3682 + }, + { + "epoch": 0.42, + "learning_rate": 1.30013206668576e-05, + "loss": 0.4972, + "step": 3683 + }, + { + "epoch": 0.42, + "learning_rate": 1.2997789463463848e-05, + "loss": 0.4535, + "step": 3684 + }, + { + "epoch": 0.42, + "learning_rate": 1.2994257849306009e-05, + "loss": 0.479, + "step": 3685 + }, + { + "epoch": 0.42, + "learning_rate": 1.2990725824867995e-05, + "loss": 0.4595, + "step": 3686 + }, + { + "epoch": 0.42, + "learning_rate": 1.2987193390633773e-05, + "loss": 0.464, + "step": 3687 + }, + { + "epoch": 0.42, + "learning_rate": 1.298366054708736e-05, + "loss": 0.4592, + "step": 3688 + }, + { + "epoch": 0.42, + "learning_rate": 1.2980127294712839e-05, + "loss": 0.4798, + "step": 3689 + }, + { + "epoch": 0.42, + "learning_rate": 1.2976593633994347e-05, + "loss": 0.4443, + "step": 3690 + }, + { + "epoch": 0.42, + "learning_rate": 1.297305956541607e-05, + "loss": 0.4957, + "step": 3691 + }, + { + "epoch": 0.42, + "learning_rate": 1.2969525089462253e-05, + "loss": 0.4539, + "step": 
3692 + }, + { + "epoch": 0.42, + "learning_rate": 1.2965990206617203e-05, + "loss": 0.4583, + "step": 3693 + }, + { + "epoch": 0.42, + "learning_rate": 1.2962454917365275e-05, + "loss": 0.4544, + "step": 3694 + }, + { + "epoch": 0.42, + "learning_rate": 1.2958919222190885e-05, + "loss": 0.4833, + "step": 3695 + }, + { + "epoch": 0.42, + "learning_rate": 1.2955383121578498e-05, + "loss": 0.469, + "step": 3696 + }, + { + "epoch": 0.42, + "learning_rate": 1.2951846616012642e-05, + "loss": 0.4562, + "step": 3697 + }, + { + "epoch": 0.42, + "learning_rate": 1.2948309705977893e-05, + "loss": 0.4555, + "step": 3698 + }, + { + "epoch": 0.42, + "learning_rate": 1.2944772391958896e-05, + "loss": 0.4657, + "step": 3699 + }, + { + "epoch": 0.42, + "learning_rate": 1.294123467444033e-05, + "loss": 0.4753, + "step": 3700 + }, + { + "epoch": 0.42, + "learning_rate": 1.2937696553906949e-05, + "loss": 0.4762, + "step": 3701 + }, + { + "epoch": 0.42, + "learning_rate": 1.2934158030843554e-05, + "loss": 0.45, + "step": 3702 + }, + { + "epoch": 0.42, + "learning_rate": 1.2930619105734999e-05, + "loss": 0.4803, + "step": 3703 + }, + { + "epoch": 0.42, + "learning_rate": 1.2927079779066196e-05, + "loss": 0.4837, + "step": 3704 + }, + { + "epoch": 0.42, + "learning_rate": 1.2923540051322114e-05, + "loss": 0.4546, + "step": 3705 + }, + { + "epoch": 0.42, + "learning_rate": 1.2919999922987775e-05, + "loss": 0.4731, + "step": 3706 + }, + { + "epoch": 0.42, + "learning_rate": 1.291645939454825e-05, + "loss": 0.4707, + "step": 3707 + }, + { + "epoch": 0.42, + "learning_rate": 1.2912918466488678e-05, + "loss": 0.4518, + "step": 3708 + }, + { + "epoch": 0.42, + "learning_rate": 1.2909377139294242e-05, + "loss": 0.4774, + "step": 3709 + }, + { + "epoch": 0.42, + "learning_rate": 1.290583541345018e-05, + "loss": 0.4678, + "step": 3710 + }, + { + "epoch": 0.42, + "learning_rate": 1.2902293289441791e-05, + "loss": 0.4693, + "step": 3711 + }, + { + "epoch": 0.42, + "learning_rate": 
1.2898750767754427e-05, + "loss": 0.4519, + "step": 3712 + }, + { + "epoch": 0.42, + "learning_rate": 1.2895207848873488e-05, + "loss": 0.4656, + "step": 3713 + }, + { + "epoch": 0.42, + "learning_rate": 1.2891664533284434e-05, + "loss": 0.4719, + "step": 3714 + }, + { + "epoch": 0.42, + "learning_rate": 1.288812082147278e-05, + "loss": 0.4629, + "step": 3715 + }, + { + "epoch": 0.42, + "learning_rate": 1.2884576713924093e-05, + "loss": 0.4535, + "step": 3716 + }, + { + "epoch": 0.42, + "learning_rate": 1.2881032211123994e-05, + "loss": 0.4501, + "step": 3717 + }, + { + "epoch": 0.42, + "learning_rate": 1.2877487313558159e-05, + "loss": 0.4916, + "step": 3718 + }, + { + "epoch": 0.43, + "learning_rate": 1.287394202171232e-05, + "loss": 0.4555, + "step": 3719 + }, + { + "epoch": 0.43, + "learning_rate": 1.287039633607226e-05, + "loss": 0.4693, + "step": 3720 + }, + { + "epoch": 0.43, + "learning_rate": 1.2866850257123817e-05, + "loss": 0.4711, + "step": 3721 + }, + { + "epoch": 0.43, + "learning_rate": 1.2863303785352883e-05, + "loss": 0.4536, + "step": 3722 + }, + { + "epoch": 0.43, + "learning_rate": 1.2859756921245403e-05, + "loss": 0.4687, + "step": 3723 + }, + { + "epoch": 0.43, + "learning_rate": 1.2856209665287378e-05, + "loss": 0.4715, + "step": 3724 + }, + { + "epoch": 0.43, + "learning_rate": 1.2852662017964863e-05, + "loss": 0.4529, + "step": 3725 + }, + { + "epoch": 0.43, + "learning_rate": 1.2849113979763956e-05, + "loss": 0.4851, + "step": 3726 + }, + { + "epoch": 0.43, + "learning_rate": 1.2845565551170829e-05, + "loss": 0.4807, + "step": 3727 + }, + { + "epoch": 0.43, + "learning_rate": 1.2842016732671689e-05, + "loss": 0.456, + "step": 3728 + }, + { + "epoch": 0.43, + "learning_rate": 1.2838467524752808e-05, + "loss": 0.4591, + "step": 3729 + }, + { + "epoch": 0.43, + "learning_rate": 1.2834917927900504e-05, + "loss": 0.4645, + "step": 3730 + }, + { + "epoch": 0.43, + "learning_rate": 1.2831367942601146e-05, + "loss": 0.4533, + "step": 3731 + }, + { 
+ "epoch": 0.43, + "learning_rate": 1.2827817569341167e-05, + "loss": 0.4886, + "step": 3732 + }, + { + "epoch": 0.43, + "learning_rate": 1.282426680860705e-05, + "loss": 0.4656, + "step": 3733 + }, + { + "epoch": 0.43, + "learning_rate": 1.2820715660885328e-05, + "loss": 0.4773, + "step": 3734 + }, + { + "epoch": 0.43, + "learning_rate": 1.2817164126662581e-05, + "loss": 0.4702, + "step": 3735 + }, + { + "epoch": 0.43, + "learning_rate": 1.281361220642545e-05, + "loss": 0.4791, + "step": 3736 + }, + { + "epoch": 0.43, + "learning_rate": 1.281005990066063e-05, + "loss": 0.4713, + "step": 3737 + }, + { + "epoch": 0.43, + "learning_rate": 1.280650720985487e-05, + "loss": 0.4736, + "step": 3738 + }, + { + "epoch": 0.43, + "learning_rate": 1.2802954134494963e-05, + "loss": 0.449, + "step": 3739 + }, + { + "epoch": 0.43, + "learning_rate": 1.2799400675067754e-05, + "loss": 0.4865, + "step": 3740 + }, + { + "epoch": 0.43, + "learning_rate": 1.2795846832060157e-05, + "loss": 0.4372, + "step": 3741 + }, + { + "epoch": 0.43, + "learning_rate": 1.2792292605959125e-05, + "loss": 0.4791, + "step": 3742 + }, + { + "epoch": 0.43, + "learning_rate": 1.2788737997251665e-05, + "loss": 0.4706, + "step": 3743 + }, + { + "epoch": 0.43, + "learning_rate": 1.2785183006424836e-05, + "loss": 0.4556, + "step": 3744 + }, + { + "epoch": 0.43, + "learning_rate": 1.278162763396575e-05, + "loss": 0.4523, + "step": 3745 + }, + { + "epoch": 0.43, + "learning_rate": 1.2778071880361577e-05, + "loss": 0.4699, + "step": 3746 + }, + { + "epoch": 0.43, + "learning_rate": 1.2774515746099536e-05, + "loss": 0.4654, + "step": 3747 + }, + { + "epoch": 0.43, + "learning_rate": 1.277095923166689e-05, + "loss": 0.4471, + "step": 3748 + }, + { + "epoch": 0.43, + "learning_rate": 1.2767402337550966e-05, + "loss": 0.4751, + "step": 3749 + }, + { + "epoch": 0.43, + "learning_rate": 1.2763845064239134e-05, + "loss": 0.4884, + "step": 3750 + }, + { + "epoch": 0.43, + "learning_rate": 1.2760287412218824e-05, + 
"loss": 0.4594, + "step": 3751 + }, + { + "epoch": 0.43, + "learning_rate": 1.275672938197751e-05, + "loss": 0.4848, + "step": 3752 + }, + { + "epoch": 0.43, + "learning_rate": 1.2753170974002727e-05, + "loss": 0.4542, + "step": 3753 + }, + { + "epoch": 0.43, + "learning_rate": 1.2749612188782048e-05, + "loss": 0.4643, + "step": 3754 + }, + { + "epoch": 0.43, + "learning_rate": 1.2746053026803114e-05, + "loss": 0.46, + "step": 3755 + }, + { + "epoch": 0.43, + "learning_rate": 1.2742493488553606e-05, + "loss": 0.4519, + "step": 3756 + }, + { + "epoch": 0.43, + "learning_rate": 1.2738933574521262e-05, + "loss": 0.4655, + "step": 3757 + }, + { + "epoch": 0.43, + "learning_rate": 1.2735373285193867e-05, + "loss": 0.4716, + "step": 3758 + }, + { + "epoch": 0.43, + "learning_rate": 1.2731812621059262e-05, + "loss": 0.469, + "step": 3759 + }, + { + "epoch": 0.43, + "learning_rate": 1.2728251582605335e-05, + "loss": 0.4608, + "step": 3760 + }, + { + "epoch": 0.43, + "learning_rate": 1.2724690170320031e-05, + "loss": 0.4635, + "step": 3761 + }, + { + "epoch": 0.43, + "learning_rate": 1.2721128384691342e-05, + "loss": 0.4552, + "step": 3762 + }, + { + "epoch": 0.43, + "learning_rate": 1.2717566226207311e-05, + "loss": 0.4805, + "step": 3763 + }, + { + "epoch": 0.43, + "learning_rate": 1.2714003695356037e-05, + "loss": 0.4595, + "step": 3764 + }, + { + "epoch": 0.43, + "learning_rate": 1.2710440792625662e-05, + "loss": 0.4617, + "step": 3765 + }, + { + "epoch": 0.43, + "learning_rate": 1.2706877518504384e-05, + "loss": 0.485, + "step": 3766 + }, + { + "epoch": 0.43, + "learning_rate": 1.2703313873480451e-05, + "loss": 0.462, + "step": 3767 + }, + { + "epoch": 0.43, + "learning_rate": 1.2699749858042164e-05, + "loss": 0.4743, + "step": 3768 + }, + { + "epoch": 0.43, + "learning_rate": 1.269618547267787e-05, + "loss": 0.4731, + "step": 3769 + }, + { + "epoch": 0.43, + "learning_rate": 1.2692620717875972e-05, + "loss": 0.4591, + "step": 3770 + }, + { + "epoch": 0.43, + 
"learning_rate": 1.2689055594124919e-05, + "loss": 0.4616, + "step": 3771 + }, + { + "epoch": 0.43, + "learning_rate": 1.2685490101913214e-05, + "loss": 0.4946, + "step": 3772 + }, + { + "epoch": 0.43, + "learning_rate": 1.2681924241729409e-05, + "loss": 0.4563, + "step": 3773 + }, + { + "epoch": 0.43, + "learning_rate": 1.2678358014062104e-05, + "loss": 0.4606, + "step": 3774 + }, + { + "epoch": 0.43, + "learning_rate": 1.2674791419399956e-05, + "loss": 0.4596, + "step": 3775 + }, + { + "epoch": 0.43, + "learning_rate": 1.2671224458231664e-05, + "loss": 0.5041, + "step": 3776 + }, + { + "epoch": 0.43, + "learning_rate": 1.2667657131045983e-05, + "loss": 0.4335, + "step": 3777 + }, + { + "epoch": 0.43, + "learning_rate": 1.2664089438331716e-05, + "loss": 0.4698, + "step": 3778 + }, + { + "epoch": 0.43, + "learning_rate": 1.266052138057772e-05, + "loss": 0.4479, + "step": 3779 + }, + { + "epoch": 0.43, + "learning_rate": 1.2656952958272893e-05, + "loss": 0.4608, + "step": 3780 + }, + { + "epoch": 0.43, + "learning_rate": 1.2653384171906192e-05, + "loss": 0.4745, + "step": 3781 + }, + { + "epoch": 0.43, + "learning_rate": 1.264981502196662e-05, + "loss": 0.4606, + "step": 3782 + }, + { + "epoch": 0.43, + "learning_rate": 1.2646245508943227e-05, + "loss": 0.47, + "step": 3783 + }, + { + "epoch": 0.43, + "learning_rate": 1.2642675633325122e-05, + "loss": 0.4923, + "step": 3784 + }, + { + "epoch": 0.43, + "learning_rate": 1.2639105395601452e-05, + "loss": 0.4503, + "step": 3785 + }, + { + "epoch": 0.43, + "learning_rate": 1.2635534796261424e-05, + "loss": 0.4632, + "step": 3786 + }, + { + "epoch": 0.43, + "learning_rate": 1.2631963835794285e-05, + "loss": 0.4807, + "step": 3787 + }, + { + "epoch": 0.43, + "learning_rate": 1.2628392514689339e-05, + "loss": 0.4647, + "step": 3788 + }, + { + "epoch": 0.43, + "learning_rate": 1.2624820833435939e-05, + "loss": 0.4696, + "step": 3789 + }, + { + "epoch": 0.43, + "learning_rate": 1.262124879252348e-05, + "loss": 0.4604, + 
"step": 3790 + }, + { + "epoch": 0.43, + "learning_rate": 1.2617676392441419e-05, + "loss": 0.4466, + "step": 3791 + }, + { + "epoch": 0.43, + "learning_rate": 1.2614103633679244e-05, + "loss": 0.4844, + "step": 3792 + }, + { + "epoch": 0.43, + "learning_rate": 1.2610530516726506e-05, + "loss": 0.4597, + "step": 3793 + }, + { + "epoch": 0.43, + "learning_rate": 1.260695704207281e-05, + "loss": 0.4664, + "step": 3794 + }, + { + "epoch": 0.43, + "learning_rate": 1.2603383210207796e-05, + "loss": 0.4714, + "step": 3795 + }, + { + "epoch": 0.43, + "learning_rate": 1.2599809021621157e-05, + "loss": 0.4559, + "step": 3796 + }, + { + "epoch": 0.43, + "learning_rate": 1.2596234476802636e-05, + "loss": 0.4551, + "step": 3797 + }, + { + "epoch": 0.43, + "learning_rate": 1.2592659576242028e-05, + "loss": 0.4842, + "step": 3798 + }, + { + "epoch": 0.43, + "learning_rate": 1.2589084320429178e-05, + "loss": 0.4632, + "step": 3799 + }, + { + "epoch": 0.43, + "learning_rate": 1.2585508709853971e-05, + "loss": 0.477, + "step": 3800 + }, + { + "epoch": 0.43, + "learning_rate": 1.2581932745006343e-05, + "loss": 0.4712, + "step": 3801 + }, + { + "epoch": 0.43, + "learning_rate": 1.2578356426376283e-05, + "loss": 0.4702, + "step": 3802 + }, + { + "epoch": 0.43, + "learning_rate": 1.2574779754453831e-05, + "loss": 0.4609, + "step": 3803 + }, + { + "epoch": 0.43, + "learning_rate": 1.257120272972907e-05, + "loss": 0.4698, + "step": 3804 + }, + { + "epoch": 0.43, + "learning_rate": 1.2567625352692127e-05, + "loss": 0.4681, + "step": 3805 + }, + { + "epoch": 0.43, + "learning_rate": 1.2564047623833186e-05, + "loss": 0.4557, + "step": 3806 + }, + { + "epoch": 0.44, + "learning_rate": 1.2560469543642472e-05, + "loss": 0.4728, + "step": 3807 + }, + { + "epoch": 0.44, + "learning_rate": 1.255689111261027e-05, + "loss": 0.4641, + "step": 3808 + }, + { + "epoch": 0.44, + "learning_rate": 1.2553312331226896e-05, + "loss": 0.4671, + "step": 3809 + }, + { + "epoch": 0.44, + "learning_rate": 
1.254973319998273e-05, + "loss": 0.4723, + "step": 3810 + }, + { + "epoch": 0.44, + "learning_rate": 1.2546153719368189e-05, + "loss": 0.4556, + "step": 3811 + }, + { + "epoch": 0.44, + "learning_rate": 1.2542573889873741e-05, + "loss": 0.479, + "step": 3812 + }, + { + "epoch": 0.44, + "learning_rate": 1.2538993711989906e-05, + "loss": 0.4741, + "step": 3813 + }, + { + "epoch": 0.44, + "learning_rate": 1.2535413186207247e-05, + "loss": 0.4584, + "step": 3814 + }, + { + "epoch": 0.44, + "learning_rate": 1.2531832313016374e-05, + "loss": 0.4645, + "step": 3815 + }, + { + "epoch": 0.44, + "learning_rate": 1.2528251092907948e-05, + "loss": 0.4779, + "step": 3816 + }, + { + "epoch": 0.44, + "learning_rate": 1.2524669526372674e-05, + "loss": 0.4673, + "step": 3817 + }, + { + "epoch": 0.44, + "learning_rate": 1.2521087613901313e-05, + "loss": 0.4735, + "step": 3818 + }, + { + "epoch": 0.44, + "learning_rate": 1.251750535598466e-05, + "loss": 0.4649, + "step": 3819 + }, + { + "epoch": 0.44, + "learning_rate": 1.2513922753113567e-05, + "loss": 0.4772, + "step": 3820 + }, + { + "epoch": 0.44, + "learning_rate": 1.2510339805778932e-05, + "loss": 0.4602, + "step": 3821 + }, + { + "epoch": 0.44, + "learning_rate": 1.2506756514471696e-05, + "loss": 0.4734, + "step": 3822 + }, + { + "epoch": 0.44, + "learning_rate": 1.2503172879682853e-05, + "loss": 0.4635, + "step": 3823 + }, + { + "epoch": 0.44, + "learning_rate": 1.2499588901903437e-05, + "loss": 0.4662, + "step": 3824 + }, + { + "epoch": 0.44, + "learning_rate": 1.2496004581624538e-05, + "loss": 0.4722, + "step": 3825 + }, + { + "epoch": 0.44, + "learning_rate": 1.2492419919337282e-05, + "loss": 0.4521, + "step": 3826 + }, + { + "epoch": 0.44, + "learning_rate": 1.2488834915532852e-05, + "loss": 0.4681, + "step": 3827 + }, + { + "epoch": 0.44, + "learning_rate": 1.2485249570702471e-05, + "loss": 0.5047, + "step": 3828 + }, + { + "epoch": 0.44, + "learning_rate": 1.2481663885337417e-05, + "loss": 0.4595, + "step": 3829 + }, + 
{ + "epoch": 0.44, + "learning_rate": 1.2478077859929e-05, + "loss": 0.4662, + "step": 3830 + }, + { + "epoch": 0.44, + "learning_rate": 1.2474491494968593e-05, + "loss": 0.4725, + "step": 3831 + }, + { + "epoch": 0.44, + "learning_rate": 1.2470904790947605e-05, + "loss": 0.4675, + "step": 3832 + }, + { + "epoch": 0.44, + "learning_rate": 1.2467317748357493e-05, + "loss": 0.4636, + "step": 3833 + }, + { + "epoch": 0.44, + "learning_rate": 1.2463730367689768e-05, + "loss": 0.4673, + "step": 3834 + }, + { + "epoch": 0.44, + "learning_rate": 1.246014264943597e-05, + "loss": 0.4526, + "step": 3835 + }, + { + "epoch": 0.44, + "learning_rate": 1.2456554594087709e-05, + "loss": 0.4888, + "step": 3836 + }, + { + "epoch": 0.44, + "learning_rate": 1.245296620213662e-05, + "loss": 0.4601, + "step": 3837 + }, + { + "epoch": 0.44, + "learning_rate": 1.2449377474074398e-05, + "loss": 0.4894, + "step": 3838 + }, + { + "epoch": 0.44, + "learning_rate": 1.2445788410392778e-05, + "loss": 0.4568, + "step": 3839 + }, + { + "epoch": 0.44, + "learning_rate": 1.2442199011583538e-05, + "loss": 0.4489, + "step": 3840 + }, + { + "epoch": 0.44, + "learning_rate": 1.2438609278138509e-05, + "loss": 0.4727, + "step": 3841 + }, + { + "epoch": 0.44, + "learning_rate": 1.2435019210549564e-05, + "loss": 0.4745, + "step": 3842 + }, + { + "epoch": 0.44, + "learning_rate": 1.2431428809308625e-05, + "loss": 0.4597, + "step": 3843 + }, + { + "epoch": 0.44, + "learning_rate": 1.2427838074907654e-05, + "loss": 0.4838, + "step": 3844 + }, + { + "epoch": 0.44, + "learning_rate": 1.2424247007838659e-05, + "loss": 0.4504, + "step": 3845 + }, + { + "epoch": 0.44, + "learning_rate": 1.2420655608593701e-05, + "loss": 0.4759, + "step": 3846 + }, + { + "epoch": 0.44, + "learning_rate": 1.2417063877664883e-05, + "loss": 0.4513, + "step": 3847 + }, + { + "epoch": 0.44, + "learning_rate": 1.241347181554435e-05, + "loss": 0.471, + "step": 3848 + }, + { + "epoch": 0.44, + "learning_rate": 1.2409879422724293e-05, + 
"loss": 0.4847, + "step": 3849 + }, + { + "epoch": 0.44, + "learning_rate": 1.240628669969695e-05, + "loss": 0.4829, + "step": 3850 + }, + { + "epoch": 0.44, + "learning_rate": 1.2402693646954607e-05, + "loss": 0.4523, + "step": 3851 + }, + { + "epoch": 0.44, + "learning_rate": 1.2399100264989593e-05, + "loss": 0.4763, + "step": 3852 + }, + { + "epoch": 0.44, + "learning_rate": 1.2395506554294281e-05, + "loss": 0.4629, + "step": 3853 + }, + { + "epoch": 0.44, + "learning_rate": 1.2391912515361085e-05, + "loss": 0.4689, + "step": 3854 + }, + { + "epoch": 0.44, + "learning_rate": 1.2388318148682474e-05, + "loss": 0.459, + "step": 3855 + }, + { + "epoch": 0.44, + "learning_rate": 1.2384723454750957e-05, + "loss": 0.4638, + "step": 3856 + }, + { + "epoch": 0.44, + "learning_rate": 1.2381128434059082e-05, + "loss": 0.4496, + "step": 3857 + }, + { + "epoch": 0.44, + "learning_rate": 1.2377533087099451e-05, + "loss": 0.4754, + "step": 3858 + }, + { + "epoch": 0.44, + "learning_rate": 1.2373937414364703e-05, + "loss": 0.4502, + "step": 3859 + }, + { + "epoch": 0.44, + "learning_rate": 1.237034141634753e-05, + "loss": 0.4675, + "step": 3860 + }, + { + "epoch": 0.44, + "learning_rate": 1.2366745093540667e-05, + "loss": 0.4682, + "step": 3861 + }, + { + "epoch": 0.44, + "learning_rate": 1.2363148446436882e-05, + "loss": 0.4683, + "step": 3862 + }, + { + "epoch": 0.44, + "learning_rate": 1.2359551475529e-05, + "loss": 0.4558, + "step": 3863 + }, + { + "epoch": 0.44, + "learning_rate": 1.2355954181309883e-05, + "loss": 0.4749, + "step": 3864 + }, + { + "epoch": 0.44, + "learning_rate": 1.235235656427245e-05, + "loss": 0.4658, + "step": 3865 + }, + { + "epoch": 0.44, + "learning_rate": 1.2348758624909644e-05, + "loss": 0.4681, + "step": 3866 + }, + { + "epoch": 0.44, + "learning_rate": 1.2345160363714471e-05, + "loss": 0.4694, + "step": 3867 + }, + { + "epoch": 0.44, + "learning_rate": 1.2341561781179965e-05, + "loss": 0.4718, + "step": 3868 + }, + { + "epoch": 0.44, + 
"learning_rate": 1.233796287779922e-05, + "loss": 0.4808, + "step": 3869 + }, + { + "epoch": 0.44, + "learning_rate": 1.2334363654065363e-05, + "loss": 0.4671, + "step": 3870 + }, + { + "epoch": 0.44, + "learning_rate": 1.2330764110471567e-05, + "loss": 0.4458, + "step": 3871 + }, + { + "epoch": 0.44, + "learning_rate": 1.2327164247511051e-05, + "loss": 0.4809, + "step": 3872 + }, + { + "epoch": 0.44, + "learning_rate": 1.2323564065677078e-05, + "loss": 0.4554, + "step": 3873 + }, + { + "epoch": 0.44, + "learning_rate": 1.2319963565462949e-05, + "loss": 0.4595, + "step": 3874 + }, + { + "epoch": 0.44, + "learning_rate": 1.2316362747362019e-05, + "loss": 0.4644, + "step": 3875 + }, + { + "epoch": 0.44, + "learning_rate": 1.2312761611867673e-05, + "loss": 0.463, + "step": 3876 + }, + { + "epoch": 0.44, + "learning_rate": 1.2309160159473354e-05, + "loss": 0.4827, + "step": 3877 + }, + { + "epoch": 0.44, + "learning_rate": 1.2305558390672539e-05, + "loss": 0.4786, + "step": 3878 + }, + { + "epoch": 0.44, + "learning_rate": 1.2301956305958746e-05, + "loss": 0.4691, + "step": 3879 + }, + { + "epoch": 0.44, + "learning_rate": 1.2298353905825549e-05, + "loss": 0.4597, + "step": 3880 + }, + { + "epoch": 0.44, + "learning_rate": 1.2294751190766552e-05, + "loss": 0.4655, + "step": 3881 + }, + { + "epoch": 0.44, + "learning_rate": 1.229114816127541e-05, + "loss": 0.4744, + "step": 3882 + }, + { + "epoch": 0.44, + "learning_rate": 1.2287544817845817e-05, + "loss": 0.4498, + "step": 3883 + }, + { + "epoch": 0.44, + "learning_rate": 1.2283941160971512e-05, + "loss": 0.4816, + "step": 3884 + }, + { + "epoch": 0.44, + "learning_rate": 1.2280337191146276e-05, + "loss": 0.4777, + "step": 3885 + }, + { + "epoch": 0.44, + "learning_rate": 1.2276732908863933e-05, + "loss": 0.4736, + "step": 3886 + }, + { + "epoch": 0.44, + "learning_rate": 1.2273128314618353e-05, + "loss": 0.462, + "step": 3887 + }, + { + "epoch": 0.44, + "learning_rate": 1.226952340890344e-05, + "loss": 0.4851, + 
"step": 3888 + }, + { + "epoch": 0.44, + "learning_rate": 1.2265918192213153e-05, + "loss": 0.4576, + "step": 3889 + }, + { + "epoch": 0.44, + "learning_rate": 1.2262312665041482e-05, + "loss": 0.4893, + "step": 3890 + }, + { + "epoch": 0.44, + "learning_rate": 1.2258706827882472e-05, + "loss": 0.462, + "step": 3891 + }, + { + "epoch": 0.44, + "learning_rate": 1.2255100681230192e-05, + "loss": 0.4632, + "step": 3892 + }, + { + "epoch": 0.44, + "learning_rate": 1.2251494225578775e-05, + "loss": 0.4544, + "step": 3893 + }, + { + "epoch": 0.45, + "learning_rate": 1.224788746142238e-05, + "loss": 0.4882, + "step": 3894 + }, + { + "epoch": 0.45, + "learning_rate": 1.2244280389255218e-05, + "loss": 0.4637, + "step": 3895 + }, + { + "epoch": 0.45, + "learning_rate": 1.2240673009571536e-05, + "loss": 0.459, + "step": 3896 + }, + { + "epoch": 0.45, + "learning_rate": 1.2237065322865625e-05, + "loss": 0.4563, + "step": 3897 + }, + { + "epoch": 0.45, + "learning_rate": 1.223345732963182e-05, + "loss": 0.4665, + "step": 3898 + }, + { + "epoch": 0.45, + "learning_rate": 1.2229849030364496e-05, + "loss": 0.4574, + "step": 3899 + }, + { + "epoch": 0.45, + "learning_rate": 1.2226240425558071e-05, + "loss": 0.4701, + "step": 3900 + }, + { + "epoch": 0.45, + "learning_rate": 1.2222631515707005e-05, + "loss": 0.4649, + "step": 3901 + }, + { + "epoch": 0.45, + "learning_rate": 1.2219022301305796e-05, + "loss": 0.4686, + "step": 3902 + }, + { + "epoch": 0.45, + "learning_rate": 1.2215412782848993e-05, + "loss": 0.4741, + "step": 3903 + }, + { + "epoch": 0.45, + "learning_rate": 1.2211802960831176e-05, + "loss": 0.4629, + "step": 3904 + }, + { + "epoch": 0.45, + "learning_rate": 1.2208192835746973e-05, + "loss": 0.4678, + "step": 3905 + }, + { + "epoch": 0.45, + "learning_rate": 1.2204582408091047e-05, + "loss": 0.4732, + "step": 3906 + }, + { + "epoch": 0.45, + "learning_rate": 1.2200971678358113e-05, + "loss": 0.4798, + "step": 3907 + }, + { + "epoch": 0.45, + "learning_rate": 
1.2197360647042922e-05, + "loss": 0.4674, + "step": 3908 + }, + { + "epoch": 0.45, + "learning_rate": 1.2193749314640264e-05, + "loss": 0.4708, + "step": 3909 + }, + { + "epoch": 0.45, + "learning_rate": 1.2190137681644968e-05, + "loss": 0.4884, + "step": 3910 + }, + { + "epoch": 0.45, + "learning_rate": 1.2186525748551914e-05, + "loss": 0.468, + "step": 3911 + }, + { + "epoch": 0.45, + "learning_rate": 1.2182913515856016e-05, + "loss": 0.4619, + "step": 3912 + }, + { + "epoch": 0.45, + "learning_rate": 1.2179300984052233e-05, + "loss": 0.4604, + "step": 3913 + }, + { + "epoch": 0.45, + "learning_rate": 1.217568815363556e-05, + "loss": 0.4674, + "step": 3914 + }, + { + "epoch": 0.45, + "learning_rate": 1.2172075025101032e-05, + "loss": 0.4838, + "step": 3915 + }, + { + "epoch": 0.45, + "learning_rate": 1.2168461598943728e-05, + "loss": 0.4492, + "step": 3916 + }, + { + "epoch": 0.45, + "learning_rate": 1.2164847875658776e-05, + "loss": 0.4636, + "step": 3917 + }, + { + "epoch": 0.45, + "learning_rate": 1.2161233855741332e-05, + "loss": 0.4594, + "step": 3918 + }, + { + "epoch": 0.45, + "learning_rate": 1.2157619539686597e-05, + "loss": 0.488, + "step": 3919 + }, + { + "epoch": 0.45, + "learning_rate": 1.2154004927989815e-05, + "loss": 0.4618, + "step": 3920 + }, + { + "epoch": 0.45, + "learning_rate": 1.2150390021146263e-05, + "loss": 0.4698, + "step": 3921 + }, + { + "epoch": 0.45, + "learning_rate": 1.2146774819651275e-05, + "loss": 0.4638, + "step": 3922 + }, + { + "epoch": 0.45, + "learning_rate": 1.2143159324000204e-05, + "loss": 0.4549, + "step": 3923 + }, + { + "epoch": 0.45, + "learning_rate": 1.2139543534688456e-05, + "loss": 0.4666, + "step": 3924 + }, + { + "epoch": 0.45, + "learning_rate": 1.2135927452211477e-05, + "loss": 0.4641, + "step": 3925 + }, + { + "epoch": 0.45, + "learning_rate": 1.2132311077064749e-05, + "loss": 0.4634, + "step": 3926 + }, + { + "epoch": 0.45, + "learning_rate": 1.2128694409743797e-05, + "loss": 0.4843, + "step": 3927 + }, + 
{ + "epoch": 0.45, + "learning_rate": 1.2125077450744187e-05, + "loss": 0.4659, + "step": 3928 + }, + { + "epoch": 0.45, + "learning_rate": 1.2121460200561521e-05, + "loss": 0.4573, + "step": 3929 + }, + { + "epoch": 0.45, + "learning_rate": 1.2117842659691444e-05, + "loss": 0.4587, + "step": 3930 + }, + { + "epoch": 0.45, + "learning_rate": 1.2114224828629638e-05, + "loss": 0.467, + "step": 3931 + }, + { + "epoch": 0.45, + "learning_rate": 1.2110606707871828e-05, + "loss": 0.4531, + "step": 3932 + }, + { + "epoch": 0.45, + "learning_rate": 1.2106988297913778e-05, + "loss": 0.4764, + "step": 3933 + }, + { + "epoch": 0.45, + "learning_rate": 1.2103369599251289e-05, + "loss": 0.4769, + "step": 3934 + }, + { + "epoch": 0.45, + "learning_rate": 1.2099750612380205e-05, + "loss": 0.4911, + "step": 3935 + }, + { + "epoch": 0.45, + "learning_rate": 1.2096131337796408e-05, + "loss": 0.4438, + "step": 3936 + }, + { + "epoch": 0.45, + "learning_rate": 1.2092511775995821e-05, + "loss": 0.4926, + "step": 3937 + }, + { + "epoch": 0.45, + "learning_rate": 1.20888919274744e-05, + "loss": 0.4568, + "step": 3938 + }, + { + "epoch": 0.45, + "learning_rate": 1.208527179272815e-05, + "loss": 0.4464, + "step": 3939 + }, + { + "epoch": 0.45, + "learning_rate": 1.2081651372253107e-05, + "loss": 0.4697, + "step": 3940 + }, + { + "epoch": 0.45, + "learning_rate": 1.2078030666545351e-05, + "loss": 0.4728, + "step": 3941 + }, + { + "epoch": 0.45, + "learning_rate": 1.2074409676101e-05, + "loss": 0.4593, + "step": 3942 + }, + { + "epoch": 0.45, + "learning_rate": 1.2070788401416209e-05, + "loss": 0.4709, + "step": 3943 + }, + { + "epoch": 0.45, + "learning_rate": 1.2067166842987175e-05, + "loss": 0.4667, + "step": 3944 + }, + { + "epoch": 0.45, + "learning_rate": 1.206354500131013e-05, + "loss": 0.4728, + "step": 3945 + }, + { + "epoch": 0.45, + "learning_rate": 1.205992287688135e-05, + "loss": 0.4962, + "step": 3946 + }, + { + "epoch": 0.45, + "learning_rate": 1.2056300470197144e-05, + 
"loss": 0.4613, + "step": 3947 + }, + { + "epoch": 0.45, + "learning_rate": 1.2052677781753869e-05, + "loss": 0.4539, + "step": 3948 + }, + { + "epoch": 0.45, + "learning_rate": 1.2049054812047905e-05, + "loss": 0.4602, + "step": 3949 + }, + { + "epoch": 0.45, + "learning_rate": 1.2045431561575685e-05, + "loss": 0.4621, + "step": 3950 + }, + { + "epoch": 0.45, + "learning_rate": 1.2041808030833675e-05, + "loss": 0.4672, + "step": 3951 + }, + { + "epoch": 0.45, + "learning_rate": 1.2038184220318381e-05, + "loss": 0.4647, + "step": 3952 + }, + { + "epoch": 0.45, + "learning_rate": 1.2034560130526341e-05, + "loss": 0.4773, + "step": 3953 + }, + { + "epoch": 0.45, + "learning_rate": 1.2030935761954137e-05, + "loss": 0.4605, + "step": 3954 + }, + { + "epoch": 0.45, + "learning_rate": 1.2027311115098395e-05, + "loss": 0.4847, + "step": 3955 + }, + { + "epoch": 0.45, + "learning_rate": 1.2023686190455766e-05, + "loss": 0.4613, + "step": 3956 + }, + { + "epoch": 0.45, + "learning_rate": 1.202006098852295e-05, + "loss": 0.4723, + "step": 3957 + }, + { + "epoch": 0.45, + "learning_rate": 1.2016435509796677e-05, + "loss": 0.4552, + "step": 3958 + }, + { + "epoch": 0.45, + "learning_rate": 1.2012809754773718e-05, + "loss": 0.4701, + "step": 3959 + }, + { + "epoch": 0.45, + "learning_rate": 1.2009183723950886e-05, + "loss": 0.4679, + "step": 3960 + }, + { + "epoch": 0.45, + "learning_rate": 1.2005557417825029e-05, + "loss": 0.4634, + "step": 3961 + }, + { + "epoch": 0.45, + "learning_rate": 1.2001930836893026e-05, + "loss": 0.4503, + "step": 3962 + }, + { + "epoch": 0.45, + "learning_rate": 1.1998303981651804e-05, + "loss": 0.4864, + "step": 3963 + }, + { + "epoch": 0.45, + "learning_rate": 1.199467685259832e-05, + "loss": 0.4726, + "step": 3964 + }, + { + "epoch": 0.45, + "learning_rate": 1.1991049450229577e-05, + "loss": 0.4478, + "step": 3965 + }, + { + "epoch": 0.45, + "learning_rate": 1.1987421775042605e-05, + "loss": 0.4787, + "step": 3966 + }, + { + "epoch": 0.45, + 
"learning_rate": 1.1983793827534477e-05, + "loss": 0.4795, + "step": 3967 + }, + { + "epoch": 0.45, + "learning_rate": 1.1980165608202303e-05, + "loss": 0.4697, + "step": 3968 + }, + { + "epoch": 0.45, + "learning_rate": 1.1976537117543234e-05, + "loss": 0.4527, + "step": 3969 + }, + { + "epoch": 0.45, + "learning_rate": 1.1972908356054455e-05, + "loss": 0.4764, + "step": 3970 + }, + { + "epoch": 0.45, + "learning_rate": 1.1969279324233179e-05, + "loss": 0.4666, + "step": 3971 + }, + { + "epoch": 0.45, + "learning_rate": 1.1965650022576672e-05, + "loss": 0.4536, + "step": 3972 + }, + { + "epoch": 0.45, + "learning_rate": 1.196202045158222e-05, + "loss": 0.4626, + "step": 3973 + }, + { + "epoch": 0.45, + "learning_rate": 1.1958390611747167e-05, + "loss": 0.471, + "step": 3974 + }, + { + "epoch": 0.45, + "learning_rate": 1.1954760503568878e-05, + "loss": 0.4596, + "step": 3975 + }, + { + "epoch": 0.45, + "learning_rate": 1.1951130127544756e-05, + "loss": 0.4595, + "step": 3976 + }, + { + "epoch": 0.45, + "learning_rate": 1.1947499484172245e-05, + "loss": 0.4803, + "step": 3977 + }, + { + "epoch": 0.45, + "learning_rate": 1.1943868573948825e-05, + "loss": 0.4693, + "step": 3978 + }, + { + "epoch": 0.45, + "learning_rate": 1.194023739737201e-05, + "loss": 0.4819, + "step": 3979 + }, + { + "epoch": 0.45, + "learning_rate": 1.1936605954939355e-05, + "loss": 0.4549, + "step": 3980 + }, + { + "epoch": 0.45, + "learning_rate": 1.1932974247148445e-05, + "loss": 0.4593, + "step": 3981 + }, + { + "epoch": 0.46, + "learning_rate": 1.192934227449691e-05, + "loss": 0.4699, + "step": 3982 + }, + { + "epoch": 0.46, + "learning_rate": 1.1925710037482405e-05, + "loss": 0.4489, + "step": 3983 + }, + { + "epoch": 0.46, + "learning_rate": 1.1922077536602634e-05, + "loss": 0.4618, + "step": 3984 + }, + { + "epoch": 0.46, + "learning_rate": 1.1918444772355329e-05, + "loss": 0.4802, + "step": 3985 + }, + { + "epoch": 0.46, + "learning_rate": 1.1914811745238256e-05, + "loss": 0.4796, + 
"step": 3986 + }, + { + "epoch": 0.46, + "learning_rate": 1.1911178455749223e-05, + "loss": 0.4778, + "step": 3987 + }, + { + "epoch": 0.46, + "learning_rate": 1.1907544904386074e-05, + "loss": 0.4448, + "step": 3988 + }, + { + "epoch": 0.46, + "learning_rate": 1.1903911091646684e-05, + "loss": 0.4815, + "step": 3989 + }, + { + "epoch": 0.46, + "learning_rate": 1.190027701802897e-05, + "loss": 0.4605, + "step": 3990 + }, + { + "epoch": 0.46, + "learning_rate": 1.1896642684030874e-05, + "loss": 0.4692, + "step": 3991 + }, + { + "epoch": 0.46, + "learning_rate": 1.1893008090150389e-05, + "loss": 0.4576, + "step": 3992 + }, + { + "epoch": 0.46, + "learning_rate": 1.1889373236885531e-05, + "loss": 0.472, + "step": 3993 + }, + { + "epoch": 0.46, + "learning_rate": 1.1885738124734359e-05, + "loss": 0.4549, + "step": 3994 + }, + { + "epoch": 0.46, + "learning_rate": 1.188210275419496e-05, + "loss": 0.481, + "step": 3995 + }, + { + "epoch": 0.46, + "learning_rate": 1.1878467125765464e-05, + "loss": 0.449, + "step": 3996 + }, + { + "epoch": 0.46, + "learning_rate": 1.1874831239944034e-05, + "loss": 0.4738, + "step": 3997 + }, + { + "epoch": 0.46, + "learning_rate": 1.1871195097228864e-05, + "loss": 0.4507, + "step": 3998 + }, + { + "epoch": 0.46, + "learning_rate": 1.1867558698118192e-05, + "loss": 0.4777, + "step": 3999 + }, + { + "epoch": 0.46, + "learning_rate": 1.1863922043110282e-05, + "loss": 0.4692, + "step": 4000 + }, + { + "epoch": 0.46, + "learning_rate": 1.1860285132703435e-05, + "loss": 0.474, + "step": 4001 + }, + { + "epoch": 0.46, + "learning_rate": 1.1856647967395995e-05, + "loss": 0.462, + "step": 4002 + }, + { + "epoch": 0.46, + "learning_rate": 1.185301054768633e-05, + "loss": 0.4657, + "step": 4003 + }, + { + "epoch": 0.46, + "learning_rate": 1.1849372874072852e-05, + "loss": 0.4854, + "step": 4004 + }, + { + "epoch": 0.46, + "learning_rate": 1.1845734947054e-05, + "loss": 0.4589, + "step": 4005 + }, + { + "epoch": 0.46, + "learning_rate": 
1.1842096767128249e-05, + "loss": 0.4663, + "step": 4006 + }, + { + "epoch": 0.46, + "learning_rate": 1.1838458334794116e-05, + "loss": 0.4764, + "step": 4007 + }, + { + "epoch": 0.46, + "learning_rate": 1.1834819650550144e-05, + "loss": 0.4485, + "step": 4008 + }, + { + "epoch": 0.46, + "learning_rate": 1.1831180714894918e-05, + "loss": 0.4674, + "step": 4009 + }, + { + "epoch": 0.46, + "learning_rate": 1.1827541528327052e-05, + "loss": 0.4603, + "step": 4010 + }, + { + "epoch": 0.46, + "learning_rate": 1.182390209134519e-05, + "loss": 0.4521, + "step": 4011 + }, + { + "epoch": 0.46, + "learning_rate": 1.1820262404448023e-05, + "loss": 0.4839, + "step": 4012 + }, + { + "epoch": 0.46, + "learning_rate": 1.181662246813427e-05, + "loss": 0.4953, + "step": 4013 + }, + { + "epoch": 0.46, + "learning_rate": 1.1812982282902676e-05, + "loss": 0.4508, + "step": 4014 + }, + { + "epoch": 0.46, + "learning_rate": 1.1809341849252034e-05, + "loss": 0.4796, + "step": 4015 + }, + { + "epoch": 0.46, + "learning_rate": 1.180570116768116e-05, + "loss": 0.4567, + "step": 4016 + }, + { + "epoch": 0.46, + "learning_rate": 1.1802060238688915e-05, + "loss": 0.4681, + "step": 4017 + }, + { + "epoch": 0.46, + "learning_rate": 1.1798419062774185e-05, + "loss": 0.4736, + "step": 4018 + }, + { + "epoch": 0.46, + "learning_rate": 1.179477764043589e-05, + "loss": 0.4584, + "step": 4019 + }, + { + "epoch": 0.46, + "learning_rate": 1.1791135972172989e-05, + "loss": 0.4717, + "step": 4020 + }, + { + "epoch": 0.46, + "learning_rate": 1.1787494058484468e-05, + "loss": 0.4796, + "step": 4021 + }, + { + "epoch": 0.46, + "learning_rate": 1.1783851899869357e-05, + "loss": 0.4622, + "step": 4022 + }, + { + "epoch": 0.46, + "learning_rate": 1.1780209496826707e-05, + "loss": 0.4661, + "step": 4023 + }, + { + "epoch": 0.46, + "learning_rate": 1.177656684985561e-05, + "loss": 0.4547, + "step": 4024 + }, + { + "epoch": 0.46, + "learning_rate": 1.1772923959455188e-05, + "loss": 0.4589, + "step": 4025 + }, + { 
+ "epoch": 0.46, + "learning_rate": 1.1769280826124604e-05, + "loss": 0.4461, + "step": 4026 + }, + { + "epoch": 0.46, + "learning_rate": 1.1765637450363048e-05, + "loss": 0.479, + "step": 4027 + }, + { + "epoch": 0.46, + "learning_rate": 1.176199383266974e-05, + "loss": 0.475, + "step": 4028 + }, + { + "epoch": 0.46, + "learning_rate": 1.1758349973543936e-05, + "loss": 0.485, + "step": 4029 + }, + { + "epoch": 0.46, + "learning_rate": 1.1754705873484929e-05, + "loss": 0.4537, + "step": 4030 + }, + { + "epoch": 0.46, + "learning_rate": 1.1751061532992045e-05, + "loss": 0.4889, + "step": 4031 + }, + { + "epoch": 0.46, + "learning_rate": 1.1747416952564632e-05, + "loss": 0.4695, + "step": 4032 + }, + { + "epoch": 0.46, + "learning_rate": 1.1743772132702086e-05, + "loss": 0.4752, + "step": 4033 + }, + { + "epoch": 0.46, + "learning_rate": 1.1740127073903826e-05, + "loss": 0.4519, + "step": 4034 + }, + { + "epoch": 0.46, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.4603, + "step": 4035 + }, + { + "epoch": 0.46, + "learning_rate": 1.1732836241498013e-05, + "loss": 0.4767, + "step": 4036 + }, + { + "epoch": 0.46, + "learning_rate": 1.1729190468889466e-05, + "loss": 0.4883, + "step": 4037 + }, + { + "epoch": 0.46, + "learning_rate": 1.1725544459343221e-05, + "loss": 0.4877, + "step": 4038 + }, + { + "epoch": 0.46, + "learning_rate": 1.172189821335886e-05, + "loss": 0.4644, + "step": 4039 + }, + { + "epoch": 0.46, + "learning_rate": 1.1718251731436001e-05, + "loss": 0.4534, + "step": 4040 + }, + { + "epoch": 0.46, + "learning_rate": 1.1714605014074291e-05, + "loss": 0.4697, + "step": 4041 + }, + { + "epoch": 0.46, + "learning_rate": 1.1710958061773413e-05, + "loss": 0.4632, + "step": 4042 + }, + { + "epoch": 0.46, + "learning_rate": 1.1707310875033085e-05, + "loss": 0.4561, + "step": 4043 + }, + { + "epoch": 0.46, + "learning_rate": 1.1703663454353045e-05, + "loss": 0.4752, + "step": 4044 + }, + { + "epoch": 0.46, + "learning_rate": 1.1700015800233078e-05, + 
"loss": 0.4879, + "step": 4045 + }, + { + "epoch": 0.46, + "learning_rate": 1.169636791317299e-05, + "loss": 0.4552, + "step": 4046 + }, + { + "epoch": 0.46, + "learning_rate": 1.1692719793672627e-05, + "loss": 0.4641, + "step": 4047 + }, + { + "epoch": 0.46, + "learning_rate": 1.1689071442231858e-05, + "loss": 0.4615, + "step": 4048 + }, + { + "epoch": 0.46, + "learning_rate": 1.1685422859350592e-05, + "loss": 0.4859, + "step": 4049 + }, + { + "epoch": 0.46, + "learning_rate": 1.1681774045528764e-05, + "loss": 0.4727, + "step": 4050 + }, + { + "epoch": 0.46, + "learning_rate": 1.1678125001266347e-05, + "loss": 0.4682, + "step": 4051 + }, + { + "epoch": 0.46, + "learning_rate": 1.1674475727063337e-05, + "loss": 0.4597, + "step": 4052 + }, + { + "epoch": 0.46, + "learning_rate": 1.1670826223419766e-05, + "loss": 0.4775, + "step": 4053 + }, + { + "epoch": 0.46, + "learning_rate": 1.1667176490835701e-05, + "loss": 0.4741, + "step": 4054 + }, + { + "epoch": 0.46, + "learning_rate": 1.1663526529811235e-05, + "loss": 0.4678, + "step": 4055 + }, + { + "epoch": 0.46, + "learning_rate": 1.1659876340846494e-05, + "loss": 0.4658, + "step": 4056 + }, + { + "epoch": 0.46, + "learning_rate": 1.165622592444164e-05, + "loss": 0.4533, + "step": 4057 + }, + { + "epoch": 0.46, + "learning_rate": 1.165257528109685e-05, + "loss": 0.4822, + "step": 4058 + }, + { + "epoch": 0.46, + "learning_rate": 1.1648924411312354e-05, + "loss": 0.4722, + "step": 4059 + }, + { + "epoch": 0.46, + "learning_rate": 1.1645273315588399e-05, + "loss": 0.4607, + "step": 4060 + }, + { + "epoch": 0.46, + "learning_rate": 1.164162199442527e-05, + "loss": 0.4798, + "step": 4061 + }, + { + "epoch": 0.46, + "learning_rate": 1.1637970448323274e-05, + "loss": 0.4758, + "step": 4062 + }, + { + "epoch": 0.46, + "learning_rate": 1.1634318677782755e-05, + "loss": 0.4617, + "step": 4063 + }, + { + "epoch": 0.46, + "learning_rate": 1.163066668330409e-05, + "loss": 0.467, + "step": 4064 + }, + { + "epoch": 0.46, + 
"learning_rate": 1.1627014465387685e-05, + "loss": 0.4489, + "step": 4065 + }, + { + "epoch": 0.46, + "learning_rate": 1.1623362024533974e-05, + "loss": 0.4678, + "step": 4066 + }, + { + "epoch": 0.46, + "learning_rate": 1.161970936124342e-05, + "loss": 0.4925, + "step": 4067 + }, + { + "epoch": 0.46, + "learning_rate": 1.1616056476016521e-05, + "loss": 0.4605, + "step": 4068 + }, + { + "epoch": 0.47, + "learning_rate": 1.1612403369353806e-05, + "loss": 0.4645, + "step": 4069 + }, + { + "epoch": 0.47, + "learning_rate": 1.1608750041755832e-05, + "loss": 0.4682, + "step": 4070 + }, + { + "epoch": 0.47, + "learning_rate": 1.1605096493723183e-05, + "loss": 0.4891, + "step": 4071 + }, + { + "epoch": 0.47, + "learning_rate": 1.1601442725756478e-05, + "loss": 0.4667, + "step": 4072 + }, + { + "epoch": 0.47, + "learning_rate": 1.1597788738356365e-05, + "loss": 0.4829, + "step": 4073 + }, + { + "epoch": 0.47, + "learning_rate": 1.1594134532023525e-05, + "loss": 0.4345, + "step": 4074 + }, + { + "epoch": 0.47, + "learning_rate": 1.1590480107258663e-05, + "loss": 0.4448, + "step": 4075 + }, + { + "epoch": 0.47, + "learning_rate": 1.1586825464562515e-05, + "loss": 0.4806, + "step": 4076 + }, + { + "epoch": 0.47, + "learning_rate": 1.1583170604435851e-05, + "loss": 0.4897, + "step": 4077 + }, + { + "epoch": 0.47, + "learning_rate": 1.1579515527379468e-05, + "loss": 0.4704, + "step": 4078 + }, + { + "epoch": 0.47, + "learning_rate": 1.1575860233894195e-05, + "loss": 0.4678, + "step": 4079 + }, + { + "epoch": 0.47, + "learning_rate": 1.1572204724480887e-05, + "loss": 0.4891, + "step": 4080 + }, + { + "epoch": 0.47, + "learning_rate": 1.1568548999640428e-05, + "loss": 0.4619, + "step": 4081 + }, + { + "epoch": 0.47, + "learning_rate": 1.1564893059873734e-05, + "loss": 0.4855, + "step": 4082 + }, + { + "epoch": 0.47, + "learning_rate": 1.1561236905681761e-05, + "loss": 0.4489, + "step": 4083 + }, + { + "epoch": 0.47, + "learning_rate": 1.155758053756547e-05, + "loss": 0.4747, + 
"step": 4084 + }, + { + "epoch": 0.47, + "learning_rate": 1.1553923956025871e-05, + "loss": 0.4877, + "step": 4085 + }, + { + "epoch": 0.47, + "learning_rate": 1.1550267161563998e-05, + "loss": 0.4616, + "step": 4086 + }, + { + "epoch": 0.47, + "learning_rate": 1.1546610154680908e-05, + "loss": 0.4657, + "step": 4087 + }, + { + "epoch": 0.47, + "learning_rate": 1.1542952935877703e-05, + "loss": 0.4554, + "step": 4088 + }, + { + "epoch": 0.47, + "learning_rate": 1.1539295505655494e-05, + "loss": 0.4621, + "step": 4089 + }, + { + "epoch": 0.47, + "learning_rate": 1.1535637864515436e-05, + "loss": 0.4696, + "step": 4090 + }, + { + "epoch": 0.47, + "learning_rate": 1.1531980012958706e-05, + "loss": 0.4821, + "step": 4091 + }, + { + "epoch": 0.47, + "learning_rate": 1.152832195148651e-05, + "loss": 0.4452, + "step": 4092 + }, + { + "epoch": 0.47, + "learning_rate": 1.1524663680600083e-05, + "loss": 0.5015, + "step": 4093 + }, + { + "epoch": 0.47, + "learning_rate": 1.1521005200800694e-05, + "loss": 0.4502, + "step": 4094 + }, + { + "epoch": 0.47, + "learning_rate": 1.1517346512589635e-05, + "loss": 0.4993, + "step": 4095 + }, + { + "epoch": 0.47, + "learning_rate": 1.1513687616468225e-05, + "loss": 0.4516, + "step": 4096 + }, + { + "epoch": 0.47, + "learning_rate": 1.1510028512937818e-05, + "loss": 0.4495, + "step": 4097 + }, + { + "epoch": 0.47, + "learning_rate": 1.1506369202499791e-05, + "loss": 0.4643, + "step": 4098 + }, + { + "epoch": 0.47, + "learning_rate": 1.1502709685655553e-05, + "loss": 0.474, + "step": 4099 + }, + { + "epoch": 0.47, + "learning_rate": 1.149904996290654e-05, + "loss": 0.448, + "step": 4100 + }, + { + "epoch": 0.47, + "learning_rate": 1.149539003475421e-05, + "loss": 0.4667, + "step": 4101 + }, + { + "epoch": 0.47, + "learning_rate": 1.1491729901700062e-05, + "loss": 0.4557, + "step": 4102 + }, + { + "epoch": 0.47, + "learning_rate": 1.148806956424561e-05, + "loss": 0.4826, + "step": 4103 + }, + { + "epoch": 0.47, + "learning_rate": 
1.1484409022892406e-05, + "loss": 0.4517, + "step": 4104 + }, + { + "epoch": 0.47, + "learning_rate": 1.1480748278142025e-05, + "loss": 0.4756, + "step": 4105 + }, + { + "epoch": 0.47, + "learning_rate": 1.1477087330496071e-05, + "loss": 0.4744, + "step": 4106 + }, + { + "epoch": 0.47, + "learning_rate": 1.1473426180456174e-05, + "loss": 0.4785, + "step": 4107 + }, + { + "epoch": 0.47, + "learning_rate": 1.1469764828523995e-05, + "loss": 0.4586, + "step": 4108 + }, + { + "epoch": 0.47, + "learning_rate": 1.146610327520122e-05, + "loss": 0.4743, + "step": 4109 + }, + { + "epoch": 0.47, + "learning_rate": 1.1462441520989565e-05, + "loss": 0.4627, + "step": 4110 + }, + { + "epoch": 0.47, + "learning_rate": 1.1458779566390768e-05, + "loss": 0.4851, + "step": 4111 + }, + { + "epoch": 0.47, + "learning_rate": 1.1455117411906604e-05, + "loss": 0.4588, + "step": 4112 + }, + { + "epoch": 0.47, + "learning_rate": 1.1451455058038864e-05, + "loss": 0.47, + "step": 4113 + }, + { + "epoch": 0.47, + "learning_rate": 1.1447792505289384e-05, + "loss": 0.4498, + "step": 4114 + }, + { + "epoch": 0.47, + "learning_rate": 1.1444129754159998e-05, + "loss": 0.4372, + "step": 4115 + }, + { + "epoch": 0.47, + "learning_rate": 1.1440466805152596e-05, + "loss": 0.4844, + "step": 4116 + }, + { + "epoch": 0.47, + "learning_rate": 1.1436803658769082e-05, + "loss": 0.4632, + "step": 4117 + }, + { + "epoch": 0.47, + "learning_rate": 1.1433140315511392e-05, + "loss": 0.4676, + "step": 4118 + }, + { + "epoch": 0.47, + "learning_rate": 1.142947677588148e-05, + "loss": 0.4622, + "step": 4119 + }, + { + "epoch": 0.47, + "learning_rate": 1.1425813040381332e-05, + "loss": 0.4423, + "step": 4120 + }, + { + "epoch": 0.47, + "learning_rate": 1.1422149109512967e-05, + "loss": 0.4961, + "step": 4121 + }, + { + "epoch": 0.47, + "learning_rate": 1.1418484983778421e-05, + "loss": 0.4723, + "step": 4122 + }, + { + "epoch": 0.47, + "learning_rate": 1.1414820663679768e-05, + "loss": 0.486, + "step": 4123 + }, + { 
+ "epoch": 0.47, + "learning_rate": 1.1411156149719094e-05, + "loss": 0.4641, + "step": 4124 + }, + { + "epoch": 0.47, + "learning_rate": 1.1407491442398518e-05, + "loss": 0.4745, + "step": 4125 + }, + { + "epoch": 0.47, + "learning_rate": 1.1403826542220193e-05, + "loss": 0.4447, + "step": 4126 + }, + { + "epoch": 0.47, + "learning_rate": 1.1400161449686293e-05, + "loss": 0.4544, + "step": 4127 + }, + { + "epoch": 0.47, + "learning_rate": 1.139649616529901e-05, + "loss": 0.4771, + "step": 4128 + }, + { + "epoch": 0.47, + "learning_rate": 1.1392830689560577e-05, + "loss": 0.4845, + "step": 4129 + }, + { + "epoch": 0.47, + "learning_rate": 1.1389165022973238e-05, + "loss": 0.4613, + "step": 4130 + }, + { + "epoch": 0.47, + "learning_rate": 1.1385499166039281e-05, + "loss": 0.4566, + "step": 4131 + }, + { + "epoch": 0.47, + "learning_rate": 1.1381833119261003e-05, + "loss": 0.4597, + "step": 4132 + }, + { + "epoch": 0.47, + "learning_rate": 1.1378166883140738e-05, + "loss": 0.4653, + "step": 4133 + }, + { + "epoch": 0.47, + "learning_rate": 1.1374500458180839e-05, + "loss": 0.4702, + "step": 4134 + }, + { + "epoch": 0.47, + "learning_rate": 1.137083384488369e-05, + "loss": 0.4467, + "step": 4135 + }, + { + "epoch": 0.47, + "learning_rate": 1.1367167043751701e-05, + "loss": 0.4668, + "step": 4136 + }, + { + "epoch": 0.47, + "learning_rate": 1.1363500055287301e-05, + "loss": 0.4543, + "step": 4137 + }, + { + "epoch": 0.47, + "learning_rate": 1.1359832879992956e-05, + "loss": 0.4653, + "step": 4138 + }, + { + "epoch": 0.47, + "learning_rate": 1.1356165518371142e-05, + "loss": 0.4749, + "step": 4139 + }, + { + "epoch": 0.47, + "learning_rate": 1.1352497970924376e-05, + "loss": 0.4748, + "step": 4140 + }, + { + "epoch": 0.47, + "learning_rate": 1.1348830238155191e-05, + "loss": 0.4532, + "step": 4141 + }, + { + "epoch": 0.47, + "learning_rate": 1.134516232056615e-05, + "loss": 0.4458, + "step": 4142 + }, + { + "epoch": 0.47, + "learning_rate": 1.134149421865984e-05, + 
"loss": 0.4781, + "step": 4143 + }, + { + "epoch": 0.47, + "learning_rate": 1.1337825932938866e-05, + "loss": 0.4488, + "step": 4144 + }, + { + "epoch": 0.47, + "learning_rate": 1.1334157463905876e-05, + "loss": 0.4605, + "step": 4145 + }, + { + "epoch": 0.47, + "learning_rate": 1.1330488812063526e-05, + "loss": 0.4605, + "step": 4146 + }, + { + "epoch": 0.47, + "learning_rate": 1.1326819977914503e-05, + "loss": 0.4966, + "step": 4147 + }, + { + "epoch": 0.47, + "learning_rate": 1.132315096196152e-05, + "loss": 0.4677, + "step": 4148 + }, + { + "epoch": 0.47, + "learning_rate": 1.1319481764707313e-05, + "loss": 0.4461, + "step": 4149 + }, + { + "epoch": 0.47, + "learning_rate": 1.131581238665465e-05, + "loss": 0.4615, + "step": 4150 + }, + { + "epoch": 0.47, + "learning_rate": 1.1312142828306309e-05, + "loss": 0.4617, + "step": 4151 + }, + { + "epoch": 0.47, + "learning_rate": 1.1308473090165107e-05, + "loss": 0.4534, + "step": 4152 + }, + { + "epoch": 0.47, + "learning_rate": 1.1304803172733878e-05, + "loss": 0.4575, + "step": 4153 + }, + { + "epoch": 0.47, + "learning_rate": 1.1301133076515482e-05, + "loss": 0.463, + "step": 4154 + }, + { + "epoch": 0.47, + "learning_rate": 1.1297462802012806e-05, + "loss": 0.4876, + "step": 4155 + }, + { + "epoch": 0.47, + "learning_rate": 1.129379234972876e-05, + "loss": 0.4654, + "step": 4156 + }, + { + "epoch": 0.48, + "learning_rate": 1.1290121720166277e-05, + "loss": 0.4713, + "step": 4157 + }, + { + "epoch": 0.48, + "learning_rate": 1.1286450913828313e-05, + "loss": 0.4479, + "step": 4158 + }, + { + "epoch": 0.48, + "learning_rate": 1.1282779931217852e-05, + "loss": 0.4562, + "step": 4159 + }, + { + "epoch": 0.48, + "learning_rate": 1.1279108772837901e-05, + "loss": 0.4596, + "step": 4160 + }, + { + "epoch": 0.48, + "learning_rate": 1.1275437439191493e-05, + "loss": 0.469, + "step": 4161 + }, + { + "epoch": 0.48, + "learning_rate": 1.1271765930781677e-05, + "loss": 0.4607, + "step": 4162 + }, + { + "epoch": 0.48, + 
"learning_rate": 1.1268094248111536e-05, + "loss": 0.471, + "step": 4163 + }, + { + "epoch": 0.48, + "learning_rate": 1.1264422391684171e-05, + "loss": 0.4497, + "step": 4164 + }, + { + "epoch": 0.48, + "learning_rate": 1.126075036200271e-05, + "loss": 0.4635, + "step": 4165 + }, + { + "epoch": 0.48, + "learning_rate": 1.1257078159570303e-05, + "loss": 0.4637, + "step": 4166 + }, + { + "epoch": 0.48, + "learning_rate": 1.125340578489012e-05, + "loss": 0.4544, + "step": 4167 + }, + { + "epoch": 0.48, + "learning_rate": 1.1249733238465359e-05, + "loss": 0.4707, + "step": 4168 + }, + { + "epoch": 0.48, + "learning_rate": 1.1246060520799244e-05, + "loss": 0.4489, + "step": 4169 + }, + { + "epoch": 0.48, + "learning_rate": 1.1242387632395019e-05, + "loss": 0.4794, + "step": 4170 + }, + { + "epoch": 0.48, + "learning_rate": 1.1238714573755954e-05, + "loss": 0.4467, + "step": 4171 + }, + { + "epoch": 0.48, + "learning_rate": 1.1235041345385328e-05, + "loss": 0.4618, + "step": 4172 + }, + { + "epoch": 0.48, + "learning_rate": 1.123136794778647e-05, + "loss": 0.4623, + "step": 4173 + }, + { + "epoch": 0.48, + "learning_rate": 1.122769438146271e-05, + "loss": 0.4752, + "step": 4174 + }, + { + "epoch": 0.48, + "learning_rate": 1.1224020646917413e-05, + "loss": 0.4514, + "step": 4175 + }, + { + "epoch": 0.48, + "learning_rate": 1.1220346744653956e-05, + "loss": 0.4628, + "step": 4176 + }, + { + "epoch": 0.48, + "learning_rate": 1.1216672675175748e-05, + "loss": 0.4627, + "step": 4177 + }, + { + "epoch": 0.48, + "learning_rate": 1.1212998438986223e-05, + "loss": 0.4575, + "step": 4178 + }, + { + "epoch": 0.48, + "learning_rate": 1.1209324036588828e-05, + "loss": 0.4789, + "step": 4179 + }, + { + "epoch": 0.48, + "learning_rate": 1.1205649468487042e-05, + "loss": 0.4914, + "step": 4180 + }, + { + "epoch": 0.48, + "learning_rate": 1.1201974735184362e-05, + "loss": 0.4602, + "step": 4181 + }, + { + "epoch": 0.48, + "learning_rate": 1.1198299837184305e-05, + "loss": 0.4697, + 
"step": 4182 + }, + { + "epoch": 0.48, + "learning_rate": 1.1194624774990418e-05, + "loss": 0.4737, + "step": 4183 + }, + { + "epoch": 0.48, + "learning_rate": 1.119094954910627e-05, + "loss": 0.4548, + "step": 4184 + }, + { + "epoch": 0.48, + "learning_rate": 1.118727416003544e-05, + "loss": 0.4352, + "step": 4185 + }, + { + "epoch": 0.48, + "learning_rate": 1.1183598608281543e-05, + "loss": 0.4824, + "step": 4186 + }, + { + "epoch": 0.48, + "learning_rate": 1.1179922894348207e-05, + "loss": 0.4557, + "step": 4187 + }, + { + "epoch": 0.48, + "learning_rate": 1.11762470187391e-05, + "loss": 0.4896, + "step": 4188 + }, + { + "epoch": 0.48, + "learning_rate": 1.1172570981957886e-05, + "loss": 0.4562, + "step": 4189 + }, + { + "epoch": 0.48, + "learning_rate": 1.1168894784508268e-05, + "loss": 0.4579, + "step": 4190 + }, + { + "epoch": 0.48, + "learning_rate": 1.1165218426893969e-05, + "loss": 0.4793, + "step": 4191 + }, + { + "epoch": 0.48, + "learning_rate": 1.1161541909618728e-05, + "loss": 0.4601, + "step": 4192 + }, + { + "epoch": 0.48, + "learning_rate": 1.1157865233186315e-05, + "loss": 0.4474, + "step": 4193 + }, + { + "epoch": 0.48, + "learning_rate": 1.1154188398100516e-05, + "loss": 0.4726, + "step": 4194 + }, + { + "epoch": 0.48, + "learning_rate": 1.1150511404865136e-05, + "loss": 0.4498, + "step": 4195 + }, + { + "epoch": 0.48, + "learning_rate": 1.1146834253984008e-05, + "loss": 0.473, + "step": 4196 + }, + { + "epoch": 0.48, + "learning_rate": 1.114315694596098e-05, + "loss": 0.4606, + "step": 4197 + }, + { + "epoch": 0.48, + "learning_rate": 1.1139479481299928e-05, + "loss": 0.4608, + "step": 4198 + }, + { + "epoch": 0.48, + "learning_rate": 1.113580186050475e-05, + "loss": 0.4625, + "step": 4199 + }, + { + "epoch": 0.48, + "learning_rate": 1.1132124084079359e-05, + "loss": 0.4688, + "step": 4200 + }, + { + "epoch": 0.48, + "learning_rate": 1.112844615252769e-05, + "loss": 0.4852, + "step": 4201 + }, + { + "epoch": 0.48, + "learning_rate": 
1.1124768066353705e-05, + "loss": 0.4653, + "step": 4202 + }, + { + "epoch": 0.48, + "learning_rate": 1.1121089826061385e-05, + "loss": 0.4611, + "step": 4203 + }, + { + "epoch": 0.48, + "learning_rate": 1.1117411432154725e-05, + "loss": 0.47, + "step": 4204 + }, + { + "epoch": 0.48, + "learning_rate": 1.1113732885137755e-05, + "loss": 0.4632, + "step": 4205 + }, + { + "epoch": 0.48, + "learning_rate": 1.1110054185514513e-05, + "loss": 0.4561, + "step": 4206 + }, + { + "epoch": 0.48, + "learning_rate": 1.1106375333789065e-05, + "loss": 0.4782, + "step": 4207 + }, + { + "epoch": 0.48, + "learning_rate": 1.1102696330465495e-05, + "loss": 0.4728, + "step": 4208 + }, + { + "epoch": 0.48, + "learning_rate": 1.1099017176047909e-05, + "loss": 0.4594, + "step": 4209 + }, + { + "epoch": 0.48, + "learning_rate": 1.109533787104043e-05, + "loss": 0.4676, + "step": 4210 + }, + { + "epoch": 0.48, + "learning_rate": 1.109165841594721e-05, + "loss": 0.46, + "step": 4211 + }, + { + "epoch": 0.48, + "learning_rate": 1.1087978811272417e-05, + "loss": 0.4804, + "step": 4212 + }, + { + "epoch": 0.48, + "learning_rate": 1.1084299057520234e-05, + "loss": 0.4493, + "step": 4213 + }, + { + "epoch": 0.48, + "learning_rate": 1.1080619155194873e-05, + "loss": 0.4893, + "step": 4214 + }, + { + "epoch": 0.48, + "learning_rate": 1.107693910480056e-05, + "loss": 0.452, + "step": 4215 + }, + { + "epoch": 0.48, + "learning_rate": 1.1073258906841547e-05, + "loss": 0.4683, + "step": 4216 + }, + { + "epoch": 0.48, + "learning_rate": 1.10695785618221e-05, + "loss": 0.4677, + "step": 4217 + }, + { + "epoch": 0.48, + "learning_rate": 1.1065898070246512e-05, + "loss": 0.4687, + "step": 4218 + }, + { + "epoch": 0.48, + "learning_rate": 1.1062217432619095e-05, + "loss": 0.4808, + "step": 4219 + }, + { + "epoch": 0.48, + "learning_rate": 1.1058536649444167e-05, + "loss": 0.4601, + "step": 4220 + }, + { + "epoch": 0.48, + "learning_rate": 1.105485572122609e-05, + "loss": 0.4564, + "step": 4221 + }, + { + 
"epoch": 0.48, + "learning_rate": 1.1051174648469225e-05, + "loss": 0.4677, + "step": 4222 + }, + { + "epoch": 0.48, + "learning_rate": 1.104749343167797e-05, + "loss": 0.4712, + "step": 4223 + }, + { + "epoch": 0.48, + "learning_rate": 1.104381207135672e-05, + "loss": 0.475, + "step": 4224 + }, + { + "epoch": 0.48, + "learning_rate": 1.104013056800992e-05, + "loss": 0.4599, + "step": 4225 + }, + { + "epoch": 0.48, + "learning_rate": 1.1036448922142004e-05, + "loss": 0.4706, + "step": 4226 + }, + { + "epoch": 0.48, + "learning_rate": 1.1032767134257451e-05, + "loss": 0.4714, + "step": 4227 + }, + { + "epoch": 0.48, + "learning_rate": 1.102908520486074e-05, + "loss": 0.4596, + "step": 4228 + }, + { + "epoch": 0.48, + "learning_rate": 1.1025403134456378e-05, + "loss": 0.4529, + "step": 4229 + }, + { + "epoch": 0.48, + "learning_rate": 1.1021720923548897e-05, + "loss": 0.4729, + "step": 4230 + }, + { + "epoch": 0.48, + "learning_rate": 1.1018038572642837e-05, + "loss": 0.4506, + "step": 4231 + }, + { + "epoch": 0.48, + "learning_rate": 1.1014356082242766e-05, + "loss": 0.4713, + "step": 4232 + }, + { + "epoch": 0.48, + "learning_rate": 1.1010673452853262e-05, + "loss": 0.4613, + "step": 4233 + }, + { + "epoch": 0.48, + "learning_rate": 1.1006990684978928e-05, + "loss": 0.4614, + "step": 4234 + }, + { + "epoch": 0.48, + "learning_rate": 1.1003307779124392e-05, + "loss": 0.4434, + "step": 4235 + }, + { + "epoch": 0.48, + "learning_rate": 1.0999624735794292e-05, + "loss": 0.4647, + "step": 4236 + }, + { + "epoch": 0.48, + "learning_rate": 1.0995941555493283e-05, + "loss": 0.4507, + "step": 4237 + }, + { + "epoch": 0.48, + "learning_rate": 1.0992258238726046e-05, + "loss": 0.4711, + "step": 4238 + }, + { + "epoch": 0.48, + "learning_rate": 1.0988574785997275e-05, + "loss": 0.4512, + "step": 4239 + }, + { + "epoch": 0.48, + "learning_rate": 1.0984891197811686e-05, + "loss": 0.4598, + "step": 4240 + }, + { + "epoch": 0.48, + "learning_rate": 1.0981207474674021e-05, + 
"loss": 0.4618, + "step": 4241 + }, + { + "epoch": 0.48, + "learning_rate": 1.0977523617089019e-05, + "loss": 0.473, + "step": 4242 + }, + { + "epoch": 0.48, + "learning_rate": 1.097383962556146e-05, + "loss": 0.4364, + "step": 4243 + }, + { + "epoch": 0.49, + "learning_rate": 1.0970155500596127e-05, + "loss": 0.4559, + "step": 4244 + }, + { + "epoch": 0.49, + "learning_rate": 1.0966471242697834e-05, + "loss": 0.4674, + "step": 4245 + }, + { + "epoch": 0.49, + "learning_rate": 1.0962786852371402e-05, + "loss": 0.4773, + "step": 4246 + }, + { + "epoch": 0.49, + "learning_rate": 1.0959102330121676e-05, + "loss": 0.4772, + "step": 4247 + }, + { + "epoch": 0.49, + "learning_rate": 1.0955417676453517e-05, + "loss": 0.4668, + "step": 4248 + }, + { + "epoch": 0.49, + "learning_rate": 1.0951732891871807e-05, + "loss": 0.4536, + "step": 4249 + }, + { + "epoch": 0.49, + "learning_rate": 1.0948047976881439e-05, + "loss": 0.4819, + "step": 4250 + }, + { + "epoch": 0.49, + "learning_rate": 1.0944362931987336e-05, + "loss": 0.4545, + "step": 4251 + }, + { + "epoch": 0.49, + "learning_rate": 1.0940677757694425e-05, + "loss": 0.4557, + "step": 4252 + }, + { + "epoch": 0.49, + "learning_rate": 1.093699245450766e-05, + "loss": 0.4531, + "step": 4253 + }, + { + "epoch": 0.49, + "learning_rate": 1.093330702293201e-05, + "loss": 0.4528, + "step": 4254 + }, + { + "epoch": 0.49, + "learning_rate": 1.092962146347246e-05, + "loss": 0.4605, + "step": 4255 + }, + { + "epoch": 0.49, + "learning_rate": 1.0925935776634014e-05, + "loss": 0.4651, + "step": 4256 + }, + { + "epoch": 0.49, + "learning_rate": 1.0922249962921694e-05, + "loss": 0.4555, + "step": 4257 + }, + { + "epoch": 0.49, + "learning_rate": 1.0918564022840539e-05, + "loss": 0.4513, + "step": 4258 + }, + { + "epoch": 0.49, + "learning_rate": 1.0914877956895604e-05, + "loss": 0.4656, + "step": 4259 + }, + { + "epoch": 0.49, + "learning_rate": 1.0911191765591966e-05, + "loss": 0.4609, + "step": 4260 + }, + { + "epoch": 0.49, + 
"learning_rate": 1.090750544943471e-05, + "loss": 0.4698, + "step": 4261 + }, + { + "epoch": 0.49, + "learning_rate": 1.0903819008928948e-05, + "loss": 0.4613, + "step": 4262 + }, + { + "epoch": 0.49, + "learning_rate": 1.0900132444579801e-05, + "loss": 0.4463, + "step": 4263 + }, + { + "epoch": 0.49, + "learning_rate": 1.0896445756892415e-05, + "loss": 0.4858, + "step": 4264 + }, + { + "epoch": 0.49, + "learning_rate": 1.0892758946371943e-05, + "loss": 0.4598, + "step": 4265 + }, + { + "epoch": 0.49, + "learning_rate": 1.0889072013523568e-05, + "loss": 0.4422, + "step": 4266 + }, + { + "epoch": 0.49, + "learning_rate": 1.0885384958852474e-05, + "loss": 0.4569, + "step": 4267 + }, + { + "epoch": 0.49, + "learning_rate": 1.0881697782863874e-05, + "loss": 0.4665, + "step": 4268 + }, + { + "epoch": 0.49, + "learning_rate": 1.0878010486062993e-05, + "loss": 0.4516, + "step": 4269 + }, + { + "epoch": 0.49, + "learning_rate": 1.0874323068955073e-05, + "loss": 0.4574, + "step": 4270 + }, + { + "epoch": 0.49, + "learning_rate": 1.0870635532045375e-05, + "loss": 0.4604, + "step": 4271 + }, + { + "epoch": 0.49, + "learning_rate": 1.0866947875839167e-05, + "loss": 0.4557, + "step": 4272 + }, + { + "epoch": 0.49, + "learning_rate": 1.0863260100841744e-05, + "loss": 0.4617, + "step": 4273 + }, + { + "epoch": 0.49, + "learning_rate": 1.0859572207558416e-05, + "loss": 0.4668, + "step": 4274 + }, + { + "epoch": 0.49, + "learning_rate": 1.0855884196494507e-05, + "loss": 0.451, + "step": 4275 + }, + { + "epoch": 0.49, + "learning_rate": 1.0852196068155352e-05, + "loss": 0.4583, + "step": 4276 + }, + { + "epoch": 0.49, + "learning_rate": 1.0848507823046306e-05, + "loss": 0.4804, + "step": 4277 + }, + { + "epoch": 0.49, + "learning_rate": 1.0844819461672748e-05, + "loss": 0.4617, + "step": 4278 + }, + { + "epoch": 0.49, + "learning_rate": 1.0841130984540063e-05, + "loss": 0.4703, + "step": 4279 + }, + { + "epoch": 0.49, + "learning_rate": 1.0837442392153651e-05, + "loss": 0.4611, + 
"step": 4280 + }, + { + "epoch": 0.49, + "learning_rate": 1.0833753685018935e-05, + "loss": 0.451, + "step": 4281 + }, + { + "epoch": 0.49, + "learning_rate": 1.0830064863641352e-05, + "loss": 0.4829, + "step": 4282 + }, + { + "epoch": 0.49, + "learning_rate": 1.082637592852635e-05, + "loss": 0.459, + "step": 4283 + }, + { + "epoch": 0.49, + "learning_rate": 1.0822686880179395e-05, + "loss": 0.468, + "step": 4284 + }, + { + "epoch": 0.49, + "learning_rate": 1.081899771910597e-05, + "loss": 0.4545, + "step": 4285 + }, + { + "epoch": 0.49, + "learning_rate": 1.081530844581157e-05, + "loss": 0.4563, + "step": 4286 + }, + { + "epoch": 0.49, + "learning_rate": 1.0811619060801713e-05, + "loss": 0.4765, + "step": 4287 + }, + { + "epoch": 0.49, + "learning_rate": 1.0807929564581925e-05, + "loss": 0.4501, + "step": 4288 + }, + { + "epoch": 0.49, + "learning_rate": 1.080423995765775e-05, + "loss": 0.4503, + "step": 4289 + }, + { + "epoch": 0.49, + "learning_rate": 1.0800550240534742e-05, + "loss": 0.4785, + "step": 4290 + }, + { + "epoch": 0.49, + "learning_rate": 1.0796860413718475e-05, + "loss": 0.4585, + "step": 4291 + }, + { + "epoch": 0.49, + "learning_rate": 1.0793170477714546e-05, + "loss": 0.4625, + "step": 4292 + }, + { + "epoch": 0.49, + "learning_rate": 1.0789480433028551e-05, + "loss": 0.4596, + "step": 4293 + }, + { + "epoch": 0.49, + "learning_rate": 1.0785790280166114e-05, + "loss": 0.4536, + "step": 4294 + }, + { + "epoch": 0.49, + "learning_rate": 1.078210001963286e-05, + "loss": 0.4592, + "step": 4295 + }, + { + "epoch": 0.49, + "learning_rate": 1.0778409651934442e-05, + "loss": 0.4829, + "step": 4296 + }, + { + "epoch": 0.49, + "learning_rate": 1.0774719177576526e-05, + "loss": 0.4804, + "step": 4297 + }, + { + "epoch": 0.49, + "learning_rate": 1.0771028597064785e-05, + "loss": 0.4635, + "step": 4298 + }, + { + "epoch": 0.49, + "learning_rate": 1.076733791090491e-05, + "loss": 0.458, + "step": 4299 + }, + { + "epoch": 0.49, + "learning_rate": 
1.0763647119602614e-05, + "loss": 0.4521, + "step": 4300 + }, + { + "epoch": 0.49, + "learning_rate": 1.0759956223663608e-05, + "loss": 0.4635, + "step": 4301 + }, + { + "epoch": 0.49, + "learning_rate": 1.0756265223593637e-05, + "loss": 0.4627, + "step": 4302 + }, + { + "epoch": 0.49, + "learning_rate": 1.0752574119898445e-05, + "loss": 0.4643, + "step": 4303 + }, + { + "epoch": 0.49, + "learning_rate": 1.0748882913083794e-05, + "loss": 0.4849, + "step": 4304 + }, + { + "epoch": 0.49, + "learning_rate": 1.0745191603655466e-05, + "loss": 0.4653, + "step": 4305 + }, + { + "epoch": 0.49, + "learning_rate": 1.074150019211925e-05, + "loss": 0.48, + "step": 4306 + }, + { + "epoch": 0.49, + "learning_rate": 1.0737808678980954e-05, + "loss": 0.4599, + "step": 4307 + }, + { + "epoch": 0.49, + "learning_rate": 1.0734117064746395e-05, + "loss": 0.4787, + "step": 4308 + }, + { + "epoch": 0.49, + "learning_rate": 1.073042534992141e-05, + "loss": 0.4467, + "step": 4309 + }, + { + "epoch": 0.49, + "learning_rate": 1.0726733535011844e-05, + "loss": 0.4669, + "step": 4310 + }, + { + "epoch": 0.49, + "learning_rate": 1.0723041620523558e-05, + "loss": 0.4679, + "step": 4311 + }, + { + "epoch": 0.49, + "learning_rate": 1.0719349606962426e-05, + "loss": 0.443, + "step": 4312 + }, + { + "epoch": 0.49, + "learning_rate": 1.071565749483434e-05, + "loss": 0.4815, + "step": 4313 + }, + { + "epoch": 0.49, + "learning_rate": 1.0711965284645198e-05, + "loss": 0.4714, + "step": 4314 + }, + { + "epoch": 0.49, + "learning_rate": 1.0708272976900915e-05, + "loss": 0.45, + "step": 4315 + }, + { + "epoch": 0.49, + "learning_rate": 1.0704580572107424e-05, + "loss": 0.4824, + "step": 4316 + }, + { + "epoch": 0.49, + "learning_rate": 1.0700888070770663e-05, + "loss": 0.4578, + "step": 4317 + }, + { + "epoch": 0.49, + "learning_rate": 1.0697195473396587e-05, + "loss": 0.4675, + "step": 4318 + }, + { + "epoch": 0.49, + "learning_rate": 1.0693502780491168e-05, + "loss": 0.4679, + "step": 4319 + }, + { + 
"epoch": 0.49, + "learning_rate": 1.0689809992560382e-05, + "loss": 0.4471, + "step": 4320 + }, + { + "epoch": 0.49, + "learning_rate": 1.0686117110110228e-05, + "loss": 0.4587, + "step": 4321 + }, + { + "epoch": 0.49, + "learning_rate": 1.0682424133646712e-05, + "loss": 0.4886, + "step": 4322 + }, + { + "epoch": 0.49, + "learning_rate": 1.067873106367585e-05, + "loss": 0.4718, + "step": 4323 + }, + { + "epoch": 0.49, + "learning_rate": 1.0675037900703684e-05, + "loss": 0.4528, + "step": 4324 + }, + { + "epoch": 0.49, + "learning_rate": 1.0671344645236253e-05, + "loss": 0.4683, + "step": 4325 + }, + { + "epoch": 0.49, + "learning_rate": 1.0667651297779615e-05, + "loss": 0.473, + "step": 4326 + }, + { + "epoch": 0.49, + "learning_rate": 1.0663957858839843e-05, + "loss": 0.4572, + "step": 4327 + }, + { + "epoch": 0.49, + "learning_rate": 1.0660264328923024e-05, + "loss": 0.4593, + "step": 4328 + }, + { + "epoch": 0.49, + "learning_rate": 1.0656570708535248e-05, + "loss": 0.4559, + "step": 4329 + }, + { + "epoch": 0.49, + "learning_rate": 1.0652876998182626e-05, + "loss": 0.4716, + "step": 4330 + }, + { + "epoch": 0.49, + "learning_rate": 1.064918319837128e-05, + "loss": 0.5014, + "step": 4331 + }, + { + "epoch": 0.5, + "learning_rate": 1.0645489309607346e-05, + "loss": 0.4458, + "step": 4332 + }, + { + "epoch": 0.5, + "learning_rate": 1.064179533239696e-05, + "loss": 0.4546, + "step": 4333 + }, + { + "epoch": 0.5, + "learning_rate": 1.0638101267246283e-05, + "loss": 0.5054, + "step": 4334 + }, + { + "epoch": 0.5, + "learning_rate": 1.0634407114661492e-05, + "loss": 0.4734, + "step": 4335 + }, + { + "epoch": 0.5, + "learning_rate": 1.0630712875148758e-05, + "loss": 0.4572, + "step": 4336 + }, + { + "epoch": 0.5, + "learning_rate": 1.0627018549214284e-05, + "loss": 0.4573, + "step": 4337 + }, + { + "epoch": 0.5, + "learning_rate": 1.062332413736426e-05, + "loss": 0.4595, + "step": 4338 + }, + { + "epoch": 0.5, + "learning_rate": 1.0619629640104921e-05, + "loss": 
0.4774, + "step": 4339 + }, + { + "epoch": 0.5, + "learning_rate": 1.0615935057942485e-05, + "loss": 0.4731, + "step": 4340 + }, + { + "epoch": 0.5, + "learning_rate": 1.0612240391383197e-05, + "loss": 0.4404, + "step": 4341 + }, + { + "epoch": 0.5, + "learning_rate": 1.0608545640933304e-05, + "loss": 0.4807, + "step": 4342 + }, + { + "epoch": 0.5, + "learning_rate": 1.060485080709907e-05, + "loss": 0.4475, + "step": 4343 + }, + { + "epoch": 0.5, + "learning_rate": 1.0601155890386771e-05, + "loss": 0.468, + "step": 4344 + }, + { + "epoch": 0.5, + "learning_rate": 1.05974608913027e-05, + "loss": 0.4684, + "step": 4345 + }, + { + "epoch": 0.5, + "learning_rate": 1.0593765810353142e-05, + "loss": 0.4471, + "step": 4346 + }, + { + "epoch": 0.5, + "learning_rate": 1.0590070648044415e-05, + "loss": 0.465, + "step": 4347 + }, + { + "epoch": 0.5, + "learning_rate": 1.0586375404882832e-05, + "loss": 0.4646, + "step": 4348 + }, + { + "epoch": 0.5, + "learning_rate": 1.0582680081374728e-05, + "loss": 0.4695, + "step": 4349 + }, + { + "epoch": 0.5, + "learning_rate": 1.0578984678026445e-05, + "loss": 0.4684, + "step": 4350 + }, + { + "epoch": 0.5, + "learning_rate": 1.0575289195344334e-05, + "loss": 0.454, + "step": 4351 + }, + { + "epoch": 0.5, + "learning_rate": 1.0571593633834758e-05, + "loss": 0.477, + "step": 4352 + }, + { + "epoch": 0.5, + "learning_rate": 1.0567897994004093e-05, + "loss": 0.4632, + "step": 4353 + }, + { + "epoch": 0.5, + "learning_rate": 1.0564202276358726e-05, + "loss": 0.4516, + "step": 4354 + }, + { + "epoch": 0.5, + "learning_rate": 1.0560506481405048e-05, + "loss": 0.4679, + "step": 4355 + }, + { + "epoch": 0.5, + "learning_rate": 1.0556810609649471e-05, + "loss": 0.4686, + "step": 4356 + }, + { + "epoch": 0.5, + "learning_rate": 1.0553114661598406e-05, + "loss": 0.4695, + "step": 4357 + }, + { + "epoch": 0.5, + "learning_rate": 1.0549418637758284e-05, + "loss": 0.4569, + "step": 4358 + }, + { + "epoch": 0.5, + "learning_rate": 
1.0545722538635544e-05, + "loss": 0.4721, + "step": 4359 + }, + { + "epoch": 0.5, + "learning_rate": 1.054202636473663e-05, + "loss": 0.4599, + "step": 4360 + }, + { + "epoch": 0.5, + "learning_rate": 1.0538330116568006e-05, + "loss": 0.4584, + "step": 4361 + }, + { + "epoch": 0.5, + "learning_rate": 1.0534633794636134e-05, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 0.5, + "learning_rate": 1.0530937399447496e-05, + "loss": 0.4706, + "step": 4363 + }, + { + "epoch": 0.5, + "learning_rate": 1.0527240931508582e-05, + "loss": 0.4607, + "step": 4364 + }, + { + "epoch": 0.5, + "learning_rate": 1.0523544391325888e-05, + "loss": 0.4766, + "step": 4365 + }, + { + "epoch": 0.5, + "learning_rate": 1.0519847779405926e-05, + "loss": 0.4552, + "step": 4366 + }, + { + "epoch": 0.5, + "learning_rate": 1.051615109625521e-05, + "loss": 0.4917, + "step": 4367 + }, + { + "epoch": 0.5, + "learning_rate": 1.0512454342380269e-05, + "loss": 0.4614, + "step": 4368 + }, + { + "epoch": 0.5, + "learning_rate": 1.0508757518287642e-05, + "loss": 0.453, + "step": 4369 + }, + { + "epoch": 0.5, + "learning_rate": 1.0505060624483878e-05, + "loss": 0.4705, + "step": 4370 + }, + { + "epoch": 0.5, + "learning_rate": 1.0501363661475533e-05, + "loss": 0.4748, + "step": 4371 + }, + { + "epoch": 0.5, + "learning_rate": 1.0497666629769172e-05, + "loss": 0.4521, + "step": 4372 + }, + { + "epoch": 0.5, + "learning_rate": 1.049396952987137e-05, + "loss": 0.463, + "step": 4373 + }, + { + "epoch": 0.5, + "learning_rate": 1.0490272362288716e-05, + "loss": 0.4574, + "step": 4374 + }, + { + "epoch": 0.5, + "learning_rate": 1.0486575127527802e-05, + "loss": 0.5077, + "step": 4375 + }, + { + "epoch": 0.5, + "learning_rate": 1.0482877826095233e-05, + "loss": 0.4699, + "step": 4376 + }, + { + "epoch": 0.5, + "learning_rate": 1.047918045849762e-05, + "loss": 0.4626, + "step": 4377 + }, + { + "epoch": 0.5, + "learning_rate": 1.0475483025241587e-05, + "loss": 0.4677, + "step": 4378 + }, + { + "epoch": 0.5, + 
"learning_rate": 1.0471785526833762e-05, + "loss": 0.4496, + "step": 4379 + }, + { + "epoch": 0.5, + "learning_rate": 1.046808796378079e-05, + "loss": 0.4818, + "step": 4380 + }, + { + "epoch": 0.5, + "learning_rate": 1.0464390336589311e-05, + "loss": 0.4648, + "step": 4381 + }, + { + "epoch": 0.5, + "learning_rate": 1.046069264576599e-05, + "loss": 0.4639, + "step": 4382 + }, + { + "epoch": 0.5, + "learning_rate": 1.0456994891817492e-05, + "loss": 0.501, + "step": 4383 + }, + { + "epoch": 0.5, + "learning_rate": 1.045329707525049e-05, + "loss": 0.4538, + "step": 4384 + }, + { + "epoch": 0.5, + "learning_rate": 1.0449599196571671e-05, + "loss": 0.4661, + "step": 4385 + }, + { + "epoch": 0.5, + "learning_rate": 1.044590125628772e-05, + "loss": 0.4508, + "step": 4386 + }, + { + "epoch": 0.5, + "learning_rate": 1.0442203254905346e-05, + "loss": 0.4545, + "step": 4387 + }, + { + "epoch": 0.5, + "learning_rate": 1.043850519293125e-05, + "loss": 0.4552, + "step": 4388 + }, + { + "epoch": 0.5, + "learning_rate": 1.0434807070872154e-05, + "loss": 0.4872, + "step": 4389 + }, + { + "epoch": 0.5, + "learning_rate": 1.0431108889234783e-05, + "loss": 0.4596, + "step": 4390 + }, + { + "epoch": 0.5, + "learning_rate": 1.0427410648525863e-05, + "loss": 0.4589, + "step": 4391 + }, + { + "epoch": 0.5, + "learning_rate": 1.0423712349252148e-05, + "loss": 0.4557, + "step": 4392 + }, + { + "epoch": 0.5, + "learning_rate": 1.0420013991920382e-05, + "loss": 0.4582, + "step": 4393 + }, + { + "epoch": 0.5, + "learning_rate": 1.041631557703732e-05, + "loss": 0.4466, + "step": 4394 + }, + { + "epoch": 0.5, + "learning_rate": 1.0412617105109725e-05, + "loss": 0.4584, + "step": 4395 + }, + { + "epoch": 0.5, + "learning_rate": 1.0408918576644378e-05, + "loss": 0.4645, + "step": 4396 + }, + { + "epoch": 0.5, + "learning_rate": 1.0405219992148057e-05, + "loss": 0.4663, + "step": 4397 + }, + { + "epoch": 0.5, + "learning_rate": 1.040152135212755e-05, + "loss": 0.4447, + "step": 4398 + }, + { + 
"epoch": 0.5, + "learning_rate": 1.0397822657089653e-05, + "loss": 0.4731, + "step": 4399 + }, + { + "epoch": 0.5, + "learning_rate": 1.039412390754117e-05, + "loss": 0.4579, + "step": 4400 + }, + { + "epoch": 0.5, + "learning_rate": 1.039042510398891e-05, + "loss": 0.4788, + "step": 4401 + }, + { + "epoch": 0.5, + "learning_rate": 1.03867262469397e-05, + "loss": 0.473, + "step": 4402 + }, + { + "epoch": 0.5, + "learning_rate": 1.0383027336900356e-05, + "loss": 0.4693, + "step": 4403 + }, + { + "epoch": 0.5, + "learning_rate": 1.0379328374377715e-05, + "loss": 0.4546, + "step": 4404 + }, + { + "epoch": 0.5, + "learning_rate": 1.0375629359878616e-05, + "loss": 0.4708, + "step": 4405 + }, + { + "epoch": 0.5, + "learning_rate": 1.0371930293909911e-05, + "loss": 0.4519, + "step": 4406 + }, + { + "epoch": 0.5, + "learning_rate": 1.0368231176978454e-05, + "loss": 0.4842, + "step": 4407 + }, + { + "epoch": 0.5, + "learning_rate": 1.0364532009591101e-05, + "loss": 0.4637, + "step": 4408 + }, + { + "epoch": 0.5, + "learning_rate": 1.0360832792254727e-05, + "loss": 0.4791, + "step": 4409 + }, + { + "epoch": 0.5, + "learning_rate": 1.03571335254762e-05, + "loss": 0.4545, + "step": 4410 + }, + { + "epoch": 0.5, + "learning_rate": 1.0353434209762412e-05, + "loss": 0.4852, + "step": 4411 + }, + { + "epoch": 0.5, + "learning_rate": 1.0349734845620244e-05, + "loss": 0.4396, + "step": 4412 + }, + { + "epoch": 0.5, + "learning_rate": 1.0346035433556594e-05, + "loss": 0.4579, + "step": 4413 + }, + { + "epoch": 0.5, + "learning_rate": 1.0342335974078364e-05, + "loss": 0.4664, + "step": 4414 + }, + { + "epoch": 0.5, + "learning_rate": 1.0338636467692462e-05, + "loss": 0.4647, + "step": 4415 + }, + { + "epoch": 0.5, + "learning_rate": 1.0334936914905802e-05, + "loss": 0.4764, + "step": 4416 + }, + { + "epoch": 0.5, + "learning_rate": 1.0331237316225309e-05, + "loss": 0.4702, + "step": 4417 + }, + { + "epoch": 0.5, + "learning_rate": 1.0327537672157908e-05, + "loss": 0.4681, + "step": 
4418 + }, + { + "epoch": 0.51, + "learning_rate": 1.0323837983210535e-05, + "loss": 0.4651, + "step": 4419 + }, + { + "epoch": 0.51, + "learning_rate": 1.0320138249890126e-05, + "loss": 0.465, + "step": 4420 + }, + { + "epoch": 0.51, + "learning_rate": 1.031643847270363e-05, + "loss": 0.4518, + "step": 4421 + }, + { + "epoch": 0.51, + "learning_rate": 1.0312738652157996e-05, + "loss": 0.4564, + "step": 4422 + }, + { + "epoch": 0.51, + "learning_rate": 1.0309038788760185e-05, + "loss": 0.481, + "step": 4423 + }, + { + "epoch": 0.51, + "learning_rate": 1.0305338883017163e-05, + "loss": 0.4721, + "step": 4424 + }, + { + "epoch": 0.51, + "learning_rate": 1.0301638935435896e-05, + "loss": 0.469, + "step": 4425 + }, + { + "epoch": 0.51, + "learning_rate": 1.0297938946523361e-05, + "loss": 0.4487, + "step": 4426 + }, + { + "epoch": 0.51, + "learning_rate": 1.0294238916786537e-05, + "loss": 0.4555, + "step": 4427 + }, + { + "epoch": 0.51, + "learning_rate": 1.0290538846732415e-05, + "loss": 0.453, + "step": 4428 + }, + { + "epoch": 0.51, + "learning_rate": 1.0286838736867981e-05, + "loss": 0.4768, + "step": 4429 + }, + { + "epoch": 0.51, + "learning_rate": 1.0283138587700236e-05, + "loss": 0.4549, + "step": 4430 + }, + { + "epoch": 0.51, + "learning_rate": 1.0279438399736185e-05, + "loss": 0.4679, + "step": 4431 + }, + { + "epoch": 0.51, + "learning_rate": 1.0275738173482835e-05, + "loss": 0.4489, + "step": 4432 + }, + { + "epoch": 0.51, + "learning_rate": 1.0272037909447197e-05, + "loss": 0.4635, + "step": 4433 + }, + { + "epoch": 0.51, + "learning_rate": 1.0268337608136292e-05, + "loss": 0.4474, + "step": 4434 + }, + { + "epoch": 0.51, + "learning_rate": 1.0264637270057146e-05, + "loss": 0.4724, + "step": 4435 + }, + { + "epoch": 0.51, + "learning_rate": 1.0260936895716781e-05, + "loss": 0.458, + "step": 4436 + }, + { + "epoch": 0.51, + "learning_rate": 1.0257236485622241e-05, + "loss": 0.4632, + "step": 4437 + }, + { + "epoch": 0.51, + "learning_rate": 
1.0253536040280556e-05, + "loss": 0.4592, + "step": 4438 + }, + { + "epoch": 0.51, + "learning_rate": 1.0249835560198772e-05, + "loss": 0.4643, + "step": 4439 + }, + { + "epoch": 0.51, + "learning_rate": 1.0246135045883943e-05, + "loss": 0.4771, + "step": 4440 + }, + { + "epoch": 0.51, + "learning_rate": 1.0242434497843117e-05, + "loss": 0.4872, + "step": 4441 + }, + { + "epoch": 0.51, + "learning_rate": 1.023873391658335e-05, + "loss": 0.4517, + "step": 4442 + }, + { + "epoch": 0.51, + "learning_rate": 1.0235033302611704e-05, + "loss": 0.4768, + "step": 4443 + }, + { + "epoch": 0.51, + "learning_rate": 1.023133265643525e-05, + "loss": 0.4565, + "step": 4444 + }, + { + "epoch": 0.51, + "learning_rate": 1.0227631978561057e-05, + "loss": 0.4665, + "step": 4445 + }, + { + "epoch": 0.51, + "learning_rate": 1.0223931269496204e-05, + "loss": 0.4572, + "step": 4446 + }, + { + "epoch": 0.51, + "learning_rate": 1.0220230529747766e-05, + "loss": 0.4538, + "step": 4447 + }, + { + "epoch": 0.51, + "learning_rate": 1.0216529759822823e-05, + "loss": 0.4572, + "step": 4448 + }, + { + "epoch": 0.51, + "learning_rate": 1.0212828960228475e-05, + "loss": 0.4631, + "step": 4449 + }, + { + "epoch": 0.51, + "learning_rate": 1.0209128131471809e-05, + "loss": 0.454, + "step": 4450 + }, + { + "epoch": 0.51, + "learning_rate": 1.0205427274059915e-05, + "loss": 0.4702, + "step": 4451 + }, + { + "epoch": 0.51, + "learning_rate": 1.02017263884999e-05, + "loss": 0.4545, + "step": 4452 + }, + { + "epoch": 0.51, + "learning_rate": 1.0198025475298865e-05, + "loss": 0.4729, + "step": 4453 + }, + { + "epoch": 0.51, + "learning_rate": 1.019432453496392e-05, + "loss": 0.4482, + "step": 4454 + }, + { + "epoch": 0.51, + "learning_rate": 1.0190623568002178e-05, + "loss": 0.446, + "step": 4455 + }, + { + "epoch": 0.51, + "learning_rate": 1.0186922574920747e-05, + "loss": 0.4636, + "step": 4456 + }, + { + "epoch": 0.51, + "learning_rate": 1.018322155622675e-05, + "loss": 0.4636, + "step": 4457 + }, + { + 
"epoch": 0.51, + "learning_rate": 1.017952051242731e-05, + "loss": 0.469, + "step": 4458 + }, + { + "epoch": 0.51, + "learning_rate": 1.0175819444029555e-05, + "loss": 0.4491, + "step": 4459 + }, + { + "epoch": 0.51, + "learning_rate": 1.0172118351540608e-05, + "loss": 0.4544, + "step": 4460 + }, + { + "epoch": 0.51, + "learning_rate": 1.0168417235467604e-05, + "loss": 0.4653, + "step": 4461 + }, + { + "epoch": 0.51, + "learning_rate": 1.0164716096317677e-05, + "loss": 0.4614, + "step": 4462 + }, + { + "epoch": 0.51, + "learning_rate": 1.016101493459797e-05, + "loss": 0.4497, + "step": 4463 + }, + { + "epoch": 0.51, + "learning_rate": 1.0157313750815623e-05, + "loss": 0.4507, + "step": 4464 + }, + { + "epoch": 0.51, + "learning_rate": 1.0153612545477778e-05, + "loss": 0.4477, + "step": 4465 + }, + { + "epoch": 0.51, + "learning_rate": 1.0149911319091583e-05, + "loss": 0.4675, + "step": 4466 + }, + { + "epoch": 0.51, + "learning_rate": 1.014621007216419e-05, + "loss": 0.4669, + "step": 4467 + }, + { + "epoch": 0.51, + "learning_rate": 1.0142508805202757e-05, + "loss": 0.4668, + "step": 4468 + }, + { + "epoch": 0.51, + "learning_rate": 1.0138807518714435e-05, + "loss": 0.4754, + "step": 4469 + }, + { + "epoch": 0.51, + "learning_rate": 1.0135106213206382e-05, + "loss": 0.4581, + "step": 4470 + }, + { + "epoch": 0.51, + "learning_rate": 1.0131404889185762e-05, + "loss": 0.4621, + "step": 4471 + }, + { + "epoch": 0.51, + "learning_rate": 1.012770354715974e-05, + "loss": 0.459, + "step": 4472 + }, + { + "epoch": 0.51, + "learning_rate": 1.012400218763548e-05, + "loss": 0.4633, + "step": 4473 + }, + { + "epoch": 0.51, + "learning_rate": 1.012030081112015e-05, + "loss": 0.4751, + "step": 4474 + }, + { + "epoch": 0.51, + "learning_rate": 1.0116599418120924e-05, + "loss": 0.4624, + "step": 4475 + }, + { + "epoch": 0.51, + "learning_rate": 1.0112898009144977e-05, + "loss": 0.4584, + "step": 4476 + }, + { + "epoch": 0.51, + "learning_rate": 1.0109196584699478e-05, + "loss": 
0.4615, + "step": 4477 + }, + { + "epoch": 0.51, + "learning_rate": 1.0105495145291612e-05, + "loss": 0.4712, + "step": 4478 + }, + { + "epoch": 0.51, + "learning_rate": 1.0101793691428554e-05, + "loss": 0.456, + "step": 4479 + }, + { + "epoch": 0.51, + "learning_rate": 1.0098092223617488e-05, + "loss": 0.4551, + "step": 4480 + }, + { + "epoch": 0.51, + "learning_rate": 1.0094390742365598e-05, + "loss": 0.4553, + "step": 4481 + }, + { + "epoch": 0.51, + "learning_rate": 1.009068924818007e-05, + "loss": 0.47, + "step": 4482 + }, + { + "epoch": 0.51, + "learning_rate": 1.0086987741568089e-05, + "loss": 0.4609, + "step": 4483 + }, + { + "epoch": 0.51, + "learning_rate": 1.0083286223036845e-05, + "loss": 0.4694, + "step": 4484 + }, + { + "epoch": 0.51, + "learning_rate": 1.0079584693093529e-05, + "loss": 0.4731, + "step": 4485 + }, + { + "epoch": 0.51, + "learning_rate": 1.0075883152245334e-05, + "loss": 0.4774, + "step": 4486 + }, + { + "epoch": 0.51, + "learning_rate": 1.007218160099945e-05, + "loss": 0.4797, + "step": 4487 + }, + { + "epoch": 0.51, + "learning_rate": 1.006848003986308e-05, + "loss": 0.4382, + "step": 4488 + }, + { + "epoch": 0.51, + "learning_rate": 1.0064778469343413e-05, + "loss": 0.4554, + "step": 4489 + }, + { + "epoch": 0.51, + "learning_rate": 1.006107688994765e-05, + "loss": 0.4602, + "step": 4490 + }, + { + "epoch": 0.51, + "learning_rate": 1.0057375302182988e-05, + "loss": 0.4634, + "step": 4491 + }, + { + "epoch": 0.51, + "learning_rate": 1.005367370655663e-05, + "loss": 0.4633, + "step": 4492 + }, + { + "epoch": 0.51, + "learning_rate": 1.0049972103575775e-05, + "loss": 0.4775, + "step": 4493 + }, + { + "epoch": 0.51, + "learning_rate": 1.004627049374763e-05, + "loss": 0.462, + "step": 4494 + }, + { + "epoch": 0.51, + "learning_rate": 1.0042568877579388e-05, + "loss": 0.4605, + "step": 4495 + }, + { + "epoch": 0.51, + "learning_rate": 1.0038867255578261e-05, + "loss": 0.45, + "step": 4496 + }, + { + "epoch": 0.51, + "learning_rate": 
1.0035165628251455e-05, + "loss": 0.4773, + "step": 4497 + }, + { + "epoch": 0.51, + "learning_rate": 1.0031463996106175e-05, + "loss": 0.462, + "step": 4498 + }, + { + "epoch": 0.51, + "learning_rate": 1.002776235964962e-05, + "loss": 0.4733, + "step": 4499 + }, + { + "epoch": 0.51, + "learning_rate": 1.0024060719389002e-05, + "loss": 0.4687, + "step": 4500 + }, + { + "epoch": 0.51, + "learning_rate": 1.002035907583153e-05, + "loss": 0.4611, + "step": 4501 + }, + { + "epoch": 0.51, + "learning_rate": 1.001665742948441e-05, + "loss": 0.4397, + "step": 4502 + }, + { + "epoch": 0.51, + "learning_rate": 1.0012955780854852e-05, + "loss": 0.4622, + "step": 4503 + }, + { + "epoch": 0.51, + "learning_rate": 1.0009254130450059e-05, + "loss": 0.4544, + "step": 4504 + }, + { + "epoch": 0.51, + "learning_rate": 1.0005552478777244e-05, + "loss": 0.4854, + "step": 4505 + }, + { + "epoch": 0.51, + "learning_rate": 1.0001850826343615e-05, + "loss": 0.458, + "step": 4506 + }, + { + "epoch": 0.52, + "learning_rate": 9.998149173656387e-06, + "loss": 0.4704, + "step": 4507 + }, + { + "epoch": 0.52, + "learning_rate": 9.994447521222758e-06, + "loss": 0.4649, + "step": 4508 + }, + { + "epoch": 0.52, + "learning_rate": 9.990745869549943e-06, + "loss": 0.4603, + "step": 4509 + }, + { + "epoch": 0.52, + "learning_rate": 9.987044219145155e-06, + "loss": 0.4715, + "step": 4510 + }, + { + "epoch": 0.52, + "learning_rate": 9.983342570515592e-06, + "loss": 0.4502, + "step": 4511 + }, + { + "epoch": 0.52, + "learning_rate": 9.979640924168475e-06, + "loss": 0.4481, + "step": 4512 + }, + { + "epoch": 0.52, + "learning_rate": 9.975939280611e-06, + "loss": 0.4688, + "step": 4513 + }, + { + "epoch": 0.52, + "learning_rate": 9.972237640350383e-06, + "loss": 0.4572, + "step": 4514 + }, + { + "epoch": 0.52, + "learning_rate": 9.968536003893832e-06, + "loss": 0.4765, + "step": 4515 + }, + { + "epoch": 0.52, + "learning_rate": 9.964834371748547e-06, + "loss": 0.4421, + "step": 4516 + }, + { + "epoch": 
0.52, + "learning_rate": 9.96113274442174e-06, + "loss": 0.4692, + "step": 4517 + }, + { + "epoch": 0.52, + "learning_rate": 9.957431122420615e-06, + "loss": 0.4652, + "step": 4518 + }, + { + "epoch": 0.52, + "learning_rate": 9.953729506252374e-06, + "loss": 0.4713, + "step": 4519 + }, + { + "epoch": 0.52, + "learning_rate": 9.950027896424228e-06, + "loss": 0.4544, + "step": 4520 + }, + { + "epoch": 0.52, + "learning_rate": 9.946326293443371e-06, + "loss": 0.4721, + "step": 4521 + }, + { + "epoch": 0.52, + "learning_rate": 9.942624697817015e-06, + "loss": 0.4527, + "step": 4522 + }, + { + "epoch": 0.52, + "learning_rate": 9.938923110052353e-06, + "loss": 0.4667, + "step": 4523 + }, + { + "epoch": 0.52, + "learning_rate": 9.935221530656589e-06, + "loss": 0.4684, + "step": 4524 + }, + { + "epoch": 0.52, + "learning_rate": 9.931519960136925e-06, + "loss": 0.469, + "step": 4525 + }, + { + "epoch": 0.52, + "learning_rate": 9.92781839900055e-06, + "loss": 0.4682, + "step": 4526 + }, + { + "epoch": 0.52, + "learning_rate": 9.92411684775467e-06, + "loss": 0.4617, + "step": 4527 + }, + { + "epoch": 0.52, + "learning_rate": 9.920415306906475e-06, + "loss": 0.4692, + "step": 4528 + }, + { + "epoch": 0.52, + "learning_rate": 9.916713776963156e-06, + "loss": 0.4503, + "step": 4529 + }, + { + "epoch": 0.52, + "learning_rate": 9.913012258431915e-06, + "loss": 0.453, + "step": 4530 + }, + { + "epoch": 0.52, + "learning_rate": 9.909310751819936e-06, + "loss": 0.4832, + "step": 4531 + }, + { + "epoch": 0.52, + "learning_rate": 9.905609257634404e-06, + "loss": 0.4704, + "step": 4532 + }, + { + "epoch": 0.52, + "learning_rate": 9.901907776382514e-06, + "loss": 0.4666, + "step": 4533 + }, + { + "epoch": 0.52, + "learning_rate": 9.898206308571446e-06, + "loss": 0.4546, + "step": 4534 + }, + { + "epoch": 0.52, + "learning_rate": 9.894504854708391e-06, + "loss": 0.4571, + "step": 4535 + }, + { + "epoch": 0.52, + "learning_rate": 9.890803415300527e-06, + "loss": 0.475, + "step": 4536 + }, 
+ { + "epoch": 0.52, + "learning_rate": 9.887101990855027e-06, + "loss": 0.4718, + "step": 4537 + }, + { + "epoch": 0.52, + "learning_rate": 9.883400581879077e-06, + "loss": 0.4536, + "step": 4538 + }, + { + "epoch": 0.52, + "learning_rate": 9.87969918887985e-06, + "loss": 0.4675, + "step": 4539 + }, + { + "epoch": 0.52, + "learning_rate": 9.875997812364524e-06, + "loss": 0.451, + "step": 4540 + }, + { + "epoch": 0.52, + "learning_rate": 9.872296452840266e-06, + "loss": 0.4575, + "step": 4541 + }, + { + "epoch": 0.52, + "learning_rate": 9.86859511081424e-06, + "loss": 0.4571, + "step": 4542 + }, + { + "epoch": 0.52, + "learning_rate": 9.86489378679362e-06, + "loss": 0.4678, + "step": 4543 + }, + { + "epoch": 0.52, + "learning_rate": 9.86119248128557e-06, + "loss": 0.4673, + "step": 4544 + }, + { + "epoch": 0.52, + "learning_rate": 9.857491194797244e-06, + "loss": 0.4733, + "step": 4545 + }, + { + "epoch": 0.52, + "learning_rate": 9.853789927835811e-06, + "loss": 0.4682, + "step": 4546 + }, + { + "epoch": 0.52, + "learning_rate": 9.85008868090842e-06, + "loss": 0.4519, + "step": 4547 + }, + { + "epoch": 0.52, + "learning_rate": 9.846387454522225e-06, + "loss": 0.4542, + "step": 4548 + }, + { + "epoch": 0.52, + "learning_rate": 9.842686249184384e-06, + "loss": 0.4461, + "step": 4549 + }, + { + "epoch": 0.52, + "learning_rate": 9.838985065402032e-06, + "loss": 0.4728, + "step": 4550 + }, + { + "epoch": 0.52, + "learning_rate": 9.835283903682327e-06, + "loss": 0.4747, + "step": 4551 + }, + { + "epoch": 0.52, + "learning_rate": 9.831582764532399e-06, + "loss": 0.4419, + "step": 4552 + }, + { + "epoch": 0.52, + "learning_rate": 9.827881648459396e-06, + "loss": 0.4559, + "step": 4553 + }, + { + "epoch": 0.52, + "learning_rate": 9.824180555970451e-06, + "loss": 0.4902, + "step": 4554 + }, + { + "epoch": 0.52, + "learning_rate": 9.820479487572691e-06, + "loss": 0.443, + "step": 4555 + }, + { + "epoch": 0.52, + "learning_rate": 9.816778443773253e-06, + "loss": 0.4473, + 
"step": 4556 + }, + { + "epoch": 0.52, + "learning_rate": 9.813077425079258e-06, + "loss": 0.4676, + "step": 4557 + }, + { + "epoch": 0.52, + "learning_rate": 9.809376431997825e-06, + "loss": 0.446, + "step": 4558 + }, + { + "epoch": 0.52, + "learning_rate": 9.805675465036084e-06, + "loss": 0.4786, + "step": 4559 + }, + { + "epoch": 0.52, + "learning_rate": 9.801974524701135e-06, + "loss": 0.4651, + "step": 4560 + }, + { + "epoch": 0.52, + "learning_rate": 9.798273611500103e-06, + "loss": 0.4478, + "step": 4561 + }, + { + "epoch": 0.52, + "learning_rate": 9.794572725940088e-06, + "loss": 0.4633, + "step": 4562 + }, + { + "epoch": 0.52, + "learning_rate": 9.790871868528194e-06, + "loss": 0.4631, + "step": 4563 + }, + { + "epoch": 0.52, + "learning_rate": 9.787171039771528e-06, + "loss": 0.458, + "step": 4564 + }, + { + "epoch": 0.52, + "learning_rate": 9.783470240177175e-06, + "loss": 0.4644, + "step": 4565 + }, + { + "epoch": 0.52, + "learning_rate": 9.779769470252237e-06, + "loss": 0.4525, + "step": 4566 + }, + { + "epoch": 0.52, + "learning_rate": 9.776068730503801e-06, + "loss": 0.48, + "step": 4567 + }, + { + "epoch": 0.52, + "learning_rate": 9.772368021438943e-06, + "loss": 0.4676, + "step": 4568 + }, + { + "epoch": 0.52, + "learning_rate": 9.768667343564752e-06, + "loss": 0.4516, + "step": 4569 + }, + { + "epoch": 0.52, + "learning_rate": 9.7649666973883e-06, + "loss": 0.4608, + "step": 4570 + }, + { + "epoch": 0.52, + "learning_rate": 9.761266083416655e-06, + "loss": 0.4624, + "step": 4571 + }, + { + "epoch": 0.52, + "learning_rate": 9.75756550215689e-06, + "loss": 0.4609, + "step": 4572 + }, + { + "epoch": 0.52, + "learning_rate": 9.753864954116058e-06, + "loss": 0.4632, + "step": 4573 + }, + { + "epoch": 0.52, + "learning_rate": 9.75016443980123e-06, + "loss": 0.4696, + "step": 4574 + }, + { + "epoch": 0.52, + "learning_rate": 9.746463959719447e-06, + "loss": 0.4639, + "step": 4575 + }, + { + "epoch": 0.52, + "learning_rate": 9.74276351437776e-06, + 
"loss": 0.4705, + "step": 4576 + }, + { + "epoch": 0.52, + "learning_rate": 9.73906310428322e-06, + "loss": 0.4716, + "step": 4577 + }, + { + "epoch": 0.52, + "learning_rate": 9.735362729942856e-06, + "loss": 0.4548, + "step": 4578 + }, + { + "epoch": 0.52, + "learning_rate": 9.731662391863711e-06, + "loss": 0.453, + "step": 4579 + }, + { + "epoch": 0.52, + "learning_rate": 9.727962090552808e-06, + "loss": 0.4619, + "step": 4580 + }, + { + "epoch": 0.52, + "learning_rate": 9.724261826517167e-06, + "loss": 0.4502, + "step": 4581 + }, + { + "epoch": 0.52, + "learning_rate": 9.720561600263818e-06, + "loss": 0.4543, + "step": 4582 + }, + { + "epoch": 0.52, + "learning_rate": 9.716861412299769e-06, + "loss": 0.4512, + "step": 4583 + }, + { + "epoch": 0.52, + "learning_rate": 9.713161263132022e-06, + "loss": 0.4865, + "step": 4584 + }, + { + "epoch": 0.52, + "learning_rate": 9.70946115326759e-06, + "loss": 0.4704, + "step": 4585 + }, + { + "epoch": 0.52, + "learning_rate": 9.705761083213463e-06, + "loss": 0.4619, + "step": 4586 + }, + { + "epoch": 0.52, + "learning_rate": 9.702061053476642e-06, + "loss": 0.4452, + "step": 4587 + }, + { + "epoch": 0.52, + "learning_rate": 9.698361064564107e-06, + "loss": 0.4892, + "step": 4588 + }, + { + "epoch": 0.52, + "learning_rate": 9.694661116982838e-06, + "loss": 0.437, + "step": 4589 + }, + { + "epoch": 0.52, + "learning_rate": 9.690961211239816e-06, + "loss": 0.4578, + "step": 4590 + }, + { + "epoch": 0.52, + "learning_rate": 9.687261347842004e-06, + "loss": 0.4736, + "step": 4591 + }, + { + "epoch": 0.52, + "learning_rate": 9.683561527296375e-06, + "loss": 0.4749, + "step": 4592 + }, + { + "epoch": 0.52, + "learning_rate": 9.67986175010988e-06, + "loss": 0.4598, + "step": 4593 + }, + { + "epoch": 0.53, + "learning_rate": 9.676162016789469e-06, + "loss": 0.4787, + "step": 4594 + }, + { + "epoch": 0.53, + "learning_rate": 9.672462327842095e-06, + "loss": 0.4639, + "step": 4595 + }, + { + "epoch": 0.53, + "learning_rate": 
9.668762683774691e-06, + "loss": 0.4549, + "step": 4596 + }, + { + "epoch": 0.53, + "learning_rate": 9.6650630850942e-06, + "loss": 0.4702, + "step": 4597 + }, + { + "epoch": 0.53, + "learning_rate": 9.661363532307543e-06, + "loss": 0.4616, + "step": 4598 + }, + { + "epoch": 0.53, + "learning_rate": 9.65766402592164e-06, + "loss": 0.4599, + "step": 4599 + }, + { + "epoch": 0.53, + "learning_rate": 9.65396456644341e-06, + "loss": 0.4743, + "step": 4600 + }, + { + "epoch": 0.53, + "learning_rate": 9.650265154379761e-06, + "loss": 0.4605, + "step": 4601 + }, + { + "epoch": 0.53, + "learning_rate": 9.64656579023759e-06, + "loss": 0.4574, + "step": 4602 + }, + { + "epoch": 0.53, + "learning_rate": 9.642866474523802e-06, + "loss": 0.458, + "step": 4603 + }, + { + "epoch": 0.53, + "learning_rate": 9.639167207745276e-06, + "loss": 0.448, + "step": 4604 + }, + { + "epoch": 0.53, + "learning_rate": 9.6354679904089e-06, + "loss": 0.4661, + "step": 4605 + }, + { + "epoch": 0.53, + "learning_rate": 9.631768823021551e-06, + "loss": 0.4667, + "step": 4606 + }, + { + "epoch": 0.53, + "learning_rate": 9.628069706090089e-06, + "loss": 0.4673, + "step": 4607 + }, + { + "epoch": 0.53, + "learning_rate": 9.624370640121387e-06, + "loss": 0.4743, + "step": 4608 + }, + { + "epoch": 0.53, + "learning_rate": 9.620671625622287e-06, + "loss": 0.4708, + "step": 4609 + }, + { + "epoch": 0.53, + "learning_rate": 9.616972663099648e-06, + "loss": 0.4519, + "step": 4610 + }, + { + "epoch": 0.53, + "learning_rate": 9.613273753060306e-06, + "loss": 0.4693, + "step": 4611 + }, + { + "epoch": 0.53, + "learning_rate": 9.60957489601109e-06, + "loss": 0.4497, + "step": 4612 + }, + { + "epoch": 0.53, + "learning_rate": 9.605876092458835e-06, + "loss": 0.4547, + "step": 4613 + }, + { + "epoch": 0.53, + "learning_rate": 9.60217734291035e-06, + "loss": 0.4704, + "step": 4614 + }, + { + "epoch": 0.53, + "learning_rate": 9.598478647872451e-06, + "loss": 0.4674, + "step": 4615 + }, + { + "epoch": 0.53, + 
"learning_rate": 9.594780007851947e-06, + "loss": 0.4687, + "step": 4616 + }, + { + "epoch": 0.53, + "learning_rate": 9.591081423355622e-06, + "loss": 0.4746, + "step": 4617 + }, + { + "epoch": 0.53, + "learning_rate": 9.587382894890276e-06, + "loss": 0.4641, + "step": 4618 + }, + { + "epoch": 0.53, + "learning_rate": 9.583684422962686e-06, + "loss": 0.4568, + "step": 4619 + }, + { + "epoch": 0.53, + "learning_rate": 9.57998600807962e-06, + "loss": 0.4585, + "step": 4620 + }, + { + "epoch": 0.53, + "learning_rate": 9.576287650747854e-06, + "loss": 0.455, + "step": 4621 + }, + { + "epoch": 0.53, + "learning_rate": 9.572589351474135e-06, + "loss": 0.4596, + "step": 4622 + }, + { + "epoch": 0.53, + "learning_rate": 9.568891110765219e-06, + "loss": 0.4498, + "step": 4623 + }, + { + "epoch": 0.53, + "learning_rate": 9.565192929127849e-06, + "loss": 0.4718, + "step": 4624 + }, + { + "epoch": 0.53, + "learning_rate": 9.56149480706875e-06, + "loss": 0.459, + "step": 4625 + }, + { + "epoch": 0.53, + "learning_rate": 9.557796745094659e-06, + "loss": 0.4699, + "step": 4626 + }, + { + "epoch": 0.53, + "learning_rate": 9.554098743712282e-06, + "loss": 0.4617, + "step": 4627 + }, + { + "epoch": 0.53, + "learning_rate": 9.55040080342833e-06, + "loss": 0.4584, + "step": 4628 + }, + { + "epoch": 0.53, + "learning_rate": 9.546702924749513e-06, + "loss": 0.4683, + "step": 4629 + }, + { + "epoch": 0.53, + "learning_rate": 9.543005108182508e-06, + "loss": 0.4565, + "step": 4630 + }, + { + "epoch": 0.53, + "learning_rate": 9.539307354234013e-06, + "loss": 0.4596, + "step": 4631 + }, + { + "epoch": 0.53, + "learning_rate": 9.535609663410692e-06, + "loss": 0.4669, + "step": 4632 + }, + { + "epoch": 0.53, + "learning_rate": 9.531912036219214e-06, + "loss": 0.4542, + "step": 4633 + }, + { + "epoch": 0.53, + "learning_rate": 9.528214473166241e-06, + "loss": 0.4628, + "step": 4634 + }, + { + "epoch": 0.53, + "learning_rate": 9.524516974758415e-06, + "loss": 0.4588, + "step": 4635 + }, + { + 
"epoch": 0.53, + "learning_rate": 9.520819541502384e-06, + "loss": 0.4724, + "step": 4636 + }, + { + "epoch": 0.53, + "learning_rate": 9.51712217390477e-06, + "loss": 0.4556, + "step": 4637 + }, + { + "epoch": 0.53, + "learning_rate": 9.5134248724722e-06, + "loss": 0.4625, + "step": 4638 + }, + { + "epoch": 0.53, + "learning_rate": 9.509727637711287e-06, + "loss": 0.4548, + "step": 4639 + }, + { + "epoch": 0.53, + "learning_rate": 9.506030470128635e-06, + "loss": 0.4666, + "step": 4640 + }, + { + "epoch": 0.53, + "learning_rate": 9.502333370230831e-06, + "loss": 0.4542, + "step": 4641 + }, + { + "epoch": 0.53, + "learning_rate": 9.49863633852447e-06, + "loss": 0.4654, + "step": 4642 + }, + { + "epoch": 0.53, + "learning_rate": 9.494939375516122e-06, + "loss": 0.4402, + "step": 4643 + }, + { + "epoch": 0.53, + "learning_rate": 9.49124248171236e-06, + "loss": 0.459, + "step": 4644 + }, + { + "epoch": 0.53, + "learning_rate": 9.487545657619736e-06, + "loss": 0.4613, + "step": 4645 + }, + { + "epoch": 0.53, + "learning_rate": 9.483848903744795e-06, + "loss": 0.4691, + "step": 4646 + }, + { + "epoch": 0.53, + "learning_rate": 9.48015222059408e-06, + "loss": 0.451, + "step": 4647 + }, + { + "epoch": 0.53, + "learning_rate": 9.476455608674112e-06, + "loss": 0.4657, + "step": 4648 + }, + { + "epoch": 0.53, + "learning_rate": 9.472759068491421e-06, + "loss": 0.4631, + "step": 4649 + }, + { + "epoch": 0.53, + "learning_rate": 9.469062600552509e-06, + "loss": 0.4553, + "step": 4650 + }, + { + "epoch": 0.53, + "learning_rate": 9.46536620536387e-06, + "loss": 0.4589, + "step": 4651 + }, + { + "epoch": 0.53, + "learning_rate": 9.461669883431997e-06, + "loss": 0.4644, + "step": 4652 + }, + { + "epoch": 0.53, + "learning_rate": 9.457973635263375e-06, + "loss": 0.4886, + "step": 4653 + }, + { + "epoch": 0.53, + "learning_rate": 9.45427746136446e-06, + "loss": 0.4691, + "step": 4654 + }, + { + "epoch": 0.53, + "learning_rate": 9.45058136224172e-06, + "loss": 0.4444, + "step": 4655 + 
}, + { + "epoch": 0.53, + "learning_rate": 9.446885338401597e-06, + "loss": 0.4715, + "step": 4656 + }, + { + "epoch": 0.53, + "learning_rate": 9.443189390350534e-06, + "loss": 0.4667, + "step": 4657 + }, + { + "epoch": 0.53, + "learning_rate": 9.439493518594957e-06, + "loss": 0.4679, + "step": 4658 + }, + { + "epoch": 0.53, + "learning_rate": 9.435797723641277e-06, + "loss": 0.4479, + "step": 4659 + }, + { + "epoch": 0.53, + "learning_rate": 9.432102005995912e-06, + "loss": 0.4777, + "step": 4660 + }, + { + "epoch": 0.53, + "learning_rate": 9.428406366165244e-06, + "loss": 0.4656, + "step": 4661 + }, + { + "epoch": 0.53, + "learning_rate": 9.424710804655669e-06, + "loss": 0.4465, + "step": 4662 + }, + { + "epoch": 0.53, + "learning_rate": 9.42101532197356e-06, + "loss": 0.4584, + "step": 4663 + }, + { + "epoch": 0.53, + "learning_rate": 9.417319918625274e-06, + "loss": 0.4644, + "step": 4664 + }, + { + "epoch": 0.53, + "learning_rate": 9.413624595117173e-06, + "loss": 0.4449, + "step": 4665 + }, + { + "epoch": 0.53, + "learning_rate": 9.409929351955592e-06, + "loss": 0.4884, + "step": 4666 + }, + { + "epoch": 0.53, + "learning_rate": 9.40623418964686e-06, + "loss": 0.4483, + "step": 4667 + }, + { + "epoch": 0.53, + "learning_rate": 9.402539108697306e-06, + "loss": 0.4915, + "step": 4668 + }, + { + "epoch": 0.53, + "learning_rate": 9.398844109613228e-06, + "loss": 0.4573, + "step": 4669 + }, + { + "epoch": 0.53, + "learning_rate": 9.395149192900934e-06, + "loss": 0.4712, + "step": 4670 + }, + { + "epoch": 0.53, + "learning_rate": 9.391454359066701e-06, + "loss": 0.4664, + "step": 4671 + }, + { + "epoch": 0.53, + "learning_rate": 9.387759608616806e-06, + "loss": 0.4573, + "step": 4672 + }, + { + "epoch": 0.53, + "learning_rate": 9.384064942057518e-06, + "loss": 0.4352, + "step": 4673 + }, + { + "epoch": 0.53, + "learning_rate": 9.380370359895079e-06, + "loss": 0.4715, + "step": 4674 + }, + { + "epoch": 0.53, + "learning_rate": 9.37667586263574e-06, + "loss": 0.4777, 
+ "step": 4675 + }, + { + "epoch": 0.53, + "learning_rate": 9.372981450785723e-06, + "loss": 0.4662, + "step": 4676 + }, + { + "epoch": 0.53, + "learning_rate": 9.369287124851243e-06, + "loss": 0.4557, + "step": 4677 + }, + { + "epoch": 0.53, + "learning_rate": 9.365592885338512e-06, + "loss": 0.457, + "step": 4678 + }, + { + "epoch": 0.53, + "learning_rate": 9.361898732753715e-06, + "loss": 0.4632, + "step": 4679 + }, + { + "epoch": 0.53, + "learning_rate": 9.358204667603043e-06, + "loss": 0.4692, + "step": 4680 + }, + { + "epoch": 0.53, + "learning_rate": 9.35451069039266e-06, + "loss": 0.4565, + "step": 4681 + }, + { + "epoch": 0.54, + "learning_rate": 9.35081680162872e-06, + "loss": 0.4649, + "step": 4682 + }, + { + "epoch": 0.54, + "learning_rate": 9.347123001817376e-06, + "loss": 0.4416, + "step": 4683 + }, + { + "epoch": 0.54, + "learning_rate": 9.343429291464756e-06, + "loss": 0.485, + "step": 4684 + }, + { + "epoch": 0.54, + "learning_rate": 9.339735671076978e-06, + "loss": 0.4466, + "step": 4685 + }, + { + "epoch": 0.54, + "learning_rate": 9.336042141160158e-06, + "loss": 0.4855, + "step": 4686 + }, + { + "epoch": 0.54, + "learning_rate": 9.332348702220386e-06, + "loss": 0.4635, + "step": 4687 + }, + { + "epoch": 0.54, + "learning_rate": 9.32865535476375e-06, + "loss": 0.4589, + "step": 4688 + }, + { + "epoch": 0.54, + "learning_rate": 9.32496209929632e-06, + "loss": 0.4634, + "step": 4689 + }, + { + "epoch": 0.54, + "learning_rate": 9.32126893632415e-06, + "loss": 0.4645, + "step": 4690 + }, + { + "epoch": 0.54, + "learning_rate": 9.317575866353293e-06, + "loss": 0.442, + "step": 4691 + }, + { + "epoch": 0.54, + "learning_rate": 9.313882889889773e-06, + "loss": 0.4607, + "step": 4692 + }, + { + "epoch": 0.54, + "learning_rate": 9.31019000743962e-06, + "loss": 0.4629, + "step": 4693 + }, + { + "epoch": 0.54, + "learning_rate": 9.306497219508835e-06, + "loss": 0.4754, + "step": 4694 + }, + { + "epoch": 0.54, + "learning_rate": 9.302804526603413e-06, + 
"loss": 0.4514, + "step": 4695 + }, + { + "epoch": 0.54, + "learning_rate": 9.29911192922934e-06, + "loss": 0.4604, + "step": 4696 + }, + { + "epoch": 0.54, + "learning_rate": 9.29541942789258e-06, + "loss": 0.4665, + "step": 4697 + }, + { + "epoch": 0.54, + "learning_rate": 9.291727023099087e-06, + "loss": 0.4833, + "step": 4698 + }, + { + "epoch": 0.54, + "learning_rate": 9.288034715354806e-06, + "loss": 0.4474, + "step": 4699 + }, + { + "epoch": 0.54, + "learning_rate": 9.28434250516566e-06, + "loss": 0.4553, + "step": 4700 + }, + { + "epoch": 0.54, + "learning_rate": 9.280650393037578e-06, + "loss": 0.4573, + "step": 4701 + }, + { + "epoch": 0.54, + "learning_rate": 9.276958379476449e-06, + "loss": 0.4991, + "step": 4702 + }, + { + "epoch": 0.54, + "learning_rate": 9.27326646498816e-06, + "loss": 0.4458, + "step": 4703 + }, + { + "epoch": 0.54, + "learning_rate": 9.269574650078594e-06, + "loss": 0.471, + "step": 4704 + }, + { + "epoch": 0.54, + "learning_rate": 9.265882935253605e-06, + "loss": 0.4525, + "step": 4705 + }, + { + "epoch": 0.54, + "learning_rate": 9.262191321019049e-06, + "loss": 0.473, + "step": 4706 + }, + { + "epoch": 0.54, + "learning_rate": 9.258499807880755e-06, + "loss": 0.4637, + "step": 4707 + }, + { + "epoch": 0.54, + "learning_rate": 9.254808396344536e-06, + "loss": 0.4749, + "step": 4708 + }, + { + "epoch": 0.54, + "learning_rate": 9.251117086916209e-06, + "loss": 0.4477, + "step": 4709 + }, + { + "epoch": 0.54, + "learning_rate": 9.247425880101561e-06, + "loss": 0.4813, + "step": 4710 + }, + { + "epoch": 0.54, + "learning_rate": 9.243734776406365e-06, + "loss": 0.4593, + "step": 4711 + }, + { + "epoch": 0.54, + "learning_rate": 9.240043776336397e-06, + "loss": 0.4823, + "step": 4712 + }, + { + "epoch": 0.54, + "learning_rate": 9.23635288039739e-06, + "loss": 0.4511, + "step": 4713 + }, + { + "epoch": 0.54, + "learning_rate": 9.232662089095091e-06, + "loss": 0.4458, + "step": 4714 + }, + { + "epoch": 0.54, + "learning_rate": 
9.22897140293522e-06, + "loss": 0.4836, + "step": 4715 + }, + { + "epoch": 0.54, + "learning_rate": 9.225280822423477e-06, + "loss": 0.4481, + "step": 4716 + }, + { + "epoch": 0.54, + "learning_rate": 9.221590348065561e-06, + "loss": 0.4664, + "step": 4717 + }, + { + "epoch": 0.54, + "learning_rate": 9.217899980367142e-06, + "loss": 0.4613, + "step": 4718 + }, + { + "epoch": 0.54, + "learning_rate": 9.214209719833891e-06, + "loss": 0.4589, + "step": 4719 + }, + { + "epoch": 0.54, + "learning_rate": 9.210519566971452e-06, + "loss": 0.4755, + "step": 4720 + }, + { + "epoch": 0.54, + "learning_rate": 9.206829522285456e-06, + "loss": 0.4648, + "step": 4721 + }, + { + "epoch": 0.54, + "learning_rate": 9.203139586281527e-06, + "loss": 0.4539, + "step": 4722 + }, + { + "epoch": 0.54, + "learning_rate": 9.199449759465263e-06, + "loss": 0.455, + "step": 4723 + }, + { + "epoch": 0.54, + "learning_rate": 9.195760042342254e-06, + "loss": 0.4757, + "step": 4724 + }, + { + "epoch": 0.54, + "learning_rate": 9.192070435418079e-06, + "loss": 0.4543, + "step": 4725 + }, + { + "epoch": 0.54, + "learning_rate": 9.188380939198287e-06, + "loss": 0.4611, + "step": 4726 + }, + { + "epoch": 0.54, + "learning_rate": 9.184691554188432e-06, + "loss": 0.4593, + "step": 4727 + }, + { + "epoch": 0.54, + "learning_rate": 9.181002280894034e-06, + "loss": 0.4526, + "step": 4728 + }, + { + "epoch": 0.54, + "learning_rate": 9.177313119820608e-06, + "loss": 0.4462, + "step": 4729 + }, + { + "epoch": 0.54, + "learning_rate": 9.173624071473655e-06, + "loss": 0.4694, + "step": 4730 + }, + { + "epoch": 0.54, + "learning_rate": 9.16993513635865e-06, + "loss": 0.4631, + "step": 4731 + }, + { + "epoch": 0.54, + "learning_rate": 9.166246314981066e-06, + "loss": 0.4379, + "step": 4732 + }, + { + "epoch": 0.54, + "learning_rate": 9.162557607846352e-06, + "loss": 0.4645, + "step": 4733 + }, + { + "epoch": 0.54, + "learning_rate": 9.158869015459939e-06, + "loss": 0.4768, + "step": 4734 + }, + { + "epoch": 0.54, + 
"learning_rate": 9.155180538327255e-06, + "loss": 0.4581, + "step": 4735 + }, + { + "epoch": 0.54, + "learning_rate": 9.151492176953697e-06, + "loss": 0.4598, + "step": 4736 + }, + { + "epoch": 0.54, + "learning_rate": 9.147803931844651e-06, + "loss": 0.4646, + "step": 4737 + }, + { + "epoch": 0.54, + "learning_rate": 9.144115803505498e-06, + "loss": 0.493, + "step": 4738 + }, + { + "epoch": 0.54, + "learning_rate": 9.140427792441584e-06, + "loss": 0.4566, + "step": 4739 + }, + { + "epoch": 0.54, + "learning_rate": 9.136739899158257e-06, + "loss": 0.4656, + "step": 4740 + }, + { + "epoch": 0.54, + "learning_rate": 9.133052124160837e-06, + "loss": 0.4568, + "step": 4741 + }, + { + "epoch": 0.54, + "learning_rate": 9.129364467954628e-06, + "loss": 0.4596, + "step": 4742 + }, + { + "epoch": 0.54, + "learning_rate": 9.125676931044928e-06, + "loss": 0.4573, + "step": 4743 + }, + { + "epoch": 0.54, + "learning_rate": 9.121989513937007e-06, + "loss": 0.456, + "step": 4744 + }, + { + "epoch": 0.54, + "learning_rate": 9.11830221713613e-06, + "loss": 0.4509, + "step": 4745 + }, + { + "epoch": 0.54, + "learning_rate": 9.11461504114753e-06, + "loss": 0.4743, + "step": 4746 + }, + { + "epoch": 0.54, + "learning_rate": 9.110927986476434e-06, + "loss": 0.4662, + "step": 4747 + }, + { + "epoch": 0.54, + "learning_rate": 9.107241053628058e-06, + "loss": 0.4476, + "step": 4748 + }, + { + "epoch": 0.54, + "learning_rate": 9.103554243107592e-06, + "loss": 0.4494, + "step": 4749 + }, + { + "epoch": 0.54, + "learning_rate": 9.0998675554202e-06, + "loss": 0.4702, + "step": 4750 + }, + { + "epoch": 0.54, + "learning_rate": 9.096180991071055e-06, + "loss": 0.4599, + "step": 4751 + }, + { + "epoch": 0.54, + "learning_rate": 9.09249455056529e-06, + "loss": 0.4609, + "step": 4752 + }, + { + "epoch": 0.54, + "learning_rate": 9.088808234408037e-06, + "loss": 0.4478, + "step": 4753 + }, + { + "epoch": 0.54, + "learning_rate": 9.0851220431044e-06, + "loss": 0.4582, + "step": 4754 + }, + { + 
"epoch": 0.54, + "learning_rate": 9.081435977159464e-06, + "loss": 0.4391, + "step": 4755 + }, + { + "epoch": 0.54, + "learning_rate": 9.07775003707831e-06, + "loss": 0.4849, + "step": 4756 + }, + { + "epoch": 0.54, + "learning_rate": 9.074064223365986e-06, + "loss": 0.4503, + "step": 4757 + }, + { + "epoch": 0.54, + "learning_rate": 9.070378536527544e-06, + "loss": 0.4415, + "step": 4758 + }, + { + "epoch": 0.54, + "learning_rate": 9.066692977067996e-06, + "loss": 0.4643, + "step": 4759 + }, + { + "epoch": 0.54, + "learning_rate": 9.063007545492342e-06, + "loss": 0.4695, + "step": 4760 + }, + { + "epoch": 0.54, + "learning_rate": 9.059322242305579e-06, + "loss": 0.4582, + "step": 4761 + }, + { + "epoch": 0.54, + "learning_rate": 9.055637068012664e-06, + "loss": 0.4431, + "step": 4762 + }, + { + "epoch": 0.54, + "learning_rate": 9.051952023118563e-06, + "loss": 0.4624, + "step": 4763 + }, + { + "epoch": 0.54, + "learning_rate": 9.048267108128198e-06, + "loss": 0.4788, + "step": 4764 + }, + { + "epoch": 0.54, + "learning_rate": 9.044582323546486e-06, + "loss": 0.457, + "step": 4765 + }, + { + "epoch": 0.54, + "learning_rate": 9.040897669878327e-06, + "loss": 0.4669, + "step": 4766 + }, + { + "epoch": 0.54, + "learning_rate": 9.037213147628603e-06, + "loss": 0.4772, + "step": 4767 + }, + { + "epoch": 0.54, + "learning_rate": 9.033528757302167e-06, + "loss": 0.4535, + "step": 4768 + }, + { + "epoch": 0.55, + "learning_rate": 9.029844499403876e-06, + "loss": 0.4751, + "step": 4769 + }, + { + "epoch": 0.55, + "learning_rate": 9.026160374438543e-06, + "loss": 0.4523, + "step": 4770 + }, + { + "epoch": 0.55, + "learning_rate": 9.022476382910983e-06, + "loss": 0.462, + "step": 4771 + }, + { + "epoch": 0.55, + "learning_rate": 9.018792525325986e-06, + "loss": 0.458, + "step": 4772 + }, + { + "epoch": 0.55, + "learning_rate": 9.015108802188314e-06, + "loss": 0.4503, + "step": 4773 + }, + { + "epoch": 0.55, + "learning_rate": 9.01142521400273e-06, + "loss": 0.4615, + "step": 
4774 + }, + { + "epoch": 0.55, + "learning_rate": 9.007741761273957e-06, + "loss": 0.447, + "step": 4775 + }, + { + "epoch": 0.55, + "learning_rate": 9.004058444506718e-06, + "loss": 0.4495, + "step": 4776 + }, + { + "epoch": 0.55, + "learning_rate": 9.000375264205713e-06, + "loss": 0.4651, + "step": 4777 + }, + { + "epoch": 0.55, + "learning_rate": 8.996692220875608e-06, + "loss": 0.448, + "step": 4778 + }, + { + "epoch": 0.55, + "learning_rate": 8.993009315021073e-06, + "loss": 0.467, + "step": 4779 + }, + { + "epoch": 0.55, + "learning_rate": 8.989326547146743e-06, + "loss": 0.4631, + "step": 4780 + }, + { + "epoch": 0.55, + "learning_rate": 8.985643917757237e-06, + "loss": 0.4623, + "step": 4781 + }, + { + "epoch": 0.55, + "learning_rate": 8.981961427357166e-06, + "loss": 0.459, + "step": 4782 + }, + { + "epoch": 0.55, + "learning_rate": 8.978279076451104e-06, + "loss": 0.4612, + "step": 4783 + }, + { + "epoch": 0.55, + "learning_rate": 8.974596865543624e-06, + "loss": 0.449, + "step": 4784 + }, + { + "epoch": 0.55, + "learning_rate": 8.970914795139264e-06, + "loss": 0.4779, + "step": 4785 + }, + { + "epoch": 0.55, + "learning_rate": 8.967232865742552e-06, + "loss": 0.4482, + "step": 4786 + }, + { + "epoch": 0.55, + "learning_rate": 8.963551077857999e-06, + "loss": 0.4515, + "step": 4787 + }, + { + "epoch": 0.55, + "learning_rate": 8.959869431990082e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 0.55, + "learning_rate": 8.956187928643281e-06, + "loss": 0.474, + "step": 4789 + }, + { + "epoch": 0.55, + "learning_rate": 8.952506568322036e-06, + "loss": 0.4525, + "step": 4790 + }, + { + "epoch": 0.55, + "learning_rate": 8.948825351530774e-06, + "loss": 0.4463, + "step": 4791 + }, + { + "epoch": 0.55, + "learning_rate": 8.945144278773914e-06, + "loss": 0.4477, + "step": 4792 + }, + { + "epoch": 0.55, + "learning_rate": 8.941463350555835e-06, + "loss": 0.4675, + "step": 4793 + }, + { + "epoch": 0.55, + "learning_rate": 8.937782567380908e-06, + "loss": 
0.4434, + "step": 4794 + }, + { + "epoch": 0.55, + "learning_rate": 8.93410192975349e-06, + "loss": 0.4687, + "step": 4795 + }, + { + "epoch": 0.55, + "learning_rate": 8.9304214381779e-06, + "loss": 0.4757, + "step": 4796 + }, + { + "epoch": 0.55, + "learning_rate": 8.926741093158456e-06, + "loss": 0.4694, + "step": 4797 + }, + { + "epoch": 0.55, + "learning_rate": 8.923060895199442e-06, + "loss": 0.4357, + "step": 4798 + }, + { + "epoch": 0.55, + "learning_rate": 8.919380844805129e-06, + "loss": 0.4529, + "step": 4799 + }, + { + "epoch": 0.55, + "learning_rate": 8.915700942479769e-06, + "loss": 0.4772, + "step": 4800 + }, + { + "epoch": 0.55, + "learning_rate": 8.912021188727585e-06, + "loss": 0.4556, + "step": 4801 + }, + { + "epoch": 0.55, + "learning_rate": 8.908341584052791e-06, + "loss": 0.4584, + "step": 4802 + }, + { + "epoch": 0.55, + "learning_rate": 8.904662128959571e-06, + "loss": 0.4709, + "step": 4803 + }, + { + "epoch": 0.55, + "learning_rate": 8.900982823952093e-06, + "loss": 0.4442, + "step": 4804 + }, + { + "epoch": 0.55, + "learning_rate": 8.897303669534508e-06, + "loss": 0.4682, + "step": 4805 + }, + { + "epoch": 0.55, + "learning_rate": 8.89362466621094e-06, + "loss": 0.4557, + "step": 4806 + }, + { + "epoch": 0.55, + "learning_rate": 8.88994581448549e-06, + "loss": 0.4719, + "step": 4807 + }, + { + "epoch": 0.55, + "learning_rate": 8.886267114862248e-06, + "loss": 0.4619, + "step": 4808 + }, + { + "epoch": 0.55, + "learning_rate": 8.882588567845275e-06, + "loss": 0.4531, + "step": 4809 + }, + { + "epoch": 0.55, + "learning_rate": 8.87891017393862e-06, + "loss": 0.4544, + "step": 4810 + }, + { + "epoch": 0.55, + "learning_rate": 8.8752319336463e-06, + "loss": 0.4691, + "step": 4811 + }, + { + "epoch": 0.55, + "learning_rate": 8.871553847472313e-06, + "loss": 0.4549, + "step": 4812 + }, + { + "epoch": 0.55, + "learning_rate": 8.867875915920645e-06, + "loss": 0.4411, + "step": 4813 + }, + { + "epoch": 0.55, + "learning_rate": 
8.86419813949525e-06, + "loss": 0.4696, + "step": 4814 + }, + { + "epoch": 0.55, + "learning_rate": 8.860520518700074e-06, + "loss": 0.4568, + "step": 4815 + }, + { + "epoch": 0.55, + "learning_rate": 8.856843054039026e-06, + "loss": 0.4446, + "step": 4816 + }, + { + "epoch": 0.55, + "learning_rate": 8.853165746015997e-06, + "loss": 0.4509, + "step": 4817 + }, + { + "epoch": 0.55, + "learning_rate": 8.849488595134867e-06, + "loss": 0.4493, + "step": 4818 + }, + { + "epoch": 0.55, + "learning_rate": 8.84581160189949e-06, + "loss": 0.4747, + "step": 4819 + }, + { + "epoch": 0.55, + "learning_rate": 8.842134766813687e-06, + "loss": 0.4503, + "step": 4820 + }, + { + "epoch": 0.55, + "learning_rate": 8.838458090381274e-06, + "loss": 0.4668, + "step": 4821 + }, + { + "epoch": 0.55, + "learning_rate": 8.834781573106035e-06, + "loss": 0.4503, + "step": 4822 + }, + { + "epoch": 0.55, + "learning_rate": 8.831105215491734e-06, + "loss": 0.4464, + "step": 4823 + }, + { + "epoch": 0.55, + "learning_rate": 8.827429018042119e-06, + "loss": 0.467, + "step": 4824 + }, + { + "epoch": 0.55, + "learning_rate": 8.823752981260904e-06, + "loss": 0.4819, + "step": 4825 + }, + { + "epoch": 0.55, + "learning_rate": 8.820077105651794e-06, + "loss": 0.4618, + "step": 4826 + }, + { + "epoch": 0.55, + "learning_rate": 8.81640139171846e-06, + "loss": 0.478, + "step": 4827 + }, + { + "epoch": 0.55, + "learning_rate": 8.812725839964564e-06, + "loss": 0.4558, + "step": 4828 + }, + { + "epoch": 0.55, + "learning_rate": 8.809050450893737e-06, + "loss": 0.4699, + "step": 4829 + }, + { + "epoch": 0.55, + "learning_rate": 8.805375225009583e-06, + "loss": 0.4374, + "step": 4830 + }, + { + "epoch": 0.55, + "learning_rate": 8.801700162815698e-06, + "loss": 0.465, + "step": 4831 + }, + { + "epoch": 0.55, + "learning_rate": 8.798025264815643e-06, + "loss": 0.4757, + "step": 4832 + }, + { + "epoch": 0.55, + "learning_rate": 8.79435053151296e-06, + "loss": 0.4561, + "step": 4833 + }, + { + "epoch": 0.55, + 
"learning_rate": 8.790675963411175e-06, + "loss": 0.4736, + "step": 4834 + }, + { + "epoch": 0.55, + "learning_rate": 8.787001561013779e-06, + "loss": 0.4534, + "step": 4835 + }, + { + "epoch": 0.55, + "learning_rate": 8.783327324824255e-06, + "loss": 0.4696, + "step": 4836 + }, + { + "epoch": 0.55, + "learning_rate": 8.779653255346049e-06, + "loss": 0.4809, + "step": 4837 + }, + { + "epoch": 0.55, + "learning_rate": 8.77597935308259e-06, + "loss": 0.4455, + "step": 4838 + }, + { + "epoch": 0.55, + "learning_rate": 8.772305618537293e-06, + "loss": 0.4506, + "step": 4839 + }, + { + "epoch": 0.55, + "learning_rate": 8.768632052213532e-06, + "loss": 0.4514, + "step": 4840 + }, + { + "epoch": 0.55, + "learning_rate": 8.764958654614673e-06, + "loss": 0.4674, + "step": 4841 + }, + { + "epoch": 0.55, + "learning_rate": 8.761285426244053e-06, + "loss": 0.45, + "step": 4842 + }, + { + "epoch": 0.55, + "learning_rate": 8.757612367604983e-06, + "loss": 0.4517, + "step": 4843 + }, + { + "epoch": 0.55, + "learning_rate": 8.753939479200758e-06, + "loss": 0.4691, + "step": 4844 + }, + { + "epoch": 0.55, + "learning_rate": 8.750266761534641e-06, + "loss": 0.4638, + "step": 4845 + }, + { + "epoch": 0.55, + "learning_rate": 8.746594215109884e-06, + "loss": 0.4645, + "step": 4846 + }, + { + "epoch": 0.55, + "learning_rate": 8.742921840429702e-06, + "loss": 0.4585, + "step": 4847 + }, + { + "epoch": 0.55, + "learning_rate": 8.73924963799729e-06, + "loss": 0.4481, + "step": 4848 + }, + { + "epoch": 0.55, + "learning_rate": 8.73557760831583e-06, + "loss": 0.4553, + "step": 4849 + }, + { + "epoch": 0.55, + "learning_rate": 8.731905751888466e-06, + "loss": 0.4728, + "step": 4850 + }, + { + "epoch": 0.55, + "learning_rate": 8.728234069218325e-06, + "loss": 0.459, + "step": 4851 + }, + { + "epoch": 0.55, + "learning_rate": 8.724562560808512e-06, + "loss": 0.4681, + "step": 4852 + }, + { + "epoch": 0.55, + "learning_rate": 8.720891227162099e-06, + "loss": 0.4699, + "step": 4853 + }, + { + 
"epoch": 0.55, + "learning_rate": 8.71722006878215e-06, + "loss": 0.4628, + "step": 4854 + }, + { + "epoch": 0.55, + "learning_rate": 8.71354908617169e-06, + "loss": 0.4618, + "step": 4855 + }, + { + "epoch": 0.55, + "learning_rate": 8.709878279833725e-06, + "loss": 0.4481, + "step": 4856 + }, + { + "epoch": 0.56, + "learning_rate": 8.706207650271243e-06, + "loss": 0.4791, + "step": 4857 + }, + { + "epoch": 0.56, + "learning_rate": 8.702537197987193e-06, + "loss": 0.4634, + "step": 4858 + }, + { + "epoch": 0.56, + "learning_rate": 8.698866923484521e-06, + "loss": 0.4584, + "step": 4859 + }, + { + "epoch": 0.56, + "learning_rate": 8.695196827266126e-06, + "loss": 0.4717, + "step": 4860 + }, + { + "epoch": 0.56, + "learning_rate": 8.691526909834895e-06, + "loss": 0.4573, + "step": 4861 + }, + { + "epoch": 0.56, + "learning_rate": 8.687857171693693e-06, + "loss": 0.4662, + "step": 4862 + }, + { + "epoch": 0.56, + "learning_rate": 8.684187613345356e-06, + "loss": 0.4881, + "step": 4863 + }, + { + "epoch": 0.56, + "learning_rate": 8.680518235292688e-06, + "loss": 0.4418, + "step": 4864 + }, + { + "epoch": 0.56, + "learning_rate": 8.676849038038483e-06, + "loss": 0.4666, + "step": 4865 + }, + { + "epoch": 0.56, + "learning_rate": 8.673180022085499e-06, + "loss": 0.4722, + "step": 4866 + }, + { + "epoch": 0.56, + "learning_rate": 8.669511187936478e-06, + "loss": 0.4519, + "step": 4867 + }, + { + "epoch": 0.56, + "learning_rate": 8.66584253609413e-06, + "loss": 0.4627, + "step": 4868 + }, + { + "epoch": 0.56, + "learning_rate": 8.662174067061135e-06, + "loss": 0.4508, + "step": 4869 + }, + { + "epoch": 0.56, + "learning_rate": 8.658505781340166e-06, + "loss": 0.4772, + "step": 4870 + }, + { + "epoch": 0.56, + "learning_rate": 8.654837679433852e-06, + "loss": 0.4594, + "step": 4871 + }, + { + "epoch": 0.56, + "learning_rate": 8.651169761844812e-06, + "loss": 0.4637, + "step": 4872 + }, + { + "epoch": 0.56, + "learning_rate": 8.64750202907563e-06, + "loss": 0.4445, + "step": 
4873 + }, + { + "epoch": 0.56, + "learning_rate": 8.643834481628861e-06, + "loss": 0.4444, + "step": 4874 + }, + { + "epoch": 0.56, + "learning_rate": 8.640167120007047e-06, + "loss": 0.4511, + "step": 4875 + }, + { + "epoch": 0.56, + "learning_rate": 8.636499944712702e-06, + "loss": 0.4727, + "step": 4876 + }, + { + "epoch": 0.56, + "learning_rate": 8.6328329562483e-06, + "loss": 0.4674, + "step": 4877 + }, + { + "epoch": 0.56, + "learning_rate": 8.629166155116312e-06, + "loss": 0.48, + "step": 4878 + }, + { + "epoch": 0.56, + "learning_rate": 8.625499541819163e-06, + "loss": 0.4411, + "step": 4879 + }, + { + "epoch": 0.56, + "learning_rate": 8.621833116859264e-06, + "loss": 0.4496, + "step": 4880 + }, + { + "epoch": 0.56, + "learning_rate": 8.618166880739e-06, + "loss": 0.5047, + "step": 4881 + }, + { + "epoch": 0.56, + "learning_rate": 8.614500833960722e-06, + "loss": 0.4631, + "step": 4882 + }, + { + "epoch": 0.56, + "learning_rate": 8.610834977026765e-06, + "loss": 0.4548, + "step": 4883 + }, + { + "epoch": 0.56, + "learning_rate": 8.607169310439427e-06, + "loss": 0.4439, + "step": 4884 + }, + { + "epoch": 0.56, + "learning_rate": 8.603503834700993e-06, + "loss": 0.4659, + "step": 4885 + }, + { + "epoch": 0.56, + "learning_rate": 8.599838550313714e-06, + "loss": 0.4751, + "step": 4886 + }, + { + "epoch": 0.56, + "learning_rate": 8.596173457779807e-06, + "loss": 0.4699, + "step": 4887 + }, + { + "epoch": 0.56, + "learning_rate": 8.592508557601484e-06, + "loss": 0.4542, + "step": 4888 + }, + { + "epoch": 0.56, + "learning_rate": 8.588843850280911e-06, + "loss": 0.4825, + "step": 4889 + }, + { + "epoch": 0.56, + "learning_rate": 8.585179336320235e-06, + "loss": 0.4437, + "step": 4890 + }, + { + "epoch": 0.56, + "learning_rate": 8.58151501622158e-06, + "loss": 0.455, + "step": 4891 + }, + { + "epoch": 0.56, + "learning_rate": 8.577850890487035e-06, + "loss": 0.4626, + "step": 4892 + }, + { + "epoch": 0.56, + "learning_rate": 8.574186959618671e-06, + "loss": 
0.4551, + "step": 4893 + }, + { + "epoch": 0.56, + "learning_rate": 8.570523224118526e-06, + "loss": 0.472, + "step": 4894 + }, + { + "epoch": 0.56, + "learning_rate": 8.566859684488611e-06, + "loss": 0.4657, + "step": 4895 + }, + { + "epoch": 0.56, + "learning_rate": 8.56319634123092e-06, + "loss": 0.4572, + "step": 4896 + }, + { + "epoch": 0.56, + "learning_rate": 8.559533194847406e-06, + "loss": 0.4532, + "step": 4897 + }, + { + "epoch": 0.56, + "learning_rate": 8.555870245840005e-06, + "loss": 0.4627, + "step": 4898 + }, + { + "epoch": 0.56, + "learning_rate": 8.552207494710623e-06, + "loss": 0.4717, + "step": 4899 + }, + { + "epoch": 0.56, + "learning_rate": 8.548544941961134e-06, + "loss": 0.458, + "step": 4900 + }, + { + "epoch": 0.56, + "learning_rate": 8.544882588093399e-06, + "loss": 0.4661, + "step": 4901 + }, + { + "epoch": 0.56, + "learning_rate": 8.541220433609234e-06, + "loss": 0.4533, + "step": 4902 + }, + { + "epoch": 0.56, + "learning_rate": 8.53755847901044e-06, + "loss": 0.4695, + "step": 4903 + }, + { + "epoch": 0.56, + "learning_rate": 8.533896724798784e-06, + "loss": 0.4595, + "step": 4904 + }, + { + "epoch": 0.56, + "learning_rate": 8.530235171476005e-06, + "loss": 0.4565, + "step": 4905 + }, + { + "epoch": 0.56, + "learning_rate": 8.526573819543828e-06, + "loss": 0.4569, + "step": 4906 + }, + { + "epoch": 0.56, + "learning_rate": 8.522912669503932e-06, + "loss": 0.4691, + "step": 4907 + }, + { + "epoch": 0.56, + "learning_rate": 8.519251721857977e-06, + "loss": 0.4539, + "step": 4908 + }, + { + "epoch": 0.56, + "learning_rate": 8.515590977107597e-06, + "loss": 0.4729, + "step": 4909 + }, + { + "epoch": 0.56, + "learning_rate": 8.511930435754391e-06, + "loss": 0.4549, + "step": 4910 + }, + { + "epoch": 0.56, + "learning_rate": 8.508270098299943e-06, + "loss": 0.4571, + "step": 4911 + }, + { + "epoch": 0.56, + "learning_rate": 8.504609965245793e-06, + "loss": 0.4656, + "step": 4912 + }, + { + "epoch": 0.56, + "learning_rate": 
8.500950037093462e-06, + "loss": 0.4603, + "step": 4913 + }, + { + "epoch": 0.56, + "learning_rate": 8.49729031434445e-06, + "loss": 0.4627, + "step": 4914 + }, + { + "epoch": 0.56, + "learning_rate": 8.493630797500214e-06, + "loss": 0.4876, + "step": 4915 + }, + { + "epoch": 0.56, + "learning_rate": 8.489971487062184e-06, + "loss": 0.4382, + "step": 4916 + }, + { + "epoch": 0.56, + "learning_rate": 8.486312383531777e-06, + "loss": 0.4776, + "step": 4917 + }, + { + "epoch": 0.56, + "learning_rate": 8.482653487410367e-06, + "loss": 0.4457, + "step": 4918 + }, + { + "epoch": 0.56, + "learning_rate": 8.478994799199308e-06, + "loss": 0.4529, + "step": 4919 + }, + { + "epoch": 0.56, + "learning_rate": 8.47533631939992e-06, + "loss": 0.4611, + "step": 4920 + }, + { + "epoch": 0.56, + "learning_rate": 8.471678048513494e-06, + "loss": 0.4637, + "step": 4921 + }, + { + "epoch": 0.56, + "learning_rate": 8.468019987041298e-06, + "loss": 0.4798, + "step": 4922 + }, + { + "epoch": 0.56, + "learning_rate": 8.464362135484564e-06, + "loss": 0.4612, + "step": 4923 + }, + { + "epoch": 0.56, + "learning_rate": 8.460704494344508e-06, + "loss": 0.45, + "step": 4924 + }, + { + "epoch": 0.56, + "learning_rate": 8.4570470641223e-06, + "loss": 0.4651, + "step": 4925 + }, + { + "epoch": 0.56, + "learning_rate": 8.453389845319092e-06, + "loss": 0.4471, + "step": 4926 + }, + { + "epoch": 0.56, + "learning_rate": 8.449732838436006e-06, + "loss": 0.4679, + "step": 4927 + }, + { + "epoch": 0.56, + "learning_rate": 8.44607604397413e-06, + "loss": 0.4558, + "step": 4928 + }, + { + "epoch": 0.56, + "learning_rate": 8.442419462434533e-06, + "loss": 0.4744, + "step": 4929 + }, + { + "epoch": 0.56, + "learning_rate": 8.438763094318245e-06, + "loss": 0.4627, + "step": 4930 + }, + { + "epoch": 0.56, + "learning_rate": 8.435106940126266e-06, + "loss": 0.4728, + "step": 4931 + }, + { + "epoch": 0.56, + "learning_rate": 8.431451000359575e-06, + "loss": 0.4689, + "step": 4932 + }, + { + "epoch": 0.56, + 
"learning_rate": 8.42779527551912e-06, + "loss": 0.4442, + "step": 4933 + }, + { + "epoch": 0.56, + "learning_rate": 8.424139766105808e-06, + "loss": 0.4379, + "step": 4934 + }, + { + "epoch": 0.56, + "learning_rate": 8.420484472620535e-06, + "loss": 0.4609, + "step": 4935 + }, + { + "epoch": 0.56, + "learning_rate": 8.41682939556415e-06, + "loss": 0.4641, + "step": 4936 + }, + { + "epoch": 0.56, + "learning_rate": 8.413174535437486e-06, + "loss": 0.4572, + "step": 4937 + }, + { + "epoch": 0.56, + "learning_rate": 8.409519892741342e-06, + "loss": 0.4448, + "step": 4938 + }, + { + "epoch": 0.56, + "learning_rate": 8.405865467976477e-06, + "loss": 0.4714, + "step": 4939 + }, + { + "epoch": 0.56, + "learning_rate": 8.402211261643638e-06, + "loss": 0.4533, + "step": 4940 + }, + { + "epoch": 0.56, + "learning_rate": 8.398557274243524e-06, + "loss": 0.4597, + "step": 4941 + }, + { + "epoch": 0.56, + "learning_rate": 8.39490350627682e-06, + "loss": 0.4445, + "step": 4942 + }, + { + "epoch": 0.56, + "learning_rate": 8.391249958244173e-06, + "loss": 0.4642, + "step": 4943 + }, + { + "epoch": 0.57, + "learning_rate": 8.387596630646195e-06, + "loss": 0.4629, + "step": 4944 + }, + { + "epoch": 0.57, + "learning_rate": 8.383943523983482e-06, + "loss": 0.445, + "step": 4945 + }, + { + "epoch": 0.57, + "learning_rate": 8.380290638756584e-06, + "loss": 0.4638, + "step": 4946 + }, + { + "epoch": 0.57, + "learning_rate": 8.376637975466029e-06, + "loss": 0.4669, + "step": 4947 + }, + { + "epoch": 0.57, + "learning_rate": 8.372985534612317e-06, + "loss": 0.4508, + "step": 4948 + }, + { + "epoch": 0.57, + "learning_rate": 8.369333316695909e-06, + "loss": 0.4773, + "step": 4949 + }, + { + "epoch": 0.57, + "learning_rate": 8.365681322217247e-06, + "loss": 0.4547, + "step": 4950 + }, + { + "epoch": 0.57, + "learning_rate": 8.362029551676731e-06, + "loss": 0.4548, + "step": 4951 + }, + { + "epoch": 0.57, + "learning_rate": 8.358378005574731e-06, + "loss": 0.4562, + "step": 4952 + }, + { + 
"epoch": 0.57, + "learning_rate": 8.354726684411604e-06, + "loss": 0.4575, + "step": 4953 + }, + { + "epoch": 0.57, + "learning_rate": 8.351075588687648e-06, + "loss": 0.4677, + "step": 4954 + }, + { + "epoch": 0.57, + "learning_rate": 8.347424718903152e-06, + "loss": 0.4593, + "step": 4955 + }, + { + "epoch": 0.57, + "learning_rate": 8.343774075558366e-06, + "loss": 0.4511, + "step": 4956 + }, + { + "epoch": 0.57, + "learning_rate": 8.340123659153506e-06, + "loss": 0.4596, + "step": 4957 + }, + { + "epoch": 0.57, + "learning_rate": 8.336473470188767e-06, + "loss": 0.4628, + "step": 4958 + }, + { + "epoch": 0.57, + "learning_rate": 8.3328235091643e-06, + "loss": 0.4524, + "step": 4959 + }, + { + "epoch": 0.57, + "learning_rate": 8.329173776580236e-06, + "loss": 0.4604, + "step": 4960 + }, + { + "epoch": 0.57, + "learning_rate": 8.325524272936668e-06, + "loss": 0.4824, + "step": 4961 + }, + { + "epoch": 0.57, + "learning_rate": 8.321874998733654e-06, + "loss": 0.4744, + "step": 4962 + }, + { + "epoch": 0.57, + "learning_rate": 8.318225954471238e-06, + "loss": 0.4642, + "step": 4963 + }, + { + "epoch": 0.57, + "learning_rate": 8.31457714064941e-06, + "loss": 0.4616, + "step": 4964 + }, + { + "epoch": 0.57, + "learning_rate": 8.310928557768145e-06, + "loss": 0.4421, + "step": 4965 + }, + { + "epoch": 0.57, + "learning_rate": 8.307280206327376e-06, + "loss": 0.4755, + "step": 4966 + }, + { + "epoch": 0.57, + "learning_rate": 8.30363208682701e-06, + "loss": 0.4527, + "step": 4967 + }, + { + "epoch": 0.57, + "learning_rate": 8.299984199766925e-06, + "loss": 0.4444, + "step": 4968 + }, + { + "epoch": 0.57, + "learning_rate": 8.296336545646957e-06, + "loss": 0.4497, + "step": 4969 + }, + { + "epoch": 0.57, + "learning_rate": 8.292689124966917e-06, + "loss": 0.4672, + "step": 4970 + }, + { + "epoch": 0.57, + "learning_rate": 8.28904193822659e-06, + "loss": 0.4503, + "step": 4971 + }, + { + "epoch": 0.57, + "learning_rate": 8.285394985925714e-06, + "loss": 0.4621, + "step": 
4972 + }, + { + "epoch": 0.57, + "learning_rate": 8.281748268564002e-06, + "loss": 0.459, + "step": 4973 + }, + { + "epoch": 0.57, + "learning_rate": 8.278101786641142e-06, + "loss": 0.4655, + "step": 4974 + }, + { + "epoch": 0.57, + "learning_rate": 8.27445554065678e-06, + "loss": 0.4612, + "step": 4975 + }, + { + "epoch": 0.57, + "learning_rate": 8.270809531110536e-06, + "loss": 0.4627, + "step": 4976 + }, + { + "epoch": 0.57, + "learning_rate": 8.267163758501992e-06, + "loss": 0.4448, + "step": 4977 + }, + { + "epoch": 0.57, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4608, + "step": 4978 + }, + { + "epoch": 0.57, + "learning_rate": 8.259872926096177e-06, + "loss": 0.4677, + "step": 4979 + }, + { + "epoch": 0.57, + "learning_rate": 8.256227867297915e-06, + "loss": 0.47, + "step": 4980 + }, + { + "epoch": 0.57, + "learning_rate": 8.25258304743537e-06, + "loss": 0.4541, + "step": 4981 + }, + { + "epoch": 0.57, + "learning_rate": 8.24893846700796e-06, + "loss": 0.4523, + "step": 4982 + }, + { + "epoch": 0.57, + "learning_rate": 8.245294126515073e-06, + "loss": 0.4618, + "step": 4983 + }, + { + "epoch": 0.57, + "learning_rate": 8.241650026456065e-06, + "loss": 0.4756, + "step": 4984 + }, + { + "epoch": 0.57, + "learning_rate": 8.238006167330266e-06, + "loss": 0.44, + "step": 4985 + }, + { + "epoch": 0.57, + "learning_rate": 8.234362549636953e-06, + "loss": 0.4509, + "step": 4986 + }, + { + "epoch": 0.57, + "learning_rate": 8.230719173875399e-06, + "loss": 0.4718, + "step": 4987 + }, + { + "epoch": 0.57, + "learning_rate": 8.227076040544813e-06, + "loss": 0.4867, + "step": 4988 + }, + { + "epoch": 0.57, + "learning_rate": 8.223433150144393e-06, + "loss": 0.4502, + "step": 4989 + }, + { + "epoch": 0.57, + "learning_rate": 8.2197905031733e-06, + "loss": 0.4644, + "step": 4990 + }, + { + "epoch": 0.57, + "learning_rate": 8.216148100130647e-06, + "loss": 0.4476, + "step": 4991 + }, + { + "epoch": 0.57, + "learning_rate": 8.212505941515536e-06, + "loss": 0.4573, 
+ "step": 4992 + }, + { + "epoch": 0.57, + "learning_rate": 8.208864027827015e-06, + "loss": 0.474, + "step": 4993 + }, + { + "epoch": 0.57, + "learning_rate": 8.205222359564113e-06, + "loss": 0.4377, + "step": 4994 + }, + { + "epoch": 0.57, + "learning_rate": 8.20158093722582e-06, + "loss": 0.462, + "step": 4995 + }, + { + "epoch": 0.57, + "learning_rate": 8.197939761311087e-06, + "loss": 0.4753, + "step": 4996 + }, + { + "epoch": 0.57, + "learning_rate": 8.194298832318843e-06, + "loss": 0.4643, + "step": 4997 + }, + { + "epoch": 0.57, + "learning_rate": 8.190658150747973e-06, + "loss": 0.4496, + "step": 4998 + }, + { + "epoch": 0.57, + "learning_rate": 8.187017717097327e-06, + "loss": 0.4611, + "step": 4999 + }, + { + "epoch": 0.57, + "learning_rate": 8.183377531865737e-06, + "loss": 0.4553, + "step": 5000 + }, + { + "epoch": 0.57, + "learning_rate": 8.179737595551979e-06, + "loss": 0.4481, + "step": 5001 + }, + { + "epoch": 0.57, + "learning_rate": 8.176097908654814e-06, + "loss": 0.4679, + "step": 5002 + }, + { + "epoch": 0.57, + "learning_rate": 8.172458471672953e-06, + "loss": 0.4676, + "step": 5003 + }, + { + "epoch": 0.57, + "learning_rate": 8.168819285105084e-06, + "loss": 0.4493, + "step": 5004 + }, + { + "epoch": 0.57, + "learning_rate": 8.165180349449857e-06, + "loss": 0.4536, + "step": 5005 + }, + { + "epoch": 0.57, + "learning_rate": 8.161541665205885e-06, + "loss": 0.4662, + "step": 5006 + }, + { + "epoch": 0.57, + "learning_rate": 8.157903232871755e-06, + "loss": 0.4608, + "step": 5007 + }, + { + "epoch": 0.57, + "learning_rate": 8.154265052946005e-06, + "loss": 0.4607, + "step": 5008 + }, + { + "epoch": 0.57, + "learning_rate": 8.15062712592715e-06, + "loss": 0.4508, + "step": 5009 + }, + { + "epoch": 0.57, + "learning_rate": 8.146989452313671e-06, + "loss": 0.4459, + "step": 5010 + }, + { + "epoch": 0.57, + "learning_rate": 8.143352032604007e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 0.57, + "learning_rate": 8.139714867296567e-06, + 
"loss": 0.4647, + "step": 5012 + }, + { + "epoch": 0.57, + "learning_rate": 8.136077956889723e-06, + "loss": 0.4756, + "step": 5013 + }, + { + "epoch": 0.57, + "learning_rate": 8.13244130188181e-06, + "loss": 0.4675, + "step": 5014 + }, + { + "epoch": 0.57, + "learning_rate": 8.128804902771137e-06, + "loss": 0.4786, + "step": 5015 + }, + { + "epoch": 0.57, + "learning_rate": 8.125168760055971e-06, + "loss": 0.471, + "step": 5016 + }, + { + "epoch": 0.57, + "learning_rate": 8.121532874234539e-06, + "loss": 0.4604, + "step": 5017 + }, + { + "epoch": 0.57, + "learning_rate": 8.117897245805044e-06, + "loss": 0.4471, + "step": 5018 + }, + { + "epoch": 0.57, + "learning_rate": 8.114261875265643e-06, + "loss": 0.4433, + "step": 5019 + }, + { + "epoch": 0.57, + "learning_rate": 8.11062676311447e-06, + "loss": 0.4612, + "step": 5020 + }, + { + "epoch": 0.57, + "learning_rate": 8.106991909849613e-06, + "loss": 0.4797, + "step": 5021 + }, + { + "epoch": 0.57, + "learning_rate": 8.103357315969127e-06, + "loss": 0.4456, + "step": 5022 + }, + { + "epoch": 0.57, + "learning_rate": 8.099722981971035e-06, + "loss": 0.4634, + "step": 5023 + }, + { + "epoch": 0.57, + "learning_rate": 8.096088908353316e-06, + "loss": 0.4674, + "step": 5024 + }, + { + "epoch": 0.57, + "learning_rate": 8.092455095613927e-06, + "loss": 0.4568, + "step": 5025 + }, + { + "epoch": 0.57, + "learning_rate": 8.088821544250778e-06, + "loss": 0.4572, + "step": 5026 + }, + { + "epoch": 0.57, + "learning_rate": 8.085188254761744e-06, + "loss": 0.4588, + "step": 5027 + }, + { + "epoch": 0.57, + "learning_rate": 8.081555227644674e-06, + "loss": 0.4563, + "step": 5028 + }, + { + "epoch": 0.57, + "learning_rate": 8.077922463397371e-06, + "loss": 0.4857, + "step": 5029 + }, + { + "epoch": 0.57, + "learning_rate": 8.074289962517597e-06, + "loss": 0.4454, + "step": 5030 + }, + { + "epoch": 0.57, + "learning_rate": 8.070657725503094e-06, + "loss": 0.4636, + "step": 5031 + }, + { + "epoch": 0.58, + "learning_rate": 
8.067025752851555e-06, + "loss": 0.4605, + "step": 5032 + }, + { + "epoch": 0.58, + "learning_rate": 8.063394045060648e-06, + "loss": 0.4575, + "step": 5033 + }, + { + "epoch": 0.58, + "learning_rate": 8.059762602627995e-06, + "loss": 0.4647, + "step": 5034 + }, + { + "epoch": 0.58, + "learning_rate": 8.05613142605118e-06, + "loss": 0.4465, + "step": 5035 + }, + { + "epoch": 0.58, + "learning_rate": 8.052500515827759e-06, + "loss": 0.4594, + "step": 5036 + }, + { + "epoch": 0.58, + "learning_rate": 8.048869872455246e-06, + "loss": 0.4623, + "step": 5037 + }, + { + "epoch": 0.58, + "learning_rate": 8.045239496431125e-06, + "loss": 0.4749, + "step": 5038 + }, + { + "epoch": 0.58, + "learning_rate": 8.041609388252836e-06, + "loss": 0.4546, + "step": 5039 + }, + { + "epoch": 0.58, + "learning_rate": 8.037979548417781e-06, + "loss": 0.4709, + "step": 5040 + }, + { + "epoch": 0.58, + "learning_rate": 8.034349977423332e-06, + "loss": 0.4572, + "step": 5041 + }, + { + "epoch": 0.58, + "learning_rate": 8.030720675766825e-06, + "loss": 0.4705, + "step": 5042 + }, + { + "epoch": 0.58, + "learning_rate": 8.027091643945548e-06, + "loss": 0.4479, + "step": 5043 + }, + { + "epoch": 0.58, + "learning_rate": 8.023462882456768e-06, + "loss": 0.4596, + "step": 5044 + }, + { + "epoch": 0.58, + "learning_rate": 8.019834391797696e-06, + "loss": 0.4481, + "step": 5045 + }, + { + "epoch": 0.58, + "learning_rate": 8.016206172465525e-06, + "loss": 0.4726, + "step": 5046 + }, + { + "epoch": 0.58, + "learning_rate": 8.0125782249574e-06, + "loss": 0.4526, + "step": 5047 + }, + { + "epoch": 0.58, + "learning_rate": 8.008950549770426e-06, + "loss": 0.4478, + "step": 5048 + }, + { + "epoch": 0.58, + "learning_rate": 8.005323147401684e-06, + "loss": 0.4693, + "step": 5049 + }, + { + "epoch": 0.58, + "learning_rate": 8.0016960183482e-06, + "loss": 0.4515, + "step": 5050 + }, + { + "epoch": 0.58, + "learning_rate": 7.998069163106977e-06, + "loss": 0.4497, + "step": 5051 + }, + { + "epoch": 0.58, + 
"learning_rate": 7.994442582174976e-06, + "loss": 0.4524, + "step": 5052 + }, + { + "epoch": 0.58, + "learning_rate": 7.990816276049115e-06, + "loss": 0.4489, + "step": 5053 + }, + { + "epoch": 0.58, + "learning_rate": 7.987190245226285e-06, + "loss": 0.4759, + "step": 5054 + }, + { + "epoch": 0.58, + "learning_rate": 7.983564490203328e-06, + "loss": 0.4643, + "step": 5055 + }, + { + "epoch": 0.58, + "learning_rate": 7.979939011477052e-06, + "loss": 0.4524, + "step": 5056 + }, + { + "epoch": 0.58, + "learning_rate": 7.976313809544237e-06, + "loss": 0.4649, + "step": 5057 + }, + { + "epoch": 0.58, + "learning_rate": 7.972688884901606e-06, + "loss": 0.439, + "step": 5058 + }, + { + "epoch": 0.58, + "learning_rate": 7.969064238045865e-06, + "loss": 0.4411, + "step": 5059 + }, + { + "epoch": 0.58, + "learning_rate": 7.965439869473664e-06, + "loss": 0.4819, + "step": 5060 + }, + { + "epoch": 0.58, + "learning_rate": 7.961815779681624e-06, + "loss": 0.4461, + "step": 5061 + }, + { + "epoch": 0.58, + "learning_rate": 7.95819196916633e-06, + "loss": 0.4635, + "step": 5062 + }, + { + "epoch": 0.58, + "learning_rate": 7.954568438424315e-06, + "loss": 0.44, + "step": 5063 + }, + { + "epoch": 0.58, + "learning_rate": 7.9509451879521e-06, + "loss": 0.4765, + "step": 5064 + }, + { + "epoch": 0.58, + "learning_rate": 7.947322218246136e-06, + "loss": 0.4726, + "step": 5065 + }, + { + "epoch": 0.58, + "learning_rate": 7.943699529802854e-06, + "loss": 0.4714, + "step": 5066 + }, + { + "epoch": 0.58, + "learning_rate": 7.940077123118654e-06, + "loss": 0.4558, + "step": 5067 + }, + { + "epoch": 0.58, + "learning_rate": 7.936454998689875e-06, + "loss": 0.4609, + "step": 5068 + }, + { + "epoch": 0.58, + "learning_rate": 7.932833157012829e-06, + "loss": 0.4656, + "step": 5069 + }, + { + "epoch": 0.58, + "learning_rate": 7.929211598583795e-06, + "loss": 0.4652, + "step": 5070 + }, + { + "epoch": 0.58, + "learning_rate": 7.925590323899002e-06, + "loss": 0.4624, + "step": 5071 + }, + { + 
"epoch": 0.58, + "learning_rate": 7.921969333454652e-06, + "loss": 0.46, + "step": 5072 + }, + { + "epoch": 0.58, + "learning_rate": 7.918348627746896e-06, + "loss": 0.4763, + "step": 5073 + }, + { + "epoch": 0.58, + "learning_rate": 7.914728207271853e-06, + "loss": 0.4567, + "step": 5074 + }, + { + "epoch": 0.58, + "learning_rate": 7.911108072525603e-06, + "loss": 0.4655, + "step": 5075 + }, + { + "epoch": 0.58, + "learning_rate": 7.90748822400418e-06, + "loss": 0.4465, + "step": 5076 + }, + { + "epoch": 0.58, + "learning_rate": 7.903868662203594e-06, + "loss": 0.4486, + "step": 5077 + }, + { + "epoch": 0.58, + "learning_rate": 7.900249387619797e-06, + "loss": 0.4767, + "step": 5078 + }, + { + "epoch": 0.58, + "learning_rate": 7.896630400748715e-06, + "loss": 0.4576, + "step": 5079 + }, + { + "epoch": 0.58, + "learning_rate": 7.893011702086225e-06, + "loss": 0.4671, + "step": 5080 + }, + { + "epoch": 0.58, + "learning_rate": 7.889393292128177e-06, + "loss": 0.4425, + "step": 5081 + }, + { + "epoch": 0.58, + "learning_rate": 7.885775171370364e-06, + "loss": 0.4573, + "step": 5082 + }, + { + "epoch": 0.58, + "learning_rate": 7.88215734030856e-06, + "loss": 0.4664, + "step": 5083 + }, + { + "epoch": 0.58, + "learning_rate": 7.878539799438479e-06, + "loss": 0.464, + "step": 5084 + }, + { + "epoch": 0.58, + "learning_rate": 7.874922549255814e-06, + "loss": 0.444, + "step": 5085 + }, + { + "epoch": 0.58, + "learning_rate": 7.871305590256206e-06, + "loss": 0.4777, + "step": 5086 + }, + { + "epoch": 0.58, + "learning_rate": 7.867688922935253e-06, + "loss": 0.4662, + "step": 5087 + }, + { + "epoch": 0.58, + "learning_rate": 7.864072547788526e-06, + "loss": 0.4694, + "step": 5088 + }, + { + "epoch": 0.58, + "learning_rate": 7.860456465311544e-06, + "loss": 0.4534, + "step": 5089 + }, + { + "epoch": 0.58, + "learning_rate": 7.856840675999799e-06, + "loss": 0.4482, + "step": 5090 + }, + { + "epoch": 0.58, + "learning_rate": 7.85322518034873e-06, + "loss": 0.4837, + "step": 
5091 + }, + { + "epoch": 0.58, + "learning_rate": 7.849609978853735e-06, + "loss": 0.4548, + "step": 5092 + }, + { + "epoch": 0.58, + "learning_rate": 7.845995072010188e-06, + "loss": 0.4736, + "step": 5093 + }, + { + "epoch": 0.58, + "learning_rate": 7.842380460313408e-06, + "loss": 0.4498, + "step": 5094 + }, + { + "epoch": 0.58, + "learning_rate": 7.83876614425867e-06, + "loss": 0.4545, + "step": 5095 + }, + { + "epoch": 0.58, + "learning_rate": 7.835152124341228e-06, + "loss": 0.459, + "step": 5096 + }, + { + "epoch": 0.58, + "learning_rate": 7.831538401056274e-06, + "loss": 0.4741, + "step": 5097 + }, + { + "epoch": 0.58, + "learning_rate": 7.827924974898973e-06, + "loss": 0.4798, + "step": 5098 + }, + { + "epoch": 0.58, + "learning_rate": 7.824311846364448e-06, + "loss": 0.4477, + "step": 5099 + }, + { + "epoch": 0.58, + "learning_rate": 7.82069901594777e-06, + "loss": 0.4804, + "step": 5100 + }, + { + "epoch": 0.58, + "learning_rate": 7.817086484143987e-06, + "loss": 0.4712, + "step": 5101 + }, + { + "epoch": 0.58, + "learning_rate": 7.813474251448086e-06, + "loss": 0.44, + "step": 5102 + }, + { + "epoch": 0.58, + "learning_rate": 7.809862318355033e-06, + "loss": 0.4624, + "step": 5103 + }, + { + "epoch": 0.58, + "learning_rate": 7.806250685359742e-06, + "loss": 0.468, + "step": 5104 + }, + { + "epoch": 0.58, + "learning_rate": 7.80263935295708e-06, + "loss": 0.4681, + "step": 5105 + }, + { + "epoch": 0.58, + "learning_rate": 7.799028321641889e-06, + "loss": 0.457, + "step": 5106 + }, + { + "epoch": 0.58, + "learning_rate": 7.795417591908954e-06, + "loss": 0.4619, + "step": 5107 + }, + { + "epoch": 0.58, + "learning_rate": 7.79180716425303e-06, + "loss": 0.4449, + "step": 5108 + }, + { + "epoch": 0.58, + "learning_rate": 7.788197039168829e-06, + "loss": 0.4714, + "step": 5109 + }, + { + "epoch": 0.58, + "learning_rate": 7.784587217151009e-06, + "loss": 0.4515, + "step": 5110 + }, + { + "epoch": 0.58, + "learning_rate": 7.780977698694206e-06, + "loss": 
0.4551, + "step": 5111 + }, + { + "epoch": 0.58, + "learning_rate": 7.777368484293e-06, + "loss": 0.4647, + "step": 5112 + }, + { + "epoch": 0.58, + "learning_rate": 7.77375957444193e-06, + "loss": 0.4572, + "step": 5113 + }, + { + "epoch": 0.58, + "learning_rate": 7.770150969635509e-06, + "loss": 0.4862, + "step": 5114 + }, + { + "epoch": 0.58, + "learning_rate": 7.766542670368182e-06, + "loss": 0.4475, + "step": 5115 + }, + { + "epoch": 0.58, + "learning_rate": 7.76293467713438e-06, + "loss": 0.4581, + "step": 5116 + }, + { + "epoch": 0.58, + "learning_rate": 7.759326990428468e-06, + "loss": 0.4624, + "step": 5117 + }, + { + "epoch": 0.58, + "learning_rate": 7.755719610744783e-06, + "loss": 0.4533, + "step": 5118 + }, + { + "epoch": 0.59, + "learning_rate": 7.752112538577623e-06, + "loss": 0.458, + "step": 5119 + }, + { + "epoch": 0.59, + "learning_rate": 7.748505774421227e-06, + "loss": 0.4647, + "step": 5120 + }, + { + "epoch": 0.59, + "learning_rate": 7.744899318769811e-06, + "loss": 0.4381, + "step": 5121 + }, + { + "epoch": 0.59, + "learning_rate": 7.741293172117533e-06, + "loss": 0.486, + "step": 5122 + }, + { + "epoch": 0.59, + "learning_rate": 7.737687334958518e-06, + "loss": 0.4649, + "step": 5123 + }, + { + "epoch": 0.59, + "learning_rate": 7.73408180778685e-06, + "loss": 0.4676, + "step": 5124 + }, + { + "epoch": 0.59, + "learning_rate": 7.730476591096565e-06, + "loss": 0.464, + "step": 5125 + }, + { + "epoch": 0.59, + "learning_rate": 7.726871685381652e-06, + "loss": 0.4425, + "step": 5126 + }, + { + "epoch": 0.59, + "learning_rate": 7.72326709113607e-06, + "loss": 0.4543, + "step": 5127 + }, + { + "epoch": 0.59, + "learning_rate": 7.719662808853726e-06, + "loss": 0.454, + "step": 5128 + }, + { + "epoch": 0.59, + "learning_rate": 7.71605883902849e-06, + "loss": 0.4574, + "step": 5129 + }, + { + "epoch": 0.59, + "learning_rate": 7.712455182154186e-06, + "loss": 0.477, + "step": 5130 + }, + { + "epoch": 0.59, + "learning_rate": 7.708851838724592e-06, + 
"loss": 0.4669, + "step": 5131 + }, + { + "epoch": 0.59, + "learning_rate": 7.70524880923345e-06, + "loss": 0.4791, + "step": 5132 + }, + { + "epoch": 0.59, + "learning_rate": 7.701646094174451e-06, + "loss": 0.4468, + "step": 5133 + }, + { + "epoch": 0.59, + "learning_rate": 7.698043694041256e-06, + "loss": 0.4403, + "step": 5134 + }, + { + "epoch": 0.59, + "learning_rate": 7.694441609327465e-06, + "loss": 0.4693, + "step": 5135 + }, + { + "epoch": 0.59, + "learning_rate": 7.69083984052665e-06, + "loss": 0.4503, + "step": 5136 + }, + { + "epoch": 0.59, + "learning_rate": 7.68723838813233e-06, + "loss": 0.4546, + "step": 5137 + }, + { + "epoch": 0.59, + "learning_rate": 7.683637252637988e-06, + "loss": 0.474, + "step": 5138 + }, + { + "epoch": 0.59, + "learning_rate": 7.680036434537054e-06, + "loss": 0.4519, + "step": 5139 + }, + { + "epoch": 0.59, + "learning_rate": 7.676435934322925e-06, + "loss": 0.4492, + "step": 5140 + }, + { + "epoch": 0.59, + "learning_rate": 7.67283575248895e-06, + "loss": 0.4679, + "step": 5141 + }, + { + "epoch": 0.59, + "learning_rate": 7.669235889528436e-06, + "loss": 0.4641, + "step": 5142 + }, + { + "epoch": 0.59, + "learning_rate": 7.665636345934642e-06, + "loss": 0.4657, + "step": 5143 + }, + { + "epoch": 0.59, + "learning_rate": 7.662037122200783e-06, + "loss": 0.4692, + "step": 5144 + }, + { + "epoch": 0.59, + "learning_rate": 7.658438218820037e-06, + "loss": 0.4639, + "step": 5145 + }, + { + "epoch": 0.59, + "learning_rate": 7.65483963628553e-06, + "loss": 0.4528, + "step": 5146 + }, + { + "epoch": 0.59, + "learning_rate": 7.651241375090358e-06, + "loss": 0.4656, + "step": 5147 + }, + { + "epoch": 0.59, + "learning_rate": 7.647643435727555e-06, + "loss": 0.4515, + "step": 5148 + }, + { + "epoch": 0.59, + "learning_rate": 7.644045818690118e-06, + "loss": 0.4597, + "step": 5149 + }, + { + "epoch": 0.59, + "learning_rate": 7.640448524471002e-06, + "loss": 0.4585, + "step": 5150 + }, + { + "epoch": 0.59, + "learning_rate": 
7.636851553563123e-06, + "loss": 0.4527, + "step": 5151 + }, + { + "epoch": 0.59, + "learning_rate": 7.633254906459335e-06, + "loss": 0.4651, + "step": 5152 + }, + { + "epoch": 0.59, + "learning_rate": 7.629658583652471e-06, + "loss": 0.4537, + "step": 5153 + }, + { + "epoch": 0.59, + "learning_rate": 7.6260625856352966e-06, + "loss": 0.4566, + "step": 5154 + }, + { + "epoch": 0.59, + "learning_rate": 7.6224669129005516e-06, + "loss": 0.4639, + "step": 5155 + }, + { + "epoch": 0.59, + "learning_rate": 7.6188715659409216e-06, + "loss": 0.4979, + "step": 5156 + }, + { + "epoch": 0.59, + "learning_rate": 7.615276545249046e-06, + "loss": 0.4343, + "step": 5157 + }, + { + "epoch": 0.59, + "learning_rate": 7.611681851317529e-06, + "loss": 0.4736, + "step": 5158 + }, + { + "epoch": 0.59, + "learning_rate": 7.608087484638915e-06, + "loss": 0.4522, + "step": 5159 + }, + { + "epoch": 0.59, + "learning_rate": 7.604493445705722e-06, + "loss": 0.4479, + "step": 5160 + }, + { + "epoch": 0.59, + "learning_rate": 7.60089973501041e-06, + "loss": 0.4702, + "step": 5161 + }, + { + "epoch": 0.59, + "learning_rate": 7.597306353045393e-06, + "loss": 0.4507, + "step": 5162 + }, + { + "epoch": 0.59, + "learning_rate": 7.593713300303054e-06, + "loss": 0.4657, + "step": 5163 + }, + { + "epoch": 0.59, + "learning_rate": 7.590120577275713e-06, + "loss": 0.4633, + "step": 5164 + }, + { + "epoch": 0.59, + "learning_rate": 7.586528184455653e-06, + "loss": 0.4432, + "step": 5165 + }, + { + "epoch": 0.59, + "learning_rate": 7.58293612233512e-06, + "loss": 0.4541, + "step": 5166 + }, + { + "epoch": 0.59, + "learning_rate": 7.579344391406299e-06, + "loss": 0.4552, + "step": 5167 + }, + { + "epoch": 0.59, + "learning_rate": 7.575752992161345e-06, + "loss": 0.4491, + "step": 5168 + }, + { + "epoch": 0.59, + "learning_rate": 7.572161925092352e-06, + "loss": 0.4613, + "step": 5169 + }, + { + "epoch": 0.59, + "learning_rate": 7.568571190691378e-06, + "loss": 0.4477, + "step": 5170 + }, + { + "epoch": 
0.59, + "learning_rate": 7.564980789450438e-06, + "loss": 0.4663, + "step": 5171 + }, + { + "epoch": 0.59, + "learning_rate": 7.5613907218614925e-06, + "loss": 0.4692, + "step": 5172 + }, + { + "epoch": 0.59, + "learning_rate": 7.5578009884164646e-06, + "loss": 0.4534, + "step": 5173 + }, + { + "epoch": 0.59, + "learning_rate": 7.554211589607227e-06, + "loss": 0.469, + "step": 5174 + }, + { + "epoch": 0.59, + "learning_rate": 7.5506225259256025e-06, + "loss": 0.4627, + "step": 5175 + }, + { + "epoch": 0.59, + "learning_rate": 7.547033797863382e-06, + "loss": 0.4678, + "step": 5176 + }, + { + "epoch": 0.59, + "learning_rate": 7.543445405912298e-06, + "loss": 0.4545, + "step": 5177 + }, + { + "epoch": 0.59, + "learning_rate": 7.5398573505640326e-06, + "loss": 0.4504, + "step": 5178 + }, + { + "epoch": 0.59, + "learning_rate": 7.536269632310238e-06, + "loss": 0.4522, + "step": 5179 + }, + { + "epoch": 0.59, + "learning_rate": 7.532682251642508e-06, + "loss": 0.4594, + "step": 5180 + }, + { + "epoch": 0.59, + "learning_rate": 7.5290952090523995e-06, + "loss": 0.4656, + "step": 5181 + }, + { + "epoch": 0.59, + "learning_rate": 7.525508505031412e-06, + "loss": 0.4779, + "step": 5182 + }, + { + "epoch": 0.59, + "learning_rate": 7.521922140071003e-06, + "loss": 0.4321, + "step": 5183 + }, + { + "epoch": 0.59, + "learning_rate": 7.518336114662588e-06, + "loss": 0.4792, + "step": 5184 + }, + { + "epoch": 0.59, + "learning_rate": 7.514750429297528e-06, + "loss": 0.4704, + "step": 5185 + }, + { + "epoch": 0.59, + "learning_rate": 7.5111650844671515e-06, + "loss": 0.4456, + "step": 5186 + }, + { + "epoch": 0.59, + "learning_rate": 7.507580080662722e-06, + "loss": 0.4904, + "step": 5187 + }, + { + "epoch": 0.59, + "learning_rate": 7.5039954183754674e-06, + "loss": 0.4394, + "step": 5188 + }, + { + "epoch": 0.59, + "learning_rate": 7.5004110980965664e-06, + "loss": 0.467, + "step": 5189 + }, + { + "epoch": 0.59, + "learning_rate": 7.496827120317149e-06, + "loss": 0.4668, + 
"step": 5190 + }, + { + "epoch": 0.59, + "learning_rate": 7.493243485528306e-06, + "loss": 0.4549, + "step": 5191 + }, + { + "epoch": 0.59, + "learning_rate": 7.489660194221071e-06, + "loss": 0.4694, + "step": 5192 + }, + { + "epoch": 0.59, + "learning_rate": 7.486077246886435e-06, + "loss": 0.465, + "step": 5193 + }, + { + "epoch": 0.59, + "learning_rate": 7.4824946440153436e-06, + "loss": 0.4509, + "step": 5194 + }, + { + "epoch": 0.59, + "learning_rate": 7.478912386098692e-06, + "loss": 0.4735, + "step": 5195 + }, + { + "epoch": 0.59, + "learning_rate": 7.475330473627327e-06, + "loss": 0.4617, + "step": 5196 + }, + { + "epoch": 0.59, + "learning_rate": 7.471748907092056e-06, + "loss": 0.4542, + "step": 5197 + }, + { + "epoch": 0.59, + "learning_rate": 7.468167686983627e-06, + "loss": 0.4588, + "step": 5198 + }, + { + "epoch": 0.59, + "learning_rate": 7.464586813792757e-06, + "loss": 0.4637, + "step": 5199 + }, + { + "epoch": 0.59, + "learning_rate": 7.4610062880100985e-06, + "loss": 0.4593, + "step": 5200 + }, + { + "epoch": 0.59, + "learning_rate": 7.4574261101262604e-06, + "loss": 0.4585, + "step": 5201 + }, + { + "epoch": 0.59, + "learning_rate": 7.453846280631814e-06, + "loss": 0.457, + "step": 5202 + }, + { + "epoch": 0.59, + "learning_rate": 7.45026680001727e-06, + "loss": 0.4588, + "step": 5203 + }, + { + "epoch": 0.59, + "learning_rate": 7.446687668773105e-06, + "loss": 0.4484, + "step": 5204 + }, + { + "epoch": 0.59, + "learning_rate": 7.443108887389735e-06, + "loss": 0.4708, + "step": 5205 + }, + { + "epoch": 0.59, + "learning_rate": 7.439530456357528e-06, + "loss": 0.452, + "step": 5206 + }, + { + "epoch": 0.6, + "learning_rate": 7.435952376166818e-06, + "loss": 0.4659, + "step": 5207 + }, + { + "epoch": 0.6, + "learning_rate": 7.432374647307878e-06, + "loss": 0.4689, + "step": 5208 + }, + { + "epoch": 0.6, + "learning_rate": 7.428797270270933e-06, + "loss": 0.4541, + "step": 5209 + }, + { + "epoch": 0.6, + "learning_rate": 7.425220245546172e-06, + 
"loss": 0.4608, + "step": 5210 + }, + { + "epoch": 0.6, + "learning_rate": 7.421643573623717e-06, + "loss": 0.4559, + "step": 5211 + }, + { + "epoch": 0.6, + "learning_rate": 7.41806725499366e-06, + "loss": 0.4525, + "step": 5212 + }, + { + "epoch": 0.6, + "learning_rate": 7.4144912901460355e-06, + "loss": 0.4628, + "step": 5213 + }, + { + "epoch": 0.6, + "learning_rate": 7.410915679570825e-06, + "loss": 0.4476, + "step": 5214 + }, + { + "epoch": 0.6, + "learning_rate": 7.407340423757974e-06, + "loss": 0.4521, + "step": 5215 + }, + { + "epoch": 0.6, + "learning_rate": 7.403765523197365e-06, + "loss": 0.4727, + "step": 5216 + }, + { + "epoch": 0.6, + "learning_rate": 7.4001909783788465e-06, + "loss": 0.4596, + "step": 5217 + }, + { + "epoch": 0.6, + "learning_rate": 7.396616789792208e-06, + "loss": 0.4606, + "step": 5218 + }, + { + "epoch": 0.6, + "learning_rate": 7.39304295792719e-06, + "loss": 0.485, + "step": 5219 + }, + { + "epoch": 0.6, + "learning_rate": 7.389469483273495e-06, + "loss": 0.4432, + "step": 5220 + }, + { + "epoch": 0.6, + "learning_rate": 7.385896366320761e-06, + "loss": 0.4672, + "step": 5221 + }, + { + "epoch": 0.6, + "learning_rate": 7.382323607558585e-06, + "loss": 0.4679, + "step": 5222 + }, + { + "epoch": 0.6, + "learning_rate": 7.378751207476522e-06, + "loss": 0.4712, + "step": 5223 + }, + { + "epoch": 0.6, + "learning_rate": 7.375179166564062e-06, + "loss": 0.4411, + "step": 5224 + }, + { + "epoch": 0.6, + "learning_rate": 7.3716074853106635e-06, + "loss": 0.4791, + "step": 5225 + }, + { + "epoch": 0.6, + "learning_rate": 7.368036164205719e-06, + "loss": 0.4627, + "step": 5226 + }, + { + "epoch": 0.6, + "learning_rate": 7.3644652037385785e-06, + "loss": 0.4696, + "step": 5227 + }, + { + "epoch": 0.6, + "learning_rate": 7.3608946043985515e-06, + "loss": 0.4502, + "step": 5228 + }, + { + "epoch": 0.6, + "learning_rate": 7.357324366674881e-06, + "loss": 0.4456, + "step": 5229 + }, + { + "epoch": 0.6, + "learning_rate": 7.353754491056776e-06, 
+ "loss": 0.4576, + "step": 5230 + }, + { + "epoch": 0.6, + "learning_rate": 7.350184978033386e-06, + "loss": 0.4659, + "step": 5231 + }, + { + "epoch": 0.6, + "learning_rate": 7.34661582809381e-06, + "loss": 0.4441, + "step": 5232 + }, + { + "epoch": 0.6, + "learning_rate": 7.3430470417271106e-06, + "loss": 0.4653, + "step": 5233 + }, + { + "epoch": 0.6, + "learning_rate": 7.339478619422287e-06, + "loss": 0.4634, + "step": 5234 + }, + { + "epoch": 0.6, + "learning_rate": 7.335910561668286e-06, + "loss": 0.4618, + "step": 5235 + }, + { + "epoch": 0.6, + "learning_rate": 7.3323428689540184e-06, + "loss": 0.4568, + "step": 5236 + }, + { + "epoch": 0.6, + "learning_rate": 7.328775541768336e-06, + "loss": 0.4478, + "step": 5237 + }, + { + "epoch": 0.6, + "learning_rate": 7.3252085806000474e-06, + "loss": 0.4481, + "step": 5238 + }, + { + "epoch": 0.6, + "learning_rate": 7.3216419859379e-06, + "loss": 0.4719, + "step": 5239 + }, + { + "epoch": 0.6, + "learning_rate": 7.318075758270593e-06, + "loss": 0.4603, + "step": 5240 + }, + { + "epoch": 0.6, + "learning_rate": 7.314509898086788e-06, + "loss": 0.4593, + "step": 5241 + }, + { + "epoch": 0.6, + "learning_rate": 7.31094440587508e-06, + "loss": 0.4439, + "step": 5242 + }, + { + "epoch": 0.6, + "learning_rate": 7.30737928212403e-06, + "loss": 0.4547, + "step": 5243 + }, + { + "epoch": 0.6, + "learning_rate": 7.303814527322132e-06, + "loss": 0.4527, + "step": 5244 + }, + { + "epoch": 0.6, + "learning_rate": 7.300250141957839e-06, + "loss": 0.4529, + "step": 5245 + }, + { + "epoch": 0.6, + "learning_rate": 7.296686126519552e-06, + "loss": 0.4366, + "step": 5246 + }, + { + "epoch": 0.6, + "learning_rate": 7.293122481495623e-06, + "loss": 0.4555, + "step": 5247 + }, + { + "epoch": 0.6, + "learning_rate": 7.2895592073743415e-06, + "loss": 0.4766, + "step": 5248 + }, + { + "epoch": 0.6, + "learning_rate": 7.2859963046439665e-06, + "loss": 0.4804, + "step": 5249 + }, + { + "epoch": 0.6, + "learning_rate": 7.282433773792689e-06, 
+ "loss": 0.4415, + "step": 5250 + }, + { + "epoch": 0.6, + "learning_rate": 7.2788716153086604e-06, + "loss": 0.4475, + "step": 5251 + }, + { + "epoch": 0.6, + "learning_rate": 7.275309829679973e-06, + "loss": 0.479, + "step": 5252 + }, + { + "epoch": 0.6, + "learning_rate": 7.271748417394668e-06, + "loss": 0.4674, + "step": 5253 + }, + { + "epoch": 0.6, + "learning_rate": 7.2681873789407435e-06, + "loss": 0.4474, + "step": 5254 + }, + { + "epoch": 0.6, + "learning_rate": 7.264626714806135e-06, + "loss": 0.4562, + "step": 5255 + }, + { + "epoch": 0.6, + "learning_rate": 7.2610664254787425e-06, + "loss": 0.4435, + "step": 5256 + }, + { + "epoch": 0.6, + "learning_rate": 7.257506511446398e-06, + "loss": 0.4784, + "step": 5257 + }, + { + "epoch": 0.6, + "learning_rate": 7.253946973196888e-06, + "loss": 0.4483, + "step": 5258 + }, + { + "epoch": 0.6, + "learning_rate": 7.2503878112179534e-06, + "loss": 0.4493, + "step": 5259 + }, + { + "epoch": 0.6, + "learning_rate": 7.246829025997279e-06, + "loss": 0.4603, + "step": 5260 + }, + { + "epoch": 0.6, + "learning_rate": 7.243270618022492e-06, + "loss": 0.4749, + "step": 5261 + }, + { + "epoch": 0.6, + "learning_rate": 7.2397125877811816e-06, + "loss": 0.4541, + "step": 5262 + }, + { + "epoch": 0.6, + "learning_rate": 7.2361549357608685e-06, + "loss": 0.4485, + "step": 5263 + }, + { + "epoch": 0.6, + "learning_rate": 7.232597662449038e-06, + "loss": 0.4716, + "step": 5264 + }, + { + "epoch": 0.6, + "learning_rate": 7.2290407683331154e-06, + "loss": 0.4592, + "step": 5265 + }, + { + "epoch": 0.6, + "learning_rate": 7.225484253900468e-06, + "loss": 0.4722, + "step": 5266 + }, + { + "epoch": 0.6, + "learning_rate": 7.221928119638426e-06, + "loss": 0.4733, + "step": 5267 + }, + { + "epoch": 0.6, + "learning_rate": 7.218372366034252e-06, + "loss": 0.4663, + "step": 5268 + }, + { + "epoch": 0.6, + "learning_rate": 7.214816993575168e-06, + "loss": 0.4417, + "step": 5269 + }, + { + "epoch": 0.6, + "learning_rate": 
7.211262002748341e-06, + "loss": 0.4623, + "step": 5270 + }, + { + "epoch": 0.6, + "learning_rate": 7.207707394040877e-06, + "loss": 0.4508, + "step": 5271 + }, + { + "epoch": 0.6, + "learning_rate": 7.2041531679398445e-06, + "loss": 0.4486, + "step": 5272 + }, + { + "epoch": 0.6, + "learning_rate": 7.200599324932246e-06, + "loss": 0.4616, + "step": 5273 + }, + { + "epoch": 0.6, + "learning_rate": 7.197045865505041e-06, + "loss": 0.464, + "step": 5274 + }, + { + "epoch": 0.6, + "learning_rate": 7.193492790145135e-06, + "loss": 0.4784, + "step": 5275 + }, + { + "epoch": 0.6, + "learning_rate": 7.18994009933937e-06, + "loss": 0.441, + "step": 5276 + }, + { + "epoch": 0.6, + "learning_rate": 7.186387793574554e-06, + "loss": 0.4565, + "step": 5277 + }, + { + "epoch": 0.6, + "learning_rate": 7.182835873337425e-06, + "loss": 0.4541, + "step": 5278 + }, + { + "epoch": 0.6, + "learning_rate": 7.179284339114676e-06, + "loss": 0.4627, + "step": 5279 + }, + { + "epoch": 0.6, + "learning_rate": 7.175733191392952e-06, + "loss": 0.4565, + "step": 5280 + }, + { + "epoch": 0.6, + "learning_rate": 7.172182430658832e-06, + "loss": 0.4737, + "step": 5281 + }, + { + "epoch": 0.6, + "learning_rate": 7.168632057398857e-06, + "loss": 0.4672, + "step": 5282 + }, + { + "epoch": 0.6, + "learning_rate": 7.165082072099503e-06, + "loss": 0.453, + "step": 5283 + }, + { + "epoch": 0.6, + "learning_rate": 7.161532475247195e-06, + "loss": 0.4718, + "step": 5284 + }, + { + "epoch": 0.6, + "learning_rate": 7.157983267328314e-06, + "loss": 0.4627, + "step": 5285 + }, + { + "epoch": 0.6, + "learning_rate": 7.1544344488291725e-06, + "loss": 0.4551, + "step": 5286 + }, + { + "epoch": 0.6, + "learning_rate": 7.150886020236045e-06, + "loss": 0.4413, + "step": 5287 + }, + { + "epoch": 0.6, + "learning_rate": 7.147337982035143e-06, + "loss": 0.4505, + "step": 5288 + }, + { + "epoch": 0.6, + "learning_rate": 7.143790334712623e-06, + "loss": 0.4672, + "step": 5289 + }, + { + "epoch": 0.6, + "learning_rate": 
7.140243078754601e-06, + "loss": 0.4728, + "step": 5290 + }, + { + "epoch": 0.6, + "learning_rate": 7.136696214647123e-06, + "loss": 0.4598, + "step": 5291 + }, + { + "epoch": 0.6, + "learning_rate": 7.133149742876187e-06, + "loss": 0.4481, + "step": 5292 + }, + { + "epoch": 0.6, + "learning_rate": 7.129603663927742e-06, + "loss": 0.4895, + "step": 5293 + }, + { + "epoch": 0.61, + "learning_rate": 7.12605797828768e-06, + "loss": 0.4599, + "step": 5294 + }, + { + "epoch": 0.61, + "learning_rate": 7.1225126864418425e-06, + "loss": 0.4663, + "step": 5295 + }, + { + "epoch": 0.61, + "learning_rate": 7.118967788876011e-06, + "loss": 0.4485, + "step": 5296 + }, + { + "epoch": 0.61, + "learning_rate": 7.11542328607591e-06, + "loss": 0.4594, + "step": 5297 + }, + { + "epoch": 0.61, + "learning_rate": 7.111879178527223e-06, + "loss": 0.4633, + "step": 5298 + }, + { + "epoch": 0.61, + "learning_rate": 7.108335466715566e-06, + "loss": 0.46, + "step": 5299 + }, + { + "epoch": 0.61, + "learning_rate": 7.104792151126515e-06, + "loss": 0.4562, + "step": 5300 + }, + { + "epoch": 0.61, + "learning_rate": 7.101249232245576e-06, + "loss": 0.4664, + "step": 5301 + }, + { + "epoch": 0.61, + "learning_rate": 7.09770671055821e-06, + "loss": 0.4507, + "step": 5302 + }, + { + "epoch": 0.61, + "learning_rate": 7.094164586549821e-06, + "loss": 0.4531, + "step": 5303 + }, + { + "epoch": 0.61, + "learning_rate": 7.090622860705764e-06, + "loss": 0.4541, + "step": 5304 + }, + { + "epoch": 0.61, + "learning_rate": 7.087081533511324e-06, + "loss": 0.4608, + "step": 5305 + }, + { + "epoch": 0.61, + "learning_rate": 7.0835406054517505e-06, + "loss": 0.4726, + "step": 5306 + }, + { + "epoch": 0.61, + "learning_rate": 7.080000077012229e-06, + "loss": 0.4511, + "step": 5307 + }, + { + "epoch": 0.61, + "learning_rate": 7.076459948677889e-06, + "loss": 0.4427, + "step": 5308 + }, + { + "epoch": 0.61, + "learning_rate": 7.072920220933808e-06, + "loss": 0.4556, + "step": 5309 + }, + { + "epoch": 0.61, + 
"learning_rate": 7.069380894265004e-06, + "loss": 0.4427, + "step": 5310 + }, + { + "epoch": 0.61, + "learning_rate": 7.06584196915645e-06, + "loss": 0.477, + "step": 5311 + }, + { + "epoch": 0.61, + "learning_rate": 7.062303446093051e-06, + "loss": 0.4334, + "step": 5312 + }, + { + "epoch": 0.61, + "learning_rate": 7.058765325559673e-06, + "loss": 0.4567, + "step": 5313 + }, + { + "epoch": 0.61, + "learning_rate": 7.055227608041111e-06, + "loss": 0.4657, + "step": 5314 + }, + { + "epoch": 0.61, + "learning_rate": 7.051690294022108e-06, + "loss": 0.4766, + "step": 5315 + }, + { + "epoch": 0.61, + "learning_rate": 7.0481533839873614e-06, + "loss": 0.4642, + "step": 5316 + }, + { + "epoch": 0.61, + "learning_rate": 7.044616878421506e-06, + "loss": 0.484, + "step": 5317 + }, + { + "epoch": 0.61, + "learning_rate": 7.041080777809118e-06, + "loss": 0.4496, + "step": 5318 + }, + { + "epoch": 0.61, + "learning_rate": 7.037545082634729e-06, + "loss": 0.456, + "step": 5319 + }, + { + "epoch": 0.61, + "learning_rate": 7.034009793382799e-06, + "loss": 0.4493, + "step": 5320 + }, + { + "epoch": 0.61, + "learning_rate": 7.030474910537748e-06, + "loss": 0.4484, + "step": 5321 + }, + { + "epoch": 0.61, + "learning_rate": 7.0269404345839356e-06, + "loss": 0.4352, + "step": 5322 + }, + { + "epoch": 0.61, + "learning_rate": 7.023406366005655e-06, + "loss": 0.4687, + "step": 5323 + }, + { + "epoch": 0.61, + "learning_rate": 7.019872705287163e-06, + "loss": 0.468, + "step": 5324 + }, + { + "epoch": 0.61, + "learning_rate": 7.016339452912642e-06, + "loss": 0.4487, + "step": 5325 + }, + { + "epoch": 0.61, + "learning_rate": 7.012806609366231e-06, + "loss": 0.4607, + "step": 5326 + }, + { + "epoch": 0.61, + "learning_rate": 7.009274175132009e-06, + "loss": 0.453, + "step": 5327 + }, + { + "epoch": 0.61, + "learning_rate": 7.005742150693993e-06, + "loss": 0.4402, + "step": 5328 + }, + { + "epoch": 0.61, + "learning_rate": 7.0022105365361555e-06, + "loss": 0.4583, + "step": 5329 + }, + { + 
"epoch": 0.61, + "learning_rate": 6.998679333142403e-06, + "loss": 0.4477, + "step": 5330 + }, + { + "epoch": 0.61, + "learning_rate": 6.995148540996588e-06, + "loss": 0.467, + "step": 5331 + }, + { + "epoch": 0.61, + "learning_rate": 6.991618160582512e-06, + "loss": 0.4758, + "step": 5332 + }, + { + "epoch": 0.61, + "learning_rate": 6.9880881923839105e-06, + "loss": 0.4547, + "step": 5333 + }, + { + "epoch": 0.61, + "learning_rate": 6.9845586368844755e-06, + "loss": 0.4476, + "step": 5334 + }, + { + "epoch": 0.61, + "learning_rate": 6.981029494567828e-06, + "loss": 0.4614, + "step": 5335 + }, + { + "epoch": 0.61, + "learning_rate": 6.97750076591754e-06, + "loss": 0.4657, + "step": 5336 + }, + { + "epoch": 0.61, + "learning_rate": 6.973972451417132e-06, + "loss": 0.4678, + "step": 5337 + }, + { + "epoch": 0.61, + "learning_rate": 6.9704445515500544e-06, + "loss": 0.4482, + "step": 5338 + }, + { + "epoch": 0.61, + "learning_rate": 6.966917066799714e-06, + "loss": 0.4502, + "step": 5339 + }, + { + "epoch": 0.61, + "learning_rate": 6.9633899976494525e-06, + "loss": 0.4655, + "step": 5340 + }, + { + "epoch": 0.61, + "learning_rate": 6.959863344582554e-06, + "loss": 0.4492, + "step": 5341 + }, + { + "epoch": 0.61, + "learning_rate": 6.956337108082256e-06, + "loss": 0.4732, + "step": 5342 + }, + { + "epoch": 0.61, + "learning_rate": 6.952811288631728e-06, + "loss": 0.4797, + "step": 5343 + }, + { + "epoch": 0.61, + "learning_rate": 6.949285886714081e-06, + "loss": 0.4374, + "step": 5344 + }, + { + "epoch": 0.61, + "learning_rate": 6.9457609028123795e-06, + "loss": 0.4326, + "step": 5345 + }, + { + "epoch": 0.61, + "learning_rate": 6.942236337409623e-06, + "loss": 0.4609, + "step": 5346 + }, + { + "epoch": 0.61, + "learning_rate": 6.93871219098876e-06, + "loss": 0.4855, + "step": 5347 + }, + { + "epoch": 0.61, + "learning_rate": 6.935188464032674e-06, + "loss": 0.4507, + "step": 5348 + }, + { + "epoch": 0.61, + "learning_rate": 6.93166515702419e-06, + "loss": 0.4745, + 
"step": 5349 + }, + { + "epoch": 0.61, + "learning_rate": 6.928142270446086e-06, + "loss": 0.4519, + "step": 5350 + }, + { + "epoch": 0.61, + "learning_rate": 6.924619804781069e-06, + "loss": 0.457, + "step": 5351 + }, + { + "epoch": 0.61, + "learning_rate": 6.921097760511807e-06, + "loss": 0.442, + "step": 5352 + }, + { + "epoch": 0.61, + "learning_rate": 6.917576138120892e-06, + "loss": 0.4732, + "step": 5353 + }, + { + "epoch": 0.61, + "learning_rate": 6.91405493809086e-06, + "loss": 0.4478, + "step": 5354 + }, + { + "epoch": 0.61, + "learning_rate": 6.9105341609042e-06, + "loss": 0.4681, + "step": 5355 + }, + { + "epoch": 0.61, + "learning_rate": 6.907013807043335e-06, + "loss": 0.4417, + "step": 5356 + }, + { + "epoch": 0.61, + "learning_rate": 6.903493876990637e-06, + "loss": 0.4551, + "step": 5357 + }, + { + "epoch": 0.61, + "learning_rate": 6.899974371228409e-06, + "loss": 0.4597, + "step": 5358 + }, + { + "epoch": 0.61, + "learning_rate": 6.896455290238902e-06, + "loss": 0.459, + "step": 5359 + }, + { + "epoch": 0.61, + "learning_rate": 6.892936634504313e-06, + "loss": 0.4587, + "step": 5360 + }, + { + "epoch": 0.61, + "learning_rate": 6.889418404506774e-06, + "loss": 0.467, + "step": 5361 + }, + { + "epoch": 0.61, + "learning_rate": 6.885900600728358e-06, + "loss": 0.4496, + "step": 5362 + }, + { + "epoch": 0.61, + "learning_rate": 6.882383223651088e-06, + "loss": 0.4505, + "step": 5363 + }, + { + "epoch": 0.61, + "learning_rate": 6.878866273756919e-06, + "loss": 0.4636, + "step": 5364 + }, + { + "epoch": 0.61, + "learning_rate": 6.8753497515277555e-06, + "loss": 0.4444, + "step": 5365 + }, + { + "epoch": 0.61, + "learning_rate": 6.871833657445438e-06, + "loss": 0.4537, + "step": 5366 + }, + { + "epoch": 0.61, + "learning_rate": 6.8683179919917465e-06, + "loss": 0.4657, + "step": 5367 + }, + { + "epoch": 0.61, + "learning_rate": 6.8648027556484095e-06, + "loss": 0.4502, + "step": 5368 + }, + { + "epoch": 0.61, + "learning_rate": 6.861287948897091e-06, + 
"loss": 0.4813, + "step": 5369 + }, + { + "epoch": 0.61, + "learning_rate": 6.857773572219402e-06, + "loss": 0.4466, + "step": 5370 + }, + { + "epoch": 0.61, + "learning_rate": 6.854259626096888e-06, + "loss": 0.4636, + "step": 5371 + }, + { + "epoch": 0.61, + "learning_rate": 6.850746111011034e-06, + "loss": 0.4505, + "step": 5372 + }, + { + "epoch": 0.61, + "learning_rate": 6.847233027443274e-06, + "loss": 0.4592, + "step": 5373 + }, + { + "epoch": 0.61, + "learning_rate": 6.843720375874983e-06, + "loss": 0.4563, + "step": 5374 + }, + { + "epoch": 0.61, + "learning_rate": 6.8402081567874625e-06, + "loss": 0.4667, + "step": 5375 + }, + { + "epoch": 0.61, + "learning_rate": 6.836696370661975e-06, + "loss": 0.4653, + "step": 5376 + }, + { + "epoch": 0.61, + "learning_rate": 6.833185017979704e-06, + "loss": 0.4613, + "step": 5377 + }, + { + "epoch": 0.61, + "learning_rate": 6.8296740992217915e-06, + "loss": 0.4702, + "step": 5378 + }, + { + "epoch": 0.61, + "learning_rate": 6.82616361486931e-06, + "loss": 0.4558, + "step": 5379 + }, + { + "epoch": 0.61, + "learning_rate": 6.82265356540327e-06, + "loss": 0.4441, + "step": 5380 + }, + { + "epoch": 0.61, + "learning_rate": 6.819143951304632e-06, + "loss": 0.4734, + "step": 5381 + }, + { + "epoch": 0.62, + "learning_rate": 6.815634773054286e-06, + "loss": 0.4551, + "step": 5382 + }, + { + "epoch": 0.62, + "learning_rate": 6.812126031133073e-06, + "loss": 0.4691, + "step": 5383 + }, + { + "epoch": 0.62, + "learning_rate": 6.8086177260217675e-06, + "loss": 0.4692, + "step": 5384 + }, + { + "epoch": 0.62, + "learning_rate": 6.8051098582010825e-06, + "loss": 0.4457, + "step": 5385 + }, + { + "epoch": 0.62, + "learning_rate": 6.801602428151679e-06, + "loss": 0.4683, + "step": 5386 + }, + { + "epoch": 0.62, + "learning_rate": 6.7980954363541506e-06, + "loss": 0.4593, + "step": 5387 + }, + { + "epoch": 0.62, + "learning_rate": 6.79458888328903e-06, + "loss": 0.4525, + "step": 5388 + }, + { + "epoch": 0.62, + "learning_rate": 
6.791082769436801e-06, + "loss": 0.4436, + "step": 5389 + }, + { + "epoch": 0.62, + "learning_rate": 6.787577095277873e-06, + "loss": 0.4504, + "step": 5390 + }, + { + "epoch": 0.62, + "learning_rate": 6.784071861292607e-06, + "loss": 0.4928, + "step": 5391 + }, + { + "epoch": 0.62, + "learning_rate": 6.780567067961293e-06, + "loss": 0.4695, + "step": 5392 + }, + { + "epoch": 0.62, + "learning_rate": 6.777062715764166e-06, + "loss": 0.4518, + "step": 5393 + }, + { + "epoch": 0.62, + "learning_rate": 6.773558805181408e-06, + "loss": 0.4454, + "step": 5394 + }, + { + "epoch": 0.62, + "learning_rate": 6.770055336693123e-06, + "loss": 0.4695, + "step": 5395 + }, + { + "epoch": 0.62, + "learning_rate": 6.766552310779374e-06, + "loss": 0.4315, + "step": 5396 + }, + { + "epoch": 0.62, + "learning_rate": 6.763049727920145e-06, + "loss": 0.454, + "step": 5397 + }, + { + "epoch": 0.62, + "learning_rate": 6.759547588595372e-06, + "loss": 0.4592, + "step": 5398 + }, + { + "epoch": 0.62, + "learning_rate": 6.7560458932849306e-06, + "loss": 0.457, + "step": 5399 + }, + { + "epoch": 0.62, + "learning_rate": 6.752544642468626e-06, + "loss": 0.4682, + "step": 5400 + }, + { + "epoch": 0.62, + "learning_rate": 6.749043836626203e-06, + "loss": 0.4561, + "step": 5401 + }, + { + "epoch": 0.62, + "learning_rate": 6.74554347623736e-06, + "loss": 0.4611, + "step": 5402 + }, + { + "epoch": 0.62, + "learning_rate": 6.742043561781717e-06, + "loss": 0.4398, + "step": 5403 + }, + { + "epoch": 0.62, + "learning_rate": 6.738544093738848e-06, + "loss": 0.4694, + "step": 5404 + }, + { + "epoch": 0.62, + "learning_rate": 6.735045072588256e-06, + "loss": 0.4432, + "step": 5405 + }, + { + "epoch": 0.62, + "learning_rate": 6.731546498809376e-06, + "loss": 0.4421, + "step": 5406 + }, + { + "epoch": 0.62, + "learning_rate": 6.7280483728816016e-06, + "loss": 0.4487, + "step": 5407 + }, + { + "epoch": 0.62, + "learning_rate": 6.724550695284247e-06, + "loss": 0.4678, + "step": 5408 + }, + { + "epoch": 0.62, 
+ "learning_rate": 6.72105346649658e-06, + "loss": 0.4643, + "step": 5409 + }, + { + "epoch": 0.62, + "learning_rate": 6.717556686997795e-06, + "loss": 0.4759, + "step": 5410 + }, + { + "epoch": 0.62, + "learning_rate": 6.714060357267023e-06, + "loss": 0.4702, + "step": 5411 + }, + { + "epoch": 0.62, + "learning_rate": 6.7105644777833475e-06, + "loss": 0.4544, + "step": 5412 + }, + { + "epoch": 0.62, + "learning_rate": 6.707069049025781e-06, + "loss": 0.4695, + "step": 5413 + }, + { + "epoch": 0.62, + "learning_rate": 6.703574071473269e-06, + "loss": 0.4528, + "step": 5414 + }, + { + "epoch": 0.62, + "learning_rate": 6.700079545604707e-06, + "loss": 0.4569, + "step": 5415 + }, + { + "epoch": 0.62, + "learning_rate": 6.696585471898922e-06, + "loss": 0.4543, + "step": 5416 + }, + { + "epoch": 0.62, + "learning_rate": 6.693091850834681e-06, + "loss": 0.4681, + "step": 5417 + }, + { + "epoch": 0.62, + "learning_rate": 6.6895986828906886e-06, + "loss": 0.4582, + "step": 5418 + }, + { + "epoch": 0.62, + "learning_rate": 6.686105968545582e-06, + "loss": 0.4586, + "step": 5419 + }, + { + "epoch": 0.62, + "learning_rate": 6.682613708277945e-06, + "loss": 0.4444, + "step": 5420 + }, + { + "epoch": 0.62, + "learning_rate": 6.679121902566294e-06, + "loss": 0.4541, + "step": 5421 + }, + { + "epoch": 0.62, + "learning_rate": 6.675630551889088e-06, + "loss": 0.4553, + "step": 5422 + }, + { + "epoch": 0.62, + "learning_rate": 6.672139656724715e-06, + "loss": 0.467, + "step": 5423 + }, + { + "epoch": 0.62, + "learning_rate": 6.668649217551505e-06, + "loss": 0.4665, + "step": 5424 + }, + { + "epoch": 0.62, + "learning_rate": 6.665159234847731e-06, + "loss": 0.4776, + "step": 5425 + }, + { + "epoch": 0.62, + "learning_rate": 6.6616697090915975e-06, + "loss": 0.4475, + "step": 5426 + }, + { + "epoch": 0.62, + "learning_rate": 6.658180640761241e-06, + "loss": 0.4668, + "step": 5427 + }, + { + "epoch": 0.62, + "learning_rate": 6.654692030334753e-06, + "loss": 0.4912, + "step": 5428 + }, 
+ { + "epoch": 0.62, + "learning_rate": 6.651203878290139e-06, + "loss": 0.4346, + "step": 5429 + }, + { + "epoch": 0.62, + "learning_rate": 6.647716185105362e-06, + "loss": 0.4658, + "step": 5430 + }, + { + "epoch": 0.62, + "learning_rate": 6.644228951258313e-06, + "loss": 0.4585, + "step": 5431 + }, + { + "epoch": 0.62, + "learning_rate": 6.640742177226816e-06, + "loss": 0.4507, + "step": 5432 + }, + { + "epoch": 0.62, + "learning_rate": 6.637255863488643e-06, + "loss": 0.4771, + "step": 5433 + }, + { + "epoch": 0.62, + "learning_rate": 6.63377001052149e-06, + "loss": 0.4594, + "step": 5434 + }, + { + "epoch": 0.62, + "learning_rate": 6.630284618803003e-06, + "loss": 0.4394, + "step": 5435 + }, + { + "epoch": 0.62, + "learning_rate": 6.626799688810759e-06, + "loss": 0.4817, + "step": 5436 + }, + { + "epoch": 0.62, + "learning_rate": 6.623315221022263e-06, + "loss": 0.4329, + "step": 5437 + }, + { + "epoch": 0.62, + "learning_rate": 6.619831215914974e-06, + "loss": 0.4537, + "step": 5438 + }, + { + "epoch": 0.62, + "learning_rate": 6.6163476739662724e-06, + "loss": 0.449, + "step": 5439 + }, + { + "epoch": 0.62, + "learning_rate": 6.612864595653483e-06, + "loss": 0.4501, + "step": 5440 + }, + { + "epoch": 0.62, + "learning_rate": 6.609381981453869e-06, + "loss": 0.4529, + "step": 5441 + }, + { + "epoch": 0.62, + "learning_rate": 6.60589983184462e-06, + "loss": 0.4821, + "step": 5442 + }, + { + "epoch": 0.62, + "learning_rate": 6.602418147302874e-06, + "loss": 0.443, + "step": 5443 + }, + { + "epoch": 0.62, + "learning_rate": 6.598936928305695e-06, + "loss": 0.4841, + "step": 5444 + }, + { + "epoch": 0.62, + "learning_rate": 6.5954561753300885e-06, + "loss": 0.4656, + "step": 5445 + }, + { + "epoch": 0.62, + "learning_rate": 6.591975888852998e-06, + "loss": 0.4388, + "step": 5446 + }, + { + "epoch": 0.62, + "learning_rate": 6.5884960693512965e-06, + "loss": 0.463, + "step": 5447 + }, + { + "epoch": 0.62, + "learning_rate": 6.585016717301805e-06, + "loss": 0.4715, + 
"step": 5448 + }, + { + "epoch": 0.62, + "learning_rate": 6.581537833181262e-06, + "loss": 0.4748, + "step": 5449 + }, + { + "epoch": 0.62, + "learning_rate": 6.578059417466356e-06, + "loss": 0.45, + "step": 5450 + }, + { + "epoch": 0.62, + "learning_rate": 6.5745814706337115e-06, + "loss": 0.4486, + "step": 5451 + }, + { + "epoch": 0.62, + "learning_rate": 6.57110399315988e-06, + "loss": 0.4499, + "step": 5452 + }, + { + "epoch": 0.62, + "learning_rate": 6.5676269855213585e-06, + "loss": 0.4558, + "step": 5453 + }, + { + "epoch": 0.62, + "learning_rate": 6.56415044819457e-06, + "loss": 0.4761, + "step": 5454 + }, + { + "epoch": 0.62, + "learning_rate": 6.560674381655876e-06, + "loss": 0.4435, + "step": 5455 + }, + { + "epoch": 0.62, + "learning_rate": 6.557198786381584e-06, + "loss": 0.4555, + "step": 5456 + }, + { + "epoch": 0.62, + "learning_rate": 6.553723662847924e-06, + "loss": 0.4568, + "step": 5457 + }, + { + "epoch": 0.62, + "learning_rate": 6.550249011531058e-06, + "loss": 0.4636, + "step": 5458 + }, + { + "epoch": 0.62, + "learning_rate": 6.546774832907101e-06, + "loss": 0.461, + "step": 5459 + }, + { + "epoch": 0.62, + "learning_rate": 6.543301127452086e-06, + "loss": 0.4682, + "step": 5460 + }, + { + "epoch": 0.62, + "learning_rate": 6.539827895641997e-06, + "loss": 0.4469, + "step": 5461 + }, + { + "epoch": 0.62, + "learning_rate": 6.536355137952737e-06, + "loss": 0.495, + "step": 5462 + }, + { + "epoch": 0.62, + "learning_rate": 6.53288285486015e-06, + "loss": 0.4454, + "step": 5463 + }, + { + "epoch": 0.62, + "learning_rate": 6.52941104684002e-06, + "loss": 0.4627, + "step": 5464 + }, + { + "epoch": 0.62, + "learning_rate": 6.52593971436806e-06, + "loss": 0.4606, + "step": 5465 + }, + { + "epoch": 0.62, + "learning_rate": 6.522468857919926e-06, + "loss": 0.46, + "step": 5466 + }, + { + "epoch": 0.62, + "learning_rate": 6.518998477971199e-06, + "loss": 0.4615, + "step": 5467 + }, + { + "epoch": 0.62, + "learning_rate": 6.515528574997394e-06, + 
"loss": 0.4869, + "step": 5468 + }, + { + "epoch": 0.63, + "learning_rate": 6.512059149473971e-06, + "loss": 0.4803, + "step": 5469 + }, + { + "epoch": 0.63, + "learning_rate": 6.508590201876317e-06, + "loss": 0.4653, + "step": 5470 + }, + { + "epoch": 0.63, + "learning_rate": 6.5051217326797535e-06, + "loss": 0.4486, + "step": 5471 + }, + { + "epoch": 0.63, + "learning_rate": 6.501653742359539e-06, + "loss": 0.4593, + "step": 5472 + }, + { + "epoch": 0.63, + "learning_rate": 6.49818623139087e-06, + "loss": 0.4658, + "step": 5473 + }, + { + "epoch": 0.63, + "learning_rate": 6.494719200248867e-06, + "loss": 0.4547, + "step": 5474 + }, + { + "epoch": 0.63, + "learning_rate": 6.491252649408596e-06, + "loss": 0.4492, + "step": 5475 + }, + { + "epoch": 0.63, + "learning_rate": 6.4877865793450445e-06, + "loss": 0.4601, + "step": 5476 + }, + { + "epoch": 0.63, + "learning_rate": 6.484320990533148e-06, + "loss": 0.4599, + "step": 5477 + }, + { + "epoch": 0.63, + "learning_rate": 6.480855883447767e-06, + "loss": 0.4706, + "step": 5478 + }, + { + "epoch": 0.63, + "learning_rate": 6.4773912585637e-06, + "loss": 0.4422, + "step": 5479 + }, + { + "epoch": 0.63, + "learning_rate": 6.473927116355678e-06, + "loss": 0.4583, + "step": 5480 + }, + { + "epoch": 0.63, + "learning_rate": 6.4704634572983615e-06, + "loss": 0.442, + "step": 5481 + }, + { + "epoch": 0.63, + "learning_rate": 6.4670002818663535e-06, + "loss": 0.4692, + "step": 5482 + }, + { + "epoch": 0.63, + "learning_rate": 6.463537590534188e-06, + "loss": 0.4508, + "step": 5483 + }, + { + "epoch": 0.63, + "learning_rate": 6.4600753837763255e-06, + "loss": 0.4677, + "step": 5484 + }, + { + "epoch": 0.63, + "learning_rate": 6.4566136620671705e-06, + "loss": 0.4529, + "step": 5485 + }, + { + "epoch": 0.63, + "learning_rate": 6.453152425881051e-06, + "loss": 0.4515, + "step": 5486 + }, + { + "epoch": 0.63, + "learning_rate": 6.4496916756922375e-06, + "loss": 0.4291, + "step": 5487 + }, + { + "epoch": 0.63, + "learning_rate": 
6.4462314119749315e-06, + "loss": 0.464, + "step": 5488 + }, + { + "epoch": 0.63, + "learning_rate": 6.44277163520326e-06, + "loss": 0.4463, + "step": 5489 + }, + { + "epoch": 0.63, + "learning_rate": 6.439312345851298e-06, + "loss": 0.4595, + "step": 5490 + }, + { + "epoch": 0.63, + "learning_rate": 6.435853544393038e-06, + "loss": 0.4468, + "step": 5491 + }, + { + "epoch": 0.63, + "learning_rate": 6.432395231302418e-06, + "loss": 0.451, + "step": 5492 + }, + { + "epoch": 0.63, + "learning_rate": 6.428937407053304e-06, + "loss": 0.4634, + "step": 5493 + }, + { + "epoch": 0.63, + "learning_rate": 6.425480072119488e-06, + "loss": 0.459, + "step": 5494 + }, + { + "epoch": 0.63, + "learning_rate": 6.422023226974713e-06, + "loss": 0.4458, + "step": 5495 + }, + { + "epoch": 0.63, + "learning_rate": 6.4185668720926365e-06, + "loss": 0.455, + "step": 5496 + }, + { + "epoch": 0.63, + "learning_rate": 6.4151110079468545e-06, + "loss": 0.4607, + "step": 5497 + }, + { + "epoch": 0.63, + "learning_rate": 6.411655635010907e-06, + "loss": 0.4528, + "step": 5498 + }, + { + "epoch": 0.63, + "learning_rate": 6.4082007537582465e-06, + "loss": 0.4468, + "step": 5499 + }, + { + "epoch": 0.63, + "learning_rate": 6.40474636466228e-06, + "loss": 0.458, + "step": 5500 + }, + { + "epoch": 0.63, + "learning_rate": 6.4012924681963255e-06, + "loss": 0.4821, + "step": 5501 + }, + { + "epoch": 0.63, + "learning_rate": 6.397839064833647e-06, + "loss": 0.458, + "step": 5502 + }, + { + "epoch": 0.63, + "learning_rate": 6.394386155047443e-06, + "loss": 0.4543, + "step": 5503 + }, + { + "epoch": 0.63, + "learning_rate": 6.39093373931083e-06, + "loss": 0.4515, + "step": 5504 + }, + { + "epoch": 0.63, + "learning_rate": 6.387481818096877e-06, + "loss": 0.4509, + "step": 5505 + }, + { + "epoch": 0.63, + "learning_rate": 6.384030391878566e-06, + "loss": 0.4444, + "step": 5506 + }, + { + "epoch": 0.63, + "learning_rate": 6.38057946112882e-06, + "loss": 0.4718, + "step": 5507 + }, + { + "epoch": 0.63, + 
"learning_rate": 6.3771290263205e-06, + "loss": 0.4506, + "step": 5508 + }, + { + "epoch": 0.63, + "learning_rate": 6.373679087926388e-06, + "loss": 0.4686, + "step": 5509 + }, + { + "epoch": 0.63, + "learning_rate": 6.370229646419199e-06, + "loss": 0.4743, + "step": 5510 + }, + { + "epoch": 0.63, + "learning_rate": 6.366780702271589e-06, + "loss": 0.458, + "step": 5511 + }, + { + "epoch": 0.63, + "learning_rate": 6.363332255956136e-06, + "loss": 0.4525, + "step": 5512 + }, + { + "epoch": 0.63, + "learning_rate": 6.359884307945363e-06, + "loss": 0.4658, + "step": 5513 + }, + { + "epoch": 0.63, + "learning_rate": 6.356436858711708e-06, + "loss": 0.4587, + "step": 5514 + }, + { + "epoch": 0.63, + "learning_rate": 6.352989908727546e-06, + "loss": 0.4463, + "step": 5515 + }, + { + "epoch": 0.63, + "learning_rate": 6.349543458465193e-06, + "loss": 0.4761, + "step": 5516 + }, + { + "epoch": 0.63, + "learning_rate": 6.346097508396885e-06, + "loss": 0.4726, + "step": 5517 + }, + { + "epoch": 0.63, + "learning_rate": 6.3426520589947985e-06, + "loss": 0.4668, + "step": 5518 + }, + { + "epoch": 0.63, + "learning_rate": 6.339207110731036e-06, + "loss": 0.4606, + "step": 5519 + }, + { + "epoch": 0.63, + "learning_rate": 6.335762664077627e-06, + "loss": 0.454, + "step": 5520 + }, + { + "epoch": 0.63, + "learning_rate": 6.332318719506543e-06, + "loss": 0.469, + "step": 5521 + }, + { + "epoch": 0.63, + "learning_rate": 6.328875277489677e-06, + "loss": 0.4462, + "step": 5522 + }, + { + "epoch": 0.63, + "learning_rate": 6.325432338498865e-06, + "loss": 0.4572, + "step": 5523 + }, + { + "epoch": 0.63, + "learning_rate": 6.321989903005861e-06, + "loss": 0.4478, + "step": 5524 + }, + { + "epoch": 0.63, + "learning_rate": 6.318547971482352e-06, + "loss": 0.4604, + "step": 5525 + }, + { + "epoch": 0.63, + "learning_rate": 6.315106544399966e-06, + "loss": 0.4631, + "step": 5526 + }, + { + "epoch": 0.63, + "learning_rate": 6.311665622230254e-06, + "loss": 0.4559, + "step": 5527 + }, + { + 
"epoch": 0.63, + "learning_rate": 6.3082252054446955e-06, + "loss": 0.455, + "step": 5528 + }, + { + "epoch": 0.63, + "learning_rate": 6.304785294514709e-06, + "loss": 0.4549, + "step": 5529 + }, + { + "epoch": 0.63, + "learning_rate": 6.301345889911636e-06, + "loss": 0.4664, + "step": 5530 + }, + { + "epoch": 0.63, + "learning_rate": 6.297906992106755e-06, + "loss": 0.4549, + "step": 5531 + }, + { + "epoch": 0.63, + "learning_rate": 6.29446860157127e-06, + "loss": 0.4561, + "step": 5532 + }, + { + "epoch": 0.63, + "learning_rate": 6.291030718776313e-06, + "loss": 0.4534, + "step": 5533 + }, + { + "epoch": 0.63, + "learning_rate": 6.287593344192957e-06, + "loss": 0.4905, + "step": 5534 + }, + { + "epoch": 0.63, + "learning_rate": 6.284156478292196e-06, + "loss": 0.4596, + "step": 5535 + }, + { + "epoch": 0.63, + "learning_rate": 6.2807201215449584e-06, + "loss": 0.4457, + "step": 5536 + }, + { + "epoch": 0.63, + "learning_rate": 6.277284274422104e-06, + "loss": 0.4593, + "step": 5537 + }, + { + "epoch": 0.63, + "learning_rate": 6.273848937394413e-06, + "loss": 0.463, + "step": 5538 + }, + { + "epoch": 0.63, + "learning_rate": 6.270414110932611e-06, + "loss": 0.4587, + "step": 5539 + }, + { + "epoch": 0.63, + "learning_rate": 6.266979795507346e-06, + "loss": 0.4452, + "step": 5540 + }, + { + "epoch": 0.63, + "learning_rate": 6.2635459915891876e-06, + "loss": 0.4585, + "step": 5541 + }, + { + "epoch": 0.63, + "learning_rate": 6.260112699648653e-06, + "loss": 0.4637, + "step": 5542 + }, + { + "epoch": 0.63, + "learning_rate": 6.256679920156172e-06, + "loss": 0.4702, + "step": 5543 + }, + { + "epoch": 0.63, + "learning_rate": 6.253247653582119e-06, + "loss": 0.4526, + "step": 5544 + }, + { + "epoch": 0.63, + "learning_rate": 6.2498159003967896e-06, + "loss": 0.4575, + "step": 5545 + }, + { + "epoch": 0.63, + "learning_rate": 6.246384661070404e-06, + "loss": 0.4568, + "step": 5546 + }, + { + "epoch": 0.63, + "learning_rate": 6.2429539360731286e-06, + "loss": 0.4498, + 
"step": 5547 + }, + { + "epoch": 0.63, + "learning_rate": 6.239523725875041e-06, + "loss": 0.4679, + "step": 5548 + }, + { + "epoch": 0.63, + "learning_rate": 6.23609403094616e-06, + "loss": 0.4461, + "step": 5549 + }, + { + "epoch": 0.63, + "learning_rate": 6.232664851756434e-06, + "loss": 0.4776, + "step": 5550 + }, + { + "epoch": 0.63, + "learning_rate": 6.229236188775729e-06, + "loss": 0.4537, + "step": 5551 + }, + { + "epoch": 0.63, + "learning_rate": 6.225808042473857e-06, + "loss": 0.4517, + "step": 5552 + }, + { + "epoch": 0.63, + "learning_rate": 6.222380413320546e-06, + "loss": 0.4589, + "step": 5553 + }, + { + "epoch": 0.63, + "learning_rate": 6.218953301785453e-06, + "loss": 0.462, + "step": 5554 + }, + { + "epoch": 0.63, + "learning_rate": 6.2155267083381795e-06, + "loss": 0.4469, + "step": 5555 + }, + { + "epoch": 0.63, + "learning_rate": 6.212100633448237e-06, + "loss": 0.4665, + "step": 5556 + }, + { + "epoch": 0.64, + "learning_rate": 6.208675077585079e-06, + "loss": 0.4465, + "step": 5557 + }, + { + "epoch": 0.64, + "learning_rate": 6.2052500412180805e-06, + "loss": 0.4778, + "step": 5558 + }, + { + "epoch": 0.64, + "learning_rate": 6.201825524816545e-06, + "loss": 0.4497, + "step": 5559 + }, + { + "epoch": 0.64, + "learning_rate": 6.198401528849717e-06, + "loss": 0.4788, + "step": 5560 + }, + { + "epoch": 0.64, + "learning_rate": 6.194978053786749e-06, + "loss": 0.4548, + "step": 5561 + }, + { + "epoch": 0.64, + "learning_rate": 6.191555100096744e-06, + "loss": 0.4663, + "step": 5562 + }, + { + "epoch": 0.64, + "learning_rate": 6.188132668248716e-06, + "loss": 0.4589, + "step": 5563 + }, + { + "epoch": 0.64, + "learning_rate": 6.184710758711616e-06, + "loss": 0.4489, + "step": 5564 + }, + { + "epoch": 0.64, + "learning_rate": 6.181289371954327e-06, + "loss": 0.4458, + "step": 5565 + }, + { + "epoch": 0.64, + "learning_rate": 6.177868508445651e-06, + "loss": 0.4718, + "step": 5566 + }, + { + "epoch": 0.64, + "learning_rate": 6.174448168654317e-06, 
+ "loss": 0.4501, + "step": 5567 + }, + { + "epoch": 0.64, + "learning_rate": 6.171028353048996e-06, + "loss": 0.4906, + "step": 5568 + }, + { + "epoch": 0.64, + "learning_rate": 6.167609062098276e-06, + "loss": 0.4411, + "step": 5569 + }, + { + "epoch": 0.64, + "learning_rate": 6.164190296270683e-06, + "loss": 0.4616, + "step": 5570 + }, + { + "epoch": 0.64, + "learning_rate": 6.160772056034655e-06, + "loss": 0.4449, + "step": 5571 + }, + { + "epoch": 0.64, + "learning_rate": 6.157354341858568e-06, + "loss": 0.4601, + "step": 5572 + }, + { + "epoch": 0.64, + "learning_rate": 6.1539371542107295e-06, + "loss": 0.4472, + "step": 5573 + }, + { + "epoch": 0.64, + "learning_rate": 6.1505204935593665e-06, + "loss": 0.4845, + "step": 5574 + }, + { + "epoch": 0.64, + "learning_rate": 6.147104360372644e-06, + "loss": 0.4442, + "step": 5575 + }, + { + "epoch": 0.64, + "learning_rate": 6.1436887551186466e-06, + "loss": 0.4733, + "step": 5576 + }, + { + "epoch": 0.64, + "learning_rate": 6.14027367826538e-06, + "loss": 0.4424, + "step": 5577 + }, + { + "epoch": 0.64, + "learning_rate": 6.136859130280794e-06, + "loss": 0.4735, + "step": 5578 + }, + { + "epoch": 0.64, + "learning_rate": 6.133445111632761e-06, + "loss": 0.4515, + "step": 5579 + }, + { + "epoch": 0.64, + "learning_rate": 6.130031622789067e-06, + "loss": 0.4605, + "step": 5580 + }, + { + "epoch": 0.64, + "learning_rate": 6.126618664217448e-06, + "loss": 0.4499, + "step": 5581 + }, + { + "epoch": 0.64, + "learning_rate": 6.123206236385543e-06, + "loss": 0.4527, + "step": 5582 + }, + { + "epoch": 0.64, + "learning_rate": 6.119794339760941e-06, + "loss": 0.4556, + "step": 5583 + }, + { + "epoch": 0.64, + "learning_rate": 6.1163829748111466e-06, + "loss": 0.4661, + "step": 5584 + }, + { + "epoch": 0.64, + "learning_rate": 6.112972142003587e-06, + "loss": 0.45, + "step": 5585 + }, + { + "epoch": 0.64, + "learning_rate": 6.109561841805629e-06, + "loss": 0.4588, + "step": 5586 + }, + { + "epoch": 0.64, + "learning_rate": 
6.106152074684556e-06, + "loss": 0.4475, + "step": 5587 + }, + { + "epoch": 0.64, + "learning_rate": 6.102742841107585e-06, + "loss": 0.4708, + "step": 5588 + }, + { + "epoch": 0.64, + "learning_rate": 6.099334141541856e-06, + "loss": 0.4573, + "step": 5589 + }, + { + "epoch": 0.64, + "learning_rate": 6.095925976454433e-06, + "loss": 0.4513, + "step": 5590 + }, + { + "epoch": 0.64, + "learning_rate": 6.092518346312317e-06, + "loss": 0.4378, + "step": 5591 + }, + { + "epoch": 0.64, + "learning_rate": 6.089111251582427e-06, + "loss": 0.4679, + "step": 5592 + }, + { + "epoch": 0.64, + "learning_rate": 6.085704692731609e-06, + "loss": 0.4659, + "step": 5593 + }, + { + "epoch": 0.64, + "learning_rate": 6.082298670226642e-06, + "loss": 0.4478, + "step": 5594 + }, + { + "epoch": 0.64, + "learning_rate": 6.0788931845342205e-06, + "loss": 0.452, + "step": 5595 + }, + { + "epoch": 0.64, + "learning_rate": 6.075488236120978e-06, + "loss": 0.4487, + "step": 5596 + }, + { + "epoch": 0.64, + "learning_rate": 6.0720838254534675e-06, + "loss": 0.4547, + "step": 5597 + }, + { + "epoch": 0.64, + "learning_rate": 6.068679952998167e-06, + "loss": 0.4457, + "step": 5598 + }, + { + "epoch": 0.64, + "learning_rate": 6.065276619221485e-06, + "loss": 0.4448, + "step": 5599 + }, + { + "epoch": 0.64, + "learning_rate": 6.061873824589751e-06, + "loss": 0.4378, + "step": 5600 + }, + { + "epoch": 0.64, + "learning_rate": 6.058471569569228e-06, + "loss": 0.462, + "step": 5601 + }, + { + "epoch": 0.64, + "learning_rate": 6.055069854626102e-06, + "loss": 0.4632, + "step": 5602 + }, + { + "epoch": 0.64, + "learning_rate": 6.051668680226477e-06, + "loss": 0.4506, + "step": 5603 + }, + { + "epoch": 0.64, + "learning_rate": 6.0482680468363964e-06, + "loss": 0.4816, + "step": 5604 + }, + { + "epoch": 0.64, + "learning_rate": 6.044867954921818e-06, + "loss": 0.4275, + "step": 5605 + }, + { + "epoch": 0.64, + "learning_rate": 6.0414684049486335e-06, + "loss": 0.4369, + "step": 5606 + }, + { + "epoch": 
0.64, + "learning_rate": 6.0380693973826595e-06, + "loss": 0.4646, + "step": 5607 + }, + { + "epoch": 0.64, + "learning_rate": 6.034670932689629e-06, + "loss": 0.4391, + "step": 5608 + }, + { + "epoch": 0.64, + "learning_rate": 6.031273011335215e-06, + "loss": 0.4563, + "step": 5609 + }, + { + "epoch": 0.64, + "learning_rate": 6.027875633785003e-06, + "loss": 0.4534, + "step": 5610 + }, + { + "epoch": 0.64, + "learning_rate": 6.024478800504509e-06, + "loss": 0.4575, + "step": 5611 + }, + { + "epoch": 0.64, + "learning_rate": 6.0210825119591806e-06, + "loss": 0.4483, + "step": 5612 + }, + { + "epoch": 0.64, + "learning_rate": 6.0176867686143795e-06, + "loss": 0.446, + "step": 5613 + }, + { + "epoch": 0.64, + "learning_rate": 6.014291570935405e-06, + "loss": 0.448, + "step": 5614 + }, + { + "epoch": 0.64, + "learning_rate": 6.0108969193874675e-06, + "loss": 0.4926, + "step": 5615 + }, + { + "epoch": 0.64, + "learning_rate": 6.007502814435713e-06, + "loss": 0.4475, + "step": 5616 + }, + { + "epoch": 0.64, + "learning_rate": 6.0041092565452135e-06, + "loss": 0.4339, + "step": 5617 + }, + { + "epoch": 0.64, + "learning_rate": 6.000716246180953e-06, + "loss": 0.4616, + "step": 5618 + }, + { + "epoch": 0.64, + "learning_rate": 5.9973237838078625e-06, + "loss": 0.474, + "step": 5619 + }, + { + "epoch": 0.64, + "learning_rate": 5.993931869890774e-06, + "loss": 0.4726, + "step": 5620 + }, + { + "epoch": 0.64, + "learning_rate": 5.9905405048944575e-06, + "loss": 0.464, + "step": 5621 + }, + { + "epoch": 0.64, + "learning_rate": 5.987149689283614e-06, + "loss": 0.4486, + "step": 5622 + }, + { + "epoch": 0.64, + "learning_rate": 5.983759423522852e-06, + "loss": 0.4448, + "step": 5623 + }, + { + "epoch": 0.64, + "learning_rate": 5.980369708076713e-06, + "loss": 0.4598, + "step": 5624 + }, + { + "epoch": 0.64, + "learning_rate": 5.976980543409669e-06, + "loss": 0.4511, + "step": 5625 + }, + { + "epoch": 0.64, + "learning_rate": 5.973591929986108e-06, + "loss": 0.4508, + "step": 
5626 + }, + { + "epoch": 0.64, + "learning_rate": 5.97020386827035e-06, + "loss": 0.4723, + "step": 5627 + }, + { + "epoch": 0.64, + "learning_rate": 5.966816358726633e-06, + "loss": 0.4761, + "step": 5628 + }, + { + "epoch": 0.64, + "learning_rate": 5.9634294018191145e-06, + "loss": 0.4594, + "step": 5629 + }, + { + "epoch": 0.64, + "learning_rate": 5.960042998011892e-06, + "loss": 0.4682, + "step": 5630 + }, + { + "epoch": 0.64, + "learning_rate": 5.9566571477689735e-06, + "loss": 0.4535, + "step": 5631 + }, + { + "epoch": 0.64, + "learning_rate": 5.953271851554303e-06, + "loss": 0.4446, + "step": 5632 + }, + { + "epoch": 0.64, + "learning_rate": 5.949887109831736e-06, + "loss": 0.4797, + "step": 5633 + }, + { + "epoch": 0.64, + "learning_rate": 5.946502923065054e-06, + "loss": 0.4445, + "step": 5634 + }, + { + "epoch": 0.64, + "learning_rate": 5.943119291717974e-06, + "loss": 0.4538, + "step": 5635 + }, + { + "epoch": 0.64, + "learning_rate": 5.939736216254126e-06, + "loss": 0.4487, + "step": 5636 + }, + { + "epoch": 0.64, + "learning_rate": 5.936353697137063e-06, + "loss": 0.4681, + "step": 5637 + }, + { + "epoch": 0.64, + "learning_rate": 5.932971734830273e-06, + "loss": 0.4663, + "step": 5638 + }, + { + "epoch": 0.64, + "learning_rate": 5.929590329797154e-06, + "loss": 0.4624, + "step": 5639 + }, + { + "epoch": 0.64, + "learning_rate": 5.926209482501037e-06, + "loss": 0.4462, + "step": 5640 + }, + { + "epoch": 0.64, + "learning_rate": 5.9228291934051754e-06, + "loss": 0.4564, + "step": 5641 + }, + { + "epoch": 0.64, + "learning_rate": 5.919449462972737e-06, + "loss": 0.4501, + "step": 5642 + }, + { + "epoch": 0.64, + "learning_rate": 5.916070291666831e-06, + "loss": 0.4547, + "step": 5643 + }, + { + "epoch": 0.65, + "learning_rate": 5.9126916799504685e-06, + "loss": 0.4662, + "step": 5644 + }, + { + "epoch": 0.65, + "learning_rate": 5.9093136282866014e-06, + "loss": 0.4753, + "step": 5645 + }, + { + "epoch": 0.65, + "learning_rate": 5.9059361371381e-06, + 
"loss": 0.4517, + "step": 5646 + }, + { + "epoch": 0.65, + "learning_rate": 5.9025592069677475e-06, + "loss": 0.4549, + "step": 5647 + }, + { + "epoch": 0.65, + "learning_rate": 5.899182838238265e-06, + "loss": 0.4504, + "step": 5648 + }, + { + "epoch": 0.65, + "learning_rate": 5.895807031412293e-06, + "loss": 0.4433, + "step": 5649 + }, + { + "epoch": 0.65, + "learning_rate": 5.892431786952384e-06, + "loss": 0.453, + "step": 5650 + }, + { + "epoch": 0.65, + "learning_rate": 5.8890571053210295e-06, + "loss": 0.4681, + "step": 5651 + }, + { + "epoch": 0.65, + "learning_rate": 5.88568298698063e-06, + "loss": 0.4705, + "step": 5652 + }, + { + "epoch": 0.65, + "learning_rate": 5.88230943239352e-06, + "loss": 0.4575, + "step": 5653 + }, + { + "epoch": 0.65, + "learning_rate": 5.878936442021952e-06, + "loss": 0.4592, + "step": 5654 + }, + { + "epoch": 0.65, + "learning_rate": 5.875564016328096e-06, + "loss": 0.457, + "step": 5655 + }, + { + "epoch": 0.65, + "learning_rate": 5.872192155774056e-06, + "loss": 0.4668, + "step": 5656 + }, + { + "epoch": 0.65, + "learning_rate": 5.868820860821844e-06, + "loss": 0.4524, + "step": 5657 + }, + { + "epoch": 0.65, + "learning_rate": 5.8654501319334105e-06, + "loss": 0.4498, + "step": 5658 + }, + { + "epoch": 0.65, + "learning_rate": 5.862079969570619e-06, + "loss": 0.4785, + "step": 5659 + }, + { + "epoch": 0.65, + "learning_rate": 5.858710374195251e-06, + "loss": 0.4567, + "step": 5660 + }, + { + "epoch": 0.65, + "learning_rate": 5.855341346269026e-06, + "loss": 0.4633, + "step": 5661 + }, + { + "epoch": 0.65, + "learning_rate": 5.851972886253569e-06, + "loss": 0.4638, + "step": 5662 + }, + { + "epoch": 0.65, + "learning_rate": 5.848604994610434e-06, + "loss": 0.441, + "step": 5663 + }, + { + "epoch": 0.65, + "learning_rate": 5.845237671801103e-06, + "loss": 0.4802, + "step": 5664 + }, + { + "epoch": 0.65, + "learning_rate": 5.841870918286967e-06, + "loss": 0.4347, + "step": 5665 + }, + { + "epoch": 0.65, + "learning_rate": 
5.838504734529353e-06, + "loss": 0.4517, + "step": 5666 + }, + { + "epoch": 0.65, + "learning_rate": 5.835139120989503e-06, + "loss": 0.4683, + "step": 5667 + }, + { + "epoch": 0.65, + "learning_rate": 5.831774078128574e-06, + "loss": 0.4603, + "step": 5668 + }, + { + "epoch": 0.65, + "learning_rate": 5.828409606407659e-06, + "loss": 0.48, + "step": 5669 + }, + { + "epoch": 0.65, + "learning_rate": 5.825045706287762e-06, + "loss": 0.4576, + "step": 5670 + }, + { + "epoch": 0.65, + "learning_rate": 5.821682378229813e-06, + "loss": 0.4416, + "step": 5671 + }, + { + "epoch": 0.65, + "learning_rate": 5.818319622694668e-06, + "loss": 0.4558, + "step": 5672 + }, + { + "epoch": 0.65, + "learning_rate": 5.814957440143092e-06, + "loss": 0.4724, + "step": 5673 + }, + { + "epoch": 0.65, + "learning_rate": 5.811595831035786e-06, + "loss": 0.4637, + "step": 5674 + }, + { + "epoch": 0.65, + "learning_rate": 5.8082347958333625e-06, + "loss": 0.4548, + "step": 5675 + }, + { + "epoch": 0.65, + "learning_rate": 5.804874334996353e-06, + "loss": 0.4525, + "step": 5676 + }, + { + "epoch": 0.65, + "learning_rate": 5.801514448985226e-06, + "loss": 0.4845, + "step": 5677 + }, + { + "epoch": 0.65, + "learning_rate": 5.798155138260352e-06, + "loss": 0.4575, + "step": 5678 + }, + { + "epoch": 0.65, + "learning_rate": 5.794796403282035e-06, + "loss": 0.4511, + "step": 5679 + }, + { + "epoch": 0.65, + "learning_rate": 5.791438244510499e-06, + "loss": 0.4552, + "step": 5680 + }, + { + "epoch": 0.65, + "learning_rate": 5.788080662405881e-06, + "loss": 0.4433, + "step": 5681 + }, + { + "epoch": 0.65, + "learning_rate": 5.784723657428255e-06, + "loss": 0.4881, + "step": 5682 + }, + { + "epoch": 0.65, + "learning_rate": 5.781367230037592e-06, + "loss": 0.4417, + "step": 5683 + }, + { + "epoch": 0.65, + "learning_rate": 5.7780113806938095e-06, + "loss": 0.4435, + "step": 5684 + }, + { + "epoch": 0.65, + "learning_rate": 5.774656109856729e-06, + "loss": 0.4837, + "step": 5685 + }, + { + "epoch": 
0.65, + "learning_rate": 5.7713014179860925e-06, + "loss": 0.4599, + "step": 5686 + }, + { + "epoch": 0.65, + "learning_rate": 5.767947305541577e-06, + "loss": 0.459, + "step": 5687 + }, + { + "epoch": 0.65, + "learning_rate": 5.764593772982762e-06, + "loss": 0.4506, + "step": 5688 + }, + { + "epoch": 0.65, + "learning_rate": 5.76124082076916e-06, + "loss": 0.4723, + "step": 5689 + }, + { + "epoch": 0.65, + "learning_rate": 5.757888449360205e-06, + "loss": 0.4578, + "step": 5690 + }, + { + "epoch": 0.65, + "learning_rate": 5.754536659215239e-06, + "loss": 0.4623, + "step": 5691 + }, + { + "epoch": 0.65, + "learning_rate": 5.751185450793539e-06, + "loss": 0.4391, + "step": 5692 + }, + { + "epoch": 0.65, + "learning_rate": 5.747834824554293e-06, + "loss": 0.458, + "step": 5693 + }, + { + "epoch": 0.65, + "learning_rate": 5.744484780956605e-06, + "loss": 0.4721, + "step": 5694 + }, + { + "epoch": 0.65, + "learning_rate": 5.741135320459516e-06, + "loss": 0.4749, + "step": 5695 + }, + { + "epoch": 0.65, + "learning_rate": 5.737786443521968e-06, + "loss": 0.4534, + "step": 5696 + }, + { + "epoch": 0.65, + "learning_rate": 5.734438150602841e-06, + "loss": 0.449, + "step": 5697 + }, + { + "epoch": 0.65, + "learning_rate": 5.731090442160917e-06, + "loss": 0.4614, + "step": 5698 + }, + { + "epoch": 0.65, + "learning_rate": 5.727743318654911e-06, + "loss": 0.4712, + "step": 5699 + }, + { + "epoch": 0.65, + "learning_rate": 5.724396780543457e-06, + "loss": 0.4552, + "step": 5700 + }, + { + "epoch": 0.65, + "learning_rate": 5.721050828285097e-06, + "loss": 0.4427, + "step": 5701 + }, + { + "epoch": 0.65, + "learning_rate": 5.717705462338311e-06, + "loss": 0.4601, + "step": 5702 + }, + { + "epoch": 0.65, + "learning_rate": 5.714360683161484e-06, + "loss": 0.4697, + "step": 5703 + }, + { + "epoch": 0.65, + "learning_rate": 5.711016491212922e-06, + "loss": 0.4579, + "step": 5704 + }, + { + "epoch": 0.65, + "learning_rate": 5.707672886950859e-06, + "loss": 0.4585, + "step": 5705 + 
}, + { + "epoch": 0.65, + "learning_rate": 5.704329870833443e-06, + "loss": 0.4581, + "step": 5706 + }, + { + "epoch": 0.65, + "learning_rate": 5.700987443318737e-06, + "loss": 0.4484, + "step": 5707 + }, + { + "epoch": 0.65, + "learning_rate": 5.697645604864732e-06, + "loss": 0.4493, + "step": 5708 + }, + { + "epoch": 0.65, + "learning_rate": 5.694304355929333e-06, + "loss": 0.4515, + "step": 5709 + }, + { + "epoch": 0.65, + "learning_rate": 5.690963696970371e-06, + "loss": 0.4578, + "step": 5710 + }, + { + "epoch": 0.65, + "learning_rate": 5.687623628445588e-06, + "loss": 0.4852, + "step": 5711 + }, + { + "epoch": 0.65, + "learning_rate": 5.684284150812642e-06, + "loss": 0.4439, + "step": 5712 + }, + { + "epoch": 0.65, + "learning_rate": 5.680945264529125e-06, + "loss": 0.4691, + "step": 5713 + }, + { + "epoch": 0.65, + "learning_rate": 5.67760697005253e-06, + "loss": 0.4385, + "step": 5714 + }, + { + "epoch": 0.65, + "learning_rate": 5.674269267840287e-06, + "loss": 0.4582, + "step": 5715 + }, + { + "epoch": 0.65, + "learning_rate": 5.670932158349732e-06, + "loss": 0.4456, + "step": 5716 + }, + { + "epoch": 0.65, + "learning_rate": 5.667595642038117e-06, + "loss": 0.4779, + "step": 5717 + }, + { + "epoch": 0.65, + "learning_rate": 5.664259719362627e-06, + "loss": 0.4467, + "step": 5718 + }, + { + "epoch": 0.65, + "learning_rate": 5.660924390780359e-06, + "loss": 0.4604, + "step": 5719 + }, + { + "epoch": 0.65, + "learning_rate": 5.657589656748321e-06, + "loss": 0.4391, + "step": 5720 + }, + { + "epoch": 0.65, + "learning_rate": 5.654255517723452e-06, + "loss": 0.4577, + "step": 5721 + }, + { + "epoch": 0.65, + "learning_rate": 5.650921974162598e-06, + "loss": 0.45, + "step": 5722 + }, + { + "epoch": 0.65, + "learning_rate": 5.647589026522535e-06, + "loss": 0.4597, + "step": 5723 + }, + { + "epoch": 0.65, + "learning_rate": 5.644256675259949e-06, + "loss": 0.4548, + "step": 5724 + }, + { + "epoch": 0.65, + "learning_rate": 5.640924920831441e-06, + "loss": 0.4683, 
+ "step": 5725 + }, + { + "epoch": 0.65, + "learning_rate": 5.637593763693545e-06, + "loss": 0.4619, + "step": 5726 + }, + { + "epoch": 0.65, + "learning_rate": 5.634263204302694e-06, + "loss": 0.453, + "step": 5727 + }, + { + "epoch": 0.65, + "learning_rate": 5.630933243115255e-06, + "loss": 0.4557, + "step": 5728 + }, + { + "epoch": 0.65, + "learning_rate": 5.627603880587511e-06, + "loss": 0.4494, + "step": 5729 + }, + { + "epoch": 0.65, + "learning_rate": 5.624275117175649e-06, + "loss": 0.4481, + "step": 5730 + }, + { + "epoch": 0.65, + "learning_rate": 5.620946953335793e-06, + "loss": 0.4744, + "step": 5731 + }, + { + "epoch": 0.66, + "learning_rate": 5.617619389523973e-06, + "loss": 0.4303, + "step": 5732 + }, + { + "epoch": 0.66, + "learning_rate": 5.614292426196133e-06, + "loss": 0.4716, + "step": 5733 + }, + { + "epoch": 0.66, + "learning_rate": 5.610966063808152e-06, + "loss": 0.4592, + "step": 5734 + }, + { + "epoch": 0.66, + "learning_rate": 5.607640302815806e-06, + "loss": 0.4709, + "step": 5735 + }, + { + "epoch": 0.66, + "learning_rate": 5.6043151436748035e-06, + "loss": 0.4491, + "step": 5736 + }, + { + "epoch": 0.66, + "learning_rate": 5.600990586840768e-06, + "loss": 0.4589, + "step": 5737 + }, + { + "epoch": 0.66, + "learning_rate": 5.597666632769232e-06, + "loss": 0.4456, + "step": 5738 + }, + { + "epoch": 0.66, + "learning_rate": 5.594343281915658e-06, + "loss": 0.4658, + "step": 5739 + }, + { + "epoch": 0.66, + "learning_rate": 5.5910205347354114e-06, + "loss": 0.4628, + "step": 5740 + }, + { + "epoch": 0.66, + "learning_rate": 5.587698391683792e-06, + "loss": 0.433, + "step": 5741 + }, + { + "epoch": 0.66, + "learning_rate": 5.584376853216003e-06, + "loss": 0.4459, + "step": 5742 + }, + { + "epoch": 0.66, + "learning_rate": 5.581055919787165e-06, + "loss": 0.4464, + "step": 5743 + }, + { + "epoch": 0.66, + "learning_rate": 5.577735591852327e-06, + "loss": 0.4552, + "step": 5744 + }, + { + "epoch": 0.66, + "learning_rate": 
5.574415869866443e-06, + "loss": 0.4547, + "step": 5745 + }, + { + "epoch": 0.66, + "learning_rate": 5.571096754284389e-06, + "loss": 0.4586, + "step": 5746 + }, + { + "epoch": 0.66, + "learning_rate": 5.567778245560966e-06, + "loss": 0.5003, + "step": 5747 + }, + { + "epoch": 0.66, + "learning_rate": 5.564460344150873e-06, + "loss": 0.4509, + "step": 5748 + }, + { + "epoch": 0.66, + "learning_rate": 5.561143050508746e-06, + "loss": 0.4781, + "step": 5749 + }, + { + "epoch": 0.66, + "learning_rate": 5.5578263650891225e-06, + "loss": 0.4499, + "step": 5750 + }, + { + "epoch": 0.66, + "learning_rate": 5.554510288346459e-06, + "loss": 0.4647, + "step": 5751 + }, + { + "epoch": 0.66, + "learning_rate": 5.551194820735144e-06, + "loss": 0.4586, + "step": 5752 + }, + { + "epoch": 0.66, + "learning_rate": 5.547879962709457e-06, + "loss": 0.4501, + "step": 5753 + }, + { + "epoch": 0.66, + "learning_rate": 5.544565714723619e-06, + "loss": 0.4588, + "step": 5754 + }, + { + "epoch": 0.66, + "learning_rate": 5.541252077231746e-06, + "loss": 0.4566, + "step": 5755 + }, + { + "epoch": 0.66, + "learning_rate": 5.537939050687886e-06, + "loss": 0.4355, + "step": 5756 + }, + { + "epoch": 0.66, + "learning_rate": 5.534626635546e-06, + "loss": 0.4584, + "step": 5757 + }, + { + "epoch": 0.66, + "learning_rate": 5.53131483225996e-06, + "loss": 0.458, + "step": 5758 + }, + { + "epoch": 0.66, + "learning_rate": 5.528003641283552e-06, + "loss": 0.451, + "step": 5759 + }, + { + "epoch": 0.66, + "learning_rate": 5.524693063070492e-06, + "loss": 0.4604, + "step": 5760 + }, + { + "epoch": 0.66, + "learning_rate": 5.521383098074395e-06, + "loss": 0.4517, + "step": 5761 + }, + { + "epoch": 0.66, + "learning_rate": 5.5180737467488085e-06, + "loss": 0.4649, + "step": 5762 + }, + { + "epoch": 0.66, + "learning_rate": 5.514765009547181e-06, + "loss": 0.4876, + "step": 5763 + }, + { + "epoch": 0.66, + "learning_rate": 5.511456886922883e-06, + "loss": 0.4452, + "step": 5764 + }, + { + "epoch": 0.66, + 
"learning_rate": 5.508149379329204e-06, + "loss": 0.4783, + "step": 5765 + }, + { + "epoch": 0.66, + "learning_rate": 5.504842487219344e-06, + "loss": 0.4413, + "step": 5766 + }, + { + "epoch": 0.66, + "learning_rate": 5.5015362110464275e-06, + "loss": 0.4431, + "step": 5767 + }, + { + "epoch": 0.66, + "learning_rate": 5.4982305512634845e-06, + "loss": 0.4764, + "step": 5768 + }, + { + "epoch": 0.66, + "learning_rate": 5.4949255083234585e-06, + "loss": 0.4596, + "step": 5769 + }, + { + "epoch": 0.66, + "learning_rate": 5.491621082679224e-06, + "loss": 0.4597, + "step": 5770 + }, + { + "epoch": 0.66, + "learning_rate": 5.48831727478355e-06, + "loss": 0.4672, + "step": 5771 + }, + { + "epoch": 0.66, + "learning_rate": 5.4850140850891445e-06, + "loss": 0.4401, + "step": 5772 + }, + { + "epoch": 0.66, + "learning_rate": 5.481711514048609e-06, + "loss": 0.4626, + "step": 5773 + }, + { + "epoch": 0.66, + "learning_rate": 5.478409562114469e-06, + "loss": 0.444, + "step": 5774 + }, + { + "epoch": 0.66, + "learning_rate": 5.47510822973917e-06, + "loss": 0.4472, + "step": 5775 + }, + { + "epoch": 0.66, + "learning_rate": 5.4718075173750695e-06, + "loss": 0.4574, + "step": 5776 + }, + { + "epoch": 0.66, + "learning_rate": 5.4685074254744346e-06, + "loss": 0.4639, + "step": 5777 + }, + { + "epoch": 0.66, + "learning_rate": 5.465207954489454e-06, + "loss": 0.4587, + "step": 5778 + }, + { + "epoch": 0.66, + "learning_rate": 5.461909104872226e-06, + "loss": 0.4665, + "step": 5779 + }, + { + "epoch": 0.66, + "learning_rate": 5.458610877074773e-06, + "loss": 0.4562, + "step": 5780 + }, + { + "epoch": 0.66, + "learning_rate": 5.455313271549021e-06, + "loss": 0.4441, + "step": 5781 + }, + { + "epoch": 0.66, + "learning_rate": 5.452016288746813e-06, + "loss": 0.4498, + "step": 5782 + }, + { + "epoch": 0.66, + "learning_rate": 5.448719929119916e-06, + "loss": 0.4597, + "step": 5783 + }, + { + "epoch": 0.66, + "learning_rate": 5.445424193119997e-06, + "loss": 0.4465, + "step": 5784 + }, 
+ { + "epoch": 0.66, + "learning_rate": 5.44212908119865e-06, + "loss": 0.442, + "step": 5785 + }, + { + "epoch": 0.66, + "learning_rate": 5.4388345938073824e-06, + "loss": 0.441, + "step": 5786 + }, + { + "epoch": 0.66, + "learning_rate": 5.435540731397606e-06, + "loss": 0.4598, + "step": 5787 + }, + { + "epoch": 0.66, + "learning_rate": 5.432247494420659e-06, + "loss": 0.4435, + "step": 5788 + }, + { + "epoch": 0.66, + "learning_rate": 5.4289548833277865e-06, + "loss": 0.475, + "step": 5789 + }, + { + "epoch": 0.66, + "learning_rate": 5.425662898570144e-06, + "loss": 0.453, + "step": 5790 + }, + { + "epoch": 0.66, + "learning_rate": 5.422371540598816e-06, + "loss": 0.4684, + "step": 5791 + }, + { + "epoch": 0.66, + "learning_rate": 5.419080809864785e-06, + "loss": 0.4355, + "step": 5792 + }, + { + "epoch": 0.66, + "learning_rate": 5.415790706818958e-06, + "loss": 0.4451, + "step": 5793 + }, + { + "epoch": 0.66, + "learning_rate": 5.412501231912153e-06, + "loss": 0.472, + "step": 5794 + }, + { + "epoch": 0.66, + "learning_rate": 5.409212385595098e-06, + "loss": 0.4648, + "step": 5795 + }, + { + "epoch": 0.66, + "learning_rate": 5.405924168318446e-06, + "loss": 0.4761, + "step": 5796 + }, + { + "epoch": 0.66, + "learning_rate": 5.4026365805327455e-06, + "loss": 0.4499, + "step": 5797 + }, + { + "epoch": 0.66, + "learning_rate": 5.399349622688479e-06, + "loss": 0.4524, + "step": 5798 + }, + { + "epoch": 0.66, + "learning_rate": 5.39606329523603e-06, + "loss": 0.4547, + "step": 5799 + }, + { + "epoch": 0.66, + "learning_rate": 5.392777598625694e-06, + "loss": 0.4504, + "step": 5800 + }, + { + "epoch": 0.66, + "learning_rate": 5.389492533307692e-06, + "loss": 0.4597, + "step": 5801 + }, + { + "epoch": 0.66, + "learning_rate": 5.386208099732144e-06, + "loss": 0.461, + "step": 5802 + }, + { + "epoch": 0.66, + "learning_rate": 5.382924298349095e-06, + "loss": 0.4603, + "step": 5803 + }, + { + "epoch": 0.66, + "learning_rate": 5.379641129608501e-06, + "loss": 0.474, + 
"step": 5804 + }, + { + "epoch": 0.66, + "learning_rate": 5.3763585939602244e-06, + "loss": 0.4792, + "step": 5805 + }, + { + "epoch": 0.66, + "learning_rate": 5.373076691854054e-06, + "loss": 0.448, + "step": 5806 + }, + { + "epoch": 0.66, + "learning_rate": 5.3697954237396764e-06, + "loss": 0.4556, + "step": 5807 + }, + { + "epoch": 0.66, + "learning_rate": 5.366514790066697e-06, + "loss": 0.4481, + "step": 5808 + }, + { + "epoch": 0.66, + "learning_rate": 5.363234791284644e-06, + "loss": 0.4715, + "step": 5809 + }, + { + "epoch": 0.66, + "learning_rate": 5.3599554278429415e-06, + "loss": 0.4523, + "step": 5810 + }, + { + "epoch": 0.66, + "learning_rate": 5.356676700190944e-06, + "loss": 0.483, + "step": 5811 + }, + { + "epoch": 0.66, + "learning_rate": 5.353398608777901e-06, + "loss": 0.4732, + "step": 5812 + }, + { + "epoch": 0.66, + "learning_rate": 5.35012115405299e-06, + "loss": 0.4547, + "step": 5813 + }, + { + "epoch": 0.66, + "learning_rate": 5.3468443364653e-06, + "loss": 0.4648, + "step": 5814 + }, + { + "epoch": 0.66, + "learning_rate": 5.343568156463821e-06, + "loss": 0.4508, + "step": 5815 + }, + { + "epoch": 0.66, + "learning_rate": 5.3402926144974625e-06, + "loss": 0.4693, + "step": 5816 + }, + { + "epoch": 0.66, + "learning_rate": 5.337017711015052e-06, + "loss": 0.4573, + "step": 5817 + }, + { + "epoch": 0.66, + "learning_rate": 5.333743446465318e-06, + "loss": 0.4506, + "step": 5818 + }, + { + "epoch": 0.67, + "learning_rate": 5.330469821296916e-06, + "loss": 0.4474, + "step": 5819 + }, + { + "epoch": 0.67, + "learning_rate": 5.327196835958402e-06, + "loss": 0.4784, + "step": 5820 + }, + { + "epoch": 0.67, + "learning_rate": 5.323924490898242e-06, + "loss": 0.4562, + "step": 5821 + }, + { + "epoch": 0.67, + "learning_rate": 5.320652786564826e-06, + "loss": 0.4497, + "step": 5822 + }, + { + "epoch": 0.67, + "learning_rate": 5.31738172340645e-06, + "loss": 0.4454, + "step": 5823 + }, + { + "epoch": 0.67, + "learning_rate": 5.314111301871325e-06, + 
"loss": 0.4383, + "step": 5824 + }, + { + "epoch": 0.67, + "learning_rate": 5.3108415224075725e-06, + "loss": 0.4667, + "step": 5825 + }, + { + "epoch": 0.67, + "learning_rate": 5.307572385463218e-06, + "loss": 0.4501, + "step": 5826 + }, + { + "epoch": 0.67, + "learning_rate": 5.304303891486213e-06, + "loss": 0.4514, + "step": 5827 + }, + { + "epoch": 0.67, + "learning_rate": 5.301036040924412e-06, + "loss": 0.4677, + "step": 5828 + }, + { + "epoch": 0.67, + "learning_rate": 5.297768834225581e-06, + "loss": 0.4647, + "step": 5829 + }, + { + "epoch": 0.67, + "learning_rate": 5.294502271837405e-06, + "loss": 0.4455, + "step": 5830 + }, + { + "epoch": 0.67, + "learning_rate": 5.2912363542074695e-06, + "loss": 0.4584, + "step": 5831 + }, + { + "epoch": 0.67, + "learning_rate": 5.287971081783283e-06, + "loss": 0.4672, + "step": 5832 + }, + { + "epoch": 0.67, + "learning_rate": 5.284706455012263e-06, + "loss": 0.4464, + "step": 5833 + }, + { + "epoch": 0.67, + "learning_rate": 5.281442474341729e-06, + "loss": 0.4605, + "step": 5834 + }, + { + "epoch": 0.67, + "learning_rate": 5.278179140218928e-06, + "loss": 0.4518, + "step": 5835 + }, + { + "epoch": 0.67, + "learning_rate": 5.274916453091001e-06, + "loss": 0.4436, + "step": 5836 + }, + { + "epoch": 0.67, + "learning_rate": 5.271654413405016e-06, + "loss": 0.465, + "step": 5837 + }, + { + "epoch": 0.67, + "learning_rate": 5.268393021607944e-06, + "loss": 0.4715, + "step": 5838 + }, + { + "epoch": 0.67, + "learning_rate": 5.2651322781466606e-06, + "loss": 0.445, + "step": 5839 + }, + { + "epoch": 0.67, + "learning_rate": 5.261872183467972e-06, + "loss": 0.468, + "step": 5840 + }, + { + "epoch": 0.67, + "learning_rate": 5.258612738018574e-06, + "loss": 0.4424, + "step": 5841 + }, + { + "epoch": 0.67, + "learning_rate": 5.255353942245089e-06, + "loss": 0.4553, + "step": 5842 + }, + { + "epoch": 0.67, + "learning_rate": 5.252095796594046e-06, + "loss": 0.4456, + "step": 5843 + }, + { + "epoch": 0.67, + "learning_rate": 
5.2488383015118785e-06, + "loss": 0.4466, + "step": 5844 + }, + { + "epoch": 0.67, + "learning_rate": 5.2455814574449415e-06, + "loss": 0.456, + "step": 5845 + }, + { + "epoch": 0.67, + "learning_rate": 5.242325264839494e-06, + "loss": 0.4612, + "step": 5846 + }, + { + "epoch": 0.67, + "learning_rate": 5.239069724141701e-06, + "loss": 0.4419, + "step": 5847 + }, + { + "epoch": 0.67, + "learning_rate": 5.235814835797655e-06, + "loss": 0.4759, + "step": 5848 + }, + { + "epoch": 0.67, + "learning_rate": 5.232560600253336e-06, + "loss": 0.4619, + "step": 5849 + }, + { + "epoch": 0.67, + "learning_rate": 5.229307017954655e-06, + "loss": 0.4547, + "step": 5850 + }, + { + "epoch": 0.67, + "learning_rate": 5.226054089347428e-06, + "loss": 0.4444, + "step": 5851 + }, + { + "epoch": 0.67, + "learning_rate": 5.22280181487737e-06, + "loss": 0.4596, + "step": 5852 + }, + { + "epoch": 0.67, + "learning_rate": 5.219550194990124e-06, + "loss": 0.4476, + "step": 5853 + }, + { + "epoch": 0.67, + "learning_rate": 5.216299230131227e-06, + "loss": 0.4702, + "step": 5854 + }, + { + "epoch": 0.67, + "learning_rate": 5.21304892074614e-06, + "loss": 0.4289, + "step": 5855 + }, + { + "epoch": 0.67, + "learning_rate": 5.209799267280225e-06, + "loss": 0.4721, + "step": 5856 + }, + { + "epoch": 0.67, + "learning_rate": 5.206550270178754e-06, + "loss": 0.4441, + "step": 5857 + }, + { + "epoch": 0.67, + "learning_rate": 5.20330192988692e-06, + "loss": 0.4478, + "step": 5858 + }, + { + "epoch": 0.67, + "learning_rate": 5.2000542468498085e-06, + "loss": 0.4388, + "step": 5859 + }, + { + "epoch": 0.67, + "learning_rate": 5.19680722151243e-06, + "loss": 0.456, + "step": 5860 + }, + { + "epoch": 0.67, + "learning_rate": 5.1935608543197035e-06, + "loss": 0.4623, + "step": 5861 + }, + { + "epoch": 0.67, + "learning_rate": 5.1903151457164445e-06, + "loss": 0.4772, + "step": 5862 + }, + { + "epoch": 0.67, + "learning_rate": 5.187070096147397e-06, + "loss": 0.4432, + "step": 5863 + }, + { + "epoch": 0.67, 
+ "learning_rate": 5.183825706057199e-06, + "loss": 0.4853, + "step": 5864 + }, + { + "epoch": 0.67, + "learning_rate": 5.180581975890404e-06, + "loss": 0.4542, + "step": 5865 + }, + { + "epoch": 0.67, + "learning_rate": 5.177338906091481e-06, + "loss": 0.4507, + "step": 5866 + }, + { + "epoch": 0.67, + "learning_rate": 5.1740964971047945e-06, + "loss": 0.4507, + "step": 5867 + }, + { + "epoch": 0.67, + "learning_rate": 5.1708547493746376e-06, + "loss": 0.4551, + "step": 5868 + }, + { + "epoch": 0.67, + "learning_rate": 5.16761366334519e-06, + "loss": 0.4536, + "step": 5869 + }, + { + "epoch": 0.67, + "learning_rate": 5.164373239460561e-06, + "loss": 0.4742, + "step": 5870 + }, + { + "epoch": 0.67, + "learning_rate": 5.161133478164764e-06, + "loss": 0.4438, + "step": 5871 + }, + { + "epoch": 0.67, + "learning_rate": 5.157894379901711e-06, + "loss": 0.4696, + "step": 5872 + }, + { + "epoch": 0.67, + "learning_rate": 5.154655945115233e-06, + "loss": 0.4426, + "step": 5873 + }, + { + "epoch": 0.67, + "learning_rate": 5.15141817424907e-06, + "loss": 0.447, + "step": 5874 + }, + { + "epoch": 0.67, + "learning_rate": 5.148181067746862e-06, + "loss": 0.4607, + "step": 5875 + }, + { + "epoch": 0.67, + "learning_rate": 5.144944626052178e-06, + "loss": 0.4548, + "step": 5876 + }, + { + "epoch": 0.67, + "learning_rate": 5.141708849608473e-06, + "loss": 0.4623, + "step": 5877 + }, + { + "epoch": 0.67, + "learning_rate": 5.138473738859118e-06, + "loss": 0.4585, + "step": 5878 + }, + { + "epoch": 0.67, + "learning_rate": 5.1352392942474005e-06, + "loss": 0.4618, + "step": 5879 + }, + { + "epoch": 0.67, + "learning_rate": 5.132005516216512e-06, + "loss": 0.4381, + "step": 5880 + }, + { + "epoch": 0.67, + "learning_rate": 5.128772405209556e-06, + "loss": 0.4473, + "step": 5881 + }, + { + "epoch": 0.67, + "learning_rate": 5.1255399616695345e-06, + "loss": 0.4759, + "step": 5882 + }, + { + "epoch": 0.67, + "learning_rate": 5.122308186039364e-06, + "loss": 0.4472, + "step": 5883 + }, 
+ { + "epoch": 0.67, + "learning_rate": 5.119077078761875e-06, + "loss": 0.453, + "step": 5884 + }, + { + "epoch": 0.67, + "learning_rate": 5.115846640279798e-06, + "loss": 0.4527, + "step": 5885 + }, + { + "epoch": 0.67, + "learning_rate": 5.1126168710357735e-06, + "loss": 0.4394, + "step": 5886 + }, + { + "epoch": 0.67, + "learning_rate": 5.109387771472356e-06, + "loss": 0.4605, + "step": 5887 + }, + { + "epoch": 0.67, + "learning_rate": 5.106159342032e-06, + "loss": 0.4662, + "step": 5888 + }, + { + "epoch": 0.67, + "learning_rate": 5.102931583157074e-06, + "loss": 0.4503, + "step": 5889 + }, + { + "epoch": 0.67, + "learning_rate": 5.099704495289859e-06, + "loss": 0.4692, + "step": 5890 + }, + { + "epoch": 0.67, + "learning_rate": 5.096478078872528e-06, + "loss": 0.4399, + "step": 5891 + }, + { + "epoch": 0.67, + "learning_rate": 5.093252334347183e-06, + "loss": 0.4462, + "step": 5892 + }, + { + "epoch": 0.67, + "learning_rate": 5.09002726215581e-06, + "loss": 0.4574, + "step": 5893 + }, + { + "epoch": 0.67, + "learning_rate": 5.08680286274033e-06, + "loss": 0.4606, + "step": 5894 + }, + { + "epoch": 0.67, + "learning_rate": 5.083579136542548e-06, + "loss": 0.4525, + "step": 5895 + }, + { + "epoch": 0.67, + "learning_rate": 5.080356084004187e-06, + "loss": 0.4595, + "step": 5896 + }, + { + "epoch": 0.67, + "learning_rate": 5.0771337055668826e-06, + "loss": 0.4541, + "step": 5897 + }, + { + "epoch": 0.67, + "learning_rate": 5.073912001672165e-06, + "loss": 0.4641, + "step": 5898 + }, + { + "epoch": 0.67, + "learning_rate": 5.070690972761484e-06, + "loss": 0.4504, + "step": 5899 + }, + { + "epoch": 0.67, + "learning_rate": 5.067470619276196e-06, + "loss": 0.4616, + "step": 5900 + }, + { + "epoch": 0.67, + "learning_rate": 5.064250941657555e-06, + "loss": 0.4527, + "step": 5901 + }, + { + "epoch": 0.67, + "learning_rate": 5.061031940346734e-06, + "loss": 0.4591, + "step": 5902 + }, + { + "epoch": 0.67, + "learning_rate": 5.057813615784806e-06, + "loss": 0.4393, + 
"step": 5903 + }, + { + "epoch": 0.67, + "learning_rate": 5.054595968412748e-06, + "loss": 0.4637, + "step": 5904 + }, + { + "epoch": 0.67, + "learning_rate": 5.051378998671459e-06, + "loss": 0.4544, + "step": 5905 + }, + { + "epoch": 0.67, + "learning_rate": 5.048162707001727e-06, + "loss": 0.4612, + "step": 5906 + }, + { + "epoch": 0.68, + "learning_rate": 5.044947093844259e-06, + "loss": 0.4416, + "step": 5907 + }, + { + "epoch": 0.68, + "learning_rate": 5.0417321596396715e-06, + "loss": 0.4594, + "step": 5908 + }, + { + "epoch": 0.68, + "learning_rate": 5.038517904828473e-06, + "loss": 0.4519, + "step": 5909 + }, + { + "epoch": 0.68, + "learning_rate": 5.035304329851096e-06, + "loss": 0.4565, + "step": 5910 + }, + { + "epoch": 0.68, + "learning_rate": 5.032091435147867e-06, + "loss": 0.4487, + "step": 5911 + }, + { + "epoch": 0.68, + "learning_rate": 5.028879221159025e-06, + "loss": 0.4597, + "step": 5912 + }, + { + "epoch": 0.68, + "learning_rate": 5.025667688324718e-06, + "loss": 0.4687, + "step": 5913 + }, + { + "epoch": 0.68, + "learning_rate": 5.02245683708499e-06, + "loss": 0.4769, + "step": 5914 + }, + { + "epoch": 0.68, + "learning_rate": 5.0192466678798116e-06, + "loss": 0.4546, + "step": 5915 + }, + { + "epoch": 0.68, + "learning_rate": 5.016037181149036e-06, + "loss": 0.4698, + "step": 5916 + }, + { + "epoch": 0.68, + "learning_rate": 5.012828377332438e-06, + "loss": 0.447, + "step": 5917 + }, + { + "epoch": 0.68, + "learning_rate": 5.009620256869703e-06, + "loss": 0.4466, + "step": 5918 + }, + { + "epoch": 0.68, + "learning_rate": 5.0064128202004025e-06, + "loss": 0.4612, + "step": 5919 + }, + { + "epoch": 0.68, + "learning_rate": 5.003206067764039e-06, + "loss": 0.4515, + "step": 5920 + }, + { + "epoch": 0.68, + "learning_rate": 5.000000000000003e-06, + "loss": 0.4684, + "step": 5921 + }, + { + "epoch": 0.68, + "learning_rate": 4.996794617347593e-06, + "loss": 0.4616, + "step": 5922 + }, + { + "epoch": 0.68, + "learning_rate": 
4.993589920246028e-06, + "loss": 0.4691, + "step": 5923 + }, + { + "epoch": 0.68, + "learning_rate": 4.9903859091344175e-06, + "loss": 0.46, + "step": 5924 + }, + { + "epoch": 0.68, + "learning_rate": 4.987182584451778e-06, + "loss": 0.4516, + "step": 5925 + }, + { + "epoch": 0.68, + "learning_rate": 4.983979946637043e-06, + "loss": 0.4487, + "step": 5926 + }, + { + "epoch": 0.68, + "learning_rate": 4.980777996129043e-06, + "loss": 0.4312, + "step": 5927 + }, + { + "epoch": 0.68, + "learning_rate": 4.977576733366521e-06, + "loss": 0.4613, + "step": 5928 + }, + { + "epoch": 0.68, + "learning_rate": 4.974376158788119e-06, + "loss": 0.4487, + "step": 5929 + }, + { + "epoch": 0.68, + "learning_rate": 4.971176272832382e-06, + "loss": 0.4573, + "step": 5930 + }, + { + "epoch": 0.68, + "learning_rate": 4.967977075937774e-06, + "loss": 0.4519, + "step": 5931 + }, + { + "epoch": 0.68, + "learning_rate": 4.964778568542649e-06, + "loss": 0.4419, + "step": 5932 + }, + { + "epoch": 0.68, + "learning_rate": 4.9615807510852795e-06, + "loss": 0.448, + "step": 5933 + }, + { + "epoch": 0.68, + "learning_rate": 4.958383624003836e-06, + "loss": 0.4687, + "step": 5934 + }, + { + "epoch": 0.68, + "learning_rate": 4.955187187736393e-06, + "loss": 0.4446, + "step": 5935 + }, + { + "epoch": 0.68, + "learning_rate": 4.951991442720937e-06, + "loss": 0.4657, + "step": 5936 + }, + { + "epoch": 0.68, + "learning_rate": 4.948796389395355e-06, + "loss": 0.4441, + "step": 5937 + }, + { + "epoch": 0.68, + "learning_rate": 4.945602028197447e-06, + "loss": 0.4799, + "step": 5938 + }, + { + "epoch": 0.68, + "learning_rate": 4.942408359564906e-06, + "loss": 0.4605, + "step": 5939 + }, + { + "epoch": 0.68, + "learning_rate": 4.939215383935331e-06, + "loss": 0.4591, + "step": 5940 + }, + { + "epoch": 0.68, + "learning_rate": 4.936023101746242e-06, + "loss": 0.4457, + "step": 5941 + }, + { + "epoch": 0.68, + "learning_rate": 4.932831513435045e-06, + "loss": 0.4516, + "step": 5942 + }, + { + "epoch": 0.68, 
+ "learning_rate": 4.929640619439059e-06, + "loss": 0.4394, + "step": 5943 + }, + { + "epoch": 0.68, + "learning_rate": 4.926450420195513e-06, + "loss": 0.4495, + "step": 5944 + }, + { + "epoch": 0.68, + "learning_rate": 4.92326091614153e-06, + "loss": 0.4513, + "step": 5945 + }, + { + "epoch": 0.68, + "learning_rate": 4.920072107714145e-06, + "loss": 0.4674, + "step": 5946 + }, + { + "epoch": 0.68, + "learning_rate": 4.916883995350299e-06, + "loss": 0.4625, + "step": 5947 + }, + { + "epoch": 0.68, + "learning_rate": 4.913696579486829e-06, + "loss": 0.4644, + "step": 5948 + }, + { + "epoch": 0.68, + "learning_rate": 4.91050986056049e-06, + "loss": 0.4591, + "step": 5949 + }, + { + "epoch": 0.68, + "learning_rate": 4.907323839007925e-06, + "loss": 0.4544, + "step": 5950 + }, + { + "epoch": 0.68, + "learning_rate": 4.904138515265696e-06, + "loss": 0.4634, + "step": 5951 + }, + { + "epoch": 0.68, + "learning_rate": 4.900953889770264e-06, + "loss": 0.4425, + "step": 5952 + }, + { + "epoch": 0.68, + "learning_rate": 4.897769962957986e-06, + "loss": 0.4416, + "step": 5953 + }, + { + "epoch": 0.68, + "learning_rate": 4.89458673526514e-06, + "loss": 0.4747, + "step": 5954 + }, + { + "epoch": 0.68, + "learning_rate": 4.891404207127892e-06, + "loss": 0.4554, + "step": 5955 + }, + { + "epoch": 0.68, + "learning_rate": 4.888222378982323e-06, + "loss": 0.4564, + "step": 5956 + }, + { + "epoch": 0.68, + "learning_rate": 4.885041251264419e-06, + "loss": 0.4421, + "step": 5957 + }, + { + "epoch": 0.68, + "learning_rate": 4.881860824410056e-06, + "loss": 0.4374, + "step": 5958 + }, + { + "epoch": 0.68, + "learning_rate": 4.8786810988550326e-06, + "loss": 0.4528, + "step": 5959 + }, + { + "epoch": 0.68, + "learning_rate": 4.875502075035039e-06, + "loss": 0.474, + "step": 5960 + }, + { + "epoch": 0.68, + "learning_rate": 4.872323753385667e-06, + "loss": 0.4414, + "step": 5961 + }, + { + "epoch": 0.68, + "learning_rate": 4.869146134342426e-06, + "loss": 0.4653, + "step": 5962 + }, + { 
+ "epoch": 0.68, + "learning_rate": 4.8659692183407135e-06, + "loss": 0.4524, + "step": 5963 + }, + { + "epoch": 0.68, + "learning_rate": 4.862793005815841e-06, + "loss": 0.4981, + "step": 5964 + }, + { + "epoch": 0.68, + "learning_rate": 4.859617497203024e-06, + "loss": 0.4381, + "step": 5965 + }, + { + "epoch": 0.68, + "learning_rate": 4.856442692937372e-06, + "loss": 0.4612, + "step": 5966 + }, + { + "epoch": 0.68, + "learning_rate": 4.85326859345391e-06, + "loss": 0.4269, + "step": 5967 + }, + { + "epoch": 0.68, + "learning_rate": 4.850095199187559e-06, + "loss": 0.4602, + "step": 5968 + }, + { + "epoch": 0.68, + "learning_rate": 4.846922510573139e-06, + "loss": 0.4351, + "step": 5969 + }, + { + "epoch": 0.68, + "learning_rate": 4.843750528045387e-06, + "loss": 0.4577, + "step": 5970 + }, + { + "epoch": 0.68, + "learning_rate": 4.8405792520389275e-06, + "loss": 0.4465, + "step": 5971 + }, + { + "epoch": 0.68, + "learning_rate": 4.837408682988305e-06, + "loss": 0.4765, + "step": 5972 + }, + { + "epoch": 0.68, + "learning_rate": 4.83423882132795e-06, + "loss": 0.4499, + "step": 5973 + }, + { + "epoch": 0.68, + "learning_rate": 4.831069667492209e-06, + "loss": 0.4669, + "step": 5974 + }, + { + "epoch": 0.68, + "learning_rate": 4.8279012219153284e-06, + "loss": 0.479, + "step": 5975 + }, + { + "epoch": 0.68, + "learning_rate": 4.8247334850314495e-06, + "loss": 0.4517, + "step": 5976 + }, + { + "epoch": 0.68, + "learning_rate": 4.821566457274632e-06, + "loss": 0.4491, + "step": 5977 + }, + { + "epoch": 0.68, + "learning_rate": 4.818400139078824e-06, + "loss": 0.4552, + "step": 5978 + }, + { + "epoch": 0.68, + "learning_rate": 4.815234530877879e-06, + "loss": 0.4508, + "step": 5979 + }, + { + "epoch": 0.68, + "learning_rate": 4.812069633105563e-06, + "loss": 0.4679, + "step": 5980 + }, + { + "epoch": 0.68, + "learning_rate": 4.808905446195532e-06, + "loss": 0.4566, + "step": 5981 + }, + { + "epoch": 0.68, + "learning_rate": 4.80574197058135e-06, + "loss": 0.4555, + 
"step": 5982 + }, + { + "epoch": 0.68, + "learning_rate": 4.802579206696486e-06, + "loss": 0.4574, + "step": 5983 + }, + { + "epoch": 0.68, + "learning_rate": 4.7994171549743085e-06, + "loss": 0.4367, + "step": 5984 + }, + { + "epoch": 0.68, + "learning_rate": 4.796255815848094e-06, + "loss": 0.4542, + "step": 5985 + }, + { + "epoch": 0.68, + "learning_rate": 4.7930951897510126e-06, + "loss": 0.4746, + "step": 5986 + }, + { + "epoch": 0.68, + "learning_rate": 4.7899352771161355e-06, + "loss": 0.4405, + "step": 5987 + }, + { + "epoch": 0.68, + "learning_rate": 4.786776078376451e-06, + "loss": 0.4694, + "step": 5988 + }, + { + "epoch": 0.68, + "learning_rate": 4.783617593964831e-06, + "loss": 0.466, + "step": 5989 + }, + { + "epoch": 0.68, + "learning_rate": 4.7804598243140664e-06, + "loss": 0.4495, + "step": 5990 + }, + { + "epoch": 0.68, + "learning_rate": 4.777302769856838e-06, + "loss": 0.4656, + "step": 5991 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741464310257305e-06, + "loss": 0.4563, + "step": 5992 + }, + { + "epoch": 0.68, + "learning_rate": 4.770990808253234e-06, + "loss": 0.4475, + "step": 5993 + }, + { + "epoch": 0.69, + "learning_rate": 4.767835901971745e-06, + "loss": 0.4664, + "step": 5994 + }, + { + "epoch": 0.69, + "learning_rate": 4.764681712613547e-06, + "loss": 0.4534, + "step": 5995 + }, + { + "epoch": 0.69, + "learning_rate": 4.761528240610842e-06, + "loss": 0.4654, + "step": 5996 + }, + { + "epoch": 0.69, + "learning_rate": 4.758375486395721e-06, + "loss": 0.4525, + "step": 5997 + }, + { + "epoch": 0.69, + "learning_rate": 4.755223450400186e-06, + "loss": 0.4443, + "step": 5998 + }, + { + "epoch": 0.69, + "learning_rate": 4.752072133056135e-06, + "loss": 0.456, + "step": 5999 + }, + { + "epoch": 0.69, + "learning_rate": 4.748921534795365e-06, + "loss": 0.459, + "step": 6000 + }, + { + "epoch": 0.69, + "learning_rate": 4.745771656049584e-06, + "loss": 0.4575, + "step": 6001 + }, + { + "epoch": 0.69, + "learning_rate": 
4.742622497250389e-06, + "loss": 0.4503, + "step": 6002 + }, + { + "epoch": 0.69, + "learning_rate": 4.739474058829288e-06, + "loss": 0.451, + "step": 6003 + }, + { + "epoch": 0.69, + "learning_rate": 4.736326341217694e-06, + "loss": 0.4712, + "step": 6004 + }, + { + "epoch": 0.69, + "learning_rate": 4.7331793448469045e-06, + "loss": 0.4605, + "step": 6005 + }, + { + "epoch": 0.69, + "learning_rate": 4.730033070148135e-06, + "loss": 0.4639, + "step": 6006 + }, + { + "epoch": 0.69, + "learning_rate": 4.726887517552495e-06, + "loss": 0.4432, + "step": 6007 + }, + { + "epoch": 0.69, + "learning_rate": 4.723742687490988e-06, + "loss": 0.4674, + "step": 6008 + }, + { + "epoch": 0.69, + "learning_rate": 4.7205985803945375e-06, + "loss": 0.4835, + "step": 6009 + }, + { + "epoch": 0.69, + "learning_rate": 4.717455196693945e-06, + "loss": 0.4496, + "step": 6010 + }, + { + "epoch": 0.69, + "learning_rate": 4.7143125368199335e-06, + "loss": 0.4478, + "step": 6011 + }, + { + "epoch": 0.69, + "learning_rate": 4.71117060120311e-06, + "loss": 0.4502, + "step": 6012 + }, + { + "epoch": 0.69, + "learning_rate": 4.708029390273994e-06, + "loss": 0.4507, + "step": 6013 + }, + { + "epoch": 0.69, + "learning_rate": 4.704888904463003e-06, + "loss": 0.4554, + "step": 6014 + }, + { + "epoch": 0.69, + "learning_rate": 4.701749144200449e-06, + "loss": 0.4624, + "step": 6015 + }, + { + "epoch": 0.69, + "learning_rate": 4.698610109916556e-06, + "loss": 0.4803, + "step": 6016 + }, + { + "epoch": 0.69, + "learning_rate": 4.695471802041437e-06, + "loss": 0.4823, + "step": 6017 + }, + { + "epoch": 0.69, + "learning_rate": 4.692334221005108e-06, + "loss": 0.4486, + "step": 6018 + }, + { + "epoch": 0.69, + "learning_rate": 4.689197367237494e-06, + "loss": 0.4389, + "step": 6019 + }, + { + "epoch": 0.69, + "learning_rate": 4.686061241168406e-06, + "loss": 0.4453, + "step": 6020 + }, + { + "epoch": 0.69, + "learning_rate": 4.6829258432275685e-06, + "loss": 0.4557, + "step": 6021 + }, + { + "epoch": 
0.69, + "learning_rate": 4.679791173844604e-06, + "loss": 0.4706, + "step": 6022 + }, + { + "epoch": 0.69, + "learning_rate": 4.676657233449025e-06, + "loss": 0.4546, + "step": 6023 + }, + { + "epoch": 0.69, + "learning_rate": 4.673524022470259e-06, + "loss": 0.4494, + "step": 6024 + }, + { + "epoch": 0.69, + "learning_rate": 4.67039154133762e-06, + "loss": 0.4828, + "step": 6025 + }, + { + "epoch": 0.69, + "learning_rate": 4.667259790480327e-06, + "loss": 0.459, + "step": 6026 + }, + { + "epoch": 0.69, + "learning_rate": 4.664128770327506e-06, + "loss": 0.4288, + "step": 6027 + }, + { + "epoch": 0.69, + "learning_rate": 4.66099848130817e-06, + "loss": 0.4452, + "step": 6028 + }, + { + "epoch": 0.69, + "learning_rate": 4.657868923851244e-06, + "loss": 0.4451, + "step": 6029 + }, + { + "epoch": 0.69, + "learning_rate": 4.6547400983855415e-06, + "loss": 0.4561, + "step": 6030 + }, + { + "epoch": 0.69, + "learning_rate": 4.651612005339786e-06, + "loss": 0.461, + "step": 6031 + }, + { + "epoch": 0.69, + "learning_rate": 4.648484645142597e-06, + "loss": 0.4506, + "step": 6032 + }, + { + "epoch": 0.69, + "learning_rate": 4.645358018222486e-06, + "loss": 0.4559, + "step": 6033 + }, + { + "epoch": 0.69, + "learning_rate": 4.642232125007881e-06, + "loss": 0.4586, + "step": 6034 + }, + { + "epoch": 0.69, + "learning_rate": 4.639106965927093e-06, + "loss": 0.4874, + "step": 6035 + }, + { + "epoch": 0.69, + "learning_rate": 4.635982541408334e-06, + "loss": 0.454, + "step": 6036 + }, + { + "epoch": 0.69, + "learning_rate": 4.632858851879729e-06, + "loss": 0.4698, + "step": 6037 + }, + { + "epoch": 0.69, + "learning_rate": 4.629735897769289e-06, + "loss": 0.4706, + "step": 6038 + }, + { + "epoch": 0.69, + "learning_rate": 4.626613679504924e-06, + "loss": 0.4483, + "step": 6039 + }, + { + "epoch": 0.69, + "learning_rate": 4.623492197514453e-06, + "loss": 0.476, + "step": 6040 + }, + { + "epoch": 0.69, + "learning_rate": 4.620371452225587e-06, + "loss": 0.464, + "step": 6041 + }, 
+ { + "epoch": 0.69, + "learning_rate": 4.6172514440659435e-06, + "loss": 0.4384, + "step": 6042 + }, + { + "epoch": 0.69, + "learning_rate": 4.614132173463027e-06, + "loss": 0.4674, + "step": 6043 + }, + { + "epoch": 0.69, + "learning_rate": 4.611013640844245e-06, + "loss": 0.4489, + "step": 6044 + }, + { + "epoch": 0.69, + "learning_rate": 4.607895846636914e-06, + "loss": 0.4501, + "step": 6045 + }, + { + "epoch": 0.69, + "learning_rate": 4.604778791268233e-06, + "loss": 0.4437, + "step": 6046 + }, + { + "epoch": 0.69, + "learning_rate": 4.601662475165316e-06, + "loss": 0.4566, + "step": 6047 + }, + { + "epoch": 0.69, + "learning_rate": 4.598546898755164e-06, + "loss": 0.4739, + "step": 6048 + }, + { + "epoch": 0.69, + "learning_rate": 4.595432062464678e-06, + "loss": 0.454, + "step": 6049 + }, + { + "epoch": 0.69, + "learning_rate": 4.592317966720661e-06, + "loss": 0.432, + "step": 6050 + }, + { + "epoch": 0.69, + "learning_rate": 4.589204611949819e-06, + "loss": 0.4652, + "step": 6051 + }, + { + "epoch": 0.69, + "learning_rate": 4.5860919985787454e-06, + "loss": 0.4672, + "step": 6052 + }, + { + "epoch": 0.69, + "learning_rate": 4.582980127033943e-06, + "loss": 0.4432, + "step": 6053 + }, + { + "epoch": 0.69, + "learning_rate": 4.5798689977418e-06, + "loss": 0.451, + "step": 6054 + }, + { + "epoch": 0.69, + "learning_rate": 4.576758611128619e-06, + "loss": 0.4519, + "step": 6055 + }, + { + "epoch": 0.69, + "learning_rate": 4.573648967620589e-06, + "loss": 0.4571, + "step": 6056 + }, + { + "epoch": 0.69, + "learning_rate": 4.570540067643796e-06, + "loss": 0.4493, + "step": 6057 + }, + { + "epoch": 0.69, + "learning_rate": 4.567431911624236e-06, + "loss": 0.481, + "step": 6058 + }, + { + "epoch": 0.69, + "learning_rate": 4.56432449998779e-06, + "loss": 0.4614, + "step": 6059 + }, + { + "epoch": 0.69, + "learning_rate": 4.5612178331602445e-06, + "loss": 0.4561, + "step": 6060 + }, + { + "epoch": 0.69, + "learning_rate": 4.558111911567287e-06, + "loss": 0.4479, + 
"step": 6061 + }, + { + "epoch": 0.69, + "learning_rate": 4.55500673563449e-06, + "loss": 0.4753, + "step": 6062 + }, + { + "epoch": 0.69, + "learning_rate": 4.55190230578734e-06, + "loss": 0.4457, + "step": 6063 + }, + { + "epoch": 0.69, + "learning_rate": 4.54879862245121e-06, + "loss": 0.4726, + "step": 6064 + }, + { + "epoch": 0.69, + "learning_rate": 4.545695686051369e-06, + "loss": 0.4457, + "step": 6065 + }, + { + "epoch": 0.69, + "learning_rate": 4.542593497012996e-06, + "loss": 0.4729, + "step": 6066 + }, + { + "epoch": 0.69, + "learning_rate": 4.539492055761153e-06, + "loss": 0.4398, + "step": 6067 + }, + { + "epoch": 0.69, + "learning_rate": 4.536391362720816e-06, + "loss": 0.4609, + "step": 6068 + }, + { + "epoch": 0.69, + "learning_rate": 4.533291418316837e-06, + "loss": 0.4608, + "step": 6069 + }, + { + "epoch": 0.69, + "learning_rate": 4.530192222973987e-06, + "loss": 0.443, + "step": 6070 + }, + { + "epoch": 0.69, + "learning_rate": 4.527093777116925e-06, + "loss": 0.4677, + "step": 6071 + }, + { + "epoch": 0.69, + "learning_rate": 4.523996081170201e-06, + "loss": 0.4473, + "step": 6072 + }, + { + "epoch": 0.69, + "learning_rate": 4.520899135558276e-06, + "loss": 0.455, + "step": 6073 + }, + { + "epoch": 0.69, + "learning_rate": 4.5178029407054965e-06, + "loss": 0.4453, + "step": 6074 + }, + { + "epoch": 0.69, + "learning_rate": 4.514707497036107e-06, + "loss": 0.4705, + "step": 6075 + }, + { + "epoch": 0.69, + "learning_rate": 4.511612804974259e-06, + "loss": 0.4396, + "step": 6076 + }, + { + "epoch": 0.69, + "learning_rate": 4.508518864943989e-06, + "loss": 0.4703, + "step": 6077 + }, + { + "epoch": 0.69, + "learning_rate": 4.505425677369238e-06, + "loss": 0.4372, + "step": 6078 + }, + { + "epoch": 0.69, + "learning_rate": 4.5023332426738445e-06, + "loss": 0.4476, + "step": 6079 + }, + { + "epoch": 0.69, + "learning_rate": 4.4992415612815355e-06, + "loss": 0.4527, + "step": 6080 + }, + { + "epoch": 0.69, + "learning_rate": 4.496150633615947e-06, + 
"loss": 0.4683, + "step": 6081 + }, + { + "epoch": 0.7, + "learning_rate": 4.4930604601006025e-06, + "loss": 0.4637, + "step": 6082 + }, + { + "epoch": 0.7, + "learning_rate": 4.489971041158919e-06, + "loss": 0.4505, + "step": 6083 + }, + { + "epoch": 0.7, + "learning_rate": 4.486882377214226e-06, + "loss": 0.4505, + "step": 6084 + }, + { + "epoch": 0.7, + "learning_rate": 4.483794468689728e-06, + "loss": 0.4463, + "step": 6085 + }, + { + "epoch": 0.7, + "learning_rate": 4.480707316008549e-06, + "loss": 0.4404, + "step": 6086 + }, + { + "epoch": 0.7, + "learning_rate": 4.477620919593688e-06, + "loss": 0.4671, + "step": 6087 + }, + { + "epoch": 0.7, + "learning_rate": 4.474535279868055e-06, + "loss": 0.449, + "step": 6088 + }, + { + "epoch": 0.7, + "learning_rate": 4.4714503972544545e-06, + "loss": 0.4668, + "step": 6089 + }, + { + "epoch": 0.7, + "learning_rate": 4.4683662721755805e-06, + "loss": 0.4563, + "step": 6090 + }, + { + "epoch": 0.7, + "learning_rate": 4.465282905054025e-06, + "loss": 0.4432, + "step": 6091 + }, + { + "epoch": 0.7, + "learning_rate": 4.462200296312284e-06, + "loss": 0.4485, + "step": 6092 + }, + { + "epoch": 0.7, + "learning_rate": 4.459118446372736e-06, + "loss": 0.4552, + "step": 6093 + }, + { + "epoch": 0.7, + "learning_rate": 4.45603735565767e-06, + "loss": 0.4486, + "step": 6094 + }, + { + "epoch": 0.7, + "learning_rate": 4.4529570245892625e-06, + "loss": 0.4531, + "step": 6095 + }, + { + "epoch": 0.7, + "learning_rate": 4.449877453589584e-06, + "loss": 0.4359, + "step": 6096 + }, + { + "epoch": 0.7, + "learning_rate": 4.446798643080608e-06, + "loss": 0.4864, + "step": 6097 + }, + { + "epoch": 0.7, + "learning_rate": 4.443720593484198e-06, + "loss": 0.4502, + "step": 6098 + }, + { + "epoch": 0.7, + "learning_rate": 4.440643305222121e-06, + "loss": 0.4621, + "step": 6099 + }, + { + "epoch": 0.7, + "learning_rate": 4.43756677871603e-06, + "loss": 0.4472, + "step": 6100 + }, + { + "epoch": 0.7, + "learning_rate": 4.4344910143874755e-06, 
+ "loss": 0.4595, + "step": 6101 + }, + { + "epoch": 0.7, + "learning_rate": 4.431416012657912e-06, + "loss": 0.4382, + "step": 6102 + }, + { + "epoch": 0.7, + "learning_rate": 4.42834177394868e-06, + "loss": 0.4416, + "step": 6103 + }, + { + "epoch": 0.7, + "learning_rate": 4.425268298681015e-06, + "loss": 0.4432, + "step": 6104 + }, + { + "epoch": 0.7, + "learning_rate": 4.422195587276058e-06, + "loss": 0.4588, + "step": 6105 + }, + { + "epoch": 0.7, + "learning_rate": 4.419123640154834e-06, + "loss": 0.4484, + "step": 6106 + }, + { + "epoch": 0.7, + "learning_rate": 4.416052457738271e-06, + "loss": 0.4875, + "step": 6107 + }, + { + "epoch": 0.7, + "learning_rate": 4.412982040447193e-06, + "loss": 0.4492, + "step": 6108 + }, + { + "epoch": 0.7, + "learning_rate": 4.409912388702308e-06, + "loss": 0.4476, + "step": 6109 + }, + { + "epoch": 0.7, + "learning_rate": 4.406843502924235e-06, + "loss": 0.4601, + "step": 6110 + }, + { + "epoch": 0.7, + "learning_rate": 4.403775383533472e-06, + "loss": 0.4618, + "step": 6111 + }, + { + "epoch": 0.7, + "learning_rate": 4.400708030950428e-06, + "loss": 0.4439, + "step": 6112 + }, + { + "epoch": 0.7, + "learning_rate": 4.397641445595393e-06, + "loss": 0.4651, + "step": 6113 + }, + { + "epoch": 0.7, + "learning_rate": 4.394575627888558e-06, + "loss": 0.4394, + "step": 6114 + }, + { + "epoch": 0.7, + "learning_rate": 4.391510578250011e-06, + "loss": 0.4715, + "step": 6115 + }, + { + "epoch": 0.7, + "learning_rate": 4.388446297099728e-06, + "loss": 0.4585, + "step": 6116 + }, + { + "epoch": 0.7, + "learning_rate": 4.385382784857587e-06, + "loss": 0.4418, + "step": 6117 + }, + { + "epoch": 0.7, + "learning_rate": 4.382320041943361e-06, + "loss": 0.4464, + "step": 6118 + }, + { + "epoch": 0.7, + "learning_rate": 4.379258068776706e-06, + "loss": 0.4421, + "step": 6119 + }, + { + "epoch": 0.7, + "learning_rate": 4.3761968657771905e-06, + "loss": 0.4487, + "step": 6120 + }, + { + "epoch": 0.7, + "learning_rate": 
4.3731364333642615e-06, + "loss": 0.4634, + "step": 6121 + }, + { + "epoch": 0.7, + "learning_rate": 4.370076771957264e-06, + "loss": 0.4363, + "step": 6122 + }, + { + "epoch": 0.7, + "learning_rate": 4.367017881975446e-06, + "loss": 0.4684, + "step": 6123 + }, + { + "epoch": 0.7, + "learning_rate": 4.363959763837938e-06, + "loss": 0.4628, + "step": 6124 + }, + { + "epoch": 0.7, + "learning_rate": 4.360902417963777e-06, + "loss": 0.4726, + "step": 6125 + }, + { + "epoch": 0.7, + "learning_rate": 4.357845844771881e-06, + "loss": 0.4525, + "step": 6126 + }, + { + "epoch": 0.7, + "learning_rate": 4.354790044681072e-06, + "loss": 0.4496, + "step": 6127 + }, + { + "epoch": 0.7, + "learning_rate": 4.351735018110066e-06, + "loss": 0.4513, + "step": 6128 + }, + { + "epoch": 0.7, + "learning_rate": 4.348680765477463e-06, + "loss": 0.4627, + "step": 6129 + }, + { + "epoch": 0.7, + "learning_rate": 4.3456272872017725e-06, + "loss": 0.4446, + "step": 6130 + }, + { + "epoch": 0.7, + "learning_rate": 4.342574583701382e-06, + "loss": 0.4567, + "step": 6131 + }, + { + "epoch": 0.7, + "learning_rate": 4.33952265539458e-06, + "loss": 0.4456, + "step": 6132 + }, + { + "epoch": 0.7, + "learning_rate": 4.336471502699554e-06, + "loss": 0.4527, + "step": 6133 + }, + { + "epoch": 0.7, + "learning_rate": 4.333421126034374e-06, + "loss": 0.4668, + "step": 6134 + }, + { + "epoch": 0.7, + "learning_rate": 4.330371525817012e-06, + "loss": 0.4575, + "step": 6135 + }, + { + "epoch": 0.7, + "learning_rate": 4.327322702465335e-06, + "loss": 0.4421, + "step": 6136 + }, + { + "epoch": 0.7, + "learning_rate": 4.324274656397095e-06, + "loss": 0.4728, + "step": 6137 + }, + { + "epoch": 0.7, + "learning_rate": 4.321227388029947e-06, + "loss": 0.4593, + "step": 6138 + }, + { + "epoch": 0.7, + "learning_rate": 4.318180897781432e-06, + "loss": 0.4651, + "step": 6139 + }, + { + "epoch": 0.7, + "learning_rate": 4.315135186068984e-06, + "loss": 0.4454, + "step": 6140 + }, + { + "epoch": 0.7, + 
"learning_rate": 4.312090253309941e-06, + "loss": 0.4575, + "step": 6141 + }, + { + "epoch": 0.7, + "learning_rate": 4.309046099921518e-06, + "loss": 0.4456, + "step": 6142 + }, + { + "epoch": 0.7, + "learning_rate": 4.306002726320839e-06, + "loss": 0.4633, + "step": 6143 + }, + { + "epoch": 0.7, + "learning_rate": 4.302960132924909e-06, + "loss": 0.4532, + "step": 6144 + }, + { + "epoch": 0.7, + "learning_rate": 4.299918320150634e-06, + "loss": 0.4513, + "step": 6145 + }, + { + "epoch": 0.7, + "learning_rate": 4.296877288414815e-06, + "loss": 0.4538, + "step": 6146 + }, + { + "epoch": 0.7, + "learning_rate": 4.2938370381341355e-06, + "loss": 0.445, + "step": 6147 + }, + { + "epoch": 0.7, + "learning_rate": 4.290797569725175e-06, + "loss": 0.4534, + "step": 6148 + }, + { + "epoch": 0.7, + "learning_rate": 4.287758883604415e-06, + "loss": 0.4542, + "step": 6149 + }, + { + "epoch": 0.7, + "learning_rate": 4.284720980188216e-06, + "loss": 0.4479, + "step": 6150 + }, + { + "epoch": 0.7, + "learning_rate": 4.281683859892849e-06, + "loss": 0.4849, + "step": 6151 + }, + { + "epoch": 0.7, + "learning_rate": 4.278647523134459e-06, + "loss": 0.4355, + "step": 6152 + }, + { + "epoch": 0.7, + "learning_rate": 4.275611970329092e-06, + "loss": 0.4748, + "step": 6153 + }, + { + "epoch": 0.7, + "learning_rate": 4.272577201892688e-06, + "loss": 0.4573, + "step": 6154 + }, + { + "epoch": 0.7, + "learning_rate": 4.269543218241079e-06, + "loss": 0.4502, + "step": 6155 + }, + { + "epoch": 0.7, + "learning_rate": 4.266510019789993e-06, + "loss": 0.464, + "step": 6156 + }, + { + "epoch": 0.7, + "learning_rate": 4.26347760695504e-06, + "loss": 0.4589, + "step": 6157 + }, + { + "epoch": 0.7, + "learning_rate": 4.260445980151725e-06, + "loss": 0.4541, + "step": 6158 + }, + { + "epoch": 0.7, + "learning_rate": 4.257415139795458e-06, + "loss": 0.4498, + "step": 6159 + }, + { + "epoch": 0.7, + "learning_rate": 4.2543850863015266e-06, + "loss": 0.451, + "step": 6160 + }, + { + "epoch": 0.7, + 
"learning_rate": 4.2513558200851115e-06, + "loss": 0.4428, + "step": 6161 + }, + { + "epoch": 0.7, + "learning_rate": 4.248327341561298e-06, + "loss": 0.4627, + "step": 6162 + }, + { + "epoch": 0.7, + "learning_rate": 4.245299651145048e-06, + "loss": 0.4586, + "step": 6163 + }, + { + "epoch": 0.7, + "learning_rate": 4.242272749251228e-06, + "loss": 0.4486, + "step": 6164 + }, + { + "epoch": 0.7, + "learning_rate": 4.239246636294591e-06, + "loss": 0.4699, + "step": 6165 + }, + { + "epoch": 0.7, + "learning_rate": 4.236221312689777e-06, + "loss": 0.4538, + "step": 6166 + }, + { + "epoch": 0.7, + "learning_rate": 4.2331967788513295e-06, + "loss": 0.4742, + "step": 6167 + }, + { + "epoch": 0.7, + "learning_rate": 4.230173035193671e-06, + "loss": 0.4524, + "step": 6168 + }, + { + "epoch": 0.71, + "learning_rate": 4.227150082131128e-06, + "loss": 0.4548, + "step": 6169 + }, + { + "epoch": 0.71, + "learning_rate": 4.2241279200779105e-06, + "loss": 0.4454, + "step": 6170 + }, + { + "epoch": 0.71, + "learning_rate": 4.221106549448116e-06, + "loss": 0.4473, + "step": 6171 + }, + { + "epoch": 0.71, + "learning_rate": 4.21808597065575e-06, + "loss": 0.4565, + "step": 6172 + }, + { + "epoch": 0.71, + "learning_rate": 4.215066184114689e-06, + "loss": 0.4502, + "step": 6173 + }, + { + "epoch": 0.71, + "learning_rate": 4.212047190238716e-06, + "loss": 0.4569, + "step": 6174 + }, + { + "epoch": 0.71, + "learning_rate": 4.209028989441505e-06, + "loss": 0.4568, + "step": 6175 + }, + { + "epoch": 0.71, + "learning_rate": 4.2060115821366085e-06, + "loss": 0.4514, + "step": 6176 + }, + { + "epoch": 0.71, + "learning_rate": 4.202994968737487e-06, + "loss": 0.4507, + "step": 6177 + }, + { + "epoch": 0.71, + "learning_rate": 4.199979149657481e-06, + "loss": 0.4546, + "step": 6178 + }, + { + "epoch": 0.71, + "learning_rate": 4.196964125309818e-06, + "loss": 0.4484, + "step": 6179 + }, + { + "epoch": 0.71, + "learning_rate": 4.1939498961076345e-06, + "loss": 0.451, + "step": 6180 + }, + { + 
"epoch": 0.71, + "learning_rate": 4.190936462463937e-06, + "loss": 0.4628, + "step": 6181 + }, + { + "epoch": 0.71, + "learning_rate": 4.187923824791642e-06, + "loss": 0.4572, + "step": 6182 + }, + { + "epoch": 0.71, + "learning_rate": 4.184911983503541e-06, + "loss": 0.483, + "step": 6183 + }, + { + "epoch": 0.71, + "learning_rate": 4.1819009390123276e-06, + "loss": 0.4448, + "step": 6184 + }, + { + "epoch": 0.71, + "learning_rate": 4.178890691730585e-06, + "loss": 0.4624, + "step": 6185 + }, + { + "epoch": 0.71, + "learning_rate": 4.17588124207078e-06, + "loss": 0.4336, + "step": 6186 + }, + { + "epoch": 0.71, + "learning_rate": 4.172872590445273e-06, + "loss": 0.4823, + "step": 6187 + }, + { + "epoch": 0.71, + "learning_rate": 4.169864737266321e-06, + "loss": 0.4495, + "step": 6188 + }, + { + "epoch": 0.71, + "learning_rate": 4.166857682946061e-06, + "loss": 0.4592, + "step": 6189 + }, + { + "epoch": 0.71, + "learning_rate": 4.163851427896534e-06, + "loss": 0.4319, + "step": 6190 + }, + { + "epoch": 0.71, + "learning_rate": 4.160845972529656e-06, + "loss": 0.4667, + "step": 6191 + }, + { + "epoch": 0.71, + "learning_rate": 4.157841317257245e-06, + "loss": 0.4676, + "step": 6192 + }, + { + "epoch": 0.71, + "learning_rate": 4.154837462491012e-06, + "loss": 0.4332, + "step": 6193 + }, + { + "epoch": 0.71, + "learning_rate": 4.151834408642542e-06, + "loss": 0.4331, + "step": 6194 + }, + { + "epoch": 0.71, + "learning_rate": 4.148832156123329e-06, + "loss": 0.4461, + "step": 6195 + }, + { + "epoch": 0.71, + "learning_rate": 4.145830705344746e-06, + "loss": 0.4527, + "step": 6196 + }, + { + "epoch": 0.71, + "learning_rate": 4.142830056718052e-06, + "loss": 0.4627, + "step": 6197 + }, + { + "epoch": 0.71, + "learning_rate": 4.139830210654413e-06, + "loss": 0.4456, + "step": 6198 + }, + { + "epoch": 0.71, + "learning_rate": 4.136831167564867e-06, + "loss": 0.4483, + "step": 6199 + }, + { + "epoch": 0.71, + "learning_rate": 4.133832927860356e-06, + "loss": 0.4507, + 
"step": 6200 + }, + { + "epoch": 0.71, + "learning_rate": 4.130835491951699e-06, + "loss": 0.4712, + "step": 6201 + }, + { + "epoch": 0.71, + "learning_rate": 4.127838860249617e-06, + "loss": 0.4428, + "step": 6202 + }, + { + "epoch": 0.71, + "learning_rate": 4.124843033164716e-06, + "loss": 0.4576, + "step": 6203 + }, + { + "epoch": 0.71, + "learning_rate": 4.12184801110749e-06, + "loss": 0.4357, + "step": 6204 + }, + { + "epoch": 0.71, + "learning_rate": 4.11885379448832e-06, + "loss": 0.4641, + "step": 6205 + }, + { + "epoch": 0.71, + "learning_rate": 4.115860383717486e-06, + "loss": 0.4702, + "step": 6206 + }, + { + "epoch": 0.71, + "learning_rate": 4.1128677792051465e-06, + "loss": 0.4636, + "step": 6207 + }, + { + "epoch": 0.71, + "learning_rate": 4.109875981361363e-06, + "loss": 0.4425, + "step": 6208 + }, + { + "epoch": 0.71, + "learning_rate": 4.106884990596073e-06, + "loss": 0.4679, + "step": 6209 + }, + { + "epoch": 0.71, + "learning_rate": 4.103894807319106e-06, + "loss": 0.4445, + "step": 6210 + }, + { + "epoch": 0.71, + "learning_rate": 4.100905431940189e-06, + "loss": 0.4616, + "step": 6211 + }, + { + "epoch": 0.71, + "learning_rate": 4.097916864868932e-06, + "loss": 0.4389, + "step": 6212 + }, + { + "epoch": 0.71, + "learning_rate": 4.0949291065148375e-06, + "loss": 0.4391, + "step": 6213 + }, + { + "epoch": 0.71, + "learning_rate": 4.091942157287294e-06, + "loss": 0.4661, + "step": 6214 + }, + { + "epoch": 0.71, + "learning_rate": 4.088956017595575e-06, + "loss": 0.453, + "step": 6215 + }, + { + "epoch": 0.71, + "learning_rate": 4.085970687848857e-06, + "loss": 0.4618, + "step": 6216 + }, + { + "epoch": 0.71, + "learning_rate": 4.082986168456192e-06, + "loss": 0.4677, + "step": 6217 + }, + { + "epoch": 0.71, + "learning_rate": 4.080002459826523e-06, + "loss": 0.439, + "step": 6218 + }, + { + "epoch": 0.71, + "learning_rate": 4.077019562368691e-06, + "loss": 0.4778, + "step": 6219 + }, + { + "epoch": 0.71, + "learning_rate": 4.074037476491414e-06, + 
"loss": 0.4595, + "step": 6220 + }, + { + "epoch": 0.71, + "learning_rate": 4.071056202603305e-06, + "loss": 0.446, + "step": 6221 + }, + { + "epoch": 0.71, + "learning_rate": 4.0680757411128714e-06, + "loss": 0.4495, + "step": 6222 + }, + { + "epoch": 0.71, + "learning_rate": 4.0650960924284945e-06, + "loss": 0.4578, + "step": 6223 + }, + { + "epoch": 0.71, + "learning_rate": 4.06211725695846e-06, + "loss": 0.4525, + "step": 6224 + }, + { + "epoch": 0.71, + "learning_rate": 4.059139235110928e-06, + "loss": 0.4576, + "step": 6225 + }, + { + "epoch": 0.71, + "learning_rate": 4.056162027293962e-06, + "loss": 0.4491, + "step": 6226 + }, + { + "epoch": 0.71, + "learning_rate": 4.053185633915501e-06, + "loss": 0.4703, + "step": 6227 + }, + { + "epoch": 0.71, + "learning_rate": 4.050210055383373e-06, + "loss": 0.447, + "step": 6228 + }, + { + "epoch": 0.71, + "learning_rate": 4.047235292105308e-06, + "loss": 0.4667, + "step": 6229 + }, + { + "epoch": 0.71, + "learning_rate": 4.0442613444889065e-06, + "loss": 0.4357, + "step": 6230 + }, + { + "epoch": 0.71, + "learning_rate": 4.04128821294167e-06, + "loss": 0.455, + "step": 6231 + }, + { + "epoch": 0.71, + "learning_rate": 4.0383158978709865e-06, + "loss": 0.4835, + "step": 6232 + }, + { + "epoch": 0.71, + "learning_rate": 4.035344399684124e-06, + "loss": 0.4562, + "step": 6233 + }, + { + "epoch": 0.71, + "learning_rate": 4.032373718788248e-06, + "loss": 0.4429, + "step": 6234 + }, + { + "epoch": 0.71, + "learning_rate": 4.029403855590409e-06, + "loss": 0.4534, + "step": 6235 + }, + { + "epoch": 0.71, + "learning_rate": 4.026434810497538e-06, + "loss": 0.4701, + "step": 6236 + }, + { + "epoch": 0.71, + "learning_rate": 4.023466583916469e-06, + "loss": 0.4494, + "step": 6237 + }, + { + "epoch": 0.71, + "learning_rate": 4.020499176253907e-06, + "loss": 0.4445, + "step": 6238 + }, + { + "epoch": 0.71, + "learning_rate": 4.017532587916461e-06, + "loss": 0.4381, + "step": 6239 + }, + { + "epoch": 0.71, + "learning_rate": 
4.014566819310612e-06, + "loss": 0.4701, + "step": 6240 + }, + { + "epoch": 0.71, + "learning_rate": 4.011601870842739e-06, + "loss": 0.4726, + "step": 6241 + }, + { + "epoch": 0.71, + "learning_rate": 4.008637742919114e-06, + "loss": 0.4449, + "step": 6242 + }, + { + "epoch": 0.71, + "learning_rate": 4.005674435945881e-06, + "loss": 0.461, + "step": 6243 + }, + { + "epoch": 0.71, + "learning_rate": 4.0027119503290776e-06, + "loss": 0.4431, + "step": 6244 + }, + { + "epoch": 0.71, + "learning_rate": 3.999750286474637e-06, + "loss": 0.4566, + "step": 6245 + }, + { + "epoch": 0.71, + "learning_rate": 3.996789444788366e-06, + "loss": 0.4659, + "step": 6246 + }, + { + "epoch": 0.71, + "learning_rate": 3.993829425675974e-06, + "loss": 0.4216, + "step": 6247 + }, + { + "epoch": 0.71, + "learning_rate": 3.99087022954304e-06, + "loss": 0.4622, + "step": 6248 + }, + { + "epoch": 0.71, + "learning_rate": 3.987911856795047e-06, + "loss": 0.4577, + "step": 6249 + }, + { + "epoch": 0.71, + "learning_rate": 3.98495430783736e-06, + "loss": 0.4687, + "step": 6250 + }, + { + "epoch": 0.71, + "learning_rate": 3.981997583075222e-06, + "loss": 0.4482, + "step": 6251 + }, + { + "epoch": 0.71, + "learning_rate": 3.979041682913777e-06, + "loss": 0.4611, + "step": 6252 + }, + { + "epoch": 0.71, + "learning_rate": 3.976086607758047e-06, + "loss": 0.4587, + "step": 6253 + }, + { + "epoch": 0.71, + "learning_rate": 3.973132358012939e-06, + "loss": 0.4507, + "step": 6254 + }, + { + "epoch": 0.71, + "learning_rate": 3.970178934083259e-06, + "loss": 0.4569, + "step": 6255 + }, + { + "epoch": 0.71, + "learning_rate": 3.967226336373686e-06, + "loss": 0.4404, + "step": 6256 + }, + { + "epoch": 0.72, + "learning_rate": 3.964274565288792e-06, + "loss": 0.4412, + "step": 6257 + }, + { + "epoch": 0.72, + "learning_rate": 3.961323621233036e-06, + "loss": 0.465, + "step": 6258 + }, + { + "epoch": 0.72, + "learning_rate": 3.9583735046107655e-06, + "loss": 0.4492, + "step": 6259 + }, + { + "epoch": 0.72, 
+ "learning_rate": 3.9554242158262134e-06, + "loss": 0.4508, + "step": 6260 + }, + { + "epoch": 0.72, + "learning_rate": 3.952475755283497e-06, + "loss": 0.4388, + "step": 6261 + }, + { + "epoch": 0.72, + "learning_rate": 3.949528123386617e-06, + "loss": 0.4523, + "step": 6262 + }, + { + "epoch": 0.72, + "learning_rate": 3.94658132053947e-06, + "loss": 0.4702, + "step": 6263 + }, + { + "epoch": 0.72, + "learning_rate": 3.943635347145829e-06, + "loss": 0.4598, + "step": 6264 + }, + { + "epoch": 0.72, + "learning_rate": 3.940690203609364e-06, + "loss": 0.4392, + "step": 6265 + }, + { + "epoch": 0.72, + "learning_rate": 3.937745890333623e-06, + "loss": 0.4792, + "step": 6266 + }, + { + "epoch": 0.72, + "learning_rate": 3.934802407722038e-06, + "loss": 0.4307, + "step": 6267 + }, + { + "epoch": 0.72, + "learning_rate": 3.931859756177936e-06, + "loss": 0.4622, + "step": 6268 + }, + { + "epoch": 0.72, + "learning_rate": 3.928917936104529e-06, + "loss": 0.4523, + "step": 6269 + }, + { + "epoch": 0.72, + "learning_rate": 3.925976947904906e-06, + "loss": 0.4647, + "step": 6270 + }, + { + "epoch": 0.72, + "learning_rate": 3.923036791982053e-06, + "loss": 0.4302, + "step": 6271 + }, + { + "epoch": 0.72, + "learning_rate": 3.920097468738833e-06, + "loss": 0.4587, + "step": 6272 + }, + { + "epoch": 0.72, + "learning_rate": 3.917158978578003e-06, + "loss": 0.4541, + "step": 6273 + }, + { + "epoch": 0.72, + "learning_rate": 3.914221321902199e-06, + "loss": 0.4458, + "step": 6274 + }, + { + "epoch": 0.72, + "learning_rate": 3.911284499113943e-06, + "loss": 0.4601, + "step": 6275 + }, + { + "epoch": 0.72, + "learning_rate": 3.908348510615653e-06, + "loss": 0.4432, + "step": 6276 + }, + { + "epoch": 0.72, + "learning_rate": 3.905413356809615e-06, + "loss": 0.4498, + "step": 6277 + }, + { + "epoch": 0.72, + "learning_rate": 3.902479038098017e-06, + "loss": 0.4558, + "step": 6278 + }, + { + "epoch": 0.72, + "learning_rate": 3.899545554882927e-06, + "loss": 0.4669, + "step": 6279 + }, 
+ { + "epoch": 0.72, + "learning_rate": 3.896612907566294e-06, + "loss": 0.4479, + "step": 6280 + }, + { + "epoch": 0.72, + "learning_rate": 3.893681096549961e-06, + "loss": 0.4509, + "step": 6281 + }, + { + "epoch": 0.72, + "learning_rate": 3.890750122235645e-06, + "loss": 0.4552, + "step": 6282 + }, + { + "epoch": 0.72, + "learning_rate": 3.887819985024962e-06, + "loss": 0.4569, + "step": 6283 + }, + { + "epoch": 0.72, + "learning_rate": 3.884890685319402e-06, + "loss": 0.4695, + "step": 6284 + }, + { + "epoch": 0.72, + "learning_rate": 3.881962223520343e-06, + "loss": 0.4291, + "step": 6285 + }, + { + "epoch": 0.72, + "learning_rate": 3.879034600029054e-06, + "loss": 0.4662, + "step": 6286 + }, + { + "epoch": 0.72, + "learning_rate": 3.876107815246678e-06, + "loss": 0.4642, + "step": 6287 + }, + { + "epoch": 0.72, + "learning_rate": 3.873181869574256e-06, + "loss": 0.4396, + "step": 6288 + }, + { + "epoch": 0.72, + "learning_rate": 3.87025676341271e-06, + "loss": 0.4598, + "step": 6289 + }, + { + "epoch": 0.72, + "learning_rate": 3.867332497162836e-06, + "loss": 0.4541, + "step": 6290 + }, + { + "epoch": 0.72, + "learning_rate": 3.864409071225334e-06, + "loss": 0.4548, + "step": 6291 + }, + { + "epoch": 0.72, + "learning_rate": 3.861486486000771e-06, + "loss": 0.4845, + "step": 6292 + }, + { + "epoch": 0.72, + "learning_rate": 3.858564741889608e-06, + "loss": 0.451, + "step": 6293 + }, + { + "epoch": 0.72, + "learning_rate": 3.855643839292193e-06, + "loss": 0.4693, + "step": 6294 + }, + { + "epoch": 0.72, + "learning_rate": 3.852723778608748e-06, + "loss": 0.4537, + "step": 6295 + }, + { + "epoch": 0.72, + "learning_rate": 3.849804560239394e-06, + "loss": 0.4532, + "step": 6296 + }, + { + "epoch": 0.72, + "learning_rate": 3.846886184584122e-06, + "loss": 0.4356, + "step": 6297 + }, + { + "epoch": 0.72, + "learning_rate": 3.8439686520428185e-06, + "loss": 0.4561, + "step": 6298 + }, + { + "epoch": 0.72, + "learning_rate": 3.841051963015254e-06, + "loss": 0.4448, 
+ "step": 6299 + }, + { + "epoch": 0.72, + "learning_rate": 3.8381361179010755e-06, + "loss": 0.4528, + "step": 6300 + }, + { + "epoch": 0.72, + "learning_rate": 3.8352211170998165e-06, + "loss": 0.4597, + "step": 6301 + }, + { + "epoch": 0.72, + "learning_rate": 3.8323069610109046e-06, + "loss": 0.4527, + "step": 6302 + }, + { + "epoch": 0.72, + "learning_rate": 3.829393650033635e-06, + "loss": 0.4397, + "step": 6303 + }, + { + "epoch": 0.72, + "learning_rate": 3.8264811845672055e-06, + "loss": 0.4605, + "step": 6304 + }, + { + "epoch": 0.72, + "learning_rate": 3.823569565010682e-06, + "loss": 0.4474, + "step": 6305 + }, + { + "epoch": 0.72, + "learning_rate": 3.820658791763023e-06, + "loss": 0.4566, + "step": 6306 + }, + { + "epoch": 0.72, + "learning_rate": 3.817748865223075e-06, + "loss": 0.4645, + "step": 6307 + }, + { + "epoch": 0.72, + "learning_rate": 3.814839785789555e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 0.72, + "learning_rate": 3.811931553861078e-06, + "loss": 0.4435, + "step": 6309 + }, + { + "epoch": 0.72, + "learning_rate": 3.809024169836134e-06, + "loss": 0.4757, + "step": 6310 + }, + { + "epoch": 0.72, + "learning_rate": 3.8061176341130955e-06, + "loss": 0.4353, + "step": 6311 + }, + { + "epoch": 0.72, + "learning_rate": 3.803211947090232e-06, + "loss": 0.4753, + "step": 6312 + }, + { + "epoch": 0.72, + "learning_rate": 3.8003071091656806e-06, + "loss": 0.4374, + "step": 6313 + }, + { + "epoch": 0.72, + "learning_rate": 3.7974031207374685e-06, + "loss": 0.4449, + "step": 6314 + }, + { + "epoch": 0.72, + "learning_rate": 3.7944999822035077e-06, + "loss": 0.4608, + "step": 6315 + }, + { + "epoch": 0.72, + "learning_rate": 3.791597693961596e-06, + "loss": 0.4467, + "step": 6316 + }, + { + "epoch": 0.72, + "learning_rate": 3.788696256409412e-06, + "loss": 0.4641, + "step": 6317 + }, + { + "epoch": 0.72, + "learning_rate": 3.7857956699445164e-06, + "loss": 0.467, + "step": 6318 + }, + { + "epoch": 0.72, + "learning_rate": 
3.78289593496435e-06, + "loss": 0.448, + "step": 6319 + }, + { + "epoch": 0.72, + "learning_rate": 3.7799970518662477e-06, + "loss": 0.4555, + "step": 6320 + }, + { + "epoch": 0.72, + "learning_rate": 3.7770990210474147e-06, + "loss": 0.4534, + "step": 6321 + }, + { + "epoch": 0.72, + "learning_rate": 3.774201842904953e-06, + "loss": 0.4596, + "step": 6322 + }, + { + "epoch": 0.72, + "learning_rate": 3.771305517835837e-06, + "loss": 0.4588, + "step": 6323 + }, + { + "epoch": 0.72, + "learning_rate": 3.7684100462369244e-06, + "loss": 0.466, + "step": 6324 + }, + { + "epoch": 0.72, + "learning_rate": 3.765515428504963e-06, + "loss": 0.4445, + "step": 6325 + }, + { + "epoch": 0.72, + "learning_rate": 3.7626216650365833e-06, + "loss": 0.4649, + "step": 6326 + }, + { + "epoch": 0.72, + "learning_rate": 3.7597287562282892e-06, + "loss": 0.4526, + "step": 6327 + }, + { + "epoch": 0.72, + "learning_rate": 3.7568367024764794e-06, + "loss": 0.4537, + "step": 6328 + }, + { + "epoch": 0.72, + "learning_rate": 3.7539455041774255e-06, + "loss": 0.4432, + "step": 6329 + }, + { + "epoch": 0.72, + "learning_rate": 3.7510551617272907e-06, + "loss": 0.4636, + "step": 6330 + }, + { + "epoch": 0.72, + "learning_rate": 3.748165675522113e-06, + "loss": 0.4407, + "step": 6331 + }, + { + "epoch": 0.72, + "learning_rate": 3.7452770459578134e-06, + "loss": 0.4403, + "step": 6332 + }, + { + "epoch": 0.72, + "learning_rate": 3.742389273430208e-06, + "loss": 0.4471, + "step": 6333 + }, + { + "epoch": 0.72, + "learning_rate": 3.7395023583349755e-06, + "loss": 0.4537, + "step": 6334 + }, + { + "epoch": 0.72, + "learning_rate": 3.7366163010676937e-06, + "loss": 0.4709, + "step": 6335 + }, + { + "epoch": 0.72, + "learning_rate": 3.733731102023819e-06, + "loss": 0.4529, + "step": 6336 + }, + { + "epoch": 0.72, + "learning_rate": 3.730846761598682e-06, + "loss": 0.4427, + "step": 6337 + }, + { + "epoch": 0.72, + "learning_rate": 3.7279632801875076e-06, + "loss": 0.4683, + "step": 6338 + }, + { + 
"epoch": 0.72, + "learning_rate": 3.725080658185395e-06, + "loss": 0.4596, + "step": 6339 + }, + { + "epoch": 0.72, + "learning_rate": 3.7221988959873232e-06, + "loss": 0.4385, + "step": 6340 + }, + { + "epoch": 0.72, + "learning_rate": 3.7193179939881665e-06, + "loss": 0.4397, + "step": 6341 + }, + { + "epoch": 0.72, + "learning_rate": 3.716437952582663e-06, + "loss": 0.4779, + "step": 6342 + }, + { + "epoch": 0.72, + "learning_rate": 3.7135587721654533e-06, + "loss": 0.4569, + "step": 6343 + }, + { + "epoch": 0.73, + "learning_rate": 3.710680453131039e-06, + "loss": 0.4611, + "step": 6344 + }, + { + "epoch": 0.73, + "learning_rate": 3.7078029958738194e-06, + "loss": 0.4375, + "step": 6345 + }, + { + "epoch": 0.73, + "learning_rate": 3.7049264007880735e-06, + "loss": 0.4601, + "step": 6346 + }, + { + "epoch": 0.73, + "learning_rate": 3.7020506682679524e-06, + "loss": 0.4459, + "step": 6347 + }, + { + "epoch": 0.73, + "learning_rate": 3.699175798707502e-06, + "loss": 0.4453, + "step": 6348 + }, + { + "epoch": 0.73, + "learning_rate": 3.6963017925006407e-06, + "loss": 0.4557, + "step": 6349 + }, + { + "epoch": 0.73, + "learning_rate": 3.6934286500411675e-06, + "loss": 0.4617, + "step": 6350 + }, + { + "epoch": 0.73, + "learning_rate": 3.690556371722774e-06, + "loss": 0.46, + "step": 6351 + }, + { + "epoch": 0.73, + "learning_rate": 3.6876849579390205e-06, + "loss": 0.4603, + "step": 6352 + }, + { + "epoch": 0.73, + "learning_rate": 3.6848144090833602e-06, + "loss": 0.4403, + "step": 6353 + }, + { + "epoch": 0.73, + "learning_rate": 3.681944725549117e-06, + "loss": 0.4655, + "step": 6354 + }, + { + "epoch": 0.73, + "learning_rate": 3.6790759077295046e-06, + "loss": 0.4291, + "step": 6355 + }, + { + "epoch": 0.73, + "learning_rate": 3.676207956017618e-06, + "loss": 0.4618, + "step": 6356 + }, + { + "epoch": 0.73, + "learning_rate": 3.6733408708064265e-06, + "loss": 0.4499, + "step": 6357 + }, + { + "epoch": 0.73, + "learning_rate": 3.6704746524887835e-06, + "loss": 
0.5017, + "step": 6358 + }, + { + "epoch": 0.73, + "learning_rate": 3.667609301457431e-06, + "loss": 0.4465, + "step": 6359 + }, + { + "epoch": 0.73, + "learning_rate": 3.664744818104977e-06, + "loss": 0.4627, + "step": 6360 + }, + { + "epoch": 0.73, + "learning_rate": 3.6618812028239304e-06, + "loss": 0.4456, + "step": 6361 + }, + { + "epoch": 0.73, + "learning_rate": 3.65901845600666e-06, + "loss": 0.465, + "step": 6362 + }, + { + "epoch": 0.73, + "learning_rate": 3.656156578045431e-06, + "loss": 0.4447, + "step": 6363 + }, + { + "epoch": 0.73, + "learning_rate": 3.653295569332389e-06, + "loss": 0.4692, + "step": 6364 + }, + { + "epoch": 0.73, + "learning_rate": 3.650435430259548e-06, + "loss": 0.4425, + "step": 6365 + }, + { + "epoch": 0.73, + "learning_rate": 3.6475761612188177e-06, + "loss": 0.4569, + "step": 6366 + }, + { + "epoch": 0.73, + "learning_rate": 3.644717762601978e-06, + "loss": 0.4629, + "step": 6367 + }, + { + "epoch": 0.73, + "learning_rate": 3.6418602348006903e-06, + "loss": 0.4604, + "step": 6368 + }, + { + "epoch": 0.73, + "learning_rate": 3.639003578206508e-06, + "loss": 0.4635, + "step": 6369 + }, + { + "epoch": 0.73, + "learning_rate": 3.6361477932108513e-06, + "loss": 0.4692, + "step": 6370 + }, + { + "epoch": 0.73, + "learning_rate": 3.633292880205024e-06, + "loss": 0.4452, + "step": 6371 + }, + { + "epoch": 0.73, + "learning_rate": 3.630438839580217e-06, + "loss": 0.4524, + "step": 6372 + }, + { + "epoch": 0.73, + "learning_rate": 3.627585671727496e-06, + "loss": 0.4447, + "step": 6373 + }, + { + "epoch": 0.73, + "learning_rate": 3.6247333770378133e-06, + "loss": 0.4762, + "step": 6374 + }, + { + "epoch": 0.73, + "learning_rate": 3.6218819559019934e-06, + "loss": 0.4542, + "step": 6375 + }, + { + "epoch": 0.73, + "learning_rate": 3.6190314087107415e-06, + "loss": 0.4638, + "step": 6376 + }, + { + "epoch": 0.73, + "learning_rate": 3.6161817358546513e-06, + "loss": 0.4357, + "step": 6377 + }, + { + "epoch": 0.73, + "learning_rate": 
3.6133329377241866e-06, + "loss": 0.4745, + "step": 6378 + }, + { + "epoch": 0.73, + "learning_rate": 3.6104850147097035e-06, + "loss": 0.4524, + "step": 6379 + }, + { + "epoch": 0.73, + "learning_rate": 3.6076379672014263e-06, + "loss": 0.4682, + "step": 6380 + }, + { + "epoch": 0.73, + "learning_rate": 3.6047917955894606e-06, + "loss": 0.4454, + "step": 6381 + }, + { + "epoch": 0.73, + "learning_rate": 3.6019465002638e-06, + "loss": 0.4475, + "step": 6382 + }, + { + "epoch": 0.73, + "learning_rate": 3.5991020816143164e-06, + "loss": 0.452, + "step": 6383 + }, + { + "epoch": 0.73, + "learning_rate": 3.5962585400307504e-06, + "loss": 0.4758, + "step": 6384 + }, + { + "epoch": 0.73, + "learning_rate": 3.5934158759027405e-06, + "loss": 0.4591, + "step": 6385 + }, + { + "epoch": 0.73, + "learning_rate": 3.590574089619786e-06, + "loss": 0.4632, + "step": 6386 + }, + { + "epoch": 0.73, + "learning_rate": 3.587733181571282e-06, + "loss": 0.4452, + "step": 6387 + }, + { + "epoch": 0.73, + "learning_rate": 3.5848931521464947e-06, + "loss": 0.4504, + "step": 6388 + }, + { + "epoch": 0.73, + "learning_rate": 3.5820540017345663e-06, + "loss": 0.4659, + "step": 6389 + }, + { + "epoch": 0.73, + "learning_rate": 3.5792157307245313e-06, + "loss": 0.4513, + "step": 6390 + }, + { + "epoch": 0.73, + "learning_rate": 3.5763783395052887e-06, + "loss": 0.4511, + "step": 6391 + }, + { + "epoch": 0.73, + "learning_rate": 3.5735418284656287e-06, + "loss": 0.4506, + "step": 6392 + }, + { + "epoch": 0.73, + "learning_rate": 3.5707061979942205e-06, + "loss": 0.4609, + "step": 6393 + }, + { + "epoch": 0.73, + "learning_rate": 3.5678714484796006e-06, + "loss": 0.4443, + "step": 6394 + }, + { + "epoch": 0.73, + "learning_rate": 3.565037580310201e-06, + "loss": 0.4451, + "step": 6395 + }, + { + "epoch": 0.73, + "learning_rate": 3.56220459387432e-06, + "loss": 0.4531, + "step": 6396 + }, + { + "epoch": 0.73, + "learning_rate": 3.559372489560139e-06, + "loss": 0.4487, + "step": 6397 + }, + { + 
"epoch": 0.73, + "learning_rate": 3.5565412677557233e-06, + "loss": 0.4554, + "step": 6398 + }, + { + "epoch": 0.73, + "learning_rate": 3.553710928849009e-06, + "loss": 0.4365, + "step": 6399 + }, + { + "epoch": 0.73, + "learning_rate": 3.550881473227822e-06, + "loss": 0.4724, + "step": 6400 + }, + { + "epoch": 0.73, + "learning_rate": 3.548052901279854e-06, + "loss": 0.4517, + "step": 6401 + }, + { + "epoch": 0.73, + "learning_rate": 3.5452252133926856e-06, + "loss": 0.4618, + "step": 6402 + }, + { + "epoch": 0.73, + "learning_rate": 3.5423984099537754e-06, + "loss": 0.4458, + "step": 6403 + }, + { + "epoch": 0.73, + "learning_rate": 3.5395724913504546e-06, + "loss": 0.4444, + "step": 6404 + }, + { + "epoch": 0.73, + "learning_rate": 3.536747457969942e-06, + "loss": 0.4485, + "step": 6405 + }, + { + "epoch": 0.73, + "learning_rate": 3.5339233101993287e-06, + "loss": 0.4616, + "step": 6406 + }, + { + "epoch": 0.73, + "learning_rate": 3.5311000484255796e-06, + "loss": 0.4301, + "step": 6407 + }, + { + "epoch": 0.73, + "learning_rate": 3.5282776730355537e-06, + "loss": 0.4606, + "step": 6408 + }, + { + "epoch": 0.73, + "learning_rate": 3.5254561844159718e-06, + "loss": 0.4518, + "step": 6409 + }, + { + "epoch": 0.73, + "learning_rate": 3.5226355829534475e-06, + "loss": 0.4358, + "step": 6410 + }, + { + "epoch": 0.73, + "learning_rate": 3.51981586903446e-06, + "loss": 0.4838, + "step": 6411 + }, + { + "epoch": 0.73, + "learning_rate": 3.516997043045376e-06, + "loss": 0.4628, + "step": 6412 + }, + { + "epoch": 0.73, + "learning_rate": 3.5141791053724405e-06, + "loss": 0.4449, + "step": 6413 + }, + { + "epoch": 0.73, + "learning_rate": 3.5113620564017727e-06, + "loss": 0.4448, + "step": 6414 + }, + { + "epoch": 0.73, + "learning_rate": 3.5085458965193654e-06, + "loss": 0.4661, + "step": 6415 + }, + { + "epoch": 0.73, + "learning_rate": 3.5057306261111024e-06, + "loss": 0.4502, + "step": 6416 + }, + { + "epoch": 0.73, + "learning_rate": 3.502916245562733e-06, + "loss": 
0.4547, + "step": 6417 + }, + { + "epoch": 0.73, + "learning_rate": 3.5001027552598952e-06, + "loss": 0.4459, + "step": 6418 + }, + { + "epoch": 0.73, + "learning_rate": 3.4972901555880957e-06, + "loss": 0.4551, + "step": 6419 + }, + { + "epoch": 0.73, + "learning_rate": 3.4944784469327253e-06, + "loss": 0.4592, + "step": 6420 + }, + { + "epoch": 0.73, + "learning_rate": 3.491667629679054e-06, + "loss": 0.4369, + "step": 6421 + }, + { + "epoch": 0.73, + "learning_rate": 3.488857704212224e-06, + "loss": 0.4437, + "step": 6422 + }, + { + "epoch": 0.73, + "learning_rate": 3.4860486709172536e-06, + "loss": 0.4401, + "step": 6423 + }, + { + "epoch": 0.73, + "learning_rate": 3.4832405301790507e-06, + "loss": 0.4533, + "step": 6424 + }, + { + "epoch": 0.73, + "learning_rate": 3.4804332823823862e-06, + "loss": 0.4487, + "step": 6425 + }, + { + "epoch": 0.73, + "learning_rate": 3.477626927911921e-06, + "loss": 0.4677, + "step": 6426 + }, + { + "epoch": 0.73, + "learning_rate": 3.4748214671521875e-06, + "loss": 0.4409, + "step": 6427 + }, + { + "epoch": 0.73, + "learning_rate": 3.4720169004875914e-06, + "loss": 0.4481, + "step": 6428 + }, + { + "epoch": 0.73, + "learning_rate": 3.469213228302425e-06, + "loss": 0.4797, + "step": 6429 + }, + { + "epoch": 0.73, + "learning_rate": 3.466410450980854e-06, + "loss": 0.4376, + "step": 6430 + }, + { + "epoch": 0.73, + "learning_rate": 3.4636085689069244e-06, + "loss": 0.4519, + "step": 6431 + }, + { + "epoch": 0.74, + "learning_rate": 3.4608075824645524e-06, + "loss": 0.4359, + "step": 6432 + }, + { + "epoch": 0.74, + "learning_rate": 3.4580074920375352e-06, + "loss": 0.4585, + "step": 6433 + }, + { + "epoch": 0.74, + "learning_rate": 3.4552082980095514e-06, + "loss": 0.4479, + "step": 6434 + }, + { + "epoch": 0.74, + "learning_rate": 3.4524100007641517e-06, + "loss": 0.4467, + "step": 6435 + }, + { + "epoch": 0.74, + "learning_rate": 3.4496126006847596e-06, + "loss": 0.4351, + "step": 6436 + }, + { + "epoch": 0.74, + 
"learning_rate": 3.446816098154692e-06, + "loss": 0.4652, + "step": 6437 + }, + { + "epoch": 0.74, + "learning_rate": 3.4440204935571208e-06, + "loss": 0.443, + "step": 6438 + }, + { + "epoch": 0.74, + "learning_rate": 3.441225787275113e-06, + "loss": 0.4518, + "step": 6439 + }, + { + "epoch": 0.74, + "learning_rate": 3.4384319796916075e-06, + "loss": 0.4392, + "step": 6440 + }, + { + "epoch": 0.74, + "learning_rate": 3.435639071189413e-06, + "loss": 0.4458, + "step": 6441 + }, + { + "epoch": 0.74, + "learning_rate": 3.4328470621512257e-06, + "loss": 0.462, + "step": 6442 + }, + { + "epoch": 0.74, + "learning_rate": 3.430055952959607e-06, + "loss": 0.4441, + "step": 6443 + }, + { + "epoch": 0.74, + "learning_rate": 3.427265743997007e-06, + "loss": 0.4649, + "step": 6444 + }, + { + "epoch": 0.74, + "learning_rate": 3.4244764356457438e-06, + "loss": 0.4703, + "step": 6445 + }, + { + "epoch": 0.74, + "learning_rate": 3.4216880282880128e-06, + "loss": 0.4514, + "step": 6446 + }, + { + "epoch": 0.74, + "learning_rate": 3.4189005223058937e-06, + "loss": 0.4628, + "step": 6447 + }, + { + "epoch": 0.74, + "learning_rate": 3.416113918081331e-06, + "loss": 0.4485, + "step": 6448 + }, + { + "epoch": 0.74, + "learning_rate": 3.4133282159961535e-06, + "loss": 0.4489, + "step": 6449 + }, + { + "epoch": 0.74, + "learning_rate": 3.4105434164320695e-06, + "loss": 0.4353, + "step": 6450 + }, + { + "epoch": 0.74, + "learning_rate": 3.4077595197706505e-06, + "loss": 0.4739, + "step": 6451 + }, + { + "epoch": 0.74, + "learning_rate": 3.404976526393361e-06, + "loss": 0.4615, + "step": 6452 + }, + { + "epoch": 0.74, + "learning_rate": 3.4021944366815286e-06, + "loss": 0.4412, + "step": 6453 + }, + { + "epoch": 0.74, + "learning_rate": 3.399413251016359e-06, + "loss": 0.4565, + "step": 6454 + }, + { + "epoch": 0.74, + "learning_rate": 3.3966329697789424e-06, + "loss": 0.4467, + "step": 6455 + }, + { + "epoch": 0.74, + "learning_rate": 3.393853593350235e-06, + "loss": 0.4398, + "step": 
6456 + }, + { + "epoch": 0.74, + "learning_rate": 3.391075122111077e-06, + "loss": 0.4441, + "step": 6457 + }, + { + "epoch": 0.74, + "learning_rate": 3.3882975564421773e-06, + "loss": 0.4504, + "step": 6458 + }, + { + "epoch": 0.74, + "learning_rate": 3.3855208967241247e-06, + "loss": 0.46, + "step": 6459 + }, + { + "epoch": 0.74, + "learning_rate": 3.3827451433373904e-06, + "loss": 0.4562, + "step": 6460 + }, + { + "epoch": 0.74, + "learning_rate": 3.379970296662305e-06, + "loss": 0.468, + "step": 6461 + }, + { + "epoch": 0.74, + "learning_rate": 3.3771963570790924e-06, + "loss": 0.4444, + "step": 6462 + }, + { + "epoch": 0.74, + "learning_rate": 3.3744233249678403e-06, + "loss": 0.4594, + "step": 6463 + }, + { + "epoch": 0.74, + "learning_rate": 3.3716512007085133e-06, + "loss": 0.4506, + "step": 6464 + }, + { + "epoch": 0.74, + "learning_rate": 3.368879984680962e-06, + "loss": 0.4545, + "step": 6465 + }, + { + "epoch": 0.74, + "learning_rate": 3.366109677264895e-06, + "loss": 0.4571, + "step": 6466 + }, + { + "epoch": 0.74, + "learning_rate": 3.363340278839916e-06, + "loss": 0.4337, + "step": 6467 + }, + { + "epoch": 0.74, + "learning_rate": 3.3605717897854872e-06, + "loss": 0.4496, + "step": 6468 + }, + { + "epoch": 0.74, + "learning_rate": 3.357804210480955e-06, + "loss": 0.4677, + "step": 6469 + }, + { + "epoch": 0.74, + "learning_rate": 3.355037541305545e-06, + "loss": 0.441, + "step": 6470 + }, + { + "epoch": 0.74, + "learning_rate": 3.3522717826383477e-06, + "loss": 0.4474, + "step": 6471 + }, + { + "epoch": 0.74, + "learning_rate": 3.349506934858331e-06, + "loss": 0.4609, + "step": 6472 + }, + { + "epoch": 0.74, + "learning_rate": 3.3467429983443477e-06, + "loss": 0.4565, + "step": 6473 + }, + { + "epoch": 0.74, + "learning_rate": 3.3439799734751132e-06, + "loss": 0.4461, + "step": 6474 + }, + { + "epoch": 0.74, + "learning_rate": 3.3412178606292276e-06, + "loss": 0.465, + "step": 6475 + }, + { + "epoch": 0.74, + "learning_rate": 3.3384566601851574e-06, 
+ "loss": 0.4509, + "step": 6476 + }, + { + "epoch": 0.74, + "learning_rate": 3.3356963725212523e-06, + "loss": 0.4677, + "step": 6477 + }, + { + "epoch": 0.74, + "learning_rate": 3.3329369980157345e-06, + "loss": 0.4562, + "step": 6478 + }, + { + "epoch": 0.74, + "learning_rate": 3.330178537046699e-06, + "loss": 0.4441, + "step": 6479 + }, + { + "epoch": 0.74, + "learning_rate": 3.327420989992112e-06, + "loss": 0.4541, + "step": 6480 + }, + { + "epoch": 0.74, + "learning_rate": 3.3246643572298253e-06, + "loss": 0.466, + "step": 6481 + }, + { + "epoch": 0.74, + "learning_rate": 3.321908639137553e-06, + "loss": 0.4409, + "step": 6482 + }, + { + "epoch": 0.74, + "learning_rate": 3.3191538360928977e-06, + "loss": 0.447, + "step": 6483 + }, + { + "epoch": 0.74, + "learning_rate": 3.3163999484733232e-06, + "loss": 0.4702, + "step": 6484 + }, + { + "epoch": 0.74, + "learning_rate": 3.313646976656172e-06, + "loss": 0.4823, + "step": 6485 + }, + { + "epoch": 0.74, + "learning_rate": 3.3108949210186657e-06, + "loss": 0.4589, + "step": 6486 + }, + { + "epoch": 0.74, + "learning_rate": 3.308143781937898e-06, + "loss": 0.46, + "step": 6487 + }, + { + "epoch": 0.74, + "learning_rate": 3.305393559790838e-06, + "loss": 0.4426, + "step": 6488 + }, + { + "epoch": 0.74, + "learning_rate": 3.302644254954326e-06, + "loss": 0.4549, + "step": 6489 + }, + { + "epoch": 0.74, + "learning_rate": 3.299895867805074e-06, + "loss": 0.4496, + "step": 6490 + }, + { + "epoch": 0.74, + "learning_rate": 3.2971483987196783e-06, + "loss": 0.4512, + "step": 6491 + }, + { + "epoch": 0.74, + "learning_rate": 3.294401848074602e-06, + "loss": 0.4596, + "step": 6492 + }, + { + "epoch": 0.74, + "learning_rate": 3.2916562162461784e-06, + "loss": 0.4452, + "step": 6493 + }, + { + "epoch": 0.74, + "learning_rate": 3.288911503610629e-06, + "loss": 0.4427, + "step": 6494 + }, + { + "epoch": 0.74, + "learning_rate": 3.2861677105440335e-06, + "loss": 0.4639, + "step": 6495 + }, + { + "epoch": 0.74, + 
"learning_rate": 3.2834248374223556e-06, + "loss": 0.4535, + "step": 6496 + }, + { + "epoch": 0.74, + "learning_rate": 3.2806828846214324e-06, + "loss": 0.4548, + "step": 6497 + }, + { + "epoch": 0.74, + "learning_rate": 3.277941852516968e-06, + "loss": 0.4639, + "step": 6498 + }, + { + "epoch": 0.74, + "learning_rate": 3.2752017414845514e-06, + "loss": 0.4343, + "step": 6499 + }, + { + "epoch": 0.74, + "learning_rate": 3.2724625518996322e-06, + "loss": 0.449, + "step": 6500 + }, + { + "epoch": 0.74, + "learning_rate": 3.2697242841375452e-06, + "loss": 0.4611, + "step": 6501 + }, + { + "epoch": 0.74, + "learning_rate": 3.2669869385734943e-06, + "loss": 0.4584, + "step": 6502 + }, + { + "epoch": 0.74, + "learning_rate": 3.264250515582551e-06, + "loss": 0.4496, + "step": 6503 + }, + { + "epoch": 0.74, + "learning_rate": 3.2615150155396747e-06, + "loss": 0.4493, + "step": 6504 + }, + { + "epoch": 0.74, + "learning_rate": 3.258780438819681e-06, + "loss": 0.4551, + "step": 6505 + }, + { + "epoch": 0.74, + "learning_rate": 3.2560467857972744e-06, + "loss": 0.4449, + "step": 6506 + }, + { + "epoch": 0.74, + "learning_rate": 3.2533140568470266e-06, + "loss": 0.4602, + "step": 6507 + }, + { + "epoch": 0.74, + "learning_rate": 3.2505822523433785e-06, + "loss": 0.4392, + "step": 6508 + }, + { + "epoch": 0.74, + "learning_rate": 3.247851372660653e-06, + "loss": 0.4686, + "step": 6509 + }, + { + "epoch": 0.74, + "learning_rate": 3.2451214181730396e-06, + "loss": 0.4433, + "step": 6510 + }, + { + "epoch": 0.74, + "learning_rate": 3.2423923892545994e-06, + "loss": 0.461, + "step": 6511 + }, + { + "epoch": 0.74, + "learning_rate": 3.239664286279276e-06, + "loss": 0.4404, + "step": 6512 + }, + { + "epoch": 0.74, + "learning_rate": 3.2369371096208744e-06, + "loss": 0.4545, + "step": 6513 + }, + { + "epoch": 0.74, + "learning_rate": 3.2342108596530865e-06, + "loss": 0.44, + "step": 6514 + }, + { + "epoch": 0.74, + "learning_rate": 3.23148553674946e-06, + "loss": 0.4454, + "step": 
6515 + }, + { + "epoch": 0.74, + "learning_rate": 3.2287611412834306e-06, + "loss": 0.4478, + "step": 6516 + }, + { + "epoch": 0.74, + "learning_rate": 3.226037673628305e-06, + "loss": 0.4523, + "step": 6517 + }, + { + "epoch": 0.74, + "learning_rate": 3.223315134157253e-06, + "loss": 0.4633, + "step": 6518 + }, + { + "epoch": 0.75, + "learning_rate": 3.220593523243324e-06, + "loss": 0.4656, + "step": 6519 + }, + { + "epoch": 0.75, + "learning_rate": 3.2178728412594417e-06, + "loss": 0.4449, + "step": 6520 + }, + { + "epoch": 0.75, + "learning_rate": 3.2151530885783967e-06, + "loss": 0.4725, + "step": 6521 + }, + { + "epoch": 0.75, + "learning_rate": 3.212434265572861e-06, + "loss": 0.4602, + "step": 6522 + }, + { + "epoch": 0.75, + "learning_rate": 3.209716372615369e-06, + "loss": 0.4637, + "step": 6523 + }, + { + "epoch": 0.75, + "learning_rate": 3.2069994100783376e-06, + "loss": 0.4622, + "step": 6524 + }, + { + "epoch": 0.75, + "learning_rate": 3.2042833783340453e-06, + "loss": 0.4492, + "step": 6525 + }, + { + "epoch": 0.75, + "learning_rate": 3.201568277754652e-06, + "loss": 0.4493, + "step": 6526 + }, + { + "epoch": 0.75, + "learning_rate": 3.1988541087121916e-06, + "loss": 0.4665, + "step": 6527 + }, + { + "epoch": 0.75, + "learning_rate": 3.1961408715785615e-06, + "loss": 0.435, + "step": 6528 + }, + { + "epoch": 0.75, + "learning_rate": 3.193428566725534e-06, + "loss": 0.4442, + "step": 6529 + }, + { + "epoch": 0.75, + "learning_rate": 3.1907171945247595e-06, + "loss": 0.4554, + "step": 6530 + }, + { + "epoch": 0.75, + "learning_rate": 3.1880067553477513e-06, + "loss": 0.4353, + "step": 6531 + }, + { + "epoch": 0.75, + "learning_rate": 3.1852972495659064e-06, + "loss": 0.463, + "step": 6532 + }, + { + "epoch": 0.75, + "learning_rate": 3.182588677550482e-06, + "loss": 0.4648, + "step": 6533 + }, + { + "epoch": 0.75, + "learning_rate": 3.179881039672619e-06, + "loss": 0.4438, + "step": 6534 + }, + { + "epoch": 0.75, + "learning_rate": 
3.1771743363033156e-06, + "loss": 0.4627, + "step": 6535 + }, + { + "epoch": 0.75, + "learning_rate": 3.174468567813461e-06, + "loss": 0.4732, + "step": 6536 + }, + { + "epoch": 0.75, + "learning_rate": 3.171763734573796e-06, + "loss": 0.434, + "step": 6537 + }, + { + "epoch": 0.75, + "learning_rate": 3.169059836954952e-06, + "loss": 0.4621, + "step": 6538 + }, + { + "epoch": 0.75, + "learning_rate": 3.1663568753274153e-06, + "loss": 0.4477, + "step": 6539 + }, + { + "epoch": 0.75, + "learning_rate": 3.1636548500615583e-06, + "loss": 0.4271, + "step": 6540 + }, + { + "epoch": 0.75, + "learning_rate": 3.1609537615276174e-06, + "loss": 0.4784, + "step": 6541 + }, + { + "epoch": 0.75, + "learning_rate": 3.1582536100956973e-06, + "loss": 0.4413, + "step": 6542 + }, + { + "epoch": 0.75, + "learning_rate": 3.1555543961357824e-06, + "loss": 0.4723, + "step": 6543 + }, + { + "epoch": 0.75, + "learning_rate": 3.1528561200177254e-06, + "loss": 0.4543, + "step": 6544 + }, + { + "epoch": 0.75, + "learning_rate": 3.1501587821112532e-06, + "loss": 0.4522, + "step": 6545 + }, + { + "epoch": 0.75, + "learning_rate": 3.14746238278596e-06, + "loss": 0.4649, + "step": 6546 + }, + { + "epoch": 0.75, + "learning_rate": 3.1447669224113074e-06, + "loss": 0.4431, + "step": 6547 + }, + { + "epoch": 0.75, + "learning_rate": 3.1420724013566408e-06, + "loss": 0.4451, + "step": 6548 + }, + { + "epoch": 0.75, + "learning_rate": 3.1393788199911657e-06, + "loss": 0.4428, + "step": 6549 + }, + { + "epoch": 0.75, + "learning_rate": 3.136686178683961e-06, + "loss": 0.468, + "step": 6550 + }, + { + "epoch": 0.75, + "learning_rate": 3.1339944778039844e-06, + "loss": 0.4625, + "step": 6551 + }, + { + "epoch": 0.75, + "learning_rate": 3.131303717720053e-06, + "loss": 0.4479, + "step": 6552 + }, + { + "epoch": 0.75, + "learning_rate": 3.128613898800864e-06, + "loss": 0.4522, + "step": 6553 + }, + { + "epoch": 0.75, + "learning_rate": 3.125925021414985e-06, + "loss": 0.4741, + "step": 6554 + }, + { + 
"epoch": 0.75, + "learning_rate": 3.123237085930847e-06, + "loss": 0.4563, + "step": 6555 + }, + { + "epoch": 0.75, + "learning_rate": 3.1205500927167644e-06, + "loss": 0.4503, + "step": 6556 + }, + { + "epoch": 0.75, + "learning_rate": 3.1178640421409057e-06, + "loss": 0.4515, + "step": 6557 + }, + { + "epoch": 0.75, + "learning_rate": 3.11517893457133e-06, + "loss": 0.4671, + "step": 6558 + }, + { + "epoch": 0.75, + "learning_rate": 3.112494770375951e-06, + "loss": 0.4659, + "step": 6559 + }, + { + "epoch": 0.75, + "learning_rate": 3.1098115499225567e-06, + "loss": 0.4397, + "step": 6560 + }, + { + "epoch": 0.75, + "learning_rate": 3.107129273578815e-06, + "loss": 0.4652, + "step": 6561 + }, + { + "epoch": 0.75, + "learning_rate": 3.104447941712251e-06, + "loss": 0.4627, + "step": 6562 + }, + { + "epoch": 0.75, + "learning_rate": 3.1017675546902704e-06, + "loss": 0.5041, + "step": 6563 + }, + { + "epoch": 0.75, + "learning_rate": 3.0990881128801487e-06, + "loss": 0.4534, + "step": 6564 + }, + { + "epoch": 0.75, + "learning_rate": 3.096409616649023e-06, + "loss": 0.4516, + "step": 6565 + }, + { + "epoch": 0.75, + "learning_rate": 3.0937320663639148e-06, + "loss": 0.4326, + "step": 6566 + }, + { + "epoch": 0.75, + "learning_rate": 3.091055462391703e-06, + "loss": 0.4685, + "step": 6567 + }, + { + "epoch": 0.75, + "learning_rate": 3.088379805099141e-06, + "loss": 0.4364, + "step": 6568 + }, + { + "epoch": 0.75, + "learning_rate": 3.0857050948528576e-06, + "loss": 0.4616, + "step": 6569 + }, + { + "epoch": 0.75, + "learning_rate": 3.083031332019344e-06, + "loss": 0.4496, + "step": 6570 + }, + { + "epoch": 0.75, + "learning_rate": 3.0803585169649696e-06, + "loss": 0.4441, + "step": 6571 + }, + { + "epoch": 0.75, + "learning_rate": 3.0776866500559654e-06, + "loss": 0.4711, + "step": 6572 + }, + { + "epoch": 0.75, + "learning_rate": 3.0750157316584375e-06, + "loss": 0.4634, + "step": 6573 + }, + { + "epoch": 0.75, + "learning_rate": 3.0723457621383666e-06, + "loss": 
0.4329, + "step": 6574 + }, + { + "epoch": 0.75, + "learning_rate": 3.0696767418615945e-06, + "loss": 0.4494, + "step": 6575 + }, + { + "epoch": 0.75, + "learning_rate": 3.067008671193833e-06, + "loss": 0.4572, + "step": 6576 + }, + { + "epoch": 0.75, + "learning_rate": 3.0643415505006733e-06, + "loss": 0.4618, + "step": 6577 + }, + { + "epoch": 0.75, + "learning_rate": 3.0616753801475653e-06, + "loss": 0.4343, + "step": 6578 + }, + { + "epoch": 0.75, + "learning_rate": 3.059010160499839e-06, + "loss": 0.4473, + "step": 6579 + }, + { + "epoch": 0.75, + "learning_rate": 3.056345891922684e-06, + "loss": 0.4555, + "step": 6580 + }, + { + "epoch": 0.75, + "learning_rate": 3.0536825747811695e-06, + "loss": 0.4598, + "step": 6581 + }, + { + "epoch": 0.75, + "learning_rate": 3.0510202094402242e-06, + "loss": 0.4601, + "step": 6582 + }, + { + "epoch": 0.75, + "learning_rate": 3.0483587962646545e-06, + "loss": 0.4466, + "step": 6583 + }, + { + "epoch": 0.75, + "learning_rate": 3.045698335619135e-06, + "loss": 0.4614, + "step": 6584 + }, + { + "epoch": 0.75, + "learning_rate": 3.0430388278682078e-06, + "loss": 0.4739, + "step": 6585 + }, + { + "epoch": 0.75, + "learning_rate": 3.04038027337628e-06, + "loss": 0.4521, + "step": 6586 + }, + { + "epoch": 0.75, + "learning_rate": 3.0377226725076394e-06, + "loss": 0.4688, + "step": 6587 + }, + { + "epoch": 0.75, + "learning_rate": 3.035066025626434e-06, + "loss": 0.4473, + "step": 6588 + }, + { + "epoch": 0.75, + "learning_rate": 3.0324103330966804e-06, + "loss": 0.4427, + "step": 6589 + }, + { + "epoch": 0.75, + "learning_rate": 3.02975559528227e-06, + "loss": 0.4678, + "step": 6590 + }, + { + "epoch": 0.75, + "learning_rate": 3.027101812546965e-06, + "loss": 0.4627, + "step": 6591 + }, + { + "epoch": 0.75, + "learning_rate": 3.024448985254387e-06, + "loss": 0.4567, + "step": 6592 + }, + { + "epoch": 0.75, + "learning_rate": 3.021797113768039e-06, + "loss": 0.4516, + "step": 6593 + }, + { + "epoch": 0.75, + "learning_rate": 
3.0191461984512794e-06, + "loss": 0.4481, + "step": 6594 + }, + { + "epoch": 0.75, + "learning_rate": 3.016496239667349e-06, + "loss": 0.4717, + "step": 6595 + }, + { + "epoch": 0.75, + "learning_rate": 3.013847237779346e-06, + "loss": 0.4614, + "step": 6596 + }, + { + "epoch": 0.75, + "learning_rate": 3.0111991931502484e-06, + "loss": 0.4335, + "step": 6597 + }, + { + "epoch": 0.75, + "learning_rate": 3.0085521061428945e-06, + "loss": 0.4489, + "step": 6598 + }, + { + "epoch": 0.75, + "learning_rate": 3.005905977119992e-06, + "loss": 0.4609, + "step": 6599 + }, + { + "epoch": 0.75, + "learning_rate": 3.0032608064441206e-06, + "loss": 0.4218, + "step": 6600 + }, + { + "epoch": 0.75, + "learning_rate": 3.0006165944777333e-06, + "loss": 0.4688, + "step": 6601 + }, + { + "epoch": 0.75, + "learning_rate": 2.997973341583138e-06, + "loss": 0.4433, + "step": 6602 + }, + { + "epoch": 0.75, + "learning_rate": 2.9953310481225275e-06, + "loss": 0.4752, + "step": 6603 + }, + { + "epoch": 0.75, + "learning_rate": 2.992689714457947e-06, + "loss": 0.4381, + "step": 6604 + }, + { + "epoch": 0.75, + "learning_rate": 2.9900493409513256e-06, + "loss": 0.459, + "step": 6605 + }, + { + "epoch": 0.75, + "learning_rate": 2.9874099279644487e-06, + "loss": 0.4376, + "step": 6606 + }, + { + "epoch": 0.76, + "learning_rate": 2.984771475858973e-06, + "loss": 0.4624, + "step": 6607 + }, + { + "epoch": 0.76, + "learning_rate": 2.9821339849964324e-06, + "loss": 0.4495, + "step": 6608 + }, + { + "epoch": 0.76, + "learning_rate": 2.979497455738214e-06, + "loss": 0.4727, + "step": 6609 + }, + { + "epoch": 0.76, + "learning_rate": 2.976861888445586e-06, + "loss": 0.4405, + "step": 6610 + }, + { + "epoch": 0.76, + "learning_rate": 2.9742272834796813e-06, + "loss": 0.4884, + "step": 6611 + }, + { + "epoch": 0.76, + "learning_rate": 2.9715936412014945e-06, + "loss": 0.4645, + "step": 6612 + }, + { + "epoch": 0.76, + "learning_rate": 2.9689609619718996e-06, + "loss": 0.4535, + "step": 6613 + }, + { + 
"epoch": 0.76, + "learning_rate": 2.966329246151626e-06, + "loss": 0.4459, + "step": 6614 + }, + { + "epoch": 0.76, + "learning_rate": 2.9636984941012835e-06, + "loss": 0.4543, + "step": 6615 + }, + { + "epoch": 0.76, + "learning_rate": 2.9610687061813405e-06, + "loss": 0.4395, + "step": 6616 + }, + { + "epoch": 0.76, + "learning_rate": 2.9584398827521343e-06, + "loss": 0.4394, + "step": 6617 + }, + { + "epoch": 0.76, + "learning_rate": 2.9558120241738786e-06, + "loss": 0.451, + "step": 6618 + }, + { + "epoch": 0.76, + "learning_rate": 2.9531851308066426e-06, + "loss": 0.468, + "step": 6619 + }, + { + "epoch": 0.76, + "learning_rate": 2.950559203010371e-06, + "loss": 0.4386, + "step": 6620 + }, + { + "epoch": 0.76, + "learning_rate": 2.9479342411448797e-06, + "loss": 0.4552, + "step": 6621 + }, + { + "epoch": 0.76, + "learning_rate": 2.945310245569839e-06, + "loss": 0.4542, + "step": 6622 + }, + { + "epoch": 0.76, + "learning_rate": 2.942687216644803e-06, + "loss": 0.4395, + "step": 6623 + }, + { + "epoch": 0.76, + "learning_rate": 2.94006515472918e-06, + "loss": 0.4551, + "step": 6624 + }, + { + "epoch": 0.76, + "learning_rate": 2.9374440601822495e-06, + "loss": 0.4472, + "step": 6625 + }, + { + "epoch": 0.76, + "learning_rate": 2.9348239333631655e-06, + "loss": 0.4362, + "step": 6626 + }, + { + "epoch": 0.76, + "learning_rate": 2.9322047746309377e-06, + "loss": 0.4542, + "step": 6627 + }, + { + "epoch": 0.76, + "learning_rate": 2.929586584344456e-06, + "loss": 0.4466, + "step": 6628 + }, + { + "epoch": 0.76, + "learning_rate": 2.926969362862465e-06, + "loss": 0.4727, + "step": 6629 + }, + { + "epoch": 0.76, + "learning_rate": 2.924353110543584e-06, + "loss": 0.4393, + "step": 6630 + }, + { + "epoch": 0.76, + "learning_rate": 2.9217378277463025e-06, + "loss": 0.4731, + "step": 6631 + }, + { + "epoch": 0.76, + "learning_rate": 2.919123514828969e-06, + "loss": 0.4454, + "step": 6632 + }, + { + "epoch": 0.76, + "learning_rate": 2.916510172149799e-06, + "loss": 
0.4309, + "step": 6633 + }, + { + "epoch": 0.76, + "learning_rate": 2.913897800066887e-06, + "loss": 0.4529, + "step": 6634 + }, + { + "epoch": 0.76, + "learning_rate": 2.911286398938178e-06, + "loss": 0.4504, + "step": 6635 + }, + { + "epoch": 0.76, + "learning_rate": 2.9086759691214994e-06, + "loss": 0.4542, + "step": 6636 + }, + { + "epoch": 0.76, + "learning_rate": 2.9060665109745324e-06, + "loss": 0.4558, + "step": 6637 + }, + { + "epoch": 0.76, + "learning_rate": 2.9034580248548363e-06, + "loss": 0.458, + "step": 6638 + }, + { + "epoch": 0.76, + "learning_rate": 2.900850511119826e-06, + "loss": 0.4515, + "step": 6639 + }, + { + "epoch": 0.76, + "learning_rate": 2.898243970126793e-06, + "loss": 0.4648, + "step": 6640 + }, + { + "epoch": 0.76, + "learning_rate": 2.8956384022328943e-06, + "loss": 0.4455, + "step": 6641 + }, + { + "epoch": 0.76, + "learning_rate": 2.893033807795147e-06, + "loss": 0.4698, + "step": 6642 + }, + { + "epoch": 0.76, + "learning_rate": 2.8904301871704377e-06, + "loss": 0.4463, + "step": 6643 + }, + { + "epoch": 0.76, + "learning_rate": 2.8878275407155244e-06, + "loss": 0.4796, + "step": 6644 + }, + { + "epoch": 0.76, + "learning_rate": 2.885225868787025e-06, + "loss": 0.4443, + "step": 6645 + }, + { + "epoch": 0.76, + "learning_rate": 2.8826251717414245e-06, + "loss": 0.458, + "step": 6646 + }, + { + "epoch": 0.76, + "learning_rate": 2.8800254499350797e-06, + "loss": 0.4368, + "step": 6647 + }, + { + "epoch": 0.76, + "learning_rate": 2.8774267037242133e-06, + "loss": 0.4671, + "step": 6648 + }, + { + "epoch": 0.76, + "learning_rate": 2.8748289334649036e-06, + "loss": 0.4454, + "step": 6649 + }, + { + "epoch": 0.76, + "learning_rate": 2.8722321395131127e-06, + "loss": 0.433, + "step": 6650 + }, + { + "epoch": 0.76, + "learning_rate": 2.8696363222246504e-06, + "loss": 0.4398, + "step": 6651 + }, + { + "epoch": 0.76, + "learning_rate": 2.8670414819552082e-06, + "loss": 0.4618, + "step": 6652 + }, + { + "epoch": 0.76, + "learning_rate": 
2.864447619060333e-06, + "loss": 0.4626, + "step": 6653 + }, + { + "epoch": 0.76, + "learning_rate": 2.861854733895446e-06, + "loss": 0.4605, + "step": 6654 + }, + { + "epoch": 0.76, + "learning_rate": 2.8592628268158273e-06, + "loss": 0.4402, + "step": 6655 + }, + { + "epoch": 0.76, + "learning_rate": 2.8566718981766238e-06, + "loss": 0.4339, + "step": 6656 + }, + { + "epoch": 0.76, + "learning_rate": 2.854081948332854e-06, + "loss": 0.4739, + "step": 6657 + }, + { + "epoch": 0.76, + "learning_rate": 2.8514929776394006e-06, + "loss": 0.4559, + "step": 6658 + }, + { + "epoch": 0.76, + "learning_rate": 2.8489049864510053e-06, + "loss": 0.4355, + "step": 6659 + }, + { + "epoch": 0.76, + "learning_rate": 2.846317975122287e-06, + "loss": 0.4431, + "step": 6660 + }, + { + "epoch": 0.76, + "learning_rate": 2.843731944007717e-06, + "loss": 0.4666, + "step": 6661 + }, + { + "epoch": 0.76, + "learning_rate": 2.841146893461646e-06, + "loss": 0.4639, + "step": 6662 + }, + { + "epoch": 0.76, + "learning_rate": 2.8385628238382803e-06, + "loss": 0.4551, + "step": 6663 + }, + { + "epoch": 0.76, + "learning_rate": 2.8359797354916907e-06, + "loss": 0.4547, + "step": 6664 + }, + { + "epoch": 0.76, + "learning_rate": 2.833397628775827e-06, + "loss": 0.4457, + "step": 6665 + }, + { + "epoch": 0.76, + "learning_rate": 2.830816504044488e-06, + "loss": 0.4398, + "step": 6666 + }, + { + "epoch": 0.76, + "learning_rate": 2.8282363616513475e-06, + "loss": 0.4445, + "step": 6667 + }, + { + "epoch": 0.76, + "learning_rate": 2.8256572019499474e-06, + "loss": 0.4753, + "step": 6668 + }, + { + "epoch": 0.76, + "learning_rate": 2.8230790252936826e-06, + "loss": 0.4523, + "step": 6669 + }, + { + "epoch": 0.76, + "learning_rate": 2.8205018320358268e-06, + "loss": 0.4613, + "step": 6670 + }, + { + "epoch": 0.76, + "learning_rate": 2.8179256225295114e-06, + "loss": 0.467, + "step": 6671 + }, + { + "epoch": 0.76, + "learning_rate": 2.815350397127732e-06, + "loss": 0.469, + "step": 6672 + }, + { + 
"epoch": 0.76, + "learning_rate": 2.8127761561833554e-06, + "loss": 0.4359, + "step": 6673 + }, + { + "epoch": 0.76, + "learning_rate": 2.810202900049106e-06, + "loss": 0.4473, + "step": 6674 + }, + { + "epoch": 0.76, + "learning_rate": 2.8076306290775823e-06, + "loss": 0.4625, + "step": 6675 + }, + { + "epoch": 0.76, + "learning_rate": 2.8050593436212394e-06, + "loss": 0.451, + "step": 6676 + }, + { + "epoch": 0.76, + "learning_rate": 2.8024890440324e-06, + "loss": 0.4503, + "step": 6677 + }, + { + "epoch": 0.76, + "learning_rate": 2.7999197306632576e-06, + "loss": 0.4582, + "step": 6678 + }, + { + "epoch": 0.76, + "learning_rate": 2.79735140386586e-06, + "loss": 0.4593, + "step": 6679 + }, + { + "epoch": 0.76, + "learning_rate": 2.7947840639921308e-06, + "loss": 0.4524, + "step": 6680 + }, + { + "epoch": 0.76, + "learning_rate": 2.792217711393849e-06, + "loss": 0.4527, + "step": 6681 + }, + { + "epoch": 0.76, + "learning_rate": 2.78965234642266e-06, + "loss": 0.452, + "step": 6682 + }, + { + "epoch": 0.76, + "learning_rate": 2.7870879694300825e-06, + "loss": 0.4344, + "step": 6683 + }, + { + "epoch": 0.76, + "learning_rate": 2.7845245807674893e-06, + "loss": 0.4756, + "step": 6684 + }, + { + "epoch": 0.76, + "learning_rate": 2.7819621807861197e-06, + "loss": 0.4306, + "step": 6685 + }, + { + "epoch": 0.76, + "learning_rate": 2.779400769837083e-06, + "loss": 0.4731, + "step": 6686 + }, + { + "epoch": 0.76, + "learning_rate": 2.776840348271348e-06, + "loss": 0.4586, + "step": 6687 + }, + { + "epoch": 0.76, + "learning_rate": 2.7742809164397546e-06, + "loss": 0.4586, + "step": 6688 + }, + { + "epoch": 0.76, + "learning_rate": 2.7717224746929984e-06, + "loss": 0.4355, + "step": 6689 + }, + { + "epoch": 0.76, + "learning_rate": 2.769165023381639e-06, + "loss": 0.453, + "step": 6690 + }, + { + "epoch": 0.76, + "learning_rate": 2.7666085628561126e-06, + "loss": 0.4595, + "step": 6691 + }, + { + "epoch": 0.76, + "learning_rate": 2.764053093466702e-06, + "loss": 0.4609, + 
"step": 6692 + }, + { + "epoch": 0.76, + "learning_rate": 2.7614986155635737e-06, + "loss": 0.4531, + "step": 6693 + }, + { + "epoch": 0.77, + "learning_rate": 2.7589451294967383e-06, + "loss": 0.4557, + "step": 6694 + }, + { + "epoch": 0.77, + "learning_rate": 2.75639263561609e-06, + "loss": 0.4522, + "step": 6695 + }, + { + "epoch": 0.77, + "learning_rate": 2.753841134271368e-06, + "loss": 0.468, + "step": 6696 + }, + { + "epoch": 0.77, + "learning_rate": 2.7512906258121907e-06, + "loss": 0.4466, + "step": 6697 + }, + { + "epoch": 0.77, + "learning_rate": 2.7487411105880356e-06, + "loss": 0.454, + "step": 6698 + }, + { + "epoch": 0.77, + "learning_rate": 2.7461925889482422e-06, + "loss": 0.4294, + "step": 6699 + }, + { + "epoch": 0.77, + "learning_rate": 2.7436450612420098e-06, + "loss": 0.4556, + "step": 6700 + }, + { + "epoch": 0.77, + "learning_rate": 2.7410985278184144e-06, + "loss": 0.4682, + "step": 6701 + }, + { + "epoch": 0.77, + "learning_rate": 2.738552989026384e-06, + "loss": 0.4551, + "step": 6702 + }, + { + "epoch": 0.77, + "learning_rate": 2.7360084452147108e-06, + "loss": 0.4589, + "step": 6703 + }, + { + "epoch": 0.77, + "learning_rate": 2.7334648967320587e-06, + "loss": 0.4527, + "step": 6704 + }, + { + "epoch": 0.77, + "learning_rate": 2.7309223439269516e-06, + "loss": 0.4516, + "step": 6705 + }, + { + "epoch": 0.77, + "learning_rate": 2.728380787147772e-06, + "loss": 0.4418, + "step": 6706 + }, + { + "epoch": 0.77, + "learning_rate": 2.725840226742774e-06, + "loss": 0.4448, + "step": 6707 + }, + { + "epoch": 0.77, + "learning_rate": 2.7233006630600667e-06, + "loss": 0.4588, + "step": 6708 + }, + { + "epoch": 0.77, + "learning_rate": 2.7207620964476323e-06, + "loss": 0.4534, + "step": 6709 + }, + { + "epoch": 0.77, + "learning_rate": 2.7182245272533046e-06, + "loss": 0.4508, + "step": 6710 + }, + { + "epoch": 0.77, + "learning_rate": 2.715687955824795e-06, + "loss": 0.4542, + "step": 6711 + }, + { + "epoch": 0.77, + "learning_rate": 
2.713152382509665e-06, + "loss": 0.451, + "step": 6712 + }, + { + "epoch": 0.77, + "learning_rate": 2.7106178076553446e-06, + "loss": 0.4442, + "step": 6713 + }, + { + "epoch": 0.77, + "learning_rate": 2.7080842316091273e-06, + "loss": 0.4464, + "step": 6714 + }, + { + "epoch": 0.77, + "learning_rate": 2.7055516547181736e-06, + "loss": 0.4548, + "step": 6715 + }, + { + "epoch": 0.77, + "learning_rate": 2.703020077329498e-06, + "loss": 0.4454, + "step": 6716 + }, + { + "epoch": 0.77, + "learning_rate": 2.7004894997899878e-06, + "loss": 0.438, + "step": 6717 + }, + { + "epoch": 0.77, + "learning_rate": 2.6979599224463838e-06, + "loss": 0.4595, + "step": 6718 + }, + { + "epoch": 0.77, + "learning_rate": 2.6954313456452995e-06, + "loss": 0.4586, + "step": 6719 + }, + { + "epoch": 0.77, + "learning_rate": 2.6929037697332037e-06, + "loss": 0.4458, + "step": 6720 + }, + { + "epoch": 0.77, + "learning_rate": 2.6903771950564294e-06, + "loss": 0.4398, + "step": 6721 + }, + { + "epoch": 0.77, + "learning_rate": 2.6878516219611773e-06, + "loss": 0.4544, + "step": 6722 + }, + { + "epoch": 0.77, + "learning_rate": 2.6853270507935013e-06, + "loss": 0.4588, + "step": 6723 + }, + { + "epoch": 0.77, + "learning_rate": 2.6828034818993285e-06, + "loss": 0.4461, + "step": 6724 + }, + { + "epoch": 0.77, + "learning_rate": 2.680280915624448e-06, + "loss": 0.4515, + "step": 6725 + }, + { + "epoch": 0.77, + "learning_rate": 2.6777593523144986e-06, + "loss": 0.4567, + "step": 6726 + }, + { + "epoch": 0.77, + "learning_rate": 2.675238792314999e-06, + "loss": 0.4453, + "step": 6727 + }, + { + "epoch": 0.77, + "learning_rate": 2.6727192359713196e-06, + "loss": 0.4547, + "step": 6728 + }, + { + "epoch": 0.77, + "learning_rate": 2.670200683628691e-06, + "loss": 0.4584, + "step": 6729 + }, + { + "epoch": 0.77, + "learning_rate": 2.6676831356322184e-06, + "loss": 0.4569, + "step": 6730 + }, + { + "epoch": 0.77, + "learning_rate": 2.6651665923268555e-06, + "loss": 0.4601, + "step": 6731 + }, + { + 
"epoch": 0.77, + "learning_rate": 2.6626510540574314e-06, + "loss": 0.4423, + "step": 6732 + }, + { + "epoch": 0.77, + "learning_rate": 2.6601365211686248e-06, + "loss": 0.459, + "step": 6733 + }, + { + "epoch": 0.77, + "learning_rate": 2.657622994004986e-06, + "loss": 0.4568, + "step": 6734 + }, + { + "epoch": 0.77, + "learning_rate": 2.655110472910927e-06, + "loss": 0.4484, + "step": 6735 + }, + { + "epoch": 0.77, + "learning_rate": 2.652598958230713e-06, + "loss": 0.4592, + "step": 6736 + }, + { + "epoch": 0.77, + "learning_rate": 2.6500884503084857e-06, + "loss": 0.4461, + "step": 6737 + }, + { + "epoch": 0.77, + "learning_rate": 2.647578949488234e-06, + "loss": 0.4984, + "step": 6738 + }, + { + "epoch": 0.77, + "learning_rate": 2.645070456113816e-06, + "loss": 0.4566, + "step": 6739 + }, + { + "epoch": 0.77, + "learning_rate": 2.6425629705289556e-06, + "loss": 0.4459, + "step": 6740 + }, + { + "epoch": 0.77, + "learning_rate": 2.640056493077231e-06, + "loss": 0.4461, + "step": 6741 + }, + { + "epoch": 0.77, + "learning_rate": 2.637551024102084e-06, + "loss": 0.4478, + "step": 6742 + }, + { + "epoch": 0.77, + "learning_rate": 2.6350465639468213e-06, + "loss": 0.44, + "step": 6743 + }, + { + "epoch": 0.77, + "learning_rate": 2.6325431129546107e-06, + "loss": 0.4568, + "step": 6744 + }, + { + "epoch": 0.77, + "learning_rate": 2.6300406714684834e-06, + "loss": 0.4455, + "step": 6745 + }, + { + "epoch": 0.77, + "learning_rate": 2.627539239831328e-06, + "loss": 0.4659, + "step": 6746 + }, + { + "epoch": 0.77, + "learning_rate": 2.625038818385892e-06, + "loss": 0.4441, + "step": 6747 + }, + { + "epoch": 0.77, + "learning_rate": 2.6225394074747956e-06, + "loss": 0.4484, + "step": 6748 + }, + { + "epoch": 0.77, + "learning_rate": 2.620041007440508e-06, + "loss": 0.4494, + "step": 6749 + }, + { + "epoch": 0.77, + "learning_rate": 2.617543618625371e-06, + "loss": 0.4613, + "step": 6750 + }, + { + "epoch": 0.77, + "learning_rate": 2.615047241371581e-06, + "loss": 0.4473, 
+ "step": 6751 + }, + { + "epoch": 0.77, + "learning_rate": 2.6125518760211933e-06, + "loss": 0.459, + "step": 6752 + }, + { + "epoch": 0.77, + "learning_rate": 2.610057522916132e-06, + "loss": 0.456, + "step": 6753 + }, + { + "epoch": 0.77, + "learning_rate": 2.6075641823981814e-06, + "loss": 0.4547, + "step": 6754 + }, + { + "epoch": 0.77, + "learning_rate": 2.6050718548089804e-06, + "loss": 0.4559, + "step": 6755 + }, + { + "epoch": 0.77, + "learning_rate": 2.602580540490038e-06, + "loss": 0.463, + "step": 6756 + }, + { + "epoch": 0.77, + "learning_rate": 2.6000902397827154e-06, + "loss": 0.4474, + "step": 6757 + }, + { + "epoch": 0.77, + "learning_rate": 2.5976009530282455e-06, + "loss": 0.4649, + "step": 6758 + }, + { + "epoch": 0.77, + "learning_rate": 2.595112680567711e-06, + "loss": 0.4483, + "step": 6759 + }, + { + "epoch": 0.77, + "learning_rate": 2.592625422742059e-06, + "loss": 0.4602, + "step": 6760 + }, + { + "epoch": 0.77, + "learning_rate": 2.5901391798921018e-06, + "loss": 0.4647, + "step": 6761 + }, + { + "epoch": 0.77, + "learning_rate": 2.5876539523585144e-06, + "loss": 0.4619, + "step": 6762 + }, + { + "epoch": 0.77, + "learning_rate": 2.585169740481822e-06, + "loss": 0.4544, + "step": 6763 + }, + { + "epoch": 0.77, + "learning_rate": 2.582686544602423e-06, + "loss": 0.4573, + "step": 6764 + }, + { + "epoch": 0.77, + "learning_rate": 2.5802043650605645e-06, + "loss": 0.4429, + "step": 6765 + }, + { + "epoch": 0.77, + "learning_rate": 2.577723202196366e-06, + "loss": 0.4496, + "step": 6766 + }, + { + "epoch": 0.77, + "learning_rate": 2.575243056349801e-06, + "loss": 0.4589, + "step": 6767 + }, + { + "epoch": 0.77, + "learning_rate": 2.5727639278606997e-06, + "loss": 0.4451, + "step": 6768 + }, + { + "epoch": 0.77, + "learning_rate": 2.570285817068765e-06, + "loss": 0.4611, + "step": 6769 + }, + { + "epoch": 0.77, + "learning_rate": 2.5678087243135476e-06, + "loss": 0.4572, + "step": 6770 + }, + { + "epoch": 0.77, + "learning_rate": 
2.565332649934468e-06, + "loss": 0.4633, + "step": 6771 + }, + { + "epoch": 0.77, + "learning_rate": 2.5628575942708047e-06, + "loss": 0.4609, + "step": 6772 + }, + { + "epoch": 0.77, + "learning_rate": 2.560383557661692e-06, + "loss": 0.454, + "step": 6773 + }, + { + "epoch": 0.77, + "learning_rate": 2.5579105404461325e-06, + "loss": 0.4556, + "step": 6774 + }, + { + "epoch": 0.77, + "learning_rate": 2.555438542962979e-06, + "loss": 0.4432, + "step": 6775 + }, + { + "epoch": 0.77, + "learning_rate": 2.5529675655509567e-06, + "loss": 0.4549, + "step": 6776 + }, + { + "epoch": 0.77, + "learning_rate": 2.550497608548642e-06, + "loss": 0.4414, + "step": 6777 + }, + { + "epoch": 0.77, + "learning_rate": 2.5480286722944712e-06, + "loss": 0.4531, + "step": 6778 + }, + { + "epoch": 0.77, + "learning_rate": 2.5455607571267484e-06, + "loss": 0.4699, + "step": 6779 + }, + { + "epoch": 0.77, + "learning_rate": 2.543093863383629e-06, + "loss": 0.4576, + "step": 6780 + }, + { + "epoch": 0.77, + "learning_rate": 2.540627991403134e-06, + "loss": 0.4535, + "step": 6781 + }, + { + "epoch": 0.78, + "learning_rate": 2.5381631415231455e-06, + "loss": 0.479, + "step": 6782 + }, + { + "epoch": 0.78, + "learning_rate": 2.535699314081399e-06, + "loss": 0.4589, + "step": 6783 + }, + { + "epoch": 0.78, + "learning_rate": 2.5332365094154975e-06, + "loss": 0.4563, + "step": 6784 + }, + { + "epoch": 0.78, + "learning_rate": 2.530774727862899e-06, + "loss": 0.4539, + "step": 6785 + }, + { + "epoch": 0.78, + "learning_rate": 2.5283139697609195e-06, + "loss": 0.4446, + "step": 6786 + }, + { + "epoch": 0.78, + "learning_rate": 2.525854235446743e-06, + "loss": 0.4636, + "step": 6787 + }, + { + "epoch": 0.78, + "learning_rate": 2.5233955252574027e-06, + "loss": 0.4485, + "step": 6788 + }, + { + "epoch": 0.78, + "learning_rate": 2.5209378395298013e-06, + "loss": 0.4408, + "step": 6789 + }, + { + "epoch": 0.78, + "learning_rate": 2.5184811786006923e-06, + "loss": 0.4678, + "step": 6790 + }, + { + 
"epoch": 0.78, + "learning_rate": 2.516025542806696e-06, + "loss": 0.4418, + "step": 6791 + }, + { + "epoch": 0.78, + "learning_rate": 2.5135709324842906e-06, + "loss": 0.4579, + "step": 6792 + }, + { + "epoch": 0.78, + "learning_rate": 2.511117347969809e-06, + "loss": 0.453, + "step": 6793 + }, + { + "epoch": 0.78, + "learning_rate": 2.508664789599451e-06, + "loss": 0.4449, + "step": 6794 + }, + { + "epoch": 0.78, + "learning_rate": 2.50621325770927e-06, + "loss": 0.4601, + "step": 6795 + }, + { + "epoch": 0.78, + "learning_rate": 2.503762752635177e-06, + "loss": 0.4701, + "step": 6796 + }, + { + "epoch": 0.78, + "learning_rate": 2.501313274712952e-06, + "loss": 0.4432, + "step": 6797 + }, + { + "epoch": 0.78, + "learning_rate": 2.4988648242782255e-06, + "loss": 0.4596, + "step": 6798 + }, + { + "epoch": 0.78, + "learning_rate": 2.4964174016664865e-06, + "loss": 0.4429, + "step": 6799 + }, + { + "epoch": 0.78, + "learning_rate": 2.4939710072130895e-06, + "loss": 0.4535, + "step": 6800 + }, + { + "epoch": 0.78, + "learning_rate": 2.4915256412532463e-06, + "loss": 0.4357, + "step": 6801 + }, + { + "epoch": 0.78, + "learning_rate": 2.4890813041220286e-06, + "loss": 0.4536, + "step": 6802 + }, + { + "epoch": 0.78, + "learning_rate": 2.486637996154362e-06, + "loss": 0.442, + "step": 6803 + }, + { + "epoch": 0.78, + "learning_rate": 2.4841957176850306e-06, + "loss": 0.4543, + "step": 6804 + }, + { + "epoch": 0.78, + "learning_rate": 2.4817544690486896e-06, + "loss": 0.4535, + "step": 6805 + }, + { + "epoch": 0.78, + "learning_rate": 2.4793142505798363e-06, + "loss": 0.4411, + "step": 6806 + }, + { + "epoch": 0.78, + "learning_rate": 2.4768750626128414e-06, + "loss": 0.4497, + "step": 6807 + }, + { + "epoch": 0.78, + "learning_rate": 2.4744369054819252e-06, + "loss": 0.4533, + "step": 6808 + }, + { + "epoch": 0.78, + "learning_rate": 2.4719997795211683e-06, + "loss": 0.4417, + "step": 6809 + }, + { + "epoch": 0.78, + "learning_rate": 2.4695636850645112e-06, + "loss": 
0.4485, + "step": 6810 + }, + { + "epoch": 0.78, + "learning_rate": 2.467128622445759e-06, + "loss": 0.4413, + "step": 6811 + }, + { + "epoch": 0.78, + "learning_rate": 2.464694591998563e-06, + "loss": 0.4554, + "step": 6812 + }, + { + "epoch": 0.78, + "learning_rate": 2.4622615940564454e-06, + "loss": 0.4551, + "step": 6813 + }, + { + "epoch": 0.78, + "learning_rate": 2.4598296289527745e-06, + "loss": 0.4501, + "step": 6814 + }, + { + "epoch": 0.78, + "learning_rate": 2.4573986970207906e-06, + "loss": 0.4521, + "step": 6815 + }, + { + "epoch": 0.78, + "learning_rate": 2.4549687985935832e-06, + "loss": 0.4545, + "step": 6816 + }, + { + "epoch": 0.78, + "learning_rate": 2.452539934004099e-06, + "loss": 0.436, + "step": 6817 + }, + { + "epoch": 0.78, + "learning_rate": 2.4501121035851494e-06, + "loss": 0.4512, + "step": 6818 + }, + { + "epoch": 0.78, + "learning_rate": 2.447685307669405e-06, + "loss": 0.4505, + "step": 6819 + }, + { + "epoch": 0.78, + "learning_rate": 2.4452595465893847e-06, + "loss": 0.4493, + "step": 6820 + }, + { + "epoch": 0.78, + "learning_rate": 2.4428348206774775e-06, + "loss": 0.4526, + "step": 6821 + }, + { + "epoch": 0.78, + "learning_rate": 2.4404111302659195e-06, + "loss": 0.4747, + "step": 6822 + }, + { + "epoch": 0.78, + "learning_rate": 2.4379884756868167e-06, + "loss": 0.4277, + "step": 6823 + }, + { + "epoch": 0.78, + "learning_rate": 2.4355668572721224e-06, + "loss": 0.4578, + "step": 6824 + }, + { + "epoch": 0.78, + "learning_rate": 2.433146275353652e-06, + "loss": 0.4408, + "step": 6825 + }, + { + "epoch": 0.78, + "learning_rate": 2.4307267302630834e-06, + "loss": 0.4679, + "step": 6826 + }, + { + "epoch": 0.78, + "learning_rate": 2.428308222331942e-06, + "loss": 0.4573, + "step": 6827 + }, + { + "epoch": 0.78, + "learning_rate": 2.4258907518916207e-06, + "loss": 0.4575, + "step": 6828 + }, + { + "epoch": 0.78, + "learning_rate": 2.4234743192733713e-06, + "loss": 0.4426, + "step": 6829 + }, + { + "epoch": 0.78, + "learning_rate": 
2.4210589248082914e-06, + "loss": 0.4606, + "step": 6830 + }, + { + "epoch": 0.78, + "learning_rate": 2.4186445688273508e-06, + "loss": 0.4585, + "step": 6831 + }, + { + "epoch": 0.78, + "learning_rate": 2.416231251661364e-06, + "loss": 0.4363, + "step": 6832 + }, + { + "epoch": 0.78, + "learning_rate": 2.4138189736410144e-06, + "loss": 0.4513, + "step": 6833 + }, + { + "epoch": 0.78, + "learning_rate": 2.411407735096836e-06, + "loss": 0.4403, + "step": 6834 + }, + { + "epoch": 0.78, + "learning_rate": 2.40899753635922e-06, + "loss": 0.435, + "step": 6835 + }, + { + "epoch": 0.78, + "learning_rate": 2.406588377758421e-06, + "loss": 0.4783, + "step": 6836 + }, + { + "epoch": 0.78, + "learning_rate": 2.4041802596245444e-06, + "loss": 0.4533, + "step": 6837 + }, + { + "epoch": 0.78, + "learning_rate": 2.4017731822875566e-06, + "loss": 0.4545, + "step": 6838 + }, + { + "epoch": 0.78, + "learning_rate": 2.399367146077286e-06, + "loss": 0.4624, + "step": 6839 + }, + { + "epoch": 0.78, + "learning_rate": 2.3969621513234066e-06, + "loss": 0.46, + "step": 6840 + }, + { + "epoch": 0.78, + "learning_rate": 2.394558198355462e-06, + "loss": 0.4667, + "step": 6841 + }, + { + "epoch": 0.78, + "learning_rate": 2.3921552875028443e-06, + "loss": 0.4611, + "step": 6842 + }, + { + "epoch": 0.78, + "learning_rate": 2.3897534190948034e-06, + "loss": 0.4417, + "step": 6843 + }, + { + "epoch": 0.78, + "learning_rate": 2.387352593460455e-06, + "loss": 0.4708, + "step": 6844 + }, + { + "epoch": 0.78, + "learning_rate": 2.384952810928759e-06, + "loss": 0.456, + "step": 6845 + }, + { + "epoch": 0.78, + "learning_rate": 2.3825540718285454e-06, + "loss": 0.4431, + "step": 6846 + }, + { + "epoch": 0.78, + "learning_rate": 2.3801563764884905e-06, + "loss": 0.4557, + "step": 6847 + }, + { + "epoch": 0.78, + "learning_rate": 2.377759725237133e-06, + "loss": 0.4571, + "step": 6848 + }, + { + "epoch": 0.78, + "learning_rate": 2.375364118402872e-06, + "loss": 0.4388, + "step": 6849 + }, + { + "epoch": 
0.78, + "learning_rate": 2.3729695563139554e-06, + "loss": 0.4392, + "step": 6850 + }, + { + "epoch": 0.78, + "learning_rate": 2.3705760392984887e-06, + "loss": 0.4436, + "step": 6851 + }, + { + "epoch": 0.78, + "learning_rate": 2.3681835676844444e-06, + "loss": 0.4396, + "step": 6852 + }, + { + "epoch": 0.78, + "learning_rate": 2.3657921417996364e-06, + "loss": 0.4496, + "step": 6853 + }, + { + "epoch": 0.78, + "learning_rate": 2.363401761971752e-06, + "loss": 0.4712, + "step": 6854 + }, + { + "epoch": 0.78, + "learning_rate": 2.361012428528321e-06, + "loss": 0.4381, + "step": 6855 + }, + { + "epoch": 0.78, + "learning_rate": 2.3586241417967336e-06, + "loss": 0.4399, + "step": 6856 + }, + { + "epoch": 0.78, + "learning_rate": 2.356236902104242e-06, + "loss": 0.4752, + "step": 6857 + }, + { + "epoch": 0.78, + "learning_rate": 2.3538507097779505e-06, + "loss": 0.4426, + "step": 6858 + }, + { + "epoch": 0.78, + "learning_rate": 2.351465565144825e-06, + "loss": 0.4458, + "step": 6859 + }, + { + "epoch": 0.78, + "learning_rate": 2.3490814685316777e-06, + "loss": 0.4235, + "step": 6860 + }, + { + "epoch": 0.78, + "learning_rate": 2.3466984202651833e-06, + "loss": 0.4463, + "step": 6861 + }, + { + "epoch": 0.78, + "learning_rate": 2.344316420671876e-06, + "loss": 0.4494, + "step": 6862 + }, + { + "epoch": 0.78, + "learning_rate": 2.3419354700781393e-06, + "loss": 0.4436, + "step": 6863 + }, + { + "epoch": 0.78, + "learning_rate": 2.339555568810221e-06, + "loss": 0.4534, + "step": 6864 + }, + { + "epoch": 0.78, + "learning_rate": 2.3371767171942183e-06, + "loss": 0.4484, + "step": 6865 + }, + { + "epoch": 0.78, + "learning_rate": 2.3347989155560835e-06, + "loss": 0.4561, + "step": 6866 + }, + { + "epoch": 0.78, + "learning_rate": 2.3324221642216328e-06, + "loss": 0.457, + "step": 6867 + }, + { + "epoch": 0.78, + "learning_rate": 2.3300464635165353e-06, + "loss": 0.4517, + "step": 6868 + }, + { + "epoch": 0.79, + "learning_rate": 2.32767181376631e-06, + "loss": 0.4567, + 
"step": 6869 + }, + { + "epoch": 0.79, + "learning_rate": 2.3252982152963434e-06, + "loss": 0.4546, + "step": 6870 + }, + { + "epoch": 0.79, + "learning_rate": 2.3229256684318646e-06, + "loss": 0.4425, + "step": 6871 + }, + { + "epoch": 0.79, + "learning_rate": 2.320554173497972e-06, + "loss": 0.4547, + "step": 6872 + }, + { + "epoch": 0.79, + "learning_rate": 2.31818373081961e-06, + "loss": 0.4633, + "step": 6873 + }, + { + "epoch": 0.79, + "learning_rate": 2.3158143407215796e-06, + "loss": 0.4514, + "step": 6874 + }, + { + "epoch": 0.79, + "learning_rate": 2.3134460035285433e-06, + "loss": 0.4556, + "step": 6875 + }, + { + "epoch": 0.79, + "learning_rate": 2.3110787195650173e-06, + "loss": 0.4472, + "step": 6876 + }, + { + "epoch": 0.79, + "learning_rate": 2.3087124891553703e-06, + "loss": 0.4514, + "step": 6877 + }, + { + "epoch": 0.79, + "learning_rate": 2.30634731262383e-06, + "loss": 0.4429, + "step": 6878 + }, + { + "epoch": 0.79, + "learning_rate": 2.3039831902944766e-06, + "loss": 0.4412, + "step": 6879 + }, + { + "epoch": 0.79, + "learning_rate": 2.3016201224912504e-06, + "loss": 0.4455, + "step": 6880 + }, + { + "epoch": 0.79, + "learning_rate": 2.299258109537943e-06, + "loss": 0.4689, + "step": 6881 + }, + { + "epoch": 0.79, + "learning_rate": 2.2968971517581994e-06, + "loss": 0.4426, + "step": 6882 + }, + { + "epoch": 0.79, + "learning_rate": 2.2945372494755304e-06, + "loss": 0.4538, + "step": 6883 + }, + { + "epoch": 0.79, + "learning_rate": 2.2921784030132886e-06, + "loss": 0.4454, + "step": 6884 + }, + { + "epoch": 0.79, + "learning_rate": 2.289820612694692e-06, + "loss": 0.4694, + "step": 6885 + }, + { + "epoch": 0.79, + "learning_rate": 2.2874638788428128e-06, + "loss": 0.4444, + "step": 6886 + }, + { + "epoch": 0.79, + "learning_rate": 2.2851082017805704e-06, + "loss": 0.4471, + "step": 6887 + }, + { + "epoch": 0.79, + "learning_rate": 2.2827535818307513e-06, + "loss": 0.4536, + "step": 6888 + }, + { + "epoch": 0.79, + "learning_rate": 
2.2804000193159848e-06, + "loss": 0.453, + "step": 6889 + }, + { + "epoch": 0.79, + "learning_rate": 2.278047514558769e-06, + "loss": 0.47, + "step": 6890 + }, + { + "epoch": 0.79, + "learning_rate": 2.2756960678814444e-06, + "loss": 0.4648, + "step": 6891 + }, + { + "epoch": 0.79, + "learning_rate": 2.2733456796062093e-06, + "loss": 0.4424, + "step": 6892 + }, + { + "epoch": 0.79, + "learning_rate": 2.270996350055126e-06, + "loss": 0.4472, + "step": 6893 + }, + { + "epoch": 0.79, + "learning_rate": 2.2686480795500986e-06, + "loss": 0.4621, + "step": 6894 + }, + { + "epoch": 0.79, + "learning_rate": 2.2663008684128964e-06, + "loss": 0.4299, + "step": 6895 + }, + { + "epoch": 0.79, + "learning_rate": 2.2639547169651423e-06, + "loss": 0.453, + "step": 6896 + }, + { + "epoch": 0.79, + "learning_rate": 2.2616096255283048e-06, + "loss": 0.4602, + "step": 6897 + }, + { + "epoch": 0.79, + "learning_rate": 2.25926559442372e-06, + "loss": 0.447, + "step": 6898 + }, + { + "epoch": 0.79, + "learning_rate": 2.2569226239725695e-06, + "loss": 0.4674, + "step": 6899 + }, + { + "epoch": 0.79, + "learning_rate": 2.2545807144958896e-06, + "loss": 0.4247, + "step": 6900 + }, + { + "epoch": 0.79, + "learning_rate": 2.252239866314582e-06, + "loss": 0.4581, + "step": 6901 + }, + { + "epoch": 0.79, + "learning_rate": 2.249900079749385e-06, + "loss": 0.4323, + "step": 6902 + }, + { + "epoch": 0.79, + "learning_rate": 2.247561355120912e-06, + "loss": 0.442, + "step": 6903 + }, + { + "epoch": 0.79, + "learning_rate": 2.245223692749612e-06, + "loss": 0.4487, + "step": 6904 + }, + { + "epoch": 0.79, + "learning_rate": 2.2428870929558012e-06, + "loss": 0.4701, + "step": 6905 + }, + { + "epoch": 0.79, + "learning_rate": 2.240551556059647e-06, + "loss": 0.4617, + "step": 6906 + }, + { + "epoch": 0.79, + "learning_rate": 2.238217082381169e-06, + "loss": 0.4767, + "step": 6907 + }, + { + "epoch": 0.79, + "learning_rate": 2.235883672240239e-06, + "loss": 0.45, + "step": 6908 + }, + { + "epoch": 
0.79, + "learning_rate": 2.233551325956591e-06, + "loss": 0.4683, + "step": 6909 + }, + { + "epoch": 0.79, + "learning_rate": 2.2312200438498043e-06, + "loss": 0.4498, + "step": 6910 + }, + { + "epoch": 0.79, + "learning_rate": 2.2288898262393212e-06, + "loss": 0.452, + "step": 6911 + }, + { + "epoch": 0.79, + "learning_rate": 2.2265606734444314e-06, + "loss": 0.4564, + "step": 6912 + }, + { + "epoch": 0.79, + "learning_rate": 2.2242325857842773e-06, + "loss": 0.4566, + "step": 6913 + }, + { + "epoch": 0.79, + "learning_rate": 2.2219055635778618e-06, + "loss": 0.4514, + "step": 6914 + }, + { + "epoch": 0.79, + "learning_rate": 2.219579607144039e-06, + "loss": 0.4527, + "step": 6915 + }, + { + "epoch": 0.79, + "learning_rate": 2.21725471680152e-06, + "loss": 0.4504, + "step": 6916 + }, + { + "epoch": 0.79, + "learning_rate": 2.214930892868864e-06, + "loss": 0.4526, + "step": 6917 + }, + { + "epoch": 0.79, + "learning_rate": 2.2126081356644836e-06, + "loss": 0.4445, + "step": 6918 + }, + { + "epoch": 0.79, + "learning_rate": 2.210286445506654e-06, + "loss": 0.4556, + "step": 6919 + }, + { + "epoch": 0.79, + "learning_rate": 2.207965822713496e-06, + "loss": 0.4578, + "step": 6920 + }, + { + "epoch": 0.79, + "learning_rate": 2.205646267602983e-06, + "loss": 0.4627, + "step": 6921 + }, + { + "epoch": 0.79, + "learning_rate": 2.203327780492953e-06, + "loss": 0.454, + "step": 6922 + }, + { + "epoch": 0.79, + "learning_rate": 2.2010103617010836e-06, + "loss": 0.4572, + "step": 6923 + }, + { + "epoch": 0.79, + "learning_rate": 2.1986940115449173e-06, + "loss": 0.4375, + "step": 6924 + }, + { + "epoch": 0.79, + "learning_rate": 2.196378730341846e-06, + "loss": 0.4548, + "step": 6925 + }, + { + "epoch": 0.79, + "learning_rate": 2.1940645184091115e-06, + "loss": 0.4455, + "step": 6926 + }, + { + "epoch": 0.79, + "learning_rate": 2.1917513760638177e-06, + "loss": 0.4483, + "step": 6927 + }, + { + "epoch": 0.79, + "learning_rate": 2.18943930362291e-06, + "loss": 0.4536, + 
"step": 6928 + }, + { + "epoch": 0.79, + "learning_rate": 2.1871283014032007e-06, + "loss": 0.4454, + "step": 6929 + }, + { + "epoch": 0.79, + "learning_rate": 2.1848183697213467e-06, + "loss": 0.4693, + "step": 6930 + }, + { + "epoch": 0.79, + "learning_rate": 2.1825095088938553e-06, + "loss": 0.4515, + "step": 6931 + }, + { + "epoch": 0.79, + "learning_rate": 2.1802017192370963e-06, + "loss": 0.451, + "step": 6932 + }, + { + "epoch": 0.79, + "learning_rate": 2.1778950010672895e-06, + "loss": 0.4543, + "step": 6933 + }, + { + "epoch": 0.79, + "learning_rate": 2.1755893547005036e-06, + "loss": 0.4561, + "step": 6934 + }, + { + "epoch": 0.79, + "learning_rate": 2.173284780452667e-06, + "loss": 0.4421, + "step": 6935 + }, + { + "epoch": 0.79, + "learning_rate": 2.1709812786395545e-06, + "loss": 0.4498, + "step": 6936 + }, + { + "epoch": 0.79, + "learning_rate": 2.1686788495768006e-06, + "loss": 0.4632, + "step": 6937 + }, + { + "epoch": 0.79, + "learning_rate": 2.1663774935798886e-06, + "loss": 0.4392, + "step": 6938 + }, + { + "epoch": 0.79, + "learning_rate": 2.1640772109641504e-06, + "loss": 0.4639, + "step": 6939 + }, + { + "epoch": 0.79, + "learning_rate": 2.1617780020447854e-06, + "loss": 0.4477, + "step": 6940 + }, + { + "epoch": 0.79, + "learning_rate": 2.1594798671368265e-06, + "loss": 0.4378, + "step": 6941 + }, + { + "epoch": 0.79, + "learning_rate": 2.157182806555177e-06, + "loss": 0.4584, + "step": 6942 + }, + { + "epoch": 0.79, + "learning_rate": 2.1548868206145846e-06, + "loss": 0.4768, + "step": 6943 + }, + { + "epoch": 0.79, + "learning_rate": 2.1525919096296455e-06, + "loss": 0.4334, + "step": 6944 + }, + { + "epoch": 0.79, + "learning_rate": 2.1502980739148215e-06, + "loss": 0.4477, + "step": 6945 + }, + { + "epoch": 0.79, + "learning_rate": 2.1480053137844115e-06, + "loss": 0.4488, + "step": 6946 + }, + { + "epoch": 0.79, + "learning_rate": 2.1457136295525817e-06, + "loss": 0.4608, + "step": 6947 + }, + { + "epoch": 0.79, + "learning_rate": 
2.1434230215333407e-06, + "loss": 0.4605, + "step": 6948 + }, + { + "epoch": 0.79, + "learning_rate": 2.14113349004055e-06, + "loss": 0.4562, + "step": 6949 + }, + { + "epoch": 0.79, + "learning_rate": 2.138845035387932e-06, + "loss": 0.4482, + "step": 6950 + }, + { + "epoch": 0.79, + "learning_rate": 2.1365576578890513e-06, + "loss": 0.4535, + "step": 6951 + }, + { + "epoch": 0.79, + "learning_rate": 2.1342713578573327e-06, + "loss": 0.44, + "step": 6952 + }, + { + "epoch": 0.79, + "learning_rate": 2.131986135606051e-06, + "loss": 0.447, + "step": 6953 + }, + { + "epoch": 0.79, + "learning_rate": 2.1297019914483297e-06, + "loss": 0.4267, + "step": 6954 + }, + { + "epoch": 0.79, + "learning_rate": 2.1274189256971523e-06, + "loss": 0.442, + "step": 6955 + }, + { + "epoch": 0.79, + "learning_rate": 2.1251369386653454e-06, + "loss": 0.4605, + "step": 6956 + }, + { + "epoch": 0.8, + "learning_rate": 2.122856030665591e-06, + "loss": 0.4559, + "step": 6957 + }, + { + "epoch": 0.8, + "learning_rate": 2.1205762020104303e-06, + "loss": 0.4631, + "step": 6958 + }, + { + "epoch": 0.8, + "learning_rate": 2.1182974530122435e-06, + "loss": 0.4636, + "step": 6959 + }, + { + "epoch": 0.8, + "learning_rate": 2.1160197839832774e-06, + "loss": 0.4414, + "step": 6960 + }, + { + "epoch": 0.8, + "learning_rate": 2.113743195235617e-06, + "loss": 0.4593, + "step": 6961 + }, + { + "epoch": 0.8, + "learning_rate": 2.111467687081209e-06, + "loss": 0.4448, + "step": 6962 + }, + { + "epoch": 0.8, + "learning_rate": 2.109193259831851e-06, + "loss": 0.4518, + "step": 6963 + }, + { + "epoch": 0.8, + "learning_rate": 2.106919913799188e-06, + "loss": 0.4534, + "step": 6964 + }, + { + "epoch": 0.8, + "learning_rate": 2.1046476492947155e-06, + "loss": 0.4483, + "step": 6965 + }, + { + "epoch": 0.8, + "learning_rate": 2.102376466629792e-06, + "loss": 0.4542, + "step": 6966 + }, + { + "epoch": 0.8, + "learning_rate": 2.100106366115613e-06, + "loss": 0.4583, + "step": 6967 + }, + { + "epoch": 0.8, + 
"learning_rate": 2.0978373480632386e-06, + "loss": 0.4349, + "step": 6968 + }, + { + "epoch": 0.8, + "learning_rate": 2.0955694127835736e-06, + "loss": 0.447, + "step": 6969 + }, + { + "epoch": 0.8, + "learning_rate": 2.0933025605873702e-06, + "loss": 0.4611, + "step": 6970 + }, + { + "epoch": 0.8, + "learning_rate": 2.0910367917852437e-06, + "loss": 0.4435, + "step": 6971 + }, + { + "epoch": 0.8, + "learning_rate": 2.088772106687653e-06, + "loss": 0.4551, + "step": 6972 + }, + { + "epoch": 0.8, + "learning_rate": 2.0865085056049138e-06, + "loss": 0.4698, + "step": 6973 + }, + { + "epoch": 0.8, + "learning_rate": 2.084245988847188e-06, + "loss": 0.4405, + "step": 6974 + }, + { + "epoch": 0.8, + "learning_rate": 2.0819845567244868e-06, + "loss": 0.4661, + "step": 6975 + }, + { + "epoch": 0.8, + "learning_rate": 2.079724209546683e-06, + "loss": 0.4539, + "step": 6976 + }, + { + "epoch": 0.8, + "learning_rate": 2.077464947623492e-06, + "loss": 0.4373, + "step": 6977 + }, + { + "epoch": 0.8, + "learning_rate": 2.0752067712644807e-06, + "loss": 0.4411, + "step": 6978 + }, + { + "epoch": 0.8, + "learning_rate": 2.0729496807790737e-06, + "loss": 0.4557, + "step": 6979 + }, + { + "epoch": 0.8, + "learning_rate": 2.0706936764765393e-06, + "loss": 0.4516, + "step": 6980 + }, + { + "epoch": 0.8, + "learning_rate": 2.0684387586660027e-06, + "loss": 0.4538, + "step": 6981 + }, + { + "epoch": 0.8, + "learning_rate": 2.0661849276564394e-06, + "loss": 0.4527, + "step": 6982 + }, + { + "epoch": 0.8, + "learning_rate": 2.0639321837566696e-06, + "loss": 0.4482, + "step": 6983 + }, + { + "epoch": 0.8, + "learning_rate": 2.0616805272753758e-06, + "loss": 0.4652, + "step": 6984 + }, + { + "epoch": 0.8, + "learning_rate": 2.0594299585210796e-06, + "loss": 0.4302, + "step": 6985 + }, + { + "epoch": 0.8, + "learning_rate": 2.057180477802164e-06, + "loss": 0.4535, + "step": 6986 + }, + { + "epoch": 0.8, + "learning_rate": 2.054932085426856e-06, + "loss": 0.4474, + "step": 6987 + }, + { + 
"epoch": 0.8, + "learning_rate": 2.0526847817032326e-06, + "loss": 0.4389, + "step": 6988 + }, + { + "epoch": 0.8, + "learning_rate": 2.0504385669392268e-06, + "loss": 0.4523, + "step": 6989 + }, + { + "epoch": 0.8, + "learning_rate": 2.048193441442623e-06, + "loss": 0.4341, + "step": 6990 + }, + { + "epoch": 0.8, + "learning_rate": 2.0459494055210495e-06, + "loss": 0.462, + "step": 6991 + }, + { + "epoch": 0.8, + "learning_rate": 2.043706459481992e-06, + "loss": 0.4544, + "step": 6992 + }, + { + "epoch": 0.8, + "learning_rate": 2.0414646036327813e-06, + "loss": 0.4574, + "step": 6993 + }, + { + "epoch": 0.8, + "learning_rate": 2.039223838280606e-06, + "loss": 0.4469, + "step": 6994 + }, + { + "epoch": 0.8, + "learning_rate": 2.0369841637324992e-06, + "loss": 0.4479, + "step": 6995 + }, + { + "epoch": 0.8, + "learning_rate": 2.034745580295342e-06, + "loss": 0.4353, + "step": 6996 + }, + { + "epoch": 0.8, + "learning_rate": 2.0325080882758775e-06, + "loss": 0.4502, + "step": 6997 + }, + { + "epoch": 0.8, + "learning_rate": 2.030271687980685e-06, + "loss": 0.4471, + "step": 6998 + }, + { + "epoch": 0.8, + "learning_rate": 2.028036379716205e-06, + "loss": 0.457, + "step": 6999 + }, + { + "epoch": 0.8, + "learning_rate": 2.025802163788727e-06, + "loss": 0.4307, + "step": 7000 + }, + { + "epoch": 0.8, + "learning_rate": 2.023569040504384e-06, + "loss": 0.4597, + "step": 7001 + }, + { + "epoch": 0.8, + "learning_rate": 2.0213370101691675e-06, + "loss": 0.4454, + "step": 7002 + }, + { + "epoch": 0.8, + "learning_rate": 2.0191060730889132e-06, + "loss": 0.44, + "step": 7003 + }, + { + "epoch": 0.8, + "learning_rate": 2.016876229569308e-06, + "loss": 0.4463, + "step": 7004 + }, + { + "epoch": 0.8, + "learning_rate": 2.0146474799158935e-06, + "loss": 0.4651, + "step": 7005 + }, + { + "epoch": 0.8, + "learning_rate": 2.0124198244340543e-06, + "loss": 0.4465, + "step": 7006 + }, + { + "epoch": 0.8, + "learning_rate": 2.0101932634290345e-06, + "loss": 0.4694, + "step": 7007 + 
}, + { + "epoch": 0.8, + "learning_rate": 2.0079677972059163e-06, + "loss": 0.4513, + "step": 7008 + }, + { + "epoch": 0.8, + "learning_rate": 2.005743426069641e-06, + "loss": 0.4398, + "step": 7009 + }, + { + "epoch": 0.8, + "learning_rate": 2.003520150325e-06, + "loss": 0.4537, + "step": 7010 + }, + { + "epoch": 0.8, + "learning_rate": 2.0012979702766277e-06, + "loss": 0.4491, + "step": 7011 + }, + { + "epoch": 0.8, + "learning_rate": 1.9990768862290155e-06, + "loss": 0.4225, + "step": 7012 + }, + { + "epoch": 0.8, + "learning_rate": 1.9968568984865e-06, + "loss": 0.4786, + "step": 7013 + }, + { + "epoch": 0.8, + "learning_rate": 1.9946380073532668e-06, + "loss": 0.4358, + "step": 7014 + }, + { + "epoch": 0.8, + "learning_rate": 1.992420213133357e-06, + "loss": 0.4804, + "step": 7015 + }, + { + "epoch": 0.8, + "learning_rate": 1.9902035161306574e-06, + "loss": 0.4417, + "step": 7016 + }, + { + "epoch": 0.8, + "learning_rate": 1.9879879166489023e-06, + "loss": 0.4523, + "step": 7017 + }, + { + "epoch": 0.8, + "learning_rate": 1.9857734149916787e-06, + "loss": 0.4359, + "step": 7018 + }, + { + "epoch": 0.8, + "learning_rate": 1.983560011462425e-06, + "loss": 0.4723, + "step": 7019 + }, + { + "epoch": 0.8, + "learning_rate": 1.981347706364429e-06, + "loss": 0.441, + "step": 7020 + }, + { + "epoch": 0.8, + "learning_rate": 1.979136500000822e-06, + "loss": 0.4501, + "step": 7021 + }, + { + "epoch": 0.8, + "learning_rate": 1.9769263926745886e-06, + "loss": 0.4283, + "step": 7022 + }, + { + "epoch": 0.8, + "learning_rate": 1.974717384688566e-06, + "loss": 0.4435, + "step": 7023 + }, + { + "epoch": 0.8, + "learning_rate": 1.972509476345432e-06, + "loss": 0.4534, + "step": 7024 + }, + { + "epoch": 0.8, + "learning_rate": 1.9703026679477253e-06, + "loss": 0.4595, + "step": 7025 + }, + { + "epoch": 0.8, + "learning_rate": 1.968096959797827e-06, + "loss": 0.4401, + "step": 7026 + }, + { + "epoch": 0.8, + "learning_rate": 1.9658923521979633e-06, + "loss": 0.4562, + "step": 
7027 + }, + { + "epoch": 0.8, + "learning_rate": 1.963688845450218e-06, + "loss": 0.4607, + "step": 7028 + }, + { + "epoch": 0.8, + "learning_rate": 1.9614864398565212e-06, + "loss": 0.4341, + "step": 7029 + }, + { + "epoch": 0.8, + "learning_rate": 1.9592851357186537e-06, + "loss": 0.4564, + "step": 7030 + }, + { + "epoch": 0.8, + "learning_rate": 1.957084933338241e-06, + "loss": 0.4644, + "step": 7031 + }, + { + "epoch": 0.8, + "learning_rate": 1.9548858330167584e-06, + "loss": 0.4615, + "step": 7032 + }, + { + "epoch": 0.8, + "learning_rate": 1.9526878350555344e-06, + "loss": 0.4764, + "step": 7033 + }, + { + "epoch": 0.8, + "learning_rate": 1.9504909397557436e-06, + "loss": 0.4278, + "step": 7034 + }, + { + "epoch": 0.8, + "learning_rate": 1.9482951474184054e-06, + "loss": 0.4495, + "step": 7035 + }, + { + "epoch": 0.8, + "learning_rate": 1.9461004583443986e-06, + "loss": 0.447, + "step": 7036 + }, + { + "epoch": 0.8, + "learning_rate": 1.94390687283444e-06, + "loss": 0.4558, + "step": 7037 + }, + { + "epoch": 0.8, + "learning_rate": 1.9417143911891003e-06, + "loss": 0.4436, + "step": 7038 + }, + { + "epoch": 0.8, + "learning_rate": 1.939523013708803e-06, + "loss": 0.4531, + "step": 7039 + }, + { + "epoch": 0.8, + "learning_rate": 1.937332740693809e-06, + "loss": 0.4468, + "step": 7040 + }, + { + "epoch": 0.8, + "learning_rate": 1.9351435724442412e-06, + "loss": 0.4673, + "step": 7041 + }, + { + "epoch": 0.8, + "learning_rate": 1.9329555092600593e-06, + "loss": 0.4542, + "step": 7042 + }, + { + "epoch": 0.8, + "learning_rate": 1.9307685514410803e-06, + "loss": 0.4458, + "step": 7043 + }, + { + "epoch": 0.81, + "learning_rate": 1.928582699286965e-06, + "loss": 0.4467, + "step": 7044 + }, + { + "epoch": 0.81, + "learning_rate": 1.926397953097222e-06, + "loss": 0.4474, + "step": 7045 + }, + { + "epoch": 0.81, + "learning_rate": 1.924214313171211e-06, + "loss": 0.4305, + "step": 7046 + }, + { + "epoch": 0.81, + "learning_rate": 1.9220317798081433e-06, + "loss": 
0.456, + "step": 7047 + }, + { + "epoch": 0.81, + "learning_rate": 1.9198503533070688e-06, + "loss": 0.467, + "step": 7048 + }, + { + "epoch": 0.81, + "learning_rate": 1.9176700339668986e-06, + "loss": 0.4586, + "step": 7049 + }, + { + "epoch": 0.81, + "learning_rate": 1.9154908220863775e-06, + "loss": 0.447, + "step": 7050 + }, + { + "epoch": 0.81, + "learning_rate": 1.913312717964113e-06, + "loss": 0.4616, + "step": 7051 + }, + { + "epoch": 0.81, + "learning_rate": 1.9111357218985504e-06, + "loss": 0.4483, + "step": 7052 + }, + { + "epoch": 0.81, + "learning_rate": 1.9089598341879855e-06, + "loss": 0.4495, + "step": 7053 + }, + { + "epoch": 0.81, + "learning_rate": 1.9067850551305678e-06, + "loss": 0.4501, + "step": 7054 + }, + { + "epoch": 0.81, + "learning_rate": 1.9046113850242843e-06, + "loss": 0.4501, + "step": 7055 + }, + { + "epoch": 0.81, + "learning_rate": 1.9024388241669811e-06, + "loss": 0.4504, + "step": 7056 + }, + { + "epoch": 0.81, + "learning_rate": 1.900267372856348e-06, + "loss": 0.4638, + "step": 7057 + }, + { + "epoch": 0.81, + "learning_rate": 1.8980970313899193e-06, + "loss": 0.4497, + "step": 7058 + }, + { + "epoch": 0.81, + "learning_rate": 1.8959278000650839e-06, + "loss": 0.449, + "step": 7059 + }, + { + "epoch": 0.81, + "learning_rate": 1.8937596791790735e-06, + "loss": 0.45, + "step": 7060 + }, + { + "epoch": 0.81, + "learning_rate": 1.8915926690289643e-06, + "loss": 0.4337, + "step": 7061 + }, + { + "epoch": 0.81, + "learning_rate": 1.889426769911693e-06, + "loss": 0.4467, + "step": 7062 + }, + { + "epoch": 0.81, + "learning_rate": 1.887261982124029e-06, + "loss": 0.4389, + "step": 7063 + }, + { + "epoch": 0.81, + "learning_rate": 1.8850983059626026e-06, + "loss": 0.4479, + "step": 7064 + }, + { + "epoch": 0.81, + "learning_rate": 1.8829357417238802e-06, + "loss": 0.4554, + "step": 7065 + }, + { + "epoch": 0.81, + "learning_rate": 1.8807742897041847e-06, + "loss": 0.4657, + "step": 7066 + }, + { + "epoch": 0.81, + "learning_rate": 
1.8786139501996847e-06, + "loss": 0.4486, + "step": 7067 + }, + { + "epoch": 0.81, + "learning_rate": 1.8764547235063912e-06, + "loss": 0.4762, + "step": 7068 + }, + { + "epoch": 0.81, + "learning_rate": 1.8742966099201699e-06, + "loss": 0.4658, + "step": 7069 + }, + { + "epoch": 0.81, + "learning_rate": 1.8721396097367294e-06, + "loss": 0.4481, + "step": 7070 + }, + { + "epoch": 0.81, + "learning_rate": 1.8699837232516226e-06, + "loss": 0.4427, + "step": 7071 + }, + { + "epoch": 0.81, + "learning_rate": 1.867828950760262e-06, + "loss": 0.4456, + "step": 7072 + }, + { + "epoch": 0.81, + "learning_rate": 1.8656752925578948e-06, + "loss": 0.4609, + "step": 7073 + }, + { + "epoch": 0.81, + "learning_rate": 1.8635227489396178e-06, + "loss": 0.4578, + "step": 7074 + }, + { + "epoch": 0.81, + "learning_rate": 1.8613713202003813e-06, + "loss": 0.4515, + "step": 7075 + }, + { + "epoch": 0.81, + "learning_rate": 1.8592210066349781e-06, + "loss": 0.4513, + "step": 7076 + }, + { + "epoch": 0.81, + "learning_rate": 1.8570718085380512e-06, + "loss": 0.4564, + "step": 7077 + }, + { + "epoch": 0.81, + "learning_rate": 1.8549237262040876e-06, + "loss": 0.4408, + "step": 7078 + }, + { + "epoch": 0.81, + "learning_rate": 1.8527767599274193e-06, + "loss": 0.4488, + "step": 7079 + }, + { + "epoch": 0.81, + "learning_rate": 1.8506309100022334e-06, + "loss": 0.4426, + "step": 7080 + }, + { + "epoch": 0.81, + "learning_rate": 1.8484861767225549e-06, + "loss": 0.4452, + "step": 7081 + }, + { + "epoch": 0.81, + "learning_rate": 1.846342560382265e-06, + "loss": 0.4534, + "step": 7082 + }, + { + "epoch": 0.81, + "learning_rate": 1.8442000612750832e-06, + "loss": 0.4499, + "step": 7083 + }, + { + "epoch": 0.81, + "learning_rate": 1.8420586796945793e-06, + "loss": 0.4449, + "step": 7084 + }, + { + "epoch": 0.81, + "learning_rate": 1.839918415934171e-06, + "loss": 0.4541, + "step": 7085 + }, + { + "epoch": 0.81, + "learning_rate": 1.8377792702871266e-06, + "loss": 0.4681, + "step": 7086 + }, + 
{ + "epoch": 0.81, + "learning_rate": 1.8356412430465498e-06, + "loss": 0.4346, + "step": 7087 + }, + { + "epoch": 0.81, + "learning_rate": 1.8335043345054048e-06, + "loss": 0.4719, + "step": 7088 + }, + { + "epoch": 0.81, + "learning_rate": 1.83136854495649e-06, + "loss": 0.4324, + "step": 7089 + }, + { + "epoch": 0.81, + "learning_rate": 1.829233874692461e-06, + "loss": 0.4662, + "step": 7090 + }, + { + "epoch": 0.81, + "learning_rate": 1.8271003240058127e-06, + "loss": 0.4618, + "step": 7091 + }, + { + "epoch": 0.81, + "learning_rate": 1.8249678931888881e-06, + "loss": 0.4364, + "step": 7092 + }, + { + "epoch": 0.81, + "learning_rate": 1.8228365825338811e-06, + "loss": 0.4453, + "step": 7093 + }, + { + "epoch": 0.81, + "learning_rate": 1.820706392332824e-06, + "loss": 0.4628, + "step": 7094 + }, + { + "epoch": 0.81, + "learning_rate": 1.8185773228776038e-06, + "loss": 0.4462, + "step": 7095 + }, + { + "epoch": 0.81, + "learning_rate": 1.8164493744599531e-06, + "loss": 0.4569, + "step": 7096 + }, + { + "epoch": 0.81, + "learning_rate": 1.814322547371443e-06, + "loss": 0.4359, + "step": 7097 + }, + { + "epoch": 0.81, + "learning_rate": 1.8121968419035007e-06, + "loss": 0.4567, + "step": 7098 + }, + { + "epoch": 0.81, + "learning_rate": 1.810072258347394e-06, + "loss": 0.4508, + "step": 7099 + }, + { + "epoch": 0.81, + "learning_rate": 1.8079487969942344e-06, + "loss": 0.4589, + "step": 7100 + }, + { + "epoch": 0.81, + "learning_rate": 1.8058264581349893e-06, + "loss": 0.463, + "step": 7101 + }, + { + "epoch": 0.81, + "learning_rate": 1.8037052420604618e-06, + "loss": 0.4545, + "step": 7102 + }, + { + "epoch": 0.81, + "learning_rate": 1.8015851490613079e-06, + "loss": 0.4578, + "step": 7103 + }, + { + "epoch": 0.81, + "learning_rate": 1.799466179428031e-06, + "loss": 0.4468, + "step": 7104 + }, + { + "epoch": 0.81, + "learning_rate": 1.7973483334509701e-06, + "loss": 0.4457, + "step": 7105 + }, + { + "epoch": 0.81, + "learning_rate": 1.795231611420325e-06, + 
"loss": 0.4449, + "step": 7106 + }, + { + "epoch": 0.81, + "learning_rate": 1.7931160136261272e-06, + "loss": 0.4342, + "step": 7107 + }, + { + "epoch": 0.81, + "learning_rate": 1.7910015403582659e-06, + "loss": 0.4544, + "step": 7108 + }, + { + "epoch": 0.81, + "learning_rate": 1.7888881919064694e-06, + "loss": 0.4629, + "step": 7109 + }, + { + "epoch": 0.81, + "learning_rate": 1.7867759685603115e-06, + "loss": 0.4911, + "step": 7110 + }, + { + "epoch": 0.81, + "learning_rate": 1.7846648706092173e-06, + "loss": 0.444, + "step": 7111 + }, + { + "epoch": 0.81, + "learning_rate": 1.78255489834245e-06, + "loss": 0.4455, + "step": 7112 + }, + { + "epoch": 0.81, + "learning_rate": 1.7804460520491263e-06, + "loss": 0.4514, + "step": 7113 + }, + { + "epoch": 0.81, + "learning_rate": 1.7783383320182069e-06, + "loss": 0.4652, + "step": 7114 + }, + { + "epoch": 0.81, + "learning_rate": 1.776231738538492e-06, + "loss": 0.4435, + "step": 7115 + }, + { + "epoch": 0.81, + "learning_rate": 1.7741262718986363e-06, + "loss": 0.4645, + "step": 7116 + }, + { + "epoch": 0.81, + "learning_rate": 1.7720219323871346e-06, + "loss": 0.4633, + "step": 7117 + }, + { + "epoch": 0.81, + "learning_rate": 1.7699187202923241e-06, + "loss": 0.437, + "step": 7118 + }, + { + "epoch": 0.81, + "learning_rate": 1.7678166359023973e-06, + "loss": 0.4594, + "step": 7119 + }, + { + "epoch": 0.81, + "learning_rate": 1.7657156795053821e-06, + "loss": 0.4571, + "step": 7120 + }, + { + "epoch": 0.81, + "learning_rate": 1.763615851389161e-06, + "loss": 0.4344, + "step": 7121 + }, + { + "epoch": 0.81, + "learning_rate": 1.7615171518414542e-06, + "loss": 0.4568, + "step": 7122 + }, + { + "epoch": 0.81, + "learning_rate": 1.7594195811498294e-06, + "loss": 0.4552, + "step": 7123 + }, + { + "epoch": 0.81, + "learning_rate": 1.7573231396017064e-06, + "loss": 0.4622, + "step": 7124 + }, + { + "epoch": 0.81, + "learning_rate": 1.755227827484338e-06, + "loss": 0.4444, + "step": 7125 + }, + { + "epoch": 0.81, + 
"learning_rate": 1.7531336450848335e-06, + "loss": 0.4418, + "step": 7126 + }, + { + "epoch": 0.81, + "learning_rate": 1.7510405926901408e-06, + "loss": 0.4377, + "step": 7127 + }, + { + "epoch": 0.81, + "learning_rate": 1.7489486705870517e-06, + "loss": 0.448, + "step": 7128 + }, + { + "epoch": 0.81, + "learning_rate": 1.7468578790622126e-06, + "loss": 0.4435, + "step": 7129 + }, + { + "epoch": 0.81, + "learning_rate": 1.7447682184021042e-06, + "loss": 0.4557, + "step": 7130 + }, + { + "epoch": 0.81, + "learning_rate": 1.7426796888930553e-06, + "loss": 0.4505, + "step": 7131 + }, + { + "epoch": 0.82, + "learning_rate": 1.7405922908212436e-06, + "loss": 0.4546, + "step": 7132 + }, + { + "epoch": 0.82, + "learning_rate": 1.7385060244726882e-06, + "loss": 0.44, + "step": 7133 + }, + { + "epoch": 0.82, + "learning_rate": 1.736420890133258e-06, + "loss": 0.4504, + "step": 7134 + }, + { + "epoch": 0.82, + "learning_rate": 1.7343368880886603e-06, + "loss": 0.4574, + "step": 7135 + }, + { + "epoch": 0.82, + "learning_rate": 1.7322540186244462e-06, + "loss": 0.4503, + "step": 7136 + }, + { + "epoch": 0.82, + "learning_rate": 1.7301722820260226e-06, + "loss": 0.4587, + "step": 7137 + }, + { + "epoch": 0.82, + "learning_rate": 1.7280916785786261e-06, + "loss": 0.4356, + "step": 7138 + }, + { + "epoch": 0.82, + "learning_rate": 1.7260122085673525e-06, + "loss": 0.4409, + "step": 7139 + }, + { + "epoch": 0.82, + "learning_rate": 1.7239338722771326e-06, + "loss": 0.4578, + "step": 7140 + }, + { + "epoch": 0.82, + "learning_rate": 1.721856669992743e-06, + "loss": 0.447, + "step": 7141 + }, + { + "epoch": 0.82, + "learning_rate": 1.7197806019988084e-06, + "loss": 0.4626, + "step": 7142 + }, + { + "epoch": 0.82, + "learning_rate": 1.7177056685797988e-06, + "loss": 0.4323, + "step": 7143 + }, + { + "epoch": 0.82, + "learning_rate": 1.7156318700200236e-06, + "loss": 0.4271, + "step": 7144 + }, + { + "epoch": 0.82, + "learning_rate": 1.713559206603642e-06, + "loss": 0.4705, + "step": 
7145 + }, + { + "epoch": 0.82, + "learning_rate": 1.7114876786146505e-06, + "loss": 0.4579, + "step": 7146 + }, + { + "epoch": 0.82, + "learning_rate": 1.7094172863369007e-06, + "loss": 0.4371, + "step": 7147 + }, + { + "epoch": 0.82, + "learning_rate": 1.7073480300540802e-06, + "loss": 0.4559, + "step": 7148 + }, + { + "epoch": 0.82, + "learning_rate": 1.7052799100497197e-06, + "loss": 0.4504, + "step": 7149 + }, + { + "epoch": 0.82, + "learning_rate": 1.703212926607204e-06, + "loss": 0.4783, + "step": 7150 + }, + { + "epoch": 0.82, + "learning_rate": 1.7011470800097496e-06, + "loss": 0.4489, + "step": 7151 + }, + { + "epoch": 0.82, + "learning_rate": 1.6990823705404269e-06, + "loss": 0.4556, + "step": 7152 + }, + { + "epoch": 0.82, + "learning_rate": 1.6970187984821496e-06, + "loss": 0.4528, + "step": 7153 + }, + { + "epoch": 0.82, + "learning_rate": 1.694956364117668e-06, + "loss": 0.4559, + "step": 7154 + }, + { + "epoch": 0.82, + "learning_rate": 1.6928950677295875e-06, + "loss": 0.4405, + "step": 7155 + }, + { + "epoch": 0.82, + "learning_rate": 1.6908349096003484e-06, + "loss": 0.4654, + "step": 7156 + }, + { + "epoch": 0.82, + "learning_rate": 1.6887758900122352e-06, + "loss": 0.453, + "step": 7157 + }, + { + "epoch": 0.82, + "learning_rate": 1.6867180092473866e-06, + "loss": 0.495, + "step": 7158 + }, + { + "epoch": 0.82, + "learning_rate": 1.6846612675877716e-06, + "loss": 0.4487, + "step": 7159 + }, + { + "epoch": 0.82, + "learning_rate": 1.6826056653152122e-06, + "loss": 0.46, + "step": 7160 + }, + { + "epoch": 0.82, + "learning_rate": 1.6805512027113745e-06, + "loss": 0.4511, + "step": 7161 + }, + { + "epoch": 0.82, + "learning_rate": 1.6784978800577611e-06, + "loss": 0.4701, + "step": 7162 + }, + { + "epoch": 0.82, + "learning_rate": 1.6764456976357279e-06, + "loss": 0.4382, + "step": 7163 + }, + { + "epoch": 0.82, + "learning_rate": 1.6743946557264656e-06, + "loss": 0.4417, + "step": 7164 + }, + { + "epoch": 0.82, + "learning_rate": 
1.672344754611016e-06, + "loss": 0.4622, + "step": 7165 + }, + { + "epoch": 0.82, + "learning_rate": 1.67029599457026e-06, + "loss": 0.4403, + "step": 7166 + }, + { + "epoch": 0.82, + "learning_rate": 1.6682483758849199e-06, + "loss": 0.4704, + "step": 7167 + }, + { + "epoch": 0.82, + "learning_rate": 1.666201898835572e-06, + "loss": 0.4515, + "step": 7168 + }, + { + "epoch": 0.82, + "learning_rate": 1.6641565637026225e-06, + "loss": 0.4407, + "step": 7169 + }, + { + "epoch": 0.82, + "learning_rate": 1.6621123707663312e-06, + "loss": 0.448, + "step": 7170 + }, + { + "epoch": 0.82, + "learning_rate": 1.6600693203068007e-06, + "loss": 0.4588, + "step": 7171 + }, + { + "epoch": 0.82, + "learning_rate": 1.6580274126039698e-06, + "loss": 0.4502, + "step": 7172 + }, + { + "epoch": 0.82, + "learning_rate": 1.6559866479376297e-06, + "loss": 0.4401, + "step": 7173 + }, + { + "epoch": 0.82, + "learning_rate": 1.6539470265874092e-06, + "loss": 0.4351, + "step": 7174 + }, + { + "epoch": 0.82, + "learning_rate": 1.651908548832779e-06, + "loss": 0.4603, + "step": 7175 + }, + { + "epoch": 0.82, + "learning_rate": 1.6498712149530606e-06, + "loss": 0.4821, + "step": 7176 + }, + { + "epoch": 0.82, + "learning_rate": 1.64783502522741e-06, + "loss": 0.4347, + "step": 7177 + }, + { + "epoch": 0.82, + "learning_rate": 1.6457999799348345e-06, + "loss": 0.4536, + "step": 7178 + }, + { + "epoch": 0.82, + "learning_rate": 1.6437660793541776e-06, + "loss": 0.4413, + "step": 7179 + }, + { + "epoch": 0.82, + "learning_rate": 1.6417333237641298e-06, + "loss": 0.446, + "step": 7180 + }, + { + "epoch": 0.82, + "learning_rate": 1.6397017134432281e-06, + "loss": 0.455, + "step": 7181 + }, + { + "epoch": 0.82, + "learning_rate": 1.6376712486698443e-06, + "loss": 0.4394, + "step": 7182 + }, + { + "epoch": 0.82, + "learning_rate": 1.635641929722196e-06, + "loss": 0.4533, + "step": 7183 + }, + { + "epoch": 0.82, + "learning_rate": 1.6336137568783495e-06, + "loss": 0.4685, + "step": 7184 + }, + { + 
"epoch": 0.82, + "learning_rate": 1.6315867304162058e-06, + "loss": 0.4504, + "step": 7185 + }, + { + "epoch": 0.82, + "learning_rate": 1.6295608506135162e-06, + "loss": 0.4686, + "step": 7186 + }, + { + "epoch": 0.82, + "learning_rate": 1.627536117747871e-06, + "loss": 0.4506, + "step": 7187 + }, + { + "epoch": 0.82, + "learning_rate": 1.625512532096699e-06, + "loss": 0.4543, + "step": 7188 + }, + { + "epoch": 0.82, + "learning_rate": 1.623490093937281e-06, + "loss": 0.4635, + "step": 7189 + }, + { + "epoch": 0.82, + "learning_rate": 1.6214688035467363e-06, + "loss": 0.4493, + "step": 7190 + }, + { + "epoch": 0.82, + "learning_rate": 1.6194486612020277e-06, + "loss": 0.4523, + "step": 7191 + }, + { + "epoch": 0.82, + "learning_rate": 1.6174296671799571e-06, + "loss": 0.4677, + "step": 7192 + }, + { + "epoch": 0.82, + "learning_rate": 1.6154118217571723e-06, + "loss": 0.4354, + "step": 7193 + }, + { + "epoch": 0.82, + "learning_rate": 1.6133951252101642e-06, + "loss": 0.4581, + "step": 7194 + }, + { + "epoch": 0.82, + "learning_rate": 1.6113795778152663e-06, + "loss": 0.4396, + "step": 7195 + }, + { + "epoch": 0.82, + "learning_rate": 1.6093651798486487e-06, + "loss": 0.4427, + "step": 7196 + }, + { + "epoch": 0.82, + "learning_rate": 1.6073519315863351e-06, + "loss": 0.4417, + "step": 7197 + }, + { + "epoch": 0.82, + "learning_rate": 1.6053398333041791e-06, + "loss": 0.4423, + "step": 7198 + }, + { + "epoch": 0.82, + "learning_rate": 1.6033288852778882e-06, + "loss": 0.4448, + "step": 7199 + }, + { + "epoch": 0.82, + "learning_rate": 1.6013190877830065e-06, + "loss": 0.4561, + "step": 7200 + }, + { + "epoch": 0.82, + "learning_rate": 1.5993104410949189e-06, + "loss": 0.4564, + "step": 7201 + }, + { + "epoch": 0.82, + "learning_rate": 1.5973029454888578e-06, + "loss": 0.4518, + "step": 7202 + }, + { + "epoch": 0.82, + "learning_rate": 1.5952966012398908e-06, + "loss": 0.4463, + "step": 7203 + }, + { + "epoch": 0.82, + "learning_rate": 1.5932914086229366e-06, + 
"loss": 0.4785, + "step": 7204 + }, + { + "epoch": 0.82, + "learning_rate": 1.5912873679127495e-06, + "loss": 0.4446, + "step": 7205 + }, + { + "epoch": 0.82, + "learning_rate": 1.5892844793839235e-06, + "loss": 0.4422, + "step": 7206 + }, + { + "epoch": 0.82, + "learning_rate": 1.5872827433109073e-06, + "loss": 0.4585, + "step": 7207 + }, + { + "epoch": 0.82, + "learning_rate": 1.5852821599679747e-06, + "loss": 0.4655, + "step": 7208 + }, + { + "epoch": 0.82, + "learning_rate": 1.5832827296292564e-06, + "loss": 0.4546, + "step": 7209 + }, + { + "epoch": 0.82, + "learning_rate": 1.5812844525687188e-06, + "loss": 0.4432, + "step": 7210 + }, + { + "epoch": 0.82, + "learning_rate": 1.5792873290601662e-06, + "loss": 0.4541, + "step": 7211 + }, + { + "epoch": 0.82, + "learning_rate": 1.5772913593772543e-06, + "loss": 0.4737, + "step": 7212 + }, + { + "epoch": 0.82, + "learning_rate": 1.575296543793473e-06, + "loss": 0.4297, + "step": 7213 + }, + { + "epoch": 0.82, + "learning_rate": 1.573302882582154e-06, + "loss": 0.4452, + "step": 7214 + }, + { + "epoch": 0.82, + "learning_rate": 1.5713103760164782e-06, + "loss": 0.4724, + "step": 7215 + }, + { + "epoch": 0.82, + "learning_rate": 1.56931902436946e-06, + "loss": 0.4592, + "step": 7216 + }, + { + "epoch": 0.82, + "learning_rate": 1.5673288279139586e-06, + "loss": 0.4475, + "step": 7217 + }, + { + "epoch": 0.82, + "learning_rate": 1.5653397869226806e-06, + "loss": 0.4772, + "step": 7218 + }, + { + "epoch": 0.83, + "learning_rate": 1.5633519016681631e-06, + "loss": 0.4337, + "step": 7219 + }, + { + "epoch": 0.83, + "learning_rate": 1.561365172422795e-06, + "loss": 0.4538, + "step": 7220 + }, + { + "epoch": 0.83, + "learning_rate": 1.559379599458798e-06, + "loss": 0.4443, + "step": 7221 + }, + { + "epoch": 0.83, + "learning_rate": 1.5573951830482458e-06, + "loss": 0.4493, + "step": 7222 + }, + { + "epoch": 0.83, + "learning_rate": 1.5554119234630438e-06, + "loss": 0.4358, + "step": 7223 + }, + { + "epoch": 0.83, + 
"learning_rate": 1.553429820974941e-06, + "loss": 0.4569, + "step": 7224 + }, + { + "epoch": 0.83, + "learning_rate": 1.5514488758555357e-06, + "loss": 0.4639, + "step": 7225 + }, + { + "epoch": 0.83, + "learning_rate": 1.5494690883762553e-06, + "loss": 0.4464, + "step": 7226 + }, + { + "epoch": 0.83, + "learning_rate": 1.5474904588083772e-06, + "loss": 0.4525, + "step": 7227 + }, + { + "epoch": 0.83, + "learning_rate": 1.5455129874230212e-06, + "loss": 0.4562, + "step": 7228 + }, + { + "epoch": 0.83, + "learning_rate": 1.5435366744911406e-06, + "loss": 0.4502, + "step": 7229 + }, + { + "epoch": 0.83, + "learning_rate": 1.5415615202835377e-06, + "loss": 0.4496, + "step": 7230 + }, + { + "epoch": 0.83, + "learning_rate": 1.5395875250708513e-06, + "loss": 0.4492, + "step": 7231 + }, + { + "epoch": 0.83, + "learning_rate": 1.53761468912356e-06, + "loss": 0.4481, + "step": 7232 + }, + { + "epoch": 0.83, + "learning_rate": 1.5356430127119915e-06, + "loss": 0.4517, + "step": 7233 + }, + { + "epoch": 0.83, + "learning_rate": 1.5336724961063043e-06, + "loss": 0.4612, + "step": 7234 + }, + { + "epoch": 0.83, + "learning_rate": 1.5317031395765081e-06, + "loss": 0.447, + "step": 7235 + }, + { + "epoch": 0.83, + "learning_rate": 1.5297349433924435e-06, + "loss": 0.4387, + "step": 7236 + }, + { + "epoch": 0.83, + "learning_rate": 1.5277679078238018e-06, + "loss": 0.4682, + "step": 7237 + }, + { + "epoch": 0.83, + "learning_rate": 1.5258020331401102e-06, + "loss": 0.4466, + "step": 7238 + }, + { + "epoch": 0.83, + "learning_rate": 1.523837319610737e-06, + "loss": 0.4367, + "step": 7239 + }, + { + "epoch": 0.83, + "learning_rate": 1.5218737675048888e-06, + "loss": 0.452, + "step": 7240 + }, + { + "epoch": 0.83, + "learning_rate": 1.5199113770916207e-06, + "loss": 0.4353, + "step": 7241 + }, + { + "epoch": 0.83, + "learning_rate": 1.5179501486398196e-06, + "loss": 0.4702, + "step": 7242 + }, + { + "epoch": 0.83, + "learning_rate": 1.5159900824182227e-06, + "loss": 0.4602, + 
"step": 7243 + }, + { + "epoch": 0.83, + "learning_rate": 1.5140311786953986e-06, + "loss": 0.4643, + "step": 7244 + }, + { + "epoch": 0.83, + "learning_rate": 1.5120734377397617e-06, + "loss": 0.4605, + "step": 7245 + }, + { + "epoch": 0.83, + "learning_rate": 1.5101168598195647e-06, + "loss": 0.4429, + "step": 7246 + }, + { + "epoch": 0.83, + "learning_rate": 1.508161445202906e-06, + "loss": 0.4756, + "step": 7247 + }, + { + "epoch": 0.83, + "learning_rate": 1.5062071941577217e-06, + "loss": 0.4581, + "step": 7248 + }, + { + "epoch": 0.83, + "learning_rate": 1.5042541069517846e-06, + "loss": 0.4531, + "step": 7249 + }, + { + "epoch": 0.83, + "learning_rate": 1.5023021838527108e-06, + "loss": 0.4378, + "step": 7250 + }, + { + "epoch": 0.83, + "learning_rate": 1.5003514251279616e-06, + "loss": 0.4601, + "step": 7251 + }, + { + "epoch": 0.83, + "learning_rate": 1.4984018310448312e-06, + "loss": 0.466, + "step": 7252 + }, + { + "epoch": 0.83, + "learning_rate": 1.4964534018704558e-06, + "loss": 0.4375, + "step": 7253 + }, + { + "epoch": 0.83, + "learning_rate": 1.4945061378718184e-06, + "loss": 0.459, + "step": 7254 + }, + { + "epoch": 0.83, + "learning_rate": 1.4925600393157325e-06, + "loss": 0.4477, + "step": 7255 + }, + { + "epoch": 0.83, + "learning_rate": 1.4906151064688602e-06, + "loss": 0.4788, + "step": 7256 + }, + { + "epoch": 0.83, + "learning_rate": 1.4886713395977015e-06, + "loss": 0.4355, + "step": 7257 + }, + { + "epoch": 0.83, + "learning_rate": 1.4867287389685936e-06, + "loss": 0.4485, + "step": 7258 + }, + { + "epoch": 0.83, + "learning_rate": 1.4847873048477191e-06, + "loss": 0.4611, + "step": 7259 + }, + { + "epoch": 0.83, + "learning_rate": 1.482847037501094e-06, + "loss": 0.4529, + "step": 7260 + }, + { + "epoch": 0.83, + "learning_rate": 1.4809079371945823e-06, + "loss": 0.4453, + "step": 7261 + }, + { + "epoch": 0.83, + "learning_rate": 1.4789700041938816e-06, + "loss": 0.4605, + "step": 7262 + }, + { + "epoch": 0.83, + "learning_rate": 
1.4770332387645293e-06, + "loss": 0.4372, + "step": 7263 + }, + { + "epoch": 0.83, + "learning_rate": 1.475097641171912e-06, + "loss": 0.4592, + "step": 7264 + }, + { + "epoch": 0.83, + "learning_rate": 1.4731632116812434e-06, + "loss": 0.4514, + "step": 7265 + }, + { + "epoch": 0.83, + "learning_rate": 1.4712299505575868e-06, + "loss": 0.4276, + "step": 7266 + }, + { + "epoch": 0.83, + "learning_rate": 1.4692978580658434e-06, + "loss": 0.4622, + "step": 7267 + }, + { + "epoch": 0.83, + "learning_rate": 1.4673669344707498e-06, + "loss": 0.4572, + "step": 7268 + }, + { + "epoch": 0.83, + "learning_rate": 1.4654371800368882e-06, + "loss": 0.4503, + "step": 7269 + }, + { + "epoch": 0.83, + "learning_rate": 1.4635085950286776e-06, + "loss": 0.4591, + "step": 7270 + }, + { + "epoch": 0.83, + "learning_rate": 1.4615811797103751e-06, + "loss": 0.4537, + "step": 7271 + }, + { + "epoch": 0.83, + "learning_rate": 1.459654934346083e-06, + "loss": 0.4507, + "step": 7272 + }, + { + "epoch": 0.83, + "learning_rate": 1.4577298591997357e-06, + "loss": 0.4602, + "step": 7273 + }, + { + "epoch": 0.83, + "learning_rate": 1.4558059545351144e-06, + "loss": 0.4625, + "step": 7274 + }, + { + "epoch": 0.83, + "learning_rate": 1.4538832206158381e-06, + "loss": 0.4445, + "step": 7275 + }, + { + "epoch": 0.83, + "learning_rate": 1.4519616577053597e-06, + "loss": 0.4464, + "step": 7276 + }, + { + "epoch": 0.83, + "learning_rate": 1.4500412660669828e-06, + "loss": 0.4605, + "step": 7277 + }, + { + "epoch": 0.83, + "learning_rate": 1.448122045963839e-06, + "loss": 0.4558, + "step": 7278 + }, + { + "epoch": 0.83, + "learning_rate": 1.4462039976589048e-06, + "loss": 0.4509, + "step": 7279 + }, + { + "epoch": 0.83, + "learning_rate": 1.444287121414998e-06, + "loss": 0.4439, + "step": 7280 + }, + { + "epoch": 0.83, + "learning_rate": 1.442371417494769e-06, + "loss": 0.4491, + "step": 7281 + }, + { + "epoch": 0.83, + "learning_rate": 1.4404568861607172e-06, + "loss": 0.4543, + "step": 7282 + }, + { 
+ "epoch": 0.83, + "learning_rate": 1.4385435276751724e-06, + "loss": 0.4388, + "step": 7283 + }, + { + "epoch": 0.83, + "learning_rate": 1.4366313423003087e-06, + "loss": 0.4459, + "step": 7284 + }, + { + "epoch": 0.83, + "learning_rate": 1.4347203302981393e-06, + "loss": 0.4641, + "step": 7285 + }, + { + "epoch": 0.83, + "learning_rate": 1.432810491930514e-06, + "loss": 0.4506, + "step": 7286 + }, + { + "epoch": 0.83, + "learning_rate": 1.4309018274591246e-06, + "loss": 0.4384, + "step": 7287 + }, + { + "epoch": 0.83, + "learning_rate": 1.4289943371455007e-06, + "loss": 0.4767, + "step": 7288 + }, + { + "epoch": 0.83, + "learning_rate": 1.4270880212510086e-06, + "loss": 0.4374, + "step": 7289 + }, + { + "epoch": 0.83, + "learning_rate": 1.4251828800368594e-06, + "loss": 0.4517, + "step": 7290 + }, + { + "epoch": 0.83, + "learning_rate": 1.4232789137640968e-06, + "loss": 0.4359, + "step": 7291 + }, + { + "epoch": 0.83, + "learning_rate": 1.4213761226936095e-06, + "loss": 0.4627, + "step": 7292 + }, + { + "epoch": 0.83, + "learning_rate": 1.4194745070861194e-06, + "loss": 0.4673, + "step": 7293 + }, + { + "epoch": 0.83, + "learning_rate": 1.417574067202192e-06, + "loss": 0.4501, + "step": 7294 + }, + { + "epoch": 0.83, + "learning_rate": 1.4156748033022328e-06, + "loss": 0.443, + "step": 7295 + }, + { + "epoch": 0.83, + "learning_rate": 1.413776715646481e-06, + "loss": 0.4604, + "step": 7296 + }, + { + "epoch": 0.83, + "learning_rate": 1.4118798044950132e-06, + "loss": 0.4462, + "step": 7297 + }, + { + "epoch": 0.83, + "learning_rate": 1.409984070107755e-06, + "loss": 0.4305, + "step": 7298 + }, + { + "epoch": 0.83, + "learning_rate": 1.4080895127444594e-06, + "loss": 0.4609, + "step": 7299 + }, + { + "epoch": 0.83, + "learning_rate": 1.4061961326647266e-06, + "loss": 0.4414, + "step": 7300 + }, + { + "epoch": 0.83, + "learning_rate": 1.4043039301279904e-06, + "loss": 0.476, + "step": 7301 + }, + { + "epoch": 0.83, + "learning_rate": 1.402412905393523e-06, + 
"loss": 0.4646, + "step": 7302 + }, + { + "epoch": 0.83, + "learning_rate": 1.4005230587204388e-06, + "loss": 0.4492, + "step": 7303 + }, + { + "epoch": 0.83, + "learning_rate": 1.398634390367688e-06, + "loss": 0.4339, + "step": 7304 + }, + { + "epoch": 0.83, + "learning_rate": 1.3967469005940638e-06, + "loss": 0.448, + "step": 7305 + }, + { + "epoch": 0.83, + "learning_rate": 1.3948605896581923e-06, + "loss": 0.4402, + "step": 7306 + }, + { + "epoch": 0.84, + "learning_rate": 1.3929754578185373e-06, + "loss": 0.4364, + "step": 7307 + }, + { + "epoch": 0.84, + "learning_rate": 1.3910915053334094e-06, + "loss": 0.463, + "step": 7308 + }, + { + "epoch": 0.84, + "learning_rate": 1.3892087324609482e-06, + "loss": 0.4454, + "step": 7309 + }, + { + "epoch": 0.84, + "learning_rate": 1.3873271394591348e-06, + "loss": 0.4623, + "step": 7310 + }, + { + "epoch": 0.84, + "learning_rate": 1.385446726585794e-06, + "loss": 0.4608, + "step": 7311 + }, + { + "epoch": 0.84, + "learning_rate": 1.3835674940985788e-06, + "loss": 0.4374, + "step": 7312 + }, + { + "epoch": 0.84, + "learning_rate": 1.3816894422549888e-06, + "loss": 0.4594, + "step": 7313 + }, + { + "epoch": 0.84, + "learning_rate": 1.379812571312361e-06, + "loss": 0.4467, + "step": 7314 + }, + { + "epoch": 0.84, + "learning_rate": 1.3779368815278648e-06, + "loss": 0.4594, + "step": 7315 + }, + { + "epoch": 0.84, + "learning_rate": 1.3760623731585165e-06, + "loss": 0.4557, + "step": 7316 + }, + { + "epoch": 0.84, + "learning_rate": 1.3741890464611597e-06, + "loss": 0.446, + "step": 7317 + }, + { + "epoch": 0.84, + "learning_rate": 1.3723169016924865e-06, + "loss": 0.4579, + "step": 7318 + }, + { + "epoch": 0.84, + "learning_rate": 1.370445939109022e-06, + "loss": 0.4598, + "step": 7319 + }, + { + "epoch": 0.84, + "learning_rate": 1.3685761589671253e-06, + "loss": 0.4376, + "step": 7320 + }, + { + "epoch": 0.84, + "learning_rate": 1.366707561523004e-06, + "loss": 0.4252, + "step": 7321 + }, + { + "epoch": 0.84, + 
"learning_rate": 1.3648401470326932e-06, + "loss": 0.4451, + "step": 7322 + }, + { + "epoch": 0.84, + "learning_rate": 1.3629739157520728e-06, + "loss": 0.4647, + "step": 7323 + }, + { + "epoch": 0.84, + "learning_rate": 1.361108867936859e-06, + "loss": 0.4317, + "step": 7324 + }, + { + "epoch": 0.84, + "learning_rate": 1.359245003842602e-06, + "loss": 0.4451, + "step": 7325 + }, + { + "epoch": 0.84, + "learning_rate": 1.3573823237246965e-06, + "loss": 0.465, + "step": 7326 + }, + { + "epoch": 0.84, + "learning_rate": 1.3555208278383691e-06, + "loss": 0.4469, + "step": 7327 + }, + { + "epoch": 0.84, + "learning_rate": 1.353660516438684e-06, + "loss": 0.4517, + "step": 7328 + }, + { + "epoch": 0.84, + "learning_rate": 1.3518013897805504e-06, + "loss": 0.4704, + "step": 7329 + }, + { + "epoch": 0.84, + "learning_rate": 1.3499434481187045e-06, + "loss": 0.4471, + "step": 7330 + }, + { + "epoch": 0.84, + "learning_rate": 1.3480866917077294e-06, + "loss": 0.4489, + "step": 7331 + }, + { + "epoch": 0.84, + "learning_rate": 1.346231120802044e-06, + "loss": 0.4499, + "step": 7332 + }, + { + "epoch": 0.84, + "learning_rate": 1.3443767356558989e-06, + "loss": 0.4474, + "step": 7333 + }, + { + "epoch": 0.84, + "learning_rate": 1.3425235365233892e-06, + "loss": 0.4623, + "step": 7334 + }, + { + "epoch": 0.84, + "learning_rate": 1.3406715236584433e-06, + "loss": 0.4493, + "step": 7335 + }, + { + "epoch": 0.84, + "learning_rate": 1.3388206973148265e-06, + "loss": 0.4323, + "step": 7336 + }, + { + "epoch": 0.84, + "learning_rate": 1.336971057746147e-06, + "loss": 0.4577, + "step": 7337 + }, + { + "epoch": 0.84, + "learning_rate": 1.335122605205843e-06, + "loss": 0.436, + "step": 7338 + }, + { + "epoch": 0.84, + "learning_rate": 1.3332753399471976e-06, + "loss": 0.4512, + "step": 7339 + }, + { + "epoch": 0.84, + "learning_rate": 1.3314292622233227e-06, + "loss": 0.4495, + "step": 7340 + }, + { + "epoch": 0.84, + "learning_rate": 1.329584372287176e-06, + "loss": 0.4417, + "step": 
7341 + }, + { + "epoch": 0.84, + "learning_rate": 1.3277406703915485e-06, + "loss": 0.4506, + "step": 7342 + }, + { + "epoch": 0.84, + "learning_rate": 1.325898156789066e-06, + "loss": 0.4717, + "step": 7343 + }, + { + "epoch": 0.84, + "learning_rate": 1.3240568317321966e-06, + "loss": 0.4456, + "step": 7344 + }, + { + "epoch": 0.84, + "learning_rate": 1.322216695473243e-06, + "loss": 0.4531, + "step": 7345 + }, + { + "epoch": 0.84, + "learning_rate": 1.320377748264341e-06, + "loss": 0.4392, + "step": 7346 + }, + { + "epoch": 0.84, + "learning_rate": 1.3185399903574724e-06, + "loss": 0.4464, + "step": 7347 + }, + { + "epoch": 0.84, + "learning_rate": 1.3167034220044494e-06, + "loss": 0.4205, + "step": 7348 + }, + { + "epoch": 0.84, + "learning_rate": 1.3148680434569206e-06, + "loss": 0.4495, + "step": 7349 + }, + { + "epoch": 0.84, + "learning_rate": 1.3130338549663745e-06, + "loss": 0.4453, + "step": 7350 + }, + { + "epoch": 0.84, + "learning_rate": 1.3112008567841371e-06, + "loss": 0.4435, + "step": 7351 + }, + { + "epoch": 0.84, + "learning_rate": 1.309369049161372e-06, + "loss": 0.4557, + "step": 7352 + }, + { + "epoch": 0.84, + "learning_rate": 1.3075384323490759e-06, + "loss": 0.4497, + "step": 7353 + }, + { + "epoch": 0.84, + "learning_rate": 1.3057090065980816e-06, + "loss": 0.4427, + "step": 7354 + }, + { + "epoch": 0.84, + "learning_rate": 1.3038807721590663e-06, + "loss": 0.4577, + "step": 7355 + }, + { + "epoch": 0.84, + "learning_rate": 1.302053729282533e-06, + "loss": 0.4432, + "step": 7356 + }, + { + "epoch": 0.84, + "learning_rate": 1.3002278782188337e-06, + "loss": 0.4521, + "step": 7357 + }, + { + "epoch": 0.84, + "learning_rate": 1.2984032192181473e-06, + "loss": 0.4507, + "step": 7358 + }, + { + "epoch": 0.84, + "learning_rate": 1.2965797525304913e-06, + "loss": 0.447, + "step": 7359 + }, + { + "epoch": 0.84, + "learning_rate": 1.2947574784057237e-06, + "loss": 0.4633, + "step": 7360 + }, + { + "epoch": 0.84, + "learning_rate": 
1.2929363970935371e-06, + "loss": 0.4423, + "step": 7361 + }, + { + "epoch": 0.84, + "learning_rate": 1.2911165088434584e-06, + "loss": 0.4446, + "step": 7362 + }, + { + "epoch": 0.84, + "learning_rate": 1.2892978139048562e-06, + "loss": 0.459, + "step": 7363 + }, + { + "epoch": 0.84, + "learning_rate": 1.2874803125269274e-06, + "loss": 0.4648, + "step": 7364 + }, + { + "epoch": 0.84, + "learning_rate": 1.2856640049587154e-06, + "loss": 0.4511, + "step": 7365 + }, + { + "epoch": 0.84, + "learning_rate": 1.283848891449092e-06, + "loss": 0.4607, + "step": 7366 + }, + { + "epoch": 0.84, + "learning_rate": 1.2820349722467663e-06, + "loss": 0.4432, + "step": 7367 + }, + { + "epoch": 0.84, + "learning_rate": 1.2802222476002911e-06, + "loss": 0.4673, + "step": 7368 + }, + { + "epoch": 0.84, + "learning_rate": 1.278410717758045e-06, + "loss": 0.449, + "step": 7369 + }, + { + "epoch": 0.84, + "learning_rate": 1.2766003829682504e-06, + "loss": 0.4386, + "step": 7370 + }, + { + "epoch": 0.84, + "learning_rate": 1.2747912434789655e-06, + "loss": 0.4515, + "step": 7371 + }, + { + "epoch": 0.84, + "learning_rate": 1.2729832995380775e-06, + "loss": 0.4452, + "step": 7372 + }, + { + "epoch": 0.84, + "learning_rate": 1.2711765513933216e-06, + "loss": 0.4626, + "step": 7373 + }, + { + "epoch": 0.84, + "learning_rate": 1.2693709992922575e-06, + "loss": 0.4452, + "step": 7374 + }, + { + "epoch": 0.84, + "learning_rate": 1.2675666434822887e-06, + "loss": 0.443, + "step": 7375 + }, + { + "epoch": 0.84, + "learning_rate": 1.2657634842106526e-06, + "loss": 0.4632, + "step": 7376 + }, + { + "epoch": 0.84, + "learning_rate": 1.2639615217244194e-06, + "loss": 0.459, + "step": 7377 + }, + { + "epoch": 0.84, + "learning_rate": 1.2621607562705018e-06, + "loss": 0.4533, + "step": 7378 + }, + { + "epoch": 0.84, + "learning_rate": 1.26036118809564e-06, + "loss": 0.4352, + "step": 7379 + }, + { + "epoch": 0.84, + "learning_rate": 1.2585628174464192e-06, + "loss": 0.4407, + "step": 7380 + }, + { + 
"epoch": 0.84, + "learning_rate": 1.2567656445692566e-06, + "loss": 0.4571, + "step": 7381 + }, + { + "epoch": 0.84, + "learning_rate": 1.254969669710402e-06, + "loss": 0.4504, + "step": 7382 + }, + { + "epoch": 0.84, + "learning_rate": 1.2531748931159472e-06, + "loss": 0.4709, + "step": 7383 + }, + { + "epoch": 0.84, + "learning_rate": 1.2513813150318155e-06, + "loss": 0.4495, + "step": 7384 + }, + { + "epoch": 0.84, + "learning_rate": 1.249588935703765e-06, + "loss": 0.4497, + "step": 7385 + }, + { + "epoch": 0.84, + "learning_rate": 1.2477977553773957e-06, + "loss": 0.4771, + "step": 7386 + }, + { + "epoch": 0.84, + "learning_rate": 1.2460077742981347e-06, + "loss": 0.4575, + "step": 7387 + }, + { + "epoch": 0.84, + "learning_rate": 1.2442189927112514e-06, + "loss": 0.4703, + "step": 7388 + }, + { + "epoch": 0.84, + "learning_rate": 1.2424314108618507e-06, + "loss": 0.4338, + "step": 7389 + }, + { + "epoch": 0.84, + "learning_rate": 1.240645028994869e-06, + "loss": 0.44, + "step": 7390 + }, + { + "epoch": 0.84, + "learning_rate": 1.2388598473550828e-06, + "loss": 0.4633, + "step": 7391 + }, + { + "epoch": 0.84, + "learning_rate": 1.2370758661870997e-06, + "loss": 0.4367, + "step": 7392 + }, + { + "epoch": 0.84, + "learning_rate": 1.235293085735364e-06, + "loss": 0.4532, + "step": 7393 + }, + { + "epoch": 0.85, + "learning_rate": 1.2335115062441593e-06, + "loss": 0.4382, + "step": 7394 + }, + { + "epoch": 0.85, + "learning_rate": 1.2317311279575982e-06, + "loss": 0.4713, + "step": 7395 + }, + { + "epoch": 0.85, + "learning_rate": 1.2299519511196368e-06, + "loss": 0.4241, + "step": 7396 + }, + { + "epoch": 0.85, + "learning_rate": 1.2281739759740575e-06, + "loss": 0.4588, + "step": 7397 + }, + { + "epoch": 0.85, + "learning_rate": 1.2263972027644854e-06, + "loss": 0.4597, + "step": 7398 + }, + { + "epoch": 0.85, + "learning_rate": 1.2246216317343796e-06, + "loss": 0.4638, + "step": 7399 + }, + { + "epoch": 0.85, + "learning_rate": 1.2228472631270272e-06, + "loss": 
0.4634, + "step": 7400 + }, + { + "epoch": 0.85, + "learning_rate": 1.221074097185564e-06, + "loss": 0.4513, + "step": 7401 + }, + { + "epoch": 0.85, + "learning_rate": 1.2193021341529477e-06, + "loss": 0.4579, + "step": 7402 + }, + { + "epoch": 0.85, + "learning_rate": 1.2175313742719775e-06, + "loss": 0.4613, + "step": 7403 + }, + { + "epoch": 0.85, + "learning_rate": 1.2157618177852893e-06, + "loss": 0.466, + "step": 7404 + }, + { + "epoch": 0.85, + "learning_rate": 1.2139934649353503e-06, + "loss": 0.4523, + "step": 7405 + }, + { + "epoch": 0.85, + "learning_rate": 1.212226315964462e-06, + "loss": 0.4477, + "step": 7406 + }, + { + "epoch": 0.85, + "learning_rate": 1.2104603711147666e-06, + "loss": 0.4393, + "step": 7407 + }, + { + "epoch": 0.85, + "learning_rate": 1.2086956306282371e-06, + "loss": 0.4522, + "step": 7408 + }, + { + "epoch": 0.85, + "learning_rate": 1.2069320947466845e-06, + "loss": 0.4699, + "step": 7409 + }, + { + "epoch": 0.85, + "learning_rate": 1.20516976371175e-06, + "loss": 0.4423, + "step": 7410 + }, + { + "epoch": 0.85, + "learning_rate": 1.2034086377649102e-06, + "loss": 0.4703, + "step": 7411 + }, + { + "epoch": 0.85, + "learning_rate": 1.2016487171474844e-06, + "loss": 0.4311, + "step": 7412 + }, + { + "epoch": 0.85, + "learning_rate": 1.1998900021006155e-06, + "loss": 0.4456, + "step": 7413 + }, + { + "epoch": 0.85, + "learning_rate": 1.1981324928652905e-06, + "loss": 0.4382, + "step": 7414 + }, + { + "epoch": 0.85, + "learning_rate": 1.1963761896823255e-06, + "loss": 0.4528, + "step": 7415 + }, + { + "epoch": 0.85, + "learning_rate": 1.1946210927923729e-06, + "loss": 0.4467, + "step": 7416 + }, + { + "epoch": 0.85, + "learning_rate": 1.1928672024359211e-06, + "loss": 0.448, + "step": 7417 + }, + { + "epoch": 0.85, + "learning_rate": 1.1911145188532936e-06, + "loss": 0.4427, + "step": 7418 + }, + { + "epoch": 0.85, + "learning_rate": 1.1893630422846437e-06, + "loss": 0.4495, + "step": 7419 + }, + { + "epoch": 0.85, + "learning_rate": 
1.187612772969966e-06, + "loss": 0.4557, + "step": 7420 + }, + { + "epoch": 0.85, + "learning_rate": 1.1858637111490845e-06, + "loss": 0.4693, + "step": 7421 + }, + { + "epoch": 0.85, + "learning_rate": 1.1841158570616617e-06, + "loss": 0.453, + "step": 7422 + }, + { + "epoch": 0.85, + "learning_rate": 1.1823692109471919e-06, + "loss": 0.4517, + "step": 7423 + }, + { + "epoch": 0.85, + "learning_rate": 1.1806237730450009e-06, + "loss": 0.4596, + "step": 7424 + }, + { + "epoch": 0.85, + "learning_rate": 1.1788795435942591e-06, + "loss": 0.4496, + "step": 7425 + }, + { + "epoch": 0.85, + "learning_rate": 1.1771365228339593e-06, + "loss": 0.4406, + "step": 7426 + }, + { + "epoch": 0.85, + "learning_rate": 1.1753947110029373e-06, + "loss": 0.4555, + "step": 7427 + }, + { + "epoch": 0.85, + "learning_rate": 1.17365410833986e-06, + "loss": 0.4693, + "step": 7428 + }, + { + "epoch": 0.85, + "learning_rate": 1.1719147150832278e-06, + "loss": 0.4451, + "step": 7429 + }, + { + "epoch": 0.85, + "learning_rate": 1.1701765314713786e-06, + "loss": 0.4423, + "step": 7430 + }, + { + "epoch": 0.85, + "learning_rate": 1.16843955774248e-06, + "loss": 0.4563, + "step": 7431 + }, + { + "epoch": 0.85, + "learning_rate": 1.1667037941345361e-06, + "loss": 0.4354, + "step": 7432 + }, + { + "epoch": 0.85, + "learning_rate": 1.1649692408853875e-06, + "loss": 0.4411, + "step": 7433 + }, + { + "epoch": 0.85, + "learning_rate": 1.163235898232703e-06, + "loss": 0.4415, + "step": 7434 + }, + { + "epoch": 0.85, + "learning_rate": 1.1615037664139928e-06, + "loss": 0.4729, + "step": 7435 + }, + { + "epoch": 0.85, + "learning_rate": 1.1597728456665958e-06, + "loss": 0.4422, + "step": 7436 + }, + { + "epoch": 0.85, + "learning_rate": 1.1580431362276866e-06, + "loss": 0.47, + "step": 7437 + }, + { + "epoch": 0.85, + "learning_rate": 1.156314638334277e-06, + "loss": 0.4466, + "step": 7438 + }, + { + "epoch": 0.85, + "learning_rate": 1.1545873522232055e-06, + "loss": 0.4476, + "step": 7439 + }, + { + 
"epoch": 0.85, + "learning_rate": 1.1528612781311532e-06, + "loss": 0.4512, + "step": 7440 + }, + { + "epoch": 0.85, + "learning_rate": 1.1511364162946282e-06, + "loss": 0.4626, + "step": 7441 + }, + { + "epoch": 0.85, + "learning_rate": 1.1494127669499732e-06, + "loss": 0.4321, + "step": 7442 + }, + { + "epoch": 0.85, + "learning_rate": 1.147690330333371e-06, + "loss": 0.4471, + "step": 7443 + }, + { + "epoch": 0.85, + "learning_rate": 1.14596910668083e-06, + "loss": 0.463, + "step": 7444 + }, + { + "epoch": 0.85, + "learning_rate": 1.1442490962281983e-06, + "loss": 0.449, + "step": 7445 + }, + { + "epoch": 0.85, + "learning_rate": 1.1425302992111564e-06, + "loss": 0.4584, + "step": 7446 + }, + { + "epoch": 0.85, + "learning_rate": 1.140812715865215e-06, + "loss": 0.4398, + "step": 7447 + }, + { + "epoch": 0.85, + "learning_rate": 1.1390963464257254e-06, + "loss": 0.4492, + "step": 7448 + }, + { + "epoch": 0.85, + "learning_rate": 1.1373811911278666e-06, + "loss": 0.4535, + "step": 7449 + }, + { + "epoch": 0.85, + "learning_rate": 1.1356672502066512e-06, + "loss": 0.4298, + "step": 7450 + }, + { + "epoch": 0.85, + "learning_rate": 1.1339545238969308e-06, + "loss": 0.4531, + "step": 7451 + }, + { + "epoch": 0.85, + "learning_rate": 1.1322430124333839e-06, + "loss": 0.4477, + "step": 7452 + }, + { + "epoch": 0.85, + "learning_rate": 1.1305327160505286e-06, + "loss": 0.4755, + "step": 7453 + }, + { + "epoch": 0.85, + "learning_rate": 1.1288236349827108e-06, + "loss": 0.4572, + "step": 7454 + }, + { + "epoch": 0.85, + "learning_rate": 1.1271157694641144e-06, + "loss": 0.433, + "step": 7455 + }, + { + "epoch": 0.85, + "learning_rate": 1.1254091197287564e-06, + "loss": 0.4565, + "step": 7456 + }, + { + "epoch": 0.85, + "learning_rate": 1.1237036860104833e-06, + "loss": 0.4589, + "step": 7457 + }, + { + "epoch": 0.85, + "learning_rate": 1.1219994685429814e-06, + "loss": 0.4504, + "step": 7458 + }, + { + "epoch": 0.85, + "learning_rate": 1.1202964675597627e-06, + "loss": 
0.4553, + "step": 7459 + }, + { + "epoch": 0.85, + "learning_rate": 1.1185946832941774e-06, + "loss": 0.4452, + "step": 7460 + }, + { + "epoch": 0.85, + "learning_rate": 1.116894115979409e-06, + "loss": 0.4826, + "step": 7461 + }, + { + "epoch": 0.85, + "learning_rate": 1.115194765848473e-06, + "loss": 0.4367, + "step": 7462 + }, + { + "epoch": 0.85, + "learning_rate": 1.1134966331342157e-06, + "loss": 0.4409, + "step": 7463 + }, + { + "epoch": 0.85, + "learning_rate": 1.1117997180693207e-06, + "loss": 0.474, + "step": 7464 + }, + { + "epoch": 0.85, + "learning_rate": 1.1101040208863035e-06, + "loss": 0.4283, + "step": 7465 + }, + { + "epoch": 0.85, + "learning_rate": 1.1084095418175156e-06, + "loss": 0.4431, + "step": 7466 + }, + { + "epoch": 0.85, + "learning_rate": 1.106716281095136e-06, + "loss": 0.4705, + "step": 7467 + }, + { + "epoch": 0.85, + "learning_rate": 1.1050242389511757e-06, + "loss": 0.4547, + "step": 7468 + }, + { + "epoch": 0.85, + "learning_rate": 1.103333415617488e-06, + "loss": 0.4496, + "step": 7469 + }, + { + "epoch": 0.85, + "learning_rate": 1.1016438113257487e-06, + "loss": 0.4439, + "step": 7470 + }, + { + "epoch": 0.85, + "learning_rate": 1.0999554263074752e-06, + "loss": 0.4491, + "step": 7471 + }, + { + "epoch": 0.85, + "learning_rate": 1.0982682607940131e-06, + "loss": 0.4593, + "step": 7472 + }, + { + "epoch": 0.85, + "learning_rate": 1.0965823150165378e-06, + "loss": 0.4651, + "step": 7473 + }, + { + "epoch": 0.85, + "learning_rate": 1.0948975892060655e-06, + "loss": 0.4513, + "step": 7474 + }, + { + "epoch": 0.85, + "learning_rate": 1.0932140835934414e-06, + "loss": 0.4345, + "step": 7475 + }, + { + "epoch": 0.85, + "learning_rate": 1.091531798409341e-06, + "loss": 0.4719, + "step": 7476 + }, + { + "epoch": 0.85, + "learning_rate": 1.0898507338842779e-06, + "loss": 0.4596, + "step": 7477 + }, + { + "epoch": 0.85, + "learning_rate": 1.088170890248591e-06, + "loss": 0.4358, + "step": 7478 + }, + { + "epoch": 0.85, + "learning_rate": 
1.086492267732462e-06, + "loss": 0.452, + "step": 7479 + }, + { + "epoch": 0.85, + "learning_rate": 1.0848148665658975e-06, + "loss": 0.4601, + "step": 7480 + }, + { + "epoch": 0.85, + "learning_rate": 1.0831386869787353e-06, + "loss": 0.4573, + "step": 7481 + }, + { + "epoch": 0.86, + "learning_rate": 1.0814637292006536e-06, + "loss": 0.4399, + "step": 7482 + }, + { + "epoch": 0.86, + "learning_rate": 1.0797899934611567e-06, + "loss": 0.4623, + "step": 7483 + }, + { + "epoch": 0.86, + "learning_rate": 1.0781174799895844e-06, + "loss": 0.4499, + "step": 7484 + }, + { + "epoch": 0.86, + "learning_rate": 1.0764461890151112e-06, + "loss": 0.4421, + "step": 7485 + }, + { + "epoch": 0.86, + "learning_rate": 1.0747761207667372e-06, + "loss": 0.4446, + "step": 7486 + }, + { + "epoch": 0.86, + "learning_rate": 1.0731072754733019e-06, + "loss": 0.4585, + "step": 7487 + }, + { + "epoch": 0.86, + "learning_rate": 1.071439653363473e-06, + "loss": 0.4381, + "step": 7488 + }, + { + "epoch": 0.86, + "learning_rate": 1.0697732546657512e-06, + "loss": 0.4405, + "step": 7489 + }, + { + "epoch": 0.86, + "learning_rate": 1.068108079608473e-06, + "loss": 0.4639, + "step": 7490 + }, + { + "epoch": 0.86, + "learning_rate": 1.0664441284198002e-06, + "loss": 0.4362, + "step": 7491 + }, + { + "epoch": 0.86, + "learning_rate": 1.0647814013277358e-06, + "loss": 0.4415, + "step": 7492 + }, + { + "epoch": 0.86, + "learning_rate": 1.0631198985601077e-06, + "loss": 0.4541, + "step": 7493 + }, + { + "epoch": 0.86, + "learning_rate": 1.0614596203445793e-06, + "loss": 0.4463, + "step": 7494 + }, + { + "epoch": 0.86, + "learning_rate": 1.0598005669086475e-06, + "loss": 0.4557, + "step": 7495 + }, + { + "epoch": 0.86, + "learning_rate": 1.0581427384796372e-06, + "loss": 0.4597, + "step": 7496 + }, + { + "epoch": 0.86, + "learning_rate": 1.056486135284711e-06, + "loss": 0.4463, + "step": 7497 + }, + { + "epoch": 0.86, + "learning_rate": 1.0548307575508587e-06, + "loss": 0.4581, + "step": 7498 + }, + { 
+ "epoch": 0.86, + "learning_rate": 1.053176605504902e-06, + "loss": 0.439, + "step": 7499 + }, + { + "epoch": 0.86, + "learning_rate": 1.0515236793735007e-06, + "loss": 0.4394, + "step": 7500 + }, + { + "epoch": 0.86, + "learning_rate": 1.049871979383138e-06, + "loss": 0.4437, + "step": 7501 + }, + { + "epoch": 0.86, + "learning_rate": 1.0482215057601364e-06, + "loss": 0.4593, + "step": 7502 + }, + { + "epoch": 0.86, + "learning_rate": 1.0465722587306494e-06, + "loss": 0.4444, + "step": 7503 + }, + { + "epoch": 0.86, + "learning_rate": 1.044924238520657e-06, + "loss": 0.4475, + "step": 7504 + }, + { + "epoch": 0.86, + "learning_rate": 1.043277445355978e-06, + "loss": 0.4565, + "step": 7505 + }, + { + "epoch": 0.86, + "learning_rate": 1.0416318794622594e-06, + "loss": 0.4562, + "step": 7506 + }, + { + "epoch": 0.86, + "learning_rate": 1.0399875410649763e-06, + "loss": 0.441, + "step": 7507 + }, + { + "epoch": 0.86, + "learning_rate": 1.0383444303894453e-06, + "loss": 0.4679, + "step": 7508 + }, + { + "epoch": 0.86, + "learning_rate": 1.0367025476608038e-06, + "loss": 0.4631, + "step": 7509 + }, + { + "epoch": 0.86, + "learning_rate": 1.0350618931040324e-06, + "loss": 0.4433, + "step": 7510 + }, + { + "epoch": 0.86, + "learning_rate": 1.033422466943933e-06, + "loss": 0.4734, + "step": 7511 + }, + { + "epoch": 0.86, + "learning_rate": 1.031784269405144e-06, + "loss": 0.4519, + "step": 7512 + }, + { + "epoch": 0.86, + "learning_rate": 1.0301473007121376e-06, + "loss": 0.4499, + "step": 7513 + }, + { + "epoch": 0.86, + "learning_rate": 1.0285115610892138e-06, + "loss": 0.4637, + "step": 7514 + }, + { + "epoch": 0.86, + "learning_rate": 1.026877050760503e-06, + "loss": 0.4271, + "step": 7515 + }, + { + "epoch": 0.86, + "learning_rate": 1.025243769949974e-06, + "loss": 0.4543, + "step": 7516 + }, + { + "epoch": 0.86, + "learning_rate": 1.0236117188814187e-06, + "loss": 0.4584, + "step": 7517 + }, + { + "epoch": 0.86, + "learning_rate": 1.0219808977784673e-06, + "loss": 
0.4422, + "step": 7518 + }, + { + "epoch": 0.86, + "learning_rate": 1.0203513068645788e-06, + "loss": 0.4595, + "step": 7519 + }, + { + "epoch": 0.86, + "learning_rate": 1.01872294636304e-06, + "loss": 0.4512, + "step": 7520 + }, + { + "epoch": 0.86, + "learning_rate": 1.0170958164969746e-06, + "loss": 0.438, + "step": 7521 + }, + { + "epoch": 0.86, + "learning_rate": 1.0154699174893367e-06, + "loss": 0.4515, + "step": 7522 + }, + { + "epoch": 0.86, + "learning_rate": 1.0138452495629125e-06, + "loss": 0.4465, + "step": 7523 + }, + { + "epoch": 0.86, + "learning_rate": 1.012221812940315e-06, + "loss": 0.4385, + "step": 7524 + }, + { + "epoch": 0.86, + "learning_rate": 1.0105996078439894e-06, + "loss": 0.4496, + "step": 7525 + }, + { + "epoch": 0.86, + "learning_rate": 1.0089786344962194e-06, + "loss": 0.4472, + "step": 7526 + }, + { + "epoch": 0.86, + "learning_rate": 1.0073588931191104e-06, + "loss": 0.4485, + "step": 7527 + }, + { + "epoch": 0.86, + "learning_rate": 1.0057403839346037e-06, + "loss": 0.4544, + "step": 7528 + }, + { + "epoch": 0.86, + "learning_rate": 1.004123107164472e-06, + "loss": 0.4726, + "step": 7529 + }, + { + "epoch": 0.86, + "learning_rate": 1.0025070630303168e-06, + "loss": 0.4521, + "step": 7530 + }, + { + "epoch": 0.86, + "learning_rate": 1.0008922517535747e-06, + "loss": 0.4609, + "step": 7531 + }, + { + "epoch": 0.86, + "learning_rate": 9.992786735555104e-07, + "loss": 0.4313, + "step": 7532 + }, + { + "epoch": 0.86, + "learning_rate": 9.976663286572176e-07, + "loss": 0.4458, + "step": 7533 + }, + { + "epoch": 0.86, + "learning_rate": 9.960552172796278e-07, + "loss": 0.4657, + "step": 7534 + }, + { + "epoch": 0.86, + "learning_rate": 9.94445339643495e-07, + "loss": 0.435, + "step": 7535 + }, + { + "epoch": 0.86, + "learning_rate": 9.928366959694113e-07, + "loss": 0.4605, + "step": 7536 + }, + { + "epoch": 0.86, + "learning_rate": 9.912292864777961e-07, + "loss": 0.4526, + "step": 7537 + }, + { + "epoch": 0.86, + "learning_rate": 
9.896231113888988e-07, + "loss": 0.468, + "step": 7538 + }, + { + "epoch": 0.86, + "learning_rate": 9.880181709228032e-07, + "loss": 0.4484, + "step": 7539 + }, + { + "epoch": 0.86, + "learning_rate": 9.8641446529942e-07, + "loss": 0.4639, + "step": 7540 + }, + { + "epoch": 0.86, + "learning_rate": 9.848119947384937e-07, + "loss": 0.4492, + "step": 7541 + }, + { + "epoch": 0.86, + "learning_rate": 9.832107594596008e-07, + "loss": 0.4683, + "step": 7542 + }, + { + "epoch": 0.86, + "learning_rate": 9.81610759682141e-07, + "loss": 0.4317, + "step": 7543 + }, + { + "epoch": 0.86, + "learning_rate": 9.800119956253574e-07, + "loss": 0.4468, + "step": 7544 + }, + { + "epoch": 0.86, + "learning_rate": 9.784144675083107e-07, + "loss": 0.4447, + "step": 7545 + }, + { + "epoch": 0.86, + "learning_rate": 9.768181755498973e-07, + "loss": 0.4756, + "step": 7546 + }, + { + "epoch": 0.86, + "learning_rate": 9.7522311996885e-07, + "loss": 0.4309, + "step": 7547 + }, + { + "epoch": 0.86, + "learning_rate": 9.736293009837206e-07, + "loss": 0.446, + "step": 7548 + }, + { + "epoch": 0.86, + "learning_rate": 9.720367188129043e-07, + "loss": 0.452, + "step": 7549 + }, + { + "epoch": 0.86, + "learning_rate": 9.704453736746156e-07, + "loss": 0.4351, + "step": 7550 + }, + { + "epoch": 0.86, + "learning_rate": 9.688552657869055e-07, + "loss": 0.4406, + "step": 7551 + }, + { + "epoch": 0.86, + "learning_rate": 9.672663953676563e-07, + "loss": 0.467, + "step": 7552 + }, + { + "epoch": 0.86, + "learning_rate": 9.656787626345765e-07, + "loss": 0.4444, + "step": 7553 + }, + { + "epoch": 0.86, + "learning_rate": 9.640923678052094e-07, + "loss": 0.4607, + "step": 7554 + }, + { + "epoch": 0.86, + "learning_rate": 9.625072110969246e-07, + "loss": 0.4649, + "step": 7555 + }, + { + "epoch": 0.86, + "learning_rate": 9.60923292726923e-07, + "loss": 0.4632, + "step": 7556 + }, + { + "epoch": 0.86, + "learning_rate": 9.593406129122397e-07, + "loss": 0.4489, + "step": 7557 + }, + { + "epoch": 0.86, + 
"learning_rate": 9.577591718697343e-07, + "loss": 0.4486, + "step": 7558 + }, + { + "epoch": 0.86, + "learning_rate": 9.561789698161007e-07, + "loss": 0.4312, + "step": 7559 + }, + { + "epoch": 0.86, + "learning_rate": 9.54600006967864e-07, + "loss": 0.4647, + "step": 7560 + }, + { + "epoch": 0.86, + "learning_rate": 9.530222835413739e-07, + "loss": 0.451, + "step": 7561 + }, + { + "epoch": 0.86, + "learning_rate": 9.514457997528171e-07, + "loss": 0.4704, + "step": 7562 + }, + { + "epoch": 0.86, + "learning_rate": 9.498705558182053e-07, + "loss": 0.4483, + "step": 7563 + }, + { + "epoch": 0.86, + "learning_rate": 9.482965519533804e-07, + "loss": 0.4656, + "step": 7564 + }, + { + "epoch": 0.86, + "learning_rate": 9.467237883740199e-07, + "loss": 0.4515, + "step": 7565 + }, + { + "epoch": 0.86, + "learning_rate": 9.451522652956225e-07, + "loss": 0.4385, + "step": 7566 + }, + { + "epoch": 0.86, + "learning_rate": 9.435819829335269e-07, + "loss": 0.4314, + "step": 7567 + }, + { + "epoch": 0.86, + "learning_rate": 9.420129415028934e-07, + "loss": 0.4629, + "step": 7568 + }, + { + "epoch": 0.87, + "learning_rate": 9.404451412187166e-07, + "loss": 0.4392, + "step": 7569 + }, + { + "epoch": 0.87, + "learning_rate": 9.388785822958224e-07, + "loss": 0.4575, + "step": 7570 + }, + { + "epoch": 0.87, + "learning_rate": 9.373132649488636e-07, + "loss": 0.4626, + "step": 7571 + }, + { + "epoch": 0.87, + "learning_rate": 9.357491893923198e-07, + "loss": 0.4594, + "step": 7572 + }, + { + "epoch": 0.87, + "learning_rate": 9.341863558405084e-07, + "loss": 0.4318, + "step": 7573 + }, + { + "epoch": 0.87, + "learning_rate": 9.326247645075703e-07, + "loss": 0.4612, + "step": 7574 + }, + { + "epoch": 0.87, + "learning_rate": 9.310644156074811e-07, + "loss": 0.4626, + "step": 7575 + }, + { + "epoch": 0.87, + "learning_rate": 9.295053093540408e-07, + "loss": 0.43, + "step": 7576 + }, + { + "epoch": 0.87, + "learning_rate": 9.279474459608806e-07, + "loss": 0.4374, + "step": 7577 + }, + { + 
"epoch": 0.87, + "learning_rate": 9.263908256414656e-07, + "loss": 0.4794, + "step": 7578 + }, + { + "epoch": 0.87, + "learning_rate": 9.24835448609085e-07, + "loss": 0.4464, + "step": 7579 + }, + { + "epoch": 0.87, + "learning_rate": 9.23281315076865e-07, + "loss": 0.4559, + "step": 7580 + }, + { + "epoch": 0.87, + "learning_rate": 9.217284252577519e-07, + "loss": 0.4274, + "step": 7581 + }, + { + "epoch": 0.87, + "learning_rate": 9.201767793645255e-07, + "loss": 0.4647, + "step": 7582 + }, + { + "epoch": 0.87, + "learning_rate": 9.186263776098014e-07, + "loss": 0.4514, + "step": 7583 + }, + { + "epoch": 0.87, + "learning_rate": 9.170772202060141e-07, + "loss": 0.4447, + "step": 7584 + }, + { + "epoch": 0.87, + "learning_rate": 9.155293073654337e-07, + "loss": 0.4388, + "step": 7585 + }, + { + "epoch": 0.87, + "learning_rate": 9.139826393001617e-07, + "loss": 0.452, + "step": 7586 + }, + { + "epoch": 0.87, + "learning_rate": 9.124372162221217e-07, + "loss": 0.4597, + "step": 7587 + }, + { + "epoch": 0.87, + "learning_rate": 9.108930383430736e-07, + "loss": 0.4609, + "step": 7588 + }, + { + "epoch": 0.87, + "learning_rate": 9.093501058746057e-07, + "loss": 0.4423, + "step": 7589 + }, + { + "epoch": 0.87, + "learning_rate": 9.078084190281311e-07, + "loss": 0.4459, + "step": 7590 + }, + { + "epoch": 0.87, + "learning_rate": 9.062679780148987e-07, + "loss": 0.4487, + "step": 7591 + }, + { + "epoch": 0.87, + "learning_rate": 9.047287830459806e-07, + "loss": 0.4524, + "step": 7592 + }, + { + "epoch": 0.87, + "learning_rate": 9.031908343322826e-07, + "loss": 0.449, + "step": 7593 + }, + { + "epoch": 0.87, + "learning_rate": 9.016541320845373e-07, + "loss": 0.4447, + "step": 7594 + }, + { + "epoch": 0.87, + "learning_rate": 9.001186765133052e-07, + "loss": 0.4528, + "step": 7595 + }, + { + "epoch": 0.87, + "learning_rate": 8.985844678289823e-07, + "loss": 0.4641, + "step": 7596 + }, + { + "epoch": 0.87, + "learning_rate": 8.97051506241785e-07, + "loss": 0.4672, + "step": 
7597 + }, + { + "epoch": 0.87, + "learning_rate": 8.955197919617653e-07, + "loss": 0.4495, + "step": 7598 + }, + { + "epoch": 0.87, + "learning_rate": 8.93989325198803e-07, + "loss": 0.4568, + "step": 7599 + }, + { + "epoch": 0.87, + "learning_rate": 8.924601061626049e-07, + "loss": 0.4654, + "step": 7600 + }, + { + "epoch": 0.87, + "learning_rate": 8.909321350627109e-07, + "loss": 0.4634, + "step": 7601 + }, + { + "epoch": 0.87, + "learning_rate": 8.894054121084839e-07, + "loss": 0.4489, + "step": 7602 + }, + { + "epoch": 0.87, + "learning_rate": 8.878799375091185e-07, + "loss": 0.444, + "step": 7603 + }, + { + "epoch": 0.87, + "learning_rate": 8.863557114736432e-07, + "loss": 0.4585, + "step": 7604 + }, + { + "epoch": 0.87, + "learning_rate": 8.848327342109053e-07, + "loss": 0.4396, + "step": 7605 + }, + { + "epoch": 0.87, + "learning_rate": 8.833110059295913e-07, + "loss": 0.4655, + "step": 7606 + }, + { + "epoch": 0.87, + "learning_rate": 8.817905268382088e-07, + "loss": 0.4619, + "step": 7607 + }, + { + "epoch": 0.87, + "learning_rate": 8.802712971450989e-07, + "loss": 0.4306, + "step": 7608 + }, + { + "epoch": 0.87, + "learning_rate": 8.787533170584317e-07, + "loss": 0.4527, + "step": 7609 + }, + { + "epoch": 0.87, + "learning_rate": 8.772365867862021e-07, + "loss": 0.4511, + "step": 7610 + }, + { + "epoch": 0.87, + "learning_rate": 8.757211065362359e-07, + "loss": 0.4495, + "step": 7611 + }, + { + "epoch": 0.87, + "learning_rate": 8.742068765161893e-07, + "loss": 0.4779, + "step": 7612 + }, + { + "epoch": 0.87, + "learning_rate": 8.726938969335419e-07, + "loss": 0.4548, + "step": 7613 + }, + { + "epoch": 0.87, + "learning_rate": 8.711821679956111e-07, + "loss": 0.4472, + "step": 7614 + }, + { + "epoch": 0.87, + "learning_rate": 8.696716899095336e-07, + "loss": 0.4485, + "step": 7615 + }, + { + "epoch": 0.87, + "learning_rate": 8.681624628822794e-07, + "loss": 0.4791, + "step": 7616 + }, + { + "epoch": 0.87, + "learning_rate": 8.666544871206484e-07, + "loss": 
0.4501, + "step": 7617 + }, + { + "epoch": 0.87, + "learning_rate": 8.651477628312632e-07, + "loss": 0.4501, + "step": 7618 + }, + { + "epoch": 0.87, + "learning_rate": 8.636422902205821e-07, + "loss": 0.4504, + "step": 7619 + }, + { + "epoch": 0.87, + "learning_rate": 8.621380694948878e-07, + "loss": 0.4417, + "step": 7620 + }, + { + "epoch": 0.87, + "learning_rate": 8.606351008602898e-07, + "loss": 0.4476, + "step": 7621 + }, + { + "epoch": 0.87, + "learning_rate": 8.591333845227312e-07, + "loss": 0.4568, + "step": 7622 + }, + { + "epoch": 0.87, + "learning_rate": 8.576329206879785e-07, + "loss": 0.4544, + "step": 7623 + }, + { + "epoch": 0.87, + "learning_rate": 8.561337095616306e-07, + "loss": 0.4428, + "step": 7624 + }, + { + "epoch": 0.87, + "learning_rate": 8.54635751349111e-07, + "loss": 0.4396, + "step": 7625 + }, + { + "epoch": 0.87, + "learning_rate": 8.531390462556744e-07, + "loss": 0.4423, + "step": 7626 + }, + { + "epoch": 0.87, + "learning_rate": 8.516435944864043e-07, + "loss": 0.4446, + "step": 7627 + }, + { + "epoch": 0.87, + "learning_rate": 8.501493962462092e-07, + "loss": 0.4541, + "step": 7628 + }, + { + "epoch": 0.87, + "learning_rate": 8.486564517398265e-07, + "loss": 0.4383, + "step": 7629 + }, + { + "epoch": 0.87, + "learning_rate": 8.471647611718259e-07, + "loss": 0.4603, + "step": 7630 + }, + { + "epoch": 0.87, + "learning_rate": 8.456743247465992e-07, + "loss": 0.4471, + "step": 7631 + }, + { + "epoch": 0.87, + "learning_rate": 8.441851426683723e-07, + "loss": 0.4791, + "step": 7632 + }, + { + "epoch": 0.87, + "learning_rate": 8.426972151411961e-07, + "loss": 0.4562, + "step": 7633 + }, + { + "epoch": 0.87, + "learning_rate": 8.412105423689465e-07, + "loss": 0.4443, + "step": 7634 + }, + { + "epoch": 0.87, + "learning_rate": 8.397251245553339e-07, + "loss": 0.4659, + "step": 7635 + }, + { + "epoch": 0.87, + "learning_rate": 8.382409619038923e-07, + "loss": 0.4401, + "step": 7636 + }, + { + "epoch": 0.87, + "learning_rate": 
8.367580546179877e-07, + "loss": 0.4638, + "step": 7637 + }, + { + "epoch": 0.87, + "learning_rate": 8.352764029008098e-07, + "loss": 0.4554, + "step": 7638 + }, + { + "epoch": 0.87, + "learning_rate": 8.337960069553763e-07, + "loss": 0.4306, + "step": 7639 + }, + { + "epoch": 0.87, + "learning_rate": 8.323168669845383e-07, + "loss": 0.4672, + "step": 7640 + }, + { + "epoch": 0.87, + "learning_rate": 8.30838983190968e-07, + "loss": 0.453, + "step": 7641 + }, + { + "epoch": 0.87, + "learning_rate": 8.29362355777168e-07, + "loss": 0.4381, + "step": 7642 + }, + { + "epoch": 0.87, + "learning_rate": 8.278869849454718e-07, + "loss": 0.4571, + "step": 7643 + }, + { + "epoch": 0.87, + "learning_rate": 8.264128708980345e-07, + "loss": 0.4619, + "step": 7644 + }, + { + "epoch": 0.87, + "learning_rate": 8.249400138368457e-07, + "loss": 0.4437, + "step": 7645 + }, + { + "epoch": 0.87, + "learning_rate": 8.234684139637205e-07, + "loss": 0.4636, + "step": 7646 + }, + { + "epoch": 0.87, + "learning_rate": 8.219980714802978e-07, + "loss": 0.4432, + "step": 7647 + }, + { + "epoch": 0.87, + "learning_rate": 8.205289865880505e-07, + "loss": 0.4722, + "step": 7648 + }, + { + "epoch": 0.87, + "learning_rate": 8.190611594882736e-07, + "loss": 0.4393, + "step": 7649 + }, + { + "epoch": 0.87, + "learning_rate": 8.175945903820937e-07, + "loss": 0.4602, + "step": 7650 + }, + { + "epoch": 0.87, + "learning_rate": 8.161292794704634e-07, + "loss": 0.4411, + "step": 7651 + }, + { + "epoch": 0.87, + "learning_rate": 8.146652269541599e-07, + "loss": 0.458, + "step": 7652 + }, + { + "epoch": 0.87, + "learning_rate": 8.132024330337962e-07, + "loss": 0.4397, + "step": 7653 + }, + { + "epoch": 0.87, + "learning_rate": 8.11740897909803e-07, + "loss": 0.4447, + "step": 7654 + }, + { + "epoch": 0.87, + "learning_rate": 8.102806217824455e-07, + "loss": 0.4601, + "step": 7655 + }, + { + "epoch": 0.87, + "learning_rate": 8.08821604851816e-07, + "loss": 0.4818, + "step": 7656 + }, + { + "epoch": 0.88, + 
"learning_rate": 8.073638473178291e-07, + "loss": 0.4526, + "step": 7657 + }, + { + "epoch": 0.88, + "learning_rate": 8.059073493802327e-07, + "loss": 0.4646, + "step": 7658 + }, + { + "epoch": 0.88, + "learning_rate": 8.044521112385983e-07, + "loss": 0.4406, + "step": 7659 + }, + { + "epoch": 0.88, + "learning_rate": 8.029981330923242e-07, + "loss": 0.4413, + "step": 7660 + }, + { + "epoch": 0.88, + "learning_rate": 8.01545415140641e-07, + "loss": 0.4463, + "step": 7661 + }, + { + "epoch": 0.88, + "learning_rate": 8.000939575826016e-07, + "loss": 0.4578, + "step": 7662 + }, + { + "epoch": 0.88, + "learning_rate": 7.986437606170893e-07, + "loss": 0.4601, + "step": 7663 + }, + { + "epoch": 0.88, + "learning_rate": 7.971948244428118e-07, + "loss": 0.4446, + "step": 7664 + }, + { + "epoch": 0.88, + "learning_rate": 7.957471492583068e-07, + "loss": 0.4322, + "step": 7665 + }, + { + "epoch": 0.88, + "learning_rate": 7.943007352619392e-07, + "loss": 0.4489, + "step": 7666 + }, + { + "epoch": 0.88, + "learning_rate": 7.928555826518991e-07, + "loss": 0.4336, + "step": 7667 + }, + { + "epoch": 0.88, + "learning_rate": 7.914116916262027e-07, + "loss": 0.4407, + "step": 7668 + }, + { + "epoch": 0.88, + "learning_rate": 7.899690623826983e-07, + "loss": 0.4515, + "step": 7669 + }, + { + "epoch": 0.88, + "learning_rate": 7.885276951190568e-07, + "loss": 0.4516, + "step": 7670 + }, + { + "epoch": 0.88, + "learning_rate": 7.870875900327779e-07, + "loss": 0.4426, + "step": 7671 + }, + { + "epoch": 0.88, + "learning_rate": 7.856487473211871e-07, + "loss": 0.4682, + "step": 7672 + }, + { + "epoch": 0.88, + "learning_rate": 7.842111671814401e-07, + "loss": 0.4245, + "step": 7673 + }, + { + "epoch": 0.88, + "learning_rate": 7.82774849810517e-07, + "loss": 0.4563, + "step": 7674 + }, + { + "epoch": 0.88, + "learning_rate": 7.813397954052237e-07, + "loss": 0.4313, + "step": 7675 + }, + { + "epoch": 0.88, + "learning_rate": 7.799060041621975e-07, + "loss": 0.449, + "step": 7676 + }, + { + 
"epoch": 0.88, + "learning_rate": 7.784734762778978e-07, + "loss": 0.4433, + "step": 7677 + }, + { + "epoch": 0.88, + "learning_rate": 7.77042211948611e-07, + "loss": 0.438, + "step": 7678 + }, + { + "epoch": 0.88, + "learning_rate": 7.756122113704567e-07, + "loss": 0.4465, + "step": 7679 + }, + { + "epoch": 0.88, + "learning_rate": 7.741834747393751e-07, + "loss": 0.4503, + "step": 7680 + }, + { + "epoch": 0.88, + "learning_rate": 7.727560022511327e-07, + "loss": 0.4502, + "step": 7681 + }, + { + "epoch": 0.88, + "learning_rate": 7.713297941013264e-07, + "loss": 0.455, + "step": 7682 + }, + { + "epoch": 0.88, + "learning_rate": 7.69904850485379e-07, + "loss": 0.4359, + "step": 7683 + }, + { + "epoch": 0.88, + "learning_rate": 7.684811715985429e-07, + "loss": 0.4722, + "step": 7684 + }, + { + "epoch": 0.88, + "learning_rate": 7.670587576358889e-07, + "loss": 0.4377, + "step": 7685 + }, + { + "epoch": 0.88, + "learning_rate": 7.656376087923212e-07, + "loss": 0.4555, + "step": 7686 + }, + { + "epoch": 0.88, + "learning_rate": 7.642177252625704e-07, + "loss": 0.4569, + "step": 7687 + }, + { + "epoch": 0.88, + "learning_rate": 7.627991072411889e-07, + "loss": 0.4471, + "step": 7688 + }, + { + "epoch": 0.88, + "learning_rate": 7.613817549225621e-07, + "loss": 0.4646, + "step": 7689 + }, + { + "epoch": 0.88, + "learning_rate": 7.599656685008982e-07, + "loss": 0.4708, + "step": 7690 + }, + { + "epoch": 0.88, + "learning_rate": 7.585508481702308e-07, + "loss": 0.4315, + "step": 7691 + }, + { + "epoch": 0.88, + "learning_rate": 7.571372941244237e-07, + "loss": 0.4523, + "step": 7692 + }, + { + "epoch": 0.88, + "learning_rate": 7.557250065571664e-07, + "loss": 0.435, + "step": 7693 + }, + { + "epoch": 0.88, + "learning_rate": 7.543139856619708e-07, + "loss": 0.4433, + "step": 7694 + }, + { + "epoch": 0.88, + "learning_rate": 7.52904231632181e-07, + "loss": 0.4802, + "step": 7695 + }, + { + "epoch": 0.88, + "learning_rate": 7.514957446609627e-07, + "loss": 0.4475, + "step": 
7696 + }, + { + "epoch": 0.88, + "learning_rate": 7.500885249413126e-07, + "loss": 0.469, + "step": 7697 + }, + { + "epoch": 0.88, + "learning_rate": 7.486825726660496e-07, + "loss": 0.4423, + "step": 7698 + }, + { + "epoch": 0.88, + "learning_rate": 7.472778880278197e-07, + "loss": 0.4658, + "step": 7699 + }, + { + "epoch": 0.88, + "learning_rate": 7.45874471219098e-07, + "loss": 0.4586, + "step": 7700 + }, + { + "epoch": 0.88, + "learning_rate": 7.444723224321804e-07, + "loss": 0.4538, + "step": 7701 + }, + { + "epoch": 0.88, + "learning_rate": 7.430714418591966e-07, + "loss": 0.4298, + "step": 7702 + }, + { + "epoch": 0.88, + "learning_rate": 7.416718296920977e-07, + "loss": 0.4549, + "step": 7703 + }, + { + "epoch": 0.88, + "learning_rate": 7.40273486122659e-07, + "loss": 0.438, + "step": 7704 + }, + { + "epoch": 0.88, + "learning_rate": 7.388764113424895e-07, + "loss": 0.4753, + "step": 7705 + }, + { + "epoch": 0.88, + "learning_rate": 7.37480605543015e-07, + "loss": 0.4512, + "step": 7706 + }, + { + "epoch": 0.88, + "learning_rate": 7.360860689154969e-07, + "loss": 0.4467, + "step": 7707 + }, + { + "epoch": 0.88, + "learning_rate": 7.346928016510135e-07, + "loss": 0.438, + "step": 7708 + }, + { + "epoch": 0.88, + "learning_rate": 7.333008039404743e-07, + "loss": 0.4606, + "step": 7709 + }, + { + "epoch": 0.88, + "learning_rate": 7.319100759746167e-07, + "loss": 0.4459, + "step": 7710 + }, + { + "epoch": 0.88, + "learning_rate": 7.305206179439972e-07, + "loss": 0.4351, + "step": 7711 + }, + { + "epoch": 0.88, + "learning_rate": 7.291324300390057e-07, + "loss": 0.46, + "step": 7712 + }, + { + "epoch": 0.88, + "learning_rate": 7.277455124498545e-07, + "loss": 0.4704, + "step": 7713 + }, + { + "epoch": 0.88, + "learning_rate": 7.263598653665815e-07, + "loss": 0.4559, + "step": 7714 + }, + { + "epoch": 0.88, + "learning_rate": 7.249754889790539e-07, + "loss": 0.4432, + "step": 7715 + }, + { + "epoch": 0.88, + "learning_rate": 7.235923834769599e-07, + "loss": 
0.4376, + "step": 7716 + }, + { + "epoch": 0.88, + "learning_rate": 7.222105490498133e-07, + "loss": 0.4602, + "step": 7717 + }, + { + "epoch": 0.88, + "learning_rate": 7.208299858869616e-07, + "loss": 0.449, + "step": 7718 + }, + { + "epoch": 0.88, + "learning_rate": 7.194506941775681e-07, + "loss": 0.4414, + "step": 7719 + }, + { + "epoch": 0.88, + "learning_rate": 7.180726741106303e-07, + "loss": 0.4416, + "step": 7720 + }, + { + "epoch": 0.88, + "learning_rate": 7.16695925874964e-07, + "loss": 0.4513, + "step": 7721 + }, + { + "epoch": 0.88, + "learning_rate": 7.15320449659217e-07, + "loss": 0.4673, + "step": 7722 + }, + { + "epoch": 0.88, + "learning_rate": 7.139462456518619e-07, + "loss": 0.4606, + "step": 7723 + }, + { + "epoch": 0.88, + "learning_rate": 7.125733140411928e-07, + "loss": 0.4422, + "step": 7724 + }, + { + "epoch": 0.88, + "learning_rate": 7.1120165501533e-07, + "loss": 0.45, + "step": 7725 + }, + { + "epoch": 0.88, + "learning_rate": 7.098312687622256e-07, + "loss": 0.4541, + "step": 7726 + }, + { + "epoch": 0.88, + "learning_rate": 7.084621554696502e-07, + "loss": 0.4516, + "step": 7727 + }, + { + "epoch": 0.88, + "learning_rate": 7.070943153252053e-07, + "loss": 0.4362, + "step": 7728 + }, + { + "epoch": 0.88, + "learning_rate": 7.057277485163116e-07, + "loss": 0.4724, + "step": 7729 + }, + { + "epoch": 0.88, + "learning_rate": 7.043624552302231e-07, + "loss": 0.4389, + "step": 7730 + }, + { + "epoch": 0.88, + "learning_rate": 7.029984356540153e-07, + "loss": 0.4602, + "step": 7731 + }, + { + "epoch": 0.88, + "learning_rate": 7.016356899745869e-07, + "loss": 0.4498, + "step": 7732 + }, + { + "epoch": 0.88, + "learning_rate": 7.002742183786671e-07, + "loss": 0.4505, + "step": 7733 + }, + { + "epoch": 0.88, + "learning_rate": 6.989140210528067e-07, + "loss": 0.4436, + "step": 7734 + }, + { + "epoch": 0.88, + "learning_rate": 6.975550981833823e-07, + "loss": 0.4532, + "step": 7735 + }, + { + "epoch": 0.88, + "learning_rate": 
6.961974499565982e-07, + "loss": 0.4534, + "step": 7736 + }, + { + "epoch": 0.88, + "learning_rate": 6.948410765584813e-07, + "loss": 0.4557, + "step": 7737 + }, + { + "epoch": 0.88, + "learning_rate": 6.934859781748848e-07, + "loss": 0.4727, + "step": 7738 + }, + { + "epoch": 0.88, + "learning_rate": 6.921321549914872e-07, + "loss": 0.4457, + "step": 7739 + }, + { + "epoch": 0.88, + "learning_rate": 6.907796071937944e-07, + "loss": 0.4459, + "step": 7740 + }, + { + "epoch": 0.88, + "learning_rate": 6.894283349671349e-07, + "loss": 0.4443, + "step": 7741 + }, + { + "epoch": 0.88, + "learning_rate": 6.880783384966638e-07, + "loss": 0.4602, + "step": 7742 + }, + { + "epoch": 0.88, + "learning_rate": 6.867296179673588e-07, + "loss": 0.4401, + "step": 7743 + }, + { + "epoch": 0.89, + "learning_rate": 6.853821735640265e-07, + "loss": 0.438, + "step": 7744 + }, + { + "epoch": 0.89, + "learning_rate": 6.840360054712946e-07, + "loss": 0.4418, + "step": 7745 + }, + { + "epoch": 0.89, + "learning_rate": 6.826911138736214e-07, + "loss": 0.4423, + "step": 7746 + }, + { + "epoch": 0.89, + "learning_rate": 6.81347498955286e-07, + "loss": 0.461, + "step": 7747 + }, + { + "epoch": 0.89, + "learning_rate": 6.800051609003911e-07, + "loss": 0.4588, + "step": 7748 + }, + { + "epoch": 0.89, + "learning_rate": 6.786640998928684e-07, + "loss": 0.457, + "step": 7749 + }, + { + "epoch": 0.89, + "learning_rate": 6.773243161164756e-07, + "loss": 0.4321, + "step": 7750 + }, + { + "epoch": 0.89, + "learning_rate": 6.7598580975479e-07, + "loss": 0.4422, + "step": 7751 + }, + { + "epoch": 0.89, + "learning_rate": 6.746485809912184e-07, + "loss": 0.4472, + "step": 7752 + }, + { + "epoch": 0.89, + "learning_rate": 6.733126300089898e-07, + "loss": 0.4487, + "step": 7753 + }, + { + "epoch": 0.89, + "learning_rate": 6.719779569911622e-07, + "loss": 0.4533, + "step": 7754 + }, + { + "epoch": 0.89, + "learning_rate": 6.706445621206126e-07, + "loss": 0.4623, + "step": 7755 + }, + { + "epoch": 0.89, + 
"learning_rate": 6.69312445580046e-07, + "loss": 0.4483, + "step": 7756 + }, + { + "epoch": 0.89, + "learning_rate": 6.67981607551994e-07, + "loss": 0.4525, + "step": 7757 + }, + { + "epoch": 0.89, + "learning_rate": 6.666520482188087e-07, + "loss": 0.4394, + "step": 7758 + }, + { + "epoch": 0.89, + "learning_rate": 6.653237677626701e-07, + "loss": 0.458, + "step": 7759 + }, + { + "epoch": 0.89, + "learning_rate": 6.639967663655844e-07, + "loss": 0.4302, + "step": 7760 + }, + { + "epoch": 0.89, + "learning_rate": 6.626710442093776e-07, + "loss": 0.4385, + "step": 7761 + }, + { + "epoch": 0.89, + "learning_rate": 6.613466014757064e-07, + "loss": 0.4548, + "step": 7762 + }, + { + "epoch": 0.89, + "learning_rate": 6.600234383460469e-07, + "loss": 0.4378, + "step": 7763 + }, + { + "epoch": 0.89, + "learning_rate": 6.587015550017006e-07, + "loss": 0.431, + "step": 7764 + }, + { + "epoch": 0.89, + "learning_rate": 6.573809516237984e-07, + "loss": 0.4637, + "step": 7765 + }, + { + "epoch": 0.89, + "learning_rate": 6.560616283932897e-07, + "loss": 0.4629, + "step": 7766 + }, + { + "epoch": 0.89, + "learning_rate": 6.547435854909534e-07, + "loss": 0.4502, + "step": 7767 + }, + { + "epoch": 0.89, + "learning_rate": 6.534268230973873e-07, + "loss": 0.4431, + "step": 7768 + }, + { + "epoch": 0.89, + "learning_rate": 6.521113413930202e-07, + "loss": 0.452, + "step": 7769 + }, + { + "epoch": 0.89, + "learning_rate": 6.507971405581037e-07, + "loss": 0.4402, + "step": 7770 + }, + { + "epoch": 0.89, + "learning_rate": 6.494842207727092e-07, + "loss": 0.4589, + "step": 7771 + }, + { + "epoch": 0.89, + "learning_rate": 6.481725822167384e-07, + "loss": 0.446, + "step": 7772 + }, + { + "epoch": 0.89, + "learning_rate": 6.468622250699152e-07, + "loss": 0.4478, + "step": 7773 + }, + { + "epoch": 0.89, + "learning_rate": 6.45553149511785e-07, + "loss": 0.4533, + "step": 7774 + }, + { + "epoch": 0.89, + "learning_rate": 6.442453557217243e-07, + "loss": 0.4651, + "step": 7775 + }, + { + 
"epoch": 0.89, + "learning_rate": 6.429388438789252e-07, + "loss": 0.4491, + "step": 7776 + }, + { + "epoch": 0.89, + "learning_rate": 6.416336141624146e-07, + "loss": 0.4507, + "step": 7777 + }, + { + "epoch": 0.89, + "learning_rate": 6.403296667510339e-07, + "loss": 0.4598, + "step": 7778 + }, + { + "epoch": 0.89, + "learning_rate": 6.390270018234534e-07, + "loss": 0.4574, + "step": 7779 + }, + { + "epoch": 0.89, + "learning_rate": 6.377256195581705e-07, + "loss": 0.4618, + "step": 7780 + }, + { + "epoch": 0.89, + "learning_rate": 6.364255201335013e-07, + "loss": 0.4581, + "step": 7781 + }, + { + "epoch": 0.89, + "learning_rate": 6.351267037275877e-07, + "loss": 0.4482, + "step": 7782 + }, + { + "epoch": 0.89, + "learning_rate": 6.338291705183986e-07, + "loss": 0.4528, + "step": 7783 + }, + { + "epoch": 0.89, + "learning_rate": 6.325329206837217e-07, + "loss": 0.463, + "step": 7784 + }, + { + "epoch": 0.89, + "learning_rate": 6.31237954401176e-07, + "loss": 0.4425, + "step": 7785 + }, + { + "epoch": 0.89, + "learning_rate": 6.299442718481974e-07, + "loss": 0.4418, + "step": 7786 + }, + { + "epoch": 0.89, + "learning_rate": 6.286518732020519e-07, + "loss": 0.4509, + "step": 7787 + }, + { + "epoch": 0.89, + "learning_rate": 6.273607586398267e-07, + "loss": 0.4507, + "step": 7788 + }, + { + "epoch": 0.89, + "learning_rate": 6.260709283384326e-07, + "loss": 0.4678, + "step": 7789 + }, + { + "epoch": 0.89, + "learning_rate": 6.247823824746058e-07, + "loss": 0.4374, + "step": 7790 + }, + { + "epoch": 0.89, + "learning_rate": 6.234951212249052e-07, + "loss": 0.4768, + "step": 7791 + }, + { + "epoch": 0.89, + "learning_rate": 6.222091447657119e-07, + "loss": 0.4333, + "step": 7792 + }, + { + "epoch": 0.89, + "learning_rate": 6.209244532732394e-07, + "loss": 0.4462, + "step": 7793 + }, + { + "epoch": 0.89, + "learning_rate": 6.196410469235148e-07, + "loss": 0.4515, + "step": 7794 + }, + { + "epoch": 0.89, + "learning_rate": 6.183589258923928e-07, + "loss": 0.4577, + 
"step": 7795 + }, + { + "epoch": 0.89, + "learning_rate": 6.170780903555529e-07, + "loss": 0.4478, + "step": 7796 + }, + { + "epoch": 0.89, + "learning_rate": 6.157985404885003e-07, + "loss": 0.4433, + "step": 7797 + }, + { + "epoch": 0.89, + "learning_rate": 6.145202764665626e-07, + "loss": 0.4634, + "step": 7798 + }, + { + "epoch": 0.89, + "learning_rate": 6.132432984648895e-07, + "loss": 0.4328, + "step": 7799 + }, + { + "epoch": 0.89, + "learning_rate": 6.119676066584523e-07, + "loss": 0.4386, + "step": 7800 + }, + { + "epoch": 0.89, + "learning_rate": 6.106932012220534e-07, + "loss": 0.4653, + "step": 7801 + }, + { + "epoch": 0.89, + "learning_rate": 6.09420082330312e-07, + "loss": 0.4458, + "step": 7802 + }, + { + "epoch": 0.89, + "learning_rate": 6.081482501576763e-07, + "loss": 0.4492, + "step": 7803 + }, + { + "epoch": 0.89, + "learning_rate": 6.068777048784136e-07, + "loss": 0.4465, + "step": 7804 + }, + { + "epoch": 0.89, + "learning_rate": 6.056084466666167e-07, + "loss": 0.4539, + "step": 7805 + }, + { + "epoch": 0.89, + "learning_rate": 6.043404756962046e-07, + "loss": 0.4583, + "step": 7806 + }, + { + "epoch": 0.89, + "learning_rate": 6.030737921409169e-07, + "loss": 0.4634, + "step": 7807 + }, + { + "epoch": 0.89, + "learning_rate": 6.01808396174316e-07, + "loss": 0.4564, + "step": 7808 + }, + { + "epoch": 0.89, + "learning_rate": 6.005442879697909e-07, + "loss": 0.4556, + "step": 7809 + }, + { + "epoch": 0.89, + "learning_rate": 5.992814677005521e-07, + "loss": 0.4375, + "step": 7810 + }, + { + "epoch": 0.89, + "learning_rate": 5.980199355396343e-07, + "loss": 0.4505, + "step": 7811 + }, + { + "epoch": 0.89, + "learning_rate": 5.967596916598961e-07, + "loss": 0.4443, + "step": 7812 + }, + { + "epoch": 0.89, + "learning_rate": 5.955007362340171e-07, + "loss": 0.4439, + "step": 7813 + }, + { + "epoch": 0.89, + "learning_rate": 5.942430694345058e-07, + "loss": 0.4427, + "step": 7814 + }, + { + "epoch": 0.89, + "learning_rate": 5.929866914336857e-07, + 
"loss": 0.4509, + "step": 7815 + }, + { + "epoch": 0.89, + "learning_rate": 5.917316024037123e-07, + "loss": 0.4467, + "step": 7816 + }, + { + "epoch": 0.89, + "learning_rate": 5.904778025165614e-07, + "loss": 0.4594, + "step": 7817 + }, + { + "epoch": 0.89, + "learning_rate": 5.892252919440289e-07, + "loss": 0.4512, + "step": 7818 + }, + { + "epoch": 0.89, + "learning_rate": 5.879740708577386e-07, + "loss": 0.4463, + "step": 7819 + }, + { + "epoch": 0.89, + "learning_rate": 5.867241394291356e-07, + "loss": 0.4634, + "step": 7820 + }, + { + "epoch": 0.89, + "learning_rate": 5.854754978294863e-07, + "loss": 0.4553, + "step": 7821 + }, + { + "epoch": 0.89, + "learning_rate": 5.84228146229886e-07, + "loss": 0.444, + "step": 7822 + }, + { + "epoch": 0.89, + "learning_rate": 5.829820848012457e-07, + "loss": 0.4574, + "step": 7823 + }, + { + "epoch": 0.89, + "learning_rate": 5.817373137143079e-07, + "loss": 0.4532, + "step": 7824 + }, + { + "epoch": 0.89, + "learning_rate": 5.804938331396292e-07, + "loss": 0.4523, + "step": 7825 + }, + { + "epoch": 0.89, + "learning_rate": 5.79251643247598e-07, + "loss": 0.4373, + "step": 7826 + }, + { + "epoch": 0.89, + "learning_rate": 5.780107442084215e-07, + "loss": 0.4631, + "step": 7827 + }, + { + "epoch": 0.89, + "learning_rate": 5.767711361921291e-07, + "loss": 0.4318, + "step": 7828 + }, + { + "epoch": 0.89, + "learning_rate": 5.755328193685772e-07, + "loss": 0.4593, + "step": 7829 + }, + { + "epoch": 0.89, + "learning_rate": 5.742957939074412e-07, + "loss": 0.4494, + "step": 7830 + }, + { + "epoch": 0.89, + "learning_rate": 5.730600599782188e-07, + "loss": 0.4487, + "step": 7831 + }, + { + "epoch": 0.9, + "learning_rate": 5.718256177502379e-07, + "loss": 0.4477, + "step": 7832 + }, + { + "epoch": 0.9, + "learning_rate": 5.70592467392641e-07, + "loss": 0.4551, + "step": 7833 + }, + { + "epoch": 0.9, + "learning_rate": 5.693606090744008e-07, + "loss": 0.4259, + "step": 7834 + }, + { + "epoch": 0.9, + "learning_rate": 
5.681300429643044e-07, + "loss": 0.4667, + "step": 7835 + }, + { + "epoch": 0.9, + "learning_rate": 5.669007692309703e-07, + "loss": 0.4529, + "step": 7836 + }, + { + "epoch": 0.9, + "learning_rate": 5.65672788042837e-07, + "loss": 0.4611, + "step": 7837 + }, + { + "epoch": 0.9, + "learning_rate": 5.644460995681644e-07, + "loss": 0.4351, + "step": 7838 + }, + { + "epoch": 0.9, + "learning_rate": 5.632207039750348e-07, + "loss": 0.4465, + "step": 7839 + }, + { + "epoch": 0.9, + "learning_rate": 5.61996601431356e-07, + "loss": 0.463, + "step": 7840 + }, + { + "epoch": 0.9, + "learning_rate": 5.607737921048573e-07, + "loss": 0.4737, + "step": 7841 + }, + { + "epoch": 0.9, + "learning_rate": 5.595522761630911e-07, + "loss": 0.4388, + "step": 7842 + }, + { + "epoch": 0.9, + "learning_rate": 5.583320537734315e-07, + "loss": 0.455, + "step": 7843 + }, + { + "epoch": 0.9, + "learning_rate": 5.57113125103077e-07, + "loss": 0.4427, + "step": 7844 + }, + { + "epoch": 0.9, + "learning_rate": 5.558954903190483e-07, + "loss": 0.4489, + "step": 7845 + }, + { + "epoch": 0.9, + "learning_rate": 5.546791495881887e-07, + "loss": 0.4557, + "step": 7846 + }, + { + "epoch": 0.9, + "learning_rate": 5.534641030771615e-07, + "loss": 0.4598, + "step": 7847 + }, + { + "epoch": 0.9, + "learning_rate": 5.522503509524591e-07, + "loss": 0.4401, + "step": 7848 + }, + { + "epoch": 0.9, + "learning_rate": 5.510378933803895e-07, + "loss": 0.4515, + "step": 7849 + }, + { + "epoch": 0.9, + "learning_rate": 5.498267305270888e-07, + "loss": 0.4423, + "step": 7850 + }, + { + "epoch": 0.9, + "learning_rate": 5.48616862558512e-07, + "loss": 0.4442, + "step": 7851 + }, + { + "epoch": 0.9, + "learning_rate": 5.474082896404365e-07, + "loss": 0.4525, + "step": 7852 + }, + { + "epoch": 0.9, + "learning_rate": 5.462010119384665e-07, + "loss": 0.4448, + "step": 7853 + }, + { + "epoch": 0.9, + "learning_rate": 5.44995029618024e-07, + "loss": 0.4257, + "step": 7854 + }, + { + "epoch": 0.9, + "learning_rate": 
5.43790342844358e-07, + "loss": 0.4738, + "step": 7855 + }, + { + "epoch": 0.9, + "learning_rate": 5.425869517825366e-07, + "loss": 0.4501, + "step": 7856 + }, + { + "epoch": 0.9, + "learning_rate": 5.413848565974489e-07, + "loss": 0.4749, + "step": 7857 + }, + { + "epoch": 0.9, + "learning_rate": 5.401840574538108e-07, + "loss": 0.4564, + "step": 7858 + }, + { + "epoch": 0.9, + "learning_rate": 5.389845545161598e-07, + "loss": 0.4435, + "step": 7859 + }, + { + "epoch": 0.9, + "learning_rate": 5.37786347948851e-07, + "loss": 0.4443, + "step": 7860 + }, + { + "epoch": 0.9, + "learning_rate": 5.365894379160686e-07, + "loss": 0.4699, + "step": 7861 + }, + { + "epoch": 0.9, + "learning_rate": 5.353938245818147e-07, + "loss": 0.4474, + "step": 7862 + }, + { + "epoch": 0.9, + "learning_rate": 5.341995081099139e-07, + "loss": 0.4651, + "step": 7863 + }, + { + "epoch": 0.9, + "learning_rate": 5.330064886640173e-07, + "loss": 0.4474, + "step": 7864 + }, + { + "epoch": 0.9, + "learning_rate": 5.318147664075923e-07, + "loss": 0.4703, + "step": 7865 + }, + { + "epoch": 0.9, + "learning_rate": 5.306243415039336e-07, + "loss": 0.452, + "step": 7866 + }, + { + "epoch": 0.9, + "learning_rate": 5.294352141161541e-07, + "loss": 0.4484, + "step": 7867 + }, + { + "epoch": 0.9, + "learning_rate": 5.282473844071933e-07, + "loss": 0.4377, + "step": 7868 + }, + { + "epoch": 0.9, + "learning_rate": 5.27060852539808e-07, + "loss": 0.4763, + "step": 7869 + }, + { + "epoch": 0.9, + "learning_rate": 5.258756186765801e-07, + "loss": 0.4567, + "step": 7870 + }, + { + "epoch": 0.9, + "learning_rate": 5.246916829799132e-07, + "loss": 0.4356, + "step": 7871 + }, + { + "epoch": 0.9, + "learning_rate": 5.235090456120329e-07, + "loss": 0.4509, + "step": 7872 + }, + { + "epoch": 0.9, + "learning_rate": 5.223277067349864e-07, + "loss": 0.4649, + "step": 7873 + }, + { + "epoch": 0.9, + "learning_rate": 5.211476665106463e-07, + "loss": 0.4489, + "step": 7874 + }, + { + "epoch": 0.9, + "learning_rate": 
5.199689251007001e-07, + "loss": 0.4553, + "step": 7875 + }, + { + "epoch": 0.9, + "learning_rate": 5.187914826666662e-07, + "loss": 0.4444, + "step": 7876 + }, + { + "epoch": 0.9, + "learning_rate": 5.17615339369878e-07, + "loss": 0.4496, + "step": 7877 + }, + { + "epoch": 0.9, + "learning_rate": 5.164404953714919e-07, + "loss": 0.4492, + "step": 7878 + }, + { + "epoch": 0.9, + "learning_rate": 5.152669508324904e-07, + "loss": 0.4534, + "step": 7879 + }, + { + "epoch": 0.9, + "learning_rate": 5.140947059136736e-07, + "loss": 0.4503, + "step": 7880 + }, + { + "epoch": 0.9, + "learning_rate": 5.129237607756677e-07, + "loss": 0.4529, + "step": 7881 + }, + { + "epoch": 0.9, + "learning_rate": 5.117541155789141e-07, + "loss": 0.4364, + "step": 7882 + }, + { + "epoch": 0.9, + "learning_rate": 5.105857704836836e-07, + "loss": 0.4399, + "step": 7883 + }, + { + "epoch": 0.9, + "learning_rate": 5.094187256500671e-07, + "loss": 0.4533, + "step": 7884 + }, + { + "epoch": 0.9, + "learning_rate": 5.08252981237971e-07, + "loss": 0.4392, + "step": 7885 + }, + { + "epoch": 0.9, + "learning_rate": 5.070885374071321e-07, + "loss": 0.4464, + "step": 7886 + }, + { + "epoch": 0.9, + "learning_rate": 5.05925394317105e-07, + "loss": 0.4763, + "step": 7887 + }, + { + "epoch": 0.9, + "learning_rate": 5.047635521272631e-07, + "loss": 0.4511, + "step": 7888 + }, + { + "epoch": 0.9, + "learning_rate": 5.036030109968082e-07, + "loss": 0.456, + "step": 7889 + }, + { + "epoch": 0.9, + "learning_rate": 5.024437710847574e-07, + "loss": 0.4665, + "step": 7890 + }, + { + "epoch": 0.9, + "learning_rate": 5.012858325499559e-07, + "loss": 0.4364, + "step": 7891 + }, + { + "epoch": 0.9, + "learning_rate": 5.001291955510634e-07, + "loss": 0.4645, + "step": 7892 + }, + { + "epoch": 0.9, + "learning_rate": 4.989738602465666e-07, + "loss": 0.4316, + "step": 7893 + }, + { + "epoch": 0.9, + "learning_rate": 4.978198267947742e-07, + "loss": 0.4545, + "step": 7894 + }, + { + "epoch": 0.9, + "learning_rate": 
4.966670953538133e-07, + "loss": 0.4539, + "step": 7895 + }, + { + "epoch": 0.9, + "learning_rate": 4.955156660816307e-07, + "loss": 0.454, + "step": 7896 + }, + { + "epoch": 0.9, + "learning_rate": 4.943655391360025e-07, + "loss": 0.4422, + "step": 7897 + }, + { + "epoch": 0.9, + "learning_rate": 4.932167146745193e-07, + "loss": 0.4376, + "step": 7898 + }, + { + "epoch": 0.9, + "learning_rate": 4.920691928545973e-07, + "loss": 0.456, + "step": 7899 + }, + { + "epoch": 0.9, + "learning_rate": 4.909229738334698e-07, + "loss": 0.4591, + "step": 7900 + }, + { + "epoch": 0.9, + "learning_rate": 4.897780577681954e-07, + "loss": 0.4428, + "step": 7901 + }, + { + "epoch": 0.9, + "learning_rate": 4.886344448156566e-07, + "loss": 0.4412, + "step": 7902 + }, + { + "epoch": 0.9, + "learning_rate": 4.874921351325512e-07, + "loss": 0.4327, + "step": 7903 + }, + { + "epoch": 0.9, + "learning_rate": 4.863511288753986e-07, + "loss": 0.4401, + "step": 7904 + }, + { + "epoch": 0.9, + "learning_rate": 4.85211426200547e-07, + "loss": 0.4561, + "step": 7905 + }, + { + "epoch": 0.9, + "learning_rate": 4.840730272641569e-07, + "loss": 0.4616, + "step": 7906 + }, + { + "epoch": 0.9, + "learning_rate": 4.829359322222182e-07, + "loss": 0.4657, + "step": 7907 + }, + { + "epoch": 0.9, + "learning_rate": 4.818001412305362e-07, + "loss": 0.4471, + "step": 7908 + }, + { + "epoch": 0.9, + "learning_rate": 4.806656544447374e-07, + "loss": 0.4503, + "step": 7909 + }, + { + "epoch": 0.9, + "learning_rate": 4.795324720202754e-07, + "loss": 0.4521, + "step": 7910 + }, + { + "epoch": 0.9, + "learning_rate": 4.784005941124203e-07, + "loss": 0.4485, + "step": 7911 + }, + { + "epoch": 0.9, + "learning_rate": 4.772700208762659e-07, + "loss": 0.4421, + "step": 7912 + }, + { + "epoch": 0.9, + "learning_rate": 4.761407524667239e-07, + "loss": 0.4397, + "step": 7913 + }, + { + "epoch": 0.9, + "learning_rate": 4.750127890385292e-07, + "loss": 0.4493, + "step": 7914 + }, + { + "epoch": 0.9, + "learning_rate": 
4.738861307462406e-07, + "loss": 0.461, + "step": 7915 + }, + { + "epoch": 0.9, + "learning_rate": 4.7276077774423334e-07, + "loss": 0.4571, + "step": 7916 + }, + { + "epoch": 0.9, + "learning_rate": 4.716367301867053e-07, + "loss": 0.4409, + "step": 7917 + }, + { + "epoch": 0.9, + "learning_rate": 4.705139882276788e-07, + "loss": 0.4698, + "step": 7918 + }, + { + "epoch": 0.91, + "learning_rate": 4.693925520209908e-07, + "loss": 0.4495, + "step": 7919 + }, + { + "epoch": 0.91, + "learning_rate": 4.6827242172030495e-07, + "loss": 0.4517, + "step": 7920 + }, + { + "epoch": 0.91, + "learning_rate": 4.6715359747910526e-07, + "loss": 0.4549, + "step": 7921 + }, + { + "epoch": 0.91, + "learning_rate": 4.660360794506946e-07, + "loss": 0.4327, + "step": 7922 + }, + { + "epoch": 0.91, + "learning_rate": 4.649198677881983e-07, + "loss": 0.4543, + "step": 7923 + }, + { + "epoch": 0.91, + "learning_rate": 4.6380496264456064e-07, + "loss": 0.4555, + "step": 7924 + }, + { + "epoch": 0.91, + "learning_rate": 4.6269136417255167e-07, + "loss": 0.433, + "step": 7925 + }, + { + "epoch": 0.91, + "learning_rate": 4.615790725247571e-07, + "loss": 0.4415, + "step": 7926 + }, + { + "epoch": 0.91, + "learning_rate": 4.60468087853585e-07, + "loss": 0.44, + "step": 7927 + }, + { + "epoch": 0.91, + "learning_rate": 4.5935841031126693e-07, + "loss": 0.4495, + "step": 7928 + }, + { + "epoch": 0.91, + "learning_rate": 4.582500400498513e-07, + "loss": 0.4418, + "step": 7929 + }, + { + "epoch": 0.91, + "learning_rate": 4.5714297722121105e-07, + "loss": 0.458, + "step": 7930 + }, + { + "epoch": 0.91, + "learning_rate": 4.5603722197703925e-07, + "loss": 0.4415, + "step": 7931 + }, + { + "epoch": 0.91, + "learning_rate": 4.54932774468847e-07, + "loss": 0.4602, + "step": 7932 + }, + { + "epoch": 0.91, + "learning_rate": 4.5382963484797096e-07, + "loss": 0.4578, + "step": 7933 + }, + { + "epoch": 0.91, + "learning_rate": 4.5272780326556466e-07, + "loss": 0.469, + "step": 7934 + }, + { + "epoch": 0.91, 
+ "learning_rate": 4.516272798726018e-07, + "loss": 0.4415, + "step": 7935 + }, + { + "epoch": 0.91, + "learning_rate": 4.5052806481988175e-07, + "loss": 0.4508, + "step": 7936 + }, + { + "epoch": 0.91, + "learning_rate": 4.494301582580185e-07, + "loss": 0.4495, + "step": 7937 + }, + { + "epoch": 0.91, + "learning_rate": 4.4833356033745167e-07, + "loss": 0.4321, + "step": 7938 + }, + { + "epoch": 0.91, + "learning_rate": 4.472382712084389e-07, + "loss": 0.4402, + "step": 7939 + }, + { + "epoch": 0.91, + "learning_rate": 4.4614429102105893e-07, + "loss": 0.441, + "step": 7940 + }, + { + "epoch": 0.91, + "learning_rate": 4.4505161992521417e-07, + "loss": 0.4518, + "step": 7941 + }, + { + "epoch": 0.91, + "learning_rate": 4.439602580706226e-07, + "loss": 0.4709, + "step": 7942 + }, + { + "epoch": 0.91, + "learning_rate": 4.4287020560682345e-07, + "loss": 0.4147, + "step": 7943 + }, + { + "epoch": 0.91, + "learning_rate": 4.4178146268318177e-07, + "loss": 0.4517, + "step": 7944 + }, + { + "epoch": 0.91, + "learning_rate": 4.406940294488771e-07, + "loss": 0.4611, + "step": 7945 + }, + { + "epoch": 0.91, + "learning_rate": 4.396079060529146e-07, + "loss": 0.4488, + "step": 7946 + }, + { + "epoch": 0.91, + "learning_rate": 4.3852309264411417e-07, + "loss": 0.434, + "step": 7947 + }, + { + "epoch": 0.91, + "learning_rate": 4.3743958937112253e-07, + "loss": 0.4538, + "step": 7948 + }, + { + "epoch": 0.91, + "learning_rate": 4.363573963824008e-07, + "loss": 0.4536, + "step": 7949 + }, + { + "epoch": 0.91, + "learning_rate": 4.3527651382623603e-07, + "loss": 0.4758, + "step": 7950 + }, + { + "epoch": 0.91, + "learning_rate": 4.3419694185073303e-07, + "loss": 0.449, + "step": 7951 + }, + { + "epoch": 0.91, + "learning_rate": 4.331186806038179e-07, + "loss": 0.4611, + "step": 7952 + }, + { + "epoch": 0.91, + "learning_rate": 4.320417302332325e-07, + "loss": 0.4398, + "step": 7953 + }, + { + "epoch": 0.91, + "learning_rate": 4.3096609088654873e-07, + "loss": 0.4411, + "step": 
7954 + }, + { + "epoch": 0.91, + "learning_rate": 4.298917627111476e-07, + "loss": 0.4262, + "step": 7955 + }, + { + "epoch": 0.91, + "learning_rate": 4.2881874585424146e-07, + "loss": 0.4694, + "step": 7956 + }, + { + "epoch": 0.91, + "learning_rate": 4.2774704046285254e-07, + "loss": 0.4418, + "step": 7957 + }, + { + "epoch": 0.91, + "learning_rate": 4.266766466838335e-07, + "loss": 0.4588, + "step": 7958 + }, + { + "epoch": 0.91, + "learning_rate": 4.256075646638469e-07, + "loss": 0.442, + "step": 7959 + }, + { + "epoch": 0.91, + "learning_rate": 4.2453979454938563e-07, + "loss": 0.4577, + "step": 7960 + }, + { + "epoch": 0.91, + "learning_rate": 4.2347333648675383e-07, + "loss": 0.4395, + "step": 7961 + }, + { + "epoch": 0.91, + "learning_rate": 4.2240819062208337e-07, + "loss": 0.4491, + "step": 7962 + }, + { + "epoch": 0.91, + "learning_rate": 4.2134435710132093e-07, + "loss": 0.453, + "step": 7963 + }, + { + "epoch": 0.91, + "learning_rate": 4.2028183607023766e-07, + "loss": 0.4556, + "step": 7964 + }, + { + "epoch": 0.91, + "learning_rate": 4.192206276744204e-07, + "loss": 0.4387, + "step": 7965 + }, + { + "epoch": 0.91, + "learning_rate": 4.181607320592784e-07, + "loss": 0.4476, + "step": 7966 + }, + { + "epoch": 0.91, + "learning_rate": 4.1710214937004223e-07, + "loss": 0.4507, + "step": 7967 + }, + { + "epoch": 0.91, + "learning_rate": 4.1604487975176136e-07, + "loss": 0.4773, + "step": 7968 + }, + { + "epoch": 0.91, + "learning_rate": 4.149889233493054e-07, + "loss": 0.4412, + "step": 7969 + }, + { + "epoch": 0.91, + "learning_rate": 4.139342803073632e-07, + "loss": 0.4464, + "step": 7970 + }, + { + "epoch": 0.91, + "learning_rate": 4.128809507704445e-07, + "loss": 0.4332, + "step": 7971 + }, + { + "epoch": 0.91, + "learning_rate": 4.1182893488287965e-07, + "loss": 0.451, + "step": 7972 + }, + { + "epoch": 0.91, + "learning_rate": 4.1077823278881767e-07, + "loss": 0.4384, + "step": 7973 + }, + { + "epoch": 0.91, + "learning_rate": 4.097288446322278e-07, 
+ "loss": 0.4486, + "step": 7974 + }, + { + "epoch": 0.91, + "learning_rate": 4.086807705569018e-07, + "loss": 0.4793, + "step": 7975 + }, + { + "epoch": 0.91, + "learning_rate": 4.076340107064458e-07, + "loss": 0.4487, + "step": 7976 + }, + { + "epoch": 0.91, + "learning_rate": 4.065885652242907e-07, + "loss": 0.4378, + "step": 7977 + }, + { + "epoch": 0.91, + "learning_rate": 4.055444342536885e-07, + "loss": 0.4562, + "step": 7978 + }, + { + "epoch": 0.91, + "learning_rate": 4.045016179377048e-07, + "loss": 0.4411, + "step": 7979 + }, + { + "epoch": 0.91, + "learning_rate": 4.034601164192309e-07, + "loss": 0.4305, + "step": 7980 + }, + { + "epoch": 0.91, + "learning_rate": 4.024199298409737e-07, + "loss": 0.4558, + "step": 7981 + }, + { + "epoch": 0.91, + "learning_rate": 4.013810583454647e-07, + "loss": 0.4528, + "step": 7982 + }, + { + "epoch": 0.91, + "learning_rate": 4.0034350207505124e-07, + "loss": 0.4647, + "step": 7983 + }, + { + "epoch": 0.91, + "learning_rate": 3.9930726117190064e-07, + "loss": 0.4603, + "step": 7984 + }, + { + "epoch": 0.91, + "learning_rate": 3.982723357780027e-07, + "loss": 0.4369, + "step": 7985 + }, + { + "epoch": 0.91, + "learning_rate": 3.97238726035164e-07, + "loss": 0.4506, + "step": 7986 + }, + { + "epoch": 0.91, + "learning_rate": 3.962064320850112e-07, + "loss": 0.4454, + "step": 7987 + }, + { + "epoch": 0.91, + "learning_rate": 3.951754540689956e-07, + "loss": 0.4581, + "step": 7988 + }, + { + "epoch": 0.91, + "learning_rate": 3.9414579212838087e-07, + "loss": 0.4423, + "step": 7989 + }, + { + "epoch": 0.91, + "learning_rate": 3.931174464042542e-07, + "loss": 0.4595, + "step": 7990 + }, + { + "epoch": 0.91, + "learning_rate": 3.920904170375239e-07, + "loss": 0.4608, + "step": 7991 + }, + { + "epoch": 0.91, + "learning_rate": 3.9106470416891195e-07, + "loss": 0.4527, + "step": 7992 + }, + { + "epoch": 0.91, + "learning_rate": 3.9004030793896807e-07, + "loss": 0.4451, + "step": 7993 + }, + { + "epoch": 0.91, + 
"learning_rate": 3.8901722848805443e-07, + "loss": 0.4519, + "step": 7994 + }, + { + "epoch": 0.91, + "learning_rate": 3.8799546595635784e-07, + "loss": 0.4403, + "step": 7995 + }, + { + "epoch": 0.91, + "learning_rate": 3.8697502048387956e-07, + "loss": 0.4511, + "step": 7996 + }, + { + "epoch": 0.91, + "learning_rate": 3.8595589221044674e-07, + "loss": 0.4411, + "step": 7997 + }, + { + "epoch": 0.91, + "learning_rate": 3.84938081275702e-07, + "loss": 0.4515, + "step": 7998 + }, + { + "epoch": 0.91, + "learning_rate": 3.839215878191083e-07, + "loss": 0.4685, + "step": 7999 + }, + { + "epoch": 0.91, + "learning_rate": 3.8290641197994526e-07, + "loss": 0.4395, + "step": 8000 + }, + { + "epoch": 0.91, + "learning_rate": 3.8189255389731837e-07, + "loss": 0.4388, + "step": 8001 + }, + { + "epoch": 0.91, + "learning_rate": 3.808800137101465e-07, + "loss": 0.4422, + "step": 8002 + }, + { + "epoch": 0.91, + "learning_rate": 3.7986879155717084e-07, + "loss": 0.4631, + "step": 8003 + }, + { + "epoch": 0.91, + "learning_rate": 3.7885888757695054e-07, + "loss": 0.4384, + "step": 8004 + }, + { + "epoch": 0.91, + "learning_rate": 3.778503019078672e-07, + "loss": 0.4312, + "step": 8005 + }, + { + "epoch": 0.91, + "learning_rate": 3.768430346881169e-07, + "loss": 0.4415, + "step": 8006 + }, + { + "epoch": 0.92, + "learning_rate": 3.7583708605571923e-07, + "loss": 0.4421, + "step": 8007 + }, + { + "epoch": 0.92, + "learning_rate": 3.748324561485128e-07, + "loss": 0.4674, + "step": 8008 + }, + { + "epoch": 0.92, + "learning_rate": 3.7382914510415316e-07, + "loss": 0.4332, + "step": 8009 + }, + { + "epoch": 0.92, + "learning_rate": 3.7282715306011465e-07, + "loss": 0.4402, + "step": 8010 + }, + { + "epoch": 0.92, + "learning_rate": 3.7182648015369524e-07, + "loss": 0.4535, + "step": 8011 + }, + { + "epoch": 0.92, + "learning_rate": 3.708271265220087e-07, + "loss": 0.4524, + "step": 8012 + }, + { + "epoch": 0.92, + "learning_rate": 3.698290923019865e-07, + "loss": 0.4364, + "step": 
8013 + }, + { + "epoch": 0.92, + "learning_rate": 3.688323776303837e-07, + "loss": 0.4509, + "step": 8014 + }, + { + "epoch": 0.92, + "learning_rate": 3.678369826437733e-07, + "loss": 0.4523, + "step": 8015 + }, + { + "epoch": 0.92, + "learning_rate": 3.668429074785451e-07, + "loss": 0.4636, + "step": 8016 + }, + { + "epoch": 0.92, + "learning_rate": 3.6585015227091013e-07, + "loss": 0.4432, + "step": 8017 + }, + { + "epoch": 0.92, + "learning_rate": 3.6485871715689735e-07, + "loss": 0.4289, + "step": 8018 + }, + { + "epoch": 0.92, + "learning_rate": 3.63868602272357e-07, + "loss": 0.4573, + "step": 8019 + }, + { + "epoch": 0.92, + "learning_rate": 3.6287980775295603e-07, + "loss": 0.4517, + "step": 8020 + }, + { + "epoch": 0.92, + "learning_rate": 3.6189233373418064e-07, + "loss": 0.4437, + "step": 8021 + }, + { + "epoch": 0.92, + "learning_rate": 3.609061803513392e-07, + "loss": 0.4425, + "step": 8022 + }, + { + "epoch": 0.92, + "learning_rate": 3.5992134773955354e-07, + "loss": 0.4543, + "step": 8023 + }, + { + "epoch": 0.92, + "learning_rate": 3.589378360337692e-07, + "loss": 0.4701, + "step": 8024 + }, + { + "epoch": 0.92, + "learning_rate": 3.579556453687494e-07, + "loss": 0.4521, + "step": 8025 + }, + { + "epoch": 0.92, + "learning_rate": 3.569747758790765e-07, + "loss": 0.4548, + "step": 8026 + }, + { + "epoch": 0.92, + "learning_rate": 3.5599522769915074e-07, + "loss": 0.4471, + "step": 8027 + }, + { + "epoch": 0.92, + "learning_rate": 3.550170009631926e-07, + "loss": 0.443, + "step": 8028 + }, + { + "epoch": 0.92, + "learning_rate": 3.5404009580524144e-07, + "loss": 0.4381, + "step": 8029 + }, + { + "epoch": 0.92, + "learning_rate": 3.5306451235915475e-07, + "loss": 0.4476, + "step": 8030 + }, + { + "epoch": 0.92, + "learning_rate": 3.520902507586077e-07, + "loss": 0.4408, + "step": 8031 + }, + { + "epoch": 0.92, + "learning_rate": 3.51117311137098e-07, + "loss": 0.4631, + "step": 8032 + }, + { + "epoch": 0.92, + "learning_rate": 3.50145693627939e-07, + 
"loss": 0.4327, + "step": 8033 + }, + { + "epoch": 0.92, + "learning_rate": 3.4917539836426317e-07, + "loss": 0.4601, + "step": 8034 + }, + { + "epoch": 0.92, + "learning_rate": 3.4820642547902516e-07, + "loss": 0.4565, + "step": 8035 + }, + { + "epoch": 0.92, + "learning_rate": 3.472387751049944e-07, + "loss": 0.4424, + "step": 8036 + }, + { + "epoch": 0.92, + "learning_rate": 3.462724473747603e-07, + "loss": 0.459, + "step": 8037 + }, + { + "epoch": 0.92, + "learning_rate": 3.4530744242073143e-07, + "loss": 0.4567, + "step": 8038 + }, + { + "epoch": 0.92, + "learning_rate": 3.443437603751354e-07, + "loss": 0.4382, + "step": 8039 + }, + { + "epoch": 0.92, + "learning_rate": 3.433814013700187e-07, + "loss": 0.4704, + "step": 8040 + }, + { + "epoch": 0.92, + "learning_rate": 3.424203655372438e-07, + "loss": 0.4377, + "step": 8041 + }, + { + "epoch": 0.92, + "learning_rate": 3.414606530084974e-07, + "loss": 0.4581, + "step": 8042 + }, + { + "epoch": 0.92, + "learning_rate": 3.405022639152777e-07, + "loss": 0.4452, + "step": 8043 + }, + { + "epoch": 0.92, + "learning_rate": 3.3954519838890866e-07, + "loss": 0.4596, + "step": 8044 + }, + { + "epoch": 0.92, + "learning_rate": 3.3858945656052855e-07, + "loss": 0.4302, + "step": 8045 + }, + { + "epoch": 0.92, + "learning_rate": 3.376350385610938e-07, + "loss": 0.4521, + "step": 8046 + }, + { + "epoch": 0.92, + "learning_rate": 3.3668194452138423e-07, + "loss": 0.4579, + "step": 8047 + }, + { + "epoch": 0.92, + "learning_rate": 3.357301745719932e-07, + "loss": 0.4506, + "step": 8048 + }, + { + "epoch": 0.92, + "learning_rate": 3.34779728843333e-07, + "loss": 0.4377, + "step": 8049 + }, + { + "epoch": 0.92, + "learning_rate": 3.3383060746563836e-07, + "loss": 0.4593, + "step": 8050 + }, + { + "epoch": 0.92, + "learning_rate": 3.3288281056895746e-07, + "loss": 0.4456, + "step": 8051 + }, + { + "epoch": 0.92, + "learning_rate": 3.3193633828316306e-07, + "loss": 0.4628, + "step": 8052 + }, + { + "epoch": 0.92, + 
"learning_rate": 3.309911907379393e-07, + "loss": 0.4529, + "step": 8053 + }, + { + "epoch": 0.92, + "learning_rate": 3.300473680627947e-07, + "loss": 0.4455, + "step": 8054 + }, + { + "epoch": 0.92, + "learning_rate": 3.2910487038705476e-07, + "loss": 0.4394, + "step": 8055 + }, + { + "epoch": 0.92, + "learning_rate": 3.2816369783986166e-07, + "loss": 0.463, + "step": 8056 + }, + { + "epoch": 0.92, + "learning_rate": 3.2722385055017567e-07, + "loss": 0.4431, + "step": 8057 + }, + { + "epoch": 0.92, + "learning_rate": 3.262853286467804e-07, + "loss": 0.4564, + "step": 8058 + }, + { + "epoch": 0.92, + "learning_rate": 3.2534813225826965e-07, + "loss": 0.4397, + "step": 8059 + }, + { + "epoch": 0.92, + "learning_rate": 3.2441226151306403e-07, + "loss": 0.4575, + "step": 8060 + }, + { + "epoch": 0.92, + "learning_rate": 3.234777165393965e-07, + "loss": 0.4362, + "step": 8061 + }, + { + "epoch": 0.92, + "learning_rate": 3.2254449746532246e-07, + "loss": 0.4318, + "step": 8062 + }, + { + "epoch": 0.92, + "learning_rate": 3.216126044187118e-07, + "loss": 0.4545, + "step": 8063 + }, + { + "epoch": 0.92, + "learning_rate": 3.206820375272557e-07, + "loss": 0.462, + "step": 8064 + }, + { + "epoch": 0.92, + "learning_rate": 3.1975279691846437e-07, + "loss": 0.4365, + "step": 8065 + }, + { + "epoch": 0.92, + "learning_rate": 3.188248827196616e-07, + "loss": 0.4711, + "step": 8066 + }, + { + "epoch": 0.92, + "learning_rate": 3.178982950579923e-07, + "loss": 0.4475, + "step": 8067 + }, + { + "epoch": 0.92, + "learning_rate": 3.169730340604227e-07, + "loss": 0.4419, + "step": 8068 + }, + { + "epoch": 0.92, + "learning_rate": 3.160490998537313e-07, + "loss": 0.4292, + "step": 8069 + }, + { + "epoch": 0.92, + "learning_rate": 3.151264925645192e-07, + "loss": 0.4678, + "step": 8070 + }, + { + "epoch": 0.92, + "learning_rate": 3.142052123192019e-07, + "loss": 0.4485, + "step": 8071 + }, + { + "epoch": 0.92, + "learning_rate": 3.132852592440194e-07, + "loss": 0.4481, + "step": 8072 + 
}, + { + "epoch": 0.92, + "learning_rate": 3.1236663346502215e-07, + "loss": 0.4427, + "step": 8073 + }, + { + "epoch": 0.92, + "learning_rate": 3.11449335108085e-07, + "loss": 0.4701, + "step": 8074 + }, + { + "epoch": 0.92, + "learning_rate": 3.1053336429889616e-07, + "loss": 0.4609, + "step": 8075 + }, + { + "epoch": 0.92, + "learning_rate": 3.0961872116296645e-07, + "loss": 0.4481, + "step": 8076 + }, + { + "epoch": 0.92, + "learning_rate": 3.0870540582562003e-07, + "loss": 0.4279, + "step": 8077 + }, + { + "epoch": 0.92, + "learning_rate": 3.077934184120035e-07, + "loss": 0.4586, + "step": 8078 + }, + { + "epoch": 0.92, + "learning_rate": 3.06882759047078e-07, + "loss": 0.4453, + "step": 8079 + }, + { + "epoch": 0.92, + "learning_rate": 3.059734278556237e-07, + "loss": 0.4442, + "step": 8080 + }, + { + "epoch": 0.92, + "learning_rate": 3.050654249622398e-07, + "loss": 0.4565, + "step": 8081 + }, + { + "epoch": 0.92, + "learning_rate": 3.0415875049134566e-07, + "loss": 0.4458, + "step": 8082 + }, + { + "epoch": 0.92, + "learning_rate": 3.03253404567172e-07, + "loss": 0.4353, + "step": 8083 + }, + { + "epoch": 0.92, + "learning_rate": 3.0234938731377394e-07, + "loss": 0.4717, + "step": 8084 + }, + { + "epoch": 0.92, + "learning_rate": 3.014466988550202e-07, + "loss": 0.4587, + "step": 8085 + }, + { + "epoch": 0.92, + "learning_rate": 3.0054533931460186e-07, + "loss": 0.4576, + "step": 8086 + }, + { + "epoch": 0.92, + "learning_rate": 2.996453088160234e-07, + "loss": 0.4396, + "step": 8087 + }, + { + "epoch": 0.92, + "learning_rate": 2.9874660748260843e-07, + "loss": 0.443, + "step": 8088 + }, + { + "epoch": 0.92, + "learning_rate": 2.978492354375007e-07, + "loss": 0.4552, + "step": 8089 + }, + { + "epoch": 0.92, + "learning_rate": 2.969531928036595e-07, + "loss": 0.4378, + "step": 8090 + }, + { + "epoch": 0.92, + "learning_rate": 2.9605847970386125e-07, + "loss": 0.4272, + "step": 8091 + }, + { + "epoch": 0.92, + "learning_rate": 2.9516509626070553e-07, + 
"loss": 0.4468, + "step": 8092 + }, + { + "epoch": 0.92, + "learning_rate": 2.9427304259660117e-07, + "loss": 0.4672, + "step": 8093 + }, + { + "epoch": 0.93, + "learning_rate": 2.9338231883378365e-07, + "loss": 0.4169, + "step": 8094 + }, + { + "epoch": 0.93, + "learning_rate": 2.924929250942998e-07, + "loss": 0.4546, + "step": 8095 + }, + { + "epoch": 0.93, + "learning_rate": 2.9160486150001556e-07, + "loss": 0.4413, + "step": 8096 + }, + { + "epoch": 0.93, + "learning_rate": 2.907181281726179e-07, + "loss": 0.4463, + "step": 8097 + }, + { + "epoch": 0.93, + "learning_rate": 2.8983272523360637e-07, + "loss": 0.4412, + "step": 8098 + }, + { + "epoch": 0.93, + "learning_rate": 2.889486528043028e-07, + "loss": 0.4416, + "step": 8099 + }, + { + "epoch": 0.93, + "learning_rate": 2.880659110058448e-07, + "loss": 0.4381, + "step": 8100 + }, + { + "epoch": 0.93, + "learning_rate": 2.8718449995918553e-07, + "loss": 0.4511, + "step": 8101 + }, + { + "epoch": 0.93, + "learning_rate": 2.863044197851017e-07, + "loss": 0.463, + "step": 8102 + }, + { + "epoch": 0.93, + "learning_rate": 2.8542567060418135e-07, + "loss": 0.4676, + "step": 8103 + }, + { + "epoch": 0.93, + "learning_rate": 2.845482525368337e-07, + "loss": 0.43, + "step": 8104 + }, + { + "epoch": 0.93, + "learning_rate": 2.836721657032848e-07, + "loss": 0.4403, + "step": 8105 + }, + { + "epoch": 0.93, + "learning_rate": 2.8279741022357535e-07, + "loss": 0.444, + "step": 8106 + }, + { + "epoch": 0.93, + "learning_rate": 2.8192398621757156e-07, + "loss": 0.4691, + "step": 8107 + }, + { + "epoch": 0.93, + "learning_rate": 2.810518938049478e-07, + "loss": 0.4507, + "step": 8108 + }, + { + "epoch": 0.93, + "learning_rate": 2.801811331052007e-07, + "loss": 0.4691, + "step": 8109 + }, + { + "epoch": 0.93, + "learning_rate": 2.7931170423764363e-07, + "loss": 0.4466, + "step": 8110 + }, + { + "epoch": 0.93, + "learning_rate": 2.784436073214103e-07, + "loss": 0.4667, + "step": 8111 + }, + { + "epoch": 0.93, + "learning_rate": 
2.775768424754488e-07, + "loss": 0.443, + "step": 8112 + }, + { + "epoch": 0.93, + "learning_rate": 2.7671140981852306e-07, + "loss": 0.4504, + "step": 8113 + }, + { + "epoch": 0.93, + "learning_rate": 2.7584730946921825e-07, + "loss": 0.442, + "step": 8114 + }, + { + "epoch": 0.93, + "learning_rate": 2.7498454154593624e-07, + "loss": 0.4698, + "step": 8115 + }, + { + "epoch": 0.93, + "learning_rate": 2.741231061668925e-07, + "loss": 0.4184, + "step": 8116 + }, + { + "epoch": 0.93, + "learning_rate": 2.73263003450126e-07, + "loss": 0.4488, + "step": 8117 + }, + { + "epoch": 0.93, + "learning_rate": 2.72404233513488e-07, + "loss": 0.4538, + "step": 8118 + }, + { + "epoch": 0.93, + "learning_rate": 2.71546796474651e-07, + "loss": 0.4525, + "step": 8119 + }, + { + "epoch": 0.93, + "learning_rate": 2.70690692451101e-07, + "loss": 0.4407, + "step": 8120 + }, + { + "epoch": 0.93, + "learning_rate": 2.698359215601443e-07, + "loss": 0.4438, + "step": 8121 + }, + { + "epoch": 0.93, + "learning_rate": 2.689824839189037e-07, + "loss": 0.4545, + "step": 8122 + }, + { + "epoch": 0.93, + "learning_rate": 2.681303796443202e-07, + "loss": 0.4361, + "step": 8123 + }, + { + "epoch": 0.93, + "learning_rate": 2.672796088531493e-07, + "loss": 0.4593, + "step": 8124 + }, + { + "epoch": 0.93, + "learning_rate": 2.664301716619666e-07, + "loss": 0.4915, + "step": 8125 + }, + { + "epoch": 0.93, + "learning_rate": 2.655820681871635e-07, + "loss": 0.4487, + "step": 8126 + }, + { + "epoch": 0.93, + "learning_rate": 2.6473529854494915e-07, + "loss": 0.4443, + "step": 8127 + }, + { + "epoch": 0.93, + "learning_rate": 2.638898628513498e-07, + "loss": 0.443, + "step": 8128 + }, + { + "epoch": 0.93, + "learning_rate": 2.6304576122221035e-07, + "loss": 0.4475, + "step": 8129 + }, + { + "epoch": 0.93, + "learning_rate": 2.6220299377318847e-07, + "loss": 0.4359, + "step": 8130 + }, + { + "epoch": 0.93, + "learning_rate": 2.613615606197661e-07, + "loss": 0.444, + "step": 8131 + }, + { + "epoch": 0.93, 
+ "learning_rate": 2.605214618772356e-07, + "loss": 0.4468, + "step": 8132 + }, + { + "epoch": 0.93, + "learning_rate": 2.596826976607114e-07, + "loss": 0.4594, + "step": 8133 + }, + { + "epoch": 0.93, + "learning_rate": 2.5884526808511946e-07, + "loss": 0.4441, + "step": 8134 + }, + { + "epoch": 0.93, + "learning_rate": 2.5800917326521013e-07, + "loss": 0.4514, + "step": 8135 + }, + { + "epoch": 0.93, + "learning_rate": 2.5717441331554517e-07, + "loss": 0.4474, + "step": 8136 + }, + { + "epoch": 0.93, + "learning_rate": 2.5634098835050415e-07, + "loss": 0.439, + "step": 8137 + }, + { + "epoch": 0.93, + "learning_rate": 2.555088984842868e-07, + "loss": 0.4606, + "step": 8138 + }, + { + "epoch": 0.93, + "learning_rate": 2.546781438309087e-07, + "loss": 0.4344, + "step": 8139 + }, + { + "epoch": 0.93, + "learning_rate": 2.5384872450419985e-07, + "loss": 0.4299, + "step": 8140 + }, + { + "epoch": 0.93, + "learning_rate": 2.530206406178104e-07, + "loss": 0.4699, + "step": 8141 + }, + { + "epoch": 0.93, + "learning_rate": 2.5219389228520517e-07, + "loss": 0.452, + "step": 8142 + }, + { + "epoch": 0.93, + "learning_rate": 2.51368479619668e-07, + "loss": 0.4446, + "step": 8143 + }, + { + "epoch": 0.93, + "learning_rate": 2.505444027342996e-07, + "loss": 0.4496, + "step": 8144 + }, + { + "epoch": 0.93, + "learning_rate": 2.497216617420151e-07, + "loss": 0.4444, + "step": 8145 + }, + { + "epoch": 0.93, + "learning_rate": 2.4890025675554983e-07, + "loss": 0.4439, + "step": 8146 + }, + { + "epoch": 0.93, + "learning_rate": 2.480801878874528e-07, + "loss": 0.4742, + "step": 8147 + }, + { + "epoch": 0.93, + "learning_rate": 2.4726145525009404e-07, + "loss": 0.4355, + "step": 8148 + }, + { + "epoch": 0.93, + "learning_rate": 2.4644405895565717e-07, + "loss": 0.4577, + "step": 8149 + }, + { + "epoch": 0.93, + "learning_rate": 2.456279991161437e-07, + "loss": 0.4551, + "step": 8150 + }, + { + "epoch": 0.93, + "learning_rate": 2.448132758433719e-07, + "loss": 0.4834, + "step": 8151 
+ }, + { + "epoch": 0.93, + "learning_rate": 2.439998892489781e-07, + "loss": 0.4243, + "step": 8152 + }, + { + "epoch": 0.93, + "learning_rate": 2.4318783944441314e-07, + "loss": 0.458, + "step": 8153 + }, + { + "epoch": 0.93, + "learning_rate": 2.4237712654094693e-07, + "loss": 0.4451, + "step": 8154 + }, + { + "epoch": 0.93, + "learning_rate": 2.4156775064966273e-07, + "loss": 0.4431, + "step": 8155 + }, + { + "epoch": 0.93, + "learning_rate": 2.4075971188146754e-07, + "loss": 0.4606, + "step": 8156 + }, + { + "epoch": 0.93, + "learning_rate": 2.3995301034707597e-07, + "loss": 0.451, + "step": 8157 + }, + { + "epoch": 0.93, + "learning_rate": 2.3914764615702747e-07, + "loss": 0.4384, + "step": 8158 + }, + { + "epoch": 0.93, + "learning_rate": 2.3834361942167484e-07, + "loss": 0.4686, + "step": 8159 + }, + { + "epoch": 0.93, + "learning_rate": 2.375409302511855e-07, + "loss": 0.4473, + "step": 8160 + }, + { + "epoch": 0.93, + "learning_rate": 2.367395787555482e-07, + "loss": 0.4443, + "step": 8161 + }, + { + "epoch": 0.93, + "learning_rate": 2.3593956504456396e-07, + "loss": 0.4512, + "step": 8162 + }, + { + "epoch": 0.93, + "learning_rate": 2.3514088922785284e-07, + "loss": 0.4504, + "step": 8163 + }, + { + "epoch": 0.93, + "learning_rate": 2.3434355141485287e-07, + "loss": 0.4591, + "step": 8164 + }, + { + "epoch": 0.93, + "learning_rate": 2.335475517148167e-07, + "loss": 0.4477, + "step": 8165 + }, + { + "epoch": 0.93, + "learning_rate": 2.3275289023681148e-07, + "loss": 0.4334, + "step": 8166 + }, + { + "epoch": 0.93, + "learning_rate": 2.3195956708972566e-07, + "loss": 0.462, + "step": 8167 + }, + { + "epoch": 0.93, + "learning_rate": 2.3116758238226233e-07, + "loss": 0.4558, + "step": 8168 + }, + { + "epoch": 0.93, + "learning_rate": 2.3037693622294244e-07, + "loss": 0.4568, + "step": 8169 + }, + { + "epoch": 0.93, + "learning_rate": 2.2958762872009932e-07, + "loss": 0.4401, + "step": 8170 + }, + { + "epoch": 0.93, + "learning_rate": 2.2879965998188646e-07, 
+ "loss": 0.4245, + "step": 8171 + }, + { + "epoch": 0.93, + "learning_rate": 2.280130301162742e-07, + "loss": 0.4457, + "step": 8172 + }, + { + "epoch": 0.93, + "learning_rate": 2.2722773923104736e-07, + "loss": 0.4725, + "step": 8173 + }, + { + "epoch": 0.93, + "learning_rate": 2.264437874338099e-07, + "loss": 0.4427, + "step": 8174 + }, + { + "epoch": 0.93, + "learning_rate": 2.2566117483197923e-07, + "loss": 0.4492, + "step": 8175 + }, + { + "epoch": 0.93, + "learning_rate": 2.248799015327907e-07, + "loss": 0.4409, + "step": 8176 + }, + { + "epoch": 0.93, + "learning_rate": 2.2409996764329644e-07, + "loss": 0.431, + "step": 8177 + }, + { + "epoch": 0.93, + "learning_rate": 2.233213732703665e-07, + "loss": 0.4616, + "step": 8178 + }, + { + "epoch": 0.93, + "learning_rate": 2.2254411852068226e-07, + "loss": 0.442, + "step": 8179 + }, + { + "epoch": 0.93, + "learning_rate": 2.2176820350074846e-07, + "loss": 0.4572, + "step": 8180 + }, + { + "epoch": 0.93, + "learning_rate": 2.2099362831688008e-07, + "loss": 0.4326, + "step": 8181 + }, + { + "epoch": 0.94, + "learning_rate": 2.2022039307521337e-07, + "loss": 0.4631, + "step": 8182 + }, + { + "epoch": 0.94, + "learning_rate": 2.1944849788169798e-07, + "loss": 0.4388, + "step": 8183 + }, + { + "epoch": 0.94, + "learning_rate": 2.1867794284209932e-07, + "loss": 0.4536, + "step": 8184 + }, + { + "epoch": 0.94, + "learning_rate": 2.179087280620018e-07, + "loss": 0.4582, + "step": 8185 + }, + { + "epoch": 0.94, + "learning_rate": 2.1714085364680671e-07, + "loss": 0.4496, + "step": 8186 + }, + { + "epoch": 0.94, + "learning_rate": 2.163743197017265e-07, + "loss": 0.4677, + "step": 8187 + }, + { + "epoch": 0.94, + "learning_rate": 2.156091263317972e-07, + "loss": 0.4457, + "step": 8188 + }, + { + "epoch": 0.94, + "learning_rate": 2.1484527364186492e-07, + "loss": 0.4335, + "step": 8189 + }, + { + "epoch": 0.94, + "learning_rate": 2.140827617365948e-07, + "loss": 0.4598, + "step": 8190 + }, + { + "epoch": 0.94, + 
"learning_rate": 2.1332159072046887e-07, + "loss": 0.4615, + "step": 8191 + }, + { + "epoch": 0.94, + "learning_rate": 2.1256176069778367e-07, + "loss": 0.4394, + "step": 8192 + }, + { + "epoch": 0.94, + "learning_rate": 2.118032717726537e-07, + "loss": 0.4442, + "step": 8193 + }, + { + "epoch": 0.94, + "learning_rate": 2.1104612404900805e-07, + "loss": 0.4259, + "step": 8194 + }, + { + "epoch": 0.94, + "learning_rate": 2.102903176305926e-07, + "loss": 0.461, + "step": 8195 + }, + { + "epoch": 0.94, + "learning_rate": 2.0953585262097232e-07, + "loss": 0.451, + "step": 8196 + }, + { + "epoch": 0.94, + "learning_rate": 2.0878272912352117e-07, + "loss": 0.455, + "step": 8197 + }, + { + "epoch": 0.94, + "learning_rate": 2.0803094724143879e-07, + "loss": 0.4479, + "step": 8198 + }, + { + "epoch": 0.94, + "learning_rate": 2.0728050707773285e-07, + "loss": 0.4658, + "step": 8199 + }, + { + "epoch": 0.94, + "learning_rate": 2.0653140873523104e-07, + "loss": 0.4647, + "step": 8200 + }, + { + "epoch": 0.94, + "learning_rate": 2.0578365231657792e-07, + "loss": 0.4496, + "step": 8201 + }, + { + "epoch": 0.94, + "learning_rate": 2.0503723792423047e-07, + "loss": 0.4439, + "step": 8202 + }, + { + "epoch": 0.94, + "learning_rate": 2.0429216566046682e-07, + "loss": 0.4974, + "step": 8203 + }, + { + "epoch": 0.94, + "learning_rate": 2.0354843562737537e-07, + "loss": 0.4585, + "step": 8204 + }, + { + "epoch": 0.94, + "learning_rate": 2.0280604792686676e-07, + "loss": 0.4521, + "step": 8205 + }, + { + "epoch": 0.94, + "learning_rate": 2.0206500266066297e-07, + "loss": 0.4531, + "step": 8206 + }, + { + "epoch": 0.94, + "learning_rate": 2.0132529993030392e-07, + "loss": 0.4476, + "step": 8207 + }, + { + "epoch": 0.94, + "learning_rate": 2.0058693983714628e-07, + "loss": 0.4422, + "step": 8208 + }, + { + "epoch": 0.94, + "learning_rate": 1.9984992248236135e-07, + "loss": 0.4532, + "step": 8209 + }, + { + "epoch": 0.94, + "learning_rate": 1.9911424796693611e-07, + "loss": 0.4243, + 
"step": 8210 + }, + { + "epoch": 0.94, + "learning_rate": 1.9837991639167552e-07, + "loss": 0.4565, + "step": 8211 + }, + { + "epoch": 0.94, + "learning_rate": 1.9764692785719909e-07, + "loss": 0.426, + "step": 8212 + }, + { + "epoch": 0.94, + "learning_rate": 1.9691528246394197e-07, + "loss": 0.4706, + "step": 8213 + }, + { + "epoch": 0.94, + "learning_rate": 1.9618498031215738e-07, + "loss": 0.4329, + "step": 8214 + }, + { + "epoch": 0.94, + "learning_rate": 1.954560215019108e-07, + "loss": 0.4393, + "step": 8215 + }, + { + "epoch": 0.94, + "learning_rate": 1.9472840613308787e-07, + "loss": 0.4472, + "step": 8216 + }, + { + "epoch": 0.94, + "learning_rate": 1.9400213430538773e-07, + "loss": 0.4587, + "step": 8217 + }, + { + "epoch": 0.94, + "learning_rate": 1.9327720611832523e-07, + "loss": 0.4529, + "step": 8218 + }, + { + "epoch": 0.94, + "learning_rate": 1.9255362167123316e-07, + "loss": 0.4603, + "step": 8219 + }, + { + "epoch": 0.94, + "learning_rate": 1.918313810632566e-07, + "loss": 0.4376, + "step": 8220 + }, + { + "epoch": 0.94, + "learning_rate": 1.9111048439335978e-07, + "loss": 0.4576, + "step": 8221 + }, + { + "epoch": 0.94, + "learning_rate": 1.903909317603214e-07, + "loss": 0.4322, + "step": 8222 + }, + { + "epoch": 0.94, + "learning_rate": 1.89672723262736e-07, + "loss": 0.4532, + "step": 8223 + }, + { + "epoch": 0.94, + "learning_rate": 1.889558589990148e-07, + "loss": 0.459, + "step": 8224 + }, + { + "epoch": 0.94, + "learning_rate": 1.882403390673837e-07, + "loss": 0.4548, + "step": 8225 + }, + { + "epoch": 0.94, + "learning_rate": 1.8752616356588648e-07, + "loss": 0.4389, + "step": 8226 + }, + { + "epoch": 0.94, + "learning_rate": 1.8681333259237933e-07, + "loss": 0.4535, + "step": 8227 + }, + { + "epoch": 0.94, + "learning_rate": 1.861018462445352e-07, + "loss": 0.4543, + "step": 8228 + }, + { + "epoch": 0.94, + "learning_rate": 1.8539170461984612e-07, + "loss": 0.4711, + "step": 8229 + }, + { + "epoch": 0.94, + "learning_rate": 
1.8468290781561538e-07, + "loss": 0.4632, + "step": 8230 + }, + { + "epoch": 0.94, + "learning_rate": 1.8397545592896527e-07, + "loss": 0.4474, + "step": 8231 + }, + { + "epoch": 0.94, + "learning_rate": 1.832693490568327e-07, + "loss": 0.4422, + "step": 8232 + }, + { + "epoch": 0.94, + "learning_rate": 1.8256458729596692e-07, + "loss": 0.4676, + "step": 8233 + }, + { + "epoch": 0.94, + "learning_rate": 1.8186117074293964e-07, + "loss": 0.4515, + "step": 8234 + }, + { + "epoch": 0.94, + "learning_rate": 1.811590994941337e-07, + "loss": 0.4537, + "step": 8235 + }, + { + "epoch": 0.94, + "learning_rate": 1.804583736457477e-07, + "loss": 0.437, + "step": 8236 + }, + { + "epoch": 0.94, + "learning_rate": 1.797589932937982e-07, + "loss": 0.4531, + "step": 8237 + }, + { + "epoch": 0.94, + "learning_rate": 1.790609585341141e-07, + "loss": 0.4455, + "step": 8238 + }, + { + "epoch": 0.94, + "learning_rate": 1.7836426946234332e-07, + "loss": 0.4547, + "step": 8239 + }, + { + "epoch": 0.94, + "learning_rate": 1.7766892617394727e-07, + "loss": 0.4408, + "step": 8240 + }, + { + "epoch": 0.94, + "learning_rate": 1.7697492876420198e-07, + "loss": 0.4442, + "step": 8241 + }, + { + "epoch": 0.94, + "learning_rate": 1.7628227732820247e-07, + "loss": 0.4326, + "step": 8242 + }, + { + "epoch": 0.94, + "learning_rate": 1.755909719608573e-07, + "loss": 0.4699, + "step": 8243 + }, + { + "epoch": 0.94, + "learning_rate": 1.7490101275689064e-07, + "loss": 0.459, + "step": 8244 + }, + { + "epoch": 0.94, + "learning_rate": 1.7421239981084136e-07, + "loss": 0.4532, + "step": 8245 + }, + { + "epoch": 0.94, + "learning_rate": 1.7352513321706621e-07, + "loss": 0.4455, + "step": 8246 + }, + { + "epoch": 0.94, + "learning_rate": 1.7283921306973538e-07, + "loss": 0.4646, + "step": 8247 + }, + { + "epoch": 0.94, + "learning_rate": 1.7215463946283483e-07, + "loss": 0.4467, + "step": 8248 + }, + { + "epoch": 0.94, + "learning_rate": 1.714714124901662e-07, + "loss": 0.4495, + "step": 8249 + }, + { + 
"epoch": 0.94, + "learning_rate": 1.70789532245349e-07, + "loss": 0.4299, + "step": 8250 + }, + { + "epoch": 0.94, + "learning_rate": 1.70108998821813e-07, + "loss": 0.456, + "step": 8251 + }, + { + "epoch": 0.94, + "learning_rate": 1.6942981231280798e-07, + "loss": 0.4508, + "step": 8252 + }, + { + "epoch": 0.94, + "learning_rate": 1.6875197281139844e-07, + "loss": 0.4679, + "step": 8253 + }, + { + "epoch": 0.94, + "learning_rate": 1.680754804104623e-07, + "loss": 0.4539, + "step": 8254 + }, + { + "epoch": 0.94, + "learning_rate": 1.6740033520269538e-07, + "loss": 0.458, + "step": 8255 + }, + { + "epoch": 0.94, + "learning_rate": 1.6672653728060594e-07, + "loss": 0.4426, + "step": 8256 + }, + { + "epoch": 0.94, + "learning_rate": 1.6605408673652012e-07, + "loss": 0.4507, + "step": 8257 + }, + { + "epoch": 0.94, + "learning_rate": 1.6538298366257975e-07, + "loss": 0.4408, + "step": 8258 + }, + { + "epoch": 0.94, + "learning_rate": 1.647132281507391e-07, + "loss": 0.4554, + "step": 8259 + }, + { + "epoch": 0.94, + "learning_rate": 1.6404482029277023e-07, + "loss": 0.4514, + "step": 8260 + }, + { + "epoch": 0.94, + "learning_rate": 1.6337776018026108e-07, + "loss": 0.449, + "step": 8261 + }, + { + "epoch": 0.94, + "learning_rate": 1.627120479046118e-07, + "loss": 0.4437, + "step": 8262 + }, + { + "epoch": 0.94, + "learning_rate": 1.620476835570417e-07, + "loss": 0.4701, + "step": 8263 + }, + { + "epoch": 0.94, + "learning_rate": 1.6138466722858237e-07, + "loss": 0.4428, + "step": 8264 + }, + { + "epoch": 0.94, + "learning_rate": 1.6072299901008226e-07, + "loss": 0.4623, + "step": 8265 + }, + { + "epoch": 0.94, + "learning_rate": 1.6006267899220552e-07, + "loss": 0.4378, + "step": 8266 + }, + { + "epoch": 0.94, + "learning_rate": 1.5940370726542864e-07, + "loss": 0.4599, + "step": 8267 + }, + { + "epoch": 0.94, + "learning_rate": 1.587460839200472e-07, + "loss": 0.4498, + "step": 8268 + }, + { + "epoch": 0.95, + "learning_rate": 1.580898090461691e-07, + "loss": 
0.4708, + "step": 8269 + }, + { + "epoch": 0.95, + "learning_rate": 1.5743488273372133e-07, + "loss": 0.4397, + "step": 8270 + }, + { + "epoch": 0.95, + "learning_rate": 1.567813050724387e-07, + "loss": 0.4513, + "step": 8271 + }, + { + "epoch": 0.95, + "learning_rate": 1.5612907615187967e-07, + "loss": 0.4539, + "step": 8272 + }, + { + "epoch": 0.95, + "learning_rate": 1.554781960614138e-07, + "loss": 0.467, + "step": 8273 + }, + { + "epoch": 0.95, + "learning_rate": 1.548286648902253e-07, + "loss": 0.4331, + "step": 8274 + }, + { + "epoch": 0.95, + "learning_rate": 1.5418048272731413e-07, + "loss": 0.438, + "step": 8275 + }, + { + "epoch": 0.95, + "learning_rate": 1.5353364966149697e-07, + "loss": 0.4538, + "step": 8276 + }, + { + "epoch": 0.95, + "learning_rate": 1.5288816578140298e-07, + "loss": 0.4652, + "step": 8277 + }, + { + "epoch": 0.95, + "learning_rate": 1.5224403117547916e-07, + "loss": 0.4538, + "step": 8278 + }, + { + "epoch": 0.95, + "learning_rate": 1.51601245931986e-07, + "loss": 0.4449, + "step": 8279 + }, + { + "epoch": 0.95, + "learning_rate": 1.5095981013899863e-07, + "loss": 0.435, + "step": 8280 + }, + { + "epoch": 0.95, + "learning_rate": 1.5031972388440787e-07, + "loss": 0.4511, + "step": 8281 + }, + { + "epoch": 0.95, + "learning_rate": 1.4968098725592127e-07, + "loss": 0.445, + "step": 8282 + }, + { + "epoch": 0.95, + "learning_rate": 1.4904360034106e-07, + "loss": 0.4407, + "step": 8283 + }, + { + "epoch": 0.95, + "learning_rate": 1.4840756322715866e-07, + "loss": 0.4414, + "step": 8284 + }, + { + "epoch": 0.95, + "learning_rate": 1.477728760013697e-07, + "loss": 0.4752, + "step": 8285 + }, + { + "epoch": 0.95, + "learning_rate": 1.4713953875065912e-07, + "loss": 0.4692, + "step": 8286 + }, + { + "epoch": 0.95, + "learning_rate": 1.4650755156180973e-07, + "loss": 0.4513, + "step": 8287 + }, + { + "epoch": 0.95, + "learning_rate": 1.458769145214145e-07, + "loss": 0.4425, + "step": 8288 + }, + { + "epoch": 0.95, + "learning_rate": 
1.4524762771588763e-07, + "loss": 0.4558, + "step": 8289 + }, + { + "epoch": 0.95, + "learning_rate": 1.4461969123145458e-07, + "loss": 0.4535, + "step": 8290 + }, + { + "epoch": 0.95, + "learning_rate": 1.4399310515415655e-07, + "loss": 0.4321, + "step": 8291 + }, + { + "epoch": 0.95, + "learning_rate": 1.4336786956985038e-07, + "loss": 0.4441, + "step": 8292 + }, + { + "epoch": 0.95, + "learning_rate": 1.4274398456420647e-07, + "loss": 0.4445, + "step": 8293 + }, + { + "epoch": 0.95, + "learning_rate": 1.4212145022271196e-07, + "loss": 0.4637, + "step": 8294 + }, + { + "epoch": 0.95, + "learning_rate": 1.415002666306664e-07, + "loss": 0.4635, + "step": 8295 + }, + { + "epoch": 0.95, + "learning_rate": 1.4088043387318838e-07, + "loss": 0.4472, + "step": 8296 + }, + { + "epoch": 0.95, + "learning_rate": 1.4026195203520666e-07, + "loss": 0.4377, + "step": 8297 + }, + { + "epoch": 0.95, + "learning_rate": 1.3964482120146672e-07, + "loss": 0.4463, + "step": 8298 + }, + { + "epoch": 0.95, + "learning_rate": 1.3902904145653094e-07, + "loss": 0.4532, + "step": 8299 + }, + { + "epoch": 0.95, + "learning_rate": 1.384146128847741e-07, + "loss": 0.4427, + "step": 8300 + }, + { + "epoch": 0.95, + "learning_rate": 1.3780153557038655e-07, + "loss": 0.4533, + "step": 8301 + }, + { + "epoch": 0.95, + "learning_rate": 1.3718980959737448e-07, + "loss": 0.4675, + "step": 8302 + }, + { + "epoch": 0.95, + "learning_rate": 1.365794350495564e-07, + "loss": 0.4572, + "step": 8303 + }, + { + "epoch": 0.95, + "learning_rate": 1.359704120105687e-07, + "loss": 0.4524, + "step": 8304 + }, + { + "epoch": 0.95, + "learning_rate": 1.3536274056386134e-07, + "loss": 0.4236, + "step": 8305 + }, + { + "epoch": 0.95, + "learning_rate": 1.3475642079269659e-07, + "loss": 0.4386, + "step": 8306 + }, + { + "epoch": 0.95, + "learning_rate": 1.3415145278015575e-07, + "loss": 0.4413, + "step": 8307 + }, + { + "epoch": 0.95, + "learning_rate": 1.335478366091325e-07, + "loss": 0.4468, + "step": 8308 + }, + { 
+ "epoch": 0.95, + "learning_rate": 1.329455723623352e-07, + "loss": 0.4596, + "step": 8309 + }, + { + "epoch": 0.95, + "learning_rate": 1.3234466012228887e-07, + "loss": 0.467, + "step": 8310 + }, + { + "epoch": 0.95, + "learning_rate": 1.31745099971331e-07, + "loss": 0.4309, + "step": 8311 + }, + { + "epoch": 0.95, + "learning_rate": 1.3114689199161478e-07, + "loss": 0.4875, + "step": 8312 + }, + { + "epoch": 0.95, + "learning_rate": 1.3055003626510687e-07, + "loss": 0.4334, + "step": 8313 + }, + { + "epoch": 0.95, + "learning_rate": 1.2995453287359293e-07, + "loss": 0.4388, + "step": 8314 + }, + { + "epoch": 0.95, + "learning_rate": 1.2936038189866773e-07, + "loss": 0.4582, + "step": 8315 + }, + { + "epoch": 0.95, + "learning_rate": 1.287675834217428e-07, + "loss": 0.4398, + "step": 8316 + }, + { + "epoch": 0.95, + "learning_rate": 1.2817613752404646e-07, + "loss": 0.4413, + "step": 8317 + }, + { + "epoch": 0.95, + "learning_rate": 1.2758604428661836e-07, + "loss": 0.469, + "step": 8318 + }, + { + "epoch": 0.95, + "learning_rate": 1.2699730379031604e-07, + "loss": 0.4525, + "step": 8319 + }, + { + "epoch": 0.95, + "learning_rate": 1.2640991611580943e-07, + "loss": 0.4553, + "step": 8320 + }, + { + "epoch": 0.95, + "learning_rate": 1.2582388134358414e-07, + "loss": 0.469, + "step": 8321 + }, + { + "epoch": 0.95, + "learning_rate": 1.2523919955393925e-07, + "loss": 0.4566, + "step": 8322 + }, + { + "epoch": 0.95, + "learning_rate": 1.246558708269896e-07, + "loss": 0.4418, + "step": 8323 + }, + { + "epoch": 0.95, + "learning_rate": 1.2407389524266456e-07, + "loss": 0.4348, + "step": 8324 + }, + { + "epoch": 0.95, + "learning_rate": 1.23493272880707e-07, + "loss": 0.4491, + "step": 8325 + }, + { + "epoch": 0.95, + "learning_rate": 1.2291400382067553e-07, + "loss": 0.4521, + "step": 8326 + }, + { + "epoch": 0.95, + "learning_rate": 1.223360881419433e-07, + "loss": 0.4571, + "step": 8327 + }, + { + "epoch": 0.95, + "learning_rate": 1.21759525923697e-07, + "loss": 
0.4485, + "step": 8328 + }, + { + "epoch": 0.95, + "learning_rate": 1.2118431724493895e-07, + "loss": 0.4499, + "step": 8329 + }, + { + "epoch": 0.95, + "learning_rate": 1.2061046218448724e-07, + "loss": 0.4544, + "step": 8330 + }, + { + "epoch": 0.95, + "learning_rate": 1.2003796082097008e-07, + "loss": 0.4409, + "step": 8331 + }, + { + "epoch": 0.95, + "learning_rate": 1.194668132328325e-07, + "loss": 0.4329, + "step": 8332 + }, + { + "epoch": 0.95, + "learning_rate": 1.1889701949833743e-07, + "loss": 0.4414, + "step": 8333 + }, + { + "epoch": 0.95, + "learning_rate": 1.18328579695558e-07, + "loss": 0.448, + "step": 8334 + }, + { + "epoch": 0.95, + "learning_rate": 1.1776149390238301e-07, + "loss": 0.4538, + "step": 8335 + }, + { + "epoch": 0.95, + "learning_rate": 1.1719576219651585e-07, + "loss": 0.4408, + "step": 8336 + }, + { + "epoch": 0.95, + "learning_rate": 1.1663138465547341e-07, + "loss": 0.4368, + "step": 8337 + }, + { + "epoch": 0.95, + "learning_rate": 1.1606836135658939e-07, + "loss": 0.453, + "step": 8338 + }, + { + "epoch": 0.95, + "learning_rate": 1.1550669237700985e-07, + "loss": 0.4487, + "step": 8339 + }, + { + "epoch": 0.95, + "learning_rate": 1.1494637779369766e-07, + "loss": 0.429, + "step": 8340 + }, + { + "epoch": 0.95, + "learning_rate": 1.1438741768342587e-07, + "loss": 0.4466, + "step": 8341 + }, + { + "epoch": 0.95, + "learning_rate": 1.1382981212278655e-07, + "loss": 0.4571, + "step": 8342 + }, + { + "epoch": 0.95, + "learning_rate": 1.13273561188183e-07, + "loss": 0.4546, + "step": 8343 + }, + { + "epoch": 0.95, + "learning_rate": 1.1271866495583428e-07, + "loss": 0.4403, + "step": 8344 + }, + { + "epoch": 0.95, + "learning_rate": 1.12165123501774e-07, + "loss": 0.4579, + "step": 8345 + }, + { + "epoch": 0.95, + "learning_rate": 1.1161293690184927e-07, + "loss": 0.4458, + "step": 8346 + }, + { + "epoch": 0.95, + "learning_rate": 1.1106210523172068e-07, + "loss": 0.4447, + "step": 8347 + }, + { + "epoch": 0.95, + "learning_rate": 
1.1051262856686673e-07, + "loss": 0.4585, + "step": 8348 + }, + { + "epoch": 0.95, + "learning_rate": 1.0996450698257721e-07, + "loss": 0.4428, + "step": 8349 + }, + { + "epoch": 0.95, + "learning_rate": 1.0941774055395538e-07, + "loss": 0.4559, + "step": 8350 + }, + { + "epoch": 0.95, + "learning_rate": 1.0887232935592351e-07, + "loss": 0.4442, + "step": 8351 + }, + { + "epoch": 0.95, + "learning_rate": 1.0832827346321295e-07, + "loss": 0.4561, + "step": 8352 + }, + { + "epoch": 0.95, + "learning_rate": 1.0778557295037296e-07, + "loss": 0.4558, + "step": 8353 + }, + { + "epoch": 0.95, + "learning_rate": 1.0724422789176404e-07, + "loss": 0.4638, + "step": 8354 + }, + { + "epoch": 0.95, + "learning_rate": 1.0670423836156241e-07, + "loss": 0.439, + "step": 8355 + }, + { + "epoch": 0.95, + "learning_rate": 1.0616560443376e-07, + "loss": 0.4519, + "step": 8356 + }, + { + "epoch": 0.96, + "learning_rate": 1.0562832618216223e-07, + "loss": 0.4447, + "step": 8357 + }, + { + "epoch": 0.96, + "learning_rate": 1.0509240368038576e-07, + "loss": 0.4416, + "step": 8358 + }, + { + "epoch": 0.96, + "learning_rate": 1.0455783700186628e-07, + "loss": 0.4505, + "step": 8359 + }, + { + "epoch": 0.96, + "learning_rate": 1.0402462621984965e-07, + "loss": 0.4415, + "step": 8360 + }, + { + "epoch": 0.96, + "learning_rate": 1.0349277140739966e-07, + "loss": 0.4615, + "step": 8361 + }, + { + "epoch": 0.96, + "learning_rate": 1.0296227263739023e-07, + "loss": 0.4467, + "step": 8362 + }, + { + "epoch": 0.96, + "learning_rate": 1.0243312998251209e-07, + "loss": 0.4314, + "step": 8363 + }, + { + "epoch": 0.96, + "learning_rate": 1.0190534351527059e-07, + "loss": 0.4581, + "step": 8364 + }, + { + "epoch": 0.96, + "learning_rate": 1.0137891330798344e-07, + "loss": 0.4285, + "step": 8365 + }, + { + "epoch": 0.96, + "learning_rate": 1.0085383943278293e-07, + "loss": 0.4495, + "step": 8366 + }, + { + "epoch": 0.96, + "learning_rate": 1.0033012196161706e-07, + "loss": 0.4665, + "step": 8367 + }, + { 
+ "epoch": 0.96, + "learning_rate": 9.980776096624511e-08, + "loss": 0.4569, + "step": 8368 + }, + { + "epoch": 0.96, + "learning_rate": 9.928675651824427e-08, + "loss": 0.4427, + "step": 8369 + }, + { + "epoch": 0.96, + "learning_rate": 9.876710868900297e-08, + "loss": 0.4595, + "step": 8370 + }, + { + "epoch": 0.96, + "learning_rate": 9.824881754972426e-08, + "loss": 0.4592, + "step": 8371 + }, + { + "epoch": 0.96, + "learning_rate": 9.773188317142579e-08, + "loss": 0.4459, + "step": 8372 + }, + { + "epoch": 0.96, + "learning_rate": 9.721630562493867e-08, + "loss": 0.4578, + "step": 8373 + }, + { + "epoch": 0.96, + "learning_rate": 9.670208498090861e-08, + "loss": 0.4662, + "step": 8374 + }, + { + "epoch": 0.96, + "learning_rate": 9.61892213097959e-08, + "loss": 0.4425, + "step": 8375 + }, + { + "epoch": 0.96, + "learning_rate": 9.567771468187326e-08, + "loss": 0.4341, + "step": 8376 + }, + { + "epoch": 0.96, + "learning_rate": 9.516756516723124e-08, + "loss": 0.4402, + "step": 8377 + }, + { + "epoch": 0.96, + "learning_rate": 9.46587728357673e-08, + "loss": 0.4484, + "step": 8378 + }, + { + "epoch": 0.96, + "learning_rate": 9.415133775720231e-08, + "loss": 0.477, + "step": 8379 + }, + { + "epoch": 0.96, + "learning_rate": 9.364526000106289e-08, + "loss": 0.4292, + "step": 8380 + }, + { + "epoch": 0.96, + "learning_rate": 9.314053963669245e-08, + "loss": 0.4351, + "step": 8381 + }, + { + "epoch": 0.96, + "learning_rate": 9.263717673325124e-08, + "loss": 0.4642, + "step": 8382 + }, + { + "epoch": 0.96, + "learning_rate": 9.213517135971073e-08, + "loss": 0.4251, + "step": 8383 + }, + { + "epoch": 0.96, + "learning_rate": 9.163452358485591e-08, + "loss": 0.4471, + "step": 8384 + }, + { + "epoch": 0.96, + "learning_rate": 9.113523347728748e-08, + "loss": 0.4405, + "step": 8385 + }, + { + "epoch": 0.96, + "learning_rate": 9.063730110541846e-08, + "loss": 0.4519, + "step": 8386 + }, + { + "epoch": 0.96, + "learning_rate": 9.014072653747763e-08, + "loss": 0.4561, + 
"step": 8387 + }, + { + "epoch": 0.96, + "learning_rate": 8.964550984150611e-08, + "loss": 0.4654, + "step": 8388 + }, + { + "epoch": 0.96, + "learning_rate": 8.915165108536072e-08, + "loss": 0.4423, + "step": 8389 + }, + { + "epoch": 0.96, + "learning_rate": 8.865915033671069e-08, + "loss": 0.4749, + "step": 8390 + }, + { + "epoch": 0.96, + "learning_rate": 8.816800766303756e-08, + "loss": 0.4455, + "step": 8391 + }, + { + "epoch": 0.96, + "learning_rate": 8.767822313164198e-08, + "loss": 0.4368, + "step": 8392 + }, + { + "epoch": 0.96, + "learning_rate": 8.718979680963469e-08, + "loss": 0.4495, + "step": 8393 + }, + { + "epoch": 0.96, + "learning_rate": 8.670272876393881e-08, + "loss": 0.4468, + "step": 8394 + }, + { + "epoch": 0.96, + "learning_rate": 8.621701906129542e-08, + "loss": 0.4538, + "step": 8395 + }, + { + "epoch": 0.96, + "learning_rate": 8.573266776825683e-08, + "loss": 0.4725, + "step": 8396 + }, + { + "epoch": 0.96, + "learning_rate": 8.524967495119107e-08, + "loss": 0.4482, + "step": 8397 + }, + { + "epoch": 0.96, + "learning_rate": 8.476804067627852e-08, + "loss": 0.4601, + "step": 8398 + }, + { + "epoch": 0.96, + "learning_rate": 8.428776500951308e-08, + "loss": 0.4497, + "step": 8399 + }, + { + "epoch": 0.96, + "learning_rate": 8.380884801670431e-08, + "loss": 0.4364, + "step": 8400 + }, + { + "epoch": 0.96, + "learning_rate": 8.333128976347305e-08, + "loss": 0.4429, + "step": 8401 + }, + { + "epoch": 0.96, + "learning_rate": 8.285509031525696e-08, + "loss": 0.461, + "step": 8402 + }, + { + "epoch": 0.96, + "learning_rate": 8.238024973730497e-08, + "loss": 0.4412, + "step": 8403 + }, + { + "epoch": 0.96, + "learning_rate": 8.190676809468056e-08, + "loss": 0.4573, + "step": 8404 + }, + { + "epoch": 0.96, + "learning_rate": 8.143464545226298e-08, + "loss": 0.4421, + "step": 8405 + }, + { + "epoch": 0.96, + "learning_rate": 8.096388187474269e-08, + "loss": 0.4717, + "step": 8406 + }, + { + "epoch": 0.96, + "learning_rate": 8.049447742662364e-08, 
+ "loss": 0.4514, + "step": 8407 + }, + { + "epoch": 0.96, + "learning_rate": 8.002643217222661e-08, + "loss": 0.4507, + "step": 8408 + }, + { + "epoch": 0.96, + "learning_rate": 7.955974617568252e-08, + "loss": 0.4262, + "step": 8409 + }, + { + "epoch": 0.96, + "learning_rate": 7.90944195009391e-08, + "loss": 0.4479, + "step": 8410 + }, + { + "epoch": 0.96, + "learning_rate": 7.863045221175647e-08, + "loss": 0.4506, + "step": 8411 + }, + { + "epoch": 0.96, + "learning_rate": 7.81678443717071e-08, + "loss": 0.4619, + "step": 8412 + }, + { + "epoch": 0.96, + "learning_rate": 7.77065960441803e-08, + "loss": 0.4708, + "step": 8413 + }, + { + "epoch": 0.96, + "learning_rate": 7.72467072923766e-08, + "loss": 0.4405, + "step": 8414 + }, + { + "epoch": 0.96, + "learning_rate": 7.678817817931006e-08, + "loss": 0.4463, + "step": 8415 + }, + { + "epoch": 0.96, + "learning_rate": 7.633100876781152e-08, + "loss": 0.48, + "step": 8416 + }, + { + "epoch": 0.96, + "learning_rate": 7.587519912052199e-08, + "loss": 0.4355, + "step": 8417 + }, + { + "epoch": 0.96, + "learning_rate": 7.542074929989818e-08, + "loss": 0.4531, + "step": 8418 + }, + { + "epoch": 0.96, + "learning_rate": 7.496765936821027e-08, + "loss": 0.4639, + "step": 8419 + }, + { + "epoch": 0.96, + "learning_rate": 7.451592938753971e-08, + "loss": 0.4652, + "step": 8420 + }, + { + "epoch": 0.96, + "learning_rate": 7.406555941978478e-08, + "loss": 0.4412, + "step": 8421 + }, + { + "epoch": 0.96, + "learning_rate": 7.361654952665608e-08, + "loss": 0.4694, + "step": 8422 + }, + { + "epoch": 0.96, + "learning_rate": 7.31688997696789e-08, + "loss": 0.4387, + "step": 8423 + }, + { + "epoch": 0.96, + "learning_rate": 7.272261021019079e-08, + "loss": 0.4594, + "step": 8424 + }, + { + "epoch": 0.96, + "learning_rate": 7.227768090934285e-08, + "loss": 0.4461, + "step": 8425 + }, + { + "epoch": 0.96, + "learning_rate": 7.183411192810075e-08, + "loss": 0.4421, + "step": 8426 + }, + { + "epoch": 0.96, + "learning_rate": 
7.139190332724255e-08, + "loss": 0.4374, + "step": 8427 + }, + { + "epoch": 0.96, + "learning_rate": 7.095105516736201e-08, + "loss": 0.4646, + "step": 8428 + }, + { + "epoch": 0.96, + "learning_rate": 7.051156750886523e-08, + "loss": 0.451, + "step": 8429 + }, + { + "epoch": 0.96, + "learning_rate": 7.007344041196962e-08, + "loss": 0.4344, + "step": 8430 + }, + { + "epoch": 0.96, + "learning_rate": 6.963667393671048e-08, + "loss": 0.4456, + "step": 8431 + }, + { + "epoch": 0.96, + "learning_rate": 6.920126814293438e-08, + "loss": 0.4396, + "step": 8432 + }, + { + "epoch": 0.96, + "learning_rate": 6.876722309030026e-08, + "loss": 0.4522, + "step": 8433 + }, + { + "epoch": 0.96, + "learning_rate": 6.833453883828389e-08, + "loss": 0.449, + "step": 8434 + }, + { + "epoch": 0.96, + "learning_rate": 6.790321544617117e-08, + "loss": 0.4438, + "step": 8435 + }, + { + "epoch": 0.96, + "learning_rate": 6.747325297306484e-08, + "loss": 0.4538, + "step": 8436 + }, + { + "epoch": 0.96, + "learning_rate": 6.704465147787665e-08, + "loss": 0.4676, + "step": 8437 + }, + { + "epoch": 0.96, + "learning_rate": 6.661741101933628e-08, + "loss": 0.4309, + "step": 8438 + }, + { + "epoch": 0.96, + "learning_rate": 6.61915316559858e-08, + "loss": 0.4555, + "step": 8439 + }, + { + "epoch": 0.96, + "learning_rate": 6.576701344617964e-08, + "loss": 0.4451, + "step": 8440 + }, + { + "epoch": 0.96, + "learning_rate": 6.534385644808461e-08, + "loss": 0.4492, + "step": 8441 + }, + { + "epoch": 0.96, + "learning_rate": 6.492206071968432e-08, + "loss": 0.4569, + "step": 8442 + }, + { + "epoch": 0.96, + "learning_rate": 6.450162631877366e-08, + "loss": 0.4277, + "step": 8443 + }, + { + "epoch": 0.97, + "learning_rate": 6.40825533029632e-08, + "loss": 0.4393, + "step": 8444 + }, + { + "epoch": 0.97, + "learning_rate": 6.366484172967369e-08, + "loss": 0.4463, + "step": 8445 + }, + { + "epoch": 0.97, + "learning_rate": 6.324849165614045e-08, + "loss": 0.4572, + "step": 8446 + }, + { + "epoch": 0.97, + 
"learning_rate": 6.28335031394134e-08, + "loss": 0.4336, + "step": 8447 + }, + { + "epoch": 0.97, + "learning_rate": 6.241987623635482e-08, + "loss": 0.4617, + "step": 8448 + }, + { + "epoch": 0.97, + "learning_rate": 6.200761100364272e-08, + "loss": 0.4252, + "step": 8449 + }, + { + "epoch": 0.97, + "learning_rate": 6.159670749776414e-08, + "loss": 0.4396, + "step": 8450 + }, + { + "epoch": 0.97, + "learning_rate": 6.118716577502404e-08, + "loss": 0.4562, + "step": 8451 + }, + { + "epoch": 0.97, + "learning_rate": 6.077898589153642e-08, + "loss": 0.4598, + "step": 8452 + }, + { + "epoch": 0.97, + "learning_rate": 6.037216790323319e-08, + "loss": 0.4458, + "step": 8453 + }, + { + "epoch": 0.97, + "learning_rate": 5.996671186585756e-08, + "loss": 0.4534, + "step": 8454 + }, + { + "epoch": 0.97, + "learning_rate": 5.9562617834963974e-08, + "loss": 0.4514, + "step": 8455 + }, + { + "epoch": 0.97, + "learning_rate": 5.915988586592481e-08, + "loss": 0.4507, + "step": 8456 + }, + { + "epoch": 0.97, + "learning_rate": 5.8758516013921464e-08, + "loss": 0.4562, + "step": 8457 + }, + { + "epoch": 0.97, + "learning_rate": 5.8358508333951066e-08, + "loss": 0.4379, + "step": 8458 + }, + { + "epoch": 0.97, + "learning_rate": 5.795986288082422e-08, + "loss": 0.438, + "step": 8459 + }, + { + "epoch": 0.97, + "learning_rate": 5.75625797091639e-08, + "loss": 0.4665, + "step": 8460 + }, + { + "epoch": 0.97, + "learning_rate": 5.716665887340656e-08, + "loss": 0.4447, + "step": 8461 + }, + { + "epoch": 0.97, + "learning_rate": 5.677210042780212e-08, + "loss": 0.4512, + "step": 8462 + }, + { + "epoch": 0.97, + "learning_rate": 5.637890442641403e-08, + "loss": 0.4368, + "step": 8463 + }, + { + "epoch": 0.97, + "learning_rate": 5.598707092311917e-08, + "loss": 0.4483, + "step": 8464 + }, + { + "epoch": 0.97, + "learning_rate": 5.5596599971606823e-08, + "loss": 0.4491, + "step": 8465 + }, + { + "epoch": 0.97, + "learning_rate": 5.520749162538197e-08, + "loss": 0.4589, + "step": 8466 + }, + 
{ + "epoch": 0.97, + "learning_rate": 5.4819745937758625e-08, + "loss": 0.4372, + "step": 8467 + }, + { + "epoch": 0.97, + "learning_rate": 5.443336296186874e-08, + "loss": 0.4507, + "step": 8468 + }, + { + "epoch": 0.97, + "learning_rate": 5.40483427506544e-08, + "loss": 0.4463, + "step": 8469 + }, + { + "epoch": 0.97, + "learning_rate": 5.3664685356871193e-08, + "loss": 0.4787, + "step": 8470 + }, + { + "epoch": 0.97, + "learning_rate": 5.3282390833090393e-08, + "loss": 0.4549, + "step": 8471 + }, + { + "epoch": 0.97, + "learning_rate": 5.290145923169343e-08, + "loss": 0.4251, + "step": 8472 + }, + { + "epoch": 0.97, + "learning_rate": 5.252189060487855e-08, + "loss": 0.4497, + "step": 8473 + }, + { + "epoch": 0.97, + "learning_rate": 5.214368500465305e-08, + "loss": 0.4614, + "step": 8474 + }, + { + "epoch": 0.97, + "learning_rate": 5.176684248283992e-08, + "loss": 0.4422, + "step": 8475 + }, + { + "epoch": 0.97, + "learning_rate": 5.1391363091075616e-08, + "loss": 0.4454, + "step": 8476 + }, + { + "epoch": 0.97, + "learning_rate": 5.1017246880809e-08, + "loss": 0.4402, + "step": 8477 + }, + { + "epoch": 0.97, + "learning_rate": 5.064449390330239e-08, + "loss": 0.4599, + "step": 8478 + }, + { + "epoch": 0.97, + "learning_rate": 5.02731042096305e-08, + "loss": 0.4456, + "step": 8479 + }, + { + "epoch": 0.97, + "learning_rate": 4.99030778506826e-08, + "loss": 0.4534, + "step": 8480 + }, + { + "epoch": 0.97, + "learning_rate": 4.953441487716037e-08, + "loss": 0.4505, + "step": 8481 + }, + { + "epoch": 0.97, + "learning_rate": 4.9167115339580074e-08, + "loss": 0.461, + "step": 8482 + }, + { + "epoch": 0.97, + "learning_rate": 4.8801179288268105e-08, + "loss": 0.4449, + "step": 8483 + }, + { + "epoch": 0.97, + "learning_rate": 4.84366067733677e-08, + "loss": 0.451, + "step": 8484 + }, + { + "epoch": 0.97, + "learning_rate": 4.807339784483112e-08, + "loss": 0.4375, + "step": 8485 + }, + { + "epoch": 0.97, + "learning_rate": 4.771155255242854e-08, + "loss": 0.4622, + 
"step": 8486 + }, + { + "epoch": 0.97, + "learning_rate": 4.7351070945739206e-08, + "loss": 0.4462, + "step": 8487 + }, + { + "epoch": 0.97, + "learning_rate": 4.699195307415805e-08, + "loss": 0.4648, + "step": 8488 + }, + { + "epoch": 0.97, + "learning_rate": 4.663419898689125e-08, + "loss": 0.4283, + "step": 8489 + }, + { + "epoch": 0.97, + "learning_rate": 4.6277808732959616e-08, + "loss": 0.4329, + "step": 8490 + }, + { + "epoch": 0.97, + "learning_rate": 4.5922782361197405e-08, + "loss": 0.4742, + "step": 8491 + }, + { + "epoch": 0.97, + "learning_rate": 4.556911992025015e-08, + "loss": 0.4469, + "step": 8492 + }, + { + "epoch": 0.97, + "learning_rate": 4.521682145857797e-08, + "loss": 0.4465, + "step": 8493 + }, + { + "epoch": 0.97, + "learning_rate": 4.486588702445338e-08, + "loss": 0.4458, + "step": 8494 + }, + { + "epoch": 0.97, + "learning_rate": 4.451631666596123e-08, + "loss": 0.4394, + "step": 8495 + }, + { + "epoch": 0.97, + "learning_rate": 4.416811043100322e-08, + "loss": 0.4785, + "step": 8496 + }, + { + "epoch": 0.97, + "learning_rate": 4.382126836728895e-08, + "loss": 0.4551, + "step": 8497 + }, + { + "epoch": 0.97, + "learning_rate": 4.347579052234374e-08, + "loss": 0.4671, + "step": 8498 + }, + { + "epoch": 0.97, + "learning_rate": 4.3131676943506395e-08, + "loss": 0.4613, + "step": 8499 + }, + { + "epoch": 0.97, + "learning_rate": 4.278892767792808e-08, + "loss": 0.463, + "step": 8500 + }, + { + "epoch": 0.97, + "learning_rate": 4.244754277257346e-08, + "loss": 0.4322, + "step": 8501 + }, + { + "epoch": 0.97, + "learning_rate": 4.210752227421955e-08, + "loss": 0.4353, + "step": 8502 + }, + { + "epoch": 0.97, + "learning_rate": 4.176886622945575e-08, + "loss": 0.4607, + "step": 8503 + }, + { + "epoch": 0.97, + "learning_rate": 4.143157468468717e-08, + "loss": 0.4652, + "step": 8504 + }, + { + "epoch": 0.97, + "learning_rate": 4.109564768613017e-08, + "loss": 0.4465, + "step": 8505 + }, + { + "epoch": 0.97, + "learning_rate": 
4.076108527981237e-08, + "loss": 0.4399, + "step": 8506 + }, + { + "epoch": 0.97, + "learning_rate": 4.0427887511578224e-08, + "loss": 0.4457, + "step": 8507 + }, + { + "epoch": 0.97, + "learning_rate": 4.009605442708231e-08, + "loss": 0.4501, + "step": 8508 + }, + { + "epoch": 0.97, + "learning_rate": 3.976558607179382e-08, + "loss": 0.4637, + "step": 8509 + }, + { + "epoch": 0.97, + "learning_rate": 3.943648249099319e-08, + "loss": 0.4323, + "step": 8510 + }, + { + "epoch": 0.97, + "learning_rate": 3.910874372977658e-08, + "loss": 0.4528, + "step": 8511 + }, + { + "epoch": 0.97, + "learning_rate": 3.8782369833050284e-08, + "loss": 0.4539, + "step": 8512 + }, + { + "epoch": 0.97, + "learning_rate": 3.845736084553408e-08, + "loss": 0.4405, + "step": 8513 + }, + { + "epoch": 0.97, + "learning_rate": 3.813371681176348e-08, + "loss": 0.4558, + "step": 8514 + }, + { + "epoch": 0.97, + "learning_rate": 3.7811437776084095e-08, + "loss": 0.4546, + "step": 8515 + }, + { + "epoch": 0.97, + "learning_rate": 3.749052378265505e-08, + "loss": 0.4363, + "step": 8516 + }, + { + "epoch": 0.97, + "learning_rate": 3.717097487545007e-08, + "loss": 0.4546, + "step": 8517 + }, + { + "epoch": 0.97, + "learning_rate": 3.6852791098251906e-08, + "loss": 0.4673, + "step": 8518 + }, + { + "epoch": 0.97, + "learning_rate": 3.653597249466012e-08, + "loss": 0.4412, + "step": 8519 + }, + { + "epoch": 0.97, + "learning_rate": 3.622051910808666e-08, + "loss": 0.4481, + "step": 8520 + }, + { + "epoch": 0.97, + "learning_rate": 3.5906430981754724e-08, + "loss": 0.4613, + "step": 8521 + }, + { + "epoch": 0.97, + "learning_rate": 3.559370815870211e-08, + "loss": 0.4789, + "step": 8522 + }, + { + "epoch": 0.97, + "learning_rate": 3.528235068177899e-08, + "loss": 0.4422, + "step": 8523 + }, + { + "epoch": 0.97, + "learning_rate": 3.4972358593646785e-08, + "loss": 0.4468, + "step": 8524 + }, + { + "epoch": 0.97, + "learning_rate": 3.466373193678263e-08, + "loss": 0.4509, + "step": 8525 + }, + { + 
"epoch": 0.97, + "learning_rate": 3.4356470753474927e-08, + "loss": 0.4454, + "step": 8526 + }, + { + "epoch": 0.97, + "learning_rate": 3.4050575085825546e-08, + "loss": 0.4475, + "step": 8527 + }, + { + "epoch": 0.97, + "learning_rate": 3.3746044975749845e-08, + "loss": 0.4434, + "step": 8528 + }, + { + "epoch": 0.97, + "learning_rate": 3.3442880464972237e-08, + "loss": 0.4462, + "step": 8529 + }, + { + "epoch": 0.97, + "learning_rate": 3.314108159503726e-08, + "loss": 0.4664, + "step": 8530 + }, + { + "epoch": 0.97, + "learning_rate": 3.284064840729406e-08, + "loss": 0.453, + "step": 8531 + }, + { + "epoch": 0.98, + "learning_rate": 3.2541580942911935e-08, + "loss": 0.4415, + "step": 8532 + }, + { + "epoch": 0.98, + "learning_rate": 3.224387924286698e-08, + "loss": 0.4437, + "step": 8533 + }, + { + "epoch": 0.98, + "learning_rate": 3.1947543347953246e-08, + "loss": 0.4495, + "step": 8534 + }, + { + "epoch": 0.98, + "learning_rate": 3.1652573298774916e-08, + "loss": 0.4552, + "step": 8535 + }, + { + "epoch": 0.98, + "learning_rate": 3.135896913574743e-08, + "loss": 0.4514, + "step": 8536 + }, + { + "epoch": 0.98, + "learning_rate": 3.106673089910417e-08, + "loss": 0.4368, + "step": 8537 + }, + { + "epoch": 0.98, + "learning_rate": 3.077585862888643e-08, + "loss": 0.4588, + "step": 8538 + }, + { + "epoch": 0.98, + "learning_rate": 3.048635236495012e-08, + "loss": 0.446, + "step": 8539 + }, + { + "epoch": 0.98, + "learning_rate": 3.019821214696572e-08, + "loss": 0.4568, + "step": 8540 + }, + { + "epoch": 0.98, + "learning_rate": 2.9911438014412765e-08, + "loss": 0.4559, + "step": 8541 + }, + { + "epoch": 0.98, + "learning_rate": 2.962603000658648e-08, + "loss": 0.4478, + "step": 8542 + }, + { + "epoch": 0.98, + "learning_rate": 2.9341988162595593e-08, + "loss": 0.458, + "step": 8543 + }, + { + "epoch": 0.98, + "learning_rate": 2.905931252135785e-08, + "loss": 0.4289, + "step": 8544 + }, + { + "epoch": 0.98, + "learning_rate": 2.8778003121607834e-08, + "loss": 
0.4586, + "step": 8545 + }, + { + "epoch": 0.98, + "learning_rate": 2.849806000189026e-08, + "loss": 0.4475, + "step": 8546 + }, + { + "epoch": 0.98, + "learning_rate": 2.8219483200563334e-08, + "loss": 0.4482, + "step": 8547 + }, + { + "epoch": 0.98, + "learning_rate": 2.794227275579986e-08, + "loss": 0.4307, + "step": 8548 + }, + { + "epoch": 0.98, + "learning_rate": 2.766642870558278e-08, + "loss": 0.4794, + "step": 8549 + }, + { + "epoch": 0.98, + "learning_rate": 2.7391951087708534e-08, + "loss": 0.4402, + "step": 8550 + }, + { + "epoch": 0.98, + "learning_rate": 2.7118839939787033e-08, + "loss": 0.4402, + "step": 8551 + }, + { + "epoch": 0.98, + "learning_rate": 2.6847095299241678e-08, + "loss": 0.4382, + "step": 8552 + }, + { + "epoch": 0.98, + "learning_rate": 2.6576717203304904e-08, + "loss": 0.4644, + "step": 8553 + }, + { + "epoch": 0.98, + "learning_rate": 2.6307705689028184e-08, + "loss": 0.4639, + "step": 8554 + }, + { + "epoch": 0.98, + "learning_rate": 2.6040060793268705e-08, + "loss": 0.4479, + "step": 8555 + }, + { + "epoch": 0.98, + "learning_rate": 2.5773782552701578e-08, + "loss": 0.4348, + "step": 8556 + }, + { + "epoch": 0.98, + "learning_rate": 2.550887100381205e-08, + "loss": 0.436, + "step": 8557 + }, + { + "epoch": 0.98, + "learning_rate": 2.5245326182899987e-08, + "loss": 0.4596, + "step": 8558 + }, + { + "epoch": 0.98, + "learning_rate": 2.4983148126076494e-08, + "loss": 0.4529, + "step": 8559 + }, + { + "epoch": 0.98, + "learning_rate": 2.4722336869265063e-08, + "loss": 0.4305, + "step": 8560 + }, + { + "epoch": 0.98, + "learning_rate": 2.4462892448202657e-08, + "loss": 0.4697, + "step": 8561 + }, + { + "epoch": 0.98, + "learning_rate": 2.4204814898440844e-08, + "loss": 0.4521, + "step": 8562 + }, + { + "epoch": 0.98, + "learning_rate": 2.394810425534022e-08, + "loss": 0.4691, + "step": 8563 + }, + { + "epoch": 0.98, + "learning_rate": 2.369276055407599e-08, + "loss": 0.4614, + "step": 8564 + }, + { + "epoch": 0.98, + "learning_rate": 
2.3438783829635714e-08, + "loss": 0.4483, + "step": 8565 + }, + { + "epoch": 0.98, + "learning_rate": 2.318617411682156e-08, + "loss": 0.4659, + "step": 8566 + }, + { + "epoch": 0.98, + "learning_rate": 2.2934931450245833e-08, + "loss": 0.4533, + "step": 8567 + }, + { + "epoch": 0.98, + "learning_rate": 2.2685055864333227e-08, + "loss": 0.4534, + "step": 8568 + }, + { + "epoch": 0.98, + "learning_rate": 2.2436547393323017e-08, + "loss": 0.4475, + "step": 8569 + }, + { + "epoch": 0.98, + "learning_rate": 2.218940607126685e-08, + "loss": 0.4378, + "step": 8570 + }, + { + "epoch": 0.98, + "learning_rate": 2.1943631932028752e-08, + "loss": 0.4555, + "step": 8571 + }, + { + "epoch": 0.98, + "learning_rate": 2.169922500928512e-08, + "loss": 0.4596, + "step": 8572 + }, + { + "epoch": 0.98, + "learning_rate": 2.1456185336524714e-08, + "loss": 0.4347, + "step": 8573 + }, + { + "epoch": 0.98, + "learning_rate": 2.1214512947048684e-08, + "loss": 0.4469, + "step": 8574 + }, + { + "epoch": 0.98, + "learning_rate": 2.097420787397275e-08, + "loss": 0.4515, + "step": 8575 + }, + { + "epoch": 0.98, + "learning_rate": 2.0735270150223917e-08, + "loss": 0.4495, + "step": 8576 + }, + { + "epoch": 0.98, + "learning_rate": 2.0497699808542658e-08, + "loss": 0.4537, + "step": 8577 + }, + { + "epoch": 0.98, + "learning_rate": 2.0261496881479605e-08, + "loss": 0.4443, + "step": 8578 + }, + { + "epoch": 0.98, + "learning_rate": 2.002666140140108e-08, + "loss": 0.4546, + "step": 8579 + }, + { + "epoch": 0.98, + "learning_rate": 1.979319340048469e-08, + "loss": 0.4571, + "step": 8580 + }, + { + "epoch": 0.98, + "learning_rate": 1.956109291072039e-08, + "loss": 0.4424, + "step": 8581 + }, + { + "epoch": 0.98, + "learning_rate": 1.9330359963910527e-08, + "loss": 0.4511, + "step": 8582 + }, + { + "epoch": 0.98, + "learning_rate": 1.910099459167314e-08, + "loss": 0.4563, + "step": 8583 + }, + { + "epoch": 0.98, + "learning_rate": 1.8872996825433086e-08, + "loss": 0.4414, + "step": 8584 + }, + { + 
"epoch": 0.98, + "learning_rate": 1.864636669643427e-08, + "loss": 0.4554, + "step": 8585 + }, + { + "epoch": 0.98, + "learning_rate": 1.8421104235727406e-08, + "loss": 0.4389, + "step": 8586 + }, + { + "epoch": 0.98, + "learning_rate": 1.8197209474180023e-08, + "loss": 0.4647, + "step": 8587 + }, + { + "epoch": 0.98, + "learning_rate": 1.7974682442470915e-08, + "loss": 0.4403, + "step": 8588 + }, + { + "epoch": 0.98, + "learning_rate": 1.775352317109014e-08, + "loss": 0.4513, + "step": 8589 + }, + { + "epoch": 0.98, + "learning_rate": 1.7533731690342338e-08, + "loss": 0.4451, + "step": 8590 + }, + { + "epoch": 0.98, + "learning_rate": 1.7315308030342314e-08, + "loss": 0.4486, + "step": 8591 + }, + { + "epoch": 0.98, + "learning_rate": 1.7098252221021683e-08, + "loss": 0.4594, + "step": 8592 + }, + { + "epoch": 0.98, + "learning_rate": 1.6882564292119984e-08, + "loss": 0.4419, + "step": 8593 + }, + { + "epoch": 0.98, + "learning_rate": 1.666824427319136e-08, + "loss": 0.4521, + "step": 8594 + }, + { + "epoch": 0.98, + "learning_rate": 1.6455292193603424e-08, + "loss": 0.4343, + "step": 8595 + }, + { + "epoch": 0.98, + "learning_rate": 1.624370808253506e-08, + "loss": 0.4542, + "step": 8596 + }, + { + "epoch": 0.98, + "learning_rate": 1.6033491968976412e-08, + "loss": 0.4468, + "step": 8597 + }, + { + "epoch": 0.98, + "learning_rate": 1.5824643881734438e-08, + "loss": 0.4459, + "step": 8598 + }, + { + "epoch": 0.98, + "learning_rate": 1.561716384942402e-08, + "loss": 0.4632, + "step": 8599 + }, + { + "epoch": 0.98, + "learning_rate": 1.541105190047465e-08, + "loss": 0.4459, + "step": 8600 + }, + { + "epoch": 0.98, + "learning_rate": 1.5206308063129282e-08, + "loss": 0.4326, + "step": 8601 + }, + { + "epoch": 0.98, + "learning_rate": 1.5002932365442148e-08, + "loss": 0.4464, + "step": 8602 + }, + { + "epoch": 0.98, + "learning_rate": 1.480092483527984e-08, + "loss": 0.4586, + "step": 8603 + }, + { + "epoch": 0.98, + "learning_rate": 1.4600285500322442e-08, + "loss": 
0.4658, + "step": 8604 + }, + { + "epoch": 0.98, + "learning_rate": 1.4401014388061296e-08, + "loss": 0.483, + "step": 8605 + }, + { + "epoch": 0.98, + "learning_rate": 1.4203111525801228e-08, + "loss": 0.4419, + "step": 8606 + }, + { + "epoch": 0.98, + "learning_rate": 1.4006576940659433e-08, + "loss": 0.4547, + "step": 8607 + }, + { + "epoch": 0.98, + "learning_rate": 1.3811410659565483e-08, + "loss": 0.4322, + "step": 8608 + }, + { + "epoch": 0.98, + "learning_rate": 1.3617612709262428e-08, + "loss": 0.4459, + "step": 8609 + }, + { + "epoch": 0.98, + "learning_rate": 1.3425183116303475e-08, + "loss": 0.4408, + "step": 8610 + }, + { + "epoch": 0.98, + "learning_rate": 1.3234121907056418e-08, + "loss": 0.4458, + "step": 8611 + }, + { + "epoch": 0.98, + "learning_rate": 1.3044429107700319e-08, + "loss": 0.4581, + "step": 8612 + }, + { + "epoch": 0.98, + "learning_rate": 1.2856104744228826e-08, + "loss": 0.4792, + "step": 8613 + }, + { + "epoch": 0.98, + "learning_rate": 1.2669148842444634e-08, + "loss": 0.4454, + "step": 8614 + }, + { + "epoch": 0.98, + "learning_rate": 1.248356142796725e-08, + "loss": 0.4506, + "step": 8615 + }, + { + "epoch": 0.98, + "learning_rate": 1.2299342526224112e-08, + "loss": 0.4594, + "step": 8616 + }, + { + "epoch": 0.98, + "learning_rate": 1.211649216245836e-08, + "loss": 0.4516, + "step": 8617 + }, + { + "epoch": 0.98, + "learning_rate": 1.1935010361724397e-08, + "loss": 0.4324, + "step": 8618 + }, + { + "epoch": 0.99, + "learning_rate": 1.1754897148889e-08, + "loss": 0.4501, + "step": 8619 + }, + { + "epoch": 0.99, + "learning_rate": 1.1576152548631314e-08, + "loss": 0.45, + "step": 8620 + }, + { + "epoch": 0.99, + "learning_rate": 1.1398776585445082e-08, + "loss": 0.4689, + "step": 8621 + }, + { + "epoch": 0.99, + "learning_rate": 1.1222769283633083e-08, + "loss": 0.4379, + "step": 8622 + }, + { + "epoch": 0.99, + "learning_rate": 1.1048130667312695e-08, + "loss": 0.449, + "step": 8623 + }, + { + "epoch": 0.99, + "learning_rate": 
1.0874860760413664e-08, + "loss": 0.4387, + "step": 8624 + }, + { + "epoch": 0.99, + "learning_rate": 1.0702959586678108e-08, + "loss": 0.4623, + "step": 8625 + }, + { + "epoch": 0.99, + "learning_rate": 1.0532427169659409e-08, + "loss": 0.4342, + "step": 8626 + }, + { + "epoch": 0.99, + "learning_rate": 1.0363263532724433e-08, + "loss": 0.4405, + "step": 8627 + }, + { + "epoch": 0.99, + "learning_rate": 1.0195468699052413e-08, + "loss": 0.457, + "step": 8628 + }, + { + "epoch": 0.99, + "learning_rate": 1.0029042691636071e-08, + "loss": 0.4239, + "step": 8629 + }, + { + "epoch": 0.99, + "learning_rate": 9.863985533278275e-09, + "loss": 0.4434, + "step": 8630 + }, + { + "epoch": 0.99, + "learning_rate": 9.700297246596491e-09, + "loss": 0.461, + "step": 8631 + }, + { + "epoch": 0.99, + "learning_rate": 9.537977854018332e-09, + "loss": 0.4584, + "step": 8632 + }, + { + "epoch": 0.99, + "learning_rate": 9.377027377786007e-09, + "loss": 0.4492, + "step": 8633 + }, + { + "epoch": 0.99, + "learning_rate": 9.217445839952988e-09, + "loss": 0.4512, + "step": 8634 + }, + { + "epoch": 0.99, + "learning_rate": 9.059233262386225e-09, + "loss": 0.4474, + "step": 8635 + }, + { + "epoch": 0.99, + "learning_rate": 8.902389666765044e-09, + "loss": 0.4534, + "step": 8636 + }, + { + "epoch": 0.99, + "learning_rate": 8.746915074577811e-09, + "loss": 0.4459, + "step": 8637 + }, + { + "epoch": 0.99, + "learning_rate": 8.592809507129706e-09, + "loss": 0.4423, + "step": 8638 + }, + { + "epoch": 0.99, + "learning_rate": 8.440072985537174e-09, + "loss": 0.4627, + "step": 8639 + }, + { + "epoch": 0.99, + "learning_rate": 8.288705530727915e-09, + "loss": 0.4505, + "step": 8640 + }, + { + "epoch": 0.99, + "learning_rate": 8.138707163442005e-09, + "loss": 0.4333, + "step": 8641 + }, + { + "epoch": 0.99, + "learning_rate": 7.990077904234117e-09, + "loss": 0.4511, + "step": 8642 + }, + { + "epoch": 0.99, + "learning_rate": 7.84281777346796e-09, + "loss": 0.457, + "step": 8643 + }, + { + "epoch": 
0.99, + "learning_rate": 7.696926791322946e-09, + "loss": 0.4234, + "step": 8644 + }, + { + "epoch": 0.99, + "learning_rate": 7.552404977788641e-09, + "loss": 0.4557, + "step": 8645 + }, + { + "epoch": 0.99, + "learning_rate": 7.409252352668095e-09, + "loss": 0.444, + "step": 8646 + }, + { + "epoch": 0.99, + "learning_rate": 7.267468935575617e-09, + "loss": 0.455, + "step": 8647 + }, + { + "epoch": 0.99, + "learning_rate": 7.12705474594011e-09, + "loss": 0.4692, + "step": 8648 + }, + { + "epoch": 0.99, + "learning_rate": 6.988009803000628e-09, + "loss": 0.4765, + "step": 8649 + }, + { + "epoch": 0.99, + "learning_rate": 6.8503341258086e-09, + "loss": 0.4445, + "step": 8650 + }, + { + "epoch": 0.99, + "learning_rate": 6.714027733230044e-09, + "loss": 0.4319, + "step": 8651 + }, + { + "epoch": 0.99, + "learning_rate": 6.579090643942243e-09, + "loss": 0.4481, + "step": 8652 + }, + { + "epoch": 0.99, + "learning_rate": 6.4455228764326305e-09, + "loss": 0.4501, + "step": 8653 + }, + { + "epoch": 0.99, + "learning_rate": 6.3133244490043434e-09, + "loss": 0.4475, + "step": 8654 + }, + { + "epoch": 0.99, + "learning_rate": 6.18249537977178e-09, + "loss": 0.4532, + "step": 8655 + }, + { + "epoch": 0.99, + "learning_rate": 6.053035686661712e-09, + "loss": 0.4401, + "step": 8656 + }, + { + "epoch": 0.99, + "learning_rate": 5.924945387411063e-09, + "loss": 0.4532, + "step": 8657 + }, + { + "epoch": 0.99, + "learning_rate": 5.798224499572458e-09, + "loss": 0.4334, + "step": 8658 + }, + { + "epoch": 0.99, + "learning_rate": 5.672873040509786e-09, + "loss": 0.4682, + "step": 8659 + }, + { + "epoch": 0.99, + "learning_rate": 5.548891027398195e-09, + "loss": 0.4225, + "step": 8660 + }, + { + "epoch": 0.99, + "learning_rate": 5.426278477226321e-09, + "loss": 0.4527, + "step": 8661 + }, + { + "epoch": 0.99, + "learning_rate": 5.305035406795167e-09, + "loss": 0.4467, + "step": 8662 + }, + { + "epoch": 0.99, + "learning_rate": 5.185161832718111e-09, + "loss": 0.4284, + "step": 8663 + 
}, + { + "epoch": 0.99, + "learning_rate": 5.0666577714186815e-09, + "loss": 0.4373, + "step": 8664 + }, + { + "epoch": 0.99, + "learning_rate": 4.949523239136112e-09, + "loss": 0.4694, + "step": 8665 + }, + { + "epoch": 0.99, + "learning_rate": 4.833758251919785e-09, + "loss": 0.4355, + "step": 8666 + }, + { + "epoch": 0.99, + "learning_rate": 4.7193628256325676e-09, + "loss": 0.4348, + "step": 8667 + }, + { + "epoch": 0.99, + "learning_rate": 4.606336975948589e-09, + "loss": 0.4469, + "step": 8668 + }, + { + "epoch": 0.99, + "learning_rate": 4.494680718355459e-09, + "loss": 0.4447, + "step": 8669 + }, + { + "epoch": 0.99, + "learning_rate": 4.384394068153164e-09, + "loss": 0.4434, + "step": 8670 + }, + { + "epoch": 0.99, + "learning_rate": 4.275477040451836e-09, + "loss": 0.453, + "step": 8671 + }, + { + "epoch": 0.99, + "learning_rate": 4.167929650176206e-09, + "loss": 0.4588, + "step": 8672 + }, + { + "epoch": 0.99, + "learning_rate": 4.061751912063372e-09, + "loss": 0.4509, + "step": 8673 + }, + { + "epoch": 0.99, + "learning_rate": 3.956943840661698e-09, + "loss": 0.4543, + "step": 8674 + }, + { + "epoch": 0.99, + "learning_rate": 3.853505450331918e-09, + "loss": 0.4426, + "step": 8675 + }, + { + "epoch": 0.99, + "learning_rate": 3.751436755247139e-09, + "loss": 0.4451, + "step": 8676 + }, + { + "epoch": 0.99, + "learning_rate": 3.650737769393953e-09, + "loss": 0.4378, + "step": 8677 + }, + { + "epoch": 0.99, + "learning_rate": 3.5514085065690984e-09, + "loss": 0.4355, + "step": 8678 + }, + { + "epoch": 0.99, + "learning_rate": 3.4534489803850215e-09, + "loss": 0.4543, + "step": 8679 + }, + { + "epoch": 0.99, + "learning_rate": 3.3568592042620974e-09, + "loss": 0.4409, + "step": 8680 + }, + { + "epoch": 0.99, + "learning_rate": 3.2616391914364056e-09, + "loss": 0.4628, + "step": 8681 + }, + { + "epoch": 0.99, + "learning_rate": 3.167788954954176e-09, + "loss": 0.4445, + "step": 8682 + }, + { + "epoch": 0.99, + "learning_rate": 3.075308507677344e-09, + "loss": 
0.4646, + "step": 8683 + }, + { + "epoch": 0.99, + "learning_rate": 2.9841978622746624e-09, + "loss": 0.4466, + "step": 8684 + }, + { + "epoch": 0.99, + "learning_rate": 2.894457031232811e-09, + "loss": 0.4682, + "step": 8685 + }, + { + "epoch": 0.99, + "learning_rate": 2.8060860268475097e-09, + "loss": 0.4413, + "step": 8686 + }, + { + "epoch": 0.99, + "learning_rate": 2.7190848612279606e-09, + "loss": 0.4405, + "step": 8687 + }, + { + "epoch": 0.99, + "learning_rate": 2.6334535462935184e-09, + "loss": 0.46, + "step": 8688 + }, + { + "epoch": 0.99, + "learning_rate": 2.54919209377924e-09, + "loss": 0.454, + "step": 8689 + }, + { + "epoch": 0.99, + "learning_rate": 2.4663005152314455e-09, + "loss": 0.4464, + "step": 8690 + }, + { + "epoch": 0.99, + "learning_rate": 2.384778822006606e-09, + "loss": 0.4624, + "step": 8691 + }, + { + "epoch": 0.99, + "learning_rate": 2.304627025274675e-09, + "loss": 0.4351, + "step": 8692 + }, + { + "epoch": 0.99, + "learning_rate": 2.225845136019089e-09, + "loss": 0.4529, + "step": 8693 + }, + { + "epoch": 0.99, + "learning_rate": 2.148433165035657e-09, + "loss": 0.4624, + "step": 8694 + }, + { + "epoch": 0.99, + "learning_rate": 2.0723911229303396e-09, + "loss": 0.4256, + "step": 8695 + }, + { + "epoch": 0.99, + "learning_rate": 1.9977190201225793e-09, + "loss": 0.4375, + "step": 8696 + }, + { + "epoch": 0.99, + "learning_rate": 1.924416866844192e-09, + "loss": 0.4619, + "step": 8697 + }, + { + "epoch": 0.99, + "learning_rate": 1.8524846731404755e-09, + "loss": 0.4525, + "step": 8698 + }, + { + "epoch": 0.99, + "learning_rate": 1.7819224488657695e-09, + "loss": 0.4452, + "step": 8699 + }, + { + "epoch": 0.99, + "learning_rate": 1.7127302036901162e-09, + "loss": 0.4516, + "step": 8700 + } + ], + "logging_steps": 1.0, + "max_steps": 8750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git 
a/checkpoint-8700/vision_tower/config.json b/checkpoint-8700/vision_tower/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f3069766fe18bd3ee4f819b8e25b19de040b340d --- /dev/null +++ b/checkpoint-8700/vision_tower/config.json @@ -0,0 +1,19 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/vision_tower", + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "patch_size": 14, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/checkpoint-8700/vision_tower/model.safetensors b/checkpoint-8700/vision_tower/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dbcf9ceba10f605a5736219819e451d248c567e --- /dev/null +++ b/checkpoint-8700/vision_tower/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519994db5967bf1b5ab8279312b3afa54537ee8d8e913dd53d213b02b7eea5ea +size 856506120 diff --git a/checkpoint-8700/vision_tower/preprocessor_config.json b/checkpoint-8700/vision_tower/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f13134ed29056f82f3ab7e0246f0ab973e7ecf3 --- /dev/null +++ b/checkpoint-8700/vision_tower/preprocessor_config.json @@ -0,0 +1,24 @@ +{ + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "SiglipImageProcessor", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "SiglipProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } +} 
diff --git a/checkpoint-8700/zero_to_fp32.py b/checkpoint-8700/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c5246ff52274e1d6142001ccf085186d3545ce57 --- /dev/null +++ b/checkpoint-8700/zero_to_fp32.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage == 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
def parse_model_states(files):
    """
    Load every model states file and wrap its relevant entries in a ``zero_model_state``.

    Args:
        - ``files``: paths of the ``*_model_states.pt`` files to read

    Returns:
        - list with one ``zero_model_state`` per input file, in input order

    Raises:
        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry
    """
    parsed = []
    for path in files:
        # NOTE(review): torch.load unpickles the file - only use on trusted checkpoints.
        sd = torch.load(path, map_location=device)

        if BUFFER_NAMES not in sd:
            raise ValueError(f"{path} is not a model state checkpoint")
        buffer_names = sd[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {}
        for key, tensor in sd["module"].items():
            if key in buffer_names:
                buffers[key] = tensor.float()
        param_shapes = sd[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        param_names = [name for group in param_shapes for name in group.keys()]

        # update with frozen parameters
        frozen_param_shapes = sd.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params: stored as [key, value] pairs
        shared_params = [[left, right] for left, right in sd["shared_params"].items()]

        parsed.append(
            zero_model_state(buffers=buffers,
                             param_shapes=param_shapes,
                             shared_params=shared_params,
                             ds_version=sd.get(DS_VERSION, None),
                             frozen_param_shapes=frozen_param_shapes,
                             frozen_param_fragments=sd.get(FROZEN_PARAM_FRAGMENTS, None)))

    return parsed


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer states file and pull out the fp32 partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups``
          has one entry per file: the stored list of groups for stage 2, or the
          groups concatenated into a single tensor for stage 3

    Raises:
        - ``ValueError``: the first file is not a zero checkpoint, the number of files
          does not match the recorded partition count, or the stage is not 2 or 3
    """
    total_files = len(files)
    state_dicts = [torch.load(path, map_location=device) for path in files]

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # Dispatch on the stage; any other stage falls through and yields None.
    if zero_stage == 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
    if zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy rank 0's frozen parameter fragments into ``state_dict`` (ZeRO-2).

    Does nothing when rank 0 recorded no frozen parameter shapes. Only the first
    entry of ``zero_model_states`` is consulted.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = rank0.frozen_param_shapes
    frozen_param_fragments = rank0.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment is stored as-is, no reshaping or concatenation happens here
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters from the per-rank fp32 partitions (ZeRO-2).

    For every param group the partitions of all ranks are concatenated into one flat
    vector, which is then sliced parameter by parameter following rank 0's
    ``param_shapes``; each slice is viewed with the recorded shape and stored in
    ``state_dict``.

    Raises:
        - ``ValueError``: the aligned number of consumed elements does not match the
          aligned size of a group's flat vector
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum([flat.numel() for flat in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            param_slice = flat_vector.narrow(0, offset, unpartitioned_numel)
            state_dict[name] = param_slice.view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """
    Assemble the consolidated state dict for a ZeRO-2 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen params, trainable
    params, and finally aliases for shared params whose source is present.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair is [alias_name, source_name]
    for alias_name, source_name in ((pair[0], pair[1]) for pair in zero_model_states[0].shared_params):
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements splits across ``world_size`` ranks.

    Returns:
        - tuple ``(partitioned_numel, padding_numel)``: the per-rank element count
          (rounded up) and the number of elements missing to reach the next
          multiple of ``world_size`` (0 when it already divides evenly)
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters from the per-rank fragments (ZeRO-3).

    For each name in rank 0's ``frozen_param_shapes`` the fragments of all model
    states are concatenated, trimmed to the parameter's element count and viewed
    with the recorded shape. Does nothing when rank 0 recorded no frozen shapes.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = rank0.frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # estimate only: rank 0's fragment sizes multiplied by the number of ranks
    avail_numel = sum([p.numel() for p in rank0.frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in rank0.frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        joined = torch.cat(param_frags, 0)
        # drop whatever trails the real elements before restoring the shape
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters from the per-rank flat fp32 tensors (ZeRO-3).

    Each parameter occupies the same ``[offset, offset + partitioned_numel)`` window in
    every rank's flat tensor; the windows are concatenated in rank order, trimmed to
    the parameter's element count and viewed with the recorded shape.

    Raises:
        - ``ValueError``: the consumed element count times ``world_size`` differs from
          the size of rank 0's flat tensor times ``world_size``
    """
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {}
    for group_shapes in zero_model_states[0].param_shapes:
        param_shapes.update(group_shapes.items())

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        rank_windows = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
        joined = torch.cat(rank_windows, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset was tracked per rank; scale it to compare against the all-rank total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """
    Assemble the consolidated state dict for a ZeRO-3 checkpoint.

    Entries are added in this order: rank 0's buffers, frozen params, trainable
    params, and finally aliases for shared params whose source is present.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair is [alias_name, source_name]
    for alias_name, source_name in ((pair[0], pair[1]) for pair in zero_model_states[0].shared_params):
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Build a single consolidated fp32 ``state_dict`` from a ZeRO 2 or 3 checkpoint, suitable for
    ``load_state_dict()`` and for training without DeepSpeed or sharing, e.g. via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g. ``global_step14``.
          When omitted, the tag is read from the file named ``latest`` inside ``checkpoint_dir``.

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and there is no ``latest`` file
        - ``FileNotFoundError``: ``checkpoint_dir/tag`` is not a directory

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` and write it with
    ``torch.save``; the result can be read back with ``torch.load(file)`` + ``load_state_dict()``
    and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into the provided model (non-strict)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    # strict=False: key mismatches between model and state dict are not treated as errors
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional debug switch.
    cli = argparse.ArgumentParser()
    cli.add_argument("checkpoint_dir",
                     type=str,
                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    cli.add_argument("output_file",
                     type=str,
                     help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = cli.parse_args()

    # rebinding the module-level flag turns on the ``if debug:`` prints in the helpers
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..57a6568a8e25a58f7b5847d1962555ad6f3fc8b6 --- /dev/null +++ b/config.json @@ -0,0 +1,253 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o", + "architectures": [ + "LlavaLlamaModel" + ], + "drop_path_rate": 0.0, + "hidden_size": 2560, + "image_aspect_ratio": "resize", + "interpolate_mode": "linear", + "llm_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/llm", + "add_cross_attention": false, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2560, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6912, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 4096, + "min_length": 0, + "model_max_length": 4096, + "model_type": "llama", + "no_repeat_ngram_size": 0, + "num_attention_heads": 20, + "num_beam_groups": 1, + "num_beams": 
1, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "pretraining_tp": 1, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 32000 + }, + "mm_hidden_size": 1152, + "mm_projector_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/mm_projector", + "add_cross_attention": false, + "architectures": [ + "MultimodalProjector" + ], + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + 
"no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "mm_projector_lr": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "cls_patch", + "mm_vision_select_layer": -2, + "model_dtype": "torch.bfloat16", + "model_type": "llava_llama", + "num_video_frames": 8, + "resume_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o", + "s2": false, + "s2_max_split_size": 336, + "s2_scales": "336,672,1008", + "transformers_version": "4.36.2", + "tune_language_model": true, + "tune_mm_projector": true, + "tune_vision_tower": true, + "vision_resolution": -1, + "vision_tower_cfg": { + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/vision_tower", + "add_cross_attention": false, + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": 
null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 384, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + } +} diff --git a/llm/config.json b/llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e45ce52b8f1ddeb16b607192cd41960597261d65 --- /dev/null +++ b/llm/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/llm", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 6912, + "max_position_embeddings": 4096, + 
"model_max_length": 4096, + "model_type": "llama", + "num_attention_heads": 20, + "num_hidden_layers": 32, + "num_key_value_heads": 20, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 4096, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/llm/generation_config.json b/llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f --- /dev/null +++ b/llm/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.36.2" +} diff --git a/llm/model-00001-of-00002.safetensors b/llm/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd8f5f5c86b6db5fb26324f8a54f6cd8537b2286 --- /dev/null +++ b/llm/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fab9c810587c2af4f61c1e03000f15311da57fd0dddd3ba3b883fd833ace06d +size 4974521464 diff --git a/llm/model-00002-of-00002.safetensors b/llm/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..11e4cc1dd0c1dfbe81f863f2a60e1c159ebd03bc --- /dev/null +++ b/llm/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118dfd01a6298b4c909569d0023f6a2ca47b76d58fae5356ca5ff542de4965c1 +size 428632856 diff --git a/llm/model.safetensors.index.json b/llm/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b173c9ac8194749df58c92051618c0ff74c4c20 --- /dev/null +++ b/llm/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 5403120640 + }, + "weight_map": { + 
"lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/llm/special_tokens_map.json b/llm/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/llm/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, 
+ "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/llm/tokenizer.model b/llm/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3b7eab905db502ae7629c8a3c1f8412a3178c4c2 --- /dev/null +++ b/llm/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aedb3582ecda9fa99ee9242c17a9658f6744db083ee6ebdc8fb14857f84d220 +size 499723 diff --git a/llm/tokenizer_config.json b/llm/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..47ab96cd62cc374653a0ea0fb77f9457e0f53481 --- /dev/null +++ b/llm/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 4096, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/mm_projector/config.json b/mm_projector/config.json new file mode 100644 index 0000000000000000000000000000000000000000..282f46e907bcb9008e163afeef4afacadc984d55 --- /dev/null +++ b/mm_projector/config.json @@ -0,0 +1,10 @@ +{ + "_name_or_path": 
"/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/mm_projector", + "architectures": [ + "MultimodalProjector" + ], + "mm_projector_type": "mlp_downsample", + "model_type": "v2l_projector", + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/mm_projector/model.safetensors b/mm_projector/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9b84a325804a4fa9aa548a85df187fb3bfa5899 --- /dev/null +++ b/mm_projector/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40c421c3763c5eea8ddc38b8d8189899bd3e34ddccc58daf6261719e7f7a7d5a +size 36729360 diff --git a/terminal.log b/terminal.log new file mode 100644 index 0000000000000000000000000000000000000000..cac040cf76feae0b40c07586b460418fad454730 --- /dev/null +++ b/terminal.log @@ -0,0 +1,56655 @@ +srun: job 6684042 queued and waiting for resources +srun: job 6684042 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2105 +JobID: 6684042 | Full list: batch-block1-2105 batch-block1-0084 +NETWORK=Efficient-Large-Model/VILA1.5-3b +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2105 +JobID: 6684042 | Full list: batch-block1-2105 batch-block1-0084 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,601] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,602] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] 
[real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,702] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:48,703] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] 
[comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:49,967] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:49,967] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-09 21:44:50,169] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-09 
21:44:50,169] [INFO] [comm.py:594:init_distributed] cdb=None +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. 
Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. 
+ warnings.warn( + Fetching 17 files: 0%| | 0/17 [00:00\nWould this person be more likely to be a type a or b person?\nAnswer the question using a single word or phrase.'}, {'from': 'gpt', 'value': ''}]] (ignored) + 3%|▎ | 244/8750 [24:14<13:32:04, 5.73s/it] {'loss': 0.5141, 'learning_rate': 1.855513307984791e-05, 'epoch': 0.03} + 3%|▎ | 244/8750 [24:14<13:32:04, 5.73s/it] 3%|▎ | 244/8750 [24:07<13:32:03, 5.73s/it] {'loss': 0.5141, 'learning_rate': 1.855513307984791e-05, 'epoch': 0.03} + 3%|▎ | 244/8750 [24:07<13:32:03, 5.73s/it] 3%|▎ | 245/8750 [24:19<13:27:37, 5.70s/it] 3%|▎ | 245/8750 [24:13<13:27:37, 5.70s/it] {'loss': 0.5208, 'learning_rate': 1.8631178707224337e-05, 'epoch': 0.03} + 3%|▎ | 245/8750 [24:19<13:27:37, 5.70s/it] {'loss': 0.5208, 'learning_rate': 1.8631178707224337e-05, 'epoch': 0.03} + 3%|▎ | 245/8750 [24:13<13:27:37, 5.70s/it] 3%|▎ | 246/8750 [24:19<13:30:08, 5.72s/it] 3%|▎ | 246/8750 [24:25<13:30:09, 5.72s/it] {'loss': 0.5091, 'learning_rate': 1.870722433460076e-05, 'epoch': 0.03} + 3%|▎ | 246/8750 [24:25<13:30:09, 5.72s/it] {'loss': 0.5091, 'learning_rate': 1.870722433460076e-05, 'epoch': 0.03} + 3%|▎ | 246/8750 [24:19<13:30:08, 5.72s/it] 3%|▎ | 247/8750 [24:31<13:47:06, 5.84s/it] 3%|▎ | 247/8750 [24:25<13:47:07, 5.84s/it] {'loss': 0.5496, 'learning_rate': 1.8783269961977187e-05, 'epoch': 0.03} + 3%|▎ | 247/8750 [24:31<13:47:06, 5.84s/it] {'loss': 0.5496, 'learning_rate': 1.8783269961977187e-05, 'epoch': 0.03} + 3%|▎ | 247/8750 [24:25<13:47:07, 5.84s/it] 3%|▎ | 248/8750 [24:31<13:49:49, 5.86s/it] 3%|▎ | 248/8750 [24:37<13:49:50, 5.86s/it] {'loss': 0.518, 'learning_rate': 1.8859315589353614e-05, 'epoch': 0.03} + 3%|▎ | 248/8750 [24:37<13:49:50, 5.86s/it] {'loss': 0.518, 'learning_rate': 1.8859315589353614e-05, 'epoch': 0.03} + 3%|▎ | 248/8750 [24:31<13:49:49, 5.86s/it] 3%|▎ | 249/8750 [24:43<13:44:34, 5.82s/it] 3%|▎ | 249/8750 [24:36<13:44:35, 5.82s/it] {'loss': 0.5337, 'learning_rate': 1.893536121673004e-05, 'epoch': 0.03} + 3%|▎ | 249/8750 
[24:43<13:44:34, 5.82s/it] {'loss': 0.5337, 'learning_rate': 1.893536121673004e-05, 'epoch': 0.03} + 3%|▎ | 249/8750 [24:36<13:44:35, 5.82s/it]15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 3%|▎ | 250/8750 [24:42<13:34:07, 5.75s/it]3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5099, 'learning_rate': 1.9011406844106467e-05, 'epoch': 0.03} + 3%|▎ | 250/8750 [24:42<13:34:07, 5.75s/it]11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 3%|▎ | 250/8750 [24:49<13:34:06, 5.75s/it]14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.5099, 'learning_rate': 1.9011406844106467e-05, 'epoch': 0.03} + 3%|▎ | 250/8750 [24:49<13:34:06, 5.75s/it] 3%|▎ | 251/8750 [24:54<13:27:23, 5.70s/it] 3%|▎ | 251/8750 [24:48<13:27:24, 5.70s/it] {'loss': 0.5258, 'learning_rate': 1.908745247148289e-05, 'epoch': 0.03} + 3%|▎ | 251/8750 [24:54<13:27:23, 5.70s/it] {'loss': 0.5258, 'learning_rate': 1.908745247148289e-05, 'epoch': 0.03} + 3%|▎ | 251/8750 [24:48<13:27:24, 5.70s/it] 3%|▎ | 252/8750 [25:00<13:33:10, 5.74s/it] 3%|▎ | 252/8750 [24:53<13:33:11, 5.74s/it] {'loss': 0.5225, 'learning_rate': 1.9163498098859318e-05, 'epoch': 0.03} + 3%|▎ | 252/8750 [25:00<13:33:10, 5.74s/it] {'loss': 0.5225, 'learning_rate': 1.9163498098859318e-05, 'epoch': 0.03} + 3%|▎ | 252/8750 [24:53<13:33:11, 5.74s/it] 3%|▎ | 253/8750 [25:06<13:24:20, 5.68s/it] 3%|▎ | 253/8750 [24:59<13:24:20, 5.68s/it] {'loss': 0.5185, 'learning_rate': 1.9239543726235744e-05, 'epoch': 0.03} + 3%|▎ | 253/8750 [25:06<13:24:20, 5.68s/it] {'loss': 0.5185, 'learning_rate': 1.9239543726235744e-05, 'epoch': 0.03} + 3%|▎ | 253/8750 [24:59<13:24:20, 5.68s/it] 3%|▎ | 254/8750 [25:11<13:23:05, 5.67s/it] 3%|▎ | 254/8750 [25:05<13:23:06, 5.67s/it] {'loss': 0.527, 'learning_rate': 1.931558935361217e-05, 'epoch': 0.03} + 3%|▎ | 254/8750 [25:11<13:23:05, 5.67s/it] {'loss': 0.527, 'learning_rate': 1.931558935361217e-05, 'epoch': 0.03} + 3%|▎ | 254/8750 [25:05<13:23:06, 5.67s/it] 3%|▎ | 255/8750 [25:17<13:25:57, 5.69s/it] 3%|▎ | 255/8750 [25:10<13:25:56, 5.69s/it] {'loss': 0.5155, 'learning_rate': 1.9391634980988594e-05, 'epoch': 0.03} + 3%|▎ | 255/8750 [25:17<13:25:57, 5.69s/it] {'loss': 0.5155, 'learning_rate': 1.9391634980988594e-05, 'epoch': 0.03} + 3%|▎ | 255/8750 [25:10<13:25:56, 5.69s/it] 3%|▎ | 256/8750 [25:23<13:26:31, 5.70s/it] 3%|▎ | 256/8750 [25:16<13:26:30, 5.70s/it] {'loss': 0.5138, 'learning_rate': 1.946768060836502e-05, 'epoch': 0.03} + 3%|▎ | 256/8750 [25:23<13:26:31, 5.70s/it] {'loss': 0.5138, 'learning_rate': 1.946768060836502e-05, 'epoch': 0.03} 
+ 3%|▎ | 256/8750 [25:16<13:26:30, 5.70s/it] 3%|▎ | 257/8750 [25:28<13:22:24, 5.67s/it] 3%|▎ | 257/8750 [25:22<13:22:24, 5.67s/it] {'loss': 0.5309, 'learning_rate': 1.9543726235741448e-05, 'epoch': 0.03} + 3%|▎ | 257/8750 [25:28<13:22:24, 5.67s/it] {'loss': 0.5309, 'learning_rate': 1.9543726235741448e-05, 'epoch': 0.03} + 3%|▎ | 257/8750 [25:22<13:22:24, 5.67s/it] 3%|▎ | 258/8750 [25:27<13:18:00, 5.64s/it] 3%|▎ | 258/8750 [25:34<13:18:01, 5.64s/it] {'loss': 0.527, 'learning_rate': 1.961977186311787e-05, 'epoch': 0.03} + 3%|▎ | 258/8750 [25:34<13:18:01, 5.64s/it] {'loss': 0.527, 'learning_rate': 1.961977186311787e-05, 'epoch': 0.03} + 3%|▎ | 258/8750 [25:27<13:18:00, 5.64s/it] 3%|▎ | 259/8750 [25:40<13:22:48, 5.67s/it] 3%|▎ | 259/8750 [25:33<13:22:48, 5.67s/it] {'loss': 0.5223, 'learning_rate': 1.9695817490494298e-05, 'epoch': 0.03} + 3%|▎ | 259/8750 [25:40<13:22:48, 5.67s/it] {'loss': 0.5223, 'learning_rate': 1.9695817490494298e-05, 'epoch': 0.03} + 3%|▎ | 259/8750 [25:33<13:22:48, 5.67s/it] 3%|▎ | 260/8750 [25:45<13:28:59, 5.72s/it] 3%|▎ | 260/8750 [25:39<13:29:03, 5.72s/it] {'loss': 0.5452, 'learning_rate': 1.9771863117870725e-05, 'epoch': 0.03} + 3%|▎ | 260/8750 [25:45<13:28:59, 5.72s/it] {'loss': 0.5452, 'learning_rate': 1.9771863117870725e-05, 'epoch': 0.03} + 3%|▎ | 260/8750 [25:39<13:29:03, 5.72s/it] 3%|▎ | 261/8750 [25:51<13:25:15, 5.69s/it] 3%|▎ | 261/8750 [25:44<13:25:15, 5.69s/it] {'loss': 0.527, 'learning_rate': 1.984790874524715e-05, 'epoch': 0.03} + 3%|▎ | 261/8750 [25:51<13:25:15, 5.69s/it] {'loss': 0.527, 'learning_rate': 1.984790874524715e-05, 'epoch': 0.03} + 3%|▎ | 261/8750 [25:45<13:25:15, 5.69s/it] 3%|▎ | 262/8750 [25:57<13:26:54, 5.70s/it] 3%|▎ | 262/8750 [25:50<13:26:54, 5.70s/it] {'loss': 0.535, 'learning_rate': 1.9923954372623575e-05, 'epoch': 0.03} + 3%|▎ | 262/8750 [25:57<13:26:54, 5.70s/it] {'loss': 0.535, 'learning_rate': 1.9923954372623575e-05, 'epoch': 0.03} + 3%|▎ | 262/8750 [25:50<13:26:54, 5.70s/it] 3%|▎ | 263/8750 [26:02<13:19:24, 
5.65s/it] 3%|▎ | 263/8750 [25:56<13:19:24, 5.65s/it] {'loss': 0.5283, 'learning_rate': 2e-05, 'epoch': 0.03} + 3%|▎ | 263/8750 [26:02<13:19:24, 5.65s/it] {'loss': 0.5283, 'learning_rate': 2e-05, 'epoch': 0.03} + 3%|▎ | 263/8750 [25:56<13:19:24, 5.65s/it] 3%|▎ | 264/8750 [26:08<13:17:21, 5.64s/it] 3%|▎ | 264/8750 [26:01<13:17:20, 5.64s/it] {'loss': 0.5108, 'learning_rate': 1.9999999314888373e-05, 'epoch': 0.03} + 3%|▎ | 264/8750 [26:08<13:17:21, 5.64s/it] {'loss': 0.5108, 'learning_rate': 1.9999999314888373e-05, 'epoch': 0.03} + 3%|▎ | 264/8750 [26:01<13:17:20, 5.64s/it] 3%|▎ | 265/8750 [26:14<13:21:36, 5.67s/it] 3%|▎ | 265/8750 [26:07<13:21:36, 5.67s/it] {'loss': 0.5212, 'learning_rate': 1.9999997259553572e-05, 'epoch': 0.03} + 3%|▎ | 265/8750 [26:14<13:21:36, 5.67s/it] {'loss': 0.5212, 'learning_rate': 1.9999997259553572e-05, 'epoch': 0.03} + 3%|▎ | 265/8750 [26:07<13:21:36, 5.67s/it] 3%|▎ | 266/8750 [26:19<13:25:02, 5.69s/it] 3%|▎ | 266/8750 [26:13<13:25:01, 5.69s/it] {'loss': 0.5039, 'learning_rate': 1.9999993833995886e-05, 'epoch': 0.03} + 3%|▎ | 266/8750 [26:19<13:25:02, 5.69s/it] {'loss': 0.5039, 'learning_rate': 1.9999993833995886e-05, 'epoch': 0.03} + 3%|▎ | 266/8750 [26:13<13:25:01, 5.69s/it] 3%|▎ | 267/8750 [26:25<13:23:48, 5.69s/it] 3%|▎ | 267/8750 [26:19<13:23:48, 5.69s/it] {'loss': 0.5497, 'learning_rate': 1.9999989038215787e-05, 'epoch': 0.03} + 3%|▎ | 267/8750 [26:25<13:23:48, 5.69s/it] {'loss': 0.5497, 'learning_rate': 1.9999989038215787e-05, 'epoch': 0.03} + 3%|▎ | 267/8750 [26:19<13:23:48, 5.69s/it] 3%|▎ | 268/8750 [26:31<13:38:40, 5.79s/it] 3%|▎ | 268/8750 [26:25<13:38:41, 5.79s/it] {'loss': 0.5265, 'learning_rate': 1.9999982872213925e-05, 'epoch': 0.03} + 3%|▎ | 268/8750 [26:31<13:38:40, 5.79s/it] {'loss': 0.5265, 'learning_rate': 1.9999982872213925e-05, 'epoch': 0.03} + 3%|▎ | 268/8750 [26:25<13:38:41, 5.79s/it] 3%|▎ | 269/8750 [26:37<13:41:41, 5.81s/it] 3%|▎ | 269/8750 [26:30<13:41:42, 5.81s/it] {'loss': 0.5201, 'learning_rate': 
1.999997533599115e-05, 'epoch': 0.03} + 3%|▎ | 269/8750 [26:37<13:41:41, 5.81s/it] {'loss': 0.5201, 'learning_rate': 1.999997533599115e-05, 'epoch': 0.03} + 3%|▎ | 269/8750 [26:30<13:41:42, 5.81s/it] 3%|▎ | 270/8750 [26:43<13:35:53, 5.77s/it] 3%|▎ | 270/8750 [26:36<13:35:53, 5.77s/it] {'loss': 0.5279, 'learning_rate': 1.999996642954849e-05, 'epoch': 0.03} + 3%|▎ | 270/8750 [26:43<13:35:53, 5.77s/it] {'loss': 0.5279, 'learning_rate': 1.999996642954849e-05, 'epoch': 0.03} + 3%|▎ | 270/8750 [26:36<13:35:53, 5.77s/it] 3%|▎ | 271/8750 [26:48<13:27:29, 5.71s/it] 3%|▎ | 271/8750 [26:42<13:27:28, 5.71s/it] {'loss': 0.5313, 'learning_rate': 1.999995615288717e-05, 'epoch': 0.03} + 3%|▎ | 271/8750 [26:48<13:27:29, 5.71s/it] {'loss': 0.5313, 'learning_rate': 1.999995615288717e-05, 'epoch': 0.03} + 3%|▎ | 271/8750 [26:42<13:27:28, 5.71s/it] 3%|▎ | 272/8750 [26:54<13:24:53, 5.70s/it] 3%|▎ | 272/8750 [26:47<13:24:53, 5.70s/it] {'loss': 0.5108, 'learning_rate': 1.9999944506008594e-05, 'epoch': 0.03} + 3%|▎ | 272/8750 [26:54<13:24:53, 5.70s/it] {'loss': 0.5108, 'learning_rate': 1.9999944506008594e-05, 'epoch': 0.03} + 3%|▎ | 272/8750 [26:47<13:24:53, 5.70s/it] 3%|▎ | 273/8750 [26:59<13:17:07, 5.64s/it] 3%|▎ | 273/8750 [26:53<13:17:06, 5.64s/it] {'loss': 0.5274, 'learning_rate': 1.9999931488914366e-05, 'epoch': 0.03} + 3%|▎ | 273/8750 [26:59<13:17:07, 5.64s/it] {'loss': 0.5274, 'learning_rate': 1.9999931488914366e-05, 'epoch': 0.03} + 3%|▎ | 273/8750 [26:53<13:17:06, 5.64s/it] 3%|▎ | 274/8750 [27:05<13:15:47, 5.63s/it] 3%|▎ | 274/8750 [26:58<13:15:47, 5.63s/it] {'loss': 0.5071, 'learning_rate': 1.999991710160626e-05, 'epoch': 0.03} + 3%|▎ | 274/8750 [27:05<13:15:47, 5.63s/it] {'loss': 0.5071, 'learning_rate': 1.999991710160626e-05, 'epoch': 0.03} + 3%|▎ | 274/8750 [26:58<13:15:47, 5.63s/it] 3%|▎ | 275/8750 [27:11<13:21:25, 5.67s/it] 3%|▎ | 275/8750 [27:04<13:21:26, 5.67s/it] {'loss': 0.5226, 'learning_rate': 1.999990134408625e-05, 'epoch': 0.03} + 3%|▎ | 275/8750 [27:04<13:21:26, 
5.67s/it] {'loss': 0.5226, 'learning_rate': 1.999990134408625e-05, 'epoch': 0.03} + 3%|▎ | 275/8750 [27:11<13:21:25, 5.67s/it] 3%|▎ | 276/8750 [27:16<13:24:35, 5.70s/it] 3%|▎ | 276/8750 [27:10<13:24:38, 5.70s/it] {'loss': 0.5201, 'learning_rate': 1.99998842163565e-05, 'epoch': 0.03} + 3%|▎ | 276/8750 [27:16<13:24:35, 5.70s/it] {'loss': 0.5201, 'learning_rate': 1.99998842163565e-05, 'epoch': 0.03} + 3%|▎ | 276/8750 [27:10<13:24:38, 5.70s/it] 3%|▎ | 277/8750 [27:22<13:30:27, 5.74s/it] 3%|▎ | 277/8750 [27:16<13:30:26, 5.74s/it] {'loss': 0.5188, 'learning_rate': 1.9999865718419352e-05, 'epoch': 0.03} + 3%|▎ | 277/8750 [27:22<13:30:27, 5.74s/it] {'loss': 0.5188, 'learning_rate': 1.9999865718419352e-05, 'epoch': 0.03} + 3%|▎ | 277/8750 [27:16<13:30:26, 5.74s/it] 3%|▎ | 278/8750 [27:28<13:30:35, 5.74s/it] 3%|▎ | 278/8750 [27:22<13:30:34, 5.74s/it] {'loss': 0.5578, 'learning_rate': 1.999984585027734e-05, 'epoch': 0.03} + 3%|▎ | 278/8750 [27:28<13:30:35, 5.74s/it] {'loss': 0.5578, 'learning_rate': 1.999984585027734e-05, 'epoch': 0.03} + 3%|▎ | 278/8750 [27:22<13:30:34, 5.74s/it] 3%|▎ | 279/8750 [27:34<13:30:35, 5.74s/it] 3%|▎ | 279/8750 [27:27<13:30:34, 5.74s/it] {'loss': 0.5245, 'learning_rate': 1.999982461193319e-05, 'epoch': 0.03} + 3%|▎ | 279/8750 [27:34<13:30:35, 5.74s/it] {'loss': 0.5245, 'learning_rate': 1.999982461193319e-05, 'epoch': 0.03} + 3%|▎ | 279/8750 [27:27<13:30:34, 5.74s/it] 3%|▎ | 280/8750 [27:40<13:34:34, 5.77s/it] 3%|▎ | 280/8750 [27:33<13:34:33, 5.77s/it] {'loss': 0.5092, 'learning_rate': 1.999980200338981e-05, 'epoch': 0.03} + 3%|▎ | 280/8750 [27:40<13:34:34, 5.77s/it] {'loss': 0.5092, 'learning_rate': 1.999980200338981e-05, 'epoch': 0.03} + 3%|▎ | 280/8750 [27:33<13:34:33, 5.77s/it] 3%|▎ | 281/8750 [27:45<13:32:51, 5.76s/it] 3%|▎ | 281/8750 [27:39<13:32:52, 5.76s/it] {'loss': 0.5174, 'learning_rate': 1.9999778024650296e-05, 'epoch': 0.03} + 3%|▎ | 281/8750 [27:45<13:32:51, 5.76s/it] {'loss': 0.5174, 'learning_rate': 1.9999778024650296e-05, 'epoch': 
0.03} + 3%|▎ | 281/8750 [27:39<13:32:52, 5.76s/it] 3%|▎ | 282/8750 [27:45<13:34:15, 5.77s/it] 3%|▎ | 282/8750 [27:51<13:34:16, 5.77s/it] {'loss': 0.524, 'learning_rate': 1.9999752675717938e-05, 'epoch': 0.03} + 3%|▎ | 282/8750 [27:51<13:34:16, 5.77s/it] {'loss': 0.524, 'learning_rate': 1.9999752675717938e-05, 'epoch': 0.03} + 3%|▎ | 282/8750 [27:45<13:34:15, 5.77s/it] 3%|▎ | 283/8750 [27:57<13:28:10, 5.73s/it] 3%|▎ | 283/8750 [27:50<13:28:10, 5.73s/it] {'loss': 0.5193, 'learning_rate': 1.9999725956596204e-05, 'epoch': 0.03} + 3%|▎ | 283/8750 [27:57<13:28:10, 5.73s/it] {'loss': 0.5193, 'learning_rate': 1.9999725956596204e-05, 'epoch': 0.03} + 3%|▎ | 283/8750 [27:50<13:28:10, 5.73s/it] 3%|▎ | 284/8750 [28:02<13:23:26, 5.69s/it] 3%|▎ | 284/8750 [27:56<13:23:27, 5.69s/it] {'loss': 0.5026, 'learning_rate': 1.9999697867288764e-05, 'epoch': 0.03} + 3%|▎ | 284/8750 [28:02<13:23:26, 5.69s/it] {'loss': 0.5026, 'learning_rate': 1.9999697867288764e-05, 'epoch': 0.03} + 3%|▎ | 284/8750 [27:56<13:23:27, 5.69s/it] 3%|▎ | 285/8750 [28:08<13:21:57, 5.68s/it] 3%|▎ | 285/8750 [28:02<13:21:57, 5.68s/it] {'loss': 0.527, 'learning_rate': 1.999966840779946e-05, 'epoch': 0.03} + 3%|▎ | 285/8750 [28:08<13:21:57, 5.68s/it] {'loss': 0.527, 'learning_rate': 1.999966840779946e-05, 'epoch': 0.03} + 3%|▎ | 285/8750 [28:02<13:21:57, 5.68s/it] 3%|▎ | 286/8750 [28:14<13:28:37, 5.73s/it] 3%|▎ | 286/8750 [28:07<13:28:37, 5.73s/it] {'loss': 0.5295, 'learning_rate': 1.9999637578132328e-05, 'epoch': 0.03} + 3%|▎ | 286/8750 [28:14<13:28:37, 5.73s/it] {'loss': 0.5295, 'learning_rate': 1.9999637578132328e-05, 'epoch': 0.03} + 3%|▎ | 286/8750 [28:07<13:28:37, 5.73s/it] 3%|▎ | 287/8750 [28:20<13:37:20, 5.79s/it] 3%|▎ | 287/8750 [28:13<13:37:20, 5.79s/it] {'loss': 0.5041, 'learning_rate': 1.9999605378291593e-05, 'epoch': 0.03} + 3%|▎ | 287/8750 [28:20<13:37:20, 5.79s/it] {'loss': 0.5041, 'learning_rate': 1.9999605378291593e-05, 'epoch': 0.03} + 3%|▎ | 287/8750 [28:13<13:37:20, 5.79s/it] 3%|▎ | 288/8750 
[28:26<13:34:17, 5.77s/it] 3%|▎ | 288/8750 [28:19<13:34:17, 5.77s/it] {'loss': 0.5232, 'learning_rate': 1.999957180828167e-05, 'epoch': 0.03} + 3%|▎ | 288/8750 [28:26<13:34:17, 5.77s/it] {'loss': 0.5232, 'learning_rate': 1.999957180828167e-05, 'epoch': 0.03} + 3%|▎ | 288/8750 [28:19<13:34:17, 5.77s/it] 3%|▎ | 289/8750 [28:31<13:35:15, 5.78s/it] 3%|▎ | 289/8750 [28:25<13:35:15, 5.78s/it] {'loss': 0.5181, 'learning_rate': 1.999953686810716e-05, 'epoch': 0.03} + 3%|▎ | 289/8750 [28:31<13:35:15, 5.78s/it] {'loss': 0.5181, 'learning_rate': 1.999953686810716e-05, 'epoch': 0.03} + 3%|▎ | 289/8750 [28:25<13:35:15, 5.78s/it] 3%|▎ | 290/8750 [28:37<13:38:57, 5.81s/it] 3%|▎ | 290/8750 [28:31<13:38:57, 5.81s/it] {'loss': 0.5092, 'learning_rate': 1.9999500557772843e-05, 'epoch': 0.03} + 3%|▎ | 290/8750 [28:37<13:38:57, 5.81s/it] {'loss': 0.5092, 'learning_rate': 1.9999500557772843e-05, 'epoch': 0.03} + 3%|▎ | 290/8750 [28:31<13:38:57, 5.81s/it] 3%|▎ | 291/8750 [28:43<13:32:35, 5.76s/it] 3%|▎ | 291/8750 [28:36<13:32:35, 5.76s/it] {'loss': 0.5373, 'learning_rate': 1.9999462877283702e-05, 'epoch': 0.03} + 3%|▎ | 291/8750 [28:43<13:32:35, 5.76s/it] {'loss': 0.5373, 'learning_rate': 1.9999462877283702e-05, 'epoch': 0.03} + 3%|▎ | 291/8750 [28:36<13:32:35, 5.76s/it] 3%|▎ | 292/8750 [28:49<13:40:58, 5.82s/it] 3%|▎ | 292/8750 [28:42<13:40:58, 5.82s/it] {'loss': 0.5129, 'learning_rate': 1.9999423826644895e-05, 'epoch': 0.03} + 3%|▎ | 292/8750 [28:49<13:40:58, 5.82s/it] {'loss': 0.5129, 'learning_rate': 1.9999423826644895e-05, 'epoch': 0.03} + 3%|▎ | 292/8750 [28:42<13:40:58, 5.82s/it] 3%|▎ | 293/8750 [28:54<13:28:35, 5.74s/it] 3%|▎ | 293/8750 [28:48<13:28:36, 5.74s/it] {'loss': 0.5435, 'learning_rate': 1.999938340586178e-05, 'epoch': 0.03} + 3%|▎ | 293/8750 [28:54<13:28:35, 5.74s/it] {'loss': 0.5435, 'learning_rate': 1.999938340586178e-05, 'epoch': 0.03} + 3%|▎ | 293/8750 [28:48<13:28:36, 5.74s/it] 3%|▎ | 294/8750 [29:00<13:26:00, 5.72s/it] 3%|▎ | 294/8750 [28:54<13:26:00, 5.72s/it] 
{'loss': 0.5212, 'learning_rate': 1.999934161493988e-05, 'epoch': 0.03} + 3%|▎ | 294/8750 [29:00<13:26:00, 5.72s/it] {'loss': 0.5212, 'learning_rate': 1.999934161493988e-05, 'epoch': 0.03} + 3%|▎ | 294/8750 [28:54<13:26:00, 5.72s/it] 3%|▎ | 295/8750 [29:06<13:22:04, 5.69s/it] 3%|▎ | 295/8750 [28:59<13:22:04, 5.69s/it] {'loss': 0.5404, 'learning_rate': 1.9999298453884944e-05, 'epoch': 0.03} + 3%|▎ | 295/8750 [29:06<13:22:04, 5.69s/it] {'loss': 0.5404, 'learning_rate': 1.9999298453884944e-05, 'epoch': 0.03} + 3%|▎ | 295/8750 [28:59<13:22:04, 5.69s/it] 3%|▎ | 296/8750 [29:05<13:21:50, 5.69s/it] 3%|▎ | 296/8750 [29:11<13:21:51, 5.69s/it] {'loss': 0.5253, 'learning_rate': 1.9999253922702868e-05, 'epoch': 0.03} + 3%|▎ | 296/8750 [29:11<13:21:51, 5.69s/it] {'loss': 0.5253, 'learning_rate': 1.9999253922702868e-05, 'epoch': 0.03} + 3%|▎ | 296/8750 [29:05<13:21:50, 5.69s/it] 3%|▎ | 297/8750 [29:17<13:23:14, 5.70s/it] 3%|▎ | 297/8750 [29:11<13:23:15, 5.70s/it] {'loss': 0.5073, 'learning_rate': 1.9999208021399757e-05, 'epoch': 0.03} + 3%|▎ | 297/8750 [29:17<13:23:14, 5.70s/it] {'loss': 0.5073, 'learning_rate': 1.9999208021399757e-05, 'epoch': 0.03} + 3%|▎ | 297/8750 [29:11<13:23:15, 5.70s/it] 3%|▎ | 298/8750 [29:23<13:28:57, 5.74s/it] 3%|▎ | 298/8750 [29:16<13:28:57, 5.74s/it] {'loss': 0.507, 'learning_rate': 1.9999160749981908e-05, 'epoch': 0.03} + 3%|▎ | 298/8750 [29:23<13:28:57, 5.74s/it] {'loss': 0.507, 'learning_rate': 1.9999160749981908e-05, 'epoch': 0.03} + 3%|▎ | 298/8750 [29:16<13:28:57, 5.74s/it] 3%|▎ | 299/8750 [29:29<13:31:07, 5.76s/it] 3%|▎ | 299/8750 [29:22<13:31:08, 5.76s/it] {'loss': 0.5219, 'learning_rate': 1.999911210845579e-05, 'epoch': 0.03} + 3%|▎ | 299/8750 [29:29<13:31:07, 5.76s/it] {'loss': 0.5219, 'learning_rate': 1.999911210845579e-05, 'epoch': 0.03} + 3%|▎ | 299/8750 [29:22<13:31:08, 5.76s/it]8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... 
+ 3%|▎ | 300/8750 [29:34<13:26:52, 5.73s/it]15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +01 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 3%|▎ | 300/8750 [29:28<13:26:52, 5.73s/it]12 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5168, 'learning_rate': 1.9999062096828072e-05, 'epoch': 0.03} + 3%|▎ | 300/8750 [29:34<13:26:52, 5.73s/it] {'loss': 0.5168, 'learning_rate': 1.9999062096828072e-05, 'epoch': 0.03} + 3%|▎ | 300/8750 [29:28<13:26:52, 5.73s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 3%|▎ | 301/8750 [29:54<23:05:00, 9.84s/it] 3%|▎ | 301/8750 [29:47<23:04:59, 9.84s/it] {'loss': 0.5167, 'learning_rate': 1.9999010715105608e-05, 'epoch': 0.03} + 3%|▎ | 301/8750 [29:54<23:05:00, 9.84s/it] {'loss': 0.5167, 'learning_rate': 1.9999010715105608e-05, 'epoch': 0.03} + 3%|▎ | 301/8750 [29:47<23:04:59, 9.84s/it] 3%|▎ | 302/8750 [30:00<20:09:59, 8.59s/it] 3%|▎ | 302/8750 [29:53<20:09:59, 8.59s/it] {'loss': 0.5081, 'learning_rate': 1.9998957963295434e-05, 'epoch': 0.03} + 3%|▎ | 302/8750 [30:00<20:09:59, 8.59s/it] {'loss': 0.5081, 'learning_rate': 1.9998957963295434e-05, 'epoch': 0.03} + 3%|▎ | 302/8750 [29:53<20:09:59, 8.59s/it] 3%|▎ | 303/8750 [30:05<18:10:50, 7.75s/it] 3%|▎ | 303/8750 [29:59<18:10:50, 7.75s/it] {'loss': 0.5134, 'learning_rate': 1.999890384140478e-05, 'epoch': 0.03} + 3%|▎ | 303/8750 [30:05<18:10:50, 7.75s/it] {'loss': 0.5134, 'learning_rate': 1.999890384140478e-05, 'epoch': 0.03} + 3%|▎ | 303/8750 [29:59<18:10:50, 7.75s/it] 3%|▎ | 304/8750 [30:11<16:37:29, 7.09s/it] 3%|▎ | 304/8750 [30:04<16:37:29, 7.09s/it] {'loss': 0.5205, 'learning_rate': 1.999884834944106e-05, 'epoch': 0.03} + 3%|▎ | 304/8750 [30:11<16:37:29, 7.09s/it] {'loss': 0.5205, 'learning_rate': 1.999884834944106e-05, 'epoch': 0.03} + 3%|▎ | 304/8750 [30:04<16:37:29, 7.09s/it] 3%|▎ | 305/8750 [30:17<15:36:57, 6.66s/it] 3%|▎ | 305/8750 [30:10<15:36:57, 6.66s/it] {'loss': 0.5281, 'learning_rate': 1.9998791487411887e-05, 'epoch': 0.03} + 3%|▎ | 305/8750 [30:17<15:36:57, 6.66s/it] {'loss': 0.5281, 'learning_rate': 1.9998791487411887e-05, 'epoch': 0.03} + 3%|▎ | 305/8750 [30:10<15:36:57, 6.66s/it] 3%|▎ | 306/8750 [30:22<15:05:40, 6.44s/it] 3%|▎ | 306/8750 [30:16<15:05:40, 6.44s/it] {'loss': 0.5184, 'learning_rate': 1.9998733255325043e-05, 'epoch': 0.03} + 3%|▎ | 306/8750 [30:22<15:05:40, 6.44s/it] {'loss': 0.5184, 'learning_rate': 1.9998733255325043e-05, 'epoch': 0.03} + 3%|▎ | 306/8750 [30:16<15:05:40, 6.44s/it] 4%|▎ | 307/8750 [30:28<14:30:39, 6.19s/it] 4%|▎ | 
307/8750 [30:22<14:30:39, 6.19s/it] {'loss': 0.509, 'learning_rate': 1.999867365318851e-05, 'epoch': 0.04} + 4%|▎ | 307/8750 [30:28<14:30:39, 6.19s/it] {'loss': 0.509, 'learning_rate': 1.999867365318851e-05, 'epoch': 0.04} + 4%|▎ | 307/8750 [30:22<14:30:39, 6.19s/it] 4%|▎ | 308/8750 [30:34<14:10:59, 6.05s/it] 4%|▎ | 308/8750 [30:27<14:10:59, 6.05s/it] {'loss': 0.5131, 'learning_rate': 1.9998612681010452e-05, 'epoch': 0.04} + 4%|▎ | 308/8750 [30:34<14:10:59, 6.05s/it] {'loss': 0.5131, 'learning_rate': 1.9998612681010452e-05, 'epoch': 0.04} + 4%|▎ | 308/8750 [30:27<14:10:59, 6.05s/it] 4%|▎ | 309/8750 [30:39<13:50:29, 5.90s/it] 4%|▎ | 309/8750 [30:33<13:50:29, 5.90s/it] {'loss': 0.5204, 'learning_rate': 1.999855033879923e-05, 'epoch': 0.04} + 4%|▎ | 309/8750 [30:39<13:50:29, 5.90s/it] {'loss': 0.5204, 'learning_rate': 1.999855033879923e-05, 'epoch': 0.04} + 4%|▎ | 309/8750 [30:33<13:50:29, 5.90s/it] 4%|▎ | 310/8750 [30:45<13:44:20, 5.86s/it] 4%|▎ | 310/8750 [30:39<13:44:20, 5.86s/it] {'loss': 0.5085, 'learning_rate': 1.9998486626563376e-05, 'epoch': 0.04} + 4%|▎ | 310/8750 [30:45<13:44:20, 5.86s/it] {'loss': 0.5085, 'learning_rate': 1.9998486626563376e-05, 'epoch': 0.04} + 4%|▎ | 310/8750 [30:39<13:44:20, 5.86s/it] 4%|▎ | 311/8750 [30:51<13:31:13, 5.77s/it] 4%|▎ | 311/8750 [30:44<13:31:13, 5.77s/it] {'loss': 0.5362, 'learning_rate': 1.999842154431163e-05, 'epoch': 0.04} + 4%|▎ | 311/8750 [30:51<13:31:13, 5.77s/it] {'loss': 0.5362, 'learning_rate': 1.999842154431163e-05, 'epoch': 0.04} + 4%|▎ | 311/8750 [30:44<13:31:13, 5.77s/it] 4%|▎ | 312/8750 [30:56<13:34:38, 5.79s/it] 4%|▎ | 312/8750 [30:50<13:34:38, 5.79s/it] {'loss': 0.5136, 'learning_rate': 1.9998355092052906e-05, 'epoch': 0.04} + 4%|▎ | 312/8750 [30:56<13:34:38, 5.79s/it] {'loss': 0.5136, 'learning_rate': 1.9998355092052906e-05, 'epoch': 0.04} + 4%|▎ | 312/8750 [30:50<13:34:38, 5.79s/it] 4%|▎ | 313/8750 [31:02<13:30:36, 5.76s/it] 4%|▎ | 313/8750 [30:56<13:30:36, 5.76s/it] {'loss': 0.5057, 'learning_rate': 
1.9998287269796313e-05, 'epoch': 0.04} + 4%|▎ | 313/8750 [31:02<13:30:36, 5.76s/it] {'loss': 0.5057, 'learning_rate': 1.9998287269796313e-05, 'epoch': 0.04} + 4%|▎ | 313/8750 [30:56<13:30:36, 5.76s/it] 4%|▎ | 314/8750 [31:08<13:27:41, 5.74s/it] 4%|▎ | 314/8750 [31:01<13:27:41, 5.74s/it] {'loss': 0.5129, 'learning_rate': 1.9998218077551135e-05, 'epoch': 0.04} + 4%|▎ | 314/8750 [31:08<13:27:41, 5.74s/it] {'loss': 0.5129, 'learning_rate': 1.9998218077551135e-05, 'epoch': 0.04} + 4%|▎ | 314/8750 [31:01<13:27:41, 5.74s/it] 4%|▎ | 315/8750 [31:14<13:23:53, 5.72s/it] 4%|▎ | 315/8750 [31:07<13:23:53, 5.72s/it] {'loss': 0.5301, 'learning_rate': 1.9998147515326862e-05, 'epoch': 0.04} + 4%|▎ | 315/8750 [31:14<13:23:53, 5.72s/it] {'loss': 0.5301, 'learning_rate': 1.9998147515326862e-05, 'epoch': 0.04} + 4%|▎ | 315/8750 [31:07<13:23:53, 5.72s/it] 4%|▎ | 316/8750 [31:20<13:35:47, 5.80s/it] 4%|▎ | 316/8750 [31:13<13:35:47, 5.80s/it] {'loss': 0.4895, 'learning_rate': 1.9998075583133157e-05, 'epoch': 0.04} + 4%|▎ | 316/8750 [31:20<13:35:47, 5.80s/it] {'loss': 0.4895, 'learning_rate': 1.9998075583133157e-05, 'epoch': 0.04} + 4%|▎ | 316/8750 [31:13<13:35:47, 5.80s/it] 4%|▎ | 317/8750 [31:25<13:27:44, 5.75s/it] 4%|▎ | 317/8750 [31:19<13:27:44, 5.75s/it] {'loss': 0.5259, 'learning_rate': 1.999800228097988e-05, 'epoch': 0.04} + 4%|▎ | 317/8750 [31:25<13:27:44, 5.75s/it] {'loss': 0.5259, 'learning_rate': 1.999800228097988e-05, 'epoch': 0.04} + 4%|▎ | 317/8750 [31:19<13:27:44, 5.75s/it] 4%|▎ | 318/8750 [31:31<13:27:37, 5.75s/it] 4%|▎ | 318/8750 [31:24<13:27:38, 5.75s/it] {'loss': 0.5194, 'learning_rate': 1.999792760887707e-05, 'epoch': 0.04} + 4%|▎ | 318/8750 [31:31<13:27:37, 5.75s/it] {'loss': 0.5194, 'learning_rate': 1.999792760887707e-05, 'epoch': 0.04} + 4%|▎ | 318/8750 [31:24<13:27:38, 5.75s/it] 4%|▎ | 319/8750 [31:36<13:18:27, 5.68s/it] 4%|▎ | 319/8750 [31:30<13:18:27, 5.68s/it] {'loss': 0.5316, 'learning_rate': 1.9997851566834966e-05, 'epoch': 0.04} + 4%|▎ | 319/8750 
[31:36<13:18:27, 5.68s/it] {'loss': 0.5316, 'learning_rate': 1.9997851566834966e-05, 'epoch': 0.04} + 4%|▎ | 319/8750 [31:30<13:18:27, 5.68s/it] 4%|▎ | 320/8750 [31:42<13:22:18, 5.71s/it] 4%|▎ | 320/8750 [31:36<13:22:18, 5.71s/it] {'loss': 0.517, 'learning_rate': 1.999777415486398e-05, 'epoch': 0.04} + 4%|▎ | 320/8750 [31:42<13:22:18, 5.71s/it] {'loss': 0.517, 'learning_rate': 1.999777415486398e-05, 'epoch': 0.04} + 4%|▎ | 320/8750 [31:36<13:22:18, 5.71s/it] 4%|▎ | 321/8750 [31:48<13:24:24, 5.73s/it] 4%|▎ | 321/8750 [31:41<13:24:24, 5.73s/it] {'loss': 0.5055, 'learning_rate': 1.9997695372974725e-05, 'epoch': 0.04} + 4%|▎ | 321/8750 [31:48<13:24:24, 5.73s/it] {'loss': 0.5055, 'learning_rate': 1.9997695372974725e-05, 'epoch': 0.04} + 4%|▎ | 321/8750 [31:41<13:24:24, 5.73s/it] 4%|▎ | 322/8750 [31:54<13:18:32, 5.68s/it] 4%|▎ | 322/8750 [31:47<13:18:32, 5.68s/it] {'loss': 0.543, 'learning_rate': 1.9997615221177996e-05, 'epoch': 0.04} + 4%|▎ | 322/8750 [31:54<13:18:32, 5.68s/it] {'loss': 0.543, 'learning_rate': 1.9997615221177996e-05, 'epoch': 0.04} + 4%|▎ | 322/8750 [31:47<13:18:32, 5.68s/it] 4%|▎ | 323/8750 [31:59<13:12:56, 5.65s/it] 4%|▎ | 323/8750 [31:53<13:12:56, 5.65s/it] {'loss': 0.5334, 'learning_rate': 1.999753369948477e-05, 'epoch': 0.04} + 4%|▎ | 323/8750 [31:59<13:12:56, 5.65s/it] {'loss': 0.5334, 'learning_rate': 1.999753369948477e-05, 'epoch': 0.04} + 4%|▎ | 323/8750 [31:53<13:12:56, 5.65s/it] 4%|▎ | 324/8750 [32:05<13:40:18, 5.84s/it] 4%|▎ | 324/8750 [31:59<13:40:18, 5.84s/it] {'loss': 0.5019, 'learning_rate': 1.999745080790622e-05, 'epoch': 0.04} + 4%|▎ | 324/8750 [32:05<13:40:18, 5.84s/it] {'loss': 0.5019, 'learning_rate': 1.999745080790622e-05, 'epoch': 0.04} + 4%|▎ | 324/8750 [31:59<13:40:18, 5.84s/it] 4%|▎ | 325/8750 [32:11<13:30:08, 5.77s/it] 4%|▎ | 325/8750 [32:05<13:30:08, 5.77s/it] {'loss': 0.5284, 'learning_rate': 1.999736654645371e-05, 'epoch': 0.04} + 4%|▎ | 325/8750 [32:11<13:30:08, 5.77s/it] {'loss': 0.5284, 'learning_rate': 
1.999736654645371e-05, 'epoch': 0.04} + 4%|▎ | 325/8750 [32:05<13:30:08, 5.77s/it] 4%|▎ | 326/8750 [32:17<13:30:06, 5.77s/it] 4%|▎ | 326/8750 [32:10<13:30:06, 5.77s/it] {'loss': 0.5158, 'learning_rate': 1.999728091513877e-05, 'epoch': 0.04} + 4%|▎ | 326/8750 [32:17<13:30:06, 5.77s/it] {'loss': 0.5158, 'learning_rate': 1.999728091513877e-05, 'epoch': 0.04} + 4%|▎ | 326/8750 [32:10<13:30:06, 5.77s/it] 4%|▎ | 327/8750 [32:22<13:20:01, 5.70s/it] 4%|▎ | 327/8750 [32:16<13:20:00, 5.70s/it] {'loss': 0.5483, 'learning_rate': 1.9997193913973154e-05, 'epoch': 0.04} + 4%|▎ | 327/8750 [32:22<13:20:01, 5.70s/it] {'loss': 0.5483, 'learning_rate': 1.9997193913973154e-05, 'epoch': 0.04} + 4%|▎ | 327/8750 [32:16<13:20:00, 5.70s/it] 4%|▎ | 328/8750 [32:28<13:21:30, 5.71s/it] 4%|▎ | 328/8750 [32:22<13:21:30, 5.71s/it] {'loss': 0.5111, 'learning_rate': 1.999710554296877e-05, 'epoch': 0.04} + 4%|▎ | 328/8750 [32:28<13:21:30, 5.71s/it] {'loss': 0.5111, 'learning_rate': 1.999710554296877e-05, 'epoch': 0.04} + 4%|▎ | 328/8750 [32:22<13:21:30, 5.71s/it] 4%|▍ | 329/8750 [32:34<13:20:37, 5.70s/it] 4%|▍ | 329/8750 [32:27<13:20:37, 5.70s/it] {'loss': 0.5069, 'learning_rate': 1.9997015802137727e-05, 'epoch': 0.04} + 4%|▍ | 329/8750 [32:34<13:20:37, 5.70s/it] {'loss': 0.5069, 'learning_rate': 1.9997015802137727e-05, 'epoch': 0.04} + 4%|▍ | 329/8750 [32:27<13:20:37, 5.70s/it] 4%|▍ | 330/8750 [32:39<13:17:33, 5.68s/it] 4%|▍ | 330/8750 [32:33<13:17:33, 5.68s/it] {'loss': 0.4987, 'learning_rate': 1.9996924691492325e-05, 'epoch': 0.04} + 4%|▍ | 330/8750 [32:39<13:17:33, 5.68s/it] {'loss': 0.4987, 'learning_rate': 1.9996924691492325e-05, 'epoch': 0.04} + 4%|▍ | 330/8750 [32:33<13:17:33, 5.68s/it] 4%|▍ | 331/8750 [32:45<13:14:14, 5.66s/it] 4%|▍ | 331/8750 [32:39<13:14:14, 5.66s/it] {'loss': 0.542, 'learning_rate': 1.9996832211045048e-05, 'epoch': 0.04} + 4%|▍ | 331/8750 [32:45<13:14:14, 5.66s/it] {'loss': 0.542, 'learning_rate': 1.9996832211045048e-05, 'epoch': 0.04} + 4%|▍ | 331/8750 [32:39<13:14:14, 
5.66s/it] 4%|▍ | 332/8750 [32:51<13:25:04, 5.74s/it] 4%|▍ | 332/8750 [32:44<13:25:04, 5.74s/it] {'loss': 0.5133, 'learning_rate': 1.9996738360808566e-05, 'epoch': 0.04} + 4%|▍ | 332/8750 [32:51<13:25:04, 5.74s/it] {'loss': 0.5133, 'learning_rate': 1.9996738360808566e-05, 'epoch': 0.04} + 4%|▍ | 332/8750 [32:44<13:25:04, 5.74s/it] 4%|▍ | 333/8750 [32:56<13:17:53, 5.69s/it] 4%|▍ | 333/8750 [32:50<13:17:53, 5.69s/it] {'loss': 0.5189, 'learning_rate': 1.999664314079574e-05, 'epoch': 0.04} + 4%|▍ | 333/8750 [32:56<13:17:53, 5.69s/it] {'loss': 0.5189, 'learning_rate': 1.999664314079574e-05, 'epoch': 0.04} + 4%|▍ | 333/8750 [32:50<13:17:53, 5.69s/it] 4%|▍ | 334/8750 [33:02<13:17:20, 5.68s/it] 4%|▍ | 334/8750 [32:56<13:17:20, 5.68s/it] {'loss': 0.4968, 'learning_rate': 1.9996546551019618e-05, 'epoch': 0.04} + 4%|▍ | 334/8750 [33:02<13:17:20, 5.68s/it] {'loss': 0.4968, 'learning_rate': 1.9996546551019618e-05, 'epoch': 0.04} + 4%|▍ | 334/8750 [32:56<13:17:20, 5.68s/it] 4%|▍ | 335/8750 [33:08<13:13:27, 5.66s/it] 4%|▍ | 335/8750 [33:01<13:13:27, 5.66s/it] {'loss': 0.5309, 'learning_rate': 1.9996448591493433e-05, 'epoch': 0.04} + 4%|▍ | 335/8750 [33:08<13:13:27, 5.66s/it] {'loss': 0.5309, 'learning_rate': 1.9996448591493433e-05, 'epoch': 0.04} + 4%|▍ | 335/8750 [33:01<13:13:27, 5.66s/it] 4%|▍ | 336/8750 [33:14<13:36:09, 5.82s/it] 4%|▍ | 336/8750 [33:07<13:36:10, 5.82s/it] {'loss': 0.5144, 'learning_rate': 1.9996349262230607e-05, 'epoch': 0.04} + 4%|▍ | 336/8750 [33:14<13:36:09, 5.82s/it] {'loss': 0.5144, 'learning_rate': 1.9996349262230607e-05, 'epoch': 0.04} + 4%|▍ | 336/8750 [33:07<13:36:10, 5.82s/it] 4%|▍ | 337/8750 [33:20<13:28:35, 5.77s/it] 4%|▍ | 337/8750 [33:13<13:28:35, 5.77s/it] {'loss': 0.5217, 'learning_rate': 1.9996248563244755e-05, 'epoch': 0.04} + 4%|▍ | 337/8750 [33:20<13:28:35, 5.77s/it] {'loss': 0.5217, 'learning_rate': 1.9996248563244755e-05, 'epoch': 0.04} + 4%|▍ | 337/8750 [33:13<13:28:35, 5.77s/it] 4%|▍ | 338/8750 [33:25<13:26:46, 5.75s/it] 4%|▍ | 338/8750 
[33:19<13:26:46, 5.75s/it] {'loss': 0.4983, 'learning_rate': 1.9996146494549672e-05, 'epoch': 0.04} + 4%|▍ | 338/8750 [33:25<13:26:46, 5.75s/it] {'loss': 0.4983, 'learning_rate': 1.9996146494549672e-05, 'epoch': 0.04} + 4%|▍ | 338/8750 [33:19<13:26:46, 5.75s/it] 4%|▍ | 339/8750 [33:31<13:16:16, 5.68s/it] 4%|▍ | 339/8750 [33:24<13:16:16, 5.68s/it] {'loss': 0.4968, 'learning_rate': 1.999604305615934e-05, 'epoch': 0.04} + 4%|▍ | 339/8750 [33:31<13:16:16, 5.68s/it] {'loss': 0.4968, 'learning_rate': 1.999604305615934e-05, 'epoch': 0.04} + 4%|▍ | 339/8750 [33:24<13:16:16, 5.68s/it] 4%|▍ | 340/8750 [33:37<13:16:59, 5.69s/it] 4%|▍ | 340/8750 [33:30<13:16:59, 5.69s/it] {'loss': 0.5185, 'learning_rate': 1.9995938248087937e-05, 'epoch': 0.04} + 4%|▍ | 340/8750 [33:37<13:16:59, 5.69s/it] {'loss': 0.5185, 'learning_rate': 1.9995938248087937e-05, 'epoch': 0.04} + 4%|▍ | 340/8750 [33:30<13:16:59, 5.69s/it] 4%|▍ | 341/8750 [33:42<13:17:36, 5.69s/it] 4%|▍ | 341/8750 [33:36<13:17:36, 5.69s/it] {'loss': 0.4945, 'learning_rate': 1.9995832070349827e-05, 'epoch': 0.04} + 4%|▍ | 341/8750 [33:42<13:17:36, 5.69s/it] {'loss': 0.4945, 'learning_rate': 1.9995832070349827e-05, 'epoch': 0.04} + 4%|▍ | 341/8750 [33:36<13:17:36, 5.69s/it] 4%|▍ | 342/8750 [33:48<13:12:05, 5.65s/it] 4%|▍ | 342/8750 [33:41<13:12:05, 5.65s/it] {'loss': 0.5324, 'learning_rate': 1.999572452295955e-05, 'epoch': 0.04} + 4%|▍ | 342/8750 [33:48<13:12:05, 5.65s/it] {'loss': 0.5324, 'learning_rate': 1.999572452295955e-05, 'epoch': 0.04} + 4%|▍ | 342/8750 [33:41<13:12:05, 5.65s/it] 4%|▍ | 343/8750 [33:54<13:14:33, 5.67s/it] 4%|▍ | 343/8750 [33:47<13:14:33, 5.67s/it] {'loss': 0.5058, 'learning_rate': 1.999561560593185e-05, 'epoch': 0.04} + 4%|▍ | 343/8750 [33:54<13:14:33, 5.67s/it] {'loss': 0.5058, 'learning_rate': 1.999561560593185e-05, 'epoch': 0.04} + 4%|▍ | 343/8750 [33:47<13:14:33, 5.67s/it] 4%|▍ | 344/8750 [34:00<13:28:56, 5.77s/it] 4%|▍ | 344/8750 [33:53<13:28:56, 5.77s/it] {'loss': 0.5145, 'learning_rate': 
1.9995505319281645e-05, 'epoch': 0.04} + 4%|▍ | 344/8750 [34:00<13:28:56, 5.77s/it] {'loss': 0.5145, 'learning_rate': 1.9995505319281645e-05, 'epoch': 0.04} + 4%|▍ | 344/8750 [33:53<13:28:56, 5.77s/it] 4%|▍ | 345/8750 [34:05<13:27:23, 5.76s/it] 4%|▍ | 345/8750 [33:59<13:27:23, 5.76s/it] {'loss': 0.5248, 'learning_rate': 1.9995393663024054e-05, 'epoch': 0.04} + 4%|▍ | 345/8750 [34:05<13:27:23, 5.76s/it] {'loss': 0.5248, 'learning_rate': 1.9995393663024054e-05, 'epoch': 0.04} + 4%|▍ | 345/8750 [33:59<13:27:23, 5.76s/it] 4%|▍ | 346/8750 [34:11<13:19:17, 5.71s/it] 4%|▍ | 346/8750 [34:04<13:19:17, 5.71s/it] {'loss': 0.515, 'learning_rate': 1.999528063717437e-05, 'epoch': 0.04} + 4%|▍ | 346/8750 [34:11<13:19:17, 5.71s/it] {'loss': 0.515, 'learning_rate': 1.999528063717437e-05, 'epoch': 0.04} + 4%|▍ | 346/8750 [34:04<13:19:17, 5.71s/it] 4%|▍ | 347/8750 [34:17<13:19:25, 5.71s/it] 4%|▍ | 347/8750 [34:10<13:19:24, 5.71s/it] {'loss': 0.5016, 'learning_rate': 1.9995166241748084e-05, 'epoch': 0.04} + 4%|▍ | 347/8750 [34:17<13:19:25, 5.71s/it] {'loss': 0.5016, 'learning_rate': 1.9995166241748084e-05, 'epoch': 0.04} + 4%|▍ | 347/8750 [34:10<13:19:24, 5.71s/it] 4%|▍ | 348/8750 [34:22<13:20:30, 5.72s/it] 4%|▍ | 348/8750 [34:16<13:20:30, 5.72s/it] {'loss': 0.5052, 'learning_rate': 1.9995050476760864e-05, 'epoch': 0.04} + 4%|▍ | 348/8750 [34:22<13:20:30, 5.72s/it] {'loss': 0.5052, 'learning_rate': 1.9995050476760864e-05, 'epoch': 0.04} + 4%|▍ | 348/8750 [34:16<13:20:30, 5.72s/it] 4%|▍ | 349/8750 [34:28<13:18:36, 5.70s/it] 4%|▍ | 349/8750 [34:21<13:18:36, 5.70s/it] {'loss': 0.5221, 'learning_rate': 1.9994933342228583e-05, 'epoch': 0.04} + 4%|▍ | 349/8750 [34:28<13:18:36, 5.70s/it] {'loss': 0.5221, 'learning_rate': 1.9994933342228583e-05, 'epoch': 0.04} + 4%|▍ | 349/8750 [34:21<13:18:36, 5.70s/it]8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 4%|▍ | 350/8750 [34:34<13:25:00, 5.75s/it]14 AutoResumeHook: Checking whether to suspend... 
+03 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1 5AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...2 + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 4%|▍ | 350/8750 [34:27<13:25:01, 5.75s/it]10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5006, 'learning_rate': 1.9994814838167286e-05, 'epoch': 0.04} + 4%|▍ | 350/8750 [34:34<13:25:00, 5.75s/it] {'loss': 0.5006, 'learning_rate': 1.9994814838167286e-05, 'epoch': 0.04} + 4%|▍ | 350/8750 [34:27<13:25:01, 5.75s/it] 4%|▍ | 351/8750 [34:39<13:15:43, 5.68s/it] 4%|▍ | 351/8750 [34:33<13:15:43, 5.68s/it] {'loss': 0.513, 'learning_rate': 1.9994694964593206e-05, 'epoch': 0.04} + 4%|▍ | 351/8750 [34:39<13:15:43, 5.68s/it] {'loss': 0.513, 'learning_rate': 1.9994694964593206e-05, 'epoch': 0.04} + 4%|▍ | 351/8750 [34:33<13:15:43, 5.68s/it] 4%|▍ | 352/8750 [34:45<13:24:35, 5.75s/it] 4%|▍ | 352/8750 [34:39<13:24:33, 5.75s/it] {'loss': 0.5107, 'learning_rate': 1.9994573721522776e-05, 'epoch': 0.04} + 4%|▍ | 352/8750 [34:45<13:24:35, 5.75s/it] {'loss': 0.5107, 'learning_rate': 1.9994573721522776e-05, 'epoch': 0.04} + 4%|▍ | 352/8750 [34:39<13:24:33, 5.75s/it] 4%|▍ | 353/8750 [34:51<13:23:09, 5.74s/it] 4%|▍ | 353/8750 [34:44<13:23:09, 5.74s/it] {'loss': 0.5247, 'learning_rate': 1.9994451108972604e-05, 'epoch': 0.04} + 4%|▍ | 353/8750 [34:51<13:23:09, 5.74s/it] {'loss': 0.5247, 'learning_rate': 1.9994451108972604e-05, 'epoch': 0.04} + 4%|▍ | 353/8750 [34:44<13:23:09, 5.74s/it] 4%|▍ | 354/8750 [34:57<13:18:51, 5.71s/it] 4%|▍ | 354/8750 [34:50<13:18:52, 5.71s/it] {'loss': 
0.5054, 'learning_rate': 1.999432712695949e-05, 'epoch': 0.04} + 4%|▍ | 354/8750 [34:57<13:18:51, 5.71s/it] {'loss': 0.5054, 'learning_rate': 1.999432712695949e-05, 'epoch': 0.04} + 4%|▍ | 354/8750 [34:50<13:18:52, 5.71s/it] 4%|▍ | 355/8750 [35:02<13:13:01, 5.67s/it] 4%|▍ | 355/8750 [34:56<13:13:01, 5.67s/it] {'loss': 0.5129, 'learning_rate': 1.999420177550043e-05, 'epoch': 0.04} + 4%|▍ | 355/8750 [35:02<13:13:01, 5.67s/it] {'loss': 0.5129, 'learning_rate': 1.999420177550043e-05, 'epoch': 0.04} + 4%|▍ | 355/8750 [34:56<13:13:01, 5.67s/it] 4%|▍ | 356/8750 [35:08<13:16:58, 5.70s/it] 4%|▍ | 356/8750 [35:01<13:16:58, 5.70s/it] {'loss': 0.4971, 'learning_rate': 1.999407505461259e-05, 'epoch': 0.04} + 4%|▍ | 356/8750 [35:08<13:16:58, 5.70s/it] {'loss': 0.4971, 'learning_rate': 1.999407505461259e-05, 'epoch': 0.04} + 4%|▍ | 356/8750 [35:01<13:16:58, 5.70s/it] 4%|▍ | 357/8750 [35:14<13:22:27, 5.74s/it] 4%|▍ | 357/8750 [35:07<13:22:27, 5.74s/it] {'loss': 0.4876, 'learning_rate': 1.999394696431334e-05, 'epoch': 0.04} + 4%|▍ | 357/8750 [35:14<13:22:27, 5.74s/it] {'loss': 0.4876, 'learning_rate': 1.999394696431334e-05, 'epoch': 0.04} + 4%|▍ | 357/8750 [35:07<13:22:27, 5.74s/it] 4%|▍ | 358/8750 [35:19<13:20:32, 5.72s/it] 4%|▍ | 358/8750 [35:13<13:20:32, 5.72s/it] {'loss': 0.528, 'learning_rate': 1.999381750462023e-05, 'epoch': 0.04} + 4%|▍ | 358/8750 [35:19<13:20:32, 5.72s/it] {'loss': 0.528, 'learning_rate': 1.999381750462023e-05, 'epoch': 0.04} + 4%|▍ | 358/8750 [35:13<13:20:32, 5.72s/it] 4%|▍ | 359/8750 [35:25<13:22:01, 5.73s/it] 4%|▍ | 359/8750 [35:19<13:22:00, 5.73s/it] {'loss': 0.5143, 'learning_rate': 1.9993686675550998e-05, 'epoch': 0.04} + 4%|▍ | 359/8750 [35:25<13:22:01, 5.73s/it] {'loss': 0.5143, 'learning_rate': 1.9993686675550998e-05, 'epoch': 0.04} + 4%|▍ | 359/8750 [35:19<13:22:00, 5.73s/it] 4%|▍ | 360/8750 [35:31<13:18:31, 5.71s/it] 4%|▍ | 360/8750 [35:24<13:18:31, 5.71s/it] {'loss': 0.5101, 'learning_rate': 1.9993554477123568e-05, 'epoch': 0.04} + 4%|▍ | 
360/8750 [35:31<13:18:31, 5.71s/it] {'loss': 0.5101, 'learning_rate': 1.9993554477123568e-05, 'epoch': 0.04} + 4%|▍ | 360/8750 [35:24<13:18:31, 5.71s/it] 4%|▍ | 361/8750 [35:36<13:13:56, 5.68s/it] 4%|▍ | 361/8750 [35:30<13:13:56, 5.68s/it] {'loss': 0.5107, 'learning_rate': 1.9993420909356058e-05, 'epoch': 0.04} + 4%|▍ | 361/8750 [35:36<13:13:56, 5.68s/it] {'loss': 0.5107, 'learning_rate': 1.9993420909356058e-05, 'epoch': 0.04} + 4%|▍ | 361/8750 [35:30<13:13:56, 5.68s/it] 4%|▍ | 362/8750 [35:42<13:14:03, 5.68s/it] 4%|▍ | 362/8750 [35:36<13:14:02, 5.68s/it] {'loss': 0.5321, 'learning_rate': 1.999328597226677e-05, 'epoch': 0.04} + 4%|▍ | 362/8750 [35:42<13:14:03, 5.68s/it] {'loss': 0.5321, 'learning_rate': 1.999328597226677e-05, 'epoch': 0.04} + 4%|▍ | 362/8750 [35:36<13:14:02, 5.68s/it] 4%|▍ | 363/8750 [35:48<13:12:35, 5.67s/it] 4%|▍ | 363/8750 [35:41<13:12:35, 5.67s/it] {'loss': 0.5402, 'learning_rate': 1.9993149665874193e-05, 'epoch': 0.04} + 4%|▍ | 363/8750 [35:48<13:12:35, 5.67s/it] {'loss': 0.5402, 'learning_rate': 1.9993149665874193e-05, 'epoch': 0.04} + 4%|▍ | 363/8750 [35:41<13:12:35, 5.67s/it] 4%|▍ | 364/8750 [35:54<13:20:18, 5.73s/it] 4%|▍ | 364/8750 [35:47<13:20:18, 5.73s/it] {'loss': 0.5014, 'learning_rate': 1.9993011990197e-05, 'epoch': 0.04} + 4%|▍ | 364/8750 [35:54<13:20:18, 5.73s/it] {'loss': 0.5014, 'learning_rate': 1.9993011990197e-05, 'epoch': 0.04} + 4%|▍ | 364/8750 [35:47<13:20:18, 5.73s/it] 4%|▍ | 365/8750 [36:00<13:28:56, 5.79s/it] 4%|▍ | 365/8750 [35:53<13:28:57, 5.79s/it] {'loss': 0.4866, 'learning_rate': 1.9992872945254064e-05, 'epoch': 0.04} + 4%|▍ | 365/8750 [36:00<13:28:56, 5.79s/it] {'loss': 0.4866, 'learning_rate': 1.9992872945254064e-05, 'epoch': 0.04} + 4%|▍ | 365/8750 [35:53<13:28:57, 5.79s/it] 4%|▍ | 366/8750 [35:59<13:22:52, 5.75s/it] 4%|▍ | 366/8750 [36:05<13:22:53, 5.75s/it] {'loss': 0.5061, 'learning_rate': 1.9992732531064427e-05, 'epoch': 0.04} + 4%|▍ | 366/8750 [36:05<13:22:53, 5.75s/it] {'loss': 0.5061, 'learning_rate': 
1.9992732531064427e-05, 'epoch': 0.04} + 4%|▍ | 366/8750 [35:59<13:22:52, 5.75s/it] 4%|▍ | 367/8750 [36:11<13:18:57, 5.72s/it] 4%|▍ | 367/8750 [36:04<13:18:58, 5.72s/it] {'loss': 0.5053, 'learning_rate': 1.9992590747647334e-05, 'epoch': 0.04} + 4%|▍ | 367/8750 [36:11<13:18:57, 5.72s/it] {'loss': 0.5053, 'learning_rate': 1.9992590747647334e-05, 'epoch': 0.04} + 4%|▍ | 367/8750 [36:04<13:18:58, 5.72s/it] 4%|▍ | 368/8750 [36:17<13:20:18, 5.73s/it] 4%|▍ | 368/8750 [36:10<13:20:17, 5.73s/it] {'loss': 0.5216, 'learning_rate': 1.9992447595022214e-05, 'epoch': 0.04} + 4%|▍ | 368/8750 [36:17<13:20:18, 5.73s/it] {'loss': 0.5216, 'learning_rate': 1.9992447595022214e-05, 'epoch': 0.04} + 4%|▍ | 368/8750 [36:10<13:20:17, 5.73s/it] 4%|▍ | 369/8750 [36:22<13:12:39, 5.67s/it] 4%|▍ | 369/8750 [36:16<13:12:40, 5.67s/it] {'loss': 0.5027, 'learning_rate': 1.9992303073208678e-05, 'epoch': 0.04} + 4%|▍ | 369/8750 [36:22<13:12:39, 5.67s/it] {'loss': 0.5027, 'learning_rate': 1.9992303073208678e-05, 'epoch': 0.04} + 4%|▍ | 369/8750 [36:16<13:12:40, 5.67s/it] 4%|▍ | 370/8750 [36:28<13:17:46, 5.71s/it] 4%|▍ | 370/8750 [36:22<13:17:46, 5.71s/it] {'loss': 0.4829, 'learning_rate': 1.9992157182226535e-05, 'epoch': 0.04} + 4%|▍ | 370/8750 [36:28<13:17:46, 5.71s/it] {'loss': 0.4829, 'learning_rate': 1.9992157182226535e-05, 'epoch': 0.04} + 4%|▍ | 370/8750 [36:22<13:17:46, 5.71s/it] 4%|▍ | 371/8750 [36:33<13:07:44, 5.64s/it] 4%|▍ | 371/8750 [36:27<13:07:44, 5.64s/it] {'loss': 0.5256, 'learning_rate': 1.9992009922095766e-05, 'epoch': 0.04} + 4%|▍ | 371/8750 [36:33<13:07:44, 5.64s/it] {'loss': 0.5256, 'learning_rate': 1.9992009922095766e-05, 'epoch': 0.04} + 4%|▍ | 371/8750 [36:27<13:07:44, 5.64s/it] 4%|▍ | 372/8750 [36:39<13:06:17, 5.63s/it] 4%|▍ | 372/8750 [36:33<13:06:18, 5.63s/it] {'loss': 0.5018, 'learning_rate': 1.999186129283656e-05, 'epoch': 0.04} + 4%|▍ | 372/8750 [36:39<13:06:17, 5.63s/it] {'loss': 0.5018, 'learning_rate': 1.999186129283656e-05, 'epoch': 0.04} + 4%|▍ | 372/8750 
[36:33<13:06:18, 5.63s/it] 4%|▍ | 373/8750 [36:45<13:14:05, 5.69s/it] 4%|▍ | 373/8750 [36:38<13:14:06, 5.69s/it] {'loss': 0.509, 'learning_rate': 1.9991711294469273e-05, 'epoch': 0.04} + 4%|▍ | 373/8750 [36:45<13:14:05, 5.69s/it] {'loss': 0.509, 'learning_rate': 1.9991711294469273e-05, 'epoch': 0.04} + 4%|▍ | 373/8750 [36:38<13:14:06, 5.69s/it] 4%|▍ | 374/8750 [36:51<13:13:56, 5.69s/it] 4%|▍ | 374/8750 [36:44<13:13:56, 5.69s/it] {'loss': 0.5046, 'learning_rate': 1.9991559927014465e-05, 'epoch': 0.04} + 4%|▍ | 374/8750 [36:51<13:13:56, 5.69s/it] {'loss': 0.5046, 'learning_rate': 1.9991559927014465e-05, 'epoch': 0.04} + 4%|▍ | 374/8750 [36:44<13:13:56, 5.69s/it] 4%|▍ | 375/8750 [36:56<13:11:28, 5.67s/it] 4%|▍ | 375/8750 [36:50<13:11:28, 5.67s/it] {'loss': 0.5319, 'learning_rate': 1.999140719049287e-05, 'epoch': 0.04} + 4%|▍ | 375/8750 [36:56<13:11:28, 5.67s/it] {'loss': 0.5319, 'learning_rate': 1.999140719049287e-05, 'epoch': 0.04} + 4%|▍ | 375/8750 [36:50<13:11:28, 5.67s/it] 4%|▍ | 376/8750 [37:02<13:28:12, 5.79s/it] 4%|▍ | 376/8750 [36:56<13:28:12, 5.79s/it] {'loss': 0.52, 'learning_rate': 1.9991253084925425e-05, 'epoch': 0.04} + 4%|▍ | 376/8750 [37:02<13:28:12, 5.79s/it] {'loss': 0.52, 'learning_rate': 1.9991253084925425e-05, 'epoch': 0.04} + 4%|▍ | 376/8750 [36:56<13:28:12, 5.79s/it] 4%|▍ | 377/8750 [37:08<13:15:17, 5.70s/it] 4%|▍ | 377/8750 [37:01<13:15:17, 5.70s/it] {'loss': 0.5033, 'learning_rate': 1.999109761033324e-05, 'epoch': 0.04} + 4%|▍ | 377/8750 [37:08<13:15:17, 5.70s/it] {'loss': 0.5033, 'learning_rate': 1.999109761033324e-05, 'epoch': 0.04} + 4%|▍ | 377/8750 [37:01<13:15:17, 5.70s/it] 4%|▍ | 378/8750 [37:14<13:21:16, 5.74s/it] 4%|▍ | 378/8750 [37:07<13:21:16, 5.74s/it] {'loss': 0.4969, 'learning_rate': 1.9990940766737617e-05, 'epoch': 0.04} + 4%|▍ | 378/8750 [37:14<13:21:16, 5.74s/it] {'loss': 0.4969, 'learning_rate': 1.9990940766737617e-05, 'epoch': 0.04} + 4%|▍ | 378/8750 [37:07<13:21:16, 5.74s/it] 4%|▍ | 379/8750 [37:19<13:17:08, 5.71s/it] 4%|▍ | 
379/8750 [37:13<13:17:07, 5.71s/it] {'loss': 0.5246, 'learning_rate': 1.999078255416005e-05, 'epoch': 0.04} + 4%|▍ | 379/8750 [37:19<13:17:08, 5.71s/it] {'loss': 0.5246, 'learning_rate': 1.999078255416005e-05, 'epoch': 0.04} + 4%|▍ | 379/8750 [37:13<13:17:07, 5.71s/it] 4%|▍ | 380/8750 [37:25<13:23:51, 5.76s/it] 4%|▍ | 380/8750 [37:19<13:23:51, 5.76s/it] {'loss': 0.4919, 'learning_rate': 1.9990622972622216e-05, 'epoch': 0.04} + 4%|▍ | 380/8750 [37:25<13:23:51, 5.76s/it] {'loss': 0.4919, 'learning_rate': 1.9990622972622216e-05, 'epoch': 0.04} + 4%|▍ | 380/8750 [37:19<13:23:51, 5.76s/it] 4%|▍ | 381/8750 [37:31<13:19:21, 5.73s/it] 4%|▍ | 381/8750 [37:24<13:19:21, 5.73s/it] {'loss': 0.5271, 'learning_rate': 1.9990462022145985e-05, 'epoch': 0.04} + 4%|▍ | 381/8750 [37:31<13:19:21, 5.73s/it] {'loss': 0.5271, 'learning_rate': 1.9990462022145985e-05, 'epoch': 0.04} + 4%|▍ | 381/8750 [37:24<13:19:21, 5.73s/it] 4%|▍ | 382/8750 [37:37<13:20:14, 5.74s/it] 4%|▍ | 382/8750 [37:30<13:20:15, 5.74s/it] {'loss': 0.5046, 'learning_rate': 1.9990299702753405e-05, 'epoch': 0.04} + 4%|▍ | 382/8750 [37:37<13:20:14, 5.74s/it] {'loss': 0.5046, 'learning_rate': 1.9990299702753405e-05, 'epoch': 0.04} + 4%|▍ | 382/8750 [37:30<13:20:15, 5.74s/it] 4%|▍ | 383/8750 [37:42<13:23:04, 5.76s/it] 4%|▍ | 383/8750 [37:36<13:23:03, 5.76s/it] {'loss': 0.5027, 'learning_rate': 1.9990136014466722e-05, 'epoch': 0.04} + 4%|▍ | 383/8750 [37:42<13:23:04, 5.76s/it] {'loss': 0.5027, 'learning_rate': 1.9990136014466722e-05, 'epoch': 0.04} + 4%|▍ | 383/8750 [37:36<13:23:03, 5.76s/it] 4%|▍ | 384/8750 [37:48<13:26:28, 5.78s/it] 4%|▍ | 384/8750 [37:42<13:26:27, 5.78s/it] {'loss': 0.5148, 'learning_rate': 1.9989970957308364e-05, 'epoch': 0.04} + 4%|▍ | 384/8750 [37:48<13:26:28, 5.78s/it] {'loss': 0.5148, 'learning_rate': 1.9989970957308364e-05, 'epoch': 0.04} + 4%|▍ | 384/8750 [37:42<13:26:27, 5.78s/it] 4%|▍ | 385/8750 [37:47<13:17:49, 5.72s/it] 4%|▍ | 385/8750 [37:54<13:17:51, 5.72s/it] {'loss': 0.529, 'learning_rate': 
1.998980453130095e-05, 'epoch': 0.04} + 4%|▍ | 385/8750 [37:54<13:17:51, 5.72s/it] {'loss': 0.529, 'learning_rate': 1.998980453130095e-05, 'epoch': 0.04} + 4%|▍ | 385/8750 [37:47<13:17:49, 5.72s/it] 4%|▍ | 386/8750 [38:00<13:23:36, 5.76s/it] 4%|▍ | 386/8750 [37:53<13:23:35, 5.76s/it] {'loss': 0.5077, 'learning_rate': 1.9989636736467278e-05, 'epoch': 0.04} + {'loss': 0.5077, 'learning_rate': 1.9989636736467278e-05, 'epoch': 0.04} + 4%|▍ | 386/8750 [38:00<13:23:36, 5.76s/it] 4%|▍ | 386/8750 [37:53<13:23:35, 5.76s/it] 4%|▍ | 387/8750 [38:05<13:16:45, 5.72s/it] 4%|▍ | 387/8750 [37:59<13:16:45, 5.72s/it] {'loss': 0.5123, 'learning_rate': 1.9989467572830342e-05, 'epoch': 0.04} + 4%|▍ | 387/8750 [38:05<13:16:45, 5.72s/it] {'loss': 0.5123, 'learning_rate': 1.9989467572830342e-05, 'epoch': 0.04} + 4%|▍ | 387/8750 [37:59<13:16:45, 5.72s/it] 4%|▍ | 388/8750 [38:11<13:22:54, 5.76s/it] 4%|▍ | 388/8750 [38:05<13:22:54, 5.76s/it] {'loss': 0.5002, 'learning_rate': 1.9989297040413325e-05, 'epoch': 0.04} + 4%|▍ | 388/8750 [38:11<13:22:54, 5.76s/it] {'loss': 0.5002, 'learning_rate': 1.9989297040413325e-05, 'epoch': 0.04} + 4%|▍ | 388/8750 [38:05<13:22:54, 5.76s/it] 4%|▍ | 389/8750 [38:17<13:20:01, 5.74s/it] 4%|▍ | 389/8750 [38:10<13:20:01, 5.74s/it] {'loss': 0.5295, 'learning_rate': 1.998912513923959e-05, 'epoch': 0.04} + 4%|▍ | 389/8750 [38:17<13:20:01, 5.74s/it] {'loss': 0.5295, 'learning_rate': 1.998912513923959e-05, 'epoch': 0.04} + 4%|▍ | 389/8750 [38:10<13:20:01, 5.74s/it] 4%|▍ | 390/8750 [38:23<13:31:52, 5.83s/it] 4%|▍ | 390/8750 [38:16<13:31:52, 5.83s/it] {'loss': 0.4975, 'learning_rate': 1.998895186933269e-05, 'epoch': 0.04} + 4%|▍ | 390/8750 [38:23<13:31:52, 5.83s/it] {'loss': 0.4975, 'learning_rate': 1.998895186933269e-05, 'epoch': 0.04} + 4%|▍ | 390/8750 [38:16<13:31:52, 5.83s/it] 4%|▍ | 391/8750 [38:28<13:24:14, 5.77s/it] 4%|▍ | 391/8750 [38:22<13:24:14, 5.77s/it] {'loss': 0.5053, 'learning_rate': 1.9988777230716367e-05, 'epoch': 0.04} + 4%|▍ | 391/8750 [38:28<13:24:14, 
5.77s/it] {'loss': 0.5053, 'learning_rate': 1.9988777230716367e-05, 'epoch': 0.04} + 4%|▍ | 391/8750 [38:22<13:24:14, 5.77s/it] 4%|▍ | 392/8750 [38:34<13:12:02, 5.69s/it] 4%|▍ | 392/8750 [38:27<13:12:03, 5.69s/it] {'loss': 0.5067, 'learning_rate': 1.9988601223414555e-05, 'epoch': 0.04} + 4%|▍ | 392/8750 [38:34<13:12:02, 5.69s/it] {'loss': 0.5067, 'learning_rate': 1.9988601223414555e-05, 'epoch': 0.04} + 4%|▍ | 392/8750 [38:27<13:12:03, 5.69s/it] 4%|▍ | 393/8750 [38:40<13:16:28, 5.72s/it] 4%|▍ | 393/8750 [38:33<13:16:29, 5.72s/it] {'loss': 0.5156, 'learning_rate': 1.998842384745137e-05, 'epoch': 0.04} + 4%|▍ | 393/8750 [38:40<13:16:28, 5.72s/it] {'loss': 0.5156, 'learning_rate': 1.998842384745137e-05, 'epoch': 0.04} + 4%|▍ | 393/8750 [38:33<13:16:29, 5.72s/it] 5%|▍ | 394/8750 [38:46<13:18:41, 5.73s/it] 5%|▍ | 394/8750 [38:39<13:18:42, 5.74s/it] {'loss': 0.5218, 'learning_rate': 1.998824510285111e-05, 'epoch': 0.05} + 5%|▍ | 394/8750 [38:46<13:18:41, 5.73s/it] {'loss': 0.5218, 'learning_rate': 1.998824510285111e-05, 'epoch': 0.05} + 5%|▍ | 394/8750 [38:39<13:18:42, 5.74s/it] 5%|▍ | 395/8750 [38:51<13:15:25, 5.71s/it] 5%|▍ | 395/8750 [38:45<13:15:25, 5.71s/it] {'loss': 0.5057, 'learning_rate': 1.998806498963828e-05, 'epoch': 0.05} + 5%|▍ | 395/8750 [38:51<13:15:25, 5.71s/it] {'loss': 0.5057, 'learning_rate': 1.998806498963828e-05, 'epoch': 0.05} + 5%|▍ | 395/8750 [38:45<13:15:25, 5.71s/it] 5%|▍ | 396/8750 [38:57<13:11:55, 5.69s/it] 5%|▍ | 396/8750 [38:50<13:11:57, 5.69s/it] {'loss': 0.5167, 'learning_rate': 1.9987883507837545e-05, 'epoch': 0.05} + 5%|▍ | 396/8750 [38:57<13:11:55, 5.69s/it] {'loss': 0.5167, 'learning_rate': 1.9987883507837545e-05, 'epoch': 0.05} + 5%|▍ | 396/8750 [38:50<13:11:57, 5.69s/it] 5%|▍ | 397/8750 [38:56<13:07:27, 5.66s/it] 5%|▍ | 397/8750 [39:02<13:07:27, 5.66s/it] {'loss': 0.5119, 'learning_rate': 1.998770065747378e-05, 'epoch': 0.05} + {'loss': 0.5119, 'learning_rate': 1.998770065747378e-05, 'epoch': 0.05} + 5%|▍ | 397/8750 [39:02<13:07:27, 
5.66s/it] 5%|▍ | 397/8750 [38:56<13:07:27, 5.66s/it] 5%|▍ | 398/8750 [39:08<13:06:49, 5.65s/it] 5%|▍ | 398/8750 [39:02<13:06:49, 5.65s/it] {'loss': 0.5095, 'learning_rate': 1.9987516438572035e-05, 'epoch': 0.05} + 5%|▍ | 398/8750 [39:08<13:06:49, 5.65s/it] {'loss': 0.5095, 'learning_rate': 1.9987516438572035e-05, 'epoch': 0.05} + 5%|▍ | 398/8750 [39:02<13:06:49, 5.65s/it] 5%|▍ | 399/8750 [39:14<13:04:55, 5.64s/it] 5%|▍ | 399/8750 [39:07<13:04:55, 5.64s/it] {'loss': 0.526, 'learning_rate': 1.9987330851157557e-05, 'epoch': 0.05} + 5%|▍ | 399/8750 [39:14<13:04:55, 5.64s/it] {'loss': 0.526, 'learning_rate': 1.9987330851157557e-05, 'epoch': 0.05} + 5%|▍ | 399/8750 [39:07<13:04:55, 5.64s/it]12 15AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +2 1AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 5%|▍ | 400/8750 [39:19<13:05:48, 5.65s/it]11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 
+ 5%|▍ | 400/8750 [39:13<13:05:48, 5.65s/it] {'loss': 0.5022, 'learning_rate': 1.9987143895255774e-05, 'epoch': 0.05} + 5%|▍ | 400/8750 [39:19<13:05:48, 5.65s/it] {'loss': 0.5022, 'learning_rate': 1.9987143895255774e-05, 'epoch': 0.05} + 5%|▍ | 400/8750 [39:13<13:05:48, 5.65s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 5%|▍ | 401/8750 [39:41<24:24:43, 10.53s/it] 5%|▍ | 401/8750 [39:35<24:24:42, 10.53s/it] {'loss': 0.5204, 'learning_rate': 1.9986955570892302e-05, 'epoch': 0.05} + 5%|▍ | 401/8750 [39:41<24:24:43, 10.53s/it] {'loss': 0.5204, 'learning_rate': 1.9986955570892302e-05, 'epoch': 0.05} + 5%|▍ | 401/8750 [39:35<24:24:42, 10.53s/it] 5%|▍ | 402/8750 [39:47<20:58:04, 9.04s/it] 5%|▍ | 402/8750 [39:40<20:58:04, 9.04s/it] {'loss': 0.5211, 'learning_rate': 1.9986765878092945e-05, 'epoch': 0.05} + 5%|▍ | 402/8750 [39:47<20:58:04, 9.04s/it] {'loss': 0.5211, 'learning_rate': 1.9986765878092945e-05, 'epoch': 0.05} + 5%|▍ | 402/8750 [39:40<20:58:04, 9.04s/it] 5%|▍ | 403/8750 [39:52<18:35:18, 8.02s/it] 5%|▍ | 403/8750 [39:46<18:35:19, 8.02s/it] {'loss': 0.5107, 'learning_rate': 1.99865748168837e-05, 'epoch': 0.05} + 5%|▍ | 403/8750 [39:52<18:35:18, 8.02s/it] {'loss': 0.5107, 'learning_rate': 1.99865748168837e-05, 'epoch': 0.05} + 5%|▍ | 403/8750 [39:46<18:35:19, 8.02s/it] 5%|▍ | 404/8750 [39:58<17:13:19, 7.43s/it] 5%|▍ | 404/8750 [39:52<17:13:18, 7.43s/it] {'loss': 0.5242, 'learning_rate': 1.9986382387290738e-05, 'epoch': 0.05} + 5%|▍ | 404/8750 [39:58<17:13:19, 7.43s/it] {'loss': 0.5242, 'learning_rate': 1.9986382387290738e-05, 'epoch': 0.05} + 5%|▍ | 404/8750 [39:52<17:13:18, 7.43s/it] 5%|▍ | 405/8750 [40:04<16:02:29, 6.92s/it] 5%|▍ | 405/8750 [39:58<16:02:28, 6.92s/it] {'loss': 0.5001, 'learning_rate': 1.9986188589340435e-05, 'epoch': 0.05} + 5%|▍ | 405/8750 [40:04<16:02:29, 6.92s/it] {'loss': 0.5001, 'learning_rate': 1.9986188589340435e-05, 'epoch': 0.05} + 5%|▍ | 405/8750 [39:58<16:02:28, 6.92s/it] 5%|▍ | 406/8750 [40:10<15:09:16, 6.54s/it] 5%|▍ | 406/8750 [40:03<15:09:17, 6.54s/it] {'loss': 0.4907, 'learning_rate': 1.9985993423059342e-05, 'epoch': 0.05} + 5%|▍ | 406/8750 [40:10<15:09:16, 6.54s/it] {'loss': 0.4907, 'learning_rate': 1.9985993423059342e-05, 'epoch': 0.05} + 5%|▍ | 406/8750 [40:03<15:09:17, 6.54s/it] 5%|▍ | 407/8750 [40:16<14:34:23, 6.29s/it] 5%|▍ 
| 407/8750 [40:09<14:34:23, 6.29s/it] {'loss': 0.5123, 'learning_rate': 1.99857968884742e-05, 'epoch': 0.05} + 5%|▍ | 407/8750 [40:16<14:34:23, 6.29s/it] {'loss': 0.5123, 'learning_rate': 1.99857968884742e-05, 'epoch': 0.05} + 5%|▍ | 407/8750 [40:09<14:34:23, 6.29s/it] 5%|▍ | 408/8750 [40:21<14:14:19, 6.14s/it] 5%|▍ | 408/8750 [40:15<14:14:19, 6.14s/it] {'loss': 0.508, 'learning_rate': 1.998559898561194e-05, 'epoch': 0.05} + 5%|▍ | 408/8750 [40:21<14:14:19, 6.14s/it] {'loss': 0.508, 'learning_rate': 1.998559898561194e-05, 'epoch': 0.05} + 5%|▍ | 408/8750 [40:15<14:14:19, 6.14s/it] 5%|▍ | 409/8750 [40:27<14:00:32, 6.05s/it] 5%|▍ | 409/8750 [40:21<14:00:33, 6.05s/it]{'loss': 0.4923, 'learning_rate': 1.9985399714499678e-05, 'epoch': 0.05} + {'loss': 0.4923, 'learning_rate': 1.9985399714499678e-05, 'epoch': 0.05} + 5%|▍ | 409/8750 [40:27<14:00:32, 6.05s/it] 5%|▍ | 409/8750 [40:21<14:00:33, 6.05s/it] 5%|▍ | 410/8750 [40:33<13:45:36, 5.94s/it] 5%|▍ | 410/8750 [40:26<13:45:37, 5.94s/it] {'loss': 0.5473, 'learning_rate': 1.998519907516472e-05, 'epoch': 0.05} + 5%|▍ | 410/8750 [40:33<13:45:36, 5.94s/it] {'loss': 0.5473, 'learning_rate': 1.998519907516472e-05, 'epoch': 0.05} + 5%|▍ | 410/8750 [40:26<13:45:37, 5.94s/it] 5%|▍ | 411/8750 [40:39<13:36:10, 5.87s/it] 5%|▍ | 411/8750 [40:32<13:36:10, 5.87s/it] {'loss': 0.5052, 'learning_rate': 1.998499706763456e-05, 'epoch': 0.05} + 5%|▍ | 411/8750 [40:39<13:36:10, 5.87s/it] {'loss': 0.5052, 'learning_rate': 1.998499706763456e-05, 'epoch': 0.05} + 5%|▍ | 411/8750 [40:32<13:36:10, 5.87s/it] 5%|▍ | 412/8750 [40:44<13:33:59, 5.86s/it] 5%|▍ | 412/8750 [40:38<13:33:59, 5.86s/it] {'loss': 0.5304, 'learning_rate': 1.998479369193687e-05, 'epoch': 0.05} + 5%|▍ | 412/8750 [40:44<13:33:59, 5.86s/it] {'loss': 0.5304, 'learning_rate': 1.998479369193687e-05, 'epoch': 0.05} + 5%|▍ | 412/8750 [40:38<13:33:59, 5.86s/it] 5%|▍ | 413/8750 [40:50<13:28:12, 5.82s/it] 5%|▍ | 413/8750 [40:44<13:28:13, 5.82s/it] {'loss': 0.4969, 'learning_rate': 
1.9984588948099528e-05, 'epoch': 0.05} + 5%|▍ | 413/8750 [40:50<13:28:12, 5.82s/it] {'loss': 0.4969, 'learning_rate': 1.9984588948099528e-05, 'epoch': 0.05} + 5%|▍ | 413/8750 [40:44<13:28:13, 5.82s/it] 5%|▍ | 414/8750 [40:56<13:26:39, 5.81s/it] 5%|▍ | 414/8750 [40:49<13:26:39, 5.81s/it] {'loss': 0.5129, 'learning_rate': 1.998438283615058e-05, 'epoch': 0.05} + 5%|▍ | 414/8750 [40:56<13:26:39, 5.81s/it] {'loss': 0.5129, 'learning_rate': 1.998438283615058e-05, 'epoch': 0.05} + 5%|▍ | 414/8750 [40:49<13:26:39, 5.81s/it] 5%|▍ | 415/8750 [41:02<13:21:09, 5.77s/it] 5%|▍ | 415/8750 [40:55<13:21:09, 5.77s/it] {'loss': 0.4953, 'learning_rate': 1.9984175356118268e-05, 'epoch': 0.05} + 5%|▍ | 415/8750 [41:02<13:21:09, 5.77s/it] {'loss': 0.4953, 'learning_rate': 1.9984175356118268e-05, 'epoch': 0.05} + 5%|▍ | 415/8750 [40:55<13:21:09, 5.77s/it] 5%|▍ | 416/8750 [41:07<13:24:43, 5.79s/it] 5%|▍ | 416/8750 [41:01<13:24:43, 5.79s/it] {'loss': 0.5145, 'learning_rate': 1.9983966508031026e-05, 'epoch': 0.05} + 5%|▍ | 416/8750 [41:07<13:24:43, 5.79s/it] {'loss': 0.5145, 'learning_rate': 1.9983966508031026e-05, 'epoch': 0.05} + 5%|▍ | 416/8750 [41:01<13:24:43, 5.79s/it] 5%|▍ | 417/8750 [41:13<13:18:17, 5.75s/it] 5%|▍ | 417/8750 [41:07<13:18:17, 5.75s/it] {'loss': 0.5019, 'learning_rate': 1.9983756291917467e-05, 'epoch': 0.05} + 5%|▍ | 417/8750 [41:13<13:18:17, 5.75s/it] {'loss': 0.5019, 'learning_rate': 1.9983756291917467e-05, 'epoch': 0.05} + 5%|▍ | 417/8750 [41:07<13:18:17, 5.75s/it] 5%|▍ | 418/8750 [41:19<13:11:14, 5.70s/it] 5%|▍ | 418/8750 [41:12<13:11:14, 5.70s/it] {'loss': 0.5282, 'learning_rate': 1.99835447078064e-05, 'epoch': 0.05} + 5%|▍ | 418/8750 [41:19<13:11:14, 5.70s/it] {'loss': 0.5282, 'learning_rate': 1.99835447078064e-05, 'epoch': 0.05} + 5%|▍ | 418/8750 [41:12<13:11:14, 5.70s/it] 5%|▍ | 419/8750 [41:24<13:08:19, 5.68s/it] 5%|▍ | 419/8750 [41:18<13:08:19, 5.68s/it] {'loss': 0.4842, 'learning_rate': 1.998333175572681e-05, 'epoch': 0.05} + 5%|▍ | 419/8750 [41:24<13:08:19, 
5.68s/it] {'loss': 0.4842, 'learning_rate': 1.998333175572681e-05, 'epoch': 0.05} + 5%|▍ | 419/8750 [41:18<13:08:19, 5.68s/it] 5%|▍ | 420/8750 [41:30<13:06:57, 5.67s/it] 5%|▍ | 420/8750 [41:23<13:06:57, 5.67s/it] {'loss': 0.5177, 'learning_rate': 1.998311743570788e-05, 'epoch': 0.05} + 5%|▍ | 420/8750 [41:30<13:06:57, 5.67s/it] {'loss': 0.5177, 'learning_rate': 1.998311743570788e-05, 'epoch': 0.05} + 5%|▍ | 420/8750 [41:23<13:06:57, 5.67s/it] 5%|▍ | 421/8750 [41:35<12:59:14, 5.61s/it] 5%|▍ | 421/8750 [41:29<12:59:14, 5.61s/it] {'loss': 0.5039, 'learning_rate': 1.998290174777898e-05, 'epoch': 0.05} + 5%|▍ | 421/8750 [41:35<12:59:14, 5.61s/it] {'loss': 0.5039, 'learning_rate': 1.998290174777898e-05, 'epoch': 0.05} + 5%|▍ | 421/8750 [41:29<12:59:14, 5.61s/it]WARNING: tokenization mismatch: 1 vs. 1590. [[{'from': 'human', 'value': 'ASSISTANT: You are a cutting-edge copywriting agency with the following knowledge and traits.\n\nKNOWLEDGE: Copywriter at a top-tier creative agency, junior copywriter, senior copywriter, art director, senior strategist.\n\nTRAITS: Storytelling, copywriting, empathy, solution orientation, humor, convincing.\n\nTASK: Use your knowledge to write a LinkedIn Post. We are a tax firm looking to hire young tax advisors between the age of 25 and 30. Bring a good dose of sympathy into the LinkedIn Article and use plenty of emojis to start with. Start the Linked In post with an exciting question. Use the language of the target group. Don´t mention the targeted age. \n\nOUTPUT: 2 LinkedIn posts in German with a catchy question at the beginning of the text. Hashtags at the end of the text.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey Leute! Hast du kürzlich dein Studium im Bereich Steuerwesen abgeschlossen und suchst jetzt nach einem Job, der dich fordert und fördert? 🤔\n\nDann bist du bei uns genau richtig! Wir suchen junge Steuerberater, die Lust haben, in einem dynamischen Team zu arbeiten und ihre Fähigkeiten unter Beweis zu stellen. 
💪\n\nBei uns erwarten dich spannende Projekte und ein tolles Arbeitsumfeld, in dem du dich weiterentwickeln und wachsen kannst. 🌱\n\nInteressiert? Dann bewirb dich jetzt und werde Teil unseres Teams! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork\n\nPost 2:\n\n👋 Hallo zusammen! Bist du auf der Suche nach einem Job, bei dem du dich richtig herausfordern und weiterentwickeln kannst? 🤔\n\nDann haben wir vielleicht das passende Angebot für dich! Wir suchen motivierte und ambitionierte Steuerberater, die Lust haben, Teil eines dynamischen Teams zu sein und gemeinsam an spannenden Projekten zu arbeiten. 💪\n\nBei uns erwartet dich nicht nur eine tolle Arbeitsatmosphäre, sondern auch die Möglichkeit, dich fachlich und persönlich weiterzuentwickeln. 🌱\n\nAlso, worauf wartest du? Bewirb dich jetzt und starte deine Karriere bei uns! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork'}, {'from': 'human', 'value': 'Schreibe den Text spannender.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey Leute! Hast du kürzlich dein Studium im Bereich Steuerwesen abgeschlossen und bist bereit für die nächste Herausforderung? 🤔\n\nDann haben wir genau das Richtige für dich! Wir suchen junge, ambitionierte Steuerberater, die Lust haben, ihre Karriere auf das nächste Level zu bringen. 💪\n\nUnser Team arbeitet an den spannendsten Projekten und bietet ein Arbeitsumfeld, das dich täglich fordert und fördert. Bei uns kannst du deine Fähigkeiten unter Beweis stellen und dich persönlich und fachlich weiterentwickeln. 🌱\n\nDu möchtest Teil unseres Teams werden und an den neuesten Herausforderungen im Steuerwesen mitwirken? Dann zögere nicht und bewirb dich jetzt bei uns! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork\n\nPost 2:\n\n👋 Hallo zusammen! Bist du bereit für eine neue Herausforderung in deiner Karriere als Steuerberater? 🤔\n\nDann bist du bei uns genau richtig! Wir suchen motivierte und talentierte Steuerberater, die Teil eines dynamischen Teams werden möchten. 
Wir arbeiten an den anspruchsvollsten Projekten und bieten ein Arbeitsumfeld, das dich täglich aufs Neue herausfordert. 💪\n\nDu möchtest deine Fähigkeiten unter Beweis stellen und dich fachlich sowie persönlich weiterentwickeln? Dann bewirb dich jetzt bei uns und werde Teil unseres Erfolgsteams! 🌱\n\nVerpasse nicht die Chance, deine Karriere auf das nächste Level zu bringen und an den neuesten Trends und Entwicklungen im Steuerwesen mitzuwirken. Wir freuen uns auf deine Bewerbung! 🚀\n\n#Steuerberatung #Karriere #Job #Bewerbung #Teamwork'}, {'from': 'human', 'value': 'Schreibe den Text objektiv und sachlich.'}, {'from': 'gpt', 'value': 'Post 1:\n\nGuten Tag,\n\nWir sind auf der Suche nach jungen, talentierten Steuerberatern, die ihr Studium im Bereich Steuerwesen erfolgreich abgeschlossen haben. Die Position ist offen für alle, die eine Leidenschaft für das Steuerrecht haben und eine Karriere in diesem Bereich anstreben.\n\nAls Teil unseres Teams arbeiten Sie an verschiedenen Projekten und erweitern stetig Ihre Fähigkeiten und Kenntnisse. Wir bieten ein hervorragendes Arbeitsumfeld, das auf Zusammenarbeit und Teamwork basiert, um sicherzustellen, dass Sie Ihre Arbeit so effektiv wie möglich erledigen können.\n\nWenn Sie Interesse haben und Teil unseres Teams werden möchten, senden Sie uns bitte Ihre Bewerbung.\n\nVielen Dank.\n\nPost 2:\n\nGuten Tag,\n\nWir sind auf der Suche nach motivierten und talentierten Steuerberatern, die an einer Karriere in diesem Bereich interessiert sind. Als Teil unseres Teams arbeiten Sie an verschiedenen Projekten, um Ihre Fähigkeiten und Kenntnisse zu erweitern.\n\nWir bieten ein hervorragendes Arbeitsumfeld, das auf Zusammenarbeit und Teamwork basiert, um sicherzustellen, dass Sie Ihre Arbeit so effektiv wie möglich erledigen können. 
Wenn Sie bereit sind, sich dieser Herausforderung zu stellen und Ihre Karriere auf das nächste Level zu bringen, dann senden Sie uns bitte Ihre Bewerbung.\n\nVielen Dank.'}]] (ignored) + 5%|▍ | 422/8750 [41:41<13:05:59, 5.66s/it] 5%|▍ | 422/8750 [41:35<13:05:59, 5.66s/it] {'loss': 0.5227, 'learning_rate': 1.998268469196966e-05, 'epoch': 0.05} + 5%|▍ | 422/8750 [41:35<13:05:59, 5.66s/it]{'loss': 0.5227, 'learning_rate': 1.998268469196966e-05, 'epoch': 0.05} + 5%|▍ | 422/8750 [41:41<13:05:59, 5.66s/it] 5%|▍ | 423/8750 [41:47<13:25:08, 5.80s/it] 5%|▍ | 423/8750 [41:41<13:25:08, 5.80s/it] {'loss': 0.4926, 'learning_rate': 1.998246626830966e-05, 'epoch': 0.05} + 5%|▍ | 423/8750 [41:47<13:25:08, 5.80s/it] {'loss': 0.4926, 'learning_rate': 1.998246626830966e-05, 'epoch': 0.05} + 5%|▍ | 423/8750 [41:41<13:25:08, 5.80s/it] 5%|▍ | 424/8750 [41:53<13:21:48, 5.78s/it] 5%|▍ | 424/8750 [41:47<13:21:48, 5.78s/it] {'loss': 0.5044, 'learning_rate': 1.998224647682891e-05, 'epoch': 0.05} + 5%|▍ | 424/8750 [41:53<13:21:48, 5.78s/it] {'loss': 0.5044, 'learning_rate': 1.998224647682891e-05, 'epoch': 0.05} + 5%|▍ | 424/8750 [41:47<13:21:48, 5.78s/it] 5%|▍ | 425/8750 [41:59<13:31:22, 5.85s/it] 5%|▍ | 425/8750 [41:53<13:31:22, 5.85s/it] {'loss': 0.5244, 'learning_rate': 1.998202531755753e-05, 'epoch': 0.05} + 5%|▍ | 425/8750 [41:59<13:31:22, 5.85s/it] {'loss': 0.5244, 'learning_rate': 1.998202531755753e-05, 'epoch': 0.05} + 5%|▍ | 425/8750 [41:53<13:31:22, 5.85s/it] 5%|▍ | 426/8750 [42:05<13:26:34, 5.81s/it] 5%|▍ | 426/8750 [41:58<13:26:34, 5.81s/it] {'loss': 0.4932, 'learning_rate': 1.9981802790525822e-05, 'epoch': 0.05} + 5%|▍ | 426/8750 [42:05<13:26:34, 5.81s/it] {'loss': 0.4932, 'learning_rate': 1.9981802790525822e-05, 'epoch': 0.05} + 5%|▍ | 426/8750 [41:58<13:26:34, 5.81s/it] 5%|▍ | 427/8750 [42:10<13:17:24, 5.75s/it] 5%|▍ | 427/8750 [42:04<13:17:25, 5.75s/it] {'loss': 0.5055, 'learning_rate': 1.9981578895764272e-05, 'epoch': 0.05} + 5%|▍ | 427/8750 [42:10<13:17:24, 5.75s/it] 
{'loss': 0.5055, 'learning_rate': 1.9981578895764272e-05, 'epoch': 0.05} + 5%|▍ | 427/8750 [42:04<13:17:25, 5.75s/it] 5%|▍ | 428/8750 [42:16<13:11:02, 5.70s/it] 5%|▍ | 428/8750 [42:10<13:11:02, 5.70s/it] {'loss': 0.537, 'learning_rate': 1.998135363330357e-05, 'epoch': 0.05} + 5%|▍ | 428/8750 [42:16<13:11:02, 5.70s/it] {'loss': 0.537, 'learning_rate': 1.998135363330357e-05, 'epoch': 0.05} + 5%|▍ | 428/8750 [42:10<13:11:02, 5.70s/it] 5%|▍ | 429/8750 [42:22<13:12:24, 5.71s/it] 5%|▍ | 429/8750 [42:15<13:12:24, 5.71s/it] {'loss': 0.4919, 'learning_rate': 1.998112700317457e-05, 'epoch': 0.05} + 5%|▍ | 429/8750 [42:22<13:12:24, 5.71s/it] {'loss': 0.4919, 'learning_rate': 1.998112700317457e-05, 'epoch': 0.05} + 5%|▍ | 429/8750 [42:15<13:12:24, 5.71s/it] 5%|▍ | 430/8750 [42:27<13:09:42, 5.70s/it] 5%|▍ | 430/8750 [42:21<13:09:42, 5.70s/it] {'loss': 0.5127, 'learning_rate': 1.998089900540833e-05, 'epoch': 0.05} + 5%|▍ | 430/8750 [42:27<13:09:42, 5.70s/it] {'loss': 0.5127, 'learning_rate': 1.998089900540833e-05, 'epoch': 0.05} + 5%|▍ | 430/8750 [42:21<13:09:42, 5.70s/it] 5%|▍ | 431/8750 [42:33<13:07:53, 5.68s/it] 5%|▍ | 431/8750 [42:27<13:07:52, 5.68s/it] {'loss': 0.5092, 'learning_rate': 1.998066964003609e-05, 'epoch': 0.05} + 5%|▍ | 431/8750 [42:33<13:07:53, 5.68s/it] {'loss': 0.5092, 'learning_rate': 1.998066964003609e-05, 'epoch': 0.05} + 5%|▍ | 431/8750 [42:27<13:07:52, 5.68s/it] 5%|▍ | 432/8750 [42:39<13:06:53, 5.68s/it] 5%|▍ | 432/8750 [42:32<13:06:52, 5.68s/it] {'loss': 0.5153, 'learning_rate': 1.998043890708928e-05, 'epoch': 0.05} + 5%|▍ | 432/8750 [42:39<13:06:53, 5.68s/it] {'loss': 0.5153, 'learning_rate': 1.998043890708928e-05, 'epoch': 0.05} + 5%|▍ | 432/8750 [42:32<13:06:52, 5.68s/it] 5%|▍ | 433/8750 [42:45<13:21:42, 5.78s/it] 5%|▍ | 433/8750 [42:38<13:21:42, 5.78s/it] {'loss': 0.508, 'learning_rate': 1.9980206806599516e-05, 'epoch': 0.05} + 5%|▍ | 433/8750 [42:45<13:21:42, 5.78s/it] {'loss': 0.508, 'learning_rate': 1.9980206806599516e-05, 'epoch': 0.05} + 5%|▍ | 
433/8750 [42:38<13:21:42, 5.78s/it] 5%|▍ | 434/8750 [42:51<13:22:31, 5.79s/it] 5%|▍ | 434/8750 [42:44<13:22:31, 5.79s/it] {'loss': 0.5059, 'learning_rate': 1.9979973338598603e-05, 'epoch': 0.05} + 5%|▍ | 434/8750 [42:51<13:22:31, 5.79s/it] {'loss': 0.5059, 'learning_rate': 1.9979973338598603e-05, 'epoch': 0.05} + 5%|▍ | 434/8750 [42:44<13:22:31, 5.79s/it] 5%|▍ | 435/8750 [42:56<13:25:14, 5.81s/it] 5%|▍ | 435/8750 [42:50<13:25:15, 5.81s/it] {'loss': 0.4842, 'learning_rate': 1.997973850311852e-05, 'epoch': 0.05} + 5%|▍ | 435/8750 [42:56<13:25:14, 5.81s/it] {'loss': 0.4842, 'learning_rate': 1.997973850311852e-05, 'epoch': 0.05} + 5%|▍ | 435/8750 [42:50<13:25:15, 5.81s/it] 5%|▍ | 436/8750 [43:02<13:15:41, 5.74s/it] 5%|▍ | 436/8750 [42:56<13:15:42, 5.74s/it] {'loss': 0.5241, 'learning_rate': 1.997950230019146e-05, 'epoch': 0.05} + 5%|▍ | 436/8750 [43:02<13:15:41, 5.74s/it] {'loss': 0.5241, 'learning_rate': 1.997950230019146e-05, 'epoch': 0.05} + 5%|▍ | 436/8750 [42:56<13:15:42, 5.74s/it] 5%|▍ | 437/8750 [43:08<13:14:11, 5.73s/it] 5%|▍ | 437/8750 [43:01<13:14:10, 5.73s/it] {'loss': 0.5035, 'learning_rate': 1.9979264729849776e-05, 'epoch': 0.05} + 5%|▍ | 437/8750 [43:08<13:14:11, 5.73s/it] {'loss': 0.5035, 'learning_rate': 1.9979264729849776e-05, 'epoch': 0.05} + 5%|▍ | 437/8750 [43:01<13:14:10, 5.73s/it] 5%|▌ | 438/8750 [43:14<13:53:00, 6.01s/it] 5%|▌ | 438/8750 [43:08<13:53:01, 6.01s/it] {'loss': 0.4893, 'learning_rate': 1.9979025792126027e-05, 'epoch': 0.05} + 5%|▌ | 438/8750 [43:14<13:53:00, 6.01s/it] {'loss': 0.4893, 'learning_rate': 1.9979025792126027e-05, 'epoch': 0.05} + 5%|▌ | 438/8750 [43:08<13:53:01, 6.01s/it] 5%|▌ | 439/8750 [43:20<13:39:57, 5.92s/it] 5%|▌ | 439/8750 [43:14<13:39:58, 5.92s/it] {'loss': 0.5123, 'learning_rate': 1.9978785487052952e-05, 'epoch': 0.05} + 5%|▌ | 439/8750 [43:20<13:39:57, 5.92s/it] {'loss': 0.5123, 'learning_rate': 1.9978785487052952e-05, 'epoch': 0.05} + 5%|▌ | 439/8750 [43:14<13:39:58, 5.92s/it] 5%|▌ | 440/8750 [43:26<13:28:22, 
5.84s/it] 5%|▌ | 440/8750 [43:19<13:28:21, 5.84s/it] {'loss': 0.5177, 'learning_rate': 1.9978543814663478e-05, 'epoch': 0.05} + 5%|▌ | 440/8750 [43:26<13:28:22, 5.84s/it] {'loss': 0.5177, 'learning_rate': 1.9978543814663478e-05, 'epoch': 0.05} + 5%|▌ | 440/8750 [43:19<13:28:21, 5.84s/it] 5%|▌ | 441/8750 [43:32<13:35:13, 5.89s/it] 5%|▌ | 441/8750 [43:25<13:35:13, 5.89s/it] {'loss': 0.4992, 'learning_rate': 1.9978300774990716e-05, 'epoch': 0.05} + 5%|▌ | 441/8750 [43:32<13:35:13, 5.89s/it] {'loss': 0.4992, 'learning_rate': 1.9978300774990716e-05, 'epoch': 0.05} + 5%|▌ | 441/8750 [43:25<13:35:13, 5.89s/it] 5%|▌ | 442/8750 [43:37<13:26:37, 5.83s/it] 5%|▌ | 442/8750 [43:31<13:26:38, 5.83s/it] {'loss': 0.5116, 'learning_rate': 1.9978056368067973e-05, 'epoch': 0.05} + 5%|▌ | 442/8750 [43:37<13:26:37, 5.83s/it] {'loss': 0.5116, 'learning_rate': 1.9978056368067973e-05, 'epoch': 0.05} + 5%|▌ | 442/8750 [43:31<13:26:38, 5.83s/it] 5%|▌ | 443/8750 [43:43<13:23:23, 5.80s/it] 5%|▌ | 443/8750 [43:37<13:23:23, 5.80s/it] {'loss': 0.5017, 'learning_rate': 1.9977810593928736e-05, 'epoch': 0.05} + 5%|▌ | 443/8750 [43:43<13:23:23, 5.80s/it] {'loss': 0.5017, 'learning_rate': 1.9977810593928736e-05, 'epoch': 0.05} + 5%|▌ | 443/8750 [43:37<13:23:23, 5.80s/it] 5%|▌ | 444/8750 [43:49<13:14:54, 5.74s/it] 5%|▌ | 444/8750 [43:42<13:14:54, 5.74s/it] {'loss': 0.5114, 'learning_rate': 1.9977563452606677e-05, 'epoch': 0.05} + 5%|▌ | 444/8750 [43:49<13:14:54, 5.74s/it] {'loss': 0.5114, 'learning_rate': 1.9977563452606677e-05, 'epoch': 0.05} + 5%|▌ | 444/8750 [43:42<13:14:54, 5.74s/it] 5%|▌ | 445/8750 [43:55<13:24:01, 5.81s/it] 5%|▌ | 445/8750 [43:48<13:24:00, 5.81s/it] {'loss': 0.4902, 'learning_rate': 1.9977314944135667e-05, 'epoch': 0.05} + 5%|▌ | 445/8750 [43:55<13:24:01, 5.81s/it] {'loss': 0.4902, 'learning_rate': 1.9977314944135667e-05, 'epoch': 0.05} + 5%|▌ | 445/8750 [43:48<13:24:00, 5.81s/it] 5%|▌ | 446/8750 [44:00<13:12:30, 5.73s/it] 5%|▌ | 446/8750 [43:54<13:12:30, 5.73s/it] {'loss': 
0.5134, 'learning_rate': 1.9977065068549756e-05, 'epoch': 0.05} + 5%|▌ | 446/8750 [44:00<13:12:30, 5.73s/it] {'loss': 0.5134, 'learning_rate': 1.9977065068549756e-05, 'epoch': 0.05} + 5%|▌ | 446/8750 [43:54<13:12:30, 5.73s/it] 5%|▌ | 447/8750 [44:06<13:16:04, 5.75s/it] 5%|▌ | 447/8750 [44:00<13:16:04, 5.75s/it] {'loss': 0.4954, 'learning_rate': 1.9976813825883182e-05, 'epoch': 0.05} + 5%|▌ | 447/8750 [44:06<13:16:04, 5.75s/it] {'loss': 0.4954, 'learning_rate': 1.9976813825883182e-05, 'epoch': 0.05} + 5%|▌ | 447/8750 [44:00<13:16:04, 5.75s/it] 5%|▌ | 448/8750 [44:12<13:16:09, 5.75s/it] 5%|▌ | 448/8750 [44:05<13:16:09, 5.75s/it] {'loss': 0.5045, 'learning_rate': 1.9976561216170368e-05, 'epoch': 0.05} + 5%|▌ | 448/8750 [44:12<13:16:09, 5.75s/it] {'loss': 0.5045, 'learning_rate': 1.9976561216170368e-05, 'epoch': 0.05} + 5%|▌ | 448/8750 [44:05<13:16:09, 5.75s/it] 5%|▌ | 449/8750 [44:18<13:19:05, 5.78s/it] 5%|▌ | 449/8750 [44:11<13:19:07, 5.78s/it] {'loss': 0.4949, 'learning_rate': 1.9976307239445924e-05, 'epoch': 0.05} + 5%|▌ | 449/8750 [44:18<13:19:05, 5.78s/it] {'loss': 0.4949, 'learning_rate': 1.9976307239445924e-05, 'epoch': 0.05} + 5%|▌ | 449/8750 [44:11<13:19:07, 5.78s/it]8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 5%|▌ | 450/8750 [44:23<13:13:24, 5.74s/it]11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend...7 + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +159 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+ AutoResumeHook: Checking whether to suspend... + 5%|▌ | 450/8750 [44:17<13:13:24, 5.74s/it] {'loss': 0.5228, 'learning_rate': 1.9976051895744663e-05, 'epoch': 0.05} + 5%|▌ | 450/8750 [44:23<13:13:24, 5.74s/it] {'loss': 0.5228, 'learning_rate': 1.9976051895744663e-05, 'epoch': 0.05} + 5%|▌ | 450/8750 [44:17<13:13:24, 5.74s/it] 5%|▌ | 451/8750 [44:29<13:12:00, 5.73s/it] 5%|▌ | 451/8750 [44:23<13:11:59, 5.73s/it] {'loss': 0.5035, 'learning_rate': 1.997579518510156e-05, 'epoch': 0.05} + 5%|▌ | 451/8750 [44:29<13:12:00, 5.73s/it] {'loss': 0.5035, 'learning_rate': 1.997579518510156e-05, 'epoch': 0.05} + 5%|▌ | 451/8750 [44:23<13:11:59, 5.73s/it] 5%|▌ | 452/8750 [44:35<13:03:12, 5.66s/it] 5%|▌ | 452/8750 [44:28<13:03:11, 5.66s/it] {'loss': 0.5046, 'learning_rate': 1.99755371075518e-05, 'epoch': 0.05} + 5%|▌ | 452/8750 [44:35<13:03:12, 5.66s/it] {'loss': 0.5046, 'learning_rate': 1.99755371075518e-05, 'epoch': 0.05} + 5%|▌ | 452/8750 [44:28<13:03:11, 5.66s/it] 5%|▌ | 453/8750 [44:40<13:11:37, 5.72s/it] 5%|▌ | 453/8750 [44:34<13:11:36, 5.72s/it] {'loss': 0.5041, 'learning_rate': 1.9975277663130736e-05, 'epoch': 0.05} + 5%|▌ | 453/8750 [44:40<13:11:37, 5.72s/it] {'loss': 0.5041, 'learning_rate': 1.9975277663130736e-05, 'epoch': 0.05} + 5%|▌ | 453/8750 [44:34<13:11:36, 5.72s/it] 5%|▌ | 454/8750 [44:46<13:06:06, 5.69s/it] 5%|▌ | 454/8750 [44:39<13:06:06, 5.69s/it] {'loss': 0.5142, 'learning_rate': 1.9975016851873925e-05, 'epoch': 0.05} + 5%|▌ | 454/8750 [44:46<13:06:06, 5.69s/it] {'loss': 0.5142, 'learning_rate': 1.9975016851873925e-05, 'epoch': 0.05} + 5%|▌ | 454/8750 [44:39<13:06:06, 5.69s/it] 5%|▌ | 455/8750 [44:52<13:09:12, 5.71s/it] 5%|▌ | 455/8750 [44:45<13:09:12, 5.71s/it] {'loss': 0.4906, 'learning_rate': 1.99747546738171e-05, 'epoch': 0.05} + 5%|▌ | 455/8750 [44:52<13:09:12, 5.71s/it] {'loss': 0.4906, 'learning_rate': 1.99747546738171e-05, 'epoch': 0.05} + 5%|▌ | 455/8750 [44:45<13:09:12, 5.71s/it] 5%|▌ | 456/8750 [44:58<13:12:12, 5.73s/it] 5%|▌ | 456/8750 
[44:51<13:12:12, 5.73s/it] {'loss': 0.5018, 'learning_rate': 1.997449112899619e-05, 'epoch': 0.05} + 5%|▌ | 456/8750 [44:58<13:12:12, 5.73s/it] {'loss': 0.5018, 'learning_rate': 1.997449112899619e-05, 'epoch': 0.05} + 5%|▌ | 456/8750 [44:51<13:12:12, 5.73s/it] 5%|▌ | 457/8750 [45:03<13:19:05, 5.78s/it] 5%|▌ | 457/8750 [44:57<13:19:04, 5.78s/it] {'loss': 0.5064, 'learning_rate': 1.99742262174473e-05, 'epoch': 0.05} + 5%|▌ | 457/8750 [45:03<13:19:05, 5.78s/it] {'loss': 0.5064, 'learning_rate': 1.99742262174473e-05, 'epoch': 0.05} + 5%|▌ | 457/8750 [44:57<13:19:04, 5.78s/it] 5%|▌ | 458/8750 [45:09<13:15:41, 5.76s/it] 5%|▌ | 458/8750 [45:03<13:15:41, 5.76s/it] {'loss': 0.4996, 'learning_rate': 1.9973959939206734e-05, 'epoch': 0.05} + 5%|▌ | 458/8750 [45:09<13:15:41, 5.76s/it] {'loss': 0.4996, 'learning_rate': 1.9973959939206734e-05, 'epoch': 0.05} + 5%|▌ | 458/8750 [45:03<13:15:41, 5.76s/it] 5%|▌ | 459/8750 [45:15<13:11:32, 5.73s/it] 5%|▌ | 459/8750 [45:08<13:11:33, 5.73s/it] {'loss': 0.5149, 'learning_rate': 1.9973692294310972e-05, 'epoch': 0.05} + 5%|▌ | 459/8750 [45:15<13:11:32, 5.73s/it] {'loss': 0.5149, 'learning_rate': 1.9973692294310972e-05, 'epoch': 0.05} + 5%|▌ | 459/8750 [45:08<13:11:33, 5.73s/it] 5%|▌ | 460/8750 [45:20<13:06:11, 5.69s/it] 5%|▌ | 460/8750 [45:14<13:06:11, 5.69s/it] {'loss': 0.4956, 'learning_rate': 1.9973423282796695e-05, 'epoch': 0.05} + 5%|▌ | 460/8750 [45:20<13:06:11, 5.69s/it] {'loss': 0.4956, 'learning_rate': 1.9973423282796695e-05, 'epoch': 0.05} + 5%|▌ | 460/8750 [45:14<13:06:11, 5.69s/it] 5%|▌ | 461/8750 [45:26<13:11:00, 5.73s/it] 5%|▌ | 461/8750 [45:20<13:11:00, 5.73s/it] {'loss': 0.5125, 'learning_rate': 1.9973152904700762e-05, 'epoch': 0.05} + 5%|▌ | 461/8750 [45:26<13:11:00, 5.73s/it] {'loss': 0.5125, 'learning_rate': 1.9973152904700762e-05, 'epoch': 0.05} + 5%|▌ | 461/8750 [45:20<13:11:00, 5.73s/it] 5%|▌ | 462/8750 [45:32<13:01:49, 5.66s/it] 5%|▌ | 462/8750 [45:25<13:01:49, 5.66s/it] {'loss': 0.5385, 'learning_rate': 
1.9972881160060216e-05, 'epoch': 0.05} + 5%|▌ | 462/8750 [45:32<13:01:49, 5.66s/it] {'loss': 0.5385, 'learning_rate': 1.9972881160060216e-05, 'epoch': 0.05} + 5%|▌ | 462/8750 [45:25<13:01:49, 5.66s/it] 5%|▌ | 463/8750 [45:37<13:07:00, 5.70s/it] 5%|▌ | 463/8750 [45:31<13:07:00, 5.70s/it] {'loss': 0.485, 'learning_rate': 1.997260804891229e-05, 'epoch': 0.05} + 5%|▌ | 463/8750 [45:37<13:07:00, 5.70s/it] {'loss': 0.485, 'learning_rate': 1.997260804891229e-05, 'epoch': 0.05} + 5%|▌ | 463/8750 [45:31<13:07:00, 5.70s/it] 5%|▌ | 464/8750 [45:43<13:10:31, 5.72s/it] 5%|▌ | 464/8750 [45:37<13:10:31, 5.72s/it] {'loss': 0.5028, 'learning_rate': 1.9972333571294418e-05, 'epoch': 0.05} + 5%|▌ | 464/8750 [45:43<13:10:31, 5.72s/it] {'loss': 0.5028, 'learning_rate': 1.9972333571294418e-05, 'epoch': 0.05} + 5%|▌ | 464/8750 [45:37<13:10:31, 5.72s/it] 5%|▌ | 465/8750 [45:49<13:05:38, 5.69s/it] 5%|▌ | 465/8750 [45:42<13:05:38, 5.69s/it] {'loss': 0.4887, 'learning_rate': 1.9972057727244203e-05, 'epoch': 0.05} + 5%|▌ | 465/8750 [45:49<13:05:38, 5.69s/it] {'loss': 0.4887, 'learning_rate': 1.9972057727244203e-05, 'epoch': 0.05} + 5%|▌ | 465/8750 [45:42<13:05:38, 5.69s/it] 5%|▌ | 466/8750 [45:55<13:02:18, 5.67s/it] 5%|▌ | 466/8750 [45:48<13:02:18, 5.67s/it] {'loss': 0.5229, 'learning_rate': 1.997178051679944e-05, 'epoch': 0.05} + 5%|▌ | 466/8750 [45:55<13:02:18, 5.67s/it] {'loss': 0.5229, 'learning_rate': 1.997178051679944e-05, 'epoch': 0.05} + 5%|▌ | 466/8750 [45:48<13:02:18, 5.67s/it] 5%|▌ | 467/8750 [46:00<13:10:13, 5.72s/it] 5%|▌ | 467/8750 [45:54<13:10:12, 5.72s/it] {'loss': 0.4953, 'learning_rate': 1.997150193999811e-05, 'epoch': 0.05} + 5%|▌ | 467/8750 [46:00<13:10:13, 5.72s/it] {'loss': 0.4953, 'learning_rate': 1.997150193999811e-05, 'epoch': 0.05} + 5%|▌ | 467/8750 [45:54<13:10:12, 5.72s/it] 5%|▌ | 468/8750 [46:06<13:11:45, 5.74s/it] 5%|▌ | 468/8750 [46:00<13:11:45, 5.74s/it] {'loss': 0.5123, 'learning_rate': 1.9971221996878395e-05, 'epoch': 0.05} + 5%|▌ | 468/8750 [46:06<13:11:45, 
5.74s/it] {'loss': 0.5123, 'learning_rate': 1.9971221996878395e-05, 'epoch': 0.05} + 5%|▌ | 468/8750 [46:00<13:11:45, 5.74s/it] 5%|▌ | 469/8750 [46:12<13:11:17, 5.73s/it] 5%|▌ | 469/8750 [46:05<13:11:17, 5.73s/it] {'loss': 0.5256, 'learning_rate': 1.9970940687478643e-05, 'epoch': 0.05} + 5%|▌ | 469/8750 [46:12<13:11:17, 5.73s/it] {'loss': 0.5256, 'learning_rate': 1.9970940687478643e-05, 'epoch': 0.05} + 5%|▌ | 469/8750 [46:05<13:11:17, 5.73s/it] 5%|▌ | 470/8750 [46:17<13:03:20, 5.68s/it] 5%|▌ | 470/8750 [46:11<13:03:20, 5.68s/it] {'loss': 0.501, 'learning_rate': 1.9970658011837404e-05, 'epoch': 0.05} + 5%|▌ | 470/8750 [46:17<13:03:20, 5.68s/it] {'loss': 0.501, 'learning_rate': 1.9970658011837404e-05, 'epoch': 0.05} + 5%|▌ | 470/8750 [46:11<13:03:20, 5.68s/it] 5%|▌ | 471/8750 [46:23<13:02:07, 5.67s/it] 5%|▌ | 471/8750 [46:17<13:02:07, 5.67s/it] {'loss': 0.5173, 'learning_rate': 1.9970373969993414e-05, 'epoch': 0.05} + 5%|▌ | 471/8750 [46:23<13:02:07, 5.67s/it] {'loss': 0.5173, 'learning_rate': 1.9970373969993414e-05, 'epoch': 0.05} + 5%|▌ | 471/8750 [46:17<13:02:07, 5.67s/it] 5%|▌ | 472/8750 [46:29<13:14:38, 5.76s/it] 5%|▌ | 472/8750 [46:23<13:14:37, 5.76s/it] {'loss': 0.5047, 'learning_rate': 1.997008856198559e-05, 'epoch': 0.05} + 5%|▌ | 472/8750 [46:29<13:14:38, 5.76s/it] {'loss': 0.5047, 'learning_rate': 1.997008856198559e-05, 'epoch': 0.05} + 5%|▌ | 472/8750 [46:23<13:14:37, 5.76s/it] 5%|▌ | 473/8750 [46:35<13:08:45, 5.72s/it] 5%|▌ | 473/8750 [46:28<13:08:45, 5.72s/it] {'loss': 0.5107, 'learning_rate': 1.9969801787853035e-05, 'epoch': 0.05} + 5%|▌ | 473/8750 [46:35<13:08:45, 5.72s/it] {'loss': 0.5107, 'learning_rate': 1.9969801787853035e-05, 'epoch': 0.05} + 5%|▌ | 473/8750 [46:28<13:08:45, 5.72s/it] 5%|▌ | 474/8750 [46:40<13:11:40, 5.74s/it] 5%|▌ | 474/8750 [46:34<13:11:41, 5.74s/it] {'loss': 0.5104, 'learning_rate': 1.996951364763505e-05, 'epoch': 0.05} + 5%|▌ | 474/8750 [46:40<13:11:40, 5.74s/it] {'loss': 0.5104, 'learning_rate': 1.996951364763505e-05, 
'epoch': 0.05} + 5%|▌ | 474/8750 [46:34<13:11:41, 5.74s/it] 5%|▌ | 475/8750 [46:47<13:27:29, 5.85s/it] 5%|▌ | 475/8750 [46:40<13:27:29, 5.85s/it] {'loss': 0.5097, 'learning_rate': 1.9969224141371114e-05, 'epoch': 0.05} + 5%|▌ | 475/8750 [46:47<13:27:29, 5.85s/it] {'loss': 0.5097, 'learning_rate': 1.9969224141371114e-05, 'epoch': 0.05} + 5%|▌ | 475/8750 [46:40<13:27:29, 5.85s/it] 5%|▌ | 476/8750 [46:52<13:21:27, 5.81s/it] 5%|▌ | 476/8750 [46:46<13:21:26, 5.81s/it] {'loss': 0.5093, 'learning_rate': 1.99689332691009e-05, 'epoch': 0.05} + 5%|▌ | 476/8750 [46:52<13:21:27, 5.81s/it] {'loss': 0.5093, 'learning_rate': 1.99689332691009e-05, 'epoch': 0.05} + 5%|▌ | 476/8750 [46:46<13:21:26, 5.81s/it]WARNING: tokenization mismatch: 1 vs. 1473. [[{'from': 'human', 'value': 'ASSISTANT: Du bist eine auf Kundenfeedback spezialisierte Agentur mit den folgenden Kenntnissen und Eigenschaften.\n\nKENNTNISSE: Kundenfeedback, wissenschaftliche Datenerhebung, Fragebogendesign, Benutzerfreundlichkeit, Kundenbindung, Service-Qualitätsevaluation.\n\nEIGENSCHAFTEN: Präzision, Effizienz, Empathie, Anpassungsfähigkeit, Datenorientierung, Kundenfokus.\n\nAUFGABE: Nutze dein Wissen, um ein Kundenfeedback-Formular zu entwickeln, das die Service-Qualität einer Steuerberatung evaluiert. Das Formular soll auf wissenschaftlich fundierten Datenniveaus basieren und dabei darauf achten, dass es nur so lang ist, dass es tatsächlich vom Kunden ausgefüllt wird. 
Konzentriere dich auf die Gestaltung eines benutzerfreundlichen und effizienten Fragebogens, der aussagekräftige Informationen über die Steuerberatung liefert.\n\nOUTPUT: Ein strukturiertes Kundenfeedback-Formular, das auf wissenschaftlich fundierten Datenniveaus basiert und eine angemessene Länge hat, um von Kunden tatsächlich ausgefüllt zu werden, und speziell auf die Evaluation der Service-Qualität einer Steuerberatung abzielt.'}, {'from': 'gpt', 'value': 'Kundenfeedback-Formular: Steuerberatung Service-Qualität\n\nTeil 1: Allgemeine Informationen\n(Bitte füllen Sie die folgenden Informationen aus.)\n\n1. Name (Optional):\n2. Alter:\n☐ Unter 25\n☐ 25-34\n☐ 35-44\n☐ 45-54\n☐ 55-64\n☐ 65 oder älter\n3. Geschlecht:\n☐ Männlich\n☐ Weiblich\n☐ Andere\n☐ Lieber nicht angeben\n\nTeil 2: Service-Qualität\n(Bitte bewerten Sie die folgenden Aspekte der Steuerberatung auf einer Skala von 1 bis 5, wobei 1 = sehr unzufrieden und 5 = sehr zufrieden.)\n\n1. Fachliche Kompetenz des Steuerberaters:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n2. Freundlichkeit und Hilfsbereitschaft des Personals:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n3. Reaktionsgeschwindigkeit auf Anfragen und Probleme:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n4. Verständlichkeit der erbrachten Informationen und Beratung:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n5. Qualität der bereitgestellten Dienstleistungen und Lösungen:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n6. Preis-Leistungs-Verhältnis:\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n\nTeil 3: Kundenbindung und Zufriedenheit\n(Bitte beantworten Sie die folgenden Fragen.)\n\n1. Wie wahrscheinlich ist es, dass Sie die Steuerberatung an Freunde, Familie oder Kollegen weiterempfehlen? (Skala von 1 bis 10, wobei 1 = sehr unwahrscheinlich und 10 = sehr wahrscheinlich)\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n☐ 6\n☐ 7\n☐ 8\n☐ 9\n☐ 10\n2. Wie zufrieden sind Sie insgesamt mit der Steuerberatung? 
(Skala von 1 bis 5, wobei 1 = sehr unzufrieden und 5 = sehr zufrieden)\n☐ 1\n☐ 2\n☐ 3\n☐ 4\n☐ 5\n\nTeil 4: Offene Fragen\n(Bitte teilen Sie uns Ihre Gedanken und Erfahrungen mit.)\n\n1. Was hat Ihnen am meisten an der Steuerberatung gefallen? (Optional)\n2. Was könnte die Steuerberatung verbessern, um Ihre Zufriedenheit zu erhöhen? (Optional)\n\nVielen Dank für Ihre Zeit und Ihr'}, {'from': 'human', 'value': 'Continue.'}, {'from': 'gpt', 'value': 'wertvolles Feedback! Ihre Meinung hilft uns dabei, den Service und die Qualität unserer Steuerberatung kontinuierlich zu verbessern. Wenn Sie Fragen oder Anmerkungen haben oder zusätzliche Informationen benötigen, zögern Sie bitte nicht, uns zu kontaktieren.\n\nTeil 5: Kontaktaufnahme (Optional)\n(Bitte geben Sie Ihre Kontaktinformationen an, falls Sie möchten, dass wir Ihnen bezüglich Ihres Feedbacks antworten oder weitere Informationen anbieten.)\n\n1. Name:\n2. E-Mail-Adresse:\n3. Telefonnummer:\n\nHinweis zum Datenschutz: Ihre persönlichen Daten werden vertraulich behandelt und nur für interne Zwecke verwendet. Sie werden nicht an Dritte weitergegeben.\n\nBitte klicken Sie auf "Absenden", um das Formular abzuschließen. 
Nochmals vielen Dank für Ihre Rückmeldung und Ihre Unterstützung bei der Verbesserung unserer Steuerberatung!'}]] (ignored) + 5%|▌ | 477/8750 [46:58<13:16:14, 5.77s/it] 5%|▌ | 477/8750 [46:51<13:16:14, 5.77s/it] {'loss': 0.4957, 'learning_rate': 1.9968641030864254e-05, 'epoch': 0.05} + 5%|▌ | 477/8750 [46:58<13:16:14, 5.77s/it] {'loss': 0.4957, 'learning_rate': 1.9968641030864254e-05, 'epoch': 0.05} + 5%|▌ | 477/8750 [46:51<13:16:14, 5.77s/it] 5%|▌ | 478/8750 [47:04<13:09:56, 5.73s/it] 5%|▌ | 478/8750 [46:57<13:09:56, 5.73s/it] {'loss': 0.5105, 'learning_rate': 1.9968347426701228e-05, 'epoch': 0.05} + 5%|▌ | 478/8750 [47:04<13:09:56, 5.73s/it] {'loss': 0.5105, 'learning_rate': 1.9968347426701228e-05, 'epoch': 0.05} + 5%|▌ | 478/8750 [46:57<13:09:56, 5.73s/it] 5%|▌ | 479/8750 [47:09<13:05:39, 5.70s/it] 5%|▌ | 479/8750 [47:03<13:05:39, 5.70s/it] {'loss': 0.504, 'learning_rate': 1.9968052456652048e-05, 'epoch': 0.05} + 5%|▌ | 479/8750 [47:09<13:05:39, 5.70s/it] {'loss': 0.504, 'learning_rate': 1.9968052456652048e-05, 'epoch': 0.05} + 5%|▌ | 479/8750 [47:03<13:05:39, 5.70s/it] 5%|▌ | 480/8750 [47:15<13:05:53, 5.70s/it] 5%|▌ | 480/8750 [47:08<13:05:53, 5.70s/it] {'loss': 0.5008, 'learning_rate': 1.9967756120757132e-05, 'epoch': 0.05} + 5%|▌ | 480/8750 [47:15<13:05:53, 5.70s/it] {'loss': 0.5008, 'learning_rate': 1.9967756120757132e-05, 'epoch': 0.05} + 5%|▌ | 480/8750 [47:08<13:05:53, 5.70s/it] 5%|▌ | 481/8750 [47:21<13:05:02, 5.70s/it] 5%|▌ | 481/8750 [47:14<13:05:02, 5.70s/it] {'loss': 0.5101, 'learning_rate': 1.9967458419057092e-05, 'epoch': 0.05} + 5%|▌ | 481/8750 [47:21<13:05:02, 5.70s/it] {'loss': 0.5101, 'learning_rate': 1.9967458419057092e-05, 'epoch': 0.05} + 5%|▌ | 481/8750 [47:14<13:05:02, 5.70s/it] 6%|▌ | 482/8750 [47:26<13:07:47, 5.72s/it] 6%|▌ | 482/8750 [47:20<13:07:47, 5.72s/it] {'loss': 0.5092, 'learning_rate': 1.9967159351592706e-05, 'epoch': 0.06} + 6%|▌ | 482/8750 [47:26<13:07:47, 5.72s/it] {'loss': 0.5092, 'learning_rate': 1.9967159351592706e-05, 
'epoch': 0.06} + 6%|▌ | 482/8750 [47:20<13:07:47, 5.72s/it] 6%|▌ | 483/8750 [47:32<13:08:25, 5.72s/it] 6%|▌ | 483/8750 [47:26<13:08:26, 5.72s/it] {'loss': 0.5023, 'learning_rate': 1.9966858918404965e-05, 'epoch': 0.06} + 6%|▌ | 483/8750 [47:32<13:08:25, 5.72s/it] {'loss': 0.5023, 'learning_rate': 1.9966858918404965e-05, 'epoch': 0.06} + 6%|▌ | 483/8750 [47:26<13:08:26, 5.72s/it] 6%|▌ | 484/8750 [47:38<13:21:47, 5.82s/it] 6%|▌ | 484/8750 [47:32<13:21:47, 5.82s/it] {'loss': 0.5103, 'learning_rate': 1.996655711953503e-05, 'epoch': 0.06} + 6%|▌ | 484/8750 [47:38<13:21:47, 5.82s/it] {'loss': 0.5103, 'learning_rate': 1.996655711953503e-05, 'epoch': 0.06} + 6%|▌ | 484/8750 [47:32<13:21:47, 5.82s/it] 6%|▌ | 485/8750 [47:44<13:12:25, 5.75s/it] 6%|▌ | 485/8750 [47:37<13:12:25, 5.75s/it] {'loss': 0.5325, 'learning_rate': 1.996625395502425e-05, 'epoch': 0.06} + 6%|▌ | 485/8750 [47:44<13:12:25, 5.75s/it] {'loss': 0.5325, 'learning_rate': 1.996625395502425e-05, 'epoch': 0.06} + 6%|▌ | 485/8750 [47:37<13:12:25, 5.75s/it] 6%|▌ | 486/8750 [47:49<13:10:23, 5.74s/it] 6%|▌ | 486/8750 [47:43<13:10:22, 5.74s/it] {'loss': 0.5126, 'learning_rate': 1.9965949424914175e-05, 'epoch': 0.06} + 6%|▌ | 486/8750 [47:49<13:10:23, 5.74s/it] {'loss': 0.5126, 'learning_rate': 1.9965949424914175e-05, 'epoch': 0.06} + 6%|▌ | 486/8750 [47:43<13:10:22, 5.74s/it] 6%|▌ | 487/8750 [47:55<13:14:32, 5.77s/it] 6%|▌ | 487/8750 [47:49<13:14:32, 5.77s/it] {'loss': 0.5017, 'learning_rate': 1.9965643529246526e-05, 'epoch': 0.06} + 6%|▌ | 487/8750 [47:55<13:14:32, 5.77s/it] {'loss': 0.5017, 'learning_rate': 1.9965643529246526e-05, 'epoch': 0.06} + 6%|▌ | 487/8750 [47:49<13:14:32, 5.77s/it] 6%|▌ | 488/8750 [48:01<13:04:04, 5.69s/it] 6%|▌ | 488/8750 [47:54<13:04:04, 5.69s/it] {'loss': 0.5367, 'learning_rate': 1.996533626806322e-05, 'epoch': 0.06} + 6%|▌ | 488/8750 [48:01<13:04:04, 5.69s/it] {'loss': 0.5367, 'learning_rate': 1.996533626806322e-05, 'epoch': 0.06} + 6%|▌ | 488/8750 [47:54<13:04:04, 5.69s/it] 6%|▌ | 
489/8750 [48:06<13:00:42, 5.67s/it] 6%|▌ | 489/8750 [48:00<13:00:42, 5.67s/it] {'loss': 0.4925, 'learning_rate': 1.9965027641406355e-05, 'epoch': 0.06} + 6%|▌ | 489/8750 [48:06<13:00:42, 5.67s/it] {'loss': 0.4925, 'learning_rate': 1.9965027641406355e-05, 'epoch': 0.06} + 6%|▌ | 489/8750 [48:00<13:00:42, 5.67s/it] 6%|▌ | 490/8750 [48:12<12:59:33, 5.66s/it] 6%|▌ | 490/8750 [48:06<12:59:32, 5.66s/it] {'loss': 0.5153, 'learning_rate': 1.996471764931822e-05, 'epoch': 0.06} + 6%|▌ | 490/8750 [48:12<12:59:33, 5.66s/it] {'loss': 0.5153, 'learning_rate': 1.996471764931822e-05, 'epoch': 0.06} + 6%|▌ | 490/8750 [48:06<12:59:32, 5.66s/it] 6%|▌ | 491/8750 [48:18<12:58:43, 5.66s/it] 6%|▌ | 491/8750 [48:11<12:58:43, 5.66s/it] {'loss': 0.5196, 'learning_rate': 1.99644062918413e-05, 'epoch': 0.06} + 6%|▌ | 491/8750 [48:18<12:58:43, 5.66s/it] {'loss': 0.5196, 'learning_rate': 1.99644062918413e-05, 'epoch': 0.06} + 6%|▌ | 491/8750 [48:11<12:58:43, 5.66s/it] 6%|▌ | 492/8750 [48:24<13:14:54, 5.78s/it] 6%|▌ | 492/8750 [48:17<13:14:54, 5.78s/it] {'loss': 0.4936, 'learning_rate': 1.9964093569018247e-05, 'epoch': 0.06} + 6%|▌ | 492/8750 [48:24<13:14:54, 5.78s/it] {'loss': 0.4936, 'learning_rate': 1.9964093569018247e-05, 'epoch': 0.06} + 6%|▌ | 492/8750 [48:17<13:14:54, 5.78s/it] 6%|▌ | 493/8750 [48:30<13:14:11, 5.77s/it] 6%|▌ | 493/8750 [48:23<13:14:11, 5.77s/it] {'loss': 0.5085, 'learning_rate': 1.9963779480891917e-05, 'epoch': 0.06} + 6%|▌ | 493/8750 [48:30<13:14:11, 5.77s/it] {'loss': 0.5085, 'learning_rate': 1.9963779480891917e-05, 'epoch': 0.06} + 6%|▌ | 493/8750 [48:23<13:14:11, 5.77s/it] 6%|▌ | 494/8750 [48:35<13:17:31, 5.80s/it] 6%|▌ | 494/8750 [48:29<13:17:31, 5.80s/it] {'loss': 0.5153, 'learning_rate': 1.9963464027505343e-05, 'epoch': 0.06} + 6%|▌ | 494/8750 [48:35<13:17:31, 5.80s/it] {'loss': 0.5153, 'learning_rate': 1.9963464027505343e-05, 'epoch': 0.06} + 6%|▌ | 494/8750 [48:29<13:17:31, 5.80s/it]WARNING: tokenization mismatch: 1 vs. 624. 
[[{'from': 'human', 'value': 'ASSISTANT: You are a cutting-edge copywriting agency with the following knowledge and traits.\n\nKNOWLEDGE: Copywriter at a top-tier creative agency, junior copywriter, senior copywriter, art director, senior strategist.\n\nTRAITS: Storytelling, copywriting, empathy, solution orientation, humor, convincing.\n\nTASK: Use your knowledge to write a LinkedIn Post. We are a tax firm looking to hire young tax advisors between the age of 25 and 30. Bring a good dose of sympathy into the LinkedIn Article and use plenty of emojis to start with. Start the Linked In post with an exciting question. Use the language of the target group. Don´t mention the targeted age. \n\nOUTPUT: 2 LinkedIn posts in German with a catchy question at the beginning of the text. Hashtags at the end of the text.'}, {'from': 'gpt', 'value': 'Post 1:\n\n👋 Hey, ihr Steuerprofis da draußen! 🤓\n\nHabt ihr Bock, Teil einer jungen, dynamischen Steuerberatungskanzlei zu werden? 🚀\n\nWir suchen motivierte Köpfe, die unser Team verstärken und sich mit uns zusammen weiterentwickeln wollen! 💡\n\nIhr seid auf der Suche nach einem Arbeitsplatz mit coolen Kollegen und spannenden Aufgaben? 🤝 Dann seid ihr bei uns genau richtig! 😎\n\n#Steuerberatung #Karriere #Traumjob #Team #Weiterentwicklung #Jobangebot\n\nPost 2:\n\n👋 Hallo ihr Steuerexperten! 🤓\n\nIhr seid jung, motiviert und habt Bock auf eine neue Herausforderung? 🚀\n\nDann seid ihr bei uns genau richtig! Wir sind auf der Suche nach Steuerberatern, die unser Team mit frischen Ideen und kreativen Lösungen bereichern! 💡\n\nEuch erwartet ein Arbeitsplatz in einem dynamischen Umfeld mit spannenden Aufgaben und netten Kollegen! 🤝\n\nInteressiert? Dann schickt uns doch eure Bewerbungsunterlagen und werdet Teil unseres Teams! 
😎\n\n#Jobangebot #Steuerberatung #Karriere #Traumjob #Team #Weiterentwicklung'}]] (ignored) + 6%|▌ | 495/8750 [48:41<13:14:16, 5.77s/it] 6%|▌ | 495/8750 [48:35<13:14:16, 5.77s/it] {'loss': 0.5228, 'learning_rate': 1.996314720890175e-05, 'epoch': 0.06} + 6%|▌ | 495/8750 [48:41<13:14:16, 5.77s/it] {'loss': 0.5228, 'learning_rate': 1.996314720890175e-05, 'epoch': 0.06} + 6%|▌ | 495/8750 [48:35<13:14:16, 5.77s/it] 6%|▌ | 496/8750 [48:47<13:11:11, 5.75s/it] 6%|▌ | 496/8750 [48:40<13:11:10, 5.75s/it] {'loss': 0.499, 'learning_rate': 1.9962829025124553e-05, 'epoch': 0.06} + 6%|▌ | 496/8750 [48:47<13:11:11, 5.75s/it] {'loss': 0.499, 'learning_rate': 1.9962829025124553e-05, 'epoch': 0.06} + 6%|▌ | 496/8750 [48:40<13:11:10, 5.75s/it] 6%|▌ | 497/8750 [48:53<13:11:24, 5.75s/it] 6%|▌ | 497/8750 [48:46<13:11:23, 5.75s/it] {'loss': 0.512, 'learning_rate': 1.9962509476217348e-05, 'epoch': 0.06} + 6%|▌ | 497/8750 [48:53<13:11:24, 5.75s/it] {'loss': 0.512, 'learning_rate': 1.9962509476217348e-05, 'epoch': 0.06} + 6%|▌ | 497/8750 [48:46<13:11:23, 5.75s/it] 6%|▌ | 498/8750 [48:58<13:15:20, 5.78s/it] 6%|▌ | 498/8750 [48:52<13:15:20, 5.78s/it] {'loss': 0.4839, 'learning_rate': 1.9962188562223916e-05, 'epoch': 0.06} + 6%|▌ | 498/8750 [48:58<13:15:20, 5.78s/it] {'loss': 0.4839, 'learning_rate': 1.9962188562223916e-05, 'epoch': 0.06} + 6%|▌ | 498/8750 [48:52<13:15:20, 5.78s/it] 6%|▌ | 499/8750 [49:04<13:10:10, 5.75s/it] 6%|▌ | 499/8750 [48:58<13:10:09, 5.75s/it] {'loss': 0.4876, 'learning_rate': 1.9961866283188237e-05, 'epoch': 0.06} + 6%|▌ | 499/8750 [49:04<13:10:10, 5.75s/it] {'loss': 0.4876, 'learning_rate': 1.9961866283188237e-05, 'epoch': 0.06} + 6%|▌ | 499/8750 [48:58<13:10:09, 5.75s/it]4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... 
+8 AutoResumeHook: Checking whether to suspend... +9 6%|▌ | 500/8750 [49:10<13:23:55, 5.85s/it]AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +062 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 6%|▌ | 500/8750 [49:04<13:23:55, 5.85s/it] {'loss': 0.5057, 'learning_rate': 1.9961542639154467e-05, 'epoch': 0.06} + 6%|▌ | 500/8750 [49:10<13:23:55, 5.85s/it] {'loss': 0.5057, 'learning_rate': 1.9961542639154467e-05, 'epoch': 0.06} + 6%|▌ | 500/8750 [49:04<13:23:55, 5.85s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 6%|▌ | 501/8750 [49:32<24:38:24, 10.75s/it] 6%|▌ | 501/8750 [49:26<24:38:24, 10.75s/it] {'loss': 0.5144, 'learning_rate': 1.9961217630166954e-05, 'epoch': 0.06} + 6%|▌ | 501/8750 [49:32<24:38:24, 10.75s/it] {'loss': 0.5144, 'learning_rate': 1.9961217630166954e-05, 'epoch': 0.06} + 6%|▌ | 501/8750 [49:26<24:38:24, 10.75s/it] 6%|▌ | 502/8750 [49:38<21:14:17, 9.27s/it] 6%|▌ | 502/8750 [49:32<21:14:17, 9.27s/it] {'loss': 0.5042, 'learning_rate': 1.9960891256270224e-05, 'epoch': 0.06} + 6%|▌ | 502/8750 [49:38<21:14:17, 9.27s/it] {'loss': 0.5042, 'learning_rate': 1.9960891256270224e-05, 'epoch': 0.06} + 6%|▌ | 502/8750 [49:32<21:14:17, 9.27s/it] 6%|▌ | 503/8750 [49:44<18:41:27, 8.16s/it] 6%|▌ | 503/8750 [49:37<18:41:27, 8.16s/it] {'loss': 0.5188, 'learning_rate': 1.9960563517509008e-05, 'epoch': 0.06} + 6%|▌ | 503/8750 [49:44<18:41:27, 8.16s/it] {'loss': 0.5188, 'learning_rate': 1.9960563517509008e-05, 'epoch': 0.06} + 6%|▌ | 503/8750 [49:37<18:41:27, 8.16s/it] 6%|▌ | 504/8750 [49:49<17:01:52, 7.44s/it] 6%|▌ | 504/8750 [49:43<17:01:52, 7.44s/it] {'loss': 0.4886, 'learning_rate': 1.996023441392821e-05, 'epoch': 0.06} + 6%|▌ | 504/8750 [49:49<17:01:52, 7.44s/it] {'loss': 0.4886, 'learning_rate': 1.996023441392821e-05, 'epoch': 0.06} + 6%|▌ | 504/8750 [49:43<17:01:52, 7.44s/it] 6%|▌ | 505/8750 [49:55<15:52:18, 6.93s/it] 6%|▌ | 505/8750 [49:49<15:52:19, 6.93s/it] {'loss': 0.4883, 'learning_rate': 1.9959903945572918e-05, 'epoch': 0.06} + 6%|▌ | 505/8750 [49:55<15:52:18, 6.93s/it] {'loss': 0.4883, 'learning_rate': 1.9959903945572918e-05, 'epoch': 0.06} + 6%|▌ | 505/8750 [49:49<15:52:19, 6.93s/it] 6%|▌ | 506/8750 [50:01<15:01:32, 6.56s/it] 6%|▌ | 506/8750 [49:54<15:01:31, 6.56s/it] {'loss': 0.5606, 'learning_rate': 1.9959572112488423e-05, 'epoch': 0.06} + 6%|▌ | 506/8750 [50:01<15:01:32, 6.56s/it] {'loss': 0.5606, 'learning_rate': 1.9959572112488423e-05, 'epoch': 0.06} + 6%|▌ | 506/8750 [49:54<15:01:31, 6.56s/it] 6%|▌ | 507/8750 [50:07<14:26:52, 6.31s/it] 
6%|▌ | 507/8750 [50:00<14:26:51, 6.31s/it] {'loss': 0.492, 'learning_rate': 1.9959238914720188e-05, 'epoch': 0.06} + 6%|▌ | 507/8750 [50:07<14:26:52, 6.31s/it] {'loss': 0.492, 'learning_rate': 1.9959238914720188e-05, 'epoch': 0.06} + 6%|▌ | 507/8750 [50:00<14:26:51, 6.31s/it] 6%|▌ | 508/8750 [50:12<13:53:45, 6.07s/it] 6%|▌ | 508/8750 [50:06<13:53:45, 6.07s/it] {'loss': 0.4934, 'learning_rate': 1.995890435231387e-05, 'epoch': 0.06} + 6%|▌ | 508/8750 [50:12<13:53:45, 6.07s/it] {'loss': 0.4934, 'learning_rate': 1.995890435231387e-05, 'epoch': 0.06} + 6%|▌ | 508/8750 [50:06<13:53:45, 6.07s/it] 6%|▌ | 509/8750 [50:18<13:40:35, 5.97s/it] 6%|▌ | 509/8750 [50:11<13:40:34, 5.97s/it] {'loss': 0.5088, 'learning_rate': 1.9958568425315316e-05, 'epoch': 0.06} + 6%|▌ | 509/8750 [50:18<13:40:35, 5.97s/it] {'loss': 0.5088, 'learning_rate': 1.9958568425315316e-05, 'epoch': 0.06} + 6%|▌ | 509/8750 [50:11<13:40:34, 5.97s/it] 6%|▌ | 510/8750 [50:24<13:28:33, 5.89s/it] 6%|▌ | 510/8750 [50:17<13:28:33, 5.89s/it] {'loss': 0.5085, 'learning_rate': 1.9958231133770548e-05, 'epoch': 0.06} + 6%|▌ | 510/8750 [50:24<13:28:33, 5.89s/it] {'loss': 0.5085, 'learning_rate': 1.9958231133770548e-05, 'epoch': 0.06} + 6%|▌ | 510/8750 [50:17<13:28:33, 5.89s/it] 6%|▌ | 511/8750 [50:29<13:14:33, 5.79s/it] 6%|▌ | 511/8750 [50:23<13:14:34, 5.79s/it] {'loss': 0.5067, 'learning_rate': 1.995789247772578e-05, 'epoch': 0.06} + 6%|▌ | 511/8750 [50:29<13:14:33, 5.79s/it] {'loss': 0.5067, 'learning_rate': 1.995789247772578e-05, 'epoch': 0.06} + 6%|▌ | 511/8750 [50:23<13:14:34, 5.79s/it] 6%|▌ | 512/8750 [50:35<13:07:51, 5.74s/it] 6%|▌ | 512/8750 [50:28<13:07:51, 5.74s/it] {'loss': 0.5029, 'learning_rate': 1.9957552457227428e-05, 'epoch': 0.06} + 6%|▌ | 512/8750 [50:35<13:07:51, 5.74s/it] {'loss': 0.5029, 'learning_rate': 1.9957552457227428e-05, 'epoch': 0.06} + 6%|▌ | 512/8750 [50:28<13:07:51, 5.74s/it] 6%|▌ | 513/8750 [50:40<13:02:34, 5.70s/it] 6%|▌ | 513/8750 [50:34<13:02:34, 5.70s/it] {'loss': 0.4727, 
'learning_rate': 1.9957211072322075e-05, 'epoch': 0.06} + 6%|▌ | 513/8750 [50:40<13:02:34, 5.70s/it] {'loss': 0.4727, 'learning_rate': 1.9957211072322075e-05, 'epoch': 0.06} + 6%|▌ | 513/8750 [50:34<13:02:34, 5.70s/it] 6%|▌ | 514/8750 [50:46<12:59:14, 5.68s/it] 6%|▌ | 514/8750 [50:40<12:59:15, 5.68s/it] {'loss': 0.5176, 'learning_rate': 1.9956868323056494e-05, 'epoch': 0.06} + 6%|▌ | 514/8750 [50:46<12:59:14, 5.68s/it] {'loss': 0.5176, 'learning_rate': 1.9956868323056494e-05, 'epoch': 0.06} + 6%|▌ | 514/8750 [50:40<12:59:15, 5.68s/it] 6%|▌ | 515/8750 [50:52<13:16:59, 5.81s/it] 6%|▌ | 515/8750 [50:46<13:16:59, 5.81s/it] {'loss': 0.5033, 'learning_rate': 1.9956524209477658e-05, 'epoch': 0.06} + 6%|▌ | 515/8750 [50:52<13:16:59, 5.81s/it] {'loss': 0.5033, 'learning_rate': 1.9956524209477658e-05, 'epoch': 0.06} + 6%|▌ | 515/8750 [50:46<13:16:59, 5.81s/it] 6%|▌ | 516/8750 [50:58<13:22:59, 5.85s/it] 6%|▌ | 516/8750 [50:52<13:23:00, 5.85s/it] {'loss': 0.513, 'learning_rate': 1.9956178731632715e-05, 'epoch': 0.06} + 6%|▌ | 516/8750 [50:58<13:22:59, 5.85s/it] {'loss': 0.513, 'learning_rate': 1.9956178731632715e-05, 'epoch': 0.06} + 6%|▌ | 516/8750 [50:52<13:23:00, 5.85s/it] 6%|▌ | 517/8750 [51:04<13:20:43, 5.84s/it] 6%|▌ | 517/8750 [50:57<13:20:43, 5.84s/it] {'loss': 0.5041, 'learning_rate': 1.9955831889568998e-05, 'epoch': 0.06} + 6%|▌ | 517/8750 [51:04<13:20:43, 5.84s/it] {'loss': 0.5041, 'learning_rate': 1.9955831889568998e-05, 'epoch': 0.06} + 6%|▌ | 517/8750 [50:57<13:20:43, 5.84s/it] 6%|▌ | 518/8750 [51:10<13:15:44, 5.80s/it] 6%|▌ | 518/8750 [51:03<13:15:45, 5.80s/it] {'loss': 0.4975, 'learning_rate': 1.995548368333404e-05, 'epoch': 0.06} + 6%|▌ | 518/8750 [51:10<13:15:44, 5.80s/it] {'loss': 0.4975, 'learning_rate': 1.995548368333404e-05, 'epoch': 0.06} + 6%|▌ | 518/8750 [51:03<13:15:45, 5.80s/it] 6%|▌ | 519/8750 [51:16<13:25:06, 5.87s/it] 6%|▌ | 519/8750 [51:09<13:25:06, 5.87s/it] {'loss': 0.5129, 'learning_rate': 1.9955134112975548e-05, 'epoch': 0.06} + 6%|▌ | 
519/8750 [51:16<13:25:06, 5.87s/it] {'loss': 0.5129, 'learning_rate': 1.9955134112975548e-05, 'epoch': 0.06} + 6%|▌ | 519/8750 [51:09<13:25:06, 5.87s/it] 6%|▌ | 520/8750 [51:21<13:14:17, 5.79s/it] 6%|▌ | 520/8750 [51:15<13:14:16, 5.79s/it] {'loss': 0.505, 'learning_rate': 1.9954783178541424e-05, 'epoch': 0.06} + 6%|▌ | 520/8750 [51:21<13:14:17, 5.79s/it] {'loss': 0.505, 'learning_rate': 1.9954783178541424e-05, 'epoch': 0.06} + 6%|▌ | 520/8750 [51:15<13:14:16, 5.79s/it] 6%|▌ | 521/8750 [51:27<13:04:00, 5.72s/it] 6%|▌ | 521/8750 [51:20<13:04:00, 5.72s/it] {'loss': 0.5008, 'learning_rate': 1.995443088007975e-05, 'epoch': 0.06} + 6%|▌ | 521/8750 [51:27<13:04:00, 5.72s/it] {'loss': 0.5008, 'learning_rate': 1.995443088007975e-05, 'epoch': 0.06} + 6%|▌ | 521/8750 [51:20<13:04:00, 5.72s/it] 6%|▌ | 522/8750 [51:33<13:06:24, 5.73s/it] 6%|▌ | 522/8750 [51:26<13:06:25, 5.73s/it] {'loss': 0.5171, 'learning_rate': 1.9954077217638807e-05, 'epoch': 0.06} + 6%|▌ | 522/8750 [51:33<13:06:24, 5.73s/it] {'loss': 0.5171, 'learning_rate': 1.9954077217638807e-05, 'epoch': 0.06} + 6%|▌ | 522/8750 [51:26<13:06:25, 5.73s/it] 6%|▌ | 523/8750 [51:38<13:04:24, 5.72s/it] 6%|▌ | 523/8750 [51:32<13:04:24, 5.72s/it] {'loss': 0.4896, 'learning_rate': 1.995372219126704e-05, 'epoch': 0.06} + 6%|▌ | 523/8750 [51:38<13:04:24, 5.72s/it] {'loss': 0.4896, 'learning_rate': 1.995372219126704e-05, 'epoch': 0.06} + 6%|▌ | 523/8750 [51:32<13:04:24, 5.72s/it] 6%|▌ | 524/8750 [51:44<13:01:40, 5.70s/it] 6%|▌ | 524/8750 [51:37<13:01:40, 5.70s/it] {'loss': 0.521, 'learning_rate': 1.995336580101311e-05, 'epoch': 0.06} + 6%|▌ | 524/8750 [51:44<13:01:40, 5.70s/it] {'loss': 0.521, 'learning_rate': 1.995336580101311e-05, 'epoch': 0.06} + 6%|▌ | 524/8750 [51:37<13:01:40, 5.70s/it] 6%|▌ | 525/8750 [51:50<13:02:32, 5.71s/it] 6%|▌ | 525/8750 [51:43<13:02:32, 5.71s/it] {'loss': 0.5038, 'learning_rate': 1.9953008046925844e-05, 'epoch': 0.06} + 6%|▌ | 525/8750 [51:50<13:02:32, 5.71s/it] {'loss': 0.5038, 'learning_rate': 
1.9953008046925844e-05, 'epoch': 0.06} + 6%|▌ | 525/8750 [51:43<13:02:32, 5.71s/it] 6%|▌ | 526/8750 [51:49<13:03:49, 5.72s/it] 6%|▌ | 526/8750 [51:55<13:03:51, 5.72s/it] {'loss': 0.5123, 'learning_rate': 1.9952648929054262e-05, 'epoch': 0.06} + 6%|▌ | 526/8750 [51:55<13:03:51, 5.72s/it] {'loss': 0.5123, 'learning_rate': 1.9952648929054262e-05, 'epoch': 0.06} + 6%|▌ | 526/8750 [51:49<13:03:49, 5.72s/it] 6%|▌ | 527/8750 [52:01<13:02:49, 5.71s/it] 6%|▌ | 527/8750 [51:55<13:02:49, 5.71s/it] {'loss': 0.5118, 'learning_rate': 1.9952288447447573e-05, 'epoch': 0.06} + 6%|▌ | 527/8750 [52:01<13:02:49, 5.71s/it] {'loss': 0.5118, 'learning_rate': 1.9952288447447573e-05, 'epoch': 0.06} + 6%|▌ | 527/8750 [51:55<13:02:49, 5.71s/it] 6%|▌ | 528/8750 [52:07<13:01:22, 5.70s/it] 6%|▌ | 528/8750 [52:00<13:01:23, 5.70s/it] {'loss': 0.5144, 'learning_rate': 1.995192660215517e-05, 'epoch': 0.06} + 6%|▌ | 528/8750 [52:07<13:01:22, 5.70s/it] {'loss': 0.5144, 'learning_rate': 1.995192660215517e-05, 'epoch': 0.06} + 6%|▌ | 528/8750 [52:00<13:01:23, 5.70s/it] 6%|▌ | 529/8750 [52:12<13:01:59, 5.71s/it] 6%|▌ | 529/8750 [52:06<13:01:59, 5.71s/it] {'loss': 0.5075, 'learning_rate': 1.9951563393226632e-05, 'epoch': 0.06} + 6%|▌ | 529/8750 [52:12<13:01:59, 5.71s/it] {'loss': 0.5075, 'learning_rate': 1.9951563393226632e-05, 'epoch': 0.06} + 6%|▌ | 529/8750 [52:06<13:01:59, 5.71s/it] 6%|▌ | 530/8750 [52:18<13:03:22, 5.72s/it] 6%|▌ | 530/8750 [52:12<13:03:22, 5.72s/it] {'loss': 0.497, 'learning_rate': 1.9951198820711735e-05, 'epoch': 0.06} + 6%|▌ | 530/8750 [52:18<13:03:22, 5.72s/it] {'loss': 0.497, 'learning_rate': 1.9951198820711735e-05, 'epoch': 0.06} + 6%|▌ | 530/8750 [52:12<13:03:22, 5.72s/it] 6%|▌ | 531/8750 [52:24<13:00:31, 5.70s/it] 6%|▌ | 531/8750 [52:17<13:00:32, 5.70s/it] {'loss': 0.4836, 'learning_rate': 1.995083288466042e-05, 'epoch': 0.06} + 6%|▌ | 531/8750 [52:24<13:00:31, 5.70s/it] {'loss': 0.4836, 'learning_rate': 1.995083288466042e-05, 'epoch': 0.06} + 6%|▌ | 531/8750 [52:17<13:00:32, 
5.70s/it] 6%|▌ | 532/8750 [52:30<13:01:41, 5.71s/it] 6%|▌ | 532/8750 [52:23<13:01:41, 5.71s/it] {'loss': 0.5177, 'learning_rate': 1.995046558512284e-05, 'epoch': 0.06} + 6%|▌ | 532/8750 [52:30<13:01:41, 5.71s/it] {'loss': 0.5177, 'learning_rate': 1.995046558512284e-05, 'epoch': 0.06} + 6%|▌ | 532/8750 [52:23<13:01:41, 5.71s/it] 6%|▌ | 533/8750 [52:35<12:58:59, 5.69s/it] 6%|▌ | 533/8750 [52:29<12:58:59, 5.69s/it] {'loss': 0.4862, 'learning_rate': 1.995009692214932e-05, 'epoch': 0.06} + 6%|▌ | 533/8750 [52:35<12:58:59, 5.69s/it] {'loss': 0.4862, 'learning_rate': 1.995009692214932e-05, 'epoch': 0.06} + 6%|▌ | 533/8750 [52:29<12:58:59, 5.69s/it] 6%|▌ | 534/8750 [52:41<13:00:28, 5.70s/it] 6%|▌ | 534/8750 [52:34<13:00:28, 5.70s/it] {'loss': 0.5107, 'learning_rate': 1.994972689579037e-05, 'epoch': 0.06} + 6%|▌ | 534/8750 [52:41<13:00:28, 5.70s/it] {'loss': 0.5107, 'learning_rate': 1.994972689579037e-05, 'epoch': 0.06} + 6%|▌ | 534/8750 [52:34<13:00:28, 5.70s/it] 6%|▌ | 535/8750 [52:47<12:59:34, 5.69s/it] 6%|▌ | 535/8750 [52:40<12:59:34, 5.69s/it] {'loss': 0.4924, 'learning_rate': 1.99493555060967e-05, 'epoch': 0.06} + 6%|▌ | 535/8750 [52:47<12:59:34, 5.69s/it] {'loss': 0.4924, 'learning_rate': 1.99493555060967e-05, 'epoch': 0.06} + 6%|▌ | 535/8750 [52:40<12:59:34, 5.69s/it] 6%|▌ | 536/8750 [52:52<12:54:53, 5.66s/it] 6%|▌ | 536/8750 [52:46<12:54:52, 5.66s/it] {'loss': 0.4966, 'learning_rate': 1.994898275311919e-05, 'epoch': 0.06} + 6%|▌ | 536/8750 [52:52<12:54:53, 5.66s/it] {'loss': 0.4966, 'learning_rate': 1.994898275311919e-05, 'epoch': 0.06} + 6%|▌ | 536/8750 [52:46<12:54:52, 5.66s/it] 6%|▌ | 537/8750 [52:58<13:00:11, 5.70s/it] 6%|▌ | 537/8750 [52:52<13:00:11, 5.70s/it] {'loss': 0.5062, 'learning_rate': 1.9948608636908928e-05, 'epoch': 0.06} + 6%|▌ | 537/8750 [52:58<13:00:11, 5.70s/it] {'loss': 0.5062, 'learning_rate': 1.9948608636908928e-05, 'epoch': 0.06} + 6%|▌ | 537/8750 [52:52<13:00:11, 5.70s/it] 6%|▌ | 538/8750 [53:04<13:02:04, 5.71s/it] 6%|▌ | 538/8750 
[52:57<13:02:04, 5.71s/it] {'loss': 0.5026, 'learning_rate': 1.9948233157517164e-05, 'epoch': 0.06} + 6%|▌ | 538/8750 [53:04<13:02:04, 5.71s/it] {'loss': 0.5026, 'learning_rate': 1.9948233157517164e-05, 'epoch': 0.06} + 6%|▌ | 538/8750 [52:57<13:02:04, 5.71s/it] 6%|▌ | 539/8750 [53:10<13:12:25, 5.79s/it] 6%|▌ | 539/8750 [53:03<13:12:25, 5.79s/it] {'loss': 0.5129, 'learning_rate': 1.994785631499535e-05, 'epoch': 0.06} + 6%|▌ | 539/8750 [53:10<13:12:25, 5.79s/it] {'loss': 0.5129, 'learning_rate': 1.994785631499535e-05, 'epoch': 0.06} + 6%|▌ | 539/8750 [53:03<13:12:25, 5.79s/it] 6%|▌ | 540/8750 [53:15<13:01:46, 5.71s/it] 6%|▌ | 540/8750 [53:09<13:01:46, 5.71s/it] {'loss': 0.5277, 'learning_rate': 1.9947478109395123e-05, 'epoch': 0.06} + 6%|▌ | 540/8750 [53:15<13:01:46, 5.71s/it] {'loss': 0.5277, 'learning_rate': 1.9947478109395123e-05, 'epoch': 0.06} + 6%|▌ | 540/8750 [53:09<13:01:46, 5.71s/it] 6%|▌ | 541/8750 [53:21<13:13:35, 5.80s/it] 6%|▌ | 541/8750 [53:15<13:13:35, 5.80s/it] {'loss': 0.5, 'learning_rate': 1.9947098540768306e-05, 'epoch': 0.06} + 6%|▌ | 541/8750 [53:21<13:13:35, 5.80s/it] {'loss': 0.5, 'learning_rate': 1.9947098540768306e-05, 'epoch': 0.06} + 6%|▌ | 541/8750 [53:15<13:13:35, 5.80s/it] 6%|▌ | 542/8750 [53:27<13:04:57, 5.74s/it] 6%|▌ | 542/8750 [53:20<13:04:57, 5.74s/it] {'loss': 0.5134, 'learning_rate': 1.994671760916691e-05, 'epoch': 0.06} + 6%|▌ | 542/8750 [53:27<13:04:57, 5.74s/it] {'loss': 0.5134, 'learning_rate': 1.994671760916691e-05, 'epoch': 0.06} + 6%|▌ | 542/8750 [53:20<13:04:57, 5.74s/it] 6%|▌ | 543/8750 [53:33<13:04:59, 5.74s/it] 6%|▌ | 543/8750 [53:26<13:05:00, 5.74s/it] {'loss': 0.5056, 'learning_rate': 1.994633531464313e-05, 'epoch': 0.06} + 6%|▌ | 543/8750 [53:33<13:04:59, 5.74s/it] {'loss': 0.5056, 'learning_rate': 1.994633531464313e-05, 'epoch': 0.06} + 6%|▌ | 543/8750 [53:26<13:05:00, 5.74s/it] 6%|▌ | 544/8750 [53:39<13:23:36, 5.88s/it] 6%|▌ | 544/8750 [53:32<13:23:36, 5.88s/it] {'loss': 0.5002, 'learning_rate': 
1.9945951657249348e-05, 'epoch': 0.06} + 6%|▌ | 544/8750 [53:39<13:23:36, 5.88s/it] {'loss': 0.5002, 'learning_rate': 1.9945951657249348e-05, 'epoch': 0.06} + 6%|▌ | 544/8750 [53:32<13:23:36, 5.88s/it] 6%|▌ | 545/8750 [53:45<13:19:31, 5.85s/it] 6%|▌ | 545/8750 [53:38<13:19:31, 5.85s/it] {'loss': 0.509, 'learning_rate': 1.9945566637038133e-05, 'epoch': 0.06} + 6%|▌ | 545/8750 [53:45<13:19:31, 5.85s/it] {'loss': 0.509, 'learning_rate': 1.9945566637038133e-05, 'epoch': 0.06} + 6%|▌ | 545/8750 [53:38<13:19:31, 5.85s/it] 6%|▌ | 546/8750 [53:50<13:11:36, 5.79s/it] 6%|▌ | 546/8750 [53:44<13:11:36, 5.79s/it] {'loss': 0.5243, 'learning_rate': 1.9945180254062242e-05, 'epoch': 0.06} + 6%|▌ | 546/8750 [53:50<13:11:36, 5.79s/it] {'loss': 0.5243, 'learning_rate': 1.9945180254062242e-05, 'epoch': 0.06} + 6%|▌ | 546/8750 [53:44<13:11:36, 5.79s/it] 6%|▋ | 547/8750 [53:56<13:06:47, 5.75s/it] 6%|▋ | 547/8750 [53:49<13:06:47, 5.75s/it] {'loss': 0.5495, 'learning_rate': 1.994479250837462e-05, 'epoch': 0.06} + 6%|▋ | 547/8750 [53:56<13:06:47, 5.75s/it] {'loss': 0.5495, 'learning_rate': 1.994479250837462e-05, 'epoch': 0.06} + 6%|▋ | 547/8750 [53:49<13:06:47, 5.75s/it] 6%|▋ | 548/8750 [54:01<12:58:43, 5.70s/it] 6%|▋ | 548/8750 [53:55<12:58:42, 5.70s/it] {'loss': 0.4995, 'learning_rate': 1.9944403400028392e-05, 'epoch': 0.06} + 6%|▋ | 548/8750 [54:01<12:58:43, 5.70s/it] {'loss': 0.4995, 'learning_rate': 1.9944403400028392e-05, 'epoch': 0.06} + 6%|▋ | 548/8750 [53:55<12:58:42, 5.70s/it] 6%|▋ | 549/8750 [54:07<13:01:31, 5.72s/it] 6%|▋ | 549/8750 [54:01<13:01:31, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.9944012929076884e-05, 'epoch': 0.06} + 6%|▋ | 549/8750 [54:07<13:01:31, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.9944012929076884e-05, 'epoch': 0.06} + 6%|▋ | 549/8750 [54:01<13:01:31, 5.72s/it]1213 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +74 AutoResumeHook: Checking whether to suspend... 
+AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...14 + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 6%|▋ | 550/8750 [54:13<13:01:19, 5.72s/it]11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 6%|▋ | 550/8750 [54:06<13:01:20, 5.72s/it] {'loss': 0.5106, 'learning_rate': 1.9943621095573588e-05, 'epoch': 0.06} + 6%|▋ | 550/8750 [54:13<13:01:19, 5.72s/it] {'loss': 0.5106, 'learning_rate': 1.9943621095573588e-05, 'epoch': 0.06} + 6%|▋ | 550/8750 [54:06<13:01:20, 5.72s/it] 6%|▋ | 551/8750 [54:19<12:59:40, 5.71s/it] 6%|▋ | 551/8750 [54:12<12:59:40, 5.71s/it] {'loss': 0.5006, 'learning_rate': 1.9943227899572198e-05, 'epoch': 0.06} + 6%|▋ | 551/8750 [54:19<12:59:40, 5.71s/it] {'loss': 0.5006, 'learning_rate': 1.9943227899572198e-05, 'epoch': 0.06} + 6%|▋ | 551/8750 [54:12<12:59:40, 5.71s/it] 6%|▋ | 552/8750 [54:25<13:08:55, 5.77s/it] 6%|▋ | 552/8750 [54:18<13:08:55, 5.77s/it] {'loss': 0.5108, 'learning_rate': 1.9942833341126597e-05, 'epoch': 0.06} + 6%|▋ | 552/8750 [54:25<13:08:55, 5.77s/it] {'loss': 0.5108, 'learning_rate': 1.9942833341126597e-05, 'epoch': 0.06} + 6%|▋ | 552/8750 [54:18<13:08:55, 5.77s/it] 6%|▋ | 553/8750 [54:30<13:04:48, 5.74s/it] 6%|▋ | 553/8750 [54:24<13:04:49, 5.74s/it] {'loss': 0.4873, 'learning_rate': 1.9942437420290835e-05, 'epoch': 0.06} + 6%|▋ | 553/8750 [54:30<13:04:48, 5.74s/it] {'loss': 0.4873, 'learning_rate': 1.9942437420290835e-05, 'epoch': 0.06} + 6%|▋ | 553/8750 [54:24<13:04:49, 5.74s/it] 6%|▋ | 554/8750 [54:36<12:59:20, 5.71s/it] 6%|▋ | 
554/8750 [54:29<12:59:19, 5.71s/it] {'loss': 0.5005, 'learning_rate': 1.994204013711918e-05, 'epoch': 0.06} + 6%|▋ | 554/8750 [54:36<12:59:20, 5.71s/it] {'loss': 0.5005, 'learning_rate': 1.994204013711918e-05, 'epoch': 0.06} + 6%|▋ | 554/8750 [54:29<12:59:19, 5.71s/it] 6%|▋ | 555/8750 [54:41<12:56:38, 5.69s/it] 6%|▋ | 555/8750 [54:35<12:56:37, 5.69s/it] {'loss': 0.5119, 'learning_rate': 1.9941641491666052e-05, 'epoch': 0.06} + 6%|▋ | 555/8750 [54:41<12:56:38, 5.69s/it] {'loss': 0.5119, 'learning_rate': 1.9941641491666052e-05, 'epoch': 0.06} + 6%|▋ | 555/8750 [54:35<12:56:37, 5.69s/it] 6%|▋ | 556/8750 [54:47<12:53:07, 5.66s/it] 6%|▋ | 556/8750 [54:41<12:53:06, 5.66s/it] {'loss': 0.5057, 'learning_rate': 1.994124148398608e-05, 'epoch': 0.06} + 6%|▋ | 556/8750 [54:47<12:53:07, 5.66s/it] {'loss': 0.5057, 'learning_rate': 1.994124148398608e-05, 'epoch': 0.06} + 6%|▋ | 556/8750 [54:41<12:53:06, 5.66s/it] 6%|▋ | 557/8750 [54:53<12:55:00, 5.68s/it] 6%|▋ | 557/8750 [54:46<12:55:00, 5.68s/it] {'loss': 0.4932, 'learning_rate': 1.9940840114134078e-05, 'epoch': 0.06} + 6%|▋ | 557/8750 [54:53<12:55:00, 5.68s/it] {'loss': 0.4932, 'learning_rate': 1.9940840114134078e-05, 'epoch': 0.06} + 6%|▋ | 557/8750 [54:46<12:55:00, 5.68s/it] 6%|▋ | 558/8750 [54:58<12:54:41, 5.67s/it] 6%|▋ | 558/8750 [54:52<12:54:41, 5.67s/it] {'loss': 0.505, 'learning_rate': 1.9940437382165038e-05, 'epoch': 0.06} + 6%|▋ | 558/8750 [54:58<12:54:41, 5.67s/it] {'loss': 0.505, 'learning_rate': 1.9940437382165038e-05, 'epoch': 0.06} + 6%|▋ | 558/8750 [54:52<12:54:41, 5.67s/it] 6%|▋ | 559/8750 [55:04<12:57:39, 5.70s/it] 6%|▋ | 559/8750 [54:58<12:57:38, 5.70s/it] {'loss': 0.487, 'learning_rate': 1.9940033288134143e-05, 'epoch': 0.06} + 6%|▋ | 559/8750 [55:04<12:57:39, 5.70s/it] {'loss': 0.487, 'learning_rate': 1.9940033288134143e-05, 'epoch': 0.06} + 6%|▋ | 559/8750 [54:58<12:57:38, 5.70s/it] 6%|▋ | 560/8750 [55:10<12:59:48, 5.71s/it] 6%|▋ | 560/8750 [55:03<12:59:48, 5.71s/it] {'loss': 0.5201, 'learning_rate': 
1.993962783209677e-05, 'epoch': 0.06} + 6%|▋ | 560/8750 [55:10<12:59:48, 5.71s/it] {'loss': 0.5201, 'learning_rate': 1.993962783209677e-05, 'epoch': 0.06} + 6%|▋ | 560/8750 [55:03<12:59:48, 5.71s/it] 6%|▋ | 561/8750 [55:16<12:59:07, 5.71s/it] 6%|▋ | 561/8750 [55:09<12:59:07, 5.71s/it] {'loss': 0.5063, 'learning_rate': 1.9939221014108467e-05, 'epoch': 0.06} + 6%|▋ | 561/8750 [55:16<12:59:07, 5.71s/it] {'loss': 0.5063, 'learning_rate': 1.9939221014108467e-05, 'epoch': 0.06} + 6%|▋ | 561/8750 [55:09<12:59:07, 5.71s/it] 6%|▋ | 562/8750 [55:21<12:58:38, 5.71s/it] 6%|▋ | 562/8750 [55:15<12:58:38, 5.71s/it] {'loss': 0.5005, 'learning_rate': 1.9938812834224978e-05, 'epoch': 0.06} + 6%|▋ | 562/8750 [55:21<12:58:38, 5.71s/it] {'loss': 0.5005, 'learning_rate': 1.9938812834224978e-05, 'epoch': 0.06} + 6%|▋ | 562/8750 [55:15<12:58:38, 5.71s/it] 6%|▋ | 563/8750 [55:27<12:58:52, 5.71s/it] 6%|▋ | 563/8750 [55:21<12:58:52, 5.71s/it] {'loss': 0.5017, 'learning_rate': 1.993840329250224e-05, 'epoch': 0.06} + 6%|▋ | 563/8750 [55:27<12:58:52, 5.71s/it] {'loss': 0.5017, 'learning_rate': 1.993840329250224e-05, 'epoch': 0.06} + 6%|▋ | 563/8750 [55:21<12:58:52, 5.71s/it] 6%|▋ | 564/8750 [55:33<12:54:29, 5.68s/it] 6%|▋ | 564/8750 [55:26<12:54:29, 5.68s/it] {'loss': 0.519, 'learning_rate': 1.993799238899636e-05, 'epoch': 0.06} + 6%|▋ | 564/8750 [55:33<12:54:29, 5.68s/it] {'loss': 0.519, 'learning_rate': 1.993799238899636e-05, 'epoch': 0.06} + 6%|▋ | 564/8750 [55:26<12:54:29, 5.68s/it] 6%|▋ | 565/8750 [55:38<12:57:51, 5.70s/it] 6%|▋ | 565/8750 [55:32<12:57:50, 5.70s/it] {'loss': 0.4934, 'learning_rate': 1.9937580123763645e-05, 'epoch': 0.06} + 6%|▋ | 565/8750 [55:38<12:57:51, 5.70s/it] {'loss': 0.4934, 'learning_rate': 1.9937580123763645e-05, 'epoch': 0.06} + 6%|▋ | 565/8750 [55:32<12:57:50, 5.70s/it] 6%|▋ | 566/8750 [55:44<13:07:20, 5.77s/it] 6%|▋ | 566/8750 [55:38<13:07:19, 5.77s/it] {'loss': 0.5086, 'learning_rate': 1.993716649686059e-05, 'epoch': 0.06} + 6%|▋ | 566/8750 [55:44<13:07:20, 
5.77s/it] {'loss': 0.5086, 'learning_rate': 1.993716649686059e-05, 'epoch': 0.06} + 6%|▋ | 566/8750 [55:38<13:07:19, 5.77s/it] 6%|▋ | 567/8750 [55:50<13:12:31, 5.81s/it] 6%|▋ | 567/8750 [55:44<13:12:31, 5.81s/it] {'loss': 0.4863, 'learning_rate': 1.993675150834386e-05, 'epoch': 0.06} + 6%|▋ | 567/8750 [55:50<13:12:31, 5.81s/it] {'loss': 0.4863, 'learning_rate': 1.993675150834386e-05, 'epoch': 0.06} + 6%|▋ | 567/8750 [55:44<13:12:31, 5.81s/it] 6%|▋ | 568/8750 [55:56<13:05:28, 5.76s/it] 6%|▋ | 568/8750 [55:49<13:05:27, 5.76s/it] {'loss': 0.5089, 'learning_rate': 1.993633515827033e-05, 'epoch': 0.06} + 6%|▋ | 568/8750 [55:56<13:05:28, 5.76s/it] {'loss': 0.5089, 'learning_rate': 1.993633515827033e-05, 'epoch': 0.06} + 6%|▋ | 568/8750 [55:49<13:05:27, 5.76s/it] 7%|▋ | 569/8750 [56:02<13:00:08, 5.72s/it] 7%|▋ | 569/8750 [55:55<13:00:07, 5.72s/it] {'loss': 0.5077, 'learning_rate': 1.9935917446697038e-05, 'epoch': 0.07} + 7%|▋ | 569/8750 [56:02<13:00:08, 5.72s/it] {'loss': 0.5077, 'learning_rate': 1.9935917446697038e-05, 'epoch': 0.07} + 7%|▋ | 569/8750 [55:55<13:00:07, 5.72s/it] 7%|▋ | 570/8750 [56:07<12:57:19, 5.70s/it] 7%|▋ | 570/8750 [56:01<12:57:19, 5.70s/it] {'loss': 0.4964, 'learning_rate': 1.993549837368123e-05, 'epoch': 0.07} + 7%|▋ | 570/8750 [56:07<12:57:19, 5.70s/it] {'loss': 0.4964, 'learning_rate': 1.993549837368123e-05, 'epoch': 0.07} + 7%|▋ | 570/8750 [56:01<12:57:19, 5.70s/it] 7%|▋ | 571/8750 [56:13<12:53:00, 5.67s/it] 7%|▋ | 571/8750 [56:06<12:53:01, 5.67s/it] {'loss': 0.5055, 'learning_rate': 1.9935077939280316e-05, 'epoch': 0.07} + 7%|▋ | 571/8750 [56:13<12:53:00, 5.67s/it] {'loss': 0.5055, 'learning_rate': 1.9935077939280316e-05, 'epoch': 0.07} + 7%|▋ | 571/8750 [56:06<12:53:01, 5.67s/it] 7%|▋ | 572/8750 [56:18<12:51:31, 5.66s/it] 7%|▋ | 572/8750 [56:12<12:51:31, 5.66s/it] {'loss': 0.503, 'learning_rate': 1.993465614355192e-05, 'epoch': 0.07} + 7%|▋ | 572/8750 [56:18<12:51:31, 5.66s/it] {'loss': 0.503, 'learning_rate': 1.993465614355192e-05, 'epoch': 
0.07} + 7%|▋ | 572/8750 [56:12<12:51:31, 5.66s/it] 7%|▋ | 573/8750 [56:24<12:57:47, 5.71s/it] 7%|▋ | 573/8750 [56:18<12:57:47, 5.71s/it] {'loss': 0.5179, 'learning_rate': 1.9934232986553823e-05, 'epoch': 0.07} + 7%|▋ | 573/8750 [56:24<12:57:47, 5.71s/it] {'loss': 0.5179, 'learning_rate': 1.9934232986553823e-05, 'epoch': 0.07} + 7%|▋ | 573/8750 [56:18<12:57:47, 5.71s/it] 7%|▋ | 574/8750 [56:30<12:52:33, 5.67s/it] 7%|▋ | 574/8750 [56:23<12:52:33, 5.67s/it] {'loss': 0.4953, 'learning_rate': 1.9933808468344016e-05, 'epoch': 0.07} + 7%|▋ | 574/8750 [56:30<12:52:33, 5.67s/it] {'loss': 0.4953, 'learning_rate': 1.9933808468344016e-05, 'epoch': 0.07} + 7%|▋ | 574/8750 [56:23<12:52:33, 5.67s/it] 7%|▋ | 575/8750 [56:36<13:03:48, 5.75s/it] 7%|▋ | 575/8750 [56:29<13:03:48, 5.75s/it] {'loss': 0.4912, 'learning_rate': 1.9933382588980665e-05, 'epoch': 0.07} + 7%|▋ | 575/8750 [56:36<13:03:48, 5.75s/it] {'loss': 0.4912, 'learning_rate': 1.9933382588980665e-05, 'epoch': 0.07} + 7%|▋ | 575/8750 [56:29<13:03:48, 5.75s/it] 7%|▋ | 576/8750 [56:41<13:01:21, 5.74s/it] 7%|▋ | 576/8750 [56:35<13:01:21, 5.74s/it] {'loss': 0.4973, 'learning_rate': 1.9932955348522125e-05, 'epoch': 0.07} + 7%|▋ | 576/8750 [56:41<13:01:21, 5.74s/it] {'loss': 0.4973, 'learning_rate': 1.9932955348522125e-05, 'epoch': 0.07} + 7%|▋ | 576/8750 [56:35<13:01:21, 5.74s/it] 7%|▋ | 577/8750 [56:47<13:07:59, 5.78s/it] 7%|▋ | 577/8750 [56:41<13:07:58, 5.78s/it] {'loss': 0.5004, 'learning_rate': 1.9932526747026936e-05, 'epoch': 0.07} + 7%|▋ | 577/8750 [56:47<13:07:59, 5.78s/it] {'loss': 0.5004, 'learning_rate': 1.9932526747026936e-05, 'epoch': 0.07} + 7%|▋ | 577/8750 [56:41<13:07:58, 5.78s/it] 7%|▋ | 578/8750 [56:53<13:08:15, 5.79s/it] 7%|▋ | 578/8750 [56:47<13:08:14, 5.79s/it] {'loss': 0.5175, 'learning_rate': 1.993209678455383e-05, 'epoch': 0.07} + 7%|▋ | 578/8750 [56:53<13:08:15, 5.79s/it] {'loss': 0.5175, 'learning_rate': 1.993209678455383e-05, 'epoch': 0.07} + 7%|▋ | 578/8750 [56:47<13:08:14, 5.79s/it] 7%|▋ | 579/8750 
[56:59<13:06:46, 5.78s/it] 7%|▋ | 579/8750 [56:52<13:06:46, 5.78s/it] {'loss': 0.5021, 'learning_rate': 1.9931665461161716e-05, 'epoch': 0.07} + 7%|▋ | 579/8750 [56:59<13:06:46, 5.78s/it] {'loss': 0.5021, 'learning_rate': 1.9931665461161716e-05, 'epoch': 0.07} + 7%|▋ | 579/8750 [56:52<13:06:46, 5.78s/it] 7%|▋ | 580/8750 [57:05<13:04:03, 5.76s/it] 7%|▋ | 580/8750 [56:58<13:04:04, 5.76s/it] {'loss': 0.5096, 'learning_rate': 1.9931232776909703e-05, 'epoch': 0.07} + 7%|▋ | 580/8750 [57:05<13:04:03, 5.76s/it] {'loss': 0.5096, 'learning_rate': 1.9931232776909703e-05, 'epoch': 0.07} + 7%|▋ | 580/8750 [56:58<13:04:04, 5.76s/it] 7%|▋ | 581/8750 [57:10<12:59:44, 5.73s/it] 7%|▋ | 581/8750 [57:04<12:59:45, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.993079873185707e-05, 'epoch': 0.07} + 7%|▋ | 581/8750 [57:10<12:59:44, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.993079873185707e-05, 'epoch': 0.07} + 7%|▋ | 581/8750 [57:04<12:59:45, 5.73s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4214 > 4096). 
Running this sequence through the model will result in indexing errors + 7%|▋ | 582/8750 [57:16<12:56:53, 5.71s/it] 7%|▋ | 582/8750 [57:09<12:56:54, 5.71s/it] {'loss': 0.5109, 'learning_rate': 1.993036332606329e-05, 'epoch': 0.07} + 7%|▋ | 582/8750 [57:16<12:56:53, 5.71s/it] {'loss': 0.5109, 'learning_rate': 1.993036332606329e-05, 'epoch': 0.07} + 7%|▋ | 582/8750 [57:09<12:56:54, 5.71s/it] 7%|▋ | 583/8750 [57:22<13:01:23, 5.74s/it] 7%|▋ | 583/8750 [57:15<13:01:23, 5.74s/it] {'loss': 0.4972, 'learning_rate': 1.9929926559588032e-05, 'epoch': 0.07} + 7%|▋ | 583/8750 [57:22<13:01:23, 5.74s/it] {'loss': 0.4972, 'learning_rate': 1.9929926559588032e-05, 'epoch': 0.07} + 7%|▋ | 583/8750 [57:15<13:01:23, 5.74s/it] 7%|▋ | 584/8750 [57:28<13:09:43, 5.80s/it] 7%|▋ | 584/8750 [57:21<13:09:43, 5.80s/it] {'loss': 0.4919, 'learning_rate': 1.9929488432491137e-05, 'epoch': 0.07} + 7%|▋ | 584/8750 [57:28<13:09:43, 5.80s/it] {'loss': 0.4919, 'learning_rate': 1.9929488432491137e-05, 'epoch': 0.07} + 7%|▋ | 584/8750 [57:21<13:09:43, 5.80s/it] 7%|▋ | 585/8750 [57:33<13:07:40, 5.79s/it] 7%|▋ | 585/8750 [57:27<13:07:40, 5.79s/it] {'loss': 0.4959, 'learning_rate': 1.9929048944832638e-05, 'epoch': 0.07} + 7%|▋ | 585/8750 [57:33<13:07:40, 5.79s/it] {'loss': 0.4959, 'learning_rate': 1.9929048944832638e-05, 'epoch': 0.07} + 7%|▋ | 585/8750 [57:27<13:07:40, 5.79s/it] 7%|▋ | 586/8750 [57:39<13:00:03, 5.73s/it] 7%|▋ | 586/8750 [57:33<13:00:06, 5.73s/it] {'loss': 0.5132, 'learning_rate': 1.9928608096672757e-05, 'epoch': 0.07} + 7%|▋ | 586/8750 [57:39<13:00:03, 5.73s/it] {'loss': 0.5132, 'learning_rate': 1.9928608096672757e-05, 'epoch': 0.07} + 7%|▋ | 586/8750 [57:33<13:00:06, 5.73s/it] 7%|▋ | 587/8750 [57:45<12:56:02, 5.70s/it] 7%|▋ | 587/8750 [57:38<12:56:02, 5.70s/it] {'loss': 0.512, 'learning_rate': 1.99281658880719e-05, 'epoch': 0.07} + 7%|▋ | 587/8750 [57:45<12:56:02, 5.70s/it] {'loss': 0.512, 'learning_rate': 1.99281658880719e-05, 'epoch': 0.07} + 7%|▋ | 587/8750 [57:38<12:56:02, 5.70s/it] 
7%|▋ | 588/8750 [57:51<13:06:58, 5.79s/it] 7%|▋ | 588/8750 [57:44<13:06:58, 5.79s/it] {'loss': 0.4972, 'learning_rate': 1.992772231909066e-05, 'epoch': 0.07} + 7%|▋ | 588/8750 [57:51<13:06:58, 5.79s/it] {'loss': 0.4972, 'learning_rate': 1.992772231909066e-05, 'epoch': 0.07} + 7%|▋ | 588/8750 [57:44<13:06:58, 5.79s/it] 7%|▋ | 589/8750 [57:57<13:14:21, 5.84s/it] 7%|▋ | 589/8750 [57:50<13:14:21, 5.84s/it] {'loss': 0.5027, 'learning_rate': 1.9927277389789812e-05, 'epoch': 0.07} + 7%|▋ | 589/8750 [57:57<13:14:21, 5.84s/it] {'loss': 0.5027, 'learning_rate': 1.9927277389789812e-05, 'epoch': 0.07} + 7%|▋ | 589/8750 [57:50<13:14:21, 5.84s/it] 7%|▋ | 590/8750 [58:02<13:01:51, 5.75s/it] 7%|▋ | 590/8750 [57:56<13:01:51, 5.75s/it] {'loss': 0.4921, 'learning_rate': 1.9926831100230322e-05, 'epoch': 0.07} + 7%|▋ | 590/8750 [58:02<13:01:51, 5.75s/it] {'loss': 0.4921, 'learning_rate': 1.9926831100230322e-05, 'epoch': 0.07} + 7%|▋ | 590/8750 [57:56<13:01:51, 5.75s/it] 7%|▋ | 591/8750 [58:08<12:57:53, 5.72s/it] 7%|▋ | 591/8750 [58:01<12:57:52, 5.72s/it] {'loss': 0.5223, 'learning_rate': 1.9926383450473344e-05, 'epoch': 0.07} + 7%|▋ | 591/8750 [58:08<12:57:53, 5.72s/it] {'loss': 0.5223, 'learning_rate': 1.9926383450473344e-05, 'epoch': 0.07} + 7%|▋ | 591/8750 [58:01<12:57:52, 5.72s/it] 7%|▋ | 592/8750 [58:14<12:57:51, 5.72s/it] 7%|▋ | 592/8750 [58:07<12:57:51, 5.72s/it] {'loss': 0.496, 'learning_rate': 1.9925934440580218e-05, 'epoch': 0.07} + 7%|▋ | 592/8750 [58:14<12:57:51, 5.72s/it] {'loss': 0.496, 'learning_rate': 1.9925934440580218e-05, 'epoch': 0.07} + 7%|▋ | 592/8750 [58:07<12:57:51, 5.72s/it] 7%|▋ | 593/8750 [58:19<12:51:09, 5.67s/it] 7%|▋ | 593/8750 [58:13<12:51:09, 5.67s/it] {'loss': 0.5087, 'learning_rate': 1.9925484070612465e-05, 'epoch': 0.07} + 7%|▋ | 593/8750 [58:19<12:51:09, 5.67s/it] {'loss': 0.5087, 'learning_rate': 1.9925484070612465e-05, 'epoch': 0.07} + 7%|▋ | 593/8750 [58:13<12:51:09, 5.67s/it] 7%|▋ | 594/8750 [58:25<12:52:18, 5.68s/it] 7%|▋ | 594/8750 
[58:18<12:52:17, 5.68s/it] {'loss': 0.5022, 'learning_rate': 1.9925032340631793e-05, 'epoch': 0.07} + 7%|▋ | 594/8750 [58:25<12:52:18, 5.68s/it] {'loss': 0.5022, 'learning_rate': 1.9925032340631793e-05, 'epoch': 0.07} + 7%|▋ | 594/8750 [58:18<12:52:17, 5.68s/it] 7%|▋ | 595/8750 [58:24<12:52:26, 5.68s/it] 7%|▋ | 595/8750 [58:31<12:52:29, 5.68s/it] {'loss': 0.511, 'learning_rate': 1.9924579250700104e-05, 'epoch': 0.07} + 7%|▋ | 595/8750 [58:31<12:52:29, 5.68s/it] {'loss': 0.511, 'learning_rate': 1.9924579250700104e-05, 'epoch': 0.07} + 7%|▋ | 595/8750 [58:24<12:52:26, 5.68s/it] 7%|▋ | 596/8750 [58:36<12:47:04, 5.64s/it] 7%|▋ | 596/8750 [58:30<12:47:06, 5.64s/it] {'loss': 0.509, 'learning_rate': 1.992412480087948e-05, 'epoch': 0.07} + 7%|▋ | 596/8750 [58:36<12:47:04, 5.64s/it] {'loss': 0.509, 'learning_rate': 1.992412480087948e-05, 'epoch': 0.07} + 7%|▋ | 596/8750 [58:30<12:47:06, 5.64s/it] 7%|▋ | 597/8750 [58:42<12:59:16, 5.73s/it] 7%|▋ | 597/8750 [58:36<12:59:16, 5.73s/it] {'loss': 0.513, 'learning_rate': 1.992366899123219e-05, 'epoch': 0.07} + 7%|▋ | 597/8750 [58:42<12:59:16, 5.73s/it] {'loss': 0.513, 'learning_rate': 1.992366899123219e-05, 'epoch': 0.07} + 7%|▋ | 597/8750 [58:36<12:59:16, 5.73s/it] 7%|▋ | 598/8750 [58:48<12:51:28, 5.68s/it] 7%|▋ | 598/8750 [58:41<12:51:29, 5.68s/it] {'loss': 0.5128, 'learning_rate': 1.9923211821820692e-05, 'epoch': 0.07} + 7%|▋ | 598/8750 [58:48<12:51:28, 5.68s/it] {'loss': 0.5128, 'learning_rate': 1.9923211821820692e-05, 'epoch': 0.07} + 7%|▋ | 598/8750 [58:41<12:51:29, 5.68s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! 
+ warnings.warn("Inputs truncated!") + 7%|▋ | 599/8750 [58:54<13:09:38, 5.81s/it] 7%|▋ | 599/8750 [58:47<13:09:38, 5.81s/it] {'loss': 0.5197, 'learning_rate': 1.9922753292707627e-05, 'epoch': 0.07} + 7%|▋ | 599/8750 [58:54<13:09:38, 5.81s/it] {'loss': 0.5197, 'learning_rate': 1.9922753292707627e-05, 'epoch': 0.07} + 7%|▋ | 599/8750 [58:47<13:09:38, 5.81s/it]12 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 15AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 7%|▋ | 600/8750 [58:59<13:03:02, 5.76s/it]14 AutoResumeHook: Checking whether to suspend... +010 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 1AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... 
+ + 7%|▋ | 600/8750 [58:53<13:03:03, 5.76s/it] {'loss': 0.4962, 'learning_rate': 1.992229340395582e-05, 'epoch': 0.07} + 7%|▋ | 600/8750 [58:59<13:03:02, 5.76s/it] {'loss': 0.4962, 'learning_rate': 1.992229340395582e-05, 'epoch': 0.07} + 7%|▋ | 600/8750 [58:53<13:03:03, 5.76s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 7%|▋ | 601/8750 [59:20<23:10:33, 10.24s/it] 7%|▋ | 601/8750 [59:14<23:10:33, 10.24s/it] {'loss': 0.4956, 'learning_rate': 1.9921832155628295e-05, 'epoch': 0.07} + 7%|▋ | 601/8750 [59:20<23:10:33, 10.24s/it] {'loss': 0.4956, 'learning_rate': 1.9921832155628295e-05, 'epoch': 0.07} + 7%|▋ | 601/8750 [59:14<23:10:33, 10.24s/it] 7%|▋ | 602/8750 [59:26<20:05:57, 8.88s/it] 7%|▋ | 602/8750 [59:19<20:05:58, 8.88s/it] {'loss': 0.4905, 'learning_rate': 1.9921369547788246e-05, 'epoch': 0.07} + 7%|▋ | 602/8750 [59:26<20:05:57, 8.88s/it] {'loss': 0.4905, 'learning_rate': 1.9921369547788246e-05, 'epoch': 0.07} + 7%|▋ | 602/8750 [59:19<20:05:58, 8.88s/it] 7%|▋ | 603/8750 [59:31<17:58:30, 7.94s/it] 7%|▋ | 603/8750 [59:25<17:58:29, 7.94s/it] {'loss': 0.5012, 'learning_rate': 1.9920905580499063e-05, 'epoch': 0.07} + 7%|▋ | 603/8750 [59:31<17:58:30, 7.94s/it] {'loss': 0.5012, 'learning_rate': 1.9920905580499063e-05, 'epoch': 0.07} + 7%|▋ | 603/8750 [59:25<17:58:29, 7.94s/it] 7%|▋ | 604/8750 [59:37<16:23:38, 7.25s/it] 7%|▋ | 604/8750 [59:31<16:23:38, 7.25s/it] {'loss': 0.4991, 'learning_rate': 1.9920440253824318e-05, 'epoch': 0.07} + 7%|▋ | 604/8750 [59:37<16:23:38, 7.25s/it] {'loss': 0.4991, 'learning_rate': 1.9920440253824318e-05, 'epoch': 0.07} + 7%|▋ | 604/8750 [59:31<16:23:38, 7.25s/it] 7%|▋ | 605/8750 [59:43<15:26:42, 6.83s/it] 7%|▋ | 605/8750 [59:36<15:26:43, 6.83s/it] {'loss': 0.5092, 'learning_rate': 1.9919973567827776e-05, 'epoch': 0.07} + 7%|▋ | 605/8750 [59:43<15:26:42, 6.83s/it] {'loss': 0.5092, 'learning_rate': 1.9919973567827776e-05, 'epoch': 0.07} + 7%|▋ | 605/8750 [59:36<15:26:43, 6.83s/it] 7%|▋ | 606/8750 [59:49<14:47:37, 6.54s/it] 7%|▋ | 606/8750 [59:42<14:47:38, 6.54s/it] {'loss': 0.4813, 'learning_rate': 1.991950552257338e-05, 'epoch': 0.07} + 7%|▋ | 606/8750 [59:49<14:47:37, 6.54s/it] {'loss': 0.4813, 'learning_rate': 1.991950552257338e-05, 'epoch': 0.07} + 7%|▋ | 606/8750 [59:42<14:47:38, 6.54s/it] 7%|▋ | 607/8750 [59:54<14:12:18, 6.28s/it] 
7%|▋ | 607/8750 [59:48<14:12:18, 6.28s/it] {'loss': 0.4992, 'learning_rate': 1.991903611812526e-05, 'epoch': 0.07} + 7%|▋ | 607/8750 [59:54<14:12:18, 6.28s/it] {'loss': 0.4992, 'learning_rate': 1.991903611812526e-05, 'epoch': 0.07} + 7%|▋ | 607/8750 [59:48<14:12:18, 6.28s/it] 7%|▋ | 608/8750 [1:00:00<13:50:29, 6.12s/it] 7%|▋ | 608/8750 [59:54<13:50:29, 6.12s/it] {'loss': 0.4908, 'learning_rate': 1.9918565354547738e-05, 'epoch': 0.07} + 7%|▋ | 608/8750 [1:00:00<13:50:29, 6.12s/it] {'loss': 0.4908, 'learning_rate': 1.9918565354547738e-05, 'epoch': 0.07} + 7%|▋ | 608/8750 [59:54<13:50:29, 6.12s/it] 7%|▋ | 609/8750 [1:00:06<13:39:21, 6.04s/it] 7%|▋ | 609/8750 [1:00:00<13:39:21, 6.04s/it] {'loss': 0.505, 'learning_rate': 1.991809323190532e-05, 'epoch': 0.07} + 7%|▋ | 609/8750 [1:00:06<13:39:21, 6.04s/it] {'loss': 0.505, 'learning_rate': 1.991809323190532e-05, 'epoch': 0.07} + 7%|▋ | 609/8750 [1:00:00<13:39:21, 6.04s/it] 7%|▋ | 610/8750 [1:00:12<13:26:24, 5.94s/it] 7%|▋ | 610/8750 [1:00:05<13:26:24, 5.94s/it] {'loss': 0.4865, 'learning_rate': 1.99176197502627e-05, 'epoch': 0.07} + 7%|▋ | 610/8750 [1:00:12<13:26:24, 5.94s/it] {'loss': 0.4865, 'learning_rate': 1.99176197502627e-05, 'epoch': 0.07} + 7%|▋ | 610/8750 [1:00:05<13:26:24, 5.94s/it] 7%|▋ | 611/8750 [1:00:18<13:19:44, 5.90s/it] 7%|▋ | 611/8750 [1:00:11<13:19:44, 5.90s/it] {'loss': 0.4982, 'learning_rate': 1.9917144909684745e-05, 'epoch': 0.07} + 7%|▋ | 611/8750 [1:00:18<13:19:44, 5.90s/it] {'loss': 0.4982, 'learning_rate': 1.9917144909684745e-05, 'epoch': 0.07} + 7%|▋ | 611/8750 [1:00:11<13:19:44, 5.90s/it] 7%|▋ | 612/8750 [1:00:23<13:05:53, 5.79s/it] 7%|▋ | 612/8750 [1:00:17<13:05:53, 5.79s/it] {'loss': 0.5175, 'learning_rate': 1.9916668710236528e-05, 'epoch': 0.07} + 7%|▋ | 612/8750 [1:00:23<13:05:53, 5.79s/it] {'loss': 0.5175, 'learning_rate': 1.9916668710236528e-05, 'epoch': 0.07} + 7%|▋ | 612/8750 [1:00:17<13:05:53, 5.79s/it] 7%|▋ | 613/8750 [1:00:29<12:59:19, 5.75s/it] 7%|▋ | 613/8750 [1:00:22<12:59:19, 
5.75s/it] {'loss': 0.529, 'learning_rate': 1.9916191151983297e-05, 'epoch': 0.07} + 7%|▋ | 613/8750 [1:00:29<12:59:19, 5.75s/it] {'loss': 0.529, 'learning_rate': 1.9916191151983297e-05, 'epoch': 0.07} + 7%|▋ | 613/8750 [1:00:22<12:59:19, 5.75s/it] 7%|▋ | 614/8750 [1:00:34<12:53:09, 5.70s/it] 7%|▋ | 614/8750 [1:00:28<12:53:09, 5.70s/it] {'loss': 0.4916, 'learning_rate': 1.9915712234990486e-05, 'epoch': 0.07} + 7%|▋ | 614/8750 [1:00:34<12:53:09, 5.70s/it] {'loss': 0.4916, 'learning_rate': 1.9915712234990486e-05, 'epoch': 0.07} + 7%|▋ | 614/8750 [1:00:28<12:53:09, 5.70s/it] 7%|▋ | 615/8750 [1:00:40<13:02:37, 5.77s/it] 7%|▋ | 615/8750 [1:00:34<13:02:37, 5.77s/it] {'loss': 0.5154, 'learning_rate': 1.9915231959323722e-05, 'epoch': 0.07} + 7%|▋ | 615/8750 [1:00:40<13:02:37, 5.77s/it] {'loss': 0.5154, 'learning_rate': 1.9915231959323722e-05, 'epoch': 0.07} + 7%|▋ | 615/8750 [1:00:34<13:02:37, 5.77s/it] 7%|▋ | 616/8750 [1:00:46<12:59:15, 5.75s/it] 7%|▋ | 616/8750 [1:00:40<12:59:14, 5.75s/it] {'loss': 0.4903, 'learning_rate': 1.991475032504881e-05, 'epoch': 0.07} + 7%|▋ | 616/8750 [1:00:46<12:59:15, 5.75s/it] {'loss': 0.4903, 'learning_rate': 1.991475032504881e-05, 'epoch': 0.07} + 7%|▋ | 616/8750 [1:00:40<12:59:14, 5.75s/it] 7%|▋ | 617/8750 [1:00:52<13:03:11, 5.78s/it] 7%|▋ | 617/8750 [1:00:45<13:03:11, 5.78s/it] {'loss': 0.4984, 'learning_rate': 1.9914267332231746e-05, 'epoch': 0.07} + 7%|▋ | 617/8750 [1:00:52<13:03:11, 5.78s/it] {'loss': 0.4984, 'learning_rate': 1.9914267332231746e-05, 'epoch': 0.07} + 7%|▋ | 617/8750 [1:00:45<13:03:11, 5.78s/it] 7%|▋ | 618/8750 [1:00:57<12:52:18, 5.70s/it] 7%|▋ | 618/8750 [1:00:51<12:52:18, 5.70s/it] {'loss': 0.5179, 'learning_rate': 1.991378298093871e-05, 'epoch': 0.07} + 7%|▋ | 618/8750 [1:00:57<12:52:18, 5.70s/it] {'loss': 0.5179, 'learning_rate': 1.991378298093871e-05, 'epoch': 0.07} + 7%|▋ | 618/8750 [1:00:51<12:52:18, 5.70s/it] 7%|▋ | 619/8750 [1:01:03<12:54:08, 5.71s/it] 7%|▋ | 619/8750 [1:00:57<12:54:07, 5.71s/it] {'loss': 
0.4865, 'learning_rate': 1.9913297271236063e-05, 'epoch': 0.07} + 7%|▋ | 619/8750 [1:01:03<12:54:08, 5.71s/it] {'loss': 0.4865, 'learning_rate': 1.9913297271236063e-05, 'epoch': 0.07} + 7%|▋ | 619/8750 [1:00:57<12:54:07, 5.71s/it] 7%|▋ | 620/8750 [1:01:09<12:52:12, 5.70s/it] 7%|▋ | 620/8750 [1:01:02<12:52:12, 5.70s/it] {'loss': 0.5074, 'learning_rate': 1.9912810203190367e-05, 'epoch': 0.07} + 7%|▋ | 620/8750 [1:01:09<12:52:12, 5.70s/it] {'loss': 0.5074, 'learning_rate': 1.9912810203190367e-05, 'epoch': 0.07} + 7%|▋ | 620/8750 [1:01:02<12:52:12, 5.70s/it] 7%|▋ | 621/8750 [1:01:15<12:59:46, 5.76s/it] 7%|▋ | 621/8750 [1:01:08<12:59:49, 5.76s/it] {'loss': 0.5076, 'learning_rate': 1.991232177686836e-05, 'epoch': 0.07} + 7%|▋ | 621/8750 [1:01:15<12:59:46, 5.76s/it] {'loss': 0.5076, 'learning_rate': 1.991232177686836e-05, 'epoch': 0.07} + 7%|▋ | 621/8750 [1:01:08<12:59:49, 5.76s/it] 7%|▋ | 622/8750 [1:01:20<12:50:18, 5.69s/it] 7%|▋ | 622/8750 [1:01:14<12:50:18, 5.69s/it] {'loss': 0.5042, 'learning_rate': 1.9911831992336963e-05, 'epoch': 0.07} + 7%|▋ | 622/8750 [1:01:20<12:50:18, 5.69s/it] {'loss': 0.5042, 'learning_rate': 1.9911831992336963e-05, 'epoch': 0.07} + 7%|▋ | 622/8750 [1:01:14<12:50:18, 5.69s/it] 7%|▋ | 623/8750 [1:01:26<12:47:19, 5.67s/it] 7%|▋ | 623/8750 [1:01:19<12:47:19, 5.67s/it] {'loss': 0.5021, 'learning_rate': 1.9911340849663293e-05, 'epoch': 0.07} + 7%|▋ | 623/8750 [1:01:26<12:47:19, 5.67s/it] {'loss': 0.5021, 'learning_rate': 1.9911340849663293e-05, 'epoch': 0.07} + 7%|▋ | 623/8750 [1:01:19<12:47:19, 5.67s/it] 7%|▋ | 624/8750 [1:01:31<12:46:47, 5.66s/it] 7%|▋ | 624/8750 [1:01:25<12:46:46, 5.66s/it] {'loss': 0.5062, 'learning_rate': 1.991084834891464e-05, 'epoch': 0.07} + 7%|▋ | 624/8750 [1:01:31<12:46:47, 5.66s/it] {'loss': 0.5062, 'learning_rate': 1.991084834891464e-05, 'epoch': 0.07} + 7%|▋ | 624/8750 [1:01:25<12:46:46, 5.66s/it] 7%|▋ | 625/8750 [1:01:37<12:44:03, 5.64s/it] 7%|▋ | 625/8750 [1:01:31<12:44:03, 5.64s/it] {'loss': 0.4975, 
'learning_rate': 1.9910354490158498e-05, 'epoch': 0.07} + 7%|▋ | 625/8750 [1:01:37<12:44:03, 5.64s/it] {'loss': 0.4975, 'learning_rate': 1.9910354490158498e-05, 'epoch': 0.07} + 7%|▋ | 625/8750 [1:01:31<12:44:03, 5.64s/it] 7%|▋ | 626/8750 [1:01:43<13:05:36, 5.80s/it] 7%|▋ | 626/8750 [1:01:37<13:05:35, 5.80s/it] {'loss': 0.4892, 'learning_rate': 1.9909859273462525e-05, 'epoch': 0.07} + 7%|▋ | 626/8750 [1:01:43<13:05:36, 5.80s/it] {'loss': 0.4892, 'learning_rate': 1.9909859273462525e-05, 'epoch': 0.07} + 7%|▋ | 626/8750 [1:01:37<13:05:35, 5.80s/it] 7%|▋ | 627/8750 [1:01:49<13:14:20, 5.87s/it] 7%|▋ | 627/8750 [1:01:43<13:14:19, 5.87s/it] {'loss': 0.4962, 'learning_rate': 1.9909362698894585e-05, 'epoch': 0.07} + 7%|▋ | 627/8750 [1:01:49<13:14:20, 5.87s/it] {'loss': 0.4962, 'learning_rate': 1.9909362698894585e-05, 'epoch': 0.07} + 7%|▋ | 627/8750 [1:01:43<13:14:19, 5.87s/it] 7%|▋ | 628/8750 [1:01:55<13:01:09, 5.77s/it] 7%|▋ | 628/8750 [1:01:48<13:01:10, 5.77s/it] {'loss': 0.5167, 'learning_rate': 1.9908864766522716e-05, 'epoch': 0.07} + 7%|▋ | 628/8750 [1:01:55<13:01:09, 5.77s/it] {'loss': 0.5167, 'learning_rate': 1.9908864766522716e-05, 'epoch': 0.07} + 7%|▋ | 628/8750 [1:01:48<13:01:10, 5.77s/it] 7%|▋ | 629/8750 [1:02:00<12:54:02, 5.72s/it] 7%|▋ | 629/8750 [1:01:54<12:54:01, 5.72s/it] {'loss': 0.5168, 'learning_rate': 1.9908365476415146e-05, 'epoch': 0.07} + 7%|▋ | 629/8750 [1:02:00<12:54:02, 5.72s/it] {'loss': 0.5168, 'learning_rate': 1.9908365476415146e-05, 'epoch': 0.07} + 7%|▋ | 629/8750 [1:01:54<12:54:01, 5.72s/it] 7%|▋ | 630/8750 [1:02:06<12:54:23, 5.72s/it] 7%|▋ | 630/8750 [1:02:00<12:54:23, 5.72s/it] {'loss': 0.502, 'learning_rate': 1.9907864828640292e-05, 'epoch': 0.07} + 7%|▋ | 630/8750 [1:02:06<12:54:23, 5.72s/it] {'loss': 0.502, 'learning_rate': 1.9907864828640292e-05, 'epoch': 0.07} + 7%|▋ | 630/8750 [1:02:00<12:54:23, 5.72s/it] 7%|▋ | 631/8750 [1:02:12<12:50:49, 5.70s/it] 7%|▋ | 631/8750 [1:02:05<12:50:49, 5.70s/it] {'loss': 0.5143, 'learning_rate': 
1.9907362823266752e-05, 'epoch': 0.07} + 7%|▋ | 631/8750 [1:02:12<12:50:49, 5.70s/it] {'loss': 0.5143, 'learning_rate': 1.9907362823266752e-05, 'epoch': 0.07} + 7%|▋ | 631/8750 [1:02:05<12:50:49, 5.70s/it] 7%|▋ | 632/8750 [1:02:18<12:52:17, 5.71s/it] 7%|▋ | 632/8750 [1:02:11<12:52:16, 5.71s/it] {'loss': 0.5045, 'learning_rate': 1.9906859460363307e-05, 'epoch': 0.07} + 7%|▋ | 632/8750 [1:02:18<12:52:17, 5.71s/it] {'loss': 0.5045, 'learning_rate': 1.9906859460363307e-05, 'epoch': 0.07} + 7%|▋ | 632/8750 [1:02:11<12:52:16, 5.71s/it] 7%|▋ | 633/8750 [1:02:23<12:51:42, 5.70s/it] 7%|▋ | 633/8750 [1:02:17<12:51:42, 5.70s/it] {'loss': 0.5051, 'learning_rate': 1.9906354739998937e-05, 'epoch': 0.07} + 7%|▋ | 633/8750 [1:02:23<12:51:42, 5.70s/it] {'loss': 0.5051, 'learning_rate': 1.9906354739998937e-05, 'epoch': 0.07} + 7%|▋ | 633/8750 [1:02:17<12:51:42, 5.70s/it] 7%|▋ | 634/8750 [1:02:23<12:57:00, 5.74s/it] 7%|▋ | 634/8750 [1:02:29<12:57:00, 5.74s/it] {'loss': 0.4971, 'learning_rate': 1.99058486622428e-05, 'epoch': 0.07} + 7%|▋ | 634/8750 [1:02:23<12:57:00, 5.74s/it]{'loss': 0.4971, 'learning_rate': 1.99058486622428e-05, 'epoch': 0.07} + 7%|▋ | 634/8750 [1:02:29<12:57:00, 5.74s/it] 7%|▋ | 635/8750 [1:02:35<12:58:44, 5.76s/it] 7%|▋ | 635/8750 [1:02:28<12:58:45, 5.76s/it] {'loss': 0.4839, 'learning_rate': 1.990534122716423e-05, 'epoch': 0.07} + 7%|▋ | 635/8750 [1:02:35<12:58:44, 5.76s/it] {'loss': 0.4839, 'learning_rate': 1.990534122716423e-05, 'epoch': 0.07} + 7%|▋ | 635/8750 [1:02:28<12:58:45, 5.76s/it] 7%|▋ | 636/8750 [1:02:40<12:51:05, 5.70s/it] 7%|▋ | 636/8750 [1:02:34<12:51:05, 5.70s/it] {'loss': 0.5059, 'learning_rate': 1.990483243483277e-05, 'epoch': 0.07} + 7%|▋ | 636/8750 [1:02:40<12:51:05, 5.70s/it] {'loss': 0.5059, 'learning_rate': 1.990483243483277e-05, 'epoch': 0.07} + 7%|▋ | 636/8750 [1:02:34<12:51:05, 5.70s/it] 7%|▋ | 637/8750 [1:02:46<12:49:01, 5.69s/it] 7%|▋ | 637/8750 [1:02:40<12:49:01, 5.69s/it] {'loss': 0.504, 'learning_rate': 1.990432228531813e-05, 
'epoch': 0.07} + 7%|▋ | 637/8750 [1:02:46<12:49:01, 5.69s/it] {'loss': 0.504, 'learning_rate': 1.990432228531813e-05, 'epoch': 0.07} + 7%|▋ | 637/8750 [1:02:40<12:49:01, 5.69s/it] 7%|▋ | 638/8750 [1:02:52<12:43:37, 5.65s/it] 7%|▋ | 638/8750 [1:02:45<12:43:37, 5.65s/it]{'loss': 0.5081, 'learning_rate': 1.9903810778690204e-05, 'epoch': 0.07} + {'loss': 0.5081, 'learning_rate': 1.9903810778690204e-05, 'epoch': 0.07} + 7%|▋ | 638/8750 [1:02:52<12:43:37, 5.65s/it] 7%|▋ | 638/8750 [1:02:45<12:43:37, 5.65s/it] 7%|▋ | 639/8750 [1:02:57<12:47:09, 5.67s/it] 7%|▋ | 639/8750 [1:02:51<12:47:08, 5.67s/it] {'loss': 0.5, 'learning_rate': 1.9903297915019093e-05, 'epoch': 0.07} + 7%|▋ | 639/8750 [1:02:57<12:47:09, 5.67s/it] {'loss': 0.5, 'learning_rate': 1.9903297915019093e-05, 'epoch': 0.07} + 7%|▋ | 639/8750 [1:02:51<12:47:08, 5.67s/it] 7%|▋ | 640/8750 [1:03:03<12:53:31, 5.72s/it] 7%|▋ | 640/8750 [1:02:57<12:53:31, 5.72s/it] {'loss': 0.5103, 'learning_rate': 1.9902783694375064e-05, 'epoch': 0.07} + 7%|▋ | 640/8750 [1:03:03<12:53:31, 5.72s/it] {'loss': 0.5103, 'learning_rate': 1.9902783694375064e-05, 'epoch': 0.07} + 7%|▋ | 640/8750 [1:02:57<12:53:31, 5.72s/it] 7%|▋ | 641/8750 [1:03:09<12:51:40, 5.71s/it] 7%|▋ | 641/8750 [1:03:02<12:51:40, 5.71s/it] {'loss': 0.5111, 'learning_rate': 1.9902268116828578e-05, 'epoch': 0.07} + 7%|▋ | 641/8750 [1:03:09<12:51:40, 5.71s/it] {'loss': 0.5111, 'learning_rate': 1.9902268116828578e-05, 'epoch': 0.07} + 7%|▋ | 641/8750 [1:03:02<12:51:40, 5.71s/it] 7%|▋ | 642/8750 [1:03:15<12:54:56, 5.73s/it] 7%|▋ | 642/8750 [1:03:08<12:54:57, 5.73s/it] {'loss': 0.4893, 'learning_rate': 1.9901751182450276e-05, 'epoch': 0.07} + 7%|▋ | 642/8750 [1:03:15<12:54:56, 5.73s/it] {'loss': 0.4893, 'learning_rate': 1.9901751182450276e-05, 'epoch': 0.07} + 7%|▋ | 642/8750 [1:03:08<12:54:57, 5.73s/it] 7%|▋ | 643/8750 [1:03:14<13:04:56, 5.81s/it] 7%|▋ | 643/8750 [1:03:21<13:04:57, 5.81s/it] {'loss': 0.4861, 'learning_rate': 1.9901232891310998e-05, 'epoch': 0.07} + 7%|▋ | 
643/8750 [1:03:21<13:04:57, 5.81s/it] {'loss': 0.4861, 'learning_rate': 1.9901232891310998e-05, 'epoch': 0.07} + 7%|▋ | 643/8750 [1:03:14<13:04:56, 5.81s/it] 7%|▋ | 644/8750 [1:03:26<13:01:00, 5.78s/it] 7%|▋ | 644/8750 [1:03:20<13:01:00, 5.78s/it] {'loss': 0.5005, 'learning_rate': 1.9900713243481758e-05, 'epoch': 0.07} + 7%|▋ | 644/8750 [1:03:26<13:01:00, 5.78s/it] {'loss': 0.5005, 'learning_rate': 1.9900713243481758e-05, 'epoch': 0.07} + 7%|▋ | 644/8750 [1:03:20<13:01:00, 5.78s/it] 7%|▋ | 645/8750 [1:03:26<12:56:53, 5.75s/it] 7%|▋ | 645/8750 [1:03:32<12:56:53, 5.75s/it] {'loss': 0.4887, 'learning_rate': 1.990019223903376e-05, 'epoch': 0.07} + 7%|▋ | 645/8750 [1:03:32<12:56:53, 5.75s/it] {'loss': 0.4887, 'learning_rate': 1.990019223903376e-05, 'epoch': 0.07} + 7%|▋ | 645/8750 [1:03:26<12:56:53, 5.75s/it] 7%|▋ | 646/8750 [1:03:38<12:52:42, 5.72s/it] 7%|▋ | 646/8750 [1:03:31<12:52:43, 5.72s/it] {'loss': 0.5158, 'learning_rate': 1.9899669878038382e-05, 'epoch': 0.07} + 7%|▋ | 646/8750 [1:03:38<12:52:42, 5.72s/it] {'loss': 0.5158, 'learning_rate': 1.9899669878038382e-05, 'epoch': 0.07} + 7%|▋ | 646/8750 [1:03:31<12:52:43, 5.72s/it] 7%|▋ | 647/8750 [1:03:43<12:51:16, 5.71s/it] 7%|▋ | 647/8750 [1:03:37<12:51:17, 5.71s/it] {'loss': 0.4871, 'learning_rate': 1.989914616056722e-05, 'epoch': 0.07} + 7%|▋ | 647/8750 [1:03:43<12:51:16, 5.71s/it] {'loss': 0.4871, 'learning_rate': 1.989914616056722e-05, 'epoch': 0.07} + 7%|▋ | 647/8750 [1:03:37<12:51:17, 5.71s/it] 7%|▋ | 648/8750 [1:03:49<12:58:32, 5.77s/it] 7%|▋ | 648/8750 [1:03:43<12:58:32, 5.77s/it] {'loss': 0.5133, 'learning_rate': 1.9898621086692017e-05, 'epoch': 0.07} + 7%|▋ | 648/8750 [1:03:49<12:58:32, 5.77s/it] {'loss': 0.5133, 'learning_rate': 1.9898621086692017e-05, 'epoch': 0.07} + 7%|▋ | 648/8750 [1:03:43<12:58:32, 5.77s/it] 7%|▋ | 649/8750 [1:03:55<13:06:17, 5.82s/it] 7%|▋ | 649/8750 [1:03:49<13:06:17, 5.82s/it] {'loss': 0.4984, 'learning_rate': 1.989809465648473e-05, 'epoch': 0.07} + 7%|▋ | 649/8750 
[1:03:55<13:06:17, 5.82s/it] {'loss': 0.4984, 'learning_rate': 1.989809465648473e-05, 'epoch': 0.07} + 7%|▋ | 649/8750 [1:03:49<13:06:17, 5.82s/it]3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +148 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 7%|▋ | 650/8750 [1:04:01<13:07:01, 5.83s/it]0 10 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 7%|▋ | 650/8750 [1:03:55<13:07:01, 5.83s/it] {'loss': 0.4881, 'learning_rate': 1.989756687001749e-05, 'epoch': 0.07} + 7%|▋ | 650/8750 [1:04:01<13:07:01, 5.83s/it] {'loss': 0.4881, 'learning_rate': 1.989756687001749e-05, 'epoch': 0.07} + 7%|▋ | 650/8750 [1:03:55<13:07:01, 5.83s/it] 7%|▋ | 651/8750 [1:04:07<13:05:58, 5.82s/it] 7%|▋ | 651/8750 [1:04:00<13:05:57, 5.82s/it] {'loss': 0.4802, 'learning_rate': 1.9897037727362612e-05, 'epoch': 0.07} + 7%|▋ | 651/8750 [1:04:07<13:05:58, 5.82s/it] {'loss': 0.4802, 'learning_rate': 1.9897037727362612e-05, 'epoch': 0.07} + 7%|▋ | 651/8750 [1:04:00<13:05:57, 5.82s/it] 7%|▋ | 652/8750 [1:04:13<12:59:01, 5.77s/it] 7%|▋ | 652/8750 [1:04:06<12:59:00, 5.77s/it] {'loss': 0.5036, 'learning_rate': 1.9896507228592604e-05, 'epoch': 0.07} + 7%|▋ | 652/8750 [1:04:13<12:59:01, 5.77s/it] {'loss': 0.5036, 'learning_rate': 1.9896507228592604e-05, 'epoch': 0.07} + 7%|▋ | 652/8750 [1:04:06<12:59:00, 5.77s/it] 7%|▋ | 653/8750 [1:04:18<13:03:11, 5.80s/it] 7%|▋ | 
653/8750 [1:04:12<13:03:11, 5.80s/it] {'loss': 0.4942, 'learning_rate': 1.989597537378015e-05, 'epoch': 0.07} + 7%|▋ | 653/8750 [1:04:18<13:03:11, 5.80s/it] {'loss': 0.4942, 'learning_rate': 1.989597537378015e-05, 'epoch': 0.07} + 7%|▋ | 653/8750 [1:04:12<13:03:11, 5.80s/it] 7%|▋ | 654/8750 [1:04:24<12:55:31, 5.75s/it] 7%|▋ | 654/8750 [1:04:18<12:55:31, 5.75s/it] {'loss': 0.5228, 'learning_rate': 1.9895442162998136e-05, 'epoch': 0.07} + 7%|▋ | 654/8750 [1:04:24<12:55:31, 5.75s/it] {'loss': 0.5228, 'learning_rate': 1.9895442162998136e-05, 'epoch': 0.07} + 7%|▋ | 654/8750 [1:04:18<12:55:31, 5.75s/it] 7%|▋ | 655/8750 [1:04:30<12:56:34, 5.76s/it] 7%|▋ | 655/8750 [1:04:23<12:56:33, 5.76s/it] {'loss': 0.4976, 'learning_rate': 1.9894907596319615e-05, 'epoch': 0.07} + 7%|▋ | 655/8750 [1:04:30<12:56:34, 5.76s/it] {'loss': 0.4976, 'learning_rate': 1.9894907596319615e-05, 'epoch': 0.07} + 7%|▋ | 655/8750 [1:04:23<12:56:33, 5.76s/it] 7%|▋ | 656/8750 [1:04:29<12:51:32, 5.72s/it] {'loss': 0.5151, 'learning_rate': 1.989437167381784e-05, 'epoch': 0.07} + 7%|▋ | 656/8750 [1:04:29<12:51:32, 5.72s/it] 7%|▋ | 656/8750 [1:04:35<12:51:32, 5.72s/it] {'loss': 0.5151, 'learning_rate': 1.989437167381784e-05, 'epoch': 0.07} + 7%|▋ | 656/8750 [1:04:35<12:51:32, 5.72s/it] 8%|▊ | 657/8750 [1:04:41<12:44:55, 5.67s/it] 8%|▊ | 657/8750 [1:04:34<12:44:55, 5.67s/it] {'loss': 0.5039, 'learning_rate': 1.9893834395566242e-05, 'epoch': 0.08} + 8%|▊ | 657/8750 [1:04:35<12:44:55, 5.67s/it] {'loss': 0.5039, 'learning_rate': 1.9893834395566242e-05, 'epoch': 0.08} + 8%|▊ | 657/8750 [1:04:41<12:44:55, 5.67s/it] 8%|▊ | 658/8750 [1:04:47<12:41:59, 5.65s/it] 8%|▊ | 658/8750 [1:04:40<12:41:59, 5.65s/it] {'loss': 0.4843, 'learning_rate': 1.989329576163844e-05, 'epoch': 0.08} + 8%|▊ | 658/8750 [1:04:47<12:41:59, 5.65s/it] {'loss': 0.4843, 'learning_rate': 1.989329576163844e-05, 'epoch': 0.08} + 8%|▊ | 658/8750 [1:04:40<12:41:59, 5.65s/it] 8%|▊ | 659/8750 [1:04:52<12:43:03, 5.66s/it] 8%|▊ | 659/8750 
[1:04:46<12:43:06, 5.66s/it] {'loss': 0.5066, 'learning_rate': 1.989275577210824e-05, 'epoch': 0.08} + 8%|▊ | 659/8750 [1:04:52<12:43:03, 5.66s/it] {'loss': 0.5066, 'learning_rate': 1.989275577210824e-05, 'epoch': 0.08} + 8%|▊ | 659/8750 [1:04:46<12:43:06, 5.66s/it] 8%|▊ | 660/8750 [1:04:58<12:44:52, 5.67s/it] 8%|▊ | 660/8750 [1:04:51<12:44:52, 5.67s/it] {'loss': 0.497, 'learning_rate': 1.989221442704963e-05, 'epoch': 0.08} + 8%|▊ | 660/8750 [1:04:58<12:44:52, 5.67s/it] {'loss': 0.497, 'learning_rate': 1.989221442704963e-05, 'epoch': 0.08} + 8%|▊ | 660/8750 [1:04:51<12:44:52, 5.67s/it] 8%|▊ | 661/8750 [1:05:04<12:47:10, 5.69s/it] 8%|▊ | 661/8750 [1:04:57<12:47:10, 5.69s/it] {'loss': 0.4965, 'learning_rate': 1.9891671726536787e-05, 'epoch': 0.08} + 8%|▊ | 661/8750 [1:05:04<12:47:10, 5.69s/it] {'loss': 0.4965, 'learning_rate': 1.9891671726536787e-05, 'epoch': 0.08} + 8%|▊ | 661/8750 [1:04:57<12:47:10, 5.69s/it] 8%|▊ | 662/8750 [1:05:03<12:45:34, 5.68s/it] 8%|▊ | 662/8750 [1:05:09<12:45:37, 5.68s/it] {'loss': 0.4853, 'learning_rate': 1.9891127670644076e-05, 'epoch': 0.08} + 8%|▊ | 662/8750 [1:05:09<12:45:37, 5.68s/it] {'loss': 0.4853, 'learning_rate': 1.9891127670644076e-05, 'epoch': 0.08} + 8%|▊ | 662/8750 [1:05:03<12:45:34, 5.68s/it] 8%|▊ | 663/8750 [1:05:15<12:55:43, 5.76s/it] 8%|▊ | 663/8750 [1:05:09<12:55:48, 5.76s/it] {'loss': 0.5132, 'learning_rate': 1.9890582259446046e-05, 'epoch': 0.08} + 8%|▊ | 663/8750 [1:05:15<12:55:43, 5.76s/it] {'loss': 0.5132, 'learning_rate': 1.9890582259446046e-05, 'epoch': 0.08} + 8%|▊ | 663/8750 [1:05:09<12:55:48, 5.76s/it] 8%|▊ | 664/8750 [1:05:15<13:10:31, 5.87s/it] {'loss': 0.4881, 'learning_rate': 1.9890035493017424e-05, 'epoch': 0.08} + 8%|▊ | 664/8750 [1:05:15<13:10:31, 5.87s/it] 8%|▊ | 664/8750 [1:05:21<13:10:33, 5.87s/it] {'loss': 0.4881, 'learning_rate': 1.9890035493017424e-05, 'epoch': 0.08} + 8%|▊ | 664/8750 [1:05:21<13:10:33, 5.87s/it] 8%|▊ | 665/8750 [1:05:27<13:01:05, 5.80s/it] 8%|▊ | 665/8750 [1:05:21<13:01:05, 
5.80s/it] {'loss': 0.5049, 'learning_rate': 1.9889487371433134e-05, 'epoch': 0.08} + 8%|▊ | 665/8750 [1:05:27<13:01:05, 5.80s/it] {'loss': 0.5049, 'learning_rate': 1.9889487371433134e-05, 'epoch': 0.08} + 8%|▊ | 665/8750 [1:05:21<13:01:05, 5.80s/it] 8%|▊ | 666/8750 [1:05:33<12:56:26, 5.76s/it] 8%|▊ | 666/8750 [1:05:26<12:56:27, 5.76s/it] {'loss': 0.4886, 'learning_rate': 1.988893789476828e-05, 'epoch': 0.08} + 8%|▊ | 666/8750 [1:05:33<12:56:26, 5.76s/it] {'loss': 0.4886, 'learning_rate': 1.988893789476828e-05, 'epoch': 0.08} + 8%|▊ | 666/8750 [1:05:26<12:56:27, 5.76s/it] 8%|▊ | 667/8750 [1:05:39<13:11:30, 5.88s/it] 8%|▊ | 667/8750 [1:05:32<13:11:34, 5.88s/it] {'loss': 0.5109, 'learning_rate': 1.9888387063098153e-05, 'epoch': 0.08} + 8%|▊ | 667/8750 [1:05:39<13:11:30, 5.88s/it] {'loss': 0.5109, 'learning_rate': 1.9888387063098153e-05, 'epoch': 0.08} + 8%|▊ | 667/8750 [1:05:32<13:11:34, 5.88s/it] 8%|▊ | 668/8750 [1:05:45<13:08:37, 5.85s/it] 8%|▊ | 668/8750 [1:05:38<13:08:36, 5.85s/it] {'loss': 0.4744, 'learning_rate': 1.9887834876498228e-05, 'epoch': 0.08} + 8%|▊ | 668/8750 [1:05:45<13:08:37, 5.85s/it] {'loss': 0.4744, 'learning_rate': 1.9887834876498228e-05, 'epoch': 0.08} + 8%|▊ | 668/8750 [1:05:38<13:08:36, 5.85s/it] 8%|▊ | 669/8750 [1:05:50<12:57:27, 5.77s/it] 8%|▊ | 669/8750 [1:05:44<12:57:26, 5.77s/it] {'loss': 0.4952, 'learning_rate': 1.9887281335044167e-05, 'epoch': 0.08} + 8%|▊ | 669/8750 [1:05:50<12:57:27, 5.77s/it] {'loss': 0.4952, 'learning_rate': 1.9887281335044167e-05, 'epoch': 0.08} + 8%|▊ | 669/8750 [1:05:44<12:57:26, 5.77s/it] 8%|▊ | 670/8750 [1:05:56<13:00:52, 5.80s/it] 8%|▊ | 670/8750 [1:05:50<13:00:52, 5.80s/it] {'loss': 0.4774, 'learning_rate': 1.988672643881182e-05, 'epoch': 0.08} + 8%|▊ | 670/8750 [1:05:56<13:00:52, 5.80s/it] {'loss': 0.4774, 'learning_rate': 1.988672643881182e-05, 'epoch': 0.08} + 8%|▊ | 670/8750 [1:05:50<13:00:52, 5.80s/it] 8%|▊ | 671/8750 [1:06:02<12:54:10, 5.75s/it] 8%|▊ | 671/8750 [1:05:55<12:54:09, 5.75s/it] {'loss': 
0.509, 'learning_rate': 1.9886170187877214e-05, 'epoch': 0.08} + 8%|▊ | 671/8750 [1:06:02<12:54:10, 5.75s/it] {'loss': 0.509, 'learning_rate': 1.9886170187877214e-05, 'epoch': 0.08} + 8%|▊ | 671/8750 [1:05:55<12:54:09, 5.75s/it] 8%|▊ | 672/8750 [1:06:07<12:47:18, 5.70s/it] {'loss': 0.4924, 'learning_rate': 1.9885612582316575e-05, 'epoch': 0.08} + 8%|▊ | 672/8750 [1:06:01<12:47:17, 5.70s/it] {'loss': 0.4924, 'learning_rate': 1.9885612582316575e-05, 'epoch': 0.08} 8%|▊ | 672/8750 [1:06:07<12:47:18, 5.70s/it] + 8%|▊ | 672/8750 [1:06:01<12:47:17, 5.70s/it] 8%|▊ | 673/8750 [1:06:13<12:50:56, 5.73s/it] 8%|▊ | 673/8750 [1:06:07<12:50:55, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.9885053622206305e-05, 'epoch': 0.08} + 8%|▊ | 673/8750 [1:06:13<12:50:56, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.9885053622206305e-05, 'epoch': 0.08} + 8%|▊ | 673/8750 [1:06:07<12:50:55, 5.73s/it] 8%|▊ | 674/8750 [1:06:19<12:50:49, 5.73s/it] 8%|▊ | 674/8750 [1:06:12<12:50:49, 5.73s/it] {'loss': 0.5126, 'learning_rate': 1.9884493307622993e-05, 'epoch': 0.08} + 8%|▊ | 674/8750 [1:06:19<12:50:49, 5.73s/it] {'loss': 0.5126, 'learning_rate': 1.9884493307622993e-05, 'epoch': 0.08} + 8%|▊ | 674/8750 [1:06:12<12:50:49, 5.73s/it] 8%|▊ | 675/8750 [1:06:25<13:02:37, 5.82s/it] 8%|▊ | 675/8750 [1:06:18<13:02:37, 5.82s/it] {'loss': 0.4876, 'learning_rate': 1.988393163864341e-05, 'epoch': 0.08} + 8%|▊ | 675/8750 [1:06:25<13:02:37, 5.82s/it] {'loss': 0.4876, 'learning_rate': 1.988393163864341e-05, 'epoch': 0.08} + 8%|▊ | 675/8750 [1:06:18<13:02:37, 5.82s/it] 8%|▊ | 676/8750 [1:06:30<12:53:04, 5.74s/it] 8%|▊ | 676/8750 [1:06:24<12:53:04, 5.74s/it] {'loss': 0.4895, 'learning_rate': 1.9883368615344526e-05, 'epoch': 0.08} + 8%|▊ | 676/8750 [1:06:30<12:53:04, 5.74s/it] {'loss': 0.4895, 'learning_rate': 1.9883368615344526e-05, 'epoch': 0.08} + 8%|▊ | 676/8750 [1:06:24<12:53:04, 5.74s/it] 8%|▊ | 677/8750 [1:06:36<12:50:26, 5.73s/it] 8%|▊ | 677/8750 [1:06:30<12:50:27, 5.73s/it] {'loss': 0.5074, 
'learning_rate': 1.9882804237803487e-05, 'epoch': 0.08} + 8%|▊ | 677/8750 [1:06:36<12:50:26, 5.73s/it] {'loss': 0.5074, 'learning_rate': 1.9882804237803487e-05, 'epoch': 0.08} + 8%|▊ | 677/8750 [1:06:30<12:50:27, 5.73s/it] 8%|▊ | 678/8750 [1:06:42<12:53:52, 5.75s/it] 8%|▊ | 678/8750 [1:06:35<12:53:51, 5.75s/it] {'loss': 0.4826, 'learning_rate': 1.988223850609762e-05, 'epoch': 0.08} + 8%|▊ | 678/8750 [1:06:42<12:53:52, 5.75s/it] {'loss': 0.4826, 'learning_rate': 1.988223850609762e-05, 'epoch': 0.08} + 8%|▊ | 678/8750 [1:06:35<12:53:51, 5.75s/it] 8%|▊ | 679/8750 [1:06:48<12:54:14, 5.76s/it] 8%|▊ | 679/8750 [1:06:41<12:54:14, 5.76s/it] {'loss': 0.5268, 'learning_rate': 1.9881671420304444e-05, 'epoch': 0.08} + 8%|▊ | 679/8750 [1:06:48<12:54:14, 5.76s/it] {'loss': 0.5268, 'learning_rate': 1.9881671420304444e-05, 'epoch': 0.08} + 8%|▊ | 679/8750 [1:06:41<12:54:14, 5.76s/it] 8%|▊ | 680/8750 [1:06:53<12:48:38, 5.71s/it] 8%|▊ | 680/8750 [1:06:47<12:48:37, 5.71s/it] {'loss': 0.4995, 'learning_rate': 1.9881102980501664e-05, 'epoch': 0.08} + 8%|▊ | 680/8750 [1:06:53<12:48:38, 5.71s/it] {'loss': 0.4995, 'learning_rate': 1.9881102980501664e-05, 'epoch': 0.08} + 8%|▊ | 680/8750 [1:06:47<12:48:37, 5.71s/it] 8%|▊ | 681/8750 [1:06:59<12:45:47, 5.69s/it] 8%|▊ | 681/8750 [1:06:52<12:45:47, 5.69s/it] {'loss': 0.5015, 'learning_rate': 1.988053318676717e-05, 'epoch': 0.08} + 8%|▊ | 681/8750 [1:06:59<12:45:47, 5.69s/it] {'loss': 0.5015, 'learning_rate': 1.988053318676717e-05, 'epoch': 0.08} + 8%|▊ | 681/8750 [1:06:52<12:45:47, 5.69s/it] 8%|▊ | 682/8750 [1:07:05<12:46:36, 5.70s/it] 8%|▊ | 682/8750 [1:06:58<12:46:37, 5.70s/it] {'loss': 0.5134, 'learning_rate': 1.9879962039179033e-05, 'epoch': 0.08} + 8%|▊ | 682/8750 [1:07:05<12:46:36, 5.70s/it] {'loss': 0.5134, 'learning_rate': 1.9879962039179033e-05, 'epoch': 0.08} + 8%|▊ | 682/8750 [1:06:58<12:46:37, 5.70s/it] 8%|▊ | 683/8750 [1:07:10<12:46:39, 5.70s/it] 8%|▊ | 683/8750 [1:07:04<12:46:38, 5.70s/it] {'loss': 0.4966, 'learning_rate': 
1.9879389537815514e-05, 'epoch': 0.08} + 8%|▊ | 683/8750 [1:07:10<12:46:39, 5.70s/it] {'loss': 0.4966, 'learning_rate': 1.9879389537815514e-05, 'epoch': 0.08} + 8%|▊ | 683/8750 [1:07:04<12:46:38, 5.70s/it] 8%|▊ | 684/8750 [1:07:16<12:58:09, 5.79s/it] 8%|▊ | 684/8750 [1:07:10<12:58:10, 5.79s/it] {'loss': 0.4792, 'learning_rate': 1.9878815682755062e-05, 'epoch': 0.08} + 8%|▊ | 684/8750 [1:07:16<12:58:09, 5.79s/it] {'loss': 0.4792, 'learning_rate': 1.9878815682755062e-05, 'epoch': 0.08} + 8%|▊ | 684/8750 [1:07:10<12:58:10, 5.79s/it] 8%|▊ | 685/8750 [1:07:22<12:49:12, 5.72s/it] 8%|▊ | 685/8750 [1:07:15<12:49:13, 5.72s/it] {'loss': 0.523, 'learning_rate': 1.9878240474076306e-05, 'epoch': 0.08} + 8%|▊ | 685/8750 [1:07:22<12:49:12, 5.72s/it] {'loss': 0.523, 'learning_rate': 1.9878240474076306e-05, 'epoch': 0.08} + 8%|▊ | 685/8750 [1:07:15<12:49:13, 5.72s/it] 8%|▊ | 686/8750 [1:07:28<12:52:03, 5.74s/it] 8%|▊ | 686/8750 [1:07:21<12:52:03, 5.74s/it] {'loss': 0.492, 'learning_rate': 1.987766391185806e-05, 'epoch': 0.08} + 8%|▊ | 686/8750 [1:07:28<12:52:03, 5.74s/it] {'loss': 0.492, 'learning_rate': 1.987766391185806e-05, 'epoch': 0.08} + 8%|▊ | 686/8750 [1:07:21<12:52:03, 5.74s/it] 8%|▊ | 687/8750 [1:07:33<12:47:53, 5.71s/it] 8%|▊ | 687/8750 [1:07:27<12:47:52, 5.71s/it] {'loss': 0.5097, 'learning_rate': 1.9877085996179327e-05, 'epoch': 0.08} + 8%|▊ | 687/8750 [1:07:33<12:47:53, 5.71s/it] {'loss': 0.5097, 'learning_rate': 1.9877085996179327e-05, 'epoch': 0.08} + 8%|▊ | 687/8750 [1:07:27<12:47:52, 5.71s/it] 8%|▊ | 688/8750 [1:07:39<12:54:41, 5.77s/it] 8%|▊ | 688/8750 [1:07:33<12:54:41, 5.77s/it] {'loss': 0.4948, 'learning_rate': 1.9876506727119294e-05, 'epoch': 0.08} + 8%|▊ | 688/8750 [1:07:39<12:54:41, 5.77s/it] {'loss': 0.4948, 'learning_rate': 1.9876506727119294e-05, 'epoch': 0.08} + 8%|▊ | 688/8750 [1:07:33<12:54:41, 5.77s/it] 8%|▊ | 689/8750 [1:07:45<12:54:50, 5.77s/it] 8%|▊ | 689/8750 [1:07:39<12:54:51, 5.77s/it] {'loss': 0.5193, 'learning_rate': 1.9875926104757337e-05, 
'epoch': 0.08} + 8%|▊ | 689/8750 [1:07:45<12:54:50, 5.77s/it] {'loss': 0.5193, 'learning_rate': 1.9875926104757337e-05, 'epoch': 0.08} + 8%|▊ | 689/8750 [1:07:39<12:54:51, 5.77s/it] 8%|▊ | 690/8750 [1:07:51<12:53:04, 5.75s/it] 8%|▊ | 690/8750 [1:07:44<12:53:04, 5.75s/it] {'loss': 0.5, 'learning_rate': 1.9875344129173012e-05, 'epoch': 0.08} + 8%|▊ | 690/8750 [1:07:51<12:53:04, 5.75s/it] {'loss': 0.5, 'learning_rate': 1.9875344129173012e-05, 'epoch': 0.08} + 8%|▊ | 690/8750 [1:07:44<12:53:04, 5.75s/it] 8%|▊ | 691/8750 [1:07:57<12:58:05, 5.79s/it] 8%|▊ | 691/8750 [1:07:50<12:58:05, 5.79s/it] {'loss': 0.4983, 'learning_rate': 1.9874760800446063e-05, 'epoch': 0.08} + 8%|▊ | 691/8750 [1:07:57<12:58:05, 5.79s/it] {'loss': 0.4983, 'learning_rate': 1.9874760800446063e-05, 'epoch': 0.08} + 8%|▊ | 691/8750 [1:07:50<12:58:05, 5.79s/it] 8%|▊ | 692/8750 [1:08:03<13:01:50, 5.82s/it] 8%|▊ | 692/8750 [1:07:56<13:01:51, 5.82s/it] {'loss': 0.4759, 'learning_rate': 1.9874176118656415e-05, 'epoch': 0.08} + 8%|▊ | 692/8750 [1:08:03<13:01:50, 5.82s/it] {'loss': 0.4759, 'learning_rate': 1.9874176118656415e-05, 'epoch': 0.08} + 8%|▊ | 692/8750 [1:07:56<13:01:51, 5.82s/it] 8%|▊ | 693/8750 [1:08:08<12:52:33, 5.75s/it] 8%|▊ | 693/8750 [1:08:02<12:52:33, 5.75s/it] {'loss': 0.5069, 'learning_rate': 1.9873590083884192e-05, 'epoch': 0.08} + 8%|▊ | 693/8750 [1:08:08<12:52:33, 5.75s/it] {'loss': 0.5069, 'learning_rate': 1.9873590083884192e-05, 'epoch': 0.08} + 8%|▊ | 693/8750 [1:08:02<12:52:33, 5.75s/it] 8%|▊ | 694/8750 [1:08:14<12:56:29, 5.78s/it] 8%|▊ | 694/8750 [1:08:08<12:56:30, 5.78s/it] {'loss': 0.5129, 'learning_rate': 1.9873002696209688e-05, 'epoch': 0.08} + 8%|▊ | 694/8750 [1:08:14<12:56:29, 5.78s/it] {'loss': 0.5129, 'learning_rate': 1.9873002696209688e-05, 'epoch': 0.08} + 8%|▊ | 694/8750 [1:08:08<12:56:30, 5.78s/it] 8%|▊ | 695/8750 [1:08:20<13:03:04, 5.83s/it] 8%|▊ | 695/8750 [1:08:13<13:03:03, 5.83s/it] {'loss': 0.508, 'learning_rate': 1.9872413955713382e-05, 'epoch': 0.08} + 8%|▊ | 
695/8750 [1:08:13<13:03:03, 5.83s/it]{'loss': 0.508, 'learning_rate': 1.9872413955713382e-05, 'epoch': 0.08} + 8%|▊ | 695/8750 [1:08:20<13:03:04, 5.83s/it] 8%|▊ | 696/8750 [1:08:26<12:55:04, 5.77s/it] 8%|▊ | 696/8750 [1:08:19<12:55:04, 5.77s/it] {'loss': 0.4963, 'learning_rate': 1.9871823862475955e-05, 'epoch': 0.08} + 8%|▊ | 696/8750 [1:08:19<12:55:04, 5.77s/it]{'loss': 0.4963, 'learning_rate': 1.9871823862475955e-05, 'epoch': 0.08} + 8%|▊ | 696/8750 [1:08:26<12:55:04, 5.77s/it] 8%|▊ | 697/8750 [1:08:31<12:53:02, 5.76s/it] 8%|▊ | 697/8750 [1:08:25<12:53:02, 5.76s/it] {'loss': 0.5074, 'learning_rate': 1.987123241657826e-05, 'epoch': 0.08} + 8%|▊ | 697/8750 [1:08:31<12:53:02, 5.76s/it] {'loss': 0.5074, 'learning_rate': 1.987123241657826e-05, 'epoch': 0.08} + 8%|▊ | 697/8750 [1:08:25<12:53:02, 5.76s/it] 8%|▊ | 698/8750 [1:08:37<12:44:53, 5.70s/it] 8%|▊ | 698/8750 [1:08:30<12:44:53, 5.70s/it] {'loss': 0.5238, 'learning_rate': 1.9870639618101333e-05, 'epoch': 0.08} + 8%|▊ | 698/8750 [1:08:37<12:44:53, 5.70s/it] {'loss': 0.5238, 'learning_rate': 1.9870639618101333e-05, 'epoch': 0.08} + 8%|▊ | 698/8750 [1:08:30<12:44:53, 5.70s/it] 8%|▊ | 699/8750 [1:08:43<12:45:08, 5.70s/it] 8%|▊ | 699/8750 [1:08:36<12:45:08, 5.70s/it] {'loss': 0.4916, 'learning_rate': 1.987004546712641e-05, 'epoch': 0.08} + 8%|▊ | 699/8750 [1:08:43<12:45:08, 5.70s/it] {'loss': 0.4916, 'learning_rate': 1.987004546712641e-05, 'epoch': 0.08} + 8%|▊ | 699/8750 [1:08:36<12:45:08, 5.70s/it]2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +0 3 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ 8%|▊ | 700/8750 [1:08:42<12:46:35, 5.71s/it] {'loss': 0.4913, 'learning_rate': 1.9869449963734894e-05, 'epoch': 0.08} + 8%|▊ | 700/8750 [1:08:42<12:46:35, 5.71s/it]1213 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 8%|▊ | 700/8750 [1:08:48<12:46:36, 5.71s/it]9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4913, 'learning_rate': 1.9869449963734894e-05, 'epoch': 0.08} + 8%|▊ | 700/8750 [1:08:48<12:46:36, 5.71s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 8%|▊ | 701/8750 [1:09:09<22:46:11, 10.18s/it] 8%|▊ | 701/8750 [1:09:02<22:46:12, 10.18s/it] {'loss': 0.4755, 'learning_rate': 1.9868853108008387e-05, 'epoch': 0.08} + 8%|▊ | 701/8750 [1:09:09<22:46:11, 10.18s/it] {'loss': 0.4755, 'learning_rate': 1.9868853108008387e-05, 'epoch': 0.08} + 8%|▊ | 701/8750 [1:09:02<22:46:12, 10.18s/it] 8%|▊ | 702/8750 [1:09:15<19:47:10, 8.85s/it] 8%|▊ | 702/8750 [1:09:08<19:47:11, 8.85s/it] {'loss': 0.4836, 'learning_rate': 1.986825490002867e-05, 'epoch': 0.08} + 8%|▊ | 702/8750 [1:09:15<19:47:10, 8.85s/it] {'loss': 0.4836, 'learning_rate': 1.986825490002867e-05, 'epoch': 0.08} + 8%|▊ | 702/8750 [1:09:08<19:47:11, 8.85s/it] 8%|▊ | 703/8750 [1:09:20<17:40:03, 7.90s/it] 8%|▊ | 703/8750 [1:09:14<17:40:04, 7.90s/it] {'loss': 0.4985, 'learning_rate': 1.9867655339877713e-05, 'epoch': 0.08} + 8%|▊ | 703/8750 [1:09:20<17:40:03, 7.90s/it] {'loss': 0.4985, 'learning_rate': 1.9867655339877713e-05, 'epoch': 0.08} + 8%|▊ | 703/8750 [1:09:14<17:40:04, 7.90s/it] 8%|▊ | 704/8750 [1:09:26<16:09:22, 7.23s/it] 8%|▊ | 704/8750 [1:09:20<16:09:22, 7.23s/it] {'loss': 0.4972, 'learning_rate': 1.9867054427637667e-05, 'epoch': 0.08} + 8%|▊ | 704/8750 [1:09:26<16:09:22, 7.23s/it] {'loss': 0.4972, 'learning_rate': 1.9867054427637667e-05, 'epoch': 0.08} + 8%|▊ | 704/8750 [1:09:20<16:09:22, 7.23s/it] 8%|▊ | 705/8750 [1:09:32<15:11:54, 6.80s/it] 8%|▊ | 705/8750 [1:09:25<15:11:54, 6.80s/it] {'loss': 0.5101, 'learning_rate': 1.986645216339087e-05, 'epoch': 0.08} + 8%|▊ | 705/8750 [1:09:32<15:11:54, 6.80s/it] {'loss': 0.5101, 'learning_rate': 1.986645216339087e-05, 'epoch': 0.08} + 8%|▊ | 705/8750 [1:09:25<15:11:54, 6.80s/it] 8%|▊ | 706/8750 [1:09:37<14:25:58, 6.46s/it] 8%|▊ | 706/8750 [1:09:31<14:25:58, 6.46s/it] {'loss': 0.4929, 'learning_rate': 1.9865848547219845e-05, 'epoch': 0.08} + 8%|▊ | 706/8750 [1:09:37<14:25:58, 6.46s/it] {'loss': 0.4929, 'learning_rate': 1.9865848547219845e-05, 'epoch': 0.08} + 8%|▊ | 706/8750 [1:09:31<14:25:58, 6.46s/it] 
8%|▊ | 707/8750 [1:09:43<13:52:58, 6.21s/it] 8%|▊ | 707/8750 [1:09:37<13:52:58, 6.21s/it] {'loss': 0.4964, 'learning_rate': 1.9865243579207304e-05, 'epoch': 0.08} + 8%|▊ | 707/8750 [1:09:43<13:52:58, 6.21s/it] {'loss': 0.4964, 'learning_rate': 1.9865243579207304e-05, 'epoch': 0.08} + 8%|▊ | 707/8750 [1:09:37<13:52:58, 6.21s/it] 8%|▊ | 708/8750 [1:09:49<13:24:27, 6.00s/it] 8%|▊ | 708/8750 [1:09:42<13:24:28, 6.00s/it] {'loss': 0.5103, 'learning_rate': 1.986463725943614e-05, 'epoch': 0.08} + 8%|▊ | 708/8750 [1:09:49<13:24:27, 6.00s/it] {'loss': 0.5103, 'learning_rate': 1.986463725943614e-05, 'epoch': 0.08} + 8%|▊ | 708/8750 [1:09:42<13:24:28, 6.00s/it] 8%|▊ | 709/8750 [1:09:54<13:06:30, 5.87s/it] 8%|▊ | 709/8750 [1:09:48<13:06:30, 5.87s/it] {'loss': 0.481, 'learning_rate': 1.9864029587989432e-05, 'epoch': 0.08} + 8%|▊ | 709/8750 [1:09:54<13:06:30, 5.87s/it] {'loss': 0.481, 'learning_rate': 1.9864029587989432e-05, 'epoch': 0.08} + 8%|▊ | 709/8750 [1:09:48<13:06:30, 5.87s/it] 8%|▊ | 710/8750 [1:10:00<13:15:50, 5.94s/it] 8%|▊ | 710/8750 [1:09:54<13:15:49, 5.94s/it] {'loss': 0.4843, 'learning_rate': 1.9863420564950445e-05, 'epoch': 0.08} + 8%|▊ | 710/8750 [1:10:00<13:15:50, 5.94s/it] {'loss': 0.4843, 'learning_rate': 1.9863420564950445e-05, 'epoch': 0.08} + 8%|▊ | 710/8750 [1:09:54<13:15:49, 5.94s/it] 8%|▊ | 711/8750 [1:10:06<13:06:46, 5.87s/it] 8%|▊ | 711/8750 [1:10:00<13:06:47, 5.87s/it] {'loss': 0.5253, 'learning_rate': 1.986281019040263e-05, 'epoch': 0.08} + 8%|▊ | 711/8750 [1:10:06<13:06:46, 5.87s/it] {'loss': 0.5253, 'learning_rate': 1.986281019040263e-05, 'epoch': 0.08} + 8%|▊ | 711/8750 [1:10:00<13:06:47, 5.87s/it] 8%|▊ | 712/8750 [1:10:12<13:06:55, 5.87s/it] 8%|▊ | 712/8750 [1:10:05<13:06:54, 5.87s/it] {'loss': 0.4945, 'learning_rate': 1.9862198464429614e-05, 'epoch': 0.08} + 8%|▊ | 712/8750 [1:10:12<13:06:55, 5.87s/it] {'loss': 0.4945, 'learning_rate': 1.9862198464429614e-05, 'epoch': 0.08} + 8%|▊ | 712/8750 [1:10:05<13:06:54, 5.87s/it] 8%|▊ | 713/8750 
[1:10:18<13:12:22, 5.92s/it] 8%|▊ | 713/8750 [1:10:11<13:12:21, 5.92s/it] {'loss': 0.4945, 'learning_rate': 1.9861585387115228e-05, 'epoch': 0.08} + 8%|▊ | 713/8750 [1:10:18<13:12:22, 5.92s/it] {'loss': 0.4945, 'learning_rate': 1.9861585387115228e-05, 'epoch': 0.08} + 8%|▊ | 713/8750 [1:10:11<13:12:21, 5.92s/it] 8%|▊ | 714/8750 [1:10:24<13:06:27, 5.87s/it] 8%|▊ | 714/8750 [1:10:17<13:06:27, 5.87s/it] {'loss': 0.4998, 'learning_rate': 1.986097095854347e-05, 'epoch': 0.08} + 8%|▊ | 714/8750 [1:10:24<13:06:27, 5.87s/it] {'loss': 0.4998, 'learning_rate': 1.986097095854347e-05, 'epoch': 0.08} + 8%|▊ | 714/8750 [1:10:17<13:06:27, 5.87s/it] 8%|▊ | 715/8750 [1:10:29<12:52:07, 5.77s/it] 8%|▊ | 715/8750 [1:10:23<12:52:07, 5.77s/it] {'loss': 0.4981, 'learning_rate': 1.9860355178798536e-05, 'epoch': 0.08} + 8%|▊ | 715/8750 [1:10:29<12:52:07, 5.77s/it] {'loss': 0.4981, 'learning_rate': 1.9860355178798536e-05, 'epoch': 0.08} + 8%|▊ | 715/8750 [1:10:23<12:52:07, 5.77s/it] 8%|▊ | 716/8750 [1:10:35<12:50:24, 5.75s/it] 8%|▊ | 716/8750 [1:10:28<12:50:25, 5.75s/it] {'loss': 0.5039, 'learning_rate': 1.9859738047964795e-05, 'epoch': 0.08} + 8%|▊ | 716/8750 [1:10:35<12:50:24, 5.75s/it] {'loss': 0.5039, 'learning_rate': 1.9859738047964795e-05, 'epoch': 0.08} + 8%|▊ | 716/8750 [1:10:28<12:50:25, 5.75s/it] 8%|▊ | 717/8750 [1:10:41<12:56:25, 5.80s/it] 8%|▊ | 717/8750 [1:10:34<12:56:25, 5.80s/it] {'loss': 0.4968, 'learning_rate': 1.9859119566126813e-05, 'epoch': 0.08} + 8%|▊ | 717/8750 [1:10:41<12:56:25, 5.80s/it] {'loss': 0.4968, 'learning_rate': 1.9859119566126813e-05, 'epoch': 0.08} + 8%|▊ | 717/8750 [1:10:34<12:56:25, 5.80s/it] 8%|▊ | 718/8750 [1:10:47<12:51:36, 5.76s/it] 8%|▊ | 718/8750 [1:10:40<12:51:36, 5.76s/it] {'loss': 0.4974, 'learning_rate': 1.9858499733369335e-05, 'epoch': 0.08} + 8%|▊ | 718/8750 [1:10:47<12:51:36, 5.76s/it] {'loss': 0.4974, 'learning_rate': 1.9858499733369335e-05, 'epoch': 0.08} + 8%|▊ | 718/8750 [1:10:40<12:51:36, 5.76s/it] 8%|▊ | 719/8750 [1:10:53<13:02:17, 
5.84s/it] 8%|▊ | 719/8750 [1:10:46<13:02:17, 5.84s/it] {'loss': 0.4996, 'learning_rate': 1.985787854977729e-05, 'epoch': 0.08} + 8%|▊ | 719/8750 [1:10:53<13:02:17, 5.84s/it] {'loss': 0.4996, 'learning_rate': 1.985787854977729e-05, 'epoch': 0.08} + 8%|▊ | 719/8750 [1:10:46<13:02:17, 5.84s/it] 8%|▊ | 720/8750 [1:10:58<12:53:57, 5.78s/it] 8%|▊ | 720/8750 [1:10:52<12:53:56, 5.78s/it] {'loss': 0.4793, 'learning_rate': 1.9857256015435797e-05, 'epoch': 0.08} + 8%|▊ | 720/8750 [1:10:58<12:53:57, 5.78s/it] {'loss': 0.4793, 'learning_rate': 1.9857256015435797e-05, 'epoch': 0.08} + 8%|▊ | 720/8750 [1:10:52<12:53:56, 5.78s/it] 8%|▊ | 721/8750 [1:11:04<12:51:33, 5.77s/it] 8%|▊ | 721/8750 [1:10:57<12:51:34, 5.77s/it] {'loss': 0.4923, 'learning_rate': 1.985663213043015e-05, 'epoch': 0.08} + 8%|▊ | 721/8750 [1:11:04<12:51:33, 5.77s/it] {'loss': 0.4923, 'learning_rate': 1.985663213043015e-05, 'epoch': 0.08} + 8%|▊ | 721/8750 [1:10:57<12:51:34, 5.77s/it] 8%|▊ | 722/8750 [1:11:10<12:48:58, 5.75s/it] 8%|▊ | 722/8750 [1:11:03<12:48:58, 5.75s/it] {'loss': 0.4878, 'learning_rate': 1.9856006894845844e-05, 'epoch': 0.08} + 8%|▊ | 722/8750 [1:11:10<12:48:58, 5.75s/it] {'loss': 0.4878, 'learning_rate': 1.9856006894845844e-05, 'epoch': 0.08} + 8%|▊ | 722/8750 [1:11:03<12:48:58, 5.75s/it] 8%|▊ | 723/8750 [1:11:15<12:41:12, 5.69s/it] 8%|▊ | 723/8750 [1:11:09<12:41:12, 5.69s/it] {'loss': 0.5218, 'learning_rate': 1.9855380308768546e-05, 'epoch': 0.08} + 8%|▊ | 723/8750 [1:11:15<12:41:12, 5.69s/it] {'loss': 0.5218, 'learning_rate': 1.9855380308768546e-05, 'epoch': 0.08} + 8%|▊ | 723/8750 [1:11:09<12:41:12, 5.69s/it] 8%|▊ | 724/8750 [1:11:21<12:46:06, 5.73s/it] 8%|▊ | 724/8750 [1:11:14<12:46:06, 5.73s/it] {'loss': 0.4872, 'learning_rate': 1.9854752372284113e-05, 'epoch': 0.08} + 8%|▊ | 724/8750 [1:11:21<12:46:06, 5.73s/it] {'loss': 0.4872, 'learning_rate': 1.9854752372284113e-05, 'epoch': 0.08} + 8%|▊ | 724/8750 [1:11:14<12:46:06, 5.73s/it] 8%|▊ | 725/8750 [1:11:27<12:43:12, 5.71s/it] 8%|▊ | 
725/8750 [1:11:20<12:43:12, 5.71s/it]{'loss': 0.4902, 'learning_rate': 1.9854123085478587e-05, 'epoch': 0.08} + {'loss': 0.4902, 'learning_rate': 1.9854123085478587e-05, 'epoch': 0.08} + 8%|▊ | 725/8750 [1:11:27<12:43:12, 5.71s/it] 8%|▊ | 725/8750 [1:11:20<12:43:12, 5.71s/it] 8%|▊ | 726/8750 [1:11:32<12:37:04, 5.66s/it] 8%|▊ | 726/8750 [1:11:26<12:37:08, 5.66s/it] {'loss': 0.5086, 'learning_rate': 1.9853492448438192e-05, 'epoch': 0.08} + 8%|▊ | 726/8750 [1:11:32<12:37:04, 5.66s/it] {'loss': 0.5086, 'learning_rate': 1.9853492448438192e-05, 'epoch': 0.08} + 8%|▊ | 726/8750 [1:11:26<12:37:08, 5.66s/it] 8%|▊ | 727/8750 [1:11:38<12:44:41, 5.72s/it] 8%|▊ | 727/8750 [1:11:32<12:44:40, 5.72s/it] {'loss': 0.4679, 'learning_rate': 1.985286046124934e-05, 'epoch': 0.08} + 8%|▊ | 727/8750 [1:11:38<12:44:41, 5.72s/it] {'loss': 0.4679, 'learning_rate': 1.985286046124934e-05, 'epoch': 0.08} + 8%|▊ | 727/8750 [1:11:32<12:44:40, 5.72s/it] 8%|▊ | 728/8750 [1:11:44<12:47:58, 5.74s/it] 8%|▊ | 728/8750 [1:11:37<12:47:58, 5.74s/it] {'loss': 0.4948, 'learning_rate': 1.985222712399863e-05, 'epoch': 0.08} + 8%|▊ | 728/8750 [1:11:44<12:47:58, 5.74s/it] {'loss': 0.4948, 'learning_rate': 1.985222712399863e-05, 'epoch': 0.08} + 8%|▊ | 728/8750 [1:11:37<12:47:58, 5.74s/it] 8%|▊ | 729/8750 [1:11:50<12:46:25, 5.73s/it] 8%|▊ | 729/8750 [1:11:43<12:46:25, 5.73s/it] {'loss': 0.4955, 'learning_rate': 1.985159243677284e-05, 'epoch': 0.08} + 8%|▊ | 729/8750 [1:11:50<12:46:25, 5.73s/it] {'loss': 0.4955, 'learning_rate': 1.985159243677284e-05, 'epoch': 0.08} + 8%|▊ | 729/8750 [1:11:43<12:46:25, 5.73s/it] 8%|▊ | 730/8750 [1:11:55<12:42:24, 5.70s/it] 8%|▊ | 730/8750 [1:11:49<12:42:24, 5.70s/it] {'loss': 0.4996, 'learning_rate': 1.985095639965894e-05, 'epoch': 0.08} + 8%|▊ | 730/8750 [1:11:55<12:42:24, 5.70s/it] {'loss': 0.4996, 'learning_rate': 1.985095639965894e-05, 'epoch': 0.08} + 8%|▊ | 730/8750 [1:11:49<12:42:24, 5.70s/it] 8%|▊ | 731/8750 [1:12:01<12:51:02, 5.77s/it] 8%|▊ | 731/8750 [1:11:55<12:51:01, 
5.77s/it] {'loss': 0.512, 'learning_rate': 1.985031901274408e-05, 'epoch': 0.08} + 8%|▊ | 731/8750 [1:12:01<12:51:02, 5.77s/it] {'loss': 0.512, 'learning_rate': 1.985031901274408e-05, 'epoch': 0.08} + 8%|▊ | 731/8750 [1:11:55<12:51:01, 5.77s/it] 8%|▊ | 732/8750 [1:12:07<12:43:24, 5.71s/it] 8%|▊ | 732/8750 [1:12:00<12:43:25, 5.71s/it] {'loss': 0.492, 'learning_rate': 1.9849680276115593e-05, 'epoch': 0.08} + 8%|▊ | 732/8750 [1:12:07<12:43:24, 5.71s/it] {'loss': 0.492, 'learning_rate': 1.9849680276115593e-05, 'epoch': 0.08} + 8%|▊ | 732/8750 [1:12:00<12:43:25, 5.71s/it] 8%|▊ | 733/8750 [1:12:12<12:41:23, 5.70s/it] 8%|▊ | 733/8750 [1:12:06<12:41:23, 5.70s/it] {'loss': 0.4928, 'learning_rate': 1.9849040189861004e-05, 'epoch': 0.08} + 8%|▊ | 733/8750 [1:12:12<12:41:23, 5.70s/it] {'loss': 0.4928, 'learning_rate': 1.9849040189861004e-05, 'epoch': 0.08} + 8%|▊ | 733/8750 [1:12:06<12:41:23, 5.70s/it] 8%|▊ | 734/8750 [1:12:18<12:44:59, 5.73s/it] 8%|▊ | 734/8750 [1:12:12<12:44:59, 5.73s/it] {'loss': 0.5268, 'learning_rate': 1.9848398754068018e-05, 'epoch': 0.08} + 8%|▊ | 734/8750 [1:12:18<12:44:59, 5.73s/it] {'loss': 0.5268, 'learning_rate': 1.9848398754068018e-05, 'epoch': 0.08} + 8%|▊ | 734/8750 [1:12:12<12:44:59, 5.73s/it] 8%|▊ | 735/8750 [1:12:24<12:48:26, 5.75s/it] 8%|▊ | 735/8750 [1:12:17<12:48:24, 5.75s/it] {'loss': 0.4822, 'learning_rate': 1.984775596882452e-05, 'epoch': 0.08} + 8%|▊ | 735/8750 [1:12:24<12:48:26, 5.75s/it] {'loss': 0.4822, 'learning_rate': 1.984775596882452e-05, 'epoch': 0.08} + 8%|▊ | 735/8750 [1:12:17<12:48:24, 5.75s/it] 8%|▊ | 736/8750 [1:12:23<12:47:56, 5.75s/it] 8%|▊ | 736/8750 [1:12:30<12:47:56, 5.75s/it] {'loss': 0.487, 'learning_rate': 1.98471118342186e-05, 'epoch': 0.08} + 8%|▊ | 736/8750 [1:12:23<12:47:56, 5.75s/it]{'loss': 0.487, 'learning_rate': 1.98471118342186e-05, 'epoch': 0.08} + 8%|▊ | 736/8750 [1:12:30<12:47:56, 5.75s/it] 8%|▊ | 737/8750 [1:12:35<12:40:30, 5.69s/it] 8%|▊ | 737/8750 [1:12:29<12:40:31, 5.69s/it] {'loss': 0.5087, 
'learning_rate': 1.9846466350338506e-05, 'epoch': 0.08} + 8%|▊ | 737/8750 [1:12:35<12:40:30, 5.69s/it] {'loss': 0.5087, 'learning_rate': 1.9846466350338506e-05, 'epoch': 0.08} + 8%|▊ | 737/8750 [1:12:29<12:40:31, 5.69s/it] 8%|▊ | 738/8750 [1:12:41<12:46:25, 5.74s/it] 8%|▊ | 738/8750 [1:12:35<12:46:24, 5.74s/it] {'loss': 0.4785, 'learning_rate': 1.9845819517272688e-05, 'epoch': 0.08} + 8%|▊ | 738/8750 [1:12:41<12:46:25, 5.74s/it]{'loss': 0.4785, 'learning_rate': 1.9845819517272688e-05, 'epoch': 0.08} + 8%|▊ | 738/8750 [1:12:35<12:46:24, 5.74s/it] 8%|▊ | 739/8750 [1:12:47<12:42:29, 5.71s/it] 8%|▊ | 739/8750 [1:12:40<12:42:31, 5.71s/it] {'loss': 0.523, 'learning_rate': 1.9845171335109776e-05, 'epoch': 0.08} + 8%|▊ | 739/8750 [1:12:47<12:42:29, 5.71s/it] {'loss': 0.523, 'learning_rate': 1.9845171335109776e-05, 'epoch': 0.08} + 8%|▊ | 739/8750 [1:12:40<12:42:31, 5.71s/it] 8%|▊ | 740/8750 [1:12:52<12:37:38, 5.68s/it] 8%|▊ | 740/8750 [1:12:46<12:37:37, 5.68s/it] {'loss': 0.4755, 'learning_rate': 1.9844521803938588e-05, 'epoch': 0.08} + 8%|▊ | 740/8750 [1:12:52<12:37:38, 5.68s/it] {'loss': 0.4755, 'learning_rate': 1.9844521803938588e-05, 'epoch': 0.08} + 8%|▊ | 740/8750 [1:12:46<12:37:37, 5.68s/it] 8%|▊ | 741/8750 [1:12:58<12:47:21, 5.75s/it] 8%|▊ | 741/8750 [1:12:52<12:47:21, 5.75s/it] {'loss': 0.4949, 'learning_rate': 1.9843870923848122e-05, 'epoch': 0.08} + 8%|▊ | 741/8750 [1:12:58<12:47:21, 5.75s/it] {'loss': 0.4949, 'learning_rate': 1.9843870923848122e-05, 'epoch': 0.08} + 8%|▊ | 741/8750 [1:12:52<12:47:21, 5.75s/it] 8%|▊ | 742/8750 [1:13:04<12:45:14, 5.73s/it] 8%|▊ | 742/8750 [1:12:57<12:45:14, 5.73s/it] {'loss': 0.5024, 'learning_rate': 1.984321869492756e-05, 'epoch': 0.08} + 8%|▊ | 742/8750 [1:13:04<12:45:14, 5.73s/it] {'loss': 0.5024, 'learning_rate': 1.984321869492756e-05, 'epoch': 0.08} + 8%|▊ | 742/8750 [1:12:57<12:45:14, 5.73s/it] 8%|▊ | 743/8750 [1:13:10<12:41:40, 5.71s/it] 8%|▊ | 743/8750 [1:13:03<12:41:40, 5.71s/it] {'loss': 0.502, 'learning_rate': 
1.984256511726628e-05, 'epoch': 0.08} + 8%|▊ | 743/8750 [1:13:10<12:41:40, 5.71s/it] {'loss': 0.502, 'learning_rate': 1.984256511726628e-05, 'epoch': 0.08} + 8%|▊ | 743/8750 [1:13:03<12:41:40, 5.71s/it] 9%|▊ | 744/8750 [1:13:16<12:48:39, 5.76s/it] 9%|▊ | 744/8750 [1:13:09<12:48:39, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.984191019095383e-05, 'epoch': 0.09} + 9%|▊ | 744/8750 [1:13:16<12:48:39, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.984191019095383e-05, 'epoch': 0.09} + 9%|▊ | 744/8750 [1:13:09<12:48:39, 5.76s/it] 9%|▊ | 745/8750 [1:13:21<12:45:20, 5.74s/it] 9%|▊ | 745/8750 [1:13:15<12:45:20, 5.74s/it] {'loss': 0.5067, 'learning_rate': 1.9841253916079953e-05, 'epoch': 0.09} + 9%|▊ | 745/8750 [1:13:21<12:45:20, 5.74s/it] {'loss': 0.5067, 'learning_rate': 1.9841253916079953e-05, 'epoch': 0.09} + 9%|▊ | 745/8750 [1:13:15<12:45:20, 5.74s/it] 9%|▊ | 746/8750 [1:13:27<12:38:55, 5.69s/it] 9%|▊ | 746/8750 [1:13:20<12:38:54, 5.69s/it] {'loss': 0.4877, 'learning_rate': 1.9840596292734573e-05, 'epoch': 0.09} + 9%|▊ | 746/8750 [1:13:27<12:38:55, 5.69s/it] {'loss': 0.4877, 'learning_rate': 1.9840596292734573e-05, 'epoch': 0.09} + 9%|▊ | 746/8750 [1:13:20<12:38:54, 5.69s/it] 9%|▊ | 747/8750 [1:13:32<12:37:18, 5.68s/it] 9%|▊ | 747/8750 [1:13:26<12:37:17, 5.68s/it] {'loss': 0.5142, 'learning_rate': 1.9839937321007795e-05, 'epoch': 0.09} + 9%|▊ | 747/8750 [1:13:32<12:37:18, 5.68s/it] {'loss': 0.5142, 'learning_rate': 1.9839937321007795e-05, 'epoch': 0.09} + 9%|▊ | 747/8750 [1:13:26<12:37:17, 5.68s/it] 9%|▊ | 748/8750 [1:13:38<12:36:39, 5.67s/it] 9%|▊ | 748/8750 [1:13:32<12:36:39, 5.67s/it] {'loss': 0.488, 'learning_rate': 1.983927700098992e-05, 'epoch': 0.09} + 9%|▊ | 748/8750 [1:13:38<12:36:39, 5.67s/it] {'loss': 0.488, 'learning_rate': 1.983927700098992e-05, 'epoch': 0.09} + 9%|▊ | 748/8750 [1:13:32<12:36:39, 5.67s/it] 9%|▊ | 749/8750 [1:13:44<12:45:16, 5.74s/it] 9%|▊ | 749/8750 [1:13:37<12:45:16, 5.74s/it] {'loss': 0.501, 'learning_rate': 1.983861533277142e-05, 
'epoch': 0.09} + 9%|▊ | 749/8750 [1:13:44<12:45:16, 5.74s/it] {'loss': 0.501, 'learning_rate': 1.983861533277142e-05, 'epoch': 0.09} + 9%|▊ | 749/8750 [1:13:37<12:45:16, 5.74s/it]813 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +73 AutoResumeHook: Checking whether to suspend... 2 +AutoResumeHook: Checking whether to suspend... 11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 9%|▊ | 750/8750 [1:13:50<12:42:01, 5.72s/it]12 AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 9%|▊ | 750/8750 [1:13:43<12:42:02, 5.72s/it]1 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4959, 'learning_rate': 1.983795231644296e-05, 'epoch': 0.09} + 9%|▊ | 750/8750 [1:13:50<12:42:01, 5.72s/it] {'loss': 0.4959, 'learning_rate': 1.983795231644296e-05, 'epoch': 0.09} + 9%|▊ | 750/8750 [1:13:43<12:42:02, 5.72s/it] 9%|▊ | 751/8750 [1:13:55<12:38:20, 5.69s/it] 9%|▊ | 751/8750 [1:13:49<12:38:20, 5.69s/it] {'loss': 0.5074, 'learning_rate': 1.983728795209539e-05, 'epoch': 0.09} + 9%|▊ | 751/8750 [1:13:55<12:38:20, 5.69s/it] {'loss': 0.5074, 'learning_rate': 1.983728795209539e-05, 'epoch': 0.09} + 9%|▊ | 751/8750 [1:13:49<12:38:20, 5.69s/it] 9%|▊ | 752/8750 [1:14:01<12:34:26, 5.66s/it] 9%|▊ | 752/8750 [1:13:54<12:34:25, 5.66s/it] {'loss': 0.4955, 'learning_rate': 1.9836622239819743e-05, 'epoch': 0.09} + 9%|▊ | 752/8750 [1:14:01<12:34:26, 5.66s/it] {'loss': 0.4955, 'learning_rate': 1.9836622239819743e-05, 'epoch': 0.09} + 9%|▊ | 752/8750 [1:13:54<12:34:25, 5.66s/it] 9%|▊ | 753/8750 [1:14:07<12:34:44, 5.66s/it] 9%|▊ | 753/8750 [1:14:00<12:34:44, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.983595517970723e-05, 'epoch': 0.09} + 9%|▊ | 753/8750 [1:14:07<12:34:44, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.983595517970723e-05, 'epoch': 0.09} + 9%|▊ | 753/8750 [1:14:00<12:34:44, 5.66s/it] 9%|▊ | 754/8750 [1:14:12<12:41:23, 5.71s/it] 9%|▊ | 754/8750 [1:14:06<12:41:23, 5.71s/it] {'loss': 0.4906, 'learning_rate': 1.9835286771849264e-05, 'epoch': 0.09} + 9%|▊ | 754/8750 [1:14:12<12:41:23, 5.71s/it] {'loss': 0.4906, 'learning_rate': 1.9835286771849264e-05, 'epoch': 0.09} + 9%|▊ | 754/8750 [1:14:06<12:41:23, 5.71s/it] 9%|▊ | 755/8750 [1:14:18<12:38:27, 5.69s/it] 9%|▊ | 755/8750 [1:14:12<12:38:28, 5.69s/it] {'loss': 0.5096, 'learning_rate': 1.9834617016337424e-05, 'epoch': 0.09} + 9%|▊ | 755/8750 [1:14:18<12:38:27, 5.69s/it] {'loss': 0.5096, 'learning_rate': 1.9834617016337424e-05, 'epoch': 0.09} + 9%|▊ | 755/8750 [1:14:12<12:38:28, 5.69s/it] 9%|▊ | 756/8750 [1:14:24<12:37:38, 5.69s/it] 9%|▊ | 756/8750 [1:14:17<12:37:38, 5.69s/it] {'loss': 0.513, 
'learning_rate': 1.9833945913263483e-05, 'epoch': 0.09} + 9%|▊ | 756/8750 [1:14:24<12:37:38, 5.69s/it] {'loss': 0.513, 'learning_rate': 1.9833945913263483e-05, 'epoch': 0.09} + 9%|▊ | 756/8750 [1:14:17<12:37:38, 5.69s/it] 9%|▊ | 757/8750 [1:14:29<12:37:13, 5.68s/it] 9%|▊ | 757/8750 [1:14:23<12:37:13, 5.68s/it] {'loss': 0.5042, 'learning_rate': 1.9833273462719396e-05, 'epoch': 0.09} + 9%|▊ | 757/8750 [1:14:29<12:37:13, 5.68s/it] {'loss': 0.5042, 'learning_rate': 1.9833273462719396e-05, 'epoch': 0.09} + 9%|▊ | 757/8750 [1:14:23<12:37:13, 5.68s/it] 9%|▊ | 758/8750 [1:14:35<12:34:26, 5.66s/it] 9%|▊ | 758/8750 [1:14:28<12:34:25, 5.66s/it] {'loss': 0.4622, 'learning_rate': 1.9832599664797306e-05, 'epoch': 0.09} + 9%|▊ | 758/8750 [1:14:35<12:34:26, 5.66s/it] {'loss': 0.4622, 'learning_rate': 1.9832599664797306e-05, 'epoch': 0.09} + 9%|▊ | 758/8750 [1:14:28<12:34:25, 5.66s/it] 9%|▊ | 759/8750 [1:14:41<12:34:38, 5.67s/it] 9%|▊ | 759/8750 [1:14:34<12:34:38, 5.67s/it] {'loss': 0.4877, 'learning_rate': 1.9831924519589537e-05, 'epoch': 0.09} + 9%|▊ | 759/8750 [1:14:41<12:34:38, 5.67s/it] {'loss': 0.4877, 'learning_rate': 1.9831924519589537e-05, 'epoch': 0.09} + 9%|▊ | 759/8750 [1:14:34<12:34:38, 5.67s/it] 9%|▊ | 760/8750 [1:14:46<12:32:32, 5.65s/it] 9%|▊ | 760/8750 [1:14:40<12:32:33, 5.65s/it] {'loss': 0.5123, 'learning_rate': 1.9831248027188604e-05, 'epoch': 0.09} + 9%|▊ | 760/8750 [1:14:46<12:32:32, 5.65s/it] {'loss': 0.5123, 'learning_rate': 1.9831248027188604e-05, 'epoch': 0.09} + 9%|▊ | 760/8750 [1:14:40<12:32:33, 5.65s/it] 9%|▊ | 761/8750 [1:14:52<12:43:28, 5.73s/it] 9%|▊ | 761/8750 [1:14:46<12:43:28, 5.73s/it] {'loss': 0.4812, 'learning_rate': 1.983057018768719e-05, 'epoch': 0.09} + 9%|▊ | 761/8750 [1:14:52<12:43:28, 5.73s/it] {'loss': 0.4812, 'learning_rate': 1.983057018768719e-05, 'epoch': 0.09} + 9%|▊ | 761/8750 [1:14:46<12:43:28, 5.73s/it] 9%|▊ | 762/8750 [1:14:58<12:43:41, 5.74s/it] 9%|▊ | 762/8750 [1:14:51<12:43:41, 5.74s/it] {'loss': 0.4839, 'learning_rate': 
1.982989100117819e-05, 'epoch': 0.09} + 9%|▊ | 762/8750 [1:14:58<12:43:41, 5.74s/it] {'loss': 0.4839, 'learning_rate': 1.982989100117819e-05, 'epoch': 0.09} + 9%|▊ | 762/8750 [1:14:51<12:43:41, 5.74s/it] 9%|▊ | 763/8750 [1:15:04<12:41:22, 5.72s/it] 9%|▊ | 763/8750 [1:14:57<12:41:21, 5.72s/it] {'loss': 0.5185, 'learning_rate': 1.9829210467754654e-05, 'epoch': 0.09} + 9%|▊ | 763/8750 [1:15:04<12:41:22, 5.72s/it] {'loss': 0.5185, 'learning_rate': 1.9829210467754654e-05, 'epoch': 0.09} + 9%|▊ | 763/8750 [1:14:57<12:41:21, 5.72s/it] 9%|▊ | 764/8750 [1:15:09<12:40:17, 5.71s/it] 9%|▊ | 764/8750 [1:15:03<12:42:42, 5.73s/it] {'loss': 0.4878, 'learning_rate': 1.9828528587509836e-05, 'epoch': 0.09} + 9%|▊ | 764/8750 [1:15:09<12:40:17, 5.71s/it] {'loss': 0.4878, 'learning_rate': 1.9828528587509836e-05, 'epoch': 0.09} + 9%|▊ | 764/8750 [1:15:03<12:42:42, 5.73s/it] 9%|▊ | 765/8750 [1:15:16<13:02:11, 5.88s/it] 9%|▊ | 765/8750 [1:15:09<13:01:28, 5.87s/it] {'loss': 0.4959, 'learning_rate': 1.982784536053717e-05, 'epoch': 0.09} + 9%|▊ | 765/8750 [1:15:16<13:02:11, 5.88s/it] {'loss': 0.4959, 'learning_rate': 1.982784536053717e-05, 'epoch': 0.09} + 9%|▊ | 765/8750 [1:15:09<13:01:28, 5.87s/it] 9%|▉ | 766/8750 [1:15:21<12:50:13, 5.79s/it] 9%|▉ | 766/8750 [1:15:15<12:49:43, 5.78s/it] {'loss': 0.5101, 'learning_rate': 1.9827160786930267e-05, 'epoch': 0.09} + 9%|▉ | 766/8750 [1:15:21<12:50:13, 5.79s/it] {'loss': 0.5101, 'learning_rate': 1.9827160786930267e-05, 'epoch': 0.09} + 9%|▉ | 766/8750 [1:15:15<12:49:43, 5.78s/it] 9%|▉ | 767/8750 [1:15:27<12:49:28, 5.78s/it] 9%|▉ | 767/8750 [1:15:20<12:49:07, 5.78s/it] {'loss': 0.4955, 'learning_rate': 1.9826474866782933e-05, 'epoch': 0.09} + 9%|▉ | 767/8750 [1:15:27<12:49:28, 5.78s/it] {'loss': 0.4955, 'learning_rate': 1.9826474866782933e-05, 'epoch': 0.09} + 9%|▉ | 767/8750 [1:15:20<12:49:07, 5.78s/it] 9%|▉ | 768/8750 [1:15:33<12:52:07, 5.80s/it] 9%|▉ | 768/8750 [1:15:26<12:51:52, 5.80s/it] {'loss': 0.5152, 'learning_rate': 1.9825787600189163e-05, 
'epoch': 0.09} + 9%|▉ | 768/8750 [1:15:33<12:52:07, 5.80s/it] {'loss': 0.5152, 'learning_rate': 1.9825787600189163e-05, 'epoch': 0.09} + 9%|▉ | 768/8750 [1:15:26<12:51:52, 5.80s/it] 9%|▉ | 769/8750 [1:15:38<12:43:49, 5.74s/it] 9%|▉ | 769/8750 [1:15:32<12:43:38, 5.74s/it] {'loss': 0.4909, 'learning_rate': 1.982509898724311e-05, 'epoch': 0.09} + 9%|▉ | 769/8750 [1:15:38<12:43:49, 5.74s/it] {'loss': 0.4909, 'learning_rate': 1.982509898724311e-05, 'epoch': 0.09} + 9%|▉ | 769/8750 [1:15:32<12:43:38, 5.74s/it] 9%|▉ | 770/8750 [1:15:44<12:56:06, 5.84s/it] 9%|▉ | 770/8750 [1:15:38<12:55:58, 5.83s/it] {'loss': 0.5013, 'learning_rate': 1.9824409028039143e-05, 'epoch': 0.09} + 9%|▉ | 770/8750 [1:15:44<12:56:06, 5.84s/it] {'loss': 0.5013, 'learning_rate': 1.9824409028039143e-05, 'epoch': 0.09} + 9%|▉ | 770/8750 [1:15:38<12:55:58, 5.83s/it] 9%|▉ | 771/8750 [1:15:50<12:45:39, 5.76s/it] 9%|▉ | 771/8750 [1:15:43<12:45:34, 5.76s/it] {'loss': 0.4928, 'learning_rate': 1.9823717722671798e-05, 'epoch': 0.09} + 9%|▉ | 771/8750 [1:15:50<12:45:39, 5.76s/it] {'loss': 0.4928, 'learning_rate': 1.9823717722671798e-05, 'epoch': 0.09} + 9%|▉ | 771/8750 [1:15:43<12:45:34, 5.76s/it] 9%|▉ | 772/8750 [1:15:56<12:42:13, 5.73s/it] 9%|▉ | 772/8750 [1:15:49<12:42:10, 5.73s/it] {'loss': 0.4885, 'learning_rate': 1.98230250712358e-05, 'epoch': 0.09} + 9%|▉ | 772/8750 [1:15:56<12:42:13, 5.73s/it] {'loss': 0.4885, 'learning_rate': 1.98230250712358e-05, 'epoch': 0.09} + 9%|▉ | 772/8750 [1:15:49<12:42:10, 5.73s/it] 9%|▉ | 773/8750 [1:16:02<12:48:18, 5.78s/it] 9%|▉ | 773/8750 [1:15:55<12:48:15, 5.78s/it] {'loss': 0.5048, 'learning_rate': 1.9822331073826056e-05, 'epoch': 0.09} + 9%|▉ | 773/8750 [1:16:02<12:48:18, 5.78s/it] {'loss': 0.5048, 'learning_rate': 1.9822331073826056e-05, 'epoch': 0.09} + 9%|▉ | 773/8750 [1:15:55<12:48:15, 5.78s/it] 9%|▉ | 774/8750 [1:16:07<12:41:41, 5.73s/it] 9%|▉ | 774/8750 [1:16:01<12:41:39, 5.73s/it] {'loss': 0.4921, 'learning_rate': 1.982163573053766e-05, 'epoch': 0.09} + 9%|▉ | 
774/8750 [1:16:07<12:41:41, 5.73s/it] {'loss': 0.4921, 'learning_rate': 1.982163573053766e-05, 'epoch': 0.09} + 9%|▉ | 774/8750 [1:16:01<12:41:39, 5.73s/it] 9%|▉ | 775/8750 [1:16:13<12:52:17, 5.81s/it] 9%|▉ | 775/8750 [1:16:07<12:52:16, 5.81s/it] {'loss': 0.4884, 'learning_rate': 1.9820939041465887e-05, 'epoch': 0.09} + 9%|▉ | 775/8750 [1:16:13<12:52:17, 5.81s/it] {'loss': 0.4884, 'learning_rate': 1.9820939041465887e-05, 'epoch': 0.09} + 9%|▉ | 775/8750 [1:16:07<12:52:16, 5.81s/it] 9%|▉ | 776/8750 [1:16:19<12:53:11, 5.82s/it] 9%|▉ | 776/8750 [1:16:13<12:53:11, 5.82s/it] {'loss': 0.4746, 'learning_rate': 1.9820241006706203e-05, 'epoch': 0.09} + 9%|▉ | 776/8750 [1:16:19<12:53:11, 5.82s/it] {'loss': 0.4746, 'learning_rate': 1.9820241006706203e-05, 'epoch': 0.09} + 9%|▉ | 776/8750 [1:16:13<12:53:11, 5.82s/it] 9%|▉ | 777/8750 [1:16:25<12:54:47, 5.83s/it] 9%|▉ | 777/8750 [1:16:18<12:54:46, 5.83s/it] {'loss': 0.4968, 'learning_rate': 1.9819541626354252e-05, 'epoch': 0.09} + 9%|▉ | 777/8750 [1:16:25<12:54:47, 5.83s/it] {'loss': 0.4968, 'learning_rate': 1.9819541626354252e-05, 'epoch': 0.09} + 9%|▉ | 777/8750 [1:16:18<12:54:46, 5.83s/it] 9%|▉ | 778/8750 [1:16:31<12:52:05, 5.81s/it] 9%|▉ | 778/8750 [1:16:24<12:52:05, 5.81s/it] {'loss': 0.4767, 'learning_rate': 1.9818840900505866e-05, 'epoch': 0.09} + 9%|▉ | 778/8750 [1:16:31<12:52:05, 5.81s/it] {'loss': 0.4767, 'learning_rate': 1.9818840900505866e-05, 'epoch': 0.09} + 9%|▉ | 778/8750 [1:16:24<12:52:05, 5.81s/it] 9%|▉ | 779/8750 [1:16:36<12:46:38, 5.77s/it] 9%|▉ | 779/8750 [1:16:30<12:46:38, 5.77s/it] {'loss': 0.5091, 'learning_rate': 1.9818138829257063e-05, 'epoch': 0.09} + 9%|▉ | 779/8750 [1:16:36<12:46:38, 5.77s/it] {'loss': 0.5091, 'learning_rate': 1.9818138829257063e-05, 'epoch': 0.09} + 9%|▉ | 779/8750 [1:16:30<12:46:38, 5.77s/it] 9%|▉ | 780/8750 [1:16:42<12:43:47, 5.75s/it] 9%|▉ | 780/8750 [1:16:36<12:43:47, 5.75s/it] {'loss': 0.5013, 'learning_rate': 1.9817435412704037e-05, 'epoch': 0.09} + 9%|▉ | 780/8750 
[1:16:42<12:43:47, 5.75s/it] {'loss': 0.5013, 'learning_rate': 1.9817435412704037e-05, 'epoch': 0.09} + 9%|▉ | 780/8750 [1:16:36<12:43:47, 5.75s/it] 9%|▉ | 781/8750 [1:16:48<12:38:47, 5.71s/it] 9%|▉ | 781/8750 [1:16:41<12:38:48, 5.71s/it] {'loss': 0.525, 'learning_rate': 1.981673065094317e-05, 'epoch': 0.09} + 9%|▉ | 781/8750 [1:16:48<12:38:47, 5.71s/it] {'loss': 0.525, 'learning_rate': 1.981673065094317e-05, 'epoch': 0.09} + 9%|▉ | 781/8750 [1:16:41<12:38:48, 5.71s/it] 9%|▉ | 782/8750 [1:16:53<12:37:42, 5.71s/it] 9%|▉ | 782/8750 [1:16:47<12:37:51, 5.71s/it] {'loss': 0.4728, 'learning_rate': 1.9816024544071038e-05, 'epoch': 0.09} + 9%|▉ | 782/8750 [1:16:53<12:37:42, 5.71s/it] {'loss': 0.4728, 'learning_rate': 1.9816024544071038e-05, 'epoch': 0.09} + 9%|▉ | 782/8750 [1:16:47<12:37:51, 5.71s/it] 9%|▉ | 783/8750 [1:16:59<12:33:58, 5.68s/it] 9%|▉ | 783/8750 [1:16:52<12:33:55, 5.68s/it] {'loss': 0.505, 'learning_rate': 1.9815317092184388e-05, 'epoch': 0.09} + 9%|▉ | 783/8750 [1:16:59<12:33:58, 5.68s/it] {'loss': 0.505, 'learning_rate': 1.9815317092184388e-05, 'epoch': 0.09} + 9%|▉ | 783/8750 [1:16:52<12:33:55, 5.68s/it] 9%|▉ | 784/8750 [1:17:05<12:37:37, 5.71s/it] 9%|▉ | 784/8750 [1:16:58<12:37:35, 5.71s/it] {'loss': 0.4836, 'learning_rate': 1.9814608295380155e-05, 'epoch': 0.09} + 9%|▉ | 784/8750 [1:17:05<12:37:37, 5.71s/it] {'loss': 0.4836, 'learning_rate': 1.9814608295380155e-05, 'epoch': 0.09} + 9%|▉ | 784/8750 [1:16:58<12:37:35, 5.71s/it] 9%|▉ | 785/8750 [1:17:10<12:36:40, 5.70s/it] 9%|▉ | 785/8750 [1:17:04<12:36:38, 5.70s/it] {'loss': 0.4904, 'learning_rate': 1.9813898153755465e-05, 'epoch': 0.09} + 9%|▉ | 785/8750 [1:17:10<12:36:40, 5.70s/it] {'loss': 0.4904, 'learning_rate': 1.9813898153755465e-05, 'epoch': 0.09} + 9%|▉ | 785/8750 [1:17:04<12:36:38, 5.70s/it] 9%|▉ | 786/8750 [1:17:10<12:34:03, 5.68s/it] {'loss': 0.4916, 'learning_rate': 1.9813186667407624e-05, 'epoch': 0.09} + 9%|▉ | 786/8750 [1:17:10<12:34:03, 5.68s/it]{'loss': 0.4916, 'learning_rate': 
1.9813186667407624e-05, 'epoch': 0.09} + 9%|▉ | 786/8750 [1:17:16<12:34:04, 5.68s/it] 9%|▉ | 786/8750 [1:17:16<12:34:04, 5.68s/it] 9%|▉ | 787/8750 [1:17:22<12:33:12, 5.68s/it] 9%|▉ | 787/8750 [1:17:15<12:33:11, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.9812473836434115e-05, 'epoch': 0.09} + 9%|▉ | 787/8750 [1:17:22<12:33:12, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.9812473836434115e-05, 'epoch': 0.09} + 9%|▉ | 787/8750 [1:17:15<12:33:11, 5.68s/it] 9%|▉ | 788/8750 [1:17:28<12:41:28, 5.74s/it] 9%|▉ | 788/8750 [1:17:21<12:41:28, 5.74s/it] {'loss': 0.5016, 'learning_rate': 1.981175966093262e-05, 'epoch': 0.09} + 9%|▉ | 788/8750 [1:17:28<12:41:28, 5.74s/it] {'loss': 0.5016, 'learning_rate': 1.981175966093262e-05, 'epoch': 0.09} + 9%|▉ | 788/8750 [1:17:21<12:41:28, 5.74s/it] 9%|▉ | 789/8750 [1:17:33<12:36:13, 5.70s/it] 9%|▉ | 789/8750 [1:17:27<12:36:12, 5.70s/it] {'loss': 0.507, 'learning_rate': 1.9811044141000985e-05, 'epoch': 0.09} + 9%|▉ | 789/8750 [1:17:33<12:36:13, 5.70s/it] {'loss': 0.507, 'learning_rate': 1.9811044141000985e-05, 'epoch': 0.09} + 9%|▉ | 789/8750 [1:17:27<12:36:12, 5.70s/it] 9%|▉ | 790/8750 [1:17:39<12:47:01, 5.78s/it] 9%|▉ | 790/8750 [1:17:33<12:47:00, 5.78s/it] {'loss': 0.4745, 'learning_rate': 1.9810327276737268e-05, 'epoch': 0.09} + 9%|▉ | 790/8750 [1:17:39<12:47:01, 5.78s/it] {'loss': 0.4745, 'learning_rate': 1.9810327276737268e-05, 'epoch': 0.09} + 9%|▉ | 790/8750 [1:17:33<12:47:00, 5.78s/it] 9%|▉ | 791/8750 [1:17:45<12:47:12, 5.78s/it] 9%|▉ | 791/8750 [1:17:38<12:47:12, 5.78s/it] {'loss': 0.5002, 'learning_rate': 1.980960906823968e-05, 'epoch': 0.09} + 9%|▉ | 791/8750 [1:17:45<12:47:12, 5.78s/it] {'loss': 0.5002, 'learning_rate': 1.980960906823968e-05, 'epoch': 0.09} + 9%|▉ | 791/8750 [1:17:38<12:47:12, 5.78s/it] 9%|▉ | 792/8750 [1:17:51<12:40:48, 5.74s/it] 9%|▉ | 792/8750 [1:17:44<12:40:46, 5.74s/it] {'loss': 0.4881, 'learning_rate': 1.9808889515606644e-05, 'epoch': 0.09} + 9%|▉ | 792/8750 [1:17:51<12:40:48, 5.74s/it] {'loss': 
0.4881, 'learning_rate': 1.9808889515606644e-05, 'epoch': 0.09} + 9%|▉ | 792/8750 [1:17:44<12:40:46, 5.74s/it] 9%|▉ | 793/8750 [1:17:56<12:47:36, 5.79s/it] 9%|▉ | 793/8750 [1:17:50<12:47:36, 5.79s/it] {'loss': 0.4806, 'learning_rate': 1.9808168618936746e-05, 'epoch': 0.09} + 9%|▉ | 793/8750 [1:17:56<12:47:36, 5.79s/it] {'loss': 0.4806, 'learning_rate': 1.9808168618936746e-05, 'epoch': 0.09} + 9%|▉ | 793/8750 [1:17:50<12:47:36, 5.79s/it] 9%|▉ | 794/8750 [1:18:02<12:35:18, 5.70s/it] 9%|▉ | 794/8750 [1:17:55<12:35:18, 5.70s/it] {'loss': 0.4801, 'learning_rate': 1.980744637832877e-05, 'epoch': 0.09} + 9%|▉ | 794/8750 [1:18:02<12:35:18, 5.70s/it] {'loss': 0.4801, 'learning_rate': 1.980744637832877e-05, 'epoch': 0.09} + 9%|▉ | 794/8750 [1:17:55<12:35:18, 5.70s/it] 9%|▉ | 795/8750 [1:18:08<12:31:53, 5.67s/it] 9%|▉ | 795/8750 [1:18:01<12:31:53, 5.67s/it] {'loss': 0.4944, 'learning_rate': 1.9806722793881675e-05, 'epoch': 0.09} + 9%|▉ | 795/8750 [1:18:08<12:31:53, 5.67s/it] {'loss': 0.4944, 'learning_rate': 1.9806722793881675e-05, 'epoch': 0.09} + 9%|▉ | 795/8750 [1:18:01<12:31:53, 5.67s/it] 9%|▉ | 796/8750 [1:18:13<12:32:51, 5.68s/it] 9%|▉ | 796/8750 [1:18:07<12:32:51, 5.68s/it] {'loss': 0.499, 'learning_rate': 1.9805997865694616e-05, 'epoch': 0.09} + 9%|▉ | 796/8750 [1:18:13<12:32:51, 5.68s/it] {'loss': 0.499, 'learning_rate': 1.9805997865694616e-05, 'epoch': 0.09} + 9%|▉ | 796/8750 [1:18:07<12:32:51, 5.68s/it] 9%|▉ | 797/8750 [1:18:19<12:28:02, 5.64s/it] 9%|▉ | 797/8750 [1:18:12<12:28:02, 5.64s/it] {'loss': 0.5009, 'learning_rate': 1.9805271593866914e-05, 'epoch': 0.09} + 9%|▉ | 797/8750 [1:18:19<12:28:02, 5.64s/it] {'loss': 0.5009, 'learning_rate': 1.9805271593866914e-05, 'epoch': 0.09} + 9%|▉ | 797/8750 [1:18:12<12:28:02, 5.64s/it] 9%|▉ | 798/8750 [1:18:25<12:40:59, 5.74s/it] 9%|▉ | 798/8750 [1:18:18<12:40:59, 5.74s/it] {'loss': 0.4867, 'learning_rate': 1.9804543978498093e-05, 'epoch': 0.09} + 9%|▉ | 798/8750 [1:18:25<12:40:59, 5.74s/it] {'loss': 0.4867, 
'learning_rate': 1.9804543978498093e-05, 'epoch': 0.09} + 9%|▉ | 798/8750 [1:18:18<12:40:59, 5.74s/it] 9%|▉ | 799/8750 [1:18:30<12:35:58, 5.70s/it] 9%|▉ | 799/8750 [1:18:24<12:35:57, 5.70s/it] {'loss': 0.5012, 'learning_rate': 1.9803815019687844e-05, 'epoch': 0.09} + 9%|▉ | 799/8750 [1:18:30<12:35:58, 5.70s/it] {'loss': 0.5012, 'learning_rate': 1.9803815019687844e-05, 'epoch': 0.09} + 9%|▉ | 799/8750 [1:18:24<12:35:57, 5.70s/it]12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +815 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 9%|▉ | 800/8750 [1:18:36<12:35:32, 5.70s/it]3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 
+ 9%|▉ | 800/8750 [1:18:30<12:35:33, 5.70s/it] {'loss': 0.5014, 'learning_rate': 1.980308471753606e-05, 'epoch': 0.09} + 9%|▉ | 800/8750 [1:18:36<12:35:32, 5.70s/it] {'loss': 0.5014, 'learning_rate': 1.980308471753606e-05, 'epoch': 0.09} + 9%|▉ | 800/8750 [1:18:30<12:35:33, 5.70s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 9%|▉ | 801/8750 [1:18:55<21:28:13, 9.72s/it] 9%|▉ | 801/8750 [1:18:49<21:28:13, 9.72s/it] {'loss': 0.5037, 'learning_rate': 1.9802353072142802e-05, 'epoch': 0.09} + 9%|▉ | 801/8750 [1:18:55<21:28:13, 9.72s/it] {'loss': 0.5037, 'learning_rate': 1.9802353072142802e-05, 'epoch': 0.09} + 9%|▉ | 801/8750 [1:18:49<21:28:13, 9.72s/it] 9%|▉ | 802/8750 [1:19:01<18:53:53, 8.56s/it] 9%|▉ | 802/8750 [1:18:55<18:53:52, 8.56s/it] {'loss': 0.4848, 'learning_rate': 1.9801620083608327e-05, 'epoch': 0.09} + 9%|▉ | 802/8750 [1:19:01<18:53:53, 8.56s/it] {'loss': 0.4848, 'learning_rate': 1.9801620083608327e-05, 'epoch': 0.09} + 9%|▉ | 802/8750 [1:18:55<18:53:52, 8.56s/it] 9%|▉ | 803/8750 [1:19:07<16:59:40, 7.70s/it] 9%|▉ | 803/8750 [1:19:00<16:59:40, 7.70s/it] {'loss': 0.4847, 'learning_rate': 1.9800885752033067e-05, 'epoch': 0.09} + 9%|▉ | 803/8750 [1:19:07<16:59:40, 7.70s/it] {'loss': 0.4847, 'learning_rate': 1.9800885752033067e-05, 'epoch': 0.09} + 9%|▉ | 803/8750 [1:19:00<16:59:40, 7.70s/it] 9%|▉ | 804/8750 [1:19:13<15:43:15, 7.12s/it] 9%|▉ | 804/8750 [1:19:06<15:43:17, 7.12s/it] {'loss': 0.4958, 'learning_rate': 1.980015007751764e-05, 'epoch': 0.09} + 9%|▉ | 804/8750 [1:19:13<15:43:15, 7.12s/it] {'loss': 0.4958, 'learning_rate': 1.980015007751764e-05, 'epoch': 0.09} + 9%|▉ | 804/8750 [1:19:06<15:43:17, 7.12s/it] 9%|▉ | 805/8750 [1:19:18<14:47:03, 6.70s/it] 9%|▉ | 805/8750 [1:19:12<14:47:02, 6.70s/it] {'loss': 0.4871, 'learning_rate': 1.9799413060162854e-05, 'epoch': 0.09} + 9%|▉ | 805/8750 [1:19:18<14:47:03, 6.70s/it] {'loss': 0.4871, 'learning_rate': 1.9799413060162854e-05, 'epoch': 0.09} + 9%|▉ | 805/8750 [1:19:12<14:47:02, 6.70s/it] 9%|▉ | 806/8750 [1:19:24<14:04:55, 6.38s/it] 9%|▉ | 806/8750 [1:19:17<14:04:54, 6.38s/it] {'loss': 0.4962, 'learning_rate': 1.9798674700069698e-05, 'epoch': 0.09} + 9%|▉ | 806/8750 [1:19:24<14:04:55, 6.38s/it] {'loss': 0.4962, 'learning_rate': 1.9798674700069698e-05, 'epoch': 0.09} + 9%|▉ | 806/8750 [1:19:17<14:04:54, 6.38s/it] 
9%|▉ | 807/8750 [1:19:30<13:46:10, 6.24s/it] 9%|▉ | 807/8750 [1:19:23<13:46:10, 6.24s/it] {'loss': 0.5009, 'learning_rate': 1.979793499733934e-05, 'epoch': 0.09} + 9%|▉ | 807/8750 [1:19:30<13:46:10, 6.24s/it] {'loss': 0.5009, 'learning_rate': 1.979793499733934e-05, 'epoch': 0.09} + 9%|▉ | 807/8750 [1:19:23<13:46:10, 6.24s/it] 9%|▉ | 808/8750 [1:19:35<13:21:26, 6.05s/it] 9%|▉ | 808/8750 [1:19:29<13:21:26, 6.05s/it] {'loss': 0.4764, 'learning_rate': 1.9797193952073135e-05, 'epoch': 0.09} + 9%|▉ | 808/8750 [1:19:35<13:21:26, 6.05s/it] {'loss': 0.4764, 'learning_rate': 1.9797193952073135e-05, 'epoch': 0.09} + 9%|▉ | 808/8750 [1:19:29<13:21:26, 6.05s/it] 9%|▉ | 809/8750 [1:19:41<13:03:25, 5.92s/it] 9%|▉ | 809/8750 [1:19:35<13:03:25, 5.92s/it] {'loss': 0.5013, 'learning_rate': 1.9796451564372624e-05, 'epoch': 0.09} + 9%|▉ | 809/8750 [1:19:41<13:03:25, 5.92s/it] {'loss': 0.5013, 'learning_rate': 1.9796451564372624e-05, 'epoch': 0.09} + 9%|▉ | 809/8750 [1:19:35<13:03:25, 5.92s/it] 9%|▉ | 810/8750 [1:19:47<13:02:14, 5.91s/it] 9%|▉ | 810/8750 [1:19:40<13:02:14, 5.91s/it] {'loss': 0.5022, 'learning_rate': 1.979570783433954e-05, 'epoch': 0.09} + 9%|▉ | 810/8750 [1:19:47<13:02:14, 5.91s/it] {'loss': 0.5022, 'learning_rate': 1.979570783433954e-05, 'epoch': 0.09} + 9%|▉ | 810/8750 [1:19:40<13:02:14, 5.91s/it] 9%|▉ | 811/8750 [1:19:53<12:59:42, 5.89s/it] 9%|▉ | 811/8750 [1:19:46<12:59:42, 5.89s/it] {'loss': 0.4912, 'learning_rate': 1.9794962762075772e-05, 'epoch': 0.09} + 9%|▉ | 811/8750 [1:19:53<12:59:42, 5.89s/it] {'loss': 0.4912, 'learning_rate': 1.9794962762075772e-05, 'epoch': 0.09} + 9%|▉ | 811/8750 [1:19:46<12:59:42, 5.89s/it] 9%|▉ | 812/8750 [1:19:58<12:48:44, 5.81s/it] 9%|▉ | 812/8750 [1:19:52<12:48:45, 5.81s/it] {'loss': 0.5052, 'learning_rate': 1.9794216347683425e-05, 'epoch': 0.09} + 9%|▉ | 812/8750 [1:19:58<12:48:44, 5.81s/it] {'loss': 0.5052, 'learning_rate': 1.9794216347683425e-05, 'epoch': 0.09} + 9%|▉ | 812/8750 [1:19:52<12:48:45, 5.81s/it] 9%|▉ | 813/8750 
[1:20:04<12:44:41, 5.78s/it] 9%|▉ | 813/8750 [1:19:58<12:44:39, 5.78s/it] {'loss': 0.4961, 'learning_rate': 1.979346859126477e-05, 'epoch': 0.09} + 9%|▉ | 813/8750 [1:20:04<12:44:41, 5.78s/it] {'loss': 0.4961, 'learning_rate': 1.979346859126477e-05, 'epoch': 0.09} + 9%|▉ | 813/8750 [1:19:58<12:44:39, 5.78s/it] 9%|▉ | 814/8750 [1:20:10<12:41:30, 5.76s/it] 9%|▉ | 814/8750 [1:20:03<12:41:29, 5.76s/it] {'loss': 0.5019, 'learning_rate': 1.979271949292227e-05, 'epoch': 0.09} + 9%|▉ | 814/8750 [1:20:10<12:41:30, 5.76s/it] {'loss': 0.5019, 'learning_rate': 1.979271949292227e-05, 'epoch': 0.09} + 9%|▉ | 814/8750 [1:20:03<12:41:29, 5.76s/it] 9%|▉ | 815/8750 [1:20:15<12:35:46, 5.71s/it] 9%|▉ | 815/8750 [1:20:09<12:35:46, 5.71s/it] {'loss': 0.497, 'learning_rate': 1.9791969052758563e-05, 'epoch': 0.09} + 9%|▉ | 815/8750 [1:20:15<12:35:46, 5.71s/it] {'loss': 0.497, 'learning_rate': 1.9791969052758563e-05, 'epoch': 0.09} + 9%|▉ | 815/8750 [1:20:09<12:35:46, 5.71s/it] 9%|▉ | 816/8750 [1:20:21<12:35:48, 5.72s/it] 9%|▉ | 816/8750 [1:20:15<12:35:48, 5.72s/it] {'loss': 0.4944, 'learning_rate': 1.979121727087648e-05, 'epoch': 0.09} + 9%|▉ | 816/8750 [1:20:21<12:35:48, 5.72s/it] {'loss': 0.4944, 'learning_rate': 1.979121727087648e-05, 'epoch': 0.09} + 9%|▉ | 816/8750 [1:20:15<12:35:48, 5.72s/it] 9%|▉ | 817/8750 [1:20:27<12:31:29, 5.68s/it] 9%|▉ | 817/8750 [1:20:20<12:31:29, 5.68s/it] {'loss': 0.4989, 'learning_rate': 1.979046414737903e-05, 'epoch': 0.09} + 9%|▉ | 817/8750 [1:20:27<12:31:29, 5.68s/it] {'loss': 0.4989, 'learning_rate': 1.979046414737903e-05, 'epoch': 0.09} + 9%|▉ | 817/8750 [1:20:20<12:31:29, 5.68s/it] 9%|▉ | 818/8750 [1:20:32<12:29:54, 5.67s/it] 9%|▉ | 818/8750 [1:20:26<12:29:54, 5.67s/it] {'loss': 0.4955, 'learning_rate': 1.978970968236941e-05, 'epoch': 0.09} + 9%|▉ | 818/8750 [1:20:32<12:29:54, 5.67s/it] {'loss': 0.4955, 'learning_rate': 1.978970968236941e-05, 'epoch': 0.09} + 9%|▉ | 818/8750 [1:20:26<12:29:54, 5.67s/it] 9%|▉ | 819/8750 [1:20:38<12:25:35, 5.64s/it] 
9%|▉ | 819/8750 [1:20:31<12:25:35, 5.64s/it] {'loss': 0.4843, 'learning_rate': 1.9788953875950992e-05, 'epoch': 0.09} + 9%|▉ | 819/8750 [1:20:38<12:25:35, 5.64s/it] {'loss': 0.4843, 'learning_rate': 1.9788953875950992e-05, 'epoch': 0.09} + 9%|▉ | 819/8750 [1:20:31<12:25:35, 5.64s/it] 9%|▉ | 820/8750 [1:20:44<12:34:36, 5.71s/it] 9%|▉ | 820/8750 [1:20:37<12:34:35, 5.71s/it] {'loss': 0.4918, 'learning_rate': 1.9788196728227348e-05, 'epoch': 0.09} + 9%|▉ | 820/8750 [1:20:44<12:34:36, 5.71s/it] {'loss': 0.4918, 'learning_rate': 1.9788196728227348e-05, 'epoch': 0.09} + 9%|▉ | 820/8750 [1:20:37<12:34:35, 5.71s/it] 9%|▉ | 821/8750 [1:20:49<12:27:28, 5.66s/it] 9%|▉ | 821/8750 [1:20:43<12:27:28, 5.66s/it] {'loss': 0.4815, 'learning_rate': 1.9787438239302217e-05, 'epoch': 0.09} + 9%|▉ | 821/8750 [1:20:49<12:27:28, 5.66s/it] {'loss': 0.4815, 'learning_rate': 1.9787438239302217e-05, 'epoch': 0.09} + 9%|▉ | 821/8750 [1:20:43<12:27:28, 5.66s/it] 9%|▉ | 822/8750 [1:20:55<12:36:08, 5.72s/it] 9%|▉ | 822/8750 [1:20:49<12:36:08, 5.72s/it] {'loss': 0.4935, 'learning_rate': 1.9786678409279535e-05, 'epoch': 0.09} + 9%|▉ | 822/8750 [1:20:55<12:36:08, 5.72s/it] {'loss': 0.4935, 'learning_rate': 1.9786678409279535e-05, 'epoch': 0.09} + 9%|▉ | 822/8750 [1:20:49<12:36:08, 5.72s/it] 9%|▉ | 823/8750 [1:21:01<12:33:06, 5.70s/it] 9%|▉ | 823/8750 [1:20:54<12:33:07, 5.70s/it] {'loss': 0.4966, 'learning_rate': 1.9785917238263405e-05, 'epoch': 0.09} + 9%|▉ | 823/8750 [1:21:01<12:33:06, 5.70s/it] {'loss': 0.4966, 'learning_rate': 1.9785917238263405e-05, 'epoch': 0.09} + 9%|▉ | 823/8750 [1:20:54<12:33:07, 5.70s/it] 9%|▉ | 824/8750 [1:21:07<12:32:40, 5.70s/it] 9%|▉ | 824/8750 [1:21:00<12:32:41, 5.70s/it] {'loss': 0.514, 'learning_rate': 1.9785154726358134e-05, 'epoch': 0.09} + 9%|▉ | 824/8750 [1:21:07<12:32:40, 5.70s/it] {'loss': 0.514, 'learning_rate': 1.9785154726358134e-05, 'epoch': 0.09} + 9%|▉ | 824/8750 [1:21:00<12:32:41, 5.70s/it] 9%|▉ | 825/8750 [1:21:12<12:26:16, 5.65s/it] 9%|▉ | 825/8750 
[1:21:06<12:26:16, 5.65s/it] {'loss': 0.517, 'learning_rate': 1.9784390873668206e-05, 'epoch': 0.09} + 9%|▉ | 825/8750 [1:21:12<12:26:16, 5.65s/it] {'loss': 0.517, 'learning_rate': 1.9784390873668206e-05, 'epoch': 0.09} + 9%|▉ | 825/8750 [1:21:06<12:26:16, 5.65s/it] 9%|▉ | 826/8750 [1:21:18<12:23:41, 5.63s/it] 9%|▉ | 826/8750 [1:21:11<12:23:42, 5.63s/it] {'loss': 0.4887, 'learning_rate': 1.9783625680298276e-05, 'epoch': 0.09} + 9%|▉ | 826/8750 [1:21:18<12:23:41, 5.63s/it] {'loss': 0.4887, 'learning_rate': 1.9783625680298276e-05, 'epoch': 0.09} + 9%|▉ | 826/8750 [1:21:11<12:23:42, 5.63s/it] 9%|▉ | 827/8750 [1:21:24<12:32:29, 5.70s/it] 9%|▉ | 827/8750 [1:21:17<12:32:29, 5.70s/it] {'loss': 0.4793, 'learning_rate': 1.9782859146353196e-05, 'epoch': 0.09} + 9%|▉ | 827/8750 [1:21:24<12:32:29, 5.70s/it] {'loss': 0.4793, 'learning_rate': 1.9782859146353196e-05, 'epoch': 0.09} + 9%|▉ | 827/8750 [1:21:17<12:32:29, 5.70s/it] 9%|▉ | 828/8750 [1:21:30<12:49:31, 5.83s/it] 9%|▉ | 828/8750 [1:21:23<12:49:31, 5.83s/it] {'loss': 0.4941, 'learning_rate': 1.9782091271938e-05, 'epoch': 0.09} + 9%|▉ | 828/8750 [1:21:30<12:49:31, 5.83s/it] {'loss': 0.4941, 'learning_rate': 1.9782091271938e-05, 'epoch': 0.09} + 9%|▉ | 828/8750 [1:21:23<12:49:31, 5.83s/it] 9%|▉ | 829/8750 [1:21:35<12:44:32, 5.79s/it] 9%|▉ | 829/8750 [1:21:29<12:44:33, 5.79s/it] {'loss': 0.5018, 'learning_rate': 1.9781322057157902e-05, 'epoch': 0.09} + 9%|▉ | 829/8750 [1:21:35<12:44:32, 5.79s/it] {'loss': 0.5018, 'learning_rate': 1.9781322057157902e-05, 'epoch': 0.09} + 9%|▉ | 829/8750 [1:21:29<12:44:33, 5.79s/it] 9%|▉ | 830/8750 [1:21:41<12:38:18, 5.74s/it] 9%|▉ | 830/8750 [1:21:35<12:38:18, 5.74s/it] {'loss': 0.4994, 'learning_rate': 1.9780551502118306e-05, 'epoch': 0.09} + 9%|▉ | 830/8750 [1:21:41<12:38:18, 5.74s/it] {'loss': 0.4994, 'learning_rate': 1.9780551502118306e-05, 'epoch': 0.09} + 9%|▉ | 830/8750 [1:21:35<12:38:18, 5.74s/it] 9%|▉ | 831/8750 [1:21:47<12:43:45, 5.79s/it] 9%|▉ | 831/8750 [1:21:40<12:43:45, 
5.79s/it] {'loss': 0.491, 'learning_rate': 1.9779779606924788e-05, 'epoch': 0.09} + 9%|▉ | 831/8750 [1:21:47<12:43:45, 5.79s/it] {'loss': 0.491, 'learning_rate': 1.9779779606924788e-05, 'epoch': 0.09} + 9%|▉ | 831/8750 [1:21:40<12:43:45, 5.79s/it] 10%|▉ | 832/8750 [1:21:53<12:45:25, 5.80s/it] 10%|▉ | 832/8750 [1:21:46<12:45:25, 5.80s/it] {'loss': 0.4961, 'learning_rate': 1.977900637168312e-05, 'epoch': 0.1} + 10%|▉ | 832/8750 [1:21:53<12:45:25, 5.80s/it] {'loss': 0.4961, 'learning_rate': 1.977900637168312e-05, 'epoch': 0.1} + 10%|▉ | 832/8750 [1:21:46<12:45:25, 5.80s/it] 10%|▉ | 833/8750 [1:21:58<12:35:17, 5.72s/it] 10%|▉ | 833/8750 [1:21:52<12:35:16, 5.72s/it] {'loss': 0.4925, 'learning_rate': 1.9778231796499254e-05, 'epoch': 0.1} + 10%|▉ | 833/8750 [1:21:58<12:35:17, 5.72s/it] {'loss': 0.4925, 'learning_rate': 1.9778231796499254e-05, 'epoch': 0.1} + 10%|▉ | 833/8750 [1:21:52<12:35:16, 5.72s/it] 10%|▉ | 834/8750 [1:22:04<12:34:31, 5.72s/it] 10%|▉ | 834/8750 [1:21:58<12:34:30, 5.72s/it] {'loss': 0.4914, 'learning_rate': 1.977745588147932e-05, 'epoch': 0.1} + 10%|▉ | 834/8750 [1:22:04<12:34:31, 5.72s/it] {'loss': 0.4914, 'learning_rate': 1.977745588147932e-05, 'epoch': 0.1} + 10%|▉ | 834/8750 [1:21:58<12:34:30, 5.72s/it] 10%|▉ | 835/8750 [1:22:10<12:37:16, 5.74s/it] 10%|▉ | 835/8750 [1:22:03<12:37:16, 5.74s/it] {'loss': 0.4989, 'learning_rate': 1.977667862672964e-05, 'epoch': 0.1} + 10%|▉ | 835/8750 [1:22:10<12:37:16, 5.74s/it] {'loss': 0.4989, 'learning_rate': 1.977667862672964e-05, 'epoch': 0.1} + 10%|▉ | 835/8750 [1:22:03<12:37:16, 5.74s/it] 10%|▉ | 836/8750 [1:22:15<12:34:35, 5.72s/it] 10%|▉ | 836/8750 [1:22:09<12:34:35, 5.72s/it] {'loss': 0.4993, 'learning_rate': 1.9775900032356704e-05, 'epoch': 0.1} + 10%|▉ | 836/8750 [1:22:15<12:34:35, 5.72s/it] {'loss': 0.4993, 'learning_rate': 1.9775900032356704e-05, 'epoch': 0.1} + 10%|▉ | 836/8750 [1:22:09<12:34:35, 5.72s/it] 10%|▉ | 837/8750 [1:22:21<12:37:12, 5.74s/it] 10%|▉ | 837/8750 [1:22:15<12:37:11, 5.74s/it] 
{'loss': 0.4878, 'learning_rate': 1.9775120098467212e-05, 'epoch': 0.1} + 10%|▉ | 837/8750 [1:22:21<12:37:12, 5.74s/it] {'loss': 0.4878, 'learning_rate': 1.9775120098467212e-05, 'epoch': 0.1} + 10%|▉ | 837/8750 [1:22:15<12:37:11, 5.74s/it] 10%|▉ | 838/8750 [1:22:27<12:31:45, 5.70s/it] 10%|▉ | 838/8750 [1:22:20<12:31:44, 5.70s/it] {'loss': 0.5092, 'learning_rate': 1.9774338825168024e-05, 'epoch': 0.1} + 10%|▉ | 838/8750 [1:22:27<12:31:45, 5.70s/it] {'loss': 0.5092, 'learning_rate': 1.9774338825168024e-05, 'epoch': 0.1} + 10%|▉ | 838/8750 [1:22:20<12:31:44, 5.70s/it] 10%|▉ | 839/8750 [1:22:33<12:37:02, 5.74s/it] 10%|▉ | 839/8750 [1:22:26<12:37:03, 5.74s/it] {'loss': 0.4944, 'learning_rate': 1.977355621256619e-05, 'epoch': 0.1} + 10%|▉ | 839/8750 [1:22:33<12:37:02, 5.74s/it] {'loss': 0.4944, 'learning_rate': 1.977355621256619e-05, 'epoch': 0.1} + 10%|▉ | 839/8750 [1:22:26<12:37:03, 5.74s/it] 10%|▉ | 840/8750 [1:22:38<12:34:44, 5.72s/it] 10%|▉ | 840/8750 [1:22:32<12:34:44, 5.73s/it] {'loss': 0.5022, 'learning_rate': 1.9772772260768954e-05, 'epoch': 0.1} + 10%|▉ | 840/8750 [1:22:38<12:34:44, 5.72s/it] {'loss': 0.5022, 'learning_rate': 1.9772772260768954e-05, 'epoch': 0.1} + 10%|▉ | 840/8750 [1:22:32<12:34:44, 5.73s/it] 10%|▉ | 841/8750 [1:22:44<12:28:26, 5.68s/it] 10%|▉ | 841/8750 [1:22:37<12:28:26, 5.68s/it] {'loss': 0.5086, 'learning_rate': 1.9771986969883727e-05, 'epoch': 0.1} + 10%|▉ | 841/8750 [1:22:44<12:28:26, 5.68s/it] {'loss': 0.5086, 'learning_rate': 1.9771986969883727e-05, 'epoch': 0.1} + 10%|▉ | 841/8750 [1:22:37<12:28:26, 5.68s/it] 10%|▉ | 842/8750 [1:22:50<12:24:19, 5.65s/it] 10%|▉ | 842/8750 [1:22:43<12:24:19, 5.65s/it] {'loss': 0.4895, 'learning_rate': 1.9771200340018115e-05, 'epoch': 0.1} + 10%|▉ | 842/8750 [1:22:50<12:24:19, 5.65s/it] {'loss': 0.4895, 'learning_rate': 1.9771200340018115e-05, 'epoch': 0.1} + 10%|▉ | 842/8750 [1:22:43<12:24:19, 5.65s/it] 10%|▉ | 843/8750 [1:22:55<12:25:20, 5.66s/it] 10%|▉ | 843/8750 [1:22:49<12:25:21, 5.66s/it] {'loss': 
0.4911, 'learning_rate': 1.97704123712799e-05, 'epoch': 0.1} + 10%|▉ | 843/8750 [1:22:55<12:25:20, 5.66s/it] {'loss': 0.4911, 'learning_rate': 1.97704123712799e-05, 'epoch': 0.1} + 10%|▉ | 843/8750 [1:22:49<12:25:21, 5.66s/it] 10%|▉ | 844/8750 [1:23:01<12:27:29, 5.67s/it] 10%|▉ | 844/8750 [1:22:54<12:27:28, 5.67s/it] {'loss': 0.5018, 'learning_rate': 1.976962306377706e-05, 'epoch': 0.1} + 10%|▉ | 844/8750 [1:23:01<12:27:29, 5.67s/it] {'loss': 0.5018, 'learning_rate': 1.976962306377706e-05, 'epoch': 0.1} + 10%|▉ | 844/8750 [1:22:54<12:27:28, 5.67s/it] 10%|▉ | 845/8750 [1:23:07<12:39:50, 5.77s/it] 10%|▉ | 845/8750 [1:23:00<12:39:49, 5.77s/it] {'loss': 0.4837, 'learning_rate': 1.9768832417617737e-05, 'epoch': 0.1} + 10%|▉ | 845/8750 [1:23:07<12:39:50, 5.77s/it] {'loss': 0.4837, 'learning_rate': 1.9768832417617737e-05, 'epoch': 0.1} + 10%|▉ | 845/8750 [1:23:00<12:39:49, 5.77s/it] 10%|▉ | 846/8750 [1:23:13<12:37:50, 5.75s/it] 10%|▉ | 846/8750 [1:23:06<12:37:50, 5.75s/it] {'loss': 0.5251, 'learning_rate': 1.9768040432910276e-05, 'epoch': 0.1} + 10%|▉ | 846/8750 [1:23:13<12:37:50, 5.75s/it] {'loss': 0.5251, 'learning_rate': 1.9768040432910276e-05, 'epoch': 0.1} + 10%|▉ | 846/8750 [1:23:06<12:37:50, 5.75s/it] 10%|▉ | 847/8750 [1:23:19<12:45:00, 5.81s/it] 10%|▉ | 847/8750 [1:23:12<12:45:00, 5.81s/it] {'loss': 0.5045, 'learning_rate': 1.976724710976319e-05, 'epoch': 0.1} + 10%|▉ | 847/8750 [1:23:19<12:45:00, 5.81s/it] {'loss': 0.5045, 'learning_rate': 1.976724710976319e-05, 'epoch': 0.1} + 10%|▉ | 847/8750 [1:23:12<12:45:00, 5.81s/it] 10%|▉ | 848/8750 [1:23:24<12:35:33, 5.74s/it] 10%|▉ | 848/8750 [1:23:18<12:35:33, 5.74s/it] {'loss': 0.5115, 'learning_rate': 1.9766452448285184e-05, 'epoch': 0.1} + 10%|▉ | 848/8750 [1:23:24<12:35:33, 5.74s/it] {'loss': 0.5115, 'learning_rate': 1.9766452448285184e-05, 'epoch': 0.1} + 10%|▉ | 848/8750 [1:23:18<12:35:33, 5.74s/it] 10%|▉ | 849/8750 [1:23:30<12:36:04, 5.74s/it] 10%|▉ | 849/8750 [1:23:23<12:36:04, 5.74s/it] {'loss': 0.4769, 
'learning_rate': 1.9765656448585148e-05, 'epoch': 0.1} + 10%|▉ | 849/8750 [1:23:30<12:36:04, 5.74s/it] {'loss': 0.4769, 'learning_rate': 1.9765656448585148e-05, 'epoch': 0.1} + 10%|▉ | 849/8750 [1:23:23<12:36:04, 5.74s/it]13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 10%|▉ | 850/8750 [1:23:36<12:42:11, 5.79s/it]12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... + 1 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 10%|▉ | 850/8750 [1:23:29<12:42:11, 5.79s/it]9 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4936, 'learning_rate': 1.976485911077215e-05, 'epoch': 0.1} + 10%|▉ | 850/8750 [1:23:36<12:42:11, 5.79s/it] {'loss': 0.4936, 'learning_rate': 1.976485911077215e-05, 'epoch': 0.1} + 10%|▉ | 850/8750 [1:23:29<12:42:11, 5.79s/it] 10%|▉ | 851/8750 [1:23:42<12:42:02, 5.79s/it] 10%|▉ | 851/8750 [1:23:35<12:42:02, 5.79s/it] {'loss': 0.4867, 'learning_rate': 1.9764060434955437e-05, 'epoch': 0.1} + 10%|▉ | 851/8750 [1:23:42<12:42:02, 5.79s/it] {'loss': 0.4867, 'learning_rate': 1.9764060434955437e-05, 'epoch': 0.1} + 10%|▉ | 851/8750 [1:23:35<12:42:02, 5.79s/it] 10%|▉ | 852/8750 [1:23:47<12:34:06, 5.73s/it] 10%|▉ | 852/8750 [1:23:41<12:34:06, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.9763260421244455e-05, 'epoch': 0.1} + 10%|▉ | 852/8750 [1:23:47<12:34:06, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.9763260421244455e-05, 'epoch': 0.1} + 10%|▉ | 852/8750 [1:23:41<12:34:06, 5.73s/it] 10%|▉ | 853/8750 [1:23:53<12:39:47, 5.77s/it] 10%|▉ | 853/8750 [1:23:47<12:39:46, 5.77s/it] {'loss': 0.4903, 'learning_rate': 1.9762459069748817e-05, 'epoch': 0.1} + 10%|▉ | 853/8750 [1:23:53<12:39:47, 5.77s/it] {'loss': 0.4903, 'learning_rate': 1.9762459069748817e-05, 'epoch': 0.1} + 10%|▉ | 853/8750 [1:23:47<12:39:46, 5.77s/it] 10%|▉ | 854/8750 [1:23:59<12:38:50, 5.77s/it] 10%|▉ | 854/8750 [1:23:52<12:38:49, 5.77s/it] {'loss': 0.4812, 'learning_rate': 1.9761656380578328e-05, 'epoch': 0.1} + 10%|▉ | 854/8750 [1:23:59<12:38:50, 5.77s/it] {'loss': 0.4812, 'learning_rate': 1.9761656380578328e-05, 'epoch': 0.1} + 10%|▉ | 854/8750 [1:23:52<12:38:49, 5.77s/it] 10%|▉ | 855/8750 [1:24:05<12:43:58, 5.81s/it] 10%|▉ | 855/8750 [1:23:58<12:43:59, 5.81s/it] {'loss': 0.4974, 'learning_rate': 1.9760852353842973e-05, 'epoch': 0.1} + 10%|▉ | 855/8750 [1:24:05<12:43:58, 5.81s/it] {'loss': 0.4974, 'learning_rate': 1.9760852353842973e-05, 'epoch': 0.1} + 10%|▉ | 855/8750 [1:23:58<12:43:59, 5.81s/it] 10%|▉ | 856/8750 [1:24:04<12:37:23, 5.76s/it] 10%|▉ | 856/8750 [1:24:10<12:37:25, 5.76s/it] 
{'loss': 0.5047, 'learning_rate': 1.9760046989652926e-05, 'epoch': 0.1} + 10%|▉ | 856/8750 [1:24:10<12:37:25, 5.76s/it] {'loss': 0.5047, 'learning_rate': 1.9760046989652926e-05, 'epoch': 0.1} + 10%|▉ | 856/8750 [1:24:04<12:37:23, 5.76s/it] 10%|▉ | 857/8750 [1:24:16<12:28:03, 5.69s/it] 10%|▉ | 857/8750 [1:24:09<12:28:02, 5.69s/it] {'loss': 0.4837, 'learning_rate': 1.9759240288118536e-05, 'epoch': 0.1} + 10%|▉ | 857/8750 [1:24:16<12:28:03, 5.69s/it] {'loss': 0.4837, 'learning_rate': 1.9759240288118536e-05, 'epoch': 0.1} + 10%|▉ | 857/8750 [1:24:09<12:28:02, 5.69s/it] 10%|▉ | 858/8750 [1:24:15<12:26:30, 5.68s/it] 10%|▉ | 858/8750 [1:24:21<12:26:32, 5.68s/it] {'loss': 0.4966, 'learning_rate': 1.975843224935034e-05, 'epoch': 0.1} + 10%|▉ | 858/8750 [1:24:21<12:26:32, 5.68s/it] {'loss': 0.4966, 'learning_rate': 1.975843224935034e-05, 'epoch': 0.1} + 10%|▉ | 858/8750 [1:24:15<12:26:30, 5.68s/it] 10%|▉ | 859/8750 [1:24:21<12:32:48, 5.72s/it] 10%|▉ | 859/8750 [1:24:27<12:32:48, 5.72s/it] {'loss': 0.505, 'learning_rate': 1.9757622873459056e-05, 'epoch': 0.1} + 10%|▉ | 859/8750 [1:24:27<12:32:48, 5.72s/it] {'loss': 0.505, 'learning_rate': 1.9757622873459056e-05, 'epoch': 0.1} + 10%|▉ | 859/8750 [1:24:21<12:32:48, 5.72s/it] 10%|▉ | 860/8750 [1:24:33<12:30:08, 5.70s/it] 10%|▉ | 860/8750 [1:24:26<12:30:09, 5.70s/it] {'loss': 0.4818, 'learning_rate': 1.9756812160555586e-05, 'epoch': 0.1} + 10%|▉ | 860/8750 [1:24:33<12:30:08, 5.70s/it] {'loss': 0.4818, 'learning_rate': 1.9756812160555586e-05, 'epoch': 0.1} + 10%|▉ | 860/8750 [1:24:27<12:30:09, 5.70s/it] 10%|▉ | 861/8750 [1:24:39<12:28:34, 5.69s/it] 10%|▉ | 861/8750 [1:24:32<12:28:34, 5.69s/it] {'loss': 0.4965, 'learning_rate': 1.9756000110751023e-05, 'epoch': 0.1} + 10%|▉ | 861/8750 [1:24:39<12:28:34, 5.69s/it] {'loss': 0.4965, 'learning_rate': 1.9756000110751023e-05, 'epoch': 0.1} + 10%|▉ | 861/8750 [1:24:32<12:28:34, 5.69s/it] 10%|▉ | 862/8750 [1:24:38<12:45:41, 5.82s/it] 10%|▉ | 862/8750 [1:24:45<12:45:42, 5.82s/it] {'loss': 
0.4987, 'learning_rate': 1.975518672415663e-05, 'epoch': 0.1} + 10%|▉ | 862/8750 [1:24:45<12:45:42, 5.82s/it] {'loss': 0.4987, 'learning_rate': 1.975518672415663e-05, 'epoch': 0.1} + 10%|▉ | 862/8750 [1:24:38<12:45:41, 5.82s/it] 10%|▉ | 863/8750 [1:24:51<12:49:01, 5.85s/it] 10%|▉ | 863/8750 [1:24:44<12:49:01, 5.85s/it] {'loss': 0.4718, 'learning_rate': 1.975437200088386e-05, 'epoch': 0.1} + 10%|▉ | 863/8750 [1:24:51<12:49:01, 5.85s/it] {'loss': 0.4718, 'learning_rate': 1.975437200088386e-05, 'epoch': 0.1} + 10%|▉ | 863/8750 [1:24:44<12:49:01, 5.85s/it] 10%|▉ | 864/8750 [1:24:56<12:37:39, 5.76s/it] {'loss': 0.5024, 'learning_rate': 1.9753555941044345e-05, 'epoch': 0.1} + 10%|▉ | 864/8750 [1:24:56<12:37:39, 5.76s/it] 10%|▉ | 864/8750 [1:24:50<12:37:38, 5.76s/it] {'loss': 0.5024, 'learning_rate': 1.9753555941044345e-05, 'epoch': 0.1} + 10%|▉ | 864/8750 [1:24:50<12:37:38, 5.76s/it] 10%|▉ | 865/8750 [1:24:55<12:34:31, 5.74s/it] 10%|▉ | 865/8750 [1:25:02<12:34:31, 5.74s/it] {'loss': 0.4937, 'learning_rate': 1.9752738544749906e-05, 'epoch': 0.1} + 10%|▉ | 865/8750 [1:25:02<12:34:31, 5.74s/it] {'loss': 0.4937, 'learning_rate': 1.9752738544749906e-05, 'epoch': 0.1} + 10%|▉ | 865/8750 [1:24:55<12:34:31, 5.74s/it] 10%|▉ | 866/8750 [1:25:01<12:31:53, 5.72s/it] 10%|▉ | 866/8750 [1:25:08<12:31:53, 5.72s/it] {'loss': 0.4968, 'learning_rate': 1.975191981211255e-05, 'epoch': 0.1} + 10%|▉ | 866/8750 [1:25:08<12:31:53, 5.72s/it] {'loss': 0.4968, 'learning_rate': 1.975191981211255e-05, 'epoch': 0.1} + 10%|▉ | 866/8750 [1:25:01<12:31:53, 5.72s/it] 10%|▉ | 867/8750 [1:25:13<12:26:55, 5.69s/it] 10%|▉ | 867/8750 [1:25:07<12:26:56, 5.69s/it] {'loss': 0.4785, 'learning_rate': 1.9751099743244454e-05, 'epoch': 0.1} + 10%|▉ | 867/8750 [1:25:13<12:26:55, 5.69s/it] {'loss': 0.4785, 'learning_rate': 1.9751099743244454e-05, 'epoch': 0.1} + 10%|▉ | 867/8750 [1:25:07<12:26:56, 5.69s/it] 10%|▉ | 868/8750 [1:25:19<12:30:46, 5.72s/it] 10%|▉ | 868/8750 [1:25:13<12:30:46, 5.72s/it] {'loss': 0.5002, 
'learning_rate': 1.9750278338257985e-05, 'epoch': 0.1} + 10%|▉ | 868/8750 [1:25:19<12:30:46, 5.72s/it] {'loss': 0.5002, 'learning_rate': 1.9750278338257985e-05, 'epoch': 0.1} + 10%|▉ | 868/8750 [1:25:13<12:30:46, 5.72s/it] 10%|▉ | 869/8750 [1:25:25<12:28:31, 5.70s/it] 10%|▉ | 869/8750 [1:25:18<12:28:31, 5.70s/it] {'loss': 0.4948, 'learning_rate': 1.9749455597265704e-05, 'epoch': 0.1} + 10%|▉ | 869/8750 [1:25:25<12:28:31, 5.70s/it] {'loss': 0.4948, 'learning_rate': 1.9749455597265704e-05, 'epoch': 0.1} + 10%|▉ | 869/8750 [1:25:18<12:28:31, 5.70s/it] 10%|▉ | 870/8750 [1:25:24<12:31:50, 5.72s/it] 10%|▉ | 870/8750 [1:25:30<12:31:50, 5.72s/it] {'loss': 0.4887, 'learning_rate': 1.9748631520380333e-05, 'epoch': 0.1} + 10%|▉ | 870/8750 [1:25:30<12:31:50, 5.72s/it] {'loss': 0.4887, 'learning_rate': 1.9748631520380333e-05, 'epoch': 0.1} + 10%|▉ | 870/8750 [1:25:24<12:31:50, 5.72s/it] 10%|▉ | 871/8750 [1:25:36<12:40:03, 5.79s/it] 10%|▉ | 871/8750 [1:25:30<12:40:03, 5.79s/it] {'loss': 0.4751, 'learning_rate': 1.97478061077148e-05, 'epoch': 0.1} + 10%|▉ | 871/8750 [1:25:36<12:40:03, 5.79s/it] {'loss': 0.4751, 'learning_rate': 1.97478061077148e-05, 'epoch': 0.1} + 10%|▉ | 871/8750 [1:25:30<12:40:03, 5.79s/it] 10%|▉ | 872/8750 [1:25:42<12:36:38, 5.76s/it] 10%|▉ | 872/8750 [1:25:36<12:36:38, 5.76s/it] {'loss': 0.4976, 'learning_rate': 1.9746979359382193e-05, 'epoch': 0.1} + 10%|▉ | 872/8750 [1:25:42<12:36:38, 5.76s/it] {'loss': 0.4976, 'learning_rate': 1.9746979359382193e-05, 'epoch': 0.1} + 10%|▉ | 872/8750 [1:25:36<12:36:38, 5.76s/it] 10%|▉ | 873/8750 [1:25:48<12:25:38, 5.68s/it] 10%|▉ | 873/8750 [1:25:41<12:25:38, 5.68s/it] {'loss': 0.5071, 'learning_rate': 1.9746151275495803e-05, 'epoch': 0.1} + 10%|▉ | 873/8750 [1:25:48<12:25:38, 5.68s/it] {'loss': 0.5071, 'learning_rate': 1.9746151275495803e-05, 'epoch': 0.1} + 10%|▉ | 873/8750 [1:25:41<12:25:38, 5.68s/it] 10%|▉ | 874/8750 [1:25:53<12:25:32, 5.68s/it] 10%|▉ | 874/8750 [1:25:47<12:25:32, 5.68s/it] {'loss': 0.5251, 
'learning_rate': 1.974532185616909e-05, 'epoch': 0.1} + 10%|▉ | 874/8750 [1:25:53<12:25:32, 5.68s/it] {'loss': 0.5251, 'learning_rate': 1.974532185616909e-05, 'epoch': 0.1} + 10%|▉ | 874/8750 [1:25:47<12:25:32, 5.68s/it] 10%|█ | 875/8750 [1:25:59<12:24:29, 5.67s/it] 10%|█ | 875/8750 [1:25:52<12:24:28, 5.67s/it] {'loss': 0.484, 'learning_rate': 1.9744491101515715e-05, 'epoch': 0.1} + 10%|█ | 875/8750 [1:25:59<12:24:29, 5.67s/it] {'loss': 0.484, 'learning_rate': 1.9744491101515715e-05, 'epoch': 0.1} + 10%|█ | 875/8750 [1:25:52<12:24:28, 5.67s/it] 10%|█ | 876/8750 [1:25:58<12:22:52, 5.66s/it] 10%|█ | 876/8750 [1:26:05<12:22:53, 5.66s/it] {'loss': 0.4894, 'learning_rate': 1.9743659011649495e-05, 'epoch': 0.1} + 10%|█ | 876/8750 [1:26:05<12:22:53, 5.66s/it] {'loss': 0.4894, 'learning_rate': 1.9743659011649495e-05, 'epoch': 0.1} + 10%|█ | 876/8750 [1:25:58<12:22:52, 5.66s/it] 10%|█ | 877/8750 [1:26:10<12:21:57, 5.65s/it] 10%|█ | 877/8750 [1:26:04<12:21:57, 5.65s/it] {'loss': 0.4815, 'learning_rate': 1.9742825586684457e-05, 'epoch': 0.1} + 10%|█ | 877/8750 [1:26:10<12:21:57, 5.65s/it] {'loss': 0.4815, 'learning_rate': 1.9742825586684457e-05, 'epoch': 0.1} + 10%|█ | 877/8750 [1:26:04<12:21:57, 5.65s/it] 10%|█ | 878/8750 [1:26:16<12:19:02, 5.63s/it] 10%|█ | 878/8750 [1:26:09<12:19:02, 5.63s/it] {'loss': 0.5014, 'learning_rate': 1.9741990826734793e-05, 'epoch': 0.1} + 10%|█ | 878/8750 [1:26:16<12:19:02, 5.63s/it] {'loss': 0.5014, 'learning_rate': 1.9741990826734793e-05, 'epoch': 0.1} + 10%|█ | 878/8750 [1:26:09<12:19:02, 5.63s/it] 10%|█ | 879/8750 [1:26:21<12:21:53, 5.66s/it] 10%|█ | 879/8750 [1:26:15<12:21:54, 5.66s/it] {'loss': 0.4918, 'learning_rate': 1.9741154731914882e-05, 'epoch': 0.1} + 10%|█ | 879/8750 [1:26:21<12:21:53, 5.66s/it] {'loss': 0.4918, 'learning_rate': 1.9741154731914882e-05, 'epoch': 0.1} + 10%|█ | 879/8750 [1:26:15<12:21:54, 5.66s/it] 10%|█ | 880/8750 [1:26:27<12:24:50, 5.68s/it] 10%|█ | 880/8750 [1:26:21<12:24:50, 5.68s/it] {'loss': 0.4868, 
'learning_rate': 1.974031730233929e-05, 'epoch': 0.1} + 10%|█ | 880/8750 [1:26:27<12:24:50, 5.68s/it] {'loss': 0.4868, 'learning_rate': 1.974031730233929e-05, 'epoch': 0.1} + 10%|█ | 880/8750 [1:26:21<12:24:50, 5.68s/it] 10%|█ | 881/8750 [1:26:33<12:21:51, 5.66s/it] 10%|█ | 881/8750 [1:26:26<12:21:52, 5.66s/it] {'loss': 0.4782, 'learning_rate': 1.9739478538122765e-05, 'epoch': 0.1} + 10%|█ | 881/8750 [1:26:33<12:21:51, 5.66s/it] {'loss': 0.4782, 'learning_rate': 1.9739478538122765e-05, 'epoch': 0.1} + 10%|█ | 881/8750 [1:26:26<12:21:52, 5.66s/it] 10%|█ | 882/8750 [1:26:38<12:19:12, 5.64s/it] 10%|█ | 882/8750 [1:26:32<12:19:13, 5.64s/it] {'loss': 0.5087, 'learning_rate': 1.9738638439380237e-05, 'epoch': 0.1} + 10%|█ | 882/8750 [1:26:38<12:19:12, 5.64s/it] {'loss': 0.5087, 'learning_rate': 1.9738638439380237e-05, 'epoch': 0.1} + 10%|█ | 882/8750 [1:26:32<12:19:13, 5.64s/it] 10%|█ | 883/8750 [1:26:44<12:21:06, 5.65s/it] 10%|█ | 883/8750 [1:26:38<12:21:06, 5.65s/it] {'loss': 0.4789, 'learning_rate': 1.9737797006226815e-05, 'epoch': 0.1} + {'loss': 0.4789, 'learning_rate': 1.9737797006226815e-05, 'epoch': 0.1} + 10%|█ | 883/8750 [1:26:44<12:21:06, 5.65s/it] 10%|█ | 883/8750 [1:26:38<12:21:06, 5.65s/it] 10%|█ | 884/8750 [1:26:50<12:20:28, 5.65s/it] 10%|█ | 884/8750 [1:26:43<12:20:28, 5.65s/it] {'loss': 0.4782, 'learning_rate': 1.9736954238777793e-05, 'epoch': 0.1} + 10%|█ | 884/8750 [1:26:50<12:20:28, 5.65s/it] {'loss': 0.4782, 'learning_rate': 1.9736954238777793e-05, 'epoch': 0.1} + 10%|█ | 884/8750 [1:26:43<12:20:28, 5.65s/it] 10%|█ | 885/8750 [1:26:55<12:22:09, 5.66s/it] 10%|█ | 885/8750 [1:26:49<12:22:09, 5.66s/it] {'loss': 0.4946, 'learning_rate': 1.973611013714865e-05, 'epoch': 0.1} + 10%|█ | 885/8750 [1:26:55<12:22:09, 5.66s/it] {'loss': 0.4946, 'learning_rate': 1.973611013714865e-05, 'epoch': 0.1} + 10%|█ | 885/8750 [1:26:49<12:22:09, 5.66s/it] 10%|█ | 886/8750 [1:27:01<12:24:12, 5.68s/it] 10%|█ | 886/8750 [1:26:55<12:24:12, 5.68s/it] {'loss': 0.5054, 
'learning_rate': 1.9735264701455054e-05, 'epoch': 0.1} + 10%|█ | 886/8750 [1:27:01<12:24:12, 5.68s/it] {'loss': 0.5054, 'learning_rate': 1.9735264701455054e-05, 'epoch': 0.1} + 10%|█ | 886/8750 [1:26:55<12:24:12, 5.68s/it] 10%|█ | 887/8750 [1:27:07<12:23:06, 5.67s/it] 10%|█ | 887/8750 [1:27:00<12:23:06, 5.67s/it] {'loss': 0.4952, 'learning_rate': 1.973441793181284e-05, 'epoch': 0.1} + 10%|█ | 887/8750 [1:27:07<12:23:06, 5.67s/it] {'loss': 0.4952, 'learning_rate': 1.973441793181284e-05, 'epoch': 0.1} + 10%|█ | 887/8750 [1:27:00<12:23:06, 5.67s/it] 10%|█ | 888/8750 [1:27:13<12:26:13, 5.69s/it] 10%|█ | 888/8750 [1:27:06<12:26:13, 5.69s/it] {'loss': 0.4816, 'learning_rate': 1.9733569828338038e-05, 'epoch': 0.1} + 10%|█ | 888/8750 [1:27:13<12:26:13, 5.69s/it] {'loss': 0.4816, 'learning_rate': 1.9733569828338038e-05, 'epoch': 0.1} + 10%|█ | 888/8750 [1:27:06<12:26:13, 5.69s/it] 10%|█ | 889/8750 [1:27:12<12:29:09, 5.72s/it] 10%|█ | 889/8750 [1:27:18<12:29:10, 5.72s/it] {'loss': 0.5189, 'learning_rate': 1.9732720391146852e-05, 'epoch': 0.1} + 10%|█ | 889/8750 [1:27:18<12:29:10, 5.72s/it] {'loss': 0.5189, 'learning_rate': 1.9732720391146852e-05, 'epoch': 0.1} + 10%|█ | 889/8750 [1:27:12<12:29:09, 5.72s/it] 10%|█ | 890/8750 [1:27:24<12:32:15, 5.74s/it] 10%|█ | 890/8750 [1:27:18<12:32:20, 5.74s/it] {'loss': 0.4958, 'learning_rate': 1.973186962035568e-05, 'epoch': 0.1} + 10%|█ | 890/8750 [1:27:24<12:32:15, 5.74s/it] {'loss': 0.4958, 'learning_rate': 1.973186962035568e-05, 'epoch': 0.1} + 10%|█ | 890/8750 [1:27:18<12:32:20, 5.74s/it] 10%|█ | 891/8750 [1:27:30<12:25:52, 5.69s/it] 10%|█ | 891/8750 [1:27:23<12:25:51, 5.69s/it] {'loss': 0.5128, 'learning_rate': 1.97310175160811e-05, 'epoch': 0.1} + 10%|█ | 891/8750 [1:27:30<12:25:52, 5.69s/it] {'loss': 0.5128, 'learning_rate': 1.97310175160811e-05, 'epoch': 0.1} + 10%|█ | 891/8750 [1:27:23<12:25:51, 5.69s/it] 10%|█ | 892/8750 [1:27:35<12:27:57, 5.71s/it] 10%|█ | 892/8750 [1:27:29<12:27:56, 5.71s/it] {'loss': 0.5123, 
'learning_rate': 1.9730164078439857e-05, 'epoch': 0.1} + 10%|█ | 892/8750 [1:27:35<12:27:57, 5.71s/it] {'loss': 0.5123, 'learning_rate': 1.9730164078439857e-05, 'epoch': 0.1} + 10%|█ | 892/8750 [1:27:29<12:27:56, 5.71s/it] 10%|█ | 893/8750 [1:27:41<12:32:35, 5.75s/it] 10%|█ | 893/8750 [1:27:35<12:32:34, 5.75s/it] {'loss': 0.4817, 'learning_rate': 1.97293093075489e-05, 'epoch': 0.1} + 10%|█ | 893/8750 [1:27:41<12:32:35, 5.75s/it] {'loss': 0.4817, 'learning_rate': 1.97293093075489e-05, 'epoch': 0.1} + 10%|█ | 893/8750 [1:27:35<12:32:34, 5.75s/it] 10%|█ | 894/8750 [1:27:47<12:29:27, 5.72s/it] 10%|█ | 894/8750 [1:27:40<12:29:26, 5.72s/it] {'loss': 0.5027, 'learning_rate': 1.9728453203525352e-05, 'epoch': 0.1} + 10%|█ | 894/8750 [1:27:47<12:29:27, 5.72s/it] {'loss': 0.5027, 'learning_rate': 1.9728453203525352e-05, 'epoch': 0.1} + 10%|█ | 894/8750 [1:27:40<12:29:26, 5.72s/it] 10%|█ | 895/8750 [1:27:53<12:31:23, 5.74s/it] 10%|█ | 895/8750 [1:27:46<12:31:23, 5.74s/it] {'loss': 0.5033, 'learning_rate': 1.9727595766486514e-05, 'epoch': 0.1} + 10%|█ | 895/8750 [1:27:53<12:31:23, 5.74s/it] {'loss': 0.5033, 'learning_rate': 1.9727595766486514e-05, 'epoch': 0.1} + 10%|█ | 895/8750 [1:27:46<12:31:23, 5.74s/it] 10%|█ | 896/8750 [1:27:58<12:29:47, 5.73s/it] 10%|█ | 896/8750 [1:27:52<12:29:47, 5.73s/it] {'loss': 0.4904, 'learning_rate': 1.972673699654988e-05, 'epoch': 0.1} + 10%|█ | 896/8750 [1:27:58<12:29:47, 5.73s/it] {'loss': 0.4904, 'learning_rate': 1.972673699654988e-05, 'epoch': 0.1} + 10%|█ | 896/8750 [1:27:52<12:29:47, 5.73s/it] 10%|█ | 897/8750 [1:28:04<12:27:10, 5.71s/it] 10%|█ | 897/8750 [1:27:58<12:27:09, 5.71s/it] {'loss': 0.4848, 'learning_rate': 1.9725876893833108e-05, 'epoch': 0.1} + 10%|█ | 897/8750 [1:28:04<12:27:10, 5.71s/it] {'loss': 0.4848, 'learning_rate': 1.9725876893833108e-05, 'epoch': 0.1} + 10%|█ | 897/8750 [1:27:58<12:27:09, 5.71s/it] 10%|█ | 898/8750 [1:28:10<12:36:24, 5.78s/it] 10%|█ | 898/8750 [1:28:04<12:36:24, 5.78s/it] {'loss': 0.4898, 
'learning_rate': 1.9725015458454068e-05, 'epoch': 0.1} + 10%|█ | 898/8750 [1:28:10<12:36:24, 5.78s/it] {'loss': 0.4898, 'learning_rate': 1.9725015458454068e-05, 'epoch': 0.1} + 10%|█ | 898/8750 [1:28:04<12:36:24, 5.78s/it] 10%|█ | 899/8750 [1:28:16<12:29:39, 5.73s/it] 10%|█ | 899/8750 [1:28:09<12:29:39, 5.73s/it] {'loss': 0.5056, 'learning_rate': 1.9724152690530785e-05, 'epoch': 0.1} + 10%|█ | 899/8750 [1:28:16<12:29:39, 5.73s/it] {'loss': 0.5056, 'learning_rate': 1.9724152690530785e-05, 'epoch': 0.1} + 10%|█ | 899/8750 [1:28:09<12:29:39, 5.73s/it]13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 10%|█ | 900/8750 [1:28:21<12:27:57, 5.72s/it]9 AutoResumeHook: Checking whether to suspend... 
+ 10%|█ | 900/8750 [1:28:15<12:27:57, 5.72s/it] {'loss': 0.4908, 'learning_rate': 1.972328859018148e-05, 'epoch': 0.1} + 10%|█ | 900/8750 [1:28:21<12:27:57, 5.72s/it] {'loss': 0.4908, 'learning_rate': 1.972328859018148e-05, 'epoch': 0.1} + 10%|█ | 900/8750 [1:28:15<12:27:57, 5.72s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 10%|█ | 901/8750 [1:28:43<22:37:03, 10.37s/it] 10%|█ | 901/8750 [1:28:36<22:37:03, 10.37s/it] {'loss': 0.4829, 'learning_rate': 1.9722423157524553e-05, 'epoch': 0.1} + 10%|█ | 901/8750 [1:28:43<22:37:03, 10.37s/it] {'loss': 0.4829, 'learning_rate': 1.9722423157524553e-05, 'epoch': 0.1} + 10%|█ | 901/8750 [1:28:36<22:37:03, 10.37s/it] 10%|█ | 902/8750 [1:28:48<19:40:03, 9.02s/it] 10%|█ | 902/8750 [1:28:42<19:40:03, 9.02s/it] {'loss': 0.4885, 'learning_rate': 1.972155639267859e-05, 'epoch': 0.1} + 10%|█ | 902/8750 [1:28:48<19:40:03, 9.02s/it] {'loss': 0.4885, 'learning_rate': 1.972155639267859e-05, 'epoch': 0.1} + 10%|█ | 902/8750 [1:28:42<19:40:03, 9.02s/it] 10%|█ | 903/8750 [1:28:54<17:27:09, 8.01s/it] 10%|█ | 903/8750 [1:28:48<17:27:10, 8.01s/it] {'loss': 0.4938, 'learning_rate': 1.972068829576236e-05, 'epoch': 0.1} + 10%|█ | 903/8750 [1:28:54<17:27:09, 8.01s/it] {'loss': 0.4938, 'learning_rate': 1.972068829576236e-05, 'epoch': 0.1} + 10%|█ | 903/8750 [1:28:48<17:27:10, 8.01s/it] 10%|█ | 904/8750 [1:29:00<16:04:52, 7.38s/it] 10%|█ | 904/8750 [1:28:54<16:04:53, 7.38s/it] {'loss': 0.4802, 'learning_rate': 1.9719818866894802e-05, 'epoch': 0.1} + 10%|█ | 904/8750 [1:29:00<16:04:52, 7.38s/it] {'loss': 0.4802, 'learning_rate': 1.9719818866894802e-05, 'epoch': 0.1} + 10%|█ | 904/8750 [1:28:54<16:04:53, 7.38s/it] 10%|█ | 905/8750 [1:29:06<14:52:39, 6.83s/it] 10%|█ | 905/8750 [1:28:59<14:52:39, 6.83s/it] {'loss': 0.5079, 'learning_rate': 1.9718948106195055e-05, 'epoch': 0.1} + 10%|█ | 905/8750 [1:29:06<14:52:39, 6.83s/it] {'loss': 0.5079, 'learning_rate': 1.9718948106195055e-05, 'epoch': 0.1} + 10%|█ | 905/8750 [1:28:59<14:52:39, 6.83s/it] 10%|█ | 906/8750 [1:29:11<14:17:56, 6.56s/it] 10%|█ | 906/8750 [1:29:05<14:17:55, 6.56s/it] {'loss': 0.4982, 'learning_rate': 1.971807601378243e-05, 'epoch': 0.1} + 10%|█ | 906/8750 [1:29:11<14:17:56, 6.56s/it] {'loss': 0.4982, 'learning_rate': 1.971807601378243e-05, 'epoch': 0.1} + 10%|█ | 906/8750 [1:29:05<14:17:55, 
6.56s/it] 10%|█ | 907/8750 [1:29:17<13:47:51, 6.33s/it] 10%|█ | 907/8750 [1:29:11<13:47:52, 6.33s/it] {'loss': 0.4993, 'learning_rate': 1.9717202589776424e-05, 'epoch': 0.1} + 10%|█ | 907/8750 [1:29:17<13:47:51, 6.33s/it] {'loss': 0.4993, 'learning_rate': 1.9717202589776424e-05, 'epoch': 0.1} + 10%|█ | 907/8750 [1:29:11<13:47:52, 6.33s/it] 10%|█ | 908/8750 [1:29:23<13:25:56, 6.17s/it] 10%|█ | 908/8750 [1:29:17<13:25:57, 6.17s/it] {'loss': 0.5026, 'learning_rate': 1.971632783429672e-05, 'epoch': 0.1} + 10%|█ | 908/8750 [1:29:23<13:25:56, 6.17s/it] {'loss': 0.5026, 'learning_rate': 1.971632783429672e-05, 'epoch': 0.1} + 10%|█ | 908/8750 [1:29:17<13:25:57, 6.17s/it] 10%|█ | 909/8750 [1:29:22<13:03:16, 5.99s/it] 10%|█ | 909/8750 [1:29:29<13:03:18, 5.99s/it] {'loss': 0.4881, 'learning_rate': 1.9715451747463168e-05, 'epoch': 0.1} + 10%|█ | 909/8750 [1:29:29<13:03:18, 5.99s/it] {'loss': 0.4881, 'learning_rate': 1.9715451747463168e-05, 'epoch': 0.1} + 10%|█ | 909/8750 [1:29:22<13:03:16, 5.99s/it] 10%|█ | 910/8750 [1:29:34<12:52:29, 5.91s/it] 10%|█ | 910/8750 [1:29:28<12:52:29, 5.91s/it] {'loss': 0.5166, 'learning_rate': 1.971457432939582e-05, 'epoch': 0.1} + 10%|█ | 910/8750 [1:29:34<12:52:29, 5.91s/it] {'loss': 0.5166, 'learning_rate': 1.971457432939582e-05, 'epoch': 0.1} + 10%|█ | 910/8750 [1:29:28<12:52:29, 5.91s/it] 10%|█ | 911/8750 [1:29:40<12:48:32, 5.88s/it] 10%|█ | 911/8750 [1:29:34<12:48:32, 5.88s/it] {'loss': 0.4888, 'learning_rate': 1.97136955802149e-05, 'epoch': 0.1} + 10%|█ | 911/8750 [1:29:40<12:48:32, 5.88s/it] {'loss': 0.4888, 'learning_rate': 1.97136955802149e-05, 'epoch': 0.1} + 10%|█ | 911/8750 [1:29:34<12:48:32, 5.88s/it] 10%|█ | 912/8750 [1:29:40<12:47:28, 5.87s/it] 10%|█ | 912/8750 [1:29:46<12:47:32, 5.88s/it] {'loss': 0.4803, 'learning_rate': 1.9712815500040815e-05, 'epoch': 0.1} + 10%|█ | 912/8750 [1:29:46<12:47:32, 5.88s/it] {'loss': 0.4803, 'learning_rate': 1.9712815500040815e-05, 'epoch': 0.1} + 10%|█ | 912/8750 [1:29:40<12:47:28, 5.87s/it] 10%|█ 
| 913/8750 [1:29:45<12:46:48, 5.87s/it] 10%|█ | 913/8750 [1:29:52<12:46:47, 5.87s/it] {'loss': 0.4939, 'learning_rate': 1.9711934088994157e-05, 'epoch': 0.1} + 10%|█ | 913/8750 [1:29:52<12:46:47, 5.87s/it] {'loss': 0.4939, 'learning_rate': 1.9711934088994157e-05, 'epoch': 0.1} + 10%|█ | 913/8750 [1:29:45<12:46:48, 5.87s/it] 10%|█ | 914/8750 [1:29:51<12:39:51, 5.82s/it] 10%|█ | 914/8750 [1:29:58<12:39:51, 5.82s/it] {'loss': 0.4948, 'learning_rate': 1.97110513471957e-05, 'epoch': 0.1} + 10%|█ | 914/8750 [1:29:58<12:39:51, 5.82s/it] {'loss': 0.4948, 'learning_rate': 1.97110513471957e-05, 'epoch': 0.1} + 10%|█ | 914/8750 [1:29:51<12:39:51, 5.82s/it] 10%|█ | 915/8750 [1:30:03<12:36:43, 5.79s/it] 10%|█ | 915/8750 [1:29:57<12:36:43, 5.80s/it] {'loss': 0.4926, 'learning_rate': 1.9710167274766395e-05, 'epoch': 0.1} + 10%|█ | 915/8750 [1:30:03<12:36:43, 5.79s/it] {'loss': 0.4926, 'learning_rate': 1.9710167274766395e-05, 'epoch': 0.1} + 10%|█ | 915/8750 [1:29:57<12:36:43, 5.80s/it] 10%|█ | 916/8750 [1:30:09<12:34:03, 5.78s/it] 10%|█ | 916/8750 [1:30:03<12:34:03, 5.78s/it] {'loss': 0.4969, 'learning_rate': 1.9709281871827386e-05, 'epoch': 0.1} + 10%|█ | 916/8750 [1:30:09<12:34:03, 5.78s/it] {'loss': 0.4969, 'learning_rate': 1.9709281871827386e-05, 'epoch': 0.1} + 10%|█ | 916/8750 [1:30:03<12:34:03, 5.78s/it] 10%|█ | 917/8750 [1:30:15<12:30:50, 5.75s/it] 10%|█ | 917/8750 [1:30:08<12:30:50, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.9708395138499986e-05, 'epoch': 0.1} + 10%|█ | 917/8750 [1:30:15<12:30:50, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.9708395138499986e-05, 'epoch': 0.1} + 10%|█ | 917/8750 [1:30:08<12:30:50, 5.75s/it] 10%|█ | 918/8750 [1:30:20<12:25:25, 5.71s/it] 10%|█ | 918/8750 [1:30:14<12:25:26, 5.71s/it] {'loss': 0.5263, 'learning_rate': 1.97075070749057e-05, 'epoch': 0.1} + 10%|█ | 918/8750 [1:30:20<12:25:25, 5.71s/it] {'loss': 0.5263, 'learning_rate': 1.97075070749057e-05, 'epoch': 0.1} + 10%|█ | 918/8750 [1:30:14<12:25:26, 5.71s/it] 11%|█ | 919/8750 
[1:30:26<12:25:35, 5.71s/it] 11%|█ | 919/8750 [1:30:20<12:25:35, 5.71s/it] {'loss': 0.4922, 'learning_rate': 1.970661768116622e-05, 'epoch': 0.11} + 11%|█ | 919/8750 [1:30:26<12:25:35, 5.71s/it] {'loss': 0.4922, 'learning_rate': 1.970661768116622e-05, 'epoch': 0.11} + 11%|█ | 919/8750 [1:30:20<12:25:35, 5.71s/it] 11%|█ | 920/8750 [1:30:32<12:26:09, 5.72s/it] 11%|█ | 920/8750 [1:30:25<12:26:09, 5.72s/it] {'loss': 0.4912, 'learning_rate': 1.9705726957403398e-05, 'epoch': 0.11} + 11%|█ | 920/8750 [1:30:32<12:26:09, 5.72s/it] {'loss': 0.4912, 'learning_rate': 1.9705726957403398e-05, 'epoch': 0.11} + 11%|█ | 920/8750 [1:30:25<12:26:09, 5.72s/it] 11%|█ | 921/8750 [1:30:38<12:26:07, 5.72s/it] 11%|█ | 921/8750 [1:30:31<12:26:07, 5.72s/it] {'loss': 0.4587, 'learning_rate': 1.9704834903739297e-05, 'epoch': 0.11} + 11%|█ | 921/8750 [1:30:38<12:26:07, 5.72s/it] {'loss': 0.4587, 'learning_rate': 1.9704834903739297e-05, 'epoch': 0.11} + 11%|█ | 921/8750 [1:30:31<12:26:07, 5.72s/it] 11%|█ | 922/8750 [1:30:43<12:30:32, 5.75s/it] 11%|█ | 922/8750 [1:30:37<12:30:32, 5.75s/it] {'loss': 0.5059, 'learning_rate': 1.970394152029614e-05, 'epoch': 0.11} + 11%|█ | 922/8750 [1:30:43<12:30:32, 5.75s/it] {'loss': 0.5059, 'learning_rate': 1.970394152029614e-05, 'epoch': 0.11} + 11%|█ | 922/8750 [1:30:37<12:30:32, 5.75s/it] 11%|█ | 923/8750 [1:30:49<12:24:53, 5.71s/it] 11%|█ | 923/8750 [1:30:43<12:24:54, 5.71s/it] {'loss': 0.4826, 'learning_rate': 1.970304680719634e-05, 'epoch': 0.11} + 11%|█ | 923/8750 [1:30:49<12:24:53, 5.71s/it] {'loss': 0.4826, 'learning_rate': 1.970304680719634e-05, 'epoch': 0.11} + 11%|█ | 923/8750 [1:30:43<12:24:54, 5.71s/it] 11%|█ | 924/8750 [1:30:55<12:24:50, 5.71s/it] 11%|█ | 924/8750 [1:30:48<12:24:49, 5.71s/it] {'loss': 0.5044, 'learning_rate': 1.9702150764562498e-05, 'epoch': 0.11} + 11%|█ | 924/8750 [1:30:55<12:24:50, 5.71s/it] {'loss': 0.5044, 'learning_rate': 1.9702150764562498e-05, 'epoch': 0.11} + 11%|█ | 924/8750 [1:30:48<12:24:49, 5.71s/it] 11%|█ | 925/8750 
[1:31:00<12:26:05, 5.72s/it] 11%|█ | 925/8750 [1:30:54<12:26:05, 5.72s/it] {'loss': 0.4838, 'learning_rate': 1.970125339251739e-05, 'epoch': 0.11} + 11%|█ | 925/8750 [1:31:00<12:26:05, 5.72s/it] {'loss': 0.4838, 'learning_rate': 1.970125339251739e-05, 'epoch': 0.11} + 11%|█ | 925/8750 [1:30:54<12:26:05, 5.72s/it] 11%|█ | 926/8750 [1:31:06<12:31:34, 5.76s/it] 11%|█ | 926/8750 [1:31:00<12:31:34, 5.76s/it] {'loss': 0.5082, 'learning_rate': 1.9700354691183977e-05, 'epoch': 0.11} + 11%|█ | 926/8750 [1:31:06<12:31:34, 5.76s/it] {'loss': 0.5082, 'learning_rate': 1.9700354691183977e-05, 'epoch': 0.11} + 11%|█ | 926/8750 [1:31:00<12:31:34, 5.76s/it] 11%|█ | 927/8750 [1:31:06<12:32:52, 5.77s/it] 11%|█ | 927/8750 [1:31:12<12:32:53, 5.77s/it] {'loss': 0.4833, 'learning_rate': 1.9699454660685398e-05, 'epoch': 0.11} + 11%|█ | 927/8750 [1:31:12<12:32:53, 5.77s/it] {'loss': 0.4833, 'learning_rate': 1.9699454660685398e-05, 'epoch': 0.11} + 11%|█ | 927/8750 [1:31:06<12:32:52, 5.77s/it] 11%|█ | 928/8750 [1:31:18<12:27:47, 5.74s/it] 11%|█ | 928/8750 [1:31:11<12:27:49, 5.74s/it] {'loss': 0.4837, 'learning_rate': 1.969855330114498e-05, 'epoch': 0.11} + 11%|█ | 928/8750 [1:31:18<12:27:47, 5.74s/it] {'loss': 0.4837, 'learning_rate': 1.969855330114498e-05, 'epoch': 0.11} + 11%|█ | 928/8750 [1:31:11<12:27:49, 5.74s/it] 11%|█ | 929/8750 [1:31:17<12:22:54, 5.70s/it] 11%|█ | 929/8750 [1:31:23<12:22:54, 5.70s/it] {'loss': 0.4915, 'learning_rate': 1.9697650612686228e-05, 'epoch': 0.11} + 11%|█ | 929/8750 [1:31:23<12:22:54, 5.70s/it] {'loss': 0.4915, 'learning_rate': 1.9697650612686228e-05, 'epoch': 0.11} + 11%|█ | 929/8750 [1:31:17<12:22:54, 5.70s/it] 11%|█ | 930/8750 [1:31:29<12:31:08, 5.76s/it] 11%|█ | 930/8750 [1:31:23<12:31:08, 5.76s/it] {'loss': 0.4941, 'learning_rate': 1.9696746595432828e-05, 'epoch': 0.11} + 11%|█ | 930/8750 [1:31:29<12:31:08, 5.76s/it] {'loss': 0.4941, 'learning_rate': 1.9696746595432828e-05, 'epoch': 0.11} + 11%|█ | 930/8750 [1:31:23<12:31:08, 5.76s/it] 11%|█ | 931/8750 
[1:31:35<12:24:19, 5.71s/it] 11%|█ | 931/8750 [1:31:28<12:24:20, 5.71s/it] {'loss': 0.5001, 'learning_rate': 1.9695841249508656e-05, 'epoch': 0.11} + 11%|█ | 931/8750 [1:31:35<12:24:19, 5.71s/it] {'loss': 0.5001, 'learning_rate': 1.9695841249508656e-05, 'epoch': 0.11} + 11%|█ | 931/8750 [1:31:28<12:24:20, 5.71s/it] 11%|█ | 932/8750 [1:31:34<12:20:57, 5.69s/it] 11%|█ | 932/8750 [1:31:41<12:20:57, 5.69s/it] {'loss': 0.514, 'learning_rate': 1.9694934575037762e-05, 'epoch': 0.11} + 11%|█ | 932/8750 [1:31:41<12:20:57, 5.69s/it] {'loss': 0.514, 'learning_rate': 1.9694934575037762e-05, 'epoch': 0.11} + 11%|█ | 932/8750 [1:31:34<12:20:57, 5.69s/it] 11%|█ | 933/8750 [1:31:40<12:27:04, 5.73s/it] 11%|█ | 933/8750 [1:31:46<12:27:06, 5.73s/it] {'loss': 0.4917, 'learning_rate': 1.969402657214438e-05, 'epoch': 0.11} + 11%|█ | 933/8750 [1:31:46<12:27:06, 5.73s/it] {'loss': 0.4917, 'learning_rate': 1.969402657214438e-05, 'epoch': 0.11} + 11%|█ | 933/8750 [1:31:40<12:27:04, 5.73s/it] 11%|█ | 934/8750 [1:31:52<12:26:17, 5.73s/it] 11%|█ | 934/8750 [1:31:46<12:26:18, 5.73s/it]{'loss': 0.4981, 'learning_rate': 1.9693117240952928e-05, 'epoch': 0.11} + {'loss': 0.4981, 'learning_rate': 1.9693117240952928e-05, 'epoch': 0.11} + 11%|█ | 934/8750 [1:31:52<12:26:17, 5.73s/it] 11%|█ | 934/8750 [1:31:46<12:26:18, 5.73s/it] 11%|█ | 935/8750 [1:31:58<12:25:31, 5.72s/it] 11%|█ | 935/8750 [1:31:51<12:25:31, 5.72s/it] {'loss': 0.4806, 'learning_rate': 1.9692206581588e-05, 'epoch': 0.11} + 11%|█ | 935/8750 [1:31:58<12:25:31, 5.72s/it] {'loss': 0.4806, 'learning_rate': 1.9692206581588e-05, 'epoch': 0.11} + 11%|█ | 935/8750 [1:31:51<12:25:31, 5.72s/it] 11%|█ | 936/8750 [1:31:57<12:27:49, 5.74s/it] 11%|█ | 936/8750 [1:32:04<12:27:50, 5.74s/it] {'loss': 0.496, 'learning_rate': 1.969129459417438e-05, 'epoch': 0.11} + 11%|█ | 936/8750 [1:32:04<12:27:50, 5.74s/it] {'loss': 0.496, 'learning_rate': 1.969129459417438e-05, 'epoch': 0.11} + 11%|█ | 936/8750 [1:31:57<12:27:49, 5.74s/it] 11%|█ | 937/8750 
[1:32:03<12:42:35, 5.86s/it] 11%|█ | 937/8750 [1:32:10<12:42:35, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.9690381278837038e-05, 'epoch': 0.11} + 11%|█ | 937/8750 [1:32:10<12:42:35, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.9690381278837038e-05, 'epoch': 0.11} + 11%|█ | 937/8750 [1:32:03<12:42:35, 5.86s/it] 11%|█ | 938/8750 [1:32:15<12:37:05, 5.81s/it] 11%|█ | 938/8750 [1:32:09<12:37:05, 5.81s/it] {'loss': 0.5036, 'learning_rate': 1.9689466635701106e-05, 'epoch': 0.11} + 11%|█ | 938/8750 [1:32:15<12:37:05, 5.81s/it] {'loss': 0.5036, 'learning_rate': 1.9689466635701106e-05, 'epoch': 0.11} + 11%|█ | 938/8750 [1:32:09<12:37:05, 5.81s/it] 11%|█ | 939/8750 [1:32:21<12:33:24, 5.79s/it] 11%|█ | 939/8750 [1:32:15<12:33:25, 5.79s/it] {'loss': 0.5118, 'learning_rate': 1.9688550664891915e-05, 'epoch': 0.11} + 11%|█ | 939/8750 [1:32:21<12:33:24, 5.79s/it] {'loss': 0.5118, 'learning_rate': 1.9688550664891915e-05, 'epoch': 0.11} + 11%|█ | 939/8750 [1:32:15<12:33:25, 5.79s/it] 11%|█ | 940/8750 [1:32:27<12:32:16, 5.78s/it] 11%|█ | 940/8750 [1:32:20<12:32:16, 5.78s/it] {'loss': 0.5007, 'learning_rate': 1.968763336653498e-05, 'epoch': 0.11} + 11%|█ | 940/8750 [1:32:27<12:32:16, 5.78s/it] {'loss': 0.5007, 'learning_rate': 1.968763336653498e-05, 'epoch': 0.11} + 11%|█ | 940/8750 [1:32:20<12:32:16, 5.78s/it] 11%|█ | 941/8750 [1:32:33<12:31:42, 5.78s/it] 11%|█ | 941/8750 [1:32:26<12:31:42, 5.78s/it] {'loss': 0.4755, 'learning_rate': 1.968671474075598e-05, 'epoch': 0.11} + 11%|█ | 941/8750 [1:32:33<12:31:42, 5.78s/it] {'loss': 0.4755, 'learning_rate': 1.968671474075598e-05, 'epoch': 0.11} + 11%|█ | 941/8750 [1:32:26<12:31:42, 5.78s/it] 11%|█ | 942/8750 [1:32:32<12:27:08, 5.74s/it] 11%|█ | 942/8750 [1:32:38<12:27:09, 5.74s/it] {'loss': 0.51, 'learning_rate': 1.96857947876808e-05, 'epoch': 0.11} + 11%|█ | 942/8750 [1:32:38<12:27:09, 5.74s/it] {'loss': 0.51, 'learning_rate': 1.96857947876808e-05, 'epoch': 0.11} + 11%|█ | 942/8750 [1:32:32<12:27:08, 5.74s/it] 11%|█ | 943/8750 
[1:32:44<12:26:39, 5.74s/it] 11%|█ | 943/8750 [1:32:38<12:26:40, 5.74s/it] {'loss': 0.4993, 'learning_rate': 1.968487350743548e-05, 'epoch': 0.11} + 11%|█ | 943/8750 [1:32:44<12:26:39, 5.74s/it] {'loss': 0.4993, 'learning_rate': 1.968487350743548e-05, 'epoch': 0.11} + 11%|█ | 943/8750 [1:32:38<12:26:40, 5.74s/it] 11%|█ | 944/8750 [1:32:50<12:28:43, 5.75s/it] 11%|█ | 944/8750 [1:32:43<12:28:43, 5.75s/it] {'loss': 0.5162, 'learning_rate': 1.968395090014627e-05, 'epoch': 0.11} + 11%|█ | 944/8750 [1:32:50<12:28:43, 5.75s/it] {'loss': 0.5162, 'learning_rate': 1.968395090014627e-05, 'epoch': 0.11} + 11%|█ | 944/8750 [1:32:43<12:28:43, 5.75s/it] 11%|█ | 945/8750 [1:32:55<12:22:06, 5.70s/it] 11%|█ | 945/8750 [1:32:49<12:22:06, 5.70s/it] {'loss': 0.4912, 'learning_rate': 1.968302696593958e-05, 'epoch': 0.11} + 11%|█ | 945/8750 [1:32:55<12:22:06, 5.70s/it] {'loss': 0.4912, 'learning_rate': 1.968302696593958e-05, 'epoch': 0.11} + 11%|█ | 945/8750 [1:32:49<12:22:06, 5.70s/it] 11%|█ | 946/8750 [1:33:01<12:20:14, 5.69s/it] 11%|█ | 946/8750 [1:32:55<12:20:14, 5.69s/it] {'loss': 0.4905, 'learning_rate': 1.968210170494201e-05, 'epoch': 0.11} + 11%|█ | 946/8750 [1:33:01<12:20:14, 5.69s/it] {'loss': 0.4905, 'learning_rate': 1.968210170494201e-05, 'epoch': 0.11} + 11%|█ | 946/8750 [1:32:55<12:20:14, 5.69s/it] 11%|█ | 947/8750 [1:33:07<12:22:20, 5.71s/it] 11%|█ | 947/8750 [1:33:00<12:22:20, 5.71s/it] {'loss': 0.4988, 'learning_rate': 1.9681175117280343e-05, 'epoch': 0.11} + 11%|█ | 947/8750 [1:33:07<12:22:20, 5.71s/it] {'loss': 0.4988, 'learning_rate': 1.9681175117280343e-05, 'epoch': 0.11} + 11%|█ | 947/8750 [1:33:00<12:22:20, 5.71s/it] 11%|█ | 948/8750 [1:33:13<12:21:40, 5.70s/it] 11%|█ | 948/8750 [1:33:06<12:21:40, 5.70s/it] {'loss': 0.4947, 'learning_rate': 1.9680247203081537e-05, 'epoch': 0.11} + 11%|█ | 948/8750 [1:33:13<12:21:40, 5.70s/it] {'loss': 0.4947, 'learning_rate': 1.9680247203081537e-05, 'epoch': 0.11} + 11%|█ | 948/8750 [1:33:06<12:21:40, 5.70s/it] 11%|█ | 949/8750 
[1:33:18<12:17:42, 5.67s/it] 11%|█ | 949/8750 [1:33:12<12:17:42, 5.67s/it] {'loss': 0.4997, 'learning_rate': 1.9679317962472746e-05, 'epoch': 0.11} + 11%|█ | 949/8750 [1:33:18<12:17:42, 5.67s/it] {'loss': 0.4997, 'learning_rate': 1.9679317962472746e-05, 'epoch': 0.11} + 11%|█ | 949/8750 [1:33:12<12:17:42, 5.67s/it]13 AutoResumeHook: Checking whether to suspend... +3 1AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +0 11%|█ | 950/8750 [1:33:24<12:18:14, 5.68s/it]9 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+ 11%|█ | 950/8750 [1:33:17<12:18:14, 5.68s/it] {'loss': 0.4728, 'learning_rate': 1.9678387395581292e-05, 'epoch': 0.11} + 11%|█ | 950/8750 [1:33:24<12:18:14, 5.68s/it] {'loss': 0.4728, 'learning_rate': 1.9678387395581292e-05, 'epoch': 0.11} + 11%|█ | 950/8750 [1:33:17<12:18:14, 5.68s/it] 11%|█ | 951/8750 [1:33:23<12:22:54, 5.72s/it] 11%|█ | 951/8750 [1:33:30<12:22:57, 5.72s/it] {'loss': 0.4971, 'learning_rate': 1.967745550253468e-05, 'epoch': 0.11} + 11%|█ | 951/8750 [1:33:30<12:22:57, 5.72s/it] {'loss': 0.4971, 'learning_rate': 1.967745550253468e-05, 'epoch': 0.11} + 11%|█ | 951/8750 [1:33:23<12:22:54, 5.72s/it] 11%|█ | 952/8750 [1:33:36<12:35:24, 5.81s/it] 11%|█ | 952/8750 [1:33:29<12:35:24, 5.81s/it] {'loss': 0.488, 'learning_rate': 1.9676522283460606e-05, 'epoch': 0.11} + 11%|█ | 952/8750 [1:33:36<12:35:24, 5.81s/it] {'loss': 0.488, 'learning_rate': 1.9676522283460606e-05, 'epoch': 0.11} + 11%|█ | 952/8750 [1:33:29<12:35:24, 5.81s/it] 11%|█ | 953/8750 [1:33:35<12:32:45, 5.79s/it] 11%|█ | 953/8750 [1:33:41<12:32:45, 5.79s/it] {'loss': 0.4897, 'learning_rate': 1.9675587738486935e-05, 'epoch': 0.11} + 11%|█ | 953/8750 [1:33:41<12:32:45, 5.79s/it] {'loss': 0.4897, 'learning_rate': 1.9675587738486935e-05, 'epoch': 0.11} + 11%|█ | 953/8750 [1:33:35<12:32:45, 5.79s/it] 11%|█ | 954/8750 [1:33:47<12:27:46, 5.76s/it] 11%|█ | 954/8750 [1:33:41<12:27:46, 5.76s/it] {'loss': 0.4924, 'learning_rate': 1.9674651867741733e-05, 'epoch': 0.11} + 11%|█ | 954/8750 [1:33:47<12:27:46, 5.76s/it] {'loss': 0.4924, 'learning_rate': 1.9674651867741733e-05, 'epoch': 0.11} + 11%|█ | 954/8750 [1:33:41<12:27:46, 5.76s/it] 11%|█ | 955/8750 [1:33:47<12:38:39, 5.84s/it] 11%|█ | 955/8750 [1:33:53<12:38:41, 5.84s/it] {'loss': 0.489, 'learning_rate': 1.967371467135322e-05, 'epoch': 0.11} + 11%|█ | 955/8750 [1:33:53<12:38:41, 5.84s/it] {'loss': 0.489, 'learning_rate': 1.967371467135322e-05, 'epoch': 0.11} + 11%|█ | 955/8750 [1:33:47<12:38:39, 5.84s/it] 11%|█ | 956/8750 [1:33:59<12:38:27, 5.84s/it] 
11%|█ | 956/8750 [1:33:52<12:38:28, 5.84s/it] {'loss': 0.4761, 'learning_rate': 1.9672776149449826e-05, 'epoch': 0.11} + 11%|█ | 956/8750 [1:33:59<12:38:27, 5.84s/it] {'loss': 0.4761, 'learning_rate': 1.9672776149449826e-05, 'epoch': 0.11} + 11%|█ | 956/8750 [1:33:52<12:38:28, 5.84s/it] 11%|█ | 957/8750 [1:34:05<12:32:19, 5.79s/it] 11%|█ | 957/8750 [1:33:58<12:32:20, 5.79s/it] {'loss': 0.513, 'learning_rate': 1.967183630216014e-05, 'epoch': 0.11} + 11%|█ | 957/8750 [1:34:05<12:32:19, 5.79s/it] {'loss': 0.513, 'learning_rate': 1.967183630216014e-05, 'epoch': 0.11} + 11%|█ | 957/8750 [1:33:58<12:32:20, 5.79s/it] 11%|█ | 958/8750 [1:34:10<12:26:14, 5.75s/it] 11%|█ | 958/8750 [1:34:04<12:26:14, 5.75s/it] {'loss': 0.4968, 'learning_rate': 1.9670895129612946e-05, 'epoch': 0.11} + 11%|█ | 958/8750 [1:34:10<12:26:14, 5.75s/it] {'loss': 0.4968, 'learning_rate': 1.9670895129612946e-05, 'epoch': 0.11} + 11%|█ | 958/8750 [1:34:04<12:26:14, 5.75s/it] 11%|█ | 959/8750 [1:34:16<12:20:29, 5.70s/it] 11%|█ | 959/8750 [1:34:09<12:20:29, 5.70s/it] {'loss': 0.4754, 'learning_rate': 1.9669952631937206e-05, 'epoch': 0.11} + 11%|█ | 959/8750 [1:34:16<12:20:29, 5.70s/it] {'loss': 0.4754, 'learning_rate': 1.9669952631937206e-05, 'epoch': 0.11} + 11%|█ | 959/8750 [1:34:09<12:20:29, 5.70s/it] 11%|█ | 960/8750 [1:34:22<12:22:47, 5.72s/it] 11%|█ | 960/8750 [1:34:15<12:22:47, 5.72s/it] {'loss': 0.4952, 'learning_rate': 1.9669008809262064e-05, 'epoch': 0.11} + 11%|█ | 960/8750 [1:34:22<12:22:47, 5.72s/it] {'loss': 0.4952, 'learning_rate': 1.9669008809262064e-05, 'epoch': 0.11} + 11%|█ | 960/8750 [1:34:15<12:22:47, 5.72s/it] 11%|█ | 961/8750 [1:34:27<12:19:58, 5.70s/it] 11%|█ | 961/8750 [1:34:21<12:19:59, 5.70s/it] {'loss': 0.4881, 'learning_rate': 1.9668063661716837e-05, 'epoch': 0.11} + 11%|█ | 961/8750 [1:34:27<12:19:58, 5.70s/it] {'loss': 0.4881, 'learning_rate': 1.9668063661716837e-05, 'epoch': 0.11} + 11%|█ | 961/8750 [1:34:21<12:19:59, 5.70s/it] 11%|█ | 962/8750 [1:34:33<12:19:00, 5.69s/it] 
11%|█ | 962/8750 [1:34:26<12:19:00, 5.69s/it] {'loss': 0.4988, 'learning_rate': 1.9667117189431045e-05, 'epoch': 0.11} + 11%|█ | 962/8750 [1:34:33<12:19:00, 5.69s/it] {'loss': 0.4988, 'learning_rate': 1.9667117189431045e-05, 'epoch': 0.11} + 11%|█ | 962/8750 [1:34:26<12:19:00, 5.69s/it] 11%|█ | 963/8750 [1:34:39<12:19:33, 5.70s/it] 11%|█ | 963/8750 [1:34:32<12:19:33, 5.70s/it] {'loss': 0.4809, 'learning_rate': 1.9666169392534363e-05, 'epoch': 0.11} + 11%|█ | 963/8750 [1:34:39<12:19:33, 5.70s/it] {'loss': 0.4809, 'learning_rate': 1.9666169392534363e-05, 'epoch': 0.11} + 11%|█ | 963/8750 [1:34:32<12:19:33, 5.70s/it] 11%|█ | 964/8750 [1:34:44<12:17:20, 5.68s/it] 11%|█ | 964/8750 [1:34:38<12:17:20, 5.68s/it] {'loss': 0.4772, 'learning_rate': 1.966522027115667e-05, 'epoch': 0.11} + 11%|█ | 964/8750 [1:34:44<12:17:20, 5.68s/it] {'loss': 0.4772, 'learning_rate': 1.966522027115667e-05, 'epoch': 0.11} + 11%|█ | 964/8750 [1:34:38<12:17:20, 5.68s/it] 11%|█ | 965/8750 [1:34:50<12:25:40, 5.75s/it] 11%|█ | 965/8750 [1:34:44<12:25:40, 5.75s/it] {'loss': 0.4936, 'learning_rate': 1.966426982542801e-05, 'epoch': 0.11} + 11%|█ | 965/8750 [1:34:50<12:25:40, 5.75s/it] {'loss': 0.4936, 'learning_rate': 1.966426982542801e-05, 'epoch': 0.11} + 11%|█ | 965/8750 [1:34:44<12:25:40, 5.75s/it] 11%|█ | 966/8750 [1:34:56<12:32:07, 5.80s/it] 11%|█ | 966/8750 [1:34:50<12:32:07, 5.80s/it] {'loss': 0.4872, 'learning_rate': 1.9663318055478616e-05, 'epoch': 0.11} + 11%|█ | 966/8750 [1:34:56<12:32:07, 5.80s/it] {'loss': 0.4872, 'learning_rate': 1.9663318055478616e-05, 'epoch': 0.11} + 11%|█ | 966/8750 [1:34:50<12:32:07, 5.80s/it] 11%|█ | 967/8750 [1:35:02<12:20:02, 5.71s/it] 11%|█ | 967/8750 [1:34:55<12:20:03, 5.71s/it] {'loss': 0.5017, 'learning_rate': 1.9662364961438907e-05, 'epoch': 0.11} + 11%|█ | 967/8750 [1:35:02<12:20:02, 5.71s/it] {'loss': 0.5017, 'learning_rate': 1.9662364961438907e-05, 'epoch': 0.11} + 11%|█ | 967/8750 [1:34:55<12:20:03, 5.71s/it] 11%|█ | 968/8750 [1:35:07<12:19:27, 5.70s/it] 
11%|█ | 968/8750 [1:35:01<12:19:26, 5.70s/it] {'loss': 0.4938, 'learning_rate': 1.966141054343947e-05, 'epoch': 0.11} + 11%|█ | 968/8750 [1:35:07<12:19:27, 5.70s/it] {'loss': 0.4938, 'learning_rate': 1.966141054343947e-05, 'epoch': 0.11} + 11%|█ | 968/8750 [1:35:01<12:19:26, 5.70s/it] 11%|█ | 969/8750 [1:35:13<12:24:21, 5.74s/it] 11%|█ | 969/8750 [1:35:07<12:24:20, 5.74s/it] {'loss': 0.4742, 'learning_rate': 1.9660454801611094e-05, 'epoch': 0.11} + 11%|█ | 969/8750 [1:35:13<12:24:21, 5.74s/it] {'loss': 0.4742, 'learning_rate': 1.9660454801611094e-05, 'epoch': 0.11} + 11%|█ | 969/8750 [1:35:07<12:24:20, 5.74s/it] 11%|█ | 970/8750 [1:35:12<12:24:48, 5.74s/it] 11%|█ | 970/8750 [1:35:19<12:24:52, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.9659497736084722e-05, 'epoch': 0.11} + 11%|█ | 970/8750 [1:35:19<12:24:52, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.9659497736084722e-05, 'epoch': 0.11} + 11%|█ | 970/8750 [1:35:12<12:24:48, 5.74s/it] 11%|█ | 971/8750 [1:35:25<12:20:47, 5.71s/it] 11%|█ | 971/8750 [1:35:18<12:20:48, 5.71s/it] {'loss': 0.4862, 'learning_rate': 1.9658539346991504e-05, 'epoch': 0.11} + 11%|█ | 971/8750 [1:35:25<12:20:47, 5.71s/it] {'loss': 0.4862, 'learning_rate': 1.9658539346991504e-05, 'epoch': 0.11} + 11%|█ | 971/8750 [1:35:18<12:20:48, 5.71s/it] 11%|█ | 972/8750 [1:35:24<12:28:06, 5.77s/it] 11%|█ | 972/8750 [1:35:30<12:28:07, 5.77s/it] {'loss': 0.4894, 'learning_rate': 1.9657579634462757e-05, 'epoch': 0.11} + 11%|█ | 972/8750 [1:35:24<12:28:06, 5.77s/it]{'loss': 0.4894, 'learning_rate': 1.9657579634462757e-05, 'epoch': 0.11} + 11%|█ | 972/8750 [1:35:30<12:28:07, 5.77s/it] 11%|█ | 973/8750 [1:35:36<12:25:08, 5.75s/it] 11%|█ | 973/8750 [1:35:30<12:25:09, 5.75s/it] {'loss': 0.4805, 'learning_rate': 1.9656618598629985e-05, 'epoch': 0.11} + 11%|█ | 973/8750 [1:35:36<12:25:08, 5.75s/it] {'loss': 0.4805, 'learning_rate': 1.9656618598629985e-05, 'epoch': 0.11} + 11%|█ | 973/8750 [1:35:30<12:25:09, 5.75s/it] 11%|█ | 974/8750 [1:35:42<12:18:14, 
5.70s/it] 11%|█ | 974/8750 [1:35:35<12:18:15, 5.70s/it] {'loss': 0.4846, 'learning_rate': 1.9655656239624864e-05, 'epoch': 0.11} + 11%|█ | 974/8750 [1:35:42<12:18:14, 5.70s/it] {'loss': 0.4846, 'learning_rate': 1.9655656239624864e-05, 'epoch': 0.11} + 11%|█ | 974/8750 [1:35:35<12:18:15, 5.70s/it] 11%|█ | 975/8750 [1:35:47<12:18:52, 5.70s/it] 11%|█ | 975/8750 [1:35:41<12:18:52, 5.70s/it] {'loss': 0.5027, 'learning_rate': 1.965469255757927e-05, 'epoch': 0.11} + 11%|█ | 975/8750 [1:35:47<12:18:52, 5.70s/it] {'loss': 0.5027, 'learning_rate': 1.965469255757927e-05, 'epoch': 0.11} + 11%|█ | 975/8750 [1:35:41<12:18:52, 5.70s/it] 11%|█ | 976/8750 [1:35:53<12:16:54, 5.69s/it] 11%|█ | 976/8750 [1:35:47<12:16:54, 5.69s/it] {'loss': 0.4863, 'learning_rate': 1.9653727552625242e-05, 'epoch': 0.11} + 11%|█ | 976/8750 [1:35:53<12:16:54, 5.69s/it] {'loss': 0.4863, 'learning_rate': 1.9653727552625242e-05, 'epoch': 0.11} + 11%|█ | 976/8750 [1:35:47<12:16:54, 5.69s/it] 11%|█ | 977/8750 [1:35:59<12:20:10, 5.71s/it] 11%|█ | 977/8750 [1:35:52<12:20:10, 5.71s/it] {'loss': 0.4852, 'learning_rate': 1.9652761224895006e-05, 'epoch': 0.11} + 11%|█ | 977/8750 [1:35:59<12:20:10, 5.71s/it] {'loss': 0.4852, 'learning_rate': 1.9652761224895006e-05, 'epoch': 0.11} + 11%|█ | 977/8750 [1:35:52<12:20:10, 5.71s/it] 11%|█ | 978/8750 [1:36:05<12:26:44, 5.76s/it] 11%|█ | 978/8750 [1:35:58<12:26:44, 5.76s/it] {'loss': 0.4887, 'learning_rate': 1.9651793574520975e-05, 'epoch': 0.11} + 11%|█ | 978/8750 [1:36:05<12:26:44, 5.76s/it] {'loss': 0.4887, 'learning_rate': 1.9651793574520975e-05, 'epoch': 0.11} + 11%|█ | 978/8750 [1:35:58<12:26:44, 5.76s/it] 11%|█ | 979/8750 [1:36:10<12:20:28, 5.72s/it] 11%|█ | 979/8750 [1:36:04<12:20:28, 5.72s/it] {'loss': 0.4766, 'learning_rate': 1.965082460163574e-05, 'epoch': 0.11} + 11%|█ | 979/8750 [1:36:10<12:20:28, 5.72s/it] {'loss': 0.4766, 'learning_rate': 1.965082460163574e-05, 'epoch': 0.11} + 11%|█ | 979/8750 [1:36:04<12:20:28, 5.72s/it] 11%|█ | 980/8750 [1:36:16<12:18:26, 
5.70s/it] 11%|█ | 980/8750 [1:36:10<12:18:26, 5.70s/it] {'loss': 0.5133, 'learning_rate': 1.9649854306372065e-05, 'epoch': 0.11} + 11%|█ | 980/8750 [1:36:16<12:18:26, 5.70s/it] {'loss': 0.5133, 'learning_rate': 1.9649854306372065e-05, 'epoch': 0.11} + 11%|█ | 980/8750 [1:36:10<12:18:26, 5.70s/it] 11%|█ | 981/8750 [1:36:22<12:31:43, 5.81s/it] 11%|█ | 981/8750 [1:36:16<12:31:44, 5.81s/it] {'loss': 0.4669, 'learning_rate': 1.9648882688862905e-05, 'epoch': 0.11} + 11%|█ | 981/8750 [1:36:22<12:31:43, 5.81s/it] {'loss': 0.4669, 'learning_rate': 1.9648882688862905e-05, 'epoch': 0.11} + 11%|█ | 981/8750 [1:36:16<12:31:44, 5.81s/it] 11%|█ | 982/8750 [1:36:28<12:28:52, 5.78s/it] 11%|█ | 982/8750 [1:36:21<12:28:52, 5.78s/it] {'loss': 0.4821, 'learning_rate': 1.9647909749241394e-05, 'epoch': 0.11} + 11%|█ | 982/8750 [1:36:28<12:28:52, 5.78s/it] {'loss': 0.4821, 'learning_rate': 1.9647909749241394e-05, 'epoch': 0.11} + 11%|█ | 982/8750 [1:36:21<12:28:52, 5.78s/it] 11%|█ | 983/8750 [1:36:33<12:24:20, 5.75s/it] 11%|█ | 983/8750 [1:36:27<12:24:20, 5.75s/it] {'loss': 0.4946, 'learning_rate': 1.9646935487640848e-05, 'epoch': 0.11} + 11%|█ | 983/8750 [1:36:33<12:24:20, 5.75s/it] {'loss': 0.4946, 'learning_rate': 1.9646935487640848e-05, 'epoch': 0.11} + 11%|█ | 983/8750 [1:36:27<12:24:20, 5.75s/it] 11%|█ | 984/8750 [1:36:39<12:16:34, 5.69s/it] 11%|█ | 984/8750 [1:36:33<12:16:35, 5.69s/it] {'loss': 0.5043, 'learning_rate': 1.964595990419476e-05, 'epoch': 0.11} + 11%|█ | 984/8750 [1:36:39<12:16:34, 5.69s/it] {'loss': 0.5043, 'learning_rate': 1.964595990419476e-05, 'epoch': 0.11} + 11%|█ | 984/8750 [1:36:33<12:16:35, 5.69s/it] 11%|█▏ | 985/8750 [1:36:45<12:23:00, 5.74s/it] 11%|█▏ | 985/8750 [1:36:38<12:23:00, 5.74s/it] {'loss': 0.4751, 'learning_rate': 1.964498299903681e-05, 'epoch': 0.11} + 11%|█▏ | 985/8750 [1:36:45<12:23:00, 5.74s/it] {'loss': 0.4751, 'learning_rate': 1.964498299903681e-05, 'epoch': 0.11} + 11%|█▏ | 985/8750 [1:36:38<12:23:00, 5.74s/it] 11%|█▏ | 986/8750 
[1:36:44<12:17:13, 5.70s/it] 11%|█▏ | 986/8750 [1:36:50<12:17:14, 5.70s/it] {'loss': 0.5033, 'learning_rate': 1.964400477230085e-05, 'epoch': 0.11} + 11%|█▏ | 986/8750 [1:36:50<12:17:14, 5.70s/it] {'loss': 0.5033, 'learning_rate': 1.964400477230085e-05, 'epoch': 0.11} + 11%|█▏ | 986/8750 [1:36:44<12:17:13, 5.70s/it] 11%|█▏ | 987/8750 [1:36:50<12:19:19, 5.71s/it] 11%|█▏ | 987/8750 [1:36:56<12:19:19, 5.71s/it] {'loss': 0.4757, 'learning_rate': 1.9643025224120923e-05, 'epoch': 0.11} + 11%|█▏ | 987/8750 [1:36:56<12:19:19, 5.71s/it] {'loss': 0.4757, 'learning_rate': 1.9643025224120923e-05, 'epoch': 0.11} + 11%|█▏ | 987/8750 [1:36:50<12:19:19, 5.71s/it] 11%|█▏ | 988/8750 [1:36:56<12:26:15, 5.77s/it] 11%|█▏ | 988/8750 [1:37:02<12:26:15, 5.77s/it] {'loss': 0.4983, 'learning_rate': 1.9642044354631255e-05, 'epoch': 0.11} + 11%|█▏ | 988/8750 [1:37:02<12:26:15, 5.77s/it] {'loss': 0.4983, 'learning_rate': 1.9642044354631255e-05, 'epoch': 0.11} + 11%|█▏ | 988/8750 [1:36:56<12:26:15, 5.77s/it] 11%|█▏ | 989/8750 [1:37:08<12:23:46, 5.75s/it] 11%|█▏ | 989/8750 [1:37:01<12:23:46, 5.75s/it] {'loss': 0.49, 'learning_rate': 1.9641062163966232e-05, 'epoch': 0.11} + 11%|█▏ | 989/8750 [1:37:08<12:23:46, 5.75s/it] {'loss': 0.49, 'learning_rate': 1.9641062163966232e-05, 'epoch': 0.11} + 11%|█▏ | 989/8750 [1:37:01<12:23:46, 5.75s/it] 11%|█▏ | 990/8750 [1:37:14<12:21:33, 5.73s/it] 11%|█▏ | 990/8750 [1:37:07<12:21:39, 5.73s/it] {'loss': 0.4855, 'learning_rate': 1.9640078652260447e-05, 'epoch': 0.11} + 11%|█▏ | 990/8750 [1:37:14<12:21:33, 5.73s/it] {'loss': 0.4855, 'learning_rate': 1.9640078652260447e-05, 'epoch': 0.11} + 11%|█▏ | 990/8750 [1:37:07<12:21:39, 5.73s/it] 11%|█▏ | 991/8750 [1:37:19<12:27:33, 5.78s/it] 11%|█▏ | 991/8750 [1:37:13<12:27:31, 5.78s/it] {'loss': 0.5073, 'learning_rate': 1.9639093819648664e-05, 'epoch': 0.11} + 11%|█▏ | 991/8750 [1:37:19<12:27:33, 5.78s/it] {'loss': 0.5073, 'learning_rate': 1.9639093819648664e-05, 'epoch': 0.11} + 11%|█▏ | 991/8750 [1:37:13<12:27:31, 
5.78s/it] 11%|█▏ | 992/8750 [1:37:25<12:16:13, 5.69s/it] 11%|█▏ | 992/8750 [1:37:18<12:16:12, 5.69s/it] {'loss': 0.5177, 'learning_rate': 1.963810766626582e-05, 'epoch': 0.11} + 11%|█▏ | 992/8750 [1:37:25<12:16:13, 5.69s/it] {'loss': 0.5177, 'learning_rate': 1.963810766626582e-05, 'epoch': 0.11} + 11%|█▏ | 992/8750 [1:37:18<12:16:12, 5.69s/it] 11%|█▏ | 993/8750 [1:37:31<12:21:59, 5.74s/it] 11%|█▏ | 993/8750 [1:37:24<12:21:58, 5.74s/it] {'loss': 0.4823, 'learning_rate': 1.9637120192247046e-05, 'epoch': 0.11} + 11%|█▏ | 993/8750 [1:37:31<12:21:59, 5.74s/it] {'loss': 0.4823, 'learning_rate': 1.9637120192247046e-05, 'epoch': 0.11} + 11%|█▏ | 993/8750 [1:37:24<12:21:58, 5.74s/it] 11%|█▏ | 994/8750 [1:37:30<12:13:54, 5.68s/it] 11%|█▏ | 994/8750 [1:37:36<12:13:57, 5.68s/it] {'loss': 0.4998, 'learning_rate': 1.9636131397727646e-05, 'epoch': 0.11} + 11%|█▏ | 994/8750 [1:37:36<12:13:57, 5.68s/it] {'loss': 0.4998, 'learning_rate': 1.9636131397727646e-05, 'epoch': 0.11} + 11%|█▏ | 994/8750 [1:37:30<12:13:54, 5.68s/it] 11%|█▏ | 995/8750 [1:37:42<12:13:14, 5.67s/it] 11%|█▏ | 995/8750 [1:37:35<12:13:14, 5.67s/it] {'loss': 0.4702, 'learning_rate': 1.9635141282843105e-05, 'epoch': 0.11} + 11%|█▏ | 995/8750 [1:37:42<12:13:14, 5.67s/it] {'loss': 0.4702, 'learning_rate': 1.9635141282843105e-05, 'epoch': 0.11} + 11%|█▏ | 995/8750 [1:37:35<12:13:14, 5.67s/it] 11%|█▏ | 996/8750 [1:37:48<12:15:33, 5.69s/it] 11%|█▏ | 996/8750 [1:37:41<12:15:33, 5.69s/it] {'loss': 0.4997, 'learning_rate': 1.9634149847729093e-05, 'epoch': 0.11} + 11%|█▏ | 996/8750 [1:37:48<12:15:33, 5.69s/it] {'loss': 0.4997, 'learning_rate': 1.9634149847729093e-05, 'epoch': 0.11} + 11%|█▏ | 996/8750 [1:37:41<12:15:33, 5.69s/it] 11%|█▏ | 997/8750 [1:37:53<12:17:07, 5.70s/it] 11%|█▏ | 997/8750 [1:37:47<12:17:07, 5.70s/it] {'loss': 0.4942, 'learning_rate': 1.963315709252146e-05, 'epoch': 0.11} + 11%|█▏ | 997/8750 [1:37:53<12:17:07, 5.70s/it] {'loss': 0.4942, 'learning_rate': 1.963315709252146e-05, 'epoch': 0.11} + 11%|█▏ | 
997/8750 [1:37:47<12:17:07, 5.70s/it] 11%|█▏ | 998/8750 [1:37:59<12:15:31, 5.69s/it] 11%|█▏ | 998/8750 [1:37:53<12:15:31, 5.69s/it] {'loss': 0.5002, 'learning_rate': 1.963216301735623e-05, 'epoch': 0.11} + 11%|█▏ | 998/8750 [1:37:59<12:15:31, 5.69s/it] {'loss': 0.5002, 'learning_rate': 1.963216301735623e-05, 'epoch': 0.11} + 11%|█▏ | 998/8750 [1:37:53<12:15:31, 5.69s/it] 11%|█▏ | 999/8750 [1:38:05<12:11:11, 5.66s/it] 11%|█▏ | 999/8750 [1:37:58<12:11:11, 5.66s/it] {'loss': 0.5039, 'learning_rate': 1.9631167622369617e-05, 'epoch': 0.11} + 11%|█▏ | 999/8750 [1:38:05<12:11:11, 5.66s/it] {'loss': 0.5039, 'learning_rate': 1.9631167622369617e-05, 'epoch': 0.11} + 11%|█▏ | 999/8750 [1:37:58<12:11:11, 5.66s/it]3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 11%|█▏ | 1000/8750 [1:38:10<12:09:18, 5.65s/it]12 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 11%|█▏ | 1000/8750 [1:38:04<12:09:18, 5.65s/it]14 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.497, 'learning_rate': 1.9630170907698015e-05, 'epoch': 0.11} + 11%|█▏ | 1000/8750 [1:38:10<12:09:18, 5.65s/it] {'loss': 0.497, 'learning_rate': 1.9630170907698015e-05, 'epoch': 0.11} + 11%|█▏ | 1000/8750 [1:38:04<12:09:18, 5.65s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 11%|█▏ | 1001/8750 [1:38:32<22:34:26, 10.49s/it] 11%|█▏ | 1001/8750 [1:38:26<22:34:26, 10.49s/it] {'loss': 0.4884, 'learning_rate': 1.9629172873477995e-05, 'epoch': 0.11} + 11%|█▏ | 1001/8750 [1:38:32<22:34:26, 10.49s/it] {'loss': 0.4884, 'learning_rate': 1.9629172873477995e-05, 'epoch': 0.11} + 11%|█▏ | 1001/8750 [1:38:26<22:34:26, 10.49s/it] 11%|█▏ | 1002/8750 [1:38:31<19:24:47, 9.02s/it] 11%|█▏ | 1002/8750 [1:38:38<19:24:48, 9.02s/it] {'loss': 0.4728, 'learning_rate': 1.9628173519846308e-05, 'epoch': 0.11} + 11%|█▏ | 1002/8750 [1:38:38<19:24:48, 9.02s/it] {'loss': 0.4728, 'learning_rate': 1.9628173519846308e-05, 'epoch': 0.11} + 11%|█▏ | 1002/8750 [1:38:31<19:24:47, 9.02s/it] 11%|█▏ | 1003/8750 [1:38:43<17:11:39, 7.99s/it] 11%|█▏ | 1003/8750 [1:38:37<17:11:40, 7.99s/it] {'loss': 0.5004, 'learning_rate': 1.9627172846939886e-05, 'epoch': 0.11} + 11%|█▏ | 1003/8750 [1:38:43<17:11:39, 7.99s/it] {'loss': 0.5004, 'learning_rate': 1.9627172846939886e-05, 'epoch': 0.11} + 11%|█▏ | 1003/8750 [1:38:37<17:11:40, 7.99s/it] 11%|█▏ | 1004/8750 [1:38:42<15:44:11, 7.31s/it] 11%|█▏ | 1004/8750 [1:38:49<15:44:12, 7.31s/it] {'loss': 0.4989, 'learning_rate': 1.962617085489585e-05, 'epoch': 0.11} + 11%|█▏ | 1004/8750 [1:38:49<15:44:12, 7.31s/it] {'loss': 0.4989, 'learning_rate': 1.962617085489585e-05, 'epoch': 0.11} + 11%|█▏ | 1004/8750 [1:38:43<15:44:11, 7.31s/it] 11%|█▏ | 1005/8750 [1:38:55<14:43:29, 6.84s/it] 11%|█▏ | 1005/8750 [1:38:48<14:43:29, 6.84s/it] {'loss': 0.4877, 'learning_rate': 1.962516754385149e-05, 'epoch': 0.11} + 11%|█▏ | 1005/8750 [1:38:55<14:43:29, 6.84s/it] {'loss': 0.4877, 'learning_rate': 1.962516754385149e-05, 'epoch': 0.11} + 11%|█▏ | 1005/8750 [1:38:48<14:43:29, 6.84s/it] 11%|█▏ | 1006/8750 [1:39:00<13:52:25, 6.45s/it] 11%|█▏ | 1006/8750 [1:38:54<13:52:25, 6.45s/it] {'loss': 0.4992, 'learning_rate': 1.962416291394428e-05, 'epoch': 0.11} + 11%|█▏ | 1006/8750 [1:39:00<13:52:25, 6.45s/it] {'loss': 0.4992, 'learning_rate': 
1.962416291394428e-05, 'epoch': 0.11} + 11%|█▏ | 1006/8750 [1:38:54<13:52:25, 6.45s/it] 12%|█▏ | 1007/8750 [1:39:06<13:34:12, 6.31s/it] 12%|█▏ | 1007/8750 [1:39:00<13:34:13, 6.31s/it] {'loss': 0.4895, 'learning_rate': 1.9623156965311884e-05, 'epoch': 0.12} + 12%|█▏ | 1007/8750 [1:39:06<13:34:12, 6.31s/it] {'loss': 0.4895, 'learning_rate': 1.9623156965311884e-05, 'epoch': 0.12} + 12%|█▏ | 1007/8750 [1:39:00<13:34:13, 6.31s/it] 12%|█▏ | 1008/8750 [1:39:12<13:21:12, 6.21s/it] 12%|█▏ | 1008/8750 [1:39:06<13:21:11, 6.21s/it] {'loss': 0.4922, 'learning_rate': 1.9622149698092135e-05, 'epoch': 0.12} + 12%|█▏ | 1008/8750 [1:39:12<13:21:12, 6.21s/it] {'loss': 0.4922, 'learning_rate': 1.9622149698092135e-05, 'epoch': 0.12} + 12%|█▏ | 1008/8750 [1:39:06<13:21:11, 6.21s/it] 12%|█▏ | 1009/8750 [1:39:18<12:56:38, 6.02s/it] 12%|█▏ | 1009/8750 [1:39:11<12:56:38, 6.02s/it] {'loss': 0.5085, 'learning_rate': 1.962114111242305e-05, 'epoch': 0.12} + 12%|█▏ | 1009/8750 [1:39:18<12:56:38, 6.02s/it] {'loss': 0.5085, 'learning_rate': 1.962114111242305e-05, 'epoch': 0.12} + 12%|█▏ | 1009/8750 [1:39:11<12:56:38, 6.02s/it] 12%|█▏ | 1010/8750 [1:39:17<12:46:44, 5.94s/it] 12%|█▏ | 1010/8750 [1:39:24<12:46:45, 5.94s/it] {'loss': 0.473, 'learning_rate': 1.962013120844283e-05, 'epoch': 0.12} + 12%|█▏ | 1010/8750 [1:39:24<12:46:45, 5.94s/it] {'loss': 0.473, 'learning_rate': 1.962013120844283e-05, 'epoch': 0.12} + 12%|█▏ | 1010/8750 [1:39:17<12:46:44, 5.94s/it] 12%|█▏ | 1011/8750 [1:39:29<12:40:30, 5.90s/it] 12%|█▏ | 1011/8750 [1:39:23<12:40:31, 5.90s/it] {'loss': 0.4942, 'learning_rate': 1.9619119986289855e-05, 'epoch': 0.12} + 12%|█▏ | 1011/8750 [1:39:29<12:40:30, 5.90s/it] {'loss': 0.4942, 'learning_rate': 1.9619119986289855e-05, 'epoch': 0.12} + 12%|█▏ | 1011/8750 [1:39:23<12:40:31, 5.90s/it] 12%|█▏ | 1012/8750 [1:39:35<12:24:39, 5.77s/it] 12%|█▏ | 1012/8750 [1:39:28<12:24:39, 5.77s/it] {'loss': 0.4962, 'learning_rate': 1.9618107446102682e-05, 'epoch': 0.12} + 12%|█▏ | 1012/8750 
[1:39:35<12:24:39, 5.77s/it] {'loss': 0.4962, 'learning_rate': 1.9618107446102682e-05, 'epoch': 0.12} + 12%|█▏ | 1012/8750 [1:39:28<12:24:39, 5.77s/it] 12%|█▏ | 1013/8750 [1:39:40<12:18:33, 5.73s/it] 12%|█▏ | 1013/8750 [1:39:34<12:18:33, 5.73s/it] {'loss': 0.4859, 'learning_rate': 1.9617093588020057e-05, 'epoch': 0.12} + 12%|█▏ | 1013/8750 [1:39:40<12:18:33, 5.73s/it] {'loss': 0.4859, 'learning_rate': 1.9617093588020057e-05, 'epoch': 0.12} + 12%|█▏ | 1013/8750 [1:39:34<12:18:33, 5.73s/it] 12%|█▏ | 1014/8750 [1:39:40<12:26:53, 5.79s/it] 12%|█▏ | 1014/8750 [1:39:46<12:26:53, 5.79s/it] {'loss': 0.4819, 'learning_rate': 1.9616078412180896e-05, 'epoch': 0.12} + 12%|█▏ | 1014/8750 [1:39:46<12:26:53, 5.79s/it] {'loss': 0.4819, 'learning_rate': 1.9616078412180896e-05, 'epoch': 0.12} + 12%|█▏ | 1014/8750 [1:39:40<12:26:53, 5.79s/it] 12%|█▏ | 1015/8750 [1:39:52<12:19:24, 5.74s/it] 12%|█▏ | 1015/8750 [1:39:46<12:19:25, 5.74s/it] {'loss': 0.4826, 'learning_rate': 1.96150619187243e-05, 'epoch': 0.12} + 12%|█▏ | 1015/8750 [1:39:52<12:19:24, 5.74s/it] {'loss': 0.4826, 'learning_rate': 1.96150619187243e-05, 'epoch': 0.12} + 12%|█▏ | 1015/8750 [1:39:46<12:19:25, 5.74s/it] 12%|█▏ | 1016/8750 [1:39:58<12:11:54, 5.68s/it] 12%|█▏ | 1016/8750 [1:39:51<12:11:53, 5.68s/it] {'loss': 0.5166, 'learning_rate': 1.9614044107789553e-05, 'epoch': 0.12} + 12%|█▏ | 1016/8750 [1:39:58<12:11:54, 5.68s/it] {'loss': 0.5166, 'learning_rate': 1.9614044107789553e-05, 'epoch': 0.12} + 12%|█▏ | 1016/8750 [1:39:51<12:11:53, 5.68s/it] 12%|█▏ | 1017/8750 [1:40:03<12:09:22, 5.66s/it] 12%|█▏ | 1017/8750 [1:39:57<12:09:22, 5.66s/it] {'loss': 0.4963, 'learning_rate': 1.9613024979516123e-05, 'epoch': 0.12} + 12%|█▏ | 1017/8750 [1:40:03<12:09:22, 5.66s/it] {'loss': 0.4963, 'learning_rate': 1.9613024979516123e-05, 'epoch': 0.12} + 12%|█▏ | 1017/8750 [1:39:57<12:09:22, 5.66s/it] 12%|█▏ | 1018/8750 [1:40:02<12:06:47, 5.64s/it] 12%|█▏ | 1018/8750 [1:40:09<12:06:48, 5.64s/it] {'loss': 0.4796, 'learning_rate': 
1.9612004534043644e-05, 'epoch': 0.12} + 12%|█▏ | 1018/8750 [1:40:09<12:06:48, 5.64s/it] {'loss': 0.4796, 'learning_rate': 1.9612004534043644e-05, 'epoch': 0.12} + 12%|█▏ | 1018/8750 [1:40:02<12:06:47, 5.64s/it] 12%|█▏ | 1019/8750 [1:40:14<12:08:13, 5.65s/it] 12%|█▏ | 1019/8750 [1:40:08<12:08:13, 5.65s/it] {'loss': 0.4787, 'learning_rate': 1.9610982771511947e-05, 'epoch': 0.12} + 12%|█▏ | 1019/8750 [1:40:14<12:08:13, 5.65s/it] {'loss': 0.4787, 'learning_rate': 1.9610982771511947e-05, 'epoch': 0.12} + 12%|█▏ | 1019/8750 [1:40:08<12:08:13, 5.65s/it] 12%|█▏ | 1020/8750 [1:40:20<12:02:29, 5.61s/it] 12%|█▏ | 1020/8750 [1:40:13<12:02:29, 5.61s/it] {'loss': 0.4938, 'learning_rate': 1.9609959692061037e-05, 'epoch': 0.12} + 12%|█▏ | 1020/8750 [1:40:20<12:02:29, 5.61s/it] {'loss': 0.4938, 'learning_rate': 1.9609959692061037e-05, 'epoch': 0.12} + 12%|█▏ | 1020/8750 [1:40:13<12:02:29, 5.61s/it] 12%|█▏ | 1021/8750 [1:40:19<12:11:29, 5.68s/it] 12%|█▏ | 1021/8750 [1:40:26<12:11:31, 5.68s/it] {'loss': 0.48, 'learning_rate': 1.9608935295831092e-05, 'epoch': 0.12} + 12%|█▏ | 1021/8750 [1:40:26<12:11:31, 5.68s/it] {'loss': 0.48, 'learning_rate': 1.9608935295831092e-05, 'epoch': 0.12} + 12%|█▏ | 1021/8750 [1:40:19<12:11:29, 5.68s/it] 12%|█▏ | 1022/8750 [1:40:32<12:15:16, 5.71s/it] 12%|█▏ | 1022/8750 [1:40:25<12:15:18, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.9607909582962478e-05, 'epoch': 0.12} + 12%|█▏ | 1022/8750 [1:40:32<12:15:16, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.9607909582962478e-05, 'epoch': 0.12} + 12%|█▏ | 1022/8750 [1:40:25<12:15:18, 5.71s/it] 12%|█▏ | 1023/8750 [1:40:37<12:13:33, 5.70s/it] 12%|█▏ | 1023/8750 [1:40:31<12:13:33, 5.70s/it] {'loss': 0.4893, 'learning_rate': 1.9606882553595748e-05, 'epoch': 0.12} + 12%|█▏ | 1023/8750 [1:40:37<12:13:33, 5.70s/it] {'loss': 0.4893, 'learning_rate': 1.9606882553595748e-05, 'epoch': 0.12} + 12%|█▏ | 1023/8750 [1:40:31<12:13:33, 5.70s/it] 12%|█▏ | 1024/8750 [1:40:43<12:13:19, 5.69s/it] 12%|█▏ | 1024/8750 
[1:40:36<12:13:19, 5.69s/it] {'loss': 0.4854, 'learning_rate': 1.960585420787162e-05, 'epoch': 0.12} + 12%|█▏ | 1024/8750 [1:40:43<12:13:19, 5.69s/it] {'loss': 0.4854, 'learning_rate': 1.960585420787162e-05, 'epoch': 0.12} + 12%|█▏ | 1024/8750 [1:40:36<12:13:19, 5.69s/it] 12%|█▏ | 1025/8750 [1:40:49<12:13:40, 5.70s/it] 12%|█▏ | 1025/8750 [1:40:42<12:13:40, 5.70s/it] {'loss': 0.5115, 'learning_rate': 1.9604824545931005e-05, 'epoch': 0.12} + 12%|█▏ | 1025/8750 [1:40:49<12:13:40, 5.70s/it] {'loss': 0.5115, 'learning_rate': 1.9604824545931005e-05, 'epoch': 0.12} + 12%|█▏ | 1025/8750 [1:40:42<12:13:40, 5.70s/it] 12%|█▏ | 1026/8750 [1:40:54<12:11:38, 5.68s/it] 12%|█▏ | 1026/8750 [1:40:48<12:11:38, 5.68s/it] {'loss': 0.4804, 'learning_rate': 1.960379356791499e-05, 'epoch': 0.12} + 12%|█▏ | 1026/8750 [1:40:54<12:11:38, 5.68s/it] {'loss': 0.4804, 'learning_rate': 1.960379356791499e-05, 'epoch': 0.12} + 12%|█▏ | 1026/8750 [1:40:48<12:11:38, 5.68s/it] 12%|█▏ | 1027/8750 [1:41:00<12:11:03, 5.68s/it] 12%|█▏ | 1027/8750 [1:40:53<12:11:04, 5.68s/it] {'loss': 0.4954, 'learning_rate': 1.960276127396484e-05, 'epoch': 0.12} + 12%|█▏ | 1027/8750 [1:41:00<12:11:03, 5.68s/it] {'loss': 0.4954, 'learning_rate': 1.960276127396484e-05, 'epoch': 0.12} + 12%|█▏ | 1027/8750 [1:40:53<12:11:04, 5.68s/it] 12%|█▏ | 1028/8750 [1:41:06<12:10:19, 5.67s/it] 12%|█▏ | 1028/8750 [1:40:59<12:10:19, 5.67s/it] {'loss': 0.4761, 'learning_rate': 1.9601727664222e-05, 'epoch': 0.12} + 12%|█▏ | 1028/8750 [1:41:06<12:10:19, 5.67s/it] {'loss': 0.4761, 'learning_rate': 1.9601727664222e-05, 'epoch': 0.12} + 12%|█▏ | 1028/8750 [1:40:59<12:10:19, 5.67s/it] 12%|█▏ | 1029/8750 [1:41:11<12:13:26, 5.70s/it] 12%|█▏ | 1029/8750 [1:41:05<12:13:26, 5.70s/it] {'loss': 0.5093, 'learning_rate': 1.96006927388281e-05, 'epoch': 0.12} + 12%|█▏ | 1029/8750 [1:41:11<12:13:26, 5.70s/it] {'loss': 0.5093, 'learning_rate': 1.96006927388281e-05, 'epoch': 0.12} + 12%|█▏ | 1029/8750 [1:41:05<12:13:26, 5.70s/it] 12%|█▏ | 1030/8750 
[1:41:17<12:16:41, 5.73s/it] 12%|█▏ | 1030/8750 [1:41:11<12:16:41, 5.73s/it] {'loss': 0.4766, 'learning_rate': 1.959965649792495e-05, 'epoch': 0.12} + 12%|█▏ | 1030/8750 [1:41:17<12:16:41, 5.73s/it] {'loss': 0.4766, 'learning_rate': 1.959965649792495e-05, 'epoch': 0.12} + 12%|█▏ | 1030/8750 [1:41:11<12:16:41, 5.73s/it] 12%|█▏ | 1031/8750 [1:41:23<12:27:33, 5.81s/it] 12%|█▏ | 1031/8750 [1:41:17<12:27:33, 5.81s/it] {'loss': 0.4788, 'learning_rate': 1.9598618941654535e-05, 'epoch': 0.12} + 12%|█▏ | 1031/8750 [1:41:23<12:27:33, 5.81s/it] {'loss': 0.4788, 'learning_rate': 1.9598618941654535e-05, 'epoch': 0.12} + 12%|█▏ | 1031/8750 [1:41:17<12:27:33, 5.81s/it] 12%|█▏ | 1032/8750 [1:41:22<12:24:21, 5.79s/it] 12%|█▏ | 1032/8750 [1:41:29<12:24:22, 5.79s/it] {'loss': 0.4934, 'learning_rate': 1.9597580070159026e-05, 'epoch': 0.12} + 12%|█▏ | 1032/8750 [1:41:29<12:24:22, 5.79s/it] {'loss': 0.4934, 'learning_rate': 1.9597580070159026e-05, 'epoch': 0.12} + 12%|█▏ | 1032/8750 [1:41:22<12:24:21, 5.79s/it] 12%|█▏ | 1033/8750 [1:41:35<12:21:25, 5.76s/it] 12%|█▏ | 1033/8750 [1:41:28<12:21:27, 5.76s/it] {'loss': 0.4993, 'learning_rate': 1.9596539883580773e-05, 'epoch': 0.12} + 12%|█▏ | 1033/8750 [1:41:35<12:21:25, 5.76s/it] {'loss': 0.4993, 'learning_rate': 1.9596539883580773e-05, 'epoch': 0.12} + 12%|█▏ | 1033/8750 [1:41:28<12:21:27, 5.76s/it] 12%|█▏ | 1034/8750 [1:41:40<12:23:01, 5.78s/it] 12%|█▏ | 1034/8750 [1:41:34<12:23:01, 5.78s/it] {'loss': 0.4967, 'learning_rate': 1.9595498382062295e-05, 'epoch': 0.12} + 12%|█▏ | 1034/8750 [1:41:40<12:23:01, 5.78s/it] {'loss': 0.4967, 'learning_rate': 1.9595498382062295e-05, 'epoch': 0.12} + 12%|█▏ | 1034/8750 [1:41:34<12:23:01, 5.78s/it] 12%|█▏ | 1035/8750 [1:41:46<12:17:20, 5.73s/it] 12%|█▏ | 1035/8750 [1:41:40<12:17:21, 5.73s/it] {'loss': 0.5108, 'learning_rate': 1.9594455565746313e-05, 'epoch': 0.12} + 12%|█▏ | 1035/8750 [1:41:46<12:17:20, 5.73s/it] {'loss': 0.5108, 'learning_rate': 1.9594455565746313e-05, 'epoch': 0.12} + 12%|█▏ | 
1035/8750 [1:41:40<12:17:21, 5.73s/it] 12%|█▏ | 1036/8750 [1:41:45<12:08:37, 5.67s/it] 12%|█▏ | 1036/8750 [1:41:52<12:08:38, 5.67s/it] {'loss': 0.482, 'learning_rate': 1.959341143477571e-05, 'epoch': 0.12} + 12%|█▏ | 1036/8750 [1:41:52<12:08:38, 5.67s/it] {'loss': 0.482, 'learning_rate': 1.959341143477571e-05, 'epoch': 0.12} + 12%|█▏ | 1036/8750 [1:41:45<12:08:37, 5.67s/it] 12%|█▏ | 1037/8750 [1:41:57<12:11:55, 5.69s/it] 12%|█▏ | 1037/8750 [1:41:51<12:11:55, 5.69s/it] {'loss': 0.4744, 'learning_rate': 1.9592365989293557e-05, 'epoch': 0.12} + 12%|█▏ | 1037/8750 [1:41:57<12:11:55, 5.69s/it] {'loss': 0.4744, 'learning_rate': 1.9592365989293557e-05, 'epoch': 0.12} + 12%|█▏ | 1037/8750 [1:41:51<12:11:55, 5.69s/it] 12%|█▏ | 1038/8750 [1:42:03<12:04:30, 5.64s/it] 12%|█▏ | 1038/8750 [1:41:56<12:04:30, 5.64s/it] {'loss': 0.5033, 'learning_rate': 1.95913192294431e-05, 'epoch': 0.12} + 12%|█▏ | 1038/8750 [1:42:03<12:04:30, 5.64s/it] {'loss': 0.5033, 'learning_rate': 1.95913192294431e-05, 'epoch': 0.12} + 12%|█▏ | 1038/8750 [1:41:56<12:04:30, 5.64s/it] 12%|█▏ | 1039/8750 [1:42:09<12:09:05, 5.67s/it] 12%|█▏ | 1039/8750 [1:42:02<12:09:05, 5.67s/it] {'loss': 0.4749, 'learning_rate': 1.9590271155367776e-05, 'epoch': 0.12} + 12%|█▏ | 1039/8750 [1:42:09<12:09:05, 5.67s/it] {'loss': 0.4749, 'learning_rate': 1.9590271155367776e-05, 'epoch': 0.12} + 12%|█▏ | 1039/8750 [1:42:02<12:09:05, 5.67s/it] 12%|█▏ | 1040/8750 [1:42:14<12:12:36, 5.70s/it] 12%|█▏ | 1040/8750 [1:42:08<12:12:35, 5.70s/it] {'loss': 0.489, 'learning_rate': 1.9589221767211188e-05, 'epoch': 0.12} + 12%|█▏ | 1040/8750 [1:42:14<12:12:36, 5.70s/it] {'loss': 0.489, 'learning_rate': 1.9589221767211188e-05, 'epoch': 0.12} + 12%|█▏ | 1040/8750 [1:42:08<12:12:35, 5.70s/it] 12%|█▏ | 1041/8750 [1:42:20<12:06:16, 5.65s/it] 12%|█▏ | 1041/8750 [1:42:13<12:06:15, 5.65s/it] {'loss': 0.4956, 'learning_rate': 1.9588171065117122e-05, 'epoch': 0.12} + 12%|█▏ | 1041/8750 [1:42:20<12:06:16, 5.65s/it] {'loss': 0.4956, 'learning_rate': 
1.9588171065117122e-05, 'epoch': 0.12} + 12%|█▏ | 1041/8750 [1:42:13<12:06:15, 5.65s/it] 12%|█▏ | 1042/8750 [1:42:25<12:04:02, 5.64s/it] 12%|█▏ | 1042/8750 [1:42:19<12:04:02, 5.64s/it] {'loss': 0.4975, 'learning_rate': 1.9587119049229558e-05, 'epoch': 0.12} + 12%|█▏ | 1042/8750 [1:42:25<12:04:02, 5.64s/it] {'loss': 0.4975, 'learning_rate': 1.9587119049229558e-05, 'epoch': 0.12} + 12%|█▏ | 1042/8750 [1:42:19<12:04:02, 5.64s/it] 12%|█▏ | 1043/8750 [1:42:31<12:13:35, 5.71s/it] 12%|█▏ | 1043/8750 [1:42:25<12:13:35, 5.71s/it] {'loss': 0.5006, 'learning_rate': 1.9586065719692636e-05, 'epoch': 0.12} + 12%|█▏ | 1043/8750 [1:42:31<12:13:35, 5.71s/it] {'loss': 0.5006, 'learning_rate': 1.9586065719692636e-05, 'epoch': 0.12} + 12%|█▏ | 1043/8750 [1:42:25<12:13:35, 5.71s/it] 12%|█▏ | 1044/8750 [1:42:37<12:14:47, 5.72s/it] 12%|█▏ | 1044/8750 [1:42:31<12:14:47, 5.72s/it] {'loss': 0.5086, 'learning_rate': 1.9585011076650695e-05, 'epoch': 0.12} + 12%|█▏ | 1044/8750 [1:42:37<12:14:47, 5.72s/it] {'loss': 0.5086, 'learning_rate': 1.9585011076650695e-05, 'epoch': 0.12} + 12%|█▏ | 1044/8750 [1:42:31<12:14:47, 5.72s/it] 12%|█▏ | 1045/8750 [1:42:43<12:24:19, 5.80s/it] 12%|█▏ | 1045/8750 [1:42:37<12:24:19, 5.80s/it] {'loss': 0.4902, 'learning_rate': 1.958395512024824e-05, 'epoch': 0.12} + 12%|█▏ | 1045/8750 [1:42:43<12:24:19, 5.80s/it] {'loss': 0.4902, 'learning_rate': 1.958395512024824e-05, 'epoch': 0.12} + 12%|█▏ | 1045/8750 [1:42:37<12:24:19, 5.80s/it] 12%|█▏ | 1046/8750 [1:42:49<12:23:18, 5.79s/it] 12%|█▏ | 1046/8750 [1:42:42<12:23:18, 5.79s/it] {'loss': 0.4795, 'learning_rate': 1.9582897850629958e-05, 'epoch': 0.12} + 12%|█▏ | 1046/8750 [1:42:49<12:23:18, 5.79s/it] {'loss': 0.4795, 'learning_rate': 1.9582897850629958e-05, 'epoch': 0.12} + 12%|█▏ | 1046/8750 [1:42:42<12:23:18, 5.79s/it] 12%|█▏ | 1047/8750 [1:42:55<12:24:39, 5.80s/it] 12%|█▏ | 1047/8750 [1:42:48<12:24:39, 5.80s/it] {'loss': 0.4852, 'learning_rate': 1.9581839267940722e-05, 'epoch': 0.12} + 12%|█▏ | 1047/8750 
[1:42:55<12:24:39, 5.80s/it] {'loss': 0.4852, 'learning_rate': 1.9581839267940722e-05, 'epoch': 0.12} + 12%|█▏ | 1047/8750 [1:42:48<12:24:39, 5.80s/it] 12%|█▏ | 1048/8750 [1:43:00<12:19:32, 5.76s/it] 12%|█▏ | 1048/8750 [1:42:54<12:19:32, 5.76s/it] {'loss': 0.4886, 'learning_rate': 1.9580779372325583e-05, 'epoch': 0.12} + 12%|█▏ | 1048/8750 [1:43:00<12:19:32, 5.76s/it] {'loss': 0.4886, 'learning_rate': 1.9580779372325583e-05, 'epoch': 0.12} + 12%|█▏ | 1048/8750 [1:42:54<12:19:32, 5.76s/it] 12%|█▏ | 1049/8750 [1:43:06<12:20:19, 5.77s/it] 12%|█▏ | 1049/8750 [1:43:00<12:20:19, 5.77s/it] {'loss': 0.4913, 'learning_rate': 1.9579718163929767e-05, 'epoch': 0.12} + 12%|█▏ | 1049/8750 [1:43:06<12:20:19, 5.77s/it] {'loss': 0.4913, 'learning_rate': 1.9579718163929767e-05, 'epoch': 0.12} + 12%|█▏ | 1049/8750 [1:43:00<12:20:19, 5.77s/it]4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... + 12%|█▏ | 1050/8750 [1:43:12<12:12:57, 5.71s/it]13 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...11 AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... + 12%|█▏ | 1050/8750 [1:43:05<12:12:58, 5.71s/it]10 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.5032, 'learning_rate': 1.957865564289868e-05, 'epoch': 0.12} + 12%|█▏ | 1050/8750 [1:43:12<12:12:57, 5.71s/it] {'loss': 0.5032, 'learning_rate': 1.957865564289868e-05, 'epoch': 0.12} + 12%|█▏ | 1050/8750 [1:43:05<12:12:58, 5.71s/it] 12%|█▏ | 1051/8750 [1:43:18<12:20:05, 5.77s/it] 12%|█▏ | 1051/8750 [1:43:11<12:20:05, 5.77s/it] {'loss': 0.4787, 'learning_rate': 1.9577591809377917e-05, 'epoch': 0.12} + 12%|█▏ | 1051/8750 [1:43:18<12:20:05, 5.77s/it] {'loss': 0.4787, 'learning_rate': 1.9577591809377917e-05, 'epoch': 0.12} + 12%|█▏ | 1051/8750 [1:43:11<12:20:05, 5.77s/it] 12%|█▏ | 1052/8750 [1:43:17<12:13:16, 5.72s/it] 12%|█▏ | 1052/8750 [1:43:23<12:13:17, 5.72s/it] {'loss': 0.5013, 'learning_rate': 1.957652666351325e-05, 'epoch': 0.12} + 12%|█▏ | 1052/8750 [1:43:23<12:13:17, 5.72s/it] {'loss': 0.5013, 'learning_rate': 1.957652666351325e-05, 'epoch': 0.12} + 12%|█▏ | 1052/8750 [1:43:17<12:13:16, 5.72s/it] 12%|█▏ | 1053/8750 [1:43:29<12:08:07, 5.68s/it] 12%|█▏ | 1053/8750 [1:43:22<12:08:07, 5.68s/it] {'loss': 0.5499, 'learning_rate': 1.9575460205450616e-05, 'epoch': 0.12} + 12%|█▏ | 1053/8750 [1:43:29<12:08:07, 5.68s/it] {'loss': 0.5499, 'learning_rate': 1.9575460205450616e-05, 'epoch': 0.12} + 12%|█▏ | 1053/8750 [1:43:22<12:08:07, 5.68s/it] 12%|█▏ | 1054/8750 [1:43:35<12:09:52, 5.69s/it] 12%|█▏ | 1054/8750 [1:43:28<12:09:52, 5.69s/it] {'loss': 0.4768, 'learning_rate': 1.9574392435336156e-05, 'epoch': 0.12} + 12%|█▏ | 1054/8750 [1:43:35<12:09:52, 5.69s/it] {'loss': 0.4768, 'learning_rate': 1.9574392435336156e-05, 'epoch': 0.12} + 12%|█▏ | 1054/8750 [1:43:28<12:09:52, 5.69s/it] 12%|█▏ | 1055/8750 [1:43:40<12:11:52, 5.71s/it] 12%|█▏ | 1055/8750 [1:43:34<12:11:52, 5.71s/it] {'loss': 0.4963, 'learning_rate': 1.957332335331617e-05, 'epoch': 0.12} + 12%|█▏ | 1055/8750 [1:43:40<12:11:52, 5.71s/it] {'loss': 0.4963, 'learning_rate': 1.957332335331617e-05, 'epoch': 0.12} + 12%|█▏ | 1055/8750 [1:43:34<12:11:52, 5.71s/it] 12%|█▏ | 1056/8750 [1:43:46<12:28:49, 5.84s/it] 
12%|█▏ | 1056/8750 [1:43:40<12:28:49, 5.84s/it] {'loss': 0.4937, 'learning_rate': 1.957225295953715e-05, 'epoch': 0.12} + 12%|█▏ | 1056/8750 [1:43:46<12:28:49, 5.84s/it] {'loss': 0.4937, 'learning_rate': 1.957225295953715e-05, 'epoch': 0.12} + 12%|█▏ | 1056/8750 [1:43:40<12:28:49, 5.84s/it] 12%|█▏ | 1057/8750 [1:43:52<12:24:38, 5.81s/it] 12%|█▏ | 1057/8750 [1:43:46<12:24:39, 5.81s/it] {'loss': 0.4775, 'learning_rate': 1.9571181254145762e-05, 'epoch': 0.12} + 12%|█▏ | 1057/8750 [1:43:52<12:24:38, 5.81s/it] {'loss': 0.4775, 'learning_rate': 1.9571181254145762e-05, 'epoch': 0.12} + 12%|█▏ | 1057/8750 [1:43:46<12:24:39, 5.81s/it] 12%|█▏ | 1058/8750 [1:43:58<12:14:58, 5.73s/it] 12%|█▏ | 1058/8750 [1:43:51<12:14:57, 5.73s/it] {'loss': 0.5146, 'learning_rate': 1.9570108237288853e-05, 'epoch': 0.12} + 12%|█▏ | 1058/8750 [1:43:58<12:14:58, 5.73s/it] {'loss': 0.5146, 'learning_rate': 1.9570108237288853e-05, 'epoch': 0.12} + 12%|█▏ | 1058/8750 [1:43:51<12:14:57, 5.73s/it] 12%|█▏ | 1059/8750 [1:44:04<12:24:56, 5.81s/it] 12%|█▏ | 1059/8750 [1:43:57<12:24:56, 5.81s/it] {'loss': 0.4953, 'learning_rate': 1.9569033909113454e-05, 'epoch': 0.12} + 12%|█▏ | 1059/8750 [1:44:04<12:24:56, 5.81s/it] {'loss': 0.4953, 'learning_rate': 1.9569033909113454e-05, 'epoch': 0.12} + 12%|█▏ | 1059/8750 [1:43:57<12:24:56, 5.81s/it] 12%|█▏ | 1060/8750 [1:44:10<12:30:07, 5.85s/it] 12%|█▏ | 1060/8750 [1:44:03<12:30:06, 5.85s/it] {'loss': 0.4758, 'learning_rate': 1.9567958269766768e-05, 'epoch': 0.12} + 12%|█▏ | 1060/8750 [1:44:10<12:30:07, 5.85s/it] {'loss': 0.4758, 'learning_rate': 1.9567958269766768e-05, 'epoch': 0.12} + 12%|█▏ | 1060/8750 [1:44:03<12:30:06, 5.85s/it] 12%|█▏ | 1061/8750 [1:44:15<12:19:01, 5.77s/it] 12%|█▏ | 1061/8750 [1:44:09<12:19:01, 5.77s/it] {'loss': 0.5177, 'learning_rate': 1.9566881319396184e-05, 'epoch': 0.12} + 12%|█▏ | 1061/8750 [1:44:15<12:19:01, 5.77s/it] {'loss': 0.5177, 'learning_rate': 1.9566881319396184e-05, 'epoch': 0.12} + 12%|█▏ | 1061/8750 [1:44:09<12:19:01, 
5.77s/it] 12%|█▏ | 1062/8750 [1:44:21<12:14:31, 5.73s/it] 12%|█▏ | 1062/8750 [1:44:14<12:14:31, 5.73s/it] {'loss': 0.4715, 'learning_rate': 1.956580305814927e-05, 'epoch': 0.12} + 12%|█▏ | 1062/8750 [1:44:21<12:14:31, 5.73s/it] {'loss': 0.4715, 'learning_rate': 1.956580305814927e-05, 'epoch': 0.12} + 12%|█▏ | 1062/8750 [1:44:14<12:14:31, 5.73s/it] 12%|█▏ | 1063/8750 [1:44:27<12:17:03, 5.75s/it] 12%|█▏ | 1063/8750 [1:44:20<12:17:03, 5.75s/it] {'loss': 0.4987, 'learning_rate': 1.9564723486173766e-05, 'epoch': 0.12} + 12%|█▏ | 1063/8750 [1:44:27<12:17:03, 5.75s/it] {'loss': 0.4987, 'learning_rate': 1.9564723486173766e-05, 'epoch': 0.12} + 12%|█▏ | 1063/8750 [1:44:20<12:17:03, 5.75s/it] 12%|█▏ | 1064/8750 [1:44:32<12:16:15, 5.75s/it] 12%|█▏ | 1064/8750 [1:44:26<12:16:15, 5.75s/it] {'loss': 0.4819, 'learning_rate': 1.95636426036176e-05, 'epoch': 0.12} + 12%|█▏ | 1064/8750 [1:44:32<12:16:15, 5.75s/it] {'loss': 0.4819, 'learning_rate': 1.95636426036176e-05, 'epoch': 0.12} + 12%|█▏ | 1064/8750 [1:44:26<12:16:15, 5.75s/it] 12%|█▏ | 1065/8750 [1:44:39<12:30:26, 5.86s/it] 12%|█▏ | 1065/8750 [1:44:32<12:30:26, 5.86s/it] {'loss': 0.5034, 'learning_rate': 1.9562560410628883e-05, 'epoch': 0.12} + 12%|█▏ | 1065/8750 [1:44:39<12:30:26, 5.86s/it] {'loss': 0.5034, 'learning_rate': 1.9562560410628883e-05, 'epoch': 0.12} + 12%|█▏ | 1065/8750 [1:44:32<12:30:26, 5.86s/it] 12%|█▏ | 1066/8750 [1:44:44<12:25:17, 5.82s/it] 12%|█▏ | 1066/8750 [1:44:38<12:25:17, 5.82s/it] {'loss': 0.5026, 'learning_rate': 1.9561476907355886e-05, 'epoch': 0.12} + 12%|█▏ | 1066/8750 [1:44:44<12:25:17, 5.82s/it] {'loss': 0.5026, 'learning_rate': 1.9561476907355886e-05, 'epoch': 0.12} + 12%|█▏ | 1066/8750 [1:44:38<12:25:17, 5.82s/it] 12%|█▏ | 1067/8750 [1:44:50<12:19:52, 5.78s/it] 12%|█▏ | 1067/8750 [1:44:43<12:19:52, 5.78s/it] {'loss': 0.5056, 'learning_rate': 1.956039209394709e-05, 'epoch': 0.12} + 12%|█▏ | 1067/8750 [1:44:50<12:19:52, 5.78s/it] {'loss': 0.5056, 'learning_rate': 1.956039209394709e-05, 'epoch': 
0.12} + 12%|█▏ | 1067/8750 [1:44:43<12:19:52, 5.78s/it] 12%|█▏ | 1068/8750 [1:44:49<12:18:32, 5.77s/it] 12%|█▏ | 1068/8750 [1:44:56<12:18:33, 5.77s/it] {'loss': 0.4825, 'learning_rate': 1.9559305970551125e-05, 'epoch': 0.12} + 12%|█▏ | 1068/8750 [1:44:56<12:18:33, 5.77s/it] {'loss': 0.4825, 'learning_rate': 1.9559305970551125e-05, 'epoch': 0.12} + 12%|█▏ | 1068/8750 [1:44:49<12:18:32, 5.77s/it] 12%|█▏ | 1069/8750 [1:45:01<12:13:09, 5.73s/it] 12%|█▏ | 1069/8750 [1:44:55<12:13:09, 5.73s/it] {'loss': 0.515, 'learning_rate': 1.955821853731682e-05, 'epoch': 0.12} + 12%|█▏ | 1069/8750 [1:45:01<12:13:09, 5.73s/it] {'loss': 0.515, 'learning_rate': 1.955821853731682e-05, 'epoch': 0.12} + 12%|█▏ | 1069/8750 [1:44:55<12:13:09, 5.73s/it] 12%|█▏ | 1070/8750 [1:45:07<12:12:40, 5.72s/it] 12%|█▏ | 1070/8750 [1:45:01<12:12:40, 5.72s/it] {'loss': 0.4992, 'learning_rate': 1.955712979439318e-05, 'epoch': 0.12} + 12%|█▏ | 1070/8750 [1:45:07<12:12:40, 5.72s/it] {'loss': 0.4992, 'learning_rate': 1.955712979439318e-05, 'epoch': 0.12} + 12%|█▏ | 1070/8750 [1:45:01<12:12:40, 5.72s/it] 12%|█▏ | 1071/8750 [1:45:13<12:26:30, 5.83s/it] 12%|█▏ | 1071/8750 [1:45:07<12:26:30, 5.83s/it] {'loss': 0.483, 'learning_rate': 1.955603974192938e-05, 'epoch': 0.12} + 12%|█▏ | 1071/8750 [1:45:13<12:26:30, 5.83s/it] {'loss': 0.483, 'learning_rate': 1.955603974192938e-05, 'epoch': 0.12} + 12%|█▏ | 1071/8750 [1:45:07<12:26:30, 5.83s/it] 12%|█▏ | 1072/8750 [1:45:12<12:17:06, 5.76s/it] 12%|█▏ | 1072/8750 [1:45:19<12:17:06, 5.76s/it] {'loss': 0.493, 'learning_rate': 1.955494838007479e-05, 'epoch': 0.12} + 12%|█▏ | 1072/8750 [1:45:19<12:17:06, 5.76s/it] {'loss': 0.493, 'learning_rate': 1.955494838007479e-05, 'epoch': 0.12} + 12%|█▏ | 1072/8750 [1:45:12<12:17:06, 5.76s/it] 12%|█▏ | 1073/8750 [1:45:24<12:13:27, 5.73s/it] 12%|█▏ | 1073/8750 [1:45:18<12:13:27, 5.73s/it] {'loss': 0.4969, 'learning_rate': 1.9553855708978943e-05, 'epoch': 0.12} + 12%|█▏ | 1073/8750 [1:45:24<12:13:27, 5.73s/it] {'loss': 0.4969, 
'learning_rate': 1.9553855708978943e-05, 'epoch': 0.12} + 12%|█▏ | 1073/8750 [1:45:18<12:13:27, 5.73s/it] 12%|█▏ | 1074/8750 [1:45:30<12:08:05, 5.69s/it] 12%|█▏ | 1074/8750 [1:45:23<12:08:06, 5.69s/it] {'loss': 0.506, 'learning_rate': 1.9552761728791563e-05, 'epoch': 0.12} + 12%|█▏ | 1074/8750 [1:45:30<12:08:05, 5.69s/it] {'loss': 0.506, 'learning_rate': 1.9552761728791563e-05, 'epoch': 0.12} + 12%|█▏ | 1074/8750 [1:45:23<12:08:06, 5.69s/it] 12%|█▏ | 1075/8750 [1:45:36<12:20:28, 5.79s/it] 12%|█▏ | 1075/8750 [1:45:29<12:20:28, 5.79s/it] {'loss': 0.4959, 'learning_rate': 1.955166643966255e-05, 'epoch': 0.12} + 12%|█▏ | 1075/8750 [1:45:36<12:20:28, 5.79s/it] {'loss': 0.4959, 'learning_rate': 1.955166643966255e-05, 'epoch': 0.12} + 12%|█▏ | 1075/8750 [1:45:29<12:20:28, 5.79s/it] 12%|█▏ | 1076/8750 [1:45:35<12:17:42, 5.77s/it] 12%|█▏ | 1076/8750 [1:45:42<12:17:43, 5.77s/it] {'loss': 0.4879, 'learning_rate': 1.9550569841741984e-05, 'epoch': 0.12} + 12%|█▏ | 1076/8750 [1:45:42<12:17:43, 5.77s/it] {'loss': 0.4879, 'learning_rate': 1.9550569841741984e-05, 'epoch': 0.12} + 12%|█▏ | 1076/8750 [1:45:35<12:17:42, 5.77s/it] 12%|█▏ | 1077/8750 [1:45:41<12:13:51, 5.74s/it] 12%|█▏ | 1077/8750 [1:45:47<12:13:51, 5.74s/it] {'loss': 0.4908, 'learning_rate': 1.9549471935180123e-05, 'epoch': 0.12} + 12%|█▏ | 1077/8750 [1:45:47<12:13:51, 5.74s/it] {'loss': 0.4908, 'learning_rate': 1.9549471935180123e-05, 'epoch': 0.12} + 12%|█▏ | 1077/8750 [1:45:41<12:13:51, 5.74s/it] 12%|█▏ | 1078/8750 [1:45:53<12:11:05, 5.72s/it] 12%|█▏ | 1078/8750 [1:45:47<12:11:06, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.95483727201274e-05, 'epoch': 0.12} + 12%|█▏ | 1078/8750 [1:45:53<12:11:05, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.95483727201274e-05, 'epoch': 0.12} + 12%|█▏ | 1078/8750 [1:45:47<12:11:06, 5.72s/it] 12%|█▏ | 1079/8750 [1:45:59<12:10:49, 5.72s/it] 12%|█▏ | 1079/8750 [1:45:52<12:10:49, 5.72s/it] {'loss': 0.5054, 'learning_rate': 1.9547272196734436e-05, 'epoch': 0.12} + 12%|█▏ | 1079/8750 
[1:45:59<12:10:49, 5.72s/it] {'loss': 0.5054, 'learning_rate': 1.9547272196734436e-05, 'epoch': 0.12} + 12%|█▏ | 1079/8750 [1:45:52<12:10:49, 5.72s/it] 12%|█▏ | 1080/8750 [1:46:05<12:11:55, 5.73s/it] 12%|█▏ | 1080/8750 [1:45:58<12:11:55, 5.73s/it] {'loss': 0.4997, 'learning_rate': 1.954617036515203e-05, 'epoch': 0.12} + 12%|█▏ | 1080/8750 [1:46:05<12:11:55, 5.73s/it] {'loss': 0.4997, 'learning_rate': 1.954617036515203e-05, 'epoch': 0.12} + 12%|█▏ | 1080/8750 [1:45:58<12:11:55, 5.73s/it] 12%|█▏ | 1081/8750 [1:46:10<12:21:15, 5.80s/it] 12%|█▏ | 1081/8750 [1:46:04<12:21:15, 5.80s/it] {'loss': 0.5026, 'learning_rate': 1.9545067225531155e-05, 'epoch': 0.12} + 12%|█▏ | 1081/8750 [1:46:10<12:21:15, 5.80s/it] {'loss': 0.5026, 'learning_rate': 1.9545067225531155e-05, 'epoch': 0.12} + 12%|█▏ | 1081/8750 [1:46:04<12:21:15, 5.80s/it] 12%|█▏ | 1082/8750 [1:46:16<12:13:01, 5.74s/it] 12%|█▏ | 1082/8750 [1:46:10<12:13:01, 5.74s/it] {'loss': 0.5015, 'learning_rate': 1.954396277802296e-05, 'epoch': 0.12} + {'loss': 0.5015, 'learning_rate': 1.954396277802296e-05, 'epoch': 0.12} 12%|█▏ | 1082/8750 [1:46:16<12:13:01, 5.74s/it] + 12%|█▏ | 1082/8750 [1:46:10<12:13:01, 5.74s/it] 12%|█▏ | 1083/8750 [1:46:22<12:09:19, 5.71s/it] 12%|█▏ | 1083/8750 [1:46:15<12:09:19, 5.71s/it] {'loss': 0.4918, 'learning_rate': 1.954285702277879e-05, 'epoch': 0.12} + 12%|█▏ | 1083/8750 [1:46:15<12:09:19, 5.71s/it] {'loss': 0.4918, 'learning_rate': 1.954285702277879e-05, 'epoch': 0.12} + 12%|█▏ | 1083/8750 [1:46:22<12:09:19, 5.71s/it] 12%|█▏ | 1084/8750 [1:46:21<12:10:02, 5.71s/it] 12%|█▏ | 1084/8750 [1:46:27<12:10:02, 5.71s/it] {'loss': 0.4966, 'learning_rate': 1.954174995995015e-05, 'epoch': 0.12} + 12%|█▏ | 1084/8750 [1:46:27<12:10:02, 5.71s/it] {'loss': 0.4966, 'learning_rate': 1.954174995995015e-05, 'epoch': 0.12} + 12%|█▏ | 1084/8750 [1:46:21<12:10:02, 5.71s/it] 12%|█▏ | 1085/8750 [1:46:33<12:12:16, 5.73s/it] 12%|█▏ | 1085/8750 [1:46:27<12:12:16, 5.73s/it] {'loss': 0.4972, 'learning_rate': 
1.9540641589688735e-05, 'epoch': 0.12} + 12%|█▏ | 1085/8750 [1:46:33<12:12:16, 5.73s/it] {'loss': 0.4972, 'learning_rate': 1.9540641589688735e-05, 'epoch': 0.12} + 12%|█▏ | 1085/8750 [1:46:27<12:12:16, 5.73s/it] 12%|█▏ | 1086/8750 [1:46:39<12:17:22, 5.77s/it] 12%|█▏ | 1086/8750 [1:46:33<12:17:22, 5.77s/it] {'loss': 0.4849, 'learning_rate': 1.953953191214642e-05, 'epoch': 0.12} + 12%|█▏ | 1086/8750 [1:46:39<12:17:22, 5.77s/it] {'loss': 0.4849, 'learning_rate': 1.953953191214642e-05, 'epoch': 0.12} + 12%|█▏ | 1086/8750 [1:46:33<12:17:22, 5.77s/it] 12%|█▏ | 1087/8750 [1:46:38<12:14:22, 5.75s/it] 12%|█▏ | 1087/8750 [1:46:45<12:14:22, 5.75s/it] {'loss': 0.5057, 'learning_rate': 1.9538420927475247e-05, 'epoch': 0.12} + 12%|█▏ | 1087/8750 [1:46:45<12:14:22, 5.75s/it] {'loss': 0.5057, 'learning_rate': 1.9538420927475247e-05, 'epoch': 0.12} + 12%|█▏ | 1087/8750 [1:46:38<12:14:22, 5.75s/it] 12%|█▏ | 1088/8750 [1:46:44<12:07:50, 5.70s/it] 12%|█▏ | 1088/8750 [1:46:50<12:07:50, 5.70s/it] {'loss': 0.4687, 'learning_rate': 1.953730863582745e-05, 'epoch': 0.12} + 12%|█▏ | 1088/8750 [1:46:50<12:07:50, 5.70s/it] {'loss': 0.4687, 'learning_rate': 1.953730863582745e-05, 'epoch': 0.12} + 12%|█▏ | 1088/8750 [1:46:44<12:07:50, 5.70s/it] 12%|█▏ | 1089/8750 [1:46:56<12:04:51, 5.68s/it] 12%|█▏ | 1089/8750 [1:46:49<12:04:52, 5.68s/it] {'loss': 0.4987, 'learning_rate': 1.9536195037355438e-05, 'epoch': 0.12} + 12%|█▏ | 1089/8750 [1:46:56<12:04:51, 5.68s/it] {'loss': 0.4987, 'learning_rate': 1.9536195037355438e-05, 'epoch': 0.12} + 12%|█▏ | 1089/8750 [1:46:49<12:04:52, 5.68s/it] 12%|█▏ | 1090/8750 [1:47:02<12:04:43, 5.68s/it] 12%|█▏ | 1090/8750 [1:46:55<12:04:43, 5.68s/it] {'loss': 0.4879, 'learning_rate': 1.9535080132211805e-05, 'epoch': 0.12} + 12%|█▏ | 1090/8750 [1:47:02<12:04:43, 5.68s/it] {'loss': 0.4879, 'learning_rate': 1.9535080132211805e-05, 'epoch': 0.12} + 12%|█▏ | 1090/8750 [1:46:55<12:04:43, 5.68s/it] 12%|█▏ | 1091/8750 [1:47:08<12:13:52, 5.75s/it] 12%|█▏ | 1091/8750 
[1:47:01<12:13:53, 5.75s/it] {'loss': 0.4896, 'learning_rate': 1.9533963920549307e-05, 'epoch': 0.12} + 12%|█▏ | 1091/8750 [1:47:08<12:13:52, 5.75s/it] {'loss': 0.4896, 'learning_rate': 1.9533963920549307e-05, 'epoch': 0.12} + 12%|█▏ | 1091/8750 [1:47:01<12:13:53, 5.75s/it] 12%|█▏ | 1092/8750 [1:47:13<12:09:52, 5.72s/it] 12%|█▏ | 1092/8750 [1:47:07<12:09:52, 5.72s/it] {'loss': 0.4914, 'learning_rate': 1.9532846402520898e-05, 'epoch': 0.12} + 12%|█▏ | 1092/8750 [1:47:13<12:09:52, 5.72s/it] {'loss': 0.4914, 'learning_rate': 1.9532846402520898e-05, 'epoch': 0.12} + 12%|█▏ | 1092/8750 [1:47:07<12:09:52, 5.72s/it] 12%|█▏ | 1093/8750 [1:47:12<12:07:04, 5.70s/it] 12%|█▏ | 1093/8750 [1:47:19<12:07:05, 5.70s/it] {'loss': 0.4981, 'learning_rate': 1.95317275782797e-05, 'epoch': 0.12} + 12%|█▏ | 1093/8750 [1:47:19<12:07:05, 5.70s/it] {'loss': 0.4981, 'learning_rate': 1.95317275782797e-05, 'epoch': 0.12} + 12%|█▏ | 1093/8750 [1:47:12<12:07:04, 5.70s/it] 13%|█▎ | 1094/8750 [1:47:24<12:00:34, 5.65s/it] 13%|█▎ | 1094/8750 [1:47:18<12:00:34, 5.65s/it] {'loss': 0.5114, 'learning_rate': 1.953060744797901e-05, 'epoch': 0.13} + 13%|█▎ | 1094/8750 [1:47:24<12:00:34, 5.65s/it] {'loss': 0.5114, 'learning_rate': 1.953060744797901e-05, 'epoch': 0.13} + 13%|█▎ | 1094/8750 [1:47:18<12:00:34, 5.65s/it] 13%|█▎ | 1095/8750 [1:47:30<12:01:12, 5.65s/it] 13%|█▎ | 1095/8750 [1:47:24<12:01:13, 5.65s/it] {'loss': 0.4893, 'learning_rate': 1.9529486011772326e-05, 'epoch': 0.13} + 13%|█▎ | 1095/8750 [1:47:30<12:01:12, 5.65s/it] {'loss': 0.4893, 'learning_rate': 1.9529486011772326e-05, 'epoch': 0.13} + 13%|█▎ | 1095/8750 [1:47:24<12:01:13, 5.65s/it] 13%|█▎ | 1096/8750 [1:47:36<11:59:52, 5.64s/it] 13%|█▎ | 1096/8750 [1:47:29<11:59:52, 5.64s/it] {'loss': 0.4894, 'learning_rate': 1.95283632698133e-05, 'epoch': 0.13} + 13%|█▎ | 1096/8750 [1:47:36<11:59:52, 5.64s/it] {'loss': 0.4894, 'learning_rate': 1.95283632698133e-05, 'epoch': 0.13} + 13%|█▎ | 1096/8750 [1:47:29<11:59:52, 5.64s/it] 13%|█▎ | 1097/8750 
[1:47:41<11:55:46, 5.61s/it] 13%|█▎ | 1097/8750 [1:47:35<11:55:46, 5.61s/it] {'loss': 0.4872, 'learning_rate': 1.952723922225577e-05, 'epoch': 0.13} + 13%|█▎ | 1097/8750 [1:47:41<11:55:46, 5.61s/it] {'loss': 0.4872, 'learning_rate': 1.952723922225577e-05, 'epoch': 0.13} + 13%|█▎ | 1097/8750 [1:47:35<11:55:46, 5.61s/it] 13%|█▎ | 1098/8750 [1:47:47<12:00:08, 5.65s/it] 13%|█▎ | 1098/8750 [1:47:40<12:00:08, 5.65s/it] {'loss': 0.4907, 'learning_rate': 1.952611386925376e-05, 'epoch': 0.13} + 13%|█▎ | 1098/8750 [1:47:47<12:00:08, 5.65s/it] {'loss': 0.4907, 'learning_rate': 1.952611386925376e-05, 'epoch': 0.13} + 13%|█▎ | 1098/8750 [1:47:40<12:00:08, 5.65s/it] 13%|█▎ | 1099/8750 [1:47:53<11:59:02, 5.64s/it] 13%|█▎ | 1099/8750 [1:47:46<11:59:02, 5.64s/it] {'loss': 0.4816, 'learning_rate': 1.952498721096147e-05, 'epoch': 0.13} + 13%|█▎ | 1099/8750 [1:47:53<11:59:02, 5.64s/it] {'loss': 0.4816, 'learning_rate': 1.952498721096147e-05, 'epoch': 0.13} + 13%|█▎ | 1099/8750 [1:47:46<11:59:02, 5.64s/it]15 AutoResumeHook: Checking whether to suspend... +43 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +214 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 10 AutoResumeHook: Checking whether to suspend... +11AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +8AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +0 6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 13%|█▎ | 1100/8750 [1:47:58<12:03:12, 5.67s/it]9 AutoResumeHook: Checking whether to suspend... + 13%|█▎ | 1100/8750 [1:47:52<12:03:12, 5.67s/it]13 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4909, 'learning_rate': 1.952385924753328e-05, 'epoch': 0.13} + 13%|█▎ | 1100/8750 [1:47:58<12:03:12, 5.67s/it] {'loss': 0.4909, 'learning_rate': 1.952385924753328e-05, 'epoch': 0.13} + 13%|█▎ | 1100/8750 [1:47:52<12:03:12, 5.67s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 13%|█▎ | 1101/8750 [1:48:12<21:26:11, 10.09s/it] 13%|█▎ | 1101/8750 [1:48:19<21:26:12, 10.09s/it] {'loss': 0.493, 'learning_rate': 1.9522729979123735e-05, 'epoch': 0.13} + 13%|█▎ | 1101/8750 [1:48:19<21:26:12, 10.09s/it] {'loss': 0.493, 'learning_rate': 1.9522729979123735e-05, 'epoch': 0.13} + 13%|█▎ | 1101/8750 [1:48:12<21:26:11, 10.09s/it] 13%|█▎ | 1102/8750 [1:48:18<18:32:32, 8.73s/it] 13%|█▎ | 1102/8750 [1:48:24<18:32:33, 8.73s/it] {'loss': 0.5024, 'learning_rate': 1.952159940588758e-05, 'epoch': 0.13} + 13%|█▎ | 1102/8750 [1:48:24<18:32:33, 8.73s/it] {'loss': 0.5024, 'learning_rate': 1.952159940588758e-05, 'epoch': 0.13} + 13%|█▎ | 1102/8750 [1:48:18<18:32:32, 8.73s/it] 13%|█▎ | 1103/8750 [1:48:30<16:34:23, 7.80s/it] 13%|█▎ | 1103/8750 [1:48:23<16:34:24, 7.80s/it] {'loss': 0.4942, 'learning_rate': 1.9520467527979726e-05, 'epoch': 0.13} + 13%|█▎ | 1103/8750 [1:48:30<16:34:23, 7.80s/it] {'loss': 0.4942, 'learning_rate': 1.9520467527979726e-05, 'epoch': 0.13} + 13%|█▎ | 1103/8750 [1:48:23<16:34:24, 7.80s/it] 13%|█▎ | 1104/8750 [1:48:36<15:09:42, 7.14s/it] 13%|█▎ | 1104/8750 [1:48:29<15:09:42, 7.14s/it] {'loss': 0.484, 'learning_rate': 1.9519334345555264e-05, 'epoch': 0.13} + 13%|█▎ | 1104/8750 [1:48:36<15:09:42, 7.14s/it] {'loss': 0.484, 'learning_rate': 1.9519334345555264e-05, 'epoch': 0.13} + 13%|█▎ | 1104/8750 [1:48:29<15:09:42, 7.14s/it] 13%|█▎ | 1105/8750 [1:48:41<14:18:17, 6.74s/it] 13%|█▎ | 1105/8750 [1:48:35<14:18:16, 6.74s/it] {'loss': 0.4942, 'learning_rate': 1.9518199858769466e-05, 'epoch': 0.13} + 13%|█▎ | 1105/8750 [1:48:41<14:18:17, 6.74s/it] {'loss': 0.4942, 'learning_rate': 1.9518199858769466e-05, 'epoch': 0.13} + 13%|█▎ | 1105/8750 [1:48:35<14:18:16, 6.74s/it] 13%|█▎ | 1106/8750 [1:48:41<13:48:23, 6.50s/it] 13%|█▎ | 1106/8750 [1:48:47<13:48:23, 6.50s/it] {'loss': 0.469, 'learning_rate': 1.9517064067777786e-05, 'epoch': 0.13} + 13%|█▎ | 1106/8750 [1:48:47<13:48:23, 6.50s/it] {'loss': 0.469, 'learning_rate': 
1.9517064067777786e-05, 'epoch': 0.13} + 13%|█▎ | 1106/8750 [1:48:41<13:48:23, 6.50s/it] 13%|█▎ | 1107/8750 [1:48:46<13:18:22, 6.27s/it] 13%|█▎ | 1107/8750 [1:48:53<13:18:23, 6.27s/it] {'loss': 0.4857, 'learning_rate': 1.9515926972735847e-05, 'epoch': 0.13} + 13%|█▎ | 1107/8750 [1:48:53<13:18:23, 6.27s/it] {'loss': 0.4857, 'learning_rate': 1.9515926972735847e-05, 'epoch': 0.13} + 13%|█▎ | 1107/8750 [1:48:46<13:18:22, 6.27s/it] 13%|█▎ | 1108/8750 [1:48:52<12:50:35, 6.05s/it] 13%|█▎ | 1108/8750 [1:48:59<12:50:35, 6.05s/it] {'loss': 0.4872, 'learning_rate': 1.9514788573799457e-05, 'epoch': 0.13} + 13%|█▎ | 1108/8750 [1:48:59<12:50:35, 6.05s/it] {'loss': 0.4872, 'learning_rate': 1.9514788573799457e-05, 'epoch': 0.13} + 13%|█▎ | 1108/8750 [1:48:52<12:50:35, 6.05s/it] 13%|█▎ | 1109/8750 [1:49:04<12:31:45, 5.90s/it] 13%|█▎ | 1109/8750 [1:48:58<12:31:46, 5.90s/it] {'loss': 0.5115, 'learning_rate': 1.9513648871124604e-05, 'epoch': 0.13} + 13%|█▎ | 1109/8750 [1:49:04<12:31:45, 5.90s/it] {'loss': 0.5115, 'learning_rate': 1.9513648871124604e-05, 'epoch': 0.13} + 13%|█▎ | 1109/8750 [1:48:58<12:31:46, 5.90s/it] 13%|█▎ | 1110/8750 [1:49:10<12:27:50, 5.87s/it] 13%|█▎ | 1110/8750 [1:49:03<12:27:51, 5.87s/it] {'loss': 0.4947, 'learning_rate': 1.9512507864867452e-05, 'epoch': 0.13} + 13%|█▎ | 1110/8750 [1:49:10<12:27:50, 5.87s/it] {'loss': 0.4947, 'learning_rate': 1.9512507864867452e-05, 'epoch': 0.13} + 13%|█▎ | 1110/8750 [1:49:03<12:27:51, 5.87s/it] 13%|█▎ | 1111/8750 [1:49:15<12:17:41, 5.79s/it] 13%|█▎ | 1111/8750 [1:49:09<12:17:42, 5.79s/it] {'loss': 0.4913, 'learning_rate': 1.9511365555184345e-05, 'epoch': 0.13} + 13%|█▎ | 1111/8750 [1:49:15<12:17:41, 5.79s/it] {'loss': 0.4913, 'learning_rate': 1.9511365555184345e-05, 'epoch': 0.13} + 13%|█▎ | 1111/8750 [1:49:09<12:17:42, 5.79s/it] 13%|█▎ | 1112/8750 [1:49:15<12:12:24, 5.75s/it] 13%|█▎ | 1112/8750 [1:49:21<12:12:26, 5.75s/it] {'loss': 0.4762, 'learning_rate': 1.9510221942231803e-05, 'epoch': 0.13} + 13%|█▎ | 1112/8750 
[1:49:21<12:12:26, 5.75s/it] {'loss': 0.4762, 'learning_rate': 1.9510221942231803e-05, 'epoch': 0.13} + 13%|█▎ | 1112/8750 [1:49:15<12:12:24, 5.75s/it] 13%|█▎ | 1113/8750 [1:49:27<12:10:53, 5.74s/it] 13%|█▎ | 1113/8750 [1:49:20<12:10:53, 5.74s/it] {'loss': 0.4975, 'learning_rate': 1.950907702616653e-05, 'epoch': 0.13} + 13%|█▎ | 1113/8750 [1:49:27<12:10:53, 5.74s/it] {'loss': 0.4975, 'learning_rate': 1.950907702616653e-05, 'epoch': 0.13} + 13%|█▎ | 1113/8750 [1:49:20<12:10:53, 5.74s/it] 13%|█▎ | 1114/8750 [1:49:26<12:12:25, 5.76s/it] 13%|█▎ | 1114/8750 [1:49:33<12:12:26, 5.76s/it] {'loss': 0.4734, 'learning_rate': 1.9507930807145406e-05, 'epoch': 0.13} + 13%|█▎ | 1114/8750 [1:49:33<12:12:26, 5.76s/it] {'loss': 0.4734, 'learning_rate': 1.9507930807145406e-05, 'epoch': 0.13} + 13%|█▎ | 1114/8750 [1:49:26<12:12:25, 5.76s/it] 13%|█▎ | 1115/8750 [1:49:38<12:13:38, 5.77s/it] 13%|█▎ | 1115/8750 [1:49:32<12:13:38, 5.77s/it] {'loss': 0.4966, 'learning_rate': 1.9506783285325482e-05, 'epoch': 0.13} + 13%|█▎ | 1115/8750 [1:49:38<12:13:38, 5.77s/it] {'loss': 0.4966, 'learning_rate': 1.9506783285325482e-05, 'epoch': 0.13} + 13%|█▎ | 1115/8750 [1:49:32<12:13:38, 5.77s/it] 13%|█▎ | 1116/8750 [1:49:44<12:03:03, 5.68s/it] 13%|█▎ | 1116/8750 [1:49:37<12:03:03, 5.68s/it] {'loss': 0.4885, 'learning_rate': 1.9505634460863997e-05, 'epoch': 0.13} + 13%|█▎ | 1116/8750 [1:49:44<12:03:03, 5.68s/it] {'loss': 0.4885, 'learning_rate': 1.9505634460863997e-05, 'epoch': 0.13} + 13%|█▎ | 1116/8750 [1:49:37<12:03:03, 5.68s/it] 13%|█▎ | 1117/8750 [1:49:50<12:03:23, 5.69s/it] 13%|█▎ | 1117/8750 [1:49:43<12:03:23, 5.69s/it] {'loss': 0.4667, 'learning_rate': 1.950448433391837e-05, 'epoch': 0.13} + 13%|█▎ | 1117/8750 [1:49:50<12:03:23, 5.69s/it] {'loss': 0.4667, 'learning_rate': 1.950448433391837e-05, 'epoch': 0.13} + 13%|█▎ | 1117/8750 [1:49:43<12:03:23, 5.69s/it] 13%|█▎ | 1118/8750 [1:49:49<12:03:29, 5.69s/it] 13%|█▎ | 1118/8750 [1:49:55<12:03:30, 5.69s/it] {'loss': 0.5057, 'learning_rate': 
1.9503332904646188e-05, 'epoch': 0.13} + 13%|█▎ | 1118/8750 [1:49:55<12:03:30, 5.69s/it] {'loss': 0.5057, 'learning_rate': 1.9503332904646188e-05, 'epoch': 0.13} + 13%|█▎ | 1118/8750 [1:49:49<12:03:29, 5.69s/it] 13%|█▎ | 1119/8750 [1:50:01<12:01:50, 5.68s/it] 13%|█▎ | 1119/8750 [1:49:54<12:01:51, 5.68s/it] {'loss': 0.5029, 'learning_rate': 1.9502180173205227e-05, 'epoch': 0.13} + 13%|█▎ | 1119/8750 [1:50:01<12:01:50, 5.68s/it] {'loss': 0.5029, 'learning_rate': 1.9502180173205227e-05, 'epoch': 0.13} + 13%|█▎ | 1119/8750 [1:49:54<12:01:51, 5.68s/it] 13%|█▎ | 1120/8750 [1:50:07<12:14:03, 5.77s/it] 13%|█▎ | 1120/8750 [1:50:00<12:14:04, 5.77s/it] {'loss': 0.4856, 'learning_rate': 1.9501026139753433e-05, 'epoch': 0.13} + 13%|█▎ | 1120/8750 [1:50:07<12:14:03, 5.77s/it] {'loss': 0.4856, 'learning_rate': 1.9501026139753433e-05, 'epoch': 0.13} + 13%|█▎ | 1120/8750 [1:50:00<12:14:04, 5.77s/it] 13%|█▎ | 1121/8750 [1:50:13<12:11:33, 5.75s/it] 13%|█▎ | 1121/8750 [1:50:06<12:11:33, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.9499870804448936e-05, 'epoch': 0.13} + 13%|█▎ | 1121/8750 [1:50:13<12:11:33, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.9499870804448936e-05, 'epoch': 0.13} + 13%|█▎ | 1121/8750 [1:50:06<12:11:33, 5.75s/it] 13%|█▎ | 1122/8750 [1:50:12<12:08:05, 5.73s/it] 13%|█▎ | 1122/8750 [1:50:18<12:08:06, 5.73s/it] {'loss': 0.4915, 'learning_rate': 1.9498714167450047e-05, 'epoch': 0.13} + 13%|█▎ | 1122/8750 [1:50:18<12:08:06, 5.73s/it] {'loss': 0.4915, 'learning_rate': 1.9498714167450047e-05, 'epoch': 0.13} + 13%|█▎ | 1122/8750 [1:50:12<12:08:05, 5.73s/it] 13%|█▎ | 1123/8750 [1:50:18<12:08:30, 5.73s/it] 13%|█▎ | 1123/8750 [1:50:24<12:08:31, 5.73s/it] {'loss': 0.4904, 'learning_rate': 1.9497556228915246e-05, 'epoch': 0.13} + 13%|█▎ | 1123/8750 [1:50:24<12:08:31, 5.73s/it] {'loss': 0.4904, 'learning_rate': 1.9497556228915246e-05, 'epoch': 0.13} + 13%|█▎ | 1123/8750 [1:50:18<12:08:30, 5.73s/it] 13%|█▎ | 1124/8750 [1:50:30<12:07:57, 5.73s/it] 13%|█▎ | 1124/8750 
[1:50:23<12:07:58, 5.73s/it] {'loss': 0.4843, 'learning_rate': 1.9496396989003195e-05, 'epoch': 0.13} + 13%|█▎ | 1124/8750 [1:50:30<12:07:57, 5.73s/it] {'loss': 0.4843, 'learning_rate': 1.9496396989003195e-05, 'epoch': 0.13} + 13%|█▎ | 1124/8750 [1:50:23<12:07:58, 5.73s/it] 13%|█▎ | 1125/8750 [1:50:36<12:16:55, 5.80s/it] 13%|█▎ | 1125/8750 [1:50:29<12:16:56, 5.80s/it] {'loss': 0.481, 'learning_rate': 1.9495236447872738e-05, 'epoch': 0.13} + 13%|█▎ | 1125/8750 [1:50:36<12:16:55, 5.80s/it] {'loss': 0.481, 'learning_rate': 1.9495236447872738e-05, 'epoch': 0.13} + 13%|█▎ | 1125/8750 [1:50:29<12:16:56, 5.80s/it] 13%|█▎ | 1126/8750 [1:50:35<12:06:41, 5.72s/it] 13%|█▎ | 1126/8750 [1:50:41<12:06:41, 5.72s/it] {'loss': 0.5034, 'learning_rate': 1.94940746056829e-05, 'epoch': 0.13} + 13%|█▎ | 1126/8750 [1:50:41<12:06:41, 5.72s/it] {'loss': 0.5034, 'learning_rate': 1.94940746056829e-05, 'epoch': 0.13} + 13%|█▎ | 1126/8750 [1:50:35<12:06:41, 5.72s/it] 13%|█▎ | 1127/8750 [1:50:47<12:07:05, 5.72s/it] 13%|█▎ | 1127/8750 [1:50:41<12:07:07, 5.72s/it] {'loss': 0.4913, 'learning_rate': 1.949291146259287e-05, 'epoch': 0.13} + 13%|█▎ | 1127/8750 [1:50:47<12:07:05, 5.72s/it] {'loss': 0.4913, 'learning_rate': 1.949291146259287e-05, 'epoch': 0.13} + 13%|█▎ | 1127/8750 [1:50:41<12:07:07, 5.72s/it] 13%|█▎ | 1128/8750 [1:50:53<12:08:57, 5.74s/it] 13%|█▎ | 1128/8750 [1:50:46<12:08:57, 5.74s/it] {'loss': 0.4805, 'learning_rate': 1.949174701876203e-05, 'epoch': 0.13} + 13%|█▎ | 1128/8750 [1:50:53<12:08:57, 5.74s/it] {'loss': 0.4805, 'learning_rate': 1.949174701876203e-05, 'epoch': 0.13} + 13%|█▎ | 1128/8750 [1:50:46<12:08:57, 5.74s/it] 13%|█▎ | 1129/8750 [1:50:58<12:04:00, 5.70s/it] 13%|█▎ | 1129/8750 [1:50:52<12:04:00, 5.70s/it] {'loss': 0.4944, 'learning_rate': 1.9490581274349934e-05, 'epoch': 0.13} + 13%|█▎ | 1129/8750 [1:50:58<12:04:00, 5.70s/it] {'loss': 0.4944, 'learning_rate': 1.9490581274349934e-05, 'epoch': 0.13} + 13%|█▎ | 1129/8750 [1:50:52<12:04:00, 5.70s/it] 13%|█▎ | 1130/8750 
[1:50:58<12:05:17, 5.71s/it] 13%|█▎ | 1130/8750 [1:51:04<12:05:18, 5.71s/it] {'loss': 0.4868, 'learning_rate': 1.9489414229516318e-05, 'epoch': 0.13} + 13%|█▎ | 1130/8750 [1:51:04<12:05:18, 5.71s/it] {'loss': 0.4868, 'learning_rate': 1.9489414229516318e-05, 'epoch': 0.13} + 13%|█▎ | 1130/8750 [1:50:58<12:05:17, 5.71s/it] 13%|█▎ | 1131/8750 [1:51:03<12:07:43, 5.73s/it] 13%|█▎ | 1131/8750 [1:51:10<12:07:43, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.9488245884421087e-05, 'epoch': 0.13} + 13%|█▎ | 1131/8750 [1:51:10<12:07:43, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.9488245884421087e-05, 'epoch': 0.13} + 13%|█▎ | 1131/8750 [1:51:03<12:07:43, 5.73s/it] 13%|█▎ | 1132/8750 [1:51:09<12:02:52, 5.69s/it] 13%|█▎ | 1132/8750 [1:51:16<12:02:53, 5.69s/it] {'loss': 0.4823, 'learning_rate': 1.9487076239224337e-05, 'epoch': 0.13} + 13%|█▎ | 1132/8750 [1:51:16<12:02:53, 5.69s/it] {'loss': 0.4823, 'learning_rate': 1.9487076239224337e-05, 'epoch': 0.13} + 13%|█▎ | 1132/8750 [1:51:09<12:02:52, 5.69s/it] 13%|█▎ | 1133/8750 [1:51:15<12:02:20, 5.69s/it] 13%|█▎ | 1133/8750 [1:51:21<12:02:21, 5.69s/it] {'loss': 0.4853, 'learning_rate': 1.948590529408633e-05, 'epoch': 0.13} + 13%|█▎ | 1133/8750 [1:51:21<12:02:21, 5.69s/it] {'loss': 0.4853, 'learning_rate': 1.948590529408633e-05, 'epoch': 0.13} + 13%|█▎ | 1133/8750 [1:51:15<12:02:20, 5.69s/it] 13%|█▎ | 1134/8750 [1:51:20<11:59:03, 5.66s/it] 13%|█▎ | 1134/8750 [1:51:27<11:59:03, 5.66s/it] {'loss': 0.4846, 'learning_rate': 1.948473304916751e-05, 'epoch': 0.13} + 13%|█▎ | 1134/8750 [1:51:27<11:59:03, 5.66s/it] {'loss': 0.4846, 'learning_rate': 1.948473304916751e-05, 'epoch': 0.13} + 13%|█▎ | 1134/8750 [1:51:20<11:59:03, 5.66s/it] 13%|█▎ | 1135/8750 [1:51:33<11:59:52, 5.67s/it] 13%|█▎ | 1135/8750 [1:51:26<11:59:54, 5.67s/it] {'loss': 0.5139, 'learning_rate': 1.948355950462851e-05, 'epoch': 0.13} + 13%|█▎ | 1135/8750 [1:51:33<11:59:52, 5.67s/it] {'loss': 0.5139, 'learning_rate': 1.948355950462851e-05, 'epoch': 0.13} + 13%|█▎ | 1135/8750 
[1:51:26<11:59:54, 5.67s/it] 13%|█▎ | 1136/8750 [1:51:32<12:07:45, 5.73s/it] 13%|█▎ | 1136/8750 [1:51:38<12:07:45, 5.73s/it] {'loss': 0.4903, 'learning_rate': 1.9482384660630125e-05, 'epoch': 0.13} + 13%|█▎ | 1136/8750 [1:51:32<12:07:45, 5.73s/it]{'loss': 0.4903, 'learning_rate': 1.9482384660630125e-05, 'epoch': 0.13} + 13%|█▎ | 1136/8750 [1:51:38<12:07:45, 5.73s/it] 13%|█▎ | 1137/8750 [1:51:38<12:04:52, 5.71s/it] 13%|█▎ | 1137/8750 [1:51:44<12:04:53, 5.71s/it] {'loss': 0.5032, 'learning_rate': 1.9481208517333336e-05, 'epoch': 0.13} + 13%|█▎ | 1137/8750 [1:51:44<12:04:53, 5.71s/it] {'loss': 0.5032, 'learning_rate': 1.9481208517333336e-05, 'epoch': 0.13} + 13%|█▎ | 1137/8750 [1:51:38<12:04:52, 5.71s/it] 13%|█▎ | 1138/8750 [1:51:43<12:04:26, 5.71s/it] 13%|█▎ | 1138/8750 [1:51:50<12:04:27, 5.71s/it] {'loss': 0.4933, 'learning_rate': 1.9480031074899303e-05, 'epoch': 0.13} + 13%|█▎ | 1138/8750 [1:51:50<12:04:27, 5.71s/it] {'loss': 0.4933, 'learning_rate': 1.9480031074899303e-05, 'epoch': 0.13} + 13%|█▎ | 1138/8750 [1:51:43<12:04:26, 5.71s/it] 13%|█▎ | 1139/8750 [1:51:55<12:01:34, 5.69s/it] 13%|█▎ | 1139/8750 [1:51:49<12:01:36, 5.69s/it] {'loss': 0.4865, 'learning_rate': 1.9478852333489356e-05, 'epoch': 0.13} + 13%|█▎ | 1139/8750 [1:51:55<12:01:34, 5.69s/it] {'loss': 0.4865, 'learning_rate': 1.9478852333489356e-05, 'epoch': 0.13} + 13%|█▎ | 1139/8750 [1:51:49<12:01:36, 5.69s/it] 13%|█▎ | 1140/8750 [1:51:55<12:01:34, 5.69s/it] 13%|█▎ | 1140/8750 [1:52:01<12:01:34, 5.69s/it] {'loss': 0.4838, 'learning_rate': 1.9477672293265014e-05, 'epoch': 0.13} + {'loss': 0.4838, 'learning_rate': 1.9477672293265014e-05, 'epoch': 0.13} + 13%|█▎ | 1140/8750 [1:52:01<12:01:34, 5.69s/it] 13%|█▎ | 1140/8750 [1:51:55<12:01:34, 5.69s/it] 13%|█▎ | 1141/8750 [1:52:07<12:07:16, 5.73s/it] 13%|█▎ | 1141/8750 [1:52:00<12:07:16, 5.73s/it] {'loss': 0.4925, 'learning_rate': 1.9476490954387968e-05, 'epoch': 0.13} + 13%|█▎ | 1141/8750 [1:52:07<12:07:16, 5.73s/it] {'loss': 0.4925, 'learning_rate': 
1.9476490954387968e-05, 'epoch': 0.13} + 13%|█▎ | 1141/8750 [1:52:00<12:07:16, 5.73s/it] 13%|█▎ | 1142/8750 [1:52:13<12:04:30, 5.71s/it] 13%|█▎ | 1142/8750 [1:52:06<12:04:29, 5.71s/it] {'loss': 0.5201, 'learning_rate': 1.947530831702009e-05, 'epoch': 0.13} + 13%|█▎ | 1142/8750 [1:52:13<12:04:30, 5.71s/it] {'loss': 0.5201, 'learning_rate': 1.947530831702009e-05, 'epoch': 0.13} + 13%|█▎ | 1142/8750 [1:52:06<12:04:29, 5.71s/it] 13%|█▎ | 1143/8750 [1:52:18<12:01:34, 5.69s/it] 13%|█▎ | 1143/8750 [1:52:12<12:01:34, 5.69s/it] {'loss': 0.4918, 'learning_rate': 1.9474124381323424e-05, 'epoch': 0.13} + 13%|█▎ | 1143/8750 [1:52:18<12:01:34, 5.69s/it] {'loss': 0.4918, 'learning_rate': 1.9474124381323424e-05, 'epoch': 0.13} + 13%|█▎ | 1143/8750 [1:52:12<12:01:34, 5.69s/it] 13%|█▎ | 1144/8750 [1:52:24<12:12:52, 5.78s/it] 13%|█▎ | 1144/8750 [1:52:18<12:12:52, 5.78s/it] {'loss': 0.4993, 'learning_rate': 1.9472939147460194e-05, 'epoch': 0.13} + 13%|█▎ | 1144/8750 [1:52:24<12:12:52, 5.78s/it] {'loss': 0.4993, 'learning_rate': 1.9472939147460194e-05, 'epoch': 0.13} + 13%|█▎ | 1144/8750 [1:52:18<12:12:52, 5.78s/it] 13%|█▎ | 1145/8750 [1:52:30<12:09:26, 5.75s/it] 13%|█▎ | 1145/8750 [1:52:23<12:09:26, 5.76s/it] {'loss': 0.4926, 'learning_rate': 1.947175261559281e-05, 'epoch': 0.13} + 13%|█▎ | 1145/8750 [1:52:30<12:09:26, 5.75s/it] {'loss': 0.4926, 'learning_rate': 1.947175261559281e-05, 'epoch': 0.13} + 13%|█▎ | 1145/8750 [1:52:23<12:09:26, 5.76s/it] 13%|█▎ | 1146/8750 [1:52:36<12:03:24, 5.71s/it] 13%|█▎ | 1146/8750 [1:52:29<12:03:24, 5.71s/it] {'loss': 0.5156, 'learning_rate': 1.9470564785883848e-05, 'epoch': 0.13} + 13%|█▎ | 1146/8750 [1:52:36<12:03:24, 5.71s/it] {'loss': 0.5156, 'learning_rate': 1.9470564785883848e-05, 'epoch': 0.13} + 13%|█▎ | 1146/8750 [1:52:29<12:03:24, 5.71s/it] 13%|█▎ | 1147/8750 [1:52:41<12:12:46, 5.78s/it] 13%|█▎ | 1147/8750 [1:52:35<12:12:46, 5.78s/it] {'loss': 0.4835, 'learning_rate': 1.9469375658496066e-05, 'epoch': 0.13} + 13%|█▎ | 1147/8750 
[1:52:41<12:12:46, 5.78s/it] {'loss': 0.4835, 'learning_rate': 1.9469375658496066e-05, 'epoch': 0.13} + 13%|█▎ | 1147/8750 [1:52:35<12:12:46, 5.78s/it] 13%|█▎ | 1148/8750 [1:52:47<12:13:25, 5.79s/it] 13%|█▎ | 1148/8750 [1:52:41<12:13:24, 5.79s/it] {'loss': 0.4778, 'learning_rate': 1.946818523359241e-05, 'epoch': 0.13} + 13%|█▎ | 1148/8750 [1:52:47<12:13:25, 5.79s/it] {'loss': 0.4778, 'learning_rate': 1.946818523359241e-05, 'epoch': 0.13} + 13%|█▎ | 1148/8750 [1:52:41<12:13:24, 5.79s/it] 13%|█▎ | 1149/8750 [1:52:47<12:12:08, 5.78s/it] 13%|█▎ | 1149/8750 [1:52:53<12:12:09, 5.78s/it] {'loss': 0.4853, 'learning_rate': 1.9466993511335985e-05, 'epoch': 0.13} + 13%|█▎ | 1149/8750 [1:52:53<12:12:09, 5.78s/it] {'loss': 0.4853, 'learning_rate': 1.9466993511335985e-05, 'epoch': 0.13} + 13%|█▎ | 1149/8750 [1:52:47<12:12:08, 5.78s/it]13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1210 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +74 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend...0 +6 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 13%|█▎ | 1150/8750 [1:52:59<12:07:57, 5.75s/it] 13%|█▎ | 1150/8750 [1:52:52<12:07:57, 5.75s/it]14 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4952, 'learning_rate': 1.9465800491890087e-05, 'epoch': 0.13} + 13%|█▎ | 1150/8750 [1:52:59<12:07:57, 5.75s/it] {'loss': 0.4952, 'learning_rate': 1.9465800491890087e-05, 'epoch': 0.13} + 13%|█▎ | 1150/8750 [1:52:52<12:07:57, 5.75s/it] 13%|█▎ | 1151/8750 [1:53:04<12:03:00, 5.71s/it] 13%|█▎ | 1151/8750 [1:52:58<12:03:00, 5.71s/it] {'loss': 0.4911, 'learning_rate': 1.946460617541819e-05, 'epoch': 0.13} + 13%|█▎ | 1151/8750 [1:53:04<12:03:00, 5.71s/it] {'loss': 0.4911, 'learning_rate': 1.946460617541819e-05, 'epoch': 0.13} + 13%|█▎ | 1151/8750 [1:52:58<12:03:00, 5.71s/it] 13%|█▎ | 1152/8750 [1:53:10<12:00:56, 5.69s/it] 13%|█▎ | 1152/8750 [1:53:03<12:00:56, 5.69s/it] {'loss': 0.4908, 'learning_rate': 1.9463410562083937e-05, 'epoch': 0.13} + 13%|█▎ | 1152/8750 [1:53:10<12:00:56, 5.69s/it] {'loss': 0.4908, 'learning_rate': 1.9463410562083937e-05, 'epoch': 0.13} + 13%|█▎ | 1152/8750 [1:53:03<12:00:56, 5.69s/it] 13%|█▎ | 1153/8750 [1:53:16<12:04:31, 5.72s/it] 13%|█▎ | 1153/8750 [1:53:09<12:04:31, 5.72s/it] {'loss': 0.4897, 'learning_rate': 1.946221365205115e-05, 'epoch': 0.13} + 13%|█▎ | 1153/8750 [1:53:16<12:04:31, 5.72s/it] {'loss': 0.4897, 'learning_rate': 1.946221365205115e-05, 'epoch': 0.13} + 13%|█▎ | 1153/8750 [1:53:09<12:04:31, 5.72s/it] 13%|█▎ | 1154/8750 [1:53:15<12:01:12, 5.70s/it] 13%|█▎ | 1154/8750 [1:53:21<12:01:13, 5.70s/it] {'loss': 0.4868, 'learning_rate': 1.9461015445483843e-05, 'epoch': 0.13} + 13%|█▎ | 1154/8750 [1:53:21<12:01:13, 5.70s/it] {'loss': 0.4868, 'learning_rate': 1.9461015445483843e-05, 'epoch': 0.13} + 13%|█▎ | 1154/8750 [1:53:15<12:01:12, 5.70s/it] 13%|█▎ | 1155/8750 [1:53:27<11:58:08, 5.67s/it] 13%|█▎ | 1155/8750 [1:53:21<11:58:08, 5.67s/it] {'loss': 0.497, 'learning_rate': 1.9459815942546192e-05, 'epoch': 0.13} + 13%|█▎ | 1155/8750 [1:53:27<11:58:08, 5.67s/it] {'loss': 0.497, 'learning_rate': 1.9459815942546192e-05, 'epoch': 0.13} + 13%|█▎ | 1155/8750 [1:53:21<11:58:08, 5.67s/it] 13%|█▎ | 1156/8750 [1:53:26<12:06:00, 5.74s/it] 
13%|█▎ | 1156/8750 [1:53:33<12:06:01, 5.74s/it] {'loss': 0.4966, 'learning_rate': 1.9458615143402554e-05, 'epoch': 0.13} + 13%|█▎ | 1156/8750 [1:53:33<12:06:01, 5.74s/it] {'loss': 0.4966, 'learning_rate': 1.9458615143402554e-05, 'epoch': 0.13} + 13%|█▎ | 1156/8750 [1:53:26<12:06:00, 5.74s/it] 13%|█▎ | 1157/8750 [1:53:32<12:03:02, 5.71s/it] 13%|█▎ | 1157/8750 [1:53:39<12:03:03, 5.71s/it] {'loss': 0.4687, 'learning_rate': 1.9457413048217466e-05, 'epoch': 0.13} + 13%|█▎ | 1157/8750 [1:53:39<12:03:03, 5.71s/it] {'loss': 0.4687, 'learning_rate': 1.9457413048217466e-05, 'epoch': 0.13} + 13%|█▎ | 1157/8750 [1:53:32<12:03:02, 5.71s/it] 13%|█▎ | 1158/8750 [1:53:38<12:02:49, 5.71s/it] 13%|█▎ | 1158/8750 [1:53:44<12:02:50, 5.71s/it] {'loss': 0.4876, 'learning_rate': 1.9456209657155645e-05, 'epoch': 0.13} + 13%|█▎ | 1158/8750 [1:53:44<12:02:50, 5.71s/it] {'loss': 0.4876, 'learning_rate': 1.9456209657155645e-05, 'epoch': 0.13} + 13%|█▎ | 1158/8750 [1:53:38<12:02:49, 5.71s/it] 13%|█▎ | 1159/8750 [1:53:43<11:55:48, 5.66s/it] 13%|█▎ | 1159/8750 [1:53:50<11:55:48, 5.66s/it] {'loss': 0.5257, 'learning_rate': 1.9455004970381978e-05, 'epoch': 0.13} + 13%|█▎ | 1159/8750 [1:53:50<11:55:48, 5.66s/it] {'loss': 0.5257, 'learning_rate': 1.9455004970381978e-05, 'epoch': 0.13} + 13%|█▎ | 1159/8750 [1:53:43<11:55:48, 5.66s/it] 13%|█▎ | 1160/8750 [1:53:49<11:54:51, 5.65s/it] 13%|█▎ | 1160/8750 [1:53:55<11:54:51, 5.65s/it] {'loss': 0.4873, 'learning_rate': 1.9453798988061535e-05, 'epoch': 0.13} + 13%|█▎ | 1160/8750 [1:53:55<11:54:51, 5.65s/it] {'loss': 0.4873, 'learning_rate': 1.9453798988061535e-05, 'epoch': 0.13} + 13%|█▎ | 1160/8750 [1:53:49<11:54:51, 5.65s/it] 13%|█▎ | 1161/8750 [1:53:55<11:59:23, 5.69s/it] 13%|█▎ | 1161/8750 [1:54:01<11:59:24, 5.69s/it] {'loss': 0.4734, 'learning_rate': 1.9452591710359566e-05, 'epoch': 0.13} + 13%|█▎ | 1161/8750 [1:54:01<11:59:24, 5.69s/it] {'loss': 0.4734, 'learning_rate': 1.9452591710359566e-05, 'epoch': 0.13} + 13%|█▎ | 1161/8750 [1:53:55<11:59:23, 
5.69s/it] 13%|█▎ | 1162/8750 [1:54:07<12:12:47, 5.79s/it] 13%|█▎ | 1162/8750 [1:54:01<12:12:47, 5.79s/it] {'loss': 0.485, 'learning_rate': 1.9451383137441492e-05, 'epoch': 0.13} + 13%|█▎ | 1162/8750 [1:54:07<12:12:47, 5.79s/it] {'loss': 0.485, 'learning_rate': 1.9451383137441492e-05, 'epoch': 0.13} + 13%|█▎ | 1162/8750 [1:54:01<12:12:47, 5.79s/it] 13%|█▎ | 1163/8750 [1:54:07<12:10:21, 5.78s/it] 13%|█▎ | 1163/8750 [1:54:13<12:10:21, 5.78s/it] {'loss': 0.4731, 'learning_rate': 1.9450173269472915e-05, 'epoch': 0.13} + 13%|█▎ | 1163/8750 [1:54:07<12:10:21, 5.78s/it]{'loss': 0.4731, 'learning_rate': 1.9450173269472915e-05, 'epoch': 0.13} + 13%|█▎ | 1163/8750 [1:54:13<12:10:21, 5.78s/it] 13%|█▎ | 1164/8750 [1:54:12<12:18:12, 5.84s/it] 13%|█▎ | 1164/8750 [1:54:19<12:18:11, 5.84s/it] {'loss': 0.4943, 'learning_rate': 1.9448962106619614e-05, 'epoch': 0.13} + 13%|█▎ | 1164/8750 [1:54:19<12:18:11, 5.84s/it] {'loss': 0.4943, 'learning_rate': 1.9448962106619614e-05, 'epoch': 0.13} + 13%|█▎ | 1164/8750 [1:54:12<12:18:12, 5.84s/it] 13%|█▎ | 1165/8750 [1:54:18<12:12:07, 5.79s/it] 13%|█▎ | 1165/8750 [1:54:25<12:12:08, 5.79s/it] {'loss': 0.4884, 'learning_rate': 1.944774964904754e-05, 'epoch': 0.13} + 13%|█▎ | 1165/8750 [1:54:25<12:12:08, 5.79s/it] {'loss': 0.4884, 'learning_rate': 1.944774964904754e-05, 'epoch': 0.13} + 13%|█▎ | 1165/8750 [1:54:18<12:12:07, 5.79s/it] 13%|█▎ | 1166/8750 [1:54:24<12:19:21, 5.85s/it] 13%|█▎ | 1166/8750 [1:54:31<12:19:21, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.944653589692284e-05, 'epoch': 0.13} + 13%|█▎ | 1166/8750 [1:54:31<12:19:21, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.944653589692284e-05, 'epoch': 0.13} + 13%|█▎ | 1166/8750 [1:54:24<12:19:21, 5.85s/it] 13%|█▎ | 1167/8750 [1:54:30<12:13:39, 5.81s/it] 13%|█▎ | 1167/8750 [1:54:36<12:13:40, 5.81s/it] {'loss': 0.5034, 'learning_rate': 1.9445320850411816e-05, 'epoch': 0.13} + 13%|█▎ | 1167/8750 [1:54:36<12:13:40, 5.81s/it] {'loss': 0.5034, 'learning_rate': 1.9445320850411816e-05, 'epoch': 
0.13} + 13%|█▎ | 1167/8750 [1:54:30<12:13:39, 5.81s/it] 13%|█▎ | 1168/8750 [1:54:42<12:12:57, 5.80s/it] 13%|█▎ | 1168/8750 [1:54:36<12:12:57, 5.80s/it] {'loss': 0.5059, 'learning_rate': 1.9444104509680954e-05, 'epoch': 0.13} + 13%|█▎ | 1168/8750 [1:54:42<12:12:57, 5.80s/it] {'loss': 0.5059, 'learning_rate': 1.9444104509680954e-05, 'epoch': 0.13} + 13%|█▎ | 1168/8750 [1:54:36<12:12:57, 5.80s/it] 13%|█▎ | 1169/8750 [1:54:41<12:10:14, 5.78s/it] 13%|█▎ | 1169/8750 [1:54:48<12:10:15, 5.78s/it] {'loss': 0.4864, 'learning_rate': 1.9442886874896924e-05, 'epoch': 0.13} + 13%|█▎ | 1169/8750 [1:54:48<12:10:15, 5.78s/it] {'loss': 0.4864, 'learning_rate': 1.9442886874896924e-05, 'epoch': 0.13} + 13%|█▎ | 1169/8750 [1:54:41<12:10:14, 5.78s/it] 13%|█▎ | 1170/8750 [1:54:54<12:04:52, 5.74s/it] 13%|█▎ | 1170/8750 [1:54:47<12:04:52, 5.74s/it] {'loss': 0.491, 'learning_rate': 1.944166794622657e-05, 'epoch': 0.13} + 13%|█▎ | 1170/8750 [1:54:54<12:04:52, 5.74s/it] {'loss': 0.491, 'learning_rate': 1.944166794622657e-05, 'epoch': 0.13} + 13%|█▎ | 1170/8750 [1:54:47<12:04:52, 5.74s/it] 13%|█▎ | 1171/8750 [1:54:53<12:00:02, 5.70s/it] 13%|█▎ | 1171/8750 [1:54:59<12:00:03, 5.70s/it] {'loss': 0.4786, 'learning_rate': 1.9440447723836914e-05, 'epoch': 0.13} + 13%|█▎ | 1171/8750 [1:54:59<12:00:03, 5.70s/it] {'loss': 0.4786, 'learning_rate': 1.9440447723836914e-05, 'epoch': 0.13} + 13%|█▎ | 1171/8750 [1:54:53<12:00:02, 5.70s/it] 13%|█▎ | 1172/8750 [1:54:58<12:01:52, 5.72s/it] 13%|█▎ | 1172/8750 [1:55:05<12:01:52, 5.72s/it] {'loss': 0.4786, 'learning_rate': 1.9439226207895143e-05, 'epoch': 0.13} + 13%|█▎ | 1172/8750 [1:55:05<12:01:52, 5.72s/it] {'loss': 0.4786, 'learning_rate': 1.9439226207895143e-05, 'epoch': 0.13} + 13%|█▎ | 1172/8750 [1:54:58<12:01:52, 5.72s/it] 13%|█▎ | 1173/8750 [1:55:04<12:01:54, 5.72s/it] 13%|█▎ | 1173/8750 [1:55:11<12:01:54, 5.72s/it] {'loss': 0.4986, 'learning_rate': 1.9438003398568647e-05, 'epoch': 0.13} + 13%|█▎ | 1173/8750 [1:55:04<12:01:54, 5.72s/it]{'loss': 0.4986, 
'learning_rate': 1.9438003398568647e-05, 'epoch': 0.13} + 13%|█▎ | 1173/8750 [1:55:11<12:01:54, 5.72s/it] 13%|█▎ | 1174/8750 [1:55:10<12:05:19, 5.74s/it] 13%|█▎ | 1174/8750 [1:55:16<12:05:19, 5.74s/it] {'loss': 0.4635, 'learning_rate': 1.9436779296024967e-05, 'epoch': 0.13} + 13%|█▎ | 1174/8750 [1:55:16<12:05:19, 5.74s/it] {'loss': 0.4635, 'learning_rate': 1.9436779296024967e-05, 'epoch': 0.13} + 13%|█▎ | 1174/8750 [1:55:10<12:05:19, 5.74s/it] 13%|█▎ | 1175/8750 [1:55:15<11:54:43, 5.66s/it] 13%|█▎ | 1175/8750 [1:55:22<11:54:43, 5.66s/it] {'loss': 0.5228, 'learning_rate': 1.9435553900431838e-05, 'epoch': 0.13} + 13%|█▎ | 1175/8750 [1:55:22<11:54:43, 5.66s/it] {'loss': 0.5228, 'learning_rate': 1.9435553900431838e-05, 'epoch': 0.13} + 13%|█▎ | 1175/8750 [1:55:15<11:54:43, 5.66s/it] 13%|█▎ | 1176/8750 [1:55:21<11:56:10, 5.67s/it] 13%|█▎ | 1176/8750 [1:55:28<11:56:10, 5.67s/it] {'loss': 0.4745, 'learning_rate': 1.9434327211957166e-05, 'epoch': 0.13} + 13%|█▎ | 1176/8750 [1:55:28<11:56:10, 5.67s/it] {'loss': 0.4745, 'learning_rate': 1.9434327211957166e-05, 'epoch': 0.13} + 13%|█▎ | 1176/8750 [1:55:21<11:56:10, 5.67s/it] 13%|█▎ | 1177/8750 [1:55:33<11:56:09, 5.67s/it] 13%|█▎ | 1177/8750 [1:55:27<11:56:09, 5.67s/it] {'loss': 0.4875, 'learning_rate': 1.943309923076903e-05, 'epoch': 0.13} + 13%|█▎ | 1177/8750 [1:55:33<11:56:09, 5.67s/it] {'loss': 0.4875, 'learning_rate': 1.943309923076903e-05, 'epoch': 0.13} + 13%|█▎ | 1177/8750 [1:55:27<11:56:09, 5.67s/it] 13%|█▎ | 1178/8750 [1:55:32<11:55:48, 5.67s/it] 13%|█▎ | 1178/8750 [1:55:39<11:55:48, 5.67s/it] {'loss': 0.479, 'learning_rate': 1.9431869957035698e-05, 'epoch': 0.13} + 13%|█▎ | 1178/8750 [1:55:39<11:55:48, 5.67s/it] {'loss': 0.479, 'learning_rate': 1.9431869957035698e-05, 'epoch': 0.13} + 13%|█▎ | 1178/8750 [1:55:32<11:55:48, 5.67s/it] 13%|█▎ | 1179/8750 [1:55:45<11:57:33, 5.69s/it] 13%|█▎ | 1179/8750 [1:55:38<11:57:34, 5.69s/it] {'loss': 0.4843, 'learning_rate': 1.9430639390925604e-05, 'epoch': 0.13} + 13%|█▎ | 
1179/8750 [1:55:45<11:57:33, 5.69s/it] {'loss': 0.4843, 'learning_rate': 1.9430639390925604e-05, 'epoch': 0.13} + 13%|█▎ | 1179/8750 [1:55:38<11:57:34, 5.69s/it] 13%|█▎ | 1180/8750 [1:55:44<12:04:14, 5.74s/it] 13%|█▎ | 1180/8750 [1:55:50<12:04:15, 5.74s/it] {'loss': 0.4825, 'learning_rate': 1.942940753260736e-05, 'epoch': 0.13} + 13%|█▎ | 1180/8750 [1:55:51<12:04:15, 5.74s/it] {'loss': 0.4825, 'learning_rate': 1.942940753260736e-05, 'epoch': 0.13} + 13%|█▎ | 1180/8750 [1:55:44<12:04:14, 5.74s/it] 13%|█▎ | 1181/8750 [1:55:50<11:59:26, 5.70s/it] 13%|█▎ | 1181/8750 [1:55:56<11:59:25, 5.70s/it] {'loss': 0.4885, 'learning_rate': 1.9428174382249764e-05, 'epoch': 0.13} + 13%|█▎ | 1181/8750 [1:55:56<11:59:25, 5.70s/it] {'loss': 0.4885, 'learning_rate': 1.9428174382249764e-05, 'epoch': 0.13} + 13%|█▎ | 1181/8750 [1:55:50<11:59:26, 5.70s/it] 14%|█▎ | 1182/8750 [1:56:02<12:11:51, 5.80s/it] 14%|█▎ | 1182/8750 [1:55:56<12:11:51, 5.80s/it] {'loss': 0.4608, 'learning_rate': 1.942693994002178e-05, 'epoch': 0.14} + 14%|█▎ | 1182/8750 [1:56:02<12:11:51, 5.80s/it] {'loss': 0.4608, 'learning_rate': 1.942693994002178e-05, 'epoch': 0.14} + 14%|█▎ | 1182/8750 [1:55:56<12:11:51, 5.80s/it] 14%|█▎ | 1183/8750 [1:56:08<12:07:29, 5.77s/it] 14%|█▎ | 1183/8750 [1:56:01<12:07:29, 5.77s/it] {'loss': 0.4976, 'learning_rate': 1.9425704206092562e-05, 'epoch': 0.14} + 14%|█▎ | 1183/8750 [1:56:08<12:07:29, 5.77s/it] {'loss': 0.4976, 'learning_rate': 1.9425704206092562e-05, 'epoch': 0.14} + 14%|█▎ | 1183/8750 [1:56:01<12:07:29, 5.77s/it] 14%|█▎ | 1184/8750 [1:56:07<12:03:51, 5.74s/it] 14%|█▎ | 1184/8750 [1:56:14<12:03:51, 5.74s/it] {'loss': 0.4948, 'learning_rate': 1.9424467180631422e-05, 'epoch': 0.14} + 14%|█▎ | 1184/8750 [1:56:14<12:03:51, 5.74s/it] {'loss': 0.4948, 'learning_rate': 1.9424467180631422e-05, 'epoch': 0.14} + 14%|█▎ | 1184/8750 [1:56:07<12:03:51, 5.74s/it] 14%|█▎ | 1185/8750 [1:56:13<11:56:08, 5.68s/it] 14%|█▎ | 1185/8750 [1:56:19<11:56:09, 5.68s/it] {'loss': 0.5119, 'learning_rate': 
1.942322886380787e-05, 'epoch': 0.14} + 14%|█▎ | 1185/8750 [1:56:19<11:56:09, 5.68s/it] {'loss': 0.5119, 'learning_rate': 1.942322886380787e-05, 'epoch': 0.14} + 14%|█▎ | 1185/8750 [1:56:13<11:56:08, 5.68s/it] 14%|█▎ | 1186/8750 [1:56:25<11:53:38, 5.66s/it] 14%|█▎ | 1186/8750 [1:56:18<11:53:38, 5.66s/it] {'loss': 0.4797, 'learning_rate': 1.942198925579158e-05, 'epoch': 0.14} + 14%|█▎ | 1186/8750 [1:56:25<11:53:38, 5.66s/it] {'loss': 0.4797, 'learning_rate': 1.942198925579158e-05, 'epoch': 0.14} + 14%|█▎ | 1186/8750 [1:56:18<11:53:38, 5.66s/it] 14%|█▎ | 1187/8750 [1:56:24<11:55:49, 5.68s/it] 14%|█▎ | 1187/8750 [1:56:30<11:55:49, 5.68s/it] {'loss': 0.4915, 'learning_rate': 1.9420748356752405e-05, 'epoch': 0.14} + 14%|█▎ | 1187/8750 [1:56:30<11:55:49, 5.68s/it] {'loss': 0.4915, 'learning_rate': 1.9420748356752405e-05, 'epoch': 0.14} + 14%|█▎ | 1187/8750 [1:56:24<11:55:49, 5.68s/it] 14%|█▎ | 1188/8750 [1:56:36<11:53:25, 5.66s/it] 14%|█▎ | 1188/8750 [1:56:30<11:53:26, 5.66s/it] {'loss': 0.4795, 'learning_rate': 1.9419506166860374e-05, 'epoch': 0.14} + 14%|█▎ | 1188/8750 [1:56:36<11:53:25, 5.66s/it] {'loss': 0.4795, 'learning_rate': 1.9419506166860374e-05, 'epoch': 0.14} + 14%|█▎ | 1188/8750 [1:56:30<11:53:26, 5.66s/it] 14%|█▎ | 1189/8750 [1:56:42<11:53:04, 5.66s/it] 14%|█▎ | 1189/8750 [1:56:35<11:53:05, 5.66s/it] {'loss': 0.4857, 'learning_rate': 1.9418262686285697e-05, 'epoch': 0.14} + 14%|█▎ | 1189/8750 [1:56:42<11:53:04, 5.66s/it] {'loss': 0.4857, 'learning_rate': 1.9418262686285697e-05, 'epoch': 0.14} + 14%|█▎ | 1189/8750 [1:56:35<11:53:05, 5.66s/it] 14%|█▎ | 1190/8750 [1:56:47<11:55:57, 5.68s/it] 14%|█▎ | 1190/8750 [1:56:41<11:55:57, 5.68s/it] {'loss': 0.4809, 'learning_rate': 1.9417017915198758e-05, 'epoch': 0.14} + 14%|█▎ | 1190/8750 [1:56:47<11:55:57, 5.68s/it] {'loss': 0.4809, 'learning_rate': 1.9417017915198758e-05, 'epoch': 0.14} + 14%|█▎ | 1190/8750 [1:56:41<11:55:57, 5.68s/it] 14%|█▎ | 1191/8750 [1:56:46<11:49:51, 5.63s/it] 14%|█▎ | 1191/8750 
[1:56:53<11:49:52, 5.63s/it] {'loss': 0.4958, 'learning_rate': 1.9415771853770117e-05, 'epoch': 0.14} + 14%|█▎ | 1191/8750 [1:56:53<11:49:52, 5.63s/it] {'loss': 0.4958, 'learning_rate': 1.9415771853770117e-05, 'epoch': 0.14} + 14%|█▎ | 1191/8750 [1:56:46<11:49:51, 5.63s/it] 14%|█▎ | 1192/8750 [1:56:59<11:50:38, 5.64s/it] 14%|█▎ | 1192/8750 [1:56:52<11:50:39, 5.64s/it] {'loss': 0.49, 'learning_rate': 1.9414524502170514e-05, 'epoch': 0.14} + 14%|█▎ | 1192/8750 [1:56:59<11:50:38, 5.64s/it] {'loss': 0.49, 'learning_rate': 1.9414524502170514e-05, 'epoch': 0.14} + 14%|█▎ | 1192/8750 [1:56:52<11:50:39, 5.64s/it] 14%|█▎ | 1193/8750 [1:56:58<11:55:51, 5.68s/it] 14%|█▎ | 1193/8750 [1:57:04<11:55:51, 5.68s/it] {'loss': 0.5236, 'learning_rate': 1.941327586057087e-05, 'epoch': 0.14} + 14%|█▎ | 1193/8750 [1:57:04<11:55:51, 5.68s/it] {'loss': 0.5236, 'learning_rate': 1.941327586057087e-05, 'epoch': 0.14} + 14%|█▎ | 1193/8750 [1:56:58<11:55:51, 5.68s/it] 14%|█▎ | 1194/8750 [1:57:10<11:55:36, 5.68s/it] 14%|█▎ | 1194/8750 [1:57:04<11:55:36, 5.68s/it] {'loss': 0.4767, 'learning_rate': 1.9412025929142263e-05, 'epoch': 0.14} + 14%|█▎ | 1194/8750 [1:57:10<11:55:36, 5.68s/it] {'loss': 0.4767, 'learning_rate': 1.9412025929142263e-05, 'epoch': 0.14} + 14%|█▎ | 1194/8750 [1:57:04<11:55:36, 5.68s/it] 14%|█▎ | 1195/8750 [1:57:16<11:52:51, 5.66s/it] 14%|█▎ | 1195/8750 [1:57:09<11:52:52, 5.66s/it] {'loss': 0.5111, 'learning_rate': 1.9410774708055972e-05, 'epoch': 0.14} + 14%|█▎ | 1195/8750 [1:57:16<11:52:51, 5.66s/it] {'loss': 0.5111, 'learning_rate': 1.9410774708055972e-05, 'epoch': 0.14} + 14%|█▎ | 1195/8750 [1:57:09<11:52:52, 5.66s/it] 14%|█▎ | 1196/8750 [1:57:15<11:58:29, 5.71s/it] 14%|█▎ | 1196/8750 [1:57:21<11:58:30, 5.71s/it] {'loss': 0.4883, 'learning_rate': 1.940952219748344e-05, 'epoch': 0.14} + 14%|█▎ | 1196/8750 [1:57:21<11:58:30, 5.71s/it] {'loss': 0.4883, 'learning_rate': 1.940952219748344e-05, 'epoch': 0.14} + 14%|█▎ | 1196/8750 [1:57:15<11:58:29, 5.71s/it] 14%|█▎ | 1197/8750 
[1:57:27<11:53:05, 5.66s/it] 14%|█▎ | 1197/8750 [1:57:21<11:53:05, 5.66s/it] {'loss': 0.4899, 'learning_rate': 1.9408268397596287e-05, 'epoch': 0.14} + 14%|█▎ | 1197/8750 [1:57:27<11:53:05, 5.66s/it] {'loss': 0.4899, 'learning_rate': 1.9408268397596287e-05, 'epoch': 0.14} + 14%|█▎ | 1197/8750 [1:57:21<11:53:05, 5.66s/it] 14%|█▎ | 1198/8750 [1:57:27<12:13:19, 5.83s/it] 14%|█▎ | 1198/8750 [1:57:33<12:13:19, 5.83s/it] {'loss': 0.4783, 'learning_rate': 1.9407013308566315e-05, 'epoch': 0.14} + 14%|█▎ | 1198/8750 [1:57:33<12:13:19, 5.83s/it] {'loss': 0.4783, 'learning_rate': 1.9407013308566315e-05, 'epoch': 0.14} + 14%|█▎ | 1198/8750 [1:57:27<12:13:19, 5.83s/it] 14%|█▎ | 1199/8750 [1:57:32<12:05:52, 5.77s/it] 14%|█▎ | 1199/8750 [1:57:39<12:05:53, 5.77s/it] {'loss': 0.5071, 'learning_rate': 1.9405756930565496e-05, 'epoch': 0.14} + 14%|█▎ | 1199/8750 [1:57:39<12:05:53, 5.77s/it] {'loss': 0.5071, 'learning_rate': 1.9405756930565496e-05, 'epoch': 0.14} + 14%|█▎ | 1199/8750 [1:57:32<12:05:52, 5.77s/it]13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 14%|█▎ | 1200/8750 [1:57:45<12:08:12, 5.79s/it]0 AutoResumeHook: Checking whether to suspend... 
+ 14%|█▎ | 1200/8750 [1:57:38<12:08:15, 5.79s/it] {'loss': 0.4763, 'learning_rate': 1.9404499263765983e-05, 'epoch': 0.14} + 14%|█▎ | 1200/8750 [1:57:45<12:08:12, 5.79s/it] {'loss': 0.4763, 'learning_rate': 1.9404499263765983e-05, 'epoch': 0.14} + 14%|█▎ | 1200/8750 [1:57:38<12:08:15, 5.79s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 14%|█▎ | 1201/8750 [1:57:58<20:47:36, 9.92s/it] 14%|█▎ | 1201/8750 [1:58:04<20:47:37, 9.92s/it] {'loss': 0.4966, 'learning_rate': 1.9403240308340105e-05, 'epoch': 0.14} + 14%|█▎ | 1201/8750 [1:58:04<20:47:37, 9.92s/it] {'loss': 0.4966, 'learning_rate': 1.9403240308340105e-05, 'epoch': 0.14} + 14%|█▎ | 1201/8750 [1:57:58<20:47:36, 9.92s/it] 14%|█▎ | 1202/8750 [1:58:03<18:07:49, 8.65s/it] 14%|█▎ | 1202/8750 [1:58:10<18:07:49, 8.65s/it] {'loss': 0.4701, 'learning_rate': 1.940198006446037e-05, 'epoch': 0.14} + 14%|█▎ | 1202/8750 [1:58:10<18:07:49, 8.65s/it] {'loss': 0.4701, 'learning_rate': 1.940198006446037e-05, 'epoch': 0.14} + 14%|█▎ | 1202/8750 [1:58:03<18:07:49, 8.65s/it] 14%|█▎ | 1203/8750 [1:58:09<16:14:05, 7.74s/it] 14%|█▎ | 1203/8750 [1:58:16<16:14:06, 7.74s/it] {'loss': 0.5047, 'learning_rate': 1.940071853229945e-05, 'epoch': 0.14} + 14%|█▎ | 1203/8750 [1:58:16<16:14:06, 7.74s/it] {'loss': 0.5047, 'learning_rate': 1.940071853229945e-05, 'epoch': 0.14} + 14%|█▎ | 1203/8750 [1:58:09<16:14:05, 7.74s/it] 14%|█▍ | 1204/8750 [1:58:15<14:55:28, 7.12s/it] 14%|█▍ | 1204/8750 [1:58:21<14:55:28, 7.12s/it] {'loss': 0.4907, 'learning_rate': 1.939945571203021e-05, 'epoch': 0.14} + 14%|█▍ | 1204/8750 [1:58:21<14:55:28, 7.12s/it] {'loss': 0.4907, 'learning_rate': 1.939945571203021e-05, 'epoch': 0.14} + 14%|█▍ | 1204/8750 [1:58:15<14:55:28, 7.12s/it] 14%|█▍ | 1205/8750 [1:58:27<14:04:33, 6.72s/it] 14%|█▍ | 1205/8750 [1:58:21<14:04:33, 6.72s/it] {'loss': 0.4777, 'learning_rate': 1.9398191603825687e-05, 'epoch': 0.14} + 14%|█▍ | 1205/8750 [1:58:27<14:04:33, 6.72s/it] {'loss': 0.4777, 'learning_rate': 1.9398191603825687e-05, 'epoch': 0.14} + 14%|█▍ | 1205/8750 [1:58:21<14:04:33, 6.72s/it] 14%|█▍ | 1206/8750 [1:58:26<13:31:48, 6.46s/it] 14%|█▍ | 1206/8750 [1:58:33<13:31:48, 6.46s/it] {'loss': 0.4816, 'learning_rate': 1.9396926207859085e-05, 'epoch': 0.14} + 14%|█▍ | 1206/8750 [1:58:33<13:31:48, 6.46s/it] {'loss': 0.4816, 'learning_rate': 1.9396926207859085e-05, 
'epoch': 0.14} + 14%|█▍ | 1206/8750 [1:58:26<13:31:48, 6.46s/it] 14%|█▍ | 1207/8750 [1:58:39<13:08:32, 6.27s/it] 14%|█▍ | 1207/8750 [1:58:32<13:08:32, 6.27s/it] {'loss': 0.492, 'learning_rate': 1.9395659524303795e-05, 'epoch': 0.14} + {'loss': 0.492, 'learning_rate': 1.9395659524303795e-05, 'epoch': 0.14} + 14%|█▍ | 1207/8750 [1:58:39<13:08:32, 6.27s/it] 14%|█▍ | 1207/8750 [1:58:32<13:08:32, 6.27s/it] 14%|█▍ | 1208/8750 [1:58:38<12:59:20, 6.20s/it] 14%|█▍ | 1208/8750 [1:58:45<12:59:21, 6.20s/it] {'loss': 0.4973, 'learning_rate': 1.9394391553333384e-05, 'epoch': 0.14} + 14%|█▍ | 1208/8750 [1:58:45<12:59:21, 6.20s/it] {'loss': 0.4973, 'learning_rate': 1.9394391553333384e-05, 'epoch': 0.14} + 14%|█▍ | 1208/8750 [1:58:38<12:59:20, 6.20s/it] 14%|█▍ | 1209/8750 [1:58:44<12:33:56, 6.00s/it] 14%|█▍ | 1209/8750 [1:58:50<12:34:01, 6.00s/it] {'loss': 0.501, 'learning_rate': 1.939312229512159e-05, 'epoch': 0.14} + 14%|█▍ | 1209/8750 [1:58:50<12:34:01, 6.00s/it] {'loss': 0.501, 'learning_rate': 1.939312229512159e-05, 'epoch': 0.14} + 14%|█▍ | 1209/8750 [1:58:44<12:33:56, 6.00s/it] 14%|█▍ | 1210/8750 [1:58:56<12:21:56, 5.90s/it] 14%|█▍ | 1210/8750 [1:58:49<12:21:57, 5.90s/it] {'loss': 0.4835, 'learning_rate': 1.9391851749842326e-05, 'epoch': 0.14} + {'loss': 0.4835, 'learning_rate': 1.9391851749842326e-05, 'epoch': 0.14} 14%|█▍ | 1210/8750 [1:58:56<12:21:56, 5.90s/it] + 14%|█▍ | 1210/8750 [1:58:49<12:21:57, 5.90s/it] 14%|█▍ | 1211/8750 [1:58:55<12:14:05, 5.84s/it] 14%|█▍ | 1211/8750 [1:59:02<12:14:05, 5.84s/it] {'loss': 0.4839, 'learning_rate': 1.939057991766969e-05, 'epoch': 0.14} + 14%|█▍ | 1211/8750 [1:59:02<12:14:05, 5.84s/it] {'loss': 0.4839, 'learning_rate': 1.939057991766969e-05, 'epoch': 0.14} + 14%|█▍ | 1211/8750 [1:58:55<12:14:05, 5.84s/it] 14%|█▍ | 1212/8750 [1:59:01<12:01:50, 5.75s/it] 14%|█▍ | 1212/8750 [1:59:07<12:01:49, 5.75s/it] {'loss': 0.4796, 'learning_rate': 1.938930679877795e-05, 'epoch': 0.14} + 14%|█▍ | 1212/8750 [1:59:07<12:01:49, 5.75s/it] {'loss': 
0.4796, 'learning_rate': 1.938930679877795e-05, 'epoch': 0.14} + 14%|█▍ | 1212/8750 [1:59:01<12:01:50, 5.75s/it] 14%|█▍ | 1213/8750 [1:59:06<12:00:13, 5.73s/it] 14%|█▍ | 1213/8750 [1:59:13<12:00:14, 5.73s/it] {'loss': 0.4854, 'learning_rate': 1.938803239334155e-05, 'epoch': 0.14} + 14%|█▍ | 1213/8750 [1:59:13<12:00:14, 5.73s/it] {'loss': 0.4854, 'learning_rate': 1.938803239334155e-05, 'epoch': 0.14} + 14%|█▍ | 1213/8750 [1:59:06<12:00:13, 5.73s/it] 14%|█▍ | 1214/8750 [1:59:19<11:56:49, 5.71s/it] 14%|█▍ | 1214/8750 [1:59:12<11:56:50, 5.71s/it] {'loss': 0.5023, 'learning_rate': 1.9386756701535115e-05, 'epoch': 0.14} + 14%|█▍ | 1214/8750 [1:59:19<11:56:49, 5.71s/it] {'loss': 0.5023, 'learning_rate': 1.9386756701535115e-05, 'epoch': 0.14} + 14%|█▍ | 1214/8750 [1:59:12<11:56:50, 5.71s/it] 14%|█▍ | 1215/8750 [1:59:18<11:59:00, 5.73s/it] 14%|█▍ | 1215/8750 [1:59:24<11:59:00, 5.73s/it] {'loss': 0.476, 'learning_rate': 1.938547972353344e-05, 'epoch': 0.14} + 14%|█▍ | 1215/8750 [1:59:24<11:59:00, 5.73s/it] {'loss': 0.476, 'learning_rate': 1.938547972353344e-05, 'epoch': 0.14} + 14%|█▍ | 1215/8750 [1:59:18<11:59:00, 5.73s/it] 14%|█▍ | 1216/8750 [1:59:24<11:58:45, 5.72s/it] 14%|█▍ | 1216/8750 [1:59:30<11:58:44, 5.72s/it] {'loss': 0.4784, 'learning_rate': 1.93842014595115e-05, 'epoch': 0.14} + 14%|█▍ | 1216/8750 [1:59:30<11:58:44, 5.72s/it] {'loss': 0.4784, 'learning_rate': 1.93842014595115e-05, 'epoch': 0.14} + 14%|█▍ | 1216/8750 [1:59:24<11:58:45, 5.72s/it] 14%|█▍ | 1217/8750 [1:59:36<11:52:04, 5.67s/it] 14%|█▍ | 1217/8750 [1:59:29<11:52:05, 5.67s/it] {'loss': 0.5077, 'learning_rate': 1.9382921909644448e-05, 'epoch': 0.14} + 14%|█▍ | 1217/8750 [1:59:36<11:52:04, 5.67s/it] {'loss': 0.5077, 'learning_rate': 1.9382921909644448e-05, 'epoch': 0.14} + 14%|█▍ | 1217/8750 [1:59:29<11:52:05, 5.67s/it] 14%|█▍ | 1218/8750 [1:59:35<11:49:37, 5.65s/it] 14%|█▍ | 1218/8750 [1:59:41<11:49:37, 5.65s/it] {'loss': 0.5086, 'learning_rate': 1.938164107410761e-05, 'epoch': 0.14} + 14%|█▍ | 
1218/8750 [1:59:41<11:49:37, 5.65s/it] {'loss': 0.5086, 'learning_rate': 1.938164107410761e-05, 'epoch': 0.14} + 14%|█▍ | 1218/8750 [1:59:35<11:49:37, 5.65s/it] 14%|█▍ | 1219/8750 [1:59:40<11:50:48, 5.66s/it] 14%|█▍ | 1219/8750 [1:59:47<11:50:48, 5.66s/it] {'loss': 0.4828, 'learning_rate': 1.938035895307649e-05, 'epoch': 0.14} + 14%|█▍ | 1219/8750 [1:59:47<11:50:48, 5.66s/it] {'loss': 0.4828, 'learning_rate': 1.938035895307649e-05, 'epoch': 0.14} + 14%|█▍ | 1219/8750 [1:59:40<11:50:48, 5.66s/it] 14%|█▍ | 1220/8750 [1:59:47<12:21:57, 5.91s/it] 14%|█▍ | 1220/8750 [1:59:53<12:21:57, 5.91s/it] {'loss': 0.4738, 'learning_rate': 1.9379075546726764e-05, 'epoch': 0.14} + 14%|█▍ | 1220/8750 [1:59:53<12:21:57, 5.91s/it] {'loss': 0.4738, 'learning_rate': 1.9379075546726764e-05, 'epoch': 0.14} + 14%|█▍ | 1220/8750 [1:59:47<12:21:57, 5.91s/it] 14%|█▍ | 1221/8750 [1:59:52<12:09:12, 5.81s/it] 14%|█▍ | 1221/8750 [1:59:59<12:09:12, 5.81s/it] {'loss': 0.484, 'learning_rate': 1.9377790855234288e-05, 'epoch': 0.14} + 14%|█▍ | 1221/8750 [1:59:59<12:09:12, 5.81s/it] {'loss': 0.484, 'learning_rate': 1.9377790855234288e-05, 'epoch': 0.14} + 14%|█▍ | 1221/8750 [1:59:52<12:09:12, 5.81s/it] 14%|█▍ | 1222/8750 [1:59:58<12:09:50, 5.82s/it] 14%|█▍ | 1222/8750 [2:00:05<12:09:50, 5.82s/it] {'loss': 0.4836, 'learning_rate': 1.9376504878775098e-05, 'epoch': 0.14} + 14%|█▍ | 1222/8750 [2:00:05<12:09:50, 5.82s/it] {'loss': 0.4836, 'learning_rate': 1.9376504878775098e-05, 'epoch': 0.14} + 14%|█▍ | 1222/8750 [1:59:58<12:09:50, 5.82s/it] 14%|█▍ | 1223/8750 [2:00:04<12:06:22, 5.79s/it] 14%|█▍ | 1223/8750 [2:00:10<12:06:23, 5.79s/it] {'loss': 0.4859, 'learning_rate': 1.9375217617525396e-05, 'epoch': 0.14} + 14%|█▍ | 1223/8750 [2:00:10<12:06:23, 5.79s/it] {'loss': 0.4859, 'learning_rate': 1.9375217617525396e-05, 'epoch': 0.14} + 14%|█▍ | 1223/8750 [2:00:04<12:06:22, 5.79s/it] 14%|█▍ | 1224/8750 [2:00:10<11:58:05, 5.72s/it] 14%|█▍ | 1224/8750 [2:00:16<11:58:07, 5.73s/it] {'loss': 0.4851, 'learning_rate': 
1.937392907166157e-05, 'epoch': 0.14} + 14%|█▍ | 1224/8750 [2:00:16<11:58:07, 5.73s/it] {'loss': 0.4851, 'learning_rate': 1.937392907166157e-05, 'epoch': 0.14} + 14%|█▍ | 1224/8750 [2:00:10<11:58:05, 5.72s/it] 14%|█▍ | 1225/8750 [2:00:16<12:12:21, 5.84s/it] 14%|█▍ | 1225/8750 [2:00:22<12:12:20, 5.84s/it] {'loss': 0.4709, 'learning_rate': 1.9372639241360173e-05, 'epoch': 0.14} + 14%|█▍ | 1225/8750 [2:00:22<12:12:20, 5.84s/it] {'loss': 0.4709, 'learning_rate': 1.9372639241360173e-05, 'epoch': 0.14} + 14%|█▍ | 1225/8750 [2:00:16<12:12:21, 5.84s/it] 14%|█▍ | 1226/8750 [2:00:28<12:09:27, 5.82s/it] 14%|█▍ | 1226/8750 [2:00:21<12:09:29, 5.82s/it] {'loss': 0.5075, 'learning_rate': 1.937134812679795e-05, 'epoch': 0.14} + 14%|█▍ | 1226/8750 [2:00:28<12:09:27, 5.82s/it] {'loss': 0.5075, 'learning_rate': 1.937134812679795e-05, 'epoch': 0.14} + 14%|█▍ | 1226/8750 [2:00:21<12:09:29, 5.82s/it] 14%|█▍ | 1227/8750 [2:00:27<12:01:31, 5.75s/it] 14%|█▍ | 1227/8750 [2:00:34<12:01:31, 5.75s/it] {'loss': 0.5, 'learning_rate': 1.9370055728151805e-05, 'epoch': 0.14} + 14%|█▍ | 1227/8750 [2:00:34<12:01:31, 5.75s/it] {'loss': 0.5, 'learning_rate': 1.9370055728151805e-05, 'epoch': 0.14} + 14%|█▍ | 1227/8750 [2:00:27<12:01:31, 5.75s/it] 14%|█▍ | 1228/8750 [2:00:33<11:58:21, 5.73s/it] 14%|█▍ | 1228/8750 [2:00:39<11:58:21, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.936876204559883e-05, 'epoch': 0.14} + 14%|█▍ | 1228/8750 [2:00:39<11:58:21, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.936876204559883e-05, 'epoch': 0.14} + 14%|█▍ | 1228/8750 [2:00:33<11:58:21, 5.73s/it] 14%|█▍ | 1229/8750 [2:00:45<11:53:45, 5.69s/it] 14%|█▍ | 1229/8750 [2:00:38<11:53:45, 5.69s/it] {'loss': 0.5111, 'learning_rate': 1.936746707931628e-05, 'epoch': 0.14} + 14%|█▍ | 1229/8750 [2:00:45<11:53:45, 5.69s/it] {'loss': 0.5111, 'learning_rate': 1.936746707931628e-05, 'epoch': 0.14} + 14%|█▍ | 1229/8750 [2:00:38<11:53:45, 5.69s/it] 14%|█▍ | 1230/8750 [2:00:51<12:08:30, 5.81s/it] 14%|█▍ | 1230/8750 [2:00:44<12:08:31, 
5.81s/it] {'loss': 0.4642, 'learning_rate': 1.9366170829481607e-05, 'epoch': 0.14} + 14%|█▍ | 1230/8750 [2:00:51<12:08:30, 5.81s/it] {'loss': 0.4642, 'learning_rate': 1.9366170829481607e-05, 'epoch': 0.14} + 14%|█▍ | 1230/8750 [2:00:44<12:08:31, 5.81s/it] 14%|█▍ | 1231/8750 [2:00:51<12:21:13, 5.91s/it] 14%|█▍ | 1231/8750 [2:00:57<12:21:13, 5.91s/it] {'loss': 0.4755, 'learning_rate': 1.9364873296272414e-05, 'epoch': 0.14} + 14%|█▍ | 1231/8750 [2:00:57<12:21:13, 5.91s/it] {'loss': 0.4755, 'learning_rate': 1.9364873296272414e-05, 'epoch': 0.14} + 14%|█▍ | 1231/8750 [2:00:51<12:21:13, 5.91s/it] 14%|█▍ | 1232/8750 [2:00:56<12:08:57, 5.82s/it] 14%|█▍ | 1232/8750 [2:01:03<12:08:57, 5.82s/it] {'loss': 0.4973, 'learning_rate': 1.9363574479866504e-05, 'epoch': 0.14} + 14%|█▍ | 1232/8750 [2:01:03<12:08:57, 5.82s/it] {'loss': 0.4973, 'learning_rate': 1.9363574479866504e-05, 'epoch': 0.14} + 14%|█▍ | 1232/8750 [2:00:56<12:08:57, 5.82s/it] 14%|█▍ | 1233/8750 [2:01:02<12:02:03, 5.76s/it] 14%|█▍ | 1233/8750 [2:01:08<12:02:04, 5.76s/it] {'loss': 0.4738, 'learning_rate': 1.936227438044183e-05, 'epoch': 0.14} + 14%|█▍ | 1233/8750 [2:01:08<12:02:04, 5.76s/it] {'loss': 0.4738, 'learning_rate': 1.936227438044183e-05, 'epoch': 0.14} + 14%|█▍ | 1233/8750 [2:01:02<12:02:03, 5.76s/it] 14%|█▍ | 1234/8750 [2:01:14<12:01:23, 5.76s/it] 14%|█▍ | 1234/8750 [2:01:08<12:01:24, 5.76s/it] {'loss': 0.4883, 'learning_rate': 1.9360972998176547e-05, 'epoch': 0.14} + 14%|█▍ | 1234/8750 [2:01:14<12:01:23, 5.76s/it] {'loss': 0.4883, 'learning_rate': 1.9360972998176547e-05, 'epoch': 0.14} + 14%|█▍ | 1234/8750 [2:01:08<12:01:24, 5.76s/it] 14%|█▍ | 1235/8750 [2:01:13<12:04:22, 5.78s/it] 14%|█▍ | 1235/8750 [2:01:20<12:04:22, 5.78s/it] {'loss': 0.4855, 'learning_rate': 1.9359670333248967e-05, 'epoch': 0.14} + 14%|█▍ | 1235/8750 [2:01:20<12:04:22, 5.78s/it] {'loss': 0.4855, 'learning_rate': 1.9359670333248967e-05, 'epoch': 0.14} + 14%|█▍ | 1235/8750 [2:01:13<12:04:22, 5.78s/it] 14%|█▍ | 1236/8750 
[2:01:19<12:01:29, 5.76s/it] 14%|█▍ | 1236/8750 [2:01:26<12:01:29, 5.76s/it] {'loss': 0.503, 'learning_rate': 1.935836638583759e-05, 'epoch': 0.14} + 14%|█▍ | 1236/8750 [2:01:26<12:01:29, 5.76s/it] {'loss': 0.503, 'learning_rate': 1.935836638583759e-05, 'epoch': 0.14} + 14%|█▍ | 1236/8750 [2:01:19<12:01:29, 5.76s/it] 14%|█▍ | 1237/8750 [2:01:25<11:56:28, 5.72s/it] 14%|█▍ | 1237/8750 [2:01:31<11:56:30, 5.72s/it] {'loss': 0.4941, 'learning_rate': 1.935706115612108e-05, 'epoch': 0.14} + 14%|█▍ | 1237/8750 [2:01:31<11:56:30, 5.72s/it] {'loss': 0.4941, 'learning_rate': 1.935706115612108e-05, 'epoch': 0.14} + 14%|█▍ | 1237/8750 [2:01:25<11:56:28, 5.72s/it] 14%|█▍ | 1238/8750 [2:01:37<11:50:59, 5.68s/it] 14%|█▍ | 1238/8750 [2:01:30<11:51:00, 5.68s/it] {'loss': 0.4675, 'learning_rate': 1.935575464427828e-05, 'epoch': 0.14} + 14%|█▍ | 1238/8750 [2:01:37<11:50:59, 5.68s/it] {'loss': 0.4675, 'learning_rate': 1.935575464427828e-05, 'epoch': 0.14} + 14%|█▍ | 1238/8750 [2:01:30<11:51:00, 5.68s/it] 14%|█▍ | 1239/8750 [2:01:36<11:48:06, 5.66s/it] 14%|█▍ | 1239/8750 [2:01:42<11:48:06, 5.66s/it] {'loss': 0.468, 'learning_rate': 1.9354446850488216e-05, 'epoch': 0.14} + 14%|█▍ | 1239/8750 [2:01:42<11:48:06, 5.66s/it] {'loss': 0.468, 'learning_rate': 1.9354446850488216e-05, 'epoch': 0.14} + 14%|█▍ | 1239/8750 [2:01:36<11:48:06, 5.66s/it] 14%|█▍ | 1240/8750 [2:01:41<11:44:04, 5.63s/it] 14%|█▍ | 1240/8750 [2:01:48<11:44:04, 5.63s/it] {'loss': 0.5015, 'learning_rate': 1.9353137774930085e-05, 'epoch': 0.14} + 14%|█▍ | 1240/8750 [2:01:48<11:44:04, 5.63s/it] {'loss': 0.5015, 'learning_rate': 1.9353137774930085e-05, 'epoch': 0.14} + 14%|█▍ | 1240/8750 [2:01:41<11:44:04, 5.63s/it] 14%|█▍ | 1241/8750 [2:01:47<11:48:09, 5.66s/it] 14%|█▍ | 1241/8750 [2:01:54<11:48:09, 5.66s/it] {'loss': 0.4915, 'learning_rate': 1.935182741778326e-05, 'epoch': 0.14} + 14%|█▍ | 1241/8750 [2:01:54<11:48:09, 5.66s/it] {'loss': 0.4915, 'learning_rate': 1.935182741778326e-05, 'epoch': 0.14} + 14%|█▍ | 1241/8750 
[2:01:47<11:48:09, 5.66s/it] 14%|█▍ | 1242/8750 [2:01:53<11:51:53, 5.69s/it] 14%|█▍ | 1242/8750 [2:01:59<11:51:53, 5.69s/it] {'loss': 0.4781, 'learning_rate': 1.9350515779227294e-05, 'epoch': 0.14} + 14%|█▍ | 1242/8750 [2:01:59<11:51:53, 5.69s/it] {'loss': 0.4781, 'learning_rate': 1.9350515779227294e-05, 'epoch': 0.14} + 14%|█▍ | 1242/8750 [2:01:53<11:51:53, 5.69s/it] 14%|█▍ | 1243/8750 [2:02:05<11:48:32, 5.66s/it] 14%|█▍ | 1243/8750 [2:01:59<11:48:33, 5.66s/it] {'loss': 0.4798, 'learning_rate': 1.93492028594419e-05, 'epoch': 0.14} + 14%|█▍ | 1243/8750 [2:01:59<11:48:33, 5.66s/it] {'loss': 0.4798, 'learning_rate': 1.93492028594419e-05, 'epoch': 0.14} + 14%|█▍ | 1243/8750 [2:02:05<11:48:32, 5.66s/it] 14%|█▍ | 1244/8750 [2:02:04<11:47:39, 5.66s/it] 14%|█▍ | 1244/8750 [2:02:11<11:47:39, 5.66s/it] {'loss': 0.4966, 'learning_rate': 1.934788865860698e-05, 'epoch': 0.14} + 14%|█▍ | 1244/8750 [2:02:11<11:47:39, 5.66s/it] {'loss': 0.4966, 'learning_rate': 1.934788865860698e-05, 'epoch': 0.14} + 14%|█▍ | 1244/8750 [2:02:04<11:47:39, 5.66s/it] 14%|█▍ | 1245/8750 [2:02:10<11:47:30, 5.66s/it] 14%|█▍ | 1245/8750 [2:02:16<11:47:32, 5.66s/it] {'loss': 0.4916, 'learning_rate': 1.9346573176902616e-05, 'epoch': 0.14} + 14%|█▍ | 1245/8750 [2:02:16<11:47:32, 5.66s/it] {'loss': 0.4916, 'learning_rate': 1.9346573176902616e-05, 'epoch': 0.14} + 14%|█▍ | 1245/8750 [2:02:10<11:47:30, 5.66s/it] 14%|█▍ | 1246/8750 [2:02:16<11:48:32, 5.67s/it] 14%|█▍ | 1246/8750 [2:02:22<11:48:32, 5.67s/it] {'loss': 0.4928, 'learning_rate': 1.934525641450905e-05, 'epoch': 0.14} + 14%|█▍ | 1246/8750 [2:02:22<11:48:32, 5.67s/it] {'loss': 0.4928, 'learning_rate': 1.934525641450905e-05, 'epoch': 0.14} + 14%|█▍ | 1246/8750 [2:02:16<11:48:32, 5.67s/it] 14%|█▍ | 1247/8750 [2:02:21<11:51:21, 5.69s/it] 14%|█▍ | 1247/8750 [2:02:28<11:51:21, 5.69s/it] {'loss': 0.4975, 'learning_rate': 1.9343938371606714e-05, 'epoch': 0.14} + 14%|█▍ | 1247/8750 [2:02:28<11:51:21, 5.69s/it] {'loss': 0.4975, 'learning_rate': 
1.9343938371606714e-05, 'epoch': 0.14} + 14%|█▍ | 1247/8750 [2:02:21<11:51:21, 5.69s/it] 14%|█▍ | 1248/8750 [2:02:27<11:48:32, 5.67s/it] 14%|█▍ | 1248/8750 [2:02:33<11:48:37, 5.67s/it] {'loss': 0.4795, 'learning_rate': 1.9342619048376202e-05, 'epoch': 0.14} + 14%|█▍ | 1248/8750 [2:02:33<11:48:37, 5.67s/it] {'loss': 0.4795, 'learning_rate': 1.9342619048376202e-05, 'epoch': 0.14} + 14%|█▍ | 1248/8750 [2:02:27<11:48:32, 5.67s/it] 14%|█▍ | 1249/8750 [2:02:33<12:01:12, 5.77s/it] 14%|█▍ | 1249/8750 [2:02:39<12:01:12, 5.77s/it] {'loss': 0.4847, 'learning_rate': 1.93412984449983e-05, 'epoch': 0.14} + 14%|█▍ | 1249/8750 [2:02:39<12:01:12, 5.77s/it] {'loss': 0.4847, 'learning_rate': 1.93412984449983e-05, 'epoch': 0.14} + 14%|█▍ | 1249/8750 [2:02:33<12:01:12, 5.77s/it]13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +17 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 14%|█▍ | 1250/8750 [2:02:45<11:56:31, 5.73s/it]9 14%|█▍ | 1250/8750 [2:02:39<11:56:33, 5.73s/it]AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4977, 'learning_rate': 1.9339976561653956e-05, 'epoch': 0.14} + 14%|█▍ | 1250/8750 [2:02:45<11:56:31, 5.73s/it] {'loss': 0.4977, 'learning_rate': 1.9339976561653956e-05, 'epoch': 0.14} + 14%|█▍ | 1250/8750 [2:02:39<11:56:33, 5.73s/it] 14%|█▍ | 1251/8750 [2:02:44<11:55:56, 5.73s/it] 14%|█▍ | 1251/8750 [2:02:51<11:55:55, 5.73s/it] {'loss': 0.4751, 'learning_rate': 1.9338653398524295e-05, 'epoch': 0.14} + 14%|█▍ | 1251/8750 [2:02:51<11:55:55, 5.73s/it] {'loss': 0.4751, 'learning_rate': 1.9338653398524295e-05, 'epoch': 0.14} + 14%|█▍ | 1251/8750 [2:02:44<11:55:56, 5.73s/it] 14%|█▍ | 1252/8750 [2:02:50<11:53:44, 5.71s/it] 14%|█▍ | 1252/8750 [2:02:56<11:53:43, 5.71s/it] {'loss': 0.4929, 'learning_rate': 1.933732895579062e-05, 'epoch': 0.14} + 14%|█▍ | 1252/8750 [2:02:56<11:53:43, 5.71s/it] {'loss': 0.4929, 'learning_rate': 1.933732895579062e-05, 'epoch': 0.14} + 14%|█▍ | 1252/8750 [2:02:50<11:53:44, 5.71s/it] 14%|█▍ | 1253/8750 [2:02:56<12:02:16, 5.78s/it] 14%|█▍ | 1253/8750 [2:03:02<12:02:16, 5.78s/it] {'loss': 0.4971, 'learning_rate': 1.933600323363442e-05, 'epoch': 0.14} + 14%|█▍ | 1253/8750 [2:03:02<12:02:16, 5.78s/it] {'loss': 0.4971, 'learning_rate': 1.933600323363442e-05, 'epoch': 0.14} + 14%|█▍ | 1253/8750 [2:02:56<12:02:16, 5.78s/it] 14%|█▍ | 1254/8750 [2:03:08<12:01:07, 5.77s/it] 14%|█▍ | 1254/8750 [2:03:02<12:01:09, 5.77s/it] {'loss': 0.4927, 'learning_rate': 1.933467623223733e-05, 'epoch': 0.14} + 14%|█▍ | 1254/8750 [2:03:08<12:01:07, 5.77s/it] {'loss': 0.4927, 'learning_rate': 1.933467623223733e-05, 'epoch': 0.14} + 14%|█▍ | 1254/8750 [2:03:02<12:01:09, 5.77s/it] 14%|█▍ | 1255/8750 [2:03:07<11:55:02, 5.72s/it] 14%|█▍ | 1255/8750 [2:03:14<11:55:02, 5.72s/it] {'loss': 0.4969, 'learning_rate': 1.9333347951781194e-05, 'epoch': 0.14} + 14%|█▍ | 1255/8750 [2:03:14<11:55:02, 5.72s/it] {'loss': 0.4969, 'learning_rate': 1.9333347951781194e-05, 'epoch': 0.14} + 14%|█▍ | 1255/8750 [2:03:07<11:55:02, 5.72s/it] 14%|█▍ | 1256/8750 [2:03:13<11:54:56, 5.72s/it] 
14%|█▍ | 1256/8750 [2:03:19<11:54:56, 5.72s/it] {'loss': 0.4833, 'learning_rate': 1.933201839244801e-05, 'epoch': 0.14} + 14%|█▍ | 1256/8750 [2:03:19<11:54:56, 5.72s/it] {'loss': 0.4833, 'learning_rate': 1.933201839244801e-05, 'epoch': 0.14} + 14%|█▍ | 1256/8750 [2:03:13<11:54:56, 5.72s/it] 14%|█▍ | 1257/8750 [2:03:19<11:58:31, 5.75s/it] 14%|█▍ | 1257/8750 [2:03:25<11:58:31, 5.75s/it] {'loss': 0.4656, 'learning_rate': 1.9330687554419956e-05, 'epoch': 0.14} + 14%|█▍ | 1257/8750 [2:03:25<11:58:31, 5.75s/it] {'loss': 0.4656, 'learning_rate': 1.9330687554419956e-05, 'epoch': 0.14} + 14%|█▍ | 1257/8750 [2:03:19<11:58:31, 5.75s/it] 14%|█▍ | 1258/8750 [2:03:24<11:53:46, 5.72s/it] 14%|█▍ | 1258/8750 [2:03:31<11:53:46, 5.72s/it] {'loss': 0.496, 'learning_rate': 1.932935543787939e-05, 'epoch': 0.14} + 14%|█▍ | 1258/8750 [2:03:31<11:53:46, 5.72s/it] {'loss': 0.496, 'learning_rate': 1.932935543787939e-05, 'epoch': 0.14} + 14%|█▍ | 1258/8750 [2:03:24<11:53:46, 5.72s/it] 14%|█▍ | 1259/8750 [2:03:30<11:58:40, 5.76s/it] 14%|█▍ | 1259/8750 [2:03:37<11:58:39, 5.76s/it] {'loss': 0.4697, 'learning_rate': 1.9328022043008842e-05, 'epoch': 0.14} + 14%|█▍ | 1259/8750 [2:03:37<11:58:39, 5.76s/it] {'loss': 0.4697, 'learning_rate': 1.9328022043008842e-05, 'epoch': 0.14} + 14%|█▍ | 1259/8750 [2:03:30<11:58:40, 5.76s/it] 14%|█▍ | 1260/8750 [2:03:36<12:01:35, 5.78s/it] 14%|█▍ | 1260/8750 [2:03:43<12:01:35, 5.78s/it] {'loss': 0.4901, 'learning_rate': 1.9326687369991012e-05, 'epoch': 0.14} + 14%|█▍ | 1260/8750 [2:03:43<12:01:35, 5.78s/it] {'loss': 0.4901, 'learning_rate': 1.9326687369991012e-05, 'epoch': 0.14} + 14%|█▍ | 1260/8750 [2:03:36<12:01:35, 5.78s/it] 14%|█▍ | 1261/8750 [2:03:48<11:55:52, 5.74s/it] 14%|█▍ | 1261/8750 [2:03:42<11:55:53, 5.74s/it] {'loss': 0.4935, 'learning_rate': 1.9325351419008783e-05, 'epoch': 0.14} + 14%|█▍ | 1261/8750 [2:03:48<11:55:52, 5.74s/it] {'loss': 0.4935, 'learning_rate': 1.9325351419008783e-05, 'epoch': 0.14} + 14%|█▍ | 1261/8750 [2:03:42<11:55:53, 5.74s/it] 
14%|█▍ | 1262/8750 [2:03:47<11:54:23, 5.72s/it] 14%|█▍ | 1262/8750 [2:03:54<11:54:24, 5.72s/it] {'loss': 0.5023, 'learning_rate': 1.932401419024521e-05, 'epoch': 0.14} + 14%|█▍ | 1262/8750 [2:03:54<11:54:24, 5.72s/it] {'loss': 0.5023, 'learning_rate': 1.932401419024521e-05, 'epoch': 0.14} + 14%|█▍ | 1262/8750 [2:03:47<11:54:23, 5.72s/it] 14%|█▍ | 1263/8750 [2:03:53<11:52:26, 5.71s/it] 14%|█▍ | 1263/8750 [2:04:00<11:52:26, 5.71s/it] {'loss': 0.4864, 'learning_rate': 1.9322675683883528e-05, 'epoch': 0.14} + 14%|█▍ | 1263/8750 [2:04:00<11:52:26, 5.71s/it] {'loss': 0.4864, 'learning_rate': 1.9322675683883528e-05, 'epoch': 0.14} + 14%|█▍ | 1263/8750 [2:03:53<11:52:26, 5.71s/it] 14%|█▍ | 1264/8750 [2:03:59<11:57:26, 5.75s/it] 14%|█▍ | 1264/8750 [2:04:05<11:57:26, 5.75s/it] {'loss': 0.4949, 'learning_rate': 1.9321335900107134e-05, 'epoch': 0.14} + 14%|█▍ | 1264/8750 [2:04:05<11:57:26, 5.75s/it] {'loss': 0.4949, 'learning_rate': 1.9321335900107134e-05, 'epoch': 0.14} + 14%|█▍ | 1264/8750 [2:03:59<11:57:26, 5.75s/it] 14%|█▍ | 1265/8750 [2:04:05<11:50:45, 5.70s/it] 14%|█▍ | 1265/8750 [2:04:11<11:50:45, 5.70s/it] {'loss': 0.4694, 'learning_rate': 1.931999483909961e-05, 'epoch': 0.14} + 14%|█▍ | 1265/8750 [2:04:11<11:50:45, 5.70s/it] {'loss': 0.4694, 'learning_rate': 1.931999483909961e-05, 'epoch': 0.14} + 14%|█▍ | 1265/8750 [2:04:05<11:50:45, 5.70s/it] 14%|█▍ | 1266/8750 [2:04:10<11:55:52, 5.74s/it] 14%|█▍ | 1266/8750 [2:04:17<11:55:52, 5.74s/it] {'loss': 0.4915, 'learning_rate': 1.9318652501044715e-05, 'epoch': 0.14} + 14%|█▍ | 1266/8750 [2:04:17<11:55:52, 5.74s/it] {'loss': 0.4915, 'learning_rate': 1.9318652501044715e-05, 'epoch': 0.14} + 14%|█▍ | 1266/8750 [2:04:10<11:55:52, 5.74s/it] 14%|█▍ | 1267/8750 [2:04:16<11:52:02, 5.71s/it] 14%|█▍ | 1267/8750 [2:04:23<11:52:01, 5.71s/it] {'loss': 0.492, 'learning_rate': 1.931730888612638e-05, 'epoch': 0.14} + 14%|█▍ | 1267/8750 [2:04:23<11:52:01, 5.71s/it] {'loss': 0.492, 'learning_rate': 1.931730888612638e-05, 'epoch': 0.14} + 
14%|█▍ | 1267/8750 [2:04:16<11:52:02, 5.71s/it] 14%|█▍ | 1268/8750 [2:04:28<11:52:21, 5.71s/it] 14%|█▍ | 1268/8750 [2:04:22<11:52:21, 5.71s/it] {'loss': 0.4826, 'learning_rate': 1.9315963994528707e-05, 'epoch': 0.14} + 14%|█▍ | 1268/8750 [2:04:28<11:52:21, 5.71s/it] {'loss': 0.4826, 'learning_rate': 1.9315963994528707e-05, 'epoch': 0.14} + 14%|█▍ | 1268/8750 [2:04:22<11:52:21, 5.71s/it] 15%|█▍ | 1269/8750 [2:04:28<11:54:52, 5.73s/it] 15%|█▍ | 1269/8750 [2:04:34<11:54:52, 5.73s/it] {'loss': 0.4878, 'learning_rate': 1.931461782643598e-05, 'epoch': 0.15} + 15%|█▍ | 1269/8750 [2:04:34<11:54:52, 5.73s/it] {'loss': 0.4878, 'learning_rate': 1.931461782643598e-05, 'epoch': 0.15} + 15%|█▍ | 1269/8750 [2:04:28<11:54:52, 5.73s/it] 15%|█▍ | 1270/8750 [2:04:33<11:49:11, 5.69s/it] 15%|█▍ | 1270/8750 [2:04:40<11:49:11, 5.69s/it] {'loss': 0.5038, 'learning_rate': 1.9313270382032644e-05, 'epoch': 0.15} + 15%|█▍ | 1270/8750 [2:04:40<11:49:11, 5.69s/it] {'loss': 0.5038, 'learning_rate': 1.9313270382032644e-05, 'epoch': 0.15} + 15%|█▍ | 1270/8750 [2:04:33<11:49:11, 5.69s/it] 15%|█▍ | 1271/8750 [2:04:46<12:16:52, 5.91s/it] 15%|█▍ | 1271/8750 [2:04:40<12:16:53, 5.91s/it] {'loss': 0.4938, 'learning_rate': 1.9311921661503338e-05, 'epoch': 0.15} + 15%|█▍ | 1271/8750 [2:04:46<12:16:52, 5.91s/it] {'loss': 0.4938, 'learning_rate': 1.9311921661503338e-05, 'epoch': 0.15} + 15%|█▍ | 1271/8750 [2:04:40<12:16:53, 5.91s/it] 15%|█▍ | 1272/8750 [2:04:45<12:12:54, 5.88s/it] 15%|█▍ | 1272/8750 [2:04:52<12:12:54, 5.88s/it] {'loss': 0.487, 'learning_rate': 1.9310571665032867e-05, 'epoch': 0.15} + 15%|█▍ | 1272/8750 [2:04:45<12:12:54, 5.88s/it]{'loss': 0.487, 'learning_rate': 1.9310571665032867e-05, 'epoch': 0.15} + 15%|█▍ | 1272/8750 [2:04:52<12:12:54, 5.88s/it] 15%|█▍ | 1273/8750 [2:04:51<12:03:12, 5.80s/it] 15%|█▍ | 1273/8750 [2:04:57<12:03:12, 5.80s/it] {'loss': 0.4885, 'learning_rate': 1.9309220392806206e-05, 'epoch': 0.15} + 15%|█▍ | 1273/8750 [2:04:57<12:03:12, 5.80s/it] {'loss': 0.4885, 
'learning_rate': 1.9309220392806206e-05, 'epoch': 0.15} + 15%|█▍ | 1273/8750 [2:04:51<12:03:12, 5.80s/it] 15%|█▍ | 1274/8750 [2:05:03<12:08:34, 5.85s/it] 15%|█▍ | 1274/8750 [2:04:57<12:08:34, 5.85s/it] {'loss': 0.4796, 'learning_rate': 1.9307867845008513e-05, 'epoch': 0.15} + 15%|█▍ | 1274/8750 [2:05:03<12:08:34, 5.85s/it] {'loss': 0.4796, 'learning_rate': 1.9307867845008513e-05, 'epoch': 0.15} + 15%|█▍ | 1274/8750 [2:04:57<12:08:34, 5.85s/it] 15%|█▍ | 1275/8750 [2:05:03<12:05:04, 5.82s/it] 15%|█▍ | 1275/8750 [2:05:09<12:05:04, 5.82s/it] {'loss': 0.4803, 'learning_rate': 1.930651402182512e-05, 'epoch': 0.15} + 15%|█▍ | 1275/8750 [2:05:09<12:05:04, 5.82s/it] {'loss': 0.4803, 'learning_rate': 1.930651402182512e-05, 'epoch': 0.15} + 15%|█▍ | 1275/8750 [2:05:03<12:05:04, 5.82s/it] 15%|█▍ | 1276/8750 [2:05:08<11:57:33, 5.76s/it] 15%|█▍ | 1276/8750 [2:05:15<11:57:34, 5.76s/it] {'loss': 0.4871, 'learning_rate': 1.9305158923441524e-05, 'epoch': 0.15} + 15%|█▍ | 1276/8750 [2:05:15<11:57:34, 5.76s/it] {'loss': 0.4871, 'learning_rate': 1.9305158923441524e-05, 'epoch': 0.15} + 15%|█▍ | 1276/8750 [2:05:08<11:57:33, 5.76s/it] 15%|█▍ | 1277/8750 [2:05:20<11:50:57, 5.71s/it] 15%|█▍ | 1277/8750 [2:05:14<11:50:57, 5.71s/it] {'loss': 0.4888, 'learning_rate': 1.9303802550043404e-05, 'epoch': 0.15} + 15%|█▍ | 1277/8750 [2:05:20<11:50:57, 5.71s/it] {'loss': 0.4888, 'learning_rate': 1.9303802550043404e-05, 'epoch': 0.15} + 15%|█▍ | 1277/8750 [2:05:14<11:50:57, 5.71s/it] 15%|█▍ | 1278/8750 [2:05:19<11:44:09, 5.65s/it] 15%|█▍ | 1278/8750 [2:05:26<11:44:08, 5.65s/it] {'loss': 0.4926, 'learning_rate': 1.930244490181662e-05, 'epoch': 0.15} + 15%|█▍ | 1278/8750 [2:05:26<11:44:08, 5.65s/it] {'loss': 0.4926, 'learning_rate': 1.930244490181662e-05, 'epoch': 0.15} + 15%|█▍ | 1278/8750 [2:05:19<11:44:09, 5.65s/it] 15%|█▍ | 1279/8750 [2:05:32<11:45:58, 5.67s/it] 15%|█▍ | 1279/8750 [2:05:25<11:45:59, 5.67s/it] {'loss': 0.4892, 'learning_rate': 1.9301085978947195e-05, 'epoch': 0.15} + 15%|█▍ | 
1279/8750 [2:05:32<11:45:58, 5.67s/it] {'loss': 0.4892, 'learning_rate': 1.9301085978947195e-05, 'epoch': 0.15} + 15%|█▍ | 1279/8750 [2:05:25<11:45:59, 5.67s/it] 15%|█▍ | 1280/8750 [2:05:37<11:43:16, 5.65s/it] 15%|█▍ | 1280/8750 [2:05:31<11:43:16, 5.65s/it] {'loss': 0.4975, 'learning_rate': 1.9299725781621335e-05, 'epoch': 0.15} + 15%|█▍ | 1280/8750 [2:05:37<11:43:16, 5.65s/it] {'loss': 0.4975, 'learning_rate': 1.9299725781621335e-05, 'epoch': 0.15} + 15%|█▍ | 1280/8750 [2:05:31<11:43:16, 5.65s/it] 15%|█▍ | 1281/8750 [2:05:36<11:41:33, 5.64s/it] 15%|█▍ | 1281/8750 [2:05:43<11:41:34, 5.64s/it] {'loss': 0.4898, 'learning_rate': 1.9298364310025412e-05, 'epoch': 0.15} + 15%|█▍ | 1281/8750 [2:05:43<11:41:34, 5.64s/it] {'loss': 0.4898, 'learning_rate': 1.9298364310025412e-05, 'epoch': 0.15} + 15%|█▍ | 1281/8750 [2:05:36<11:41:33, 5.64s/it] 15%|█▍ | 1282/8750 [2:05:49<12:00:09, 5.79s/it] 15%|█▍ | 1282/8750 [2:05:42<12:00:09, 5.79s/it] {'loss': 0.472, 'learning_rate': 1.929700156434599e-05, 'epoch': 0.15} + 15%|█▍ | 1282/8750 [2:05:49<12:00:09, 5.79s/it] {'loss': 0.472, 'learning_rate': 1.929700156434599e-05, 'epoch': 0.15} + 15%|█▍ | 1282/8750 [2:05:42<12:00:09, 5.79s/it] 15%|█▍ | 1283/8750 [2:05:48<12:08:12, 5.85s/it] 15%|█▍ | 1283/8750 [2:05:55<12:08:12, 5.85s/it] {'loss': 0.4749, 'learning_rate': 1.929563754476978e-05, 'epoch': 0.15} + 15%|█▍ | 1283/8750 [2:05:55<12:08:12, 5.85s/it] {'loss': 0.4749, 'learning_rate': 1.929563754476978e-05, 'epoch': 0.15} + 15%|█▍ | 1283/8750 [2:05:48<12:08:12, 5.85s/it] 15%|█▍ | 1284/8750 [2:05:54<12:01:27, 5.80s/it] 15%|█▍ | 1284/8750 [2:06:01<12:01:28, 5.80s/it] {'loss': 0.4967, 'learning_rate': 1.929427225148369e-05, 'epoch': 0.15} + 15%|█▍ | 1284/8750 [2:06:01<12:01:28, 5.80s/it] {'loss': 0.4967, 'learning_rate': 1.929427225148369e-05, 'epoch': 0.15} + 15%|█▍ | 1284/8750 [2:05:54<12:01:27, 5.80s/it] 15%|█▍ | 1285/8750 [2:06:06<11:56:48, 5.76s/it] 15%|█▍ | 1285/8750 [2:06:00<11:56:49, 5.76s/it] {'loss': 0.4912, 'learning_rate': 
1.92929056846748e-05, 'epoch': 0.15} + 15%|█▍ | 1285/8750 [2:06:06<11:56:48, 5.76s/it] {'loss': 0.4912, 'learning_rate': 1.92929056846748e-05, 'epoch': 0.15} + 15%|█▍ | 1285/8750 [2:06:00<11:56:49, 5.76s/it] 15%|█▍ | 1286/8750 [2:06:05<11:51:37, 5.72s/it] 15%|█▍ | 1286/8750 [2:06:12<11:51:37, 5.72s/it] {'loss': 0.4963, 'learning_rate': 1.9291537844530352e-05, 'epoch': 0.15} + 15%|█▍ | 1286/8750 [2:06:12<11:51:37, 5.72s/it] {'loss': 0.4963, 'learning_rate': 1.9291537844530352e-05, 'epoch': 0.15} + 15%|█▍ | 1286/8750 [2:06:05<11:51:37, 5.72s/it] 15%|█▍ | 1287/8750 [2:06:11<11:45:34, 5.67s/it] 15%|█▍ | 1287/8750 [2:06:17<11:45:34, 5.67s/it] {'loss': 0.5038, 'learning_rate': 1.9290168731237776e-05, 'epoch': 0.15} + 15%|█▍ | 1287/8750 [2:06:17<11:45:34, 5.67s/it] {'loss': 0.5038, 'learning_rate': 1.9290168731237776e-05, 'epoch': 0.15} + 15%|█▍ | 1287/8750 [2:06:11<11:45:34, 5.67s/it] 15%|█▍ | 1288/8750 [2:06:17<11:49:43, 5.71s/it] 15%|█▍ | 1288/8750 [2:06:23<11:49:43, 5.71s/it] {'loss': 0.4706, 'learning_rate': 1.9288798344984673e-05, 'epoch': 0.15} + 15%|█▍ | 1288/8750 [2:06:23<11:49:43, 5.71s/it] {'loss': 0.4706, 'learning_rate': 1.9288798344984673e-05, 'epoch': 0.15} + 15%|█▍ | 1288/8750 [2:06:17<11:49:43, 5.71s/it] 15%|█▍ | 1289/8750 [2:06:29<11:54:06, 5.74s/it] 15%|█▍ | 1289/8750 [2:06:23<11:54:08, 5.74s/it] {'loss': 0.498, 'learning_rate': 1.928742668595881e-05, 'epoch': 0.15} + 15%|█▍ | 1289/8750 [2:06:29<11:54:06, 5.74s/it] {'loss': 0.498, 'learning_rate': 1.928742668595881e-05, 'epoch': 0.15} + 15%|█▍ | 1289/8750 [2:06:23<11:54:08, 5.74s/it] 15%|█▍ | 1290/8750 [2:06:35<11:49:04, 5.70s/it] 15%|█▍ | 1290/8750 [2:06:28<11:49:04, 5.70s/it] {'loss': 0.488, 'learning_rate': 1.9286053754348142e-05, 'epoch': 0.15} + 15%|█▍ | 1290/8750 [2:06:35<11:49:04, 5.70s/it] {'loss': 0.488, 'learning_rate': 1.9286053754348142e-05, 'epoch': 0.15} + 15%|█▍ | 1290/8750 [2:06:28<11:49:04, 5.70s/it] 15%|█▍ | 1291/8750 [2:06:40<11:46:42, 5.68s/it] 15%|█▍ | 1291/8750 [2:06:34<11:46:42, 
5.68s/it] {'loss': 0.4734, 'learning_rate': 1.9284679550340783e-05, 'epoch': 0.15} + 15%|█▍ | 1291/8750 [2:06:40<11:46:42, 5.68s/it] {'loss': 0.4734, 'learning_rate': 1.9284679550340783e-05, 'epoch': 0.15} + 15%|█▍ | 1291/8750 [2:06:34<11:46:42, 5.68s/it] 15%|█▍ | 1292/8750 [2:06:40<11:47:25, 5.69s/it] 15%|█▍ | 1292/8750 [2:06:46<11:47:26, 5.69s/it] {'loss': 0.4883, 'learning_rate': 1.928330407412504e-05, 'epoch': 0.15} + 15%|█▍ | 1292/8750 [2:06:46<11:47:26, 5.69s/it] {'loss': 0.4883, 'learning_rate': 1.928330407412504e-05, 'epoch': 0.15} + 15%|█▍ | 1292/8750 [2:06:40<11:47:25, 5.69s/it] 15%|█▍ | 1293/8750 [2:06:45<11:42:14, 5.65s/it] 15%|█▍ | 1293/8750 [2:06:52<11:42:14, 5.65s/it] {'loss': 0.4742, 'learning_rate': 1.9281927325889373e-05, 'epoch': 0.15} + 15%|█▍ | 1293/8750 [2:06:52<11:42:14, 5.65s/it] {'loss': 0.4742, 'learning_rate': 1.9281927325889373e-05, 'epoch': 0.15} + 15%|█▍ | 1293/8750 [2:06:45<11:42:14, 5.65s/it] 15%|█▍ | 1294/8750 [2:06:51<11:46:00, 5.68s/it] 15%|█▍ | 1294/8750 [2:06:57<11:46:01, 5.68s/it] {'loss': 0.499, 'learning_rate': 1.9280549305822435e-05, 'epoch': 0.15} + 15%|█▍ | 1294/8750 [2:06:57<11:46:01, 5.68s/it] {'loss': 0.499, 'learning_rate': 1.9280549305822435e-05, 'epoch': 0.15} + 15%|█▍ | 1294/8750 [2:06:51<11:46:00, 5.68s/it] 15%|█▍ | 1295/8750 [2:07:03<11:41:35, 5.65s/it] 15%|█▍ | 1295/8750 [2:06:56<11:41:35, 5.65s/it] {'loss': 0.4847, 'learning_rate': 1.927917001411304e-05, 'epoch': 0.15} + 15%|█▍ | 1295/8750 [2:07:03<11:41:35, 5.65s/it] {'loss': 0.4847, 'learning_rate': 1.927917001411304e-05, 'epoch': 0.15} + 15%|█▍ | 1295/8750 [2:06:56<11:41:35, 5.65s/it] 15%|█▍ | 1296/8750 [2:07:02<11:38:29, 5.62s/it] 15%|█▍ | 1296/8750 [2:07:08<11:38:30, 5.62s/it] {'loss': 0.4661, 'learning_rate': 1.9277789450950187e-05, 'epoch': 0.15} + 15%|█▍ | 1296/8750 [2:07:08<11:38:30, 5.62s/it] {'loss': 0.4661, 'learning_rate': 1.9277789450950187e-05, 'epoch': 0.15} + 15%|█▍ | 1296/8750 [2:07:02<11:38:29, 5.62s/it] 15%|█▍ | 1297/8750 [2:07:08<11:37:39, 
5.62s/it] 15%|█▍ | 1297/8750 [2:07:14<11:37:40, 5.62s/it] {'loss': 0.4792, 'learning_rate': 1.9276407616523044e-05, 'epoch': 0.15} + 15%|█▍ | 1297/8750 [2:07:14<11:37:40, 5.62s/it] {'loss': 0.4792, 'learning_rate': 1.9276407616523044e-05, 'epoch': 0.15} + 15%|█▍ | 1297/8750 [2:07:08<11:37:39, 5.62s/it] 15%|█▍ | 1298/8750 [2:07:13<11:40:36, 5.64s/it] 15%|█▍ | 1298/8750 [2:07:20<11:40:36, 5.64s/it] {'loss': 0.4986, 'learning_rate': 1.927502451102095e-05, 'epoch': 0.15} + 15%|█▍ | 1298/8750 [2:07:20<11:40:36, 5.64s/it] {'loss': 0.4986, 'learning_rate': 1.927502451102095e-05, 'epoch': 0.15} + 15%|█▍ | 1298/8750 [2:07:13<11:40:36, 5.64s/it] 15%|█▍ | 1299/8750 [2:07:26<11:57:18, 5.78s/it] 15%|█▍ | 1299/8750 [2:07:19<11:57:19, 5.78s/it] {'loss': 0.4912, 'learning_rate': 1.927364013463342e-05, 'epoch': 0.15} + 15%|█▍ | 1299/8750 [2:07:26<11:57:18, 5.78s/it] {'loss': 0.4912, 'learning_rate': 1.927364013463342e-05, 'epoch': 0.15} + 15%|█▍ | 1299/8750 [2:07:19<11:57:19, 5.78s/it]13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1211 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 15%|█▍ | 1300/8750 [2:07:25<11:54:19, 5.75s/it]1 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... 
+ 15%|█▍ | 1300/8750 [2:07:32<11:54:19, 5.75s/it] {'loss': 0.4729, 'learning_rate': 1.9272254487550144e-05, 'epoch': 0.15} + 15%|█▍ | 1300/8750 [2:07:32<11:54:19, 5.75s/it] {'loss': 0.4729, 'learning_rate': 1.9272254487550144e-05, 'epoch': 0.15} + 15%|█▍ | 1300/8750 [2:07:25<11:54:19, 5.75s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 15%|█▍ | 1301/8750 [2:07:45<20:28:39, 9.90s/it] 15%|█▍ | 1301/8750 [2:07:51<20:28:39, 9.90s/it] {'loss': 0.4785, 'learning_rate': 1.9270867569960994e-05, 'epoch': 0.15} + 15%|█▍ | 1301/8750 [2:07:51<20:28:39, 9.90s/it] {'loss': 0.4785, 'learning_rate': 1.9270867569960994e-05, 'epoch': 0.15} + 15%|█▍ | 1301/8750 [2:07:45<20:28:39, 9.90s/it] 15%|█▍ | 1302/8750 [2:07:50<17:50:35, 8.62s/it] 15%|█▍ | 1302/8750 [2:07:57<17:50:36, 8.62s/it] {'loss': 0.5064, 'learning_rate': 1.9269479382056008e-05, 'epoch': 0.15} + 15%|█▍ | 1302/8750 [2:07:57<17:50:36, 8.62s/it] {'loss': 0.5064, 'learning_rate': 1.9269479382056008e-05, 'epoch': 0.15} + 15%|█▍ | 1302/8750 [2:07:50<17:50:35, 8.62s/it] 15%|█▍ | 1303/8750 [2:08:03<16:01:53, 7.75s/it] 15%|█▍ | 1303/8750 [2:07:56<16:01:54, 7.75s/it] {'loss': 0.4841, 'learning_rate': 1.926808992402539e-05, 'epoch': 0.15} + 15%|█▍ | 1303/8750 [2:08:03<16:01:53, 7.75s/it] {'loss': 0.4841, 'learning_rate': 1.926808992402539e-05, 'epoch': 0.15} + 15%|█▍ | 1303/8750 [2:07:56<16:01:54, 7.75s/it] 15%|█▍ | 1304/8750 [2:08:02<14:43:18, 7.12s/it] 15%|█▍ | 1304/8750 [2:08:08<14:43:19, 7.12s/it] {'loss': 0.4717, 'learning_rate': 1.926669919605953e-05, 'epoch': 0.15} + 15%|█▍ | 1304/8750 [2:08:08<14:43:19, 7.12s/it] {'loss': 0.4717, 'learning_rate': 1.926669919605953e-05, 'epoch': 0.15} + 15%|█▍ | 1304/8750 [2:08:02<14:43:18, 7.12s/it] 15%|█▍ | 1305/8750 [2:08:07<13:47:56, 6.67s/it] 15%|█▍ | 1305/8750 [2:08:14<13:47:55, 6.67s/it] {'loss': 0.4853, 'learning_rate': 1.926530719834899e-05, 'epoch': 0.15} + 15%|█▍ | 1305/8750 [2:08:14<13:47:55, 6.67s/it] {'loss': 0.4853, 'learning_rate': 1.926530719834899e-05, 'epoch': 0.15} + 15%|█▍ | 1305/8750 [2:08:07<13:47:56, 6.67s/it] 15%|█▍ | 1306/8750 [2:08:13<13:15:35, 6.41s/it] 15%|█▍ | 1306/8750 [2:08:20<13:15:35, 6.41s/it] {'loss': 0.4704, 'learning_rate': 1.9263913931084507e-05, 'epoch': 0.15} + 15%|█▍ | 1306/8750 [2:08:20<13:15:35, 6.41s/it] {'loss': 0.4704, 'learning_rate': 1.9263913931084507e-05, 
'epoch': 0.15} + 15%|█▍ | 1306/8750 [2:08:13<13:15:35, 6.41s/it] 15%|█▍ | 1307/8750 [2:08:19<12:45:15, 6.17s/it] 15%|█▍ | 1307/8750 [2:08:25<12:45:15, 6.17s/it] {'loss': 0.5001, 'learning_rate': 1.9262519394456985e-05, 'epoch': 0.15} + 15%|█▍ | 1307/8750 [2:08:25<12:45:15, 6.17s/it] {'loss': 0.5001, 'learning_rate': 1.9262519394456985e-05, 'epoch': 0.15} + 15%|█▍ | 1307/8750 [2:08:19<12:45:15, 6.17s/it] 15%|█▍ | 1308/8750 [2:08:31<12:32:53, 6.07s/it] 15%|█▍ | 1308/8750 [2:08:25<12:32:53, 6.07s/it] {'loss': 0.4755, 'learning_rate': 1.9261123588657514e-05, 'epoch': 0.15} + 15%|█▍ | 1308/8750 [2:08:31<12:32:53, 6.07s/it] {'loss': 0.4755, 'learning_rate': 1.9261123588657514e-05, 'epoch': 0.15} + 15%|█▍ | 1308/8750 [2:08:25<12:32:53, 6.07s/it] 15%|█▍ | 1309/8750 [2:08:30<12:20:10, 5.97s/it] 15%|█▍ | 1309/8750 [2:08:37<12:20:10, 5.97s/it] {'loss': 0.4841, 'learning_rate': 1.925972651387734e-05, 'epoch': 0.15} + 15%|█▍ | 1309/8750 [2:08:37<12:20:10, 5.97s/it] {'loss': 0.4841, 'learning_rate': 1.925972651387734e-05, 'epoch': 0.15} + 15%|█▍ | 1309/8750 [2:08:30<12:20:10, 5.97s/it] 15%|█▍ | 1310/8750 [2:08:36<12:04:15, 5.84s/it] 15%|█▍ | 1310/8750 [2:08:42<12:04:15, 5.84s/it] {'loss': 0.5065, 'learning_rate': 1.9258328170307905e-05, 'epoch': 0.15} + 15%|█▍ | 1310/8750 [2:08:42<12:04:15, 5.84s/it] {'loss': 0.5065, 'learning_rate': 1.9258328170307905e-05, 'epoch': 0.15} + 15%|█▍ | 1310/8750 [2:08:36<12:04:15, 5.84s/it] 15%|█▍ | 1311/8750 [2:08:41<11:56:18, 5.78s/it] 15%|█▍ | 1311/8750 [2:08:48<11:56:18, 5.78s/it] {'loss': 0.4867, 'learning_rate': 1.9256928558140806e-05, 'epoch': 0.15} + 15%|█▍ | 1311/8750 [2:08:48<11:56:18, 5.78s/it] {'loss': 0.4867, 'learning_rate': 1.9256928558140806e-05, 'epoch': 0.15} + 15%|█▍ | 1311/8750 [2:08:41<11:56:18, 5.78s/it] 15%|█▍ | 1312/8750 [2:08:47<11:58:02, 5.79s/it] 15%|█▍ | 1312/8750 [2:08:54<11:58:03, 5.79s/it] {'loss': 0.4879, 'learning_rate': 1.925552767756782e-05, 'epoch': 0.15} + 15%|█▍ | 1312/8750 [2:08:54<11:58:03, 5.79s/it] {'loss': 
0.4879, 'learning_rate': 1.925552767756782e-05, 'epoch': 0.15} + 15%|█▍ | 1312/8750 [2:08:47<11:58:02, 5.79s/it] 15%|█▌ | 1313/8750 [2:08:53<11:54:20, 5.76s/it] 15%|█▌ | 1313/8750 [2:08:59<11:54:21, 5.76s/it] {'loss': 0.4927, 'learning_rate': 1.9254125528780908e-05, 'epoch': 0.15} + 15%|█▌ | 1313/8750 [2:08:59<11:54:21, 5.76s/it] {'loss': 0.4927, 'learning_rate': 1.9254125528780908e-05, 'epoch': 0.15} + 15%|█▌ | 1313/8750 [2:08:53<11:54:20, 5.76s/it] 15%|█▌ | 1314/8750 [2:08:59<11:49:38, 5.73s/it] 15%|█▌ | 1314/8750 [2:09:05<11:49:39, 5.73s/it] {'loss': 0.5016, 'learning_rate': 1.9252722111972182e-05, 'epoch': 0.15} + 15%|█▌ | 1314/8750 [2:09:05<11:49:39, 5.73s/it] {'loss': 0.5016, 'learning_rate': 1.9252722111972182e-05, 'epoch': 0.15} + 15%|█▌ | 1314/8750 [2:08:59<11:49:38, 5.73s/it] 15%|█▌ | 1315/8750 [2:09:04<11:52:57, 5.75s/it] 15%|█▌ | 1315/8750 [2:09:11<11:52:57, 5.75s/it] {'loss': 0.5064, 'learning_rate': 1.9251317427333953e-05, 'epoch': 0.15} + 15%|█▌ | 1315/8750 [2:09:11<11:52:57, 5.75s/it] {'loss': 0.5064, 'learning_rate': 1.9251317427333953e-05, 'epoch': 0.15} + 15%|█▌ | 1315/8750 [2:09:04<11:52:57, 5.75s/it] 15%|█▌ | 1316/8750 [2:09:10<11:52:36, 5.75s/it] 15%|█▌ | 1316/8750 [2:09:17<11:52:35, 5.75s/it] {'loss': 0.4779, 'learning_rate': 1.924991147505869e-05, 'epoch': 0.15} + 15%|█▌ | 1316/8750 [2:09:17<11:52:35, 5.75s/it] {'loss': 0.4779, 'learning_rate': 1.924991147505869e-05, 'epoch': 0.15} + 15%|█▌ | 1316/8750 [2:09:10<11:52:36, 5.75s/it] 15%|█▌ | 1317/8750 [2:09:16<11:55:04, 5.77s/it] 15%|█▌ | 1317/8750 [2:09:22<11:55:05, 5.77s/it] {'loss': 0.4678, 'learning_rate': 1.924850425533904e-05, 'epoch': 0.15} + 15%|█▌ | 1317/8750 [2:09:22<11:55:05, 5.77s/it] {'loss': 0.4678, 'learning_rate': 1.924850425533904e-05, 'epoch': 0.15} + 15%|█▌ | 1317/8750 [2:09:16<11:55:04, 5.77s/it] 15%|█▌ | 1318/8750 [2:09:28<12:02:37, 5.83s/it] 15%|█▌ | 1318/8750 [2:09:22<12:02:39, 5.83s/it] {'loss': 0.4932, 'learning_rate': 1.9247095768367822e-05, 'epoch': 0.15} + 15%|█▌ | 
1318/8750 [2:09:28<12:02:37, 5.83s/it] {'loss': 0.4932, 'learning_rate': 1.9247095768367822e-05, 'epoch': 0.15} + 15%|█▌ | 1318/8750 [2:09:22<12:02:39, 5.83s/it] 15%|█▌ | 1319/8750 [2:09:34<11:52:08, 5.75s/it] 15%|█▌ | 1319/8750 [2:09:28<11:52:09, 5.75s/it] {'loss': 0.4863, 'learning_rate': 1.924568601433803e-05, 'epoch': 0.15} + 15%|█▌ | 1319/8750 [2:09:34<11:52:08, 5.75s/it] {'loss': 0.4863, 'learning_rate': 1.924568601433803e-05, 'epoch': 0.15} + 15%|█▌ | 1319/8750 [2:09:28<11:52:09, 5.75s/it] 15%|█▌ | 1320/8750 [2:09:40<11:48:13, 5.72s/it] 15%|█▌ | 1320/8750 [2:09:33<11:48:13, 5.72s/it] {'loss': 0.484, 'learning_rate': 1.9244274993442836e-05, 'epoch': 0.15} + 15%|█▌ | 1320/8750 [2:09:40<11:48:13, 5.72s/it] {'loss': 0.484, 'learning_rate': 1.9244274993442836e-05, 'epoch': 0.15} + 15%|█▌ | 1320/8750 [2:09:33<11:48:13, 5.72s/it] 15%|█▌ | 1321/8750 [2:09:45<11:47:11, 5.71s/it] 15%|█▌ | 1321/8750 [2:09:39<11:47:12, 5.71s/it] {'loss': 0.4773, 'learning_rate': 1.924286270587558e-05, 'epoch': 0.15} + 15%|█▌ | 1321/8750 [2:09:45<11:47:11, 5.71s/it] {'loss': 0.4773, 'learning_rate': 1.924286270587558e-05, 'epoch': 0.15} + 15%|█▌ | 1321/8750 [2:09:39<11:47:12, 5.71s/it] 15%|█▌ | 1322/8750 [2:09:45<11:49:34, 5.73s/it] 15%|█▌ | 1322/8750 [2:09:51<11:49:35, 5.73s/it] {'loss': 0.4771, 'learning_rate': 1.924144915182977e-05, 'epoch': 0.15} + 15%|█▌ | 1322/8750 [2:09:51<11:49:35, 5.73s/it] {'loss': 0.4771, 'learning_rate': 1.924144915182977e-05, 'epoch': 0.15} + 15%|█▌ | 1322/8750 [2:09:45<11:49:34, 5.73s/it] 15%|█▌ | 1323/8750 [2:09:51<12:02:31, 5.84s/it] 15%|█▌ | 1323/8750 [2:09:57<12:02:37, 5.84s/it] {'loss': 0.4836, 'learning_rate': 1.9240034331499105e-05, 'epoch': 0.15} + 15%|█▌ | 1323/8750 [2:09:57<12:02:37, 5.84s/it] {'loss': 0.4836, 'learning_rate': 1.9240034331499105e-05, 'epoch': 0.15} + 15%|█▌ | 1323/8750 [2:09:51<12:02:31, 5.84s/it] 15%|█▌ | 1324/8750 [2:09:56<11:55:55, 5.78s/it] 15%|█▌ | 1324/8750 [2:10:03<11:55:54, 5.78s/it] {'loss': 0.4933, 'learning_rate': 
1.923861824507744e-05, 'epoch': 0.15} + 15%|█▌ | 1324/8750 [2:10:03<11:55:54, 5.78s/it] {'loss': 0.4933, 'learning_rate': 1.923861824507744e-05, 'epoch': 0.15} + 15%|█▌ | 1324/8750 [2:09:56<11:55:55, 5.78s/it] 15%|█▌ | 1325/8750 [2:10:09<12:11:57, 5.91s/it] 15%|█▌ | 1325/8750 [2:10:03<12:11:58, 5.92s/it] {'loss': 0.4814, 'learning_rate': 1.9237200892758814e-05, 'epoch': 0.15} + 15%|█▌ | 1325/8750 [2:10:09<12:11:57, 5.91s/it] {'loss': 0.4814, 'learning_rate': 1.9237200892758814e-05, 'epoch': 0.15} + 15%|█▌ | 1325/8750 [2:10:03<12:11:58, 5.92s/it] 15%|█▌ | 1326/8750 [2:10:08<12:00:27, 5.82s/it] 15%|█▌ | 1326/8750 [2:10:15<12:00:26, 5.82s/it] {'loss': 0.477, 'learning_rate': 1.923578227473743e-05, 'epoch': 0.15} + 15%|█▌ | 1326/8750 [2:10:15<12:00:26, 5.82s/it] {'loss': 0.477, 'learning_rate': 1.923578227473743e-05, 'epoch': 0.15} + 15%|█▌ | 1326/8750 [2:10:08<12:00:27, 5.82s/it] 15%|█▌ | 1327/8750 [2:10:14<11:50:17, 5.74s/it] 15%|█▌ | 1327/8750 [2:10:20<11:50:16, 5.74s/it] {'loss': 0.4851, 'learning_rate': 1.923436239120768e-05, 'epoch': 0.15} + 15%|█▌ | 1327/8750 [2:10:20<11:50:16, 5.74s/it] {'loss': 0.4851, 'learning_rate': 1.923436239120768e-05, 'epoch': 0.15} + 15%|█▌ | 1327/8750 [2:10:14<11:50:17, 5.74s/it] 15%|█▌ | 1328/8750 [2:10:26<11:41:49, 5.67s/it] 15%|█▌ | 1328/8750 [2:10:19<11:41:50, 5.67s/it] {'loss': 0.4971, 'learning_rate': 1.9232941242364114e-05, 'epoch': 0.15} + 15%|█▌ | 1328/8750 [2:10:26<11:41:49, 5.67s/it] {'loss': 0.4971, 'learning_rate': 1.9232941242364114e-05, 'epoch': 0.15} + 15%|█▌ | 1328/8750 [2:10:19<11:41:50, 5.67s/it] 15%|█▌ | 1329/8750 [2:10:25<11:45:59, 5.71s/it] 15%|█▌ | 1329/8750 [2:10:32<11:45:58, 5.71s/it] {'loss': 0.488, 'learning_rate': 1.9231518828401458e-05, 'epoch': 0.15} + 15%|█▌ | 1329/8750 [2:10:32<11:45:58, 5.71s/it] {'loss': 0.488, 'learning_rate': 1.9231518828401458e-05, 'epoch': 0.15} + 15%|█▌ | 1329/8750 [2:10:25<11:45:59, 5.71s/it] 15%|█▌ | 1330/8750 [2:10:38<11:56:11, 5.79s/it] 15%|█▌ | 1330/8750 [2:10:31<11:56:13, 
5.79s/it] {'loss': 0.4949, 'learning_rate': 1.923009514951462e-05, 'epoch': 0.15} + 15%|█▌ | 1330/8750 [2:10:31<11:56:13, 5.79s/it] {'loss': 0.4949, 'learning_rate': 1.923009514951462e-05, 'epoch': 0.15} + 15%|█▌ | 1330/8750 [2:10:38<11:56:11, 5.79s/it] 15%|█▌ | 1331/8750 [2:10:37<11:49:46, 5.74s/it] 15%|█▌ | 1331/8750 [2:10:43<11:49:46, 5.74s/it] {'loss': 0.4887, 'learning_rate': 1.9228670205898675e-05, 'epoch': 0.15} + 15%|█▌ | 1331/8750 [2:10:43<11:49:46, 5.74s/it] {'loss': 0.4887, 'learning_rate': 1.9228670205898675e-05, 'epoch': 0.15} + 15%|█▌ | 1331/8750 [2:10:37<11:49:46, 5.74s/it] 15%|█▌ | 1332/8750 [2:10:49<11:47:42, 5.72s/it] 15%|█▌ | 1332/8750 [2:10:42<11:47:42, 5.72s/it] {'loss': 0.4823, 'learning_rate': 1.922724399774887e-05, 'epoch': 0.15} + 15%|█▌ | 1332/8750 [2:10:49<11:47:42, 5.72s/it] {'loss': 0.4823, 'learning_rate': 1.922724399774887e-05, 'epoch': 0.15} + 15%|█▌ | 1332/8750 [2:10:42<11:47:42, 5.72s/it] 15%|█▌ | 1333/8750 [2:10:55<11:47:36, 5.72s/it] 15%|█▌ | 1333/8750 [2:10:48<11:47:37, 5.72s/it] {'loss': 0.4923, 'learning_rate': 1.9225816525260626e-05, 'epoch': 0.15} + 15%|█▌ | 1333/8750 [2:10:55<11:47:36, 5.72s/it] {'loss': 0.4923, 'learning_rate': 1.9225816525260626e-05, 'epoch': 0.15} + 15%|█▌ | 1333/8750 [2:10:48<11:47:37, 5.72s/it] 15%|█▌ | 1334/8750 [2:10:54<11:47:11, 5.72s/it] 15%|█▌ | 1334/8750 [2:11:00<11:47:12, 5.72s/it] {'loss': 0.4817, 'learning_rate': 1.9224387788629547e-05, 'epoch': 0.15} + 15%|█▌ | 1334/8750 [2:11:00<11:47:12, 5.72s/it] {'loss': 0.4817, 'learning_rate': 1.9224387788629547e-05, 'epoch': 0.15} + 15%|█▌ | 1334/8750 [2:10:54<11:47:11, 5.72s/it] 15%|█▌ | 1335/8750 [2:11:06<11:44:28, 5.70s/it] 15%|█▌ | 1335/8750 [2:10:59<11:44:28, 5.70s/it] {'loss': 0.4834, 'learning_rate': 1.922295778805139e-05, 'epoch': 0.15} + 15%|█▌ | 1335/8750 [2:11:06<11:44:28, 5.70s/it] {'loss': 0.4834, 'learning_rate': 1.922295778805139e-05, 'epoch': 0.15} + 15%|█▌ | 1335/8750 [2:10:59<11:44:28, 5.70s/it] 15%|█▌ | 1336/8750 [2:11:05<11:46:00, 
5.71s/it] 15%|█▌ | 1336/8750 [2:11:12<11:46:01, 5.71s/it] {'loss': 0.5067, 'learning_rate': 1.9221526523722104e-05, 'epoch': 0.15} + 15%|█▌ | 1336/8750 [2:11:12<11:46:01, 5.71s/it] {'loss': 0.5067, 'learning_rate': 1.9221526523722104e-05, 'epoch': 0.15} + 15%|█▌ | 1336/8750 [2:11:05<11:46:00, 5.71s/it] 15%|█▌ | 1337/8750 [2:11:11<11:46:20, 5.72s/it] 15%|█▌ | 1337/8750 [2:11:17<11:46:20, 5.72s/it] {'loss': 0.4721, 'learning_rate': 1.9220093995837805e-05, 'epoch': 0.15} + 15%|█▌ | 1337/8750 [2:11:17<11:46:20, 5.72s/it] {'loss': 0.4721, 'learning_rate': 1.9220093995837805e-05, 'epoch': 0.15} + 15%|█▌ | 1337/8750 [2:11:11<11:46:20, 5.72s/it] 15%|█▌ | 1338/8750 [2:11:17<11:52:54, 5.77s/it] 15%|█▌ | 1338/8750 [2:11:23<11:52:55, 5.77s/it] {'loss': 0.5119, 'learning_rate': 1.9218660204594778e-05, 'epoch': 0.15} + 15%|█▌ | 1338/8750 [2:11:23<11:52:55, 5.77s/it] {'loss': 0.5119, 'learning_rate': 1.9218660204594778e-05, 'epoch': 0.15} + 15%|█▌ | 1338/8750 [2:11:17<11:52:54, 5.77s/it] 15%|█▌ | 1339/8750 [2:11:29<11:50:27, 5.75s/it] 15%|█▌ | 1339/8750 [2:11:23<11:50:28, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.9217225150189483e-05, 'epoch': 0.15} + 15%|█▌ | 1339/8750 [2:11:29<11:50:27, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.9217225150189483e-05, 'epoch': 0.15} + 15%|█▌ | 1339/8750 [2:11:23<11:50:28, 5.75s/it] 15%|█▌ | 1340/8750 [2:11:29<12:01:08, 5.84s/it] 15%|█▌ | 1340/8750 [2:11:35<12:01:09, 5.84s/it] {'loss': 0.4769, 'learning_rate': 1.921578883281856e-05, 'epoch': 0.15} + 15%|█▌ | 1340/8750 [2:11:35<12:01:09, 5.84s/it]{'loss': 0.4769, 'learning_rate': 1.921578883281856e-05, 'epoch': 0.15} + 15%|█▌ | 1340/8750 [2:11:29<12:01:08, 5.84s/it] 15%|█▌ | 1341/8750 [2:11:35<12:11:20, 5.92s/it] 15%|█▌ | 1341/8750 [2:11:41<12:11:20, 5.92s/it] {'loss': 0.4706, 'learning_rate': 1.9214351252678815e-05, 'epoch': 0.15} + 15%|█▌ | 1341/8750 [2:11:41<12:11:20, 5.92s/it] {'loss': 0.4706, 'learning_rate': 1.9214351252678815e-05, 'epoch': 0.15} + 15%|█▌ | 1341/8750 [2:11:35<12:11:20, 
5.92s/it] 15%|█▌ | 1342/8750 [2:11:40<12:03:06, 5.86s/it] 15%|█▌ | 1342/8750 [2:11:47<12:03:05, 5.86s/it] {'loss': 0.4812, 'learning_rate': 1.9212912409967223e-05, 'epoch': 0.15} + 15%|█▌ | 1342/8750 [2:11:47<12:03:05, 5.86s/it] {'loss': 0.4812, 'learning_rate': 1.9212912409967223e-05, 'epoch': 0.15} + 15%|█▌ | 1342/8750 [2:11:40<12:03:06, 5.86s/it] 15%|█▌ | 1343/8750 [2:11:46<11:56:51, 5.81s/it] 15%|█▌ | 1343/8750 [2:11:53<11:56:51, 5.81s/it] {'loss': 0.4874, 'learning_rate': 1.9211472304880945e-05, 'epoch': 0.15} + 15%|█▌ | 1343/8750 [2:11:53<11:56:51, 5.81s/it] {'loss': 0.4874, 'learning_rate': 1.9211472304880945e-05, 'epoch': 0.15} + 15%|█▌ | 1343/8750 [2:11:46<11:56:51, 5.81s/it] 15%|█▌ | 1344/8750 [2:11:52<11:59:41, 5.83s/it] 15%|█▌ | 1344/8750 [2:11:58<11:59:41, 5.83s/it] {'loss': 0.5001, 'learning_rate': 1.9210030937617303e-05, 'epoch': 0.15} + 15%|█▌ | 1344/8750 [2:11:58<11:59:41, 5.83s/it] {'loss': 0.5001, 'learning_rate': 1.9210030937617303e-05, 'epoch': 0.15} + 15%|█▌ | 1344/8750 [2:11:52<11:59:41, 5.83s/it] 15%|█▌ | 1345/8750 [2:11:58<11:51:27, 5.76s/it] 15%|█▌ | 1345/8750 [2:12:04<11:51:28, 5.76s/it] {'loss': 0.477, 'learning_rate': 1.9208588308373798e-05, 'epoch': 0.15} + 15%|█▌ | 1345/8750 [2:12:04<11:51:28, 5.76s/it] {'loss': 0.477, 'learning_rate': 1.9208588308373798e-05, 'epoch': 0.15} + 15%|█▌ | 1345/8750 [2:11:58<11:51:27, 5.76s/it] 15%|█▌ | 1346/8750 [2:12:03<11:48:44, 5.74s/it] 15%|█▌ | 1346/8750 [2:12:10<11:48:45, 5.74s/it] {'loss': 0.5098, 'learning_rate': 1.9207144417348103e-05, 'epoch': 0.15} + 15%|█▌ | 1346/8750 [2:12:10<11:48:45, 5.74s/it] {'loss': 0.5098, 'learning_rate': 1.9207144417348103e-05, 'epoch': 0.15} + 15%|█▌ | 1346/8750 [2:12:03<11:48:44, 5.74s/it] 15%|█▌ | 1347/8750 [2:12:15<11:45:12, 5.72s/it] 15%|█▌ | 1347/8750 [2:12:09<11:45:13, 5.72s/it] {'loss': 0.4901, 'learning_rate': 1.9205699264738063e-05, 'epoch': 0.15} + 15%|█▌ | 1347/8750 [2:12:15<11:45:12, 5.72s/it] {'loss': 0.4901, 'learning_rate': 1.9205699264738063e-05, 
'epoch': 0.15} + 15%|█▌ | 1347/8750 [2:12:09<11:45:13, 5.72s/it] 15%|█▌ | 1348/8750 [2:12:21<11:48:01, 5.74s/it] 15%|█▌ | 1348/8750 [2:12:15<11:48:02, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.9204252850741695e-05, 'epoch': 0.15} + 15%|█▌ | 1348/8750 [2:12:21<11:48:01, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.9204252850741695e-05, 'epoch': 0.15} + 15%|█▌ | 1348/8750 [2:12:15<11:48:02, 5.74s/it] 15%|█▌ | 1349/8750 [2:12:20<11:42:40, 5.70s/it] 15%|█▌ | 1349/8750 [2:12:27<11:42:40, 5.70s/it] {'loss': 0.4806, 'learning_rate': 1.920280517555719e-05, 'epoch': 0.15} + 15%|█▌ | 1349/8750 [2:12:27<11:42:40, 5.70s/it] {'loss': 0.4806, 'learning_rate': 1.920280517555719e-05, 'epoch': 0.15} + 15%|█▌ | 1349/8750 [2:12:20<11:42:40, 5.70s/it]9 AutoResumeHook: Checking whether to suspend... +35 4AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1013 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + + +62 7AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + + 15%|█▌ | 1350/8750 [2:12:33<11:44:32, 5.71s/it]01 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... 
+ 15%|█▌ | 1350/8750 [2:12:26<11:44:33, 5.71s/it] {'loss': 0.4831, 'learning_rate': 1.9201356239382914e-05, 'epoch': 0.15} + 15%|█▌ | 1350/8750 [2:12:33<11:44:32, 5.71s/it] {'loss': 0.4831, 'learning_rate': 1.9201356239382914e-05, 'epoch': 0.15} + 15%|█▌ | 1350/8750 [2:12:26<11:44:33, 5.71s/it] 15%|█▌ | 1351/8750 [2:12:32<11:40:33, 5.68s/it] 15%|█▌ | 1351/8750 [2:12:38<11:40:33, 5.68s/it] {'loss': 0.4894, 'learning_rate': 1.9199906042417403e-05, 'epoch': 0.15} + 15%|█▌ | 1351/8750 [2:12:38<11:40:33, 5.68s/it] {'loss': 0.4894, 'learning_rate': 1.9199906042417403e-05, 'epoch': 0.15} + 15%|█▌ | 1351/8750 [2:12:32<11:40:33, 5.68s/it] 15%|█▌ | 1352/8750 [2:12:38<11:49:51, 5.76s/it] 15%|█▌ | 1352/8750 [2:12:44<11:49:52, 5.76s/it] {'loss': 0.4902, 'learning_rate': 1.919845458485936e-05, 'epoch': 0.15} + 15%|█▌ | 1352/8750 [2:12:44<11:49:52, 5.76s/it] {'loss': 0.4902, 'learning_rate': 1.919845458485936e-05, 'epoch': 0.15} + 15%|█▌ | 1352/8750 [2:12:38<11:49:51, 5.76s/it] 15%|█▌ | 1353/8750 [2:12:43<11:49:19, 5.75s/it] 15%|█▌ | 1353/8750 [2:12:50<11:49:19, 5.75s/it] {'loss': 0.4933, 'learning_rate': 1.9197001866907676e-05, 'epoch': 0.15} + 15%|█▌ | 1353/8750 [2:12:50<11:49:19, 5.75s/it] {'loss': 0.4933, 'learning_rate': 1.9197001866907676e-05, 'epoch': 0.15} + 15%|█▌ | 1353/8750 [2:12:43<11:49:19, 5.75s/it] 15%|█▌ | 1354/8750 [2:12:49<11:43:35, 5.71s/it] 15%|█▌ | 1354/8750 [2:12:55<11:43:35, 5.71s/it] {'loss': 0.4747, 'learning_rate': 1.9195547888761403e-05, 'epoch': 0.15} + {'loss': 0.4747, 'learning_rate': 1.9195547888761403e-05, 'epoch': 0.15} 15%|█▌ | 1354/8750 [2:12:55<11:43:35, 5.71s/it] + 15%|█▌ | 1354/8750 [2:12:49<11:43:35, 5.71s/it] 15%|█▌ | 1355/8750 [2:12:55<11:39:45, 5.68s/it] 15%|█▌ | 1355/8750 [2:13:01<11:39:46, 5.68s/it] {'loss': 0.4754, 'learning_rate': 1.9194092650619767e-05, 'epoch': 0.15} + 15%|█▌ | 1355/8750 [2:13:01<11:39:46, 5.68s/it] {'loss': 0.4754, 'learning_rate': 1.9194092650619767e-05, 'epoch': 0.15} + 15%|█▌ | 1355/8750 [2:12:55<11:39:45, 
5.68s/it] 15%|█▌ | 1356/8750 [2:13:00<11:40:10, 5.68s/it] 15%|█▌ | 1356/8750 [2:13:07<11:40:11, 5.68s/it] {'loss': 0.5062, 'learning_rate': 1.9192636152682173e-05, 'epoch': 0.15} + 15%|█▌ | 1356/8750 [2:13:07<11:40:11, 5.68s/it] {'loss': 0.5062, 'learning_rate': 1.9192636152682173e-05, 'epoch': 0.15} + 15%|█▌ | 1356/8750 [2:13:00<11:40:10, 5.68s/it] 16%|█▌ | 1357/8750 [2:13:06<11:47:15, 5.74s/it] 16%|█▌ | 1357/8750 [2:13:13<11:47:15, 5.74s/it] {'loss': 0.4714, 'learning_rate': 1.9191178395148188e-05, 'epoch': 0.16} + 16%|█▌ | 1357/8750 [2:13:13<11:47:15, 5.74s/it] {'loss': 0.4714, 'learning_rate': 1.9191178395148188e-05, 'epoch': 0.16} + 16%|█▌ | 1357/8750 [2:13:06<11:47:15, 5.74s/it] 16%|█▌ | 1358/8750 [2:13:12<11:46:03, 5.73s/it] 16%|█▌ | 1358/8750 [2:13:18<11:46:04, 5.73s/it] {'loss': 0.4884, 'learning_rate': 1.9189719378217554e-05, 'epoch': 0.16} + 16%|█▌ | 1358/8750 [2:13:18<11:46:04, 5.73s/it] {'loss': 0.4884, 'learning_rate': 1.9189719378217554e-05, 'epoch': 0.16} + 16%|█▌ | 1358/8750 [2:13:12<11:46:03, 5.73s/it] 16%|█▌ | 1359/8750 [2:13:18<11:48:29, 5.75s/it] 16%|█▌ | 1359/8750 [2:13:24<11:48:28, 5.75s/it] {'loss': 0.4867, 'learning_rate': 1.91882591020902e-05, 'epoch': 0.16} + 16%|█▌ | 1359/8750 [2:13:24<11:48:28, 5.75s/it] {'loss': 0.4867, 'learning_rate': 1.91882591020902e-05, 'epoch': 0.16} + 16%|█▌ | 1359/8750 [2:13:18<11:48:29, 5.75s/it] 16%|█▌ | 1360/8750 [2:13:23<11:42:31, 5.70s/it] 16%|█▌ | 1360/8750 [2:13:30<11:42:31, 5.70s/it] {'loss': 0.5001, 'learning_rate': 1.9186797566966205e-05, 'epoch': 0.16} + 16%|█▌ | 1360/8750 [2:13:30<11:42:31, 5.70s/it] {'loss': 0.5001, 'learning_rate': 1.9186797566966205e-05, 'epoch': 0.16} + 16%|█▌ | 1360/8750 [2:13:23<11:42:31, 5.70s/it] 16%|█▌ | 1361/8750 [2:13:29<11:55:25, 5.81s/it] 16%|█▌ | 1361/8750 [2:13:36<11:55:24, 5.81s/it] {'loss': 0.4858, 'learning_rate': 1.918533477304584e-05, 'epoch': 0.16} + 16%|█▌ | 1361/8750 [2:13:36<11:55:24, 5.81s/it] {'loss': 0.4858, 'learning_rate': 1.918533477304584e-05, 'epoch': 
0.16} + 16%|█▌ | 1361/8750 [2:13:29<11:55:25, 5.81s/it] 16%|█▌ | 1362/8750 [2:13:35<11:48:46, 5.76s/it] 16%|█▌ | 1362/8750 [2:13:41<11:48:47, 5.76s/it] {'loss': 0.4954, 'learning_rate': 1.918387072052954e-05, 'epoch': 0.16} + 16%|█▌ | 1362/8750 [2:13:41<11:48:47, 5.76s/it] {'loss': 0.4954, 'learning_rate': 1.918387072052954e-05, 'epoch': 0.16} + 16%|█▌ | 1362/8750 [2:13:35<11:48:46, 5.76s/it] 16%|█▌ | 1363/8750 [2:13:41<11:51:55, 5.78s/it] 16%|█▌ | 1363/8750 [2:13:47<11:51:55, 5.78s/it] {'loss': 0.4623, 'learning_rate': 1.918240540961791e-05, 'epoch': 0.16} + 16%|█▌ | 1363/8750 [2:13:47<11:51:55, 5.78s/it] {'loss': 0.4623, 'learning_rate': 1.918240540961791e-05, 'epoch': 0.16} + 16%|█▌ | 1363/8750 [2:13:41<11:51:55, 5.78s/it] 16%|█▌ | 1364/8750 [2:13:47<11:50:47, 5.77s/it] 16%|█▌ | 1364/8750 [2:13:53<11:50:48, 5.77s/it] {'loss': 0.5021, 'learning_rate': 1.9180938840511727e-05, 'epoch': 0.16} + 16%|█▌ | 1364/8750 [2:13:53<11:50:48, 5.77s/it] {'loss': 0.5021, 'learning_rate': 1.9180938840511727e-05, 'epoch': 0.16} + 16%|█▌ | 1364/8750 [2:13:47<11:50:47, 5.77s/it] 16%|█▌ | 1365/8750 [2:13:52<11:45:20, 5.73s/it] 16%|█▌ | 1365/8750 [2:13:59<11:45:20, 5.73s/it] {'loss': 0.4912, 'learning_rate': 1.917947101341195e-05, 'epoch': 0.16} + 16%|█▌ | 1365/8750 [2:13:59<11:45:20, 5.73s/it] {'loss': 0.4912, 'learning_rate': 1.917947101341195e-05, 'epoch': 0.16} + 16%|█▌ | 1365/8750 [2:13:52<11:45:20, 5.73s/it] 16%|█▌ | 1366/8750 [2:13:58<11:42:42, 5.71s/it] 16%|█▌ | 1366/8750 [2:14:04<11:42:42, 5.71s/it] {'loss': 0.4741, 'learning_rate': 1.9178001928519703e-05, 'epoch': 0.16} + 16%|█▌ | 1366/8750 [2:14:04<11:42:42, 5.71s/it] {'loss': 0.4741, 'learning_rate': 1.9178001928519703e-05, 'epoch': 0.16} + 16%|█▌ | 1366/8750 [2:13:58<11:42:42, 5.71s/it] 16%|█▌ | 1367/8750 [2:14:04<11:45:08, 5.73s/it] 16%|█▌ | 1367/8750 [2:14:10<11:45:09, 5.73s/it] {'loss': 0.4754, 'learning_rate': 1.9176531586036282e-05, 'epoch': 0.16} + 16%|█▌ | 1367/8750 [2:14:10<11:45:09, 5.73s/it] {'loss': 0.4754, 
'learning_rate': 1.9176531586036282e-05, 'epoch': 0.16} + 16%|█▌ | 1367/8750 [2:14:04<11:45:08, 5.73s/it] 16%|█▌ | 1368/8750 [2:14:09<11:42:44, 5.71s/it] 16%|█▌ | 1368/8750 [2:14:16<11:42:43, 5.71s/it] {'loss': 0.4776, 'learning_rate': 1.9175059986163157e-05, 'epoch': 0.16} + 16%|█▌ | 1368/8750 [2:14:16<11:42:43, 5.71s/it] {'loss': 0.4776, 'learning_rate': 1.9175059986163157e-05, 'epoch': 0.16} + 16%|█▌ | 1368/8750 [2:14:09<11:42:44, 5.71s/it] 16%|█▌ | 1369/8750 [2:14:15<11:42:10, 5.71s/it] 16%|█▌ | 1369/8750 [2:14:21<11:42:10, 5.71s/it] {'loss': 0.4847, 'learning_rate': 1.9173587129101967e-05, 'epoch': 0.16} + 16%|█▌ | 1369/8750 [2:14:21<11:42:10, 5.71s/it] {'loss': 0.4847, 'learning_rate': 1.9173587129101967e-05, 'epoch': 0.16} + 16%|█▌ | 1369/8750 [2:14:15<11:42:10, 5.71s/it] 16%|█▌ | 1370/8750 [2:14:21<11:45:42, 5.74s/it] 16%|█▌ | 1370/8750 [2:14:27<11:45:43, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.917211301505453e-05, 'epoch': 0.16} + 16%|█▌ | 1370/8750 [2:14:27<11:45:43, 5.74s/it] {'loss': 0.4818, 'learning_rate': 1.917211301505453e-05, 'epoch': 0.16} + 16%|█▌ | 1370/8750 [2:14:21<11:45:42, 5.74s/it] 16%|█▌ | 1371/8750 [2:14:26<11:41:28, 5.70s/it] 16%|█▌ | 1371/8750 [2:14:33<11:41:27, 5.70s/it] {'loss': 0.465, 'learning_rate': 1.9170637644222835e-05, 'epoch': 0.16} + 16%|█▌ | 1371/8750 [2:14:33<11:41:27, 5.70s/it] {'loss': 0.465, 'learning_rate': 1.9170637644222835e-05, 'epoch': 0.16} + 16%|█▌ | 1371/8750 [2:14:26<11:41:28, 5.70s/it] 16%|█▌ | 1372/8750 [2:14:32<11:32:14, 5.63s/it] 16%|█▌ | 1372/8750 [2:14:38<11:32:15, 5.63s/it] {'loss': 0.516, 'learning_rate': 1.9169161016809036e-05, 'epoch': 0.16} + 16%|█▌ | 1372/8750 [2:14:38<11:32:15, 5.63s/it] {'loss': 0.516, 'learning_rate': 1.9169161016809036e-05, 'epoch': 0.16} + 16%|█▌ | 1372/8750 [2:14:32<11:32:14, 5.63s/it] 16%|█▌ | 1373/8750 [2:14:38<11:38:16, 5.68s/it] 16%|█▌ | 1373/8750 [2:14:44<11:38:15, 5.68s/it] {'loss': 0.4855, 'learning_rate': 1.9167683133015465e-05, 'epoch': 0.16} + 16%|█▌ | 1373/8750 
[2:14:44<11:38:15, 5.68s/it] {'loss': 0.4855, 'learning_rate': 1.9167683133015465e-05, 'epoch': 0.16} + 16%|█▌ | 1373/8750 [2:14:38<11:38:16, 5.68s/it] 16%|█▌ | 1374/8750 [2:14:43<11:36:17, 5.66s/it] 16%|█▌ | 1374/8750 [2:14:50<11:36:18, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.9166203993044627e-05, 'epoch': 0.16} + 16%|█▌ | 1374/8750 [2:14:50<11:36:18, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.9166203993044627e-05, 'epoch': 0.16} + 16%|█▌ | 1374/8750 [2:14:43<11:36:17, 5.66s/it] 16%|█▌ | 1375/8750 [2:14:49<11:34:01, 5.65s/it] 16%|█▌ | 1375/8750 [2:14:55<11:34:01, 5.65s/it] {'loss': 0.4911, 'learning_rate': 1.9164723597099192e-05, 'epoch': 0.16} + 16%|█▌ | 1375/8750 [2:14:55<11:34:01, 5.65s/it] {'loss': 0.4911, 'learning_rate': 1.9164723597099192e-05, 'epoch': 0.16} + 16%|█▌ | 1375/8750 [2:14:49<11:34:01, 5.65s/it] 16%|█▌ | 1376/8750 [2:14:55<11:41:45, 5.71s/it] 16%|█▌ | 1376/8750 [2:15:01<11:41:49, 5.71s/it] {'loss': 0.4916, 'learning_rate': 1.9163241945382012e-05, 'epoch': 0.16} + 16%|█▌ | 1376/8750 [2:15:01<11:41:49, 5.71s/it] {'loss': 0.4916, 'learning_rate': 1.9163241945382012e-05, 'epoch': 0.16} + 16%|█▌ | 1376/8750 [2:14:55<11:41:45, 5.71s/it] 16%|█▌ | 1377/8750 [2:15:01<11:47:26, 5.76s/it] 16%|█▌ | 1377/8750 [2:15:07<11:47:25, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.9161759038096108e-05, 'epoch': 0.16} + 16%|█▌ | 1377/8750 [2:15:07<11:47:25, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.9161759038096108e-05, 'epoch': 0.16} + 16%|█▌ | 1377/8750 [2:15:01<11:47:26, 5.76s/it] 16%|█▌ | 1378/8750 [2:15:06<11:48:45, 5.77s/it] 16%|█▌ | 1378/8750 [2:15:13<11:48:44, 5.77s/it] {'loss': 0.4814, 'learning_rate': 1.9160274875444668e-05, 'epoch': 0.16} + 16%|█▌ | 1378/8750 [2:15:13<11:48:44, 5.77s/it] {'loss': 0.4814, 'learning_rate': 1.9160274875444668e-05, 'epoch': 0.16} + 16%|█▌ | 1378/8750 [2:15:06<11:48:45, 5.77s/it] 16%|█▌ | 1379/8750 [2:15:12<11:42:32, 5.72s/it] 16%|█▌ | 1379/8750 [2:15:18<11:42:31, 5.72s/it] {'loss': 0.4883, 'learning_rate': 
1.9158789457631054e-05, 'epoch': 0.16} + 16%|█▌ | 1379/8750 [2:15:18<11:42:31, 5.72s/it] {'loss': 0.4883, 'learning_rate': 1.9158789457631054e-05, 'epoch': 0.16} + 16%|█▌ | 1379/8750 [2:15:12<11:42:32, 5.72s/it] 16%|█▌ | 1380/8750 [2:15:18<11:50:04, 5.78s/it] 16%|█▌ | 1380/8750 [2:15:24<11:50:04, 5.78s/it] {'loss': 0.4865, 'learning_rate': 1.9157302784858807e-05, 'epoch': 0.16} + 16%|█▌ | 1380/8750 [2:15:24<11:50:04, 5.78s/it] {'loss': 0.4865, 'learning_rate': 1.9157302784858807e-05, 'epoch': 0.16} + 16%|█▌ | 1380/8750 [2:15:18<11:50:04, 5.78s/it] 16%|█▌ | 1381/8750 [2:15:24<11:46:12, 5.75s/it] 16%|█▌ | 1381/8750 [2:15:30<11:46:11, 5.75s/it] {'loss': 0.4839, 'learning_rate': 1.915581485733163e-05, 'epoch': 0.16} + 16%|█▌ | 1381/8750 [2:15:30<11:46:11, 5.75s/it] {'loss': 0.4839, 'learning_rate': 1.915581485733163e-05, 'epoch': 0.16} + 16%|█▌ | 1381/8750 [2:15:24<11:46:12, 5.75s/it] 16%|█▌ | 1382/8750 [2:15:29<11:39:57, 5.70s/it] 16%|█▌ | 1382/8750 [2:15:36<11:39:57, 5.70s/it] {'loss': 0.4959, 'learning_rate': 1.91543256752534e-05, 'epoch': 0.16} + 16%|█▌ | 1382/8750 [2:15:36<11:39:57, 5.70s/it] {'loss': 0.4959, 'learning_rate': 1.91543256752534e-05, 'epoch': 0.16} + 16%|█▌ | 1382/8750 [2:15:29<11:39:57, 5.70s/it] 16%|█▌ | 1383/8750 [2:15:35<11:49:39, 5.78s/it] 16%|█▌ | 1383/8750 [2:15:42<11:49:38, 5.78s/it] {'loss': 0.4822, 'learning_rate': 1.915283523882818e-05, 'epoch': 0.16} + 16%|█▌ | 1383/8750 [2:15:42<11:49:38, 5.78s/it] {'loss': 0.4822, 'learning_rate': 1.915283523882818e-05, 'epoch': 0.16} + 16%|█▌ | 1383/8750 [2:15:35<11:49:39, 5.78s/it] 16%|█▌ | 1384/8750 [2:15:41<11:41:35, 5.71s/it] 16%|█▌ | 1384/8750 [2:15:47<11:41:35, 5.71s/it] {'loss': 0.4907, 'learning_rate': 1.9151343548260176e-05, 'epoch': 0.16} + 16%|█▌ | 1384/8750 [2:15:47<11:41:35, 5.71s/it] {'loss': 0.4907, 'learning_rate': 1.9151343548260176e-05, 'epoch': 0.16} + 16%|█▌ | 1384/8750 [2:15:41<11:41:35, 5.71s/it] 16%|█▌ | 1385/8750 [2:15:47<11:49:36, 5.78s/it] 16%|█▌ | 1385/8750 [2:15:53<11:49:36, 
5.78s/it] {'loss': 0.4936, 'learning_rate': 1.9149850603753793e-05, 'epoch': 0.16} + 16%|█▌ | 1385/8750 [2:15:53<11:49:36, 5.78s/it] {'loss': 0.4936, 'learning_rate': 1.9149850603753793e-05, 'epoch': 0.16} + 16%|█▌ | 1385/8750 [2:15:47<11:49:36, 5.78s/it] 16%|█▌ | 1386/8750 [2:15:52<11:46:23, 5.76s/it] 16%|█▌ | 1386/8750 [2:15:59<11:46:23, 5.76s/it] {'loss': 0.4744, 'learning_rate': 1.91483564055136e-05, 'epoch': 0.16} + 16%|█▌ | 1386/8750 [2:15:59<11:46:23, 5.76s/it] {'loss': 0.4744, 'learning_rate': 1.91483564055136e-05, 'epoch': 0.16} + 16%|█▌ | 1386/8750 [2:15:52<11:46:23, 5.76s/it] 16%|█▌ | 1387/8750 [2:16:05<11:46:13, 5.75s/it] 16%|█▌ | 1387/8750 [2:15:58<11:46:14, 5.76s/it] {'loss': 0.4955, 'learning_rate': 1.9146860953744325e-05, 'epoch': 0.16} + 16%|█▌ | 1387/8750 [2:16:05<11:46:13, 5.75s/it] {'loss': 0.4955, 'learning_rate': 1.9146860953744325e-05, 'epoch': 0.16} + 16%|█▌ | 1387/8750 [2:15:58<11:46:14, 5.76s/it] 16%|█▌ | 1388/8750 [2:16:04<11:39:09, 5.70s/it] 16%|█▌ | 1388/8750 [2:16:10<11:39:09, 5.70s/it] {'loss': 0.4918, 'learning_rate': 1.9145364248650892e-05, 'epoch': 0.16} + 16%|█▌ | 1388/8750 [2:16:10<11:39:09, 5.70s/it] {'loss': 0.4918, 'learning_rate': 1.9145364248650892e-05, 'epoch': 0.16} + 16%|█▌ | 1388/8750 [2:16:04<11:39:09, 5.70s/it] 16%|█▌ | 1389/8750 [2:16:09<11:37:20, 5.68s/it] 16%|█▌ | 1389/8750 [2:16:16<11:37:20, 5.68s/it] {'loss': 0.4582, 'learning_rate': 1.914386629043837e-05, 'epoch': 0.16} + 16%|█▌ | 1389/8750 [2:16:16<11:37:20, 5.68s/it] {'loss': 0.4582, 'learning_rate': 1.914386629043837e-05, 'epoch': 0.16} + 16%|█▌ | 1389/8750 [2:16:09<11:37:20, 5.68s/it] 16%|█▌ | 1390/8750 [2:16:15<11:40:20, 5.71s/it] 16%|█▌ | 1390/8750 [2:16:22<11:40:20, 5.71s/it] {'loss': 0.4987, 'learning_rate': 1.9142367079312023e-05, 'epoch': 0.16} + 16%|█▌ | 1390/8750 [2:16:22<11:40:20, 5.71s/it] {'loss': 0.4987, 'learning_rate': 1.9142367079312023e-05, 'epoch': 0.16} + 16%|█▌ | 1390/8750 [2:16:15<11:40:20, 5.71s/it] 16%|█▌ | 1391/8750 [2:16:21<11:37:13, 
5.68s/it] 16%|█▌ | 1391/8750 [2:16:27<11:37:13, 5.68s/it] {'loss': 0.4701, 'learning_rate': 1.9140866615477272e-05, 'epoch': 0.16} + 16%|█▌ | 1391/8750 [2:16:27<11:37:13, 5.68s/it] {'loss': 0.4701, 'learning_rate': 1.9140866615477272e-05, 'epoch': 0.16} + 16%|█▌ | 1391/8750 [2:16:21<11:37:13, 5.68s/it] 16%|█▌ | 1392/8750 [2:16:27<11:49:45, 5.79s/it] 16%|█▌ | 1392/8750 [2:16:33<11:49:44, 5.79s/it] {'loss': 0.4822, 'learning_rate': 1.913936489913971e-05, 'epoch': 0.16} + 16%|█▌ | 1392/8750 [2:16:33<11:49:44, 5.79s/it] {'loss': 0.4822, 'learning_rate': 1.913936489913971e-05, 'epoch': 0.16} + 16%|█▌ | 1392/8750 [2:16:27<11:49:45, 5.79s/it] 16%|█▌ | 1393/8750 [2:16:32<11:47:00, 5.77s/it] 16%|█▌ | 1393/8750 [2:16:39<11:47:00, 5.77s/it] {'loss': 0.4738, 'learning_rate': 1.9137861930505112e-05, 'epoch': 0.16} + 16%|█▌ | 1393/8750 [2:16:39<11:47:00, 5.77s/it] {'loss': 0.4738, 'learning_rate': 1.9137861930505112e-05, 'epoch': 0.16} + 16%|█▌ | 1393/8750 [2:16:32<11:47:00, 5.77s/it] 16%|█▌ | 1394/8750 [2:16:38<11:48:24, 5.78s/it] 16%|█▌ | 1394/8750 [2:16:45<11:48:24, 5.78s/it] {'loss': 0.4974, 'learning_rate': 1.9136357709779418e-05, 'epoch': 0.16} + 16%|█▌ | 1394/8750 [2:16:45<11:48:24, 5.78s/it] {'loss': 0.4974, 'learning_rate': 1.9136357709779418e-05, 'epoch': 0.16} + 16%|█▌ | 1394/8750 [2:16:38<11:48:24, 5.78s/it] 16%|█▌ | 1395/8750 [2:16:44<11:43:38, 5.74s/it] 16%|█▌ | 1395/8750 [2:16:50<11:43:39, 5.74s/it] {'loss': 0.4939, 'learning_rate': 1.9134852237168738e-05, 'epoch': 0.16} + 16%|█▌ | 1395/8750 [2:16:50<11:43:39, 5.74s/it] {'loss': 0.4939, 'learning_rate': 1.9134852237168738e-05, 'epoch': 0.16} + 16%|█▌ | 1395/8750 [2:16:44<11:43:38, 5.74s/it] 16%|█▌ | 1396/8750 [2:16:50<11:45:58, 5.76s/it] 16%|█▌ | 1396/8750 [2:16:56<11:45:58, 5.76s/it] {'loss': 0.4838, 'learning_rate': 1.9133345512879353e-05, 'epoch': 0.16} + 16%|█▌ | 1396/8750 [2:16:56<11:45:58, 5.76s/it] {'loss': 0.4838, 'learning_rate': 1.9133345512879353e-05, 'epoch': 0.16} + 16%|█▌ | 1396/8750 
[2:16:50<11:45:58, 5.76s/it] 16%|█▌ | 1397/8750 [2:16:56<11:48:32, 5.78s/it] 16%|█▌ | 1397/8750 [2:17:02<11:48:33, 5.78s/it] {'loss': 0.4822, 'learning_rate': 1.9131837537117724e-05, 'epoch': 0.16} + 16%|█▌ | 1397/8750 [2:17:02<11:48:33, 5.78s/it] {'loss': 0.4822, 'learning_rate': 1.9131837537117724e-05, 'epoch': 0.16} + 16%|█▌ | 1397/8750 [2:16:56<11:48:32, 5.78s/it] 16%|█▌ | 1398/8750 [2:17:01<11:42:50, 5.74s/it] 16%|█▌ | 1398/8750 [2:17:08<11:42:49, 5.74s/it] {'loss': 0.5028, 'learning_rate': 1.913032831009047e-05, 'epoch': 0.16} + 16%|█▌ | 1398/8750 [2:17:08<11:42:49, 5.74s/it] {'loss': 0.5028, 'learning_rate': 1.913032831009047e-05, 'epoch': 0.16} + 16%|█▌ | 1398/8750 [2:17:01<11:42:50, 5.74s/it] 16%|█▌ | 1399/8750 [2:17:07<11:39:33, 5.71s/it] 16%|█▌ | 1399/8750 [2:17:13<11:39:33, 5.71s/it] {'loss': 0.4745, 'learning_rate': 1.9128817832004393e-05, 'epoch': 0.16} + 16%|█▌ | 1399/8750 [2:17:13<11:39:33, 5.71s/it] {'loss': 0.4745, 'learning_rate': 1.9128817832004393e-05, 'epoch': 0.16} + 16%|█▌ | 1399/8750 [2:17:07<11:39:33, 5.71s/it]3 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +25 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 16%|█▌ | 1400/8750 [2:17:12<11:37:10, 5.69s/it]15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+ 16%|█▌ | 1400/8750 [2:17:19<11:37:09, 5.69s/it] {'loss': 0.4764, 'learning_rate': 1.912730610306646e-05, 'epoch': 0.16} + 16%|█▌ | 1400/8750 [2:17:19<11:37:09, 5.69s/it] {'loss': 0.4764, 'learning_rate': 1.912730610306646e-05, 'epoch': 0.16} + 16%|█▌ | 1400/8750 [2:17:12<11:37:10, 5.69s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 16%|█▌ | 1401/8750 [2:17:33<20:37:45, 10.11s/it] 16%|█▌ | 1401/8750 [2:17:39<20:37:45, 10.11s/it] {'loss': 0.4963, 'learning_rate': 1.9125793123483814e-05, 'epoch': 0.16} + 16%|█▌ | 1401/8750 [2:17:39<20:37:45, 10.11s/it] {'loss': 0.4963, 'learning_rate': 1.9125793123483814e-05, 'epoch': 0.16} + 16%|█▌ | 1401/8750 [2:17:33<20:37:45, 10.11s/it] 16%|█▌ | 1402/8750 [2:17:38<17:50:47, 8.74s/it] 16%|█▌ | 1402/8750 [2:17:45<17:50:47, 8.74s/it] {'loss': 0.4879, 'learning_rate': 1.912427889346377e-05, 'epoch': 0.16} + 16%|█▌ | 1402/8750 [2:17:45<17:50:47, 8.74s/it] {'loss': 0.4879, 'learning_rate': 1.912427889346377e-05, 'epoch': 0.16} + 16%|█▌ | 1402/8750 [2:17:38<17:50:47, 8.74s/it] 16%|█▌ | 1403/8750 [2:17:44<16:02:29, 7.86s/it] 16%|█▌ | 1403/8750 [2:17:51<16:02:30, 7.86s/it] {'loss': 0.4866, 'learning_rate': 1.91227634132138e-05, 'epoch': 0.16} + 16%|█▌ | 1403/8750 [2:17:51<16:02:30, 7.86s/it] {'loss': 0.4866, 'learning_rate': 1.91227634132138e-05, 'epoch': 0.16} + 16%|█▌ | 1403/8750 [2:17:44<16:02:29, 7.86s/it] 16%|█▌ | 1404/8750 [2:17:50<14:40:23, 7.19s/it] 16%|█▌ | 1404/8750 [2:17:56<14:40:23, 7.19s/it] {'loss': 0.4828, 'learning_rate': 1.912124668294157e-05, 'epoch': 0.16} + 16%|█▌ | 1404/8750 [2:17:56<14:40:23, 7.19s/it] {'loss': 0.4828, 'learning_rate': 1.912124668294157e-05, 'epoch': 0.16} + 16%|█▌ | 1404/8750 [2:17:50<14:40:23, 7.19s/it] 16%|█▌ | 1405/8750 [2:17:56<13:46:31, 6.75s/it] 16%|█▌ | 1405/8750 [2:18:02<13:46:31, 6.75s/it] {'loss': 0.5033, 'learning_rate': 1.91197287028549e-05, 'epoch': 0.16} + 16%|█▌ | 1405/8750 [2:18:02<13:46:31, 6.75s/it] {'loss': 0.5033, 'learning_rate': 1.91197287028549e-05, 'epoch': 0.16} + 16%|█▌ | 1405/8750 [2:17:56<13:46:31, 6.75s/it] 16%|█▌ | 1406/8750 [2:18:02<13:15:27, 6.50s/it] 16%|█▌ | 1406/8750 [2:18:08<13:15:27, 6.50s/it] {'loss': 0.4922, 'learning_rate': 1.9118209473161794e-05, 'epoch': 0.16} + 16%|█▌ | 1406/8750 [2:18:08<13:15:27, 6.50s/it] {'loss': 0.4922, 'learning_rate': 1.9118209473161794e-05, 
'epoch': 0.16} + 16%|█▌ | 1406/8750 [2:18:02<13:15:27, 6.50s/it] 16%|█▌ | 1407/8750 [2:18:07<12:48:55, 6.28s/it] 16%|█▌ | 1407/8750 [2:18:14<12:48:55, 6.28s/it] {'loss': 0.4738, 'learning_rate': 1.9116688994070413e-05, 'epoch': 0.16} + 16%|█▌ | 1407/8750 [2:18:14<12:48:55, 6.28s/it] {'loss': 0.4738, 'learning_rate': 1.9116688994070413e-05, 'epoch': 0.16} + 16%|█▌ | 1407/8750 [2:18:07<12:48:55, 6.28s/it] 16%|█▌ | 1408/8750 [2:18:13<12:21:07, 6.06s/it] 16%|█▌ | 1408/8750 [2:18:19<12:21:07, 6.06s/it] {'loss': 0.4901, 'learning_rate': 1.9115167265789096e-05, 'epoch': 0.16} + 16%|█▌ | 1408/8750 [2:18:19<12:21:07, 6.06s/it]{'loss': 0.4901, 'learning_rate': 1.9115167265789096e-05, 'epoch': 0.16} + 16%|█▌ | 1408/8750 [2:18:13<12:21:07, 6.06s/it] 16%|█▌ | 1409/8750 [2:18:19<12:11:37, 5.98s/it] 16%|█▌ | 1409/8750 [2:18:25<12:11:36, 5.98s/it] {'loss': 0.4672, 'learning_rate': 1.911364428852636e-05, 'epoch': 0.16} + 16%|█▌ | 1409/8750 [2:18:25<12:11:36, 5.98s/it] {'loss': 0.4672, 'learning_rate': 1.911364428852636e-05, 'epoch': 0.16} + 16%|█▌ | 1409/8750 [2:18:19<12:11:37, 5.98s/it] 16%|█▌ | 1410/8750 [2:18:25<12:10:37, 5.97s/it] 16%|█▌ | 1410/8750 [2:18:31<12:10:38, 5.97s/it] {'loss': 0.4893, 'learning_rate': 1.9112120062490883e-05, 'epoch': 0.16} + 16%|█▌ | 1410/8750 [2:18:31<12:10:38, 5.97s/it] {'loss': 0.4893, 'learning_rate': 1.9112120062490883e-05, 'epoch': 0.16} + 16%|█▌ | 1410/8750 [2:18:25<12:10:37, 5.97s/it] 16%|█▌ | 1411/8750 [2:18:30<11:59:01, 5.88s/it] 16%|█▌ | 1411/8750 [2:18:37<11:59:01, 5.88s/it] {'loss': 0.4717, 'learning_rate': 1.911059458789152e-05, 'epoch': 0.16} + 16%|█▌ | 1411/8750 [2:18:37<11:59:01, 5.88s/it] {'loss': 0.4717, 'learning_rate': 1.911059458789152e-05, 'epoch': 0.16} + 16%|█▌ | 1411/8750 [2:18:30<11:59:01, 5.88s/it] 16%|█▌ | 1412/8750 [2:18:42<11:48:11, 5.79s/it] 16%|█▌ | 1412/8750 [2:18:36<11:48:12, 5.79s/it] {'loss': 0.4992, 'learning_rate': 1.9109067864937292e-05, 'epoch': 0.16} + 16%|█▌ | 1412/8750 [2:18:42<11:48:11, 5.79s/it] {'loss': 
0.4992, 'learning_rate': 1.9109067864937292e-05, 'epoch': 0.16} + 16%|█▌ | 1412/8750 [2:18:36<11:48:12, 5.79s/it] 16%|█▌ | 1413/8750 [2:18:42<11:54:08, 5.84s/it] 16%|█▌ | 1413/8750 [2:18:48<11:54:08, 5.84s/it] {'loss': 0.4814, 'learning_rate': 1.9107539893837396e-05, 'epoch': 0.16} + 16%|█▌ | 1413/8750 [2:18:48<11:54:08, 5.84s/it] {'loss': 0.4814, 'learning_rate': 1.9107539893837396e-05, 'epoch': 0.16} + 16%|█▌ | 1413/8750 [2:18:42<11:54:08, 5.84s/it] 16%|█▌ | 1414/8750 [2:18:47<11:42:16, 5.74s/it] 16%|█▌ | 1414/8750 [2:18:54<11:42:15, 5.74s/it] {'loss': 0.4832, 'learning_rate': 1.91060106748012e-05, 'epoch': 0.16} + 16%|█▌ | 1414/8750 [2:18:54<11:42:15, 5.74s/it] {'loss': 0.4832, 'learning_rate': 1.91060106748012e-05, 'epoch': 0.16} + 16%|█▌ | 1414/8750 [2:18:47<11:42:16, 5.74s/it] 16%|█▌ | 1415/8750 [2:18:53<11:43:16, 5.75s/it] 16%|█▌ | 1415/8750 [2:19:00<11:43:16, 5.75s/it] {'loss': 0.4766, 'learning_rate': 1.9104480208038236e-05, 'epoch': 0.16} + 16%|█▌ | 1415/8750 [2:19:00<11:43:16, 5.75s/it] {'loss': 0.4766, 'learning_rate': 1.9104480208038236e-05, 'epoch': 0.16} + 16%|█▌ | 1415/8750 [2:18:53<11:43:16, 5.75s/it] 16%|█▌ | 1416/8750 [2:18:59<11:42:20, 5.75s/it] 16%|█▌ | 1416/8750 [2:19:05<11:42:20, 5.75s/it] {'loss': 0.4931, 'learning_rate': 1.9102948493758217e-05, 'epoch': 0.16} + 16%|█▌ | 1416/8750 [2:19:05<11:42:20, 5.75s/it] {'loss': 0.4931, 'learning_rate': 1.9102948493758217e-05, 'epoch': 0.16} + 16%|█▌ | 1416/8750 [2:18:59<11:42:20, 5.75s/it] 16%|█▌ | 1417/8750 [2:19:05<11:41:47, 5.74s/it] 16%|█▌ | 1417/8750 [2:19:11<11:41:49, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.9101415532171018e-05, 'epoch': 0.16} + 16%|█▌ | 1417/8750 [2:19:11<11:41:49, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.9101415532171018e-05, 'epoch': 0.16} + 16%|█▌ | 1417/8750 [2:19:05<11:41:47, 5.74s/it] 16%|█▌ | 1418/8750 [2:19:10<11:37:02, 5.70s/it] 16%|█▌ | 1418/8750 [2:19:17<11:37:01, 5.70s/it] {'loss': 0.4695, 'learning_rate': 1.90998813234867e-05, 'epoch': 0.16} + 16%|█▌ | 
1418/8750 [2:19:17<11:37:01, 5.70s/it] {'loss': 0.4695, 'learning_rate': 1.90998813234867e-05, 'epoch': 0.16} + 16%|█▌ | 1418/8750 [2:19:10<11:37:02, 5.70s/it] 16%|█▌ | 1419/8750 [2:19:16<11:35:58, 5.70s/it] 16%|█▌ | 1419/8750 [2:19:22<11:35:58, 5.70s/it] {'loss': 0.5036, 'learning_rate': 1.9098345867915467e-05, 'epoch': 0.16} + 16%|█▌ | 1419/8750 [2:19:22<11:35:58, 5.70s/it] {'loss': 0.5036, 'learning_rate': 1.9098345867915467e-05, 'epoch': 0.16} + 16%|█▌ | 1419/8750 [2:19:16<11:35:58, 5.70s/it] 16%|█▌ | 1420/8750 [2:19:22<11:40:00, 5.73s/it] 16%|█▌ | 1420/8750 [2:19:28<11:40:00, 5.73s/it] {'loss': 0.5027, 'learning_rate': 1.909680916566772e-05, 'epoch': 0.16} + 16%|█▌ | 1420/8750 [2:19:28<11:40:00, 5.73s/it] {'loss': 0.5027, 'learning_rate': 1.909680916566772e-05, 'epoch': 0.16} + 16%|█▌ | 1420/8750 [2:19:22<11:40:00, 5.73s/it] 16%|█▌ | 1421/8750 [2:19:27<11:34:05, 5.68s/it] 16%|█▌ | 1421/8750 [2:19:34<11:34:04, 5.68s/it] {'loss': 0.4953, 'learning_rate': 1.9095271216954022e-05, 'epoch': 0.16} + 16%|█▌ | 1421/8750 [2:19:34<11:34:04, 5.68s/it] {'loss': 0.4953, 'learning_rate': 1.9095271216954022e-05, 'epoch': 0.16} + 16%|█▌ | 1421/8750 [2:19:27<11:34:05, 5.68s/it] 16%|█▋ | 1422/8750 [2:19:33<11:37:54, 5.71s/it] 16%|█▋ | 1422/8750 [2:19:39<11:37:54, 5.71s/it] {'loss': 0.4851, 'learning_rate': 1.9093732021985103e-05, 'epoch': 0.16} + 16%|█▋ | 1422/8750 [2:19:39<11:37:54, 5.71s/it] {'loss': 0.4851, 'learning_rate': 1.9093732021985103e-05, 'epoch': 0.16} + 16%|█▋ | 1422/8750 [2:19:33<11:37:54, 5.71s/it] 16%|█▋ | 1423/8750 [2:19:45<11:43:00, 5.76s/it] 16%|█▋ | 1423/8750 [2:19:39<11:43:01, 5.76s/it] {'loss': 0.4972, 'learning_rate': 1.909219158097187e-05, 'epoch': 0.16} + 16%|█▋ | 1423/8750 [2:19:45<11:43:00, 5.76s/it] {'loss': 0.4972, 'learning_rate': 1.909219158097187e-05, 'epoch': 0.16} + 16%|█▋ | 1423/8750 [2:19:39<11:43:01, 5.76s/it] 16%|█▋ | 1424/8750 [2:19:44<11:37:12, 5.71s/it] 16%|█▋ | 1424/8750 [2:19:51<11:37:13, 5.71s/it] {'loss': 0.4746, 'learning_rate': 
1.9090649894125395e-05, 'epoch': 0.16} + 16%|█▋ | 1424/8750 [2:19:51<11:37:13, 5.71s/it] {'loss': 0.4746, 'learning_rate': 1.9090649894125395e-05, 'epoch': 0.16} + 16%|█▋ | 1424/8750 [2:19:44<11:37:12, 5.71s/it] 16%|█▋ | 1425/8750 [2:19:50<11:36:12, 5.70s/it] 16%|█▋ | 1425/8750 [2:19:57<11:36:12, 5.70s/it] {'loss': 0.4911, 'learning_rate': 1.908910696165693e-05, 'epoch': 0.16} +{'loss': 0.4911, 'learning_rate': 1.908910696165693e-05, 'epoch': 0.16} 16%|█▋ | 1425/8750 [2:19:57<11:36:12, 5.70s/it] + 16%|█▋ | 1425/8750 [2:19:50<11:36:12, 5.70s/it] 16%|█▋ | 1426/8750 [2:19:56<11:44:28, 5.77s/it] 16%|█▋ | 1426/8750 [2:20:03<11:44:29, 5.77s/it] {'loss': 0.4701, 'learning_rate': 1.908756278377788e-05, 'epoch': 0.16} + 16%|█▋ | 1426/8750 [2:20:03<11:44:29, 5.77s/it] {'loss': 0.4701, 'learning_rate': 1.908756278377788e-05, 'epoch': 0.16} + 16%|█▋ | 1426/8750 [2:19:56<11:44:28, 5.77s/it] 16%|█▋ | 1427/8750 [2:20:02<11:46:45, 5.79s/it] 16%|█▋ | 1427/8750 [2:20:08<11:46:45, 5.79s/it] {'loss': 0.4793, 'learning_rate': 1.9086017360699843e-05, 'epoch': 0.16} + 16%|█▋ | 1427/8750 [2:20:08<11:46:45, 5.79s/it] {'loss': 0.4793, 'learning_rate': 1.9086017360699843e-05, 'epoch': 0.16} + 16%|█▋ | 1427/8750 [2:20:02<11:46:45, 5.79s/it] 16%|█▋ | 1428/8750 [2:20:07<11:37:19, 5.71s/it] 16%|█▋ | 1428/8750 [2:20:14<11:37:19, 5.71s/it] {'loss': 0.4962, 'learning_rate': 1.9084470692634567e-05, 'epoch': 0.16} + 16%|█▋ | 1428/8750 [2:20:14<11:37:19, 5.71s/it] {'loss': 0.4962, 'learning_rate': 1.9084470692634567e-05, 'epoch': 0.16} + 16%|█▋ | 1428/8750 [2:20:07<11:37:19, 5.71s/it] 16%|█▋ | 1429/8750 [2:20:13<11:32:50, 5.68s/it] 16%|█▋ | 1429/8750 [2:20:20<11:32:50, 5.68s/it] {'loss': 0.4917, 'learning_rate': 1.9082922779793988e-05, 'epoch': 0.16} + 16%|█▋ | 1429/8750 [2:20:20<11:32:50, 5.68s/it] {'loss': 0.4917, 'learning_rate': 1.9082922779793988e-05, 'epoch': 0.16} + 16%|█▋ | 1429/8750 [2:20:13<11:32:50, 5.68s/it] 16%|█▋ | 1430/8750 [2:20:19<11:31:22, 5.67s/it] 16%|█▋ | 1430/8750 
[2:20:25<11:31:22, 5.67s/it] {'loss': 0.5028, 'learning_rate': 1.9081373622390204e-05, 'epoch': 0.16} + 16%|█▋ | 1430/8750 [2:20:25<11:31:22, 5.67s/it] {'loss': 0.5028, 'learning_rate': 1.9081373622390204e-05, 'epoch': 0.16} + 16%|█▋ | 1430/8750 [2:20:19<11:31:22, 5.67s/it] 16%|█▋ | 1431/8750 [2:20:31<11:31:28, 5.67s/it] 16%|█▋ | 1431/8750 [2:20:24<11:31:35, 5.67s/it] {'loss': 0.471, 'learning_rate': 1.9079823220635477e-05, 'epoch': 0.16} + 16%|█▋ | 1431/8750 [2:20:31<11:31:28, 5.67s/it] {'loss': 0.471, 'learning_rate': 1.9079823220635477e-05, 'epoch': 0.16} + 16%|█▋ | 1431/8750 [2:20:24<11:31:35, 5.67s/it] 16%|█▋ | 1432/8750 [2:20:30<11:32:03, 5.67s/it] 16%|█▋ | 1432/8750 [2:20:37<11:32:06, 5.67s/it] {'loss': 0.4818, 'learning_rate': 1.907827157474225e-05, 'epoch': 0.16} + 16%|█▋ | 1432/8750 [2:20:37<11:32:06, 5.67s/it] {'loss': 0.4818, 'learning_rate': 1.907827157474225e-05, 'epoch': 0.16} + 16%|█▋ | 1432/8750 [2:20:30<11:32:03, 5.67s/it] 16%|█▋ | 1433/8750 [2:20:36<11:35:26, 5.70s/it] 16%|█▋ | 1433/8750 [2:20:42<11:35:30, 5.70s/it] {'loss': 0.4884, 'learning_rate': 1.9076718684923136e-05, 'epoch': 0.16} + 16%|█▋ | 1433/8750 [2:20:42<11:35:30, 5.70s/it] {'loss': 0.4884, 'learning_rate': 1.9076718684923136e-05, 'epoch': 0.16} + 16%|█▋ | 1433/8750 [2:20:36<11:35:26, 5.70s/it] 16%|█▋ | 1434/8750 [2:20:41<11:34:47, 5.70s/it] 16%|█▋ | 1434/8750 [2:20:48<11:34:47, 5.70s/it] {'loss': 0.4989, 'learning_rate': 1.9075164551390918e-05, 'epoch': 0.16} + 16%|█▋ | 1434/8750 [2:20:48<11:34:47, 5.70s/it] {'loss': 0.4989, 'learning_rate': 1.9075164551390918e-05, 'epoch': 0.16} + 16%|█▋ | 1434/8750 [2:20:41<11:34:47, 5.70s/it] 16%|█▋ | 1435/8750 [2:20:47<11:45:25, 5.79s/it] 16%|█▋ | 1435/8750 [2:20:54<11:45:25, 5.79s/it] {'loss': 0.4663, 'learning_rate': 1.9073609174358535e-05, 'epoch': 0.16} + 16%|█▋ | 1435/8750 [2:20:54<11:45:25, 5.79s/it] {'loss': 0.4663, 'learning_rate': 1.9073609174358535e-05, 'epoch': 0.16} + 16%|█▋ | 1435/8750 [2:20:47<11:45:25, 5.79s/it] 16%|█▋ | 1436/8750 
[2:20:53<11:40:14, 5.74s/it] 16%|█▋ | 1436/8750 [2:21:00<11:40:14, 5.74s/it] {'loss': 0.5005, 'learning_rate': 1.9072052554039123e-05, 'epoch': 0.16} + 16%|█▋ | 1436/8750 [2:21:00<11:40:14, 5.74s/it] {'loss': 0.5005, 'learning_rate': 1.9072052554039123e-05, 'epoch': 0.16} + 16%|█▋ | 1436/8750 [2:20:53<11:40:14, 5.74s/it] 16%|█▋ | 1437/8750 [2:20:59<11:39:04, 5.74s/it] 16%|█▋ | 1437/8750 [2:21:05<11:39:04, 5.74s/it] {'loss': 0.4952, 'learning_rate': 1.9070494690645966e-05, 'epoch': 0.16} + 16%|█▋ | 1437/8750 [2:21:05<11:39:04, 5.74s/it]{'loss': 0.4952, 'learning_rate': 1.9070494690645966e-05, 'epoch': 0.16} + 16%|█▋ | 1437/8750 [2:20:59<11:39:04, 5.74s/it] 16%|█▋ | 1438/8750 [2:21:05<11:43:48, 5.78s/it] 16%|█▋ | 1438/8750 [2:21:11<11:43:48, 5.78s/it] {'loss': 0.4959, 'learning_rate': 1.9068935584392522e-05, 'epoch': 0.16} + 16%|█▋ | 1438/8750 [2:21:11<11:43:48, 5.78s/it] {'loss': 0.4959, 'learning_rate': 1.9068935584392522e-05, 'epoch': 0.16} + 16%|█▋ | 1438/8750 [2:21:05<11:43:48, 5.78s/it] 16%|█▋ | 1439/8750 [2:21:10<11:41:11, 5.75s/it] 16%|█▋ | 1439/8750 [2:21:17<11:41:11, 5.75s/it] {'loss': 0.4891, 'learning_rate': 1.906737523549243e-05, 'epoch': 0.16} + 16%|█▋ | 1439/8750 [2:21:17<11:41:11, 5.75s/it] {'loss': 0.4891, 'learning_rate': 1.906737523549243e-05, 'epoch': 0.16} + 16%|█▋ | 1439/8750 [2:21:10<11:41:11, 5.75s/it] 16%|█▋ | 1440/8750 [2:21:16<11:37:17, 5.72s/it] 16%|█▋ | 1440/8750 [2:21:23<11:37:17, 5.72s/it] {'loss': 0.4844, 'learning_rate': 1.9065813644159495e-05, 'epoch': 0.16} + 16%|█▋ | 1440/8750 [2:21:23<11:37:17, 5.72s/it] {'loss': 0.4844, 'learning_rate': 1.9065813644159495e-05, 'epoch': 0.16} + 16%|█▋ | 1440/8750 [2:21:16<11:37:17, 5.72s/it] 16%|█▋ | 1441/8750 [2:21:22<11:44:11, 5.78s/it] 16%|█▋ | 1441/8750 [2:21:28<11:44:11, 5.78s/it] {'loss': 0.4942, 'learning_rate': 1.906425081060768e-05, 'epoch': 0.16} + 16%|█▋ | 1441/8750 [2:21:28<11:44:11, 5.78s/it] {'loss': 0.4942, 'learning_rate': 1.906425081060768e-05, 'epoch': 0.16} + 16%|█▋ | 1441/8750 
[2:21:22<11:44:11, 5.78s/it] 16%|█▋ | 1442/8750 [2:21:28<11:46:07, 5.80s/it] 16%|█▋ | 1442/8750 [2:21:34<11:46:06, 5.80s/it] {'loss': 0.4822, 'learning_rate': 1.906268673505114e-05, 'epoch': 0.16} + 16%|█▋ | 1442/8750 [2:21:34<11:46:06, 5.80s/it] {'loss': 0.4822, 'learning_rate': 1.906268673505114e-05, 'epoch': 0.16} + 16%|█▋ | 1442/8750 [2:21:28<11:46:07, 5.80s/it] 16%|█▋ | 1443/8750 [2:21:34<11:46:11, 5.80s/it] 16%|█▋ | 1443/8750 [2:21:40<11:46:11, 5.80s/it] {'loss': 0.4636, 'learning_rate': 1.906112141770418e-05, 'epoch': 0.16} + 16%|█▋ | 1443/8750 [2:21:40<11:46:11, 5.80s/it] {'loss': 0.4636, 'learning_rate': 1.906112141770418e-05, 'epoch': 0.16} + 16%|█▋ | 1443/8750 [2:21:34<11:46:11, 5.80s/it] 17%|█▋ | 1444/8750 [2:21:40<11:53:17, 5.86s/it] 17%|█▋ | 1444/8750 [2:21:46<11:53:17, 5.86s/it] {'loss': 0.488, 'learning_rate': 1.9059554858781285e-05, 'epoch': 0.17} + 17%|█▋ | 1444/8750 [2:21:46<11:53:17, 5.86s/it] {'loss': 0.488, 'learning_rate': 1.9059554858781285e-05, 'epoch': 0.17} + 17%|█▋ | 1444/8750 [2:21:40<11:53:17, 5.86s/it] 17%|█▋ | 1445/8750 [2:21:45<11:38:03, 5.73s/it] 17%|█▋ | 1445/8750 [2:21:52<11:38:04, 5.73s/it] {'loss': 0.5003, 'learning_rate': 1.9057987058497106e-05, 'epoch': 0.17} + 17%|█▋ | 1445/8750 [2:21:52<11:38:04, 5.73s/it] {'loss': 0.5003, 'learning_rate': 1.9057987058497106e-05, 'epoch': 0.17} + 17%|█▋ | 1445/8750 [2:21:45<11:38:03, 5.73s/it] 17%|█▋ | 1446/8750 [2:21:51<11:36:44, 5.72s/it] 17%|█▋ | 1446/8750 [2:21:57<11:36:44, 5.72s/it] {'loss': 0.503, 'learning_rate': 1.9056418017066476e-05, 'epoch': 0.17} + 17%|█▋ | 1446/8750 [2:21:57<11:36:44, 5.72s/it] {'loss': 0.503, 'learning_rate': 1.9056418017066476e-05, 'epoch': 0.17} + 17%|█▋ | 1446/8750 [2:21:51<11:36:44, 5.72s/it] 17%|█▋ | 1447/8750 [2:21:56<11:34:12, 5.70s/it] 17%|█▋ | 1447/8750 [2:22:03<11:34:11, 5.70s/it] {'loss': 0.4837, 'learning_rate': 1.905484773470438e-05, 'epoch': 0.17} + 17%|█▋ | 1447/8750 [2:22:03<11:34:11, 5.70s/it] {'loss': 0.4837, 'learning_rate': 
1.905484773470438e-05, 'epoch': 0.17} + 17%|█▋ | 1447/8750 [2:21:56<11:34:12, 5.70s/it] 17%|█▋ | 1448/8750 [2:22:02<11:30:14, 5.67s/it] 17%|█▋ | 1448/8750 [2:22:09<11:30:16, 5.67s/it] {'loss': 0.4655, 'learning_rate': 1.905327621162598e-05, 'epoch': 0.17} + 17%|█▋ | 1448/8750 [2:22:09<11:30:16, 5.67s/it] {'loss': 0.4655, 'learning_rate': 1.905327621162598e-05, 'epoch': 0.17} + 17%|█▋ | 1448/8750 [2:22:02<11:30:14, 5.67s/it] 17%|█▋ | 1449/8750 [2:22:08<11:37:13, 5.73s/it] 17%|█▋ | 1449/8750 [2:22:14<11:37:12, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.905170344804662e-05, 'epoch': 0.17} + 17%|█▋ | 1449/8750 [2:22:14<11:37:12, 5.73s/it] {'loss': 0.4768, 'learning_rate': 1.905170344804662e-05, 'epoch': 0.17} + 17%|█▋ | 1449/8750 [2:22:08<11:37:13, 5.73s/it]9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +1210 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... + 2 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...3 + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 17%|█▋ | 1450/8750 [2:22:14<11:34:57, 5.71s/it]7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 17%|█▋ | 1450/8750 [2:22:20<11:34:57, 5.71s/it]14 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4872, 'learning_rate': 1.90501294441818e-05, 'epoch': 0.17} + 17%|█▋ | 1450/8750 [2:22:20<11:34:57, 5.71s/it] {'loss': 0.4872, 'learning_rate': 1.90501294441818e-05, 'epoch': 0.17} + 17%|█▋ | 1450/8750 [2:22:14<11:34:57, 5.71s/it] 17%|█▋ | 1451/8750 [2:22:20<11:46:27, 5.81s/it] 17%|█▋ | 1451/8750 [2:22:26<11:46:27, 5.81s/it] {'loss': 0.4736, 'learning_rate': 1.9048554200247184e-05, 'epoch': 0.17} + 17%|█▋ | 1451/8750 [2:22:26<11:46:27, 5.81s/it] {'loss': 0.4736, 'learning_rate': 1.9048554200247184e-05, 'epoch': 0.17} + 17%|█▋ | 1451/8750 [2:22:20<11:46:27, 5.81s/it] 17%|█▋ | 1452/8750 [2:22:25<11:40:56, 5.76s/it] 17%|█▋ | 1452/8750 [2:22:32<11:40:56, 5.76s/it] {'loss': 0.4808, 'learning_rate': 1.9046977716458627e-05, 'epoch': 0.17} + 17%|█▋ | 1452/8750 [2:22:32<11:40:56, 5.76s/it] {'loss': 0.4808, 'learning_rate': 1.9046977716458627e-05, 'epoch': 0.17} + 17%|█▋ | 1452/8750 [2:22:25<11:40:56, 5.76s/it] 17%|█▋ | 1453/8750 [2:22:31<11:38:57, 5.75s/it] 17%|█▋ | 1453/8750 [2:22:37<11:38:57, 5.75s/it] {'loss': 0.4925, 'learning_rate': 1.904539999303214e-05, 'epoch': 0.17} + 17%|█▋ | 1453/8750 [2:22:37<11:38:57, 5.75s/it]{'loss': 0.4925, 'learning_rate': 1.904539999303214e-05, 'epoch': 0.17} + 17%|█▋ | 1453/8750 [2:22:31<11:38:57, 5.75s/it] 17%|█▋ | 1454/8750 [2:22:37<11:38:15, 5.74s/it] 17%|█▋ | 1454/8750 [2:22:43<11:38:14, 5.74s/it] {'loss': 0.4836, 'learning_rate': 1.90438210301839e-05, 'epoch': 0.17} + 17%|█▋ | 1454/8750 [2:22:43<11:38:14, 5.74s/it] {'loss': 0.4836, 'learning_rate': 1.90438210301839e-05, 'epoch': 0.17} + 17%|█▋ | 1454/8750 [2:22:37<11:38:15, 5.74s/it] 17%|█▋ | 1455/8750 [2:22:42<11:32:34, 5.70s/it] 17%|█▋ | 1455/8750 [2:22:49<11:32:34, 5.70s/it] {'loss': 0.4678, 'learning_rate': 1.9042240828130267e-05, 'epoch': 0.17} + 17%|█▋ | 1455/8750 [2:22:49<11:32:34, 5.70s/it] {'loss': 0.4678, 'learning_rate': 1.9042240828130267e-05, 'epoch': 0.17} + 17%|█▋ | 1455/8750 [2:22:42<11:32:34, 5.70s/it] 17%|█▋ | 1456/8750 [2:22:48<11:30:55, 5.68s/it] 
17%|█▋ | 1456/8750 [2:22:54<11:30:55, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.9040659387087762e-05, 'epoch': 0.17} + 17%|█▋ | 1456/8750 [2:22:54<11:30:55, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.9040659387087762e-05, 'epoch': 0.17} + 17%|█▋ | 1456/8750 [2:22:48<11:30:55, 5.68s/it] 17%|█▋ | 1457/8750 [2:22:54<11:27:59, 5.66s/it] 17%|█▋ | 1457/8750 [2:23:00<11:27:59, 5.66s/it] {'loss': 0.4949, 'learning_rate': 1.903907670727308e-05, 'epoch': 0.17} + 17%|█▋ | 1457/8750 [2:23:00<11:27:59, 5.66s/it] {'loss': 0.4949, 'learning_rate': 1.903907670727308e-05, 'epoch': 0.17} + 17%|█▋ | 1457/8750 [2:22:54<11:27:59, 5.66s/it] 17%|█▋ | 1458/8750 [2:22:59<11:30:47, 5.68s/it] 17%|█▋ | 1458/8750 [2:23:06<11:30:47, 5.68s/it] {'loss': 0.4879, 'learning_rate': 1.903749278890308e-05, 'epoch': 0.17} + 17%|█▋ | 1458/8750 [2:23:06<11:30:47, 5.68s/it] {'loss': 0.4879, 'learning_rate': 1.903749278890308e-05, 'epoch': 0.17} + 17%|█▋ | 1458/8750 [2:22:59<11:30:47, 5.68s/it] 17%|█▋ | 1459/8750 [2:23:05<11:28:07, 5.66s/it] 17%|█▋ | 1459/8750 [2:23:11<11:28:07, 5.66s/it] {'loss': 0.5004, 'learning_rate': 1.903590763219479e-05, 'epoch': 0.17} + 17%|█▋ | 1459/8750 [2:23:11<11:28:07, 5.66s/it] {'loss': 0.5004, 'learning_rate': 1.903590763219479e-05, 'epoch': 0.17} + 17%|█▋ | 1459/8750 [2:23:05<11:28:07, 5.66s/it] 17%|█▋ | 1460/8750 [2:23:11<11:42:28, 5.78s/it] 17%|█▋ | 1460/8750 [2:23:17<11:42:28, 5.78s/it] {'loss': 0.4882, 'learning_rate': 1.9034321237365424e-05, 'epoch': 0.17} + 17%|█▋ | 1460/8750 [2:23:17<11:42:28, 5.78s/it] {'loss': 0.4882, 'learning_rate': 1.9034321237365424e-05, 'epoch': 0.17} + 17%|█▋ | 1460/8750 [2:23:11<11:42:28, 5.78s/it] 17%|█▋ | 1461/8750 [2:23:17<11:35:39, 5.73s/it] 17%|█▋ | 1461/8750 [2:23:23<11:35:39, 5.73s/it] {'loss': 0.4818, 'learning_rate': 1.9032733604632347e-05, 'epoch': 0.17} + 17%|█▋ | 1461/8750 [2:23:23<11:35:39, 5.73s/it] {'loss': 0.4818, 'learning_rate': 1.9032733604632347e-05, 'epoch': 0.17} + 17%|█▋ | 1461/8750 [2:23:17<11:35:39, 5.73s/it] 
17%|█▋ | 1462/8750 [2:23:22<11:38:15, 5.75s/it] 17%|█▋ | 1462/8750 [2:23:29<11:38:16, 5.75s/it] {'loss': 0.5016, 'learning_rate': 1.9031144734213097e-05, 'epoch': 0.17} + {'loss': 0.5016, 'learning_rate': 1.9031144734213097e-05, 'epoch': 0.17} 17%|█▋ | 1462/8750 [2:23:29<11:38:16, 5.75s/it] + 17%|█▋ | 1462/8750 [2:23:22<11:38:15, 5.75s/it] 17%|█▋ | 1463/8750 [2:23:28<11:42:56, 5.79s/it] 17%|█▋ | 1463/8750 [2:23:35<11:42:56, 5.79s/it] {'loss': 0.5071, 'learning_rate': 1.9029554626325386e-05, 'epoch': 0.17} + 17%|█▋ | 1463/8750 [2:23:28<11:42:56, 5.79s/it]{'loss': 0.5071, 'learning_rate': 1.9029554626325386e-05, 'epoch': 0.17} + 17%|█▋ | 1463/8750 [2:23:35<11:42:56, 5.79s/it] 17%|█▋ | 1464/8750 [2:23:34<11:39:58, 5.76s/it] 17%|█▋ | 1464/8750 [2:23:40<11:39:58, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.90279632811871e-05, 'epoch': 0.17} + 17%|█▋ | 1464/8750 [2:23:40<11:39:58, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.90279632811871e-05, 'epoch': 0.17} + 17%|█▋ | 1464/8750 [2:23:34<11:39:58, 5.76s/it] 17%|█▋ | 1465/8750 [2:23:39<11:32:27, 5.70s/it] 17%|█▋ | 1465/8750 [2:23:46<11:32:26, 5.70s/it] {'loss': 0.4908, 'learning_rate': 1.902637069901628e-05, 'epoch': 0.17} + 17%|█▋ | 1465/8750 [2:23:46<11:32:26, 5.70s/it] {'loss': 0.4908, 'learning_rate': 1.902637069901628e-05, 'epoch': 0.17} + 17%|█▋ | 1465/8750 [2:23:39<11:32:27, 5.70s/it] 17%|█▋ | 1466/8750 [2:23:45<11:29:16, 5.68s/it] 17%|█▋ | 1466/8750 [2:23:52<11:29:15, 5.68s/it] {'loss': 0.4762, 'learning_rate': 1.9024776880031154e-05, 'epoch': 0.17} + 17%|█▋ | 1466/8750 [2:23:52<11:29:15, 5.68s/it] {'loss': 0.4762, 'learning_rate': 1.9024776880031154e-05, 'epoch': 0.17} + 17%|█▋ | 1466/8750 [2:23:45<11:29:16, 5.68s/it] 17%|█▋ | 1467/8750 [2:23:51<11:30:52, 5.69s/it] 17%|█▋ | 1467/8750 [2:23:57<11:30:52, 5.69s/it] {'loss': 0.4817, 'learning_rate': 1.9023181824450106e-05, 'epoch': 0.17} + 17%|█▋ | 1467/8750 [2:23:57<11:30:52, 5.69s/it] {'loss': 0.4817, 'learning_rate': 1.9023181824450106e-05, 'epoch': 0.17} + 
17%|█▋ | 1467/8750 [2:23:51<11:30:52, 5.69s/it] 17%|█▋ | 1468/8750 [2:23:57<11:42:51, 5.79s/it] 17%|█▋ | 1468/8750 [2:24:03<11:42:52, 5.79s/it] {'loss': 0.4806, 'learning_rate': 1.9021585532491694e-05, 'epoch': 0.17} + 17%|█▋ | 1468/8750 [2:24:03<11:42:52, 5.79s/it] {'loss': 0.4806, 'learning_rate': 1.9021585532491694e-05, 'epoch': 0.17} + 17%|█▋ | 1468/8750 [2:23:57<11:42:51, 5.79s/it] 17%|█▋ | 1469/8750 [2:24:03<11:39:56, 5.77s/it] 17%|█▋ | 1469/8750 [2:24:09<11:39:56, 5.77s/it] {'loss': 0.4877, 'learning_rate': 1.9019988004374645e-05, 'epoch': 0.17} + 17%|█▋ | 1469/8750 [2:24:09<11:39:56, 5.77s/it] {'loss': 0.4877, 'learning_rate': 1.9019988004374645e-05, 'epoch': 0.17} + 17%|█▋ | 1469/8750 [2:24:03<11:39:56, 5.77s/it] 17%|█▋ | 1470/8750 [2:24:08<11:41:17, 5.78s/it] 17%|█▋ | 1470/8750 [2:24:15<11:41:18, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.901838924031786e-05, 'epoch': 0.17} + 17%|█▋ | 1470/8750 [2:24:15<11:41:18, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.901838924031786e-05, 'epoch': 0.17} + 17%|█▋ | 1470/8750 [2:24:08<11:41:17, 5.78s/it] 17%|█▋ | 1471/8750 [2:24:14<11:37:03, 5.75s/it] 17%|█▋ | 1471/8750 [2:24:21<11:37:03, 5.75s/it] {'loss': 0.4945, 'learning_rate': 1.90167892405404e-05, 'epoch': 0.17} + 17%|█▋ | 1471/8750 [2:24:21<11:37:03, 5.75s/it] {'loss': 0.4945, 'learning_rate': 1.90167892405404e-05, 'epoch': 0.17} + 17%|█▋ | 1471/8750 [2:24:14<11:37:03, 5.75s/it] 17%|█▋ | 1472/8750 [2:24:20<11:37:05, 5.75s/it] 17%|█▋ | 1472/8750 [2:24:26<11:37:04, 5.75s/it] {'loss': 0.4863, 'learning_rate': 1.9015188005261505e-05, 'epoch': 0.17} + {'loss': 0.4863, 'learning_rate': 1.9015188005261505e-05, 'epoch': 0.17} 17%|█▋ | 1472/8750 [2:24:26<11:37:04, 5.75s/it] + 17%|█▋ | 1472/8750 [2:24:20<11:37:05, 5.75s/it] 17%|█▋ | 1473/8750 [2:24:25<11:33:50, 5.72s/it] 17%|█▋ | 1473/8750 [2:24:32<11:33:49, 5.72s/it] {'loss': 0.4841, 'learning_rate': 1.9013585534700582e-05, 'epoch': 0.17} + 17%|█▋ | 1473/8750 [2:24:32<11:33:49, 5.72s/it] {'loss': 0.4841, 
'learning_rate': 1.9013585534700582e-05, 'epoch': 0.17} + 17%|█▋ | 1473/8750 [2:24:25<11:33:50, 5.72s/it] 17%|█▋ | 1474/8750 [2:24:31<11:32:49, 5.71s/it] 17%|█▋ | 1474/8750 [2:24:38<11:32:49, 5.71s/it] {'loss': 0.4733, 'learning_rate': 1.90119818290772e-05, 'epoch': 0.17} + 17%|█▋ | 1474/8750 [2:24:38<11:32:49, 5.71s/it] {'loss': 0.4733, 'learning_rate': 1.90119818290772e-05, 'epoch': 0.17} + 17%|█▋ | 1474/8750 [2:24:31<11:32:49, 5.71s/it] 17%|█▋ | 1475/8750 [2:24:37<11:28:49, 5.68s/it] 17%|█▋ | 1475/8750 [2:24:43<11:28:49, 5.68s/it] {'loss': 0.4872, 'learning_rate': 1.9010376888611106e-05, 'epoch': 0.17} + 17%|█▋ | 1475/8750 [2:24:43<11:28:49, 5.68s/it] {'loss': 0.4872, 'learning_rate': 1.9010376888611106e-05, 'epoch': 0.17} + 17%|█▋ | 1475/8750 [2:24:37<11:28:49, 5.68s/it] 17%|█▋ | 1476/8750 [2:24:42<11:28:19, 5.68s/it] 17%|█▋ | 1476/8750 [2:24:49<11:28:20, 5.68s/it] {'loss': 0.4819, 'learning_rate': 1.9008770713522206e-05, 'epoch': 0.17} + 17%|█▋ | 1476/8750 [2:24:49<11:28:20, 5.68s/it] {'loss': 0.4819, 'learning_rate': 1.9008770713522206e-05, 'epoch': 0.17} + 17%|█▋ | 1476/8750 [2:24:42<11:28:19, 5.68s/it] 17%|█▋ | 1477/8750 [2:24:48<11:36:06, 5.74s/it] 17%|█▋ | 1477/8750 [2:24:55<11:36:06, 5.74s/it] {'loss': 0.4832, 'learning_rate': 1.9007163304030593e-05, 'epoch': 0.17} + 17%|█▋ | 1477/8750 [2:24:55<11:36:06, 5.74s/it] {'loss': 0.4832, 'learning_rate': 1.9007163304030593e-05, 'epoch': 0.17} + 17%|█▋ | 1477/8750 [2:24:48<11:36:06, 5.74s/it] 17%|█▋ | 1478/8750 [2:24:54<11:29:43, 5.69s/it] 17%|█▋ | 1478/8750 [2:25:00<11:29:42, 5.69s/it] {'loss': 0.478, 'learning_rate': 1.9005554660356505e-05, 'epoch': 0.17} + 17%|█▋ | 1478/8750 [2:25:00<11:29:42, 5.69s/it] {'loss': 0.478, 'learning_rate': 1.9005554660356505e-05, 'epoch': 0.17} + 17%|█▋ | 1478/8750 [2:24:54<11:29:43, 5.69s/it] 17%|█▋ | 1479/8750 [2:25:00<11:28:56, 5.69s/it] 17%|█▋ | 1479/8750 [2:25:06<11:28:55, 5.68s/it] {'loss': 0.4873, 'learning_rate': 1.9003944782720375e-05, 'epoch': 0.17} + 17%|█▋ | 1479/8750 
[2:25:06<11:28:55, 5.68s/it] {'loss': 0.4873, 'learning_rate': 1.9003944782720375e-05, 'epoch': 0.17} + 17%|█▋ | 1479/8750 [2:25:00<11:28:56, 5.69s/it] 17%|█▋ | 1480/8750 [2:25:12<11:34:58, 5.74s/it] 17%|█▋ | 1480/8750 [2:25:05<11:35:04, 5.74s/it] {'loss': 0.4852, 'learning_rate': 1.9002333671342782e-05, 'epoch': 0.17} + 17%|█▋ | 1480/8750 [2:25:12<11:34:58, 5.74s/it] {'loss': 0.4852, 'learning_rate': 1.9002333671342782e-05, 'epoch': 0.17} + 17%|█▋ | 1480/8750 [2:25:05<11:35:04, 5.74s/it] 17%|█▋ | 1481/8750 [2:25:11<11:31:14, 5.71s/it] 17%|█▋ | 1481/8750 [2:25:18<11:31:16, 5.71s/it] {'loss': 0.4965, 'learning_rate': 1.9000721326444492e-05, 'epoch': 0.17} + 17%|█▋ | 1481/8750 [2:25:18<11:31:16, 5.71s/it] {'loss': 0.4965, 'learning_rate': 1.9000721326444492e-05, 'epoch': 0.17} + 17%|█▋ | 1481/8750 [2:25:11<11:31:14, 5.71s/it] 17%|█▋ | 1482/8750 [2:25:17<11:28:50, 5.69s/it] 17%|█▋ | 1482/8750 [2:25:23<11:28:51, 5.69s/it] {'loss': 0.4757, 'learning_rate': 1.8999107748246427e-05, 'epoch': 0.17} + 17%|█▋ | 1482/8750 [2:25:23<11:28:51, 5.69s/it] {'loss': 0.4757, 'learning_rate': 1.8999107748246427e-05, 'epoch': 0.17} + 17%|█▋ | 1482/8750 [2:25:17<11:28:50, 5.69s/it] 17%|█▋ | 1483/8750 [2:25:22<11:29:36, 5.69s/it] 17%|█▋ | 1483/8750 [2:25:29<11:29:37, 5.69s/it] {'loss': 0.4775, 'learning_rate': 1.8997492936969686e-05, 'epoch': 0.17} + 17%|█▋ | 1483/8750 [2:25:29<11:29:37, 5.69s/it] {'loss': 0.4775, 'learning_rate': 1.8997492936969686e-05, 'epoch': 0.17} + 17%|█▋ | 1483/8750 [2:25:22<11:29:36, 5.69s/it] 17%|█▋ | 1484/8750 [2:25:28<11:28:13, 5.68s/it] 17%|█▋ | 1484/8750 [2:25:35<11:28:14, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.899587689283553e-05, 'epoch': 0.17} + 17%|█▋ | 1484/8750 [2:25:35<11:28:14, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.899587689283553e-05, 'epoch': 0.17} + 17%|█▋ | 1484/8750 [2:25:28<11:28:13, 5.68s/it] 17%|█▋ | 1485/8750 [2:25:34<11:24:31, 5.65s/it] 17%|█▋ | 1485/8750 [2:25:40<11:24:31, 5.65s/it] {'loss': 0.4887, 'learning_rate': 
1.89942596160654e-05, 'epoch': 0.17} + 17%|█▋ | 1485/8750 [2:25:40<11:24:31, 5.65s/it] {'loss': 0.4887, 'learning_rate': 1.89942596160654e-05, 'epoch': 0.17} + 17%|█▋ | 1485/8750 [2:25:34<11:24:31, 5.65s/it] 17%|█▋ | 1486/8750 [2:25:39<11:25:14, 5.66s/it] 17%|█▋ | 1486/8750 [2:25:46<11:25:14, 5.66s/it] {'loss': 0.4752, 'learning_rate': 1.899264110688089e-05, 'epoch': 0.17} + 17%|█▋ | 1486/8750 [2:25:46<11:25:14, 5.66s/it] {'loss': 0.4752, 'learning_rate': 1.899264110688089e-05, 'epoch': 0.17} + 17%|█▋ | 1486/8750 [2:25:39<11:25:14, 5.66s/it] 17%|█▋ | 1487/8750 [2:25:45<11:22:51, 5.64s/it] 17%|█▋ | 1487/8750 [2:25:51<11:22:51, 5.64s/it] {'loss': 0.4975, 'learning_rate': 1.8991021365503782e-05, 'epoch': 0.17} + 17%|█▋ | 1487/8750 [2:25:51<11:22:51, 5.64s/it] {'loss': 0.4975, 'learning_rate': 1.8991021365503782e-05, 'epoch': 0.17} + 17%|█▋ | 1487/8750 [2:25:45<11:22:51, 5.64s/it] 17%|█▋ | 1488/8750 [2:25:51<11:43:13, 5.81s/it] 17%|█▋ | 1488/8750 [2:25:58<11:43:14, 5.81s/it] {'loss': 0.4667, 'learning_rate': 1.8989400392156012e-05, 'epoch': 0.17} + 17%|█▋ | 1488/8750 [2:25:58<11:43:14, 5.81s/it] {'loss': 0.4667, 'learning_rate': 1.8989400392156012e-05, 'epoch': 0.17} + 17%|█▋ | 1488/8750 [2:25:51<11:43:13, 5.81s/it] 17%|█▋ | 1489/8750 [2:25:57<11:37:06, 5.76s/it] 17%|█▋ | 1489/8750 [2:26:03<11:37:06, 5.76s/it] {'loss': 0.5035, 'learning_rate': 1.898777818705969e-05, 'epoch': 0.17} + 17%|█▋ | 1489/8750 [2:26:03<11:37:06, 5.76s/it] {'loss': 0.5035, 'learning_rate': 1.898777818705969e-05, 'epoch': 0.17} + 17%|█▋ | 1489/8750 [2:25:57<11:37:06, 5.76s/it] 17%|█▋ | 1490/8750 [2:26:03<11:44:16, 5.82s/it] 17%|█▋ | 1490/8750 [2:26:09<11:44:17, 5.82s/it] {'loss': 0.4994, 'learning_rate': 1.898615475043709e-05, 'epoch': 0.17} + 17%|█▋ | 1490/8750 [2:26:09<11:44:17, 5.82s/it] {'loss': 0.4994, 'learning_rate': 1.898615475043709e-05, 'epoch': 0.17} + 17%|█▋ | 1490/8750 [2:26:03<11:44:16, 5.82s/it] 17%|█▋ | 1491/8750 [2:26:09<11:43:32, 5.82s/it] 17%|█▋ | 1491/8750 [2:26:15<11:43:33, 
5.82s/it] {'loss': 0.4835, 'learning_rate': 1.8984530082510665e-05, 'epoch': 0.17} + 17%|█▋ | 1491/8750 [2:26:15<11:43:33, 5.82s/it] {'loss': 0.4835, 'learning_rate': 1.8984530082510665e-05, 'epoch': 0.17} + 17%|█▋ | 1491/8750 [2:26:09<11:43:32, 5.82s/it] 17%|█▋ | 1492/8750 [2:26:14<11:43:58, 5.82s/it] 17%|█▋ | 1492/8750 [2:26:21<11:43:58, 5.82s/it] {'loss': 0.4679, 'learning_rate': 1.898290418350303e-05, 'epoch': 0.17} + 17%|█▋ | 1492/8750 [2:26:21<11:43:58, 5.82s/it] {'loss': 0.4679, 'learning_rate': 1.898290418350303e-05, 'epoch': 0.17} + 17%|█▋ | 1492/8750 [2:26:14<11:43:58, 5.82s/it] 17%|█▋ | 1493/8750 [2:26:20<11:38:45, 5.78s/it] 17%|█▋ | 1493/8750 [2:26:27<11:38:45, 5.78s/it] {'loss': 0.4847, 'learning_rate': 1.8981277053636963e-05, 'epoch': 0.17} + 17%|█▋ | 1493/8750 [2:26:27<11:38:45, 5.78s/it] {'loss': 0.4847, 'learning_rate': 1.8981277053636963e-05, 'epoch': 0.17} + 17%|█▋ | 1493/8750 [2:26:20<11:38:45, 5.78s/it] 17%|█▋ | 1494/8750 [2:26:26<11:34:49, 5.75s/it] 17%|█▋ | 1494/8750 [2:26:32<11:34:49, 5.75s/it] {'loss': 0.4753, 'learning_rate': 1.8979648693135428e-05, 'epoch': 0.17} + 17%|█▋ | 1494/8750 [2:26:32<11:34:49, 5.75s/it] {'loss': 0.4753, 'learning_rate': 1.8979648693135428e-05, 'epoch': 0.17} + 17%|█▋ | 1494/8750 [2:26:26<11:34:49, 5.75s/it] 17%|█▋ | 1495/8750 [2:26:31<11:31:54, 5.72s/it] 17%|█▋ | 1495/8750 [2:26:38<11:31:53, 5.72s/it] {'loss': 0.5043, 'learning_rate': 1.8978019102221538e-05, 'epoch': 0.17} + 17%|█▋ | 1495/8750 [2:26:38<11:31:53, 5.72s/it] {'loss': 0.5043, 'learning_rate': 1.8978019102221538e-05, 'epoch': 0.17} + 17%|█▋ | 1495/8750 [2:26:31<11:31:54, 5.72s/it] 17%|█▋ | 1496/8750 [2:26:37<11:33:06, 5.73s/it] 17%|█▋ | 1496/8750 [2:26:44<11:33:06, 5.73s/it] {'loss': 0.4708, 'learning_rate': 1.8976388281118584e-05, 'epoch': 0.17} + 17%|█▋ | 1496/8750 [2:26:44<11:33:06, 5.73s/it] {'loss': 0.4708, 'learning_rate': 1.8976388281118584e-05, 'epoch': 0.17} + 17%|█▋ | 1496/8750 [2:26:37<11:33:06, 5.73s/it] 17%|█▋ | 1497/8750 
[2:26:43<11:30:52, 5.72s/it] 17%|█▋ | 1497/8750 [2:26:49<11:30:52, 5.72s/it] {'loss': 0.4852, 'learning_rate': 1.8974756230050028e-05, 'epoch': 0.17} + 17%|█▋ | 1497/8750 [2:26:49<11:30:52, 5.72s/it] {'loss': 0.4852, 'learning_rate': 1.8974756230050028e-05, 'epoch': 0.17} + 17%|█▋ | 1497/8750 [2:26:43<11:30:52, 5.72s/it] 17%|█▋ | 1498/8750 [2:26:49<11:34:42, 5.75s/it] 17%|█▋ | 1498/8750 [2:26:55<11:34:42, 5.75s/it] {'loss': 0.4888, 'learning_rate': 1.8973122949239497e-05, 'epoch': 0.17} + 17%|█▋ | 1498/8750 [2:26:55<11:34:42, 5.75s/it] {'loss': 0.4888, 'learning_rate': 1.8973122949239497e-05, 'epoch': 0.17} + 17%|█▋ | 1498/8750 [2:26:49<11:34:42, 5.75s/it] 17%|█▋ | 1499/8750 [2:26:55<11:48:17, 5.86s/it] 17%|█▋ | 1499/8750 [2:27:01<11:48:16, 5.86s/it] {'loss': 0.4918, 'learning_rate': 1.897148843891079e-05, 'epoch': 0.17} + 17%|█▋ | 1499/8750 [2:27:01<11:48:16, 5.86s/it] {'loss': 0.4918, 'learning_rate': 1.897148843891079e-05, 'epoch': 0.17} + 17%|█▋ | 1499/8750 [2:26:55<11:48:17, 5.86s/it]4 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +35 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 17%|█▋ | 1500/8750 [2:27:07<11:43:40, 5.82s/it] 17%|█▋ | 1500/8750 [2:27:00<11:43:41, 5.82s/it]10 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4865, 'learning_rate': 1.8969852699287865e-05, 'epoch': 0.17} + 17%|█▋ | 1500/8750 [2:27:07<11:43:40, 5.82s/it] {'loss': 0.4865, 'learning_rate': 1.8969852699287865e-05, 'epoch': 0.17} + 17%|█▋ | 1500/8750 [2:27:01<11:43:41, 5.82s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 17%|█▋ | 1501/8750 [2:27:27<20:24:57, 10.14s/it] 17%|█▋ | 1501/8750 [2:27:21<20:24:57, 10.14s/it] {'loss': 0.4742, 'learning_rate': 1.896821573059486e-05, 'epoch': 0.17} + 17%|█▋ | 1501/8750 [2:27:27<20:24:57, 10.14s/it] {'loss': 0.4742, 'learning_rate': 1.896821573059486e-05, 'epoch': 0.17} + 17%|█▋ | 1501/8750 [2:27:21<20:24:57, 10.14s/it] 17%|█▋ | 1502/8750 [2:27:33<17:53:00, 8.88s/it] 17%|█▋ | 1502/8750 [2:27:27<17:53:02, 8.88s/it] {'loss': 0.4891, 'learning_rate': 1.896657753305607e-05, 'epoch': 0.17} + 17%|█▋ | 1502/8750 [2:27:33<17:53:00, 8.88s/it] {'loss': 0.4891, 'learning_rate': 1.896657753305607e-05, 'epoch': 0.17} + 17%|█▋ | 1502/8750 [2:27:27<17:53:02, 8.88s/it] 17%|█▋ | 1503/8750 [2:27:39<16:10:20, 8.03s/it] 17%|█▋ | 1503/8750 [2:27:33<16:10:20, 8.03s/it] {'loss': 0.5085, 'learning_rate': 1.896493810689597e-05, 'epoch': 0.17} + 17%|█▋ | 1503/8750 [2:27:39<16:10:20, 8.03s/it] {'loss': 0.5085, 'learning_rate': 1.896493810689597e-05, 'epoch': 0.17} + 17%|█▋ | 1503/8750 [2:27:33<16:10:20, 8.03s/it] 17%|█▋ | 1504/8750 [2:27:45<14:44:02, 7.32s/it] 17%|█▋ | 1504/8750 [2:27:38<14:44:06, 7.32s/it] {'loss': 0.5098, 'learning_rate': 1.89632974523392e-05, 'epoch': 0.17} + 17%|█▋ | 1504/8750 [2:27:45<14:44:02, 7.32s/it] {'loss': 0.5098, 'learning_rate': 1.89632974523392e-05, 'epoch': 0.17} + 17%|█▋ | 1504/8750 [2:27:38<14:44:06, 7.32s/it] 17%|█▋ | 1505/8750 [2:27:50<13:42:28, 6.81s/it] 17%|█▋ | 1505/8750 [2:27:44<13:42:27, 6.81s/it] {'loss': 0.4616, 'learning_rate': 1.8961655569610557e-05, 'epoch': 0.17} + 17%|█▋ | 1505/8750 [2:27:50<13:42:28, 6.81s/it] {'loss': 0.4616, 'learning_rate': 1.8961655569610557e-05, 'epoch': 0.17} + 17%|█▋ | 1505/8750 [2:27:44<13:42:27, 6.81s/it] 17%|█▋ | 1506/8750 [2:27:56<13:03:29, 6.49s/it] 17%|█▋ | 1506/8750 [2:27:50<13:03:28, 6.49s/it] {'loss': 0.4735, 'learning_rate': 1.8960012458935025e-05, 'epoch': 0.17} + 17%|█▋ | 1506/8750 [2:27:56<13:03:29, 6.49s/it] {'loss': 0.4735, 'learning_rate': 1.8960012458935025e-05, 
'epoch': 0.17} + 17%|█▋ | 1506/8750 [2:27:50<13:03:28, 6.49s/it] 17%|█▋ | 1507/8750 [2:27:55<12:27:00, 6.19s/it] 17%|█▋ | 1507/8750 [2:28:02<12:27:01, 6.19s/it] {'loss': 0.5024, 'learning_rate': 1.8958368120537746e-05, 'epoch': 0.17} + 17%|█▋ | 1507/8750 [2:28:02<12:27:01, 6.19s/it] {'loss': 0.5024, 'learning_rate': 1.8958368120537746e-05, 'epoch': 0.17} + 17%|█▋ | 1507/8750 [2:27:55<12:27:00, 6.19s/it] 17%|█▋ | 1508/8750 [2:28:01<12:15:00, 6.09s/it] 17%|█▋ | 1508/8750 [2:28:08<12:15:01, 6.09s/it] {'loss': 0.4795, 'learning_rate': 1.8956722554644026e-05, 'epoch': 0.17} + 17%|█▋ | 1508/8750 [2:28:08<12:15:01, 6.09s/it] {'loss': 0.4795, 'learning_rate': 1.8956722554644026e-05, 'epoch': 0.17} + 17%|█▋ | 1508/8750 [2:28:01<12:15:00, 6.09s/it] 17%|█▋ | 1509/8750 [2:28:07<12:05:34, 6.01s/it] 17%|█▋ | 1509/8750 [2:28:13<12:05:35, 6.01s/it] {'loss': 0.4802, 'learning_rate': 1.8955075761479342e-05, 'epoch': 0.17} + 17%|█▋ | 1509/8750 [2:28:13<12:05:35, 6.01s/it] {'loss': 0.4802, 'learning_rate': 1.8955075761479342e-05, 'epoch': 0.17} + 17%|█▋ | 1509/8750 [2:28:07<12:05:34, 6.01s/it] 17%|█▋ | 1510/8750 [2:28:13<11:54:23, 5.92s/it] 17%|█▋ | 1510/8750 [2:28:19<11:54:24, 5.92s/it] {'loss': 0.4744, 'learning_rate': 1.895342774126935e-05, 'epoch': 0.17} + 17%|█▋ | 1510/8750 [2:28:19<11:54:24, 5.92s/it] {'loss': 0.4744, 'learning_rate': 1.895342774126935e-05, 'epoch': 0.17} + 17%|█▋ | 1510/8750 [2:28:13<11:54:23, 5.92s/it] 17%|█▋ | 1511/8750 [2:28:25<11:50:41, 5.89s/it] 17%|█▋ | 1511/8750 [2:28:18<11:50:41, 5.89s/it] {'loss': 0.4734, 'learning_rate': 1.8951778494239862e-05, 'epoch': 0.17} + 17%|█▋ | 1511/8750 [2:28:25<11:50:41, 5.89s/it] {'loss': 0.4734, 'learning_rate': 1.8951778494239862e-05, 'epoch': 0.17} + 17%|█▋ | 1511/8750 [2:28:18<11:50:41, 5.89s/it] 17%|█▋ | 1512/8750 [2:28:30<11:35:34, 5.77s/it] 17%|█▋ | 1512/8750 [2:28:24<11:35:34, 5.77s/it] {'loss': 0.4954, 'learning_rate': 1.8950128020616863e-05, 'epoch': 0.17} + 17%|█▋ | 1512/8750 [2:28:24<11:35:34, 5.77s/it] 
{'loss': 0.4954, 'learning_rate': 1.8950128020616863e-05, 'epoch': 0.17} + 17%|█▋ | 1512/8750 [2:28:30<11:35:34, 5.77s/it] 17%|█▋ | 1513/8750 [2:28:36<11:33:49, 5.75s/it] 17%|█▋ | 1513/8750 [2:28:30<11:33:50, 5.75s/it] {'loss': 0.4863, 'learning_rate': 1.89484763206265e-05, 'epoch': 0.17} + 17%|█▋ | 1513/8750 [2:28:36<11:33:49, 5.75s/it] {'loss': 0.4863, 'learning_rate': 1.89484763206265e-05, 'epoch': 0.17} + 17%|█▋ | 1513/8750 [2:28:30<11:33:50, 5.75s/it] 17%|█▋ | 1514/8750 [2:28:35<11:24:24, 5.67s/it] 17%|█▋ | 1514/8750 [2:28:42<11:24:25, 5.68s/it] {'loss': 0.4804, 'learning_rate': 1.89468233944951e-05, 'epoch': 0.17} + 17%|█▋ | 1514/8750 [2:28:42<11:24:25, 5.68s/it] {'loss': 0.4804, 'learning_rate': 1.89468233944951e-05, 'epoch': 0.17} + 17%|█▋ | 1514/8750 [2:28:35<11:24:24, 5.67s/it] 17%|█▋ | 1515/8750 [2:28:41<11:24:54, 5.68s/it] 17%|█▋ | 1515/8750 [2:28:47<11:24:54, 5.68s/it] {'loss': 0.5027, 'learning_rate': 1.8945169242449145e-05, 'epoch': 0.17} + 17%|█▋ | 1515/8750 [2:28:47<11:24:54, 5.68s/it]{'loss': 0.5027, 'learning_rate': 1.8945169242449145e-05, 'epoch': 0.17} + 17%|█▋ | 1515/8750 [2:28:41<11:24:54, 5.68s/it] 17%|█▋ | 1516/8750 [2:28:53<11:34:00, 5.76s/it] 17%|█▋ | 1516/8750 [2:28:47<11:34:01, 5.76s/it] {'loss': 0.4762, 'learning_rate': 1.894351386471529e-05, 'epoch': 0.17} + 17%|█▋ | 1516/8750 [2:28:53<11:34:00, 5.76s/it] {'loss': 0.4762, 'learning_rate': 1.894351386471529e-05, 'epoch': 0.17} + 17%|█▋ | 1516/8750 [2:28:47<11:34:01, 5.76s/it] 17%|█▋ | 1517/8750 [2:28:59<11:37:14, 5.78s/it] 17%|█▋ | 1517/8750 [2:28:53<11:37:14, 5.78s/it] {'loss': 0.4789, 'learning_rate': 1.8941857261520363e-05, 'epoch': 0.17} + 17%|█▋ | 1517/8750 [2:28:59<11:37:14, 5.78s/it] {'loss': 0.4789, 'learning_rate': 1.8941857261520363e-05, 'epoch': 0.17} + 17%|█▋ | 1517/8750 [2:28:53<11:37:14, 5.78s/it] 17%|█▋ | 1518/8750 [2:29:05<11:32:27, 5.74s/it] 17%|█▋ | 1518/8750 [2:28:58<11:32:27, 5.74s/it] {'loss': 0.4867, 'learning_rate': 1.8940199433091354e-05, 'epoch': 0.17} + 17%|█▋ 
| 1518/8750 [2:29:05<11:32:27, 5.74s/it] {'loss': 0.4867, 'learning_rate': 1.8940199433091354e-05, 'epoch': 0.17} + 17%|█▋ | 1518/8750 [2:28:58<11:32:27, 5.74s/it] 17%|█▋ | 1519/8750 [2:29:11<11:33:25, 5.75s/it] 17%|█▋ | 1519/8750 [2:29:04<11:33:25, 5.75s/it] {'loss': 0.4938, 'learning_rate': 1.893854037965542e-05, 'epoch': 0.17} + 17%|█▋ | 1519/8750 [2:29:11<11:33:25, 5.75s/it] {'loss': 0.4938, 'learning_rate': 1.893854037965542e-05, 'epoch': 0.17} + 17%|█▋ | 1519/8750 [2:29:04<11:33:25, 5.75s/it] 17%|█▋ | 1520/8750 [2:29:16<11:32:05, 5.74s/it] 17%|█▋ | 1520/8750 [2:29:10<11:32:06, 5.74s/it] {'loss': 0.4817, 'learning_rate': 1.8936880101439893e-05, 'epoch': 0.17} + 17%|█▋ | 1520/8750 [2:29:16<11:32:05, 5.74s/it] {'loss': 0.4817, 'learning_rate': 1.8936880101439893e-05, 'epoch': 0.17} + 17%|█▋ | 1520/8750 [2:29:10<11:32:06, 5.74s/it] 17%|█▋ | 1521/8750 [2:29:22<11:35:26, 5.77s/it] 17%|█▋ | 1521/8750 [2:29:16<11:35:27, 5.77s/it] {'loss': 0.5014, 'learning_rate': 1.8935218598672266e-05, 'epoch': 0.17} + 17%|█▋ | 1521/8750 [2:29:22<11:35:26, 5.77s/it] {'loss': 0.5014, 'learning_rate': 1.8935218598672266e-05, 'epoch': 0.17} + 17%|█▋ | 1521/8750 [2:29:16<11:35:27, 5.77s/it] 17%|█▋ | 1522/8750 [2:29:28<11:26:54, 5.70s/it] 17%|█▋ | 1522/8750 [2:29:21<11:26:54, 5.70s/it] {'loss': 0.4909, 'learning_rate': 1.8933555871580204e-05, 'epoch': 0.17} + 17%|█▋ | 1522/8750 [2:29:28<11:26:54, 5.70s/it] {'loss': 0.4909, 'learning_rate': 1.8933555871580204e-05, 'epoch': 0.17} + 17%|█▋ | 1522/8750 [2:29:21<11:26:54, 5.70s/it] 17%|█▋ | 1523/8750 [2:29:33<11:23:58, 5.68s/it] 17%|█▋ | 1523/8750 [2:29:27<11:23:58, 5.68s/it] {'loss': 0.5019, 'learning_rate': 1.8931891920391533e-05, 'epoch': 0.17} + 17%|█▋ | 1523/8750 [2:29:33<11:23:58, 5.68s/it] {'loss': 0.5019, 'learning_rate': 1.8931891920391533e-05, 'epoch': 0.17} + 17%|█▋ | 1523/8750 [2:29:27<11:23:58, 5.68s/it] 17%|█▋ | 1524/8750 [2:29:39<11:34:15, 5.76s/it] 17%|█▋ | 1524/8750 [2:29:33<11:34:15, 5.76s/it] {'loss': 0.482, 
'learning_rate': 1.893022674533425e-05, 'epoch': 0.17} + 17%|█▋ | 1524/8750 [2:29:39<11:34:15, 5.76s/it] {'loss': 0.482, 'learning_rate': 1.893022674533425e-05, 'epoch': 0.17} + 17%|█▋ | 1524/8750 [2:29:33<11:34:15, 5.76s/it] 17%|█▋ | 1525/8750 [2:29:38<11:34:13, 5.77s/it] 17%|█▋ | 1525/8750 [2:29:45<11:34:14, 5.77s/it] {'loss': 0.507, 'learning_rate': 1.8928560346636532e-05, 'epoch': 0.17} + 17%|█▋ | 1525/8750 [2:29:45<11:34:14, 5.77s/it] {'loss': 0.507, 'learning_rate': 1.8928560346636532e-05, 'epoch': 0.17} + 17%|█▋ | 1525/8750 [2:29:38<11:34:13, 5.77s/it] 17%|█▋ | 1526/8750 [2:29:44<11:37:30, 5.79s/it] 17%|█▋ | 1526/8750 [2:29:51<11:37:31, 5.79s/it] {'loss': 0.4968, 'learning_rate': 1.89268927245267e-05, 'epoch': 0.17} + 17%|█▋ | 1526/8750 [2:29:51<11:37:31, 5.79s/it] {'loss': 0.4968, 'learning_rate': 1.89268927245267e-05, 'epoch': 0.17} + 17%|█▋ | 1526/8750 [2:29:44<11:37:30, 5.79s/it] 17%|█▋ | 1527/8750 [2:29:57<11:37:46, 5.80s/it] 17%|█▋ | 1527/8750 [2:29:50<11:37:47, 5.80s/it] {'loss': 0.4785, 'learning_rate': 1.8925223879233267e-05, 'epoch': 0.17} + 17%|█▋ | 1527/8750 [2:29:57<11:37:46, 5.80s/it] {'loss': 0.4785, 'learning_rate': 1.8925223879233267e-05, 'epoch': 0.17} + 17%|█▋ | 1527/8750 [2:29:50<11:37:47, 5.80s/it] 17%|█▋ | 1528/8750 [2:30:02<11:38:53, 5.81s/it] 17%|█▋ | 1528/8750 [2:29:56<11:38:53, 5.81s/it] {'loss': 0.4749, 'learning_rate': 1.8923553810984893e-05, 'epoch': 0.17} + 17%|█▋ | 1528/8750 [2:29:56<11:38:53, 5.81s/it]{'loss': 0.4749, 'learning_rate': 1.8923553810984893e-05, 'epoch': 0.17} + 17%|█▋ | 1528/8750 [2:30:02<11:38:53, 5.81s/it] 17%|█▋ | 1529/8750 [2:30:08<11:36:38, 5.79s/it] 17%|█▋ | 1529/8750 [2:30:02<11:36:38, 5.79s/it] {'loss': 0.4744, 'learning_rate': 1.8921882520010416e-05, 'epoch': 0.17} + 17%|█▋ | 1529/8750 [2:30:08<11:36:38, 5.79s/it] {'loss': 0.4744, 'learning_rate': 1.8921882520010416e-05, 'epoch': 0.17} + 17%|█▋ | 1529/8750 [2:30:02<11:36:38, 5.79s/it] 17%|█▋ | 1530/8750 [2:30:14<11:33:46, 5.77s/it] 17%|█▋ | 1530/8750 
[2:30:07<11:33:46, 5.77s/it] {'loss': 0.5001, 'learning_rate': 1.8920210006538843e-05, 'epoch': 0.17} + 17%|█▋ | 1530/8750 [2:30:14<11:33:46, 5.77s/it] {'loss': 0.5001, 'learning_rate': 1.8920210006538843e-05, 'epoch': 0.17} + 17%|█▋ | 1530/8750 [2:30:07<11:33:46, 5.77s/it] 17%|█▋ | 1531/8750 [2:30:13<11:30:07, 5.74s/it] 17%|█▋ | 1531/8750 [2:30:20<11:30:09, 5.74s/it] {'loss': 0.4816, 'learning_rate': 1.891853627079935e-05, 'epoch': 0.17} + 17%|█▋ | 1531/8750 [2:30:20<11:30:09, 5.74s/it] {'loss': 0.4816, 'learning_rate': 1.891853627079935e-05, 'epoch': 0.17} + 17%|█▋ | 1531/8750 [2:30:13<11:30:07, 5.74s/it] 18%|█▊ | 1532/8750 [2:30:25<11:29:20, 5.73s/it] 18%|█▊ | 1532/8750 [2:30:19<11:29:20, 5.73s/it] {'loss': 0.4662, 'learning_rate': 1.8916861313021268e-05, 'epoch': 0.18} + 18%|█▊ | 1532/8750 [2:30:25<11:29:20, 5.73s/it] {'loss': 0.4662, 'learning_rate': 1.8916861313021268e-05, 'epoch': 0.18} + 18%|█▊ | 1532/8750 [2:30:19<11:29:20, 5.73s/it] 18%|█▊ | 1533/8750 [2:30:31<11:27:42, 5.72s/it] 18%|█▊ | 1533/8750 [2:30:24<11:27:42, 5.72s/it] {'loss': 0.4976, 'learning_rate': 1.8915185133434107e-05, 'epoch': 0.18} + 18%|█▊ | 1533/8750 [2:30:31<11:27:42, 5.72s/it] {'loss': 0.4976, 'learning_rate': 1.8915185133434107e-05, 'epoch': 0.18} + 18%|█▊ | 1533/8750 [2:30:25<11:27:42, 5.72s/it] 18%|█▊ | 1534/8750 [2:30:37<11:31:14, 5.75s/it] 18%|█▊ | 1534/8750 [2:30:30<11:31:14, 5.75s/it] {'loss': 0.48, 'learning_rate': 1.891350773226754e-05, 'epoch': 0.18} + 18%|█▊ | 1534/8750 [2:30:37<11:31:14, 5.75s/it] {'loss': 0.48, 'learning_rate': 1.891350773226754e-05, 'epoch': 0.18} + 18%|█▊ | 1534/8750 [2:30:30<11:31:14, 5.75s/it] 18%|█▊ | 1535/8750 [2:30:43<11:33:17, 5.77s/it] 18%|█▊ | 1535/8750 [2:30:36<11:33:16, 5.77s/it] {'loss': 0.4784, 'learning_rate': 1.891182910975141e-05, 'epoch': 0.18} + 18%|█▊ | 1535/8750 [2:30:43<11:33:17, 5.77s/it] {'loss': 0.4784, 'learning_rate': 1.891182910975141e-05, 'epoch': 0.18} + 18%|█▊ | 1535/8750 [2:30:36<11:33:16, 5.77s/it] 18%|█▊ | 1536/8750 
[2:30:48<11:28:07, 5.72s/it] 18%|█▊ | 1536/8750 [2:30:42<11:28:08, 5.72s/it] {'loss': 0.4776, 'learning_rate': 1.8910149266115724e-05, 'epoch': 0.18} + 18%|█▊ | 1536/8750 [2:30:48<11:28:07, 5.72s/it] {'loss': 0.4776, 'learning_rate': 1.8910149266115724e-05, 'epoch': 0.18} + 18%|█▊ | 1536/8750 [2:30:42<11:28:08, 5.72s/it] 18%|█▊ | 1537/8750 [2:30:54<11:41:14, 5.83s/it] 18%|█▊ | 1537/8750 [2:30:48<11:41:14, 5.83s/it] {'loss': 0.4834, 'learning_rate': 1.890846820159066e-05, 'epoch': 0.18} + 18%|█▊ | 1537/8750 [2:30:54<11:41:14, 5.83s/it] {'loss': 0.4834, 'learning_rate': 1.890846820159066e-05, 'epoch': 0.18} + 18%|█▊ | 1537/8750 [2:30:48<11:41:14, 5.83s/it] 18%|█▊ | 1538/8750 [2:31:00<11:33:30, 5.77s/it] 18%|█▊ | 1538/8750 [2:30:53<11:33:31, 5.77s/it] {'loss': 0.4873, 'learning_rate': 1.890678591640656e-05, 'epoch': 0.18} + 18%|█▊ | 1538/8750 [2:31:00<11:33:30, 5.77s/it] {'loss': 0.4873, 'learning_rate': 1.890678591640656e-05, 'epoch': 0.18} + 18%|█▊ | 1538/8750 [2:30:53<11:33:31, 5.77s/it] 18%|█▊ | 1539/8750 [2:31:06<11:28:16, 5.73s/it] 18%|█▊ | 1539/8750 [2:30:59<11:28:16, 5.73s/it] {'loss': 0.5074, 'learning_rate': 1.8905102410793936e-05, 'epoch': 0.18} + 18%|█▊ | 1539/8750 [2:31:06<11:28:16, 5.73s/it] {'loss': 0.5074, 'learning_rate': 1.8905102410793936e-05, 'epoch': 0.18} + 18%|█▊ | 1539/8750 [2:30:59<11:28:16, 5.73s/it] 18%|█▊ | 1540/8750 [2:31:12<11:41:54, 5.84s/it] 18%|█▊ | 1540/8750 [2:31:05<11:41:55, 5.84s/it] {'loss': 0.478, 'learning_rate': 1.8903417684983465e-05, 'epoch': 0.18} + 18%|█▊ | 1540/8750 [2:31:12<11:41:54, 5.84s/it] {'loss': 0.478, 'learning_rate': 1.8903417684983465e-05, 'epoch': 0.18} + 18%|█▊ | 1540/8750 [2:31:05<11:41:55, 5.84s/it] 18%|█▊ | 1541/8750 [2:31:17<11:34:25, 5.78s/it] 18%|█▊ | 1541/8750 [2:31:11<11:34:25, 5.78s/it] {'loss': 0.483, 'learning_rate': 1.8901731739205992e-05, 'epoch': 0.18} + 18%|█▊ | 1541/8750 [2:31:17<11:34:25, 5.78s/it] {'loss': 0.483, 'learning_rate': 1.8901731739205992e-05, 'epoch': 0.18} + 18%|█▊ | 1541/8750 
[2:31:11<11:34:25, 5.78s/it] 18%|█▊ | 1542/8750 [2:31:23<11:39:03, 5.82s/it] 18%|█▊ | 1542/8750 [2:31:17<11:39:03, 5.82s/it] {'loss': 0.4688, 'learning_rate': 1.8900044573692527e-05, 'epoch': 0.18} + 18%|█▊ | 1542/8750 [2:31:23<11:39:03, 5.82s/it] {'loss': 0.4688, 'learning_rate': 1.8900044573692527e-05, 'epoch': 0.18} + 18%|█▊ | 1542/8750 [2:31:17<11:39:03, 5.82s/it] 18%|█▊ | 1543/8750 [2:31:29<11:32:58, 5.77s/it] 18%|█▊ | 1543/8750 [2:31:22<11:32:57, 5.77s/it] {'loss': 0.487, 'learning_rate': 1.8898356188674253e-05, 'epoch': 0.18} + 18%|█▊ | 1543/8750 [2:31:29<11:32:58, 5.77s/it] {'loss': 0.487, 'learning_rate': 1.8898356188674253e-05, 'epoch': 0.18} + 18%|█▊ | 1543/8750 [2:31:22<11:32:57, 5.77s/it] 18%|█▊ | 1544/8750 [2:31:35<11:30:02, 5.75s/it] 18%|█▊ | 1544/8750 [2:31:28<11:30:02, 5.75s/it] {'loss': 0.489, 'learning_rate': 1.8896666584382516e-05, 'epoch': 0.18} + 18%|█▊ | 1544/8750 [2:31:35<11:30:02, 5.75s/it] {'loss': 0.489, 'learning_rate': 1.8896666584382516e-05, 'epoch': 0.18} + 18%|█▊ | 1544/8750 [2:31:28<11:30:02, 5.75s/it] 18%|█▊ | 1545/8750 [2:31:40<11:25:21, 5.71s/it] 18%|█▊ | 1545/8750 [2:31:34<11:25:21, 5.71s/it] {'loss': 0.4841, 'learning_rate': 1.8894975761048826e-05, 'epoch': 0.18} + 18%|█▊ | 1545/8750 [2:31:40<11:25:21, 5.71s/it] {'loss': 0.4841, 'learning_rate': 1.8894975761048826e-05, 'epoch': 0.18} + 18%|█▊ | 1545/8750 [2:31:34<11:25:21, 5.71s/it] 18%|█▊ | 1546/8750 [2:31:46<11:21:58, 5.68s/it] 18%|█▊ | 1546/8750 [2:31:39<11:21:58, 5.68s/it] {'loss': 0.4876, 'learning_rate': 1.8893283718904866e-05, 'epoch': 0.18} + 18%|█▊ | 1546/8750 [2:31:46<11:21:58, 5.68s/it] {'loss': 0.4876, 'learning_rate': 1.8893283718904866e-05, 'epoch': 0.18} + 18%|█▊ | 1546/8750 [2:31:39<11:21:58, 5.68s/it] 18%|█▊ | 1547/8750 [2:31:51<11:17:13, 5.64s/it] 18%|█▊ | 1547/8750 [2:31:45<11:17:13, 5.64s/it] {'loss': 0.4965, 'learning_rate': 1.8891590458182486e-05, 'epoch': 0.18} + 18%|█▊ | 1547/8750 [2:31:51<11:17:13, 5.64s/it] {'loss': 0.4965, 'learning_rate': 
1.8891590458182486e-05, 'epoch': 0.18} + 18%|█▊ | 1547/8750 [2:31:45<11:17:13, 5.64s/it] 18%|█▊ | 1548/8750 [2:31:57<11:25:17, 5.71s/it] 18%|█▊ | 1548/8750 [2:31:51<11:25:17, 5.71s/it] {'loss': 0.4915, 'learning_rate': 1.8889895979113698e-05, 'epoch': 0.18} + 18%|█▊ | 1548/8750 [2:31:57<11:25:17, 5.71s/it] {'loss': 0.4915, 'learning_rate': 1.8889895979113698e-05, 'epoch': 0.18} + 18%|█▊ | 1548/8750 [2:31:51<11:25:17, 5.71s/it] 18%|█▊ | 1549/8750 [2:32:03<11:37:23, 5.81s/it] 18%|█▊ | 1549/8750 [2:31:57<11:37:23, 5.81s/it] {'loss': 0.4695, 'learning_rate': 1.888820028193068e-05, 'epoch': 0.18} + 18%|█▊ | 1549/8750 [2:32:03<11:37:23, 5.81s/it] {'loss': 0.4695, 'learning_rate': 1.888820028193068e-05, 'epoch': 0.18} + 18%|█▊ | 1549/8750 [2:31:57<11:37:23, 5.81s/it]4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +1 18%|█▊ | 1550/8750 [2:32:09<11:29:30, 5.75s/it]15 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 18%|█▊ | 1550/8750 [2:32:02<11:29:30, 5.75s/it] AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4902, 'learning_rate': 1.8886503366865786e-05, 'epoch': 0.18} + 18%|█▊ | 1550/8750 [2:32:09<11:29:30, 5.75s/it] {'loss': 0.4902, 'learning_rate': 1.8886503366865786e-05, 'epoch': 0.18} + 18%|█▊ | 1550/8750 [2:32:02<11:29:30, 5.75s/it] 18%|█▊ | 1551/8750 [2:32:15<11:31:42, 5.77s/it] 18%|█▊ | 1551/8750 [2:32:08<11:31:43, 5.77s/it] {'loss': 0.4967, 'learning_rate': 1.888480523415153e-05, 'epoch': 0.18} + 18%|█▊ | 1551/8750 [2:32:15<11:31:42, 5.77s/it] {'loss': 0.4967, 'learning_rate': 1.888480523415153e-05, 'epoch': 0.18} + 18%|█▊ | 1551/8750 [2:32:08<11:31:43, 5.77s/it] 18%|█▊ | 1552/8750 [2:32:21<11:34:00, 5.79s/it] 18%|█▊ | 1552/8750 [2:32:14<11:34:00, 5.79s/it] {'loss': 0.4639, 'learning_rate': 1.8883105884020595e-05, 'epoch': 0.18} + 18%|█▊ | 1552/8750 [2:32:21<11:34:00, 5.79s/it] {'loss': 0.4639, 'learning_rate': 1.8883105884020595e-05, 'epoch': 0.18} + 18%|█▊ | 1552/8750 [2:32:14<11:34:00, 5.79s/it] 18%|█▊ | 1553/8750 [2:32:26<11:33:50, 5.78s/it] 18%|█▊ | 1553/8750 [2:32:20<11:33:49, 5.78s/it] {'loss': 0.4954, 'learning_rate': 1.8881405316705824e-05, 'epoch': 0.18} + 18%|█▊ | 1553/8750 [2:32:26<11:33:50, 5.78s/it] {'loss': 0.4954, 'learning_rate': 1.8881405316705824e-05, 'epoch': 0.18} + 18%|█▊ | 1553/8750 [2:32:20<11:33:49, 5.78s/it] 18%|█▊ | 1554/8750 [2:32:32<11:32:41, 5.78s/it] 18%|█▊ | 1554/8750 [2:32:26<11:32:41, 5.78s/it] {'loss': 0.4777, 'learning_rate': 1.887970353244024e-05, 'epoch': 0.18} + 18%|█▊ | 1554/8750 [2:32:32<11:32:41, 5.78s/it] {'loss': 0.4777, 'learning_rate': 1.887970353244024e-05, 'epoch': 0.18} + 18%|█▊ | 1554/8750 [2:32:26<11:32:41, 5.78s/it] 18%|█▊ | 1555/8750 [2:32:38<11:33:44, 5.79s/it] 18%|█▊ | 1555/8750 [2:32:31<11:33:45, 5.79s/it] {'loss': 0.4929, 'learning_rate': 1.887800053145702e-05, 'epoch': 0.18} + 18%|█▊ | 1555/8750 [2:32:38<11:33:44, 5.79s/it] {'loss': 0.4929, 'learning_rate': 1.887800053145702e-05, 'epoch': 0.18} + 18%|█▊ | 1555/8750 [2:32:31<11:33:45, 5.79s/it] 18%|█▊ | 1556/8750 [2:32:44<11:32:52, 5.78s/it] 
18%|█▊ | 1556/8750 [2:32:37<11:32:52, 5.78s/it] {'loss': 0.5071, 'learning_rate': 1.8876296313989516e-05, 'epoch': 0.18} + 18%|█▊ | 1556/8750 [2:32:44<11:32:52, 5.78s/it] {'loss': 0.5071, 'learning_rate': 1.8876296313989516e-05, 'epoch': 0.18} + 18%|█▊ | 1556/8750 [2:32:37<11:32:52, 5.78s/it] 18%|█▊ | 1557/8750 [2:32:50<11:42:43, 5.86s/it] 18%|█▊ | 1557/8750 [2:32:43<11:42:42, 5.86s/it] {'loss': 0.4704, 'learning_rate': 1.8874590880271245e-05, 'epoch': 0.18} + 18%|█▊ | 1557/8750 [2:32:50<11:42:43, 5.86s/it] {'loss': 0.4704, 'learning_rate': 1.8874590880271245e-05, 'epoch': 0.18} + 18%|█▊ | 1557/8750 [2:32:43<11:42:42, 5.86s/it] 18%|█▊ | 1558/8750 [2:32:55<11:37:20, 5.82s/it] 18%|█▊ | 1558/8750 [2:32:49<11:37:19, 5.82s/it] {'loss': 0.468, 'learning_rate': 1.8872884230535886e-05, 'epoch': 0.18} + 18%|█▊ | 1558/8750 [2:32:55<11:37:20, 5.82s/it] {'loss': 0.468, 'learning_rate': 1.8872884230535886e-05, 'epoch': 0.18} + 18%|█▊ | 1558/8750 [2:32:49<11:37:19, 5.82s/it] 18%|█▊ | 1559/8750 [2:33:01<11:33:28, 5.79s/it] 18%|█▊ | 1559/8750 [2:32:55<11:33:27, 5.79s/it] {'loss': 0.4986, 'learning_rate': 1.8871176365017293e-05, 'epoch': 0.18} + 18%|█▊ | 1559/8750 [2:33:01<11:33:28, 5.79s/it] {'loss': 0.4986, 'learning_rate': 1.8871176365017293e-05, 'epoch': 0.18} + 18%|█▊ | 1559/8750 [2:32:55<11:33:27, 5.79s/it] 18%|█▊ | 1560/8750 [2:33:07<11:39:26, 5.84s/it] 18%|█▊ | 1560/8750 [2:33:01<11:39:26, 5.84s/it] {'loss': 0.4718, 'learning_rate': 1.8869467283949475e-05, 'epoch': 0.18} + 18%|█▊ | 1560/8750 [2:33:07<11:39:26, 5.84s/it] {'loss': 0.4718, 'learning_rate': 1.8869467283949475e-05, 'epoch': 0.18} + 18%|█▊ | 1560/8750 [2:33:01<11:39:26, 5.84s/it] 18%|█▊ | 1561/8750 [2:33:13<11:42:35, 5.86s/it] 18%|█▊ | 1561/8750 [2:33:06<11:42:35, 5.86s/it] {'loss': 0.4777, 'learning_rate': 1.8867756987566615e-05, 'epoch': 0.18} + 18%|█▊ | 1561/8750 [2:33:13<11:42:35, 5.86s/it] {'loss': 0.4777, 'learning_rate': 1.8867756987566615e-05, 'epoch': 0.18} + 18%|█▊ | 1561/8750 [2:33:06<11:42:35, 
5.86s/it] 18%|█▊ | 1562/8750 [2:33:19<11:46:24, 5.90s/it] 18%|█▊ | 1562/8750 [2:33:12<11:46:24, 5.90s/it] {'loss': 0.492, 'learning_rate': 1.8866045476103073e-05, 'epoch': 0.18} + 18%|█▊ | 1562/8750 [2:33:19<11:46:24, 5.90s/it] {'loss': 0.492, 'learning_rate': 1.8866045476103073e-05, 'epoch': 0.18} + 18%|█▊ | 1562/8750 [2:33:12<11:46:24, 5.90s/it] 18%|█▊ | 1563/8750 [2:33:25<11:37:59, 5.83s/it] 18%|█▊ | 1563/8750 [2:33:18<11:37:58, 5.83s/it] {'loss': 0.5308, 'learning_rate': 1.886433274979335e-05, 'epoch': 0.18} + 18%|█▊ | 1563/8750 [2:33:25<11:37:59, 5.83s/it] {'loss': 0.5308, 'learning_rate': 1.886433274979335e-05, 'epoch': 0.18} + 18%|█▊ | 1563/8750 [2:33:18<11:37:58, 5.83s/it] 18%|█▊ | 1564/8750 [2:33:30<11:29:45, 5.76s/it] 18%|█▊ | 1564/8750 [2:33:24<11:29:48, 5.76s/it] {'loss': 0.4872, 'learning_rate': 1.8862618808872138e-05, 'epoch': 0.18} + 18%|█▊ | 1564/8750 [2:33:30<11:29:45, 5.76s/it] {'loss': 0.4872, 'learning_rate': 1.8862618808872138e-05, 'epoch': 0.18} + 18%|█▊ | 1564/8750 [2:33:24<11:29:48, 5.76s/it] 18%|█▊ | 1565/8750 [2:33:36<11:24:50, 5.72s/it] 18%|█▊ | 1565/8750 [2:33:29<11:24:49, 5.72s/it] {'loss': 0.4714, 'learning_rate': 1.8860903653574277e-05, 'epoch': 0.18} + 18%|█▊ | 1565/8750 [2:33:36<11:24:50, 5.72s/it] {'loss': 0.4714, 'learning_rate': 1.8860903653574277e-05, 'epoch': 0.18} + 18%|█▊ | 1565/8750 [2:33:29<11:24:49, 5.72s/it] 18%|█▊ | 1566/8750 [2:33:42<11:34:11, 5.80s/it] 18%|█▊ | 1566/8750 [2:33:35<11:34:10, 5.80s/it] {'loss': 0.4829, 'learning_rate': 1.8859187284134785e-05, 'epoch': 0.18} + 18%|█▊ | 1566/8750 [2:33:42<11:34:11, 5.80s/it] {'loss': 0.4829, 'learning_rate': 1.8859187284134785e-05, 'epoch': 0.18} + 18%|█▊ | 1566/8750 [2:33:35<11:34:10, 5.80s/it] 18%|█▊ | 1567/8750 [2:33:48<11:30:09, 5.76s/it] 18%|█▊ | 1567/8750 [2:33:41<11:30:09, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.8857469700788845e-05, 'epoch': 0.18} + 18%|█▊ | 1567/8750 [2:33:48<11:30:09, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.8857469700788845e-05, 
'epoch': 0.18} + 18%|█▊ | 1567/8750 [2:33:41<11:30:09, 5.76s/it] 18%|█▊ | 1568/8750 [2:33:53<11:30:43, 5.77s/it] 18%|█▊ | 1568/8750 [2:33:47<11:30:42, 5.77s/it] {'loss': 0.4854, 'learning_rate': 1.8855750903771805e-05, 'epoch': 0.18} + 18%|█▊ | 1568/8750 [2:33:53<11:30:43, 5.77s/it] {'loss': 0.4854, 'learning_rate': 1.8855750903771805e-05, 'epoch': 0.18} + 18%|█▊ | 1568/8750 [2:33:47<11:30:42, 5.77s/it] 18%|█▊ | 1569/8750 [2:33:59<11:24:37, 5.72s/it] 18%|█▊ | 1569/8750 [2:33:52<11:24:37, 5.72s/it] {'loss': 0.4738, 'learning_rate': 1.8854030893319173e-05, 'epoch': 0.18} + 18%|█▊ | 1569/8750 [2:33:59<11:24:37, 5.72s/it] {'loss': 0.4738, 'learning_rate': 1.8854030893319173e-05, 'epoch': 0.18} + 18%|█▊ | 1569/8750 [2:33:52<11:24:37, 5.72s/it] 18%|█▊ | 1570/8750 [2:34:05<11:20:21, 5.69s/it] 18%|█▊ | 1570/8750 [2:33:58<11:20:21, 5.69s/it] {'loss': 0.4797, 'learning_rate': 1.8852309669666634e-05, 'epoch': 0.18} + 18%|█▊ | 1570/8750 [2:34:05<11:20:21, 5.69s/it] {'loss': 0.4797, 'learning_rate': 1.8852309669666634e-05, 'epoch': 0.18} + 18%|█▊ | 1570/8750 [2:33:58<11:20:21, 5.69s/it] 18%|█▊ | 1571/8750 [2:34:10<11:27:25, 5.75s/it] 18%|█▊ | 1571/8750 [2:34:04<11:27:24, 5.75s/it] {'loss': 0.4985, 'learning_rate': 1.885058723305003e-05, 'epoch': 0.18} + 18%|█▊ | 1571/8750 [2:34:10<11:27:25, 5.75s/it] {'loss': 0.4985, 'learning_rate': 1.885058723305003e-05, 'epoch': 0.18} + 18%|█▊ | 1571/8750 [2:34:04<11:27:24, 5.75s/it] 18%|█▊ | 1572/8750 [2:34:16<11:27:24, 5.75s/it] 18%|█▊ | 1572/8750 [2:34:10<11:27:24, 5.75s/it] {'loss': 0.4893, 'learning_rate': 1.8848863583705373e-05, 'epoch': 0.18} + 18%|█▊ | 1572/8750 [2:34:16<11:27:24, 5.75s/it] {'loss': 0.4893, 'learning_rate': 1.8848863583705373e-05, 'epoch': 0.18} + 18%|█▊ | 1572/8750 [2:34:10<11:27:24, 5.75s/it] 18%|█▊ | 1573/8750 [2:34:22<11:23:13, 5.71s/it] 18%|█▊ | 1573/8750 [2:34:15<11:23:14, 5.71s/it] {'loss': 0.5035, 'learning_rate': 1.884713872186885e-05, 'epoch': 0.18} + 18%|█▊ | 1573/8750 [2:34:22<11:23:13, 5.71s/it] {'loss': 
0.5035, 'learning_rate': 1.884713872186885e-05, 'epoch': 0.18} + 18%|█▊ | 1573/8750 [2:34:15<11:23:14, 5.71s/it] 18%|█▊ | 1574/8750 [2:34:27<11:18:39, 5.67s/it] 18%|█▊ | 1574/8750 [2:34:21<11:18:38, 5.67s/it] {'loss': 0.4932, 'learning_rate': 1.8845412647776795e-05, 'epoch': 0.18} + 18%|█▊ | 1574/8750 [2:34:27<11:18:39, 5.67s/it] {'loss': 0.4932, 'learning_rate': 1.8845412647776795e-05, 'epoch': 0.18} + 18%|█▊ | 1574/8750 [2:34:21<11:18:38, 5.67s/it] 18%|█▊ | 1575/8750 [2:34:33<11:21:34, 5.70s/it] 18%|█▊ | 1575/8750 [2:34:27<11:21:34, 5.70s/it] {'loss': 0.4753, 'learning_rate': 1.8843685361665724e-05, 'epoch': 0.18} + 18%|█▊ | 1575/8750 [2:34:33<11:21:34, 5.70s/it] {'loss': 0.4753, 'learning_rate': 1.8843685361665724e-05, 'epoch': 0.18} + 18%|█▊ | 1575/8750 [2:34:27<11:21:34, 5.70s/it] 18%|█▊ | 1576/8750 [2:34:39<11:24:24, 5.72s/it] 18%|█▊ | 1576/8750 [2:34:32<11:24:24, 5.72s/it] {'loss': 0.4796, 'learning_rate': 1.8841956863772314e-05, 'epoch': 0.18} + 18%|█▊ | 1576/8750 [2:34:39<11:24:24, 5.72s/it] {'loss': 0.4796, 'learning_rate': 1.8841956863772314e-05, 'epoch': 0.18} + 18%|█▊ | 1576/8750 [2:34:32<11:24:24, 5.72s/it] 18%|█▊ | 1577/8750 [2:34:45<11:21:00, 5.70s/it] 18%|█▊ | 1577/8750 [2:34:38<11:21:00, 5.70s/it] {'loss': 0.4888, 'learning_rate': 1.8840227154333405e-05, 'epoch': 0.18} + 18%|█▊ | 1577/8750 [2:34:45<11:21:00, 5.70s/it] {'loss': 0.4888, 'learning_rate': 1.8840227154333405e-05, 'epoch': 0.18} + 18%|█▊ | 1577/8750 [2:34:38<11:21:00, 5.70s/it] 18%|█▊ | 1578/8750 [2:34:50<11:18:51, 5.68s/it] 18%|█▊ | 1578/8750 [2:34:44<11:18:51, 5.68s/it] {'loss': 0.4732, 'learning_rate': 1.883849623358601e-05, 'epoch': 0.18} + 18%|█▊ | 1578/8750 [2:34:50<11:18:51, 5.68s/it] {'loss': 0.4732, 'learning_rate': 1.883849623358601e-05, 'epoch': 0.18} + 18%|█▊ | 1578/8750 [2:34:44<11:18:51, 5.68s/it] 18%|█▊ | 1579/8750 [2:34:56<11:12:53, 5.63s/it] 18%|█▊ | 1579/8750 [2:34:49<11:12:53, 5.63s/it] {'loss': 0.4919, 'learning_rate': 1.88367641017673e-05, 'epoch': 0.18} + 18%|█▊ | 
1579/8750 [2:34:56<11:12:53, 5.63s/it] {'loss': 0.4919, 'learning_rate': 1.88367641017673e-05, 'epoch': 0.18} + 18%|█▊ | 1579/8750 [2:34:49<11:12:53, 5.63s/it] 18%|█▊ | 1580/8750 [2:35:01<11:14:01, 5.64s/it] 18%|█▊ | 1580/8750 [2:34:55<11:14:00, 5.64s/it] {'loss': 0.4883, 'learning_rate': 1.8835030759114617e-05, 'epoch': 0.18} + 18%|█▊ | 1580/8750 [2:35:01<11:14:01, 5.64s/it] {'loss': 0.4883, 'learning_rate': 1.8835030759114617e-05, 'epoch': 0.18} + 18%|█▊ | 1580/8750 [2:34:55<11:14:00, 5.64s/it] 18%|█▊ | 1581/8750 [2:35:07<11:21:54, 5.71s/it] 18%|█▊ | 1581/8750 [2:35:01<11:21:55, 5.71s/it] {'loss': 0.4923, 'learning_rate': 1.8833296205865466e-05, 'epoch': 0.18} + 18%|█▊ | 1581/8750 [2:35:07<11:21:54, 5.71s/it] {'loss': 0.4923, 'learning_rate': 1.8833296205865466e-05, 'epoch': 0.18} + 18%|█▊ | 1581/8750 [2:35:01<11:21:55, 5.71s/it] 18%|█▊ | 1582/8750 [2:35:13<11:18:11, 5.68s/it] 18%|█▊ | 1582/8750 [2:35:06<11:18:11, 5.68s/it] {'loss': 0.477, 'learning_rate': 1.8831560442257523e-05, 'epoch': 0.18} + 18%|█▊ | 1582/8750 [2:35:13<11:18:11, 5.68s/it] {'loss': 0.477, 'learning_rate': 1.8831560442257523e-05, 'epoch': 0.18} + 18%|█▊ | 1582/8750 [2:35:06<11:18:11, 5.68s/it] 18%|█▊ | 1583/8750 [2:35:19<11:33:09, 5.80s/it] 18%|█▊ | 1583/8750 [2:35:12<11:33:09, 5.80s/it] {'loss': 0.4749, 'learning_rate': 1.8829823468528624e-05, 'epoch': 0.18} + 18%|█▊ | 1583/8750 [2:35:19<11:33:09, 5.80s/it] {'loss': 0.4749, 'learning_rate': 1.8829823468528624e-05, 'epoch': 0.18} + 18%|█▊ | 1583/8750 [2:35:12<11:33:09, 5.80s/it] 18%|█▊ | 1584/8750 [2:35:25<11:47:24, 5.92s/it] 18%|█▊ | 1584/8750 [2:35:19<11:47:24, 5.92s/it] {'loss': 0.4813, 'learning_rate': 1.8828085284916777e-05, 'epoch': 0.18} + 18%|█▊ | 1584/8750 [2:35:25<11:47:24, 5.92s/it] {'loss': 0.4813, 'learning_rate': 1.8828085284916777e-05, 'epoch': 0.18} + 18%|█▊ | 1584/8750 [2:35:19<11:47:24, 5.92s/it] 18%|█▊ | 1585/8750 [2:35:31<11:34:32, 5.82s/it] 18%|█▊ | 1585/8750 [2:35:24<11:34:32, 5.82s/it] {'loss': 0.5162, 'learning_rate': 
1.882634589166014e-05, 'epoch': 0.18} + 18%|█▊ | 1585/8750 [2:35:31<11:34:32, 5.82s/it] {'loss': 0.5162, 'learning_rate': 1.882634589166014e-05, 'epoch': 0.18} + 18%|█▊ | 1585/8750 [2:35:24<11:34:32, 5.82s/it] 18%|█▊ | 1586/8750 [2:35:37<11:39:29, 5.86s/it] 18%|█▊ | 1586/8750 [2:35:30<11:39:29, 5.86s/it] {'loss': 0.4578, 'learning_rate': 1.8824605288997064e-05, 'epoch': 0.18} + 18%|█▊ | 1586/8750 [2:35:37<11:39:29, 5.86s/it] {'loss': 0.4578, 'learning_rate': 1.8824605288997064e-05, 'epoch': 0.18} + 18%|█▊ | 1586/8750 [2:35:30<11:39:29, 5.86s/it] 18%|█▊ | 1587/8750 [2:35:42<11:35:11, 5.82s/it] 18%|█▊ | 1587/8750 [2:35:36<11:35:11, 5.82s/it] {'loss': 0.5004, 'learning_rate': 1.882286347716604e-05, 'epoch': 0.18}{'loss': 0.5004, 'learning_rate': 1.882286347716604e-05, 'epoch': 0.18} + + 18%|█▊ | 1587/8750 [2:35:42<11:35:11, 5.82s/it] 18%|█▊ | 1587/8750 [2:35:36<11:35:11, 5.82s/it] 18%|█▊ | 1588/8750 [2:35:48<11:30:08, 5.78s/it] 18%|█▊ | 1588/8750 [2:35:42<11:30:09, 5.78s/it] {'loss': 0.4933, 'learning_rate': 1.8821120456405743e-05, 'epoch': 0.18} + 18%|█▊ | 1588/8750 [2:35:48<11:30:08, 5.78s/it] {'loss': 0.4933, 'learning_rate': 1.8821120456405743e-05, 'epoch': 0.18} + 18%|█▊ | 1588/8750 [2:35:42<11:30:09, 5.78s/it] 18%|█▊ | 1589/8750 [2:35:54<11:28:17, 5.77s/it] 18%|█▊ | 1589/8750 [2:35:47<11:28:16, 5.77s/it] {'loss': 0.5036, 'learning_rate': 1.8819376226955e-05, 'epoch': 0.18} + 18%|█▊ | 1589/8750 [2:35:54<11:28:17, 5.77s/it] {'loss': 0.5036, 'learning_rate': 1.8819376226955e-05, 'epoch': 0.18} + 18%|█▊ | 1589/8750 [2:35:47<11:28:16, 5.77s/it] 18%|█▊ | 1590/8750 [2:36:00<11:35:41, 5.83s/it] 18%|█▊ | 1590/8750 [2:35:53<11:35:42, 5.83s/it] {'loss': 0.4774, 'learning_rate': 1.8817630789052813e-05, 'epoch': 0.18} + 18%|█▊ | 1590/8750 [2:36:00<11:35:41, 5.83s/it] {'loss': 0.4774, 'learning_rate': 1.8817630789052813e-05, 'epoch': 0.18} + 18%|█▊ | 1590/8750 [2:35:53<11:35:42, 5.83s/it] 18%|█▊ | 1591/8750 [2:36:05<11:25:46, 5.75s/it] 18%|█▊ | 1591/8750 [2:35:59<11:25:47, 
5.75s/it] {'loss': 0.4817, 'learning_rate': 1.881588414293834e-05, 'epoch': 0.18} + 18%|█▊ | 1591/8750 [2:36:05<11:25:46, 5.75s/it] {'loss': 0.4817, 'learning_rate': 1.881588414293834e-05, 'epoch': 0.18} + 18%|█▊ | 1591/8750 [2:35:59<11:25:47, 5.75s/it] 18%|█▊ | 1592/8750 [2:36:11<11:36:52, 5.84s/it] 18%|█▊ | 1592/8750 [2:36:05<11:36:52, 5.84s/it] {'loss': 0.4606, 'learning_rate': 1.881413628885092e-05, 'epoch': 0.18} + 18%|█▊ | 1592/8750 [2:36:11<11:36:52, 5.84s/it] {'loss': 0.4606, 'learning_rate': 1.881413628885092e-05, 'epoch': 0.18} + 18%|█▊ | 1592/8750 [2:36:05<11:36:52, 5.84s/it] 18%|█▊ | 1593/8750 [2:36:11<11:30:21, 5.79s/it] 18%|█▊ | 1593/8750 [2:36:17<11:30:22, 5.79s/it] {'loss': 0.4866, 'learning_rate': 1.8812387227030035e-05, 'epoch': 0.18} + 18%|█▊ | 1593/8750 [2:36:17<11:30:22, 5.79s/it] {'loss': 0.4866, 'learning_rate': 1.8812387227030035e-05, 'epoch': 0.18} + 18%|█▊ | 1593/8750 [2:36:11<11:30:21, 5.79s/it] 18%|█▊ | 1594/8750 [2:36:23<11:33:27, 5.81s/it] 18%|█▊ | 1594/8750 [2:36:16<11:33:27, 5.81s/it] {'loss': 0.4622, 'learning_rate': 1.8810636957715357e-05, 'epoch': 0.18} + 18%|█▊ | 1594/8750 [2:36:23<11:33:27, 5.81s/it] {'loss': 0.4622, 'learning_rate': 1.8810636957715357e-05, 'epoch': 0.18} + 18%|█▊ | 1594/8750 [2:36:16<11:33:27, 5.81s/it] 18%|█▊ | 1595/8750 [2:36:22<11:27:44, 5.77s/it] 18%|█▊ | 1595/8750 [2:36:29<11:27:45, 5.77s/it] {'loss': 0.4884, 'learning_rate': 1.880888548114671e-05, 'epoch': 0.18} + 18%|█▊ | 1595/8750 [2:36:29<11:27:45, 5.77s/it] {'loss': 0.4884, 'learning_rate': 1.880888548114671e-05, 'epoch': 0.18} + 18%|█▊ | 1595/8750 [2:36:22<11:27:44, 5.77s/it] 18%|█▊ | 1596/8750 [2:36:34<11:19:13, 5.70s/it] 18%|█▊ | 1596/8750 [2:36:28<11:19:13, 5.70s/it] {'loss': 0.479, 'learning_rate': 1.880713279756408e-05, 'epoch': 0.18} + 18%|█▊ | 1596/8750 [2:36:34<11:19:13, 5.70s/it] {'loss': 0.479, 'learning_rate': 1.880713279756408e-05, 'epoch': 0.18} + 18%|█▊ | 1596/8750 [2:36:28<11:19:13, 5.70s/it] 18%|█▊ | 1597/8750 [2:36:40<11:14:16, 
5.66s/it] 18%|█▊ | 1597/8750 [2:36:33<11:14:16, 5.66s/it] {'loss': 0.4996, 'learning_rate': 1.880537890720763e-05, 'epoch': 0.18} + 18%|█▊ | 1597/8750 [2:36:40<11:14:16, 5.66s/it] {'loss': 0.4996, 'learning_rate': 1.880537890720763e-05, 'epoch': 0.18} + 18%|█▊ | 1597/8750 [2:36:33<11:14:16, 5.66s/it] 18%|█▊ | 1598/8750 [2:36:45<11:11:17, 5.63s/it] 18%|█▊ | 1598/8750 [2:36:39<11:11:17, 5.63s/it] {'loss': 0.4693, 'learning_rate': 1.8803623810317678e-05, 'epoch': 0.18} + 18%|█▊ | 1598/8750 [2:36:45<11:11:17, 5.63s/it] {'loss': 0.4693, 'learning_rate': 1.8803623810317678e-05, 'epoch': 0.18} + 18%|█▊ | 1598/8750 [2:36:39<11:11:17, 5.63s/it] 18%|█▊ | 1599/8750 [2:36:51<11:09:21, 5.62s/it] 18%|█▊ | 1599/8750 [2:36:44<11:09:21, 5.62s/it] {'loss': 0.4771, 'learning_rate': 1.8801867507134712e-05, 'epoch': 0.18} + 18%|█▊ | 1599/8750 [2:36:51<11:09:21, 5.62s/it] {'loss': 0.4771, 'learning_rate': 1.8801867507134712e-05, 'epoch': 0.18} + 18%|█▊ | 1599/8750 [2:36:44<11:09:21, 5.62s/it]4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +902 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 18%|█▊ | 1600/8750 [2:36:56<11:04:46, 5.58s/it]147 AutoResumeHook: Checking whether to suspend... 18%|█▊ | 1600/8750 [2:36:50<11:04:46, 5.58s/it]15 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4914, 'learning_rate': 1.8800109997899386e-05, 'epoch': 0.18} + 18%|█▊ | 1600/8750 [2:36:56<11:04:46, 5.58s/it] {'loss': 0.4914, 'learning_rate': 1.8800109997899386e-05, 'epoch': 0.18} + 18%|█▊ | 1600/8750 [2:36:50<11:04:46, 5.58s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 18%|█▊ | 1601/8750 [2:37:16<19:31:42, 9.83s/it] 18%|█▊ | 1601/8750 [2:37:10<19:31:43, 9.83s/it] {'loss': 0.4782, 'learning_rate': 1.879835128285252e-05, 'epoch': 0.18} + 18%|█▊ | 1601/8750 [2:37:16<19:31:42, 9.83s/it] {'loss': 0.4782, 'learning_rate': 1.879835128285252e-05, 'epoch': 0.18} + 18%|█▊ | 1601/8750 [2:37:10<19:31:43, 9.83s/it] 18%|█▊ | 1602/8750 [2:37:22<17:07:34, 8.63s/it] 18%|█▊ | 1602/8750 [2:37:15<17:07:34, 8.63s/it] {'loss': 0.4953, 'learning_rate': 1.879659136223509e-05, 'epoch': 0.18} + 18%|█▊ | 1602/8750 [2:37:22<17:07:34, 8.63s/it] {'loss': 0.4953, 'learning_rate': 1.879659136223509e-05, 'epoch': 0.18} + 18%|█▊ | 1602/8750 [2:37:15<17:07:34, 8.63s/it] 18%|█▊ | 1603/8750 [2:37:27<15:18:23, 7.71s/it] 18%|█▊ | 1603/8750 [2:37:21<15:18:23, 7.71s/it] {'loss': 0.4904, 'learning_rate': 1.8794830236288254e-05, 'epoch': 0.18} + 18%|█▊ | 1603/8750 [2:37:27<15:18:23, 7.71s/it] {'loss': 0.4904, 'learning_rate': 1.8794830236288254e-05, 'epoch': 0.18} + 18%|█▊ | 1603/8750 [2:37:21<15:18:23, 7.71s/it] 18%|█▊ | 1604/8750 [2:37:33<14:07:22, 7.11s/it] 18%|█▊ | 1604/8750 [2:37:27<14:07:22, 7.11s/it] {'loss': 0.4721, 'learning_rate': 1.8793067905253318e-05, 'epoch': 0.18} + 18%|█▊ | 1604/8750 [2:37:33<14:07:22, 7.11s/it] {'loss': 0.4721, 'learning_rate': 1.8793067905253318e-05, 'epoch': 0.18} + 18%|█▊ | 1604/8750 [2:37:27<14:07:22, 7.11s/it] 18%|█▊ | 1605/8750 [2:37:39<13:10:50, 6.64s/it] 18%|█▊ | 1605/8750 [2:37:32<13:10:50, 6.64s/it] {'loss': 0.5035, 'learning_rate': 1.8791304369371765e-05, 'epoch': 0.18} + 18%|█▊ | 1605/8750 [2:37:39<13:10:50, 6.64s/it] {'loss': 0.5035, 'learning_rate': 1.8791304369371765e-05, 'epoch': 0.18} + 18%|█▊ | 1605/8750 [2:37:32<13:10:50, 6.64s/it] 18%|█▊ | 1606/8750 [2:37:44<12:34:07, 6.33s/it] 18%|█▊ | 1606/8750 [2:37:38<12:34:08, 6.33s/it] {'loss': 0.4752, 'learning_rate': 1.8789539628885233e-05, 'epoch': 0.18} + 18%|█▊ | 1606/8750 [2:37:44<12:34:07, 6.33s/it] {'loss': 0.4752, 'learning_rate': 
1.8789539628885233e-05, 'epoch': 0.18} + 18%|█▊ | 1606/8750 [2:37:38<12:34:08, 6.33s/it] 18%|█▊ | 1607/8750 [2:37:50<12:10:32, 6.14s/it] 18%|█▊ | 1607/8750 [2:37:44<12:10:32, 6.14s/it] {'loss': 0.4852, 'learning_rate': 1.878777368403554e-05, 'epoch': 0.18} + 18%|█▊ | 1607/8750 [2:37:50<12:10:32, 6.14s/it] {'loss': 0.4852, 'learning_rate': 1.878777368403554e-05, 'epoch': 0.18} + 18%|█▊ | 1607/8750 [2:37:44<12:10:32, 6.14s/it] 18%|█▊ | 1608/8750 [2:37:49<11:51:20, 5.98s/it] 18%|█▊ | 1608/8750 [2:37:56<11:51:20, 5.98s/it] {'loss': 0.496, 'learning_rate': 1.8786006535064654e-05, 'epoch': 0.18} + 18%|█▊ | 1608/8750 [2:37:56<11:51:20, 5.98s/it] {'loss': 0.496, 'learning_rate': 1.8786006535064654e-05, 'epoch': 0.18} + 18%|█▊ | 1608/8750 [2:37:49<11:51:20, 5.98s/it] 18%|█▊ | 1609/8750 [2:37:55<11:47:02, 5.94s/it] 18%|█▊ | 1609/8750 [2:38:01<11:47:03, 5.94s/it] {'loss': 0.4785, 'learning_rate': 1.8784238182214713e-05, 'epoch': 0.18} + 18%|█▊ | 1609/8750 [2:38:01<11:47:03, 5.94s/it] {'loss': 0.4785, 'learning_rate': 1.8784238182214713e-05, 'epoch': 0.18} + 18%|█▊ | 1609/8750 [2:37:55<11:47:02, 5.94s/it] 18%|█▊ | 1610/8750 [2:38:07<11:47:55, 5.95s/it] 18%|█▊ | 1610/8750 [2:38:01<11:47:56, 5.95s/it] {'loss': 0.4813, 'learning_rate': 1.8782468625728027e-05, 'epoch': 0.18} + 18%|█▊ | 1610/8750 [2:38:07<11:47:55, 5.95s/it] {'loss': 0.4813, 'learning_rate': 1.8782468625728027e-05, 'epoch': 0.18} + 18%|█▊ | 1610/8750 [2:38:01<11:47:56, 5.95s/it] 18%|█▊ | 1611/8750 [2:38:13<11:37:31, 5.86s/it] 18%|█▊ | 1611/8750 [2:38:07<11:37:30, 5.86s/it] {'loss': 0.5088, 'learning_rate': 1.8780697865847056e-05, 'epoch': 0.18} + 18%|█▊ | 1611/8750 [2:38:13<11:37:31, 5.86s/it] {'loss': 0.5088, 'learning_rate': 1.8780697865847056e-05, 'epoch': 0.18} + 18%|█▊ | 1611/8750 [2:38:07<11:37:30, 5.86s/it] 18%|█▊ | 1612/8750 [2:38:19<11:30:27, 5.80s/it] 18%|█▊ | 1612/8750 [2:38:12<11:30:27, 5.80s/it] {'loss': 0.4805, 'learning_rate': 1.877892590281444e-05, 'epoch': 0.18} + 18%|█▊ | 1612/8750 
[2:38:19<11:30:27, 5.80s/it] {'loss': 0.4805, 'learning_rate': 1.877892590281444e-05, 'epoch': 0.18} + 18%|█▊ | 1612/8750 [2:38:12<11:30:27, 5.80s/it] 18%|█▊ | 1613/8750 [2:38:25<11:35:54, 5.85s/it] 18%|█▊ | 1613/8750 [2:38:18<11:35:56, 5.85s/it] {'loss': 0.5001, 'learning_rate': 1.877715273687297e-05, 'epoch': 0.18} + 18%|█▊ | 1613/8750 [2:38:25<11:35:54, 5.85s/it] {'loss': 0.5001, 'learning_rate': 1.877715273687297e-05, 'epoch': 0.18} + 18%|█▊ | 1613/8750 [2:38:18<11:35:56, 5.85s/it] 18%|█▊ | 1614/8750 [2:38:30<11:27:26, 5.78s/it] 18%|█▊ | 1614/8750 [2:38:24<11:27:25, 5.78s/it] {'loss': 0.4999, 'learning_rate': 1.8775378368265622e-05, 'epoch': 0.18} + 18%|█▊ | 1614/8750 [2:38:30<11:27:26, 5.78s/it] {'loss': 0.4999, 'learning_rate': 1.8775378368265622e-05, 'epoch': 0.18} + 18%|█▊ | 1614/8750 [2:38:24<11:27:25, 5.78s/it] 18%|█▊ | 1615/8750 [2:38:36<11:24:46, 5.76s/it] 18%|█▊ | 1615/8750 [2:38:30<11:24:46, 5.76s/it] {'loss': 0.4834, 'learning_rate': 1.8773602797235516e-05, 'epoch': 0.18} + 18%|█▊ | 1615/8750 [2:38:36<11:24:46, 5.76s/it] {'loss': 0.4834, 'learning_rate': 1.8773602797235516e-05, 'epoch': 0.18} + 18%|█▊ | 1615/8750 [2:38:30<11:24:46, 5.76s/it] 18%|█▊ | 1616/8750 [2:38:42<11:25:18, 5.76s/it] 18%|█▊ | 1616/8750 [2:38:35<11:25:18, 5.76s/it] {'loss': 0.4797, 'learning_rate': 1.8771826024025944e-05, 'epoch': 0.18} + 18%|█▊ | 1616/8750 [2:38:42<11:25:18, 5.76s/it] {'loss': 0.4797, 'learning_rate': 1.8771826024025944e-05, 'epoch': 0.18} + 18%|█▊ | 1616/8750 [2:38:35<11:25:18, 5.76s/it] 18%|█▊ | 1617/8750 [2:38:47<11:19:55, 5.72s/it] 18%|█▊ | 1617/8750 [2:38:41<11:19:54, 5.72s/it] {'loss': 0.488, 'learning_rate': 1.8770048048880367e-05, 'epoch': 0.18} + 18%|█▊ | 1617/8750 [2:38:47<11:19:55, 5.72s/it] {'loss': 0.488, 'learning_rate': 1.8770048048880367e-05, 'epoch': 0.18} + 18%|█▊ | 1617/8750 [2:38:41<11:19:54, 5.72s/it] 18%|█▊ | 1618/8750 [2:38:53<11:29:59, 5.80s/it] 18%|█▊ | 1618/8750 [2:38:47<11:29:59, 5.80s/it] {'loss': 0.4786, 'learning_rate': 
1.8768268872042402e-05, 'epoch': 0.18} + 18%|█▊ | 1618/8750 [2:38:53<11:29:59, 5.80s/it] {'loss': 0.4786, 'learning_rate': 1.8768268872042402e-05, 'epoch': 0.18} + 18%|█▊ | 1618/8750 [2:38:47<11:29:59, 5.80s/it] 19%|█▊ | 1619/8750 [2:38:59<11:24:36, 5.76s/it] 19%|█▊ | 1619/8750 [2:38:53<11:24:36, 5.76s/it] {'loss': 0.4776, 'learning_rate': 1.8766488493755845e-05, 'epoch': 0.19} + 19%|█▊ | 1619/8750 [2:38:59<11:24:36, 5.76s/it] {'loss': 0.4776, 'learning_rate': 1.8766488493755845e-05, 'epoch': 0.19} + 19%|█▊ | 1619/8750 [2:38:53<11:24:36, 5.76s/it] 19%|█▊ | 1620/8750 [2:39:05<11:24:04, 5.76s/it] 19%|█▊ | 1620/8750 [2:38:58<11:24:04, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.8764706914264636e-05, 'epoch': 0.19} + 19%|█▊ | 1620/8750 [2:39:05<11:24:04, 5.76s/it] {'loss': 0.4899, 'learning_rate': 1.8764706914264636e-05, 'epoch': 0.19} + 19%|█▊ | 1620/8750 [2:38:58<11:24:04, 5.76s/it] 19%|█▊ | 1621/8750 [2:39:04<11:18:18, 5.71s/it] 19%|█▊ | 1621/8750 [2:39:10<11:18:20, 5.71s/it] {'loss': 0.4933, 'learning_rate': 1.8762924133812905e-05, 'epoch': 0.19} + 19%|█▊ | 1621/8750 [2:39:10<11:18:20, 5.71s/it] {'loss': 0.4933, 'learning_rate': 1.8762924133812905e-05, 'epoch': 0.19} + 19%|█▊ | 1621/8750 [2:39:04<11:18:18, 5.71s/it] 19%|█▊ | 1622/8750 [2:39:10<11:14:55, 5.68s/it] 19%|█▊ | 1622/8750 [2:39:16<11:14:57, 5.68s/it] {'loss': 0.4851, 'learning_rate': 1.876114015264492e-05, 'epoch': 0.19} + 19%|█▊ | 1622/8750 [2:39:16<11:14:57, 5.68s/it] {'loss': 0.4851, 'learning_rate': 1.876114015264492e-05, 'epoch': 0.19} + 19%|█▊ | 1622/8750 [2:39:10<11:14:55, 5.68s/it] 19%|█▊ | 1623/8750 [2:39:15<11:13:08, 5.67s/it] 19%|█▊ | 1623/8750 [2:39:22<11:13:08, 5.67s/it] {'loss': 0.4766, 'learning_rate': 1.8759354971005133e-05, 'epoch': 0.19} + 19%|█▊ | 1623/8750 [2:39:22<11:13:08, 5.67s/it] {'loss': 0.4766, 'learning_rate': 1.8759354971005133e-05, 'epoch': 0.19} + 19%|█▊ | 1623/8750 [2:39:15<11:13:08, 5.67s/it] 19%|█▊ | 1624/8750 [2:39:21<11:15:38, 5.69s/it] 19%|█▊ | 1624/8750 
[2:39:27<11:15:38, 5.69s/it] {'loss': 0.5006, 'learning_rate': 1.875756858913815e-05, 'epoch': 0.19} + 19%|█▊ | 1624/8750 [2:39:27<11:15:38, 5.69s/it] {'loss': 0.5006, 'learning_rate': 1.875756858913815e-05, 'epoch': 0.19} + 19%|█▊ | 1624/8750 [2:39:21<11:15:38, 5.69s/it] 19%|█▊ | 1625/8750 [2:39:33<11:11:47, 5.66s/it] 19%|█▊ | 1625/8750 [2:39:27<11:11:48, 5.66s/it] {'loss': 0.4958, 'learning_rate': 1.875578100728875e-05, 'epoch': 0.19} + 19%|█▊ | 1625/8750 [2:39:33<11:11:47, 5.66s/it] {'loss': 0.4958, 'learning_rate': 1.875578100728875e-05, 'epoch': 0.19} + 19%|█▊ | 1625/8750 [2:39:27<11:11:48, 5.66s/it] 19%|█▊ | 1626/8750 [2:39:32<11:15:55, 5.69s/it] 19%|█▊ | 1626/8750 [2:39:39<11:15:56, 5.69s/it] {'loss': 0.482, 'learning_rate': 1.8753992225701868e-05, 'epoch': 0.19} + 19%|█▊ | 1626/8750 [2:39:39<11:15:56, 5.69s/it] {'loss': 0.482, 'learning_rate': 1.8753992225701868e-05, 'epoch': 0.19} + 19%|█▊ | 1626/8750 [2:39:32<11:15:55, 5.69s/it] 19%|█▊ | 1627/8750 [2:39:45<11:17:49, 5.71s/it] 19%|█▊ | 1627/8750 [2:39:38<11:17:50, 5.71s/it] {'loss': 0.4894, 'learning_rate': 1.875220224462261e-05, 'epoch': 0.19} + 19%|█▊ | 1627/8750 [2:39:45<11:17:49, 5.71s/it] {'loss': 0.4894, 'learning_rate': 1.875220224462261e-05, 'epoch': 0.19} + 19%|█▊ | 1627/8750 [2:39:38<11:17:50, 5.71s/it] 19%|█▊ | 1628/8750 [2:39:44<11:13:29, 5.67s/it] 19%|█▊ | 1628/8750 [2:39:50<11:13:30, 5.67s/it] {'loss': 0.474, 'learning_rate': 1.8750411064296237e-05, 'epoch': 0.19} + 19%|█▊ | 1628/8750 [2:39:50<11:13:30, 5.67s/it] {'loss': 0.474, 'learning_rate': 1.8750411064296237e-05, 'epoch': 0.19} + 19%|█▊ | 1628/8750 [2:39:44<11:13:29, 5.67s/it] 19%|█▊ | 1629/8750 [2:39:56<11:11:56, 5.66s/it] 19%|█▊ | 1629/8750 [2:39:49<11:11:57, 5.66s/it] {'loss': 0.507, 'learning_rate': 1.8748618684968187e-05, 'epoch': 0.19} + 19%|█▊ | 1629/8750 [2:39:56<11:11:56, 5.66s/it] {'loss': 0.507, 'learning_rate': 1.8748618684968187e-05, 'epoch': 0.19} + 19%|█▊ | 1629/8750 [2:39:49<11:11:57, 5.66s/it] 19%|█▊ | 1630/8750 
[2:40:01<11:12:51, 5.67s/it] 19%|█▊ | 1630/8750 [2:39:55<11:12:51, 5.67s/it] {'loss': 0.4657, 'learning_rate': 1.8746825106884055e-05, 'epoch': 0.19} + 19%|█▊ | 1630/8750 [2:40:01<11:12:51, 5.67s/it] {'loss': 0.4657, 'learning_rate': 1.8746825106884055e-05, 'epoch': 0.19} + 19%|█▊ | 1630/8750 [2:39:55<11:12:51, 5.67s/it] 19%|█▊ | 1631/8750 [2:40:01<11:12:39, 5.67s/it] 19%|█▊ | 1631/8750 [2:40:07<11:12:40, 5.67s/it] {'loss': 0.484, 'learning_rate': 1.87450303302896e-05, 'epoch': 0.19} + 19%|█▊ | 1631/8750 [2:40:07<11:12:40, 5.67s/it] {'loss': 0.484, 'learning_rate': 1.87450303302896e-05, 'epoch': 0.19} + 19%|█▊ | 1631/8750 [2:40:01<11:12:39, 5.67s/it] 19%|█▊ | 1632/8750 [2:40:06<11:17:08, 5.71s/it] 19%|█▊ | 1632/8750 [2:40:13<11:17:08, 5.71s/it] {'loss': 0.479, 'learning_rate': 1.8743234355430746e-05, 'epoch': 0.19} + 19%|█▊ | 1632/8750 [2:40:13<11:17:08, 5.71s/it] {'loss': 0.479, 'learning_rate': 1.8743234355430746e-05, 'epoch': 0.19} + 19%|█▊ | 1632/8750 [2:40:06<11:17:08, 5.71s/it] 19%|█▊ | 1633/8750 [2:40:12<11:24:31, 5.77s/it] 19%|█▊ | 1633/8750 [2:40:19<11:24:31, 5.77s/it] {'loss': 0.4876, 'learning_rate': 1.8741437182553582e-05, 'epoch': 0.19} + 19%|█▊ | 1633/8750 [2:40:19<11:24:31, 5.77s/it] {'loss': 0.4876, 'learning_rate': 1.8741437182553582e-05, 'epoch': 0.19} + 19%|█▊ | 1633/8750 [2:40:12<11:24:31, 5.77s/it] 19%|█▊ | 1634/8750 [2:40:18<11:20:24, 5.74s/it] 19%|█▊ | 1634/8750 [2:40:25<11:20:24, 5.74s/it] {'loss': 0.5082, 'learning_rate': 1.8739638811904363e-05, 'epoch': 0.19} + 19%|█▊ | 1634/8750 [2:40:25<11:20:24, 5.74s/it] {'loss': 0.5082, 'learning_rate': 1.8739638811904363e-05, 'epoch': 0.19} + 19%|█▊ | 1634/8750 [2:40:18<11:20:24, 5.74s/it] 19%|█▊ | 1635/8750 [2:40:24<11:17:24, 5.71s/it] 19%|█▊ | 1635/8750 [2:40:30<11:17:24, 5.71s/it] {'loss': 0.4617, 'learning_rate': 1.8737839243729504e-05, 'epoch': 0.19} + 19%|█▊ | 1635/8750 [2:40:30<11:17:24, 5.71s/it] {'loss': 0.4617, 'learning_rate': 1.8737839243729504e-05, 'epoch': 0.19} + 19%|█▊ | 1635/8750 
[2:40:24<11:17:24, 5.71s/it] 19%|█▊ | 1636/8750 [2:40:36<11:21:24, 5.75s/it] {'loss': 0.4841, 'learning_rate': 1.8736038478275584e-05, 'epoch': 0.19} + 19%|█▊ | 1636/8750 [2:40:36<11:21:24, 5.75s/it] 19%|█▊ | 1636/8750 [2:40:30<11:21:24, 5.75s/it] {'loss': 0.4841, 'learning_rate': 1.8736038478275584e-05, 'epoch': 0.19} + 19%|█▊ | 1636/8750 [2:40:30<11:21:24, 5.75s/it] 19%|█▊ | 1637/8750 [2:40:35<11:16:47, 5.71s/it] 19%|█▊ | 1637/8750 [2:40:42<11:16:47, 5.71s/it] {'loss': 0.4825, 'learning_rate': 1.873423651578935e-05, 'epoch': 0.19} + 19%|█▊ | 1637/8750 [2:40:42<11:16:47, 5.71s/it] {'loss': 0.4825, 'learning_rate': 1.873423651578935e-05, 'epoch': 0.19} + 19%|█▊ | 1637/8750 [2:40:35<11:16:47, 5.71s/it] 19%|█▊ | 1638/8750 [2:40:41<11:26:07, 5.79s/it] 19%|█▊ | 1638/8750 [2:40:48<11:26:07, 5.79s/it] {'loss': 0.4856, 'learning_rate': 1.8732433356517713e-05, 'epoch': 0.19} + 19%|█▊ | 1638/8750 [2:40:48<11:26:07, 5.79s/it] {'loss': 0.4856, 'learning_rate': 1.8732433356517713e-05, 'epoch': 0.19} + 19%|█▊ | 1638/8750 [2:40:41<11:26:07, 5.79s/it] 19%|█▊ | 1639/8750 [2:40:53<11:18:57, 5.73s/it] 19%|█▊ | 1639/8750 [2:40:47<11:18:58, 5.73s/it] {'loss': 0.4894, 'learning_rate': 1.8730629000707746e-05, 'epoch': 0.19} + 19%|█▊ | 1639/8750 [2:40:53<11:18:57, 5.73s/it] {'loss': 0.4894, 'learning_rate': 1.8730629000707746e-05, 'epoch': 0.19} + 19%|█▊ | 1639/8750 [2:40:47<11:18:58, 5.73s/it] 19%|█▊ | 1640/8750 [2:40:59<11:17:13, 5.72s/it] 19%|█▊ | 1640/8750 [2:40:52<11:17:14, 5.72s/it] {'loss': 0.4878, 'learning_rate': 1.872882344860668e-05, 'epoch': 0.19} + 19%|█▊ | 1640/8750 [2:40:59<11:17:13, 5.72s/it] {'loss': 0.4878, 'learning_rate': 1.872882344860668e-05, 'epoch': 0.19} + 19%|█▊ | 1640/8750 [2:40:52<11:17:14, 5.72s/it] 19%|█▉ | 1641/8750 [2:40:58<11:08:32, 5.64s/it] 19%|█▉ | 1641/8750 [2:41:04<11:08:33, 5.64s/it] {'loss': 0.5066, 'learning_rate': 1.872701670046192e-05, 'epoch': 0.19} + 19%|█▉ | 1641/8750 [2:41:04<11:08:33, 5.64s/it] {'loss': 0.5066, 'learning_rate': 
1.872701670046192e-05, 'epoch': 0.19} + 19%|█▉ | 1641/8750 [2:40:58<11:08:32, 5.64s/it] 19%|█▉ | 1642/8750 [2:41:10<11:16:46, 5.71s/it] 19%|█▉ | 1642/8750 [2:41:04<11:16:47, 5.71s/it] {'loss': 0.4937, 'learning_rate': 1.8725208756521036e-05, 'epoch': 0.19} + 19%|█▉ | 1642/8750 [2:41:10<11:16:46, 5.71s/it] {'loss': 0.4937, 'learning_rate': 1.8725208756521036e-05, 'epoch': 0.19} + 19%|█▉ | 1642/8750 [2:41:04<11:16:47, 5.71s/it] 19%|█▉ | 1643/8750 [2:41:09<11:15:56, 5.71s/it] 19%|█▉ | 1643/8750 [2:41:16<11:15:57, 5.71s/it] {'loss': 0.4747, 'learning_rate': 1.8723399617031754e-05, 'epoch': 0.19} + 19%|█▉ | 1643/8750 [2:41:16<11:15:57, 5.71s/it] {'loss': 0.4747, 'learning_rate': 1.8723399617031754e-05, 'epoch': 0.19} + 19%|█▉ | 1643/8750 [2:41:09<11:15:56, 5.71s/it] 19%|█▉ | 1644/8750 [2:41:22<11:26:11, 5.79s/it] 19%|█▉ | 1644/8750 [2:41:15<11:26:11, 5.79s/it] {'loss': 0.4769, 'learning_rate': 1.8721589282241956e-05, 'epoch': 0.19} + 19%|█▉ | 1644/8750 [2:41:22<11:26:11, 5.79s/it] {'loss': 0.4769, 'learning_rate': 1.8721589282241956e-05, 'epoch': 0.19} + 19%|█▉ | 1644/8750 [2:41:15<11:26:11, 5.79s/it] 19%|█▉ | 1645/8750 [2:41:21<11:22:57, 5.77s/it] 19%|█▉ | 1645/8750 [2:41:28<11:22:57, 5.77s/it] {'loss': 0.4826, 'learning_rate': 1.8719777752399713e-05, 'epoch': 0.19} + 19%|█▉ | 1645/8750 [2:41:28<11:22:57, 5.77s/it] {'loss': 0.4826, 'learning_rate': 1.8719777752399713e-05, 'epoch': 0.19} + 19%|█▉ | 1645/8750 [2:41:21<11:22:57, 5.77s/it] 19%|█▉ | 1646/8750 [2:41:33<11:17:38, 5.72s/it] 19%|█▉ | 1646/8750 [2:41:27<11:17:39, 5.72s/it] {'loss': 0.4781, 'learning_rate': 1.8717965027753235e-05, 'epoch': 0.19} + 19%|█▉ | 1646/8750 [2:41:33<11:17:38, 5.72s/it] {'loss': 0.4781, 'learning_rate': 1.8717965027753235e-05, 'epoch': 0.19} + 19%|█▉ | 1646/8750 [2:41:27<11:17:39, 5.72s/it] 19%|█▉ | 1647/8750 [2:41:33<11:23:16, 5.77s/it] 19%|█▉ | 1647/8750 [2:41:39<11:23:16, 5.77s/it] {'loss': 0.4992, 'learning_rate': 1.8716151108550912e-05, 'epoch': 0.19} + 19%|█▉ | 1647/8750 
[2:41:39<11:23:16, 5.77s/it] {'loss': 0.4992, 'learning_rate': 1.8716151108550912e-05, 'epoch': 0.19} + 19%|█▉ | 1647/8750 [2:41:33<11:23:16, 5.77s/it] 19%|█▉ | 1648/8750 [2:41:45<11:23:44, 5.78s/it] 19%|█▉ | 1648/8750 [2:41:38<11:23:44, 5.78s/it] {'loss': 0.4906, 'learning_rate': 1.871433599504129e-05, 'epoch': 0.19} + 19%|█▉ | 1648/8750 [2:41:45<11:23:44, 5.78s/it] {'loss': 0.4906, 'learning_rate': 1.871433599504129e-05, 'epoch': 0.19} + 19%|█▉ | 1648/8750 [2:41:38<11:23:44, 5.78s/it] 19%|█▉ | 1649/8750 [2:41:51<11:21:45, 5.76s/it] 19%|█▉ | 1649/8750 [2:41:44<11:21:45, 5.76s/it] {'loss': 0.4832, 'learning_rate': 1.8712519687473075e-05, 'epoch': 0.19} + 19%|█▉ | 1649/8750 [2:41:51<11:21:45, 5.76s/it] {'loss': 0.4832, 'learning_rate': 1.8712519687473075e-05, 'epoch': 0.19} + 19%|█▉ | 1649/8750 [2:41:44<11:21:45, 5.76s/it]9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8112 AutoResumeHook: Checking whether to suspend... + 3 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 5 AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend...0 +12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 19%|█▉ | 1650/8750 [2:41:56<11:16:07, 5.71s/it]15 7AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... 
+ + + 19%|█▉ | 1650/8750 [2:41:50<11:16:07, 5.71s/it] {'loss': 0.4871, 'learning_rate': 1.8710702186095147e-05, 'epoch': 0.19} + 19%|█▉ | 1650/8750 [2:41:56<11:16:07, 5.71s/it] {'loss': 0.4871, 'learning_rate': 1.8710702186095147e-05, 'epoch': 0.19} + 19%|█▉ | 1650/8750 [2:41:50<11:16:07, 5.71s/it] 19%|█▉ | 1651/8750 [2:41:56<11:24:59, 5.79s/it] 19%|█▉ | 1651/8750 [2:42:02<11:25:00, 5.79s/it] {'loss': 0.4787, 'learning_rate': 1.8708883491156544e-05, 'epoch': 0.19} + 19%|█▉ | 1651/8750 [2:42:02<11:25:00, 5.79s/it] {'loss': 0.4787, 'learning_rate': 1.8708883491156544e-05, 'epoch': 0.19} + 19%|█▉ | 1651/8750 [2:41:56<11:24:59, 5.79s/it] 19%|█▉ | 1652/8750 [2:42:01<11:24:19, 5.78s/it] 19%|█▉ | 1652/8750 [2:42:08<11:24:19, 5.78s/it] {'loss': 0.4772, 'learning_rate': 1.8707063602906466e-05, 'epoch': 0.19} + 19%|█▉ | 1652/8750 [2:42:08<11:24:19, 5.78s/it] {'loss': 0.4772, 'learning_rate': 1.8707063602906466e-05, 'epoch': 0.19} + 19%|█▉ | 1652/8750 [2:42:01<11:24:19, 5.78s/it] 19%|█▉ | 1653/8750 [2:42:07<11:18:19, 5.73s/it] 19%|█▉ | 1653/8750 [2:42:14<11:18:19, 5.73s/it] {'loss': 0.4791, 'learning_rate': 1.8705242521594276e-05, 'epoch': 0.19} + 19%|█▉ | 1653/8750 [2:42:14<11:18:19, 5.73s/it] {'loss': 0.4791, 'learning_rate': 1.8705242521594276e-05, 'epoch': 0.19} + 19%|█▉ | 1653/8750 [2:42:07<11:18:19, 5.73s/it] 19%|█▉ | 1654/8750 [2:42:13<11:17:00, 5.72s/it] 19%|█▉ | 1654/8750 [2:42:19<11:17:00, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.870342024746951e-05, 'epoch': 0.19} + 19%|█▉ | 1654/8750 [2:42:19<11:17:00, 5.72s/it] {'loss': 0.4847, 'learning_rate': 1.870342024746951e-05, 'epoch': 0.19} + 19%|█▉ | 1654/8750 [2:42:13<11:17:00, 5.72s/it] 19%|█▉ | 1655/8750 [2:42:18<11:11:20, 5.68s/it] 19%|█▉ | 1655/8750 [2:42:25<11:11:20, 5.68s/it] {'loss': 0.4919, 'learning_rate': 1.8701596780781855e-05, 'epoch': 0.19} + 19%|█▉ | 1655/8750 [2:42:25<11:11:20, 5.68s/it] {'loss': 0.4919, 'learning_rate': 1.8701596780781855e-05, 'epoch': 0.19} + 19%|█▉ | 1655/8750 [2:42:18<11:11:20, 
5.68s/it] 19%|█▉ | 1656/8750 [2:42:31<11:19:44, 5.75s/it] 19%|█▉ | 1656/8750 [2:42:24<11:19:45, 5.75s/it] {'loss': 0.4897, 'learning_rate': 1.869977212178117e-05, 'epoch': 0.19} + 19%|█▉ | 1656/8750 [2:42:31<11:19:44, 5.75s/it] {'loss': 0.4897, 'learning_rate': 1.869977212178117e-05, 'epoch': 0.19} + 19%|█▉ | 1656/8750 [2:42:24<11:19:45, 5.75s/it] 19%|█▉ | 1657/8750 [2:42:30<11:19:52, 5.75s/it] 19%|█▉ | 1657/8750 [2:42:37<11:19:52, 5.75s/it] {'loss': 0.4652, 'learning_rate': 1.8697946270717468e-05, 'epoch': 0.19} + 19%|█▉ | 1657/8750 [2:42:37<11:19:52, 5.75s/it] {'loss': 0.4652, 'learning_rate': 1.8697946270717468e-05, 'epoch': 0.19} + 19%|█▉ | 1657/8750 [2:42:30<11:19:52, 5.75s/it] 19%|█▉ | 1658/8750 [2:42:42<11:14:08, 5.70s/it] 19%|█▉ | 1658/8750 [2:42:36<11:14:09, 5.70s/it] {'loss': 0.493, 'learning_rate': 1.8696119227840937e-05, 'epoch': 0.19} + 19%|█▉ | 1658/8750 [2:42:42<11:14:08, 5.70s/it] {'loss': 0.493, 'learning_rate': 1.8696119227840937e-05, 'epoch': 0.19} + 19%|█▉ | 1658/8750 [2:42:36<11:14:09, 5.70s/it] 19%|█▉ | 1659/8750 [2:42:42<11:24:54, 5.80s/it] 19%|█▉ | 1659/8750 [2:42:48<11:24:55, 5.80s/it] {'loss': 0.46, 'learning_rate': 1.869429099340192e-05, 'epoch': 0.19} + 19%|█▉ | 1659/8750 [2:42:48<11:24:55, 5.80s/it] {'loss': 0.46, 'learning_rate': 1.869429099340192e-05, 'epoch': 0.19} + 19%|█▉ | 1659/8750 [2:42:42<11:24:54, 5.80s/it] 19%|█▉ | 1660/8750 [2:42:47<11:19:35, 5.75s/it] 19%|█▉ | 1660/8750 [2:42:54<11:19:36, 5.75s/it] {'loss': 0.5069, 'learning_rate': 1.8692461567650925e-05, 'epoch': 0.19} + 19%|█▉ | 1660/8750 [2:42:54<11:19:36, 5.75s/it] {'loss': 0.5069, 'learning_rate': 1.8692461567650925e-05, 'epoch': 0.19} + 19%|█▉ | 1660/8750 [2:42:47<11:19:35, 5.75s/it] 19%|█▉ | 1661/8750 [2:42:59<11:17:42, 5.74s/it] 19%|█▉ | 1661/8750 [2:42:53<11:17:43, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.869063095083863e-05, 'epoch': 0.19} + 19%|█▉ | 1661/8750 [2:42:59<11:17:42, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.869063095083863e-05, 'epoch': 0.19} 
+ 19%|█▉ | 1661/8750 [2:42:53<11:17:43, 5.74s/it] 19%|█▉ | 1662/8750 [2:42:59<11:17:03, 5.73s/it] 19%|█▉ | 1662/8750 [2:43:05<11:17:03, 5.73s/it] {'loss': 0.4678, 'learning_rate': 1.8688799143215863e-05, 'epoch': 0.19} + 19%|█▉ | 1662/8750 [2:43:05<11:17:03, 5.73s/it] {'loss': 0.4678, 'learning_rate': 1.8688799143215863e-05, 'epoch': 0.19} + 19%|█▉ | 1662/8750 [2:42:59<11:17:03, 5.73s/it] 19%|█▉ | 1663/8750 [2:43:04<11:09:24, 5.67s/it] 19%|█▉ | 1663/8750 [2:43:11<11:09:24, 5.67s/it] {'loss': 0.4733, 'learning_rate': 1.8686966145033626e-05, 'epoch': 0.19} + 19%|█▉ | 1663/8750 [2:43:04<11:09:24, 5.67s/it] {'loss': 0.4733, 'learning_rate': 1.8686966145033626e-05, 'epoch': 0.19} + 19%|█▉ | 1663/8750 [2:43:11<11:09:24, 5.67s/it] 19%|█▉ | 1664/8750 [2:43:10<11:21:53, 5.77s/it] 19%|█▉ | 1664/8750 [2:43:17<11:21:53, 5.77s/it] {'loss': 0.4652, 'learning_rate': 1.8685131956543082e-05, 'epoch': 0.19} + 19%|█▉ | 1664/8750 [2:43:17<11:21:53, 5.77s/it] {'loss': 0.4652, 'learning_rate': 1.8685131956543082e-05, 'epoch': 0.19} + 19%|█▉ | 1664/8750 [2:43:10<11:21:53, 5.77s/it] 19%|█▉ | 1665/8750 [2:43:16<11:15:29, 5.72s/it] 19%|█▉ | 1665/8750 [2:43:22<11:15:29, 5.72s/it] {'loss': 0.4923, 'learning_rate': 1.8683296577995554e-05, 'epoch': 0.19} + 19%|█▉ | 1665/8750 [2:43:22<11:15:29, 5.72s/it] {'loss': 0.4923, 'learning_rate': 1.8683296577995554e-05, 'epoch': 0.19} + 19%|█▉ | 1665/8750 [2:43:16<11:15:29, 5.72s/it] 19%|█▉ | 1666/8750 [2:43:22<11:17:15, 5.74s/it] 19%|█▉ | 1666/8750 [2:43:28<11:17:15, 5.74s/it] {'loss': 0.4963, 'learning_rate': 1.8681460009642533e-05, 'epoch': 0.19} + 19%|█▉ | 1666/8750 [2:43:28<11:17:15, 5.74s/it] {'loss': 0.4963, 'learning_rate': 1.8681460009642533e-05, 'epoch': 0.19} + 19%|█▉ | 1666/8750 [2:43:22<11:17:15, 5.74s/it] 19%|█▉ | 1667/8750 [2:43:27<11:12:02, 5.69s/it] 19%|█▉ | 1667/8750 [2:43:34<11:12:03, 5.69s/it] {'loss': 0.4617, 'learning_rate': 1.867962225173566e-05, 'epoch': 0.19} + 19%|█▉ | 1667/8750 [2:43:34<11:12:03, 5.69s/it] {'loss': 0.4617, 
'learning_rate': 1.867962225173566e-05, 'epoch': 0.19} + 19%|█▉ | 1667/8750 [2:43:27<11:12:02, 5.69s/it] 19%|█▉ | 1668/8750 [2:43:33<11:13:28, 5.71s/it] 19%|█▉ | 1668/8750 [2:43:39<11:13:28, 5.71s/it] {'loss': 0.4967, 'learning_rate': 1.867778330452676e-05, 'epoch': 0.19} + 19%|█▉ | 1668/8750 [2:43:39<11:13:28, 5.71s/it] {'loss': 0.4967, 'learning_rate': 1.867778330452676e-05, 'epoch': 0.19} + 19%|█▉ | 1668/8750 [2:43:33<11:13:28, 5.71s/it] 19%|█▉ | 1669/8750 [2:43:39<11:11:03, 5.69s/it] 19%|█▉ | 1669/8750 [2:43:45<11:11:03, 5.69s/it] {'loss': 0.4924, 'learning_rate': 1.8675943168267804e-05, 'epoch': 0.19} + 19%|█▉ | 1669/8750 [2:43:45<11:11:03, 5.69s/it] {'loss': 0.4924, 'learning_rate': 1.8675943168267804e-05, 'epoch': 0.19} + 19%|█▉ | 1669/8750 [2:43:39<11:11:03, 5.69s/it] 19%|█▉ | 1670/8750 [2:43:45<11:19:39, 5.76s/it] 19%|█▉ | 1670/8750 [2:43:51<11:19:39, 5.76s/it] {'loss': 0.4953, 'learning_rate': 1.8674101843210935e-05, 'epoch': 0.19} + 19%|█▉ | 1670/8750 [2:43:51<11:19:39, 5.76s/it] {'loss': 0.4953, 'learning_rate': 1.8674101843210935e-05, 'epoch': 0.19} + 19%|█▉ | 1670/8750 [2:43:45<11:19:39, 5.76s/it] 19%|█▉ | 1671/8750 [2:43:57<11:18:04, 5.75s/it] 19%|█▉ | 1671/8750 [2:43:50<11:18:04, 5.75s/it] {'loss': 0.4848, 'learning_rate': 1.8672259329608457e-05, 'epoch': 0.19} + 19%|█▉ | 1671/8750 [2:43:57<11:18:04, 5.75s/it] {'loss': 0.4848, 'learning_rate': 1.8672259329608457e-05, 'epoch': 0.19} + 19%|█▉ | 1671/8750 [2:43:50<11:18:04, 5.75s/it] 19%|█▉ | 1672/8750 [2:43:56<11:20:21, 5.77s/it] 19%|█▉ | 1672/8750 [2:44:03<11:20:21, 5.77s/it] {'loss': 0.4905, 'learning_rate': 1.8670415627712825e-05, 'epoch': 0.19} + 19%|█▉ | 1672/8750 [2:44:03<11:20:21, 5.77s/it] {'loss': 0.4905, 'learning_rate': 1.8670415627712825e-05, 'epoch': 0.19} + 19%|█▉ | 1672/8750 [2:43:56<11:20:21, 5.77s/it] 19%|█▉ | 1673/8750 [2:44:02<11:14:31, 5.72s/it] 19%|█▉ | 1673/8750 [2:44:08<11:14:31, 5.72s/it] {'loss': 0.4961, 'learning_rate': 1.866857073777668e-05, 'epoch': 0.19} + 19%|█▉ | 
1673/8750 [2:44:08<11:14:31, 5.72s/it] {'loss': 0.4961, 'learning_rate': 1.866857073777668e-05, 'epoch': 0.19} + 19%|█▉ | 1673/8750 [2:44:02<11:14:31, 5.72s/it] 19%|█▉ | 1674/8750 [2:44:08<11:21:16, 5.78s/it] 19%|█▉ | 1674/8750 [2:44:14<11:21:16, 5.78s/it] {'loss': 0.4816, 'learning_rate': 1.8666724660052807e-05, 'epoch': 0.19} + 19%|█▉ | 1674/8750 [2:44:14<11:21:16, 5.78s/it] {'loss': 0.4816, 'learning_rate': 1.8666724660052807e-05, 'epoch': 0.19} + 19%|█▉ | 1674/8750 [2:44:08<11:21:16, 5.78s/it] 19%|█▉ | 1675/8750 [2:44:14<11:26:39, 5.82s/it] 19%|█▉ | 1675/8750 [2:44:20<11:26:38, 5.82s/it] {'loss': 0.4707, 'learning_rate': 1.8664877394794158e-05, 'epoch': 0.19} + 19%|█▉ | 1675/8750 [2:44:20<11:26:38, 5.82s/it] {'loss': 0.4707, 'learning_rate': 1.8664877394794158e-05, 'epoch': 0.19} + 19%|█▉ | 1675/8750 [2:44:14<11:26:39, 5.82s/it] 19%|█▉ | 1676/8750 [2:44:19<11:20:14, 5.77s/it] 19%|█▉ | 1676/8750 [2:44:26<11:20:13, 5.77s/it] {'loss': 0.4883, 'learning_rate': 1.8663028942253854e-05, 'epoch': 0.19} + 19%|█▉ | 1676/8750 [2:44:26<11:20:13, 5.77s/it] {'loss': 0.4883, 'learning_rate': 1.8663028942253854e-05, 'epoch': 0.19} + 19%|█▉ | 1676/8750 [2:44:19<11:20:14, 5.77s/it] 19%|█▉ | 1677/8750 [2:44:25<11:18:39, 5.76s/it] 19%|█▉ | 1677/8750 [2:44:31<11:18:39, 5.76s/it] {'loss': 0.4905, 'learning_rate': 1.8661179302685177e-05, 'epoch': 0.19} + 19%|█▉ | 1677/8750 [2:44:31<11:18:39, 5.76s/it] {'loss': 0.4905, 'learning_rate': 1.8661179302685177e-05, 'epoch': 0.19} + 19%|█▉ | 1677/8750 [2:44:25<11:18:39, 5.76s/it] 19%|█▉ | 1678/8750 [2:44:37<11:18:29, 5.76s/it] 19%|█▉ | 1678/8750 [2:44:31<11:18:30, 5.76s/it] {'loss': 0.4828, 'learning_rate': 1.8659328476341557e-05, 'epoch': 0.19} + 19%|█▉ | 1678/8750 [2:44:37<11:18:29, 5.76s/it] {'loss': 0.4828, 'learning_rate': 1.8659328476341557e-05, 'epoch': 0.19} + 19%|█▉ | 1678/8750 [2:44:31<11:18:30, 5.76s/it] 19%|█▉ | 1679/8750 [2:44:36<11:14:15, 5.72s/it] 19%|█▉ | 1679/8750 [2:44:43<11:14:15, 5.72s/it] {'loss': 0.4787, 
'learning_rate': 1.865747646347661e-05, 'epoch': 0.19} + 19%|█▉ | 1679/8750 [2:44:43<11:14:15, 5.72s/it] {'loss': 0.4787, 'learning_rate': 1.865747646347661e-05, 'epoch': 0.19} + 19%|█▉ | 1679/8750 [2:44:36<11:14:15, 5.72s/it] 19%|█▉ | 1680/8750 [2:44:42<11:19:38, 5.77s/it] 19%|█▉ | 1680/8750 [2:44:49<11:19:38, 5.77s/it] {'loss': 0.478, 'learning_rate': 1.8655623264344103e-05, 'epoch': 0.19} + 19%|█▉ | 1680/8750 [2:44:49<11:19:38, 5.77s/it] {'loss': 0.478, 'learning_rate': 1.8655623264344103e-05, 'epoch': 0.19} + 19%|█▉ | 1680/8750 [2:44:42<11:19:38, 5.77s/it] 19%|█▉ | 1681/8750 [2:44:54<11:11:48, 5.70s/it] 19%|█▉ | 1681/8750 [2:44:48<11:11:49, 5.70s/it] {'loss': 0.5098, 'learning_rate': 1.8653768879197956e-05, 'epoch': 0.19} + 19%|█▉ | 1681/8750 [2:44:54<11:11:48, 5.70s/it] {'loss': 0.5098, 'learning_rate': 1.8653768879197956e-05, 'epoch': 0.19} + 19%|█▉ | 1681/8750 [2:44:48<11:11:49, 5.70s/it] 19%|█▉ | 1682/8750 [2:44:54<11:15:49, 5.74s/it] 19%|█▉ | 1682/8750 [2:45:00<11:15:49, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.865191330829227e-05, 'epoch': 0.19} + 19%|█▉ | 1682/8750 [2:45:00<11:15:49, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.865191330829227e-05, 'epoch': 0.19} + 19%|█▉ | 1682/8750 [2:44:54<11:15:49, 5.74s/it] 19%|█▉ | 1683/8750 [2:44:59<11:21:07, 5.78s/it] 19%|█▉ | 1683/8750 [2:45:06<11:21:07, 5.78s/it] {'loss': 0.4881, 'learning_rate': 1.8650056551881297e-05, 'epoch': 0.19} + 19%|█▉ | 1683/8750 [2:45:06<11:21:07, 5.78s/it] {'loss': 0.4881, 'learning_rate': 1.8650056551881297e-05, 'epoch': 0.19} + 19%|█▉ | 1683/8750 [2:44:59<11:21:07, 5.78s/it] 19%|█▉ | 1684/8750 [2:45:05<11:18:04, 5.76s/it] 19%|█▉ | 1684/8750 [2:45:12<11:18:04, 5.76s/it] {'loss': 0.4741, 'learning_rate': 1.8648198610219452e-05, 'epoch': 0.19} + 19%|█▉ | 1684/8750 [2:45:12<11:18:04, 5.76s/it] {'loss': 0.4741, 'learning_rate': 1.8648198610219452e-05, 'epoch': 0.19} + 19%|█▉ | 1684/8750 [2:45:05<11:18:04, 5.76s/it] 19%|█▉ | 1685/8750 [2:45:17<11:17:45, 5.76s/it] 19%|█▉ | 1685/8750 
[2:45:11<11:17:45, 5.76s/it] {'loss': 0.4758, 'learning_rate': 1.864633948356132e-05, 'epoch': 0.19} + 19%|█▉ | 1685/8750 [2:45:17<11:17:45, 5.76s/it] {'loss': 0.4758, 'learning_rate': 1.864633948356132e-05, 'epoch': 0.19} + 19%|█▉ | 1685/8750 [2:45:11<11:17:45, 5.76s/it] 19%|█▉ | 1686/8750 [2:45:17<11:29:47, 5.86s/it] 19%|█▉ | 1686/8750 [2:45:23<11:29:47, 5.86s/it] {'loss': 0.4836, 'learning_rate': 1.8644479172161635e-05, 'epoch': 0.19} + 19%|█▉ | 1686/8750 [2:45:23<11:29:47, 5.86s/it] {'loss': 0.4836, 'learning_rate': 1.8644479172161635e-05, 'epoch': 0.19} + 19%|█▉ | 1686/8750 [2:45:17<11:29:47, 5.86s/it] 19%|█▉ | 1687/8750 [2:45:23<11:23:13, 5.80s/it] 19%|█▉ | 1687/8750 [2:45:29<11:23:13, 5.80s/it] {'loss': 0.4902, 'learning_rate': 1.8642617676275306e-05, 'epoch': 0.19} + 19%|█▉ | 1687/8750 [2:45:29<11:23:13, 5.80s/it] {'loss': 0.4902, 'learning_rate': 1.8642617676275306e-05, 'epoch': 0.19} + 19%|█▉ | 1687/8750 [2:45:23<11:23:13, 5.80s/it] 19%|█▉ | 1688/8750 [2:45:29<11:35:31, 5.91s/it] 19%|█▉ | 1688/8750 [2:45:35<11:35:31, 5.91s/it] {'loss': 0.4673, 'learning_rate': 1.8640754996157397e-05, 'epoch': 0.19} + 19%|█▉ | 1688/8750 [2:45:35<11:35:31, 5.91s/it] {'loss': 0.4673, 'learning_rate': 1.8640754996157397e-05, 'epoch': 0.19} + 19%|█▉ | 1688/8750 [2:45:29<11:35:31, 5.91s/it] 19%|█▉ | 1689/8750 [2:45:34<11:21:06, 5.79s/it] 19%|█▉ | 1689/8750 [2:45:41<11:21:06, 5.79s/it] {'loss': 0.5032, 'learning_rate': 1.863889113206314e-05, 'epoch': 0.19} + {'loss': 0.5032, 'learning_rate': 1.863889113206314e-05, 'epoch': 0.19} 19%|█▉ | 1689/8750 [2:45:41<11:21:06, 5.79s/it] + 19%|█▉ | 1689/8750 [2:45:34<11:21:06, 5.79s/it] 19%|█▉ | 1690/8750 [2:45:40<11:29:16, 5.86s/it] 19%|█▉ | 1690/8750 [2:45:47<11:29:16, 5.86s/it] {'loss': 0.4906, 'learning_rate': 1.863702608424793e-05, 'epoch': 0.19} + 19%|█▉ | 1690/8750 [2:45:47<11:29:16, 5.86s/it] {'loss': 0.4906, 'learning_rate': 1.863702608424793e-05, 'epoch': 0.19} + 19%|█▉ | 1690/8750 [2:45:40<11:29:16, 5.86s/it] 19%|█▉ | 1691/8750 
[2:45:46<11:25:01, 5.82s/it] 19%|█▉ | 1691/8750 [2:45:53<11:25:01, 5.82s/it] {'loss': 0.4808, 'learning_rate': 1.863515985296731e-05, 'epoch': 0.19} + 19%|█▉ | 1691/8750 [2:45:53<11:25:01, 5.82s/it] {'loss': 0.4808, 'learning_rate': 1.863515985296731e-05, 'epoch': 0.19} + 19%|█▉ | 1691/8750 [2:45:46<11:25:01, 5.82s/it] 19%|█▉ | 1692/8750 [2:45:52<11:23:06, 5.81s/it] 19%|█▉ | 1692/8750 [2:45:58<11:23:05, 5.81s/it] {'loss': 0.4903, 'learning_rate': 1.8633292438476998e-05, 'epoch': 0.19} + 19%|█▉ | 1692/8750 [2:45:58<11:23:05, 5.81s/it] {'loss': 0.4903, 'learning_rate': 1.8633292438476998e-05, 'epoch': 0.19} + 19%|█▉ | 1692/8750 [2:45:52<11:23:06, 5.81s/it] 19%|█▉ | 1693/8750 [2:45:57<11:14:20, 5.73s/it] 19%|█▉ | 1693/8750 [2:46:04<11:14:20, 5.73s/it] {'loss': 0.4751, 'learning_rate': 1.8631423841032876e-05, 'epoch': 0.19} + 19%|█▉ | 1693/8750 [2:46:04<11:14:20, 5.73s/it] {'loss': 0.4751, 'learning_rate': 1.8631423841032876e-05, 'epoch': 0.19} + 19%|█▉ | 1693/8750 [2:45:57<11:14:20, 5.73s/it] 19%|█▉ | 1694/8750 [2:46:03<11:15:29, 5.74s/it] 19%|█▉ | 1694/8750 [2:46:10<11:15:29, 5.74s/it] {'loss': 0.4874, 'learning_rate': 1.8629554060890982e-05, 'epoch': 0.19} + 19%|█▉ | 1694/8750 [2:46:10<11:15:29, 5.74s/it] {'loss': 0.4874, 'learning_rate': 1.8629554060890982e-05, 'epoch': 0.19} + 19%|█▉ | 1694/8750 [2:46:03<11:15:29, 5.74s/it] 19%|█▉ | 1695/8750 [2:46:09<11:16:52, 5.76s/it] 19%|█▉ | 1695/8750 [2:46:15<11:16:52, 5.76s/it] {'loss': 0.4805, 'learning_rate': 1.8627683098307516e-05, 'epoch': 0.19} + 19%|█▉ | 1695/8750 [2:46:15<11:16:52, 5.76s/it] {'loss': 0.4805, 'learning_rate': 1.8627683098307516e-05, 'epoch': 0.19} + 19%|█▉ | 1695/8750 [2:46:09<11:16:52, 5.76s/it] 19%|█▉ | 1696/8750 [2:46:15<11:13:56, 5.73s/it] 19%|█▉ | 1696/8750 [2:46:21<11:13:56, 5.73s/it] {'loss': 0.4862, 'learning_rate': 1.862581095353884e-05, 'epoch': 0.19} + 19%|█▉ | 1696/8750 [2:46:21<11:13:56, 5.73s/it] {'loss': 0.4862, 'learning_rate': 1.862581095353884e-05, 'epoch': 0.19} + 19%|█▉ | 1696/8750 
[2:46:15<11:13:56, 5.73s/it] 19%|█▉ | 1697/8750 [2:46:20<11:08:44, 5.69s/it] 19%|█▉ | 1697/8750 [2:46:27<11:08:44, 5.69s/it] {'loss': 0.4728, 'learning_rate': 1.8623937626841485e-05, 'epoch': 0.19} + 19%|█▉ | 1697/8750 [2:46:27<11:08:44, 5.69s/it] {'loss': 0.4728, 'learning_rate': 1.8623937626841485e-05, 'epoch': 0.19} + 19%|█▉ | 1697/8750 [2:46:20<11:08:44, 5.69s/it] 19%|█▉ | 1698/8750 [2:46:32<11:05:57, 5.67s/it] 19%|█▉ | 1698/8750 [2:46:26<11:05:58, 5.67s/it] {'loss': 0.4992, 'learning_rate': 1.8622063118472135e-05, 'epoch': 0.19} + 19%|█▉ | 1698/8750 [2:46:32<11:05:57, 5.67s/it] {'loss': 0.4992, 'learning_rate': 1.8622063118472135e-05, 'epoch': 0.19} + 19%|█▉ | 1698/8750 [2:46:26<11:05:58, 5.67s/it] 19%|█▉ | 1699/8750 [2:46:38<11:08:19, 5.69s/it] 19%|█▉ | 1699/8750 [2:46:32<11:08:20, 5.69s/it] {'loss': 0.4798, 'learning_rate': 1.8620187428687643e-05, 'epoch': 0.19} + 19%|█▉ | 1699/8750 [2:46:38<11:08:19, 5.69s/it] {'loss': 0.4798, 'learning_rate': 1.8620187428687643e-05, 'epoch': 0.19} + 19%|█▉ | 1699/8750 [2:46:32<11:08:20, 5.69s/it]4 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 1 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... + 19%|█▉ | 1700/8750 [2:46:37<11:06:38, 5.67s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +1215 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... 
+ + 19%|█▉ | 1700/8750 [2:46:44<11:06:38, 5.67s/it] {'loss': 0.4886, 'learning_rate': 1.861831055774501e-05, 'epoch': 0.19} + 19%|█▉ | 1700/8750 [2:46:37<11:06:38, 5.67s/it]{'loss': 0.4886, 'learning_rate': 1.861831055774501e-05, 'epoch': 0.19} + 19%|█▉ | 1700/8750 [2:46:44<11:06:38, 5.67s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 19%|█▉ | 1701/8750 [2:47:06<20:59:11, 10.72s/it] 19%|█▉ | 1701/8750 [2:47:00<20:59:12, 10.72s/it] {'loss': 0.4727, 'learning_rate': 1.8616432505901427e-05, 'epoch': 0.19} + 19%|█▉ | 1701/8750 [2:47:06<20:59:11, 10.72s/it] {'loss': 0.4727, 'learning_rate': 1.8616432505901427e-05, 'epoch': 0.19} + 19%|█▉ | 1701/8750 [2:47:00<20:59:12, 10.72s/it] 19%|█▉ | 1702/8750 [2:47:12<18:13:36, 9.31s/it] 19%|█▉ | 1702/8750 [2:47:06<18:13:38, 9.31s/it] {'loss': 0.4862, 'learning_rate': 1.861455327341421e-05, 'epoch': 0.19} + 19%|█▉ | 1702/8750 [2:47:12<18:13:36, 9.31s/it] {'loss': 0.4862, 'learning_rate': 1.861455327341421e-05, 'epoch': 0.19} + 19%|█▉ | 1702/8750 [2:47:06<18:13:38, 9.31s/it] 19%|█▉ | 1703/8750 [2:47:18<16:04:50, 8.21s/it] 19%|█▉ | 1703/8750 [2:47:11<16:04:50, 8.21s/it] {'loss': 0.4721, 'learning_rate': 1.8612672860540865e-05, 'epoch': 0.19} + 19%|█▉ | 1703/8750 [2:47:18<16:04:50, 8.21s/it] {'loss': 0.4721, 'learning_rate': 1.8612672860540865e-05, 'epoch': 0.19} + 19%|█▉ | 1703/8750 [2:47:11<16:04:50, 8.21s/it] 19%|█▉ | 1704/8750 [2:47:24<14:38:38, 7.48s/it] 19%|█▉ | 1704/8750 [2:47:17<14:38:37, 7.48s/it] {'loss': 0.4631, 'learning_rate': 1.8610791267539053e-05, 'epoch': 0.19} + 19%|█▉ | 1704/8750 [2:47:24<14:38:38, 7.48s/it] {'loss': 0.4631, 'learning_rate': 1.8610791267539053e-05, 'epoch': 0.19} + 19%|█▉ | 1704/8750 [2:47:17<14:38:37, 7.48s/it] 19%|█▉ | 1705/8750 [2:47:23<13:39:40, 6.98s/it] 19%|█▉ | 1705/8750 [2:47:29<13:39:42, 6.98s/it] {'loss': 0.4935, 'learning_rate': 1.8608908494666593e-05, 'epoch': 0.19} + 19%|█▉ | 1705/8750 [2:47:29<13:39:42, 6.98s/it] {'loss': 0.4935, 'learning_rate': 1.8608908494666593e-05, 'epoch': 0.19} + 19%|█▉ | 1705/8750 [2:47:23<13:39:40, 6.98s/it] 19%|█▉ | 1706/8750 [2:47:29<12:55:27, 6.61s/it] 19%|█▉ | 1706/8750 [2:47:35<12:55:27, 6.61s/it] {'loss': 0.4937, 'learning_rate': 1.8607024542181465e-05, 'epoch': 0.19} + 19%|█▉ | 1706/8750 [2:47:35<12:55:27, 6.61s/it] {'loss': 0.4937, 'learning_rate': 
1.8607024542181465e-05, 'epoch': 0.19} + 19%|█▉ | 1706/8750 [2:47:29<12:55:27, 6.61s/it] 20%|█▉ | 1707/8750 [2:47:34<12:16:17, 6.27s/it] 20%|█▉ | 1707/8750 [2:47:41<12:16:17, 6.27s/it] {'loss': 0.474, 'learning_rate': 1.860513941034181e-05, 'epoch': 0.2} + 20%|█▉ | 1707/8750 [2:47:41<12:16:17, 6.27s/it] {'loss': 0.474, 'learning_rate': 1.860513941034181e-05, 'epoch': 0.2} + 20%|█▉ | 1707/8750 [2:47:34<12:16:17, 6.27s/it] 20%|█▉ | 1708/8750 [2:47:40<11:53:19, 6.08s/it] 20%|█▉ | 1708/8750 [2:47:46<11:53:19, 6.08s/it] {'loss': 0.4716, 'learning_rate': 1.8603253099405937e-05, 'epoch': 0.2} + 20%|█▉ | 1708/8750 [2:47:46<11:53:19, 6.08s/it] {'loss': 0.4716, 'learning_rate': 1.8603253099405937e-05, 'epoch': 0.2} + 20%|█▉ | 1708/8750 [2:47:40<11:53:19, 6.08s/it] 20%|█▉ | 1709/8750 [2:47:52<11:43:24, 5.99s/it] 20%|█▉ | 1709/8750 [2:47:46<11:43:25, 5.99s/it] {'loss': 0.5024, 'learning_rate': 1.8601365609632315e-05, 'epoch': 0.2} + 20%|█▉ | 1709/8750 [2:47:52<11:43:24, 5.99s/it] {'loss': 0.5024, 'learning_rate': 1.8601365609632315e-05, 'epoch': 0.2} + 20%|█▉ | 1709/8750 [2:47:46<11:43:25, 5.99s/it] 20%|█▉ | 1710/8750 [2:47:51<11:31:07, 5.89s/it] 20%|█▉ | 1710/8750 [2:47:58<11:31:09, 5.89s/it] {'loss': 0.4774, 'learning_rate': 1.859947694127956e-05, 'epoch': 0.2} + 20%|█▉ | 1710/8750 [2:47:51<11:31:07, 5.89s/it] {'loss': 0.4774, 'learning_rate': 1.859947694127956e-05, 'epoch': 0.2} + 20%|█▉ | 1710/8750 [2:47:58<11:31:09, 5.89s/it] 20%|█▉ | 1711/8750 [2:48:03<11:26:11, 5.85s/it] 20%|█▉ | 1711/8750 [2:47:57<11:26:11, 5.85s/it] {'loss': 0.4771, 'learning_rate': 1.859758709460648e-05, 'epoch': 0.2} + 20%|█▉ | 1711/8750 [2:48:03<11:26:11, 5.85s/it] {'loss': 0.4771, 'learning_rate': 1.859758709460648e-05, 'epoch': 0.2} + 20%|█▉ | 1711/8750 [2:47:57<11:26:11, 5.85s/it] 20%|█▉ | 1712/8750 [2:48:03<11:17:06, 5.77s/it] 20%|█▉ | 1712/8750 [2:48:09<11:17:07, 5.77s/it] {'loss': 0.4811, 'learning_rate': 1.8595696069872013e-05, 'epoch': 0.2} + 20%|█▉ | 1712/8750 [2:48:09<11:17:07, 5.77s/it] 
{'loss': 0.4811, 'learning_rate': 1.8595696069872013e-05, 'epoch': 0.2} + 20%|█▉ | 1712/8750 [2:48:03<11:17:06, 5.77s/it] 20%|█▉ | 1713/8750 [2:48:08<11:18:11, 5.78s/it] 20%|█▉ | 1713/8750 [2:48:15<11:18:11, 5.78s/it] {'loss': 0.476, 'learning_rate': 1.8593803867335276e-05, 'epoch': 0.2} + 20%|█▉ | 1713/8750 [2:48:15<11:18:11, 5.78s/it] {'loss': 0.476, 'learning_rate': 1.8593803867335276e-05, 'epoch': 0.2} + 20%|█▉ | 1713/8750 [2:48:08<11:18:11, 5.78s/it] 20%|█▉ | 1714/8750 [2:48:14<11:13:15, 5.74s/it] 20%|█▉ | 1714/8750 [2:48:21<11:13:15, 5.74s/it] {'loss': 0.4997, 'learning_rate': 1.859191048725554e-05, 'epoch': 0.2} + 20%|█▉ | 1714/8750 [2:48:21<11:13:15, 5.74s/it] {'loss': 0.4997, 'learning_rate': 1.859191048725554e-05, 'epoch': 0.2} + 20%|█▉ | 1714/8750 [2:48:14<11:13:15, 5.74s/it] 20%|█▉ | 1715/8750 [2:48:20<11:10:04, 5.71s/it] 20%|█▉ | 1715/8750 [2:48:26<11:10:04, 5.71s/it] {'loss': 0.4814, 'learning_rate': 1.8590015929892245e-05, 'epoch': 0.2} + 20%|█▉ | 1715/8750 [2:48:26<11:10:04, 5.71s/it] {'loss': 0.4814, 'learning_rate': 1.8590015929892245e-05, 'epoch': 0.2} + 20%|█▉ | 1715/8750 [2:48:20<11:10:04, 5.71s/it] 20%|█▉ | 1716/8750 [2:48:25<11:09:31, 5.71s/it] 20%|█▉ | 1716/8750 [2:48:32<11:09:31, 5.71s/it] {'loss': 0.4644, 'learning_rate': 1.858812019550499e-05, 'epoch': 0.2} + 20%|█▉ | 1716/8750 [2:48:32<11:09:31, 5.71s/it] {'loss': 0.4644, 'learning_rate': 1.858812019550499e-05, 'epoch': 0.2} + 20%|█▉ | 1716/8750 [2:48:25<11:09:31, 5.71s/it] 20%|█▉ | 1717/8750 [2:48:31<11:19:58, 5.80s/it] 20%|█▉ | 1717/8750 [2:48:38<11:19:58, 5.80s/it] {'loss': 0.4897, 'learning_rate': 1.8586223284353522e-05, 'epoch': 0.2} + 20%|█▉ | 1717/8750 [2:48:38<11:19:58, 5.80s/it] {'loss': 0.4897, 'learning_rate': 1.8586223284353522e-05, 'epoch': 0.2} + 20%|█▉ | 1717/8750 [2:48:31<11:19:58, 5.80s/it] 20%|█▉ | 1718/8750 [2:48:37<11:15:06, 5.76s/it] 20%|█▉ | 1718/8750 [2:48:44<11:15:07, 5.76s/it] {'loss': 0.4907, 'learning_rate': 1.8584325196697767e-05, 'epoch': 0.2} + 20%|█▉ | 
1718/8750 [2:48:44<11:15:07, 5.76s/it] {'loss': 0.4907, 'learning_rate': 1.8584325196697767e-05, 'epoch': 0.2} + 20%|█▉ | 1718/8750 [2:48:37<11:15:06, 5.76s/it] 20%|█▉ | 1719/8750 [2:48:50<11:21:48, 5.82s/it] 20%|█▉ | 1719/8750 [2:48:43<11:21:50, 5.82s/it] {'loss': 0.475, 'learning_rate': 1.8582425932797807e-05, 'epoch': 0.2} + 20%|█▉ | 1719/8750 [2:48:50<11:21:48, 5.82s/it] {'loss': 0.475, 'learning_rate': 1.8582425932797807e-05, 'epoch': 0.2} + 20%|█▉ | 1719/8750 [2:48:43<11:21:50, 5.82s/it] 20%|█▉ | 1720/8750 [2:48:55<11:15:23, 5.76s/it] 20%|█▉ | 1720/8750 [2:48:49<11:15:24, 5.76s/it] {'loss': 0.4799, 'learning_rate': 1.8580525492913884e-05, 'epoch': 0.2} + 20%|█▉ | 1720/8750 [2:48:55<11:15:23, 5.76s/it] {'loss': 0.4799, 'learning_rate': 1.8580525492913884e-05, 'epoch': 0.2} + 20%|█▉ | 1720/8750 [2:48:49<11:15:24, 5.76s/it] 20%|█▉ | 1721/8750 [2:49:01<11:21:15, 5.82s/it] 20%|█▉ | 1721/8750 [2:48:55<11:21:16, 5.82s/it] {'loss': 0.4759, 'learning_rate': 1.8578623877306394e-05, 'epoch': 0.2} + 20%|█▉ | 1721/8750 [2:49:01<11:21:15, 5.82s/it] {'loss': 0.4759, 'learning_rate': 1.8578623877306394e-05, 'epoch': 0.2} + 20%|█▉ | 1721/8750 [2:48:55<11:21:16, 5.82s/it] 20%|█▉ | 1722/8750 [2:49:07<11:09:35, 5.72s/it] 20%|█▉ | 1722/8750 [2:49:00<11:09:37, 5.72s/it] {'loss': 0.509, 'learning_rate': 1.8576721086235908e-05, 'epoch': 0.2} + 20%|█▉ | 1722/8750 [2:49:07<11:09:35, 5.72s/it] {'loss': 0.509, 'learning_rate': 1.8576721086235908e-05, 'epoch': 0.2} + 20%|█▉ | 1722/8750 [2:49:00<11:09:37, 5.72s/it] 20%|█▉ | 1723/8750 [2:49:12<11:08:14, 5.71s/it] 20%|█▉ | 1723/8750 [2:49:06<11:08:14, 5.71s/it] {'loss': 0.4782, 'learning_rate': 1.8574817119963145e-05, 'epoch': 0.2} + 20%|█▉ | 1723/8750 [2:49:12<11:08:14, 5.71s/it] {'loss': 0.4782, 'learning_rate': 1.8574817119963145e-05, 'epoch': 0.2} + 20%|█▉ | 1723/8750 [2:49:06<11:08:14, 5.71s/it] 20%|█▉ | 1724/8750 [2:49:11<11:06:44, 5.69s/it] 20%|█▉ | 1724/8750 [2:49:18<11:06:45, 5.69s/it] {'loss': 0.4763, 'learning_rate': 
1.8572911978748993e-05, 'epoch': 0.2} + 20%|█▉ | 1724/8750 [2:49:18<11:06:45, 5.69s/it] {'loss': 0.4763, 'learning_rate': 1.8572911978748993e-05, 'epoch': 0.2} + 20%|█▉ | 1724/8750 [2:49:11<11:06:44, 5.69s/it] 20%|█▉ | 1725/8750 [2:49:24<11:05:53, 5.69s/it] 20%|█▉ | 1725/8750 [2:49:17<11:05:54, 5.69s/it] {'loss': 0.5017, 'learning_rate': 1.8571005662854502e-05, 'epoch': 0.2} + 20%|█▉ | 1725/8750 [2:49:24<11:05:53, 5.69s/it] {'loss': 0.5017, 'learning_rate': 1.8571005662854502e-05, 'epoch': 0.2} + 20%|█▉ | 1725/8750 [2:49:17<11:05:54, 5.69s/it] 20%|█▉ | 1726/8750 [2:49:23<11:10:33, 5.73s/it] 20%|█▉ | 1726/8750 [2:49:29<11:10:34, 5.73s/it] {'loss': 0.4736, 'learning_rate': 1.8569098172540875e-05, 'epoch': 0.2} + 20%|█▉ | 1726/8750 [2:49:29<11:10:34, 5.73s/it] {'loss': 0.4736, 'learning_rate': 1.8569098172540875e-05, 'epoch': 0.2} + 20%|█▉ | 1726/8750 [2:49:23<11:10:33, 5.73s/it] 20%|█▉ | 1727/8750 [2:49:35<11:07:33, 5.70s/it] 20%|█▉ | 1727/8750 [2:49:29<11:07:33, 5.70s/it] {'loss': 0.4701, 'learning_rate': 1.856718950806949e-05, 'epoch': 0.2} + 20%|█▉ | 1727/8750 [2:49:35<11:07:33, 5.70s/it] {'loss': 0.4701, 'learning_rate': 1.856718950806949e-05, 'epoch': 0.2} + 20%|█▉ | 1727/8750 [2:49:29<11:07:33, 5.70s/it] 20%|█▉ | 1728/8750 [2:49:34<11:08:00, 5.71s/it] 20%|█▉ | 1728/8750 [2:49:41<11:08:00, 5.71s/it] {'loss': 0.4726, 'learning_rate': 1.8565279669701862e-05, 'epoch': 0.2} + 20%|█▉ | 1728/8750 [2:49:41<11:08:00, 5.71s/it] {'loss': 0.4726, 'learning_rate': 1.8565279669701862e-05, 'epoch': 0.2} + 20%|█▉ | 1728/8750 [2:49:34<11:08:00, 5.71s/it] 20%|█▉ | 1729/8750 [2:49:47<11:11:15, 5.74s/it] 20%|█▉ | 1729/8750 [2:49:40<11:11:15, 5.74s/it] {'loss': 0.4578, 'learning_rate': 1.8563368657699693e-05, 'epoch': 0.2} + 20%|█▉ | 1729/8750 [2:49:47<11:11:15, 5.74s/it] {'loss': 0.4578, 'learning_rate': 1.8563368657699693e-05, 'epoch': 0.2} + 20%|█▉ | 1729/8750 [2:49:40<11:11:15, 5.74s/it] 20%|█▉ | 1730/8750 [2:49:46<11:06:32, 5.70s/it] 20%|█▉ | 1730/8750 [2:49:52<11:06:33, 
5.70s/it] {'loss': 0.4809, 'learning_rate': 1.856145647232483e-05, 'epoch': 0.2} + 20%|█▉ | 1730/8750 [2:49:52<11:06:33, 5.70s/it] {'loss': 0.4809, 'learning_rate': 1.856145647232483e-05, 'epoch': 0.2} + 20%|█▉ | 1730/8750 [2:49:46<11:06:32, 5.70s/it] 20%|█▉ | 1731/8750 [2:49:51<11:04:21, 5.68s/it] 20%|█▉ | 1731/8750 [2:49:58<11:04:23, 5.68s/it] {'loss': 0.4705, 'learning_rate': 1.8559543113839285e-05, 'epoch': 0.2} + 20%|█▉ | 1731/8750 [2:49:58<11:04:23, 5.68s/it] {'loss': 0.4705, 'learning_rate': 1.8559543113839285e-05, 'epoch': 0.2} + 20%|█▉ | 1731/8750 [2:49:51<11:04:21, 5.68s/it] 20%|█▉ | 1732/8750 [2:49:57<11:07:37, 5.71s/it] 20%|█▉ | 1732/8750 [2:50:04<11:07:37, 5.71s/it] {'loss': 0.5124, 'learning_rate': 1.8557628582505235e-05, 'epoch': 0.2} + 20%|█▉ | 1732/8750 [2:50:04<11:07:37, 5.71s/it] {'loss': 0.5124, 'learning_rate': 1.8557628582505235e-05, 'epoch': 0.2} + 20%|█▉ | 1732/8750 [2:49:57<11:07:37, 5.71s/it] 20%|█▉ | 1733/8750 [2:50:10<11:17:33, 5.79s/it] 20%|█▉ | 1733/8750 [2:50:03<11:17:34, 5.79s/it] {'loss': 0.473, 'learning_rate': 1.8555712878585005e-05, 'epoch': 0.2} + 20%|█▉ | 1733/8750 [2:50:10<11:17:33, 5.79s/it] {'loss': 0.473, 'learning_rate': 1.8555712878585005e-05, 'epoch': 0.2} + 20%|█▉ | 1733/8750 [2:50:03<11:17:34, 5.79s/it] 20%|█▉ | 1734/8750 [2:50:09<11:16:50, 5.79s/it] 20%|█▉ | 1734/8750 [2:50:15<11:16:50, 5.79s/it] {'loss': 0.4715, 'learning_rate': 1.8553796002341098e-05, 'epoch': 0.2} + 20%|█▉ | 1734/8750 [2:50:15<11:16:50, 5.79s/it] {'loss': 0.4715, 'learning_rate': 1.8553796002341098e-05, 'epoch': 0.2} + 20%|█▉ | 1734/8750 [2:50:09<11:16:50, 5.79s/it] 20%|█▉ | 1735/8750 [2:50:21<11:09:49, 5.73s/it] 20%|█▉ | 1735/8750 [2:50:14<11:09:49, 5.73s/it] {'loss': 0.4905, 'learning_rate': 1.8551877954036165e-05, 'epoch': 0.2} + 20%|█▉ | 1735/8750 [2:50:21<11:09:49, 5.73s/it] {'loss': 0.4905, 'learning_rate': 1.8551877954036165e-05, 'epoch': 0.2} + 20%|█▉ | 1735/8750 [2:50:14<11:09:49, 5.73s/it] 20%|█▉ | 1736/8750 [2:50:27<11:09:49, 5.73s/it] 
20%|█▉ | 1736/8750 [2:50:20<11:09:50, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.854995873393302e-05, 'epoch': 0.2} + 20%|█▉ | 1736/8750 [2:50:27<11:09:49, 5.73s/it] {'loss': 0.4924, 'learning_rate': 1.854995873393302e-05, 'epoch': 0.2} + 20%|█▉ | 1736/8750 [2:50:20<11:09:50, 5.73s/it] 20%|█▉ | 1737/8750 [2:50:32<11:09:43, 5.73s/it] 20%|█▉ | 1737/8750 [2:50:26<11:09:45, 5.73s/it] {'loss': 0.4684, 'learning_rate': 1.854803834229464e-05, 'epoch': 0.2} + 20%|█▉ | 1737/8750 [2:50:32<11:09:43, 5.73s/it] {'loss': 0.4684, 'learning_rate': 1.854803834229464e-05, 'epoch': 0.2} + 20%|█▉ | 1737/8750 [2:50:26<11:09:45, 5.73s/it] 20%|█▉ | 1738/8750 [2:50:32<11:06:40, 5.70s/it] 20%|█▉ | 1738/8750 [2:50:38<11:06:41, 5.70s/it] {'loss': 0.4869, 'learning_rate': 1.8546116779384165e-05, 'epoch': 0.2} + 20%|█▉ | 1738/8750 [2:50:38<11:06:41, 5.70s/it] {'loss': 0.4869, 'learning_rate': 1.8546116779384165e-05, 'epoch': 0.2} + 20%|█▉ | 1738/8750 [2:50:32<11:06:40, 5.70s/it] 20%|█▉ | 1739/8750 [2:50:37<11:11:59, 5.75s/it] 20%|█▉ | 1739/8750 [2:50:44<11:12:00, 5.75s/it] {'loss': 0.47, 'learning_rate': 1.8544194045464888e-05, 'epoch': 0.2} + 20%|█▉ | 1739/8750 [2:50:44<11:12:00, 5.75s/it] {'loss': 0.47, 'learning_rate': 1.8544194045464888e-05, 'epoch': 0.2} + 20%|█▉ | 1739/8750 [2:50:37<11:11:59, 5.75s/it] 20%|█▉ | 1740/8750 [2:50:50<11:08:13, 5.72s/it] 20%|█▉ | 1740/8750 [2:50:43<11:08:14, 5.72s/it] {'loss': 0.4872, 'learning_rate': 1.8542270140800266e-05, 'epoch': 0.2} + 20%|█▉ | 1740/8750 [2:50:50<11:08:13, 5.72s/it] {'loss': 0.4872, 'learning_rate': 1.8542270140800266e-05, 'epoch': 0.2} + 20%|█▉ | 1740/8750 [2:50:43<11:08:14, 5.72s/it] 20%|█▉ | 1741/8750 [2:50:55<11:08:38, 5.72s/it] 20%|█▉ | 1741/8750 [2:50:49<11:08:38, 5.72s/it] {'loss': 0.4664, 'learning_rate': 1.854034506565392e-05, 'epoch': 0.2} + 20%|█▉ | 1741/8750 [2:50:55<11:08:38, 5.72s/it] {'loss': 0.4664, 'learning_rate': 1.854034506565392e-05, 'epoch': 0.2} + 20%|█▉ | 1741/8750 [2:50:49<11:08:38, 5.72s/it] 20%|█▉ | 
1742/8750 [2:51:01<11:08:26, 5.72s/it] 20%|█▉ | 1742/8750 [2:50:55<11:08:26, 5.72s/it] {'loss': 0.4862, 'learning_rate': 1.8538418820289628e-05, 'epoch': 0.2} + 20%|█▉ | 1742/8750 [2:51:01<11:08:26, 5.72s/it] {'loss': 0.4862, 'learning_rate': 1.8538418820289628e-05, 'epoch': 0.2} + 20%|█▉ | 1742/8750 [2:50:55<11:08:26, 5.72s/it] 20%|█▉ | 1743/8750 [2:51:07<11:04:31, 5.69s/it] 20%|█▉ | 1743/8750 [2:51:00<11:04:31, 5.69s/it] {'loss': 0.466, 'learning_rate': 1.8536491404971327e-05, 'epoch': 0.2} + 20%|█▉ | 1743/8750 [2:51:07<11:04:31, 5.69s/it] {'loss': 0.466, 'learning_rate': 1.8536491404971327e-05, 'epoch': 0.2} + 20%|█▉ | 1743/8750 [2:51:00<11:04:31, 5.69s/it] 20%|█▉ | 1744/8750 [2:51:12<11:03:51, 5.69s/it] 20%|█▉ | 1744/8750 [2:51:06<11:03:52, 5.69s/it] {'loss': 0.4869, 'learning_rate': 1.8534562819963112e-05, 'epoch': 0.2} + 20%|█▉ | 1744/8750 [2:51:12<11:03:51, 5.69s/it] {'loss': 0.4869, 'learning_rate': 1.8534562819963112e-05, 'epoch': 0.2} + 20%|█▉ | 1744/8750 [2:51:06<11:03:52, 5.69s/it] 20%|█▉ | 1745/8750 [2:51:12<11:14:20, 5.78s/it] 20%|█▉ | 1745/8750 [2:51:18<11:14:21, 5.78s/it] {'loss': 0.477, 'learning_rate': 1.853263306552925e-05, 'epoch': 0.2} + 20%|█▉ | 1745/8750 [2:51:18<11:14:21, 5.78s/it] {'loss': 0.477, 'learning_rate': 1.853263306552925e-05, 'epoch': 0.2} + 20%|█▉ | 1745/8750 [2:51:12<11:14:20, 5.78s/it] 20%|█▉ | 1746/8750 [2:51:17<11:08:51, 5.73s/it] 20%|█▉ | 1746/8750 [2:51:24<11:08:52, 5.73s/it] {'loss': 0.4889, 'learning_rate': 1.8530702141934157e-05, 'epoch': 0.2} + 20%|█▉ | 1746/8750 [2:51:24<11:08:52, 5.73s/it] {'loss': 0.4889, 'learning_rate': 1.8530702141934157e-05, 'epoch': 0.2} + 20%|█▉ | 1746/8750 [2:51:17<11:08:51, 5.73s/it] 20%|█▉ | 1747/8750 [2:51:30<11:20:10, 5.83s/it] 20%|█▉ | 1747/8750 [2:51:23<11:20:11, 5.83s/it] {'loss': 0.4812, 'learning_rate': 1.8528770049442413e-05, 'epoch': 0.2} + 20%|█▉ | 1747/8750 [2:51:30<11:20:10, 5.83s/it] {'loss': 0.4812, 'learning_rate': 1.8528770049442413e-05, 'epoch': 0.2} + 20%|█▉ | 1747/8750 
[2:51:23<11:20:11, 5.83s/it] 20%|█▉ | 1748/8750 [2:51:36<11:13:48, 5.77s/it] 20%|█▉ | 1748/8750 [2:51:29<11:13:49, 5.77s/it] {'loss': 0.5022, 'learning_rate': 1.852683678831876e-05, 'epoch': 0.2} + 20%|█▉ | 1748/8750 [2:51:36<11:13:48, 5.77s/it] {'loss': 0.5022, 'learning_rate': 1.852683678831876e-05, 'epoch': 0.2} + 20%|█▉ | 1748/8750 [2:51:29<11:13:49, 5.77s/it] 20%|█▉ | 1749/8750 [2:51:42<11:27:45, 5.89s/it] 20%|█▉ | 1749/8750 [2:51:35<11:27:46, 5.89s/it] {'loss': 0.477, 'learning_rate': 1.852490235882809e-05, 'epoch': 0.2} + 20%|█▉ | 1749/8750 [2:51:42<11:27:45, 5.89s/it] {'loss': 0.477, 'learning_rate': 1.852490235882809e-05, 'epoch': 0.2} + 20%|█▉ | 1749/8750 [2:51:35<11:27:46, 5.89s/it]4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend...12 + AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1110 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 20%|██ | 1750/8750 [2:51:41<11:19:34, 5.82s/it]3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 20%|██ | 1750/8750 [2:51:47<11:19:34, 5.82s/it]1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4985, 'learning_rate': 1.852296676123547e-05, 'epoch': 0.2} + 20%|██ | 1750/8750 [2:51:47<11:19:34, 5.82s/it] {'loss': 0.4985, 'learning_rate': 1.852296676123547e-05, 'epoch': 0.2} + 20%|██ | 1750/8750 [2:51:41<11:19:34, 5.82s/it] 20%|██ | 1751/8750 [2:51:47<11:15:28, 5.79s/it] 20%|██ | 1751/8750 [2:51:53<11:15:28, 5.79s/it] {'loss': 0.489, 'learning_rate': 1.8521029995806123e-05, 'epoch': 0.2} + 20%|██ | 1751/8750 [2:51:53<11:15:28, 5.79s/it] {'loss': 0.489, 'learning_rate': 1.8521029995806123e-05, 'epoch': 0.2} + 20%|██ | 1751/8750 [2:51:47<11:15:28, 5.79s/it] 20%|██ | 1752/8750 [2:51:59<11:10:57, 5.75s/it] 20%|██ | 1752/8750 [2:51:52<11:10:58, 5.75s/it] {'loss': 0.4941, 'learning_rate': 1.851909206280542e-05, 'epoch': 0.2} + 20%|██ | 1752/8750 [2:51:59<11:10:57, 5.75s/it] {'loss': 0.4941, 'learning_rate': 1.851909206280542e-05, 'epoch': 0.2} + 20%|██ | 1752/8750 [2:51:52<11:10:58, 5.75s/it] 20%|██ | 1753/8750 [2:51:58<11:07:59, 5.73s/it] 20%|██ | 1753/8750 [2:52:05<11:08:00, 5.73s/it] {'loss': 0.4833, 'learning_rate': 1.8517152962498908e-05, 'epoch': 0.2} + 20%|██ | 1753/8750 [2:52:05<11:08:00, 5.73s/it] {'loss': 0.4833, 'learning_rate': 1.8517152962498908e-05, 'epoch': 0.2} + 20%|██ | 1753/8750 [2:51:58<11:07:59, 5.73s/it] 20%|██ | 1754/8750 [2:52:04<11:04:45, 5.70s/it] 20%|██ | 1754/8750 [2:52:10<11:04:46, 5.70s/it] {'loss': 0.4874, 'learning_rate': 1.8515212695152284e-05, 'epoch': 0.2} + 20%|██ | 1754/8750 [2:52:10<11:04:46, 5.70s/it] {'loss': 0.4874, 'learning_rate': 1.8515212695152284e-05, 'epoch': 0.2} + 20%|██ | 1754/8750 [2:52:04<11:04:45, 5.70s/it] 20%|██ | 1755/8750 [2:52:10<11:18:00, 5.82s/it] 20%|██ | 1755/8750 [2:52:16<11:18:01, 5.82s/it] {'loss': 0.4818, 'learning_rate': 1.8513271261031406e-05, 'epoch': 0.2} + 20%|██ | 1755/8750 [2:52:16<11:18:01, 5.82s/it] {'loss': 0.4818, 'learning_rate': 1.8513271261031406e-05, 'epoch': 0.2} + 20%|██ | 1755/8750 [2:52:10<11:18:00, 5.82s/it] 20%|██ | 1756/8750 [2:52:22<11:20:06, 5.83s/it] 20%|██ | 
1756/8750 [2:52:16<11:20:07, 5.83s/it] {'loss': 0.4875, 'learning_rate': 1.8511328660402302e-05, 'epoch': 0.2} + 20%|██ | 1756/8750 [2:52:22<11:20:06, 5.83s/it] {'loss': 0.4875, 'learning_rate': 1.8511328660402302e-05, 'epoch': 0.2} + 20%|██ | 1756/8750 [2:52:16<11:20:07, 5.83s/it] 20%|██ | 1757/8750 [2:52:28<11:11:27, 5.76s/it] 20%|██ | 1757/8750 [2:52:21<11:11:27, 5.76s/it] {'loss': 0.4796, 'learning_rate': 1.850938489353114e-05, 'epoch': 0.2} + 20%|██ | 1757/8750 [2:52:28<11:11:27, 5.76s/it] {'loss': 0.4796, 'learning_rate': 1.850938489353114e-05, 'epoch': 0.2} + 20%|██ | 1757/8750 [2:52:21<11:11:27, 5.76s/it] 20%|██ | 1758/8750 [2:52:33<11:06:38, 5.72s/it] 20%|██ | 1758/8750 [2:52:27<11:06:38, 5.72s/it] {'loss': 0.4783, 'learning_rate': 1.850743996068427e-05, 'epoch': 0.2} + 20%|██ | 1758/8750 [2:52:33<11:06:38, 5.72s/it] {'loss': 0.4783, 'learning_rate': 1.850743996068427e-05, 'epoch': 0.2} + 20%|██ | 1758/8750 [2:52:27<11:06:38, 5.72s/it] 20%|██ | 1759/8750 [2:52:39<11:08:41, 5.74s/it] 20%|██ | 1759/8750 [2:52:33<11:08:41, 5.74s/it] {'loss': 0.4777, 'learning_rate': 1.8505493862128187e-05, 'epoch': 0.2} + 20%|██ | 1759/8750 [2:52:39<11:08:41, 5.74s/it] {'loss': 0.4777, 'learning_rate': 1.8505493862128187e-05, 'epoch': 0.2} + 20%|██ | 1759/8750 [2:52:33<11:08:41, 5.74s/it] 20%|██ | 1760/8750 [2:52:45<11:06:25, 5.72s/it] 20%|██ | 1760/8750 [2:52:38<11:06:25, 5.72s/it] {'loss': 0.478, 'learning_rate': 1.8503546598129547e-05, 'epoch': 0.2} + 20%|██ | 1760/8750 [2:52:45<11:06:25, 5.72s/it] {'loss': 0.478, 'learning_rate': 1.8503546598129547e-05, 'epoch': 0.2} + 20%|██ | 1760/8750 [2:52:38<11:06:25, 5.72s/it] 20%|██ | 1761/8750 [2:52:44<11:06:32, 5.72s/it] 20%|██ | 1761/8750 [2:52:51<11:06:32, 5.72s/it] {'loss': 0.4774, 'learning_rate': 1.8501598168955172e-05, 'epoch': 0.2} + 20%|██ | 1761/8750 [2:52:51<11:06:32, 5.72s/it] {'loss': 0.4774, 'learning_rate': 1.8501598168955172e-05, 'epoch': 0.2} + 20%|██ | 1761/8750 [2:52:44<11:06:32, 5.72s/it] 20%|██ | 1762/8750 
[2:52:56<11:12:16, 5.77s/it] {'loss': 0.4938, 'learning_rate': 1.8499648574872042e-05, 'epoch': 0.2} + 20%|██ | 1762/8750 [2:52:50<11:12:16, 5.77s/it] 20%|██ | 1762/8750 [2:52:56<11:12:16, 5.77s/it] {'loss': 0.4938, 'learning_rate': 1.8499648574872042e-05, 'epoch': 0.2} + 20%|██ | 1762/8750 [2:52:50<11:12:16, 5.77s/it] 20%|██ | 1763/8750 [2:53:02<11:07:23, 5.73s/it] 20%|██ | 1763/8750 [2:52:56<11:07:24, 5.73s/it] {'loss': 0.4838, 'learning_rate': 1.849769781614729e-05, 'epoch': 0.2} + 20%|██ | 1763/8750 [2:53:02<11:07:23, 5.73s/it] {'loss': 0.4838, 'learning_rate': 1.849769781614729e-05, 'epoch': 0.2} + 20%|██ | 1763/8750 [2:52:56<11:07:24, 5.73s/it] 20%|██ | 1764/8750 [2:53:08<11:06:33, 5.72s/it] 20%|██ | 1764/8750 [2:53:01<11:06:33, 5.72s/it] {'loss': 0.4955, 'learning_rate': 1.849574589304822e-05, 'epoch': 0.2} + {'loss': 0.4955, 'learning_rate': 1.849574589304822e-05, 'epoch': 0.2} + 20%|██ | 1764/8750 [2:53:08<11:06:33, 5.72s/it] 20%|██ | 1764/8750 [2:53:01<11:06:33, 5.72s/it] 20%|██ | 1765/8750 [2:53:07<11:04:26, 5.71s/it] 20%|██ | 1765/8750 [2:53:13<11:04:27, 5.71s/it] {'loss': 0.4886, 'learning_rate': 1.8493792805842278e-05, 'epoch': 0.2} + 20%|██ | 1765/8750 [2:53:13<11:04:27, 5.71s/it] {'loss': 0.4886, 'learning_rate': 1.8493792805842278e-05, 'epoch': 0.2} + 20%|██ | 1765/8750 [2:53:07<11:04:26, 5.71s/it] 20%|██ | 1766/8750 [2:53:19<11:10:20, 5.76s/it] 20%|██ | 1766/8750 [2:53:13<11:10:20, 5.76s/it] {'loss': 0.4945, 'learning_rate': 1.8491838554797096e-05, 'epoch': 0.2} + 20%|██ | 1766/8750 [2:53:19<11:10:20, 5.76s/it] {'loss': 0.4945, 'learning_rate': 1.8491838554797096e-05, 'epoch': 0.2} + 20%|██ | 1766/8750 [2:53:13<11:10:20, 5.76s/it] 20%|██ | 1767/8750 [2:53:25<11:08:01, 5.74s/it] 20%|██ | 1767/8750 [2:53:19<11:08:02, 5.74s/it] {'loss': 0.4612, 'learning_rate': 1.8489883140180437e-05, 'epoch': 0.2} + 20%|██ | 1767/8750 [2:53:25<11:08:01, 5.74s/it] {'loss': 0.4612, 'learning_rate': 1.8489883140180437e-05, 'epoch': 0.2} + 20%|██ | 1767/8750 
[2:53:19<11:08:02, 5.74s/it] 20%|██ | 1768/8750 [2:53:31<11:05:16, 5.72s/it] 20%|██ | 1768/8750 [2:53:24<11:05:16, 5.72s/it] {'loss': 0.492, 'learning_rate': 1.848792656226024e-05, 'epoch': 0.2} + 20%|██ | 1768/8750 [2:53:31<11:05:16, 5.72s/it] {'loss': 0.492, 'learning_rate': 1.848792656226024e-05, 'epoch': 0.2} + 20%|██ | 1768/8750 [2:53:24<11:05:16, 5.72s/it] 20%|██ | 1769/8750 [2:53:36<11:02:13, 5.69s/it] 20%|██ | 1769/8750 [2:53:30<11:02:14, 5.69s/it] {'loss': 0.4717, 'learning_rate': 1.8485968821304604e-05, 'epoch': 0.2} + 20%|██ | 1769/8750 [2:53:36<11:02:13, 5.69s/it] {'loss': 0.4717, 'learning_rate': 1.8485968821304604e-05, 'epoch': 0.2} + 20%|██ | 1769/8750 [2:53:30<11:02:14, 5.69s/it] 20%|██ | 1770/8750 [2:53:42<11:01:52, 5.69s/it] 20%|██ | 1770/8750 [2:53:35<11:01:52, 5.69s/it] {'loss': 0.4814, 'learning_rate': 1.848400991758178e-05, 'epoch': 0.2} + 20%|██ | 1770/8750 [2:53:42<11:01:52, 5.69s/it] {'loss': 0.4814, 'learning_rate': 1.848400991758178e-05, 'epoch': 0.2} + 20%|██ | 1770/8750 [2:53:35<11:01:52, 5.69s/it] 20%|██ | 1771/8750 [2:53:48<11:14:30, 5.80s/it] 20%|██ | 1771/8750 [2:53:42<11:14:30, 5.80s/it] {'loss': 0.48, 'learning_rate': 1.8482049851360182e-05, 'epoch': 0.2} + 20%|██ | 1771/8750 [2:53:48<11:14:30, 5.80s/it] {'loss': 0.48, 'learning_rate': 1.8482049851360182e-05, 'epoch': 0.2} + 20%|██ | 1771/8750 [2:53:42<11:14:30, 5.80s/it] 20%|██ | 1772/8750 [2:53:47<11:13:04, 5.79s/it] 20%|██ | 1772/8750 [2:53:54<11:13:07, 5.79s/it] {'loss': 0.4792, 'learning_rate': 1.8480088622908382e-05, 'epoch': 0.2} + 20%|██ | 1772/8750 [2:53:54<11:13:07, 5.79s/it] {'loss': 0.4792, 'learning_rate': 1.8480088622908382e-05, 'epoch': 0.2} + 20%|██ | 1772/8750 [2:53:47<11:13:04, 5.79s/it] 20%|██ | 1773/8750 [2:53:53<11:05:31, 5.72s/it] 20%|██ | 1773/8750 [2:53:59<11:05:32, 5.72s/it] {'loss': 0.4839, 'learning_rate': 1.8478126232495114e-05, 'epoch': 0.2} + 20%|██ | 1773/8750 [2:53:59<11:05:32, 5.72s/it] {'loss': 0.4839, 'learning_rate': 1.8478126232495114e-05, 
'epoch': 0.2} + 20%|██ | 1773/8750 [2:53:53<11:05:31, 5.72s/it] 20%|██ | 1774/8750 [2:54:05<11:09:32, 5.76s/it] 20%|██ | 1774/8750 [2:53:59<11:09:32, 5.76s/it] {'loss': 0.4825, 'learning_rate': 1.8476162680389268e-05, 'epoch': 0.2} + 20%|██ | 1774/8750 [2:54:05<11:09:32, 5.76s/it] {'loss': 0.4825, 'learning_rate': 1.8476162680389268e-05, 'epoch': 0.2} + 20%|██ | 1774/8750 [2:53:59<11:09:32, 5.76s/it] 20%|██ | 1775/8750 [2:54:11<11:10:05, 5.76s/it] 20%|██ | 1775/8750 [2:54:04<11:10:05, 5.76s/it] {'loss': 0.4821, 'learning_rate': 1.847419796685989e-05, 'epoch': 0.2} + 20%|██ | 1775/8750 [2:54:11<11:10:05, 5.76s/it] {'loss': 0.4821, 'learning_rate': 1.847419796685989e-05, 'epoch': 0.2} + 20%|██ | 1775/8750 [2:54:04<11:10:05, 5.76s/it] 20%|██ | 1776/8750 [2:54:17<11:04:35, 5.72s/it] 20%|██ | 1776/8750 [2:54:10<11:04:35, 5.72s/it] {'loss': 0.5115, 'learning_rate': 1.84722320921762e-05, 'epoch': 0.2} + 20%|██ | 1776/8750 [2:54:17<11:04:35, 5.72s/it] {'loss': 0.5115, 'learning_rate': 1.84722320921762e-05, 'epoch': 0.2} + 20%|██ | 1776/8750 [2:54:10<11:04:35, 5.72s/it] 20%|██ | 1777/8750 [2:54:16<11:10:19, 5.77s/it] 20%|██ | 1777/8750 [2:54:22<11:10:19, 5.77s/it] {'loss': 0.46, 'learning_rate': 1.8470265056607557e-05, 'epoch': 0.2} + 20%|██ | 1777/8750 [2:54:22<11:10:19, 5.77s/it] {'loss': 0.46, 'learning_rate': 1.8470265056607557e-05, 'epoch': 0.2} + 20%|██ | 1777/8750 [2:54:16<11:10:19, 5.77s/it] 20%|██ | 1778/8750 [2:54:28<11:07:55, 5.75s/it] 20%|██ | 1778/8750 [2:54:22<11:07:56, 5.75s/it] {'loss': 0.4905, 'learning_rate': 1.8468296860423494e-05, 'epoch': 0.2} + 20%|██ | 1778/8750 [2:54:28<11:07:55, 5.75s/it] {'loss': 0.4905, 'learning_rate': 1.8468296860423494e-05, 'epoch': 0.2} + 20%|██ | 1778/8750 [2:54:22<11:07:56, 5.75s/it] 20%|██ | 1779/8750 [2:54:34<11:05:03, 5.72s/it] 20%|██ | 1779/8750 [2:54:27<11:05:03, 5.72s/it] {'loss': 0.4867, 'learning_rate': 1.8466327503893697e-05, 'epoch': 0.2} + 20%|██ | 1779/8750 [2:54:34<11:05:03, 5.72s/it] {'loss': 0.4867, 
'learning_rate': 1.8466327503893697e-05, 'epoch': 0.2} + 20%|██ | 1779/8750 [2:54:27<11:05:03, 5.72s/it] 20%|██ | 1780/8750 [2:54:33<11:02:00, 5.70s/it] 20%|██ | 1780/8750 [2:54:39<11:02:01, 5.70s/it] {'loss': 0.4755, 'learning_rate': 1.8464356987288012e-05, 'epoch': 0.2} + 20%|██ | 1780/8750 [2:54:39<11:02:01, 5.70s/it] {'loss': 0.4755, 'learning_rate': 1.8464356987288012e-05, 'epoch': 0.2} + 20%|██ | 1780/8750 [2:54:33<11:02:00, 5.70s/it] 20%|██ | 1781/8750 [2:54:45<11:11:14, 5.78s/it] 20%|██ | 1781/8750 [2:54:39<11:11:14, 5.78s/it] {'loss': 0.4933, 'learning_rate': 1.8462385310876444e-05, 'epoch': 0.2} + 20%|██ | 1781/8750 [2:54:45<11:11:14, 5.78s/it] {'loss': 0.4933, 'learning_rate': 1.8462385310876444e-05, 'epoch': 0.2} + 20%|██ | 1781/8750 [2:54:39<11:11:14, 5.78s/it] 20%|██ | 1782/8750 [2:54:51<11:13:08, 5.80s/it] 20%|██ | 1782/8750 [2:54:45<11:13:08, 5.80s/it] {'loss': 0.4886, 'learning_rate': 1.8460412474929154e-05, 'epoch': 0.2} + 20%|██ | 1782/8750 [2:54:51<11:13:08, 5.80s/it] {'loss': 0.4886, 'learning_rate': 1.8460412474929154e-05, 'epoch': 0.2} + 20%|██ | 1782/8750 [2:54:45<11:13:08, 5.80s/it] 20%|██ | 1783/8750 [2:54:51<11:15:21, 5.82s/it] 20%|██ | 1783/8750 [2:54:57<11:15:22, 5.82s/it] {'loss': 0.4768, 'learning_rate': 1.8458438479716466e-05, 'epoch': 0.2} + 20%|██ | 1783/8750 [2:54:57<11:15:22, 5.82s/it] {'loss': 0.4768, 'learning_rate': 1.8458438479716466e-05, 'epoch': 0.2} + 20%|██ | 1783/8750 [2:54:51<11:15:21, 5.82s/it] 20%|██ | 1784/8750 [2:54:57<11:23:15, 5.89s/it] 20%|██ | 1784/8750 [2:55:03<11:23:15, 5.89s/it] {'loss': 0.4779, 'learning_rate': 1.845646332550886e-05, 'epoch': 0.2} + 20%|██ | 1784/8750 [2:55:03<11:23:15, 5.89s/it] {'loss': 0.4779, 'learning_rate': 1.845646332550886e-05, 'epoch': 0.2} + 20%|██ | 1784/8750 [2:54:57<11:23:15, 5.89s/it] 20%|██ | 1785/8750 [2:55:09<11:26:37, 5.91s/it] 20%|██ | 1785/8750 [2:55:03<11:26:38, 5.92s/it] {'loss': 0.4866, 'learning_rate': 1.845448701257698e-05, 'epoch': 0.2} + 20%|██ | 1785/8750 
[2:55:09<11:26:37, 5.91s/it] {'loss': 0.4866, 'learning_rate': 1.845448701257698e-05, 'epoch': 0.2} + 20%|██ | 1785/8750 [2:55:03<11:26:38, 5.92s/it] 20%|██ | 1786/8750 [2:55:15<11:22:22, 5.88s/it] 20%|██ | 1786/8750 [2:55:08<11:22:22, 5.88s/it] {'loss': 0.4603, 'learning_rate': 1.8452509541191625e-05, 'epoch': 0.2} + 20%|██ | 1786/8750 [2:55:15<11:22:22, 5.88s/it] {'loss': 0.4603, 'learning_rate': 1.8452509541191625e-05, 'epoch': 0.2} + 20%|██ | 1786/8750 [2:55:08<11:22:22, 5.88s/it] 20%|██ | 1787/8750 [2:55:21<11:17:34, 5.84s/it] 20%|██ | 1787/8750 [2:55:14<11:17:35, 5.84s/it] {'loss': 0.5003, 'learning_rate': 1.8450530911623747e-05, 'epoch': 0.2} + 20%|██ | 1787/8750 [2:55:21<11:17:34, 5.84s/it] {'loss': 0.5003, 'learning_rate': 1.8450530911623747e-05, 'epoch': 0.2} + 20%|██ | 1787/8750 [2:55:14<11:17:35, 5.84s/it] 20%|██ | 1788/8750 [2:55:20<11:16:59, 5.83s/it] 20%|██ | 1788/8750 [2:55:27<11:17:00, 5.83s/it] {'loss': 0.4798, 'learning_rate': 1.8448551124144467e-05, 'epoch': 0.2} + 20%|██ | 1788/8750 [2:55:27<11:17:00, 5.83s/it] {'loss': 0.4798, 'learning_rate': 1.8448551124144467e-05, 'epoch': 0.2} + 20%|██ | 1788/8750 [2:55:20<11:16:59, 5.83s/it] 20%|██ | 1789/8750 [2:55:26<11:15:46, 5.82s/it] 20%|██ | 1789/8750 [2:55:32<11:15:47, 5.82s/it] {'loss': 0.4767, 'learning_rate': 1.844657017902506e-05, 'epoch': 0.2} + 20%|██ | 1789/8750 [2:55:32<11:15:47, 5.82s/it] {'loss': 0.4767, 'learning_rate': 1.844657017902506e-05, 'epoch': 0.2} + 20%|██ | 1789/8750 [2:55:26<11:15:46, 5.82s/it] 20%|██ | 1790/8750 [2:55:38<11:07:25, 5.75s/it] 20%|██ | 1790/8750 [2:55:31<11:07:25, 5.75s/it] {'loss': 0.4962, 'learning_rate': 1.844458807653696e-05, 'epoch': 0.2} + 20%|██ | 1790/8750 [2:55:38<11:07:25, 5.75s/it] {'loss': 0.4962, 'learning_rate': 1.844458807653696e-05, 'epoch': 0.2} + 20%|██ | 1790/8750 [2:55:31<11:07:25, 5.75s/it] 20%|██ | 1791/8750 [2:55:37<11:00:22, 5.69s/it] 20%|██ | 1791/8750 [2:55:43<11:00:23, 5.69s/it] {'loss': 0.4689, 'learning_rate': 1.8442604816951757e-05, 
'epoch': 0.2} + 20%|██ | 1791/8750 [2:55:43<11:00:23, 5.69s/it] {'loss': 0.4689, 'learning_rate': 1.8442604816951757e-05, 'epoch': 0.2} + 20%|██ | 1791/8750 [2:55:37<11:00:22, 5.69s/it] 20%|██ | 1792/8750 [2:55:49<10:53:56, 5.64s/it] 20%|██ | 1792/8750 [2:55:43<10:53:56, 5.64s/it] {'loss': 0.5121, 'learning_rate': 1.8440620400541202e-05, 'epoch': 0.2} + 20%|██ | 1792/8750 [2:55:49<10:53:56, 5.64s/it] {'loss': 0.5121, 'learning_rate': 1.8440620400541202e-05, 'epoch': 0.2} + 20%|██ | 1792/8750 [2:55:43<10:53:56, 5.64s/it] 20%|██ | 1793/8750 [2:55:49<11:06:34, 5.75s/it] 20%|██ | 1793/8750 [2:55:55<11:06:35, 5.75s/it] {'loss': 0.4729, 'learning_rate': 1.843863482757721e-05, 'epoch': 0.2} + 20%|██ | 1793/8750 [2:55:55<11:06:35, 5.75s/it] {'loss': 0.4729, 'learning_rate': 1.843863482757721e-05, 'epoch': 0.2} + 20%|██ | 1793/8750 [2:55:49<11:06:34, 5.75s/it] 21%|██ | 1794/8750 [2:55:54<11:08:16, 5.76s/it] 21%|██ | 1794/8750 [2:56:01<11:08:17, 5.76s/it] {'loss': 0.4883, 'learning_rate': 1.8436648098331838e-05, 'epoch': 0.21} + 21%|██ | 1794/8750 [2:56:01<11:08:17, 5.76s/it] {'loss': 0.4883, 'learning_rate': 1.8436648098331838e-05, 'epoch': 0.21} + 21%|██ | 1794/8750 [2:55:54<11:08:16, 5.76s/it] 21%|██ | 1795/8750 [2:56:00<11:00:36, 5.70s/it] 21%|██ | 1795/8750 [2:56:06<11:00:36, 5.70s/it] {'loss': 0.4687, 'learning_rate': 1.843466021307732e-05, 'epoch': 0.21} + 21%|██ | 1795/8750 [2:56:06<11:00:36, 5.70s/it] {'loss': 0.4687, 'learning_rate': 1.843466021307732e-05, 'epoch': 0.21} + 21%|██ | 1795/8750 [2:56:00<11:00:36, 5.70s/it] 21%|██ | 1796/8750 [2:56:06<11:03:54, 5.73s/it] 21%|██ | 1796/8750 [2:56:12<11:03:54, 5.73s/it] {'loss': 0.4604, 'learning_rate': 1.8432671172086044e-05, 'epoch': 0.21} + 21%|██ | 1796/8750 [2:56:12<11:03:54, 5.73s/it] {'loss': 0.4604, 'learning_rate': 1.8432671172086044e-05, 'epoch': 0.21} + 21%|██ | 1796/8750 [2:56:06<11:03:54, 5.73s/it] 21%|██ | 1797/8750 [2:56:18<11:03:12, 5.72s/it] 21%|██ | 1797/8750 [2:56:11<11:03:13, 5.72s/it] {'loss': 
0.4798, 'learning_rate': 1.8430680975630545e-05, 'epoch': 0.21} + 21%|██ | 1797/8750 [2:56:18<11:03:12, 5.72s/it] {'loss': 0.4798, 'learning_rate': 1.8430680975630545e-05, 'epoch': 0.21} + 21%|██ | 1797/8750 [2:56:11<11:03:13, 5.72s/it] 21%|██ | 1798/8750 [2:56:24<11:01:54, 5.71s/it] 21%|██ | 1798/8750 [2:56:17<11:01:55, 5.71s/it] {'loss': 0.468, 'learning_rate': 1.8428689623983526e-05, 'epoch': 0.21} + 21%|██ | 1798/8750 [2:56:24<11:01:54, 5.71s/it] {'loss': 0.468, 'learning_rate': 1.8428689623983526e-05, 'epoch': 0.21} + 21%|██ | 1798/8750 [2:56:17<11:01:55, 5.71s/it] 21%|██ | 1799/8750 [2:56:23<10:57:23, 5.67s/it] 21%|██ | 1799/8750 [2:56:29<10:57:23, 5.67s/it] {'loss': 0.4735, 'learning_rate': 1.8426697117417848e-05, 'epoch': 0.21} + 21%|██ | 1799/8750 [2:56:29<10:57:23, 5.67s/it] {'loss': 0.4735, 'learning_rate': 1.8426697117417848e-05, 'epoch': 0.21} + 21%|██ | 1799/8750 [2:56:23<10:57:23, 5.67s/it]3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1210 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 15 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...9 AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend... + 21%|██ | 1800/8750 [2:56:28<10:58:37, 5.69s/it]6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 21%|██ | 1800/8750 [2:56:35<10:58:37, 5.69s/it]11 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4889, 'learning_rate': 1.8424703456206533e-05, 'epoch': 0.21} + 21%|██ | 1800/8750 [2:56:35<10:58:37, 5.69s/it] {'loss': 0.4889, 'learning_rate': 1.8424703456206533e-05, 'epoch': 0.21} + 21%|██ | 1800/8750 [2:56:28<10:58:37, 5.69s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 21%|██ | 1801/8750 [2:56:47<18:27:55, 9.57s/it] 21%|██ | 1801/8750 [2:56:53<18:27:56, 9.57s/it] {'loss': 0.4724, 'learning_rate': 1.842270864062275e-05, 'epoch': 0.21} + 21%|██ | 1801/8750 [2:56:53<18:27:56, 9.57s/it] {'loss': 0.4724, 'learning_rate': 1.842270864062275e-05, 'epoch': 0.21} + 21%|██ | 1801/8750 [2:56:47<18:27:55, 9.57s/it] 21%|██ | 1802/8750 [2:56:59<16:11:15, 8.39s/it] 21%|██ | 1802/8750 [2:56:53<16:11:16, 8.39s/it] {'loss': 0.4837, 'learning_rate': 1.8420712670939837e-05, 'epoch': 0.21} + 21%|██ | 1802/8750 [2:56:53<16:11:16, 8.39s/it] {'loss': 0.4837, 'learning_rate': 1.8420712670939837e-05, 'epoch': 0.21} + 21%|██ | 1802/8750 [2:56:59<16:11:15, 8.39s/it] 21%|██ | 1803/8750 [2:56:58<14:38:45, 7.59s/it] 21%|██ | 1803/8750 [2:57:05<14:38:46, 7.59s/it] {'loss': 0.4789, 'learning_rate': 1.8418715547431283e-05, 'epoch': 0.21} + 21%|██ | 1803/8750 [2:57:05<14:38:46, 7.59s/it] {'loss': 0.4789, 'learning_rate': 1.8418715547431283e-05, 'epoch': 0.21} + 21%|██ | 1803/8750 [2:56:58<14:38:45, 7.59s/it] 21%|██ | 1804/8750 [2:57:11<13:45:18, 7.13s/it] 21%|██ | 1804/8750 [2:57:04<13:45:17, 7.13s/it] {'loss': 0.4703, 'learning_rate': 1.8416717270370744e-05, 'epoch': 0.21} + 21%|██ | 1804/8750 [2:57:11<13:45:18, 7.13s/it] {'loss': 0.4703, 'learning_rate': 1.8416717270370744e-05, 'epoch': 0.21} + 21%|██ | 1804/8750 [2:57:04<13:45:17, 7.13s/it] 21%|██ | 1805/8750 [2:57:10<12:51:58, 6.67s/it] 21%|██ | 1805/8750 [2:57:16<12:51:58, 6.67s/it] {'loss': 0.4746, 'learning_rate': 1.841471784003203e-05, 'epoch': 0.21} + 21%|██ | 1805/8750 [2:57:16<12:51:58, 6.67s/it] {'loss': 0.4746, 'learning_rate': 1.841471784003203e-05, 'epoch': 0.21} + 21%|██ | 1805/8750 [2:57:10<12:51:58, 6.67s/it] 21%|██ | 1806/8750 [2:57:16<12:25:30, 6.44s/it] 21%|██ | 1806/8750 [2:57:22<12:25:30, 6.44s/it] {'loss': 0.4867, 'learning_rate': 1.84127172566891e-05, 'epoch': 0.21} + 21%|██ | 1806/8750 [2:57:22<12:25:30, 6.44s/it] {'loss': 0.4867, 'learning_rate': 1.84127172566891e-05, 
'epoch': 0.21} + 21%|██ | 1806/8750 [2:57:16<12:25:30, 6.44s/it] 21%|██ | 1807/8750 [2:57:22<12:04:50, 6.26s/it] 21%|██ | 1807/8750 [2:57:28<12:04:50, 6.26s/it] {'loss': 0.4663, 'learning_rate': 1.841071552061608e-05, 'epoch': 0.21} + 21%|██ | 1807/8750 [2:57:28<12:04:50, 6.26s/it] {'loss': 0.4663, 'learning_rate': 1.841071552061608e-05, 'epoch': 0.21} + 21%|██ | 1807/8750 [2:57:22<12:04:50, 6.26s/it] 21%|██ | 1808/8750 [2:57:28<11:56:03, 6.19s/it] 21%|██ | 1808/8750 [2:57:34<11:56:03, 6.19s/it] {'loss': 0.5056, 'learning_rate': 1.8408712632087256e-05, 'epoch': 0.21} + 21%|██ | 1808/8750 [2:57:34<11:56:03, 6.19s/it] {'loss': 0.5056, 'learning_rate': 1.8408712632087256e-05, 'epoch': 0.21} + 21%|██ | 1808/8750 [2:57:28<11:56:03, 6.19s/it] 21%|██ | 1809/8750 [2:57:33<11:37:33, 6.03s/it] 21%|██ | 1809/8750 [2:57:40<11:37:33, 6.03s/it] {'loss': 0.4712, 'learning_rate': 1.840670859137707e-05, 'epoch': 0.21} + 21%|██ | 1809/8750 [2:57:40<11:37:33, 6.03s/it] {'loss': 0.4712, 'learning_rate': 1.840670859137707e-05, 'epoch': 0.21} + 21%|██ | 1809/8750 [2:57:33<11:37:33, 6.03s/it] 21%|██ | 1810/8750 [2:57:39<11:26:14, 5.93s/it] 21%|██ | 1810/8750 [2:57:46<11:26:15, 5.93s/it] {'loss': 0.4915, 'learning_rate': 1.840470339876011e-05, 'epoch': 0.21} + 21%|██ | 1810/8750 [2:57:46<11:26:15, 5.93s/it] {'loss': 0.4915, 'learning_rate': 1.840470339876011e-05, 'epoch': 0.21} + 21%|██ | 1810/8750 [2:57:39<11:26:14, 5.93s/it] 21%|██ | 1811/8750 [2:57:45<11:14:28, 5.83s/it] 21%|██ | 1811/8750 [2:57:51<11:14:28, 5.83s/it] {'loss': 0.4752, 'learning_rate': 1.8402697054511145e-05, 'epoch': 0.21} + 21%|██ | 1811/8750 [2:57:51<11:14:28, 5.83s/it] {'loss': 0.4752, 'learning_rate': 1.8402697054511145e-05, 'epoch': 0.21} + 21%|██ | 1811/8750 [2:57:45<11:14:28, 5.83s/it] 21%|██ | 1812/8750 [2:57:50<11:10:52, 5.80s/it] 21%|██ | 1812/8750 [2:57:57<11:10:53, 5.80s/it] {'loss': 0.4876, 'learning_rate': 1.8400689558905083e-05, 'epoch': 0.21} + 21%|██ | 1812/8750 [2:57:57<11:10:53, 5.80s/it] {'loss': 
0.4876, 'learning_rate': 1.8400689558905083e-05, 'epoch': 0.21} + 21%|██ | 1812/8750 [2:57:50<11:10:52, 5.80s/it] 21%|██ | 1813/8750 [2:57:56<11:09:25, 5.79s/it] 21%|██ | 1813/8750 [2:58:03<11:09:25, 5.79s/it] {'loss': 0.4676, 'learning_rate': 1.8398680912216997e-05, 'epoch': 0.21} + 21%|██ | 1813/8750 [2:58:03<11:09:25, 5.79s/it] {'loss': 0.4676, 'learning_rate': 1.8398680912216997e-05, 'epoch': 0.21} + 21%|██ | 1813/8750 [2:57:56<11:09:25, 5.79s/it] 21%|██ | 1814/8750 [2:58:08<11:08:37, 5.78s/it] 21%|██ | 1814/8750 [2:58:02<11:08:38, 5.78s/it] {'loss': 0.497, 'learning_rate': 1.8396671114722112e-05, 'epoch': 0.21} + 21%|██ | 1814/8750 [2:58:08<11:08:37, 5.78s/it] {'loss': 0.497, 'learning_rate': 1.8396671114722112e-05, 'epoch': 0.21} + 21%|██ | 1814/8750 [2:58:02<11:08:38, 5.78s/it] 21%|██ | 1815/8750 [2:58:08<11:13:36, 5.83s/it] 21%|██ | 1815/8750 [2:58:14<11:13:37, 5.83s/it] {'loss': 0.4645, 'learning_rate': 1.8394660166695822e-05, 'epoch': 0.21} + 21%|██ | 1815/8750 [2:58:14<11:13:37, 5.83s/it] {'loss': 0.4645, 'learning_rate': 1.8394660166695822e-05, 'epoch': 0.21} + 21%|██ | 1815/8750 [2:58:08<11:13:36, 5.83s/it] 21%|██ | 1816/8750 [2:58:20<11:04:31, 5.75s/it] 21%|██ | 1816/8750 [2:58:13<11:04:32, 5.75s/it] {'loss': 0.5071, 'learning_rate': 1.8392648068413667e-05, 'epoch': 0.21} + 21%|██ | 1816/8750 [2:58:20<11:04:31, 5.75s/it] {'loss': 0.5071, 'learning_rate': 1.8392648068413667e-05, 'epoch': 0.21} + 21%|██ | 1816/8750 [2:58:13<11:04:32, 5.75s/it] 21%|██ | 1817/8750 [2:58:19<10:59:22, 5.71s/it] 21%|██ | 1817/8750 [2:58:26<10:59:22, 5.71s/it] {'loss': 0.4908, 'learning_rate': 1.8390634820151353e-05, 'epoch': 0.21} + 21%|██ | 1817/8750 [2:58:26<10:59:22, 5.71s/it] {'loss': 0.4908, 'learning_rate': 1.8390634820151353e-05, 'epoch': 0.21} + 21%|██ | 1817/8750 [2:58:19<10:59:22, 5.71s/it] 21%|██ | 1818/8750 [2:58:25<11:09:12, 5.79s/it] 21%|██ | 1818/8750 [2:58:32<11:09:12, 5.79s/it] {'loss': 0.4663, 'learning_rate': 1.8388620422184738e-05, 'epoch': 0.21} + 21%|██ 
| 1818/8750 [2:58:32<11:09:12, 5.79s/it] {'loss': 0.4663, 'learning_rate': 1.8388620422184738e-05, 'epoch': 0.21} + 21%|██ | 1818/8750 [2:58:25<11:09:12, 5.79s/it] 21%|██ | 1819/8750 [2:58:37<11:00:20, 5.72s/it] 21%|██ | 1819/8750 [2:58:31<11:00:21, 5.72s/it] {'loss': 0.4705, 'learning_rate': 1.8386604874789836e-05, 'epoch': 0.21} + 21%|██ | 1819/8750 [2:58:37<11:00:20, 5.72s/it] {'loss': 0.4705, 'learning_rate': 1.8386604874789836e-05, 'epoch': 0.21} + 21%|██ | 1819/8750 [2:58:31<11:00:21, 5.72s/it] 21%|██ | 1820/8750 [2:58:43<10:58:45, 5.70s/it] 21%|██ | 1820/8750 [2:58:36<10:58:45, 5.70s/it] {'loss': 0.4809, 'learning_rate': 1.8384588178242828e-05, 'epoch': 0.21} + 21%|██ | 1820/8750 [2:58:43<10:58:45, 5.70s/it] {'loss': 0.4809, 'learning_rate': 1.8384588178242828e-05, 'epoch': 0.21} + 21%|██ | 1820/8750 [2:58:36<10:58:45, 5.70s/it] 21%|██ | 1821/8750 [2:58:49<11:00:33, 5.72s/it] 21%|██ | 1821/8750 [2:58:42<11:00:33, 5.72s/it] {'loss': 0.5041, 'learning_rate': 1.8382570332820045e-05, 'epoch': 0.21} + 21%|██ | 1821/8750 [2:58:49<11:00:33, 5.72s/it] {'loss': 0.5041, 'learning_rate': 1.8382570332820045e-05, 'epoch': 0.21} + 21%|██ | 1821/8750 [2:58:42<11:00:33, 5.72s/it] 21%|██ | 1822/8750 [2:58:48<10:58:03, 5.70s/it] 21%|██ | 1822/8750 [2:58:54<10:58:08, 5.70s/it] {'loss': 0.472, 'learning_rate': 1.8380551338797974e-05, 'epoch': 0.21} + 21%|██ | 1822/8750 [2:58:54<10:58:08, 5.70s/it] {'loss': 0.472, 'learning_rate': 1.8380551338797974e-05, 'epoch': 0.21} + 21%|██ | 1822/8750 [2:58:48<10:58:03, 5.70s/it] 21%|██ | 1823/8750 [2:58:53<10:55:39, 5.68s/it] 21%|██ | 1823/8750 [2:59:00<10:55:38, 5.68s/it] {'loss': 0.4672, 'learning_rate': 1.8378531196453265e-05, 'epoch': 0.21} + 21%|██ | 1823/8750 [2:59:00<10:55:38, 5.68s/it] {'loss': 0.4672, 'learning_rate': 1.8378531196453265e-05, 'epoch': 0.21} + 21%|██ | 1823/8750 [2:58:53<10:55:39, 5.68s/it] 21%|██ | 1824/8750 [2:58:59<10:58:08, 5.70s/it] 21%|██ | 1824/8750 [2:59:06<10:58:07, 5.70s/it] {'loss': 0.498, 
'learning_rate': 1.837650990606272e-05, 'epoch': 0.21} + 21%|██ | 1824/8750 [2:58:59<10:58:08, 5.70s/it]{'loss': 0.498, 'learning_rate': 1.837650990606272e-05, 'epoch': 0.21} + 21%|██ | 1824/8750 [2:59:06<10:58:07, 5.70s/it] 21%|██ | 1825/8750 [2:59:05<11:03:10, 5.75s/it] 21%|██ | 1825/8750 [2:59:11<11:03:09, 5.75s/it] {'loss': 0.4822, 'learning_rate': 1.8374487467903303e-05, 'epoch': 0.21} + 21%|██ | 1825/8750 [2:59:11<11:03:09, 5.75s/it] {'loss': 0.4822, 'learning_rate': 1.8374487467903303e-05, 'epoch': 0.21} + 21%|██ | 1825/8750 [2:59:05<11:03:10, 5.75s/it] 21%|██ | 1826/8750 [2:59:10<10:53:50, 5.67s/it] 21%|██ | 1826/8750 [2:59:17<10:53:49, 5.67s/it] {'loss': 0.5021, 'learning_rate': 1.8372463882252133e-05, 'epoch': 0.21} + 21%|██ | 1826/8750 [2:59:17<10:53:49, 5.67s/it] {'loss': 0.5021, 'learning_rate': 1.8372463882252133e-05, 'epoch': 0.21} + 21%|██ | 1826/8750 [2:59:10<10:53:50, 5.67s/it] 21%|██ | 1827/8750 [2:59:16<10:57:37, 5.70s/it] 21%|██ | 1827/8750 [2:59:23<10:57:37, 5.70s/it] {'loss': 0.4562, 'learning_rate': 1.8370439149386484e-05, 'epoch': 0.21} + 21%|██ | 1827/8750 [2:59:23<10:57:37, 5.70s/it] {'loss': 0.4562, 'learning_rate': 1.8370439149386484e-05, 'epoch': 0.21} + 21%|██ | 1827/8750 [2:59:16<10:57:37, 5.70s/it] 21%|██ | 1828/8750 [2:59:22<10:54:22, 5.67s/it] 21%|██ | 1828/8750 [2:59:28<10:54:22, 5.67s/it] {'loss': 0.4936, 'learning_rate': 1.8368413269583795e-05, 'epoch': 0.21} + 21%|██ | 1828/8750 [2:59:28<10:54:22, 5.67s/it] {'loss': 0.4936, 'learning_rate': 1.8368413269583795e-05, 'epoch': 0.21} + 21%|██ | 1828/8750 [2:59:22<10:54:22, 5.67s/it] 21%|██ | 1829/8750 [2:59:28<10:59:07, 5.71s/it] 21%|██ | 1829/8750 [2:59:34<10:59:06, 5.71s/it] {'loss': 0.4606, 'learning_rate': 1.8366386243121654e-05, 'epoch': 0.21} + 21%|██ | 1829/8750 [2:59:34<10:59:06, 5.71s/it] {'loss': 0.4606, 'learning_rate': 1.8366386243121654e-05, 'epoch': 0.21} + 21%|██ | 1829/8750 [2:59:28<10:59:07, 5.71s/it] 21%|██ | 1830/8750 [2:59:40<11:00:11, 5.72s/it] 21%|██ | 
1830/8750 [2:59:33<11:00:12, 5.72s/it] {'loss': 0.4959, 'learning_rate': 1.8364358070277807e-05, 'epoch': 0.21} + 21%|██ | 1830/8750 [2:59:40<11:00:11, 5.72s/it] {'loss': 0.4959, 'learning_rate': 1.8364358070277807e-05, 'epoch': 0.21} + 21%|██ | 1830/8750 [2:59:33<11:00:12, 5.72s/it] 21%|██ | 1831/8750 [2:59:39<11:01:50, 5.74s/it] 21%|██ | 1831/8750 [2:59:46<11:01:50, 5.74s/it] {'loss': 0.4866, 'learning_rate': 1.836232875133016e-05, 'epoch': 0.21} + 21%|██ | 1831/8750 [2:59:46<11:01:50, 5.74s/it] {'loss': 0.4866, 'learning_rate': 1.836232875133016e-05, 'epoch': 0.21} + 21%|██ | 1831/8750 [2:59:39<11:01:50, 5.74s/it] 21%|██ | 1832/8750 [2:59:45<10:58:07, 5.71s/it] 21%|██ | 1832/8750 [2:59:51<10:58:06, 5.71s/it] {'loss': 0.4869, 'learning_rate': 1.8360298286556774e-05, 'epoch': 0.21} + 21%|██ | 1832/8750 [2:59:51<10:58:06, 5.71s/it] {'loss': 0.4869, 'learning_rate': 1.8360298286556774e-05, 'epoch': 0.21} + 21%|██ | 1832/8750 [2:59:45<10:58:07, 5.71s/it] 21%|██ | 1833/8750 [2:59:51<11:03:55, 5.76s/it] 21%|██ | 1833/8750 [2:59:57<11:03:55, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.8358266676235872e-05, 'epoch': 0.21} + 21%|██ | 1833/8750 [2:59:57<11:03:55, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.8358266676235872e-05, 'epoch': 0.21} + 21%|██ | 1833/8750 [2:59:51<11:03:55, 5.76s/it] 21%|██ | 1834/8750 [3:00:03<10:55:43, 5.69s/it] 21%|██ | 1834/8750 [2:59:56<10:55:43, 5.69s/it] {'loss': 0.5119, 'learning_rate': 1.8356233920645822e-05, 'epoch': 0.21} + 21%|██ | 1834/8750 [3:00:03<10:55:43, 5.69s/it] {'loss': 0.5119, 'learning_rate': 1.8356233920645822e-05, 'epoch': 0.21} + 21%|██ | 1834/8750 [2:59:56<10:55:43, 5.69s/it] 21%|██ | 1835/8750 [3:00:08<10:56:59, 5.70s/it] 21%|██ | 1835/8750 [3:00:02<10:57:00, 5.70s/it] {'loss': 0.4823, 'learning_rate': 1.8354200020065168e-05, 'epoch': 0.21} + 21%|██ | 1835/8750 [3:00:08<10:56:59, 5.70s/it] {'loss': 0.4823, 'learning_rate': 1.8354200020065168e-05, 'epoch': 0.21} + 21%|██ | 1835/8750 [3:00:02<10:57:00, 5.70s/it] 21%|██ 
| 1836/8750 [3:00:08<10:56:32, 5.70s/it] 21%|██ | 1836/8750 [3:00:14<10:56:32, 5.70s/it] {'loss': 0.4872, 'learning_rate': 1.8352164974772592e-05, 'epoch': 0.21} + 21%|██ | 1836/8750 [3:00:14<10:56:32, 5.70s/it] {'loss': 0.4872, 'learning_rate': 1.8352164974772592e-05, 'epoch': 0.21} + 21%|██ | 1836/8750 [3:00:08<10:56:32, 5.70s/it] 21%|██ | 1837/8750 [3:00:13<11:02:09, 5.75s/it] 21%|██ | 1837/8750 [3:00:20<11:02:09, 5.75s/it] {'loss': 0.4604, 'learning_rate': 1.8350128785046943e-05, 'epoch': 0.21} + 21%|██ | 1837/8750 [3:00:20<11:02:09, 5.75s/it] {'loss': 0.4604, 'learning_rate': 1.8350128785046943e-05, 'epoch': 0.21} + 21%|██ | 1837/8750 [3:00:13<11:02:09, 5.75s/it] 21%|██ | 1838/8750 [3:00:26<11:00:42, 5.74s/it] 21%|██ | 1838/8750 [3:00:19<11:00:43, 5.74s/it] {'loss': 0.4571, 'learning_rate': 1.8348091451167224e-05, 'epoch': 0.21} + 21%|██ | 1838/8750 [3:00:26<11:00:42, 5.74s/it] {'loss': 0.4571, 'learning_rate': 1.8348091451167224e-05, 'epoch': 0.21} + 21%|██ | 1838/8750 [3:00:19<11:00:43, 5.74s/it] 21%|██ | 1839/8750 [3:00:31<10:57:52, 5.71s/it] 21%|██ | 1839/8750 [3:00:25<10:57:52, 5.71s/it] {'loss': 0.5112, 'learning_rate': 1.8346052973412593e-05, 'epoch': 0.21} + 21%|██ | 1839/8750 [3:00:31<10:57:52, 5.71s/it] {'loss': 0.5112, 'learning_rate': 1.8346052973412593e-05, 'epoch': 0.21} + 21%|██ | 1839/8750 [3:00:25<10:57:52, 5.71s/it] 21%|██ | 1840/8750 [3:00:30<10:53:58, 5.68s/it] 21%|██ | 1840/8750 [3:00:37<10:53:58, 5.68s/it] {'loss': 0.4873, 'learning_rate': 1.834401335206237e-05, 'epoch': 0.21} + 21%|██ | 1840/8750 [3:00:37<10:53:58, 5.68s/it] {'loss': 0.4873, 'learning_rate': 1.834401335206237e-05, 'epoch': 0.21} + 21%|██ | 1840/8750 [3:00:30<10:53:58, 5.68s/it] 21%|██ | 1841/8750 [3:00:43<10:56:08, 5.70s/it] 21%|██ | 1841/8750 [3:00:36<10:56:09, 5.70s/it] {'loss': 0.4778, 'learning_rate': 1.8341972587396032e-05, 'epoch': 0.21} + 21%|██ | 1841/8750 [3:00:43<10:56:08, 5.70s/it] {'loss': 0.4778, 'learning_rate': 1.8341972587396032e-05, 'epoch': 0.21} + 
21%|██ | 1841/8750 [3:00:36<10:56:09, 5.70s/it] 21%|██ | 1842/8750 [3:00:48<10:52:11, 5.66s/it] 21%|██ | 1842/8750 [3:00:42<10:52:11, 5.66s/it] {'loss': 0.475, 'learning_rate': 1.8339930679693202e-05, 'epoch': 0.21} + 21%|██ | 1842/8750 [3:00:48<10:52:11, 5.66s/it] {'loss': 0.475, 'learning_rate': 1.8339930679693202e-05, 'epoch': 0.21} + 21%|██ | 1842/8750 [3:00:42<10:52:11, 5.66s/it] 21%|██ | 1843/8750 [3:00:54<11:00:31, 5.74s/it] 21%|██ | 1843/8750 [3:00:48<11:00:31, 5.74s/it] {'loss': 0.4649, 'learning_rate': 1.8337887629233672e-05, 'epoch': 0.21} + 21%|██ | 1843/8750 [3:00:54<11:00:31, 5.74s/it] {'loss': 0.4649, 'learning_rate': 1.8337887629233672e-05, 'epoch': 0.21} + 21%|██ | 1843/8750 [3:00:48<11:00:31, 5.74s/it] 21%|██ | 1844/8750 [3:01:00<10:56:52, 5.71s/it] 21%|██ | 1844/8750 [3:00:53<10:56:53, 5.71s/it] {'loss': 0.5096, 'learning_rate': 1.833584343629738e-05, 'epoch': 0.21} + 21%|██ | 1844/8750 [3:01:00<10:56:52, 5.71s/it] {'loss': 0.5096, 'learning_rate': 1.833584343629738e-05, 'epoch': 0.21} + 21%|██ | 1844/8750 [3:00:53<10:56:53, 5.71s/it] 21%|██ | 1845/8750 [3:01:06<10:59:38, 5.73s/it] 21%|██ | 1845/8750 [3:00:59<10:59:38, 5.73s/it] {'loss': 0.4945, 'learning_rate': 1.8333798101164433e-05, 'epoch': 0.21} + 21%|██ | 1845/8750 [3:01:06<10:59:38, 5.73s/it] {'loss': 0.4945, 'learning_rate': 1.8333798101164433e-05, 'epoch': 0.21} + 21%|██ | 1845/8750 [3:00:59<10:59:38, 5.73s/it] 21%|██ | 1846/8750 [3:01:05<10:54:16, 5.69s/it] 21%|██ | 1846/8750 [3:01:11<10:54:16, 5.69s/it] {'loss': 0.4645, 'learning_rate': 1.833175162411508e-05, 'epoch': 0.21} + 21%|██ | 1846/8750 [3:01:11<10:54:16, 5.69s/it] {'loss': 0.4645, 'learning_rate': 1.833175162411508e-05, 'epoch': 0.21} + 21%|██ | 1846/8750 [3:01:05<10:54:16, 5.69s/it] 21%|██ | 1847/8750 [3:01:10<10:50:00, 5.65s/it] 21%|██ | 1847/8750 [3:01:17<10:50:01, 5.65s/it] {'loss': 0.4822, 'learning_rate': 1.8329704005429745e-05, 'epoch': 0.21} + 21%|██ | 1847/8750 [3:01:17<10:50:01, 5.65s/it] {'loss': 0.4822, 
'learning_rate': 1.8329704005429745e-05, 'epoch': 0.21} + 21%|██ | 1847/8750 [3:01:10<10:50:00, 5.65s/it] 21%|██ | 1848/8750 [3:01:22<10:50:36, 5.66s/it] 21%|██ | 1848/8750 [3:01:16<10:50:37, 5.66s/it] {'loss': 0.4826, 'learning_rate': 1.8327655245388986e-05, 'epoch': 0.21} + 21%|██ | 1848/8750 [3:01:22<10:50:36, 5.66s/it] {'loss': 0.4826, 'learning_rate': 1.8327655245388986e-05, 'epoch': 0.21} + 21%|██ | 1848/8750 [3:01:16<10:50:37, 5.66s/it] 21%|██ | 1849/8750 [3:01:28<10:47:08, 5.63s/it] 21%|██ | 1849/8750 [3:01:21<10:47:09, 5.63s/it] {'loss': 0.4994, 'learning_rate': 1.8325605344273536e-05, 'epoch': 0.21} + 21%|██ | 1849/8750 [3:01:28<10:47:08, 5.63s/it] {'loss': 0.4994, 'learning_rate': 1.8325605344273536e-05, 'epoch': 0.21} + 21%|██ | 1849/8750 [3:01:21<10:47:09, 5.63s/it]9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 21%|██ | 1850/8750 [3:01:34<10:50:20, 5.66s/it]14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +62 7 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... 
+ 21%|██ | 1850/8750 [3:01:27<10:50:22, 5.66s/it] {'loss': 0.4873, 'learning_rate': 1.8323554302364273e-05, 'epoch': 0.21} + 21%|██ | 1850/8750 [3:01:34<10:50:20, 5.66s/it] {'loss': 0.4873, 'learning_rate': 1.8323554302364273e-05, 'epoch': 0.21} + 21%|██ | 1850/8750 [3:01:27<10:50:22, 5.66s/it] 21%|██ | 1851/8750 [3:01:40<11:00:09, 5.74s/it] 21%|██ | 1851/8750 [3:01:33<11:00:10, 5.74s/it] {'loss': 0.4714, 'learning_rate': 1.832150211994224e-05, 'epoch': 0.21} + 21%|██ | 1851/8750 [3:01:40<11:00:09, 5.74s/it] {'loss': 0.4714, 'learning_rate': 1.832150211994224e-05, 'epoch': 0.21} + 21%|██ | 1851/8750 [3:01:33<11:00:10, 5.74s/it] 21%|██ | 1852/8750 [3:01:39<10:55:10, 5.70s/it] 21%|██ | 1852/8750 [3:01:45<10:55:12, 5.70s/it] {'loss': 0.4985, 'learning_rate': 1.8319448797288628e-05, 'epoch': 0.21} + 21%|██ | 1852/8750 [3:01:45<10:55:12, 5.70s/it] {'loss': 0.4985, 'learning_rate': 1.8319448797288628e-05, 'epoch': 0.21} + 21%|██ | 1852/8750 [3:01:39<10:55:10, 5.70s/it] 21%|██ | 1853/8750 [3:01:45<11:04:20, 5.78s/it] 21%|██ | 1853/8750 [3:01:51<11:04:21, 5.78s/it] {'loss': 0.4863, 'learning_rate': 1.831739433468479e-05, 'epoch': 0.21} + 21%|██ | 1853/8750 [3:01:51<11:04:21, 5.78s/it] {'loss': 0.4863, 'learning_rate': 1.831739433468479e-05, 'epoch': 0.21} + 21%|██ | 1853/8750 [3:01:45<11:04:20, 5.78s/it] 21%|██ | 1854/8750 [3:01:57<10:59:23, 5.74s/it] 21%|██ | 1854/8750 [3:01:50<10:59:23, 5.74s/it] {'loss': 0.4833, 'learning_rate': 1.831533873241223e-05, 'epoch': 0.21} + 21%|██ | 1854/8750 [3:01:57<10:59:23, 5.74s/it] {'loss': 0.4833, 'learning_rate': 1.831533873241223e-05, 'epoch': 0.21} + 21%|██ | 1854/8750 [3:01:50<10:59:23, 5.74s/it] 21%|██ | 1855/8750 [3:01:56<10:57:01, 5.72s/it] 21%|██ | 1855/8750 [3:02:03<10:57:01, 5.72s/it] {'loss': 0.4667, 'learning_rate': 1.831328199075262e-05, 'epoch': 0.21} + 21%|██ | 1855/8750 [3:02:03<10:57:01, 5.72s/it] {'loss': 0.4667, 'learning_rate': 1.831328199075262e-05, 'epoch': 0.21} + 21%|██ | 1855/8750 [3:01:56<10:57:01, 5.72s/it] 
21%|██ | 1856/8750 [3:02:08<10:53:37, 5.69s/it] 21%|██ | 1856/8750 [3:02:02<10:53:37, 5.69s/it] {'loss': 0.476, 'learning_rate': 1.8311224109987768e-05, 'epoch': 0.21} + 21%|██ | 1856/8750 [3:02:08<10:53:37, 5.69s/it] {'loss': 0.476, 'learning_rate': 1.8311224109987768e-05, 'epoch': 0.21} + 21%|██ | 1856/8750 [3:02:02<10:53:37, 5.69s/it] 21%|██ | 1857/8750 [3:02:14<10:50:29, 5.66s/it] 21%|██ | 1857/8750 [3:02:07<10:50:29, 5.66s/it] {'loss': 0.516, 'learning_rate': 1.8309165090399657e-05, 'epoch': 0.21} + 21%|██ | 1857/8750 [3:02:14<10:50:29, 5.66s/it] {'loss': 0.516, 'learning_rate': 1.8309165090399657e-05, 'epoch': 0.21} + 21%|██ | 1857/8750 [3:02:07<10:50:29, 5.66s/it] 21%|██ | 1858/8750 [3:02:19<10:52:46, 5.68s/it] 21%|██ | 1858/8750 [3:02:13<10:52:47, 5.68s/it] {'loss': 0.4699, 'learning_rate': 1.8307104932270415e-05, 'epoch': 0.21} + 21%|██ | 1858/8750 [3:02:19<10:52:46, 5.68s/it] {'loss': 0.4699, 'learning_rate': 1.8307104932270415e-05, 'epoch': 0.21} + 21%|██ | 1858/8750 [3:02:13<10:52:47, 5.68s/it] 21%|██ | 1859/8750 [3:02:25<11:01:48, 5.76s/it] 21%|██ | 1859/8750 [3:02:19<11:01:48, 5.76s/it] {'loss': 0.4787, 'learning_rate': 1.8305043635882334e-05, 'epoch': 0.21} + 21%|██ | 1859/8750 [3:02:25<11:01:48, 5.76s/it] {'loss': 0.4787, 'learning_rate': 1.8305043635882334e-05, 'epoch': 0.21} + 21%|██ | 1859/8750 [3:02:19<11:01:48, 5.76s/it] 21%|██▏ | 1860/8750 [3:02:31<10:54:41, 5.70s/it] 21%|██▏ | 1860/8750 [3:02:24<10:54:41, 5.70s/it] {'loss': 0.4944, 'learning_rate': 1.830298120151785e-05, 'epoch': 0.21} + 21%|██▏ | 1860/8750 [3:02:31<10:54:41, 5.70s/it] {'loss': 0.4944, 'learning_rate': 1.830298120151785e-05, 'epoch': 0.21} + 21%|██▏ | 1860/8750 [3:02:24<10:54:41, 5.70s/it] 21%|██▏ | 1861/8750 [3:02:30<10:54:02, 5.70s/it] 21%|██▏ | 1861/8750 [3:02:37<10:54:03, 5.70s/it] {'loss': 0.494, 'learning_rate': 1.8300917629459575e-05, 'epoch': 0.21} + 21%|██▏ | 1861/8750 [3:02:37<10:54:03, 5.70s/it] {'loss': 0.494, 'learning_rate': 1.8300917629459575e-05, 'epoch': 
0.21} + 21%|██▏ | 1861/8750 [3:02:30<10:54:02, 5.70s/it] 21%|██▏ | 1862/8750 [3:02:42<10:47:20, 5.64s/it] 21%|██▏ | 1862/8750 [3:02:36<10:47:20, 5.64s/it] {'loss': 0.4809, 'learning_rate': 1.8298852919990254e-05, 'epoch': 0.21} + 21%|██▏ | 1862/8750 [3:02:42<10:47:20, 5.64s/it] {'loss': 0.4809, 'learning_rate': 1.8298852919990254e-05, 'epoch': 0.21} + 21%|██▏ | 1862/8750 [3:02:36<10:47:20, 5.64s/it] 21%|██▏ | 1863/8750 [3:02:48<10:46:35, 5.63s/it] 21%|██▏ | 1863/8750 [3:02:41<10:46:35, 5.63s/it] {'loss': 0.4735, 'learning_rate': 1.82967870733928e-05, 'epoch': 0.21} + 21%|██▏ | 1863/8750 [3:02:48<10:46:35, 5.63s/it] {'loss': 0.4735, 'learning_rate': 1.82967870733928e-05, 'epoch': 0.21} + 21%|██▏ | 1863/8750 [3:02:41<10:46:35, 5.63s/it] 21%|██▏ | 1864/8750 [3:02:47<10:55:14, 5.71s/it] 21%|██▏ | 1864/8750 [3:02:54<10:55:15, 5.71s/it] {'loss': 0.4649, 'learning_rate': 1.8294720089950282e-05, 'epoch': 0.21} + 21%|██▏ | 1864/8750 [3:02:54<10:55:15, 5.71s/it] {'loss': 0.4649, 'learning_rate': 1.8294720089950282e-05, 'epoch': 0.21} + 21%|██▏ | 1864/8750 [3:02:47<10:55:14, 5.71s/it] 21%|██▏ | 1865/8750 [3:02:59<10:53:54, 5.70s/it] 21%|██▏ | 1865/8750 [3:02:53<10:53:54, 5.70s/it] {'loss': 0.491, 'learning_rate': 1.8292651969945923e-05, 'epoch': 0.21} + 21%|██▏ | 1865/8750 [3:02:59<10:53:54, 5.70s/it] {'loss': 0.491, 'learning_rate': 1.8292651969945923e-05, 'epoch': 0.21} + 21%|██▏ | 1865/8750 [3:02:53<10:53:54, 5.70s/it] 21%|██▏ | 1866/8750 [3:03:05<10:50:04, 5.67s/it] 21%|██▏ | 1866/8750 [3:02:58<10:50:04, 5.67s/it] {'loss': 0.4947, 'learning_rate': 1.82905827136631e-05, 'epoch': 0.21} + 21%|██▏ | 1866/8750 [3:03:05<10:50:04, 5.67s/it] {'loss': 0.4947, 'learning_rate': 1.82905827136631e-05, 'epoch': 0.21} + 21%|██▏ | 1866/8750 [3:02:58<10:50:04, 5.67s/it] 21%|██▏ | 1867/8750 [3:03:11<10:55:08, 5.71s/it] 21%|██▏ | 1867/8750 [3:03:04<10:55:09, 5.71s/it] {'loss': 0.4686, 'learning_rate': 1.828851232138535e-05, 'epoch': 0.21} + 21%|██▏ | 1867/8750 [3:03:11<10:55:08, 5.71s/it] 
{'loss': 0.4686, 'learning_rate': 1.828851232138535e-05, 'epoch': 0.21} + 21%|██▏ | 1867/8750 [3:03:04<10:55:09, 5.71s/it] 21%|██▏ | 1868/8750 [3:03:16<10:50:52, 5.67s/it] 21%|██▏ | 1868/8750 [3:03:10<10:50:53, 5.67s/it] {'loss': 0.4651, 'learning_rate': 1.828644079339636e-05, 'epoch': 0.21} + 21%|██▏ | 1868/8750 [3:03:16<10:50:52, 5.67s/it] {'loss': 0.4651, 'learning_rate': 1.828644079339636e-05, 'epoch': 0.21} + 21%|██▏ | 1868/8750 [3:03:10<10:50:53, 5.67s/it] 21%|██▏ | 1869/8750 [3:03:15<10:47:27, 5.65s/it] 21%|██▏ | 1869/8750 [3:03:22<10:47:28, 5.65s/it] {'loss': 0.4662, 'learning_rate': 1.828436812997998e-05, 'epoch': 0.21} + 21%|██▏ | 1869/8750 [3:03:22<10:47:28, 5.65s/it] {'loss': 0.4662, 'learning_rate': 1.828436812997998e-05, 'epoch': 0.21} + 21%|██▏ | 1869/8750 [3:03:15<10:47:27, 5.65s/it] 21%|██▏ | 1870/8750 [3:03:21<10:49:41, 5.67s/it] 21%|██▏ | 1870/8750 [3:03:28<10:49:41, 5.67s/it] {'loss': 0.4872, 'learning_rate': 1.8282294331420204e-05, 'epoch': 0.21} + 21%|██▏ | 1870/8750 [3:03:28<10:49:41, 5.67s/it] {'loss': 0.4872, 'learning_rate': 1.8282294331420204e-05, 'epoch': 0.21} + 21%|██▏ | 1870/8750 [3:03:21<10:49:41, 5.67s/it] 21%|██▏ | 1871/8750 [3:03:27<11:02:18, 5.78s/it] 21%|██▏ | 1871/8750 [3:03:34<11:02:18, 5.78s/it] {'loss': 0.4907, 'learning_rate': 1.8280219398001192e-05, 'epoch': 0.21} + 21%|██▏ | 1871/8750 [3:03:34<11:02:18, 5.78s/it] {'loss': 0.4907, 'learning_rate': 1.8280219398001192e-05, 'epoch': 0.21} + 21%|██▏ | 1871/8750 [3:03:27<11:02:18, 5.78s/it] 21%|██▏ | 1872/8750 [3:03:33<11:01:40, 5.77s/it] 21%|██▏ | 1872/8750 [3:03:39<11:01:40, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.827814333000726e-05, 'epoch': 0.21} + 21%|██▏ | 1872/8750 [3:03:39<11:01:40, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.827814333000726e-05, 'epoch': 0.21} + 21%|██▏ | 1872/8750 [3:03:33<11:01:40, 5.77s/it] 21%|██▏ | 1873/8750 [3:03:39<11:00:14, 5.76s/it] 21%|██▏ | 1873/8750 [3:03:45<11:00:16, 5.76s/it] {'loss': 0.474, 'learning_rate': 
1.827606612772287e-05, 'epoch': 0.21} + 21%|██▏ | 1873/8750 [3:03:45<11:00:16, 5.76s/it] {'loss': 0.474, 'learning_rate': 1.827606612772287e-05, 'epoch': 0.21} + 21%|██▏ | 1873/8750 [3:03:39<11:00:14, 5.76s/it] 21%|██▏ | 1874/8750 [3:03:51<10:54:12, 5.71s/it] 21%|██▏ | 1874/8750 [3:03:44<10:54:13, 5.71s/it] {'loss': 0.4831, 'learning_rate': 1.827398779143265e-05, 'epoch': 0.21} + 21%|██▏ | 1874/8750 [3:03:51<10:54:12, 5.71s/it] {'loss': 0.4831, 'learning_rate': 1.827398779143265e-05, 'epoch': 0.21} + 21%|██▏ | 1874/8750 [3:03:44<10:54:13, 5.71s/it] 21%|██▏ | 1875/8750 [3:03:57<11:04:29, 5.80s/it] 21%|██▏ | 1875/8750 [3:03:50<11:04:30, 5.80s/it] {'loss': 0.4973, 'learning_rate': 1.8271908321421376e-05, 'epoch': 0.21} + 21%|██▏ | 1875/8750 [3:03:57<11:04:29, 5.80s/it] {'loss': 0.4973, 'learning_rate': 1.8271908321421376e-05, 'epoch': 0.21} + 21%|██▏ | 1875/8750 [3:03:50<11:04:30, 5.80s/it] 21%|██▏ | 1876/8750 [3:04:02<10:59:38, 5.76s/it] 21%|██▏ | 1876/8750 [3:03:56<10:59:38, 5.76s/it] {'loss': 0.4786, 'learning_rate': 1.8269827717973982e-05, 'epoch': 0.21} + 21%|██▏ | 1876/8750 [3:04:02<10:59:38, 5.76s/it] {'loss': 0.4786, 'learning_rate': 1.8269827717973982e-05, 'epoch': 0.21} + 21%|██▏ | 1876/8750 [3:03:56<10:59:38, 5.76s/it] 21%|██▏ | 1877/8750 [3:04:08<10:58:52, 5.75s/it] 21%|██▏ | 1877/8750 [3:04:02<10:58:51, 5.75s/it] {'loss': 0.4745, 'learning_rate': 1.8267745981375555e-05, 'epoch': 0.21} + 21%|██▏ | 1877/8750 [3:04:08<10:58:52, 5.75s/it] {'loss': 0.4745, 'learning_rate': 1.8267745981375555e-05, 'epoch': 0.21} + 21%|██▏ | 1877/8750 [3:04:02<10:58:51, 5.75s/it] 21%|██▏ | 1878/8750 [3:04:14<10:54:54, 5.72s/it] 21%|██▏ | 1878/8750 [3:04:07<10:54:55, 5.72s/it] {'loss': 0.5025, 'learning_rate': 1.8265663111911344e-05, 'epoch': 0.21} + 21%|██▏ | 1878/8750 [3:04:14<10:54:54, 5.72s/it] {'loss': 0.5025, 'learning_rate': 1.8265663111911344e-05, 'epoch': 0.21} + 21%|██▏ | 1878/8750 [3:04:07<10:54:55, 5.72s/it] 21%|██▏ | 1879/8750 [3:04:13<11:05:02, 5.81s/it] 21%|██▏ | 
1879/8750 [3:04:20<11:05:03, 5.81s/it] {'loss': 0.4768, 'learning_rate': 1.8263579109866745e-05, 'epoch': 0.21} + 21%|██▏ | 1879/8750 [3:04:20<11:05:03, 5.81s/it] {'loss': 0.4768, 'learning_rate': 1.8263579109866745e-05, 'epoch': 0.21} + 21%|██▏ | 1879/8750 [3:04:13<11:05:02, 5.81s/it] 21%|██▏ | 1880/8750 [3:04:19<11:00:20, 5.77s/it] 21%|██▏ | 1880/8750 [3:04:25<11:00:20, 5.77s/it] {'loss': 0.4807, 'learning_rate': 1.8261493975527312e-05, 'epoch': 0.21} + 21%|██▏ | 1880/8750 [3:04:25<11:00:20, 5.77s/it] {'loss': 0.4807, 'learning_rate': 1.8261493975527312e-05, 'epoch': 0.21} + 21%|██▏ | 1880/8750 [3:04:19<11:00:20, 5.77s/it] 21%|██▏ | 1881/8750 [3:04:31<11:04:04, 5.80s/it] 21%|██▏ | 1881/8750 [3:04:25<11:04:05, 5.80s/it] {'loss': 0.4922, 'learning_rate': 1.8259407709178758e-05, 'epoch': 0.21} + 21%|██▏ | 1881/8750 [3:04:31<11:04:04, 5.80s/it] {'loss': 0.4922, 'learning_rate': 1.8259407709178758e-05, 'epoch': 0.21} + 21%|██▏ | 1881/8750 [3:04:25<11:04:05, 5.80s/it] 22%|██▏ | 1882/8750 [3:04:37<11:11:44, 5.87s/it] 22%|██▏ | 1882/8750 [3:04:31<11:11:44, 5.87s/it] {'loss': 0.4806, 'learning_rate': 1.8257320311106948e-05, 'epoch': 0.22} + 22%|██▏ | 1882/8750 [3:04:37<11:11:44, 5.87s/it] {'loss': 0.4806, 'learning_rate': 1.8257320311106948e-05, 'epoch': 0.22} + 22%|██▏ | 1882/8750 [3:04:31<11:11:44, 5.87s/it] 22%|██▏ | 1883/8750 [3:04:43<11:08:32, 5.84s/it] 22%|██▏ | 1883/8750 [3:04:37<11:08:32, 5.84s/it] {'loss': 0.491, 'learning_rate': 1.82552317815979e-05, 'epoch': 0.22} + 22%|██▏ | 1883/8750 [3:04:43<11:08:32, 5.84s/it] {'loss': 0.491, 'learning_rate': 1.82552317815979e-05, 'epoch': 0.22} + 22%|██▏ | 1883/8750 [3:04:37<11:08:32, 5.84s/it] 22%|██▏ | 1884/8750 [3:04:49<10:59:12, 5.76s/it] 22%|██▏ | 1884/8750 [3:04:42<10:59:12, 5.76s/it] {'loss': 0.4727, 'learning_rate': 1.825314212093779e-05, 'epoch': 0.22} + 22%|██▏ | 1884/8750 [3:04:49<10:59:12, 5.76s/it] {'loss': 0.4727, 'learning_rate': 1.825314212093779e-05, 'epoch': 0.22} + 22%|██▏ | 1884/8750 [3:04:42<10:59:12, 
5.76s/it] 22%|██▏ | 1885/8750 [3:04:48<10:55:58, 5.73s/it] 22%|██▏ | 1885/8750 [3:04:54<10:55:59, 5.73s/it] {'loss': 0.4835, 'learning_rate': 1.825105132941295e-05, 'epoch': 0.22} + 22%|██▏ | 1885/8750 [3:04:54<10:55:59, 5.73s/it] {'loss': 0.4835, 'learning_rate': 1.825105132941295e-05, 'epoch': 0.22} + 22%|██▏ | 1885/8750 [3:04:48<10:55:58, 5.73s/it] 22%|██▏ | 1886/8750 [3:04:54<10:52:11, 5.70s/it] 22%|██▏ | 1886/8750 [3:05:00<10:52:12, 5.70s/it] {'loss': 0.4773, 'learning_rate': 1.8248959407309862e-05, 'epoch': 0.22} + 22%|██▏ | 1886/8750 [3:05:00<10:52:12, 5.70s/it] {'loss': 0.4773, 'learning_rate': 1.8248959407309862e-05, 'epoch': 0.22} + 22%|██▏ | 1886/8750 [3:04:54<10:52:11, 5.70s/it] 22%|██▏ | 1887/8750 [3:05:06<10:58:00, 5.75s/it] 22%|██▏ | 1887/8750 [3:04:59<10:58:01, 5.75s/it] {'loss': 0.4654, 'learning_rate': 1.824686635491517e-05, 'epoch': 0.22} + 22%|██▏ | 1887/8750 [3:05:06<10:58:00, 5.75s/it] {'loss': 0.4654, 'learning_rate': 1.824686635491517e-05, 'epoch': 0.22} + 22%|██▏ | 1887/8750 [3:04:59<10:58:01, 5.75s/it] 22%|██▏ | 1888/8750 [3:05:11<10:50:36, 5.69s/it] 22%|██▏ | 1888/8750 [3:05:05<10:50:37, 5.69s/it] {'loss': 0.4921, 'learning_rate': 1.824477217251566e-05, 'epoch': 0.22} + 22%|██▏ | 1888/8750 [3:05:11<10:50:36, 5.69s/it] {'loss': 0.4921, 'learning_rate': 1.824477217251566e-05, 'epoch': 0.22} + 22%|██▏ | 1888/8750 [3:05:05<10:50:37, 5.69s/it] 22%|██▏ | 1889/8750 [3:05:11<10:52:26, 5.71s/it] 22%|██▏ | 1889/8750 [3:05:17<10:52:27, 5.71s/it] {'loss': 0.5063, 'learning_rate': 1.8242676860398295e-05, 'epoch': 0.22} + 22%|██▏ | 1889/8750 [3:05:17<10:52:27, 5.71s/it] {'loss': 0.5063, 'learning_rate': 1.8242676860398295e-05, 'epoch': 0.22} + 22%|██▏ | 1889/8750 [3:05:11<10:52:26, 5.71s/it] 22%|██▏ | 1890/8750 [3:05:23<11:06:23, 5.83s/it] 22%|██▏ | 1890/8750 [3:05:17<11:06:23, 5.83s/it] {'loss': 0.4779, 'learning_rate': 1.824058041885017e-05, 'epoch': 0.22} + 22%|██▏ | 1890/8750 [3:05:23<11:06:23, 5.83s/it] {'loss': 0.4779, 'learning_rate': 
1.824058041885017e-05, 'epoch': 0.22} + 22%|██▏ | 1890/8750 [3:05:17<11:06:23, 5.83s/it] 22%|██▏ | 1891/8750 [3:05:29<10:57:45, 5.75s/it] 22%|██▏ | 1891/8750 [3:05:22<10:57:45, 5.75s/it] {'loss': 0.4864, 'learning_rate': 1.8238482848158548e-05, 'epoch': 0.22} + 22%|██▏ | 1891/8750 [3:05:29<10:57:45, 5.75s/it] {'loss': 0.4864, 'learning_rate': 1.8238482848158548e-05, 'epoch': 0.22} + 22%|██▏ | 1891/8750 [3:05:22<10:57:45, 5.75s/it] 22%|██▏ | 1892/8750 [3:05:28<11:03:03, 5.80s/it] 22%|██▏ | 1892/8750 [3:05:35<11:03:03, 5.80s/it] {'loss': 0.4714, 'learning_rate': 1.8236384148610843e-05, 'epoch': 0.22} + 22%|██▏ | 1892/8750 [3:05:35<11:03:03, 5.80s/it] {'loss': 0.4714, 'learning_rate': 1.8236384148610843e-05, 'epoch': 0.22} + 22%|██▏ | 1892/8750 [3:05:28<11:03:03, 5.80s/it] 22%|██▏ | 1893/8750 [3:05:41<11:02:14, 5.79s/it] 22%|██▏ | 1893/8750 [3:05:34<11:02:14, 5.79s/it] {'loss': 0.4911, 'learning_rate': 1.823428432049462e-05, 'epoch': 0.22} + 22%|██▏ | 1893/8750 [3:05:41<11:02:14, 5.79s/it] {'loss': 0.4911, 'learning_rate': 1.823428432049462e-05, 'epoch': 0.22} + 22%|██▏ | 1893/8750 [3:05:34<11:02:14, 5.79s/it] 22%|██▏ | 1894/8750 [3:05:40<10:55:25, 5.74s/it] 22%|██▏ | 1894/8750 [3:05:46<10:55:25, 5.74s/it] {'loss': 0.4711, 'learning_rate': 1.8232183364097605e-05, 'epoch': 0.22} + 22%|██▏ | 1894/8750 [3:05:46<10:55:25, 5.74s/it] {'loss': 0.4711, 'learning_rate': 1.8232183364097605e-05, 'epoch': 0.22} + 22%|██▏ | 1894/8750 [3:05:40<10:55:25, 5.74s/it] 22%|██▏ | 1895/8750 [3:05:52<10:51:17, 5.70s/it] 22%|██▏ | 1895/8750 [3:05:45<10:51:18, 5.70s/it] {'loss': 0.4749, 'learning_rate': 1.8230081279707675e-05, 'epoch': 0.22} + 22%|██▏ | 1895/8750 [3:05:52<10:51:17, 5.70s/it] {'loss': 0.4749, 'learning_rate': 1.8230081279707675e-05, 'epoch': 0.22} + 22%|██▏ | 1895/8750 [3:05:45<10:51:18, 5.70s/it] 22%|██▏ | 1896/8750 [3:05:57<10:49:47, 5.69s/it] 22%|██▏ | 1896/8750 [3:05:51<10:49:47, 5.69s/it] {'loss': 0.4695, 'learning_rate': 1.822797806761287e-05, 'epoch': 0.22} + 22%|██▏ | 
1896/8750 [3:05:57<10:49:47, 5.69s/it] {'loss': 0.4695, 'learning_rate': 1.822797806761287e-05, 'epoch': 0.22} + 22%|██▏ | 1896/8750 [3:05:51<10:49:47, 5.69s/it] 22%|██▏ | 1897/8750 [3:05:56<10:43:25, 5.63s/it] 22%|██▏ | 1897/8750 [3:06:03<10:43:26, 5.63s/it] {'loss': 0.4946, 'learning_rate': 1.8225873728101367e-05, 'epoch': 0.22} + 22%|██▏ | 1897/8750 [3:06:03<10:43:26, 5.63s/it] {'loss': 0.4946, 'learning_rate': 1.8225873728101367e-05, 'epoch': 0.22} + 22%|██▏ | 1897/8750 [3:05:56<10:43:25, 5.63s/it] 22%|██▏ | 1898/8750 [3:06:02<10:42:30, 5.63s/it] 22%|██▏ | 1898/8750 [3:06:09<10:42:31, 5.63s/it] {'loss': 0.4869, 'learning_rate': 1.822376826146151e-05, 'epoch': 0.22} + 22%|██▏ | 1898/8750 [3:06:09<10:42:31, 5.63s/it] {'loss': 0.4869, 'learning_rate': 1.822376826146151e-05, 'epoch': 0.22} + 22%|██▏ | 1898/8750 [3:06:02<10:42:30, 5.63s/it] 22%|██▏ | 1899/8750 [3:06:08<10:45:29, 5.65s/it] 22%|██▏ | 1899/8750 [3:06:14<10:45:29, 5.65s/it] {'loss': 0.4795, 'learning_rate': 1.8221661667981795e-05, 'epoch': 0.22} + 22%|██▏ | 1899/8750 [3:06:14<10:45:29, 5.65s/it] {'loss': 0.4795, 'learning_rate': 1.8221661667981795e-05, 'epoch': 0.22} + 22%|██▏ | 1899/8750 [3:06:08<10:45:29, 5.65s/it]4 AutoResumeHook: Checking whether to suspend... +1011 8AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 22%|██▏ | 1900/8750 [3:06:20<10:43:22, 5.64s/it]6 AutoResumeHook: Checking whether to suspend... +20 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 22%|██▏ | 1900/8750 [3:06:13<10:43:23, 5.64s/it]15 AutoResumeHook: Checking whether to suspend... 
+1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4721, 'learning_rate': 1.8219553947950874e-05, 'epoch': 0.22} + 22%|██▏ | 1900/8750 [3:06:20<10:43:22, 5.64s/it] {'loss': 0.4721, 'learning_rate': 1.8219553947950874e-05, 'epoch': 0.22} + 22%|██▏ | 1900/8750 [3:06:13<10:43:23, 5.64s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-1900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 22%|██▏ | 1901/8750 [3:06:39<18:30:50, 9.73s/it] 22%|██▏ | 1901/8750 [3:06:33<18:30:50, 9.73s/it] {'loss': 0.4663, 'learning_rate': 1.8217445101657553e-05, 'epoch': 0.22} + 22%|██▏ | 1901/8750 [3:06:39<18:30:50, 9.73s/it] {'loss': 0.4663, 'learning_rate': 1.8217445101657553e-05, 'epoch': 0.22} + 22%|██▏ | 1901/8750 [3:06:33<18:30:50, 9.73s/it] 22%|██▏ | 1902/8750 [3:06:45<16:21:19, 8.60s/it] 22%|██▏ | 1902/8750 [3:06:39<16:21:19, 8.60s/it] {'loss': 0.4833, 'learning_rate': 1.8215335129390785e-05, 'epoch': 0.22} + 22%|██▏ | 1902/8750 [3:06:45<16:21:19, 8.60s/it] {'loss': 0.4833, 'learning_rate': 1.8215335129390785e-05, 'epoch': 0.22} + 22%|██▏ | 1902/8750 [3:06:39<16:21:19, 8.60s/it] 22%|██▏ | 1903/8750 [3:06:51<14:41:55, 7.73s/it] 22%|██▏ | 1903/8750 [3:06:44<14:41:56, 7.73s/it] {'loss': 0.481, 'learning_rate': 1.821322403143969e-05, 'epoch': 0.22} + 22%|██▏ | 1903/8750 [3:06:51<14:41:55, 7.73s/it] {'loss': 0.481, 'learning_rate': 1.821322403143969e-05, 'epoch': 0.22} + 22%|██▏ | 1903/8750 [3:06:44<14:41:56, 7.73s/it] 22%|██▏ | 1904/8750 [3:06:50<13:27:54, 7.08s/it] 22%|██▏ | 1904/8750 [3:06:56<13:27:55, 7.08s/it] {'loss': 0.473, 'learning_rate': 1.8211111808093534e-05, 'epoch': 0.22} + 22%|██▏ | 1904/8750 [3:06:56<13:27:55, 7.08s/it] {'loss': 0.473, 'learning_rate': 1.8211111808093534e-05, 'epoch': 0.22} + 22%|██▏ | 1904/8750 [3:06:50<13:27:54, 7.08s/it] 22%|██▏ | 1905/8750 [3:06:56<12:41:36, 6.68s/it] 22%|██▏ | 1905/8750 [3:07:02<12:41:37, 6.68s/it] {'loss': 0.4881, 'learning_rate': 1.8208998459641737e-05, 'epoch': 0.22} + 22%|██▏ | 1905/8750 [3:07:02<12:41:37, 6.68s/it] {'loss': 0.4881, 'learning_rate': 1.8208998459641737e-05, 'epoch': 0.22} + 22%|██▏ | 1905/8750 [3:06:56<12:41:36, 6.68s/it] 22%|██▏ | 1906/8750 [3:07:01<12:07:38, 6.38s/it] 22%|██▏ | 1906/8750 [3:07:08<12:07:38, 6.38s/it] {'loss': 0.4802, 'learning_rate': 1.8206883986373872e-05, 'epoch': 0.22} + 22%|██▏ | 1906/8750 [3:07:08<12:07:38, 6.38s/it] {'loss': 0.4802, 'learning_rate': 
1.8206883986373872e-05, 'epoch': 0.22} + 22%|██▏ | 1906/8750 [3:07:01<12:07:38, 6.38s/it] 22%|██▏ | 1907/8750 [3:07:07<11:41:22, 6.15s/it] 22%|██▏ | 1907/8750 [3:07:13<11:41:23, 6.15s/it] {'loss': 0.4852, 'learning_rate': 1.820476838857968e-05, 'epoch': 0.22} + 22%|██▏ | 1907/8750 [3:07:13<11:41:23, 6.15s/it] {'loss': 0.4852, 'learning_rate': 1.820476838857968e-05, 'epoch': 0.22} + 22%|██▏ | 1907/8750 [3:07:07<11:41:22, 6.15s/it] 22%|██▏ | 1908/8750 [3:07:19<11:27:06, 6.03s/it] 22%|██▏ | 1908/8750 [3:07:13<11:27:06, 6.03s/it] {'loss': 0.4831, 'learning_rate': 1.820265166654903e-05, 'epoch': 0.22} + 22%|██▏ | 1908/8750 [3:07:19<11:27:06, 6.03s/it] {'loss': 0.4831, 'learning_rate': 1.820265166654903e-05, 'epoch': 0.22} + 22%|██▏ | 1908/8750 [3:07:13<11:27:06, 6.03s/it] 22%|██▏ | 1909/8750 [3:07:25<11:15:07, 5.92s/it] 22%|██▏ | 1909/8750 [3:07:18<11:15:07, 5.92s/it] {'loss': 0.497, 'learning_rate': 1.8200533820571973e-05, 'epoch': 0.22} + 22%|██▏ | 1909/8750 [3:07:25<11:15:07, 5.92s/it] {'loss': 0.497, 'learning_rate': 1.8200533820571973e-05, 'epoch': 0.22} + 22%|██▏ | 1909/8750 [3:07:18<11:15:07, 5.92s/it] 22%|██▏ | 1910/8750 [3:07:24<11:06:54, 5.85s/it] 22%|██▏ | 1910/8750 [3:07:31<11:06:54, 5.85s/it] {'loss': 0.4615, 'learning_rate': 1.8198414850938694e-05, 'epoch': 0.22} + 22%|██▏ | 1910/8750 [3:07:31<11:06:54, 5.85s/it] {'loss': 0.4615, 'learning_rate': 1.8198414850938694e-05, 'epoch': 0.22} + 22%|██▏ | 1910/8750 [3:07:24<11:06:54, 5.85s/it] 22%|██▏ | 1911/8750 [3:07:36<11:04:15, 5.83s/it] 22%|██▏ | 1911/8750 [3:07:30<11:04:15, 5.83s/it] {'loss': 0.4956, 'learning_rate': 1.8196294757939543e-05, 'epoch': 0.22} + 22%|██▏ | 1911/8750 [3:07:36<11:04:15, 5.83s/it] {'loss': 0.4956, 'learning_rate': 1.8196294757939543e-05, 'epoch': 0.22} + 22%|██▏ | 1911/8750 [3:07:30<11:04:15, 5.83s/it] 22%|██▏ | 1912/8750 [3:07:42<10:59:35, 5.79s/it] 22%|██▏ | 1912/8750 [3:07:35<10:59:35, 5.79s/it] {'loss': 0.4716, 'learning_rate': 1.8194173541865014e-05, 'epoch': 0.22} + 22%|██▏ | 
1912/8750 [3:07:42<10:59:35, 5.79s/it] {'loss': 0.4716, 'learning_rate': 1.8194173541865014e-05, 'epoch': 0.22} + 22%|██▏ | 1912/8750 [3:07:35<10:59:35, 5.79s/it] 22%|██▏ | 1913/8750 [3:07:41<10:57:29, 5.77s/it] 22%|██▏ | 1913/8750 [3:07:48<10:57:30, 5.77s/it] {'loss': 0.4638, 'learning_rate': 1.8192051203005768e-05, 'epoch': 0.22} + 22%|██▏ | 1913/8750 [3:07:48<10:57:30, 5.77s/it] {'loss': 0.4638, 'learning_rate': 1.8192051203005768e-05, 'epoch': 0.22} + 22%|██▏ | 1913/8750 [3:07:41<10:57:29, 5.77s/it] 22%|██▏ | 1914/8750 [3:07:47<10:57:03, 5.77s/it] 22%|██▏ | 1914/8750 [3:07:53<10:57:03, 5.77s/it] {'loss': 0.4879, 'learning_rate': 1.818992774165261e-05, 'epoch': 0.22} + 22%|██▏ | 1914/8750 [3:07:53<10:57:03, 5.77s/it] {'loss': 0.4879, 'learning_rate': 1.818992774165261e-05, 'epoch': 0.22} + 22%|██▏ | 1914/8750 [3:07:47<10:57:03, 5.77s/it] 22%|██▏ | 1915/8750 [3:07:59<10:57:00, 5.77s/it] 22%|██▏ | 1915/8750 [3:07:53<10:57:01, 5.77s/it] {'loss': 0.4875, 'learning_rate': 1.81878031580965e-05, 'epoch': 0.22} + 22%|██▏ | 1915/8750 [3:07:59<10:57:00, 5.77s/it] {'loss': 0.4875, 'learning_rate': 1.81878031580965e-05, 'epoch': 0.22} + 22%|██▏ | 1915/8750 [3:07:53<10:57:01, 5.77s/it] 22%|██▏ | 1916/8750 [3:08:05<10:55:04, 5.75s/it] 22%|██▏ | 1916/8750 [3:07:58<10:55:05, 5.75s/it] {'loss': 0.4882, 'learning_rate': 1.8185677452628557e-05, 'epoch': 0.22} + 22%|██▏ | 1916/8750 [3:08:05<10:55:04, 5.75s/it] {'loss': 0.4882, 'learning_rate': 1.8185677452628557e-05, 'epoch': 0.22} + 22%|██▏ | 1916/8750 [3:07:58<10:55:05, 5.75s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (5002 > 4096). 
Running this sequence through the model will result in indexing errors + 22%|██▏ | 1917/8750 [3:08:11<10:51:17, 5.72s/it] 22%|██▏ | 1917/8750 [3:08:04<10:51:17, 5.72s/it] {'loss': 0.4997, 'learning_rate': 1.818355062554005e-05, 'epoch': 0.22} + 22%|██▏ | 1917/8750 [3:08:11<10:51:17, 5.72s/it] {'loss': 0.4997, 'learning_rate': 1.818355062554005e-05, 'epoch': 0.22} + 22%|██▏ | 1917/8750 [3:08:04<10:51:17, 5.72s/it] 22%|██▏ | 1918/8750 [3:08:17<10:57:35, 5.78s/it] 22%|██▏ | 1918/8750 [3:08:10<10:57:35, 5.78s/it] {'loss': 0.4726, 'learning_rate': 1.81814226771224e-05, 'epoch': 0.22} + 22%|██▏ | 1918/8750 [3:08:17<10:57:35, 5.78s/it] {'loss': 0.4726, 'learning_rate': 1.81814226771224e-05, 'epoch': 0.22} + 22%|██▏ | 1918/8750 [3:08:10<10:57:35, 5.78s/it] 22%|██▏ | 1919/8750 [3:08:16<10:48:42, 5.70s/it] 22%|██▏ | 1919/8750 [3:08:22<10:48:42, 5.70s/it] {'loss': 0.4946, 'learning_rate': 1.8179293607667177e-05, 'epoch': 0.22} + 22%|██▏ | 1919/8750 [3:08:22<10:48:42, 5.70s/it] {'loss': 0.4946, 'learning_rate': 1.8179293607667177e-05, 'epoch': 0.22} + 22%|██▏ | 1919/8750 [3:08:16<10:48:42, 5.70s/it] 22%|██▏ | 1920/8750 [3:08:21<10:52:35, 5.73s/it] 22%|██▏ | 1920/8750 [3:08:28<10:52:35, 5.73s/it] {'loss': 0.4876, 'learning_rate': 1.8177163417466122e-05, 'epoch': 0.22} + 22%|██▏ | 1920/8750 [3:08:28<10:52:35, 5.73s/it] {'loss': 0.4876, 'learning_rate': 1.8177163417466122e-05, 'epoch': 0.22} + 22%|██▏ | 1920/8750 [3:08:21<10:52:35, 5.73s/it] 22%|██▏ | 1921/8750 [3:08:27<10:54:42, 5.75s/it] 22%|██▏ | 1921/8750 [3:08:34<10:54:42, 5.75s/it] {'loss': 0.4709, 'learning_rate': 1.8175032106811114e-05, 'epoch': 0.22} + 22%|██▏ | 1921/8750 [3:08:34<10:54:42, 5.75s/it] {'loss': 0.4709, 'learning_rate': 1.8175032106811114e-05, 'epoch': 0.22} + 22%|██▏ | 1921/8750 [3:08:27<10:54:42, 5.75s/it] 22%|██▏ | 1922/8750 [3:08:33<10:52:57, 5.74s/it] 22%|██▏ | 1922/8750 [3:08:39<10:52:57, 5.74s/it] {'loss': 0.4999, 'learning_rate': 1.817289967599419e-05, 'epoch': 0.22} + 22%|██▏ | 1922/8750 
[3:08:39<10:52:57, 5.74s/it] {'loss': 0.4999, 'learning_rate': 1.817289967599419e-05, 'epoch': 0.22} + 22%|██▏ | 1922/8750 [3:08:33<10:52:57, 5.74s/it] 22%|██▏ | 1923/8750 [3:08:45<11:04:48, 5.84s/it] 22%|██▏ | 1923/8750 [3:08:39<11:04:49, 5.84s/it] {'loss': 0.4864, 'learning_rate': 1.8170766125307543e-05, 'epoch': 0.22} + 22%|██▏ | 1923/8750 [3:08:45<11:04:48, 5.84s/it] {'loss': 0.4864, 'learning_rate': 1.8170766125307543e-05, 'epoch': 0.22} + 22%|██▏ | 1923/8750 [3:08:39<11:04:49, 5.84s/it] 22%|██▏ | 1924/8750 [3:08:45<11:00:14, 5.80s/it] 22%|██▏ | 1924/8750 [3:08:51<11:00:15, 5.80s/it] {'loss': 0.4812, 'learning_rate': 1.816863145504351e-05, 'epoch': 0.22} + 22%|██▏ | 1924/8750 [3:08:51<11:00:15, 5.80s/it] {'loss': 0.4812, 'learning_rate': 1.816863145504351e-05, 'epoch': 0.22} + 22%|██▏ | 1924/8750 [3:08:45<11:00:14, 5.80s/it] 22%|██▏ | 1925/8750 [3:08:50<10:59:04, 5.79s/it] 22%|██▏ | 1925/8750 [3:08:57<10:59:04, 5.79s/it] {'loss': 0.4748, 'learning_rate': 1.81664956654946e-05, 'epoch': 0.22} + 22%|██▏ | 1925/8750 [3:08:57<10:59:04, 5.79s/it] {'loss': 0.4748, 'learning_rate': 1.81664956654946e-05, 'epoch': 0.22} + 22%|██▏ | 1925/8750 [3:08:50<10:59:04, 5.79s/it] 22%|██▏ | 1926/8750 [3:09:03<10:56:29, 5.77s/it] 22%|██▏ | 1926/8750 [3:08:56<10:56:29, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.816435875695345e-05, 'epoch': 0.22} + 22%|██▏ | 1926/8750 [3:09:03<10:56:29, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.816435875695345e-05, 'epoch': 0.22} + 22%|██▏ | 1926/8750 [3:08:56<10:56:29, 5.77s/it] 22%|██▏ | 1927/8750 [3:09:09<11:10:21, 5.89s/it] 22%|██▏ | 1927/8750 [3:09:02<11:10:21, 5.90s/it] {'loss': 0.4808, 'learning_rate': 1.8162220729712875e-05, 'epoch': 0.22} + 22%|██▏ | 1927/8750 [3:09:09<11:10:21, 5.89s/it] {'loss': 0.4808, 'learning_rate': 1.8162220729712875e-05, 'epoch': 0.22} + 22%|██▏ | 1927/8750 [3:09:02<11:10:21, 5.90s/it] 22%|██▏ | 1928/8750 [3:09:15<11:06:22, 5.86s/it] 22%|██▏ | 1928/8750 [3:09:08<11:06:23, 5.86s/it] {'loss': 0.4736, 
'learning_rate': 1.8160081584065833e-05, 'epoch': 0.22} + 22%|██▏ | 1928/8750 [3:09:15<11:06:22, 5.86s/it] {'loss': 0.4736, 'learning_rate': 1.8160081584065833e-05, 'epoch': 0.22} + 22%|██▏ | 1928/8750 [3:09:08<11:06:23, 5.86s/it] 22%|██▏ | 1929/8750 [3:09:14<11:02:33, 5.83s/it] 22%|██▏ | 1929/8750 [3:09:20<11:02:34, 5.83s/it] {'loss': 0.4868, 'learning_rate': 1.8157941320305424e-05, 'epoch': 0.22} + 22%|██▏ | 1929/8750 [3:09:20<11:02:34, 5.83s/it] {'loss': 0.4868, 'learning_rate': 1.8157941320305424e-05, 'epoch': 0.22} + 22%|██▏ | 1929/8750 [3:09:14<11:02:33, 5.83s/it] 22%|██▏ | 1930/8750 [3:09:20<11:02:13, 5.83s/it] 22%|██▏ | 1930/8750 [3:09:26<11:02:13, 5.83s/it] {'loss': 0.4725, 'learning_rate': 1.815579993872492e-05, 'epoch': 0.22} + 22%|██▏ | 1930/8750 [3:09:26<11:02:13, 5.83s/it] {'loss': 0.4725, 'learning_rate': 1.815579993872492e-05, 'epoch': 0.22} + 22%|██▏ | 1930/8750 [3:09:20<11:02:13, 5.83s/it] 22%|██▏ | 1931/8750 [3:09:32<10:56:01, 5.77s/it] 22%|██▏ | 1931/8750 [3:09:25<10:56:02, 5.77s/it] {'loss': 0.491, 'learning_rate': 1.8153657439617738e-05, 'epoch': 0.22} + 22%|██▏ | 1931/8750 [3:09:32<10:56:01, 5.77s/it] {'loss': 0.491, 'learning_rate': 1.8153657439617738e-05, 'epoch': 0.22} + 22%|██▏ | 1931/8750 [3:09:25<10:56:02, 5.77s/it] 22%|██▏ | 1932/8750 [3:09:31<10:57:28, 5.79s/it] 22%|██▏ | 1932/8750 [3:09:38<10:57:29, 5.79s/it] {'loss': 0.4761, 'learning_rate': 1.8151513823277447e-05, 'epoch': 0.22} + 22%|██▏ | 1932/8750 [3:09:38<10:57:29, 5.79s/it] {'loss': 0.4761, 'learning_rate': 1.8151513823277447e-05, 'epoch': 0.22} + 22%|██▏ | 1932/8750 [3:09:31<10:57:28, 5.79s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! 
+ warnings.warn("Inputs truncated!") + 22%|██▏ | 1933/8750 [3:09:37<11:02:26, 5.83s/it] 22%|██▏ | 1933/8750 [3:09:44<11:02:26, 5.83s/it] {'loss': 0.5125, 'learning_rate': 1.8149369089997767e-05, 'epoch': 0.22} + 22%|██▏ | 1933/8750 [3:09:44<11:02:26, 5.83s/it] {'loss': 0.5125, 'learning_rate': 1.8149369089997767e-05, 'epoch': 0.22} + 22%|██▏ | 1933/8750 [3:09:37<11:02:26, 5.83s/it] 22%|██▏ | 1934/8750 [3:09:43<10:55:10, 5.77s/it] 22%|██▏ | 1934/8750 [3:09:49<10:55:10, 5.77s/it] {'loss': 0.4676, 'learning_rate': 1.814722324007258e-05, 'epoch': 0.22} + 22%|██▏ | 1934/8750 [3:09:49<10:55:10, 5.77s/it] {'loss': 0.4676, 'learning_rate': 1.814722324007258e-05, 'epoch': 0.22} + 22%|██▏ | 1934/8750 [3:09:43<10:55:10, 5.77s/it] 22%|██▏ | 1935/8750 [3:09:55<10:46:47, 5.69s/it] 22%|██▏ | 1935/8750 [3:09:48<10:46:48, 5.69s/it] {'loss': 0.4984, 'learning_rate': 1.8145076273795914e-05, 'epoch': 0.22} + 22%|██▏ | 1935/8750 [3:09:55<10:46:47, 5.69s/it] {'loss': 0.4984, 'learning_rate': 1.8145076273795914e-05, 'epoch': 0.22} + 22%|██▏ | 1935/8750 [3:09:48<10:46:48, 5.69s/it] 22%|██▏ | 1936/8750 [3:10:00<10:49:49, 5.72s/it] 22%|██▏ | 1936/8750 [3:09:54<10:49:49, 5.72s/it] {'loss': 0.47, 'learning_rate': 1.814292819146195e-05, 'epoch': 0.22} + 22%|██▏ | 1936/8750 [3:10:01<10:49:49, 5.72s/it] {'loss': 0.47, 'learning_rate': 1.814292819146195e-05, 'epoch': 0.22} + 22%|██▏ | 1936/8750 [3:09:54<10:49:49, 5.72s/it] 22%|██▏ | 1937/8750 [3:10:06<10:53:12, 5.75s/it] 22%|██▏ | 1937/8750 [3:10:00<10:53:13, 5.75s/it] {'loss': 0.4997, 'learning_rate': 1.814077899336502e-05, 'epoch': 0.22} + 22%|██▏ | 1937/8750 [3:10:06<10:53:12, 5.75s/it] {'loss': 0.4997, 'learning_rate': 1.814077899336502e-05, 'epoch': 0.22} + 22%|██▏ | 1937/8750 [3:10:00<10:53:13, 5.75s/it] 22%|██▏ | 1938/8750 [3:10:12<10:45:47, 5.69s/it] 22%|██▏ | 1938/8750 [3:10:05<10:45:48, 5.69s/it] {'loss': 0.4623, 'learning_rate': 1.813862867979962e-05, 'epoch': 0.22} + 22%|██▏ | 1938/8750 [3:10:12<10:45:47, 5.69s/it] {'loss': 0.4623, 
'learning_rate': 1.813862867979962e-05, 'epoch': 0.22} + 22%|██▏ | 1938/8750 [3:10:05<10:45:48, 5.69s/it] 22%|██▏ | 1939/8750 [3:10:18<10:53:18, 5.76s/it] 22%|██▏ | 1939/8750 [3:10:11<10:53:19, 5.76s/it] {'loss': 0.463, 'learning_rate': 1.8136477251060385e-05, 'epoch': 0.22} + 22%|██▏ | 1939/8750 [3:10:18<10:53:18, 5.76s/it] {'loss': 0.463, 'learning_rate': 1.8136477251060385e-05, 'epoch': 0.22} + 22%|██▏ | 1939/8750 [3:10:11<10:53:19, 5.76s/it] 22%|██▏ | 1940/8750 [3:10:24<10:52:21, 5.75s/it] 22%|██▏ | 1940/8750 [3:10:17<10:52:21, 5.75s/it] {'loss': 0.4888, 'learning_rate': 1.813432470744211e-05, 'epoch': 0.22} + 22%|██▏ | 1940/8750 [3:10:24<10:52:21, 5.75s/it] {'loss': 0.4888, 'learning_rate': 1.813432470744211e-05, 'epoch': 0.22} + 22%|██▏ | 1940/8750 [3:10:17<10:52:21, 5.75s/it] 22%|██▏ | 1941/8750 [3:10:29<10:48:05, 5.71s/it] 22%|██▏ | 1941/8750 [3:10:23<10:48:05, 5.71s/it] {'loss': 0.4848, 'learning_rate': 1.813217104923974e-05, 'epoch': 0.22} + 22%|██▏ | 1941/8750 [3:10:29<10:48:05, 5.71s/it] {'loss': 0.4848, 'learning_rate': 1.813217104923974e-05, 'epoch': 0.22} + 22%|██▏ | 1941/8750 [3:10:23<10:48:05, 5.71s/it] 22%|██▏ | 1942/8750 [3:10:35<10:43:37, 5.67s/it] 22%|██▏ | 1942/8750 [3:10:28<10:43:37, 5.67s/it] {'loss': 0.5043, 'learning_rate': 1.813001627674838e-05, 'epoch': 0.22} + 22%|██▏ | 1942/8750 [3:10:35<10:43:37, 5.67s/it] {'loss': 0.5043, 'learning_rate': 1.813001627674838e-05, 'epoch': 0.22} + 22%|██▏ | 1942/8750 [3:10:28<10:43:37, 5.67s/it] 22%|██▏ | 1943/8750 [3:10:40<10:46:39, 5.70s/it] 22%|██▏ | 1943/8750 [3:10:34<10:46:38, 5.70s/it] {'loss': 0.4762, 'learning_rate': 1.8127860390263275e-05, 'epoch': 0.22} + 22%|██▏ | 1943/8750 [3:10:40<10:46:39, 5.70s/it] {'loss': 0.4762, 'learning_rate': 1.8127860390263275e-05, 'epoch': 0.22} + 22%|██▏ | 1943/8750 [3:10:34<10:46:38, 5.70s/it] 22%|██▏ | 1944/8750 [3:10:46<10:42:52, 5.67s/it] 22%|██▏ | 1944/8750 [3:10:40<10:42:53, 5.67s/it] {'loss': 0.4681, 'learning_rate': 1.812570339007983e-05, 'epoch': 0.22} + 
22%|██▏ | 1944/8750 [3:10:46<10:42:52, 5.67s/it] {'loss': 0.4681, 'learning_rate': 1.812570339007983e-05, 'epoch': 0.22} + 22%|██▏ | 1944/8750 [3:10:40<10:42:53, 5.67s/it] 22%|██▏ | 1945/8750 [3:10:52<10:49:23, 5.73s/it] 22%|██▏ | 1945/8750 [3:10:45<10:49:23, 5.73s/it] {'loss': 0.4824, 'learning_rate': 1.8123545276493607e-05, 'epoch': 0.22} + 22%|██▏ | 1945/8750 [3:10:52<10:49:23, 5.73s/it] {'loss': 0.4824, 'learning_rate': 1.8123545276493607e-05, 'epoch': 0.22} + 22%|██▏ | 1945/8750 [3:10:45<10:49:23, 5.73s/it] 22%|██▏ | 1946/8750 [3:10:58<10:46:24, 5.70s/it] 22%|██▏ | 1946/8750 [3:10:51<10:46:23, 5.70s/it] {'loss': 0.4819, 'learning_rate': 1.8121386049800317e-05, 'epoch': 0.22} + 22%|██▏ | 1946/8750 [3:10:58<10:46:24, 5.70s/it] {'loss': 0.4819, 'learning_rate': 1.8121386049800317e-05, 'epoch': 0.22} + 22%|██▏ | 1946/8750 [3:10:51<10:46:23, 5.70s/it] 22%|██▏ | 1947/8750 [3:11:03<10:54:13, 5.77s/it] 22%|██▏ | 1947/8750 [3:10:57<10:54:13, 5.77s/it] {'loss': 0.4734, 'learning_rate': 1.8119225710295815e-05, 'epoch': 0.22} + 22%|██▏ | 1947/8750 [3:11:04<10:54:13, 5.77s/it] {'loss': 0.4734, 'learning_rate': 1.8119225710295815e-05, 'epoch': 0.22} + 22%|██▏ | 1947/8750 [3:10:57<10:54:13, 5.77s/it] 22%|██▏ | 1948/8750 [3:11:09<10:51:26, 5.75s/it] 22%|██▏ | 1948/8750 [3:11:03<10:51:26, 5.75s/it] {'loss': 0.4641, 'learning_rate': 1.811706425827612e-05, 'epoch': 0.22} + 22%|██▏ | 1948/8750 [3:11:09<10:51:26, 5.75s/it] {'loss': 0.4641, 'learning_rate': 1.811706425827612e-05, 'epoch': 0.22} + 22%|██▏ | 1948/8750 [3:11:03<10:51:26, 5.75s/it] 22%|██▏ | 1949/8750 [3:11:08<10:51:38, 5.75s/it] 22%|██▏ | 1949/8750 [3:11:15<10:51:38, 5.75s/it] {'loss': 0.4786, 'learning_rate': 1.8114901694037402e-05, 'epoch': 0.22} + 22%|██▏ | 1949/8750 [3:11:15<10:51:38, 5.75s/it] {'loss': 0.4786, 'learning_rate': 1.8114901694037402e-05, 'epoch': 0.22} + 22%|██▏ | 1949/8750 [3:11:08<10:51:38, 5.75s/it]12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... 
+98 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 6 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 22%|██▏ | 1950/8750 [3:11:21<10:48:47, 5.72s/it]13 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 22%|██▏ | 1950/8750 [3:11:14<10:48:47, 5.72s/it]1 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + {'loss': 0.484, 'learning_rate': 1.8112738017875974e-05, 'epoch': 0.22} + 22%|██▏ | 1950/8750 [3:11:21<10:48:47, 5.72s/it] {'loss': 0.484, 'learning_rate': 1.8112738017875974e-05, 'epoch': 0.22} + 22%|██▏ | 1950/8750 [3:11:14<10:48:47, 5.72s/it] 22%|██▏ | 1951/8750 [3:11:26<10:47:11, 5.71s/it] 22%|██▏ | 1951/8750 [3:11:20<10:47:11, 5.71s/it] {'loss': 0.5028, 'learning_rate': 1.811057323008831e-05, 'epoch': 0.22} + 22%|██▏ | 1951/8750 [3:11:26<10:47:11, 5.71s/it] {'loss': 0.5028, 'learning_rate': 1.811057323008831e-05, 'epoch': 0.22} + 22%|██▏ | 1951/8750 [3:11:20<10:47:11, 5.71s/it] 22%|██▏ | 1952/8750 [3:11:32<10:49:48, 5.74s/it] 22%|██▏ | 1952/8750 [3:11:26<10:49:48, 5.74s/it] {'loss': 0.481, 'learning_rate': 1.810840733097104e-05, 'epoch': 0.22} + 22%|██▏ | 1952/8750 [3:11:32<10:49:48, 5.74s/it] {'loss': 0.481, 'learning_rate': 1.810840733097104e-05, 'epoch': 0.22} + 22%|██▏ | 1952/8750 [3:11:26<10:49:48, 5.74s/it] 22%|██▏ | 1953/8750 [3:11:38<10:46:52, 5.71s/it] 22%|██▏ | 1953/8750 [3:11:31<10:46:52, 5.71s/it] {'loss': 0.4853, 'learning_rate': 1.8106240320820928e-05, 'epoch': 0.22} + 22%|██▏ | 1953/8750 [3:11:38<10:46:52, 5.71s/it] {'loss': 0.4853, 'learning_rate': 
1.8106240320820928e-05, 'epoch': 0.22} + 22%|██▏ | 1953/8750 [3:11:31<10:46:52, 5.71s/it] 22%|██▏ | 1954/8750 [3:11:43<10:46:01, 5.70s/it] 22%|██▏ | 1954/8750 [3:11:37<10:46:01, 5.70s/it] {'loss': 0.468, 'learning_rate': 1.8104072199934916e-05, 'epoch': 0.22} + 22%|██▏ | 1954/8750 [3:11:43<10:46:01, 5.70s/it] {'loss': 0.468, 'learning_rate': 1.8104072199934916e-05, 'epoch': 0.22} + 22%|██▏ | 1954/8750 [3:11:37<10:46:01, 5.70s/it] 22%|██▏ | 1955/8750 [3:11:49<10:52:22, 5.76s/it] 22%|██▏ | 1955/8750 [3:11:43<10:52:23, 5.76s/it] {'loss': 0.4989, 'learning_rate': 1.8101902968610082e-05, 'epoch': 0.22} + 22%|██▏ | 1955/8750 [3:11:49<10:52:22, 5.76s/it] {'loss': 0.4989, 'learning_rate': 1.8101902968610082e-05, 'epoch': 0.22} + 22%|██▏ | 1955/8750 [3:11:43<10:52:23, 5.76s/it] 22%|██▏ | 1956/8750 [3:11:49<10:50:21, 5.74s/it] 22%|██▏ | 1956/8750 [3:11:55<10:50:21, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.8099732627143655e-05, 'epoch': 0.22} + 22%|██▏ | 1956/8750 [3:11:55<10:50:21, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.8099732627143655e-05, 'epoch': 0.22} + 22%|██▏ | 1956/8750 [3:11:49<10:50:21, 5.74s/it] 22%|██▏ | 1957/8750 [3:12:01<10:51:33, 5.76s/it] 22%|██▏ | 1957/8750 [3:11:54<10:51:33, 5.76s/it] {'loss': 0.464, 'learning_rate': 1.809756117583302e-05, 'epoch': 0.22} + 22%|██▏ | 1957/8750 [3:12:01<10:51:33, 5.76s/it] {'loss': 0.464, 'learning_rate': 1.809756117583302e-05, 'epoch': 0.22} + 22%|██▏ | 1957/8750 [3:11:54<10:51:33, 5.76s/it] 22%|██▏ | 1958/8750 [3:12:00<10:48:32, 5.73s/it] 22%|██▏ | 1958/8750 [3:12:06<10:48:33, 5.73s/it] {'loss': 0.4789, 'learning_rate': 1.809538861497572e-05, 'epoch': 0.22} + 22%|██▏ | 1958/8750 [3:12:06<10:48:33, 5.73s/it] {'loss': 0.4789, 'learning_rate': 1.809538861497572e-05, 'epoch': 0.22} + 22%|██▏ | 1958/8750 [3:12:00<10:48:32, 5.73s/it] 22%|██▏ | 1959/8750 [3:12:12<10:46:48, 5.71s/it] 22%|██▏ | 1959/8750 [3:12:06<10:46:48, 5.71s/it] {'loss': 0.4867, 'learning_rate': 1.8093214944869437e-05, 'epoch': 0.22} + 22%|██▏ | 
1959/8750 [3:12:12<10:46:48, 5.71s/it] {'loss': 0.4867, 'learning_rate': 1.8093214944869437e-05, 'epoch': 0.22} + 22%|██▏ | 1959/8750 [3:12:06<10:46:48, 5.71s/it] 22%|██▏ | 1960/8750 [3:12:18<10:47:45, 5.72s/it] 22%|██▏ | 1960/8750 [3:12:11<10:47:45, 5.72s/it] {'loss': 0.4939, 'learning_rate': 1.8091040165812018e-05, 'epoch': 0.22} + 22%|██▏ | 1960/8750 [3:12:18<10:47:45, 5.72s/it] {'loss': 0.4939, 'learning_rate': 1.8091040165812018e-05, 'epoch': 0.22} + 22%|██▏ | 1960/8750 [3:12:11<10:47:45, 5.72s/it] 22%|██▏ | 1961/8750 [3:12:24<10:46:12, 5.71s/it] 22%|██▏ | 1961/8750 [3:12:17<10:46:12, 5.71s/it] {'loss': 0.4874, 'learning_rate': 1.8088864278101452e-05, 'epoch': 0.22} + 22%|██▏ | 1961/8750 [3:12:24<10:46:12, 5.71s/it] {'loss': 0.4874, 'learning_rate': 1.8088864278101452e-05, 'epoch': 0.22} + 22%|██▏ | 1961/8750 [3:12:17<10:46:12, 5.71s/it] 22%|██▏ | 1962/8750 [3:12:29<10:50:17, 5.75s/it] 22%|██▏ | 1962/8750 [3:12:23<10:50:17, 5.75s/it] {'loss': 0.4741, 'learning_rate': 1.808668728203589e-05, 'epoch': 0.22} + 22%|██▏ | 1962/8750 [3:12:29<10:50:17, 5.75s/it] {'loss': 0.4741, 'learning_rate': 1.808668728203589e-05, 'epoch': 0.22} + 22%|██▏ | 1962/8750 [3:12:23<10:50:17, 5.75s/it] 22%|██▏ | 1963/8750 [3:12:35<10:46:09, 5.71s/it] 22%|██▏ | 1963/8750 [3:12:29<10:46:09, 5.71s/it] {'loss': 0.4968, 'learning_rate': 1.8084509177913623e-05, 'epoch': 0.22} + 22%|██▏ | 1963/8750 [3:12:35<10:46:09, 5.71s/it] {'loss': 0.4968, 'learning_rate': 1.8084509177913623e-05, 'epoch': 0.22} + 22%|██▏ | 1963/8750 [3:12:29<10:46:09, 5.71s/it] 22%|██▏ | 1964/8750 [3:12:34<10:43:49, 5.69s/it] 22%|██▏ | 1964/8750 [3:12:41<10:43:49, 5.69s/it] {'loss': 0.4904, 'learning_rate': 1.8082329966033105e-05, 'epoch': 0.22} + 22%|██▏ | 1964/8750 [3:12:41<10:43:49, 5.69s/it] {'loss': 0.4904, 'learning_rate': 1.8082329966033105e-05, 'epoch': 0.22} + 22%|██▏ | 1964/8750 [3:12:34<10:43:49, 5.69s/it] 22%|██▏ | 1965/8750 [3:12:47<11:00:30, 5.84s/it] 22%|██▏ | 1965/8750 [3:12:40<11:00:30, 5.84s/it] {'loss': 
0.4782, 'learning_rate': 1.8080149646692932e-05, 'epoch': 0.22} + 22%|██▏ | 1965/8750 [3:12:47<11:00:30, 5.84s/it] {'loss': 0.4782, 'learning_rate': 1.8080149646692932e-05, 'epoch': 0.22} + 22%|██▏ | 1965/8750 [3:12:40<11:00:30, 5.84s/it] 22%|██▏ | 1966/8750 [3:12:52<10:52:55, 5.77s/it] 22%|██▏ | 1966/8750 [3:12:46<10:52:55, 5.77s/it] {'loss': 0.4839, 'learning_rate': 1.807796822019186e-05, 'epoch': 0.22} + 22%|██▏ | 1966/8750 [3:12:52<10:52:55, 5.77s/it] {'loss': 0.4839, 'learning_rate': 1.807796822019186e-05, 'epoch': 0.22} + 22%|██▏ | 1966/8750 [3:12:46<10:52:55, 5.77s/it] 22%|██▏ | 1967/8750 [3:12:58<10:45:50, 5.71s/it] 22%|██▏ | 1967/8750 [3:12:52<10:45:51, 5.71s/it] {'loss': 0.4976, 'learning_rate': 1.807578568682879e-05, 'epoch': 0.22} + 22%|██▏ | 1967/8750 [3:12:58<10:45:50, 5.71s/it] {'loss': 0.4976, 'learning_rate': 1.807578568682879e-05, 'epoch': 0.22} + 22%|██▏ | 1967/8750 [3:12:52<10:45:51, 5.71s/it] 22%|██▏ | 1968/8750 [3:13:04<10:45:36, 5.71s/it] 22%|██▏ | 1968/8750 [3:12:57<10:45:36, 5.71s/it] {'loss': 0.4823, 'learning_rate': 1.8073602046902784e-05, 'epoch': 0.22} + 22%|██▏ | 1968/8750 [3:13:04<10:45:36, 5.71s/it] {'loss': 0.4823, 'learning_rate': 1.8073602046902784e-05, 'epoch': 0.22} + 22%|██▏ | 1968/8750 [3:12:57<10:45:36, 5.71s/it] 23%|██▎ | 1969/8750 [3:13:10<10:47:43, 5.73s/it] 23%|██▎ | 1969/8750 [3:13:03<10:47:43, 5.73s/it] {'loss': 0.4928, 'learning_rate': 1.8071417300713038e-05, 'epoch': 0.23} + 23%|██▎ | 1969/8750 [3:13:10<10:47:43, 5.73s/it] {'loss': 0.4928, 'learning_rate': 1.8071417300713038e-05, 'epoch': 0.23} + 23%|██▎ | 1969/8750 [3:13:03<10:47:43, 5.73s/it] 23%|██▎ | 1970/8750 [3:13:15<10:49:26, 5.75s/it] 23%|██▎ | 1970/8750 [3:13:09<10:49:26, 5.75s/it] {'loss': 0.4787, 'learning_rate': 1.8069231448558923e-05, 'epoch': 0.23} + 23%|██▎ | 1970/8750 [3:13:15<10:49:26, 5.75s/it] {'loss': 0.4787, 'learning_rate': 1.8069231448558923e-05, 'epoch': 0.23} + 23%|██▎ | 1970/8750 [3:13:09<10:49:26, 5.75s/it] 23%|██▎ | 1971/8750 
[3:13:21<10:44:00, 5.70s/it] 23%|██▎ | 1971/8750 [3:13:14<10:44:00, 5.70s/it] {'loss': 0.4659, 'learning_rate': 1.806704449073994e-05, 'epoch': 0.23} + 23%|██▎ | 1971/8750 [3:13:21<10:44:00, 5.70s/it] {'loss': 0.4659, 'learning_rate': 1.806704449073994e-05, 'epoch': 0.23} + 23%|██▎ | 1971/8750 [3:13:14<10:44:00, 5.70s/it] 23%|██▎ | 1972/8750 [3:13:27<10:43:43, 5.70s/it] 23%|██▎ | 1972/8750 [3:13:20<10:43:43, 5.70s/it] {'loss': 0.477, 'learning_rate': 1.806485642755576e-05, 'epoch': 0.23} + 23%|██▎ | 1972/8750 [3:13:27<10:43:43, 5.70s/it] {'loss': 0.477, 'learning_rate': 1.806485642755576e-05, 'epoch': 0.23} + 23%|██▎ | 1972/8750 [3:13:20<10:43:43, 5.70s/it] 23%|██▎ | 1973/8750 [3:13:32<10:45:52, 5.72s/it] 23%|██▎ | 1973/8750 [3:13:26<10:45:52, 5.72s/it] {'loss': 0.4825, 'learning_rate': 1.8062667259306193e-05, 'epoch': 0.23} + 23%|██▎ | 1973/8750 [3:13:32<10:45:52, 5.72s/it] {'loss': 0.4825, 'learning_rate': 1.8062667259306193e-05, 'epoch': 0.23} + 23%|██▎ | 1973/8750 [3:13:26<10:45:52, 5.72s/it] 23%|██▎ | 1974/8750 [3:13:38<10:45:36, 5.72s/it] 23%|██▎ | 1974/8750 [3:13:32<10:45:36, 5.72s/it] {'loss': 0.4756, 'learning_rate': 1.80604769862912e-05, 'epoch': 0.23} + 23%|██▎ | 1974/8750 [3:13:38<10:45:36, 5.72s/it] {'loss': 0.4756, 'learning_rate': 1.80604769862912e-05, 'epoch': 0.23} + 23%|██▎ | 1974/8750 [3:13:32<10:45:36, 5.72s/it] 23%|██▎ | 1975/8750 [3:13:44<10:42:49, 5.69s/it] 23%|██▎ | 1975/8750 [3:13:37<10:42:49, 5.69s/it] {'loss': 0.4778, 'learning_rate': 1.8058285608810903e-05, 'epoch': 0.23} + 23%|██▎ | 1975/8750 [3:13:44<10:42:49, 5.69s/it] {'loss': 0.4778, 'learning_rate': 1.8058285608810903e-05, 'epoch': 0.23} + 23%|██▎ | 1975/8750 [3:13:37<10:42:49, 5.69s/it] 23%|██▎ | 1976/8750 [3:13:50<10:46:29, 5.73s/it] 23%|██▎ | 1976/8750 [3:13:43<10:46:29, 5.73s/it] {'loss': 0.5072, 'learning_rate': 1.8056093127165564e-05, 'epoch': 0.23} + 23%|██▎ | 1976/8750 [3:13:50<10:46:29, 5.73s/it] {'loss': 0.5072, 'learning_rate': 1.8056093127165564e-05, 'epoch': 0.23} + 
23%|██▎ | 1976/8750 [3:13:43<10:46:29, 5.73s/it] 23%|██▎ | 1977/8750 [3:13:55<10:44:19, 5.71s/it] 23%|██▎ | 1977/8750 [3:13:49<10:44:19, 5.71s/it] {'loss': 0.4728, 'learning_rate': 1.8053899541655605e-05, 'epoch': 0.23} + 23%|██▎ | 1977/8750 [3:13:55<10:44:19, 5.71s/it] {'loss': 0.4728, 'learning_rate': 1.8053899541655605e-05, 'epoch': 0.23} + 23%|██▎ | 1977/8750 [3:13:49<10:44:19, 5.71s/it] 23%|██▎ | 1978/8750 [3:14:01<10:51:25, 5.77s/it] 23%|██▎ | 1978/8750 [3:13:55<10:51:25, 5.77s/it] {'loss': 0.4835, 'learning_rate': 1.8051704852581595e-05, 'epoch': 0.23} + 23%|██▎ | 1978/8750 [3:14:01<10:51:25, 5.77s/it] {'loss': 0.4835, 'learning_rate': 1.8051704852581595e-05, 'epoch': 0.23} + 23%|██▎ | 1978/8750 [3:13:55<10:51:25, 5.77s/it] 23%|██▎ | 1979/8750 [3:14:07<10:48:08, 5.74s/it] 23%|██▎ | 1979/8750 [3:14:00<10:48:08, 5.74s/it] {'loss': 0.4829, 'learning_rate': 1.804950906024426e-05, 'epoch': 0.23} + 23%|██▎ | 1979/8750 [3:14:07<10:48:08, 5.74s/it] {'loss': 0.4829, 'learning_rate': 1.804950906024426e-05, 'epoch': 0.23} + 23%|██▎ | 1979/8750 [3:14:00<10:48:08, 5.74s/it] 23%|██▎ | 1980/8750 [3:14:06<10:41:54, 5.69s/it] 23%|██▎ | 1980/8750 [3:14:12<10:41:55, 5.69s/it] {'loss': 0.4627, 'learning_rate': 1.804731216494447e-05, 'epoch': 0.23} + 23%|██▎ | 1980/8750 [3:14:12<10:41:55, 5.69s/it] {'loss': 0.4627, 'learning_rate': 1.804731216494447e-05, 'epoch': 0.23} + 23%|██▎ | 1980/8750 [3:14:06<10:41:54, 5.69s/it] 23%|██▎ | 1981/8750 [3:14:18<10:49:56, 5.76s/it] 23%|██▎ | 1981/8750 [3:14:12<10:49:56, 5.76s/it] {'loss': 0.4922, 'learning_rate': 1.804511416698324e-05, 'epoch': 0.23} + 23%|██▎ | 1981/8750 [3:14:18<10:49:56, 5.76s/it] {'loss': 0.4922, 'learning_rate': 1.804511416698324e-05, 'epoch': 0.23} + 23%|██▎ | 1981/8750 [3:14:12<10:49:56, 5.76s/it] 23%|██▎ | 1982/8750 [3:14:24<10:44:01, 5.71s/it] 23%|██▎ | 1982/8750 [3:14:17<10:44:02, 5.71s/it] {'loss': 0.4687, 'learning_rate': 1.804291506666176e-05, 'epoch': 0.23} + 23%|██▎ | 1982/8750 [3:14:24<10:44:01, 5.71s/it] 
{'loss': 0.4687, 'learning_rate': 1.804291506666176e-05, 'epoch': 0.23} + 23%|██▎ | 1982/8750 [3:14:17<10:44:02, 5.71s/it] 23%|██▎ | 1983/8750 [3:14:29<10:38:54, 5.66s/it] 23%|██▎ | 1983/8750 [3:14:23<10:38:54, 5.66s/it] {'loss': 0.5074, 'learning_rate': 1.8040714864281347e-05, 'epoch': 0.23} + 23%|██▎ | 1983/8750 [3:14:29<10:38:54, 5.66s/it] {'loss': 0.5074, 'learning_rate': 1.8040714864281347e-05, 'epoch': 0.23} + 23%|██▎ | 1983/8750 [3:14:23<10:38:54, 5.66s/it] 23%|██▎ | 1984/8750 [3:14:35<10:43:04, 5.70s/it] 23%|██▎ | 1984/8750 [3:14:29<10:43:05, 5.70s/it] {'loss': 0.479, 'learning_rate': 1.8038513560143477e-05, 'epoch': 0.23} + 23%|██▎ | 1984/8750 [3:14:35<10:43:04, 5.70s/it] {'loss': 0.479, 'learning_rate': 1.8038513560143477e-05, 'epoch': 0.23} + 23%|██▎ | 1984/8750 [3:14:29<10:43:05, 5.70s/it] 23%|██▎ | 1985/8750 [3:14:41<10:42:07, 5.70s/it] 23%|██▎ | 1985/8750 [3:14:34<10:42:08, 5.70s/it] {'loss': 0.4841, 'learning_rate': 1.8036311154549783e-05, 'epoch': 0.23} + 23%|██▎ | 1985/8750 [3:14:41<10:42:07, 5.70s/it] {'loss': 0.4841, 'learning_rate': 1.8036311154549783e-05, 'epoch': 0.23} + 23%|██▎ | 1985/8750 [3:14:34<10:42:08, 5.70s/it] 23%|██▎ | 1986/8750 [3:14:47<10:41:16, 5.69s/it] 23%|██▎ | 1986/8750 [3:14:40<10:41:16, 5.69s/it] {'loss': 0.4791, 'learning_rate': 1.803410764780204e-05, 'epoch': 0.23} + 23%|██▎ | 1986/8750 [3:14:47<10:41:16, 5.69s/it] {'loss': 0.4791, 'learning_rate': 1.803410764780204e-05, 'epoch': 0.23} + 23%|██▎ | 1986/8750 [3:14:40<10:41:16, 5.69s/it] 23%|██▎ | 1987/8750 [3:14:52<10:41:59, 5.70s/it] 23%|██▎ | 1987/8750 [3:14:46<10:42:00, 5.70s/it] {'loss': 0.5005, 'learning_rate': 1.803190304020218e-05, 'epoch': 0.23} + 23%|██▎ | 1987/8750 [3:14:52<10:41:59, 5.70s/it] {'loss': 0.5005, 'learning_rate': 1.803190304020218e-05, 'epoch': 0.23} + 23%|██▎ | 1987/8750 [3:14:46<10:42:00, 5.70s/it] 23%|██▎ | 1988/8750 [3:14:58<10:45:59, 5.73s/it] 23%|██▎ | 1988/8750 [3:14:52<10:45:59, 5.73s/it] {'loss': 0.4771, 'learning_rate': 
1.8029697332052277e-05, 'epoch': 0.23} + 23%|██▎ | 1988/8750 [3:14:58<10:45:59, 5.73s/it] {'loss': 0.4771, 'learning_rate': 1.8029697332052277e-05, 'epoch': 0.23} + 23%|██▎ | 1988/8750 [3:14:52<10:45:59, 5.73s/it] 23%|██▎ | 1989/8750 [3:15:04<10:51:33, 5.78s/it] 23%|██▎ | 1989/8750 [3:14:58<10:51:33, 5.78s/it] {'loss': 0.4959, 'learning_rate': 1.8027490523654568e-05, 'epoch': 0.23} + 23%|██▎ | 1989/8750 [3:15:04<10:51:33, 5.78s/it] {'loss': 0.4959, 'learning_rate': 1.8027490523654568e-05, 'epoch': 0.23} + 23%|██▎ | 1989/8750 [3:14:58<10:51:33, 5.78s/it] 23%|██▎ | 1990/8750 [3:15:10<10:55:35, 5.82s/it] 23%|██▎ | 1990/8750 [3:15:03<10:55:35, 5.82s/it] {'loss': 0.4576, 'learning_rate': 1.8025282615311437e-05, 'epoch': 0.23} + 23%|██▎ | 1990/8750 [3:15:10<10:55:35, 5.82s/it] {'loss': 0.4576, 'learning_rate': 1.8025282615311437e-05, 'epoch': 0.23} + 23%|██▎ | 1990/8750 [3:15:03<10:55:35, 5.82s/it] 23%|██▎ | 1991/8750 [3:15:16<10:48:09, 5.75s/it] 23%|██▎ | 1991/8750 [3:15:09<10:48:09, 5.75s/it] {'loss': 0.4692, 'learning_rate': 1.802307360732541e-05, 'epoch': 0.23} + 23%|██▎ | 1991/8750 [3:15:16<10:48:09, 5.75s/it] {'loss': 0.4692, 'learning_rate': 1.802307360732541e-05, 'epoch': 0.23} + 23%|██▎ | 1991/8750 [3:15:09<10:48:09, 5.75s/it] 23%|██▎ | 1992/8750 [3:15:21<10:41:04, 5.69s/it] 23%|██▎ | 1992/8750 [3:15:15<10:41:04, 5.69s/it] {'loss': 0.4766, 'learning_rate': 1.8020863499999182e-05, 'epoch': 0.23} + 23%|██▎ | 1992/8750 [3:15:21<10:41:04, 5.69s/it] {'loss': 0.4766, 'learning_rate': 1.8020863499999182e-05, 'epoch': 0.23} + 23%|██▎ | 1992/8750 [3:15:15<10:41:04, 5.69s/it] 23%|██▎ | 1993/8750 [3:15:27<10:41:26, 5.70s/it] 23%|██▎ | 1993/8750 [3:15:20<10:41:26, 5.70s/it] {'loss': 0.4822, 'learning_rate': 1.801865229363557e-05, 'epoch': 0.23} + 23%|██▎ | 1993/8750 [3:15:27<10:41:26, 5.70s/it] {'loss': 0.4822, 'learning_rate': 1.801865229363557e-05, 'epoch': 0.23} + 23%|██▎ | 1993/8750 [3:15:20<10:41:26, 5.70s/it] 23%|██▎ | 1994/8750 [3:15:33<10:59:20, 5.86s/it] 23%|██▎ | 
1994/8750 [3:15:27<10:59:20, 5.86s/it] {'loss': 0.4648, 'learning_rate': 1.8016439988537576e-05, 'epoch': 0.23} + 23%|██▎ | 1994/8750 [3:15:33<10:59:20, 5.86s/it] {'loss': 0.4648, 'learning_rate': 1.8016439988537576e-05, 'epoch': 0.23} + 23%|██▎ | 1994/8750 [3:15:27<10:59:20, 5.86s/it] 23%|██▎ | 1995/8750 [3:15:39<10:50:42, 5.78s/it] 23%|██▎ | 1995/8750 [3:15:32<10:50:42, 5.78s/it] {'loss': 0.505, 'learning_rate': 1.8014226585008322e-05, 'epoch': 0.23} + 23%|██▎ | 1995/8750 [3:15:39<10:50:42, 5.78s/it] {'loss': 0.505, 'learning_rate': 1.8014226585008322e-05, 'epoch': 0.23} + 23%|██▎ | 1995/8750 [3:15:32<10:50:42, 5.78s/it] 23%|██▎ | 1996/8750 [3:15:44<10:45:51, 5.74s/it] 23%|██▎ | 1996/8750 [3:15:38<10:45:51, 5.74s/it] {'loss': 0.4659, 'learning_rate': 1.80120120833511e-05, 'epoch': 0.23} + 23%|██▎ | 1996/8750 [3:15:44<10:45:51, 5.74s/it] {'loss': 0.4659, 'learning_rate': 1.80120120833511e-05, 'epoch': 0.23} + 23%|██▎ | 1996/8750 [3:15:38<10:45:51, 5.74s/it] 23%|██▎ | 1997/8750 [3:15:50<10:50:22, 5.78s/it] 23%|██▎ | 1997/8750 [3:15:44<10:50:21, 5.78s/it] {'loss': 0.485, 'learning_rate': 1.8009796483869347e-05, 'epoch': 0.23} + 23%|██▎ | 1997/8750 [3:15:50<10:50:22, 5.78s/it] {'loss': 0.485, 'learning_rate': 1.8009796483869347e-05, 'epoch': 0.23} + 23%|██▎ | 1997/8750 [3:15:44<10:50:21, 5.78s/it] 23%|██▎ | 1998/8750 [3:15:56<10:44:10, 5.72s/it] 23%|██▎ | 1998/8750 [3:15:49<10:44:10, 5.72s/it] {'loss': 0.4654, 'learning_rate': 1.8007579786866648e-05, 'epoch': 0.23}{'loss': 0.4654, 'learning_rate': 1.8007579786866648e-05, 'epoch': 0.23} + + 23%|██▎ | 1998/8750 [3:15:56<10:44:10, 5.72s/it] 23%|██▎ | 1998/8750 [3:15:49<10:44:10, 5.72s/it] 23%|██▎ | 1999/8750 [3:16:01<10:45:34, 5.74s/it] 23%|██▎ | 1999/8750 [3:15:55<10:45:34, 5.74s/it] {'loss': 0.4911, 'learning_rate': 1.8005361992646736e-05, 'epoch': 0.23} + 23%|██▎ | 1999/8750 [3:16:01<10:45:34, 5.74s/it] {'loss': 0.4911, 'learning_rate': 1.8005361992646736e-05, 'epoch': 0.23} + 23%|██▎ | 1999/8750 [3:15:55<10:45:34, 
5.74s/it]12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend...8 + AutoResumeHook: Checking whether to suspend... + 23%|██▎ | 2000/8750 [3:16:07<10:45:40, 5.74s/it]4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 23%|██▎ | 2000/8750 [3:16:01<10:45:40, 5.74s/it]11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.5044, 'learning_rate': 1.8003143101513502e-05, 'epoch': 0.23} + 23%|██▎ | 2000/8750 [3:16:07<10:45:40, 5.74s/it] {'loss': 0.5044, 'learning_rate': 1.8003143101513502e-05, 'epoch': 0.23} + 23%|██▎ | 2000/8750 [3:16:01<10:45:40, 5.74s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 23%|██▎ | 2001/8750 [3:16:27<18:35:27, 9.92s/it] 23%|██▎ | 2001/8750 [3:16:20<18:35:27, 9.92s/it] {'loss': 0.4838, 'learning_rate': 1.8000923113770987e-05, 'epoch': 0.23} + 23%|██▎ | 2001/8750 [3:16:27<18:35:27, 9.92s/it] {'loss': 0.4838, 'learning_rate': 1.8000923113770987e-05, 'epoch': 0.23} + 23%|██▎ | 2001/8750 [3:16:20<18:35:27, 9.92s/it] 23%|██▎ | 2002/8750 [3:16:26<16:23:12, 8.74s/it] 23%|██▎ | 2002/8750 [3:16:33<16:23:14, 8.74s/it] {'loss': 0.454, 'learning_rate': 1.7998702029723372e-05, 'epoch': 0.23} + 23%|██▎ | 2002/8750 [3:16:33<16:23:14, 8.74s/it] {'loss': 0.454, 'learning_rate': 1.7998702029723372e-05, 'epoch': 0.23} + 23%|██▎ | 2002/8750 [3:16:26<16:23:12, 8.74s/it] 23%|██▎ | 2003/8750 [3:16:32<14:36:58, 7.80s/it] 23%|██▎ | 2003/8750 [3:16:38<14:37:00, 7.80s/it] {'loss': 0.4945, 'learning_rate': 1.7996479849675e-05, 'epoch': 0.23} + 23%|██▎ | 2003/8750 [3:16:38<14:37:00, 7.80s/it] {'loss': 0.4945, 'learning_rate': 1.7996479849675e-05, 'epoch': 0.23} + 23%|██▎ | 2003/8750 [3:16:32<14:36:58, 7.80s/it] 23%|██▎ | 2004/8750 [3:16:44<13:27:04, 7.18s/it] 23%|██▎ | 2004/8750 [3:16:38<13:27:05, 7.18s/it] {'loss': 0.4773, 'learning_rate': 1.799425657393036e-05, 'epoch': 0.23} + 23%|██▎ | 2004/8750 [3:16:44<13:27:04, 7.18s/it] {'loss': 0.4773, 'learning_rate': 1.799425657393036e-05, 'epoch': 0.23} + 23%|██▎ | 2004/8750 [3:16:38<13:27:05, 7.18s/it] 23%|██▎ | 2005/8750 [3:16:50<12:36:27, 6.73s/it] 23%|██▎ | 2005/8750 [3:16:43<12:36:28, 6.73s/it] {'loss': 0.4931, 'learning_rate': 1.7992032202794084e-05, 'epoch': 0.23} + 23%|██▎ | 2005/8750 [3:16:50<12:36:27, 6.73s/it] {'loss': 0.4931, 'learning_rate': 1.7992032202794084e-05, 'epoch': 0.23} + 23%|██▎ | 2005/8750 [3:16:43<12:36:28, 6.73s/it] 23%|██▎ | 2006/8750 [3:16:56<12:07:30, 6.47s/it] 23%|██▎ | 2006/8750 [3:16:49<12:07:30, 6.47s/it] {'loss': 0.4624, 'learning_rate': 1.798980673657097e-05, 'epoch': 0.23} + 23%|██▎ | 2006/8750 [3:16:56<12:07:30, 6.47s/it] {'loss': 0.4624, 'learning_rate': 
1.798980673657097e-05, 'epoch': 0.23} + 23%|██▎ | 2006/8750 [3:16:49<12:07:30, 6.47s/it] 23%|██▎ | 2007/8750 [3:17:01<11:39:50, 6.23s/it] 23%|██▎ | 2007/8750 [3:16:55<11:39:50, 6.23s/it] {'loss': 0.481, 'learning_rate': 1.7987580175565948e-05, 'epoch': 0.23} + 23%|██▎ | 2007/8750 [3:17:01<11:39:50, 6.23s/it] {'loss': 0.481, 'learning_rate': 1.7987580175565948e-05, 'epoch': 0.23} + 23%|██▎ | 2007/8750 [3:16:55<11:39:50, 6.23s/it] 23%|██▎ | 2008/8750 [3:17:07<11:23:23, 6.08s/it] 23%|██▎ | 2008/8750 [3:17:01<11:23:23, 6.08s/it] {'loss': 0.4772, 'learning_rate': 1.798535252008411e-05, 'epoch': 0.23} + 23%|██▎ | 2008/8750 [3:17:07<11:23:23, 6.08s/it] {'loss': 0.4772, 'learning_rate': 1.798535252008411e-05, 'epoch': 0.23} + 23%|██▎ | 2008/8750 [3:17:01<11:23:23, 6.08s/it] 23%|██▎ | 2009/8750 [3:17:13<11:09:42, 5.96s/it] 23%|██▎ | 2009/8750 [3:17:06<11:09:42, 5.96s/it] {'loss': 0.4951, 'learning_rate': 1.7983123770430696e-05, 'epoch': 0.23} + 23%|██▎ | 2009/8750 [3:17:13<11:09:42, 5.96s/it] {'loss': 0.4951, 'learning_rate': 1.7983123770430696e-05, 'epoch': 0.23} + 23%|██▎ | 2009/8750 [3:17:06<11:09:42, 5.96s/it] 23%|██▎ | 2010/8750 [3:17:19<11:01:46, 5.89s/it] 23%|██▎ | 2010/8750 [3:17:12<11:01:46, 5.89s/it] {'loss': 0.4809, 'learning_rate': 1.7980893926911092e-05, 'epoch': 0.23} + 23%|██▎ | 2010/8750 [3:17:19<11:01:46, 5.89s/it] {'loss': 0.4809, 'learning_rate': 1.7980893926911092e-05, 'epoch': 0.23} + 23%|██▎ | 2010/8750 [3:17:12<11:01:46, 5.89s/it] 23%|██▎ | 2011/8750 [3:17:24<10:57:47, 5.86s/it] 23%|██▎ | 2011/8750 [3:17:18<10:57:48, 5.86s/it] {'loss': 0.4861, 'learning_rate': 1.7978662989830834e-05, 'epoch': 0.23} + 23%|██▎ | 2011/8750 [3:17:24<10:57:47, 5.86s/it] {'loss': 0.4861, 'learning_rate': 1.7978662989830834e-05, 'epoch': 0.23} + 23%|██▎ | 2011/8750 [3:17:18<10:57:48, 5.86s/it] 23%|██▎ | 2012/8750 [3:17:30<10:51:59, 5.81s/it] 23%|██▎ | 2012/8750 [3:17:24<10:52:00, 5.81s/it] {'loss': 0.4623, 'learning_rate': 1.7976430959495617e-05, 'epoch': 0.23} + 23%|██▎ | 
2012/8750 [3:17:30<10:51:59, 5.81s/it] {'loss': 0.4623, 'learning_rate': 1.7976430959495617e-05, 'epoch': 0.23} + 23%|██▎ | 2012/8750 [3:17:24<10:52:00, 5.81s/it] 23%|██▎ | 2013/8750 [3:17:36<10:55:17, 5.84s/it] 23%|██▎ | 2013/8750 [3:17:29<10:55:17, 5.84s/it] {'loss': 0.4759, 'learning_rate': 1.7974197836211275e-05, 'epoch': 0.23} + 23%|██▎ | 2013/8750 [3:17:36<10:55:17, 5.84s/it] {'loss': 0.4759, 'learning_rate': 1.7974197836211275e-05, 'epoch': 0.23} + 23%|██▎ | 2013/8750 [3:17:29<10:55:17, 5.84s/it] 23%|██▎ | 2014/8750 [3:17:42<10:52:19, 5.81s/it] 23%|██▎ | 2014/8750 [3:17:35<10:52:19, 5.81s/it] {'loss': 0.4842, 'learning_rate': 1.7971963620283795e-05, 'epoch': 0.23} + 23%|██▎ | 2014/8750 [3:17:42<10:52:19, 5.81s/it] {'loss': 0.4842, 'learning_rate': 1.7971963620283795e-05, 'epoch': 0.23} + 23%|██▎ | 2014/8750 [3:17:35<10:52:19, 5.81s/it] 23%|██▎ | 2015/8750 [3:17:48<10:52:49, 5.82s/it] 23%|██▎ | 2015/8750 [3:17:41<10:52:48, 5.82s/it] {'loss': 0.4847, 'learning_rate': 1.7969728312019316e-05, 'epoch': 0.23} + 23%|██▎ | 2015/8750 [3:17:48<10:52:49, 5.82s/it] {'loss': 0.4847, 'learning_rate': 1.7969728312019316e-05, 'epoch': 0.23} + 23%|██▎ | 2015/8750 [3:17:41<10:52:48, 5.82s/it] 23%|██▎ | 2016/8750 [3:17:47<10:44:50, 5.75s/it] 23%|██▎ | 2016/8750 [3:17:53<10:44:50, 5.75s/it] {'loss': 0.484, 'learning_rate': 1.7967491911724125e-05, 'epoch': 0.23} + 23%|██▎ | 2016/8750 [3:17:53<10:44:50, 5.75s/it] {'loss': 0.484, 'learning_rate': 1.7967491911724125e-05, 'epoch': 0.23} + 23%|██▎ | 2016/8750 [3:17:47<10:44:50, 5.75s/it] 23%|██▎ | 2017/8750 [3:17:59<10:41:59, 5.72s/it] 23%|██▎ | 2017/8750 [3:17:52<10:41:59, 5.72s/it] {'loss': 0.4841, 'learning_rate': 1.796525441970466e-05, 'epoch': 0.23} + 23%|██▎ | 2017/8750 [3:17:59<10:41:59, 5.72s/it] {'loss': 0.4841, 'learning_rate': 1.796525441970466e-05, 'epoch': 0.23} + 23%|██▎ | 2017/8750 [3:17:52<10:41:59, 5.72s/it] 23%|██▎ | 2018/8750 [3:18:04<10:35:47, 5.67s/it] 23%|██▎ | 2018/8750 [3:17:58<10:35:47, 5.67s/it] {'loss': 
0.4862, 'learning_rate': 1.7963015836267502e-05, 'epoch': 0.23} + 23%|██▎ | 2018/8750 [3:18:04<10:35:47, 5.67s/it] {'loss': 0.4862, 'learning_rate': 1.7963015836267502e-05, 'epoch': 0.23} + 23%|██▎ | 2018/8750 [3:17:58<10:35:47, 5.67s/it] 23%|██▎ | 2019/8750 [3:18:10<10:37:08, 5.68s/it] 23%|██▎ | 2019/8750 [3:18:04<10:37:09, 5.68s/it] {'loss': 0.4713, 'learning_rate': 1.7960776161719396e-05, 'epoch': 0.23} + 23%|██▎ | 2019/8750 [3:18:10<10:37:08, 5.68s/it] {'loss': 0.4713, 'learning_rate': 1.7960776161719396e-05, 'epoch': 0.23} + 23%|██▎ | 2019/8750 [3:18:04<10:37:09, 5.68s/it] 23%|██▎ | 2020/8750 [3:18:16<10:43:34, 5.74s/it] 23%|██▎ | 2020/8750 [3:18:09<10:43:34, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.7958535396367218e-05, 'epoch': 0.23} + 23%|██▎ | 2020/8750 [3:18:16<10:43:34, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.7958535396367218e-05, 'epoch': 0.23} + 23%|██▎ | 2020/8750 [3:18:09<10:43:34, 5.74s/it] 23%|██▎ | 2021/8750 [3:18:21<10:34:32, 5.66s/it] 23%|██▎ | 2021/8750 [3:18:15<10:34:32, 5.66s/it] {'loss': 0.4947, 'learning_rate': 1.795629354051801e-05, 'epoch': 0.23} + 23%|██▎ | 2021/8750 [3:18:21<10:34:32, 5.66s/it] {'loss': 0.4947, 'learning_rate': 1.795629354051801e-05, 'epoch': 0.23} + 23%|██▎ | 2021/8750 [3:18:15<10:34:32, 5.66s/it] 23%|██▎ | 2022/8750 [3:18:21<10:41:52, 5.72s/it] 23%|██▎ | 2022/8750 [3:18:27<10:41:53, 5.72s/it] {'loss': 0.4709, 'learning_rate': 1.7954050594478952e-05, 'epoch': 0.23} + 23%|██▎ | 2022/8750 [3:18:27<10:41:53, 5.72s/it] {'loss': 0.4709, 'learning_rate': 1.7954050594478952e-05, 'epoch': 0.23} + 23%|██▎ | 2022/8750 [3:18:21<10:41:52, 5.72s/it] 23%|██▎ | 2023/8750 [3:18:26<10:36:56, 5.68s/it] 23%|██▎ | 2023/8750 [3:18:33<10:36:57, 5.68s/it] {'loss': 0.4753, 'learning_rate': 1.795180655855738e-05, 'epoch': 0.23} + 23%|██▎ | 2023/8750 [3:18:33<10:36:57, 5.68s/it] {'loss': 0.4753, 'learning_rate': 1.795180655855738e-05, 'epoch': 0.23} + 23%|██▎ | 2023/8750 [3:18:26<10:36:56, 5.68s/it] 23%|██▎ | 2024/8750 
[3:18:39<10:37:27, 5.69s/it] 23%|██▎ | 2024/8750 [3:18:32<10:37:28, 5.69s/it] {'loss': 0.4592, 'learning_rate': 1.7949561433060775e-05, 'epoch': 0.23} + 23%|██▎ | 2024/8750 [3:18:39<10:37:27, 5.69s/it] {'loss': 0.4592, 'learning_rate': 1.7949561433060775e-05, 'epoch': 0.23} + 23%|██▎ | 2024/8750 [3:18:32<10:37:28, 5.69s/it] 23%|██▎ | 2025/8750 [3:18:44<10:37:11, 5.68s/it] 23%|██▎ | 2025/8750 [3:18:38<10:37:11, 5.68s/it] {'loss': 0.479, 'learning_rate': 1.794731521829677e-05, 'epoch': 0.23} + 23%|██▎ | 2025/8750 [3:18:44<10:37:11, 5.68s/it] {'loss': 0.479, 'learning_rate': 1.794731521829677e-05, 'epoch': 0.23} + 23%|██▎ | 2025/8750 [3:18:38<10:37:11, 5.68s/it] 23%|██▎ | 2026/8750 [3:18:50<10:37:59, 5.69s/it] 23%|██▎ | 2026/8750 [3:18:43<10:37:59, 5.69s/it] {'loss': 0.4769, 'learning_rate': 1.7945067914573147e-05, 'epoch': 0.23} + 23%|██▎ | 2026/8750 [3:18:50<10:37:59, 5.69s/it] {'loss': 0.4769, 'learning_rate': 1.7945067914573147e-05, 'epoch': 0.23} + 23%|██▎ | 2026/8750 [3:18:43<10:37:59, 5.69s/it] 23%|██▎ | 2027/8750 [3:18:56<10:36:19, 5.68s/it] 23%|██▎ | 2027/8750 [3:18:49<10:36:20, 5.68s/it] {'loss': 0.4883, 'learning_rate': 1.7942819522197837e-05, 'epoch': 0.23} + {'loss': 0.4883, 'learning_rate': 1.7942819522197837e-05, 'epoch': 0.23} 23%|██▎ | 2027/8750 [3:18:56<10:36:19, 5.68s/it] + 23%|██▎ | 2027/8750 [3:18:49<10:36:20, 5.68s/it] 23%|██▎ | 2028/8750 [3:19:01<10:36:57, 5.69s/it] 23%|██▎ | 2028/8750 [3:18:55<10:36:58, 5.69s/it] {'loss': 0.4726, 'learning_rate': 1.794057004147892e-05, 'epoch': 0.23} + 23%|██▎ | 2028/8750 [3:19:01<10:36:57, 5.69s/it] {'loss': 0.4726, 'learning_rate': 1.794057004147892e-05, 'epoch': 0.23} + 23%|██▎ | 2028/8750 [3:18:55<10:36:58, 5.69s/it] 23%|██▎ | 2029/8750 [3:19:07<10:33:13, 5.65s/it] 23%|██▎ | 2029/8750 [3:19:00<10:33:13, 5.65s/it] {'loss': 0.4931, 'learning_rate': 1.793831947272463e-05, 'epoch': 0.23} + 23%|██▎ | 2029/8750 [3:19:07<10:33:13, 5.65s/it] {'loss': 0.4931, 'learning_rate': 1.793831947272463e-05, 'epoch': 0.23} + 
23%|██▎ | 2029/8750 [3:19:00<10:33:13, 5.65s/it] 23%|██▎ | 2030/8750 [3:19:13<10:52:28, 5.83s/it] 23%|██▎ | 2030/8750 [3:19:07<10:52:28, 5.83s/it] {'loss': 0.4699, 'learning_rate': 1.793606781624333e-05, 'epoch': 0.23} + 23%|██▎ | 2030/8750 [3:19:13<10:52:28, 5.83s/it] {'loss': 0.4699, 'learning_rate': 1.793606781624333e-05, 'epoch': 0.23} + 23%|██▎ | 2030/8750 [3:19:07<10:52:28, 5.83s/it] 23%|██▎ | 2031/8750 [3:19:19<10:46:44, 5.78s/it] 23%|██▎ | 2031/8750 [3:19:12<10:46:43, 5.78s/it] {'loss': 0.4965, 'learning_rate': 1.7933815072343565e-05, 'epoch': 0.23} + 23%|██▎ | 2031/8750 [3:19:19<10:46:44, 5.78s/it] {'loss': 0.4965, 'learning_rate': 1.7933815072343565e-05, 'epoch': 0.23} + 23%|██▎ | 2031/8750 [3:19:12<10:46:43, 5.78s/it] 23%|██▎ | 2032/8750 [3:19:24<10:44:04, 5.75s/it] 23%|██▎ | 2032/8750 [3:19:18<10:44:04, 5.75s/it] {'loss': 0.4694, 'learning_rate': 1.7931561241333998e-05, 'epoch': 0.23} + 23%|██▎ | 2032/8750 [3:19:24<10:44:04, 5.75s/it] {'loss': 0.4694, 'learning_rate': 1.7931561241333998e-05, 'epoch': 0.23} + 23%|██▎ | 2032/8750 [3:19:18<10:44:04, 5.75s/it] 23%|██▎ | 2033/8750 [3:19:30<10:38:45, 5.71s/it] 23%|██▎ | 2033/8750 [3:19:24<10:38:45, 5.71s/it] {'loss': 0.4896, 'learning_rate': 1.7929306323523463e-05, 'epoch': 0.23} + 23%|██▎ | 2033/8750 [3:19:30<10:38:45, 5.71s/it] {'loss': 0.4896, 'learning_rate': 1.7929306323523463e-05, 'epoch': 0.23} + 23%|██▎ | 2033/8750 [3:19:24<10:38:45, 5.71s/it] 23%|██▎ | 2034/8750 [3:19:36<10:33:56, 5.66s/it] 23%|██▎ | 2034/8750 [3:19:29<10:33:56, 5.66s/it] {'loss': 0.4737, 'learning_rate': 1.792705031922093e-05, 'epoch': 0.23} + 23%|██▎ | 2034/8750 [3:19:36<10:33:56, 5.66s/it] {'loss': 0.4737, 'learning_rate': 1.792705031922093e-05, 'epoch': 0.23} + 23%|██▎ | 2034/8750 [3:19:29<10:33:56, 5.66s/it] 23%|██▎ | 2035/8750 [3:19:41<10:39:07, 5.71s/it] 23%|██▎ | 2035/8750 [3:19:35<10:39:08, 5.71s/it] {'loss': 0.4702, 'learning_rate': 1.792479322873552e-05, 'epoch': 0.23} + 23%|██▎ | 2035/8750 [3:19:41<10:39:07, 5.71s/it] 
{'loss': 0.4702, 'learning_rate': 1.792479322873552e-05, 'epoch': 0.23} + 23%|██▎ | 2035/8750 [3:19:35<10:39:08, 5.71s/it] 23%|██▎ | 2036/8750 [3:19:47<10:49:06, 5.80s/it] 23%|██▎ | 2036/8750 [3:19:41<10:49:06, 5.80s/it] {'loss': 0.5163, 'learning_rate': 1.792253505237651e-05, 'epoch': 0.23} + 23%|██▎ | 2036/8750 [3:19:47<10:49:06, 5.80s/it] {'loss': 0.5163, 'learning_rate': 1.792253505237651e-05, 'epoch': 0.23} + 23%|██▎ | 2036/8750 [3:19:41<10:49:06, 5.80s/it] 23%|██▎ | 2037/8750 [3:19:53<10:42:52, 5.75s/it] 23%|██▎ | 2037/8750 [3:19:47<10:42:51, 5.75s/it] {'loss': 0.4697, 'learning_rate': 1.7920275790453318e-05, 'epoch': 0.23} + 23%|██▎ | 2037/8750 [3:19:53<10:42:52, 5.75s/it] {'loss': 0.4697, 'learning_rate': 1.7920275790453318e-05, 'epoch': 0.23} + 23%|██▎ | 2037/8750 [3:19:47<10:42:51, 5.75s/it] 23%|██▎ | 2038/8750 [3:19:59<10:42:33, 5.74s/it] 23%|██▎ | 2038/8750 [3:19:52<10:42:33, 5.74s/it] {'loss': 0.4757, 'learning_rate': 1.7918015443275517e-05, 'epoch': 0.23} + 23%|██▎ | 2038/8750 [3:19:59<10:42:33, 5.74s/it] {'loss': 0.4757, 'learning_rate': 1.7918015443275517e-05, 'epoch': 0.23} + 23%|██▎ | 2038/8750 [3:19:52<10:42:33, 5.74s/it] 23%|██▎ | 2039/8750 [3:20:05<10:43:27, 5.75s/it] 23%|██▎ | 2039/8750 [3:19:58<10:43:28, 5.75s/it] {'loss': 0.4824, 'learning_rate': 1.7915754011152815e-05, 'epoch': 0.23} + 23%|██▎ | 2039/8750 [3:20:05<10:43:27, 5.75s/it] {'loss': 0.4824, 'learning_rate': 1.7915754011152815e-05, 'epoch': 0.23} + 23%|██▎ | 2039/8750 [3:19:58<10:43:28, 5.75s/it] 23%|██▎ | 2040/8750 [3:20:10<10:44:14, 5.76s/it] 23%|██▎ | 2040/8750 [3:20:04<10:44:14, 5.76s/it] {'loss': 0.4852, 'learning_rate': 1.791349149439509e-05, 'epoch': 0.23} + 23%|██▎ | 2040/8750 [3:20:10<10:44:14, 5.76s/it] {'loss': 0.4852, 'learning_rate': 1.791349149439509e-05, 'epoch': 0.23} + 23%|██▎ | 2040/8750 [3:20:04<10:44:14, 5.76s/it] 23%|██▎ | 2041/8750 [3:20:16<10:47:08, 5.79s/it] 23%|██▎ | 2041/8750 [3:20:10<10:47:08, 5.79s/it] {'loss': 0.4749, 'learning_rate': 
1.7911227893312347e-05, 'epoch': 0.23} + 23%|██▎ | 2041/8750 [3:20:16<10:47:08, 5.79s/it] {'loss': 0.4749, 'learning_rate': 1.7911227893312347e-05, 'epoch': 0.23} + 23%|██▎ | 2041/8750 [3:20:10<10:47:08, 5.79s/it] 23%|██▎ | 2042/8750 [3:20:22<10:41:37, 5.74s/it] 23%|██▎ | 2042/8750 [3:20:15<10:41:37, 5.74s/it] {'loss': 0.485, 'learning_rate': 1.790896320821476e-05, 'epoch': 0.23} + 23%|██▎ | 2042/8750 [3:20:22<10:41:37, 5.74s/it] {'loss': 0.485, 'learning_rate': 1.790896320821476e-05, 'epoch': 0.23} + 23%|██▎ | 2042/8750 [3:20:15<10:41:37, 5.74s/it] 23%|██▎ | 2043/8750 [3:20:28<10:40:43, 5.73s/it] 23%|██▎ | 2043/8750 [3:20:21<10:40:43, 5.73s/it] {'loss': 0.4604, 'learning_rate': 1.7906697439412634e-05, 'epoch': 0.23} + 23%|██▎ | 2043/8750 [3:20:28<10:40:43, 5.73s/it] {'loss': 0.4604, 'learning_rate': 1.7906697439412634e-05, 'epoch': 0.23} + 23%|██▎ | 2043/8750 [3:20:21<10:40:43, 5.73s/it] 23%|██▎ | 2044/8750 [3:20:33<10:34:59, 5.68s/it] 23%|██▎ | 2044/8750 [3:20:27<10:35:00, 5.68s/it] {'loss': 0.4646, 'learning_rate': 1.790443058721643e-05, 'epoch': 0.23} + 23%|██▎ | 2044/8750 [3:20:33<10:34:59, 5.68s/it] {'loss': 0.4646, 'learning_rate': 1.790443058721643e-05, 'epoch': 0.23} + 23%|██▎ | 2044/8750 [3:20:27<10:35:00, 5.68s/it] 23%|██▎ | 2045/8750 [3:20:39<10:37:24, 5.70s/it] 23%|██▎ | 2045/8750 [3:20:32<10:37:23, 5.70s/it] {'loss': 0.4981, 'learning_rate': 1.7902162651936766e-05, 'epoch': 0.23} + 23%|██▎ | 2045/8750 [3:20:39<10:37:24, 5.70s/it] {'loss': 0.4981, 'learning_rate': 1.7902162651936766e-05, 'epoch': 0.23} + 23%|██▎ | 2045/8750 [3:20:32<10:37:23, 5.70s/it] 23%|██▎ | 2046/8750 [3:20:38<10:46:48, 5.79s/it] 23%|██▎ | 2046/8750 [3:20:45<10:46:50, 5.79s/it] {'loss': 0.4722, 'learning_rate': 1.789989363388439e-05, 'epoch': 0.23} + 23%|██▎ | 2046/8750 [3:20:45<10:46:50, 5.79s/it] {'loss': 0.4722, 'learning_rate': 1.789989363388439e-05, 'epoch': 0.23} + 23%|██▎ | 2046/8750 [3:20:38<10:46:48, 5.79s/it] 23%|██▎ | 2047/8750 [3:20:44<10:42:53, 5.75s/it] 23%|██▎ | 
2047/8750 [3:20:51<10:42:54, 5.75s/it] {'loss': 0.4707, 'learning_rate': 1.7897623533370212e-05, 'epoch': 0.23} + 23%|██▎ | 2047/8750 [3:20:51<10:42:54, 5.75s/it] {'loss': 0.4707, 'learning_rate': 1.7897623533370212e-05, 'epoch': 0.23} + 23%|██▎ | 2047/8750 [3:20:44<10:42:53, 5.75s/it] 23%|██▎ | 2048/8750 [3:20:50<10:53:35, 5.85s/it] 23%|██▎ | 2048/8750 [3:20:57<10:53:36, 5.85s/it] {'loss': 0.4786, 'learning_rate': 1.7895352350705288e-05, 'epoch': 0.23} + 23%|██▎ | 2048/8750 [3:20:57<10:53:36, 5.85s/it] {'loss': 0.4786, 'learning_rate': 1.7895352350705288e-05, 'epoch': 0.23} + 23%|██▎ | 2048/8750 [3:20:50<10:53:35, 5.85s/it] 23%|██▎ | 2049/8750 [3:20:56<10:46:15, 5.79s/it] 23%|██▎ | 2049/8750 [3:21:02<10:46:15, 5.79s/it] {'loss': 0.4885, 'learning_rate': 1.7893080086200817e-05, 'epoch': 0.23} + 23%|██▎ | 2049/8750 [3:21:02<10:46:15, 5.79s/it] {'loss': 0.4885, 'learning_rate': 1.7893080086200817e-05, 'epoch': 0.23} + 23%|██▎ | 2049/8750 [3:20:56<10:46:15, 5.79s/it]12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +119 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +01 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 23%|██▎ | 2050/8750 [3:21:08<10:50:08, 5.82s/it]13 AutoResumeHook: Checking whether to suspend... + +3 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 23%|██▎ | 2050/8750 [3:21:02<10:50:10, 5.82s/it]6 AutoResumeHook: Checking whether to suspend... +1510 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4913, 'learning_rate': 1.789080674016815e-05, 'epoch': 0.23} + 23%|██▎ | 2050/8750 [3:21:08<10:50:08, 5.82s/it] {'loss': 0.4913, 'learning_rate': 1.789080674016815e-05, 'epoch': 0.23} + 23%|██▎ | 2050/8750 [3:21:02<10:50:10, 5.82s/it] 23%|██▎ | 2051/8750 [3:21:07<10:43:07, 5.76s/it] 23%|██▎ | 2051/8750 [3:21:14<10:43:08, 5.76s/it] {'loss': 0.498, 'learning_rate': 1.7888532312918793e-05, 'epoch': 0.23} + 23%|██▎ | 2051/8750 [3:21:14<10:43:08, 5.76s/it] {'loss': 0.498, 'learning_rate': 1.7888532312918793e-05, 'epoch': 0.23} + 23%|██▎ | 2051/8750 [3:21:07<10:43:07, 5.76s/it] 23%|██▎ | 2052/8750 [3:21:19<10:39:53, 5.73s/it] 23%|██▎ | 2052/8750 [3:21:13<10:39:54, 5.73s/it] {'loss': 0.4651, 'learning_rate': 1.7886256804764385e-05, 'epoch': 0.23} + 23%|██▎ | 2052/8750 [3:21:19<10:39:53, 5.73s/it] {'loss': 0.4651, 'learning_rate': 1.7886256804764385e-05, 'epoch': 0.23} + 23%|██▎ | 2052/8750 [3:21:13<10:39:54, 5.73s/it] 23%|██▎ | 2053/8750 [3:21:25<10:40:58, 5.74s/it] 23%|██▎ | 2053/8750 [3:21:19<10:40:59, 5.74s/it] {'loss': 0.4745, 'learning_rate': 1.7883980216016724e-05, 'epoch': 0.23} + 23%|██▎ | 2053/8750 [3:21:25<10:40:58, 5.74s/it] {'loss': 0.4745, 'learning_rate': 1.7883980216016724e-05, 'epoch': 0.23} + 23%|██▎ | 2053/8750 [3:21:19<10:40:59, 5.74s/it] 23%|██▎ | 2054/8750 [3:21:24<10:34:59, 5.69s/it] 23%|██▎ | 2054/8750 [3:21:31<10:35:00, 5.69s/it] {'loss': 0.4867, 'learning_rate': 1.788170254698776e-05, 'epoch': 0.23} + 23%|██▎ | 2054/8750 [3:21:31<10:35:00, 5.69s/it] {'loss': 0.4867, 'learning_rate': 1.788170254698776e-05, 'epoch': 0.23} + 23%|██▎ | 2054/8750 [3:21:24<10:34:59, 5.69s/it] 23%|██▎ | 2055/8750 [3:21:30<10:34:35, 5.69s/it] 23%|██▎ | 2055/8750 [3:21:36<10:34:36, 5.69s/it] {'loss': 0.455, 'learning_rate': 1.7879423797989573e-05, 'epoch': 0.23} + 23%|██▎ | 2055/8750 [3:21:36<10:34:36, 5.69s/it] {'loss': 0.455, 'learning_rate': 1.7879423797989573e-05, 'epoch': 0.23} + 23%|██▎ | 2055/8750 [3:21:30<10:34:35, 5.69s/it] 23%|██▎ | 2056/8750 
[3:21:36<10:30:34, 5.65s/it] 23%|██▎ | 2056/8750 [3:21:42<10:30:34, 5.65s/it] {'loss': 0.4727, 'learning_rate': 1.787714396933441e-05, 'epoch': 0.23} + 23%|██▎ | 2056/8750 [3:21:42<10:30:34, 5.65s/it] {'loss': 0.4727, 'learning_rate': 1.787714396933441e-05, 'epoch': 0.23} + 23%|██▎ | 2056/8750 [3:21:36<10:30:34, 5.65s/it] 24%|██▎ | 2057/8750 [3:21:48<10:41:09, 5.75s/it] 24%|██▎ | 2057/8750 [3:21:41<10:41:10, 5.75s/it] {'loss': 0.4833, 'learning_rate': 1.7874863061334658e-05, 'epoch': 0.24} + 24%|██▎ | 2057/8750 [3:21:48<10:41:09, 5.75s/it] {'loss': 0.4833, 'learning_rate': 1.7874863061334658e-05, 'epoch': 0.24} + 24%|██▎ | 2057/8750 [3:21:41<10:41:10, 5.75s/it] 24%|██▎ | 2058/8750 [3:21:47<10:46:18, 5.79s/it] 24%|██▎ | 2058/8750 [3:21:54<10:46:18, 5.79s/it] {'loss': 0.494, 'learning_rate': 1.7872581074302852e-05, 'epoch': 0.24} + 24%|██▎ | 2058/8750 [3:21:54<10:46:18, 5.79s/it] {'loss': 0.494, 'learning_rate': 1.7872581074302852e-05, 'epoch': 0.24} + 24%|██▎ | 2058/8750 [3:21:47<10:46:18, 5.79s/it] 24%|██▎ | 2059/8750 [3:21:53<10:47:20, 5.80s/it] 24%|██▎ | 2059/8750 [3:22:00<10:47:20, 5.80s/it] {'loss': 0.4881, 'learning_rate': 1.7870298008551674e-05, 'epoch': 0.24} + 24%|██▎ | 2059/8750 [3:22:00<10:47:20, 5.80s/it] {'loss': 0.4881, 'learning_rate': 1.7870298008551674e-05, 'epoch': 0.24} + 24%|██▎ | 2059/8750 [3:21:53<10:47:20, 5.80s/it] 24%|██▎ | 2060/8750 [3:22:05<10:41:19, 5.75s/it] 24%|██▎ | 2060/8750 [3:21:59<10:41:19, 5.75s/it] {'loss': 0.4843, 'learning_rate': 1.786801386439395e-05, 'epoch': 0.24} + 24%|██▎ | 2060/8750 [3:22:05<10:41:19, 5.75s/it] {'loss': 0.4843, 'learning_rate': 1.786801386439395e-05, 'epoch': 0.24} + 24%|██▎ | 2060/8750 [3:21:59<10:41:19, 5.75s/it] 24%|██▎ | 2061/8750 [3:22:11<10:33:31, 5.68s/it] 24%|██▎ | 2061/8750 [3:22:04<10:33:32, 5.68s/it] {'loss': 0.4787, 'learning_rate': 1.7865728642142668e-05, 'epoch': 0.24} + 24%|██▎ | 2061/8750 [3:22:11<10:33:31, 5.68s/it] {'loss': 0.4787, 'learning_rate': 1.7865728642142668e-05, 'epoch': 0.24} 
+ 24%|██▎ | 2061/8750 [3:22:04<10:33:32, 5.68s/it] 24%|██▎ | 2062/8750 [3:22:10<10:32:05, 5.67s/it] 24%|██▎ | 2062/8750 [3:22:16<10:32:06, 5.67s/it] {'loss': 0.4876, 'learning_rate': 1.786344234211095e-05, 'epoch': 0.24} + 24%|██▎ | 2062/8750 [3:22:16<10:32:06, 5.67s/it] {'loss': 0.4876, 'learning_rate': 1.786344234211095e-05, 'epoch': 0.24} + 24%|██▎ | 2062/8750 [3:22:10<10:32:05, 5.67s/it] 24%|██▎ | 2063/8750 [3:22:16<10:32:52, 5.68s/it] 24%|██▎ | 2063/8750 [3:22:22<10:32:53, 5.68s/it] {'loss': 0.4639, 'learning_rate': 1.786115496461207e-05, 'epoch': 0.24} + 24%|██▎ | 2063/8750 [3:22:22<10:32:53, 5.68s/it] {'loss': 0.4639, 'learning_rate': 1.786115496461207e-05, 'epoch': 0.24} + 24%|██▎ | 2063/8750 [3:22:16<10:32:52, 5.68s/it] 24%|██▎ | 2064/8750 [3:22:21<10:35:52, 5.71s/it] 24%|██▎ | 2064/8750 [3:22:28<10:35:52, 5.71s/it] {'loss': 0.4657, 'learning_rate': 1.7858866509959455e-05, 'epoch': 0.24} + 24%|██▎ | 2064/8750 [3:22:28<10:35:52, 5.71s/it] {'loss': 0.4657, 'learning_rate': 1.7858866509959455e-05, 'epoch': 0.24} + 24%|██▎ | 2064/8750 [3:22:21<10:35:52, 5.71s/it] 24%|██▎ | 2065/8750 [3:22:27<10:36:14, 5.71s/it] 24%|██▎ | 2065/8750 [3:22:34<10:36:14, 5.71s/it] {'loss': 0.4768, 'learning_rate': 1.7856576978466666e-05, 'epoch': 0.24} + 24%|██▎ | 2065/8750 [3:22:34<10:36:14, 5.71s/it] {'loss': 0.4768, 'learning_rate': 1.7856576978466666e-05, 'epoch': 0.24} + 24%|██▎ | 2065/8750 [3:22:27<10:36:14, 5.71s/it] 24%|██▎ | 2066/8750 [3:22:33<10:40:54, 5.75s/it] 24%|██▎ | 2066/8750 [3:22:40<10:40:54, 5.75s/it] {'loss': 0.4837, 'learning_rate': 1.785428637044742e-05, 'epoch': 0.24} + 24%|██▎ | 2066/8750 [3:22:40<10:40:54, 5.75s/it] {'loss': 0.4837, 'learning_rate': 1.785428637044742e-05, 'epoch': 0.24} + 24%|██▎ | 2066/8750 [3:22:33<10:40:54, 5.75s/it] 24%|██▎ | 2067/8750 [3:22:39<10:42:27, 5.77s/it] 24%|██▎ | 2067/8750 [3:22:45<10:42:27, 5.77s/it] {'loss': 0.498, 'learning_rate': 1.7851994686215592e-05, 'epoch': 0.24} + 24%|██▎ | 2067/8750 [3:22:45<10:42:27, 5.77s/it] 
{'loss': 0.498, 'learning_rate': 1.7851994686215592e-05, 'epoch': 0.24} + 24%|██▎ | 2067/8750 [3:22:39<10:42:27, 5.77s/it] 24%|██▎ | 2068/8750 [3:22:51<10:38:50, 5.74s/it] 24%|██▎ | 2068/8750 [3:22:45<10:38:52, 5.74s/it] {'loss': 0.4789, 'learning_rate': 1.7849701926085183e-05, 'epoch': 0.24} + 24%|██▎ | 2068/8750 [3:22:51<10:38:50, 5.74s/it] {'loss': 0.4789, 'learning_rate': 1.7849701926085183e-05, 'epoch': 0.24} + 24%|██▎ | 2068/8750 [3:22:45<10:38:52, 5.74s/it] 24%|██▎ | 2069/8750 [3:22:57<10:34:06, 5.69s/it] 24%|██▎ | 2069/8750 [3:22:50<10:34:07, 5.69s/it] {'loss': 0.47, 'learning_rate': 1.7847408090370355e-05, 'epoch': 0.24} + 24%|██▎ | 2069/8750 [3:22:57<10:34:06, 5.69s/it] {'loss': 0.47, 'learning_rate': 1.7847408090370355e-05, 'epoch': 0.24} + 24%|██▎ | 2069/8750 [3:22:50<10:34:07, 5.69s/it] 24%|██▎ | 2070/8750 [3:22:56<10:30:39, 5.66s/it] 24%|██▎ | 2070/8750 [3:23:02<10:30:40, 5.66s/it] {'loss': 0.4868, 'learning_rate': 1.784511317938542e-05, 'epoch': 0.24} + 24%|██▎ | 2070/8750 [3:23:02<10:30:40, 5.66s/it] {'loss': 0.4868, 'learning_rate': 1.784511317938542e-05, 'epoch': 0.24} + 24%|██▎ | 2070/8750 [3:22:56<10:30:39, 5.66s/it] 24%|██▎ | 2071/8750 [3:23:08<10:30:30, 5.66s/it] 24%|██▎ | 2071/8750 [3:23:01<10:30:32, 5.66s/it] {'loss': 0.4803, 'learning_rate': 1.7842817193444823e-05, 'epoch': 0.24} + 24%|██▎ | 2071/8750 [3:23:08<10:30:30, 5.66s/it] {'loss': 0.4803, 'learning_rate': 1.7842817193444823e-05, 'epoch': 0.24} + 24%|██▎ | 2071/8750 [3:23:01<10:30:32, 5.66s/it] 24%|██▎ | 2072/8750 [3:23:13<10:25:28, 5.62s/it] 24%|██▎ | 2072/8750 [3:23:07<10:25:28, 5.62s/it] {'loss': 0.4863, 'learning_rate': 1.7840520132863173e-05, 'epoch': 0.24} + 24%|██▎ | 2072/8750 [3:23:13<10:25:28, 5.62s/it] {'loss': 0.4863, 'learning_rate': 1.7840520132863173e-05, 'epoch': 0.24} + 24%|██▎ | 2072/8750 [3:23:07<10:25:28, 5.62s/it] 24%|██▎ | 2073/8750 [3:23:19<10:25:23, 5.62s/it] 24%|██▎ | 2073/8750 [3:23:12<10:25:24, 5.62s/it] {'loss': 0.4667, 'learning_rate': 
1.783822199795522e-05, 'epoch': 0.24} + 24%|██▎ | 2073/8750 [3:23:19<10:25:23, 5.62s/it] {'loss': 0.4667, 'learning_rate': 1.783822199795522e-05, 'epoch': 0.24} + 24%|██▎ | 2073/8750 [3:23:12<10:25:24, 5.62s/it] 24%|██▎ | 2074/8750 [3:23:18<10:32:38, 5.69s/it] 24%|██▎ | 2074/8750 [3:23:25<10:32:39, 5.69s/it] {'loss': 0.4738, 'learning_rate': 1.7835922789035853e-05, 'epoch': 0.24} + 24%|██▎ | 2074/8750 [3:23:25<10:32:39, 5.69s/it] {'loss': 0.4738, 'learning_rate': 1.7835922789035853e-05, 'epoch': 0.24} + 24%|██▎ | 2074/8750 [3:23:18<10:32:38, 5.69s/it] 24%|██▎ | 2075/8750 [3:23:24<10:35:43, 5.71s/it] 24%|██▎ | 2075/8750 [3:23:31<10:35:43, 5.71s/it] {'loss': 0.454, 'learning_rate': 1.7833622506420116e-05, 'epoch': 0.24} + 24%|██▎ | 2075/8750 [3:23:31<10:35:43, 5.71s/it] {'loss': 0.454, 'learning_rate': 1.7833622506420116e-05, 'epoch': 0.24} + 24%|██▎ | 2075/8750 [3:23:24<10:35:43, 5.71s/it] 24%|██▎ | 2076/8750 [3:23:36<10:36:03, 5.72s/it] 24%|██▎ | 2076/8750 [3:23:30<10:36:04, 5.72s/it] {'loss': 0.5017, 'learning_rate': 1.7831321150423203e-05, 'epoch': 0.24} + 24%|██▎ | 2076/8750 [3:23:36<10:36:03, 5.72s/it] {'loss': 0.5017, 'learning_rate': 1.7831321150423203e-05, 'epoch': 0.24} + 24%|██▎ | 2076/8750 [3:23:30<10:36:04, 5.72s/it] 24%|██▎ | 2077/8750 [3:23:35<10:27:20, 5.64s/it] 24%|██▎ | 2077/8750 [3:23:42<10:27:20, 5.64s/it] {'loss': 0.4968, 'learning_rate': 1.782901872136045e-05, 'epoch': 0.24} + 24%|██▎ | 2077/8750 [3:23:42<10:27:20, 5.64s/it] {'loss': 0.4968, 'learning_rate': 1.782901872136045e-05, 'epoch': 0.24} + 24%|██▎ | 2077/8750 [3:23:35<10:27:20, 5.64s/it] 24%|██▎ | 2078/8750 [3:23:41<10:37:07, 5.73s/it] 24%|██▎ | 2078/8750 [3:23:48<10:37:09, 5.73s/it] {'loss': 0.4852, 'learning_rate': 1.7826715219547336e-05, 'epoch': 0.24} + 24%|██▎ | 2078/8750 [3:23:48<10:37:09, 5.73s/it] {'loss': 0.4852, 'learning_rate': 1.7826715219547336e-05, 'epoch': 0.24} + 24%|██▎ | 2078/8750 [3:23:41<10:37:07, 5.73s/it] 24%|██▍ | 2079/8750 [3:23:47<10:38:22, 5.74s/it] 24%|██▍ | 
2079/8750 [3:23:54<10:38:23, 5.74s/it] {'loss': 0.4827, 'learning_rate': 1.78244106452995e-05, 'epoch': 0.24} + 24%|██▍ | 2079/8750 [3:23:54<10:38:23, 5.74s/it] {'loss': 0.4827, 'learning_rate': 1.78244106452995e-05, 'epoch': 0.24} + 24%|██▍ | 2079/8750 [3:23:47<10:38:22, 5.74s/it] 24%|██▍ | 2080/8750 [3:23:53<10:34:46, 5.71s/it] 24%|██▍ | 2080/8750 [3:23:59<10:34:47, 5.71s/it] {'loss': 0.4838, 'learning_rate': 1.7822104998932715e-05, 'epoch': 0.24} + 24%|██▍ | 2080/8750 [3:23:59<10:34:47, 5.71s/it] {'loss': 0.4838, 'learning_rate': 1.7822104998932715e-05, 'epoch': 0.24} + 24%|██▍ | 2080/8750 [3:23:53<10:34:46, 5.71s/it] 24%|██▍ | 2081/8750 [3:23:59<10:47:22, 5.82s/it] 24%|██▍ | 2081/8750 [3:24:05<10:47:22, 5.82s/it] {'loss': 0.4729, 'learning_rate': 1.7819798280762907e-05, 'epoch': 0.24} + 24%|██▍ | 2081/8750 [3:24:05<10:47:22, 5.82s/it] {'loss': 0.4729, 'learning_rate': 1.7819798280762907e-05, 'epoch': 0.24} + 24%|██▍ | 2081/8750 [3:23:59<10:47:22, 5.82s/it] 24%|██▍ | 2082/8750 [3:24:05<11:00:20, 5.94s/it] 24%|██▍ | 2082/8750 [3:24:11<11:00:20, 5.94s/it] {'loss': 0.4647, 'learning_rate': 1.7817490491106148e-05, 'epoch': 0.24} + 24%|██▍ | 2082/8750 [3:24:11<11:00:20, 5.94s/it] {'loss': 0.4647, 'learning_rate': 1.7817490491106148e-05, 'epoch': 0.24} + 24%|██▍ | 2082/8750 [3:24:05<11:00:20, 5.94s/it] 24%|██▍ | 2083/8750 [3:24:10<10:46:45, 5.82s/it] 24%|██▍ | 2083/8750 [3:24:17<10:46:44, 5.82s/it] {'loss': 0.4783, 'learning_rate': 1.7815181630278656e-05, 'epoch': 0.24} + 24%|██▍ | 2083/8750 [3:24:17<10:46:44, 5.82s/it] {'loss': 0.4783, 'learning_rate': 1.7815181630278656e-05, 'epoch': 0.24} + 24%|██▍ | 2083/8750 [3:24:10<10:46:45, 5.82s/it] 24%|██▍ | 2084/8750 [3:24:16<10:46:04, 5.82s/it] 24%|██▍ | 2084/8750 [3:24:23<10:46:04, 5.82s/it] {'loss': 0.4797, 'learning_rate': 1.78128716985968e-05, 'epoch': 0.24} + 24%|██▍ | 2084/8750 [3:24:23<10:46:04, 5.82s/it] {'loss': 0.4797, 'learning_rate': 1.78128716985968e-05, 'epoch': 0.24} + 24%|██▍ | 2084/8750 [3:24:16<10:46:04, 
5.82s/it] 24%|██▍ | 2085/8750 [3:24:28<10:39:41, 5.76s/it] 24%|██▍ | 2085/8750 [3:24:22<10:39:42, 5.76s/it] {'loss': 0.5009, 'learning_rate': 1.781056069637709e-05, 'epoch': 0.24} + 24%|██▍ | 2085/8750 [3:24:28<10:39:41, 5.76s/it] {'loss': 0.5009, 'learning_rate': 1.781056069637709e-05, 'epoch': 0.24} + 24%|██▍ | 2085/8750 [3:24:22<10:39:42, 5.76s/it] 24%|██▍ | 2086/8750 [3:24:28<10:42:43, 5.79s/it] 24%|██▍ | 2086/8750 [3:24:34<10:42:43, 5.79s/it] {'loss': 0.5092, 'learning_rate': 1.7808248623936183e-05, 'epoch': 0.24} + 24%|██▍ | 2086/8750 [3:24:34<10:42:43, 5.79s/it] {'loss': 0.5092, 'learning_rate': 1.7808248623936183e-05, 'epoch': 0.24} + 24%|██▍ | 2086/8750 [3:24:28<10:42:43, 5.79s/it] 24%|██▍ | 2087/8750 [3:24:40<10:51:18, 5.87s/it] 24%|██▍ | 2087/8750 [3:24:34<10:51:19, 5.87s/it] {'loss': 0.4692, 'learning_rate': 1.780593548159089e-05, 'epoch': 0.24} + 24%|██▍ | 2087/8750 [3:24:40<10:51:18, 5.87s/it] {'loss': 0.4692, 'learning_rate': 1.780593548159089e-05, 'epoch': 0.24} + 24%|██▍ | 2087/8750 [3:24:34<10:51:19, 5.87s/it] 24%|██▍ | 2088/8750 [3:24:39<10:42:22, 5.79s/it] 24%|██▍ | 2088/8750 [3:24:46<10:42:22, 5.79s/it] {'loss': 0.4904, 'learning_rate': 1.7803621269658154e-05, 'epoch': 0.24} + 24%|██▍ | 2088/8750 [3:24:46<10:42:22, 5.79s/it] {'loss': 0.4904, 'learning_rate': 1.7803621269658154e-05, 'epoch': 0.24} + 24%|██▍ | 2088/8750 [3:24:39<10:42:22, 5.79s/it] 24%|██▍ | 2089/8750 [3:24:45<10:41:58, 5.78s/it] 24%|██▍ | 2089/8750 [3:24:52<10:41:59, 5.78s/it] {'loss': 0.4693, 'learning_rate': 1.7801305988455085e-05, 'epoch': 0.24} + 24%|██▍ | 2089/8750 [3:24:52<10:41:59, 5.78s/it] {'loss': 0.4693, 'learning_rate': 1.7801305988455085e-05, 'epoch': 0.24} + 24%|██▍ | 2089/8750 [3:24:45<10:41:58, 5.78s/it] 24%|██▍ | 2090/8750 [3:24:51<10:46:29, 5.82s/it] 24%|██▍ | 2090/8750 [3:24:58<10:46:29, 5.82s/it] {'loss': 0.4656, 'learning_rate': 1.779898963829892e-05, 'epoch': 0.24} + 24%|██▍ | 2090/8750 [3:24:58<10:46:29, 5.82s/it] {'loss': 0.4656, 'learning_rate': 
1.779898963829892e-05, 'epoch': 0.24} + 24%|██▍ | 2090/8750 [3:24:51<10:46:29, 5.82s/it] 24%|██▍ | 2091/8750 [3:24:57<10:37:12, 5.74s/it] 24%|██▍ | 2091/8750 [3:25:03<10:37:13, 5.74s/it] {'loss': 0.4907, 'learning_rate': 1.779667221950705e-05, 'epoch': 0.24} + 24%|██▍ | 2091/8750 [3:25:03<10:37:13, 5.74s/it] {'loss': 0.4907, 'learning_rate': 1.779667221950705e-05, 'epoch': 0.24} + 24%|██▍ | 2091/8750 [3:24:57<10:37:12, 5.74s/it] 24%|██▍ | 2092/8750 [3:25:09<10:34:49, 5.72s/it] 24%|██▍ | 2092/8750 [3:25:02<10:34:51, 5.72s/it] {'loss': 0.47, 'learning_rate': 1.7794353732397018e-05, 'epoch': 0.24} + 24%|██▍ | 2092/8750 [3:25:09<10:34:49, 5.72s/it] {'loss': 0.47, 'learning_rate': 1.7794353732397018e-05, 'epoch': 0.24} + 24%|██▍ | 2092/8750 [3:25:02<10:34:51, 5.72s/it] 24%|██▍ | 2093/8750 [3:25:14<10:26:40, 5.65s/it] 24%|██▍ | 2093/8750 [3:25:08<10:26:41, 5.65s/it] {'loss': 0.4755, 'learning_rate': 1.7792034177286508e-05, 'epoch': 0.24} + 24%|██▍ | 2093/8750 [3:25:14<10:26:40, 5.65s/it] {'loss': 0.4755, 'learning_rate': 1.7792034177286508e-05, 'epoch': 0.24} + 24%|██▍ | 2093/8750 [3:25:08<10:26:41, 5.65s/it] 24%|██▍ | 2094/8750 [3:25:14<10:30:52, 5.69s/it] 24%|██▍ | 2094/8750 [3:25:20<10:30:54, 5.69s/it] {'loss': 0.4875, 'learning_rate': 1.778971355449335e-05, 'epoch': 0.24} + 24%|██▍ | 2094/8750 [3:25:20<10:30:54, 5.69s/it]{'loss': 0.4875, 'learning_rate': 1.778971355449335e-05, 'epoch': 0.24} + 24%|██▍ | 2094/8750 [3:25:14<10:30:52, 5.69s/it] 24%|██▍ | 2095/8750 [3:25:19<10:35:17, 5.73s/it] 24%|██▍ | 2095/8750 [3:25:26<10:35:17, 5.73s/it] {'loss': 0.4787, 'learning_rate': 1.7787391864335517e-05, 'epoch': 0.24} + 24%|██▍ | 2095/8750 [3:25:26<10:35:17, 5.73s/it] {'loss': 0.4787, 'learning_rate': 1.7787391864335517e-05, 'epoch': 0.24} + 24%|██▍ | 2095/8750 [3:25:19<10:35:17, 5.73s/it] 24%|██▍ | 2096/8750 [3:25:25<10:37:09, 5.75s/it] 24%|██▍ | 2096/8750 [3:25:32<10:37:10, 5.75s/it] {'loss': 0.4721, 'learning_rate': 1.778506910713114e-05, 'epoch': 0.24} + 24%|██▍ | 
2096/8750 [3:25:32<10:37:10, 5.75s/it] {'loss': 0.4721, 'learning_rate': 1.778506910713114e-05, 'epoch': 0.24} + 24%|██▍ | 2096/8750 [3:25:25<10:37:09, 5.75s/it] 24%|██▍ | 2097/8750 [3:25:38<10:45:32, 5.82s/it] 24%|██▍ | 2097/8750 [3:25:31<10:45:33, 5.82s/it] {'loss': 0.4827, 'learning_rate': 1.778274528319848e-05, 'epoch': 0.24} + 24%|██▍ | 2097/8750 [3:25:38<10:45:32, 5.82s/it] {'loss': 0.4827, 'learning_rate': 1.778274528319848e-05, 'epoch': 0.24} + 24%|██▍ | 2097/8750 [3:25:31<10:45:33, 5.82s/it] 24%|██▍ | 2098/8750 [3:25:43<10:40:00, 5.77s/it] 24%|██▍ | 2098/8750 [3:25:37<10:40:01, 5.77s/it] {'loss': 0.462, 'learning_rate': 1.778042039285596e-05, 'epoch': 0.24} + 24%|██▍ | 2098/8750 [3:25:43<10:40:00, 5.77s/it] {'loss': 0.462, 'learning_rate': 1.778042039285596e-05, 'epoch': 0.24} + 24%|██▍ | 2098/8750 [3:25:37<10:40:01, 5.77s/it] 24%|██▍ | 2099/8750 [3:25:43<10:48:08, 5.85s/it] 24%|██▍ | 2099/8750 [3:25:49<10:48:08, 5.85s/it] {'loss': 0.4773, 'learning_rate': 1.777809443642214e-05, 'epoch': 0.24} + 24%|██▍ | 2099/8750 [3:25:43<10:48:08, 5.85s/it]{'loss': 0.4773, 'learning_rate': 1.777809443642214e-05, 'epoch': 0.24} + 24%|██▍ | 2099/8750 [3:25:49<10:48:08, 5.85s/it]12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 24%|██▍ | 2100/8750 [3:25:55<10:43:26, 5.81s/it]9 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...3 AutoResumeHook: Checking whether to suspend... + 24%|██▍ | 2100/8750 [3:25:49<10:43:28, 5.81s/it] +13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+1 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4847, 'learning_rate': 1.7775767414215726e-05, 'epoch': 0.24} + 24%|██▍ | 2100/8750 [3:25:55<10:43:26, 5.81s/it] {'loss': 0.4847, 'learning_rate': 1.7775767414215726e-05, 'epoch': 0.24} + 24%|██▍ | 2100/8750 [3:25:49<10:43:28, 5.81s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 24%|██▍ | 2101/8750 [3:27:08<47:49:57, 25.90s/it] 24%|██▍ | 2101/8750 [3:27:01<47:49:58, 25.90s/it] {'loss': 0.4716, 'learning_rate': 1.7773439326555574e-05, 'epoch': 0.24} + 24%|██▍ | 2101/8750 [3:27:08<47:49:57, 25.90s/it] {'loss': 0.4716, 'learning_rate': 1.7773439326555574e-05, 'epoch': 0.24} + 24%|██▍ | 2101/8750 [3:27:01<47:49:58, 25.90s/it] 24%|██▍ | 2102/8750 [3:27:07<36:43:46, 19.89s/it] 24%|██▍ | 2102/8750 [3:27:14<36:43:47, 19.89s/it] {'loss': 0.4797, 'learning_rate': 1.777111017376068e-05, 'epoch': 0.24} + 24%|██▍ | 2102/8750 [3:27:14<36:43:47, 19.89s/it] {'loss': 0.4797, 'learning_rate': 1.777111017376068e-05, 'epoch': 0.24} + 24%|██▍ | 2102/8750 [3:27:07<36:43:46, 19.89s/it] 24%|██▍ | 2103/8750 [3:27:13<28:50:45, 15.62s/it] 24%|██▍ | 2103/8750 [3:27:19<28:50:46, 15.62s/it] {'loss': 0.4778, 'learning_rate': 1.7768779956150196e-05, 'epoch': 0.24} + 24%|██▍ | 2103/8750 [3:27:19<28:50:46, 15.62s/it] {'loss': 0.4778, 'learning_rate': 1.7768779956150196e-05, 'epoch': 0.24} + 24%|██▍ | 2103/8750 [3:27:13<28:50:45, 15.62s/it] 24%|██▍ | 2104/8750 [3:27:25<23:29:40, 12.73s/it] 24%|██▍ | 2104/8750 [3:27:19<23:29:41, 12.73s/it] {'loss': 0.4625, 'learning_rate': 1.776644867404341e-05, 'epoch': 0.24} + 24%|██▍ | 2104/8750 [3:27:25<23:29:40, 12.73s/it] {'loss': 0.4625, 'learning_rate': 1.776644867404341e-05, 'epoch': 0.24} + 24%|██▍ | 2104/8750 [3:27:19<23:29:41, 12.73s/it] 24%|██▍ | 2105/8750 [3:27:31<19:35:50, 10.62s/it] 24%|██▍ | 2105/8750 [3:27:25<19:35:50, 10.62s/it] {'loss': 0.46, 'learning_rate': 1.776411632775976e-05, 'epoch': 0.24} + 24%|██▍ | 2105/8750 [3:27:31<19:35:50, 10.62s/it] {'loss': 0.46, 'learning_rate': 1.776411632775976e-05, 'epoch': 0.24} + 24%|██▍ | 2105/8750 [3:27:25<19:35:50, 10.62s/it] 24%|██▍ | 2106/8750 [3:27:37<16:47:52, 9.10s/it] 24%|██▍ | 2106/8750 [3:27:30<16:47:52, 9.10s/it] {'loss': 0.4934, 'learning_rate': 1.7761782917618836e-05, 'epoch': 0.24} + 24%|██▍ | 2106/8750 [3:27:37<16:47:52, 9.10s/it] {'loss': 0.4934, 
'learning_rate': 1.7761782917618836e-05, 'epoch': 0.24} + 24%|██▍ | 2106/8750 [3:27:30<16:47:52, 9.10s/it] 24%|██▍ | 2107/8750 [3:27:42<14:55:35, 8.09s/it] 24%|██▍ | 2107/8750 [3:27:36<14:55:36, 8.09s/it] {'loss': 0.459, 'learning_rate': 1.7759448443940355e-05, 'epoch': 0.24} + 24%|██▍ | 2107/8750 [3:27:42<14:55:35, 8.09s/it] {'loss': 0.459, 'learning_rate': 1.7759448443940355e-05, 'epoch': 0.24} + 24%|██▍ | 2107/8750 [3:27:36<14:55:36, 8.09s/it] 24%|██▍ | 2108/8750 [3:27:42<13:34:47, 7.36s/it] 24%|██▍ | 2108/8750 [3:27:48<13:34:48, 7.36s/it] {'loss': 0.4785, 'learning_rate': 1.77571129070442e-05, 'epoch': 0.24} + 24%|██▍ | 2108/8750 [3:27:48<13:34:48, 7.36s/it] {'loss': 0.4785, 'learning_rate': 1.77571129070442e-05, 'epoch': 0.24} + 24%|██▍ | 2108/8750 [3:27:42<13:34:47, 7.36s/it] 24%|██▍ | 2109/8750 [3:27:47<12:41:08, 6.88s/it] 24%|██▍ | 2109/8750 [3:27:54<12:41:08, 6.88s/it] {'loss': 0.4648, 'learning_rate': 1.775477630725039e-05, 'epoch': 0.24} + 24%|██▍ | 2109/8750 [3:27:54<12:41:08, 6.88s/it] {'loss': 0.4648, 'learning_rate': 1.775477630725039e-05, 'epoch': 0.24} + 24%|██▍ | 2109/8750 [3:27:47<12:41:08, 6.88s/it] 24%|██▍ | 2110/8750 [3:27:53<11:57:50, 6.49s/it] 24%|██▍ | 2110/8750 [3:27:59<11:57:50, 6.49s/it] {'loss': 0.5045, 'learning_rate': 1.7752438644879092e-05, 'epoch': 0.24} + 24%|██▍ | 2110/8750 [3:27:59<11:57:50, 6.49s/it] {'loss': 0.5045, 'learning_rate': 1.7752438644879092e-05, 'epoch': 0.24} + 24%|██▍ | 2110/8750 [3:27:53<11:57:50, 6.49s/it] 24%|██▍ | 2111/8750 [3:28:05<11:40:00, 6.33s/it] 24%|██▍ | 2111/8750 [3:27:59<11:40:00, 6.33s/it] {'loss': 0.4848, 'learning_rate': 1.7750099920250616e-05, 'epoch': 0.24} + 24%|██▍ | 2111/8750 [3:28:05<11:40:00, 6.33s/it] {'loss': 0.4848, 'learning_rate': 1.7750099920250616e-05, 'epoch': 0.24} + 24%|██▍ | 2111/8750 [3:27:59<11:40:00, 6.33s/it] 24%|██▍ | 2112/8750 [3:28:04<11:17:24, 6.12s/it] 24%|██▍ | 2112/8750 [3:28:11<11:17:25, 6.12s/it] {'loss': 0.4724, 'learning_rate': 1.774776013368542e-05, 'epoch': 0.24} 
+ 24%|██▍ | 2112/8750 [3:28:11<11:17:25, 6.12s/it] {'loss': 0.4724, 'learning_rate': 1.774776013368542e-05, 'epoch': 0.24} + 24%|██▍ | 2112/8750 [3:28:04<11:17:24, 6.12s/it] 24%|██▍ | 2113/8750 [3:28:17<11:02:36, 5.99s/it] 24%|██▍ | 2113/8750 [3:28:10<11:02:37, 5.99s/it] {'loss': 0.4749, 'learning_rate': 1.774541928550411e-05, 'epoch': 0.24} + 24%|██▍ | 2113/8750 [3:28:17<11:02:36, 5.99s/it] {'loss': 0.4749, 'learning_rate': 1.774541928550411e-05, 'epoch': 0.24} + 24%|██▍ | 2113/8750 [3:28:10<11:02:37, 5.99s/it] 24%|██▍ | 2114/8750 [3:28:22<10:53:32, 5.91s/it] 24%|██▍ | 2114/8750 [3:28:16<10:53:33, 5.91s/it] {'loss': 0.4872, 'learning_rate': 1.7743077376027433e-05, 'epoch': 0.24} + 24%|██▍ | 2114/8750 [3:28:22<10:53:32, 5.91s/it] {'loss': 0.4872, 'learning_rate': 1.7743077376027433e-05, 'epoch': 0.24} + 24%|██▍ | 2114/8750 [3:28:16<10:53:33, 5.91s/it] 24%|██▍ | 2115/8750 [3:28:28<10:50:14, 5.88s/it] 24%|██▍ | 2115/8750 [3:28:22<10:50:14, 5.88s/it] {'loss': 0.4778, 'learning_rate': 1.7740734405576283e-05, 'epoch': 0.24} + 24%|██▍ | 2115/8750 [3:28:28<10:50:14, 5.88s/it] {'loss': 0.4778, 'learning_rate': 1.7740734405576283e-05, 'epoch': 0.24} + 24%|██▍ | 2115/8750 [3:28:22<10:50:14, 5.88s/it] 24%|██▍ | 2116/8750 [3:28:34<10:46:03, 5.84s/it] 24%|██▍ | 2116/8750 [3:28:27<10:46:05, 5.84s/it] {'loss': 0.4904, 'learning_rate': 1.7738390374471696e-05, 'epoch': 0.24} + 24%|██▍ | 2116/8750 [3:28:34<10:46:03, 5.84s/it] {'loss': 0.4904, 'learning_rate': 1.7738390374471696e-05, 'epoch': 0.24} + 24%|██▍ | 2116/8750 [3:28:27<10:46:05, 5.84s/it] 24%|██▍ | 2117/8750 [3:28:40<10:37:58, 5.77s/it] 24%|██▍ | 2117/8750 [3:28:33<10:38:00, 5.77s/it] {'loss': 0.4892, 'learning_rate': 1.773604528303486e-05, 'epoch': 0.24} + 24%|██▍ | 2117/8750 [3:28:40<10:37:58, 5.77s/it] {'loss': 0.4892, 'learning_rate': 1.773604528303486e-05, 'epoch': 0.24} + 24%|██▍ | 2117/8750 [3:28:33<10:38:00, 5.77s/it] 24%|██▍ | 2118/8750 [3:28:45<10:40:44, 5.80s/it] 24%|██▍ | 2118/8750 [3:28:39<10:40:44, 5.80s/it] 
{'loss': 0.4918, 'learning_rate': 1.7733699131587104e-05, 'epoch': 0.24} + 24%|██▍ | 2118/8750 [3:28:45<10:40:44, 5.80s/it] {'loss': 0.4918, 'learning_rate': 1.7733699131587104e-05, 'epoch': 0.24} + 24%|██▍ | 2118/8750 [3:28:39<10:40:44, 5.80s/it] 24%|██▍ | 2119/8750 [3:28:51<10:47:03, 5.85s/it] 24%|██▍ | 2119/8750 [3:28:45<10:47:03, 5.85s/it] {'loss': 0.4776, 'learning_rate': 1.77313519204499e-05, 'epoch': 0.24} + 24%|██▍ | 2119/8750 [3:28:51<10:47:03, 5.85s/it] {'loss': 0.4776, 'learning_rate': 1.77313519204499e-05, 'epoch': 0.24} + 24%|██▍ | 2119/8750 [3:28:45<10:47:03, 5.85s/it] 24%|██▍ | 2120/8750 [3:28:57<10:40:34, 5.80s/it] 24%|██▍ | 2120/8750 [3:28:51<10:40:33, 5.80s/it] {'loss': 0.4777, 'learning_rate': 1.7729003649944878e-05, 'epoch': 0.24} + 24%|██▍ | 2120/8750 [3:28:57<10:40:34, 5.80s/it] {'loss': 0.4777, 'learning_rate': 1.7729003649944878e-05, 'epoch': 0.24} + 24%|██▍ | 2120/8750 [3:28:51<10:40:33, 5.80s/it] 24%|██▍ | 2121/8750 [3:29:03<10:38:02, 5.77s/it] 24%|██▍ | 2121/8750 [3:28:56<10:38:01, 5.77s/it] {'loss': 0.4935, 'learning_rate': 1.7726654320393795e-05, 'epoch': 0.24} + 24%|██▍ | 2121/8750 [3:29:03<10:38:02, 5.77s/it] {'loss': 0.4935, 'learning_rate': 1.7726654320393795e-05, 'epoch': 0.24} + 24%|██▍ | 2121/8750 [3:28:56<10:38:01, 5.77s/it] 24%|██▍ | 2122/8750 [3:29:08<10:36:17, 5.76s/it] 24%|██▍ | 2122/8750 [3:29:02<10:36:17, 5.76s/it] {'loss': 0.482, 'learning_rate': 1.772430393211856e-05, 'epoch': 0.24} + 24%|██▍ | 2122/8750 [3:29:08<10:36:17, 5.76s/it] {'loss': 0.482, 'learning_rate': 1.772430393211856e-05, 'epoch': 0.24} + 24%|██▍ | 2122/8750 [3:29:02<10:36:17, 5.76s/it] 24%|██▍ | 2123/8750 [3:29:14<10:28:54, 5.69s/it] 24%|██▍ | 2123/8750 [3:29:08<10:28:53, 5.69s/it] {'loss': 0.4718, 'learning_rate': 1.7721952485441232e-05, 'epoch': 0.24} + 24%|██▍ | 2123/8750 [3:29:14<10:28:54, 5.69s/it] {'loss': 0.4718, 'learning_rate': 1.7721952485441232e-05, 'epoch': 0.24} + 24%|██▍ | 2123/8750 [3:29:08<10:28:53, 5.69s/it] 24%|██▍ | 2124/8750 
[3:29:20<10:37:30, 5.77s/it] 24%|██▍ | 2124/8750 [3:29:13<10:37:29, 5.77s/it] {'loss': 0.4591, 'learning_rate': 1.7719599980684016e-05, 'epoch': 0.24} + 24%|██▍ | 2124/8750 [3:29:20<10:37:30, 5.77s/it] {'loss': 0.4591, 'learning_rate': 1.7719599980684016e-05, 'epoch': 0.24} + 24%|██▍ | 2124/8750 [3:29:13<10:37:29, 5.77s/it] 24%|██▍ | 2125/8750 [3:29:26<10:43:10, 5.83s/it] 24%|██▍ | 2125/8750 [3:29:19<10:43:10, 5.83s/it] {'loss': 0.481, 'learning_rate': 1.7717246418169252e-05, 'epoch': 0.24} + 24%|██▍ | 2125/8750 [3:29:26<10:43:10, 5.83s/it] {'loss': 0.481, 'learning_rate': 1.7717246418169252e-05, 'epoch': 0.24} + 24%|██▍ | 2125/8750 [3:29:19<10:43:10, 5.83s/it] 24%|██▍ | 2126/8750 [3:29:25<10:36:05, 5.76s/it] 24%|██▍ | 2126/8750 [3:29:32<10:36:06, 5.76s/it] {'loss': 0.4851, 'learning_rate': 1.7714891798219432e-05, 'epoch': 0.24} + 24%|██▍ | 2126/8750 [3:29:32<10:36:06, 5.76s/it] {'loss': 0.4851, 'learning_rate': 1.7714891798219432e-05, 'epoch': 0.24} + 24%|██▍ | 2126/8750 [3:29:25<10:36:05, 5.76s/it] 24%|██▍ | 2127/8750 [3:29:37<10:32:28, 5.73s/it] {'loss': 0.5005, 'learning_rate': 1.771253612115719e-05, 'epoch': 0.24} + 24%|██▍ | 2127/8750 [3:29:37<10:32:28, 5.73s/it] 24%|██▍ | 2127/8750 [3:29:31<10:32:28, 5.73s/it] {'loss': 0.5005, 'learning_rate': 1.771253612115719e-05, 'epoch': 0.24} + 24%|██▍ | 2127/8750 [3:29:31<10:32:28, 5.73s/it] 24%|██▍ | 2128/8750 [3:29:37<10:37:23, 5.78s/it] 24%|██▍ | 2128/8750 [3:29:43<10:37:23, 5.78s/it] {'loss': 0.4734, 'learning_rate': 1.7710179387305308e-05, 'epoch': 0.24} + 24%|██▍ | 2128/8750 [3:29:43<10:37:23, 5.78s/it] {'loss': 0.4734, 'learning_rate': 1.7710179387305308e-05, 'epoch': 0.24} + 24%|██▍ | 2128/8750 [3:29:37<10:37:23, 5.78s/it] 24%|██▍ | 2129/8750 [3:29:49<10:36:05, 5.76s/it] 24%|██▍ | 2129/8750 [3:29:42<10:36:06, 5.76s/it] {'loss': 0.4805, 'learning_rate': 1.7707821596986715e-05, 'epoch': 0.24} + 24%|██▍ | 2129/8750 [3:29:49<10:36:05, 5.76s/it] {'loss': 0.4805, 'learning_rate': 1.7707821596986715e-05, 'epoch': 
0.24} + 24%|██▍ | 2129/8750 [3:29:42<10:36:06, 5.76s/it] 24%|██▍ | 2130/8750 [3:29:55<10:34:44, 5.75s/it] 24%|██▍ | 2130/8750 [3:29:48<10:34:45, 5.75s/it] {'loss': 0.4809, 'learning_rate': 1.7705462750524474e-05, 'epoch': 0.24} + 24%|██▍ | 2130/8750 [3:29:55<10:34:44, 5.75s/it] {'loss': 0.4809, 'learning_rate': 1.7705462750524474e-05, 'epoch': 0.24} + 24%|██▍ | 2130/8750 [3:29:48<10:34:45, 5.75s/it] 24%|██▍ | 2131/8750 [3:30:00<10:40:48, 5.81s/it] 24%|██▍ | 2131/8750 [3:29:54<10:40:48, 5.81s/it] {'loss': 0.4728, 'learning_rate': 1.77031028482418e-05, 'epoch': 0.24} + 24%|██▍ | 2131/8750 [3:30:00<10:40:48, 5.81s/it] {'loss': 0.4728, 'learning_rate': 1.77031028482418e-05, 'epoch': 0.24} + 24%|██▍ | 2131/8750 [3:29:54<10:40:48, 5.81s/it] 24%|██▍ | 2132/8750 [3:30:06<10:37:48, 5.78s/it] 24%|██▍ | 2132/8750 [3:30:00<10:37:49, 5.78s/it] {'loss': 0.4759, 'learning_rate': 1.770074189046206e-05, 'epoch': 0.24} + 24%|██▍ | 2132/8750 [3:30:06<10:37:48, 5.78s/it] {'loss': 0.4759, 'learning_rate': 1.770074189046206e-05, 'epoch': 0.24} + 24%|██▍ | 2132/8750 [3:30:00<10:37:49, 5.78s/it] 24%|██▍ | 2133/8750 [3:30:12<10:37:46, 5.78s/it] 24%|██▍ | 2133/8750 [3:30:05<10:37:47, 5.78s/it] {'loss': 0.4709, 'learning_rate': 1.7698379877508755e-05, 'epoch': 0.24} + 24%|██▍ | 2133/8750 [3:30:12<10:37:46, 5.78s/it] {'loss': 0.4709, 'learning_rate': 1.7698379877508755e-05, 'epoch': 0.24} + 24%|██▍ | 2133/8750 [3:30:05<10:37:47, 5.78s/it] 24%|██▍ | 2134/8750 [3:30:18<10:36:36, 5.77s/it] 24%|██▍ | 2134/8750 [3:30:11<10:36:37, 5.77s/it] {'loss': 0.4993, 'learning_rate': 1.7696016809705525e-05, 'epoch': 0.24} + 24%|██▍ | 2134/8750 [3:30:18<10:36:36, 5.77s/it] {'loss': 0.4993, 'learning_rate': 1.7696016809705525e-05, 'epoch': 0.24} + 24%|██▍ | 2134/8750 [3:30:11<10:36:37, 5.77s/it] 24%|██▍ | 2135/8750 [3:30:23<10:32:45, 5.74s/it] 24%|██▍ | 2135/8750 [3:30:17<10:32:45, 5.74s/it] {'loss': 0.4765, 'learning_rate': 1.7693652687376173e-05, 'epoch': 0.24} + 24%|██▍ | 2135/8750 [3:30:23<10:32:45, 
5.74s/it] {'loss': 0.4765, 'learning_rate': 1.7693652687376173e-05, 'epoch': 0.24} + 24%|██▍ | 2135/8750 [3:30:17<10:32:45, 5.74s/it] 24%|██▍ | 2136/8750 [3:30:29<10:25:40, 5.68s/it] 24%|██▍ | 2136/8750 [3:30:22<10:25:40, 5.68s/it] {'loss': 0.4845, 'learning_rate': 1.769128751084463e-05, 'epoch': 0.24} + 24%|██▍ | 2136/8750 [3:30:29<10:25:40, 5.68s/it] {'loss': 0.4845, 'learning_rate': 1.769128751084463e-05, 'epoch': 0.24} + 24%|██▍ | 2136/8750 [3:30:22<10:25:40, 5.68s/it] 24%|██▍ | 2137/8750 [3:30:35<10:29:54, 5.72s/it] 24%|██▍ | 2137/8750 [3:30:28<10:29:54, 5.72s/it] {'loss': 0.4776, 'learning_rate': 1.7688921280434984e-05, 'epoch': 0.24} + 24%|██▍ | 2137/8750 [3:30:35<10:29:54, 5.72s/it] {'loss': 0.4776, 'learning_rate': 1.7688921280434984e-05, 'epoch': 0.24} + 24%|██▍ | 2137/8750 [3:30:28<10:29:54, 5.72s/it] 24%|██▍ | 2138/8750 [3:30:41<10:39:59, 5.81s/it] 24%|██▍ | 2138/8750 [3:30:34<10:39:57, 5.81s/it] {'loss': 0.4779, 'learning_rate': 1.768655399647146e-05, 'epoch': 0.24} + 24%|██▍ | 2138/8750 [3:30:41<10:39:59, 5.81s/it] {'loss': 0.4779, 'learning_rate': 1.768655399647146e-05, 'epoch': 0.24} + 24%|██▍ | 2138/8750 [3:30:34<10:39:57, 5.81s/it] 24%|██▍ | 2139/8750 [3:30:47<10:44:01, 5.85s/it] 24%|██▍ | 2139/8750 [3:30:40<10:44:00, 5.84s/it] {'loss': 0.4532, 'learning_rate': 1.7684185659278423e-05, 'epoch': 0.24} + 24%|██▍ | 2139/8750 [3:30:47<10:44:01, 5.85s/it] {'loss': 0.4532, 'learning_rate': 1.7684185659278423e-05, 'epoch': 0.24} + 24%|██▍ | 2139/8750 [3:30:40<10:44:00, 5.84s/it] 24%|██▍ | 2140/8750 [3:30:52<10:42:37, 5.83s/it] 24%|██▍ | 2140/8750 [3:30:46<10:42:39, 5.83s/it] {'loss': 0.4901, 'learning_rate': 1.7681816269180394e-05, 'epoch': 0.24} + 24%|██▍ | 2140/8750 [3:30:52<10:42:37, 5.83s/it] {'loss': 0.4901, 'learning_rate': 1.7681816269180394e-05, 'epoch': 0.24} + 24%|██▍ | 2140/8750 [3:30:46<10:42:39, 5.83s/it] 24%|██▍ | 2141/8750 [3:30:58<10:41:49, 5.83s/it] 24%|██▍ | 2141/8750 [3:30:52<10:41:49, 5.83s/it] {'loss': 0.4582, 'learning_rate': 
1.7679445826502033e-05, 'epoch': 0.24} + 24%|██▍ | 2141/8750 [3:30:58<10:41:49, 5.83s/it] {'loss': 0.4582, 'learning_rate': 1.7679445826502033e-05, 'epoch': 0.24} + 24%|██▍ | 2141/8750 [3:30:52<10:41:49, 5.83s/it] 24%|██▍ | 2142/8750 [3:31:04<10:40:30, 5.82s/it] 24%|██▍ | 2142/8750 [3:30:58<10:40:32, 5.82s/it] {'loss': 0.4821, 'learning_rate': 1.767707433156814e-05, 'epoch': 0.24} + 24%|██▍ | 2142/8750 [3:31:04<10:40:30, 5.82s/it] {'loss': 0.4821, 'learning_rate': 1.767707433156814e-05, 'epoch': 0.24} + 24%|██▍ | 2142/8750 [3:30:58<10:40:32, 5.82s/it] 24%|██▍ | 2143/8750 [3:31:10<10:36:34, 5.78s/it] 24%|██▍ | 2143/8750 [3:31:03<10:36:32, 5.78s/it] {'loss': 0.4743, 'learning_rate': 1.767470178470366e-05, 'epoch': 0.24} + 24%|██▍ | 2143/8750 [3:31:10<10:36:34, 5.78s/it] {'loss': 0.4743, 'learning_rate': 1.767470178470366e-05, 'epoch': 0.24} + 24%|██▍ | 2143/8750 [3:31:03<10:36:32, 5.78s/it] 25%|██▍ | 2144/8750 [3:31:15<10:30:42, 5.73s/it] 25%|██▍ | 2144/8750 [3:31:09<10:30:41, 5.73s/it] {'loss': 0.5039, 'learning_rate': 1.7672328186233692e-05, 'epoch': 0.25} + 25%|██▍ | 2144/8750 [3:31:15<10:30:42, 5.73s/it] {'loss': 0.5039, 'learning_rate': 1.7672328186233692e-05, 'epoch': 0.25} + 25%|██▍ | 2144/8750 [3:31:09<10:30:41, 5.73s/it] 25%|██▍ | 2145/8750 [3:31:21<10:32:04, 5.74s/it] 25%|██▍ | 2145/8750 [3:31:15<10:32:04, 5.74s/it] {'loss': 0.4517, 'learning_rate': 1.766995353648347e-05, 'epoch': 0.25} + 25%|██▍ | 2145/8750 [3:31:21<10:32:04, 5.74s/it] {'loss': 0.4517, 'learning_rate': 1.766995353648347e-05, 'epoch': 0.25} + 25%|██▍ | 2145/8750 [3:31:15<10:32:04, 5.74s/it] 25%|██▍ | 2146/8750 [3:31:20<10:33:36, 5.76s/it] 25%|██▍ | 2146/8750 [3:31:27<10:33:37, 5.76s/it] {'loss': 0.4928, 'learning_rate': 1.766757783577837e-05, 'epoch': 0.25} + 25%|██▍ | 2146/8750 [3:31:27<10:33:37, 5.76s/it] {'loss': 0.4928, 'learning_rate': 1.766757783577837e-05, 'epoch': 0.25} + 25%|██▍ | 2146/8750 [3:31:20<10:33:36, 5.76s/it] 25%|██▍ | 2147/8750 [3:31:26<10:27:29, 5.70s/it] 25%|██▍ | 
2147/8750 [3:31:33<10:27:30, 5.70s/it] {'loss': 0.4828, 'learning_rate': 1.766520108444392e-05, 'epoch': 0.25} + 25%|██▍ | 2147/8750 [3:31:33<10:27:30, 5.70s/it] {'loss': 0.4828, 'learning_rate': 1.766520108444392e-05, 'epoch': 0.25} + 25%|██▍ | 2147/8750 [3:31:26<10:27:29, 5.70s/it] 25%|██▍ | 2148/8750 [3:31:38<10:28:05, 5.71s/it] 25%|██▍ | 2148/8750 [3:31:32<10:28:05, 5.71s/it] {'loss': 0.4716, 'learning_rate': 1.7662823282805788e-05, 'epoch': 0.25} + 25%|██▍ | 2148/8750 [3:31:38<10:28:05, 5.71s/it] {'loss': 0.4716, 'learning_rate': 1.7662823282805788e-05, 'epoch': 0.25} + 25%|██▍ | 2148/8750 [3:31:32<10:28:05, 5.71s/it] 25%|██▍ | 2149/8750 [3:31:44<10:26:39, 5.70s/it] 25%|██▍ | 2149/8750 [3:31:37<10:26:40, 5.70s/it]{'loss': 0.4756, 'learning_rate': 1.766044443118978e-05, 'epoch': 0.25} + {'loss': 0.4756, 'learning_rate': 1.766044443118978e-05, 'epoch': 0.25} + 25%|██▍ | 2149/8750 [3:31:44<10:26:39, 5.70s/it] 25%|██▍ | 2149/8750 [3:31:37<10:26:40, 5.70s/it]9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +25 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +108 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +0 25%|██▍ | 2150/8750 [3:31:50<10:23:58, 5.67s/it]13 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 25%|██▍ | 2150/8750 [3:31:43<10:23:58, 5.67s/it]1 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4674, 'learning_rate': 1.765806452992186e-05, 'epoch': 0.25} + 25%|██▍ | 2150/8750 [3:31:50<10:23:58, 5.67s/it] {'loss': 0.4674, 'learning_rate': 1.765806452992186e-05, 'epoch': 0.25} + 25%|██▍ | 2150/8750 [3:31:43<10:23:58, 5.67s/it] 25%|██▍ | 2151/8750 [3:31:49<10:25:22, 5.69s/it] 25%|██▍ | 2151/8750 [3:31:55<10:25:23, 5.69s/it] {'loss': 0.4787, 'learning_rate': 1.7655683579328128e-05, 'epoch': 0.25} + 25%|██▍ | 2151/8750 [3:31:55<10:25:23, 5.69s/it] {'loss': 0.4787, 'learning_rate': 1.7655683579328128e-05, 'epoch': 0.25} + 25%|██▍ | 2151/8750 [3:31:49<10:25:22, 5.69s/it] 25%|██▍ | 2152/8750 [3:32:01<10:24:38, 5.68s/it] 25%|██▍ | 2152/8750 [3:31:54<10:24:39, 5.68s/it] {'loss': 0.5245, 'learning_rate': 1.765330157973482e-05, 'epoch': 0.25} + 25%|██▍ | 2152/8750 [3:32:01<10:24:38, 5.68s/it] {'loss': 0.5245, 'learning_rate': 1.765330157973482e-05, 'epoch': 0.25} + 25%|██▍ | 2152/8750 [3:31:54<10:24:39, 5.68s/it] 25%|██▍ | 2153/8750 [3:32:07<10:21:47, 5.66s/it] 25%|██▍ | 2153/8750 [3:32:00<10:21:46, 5.66s/it] {'loss': 0.4759, 'learning_rate': 1.7650918531468326e-05, 'epoch': 0.25} + 25%|██▍ | 2153/8750 [3:32:07<10:21:47, 5.66s/it] {'loss': 0.4759, 'learning_rate': 1.7650918531468326e-05, 'epoch': 0.25} + 25%|██▍ | 2153/8750 [3:32:00<10:21:46, 5.66s/it] 25%|██▍ | 2154/8750 [3:32:12<10:24:09, 5.68s/it] 25%|██▍ | 2154/8750 [3:32:06<10:24:08, 5.68s/it] {'loss': 0.4719, 'learning_rate': 1.7648534434855177e-05, 'epoch': 0.25} + 25%|██▍ | 2154/8750 [3:32:12<10:24:09, 5.68s/it] {'loss': 0.4719, 'learning_rate': 1.7648534434855177e-05, 'epoch': 0.25} + 25%|██▍ | 2154/8750 [3:32:06<10:24:08, 5.68s/it] 25%|██▍ | 2155/8750 [3:32:18<10:18:16, 5.62s/it] 25%|██▍ | 2155/8750 [3:32:11<10:18:16, 5.62s/it] {'loss': 0.4807, 'learning_rate': 1.764614929022205e-05, 'epoch': 0.25} + 25%|██▍ | 2155/8750 [3:32:18<10:18:16, 5.62s/it] {'loss': 0.4807, 'learning_rate': 1.764614929022205e-05, 'epoch': 0.25} + 25%|██▍ | 2155/8750 [3:32:11<10:18:16, 5.62s/it] 25%|██▍ | 2156/8750 
[3:32:23<10:21:11, 5.65s/it] 25%|██▍ | 2156/8750 [3:32:17<10:21:11, 5.65s/it] {'loss': 0.4773, 'learning_rate': 1.764376309789576e-05, 'epoch': 0.25} + 25%|██▍ | 2156/8750 [3:32:23<10:21:11, 5.65s/it] {'loss': 0.4773, 'learning_rate': 1.764376309789576e-05, 'epoch': 0.25} + 25%|██▍ | 2156/8750 [3:32:17<10:21:11, 5.65s/it] 25%|██▍ | 2157/8750 [3:32:29<10:25:00, 5.69s/it] 25%|██▍ | 2157/8750 [3:32:23<10:25:00, 5.69s/it] {'loss': 0.4787, 'learning_rate': 1.764137585820327e-05, 'epoch': 0.25} + 25%|██▍ | 2157/8750 [3:32:29<10:25:00, 5.69s/it] {'loss': 0.4787, 'learning_rate': 1.764137585820327e-05, 'epoch': 0.25} + 25%|██▍ | 2157/8750 [3:32:23<10:25:00, 5.69s/it] 25%|██▍ | 2158/8750 [3:32:28<10:21:52, 5.66s/it] 25%|██▍ | 2158/8750 [3:32:35<10:21:52, 5.66s/it] {'loss': 0.4781, 'learning_rate': 1.7638987571471685e-05, 'epoch': 0.25} + 25%|██▍ | 2158/8750 [3:32:35<10:21:52, 5.66s/it] {'loss': 0.4781, 'learning_rate': 1.7638987571471685e-05, 'epoch': 0.25} + 25%|██▍ | 2158/8750 [3:32:28<10:21:52, 5.66s/it] 25%|██▍ | 2159/8750 [3:32:41<10:30:39, 5.74s/it] 25%|██▍ | 2159/8750 [3:32:34<10:30:40, 5.74s/it] {'loss': 0.4679, 'learning_rate': 1.7636598238028253e-05, 'epoch': 0.25} + 25%|██▍ | 2159/8750 [3:32:41<10:30:39, 5.74s/it] {'loss': 0.4679, 'learning_rate': 1.7636598238028253e-05, 'epoch': 0.25} + 25%|██▍ | 2159/8750 [3:32:34<10:30:40, 5.74s/it] 25%|██▍ | 2160/8750 [3:32:40<10:43:36, 5.86s/it] 25%|██▍ | 2160/8750 [3:32:47<10:43:36, 5.86s/it] {'loss': 0.4922, 'learning_rate': 1.7634207858200366e-05, 'epoch': 0.25} + 25%|██▍ | 2160/8750 [3:32:47<10:43:36, 5.86s/it] {'loss': 0.4922, 'learning_rate': 1.7634207858200366e-05, 'epoch': 0.25} + 25%|██▍ | 2160/8750 [3:32:40<10:43:36, 5.86s/it] 25%|██▍ | 2161/8750 [3:32:52<10:33:05, 5.76s/it] 25%|██▍ | 2161/8750 [3:32:46<10:33:06, 5.77s/it] {'loss': 0.491, 'learning_rate': 1.763181643231556e-05, 'epoch': 0.25} + 25%|██▍ | 2161/8750 [3:32:52<10:33:05, 5.76s/it] {'loss': 0.491, 'learning_rate': 1.763181643231556e-05, 'epoch': 0.25} + 
25%|██▍ | 2161/8750 [3:32:46<10:33:06, 5.77s/it] 25%|██▍ | 2162/8750 [3:32:52<10:34:17, 5.78s/it] 25%|██▍ | 2162/8750 [3:32:58<10:34:17, 5.78s/it] {'loss': 0.4677, 'learning_rate': 1.7629423960701513e-05, 'epoch': 0.25} + 25%|██▍ | 2162/8750 [3:32:58<10:34:17, 5.78s/it] {'loss': 0.4677, 'learning_rate': 1.7629423960701513e-05, 'epoch': 0.25} + 25%|██▍ | 2162/8750 [3:32:52<10:34:17, 5.78s/it] 25%|██▍ | 2163/8750 [3:32:58<10:34:02, 5.78s/it] 25%|██▍ | 2163/8750 [3:33:04<10:34:03, 5.78s/it] {'loss': 0.4873, 'learning_rate': 1.7627030443686047e-05, 'epoch': 0.25} + 25%|██▍ | 2163/8750 [3:33:04<10:34:03, 5.78s/it] {'loss': 0.4873, 'learning_rate': 1.7627030443686047e-05, 'epoch': 0.25} + 25%|██▍ | 2163/8750 [3:32:58<10:34:02, 5.78s/it] 25%|██▍ | 2164/8750 [3:33:10<10:32:48, 5.76s/it] 25%|██▍ | 2164/8750 [3:33:03<10:32:48, 5.77s/it] {'loss': 0.4677, 'learning_rate': 1.762463588159713e-05, 'epoch': 0.25} + 25%|██▍ | 2164/8750 [3:33:10<10:32:48, 5.76s/it] {'loss': 0.4677, 'learning_rate': 1.762463588159713e-05, 'epoch': 0.25} + 25%|██▍ | 2164/8750 [3:33:03<10:32:48, 5.77s/it] 25%|██▍ | 2165/8750 [3:33:09<10:34:10, 5.78s/it] 25%|██▍ | 2165/8750 [3:33:16<10:34:11, 5.78s/it] {'loss': 0.4581, 'learning_rate': 1.762224027476287e-05, 'epoch': 0.25} + 25%|██▍ | 2165/8750 [3:33:16<10:34:11, 5.78s/it] {'loss': 0.4581, 'learning_rate': 1.762224027476287e-05, 'epoch': 0.25} + 25%|██▍ | 2165/8750 [3:33:09<10:34:10, 5.78s/it] 25%|██▍ | 2166/8750 [3:33:15<10:28:18, 5.73s/it] 25%|██▍ | 2166/8750 [3:33:21<10:28:19, 5.73s/it] {'loss': 0.5085, 'learning_rate': 1.761984362351151e-05, 'epoch': 0.25} + 25%|██▍ | 2166/8750 [3:33:21<10:28:19, 5.73s/it] {'loss': 0.5085, 'learning_rate': 1.761984362351151e-05, 'epoch': 0.25} + 25%|██▍ | 2166/8750 [3:33:15<10:28:18, 5.73s/it] 25%|██▍ | 2167/8750 [3:33:27<10:38:14, 5.82s/it] 25%|██▍ | 2167/8750 [3:33:21<10:38:14, 5.82s/it] {'loss': 0.4582, 'learning_rate': 1.7617445928171458e-05, 'epoch': 0.25} + 25%|██▍ | 2167/8750 [3:33:27<10:38:14, 5.82s/it] 
{'loss': 0.4582, 'learning_rate': 1.7617445928171458e-05, 'epoch': 0.25} + 25%|██▍ | 2167/8750 [3:33:21<10:38:14, 5.82s/it] 25%|██▍ | 2168/8750 [3:33:33<10:30:22, 5.75s/it] 25%|██▍ | 2168/8750 [3:33:26<10:30:23, 5.75s/it] {'loss': 0.499, 'learning_rate': 1.761504718907124e-05, 'epoch': 0.25} + 25%|██▍ | 2168/8750 [3:33:33<10:30:22, 5.75s/it] {'loss': 0.499, 'learning_rate': 1.761504718907124e-05, 'epoch': 0.25} + 25%|██▍ | 2168/8750 [3:33:26<10:30:23, 5.75s/it] 25%|██▍ | 2169/8750 [3:33:39<10:31:49, 5.76s/it] 25%|██▍ | 2169/8750 [3:33:32<10:31:49, 5.76s/it] {'loss': 0.4789, 'learning_rate': 1.7612647406539548e-05, 'epoch': 0.25} + 25%|██▍ | 2169/8750 [3:33:39<10:31:49, 5.76s/it] {'loss': 0.4789, 'learning_rate': 1.7612647406539548e-05, 'epoch': 0.25} + 25%|██▍ | 2169/8750 [3:33:32<10:31:49, 5.76s/it] 25%|██▍ | 2170/8750 [3:33:44<10:33:26, 5.78s/it] 25%|██▍ | 2170/8750 [3:33:38<10:33:26, 5.78s/it] {'loss': 0.4757, 'learning_rate': 1.76102465809052e-05, 'epoch': 0.25} + 25%|██▍ | 2170/8750 [3:33:44<10:33:26, 5.78s/it] {'loss': 0.4757, 'learning_rate': 1.76102465809052e-05, 'epoch': 0.25} + 25%|██▍ | 2170/8750 [3:33:38<10:33:26, 5.78s/it] 25%|██▍ | 2171/8750 [3:33:50<10:31:00, 5.75s/it] 25%|██▍ | 2171/8750 [3:33:44<10:31:01, 5.75s/it] {'loss': 0.4804, 'learning_rate': 1.760784471249716e-05, 'epoch': 0.25} + 25%|██▍ | 2171/8750 [3:33:50<10:31:00, 5.75s/it] {'loss': 0.4804, 'learning_rate': 1.760784471249716e-05, 'epoch': 0.25} + 25%|██▍ | 2171/8750 [3:33:44<10:31:01, 5.75s/it] 25%|██▍ | 2172/8750 [3:33:49<10:30:40, 5.75s/it] 25%|██▍ | 2172/8750 [3:33:56<10:30:40, 5.75s/it] {'loss': 0.4877, 'learning_rate': 1.760544180164454e-05, 'epoch': 0.25} + {'loss': 0.4877, 'learning_rate': 1.760544180164454e-05, 'epoch': 0.25} 25%|██▍ | 2172/8750 [3:33:56<10:30:40, 5.75s/it] + 25%|██▍ | 2172/8750 [3:33:49<10:30:40, 5.75s/it] 25%|██▍ | 2173/8750 [3:33:55<10:31:02, 5.76s/it] 25%|██▍ | 2173/8750 [3:34:02<10:31:03, 5.76s/it] {'loss': 0.4686, 'learning_rate': 1.7603037848676593e-05, 
'epoch': 0.25} + 25%|██▍ | 2173/8750 [3:34:02<10:31:03, 5.76s/it] {'loss': 0.4686, 'learning_rate': 1.7603037848676593e-05, 'epoch': 0.25} + 25%|██▍ | 2173/8750 [3:33:55<10:31:02, 5.76s/it] 25%|██▍ | 2174/8750 [3:34:01<10:35:48, 5.80s/it] 25%|██▍ | 2174/8750 [3:34:08<10:35:48, 5.80s/it] {'loss': 0.4784, 'learning_rate': 1.7600632853922713e-05, 'epoch': 0.25} + 25%|██▍ | 2174/8750 [3:34:08<10:35:48, 5.80s/it] {'loss': 0.4784, 'learning_rate': 1.7600632853922713e-05, 'epoch': 0.25} + 25%|██▍ | 2174/8750 [3:34:01<10:35:48, 5.80s/it] 25%|██▍ | 2175/8750 [3:34:13<10:38:53, 5.83s/it] 25%|██▍ | 2175/8750 [3:34:07<10:38:53, 5.83s/it] {'loss': 0.4851, 'learning_rate': 1.7598226817712442e-05, 'epoch': 0.25} + 25%|██▍ | 2175/8750 [3:34:13<10:38:53, 5.83s/it] {'loss': 0.4851, 'learning_rate': 1.7598226817712442e-05, 'epoch': 0.25} + 25%|██▍ | 2175/8750 [3:34:07<10:38:53, 5.83s/it] 25%|██▍ | 2176/8750 [3:34:13<10:33:57, 5.79s/it] 25%|██▍ | 2176/8750 [3:34:19<10:33:57, 5.79s/it] {'loss': 0.4848, 'learning_rate': 1.7595819740375457e-05, 'epoch': 0.25} + 25%|██▍ | 2176/8750 [3:34:19<10:33:57, 5.79s/it] {'loss': 0.4848, 'learning_rate': 1.7595819740375457e-05, 'epoch': 0.25} + 25%|██▍ | 2176/8750 [3:34:13<10:33:57, 5.79s/it] 25%|██▍ | 2177/8750 [3:34:25<10:40:48, 5.85s/it] 25%|██▍ | 2177/8750 [3:34:19<10:40:48, 5.85s/it] {'loss': 0.4683, 'learning_rate': 1.7593411622241584e-05, 'epoch': 0.25} + 25%|██▍ | 2177/8750 [3:34:25<10:40:48, 5.85s/it] {'loss': 0.4683, 'learning_rate': 1.7593411622241584e-05, 'epoch': 0.25} + 25%|██▍ | 2177/8750 [3:34:19<10:40:48, 5.85s/it] 25%|██▍ | 2178/8750 [3:34:25<10:42:47, 5.87s/it] 25%|██▍ | 2178/8750 [3:34:31<10:42:48, 5.87s/it] {'loss': 0.4815, 'learning_rate': 1.7591002463640784e-05, 'epoch': 0.25} + 25%|██▍ | 2178/8750 [3:34:31<10:42:48, 5.87s/it] {'loss': 0.4815, 'learning_rate': 1.7591002463640784e-05, 'epoch': 0.25} + 25%|██▍ | 2178/8750 [3:34:25<10:42:47, 5.87s/it] 25%|██▍ | 2179/8750 [3:34:37<10:36:33, 5.81s/it] 25%|██▍ | 2179/8750 
[3:34:30<10:36:33, 5.81s/it] {'loss': 0.5039, 'learning_rate': 1.758859226490317e-05, 'epoch': 0.25} + 25%|██▍ | 2179/8750 [3:34:37<10:36:33, 5.81s/it] {'loss': 0.5039, 'learning_rate': 1.758859226490317e-05, 'epoch': 0.25} + 25%|██▍ | 2179/8750 [3:34:30<10:36:33, 5.81s/it] 25%|██▍ | 2180/8750 [3:34:42<10:30:29, 5.76s/it] 25%|██▍ | 2180/8750 [3:34:36<10:30:29, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.7586181026358987e-05, 'epoch': 0.25} + 25%|██▍ | 2180/8750 [3:34:42<10:30:29, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.7586181026358987e-05, 'epoch': 0.25} + 25%|██▍ | 2180/8750 [3:34:36<10:30:29, 5.76s/it] 25%|██▍ | 2181/8750 [3:34:42<10:31:11, 5.77s/it] 25%|██▍ | 2181/8750 [3:34:48<10:31:11, 5.77s/it] {'loss': 0.4929, 'learning_rate': 1.758376874833864e-05, 'epoch': 0.25} + 25%|██▍ | 2181/8750 [3:34:48<10:31:11, 5.77s/it] {'loss': 0.4929, 'learning_rate': 1.758376874833864e-05, 'epoch': 0.25} + 25%|██▍ | 2181/8750 [3:34:42<10:31:11, 5.77s/it] 25%|██▍ | 2182/8750 [3:34:47<10:29:36, 5.75s/it] 25%|██▍ | 2182/8750 [3:34:54<10:29:36, 5.75s/it] {'loss': 0.4848, 'learning_rate': 1.7581355431172653e-05, 'epoch': 0.25} + 25%|██▍ | 2182/8750 [3:34:54<10:29:36, 5.75s/it] {'loss': 0.4848, 'learning_rate': 1.7581355431172653e-05, 'epoch': 0.25} + 25%|██▍ | 2182/8750 [3:34:47<10:29:36, 5.75s/it] 25%|██▍ | 2183/8750 [3:35:00<10:33:41, 5.79s/it] 25%|██▍ | 2183/8750 [3:34:53<10:33:42, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.757894107519171e-05, 'epoch': 0.25} + 25%|██▍ | 2183/8750 [3:35:00<10:33:41, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.757894107519171e-05, 'epoch': 0.25} + 25%|██▍ | 2183/8750 [3:34:53<10:33:42, 5.79s/it] 25%|██▍ | 2184/8750 [3:35:05<10:31:03, 5.77s/it] 25%|██▍ | 2184/8750 [3:34:59<10:31:04, 5.77s/it] {'loss': 0.4889, 'learning_rate': 1.757652568072663e-05, 'epoch': 0.25} + 25%|██▍ | 2184/8750 [3:35:05<10:31:03, 5.77s/it] {'loss': 0.4889, 'learning_rate': 1.757652568072663e-05, 'epoch': 0.25} + 25%|██▍ | 2184/8750 [3:34:59<10:31:04, 5.77s/it] 
25%|██▍ | 2185/8750 [3:35:05<10:42:18, 5.87s/it] 25%|██▍ | 2185/8750 [3:35:12<10:42:19, 5.87s/it] {'loss': 0.4622, 'learning_rate': 1.757410924810838e-05, 'epoch': 0.25} + 25%|██▍ | 2185/8750 [3:35:05<10:42:18, 5.87s/it]{'loss': 0.4622, 'learning_rate': 1.757410924810838e-05, 'epoch': 0.25} + 25%|██▍ | 2185/8750 [3:35:12<10:42:19, 5.87s/it] 25%|██▍ | 2186/8750 [3:35:11<10:37:54, 5.83s/it] 25%|██▍ | 2186/8750 [3:35:17<10:37:55, 5.83s/it] {'loss': 0.4766, 'learning_rate': 1.757169177766806e-05, 'epoch': 0.25} + 25%|██▍ | 2186/8750 [3:35:17<10:37:55, 5.83s/it] {'loss': 0.4766, 'learning_rate': 1.757169177766806e-05, 'epoch': 0.25} + 25%|██▍ | 2186/8750 [3:35:11<10:37:54, 5.83s/it] 25%|██▍ | 2187/8750 [3:35:23<10:38:07, 5.83s/it] 25%|██▍ | 2187/8750 [3:35:17<10:38:08, 5.83s/it] {'loss': 0.4708, 'learning_rate': 1.7569273269736918e-05, 'epoch': 0.25} + 25%|██▍ | 2187/8750 [3:35:23<10:38:07, 5.83s/it] {'loss': 0.4708, 'learning_rate': 1.7569273269736918e-05, 'epoch': 0.25} + 25%|██▍ | 2187/8750 [3:35:17<10:38:08, 5.83s/it] 25%|██▌ | 2188/8750 [3:35:29<10:30:21, 5.76s/it] 25%|██▌ | 2188/8750 [3:35:22<10:30:22, 5.76s/it] {'loss': 0.537, 'learning_rate': 1.756685372464635e-05, 'epoch': 0.25} + 25%|██▌ | 2188/8750 [3:35:29<10:30:21, 5.76s/it] {'loss': 0.537, 'learning_rate': 1.756685372464635e-05, 'epoch': 0.25} + 25%|██▌ | 2188/8750 [3:35:22<10:30:22, 5.76s/it] 25%|██▌ | 2189/8750 [3:35:34<10:23:04, 5.70s/it] 25%|██▌ | 2189/8750 [3:35:28<10:23:05, 5.70s/it] {'loss': 0.4926, 'learning_rate': 1.7564433142727882e-05, 'epoch': 0.25} + 25%|██▌ | 2189/8750 [3:35:34<10:23:04, 5.70s/it] {'loss': 0.4926, 'learning_rate': 1.7564433142727882e-05, 'epoch': 0.25} + 25%|██▌ | 2189/8750 [3:35:28<10:23:05, 5.70s/it] 25%|██▌ | 2190/8750 [3:35:40<10:18:42, 5.66s/it] 25%|██▌ | 2190/8750 [3:35:33<10:18:41, 5.66s/it] {'loss': 0.4633, 'learning_rate': 1.7562011524313187e-05, 'epoch': 0.25} + 25%|██▌ | 2190/8750 [3:35:40<10:18:42, 5.66s/it] {'loss': 0.4633, 'learning_rate': 
1.7562011524313187e-05, 'epoch': 0.25} + 25%|██▌ | 2190/8750 [3:35:33<10:18:41, 5.66s/it] 25%|██▌ | 2191/8750 [3:35:46<10:21:19, 5.68s/it] 25%|██▌ | 2191/8750 [3:35:39<10:21:19, 5.68s/it] {'loss': 0.4688, 'learning_rate': 1.755958886973408e-05, 'epoch': 0.25} + 25%|██▌ | 2191/8750 [3:35:46<10:21:19, 5.68s/it] {'loss': 0.4688, 'learning_rate': 1.755958886973408e-05, 'epoch': 0.25} + 25%|██▌ | 2191/8750 [3:35:39<10:21:19, 5.68s/it] 25%|██▌ | 2192/8750 [3:35:51<10:18:00, 5.65s/it] 25%|██▌ | 2192/8750 [3:35:45<10:18:00, 5.65s/it] {'loss': 0.4958, 'learning_rate': 1.7557165179322522e-05, 'epoch': 0.25} + 25%|██▌ | 2192/8750 [3:35:51<10:18:00, 5.65s/it] {'loss': 0.4958, 'learning_rate': 1.7557165179322522e-05, 'epoch': 0.25} + 25%|██▌ | 2192/8750 [3:35:45<10:18:00, 5.65s/it] 25%|██▌ | 2193/8750 [3:35:57<10:25:26, 5.72s/it] 25%|██▌ | 2193/8750 [3:35:51<10:25:26, 5.72s/it] {'loss': 0.4528, 'learning_rate': 1.7554740453410617e-05, 'epoch': 0.25} + 25%|██▌ | 2193/8750 [3:35:57<10:25:26, 5.72s/it] {'loss': 0.4528, 'learning_rate': 1.7554740453410617e-05, 'epoch': 0.25} + 25%|██▌ | 2193/8750 [3:35:51<10:25:26, 5.72s/it] 25%|██▌ | 2194/8750 [3:36:03<10:24:06, 5.71s/it] 25%|██▌ | 2194/8750 [3:35:56<10:24:05, 5.71s/it] {'loss': 0.4919, 'learning_rate': 1.75523146923306e-05, 'epoch': 0.25} + 25%|██▌ | 2194/8750 [3:36:03<10:24:06, 5.71s/it] {'loss': 0.4919, 'learning_rate': 1.75523146923306e-05, 'epoch': 0.25} + 25%|██▌ | 2194/8750 [3:35:56<10:24:05, 5.71s/it] 25%|██▌ | 2195/8750 [3:36:08<10:24:19, 5.71s/it] 25%|██▌ | 2195/8750 [3:36:02<10:24:19, 5.71s/it] {'loss': 0.4762, 'learning_rate': 1.7549887896414853e-05, 'epoch': 0.25} + 25%|██▌ | 2195/8750 [3:36:08<10:24:19, 5.71s/it] {'loss': 0.4762, 'learning_rate': 1.7549887896414853e-05, 'epoch': 0.25} + 25%|██▌ | 2195/8750 [3:36:02<10:24:19, 5.71s/it] 25%|██▌ | 2196/8750 [3:36:08<10:26:39, 5.74s/it] 25%|██▌ | 2196/8750 [3:36:14<10:26:40, 5.74s/it] {'loss': 0.4834, 'learning_rate': 1.7547460065995903e-05, 'epoch': 0.25} + 25%|██▌ | 
2196/8750 [3:36:14<10:26:40, 5.74s/it] {'loss': 0.4834, 'learning_rate': 1.7547460065995903e-05, 'epoch': 0.25} + 25%|██▌ | 2196/8750 [3:36:08<10:26:39, 5.74s/it] 25%|██▌ | 2197/8750 [3:36:13<10:23:09, 5.71s/it] 25%|██▌ | 2197/8750 [3:36:20<10:23:10, 5.71s/it] {'loss': 0.5091, 'learning_rate': 1.754503120140642e-05, 'epoch': 0.25} + 25%|██▌ | 2197/8750 [3:36:20<10:23:10, 5.71s/it] {'loss': 0.5091, 'learning_rate': 1.754503120140642e-05, 'epoch': 0.25} + 25%|██▌ | 2197/8750 [3:36:13<10:23:09, 5.71s/it] 25%|██▌ | 2198/8750 [3:36:19<10:26:39, 5.74s/it] 25%|██▌ | 2198/8750 [3:36:26<10:26:39, 5.74s/it] {'loss': 0.4773, 'learning_rate': 1.7542601302979213e-05, 'epoch': 0.25} + 25%|██▌ | 2198/8750 [3:36:26<10:26:39, 5.74s/it] {'loss': 0.4773, 'learning_rate': 1.7542601302979213e-05, 'epoch': 0.25} + 25%|██▌ | 2198/8750 [3:36:19<10:26:39, 5.74s/it] 25%|██▌ | 2199/8750 [3:36:25<10:29:27, 5.77s/it] 25%|██▌ | 2199/8750 [3:36:32<10:29:27, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.7540170371047228e-05, 'epoch': 0.25} + 25%|██▌ | 2199/8750 [3:36:32<10:29:27, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.7540170371047228e-05, 'epoch': 0.25} + 25%|██▌ | 2199/8750 [3:36:25<10:29:27, 5.77s/it]12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +4 13 AutoResumeHook: Checking whether to suspend... +2AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...7 AutoResumeHook: Checking whether to suspend... +6 + AutoResumeHook: Checking whether to suspend... +03 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 
+14 AutoResumeHook: Checking whether to suspend...1 + 25%|██▌ | 2200/8750 [3:36:37<10:27:47, 5.75s/it]AutoResumeHook: Checking whether to suspend... + 25%|██▌ | 2200/8750 [3:36:31<10:27:47, 5.75s/it] {'loss': 0.4855, 'learning_rate': 1.753773840594356e-05, 'epoch': 0.25} + 25%|██▌ | 2200/8750 [3:36:37<10:27:47, 5.75s/it] {'loss': 0.4855, 'learning_rate': 1.753773840594356e-05, 'epoch': 0.25} + 25%|██▌ | 2200/8750 [3:36:31<10:27:47, 5.75s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 25%|██▌ | 2201/8750 [3:36:59<18:59:36, 10.44s/it] 25%|██▌ | 2201/8750 [3:36:52<18:59:37, 10.44s/it] {'loss': 0.48, 'learning_rate': 1.753530540800144e-05, 'epoch': 0.25} + 25%|██▌ | 2201/8750 [3:36:59<18:59:36, 10.44s/it] {'loss': 0.48, 'learning_rate': 1.753530540800144e-05, 'epoch': 0.25} + 25%|██▌ | 2201/8750 [3:36:52<18:59:37, 10.44s/it] 25%|██▌ | 2202/8750 [3:37:04<16:20:40, 8.99s/it] 25%|██▌ | 2202/8750 [3:36:58<16:20:40, 8.99s/it] {'loss': 0.4906, 'learning_rate': 1.7532871377554243e-05, 'epoch': 0.25} + 25%|██▌ | 2202/8750 [3:37:04<16:20:40, 8.99s/it] {'loss': 0.4906, 'learning_rate': 1.7532871377554243e-05, 'epoch': 0.25} + 25%|██▌ | 2202/8750 [3:36:58<16:20:40, 8.99s/it] 25%|██▌ | 2203/8750 [3:37:10<14:43:20, 8.10s/it] 25%|██▌ | 2203/8750 [3:37:04<14:43:21, 8.10s/it] {'loss': 0.4709, 'learning_rate': 1.7530436314935492e-05, 'epoch': 0.25} + {'loss': 0.4709, 'learning_rate': 1.7530436314935492e-05, 'epoch': 0.25} + 25%|██▌ | 2203/8750 [3:37:10<14:43:20, 8.10s/it] 25%|██▌ | 2203/8750 [3:37:04<14:43:21, 8.10s/it] 25%|██▌ | 2204/8750 [3:37:10<13:31:25, 7.44s/it] 25%|██▌ | 2204/8750 [3:37:16<13:31:27, 7.44s/it] {'loss': 0.4907, 'learning_rate': 1.7528000220478836e-05, 'epoch': 0.25} + 25%|██▌ | 2204/8750 [3:37:16<13:31:27, 7.44s/it] {'loss': 0.4907, 'learning_rate': 1.7528000220478836e-05, 'epoch': 0.25} + 25%|██▌ | 2204/8750 [3:37:10<13:31:25, 7.44s/it] 25%|██▌ | 2205/8750 [3:37:22<12:34:56, 6.92s/it] 25%|██▌ | 2205/8750 [3:37:15<12:34:57, 6.92s/it] {'loss': 0.4629, 'learning_rate': 1.7525563094518078e-05, 'epoch': 0.25} + 25%|██▌ | 2205/8750 [3:37:22<12:34:56, 6.92s/it] {'loss': 0.4629, 'learning_rate': 1.7525563094518078e-05, 'epoch': 0.25} + 25%|██▌ | 2205/8750 [3:37:15<12:34:57, 6.92s/it] 25%|██▌ | 2206/8750 [3:37:21<11:54:29, 6.55s/it] 25%|██▌ | 2206/8750 [3:37:28<11:54:29, 6.55s/it] {'loss': 0.4857, 'learning_rate': 1.7523124937387162e-05, 'epoch': 0.25} + 25%|██▌ | 2206/8750 [3:37:28<11:54:29, 6.55s/it] {'loss': 0.4857, 
'learning_rate': 1.7523124937387162e-05, 'epoch': 0.25} + 25%|██▌ | 2206/8750 [3:37:21<11:54:29, 6.55s/it] 25%|██▌ | 2207/8750 [3:37:27<11:31:42, 6.34s/it] 25%|██▌ | 2207/8750 [3:37:33<11:31:42, 6.34s/it] {'loss': 0.4732, 'learning_rate': 1.7520685749420164e-05, 'epoch': 0.25} + 25%|██▌ | 2207/8750 [3:37:33<11:31:42, 6.34s/it] {'loss': 0.4732, 'learning_rate': 1.7520685749420164e-05, 'epoch': 0.25} + 25%|██▌ | 2207/8750 [3:37:27<11:31:42, 6.34s/it] 25%|██▌ | 2208/8750 [3:37:33<11:09:30, 6.14s/it] 25%|██▌ | 2208/8750 [3:37:39<11:09:30, 6.14s/it] {'loss': 0.4853, 'learning_rate': 1.7518245530951315e-05, 'epoch': 0.25} + 25%|██▌ | 2208/8750 [3:37:39<11:09:30, 6.14s/it] {'loss': 0.4853, 'learning_rate': 1.7518245530951315e-05, 'epoch': 0.25} + 25%|██▌ | 2208/8750 [3:37:33<11:09:30, 6.14s/it] 25%|██▌ | 2209/8750 [3:37:45<11:00:21, 6.06s/it] 25%|██▌ | 2209/8750 [3:37:38<11:00:23, 6.06s/it] {'loss': 0.4757, 'learning_rate': 1.7515804282314974e-05, 'epoch': 0.25} + 25%|██▌ | 2209/8750 [3:37:45<11:00:21, 6.06s/it] {'loss': 0.4757, 'learning_rate': 1.7515804282314974e-05, 'epoch': 0.25} + 25%|██▌ | 2209/8750 [3:37:38<11:00:23, 6.06s/it] 25%|██▌ | 2210/8750 [3:37:51<10:50:35, 5.97s/it] 25%|██▌ | 2210/8750 [3:37:44<10:50:35, 5.97s/it] {'loss': 0.48, 'learning_rate': 1.751336200384564e-05, 'epoch': 0.25} + 25%|██▌ | 2210/8750 [3:37:51<10:50:35, 5.97s/it] {'loss': 0.48, 'learning_rate': 1.751336200384564e-05, 'epoch': 0.25} + 25%|██▌ | 2210/8750 [3:37:44<10:50:35, 5.97s/it] 25%|██▌ | 2211/8750 [3:37:56<10:43:35, 5.91s/it] 25%|██▌ | 2211/8750 [3:37:50<10:43:35, 5.91s/it] {'loss': 0.4729, 'learning_rate': 1.751091869587797e-05, 'epoch': 0.25} + 25%|██▌ | 2211/8750 [3:37:56<10:43:35, 5.91s/it] {'loss': 0.4729, 'learning_rate': 1.751091869587797e-05, 'epoch': 0.25} + 25%|██▌ | 2211/8750 [3:37:50<10:43:35, 5.91s/it] 25%|██▌ | 2212/8750 [3:37:56<10:34:46, 5.83s/it] 25%|██▌ | 2212/8750 [3:38:02<10:34:47, 5.83s/it] {'loss': 0.4996, 'learning_rate': 1.7508474358746753e-05, 'epoch': 0.25} 
+ 25%|██▌ | 2212/8750 [3:38:02<10:34:47, 5.83s/it] {'loss': 0.4996, 'learning_rate': 1.7508474358746753e-05, 'epoch': 0.25} + 25%|██▌ | 2212/8750 [3:37:56<10:34:46, 5.83s/it] 25%|██▌ | 2213/8750 [3:38:08<10:29:08, 5.77s/it] 25%|██▌ | 2213/8750 [3:38:01<10:29:08, 5.77s/it] {'loss': 0.4631, 'learning_rate': 1.7506028992786912e-05, 'epoch': 0.25} + 25%|██▌ | 2213/8750 [3:38:08<10:29:08, 5.77s/it] {'loss': 0.4631, 'learning_rate': 1.7506028992786912e-05, 'epoch': 0.25} + 25%|██▌ | 2213/8750 [3:38:01<10:29:08, 5.77s/it] 25%|██▌ | 2214/8750 [3:38:07<10:36:56, 5.85s/it] 25%|██▌ | 2214/8750 [3:38:14<10:36:56, 5.85s/it] {'loss': 0.4687, 'learning_rate': 1.7503582598333517e-05, 'epoch': 0.25} + 25%|██▌ | 2214/8750 [3:38:14<10:36:56, 5.85s/it] {'loss': 0.4687, 'learning_rate': 1.7503582598333517e-05, 'epoch': 0.25} + 25%|██▌ | 2214/8750 [3:38:07<10:36:56, 5.85s/it] 25%|██▌ | 2215/8750 [3:38:19<10:30:07, 5.79s/it] 25%|██▌ | 2215/8750 [3:38:13<10:30:07, 5.79s/it] {'loss': 0.4873, 'learning_rate': 1.750113517572178e-05, 'epoch': 0.25} + 25%|██▌ | 2215/8750 [3:38:19<10:30:07, 5.79s/it] {'loss': 0.4873, 'learning_rate': 1.750113517572178e-05, 'epoch': 0.25} + 25%|██▌ | 2215/8750 [3:38:13<10:30:07, 5.79s/it] 25%|██▌ | 2216/8750 [3:38:19<10:25:59, 5.75s/it] 25%|██▌ | 2216/8750 [3:38:25<10:26:00, 5.75s/it] {'loss': 0.474, 'learning_rate': 1.749868672528705e-05, 'epoch': 0.25} + 25%|██▌ | 2216/8750 [3:38:25<10:26:00, 5.75s/it] {'loss': 0.474, 'learning_rate': 1.749868672528705e-05, 'epoch': 0.25} + 25%|██▌ | 2216/8750 [3:38:19<10:25:59, 5.75s/it] 25%|██▌ | 2217/8750 [3:38:31<10:34:41, 5.83s/it] 25%|██▌ | 2217/8750 [3:38:25<10:34:44, 5.83s/it] {'loss': 0.4696, 'learning_rate': 1.7496237247364827e-05, 'epoch': 0.25} + 25%|██▌ | 2217/8750 [3:38:31<10:34:41, 5.83s/it] {'loss': 0.4696, 'learning_rate': 1.7496237247364827e-05, 'epoch': 0.25} + 25%|██▌ | 2217/8750 [3:38:25<10:34:44, 5.83s/it] 25%|██▌ | 2218/8750 [3:38:37<10:33:50, 5.82s/it] 25%|██▌ | 2218/8750 [3:38:30<10:33:49, 5.82s/it] 
{'loss': 0.4869, 'learning_rate': 1.7493786742290734e-05, 'epoch': 0.25} + 25%|██▌ | 2218/8750 [3:38:37<10:33:50, 5.82s/it] {'loss': 0.4869, 'learning_rate': 1.7493786742290734e-05, 'epoch': 0.25} + 25%|██▌ | 2218/8750 [3:38:30<10:33:49, 5.82s/it] 25%|██▌ | 2219/8750 [3:38:36<10:31:01, 5.80s/it] 25%|██▌ | 2219/8750 [3:38:43<10:31:02, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.7491335210400554e-05, 'epoch': 0.25} + 25%|██▌ | 2219/8750 [3:38:43<10:31:02, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.7491335210400554e-05, 'epoch': 0.25} + 25%|██▌ | 2219/8750 [3:38:36<10:31:01, 5.80s/it] 25%|██▌ | 2220/8750 [3:38:48<10:22:44, 5.72s/it] 25%|██▌ | 2220/8750 [3:38:42<10:22:44, 5.72s/it] {'loss': 0.5086, 'learning_rate': 1.7488882652030193e-05, 'epoch': 0.25} + 25%|██▌ | 2220/8750 [3:38:48<10:22:44, 5.72s/it] {'loss': 0.5086, 'learning_rate': 1.7488882652030193e-05, 'epoch': 0.25} + 25%|██▌ | 2220/8750 [3:38:42<10:22:44, 5.72s/it] 25%|██▌ | 2221/8750 [3:38:47<10:24:43, 5.74s/it] 25%|██▌ | 2221/8750 [3:38:54<10:24:44, 5.74s/it] {'loss': 0.4697, 'learning_rate': 1.748642906751571e-05, 'epoch': 0.25} + 25%|██▌ | 2221/8750 [3:38:54<10:24:44, 5.74s/it] {'loss': 0.4697, 'learning_rate': 1.748642906751571e-05, 'epoch': 0.25} + 25%|██▌ | 2221/8750 [3:38:47<10:24:43, 5.74s/it] 25%|██▌ | 2222/8750 [3:39:00<10:29:47, 5.79s/it] 25%|██▌ | 2222/8750 [3:38:53<10:29:47, 5.79s/it] {'loss': 0.4575, 'learning_rate': 1.7483974457193307e-05, 'epoch': 0.25} + 25%|██▌ | 2222/8750 [3:39:00<10:29:47, 5.79s/it] {'loss': 0.4575, 'learning_rate': 1.7483974457193307e-05, 'epoch': 0.25} + 25%|██▌ | 2222/8750 [3:38:53<10:29:47, 5.79s/it] 25%|██▌ | 2223/8750 [3:39:05<10:23:20, 5.73s/it] 25%|██▌ | 2223/8750 [3:38:59<10:23:20, 5.73s/it] {'loss': 0.482, 'learning_rate': 1.748151882139931e-05, 'epoch': 0.25} + 25%|██▌ | 2223/8750 [3:39:05<10:23:20, 5.73s/it] {'loss': 0.482, 'learning_rate': 1.748151882139931e-05, 'epoch': 0.25} + 25%|██▌ | 2223/8750 [3:38:59<10:23:20, 5.73s/it] 25%|██▌ | 2224/8750 
[3:39:04<10:15:59, 5.66s/it] 25%|██▌ | 2224/8750 [3:39:11<10:15:59, 5.66s/it] {'loss': 0.4892, 'learning_rate': 1.7479062160470205e-05, 'epoch': 0.25} + 25%|██▌ | 2224/8750 [3:39:11<10:15:59, 5.66s/it] {'loss': 0.4892, 'learning_rate': 1.7479062160470205e-05, 'epoch': 0.25} + 25%|██▌ | 2224/8750 [3:39:04<10:15:59, 5.66s/it] 25%|██▌ | 2225/8750 [3:39:17<10:28:48, 5.78s/it] 25%|██▌ | 2225/8750 [3:39:11<10:28:49, 5.78s/it] {'loss': 0.4507, 'learning_rate': 1.74766044747426e-05, 'epoch': 0.25} + 25%|██▌ | 2225/8750 [3:39:17<10:28:48, 5.78s/it] {'loss': 0.4507, 'learning_rate': 1.74766044747426e-05, 'epoch': 0.25} + 25%|██▌ | 2225/8750 [3:39:11<10:28:49, 5.78s/it] 25%|██▌ | 2226/8750 [3:39:23<10:26:24, 5.76s/it] 25%|██▌ | 2226/8750 [3:39:16<10:26:25, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.7474145764553262e-05, 'epoch': 0.25} + 25%|██▌ | 2226/8750 [3:39:23<10:26:24, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.7474145764553262e-05, 'epoch': 0.25} + 25%|██▌ | 2226/8750 [3:39:16<10:26:25, 5.76s/it] 25%|██▌ | 2227/8750 [3:39:29<10:28:02, 5.78s/it] 25%|██▌ | 2227/8750 [3:39:22<10:28:02, 5.78s/it] {'loss': 0.4685, 'learning_rate': 1.7471686030239082e-05, 'epoch': 0.25} + 25%|██▌ | 2227/8750 [3:39:29<10:28:02, 5.78s/it] {'loss': 0.4685, 'learning_rate': 1.7471686030239082e-05, 'epoch': 0.25} + 25%|██▌ | 2227/8750 [3:39:22<10:28:02, 5.78s/it] 25%|██▌ | 2228/8750 [3:39:27<10:17:14, 5.68s/it] 25%|██▌ | 2228/8750 [3:39:34<10:17:14, 5.68s/it] {'loss': 0.5125, 'learning_rate': 1.7469225272137104e-05, 'epoch': 0.25} + 25%|██▌ | 2228/8750 [3:39:27<10:17:14, 5.68s/it]{'loss': 0.5125, 'learning_rate': 1.7469225272137104e-05, 'epoch': 0.25} + 25%|██▌ | 2228/8750 [3:39:34<10:17:14, 5.68s/it] 25%|██▌ | 2229/8750 [3:39:33<10:28:16, 5.78s/it] 25%|██▌ | 2229/8750 [3:39:40<10:28:16, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.7466763490584504e-05, 'epoch': 0.25} + 25%|██▌ | 2229/8750 [3:39:40<10:28:16, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.7466763490584504e-05, 'epoch': 0.25} 
+ 25%|██▌ | 2229/8750 [3:39:34<10:28:16, 5.78s/it] 25%|██▌ | 2230/8750 [3:39:39<10:21:33, 5.72s/it] 25%|██▌ | 2230/8750 [3:39:46<10:21:33, 5.72s/it] {'loss': 0.4821, 'learning_rate': 1.7464300685918602e-05, 'epoch': 0.25} + 25%|██▌ | 2230/8750 [3:39:46<10:21:33, 5.72s/it] {'loss': 0.4821, 'learning_rate': 1.7464300685918602e-05, 'epoch': 0.25} + 25%|██▌ | 2230/8750 [3:39:39<10:21:33, 5.72s/it] 25%|██▌ | 2231/8750 [3:39:51<10:16:50, 5.68s/it] 25%|██▌ | 2231/8750 [3:39:45<10:16:50, 5.68s/it] {'loss': 0.4607, 'learning_rate': 1.7461836858476858e-05, 'epoch': 0.25} + 25%|██▌ | 2231/8750 [3:39:51<10:16:50, 5.68s/it] {'loss': 0.4607, 'learning_rate': 1.7461836858476858e-05, 'epoch': 0.25} + 25%|██▌ | 2231/8750 [3:39:45<10:16:50, 5.68s/it] 26%|██▌ | 2232/8750 [3:39:57<10:15:43, 5.67s/it] 26%|██▌ | 2232/8750 [3:39:50<10:15:43, 5.67s/it] {'loss': 0.4742, 'learning_rate': 1.745937200859687e-05, 'epoch': 0.26} + 26%|██▌ | 2232/8750 [3:39:57<10:15:43, 5.67s/it] {'loss': 0.4742, 'learning_rate': 1.745937200859687e-05, 'epoch': 0.26} + 26%|██▌ | 2232/8750 [3:39:50<10:15:43, 5.67s/it] 26%|██▌ | 2233/8750 [3:40:02<10:16:14, 5.67s/it] 26%|██▌ | 2233/8750 [3:39:56<10:16:14, 5.67s/it] {'loss': 0.47, 'learning_rate': 1.7456906136616374e-05, 'epoch': 0.26} + 26%|██▌ | 2233/8750 [3:40:02<10:16:14, 5.67s/it] {'loss': 0.47, 'learning_rate': 1.7456906136616374e-05, 'epoch': 0.26} + 26%|██▌ | 2233/8750 [3:39:56<10:16:14, 5.67s/it] 26%|██▌ | 2234/8750 [3:40:08<10:14:52, 5.66s/it] 26%|██▌ | 2234/8750 [3:40:02<10:14:53, 5.66s/it] {'loss': 0.4881, 'learning_rate': 1.7454439242873257e-05, 'epoch': 0.26} + 26%|██▌ | 2234/8750 [3:40:08<10:14:52, 5.66s/it] {'loss': 0.4881, 'learning_rate': 1.7454439242873257e-05, 'epoch': 0.26} + 26%|██▌ | 2234/8750 [3:40:02<10:14:53, 5.66s/it] 26%|██▌ | 2235/8750 [3:40:08<10:33:07, 5.83s/it] 26%|██▌ | 2235/8750 [3:40:14<10:33:07, 5.83s/it] {'loss': 0.4716, 'learning_rate': 1.745197132770553e-05, 'epoch': 0.26} + 26%|██▌ | 2235/8750 [3:40:14<10:33:07, 5.83s/it] 
{'loss': 0.4716, 'learning_rate': 1.745197132770553e-05, 'epoch': 0.26} + 26%|██▌ | 2235/8750 [3:40:08<10:33:07, 5.83s/it] 26%|██▌ | 2236/8750 [3:40:13<10:26:02, 5.77s/it] 26%|██▌ | 2236/8750 [3:40:20<10:26:03, 5.77s/it] {'loss': 0.5046, 'learning_rate': 1.7449502391451362e-05, 'epoch': 0.26} + 26%|██▌ | 2236/8750 [3:40:20<10:26:03, 5.77s/it] {'loss': 0.5046, 'learning_rate': 1.7449502391451362e-05, 'epoch': 0.26} + 26%|██▌ | 2236/8750 [3:40:13<10:26:02, 5.77s/it] 26%|██▌ | 2237/8750 [3:40:26<10:24:57, 5.76s/it] 26%|██▌ | 2237/8750 [3:40:19<10:24:57, 5.76s/it] {'loss': 0.48, 'learning_rate': 1.7447032434449045e-05, 'epoch': 0.26} + 26%|██▌ | 2237/8750 [3:40:26<10:24:57, 5.76s/it] {'loss': 0.48, 'learning_rate': 1.7447032434449045e-05, 'epoch': 0.26} + 26%|██▌ | 2237/8750 [3:40:19<10:24:57, 5.76s/it] 26%|██▌ | 2238/8750 [3:40:31<10:22:43, 5.74s/it] 26%|██▌ | 2238/8750 [3:40:25<10:22:44, 5.74s/it] {'loss': 0.4854, 'learning_rate': 1.7444561457037022e-05, 'epoch': 0.26} + 26%|██▌ | 2238/8750 [3:40:31<10:22:43, 5.74s/it] {'loss': 0.4854, 'learning_rate': 1.7444561457037022e-05, 'epoch': 0.26} + 26%|██▌ | 2238/8750 [3:40:25<10:22:44, 5.74s/it] 26%|██▌ | 2239/8750 [3:40:37<10:22:52, 5.74s/it] 26%|██▌ | 2239/8750 [3:40:31<10:22:53, 5.74s/it] {'loss': 0.4625, 'learning_rate': 1.744208945955387e-05, 'epoch': 0.26} + 26%|██▌ | 2239/8750 [3:40:37<10:22:52, 5.74s/it] {'loss': 0.4625, 'learning_rate': 1.744208945955387e-05, 'epoch': 0.26} + 26%|██▌ | 2239/8750 [3:40:31<10:22:53, 5.74s/it] 26%|██▌ | 2240/8750 [3:40:43<10:17:15, 5.69s/it] 26%|██▌ | 2240/8750 [3:40:36<10:17:15, 5.69s/it] {'loss': 0.4795, 'learning_rate': 1.743961644233831e-05, 'epoch': 0.26} + 26%|██▌ | 2240/8750 [3:40:43<10:17:15, 5.69s/it] {'loss': 0.4795, 'learning_rate': 1.743961644233831e-05, 'epoch': 0.26} + 26%|██▌ | 2240/8750 [3:40:36<10:17:15, 5.69s/it] 26%|██▌ | 2241/8750 [3:40:42<10:28:25, 5.79s/it] 26%|██▌ | 2241/8750 [3:40:49<10:28:25, 5.79s/it] {'loss': 0.4831, 'learning_rate': 
1.7437142405729196e-05, 'epoch': 0.26} + 26%|██▌ | 2241/8750 [3:40:49<10:28:25, 5.79s/it] {'loss': 0.4831, 'learning_rate': 1.7437142405729196e-05, 'epoch': 0.26} + 26%|██▌ | 2241/8750 [3:40:42<10:28:25, 5.79s/it] 26%|██▌ | 2242/8750 [3:40:55<10:34:26, 5.85s/it] 26%|██▌ | 2242/8750 [3:40:48<10:34:26, 5.85s/it] {'loss': 0.4779, 'learning_rate': 1.743466735006553e-05, 'epoch': 0.26} + 26%|██▌ | 2242/8750 [3:40:55<10:34:26, 5.85s/it] {'loss': 0.4779, 'learning_rate': 1.743466735006553e-05, 'epoch': 0.26} + 26%|██▌ | 2242/8750 [3:40:48<10:34:26, 5.85s/it] 26%|██▌ | 2243/8750 [3:41:01<10:32:24, 5.83s/it] 26%|██▌ | 2243/8750 [3:40:54<10:32:24, 5.83s/it] {'loss': 0.4721, 'learning_rate': 1.7432191275686454e-05, 'epoch': 0.26} + 26%|██▌ | 2243/8750 [3:41:01<10:32:24, 5.83s/it] {'loss': 0.4721, 'learning_rate': 1.7432191275686454e-05, 'epoch': 0.26} + 26%|██▌ | 2243/8750 [3:40:54<10:32:24, 5.83s/it] 26%|██▌ | 2244/8750 [3:41:06<10:28:44, 5.80s/it] 26%|██▌ | 2244/8750 [3:41:00<10:28:44, 5.80s/it] {'loss': 0.4879, 'learning_rate': 1.7429714182931238e-05, 'epoch': 0.26} + 26%|██▌ | 2244/8750 [3:41:06<10:28:44, 5.80s/it] {'loss': 0.4879, 'learning_rate': 1.7429714182931238e-05, 'epoch': 0.26} + 26%|██▌ | 2244/8750 [3:41:00<10:28:44, 5.80s/it] 26%|██▌ | 2245/8750 [3:41:12<10:31:08, 5.82s/it] 26%|██▌ | 2245/8750 [3:41:06<10:31:08, 5.82s/it] {'loss': 0.4855, 'learning_rate': 1.7427236072139306e-05, 'epoch': 0.26} + 26%|██▌ | 2245/8750 [3:41:12<10:31:08, 5.82s/it] {'loss': 0.4855, 'learning_rate': 1.7427236072139306e-05, 'epoch': 0.26} + 26%|██▌ | 2245/8750 [3:41:06<10:31:08, 5.82s/it] 26%|██▌ | 2246/8750 [3:41:18<10:34:07, 5.85s/it] 26%|██▌ | 2246/8750 [3:41:12<10:34:07, 5.85s/it] {'loss': 0.4936, 'learning_rate': 1.7424756943650203e-05, 'epoch': 0.26} + 26%|██▌ | 2246/8750 [3:41:18<10:34:07, 5.85s/it] {'loss': 0.4936, 'learning_rate': 1.7424756943650203e-05, 'epoch': 0.26} + 26%|██▌ | 2246/8750 [3:41:12<10:34:07, 5.85s/it] 26%|██▌ | 2247/8750 [3:41:17<10:28:14, 5.80s/it] 26%|██▌ 
| 2247/8750 [3:41:24<10:28:15, 5.80s/it] {'loss': 0.4762, 'learning_rate': 1.7422276797803638e-05, 'epoch': 0.26} + 26%|██▌ | 2247/8750 [3:41:24<10:28:15, 5.80s/it] {'loss': 0.4762, 'learning_rate': 1.7422276797803638e-05, 'epoch': 0.26} + 26%|██▌ | 2247/8750 [3:41:17<10:28:14, 5.80s/it] 26%|██▌ | 2248/8750 [3:41:29<10:20:45, 5.73s/it] 26%|██▌ | 2248/8750 [3:41:23<10:20:45, 5.73s/it] {'loss': 0.4638, 'learning_rate': 1.741979563493944e-05, 'epoch': 0.26} + 26%|██▌ | 2248/8750 [3:41:29<10:20:45, 5.73s/it] {'loss': 0.4638, 'learning_rate': 1.741979563493944e-05, 'epoch': 0.26} + 26%|██▌ | 2248/8750 [3:41:23<10:20:45, 5.73s/it] 26%|██▌ | 2249/8750 [3:41:35<10:20:36, 5.73s/it] 26%|██▌ | 2249/8750 [3:41:28<10:20:36, 5.73s/it] {'loss': 0.4827, 'learning_rate': 1.741731345539758e-05, 'epoch': 0.26} + 26%|██▌ | 2249/8750 [3:41:35<10:20:36, 5.73s/it] {'loss': 0.4827, 'learning_rate': 1.741731345539758e-05, 'epoch': 0.26} + 26%|██▌ | 2249/8750 [3:41:28<10:20:36, 5.73s/it]12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 26%|██▌ | 2250/8750 [3:41:41<10:17:30, 5.70s/it]14 AutoResumeHook: Checking whether to suspend... + 10 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ 26%|██▌ | 2250/8750 [3:41:34<10:17:31, 5.70s/it] {'loss': 0.4847, 'learning_rate': 1.741483025951818e-05, 'epoch': 0.26} + 26%|██▌ | 2250/8750 [3:41:41<10:17:30, 5.70s/it] {'loss': 0.4847, 'learning_rate': 1.741483025951818e-05, 'epoch': 0.26} + 26%|██▌ | 2250/8750 [3:41:34<10:17:31, 5.70s/it] 26%|██▌ | 2251/8750 [3:41:46<10:23:00, 5.75s/it] 26%|██▌ | 2251/8750 [3:41:40<10:23:00, 5.75s/it] {'loss': 0.4682, 'learning_rate': 1.7412346047641485e-05, 'epoch': 0.26} + 26%|██▌ | 2251/8750 [3:41:46<10:23:00, 5.75s/it] {'loss': 0.4682, 'learning_rate': 1.7412346047641485e-05, 'epoch': 0.26} + 26%|██▌ | 2251/8750 [3:41:40<10:23:00, 5.75s/it] 26%|██▌ | 2252/8750 [3:41:52<10:16:03, 5.69s/it] 26%|██▌ | 2252/8750 [3:41:46<10:16:02, 5.69s/it] {'loss': 0.4909, 'learning_rate': 1.74098608201079e-05, 'epoch': 0.26} + 26%|██▌ | 2252/8750 [3:41:52<10:16:03, 5.69s/it] {'loss': 0.4909, 'learning_rate': 1.74098608201079e-05, 'epoch': 0.26} + 26%|██▌ | 2252/8750 [3:41:46<10:16:02, 5.69s/it] 26%|██▌ | 2253/8750 [3:41:51<10:18:09, 5.71s/it] 26%|██▌ | 2253/8750 [3:41:58<10:18:10, 5.71s/it] {'loss': 0.479, 'learning_rate': 1.7407374577257945e-05, 'epoch': 0.26} + 26%|██▌ | 2253/8750 [3:41:58<10:18:10, 5.71s/it] {'loss': 0.479, 'learning_rate': 1.7407374577257945e-05, 'epoch': 0.26} + 26%|██▌ | 2253/8750 [3:41:51<10:18:09, 5.71s/it] 26%|██▌ | 2254/8750 [3:41:57<10:15:03, 5.68s/it] 26%|██▌ | 2254/8750 [3:42:03<10:15:04, 5.68s/it] {'loss': 0.5104, 'learning_rate': 1.7404887319432294e-05, 'epoch': 0.26} + 26%|██▌ | 2254/8750 [3:42:03<10:15:04, 5.68s/it] {'loss': 0.5104, 'learning_rate': 1.7404887319432294e-05, 'epoch': 0.26} + 26%|██▌ | 2254/8750 [3:41:57<10:15:03, 5.68s/it] 26%|██▌ | 2255/8750 [3:42:09<10:17:42, 5.71s/it] 26%|██▌ | 2255/8750 [3:42:03<10:17:42, 5.71s/it] {'loss': 0.4657, 'learning_rate': 1.740239904697176e-05, 'epoch': 0.26} + 26%|██▌ | 2255/8750 [3:42:09<10:17:42, 5.71s/it] {'loss': 0.4657, 'learning_rate': 1.740239904697176e-05, 'epoch': 0.26} + 26%|██▌ | 2255/8750 
[3:42:03<10:17:42, 5.71s/it] 26%|██▌ | 2256/8750 [3:42:08<10:11:49, 5.65s/it] 26%|██▌ | 2256/8750 [3:42:15<10:11:50, 5.65s/it] {'loss': 0.4881, 'learning_rate': 1.7399909760217285e-05, 'epoch': 0.26} + 26%|██▌ | 2256/8750 [3:42:15<10:11:50, 5.65s/it] {'loss': 0.4881, 'learning_rate': 1.7399909760217285e-05, 'epoch': 0.26} + 26%|██▌ | 2256/8750 [3:42:08<10:11:49, 5.65s/it] 26%|██▌ | 2257/8750 [3:42:14<10:10:10, 5.64s/it] 26%|██▌ | 2257/8750 [3:42:20<10:10:10, 5.64s/it] {'loss': 0.4684, 'learning_rate': 1.7397419459509962e-05, 'epoch': 0.26} + 26%|██▌ | 2257/8750 [3:42:20<10:10:10, 5.64s/it] {'loss': 0.4684, 'learning_rate': 1.7397419459509962e-05, 'epoch': 0.26} + 26%|██▌ | 2257/8750 [3:42:14<10:10:10, 5.64s/it] 26%|██▌ | 2258/8750 [3:42:26<10:15:15, 5.69s/it] 26%|██▌ | 2258/8750 [3:42:20<10:15:15, 5.69s/it]{'loss': 0.4574, 'learning_rate': 1.739492814519102e-05, 'epoch': 0.26} + 26%|██▌ | 2258/8750 [3:42:26<10:15:15, 5.69s/it] {'loss': 0.4574, 'learning_rate': 1.739492814519102e-05, 'epoch': 0.26} + 26%|██▌ | 2258/8750 [3:42:20<10:15:15, 5.69s/it] 26%|██▌ | 2259/8750 [3:42:25<10:21:09, 5.74s/it] 26%|██▌ | 2259/8750 [3:42:32<10:21:09, 5.74s/it] {'loss': 0.4794, 'learning_rate': 1.739243581760182e-05, 'epoch': 0.26} + 26%|██▌ | 2259/8750 [3:42:32<10:21:09, 5.74s/it] {'loss': 0.4794, 'learning_rate': 1.739243581760182e-05, 'epoch': 0.26} + 26%|██▌ | 2259/8750 [3:42:25<10:21:09, 5.74s/it] 26%|██▌ | 2260/8750 [3:42:31<10:17:39, 5.71s/it] 26%|██▌ | 2260/8750 [3:42:38<10:17:40, 5.71s/it] {'loss': 0.4778, 'learning_rate': 1.738994247708387e-05, 'epoch': 0.26} + 26%|██▌ | 2260/8750 [3:42:38<10:17:40, 5.71s/it] {'loss': 0.4778, 'learning_rate': 1.738994247708387e-05, 'epoch': 0.26} + 26%|██▌ | 2260/8750 [3:42:31<10:17:39, 5.71s/it] 26%|██▌ | 2261/8750 [3:42:44<10:23:47, 5.77s/it] 26%|██▌ | 2261/8750 [3:42:37<10:23:48, 5.77s/it] {'loss': 0.489, 'learning_rate': 1.7387448123978813e-05, 'epoch': 0.26} + 26%|██▌ | 2261/8750 [3:42:44<10:23:47, 5.77s/it] {'loss': 0.489, 
'learning_rate': 1.7387448123978813e-05, 'epoch': 0.26} + 26%|██▌ | 2261/8750 [3:42:37<10:23:48, 5.77s/it] 26%|██▌ | 2262/8750 [3:42:49<10:16:43, 5.70s/it] 26%|██▌ | 2262/8750 [3:42:43<10:16:43, 5.70s/it] {'loss': 0.4871, 'learning_rate': 1.7384952758628423e-05, 'epoch': 0.26} + 26%|██▌ | 2262/8750 [3:42:49<10:16:43, 5.70s/it] {'loss': 0.4871, 'learning_rate': 1.7384952758628423e-05, 'epoch': 0.26} + 26%|██▌ | 2262/8750 [3:42:43<10:16:43, 5.70s/it] 26%|██▌ | 2263/8750 [3:42:55<10:24:56, 5.78s/it] 26%|██▌ | 2263/8750 [3:42:49<10:24:56, 5.78s/it] {'loss': 0.4855, 'learning_rate': 1.738245638137463e-05, 'epoch': 0.26} + 26%|██▌ | 2263/8750 [3:42:55<10:24:56, 5.78s/it] {'loss': 0.4855, 'learning_rate': 1.738245638137463e-05, 'epoch': 0.26} + 26%|██▌ | 2263/8750 [3:42:49<10:24:56, 5.78s/it] 26%|██▌ | 2264/8750 [3:43:01<10:25:36, 5.79s/it] 26%|██▌ | 2264/8750 [3:42:54<10:25:36, 5.79s/it] {'loss': 0.471, 'learning_rate': 1.7379958992559494e-05, 'epoch': 0.26} + 26%|██▌ | 2264/8750 [3:43:01<10:25:36, 5.79s/it] {'loss': 0.471, 'learning_rate': 1.7379958992559494e-05, 'epoch': 0.26} + 26%|██▌ | 2264/8750 [3:42:54<10:25:36, 5.79s/it] 26%|██▌ | 2265/8750 [3:43:06<10:21:27, 5.75s/it] 26%|██▌ | 2265/8750 [3:43:00<10:21:27, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.737746059252521e-05, 'epoch': 0.26} + 26%|██▌ | 2265/8750 [3:43:06<10:21:27, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.737746059252521e-05, 'epoch': 0.26} + 26%|██▌ | 2265/8750 [3:43:00<10:21:27, 5.75s/it] 26%|██▌ | 2266/8750 [3:43:06<10:22:11, 5.76s/it] 26%|██▌ | 2266/8750 [3:43:12<10:22:11, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.737496118161411e-05, 'epoch': 0.26} + 26%|██▌ | 2266/8750 [3:43:12<10:22:11, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.737496118161411e-05, 'epoch': 0.26} + 26%|██▌ | 2266/8750 [3:43:06<10:22:11, 5.76s/it] 26%|██▌ | 2267/8750 [3:43:18<10:18:24, 5.72s/it] 26%|██▌ | 2267/8750 [3:43:11<10:18:25, 5.72s/it] {'loss': 0.492, 'learning_rate': 1.7372460760168676e-05, 'epoch': 0.26} 
+ 26%|██▌ | 2267/8750 [3:43:18<10:18:24, 5.72s/it] {'loss': 0.492, 'learning_rate': 1.7372460760168676e-05, 'epoch': 0.26} + 26%|██▌ | 2267/8750 [3:43:11<10:18:25, 5.72s/it] 26%|██▌ | 2268/8750 [3:43:24<10:14:30, 5.69s/it] 26%|██▌ | 2268/8750 [3:43:17<10:14:30, 5.69s/it] {'loss': 0.4756, 'learning_rate': 1.736995932853152e-05, 'epoch': 0.26} + 26%|██▌ | 2268/8750 [3:43:24<10:14:30, 5.69s/it] {'loss': 0.4756, 'learning_rate': 1.736995932853152e-05, 'epoch': 0.26} + 26%|██▌ | 2268/8750 [3:43:17<10:14:30, 5.69s/it] 26%|██▌ | 2269/8750 [3:43:29<10:12:06, 5.67s/it] 26%|██▌ | 2269/8750 [3:43:23<10:12:06, 5.67s/it] {'loss': 0.5099, 'learning_rate': 1.736745688704539e-05, 'epoch': 0.26} + 26%|██▌ | 2269/8750 [3:43:29<10:12:06, 5.67s/it] {'loss': 0.5099, 'learning_rate': 1.736745688704539e-05, 'epoch': 0.26} + 26%|██▌ | 2269/8750 [3:43:23<10:12:06, 5.67s/it] 26%|██▌ | 2270/8750 [3:43:28<10:09:43, 5.65s/it] 26%|██▌ | 2270/8750 [3:43:35<10:09:44, 5.65s/it] {'loss': 0.476, 'learning_rate': 1.736495343605318e-05, 'epoch': 0.26} + 26%|██▌ | 2270/8750 [3:43:35<10:09:44, 5.65s/it] {'loss': 0.476, 'learning_rate': 1.736495343605318e-05, 'epoch': 0.26} + 26%|██▌ | 2270/8750 [3:43:28<10:09:43, 5.65s/it] 26%|██▌ | 2271/8750 [3:43:41<10:18:27, 5.73s/it] 26%|██▌ | 2271/8750 [3:43:34<10:18:28, 5.73s/it] {'loss': 0.4761, 'learning_rate': 1.736244897589792e-05, 'epoch': 0.26} + 26%|██▌ | 2271/8750 [3:43:41<10:18:27, 5.73s/it] {'loss': 0.4761, 'learning_rate': 1.736244897589792e-05, 'epoch': 0.26} + 26%|██▌ | 2271/8750 [3:43:34<10:18:28, 5.73s/it] 26%|██▌ | 2272/8750 [3:43:46<10:19:39, 5.74s/it] 26%|██▌ | 2272/8750 [3:43:40<10:19:39, 5.74s/it] {'loss': 0.4725, 'learning_rate': 1.7359943506922775e-05, 'epoch': 0.26} + 26%|██▌ | 2272/8750 [3:43:46<10:19:39, 5.74s/it] {'loss': 0.4725, 'learning_rate': 1.7359943506922775e-05, 'epoch': 0.26} + 26%|██▌ | 2272/8750 [3:43:40<10:19:39, 5.74s/it] 26%|██▌ | 2273/8750 [3:43:52<10:18:53, 5.73s/it] 26%|██▌ | 2273/8750 [3:43:46<10:18:53, 5.73s/it] 
{'loss': 0.477, 'learning_rate': 1.735743702947105e-05, 'epoch': 0.26} + 26%|██▌ | 2273/8750 [3:43:52<10:18:53, 5.73s/it] {'loss': 0.477, 'learning_rate': 1.735743702947105e-05, 'epoch': 0.26} + 26%|██▌ | 2273/8750 [3:43:46<10:18:53, 5.73s/it] 26%|██▌ | 2274/8750 [3:43:58<10:21:07, 5.75s/it] 26%|██▌ | 2274/8750 [3:43:51<10:21:07, 5.75s/it] {'loss': 0.4682, 'learning_rate': 1.7354929543886186e-05, 'epoch': 0.26} + 26%|██▌ | 2274/8750 [3:43:58<10:21:07, 5.75s/it] {'loss': 0.4682, 'learning_rate': 1.7354929543886186e-05, 'epoch': 0.26} + 26%|██▌ | 2274/8750 [3:43:51<10:21:07, 5.75s/it] 26%|██▌ | 2275/8750 [3:44:04<10:21:12, 5.76s/it] 26%|██▌ | 2275/8750 [3:43:57<10:21:13, 5.76s/it] {'loss': 0.4591, 'learning_rate': 1.7352421050511767e-05, 'epoch': 0.26} + 26%|██▌ | 2275/8750 [3:44:04<10:21:12, 5.76s/it] {'loss': 0.4591, 'learning_rate': 1.7352421050511767e-05, 'epoch': 0.26} + 26%|██▌ | 2275/8750 [3:43:57<10:21:13, 5.76s/it] 26%|██▌ | 2276/8750 [3:44:09<10:19:52, 5.74s/it] 26%|██▌ | 2276/8750 [3:44:03<10:19:51, 5.74s/it] {'loss': 0.4755, 'learning_rate': 1.734991154969152e-05, 'epoch': 0.26} + 26%|██▌ | 2276/8750 [3:44:09<10:19:52, 5.74s/it] {'loss': 0.4755, 'learning_rate': 1.734991154969152e-05, 'epoch': 0.26} + 26%|██▌ | 2276/8750 [3:44:03<10:19:51, 5.74s/it] 26%|██▌ | 2277/8750 [3:44:15<10:16:29, 5.71s/it] 26%|██▌ | 2277/8750 [3:44:09<10:16:29, 5.71s/it] {'loss': 0.4902, 'learning_rate': 1.7347401041769284e-05, 'epoch': 0.26} + 26%|██▌ | 2277/8750 [3:44:15<10:16:29, 5.71s/it] {'loss': 0.4902, 'learning_rate': 1.7347401041769284e-05, 'epoch': 0.26} + 26%|██▌ | 2277/8750 [3:44:09<10:16:29, 5.71s/it] 26%|██▌ | 2278/8750 [3:44:21<10:14:25, 5.70s/it] 26%|██▌ | 2278/8750 [3:44:14<10:14:25, 5.70s/it] {'loss': 0.4777, 'learning_rate': 1.7344889527089074e-05, 'epoch': 0.26} + 26%|██▌ | 2278/8750 [3:44:21<10:14:25, 5.70s/it] {'loss': 0.4777, 'learning_rate': 1.7344889527089074e-05, 'epoch': 0.26} + 26%|██▌ | 2278/8750 [3:44:14<10:14:25, 5.70s/it] 26%|██▌ | 2279/8750 
[3:44:26<10:13:20, 5.69s/it] 26%|██▌ | 2279/8750 [3:44:20<10:13:20, 5.69s/it] {'loss': 0.4873, 'learning_rate': 1.7342377005995014e-05, 'epoch': 0.26} + 26%|██▌ | 2279/8750 [3:44:26<10:13:20, 5.69s/it] {'loss': 0.4873, 'learning_rate': 1.7342377005995014e-05, 'epoch': 0.26} + 26%|██▌ | 2279/8750 [3:44:20<10:13:20, 5.69s/it] 26%|██▌ | 2280/8750 [3:44:26<10:12:12, 5.68s/it] 26%|██▌ | 2280/8750 [3:44:32<10:12:13, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.733986347883138e-05, 'epoch': 0.26} + 26%|██▌ | 2280/8750 [3:44:32<10:12:13, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.733986347883138e-05, 'epoch': 0.26} + 26%|██▌ | 2280/8750 [3:44:26<10:12:12, 5.68s/it] 26%|██▌ | 2281/8750 [3:44:38<10:11:18, 5.67s/it] 26%|██▌ | 2281/8750 [3:44:31<10:11:19, 5.67s/it] {'loss': 0.4675, 'learning_rate': 1.7337348945942572e-05, 'epoch': 0.26} + 26%|██▌ | 2281/8750 [3:44:38<10:11:18, 5.67s/it] {'loss': 0.4675, 'learning_rate': 1.7337348945942572e-05, 'epoch': 0.26} + 26%|██▌ | 2281/8750 [3:44:31<10:11:19, 5.67s/it] 26%|██▌ | 2282/8750 [3:44:43<10:13:51, 5.69s/it] 26%|██▌ | 2282/8750 [3:44:37<10:13:51, 5.69s/it] {'loss': 0.4943, 'learning_rate': 1.7334833407673145e-05, 'epoch': 0.26} + 26%|██▌ | 2282/8750 [3:44:43<10:13:51, 5.69s/it] {'loss': 0.4943, 'learning_rate': 1.7334833407673145e-05, 'epoch': 0.26} + 26%|██▌ | 2282/8750 [3:44:37<10:13:51, 5.69s/it] 26%|██▌ | 2283/8750 [3:44:43<10:22:48, 5.78s/it] 26%|██▌ | 2283/8750 [3:44:49<10:22:49, 5.78s/it] {'loss': 0.4708, 'learning_rate': 1.7332316864367785e-05, 'epoch': 0.26} + 26%|██▌ | 2283/8750 [3:44:49<10:22:49, 5.78s/it] {'loss': 0.4708, 'learning_rate': 1.7332316864367785e-05, 'epoch': 0.26} + 26%|██▌ | 2283/8750 [3:44:43<10:22:48, 5.78s/it] 26%|██▌ | 2284/8750 [3:44:55<10:20:38, 5.76s/it] 26%|██▌ | 2284/8750 [3:44:49<10:20:40, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.732979931637131e-05, 'epoch': 0.26} + 26%|██▌ | 2284/8750 [3:44:55<10:20:38, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.732979931637131e-05, 'epoch': 
0.26} + 26%|██▌ | 2284/8750 [3:44:49<10:20:40, 5.76s/it] 26%|██▌ | 2285/8750 [3:45:01<10:21:49, 5.77s/it] 26%|██▌ | 2285/8750 [3:44:54<10:21:49, 5.77s/it] {'loss': 0.488, 'learning_rate': 1.7327280764028683e-05, 'epoch': 0.26} + 26%|██▌ | 2285/8750 [3:45:01<10:21:49, 5.77s/it] {'loss': 0.488, 'learning_rate': 1.7327280764028683e-05, 'epoch': 0.26} + 26%|██▌ | 2285/8750 [3:44:54<10:21:49, 5.77s/it] 26%|██▌ | 2286/8750 [3:45:07<10:18:11, 5.74s/it] 26%|██▌ | 2286/8750 [3:45:00<10:18:11, 5.74s/it] {'loss': 0.5035, 'learning_rate': 1.7324761207685005e-05, 'epoch': 0.26} + 26%|██▌ | 2286/8750 [3:45:07<10:18:11, 5.74s/it] {'loss': 0.5035, 'learning_rate': 1.7324761207685005e-05, 'epoch': 0.26} + 26%|██▌ | 2286/8750 [3:45:00<10:18:11, 5.74s/it] 26%|██▌ | 2287/8750 [3:45:12<10:15:05, 5.71s/it] 26%|██▌ | 2287/8750 [3:45:06<10:15:05, 5.71s/it] {'loss': 0.4726, 'learning_rate': 1.7322240647685503e-05, 'epoch': 0.26} + 26%|██▌ | 2287/8750 [3:45:12<10:15:05, 5.71s/it] {'loss': 0.4726, 'learning_rate': 1.7322240647685503e-05, 'epoch': 0.26} + 26%|██▌ | 2287/8750 [3:45:06<10:15:05, 5.71s/it] 26%|██▌ | 2288/8750 [3:45:18<10:12:59, 5.69s/it] 26%|██▌ | 2288/8750 [3:45:11<10:13:00, 5.69s/it] {'loss': 0.4609, 'learning_rate': 1.7319719084375556e-05, 'epoch': 0.26} + 26%|██▌ | 2288/8750 [3:45:18<10:12:59, 5.69s/it] {'loss': 0.4609, 'learning_rate': 1.7319719084375556e-05, 'epoch': 0.26} + 26%|██▌ | 2288/8750 [3:45:11<10:13:00, 5.69s/it] 26%|██▌ | 2289/8750 [3:45:23<10:09:24, 5.66s/it] 26%|██▌ | 2289/8750 [3:45:17<10:09:23, 5.66s/it] {'loss': 0.488, 'learning_rate': 1.7317196518100672e-05, 'epoch': 0.26} + 26%|██▌ | 2289/8750 [3:45:23<10:09:24, 5.66s/it] {'loss': 0.488, 'learning_rate': 1.7317196518100672e-05, 'epoch': 0.26} + 26%|██▌ | 2289/8750 [3:45:17<10:09:23, 5.66s/it] 26%|██▌ | 2290/8750 [3:45:29<10:12:23, 5.69s/it] 26%|██▌ | 2290/8750 [3:45:23<10:12:22, 5.69s/it] {'loss': 0.474, 'learning_rate': 1.7314672949206502e-05, 'epoch': 0.26} + 26%|██▌ | 2290/8750 [3:45:29<10:12:23, 
5.69s/it] {'loss': 0.474, 'learning_rate': 1.7314672949206502e-05, 'epoch': 0.26} + 26%|██▌ | 2290/8750 [3:45:23<10:12:22, 5.69s/it] 26%|██▌ | 2291/8750 [3:45:35<10:11:47, 5.68s/it] 26%|██▌ | 2291/8750 [3:45:28<10:11:48, 5.68s/it] {'loss': 0.4646, 'learning_rate': 1.731214837803883e-05, 'epoch': 0.26} + 26%|██▌ | 2291/8750 [3:45:35<10:11:47, 5.68s/it] {'loss': 0.4646, 'learning_rate': 1.731214837803883e-05, 'epoch': 0.26} + 26%|██▌ | 2291/8750 [3:45:28<10:11:48, 5.68s/it] 26%|██▌ | 2292/8750 [3:45:41<10:20:52, 5.77s/it] 26%|██▌ | 2292/8750 [3:45:34<10:20:51, 5.77s/it] {'loss': 0.4767, 'learning_rate': 1.7309622804943573e-05, 'epoch': 0.26} + 26%|██▌ | 2292/8750 [3:45:41<10:20:52, 5.77s/it] {'loss': 0.4767, 'learning_rate': 1.7309622804943573e-05, 'epoch': 0.26} + 26%|██▌ | 2292/8750 [3:45:34<10:20:51, 5.77s/it] 26%|██▌ | 2293/8750 [3:45:47<10:19:11, 5.75s/it] 26%|██▌ | 2293/8750 [3:45:40<10:19:11, 5.75s/it] {'loss': 0.5126, 'learning_rate': 1.73070962302668e-05, 'epoch': 0.26} + 26%|██▌ | 2293/8750 [3:45:47<10:19:11, 5.75s/it] {'loss': 0.5126, 'learning_rate': 1.73070962302668e-05, 'epoch': 0.26} + 26%|██▌ | 2293/8750 [3:45:40<10:19:11, 5.75s/it] 26%|██▌ | 2294/8750 [3:45:46<10:30:32, 5.86s/it] 26%|██▌ | 2294/8750 [3:45:53<10:30:33, 5.86s/it] {'loss': 0.4779, 'learning_rate': 1.7304568654354703e-05, 'epoch': 0.26} + 26%|██▌ | 2294/8750 [3:45:53<10:30:33, 5.86s/it] {'loss': 0.4779, 'learning_rate': 1.7304568654354703e-05, 'epoch': 0.26} + 26%|██▌ | 2294/8750 [3:45:46<10:30:32, 5.86s/it] 26%|██▌ | 2295/8750 [3:45:52<10:23:49, 5.80s/it] 26%|██▌ | 2295/8750 [3:45:58<10:23:49, 5.80s/it] {'loss': 0.4769, 'learning_rate': 1.7302040077553616e-05, 'epoch': 0.26} + 26%|██▌ | 2295/8750 [3:45:58<10:23:49, 5.80s/it] {'loss': 0.4769, 'learning_rate': 1.7302040077553616e-05, 'epoch': 0.26} + 26%|██▌ | 2295/8750 [3:45:52<10:23:49, 5.80s/it] 26%|██▌ | 2296/8750 [3:46:04<10:21:38, 5.78s/it] 26%|██▌ | 2296/8750 [3:45:58<10:21:39, 5.78s/it] {'loss': 0.4764, 'learning_rate': 
1.7299510500210015e-05, 'epoch': 0.26} + 26%|██▌ | 2296/8750 [3:46:04<10:21:38, 5.78s/it] {'loss': 0.4764, 'learning_rate': 1.7299510500210015e-05, 'epoch': 0.26} + 26%|██▌ | 2296/8750 [3:45:58<10:21:39, 5.78s/it] 26%|██▋ | 2297/8750 [3:46:10<10:21:00, 5.77s/it] 26%|██▋ | 2297/8750 [3:46:03<10:21:01, 5.77s/it] {'loss': 0.4786, 'learning_rate': 1.7296979922670502e-05, 'epoch': 0.26} + 26%|██▋ | 2297/8750 [3:46:10<10:21:00, 5.77s/it] {'loss': 0.4786, 'learning_rate': 1.7296979922670502e-05, 'epoch': 0.26} + 26%|██▋ | 2297/8750 [3:46:03<10:21:01, 5.77s/it] 26%|██▋ | 2298/8750 [3:46:16<10:17:25, 5.74s/it] 26%|██▋ | 2298/8750 [3:46:09<10:17:25, 5.74s/it] {'loss': 0.4735, 'learning_rate': 1.729444834528183e-05, 'epoch': 0.26} + 26%|██▋ | 2298/8750 [3:46:16<10:17:25, 5.74s/it] {'loss': 0.4735, 'learning_rate': 1.729444834528183e-05, 'epoch': 0.26} + 26%|██▋ | 2298/8750 [3:46:09<10:17:25, 5.74s/it] 26%|██▋ | 2299/8750 [3:46:21<10:13:15, 5.70s/it] 26%|██▋ | 2299/8750 [3:46:15<10:13:16, 5.70s/it] {'loss': 0.4773, 'learning_rate': 1.7291915768390875e-05, 'epoch': 0.26} + 26%|██▋ | 2299/8750 [3:46:21<10:13:15, 5.70s/it] {'loss': 0.4773, 'learning_rate': 1.7291915768390875e-05, 'epoch': 0.26} + 26%|██▋ | 2299/8750 [3:46:15<10:13:16, 5.70s/it]12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +015 AutoResumeHook: Checking whether to suspend... + 26%|██▋ | 2300/8750 [3:46:27<10:10:00, 5.67s/it]13 AutoResumeHook: Checking whether to suspend... 
+ AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 26%|██▋ | 2300/8750 [3:46:20<10:10:00, 5.67s/it]3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4762, 'learning_rate': 1.728938219234466e-05, 'epoch': 0.26} + 26%|██▋ | 2300/8750 [3:46:27<10:10:00, 5.67s/it] {'loss': 0.4762, 'learning_rate': 1.728938219234466e-05, 'epoch': 0.26} + 26%|██▋ | 2300/8750 [3:46:20<10:10:00, 5.67s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 26%|██▋ | 2301/8750 [3:46:42<18:54:48, 10.56s/it] 26%|██▋ | 2301/8750 [3:46:49<18:54:51, 10.56s/it] {'loss': 0.4629, 'learning_rate': 1.728684761749034e-05, 'epoch': 0.26} + 26%|██▋ | 2301/8750 [3:46:49<18:54:51, 10.56s/it] {'loss': 0.4629, 'learning_rate': 1.728684761749034e-05, 'epoch': 0.26} + 26%|██▋ | 2301/8750 [3:46:42<18:54:48, 10.56s/it] 26%|██▋ | 2302/8750 [3:46:48<16:12:04, 9.05s/it] 26%|██▋ | 2302/8750 [3:46:54<16:12:04, 9.05s/it] {'loss': 0.4624, 'learning_rate': 1.728431204417521e-05, 'epoch': 0.26} + 26%|██▋ | 2302/8750 [3:46:54<16:12:04, 9.05s/it] {'loss': 0.4624, 'learning_rate': 1.728431204417521e-05, 'epoch': 0.26} + 26%|██▋ | 2302/8750 [3:46:48<16:12:04, 9.05s/it] 26%|██▋ | 2303/8750 [3:46:53<14:26:58, 8.07s/it] 26%|██▋ | 2303/8750 [3:47:00<14:26:58, 8.07s/it] {'loss': 0.4793, 'learning_rate': 1.7281775472746695e-05, 'epoch': 0.26} + 26%|██▋ | 2303/8750 [3:47:00<14:26:58, 8.07s/it] {'loss': 0.4793, 'learning_rate': 1.7281775472746695e-05, 'epoch': 0.26} + 26%|██▋ | 2303/8750 [3:46:53<14:26:58, 8.07s/it] 26%|██▋ | 2304/8750 [3:46:59<13:07:31, 7.33s/it] 26%|██▋ | 2304/8750 [3:47:06<13:07:31, 7.33s/it] {'loss': 0.4707, 'learning_rate': 1.727923790355237e-05, 'epoch': 0.26} + 26%|██▋ | 2304/8750 [3:47:06<13:07:31, 7.33s/it] {'loss': 0.4707, 'learning_rate': 1.727923790355237e-05, 'epoch': 0.26} + 26%|██▋ | 2304/8750 [3:46:59<13:07:31, 7.33s/it] 26%|██▋ | 2305/8750 [3:47:05<12:14:38, 6.84s/it] 26%|██▋ | 2305/8750 [3:47:11<12:14:38, 6.84s/it] {'loss': 0.4638, 'learning_rate': 1.7276699336939936e-05, 'epoch': 0.26} + 26%|██▋ | 2305/8750 [3:47:11<12:14:38, 6.84s/it] {'loss': 0.4638, 'learning_rate': 1.7276699336939936e-05, 'epoch': 0.26} + 26%|██▋ | 2305/8750 [3:47:05<12:14:38, 6.84s/it] 26%|██▋ | 2306/8750 [3:47:10<11:33:25, 6.46s/it] 26%|██▋ | 2306/8750 [3:47:17<11:33:26, 6.46s/it] {'loss': 0.492, 'learning_rate': 1.7274159773257227e-05, 'epoch': 0.26} + 26%|██▋ | 2306/8750 [3:47:17<11:33:26, 6.46s/it] {'loss': 0.492, 
'learning_rate': 1.7274159773257227e-05, 'epoch': 0.26} + 26%|██▋ | 2306/8750 [3:47:10<11:33:25, 6.46s/it] 26%|██▋ | 2307/8750 [3:47:16<11:06:07, 6.20s/it] 26%|██▋ | 2307/8750 [3:47:22<11:06:07, 6.20s/it] {'loss': 0.4632, 'learning_rate': 1.7271619212852232e-05, 'epoch': 0.26} + 26%|██▋ | 2307/8750 [3:47:22<11:06:07, 6.20s/it] {'loss': 0.4632, 'learning_rate': 1.7271619212852232e-05, 'epoch': 0.26} + 26%|██▋ | 2307/8750 [3:47:16<11:06:07, 6.20s/it] 26%|██▋ | 2308/8750 [3:47:22<10:54:15, 6.09s/it] 26%|██▋ | 2308/8750 [3:47:28<10:54:15, 6.09s/it] {'loss': 0.479, 'learning_rate': 1.726907765607305e-05, 'epoch': 0.26} + 26%|██▋ | 2308/8750 [3:47:28<10:54:15, 6.09s/it] {'loss': 0.479, 'learning_rate': 1.726907765607305e-05, 'epoch': 0.26} + 26%|██▋ | 2308/8750 [3:47:22<10:54:15, 6.09s/it] 26%|██▋ | 2309/8750 [3:47:27<10:39:26, 5.96s/it] 26%|██▋ | 2309/8750 [3:47:34<10:39:26, 5.96s/it] {'loss': 0.4955, 'learning_rate': 1.7266535103267943e-05, 'epoch': 0.26} + 26%|██▋ | 2309/8750 [3:47:34<10:39:26, 5.96s/it] {'loss': 0.4955, 'learning_rate': 1.7266535103267943e-05, 'epoch': 0.26} + 26%|██▋ | 2309/8750 [3:47:27<10:39:26, 5.96s/it] 26%|██▋ | 2310/8750 [3:47:34<10:45:41, 6.02s/it] 26%|██▋ | 2310/8750 [3:47:40<10:45:41, 6.02s/it] {'loss': 0.471, 'learning_rate': 1.726399155478529e-05, 'epoch': 0.26} + 26%|██▋ | 2310/8750 [3:47:40<10:45:41, 6.02s/it] {'loss': 0.471, 'learning_rate': 1.726399155478529e-05, 'epoch': 0.26} + 26%|██▋ | 2310/8750 [3:47:34<10:45:41, 6.02s/it] 26%|██▋ | 2311/8750 [3:47:39<10:33:48, 5.91s/it] 26%|██▋ | 2311/8750 [3:47:46<10:33:50, 5.91s/it] {'loss': 0.4869, 'learning_rate': 1.7261447010973623e-05, 'epoch': 0.26} + 26%|██▋ | 2311/8750 [3:47:46<10:33:50, 5.91s/it] {'loss': 0.4869, 'learning_rate': 1.7261447010973623e-05, 'epoch': 0.26} + 26%|██▋ | 2311/8750 [3:47:39<10:33:48, 5.91s/it] 26%|██▋ | 2312/8750 [3:47:45<10:22:27, 5.80s/it] 26%|██▋ | 2312/8750 [3:47:51<10:22:27, 5.80s/it] {'loss': 0.4765, 'learning_rate': 1.7258901472181587e-05, 'epoch': 0.26} 
+ 26%|██▋ | 2312/8750 [3:47:51<10:22:27, 5.80s/it] {'loss': 0.4765, 'learning_rate': 1.7258901472181587e-05, 'epoch': 0.26} + 26%|██▋ | 2312/8750 [3:47:45<10:22:27, 5.80s/it] 26%|██▋ | 2313/8750 [3:47:51<10:29:50, 5.87s/it] 26%|██▋ | 2313/8750 [3:47:57<10:29:49, 5.87s/it] {'loss': 0.4795, 'learning_rate': 1.725635493875799e-05, 'epoch': 0.26} + 26%|██▋ | 2313/8750 [3:47:57<10:29:49, 5.87s/it] {'loss': 0.4795, 'learning_rate': 1.725635493875799e-05, 'epoch': 0.26} + 26%|██▋ | 2313/8750 [3:47:51<10:29:50, 5.87s/it] 26%|██▋ | 2314/8750 [3:47:57<10:36:29, 5.93s/it] 26%|██▋ | 2314/8750 [3:48:03<10:36:29, 5.93s/it] {'loss': 0.4612, 'learning_rate': 1.725380741105176e-05, 'epoch': 0.26} + 26%|██▋ | 2314/8750 [3:48:03<10:36:29, 5.93s/it] {'loss': 0.4612, 'learning_rate': 1.725380741105176e-05, 'epoch': 0.26} + 26%|██▋ | 2314/8750 [3:47:57<10:36:29, 5.93s/it] 26%|██▋ | 2315/8750 [3:48:03<10:27:21, 5.85s/it] 26%|██▋ | 2315/8750 [3:48:09<10:27:21, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.7251258889411964e-05, 'epoch': 0.26} + 26%|██▋ | 2315/8750 [3:48:09<10:27:21, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.7251258889411964e-05, 'epoch': 0.26} + 26%|██▋ | 2315/8750 [3:48:03<10:27:21, 5.85s/it] 26%|██▋ | 2316/8750 [3:48:08<10:25:28, 5.83s/it] 26%|██▋ | 2316/8750 [3:48:15<10:25:28, 5.83s/it] {'loss': 0.483, 'learning_rate': 1.724870937418781e-05, 'epoch': 0.26} + 26%|██▋ | 2316/8750 [3:48:15<10:25:28, 5.83s/it] {'loss': 0.483, 'learning_rate': 1.724870937418781e-05, 'epoch': 0.26} + 26%|██▋ | 2316/8750 [3:48:08<10:25:28, 5.83s/it] 26%|██▋ | 2317/8750 [3:48:14<10:20:56, 5.79s/it] 26%|██▋ | 2317/8750 [3:48:21<10:20:56, 5.79s/it] {'loss': 0.4708, 'learning_rate': 1.7246158865728634e-05, 'epoch': 0.26} + 26%|██▋ | 2317/8750 [3:48:21<10:20:56, 5.79s/it] {'loss': 0.4708, 'learning_rate': 1.7246158865728634e-05, 'epoch': 0.26} + 26%|██▋ | 2317/8750 [3:48:14<10:20:56, 5.79s/it] 26%|██▋ | 2318/8750 [3:48:20<10:16:58, 5.76s/it] 26%|██▋ | 2318/8750 [3:48:26<10:16:58, 5.76s/it] {'loss': 
0.4752, 'learning_rate': 1.7243607364383916e-05, 'epoch': 0.26} + 26%|██▋ | 2318/8750 [3:48:26<10:16:58, 5.76s/it] {'loss': 0.4752, 'learning_rate': 1.7243607364383916e-05, 'epoch': 0.26} + 26%|██▋ | 2318/8750 [3:48:20<10:16:58, 5.76s/it] 27%|██▋ | 2319/8750 [3:48:26<10:18:16, 5.77s/it] 27%|██▋ | 2319/8750 [3:48:32<10:18:16, 5.77s/it] {'loss': 0.4801, 'learning_rate': 1.7241054870503262e-05, 'epoch': 0.27} + 27%|██▋ | 2319/8750 [3:48:32<10:18:16, 5.77s/it] {'loss': 0.4801, 'learning_rate': 1.7241054870503262e-05, 'epoch': 0.27} + 27%|██▋ | 2319/8750 [3:48:26<10:18:16, 5.77s/it] 27%|██▋ | 2320/8750 [3:48:31<10:16:11, 5.75s/it] 27%|██▋ | 2320/8750 [3:48:38<10:16:11, 5.75s/it] {'loss': 0.4739, 'learning_rate': 1.723850138443643e-05, 'epoch': 0.27} + 27%|██▋ | 2320/8750 [3:48:38<10:16:11, 5.75s/it] {'loss': 0.4739, 'learning_rate': 1.723850138443643e-05, 'epoch': 0.27} + 27%|██▋ | 2320/8750 [3:48:31<10:16:11, 5.75s/it] 27%|██▋ | 2321/8750 [3:48:37<10:13:53, 5.73s/it] 27%|██▋ | 2321/8750 [3:48:43<10:13:53, 5.73s/it] {'loss': 0.4771, 'learning_rate': 1.72359469065333e-05, 'epoch': 0.27} + 27%|██▋ | 2321/8750 [3:48:43<10:13:53, 5.73s/it] {'loss': 0.4771, 'learning_rate': 1.72359469065333e-05, 'epoch': 0.27} + 27%|██▋ | 2321/8750 [3:48:37<10:13:53, 5.73s/it] 27%|██▋ | 2322/8750 [3:48:43<10:13:17, 5.72s/it] 27%|██▋ | 2322/8750 [3:48:49<10:13:17, 5.72s/it] {'loss': 0.4811, 'learning_rate': 1.723339143714389e-05, 'epoch': 0.27} + 27%|██▋ | 2322/8750 [3:48:49<10:13:17, 5.72s/it] {'loss': 0.4811, 'learning_rate': 1.723339143714389e-05, 'epoch': 0.27} + 27%|██▋ | 2322/8750 [3:48:43<10:13:17, 5.72s/it] 27%|██▋ | 2323/8750 [3:48:48<10:10:45, 5.70s/it] 27%|██▋ | 2323/8750 [3:48:55<10:10:45, 5.70s/it] {'loss': 0.5013, 'learning_rate': 1.7230834976618364e-05, 'epoch': 0.27} + 27%|██▋ | 2323/8750 [3:48:55<10:10:45, 5.70s/it] {'loss': 0.5013, 'learning_rate': 1.7230834976618364e-05, 'epoch': 0.27} + 27%|██▋ | 2323/8750 [3:48:48<10:10:45, 5.70s/it] 27%|██▋ | 2324/8750 [3:48:54<10:16:21, 
5.76s/it] 27%|██▋ | 2324/8750 [3:49:01<10:16:22, 5.76s/it] {'loss': 0.4576, 'learning_rate': 1.7228277525307007e-05, 'epoch': 0.27} + 27%|██▋ | 2324/8750 [3:49:01<10:16:22, 5.76s/it] {'loss': 0.4576, 'learning_rate': 1.7228277525307007e-05, 'epoch': 0.27} + 27%|██▋ | 2324/8750 [3:48:54<10:16:21, 5.76s/it] 27%|██▋ | 2325/8750 [3:49:00<10:17:59, 5.77s/it] 27%|██▋ | 2325/8750 [3:49:06<10:17:58, 5.77s/it] {'loss': 0.4736, 'learning_rate': 1.7225719083560246e-05, 'epoch': 0.27} + 27%|██▋ | 2325/8750 [3:49:06<10:17:58, 5.77s/it] {'loss': 0.4736, 'learning_rate': 1.7225719083560246e-05, 'epoch': 0.27} + 27%|██▋ | 2325/8750 [3:49:00<10:17:59, 5.77s/it] 27%|██▋ | 2326/8750 [3:49:06<10:13:19, 5.73s/it] 27%|██▋ | 2326/8750 [3:49:12<10:13:19, 5.73s/it] {'loss': 0.4784, 'learning_rate': 1.7223159651728653e-05, 'epoch': 0.27} + 27%|██▋ | 2326/8750 [3:49:12<10:13:19, 5.73s/it] {'loss': 0.4784, 'learning_rate': 1.7223159651728653e-05, 'epoch': 0.27} + 27%|██▋ | 2326/8750 [3:49:06<10:13:19, 5.73s/it] 27%|██▋ | 2327/8750 [3:49:11<10:15:29, 5.75s/it] 27%|██▋ | 2327/8750 [3:49:18<10:15:29, 5.75s/it] {'loss': 0.4742, 'learning_rate': 1.7220599230162917e-05, 'epoch': 0.27} + 27%|██▋ | 2327/8750 [3:49:18<10:15:29, 5.75s/it] {'loss': 0.4742, 'learning_rate': 1.7220599230162917e-05, 'epoch': 0.27} + 27%|██▋ | 2327/8750 [3:49:11<10:15:29, 5.75s/it] 27%|██▋ | 2328/8750 [3:49:17<10:15:07, 5.75s/it] 27%|██▋ | 2328/8750 [3:49:24<10:15:06, 5.75s/it] {'loss': 0.4799, 'learning_rate': 1.7218037819213883e-05, 'epoch': 0.27} + 27%|██▋ | 2328/8750 [3:49:24<10:15:06, 5.75s/it] {'loss': 0.4799, 'learning_rate': 1.7218037819213883e-05, 'epoch': 0.27} + 27%|██▋ | 2328/8750 [3:49:17<10:15:07, 5.75s/it] 27%|██▋ | 2329/8750 [3:49:23<10:22:58, 5.82s/it] 27%|██▋ | 2329/8750 [3:49:30<10:22:59, 5.82s/it] {'loss': 0.4987, 'learning_rate': 1.7215475419232516e-05, 'epoch': 0.27} + 27%|██▋ | 2329/8750 [3:49:30<10:22:59, 5.82s/it] {'loss': 0.4987, 'learning_rate': 1.7215475419232516e-05, 'epoch': 0.27} + 27%|██▋ | 
2329/8750 [3:49:23<10:22:58, 5.82s/it] 27%|██▋ | 2330/8750 [3:49:35<10:20:06, 5.80s/it] 27%|██▋ | 2330/8750 [3:49:29<10:20:06, 5.80s/it] {'loss': 0.4768, 'learning_rate': 1.7212912030569923e-05, 'epoch': 0.27} + 27%|██▋ | 2330/8750 [3:49:35<10:20:06, 5.80s/it]{'loss': 0.4768, 'learning_rate': 1.7212912030569923e-05, 'epoch': 0.27} + 27%|██▋ | 2330/8750 [3:49:29<10:20:06, 5.80s/it] 27%|██▋ | 2331/8750 [3:49:35<10:25:27, 5.85s/it] 27%|██▋ | 2331/8750 [3:49:41<10:25:27, 5.85s/it] {'loss': 0.4819, 'learning_rate': 1.7210347653577343e-05, 'epoch': 0.27} + 27%|██▋ | 2331/8750 [3:49:41<10:25:27, 5.85s/it] {'loss': 0.4819, 'learning_rate': 1.7210347653577343e-05, 'epoch': 0.27} + 27%|██▋ | 2331/8750 [3:49:35<10:25:27, 5.85s/it] 27%|██▋ | 2332/8750 [3:49:41<10:29:35, 5.89s/it] 27%|██▋ | 2332/8750 [3:49:47<10:29:35, 5.89s/it] {'loss': 0.4671, 'learning_rate': 1.7207782288606154e-05, 'epoch': 0.27} + 27%|██▋ | 2332/8750 [3:49:47<10:29:35, 5.89s/it] {'loss': 0.4671, 'learning_rate': 1.7207782288606154e-05, 'epoch': 0.27} + 27%|██▋ | 2332/8750 [3:49:41<10:29:35, 5.89s/it] 27%|██▋ | 2333/8750 [3:49:47<10:28:52, 5.88s/it] 27%|██▋ | 2333/8750 [3:49:53<10:28:52, 5.88s/it] {'loss': 0.4661, 'learning_rate': 1.720521593600787e-05, 'epoch': 0.27} + 27%|██▋ | 2333/8750 [3:49:53<10:28:52, 5.88s/it] {'loss': 0.4661, 'learning_rate': 1.720521593600787e-05, 'epoch': 0.27} + 27%|██▋ | 2333/8750 [3:49:47<10:28:52, 5.88s/it] 27%|██▋ | 2334/8750 [3:49:52<10:23:42, 5.83s/it] 27%|██▋ | 2334/8750 [3:49:59<10:23:42, 5.83s/it] {'loss': 0.4803, 'learning_rate': 1.7202648596134143e-05, 'epoch': 0.27} + 27%|██▋ | 2334/8750 [3:49:59<10:23:42, 5.83s/it] {'loss': 0.4803, 'learning_rate': 1.7202648596134143e-05, 'epoch': 0.27} + 27%|██▋ | 2334/8750 [3:49:52<10:23:42, 5.83s/it] 27%|██▋ | 2335/8750 [3:49:58<10:25:56, 5.85s/it] 27%|██▋ | 2335/8750 [3:50:05<10:25:56, 5.85s/it] {'loss': 0.4671, 'learning_rate': 1.7200080269336745e-05, 'epoch': 0.27} + 27%|██▋ | 2335/8750 [3:50:05<10:25:56, 5.85s/it] {'loss': 
0.4671, 'learning_rate': 1.7200080269336745e-05, 'epoch': 0.27} + 27%|██▋ | 2335/8750 [3:49:58<10:25:56, 5.85s/it] 27%|██▋ | 2336/8750 [3:50:04<10:21:34, 5.81s/it] 27%|██▋ | 2336/8750 [3:50:11<10:21:34, 5.81s/it] {'loss': 0.4705, 'learning_rate': 1.71975109559676e-05, 'epoch': 0.27} + 27%|██▋ | 2336/8750 [3:50:11<10:21:34, 5.81s/it] {'loss': 0.4705, 'learning_rate': 1.71975109559676e-05, 'epoch': 0.27} + 27%|██▋ | 2336/8750 [3:50:04<10:21:34, 5.81s/it] 27%|██▋ | 2337/8750 [3:50:10<10:17:08, 5.77s/it] 27%|██▋ | 2337/8750 [3:50:16<10:17:09, 5.77s/it] {'loss': 0.4898, 'learning_rate': 1.7194940656378763e-05, 'epoch': 0.27} + 27%|██▋ | 2337/8750 [3:50:16<10:17:09, 5.77s/it] {'loss': 0.4898, 'learning_rate': 1.7194940656378763e-05, 'epoch': 0.27} + 27%|██▋ | 2337/8750 [3:50:10<10:17:08, 5.77s/it] 27%|██▋ | 2338/8750 [3:50:15<10:09:50, 5.71s/it] 27%|██▋ | 2338/8750 [3:50:22<10:09:50, 5.71s/it] {'loss': 0.4762, 'learning_rate': 1.7192369370922423e-05, 'epoch': 0.27} + 27%|██▋ | 2338/8750 [3:50:22<10:09:50, 5.71s/it] {'loss': 0.4762, 'learning_rate': 1.7192369370922423e-05, 'epoch': 0.27} + 27%|██▋ | 2338/8750 [3:50:15<10:09:50, 5.71s/it] 27%|██▋ | 2339/8750 [3:50:21<10:10:09, 5.71s/it] 27%|██▋ | 2339/8750 [3:50:27<10:10:09, 5.71s/it] {'loss': 0.4991, 'learning_rate': 1.7189797099950895e-05, 'epoch': 0.27} + 27%|██▋ | 2339/8750 [3:50:27<10:10:09, 5.71s/it] {'loss': 0.4991, 'learning_rate': 1.7189797099950895e-05, 'epoch': 0.27} + 27%|██▋ | 2339/8750 [3:50:21<10:10:09, 5.71s/it] 27%|██▋ | 2340/8750 [3:50:27<10:06:27, 5.68s/it] 27%|██▋ | 2340/8750 [3:50:33<10:06:28, 5.68s/it] {'loss': 0.4784, 'learning_rate': 1.7187223843816648e-05, 'epoch': 0.27} + 27%|██▋ | 2340/8750 [3:50:33<10:06:28, 5.68s/it] {'loss': 0.4784, 'learning_rate': 1.7187223843816648e-05, 'epoch': 0.27} + 27%|██▋ | 2340/8750 [3:50:27<10:06:27, 5.68s/it] 27%|██▋ | 2341/8750 [3:50:32<10:05:42, 5.67s/it] 27%|██▋ | 2341/8750 [3:50:39<10:05:42, 5.67s/it] {'loss': 0.4749, 'learning_rate': 1.7184649602872274e-05, 
'epoch': 0.27} + 27%|██▋ | 2341/8750 [3:50:39<10:05:42, 5.67s/it] {'loss': 0.4749, 'learning_rate': 1.7184649602872274e-05, 'epoch': 0.27} + 27%|██▋ | 2341/8750 [3:50:32<10:05:42, 5.67s/it] 27%|██▋ | 2342/8750 [3:50:38<10:06:33, 5.68s/it] 27%|██▋ | 2342/8750 [3:50:44<10:06:32, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.7182074377470494e-05, 'epoch': 0.27} + 27%|██▋ | 2342/8750 [3:50:44<10:06:32, 5.68s/it] {'loss': 0.4777, 'learning_rate': 1.7182074377470494e-05, 'epoch': 0.27} + 27%|██▋ | 2342/8750 [3:50:38<10:06:33, 5.68s/it] 27%|██▋ | 2343/8750 [3:50:44<10:07:14, 5.69s/it] 27%|██▋ | 2343/8750 [3:50:50<10:07:14, 5.69s/it] {'loss': 0.4733, 'learning_rate': 1.717949816796418e-05, 'epoch': 0.27} + 27%|██▋ | 2343/8750 [3:50:50<10:07:14, 5.69s/it] {'loss': 0.4733, 'learning_rate': 1.717949816796418e-05, 'epoch': 0.27} + 27%|██▋ | 2343/8750 [3:50:44<10:07:14, 5.69s/it] 27%|██▋ | 2344/8750 [3:50:49<10:08:08, 5.70s/it] 27%|██▋ | 2344/8750 [3:50:56<10:08:08, 5.70s/it] {'loss': 0.4914, 'learning_rate': 1.7176920974706318e-05, 'epoch': 0.27} + 27%|██▋ | 2344/8750 [3:50:56<10:08:08, 5.70s/it] {'loss': 0.4914, 'learning_rate': 1.7176920974706318e-05, 'epoch': 0.27} + 27%|██▋ | 2344/8750 [3:50:49<10:08:08, 5.70s/it] 27%|██▋ | 2345/8750 [3:50:55<10:16:32, 5.78s/it] 27%|██▋ | 2345/8750 [3:51:02<10:16:32, 5.78s/it] {'loss': 0.4806, 'learning_rate': 1.7174342798050056e-05, 'epoch': 0.27} + 27%|██▋ | 2345/8750 [3:51:02<10:16:32, 5.78s/it] {'loss': 0.4806, 'learning_rate': 1.7174342798050056e-05, 'epoch': 0.27} + 27%|██▋ | 2345/8750 [3:50:55<10:16:32, 5.78s/it] 27%|██▋ | 2346/8750 [3:51:01<10:17:34, 5.79s/it] 27%|██▋ | 2346/8750 [3:51:08<10:17:34, 5.79s/it] {'loss': 0.4793, 'learning_rate': 1.7171763638348653e-05, 'epoch': 0.27} + 27%|██▋ | 2346/8750 [3:51:08<10:17:34, 5.79s/it] {'loss': 0.4793, 'learning_rate': 1.7171763638348653e-05, 'epoch': 0.27} + 27%|██▋ | 2346/8750 [3:51:01<10:17:34, 5.79s/it] 27%|██▋ | 2347/8750 [3:51:07<10:15:30, 5.77s/it] 27%|██▋ | 2347/8750 
[3:51:13<10:15:30, 5.77s/it] {'loss': 0.4827, 'learning_rate': 1.7169183495955516e-05, 'epoch': 0.27} + 27%|██▋ | 2347/8750 [3:51:13<10:15:30, 5.77s/it] {'loss': 0.4827, 'learning_rate': 1.7169183495955516e-05, 'epoch': 0.27} + 27%|██▋ | 2347/8750 [3:51:07<10:15:30, 5.77s/it] 27%|██▋ | 2348/8750 [3:51:13<10:11:52, 5.73s/it] 27%|██▋ | 2348/8750 [3:51:19<10:11:52, 5.73s/it] {'loss': 0.4725, 'learning_rate': 1.7166602371224178e-05, 'epoch': 0.27} + 27%|██▋ | 2348/8750 [3:51:19<10:11:52, 5.73s/it] {'loss': 0.4725, 'learning_rate': 1.7166602371224178e-05, 'epoch': 0.27} + 27%|██▋ | 2348/8750 [3:51:13<10:11:52, 5.73s/it] 27%|██▋ | 2349/8750 [3:51:18<10:14:01, 5.76s/it] 27%|██▋ | 2349/8750 [3:51:25<10:14:01, 5.76s/it] {'loss': 0.4927, 'learning_rate': 1.716402026450831e-05, 'epoch': 0.27} + 27%|██▋ | 2349/8750 [3:51:25<10:14:01, 5.76s/it] {'loss': 0.4927, 'learning_rate': 1.716402026450831e-05, 'epoch': 0.27} + 27%|██▋ | 2349/8750 [3:51:18<10:14:01, 5.76s/it]12 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 27%|██▋ | 2350/8750 [3:51:24<10:08:39, 5.71s/it]7 AutoResumeHook: Checking whether to suspend... +1013 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... 
+ 27%|██▋ | 2350/8750 [3:51:30<10:08:39, 5.71s/it] {'loss': 0.4618, 'learning_rate': 1.7161437176161727e-05, 'epoch': 0.27} + 27%|██▋ | 2350/8750 [3:51:30<10:08:39, 5.71s/it] {'loss': 0.4618, 'learning_rate': 1.7161437176161727e-05, 'epoch': 0.27} + 27%|██▋ | 2350/8750 [3:51:24<10:08:39, 5.71s/it] 27%|██▋ | 2351/8750 [3:51:30<10:13:17, 5.75s/it] 27%|██▋ | 2351/8750 [3:51:36<10:13:17, 5.75s/it] {'loss': 0.4854, 'learning_rate': 1.7158853106538358e-05, 'epoch': 0.27} + 27%|██▋ | 2351/8750 [3:51:36<10:13:17, 5.75s/it] {'loss': 0.4854, 'learning_rate': 1.7158853106538358e-05, 'epoch': 0.27} + 27%|██▋ | 2351/8750 [3:51:30<10:13:17, 5.75s/it] 27%|██▋ | 2352/8750 [3:51:36<10:18:12, 5.80s/it] 27%|██▋ | 2352/8750 [3:51:42<10:18:12, 5.80s/it] {'loss': 0.4853, 'learning_rate': 1.7156268055992286e-05, 'epoch': 0.27} + 27%|██▋ | 2352/8750 [3:51:42<10:18:12, 5.80s/it] {'loss': 0.4853, 'learning_rate': 1.7156268055992286e-05, 'epoch': 0.27} + 27%|██▋ | 2352/8750 [3:51:36<10:18:12, 5.80s/it] 27%|██▋ | 2353/8750 [3:51:42<10:27:15, 5.88s/it] 27%|██▋ | 2353/8750 [3:51:48<10:27:16, 5.88s/it] {'loss': 0.4795, 'learning_rate': 1.7153682024877716e-05, 'epoch': 0.27} + 27%|██▋ | 2353/8750 [3:51:48<10:27:16, 5.88s/it] {'loss': 0.4795, 'learning_rate': 1.7153682024877716e-05, 'epoch': 0.27} + 27%|██▋ | 2353/8750 [3:51:42<10:27:15, 5.88s/it] 27%|██▋ | 2354/8750 [3:51:48<10:26:09, 5.87s/it] 27%|██▋ | 2354/8750 [3:51:54<10:26:09, 5.87s/it] {'loss': 0.4838, 'learning_rate': 1.7151095013548996e-05, 'epoch': 0.27} + 27%|██▋ | 2354/8750 [3:51:54<10:26:09, 5.87s/it] {'loss': 0.4838, 'learning_rate': 1.7151095013548996e-05, 'epoch': 0.27} + 27%|██▋ | 2354/8750 [3:51:48<10:26:09, 5.87s/it] 27%|██▋ | 2355/8750 [3:51:53<10:14:47, 5.77s/it] 27%|██▋ | 2355/8750 [3:52:00<10:14:45, 5.77s/it] {'loss': 0.4743, 'learning_rate': 1.7148507022360602e-05, 'epoch': 0.27} + 27%|██▋ | 2355/8750 [3:52:00<10:14:45, 5.77s/it] {'loss': 0.4743, 'learning_rate': 1.7148507022360602e-05, 'epoch': 0.27} + 27%|██▋ | 2355/8750 
[3:51:53<10:14:47, 5.77s/it] 27%|██▋ | 2356/8750 [3:51:59<10:10:02, 5.72s/it] 27%|██▋ | 2356/8750 [3:52:05<10:10:02, 5.72s/it] {'loss': 0.4669, 'learning_rate': 1.7145918051667147e-05, 'epoch': 0.27} + 27%|██▋ | 2356/8750 [3:52:05<10:10:02, 5.72s/it] {'loss': 0.4669, 'learning_rate': 1.7145918051667147e-05, 'epoch': 0.27} + 27%|██▋ | 2356/8750 [3:51:59<10:10:02, 5.72s/it] 27%|██▋ | 2357/8750 [3:52:05<10:13:36, 5.76s/it] 27%|██▋ | 2357/8750 [3:52:11<10:13:35, 5.76s/it] {'loss': 0.4712, 'learning_rate': 1.714332810182338e-05, 'epoch': 0.27} + 27%|██▋ | 2357/8750 [3:52:11<10:13:35, 5.76s/it] {'loss': 0.4712, 'learning_rate': 1.714332810182338e-05, 'epoch': 0.27} + 27%|██▋ | 2357/8750 [3:52:05<10:13:36, 5.76s/it] 27%|██▋ | 2358/8750 [3:52:10<10:08:34, 5.71s/it] 27%|██▋ | 2358/8750 [3:52:17<10:08:34, 5.71s/it] {'loss': 0.4703, 'learning_rate': 1.7140737173184178e-05, 'epoch': 0.27} + 27%|██▋ | 2358/8750 [3:52:17<10:08:34, 5.71s/it] {'loss': 0.4703, 'learning_rate': 1.7140737173184178e-05, 'epoch': 0.27} + 27%|██▋ | 2358/8750 [3:52:10<10:08:34, 5.71s/it] 27%|██▋ | 2359/8750 [3:52:16<10:10:44, 5.73s/it] 27%|██▋ | 2359/8750 [3:52:22<10:10:43, 5.73s/it] {'loss': 0.48, 'learning_rate': 1.713814526610456e-05, 'epoch': 0.27} + 27%|██▋ | 2359/8750 [3:52:22<10:10:43, 5.73s/it] {'loss': 0.48, 'learning_rate': 1.713814526610456e-05, 'epoch': 0.27} + 27%|██▋ | 2359/8750 [3:52:16<10:10:44, 5.73s/it] 27%|██▋ | 2360/8750 [3:52:22<10:13:36, 5.76s/it] 27%|██▋ | 2360/8750 [3:52:28<10:13:36, 5.76s/it] {'loss': 0.4717, 'learning_rate': 1.713555238093967e-05, 'epoch': 0.27} + 27%|██▋ | 2360/8750 [3:52:22<10:13:36, 5.76s/it]{'loss': 0.4717, 'learning_rate': 1.713555238093967e-05, 'epoch': 0.27} + 27%|██▋ | 2360/8750 [3:52:28<10:13:36, 5.76s/it] 27%|██▋ | 2361/8750 [3:52:28<10:13:06, 5.76s/it] 27%|██▋ | 2361/8750 [3:52:34<10:13:06, 5.76s/it] {'loss': 0.4669, 'learning_rate': 1.7132958518044797e-05, 'epoch': 0.27} + 27%|██▋ | 2361/8750 [3:52:34<10:13:06, 5.76s/it] {'loss': 0.4669, 
'learning_rate': 1.7132958518044797e-05, 'epoch': 0.27} + 27%|██▋ | 2361/8750 [3:52:28<10:13:06, 5.76s/it] 27%|██▋ | 2362/8750 [3:52:33<10:09:13, 5.72s/it] 27%|██▋ | 2362/8750 [3:52:40<10:09:13, 5.72s/it] {'loss': 0.4762, 'learning_rate': 1.713036367777535e-05, 'epoch': 0.27} + 27%|██▋ | 2362/8750 [3:52:40<10:09:13, 5.72s/it] {'loss': 0.4762, 'learning_rate': 1.713036367777535e-05, 'epoch': 0.27} + 27%|██▋ | 2362/8750 [3:52:33<10:09:13, 5.72s/it] 27%|██▋ | 2363/8750 [3:52:39<10:05:01, 5.68s/it] 27%|██▋ | 2363/8750 [3:52:45<10:05:01, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.7127767860486892e-05, 'epoch': 0.27} + 27%|██▋ | 2363/8750 [3:52:45<10:05:01, 5.68s/it] {'loss': 0.4835, 'learning_rate': 1.7127767860486892e-05, 'epoch': 0.27} + 27%|██▋ | 2363/8750 [3:52:39<10:05:01, 5.68s/it] 27%|██▋ | 2364/8750 [3:52:44<10:02:18, 5.66s/it] 27%|██▋ | 2364/8750 [3:52:51<10:02:18, 5.66s/it] {'loss': 0.4753, 'learning_rate': 1.71251710665351e-05, 'epoch': 0.27} + 27%|██▋ | 2364/8750 [3:52:51<10:02:18, 5.66s/it] {'loss': 0.4753, 'learning_rate': 1.71251710665351e-05, 'epoch': 0.27} + 27%|██▋ | 2364/8750 [3:52:44<10:02:18, 5.66s/it] 27%|██▋ | 2365/8750 [3:52:50<10:11:26, 5.75s/it] 27%|██▋ | 2365/8750 [3:52:57<10:11:26, 5.75s/it] {'loss': 0.4621, 'learning_rate': 1.7122573296275788e-05, 'epoch': 0.27} + 27%|██▋ | 2365/8750 [3:52:57<10:11:26, 5.75s/it] {'loss': 0.4621, 'learning_rate': 1.7122573296275788e-05, 'epoch': 0.27} + 27%|██▋ | 2365/8750 [3:52:50<10:11:26, 5.75s/it] 27%|██▋ | 2366/8750 [3:52:56<10:17:54, 5.81s/it] 27%|██▋ | 2366/8750 [3:53:03<10:17:54, 5.81s/it] {'loss': 0.4649, 'learning_rate': 1.711997455006492e-05, 'epoch': 0.27} + 27%|██▋ | 2366/8750 [3:53:03<10:17:54, 5.81s/it] {'loss': 0.4649, 'learning_rate': 1.711997455006492e-05, 'epoch': 0.27} + 27%|██▋ | 2366/8750 [3:52:56<10:17:54, 5.81s/it] 27%|██▋ | 2367/8750 [3:53:02<10:16:03, 5.79s/it] 27%|██▋ | 2367/8750 [3:53:09<10:16:04, 5.79s/it] {'loss': 0.4877, 'learning_rate': 1.711737482825858e-05, 'epoch': 0.27} 
+ 27%|██▋ | 2367/8750 [3:53:09<10:16:04, 5.79s/it] {'loss': 0.4877, 'learning_rate': 1.711737482825858e-05, 'epoch': 0.27} + 27%|██▋ | 2367/8750 [3:53:02<10:16:03, 5.79s/it] 27%|██▋ | 2368/8750 [3:53:08<10:18:01, 5.81s/it] 27%|██▋ | 2368/8750 [3:53:14<10:18:00, 5.81s/it] {'loss': 0.4709, 'learning_rate': 1.7114774131212983e-05, 'epoch': 0.27} + 27%|██▋ | 2368/8750 [3:53:14<10:18:00, 5.81s/it] {'loss': 0.4709, 'learning_rate': 1.7114774131212983e-05, 'epoch': 0.27} + 27%|██▋ | 2368/8750 [3:53:08<10:18:01, 5.81s/it] 27%|██▋ | 2369/8750 [3:53:14<10:23:29, 5.86s/it] 27%|██▋ | 2369/8750 [3:53:20<10:23:30, 5.86s/it] {'loss': 0.4921, 'learning_rate': 1.7112172459284478e-05, 'epoch': 0.27} + 27%|██▋ | 2369/8750 [3:53:20<10:23:30, 5.86s/it] {'loss': 0.4921, 'learning_rate': 1.7112172459284478e-05, 'epoch': 0.27} + 27%|██▋ | 2369/8750 [3:53:14<10:23:29, 5.86s/it] 27%|██▋ | 2370/8750 [3:53:26<10:13:58, 5.77s/it] 27%|██▋ | 2370/8750 [3:53:19<10:14:00, 5.77s/it] {'loss': 0.4739, 'learning_rate': 1.7109569812829565e-05, 'epoch': 0.27} + 27%|██▋ | 2370/8750 [3:53:26<10:13:58, 5.77s/it] {'loss': 0.4739, 'learning_rate': 1.7109569812829565e-05, 'epoch': 0.27} + 27%|██▋ | 2370/8750 [3:53:19<10:14:00, 5.77s/it] 27%|██▋ | 2371/8750 [3:53:25<10:09:38, 5.73s/it] 27%|██▋ | 2371/8750 [3:53:32<10:09:38, 5.73s/it] {'loss': 0.4971, 'learning_rate': 1.710696619220486e-05, 'epoch': 0.27} + 27%|██▋ | 2371/8750 [3:53:32<10:09:38, 5.73s/it] {'loss': 0.4971, 'learning_rate': 1.710696619220486e-05, 'epoch': 0.27} + 27%|██▋ | 2371/8750 [3:53:25<10:09:38, 5.73s/it] 27%|██▋ | 2372/8750 [3:53:31<10:08:17, 5.72s/it] 27%|██▋ | 2372/8750 [3:53:37<10:08:18, 5.72s/it] {'loss': 0.4782, 'learning_rate': 1.7104361597767107e-05, 'epoch': 0.27} + {'loss': 0.4782, 'learning_rate': 1.7104361597767107e-05, 'epoch': 0.27} 27%|██▋ | 2372/8750 [3:53:37<10:08:18, 5.72s/it] + 27%|██▋ | 2372/8750 [3:53:31<10:08:17, 5.72s/it] 27%|██▋ | 2373/8750 [3:53:36<10:02:09, 5.67s/it] 27%|██▋ | 2373/8750 [3:53:43<10:02:09, 5.67s/it] 
{'loss': 0.4725, 'learning_rate': 1.7101756029873208e-05, 'epoch': 0.27} + 27%|██▋ | 2373/8750 [3:53:43<10:02:09, 5.67s/it] {'loss': 0.4725, 'learning_rate': 1.7101756029873208e-05, 'epoch': 0.27} + 27%|██▋ | 2373/8750 [3:53:36<10:02:09, 5.67s/it] 27%|██▋ | 2374/8750 [3:53:42<10:01:00, 5.66s/it] 27%|██▋ | 2374/8750 [3:53:48<10:01:00, 5.66s/it] {'loss': 0.4709, 'learning_rate': 1.7099149488880174e-05, 'epoch': 0.27} + 27%|██▋ | 2374/8750 [3:53:48<10:01:00, 5.66s/it] {'loss': 0.4709, 'learning_rate': 1.7099149488880174e-05, 'epoch': 0.27} + 27%|██▋ | 2374/8750 [3:53:42<10:01:00, 5.66s/it] 27%|██▋ | 2375/8750 [3:53:47<9:57:52, 5.63s/it] 27%|██▋ | 2375/8750 [3:53:54<9:57:53, 5.63s/it] {'loss': 0.4594, 'learning_rate': 1.709654197514517e-05, 'epoch': 0.27} + 27%|██▋ | 2375/8750 [3:53:54<9:57:53, 5.63s/it] {'loss': 0.4594, 'learning_rate': 1.709654197514517e-05, 'epoch': 0.27} + 27%|██▋ | 2375/8750 [3:53:47<9:57:52, 5.63s/it] 27%|██▋ | 2376/8750 [3:53:53<10:08:13, 5.73s/it] 27%|██▋ | 2376/8750 [3:54:00<10:08:13, 5.73s/it] {'loss': 0.4697, 'learning_rate': 1.709393348902547e-05, 'epoch': 0.27} + 27%|██▋ | 2376/8750 [3:54:00<10:08:13, 5.73s/it] {'loss': 0.4697, 'learning_rate': 1.709393348902547e-05, 'epoch': 0.27} + 27%|██▋ | 2376/8750 [3:53:53<10:08:13, 5.73s/it] 27%|██▋ | 2377/8750 [3:53:59<10:09:58, 5.74s/it] 27%|██▋ | 2377/8750 [3:54:06<10:09:59, 5.74s/it] {'loss': 0.4889, 'learning_rate': 1.7091324030878504e-05, 'epoch': 0.27} + 27%|██▋ | 2377/8750 [3:54:06<10:09:59, 5.74s/it] {'loss': 0.4889, 'learning_rate': 1.7091324030878504e-05, 'epoch': 0.27} + 27%|██▋ | 2377/8750 [3:53:59<10:09:58, 5.74s/it] 27%|██▋ | 2378/8750 [3:54:05<10:07:27, 5.72s/it] {'loss': 0.4615, 'learning_rate': 1.7088713601061823e-05, 'epoch': 0.27} + 27%|██▋ | 2378/8750 [3:54:05<10:07:27, 5.72s/it] 27%|██▋ | 2378/8750 [3:54:11<10:07:26, 5.72s/it] {'loss': 0.4615, 'learning_rate': 1.7088713601061823e-05, 'epoch': 0.27} + 27%|██▋ | 2378/8750 [3:54:11<10:07:26, 5.72s/it] 27%|██▋ | 2379/8750 
[3:54:10<10:01:04, 5.66s/it] 27%|██▋ | 2379/8750 [3:54:17<10:01:04, 5.66s/it] {'loss': 0.4891, 'learning_rate': 1.7086102199933116e-05, 'epoch': 0.27} + 27%|██▋ | 2379/8750 [3:54:17<10:01:04, 5.66s/it]{'loss': 0.4891, 'learning_rate': 1.7086102199933116e-05, 'epoch': 0.27} + 27%|██▋ | 2379/8750 [3:54:10<10:01:04, 5.66s/it] 27%|██▋ | 2380/8750 [3:54:16<10:03:21, 5.68s/it] 27%|██▋ | 2380/8750 [3:54:23<10:03:21, 5.68s/it] {'loss': 0.4881, 'learning_rate': 1.7083489827850202e-05, 'epoch': 0.27} + 27%|██▋ | 2380/8750 [3:54:23<10:03:21, 5.68s/it] {'loss': 0.4881, 'learning_rate': 1.7083489827850202e-05, 'epoch': 0.27} + 27%|██▋ | 2380/8750 [3:54:16<10:03:21, 5.68s/it] 27%|██▋ | 2381/8750 [3:54:22<10:00:31, 5.66s/it] 27%|██▋ | 2381/8750 [3:54:28<10:00:30, 5.66s/it] {'loss': 0.4834, 'learning_rate': 1.7080876485171035e-05, 'epoch': 0.27} + 27%|██▋ | 2381/8750 [3:54:28<10:00:30, 5.66s/it] {'loss': 0.4834, 'learning_rate': 1.7080876485171035e-05, 'epoch': 0.27} + 27%|██▋ | 2381/8750 [3:54:22<10:00:31, 5.66s/it] 27%|██▋ | 2382/8750 [3:54:28<10:18:37, 5.83s/it] 27%|██▋ | 2382/8750 [3:54:34<10:18:37, 5.83s/it] {'loss': 0.4731, 'learning_rate': 1.70782621722537e-05, 'epoch': 0.27} + 27%|██▋ | 2382/8750 [3:54:34<10:18:37, 5.83s/it] {'loss': 0.4731, 'learning_rate': 1.70782621722537e-05, 'epoch': 0.27} + 27%|██▋ | 2382/8750 [3:54:28<10:18:37, 5.83s/it] 27%|██▋ | 2383/8750 [3:54:34<10:17:44, 5.82s/it] 27%|██▋ | 2383/8750 [3:54:40<10:17:43, 5.82s/it] {'loss': 0.5067, 'learning_rate': 1.7075646889456415e-05, 'epoch': 0.27} + 27%|██▋ | 2383/8750 [3:54:40<10:17:43, 5.82s/it] {'loss': 0.5067, 'learning_rate': 1.7075646889456415e-05, 'epoch': 0.27} + 27%|██▋ | 2383/8750 [3:54:34<10:17:44, 5.82s/it] 27%|██▋ | 2384/8750 [3:54:40<10:19:42, 5.84s/it] 27%|██▋ | 2384/8750 [3:54:46<10:19:42, 5.84s/it] {'loss': 0.4691, 'learning_rate': 1.7073030637137535e-05, 'epoch': 0.27} + 27%|██▋ | 2384/8750 [3:54:46<10:19:42, 5.84s/it] {'loss': 0.4691, 'learning_rate': 1.7073030637137535e-05, 'epoch': 0.27} 
+ 27%|██▋ | 2384/8750 [3:54:40<10:19:42, 5.84s/it] 27%|██▋ | 2385/8750 [3:54:46<10:19:32, 5.84s/it] 27%|██▋ | 2385/8750 [3:54:52<10:19:32, 5.84s/it] {'loss': 0.4868, 'learning_rate': 1.7070413415655548e-05, 'epoch': 0.27} + 27%|██▋ | 2385/8750 [3:54:52<10:19:32, 5.84s/it] {'loss': 0.4868, 'learning_rate': 1.7070413415655548e-05, 'epoch': 0.27} + 27%|██▋ | 2385/8750 [3:54:46<10:19:32, 5.84s/it] 27%|██▋ | 2386/8750 [3:54:51<10:18:53, 5.83s/it] 27%|██▋ | 2386/8750 [3:54:58<10:18:53, 5.83s/it] {'loss': 0.4821, 'learning_rate': 1.7067795225369063e-05, 'epoch': 0.27} + 27%|██▋ | 2386/8750 [3:54:58<10:18:53, 5.83s/it] {'loss': 0.4821, 'learning_rate': 1.7067795225369063e-05, 'epoch': 0.27} + 27%|██▋ | 2386/8750 [3:54:51<10:18:53, 5.83s/it] 27%|██▋ | 2387/8750 [3:54:57<10:15:25, 5.80s/it] 27%|██▋ | 2387/8750 [3:55:04<10:15:25, 5.80s/it] {'loss': 0.4938, 'learning_rate': 1.7065176066636836e-05, 'epoch': 0.27} + 27%|██▋ | 2387/8750 [3:55:04<10:15:25, 5.80s/it] {'loss': 0.4938, 'learning_rate': 1.7065176066636836e-05, 'epoch': 0.27} + 27%|██▋ | 2387/8750 [3:54:57<10:15:25, 5.80s/it] 27%|██▋ | 2388/8750 [3:55:09<10:15:04, 5.80s/it] 27%|██▋ | 2388/8750 [3:55:03<10:15:06, 5.80s/it] {'loss': 0.4777, 'learning_rate': 1.706255593981775e-05, 'epoch': 0.27} + 27%|██▋ | 2388/8750 [3:55:09<10:15:04, 5.80s/it] {'loss': 0.4777, 'learning_rate': 1.706255593981775e-05, 'epoch': 0.27} + 27%|██▋ | 2388/8750 [3:55:03<10:15:06, 5.80s/it] 27%|██▋ | 2389/8750 [3:55:15<10:09:05, 5.75s/it] {'loss': 0.5058, 'learning_rate': 1.7059934845270826e-05, 'epoch': 0.27} + 27%|██▋ | 2389/8750 [3:55:15<10:09:05, 5.75s/it] 27%|██▋ | 2389/8750 [3:55:08<10:09:06, 5.75s/it] {'loss': 0.5058, 'learning_rate': 1.7059934845270826e-05, 'epoch': 0.27} + 27%|██▋ | 2389/8750 [3:55:08<10:09:06, 5.75s/it] 27%|██▋ | 2390/8750 [3:55:21<10:10:10, 5.76s/it] 27%|██▋ | 2390/8750 [3:55:14<10:10:11, 5.76s/it] {'loss': 0.4644, 'learning_rate': 1.70573127833552e-05, 'epoch': 0.27} + 27%|██▋ | 2390/8750 [3:55:21<10:10:10, 5.76s/it] 
{'loss': 0.4644, 'learning_rate': 1.70573127833552e-05, 'epoch': 0.27} + 27%|██▋ | 2390/8750 [3:55:14<10:10:11, 5.76s/it] 27%|██▋ | 2391/8750 [3:55:27<10:14:19, 5.80s/it] 27%|██▋ | 2391/8750 [3:55:20<10:14:24, 5.80s/it] {'loss': 0.4899, 'learning_rate': 1.705468975443016e-05, 'epoch': 0.27} + 27%|██▋ | 2391/8750 [3:55:27<10:14:19, 5.80s/it] {'loss': 0.4899, 'learning_rate': 1.705468975443016e-05, 'epoch': 0.27} + 27%|██▋ | 2391/8750 [3:55:20<10:14:24, 5.80s/it] 27%|██▋ | 2392/8750 [3:55:26<10:08:34, 5.74s/it] 27%|██▋ | 2392/8750 [3:55:32<10:08:36, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.7052065758855123e-05, 'epoch': 0.27} + 27%|██▋ | 2392/8750 [3:55:32<10:08:36, 5.74s/it] {'loss': 0.4905, 'learning_rate': 1.7052065758855123e-05, 'epoch': 0.27} + 27%|██▋ | 2392/8750 [3:55:26<10:08:34, 5.74s/it] 27%|██▋ | 2393/8750 [3:55:32<10:08:33, 5.74s/it] 27%|██▋ | 2393/8750 [3:55:38<10:08:35, 5.74s/it] {'loss': 0.4652, 'learning_rate': 1.704944079698963e-05, 'epoch': 0.27} + 27%|██▋ | 2393/8750 [3:55:38<10:08:35, 5.74s/it] {'loss': 0.4652, 'learning_rate': 1.704944079698963e-05, 'epoch': 0.27} + 27%|██▋ | 2393/8750 [3:55:32<10:08:33, 5.74s/it] 27%|██▋ | 2394/8750 [3:55:37<10:04:09, 5.70s/it] 27%|██▋ | 2394/8750 [3:55:44<10:04:10, 5.70s/it] {'loss': 0.4839, 'learning_rate': 1.704681486919336e-05, 'epoch': 0.27} + 27%|██▋ | 2394/8750 [3:55:44<10:04:10, 5.70s/it] {'loss': 0.4839, 'learning_rate': 1.704681486919336e-05, 'epoch': 0.27} + 27%|██▋ | 2394/8750 [3:55:37<10:04:09, 5.70s/it] 27%|██▋ | 2395/8750 [3:55:43<10:00:04, 5.67s/it] 27%|██▋ | 2395/8750 [3:55:49<10:00:05, 5.67s/it] {'loss': 0.4743, 'learning_rate': 1.7044187975826126e-05, 'epoch': 0.27} + 27%|██▋ | 2395/8750 [3:55:49<10:00:05, 5.67s/it] {'loss': 0.4743, 'learning_rate': 1.7044187975826126e-05, 'epoch': 0.27} + 27%|██▋ | 2395/8750 [3:55:43<10:00:04, 5.67s/it] 27%|██▋ | 2396/8750 [3:55:55<9:56:27, 5.63s/it] 27%|██▋ | 2396/8750 [3:55:48<9:56:28, 5.63s/it] {'loss': 0.4723, 'learning_rate': 1.704156011724787e-05, 
'epoch': 0.27} + 27%|██▋ | 2396/8750 [3:55:55<9:56:27, 5.63s/it] {'loss': 0.4723, 'learning_rate': 1.704156011724787e-05, 'epoch': 0.27} + 27%|██▋ | 2396/8750 [3:55:48<9:56:28, 5.63s/it] 27%|██▋ | 2397/8750 [3:56:01<10:00:21, 5.67s/it] 27%|██▋ | 2397/8750 [3:55:54<10:00:22, 5.67s/it] {'loss': 0.4885, 'learning_rate': 1.7038931293818665e-05, 'epoch': 0.27} + 27%|██▋ | 2397/8750 [3:56:01<10:00:21, 5.67s/it] {'loss': 0.4885, 'learning_rate': 1.7038931293818665e-05, 'epoch': 0.27} + 27%|██▋ | 2397/8750 [3:55:54<10:00:22, 5.67s/it] 27%|██▋ | 2398/8750 [3:56:06<9:59:12, 5.66s/it] 27%|██▋ | 2398/8750 [3:56:00<9:59:12, 5.66s/it] {'loss': 0.471, 'learning_rate': 1.703630150589872e-05, 'epoch': 0.27} + 27%|██▋ | 2398/8750 [3:56:06<9:59:12, 5.66s/it] {'loss': 0.471, 'learning_rate': 1.703630150589872e-05, 'epoch': 0.27} + 27%|██▋ | 2398/8750 [3:56:00<9:59:12, 5.66s/it] 27%|██▋ | 2399/8750 [3:56:05<9:57:47, 5.65s/it] 27%|██▋ | 2399/8750 [3:56:12<9:57:47, 5.65s/it] {'loss': 0.4791, 'learning_rate': 1.7033670753848373e-05, 'epoch': 0.27} + 27%|██▋ | 2399/8750 [3:56:12<9:57:47, 5.65s/it] {'loss': 0.4791, 'learning_rate': 1.7033670753848373e-05, 'epoch': 0.27} + 27%|██▋ | 2399/8750 [3:56:05<9:57:47, 5.65s/it]12 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...3 +AutoResumeHook: Checking whether to suspend... + 27%|██▋ | 2400/8750 [3:56:18<10:07:27, 5.74s/it]13 AutoResumeHook: Checking whether to suspend... 
+14 AutoResumeHook: Checking whether to suspend... + 27%|██▋ | 2400/8750 [3:56:11<10:07:27, 5.74s/it]15 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4562, 'learning_rate': 1.7031039038028103e-05, 'epoch': 0.27} + 27%|██▋ | 2400/8750 [3:56:18<10:07:27, 5.74s/it] {'loss': 0.4562, 'learning_rate': 1.7031039038028103e-05, 'epoch': 0.27} + 27%|██▋ | 2400/8750 [3:56:11<10:07:27, 5.74s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 27%|██▋ | 2401/8750 [3:56:37<20:28:13, 11.61s/it] 27%|██▋ | 2401/8750 [3:56:43<20:28:14, 11.61s/it] {'loss': 0.4891, 'learning_rate': 1.7028406358798505e-05, 'epoch': 0.27} + 27%|██▋ | 2401/8750 [3:56:43<20:28:14, 11.61s/it] {'loss': 0.4891, 'learning_rate': 1.7028406358798505e-05, 'epoch': 0.27} + 27%|██▋ | 2401/8750 [3:56:37<20:28:13, 11.61s/it] 27%|██▋ | 2402/8750 [3:56:42<17:23:17, 9.86s/it] 27%|██▋ | 2402/8750 [3:56:49<17:23:18, 9.86s/it] {'loss': 0.4637, 'learning_rate': 1.7025772716520324e-05, 'epoch': 0.27} + 27%|██▋ | 2402/8750 [3:56:49<17:23:18, 9.86s/it] {'loss': 0.4637, 'learning_rate': 1.7025772716520324e-05, 'epoch': 0.27} + 27%|██▋ | 2402/8750 [3:56:42<17:23:17, 9.86s/it] 27%|██▋ | 2403/8750 [3:56:48<15:07:18, 8.58s/it] 27%|██▋ | 2403/8750 [3:56:54<15:07:18, 8.58s/it] {'loss': 0.4753, 'learning_rate': 1.7023138111554412e-05, 'epoch': 0.27} + 27%|██▋ | 2403/8750 [3:56:54<15:07:18, 8.58s/it] {'loss': 0.4753, 'learning_rate': 1.7023138111554412e-05, 'epoch': 0.27} + 27%|██▋ | 2403/8750 [3:56:48<15:07:18, 8.58s/it] 27%|██▋ | 2404/8750 [3:56:53<13:32:32, 7.68s/it] 27%|██▋ | 2404/8750 [3:57:00<13:32:34, 7.68s/it] {'loss': 0.4724, 'learning_rate': 1.702050254426179e-05, 'epoch': 0.27} + 27%|██▋ | 2404/8750 [3:57:00<13:32:34, 7.68s/it] {'loss': 0.4724, 'learning_rate': 1.702050254426179e-05, 'epoch': 0.27} + 27%|██▋ | 2404/8750 [3:56:53<13:32:32, 7.68s/it] 27%|██▋ | 2405/8750 [3:56:59<12:30:07, 7.09s/it] 27%|██▋ | 2405/8750 [3:57:06<12:30:08, 7.09s/it] {'loss': 0.4795, 'learning_rate': 1.701786601500357e-05, 'epoch': 0.27} + 27%|██▋ | 2405/8750 [3:57:06<12:30:08, 7.09s/it] {'loss': 0.4795, 'learning_rate': 1.701786601500357e-05, 'epoch': 0.27} + 27%|██▋ | 2405/8750 [3:56:59<12:30:07, 7.09s/it] 27%|██▋ | 2406/8750 [3:57:05<11:46:50, 6.69s/it] 27%|██▋ | 2406/8750 [3:57:11<11:46:51, 6.69s/it] {'loss': 0.4705, 'learning_rate': 1.701522852414103e-05, 'epoch': 0.27} + 27%|██▋ | 2406/8750 [3:57:11<11:46:51, 6.69s/it] {'loss': 0.4705, 
'learning_rate': 1.701522852414103e-05, 'epoch': 0.27} + 27%|██▋ | 2406/8750 [3:57:05<11:46:50, 6.69s/it] 28%|██▊ | 2407/8750 [3:57:17<11:16:01, 6.39s/it] 28%|██▊ | 2407/8750 [3:57:11<11:16:02, 6.39s/it] {'loss': 0.5052, 'learning_rate': 1.7012590072035554e-05, 'epoch': 0.28} + 28%|██▊ | 2407/8750 [3:57:17<11:16:01, 6.39s/it] {'loss': 0.5052, 'learning_rate': 1.7012590072035554e-05, 'epoch': 0.28} + 28%|██▊ | 2407/8750 [3:57:11<11:16:02, 6.39s/it] 28%|██▊ | 2408/8750 [3:57:16<10:53:34, 6.18s/it] {'loss': 0.4825, 'learning_rate': 1.7009950659048677e-05, 'epoch': 0.28} 28%|██▊ | 2408/8750 [3:57:23<10:53:34, 6.18s/it] + 28%|██▊ | 2408/8750 [3:57:16<10:53:34, 6.18s/it] {'loss': 0.4825, 'learning_rate': 1.7009950659048677e-05, 'epoch': 0.28} + 28%|██▊ | 2408/8750 [3:57:23<10:53:34, 6.18s/it] 28%|██▊ | 2409/8750 [3:57:22<10:34:25, 6.00s/it] 28%|██▊ | 2409/8750 [3:57:28<10:34:26, 6.00s/it] {'loss': 0.48, 'learning_rate': 1.7007310285542057e-05, 'epoch': 0.28} + 28%|██▊ | 2409/8750 [3:57:28<10:34:26, 6.00s/it] {'loss': 0.48, 'learning_rate': 1.7007310285542057e-05, 'epoch': 0.28} + 28%|██▊ | 2409/8750 [3:57:22<10:34:25, 6.00s/it] 28%|██▊ | 2410/8750 [3:57:28<10:24:24, 5.91s/it] 28%|██▊ | 2410/8750 [3:57:34<10:24:24, 5.91s/it] {'loss': 0.4526, 'learning_rate': 1.7004668951877475e-05, 'epoch': 0.28} + 28%|██▊ | 2410/8750 [3:57:34<10:24:24, 5.91s/it] {'loss': 0.4526, 'learning_rate': 1.7004668951877475e-05, 'epoch': 0.28} + 28%|██▊ | 2410/8750 [3:57:28<10:24:24, 5.91s/it]Apr 10 01:44:04.248328 4030566 slurmstepd 0x155550ab8700: error: *** STEP 6684042.0 ON batch-block1-2105 CANCELLED AT 2025-04-10T01:44:04 DUE TO TIME LIMIT *** +srun: Job step aborted: Waiting up to 122 seconds for job step to finish. 
+srun: error: batch-block1-2105: task 0: Terminated +srun: Terminating StepId=6684042.0 +srun: error: batch-block1-0084: task 1: Terminated +srun: job 6710093 queued and waiting for resources +srun: job 6710093 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-0082 +JobID: 6710093 | Full list: batch-block1-0082 batch-block1-10017 +NETWORK=Efficient-Large-Model/VILA1.5-3b +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-0082 +JobID: 6710093 | Full list: batch-block1-0082 batch-block1-10017 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! 
+[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,228] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:02,453] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 10:10:03,385] [WARNING] 
[comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,385] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,385] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 10:10:03,911] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,911] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,912] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented 
+[2025-04-10 10:10:03,912] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,911] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:03,912] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:03,912] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:04,041] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:04,041] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 10:10:04,043] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 10:10:04,043] [INFO] [comm.py:594:init_distributed] cdb=None +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. 
Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 
+[2025-04-10 10:10:14,478] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 2.70B parameters + Loading checkpoint shards: 0%| | 0/2 [00:00 4096). Running this sequence through the model will result in indexing errors + 30%|███ | 2666/8750 [26:32<9:45:52, 5.78s/it] 30%|███ | 2666/8750 [26:35<9:45:51, 5.78s/it] {'loss': 0.5038, 'learning_rate': 1.6297949331732047e-05, 'epoch': 0.3} + 30%|███ | 2666/8750 [26:35<9:45:51, 5.78s/it] {'loss': 0.5038, 'learning_rate': 1.6297949331732047e-05, 'epoch': 0.3} + 30%|███ | 2666/8750 [26:32<9:45:52, 5.78s/it] 30%|███ | 2667/8750 [26:37<9:43:00, 5.75s/it] 30%|███ | 2667/8750 [26:40<9:42:59, 5.75s/it] {'loss': 0.4835, 'learning_rate': 1.629507359921193e-05, 'epoch': 0.3} + 30%|███ | 2667/8750 [26:40<9:42:59, 5.75s/it] {'loss': 0.4835, 'learning_rate': 1.629507359921193e-05, 'epoch': 0.3} + 30%|███ | 2667/8750 [26:37<9:43:00, 5.75s/it] 30%|███ | 2668/8750 [26:43<9:45:20, 5.77s/it] 30%|███ | 2668/8750 [26:46<9:45:21, 5.77s/it] {'loss': 0.4742, 'learning_rate': 1.6292197004126184e-05, 'epoch': 0.3} + 30%|███ | 2668/8750 [26:46<9:45:21, 5.77s/it] {'loss': 0.4742, 'learning_rate': 1.6292197004126184e-05, 'epoch': 0.3} + 30%|███ | 2668/8750 [26:43<9:45:20, 5.77s/it] 31%|███ | 2669/8750 [26:49<9:53:42, 5.86s/it] 31%|███ | 2669/8750 [26:52<9:53:42, 5.86s/it] {'loss': 0.4685, 'learning_rate': 1.6289319546868966e-05, 'epoch': 0.31} + 31%|███ | 2669/8750 [26:52<9:53:42, 5.86s/it] {'loss': 0.4685, 'learning_rate': 1.6289319546868966e-05, 'epoch': 0.31} + 31%|███ | 2669/8750 [26:49<9:53:42, 5.86s/it] 31%|███ | 2670/8750 [26:55<9:51:32, 5.84s/it] 31%|███ | 2670/8750 [26:58<9:51:31, 5.84s/it] {'loss': 0.4954, 'learning_rate': 1.6286441227834552e-05, 'epoch': 0.31} + 31%|███ | 2670/8750 [26:58<9:51:31, 5.84s/it] {'loss': 0.4954, 'learning_rate': 1.6286441227834552e-05, 'epoch': 0.31} + 31%|███ | 2670/8750 [26:55<9:51:32, 5.84s/it] 31%|███ | 2671/8750 [27:01<9:54:50, 5.87s/it] 31%|███ | 2671/8750 [27:04<9:54:51, 
5.87s/it] {'loss': 0.4682, 'learning_rate': 1.6283562047417342e-05, 'epoch': 0.31} + 31%|███ | 2671/8750 [27:04<9:54:51, 5.87s/it] {'loss': 0.4682, 'learning_rate': 1.6283562047417342e-05, 'epoch': 0.31} + 31%|███ | 2671/8750 [27:01<9:54:50, 5.87s/it] 31%|███ | 2672/8750 [27:07<9:53:24, 5.86s/it] 31%|███ | 2672/8750 [27:10<9:53:24, 5.86s/it] {'loss': 0.4653, 'learning_rate': 1.628068200601184e-05, 'epoch': 0.31} + 31%|███ | 2672/8750 [27:10<9:53:24, 5.86s/it] {'loss': 0.4653, 'learning_rate': 1.628068200601184e-05, 'epoch': 0.31} + 31%|███ | 2672/8750 [27:07<9:53:24, 5.86s/it] 31%|███ | 2673/8750 [27:13<9:49:45, 5.82s/it] 31%|███ | 2673/8750 [27:15<9:49:45, 5.82s/it] {'loss': 0.4799, 'learning_rate': 1.627780110401268e-05, 'epoch': 0.31} + 31%|███ | 2673/8750 [27:15<9:49:45, 5.82s/it] {'loss': 0.4799, 'learning_rate': 1.627780110401268e-05, 'epoch': 0.31} + 31%|███ | 2673/8750 [27:13<9:49:45, 5.82s/it] 31%|███ | 2674/8750 [27:18<9:46:01, 5.79s/it] 31%|███ | 2674/8750 [27:21<9:46:01, 5.79s/it] {'loss': 0.4749, 'learning_rate': 1.6274919341814607e-05, 'epoch': 0.31} + 31%|███ | 2674/8750 [27:21<9:46:01, 5.79s/it] {'loss': 0.4749, 'learning_rate': 1.6274919341814607e-05, 'epoch': 0.31} + 31%|███ | 2674/8750 [27:18<9:46:01, 5.79s/it] 31%|███ | 2675/8750 [27:24<9:45:14, 5.78s/it] 31%|███ | 2675/8750 [27:27<9:45:13, 5.78s/it] {'loss': 0.4631, 'learning_rate': 1.6272036719812496e-05, 'epoch': 0.31} + 31%|███ | 2675/8750 [27:27<9:45:13, 5.78s/it] {'loss': 0.4631, 'learning_rate': 1.6272036719812496e-05, 'epoch': 0.31} + 31%|███ | 2675/8750 [27:24<9:45:14, 5.78s/it] 31%|███ | 2676/8750 [27:30<9:40:41, 5.74s/it] 31%|███ | 2676/8750 [27:33<9:40:41, 5.74s/it] {'loss': 0.4921, 'learning_rate': 1.6269153238401317e-05, 'epoch': 0.31} + 31%|███ | 2676/8750 [27:33<9:40:41, 5.74s/it] {'loss': 0.4921, 'learning_rate': 1.6269153238401317e-05, 'epoch': 0.31} + 31%|███ | 2676/8750 [27:30<9:40:41, 5.74s/it] 31%|███ | 2677/8750 [27:35<9:40:43, 5.74s/it] 31%|███ | 2677/8750 [27:38<9:40:43, 
5.74s/it] {'loss': 0.4678, 'learning_rate': 1.626626889797618e-05, 'epoch': 0.31} + 31%|███ | 2677/8750 [27:38<9:40:43, 5.74s/it] {'loss': 0.4678, 'learning_rate': 1.626626889797618e-05, 'epoch': 0.31} + 31%|███ | 2677/8750 [27:35<9:40:43, 5.74s/it] 31%|███ | 2678/8750 [27:41<9:36:36, 5.70s/it] 31%|███ | 2678/8750 [27:44<9:36:36, 5.70s/it] {'loss': 0.4558, 'learning_rate': 1.6263383698932307e-05, 'epoch': 0.31} + 31%|███ | 2678/8750 [27:44<9:36:36, 5.70s/it] {'loss': 0.4558, 'learning_rate': 1.6263383698932307e-05, 'epoch': 0.31} + 31%|███ | 2678/8750 [27:41<9:36:36, 5.70s/it] 31%|███ | 2679/8750 [27:47<9:36:32, 5.70s/it] 31%|███ | 2679/8750 [27:50<9:36:32, 5.70s/it] {'loss': 0.4778, 'learning_rate': 1.6260497641665028e-05, 'epoch': 0.31} + 31%|███ | 2679/8750 [27:50<9:36:32, 5.70s/it] {'loss': 0.4778, 'learning_rate': 1.6260497641665028e-05, 'epoch': 0.31} + 31%|███ | 2679/8750 [27:47<9:36:32, 5.70s/it] 31%|███ | 2680/8750 [27:52<9:32:36, 5.66s/it] 31%|███ | 2680/8750 [27:55<9:32:36, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.6257610726569798e-05, 'epoch': 0.31} + 31%|███ | 2680/8750 [27:55<9:32:36, 5.66s/it] {'loss': 0.4876, 'learning_rate': 1.6257610726569798e-05, 'epoch': 0.31} + 31%|███ | 2680/8750 [27:52<9:32:36, 5.66s/it] 31%|███ | 2681/8750 [27:58<9:42:46, 5.76s/it] 31%|███ | 2681/8750 [28:01<9:42:45, 5.76s/it] {'loss': 0.4687, 'learning_rate': 1.625472295404219e-05, 'epoch': 0.31} + 31%|███ | 2681/8750 [28:01<9:42:45, 5.76s/it] {'loss': 0.4687, 'learning_rate': 1.625472295404219e-05, 'epoch': 0.31} + 31%|███ | 2681/8750 [27:58<9:42:46, 5.76s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! 
+ warnings.warn("Inputs truncated!") + 31%|███ | 2682/8750 [28:04<9:40:21, 5.74s/it] 31%|███ | 2682/8750 [28:07<9:40:20, 5.74s/it] {'loss': 0.4772, 'learning_rate': 1.625183432447789e-05, 'epoch': 0.31} + 31%|███ | 2682/8750 [28:07<9:40:20, 5.74s/it] {'loss': 0.4772, 'learning_rate': 1.625183432447789e-05, 'epoch': 0.31} + 31%|███ | 2682/8750 [28:04<9:40:21, 5.74s/it] 31%|███ | 2683/8750 [28:10<9:38:18, 5.72s/it] 31%|███ | 2683/8750 [28:12<9:38:20, 5.72s/it] {'loss': 0.46, 'learning_rate': 1.6248944838272712e-05, 'epoch': 0.31} + 31%|███ | 2683/8750 [28:12<9:38:20, 5.72s/it] {'loss': 0.46, 'learning_rate': 1.6248944838272712e-05, 'epoch': 0.31} + 31%|███ | 2683/8750 [28:10<9:38:18, 5.72s/it] 31%|███ | 2684/8750 [28:15<9:34:15, 5.68s/it] 31%|███ | 2684/8750 [28:18<9:34:15, 5.68s/it] {'loss': 0.4925, 'learning_rate': 1.6246054495822575e-05, 'epoch': 0.31} + 31%|███ | 2684/8750 [28:18<9:34:15, 5.68s/it] {'loss': 0.4925, 'learning_rate': 1.6246054495822575e-05, 'epoch': 0.31} + 31%|███ | 2684/8750 [28:15<9:34:15, 5.68s/it] 31%|███ | 2685/8750 [28:21<9:38:55, 5.73s/it] 31%|███ | 2685/8750 [28:24<9:38:54, 5.73s/it] {'loss': 0.4656, 'learning_rate': 1.6243163297523524e-05, 'epoch': 0.31} + 31%|███ | 2685/8750 [28:24<9:38:54, 5.73s/it] {'loss': 0.4656, 'learning_rate': 1.6243163297523524e-05, 'epoch': 0.31} + 31%|███ | 2685/8750 [28:21<9:38:55, 5.73s/it] 31%|███ | 2686/8750 [28:27<9:50:05, 5.84s/it] 31%|███ | 2686/8750 [28:30<9:50:05, 5.84s/it] {'loss': 0.4525, 'learning_rate': 1.6240271243771713e-05, 'epoch': 0.31} + 31%|███ | 2686/8750 [28:30<9:50:05, 5.84s/it] {'loss': 0.4525, 'learning_rate': 1.6240271243771713e-05, 'epoch': 0.31} + 31%|███ | 2686/8750 [28:27<9:50:05, 5.84s/it] 31%|███ | 2687/8750 [28:33<9:48:07, 5.82s/it] 31%|███ | 2687/8750 [28:36<9:48:07, 5.82s/it] {'loss': 0.4818, 'learning_rate': 1.6237378334963422e-05, 'epoch': 0.31} + 31%|███ | 2687/8750 [28:36<9:48:07, 5.82s/it] {'loss': 0.4818, 'learning_rate': 1.6237378334963422e-05, 'epoch': 0.31} + 31%|███ 
| 2687/8750 [28:33<9:48:07, 5.82s/it] 31%|███ | 2688/8750 [28:39<9:47:22, 5.81s/it] 31%|███ | 2688/8750 [28:42<9:47:22, 5.81s/it] {'loss': 0.4646, 'learning_rate': 1.623448457149504e-05, 'epoch': 0.31} + 31%|███ | 2688/8750 [28:42<9:47:22, 5.81s/it] {'loss': 0.4646, 'learning_rate': 1.623448457149504e-05, 'epoch': 0.31} + 31%|███ | 2688/8750 [28:39<9:47:22, 5.81s/it] 31%|███ | 2689/8750 [28:45<9:53:49, 5.88s/it] 31%|███ | 2689/8750 [28:48<9:53:49, 5.88s/it] {'loss': 0.4739, 'learning_rate': 1.623158995376308e-05, 'epoch': 0.31} + 31%|███ | 2689/8750 [28:48<9:53:49, 5.88s/it] {'loss': 0.4739, 'learning_rate': 1.623158995376308e-05, 'epoch': 0.31} + 31%|███ | 2689/8750 [28:45<9:53:49, 5.88s/it] 31%|███ | 2690/8750 [28:50<9:47:35, 5.82s/it] 31%|███ | 2690/8750 [28:53<9:47:35, 5.82s/it] {'loss': 0.4613, 'learning_rate': 1.6228694482164167e-05, 'epoch': 0.31} + 31%|███ | 2690/8750 [28:53<9:47:35, 5.82s/it] {'loss': 0.4613, 'learning_rate': 1.6228694482164167e-05, 'epoch': 0.31} + 31%|███ | 2690/8750 [28:50<9:47:35, 5.82s/it] 31%|███ | 2691/8750 [28:56<9:44:08, 5.78s/it] 31%|███ | 2691/8750 [28:59<9:44:07, 5.78s/it] {'loss': 0.5051, 'learning_rate': 1.622579815709505e-05, 'epoch': 0.31} + 31%|███ | 2691/8750 [28:59<9:44:07, 5.78s/it] {'loss': 0.5051, 'learning_rate': 1.622579815709505e-05, 'epoch': 0.31} + 31%|███ | 2691/8750 [28:56<9:44:08, 5.78s/it] 31%|███ | 2692/8750 [29:02<9:47:36, 5.82s/it] 31%|███ | 2692/8750 [29:05<9:47:36, 5.82s/it] {'loss': 0.4472, 'learning_rate': 1.6222900978952586e-05, 'epoch': 0.31} + 31%|███ | 2692/8750 [29:05<9:47:36, 5.82s/it] {'loss': 0.4472, 'learning_rate': 1.6222900978952586e-05, 'epoch': 0.31} + 31%|███ | 2692/8750 [29:02<9:47:36, 5.82s/it] 31%|███ | 2693/8750 [29:08<9:43:20, 5.78s/it] 31%|███ | 2693/8750 [29:11<9:43:19, 5.78s/it] {'loss': 0.5041, 'learning_rate': 1.6220002948133756e-05, 'epoch': 0.31} + 31%|███ | 2693/8750 [29:11<9:43:19, 5.78s/it] {'loss': 0.5041, 'learning_rate': 1.6220002948133756e-05, 'epoch': 0.31} + 31%|███ | 
2693/8750 [29:08<9:43:20, 5.78s/it] 31%|███ | 2694/8750 [29:16<9:38:39, 5.73s/it] 31%|███ | 2694/8750 [29:13<9:38:39, 5.73s/it] {'loss': 0.4845, 'learning_rate': 1.6217104065035652e-05, 'epoch': 0.31} + 31%|███ | 2694/8750 [29:16<9:38:39, 5.73s/it] {'loss': 0.4845, 'learning_rate': 1.6217104065035652e-05, 'epoch': 0.31} + 31%|███ | 2694/8750 [29:13<9:38:39, 5.73s/it] 31%|███ | 2695/8750 [29:19<9:38:04, 5.73s/it] 31%|███ | 2695/8750 [29:22<9:38:04, 5.73s/it] {'loss': 0.4754, 'learning_rate': 1.6214204330055484e-05, 'epoch': 0.31} + 31%|███ | 2695/8750 [29:22<9:38:04, 5.73s/it] {'loss': 0.4754, 'learning_rate': 1.6214204330055484e-05, 'epoch': 0.31} + 31%|███ | 2695/8750 [29:19<9:38:04, 5.73s/it] 31%|███ | 2696/8750 [29:25<9:36:47, 5.72s/it] 31%|███ | 2696/8750 [29:28<9:36:47, 5.72s/it] {'loss': 0.4589, 'learning_rate': 1.621130374359059e-05, 'epoch': 0.31} + 31%|███ | 2696/8750 [29:28<9:36:47, 5.72s/it] {'loss': 0.4589, 'learning_rate': 1.621130374359059e-05, 'epoch': 0.31} + 31%|███ | 2696/8750 [29:25<9:36:47, 5.72s/it] 31%|███ | 2697/8750 [29:30<9:35:48, 5.71s/it] 31%|███ | 2697/8750 [29:33<9:35:48, 5.71s/it] {'loss': 0.4649, 'learning_rate': 1.6208402306038406e-05, 'epoch': 0.31} + 31%|███ | 2697/8750 [29:33<9:35:48, 5.71s/it] {'loss': 0.4649, 'learning_rate': 1.6208402306038406e-05, 'epoch': 0.31} + 31%|███ | 2697/8750 [29:30<9:35:48, 5.71s/it] 31%|███ | 2698/8750 [29:39<9:41:28, 5.76s/it] 31%|███ | 2698/8750 [29:36<9:41:29, 5.76s/it] {'loss': 0.4751, 'learning_rate': 1.620550001779649e-05, 'epoch': 0.31} + 31%|███ | 2698/8750 [29:39<9:41:28, 5.76s/it] {'loss': 0.4751, 'learning_rate': 1.620550001779649e-05, 'epoch': 0.31} + 31%|███ | 2698/8750 [29:36<9:41:29, 5.76s/it] 31%|███ | 2699/8750 [29:45<9:37:37, 5.73s/it] 31%|███ | 2699/8750 [29:42<9:37:37, 5.73s/it] {'loss': 0.4805, 'learning_rate': 1.6202596879262536e-05, 'epoch': 0.31} + 31%|███ | 2699/8750 [29:45<9:37:37, 5.73s/it] {'loss': 0.4805, 'learning_rate': 1.6202596879262536e-05, 'epoch': 0.31} + 31%|███ | 
2699/8750 [29:42<9:37:37, 5.73s/it]11 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +015 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 31%|███ | 2700/8750 [29:51<9:40:10, 5.75s/it] 31%|███ | 2700/8750 [29:48<9:40:10, 5.75s/it]12 AutoResumeHook: Checking whether to suspend... + {'loss': 0.46, 'learning_rate': 1.6199692890834324e-05, 'epoch': 0.31} + 31%|███ | 2700/8750 [29:51<9:40:10, 5.75s/it] {'loss': 0.46, 'learning_rate': 1.6199692890834324e-05, 'epoch': 0.31} + 31%|███ | 2700/8750 [29:48<9:40:10, 5.75s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 31%|███ | 2701/8750 [30:09<17:22:13, 10.34s/it] 31%|███ | 2701/8750 [30:12<17:22:14, 10.34s/it] {'loss': 0.477, 'learning_rate': 1.6196788052909772e-05, 'epoch': 0.31} + 31%|███ | 2701/8750 [30:12<17:22:14, 10.34s/it] {'loss': 0.477, 'learning_rate': 1.6196788052909772e-05, 'epoch': 0.31} + 31%|███ | 2701/8750 [30:09<17:22:13, 10.34s/it] 31%|███ | 2702/8750 [30:14<14:58:08, 8.91s/it] 31%|███ | 2702/8750 [30:17<14:58:08, 8.91s/it] {'loss': 0.4757, 'learning_rate': 1.6193882365886905e-05, 'epoch': 0.31} + 31%|███ | 2702/8750 [30:17<14:58:08, 8.91s/it] {'loss': 0.4757, 'learning_rate': 1.6193882365886905e-05, 'epoch': 0.31} + 31%|███ | 2702/8750 [30:14<14:58:08, 8.91s/it] 31%|███ | 2703/8750 [30:20<13:22:41, 7.96s/it] 31%|███ | 2703/8750 [30:23<13:22:42, 7.96s/it] {'loss': 0.4712, 'learning_rate': 1.6190975830163872e-05, 'epoch': 0.31} + 31%|███ | 2703/8750 [30:23<13:22:42, 7.96s/it] {'loss': 0.4712, 'learning_rate': 1.6190975830163872e-05, 'epoch': 0.31} + 31%|███ | 2703/8750 [30:20<13:22:41, 7.96s/it] 31%|███ | 2704/8750 [30:26<12:18:06, 7.32s/it] 31%|███ | 2704/8750 [30:29<12:18:06, 7.32s/it] {'loss': 0.4701, 'learning_rate': 1.6188068446138925e-05, 'epoch': 0.31} + 31%|███ | 2704/8750 [30:29<12:18:06, 7.32s/it] {'loss': 0.4701, 'learning_rate': 1.6188068446138925e-05, 'epoch': 0.31} + 31%|███ | 2704/8750 [30:26<12:18:06, 7.32s/it] 31%|███ | 2705/8750 [30:32<11:29:01, 6.84s/it] 31%|███ | 2705/8750 [30:35<11:29:01, 6.84s/it] {'loss': 0.4706, 'learning_rate': 1.6185160214210447e-05, 'epoch': 0.31} + 31%|███ | 2705/8750 [30:35<11:29:01, 6.84s/it] {'loss': 0.4706, 'learning_rate': 1.6185160214210447e-05, 'epoch': 0.31} + 31%|███ | 2705/8750 [30:32<11:29:01, 6.84s/it] 31%|███ | 2706/8750 [30:40<10:51:58, 6.47s/it] 31%|███ | 2706/8750 [30:37<10:51:59, 6.47s/it] {'loss': 0.4659, 'learning_rate': 1.6182251134776927e-05, 'epoch': 0.31} + 31%|███ | 2706/8750 [30:40<10:51:58, 6.47s/it] {'loss': 0.4659, 'learning_rate': 1.6182251134776927e-05, 'epoch': 0.31} 
+ 31%|███ | 2706/8750 [30:37<10:51:59, 6.47s/it] 31%|███ | 2707/8750 [30:46<10:28:07, 6.24s/it] 31%|███ | 2707/8750 [30:43<10:28:07, 6.24s/it] {'loss': 0.4882, 'learning_rate': 1.6179341208236977e-05, 'epoch': 0.31} + 31%|███ | 2707/8750 [30:46<10:28:07, 6.24s/it] {'loss': 0.4882, 'learning_rate': 1.6179341208236977e-05, 'epoch': 0.31} + 31%|███ | 2707/8750 [30:43<10:28:07, 6.24s/it] 31%|███ | 2708/8750 [30:49<10:13:31, 6.09s/it] 31%|███ | 2708/8750 [30:52<10:13:32, 6.09s/it] {'loss': 0.4824, 'learning_rate': 1.617643043498932e-05, 'epoch': 0.31} + 31%|███ | 2708/8750 [30:52<10:13:32, 6.09s/it] {'loss': 0.4824, 'learning_rate': 1.617643043498932e-05, 'epoch': 0.31} + 31%|███ | 2708/8750 [30:49<10:13:31, 6.09s/it] 31%|███ | 2709/8750 [30:55<10:04:32, 6.00s/it] 31%|███ | 2709/8750 [30:57<10:04:32, 6.00s/it] {'loss': 0.4679, 'learning_rate': 1.6173518815432797e-05, 'epoch': 0.31} {'loss': 0.4679, 'learning_rate': 1.6173518815432797e-05, 'epoch': 0.31} + 31%|███ | 2709/8750 [30:57<10:04:32, 6.00s/it] + 31%|███ | 2709/8750 [30:55<10:04:32, 6.00s/it] 31%|███ | 2710/8750 [31:00<9:57:38, 5.94s/it] 31%|███ | 2710/8750 [31:03<9:57:38, 5.94s/it] {'loss': 0.4932, 'learning_rate': 1.6170606349966367e-05, 'epoch': 0.31} + 31%|███ | 2710/8750 [31:03<9:57:38, 5.94s/it] {'loss': 0.4932, 'learning_rate': 1.6170606349966367e-05, 'epoch': 0.31} + 31%|███ | 2710/8750 [31:00<9:57:38, 5.94s/it] 31%|███ | 2711/8750 [31:06<9:49:36, 5.86s/it] 31%|███ | 2711/8750 [31:09<9:49:37, 5.86s/it] {'loss': 0.485, 'learning_rate': 1.6167693038989098e-05, 'epoch': 0.31} + 31%|███ | 2711/8750 [31:09<9:49:37, 5.86s/it] {'loss': 0.485, 'learning_rate': 1.6167693038989098e-05, 'epoch': 0.31} + 31%|███ | 2711/8750 [31:06<9:49:36, 5.86s/it] 31%|███ | 2712/8750 [31:12<10:00:55, 5.97s/it] 31%|███ | 2712/8750 [31:15<10:00:55, 5.97s/it] {'loss': 0.4555, 'learning_rate': 1.6164778882900186e-05, 'epoch': 0.31} + 31%|███ | 2712/8750 [31:15<10:00:55, 5.97s/it] {'loss': 0.4555, 'learning_rate': 
1.6164778882900186e-05, 'epoch': 0.31} + 31%|███ | 2712/8750 [31:12<10:00:55, 5.97s/it] 31%|███ | 2713/8750 [31:18<9:50:02, 5.86s/it] 31%|███ | 2713/8750 [31:21<9:50:03, 5.86s/it] {'loss': 0.5035, 'learning_rate': 1.6161863882098926e-05, 'epoch': 0.31} + 31%|███ | 2713/8750 [31:21<9:50:03, 5.86s/it] {'loss': 0.5035, 'learning_rate': 1.6161863882098926e-05, 'epoch': 0.31} + 31%|███ | 2713/8750 [31:18<9:50:02, 5.86s/it] 31%|███ | 2714/8750 [31:26<9:46:12, 5.83s/it] 31%|███ | 2714/8750 [31:24<9:46:13, 5.83s/it] {'loss': 0.4557, 'learning_rate': 1.615894803698475e-05, 'epoch': 0.31} + 31%|███ | 2714/8750 [31:26<9:46:12, 5.83s/it] {'loss': 0.4557, 'learning_rate': 1.615894803698475e-05, 'epoch': 0.31} + 31%|███ | 2714/8750 [31:24<9:46:13, 5.83s/it] 31%|███ | 2715/8750 [31:29<9:44:13, 5.81s/it] 31%|███ | 2715/8750 [31:32<9:44:13, 5.81s/it] {'loss': 0.4986, 'learning_rate': 1.615603134795718e-05, 'epoch': 0.31} + 31%|███ | 2715/8750 [31:32<9:44:13, 5.81s/it] {'loss': 0.4986, 'learning_rate': 1.615603134795718e-05, 'epoch': 0.31} + 31%|███ | 2715/8750 [31:29<9:44:13, 5.81s/it] 31%|███ | 2716/8750 [31:35<9:37:13, 5.74s/it] 31%|███ | 2716/8750 [31:38<9:37:14, 5.74s/it] {'loss': 0.4746, 'learning_rate': 1.615311381541588e-05, 'epoch': 0.31} + 31%|███ | 2716/8750 [31:38<9:37:14, 5.74s/it] {'loss': 0.4746, 'learning_rate': 1.615311381541588e-05, 'epoch': 0.31} + 31%|███ | 2716/8750 [31:35<9:37:13, 5.74s/it] 31%|███ | 2717/8750 [31:44<9:36:23, 5.73s/it] 31%|███ | 2717/8750 [31:41<9:36:24, 5.73s/it] {'loss': 0.4601, 'learning_rate': 1.615019543976061e-05, 'epoch': 0.31} + 31%|███ | 2717/8750 [31:44<9:36:23, 5.73s/it] {'loss': 0.4601, 'learning_rate': 1.615019543976061e-05, 'epoch': 0.31} + 31%|███ | 2717/8750 [31:41<9:36:24, 5.73s/it] 31%|███ | 2718/8750 [31:49<9:42:24, 5.79s/it] 31%|███ | 2718/8750 [31:47<9:42:25, 5.79s/it] {'loss': 0.4762, 'learning_rate': 1.6147276221391256e-05, 'epoch': 0.31} + 31%|███ | 2718/8750 [31:49<9:42:24, 5.79s/it] {'loss': 0.4762, 'learning_rate': 
1.6147276221391256e-05, 'epoch': 0.31} + 31%|███ | 2718/8750 [31:47<9:42:25, 5.79s/it] 31%|███ | 2719/8750 [31:53<9:51:13, 5.88s/it] 31%|███ | 2719/8750 [31:56<9:51:13, 5.88s/it] {'loss': 0.483, 'learning_rate': 1.614435616070781e-05, 'epoch': 0.31} + 31%|███ | 2719/8750 [31:56<9:51:13, 5.88s/it] {'loss': 0.483, 'learning_rate': 1.614435616070781e-05, 'epoch': 0.31} + 31%|███ | 2719/8750 [31:53<9:51:13, 5.88s/it] 31%|███ | 2720/8750 [31:59<9:50:51, 5.88s/it] 31%|███ | 2720/8750 [32:01<9:50:52, 5.88s/it] {'loss': 0.4747, 'learning_rate': 1.6141435258110397e-05, 'epoch': 0.31} + 31%|███ | 2720/8750 [32:01<9:50:52, 5.88s/it] {'loss': 0.4747, 'learning_rate': 1.6141435258110397e-05, 'epoch': 0.31} + 31%|███ | 2720/8750 [31:59<9:50:51, 5.88s/it] 31%|███ | 2721/8750 [32:07<9:52:46, 5.90s/it] 31%|███ | 2721/8750 [32:04<9:52:47, 5.90s/it] {'loss': 0.4668, 'learning_rate': 1.6138513513999234e-05, 'epoch': 0.31} + 31%|███ | 2721/8750 [32:07<9:52:46, 5.90s/it] {'loss': 0.4668, 'learning_rate': 1.6138513513999234e-05, 'epoch': 0.31} + 31%|███ | 2721/8750 [32:04<9:52:47, 5.90s/it] 31%|███ | 2722/8750 [32:13<9:48:22, 5.86s/it] 31%|███ | 2722/8750 [32:10<9:48:23, 5.86s/it] {'loss': 0.4704, 'learning_rate': 1.613559092877467e-05, 'epoch': 0.31} + 31%|███ | 2722/8750 [32:13<9:48:22, 5.86s/it] {'loss': 0.4704, 'learning_rate': 1.613559092877467e-05, 'epoch': 0.31} + 31%|███ | 2722/8750 [32:10<9:48:23, 5.86s/it] 31%|███ | 2723/8750 [32:16<9:48:37, 5.86s/it] 31%|███ | 2723/8750 [32:19<9:48:37, 5.86s/it] {'loss': 0.4809, 'learning_rate': 1.6132667502837164e-05, 'epoch': 0.31} + 31%|███ | 2723/8750 [32:19<9:48:37, 5.86s/it] {'loss': 0.4809, 'learning_rate': 1.6132667502837164e-05, 'epoch': 0.31} + 31%|███ | 2723/8750 [32:16<9:48:37, 5.86s/it] 31%|███ | 2724/8750 [32:22<9:42:58, 5.80s/it] 31%|███ | 2724/8750 [32:25<9:42:59, 5.80s/it] {'loss': 0.4661, 'learning_rate': 1.6129743236587293e-05, 'epoch': 0.31} + 31%|███ | 2724/8750 [32:25<9:42:59, 5.80s/it] {'loss': 0.4661, 'learning_rate': 
1.6129743236587293e-05, 'epoch': 0.31} + 31%|███ | 2724/8750 [32:22<9:42:58, 5.80s/it] 31%|███ | 2725/8750 [32:31<9:44:18, 5.82s/it] 31%|███ | 2725/8750 [32:28<9:44:18, 5.82s/it] {'loss': 0.48, 'learning_rate': 1.6126818130425746e-05, 'epoch': 0.31} + 31%|███ | 2725/8750 [32:31<9:44:18, 5.82s/it] {'loss': 0.48, 'learning_rate': 1.6126818130425746e-05, 'epoch': 0.31} + 31%|███ | 2725/8750 [32:28<9:44:18, 5.82s/it] 31%|███ | 2726/8750 [32:36<9:41:25, 5.79s/it] 31%|███ | 2726/8750 [32:33<9:41:25, 5.79s/it] {'loss': 0.4916, 'learning_rate': 1.6123892184753324e-05, 'epoch': 0.31} + 31%|███ | 2726/8750 [32:36<9:41:25, 5.79s/it] {'loss': 0.4916, 'learning_rate': 1.6123892184753324e-05, 'epoch': 0.31} + 31%|███ | 2726/8750 [32:33<9:41:25, 5.79s/it] 31%|███ | 2727/8750 [32:39<9:44:53, 5.83s/it] 31%|███ | 2727/8750 [32:42<9:44:53, 5.83s/it] {'loss': 0.4791, 'learning_rate': 1.612096539997095e-05, 'epoch': 0.31} + 31%|███ | 2727/8750 [32:42<9:44:53, 5.83s/it] {'loss': 0.4791, 'learning_rate': 1.612096539997095e-05, 'epoch': 0.31} + 31%|███ | 2727/8750 [32:39<9:44:53, 5.83s/it] 31%|███ | 2728/8750 [32:45<9:43:21, 5.81s/it] 31%|███ | 2728/8750 [32:48<9:43:22, 5.81s/it] {'loss': 0.4851, 'learning_rate': 1.611803777647966e-05, 'epoch': 0.31} + 31%|███ | 2728/8750 [32:48<9:43:22, 5.81s/it] {'loss': 0.4851, 'learning_rate': 1.611803777647966e-05, 'epoch': 0.31} + 31%|███ | 2728/8750 [32:45<9:43:21, 5.81s/it] 31%|███ | 2729/8750 [32:54<9:46:12, 5.84s/it] 31%|███ | 2729/8750 [32:51<9:46:12, 5.84s/it] {'loss': 0.4755, 'learning_rate': 1.6115109314680603e-05, 'epoch': 0.31} + 31%|███ | 2729/8750 [32:54<9:46:12, 5.84s/it] {'loss': 0.4755, 'learning_rate': 1.6115109314680603e-05, 'epoch': 0.31} + 31%|███ | 2729/8750 [32:51<9:46:12, 5.84s/it] 31%|███ | 2730/8750 [33:00<9:45:38, 5.84s/it] 31%|███ | 2730/8750 [32:57<9:45:39, 5.84s/it] {'loss': 0.4662, 'learning_rate': 1.611218001497504e-05, 'epoch': 0.31} + 31%|███ | 2730/8750 [33:00<9:45:38, 5.84s/it] {'loss': 0.4662, 'learning_rate': 
1.611218001497504e-05, 'epoch': 0.31} + 31%|███ | 2730/8750 [32:57<9:45:39, 5.84s/it] 31%|███ | 2731/8750 [33:03<9:43:22, 5.82s/it] 31%|███ | 2731/8750 [33:05<9:43:23, 5.82s/it] {'loss': 0.4743, 'learning_rate': 1.610924987776436e-05, 'epoch': 0.31} + 31%|███ | 2731/8750 [33:05<9:43:23, 5.82s/it] {'loss': 0.4743, 'learning_rate': 1.610924987776436e-05, 'epoch': 0.31} + 31%|███ | 2731/8750 [33:03<9:43:22, 5.82s/it] 31%|███ | 2732/8750 [33:08<9:47:04, 5.85s/it] 31%|███ | 2732/8750 [33:11<9:47:04, 5.85s/it] {'loss': 0.446, 'learning_rate': 1.6106318903450042e-05, 'epoch': 0.31} + 31%|███ | 2732/8750 [33:11<9:47:04, 5.85s/it] {'loss': 0.446, 'learning_rate': 1.6106318903450042e-05, 'epoch': 0.31} + 31%|███ | 2732/8750 [33:08<9:47:04, 5.85s/it] 31%|███ | 2733/8750 [33:17<9:46:51, 5.85s/it] 31%|███ | 2733/8750 [33:14<9:46:51, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.6103387092433704e-05, 'epoch': 0.31} + 31%|███ | 2733/8750 [33:17<9:46:51, 5.85s/it] {'loss': 0.48, 'learning_rate': 1.6103387092433704e-05, 'epoch': 0.31} + 31%|███ | 2733/8750 [33:14<9:46:51, 5.85s/it] 31%|███ | 2734/8750 [33:23<9:48:26, 5.87s/it] 31%|███ | 2734/8750 [33:20<9:48:26, 5.87s/it] {'loss': 0.4846, 'learning_rate': 1.6100454445117074e-05, 'epoch': 0.31} + 31%|███ | 2734/8750 [33:23<9:48:26, 5.87s/it] {'loss': 0.4846, 'learning_rate': 1.6100454445117074e-05, 'epoch': 0.31} + 31%|███ | 2734/8750 [33:20<9:48:26, 5.87s/it] 31%|███▏ | 2735/8750 [33:29<9:45:51, 5.84s/it] 31%|███▏ | 2735/8750 [33:26<9:45:51, 5.84s/it] {'loss': 0.4946, 'learning_rate': 1.6097520961901983e-05, 'epoch': 0.31} + 31%|███▏ | 2735/8750 [33:29<9:45:51, 5.84s/it] {'loss': 0.4946, 'learning_rate': 1.6097520961901983e-05, 'epoch': 0.31} + 31%|███▏ | 2735/8750 [33:26<9:45:51, 5.84s/it] 31%|███▏ | 2736/8750 [33:35<9:53:45, 5.92s/it] 31%|███▏ | 2736/8750 [33:32<9:53:45, 5.92s/it] {'loss': 0.4798, 'learning_rate': 1.6094586643190388e-05, 'epoch': 0.31} + 31%|███▏ | 2736/8750 [33:35<9:53:45, 5.92s/it] {'loss': 0.4798, 
'learning_rate': 1.6094586643190388e-05, 'epoch': 0.31} + 31%|███▏ | 2736/8750 [33:32<9:53:45, 5.92s/it] 31%|███▏ | 2737/8750 [33:38<9:54:26, 5.93s/it] 31%|███▏ | 2737/8750 [33:41<9:54:26, 5.93s/it]{'loss': 0.4737, 'learning_rate': 1.609165148938435e-05, 'epoch': 0.31} + {'loss': 0.4737, 'learning_rate': 1.609165148938435e-05, 'epoch': 0.31} + 31%|███▏ | 2737/8750 [33:41<9:54:26, 5.93s/it] 31%|███▏ | 2737/8750 [33:38<9:54:26, 5.93s/it] 31%|███▏ | 2738/8750 [33:47<9:47:25, 5.86s/it] 31%|███▏ | 2738/8750 [33:44<9:47:25, 5.86s/it] {'loss': 0.4634, 'learning_rate': 1.608871550088606e-05, 'epoch': 0.31} + 31%|███▏ | 2738/8750 [33:47<9:47:25, 5.86s/it] {'loss': 0.4634, 'learning_rate': 1.608871550088606e-05, 'epoch': 0.31} + 31%|███▏ | 2738/8750 [33:44<9:47:25, 5.86s/it] 31%|███▏ | 2739/8750 [33:52<9:41:24, 5.80s/it] 31%|███▏ | 2739/8750 [33:49<9:41:24, 5.80s/it] {'loss': 0.4716, 'learning_rate': 1.6085778678097804e-05, 'epoch': 0.31} + 31%|███▏ | 2739/8750 [33:52<9:41:24, 5.80s/it] {'loss': 0.4716, 'learning_rate': 1.6085778678097804e-05, 'epoch': 0.31} + 31%|███▏ | 2739/8750 [33:49<9:41:24, 5.80s/it] 31%|███▏ | 2740/8750 [33:58<9:35:38, 5.75s/it] 31%|███▏ | 2740/8750 [33:55<9:35:38, 5.75s/it] {'loss': 0.4621, 'learning_rate': 1.6082841021422e-05, 'epoch': 0.31} + 31%|███▏ | 2740/8750 [33:58<9:35:38, 5.75s/it] {'loss': 0.4621, 'learning_rate': 1.6082841021422e-05, 'epoch': 0.31} + 31%|███▏ | 2740/8750 [33:55<9:35:38, 5.75s/it] 31%|███▏ | 2741/8750 [34:01<9:36:55, 5.76s/it] 31%|███▏ | 2741/8750 [34:04<9:36:56, 5.76s/it] {'loss': 0.4844, 'learning_rate': 1.607990253126117e-05, 'epoch': 0.31} + 31%|███▏ | 2741/8750 [34:04<9:36:56, 5.76s/it] {'loss': 0.4844, 'learning_rate': 1.607990253126117e-05, 'epoch': 0.31} + 31%|███▏ | 2741/8750 [34:01<9:36:55, 5.76s/it] 31%|███▏ | 2742/8750 [34:10<9:37:35, 5.77s/it] 31%|███▏ | 2742/8750 [34:07<9:37:36, 5.77s/it] {'loss': 0.479, 'learning_rate': 1.607696320801795e-05, 'epoch': 0.31} + 31%|███▏ | 2742/8750 [34:10<9:37:35, 5.77s/it] 
{'loss': 0.479, 'learning_rate': 1.607696320801795e-05, 'epoch': 0.31} + 31%|███▏ | 2742/8750 [34:07<9:37:36, 5.77s/it] 31%|███▏ | 2743/8750 [34:13<9:49:48, 5.89s/it] 31%|███▏ | 2743/8750 [34:16<9:49:48, 5.89s/it] {'loss': 0.4718, 'learning_rate': 1.6074023052095096e-05, 'epoch': 0.31} + 31%|███▏ | 2743/8750 [34:16<9:49:48, 5.89s/it] {'loss': 0.4718, 'learning_rate': 1.6074023052095096e-05, 'epoch': 0.31} + 31%|███▏ | 2743/8750 [34:13<9:49:48, 5.89s/it] 31%|███▏ | 2744/8750 [34:19<9:47:57, 5.87s/it] 31%|███▏ | 2744/8750 [34:22<9:47:57, 5.87s/it] {'loss': 0.4843, 'learning_rate': 1.6071082063895476e-05, 'epoch': 0.31} + 31%|███▏ | 2744/8750 [34:22<9:47:57, 5.87s/it] {'loss': 0.4843, 'learning_rate': 1.6071082063895476e-05, 'epoch': 0.31} + 31%|███▏ | 2744/8750 [34:19<9:47:57, 5.87s/it] 31%|███▏ | 2745/8750 [34:27<9:43:55, 5.83s/it] 31%|███▏ | 2745/8750 [34:24<9:43:55, 5.83s/it] {'loss': 0.4677, 'learning_rate': 1.6068140243822065e-05, 'epoch': 0.31} + 31%|███▏ | 2745/8750 [34:27<9:43:55, 5.83s/it] {'loss': 0.4677, 'learning_rate': 1.6068140243822065e-05, 'epoch': 0.31} + 31%|███▏ | 2745/8750 [34:24<9:43:55, 5.83s/it] 31%|███▏ | 2746/8750 [34:33<9:43:43, 5.83s/it] 31%|███▏ | 2746/8750 [34:30<9:43:44, 5.83s/it] {'loss': 0.4666, 'learning_rate': 1.6065197592277965e-05, 'epoch': 0.31} + 31%|███▏ | 2746/8750 [34:33<9:43:43, 5.83s/it] {'loss': 0.4666, 'learning_rate': 1.6065197592277965e-05, 'epoch': 0.31} + 31%|███▏ | 2746/8750 [34:30<9:43:44, 5.83s/it] 31%|███▏ | 2747/8750 [34:39<9:41:54, 5.82s/it] 31%|███▏ | 2747/8750 [34:36<9:41:54, 5.82s/it] {'loss': 0.4853, 'learning_rate': 1.6062254109666383e-05, 'epoch': 0.31} + 31%|███▏ | 2747/8750 [34:39<9:41:54, 5.82s/it] {'loss': 0.4853, 'learning_rate': 1.6062254109666383e-05, 'epoch': 0.31} + 31%|███▏ | 2747/8750 [34:36<9:41:54, 5.82s/it] 31%|███▏ | 2748/8750 [34:42<9:35:38, 5.75s/it] 31%|███▏ | 2748/8750 [34:45<9:35:39, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.6059309796390638e-05, 'epoch': 0.31} + 31%|███▏ | 2748/8750 
[34:45<9:35:39, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.6059309796390638e-05, 'epoch': 0.31} + 31%|███▏ | 2748/8750 [34:42<9:35:38, 5.75s/it] 31%|███▏ | 2749/8750 [34:50<9:34:39, 5.75s/it] 31%|███▏ | 2749/8750 [34:47<9:34:39, 5.75s/it] {'loss': 0.4768, 'learning_rate': 1.6056364652854174e-05, 'epoch': 0.31} + 31%|███▏ | 2749/8750 [34:50<9:34:39, 5.75s/it] {'loss': 0.4768, 'learning_rate': 1.6056364652854174e-05, 'epoch': 0.31} + 31%|███▏ | 2749/8750 [34:47<9:34:39, 5.75s/it]2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 9AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + 31%|███▏ | 2750/8750 [34:53<9:31:01, 5.71s/it] +12 AutoResumeHook: Checking whether to suspend... + 31%|███▏ | 2750/8750 [34:56<9:31:01, 5.71s/it]7 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4528, 'learning_rate': 1.6053418679460534e-05, 'epoch': 0.31} + 31%|███▏ | 2750/8750 [34:56<9:31:01, 5.71s/it] {'loss': 0.4528, 'learning_rate': 1.6053418679460534e-05, 'epoch': 0.31} + 31%|███▏ | 2750/8750 [34:53<9:31:01, 5.71s/it] 31%|███▏ | 2751/8750 [34:59<9:34:22, 5.74s/it] 31%|███▏ | 2751/8750 [35:02<9:34:22, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.6050471876613386e-05, 'epoch': 0.31} + 31%|███▏ | 2751/8750 [35:02<9:34:22, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.6050471876613386e-05, 'epoch': 0.31} + 31%|███▏ | 2751/8750 [34:59<9:34:22, 5.74s/it] 31%|███▏ | 2752/8750 [35:04<9:30:40, 5.71s/it] 31%|███▏ | 2752/8750 [35:07<9:30:40, 5.71s/it] {'loss': 0.4713, 'learning_rate': 1.6047524244716506e-05, 'epoch': 0.31} + 31%|███▏ | 2752/8750 [35:07<9:30:40, 5.71s/it] {'loss': 0.4713, 'learning_rate': 1.6047524244716506e-05, 'epoch': 0.31} + 31%|███▏ | 2752/8750 [35:04<9:30:40, 5.71s/it] 31%|███▏ | 2753/8750 [35:10<9:31:12, 5.71s/it] 31%|███▏ | 2753/8750 [35:13<9:31:11, 5.71s/it] {'loss': 0.4758, 'learning_rate': 1.604457578417379e-05, 'epoch': 0.31} + 31%|███▏ | 2753/8750 [35:13<9:31:11, 5.71s/it] {'loss': 0.4758, 'learning_rate': 1.604457578417379e-05, 'epoch': 0.31} + 31%|███▏ | 2753/8750 [35:10<9:31:12, 5.71s/it] 31%|███▏ | 2754/8750 [35:19<9:30:42, 5.71s/it] 31%|███▏ | 2754/8750 [35:16<9:30:43, 5.71s/it] {'loss': 0.4742, 'learning_rate': 1.6041626495389235e-05, 'epoch': 0.31} + 31%|███▏ | 2754/8750 [35:19<9:30:42, 5.71s/it] {'loss': 0.4742, 'learning_rate': 1.6041626495389235e-05, 'epoch': 0.31} + 31%|███▏ | 2754/8750 [35:16<9:30:43, 5.71s/it] 31%|███▏ | 2755/8750 [35:25<9:32:34, 5.73s/it] 31%|███▏ | 2755/8750 [35:22<9:32:34, 5.73s/it] {'loss': 0.4639, 'learning_rate': 1.6038676378766968e-05, 'epoch': 0.31} + 31%|███▏ | 2755/8750 [35:25<9:32:34, 5.73s/it] {'loss': 0.4639, 'learning_rate': 1.6038676378766968e-05, 'epoch': 0.31} + 31%|███▏ | 2755/8750 [35:22<9:32:34, 5.73s/it] 31%|███▏ | 2756/8750 [35:27<9:34:36, 5.75s/it] 31%|███▏ | 
2756/8750 [35:30<9:34:37, 5.75s/it] {'loss': 0.4743, 'learning_rate': 1.603572543471121e-05, 'epoch': 0.31} + 31%|███▏ | 2756/8750 [35:30<9:34:37, 5.75s/it] {'loss': 0.4743, 'learning_rate': 1.603572543471121e-05, 'epoch': 0.31} + 31%|███▏ | 2756/8750 [35:27<9:34:36, 5.75s/it] 32%|███▏ | 2757/8750 [35:33<9:36:27, 5.77s/it] 32%|███▏ | 2757/8750 [35:36<9:36:27, 5.77s/it] {'loss': 0.4833, 'learning_rate': 1.603277366362632e-05, 'epoch': 0.32} + 32%|███▏ | 2757/8750 [35:36<9:36:27, 5.77s/it] {'loss': 0.4833, 'learning_rate': 1.603277366362632e-05, 'epoch': 0.32} + 32%|███▏ | 2757/8750 [35:33<9:36:27, 5.77s/it] 32%|███▏ | 2758/8750 [35:39<9:33:09, 5.74s/it] 32%|███▏ | 2758/8750 [35:42<9:33:10, 5.74s/it] {'loss': 0.4669, 'learning_rate': 1.6029821065916745e-05, 'epoch': 0.32} + 32%|███▏ | 2758/8750 [35:42<9:33:10, 5.74s/it] {'loss': 0.4669, 'learning_rate': 1.6029821065916745e-05, 'epoch': 0.32} + 32%|███▏ | 2758/8750 [35:39<9:33:09, 5.74s/it] 32%|███▏ | 2759/8750 [35:45<9:32:05, 5.73s/it] 32%|███▏ | 2759/8750 [35:48<9:32:05, 5.73s/it] {'loss': 0.4883, 'learning_rate': 1.602686764198706e-05, 'epoch': 0.32} + 32%|███▏ | 2759/8750 [35:48<9:32:05, 5.73s/it] {'loss': 0.4883, 'learning_rate': 1.602686764198706e-05, 'epoch': 0.32} + 32%|███▏ | 2759/8750 [35:45<9:32:05, 5.73s/it] 32%|███▏ | 2760/8750 [35:50<9:26:58, 5.68s/it] 32%|███▏ | 2760/8750 [35:53<9:26:58, 5.68s/it] {'loss': 0.4739, 'learning_rate': 1.602391339224196e-05, 'epoch': 0.32} + 32%|███▏ | 2760/8750 [35:53<9:26:58, 5.68s/it] {'loss': 0.4739, 'learning_rate': 1.602391339224196e-05, 'epoch': 0.32} + 32%|███▏ | 2760/8750 [35:50<9:26:58, 5.68s/it] 32%|███▏ | 2761/8750 [35:56<9:31:34, 5.73s/it] 32%|███▏ | 2761/8750 [35:59<9:31:35, 5.73s/it] {'loss': 0.466, 'learning_rate': 1.6020958317086224e-05, 'epoch': 0.32} + 32%|███▏ | 2761/8750 [35:59<9:31:35, 5.73s/it] {'loss': 0.466, 'learning_rate': 1.6020958317086224e-05, 'epoch': 0.32} + 32%|███▏ | 2761/8750 [35:56<9:31:34, 5.73s/it] 32%|███▏ | 2762/8750 [36:05<9:27:08, 
5.68s/it] 32%|███▏ | 2762/8750 [36:02<9:27:09, 5.68s/it] {'loss': 0.4624, 'learning_rate': 1.601800241692478e-05, 'epoch': 0.32} + 32%|███▏ | 2762/8750 [36:05<9:27:08, 5.68s/it] {'loss': 0.4624, 'learning_rate': 1.601800241692478e-05, 'epoch': 0.32} + 32%|███▏ | 2762/8750 [36:02<9:27:09, 5.68s/it] 32%|███▏ | 2763/8750 [36:07<9:31:02, 5.72s/it] 32%|███▏ | 2763/8750 [36:10<9:31:01, 5.72s/it] {'loss': 0.4698, 'learning_rate': 1.6015045692162644e-05, 'epoch': 0.32} + 32%|███▏ | 2763/8750 [36:10<9:31:01, 5.72s/it] {'loss': 0.4698, 'learning_rate': 1.6015045692162644e-05, 'epoch': 0.32} + 32%|███▏ | 2763/8750 [36:07<9:31:02, 5.72s/it] 32%|███▏ | 2764/8750 [36:13<9:29:12, 5.71s/it] 32%|███▏ | 2764/8750 [36:16<9:29:13, 5.71s/it] {'loss': 0.4567, 'learning_rate': 1.6012088143204953e-05, 'epoch': 0.32} + 32%|███▏ | 2764/8750 [36:16<9:29:13, 5.71s/it] {'loss': 0.4567, 'learning_rate': 1.6012088143204953e-05, 'epoch': 0.32} + 32%|███▏ | 2764/8750 [36:13<9:29:12, 5.71s/it] 32%|███▏ | 2765/8750 [36:19<9:31:22, 5.73s/it] 32%|███▏ | 2765/8750 [36:22<9:31:22, 5.73s/it] {'loss': 0.4719, 'learning_rate': 1.6009129770456962e-05, 'epoch': 0.32} + 32%|███▏ | 2765/8750 [36:22<9:31:22, 5.73s/it] {'loss': 0.4719, 'learning_rate': 1.6009129770456962e-05, 'epoch': 0.32} + 32%|███▏ | 2765/8750 [36:19<9:31:22, 5.73s/it] 32%|███▏ | 2766/8750 [36:25<9:33:00, 5.75s/it] 32%|███▏ | 2766/8750 [36:28<9:32:59, 5.75s/it] {'loss': 0.4681, 'learning_rate': 1.6006170574324033e-05, 'epoch': 0.32} + 32%|███▏ | 2766/8750 [36:28<9:32:59, 5.75s/it] {'loss': 0.4681, 'learning_rate': 1.6006170574324033e-05, 'epoch': 0.32} + 32%|███▏ | 2766/8750 [36:25<9:33:00, 5.75s/it] 32%|███▏ | 2767/8750 [36:30<9:31:22, 5.73s/it] 32%|███▏ | 2767/8750 [36:33<9:31:23, 5.73s/it] {'loss': 0.4753, 'learning_rate': 1.6003210555211635e-05, 'epoch': 0.32} + 32%|███▏ | 2767/8750 [36:33<9:31:23, 5.73s/it] {'loss': 0.4753, 'learning_rate': 1.6003210555211635e-05, 'epoch': 0.32} + 32%|███▏ | 2767/8750 [36:30<9:31:22, 5.73s/it] 32%|███▏ | 
2768/8750 [36:36<9:30:44, 5.72s/it] 32%|███▏ | 2768/8750 [36:39<9:30:45, 5.72s/it] {'loss': 0.4674, 'learning_rate': 1.6000249713525366e-05, 'epoch': 0.32} + 32%|███▏ | 2768/8750 [36:39<9:30:45, 5.72s/it] {'loss': 0.4674, 'learning_rate': 1.6000249713525366e-05, 'epoch': 0.32} + 32%|███▏ | 2768/8750 [36:36<9:30:44, 5.72s/it] 32%|███▏ | 2769/8750 [36:42<9:37:34, 5.79s/it] 32%|███▏ | 2769/8750 [36:45<9:37:34, 5.79s/it] {'loss': 0.4563, 'learning_rate': 1.5997288049670924e-05, 'epoch': 0.32} + 32%|███▏ | 2769/8750 [36:45<9:37:34, 5.79s/it] {'loss': 0.4563, 'learning_rate': 1.5997288049670924e-05, 'epoch': 0.32} + 32%|███▏ | 2769/8750 [36:42<9:37:34, 5.79s/it] 32%|███▏ | 2770/8750 [36:48<9:37:34, 5.80s/it] 32%|███▏ | 2770/8750 [36:51<9:37:34, 5.80s/it] {'loss': 0.5064, 'learning_rate': 1.5994325564054122e-05, 'epoch': 0.32} + 32%|███▏ | 2770/8750 [36:51<9:37:34, 5.80s/it] {'loss': 0.5064, 'learning_rate': 1.5994325564054122e-05, 'epoch': 0.32} + 32%|███▏ | 2770/8750 [36:48<9:37:34, 5.80s/it] 32%|███▏ | 2771/8750 [36:54<9:38:52, 5.81s/it] 32%|███▏ | 2771/8750 [36:57<9:38:51, 5.81s/it] {'loss': 0.4761, 'learning_rate': 1.599136225708089e-05, 'epoch': 0.32} + 32%|███▏ | 2771/8750 [36:57<9:38:51, 5.81s/it] {'loss': 0.4761, 'learning_rate': 1.599136225708089e-05, 'epoch': 0.32} + 32%|███▏ | 2771/8750 [36:54<9:38:52, 5.81s/it] 32%|███▏ | 2772/8750 [37:00<9:46:12, 5.88s/it] 32%|███▏ | 2772/8750 [37:03<9:46:12, 5.88s/it] {'loss': 0.4571, 'learning_rate': 1.598839812915726e-05, 'epoch': 0.32} + 32%|███▏ | 2772/8750 [37:03<9:46:12, 5.88s/it] {'loss': 0.4571, 'learning_rate': 1.598839812915726e-05, 'epoch': 0.32} + 32%|███▏ | 2772/8750 [37:00<9:46:12, 5.88s/it] 32%|███▏ | 2773/8750 [37:05<9:38:57, 5.81s/it] 32%|███▏ | 2773/8750 [37:08<9:38:58, 5.81s/it] {'loss': 0.4861, 'learning_rate': 1.598543318068939e-05, 'epoch': 0.32} + 32%|███▏ | 2773/8750 [37:08<9:38:58, 5.81s/it] {'loss': 0.4861, 'learning_rate': 1.598543318068939e-05, 'epoch': 0.32} + 32%|███▏ | 2773/8750 
[37:05<9:38:57, 5.81s/it] 32%|███▏ | 2774/8750 [37:11<9:34:29, 5.77s/it] 32%|███▏ | 2774/8750 [37:14<9:34:30, 5.77s/it] {'loss': 0.4579, 'learning_rate': 1.5982467412083543e-05, 'epoch': 0.32} + 32%|███▏ | 2774/8750 [37:14<9:34:30, 5.77s/it] {'loss': 0.4579, 'learning_rate': 1.5982467412083543e-05, 'epoch': 0.32} + 32%|███▏ | 2774/8750 [37:11<9:34:29, 5.77s/it] 32%|███▏ | 2775/8750 [37:17<9:36:09, 5.79s/it] 32%|███▏ | 2775/8750 [37:20<9:36:08, 5.79s/it] {'loss': 0.4937, 'learning_rate': 1.5979500823746096e-05, 'epoch': 0.32} + 32%|███▏ | 2775/8750 [37:20<9:36:08, 5.79s/it] {'loss': 0.4937, 'learning_rate': 1.5979500823746096e-05, 'epoch': 0.32} + 32%|███▏ | 2775/8750 [37:17<9:36:09, 5.79s/it] 32%|███▏ | 2776/8750 [37:26<9:35:46, 5.78s/it] 32%|███▏ | 2776/8750 [37:23<9:35:47, 5.78s/it] {'loss': 0.4646, 'learning_rate': 1.5976533416083535e-05, 'epoch': 0.32} + 32%|███▏ | 2776/8750 [37:26<9:35:46, 5.78s/it] {'loss': 0.4646, 'learning_rate': 1.5976533416083535e-05, 'epoch': 0.32} + 32%|███▏ | 2776/8750 [37:23<9:35:47, 5.78s/it] 32%|███▏ | 2777/8750 [37:28<9:34:12, 5.77s/it] 32%|███▏ | 2777/8750 [37:31<9:34:12, 5.77s/it] {'loss': 0.4636, 'learning_rate': 1.5973565189502463e-05, 'epoch': 0.32} + 32%|███▏ | 2777/8750 [37:31<9:34:12, 5.77s/it] {'loss': 0.4636, 'learning_rate': 1.5973565189502463e-05, 'epoch': 0.32} + 32%|███▏ | 2777/8750 [37:28<9:34:12, 5.77s/it] 32%|███▏ | 2778/8750 [37:37<9:38:39, 5.81s/it] 32%|███▏ | 2778/8750 [37:34<9:38:39, 5.81s/it] {'loss': 0.4741, 'learning_rate': 1.5970596144409595e-05, 'epoch': 0.32} + 32%|███▏ | 2778/8750 [37:37<9:38:39, 5.81s/it] {'loss': 0.4741, 'learning_rate': 1.5970596144409595e-05, 'epoch': 0.32} + 32%|███▏ | 2778/8750 [37:34<9:38:39, 5.81s/it] 32%|███▏ | 2779/8750 [37:40<9:38:51, 5.82s/it] 32%|███▏ | 2779/8750 [37:43<9:38:51, 5.82s/it] {'loss': 0.4716, 'learning_rate': 1.5967626281211754e-05, 'epoch': 0.32} + 32%|███▏ | 2779/8750 [37:43<9:38:51, 5.82s/it] {'loss': 0.4716, 'learning_rate': 1.5967626281211754e-05, 'epoch': 
0.32} + 32%|███▏ | 2779/8750 [37:40<9:38:51, 5.82s/it] 32%|███▏ | 2780/8750 [37:49<9:38:25, 5.81s/it] 32%|███▏ | 2780/8750 [37:46<9:38:25, 5.81s/it] {'loss': 0.4568, 'learning_rate': 1.596465560031588e-05, 'epoch': 0.32} + 32%|███▏ | 2780/8750 [37:49<9:38:25, 5.81s/it] {'loss': 0.4568, 'learning_rate': 1.596465560031588e-05, 'epoch': 0.32} + 32%|███▏ | 2780/8750 [37:46<9:38:25, 5.81s/it] 32%|███▏ | 2781/8750 [37:51<9:31:51, 5.75s/it] 32%|███▏ | 2781/8750 [37:54<9:31:51, 5.75s/it] {'loss': 0.4772, 'learning_rate': 1.5961684102129015e-05, 'epoch': 0.32} + 32%|███▏ | 2781/8750 [37:54<9:31:51, 5.75s/it] {'loss': 0.4772, 'learning_rate': 1.5961684102129015e-05, 'epoch': 0.32} + 32%|███▏ | 2781/8750 [37:51<9:31:51, 5.75s/it] 32%|███▏ | 2782/8750 [37:57<9:35:09, 5.78s/it] 32%|███▏ | 2782/8750 [38:00<9:35:09, 5.78s/it] {'loss': 0.4642, 'learning_rate': 1.5958711787058332e-05, 'epoch': 0.32} + 32%|███▏ | 2782/8750 [38:00<9:35:09, 5.78s/it] {'loss': 0.4642, 'learning_rate': 1.5958711787058332e-05, 'epoch': 0.32} + 32%|███▏ | 2782/8750 [37:57<9:35:09, 5.78s/it] 32%|███▏ | 2783/8750 [38:03<9:29:18, 5.72s/it] 32%|███▏ | 2783/8750 [38:06<9:29:18, 5.72s/it] {'loss': 0.4938, 'learning_rate': 1.5955738655511094e-05, 'epoch': 0.32} + 32%|███▏ | 2783/8750 [38:06<9:29:18, 5.72s/it] {'loss': 0.4938, 'learning_rate': 1.5955738655511094e-05, 'epoch': 0.32} + 32%|███▏ | 2783/8750 [38:03<9:29:18, 5.72s/it] 32%|███▏ | 2784/8750 [38:09<9:31:33, 5.75s/it] 32%|███▏ | 2784/8750 [38:12<9:31:33, 5.75s/it] {'loss': 0.4634, 'learning_rate': 1.5952764707894696e-05, 'epoch': 0.32} + 32%|███▏ | 2784/8750 [38:12<9:31:33, 5.75s/it] {'loss': 0.4634, 'learning_rate': 1.5952764707894696e-05, 'epoch': 0.32} + 32%|███▏ | 2784/8750 [38:09<9:31:33, 5.75s/it] 32%|███▏ | 2785/8750 [38:14<9:28:32, 5.72s/it] 32%|███▏ | 2785/8750 [38:17<9:28:32, 5.72s/it] {'loss': 0.4767, 'learning_rate': 1.594978994461663e-05, 'epoch': 0.32} + 32%|███▏ | 2785/8750 [38:17<9:28:32, 5.72s/it] {'loss': 0.4767, 'learning_rate': 
1.594978994461663e-05, 'epoch': 0.32} + 32%|███▏ | 2785/8750 [38:14<9:28:32, 5.72s/it] 32%|███▏ | 2786/8750 [38:20<9:27:27, 5.71s/it] 32%|███▏ | 2786/8750 [38:23<9:27:27, 5.71s/it] {'loss': 0.4734, 'learning_rate': 1.5946814366084505e-05, 'epoch': 0.32} + 32%|███▏ | 2786/8750 [38:23<9:27:27, 5.71s/it] {'loss': 0.4734, 'learning_rate': 1.5946814366084505e-05, 'epoch': 0.32} + 32%|███▏ | 2786/8750 [38:20<9:27:27, 5.71s/it] 32%|███▏ | 2787/8750 [38:26<9:25:34, 5.69s/it] 32%|███▏ | 2787/8750 [38:29<9:25:34, 5.69s/it] {'loss': 0.4918, 'learning_rate': 1.594383797270604e-05, 'epoch': 0.32} + 32%|███▏ | 2787/8750 [38:29<9:25:34, 5.69s/it] {'loss': 0.4918, 'learning_rate': 1.594383797270604e-05, 'epoch': 0.32} + 32%|███▏ | 2787/8750 [38:26<9:25:34, 5.69s/it] 32%|███▏ | 2788/8750 [38:32<9:33:52, 5.78s/it] 32%|███▏ | 2788/8750 [38:35<9:33:52, 5.78s/it] {'loss': 0.4859, 'learning_rate': 1.5940860764889073e-05, 'epoch': 0.32} + 32%|███▏ | 2788/8750 [38:35<9:33:52, 5.78s/it] {'loss': 0.4859, 'learning_rate': 1.5940860764889073e-05, 'epoch': 0.32} + 32%|███▏ | 2788/8750 [38:32<9:33:52, 5.78s/it] 32%|███▏ | 2789/8750 [38:38<9:40:33, 5.84s/it] 32%|███▏ | 2789/8750 [38:41<9:40:33, 5.84s/it] {'loss': 0.462, 'learning_rate': 1.5937882743041543e-05, 'epoch': 0.32} + 32%|███▏ | 2789/8750 [38:41<9:40:33, 5.84s/it] {'loss': 0.462, 'learning_rate': 1.5937882743041543e-05, 'epoch': 0.32} + 32%|███▏ | 2789/8750 [38:38<9:40:33, 5.84s/it] 32%|███▏ | 2790/8750 [38:43<9:37:54, 5.82s/it] 32%|███▏ | 2790/8750 [38:46<9:37:54, 5.82s/it] {'loss': 0.4686, 'learning_rate': 1.5934903907571507e-05, 'epoch': 0.32} + 32%|███▏ | 2790/8750 [38:46<9:37:54, 5.82s/it] {'loss': 0.4686, 'learning_rate': 1.5934903907571507e-05, 'epoch': 0.32} + 32%|███▏ | 2790/8750 [38:43<9:37:54, 5.82s/it] 32%|███▏ | 2791/8750 [38:49<9:38:20, 5.82s/it] 32%|███▏ | 2791/8750 [38:52<9:38:20, 5.82s/it] {'loss': 0.4752, 'learning_rate': 1.593192425888713e-05, 'epoch': 0.32} + 32%|███▏ | 2791/8750 [38:52<9:38:20, 5.82s/it] {'loss': 
0.4752, 'learning_rate': 1.593192425888713e-05, 'epoch': 0.32} + 32%|███▏ | 2791/8750 [38:49<9:38:20, 5.82s/it] 32%|███▏ | 2792/8750 [38:55<9:33:52, 5.78s/it] 32%|███▏ | 2792/8750 [38:58<9:33:53, 5.78s/it] {'loss': 0.4988, 'learning_rate': 1.5928943797396695e-05, 'epoch': 0.32} + 32%|███▏ | 2792/8750 [38:58<9:33:53, 5.78s/it] {'loss': 0.4988, 'learning_rate': 1.5928943797396695e-05, 'epoch': 0.32} + 32%|███▏ | 2792/8750 [38:55<9:33:52, 5.78s/it] 32%|███▏ | 2793/8750 [39:01<9:45:28, 5.90s/it] 32%|███▏ | 2793/8750 [39:04<9:45:28, 5.90s/it] {'loss': 0.4771, 'learning_rate': 1.592596252350859e-05, 'epoch': 0.32} + 32%|███▏ | 2793/8750 [39:04<9:45:28, 5.90s/it] {'loss': 0.4771, 'learning_rate': 1.592596252350859e-05, 'epoch': 0.32} + 32%|███▏ | 2793/8750 [39:01<9:45:28, 5.90s/it] 32%|███▏ | 2794/8750 [39:07<9:39:02, 5.83s/it] 32%|███▏ | 2794/8750 [39:10<9:39:02, 5.83s/it] {'loss': 0.4763, 'learning_rate': 1.5922980437631314e-05, 'epoch': 0.32} + 32%|███▏ | 2794/8750 [39:10<9:39:02, 5.83s/it] {'loss': 0.4763, 'learning_rate': 1.5922980437631314e-05, 'epoch': 0.32} + 32%|███▏ | 2794/8750 [39:07<9:39:02, 5.83s/it] 32%|███▏ | 2795/8750 [39:13<9:39:41, 5.84s/it] 32%|███▏ | 2795/8750 [39:16<9:39:45, 5.84s/it] {'loss': 0.4701, 'learning_rate': 1.591999754017348e-05, 'epoch': 0.32} + 32%|███▏ | 2795/8750 [39:16<9:39:45, 5.84s/it] {'loss': 0.4701, 'learning_rate': 1.591999754017348e-05, 'epoch': 0.32} + 32%|███▏ | 2795/8750 [39:13<9:39:41, 5.84s/it] 32%|███▏ | 2796/8750 [39:18<9:37:26, 5.82s/it] 32%|███▏ | 2796/8750 [39:21<9:37:26, 5.82s/it] {'loss': 0.4927, 'learning_rate': 1.5917013831543814e-05, 'epoch': 0.32} + 32%|███▏ | 2796/8750 [39:21<9:37:26, 5.82s/it] {'loss': 0.4927, 'learning_rate': 1.5917013831543814e-05, 'epoch': 0.32} + 32%|███▏ | 2796/8750 [39:18<9:37:26, 5.82s/it] 32%|███▏ | 2797/8750 [39:25<9:48:14, 5.93s/it] 32%|███▏ | 2797/8750 [39:28<9:48:13, 5.93s/it] {'loss': 0.4832, 'learning_rate': 1.5914029312151146e-05, 'epoch': 0.32} + 32%|███▏ | 2797/8750 
[39:28<9:48:13, 5.93s/it] {'loss': 0.4832, 'learning_rate': 1.5914029312151146e-05, 'epoch': 0.32} + 32%|███▏ | 2797/8750 [39:25<9:48:14, 5.93s/it] 32%|███▏ | 2798/8750 [39:33<9:46:09, 5.91s/it] 32%|███▏ | 2798/8750 [39:31<9:46:10, 5.91s/it] {'loss': 0.475, 'learning_rate': 1.5911043982404426e-05, 'epoch': 0.32} + 32%|███▏ | 2798/8750 [39:33<9:46:09, 5.91s/it] {'loss': 0.475, 'learning_rate': 1.5911043982404426e-05, 'epoch': 0.32} + 32%|███▏ | 2798/8750 [39:31<9:46:10, 5.91s/it] 32%|███▏ | 2799/8750 [39:36<9:47:29, 5.92s/it] 32%|███▏ | 2799/8750 [39:39<9:47:28, 5.92s/it] {'loss': 0.4782, 'learning_rate': 1.590805784271271e-05, 'epoch': 0.32} + 32%|███▏ | 2799/8750 [39:39<9:47:28, 5.92s/it] {'loss': 0.4782, 'learning_rate': 1.590805784271271e-05, 'epoch': 0.32} + 32%|███▏ | 2799/8750 [39:36<9:47:29, 5.92s/it]14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +358 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + +4 AutoResumeHook: Checking whether to suspend... +102 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +0 32%|███▏ | 2800/8750 [39:45<9:39:27, 5.84s/it]AutoResumeHook: Checking whether to suspend... + 32%|███▏ | 2800/8750 [39:42<9:39:28, 5.84s/it]1 AutoResumeHook: Checking whether to suspend... +12 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4783, 'learning_rate': 1.5905070893485165e-05, 'epoch': 0.32} + 32%|███▏ | 2800/8750 [39:45<9:39:27, 5.84s/it] {'loss': 0.4783, 'learning_rate': 1.5905070893485165e-05, 'epoch': 0.32} + 32%|███▏ | 2800/8750 [39:42<9:39:28, 5.84s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 32%|███▏ | 2801/8750 [40:06<18:48:10, 11.38s/it] 32%|███▏ | 2801/8750 [40:09<18:48:10, 11.38s/it] {'loss': 0.514, 'learning_rate': 1.5902083135131067e-05, 'epoch': 0.32} + 32%|███▏ | 2801/8750 [40:09<18:48:10, 11.38s/it] {'loss': 0.514, 'learning_rate': 1.5902083135131067e-05, 'epoch': 0.32} + 32%|███▏ | 2801/8750 [40:06<18:48:10, 11.38s/it] 32%|███▏ | 2802/8750 [40:12<15:59:16, 9.68s/it] 32%|███▏ | 2802/8750 [40:15<15:59:16, 9.68s/it] {'loss': 0.4526, 'learning_rate': 1.5899094568059812e-05, 'epoch': 0.32} + 32%|███▏ | 2802/8750 [40:15<15:59:16, 9.68s/it] {'loss': 0.4526, 'learning_rate': 1.5899094568059812e-05, 'epoch': 0.32} + 32%|███▏ | 2802/8750 [40:12<15:59:16, 9.68s/it] 32%|███▏ | 2803/8750 [40:18<14:02:20, 8.50s/it] 32%|███▏ | 2803/8750 [40:21<14:02:20, 8.50s/it] {'loss': 0.4837, 'learning_rate': 1.58961051926809e-05, 'epoch': 0.32} + 32%|███▏ | 2803/8750 [40:21<14:02:20, 8.50s/it] {'loss': 0.4837, 'learning_rate': 1.58961051926809e-05, 'epoch': 0.32} + 32%|███▏ | 2803/8750 [40:18<14:02:20, 8.50s/it] 32%|███▏ | 2804/8750 [40:24<12:38:43, 7.66s/it] 32%|███▏ | 2804/8750 [40:26<12:38:44, 7.66s/it] {'loss': 0.4742, 'learning_rate': 1.5893115009403932e-05, 'epoch': 0.32} + 32%|███▏ | 2804/8750 [40:26<12:38:44, 7.66s/it] {'loss': 0.4742, 'learning_rate': 1.5893115009403932e-05, 'epoch': 0.32} + 32%|███▏ | 2804/8750 [40:24<12:38:43, 7.66s/it] 32%|███▏ | 2805/8750 [40:30<11:50:05, 7.17s/it] 32%|███▏ | 2805/8750 [40:32<11:50:05, 7.17s/it] {'loss': 0.4731, 'learning_rate': 1.589012401863864e-05, 'epoch': 0.32} + 32%|███▏ | 2805/8750 [40:33<11:50:05, 7.17s/it] {'loss': 0.4731, 'learning_rate': 1.589012401863864e-05, 'epoch': 0.32} + 32%|███▏ | 2805/8750 [40:30<11:50:05, 7.17s/it] 32%|███▏ | 2806/8750 [40:35<11:09:56, 6.76s/it] 32%|███▏ | 2806/8750 [40:38<11:09:56, 6.76s/it] {'loss': 0.4697, 'learning_rate': 1.5887132220794855e-05, 'epoch': 0.32} + 32%|███▏ | 2806/8750 [40:38<11:09:56, 6.76s/it] {'loss': 0.4697, 'learning_rate': 
1.5887132220794855e-05, 'epoch': 0.32} + 32%|███▏ | 2806/8750 [40:35<11:09:56, 6.76s/it] 32%|███▏ | 2807/8750 [40:41<10:39:52, 6.46s/it] 32%|███▏ | 2807/8750 [40:44<10:39:52, 6.46s/it] {'loss': 0.4692, 'learning_rate': 1.5884139616282517e-05, 'epoch': 0.32} + 32%|███▏ | 2807/8750 [40:44<10:39:52, 6.46s/it] {'loss': 0.4692, 'learning_rate': 1.5884139616282517e-05, 'epoch': 0.32} + 32%|███▏ | 2807/8750 [40:41<10:39:52, 6.46s/it] 32%|███▏ | 2808/8750 [40:47<10:17:01, 6.23s/it] 32%|███▏ | 2808/8750 [40:50<10:17:01, 6.23s/it] {'loss': 0.4793, 'learning_rate': 1.5881146205511683e-05, 'epoch': 0.32} + 32%|███▏ | 2808/8750 [40:50<10:17:01, 6.23s/it] {'loss': 0.4793, 'learning_rate': 1.5881146205511683e-05, 'epoch': 0.32} + 32%|███▏ | 2808/8750 [40:47<10:17:01, 6.23s/it] 32%|███▏ | 2809/8750 [40:52<9:58:52, 6.05s/it] 32%|███▏ | 2809/8750 [40:55<9:58:52, 6.05s/it] {'loss': 0.4826, 'learning_rate': 1.5878151988892513e-05, 'epoch': 0.32} + 32%|███▏ | 2809/8750 [40:55<9:58:52, 6.05s/it] {'loss': 0.4826, 'learning_rate': 1.5878151988892513e-05, 'epoch': 0.32} + 32%|███▏ | 2809/8750 [40:52<9:58:52, 6.05s/it] 32%|███▏ | 2810/8750 [40:58<9:48:21, 5.94s/it] 32%|███▏ | 2810/8750 [41:01<9:48:21, 5.94s/it] {'loss': 0.4683, 'learning_rate': 1.5875156966835285e-05, 'epoch': 0.32} + 32%|███▏ | 2810/8750 [41:01<9:48:21, 5.94s/it] {'loss': 0.4683, 'learning_rate': 1.5875156966835285e-05, 'epoch': 0.32} + 32%|███▏ | 2810/8750 [40:58<9:48:21, 5.94s/it] 32%|███▏ | 2811/8750 [41:04<9:44:02, 5.90s/it] 32%|███▏ | 2811/8750 [41:07<9:44:02, 5.90s/it] {'loss': 0.4728, 'learning_rate': 1.5872161139750384e-05, 'epoch': 0.32} + 32%|███▏ | 2811/8750 [41:07<9:44:02, 5.90s/it] {'loss': 0.4728, 'learning_rate': 1.5872161139750384e-05, 'epoch': 0.32} + 32%|███▏ | 2811/8750 [41:04<9:44:02, 5.90s/it] 32%|███▏ | 2812/8750 [41:10<9:35:54, 5.82s/it] 32%|███▏ | 2812/8750 [41:13<9:35:54, 5.82s/it] {'loss': 0.4519, 'learning_rate': 1.5869164508048304e-05, 'epoch': 0.32} + 32%|███▏ | 2812/8750 [41:13<9:35:54, 
5.82s/it] {'loss': 0.4519, 'learning_rate': 1.5869164508048304e-05, 'epoch': 0.32} + 32%|███▏ | 2812/8750 [41:10<9:35:54, 5.82s/it] 32%|███▏ | 2813/8750 [41:15<9:36:14, 5.82s/it] 32%|███▏ | 2813/8750 [41:18<9:36:14, 5.82s/it] {'loss': 0.4566, 'learning_rate': 1.5866167072139645e-05, 'epoch': 0.32} + 32%|███▏ | 2813/8750 [41:18<9:36:14, 5.82s/it] {'loss': 0.4566, 'learning_rate': 1.5866167072139645e-05, 'epoch': 0.32} + 32%|███▏ | 2813/8750 [41:15<9:36:14, 5.82s/it] 32%|███▏ | 2814/8750 [41:21<9:34:40, 5.81s/it] 32%|███▏ | 2814/8750 [41:24<9:34:40, 5.81s/it] {'loss': 0.4898, 'learning_rate': 1.5863168832435137e-05, 'epoch': 0.32} + 32%|███▏ | 2814/8750 [41:24<9:34:40, 5.81s/it] {'loss': 0.4898, 'learning_rate': 1.5863168832435137e-05, 'epoch': 0.32} + 32%|███▏ | 2814/8750 [41:21<9:34:40, 5.81s/it] 32%|███▏ | 2815/8750 [41:27<9:28:46, 5.75s/it] 32%|███▏ | 2815/8750 [41:30<9:28:46, 5.75s/it] {'loss': 0.472, 'learning_rate': 1.5860169789345592e-05, 'epoch': 0.32} + 32%|███▏ | 2815/8750 [41:30<9:28:46, 5.75s/it] {'loss': 0.472, 'learning_rate': 1.5860169789345592e-05, 'epoch': 0.32} + 32%|███▏ | 2815/8750 [41:27<9:28:46, 5.75s/it] 32%|███▏ | 2816/8750 [41:32<9:24:42, 5.71s/it] 32%|███▏ | 2816/8750 [41:35<9:24:42, 5.71s/it] {'loss': 0.4816, 'learning_rate': 1.5857169943281948e-05, 'epoch': 0.32} + 32%|███▏ | 2816/8750 [41:35<9:24:42, 5.71s/it] {'loss': 0.4816, 'learning_rate': 1.5857169943281948e-05, 'epoch': 0.32} + 32%|███▏ | 2816/8750 [41:32<9:24:42, 5.71s/it] 32%|███▏ | 2817/8750 [41:38<9:22:28, 5.69s/it] 32%|███▏ | 2817/8750 [41:41<9:22:28, 5.69s/it] {'loss': 0.4718, 'learning_rate': 1.585416929465526e-05, 'epoch': 0.32} + 32%|███▏ | 2817/8750 [41:41<9:22:28, 5.69s/it] {'loss': 0.4718, 'learning_rate': 1.585416929465526e-05, 'epoch': 0.32} + 32%|███▏ | 2817/8750 [41:38<9:22:28, 5.69s/it] 32%|███▏ | 2818/8750 [41:44<9:28:40, 5.75s/it] 32%|███▏ | 2818/8750 [41:47<9:28:39, 5.75s/it] {'loss': 0.4443, 'learning_rate': 1.585116784387667e-05, 'epoch': 0.32} + 32%|███▏ | 
2818/8750 [41:47<9:28:39, 5.75s/it] {'loss': 0.4443, 'learning_rate': 1.585116784387667e-05, 'epoch': 0.32} + 32%|███▏ | 2818/8750 [41:44<9:28:40, 5.75s/it] 32%|███▏ | 2819/8750 [41:50<9:31:47, 5.78s/it] 32%|███▏ | 2819/8750 [41:53<9:31:46, 5.78s/it] {'loss': 0.4654, 'learning_rate': 1.5848165591357458e-05, 'epoch': 0.32} + 32%|███▏ | 2819/8750 [41:53<9:31:46, 5.78s/it] {'loss': 0.4654, 'learning_rate': 1.5848165591357458e-05, 'epoch': 0.32} + 32%|███▏ | 2819/8750 [41:50<9:31:47, 5.78s/it] 32%|███▏ | 2820/8750 [41:56<9:33:52, 5.81s/it] 32%|███▏ | 2820/8750 [41:59<9:33:53, 5.81s/it] {'loss': 0.4793, 'learning_rate': 1.584516253750899e-05, 'epoch': 0.32} + 32%|███▏ | 2820/8750 [41:59<9:33:53, 5.81s/it] {'loss': 0.4793, 'learning_rate': 1.584516253750899e-05, 'epoch': 0.32} + 32%|███▏ | 2820/8750 [41:56<9:33:52, 5.81s/it] 32%|███▏ | 2821/8750 [42:02<9:37:47, 5.85s/it] 32%|███▏ | 2821/8750 [42:05<9:37:47, 5.85s/it] {'loss': 0.4703, 'learning_rate': 1.5842158682742756e-05, 'epoch': 0.32} + 32%|███▏ | 2821/8750 [42:05<9:37:47, 5.85s/it] {'loss': 0.4703, 'learning_rate': 1.5842158682742756e-05, 'epoch': 0.32} + 32%|███▏ | 2821/8750 [42:02<9:37:47, 5.85s/it] 32%|███▏ | 2822/8750 [42:08<9:44:21, 5.91s/it] 32%|███▏ | 2822/8750 [42:11<9:44:21, 5.91s/it] {'loss': 0.4631, 'learning_rate': 1.5839154027470346e-05, 'epoch': 0.32} + 32%|███▏ | 2822/8750 [42:11<9:44:21, 5.91s/it] {'loss': 0.4631, 'learning_rate': 1.5839154027470346e-05, 'epoch': 0.32} + 32%|███▏ | 2822/8750 [42:08<9:44:21, 5.91s/it] 32%|███▏ | 2823/8750 [42:14<9:48:34, 5.96s/it] 32%|███▏ | 2823/8750 [42:17<9:48:34, 5.96s/it] {'loss': 0.4858, 'learning_rate': 1.583614857210347e-05, 'epoch': 0.32} + 32%|███▏ | 2823/8750 [42:17<9:48:34, 5.96s/it] {'loss': 0.4858, 'learning_rate': 1.583614857210347e-05, 'epoch': 0.32} + 32%|███▏ | 2823/8750 [42:14<9:48:34, 5.96s/it] 32%|███▏ | 2824/8750 [42:20<9:43:23, 5.91s/it] 32%|███▏ | 2824/8750 [42:22<9:43:23, 5.91s/it] {'loss': 0.4655, 'learning_rate': 1.5833142317053943e-05, 
'epoch': 0.32} + 32%|███▏ | 2824/8750 [42:22<9:43:23, 5.91s/it] {'loss': 0.4655, 'learning_rate': 1.5833142317053943e-05, 'epoch': 0.32} + 32%|███▏ | 2824/8750 [42:20<9:43:23, 5.91s/it] 32%|███▏ | 2825/8750 [42:25<9:34:35, 5.82s/it] 32%|███▏ | 2825/8750 [42:28<9:34:34, 5.82s/it] {'loss': 0.4708, 'learning_rate': 1.5830135262733684e-05, 'epoch': 0.32} + 32%|███▏ | 2825/8750 [42:28<9:34:34, 5.82s/it] {'loss': 0.4708, 'learning_rate': 1.5830135262733684e-05, 'epoch': 0.32} + 32%|███▏ | 2825/8750 [42:25<9:34:35, 5.82s/it] 32%|███▏ | 2826/8750 [42:31<9:41:43, 5.89s/it] 32%|███▏ | 2826/8750 [42:34<9:41:43, 5.89s/it] {'loss': 0.477, 'learning_rate': 1.582712740955473e-05, 'epoch': 0.32} + 32%|███▏ | 2826/8750 [42:34<9:41:43, 5.89s/it] {'loss': 0.477, 'learning_rate': 1.582712740955473e-05, 'epoch': 0.32} + 32%|███▏ | 2826/8750 [42:31<9:41:43, 5.89s/it] 32%|███▏ | 2827/8750 [42:37<9:35:57, 5.83s/it] 32%|███▏ | 2827/8750 [42:40<9:35:57, 5.83s/it] {'loss': 0.4869, 'learning_rate': 1.5824118757929224e-05, 'epoch': 0.32} + 32%|███▏ | 2827/8750 [42:40<9:35:57, 5.83s/it] {'loss': 0.4869, 'learning_rate': 1.5824118757929224e-05, 'epoch': 0.32} + 32%|███▏ | 2827/8750 [42:37<9:35:57, 5.83s/it] 32%|███▏ | 2828/8750 [42:43<9:47:55, 5.96s/it] 32%|███▏ | 2828/8750 [42:46<9:47:56, 5.96s/it] {'loss': 0.4708, 'learning_rate': 1.5821109308269416e-05, 'epoch': 0.32} + 32%|███▏ | 2828/8750 [42:46<9:47:56, 5.96s/it] {'loss': 0.4708, 'learning_rate': 1.5821109308269416e-05, 'epoch': 0.32} + 32%|███▏ | 2828/8750 [42:43<9:47:55, 5.96s/it] 32%|███▏ | 2829/8750 [42:49<9:45:23, 5.93s/it] 32%|███▏ | 2829/8750 [42:52<9:45:22, 5.93s/it] {'loss': 0.4834, 'learning_rate': 1.581809906098767e-05, 'epoch': 0.32} + 32%|███▏ | 2829/8750 [42:52<9:45:22, 5.93s/it] {'loss': 0.4834, 'learning_rate': 1.581809906098767e-05, 'epoch': 0.32} + 32%|███▏ | 2829/8750 [42:49<9:45:23, 5.93s/it] 32%|███▏ | 2830/8750 [42:55<9:46:28, 5.94s/it] 32%|███▏ | 2830/8750 [42:58<9:46:28, 5.94s/it] {'loss': 0.4669, 'learning_rate': 
1.581508801649646e-05, 'epoch': 0.32} + 32%|███▏ | 2830/8750 [42:58<9:46:28, 5.94s/it] {'loss': 0.4669, 'learning_rate': 1.581508801649646e-05, 'epoch': 0.32} + 32%|███▏ | 2830/8750 [42:55<9:46:28, 5.94s/it] 32%|███▏ | 2831/8750 [43:01<9:38:24, 5.86s/it] 32%|███▏ | 2831/8750 [43:04<9:38:24, 5.86s/it] {'loss': 0.4761, 'learning_rate': 1.581207617520836e-05, 'epoch': 0.32} + 32%|███▏ | 2831/8750 [43:04<9:38:24, 5.86s/it] {'loss': 0.4761, 'learning_rate': 1.581207617520836e-05, 'epoch': 0.32} + 32%|███▏ | 2831/8750 [43:01<9:38:24, 5.86s/it] 32%|███▏ | 2832/8750 [43:06<9:35:49, 5.84s/it] 32%|███▏ | 2832/8750 [43:09<9:35:49, 5.84s/it] {'loss': 0.4765, 'learning_rate': 1.5809063537536066e-05, 'epoch': 0.32} + 32%|███▏ | 2832/8750 [43:09<9:35:49, 5.84s/it] {'loss': 0.4765, 'learning_rate': 1.5809063537536066e-05, 'epoch': 0.32} + 32%|███▏ | 2832/8750 [43:06<9:35:49, 5.84s/it] 32%|███▏ | 2833/8750 [43:12<9:29:57, 5.78s/it] 32%|███▏ | 2833/8750 [43:15<9:29:58, 5.78s/it] {'loss': 0.4775, 'learning_rate': 1.580605010389237e-05, 'epoch': 0.32} + 32%|███▏ | 2833/8750 [43:15<9:29:58, 5.78s/it] {'loss': 0.4775, 'learning_rate': 1.580605010389237e-05, 'epoch': 0.32} + 32%|███▏ | 2833/8750 [43:12<9:29:57, 5.78s/it] 32%|███▏ | 2834/8750 [43:18<9:29:16, 5.77s/it] 32%|███▏ | 2834/8750 [43:21<9:29:15, 5.77s/it] {'loss': 0.4697, 'learning_rate': 1.5803035874690186e-05, 'epoch': 0.32} + 32%|███▏ | 2834/8750 [43:21<9:29:15, 5.77s/it] {'loss': 0.4697, 'learning_rate': 1.5803035874690186e-05, 'epoch': 0.32} + 32%|███▏ | 2834/8750 [43:18<9:29:16, 5.77s/it] 32%|███▏ | 2835/8750 [43:24<9:33:27, 5.82s/it] 32%|███▏ | 2835/8750 [43:27<9:33:27, 5.82s/it] {'loss': 0.4574, 'learning_rate': 1.5800020850342524e-05, 'epoch': 0.32} + 32%|███▏ | 2835/8750 [43:27<9:33:27, 5.82s/it] {'loss': 0.4574, 'learning_rate': 1.5800020850342524e-05, 'epoch': 0.32} + 32%|███▏ | 2835/8750 [43:24<9:33:27, 5.82s/it] 32%|███▏ | 2836/8750 [43:30<9:30:57, 5.79s/it] 32%|███▏ | 2836/8750 [43:32<9:30:58, 5.79s/it] {'loss': 
0.4732, 'learning_rate': 1.5797005031262514e-05, 'epoch': 0.32} + 32%|███▏ | 2836/8750 [43:32<9:30:58, 5.79s/it] {'loss': 0.4732, 'learning_rate': 1.5797005031262514e-05, 'epoch': 0.32} + 32%|███▏ | 2836/8750 [43:30<9:30:57, 5.79s/it] 32%|███▏ | 2837/8750 [43:35<9:26:49, 5.75s/it] 32%|███▏ | 2837/8750 [43:38<9:26:49, 5.75s/it] {'loss': 0.492, 'learning_rate': 1.579398841786339e-05, 'epoch': 0.32} + 32%|███▏ | 2837/8750 [43:38<9:26:49, 5.75s/it] {'loss': 0.492, 'learning_rate': 1.579398841786339e-05, 'epoch': 0.32} + 32%|███▏ | 2837/8750 [43:35<9:26:49, 5.75s/it] 32%|███▏ | 2838/8750 [43:41<9:25:39, 5.74s/it] 32%|███▏ | 2838/8750 [43:44<9:25:39, 5.74s/it] {'loss': 0.4496, 'learning_rate': 1.57909710105585e-05, 'epoch': 0.32} + 32%|███▏ | 2838/8750 [43:44<9:25:39, 5.74s/it] {'loss': 0.4496, 'learning_rate': 1.57909710105585e-05, 'epoch': 0.32} + 32%|███▏ | 2838/8750 [43:41<9:25:39, 5.74s/it] 32%|███▏ | 2839/8750 [43:47<9:30:24, 5.79s/it] 32%|███▏ | 2839/8750 [43:50<9:30:24, 5.79s/it] {'loss': 0.478, 'learning_rate': 1.5787952809761286e-05, 'epoch': 0.32} + 32%|███▏ | 2839/8750 [43:50<9:30:24, 5.79s/it] {'loss': 0.478, 'learning_rate': 1.5787952809761286e-05, 'epoch': 0.32} + 32%|███▏ | 2839/8750 [43:47<9:30:24, 5.79s/it] 32%|███▏ | 2840/8750 [43:52<9:27:18, 5.76s/it] 32%|███▏ | 2840/8750 [43:55<9:27:18, 5.76s/it] {'loss': 0.4775, 'learning_rate': 1.5784933815885315e-05, 'epoch': 0.32} + 32%|███▏ | 2840/8750 [43:55<9:27:18, 5.76s/it] {'loss': 0.4775, 'learning_rate': 1.5784933815885315e-05, 'epoch': 0.32} + 32%|███▏ | 2840/8750 [43:52<9:27:18, 5.76s/it] 32%|███▏ | 2841/8750 [43:58<9:26:11, 5.75s/it] 32%|███▏ | 2841/8750 [44:01<9:26:11, 5.75s/it] {'loss': 0.4648, 'learning_rate': 1.5781914029344254e-05, 'epoch': 0.32} + 32%|███▏ | 2841/8750 [44:01<9:26:11, 5.75s/it] {'loss': 0.4648, 'learning_rate': 1.5781914029344254e-05, 'epoch': 0.32} + 32%|███▏ | 2841/8750 [43:58<9:26:11, 5.75s/it] 32%|███▏ | 2842/8750 [44:04<9:32:29, 5.81s/it] 32%|███▏ | 2842/8750 [44:07<9:32:29, 
5.81s/it] {'loss': 0.4642, 'learning_rate': 1.5778893450551888e-05, 'epoch': 0.32} + 32%|███▏ | 2842/8750 [44:07<9:32:29, 5.81s/it] {'loss': 0.4642, 'learning_rate': 1.5778893450551888e-05, 'epoch': 0.32} + 32%|███▏ | 2842/8750 [44:04<9:32:29, 5.81s/it] 32%|███▏ | 2843/8750 [44:10<9:26:56, 5.76s/it] 32%|███▏ | 2843/8750 [44:13<9:26:56, 5.76s/it] {'loss': 0.4863, 'learning_rate': 1.5775872079922098e-05, 'epoch': 0.32} + 32%|███▏ | 2843/8750 [44:13<9:26:56, 5.76s/it] {'loss': 0.4863, 'learning_rate': 1.5775872079922098e-05, 'epoch': 0.32} + 32%|███▏ | 2843/8750 [44:10<9:26:56, 5.76s/it] 33%|███▎ | 2844/8750 [44:16<9:27:09, 5.76s/it] 33%|███▎ | 2844/8750 [44:19<9:27:09, 5.76s/it] {'loss': 0.4598, 'learning_rate': 1.5772849917868876e-05, 'epoch': 0.33} + 33%|███▎ | 2844/8750 [44:19<9:27:09, 5.76s/it] {'loss': 0.4598, 'learning_rate': 1.5772849917868876e-05, 'epoch': 0.33} + 33%|███▎ | 2844/8750 [44:16<9:27:09, 5.76s/it] 33%|███▎ | 2845/8750 [44:22<9:32:16, 5.81s/it] 33%|███▎ | 2845/8750 [44:24<9:32:16, 5.81s/it] {'loss': 0.478, 'learning_rate': 1.576982696480633e-05, 'epoch': 0.33} + 33%|███▎ | 2845/8750 [44:24<9:32:16, 5.81s/it] {'loss': 0.478, 'learning_rate': 1.576982696480633e-05, 'epoch': 0.33} + 33%|███▎ | 2845/8750 [44:22<9:32:16, 5.81s/it] 33%|███▎ | 2846/8750 [44:27<9:30:23, 5.80s/it] 33%|███▎ | 2846/8750 [44:30<9:30:23, 5.80s/it] {'loss': 0.4735, 'learning_rate': 1.5766803221148676e-05, 'epoch': 0.33} + 33%|███▎ | 2846/8750 [44:30<9:30:23, 5.80s/it] {'loss': 0.4735, 'learning_rate': 1.5766803221148676e-05, 'epoch': 0.33} + 33%|███▎ | 2846/8750 [44:27<9:30:23, 5.80s/it] 33%|███▎ | 2847/8750 [44:33<9:25:51, 5.75s/it] 33%|███▎ | 2847/8750 [44:36<9:25:51, 5.75s/it] {'loss': 0.4723, 'learning_rate': 1.5763778687310224e-05, 'epoch': 0.33} + 33%|███▎ | 2847/8750 [44:36<9:25:51, 5.75s/it] {'loss': 0.4723, 'learning_rate': 1.5763778687310224e-05, 'epoch': 0.33} + 33%|███▎ | 2847/8750 [44:33<9:25:51, 5.75s/it] 33%|███▎ | 2848/8750 [44:39<9:25:17, 5.75s/it] 33%|███▎ | 
2848/8750 [44:42<9:25:17, 5.75s/it] {'loss': 0.4484, 'learning_rate': 1.5760753363705412e-05, 'epoch': 0.33} + 33%|███▎ | 2848/8750 [44:42<9:25:17, 5.75s/it] {'loss': 0.4484, 'learning_rate': 1.5760753363705412e-05, 'epoch': 0.33} + 33%|███▎ | 2848/8750 [44:39<9:25:17, 5.75s/it] 33%|███▎ | 2849/8750 [44:44<9:25:04, 5.75s/it] 33%|███▎ | 2849/8750 [44:47<9:25:04, 5.75s/it] {'loss': 0.4742, 'learning_rate': 1.5757727250748773e-05, 'epoch': 0.33} + 33%|███▎ | 2849/8750 [44:47<9:25:04, 5.75s/it] {'loss': 0.4742, 'learning_rate': 1.5757727250748773e-05, 'epoch': 0.33} + 33%|███▎ | 2849/8750 [44:44<9:25:04, 5.75s/it]2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +01 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 33%|███▎ | 2850/8750 [44:50<9:20:25, 5.70s/it]8 AutoResumeHook: Checking whether to suspend... 
+ 33%|███▎ | 2850/8750 [44:53<9:20:24, 5.70s/it] {'loss': 0.4912, 'learning_rate': 1.5754700348854955e-05, 'epoch': 0.33} + 33%|███▎ | 2850/8750 [44:53<9:20:24, 5.70s/it] {'loss': 0.4912, 'learning_rate': 1.5754700348854955e-05, 'epoch': 0.33} + 33%|███▎ | 2850/8750 [44:50<9:20:25, 5.70s/it] 33%|███▎ | 2851/8750 [44:56<9:23:16, 5.73s/it] 33%|███▎ | 2851/8750 [44:59<9:23:16, 5.73s/it] {'loss': 0.4788, 'learning_rate': 1.5751672658438707e-05, 'epoch': 0.33} + 33%|███▎ | 2851/8750 [44:59<9:23:16, 5.73s/it] {'loss': 0.4788, 'learning_rate': 1.5751672658438707e-05, 'epoch': 0.33} + 33%|███▎ | 2851/8750 [44:56<9:23:16, 5.73s/it] 33%|███▎ | 2852/8750 [45:02<9:25:13, 5.75s/it] 33%|███▎ | 2852/8750 [45:05<9:25:13, 5.75s/it] {'loss': 0.4717, 'learning_rate': 1.574864417991489e-05, 'epoch': 0.33} + 33%|███▎ | 2852/8750 [45:05<9:25:13, 5.75s/it] {'loss': 0.4717, 'learning_rate': 1.574864417991489e-05, 'epoch': 0.33} + 33%|███▎ | 2852/8750 [45:02<9:25:13, 5.75s/it] 33%|███▎ | 2853/8750 [45:07<9:23:17, 5.73s/it] 33%|███▎ | 2853/8750 [45:10<9:23:16, 5.73s/it] {'loss': 0.5, 'learning_rate': 1.5745614913698478e-05, 'epoch': 0.33} + 33%|███▎ | 2853/8750 [45:10<9:23:16, 5.73s/it] {'loss': 0.5, 'learning_rate': 1.5745614913698478e-05, 'epoch': 0.33} + 33%|███▎ | 2853/8750 [45:07<9:23:17, 5.73s/it] 33%|███▎ | 2854/8750 [45:13<9:22:44, 5.73s/it] 33%|███▎ | 2854/8750 [45:16<9:22:44, 5.73s/it] {'loss': 0.4665, 'learning_rate': 1.5742584860204547e-05, 'epoch': 0.33} + 33%|███▎ | 2854/8750 [45:16<9:22:44, 5.73s/it] {'loss': 0.4665, 'learning_rate': 1.5742584860204547e-05, 'epoch': 0.33} + 33%|███▎ | 2854/8750 [45:13<9:22:44, 5.73s/it] 33%|███▎ | 2855/8750 [45:19<9:30:00, 5.80s/it] 33%|███▎ | 2855/8750 [45:22<9:30:00, 5.80s/it] {'loss': 0.471, 'learning_rate': 1.5739554019848274e-05, 'epoch': 0.33} + 33%|███▎ | 2855/8750 [45:22<9:30:00, 5.80s/it] {'loss': 0.471, 'learning_rate': 1.5739554019848274e-05, 'epoch': 0.33} + 33%|███▎ | 2855/8750 [45:19<9:30:00, 5.80s/it] 33%|███▎ | 2856/8750 
[45:25<9:31:00, 5.81s/it] 33%|███▎ | 2856/8750 [45:28<9:31:00, 5.81s/it] {'loss': 0.4684, 'learning_rate': 1.5736522393044962e-05, 'epoch': 0.33} + 33%|███▎ | 2856/8750 [45:28<9:31:00, 5.81s/it] {'loss': 0.4684, 'learning_rate': 1.5736522393044962e-05, 'epoch': 0.33} + 33%|███▎ | 2856/8750 [45:25<9:31:00, 5.81s/it] 33%|███▎ | 2857/8750 [45:31<9:35:14, 5.86s/it] 33%|███▎ | 2857/8750 [45:34<9:35:14, 5.86s/it] {'loss': 0.4837, 'learning_rate': 1.5733489980210007e-05, 'epoch': 0.33} + 33%|███▎ | 2857/8750 [45:34<9:35:14, 5.86s/it] {'loss': 0.4837, 'learning_rate': 1.5733489980210007e-05, 'epoch': 0.33} + 33%|███▎ | 2857/8750 [45:31<9:35:14, 5.86s/it] 33%|███▎ | 2858/8750 [45:36<9:28:32, 5.79s/it] 33%|███▎ | 2858/8750 [45:39<9:28:32, 5.79s/it] {'loss': 0.4908, 'learning_rate': 1.573045678175892e-05, 'epoch': 0.33} + 33%|███▎ | 2858/8750 [45:39<9:28:32, 5.79s/it] {'loss': 0.4908, 'learning_rate': 1.573045678175892e-05, 'epoch': 0.33} + 33%|███▎ | 2858/8750 [45:36<9:28:32, 5.79s/it] 33%|███▎ | 2859/8750 [45:42<9:25:51, 5.76s/it] 33%|███▎ | 2859/8750 [45:45<9:25:51, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.5727422798107313e-05, 'epoch': 0.33} + 33%|███▎ | 2859/8750 [45:45<9:25:51, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.5727422798107313e-05, 'epoch': 0.33} + 33%|███▎ | 2859/8750 [45:42<9:25:51, 5.76s/it] 33%|███▎ | 2860/8750 [45:48<9:25:27, 5.76s/it] 33%|███▎ | 2860/8750 [45:51<9:25:28, 5.76s/it] {'loss': 0.4636, 'learning_rate': 1.5724388029670912e-05, 'epoch': 0.33} + 33%|███▎ | 2860/8750 [45:51<9:25:28, 5.76s/it] {'loss': 0.4636, 'learning_rate': 1.5724388029670912e-05, 'epoch': 0.33} + 33%|███▎ | 2860/8750 [45:48<9:25:27, 5.76s/it] 33%|███▎ | 2861/8750 [45:54<9:24:35, 5.75s/it] 33%|███▎ | 2861/8750 [45:57<9:24:35, 5.75s/it] {'loss': 0.4617, 'learning_rate': 1.5721352476865546e-05, 'epoch': 0.33} + {'loss': 0.4617, 'learning_rate': 1.5721352476865546e-05, 'epoch': 0.33} + 33%|███▎ | 2861/8750 [45:57<9:24:35, 5.75s/it] 33%|███▎ | 2861/8750 [45:54<9:24:35, 
5.75s/it] 33%|███▎ | 2862/8750 [45:59<9:24:04, 5.75s/it] 33%|███▎ | 2862/8750 [46:02<9:24:04, 5.75s/it] {'loss': 0.4884, 'learning_rate': 1.5718316140107156e-05, 'epoch': 0.33} + 33%|███▎ | 2862/8750 [46:02<9:24:04, 5.75s/it] {'loss': 0.4884, 'learning_rate': 1.5718316140107156e-05, 'epoch': 0.33} + 33%|███▎ | 2862/8750 [45:59<9:24:04, 5.75s/it] 33%|███▎ | 2863/8750 [46:05<9:24:17, 5.75s/it] 33%|███▎ | 2863/8750 [46:08<9:24:16, 5.75s/it] {'loss': 0.4766, 'learning_rate': 1.5715279019811783e-05, 'epoch': 0.33} + {'loss': 0.4766, 'learning_rate': 1.5715279019811783e-05, 'epoch': 0.33} 33%|███▎ | 2863/8750 [46:08<9:24:16, 5.75s/it] + 33%|███▎ | 2863/8750 [46:05<9:24:17, 5.75s/it] 33%|███▎ | 2864/8750 [46:11<9:24:33, 5.75s/it] 33%|███▎ | 2864/8750 [46:14<9:24:33, 5.75s/it] {'loss': 0.4708, 'learning_rate': 1.571224111639559e-05, 'epoch': 0.33} + {'loss': 0.4708, 'learning_rate': 1.571224111639559e-05, 'epoch': 0.33} 33%|███▎ | 2864/8750 [46:14<9:24:33, 5.75s/it] + 33%|███▎ | 2864/8750 [46:11<9:24:33, 5.75s/it] 33%|███▎ | 2865/8750 [46:17<9:24:34, 5.76s/it] 33%|███▎ | 2865/8750 [46:20<9:24:34, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.570920243027483e-05, 'epoch': 0.33} + 33%|███▎ | 2865/8750 [46:20<9:24:34, 5.76s/it] {'loss': 0.4695, 'learning_rate': 1.570920243027483e-05, 'epoch': 0.33} + 33%|███▎ | 2865/8750 [46:17<9:24:34, 5.76s/it] 33%|███▎ | 2866/8750 [46:22<9:27:05, 5.78s/it] 33%|███▎ | 2866/8750 [46:25<9:27:05, 5.78s/it] {'loss': 0.4738, 'learning_rate': 1.5706162961865866e-05, 'epoch': 0.33} + 33%|███▎ | 2866/8750 [46:25<9:27:05, 5.78s/it] {'loss': 0.4738, 'learning_rate': 1.5706162961865866e-05, 'epoch': 0.33} + 33%|███▎ | 2866/8750 [46:22<9:27:05, 5.78s/it] 33%|███▎ | 2867/8750 [46:28<9:24:57, 5.76s/it] 33%|███▎ | 2867/8750 [46:31<9:24:57, 5.76s/it] {'loss': 0.4878, 'learning_rate': 1.570312271158519e-05, 'epoch': 0.33} + 33%|███▎ | 2867/8750 [46:31<9:24:57, 5.76s/it] {'loss': 0.4878, 'learning_rate': 1.570312271158519e-05, 'epoch': 0.33} + 33%|███▎ | 
2867/8750 [46:28<9:24:57, 5.76s/it] 33%|███▎ | 2868/8750 [46:34<9:21:39, 5.73s/it] 33%|███▎ | 2868/8750 [46:37<9:21:39, 5.73s/it] {'loss': 0.4698, 'learning_rate': 1.5700081679849362e-05, 'epoch': 0.33} + 33%|███▎ | 2868/8750 [46:37<9:21:39, 5.73s/it] {'loss': 0.4698, 'learning_rate': 1.5700081679849362e-05, 'epoch': 0.33} + 33%|███▎ | 2868/8750 [46:34<9:21:39, 5.73s/it] 33%|███▎ | 2869/8750 [46:40<9:23:12, 5.75s/it] 33%|███▎ | 2869/8750 [46:43<9:23:12, 5.75s/it] {'loss': 0.4732, 'learning_rate': 1.569703986707509e-05, 'epoch': 0.33} + 33%|███▎ | 2869/8750 [46:43<9:23:12, 5.75s/it] {'loss': 0.4732, 'learning_rate': 1.569703986707509e-05, 'epoch': 0.33} + 33%|███▎ | 2869/8750 [46:40<9:23:12, 5.75s/it] 33%|███▎ | 2870/8750 [46:45<9:23:50, 5.75s/it] 33%|███▎ | 2870/8750 [46:48<9:23:50, 5.75s/it] {'loss': 0.5003, 'learning_rate': 1.5693997273679165e-05, 'epoch': 0.33} + 33%|███▎ | 2870/8750 [46:48<9:23:50, 5.75s/it] {'loss': 0.5003, 'learning_rate': 1.5693997273679165e-05, 'epoch': 0.33} + 33%|███▎ | 2870/8750 [46:45<9:23:50, 5.75s/it] 33%|███▎ | 2871/8750 [46:51<9:32:03, 5.84s/it] 33%|███▎ | 2871/8750 [46:54<9:32:03, 5.84s/it] {'loss': 0.4552, 'learning_rate': 1.5690953900078485e-05, 'epoch': 0.33} + 33%|███▎ | 2871/8750 [46:54<9:32:03, 5.84s/it] {'loss': 0.4552, 'learning_rate': 1.5690953900078485e-05, 'epoch': 0.33} + 33%|███▎ | 2871/8750 [46:51<9:32:03, 5.84s/it] 33%|███▎ | 2872/8750 [46:57<9:27:53, 5.80s/it] 33%|███▎ | 2872/8750 [47:00<9:27:52, 5.80s/it] {'loss': 0.472, 'learning_rate': 1.5687909746690064e-05, 'epoch': 0.33} + 33%|███▎ | 2872/8750 [47:00<9:27:52, 5.80s/it] {'loss': 0.472, 'learning_rate': 1.5687909746690064e-05, 'epoch': 0.33} + 33%|███▎ | 2872/8750 [46:57<9:27:53, 5.80s/it] 33%|███▎ | 2873/8750 [47:03<9:30:47, 5.83s/it] 33%|███▎ | 2873/8750 [47:06<9:30:46, 5.83s/it] {'loss': 0.4709, 'learning_rate': 1.568486481393102e-05, 'epoch': 0.33} + 33%|███▎ | 2873/8750 [47:06<9:30:46, 5.83s/it] {'loss': 0.4709, 'learning_rate': 1.568486481393102e-05, 
'epoch': 0.33} + 33%|███▎ | 2873/8750 [47:03<9:30:47, 5.83s/it] 33%|███▎ | 2874/8750 [47:09<9:26:42, 5.79s/it] 33%|███▎ | 2874/8750 [47:12<9:26:42, 5.79s/it] {'loss': 0.4626, 'learning_rate': 1.5681819102218572e-05, 'epoch': 0.33} + 33%|███▎ | 2874/8750 [47:12<9:26:42, 5.79s/it] {'loss': 0.4626, 'learning_rate': 1.5681819102218572e-05, 'epoch': 0.33} + 33%|███▎ | 2874/8750 [47:09<9:26:42, 5.79s/it] 33%|███▎ | 2875/8750 [47:14<9:24:37, 5.77s/it] 33%|███▎ | 2875/8750 [47:17<9:24:37, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.5678772611970056e-05, 'epoch': 0.33} + 33%|███▎ | 2875/8750 [47:17<9:24:37, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.5678772611970056e-05, 'epoch': 0.33} + 33%|███▎ | 2875/8750 [47:14<9:24:37, 5.77s/it] 33%|███▎ | 2876/8750 [47:23<9:28:28, 5.81s/it] 33%|███▎ | 2876/8750 [47:20<9:28:29, 5.81s/it] {'loss': 0.4631, 'learning_rate': 1.5675725343602904e-05, 'epoch': 0.33} + 33%|███▎ | 2876/8750 [47:23<9:28:28, 5.81s/it] {'loss': 0.4631, 'learning_rate': 1.5675725343602904e-05, 'epoch': 0.33} + 33%|███▎ | 2876/8750 [47:20<9:28:29, 5.81s/it] 33%|███▎ | 2877/8750 [47:26<9:22:54, 5.75s/it] 33%|███▎ | 2877/8750 [47:29<9:22:54, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.5672677297534665e-05, 'epoch': 0.33} + 33%|███▎ | 2877/8750 [47:29<9:22:54, 5.75s/it] {'loss': 0.4735, 'learning_rate': 1.5672677297534665e-05, 'epoch': 0.33} + 33%|███▎ | 2877/8750 [47:26<9:22:54, 5.75s/it] 33%|███▎ | 2878/8750 [47:32<9:29:07, 5.82s/it] 33%|███▎ | 2878/8750 [47:35<9:29:07, 5.82s/it] {'loss': 0.4643, 'learning_rate': 1.566962847418299e-05, 'epoch': 0.33} + 33%|███▎ | 2878/8750 [47:35<9:29:07, 5.82s/it] {'loss': 0.4643, 'learning_rate': 1.566962847418299e-05, 'epoch': 0.33} + 33%|███▎ | 2878/8750 [47:32<9:29:07, 5.82s/it] 33%|███▎ | 2879/8750 [47:38<9:28:15, 5.81s/it] 33%|███▎ | 2879/8750 [47:41<9:28:15, 5.81s/it] {'loss': 0.4823, 'learning_rate': 1.5666578873965627e-05, 'epoch': 0.33} + 33%|███▎ | 2879/8750 [47:41<9:28:15, 5.81s/it] {'loss': 0.4823, 
'learning_rate': 1.5666578873965627e-05, 'epoch': 0.33} + 33%|███▎ | 2879/8750 [47:38<9:28:15, 5.81s/it] 33%|███▎ | 2880/8750 [47:43<9:26:52, 5.79s/it] 33%|███▎ | 2880/8750 [47:46<9:26:52, 5.79s/it] {'loss': 0.4623, 'learning_rate': 1.566352849730045e-05, 'epoch': 0.33} + 33%|███▎ | 2880/8750 [47:46<9:26:52, 5.79s/it] {'loss': 0.4623, 'learning_rate': 1.566352849730045e-05, 'epoch': 0.33} + 33%|███▎ | 2880/8750 [47:43<9:26:52, 5.79s/it] 33%|███▎ | 2881/8750 [47:49<9:32:37, 5.85s/it] 33%|███▎ | 2881/8750 [47:52<9:32:37, 5.85s/it] {'loss': 0.4846, 'learning_rate': 1.566047734460542e-05, 'epoch': 0.33} + 33%|███▎ | 2881/8750 [47:52<9:32:37, 5.85s/it] {'loss': 0.4846, 'learning_rate': 1.566047734460542e-05, 'epoch': 0.33} + 33%|███▎ | 2881/8750 [47:49<9:32:37, 5.85s/it] 33%|███▎ | 2882/8750 [47:55<9:34:42, 5.88s/it] 33%|███▎ | 2882/8750 [47:58<9:34:42, 5.88s/it] {'loss': 0.4666, 'learning_rate': 1.5657425416298623e-05, 'epoch': 0.33} + 33%|███▎ | 2882/8750 [47:58<9:34:42, 5.88s/it] {'loss': 0.4666, 'learning_rate': 1.5657425416298623e-05, 'epoch': 0.33} + 33%|███▎ | 2882/8750 [47:55<9:34:42, 5.88s/it] 33%|███▎ | 2883/8750 [48:01<9:28:30, 5.81s/it] 33%|███▎ | 2883/8750 [48:04<9:28:30, 5.81s/it] {'loss': 0.4601, 'learning_rate': 1.565437271279823e-05, 'epoch': 0.33} + 33%|███▎ | 2883/8750 [48:04<9:28:30, 5.81s/it] {'loss': 0.4601, 'learning_rate': 1.565437271279823e-05, 'epoch': 0.33} + 33%|███▎ | 2883/8750 [48:01<9:28:30, 5.81s/it]WARNING: tokenization mismatch: 1 vs. 737. [[{'from': 'human', 'value': 'ASSISTANT: You are a top consultant specialising in advising tax firms and have the following skills and qualities.\n\nKNOWLEDGE: Consultant in a top consulting firm specialising in tax consulting, senior strategist, digital expert.\n\nCHARACTERISTICS: Customer-focused, empathetic, solution-oriented, personable, persuasive, creative.\n\nTASK: Use your knowledge to develop 10 measures to encourage clients to submit their documents digitally instead of by post. 
The measures should be formulated in such a way that the client recognises the advantages of digital filing. The law firm is digitised and can accept digital documents.\n\nOUTPUT: A list of the 10 actions, each with the name of the action as a heading and 2 short additional bullet points.\n\nOUTPUT LANGUAGE: German'}, {'from': 'gpt', 'value': '1. Persönlicher Kontakt und Beratung:\n\t* Persönliche Beratung und Unterstützung bei der Umstellung auf digitales Einreichen\n\t* Betonung der Vorteile, wie Zeit- und Kosteneinsparungen\n2. Schulung und Unterstützung:\n\t* Schulungen für die digitale Einreichung von Dokumenten\n\t* Anleitung zur Nutzung von Tools und Software\n3. Anreize schaffen:\n\t* Ermäßigung bei der Beauftragung digitaler Dienstleistungen\n\t* Sonderangebote für Kunden, die ihre Dokumente digital einreichen\n4. Informationen bereitstellen:\n\t* Veröffentlichung von Leitfäden und Handbüchern für die digitale Einreichung von Dokumenten\n\t* FAQ-Liste für häufig gestellte Fragen zum digitalen Einreichen\n5. Klare Kommunikation:\n\t* Verständliche Kommunikation der Vorteile der digitalen Einreichung\n\t* Bereitstellung von Informationen über die Sicherheit der digitalen Übermittlung\n6. Einfache Zugänglichkeit:\n\t* Vereinfachung des digitalen Einreichungsprozesses durch intuitive Online-Plattformen\n\t* Bereitstellung von Unterstützung für den Fall technischer Schwierigkeiten\n7. Zeitersparnis betonen:\n\t* Betonung der Vorteile von Zeitersparnis durch digitales Einreichen\n\t* Erinnerungen an Fristen und den Nutzen der pünktlichen Einreichung\n8. Umweltbewusstsein:\n\t* Betonung des Umweltvorteils durch digitales Einreichen von Dokumenten\n\t* Verweis auf die Möglichkeit, Papierverbrauch zu reduzieren\n9. Kosteneffizienz:\n\t* Betonung der Vorteile von Kosteneinsparungen durch digitales Einreichen\n\t* Verweis auf die Möglichkeit, Papier- und Portokosten zu sparen\n10. 
Feedback einholen:\n* Umfrage zur Bewertung des digitalen Einreichungsprozesses\n* Analyse der Ergebnisse zur Verbesserung der digitalen Dienstleistungen'}]] (ignored) + 33%|███▎ | 2884/8750 [48:07<9:29:32, 5.83s/it] 33%|███▎ | 2884/8750 [48:10<9:29:31, 5.83s/it] {'loss': 0.4834, 'learning_rate': 1.5651319234522538e-05, 'epoch': 0.33} + 33%|███▎ | 2884/8750 [48:10<9:29:31, 5.83s/it] {'loss': 0.4834, 'learning_rate': 1.5651319234522538e-05, 'epoch': 0.33} + 33%|███▎ | 2884/8750 [48:07<9:29:32, 5.83s/it] 33%|███▎ | 2885/8750 [48:13<9:27:13, 5.80s/it] 33%|███▎ | 2885/8750 [48:16<9:27:14, 5.80s/it] {'loss': 0.4699, 'learning_rate': 1.5648264981889936e-05, 'epoch': 0.33} + 33%|███▎ | 2885/8750 [48:16<9:27:14, 5.80s/it] {'loss': 0.4699, 'learning_rate': 1.5648264981889936e-05, 'epoch': 0.33} + 33%|███▎ | 2885/8750 [48:13<9:27:13, 5.80s/it] 33%|███▎ | 2886/8750 [48:18<9:25:26, 5.79s/it] 33%|███▎ | 2886/8750 [48:21<9:25:26, 5.79s/it] {'loss': 0.4772, 'learning_rate': 1.564520995531893e-05, 'epoch': 0.33} + 33%|███▎ | 2886/8750 [48:21<9:25:26, 5.79s/it] {'loss': 0.4772, 'learning_rate': 1.564520995531893e-05, 'epoch': 0.33} + 33%|███▎ | 2886/8750 [48:18<9:25:26, 5.79s/it] 33%|███▎ | 2887/8750 [48:24<9:28:13, 5.81s/it] 33%|███▎ | 2887/8750 [48:27<9:28:13, 5.81s/it] {'loss': 0.4867, 'learning_rate': 1.5642154155228124e-05, 'epoch': 0.33} + 33%|███▎ | 2887/8750 [48:27<9:28:13, 5.81s/it] {'loss': 0.4867, 'learning_rate': 1.5642154155228124e-05, 'epoch': 0.33} + 33%|███▎ | 2887/8750 [48:24<9:28:13, 5.81s/it] 33%|███▎ | 2888/8750 [48:30<9:28:24, 5.82s/it] 33%|███▎ | 2888/8750 [48:33<9:28:24, 5.82s/it] {'loss': 0.4585, 'learning_rate': 1.5639097582036226e-05, 'epoch': 0.33} + 33%|███▎ | 2888/8750 [48:33<9:28:24, 5.82s/it] {'loss': 0.4585, 'learning_rate': 1.5639097582036226e-05, 'epoch': 0.33} + 33%|███▎ | 2888/8750 [48:30<9:28:24, 5.82s/it]WARNING: tokenization mismatch: 1 vs. 1419. 
[[{'from': 'human', 'value': 'ASSISTANT: You are an innovative tax advisory firm with the following knowledge and traits.\n\nKNOWLEDGE: Tax advisor, junior tax advisor, senior tax advisor, financial strategist, tax regulations.\n\nTRAITS: Empathy, innovation, solution orientation, professionalism, communication, client focus.\n\nTASK: Use your knowledge to develop an internal strategy document. We are a tax advisory firm looking to identify 10 creative ways to improve client satisfaction and loyalty. Focus on ideas that are tailored to the needs of tax advisors and enhance the overall client experience.\n\nOUTPUT: A table in German containing 10 creative strategies to improve client satisfaction and loyalty for a tax advisory firm, including a brief description for each.'}, {'from': 'gpt', 'value': 'Tabelle: 10 kreative Strategien zur Steigerung der Kundenzufriedenheit und -loyalität in einer Steuerberatungskanzlei\n\n| Nr. | Strategie | Beschreibung |\n| --- | --- | --- |\n| 1 | Personalisierter Service | Jeder Kunde erhält einen dedizierten Steuerberater, der auf seine individuellen Bedürfnisse und Anliegen eingeht. |\n| 2 | Proaktive Steuerplanung | Regelmäßige Updates und Empfehlungen zur Steueroptimierung, basierend auf aktuellen Gesetzesänderungen und Kundenzielen. |\n| 3 | Umfassende Schulungsangebote | Kostenlose Workshops und Webinare, um das Steuerwissen der Kunden zu erweitern und den Umgang mit Steuerthemen zu erleichtern. |\n| 4 | Digitale Plattform | Eine benutzerfreundliche Online-Plattform, die den Kunden den Zugriff auf ihre Steuerdokumente und den Kontakt zum Berater erleichtert. |\n| 5 | Einführung von Steuer-Coaching | Persönliche Coaching-Sitzungen, um den Kunden dabei zu unterstützen, ihre steuerlichen Ziele und Möglichkeiten besser zu verstehen. |\n| 6 | Schnelle Reaktionszeiten | Verbindliches Versprechen, innerhalb von 24 Stunden auf Kundenanfragen zu reagieren, um eine effiziente Kommunikation zu gewährleisten. 
|\n| 7 | Transparente Preisgestaltung | Klare und faire Preisstrukturen, die den Kunden im Voraus bekannt sind, um böse Überraschungen zu vermeiden. |\n| 8 | Individuelle Jahresgespräche | Regelmäßige persönliche Jahresgespräche, um auf die Bedürfnisse und Ziele der Kunden einzugehen und die Zusammenarbeit zu stärken. |\n| 9 | Kunden-Feedback-System | Einrichtung eines einfachen Feedback-Systems, um Kundenmeinungen und Verbesserungsvorschläge aktiv einzuholen und umzusetzen. |\n| 10 | Steuer-Newsletter und Informationsangebot | Monatlicher Newsletter mit aktuellen Steuerinformationen und Tipps, der den Kunden einen Mehrwert bietet und sie auf dem Laufenden hält. |'}, {'from': 'human', 'value': 'Bitte etwas kreativer und auf bezugnehmend auf die zwischenmenschliche Ebene.'}, {'from': 'gpt', 'value': 'Tabelle: 10 kreative Strategien zur Steigerung der Kundenzufriedenheit und -loyalität in einer Steuerberatungskanzlei mit Fokus auf zwischenmenschlicher Ebene\n\n| Nr. | Strategie | Beschreibung |\n| --- | --- | --- |\n| 1 | Kunden-Events | Organisation von regelmäßigen, informellen Veranstaltungen, um den persönlichen Kontakt zwischen Klienten und Beratern zu fördern. |\n| 2 | Persönliche Geburtstagsgrüße | Individuelle Geburtstagskarten oder -nachrichten für jeden Kunden, um Wertschätzung zu zeigen und Beziehungen zu stärken. |\n| 3 | Kundenbefragung & Anpassung | Aktives Einholen von Kundenmeinungen zur individuellen Betreuung und Anpassung unserer Leistungen entsprechend der Kundenbedürfnisse. |\n| 4 | Teamvorstellung & Empathie | Vorstellung unserer Berater auf der Firmenwebsite und in sozialen Medien, um eine persönliche und empathische Beziehung zum Kunden aufzubauen. |\n| 5 | Kunden-Mentoring-Programm | Schaffung eines Programms, bei dem langjährige Kunden als Mentoren für Neukunden fungieren, um Vertrauen und Loyalität zu fördern. 
|\n| 6 | "Tag der offenen Tür" | Einladung der Kunden zu einem "Tag der offenen Tür" in unserer Kanzlei, um ihnen einen persönlichen Einblick in unsere Arbeitsweise zu geben. |\n| 7 | Individuelle Erfolgsgeschichten | Erstellung von individuellen Erfolgsgeschichten für jeden Kunden, um ihre steuerlichen Erfolge hervorzuheben und ihre Loyalität zu stärken. |\n| 8 | Gemeinsame soziale Projekte | Zusammenarbeit mit Kunden an sozialen Projekten und Spendenaktionen, um gemeinsame Werte zu teilen und die Zusammenarbeit zu fördern. |\n| 9 | Flexibilität in der Kommunikation | Anbieten verschiedener Kommunikationswege (Telefon, E-Mail, Videoanruf, etc.), um auf die individuellen Bedürfnisse der Kunden einzugehen. |\n| 10 | Jährliches Kunden-Dankeschön | Überraschung der Kunden mit einer kleinen Aufmerksamkeit oder einem Dankeschön-Event, um die Wertschätzung für ihre Treue auszudrücken. |'}]] (ignored) + 33%|███▎ | 2889/8750 [48:36<9:27:44, 5.81s/it] 33%|███▎ | 2889/8750 [48:39<9:27:44, 5.81s/it] {'loss': 0.4807, 'learning_rate': 1.5636040236162066e-05, 'epoch': 0.33} + 33%|███▎ | 2889/8750 [48:39<9:27:44, 5.81s/it] {'loss': 0.4807, 'learning_rate': 1.5636040236162066e-05, 'epoch': 0.33} + 33%|███▎ | 2889/8750 [48:36<9:27:44, 5.81s/it] 33%|███▎ | 2890/8750 [48:42<9:30:20, 5.84s/it] 33%|███▎ | 2890/8750 [48:45<9:30:20, 5.84s/it] {'loss': 0.4746, 'learning_rate': 1.5632982118024556e-05, 'epoch': 0.33} + 33%|███▎ | 2890/8750 [48:45<9:30:20, 5.84s/it] {'loss': 0.4746, 'learning_rate': 1.5632982118024556e-05, 'epoch': 0.33} + 33%|███▎ | 2890/8750 [48:42<9:30:20, 5.84s/it] 33%|███▎ | 2891/8750 [48:48<9:26:35, 5.80s/it] 33%|███▎ | 2891/8750 [48:50<9:26:36, 5.80s/it] {'loss': 0.4685, 'learning_rate': 1.562992322804274e-05, 'epoch': 0.33} + {'loss': 0.4685, 'learning_rate': 1.562992322804274e-05, 'epoch': 0.33} 33%|███▎ | 2891/8750 [48:50<9:26:36, 5.80s/it] + 33%|███▎ | 2891/8750 [48:48<9:26:35, 5.80s/it] 33%|███▎ | 2892/8750 [48:53<9:26:45, 5.80s/it] 33%|███▎ | 2892/8750 
[48:56<9:26:45, 5.80s/it] {'loss': 0.4727, 'learning_rate': 1.5626863566635744e-05, 'epoch': 0.33} + 33%|███▎ | 2892/8750 [48:56<9:26:45, 5.80s/it] {'loss': 0.4727, 'learning_rate': 1.5626863566635744e-05, 'epoch': 0.33} + 33%|███▎ | 2892/8750 [48:53<9:26:45, 5.80s/it] 33%|███▎ | 2893/8750 [48:59<9:21:10, 5.75s/it] 33%|███▎ | 2893/8750 [49:02<9:21:10, 5.75s/it] {'loss': 0.4873, 'learning_rate': 1.5623803134222812e-05, 'epoch': 0.33} + 33%|███▎ | 2893/8750 [49:02<9:21:10, 5.75s/it] {'loss': 0.4873, 'learning_rate': 1.5623803134222812e-05, 'epoch': 0.33} + 33%|███▎ | 2893/8750 [48:59<9:21:10, 5.75s/it] 33%|███▎ | 2894/8750 [49:05<9:23:23, 5.77s/it] 33%|███▎ | 2894/8750 [49:08<9:23:23, 5.77s/it] {'loss': 0.4863, 'learning_rate': 1.5620741931223292e-05, 'epoch': 0.33} + 33%|███▎ | 2894/8750 [49:08<9:23:23, 5.77s/it] {'loss': 0.4863, 'learning_rate': 1.5620741931223292e-05, 'epoch': 0.33} + 33%|███▎ | 2894/8750 [49:05<9:23:23, 5.77s/it] 33%|███▎ | 2895/8750 [49:10<9:18:08, 5.72s/it] 33%|███▎ | 2895/8750 [49:13<9:18:08, 5.72s/it] {'loss': 0.4903, 'learning_rate': 1.5617679958056643e-05, 'epoch': 0.33} + 33%|███▎ | 2895/8750 [49:13<9:18:08, 5.72s/it] {'loss': 0.4903, 'learning_rate': 1.5617679958056643e-05, 'epoch': 0.33} + 33%|███▎ | 2895/8750 [49:10<9:18:08, 5.72s/it] 33%|███▎ | 2896/8750 [49:16<9:22:47, 5.77s/it] 33%|███▎ | 2896/8750 [49:19<9:22:47, 5.77s/it] {'loss': 0.4736, 'learning_rate': 1.5614617215142412e-05, 'epoch': 0.33} + 33%|███▎ | 2896/8750 [49:19<9:22:47, 5.77s/it] {'loss': 0.4736, 'learning_rate': 1.5614617215142412e-05, 'epoch': 0.33} + 33%|███▎ | 2896/8750 [49:16<9:22:47, 5.77s/it] 33%|███▎ | 2897/8750 [49:22<9:23:32, 5.78s/it] 33%|███▎ | 2897/8750 [49:25<9:23:32, 5.78s/it] {'loss': 0.4637, 'learning_rate': 1.5611553702900275e-05, 'epoch': 0.33} + 33%|███▎ | 2897/8750 [49:25<9:23:32, 5.78s/it] {'loss': 0.4637, 'learning_rate': 1.5611553702900275e-05, 'epoch': 0.33} + 33%|███▎ | 2897/8750 [49:22<9:23:32, 5.78s/it] 33%|███▎ | 2898/8750 [49:28<9:21:50, 
5.76s/it] 33%|███▎ | 2898/8750 [49:31<9:21:50, 5.76s/it] {'loss': 0.4758, 'learning_rate': 1.5608489421749995e-05, 'epoch': 0.33} + 33%|███▎ | 2898/8750 [49:31<9:21:50, 5.76s/it] {'loss': 0.4758, 'learning_rate': 1.5608489421749995e-05, 'epoch': 0.33} + 33%|███▎ | 2898/8750 [49:28<9:21:50, 5.76s/it] 33%|███▎ | 2899/8750 [49:34<9:21:14, 5.76s/it] 33%|███▎ | 2899/8750 [49:36<9:21:14, 5.76s/it] {'loss': 0.4812, 'learning_rate': 1.5605424372111447e-05, 'epoch': 0.33} + 33%|███▎ | 2899/8750 [49:36<9:21:14, 5.76s/it] {'loss': 0.4812, 'learning_rate': 1.5605424372111447e-05, 'epoch': 0.33} + 33%|███▎ | 2899/8750 [49:34<9:21:14, 5.76s/it]1014 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +03 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend...12 + AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 33%|███▎ | 2900/8750 [49:39<9:18:30, 5.73s/it]7 AutoResumeHook: Checking whether to suspend... 
+ 33%|███▎ | 2900/8750 [49:42<9:18:30, 5.73s/it] {'loss': 0.4662, 'learning_rate': 1.5602358554404613e-05, 'epoch': 0.33} + 33%|███▎ | 2900/8750 [49:42<9:18:30, 5.73s/it] {'loss': 0.4662, 'learning_rate': 1.5602358554404613e-05, 'epoch': 0.33} + 33%|███▎ | 2900/8750 [49:39<9:18:30, 5.73s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-2900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 33%|███▎ | 2901/8750 [50:01<16:58:56, 10.45s/it] 33%|███▎ | 2901/8750 [50:04<16:58:55, 10.45s/it] {'loss': 0.4708, 'learning_rate': 1.5599291969049575e-05, 'epoch': 0.33} + 33%|███▎ | 2901/8750 [50:04<16:58:55, 10.45s/it] {'loss': 0.4708, 'learning_rate': 1.5599291969049575e-05, 'epoch': 0.33} + 33%|███▎ | 2901/8750 [50:01<16:58:56, 10.45s/it] 33%|███▎ | 2902/8750 [50:06<14:43:12, 9.06s/it] 33%|███▎ | 2902/8750 [50:09<14:43:12, 9.06s/it] {'loss': 0.469, 'learning_rate': 1.5596224616466527e-05, 'epoch': 0.33} + 33%|███▎ | 2902/8750 [50:09<14:43:12, 9.06s/it] {'loss': 0.469, 'learning_rate': 1.5596224616466527e-05, 'epoch': 0.33} + 33%|███▎ | 2902/8750 [50:06<14:43:12, 9.06s/it] 33%|███▎ | 2903/8750 [50:12<13:04:51, 8.05s/it] 33%|███▎ | 2903/8750 [50:15<13:04:52, 8.05s/it] {'loss': 0.4772, 'learning_rate': 1.5593156497075767e-05, 'epoch': 0.33} + 33%|███▎ | 2903/8750 [50:15<13:04:52, 8.05s/it] {'loss': 0.4772, 'learning_rate': 1.5593156497075767e-05, 'epoch': 0.33} + 33%|███▎ | 2903/8750 [50:12<13:04:51, 8.05s/it] 33%|███▎ | 2904/8750 [50:18<12:00:29, 7.39s/it] 33%|███▎ | 2904/8750 [50:21<12:00:29, 7.39s/it] {'loss': 0.484, 'learning_rate': 1.5590087611297694e-05, 'epoch': 0.33} + 33%|███▎ | 2904/8750 [50:21<12:00:29, 7.39s/it] {'loss': 0.484, 'learning_rate': 1.5590087611297694e-05, 'epoch': 0.33} + 33%|███▎ | 2904/8750 [50:18<12:00:29, 7.39s/it] 33%|███▎ | 2905/8750 [50:24<11:12:09, 6.90s/it] 33%|███▎ | 2905/8750 [50:27<11:12:09, 6.90s/it] {'loss': 0.468, 'learning_rate': 1.558701795955281e-05, 'epoch': 0.33} + 33%|███▎ | 2905/8750 [50:27<11:12:09, 6.90s/it] {'loss': 0.468, 'learning_rate': 1.558701795955281e-05, 'epoch': 0.33} + 33%|███▎ | 2905/8750 [50:24<11:12:09, 6.90s/it] 33%|███▎ | 2906/8750 [50:30<10:40:05, 6.57s/it] 33%|███▎ | 2906/8750 [50:33<10:40:05, 6.57s/it] {'loss': 0.4638, 'learning_rate': 1.558394754226173e-05, 'epoch': 0.33} + 33%|███▎ | 2906/8750 [50:33<10:40:05, 6.57s/it] {'loss': 0.4638, 'learning_rate': 1.558394754226173e-05, 
'epoch': 0.33} + 33%|███▎ | 2906/8750 [50:30<10:40:05, 6.57s/it] 33%|███▎ | 2907/8750 [50:36<10:23:54, 6.41s/it] 33%|███▎ | 2907/8750 [50:39<10:23:55, 6.41s/it] {'loss': 0.4748, 'learning_rate': 1.5580876359845166e-05, 'epoch': 0.33} + 33%|███▎ | 2907/8750 [50:39<10:23:55, 6.41s/it] {'loss': 0.4748, 'learning_rate': 1.5580876359845166e-05, 'epoch': 0.33} + 33%|███▎ | 2907/8750 [50:36<10:23:54, 6.41s/it] 33%|███▎ | 2908/8750 [50:41<10:01:27, 6.18s/it] 33%|███▎ | 2908/8750 [50:44<10:01:27, 6.18s/it] {'loss': 0.4646, 'learning_rate': 1.557780441272395e-05, 'epoch': 0.33} + 33%|███▎ | 2908/8750 [50:44<10:01:27, 6.18s/it] {'loss': 0.4646, 'learning_rate': 1.557780441272395e-05, 'epoch': 0.33} + 33%|███▎ | 2908/8750 [50:41<10:01:27, 6.18s/it] 33%|███▎ | 2909/8750 [50:47<9:50:36, 6.07s/it] 33%|███▎ | 2909/8750 [50:50<9:50:35, 6.07s/it] {'loss': 0.4657, 'learning_rate': 1.5574731701318987e-05, 'epoch': 0.33} + 33%|███▎ | 2909/8750 [50:50<9:50:35, 6.07s/it] {'loss': 0.4657, 'learning_rate': 1.5574731701318987e-05, 'epoch': 0.33} + 33%|███▎ | 2909/8750 [50:47<9:50:36, 6.07s/it] 33%|███▎ | 2910/8750 [50:53<9:37:37, 5.93s/it] 33%|███▎ | 2910/8750 [50:56<9:37:37, 5.93s/it] {'loss': 0.5036, 'learning_rate': 1.5571658226051325e-05, 'epoch': 0.33} + 33%|███▎ | 2910/8750 [50:56<9:37:37, 5.93s/it] {'loss': 0.5036, 'learning_rate': 1.5571658226051325e-05, 'epoch': 0.33} + 33%|███▎ | 2910/8750 [50:53<9:37:37, 5.93s/it] 33%|███▎ | 2911/8750 [50:58<9:30:44, 5.86s/it] 33%|███▎ | 2911/8750 [51:01<9:30:44, 5.86s/it] {'loss': 0.478, 'learning_rate': 1.556858398734209e-05, 'epoch': 0.33} + 33%|███▎ | 2911/8750 [51:01<9:30:44, 5.86s/it] {'loss': 0.478, 'learning_rate': 1.556858398734209e-05, 'epoch': 0.33} + 33%|███▎ | 2911/8750 [50:58<9:30:44, 5.86s/it] 33%|███▎ | 2912/8750 [51:05<9:43:24, 6.00s/it] 33%|███▎ | 2912/8750 [51:08<9:43:24, 6.00s/it] {'loss': 0.4641, 'learning_rate': 1.5565508985612525e-05, 'epoch': 0.33} + 33%|███▎ | 2912/8750 [51:08<9:43:24, 6.00s/it] {'loss': 0.4641, 
'learning_rate': 1.5565508985612525e-05, 'epoch': 0.33} + 33%|███▎ | 2912/8750 [51:05<9:43:24, 6.00s/it] 33%|███▎ | 2913/8750 [51:11<9:38:17, 5.94s/it] 33%|███▎ | 2913/8750 [51:13<9:38:17, 5.94s/it] {'loss': 0.4988, 'learning_rate': 1.556243322128397e-05, 'epoch': 0.33} + 33%|███▎ | 2913/8750 [51:13<9:38:17, 5.94s/it] {'loss': 0.4988, 'learning_rate': 1.556243322128397e-05, 'epoch': 0.33} + 33%|███▎ | 2913/8750 [51:11<9:38:17, 5.94s/it] 33%|███▎ | 2914/8750 [51:16<9:32:32, 5.89s/it] 33%|███▎ | 2914/8750 [51:19<9:32:32, 5.89s/it] {'loss': 0.4682, 'learning_rate': 1.5559356694777882e-05, 'epoch': 0.33} + 33%|███▎ | 2914/8750 [51:19<9:32:32, 5.89s/it] {'loss': 0.4682, 'learning_rate': 1.5559356694777882e-05, 'epoch': 0.33} + 33%|███▎ | 2914/8750 [51:16<9:32:32, 5.89s/it] 33%|███▎ | 2915/8750 [51:22<9:29:36, 5.86s/it] 33%|███▎ | 2915/8750 [51:25<9:29:36, 5.86s/it] {'loss': 0.4605, 'learning_rate': 1.5556279406515802e-05, 'epoch': 0.33} + 33%|███▎ | 2915/8750 [51:25<9:29:36, 5.86s/it] {'loss': 0.4605, 'learning_rate': 1.5556279406515802e-05, 'epoch': 0.33} + 33%|███▎ | 2915/8750 [51:22<9:29:36, 5.86s/it] 33%|███▎ | 2916/8750 [51:28<9:33:00, 5.89s/it] 33%|███▎ | 2916/8750 [51:31<9:33:00, 5.89s/it] {'loss': 0.4572, 'learning_rate': 1.5553201356919394e-05, 'epoch': 0.33} + 33%|███▎ | 2916/8750 [51:31<9:33:00, 5.89s/it] {'loss': 0.4572, 'learning_rate': 1.5553201356919394e-05, 'epoch': 0.33} + 33%|███▎ | 2916/8750 [51:28<9:33:00, 5.89s/it] 33%|███▎ | 2917/8750 [51:34<9:32:14, 5.89s/it] 33%|███▎ | 2917/8750 [51:37<9:32:14, 5.89s/it] {'loss': 0.4649, 'learning_rate': 1.555012254641042e-05, 'epoch': 0.33} + 33%|███▎ | 2917/8750 [51:37<9:32:14, 5.89s/it] {'loss': 0.4649, 'learning_rate': 1.555012254641042e-05, 'epoch': 0.33} + 33%|███▎ | 2917/8750 [51:34<9:32:14, 5.89s/it] 33%|███▎ | 2918/8750 [51:40<9:26:59, 5.83s/it] 33%|███▎ | 2918/8750 [51:43<9:26:59, 5.83s/it] {'loss': 0.4938, 'learning_rate': 1.554704297541074e-05, 'epoch': 0.33} + 33%|███▎ | 2918/8750 [51:43<9:26:59, 
5.83s/it] {'loss': 0.4938, 'learning_rate': 1.554704297541074e-05, 'epoch': 0.33} + 33%|███▎ | 2918/8750 [51:40<9:26:59, 5.83s/it] 33%|███▎ | 2919/8750 [51:45<9:23:51, 5.80s/it] 33%|███▎ | 2919/8750 [51:48<9:23:51, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.5543962644342335e-05, 'epoch': 0.33} + 33%|███▎ | 2919/8750 [51:48<9:23:51, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.5543962644342335e-05, 'epoch': 0.33} + 33%|███▎ | 2919/8750 [51:45<9:23:51, 5.80s/it] 33%|███▎ | 2920/8750 [51:51<9:31:03, 5.88s/it] 33%|███▎ | 2920/8750 [51:54<9:31:02, 5.88s/it] {'loss': 0.4628, 'learning_rate': 1.5540881553627264e-05, 'epoch': 0.33} + 33%|███▎ | 2920/8750 [51:54<9:31:02, 5.88s/it] {'loss': 0.4628, 'learning_rate': 1.5540881553627264e-05, 'epoch': 0.33} + 33%|███▎ | 2920/8750 [51:51<9:31:03, 5.88s/it] 33%|███▎ | 2921/8750 [51:57<9:26:07, 5.83s/it] 33%|███▎ | 2921/8750 [52:00<9:26:07, 5.83s/it] {'loss': 0.4738, 'learning_rate': 1.553779970368772e-05, 'epoch': 0.33} + 33%|███▎ | 2921/8750 [52:00<9:26:07, 5.83s/it] {'loss': 0.4738, 'learning_rate': 1.553779970368772e-05, 'epoch': 0.33} + 33%|███▎ | 2921/8750 [51:57<9:26:07, 5.83s/it] 33%|███▎ | 2922/8750 [52:03<9:21:01, 5.78s/it] 33%|███▎ | 2922/8750 [52:06<9:21:01, 5.78s/it] {'loss': 0.4807, 'learning_rate': 1.553471709494598e-05, 'epoch': 0.33} + 33%|███▎ | 2922/8750 [52:06<9:21:01, 5.78s/it] {'loss': 0.4807, 'learning_rate': 1.553471709494598e-05, 'epoch': 0.33} + 33%|███▎ | 2922/8750 [52:03<9:21:01, 5.78s/it] 33%|███▎ | 2923/8750 [52:09<9:22:06, 5.79s/it] 33%|███▎ | 2923/8750 [52:11<9:22:06, 5.79s/it] {'loss': 0.456, 'learning_rate': 1.5531633727824423e-05, 'epoch': 0.33} + 33%|███▎ | 2923/8750 [52:11<9:22:06, 5.79s/it] {'loss': 0.456, 'learning_rate': 1.5531633727824423e-05, 'epoch': 0.33} + 33%|███▎ | 2923/8750 [52:09<9:22:06, 5.79s/it] 33%|███▎ | 2924/8750 [52:14<9:22:03, 5.79s/it] 33%|███▎ | 2924/8750 [52:17<9:22:03, 5.79s/it] {'loss': 0.4865, 'learning_rate': 1.5528549602745545e-05, 'epoch': 0.33} + 33%|███▎ | 
2924/8750 [52:17<9:22:03, 5.79s/it] {'loss': 0.4865, 'learning_rate': 1.5528549602745545e-05, 'epoch': 0.33} + 33%|███▎ | 2924/8750 [52:14<9:22:03, 5.79s/it] 33%|███▎ | 2925/8750 [52:20<9:19:44, 5.77s/it] 33%|███▎ | 2925/8750 [52:23<9:19:44, 5.77s/it] {'loss': 0.4608, 'learning_rate': 1.5525464720131945e-05, 'epoch': 0.33} + 33%|███▎ | 2925/8750 [52:23<9:19:44, 5.77s/it] {'loss': 0.4608, 'learning_rate': 1.5525464720131945e-05, 'epoch': 0.33} + 33%|███▎ | 2925/8750 [52:20<9:19:44, 5.77s/it] 33%|███▎ | 2926/8750 [52:26<9:18:31, 5.75s/it] 33%|███▎ | 2926/8750 [52:29<9:18:31, 5.75s/it] {'loss': 0.4985, 'learning_rate': 1.5522379080406315e-05, 'epoch': 0.33} + 33%|███▎ | 2926/8750 [52:29<9:18:31, 5.75s/it] {'loss': 0.4985, 'learning_rate': 1.5522379080406315e-05, 'epoch': 0.33} + 33%|███▎ | 2926/8750 [52:26<9:18:31, 5.75s/it] 33%|███▎ | 2927/8750 [52:32<9:20:47, 5.78s/it] 33%|███▎ | 2927/8750 [52:35<9:20:48, 5.78s/it] {'loss': 0.4733, 'learning_rate': 1.5519292683991455e-05, 'epoch': 0.33} + 33%|███▎ | 2927/8750 [52:35<9:20:48, 5.78s/it] {'loss': 0.4733, 'learning_rate': 1.5519292683991455e-05, 'epoch': 0.33} + 33%|███▎ | 2927/8750 [52:32<9:20:47, 5.78s/it] 33%|███▎ | 2928/8750 [52:37<9:19:03, 5.76s/it] 33%|███▎ | 2928/8750 [52:40<9:19:03, 5.76s/it] {'loss': 0.4852, 'learning_rate': 1.5516205531310272e-05, 'epoch': 0.33} + 33%|███▎ | 2928/8750 [52:40<9:19:03, 5.76s/it] {'loss': 0.4852, 'learning_rate': 1.5516205531310272e-05, 'epoch': 0.33} + 33%|███▎ | 2928/8750 [52:37<9:19:03, 5.76s/it] 33%|███▎ | 2929/8750 [52:43<9:18:58, 5.76s/it] 33%|███▎ | 2929/8750 [52:46<9:18:58, 5.76s/it] {'loss': 0.4641, 'learning_rate': 1.5513117622785778e-05, 'epoch': 0.33} + 33%|███▎ | 2929/8750 [52:46<9:18:58, 5.76s/it] {'loss': 0.4641, 'learning_rate': 1.5513117622785778e-05, 'epoch': 0.33} + 33%|███▎ | 2929/8750 [52:43<9:18:58, 5.76s/it] 33%|███▎ | 2930/8750 [52:49<9:23:36, 5.81s/it] 33%|███▎ | 2930/8750 [52:52<9:23:36, 5.81s/it] {'loss': 0.4682, 'learning_rate': 1.5510028958841085e-05, 
'epoch': 0.33} + 33%|███▎ | 2930/8750 [52:52<9:23:36, 5.81s/it] {'loss': 0.4682, 'learning_rate': 1.5510028958841085e-05, 'epoch': 0.33} + 33%|███▎ | 2930/8750 [52:49<9:23:36, 5.81s/it] 33%|███▎ | 2931/8750 [52:55<9:18:43, 5.76s/it] 33%|███▎ | 2931/8750 [52:58<9:18:43, 5.76s/it] {'loss': 0.4544, 'learning_rate': 1.5506939539899403e-05, 'epoch': 0.33} + 33%|███▎ | 2931/8750 [52:58<9:18:43, 5.76s/it] {'loss': 0.4544, 'learning_rate': 1.5506939539899403e-05, 'epoch': 0.33} + 33%|███▎ | 2931/8750 [52:55<9:18:43, 5.76s/it] 34%|███▎ | 2932/8750 [53:01<9:22:50, 5.80s/it] 34%|███▎ | 2932/8750 [53:04<9:22:50, 5.80s/it] {'loss': 0.4609, 'learning_rate': 1.5503849366384053e-05, 'epoch': 0.34} + 34%|███▎ | 2932/8750 [53:04<9:22:50, 5.80s/it] {'loss': 0.4609, 'learning_rate': 1.5503849366384053e-05, 'epoch': 0.34} + 34%|███▎ | 2932/8750 [53:01<9:22:50, 5.80s/it] 34%|███▎ | 2933/8750 [53:06<9:22:20, 5.80s/it] 34%|███▎ | 2933/8750 [53:09<9:22:19, 5.80s/it] {'loss': 0.4947, 'learning_rate': 1.5500758438718463e-05, 'epoch': 0.34} + 34%|███▎ | 2933/8750 [53:09<9:22:19, 5.80s/it] {'loss': 0.4947, 'learning_rate': 1.5500758438718463e-05, 'epoch': 0.34} + 34%|███▎ | 2933/8750 [53:06<9:22:20, 5.80s/it] 34%|███▎ | 2934/8750 [53:12<9:20:50, 5.79s/it] 34%|███▎ | 2934/8750 [53:15<9:20:50, 5.79s/it] {'loss': 0.4783, 'learning_rate': 1.5497666757326157e-05, 'epoch': 0.34} + 34%|███▎ | 2934/8750 [53:15<9:20:50, 5.79s/it] {'loss': 0.4783, 'learning_rate': 1.5497666757326157e-05, 'epoch': 0.34} + 34%|███▎ | 2934/8750 [53:12<9:20:50, 5.79s/it] 34%|███▎ | 2935/8750 [53:18<9:16:44, 5.74s/it] 34%|███▎ | 2935/8750 [53:21<9:16:44, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.5494574322630765e-05, 'epoch': 0.34} + 34%|███▎ | 2935/8750 [53:21<9:16:44, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.5494574322630765e-05, 'epoch': 0.34} + 34%|███▎ | 2935/8750 [53:18<9:16:44, 5.74s/it] 34%|███▎ | 2936/8750 [53:23<9:13:54, 5.72s/it] 34%|███▎ | 2936/8750 [53:26<9:13:54, 5.72s/it] {'loss': 0.5003, 
'learning_rate': 1.5491481135056012e-05, 'epoch': 0.34} + 34%|███▎ | 2936/8750 [53:26<9:13:54, 5.72s/it] {'loss': 0.5003, 'learning_rate': 1.5491481135056012e-05, 'epoch': 0.34} + 34%|███▎ | 2936/8750 [53:23<9:13:54, 5.72s/it] 34%|███▎ | 2937/8750 [53:29<9:21:46, 5.80s/it] 34%|███▎ | 2937/8750 [53:32<9:21:46, 5.80s/it] {'loss': 0.4549, 'learning_rate': 1.5488387195025745e-05, 'epoch': 0.34} + 34%|███▎ | 2937/8750 [53:32<9:21:46, 5.80s/it] {'loss': 0.4549, 'learning_rate': 1.5488387195025745e-05, 'epoch': 0.34} + 34%|███▎ | 2937/8750 [53:29<9:21:46, 5.80s/it] 34%|███▎ | 2938/8750 [53:35<9:18:30, 5.77s/it] 34%|███▎ | 2938/8750 [53:38<9:18:30, 5.77s/it] {'loss': 0.4735, 'learning_rate': 1.5485292502963892e-05, 'epoch': 0.34} + 34%|███▎ | 2938/8750 [53:38<9:18:30, 5.77s/it] {'loss': 0.4735, 'learning_rate': 1.5485292502963892e-05, 'epoch': 0.34} + 34%|███▎ | 2938/8750 [53:35<9:18:30, 5.77s/it] 34%|███▎ | 2939/8750 [53:41<9:16:02, 5.74s/it] 34%|███▎ | 2939/8750 [53:44<9:16:02, 5.74s/it] {'loss': 0.4817, 'learning_rate': 1.548219705929451e-05, 'epoch': 0.34} + 34%|███▎ | 2939/8750 [53:44<9:16:02, 5.74s/it] {'loss': 0.4817, 'learning_rate': 1.548219705929451e-05, 'epoch': 0.34} + 34%|███▎ | 2939/8750 [53:41<9:16:02, 5.74s/it] 34%|███▎ | 2940/8750 [53:47<9:27:25, 5.86s/it] 34%|███▎ | 2940/8750 [53:50<9:27:24, 5.86s/it] {'loss': 0.4679, 'learning_rate': 1.5479100864441726e-05, 'epoch': 0.34} + 34%|███▎ | 2940/8750 [53:50<9:27:24, 5.86s/it] {'loss': 0.4679, 'learning_rate': 1.5479100864441726e-05, 'epoch': 0.34} + 34%|███▎ | 2940/8750 [53:47<9:27:25, 5.86s/it] 34%|███▎ | 2941/8750 [53:53<9:23:47, 5.82s/it] 34%|███▎ | 2941/8750 [53:56<9:23:47, 5.82s/it] {'loss': 0.4699, 'learning_rate': 1.54760039188298e-05, 'epoch': 0.34} + 34%|███▎ | 2941/8750 [53:56<9:23:47, 5.82s/it] {'loss': 0.4699, 'learning_rate': 1.54760039188298e-05, 'epoch': 0.34} + 34%|███▎ | 2941/8750 [53:53<9:23:47, 5.82s/it] 34%|███▎ | 2942/8750 [53:59<9:32:55, 5.92s/it] 34%|███▎ | 2942/8750 [54:02<9:32:56, 
5.92s/it] {'loss': 0.4791, 'learning_rate': 1.5472906222883075e-05, 'epoch': 0.34} + 34%|███▎ | 2942/8750 [54:02<9:32:56, 5.92s/it] {'loss': 0.4791, 'learning_rate': 1.5472906222883075e-05, 'epoch': 0.34} + 34%|███▎ | 2942/8750 [53:59<9:32:55, 5.92s/it] 34%|███▎ | 2943/8750 [54:05<9:28:46, 5.88s/it] 34%|███▎ | 2943/8750 [54:08<9:28:46, 5.88s/it] {'loss': 0.4749, 'learning_rate': 1.5469807777026014e-05, 'epoch': 0.34} + 34%|███▎ | 2943/8750 [54:08<9:28:46, 5.88s/it] {'loss': 0.4749, 'learning_rate': 1.5469807777026014e-05, 'epoch': 0.34} + 34%|███▎ | 2943/8750 [54:05<9:28:46, 5.88s/it] 34%|███▎ | 2944/8750 [54:10<9:20:06, 5.79s/it] 34%|███▎ | 2944/8750 [54:13<9:20:07, 5.79s/it] {'loss': 0.4845, 'learning_rate': 1.5466708581683164e-05, 'epoch': 0.34} + 34%|███▎ | 2944/8750 [54:13<9:20:07, 5.79s/it] {'loss': 0.4845, 'learning_rate': 1.5466708581683164e-05, 'epoch': 0.34} + 34%|███▎ | 2944/8750 [54:10<9:20:06, 5.79s/it] 34%|███▎ | 2945/8750 [54:19<9:15:04, 5.74s/it] 34%|███▎ | 2945/8750 [54:16<9:15:06, 5.74s/it] {'loss': 0.4682, 'learning_rate': 1.546360863727919e-05, 'epoch': 0.34} + 34%|███▎ | 2945/8750 [54:19<9:15:04, 5.74s/it] {'loss': 0.4682, 'learning_rate': 1.546360863727919e-05, 'epoch': 0.34} + 34%|███▎ | 2945/8750 [54:16<9:15:06, 5.74s/it] 34%|███▎ | 2946/8750 [54:22<9:17:27, 5.76s/it] 34%|███▎ | 2946/8750 [54:25<9:17:27, 5.76s/it] {'loss': 0.4739, 'learning_rate': 1.546050794423885e-05, 'epoch': 0.34} + 34%|███▎ | 2946/8750 [54:25<9:17:27, 5.76s/it] {'loss': 0.4739, 'learning_rate': 1.546050794423885e-05, 'epoch': 0.34} + 34%|███▎ | 2946/8750 [54:22<9:17:27, 5.76s/it] 34%|███▎ | 2947/8750 [54:28<9:20:45, 5.80s/it] 34%|███▎ | 2947/8750 [54:30<9:20:46, 5.80s/it] {'loss': 0.4703, 'learning_rate': 1.5457406502987007e-05, 'epoch': 0.34} + 34%|███▎ | 2947/8750 [54:30<9:20:46, 5.80s/it] {'loss': 0.4703, 'learning_rate': 1.5457406502987007e-05, 'epoch': 0.34} + 34%|███▎ | 2947/8750 [54:28<9:20:45, 5.80s/it] 34%|███▎ | 2948/8750 [54:33<9:18:45, 5.78s/it] 34%|███▎ | 
2948/8750 [54:36<9:18:44, 5.78s/it] {'loss': 0.486, 'learning_rate': 1.5454304313948635e-05, 'epoch': 0.34} + 34%|███▎ | 2948/8750 [54:36<9:18:44, 5.78s/it] {'loss': 0.486, 'learning_rate': 1.5454304313948635e-05, 'epoch': 0.34} + 34%|███▎ | 2948/8750 [54:33<9:18:45, 5.78s/it] 34%|███▎ | 2949/8750 [54:39<9:14:59, 5.74s/it] 34%|███▎ | 2949/8750 [54:42<9:14:59, 5.74s/it] {'loss': 0.4633, 'learning_rate': 1.5451201377548793e-05, 'epoch': 0.34} + 34%|███▎ | 2949/8750 [54:42<9:14:59, 5.74s/it] {'loss': 0.4633, 'learning_rate': 1.5451201377548793e-05, 'epoch': 0.34} + 34%|███▎ | 2949/8750 [54:39<9:14:59, 5.74s/it]1114 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +2 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +0 6AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 34%|███▎ | 2950/8750 [54:45<9:16:16, 5.75s/it]13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 34%|███▎ | 2950/8750 [54:48<9:16:16, 5.75s/it]7 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4788, 'learning_rate': 1.5448097694212663e-05, 'epoch': 0.34} + 34%|███▎ | 2950/8750 [54:48<9:16:16, 5.75s/it] {'loss': 0.4788, 'learning_rate': 1.5448097694212663e-05, 'epoch': 0.34} + 34%|███▎ | 2950/8750 [54:45<9:16:16, 5.75s/it] 34%|███▎ | 2951/8750 [54:50<9:12:11, 5.71s/it] 34%|███▎ | 2951/8750 [54:53<9:12:10, 5.71s/it] {'loss': 0.4686, 'learning_rate': 1.544499326436551e-05, 'epoch': 0.34} + 34%|███▎ | 2951/8750 [54:53<9:12:10, 5.71s/it] {'loss': 0.4686, 'learning_rate': 1.544499326436551e-05, 'epoch': 0.34} + 34%|███▎ | 2951/8750 [54:50<9:12:11, 5.71s/it] 34%|███▎ | 2952/8750 [54:56<9:13:40, 5.73s/it] 34%|███▎ | 2952/8750 [54:59<9:13:40, 5.73s/it] {'loss': 0.482, 'learning_rate': 1.5441888088432716e-05, 'epoch': 0.34} + 34%|███▎ | 2952/8750 [54:59<9:13:40, 5.73s/it] {'loss': 0.482, 'learning_rate': 1.5441888088432716e-05, 'epoch': 0.34} + 34%|███▎ | 2952/8750 [54:56<9:13:40, 5.73s/it] 34%|███▎ | 2953/8750 [55:02<9:17:35, 5.77s/it] 34%|███▎ | 2953/8750 [55:05<9:17:35, 5.77s/it] {'loss': 0.4666, 'learning_rate': 1.5438782166839757e-05, 'epoch': 0.34} + 34%|███▎ | 2953/8750 [55:05<9:17:35, 5.77s/it] {'loss': 0.4666, 'learning_rate': 1.5438782166839757e-05, 'epoch': 0.34} + 34%|███▎ | 2953/8750 [55:02<9:17:35, 5.77s/it] 34%|███▍ | 2954/8750 [55:08<9:17:32, 5.77s/it] 34%|███▍ | 2954/8750 [55:11<9:17:32, 5.77s/it] {'loss': 0.4929, 'learning_rate': 1.5435675500012212e-05, 'epoch': 0.34} + 34%|███▍ | 2954/8750 [55:11<9:17:32, 5.77s/it] {'loss': 0.4929, 'learning_rate': 1.5435675500012212e-05, 'epoch': 0.34} + 34%|███▍ | 2954/8750 [55:08<9:17:32, 5.77s/it] 34%|███▍ | 2955/8750 [55:14<9:18:23, 5.78s/it] 34%|███▍ | 2955/8750 [55:16<9:18:24, 5.78s/it] {'loss': 0.4673, 'learning_rate': 1.5432568088375766e-05, 'epoch': 0.34} + 34%|███▍ | 2955/8750 [55:16<9:18:24, 5.78s/it] {'loss': 0.4673, 'learning_rate': 1.5432568088375766e-05, 'epoch': 0.34} + 34%|███▍ | 2955/8750 [55:14<9:18:23, 5.78s/it] 34%|███▍ | 2956/8750 [55:19<9:19:54, 5.80s/it] 34%|███▍ | 2956/8750 
[55:22<9:19:53, 5.80s/it] {'loss': 0.4798, 'learning_rate': 1.542945993235621e-05, 'epoch': 0.34} + 34%|███▍ | 2956/8750 [55:22<9:19:53, 5.80s/it] {'loss': 0.4798, 'learning_rate': 1.542945993235621e-05, 'epoch': 0.34} + 34%|███▍ | 2956/8750 [55:19<9:19:54, 5.80s/it] 34%|███▍ | 2957/8750 [55:25<9:23:44, 5.84s/it] 34%|███▍ | 2957/8750 [55:28<9:23:44, 5.84s/it] {'loss': 0.4764, 'learning_rate': 1.5426351032379418e-05, 'epoch': 0.34} + 34%|███▍ | 2957/8750 [55:28<9:23:44, 5.84s/it] {'loss': 0.4764, 'learning_rate': 1.5426351032379418e-05, 'epoch': 0.34} + 34%|███▍ | 2957/8750 [55:25<9:23:44, 5.84s/it] 34%|███▍ | 2958/8750 [55:31<9:18:53, 5.79s/it] 34%|███▍ | 2958/8750 [55:34<9:18:54, 5.79s/it] {'loss': 0.4605, 'learning_rate': 1.5423241388871383e-05, 'epoch': 0.34} + 34%|███▍ | 2958/8750 [55:34<9:18:54, 5.79s/it] {'loss': 0.4605, 'learning_rate': 1.5423241388871383e-05, 'epoch': 0.34} + 34%|███▍ | 2958/8750 [55:31<9:18:53, 5.79s/it] 34%|███▍ | 2959/8750 [55:37<9:14:54, 5.75s/it] 34%|███▍ | 2959/8750 [55:40<9:14:53, 5.75s/it] {'loss': 0.4917, 'learning_rate': 1.54201310022582e-05, 'epoch': 0.34} + 34%|███▍ | 2959/8750 [55:40<9:14:53, 5.75s/it] {'loss': 0.4917, 'learning_rate': 1.54201310022582e-05, 'epoch': 0.34} + 34%|███▍ | 2959/8750 [55:37<9:14:54, 5.75s/it] 34%|███▍ | 2960/8750 [55:42<9:16:03, 5.76s/it] 34%|███▍ | 2960/8750 [55:45<9:16:03, 5.76s/it] {'loss': 0.468, 'learning_rate': 1.541701987296606e-05, 'epoch': 0.34} + 34%|███▍ | 2960/8750 [55:45<9:16:03, 5.76s/it] {'loss': 0.468, 'learning_rate': 1.541701987296606e-05, 'epoch': 0.34} + 34%|███▍ | 2960/8750 [55:42<9:16:03, 5.76s/it] 34%|███▍ | 2961/8750 [55:48<9:19:01, 5.79s/it] 34%|███▍ | 2961/8750 [55:51<9:19:01, 5.79s/it] {'loss': 0.4596, 'learning_rate': 1.5413908001421257e-05, 'epoch': 0.34} + 34%|███▍ | 2961/8750 [55:51<9:19:01, 5.79s/it] {'loss': 0.4596, 'learning_rate': 1.5413908001421257e-05, 'epoch': 0.34} + 34%|███▍ | 2961/8750 [55:48<9:19:01, 5.79s/it] 34%|███▍ | 2962/8750 [55:54<9:19:02, 5.80s/it] 
34%|███▍ | 2962/8750 [55:57<9:19:02, 5.80s/it] {'loss': 0.488, 'learning_rate': 1.5410795388050182e-05, 'epoch': 0.34} + 34%|███▍ | 2962/8750 [55:57<9:19:02, 5.80s/it] {'loss': 0.488, 'learning_rate': 1.5410795388050182e-05, 'epoch': 0.34} + 34%|███▍ | 2962/8750 [55:54<9:19:02, 5.80s/it] 34%|███▍ | 2963/8750 [56:00<9:15:31, 5.76s/it] 34%|███▍ | 2963/8750 [56:03<9:15:32, 5.76s/it] {'loss': 0.4573, 'learning_rate': 1.540768203327934e-05, 'epoch': 0.34} + 34%|███▍ | 2963/8750 [56:03<9:15:32, 5.76s/it] {'loss': 0.4573, 'learning_rate': 1.540768203327934e-05, 'epoch': 0.34} + 34%|███▍ | 2963/8750 [56:00<9:15:31, 5.76s/it] 34%|███▍ | 2964/8750 [56:06<9:15:53, 5.76s/it] 34%|███▍ | 2964/8750 [56:08<9:15:52, 5.76s/it] {'loss': 0.4672, 'learning_rate': 1.5404567937535326e-05, 'epoch': 0.34} + 34%|███▍ | 2964/8750 [56:08<9:15:52, 5.76s/it] {'loss': 0.4672, 'learning_rate': 1.5404567937535326e-05, 'epoch': 0.34} + 34%|███▍ | 2964/8750 [56:06<9:15:53, 5.76s/it] 34%|███▍ | 2965/8750 [56:11<9:12:34, 5.73s/it] 34%|███▍ | 2965/8750 [56:14<9:12:34, 5.73s/it] {'loss': 0.4718, 'learning_rate': 1.540145310124484e-05, 'epoch': 0.34} + 34%|███▍ | 2965/8750 [56:14<9:12:34, 5.73s/it] {'loss': 0.4718, 'learning_rate': 1.540145310124484e-05, 'epoch': 0.34} + 34%|███▍ | 2965/8750 [56:11<9:12:34, 5.73s/it] 34%|███▍ | 2966/8750 [56:17<9:14:35, 5.75s/it] 34%|███▍ | 2966/8750 [56:20<9:14:35, 5.75s/it] {'loss': 0.4789, 'learning_rate': 1.5398337524834688e-05, 'epoch': 0.34} + 34%|███▍ | 2966/8750 [56:20<9:14:35, 5.75s/it] {'loss': 0.4789, 'learning_rate': 1.5398337524834688e-05, 'epoch': 0.34} + 34%|███▍ | 2966/8750 [56:17<9:14:35, 5.75s/it] 34%|███▍ | 2967/8750 [56:23<9:12:46, 5.74s/it] 34%|███▍ | 2967/8750 [56:26<9:12:45, 5.74s/it] {'loss': 0.4671, 'learning_rate': 1.5395221208731766e-05, 'epoch': 0.34} + 34%|███▍ | 2967/8750 [56:26<9:12:45, 5.74s/it] {'loss': 0.4671, 'learning_rate': 1.5395221208731766e-05, 'epoch': 0.34} + 34%|███▍ | 2967/8750 [56:23<9:12:46, 5.74s/it] 34%|███▍ | 2968/8750 
[56:28<9:08:50, 5.70s/it] 34%|███▍ | 2968/8750 [56:31<9:08:51, 5.70s/it] {'loss': 0.4992, 'learning_rate': 1.5392104153363086e-05, 'epoch': 0.34} + 34%|███▍ | 2968/8750 [56:31<9:08:51, 5.70s/it] {'loss': 0.4992, 'learning_rate': 1.5392104153363086e-05, 'epoch': 0.34} + 34%|███▍ | 2968/8750 [56:28<9:08:50, 5.70s/it] 34%|███▍ | 2969/8750 [56:37<9:14:35, 5.76s/it] 34%|███▍ | 2969/8750 [56:34<9:14:36, 5.76s/it] {'loss': 0.4666, 'learning_rate': 1.538898635915576e-05, 'epoch': 0.34} + 34%|███▍ | 2969/8750 [56:37<9:14:35, 5.76s/it] {'loss': 0.4666, 'learning_rate': 1.538898635915576e-05, 'epoch': 0.34} + 34%|███▍ | 2969/8750 [56:34<9:14:36, 5.76s/it] 34%|███▍ | 2970/8750 [56:40<9:17:30, 5.79s/it] 34%|███▍ | 2970/8750 [56:43<9:17:30, 5.79s/it] {'loss': 0.486, 'learning_rate': 1.5385867826536977e-05, 'epoch': 0.34} + 34%|███▍ | 2970/8750 [56:43<9:17:30, 5.79s/it] {'loss': 0.486, 'learning_rate': 1.5385867826536977e-05, 'epoch': 0.34} + 34%|███▍ | 2970/8750 [56:40<9:17:30, 5.79s/it] 34%|███▍ | 2971/8750 [56:46<9:14:25, 5.76s/it] 34%|███▍ | 2971/8750 [56:49<9:14:26, 5.76s/it] {'loss': 0.4593, 'learning_rate': 1.5382748555934058e-05, 'epoch': 0.34} + 34%|███▍ | 2971/8750 [56:49<9:14:26, 5.76s/it] {'loss': 0.4593, 'learning_rate': 1.5382748555934058e-05, 'epoch': 0.34} + 34%|███▍ | 2971/8750 [56:46<9:14:25, 5.76s/it] 34%|███▍ | 2972/8750 [56:51<9:14:09, 5.75s/it] 34%|███▍ | 2972/8750 [56:54<9:14:08, 5.75s/it] {'loss': 0.4736, 'learning_rate': 1.5379628547774412e-05, 'epoch': 0.34} + 34%|███▍ | 2972/8750 [56:54<9:14:08, 5.75s/it] {'loss': 0.4736, 'learning_rate': 1.5379628547774412e-05, 'epoch': 0.34} + 34%|███▍ | 2972/8750 [56:51<9:14:09, 5.75s/it] 34%|███▍ | 2973/8750 [56:57<9:15:27, 5.77s/it] 34%|███▍ | 2973/8750 [57:00<9:15:27, 5.77s/it] {'loss': 0.4685, 'learning_rate': 1.5376507802485547e-05, 'epoch': 0.34} + 34%|███▍ | 2973/8750 [57:00<9:15:27, 5.77s/it] {'loss': 0.4685, 'learning_rate': 1.5376507802485547e-05, 'epoch': 0.34} + 34%|███▍ | 2973/8750 [56:57<9:15:27, 
5.77s/it] 34%|███▍ | 2974/8750 [57:03<9:17:07, 5.79s/it] 34%|███▍ | 2974/8750 [57:06<9:17:08, 5.79s/it] {'loss': 0.4572, 'learning_rate': 1.537338632049508e-05, 'epoch': 0.34} + 34%|███▍ | 2974/8750 [57:06<9:17:08, 5.79s/it] {'loss': 0.4572, 'learning_rate': 1.537338632049508e-05, 'epoch': 0.34} + 34%|███▍ | 2974/8750 [57:03<9:17:07, 5.79s/it] 34%|███▍ | 2975/8750 [57:09<9:15:05, 5.77s/it] 34%|███▍ | 2975/8750 [57:12<9:15:04, 5.77s/it] {'loss': 0.4599, 'learning_rate': 1.5370264102230716e-05, 'epoch': 0.34} + 34%|███▍ | 2975/8750 [57:12<9:15:04, 5.77s/it] {'loss': 0.4599, 'learning_rate': 1.5370264102230716e-05, 'epoch': 0.34} + 34%|███▍ | 2975/8750 [57:09<9:15:05, 5.77s/it] 34%|███▍ | 2976/8750 [57:15<9:16:21, 5.78s/it] 34%|███▍ | 2976/8750 [57:18<9:16:22, 5.78s/it] {'loss': 0.4735, 'learning_rate': 1.5367141148120275e-05, 'epoch': 0.34} + 34%|███▍ | 2976/8750 [57:18<9:16:22, 5.78s/it] {'loss': 0.4735, 'learning_rate': 1.5367141148120275e-05, 'epoch': 0.34} + 34%|███▍ | 2976/8750 [57:15<9:16:21, 5.78s/it] 34%|███▍ | 2977/8750 [57:20<9:13:44, 5.76s/it] 34%|███▍ | 2977/8750 [57:23<9:13:43, 5.75s/it] {'loss': 0.476, 'learning_rate': 1.5364017458591668e-05, 'epoch': 0.34} + 34%|███▍ | 2977/8750 [57:23<9:13:43, 5.75s/it] {'loss': 0.476, 'learning_rate': 1.5364017458591668e-05, 'epoch': 0.34} + 34%|███▍ | 2977/8750 [57:20<9:13:44, 5.76s/it] 34%|███▍ | 2978/8750 [57:26<9:12:26, 5.74s/it] 34%|███▍ | 2978/8750 [57:29<9:12:27, 5.74s/it] {'loss': 0.4663, 'learning_rate': 1.536089303407291e-05, 'epoch': 0.34} + 34%|███▍ | 2978/8750 [57:29<9:12:27, 5.74s/it] {'loss': 0.4663, 'learning_rate': 1.536089303407291e-05, 'epoch': 0.34} + 34%|███▍ | 2978/8750 [57:26<9:12:26, 5.74s/it] 34%|███▍ | 2979/8750 [57:32<9:18:57, 5.81s/it] 34%|███▍ | 2979/8750 [57:35<9:18:57, 5.81s/it] {'loss': 0.4701, 'learning_rate': 1.535776787499212e-05, 'epoch': 0.34} + 34%|███▍ | 2979/8750 [57:35<9:18:57, 5.81s/it] {'loss': 0.4701, 'learning_rate': 1.535776787499212e-05, 'epoch': 0.34} + 34%|███▍ | 
2979/8750 [57:32<9:18:57, 5.81s/it] 34%|███▍ | 2980/8750 [57:38<9:15:56, 5.78s/it] 34%|███▍ | 2980/8750 [57:41<9:15:55, 5.78s/it] {'loss': 0.483, 'learning_rate': 1.5354641981777514e-05, 'epoch': 0.34} + 34%|███▍ | 2980/8750 [57:41<9:15:55, 5.78s/it] {'loss': 0.483, 'learning_rate': 1.5354641981777514e-05, 'epoch': 0.34} + 34%|███▍ | 2980/8750 [57:38<9:15:56, 5.78s/it] 34%|███▍ | 2981/8750 [57:44<9:20:12, 5.83s/it] 34%|███▍ | 2981/8750 [57:47<9:20:12, 5.83s/it] {'loss': 0.4585, 'learning_rate': 1.5351515354857404e-05, 'epoch': 0.34} + 34%|███▍ | 2981/8750 [57:47<9:20:12, 5.83s/it] {'loss': 0.4585, 'learning_rate': 1.5351515354857404e-05, 'epoch': 0.34} + 34%|███▍ | 2981/8750 [57:44<9:20:12, 5.83s/it] 34%|███▍ | 2982/8750 [57:50<9:25:27, 5.88s/it] 34%|███▍ | 2982/8750 [57:53<9:25:28, 5.88s/it] {'loss': 0.4679, 'learning_rate': 1.5348387994660214e-05, 'epoch': 0.34} + 34%|███▍ | 2982/8750 [57:53<9:25:28, 5.88s/it] {'loss': 0.4679, 'learning_rate': 1.5348387994660214e-05, 'epoch': 0.34} + 34%|███▍ | 2982/8750 [57:50<9:25:27, 5.88s/it] 34%|███▍ | 2983/8750 [57:58<9:25:58, 5.89s/it] 34%|███▍ | 2983/8750 [57:56<9:25:59, 5.89s/it] {'loss': 0.4726, 'learning_rate': 1.534525990161446e-05, 'epoch': 0.34} + 34%|███▍ | 2983/8750 [57:58<9:25:58, 5.89s/it] {'loss': 0.4726, 'learning_rate': 1.534525990161446e-05, 'epoch': 0.34} + 34%|███▍ | 2983/8750 [57:56<9:25:59, 5.89s/it] 34%|███▍ | 2984/8750 [58:01<9:20:48, 5.84s/it] 34%|███▍ | 2984/8750 [58:04<9:20:49, 5.84s/it] {'loss': 0.4763, 'learning_rate': 1.534213107614876e-05, 'epoch': 0.34} + {'loss': 0.4763, 'learning_rate': 1.534213107614876e-05, 'epoch': 0.34} 34%|███▍ | 2984/8750 [58:04<9:20:49, 5.84s/it] + 34%|███▍ | 2984/8750 [58:01<9:20:48, 5.84s/it] 34%|███▍ | 2985/8750 [58:07<9:15:56, 5.79s/it] 34%|███▍ | 2985/8750 [58:10<9:15:55, 5.79s/it] {'loss': 0.4865, 'learning_rate': 1.5339001518691833e-05, 'epoch': 0.34} + 34%|███▍ | 2985/8750 [58:10<9:15:55, 5.79s/it] {'loss': 0.4865, 'learning_rate': 1.5339001518691833e-05, 
'epoch': 0.34} + 34%|███▍ | 2985/8750 [58:07<9:15:56, 5.79s/it] 34%|███▍ | 2986/8750 [58:13<9:18:59, 5.82s/it] 34%|███▍ | 2986/8750 [58:16<9:18:59, 5.82s/it] {'loss': 0.4598, 'learning_rate': 1.5335871229672496e-05, 'epoch': 0.34} + 34%|███▍ | 2986/8750 [58:16<9:18:59, 5.82s/it] {'loss': 0.4598, 'learning_rate': 1.5335871229672496e-05, 'epoch': 0.34} + 34%|███▍ | 2986/8750 [58:13<9:18:59, 5.82s/it] 34%|███▍ | 2987/8750 [58:19<9:17:57, 5.81s/it] 34%|███▍ | 2987/8750 [58:22<9:17:57, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.5332740209519674e-05, 'epoch': 0.34} + 34%|███▍ | 2987/8750 [58:22<9:17:57, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.5332740209519674e-05, 'epoch': 0.34} + 34%|███▍ | 2987/8750 [58:19<9:17:57, 5.81s/it] 34%|███▍ | 2988/8750 [58:24<9:14:55, 5.78s/it] 34%|███▍ | 2988/8750 [58:27<9:14:55, 5.78s/it] {'loss': 0.4915, 'learning_rate': 1.5329608458662383e-05, 'epoch': 0.34} + 34%|███▍ | 2988/8750 [58:27<9:14:55, 5.78s/it] {'loss': 0.4915, 'learning_rate': 1.5329608458662383e-05, 'epoch': 0.34} + 34%|███▍ | 2988/8750 [58:24<9:14:55, 5.78s/it] 34%|███▍ | 2989/8750 [58:30<9:24:11, 5.88s/it] 34%|███▍ | 2989/8750 [58:33<9:24:11, 5.88s/it] {'loss': 0.4633, 'learning_rate': 1.5326475977529745e-05, 'epoch': 0.34} + 34%|███▍ | 2989/8750 [58:33<9:24:11, 5.88s/it] {'loss': 0.4633, 'learning_rate': 1.5326475977529745e-05, 'epoch': 0.34} + 34%|███▍ | 2989/8750 [58:30<9:24:11, 5.88s/it] 34%|███▍ | 2990/8750 [58:36<9:20:56, 5.84s/it] 34%|███▍ | 2990/8750 [58:39<9:20:57, 5.84s/it] {'loss': 0.4808, 'learning_rate': 1.5323342766550978e-05, 'epoch': 0.34} + 34%|███▍ | 2990/8750 [58:39<9:20:57, 5.84s/it] {'loss': 0.4808, 'learning_rate': 1.5323342766550978e-05, 'epoch': 0.34} + 34%|███▍ | 2990/8750 [58:36<9:20:56, 5.84s/it] 34%|███▍ | 2991/8750 [58:45<9:13:48, 5.77s/it] 34%|███▍ | 2991/8750 [58:42<9:13:48, 5.77s/it] {'loss': 0.4753, 'learning_rate': 1.53202088261554e-05, 'epoch': 0.34} + 34%|███▍ | 2991/8750 [58:45<9:13:48, 5.77s/it] {'loss': 0.4753, 
'learning_rate': 1.53202088261554e-05, 'epoch': 0.34} + 34%|███▍ | 2991/8750 [58:42<9:13:48, 5.77s/it] 34%|███▍ | 2992/8750 [58:48<9:18:21, 5.82s/it] 34%|███▍ | 2992/8750 [58:51<9:18:22, 5.82s/it] {'loss': 0.4628, 'learning_rate': 1.5317074156772434e-05, 'epoch': 0.34} + 34%|███▍ | 2992/8750 [58:51<9:18:22, 5.82s/it] {'loss': 0.4628, 'learning_rate': 1.5317074156772434e-05, 'epoch': 0.34} + 34%|███▍ | 2992/8750 [58:48<9:18:21, 5.82s/it] 34%|███▍ | 2993/8750 [58:54<9:16:58, 5.80s/it] 34%|███▍ | 2993/8750 [58:56<9:16:58, 5.80s/it] {'loss': 0.4698, 'learning_rate': 1.5313938758831596e-05, 'epoch': 0.34} + 34%|███▍ | 2993/8750 [58:56<9:16:58, 5.80s/it] {'loss': 0.4698, 'learning_rate': 1.5313938758831596e-05, 'epoch': 0.34} + 34%|███▍ | 2993/8750 [58:54<9:16:58, 5.80s/it] 34%|███▍ | 2994/8750 [58:59<9:10:37, 5.74s/it] 34%|███▍ | 2994/8750 [59:02<9:10:38, 5.74s/it] {'loss': 0.4773, 'learning_rate': 1.531080263276251e-05, 'epoch': 0.34} + 34%|███▍ | 2994/8750 [59:02<9:10:38, 5.74s/it] {'loss': 0.4773, 'learning_rate': 1.531080263276251e-05, 'epoch': 0.34} + 34%|███▍ | 2994/8750 [58:59<9:10:37, 5.74s/it] 34%|███▍ | 2995/8750 [59:05<9:09:51, 5.73s/it] 34%|███▍ | 2995/8750 [59:08<9:09:52, 5.73s/it] {'loss': 0.4576, 'learning_rate': 1.5307665778994897e-05, 'epoch': 0.34} + 34%|███▍ | 2995/8750 [59:08<9:09:52, 5.73s/it] {'loss': 0.4576, 'learning_rate': 1.5307665778994897e-05, 'epoch': 0.34} + 34%|███▍ | 2995/8750 [59:05<9:09:51, 5.73s/it] 34%|███▍ | 2996/8750 [59:11<9:09:40, 5.73s/it] 34%|███▍ | 2996/8750 [59:13<9:09:40, 5.73s/it] {'loss': 0.4724, 'learning_rate': 1.5304528197958565e-05, 'epoch': 0.34} + 34%|███▍ | 2996/8750 [59:13<9:09:40, 5.73s/it] {'loss': 0.4724, 'learning_rate': 1.5304528197958565e-05, 'epoch': 0.34} + 34%|███▍ | 2996/8750 [59:11<9:09:40, 5.73s/it] 34%|███▍ | 2997/8750 [59:16<9:11:23, 5.75s/it] 34%|███▍ | 2997/8750 [59:19<9:11:23, 5.75s/it] {'loss': 0.4608, 'learning_rate': 1.5301389890083446e-05, 'epoch': 0.34} + 34%|███▍ | 2997/8750 [59:19<9:11:23, 
5.75s/it] {'loss': 0.4608, 'learning_rate': 1.5301389890083446e-05, 'epoch': 0.34} + 34%|███▍ | 2997/8750 [59:16<9:11:23, 5.75s/it] 34%|███▍ | 2998/8750 [59:22<9:17:51, 5.82s/it] 34%|███▍ | 2998/8750 [59:25<9:17:51, 5.82s/it] {'loss': 0.4734, 'learning_rate': 1.529825085579955e-05, 'epoch': 0.34} + 34%|███▍ | 2998/8750 [59:25<9:17:51, 5.82s/it] {'loss': 0.4734, 'learning_rate': 1.529825085579955e-05, 'epoch': 0.34} + 34%|███▍ | 2998/8750 [59:22<9:17:51, 5.82s/it] 34%|███▍ | 2999/8750 [59:31<9:22:19, 5.87s/it] 34%|███▍ | 2999/8750 [59:28<9:22:20, 5.87s/it] {'loss': 0.4648, 'learning_rate': 1.5295111095536997e-05, 'epoch': 0.34} + 34%|███▍ | 2999/8750 [59:31<9:22:19, 5.87s/it] {'loss': 0.4648, 'learning_rate': 1.5295111095536997e-05, 'epoch': 0.34} + 34%|███▍ | 2999/8750 [59:28<9:22:20, 5.87s/it]14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + 4 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 34%|███▍ | 3000/8750 [59:34<9:24:32, 5.89s/it]3 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 34%|███▍ | 3000/8750 [59:37<9:24:31, 5.89s/it]7 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.454, 'learning_rate': 1.5291970609726008e-05, 'epoch': 0.34} + 34%|███▍ | 3000/8750 [59:37<9:24:31, 5.89s/it] {'loss': 0.454, 'learning_rate': 1.5291970609726008e-05, 'epoch': 0.34} + 34%|███▍ | 3000/8750 [59:34<9:24:32, 5.89s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 34%|███▍ | 3001/8750 [59:59<16:56:10, 10.61s/it] 34%|███▍ | 3001/8750 [59:56<16:56:11, 10.61s/it] {'loss': 0.4805, 'learning_rate': 1.5288829398796892e-05, 'epoch': 0.34} + 34%|███▍ | 3001/8750 [59:59<16:56:10, 10.61s/it] {'loss': 0.4805, 'learning_rate': 1.5288829398796892e-05, 'epoch': 0.34} + 34%|███▍ | 3001/8750 [59:56<16:56:11, 10.61s/it] 34%|███▍ | 3002/8750 [1:00:04<14:35:25, 9.14s/it] 34%|███▍ | 3002/8750 [1:00:02<14:35:26, 9.14s/it] {'loss': 0.4806, 'learning_rate': 1.528568746318007e-05, 'epoch': 0.34} + 34%|███▍ | 3002/8750 [1:00:04<14:35:25, 9.14s/it] {'loss': 0.4806, 'learning_rate': 1.528568746318007e-05, 'epoch': 0.34} + 34%|███▍ | 3002/8750 [1:00:02<14:35:26, 9.14s/it] 34%|███▍ | 3003/8750 [1:00:07<12:57:27, 8.12s/it] 34%|███▍ | 3003/8750 [1:00:10<12:57:27, 8.12s/it] {'loss': 0.4626, 'learning_rate': 1.5282544803306056e-05, 'epoch': 0.34} + 34%|███▍ | 3003/8750 [1:00:10<12:57:27, 8.12s/it] {'loss': 0.4626, 'learning_rate': 1.5282544803306056e-05, 'epoch': 0.34} + 34%|███▍ | 3003/8750 [1:00:07<12:57:27, 8.12s/it] 34%|███▍ | 3004/8750 [1:00:13<11:53:13, 7.45s/it] 34%|███▍ | 3004/8750 [1:00:16<11:53:13, 7.45s/it] {'loss': 0.4611, 'learning_rate': 1.5279401419605466e-05, 'epoch': 0.34} + 34%|███▍ | 3004/8750 [1:00:16<11:53:13, 7.45s/it] {'loss': 0.4611, 'learning_rate': 1.5279401419605466e-05, 'epoch': 0.34} + 34%|███▍ | 3004/8750 [1:00:13<11:53:13, 7.45s/it] 34%|███▍ | 3005/8750 [1:00:19<11:09:26, 6.99s/it] 34%|███▍ | 3005/8750 [1:00:22<11:09:26, 6.99s/it] {'loss': 0.4811, 'learning_rate': 1.527625731250901e-05, 'epoch': 0.34} + 34%|███▍ | 3005/8750 [1:00:22<11:09:26, 6.99s/it] {'loss': 0.4811, 'learning_rate': 1.527625731250901e-05, 'epoch': 0.34} + 34%|███▍ | 3005/8750 [1:00:19<11:09:26, 6.99s/it] 34%|███▍ | 3006/8750 [1:00:28<10:30:34, 6.59s/it] 34%|███▍ | 3006/8750 [1:00:25<10:30:35, 6.59s/it] {'loss': 0.4894, 'learning_rate': 1.527311248244751e-05, 'epoch': 0.34} + 34%|███▍ | 3006/8750 [1:00:28<10:30:34, 6.59s/it] {'loss': 
0.4894, 'learning_rate': 1.527311248244751e-05, 'epoch': 0.34} + 34%|███▍ | 3006/8750 [1:00:25<10:30:35, 6.59s/it] 34%|███▍ | 3007/8750 [1:00:31<10:08:35, 6.36s/it] 34%|███▍ | 3007/8750 [1:00:33<10:08:35, 6.36s/it] {'loss': 0.4652, 'learning_rate': 1.5269966929851866e-05, 'epoch': 0.34} + 34%|███▍ | 3007/8750 [1:00:34<10:08:35, 6.36s/it] {'loss': 0.4652, 'learning_rate': 1.5269966929851866e-05, 'epoch': 0.34} + 34%|███▍ | 3007/8750 [1:00:31<10:08:35, 6.36s/it] 34%|███▍ | 3008/8750 [1:00:39<9:53:27, 6.20s/it] 34%|███▍ | 3008/8750 [1:00:36<9:53:27, 6.20s/it] {'loss': 0.4701, 'learning_rate': 1.52668206551531e-05, 'epoch': 0.34} + 34%|███▍ | 3008/8750 [1:00:39<9:53:27, 6.20s/it] {'loss': 0.4701, 'learning_rate': 1.52668206551531e-05, 'epoch': 0.34} + 34%|███▍ | 3008/8750 [1:00:36<9:53:27, 6.20s/it] 34%|███▍ | 3009/8750 [1:00:42<9:37:59, 6.04s/it] 34%|███▍ | 3009/8750 [1:00:45<9:37:59, 6.04s/it] {'loss': 0.4649, 'learning_rate': 1.526367365878231e-05, 'epoch': 0.34} + 34%|███▍ | 3009/8750 [1:00:45<9:37:59, 6.04s/it] {'loss': 0.4649, 'learning_rate': 1.526367365878231e-05, 'epoch': 0.34} + 34%|███▍ | 3009/8750 [1:00:42<9:37:59, 6.04s/it] 34%|███▍ | 3010/8750 [1:00:48<9:27:23, 5.93s/it] 34%|███▍ | 3010/8750 [1:00:51<9:27:23, 5.93s/it] {'loss': 0.4541, 'learning_rate': 1.526052594117071e-05, 'epoch': 0.34} + 34%|███▍ | 3010/8750 [1:00:51<9:27:23, 5.93s/it] {'loss': 0.4541, 'learning_rate': 1.526052594117071e-05, 'epoch': 0.34} + 34%|███▍ | 3010/8750 [1:00:48<9:27:23, 5.93s/it] 34%|███▍ | 3011/8750 [1:00:53<9:19:04, 5.84s/it] 34%|███▍ | 3011/8750 [1:00:56<9:19:04, 5.84s/it] {'loss': 0.4898, 'learning_rate': 1.5257377502749614e-05, 'epoch': 0.34} + 34%|███▍ | 3011/8750 [1:00:56<9:19:04, 5.84s/it] {'loss': 0.4898, 'learning_rate': 1.5257377502749614e-05, 'epoch': 0.34} + 34%|███▍ | 3011/8750 [1:00:53<9:19:04, 5.84s/it] 34%|███▍ | 3012/8750 [1:00:59<9:15:42, 5.81s/it] 34%|███▍ | 3012/8750 [1:01:02<9:15:42, 5.81s/it] {'loss': 0.4587, 'learning_rate': 1.525422834395042e-05, 
'epoch': 0.34} + 34%|███▍ | 3012/8750 [1:01:02<9:15:42, 5.81s/it] {'loss': 0.4587, 'learning_rate': 1.525422834395042e-05, 'epoch': 0.34} + 34%|███▍ | 3012/8750 [1:00:59<9:15:42, 5.81s/it] 34%|███▍ | 3013/8750 [1:01:08<9:10:19, 5.76s/it] 34%|███▍ | 3013/8750 [1:01:05<9:10:19, 5.76s/it] {'loss': 0.4891, 'learning_rate': 1.525107846520464e-05, 'epoch': 0.34} + 34%|███▍ | 3013/8750 [1:01:08<9:10:19, 5.76s/it] {'loss': 0.4891, 'learning_rate': 1.525107846520464e-05, 'epoch': 0.34} + 34%|███▍ | 3013/8750 [1:01:05<9:10:19, 5.76s/it] 34%|███▍ | 3014/8750 [1:01:11<9:09:49, 5.75s/it] 34%|███▍ | 3014/8750 [1:01:13<9:09:49, 5.75s/it] {'loss': 0.4719, 'learning_rate': 1.5247927866943869e-05, 'epoch': 0.34} + 34%|███▍ | 3014/8750 [1:01:13<9:09:49, 5.75s/it] {'loss': 0.4719, 'learning_rate': 1.5247927866943869e-05, 'epoch': 0.34} + 34%|███▍ | 3014/8750 [1:01:11<9:09:49, 5.75s/it] 34%|███▍ | 3015/8750 [1:01:16<9:13:11, 5.79s/it] 34%|███▍ | 3015/8750 [1:01:19<9:13:11, 5.79s/it] {'loss': 0.4638, 'learning_rate': 1.5244776549599816e-05, 'epoch': 0.34} + 34%|███▍ | 3015/8750 [1:01:19<9:13:11, 5.79s/it] {'loss': 0.4638, 'learning_rate': 1.5244776549599816e-05, 'epoch': 0.34} + 34%|███▍ | 3015/8750 [1:01:16<9:13:11, 5.79s/it] 34%|███▍ | 3016/8750 [1:01:22<9:13:58, 5.80s/it] 34%|███▍ | 3016/8750 [1:01:25<9:13:58, 5.80s/it] {'loss': 0.4802, 'learning_rate': 1.5241624513604281e-05, 'epoch': 0.34} + 34%|███▍ | 3016/8750 [1:01:25<9:13:58, 5.80s/it] {'loss': 0.4802, 'learning_rate': 1.5241624513604281e-05, 'epoch': 0.34} + 34%|███▍ | 3016/8750 [1:01:22<9:13:58, 5.80s/it] 34%|███▍ | 3017/8750 [1:01:28<9:11:07, 5.77s/it] 34%|███▍ | 3017/8750 [1:01:31<9:11:07, 5.77s/it] {'loss': 0.4526, 'learning_rate': 1.523847175938916e-05, 'epoch': 0.34} + 34%|███▍ | 3017/8750 [1:01:31<9:11:07, 5.77s/it] {'loss': 0.4526, 'learning_rate': 1.523847175938916e-05, 'epoch': 0.34} + 34%|███▍ | 3017/8750 [1:01:28<9:11:07, 5.77s/it] 34%|███▍ | 3018/8750 [1:01:34<9:13:11, 5.79s/it] 34%|███▍ | 3018/8750 
[1:01:37<9:13:12, 5.79s/it] {'loss': 0.4785, 'learning_rate': 1.5235318287386455e-05, 'epoch': 0.34} + 34%|███▍ | 3018/8750 [1:01:37<9:13:12, 5.79s/it] {'loss': 0.4785, 'learning_rate': 1.5235318287386455e-05, 'epoch': 0.34} + 34%|███▍ | 3018/8750 [1:01:34<9:13:11, 5.79s/it] 35%|███▍ | 3019/8750 [1:01:40<9:14:10, 5.80s/it] 35%|███▍ | 3019/8750 [1:01:42<9:14:10, 5.80s/it] {'loss': 0.4786, 'learning_rate': 1.5232164098028257e-05, 'epoch': 0.35} + 35%|███▍ | 3019/8750 [1:01:42<9:14:10, 5.80s/it] {'loss': 0.4786, 'learning_rate': 1.5232164098028257e-05, 'epoch': 0.35} + 35%|███▍ | 3019/8750 [1:01:40<9:14:10, 5.80s/it] 35%|███▍ | 3020/8750 [1:01:45<9:14:31, 5.81s/it] 35%|███▍ | 3020/8750 [1:01:48<9:14:31, 5.81s/it] {'loss': 0.4906, 'learning_rate': 1.5229009191746769e-05, 'epoch': 0.35} + 35%|███▍ | 3020/8750 [1:01:48<9:14:31, 5.81s/it] {'loss': 0.4906, 'learning_rate': 1.5229009191746769e-05, 'epoch': 0.35} + 35%|███▍ | 3020/8750 [1:01:45<9:14:31, 5.81s/it] 35%|███▍ | 3021/8750 [1:01:52<9:26:38, 5.93s/it] 35%|███▍ | 3021/8750 [1:01:55<9:26:38, 5.93s/it] {'loss': 0.4532, 'learning_rate': 1.5225853568974271e-05, 'epoch': 0.35} + 35%|███▍ | 3021/8750 [1:01:55<9:26:38, 5.93s/it] {'loss': 0.4532, 'learning_rate': 1.5225853568974271e-05, 'epoch': 0.35} + 35%|███▍ | 3021/8750 [1:01:52<9:26:38, 5.93s/it] 35%|███▍ | 3022/8750 [1:02:00<9:19:15, 5.86s/it] 35%|███▍ | 3022/8750 [1:01:57<9:19:15, 5.86s/it] {'loss': 0.4748, 'learning_rate': 1.5222697230143166e-05, 'epoch': 0.35} + 35%|███▍ | 3022/8750 [1:02:00<9:19:15, 5.86s/it] {'loss': 0.4748, 'learning_rate': 1.5222697230143166e-05, 'epoch': 0.35} + 35%|███▍ | 3022/8750 [1:01:57<9:19:15, 5.86s/it] 35%|███▍ | 3023/8750 [1:02:03<9:17:21, 5.84s/it] 35%|███▍ | 3023/8750 [1:02:06<9:17:21, 5.84s/it] {'loss': 0.4803, 'learning_rate': 1.5219540175685938e-05, 'epoch': 0.35} + 35%|███▍ | 3023/8750 [1:02:06<9:17:21, 5.84s/it] {'loss': 0.4803, 'learning_rate': 1.5219540175685938e-05, 'epoch': 0.35} + 35%|███▍ | 3023/8750 [1:02:03<9:17:21, 
5.84s/it] 35%|███▍ | 3024/8750 [1:02:09<9:18:16, 5.85s/it] 35%|███▍ | 3024/8750 [1:02:12<9:18:16, 5.85s/it] {'loss': 0.4751, 'learning_rate': 1.521638240603517e-05, 'epoch': 0.35} + 35%|███▍ | 3024/8750 [1:02:12<9:18:16, 5.85s/it] {'loss': 0.4751, 'learning_rate': 1.521638240603517e-05, 'epoch': 0.35} + 35%|███▍ | 3024/8750 [1:02:09<9:18:16, 5.85s/it] 35%|███▍ | 3025/8750 [1:02:15<9:13:13, 5.80s/it] 35%|███▍ | 3025/8750 [1:02:18<9:13:13, 5.80s/it] {'loss': 0.4679, 'learning_rate': 1.5213223921623553e-05, 'epoch': 0.35} + 35%|███▍ | 3025/8750 [1:02:18<9:13:13, 5.80s/it] {'loss': 0.4679, 'learning_rate': 1.5213223921623553e-05, 'epoch': 0.35} + 35%|███▍ | 3025/8750 [1:02:15<9:13:13, 5.80s/it] 35%|███▍ | 3026/8750 [1:02:21<9:18:22, 5.85s/it] 35%|███▍ | 3026/8750 [1:02:24<9:18:22, 5.85s/it] {'loss': 0.4836, 'learning_rate': 1.5210064722883865e-05, 'epoch': 0.35} + 35%|███▍ | 3026/8750 [1:02:24<9:18:22, 5.85s/it] {'loss': 0.4836, 'learning_rate': 1.5210064722883865e-05, 'epoch': 0.35} + 35%|███▍ | 3026/8750 [1:02:21<9:18:22, 5.85s/it] 35%|███▍ | 3027/8750 [1:02:26<9:13:12, 5.80s/it] 35%|███▍ | 3027/8750 [1:02:29<9:13:12, 5.80s/it] {'loss': 0.4656, 'learning_rate': 1.5206904810248992e-05, 'epoch': 0.35} + 35%|███▍ | 3027/8750 [1:02:29<9:13:12, 5.80s/it] {'loss': 0.4656, 'learning_rate': 1.5206904810248992e-05, 'epoch': 0.35} + 35%|███▍ | 3027/8750 [1:02:26<9:13:12, 5.80s/it] 35%|███▍ | 3028/8750 [1:02:32<9:07:53, 5.75s/it] 35%|███▍ | 3028/8750 [1:02:35<9:07:53, 5.75s/it] {'loss': 0.4757, 'learning_rate': 1.5203744184151907e-05, 'epoch': 0.35} + 35%|███▍ | 3028/8750 [1:02:35<9:07:53, 5.75s/it] {'loss': 0.4757, 'learning_rate': 1.5203744184151907e-05, 'epoch': 0.35} + 35%|███▍ | 3028/8750 [1:02:32<9:07:53, 5.75s/it] 35%|███▍ | 3029/8750 [1:02:38<9:06:06, 5.73s/it] 35%|███▍ | 3029/8750 [1:02:41<9:06:06, 5.73s/it] {'loss': 0.4992, 'learning_rate': 1.5200582845025688e-05, 'epoch': 0.35} + 35%|███▍ | 3029/8750 [1:02:41<9:06:06, 5.73s/it] {'loss': 0.4992, 'learning_rate': 
1.5200582845025688e-05, 'epoch': 0.35} + 35%|███▍ | 3029/8750 [1:02:38<9:06:06, 5.73s/it] 35%|███▍ | 3030/8750 [1:02:43<9:07:47, 5.75s/it] 35%|███▍ | 3030/8750 [1:02:46<9:07:47, 5.75s/it] {'loss': 0.4701, 'learning_rate': 1.5197420793303514e-05, 'epoch': 0.35} + 35%|███▍ | 3030/8750 [1:02:46<9:07:47, 5.75s/it] {'loss': 0.4701, 'learning_rate': 1.5197420793303514e-05, 'epoch': 0.35} + 35%|███▍ | 3030/8750 [1:02:43<9:07:47, 5.75s/it] 35%|███▍ | 3031/8750 [1:02:49<9:16:06, 5.83s/it] 35%|███▍ | 3031/8750 [1:02:52<9:16:06, 5.83s/it] {'loss': 0.4565, 'learning_rate': 1.5194258029418657e-05, 'epoch': 0.35} + 35%|███▍ | 3031/8750 [1:02:52<9:16:06, 5.83s/it] {'loss': 0.4565, 'learning_rate': 1.5194258029418657e-05, 'epoch': 0.35} + 35%|███▍ | 3031/8750 [1:02:49<9:16:06, 5.83s/it] 35%|███▍ | 3032/8750 [1:02:58<9:23:58, 5.92s/it] 35%|███▍ | 3032/8750 [1:02:56<9:23:59, 5.92s/it] {'loss': 0.4628, 'learning_rate': 1.5191094553804476e-05, 'epoch': 0.35} + 35%|███▍ | 3032/8750 [1:02:58<9:23:58, 5.92s/it] {'loss': 0.4628, 'learning_rate': 1.5191094553804476e-05, 'epoch': 0.35} + 35%|███▍ | 3032/8750 [1:02:56<9:23:59, 5.92s/it] 35%|███▍ | 3033/8750 [1:03:01<9:21:52, 5.90s/it] 35%|███▍ | 3033/8750 [1:03:04<9:21:52, 5.90s/it] {'loss': 0.4777, 'learning_rate': 1.5187930366894442e-05, 'epoch': 0.35} + 35%|███▍ | 3033/8750 [1:03:04<9:21:52, 5.90s/it] {'loss': 0.4777, 'learning_rate': 1.5187930366894442e-05, 'epoch': 0.35} + 35%|███▍ | 3033/8750 [1:03:01<9:21:52, 5.90s/it] 35%|███▍ | 3034/8750 [1:03:07<9:14:11, 5.82s/it] 35%|███▍ | 3034/8750 [1:03:10<9:14:11, 5.82s/it] {'loss': 0.4835, 'learning_rate': 1.5184765469122122e-05, 'epoch': 0.35} + 35%|███▍ | 3034/8750 [1:03:10<9:14:11, 5.82s/it] {'loss': 0.4835, 'learning_rate': 1.5184765469122122e-05, 'epoch': 0.35} + 35%|███▍ | 3034/8750 [1:03:07<9:14:11, 5.82s/it] 35%|███▍ | 3035/8750 [1:03:13<9:10:55, 5.78s/it] 35%|███▍ | 3035/8750 [1:03:16<9:10:55, 5.78s/it] {'loss': 0.4564, 'learning_rate': 1.5181599860921182e-05, 'epoch': 0.35} + 
35%|███▍ | 3035/8750 [1:03:16<9:10:55, 5.78s/it] {'loss': 0.4564, 'learning_rate': 1.5181599860921182e-05, 'epoch': 0.35} + 35%|███▍ | 3035/8750 [1:03:13<9:10:55, 5.78s/it] 35%|███▍ | 3036/8750 [1:03:19<9:10:47, 5.78s/it] 35%|███▍ | 3036/8750 [1:03:21<9:10:47, 5.78s/it] {'loss': 0.4759, 'learning_rate': 1.517843354272537e-05, 'epoch': 0.35} + 35%|███▍ | 3036/8750 [1:03:21<9:10:47, 5.78s/it] {'loss': 0.4759, 'learning_rate': 1.517843354272537e-05, 'epoch': 0.35} + 35%|███▍ | 3036/8750 [1:03:19<9:10:47, 5.78s/it] 35%|███▍ | 3037/8750 [1:03:24<9:10:13, 5.78s/it] 35%|███▍ | 3037/8750 [1:03:27<9:10:12, 5.78s/it] {'loss': 0.4583, 'learning_rate': 1.517526651496855e-05, 'epoch': 0.35} + 35%|███▍ | 3037/8750 [1:03:27<9:10:12, 5.78s/it] {'loss': 0.4583, 'learning_rate': 1.517526651496855e-05, 'epoch': 0.35} + 35%|███▍ | 3037/8750 [1:03:24<9:10:13, 5.78s/it] 35%|███▍ | 3038/8750 [1:03:30<9:08:09, 5.76s/it] 35%|███▍ | 3038/8750 [1:03:33<9:08:09, 5.76s/it] {'loss': 0.4962, 'learning_rate': 1.5172098778084672e-05, 'epoch': 0.35} + 35%|███▍ | 3038/8750 [1:03:33<9:08:09, 5.76s/it] {'loss': 0.4962, 'learning_rate': 1.5172098778084672e-05, 'epoch': 0.35} + 35%|███▍ | 3038/8750 [1:03:30<9:08:09, 5.76s/it] 35%|███▍ | 3039/8750 [1:03:36<9:10:10, 5.78s/it] 35%|███▍ | 3039/8750 [1:03:39<9:10:10, 5.78s/it] {'loss': 0.4479, 'learning_rate': 1.5168930332507791e-05, 'epoch': 0.35} + 35%|███▍ | 3039/8750 [1:03:39<9:10:10, 5.78s/it] {'loss': 0.4479, 'learning_rate': 1.5168930332507791e-05, 'epoch': 0.35} + 35%|███▍ | 3039/8750 [1:03:36<9:10:10, 5.78s/it] 35%|███▍ | 3040/8750 [1:03:42<9:08:35, 5.76s/it] 35%|███▍ | 3040/8750 [1:03:44<9:08:34, 5.76s/it] {'loss': 0.4767, 'learning_rate': 1.5165761178672052e-05, 'epoch': 0.35} + 35%|███▍ | 3040/8750 [1:03:44<9:08:34, 5.76s/it] {'loss': 0.4767, 'learning_rate': 1.5165761178672052e-05, 'epoch': 0.35} + 35%|███▍ | 3040/8750 [1:03:42<9:08:35, 5.76s/it] 35%|███▍ | 3041/8750 [1:03:48<9:16:42, 5.85s/it] 35%|███▍ | 3041/8750 [1:03:51<9:16:42, 5.85s/it] 
{'loss': 0.4743, 'learning_rate': 1.51625913170117e-05, 'epoch': 0.35} + 35%|███▍ | 3041/8750 [1:03:51<9:16:42, 5.85s/it] {'loss': 0.4743, 'learning_rate': 1.51625913170117e-05, 'epoch': 0.35} + 35%|███▍ | 3041/8750 [1:03:48<9:16:42, 5.85s/it] 35%|███▍ | 3042/8750 [1:03:53<9:15:12, 5.84s/it] 35%|███▍ | 3042/8750 [1:03:56<9:15:12, 5.84s/it] {'loss': 0.4861, 'learning_rate': 1.5159420747961076e-05, 'epoch': 0.35} + 35%|███▍ | 3042/8750 [1:03:56<9:15:12, 5.84s/it] {'loss': 0.4861, 'learning_rate': 1.5159420747961076e-05, 'epoch': 0.35} + 35%|███▍ | 3042/8750 [1:03:53<9:15:12, 5.84s/it] 35%|███▍ | 3043/8750 [1:03:59<9:17:06, 5.86s/it] 35%|███▍ | 3043/8750 [1:04:02<9:17:06, 5.86s/it] {'loss': 0.46, 'learning_rate': 1.5156249471954617e-05, 'epoch': 0.35} + {'loss': 0.46, 'learning_rate': 1.5156249471954617e-05, 'epoch': 0.35} + 35%|███▍ | 3043/8750 [1:04:02<9:17:06, 5.86s/it] 35%|███▍ | 3043/8750 [1:03:59<9:17:06, 5.86s/it] 35%|███▍ | 3044/8750 [1:04:08<9:17:39, 5.86s/it] 35%|███▍ | 3044/8750 [1:04:05<9:17:40, 5.86s/it] {'loss': 0.468, 'learning_rate': 1.5153077489426865e-05, 'epoch': 0.35} + 35%|███▍ | 3044/8750 [1:04:08<9:17:39, 5.86s/it] {'loss': 0.468, 'learning_rate': 1.5153077489426865e-05, 'epoch': 0.35} + 35%|███▍ | 3044/8750 [1:04:05<9:17:40, 5.86s/it] 35%|███▍ | 3045/8750 [1:04:14<9:16:04, 5.85s/it] 35%|███▍ | 3045/8750 [1:04:11<9:16:04, 5.85s/it] {'loss': 0.4918, 'learning_rate': 1.5149904800812448e-05, 'epoch': 0.35} + 35%|███▍ | 3045/8750 [1:04:14<9:16:04, 5.85s/it] {'loss': 0.4918, 'learning_rate': 1.5149904800812448e-05, 'epoch': 0.35} + 35%|███▍ | 3045/8750 [1:04:11<9:16:04, 5.85s/it] 35%|███▍ | 3046/8750 [1:04:20<9:13:10, 5.82s/it] 35%|███▍ | 3046/8750 [1:04:17<9:13:10, 5.82s/it] {'loss': 0.4897, 'learning_rate': 1.514673140654609e-05, 'epoch': 0.35} + 35%|███▍ | 3046/8750 [1:04:20<9:13:10, 5.82s/it] {'loss': 0.4897, 'learning_rate': 1.514673140654609e-05, 'epoch': 0.35} + 35%|███▍ | 3046/8750 [1:04:17<9:13:10, 5.82s/it] 35%|███▍ | 3047/8750 
[1:04:25<9:11:48, 5.81s/it] 35%|███▍ | 3047/8750 [1:04:23<9:11:49, 5.81s/it] {'loss': 0.4717, 'learning_rate': 1.514355730706263e-05, 'epoch': 0.35} + 35%|███▍ | 3047/8750 [1:04:25<9:11:48, 5.81s/it] {'loss': 0.4717, 'learning_rate': 1.514355730706263e-05, 'epoch': 0.35} + 35%|███▍ | 3047/8750 [1:04:23<9:11:49, 5.81s/it] 35%|███▍ | 3048/8750 [1:04:31<9:11:53, 5.81s/it] 35%|███▍ | 3048/8750 [1:04:28<9:11:53, 5.81s/it] {'loss': 0.4652, 'learning_rate': 1.5140382502796978e-05, 'epoch': 0.35} + 35%|███▍ | 3048/8750 [1:04:31<9:11:53, 5.81s/it] {'loss': 0.4652, 'learning_rate': 1.5140382502796978e-05, 'epoch': 0.35} + 35%|███▍ | 3048/8750 [1:04:28<9:11:53, 5.81s/it] 35%|███▍ | 3049/8750 [1:04:38<9:26:26, 5.96s/it] 35%|███▍ | 3049/8750 [1:04:35<9:26:26, 5.96s/it] {'loss': 0.4705, 'learning_rate': 1.5137206994184159e-05, 'epoch': 0.35} + 35%|███▍ | 3049/8750 [1:04:38<9:26:26, 5.96s/it] {'loss': 0.4705, 'learning_rate': 1.5137206994184159e-05, 'epoch': 0.35} + 35%|███▍ | 3049/8750 [1:04:35<9:26:26, 5.96s/it]10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +8 15AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... + +13 AutoResumeHook: Checking whether to suspend... 35%|███▍ | 3050/8750 [1:04:44<9:25:43, 5.95s/it] +4 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ 35%|███▍ | 3050/8750 [1:04:41<9:25:43, 5.96s/it] {'loss': 0.4599, 'learning_rate': 1.5134030781659288e-05, 'epoch': 0.35} + 35%|███▍ | 3050/8750 [1:04:44<9:25:43, 5.95s/it] {'loss': 0.4599, 'learning_rate': 1.5134030781659288e-05, 'epoch': 0.35} + 35%|███▍ | 3050/8750 [1:04:41<9:25:43, 5.96s/it] 35%|███▍ | 3051/8750 [1:04:49<9:19:50, 5.89s/it] 35%|███▍ | 3051/8750 [1:04:46<9:19:50, 5.89s/it] {'loss': 0.4557, 'learning_rate': 1.513085386565758e-05, 'epoch': 0.35} + 35%|███▍ | 3051/8750 [1:04:49<9:19:50, 5.89s/it] {'loss': 0.4557, 'learning_rate': 1.513085386565758e-05, 'epoch': 0.35} + 35%|███▍ | 3051/8750 [1:04:46<9:19:50, 5.89s/it] 35%|███▍ | 3052/8750 [1:04:55<9:16:02, 5.86s/it] 35%|███▍ | 3052/8750 [1:04:52<9:16:01, 5.85s/it] {'loss': 0.4788, 'learning_rate': 1.5127676246614336e-05, 'epoch': 0.35} + 35%|███▍ | 3052/8750 [1:04:55<9:16:02, 5.86s/it] {'loss': 0.4788, 'learning_rate': 1.5127676246614336e-05, 'epoch': 0.35} + 35%|███▍ | 3052/8750 [1:04:52<9:16:01, 5.85s/it] 35%|███▍ | 3053/8750 [1:05:01<9:09:09, 5.78s/it] 35%|███▍ | 3053/8750 [1:04:58<9:09:09, 5.78s/it] {'loss': 0.4772, 'learning_rate': 1.5124497924964966e-05, 'epoch': 0.35} + 35%|███▍ | 3053/8750 [1:05:01<9:09:09, 5.78s/it] {'loss': 0.4772, 'learning_rate': 1.5124497924964966e-05, 'epoch': 0.35} + 35%|███▍ | 3053/8750 [1:04:58<9:09:09, 5.78s/it] 35%|███▍ | 3054/8750 [1:05:07<9:13:46, 5.83s/it] 35%|███▍ | 3054/8750 [1:05:04<9:13:46, 5.83s/it] {'loss': 0.473, 'learning_rate': 1.512131890114497e-05, 'epoch': 0.35} + 35%|███▍ | 3054/8750 [1:05:07<9:13:46, 5.83s/it] {'loss': 0.473, 'learning_rate': 1.512131890114497e-05, 'epoch': 0.35} + 35%|███▍ | 3054/8750 [1:05:04<9:13:46, 5.83s/it] 35%|███▍ | 3055/8750 [1:05:10<9:15:54, 5.86s/it] 35%|███▍ | 3055/8750 [1:05:13<9:16:04, 5.86s/it] {'loss': 0.4603, 'learning_rate': 1.5118139175589944e-05, 'epoch': 0.35} + 35%|███▍ | 3055/8750 [1:05:13<9:16:04, 5.86s/it] {'loss': 0.4603, 'learning_rate': 1.5118139175589944e-05, 'epoch': 0.35} + 35%|███▍ | 3055/8750 
[1:05:10<9:15:54, 5.86s/it] 35%|███▍ | 3056/8750 [1:05:18<9:09:39, 5.79s/it] 35%|███▍ | 3056/8750 [1:05:15<9:09:43, 5.79s/it] {'loss': 0.4807, 'learning_rate': 1.5114958748735584e-05, 'epoch': 0.35} + 35%|███▍ | 3056/8750 [1:05:18<9:09:39, 5.79s/it] {'loss': 0.4807, 'learning_rate': 1.5114958748735584e-05, 'epoch': 0.35} + 35%|███▍ | 3056/8750 [1:05:15<9:09:43, 5.79s/it] 35%|███▍ | 3057/8750 [1:05:24<9:08:10, 5.78s/it] 35%|███▍ | 3057/8750 [1:05:21<9:08:13, 5.78s/it] {'loss': 0.4671, 'learning_rate': 1.5111777621017677e-05, 'epoch': 0.35} + 35%|███▍ | 3057/8750 [1:05:24<9:08:10, 5.78s/it] {'loss': 0.4671, 'learning_rate': 1.5111777621017677e-05, 'epoch': 0.35} + 35%|███▍ | 3057/8750 [1:05:21<9:08:13, 5.78s/it] 35%|███▍ | 3058/8750 [1:05:30<9:09:42, 5.79s/it] 35%|███▍ | 3058/8750 [1:05:27<9:09:43, 5.79s/it] {'loss': 0.4546, 'learning_rate': 1.5108595792872112e-05, 'epoch': 0.35} + 35%|███▍ | 3058/8750 [1:05:30<9:09:42, 5.79s/it] {'loss': 0.4546, 'learning_rate': 1.5108595792872112e-05, 'epoch': 0.35} + 35%|███▍ | 3058/8750 [1:05:27<9:09:43, 5.79s/it] 35%|███▍ | 3059/8750 [1:05:36<9:11:57, 5.82s/it] 35%|███▍ | 3059/8750 [1:05:33<9:12:01, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.5105413264734866e-05, 'epoch': 0.35} + 35%|███▍ | 3059/8750 [1:05:36<9:11:57, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.5105413264734866e-05, 'epoch': 0.35} + 35%|███▍ | 3059/8750 [1:05:33<9:12:01, 5.82s/it] 35%|███▍ | 3060/8750 [1:05:41<9:06:58, 5.77s/it] 35%|███▍ | 3060/8750 [1:05:38<9:06:58, 5.77s/it] {'loss': 0.4811, 'learning_rate': 1.5102230037042018e-05, 'epoch': 0.35} + 35%|███▍ | 3060/8750 [1:05:41<9:06:58, 5.77s/it] {'loss': 0.4811, 'learning_rate': 1.5102230037042018e-05, 'epoch': 0.35} + 35%|███▍ | 3060/8750 [1:05:38<9:06:58, 5.77s/it] 35%|███▍ | 3061/8750 [1:05:47<9:09:06, 5.79s/it] 35%|███▍ | 3061/8750 [1:05:44<9:09:06, 5.79s/it] {'loss': 0.4597, 'learning_rate': 1.5099046110229742e-05, 'epoch': 0.35} + 35%|███▍ | 3061/8750 [1:05:47<9:09:06, 5.79s/it] {'loss': 0.4597, 
'learning_rate': 1.5099046110229742e-05, 'epoch': 0.35} + 35%|███▍ | 3061/8750 [1:05:44<9:09:06, 5.79s/it] 35%|███▍ | 3062/8750 [1:05:53<9:05:05, 5.75s/it] 35%|███▍ | 3062/8750 [1:05:50<9:05:05, 5.75s/it] {'loss': 0.4749, 'learning_rate': 1.5095861484734307e-05, 'epoch': 0.35} + 35%|███▍ | 3062/8750 [1:05:53<9:05:05, 5.75s/it] {'loss': 0.4749, 'learning_rate': 1.5095861484734307e-05, 'epoch': 0.35} + 35%|███▍ | 3062/8750 [1:05:50<9:05:05, 5.75s/it] 35%|███▌ | 3063/8750 [1:05:59<9:06:19, 5.76s/it] 35%|███▌ | 3063/8750 [1:05:56<9:06:19, 5.76s/it] {'loss': 0.4709, 'learning_rate': 1.5092676160992077e-05, 'epoch': 0.35} + 35%|███▌ | 3063/8750 [1:05:59<9:06:19, 5.76s/it] {'loss': 0.4709, 'learning_rate': 1.5092676160992077e-05, 'epoch': 0.35} + 35%|███▌ | 3063/8750 [1:05:56<9:06:19, 5.76s/it] 35%|███▌ | 3064/8750 [1:06:05<9:11:44, 5.82s/it] 35%|███▌ | 3064/8750 [1:06:02<9:11:44, 5.82s/it] {'loss': 0.4601, 'learning_rate': 1.5089490139439514e-05, 'epoch': 0.35} + 35%|███▌ | 3064/8750 [1:06:05<9:11:44, 5.82s/it] {'loss': 0.4601, 'learning_rate': 1.5089490139439514e-05, 'epoch': 0.35} + 35%|███▌ | 3064/8750 [1:06:02<9:11:44, 5.82s/it] 35%|███▌ | 3065/8750 [1:06:10<9:07:56, 5.78s/it] 35%|███▌ | 3065/8750 [1:06:07<9:07:55, 5.78s/it] {'loss': 0.4482, 'learning_rate': 1.508630342051317e-05, 'epoch': 0.35} + 35%|███▌ | 3065/8750 [1:06:10<9:07:56, 5.78s/it] {'loss': 0.4482, 'learning_rate': 1.508630342051317e-05, 'epoch': 0.35} + 35%|███▌ | 3065/8750 [1:06:07<9:07:55, 5.78s/it] 35%|███▌ | 3066/8750 [1:06:16<9:11:10, 5.82s/it] 35%|███▌ | 3066/8750 [1:06:13<9:11:10, 5.82s/it] {'loss': 0.4873, 'learning_rate': 1.5083116004649703e-05, 'epoch': 0.35} + 35%|███▌ | 3066/8750 [1:06:16<9:11:10, 5.82s/it] {'loss': 0.4873, 'learning_rate': 1.5083116004649703e-05, 'epoch': 0.35} + 35%|███▌ | 3066/8750 [1:06:13<9:11:10, 5.82s/it] 35%|███▌ | 3067/8750 [1:06:22<9:10:46, 5.81s/it] 35%|███▌ | 3067/8750 [1:06:19<9:10:46, 5.81s/it] {'loss': 0.4526, 'learning_rate': 1.5079927892285855e-05, 'epoch': 
0.35} + 35%|███▌ | 3067/8750 [1:06:22<9:10:46, 5.81s/it] {'loss': 0.4526, 'learning_rate': 1.5079927892285855e-05, 'epoch': 0.35} + 35%|███▌ | 3067/8750 [1:06:19<9:10:46, 5.81s/it] 35%|███▌ | 3068/8750 [1:06:25<9:06:16, 5.77s/it] 35%|███▌ | 3068/8750 [1:06:28<9:06:16, 5.77s/it] {'loss': 0.4646, 'learning_rate': 1.5076739083858472e-05, 'epoch': 0.35} + 35%|███▌ | 3068/8750 [1:06:28<9:06:16, 5.77s/it] {'loss': 0.4646, 'learning_rate': 1.5076739083858472e-05, 'epoch': 0.35} + 35%|███▌ | 3068/8750 [1:06:25<9:06:16, 5.77s/it] 35%|███▌ | 3069/8750 [1:06:33<9:06:31, 5.77s/it] 35%|███▌ | 3069/8750 [1:06:30<9:06:31, 5.77s/it] {'loss': 0.4813, 'learning_rate': 1.5073549579804493e-05, 'epoch': 0.35} + 35%|███▌ | 3069/8750 [1:06:33<9:06:31, 5.77s/it] {'loss': 0.4813, 'learning_rate': 1.5073549579804493e-05, 'epoch': 0.35} + 35%|███▌ | 3069/8750 [1:06:30<9:06:31, 5.77s/it] 35%|███▌ | 3070/8750 [1:06:36<9:11:12, 5.82s/it] 35%|███▌ | 3070/8750 [1:06:39<9:11:12, 5.82s/it] {'loss': 0.4693, 'learning_rate': 1.5070359380560944e-05, 'epoch': 0.35} + {'loss': 0.4693, 'learning_rate': 1.5070359380560944e-05, 'epoch': 0.35} 35%|███▌ | 3070/8750 [1:06:39<9:11:12, 5.82s/it] + 35%|███▌ | 3070/8750 [1:06:36<9:11:12, 5.82s/it] 35%|███▌ | 3071/8750 [1:06:45<9:10:41, 5.82s/it] 35%|███▌ | 3071/8750 [1:06:42<9:10:41, 5.82s/it] {'loss': 0.4967, 'learning_rate': 1.5067168486564959e-05, 'epoch': 0.35} + 35%|███▌ | 3071/8750 [1:06:45<9:10:41, 5.82s/it] {'loss': 0.4967, 'learning_rate': 1.5067168486564959e-05, 'epoch': 0.35} + 35%|███▌ | 3071/8750 [1:06:42<9:10:41, 5.82s/it] 35%|███▌ | 3072/8750 [1:06:51<9:08:17, 5.79s/it] 35%|███▌ | 3072/8750 [1:06:48<9:08:17, 5.79s/it] {'loss': 0.4671, 'learning_rate': 1.5063976898253763e-05, 'epoch': 0.35} + 35%|███▌ | 3072/8750 [1:06:51<9:08:17, 5.79s/it] {'loss': 0.4671, 'learning_rate': 1.5063976898253763e-05, 'epoch': 0.35} + 35%|███▌ | 3072/8750 [1:06:48<9:08:17, 5.79s/it] 35%|███▌ | 3073/8750 [1:06:54<9:10:47, 5.82s/it] 35%|███▌ | 3073/8750 [1:06:57<9:10:48, 
5.82s/it] {'loss': 0.4815, 'learning_rate': 1.506078461606467e-05, 'epoch': 0.35} + 35%|███▌ | 3073/8750 [1:06:57<9:10:48, 5.82s/it] {'loss': 0.4815, 'learning_rate': 1.506078461606467e-05, 'epoch': 0.35} + 35%|███▌ | 3073/8750 [1:06:54<9:10:47, 5.82s/it] 35%|███▌ | 3074/8750 [1:07:02<9:06:37, 5.78s/it] 35%|███▌ | 3074/8750 [1:06:59<9:06:38, 5.78s/it] {'loss': 0.471, 'learning_rate': 1.5057591640435098e-05, 'epoch': 0.35} + 35%|███▌ | 3074/8750 [1:07:02<9:06:37, 5.78s/it] {'loss': 0.471, 'learning_rate': 1.5057591640435098e-05, 'epoch': 0.35} + 35%|███▌ | 3074/8750 [1:06:59<9:06:38, 5.78s/it] 35%|███▌ | 3075/8750 [1:07:08<9:07:30, 5.79s/it] 35%|███▌ | 3075/8750 [1:07:05<9:07:30, 5.79s/it] {'loss': 0.4657, 'learning_rate': 1.5054397971802557e-05, 'epoch': 0.35} + 35%|███▌ | 3075/8750 [1:07:08<9:07:30, 5.79s/it] {'loss': 0.4657, 'learning_rate': 1.5054397971802557e-05, 'epoch': 0.35} + 35%|███▌ | 3075/8750 [1:07:05<9:07:30, 5.79s/it] 35%|███▌ | 3076/8750 [1:07:14<9:01:08, 5.72s/it] 35%|███▌ | 3076/8750 [1:07:11<9:01:09, 5.72s/it] {'loss': 0.4786, 'learning_rate': 1.5051203610604643e-05, 'epoch': 0.35} + 35%|███▌ | 3076/8750 [1:07:14<9:01:08, 5.72s/it] {'loss': 0.4786, 'learning_rate': 1.5051203610604643e-05, 'epoch': 0.35} + 35%|███▌ | 3076/8750 [1:07:11<9:01:09, 5.72s/it] 35%|███▌ | 3077/8750 [1:07:20<9:03:49, 5.75s/it] 35%|███▌ | 3077/8750 [1:07:17<9:03:49, 5.75s/it] {'loss': 0.4743, 'learning_rate': 1.5048008557279064e-05, 'epoch': 0.35} + 35%|███▌ | 3077/8750 [1:07:20<9:03:49, 5.75s/it] {'loss': 0.4743, 'learning_rate': 1.5048008557279064e-05, 'epoch': 0.35} + 35%|███▌ | 3077/8750 [1:07:17<9:03:49, 5.75s/it] 35%|███▌ | 3078/8750 [1:07:25<9:01:56, 5.73s/it] 35%|███▌ | 3078/8750 [1:07:22<9:01:55, 5.73s/it] {'loss': 0.4958, 'learning_rate': 1.504481281226361e-05, 'epoch': 0.35} + 35%|███▌ | 3078/8750 [1:07:25<9:01:56, 5.73s/it] {'loss': 0.4958, 'learning_rate': 1.504481281226361e-05, 'epoch': 0.35} + 35%|███▌ | 3078/8750 [1:07:22<9:01:55, 5.73s/it] 35%|███▌ | 
3079/8750 [1:07:31<9:04:48, 5.76s/it] 35%|███▌ | 3079/8750 [1:07:28<9:04:48, 5.76s/it] {'loss': 0.4849, 'learning_rate': 1.504161637599617e-05, 'epoch': 0.35} + 35%|███▌ | 3079/8750 [1:07:31<9:04:48, 5.76s/it] {'loss': 0.4849, 'learning_rate': 1.504161637599617e-05, 'epoch': 0.35} + 35%|███▌ | 3079/8750 [1:07:28<9:04:48, 5.76s/it] 35%|███▌ | 3080/8750 [1:07:37<9:07:25, 5.79s/it] 35%|███▌ | 3080/8750 [1:07:34<9:07:26, 5.79s/it] {'loss': 0.4544, 'learning_rate': 1.5038419248914725e-05, 'epoch': 0.35} + 35%|███▌ | 3080/8750 [1:07:37<9:07:25, 5.79s/it] {'loss': 0.4544, 'learning_rate': 1.5038419248914725e-05, 'epoch': 0.35} + 35%|███▌ | 3080/8750 [1:07:34<9:07:26, 5.79s/it] 35%|███▌ | 3081/8750 [1:07:43<9:14:53, 5.87s/it] 35%|███▌ | 3081/8750 [1:07:40<9:14:53, 5.87s/it] {'loss': 0.478, 'learning_rate': 1.5035221431457352e-05, 'epoch': 0.35} + 35%|███▌ | 3081/8750 [1:07:43<9:14:53, 5.87s/it] {'loss': 0.478, 'learning_rate': 1.5035221431457352e-05, 'epoch': 0.35} + 35%|███▌ | 3081/8750 [1:07:40<9:14:53, 5.87s/it] 35%|███▌ | 3082/8750 [1:07:49<9:12:13, 5.85s/it] 35%|███▌ | 3082/8750 [1:07:46<9:12:13, 5.85s/it] {'loss': 0.4758, 'learning_rate': 1.5032022924062228e-05, 'epoch': 0.35} + 35%|███▌ | 3082/8750 [1:07:49<9:12:13, 5.85s/it] {'loss': 0.4758, 'learning_rate': 1.5032022924062228e-05, 'epoch': 0.35} + 35%|███▌ | 3082/8750 [1:07:46<9:12:13, 5.85s/it] 35%|███▌ | 3083/8750 [1:07:55<9:12:43, 5.85s/it] 35%|███▌ | 3083/8750 [1:07:52<9:12:43, 5.85s/it] {'loss': 0.4713, 'learning_rate': 1.5028823727167621e-05, 'epoch': 0.35} + 35%|███▌ | 3083/8750 [1:07:55<9:12:43, 5.85s/it] {'loss': 0.4713, 'learning_rate': 1.5028823727167621e-05, 'epoch': 0.35} + 35%|███▌ | 3083/8750 [1:07:52<9:12:43, 5.85s/it] 35%|███▌ | 3084/8750 [1:08:00<9:07:36, 5.80s/it] 35%|███▌ | 3084/8750 [1:07:57<9:07:36, 5.80s/it] {'loss': 0.4591, 'learning_rate': 1.5025623841211885e-05, 'epoch': 0.35} + 35%|███▌ | 3084/8750 [1:08:00<9:07:36, 5.80s/it] {'loss': 0.4591, 'learning_rate': 1.5025623841211885e-05, 
'epoch': 0.35} + 35%|███▌ | 3084/8750 [1:07:57<9:07:36, 5.80s/it] 35%|███▌ | 3085/8750 [1:08:06<9:06:18, 5.79s/it] 35%|███▌ | 3085/8750 [1:08:03<9:06:18, 5.79s/it] {'loss': 0.457, 'learning_rate': 1.502242326663348e-05, 'epoch': 0.35} + 35%|███▌ | 3085/8750 [1:08:06<9:06:18, 5.79s/it] {'loss': 0.457, 'learning_rate': 1.502242326663348e-05, 'epoch': 0.35} + 35%|███▌ | 3085/8750 [1:08:03<9:06:18, 5.79s/it] 35%|███▌ | 3086/8750 [1:08:09<9:00:46, 5.73s/it] 35%|███▌ | 3086/8750 [1:08:12<9:00:46, 5.73s/it] {'loss': 0.4923, 'learning_rate': 1.5019222003870954e-05, 'epoch': 0.35} + 35%|███▌ | 3086/8750 [1:08:12<9:00:46, 5.73s/it] {'loss': 0.4923, 'learning_rate': 1.5019222003870954e-05, 'epoch': 0.35} + 35%|███▌ | 3086/8750 [1:08:09<9:00:46, 5.73s/it] 35%|███▌ | 3087/8750 [1:08:18<9:04:40, 5.77s/it] 35%|███▌ | 3087/8750 [1:08:15<9:04:40, 5.77s/it] {'loss': 0.4548, 'learning_rate': 1.501602005336296e-05, 'epoch': 0.35} + 35%|███▌ | 3087/8750 [1:08:18<9:04:40, 5.77s/it] {'loss': 0.4548, 'learning_rate': 1.501602005336296e-05, 'epoch': 0.35} + 35%|███▌ | 3087/8750 [1:08:15<9:04:40, 5.77s/it] 35%|███▌ | 3088/8750 [1:08:23<9:08:14, 5.81s/it] 35%|███▌ | 3088/8750 [1:08:21<9:08:14, 5.81s/it] {'loss': 0.4693, 'learning_rate': 1.5012817415548226e-05, 'epoch': 0.35} + 35%|███▌ | 3088/8750 [1:08:23<9:08:14, 5.81s/it] {'loss': 0.4693, 'learning_rate': 1.5012817415548226e-05, 'epoch': 0.35} + 35%|███▌ | 3088/8750 [1:08:21<9:08:14, 5.81s/it] 35%|███▌ | 3089/8750 [1:08:29<9:02:07, 5.75s/it] 35%|███▌ | 3089/8750 [1:08:26<9:02:08, 5.75s/it] {'loss': 0.4745, 'learning_rate': 1.500961409086559e-05, 'epoch': 0.35} + 35%|███▌ | 3089/8750 [1:08:29<9:02:07, 5.75s/it] {'loss': 0.4745, 'learning_rate': 1.500961409086559e-05, 'epoch': 0.35} + 35%|███▌ | 3089/8750 [1:08:26<9:02:08, 5.75s/it] 35%|███▌ | 3090/8750 [1:08:32<9:05:16, 5.78s/it] 35%|███▌ | 3090/8750 [1:08:35<9:05:17, 5.78s/it] {'loss': 0.4642, 'learning_rate': 1.5006410079753974e-05, 'epoch': 0.35} + 35%|███▌ | 3090/8750 [1:08:35<9:05:17, 
5.78s/it] {'loss': 0.4642, 'learning_rate': 1.5006410079753974e-05, 'epoch': 0.35} + 35%|███▌ | 3090/8750 [1:08:32<9:05:16, 5.78s/it] 35%|███▌ | 3091/8750 [1:08:41<9:14:43, 5.88s/it] 35%|███▌ | 3091/8750 [1:08:38<9:14:44, 5.88s/it] {'loss': 0.4622, 'learning_rate': 1.5003205382652409e-05, 'epoch': 0.35} + 35%|███▌ | 3091/8750 [1:08:41<9:14:43, 5.88s/it] {'loss': 0.4622, 'learning_rate': 1.5003205382652409e-05, 'epoch': 0.35} + 35%|███▌ | 3091/8750 [1:08:38<9:14:44, 5.88s/it] 35%|███▌ | 3092/8750 [1:08:47<9:12:03, 5.85s/it] 35%|███▌ | 3092/8750 [1:08:44<9:12:04, 5.85s/it] {'loss': 0.4943, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.35} + 35%|███▌ | 3092/8750 [1:08:47<9:12:03, 5.85s/it] {'loss': 0.4943, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.35} + 35%|███▌ | 3092/8750 [1:08:44<9:12:04, 5.85s/it] 35%|███▌ | 3093/8750 [1:08:53<9:06:57, 5.80s/it] 35%|███▌ | 3093/8750 [1:08:50<9:06:57, 5.80s/it] {'loss': 0.4657, 'learning_rate': 1.4996793932235965e-05, 'epoch': 0.35} + 35%|███▌ | 3093/8750 [1:08:53<9:06:57, 5.80s/it] {'loss': 0.4657, 'learning_rate': 1.4996793932235965e-05, 'epoch': 0.35} + 35%|███▌ | 3093/8750 [1:08:50<9:06:57, 5.80s/it] 35%|███▌ | 3094/8750 [1:08:58<9:03:20, 5.76s/it] 35%|███▌ | 3094/8750 [1:08:55<9:03:21, 5.76s/it] {'loss': 0.4849, 'learning_rate': 1.4993587179799598e-05, 'epoch': 0.35} + 35%|███▌ | 3094/8750 [1:08:58<9:03:20, 5.76s/it] {'loss': 0.4849, 'learning_rate': 1.4993587179799598e-05, 'epoch': 0.35} + 35%|███▌ | 3094/8750 [1:08:55<9:03:21, 5.76s/it] 35%|███▌ | 3095/8750 [1:09:04<9:02:54, 5.76s/it] 35%|███▌ | 3095/8750 [1:09:01<9:02:54, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.49903797431303e-05, 'epoch': 0.35} + 35%|███▌ | 3095/8750 [1:09:04<9:02:54, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.49903797431303e-05, 'epoch': 0.35} + 35%|███▌ | 3095/8750 [1:09:01<9:02:54, 5.76s/it] 35%|███▌ | 3096/8750 [1:09:10<9:01:27, 5.75s/it] 35%|███▌ | 3096/8750 [1:09:07<9:01:27, 5.75s/it] {'loss': 0.4694, 'learning_rate': 
1.4987171622667562e-05, 'epoch': 0.35} + 35%|███▌ | 3096/8750 [1:09:10<9:01:27, 5.75s/it] {'loss': 0.4694, 'learning_rate': 1.4987171622667562e-05, 'epoch': 0.35} + 35%|███▌ | 3096/8750 [1:09:07<9:01:27, 5.75s/it] 35%|███▌ | 3097/8750 [1:09:15<9:00:39, 5.74s/it] 35%|███▌ | 3097/8750 [1:09:12<9:00:39, 5.74s/it] {'loss': 0.4771, 'learning_rate': 1.4983962818850967e-05, 'epoch': 0.35} + 35%|███▌ | 3097/8750 [1:09:15<9:00:39, 5.74s/it] {'loss': 0.4771, 'learning_rate': 1.4983962818850967e-05, 'epoch': 0.35} + 35%|███▌ | 3097/8750 [1:09:12<9:00:39, 5.74s/it] 35%|███▌ | 3098/8750 [1:09:21<8:58:34, 5.72s/it] 35%|███▌ | 3098/8750 [1:09:18<8:58:34, 5.72s/it] {'loss': 0.4754, 'learning_rate': 1.4980753332120193e-05, 'epoch': 0.35} + 35%|███▌ | 3098/8750 [1:09:21<8:58:34, 5.72s/it] {'loss': 0.4754, 'learning_rate': 1.4980753332120193e-05, 'epoch': 0.35} + 35%|███▌ | 3098/8750 [1:09:18<8:58:34, 5.72s/it] 35%|███▌ | 3099/8750 [1:09:27<9:13:00, 5.87s/it] 35%|███▌ | 3099/8750 [1:09:24<9:13:01, 5.87s/it] {'loss': 0.4576, 'learning_rate': 1.4977543162915011e-05, 'epoch': 0.35} + 35%|███▌ | 3099/8750 [1:09:27<9:13:00, 5.87s/it] {'loss': 0.4576, 'learning_rate': 1.4977543162915011e-05, 'epoch': 0.35} + 35%|███▌ | 3099/8750 [1:09:24<9:13:01, 5.87s/it]5 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +1312 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +16 35%|███▌ | 3100/8750 [1:09:33<9:07:02, 5.81s/it] 9 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +0AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 
+ AutoResumeHook: Checking whether to suspend...4 AutoResumeHook: Checking whether to suspend... + +2 AutoResumeHook: Checking whether to suspend... + 35%|███▌ | 3100/8750 [1:09:30<9:07:02, 5.81s/it] {'loss': 0.4701, 'learning_rate': 1.4974332311675286e-05, 'epoch': 0.35} + 35%|███▌ | 3100/8750 [1:09:33<9:07:02, 5.81s/it] {'loss': 0.4701, 'learning_rate': 1.4974332311675286e-05, 'epoch': 0.35} + 35%|███▌ | 3100/8750 [1:09:30<9:07:02, 5.81s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 35%|███▌ | 3101/8750 [1:09:54<16:07:45, 10.28s/it] 35%|███▌ | 3101/8750 [1:09:51<16:07:45, 10.28s/it] {'loss': 0.4787, 'learning_rate': 1.497112077884098e-05, 'epoch': 0.35} + 35%|███▌ | 3101/8750 [1:09:54<16:07:45, 10.28s/it] {'loss': 0.4787, 'learning_rate': 1.497112077884098e-05, 'epoch': 0.35} + 35%|███▌ | 3101/8750 [1:09:51<16:07:45, 10.28s/it] 35%|███▌ | 3102/8750 [1:09:59<13:56:36, 8.89s/it] 35%|███▌ | 3102/8750 [1:09:56<13:56:36, 8.89s/it] {'loss': 0.5031, 'learning_rate': 1.4967908564852137e-05, 'epoch': 0.35} + 35%|███▌ | 3102/8750 [1:09:59<13:56:36, 8.89s/it] {'loss': 0.5031, 'learning_rate': 1.4967908564852137e-05, 'epoch': 0.35} + 35%|███▌ | 3102/8750 [1:09:56<13:56:36, 8.89s/it] 35%|███▌ | 3103/8750 [1:10:05<12:29:25, 7.96s/it] 35%|███▌ | 3103/8750 [1:10:02<12:29:25, 7.96s/it] {'loss': 0.47, 'learning_rate': 1.4964695670148907e-05, 'epoch': 0.35} + 35%|███▌ | 3103/8750 [1:10:05<12:29:25, 7.96s/it] {'loss': 0.47, 'learning_rate': 1.4964695670148907e-05, 'epoch': 0.35} + 35%|███▌ | 3103/8750 [1:10:02<12:29:25, 7.96s/it] 35%|███▌ | 3104/8750 [1:10:11<11:26:56, 7.30s/it] 35%|███▌ | 3104/8750 [1:10:08<11:26:56, 7.30s/it] {'loss': 0.4716, 'learning_rate': 1.4961482095171529e-05, 'epoch': 0.35} + 35%|███▌ | 3104/8750 [1:10:11<11:26:56, 7.30s/it] {'loss': 0.4716, 'learning_rate': 1.4961482095171529e-05, 'epoch': 0.35} + 35%|███▌ | 3104/8750 [1:10:08<11:26:56, 7.30s/it] 35%|███▌ | 3105/8750 [1:10:16<10:37:53, 6.78s/it] 35%|███▌ | 3105/8750 [1:10:14<10:37:53, 6.78s/it] {'loss': 0.4739, 'learning_rate': 1.4958267840360332e-05, 'epoch': 0.35} + 35%|███▌ | 3105/8750 [1:10:16<10:37:53, 6.78s/it] {'loss': 0.4739, 'learning_rate': 1.4958267840360332e-05, 'epoch': 0.35} + 35%|███▌ | 3105/8750 [1:10:14<10:37:53, 6.78s/it] 35%|███▌ | 3106/8750 [1:10:19<10:07:32, 6.46s/it] 35%|███▌ | 3106/8750 [1:10:22<10:07:34, 6.46s/it] {'loss': 0.4693, 'learning_rate': 1.495505290615574e-05, 'epoch': 0.35} + 35%|███▌ | 3106/8750 [1:10:22<10:07:34, 6.46s/it] {'loss': 
0.4693, 'learning_rate': 1.495505290615574e-05, 'epoch': 0.35} + 35%|███▌ | 3106/8750 [1:10:19<10:07:32, 6.46s/it] 36%|███▌ | 3107/8750 [1:10:25<9:47:16, 6.24s/it] 36%|███▌ | 3107/8750 [1:10:28<9:47:17, 6.24s/it] {'loss': 0.4659, 'learning_rate': 1.4951837292998277e-05, 'epoch': 0.36} + 36%|███▌ | 3107/8750 [1:10:28<9:47:17, 6.24s/it] {'loss': 0.4659, 'learning_rate': 1.4951837292998277e-05, 'epoch': 0.36} + 36%|███▌ | 3107/8750 [1:10:25<9:47:16, 6.24s/it] 36%|███▌ | 3108/8750 [1:10:34<9:33:11, 6.10s/it] 36%|███▌ | 3108/8750 [1:10:31<9:33:12, 6.10s/it] {'loss': 0.4686, 'learning_rate': 1.4948621001328544e-05, 'epoch': 0.36} + 36%|███▌ | 3108/8750 [1:10:34<9:33:11, 6.10s/it] {'loss': 0.4686, 'learning_rate': 1.4948621001328544e-05, 'epoch': 0.36} + 36%|███▌ | 3108/8750 [1:10:31<9:33:12, 6.10s/it] 36%|███▌ | 3109/8750 [1:10:39<9:26:27, 6.03s/it] 36%|███▌ | 3109/8750 [1:10:37<9:26:27, 6.03s/it] {'loss': 0.4645, 'learning_rate': 1.4945404031587255e-05, 'epoch': 0.36} + 36%|███▌ | 3109/8750 [1:10:39<9:26:27, 6.03s/it] {'loss': 0.4645, 'learning_rate': 1.4945404031587255e-05, 'epoch': 0.36} + 36%|███▌ | 3109/8750 [1:10:37<9:26:27, 6.03s/it] 36%|███▌ | 3110/8750 [1:10:45<9:16:35, 5.92s/it] 36%|███▌ | 3110/8750 [1:10:42<9:16:35, 5.92s/it] {'loss': 0.4806, 'learning_rate': 1.4942186384215198e-05, 'epoch': 0.36} + 36%|███▌ | 3110/8750 [1:10:45<9:16:35, 5.92s/it] {'loss': 0.4806, 'learning_rate': 1.4942186384215198e-05, 'epoch': 0.36} + 36%|███▌ | 3110/8750 [1:10:42<9:16:35, 5.92s/it] 36%|███▌ | 3111/8750 [1:10:48<9:09:29, 5.85s/it] 36%|███▌ | 3111/8750 [1:10:51<9:09:30, 5.85s/it] {'loss': 0.4747, 'learning_rate': 1.4938968059653269e-05, 'epoch': 0.36} + 36%|███▌ | 3111/8750 [1:10:51<9:09:30, 5.85s/it] {'loss': 0.4747, 'learning_rate': 1.4938968059653269e-05, 'epoch': 0.36} + 36%|███▌ | 3111/8750 [1:10:48<9:09:29, 5.85s/it] 36%|███▌ | 3112/8750 [1:10:54<9:04:46, 5.80s/it] 36%|███▌ | 3112/8750 [1:10:57<9:04:47, 5.80s/it] {'loss': 0.4749, 'learning_rate': 
1.4935749058342446e-05, 'epoch': 0.36} + 36%|███▌ | 3112/8750 [1:10:57<9:04:47, 5.80s/it] {'loss': 0.4749, 'learning_rate': 1.4935749058342446e-05, 'epoch': 0.36} + 36%|███▌ | 3112/8750 [1:10:54<9:04:46, 5.80s/it] 36%|███▌ | 3113/8750 [1:11:03<9:15:00, 5.91s/it] 36%|███▌ | 3113/8750 [1:11:00<9:15:00, 5.91s/it] {'loss': 0.4757, 'learning_rate': 1.4932529380723806e-05, 'epoch': 0.36} + 36%|███▌ | 3113/8750 [1:11:03<9:15:00, 5.91s/it] {'loss': 0.4757, 'learning_rate': 1.4932529380723806e-05, 'epoch': 0.36} + 36%|███▌ | 3113/8750 [1:11:00<9:15:00, 5.91s/it] 36%|███▌ | 3114/8750 [1:11:05<9:08:30, 5.84s/it] 36%|███▌ | 3114/8750 [1:11:08<9:08:31, 5.84s/it] {'loss': 0.4718, 'learning_rate': 1.4929309027238517e-05, 'epoch': 0.36} + 36%|███▌ | 3114/8750 [1:11:08<9:08:31, 5.84s/it] {'loss': 0.4718, 'learning_rate': 1.4929309027238517e-05, 'epoch': 0.36} + 36%|███▌ | 3114/8750 [1:11:05<9:08:30, 5.84s/it] 36%|███▌ | 3115/8750 [1:11:11<9:04:48, 5.80s/it] 36%|███▌ | 3115/8750 [1:11:14<9:04:48, 5.80s/it] {'loss': 0.4873, 'learning_rate': 1.4926087998327838e-05, 'epoch': 0.36} + 36%|███▌ | 3115/8750 [1:11:14<9:04:48, 5.80s/it] {'loss': 0.4873, 'learning_rate': 1.4926087998327838e-05, 'epoch': 0.36} + 36%|███▌ | 3115/8750 [1:11:11<9:04:48, 5.80s/it] 36%|███▌ | 3116/8750 [1:11:20<9:12:22, 5.88s/it] 36%|███▌ | 3116/8750 [1:11:17<9:12:23, 5.88s/it] {'loss': 0.458, 'learning_rate': 1.4922866294433122e-05, 'epoch': 0.36} + 36%|███▌ | 3116/8750 [1:11:20<9:12:22, 5.88s/it] {'loss': 0.458, 'learning_rate': 1.4922866294433122e-05, 'epoch': 0.36} + 36%|███▌ | 3116/8750 [1:11:17<9:12:23, 5.88s/it] 36%|███▌ | 3117/8750 [1:11:26<9:06:37, 5.82s/it] 36%|███▌ | 3117/8750 [1:11:23<9:06:38, 5.82s/it] {'loss': 0.4518, 'learning_rate': 1.4919643915995816e-05, 'epoch': 0.36} + 36%|███▌ | 3117/8750 [1:11:26<9:06:37, 5.82s/it] {'loss': 0.4518, 'learning_rate': 1.4919643915995816e-05, 'epoch': 0.36} + 36%|███▌ | 3117/8750 [1:11:23<9:06:38, 5.82s/it] 36%|███▌ | 3118/8750 [1:11:32<9:09:50, 5.86s/it] 36%|███▌ 
| 3118/8750 [1:11:29<9:09:50, 5.86s/it] {'loss': 0.484, 'learning_rate': 1.4916420863457456e-05, 'epoch': 0.36} + 36%|███▌ | 3118/8750 [1:11:32<9:09:50, 5.86s/it] {'loss': 0.484, 'learning_rate': 1.4916420863457456e-05, 'epoch': 0.36} + 36%|███▌ | 3118/8750 [1:11:29<9:09:50, 5.86s/it] 36%|███▌ | 3119/8750 [1:11:38<9:08:22, 5.84s/it] 36%|███▌ | 3119/8750 [1:11:35<9:08:22, 5.84s/it] {'loss': 0.4704, 'learning_rate': 1.4913197137259675e-05, 'epoch': 0.36} + 36%|███▌ | 3119/8750 [1:11:38<9:08:22, 5.84s/it] {'loss': 0.4704, 'learning_rate': 1.4913197137259675e-05, 'epoch': 0.36} + 36%|███▌ | 3119/8750 [1:11:35<9:08:22, 5.84s/it] 36%|███▌ | 3120/8750 [1:11:43<9:04:53, 5.81s/it] 36%|███▌ | 3120/8750 [1:11:40<9:04:54, 5.81s/it] {'loss': 0.4703, 'learning_rate': 1.490997273784419e-05, 'epoch': 0.36} + 36%|███▌ | 3120/8750 [1:11:43<9:04:53, 5.81s/it] {'loss': 0.4703, 'learning_rate': 1.490997273784419e-05, 'epoch': 0.36} + 36%|███▌ | 3120/8750 [1:11:40<9:04:54, 5.81s/it] 36%|███▌ | 3121/8750 [1:11:49<9:03:57, 5.80s/it] 36%|███▌ | 3121/8750 [1:11:46<9:03:57, 5.80s/it] {'loss': 0.472, 'learning_rate': 1.4906747665652821e-05, 'epoch': 0.36} + 36%|███▌ | 3121/8750 [1:11:49<9:03:57, 5.80s/it] {'loss': 0.472, 'learning_rate': 1.4906747665652821e-05, 'epoch': 0.36} + 36%|███▌ | 3121/8750 [1:11:46<9:03:57, 5.80s/it] 36%|███▌ | 3122/8750 [1:11:55<9:01:16, 5.77s/it] 36%|███▌ | 3122/8750 [1:11:52<9:01:15, 5.77s/it] {'loss': 0.4628, 'learning_rate': 1.4903521921127472e-05, 'epoch': 0.36} + 36%|███▌ | 3122/8750 [1:11:55<9:01:16, 5.77s/it] {'loss': 0.4628, 'learning_rate': 1.4903521921127472e-05, 'epoch': 0.36} + 36%|███▌ | 3122/8750 [1:11:52<9:01:15, 5.77s/it] 36%|███▌ | 3123/8750 [1:12:00<8:56:58, 5.73s/it] 36%|███▌ | 3123/8750 [1:11:57<8:56:58, 5.73s/it] {'loss': 0.4817, 'learning_rate': 1.4900295504710143e-05, 'epoch': 0.36} + 36%|███▌ | 3123/8750 [1:12:00<8:56:58, 5.73s/it] {'loss': 0.4817, 'learning_rate': 1.4900295504710143e-05, 'epoch': 0.36} + 36%|███▌ | 3123/8750 
[1:11:58<8:56:58, 5.73s/it] 36%|███▌ | 3124/8750 [1:12:06<9:00:22, 5.76s/it] 36%|███▌ | 3124/8750 [1:12:03<9:00:22, 5.76s/it] {'loss': 0.464, 'learning_rate': 1.4897068416842926e-05, 'epoch': 0.36} + 36%|███▌ | 3124/8750 [1:12:06<9:00:22, 5.76s/it] {'loss': 0.464, 'learning_rate': 1.4897068416842926e-05, 'epoch': 0.36} + 36%|███▌ | 3124/8750 [1:12:03<9:00:22, 5.76s/it] 36%|███▌ | 3125/8750 [1:12:12<9:10:49, 5.88s/it] 36%|███▌ | 3125/8750 [1:12:09<9:10:50, 5.88s/it] {'loss': 0.4822, 'learning_rate': 1.4893840657968001e-05, 'epoch': 0.36} + {'loss': 0.4822, 'learning_rate': 1.4893840657968001e-05, 'epoch': 0.36} 36%|███▌ | 3125/8750 [1:12:12<9:10:49, 5.88s/it] + 36%|███▌ | 3125/8750 [1:12:09<9:10:50, 5.88s/it] 36%|███▌ | 3126/8750 [1:12:18<9:06:55, 5.83s/it] 36%|███▌ | 3126/8750 [1:12:15<9:06:55, 5.83s/it] {'loss': 0.4618, 'learning_rate': 1.4890612228527648e-05, 'epoch': 0.36} + 36%|███▌ | 3126/8750 [1:12:18<9:06:55, 5.83s/it] {'loss': 0.4618, 'learning_rate': 1.4890612228527648e-05, 'epoch': 0.36} + 36%|███▌ | 3126/8750 [1:12:15<9:06:55, 5.83s/it] 36%|███▌ | 3127/8750 [1:12:24<9:05:57, 5.83s/it] 36%|███▌ | 3127/8750 [1:12:21<9:05:57, 5.83s/it] {'loss': 0.4776, 'learning_rate': 1.4887383128964232e-05, 'epoch': 0.36} + 36%|███▌ | 3127/8750 [1:12:24<9:05:57, 5.83s/it] {'loss': 0.4776, 'learning_rate': 1.4887383128964232e-05, 'epoch': 0.36} + 36%|███▌ | 3127/8750 [1:12:21<9:05:57, 5.83s/it] 36%|███▌ | 3128/8750 [1:12:30<9:08:19, 5.85s/it] 36%|███▌ | 3128/8750 [1:12:27<9:08:18, 5.85s/it] {'loss': 0.4775, 'learning_rate': 1.4884153359720205e-05, 'epoch': 0.36} + 36%|███▌ | 3128/8750 [1:12:30<9:08:19, 5.85s/it] {'loss': 0.4775, 'learning_rate': 1.4884153359720205e-05, 'epoch': 0.36} + 36%|███▌ | 3128/8750 [1:12:27<9:08:18, 5.85s/it] 36%|███▌ | 3129/8750 [1:12:36<9:04:52, 5.82s/it] 36%|███▌ | 3129/8750 [1:12:33<9:04:52, 5.82s/it] {'loss': 0.4653, 'learning_rate': 1.4880922921238128e-05, 'epoch': 0.36} + 36%|███▌ | 3129/8750 [1:12:36<9:04:52, 5.82s/it] {'loss': 0.4653, 
'learning_rate': 1.4880922921238128e-05, 'epoch': 0.36} + 36%|███▌ | 3129/8750 [1:12:33<9:04:52, 5.82s/it] 36%|███▌ | 3130/8750 [1:12:41<9:05:25, 5.82s/it] 36%|███▌ | 3130/8750 [1:12:39<9:05:24, 5.82s/it] {'loss': 0.4933, 'learning_rate': 1.4877691813960638e-05, 'epoch': 0.36} + 36%|███▌ | 3130/8750 [1:12:41<9:05:25, 5.82s/it] {'loss': 0.4933, 'learning_rate': 1.4877691813960638e-05, 'epoch': 0.36} + 36%|███▌ | 3130/8750 [1:12:39<9:05:24, 5.82s/it] 36%|███▌ | 3131/8750 [1:12:47<9:01:55, 5.79s/it] 36%|███▌ | 3131/8750 [1:12:44<9:01:55, 5.79s/it] {'loss': 0.465, 'learning_rate': 1.4874460038330469e-05, 'epoch': 0.36} + 36%|███▌ | 3131/8750 [1:12:47<9:01:55, 5.79s/it] {'loss': 0.465, 'learning_rate': 1.4874460038330469e-05, 'epoch': 0.36} + 36%|███▌ | 3131/8750 [1:12:44<9:01:55, 5.79s/it] 36%|███▌ | 3132/8750 [1:12:53<9:04:47, 5.82s/it] 36%|███▌ | 3132/8750 [1:12:50<9:04:47, 5.82s/it] {'loss': 0.4592, 'learning_rate': 1.4871227594790447e-05, 'epoch': 0.36} + 36%|███▌ | 3132/8750 [1:12:53<9:04:47, 5.82s/it] {'loss': 0.4592, 'learning_rate': 1.4871227594790447e-05, 'epoch': 0.36} + 36%|███▌ | 3132/8750 [1:12:50<9:04:47, 5.82s/it] 36%|███▌ | 3133/8750 [1:12:59<9:01:55, 5.79s/it] 36%|███▌ | 3133/8750 [1:12:56<9:01:55, 5.79s/it] {'loss': 0.4704, 'learning_rate': 1.4867994483783485e-05, 'epoch': 0.36} + 36%|███▌ | 3133/8750 [1:12:59<9:01:55, 5.79s/it] {'loss': 0.4704, 'learning_rate': 1.4867994483783485e-05, 'epoch': 0.36} + 36%|███▌ | 3133/8750 [1:12:56<9:01:55, 5.79s/it] 36%|███▌ | 3134/8750 [1:13:04<9:00:30, 5.77s/it] 36%|███▌ | 3134/8750 [1:13:02<9:00:30, 5.77s/it] {'loss': 0.4597, 'learning_rate': 1.48647607057526e-05, 'epoch': 0.36} + 36%|███▌ | 3134/8750 [1:13:04<9:00:30, 5.77s/it] {'loss': 0.4597, 'learning_rate': 1.48647607057526e-05, 'epoch': 0.36} + 36%|███▌ | 3134/8750 [1:13:02<9:00:30, 5.77s/it] 36%|███▌ | 3135/8750 [1:13:10<9:04:14, 5.82s/it] 36%|███▌ | 3135/8750 [1:13:07<9:04:14, 5.82s/it] {'loss': 0.4649, 'learning_rate': 1.4861526261140886e-05, 'epoch': 
0.36} + 36%|███▌ | 3135/8750 [1:13:10<9:04:14, 5.82s/it] {'loss': 0.4649, 'learning_rate': 1.4861526261140886e-05, 'epoch': 0.36} + 36%|███▌ | 3135/8750 [1:13:07<9:04:14, 5.82s/it] 36%|███▌ | 3136/8750 [1:13:16<8:58:21, 5.75s/it] 36%|███▌ | 3136/8750 [1:13:13<8:58:21, 5.75s/it] {'loss': 0.4881, 'learning_rate': 1.4858291150391533e-05, 'epoch': 0.36} + 36%|███▌ | 3136/8750 [1:13:16<8:58:21, 5.75s/it] {'loss': 0.4881, 'learning_rate': 1.4858291150391533e-05, 'epoch': 0.36} + 36%|███▌ | 3136/8750 [1:13:13<8:58:21, 5.75s/it] 36%|███▌ | 3137/8750 [1:13:22<9:03:14, 5.81s/it] 36%|███▌ | 3137/8750 [1:13:19<9:03:14, 5.81s/it] {'loss': 0.4686, 'learning_rate': 1.4855055373947829e-05, 'epoch': 0.36} + 36%|███▌ | 3137/8750 [1:13:22<9:03:14, 5.81s/it] {'loss': 0.4686, 'learning_rate': 1.4855055373947829e-05, 'epoch': 0.36} + 36%|███▌ | 3137/8750 [1:13:19<9:03:14, 5.81s/it] 36%|███▌ | 3138/8750 [1:13:25<9:04:24, 5.82s/it] 36%|███▌ | 3138/8750 [1:13:28<9:04:25, 5.82s/it] {'loss': 0.4904, 'learning_rate': 1.4851818932253137e-05, 'epoch': 0.36} + 36%|███▌ | 3138/8750 [1:13:28<9:04:25, 5.82s/it] {'loss': 0.4904, 'learning_rate': 1.4851818932253137e-05, 'epoch': 0.36} + 36%|███▌ | 3138/8750 [1:13:25<9:04:24, 5.82s/it] 36%|███▌ | 3139/8750 [1:13:31<9:03:21, 5.81s/it] 36%|███▌ | 3139/8750 [1:13:34<9:03:21, 5.81s/it] {'loss': 0.4775, 'learning_rate': 1.4848581825750935e-05, 'epoch': 0.36} + 36%|███▌ | 3139/8750 [1:13:34<9:03:21, 5.81s/it] {'loss': 0.4775, 'learning_rate': 1.4848581825750935e-05, 'epoch': 0.36} + 36%|███▌ | 3139/8750 [1:13:31<9:03:21, 5.81s/it] 36%|███▌ | 3140/8750 [1:13:40<9:11:46, 5.90s/it] 36%|███▌ | 3140/8750 [1:13:37<9:11:46, 5.90s/it] {'loss': 0.456, 'learning_rate': 1.4845344054884772e-05, 'epoch': 0.36} + 36%|███▌ | 3140/8750 [1:13:40<9:11:46, 5.90s/it] {'loss': 0.456, 'learning_rate': 1.4845344054884772e-05, 'epoch': 0.36} + 36%|███▌ | 3140/8750 [1:13:37<9:11:46, 5.90s/it] 36%|███▌ | 3141/8750 [1:13:45<9:06:31, 5.85s/it] 36%|███▌ | 3141/8750 [1:13:42<9:06:31, 
5.85s/it] {'loss': 0.4727, 'learning_rate': 1.4842105620098292e-05, 'epoch': 0.36} + 36%|███▌ | 3141/8750 [1:13:45<9:06:31, 5.85s/it] {'loss': 0.4727, 'learning_rate': 1.4842105620098292e-05, 'epoch': 0.36} + 36%|███▌ | 3141/8750 [1:13:42<9:06:31, 5.85s/it] 36%|███▌ | 3142/8750 [1:13:48<9:02:52, 5.81s/it] 36%|███▌ | 3142/8750 [1:13:51<9:02:53, 5.81s/it] {'loss': 0.4698, 'learning_rate': 1.4838866521835238e-05, 'epoch': 0.36} + 36%|███▌ | 3142/8750 [1:13:51<9:02:53, 5.81s/it] {'loss': 0.4698, 'learning_rate': 1.4838866521835238e-05, 'epoch': 0.36} + 36%|███▌ | 3142/8750 [1:13:48<9:02:52, 5.81s/it] 36%|███▌ | 3143/8750 [1:13:54<9:08:47, 5.87s/it] 36%|███▌ | 3143/8750 [1:13:57<9:08:48, 5.87s/it] {'loss': 0.4501, 'learning_rate': 1.4835626760539437e-05, 'epoch': 0.36} + 36%|███▌ | 3143/8750 [1:13:57<9:08:48, 5.87s/it] {'loss': 0.4501, 'learning_rate': 1.4835626760539437e-05, 'epoch': 0.36} + 36%|███▌ | 3143/8750 [1:13:54<9:08:47, 5.87s/it] 36%|███▌ | 3144/8750 [1:14:00<9:03:18, 5.81s/it] 36%|███▌ | 3144/8750 [1:14:03<9:03:18, 5.81s/it] {'loss': 0.4737, 'learning_rate': 1.483238633665481e-05, 'epoch': 0.36} + 36%|███▌ | 3144/8750 [1:14:03<9:03:18, 5.81s/it] {'loss': 0.4737, 'learning_rate': 1.483238633665481e-05, 'epoch': 0.36} + 36%|███▌ | 3144/8750 [1:14:00<9:03:18, 5.81s/it] 36%|███▌ | 3145/8750 [1:14:06<9:01:40, 5.80s/it] 36%|███▌ | 3145/8750 [1:14:09<9:01:40, 5.80s/it] {'loss': 0.476, 'learning_rate': 1.4829145250625368e-05, 'epoch': 0.36} + 36%|███▌ | 3145/8750 [1:14:09<9:01:40, 5.80s/it] {'loss': 0.476, 'learning_rate': 1.4829145250625368e-05, 'epoch': 0.36} + 36%|███▌ | 3145/8750 [1:14:06<9:01:40, 5.80s/it] 36%|███▌ | 3146/8750 [1:14:12<9:05:54, 5.84s/it] 36%|███▌ | 3146/8750 [1:14:15<9:05:55, 5.84s/it] {'loss': 0.4562, 'learning_rate': 1.4825903502895207e-05, 'epoch': 0.36} + {'loss': 0.4562, 'learning_rate': 1.4825903502895207e-05, 'epoch': 0.36} 36%|███▌ | 3146/8750 [1:14:15<9:05:55, 5.84s/it] + 36%|███▌ | 3146/8750 [1:14:12<9:05:54, 5.84s/it] 36%|███▌ | 
3147/8750 [1:14:18<9:07:35, 5.86s/it] 36%|███▌ | 3147/8750 [1:14:20<9:07:35, 5.86s/it] {'loss': 0.4805, 'learning_rate': 1.4822661093908521e-05, 'epoch': 0.36} + 36%|███▌ | 3147/8750 [1:14:20<9:07:35, 5.86s/it] {'loss': 0.4805, 'learning_rate': 1.4822661093908521e-05, 'epoch': 0.36} + 36%|███▌ | 3147/8750 [1:14:18<9:07:35, 5.86s/it] 36%|███▌ | 3148/8750 [1:14:26<9:05:50, 5.85s/it] 36%|███▌ | 3148/8750 [1:14:23<9:05:51, 5.85s/it] {'loss': 0.4816, 'learning_rate': 1.4819418024109595e-05, 'epoch': 0.36} + 36%|███▌ | 3148/8750 [1:14:26<9:05:50, 5.85s/it] {'loss': 0.4816, 'learning_rate': 1.4819418024109595e-05, 'epoch': 0.36} + 36%|███▌ | 3148/8750 [1:14:23<9:05:51, 5.85s/it] 36%|███▌ | 3149/8750 [1:14:32<9:02:34, 5.81s/it] 36%|███▌ | 3149/8750 [1:14:29<9:02:35, 5.81s/it] {'loss': 0.4696, 'learning_rate': 1.4816174293942804e-05, 'epoch': 0.36} + 36%|███▌ | 3149/8750 [1:14:32<9:02:34, 5.81s/it] {'loss': 0.4696, 'learning_rate': 1.4816174293942804e-05, 'epoch': 0.36} + 36%|███▌ | 3149/8750 [1:14:29<9:02:35, 5.81s/it]10 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 36%|███▌ | 3150/8750 [1:14:35<9:00:21, 5.79s/it] + 36%|███▌ | 3150/8750 [1:14:38<9:00:22, 5.79s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4819, 'learning_rate': 1.4812929903852606e-05, 'epoch': 0.36} + 36%|███▌ | 3150/8750 [1:14:38<9:00:22, 5.79s/it] {'loss': 0.4819, 'learning_rate': 1.4812929903852606e-05, 'epoch': 0.36} + 36%|███▌ | 3150/8750 [1:14:35<9:00:21, 5.79s/it] 36%|███▌ | 3151/8750 [1:14:41<8:58:38, 5.77s/it] 36%|███▌ | 3151/8750 [1:14:43<8:58:39, 5.77s/it] {'loss': 0.4685, 'learning_rate': 1.4809684854283557e-05, 'epoch': 0.36} + 36%|███▌ | 3151/8750 [1:14:43<8:58:39, 5.77s/it] {'loss': 0.4685, 'learning_rate': 1.4809684854283557e-05, 'epoch': 0.36} + 36%|███▌ | 3151/8750 [1:14:41<8:58:38, 5.77s/it] 36%|███▌ | 3152/8750 [1:14:46<8:56:54, 5.75s/it] 36%|███▌ | 3152/8750 [1:14:49<8:56:54, 5.75s/it] {'loss': 0.4632, 'learning_rate': 1.4806439145680298e-05, 'epoch': 0.36} + 36%|███▌ | 3152/8750 [1:14:49<8:56:54, 5.75s/it] {'loss': 0.4632, 'learning_rate': 1.4806439145680298e-05, 'epoch': 0.36} + 36%|███▌ | 3152/8750 [1:14:46<8:56:54, 5.75s/it] 36%|███▌ | 3153/8750 [1:14:55<9:09:33, 5.89s/it] 36%|███▌ | 3153/8750 [1:14:52<9:09:34, 5.89s/it] {'loss': 0.4622, 'learning_rate': 1.4803192778487569e-05, 'epoch': 0.36} + 36%|███▌ | 3153/8750 [1:14:55<9:09:33, 5.89s/it] {'loss': 0.4622, 'learning_rate': 1.4803192778487569e-05, 'epoch': 0.36} + 36%|███▌ | 3153/8750 [1:14:52<9:09:34, 5.89s/it] 36%|███▌ | 3154/8750 [1:14:58<8:56:23, 5.75s/it] 36%|███▌ | 3154/8750 [1:15:01<8:56:23, 5.75s/it] {'loss': 0.508, 'learning_rate': 1.4799945753150194e-05, 'epoch': 0.36} + 36%|███▌ | 3154/8750 [1:15:01<8:56:23, 5.75s/it] {'loss': 0.508, 'learning_rate': 1.4799945753150194e-05, 'epoch': 0.36} + 36%|███▌ | 3154/8750 [1:14:58<8:56:23, 5.75s/it] 36%|███▌ | 3155/8750 [1:15:04<8:57:04, 5.76s/it] 36%|███▌ | 3155/8750 [1:15:07<8:57:05, 5.76s/it] {'loss': 0.4752, 'learning_rate': 1.4796698070113084e-05, 'epoch': 0.36} + 36%|███▌ | 3155/8750 [1:15:07<8:57:05, 5.76s/it] {'loss': 0.4752, 'learning_rate': 1.4796698070113084e-05, 'epoch': 0.36} + 36%|███▌ | 3155/8750 [1:15:04<8:57:04, 5.76s/it] 36%|███▌ | 3156/8750 
[1:15:13<9:05:04, 5.85s/it] 36%|███▌ | 3156/8750 [1:15:10<9:05:04, 5.85s/it] {'loss': 0.4801, 'learning_rate': 1.4793449729821248e-05, 'epoch': 0.36} + 36%|███▌ | 3156/8750 [1:15:13<9:05:04, 5.85s/it] {'loss': 0.4801, 'learning_rate': 1.4793449729821248e-05, 'epoch': 0.36} + 36%|███▌ | 3156/8750 [1:15:10<9:05:04, 5.85s/it] 36%|███▌ | 3157/8750 [1:15:18<9:01:32, 5.81s/it] 36%|███▌ | 3157/8750 [1:15:15<9:01:33, 5.81s/it] {'loss': 0.4513, 'learning_rate': 1.4790200732719779e-05, 'epoch': 0.36} + 36%|███▌ | 3157/8750 [1:15:18<9:01:32, 5.81s/it] {'loss': 0.4513, 'learning_rate': 1.4790200732719779e-05, 'epoch': 0.36} + 36%|███▌ | 3157/8750 [1:15:15<9:01:33, 5.81s/it] 36%|███▌ | 3158/8750 [1:15:21<9:08:00, 5.88s/it] 36%|███▌ | 3158/8750 [1:15:24<9:08:01, 5.88s/it] {'loss': 0.4687, 'learning_rate': 1.4786951079253861e-05, 'epoch': 0.36} + 36%|███▌ | 3158/8750 [1:15:24<9:08:01, 5.88s/it] {'loss': 0.4687, 'learning_rate': 1.4786951079253861e-05, 'epoch': 0.36} + 36%|███▌ | 3158/8750 [1:15:21<9:08:00, 5.88s/it] 36%|███▌ | 3159/8750 [1:15:27<9:03:59, 5.84s/it] 36%|███▌ | 3159/8750 [1:15:30<9:03:59, 5.84s/it] {'loss': 0.5047, 'learning_rate': 1.4783700769868775e-05, 'epoch': 0.36} + 36%|███▌ | 3159/8750 [1:15:30<9:03:59, 5.84s/it] {'loss': 0.5047, 'learning_rate': 1.4783700769868775e-05, 'epoch': 0.36} + 36%|███▌ | 3159/8750 [1:15:27<9:03:59, 5.84s/it] 36%|███▌ | 3160/8750 [1:15:33<8:59:25, 5.79s/it] 36%|███▌ | 3160/8750 [1:15:36<8:59:26, 5.79s/it] {'loss': 0.4439, 'learning_rate': 1.4780449805009878e-05, 'epoch': 0.36} + 36%|███▌ | 3160/8750 [1:15:36<8:59:26, 5.79s/it] {'loss': 0.4439, 'learning_rate': 1.4780449805009878e-05, 'epoch': 0.36} + 36%|███▌ | 3160/8750 [1:15:33<8:59:25, 5.79s/it] 36%|███▌ | 3161/8750 [1:15:42<9:00:19, 5.80s/it] 36%|███▌ | 3161/8750 [1:15:39<9:00:20, 5.80s/it] {'loss': 0.481, 'learning_rate': 1.477719818512263e-05, 'epoch': 0.36} + 36%|███▌ | 3161/8750 [1:15:42<9:00:19, 5.80s/it] {'loss': 0.481, 'learning_rate': 1.477719818512263e-05, 'epoch': 0.36} 
+ 36%|███▌ | 3161/8750 [1:15:39<9:00:20, 5.80s/it] 36%|███▌ | 3162/8750 [1:15:44<8:58:45, 5.78s/it] 36%|███▌ | 3162/8750 [1:15:47<8:58:46, 5.78s/it] {'loss': 0.4716, 'learning_rate': 1.4773945910652576e-05, 'epoch': 0.36} + 36%|███▌ | 3162/8750 [1:15:47<8:58:46, 5.78s/it] {'loss': 0.4716, 'learning_rate': 1.4773945910652576e-05, 'epoch': 0.36} + 36%|███▌ | 3162/8750 [1:15:44<8:58:45, 5.78s/it] 36%|███▌ | 3163/8750 [1:15:50<8:55:43, 5.75s/it] 36%|███▌ | 3163/8750 [1:15:53<8:55:43, 5.75s/it] {'loss': 0.4807, 'learning_rate': 1.4770692982045344e-05, 'epoch': 0.36} + 36%|███▌ | 3163/8750 [1:15:53<8:55:43, 5.75s/it] {'loss': 0.4807, 'learning_rate': 1.4770692982045344e-05, 'epoch': 0.36} + 36%|███▌ | 3163/8750 [1:15:50<8:55:43, 5.75s/it] 36%|███▌ | 3164/8750 [1:15:59<8:57:28, 5.77s/it] 36%|███▌ | 3164/8750 [1:15:56<8:57:29, 5.77s/it] {'loss': 0.476, 'learning_rate': 1.4767439399746666e-05, 'epoch': 0.36} + 36%|███▌ | 3164/8750 [1:15:59<8:57:28, 5.77s/it] {'loss': 0.476, 'learning_rate': 1.4767439399746666e-05, 'epoch': 0.36} + 36%|███▌ | 3164/8750 [1:15:56<8:57:29, 5.77s/it] 36%|███▌ | 3165/8750 [1:16:05<9:04:29, 5.85s/it] 36%|███▌ | 3165/8750 [1:16:02<9:04:29, 5.85s/it] {'loss': 0.4745, 'learning_rate': 1.4764185164202349e-05, 'epoch': 0.36} + 36%|███▌ | 3165/8750 [1:16:05<9:04:29, 5.85s/it] {'loss': 0.4745, 'learning_rate': 1.4764185164202349e-05, 'epoch': 0.36} + 36%|███▌ | 3165/8750 [1:16:02<9:04:29, 5.85s/it] 36%|███▌ | 3166/8750 [1:16:11<9:02:53, 5.83s/it] 36%|███▌ | 3166/8750 [1:16:08<9:02:53, 5.83s/it] {'loss': 0.4644, 'learning_rate': 1.47609302758583e-05, 'epoch': 0.36} + 36%|███▌ | 3166/8750 [1:16:11<9:02:53, 5.83s/it] {'loss': 0.4644, 'learning_rate': 1.47609302758583e-05, 'epoch': 0.36} + 36%|███▌ | 3166/8750 [1:16:08<9:02:53, 5.83s/it] 36%|███▌ | 3167/8750 [1:16:17<9:02:00, 5.82s/it] 36%|███▌ | 3167/8750 [1:16:14<9:02:00, 5.82s/it] {'loss': 0.4823, 'learning_rate': 1.4757674735160512e-05, 'epoch': 0.36} + 36%|███▌ | 3167/8750 [1:16:17<9:02:00, 5.82s/it] 
{'loss': 0.4823, 'learning_rate': 1.4757674735160512e-05, 'epoch': 0.36} + 36%|███▌ | 3167/8750 [1:16:14<9:02:00, 5.82s/it] 36%|███▌ | 3168/8750 [1:16:22<9:00:14, 5.81s/it] 36%|███▌ | 3168/8750 [1:16:19<9:00:15, 5.81s/it] {'loss': 0.464, 'learning_rate': 1.475441854255506e-05, 'epoch': 0.36} + {'loss': 0.464, 'learning_rate': 1.475441854255506e-05, 'epoch': 0.36} 36%|███▌ | 3168/8750 [1:16:22<9:00:14, 5.81s/it] + 36%|███▌ | 3168/8750 [1:16:19<9:00:15, 5.81s/it] 36%|███▌ | 3169/8750 [1:16:28<8:57:01, 5.77s/it] 36%|███▌ | 3169/8750 [1:16:25<8:57:01, 5.77s/it] {'loss': 0.4667, 'learning_rate': 1.4751161698488124e-05, 'epoch': 0.36} + 36%|███▌ | 3169/8750 [1:16:28<8:57:01, 5.77s/it] {'loss': 0.4667, 'learning_rate': 1.4751161698488124e-05, 'epoch': 0.36} + 36%|███▌ | 3169/8750 [1:16:25<8:57:01, 5.77s/it] 36%|███▌ | 3170/8750 [1:16:34<8:57:21, 5.78s/it] 36%|███▌ | 3170/8750 [1:16:31<8:57:21, 5.78s/it] {'loss': 0.4758, 'learning_rate': 1.4747904203405959e-05, 'epoch': 0.36} + 36%|███▌ | 3170/8750 [1:16:34<8:57:21, 5.78s/it] {'loss': 0.4758, 'learning_rate': 1.4747904203405959e-05, 'epoch': 0.36} + 36%|███▌ | 3170/8750 [1:16:31<8:57:21, 5.78s/it] 36%|███▌ | 3171/8750 [1:16:40<8:56:39, 5.77s/it] 36%|███▌ | 3171/8750 [1:16:37<8:56:39, 5.77s/it] {'loss': 0.4825, 'learning_rate': 1.4744646057754913e-05, 'epoch': 0.36} + 36%|███▌ | 3171/8750 [1:16:40<8:56:39, 5.77s/it] {'loss': 0.4825, 'learning_rate': 1.4744646057754913e-05, 'epoch': 0.36} + 36%|███▌ | 3171/8750 [1:16:37<8:56:39, 5.77s/it] 36%|███▋ | 3172/8750 [1:16:45<8:55:25, 5.76s/it] 36%|███▋ | 3172/8750 [1:16:42<8:55:25, 5.76s/it] {'loss': 0.447, 'learning_rate': 1.4741387261981428e-05, 'epoch': 0.36} + 36%|███▋ | 3172/8750 [1:16:45<8:55:25, 5.76s/it] {'loss': 0.447, 'learning_rate': 1.4741387261981428e-05, 'epoch': 0.36} + 36%|███▋ | 3172/8750 [1:16:42<8:55:25, 5.76s/it] 36%|███▋ | 3173/8750 [1:16:51<8:59:05, 5.80s/it] 36%|███▋ | 3173/8750 [1:16:48<8:59:06, 5.80s/it] {'loss': 0.4674, 'learning_rate': 
1.4738127816532034e-05, 'epoch': 0.36} + 36%|███▋ | 3173/8750 [1:16:51<8:59:05, 5.80s/it] {'loss': 0.4674, 'learning_rate': 1.4738127816532034e-05, 'epoch': 0.36} + 36%|███▋ | 3173/8750 [1:16:48<8:59:06, 5.80s/it] 36%|███▋ | 3174/8750 [1:16:57<8:56:27, 5.77s/it] 36%|███▋ | 3174/8750 [1:16:54<8:56:26, 5.77s/it] {'loss': 0.4779, 'learning_rate': 1.4734867721853341e-05, 'epoch': 0.36} + 36%|███▋ | 3174/8750 [1:16:57<8:56:27, 5.77s/it] {'loss': 0.4779, 'learning_rate': 1.4734867721853341e-05, 'epoch': 0.36} + 36%|███▋ | 3174/8750 [1:16:54<8:56:26, 5.77s/it] 36%|███▋ | 3175/8750 [1:17:03<9:09:01, 5.91s/it] 36%|███▋ | 3175/8750 [1:17:00<9:09:01, 5.91s/it] {'loss': 0.4519, 'learning_rate': 1.4731606978392061e-05, 'epoch': 0.36} + 36%|███▋ | 3175/8750 [1:17:03<9:09:01, 5.91s/it] {'loss': 0.4519, 'learning_rate': 1.4731606978392061e-05, 'epoch': 0.36} + 36%|███▋ | 3175/8750 [1:17:00<9:09:01, 5.91s/it] 36%|███▋ | 3176/8750 [1:17:09<9:11:12, 5.93s/it] 36%|███▋ | 3176/8750 [1:17:06<9:11:12, 5.93s/it] {'loss': 0.4539, 'learning_rate': 1.4728345586594986e-05, 'epoch': 0.36} + 36%|███▋ | 3176/8750 [1:17:09<9:11:12, 5.93s/it] {'loss': 0.4539, 'learning_rate': 1.4728345586594986e-05, 'epoch': 0.36} + 36%|███▋ | 3176/8750 [1:17:06<9:11:12, 5.93s/it] 36%|███▋ | 3177/8750 [1:17:15<9:04:44, 5.86s/it] 36%|███▋ | 3177/8750 [1:17:12<9:04:43, 5.86s/it] {'loss': 0.474, 'learning_rate': 1.4725083546909e-05, 'epoch': 0.36} + 36%|███▋ | 3177/8750 [1:17:15<9:04:44, 5.86s/it] {'loss': 0.474, 'learning_rate': 1.4725083546909e-05, 'epoch': 0.36} + 36%|███▋ | 3177/8750 [1:17:12<9:04:43, 5.86s/it] 36%|███▋ | 3178/8750 [1:17:21<9:12:27, 5.95s/it] 36%|███▋ | 3178/8750 [1:17:18<9:12:27, 5.95s/it] {'loss': 0.479, 'learning_rate': 1.4721820859781076e-05, 'epoch': 0.36} + 36%|███▋ | 3178/8750 [1:17:21<9:12:27, 5.95s/it] {'loss': 0.479, 'learning_rate': 1.4721820859781076e-05, 'epoch': 0.36} + 36%|███▋ | 3178/8750 [1:17:18<9:12:27, 5.95s/it] 36%|███▋ | 3179/8750 [1:17:24<9:05:46, 5.88s/it] 36%|███▋ | 
3179/8750 [1:17:27<9:05:47, 5.88s/it] {'loss': 0.477, 'learning_rate': 1.4718557525658272e-05, 'epoch': 0.36} + 36%|███▋ | 3179/8750 [1:17:27<9:05:47, 5.88s/it] {'loss': 0.477, 'learning_rate': 1.4718557525658272e-05, 'epoch': 0.36} + 36%|███▋ | 3179/8750 [1:17:24<9:05:46, 5.88s/it] 36%|███▋ | 3180/8750 [1:17:32<8:59:49, 5.82s/it] 36%|███▋ | 3180/8750 [1:17:29<8:59:50, 5.82s/it] {'loss': 0.4798, 'learning_rate': 1.471529354498774e-05, 'epoch': 0.36} + 36%|███▋ | 3180/8750 [1:17:32<8:59:49, 5.82s/it] {'loss': 0.4798, 'learning_rate': 1.471529354498774e-05, 'epoch': 0.36} + 36%|███▋ | 3180/8750 [1:17:29<8:59:50, 5.82s/it] 36%|███▋ | 3181/8750 [1:17:39<9:10:13, 5.93s/it] 36%|███▋ | 3181/8750 [1:17:36<9:10:13, 5.93s/it] {'loss': 0.4852, 'learning_rate': 1.471202891821672e-05, 'epoch': 0.36} + 36%|███▋ | 3181/8750 [1:17:39<9:10:13, 5.93s/it] {'loss': 0.4852, 'learning_rate': 1.471202891821672e-05, 'epoch': 0.36} + 36%|███▋ | 3181/8750 [1:17:36<9:10:13, 5.93s/it] 36%|███▋ | 3182/8750 [1:17:41<9:05:18, 5.88s/it] 36%|███▋ | 3182/8750 [1:17:44<9:05:19, 5.88s/it] {'loss': 0.4692, 'learning_rate': 1.4708763645792531e-05, 'epoch': 0.36} + 36%|███▋ | 3182/8750 [1:17:44<9:05:19, 5.88s/it] {'loss': 0.4692, 'learning_rate': 1.4708763645792531e-05, 'epoch': 0.36} + 36%|███▋ | 3182/8750 [1:17:41<9:05:18, 5.88s/it] 36%|███▋ | 3183/8750 [1:17:50<9:02:59, 5.85s/it] 36%|███▋ | 3183/8750 [1:17:47<9:03:00, 5.85s/it] {'loss': 0.4714, 'learning_rate': 1.4705497728162602e-05, 'epoch': 0.36} + 36%|███▋ | 3183/8750 [1:17:50<9:02:59, 5.85s/it] {'loss': 0.4714, 'learning_rate': 1.4705497728162602e-05, 'epoch': 0.36} + 36%|███▋ | 3183/8750 [1:17:47<9:03:00, 5.85s/it] 36%|███▋ | 3184/8750 [1:17:53<9:04:55, 5.87s/it] 36%|███▋ | 3184/8750 [1:17:56<9:04:56, 5.87s/it] {'loss': 0.4565, 'learning_rate': 1.4702231165774423e-05, 'epoch': 0.36} + 36%|███▋ | 3184/8750 [1:17:56<9:04:56, 5.87s/it] {'loss': 0.4565, 'learning_rate': 1.4702231165774423e-05, 'epoch': 0.36} + 36%|███▋ | 3184/8750 [1:17:53<9:04:55, 
5.87s/it] 36%|███▋ | 3185/8750 [1:18:02<9:04:33, 5.87s/it] 36%|███▋ | 3185/8750 [1:17:59<9:04:33, 5.87s/it] {'loss': 0.494, 'learning_rate': 1.4698963959075592e-05, 'epoch': 0.36} + 36%|███▋ | 3185/8750 [1:18:02<9:04:33, 5.87s/it] {'loss': 0.494, 'learning_rate': 1.4698963959075592e-05, 'epoch': 0.36} + 36%|███▋ | 3185/8750 [1:17:59<9:04:33, 5.87s/it] 36%|███▋ | 3186/8750 [1:18:05<8:57:50, 5.80s/it] 36%|███▋ | 3186/8750 [1:18:07<8:57:50, 5.80s/it] {'loss': 0.458, 'learning_rate': 1.469569610851379e-05, 'epoch': 0.36} + 36%|███▋ | 3186/8750 [1:18:07<8:57:50, 5.80s/it] {'loss': 0.458, 'learning_rate': 1.469569610851379e-05, 'epoch': 0.36} + 36%|███▋ | 3186/8750 [1:18:05<8:57:50, 5.80s/it] 36%|███▋ | 3187/8750 [1:18:10<8:55:05, 5.77s/it] 36%|███▋ | 3187/8750 [1:18:13<8:55:05, 5.77s/it] {'loss': 0.4748, 'learning_rate': 1.4692427614536783e-05, 'epoch': 0.36} + 36%|███▋ | 3187/8750 [1:18:13<8:55:05, 5.77s/it] {'loss': 0.4748, 'learning_rate': 1.4692427614536783e-05, 'epoch': 0.36} + 36%|███▋ | 3187/8750 [1:18:10<8:55:05, 5.77s/it] 36%|███▋ | 3188/8750 [1:18:19<9:01:45, 5.84s/it] 36%|███▋ | 3188/8750 [1:18:16<9:01:46, 5.84s/it] {'loss': 0.4661, 'learning_rate': 1.4689158477592433e-05, 'epoch': 0.36} + 36%|███▋ | 3188/8750 [1:18:19<9:01:45, 5.84s/it] {'loss': 0.4661, 'learning_rate': 1.4689158477592433e-05, 'epoch': 0.36} + 36%|███▋ | 3188/8750 [1:18:16<9:01:46, 5.84s/it] 36%|███▋ | 3189/8750 [1:18:25<9:00:30, 5.83s/it] 36%|███▋ | 3189/8750 [1:18:22<9:00:31, 5.83s/it] {'loss': 0.4881, 'learning_rate': 1.4685888698128677e-05, 'epoch': 0.36} + 36%|███▋ | 3189/8750 [1:18:25<9:00:30, 5.83s/it] {'loss': 0.4881, 'learning_rate': 1.4685888698128677e-05, 'epoch': 0.36} + 36%|███▋ | 3189/8750 [1:18:22<9:00:31, 5.83s/it] 36%|███▋ | 3190/8750 [1:18:31<8:57:48, 5.80s/it] 36%|███▋ | 3190/8750 [1:18:28<8:57:49, 5.80s/it] {'loss': 0.4749, 'learning_rate': 1.468261827659355e-05, 'epoch': 0.36} + 36%|███▋ | 3190/8750 [1:18:31<8:57:48, 5.80s/it] {'loss': 0.4749, 'learning_rate': 
1.468261827659355e-05, 'epoch': 0.36} + 36%|███▋ | 3190/8750 [1:18:28<8:57:49, 5.80s/it] 36%|███▋ | 3191/8750 [1:18:37<9:00:54, 5.84s/it] 36%|███▋ | 3191/8750 [1:18:34<9:00:54, 5.84s/it] {'loss': 0.4825, 'learning_rate': 1.4679347213435176e-05, 'epoch': 0.36} + 36%|███▋ | 3191/8750 [1:18:37<9:00:54, 5.84s/it] {'loss': 0.4825, 'learning_rate': 1.4679347213435176e-05, 'epoch': 0.36} + 36%|███▋ | 3191/8750 [1:18:34<9:00:54, 5.84s/it] 36%|███▋ | 3192/8750 [1:18:43<9:10:02, 5.94s/it] 36%|███▋ | 3192/8750 [1:18:40<9:10:02, 5.94s/it] {'loss': 0.4861, 'learning_rate': 1.4676075509101763e-05, 'epoch': 0.36} + 36%|███▋ | 3192/8750 [1:18:43<9:10:02, 5.94s/it] {'loss': 0.4861, 'learning_rate': 1.4676075509101763e-05, 'epoch': 0.36} + 36%|███▋ | 3192/8750 [1:18:40<9:10:02, 5.94s/it] 36%|███▋ | 3193/8750 [1:18:49<9:06:36, 5.90s/it] 36%|███▋ | 3193/8750 [1:18:46<9:06:36, 5.90s/it] {'loss': 0.4816, 'learning_rate': 1.4672803164041604e-05, 'epoch': 0.36} + 36%|███▋ | 3193/8750 [1:18:49<9:06:36, 5.90s/it] {'loss': 0.4816, 'learning_rate': 1.4672803164041604e-05, 'epoch': 0.36} + 36%|███▋ | 3193/8750 [1:18:46<9:06:36, 5.90s/it] 37%|███▋ | 3194/8750 [1:18:55<9:08:38, 5.92s/it] 37%|███▋ | 3194/8750 [1:18:52<9:08:39, 5.92s/it] {'loss': 0.4642, 'learning_rate': 1.4669530178703089e-05, 'epoch': 0.37} + 37%|███▋ | 3194/8750 [1:18:55<9:08:38, 5.92s/it] {'loss': 0.4642, 'learning_rate': 1.4669530178703089e-05, 'epoch': 0.37} + 37%|███▋ | 3194/8750 [1:18:52<9:08:39, 5.92s/it] 37%|███▋ | 3195/8750 [1:19:00<9:01:51, 5.85s/it] 37%|███▋ | 3195/8750 [1:18:57<9:01:51, 5.85s/it] {'loss': 0.4829, 'learning_rate': 1.4666256553534681e-05, 'epoch': 0.37} + 37%|███▋ | 3195/8750 [1:19:00<9:01:51, 5.85s/it] {'loss': 0.4829, 'learning_rate': 1.4666256553534681e-05, 'epoch': 0.37} + 37%|███▋ | 3195/8750 [1:18:57<9:01:51, 5.85s/it] 37%|███▋ | 3196/8750 [1:19:06<9:00:56, 5.84s/it] 37%|███▋ | 3196/8750 [1:19:03<9:00:56, 5.84s/it] {'loss': 0.4793, 'learning_rate': 1.466298228898495e-05, 'epoch': 0.37} + 37%|███▋ 
| 3196/8750 [1:19:06<9:00:56, 5.84s/it] {'loss': 0.4793, 'learning_rate': 1.466298228898495e-05, 'epoch': 0.37} + 37%|███▋ | 3196/8750 [1:19:03<9:00:56, 5.84s/it] 37%|███▋ | 3197/8750 [1:19:12<9:03:51, 5.88s/it] 37%|███▋ | 3197/8750 [1:19:09<9:03:51, 5.88s/it] {'loss': 0.4768, 'learning_rate': 1.465970738550254e-05, 'epoch': 0.37} + 37%|███▋ | 3197/8750 [1:19:12<9:03:51, 5.88s/it] {'loss': 0.4768, 'learning_rate': 1.465970738550254e-05, 'epoch': 0.37} + 37%|███▋ | 3197/8750 [1:19:09<9:03:51, 5.88s/it] 37%|███▋ | 3198/8750 [1:19:18<9:01:03, 5.85s/it] 37%|███▋ | 3198/8750 [1:19:15<9:01:04, 5.85s/it] {'loss': 0.4439, 'learning_rate': 1.4656431843536182e-05, 'epoch': 0.37} + 37%|███▋ | 3198/8750 [1:19:18<9:01:03, 5.85s/it] {'loss': 0.4439, 'learning_rate': 1.4656431843536182e-05, 'epoch': 0.37} + 37%|███▋ | 3198/8750 [1:19:15<9:01:04, 5.85s/it] 37%|███▋ | 3199/8750 [1:19:21<9:00:06, 5.84s/it] 37%|███▋ | 3199/8750 [1:19:24<9:00:07, 5.84s/it] {'loss': 0.4805, 'learning_rate': 1.4653155663534702e-05, 'epoch': 0.37} + 37%|███▋ | 3199/8750 [1:19:24<9:00:07, 5.84s/it] {'loss': 0.4805, 'learning_rate': 1.4653155663534702e-05, 'epoch': 0.37} + 37%|███▋ | 3199/8750 [1:19:21<9:00:06, 5.84s/it]14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +0 37%|███▋ | 3200/8750 [1:19:29<8:56:17, 5.80s/it]6 AutoResumeHook: Checking whether to suspend... 
+ AutoResumeHook: Checking whether to suspend... + 37%|███▋ | 3200/8750 [1:19:26<8:56:18, 5.80s/it]13 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4833, 'learning_rate': 1.464987884594701e-05, 'epoch': 0.37} + 37%|███▋ | 3200/8750 [1:19:29<8:56:17, 5.80s/it] {'loss': 0.4833, 'learning_rate': 1.464987884594701e-05, 'epoch': 0.37} + 37%|███▋ | 3200/8750 [1:19:26<8:56:18, 5.80s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 37%|███▋ | 3201/8750 [1:19:50<17:07:49, 11.11s/it] 37%|███▋ | 3201/8750 [1:19:53<17:07:50, 11.11s/it] {'loss': 0.4571, 'learning_rate': 1.4646601391222102e-05, 'epoch': 0.37} + 37%|███▋ | 3201/8750 [1:19:53<17:07:50, 11.11s/it] {'loss': 0.4571, 'learning_rate': 1.4646601391222102e-05, 'epoch': 0.37} + 37%|███▋ | 3201/8750 [1:19:50<17:07:49, 11.11s/it] 37%|███▋ | 3202/8750 [1:19:56<14:41:47, 9.54s/it] 37%|███▋ | 3202/8750 [1:19:59<14:41:48, 9.54s/it] {'loss': 0.4729, 'learning_rate': 1.464332329980906e-05, 'epoch': 0.37} + 37%|███▋ | 3202/8750 [1:19:59<14:41:48, 9.54s/it] {'loss': 0.4729, 'learning_rate': 1.464332329980906e-05, 'epoch': 0.37} + 37%|███▋ | 3202/8750 [1:19:56<14:41:47, 9.54s/it] 37%|███▋ | 3203/8750 [1:20:01<12:52:40, 8.36s/it] 37%|███▋ | 3203/8750 [1:20:04<12:52:40, 8.36s/it] {'loss': 0.4818, 'learning_rate': 1.4640044572157062e-05, 'epoch': 0.37} + 37%|███▋ | 3203/8750 [1:20:04<12:52:40, 8.36s/it] {'loss': 0.4818, 'learning_rate': 1.4640044572157062e-05, 'epoch': 0.37} + 37%|███▋ | 3203/8750 [1:20:01<12:52:40, 8.36s/it] 37%|███▋ | 3204/8750 [1:20:07<11:43:56, 7.62s/it] 37%|███▋ | 3204/8750 [1:20:10<11:43:57, 7.62s/it] {'loss': 0.4877, 'learning_rate': 1.4636765208715358e-05, 'epoch': 0.37} + 37%|███▋ | 3204/8750 [1:20:10<11:43:57, 7.62s/it] {'loss': 0.4877, 'learning_rate': 1.4636765208715358e-05, 'epoch': 0.37} + 37%|███▋ | 3204/8750 [1:20:07<11:43:56, 7.62s/it] 37%|███▋ | 3205/8750 [1:20:13<10:52:50, 7.06s/it] 37%|███▋ | 3205/8750 [1:20:16<10:52:50, 7.06s/it] {'loss': 0.4683, 'learning_rate': 1.4633485209933305e-05, 'epoch': 0.37} + 37%|███▋ | 3205/8750 [1:20:16<10:52:50, 7.06s/it] {'loss': 0.4683, 'learning_rate': 1.4633485209933305e-05, 'epoch': 0.37} + 37%|███▋ | 3205/8750 [1:20:13<10:52:50, 7.06s/it] 37%|███▋ | 3206/8750 [1:20:19<10:22:05, 6.73s/it] 37%|███▋ | 3206/8750 [1:20:22<10:22:05, 6.73s/it] {'loss': 0.4616, 'learning_rate': 1.4630204576260328e-05, 'epoch': 0.37} + 37%|███▋ | 3206/8750 [1:20:22<10:22:05, 6.73s/it] 
{'loss': 0.4616, 'learning_rate': 1.4630204576260328e-05, 'epoch': 0.37} + 37%|███▋ | 3206/8750 [1:20:19<10:22:05, 6.73s/it] 37%|███▋ | 3207/8750 [1:20:25<9:54:57, 6.44s/it] 37%|███▋ | 3207/8750 [1:20:28<9:54:57, 6.44s/it] {'loss': 0.4905, 'learning_rate': 1.4626923308145948e-05, 'epoch': 0.37} + 37%|███▋ | 3207/8750 [1:20:28<9:54:57, 6.44s/it] {'loss': 0.4905, 'learning_rate': 1.4626923308145948e-05, 'epoch': 0.37} + 37%|███▋ | 3207/8750 [1:20:25<9:54:57, 6.44s/it] 37%|███▋ | 3208/8750 [1:20:31<9:35:36, 6.23s/it] 37%|███▋ | 3208/8750 [1:20:33<9:35:36, 6.23s/it] {'loss': 0.4609, 'learning_rate': 1.4623641406039776e-05, 'epoch': 0.37} + 37%|███▋ | 3208/8750 [1:20:33<9:35:36, 6.23s/it] {'loss': 0.4609, 'learning_rate': 1.4623641406039776e-05, 'epoch': 0.37} + 37%|███▋ | 3208/8750 [1:20:31<9:35:36, 6.23s/it] 37%|███▋ | 3209/8750 [1:20:36<9:24:40, 6.11s/it] 37%|███▋ | 3209/8750 [1:20:39<9:24:40, 6.11s/it] {'loss': 0.4601, 'learning_rate': 1.46203588703915e-05, 'epoch': 0.37} + 37%|███▋ | 3209/8750 [1:20:39<9:24:40, 6.11s/it] {'loss': 0.4601, 'learning_rate': 1.46203588703915e-05, 'epoch': 0.37} + 37%|███▋ | 3209/8750 [1:20:36<9:24:40, 6.11s/it] 37%|███▋ | 3210/8750 [1:20:42<9:14:30, 6.01s/it] 37%|███▋ | 3210/8750 [1:20:45<9:14:30, 6.01s/it] {'loss': 0.4575, 'learning_rate': 1.4617075701650907e-05, 'epoch': 0.37} + 37%|███▋ | 3210/8750 [1:20:45<9:14:30, 6.01s/it] {'loss': 0.4575, 'learning_rate': 1.4617075701650907e-05, 'epoch': 0.37} + 37%|███▋ | 3210/8750 [1:20:42<9:14:30, 6.01s/it] 37%|███▋ | 3211/8750 [1:20:48<9:03:05, 5.88s/it] 37%|███▋ | 3211/8750 [1:20:51<9:03:05, 5.88s/it] {'loss': 0.4594, 'learning_rate': 1.461379190026786e-05, 'epoch': 0.37} + 37%|███▋ | 3211/8750 [1:20:51<9:03:05, 5.88s/it] {'loss': 0.4594, 'learning_rate': 1.461379190026786e-05, 'epoch': 0.37} + 37%|███▋ | 3211/8750 [1:20:48<9:03:05, 5.88s/it] 37%|███▋ | 3212/8750 [1:20:53<8:58:28, 5.83s/it] 37%|███▋ | 3212/8750 [1:20:56<8:58:28, 5.83s/it] {'loss': 0.4681, 'learning_rate': 
1.4610507466692312e-05, 'epoch': 0.37} + 37%|███▋ | 3212/8750 [1:20:56<8:58:28, 5.83s/it] {'loss': 0.4681, 'learning_rate': 1.4610507466692312e-05, 'epoch': 0.37} + 37%|███▋ | 3212/8750 [1:20:53<8:58:28, 5.83s/it] 37%|███▋ | 3213/8750 [1:20:59<8:55:24, 5.80s/it] 37%|███▋ | 3213/8750 [1:21:02<8:55:25, 5.80s/it] {'loss': 0.48, 'learning_rate': 1.460722240137431e-05, 'epoch': 0.37} + 37%|███▋ | 3213/8750 [1:21:02<8:55:25, 5.80s/it] {'loss': 0.48, 'learning_rate': 1.460722240137431e-05, 'epoch': 0.37} + 37%|███▋ | 3213/8750 [1:20:59<8:55:24, 5.80s/it] 37%|███▋ | 3214/8750 [1:21:05<8:52:25, 5.77s/it] 37%|███▋ | 3214/8750 [1:21:08<8:52:25, 5.77s/it] {'loss': 0.4612, 'learning_rate': 1.4603936704763975e-05, 'epoch': 0.37} + 37%|███▋ | 3214/8750 [1:21:08<8:52:25, 5.77s/it] {'loss': 0.4612, 'learning_rate': 1.4603936704763975e-05, 'epoch': 0.37} + 37%|███▋ | 3214/8750 [1:21:05<8:52:25, 5.77s/it] 37%|███▋ | 3215/8750 [1:21:11<8:50:01, 5.75s/it] 37%|███▋ | 3215/8750 [1:21:14<8:50:01, 5.75s/it] {'loss': 0.4754, 'learning_rate': 1.4600650377311523e-05, 'epoch': 0.37} + 37%|███▋ | 3215/8750 [1:21:14<8:50:01, 5.75s/it] {'loss': 0.4754, 'learning_rate': 1.4600650377311523e-05, 'epoch': 0.37} + 37%|███▋ | 3215/8750 [1:21:11<8:50:01, 5.75s/it] 37%|███▋ | 3216/8750 [1:21:16<8:48:31, 5.73s/it] 37%|███▋ | 3216/8750 [1:21:19<8:48:31, 5.73s/it] {'loss': 0.4533, 'learning_rate': 1.4597363419467257e-05, 'epoch': 0.37} + 37%|███▋ | 3216/8750 [1:21:19<8:48:31, 5.73s/it] {'loss': 0.4533, 'learning_rate': 1.4597363419467257e-05, 'epoch': 0.37} + 37%|███▋ | 3216/8750 [1:21:16<8:48:31, 5.73s/it] 37%|███▋ | 3217/8750 [1:21:22<8:54:13, 5.79s/it] 37%|███▋ | 3217/8750 [1:21:25<8:54:13, 5.79s/it] {'loss': 0.4847, 'learning_rate': 1.4594075831681557e-05, 'epoch': 0.37} + 37%|███▋ | 3217/8750 [1:21:25<8:54:13, 5.79s/it] {'loss': 0.4847, 'learning_rate': 1.4594075831681557e-05, 'epoch': 0.37} + 37%|███▋ | 3217/8750 [1:21:22<8:54:13, 5.79s/it] 37%|███▋ | 3218/8750 [1:21:28<8:55:36, 5.81s/it] 37%|███▋ | 
3218/8750 [1:21:31<8:55:36, 5.81s/it] {'loss': 0.4619, 'learning_rate': 1.4590787614404902e-05, 'epoch': 0.37} + 37%|███▋ | 3218/8750 [1:21:31<8:55:36, 5.81s/it] {'loss': 0.4619, 'learning_rate': 1.4590787614404902e-05, 'epoch': 0.37} + 37%|███▋ | 3218/8750 [1:21:28<8:55:36, 5.81s/it] 37%|███▋ | 3219/8750 [1:21:34<8:53:36, 5.79s/it] 37%|███▋ | 3219/8750 [1:21:37<8:53:36, 5.79s/it] {'loss': 0.4724, 'learning_rate': 1.4587498768087849e-05, 'epoch': 0.37} + 37%|███▋ | 3219/8750 [1:21:37<8:53:36, 5.79s/it] {'loss': 0.4724, 'learning_rate': 1.4587498768087849e-05, 'epoch': 0.37} + 37%|███▋ | 3219/8750 [1:21:34<8:53:36, 5.79s/it] 37%|███▋ | 3220/8750 [1:21:40<8:53:16, 5.79s/it] 37%|███▋ | 3220/8750 [1:21:43<8:53:16, 5.79s/it] {'loss': 0.4591, 'learning_rate': 1.4584209293181044e-05, 'epoch': 0.37} + 37%|███▋ | 3220/8750 [1:21:43<8:53:16, 5.79s/it] {'loss': 0.4591, 'learning_rate': 1.4584209293181044e-05, 'epoch': 0.37} + 37%|███▋ | 3220/8750 [1:21:40<8:53:16, 5.79s/it] 37%|███▋ | 3221/8750 [1:21:45<8:51:57, 5.77s/it] 37%|███▋ | 3221/8750 [1:21:48<8:51:57, 5.77s/it] {'loss': 0.4992, 'learning_rate': 1.4580919190135219e-05, 'epoch': 0.37} + 37%|███▋ | 3221/8750 [1:21:48<8:51:57, 5.77s/it] {'loss': 0.4992, 'learning_rate': 1.4580919190135219e-05, 'epoch': 0.37} + 37%|███▋ | 3221/8750 [1:21:45<8:51:57, 5.77s/it] 37%|███▋ | 3222/8750 [1:21:51<8:55:29, 5.81s/it] 37%|███▋ | 3222/8750 [1:21:54<8:55:29, 5.81s/it] {'loss': 0.4668, 'learning_rate': 1.4577628459401188e-05, 'epoch': 0.37} + 37%|███▋ | 3222/8750 [1:21:54<8:55:29, 5.81s/it] {'loss': 0.4668, 'learning_rate': 1.4577628459401188e-05, 'epoch': 0.37} + 37%|███▋ | 3222/8750 [1:21:51<8:55:29, 5.81s/it] 37%|███▋ | 3223/8750 [1:21:57<8:51:03, 5.77s/it] 37%|███▋ | 3223/8750 [1:22:00<8:51:03, 5.77s/it] {'loss': 0.4642, 'learning_rate': 1.457433710142986e-05, 'epoch': 0.37} + 37%|███▋ | 3223/8750 [1:22:00<8:51:03, 5.77s/it] {'loss': 0.4642, 'learning_rate': 1.457433710142986e-05, 'epoch': 0.37} + 37%|███▋ | 3223/8750 
[1:21:57<8:51:03, 5.77s/it] 37%|███▋ | 3224/8750 [1:22:03<8:46:37, 5.72s/it] 37%|███▋ | 3224/8750 [1:22:05<8:46:36, 5.72s/it] {'loss': 0.4759, 'learning_rate': 1.4571045116672219e-05, 'epoch': 0.37} + 37%|███▋ | 3224/8750 [1:22:05<8:46:36, 5.72s/it] {'loss': 0.4759, 'learning_rate': 1.4571045116672219e-05, 'epoch': 0.37} + 37%|███▋ | 3224/8750 [1:22:03<8:46:37, 5.72s/it] 37%|███▋ | 3225/8750 [1:22:08<8:49:27, 5.75s/it] 37%|███▋ | 3225/8750 [1:22:11<8:49:27, 5.75s/it] {'loss': 0.4752, 'learning_rate': 1.4567752505579345e-05, 'epoch': 0.37} + 37%|███▋ | 3225/8750 [1:22:11<8:49:27, 5.75s/it] {'loss': 0.4752, 'learning_rate': 1.4567752505579345e-05, 'epoch': 0.37} + 37%|███▋ | 3225/8750 [1:22:08<8:49:27, 5.75s/it] 37%|███▋ | 3226/8750 [1:22:14<8:48:01, 5.74s/it] 37%|███▋ | 3226/8750 [1:22:17<8:48:01, 5.74s/it] {'loss': 0.4603, 'learning_rate': 1.4564459268602396e-05, 'epoch': 0.37} + 37%|███▋ | 3226/8750 [1:22:17<8:48:01, 5.74s/it] {'loss': 0.4603, 'learning_rate': 1.4564459268602396e-05, 'epoch': 0.37} + 37%|███▋ | 3226/8750 [1:22:14<8:48:01, 5.74s/it] 37%|███▋ | 3227/8750 [1:22:20<8:46:47, 5.72s/it] 37%|███▋ | 3227/8750 [1:22:23<8:46:47, 5.72s/it] {'loss': 0.4835, 'learning_rate': 1.4561165406192622e-05, 'epoch': 0.37} + 37%|███▋ | 3227/8750 [1:22:23<8:46:47, 5.72s/it] {'loss': 0.4835, 'learning_rate': 1.4561165406192622e-05, 'epoch': 0.37} + 37%|███▋ | 3227/8750 [1:22:20<8:46:47, 5.72s/it] 37%|███▋ | 3228/8750 [1:22:26<8:50:24, 5.76s/it] 37%|███▋ | 3228/8750 [1:22:28<8:50:24, 5.76s/it] {'loss': 0.466, 'learning_rate': 1.455787091880135e-05, 'epoch': 0.37} + 37%|███▋ | 3228/8750 [1:22:29<8:50:24, 5.76s/it] {'loss': 0.466, 'learning_rate': 1.455787091880135e-05, 'epoch': 0.37} + 37%|███▋ | 3228/8750 [1:22:26<8:50:24, 5.76s/it] 37%|███▋ | 3229/8750 [1:22:31<8:50:13, 5.76s/it] 37%|███▋ | 3229/8750 [1:22:34<8:50:13, 5.76s/it] {'loss': 0.4776, 'learning_rate': 1.4554575806880005e-05, 'epoch': 0.37} + 37%|███▋ | 3229/8750 [1:22:34<8:50:13, 5.76s/it] {'loss': 0.4776, 
'learning_rate': 1.4554575806880005e-05, 'epoch': 0.37} + 37%|███▋ | 3229/8750 [1:22:31<8:50:13, 5.76s/it] 37%|███▋ | 3230/8750 [1:22:37<8:50:16, 5.76s/it] 37%|███▋ | 3230/8750 [1:22:40<8:50:16, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.4551280070880089e-05, 'epoch': 0.37} + 37%|███▋ | 3230/8750 [1:22:40<8:50:16, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.4551280070880089e-05, 'epoch': 0.37} + 37%|███▋ | 3230/8750 [1:22:37<8:50:16, 5.76s/it] 37%|███▋ | 3231/8750 [1:22:43<8:46:53, 5.73s/it] 37%|███▋ | 3231/8750 [1:22:46<8:46:53, 5.73s/it] {'loss': 0.4812, 'learning_rate': 1.454798371125319e-05, 'epoch': 0.37} + 37%|███▋ | 3231/8750 [1:22:46<8:46:53, 5.73s/it] {'loss': 0.4812, 'learning_rate': 1.454798371125319e-05, 'epoch': 0.37} + 37%|███▋ | 3231/8750 [1:22:43<8:46:53, 5.73s/it] 37%|███▋ | 3232/8750 [1:22:49<8:47:29, 5.74s/it] 37%|███▋ | 3232/8750 [1:22:51<8:47:29, 5.74s/it] {'loss': 0.4703, 'learning_rate': 1.4544686728450982e-05, 'epoch': 0.37} + 37%|███▋ | 3232/8750 [1:22:51<8:47:29, 5.74s/it] {'loss': 0.4703, 'learning_rate': 1.4544686728450982e-05, 'epoch': 0.37} + 37%|███▋ | 3232/8750 [1:22:49<8:47:29, 5.74s/it] 37%|███▋ | 3233/8750 [1:22:55<8:55:34, 5.82s/it] 37%|███▋ | 3233/8750 [1:22:57<8:55:34, 5.82s/it] {'loss': 0.4607, 'learning_rate': 1.4541389122925229e-05, 'epoch': 0.37} + 37%|███▋ | 3233/8750 [1:22:57<8:55:34, 5.82s/it] {'loss': 0.4607, 'learning_rate': 1.4541389122925229e-05, 'epoch': 0.37} + 37%|███▋ | 3233/8750 [1:22:55<8:55:34, 5.82s/it] 37%|███▋ | 3234/8750 [1:23:00<8:56:59, 5.84s/it] 37%|███▋ | 3234/8750 [1:23:03<8:56:59, 5.84s/it] {'loss': 0.466, 'learning_rate': 1.4538090895127774e-05, 'epoch': 0.37} + 37%|███▋ | 3234/8750 [1:23:03<8:56:59, 5.84s/it] {'loss': 0.466, 'learning_rate': 1.4538090895127774e-05, 'epoch': 0.37} + 37%|███▋ | 3234/8750 [1:23:00<8:56:59, 5.84s/it] 37%|███▋ | 3235/8750 [1:23:06<9:01:37, 5.89s/it] 37%|███▋ | 3235/8750 [1:23:09<9:01:37, 5.89s/it] {'loss': 0.4682, 'learning_rate': 1.4534792045510548e-05, 'epoch': 
0.37} + 37%|███▋ | 3235/8750 [1:23:09<9:01:37, 5.89s/it] {'loss': 0.4682, 'learning_rate': 1.4534792045510548e-05, 'epoch': 0.37} + 37%|███▋ | 3235/8750 [1:23:06<9:01:37, 5.89s/it] 37%|███▋ | 3236/8750 [1:23:12<9:03:40, 5.92s/it] 37%|███▋ | 3236/8750 [1:23:15<9:03:40, 5.92s/it] {'loss': 0.4742, 'learning_rate': 1.453149257452557e-05, 'epoch': 0.37} + 37%|███▋ | 3236/8750 [1:23:15<9:03:40, 5.92s/it] {'loss': 0.4742, 'learning_rate': 1.453149257452557e-05, 'epoch': 0.37} + 37%|███▋ | 3236/8750 [1:23:12<9:03:40, 5.92s/it] 37%|███▋ | 3237/8750 [1:23:18<8:56:49, 5.84s/it] 37%|███▋ | 3237/8750 [1:23:21<8:56:48, 5.84s/it] {'loss': 0.4683, 'learning_rate': 1.4528192482624932e-05, 'epoch': 0.37} + 37%|███▋ | 3237/8750 [1:23:21<8:56:48, 5.84s/it] {'loss': 0.4683, 'learning_rate': 1.4528192482624932e-05, 'epoch': 0.37} + 37%|███▋ | 3237/8750 [1:23:18<8:56:49, 5.84s/it] 37%|███▋ | 3238/8750 [1:23:24<8:53:27, 5.81s/it] 37%|███▋ | 3238/8750 [1:23:27<8:53:27, 5.81s/it] {'loss': 0.4715, 'learning_rate': 1.4524891770260831e-05, 'epoch': 0.37} + 37%|███▋ | 3238/8750 [1:23:27<8:53:27, 5.81s/it] {'loss': 0.4715, 'learning_rate': 1.4524891770260831e-05, 'epoch': 0.37} + 37%|███▋ | 3238/8750 [1:23:24<8:53:27, 5.81s/it] 37%|███▋ | 3239/8750 [1:23:30<9:02:03, 5.90s/it] 37%|███▋ | 3239/8750 [1:23:33<9:02:03, 5.90s/it] {'loss': 0.4758, 'learning_rate': 1.4521590437885533e-05, 'epoch': 0.37} + 37%|███▋ | 3239/8750 [1:23:33<9:02:03, 5.90s/it] {'loss': 0.4758, 'learning_rate': 1.4521590437885533e-05, 'epoch': 0.37} + 37%|███▋ | 3239/8750 [1:23:30<9:02:03, 5.90s/it] 37%|███▋ | 3240/8750 [1:23:36<8:59:03, 5.87s/it] 37%|███▋ | 3240/8750 [1:23:39<8:59:03, 5.87s/it] {'loss': 0.4706, 'learning_rate': 1.4518288485951398e-05, 'epoch': 0.37} + 37%|███▋ | 3240/8750 [1:23:39<8:59:03, 5.87s/it] {'loss': 0.4706, 'learning_rate': 1.4518288485951398e-05, 'epoch': 0.37} + 37%|███▋ | 3240/8750 [1:23:36<8:59:03, 5.87s/it] 37%|███▋ | 3241/8750 [1:23:41<8:49:10, 5.76s/it] 37%|███▋ | 3241/8750 [1:23:44<8:49:10, 
5.76s/it] {'loss': 0.4904, 'learning_rate': 1.4514985914910862e-05, 'epoch': 0.37} + 37%|███▋ | 3241/8750 [1:23:44<8:49:10, 5.76s/it] {'loss': 0.4904, 'learning_rate': 1.4514985914910862e-05, 'epoch': 0.37} + 37%|███▋ | 3241/8750 [1:23:41<8:49:10, 5.76s/it] 37%|███▋ | 3242/8750 [1:23:47<8:49:25, 5.77s/it] 37%|███▋ | 3242/8750 [1:23:50<8:49:25, 5.77s/it] {'loss': 0.4547, 'learning_rate': 1.451168272521645e-05, 'epoch': 0.37} + 37%|███▋ | 3242/8750 [1:23:50<8:49:25, 5.77s/it] {'loss': 0.4547, 'learning_rate': 1.451168272521645e-05, 'epoch': 0.37} + 37%|███▋ | 3242/8750 [1:23:47<8:49:25, 5.77s/it] 37%|███▋ | 3243/8750 [1:23:53<8:47:55, 5.75s/it] 37%|███▋ | 3243/8750 [1:23:56<8:47:55, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.450837891732078e-05, 'epoch': 0.37} + 37%|███▋ | 3243/8750 [1:23:56<8:47:55, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.450837891732078e-05, 'epoch': 0.37} + 37%|███▋ | 3243/8750 [1:23:53<8:47:55, 5.75s/it] 37%|███▋ | 3244/8750 [1:23:59<8:57:36, 5.86s/it] 37%|███▋ | 3244/8750 [1:24:02<8:57:35, 5.86s/it] {'loss': 0.457, 'learning_rate': 1.4505074491676542e-05, 'epoch': 0.37} + 37%|███▋ | 3244/8750 [1:24:02<8:57:35, 5.86s/it] {'loss': 0.457, 'learning_rate': 1.4505074491676542e-05, 'epoch': 0.37} + 37%|███▋ | 3244/8750 [1:23:59<8:57:36, 5.86s/it] 37%|███▋ | 3245/8750 [1:24:05<8:56:39, 5.85s/it] 37%|███▋ | 3245/8750 [1:24:08<8:56:39, 5.85s/it] {'loss': 0.4586, 'learning_rate': 1.450176944873652e-05, 'epoch': 0.37} + 37%|███▋ | 3245/8750 [1:24:08<8:56:39, 5.85s/it] {'loss': 0.4586, 'learning_rate': 1.450176944873652e-05, 'epoch': 0.37} + 37%|███▋ | 3245/8750 [1:24:05<8:56:39, 5.85s/it] 37%|███▋ | 3246/8750 [1:24:10<8:52:44, 5.81s/it] 37%|███▋ | 3246/8750 [1:24:13<8:52:43, 5.81s/it] {'loss': 0.4766, 'learning_rate': 1.4498463788953574e-05, 'epoch': 0.37} + 37%|███▋ | 3246/8750 [1:24:13<8:52:43, 5.81s/it] {'loss': 0.4766, 'learning_rate': 1.4498463788953574e-05, 'epoch': 0.37} + 37%|███▋ | 3246/8750 [1:24:10<8:52:44, 5.81s/it] 37%|███▋ | 
3247/8750 [1:24:16<8:51:47, 5.80s/it] 37%|███▋ | 3247/8750 [1:24:19<8:51:47, 5.80s/it] {'loss': 0.4627, 'learning_rate': 1.4495157512780655e-05, 'epoch': 0.37} + 37%|███▋ | 3247/8750 [1:24:19<8:51:47, 5.80s/it] {'loss': 0.4627, 'learning_rate': 1.4495157512780655e-05, 'epoch': 0.37} + 37%|███▋ | 3247/8750 [1:24:16<8:51:47, 5.80s/it] 37%|███▋ | 3248/8750 [1:24:22<8:59:38, 5.88s/it] 37%|███▋ | 3248/8750 [1:24:25<8:59:37, 5.88s/it] {'loss': 0.487, 'learning_rate': 1.4491850620670798e-05, 'epoch': 0.37} + 37%|███▋ | 3248/8750 [1:24:25<8:59:37, 5.88s/it] {'loss': 0.487, 'learning_rate': 1.4491850620670798e-05, 'epoch': 0.37} + 37%|███▋ | 3248/8750 [1:24:22<8:59:38, 5.88s/it] 37%|███▋ | 3249/8750 [1:24:28<8:54:06, 5.83s/it] 37%|███▋ | 3249/8750 [1:24:31<8:54:06, 5.83s/it] {'loss': 0.4664, 'learning_rate': 1.4488543113077121e-05, 'epoch': 0.37} + 37%|███▋ | 3249/8750 [1:24:31<8:54:06, 5.83s/it] {'loss': 0.4664, 'learning_rate': 1.4488543113077121e-05, 'epoch': 0.37} + 37%|███▋ | 3249/8750 [1:24:28<8:54:06, 5.83s/it]11 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 37%|███▋ | 3250/8750 [1:24:34<9:00:11, 5.89s/it]12 AutoResumeHook: Checking whether to suspend... + 37%|███▋ | 3250/8750 [1:24:37<9:00:11, 5.89s/it]9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4712, 'learning_rate': 1.4485234990452826e-05, 'epoch': 0.37} + 37%|███▋ | 3250/8750 [1:24:37<9:00:11, 5.89s/it] {'loss': 0.4712, 'learning_rate': 1.4485234990452826e-05, 'epoch': 0.37} + 37%|███▋ | 3250/8750 [1:24:34<9:00:11, 5.89s/it] 37%|███▋ | 3251/8750 [1:24:40<8:55:13, 5.84s/it] 37%|███▋ | 3251/8750 [1:24:43<8:55:13, 5.84s/it] {'loss': 0.4645, 'learning_rate': 1.4481926253251197e-05, 'epoch': 0.37} + 37%|███▋ | 3251/8750 [1:24:43<8:55:13, 5.84s/it] {'loss': 0.4645, 'learning_rate': 1.4481926253251197e-05, 'epoch': 0.37} + 37%|███▋ | 3251/8750 [1:24:40<8:55:13, 5.84s/it] 37%|███▋ | 3252/8750 [1:24:46<9:00:49, 5.90s/it] 37%|███▋ | 3252/8750 [1:24:49<9:00:49, 5.90s/it] {'loss': 0.4795, 'learning_rate': 1.4478616901925606e-05, 'epoch': 0.37} + 37%|███▋ | 3252/8750 [1:24:49<9:00:49, 5.90s/it] {'loss': 0.4795, 'learning_rate': 1.4478616901925606e-05, 'epoch': 0.37} + 37%|███▋ | 3252/8750 [1:24:46<9:00:49, 5.90s/it] 37%|███▋ | 3253/8750 [1:24:52<9:05:00, 5.95s/it] 37%|███▋ | 3253/8750 [1:24:55<9:05:00, 5.95s/it] {'loss': 0.4754, 'learning_rate': 1.4475306936929513e-05, 'epoch': 0.37} + 37%|███▋ | 3253/8750 [1:24:55<9:05:00, 5.95s/it] {'loss': 0.4754, 'learning_rate': 1.4475306936929513e-05, 'epoch': 0.37} + 37%|███▋ | 3253/8750 [1:24:52<9:05:00, 5.95s/it] 37%|███▋ | 3254/8750 [1:24:58<9:06:45, 5.97s/it] 37%|███▋ | 3254/8750 [1:25:01<9:06:45, 5.97s/it] {'loss': 0.4654, 'learning_rate': 1.4471996358716451e-05, 'epoch': 0.37} + 37%|███▋ | 3254/8750 [1:25:01<9:06:45, 5.97s/it] {'loss': 0.4654, 'learning_rate': 1.4471996358716451e-05, 'epoch': 0.37} + 37%|███▋ | 3254/8750 [1:24:58<9:06:45, 5.97s/it] 37%|███▋ | 3255/8750 [1:25:04<9:00:39, 5.90s/it] 37%|███▋ | 3255/8750 [1:25:06<9:00:39, 5.90s/it] {'loss': 0.4913, 'learning_rate': 1.4468685167740044e-05, 'epoch': 0.37} + 37%|███▋ | 3255/8750 [1:25:06<9:00:39, 5.90s/it] {'loss': 0.4913, 'learning_rate': 1.4468685167740044e-05, 'epoch': 0.37} + 37%|███▋ | 3255/8750 [1:25:04<9:00:39, 5.90s/it] 37%|███▋ | 
3256/8750 [1:25:10<9:03:53, 5.94s/it] 37%|███▋ | 3256/8750 [1:25:12<9:03:52, 5.94s/it] {'loss': 0.4487, 'learning_rate': 1.4465373364454001e-05, 'epoch': 0.37} + 37%|███▋ | 3256/8750 [1:25:13<9:03:52, 5.94s/it] {'loss': 0.4487, 'learning_rate': 1.4465373364454001e-05, 'epoch': 0.37} + 37%|███▋ | 3256/8750 [1:25:10<9:03:53, 5.94s/it] 37%|███▋ | 3257/8750 [1:25:15<8:57:50, 5.87s/it] 37%|███▋ | 3257/8750 [1:25:18<8:57:50, 5.87s/it] {'loss': 0.4955, 'learning_rate': 1.4462060949312114e-05, 'epoch': 0.37} + 37%|███▋ | 3257/8750 [1:25:18<8:57:50, 5.87s/it] {'loss': 0.4955, 'learning_rate': 1.4462060949312114e-05, 'epoch': 0.37} + 37%|███▋ | 3257/8750 [1:25:15<8:57:50, 5.87s/it] 37%|███▋ | 3258/8750 [1:25:21<8:59:10, 5.89s/it] 37%|███▋ | 3258/8750 [1:25:24<8:59:11, 5.89s/it] {'loss': 0.4717, 'learning_rate': 1.4458747922768256e-05, 'epoch': 0.37} + 37%|███▋ | 3258/8750 [1:25:24<8:59:11, 5.89s/it] {'loss': 0.4717, 'learning_rate': 1.4458747922768256e-05, 'epoch': 0.37} + 37%|███▋ | 3258/8750 [1:25:21<8:59:10, 5.89s/it] 37%|███▋ | 3259/8750 [1:25:27<8:57:12, 5.87s/it] 37%|███▋ | 3259/8750 [1:25:30<8:57:12, 5.87s/it] {'loss': 0.4566, 'learning_rate': 1.4455434285276385e-05, 'epoch': 0.37} + 37%|███▋ | 3259/8750 [1:25:30<8:57:12, 5.87s/it] {'loss': 0.4566, 'learning_rate': 1.4455434285276385e-05, 'epoch': 0.37} + 37%|███▋ | 3259/8750 [1:25:27<8:57:12, 5.87s/it] 37%|███▋ | 3260/8750 [1:25:33<8:50:50, 5.80s/it] 37%|███▋ | 3260/8750 [1:25:36<8:50:50, 5.80s/it] {'loss': 0.4784, 'learning_rate': 1.4452120037290547e-05, 'epoch': 0.37} + 37%|███▋ | 3260/8750 [1:25:36<8:50:50, 5.80s/it] {'loss': 0.4784, 'learning_rate': 1.4452120037290547e-05, 'epoch': 0.37} + 37%|███▋ | 3260/8750 [1:25:33<8:50:50, 5.80s/it] 37%|███▋ | 3261/8750 [1:25:38<8:48:50, 5.78s/it] 37%|███▋ | 3261/8750 [1:25:41<8:48:50, 5.78s/it] {'loss': 0.4602, 'learning_rate': 1.444880517926486e-05, 'epoch': 0.37} + 37%|███▋ | 3261/8750 [1:25:41<8:48:50, 5.78s/it] {'loss': 0.4602, 'learning_rate': 1.444880517926486e-05, 
'epoch': 0.37} + 37%|███▋ | 3261/8750 [1:25:38<8:48:50, 5.78s/it] 37%|███▋ | 3262/8750 [1:25:44<8:50:41, 5.80s/it] 37%|███▋ | 3262/8750 [1:25:47<8:50:41, 5.80s/it] {'loss': 0.4926, 'learning_rate': 1.4445489711653542e-05, 'epoch': 0.37} + 37%|███▋ | 3262/8750 [1:25:47<8:50:41, 5.80s/it] {'loss': 0.4926, 'learning_rate': 1.4445489711653542e-05, 'epoch': 0.37} + 37%|███▋ | 3262/8750 [1:25:44<8:50:41, 5.80s/it] 37%|███▋ | 3263/8750 [1:25:50<8:45:19, 5.74s/it] 37%|███▋ | 3263/8750 [1:25:53<8:45:19, 5.74s/it] {'loss': 0.4705, 'learning_rate': 1.4442173634910881e-05, 'epoch': 0.37} + 37%|███▋ | 3263/8750 [1:25:53<8:45:19, 5.74s/it] {'loss': 0.4705, 'learning_rate': 1.4442173634910881e-05, 'epoch': 0.37} + 37%|███▋ | 3263/8750 [1:25:50<8:45:19, 5.74s/it] 37%|███▋ | 3264/8750 [1:25:56<8:47:36, 5.77s/it] 37%|███▋ | 3264/8750 [1:25:59<8:47:36, 5.77s/it] {'loss': 0.4573, 'learning_rate': 1.4438856949491258e-05, 'epoch': 0.37} + 37%|███▋ | 3264/8750 [1:25:59<8:47:36, 5.77s/it] {'loss': 0.4573, 'learning_rate': 1.4438856949491258e-05, 'epoch': 0.37} + 37%|███▋ | 3264/8750 [1:25:56<8:47:36, 5.77s/it] 37%|███▋ | 3265/8750 [1:26:02<8:49:34, 5.79s/it] 37%|███▋ | 3265/8750 [1:26:04<8:49:34, 5.79s/it] {'loss': 0.4668, 'learning_rate': 1.4435539655849126e-05, 'epoch': 0.37} + 37%|███▋ | 3265/8750 [1:26:04<8:49:34, 5.79s/it] {'loss': 0.4668, 'learning_rate': 1.4435539655849126e-05, 'epoch': 0.37} + 37%|███▋ | 3265/8750 [1:26:02<8:49:34, 5.79s/it] 37%|███▋ | 3266/8750 [1:26:07<8:49:12, 5.79s/it] 37%|███▋ | 3266/8750 [1:26:10<8:49:12, 5.79s/it] {'loss': 0.4646, 'learning_rate': 1.4432221754439037e-05, 'epoch': 0.37} + 37%|███▋ | 3266/8750 [1:26:10<8:49:12, 5.79s/it] {'loss': 0.4646, 'learning_rate': 1.4432221754439037e-05, 'epoch': 0.37} + 37%|███▋ | 3266/8750 [1:26:07<8:49:12, 5.79s/it] 37%|███▋ | 3267/8750 [1:26:13<8:58:35, 5.89s/it] 37%|███▋ | 3267/8750 [1:26:16<8:58:35, 5.89s/it] {'loss': 0.4654, 'learning_rate': 1.4428903245715611e-05, 'epoch': 0.37} + 37%|███▋ | 3267/8750 
[1:26:16<8:58:35, 5.89s/it] {'loss': 0.4654, 'learning_rate': 1.4428903245715611e-05, 'epoch': 0.37} + 37%|███▋ | 3267/8750 [1:26:13<8:58:35, 5.89s/it] 37%|███▋ | 3268/8750 [1:26:19<8:56:27, 5.87s/it] 37%|███▋ | 3268/8750 [1:26:22<8:56:27, 5.87s/it] {'loss': 0.4748, 'learning_rate': 1.442558413013356e-05, 'epoch': 0.37} + 37%|███▋ | 3268/8750 [1:26:22<8:56:27, 5.87s/it] {'loss': 0.4748, 'learning_rate': 1.442558413013356e-05, 'epoch': 0.37} + 37%|███▋ | 3268/8750 [1:26:19<8:56:27, 5.87s/it] 37%|███▋ | 3269/8750 [1:26:25<8:55:02, 5.86s/it] 37%|███▋ | 3269/8750 [1:26:28<8:55:02, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.4422264408147676e-05, 'epoch': 0.37} + 37%|███▋ | 3269/8750 [1:26:28<8:55:02, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.4422264408147676e-05, 'epoch': 0.37} + 37%|███▋ | 3269/8750 [1:26:25<8:55:02, 5.86s/it] 37%|███▋ | 3270/8750 [1:26:31<8:54:29, 5.85s/it] 37%|███▋ | 3270/8750 [1:26:34<8:54:29, 5.85s/it] {'loss': 0.4665, 'learning_rate': 1.4418944080212838e-05, 'epoch': 0.37} + 37%|███▋ | 3270/8750 [1:26:34<8:54:29, 5.85s/it] {'loss': 0.4665, 'learning_rate': 1.4418944080212838e-05, 'epoch': 0.37} + 37%|███▋ | 3270/8750 [1:26:31<8:54:29, 5.85s/it] 37%|███▋ | 3271/8750 [1:26:37<8:52:16, 5.83s/it] 37%|███▋ | 3271/8750 [1:26:40<8:52:16, 5.83s/it] {'loss': 0.4809, 'learning_rate': 1.4415623146784e-05, 'epoch': 0.37} + 37%|███▋ | 3271/8750 [1:26:40<8:52:16, 5.83s/it] {'loss': 0.4809, 'learning_rate': 1.4415623146784e-05, 'epoch': 0.37} + 37%|███▋ | 3271/8750 [1:26:37<8:52:16, 5.83s/it] 37%|███▋ | 3272/8750 [1:26:42<8:49:45, 5.80s/it] 37%|███▋ | 3272/8750 [1:26:45<8:49:45, 5.80s/it] {'loss': 0.4672, 'learning_rate': 1.441230160831621e-05, 'epoch': 0.37} + 37%|███▋ | 3272/8750 [1:26:45<8:49:45, 5.80s/it] {'loss': 0.4672, 'learning_rate': 1.441230160831621e-05, 'epoch': 0.37} + 37%|███▋ | 3272/8750 [1:26:42<8:49:45, 5.80s/it] 37%|███▋ | 3273/8750 [1:26:48<8:48:26, 5.79s/it] 37%|███▋ | 3273/8750 [1:26:51<8:48:26, 5.79s/it] {'loss': 0.4713, 
'learning_rate': 1.4408979465264588e-05, 'epoch': 0.37} + 37%|███▋ | 3273/8750 [1:26:51<8:48:26, 5.79s/it] {'loss': 0.4713, 'learning_rate': 1.4408979465264588e-05, 'epoch': 0.37} + 37%|███▋ | 3273/8750 [1:26:48<8:48:26, 5.79s/it] 37%|███▋ | 3274/8750 [1:26:54<8:51:08, 5.82s/it] 37%|███▋ | 3274/8750 [1:26:57<8:51:08, 5.82s/it] {'loss': 0.4615, 'learning_rate': 1.4405656718084344e-05, 'epoch': 0.37} + 37%|███▋ | 3274/8750 [1:26:57<8:51:08, 5.82s/it] {'loss': 0.4615, 'learning_rate': 1.4405656718084344e-05, 'epoch': 0.37} + 37%|███▋ | 3274/8750 [1:26:54<8:51:08, 5.82s/it] 37%|███▋ | 3275/8750 [1:27:00<8:48:44, 5.79s/it] 37%|███▋ | 3275/8750 [1:27:03<8:48:44, 5.79s/it] {'loss': 0.5039, 'learning_rate': 1.440233336723077e-05, 'epoch': 0.37} + 37%|███▋ | 3275/8750 [1:27:03<8:48:44, 5.79s/it] {'loss': 0.5039, 'learning_rate': 1.440233336723077e-05, 'epoch': 0.37} + 37%|███▋ | 3275/8750 [1:27:00<8:48:44, 5.79s/it] 37%|███▋ | 3276/8750 [1:27:06<8:47:22, 5.78s/it] 37%|███▋ | 3276/8750 [1:27:09<8:47:22, 5.78s/it] {'loss': 0.4667, 'learning_rate': 1.4399009413159234e-05, 'epoch': 0.37} + 37%|███▋ | 3276/8750 [1:27:09<8:47:22, 5.78s/it] {'loss': 0.4667, 'learning_rate': 1.4399009413159234e-05, 'epoch': 0.37} + 37%|███▋ | 3276/8750 [1:27:06<8:47:22, 5.78s/it] 37%|███▋ | 3277/8750 [1:27:11<8:47:49, 5.79s/it] 37%|███▋ | 3277/8750 [1:27:14<8:47:49, 5.79s/it] {'loss': 0.4888, 'learning_rate': 1.4395684856325198e-05, 'epoch': 0.37} + 37%|███▋ | 3277/8750 [1:27:14<8:47:49, 5.79s/it] {'loss': 0.4888, 'learning_rate': 1.4395684856325198e-05, 'epoch': 0.37} + 37%|███▋ | 3277/8750 [1:27:11<8:47:49, 5.79s/it] 37%|███▋ | 3278/8750 [1:27:17<8:48:38, 5.80s/it] 37%|███▋ | 3278/8750 [1:27:20<8:48:38, 5.80s/it] {'loss': 0.4571, 'learning_rate': 1.4392359697184197e-05, 'epoch': 0.37} + 37%|███▋ | 3278/8750 [1:27:20<8:48:38, 5.80s/it] {'loss': 0.4571, 'learning_rate': 1.4392359697184197e-05, 'epoch': 0.37} + 37%|███▋ | 3278/8750 [1:27:17<8:48:38, 5.80s/it] 37%|███▋ | 3279/8750 [1:27:23<8:53:48, 
5.85s/it] 37%|███▋ | 3279/8750 [1:27:26<8:53:48, 5.85s/it] {'loss': 0.4598, 'learning_rate': 1.4389033936191851e-05, 'epoch': 0.37} + 37%|███▋ | 3279/8750 [1:27:26<8:53:48, 5.85s/it] {'loss': 0.4598, 'learning_rate': 1.4389033936191851e-05, 'epoch': 0.37} + 37%|███▋ | 3279/8750 [1:27:23<8:53:48, 5.85s/it] 37%|███▋ | 3280/8750 [1:27:29<8:50:53, 5.82s/it] 37%|███▋ | 3280/8750 [1:27:32<8:50:53, 5.82s/it] {'loss': 0.4715, 'learning_rate': 1.4385707573803869e-05, 'epoch': 0.37} + 37%|███▋ | 3280/8750 [1:27:32<8:50:53, 5.82s/it] {'loss': 0.4715, 'learning_rate': 1.4385707573803869e-05, 'epoch': 0.37} + 37%|███▋ | 3280/8750 [1:27:29<8:50:53, 5.82s/it] 37%|███▋ | 3281/8750 [1:27:35<8:49:14, 5.81s/it] 37%|███▋ | 3281/8750 [1:27:38<8:49:14, 5.81s/it] {'loss': 0.5006, 'learning_rate': 1.4382380610476032e-05, 'epoch': 0.37} + 37%|███▋ | 3281/8750 [1:27:38<8:49:14, 5.81s/it] {'loss': 0.5006, 'learning_rate': 1.4382380610476032e-05, 'epoch': 0.37} + 37%|███▋ | 3281/8750 [1:27:35<8:49:14, 5.81s/it] 38%|███▊ | 3282/8750 [1:27:40<8:46:07, 5.77s/it] 38%|███▊ | 3282/8750 [1:27:43<8:46:07, 5.77s/it] {'loss': 0.4744, 'learning_rate': 1.4379053046664208e-05, 'epoch': 0.38} + 38%|███▊ | 3282/8750 [1:27:43<8:46:07, 5.77s/it] {'loss': 0.4744, 'learning_rate': 1.4379053046664208e-05, 'epoch': 0.38} + 38%|███▊ | 3282/8750 [1:27:40<8:46:07, 5.77s/it] 38%|███▊ | 3283/8750 [1:27:46<8:49:21, 5.81s/it] 38%|███▊ | 3283/8750 [1:27:49<8:49:21, 5.81s/it] {'loss': 0.4742, 'learning_rate': 1.437572488282435e-05, 'epoch': 0.38} + 38%|███▊ | 3283/8750 [1:27:49<8:49:21, 5.81s/it] {'loss': 0.4742, 'learning_rate': 1.437572488282435e-05, 'epoch': 0.38} + 38%|███▊ | 3283/8750 [1:27:46<8:49:21, 5.81s/it] 38%|███▊ | 3284/8750 [1:27:52<8:47:58, 5.80s/it] 38%|███▊ | 3284/8750 [1:27:55<8:47:58, 5.80s/it] {'loss': 0.4635, 'learning_rate': 1.4372396119412493e-05, 'epoch': 0.38} + 38%|███▊ | 3284/8750 [1:27:55<8:47:58, 5.80s/it] {'loss': 0.4635, 'learning_rate': 1.4372396119412493e-05, 'epoch': 0.38} + 38%|███▊ | 
3284/8750 [1:27:52<8:47:58, 5.80s/it] 38%|███▊ | 3285/8750 [1:27:58<8:46:06, 5.78s/it] 38%|███▊ | 3285/8750 [1:28:01<8:46:06, 5.78s/it] {'loss': 0.4539, 'learning_rate': 1.4369066756884745e-05, 'epoch': 0.38} + 38%|███▊ | 3285/8750 [1:28:01<8:46:06, 5.78s/it] {'loss': 0.4539, 'learning_rate': 1.4369066756884745e-05, 'epoch': 0.38} + 38%|███▊ | 3285/8750 [1:27:58<8:46:06, 5.78s/it] 38%|███▊ | 3286/8750 [1:28:04<8:51:18, 5.83s/it] 38%|███▊ | 3286/8750 [1:28:07<8:51:18, 5.83s/it] {'loss': 0.4807, 'learning_rate': 1.4365736795697306e-05, 'epoch': 0.38} + 38%|███▊ | 3286/8750 [1:28:07<8:51:18, 5.83s/it] {'loss': 0.4807, 'learning_rate': 1.4365736795697306e-05, 'epoch': 0.38} + 38%|███▊ | 3286/8750 [1:28:04<8:51:18, 5.83s/it] 38%|███▊ | 3287/8750 [1:28:09<8:46:29, 5.78s/it] 38%|███▊ | 3287/8750 [1:28:12<8:46:31, 5.78s/it] {'loss': 0.486, 'learning_rate': 1.436240623630646e-05, 'epoch': 0.38} + 38%|███▊ | 3287/8750 [1:28:12<8:46:31, 5.78s/it] {'loss': 0.486, 'learning_rate': 1.436240623630646e-05, 'epoch': 0.38} + 38%|███▊ | 3287/8750 [1:28:09<8:46:29, 5.78s/it] 38%|███▊ | 3288/8750 [1:28:15<8:47:43, 5.80s/it] 38%|███▊ | 3288/8750 [1:28:18<8:47:42, 5.80s/it] {'loss': 0.4795, 'learning_rate': 1.4359075079168562e-05, 'epoch': 0.38} + 38%|███▊ | 3288/8750 [1:28:18<8:47:42, 5.80s/it] {'loss': 0.4795, 'learning_rate': 1.4359075079168562e-05, 'epoch': 0.38} + 38%|███▊ | 3288/8750 [1:28:15<8:47:43, 5.80s/it] 38%|███▊ | 3289/8750 [1:28:21<8:43:03, 5.75s/it] 38%|███▊ | 3289/8750 [1:28:24<8:43:02, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.4355743324740055e-05, 'epoch': 0.38} + 38%|███▊ | 3289/8750 [1:28:24<8:43:02, 5.75s/it] {'loss': 0.4836, 'learning_rate': 1.4355743324740055e-05, 'epoch': 0.38} + 38%|███▊ | 3289/8750 [1:28:21<8:43:03, 5.75s/it] 38%|███▊ | 3290/8750 [1:28:27<8:47:47, 5.80s/it] 38%|███▊ | 3290/8750 [1:28:30<8:47:47, 5.80s/it] {'loss': 0.4509, 'learning_rate': 1.4352410973477466e-05, 'epoch': 0.38} + 38%|███▊ | 3290/8750 [1:28:30<8:47:47, 5.80s/it] {'loss': 
0.4509, 'learning_rate': 1.4352410973477466e-05, 'epoch': 0.38} + 38%|███▊ | 3290/8750 [1:28:27<8:47:47, 5.80s/it] 38%|███▊ | 3291/8750 [1:28:33<8:44:05, 5.76s/it] 38%|███▊ | 3291/8750 [1:28:35<8:44:05, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.4349078025837401e-05, 'epoch': 0.38} + 38%|███▊ | 3291/8750 [1:28:35<8:44:05, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.4349078025837401e-05, 'epoch': 0.38} + 38%|███▊ | 3291/8750 [1:28:33<8:44:05, 5.76s/it] 38%|███▊ | 3292/8750 [1:28:38<8:46:59, 5.79s/it] 38%|███▊ | 3292/8750 [1:28:41<8:46:59, 5.79s/it] {'loss': 0.4521, 'learning_rate': 1.4345744482276551e-05, 'epoch': 0.38} + 38%|███▊ | 3292/8750 [1:28:41<8:46:59, 5.79s/it] {'loss': 0.4521, 'learning_rate': 1.4345744482276551e-05, 'epoch': 0.38} + 38%|███▊ | 3292/8750 [1:28:38<8:46:59, 5.79s/it] 38%|███▊ | 3293/8750 [1:28:44<8:52:04, 5.85s/it] 38%|███▊ | 3293/8750 [1:28:47<8:52:03, 5.85s/it] {'loss': 0.461, 'learning_rate': 1.4342410343251683e-05, 'epoch': 0.38} + 38%|███▊ | 3293/8750 [1:28:47<8:52:03, 5.85s/it] {'loss': 0.461, 'learning_rate': 1.4342410343251683e-05, 'epoch': 0.38} + 38%|███▊ | 3293/8750 [1:28:44<8:52:04, 5.85s/it] 38%|███▊ | 3294/8750 [1:28:50<8:48:42, 5.81s/it] 38%|███▊ | 3294/8750 [1:28:53<8:48:42, 5.81s/it] {'loss': 0.4714, 'learning_rate': 1.4339075609219645e-05, 'epoch': 0.38} + 38%|███▊ | 3294/8750 [1:28:53<8:48:42, 5.81s/it] {'loss': 0.4714, 'learning_rate': 1.4339075609219645e-05, 'epoch': 0.38} + 38%|███▊ | 3294/8750 [1:28:50<8:48:42, 5.81s/it] 38%|███▊ | 3295/8750 [1:28:56<8:48:07, 5.81s/it] 38%|███▊ | 3295/8750 [1:28:59<8:48:07, 5.81s/it] {'loss': 0.4697, 'learning_rate': 1.4335740280637374e-05, 'epoch': 0.38} + 38%|███▊ | 3295/8750 [1:28:59<8:48:07, 5.81s/it] {'loss': 0.4697, 'learning_rate': 1.4335740280637374e-05, 'epoch': 0.38} + 38%|███▊ | 3295/8750 [1:28:56<8:48:07, 5.81s/it] 38%|███▊ | 3296/8750 [1:29:02<8:44:35, 5.77s/it] 38%|███▊ | 3296/8750 [1:29:04<8:44:35, 5.77s/it] {'loss': 0.4775, 'learning_rate': 1.4332404357961884e-05, 
'epoch': 0.38} + 38%|███▊ | 3296/8750 [1:29:04<8:44:35, 5.77s/it] {'loss': 0.4775, 'learning_rate': 1.4332404357961884e-05, 'epoch': 0.38} + 38%|███▊ | 3296/8750 [1:29:02<8:44:35, 5.77s/it] 38%|███▊ | 3297/8750 [1:29:07<8:48:11, 5.81s/it] 38%|███▊ | 3297/8750 [1:29:10<8:48:11, 5.81s/it] {'loss': 0.4835, 'learning_rate': 1.4329067841650274e-05, 'epoch': 0.38} + 38%|███▊ | 3297/8750 [1:29:10<8:48:11, 5.81s/it] {'loss': 0.4835, 'learning_rate': 1.4329067841650274e-05, 'epoch': 0.38} + 38%|███▊ | 3297/8750 [1:29:07<8:48:11, 5.81s/it] 38%|███▊ | 3298/8750 [1:29:13<8:52:48, 5.86s/it] 38%|███▊ | 3298/8750 [1:29:16<8:52:48, 5.86s/it] {'loss': 0.4689, 'learning_rate': 1.4325730732159717e-05, 'epoch': 0.38} + 38%|███▊ | 3298/8750 [1:29:16<8:52:48, 5.86s/it] {'loss': 0.4689, 'learning_rate': 1.4325730732159717e-05, 'epoch': 0.38} + 38%|███▊ | 3298/8750 [1:29:13<8:52:48, 5.86s/it] 38%|███▊ | 3299/8750 [1:29:19<8:46:37, 5.80s/it] 38%|███▊ | 3299/8750 [1:29:22<8:46:37, 5.80s/it] {'loss': 0.4673, 'learning_rate': 1.432239302994747e-05, 'epoch': 0.38} + 38%|███▊ | 3299/8750 [1:29:22<8:46:37, 5.80s/it] {'loss': 0.4673, 'learning_rate': 1.432239302994747e-05, 'epoch': 0.38} + 38%|███▊ | 3299/8750 [1:29:19<8:46:37, 5.80s/it]010 AutoResumeHook: Checking whether to suspend... +51 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +14 + AutoResumeHook: Checking whether to suspend... + 8 AutoResumeHook: Checking whether to suspend... 38%|███▊ | 3300/8750 [1:29:25<8:50:33, 5.84s/it]AutoResumeHook: Checking whether to suspend... + +4 9 AutoResumeHook: Checking whether to suspend... + 38%|███▊ | 3300/8750 [1:29:28<8:50:32, 5.84s/it]AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... 
+6 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.462, 'learning_rate': 1.4319054735470879e-05, 'epoch': 0.38} + 38%|███▊ | 3300/8750 [1:29:28<8:50:32, 5.84s/it] {'loss': 0.462, 'learning_rate': 1.4319054735470879e-05, 'epoch': 0.38} + 38%|███▊ | 3300/8750 [1:29:25<8:50:33, 5.84s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 38%|███▊ | 3301/8750 [1:29:45<15:12:56, 10.05s/it] 38%|███▊ | 3301/8750 [1:29:48<15:12:56, 10.05s/it] {'loss': 0.4648, 'learning_rate': 1.4315715849187362e-05, 'epoch': 0.38} + 38%|███▊ | 3301/8750 [1:29:48<15:12:56, 10.05s/it] {'loss': 0.4648, 'learning_rate': 1.4315715849187362e-05, 'epoch': 0.38} + 38%|███▊ | 3301/8750 [1:29:45<15:12:56, 10.05s/it] 38%|███▊ | 3302/8750 [1:29:51<13:20:44, 8.82s/it] 38%|███▊ | 3302/8750 [1:29:54<13:20:45, 8.82s/it] {'loss': 0.4644, 'learning_rate': 1.4312376371554417e-05, 'epoch': 0.38} + 38%|███▊ | 3302/8750 [1:29:54<13:20:45, 8.82s/it] {'loss': 0.4644, 'learning_rate': 1.4312376371554417e-05, 'epoch': 0.38} + 38%|███▊ | 3302/8750 [1:29:51<13:20:44, 8.82s/it] 38%|███▊ | 3303/8750 [1:29:57<11:59:48, 7.93s/it] 38%|███▊ | 3303/8750 [1:30:00<11:59:48, 7.93s/it] {'loss': 0.4579, 'learning_rate': 1.4309036303029632e-05, 'epoch': 0.38} + 38%|███▊ | 3303/8750 [1:30:00<11:59:48, 7.93s/it] {'loss': 0.4579, 'learning_rate': 1.4309036303029632e-05, 'epoch': 0.38} + 38%|███▊ | 3303/8750 [1:29:57<11:59:48, 7.93s/it] 38%|███▊ | 3304/8750 [1:30:02<11:00:57, 7.28s/it] 38%|███▊ | 3304/8750 [1:30:05<11:00:57, 7.28s/it] {'loss': 0.4781, 'learning_rate': 1.4305695644070665e-05, 'epoch': 0.38} + 38%|███▊ | 3304/8750 [1:30:05<11:00:57, 7.28s/it] {'loss': 0.4781, 'learning_rate': 1.4305695644070665e-05, 'epoch': 0.38} + 38%|███▊ | 3304/8750 [1:30:02<11:00:57, 7.28s/it] 38%|███▊ | 3305/8750 [1:30:08<10:19:07, 6.82s/it] 38%|███▊ | 3305/8750 [1:30:11<10:19:09, 6.82s/it] {'loss': 0.4719, 'learning_rate': 1.4302354395135269e-05, 'epoch': 0.38} + 38%|███▊ | 3305/8750 [1:30:11<10:19:09, 6.82s/it] {'loss': 0.4719, 'learning_rate': 1.4302354395135269e-05, 'epoch': 0.38} + 38%|███▊ | 3305/8750 [1:30:08<10:19:07, 6.82s/it] 38%|███▊ | 3306/8750 [1:30:14<9:54:01, 6.55s/it] 38%|███▊ | 3306/8750 [1:30:17<9:54:00, 6.55s/it] {'loss': 0.4776, 'learning_rate': 1.4299012556681269e-05, 'epoch': 0.38} + 38%|███▊ | 3306/8750 [1:30:17<9:54:00, 6.55s/it] 
{'loss': 0.4776, 'learning_rate': 1.4299012556681269e-05, 'epoch': 0.38} + 38%|███▊ | 3306/8750 [1:30:14<9:54:01, 6.55s/it] 38%|███▊ | 3307/8750 [1:30:20<9:37:45, 6.37s/it] 38%|███▊ | 3307/8750 [1:30:23<9:37:45, 6.37s/it] {'loss': 0.4732, 'learning_rate': 1.4295670129166564e-05, 'epoch': 0.38} + 38%|███▊ | 3307/8750 [1:30:23<9:37:45, 6.37s/it] {'loss': 0.4732, 'learning_rate': 1.4295670129166564e-05, 'epoch': 0.38} + 38%|███▊ | 3307/8750 [1:30:20<9:37:45, 6.37s/it] 38%|███▊ | 3308/8750 [1:30:26<9:21:49, 6.19s/it] 38%|███▊ | 3308/8750 [1:30:29<9:21:49, 6.19s/it] {'loss': 0.4742, 'learning_rate': 1.4292327113049145e-05, 'epoch': 0.38} + 38%|███▊ | 3308/8750 [1:30:29<9:21:49, 6.19s/it] {'loss': 0.4742, 'learning_rate': 1.4292327113049145e-05, 'epoch': 0.38} + 38%|███▊ | 3308/8750 [1:30:26<9:21:49, 6.19s/it] 38%|███▊ | 3309/8750 [1:30:32<9:10:22, 6.07s/it] 38%|███▊ | 3309/8750 [1:30:35<9:10:22, 6.07s/it] {'loss': 0.4832, 'learning_rate': 1.428898350878708e-05, 'epoch': 0.38} + 38%|███▊ | 3309/8750 [1:30:35<9:10:22, 6.07s/it] {'loss': 0.4832, 'learning_rate': 1.428898350878708e-05, 'epoch': 0.38} + 38%|███▊ | 3309/8750 [1:30:32<9:10:22, 6.07s/it] 38%|███▊ | 3310/8750 [1:30:38<9:04:08, 6.00s/it] 38%|███▊ | 3310/8750 [1:30:40<9:04:08, 6.00s/it] {'loss': 0.4557, 'learning_rate': 1.428563931683852e-05, 'epoch': 0.38} + 38%|███▊ | 3310/8750 [1:30:40<9:04:08, 6.00s/it] {'loss': 0.4557, 'learning_rate': 1.428563931683852e-05, 'epoch': 0.38} + 38%|███▊ | 3310/8750 [1:30:38<9:04:08, 6.00s/it] 38%|███▊ | 3311/8750 [1:30:43<8:55:44, 5.91s/it] 38%|███▊ | 3311/8750 [1:30:46<8:55:44, 5.91s/it] {'loss': 0.4736, 'learning_rate': 1.4282294537661692e-05, 'epoch': 0.38} + 38%|███▊ | 3311/8750 [1:30:46<8:55:44, 5.91s/it] {'loss': 0.4736, 'learning_rate': 1.4282294537661692e-05, 'epoch': 0.38} + 38%|███▊ | 3311/8750 [1:30:43<8:55:44, 5.91s/it] 38%|███▊ | 3312/8750 [1:30:49<8:52:06, 5.87s/it] 38%|███▊ | 3312/8750 [1:30:52<8:52:06, 5.87s/it] {'loss': 0.4591, 'learning_rate': 
1.4278949171714904e-05, 'epoch': 0.38} + 38%|███▊ | 3312/8750 [1:30:52<8:52:06, 5.87s/it] {'loss': 0.4591, 'learning_rate': 1.4278949171714904e-05, 'epoch': 0.38} + 38%|███▊ | 3312/8750 [1:30:49<8:52:06, 5.87s/it] 38%|███▊ | 3313/8750 [1:30:55<8:47:20, 5.82s/it] 38%|███▊ | 3313/8750 [1:30:58<8:47:20, 5.82s/it] {'loss': 0.4673, 'learning_rate': 1.4275603219456544e-05, 'epoch': 0.38} + 38%|███▊ | 3313/8750 [1:30:58<8:47:20, 5.82s/it] {'loss': 0.4673, 'learning_rate': 1.4275603219456544e-05, 'epoch': 0.38} + 38%|███▊ | 3313/8750 [1:30:55<8:47:20, 5.82s/it] 38%|███▊ | 3314/8750 [1:31:00<8:46:08, 5.81s/it] 38%|███▊ | 3314/8750 [1:31:03<8:46:08, 5.81s/it] {'loss': 0.4759, 'learning_rate': 1.4272256681345087e-05, 'epoch': 0.38} + 38%|███▊ | 3314/8750 [1:31:03<8:46:08, 5.81s/it] {'loss': 0.4759, 'learning_rate': 1.4272256681345087e-05, 'epoch': 0.38} + 38%|███▊ | 3314/8750 [1:31:00<8:46:08, 5.81s/it] 38%|███▊ | 3315/8750 [1:31:06<8:48:55, 5.84s/it] 38%|███▊ | 3315/8750 [1:31:09<8:48:56, 5.84s/it] {'loss': 0.4635, 'learning_rate': 1.4268909557839085e-05, 'epoch': 0.38} + 38%|███▊ | 3315/8750 [1:31:09<8:48:56, 5.84s/it] {'loss': 0.4635, 'learning_rate': 1.4268909557839085e-05, 'epoch': 0.38} + 38%|███▊ | 3315/8750 [1:31:06<8:48:55, 5.84s/it] 38%|███▊ | 3316/8750 [1:31:12<8:46:28, 5.81s/it] 38%|███▊ | 3316/8750 [1:31:15<8:46:27, 5.81s/it] {'loss': 0.4533, 'learning_rate': 1.4265561849397163e-05, 'epoch': 0.38} + 38%|███▊ | 3316/8750 [1:31:15<8:46:27, 5.81s/it] {'loss': 0.4533, 'learning_rate': 1.4265561849397163e-05, 'epoch': 0.38} + 38%|███▊ | 3316/8750 [1:31:12<8:46:28, 5.81s/it] 38%|███▊ | 3317/8750 [1:31:18<8:48:17, 5.83s/it] 38%|███▊ | 3317/8750 [1:31:21<8:48:16, 5.83s/it] {'loss': 0.4715, 'learning_rate': 1.4262213556478033e-05, 'epoch': 0.38} + 38%|███▊ | 3317/8750 [1:31:21<8:48:16, 5.83s/it] {'loss': 0.4715, 'learning_rate': 1.4262213556478033e-05, 'epoch': 0.38} + 38%|███▊ | 3317/8750 [1:31:18<8:48:17, 5.83s/it] 38%|███▊ | 3318/8750 [1:31:24<8:47:18, 5.82s/it] 
38%|███▊ | 3318/8750 [1:31:27<8:47:17, 5.82s/it] {'loss': 0.4616, 'learning_rate': 1.4258864679540488e-05, 'epoch': 0.38} + 38%|███▊ | 3318/8750 [1:31:27<8:47:17, 5.82s/it] {'loss': 0.4616, 'learning_rate': 1.4258864679540488e-05, 'epoch': 0.38} + 38%|███▊ | 3318/8750 [1:31:24<8:47:18, 5.82s/it] 38%|███▊ | 3319/8750 [1:31:30<8:47:34, 5.83s/it] 38%|███▊ | 3319/8750 [1:31:33<8:47:34, 5.83s/it] {'loss': 0.4528, 'learning_rate': 1.4255515219043398e-05, 'epoch': 0.38} + 38%|███▊ | 3319/8750 [1:31:33<8:47:34, 5.83s/it] {'loss': 0.4528, 'learning_rate': 1.4255515219043398e-05, 'epoch': 0.38} + 38%|███▊ | 3319/8750 [1:31:30<8:47:34, 5.83s/it] 38%|███▊ | 3320/8750 [1:31:35<8:41:50, 5.77s/it] 38%|███▊ | 3320/8750 [1:31:38<8:41:50, 5.77s/it] {'loss': 0.4803, 'learning_rate': 1.425216517544571e-05, 'epoch': 0.38} + 38%|███▊ | 3320/8750 [1:31:38<8:41:50, 5.77s/it] {'loss': 0.4803, 'learning_rate': 1.425216517544571e-05, 'epoch': 0.38} + 38%|███▊ | 3320/8750 [1:31:35<8:41:50, 5.77s/it] 38%|███▊ | 3321/8750 [1:31:41<8:39:44, 5.74s/it] 38%|███▊ | 3321/8750 [1:31:44<8:39:44, 5.74s/it] {'loss': 0.462, 'learning_rate': 1.4248814549206464e-05, 'epoch': 0.38} + 38%|███▊ | 3321/8750 [1:31:44<8:39:44, 5.74s/it] {'loss': 0.462, 'learning_rate': 1.4248814549206464e-05, 'epoch': 0.38} + 38%|███▊ | 3321/8750 [1:31:41<8:39:44, 5.74s/it] 38%|███▊ | 3322/8750 [1:31:47<8:36:22, 5.71s/it] 38%|███▊ | 3322/8750 [1:31:50<8:36:22, 5.71s/it] {'loss': 0.4962, 'learning_rate': 1.4245463340784761e-05, 'epoch': 0.38} + 38%|███▊ | 3322/8750 [1:31:50<8:36:22, 5.71s/it] {'loss': 0.4962, 'learning_rate': 1.4245463340784761e-05, 'epoch': 0.38} + 38%|███▊ | 3322/8750 [1:31:47<8:36:22, 5.71s/it] 38%|███▊ | 3323/8750 [1:31:52<8:36:41, 5.71s/it] 38%|███▊ | 3323/8750 [1:31:55<8:36:41, 5.71s/it] {'loss': 0.4631, 'learning_rate': 1.4242111550639797e-05, 'epoch': 0.38} + 38%|███▊ | 3323/8750 [1:31:55<8:36:41, 5.71s/it] {'loss': 0.4631, 'learning_rate': 1.4242111550639797e-05, 'epoch': 0.38} + 38%|███▊ | 3323/8750 
[1:31:52<8:36:41, 5.71s/it] 38%|███▊ | 3324/8750 [1:31:58<8:39:03, 5.74s/it] 38%|███▊ | 3324/8750 [1:32:01<8:39:03, 5.74s/it] {'loss': 0.4858, 'learning_rate': 1.4238759179230841e-05, 'epoch': 0.38} + 38%|███▊ | 3324/8750 [1:32:01<8:39:03, 5.74s/it] {'loss': 0.4858, 'learning_rate': 1.4238759179230841e-05, 'epoch': 0.38} + 38%|███▊ | 3324/8750 [1:31:58<8:39:03, 5.74s/it] 38%|███▊ | 3325/8750 [1:32:04<8:40:37, 5.76s/it] 38%|███▊ | 3325/8750 [1:32:07<8:40:37, 5.76s/it] {'loss': 0.4673, 'learning_rate': 1.4235406227017241e-05, 'epoch': 0.38} + 38%|███▊ | 3325/8750 [1:32:07<8:40:37, 5.76s/it] {'loss': 0.4673, 'learning_rate': 1.4235406227017241e-05, 'epoch': 0.38} + 38%|███▊ | 3325/8750 [1:32:04<8:40:37, 5.76s/it] 38%|███▊ | 3326/8750 [1:32:10<8:39:55, 5.75s/it] 38%|███▊ | 3326/8750 [1:32:13<8:39:55, 5.75s/it] {'loss': 0.4831, 'learning_rate': 1.423205269445843e-05, 'epoch': 0.38} + 38%|███▊ | 3326/8750 [1:32:13<8:39:55, 5.75s/it] {'loss': 0.4831, 'learning_rate': 1.423205269445843e-05, 'epoch': 0.38} + 38%|███▊ | 3326/8750 [1:32:10<8:39:55, 5.75s/it] 38%|███▊ | 3327/8750 [1:32:15<8:41:31, 5.77s/it] 38%|███▊ | 3327/8750 [1:32:18<8:41:32, 5.77s/it] {'loss': 0.4592, 'learning_rate': 1.4228698582013908e-05, 'epoch': 0.38} + 38%|███▊ | 3327/8750 [1:32:18<8:41:32, 5.77s/it] {'loss': 0.4592, 'learning_rate': 1.4228698582013908e-05, 'epoch': 0.38} + 38%|███▊ | 3327/8750 [1:32:15<8:41:31, 5.77s/it] 38%|███▊ | 3328/8750 [1:32:21<8:44:29, 5.80s/it] 38%|███▊ | 3328/8750 [1:32:24<8:44:29, 5.80s/it] {'loss': 0.4626, 'learning_rate': 1.4225343890143275e-05, 'epoch': 0.38} + 38%|███▊ | 3328/8750 [1:32:24<8:44:29, 5.80s/it] {'loss': 0.4626, 'learning_rate': 1.4225343890143275e-05, 'epoch': 0.38} + 38%|███▊ | 3328/8750 [1:32:21<8:44:29, 5.80s/it] 38%|███▊ | 3329/8750 [1:32:27<8:44:59, 5.81s/it] 38%|███▊ | 3329/8750 [1:32:30<8:44:59, 5.81s/it] {'loss': 0.4664, 'learning_rate': 1.4221988619306192e-05, 'epoch': 0.38} + 38%|███▊ | 3329/8750 [1:32:30<8:44:59, 5.81s/it] {'loss': 0.4664, 
'learning_rate': 1.4221988619306192e-05, 'epoch': 0.38} + 38%|███▊ | 3329/8750 [1:32:27<8:44:59, 5.81s/it] 38%|███▊ | 3330/8750 [1:32:33<8:45:40, 5.82s/it] 38%|███▊ | 3330/8750 [1:32:36<8:45:40, 5.82s/it] {'loss': 0.4782, 'learning_rate': 1.4218632769962408e-05, 'epoch': 0.38} + 38%|███▊ | 3330/8750 [1:32:36<8:45:40, 5.82s/it] {'loss': 0.4782, 'learning_rate': 1.4218632769962408e-05, 'epoch': 0.38} + 38%|███▊ | 3330/8750 [1:32:33<8:45:40, 5.82s/it] 38%|███▊ | 3331/8750 [1:32:39<8:46:16, 5.83s/it] 38%|███▊ | 3331/8750 [1:32:42<8:46:16, 5.83s/it] {'loss': 0.4562, 'learning_rate': 1.4215276342571749e-05, 'epoch': 0.38} + 38%|███▊ | 3331/8750 [1:32:42<8:46:16, 5.83s/it] {'loss': 0.4562, 'learning_rate': 1.4215276342571749e-05, 'epoch': 0.38} + 38%|███▊ | 3331/8750 [1:32:39<8:46:16, 5.83s/it] 38%|███▊ | 3332/8750 [1:32:45<8:48:35, 5.85s/it] 38%|███▊ | 3332/8750 [1:32:48<8:48:35, 5.85s/it] {'loss': 0.4621, 'learning_rate': 1.4211919337594118e-05, 'epoch': 0.38} + 38%|███▊ | 3332/8750 [1:32:48<8:48:35, 5.85s/it] {'loss': 0.4621, 'learning_rate': 1.4211919337594118e-05, 'epoch': 0.38} + 38%|███▊ | 3332/8750 [1:32:45<8:48:35, 5.85s/it] 38%|███▊ | 3333/8750 [1:32:51<8:44:59, 5.81s/it] 38%|███▊ | 3333/8750 [1:32:53<8:44:59, 5.81s/it] {'loss': 0.4751, 'learning_rate': 1.4208561755489502e-05, 'epoch': 0.38} + 38%|███▊ | 3333/8750 [1:32:53<8:44:59, 5.81s/it] {'loss': 0.4751, 'learning_rate': 1.4208561755489502e-05, 'epoch': 0.38} + 38%|███▊ | 3333/8750 [1:32:51<8:44:59, 5.81s/it] 38%|███▊ | 3334/8750 [1:32:56<8:42:34, 5.79s/it] 38%|███▊ | 3334/8750 [1:32:59<8:42:34, 5.79s/it] {'loss': 0.4712, 'learning_rate': 1.4205203596717966e-05, 'epoch': 0.38} + 38%|███▊ | 3334/8750 [1:32:59<8:42:34, 5.79s/it] {'loss': 0.4712, 'learning_rate': 1.4205203596717966e-05, 'epoch': 0.38} + 38%|███▊ | 3334/8750 [1:32:56<8:42:34, 5.79s/it] 38%|███▊ | 3335/8750 [1:33:02<8:37:54, 5.74s/it] 38%|███▊ | 3335/8750 [1:33:05<8:37:54, 5.74s/it] {'loss': 0.4627, 'learning_rate': 1.420184486173965e-05, 
'epoch': 0.38} + 38%|███▊ | 3335/8750 [1:33:05<8:37:54, 5.74s/it] {'loss': 0.4627, 'learning_rate': 1.420184486173965e-05, 'epoch': 0.38} + 38%|███▊ | 3335/8750 [1:33:02<8:37:54, 5.74s/it] 38%|███▊ | 3336/8750 [1:33:08<8:50:28, 5.88s/it] 38%|███▊ | 3336/8750 [1:33:11<8:50:28, 5.88s/it] {'loss': 0.4534, 'learning_rate': 1.4198485551014778e-05, 'epoch': 0.38} + 38%|███▊ | 3336/8750 [1:33:11<8:50:28, 5.88s/it] {'loss': 0.4534, 'learning_rate': 1.4198485551014778e-05, 'epoch': 0.38} + 38%|███▊ | 3336/8750 [1:33:08<8:50:28, 5.88s/it] 38%|███▊ | 3337/8750 [1:33:14<8:46:59, 5.84s/it] 38%|███▊ | 3337/8750 [1:33:17<8:46:59, 5.84s/it] {'loss': 0.4545, 'learning_rate': 1.4195125665003648e-05, 'epoch': 0.38} + 38%|███▊ | 3337/8750 [1:33:17<8:46:59, 5.84s/it] {'loss': 0.4545, 'learning_rate': 1.4195125665003648e-05, 'epoch': 0.38} + 38%|███▊ | 3337/8750 [1:33:14<8:46:59, 5.84s/it] 38%|███▊ | 3338/8750 [1:33:20<8:45:51, 5.83s/it] 38%|███▊ | 3338/8750 [1:33:23<8:45:51, 5.83s/it] {'loss': 0.4793, 'learning_rate': 1.4191765204166643e-05, 'epoch': 0.38} + 38%|███▊ | 3338/8750 [1:33:23<8:45:51, 5.83s/it] {'loss': 0.4793, 'learning_rate': 1.4191765204166643e-05, 'epoch': 0.38} + 38%|███▊ | 3338/8750 [1:33:20<8:45:51, 5.83s/it] 38%|███▊ | 3339/8750 [1:33:28<8:43:38, 5.81s/it] 38%|███▊ | 3339/8750 [1:33:25<8:43:39, 5.81s/it]{'loss': 0.4492, 'learning_rate': 1.4188404168964219e-05, 'epoch': 0.38} + {'loss': 0.4492, 'learning_rate': 1.4188404168964219e-05, 'epoch': 0.38} + 38%|███▊ | 3339/8750 [1:33:28<8:43:38, 5.81s/it] 38%|███▊ | 3339/8750 [1:33:25<8:43:39, 5.81s/it] 38%|███▊ | 3340/8750 [1:33:31<8:40:08, 5.77s/it] 38%|███▊ | 3340/8750 [1:33:34<8:40:08, 5.77s/it] {'loss': 0.4563, 'learning_rate': 1.418504255985691e-05, 'epoch': 0.38} + 38%|███▊ | 3340/8750 [1:33:34<8:40:08, 5.77s/it] {'loss': 0.4563, 'learning_rate': 1.418504255985691e-05, 'epoch': 0.38} + 38%|███▊ | 3340/8750 [1:33:31<8:40:08, 5.77s/it] 38%|███▊ | 3341/8750 [1:33:37<8:37:30, 5.74s/it] 38%|███▊ | 3341/8750 
[1:33:40<8:37:30, 5.74s/it] {'loss': 0.4673, 'learning_rate': 1.4181680377305336e-05, 'epoch': 0.38} + 38%|███▊ | 3341/8750 [1:33:40<8:37:30, 5.74s/it] {'loss': 0.4673, 'learning_rate': 1.4181680377305336e-05, 'epoch': 0.38} + 38%|███▊ | 3341/8750 [1:33:37<8:37:30, 5.74s/it] 38%|███▊ | 3342/8750 [1:33:43<8:38:59, 5.76s/it] 38%|███▊ | 3342/8750 [1:33:45<8:38:59, 5.76s/it] {'loss': 0.4813, 'learning_rate': 1.4178317621770187e-05, 'epoch': 0.38} + 38%|███▊ | 3342/8750 [1:33:45<8:38:59, 5.76s/it] {'loss': 0.4813, 'learning_rate': 1.4178317621770187e-05, 'epoch': 0.38} + 38%|███▊ | 3342/8750 [1:33:43<8:38:59, 5.76s/it] 38%|███▊ | 3343/8750 [1:33:48<8:37:33, 5.74s/it] 38%|███▊ | 3343/8750 [1:33:51<8:37:33, 5.74s/it] {'loss': 0.4651, 'learning_rate': 1.4174954293712242e-05, 'epoch': 0.38} + 38%|███▊ | 3343/8750 [1:33:51<8:37:33, 5.74s/it] {'loss': 0.4651, 'learning_rate': 1.4174954293712242e-05, 'epoch': 0.38} + 38%|███▊ | 3343/8750 [1:33:48<8:37:33, 5.74s/it] 38%|███▊ | 3344/8750 [1:33:54<8:38:05, 5.75s/it] 38%|███▊ | 3344/8750 [1:33:57<8:38:05, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.4171590393592346e-05, 'epoch': 0.38} + 38%|███▊ | 3344/8750 [1:33:57<8:38:05, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.4171590393592346e-05, 'epoch': 0.38} + 38%|███▊ | 3344/8750 [1:33:54<8:38:05, 5.75s/it] 38%|███▊ | 3345/8750 [1:34:00<8:38:12, 5.75s/it] 38%|███▊ | 3345/8750 [1:34:03<8:38:12, 5.75s/it] {'loss': 0.4549, 'learning_rate': 1.4168225921871433e-05, 'epoch': 0.38} + 38%|███▊ | 3345/8750 [1:34:03<8:38:12, 5.75s/it] {'loss': 0.4549, 'learning_rate': 1.4168225921871433e-05, 'epoch': 0.38} + 38%|███▊ | 3345/8750 [1:34:00<8:38:12, 5.75s/it] 38%|███▊ | 3346/8750 [1:34:05<8:34:23, 5.71s/it] 38%|███▊ | 3346/8750 [1:34:08<8:34:23, 5.71s/it] {'loss': 0.4827, 'learning_rate': 1.4164860879010502e-05, 'epoch': 0.38} + 38%|███▊ | 3346/8750 [1:34:08<8:34:23, 5.71s/it] {'loss': 0.4827, 'learning_rate': 1.4164860879010502e-05, 'epoch': 0.38} + 38%|███▊ | 3346/8750 [1:34:05<8:34:23, 
5.71s/it] 38%|███▊ | 3347/8750 [1:34:11<8:35:57, 5.73s/it] 38%|███▊ | 3347/8750 [1:34:14<8:35:57, 5.73s/it] {'loss': 0.4757, 'learning_rate': 1.4161495265470649e-05, 'epoch': 0.38} + 38%|███▊ | 3347/8750 [1:34:11<8:35:57, 5.73s/it]{'loss': 0.4757, 'learning_rate': 1.4161495265470649e-05, 'epoch': 0.38} + 38%|███▊ | 3347/8750 [1:34:14<8:35:57, 5.73s/it] 38%|███▊ | 3348/8750 [1:34:20<8:37:31, 5.75s/it] 38%|███▊ | 3348/8750 [1:34:17<8:37:32, 5.75s/it] {'loss': 0.4788, 'learning_rate': 1.4158129081713035e-05, 'epoch': 0.38} + 38%|███▊ | 3348/8750 [1:34:20<8:37:31, 5.75s/it] {'loss': 0.4788, 'learning_rate': 1.4158129081713035e-05, 'epoch': 0.38} + 38%|███▊ | 3348/8750 [1:34:17<8:37:32, 5.75s/it] 38%|███▊ | 3349/8750 [1:34:23<8:35:31, 5.73s/it] 38%|███▊ | 3349/8750 [1:34:26<8:35:31, 5.73s/it] {'loss': 0.4634, 'learning_rate': 1.41547623281989e-05, 'epoch': 0.38} + 38%|███▊ | 3349/8750 [1:34:26<8:35:31, 5.73s/it] {'loss': 0.4634, 'learning_rate': 1.41547623281989e-05, 'epoch': 0.38} + 38%|███▊ | 3349/8750 [1:34:23<8:35:31, 5.73s/it]10 AutoResumeHook: Checking whether to suspend... +0149 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 38%|███▊ | 3350/8750 [1:34:28<8:36:44, 5.74s/it] AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 
+ 38%|███▊ | 3350/8750 [1:34:31<8:36:44, 5.74s/it] {'loss': 0.4736, 'learning_rate': 1.415139500538957e-05, 'epoch': 0.38} + 38%|███▊ | 3350/8750 [1:34:31<8:36:44, 5.74s/it] {'loss': 0.4736, 'learning_rate': 1.415139500538957e-05, 'epoch': 0.38} + 38%|███▊ | 3350/8750 [1:34:28<8:36:44, 5.74s/it] 38%|███▊ | 3351/8750 [1:34:34<8:34:56, 5.72s/it] 38%|███▊ | 3351/8750 [1:34:37<8:34:56, 5.72s/it] {'loss': 0.4737, 'learning_rate': 1.4148027113746435e-05, 'epoch': 0.38} + 38%|███▊ | 3351/8750 [1:34:37<8:34:56, 5.72s/it] {'loss': 0.4737, 'learning_rate': 1.4148027113746435e-05, 'epoch': 0.38} + 38%|███▊ | 3351/8750 [1:34:34<8:34:56, 5.72s/it] 38%|███▊ | 3352/8750 [1:34:40<8:36:35, 5.74s/it] 38%|███▊ | 3352/8750 [1:34:43<8:36:35, 5.74s/it] {'loss': 0.4693, 'learning_rate': 1.4144658653730976e-05, 'epoch': 0.38} + 38%|███▊ | 3352/8750 [1:34:43<8:36:35, 5.74s/it] {'loss': 0.4693, 'learning_rate': 1.4144658653730976e-05, 'epoch': 0.38} + 38%|███▊ | 3352/8750 [1:34:40<8:36:35, 5.74s/it] 38%|███▊ | 3353/8750 [1:34:46<8:36:49, 5.75s/it] 38%|███▊ | 3353/8750 [1:34:49<8:36:49, 5.75s/it] {'loss': 0.4686, 'learning_rate': 1.4141289625804748e-05, 'epoch': 0.38} + 38%|███▊ | 3353/8750 [1:34:49<8:36:49, 5.75s/it] {'loss': 0.4686, 'learning_rate': 1.4141289625804748e-05, 'epoch': 0.38} + 38%|███▊ | 3353/8750 [1:34:46<8:36:49, 5.75s/it] 38%|███▊ | 3354/8750 [1:34:51<8:37:22, 5.75s/it] 38%|███▊ | 3354/8750 [1:34:54<8:37:22, 5.75s/it] {'loss': 0.4651, 'learning_rate': 1.4137920030429386e-05, 'epoch': 0.38} + 38%|███▊ | 3354/8750 [1:34:54<8:37:22, 5.75s/it] {'loss': 0.4651, 'learning_rate': 1.4137920030429386e-05, 'epoch': 0.38} + 38%|███▊ | 3354/8750 [1:34:51<8:37:22, 5.75s/it] 38%|███▊ | 3355/8750 [1:34:57<8:36:07, 5.74s/it] 38%|███▊ | 3355/8750 [1:35:00<8:36:07, 5.74s/it] {'loss': 0.4646, 'learning_rate': 1.4134549868066594e-05, 'epoch': 0.38} + 38%|███▊ | 3355/8750 [1:35:00<8:36:07, 5.74s/it] {'loss': 0.4646, 'learning_rate': 1.4134549868066594e-05, 'epoch': 0.38} + 38%|███▊ | 3355/8750 
[1:34:57<8:36:07, 5.74s/it] 38%|███▊ | 3356/8750 [1:35:03<8:34:13, 5.72s/it] 38%|███▊ | 3356/8750 [1:35:06<8:34:13, 5.72s/it] {'loss': 0.4861, 'learning_rate': 1.4131179139178157e-05, 'epoch': 0.38} + 38%|███▊ | 3356/8750 [1:35:06<8:34:13, 5.72s/it] {'loss': 0.4861, 'learning_rate': 1.4131179139178157e-05, 'epoch': 0.38} + 38%|███▊ | 3356/8750 [1:35:03<8:34:13, 5.72s/it] 38%|███▊ | 3357/8750 [1:35:08<8:33:37, 5.71s/it] 38%|███▊ | 3357/8750 [1:35:11<8:33:37, 5.71s/it] {'loss': 0.4695, 'learning_rate': 1.4127807844225947e-05, 'epoch': 0.38} + 38%|███▊ | 3357/8750 [1:35:11<8:33:37, 5.71s/it] {'loss': 0.4695, 'learning_rate': 1.4127807844225947e-05, 'epoch': 0.38} + 38%|███▊ | 3357/8750 [1:35:08<8:33:37, 5.71s/it] 38%|███▊ | 3358/8750 [1:35:14<8:32:03, 5.70s/it] 38%|███▊ | 3358/8750 [1:35:17<8:32:03, 5.70s/it] {'loss': 0.4706, 'learning_rate': 1.4124435983671907e-05, 'epoch': 0.38} + 38%|███▊ | 3358/8750 [1:35:17<8:32:03, 5.70s/it] {'loss': 0.4706, 'learning_rate': 1.4124435983671907e-05, 'epoch': 0.38} + 38%|███▊ | 3358/8750 [1:35:14<8:32:03, 5.70s/it] 38%|███▊ | 3359/8750 [1:35:20<8:43:04, 5.82s/it] 38%|███▊ | 3359/8750 [1:35:23<8:43:04, 5.82s/it] {'loss': 0.4766, 'learning_rate': 1.4121063557978051e-05, 'epoch': 0.38} + 38%|███▊ | 3359/8750 [1:35:23<8:43:04, 5.82s/it] {'loss': 0.4766, 'learning_rate': 1.4121063557978051e-05, 'epoch': 0.38} + 38%|███▊ | 3359/8750 [1:35:20<8:43:04, 5.82s/it] 38%|███▊ | 3360/8750 [1:35:26<8:45:51, 5.85s/it] 38%|███▊ | 3360/8750 [1:35:29<8:45:51, 5.85s/it] {'loss': 0.4701, 'learning_rate': 1.4117690567606483e-05, 'epoch': 0.38} + 38%|███▊ | 3360/8750 [1:35:29<8:45:51, 5.85s/it] {'loss': 0.4701, 'learning_rate': 1.4117690567606483e-05, 'epoch': 0.38} + 38%|███▊ | 3360/8750 [1:35:26<8:45:51, 5.85s/it] 38%|███▊ | 3361/8750 [1:35:32<8:42:22, 5.82s/it] 38%|███▊ | 3361/8750 [1:35:35<8:42:22, 5.82s/it] {'loss': 0.4642, 'learning_rate': 1.411431701301937e-05, 'epoch': 0.38} + 38%|███▊ | 3361/8750 [1:35:35<8:42:22, 5.82s/it] {'loss': 0.4642, 
'learning_rate': 1.411431701301937e-05, 'epoch': 0.38} + 38%|███▊ | 3361/8750 [1:35:32<8:42:22, 5.82s/it] 38%|███▊ | 3362/8750 [1:35:38<8:44:01, 5.84s/it] 38%|███▊ | 3362/8750 [1:35:41<8:44:01, 5.84s/it] {'loss': 0.4676, 'learning_rate': 1.4110942894678971e-05, 'epoch': 0.38} + 38%|███▊ | 3362/8750 [1:35:41<8:44:01, 5.84s/it] {'loss': 0.4676, 'learning_rate': 1.4110942894678971e-05, 'epoch': 0.38} + 38%|███▊ | 3362/8750 [1:35:38<8:44:01, 5.84s/it] 38%|███▊ | 3363/8750 [1:35:44<8:45:07, 5.85s/it] 38%|███▊ | 3363/8750 [1:35:47<8:45:07, 5.85s/it] {'loss': 0.4786, 'learning_rate': 1.410756821304762e-05, 'epoch': 0.38} + 38%|███▊ | 3363/8750 [1:35:47<8:45:07, 5.85s/it] {'loss': 0.4786, 'learning_rate': 1.410756821304762e-05, 'epoch': 0.38} + 38%|███▊ | 3363/8750 [1:35:44<8:45:07, 5.85s/it] 38%|███▊ | 3364/8750 [1:35:49<8:38:36, 5.78s/it] 38%|███▊ | 3364/8750 [1:35:52<8:38:36, 5.78s/it] {'loss': 0.47, 'learning_rate': 1.410419296858771e-05, 'epoch': 0.38} + 38%|███▊ | 3364/8750 [1:35:52<8:38:36, 5.78s/it] {'loss': 0.47, 'learning_rate': 1.410419296858771e-05, 'epoch': 0.38} + 38%|███▊ | 3364/8750 [1:35:49<8:38:36, 5.78s/it] 38%|███▊ | 3365/8750 [1:35:55<8:41:06, 5.81s/it] 38%|███▊ | 3365/8750 [1:35:58<8:41:07, 5.81s/it] {'loss': 0.4691, 'learning_rate': 1.4100817161761738e-05, 'epoch': 0.38} + 38%|███▊ | 3365/8750 [1:35:58<8:41:07, 5.81s/it] {'loss': 0.4691, 'learning_rate': 1.4100817161761738e-05, 'epoch': 0.38} + 38%|███▊ | 3365/8750 [1:35:55<8:41:06, 5.81s/it] 38%|███▊ | 3366/8750 [1:36:01<8:38:46, 5.78s/it] 38%|███▊ | 3366/8750 [1:36:04<8:38:46, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.4097440793032253e-05, 'epoch': 0.38} + 38%|███▊ | 3366/8750 [1:36:04<8:38:46, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.4097440793032253e-05, 'epoch': 0.38} + 38%|███▊ | 3366/8750 [1:36:01<8:38:46, 5.78s/it] 38%|███▊ | 3367/8750 [1:36:07<8:35:50, 5.75s/it] 38%|███▊ | 3367/8750 [1:36:09<8:35:49, 5.75s/it] {'loss': 0.4553, 'learning_rate': 1.4094063862861904e-05, 'epoch': 0.38} 
+ 38%|███▊ | 3367/8750 [1:36:09<8:35:49, 5.75s/it] {'loss': 0.4553, 'learning_rate': 1.4094063862861904e-05, 'epoch': 0.38} + 38%|███▊ | 3367/8750 [1:36:07<8:35:50, 5.75s/it] 38%|███▊ | 3368/8750 [1:36:12<8:36:10, 5.75s/it] 38%|███▊ | 3368/8750 [1:36:15<8:36:10, 5.75s/it] {'loss': 0.4536, 'learning_rate': 1.4090686371713403e-05, 'epoch': 0.38} + 38%|███▊ | 3368/8750 [1:36:15<8:36:10, 5.75s/it] {'loss': 0.4536, 'learning_rate': 1.4090686371713403e-05, 'epoch': 0.38} + 38%|███▊ | 3368/8750 [1:36:12<8:36:10, 5.75s/it] 39%|███▊ | 3369/8750 [1:36:18<8:40:38, 5.81s/it] 39%|███▊ | 3369/8750 [1:36:21<8:40:38, 5.81s/it] {'loss': 0.452, 'learning_rate': 1.4087308320049536e-05, 'epoch': 0.39} + 39%|███▊ | 3369/8750 [1:36:21<8:40:38, 5.81s/it] {'loss': 0.452, 'learning_rate': 1.4087308320049536e-05, 'epoch': 0.39} + 39%|███▊ | 3369/8750 [1:36:18<8:40:38, 5.81s/it] 39%|███▊ | 3370/8750 [1:36:24<8:44:06, 5.85s/it] 39%|███▊ | 3370/8750 [1:36:27<8:44:06, 5.85s/it] {'loss': 0.4892, 'learning_rate': 1.4083929708333173e-05, 'epoch': 0.39} + 39%|███▊ | 3370/8750 [1:36:27<8:44:06, 5.85s/it] {'loss': 0.4892, 'learning_rate': 1.4083929708333173e-05, 'epoch': 0.39} + 39%|███▊ | 3370/8750 [1:36:24<8:44:06, 5.85s/it] 39%|███▊ | 3371/8750 [1:36:30<8:46:58, 5.88s/it] 39%|███▊ | 3371/8750 [1:36:33<8:46:58, 5.88s/it] {'loss': 0.4583, 'learning_rate': 1.4080550537027264e-05, 'epoch': 0.39} + 39%|███▊ | 3371/8750 [1:36:33<8:46:58, 5.88s/it] {'loss': 0.4583, 'learning_rate': 1.4080550537027264e-05, 'epoch': 0.39} + 39%|███▊ | 3371/8750 [1:36:30<8:46:58, 5.88s/it] 39%|███▊ | 3372/8750 [1:36:36<8:41:09, 5.81s/it] 39%|███▊ | 3372/8750 [1:36:39<8:41:09, 5.81s/it] {'loss': 0.4813, 'learning_rate': 1.4077170806594831e-05, 'epoch': 0.39} + 39%|███▊ | 3372/8750 [1:36:39<8:41:09, 5.81s/it] {'loss': 0.4813, 'learning_rate': 1.4077170806594831e-05, 'epoch': 0.39} + 39%|███▊ | 3372/8750 [1:36:36<8:41:09, 5.81s/it] 39%|███▊ | 3373/8750 [1:36:41<8:34:04, 5.74s/it] 39%|███▊ | 3373/8750 [1:36:44<8:34:03, 
5.74s/it] {'loss': 0.4754, 'learning_rate': 1.4073790517498967e-05, 'epoch': 0.39} + 39%|███▊ | 3373/8750 [1:36:44<8:34:03, 5.74s/it] {'loss': 0.4754, 'learning_rate': 1.4073790517498967e-05, 'epoch': 0.39} + 39%|███▊ | 3373/8750 [1:36:41<8:34:04, 5.74s/it] 39%|███▊ | 3374/8750 [1:36:47<8:35:55, 5.76s/it] 39%|███▊ | 3374/8750 [1:36:50<8:35:55, 5.76s/it]{'loss': 0.4719, 'learning_rate': 1.4070409670202849e-05, 'epoch': 0.39} + {'loss': 0.4719, 'learning_rate': 1.4070409670202849e-05, 'epoch': 0.39} + 39%|███▊ | 3374/8750 [1:36:50<8:35:55, 5.76s/it] 39%|███▊ | 3374/8750 [1:36:47<8:35:55, 5.76s/it] 39%|███▊ | 3375/8750 [1:36:53<8:40:33, 5.81s/it] 39%|███▊ | 3375/8750 [1:36:56<8:40:33, 5.81s/it] {'loss': 0.4677, 'learning_rate': 1.4067028265169728e-05, 'epoch': 0.39} + 39%|███▊ | 3375/8750 [1:36:56<8:40:33, 5.81s/it] {'loss': 0.4677, 'learning_rate': 1.4067028265169728e-05, 'epoch': 0.39} + 39%|███▊ | 3375/8750 [1:36:53<8:40:33, 5.81s/it] 39%|███▊ | 3376/8750 [1:36:59<8:38:18, 5.79s/it] 39%|███▊ | 3376/8750 [1:37:02<8:38:18, 5.79s/it] {'loss': 0.4727, 'learning_rate': 1.4063646302862938e-05, 'epoch': 0.39} + 39%|███▊ | 3376/8750 [1:37:02<8:38:18, 5.79s/it] {'loss': 0.4727, 'learning_rate': 1.4063646302862938e-05, 'epoch': 0.39} + 39%|███▊ | 3376/8750 [1:36:59<8:38:18, 5.79s/it] 39%|███▊ | 3377/8750 [1:37:05<8:43:36, 5.85s/it] 39%|███▊ | 3377/8750 [1:37:08<8:43:36, 5.85s/it] {'loss': 0.4658, 'learning_rate': 1.406026378374588e-05, 'epoch': 0.39} + 39%|███▊ | 3377/8750 [1:37:08<8:43:36, 5.85s/it] {'loss': 0.4658, 'learning_rate': 1.406026378374588e-05, 'epoch': 0.39} + 39%|███▊ | 3377/8750 [1:37:05<8:43:36, 5.85s/it] 39%|███▊ | 3378/8750 [1:37:11<8:41:18, 5.82s/it] 39%|███▊ | 3378/8750 [1:37:13<8:41:18, 5.82s/it] {'loss': 0.4719, 'learning_rate': 1.405688070828203e-05, 'epoch': 0.39} + 39%|███▊ | 3378/8750 [1:37:13<8:41:18, 5.82s/it] {'loss': 0.4719, 'learning_rate': 1.405688070828203e-05, 'epoch': 0.39} + 39%|███▊ | 3378/8750 [1:37:11<8:41:18, 5.82s/it] 39%|███▊ | 
3379/8750 [1:37:16<8:40:37, 5.82s/it] 39%|███▊ | 3379/8750 [1:37:19<8:40:38, 5.82s/it] {'loss': 0.4711, 'learning_rate': 1.4053497076934948e-05, 'epoch': 0.39} + 39%|███▊ | 3379/8750 [1:37:19<8:40:38, 5.82s/it] {'loss': 0.4711, 'learning_rate': 1.4053497076934948e-05, 'epoch': 0.39} + 39%|███▊ | 3379/8750 [1:37:16<8:40:37, 5.82s/it] 39%|███▊ | 3380/8750 [1:37:22<8:33:34, 5.74s/it] 39%|███▊ | 3380/8750 [1:37:25<8:33:33, 5.74s/it] {'loss': 0.4865, 'learning_rate': 1.405011289016827e-05, 'epoch': 0.39} + 39%|███▊ | 3380/8750 [1:37:25<8:33:33, 5.74s/it] {'loss': 0.4865, 'learning_rate': 1.405011289016827e-05, 'epoch': 0.39} + 39%|███▊ | 3380/8750 [1:37:22<8:33:34, 5.74s/it] 39%|███▊ | 3381/8750 [1:37:28<8:32:49, 5.73s/it] 39%|███▊ | 3381/8750 [1:37:31<8:32:49, 5.73s/it] {'loss': 0.4585, 'learning_rate': 1.4046728148445701e-05, 'epoch': 0.39} + 39%|███▊ | 3381/8750 [1:37:31<8:32:49, 5.73s/it] {'loss': 0.4585, 'learning_rate': 1.4046728148445701e-05, 'epoch': 0.39} + 39%|███▊ | 3381/8750 [1:37:28<8:32:49, 5.73s/it] 39%|███▊ | 3382/8750 [1:37:33<8:33:29, 5.74s/it] 39%|███▊ | 3382/8750 [1:37:36<8:33:29, 5.74s/it] {'loss': 0.4705, 'learning_rate': 1.4043342852231027e-05, 'epoch': 0.39} + 39%|███▊ | 3382/8750 [1:37:36<8:33:29, 5.74s/it] {'loss': 0.4705, 'learning_rate': 1.4043342852231027e-05, 'epoch': 0.39} + 39%|███▊ | 3382/8750 [1:37:33<8:33:29, 5.74s/it] 39%|███▊ | 3383/8750 [1:37:39<8:34:10, 5.75s/it] 39%|███▊ | 3383/8750 [1:37:42<8:34:10, 5.75s/it] {'loss': 0.4612, 'learning_rate': 1.4039957001988112e-05, 'epoch': 0.39} + 39%|███▊ | 3383/8750 [1:37:42<8:34:10, 5.75s/it] {'loss': 0.4612, 'learning_rate': 1.4039957001988112e-05, 'epoch': 0.39} + 39%|███▊ | 3383/8750 [1:37:39<8:34:10, 5.75s/it] 39%|███▊ | 3384/8750 [1:37:45<8:42:13, 5.84s/it] 39%|███▊ | 3384/8750 [1:37:48<8:42:13, 5.84s/it] {'loss': 0.4656, 'learning_rate': 1.4036570598180888e-05, 'epoch': 0.39} + 39%|███▊ | 3384/8750 [1:37:48<8:42:13, 5.84s/it] {'loss': 0.4656, 'learning_rate': 1.4036570598180888e-05, 
'epoch': 0.39} + 39%|███▊ | 3384/8750 [1:37:45<8:42:13, 5.84s/it] 39%|███▊ | 3385/8750 [1:37:51<8:40:54, 5.83s/it] 39%|███▊ | 3385/8750 [1:37:54<8:40:54, 5.83s/it] {'loss': 0.4502, 'learning_rate': 1.4033183641273374e-05, 'epoch': 0.39} + 39%|███▊ | 3385/8750 [1:37:54<8:40:54, 5.83s/it] {'loss': 0.4502, 'learning_rate': 1.4033183641273374e-05, 'epoch': 0.39} + 39%|███▊ | 3385/8750 [1:37:51<8:40:54, 5.83s/it] 39%|███▊ | 3386/8750 [1:37:57<8:39:00, 5.81s/it] 39%|███▊ | 3386/8750 [1:38:00<8:39:00, 5.81s/it] {'loss': 0.4727, 'learning_rate': 1.4029796131729652e-05, 'epoch': 0.39} + 39%|███▊ | 3386/8750 [1:38:00<8:39:00, 5.81s/it] {'loss': 0.4727, 'learning_rate': 1.4029796131729652e-05, 'epoch': 0.39} + 39%|███▊ | 3386/8750 [1:37:57<8:39:00, 5.81s/it] 39%|███▊ | 3387/8750 [1:38:02<8:35:57, 5.77s/it] 39%|███▊ | 3387/8750 [1:38:05<8:35:57, 5.77s/it] {'loss': 0.4752, 'learning_rate': 1.4026408070013892e-05, 'epoch': 0.39} + 39%|███▊ | 3387/8750 [1:38:05<8:35:57, 5.77s/it] {'loss': 0.4752, 'learning_rate': 1.4026408070013892e-05, 'epoch': 0.39} + 39%|███▊ | 3387/8750 [1:38:02<8:35:57, 5.77s/it] 39%|███▊ | 3388/8750 [1:38:08<8:34:47, 5.76s/it] 39%|███▊ | 3388/8750 [1:38:11<8:34:48, 5.76s/it] {'loss': 0.4736, 'learning_rate': 1.4023019456590335e-05, 'epoch': 0.39} + 39%|███▊ | 3388/8750 [1:38:11<8:34:48, 5.76s/it] {'loss': 0.4736, 'learning_rate': 1.4023019456590335e-05, 'epoch': 0.39} + 39%|███▊ | 3388/8750 [1:38:08<8:34:47, 5.76s/it] 39%|███▊ | 3389/8750 [1:38:14<8:38:38, 5.80s/it] 39%|███▊ | 3389/8750 [1:38:17<8:38:38, 5.80s/it] {'loss': 0.4678, 'learning_rate': 1.4019630291923289e-05, 'epoch': 0.39} + 39%|███▊ | 3389/8750 [1:38:17<8:38:38, 5.80s/it] {'loss': 0.4678, 'learning_rate': 1.4019630291923289e-05, 'epoch': 0.39} + 39%|███▊ | 3389/8750 [1:38:14<8:38:38, 5.80s/it] 39%|███▊ | 3390/8750 [1:38:20<8:34:04, 5.75s/it] 39%|███▊ | 3390/8750 [1:38:23<8:34:04, 5.75s/it] {'loss': 0.483, 'learning_rate': 1.4016240576477152e-05, 'epoch': 0.39} + {'loss': 0.483, 
'learning_rate': 1.4016240576477152e-05, 'epoch': 0.39} + 39%|███▊ | 3390/8750 [1:38:23<8:34:04, 5.75s/it] 39%|███▊ | 3390/8750 [1:38:20<8:34:04, 5.75s/it] 39%|███▉ | 3391/8750 [1:38:25<8:32:37, 5.74s/it] 39%|███▉ | 3391/8750 [1:38:28<8:32:37, 5.74s/it] {'loss': 0.4638, 'learning_rate': 1.401285031071639e-05, 'epoch': 0.39} + 39%|███▉ | 3391/8750 [1:38:28<8:32:37, 5.74s/it] {'loss': 0.4638, 'learning_rate': 1.401285031071639e-05, 'epoch': 0.39} + 39%|███▉ | 3391/8750 [1:38:25<8:32:37, 5.74s/it] 39%|███▉ | 3392/8750 [1:38:31<8:33:33, 5.75s/it] 39%|███▉ | 3392/8750 [1:38:34<8:33:32, 5.75s/it] {'loss': 0.4708, 'learning_rate': 1.4009459495105542e-05, 'epoch': 0.39} + 39%|███▉ | 3392/8750 [1:38:34<8:33:32, 5.75s/it] {'loss': 0.4708, 'learning_rate': 1.4009459495105542e-05, 'epoch': 0.39} + 39%|███▉ | 3392/8750 [1:38:31<8:33:33, 5.75s/it] 39%|███▉ | 3393/8750 [1:38:37<8:32:04, 5.74s/it] 39%|███▉ | 3393/8750 [1:38:40<8:32:04, 5.74s/it] {'loss': 0.4631, 'learning_rate': 1.400606813010923e-05, 'epoch': 0.39} + 39%|███▉ | 3393/8750 [1:38:40<8:32:04, 5.74s/it] {'loss': 0.4631, 'learning_rate': 1.400606813010923e-05, 'epoch': 0.39} + 39%|███▉ | 3393/8750 [1:38:37<8:32:04, 5.74s/it] 39%|███▉ | 3394/8750 [1:38:43<8:28:58, 5.70s/it] 39%|███▉ | 3394/8750 [1:38:45<8:28:58, 5.70s/it] {'loss': 0.4596, 'learning_rate': 1.4002676216192141e-05, 'epoch': 0.39} + 39%|███▉ | 3394/8750 [1:38:45<8:28:58, 5.70s/it] {'loss': 0.4596, 'learning_rate': 1.4002676216192141e-05, 'epoch': 0.39} + 39%|███▉ | 3394/8750 [1:38:43<8:28:58, 5.70s/it] 39%|███▉ | 3395/8750 [1:38:48<8:30:03, 5.72s/it] 39%|███▉ | 3395/8750 [1:38:51<8:30:04, 5.72s/it] {'loss': 0.4453, 'learning_rate': 1.3999283753819047e-05, 'epoch': 0.39} + 39%|███▉ | 3395/8750 [1:38:51<8:30:04, 5.72s/it] {'loss': 0.4453, 'learning_rate': 1.3999283753819047e-05, 'epoch': 0.39} + 39%|███▉ | 3395/8750 [1:38:48<8:30:03, 5.72s/it] 39%|███▉ | 3396/8750 [1:38:54<8:30:52, 5.73s/it] 39%|███▉ | 3396/8750 [1:38:57<8:30:52, 5.73s/it] {'loss': 0.4751, 
'learning_rate': 1.3995890743454789e-05, 'epoch': 0.39} + 39%|███▉ | 3396/8750 [1:38:57<8:30:52, 5.73s/it] {'loss': 0.4751, 'learning_rate': 1.3995890743454789e-05, 'epoch': 0.39} + 39%|███▉ | 3396/8750 [1:38:54<8:30:52, 5.73s/it] 39%|███▉ | 3397/8750 [1:39:00<8:29:20, 5.71s/it] 39%|███▉ | 3397/8750 [1:39:03<8:29:20, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.3992497185564289e-05, 'epoch': 0.39} + 39%|███▉ | 3397/8750 [1:39:03<8:29:20, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.3992497185564289e-05, 'epoch': 0.39} + 39%|███▉ | 3397/8750 [1:39:00<8:29:20, 5.71s/it] 39%|███▉ | 3398/8750 [1:39:06<8:33:17, 5.75s/it] 39%|███▉ | 3398/8750 [1:39:08<8:33:17, 5.75s/it] {'loss': 0.4586, 'learning_rate': 1.3989103080612533e-05, 'epoch': 0.39} + 39%|███▉ | 3398/8750 [1:39:08<8:33:17, 5.75s/it] {'loss': 0.4586, 'learning_rate': 1.3989103080612533e-05, 'epoch': 0.39} + 39%|███▉ | 3398/8750 [1:39:06<8:33:17, 5.75s/it] 39%|███▉ | 3399/8750 [1:39:11<8:29:30, 5.71s/it] 39%|███▉ | 3399/8750 [1:39:14<8:29:31, 5.71s/it] {'loss': 0.4707, 'learning_rate': 1.3985708429064598e-05, 'epoch': 0.39} + {'loss': 0.4707, 'learning_rate': 1.3985708429064598e-05, 'epoch': 0.39} 39%|███▉ | 3399/8750 [1:39:14<8:29:31, 5.71s/it] + 39%|███▉ | 3399/8750 [1:39:11<8:29:30, 5.71s/it]0109 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 39%|███▉ | 3400/8750 [1:39:17<8:33:28, 5.76s/it]3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +71 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... 
+ + 39%|███▉ | 3400/8750 [1:39:20<8:33:27, 5.76s/it]5 AutoResumeHook: Checking whether to suspend...15 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4871, 'learning_rate': 1.3982313231385622e-05, 'epoch': 0.39} + 39%|███▉ | 3400/8750 [1:39:20<8:33:27, 5.76s/it] {'loss': 0.4871, 'learning_rate': 1.3982313231385622e-05, 'epoch': 0.39} + 39%|███▉ | 3400/8750 [1:39:17<8:33:28, 5.76s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 39%|███▉ | 3401/8750 [1:39:41<16:33:21, 11.14s/it] 39%|███▉ | 3401/8750 [1:39:44<16:33:25, 11.14s/it] {'loss': 0.4575, 'learning_rate': 1.3978917488040822e-05, 'epoch': 0.39} + 39%|███▉ | 3401/8750 [1:39:44<16:33:25, 11.14s/it] {'loss': 0.4575, 'learning_rate': 1.3978917488040822e-05, 'epoch': 0.39} + 39%|███▉ | 3401/8750 [1:39:41<16:33:21, 11.14s/it] 39%|███▉ | 3402/8750 [1:39:47<14:10:54, 9.55s/it] 39%|███▉ | 3402/8750 [1:39:49<14:10:53, 9.55s/it] {'loss': 0.446, 'learning_rate': 1.3975521199495495e-05, 'epoch': 0.39} + 39%|███▉ | 3402/8750 [1:39:49<14:10:53, 9.55s/it] {'loss': 0.446, 'learning_rate': 1.3975521199495495e-05, 'epoch': 0.39} + 39%|███▉ | 3402/8750 [1:39:47<14:10:54, 9.55s/it] 39%|███▉ | 3403/8750 [1:39:52<12:27:06, 8.38s/it] 39%|███▉ | 3403/8750 [1:39:55<12:27:06, 8.38s/it] {'loss': 0.4673, 'learning_rate': 1.3972124366215002e-05, 'epoch': 0.39} + 39%|███▉ | 3403/8750 [1:39:55<12:27:06, 8.38s/it] {'loss': 0.4673, 'learning_rate': 1.3972124366215002e-05, 'epoch': 0.39} + 39%|███▉ | 3403/8750 [1:39:52<12:27:06, 8.38s/it] 39%|███▉ | 3404/8750 [1:39:58<11:18:30, 7.62s/it] 39%|███▉ | 3404/8750 [1:40:01<11:18:29, 7.61s/it] {'loss': 0.4686, 'learning_rate': 1.3968726988664788e-05, 'epoch': 0.39} + 39%|███▉ | 3404/8750 [1:40:01<11:18:29, 7.61s/it] {'loss': 0.4686, 'learning_rate': 1.3968726988664788e-05, 'epoch': 0.39} + 39%|███▉ | 3404/8750 [1:39:58<11:18:30, 7.62s/it] 39%|███▉ | 3405/8750 [1:40:04<10:29:18, 7.06s/it] 39%|███▉ | 3405/8750 [1:40:07<10:29:17, 7.06s/it] {'loss': 0.4776, 'learning_rate': 1.3965329067310372e-05, 'epoch': 0.39} + 39%|███▉ | 3405/8750 [1:40:07<10:29:17, 7.06s/it] {'loss': 0.4776, 'learning_rate': 1.3965329067310372e-05, 'epoch': 0.39} + 39%|███▉ | 3405/8750 [1:40:04<10:29:18, 7.06s/it] 39%|███▉ | 3406/8750 [1:40:10<9:53:41, 6.67s/it] 39%|███▉ | 3406/8750 [1:40:13<9:53:41, 6.67s/it] {'loss': 0.4751, 'learning_rate': 1.3961930602617345e-05, 'epoch': 0.39} + 39%|███▉ | 3406/8750 [1:40:13<9:53:41, 6.67s/it] 
{'loss': 0.4751, 'learning_rate': 1.3961930602617345e-05, 'epoch': 0.39} + 39%|███▉ | 3406/8750 [1:40:10<9:53:41, 6.67s/it] 39%|███▉ | 3407/8750 [1:40:15<9:27:40, 6.37s/it] 39%|███▉ | 3407/8750 [1:40:18<9:27:40, 6.37s/it] {'loss': 0.4603, 'learning_rate': 1.3958531595051367e-05, 'epoch': 0.39} + 39%|███▉ | 3407/8750 [1:40:18<9:27:40, 6.37s/it] {'loss': 0.4603, 'learning_rate': 1.3958531595051367e-05, 'epoch': 0.39} + 39%|███▉ | 3407/8750 [1:40:15<9:27:40, 6.37s/it] 39%|███▉ | 3408/8750 [1:40:21<9:11:54, 6.20s/it] 39%|███▉ | 3408/8750 [1:40:24<9:11:54, 6.20s/it] {'loss': 0.4653, 'learning_rate': 1.395513204507818e-05, 'epoch': 0.39} + 39%|███▉ | 3408/8750 [1:40:24<9:11:54, 6.20s/it] {'loss': 0.4653, 'learning_rate': 1.395513204507818e-05, 'epoch': 0.39} + 39%|███▉ | 3408/8750 [1:40:21<9:11:54, 6.20s/it] 39%|███▉ | 3409/8750 [1:40:27<9:10:03, 6.18s/it] 39%|███▉ | 3409/8750 [1:40:30<9:10:03, 6.18s/it] {'loss': 0.4896, 'learning_rate': 1.3951731953163606e-05, 'epoch': 0.39} + 39%|███▉ | 3409/8750 [1:40:30<9:10:03, 6.18s/it] {'loss': 0.4896, 'learning_rate': 1.3951731953163606e-05, 'epoch': 0.39} + 39%|███▉ | 3409/8750 [1:40:27<9:10:03, 6.18s/it] 39%|███▉ | 3410/8750 [1:40:33<8:58:01, 6.05s/it] 39%|███▉ | 3410/8750 [1:40:36<8:58:01, 6.05s/it] {'loss': 0.4479, 'learning_rate': 1.3948331319773525e-05, 'epoch': 0.39} + 39%|███▉ | 3410/8750 [1:40:36<8:58:01, 6.05s/it] {'loss': 0.4479, 'learning_rate': 1.3948331319773525e-05, 'epoch': 0.39} + 39%|███▉ | 3410/8750 [1:40:33<8:58:01, 6.05s/it] 39%|███▉ | 3411/8750 [1:40:39<8:50:58, 5.97s/it] 39%|███▉ | 3411/8750 [1:40:42<8:50:58, 5.97s/it] {'loss': 0.47, 'learning_rate': 1.3944930145373903e-05, 'epoch': 0.39} + 39%|███▉ | 3411/8750 [1:40:42<8:50:58, 5.97s/it] {'loss': 0.47, 'learning_rate': 1.3944930145373903e-05, 'epoch': 0.39} + 39%|███▉ | 3411/8750 [1:40:39<8:50:58, 5.97s/it] 39%|███▉ | 3412/8750 [1:40:45<8:57:40, 6.04s/it] 39%|███▉ | 3412/8750 [1:40:48<8:57:40, 6.04s/it] {'loss': 0.4531, 'learning_rate': 
1.3941528430430773e-05, 'epoch': 0.39} + 39%|███▉ | 3412/8750 [1:40:48<8:57:40, 6.04s/it] {'loss': 0.4531, 'learning_rate': 1.3941528430430773e-05, 'epoch': 0.39} + 39%|███▉ | 3412/8750 [1:40:45<8:57:40, 6.04s/it] 39%|███▉ | 3413/8750 [1:40:51<8:50:36, 5.97s/it] 39%|███▉ | 3413/8750 [1:40:54<8:50:36, 5.97s/it] {'loss': 0.4815, 'learning_rate': 1.393812617541025e-05, 'epoch': 0.39} + 39%|███▉ | 3413/8750 [1:40:54<8:50:36, 5.97s/it] {'loss': 0.4815, 'learning_rate': 1.393812617541025e-05, 'epoch': 0.39} + 39%|███▉ | 3413/8750 [1:40:51<8:50:36, 5.97s/it] 39%|███▉ | 3414/8750 [1:40:57<8:47:33, 5.93s/it] 39%|███▉ | 3414/8750 [1:40:59<8:47:33, 5.93s/it] {'loss': 0.4752, 'learning_rate': 1.3934723380778517e-05, 'epoch': 0.39} + 39%|███▉ | 3414/8750 [1:40:59<8:47:33, 5.93s/it] {'loss': 0.4752, 'learning_rate': 1.3934723380778517e-05, 'epoch': 0.39} + 39%|███▉ | 3414/8750 [1:40:57<8:47:33, 5.93s/it] 39%|███▉ | 3415/8750 [1:41:02<8:45:53, 5.91s/it] 39%|███▉ | 3415/8750 [1:41:05<8:45:53, 5.91s/it] {'loss': 0.4631, 'learning_rate': 1.3931320047001838e-05, 'epoch': 0.39} + 39%|███▉ | 3415/8750 [1:41:05<8:45:53, 5.91s/it] {'loss': 0.4631, 'learning_rate': 1.3931320047001838e-05, 'epoch': 0.39} + 39%|███▉ | 3415/8750 [1:41:02<8:45:53, 5.91s/it] 39%|███▉ | 3416/8750 [1:41:08<8:45:35, 5.91s/it] 39%|███▉ | 3416/8750 [1:41:11<8:45:35, 5.91s/it] {'loss': 0.4979, 'learning_rate': 1.3927916174546536e-05, 'epoch': 0.39} + 39%|███▉ | 3416/8750 [1:41:11<8:45:35, 5.91s/it] {'loss': 0.4979, 'learning_rate': 1.3927916174546536e-05, 'epoch': 0.39} + 39%|███▉ | 3416/8750 [1:41:08<8:45:35, 5.91s/it] 39%|███▉ | 3417/8750 [1:41:14<8:39:05, 5.84s/it] 39%|███▉ | 3417/8750 [1:41:17<8:39:05, 5.84s/it] {'loss': 0.4603, 'learning_rate': 1.3924511763879025e-05, 'epoch': 0.39} + 39%|███▉ | 3417/8750 [1:41:17<8:39:05, 5.84s/it] {'loss': 0.4603, 'learning_rate': 1.3924511763879025e-05, 'epoch': 0.39} + 39%|███▉ | 3417/8750 [1:41:14<8:39:05, 5.84s/it] 39%|███▉ | 3418/8750 [1:41:20<8:39:11, 5.84s/it] 39%|███▉ 
| 3418/8750 [1:41:23<8:39:11, 5.84s/it] {'loss': 0.4488, 'learning_rate': 1.3921106815465782e-05, 'epoch': 0.39} + 39%|███▉ | 3418/8750 [1:41:23<8:39:11, 5.84s/it] {'loss': 0.4488, 'learning_rate': 1.3921106815465782e-05, 'epoch': 0.39} + 39%|███▉ | 3418/8750 [1:41:20<8:39:11, 5.84s/it] 39%|███▉ | 3419/8750 [1:41:26<8:34:23, 5.79s/it] 39%|███▉ | 3419/8750 [1:41:28<8:34:22, 5.79s/it] {'loss': 0.4639, 'learning_rate': 1.3917701329773364e-05, 'epoch': 0.39} + 39%|███▉ | 3419/8750 [1:41:28<8:34:22, 5.79s/it] {'loss': 0.4639, 'learning_rate': 1.3917701329773364e-05, 'epoch': 0.39} + 39%|███▉ | 3419/8750 [1:41:26<8:34:23, 5.79s/it] 39%|███▉ | 3420/8750 [1:41:31<8:36:05, 5.81s/it] 39%|███▉ | 3420/8750 [1:41:34<8:36:05, 5.81s/it] {'loss': 0.49, 'learning_rate': 1.3914295307268396e-05, 'epoch': 0.39} + 39%|███▉ | 3420/8750 [1:41:34<8:36:05, 5.81s/it] {'loss': 0.49, 'learning_rate': 1.3914295307268396e-05, 'epoch': 0.39} + 39%|███▉ | 3420/8750 [1:41:31<8:36:05, 5.81s/it] 39%|███▉ | 3421/8750 [1:41:37<8:34:25, 5.79s/it] 39%|███▉ | 3421/8750 [1:41:40<8:34:25, 5.79s/it] {'loss': 0.4681, 'learning_rate': 1.3910888748417577e-05, 'epoch': 0.39} + 39%|███▉ | 3421/8750 [1:41:40<8:34:25, 5.79s/it] {'loss': 0.4681, 'learning_rate': 1.3910888748417577e-05, 'epoch': 0.39} + 39%|███▉ | 3421/8750 [1:41:37<8:34:25, 5.79s/it] 39%|███▉ | 3422/8750 [1:41:43<8:36:46, 5.82s/it] 39%|███▉ | 3422/8750 [1:41:46<8:36:46, 5.82s/it] {'loss': 0.4684, 'learning_rate': 1.3907481653687687e-05, 'epoch': 0.39} + 39%|███▉ | 3422/8750 [1:41:46<8:36:46, 5.82s/it] {'loss': 0.4684, 'learning_rate': 1.3907481653687687e-05, 'epoch': 0.39} + 39%|███▉ | 3422/8750 [1:41:43<8:36:46, 5.82s/it] 39%|███▉ | 3423/8750 [1:41:49<8:30:52, 5.75s/it] 39%|███▉ | 3423/8750 [1:41:52<8:30:52, 5.75s/it] {'loss': 0.4867, 'learning_rate': 1.3904074023545566e-05, 'epoch': 0.39} + 39%|███▉ | 3423/8750 [1:41:52<8:30:52, 5.75s/it] {'loss': 0.4867, 'learning_rate': 1.3904074023545566e-05, 'epoch': 0.39} + 39%|███▉ | 3423/8750 
[1:41:49<8:30:52, 5.75s/it] 39%|███▉ | 3424/8750 [1:41:54<8:32:04, 5.77s/it] 39%|███▉ | 3424/8750 [1:41:57<8:32:04, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.390066585845815e-05, 'epoch': 0.39} + 39%|███▉ | 3424/8750 [1:41:57<8:32:04, 5.77s/it] {'loss': 0.4704, 'learning_rate': 1.390066585845815e-05, 'epoch': 0.39} + 39%|███▉ | 3424/8750 [1:41:54<8:32:04, 5.77s/it] 39%|███▉ | 3425/8750 [1:42:00<8:38:29, 5.84s/it] 39%|███▉ | 3425/8750 [1:42:03<8:38:29, 5.84s/it] {'loss': 0.4487, 'learning_rate': 1.389725715889242e-05, 'epoch': 0.39} + 39%|███▉ | 3425/8750 [1:42:03<8:38:29, 5.84s/it] {'loss': 0.4487, 'learning_rate': 1.389725715889242e-05, 'epoch': 0.39} + 39%|███▉ | 3425/8750 [1:42:00<8:38:29, 5.84s/it] 39%|███▉ | 3426/8750 [1:42:06<8:33:47, 5.79s/it] 39%|███▉ | 3426/8750 [1:42:09<8:33:47, 5.79s/it] {'loss': 0.4687, 'learning_rate': 1.3893847925315447e-05, 'epoch': 0.39} + 39%|███▉ | 3426/8750 [1:42:09<8:33:47, 5.79s/it] {'loss': 0.4687, 'learning_rate': 1.3893847925315447e-05, 'epoch': 0.39} + 39%|███▉ | 3426/8750 [1:42:06<8:33:47, 5.79s/it] 39%|███▉ | 3427/8750 [1:42:12<8:32:05, 5.77s/it] 39%|███▉ | 3427/8750 [1:42:15<8:32:05, 5.77s/it] {'loss': 0.4856, 'learning_rate': 1.3890438158194374e-05, 'epoch': 0.39} + {'loss': 0.4856, 'learning_rate': 1.3890438158194374e-05, 'epoch': 0.39} 39%|███▉ | 3427/8750 [1:42:15<8:32:05, 5.77s/it] + 39%|███▉ | 3427/8750 [1:42:12<8:32:05, 5.77s/it] 39%|███▉ | 3428/8750 [1:42:18<8:30:58, 5.76s/it] 39%|███▉ | 3428/8750 [1:42:21<8:30:58, 5.76s/it] {'loss': 0.4726, 'learning_rate': 1.3887027857996416e-05, 'epoch': 0.39} + 39%|███▉ | 3428/8750 [1:42:21<8:30:58, 5.76s/it] {'loss': 0.4726, 'learning_rate': 1.3887027857996416e-05, 'epoch': 0.39} + 39%|███▉ | 3428/8750 [1:42:18<8:30:58, 5.76s/it] 39%|███▉ | 3429/8750 [1:42:23<8:30:23, 5.76s/it] 39%|███▉ | 3429/8750 [1:42:26<8:30:23, 5.76s/it] {'loss': 0.4639, 'learning_rate': 1.3883617025188858e-05, 'epoch': 0.39} + 39%|███▉ | 3429/8750 [1:42:26<8:30:23, 5.76s/it] {'loss': 0.4639, 
'learning_rate': 1.3883617025188858e-05, 'epoch': 0.39} + 39%|███▉ | 3429/8750 [1:42:23<8:30:23, 5.76s/it] 39%|███▉ | 3430/8750 [1:42:29<8:30:14, 5.75s/it] 39%|███▉ | 3430/8750 [1:42:32<8:30:15, 5.75s/it] {'loss': 0.4716, 'learning_rate': 1.3880205660239062e-05, 'epoch': 0.39} + 39%|███▉ | 3430/8750 [1:42:32<8:30:15, 5.75s/it] {'loss': 0.4716, 'learning_rate': 1.3880205660239062e-05, 'epoch': 0.39} + 39%|███▉ | 3430/8750 [1:42:29<8:30:14, 5.75s/it] 39%|███▉ | 3431/8750 [1:42:35<8:27:13, 5.72s/it] 39%|███▉ | 3431/8750 [1:42:38<8:27:12, 5.72s/it] {'loss': 0.4622, 'learning_rate': 1.387679376361446e-05, 'epoch': 0.39} + 39%|███▉ | 3431/8750 [1:42:38<8:27:12, 5.72s/it] {'loss': 0.4622, 'learning_rate': 1.387679376361446e-05, 'epoch': 0.39} + 39%|███▉ | 3431/8750 [1:42:35<8:27:13, 5.72s/it] 39%|███▉ | 3432/8750 [1:42:41<8:31:16, 5.77s/it] 39%|███▉ | 3432/8750 [1:42:44<8:31:16, 5.77s/it] {'loss': 0.4813, 'learning_rate': 1.3873381335782559e-05, 'epoch': 0.39} + {'loss': 0.4813, 'learning_rate': 1.3873381335782559e-05, 'epoch': 0.39} + 39%|███▉ | 3432/8750 [1:42:44<8:31:16, 5.77s/it] 39%|███▉ | 3432/8750 [1:42:41<8:31:16, 5.77s/it] 39%|███▉ | 3433/8750 [1:42:46<8:30:12, 5.76s/it] 39%|███▉ | 3433/8750 [1:42:49<8:30:12, 5.76s/it] {'loss': 0.4577, 'learning_rate': 1.3869968377210936e-05, 'epoch': 0.39} + 39%|███▉ | 3433/8750 [1:42:49<8:30:12, 5.76s/it] {'loss': 0.4577, 'learning_rate': 1.3869968377210936e-05, 'epoch': 0.39} + 39%|███▉ | 3433/8750 [1:42:46<8:30:12, 5.76s/it] 39%|███▉ | 3434/8750 [1:42:52<8:30:48, 5.77s/it] 39%|███▉ | 3434/8750 [1:42:55<8:30:49, 5.77s/it] {'loss': 0.4814, 'learning_rate': 1.3866554888367243e-05, 'epoch': 0.39} + 39%|███▉ | 3434/8750 [1:42:55<8:30:49, 5.77s/it] {'loss': 0.4814, 'learning_rate': 1.3866554888367243e-05, 'epoch': 0.39} + 39%|███▉ | 3434/8750 [1:42:52<8:30:48, 5.77s/it] 39%|███▉ | 3435/8750 [1:42:58<8:28:45, 5.74s/it] 39%|███▉ | 3435/8750 [1:43:01<8:28:45, 5.74s/it] {'loss': 0.4519, 'learning_rate': 1.3863140869719207e-05, 'epoch': 
0.39} + 39%|███▉ | 3435/8750 [1:43:01<8:28:45, 5.74s/it] {'loss': 0.4519, 'learning_rate': 1.3863140869719207e-05, 'epoch': 0.39} + 39%|███▉ | 3435/8750 [1:42:58<8:28:45, 5.74s/it] 39%|███▉ | 3436/8750 [1:43:03<8:26:46, 5.72s/it] 39%|███▉ | 3436/8750 [1:43:06<8:26:46, 5.72s/it] {'loss': 0.4714, 'learning_rate': 1.3859726321734623e-05, 'epoch': 0.39} + 39%|███▉ | 3436/8750 [1:43:06<8:26:46, 5.72s/it] {'loss': 0.4714, 'learning_rate': 1.3859726321734623e-05, 'epoch': 0.39} + 39%|███▉ | 3436/8750 [1:43:03<8:26:46, 5.72s/it] 39%|███▉ | 3437/8750 [1:43:09<8:30:25, 5.76s/it] 39%|███▉ | 3437/8750 [1:43:12<8:30:25, 5.76s/it] {'loss': 0.4725, 'learning_rate': 1.385631124488136e-05, 'epoch': 0.39} + 39%|███▉ | 3437/8750 [1:43:12<8:30:25, 5.76s/it] {'loss': 0.4725, 'learning_rate': 1.385631124488136e-05, 'epoch': 0.39} + 39%|███▉ | 3437/8750 [1:43:09<8:30:25, 5.76s/it] 39%|███▉ | 3438/8750 [1:43:15<8:32:11, 5.79s/it] 39%|███▉ | 3438/8750 [1:43:18<8:32:12, 5.79s/it] {'loss': 0.476, 'learning_rate': 1.3852895639627357e-05, 'epoch': 0.39} + 39%|███▉ | 3438/8750 [1:43:18<8:32:12, 5.79s/it] {'loss': 0.476, 'learning_rate': 1.3852895639627357e-05, 'epoch': 0.39} + 39%|███▉ | 3438/8750 [1:43:15<8:32:11, 5.79s/it] 39%|███▉ | 3439/8750 [1:43:21<8:29:47, 5.76s/it] 39%|███▉ | 3439/8750 [1:43:24<8:29:47, 5.76s/it] {'loss': 0.4709, 'learning_rate': 1.3849479506440633e-05, 'epoch': 0.39} + 39%|███▉ | 3439/8750 [1:43:24<8:29:47, 5.76s/it] {'loss': 0.4709, 'learning_rate': 1.3849479506440633e-05, 'epoch': 0.39} + 39%|███▉ | 3439/8750 [1:43:21<8:29:47, 5.76s/it] 39%|███▉ | 3440/8750 [1:43:27<8:28:36, 5.75s/it] 39%|███▉ | 3440/8750 [1:43:30<8:28:36, 5.75s/it] {'loss': 0.4698, 'learning_rate': 1.3846062845789275e-05, 'epoch': 0.39} + 39%|███▉ | 3440/8750 [1:43:30<8:28:36, 5.75s/it] {'loss': 0.4698, 'learning_rate': 1.3846062845789275e-05, 'epoch': 0.39} + 39%|███▉ | 3440/8750 [1:43:27<8:28:36, 5.75s/it] 39%|███▉ | 3441/8750 [1:43:32<8:31:15, 5.78s/it] 39%|███▉ | 3441/8750 [1:43:35<8:31:15, 
5.78s/it] {'loss': 0.4537, 'learning_rate': 1.3842645658141436e-05, 'epoch': 0.39} + 39%|███▉ | 3441/8750 [1:43:35<8:31:15, 5.78s/it] {'loss': 0.4537, 'learning_rate': 1.3842645658141436e-05, 'epoch': 0.39} + 39%|███▉ | 3441/8750 [1:43:32<8:31:15, 5.78s/it] 39%|███▉ | 3442/8750 [1:43:38<8:27:49, 5.74s/it] 39%|███▉ | 3442/8750 [1:43:41<8:27:49, 5.74s/it] {'loss': 0.4755, 'learning_rate': 1.383922794396535e-05, 'epoch': 0.39} + 39%|███▉ | 3442/8750 [1:43:41<8:27:49, 5.74s/it] {'loss': 0.4755, 'learning_rate': 1.383922794396535e-05, 'epoch': 0.39} + 39%|███▉ | 3442/8750 [1:43:38<8:27:49, 5.74s/it] 39%|███▉ | 3443/8750 [1:43:44<8:33:55, 5.81s/it] 39%|███▉ | 3443/8750 [1:43:47<8:33:54, 5.81s/it] {'loss': 0.4526, 'learning_rate': 1.3835809703729322e-05, 'epoch': 0.39} + 39%|███▉ | 3443/8750 [1:43:47<8:33:54, 5.81s/it] {'loss': 0.4526, 'learning_rate': 1.3835809703729322e-05, 'epoch': 0.39} + 39%|███▉ | 3443/8750 [1:43:44<8:33:55, 5.81s/it] 39%|███▉ | 3444/8750 [1:43:50<8:36:50, 5.84s/it] 39%|███▉ | 3444/8750 [1:43:53<8:36:50, 5.84s/it] {'loss': 0.4573, 'learning_rate': 1.3832390937901723e-05, 'epoch': 0.39} + 39%|███▉ | 3444/8750 [1:43:53<8:36:50, 5.84s/it] {'loss': 0.4573, 'learning_rate': 1.3832390937901723e-05, 'epoch': 0.39} + 39%|███▉ | 3444/8750 [1:43:50<8:36:50, 5.84s/it] 39%|███▉ | 3445/8750 [1:43:59<8:33:40, 5.81s/it] 39%|███▉ | 3445/8750 [1:43:56<8:33:41, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.3828971646951005e-05, 'epoch': 0.39} + 39%|███▉ | 3445/8750 [1:43:59<8:33:40, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.3828971646951005e-05, 'epoch': 0.39} + 39%|███▉ | 3445/8750 [1:43:56<8:33:41, 5.81s/it] 39%|███▉ | 3446/8750 [1:44:02<8:37:57, 5.86s/it] 39%|███▉ | 3446/8750 [1:44:05<8:37:57, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.3825551831345685e-05, 'epoch': 0.39} + 39%|███▉ | 3446/8750 [1:44:05<8:37:57, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.3825551831345685e-05, 'epoch': 0.39} + 39%|███▉ | 3446/8750 [1:44:02<8:37:57, 5.86s/it] 39%|███▉ | 
3447/8750 [1:44:08<8:41:08, 5.90s/it] 39%|███▉ | 3447/8750 [1:44:11<8:41:08, 5.90s/it] {'loss': 0.4574, 'learning_rate': 1.3822131491554355e-05, 'epoch': 0.39} + {'loss': 0.4574, 'learning_rate': 1.3822131491554355e-05, 'epoch': 0.39} 39%|███▉ | 3447/8750 [1:44:11<8:41:08, 5.90s/it] + 39%|███▉ | 3447/8750 [1:44:08<8:41:08, 5.90s/it] 39%|███▉ | 3448/8750 [1:44:13<8:38:52, 5.87s/it] 39%|███▉ | 3448/8750 [1:44:16<8:38:52, 5.87s/it] {'loss': 0.4768, 'learning_rate': 1.3818710628045677e-05, 'epoch': 0.39} + 39%|███▉ | 3448/8750 [1:44:16<8:38:52, 5.87s/it] {'loss': 0.4768, 'learning_rate': 1.3818710628045677e-05, 'epoch': 0.39} + 39%|███▉ | 3448/8750 [1:44:14<8:38:52, 5.87s/it] 39%|███▉ | 3449/8750 [1:44:19<8:35:20, 5.83s/it] 39%|███▉ | 3449/8750 [1:44:22<8:35:20, 5.83s/it] {'loss': 0.473, 'learning_rate': 1.3815289241288383e-05, 'epoch': 0.39} + 39%|███▉ | 3449/8750 [1:44:22<8:35:20, 5.83s/it] {'loss': 0.473, 'learning_rate': 1.3815289241288383e-05, 'epoch': 0.39} + 39%|███▉ | 3449/8750 [1:44:19<8:35:20, 5.83s/it]10 AutoResumeHook: Checking whether to suspend... +09 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 39%|███▉ | 3450/8750 [1:44:25<8:32:04, 5.80s/it]8 AutoResumeHook: Checking whether to suspend... + 39%|███▉ | 3450/8750 [1:44:28<8:32:03, 5.80s/it]1412 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4808, 'learning_rate': 1.3811867331751286e-05, 'epoch': 0.39} + 39%|███▉ | 3450/8750 [1:44:28<8:32:03, 5.80s/it] {'loss': 0.4808, 'learning_rate': 1.3811867331751286e-05, 'epoch': 0.39} + 39%|███▉ | 3450/8750 [1:44:25<8:32:04, 5.80s/it] 39%|███▉ | 3451/8750 [1:44:31<8:27:58, 5.75s/it] 39%|███▉ | 3451/8750 [1:44:34<8:27:58, 5.75s/it] {'loss': 0.4514, 'learning_rate': 1.380844489990326e-05, 'epoch': 0.39} + 39%|███▉ | 3451/8750 [1:44:34<8:27:58, 5.75s/it] {'loss': 0.4514, 'learning_rate': 1.380844489990326e-05, 'epoch': 0.39} + 39%|███▉ | 3451/8750 [1:44:31<8:27:58, 5.75s/it] 39%|███▉ | 3452/8750 [1:44:36<8:27:11, 5.74s/it] 39%|███▉ | 3452/8750 [1:44:39<8:27:11, 5.74s/it] {'loss': 0.4766, 'learning_rate': 1.3805021946213251e-05, 'epoch': 0.39} + 39%|███▉ | 3452/8750 [1:44:39<8:27:11, 5.74s/it] {'loss': 0.4766, 'learning_rate': 1.3805021946213251e-05, 'epoch': 0.39} + 39%|███▉ | 3452/8750 [1:44:36<8:27:11, 5.74s/it] 39%|███▉ | 3453/8750 [1:44:42<8:27:24, 5.75s/it] 39%|███▉ | 3453/8750 [1:44:45<8:27:24, 5.75s/it] {'loss': 0.4523, 'learning_rate': 1.3801598471150286e-05, 'epoch': 0.39} + 39%|███▉ | 3453/8750 [1:44:45<8:27:24, 5.75s/it] {'loss': 0.4523, 'learning_rate': 1.3801598471150286e-05, 'epoch': 0.39} + 39%|███▉ | 3453/8750 [1:44:42<8:27:24, 5.75s/it] 39%|███▉ | 3454/8750 [1:44:48<8:31:44, 5.80s/it] 39%|███▉ | 3454/8750 [1:44:51<8:31:44, 5.80s/it] {'loss': 0.4715, 'learning_rate': 1.3798174475183457e-05, 'epoch': 0.39} + 39%|███▉ | 3454/8750 [1:44:51<8:31:44, 5.80s/it] {'loss': 0.4715, 'learning_rate': 1.3798174475183457e-05, 'epoch': 0.39} + 39%|███▉ | 3454/8750 [1:44:48<8:31:44, 5.80s/it] 39%|███▉ | 3455/8750 [1:44:54<8:30:48, 5.79s/it] 39%|███▉ | 3455/8750 [1:44:57<8:30:47, 5.79s/it] {'loss': 0.4447, 'learning_rate': 1.3794749958781924e-05, 'epoch': 0.39} + 39%|███▉ | 3455/8750 [1:44:57<8:30:47, 5.79s/it] {'loss': 0.4447, 'learning_rate': 1.3794749958781924e-05, 'epoch': 0.39} + 39%|███▉ | 3455/8750 [1:44:54<8:30:48, 5.79s/it] 39%|███▉ | 3456/8750 
[1:44:59<8:26:01, 5.74s/it] 39%|███▉ | 3456/8750 [1:45:02<8:26:01, 5.74s/it] {'loss': 0.471, 'learning_rate': 1.3791324922414924e-05, 'epoch': 0.39} + 39%|███▉ | 3456/8750 [1:45:02<8:26:01, 5.74s/it] {'loss': 0.471, 'learning_rate': 1.3791324922414924e-05, 'epoch': 0.39} + 39%|███▉ | 3456/8750 [1:44:59<8:26:01, 5.74s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors + 40%|███▉ | 3457/8750 [1:45:05<8:24:31, 5.72s/it] 40%|███▉ | 3457/8750 [1:45:08<8:24:31, 5.72s/it] {'loss': 0.4725, 'learning_rate': 1.3787899366551764e-05, 'epoch': 0.4} + 40%|███▉ | 3457/8750 [1:45:08<8:24:31, 5.72s/it] {'loss': 0.4725, 'learning_rate': 1.3787899366551764e-05, 'epoch': 0.4} + 40%|███▉ | 3457/8750 [1:45:05<8:24:31, 5.72s/it] 40%|███▉ | 3458/8750 [1:45:11<8:26:02, 5.74s/it] 40%|███▉ | 3458/8750 [1:45:14<8:26:02, 5.74s/it] {'loss': 0.4766, 'learning_rate': 1.3784473291661824e-05, 'epoch': 0.4} + 40%|███▉ | 3458/8750 [1:45:14<8:26:02, 5.74s/it] {'loss': 0.4766, 'learning_rate': 1.3784473291661824e-05, 'epoch': 0.4} + 40%|███▉ | 3458/8750 [1:45:11<8:26:02, 5.74s/it] 40%|███▉ | 3459/8750 [1:45:17<8:31:39, 5.80s/it] 40%|███▉ | 3459/8750 [1:45:20<8:31:39, 5.80s/it] {'loss': 0.4553, 'learning_rate': 1.3781046698214549e-05, 'epoch': 0.4} + 40%|███▉ | 3459/8750 [1:45:20<8:31:39, 5.80s/it] {'loss': 0.4553, 'learning_rate': 1.3781046698214549e-05, 'epoch': 0.4} + 40%|███▉ | 3459/8750 [1:45:17<8:31:39, 5.80s/it] 40%|███▉ | 3460/8750 [1:45:22<8:26:49, 5.75s/it] 40%|███▉ | 3460/8750 [1:45:25<8:26:49, 5.75s/it] {'loss': 0.4858, 'learning_rate': 1.3777619586679458e-05, 'epoch': 0.4} + 40%|███▉ | 3460/8750 [1:45:25<8:26:49, 5.75s/it] {'loss': 0.4858, 'learning_rate': 1.3777619586679458e-05, 'epoch': 0.4} + 40%|███▉ | 3460/8750 [1:45:22<8:26:49, 5.75s/it] 40%|███▉ | 3461/8750 [1:45:28<8:35:08, 5.84s/it] 40%|███▉ | 3461/8750 [1:45:31<8:35:08, 5.84s/it] {'loss': 0.4454, 
'learning_rate': 1.3774191957526144e-05, 'epoch': 0.4} + 40%|███▉ | 3461/8750 [1:45:31<8:35:08, 5.84s/it] {'loss': 0.4454, 'learning_rate': 1.3774191957526144e-05, 'epoch': 0.4} + 40%|███▉ | 3461/8750 [1:45:28<8:35:08, 5.84s/it] 40%|███▉ | 3462/8750 [1:45:34<8:32:37, 5.82s/it] 40%|███▉ | 3462/8750 [1:45:37<8:32:37, 5.82s/it] {'loss': 0.4684, 'learning_rate': 1.3770763811224273e-05, 'epoch': 0.4} + 40%|███▉ | 3462/8750 [1:45:37<8:32:37, 5.82s/it] {'loss': 0.4684, 'learning_rate': 1.3770763811224273e-05, 'epoch': 0.4} + 40%|███▉ | 3462/8750 [1:45:34<8:32:37, 5.82s/it] 40%|███▉ | 3463/8750 [1:45:40<8:32:05, 5.81s/it] 40%|███▉ | 3463/8750 [1:45:43<8:32:05, 5.81s/it] {'loss': 0.4524, 'learning_rate': 1.376733514824357e-05, 'epoch': 0.4} + 40%|███▉ | 3463/8750 [1:45:43<8:32:05, 5.81s/it] {'loss': 0.4524, 'learning_rate': 1.376733514824357e-05, 'epoch': 0.4} + 40%|███▉ | 3463/8750 [1:45:40<8:32:05, 5.81s/it] 40%|███▉ | 3464/8750 [1:45:46<8:34:16, 5.84s/it] 40%|███▉ | 3464/8750 [1:45:49<8:34:16, 5.84s/it] {'loss': 0.4756, 'learning_rate': 1.3763905969053841e-05, 'epoch': 0.4} + 40%|███▉ | 3464/8750 [1:45:49<8:34:16, 5.84s/it] {'loss': 0.4756, 'learning_rate': 1.3763905969053841e-05, 'epoch': 0.4} + 40%|███▉ | 3464/8750 [1:45:46<8:34:16, 5.84s/it] 40%|███▉ | 3465/8750 [1:45:52<8:36:10, 5.86s/it] 40%|███▉ | 3465/8750 [1:45:55<8:36:10, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.376047627412496e-05, 'epoch': 0.4} + 40%|███▉ | 3465/8750 [1:45:55<8:36:10, 5.86s/it] {'loss': 0.4817, 'learning_rate': 1.376047627412496e-05, 'epoch': 0.4} + 40%|███▉ | 3465/8750 [1:45:52<8:36:10, 5.86s/it] 40%|███▉ | 3466/8750 [1:45:58<8:35:25, 5.85s/it] 40%|███▉ | 3466/8750 [1:46:01<8:35:25, 5.85s/it] {'loss': 0.4717, 'learning_rate': 1.3757046063926876e-05, 'epoch': 0.4} + 40%|███▉ | 3466/8750 [1:46:01<8:35:25, 5.85s/it] {'loss': 0.4717, 'learning_rate': 1.3757046063926876e-05, 'epoch': 0.4} + 40%|███▉ | 3466/8750 [1:45:58<8:35:25, 5.85s/it] 40%|███▉ | 3467/8750 [1:46:04<8:36:09, 5.86s/it] 
40%|███▉ | 3467/8750 [1:46:06<8:36:09, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.3753615338929598e-05, 'epoch': 0.4} + 40%|███▉ | 3467/8750 [1:46:06<8:36:09, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.3753615338929598e-05, 'epoch': 0.4} + 40%|███▉ | 3467/8750 [1:46:04<8:36:09, 5.86s/it] 40%|███▉ | 3468/8750 [1:46:09<8:31:37, 5.81s/it] 40%|███▉ | 3468/8750 [1:46:12<8:31:37, 5.81s/it] {'loss': 0.4765, 'learning_rate': 1.3750184099603216e-05, 'epoch': 0.4} + 40%|███▉ | 3468/8750 [1:46:12<8:31:37, 5.81s/it] {'loss': 0.4765, 'learning_rate': 1.3750184099603216e-05, 'epoch': 0.4} + 40%|███▉ | 3468/8750 [1:46:09<8:31:37, 5.81s/it] 40%|███▉ | 3469/8750 [1:46:18<8:28:59, 5.78s/it] 40%|███▉ | 3469/8750 [1:46:15<8:29:00, 5.78s/it] {'loss': 0.4669, 'learning_rate': 1.3746752346417884e-05, 'epoch': 0.4} + 40%|███▉ | 3469/8750 [1:46:18<8:28:59, 5.78s/it] {'loss': 0.4669, 'learning_rate': 1.3746752346417884e-05, 'epoch': 0.4} + 40%|███▉ | 3469/8750 [1:46:15<8:29:00, 5.78s/it] 40%|███▉ | 3470/8750 [1:46:21<8:30:44, 5.80s/it] 40%|███▉ | 3470/8750 [1:46:24<8:30:44, 5.80s/it] {'loss': 0.4586, 'learning_rate': 1.3743320079843828e-05, 'epoch': 0.4} + 40%|███▉ | 3470/8750 [1:46:24<8:30:44, 5.80s/it] {'loss': 0.4586, 'learning_rate': 1.3743320079843828e-05, 'epoch': 0.4} + 40%|███▉ | 3470/8750 [1:46:21<8:30:44, 5.80s/it] 40%|███▉ | 3471/8750 [1:46:29<8:28:49, 5.78s/it] 40%|███▉ | 3471/8750 [1:46:27<8:28:50, 5.78s/it] {'loss': 0.4725, 'learning_rate': 1.3739887300351349e-05, 'epoch': 0.4} + 40%|███▉ | 3471/8750 [1:46:29<8:28:49, 5.78s/it] {'loss': 0.4725, 'learning_rate': 1.3739887300351349e-05, 'epoch': 0.4} + 40%|███▉ | 3471/8750 [1:46:27<8:28:50, 5.78s/it] 40%|███▉ | 3472/8750 [1:46:32<8:25:02, 5.74s/it] 40%|███▉ | 3472/8750 [1:46:35<8:25:02, 5.74s/it] {'loss': 0.4761, 'learning_rate': 1.3736454008410816e-05, 'epoch': 0.4} + 40%|███▉ | 3472/8750 [1:46:35<8:25:02, 5.74s/it] {'loss': 0.4761, 'learning_rate': 1.3736454008410816e-05, 'epoch': 0.4} + 40%|███▉ | 3472/8750 
[1:46:32<8:25:02, 5.74s/it] 40%|███▉ | 3473/8750 [1:46:41<8:28:31, 5.78s/it] 40%|███▉ | 3473/8750 [1:46:38<8:28:32, 5.78s/it] {'loss': 0.4666, 'learning_rate': 1.373302020449266e-05, 'epoch': 0.4} + 40%|███▉ | 3473/8750 [1:46:41<8:28:31, 5.78s/it] {'loss': 0.4666, 'learning_rate': 1.373302020449266e-05, 'epoch': 0.4} + 40%|███▉ | 3473/8750 [1:46:38<8:28:32, 5.78s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 40%|███▉ | 3474/8750 [1:46:44<8:28:10, 5.78s/it] 40%|███▉ | 3474/8750 [1:46:47<8:28:10, 5.78s/it] {'loss': 0.4902, 'learning_rate': 1.3729585889067391e-05, 'epoch': 0.4} + 40%|███▉ | 3474/8750 [1:46:47<8:28:10, 5.78s/it] {'loss': 0.4902, 'learning_rate': 1.3729585889067391e-05, 'epoch': 0.4} + 40%|███▉ | 3474/8750 [1:46:44<8:28:10, 5.78s/it] 40%|███▉ | 3475/8750 [1:46:50<8:26:50, 5.76s/it] 40%|███▉ | 3475/8750 [1:46:53<8:26:50, 5.77s/it] {'loss': 0.461, 'learning_rate': 1.3726151062605588e-05, 'epoch': 0.4} + 40%|███▉ | 3475/8750 [1:46:53<8:26:50, 5.77s/it] {'loss': 0.461, 'learning_rate': 1.3726151062605588e-05, 'epoch': 0.4} + 40%|███▉ | 3475/8750 [1:46:50<8:26:50, 5.76s/it] 40%|███▉ | 3476/8750 [1:46:58<8:28:00, 5.78s/it] 40%|███▉ | 3476/8750 [1:46:55<8:28:01, 5.78s/it] {'loss': 0.4729, 'learning_rate': 1.3722715725577902e-05, 'epoch': 0.4} + 40%|███▉ | 3476/8750 [1:46:58<8:28:00, 5.78s/it] {'loss': 0.4729, 'learning_rate': 1.3722715725577902e-05, 'epoch': 0.4} + 40%|███▉ | 3476/8750 [1:46:55<8:28:01, 5.78s/it] 40%|███▉ | 3477/8750 [1:47:01<8:26:53, 5.77s/it] 40%|███▉ | 3477/8750 [1:47:04<8:26:54, 5.77s/it] {'loss': 0.4725, 'learning_rate': 1.3719279878455046e-05, 'epoch': 0.4} + 40%|███▉ | 3477/8750 [1:47:04<8:26:54, 5.77s/it] {'loss': 0.4725, 'learning_rate': 1.3719279878455046e-05, 'epoch': 0.4} + 40%|███▉ | 3477/8750 [1:47:01<8:26:53, 5.77s/it] 40%|███▉ | 3478/8750 [1:47:07<8:25:43, 5.76s/it] 40%|███▉ | 3478/8750 [1:47:10<8:25:43, 
5.76s/it] {'loss': 0.4572, 'learning_rate': 1.3715843521707805e-05, 'epoch': 0.4} + 40%|███▉ | 3478/8750 [1:47:10<8:25:43, 5.76s/it] {'loss': 0.4572, 'learning_rate': 1.3715843521707805e-05, 'epoch': 0.4} + 40%|███▉ | 3478/8750 [1:47:07<8:25:43, 5.76s/it] 40%|███▉ | 3479/8750 [1:47:13<8:25:25, 5.75s/it] 40%|███▉ | 3479/8750 [1:47:16<8:25:26, 5.75s/it] {'loss': 0.4482, 'learning_rate': 1.3712406655807047e-05, 'epoch': 0.4} + 40%|███▉ | 3479/8750 [1:47:16<8:25:26, 5.75s/it] {'loss': 0.4482, 'learning_rate': 1.3712406655807047e-05, 'epoch': 0.4} + 40%|███▉ | 3479/8750 [1:47:13<8:25:25, 5.75s/it] 40%|███▉ | 3480/8750 [1:47:18<8:21:15, 5.71s/it] 40%|███▉ | 3480/8750 [1:47:21<8:21:15, 5.71s/it] {'loss': 0.4834, 'learning_rate': 1.3708969281223687e-05, 'epoch': 0.4} + 40%|███▉ | 3480/8750 [1:47:21<8:21:15, 5.71s/it] {'loss': 0.4834, 'learning_rate': 1.3708969281223687e-05, 'epoch': 0.4} + 40%|███▉ | 3480/8750 [1:47:18<8:21:15, 5.71s/it] 40%|███▉ | 3481/8750 [1:47:27<8:22:55, 5.73s/it] 40%|███▉ | 3481/8750 [1:47:24<8:22:55, 5.73s/it] {'loss': 0.4743, 'learning_rate': 1.3705531398428736e-05, 'epoch': 0.4} + 40%|███▉ | 3481/8750 [1:47:27<8:22:55, 5.73s/it] {'loss': 0.4743, 'learning_rate': 1.3705531398428736e-05, 'epoch': 0.4} + 40%|███▉ | 3481/8750 [1:47:24<8:22:55, 5.73s/it] 40%|███▉ | 3482/8750 [1:47:30<8:20:45, 5.70s/it] 40%|███▉ | 3482/8750 [1:47:33<8:20:45, 5.70s/it] {'loss': 0.4896, 'learning_rate': 1.3702093007893249e-05, 'epoch': 0.4} + 40%|███▉ | 3482/8750 [1:47:33<8:20:45, 5.70s/it] {'loss': 0.4896, 'learning_rate': 1.3702093007893249e-05, 'epoch': 0.4} + 40%|███▉ | 3482/8750 [1:47:30<8:20:45, 5.70s/it] 40%|███▉ | 3483/8750 [1:47:35<8:18:17, 5.68s/it] 40%|███▉ | 3483/8750 [1:47:38<8:18:17, 5.68s/it] {'loss': 0.4706, 'learning_rate': 1.3698654110088365e-05, 'epoch': 0.4} + 40%|███▉ | 3483/8750 [1:47:38<8:18:17, 5.68s/it] {'loss': 0.4706, 'learning_rate': 1.3698654110088365e-05, 'epoch': 0.4} + 40%|███▉ | 3483/8750 [1:47:35<8:18:17, 5.68s/it] 40%|███▉ | 3484/8750 
[1:47:41<8:26:35, 5.77s/it] 40%|███▉ | 3484/8750 [1:47:44<8:26:35, 5.77s/it] {'loss': 0.4695, 'learning_rate': 1.3695214705485294e-05, 'epoch': 0.4} + 40%|███▉ | 3484/8750 [1:47:44<8:26:35, 5.77s/it] {'loss': 0.4695, 'learning_rate': 1.3695214705485294e-05, 'epoch': 0.4} + 40%|███▉ | 3484/8750 [1:47:41<8:26:35, 5.77s/it] 40%|███▉ | 3485/8750 [1:47:47<8:26:42, 5.77s/it] 40%|███▉ | 3485/8750 [1:47:50<8:26:42, 5.77s/it] {'loss': 0.4616, 'learning_rate': 1.3691774794555306e-05, 'epoch': 0.4} + 40%|███▉ | 3485/8750 [1:47:50<8:26:42, 5.77s/it] {'loss': 0.4616, 'learning_rate': 1.3691774794555306e-05, 'epoch': 0.4} + 40%|███▉ | 3485/8750 [1:47:47<8:26:42, 5.77s/it] 40%|███▉ | 3486/8750 [1:47:53<8:24:10, 5.75s/it] 40%|███▉ | 3486/8750 [1:47:56<8:24:10, 5.75s/it] {'loss': 0.4823, 'learning_rate': 1.368833437776975e-05, 'epoch': 0.4} + {'loss': 0.4823, 'learning_rate': 1.368833437776975e-05, 'epoch': 0.4} 40%|███▉ | 3486/8750 [1:47:56<8:24:10, 5.75s/it] + 40%|███▉ | 3486/8750 [1:47:53<8:24:10, 5.75s/it] 40%|███▉ | 3487/8750 [1:47:59<8:39:18, 5.92s/it] 40%|███▉ | 3487/8750 [1:48:02<8:39:18, 5.92s/it] {'loss': 0.4539, 'learning_rate': 1.3684893455600036e-05, 'epoch': 0.4} + 40%|███▉ | 3487/8750 [1:48:02<8:39:18, 5.92s/it] {'loss': 0.4539, 'learning_rate': 1.3684893455600036e-05, 'epoch': 0.4} + 40%|███▉ | 3487/8750 [1:47:59<8:39:18, 5.92s/it] 40%|███▉ | 3488/8750 [1:48:05<8:36:21, 5.89s/it] 40%|███▉ | 3488/8750 [1:48:08<8:36:21, 5.89s/it] {'loss': 0.4737, 'learning_rate': 1.368145202851765e-05, 'epoch': 0.4} + 40%|███▉ | 3488/8750 [1:48:08<8:36:21, 5.89s/it] {'loss': 0.4737, 'learning_rate': 1.368145202851765e-05, 'epoch': 0.4} + 40%|███▉ | 3488/8750 [1:48:05<8:36:21, 5.89s/it] 40%|███▉ | 3489/8750 [1:48:11<8:33:13, 5.85s/it] 40%|███▉ | 3489/8750 [1:48:14<8:33:13, 5.85s/it] {'loss': 0.4613, 'learning_rate': 1.3678010096994143e-05, 'epoch': 0.4} + {'loss': 0.4613, 'learning_rate': 1.3678010096994143e-05, 'epoch': 0.4} 40%|███▉ | 3489/8750 [1:48:14<8:33:13, 5.85s/it] + 40%|███▉ 
| 3489/8750 [1:48:11<8:33:13, 5.85s/it] 40%|███▉ | 3490/8750 [1:48:16<8:26:59, 5.78s/it] 40%|███▉ | 3490/8750 [1:48:19<8:26:59, 5.78s/it] {'loss': 0.485, 'learning_rate': 1.3674567661501138e-05, 'epoch': 0.4} + 40%|███▉ | 3490/8750 [1:48:19<8:26:59, 5.78s/it] {'loss': 0.485, 'learning_rate': 1.3674567661501138e-05, 'epoch': 0.4} + 40%|███▉ | 3490/8750 [1:48:16<8:26:59, 5.78s/it] 40%|███▉ | 3491/8750 [1:48:22<8:30:02, 5.82s/it] 40%|███▉ | 3491/8750 [1:48:25<8:30:02, 5.82s/it] {'loss': 0.4766, 'learning_rate': 1.3671124722510325e-05, 'epoch': 0.4} + 40%|███▉ | 3491/8750 [1:48:25<8:30:02, 5.82s/it] {'loss': 0.4766, 'learning_rate': 1.3671124722510325e-05, 'epoch': 0.4} + 40%|███▉ | 3491/8750 [1:48:22<8:30:02, 5.82s/it] 40%|███▉ | 3492/8750 [1:48:28<8:30:40, 5.83s/it] 40%|███▉ | 3492/8750 [1:48:31<8:30:40, 5.83s/it] {'loss': 0.4763, 'learning_rate': 1.366768128049346e-05, 'epoch': 0.4} + 40%|███▉ | 3492/8750 [1:48:31<8:30:40, 5.83s/it] {'loss': 0.4763, 'learning_rate': 1.366768128049346e-05, 'epoch': 0.4} + 40%|███▉ | 3492/8750 [1:48:28<8:30:40, 5.83s/it] 40%|███▉ | 3493/8750 [1:48:34<8:39:06, 5.92s/it] 40%|███▉ | 3493/8750 [1:48:37<8:39:06, 5.92s/it] {'loss': 0.4588, 'learning_rate': 1.3664237335922377e-05, 'epoch': 0.4} + 40%|███▉ | 3493/8750 [1:48:37<8:39:06, 5.92s/it] {'loss': 0.4588, 'learning_rate': 1.3664237335922377e-05, 'epoch': 0.4} + 40%|███▉ | 3493/8750 [1:48:34<8:39:06, 5.92s/it] 40%|███▉ | 3494/8750 [1:48:40<8:35:12, 5.88s/it] 40%|███▉ | 3494/8750 [1:48:43<8:35:12, 5.88s/it] {'loss': 0.4883, 'learning_rate': 1.3660792889268967e-05, 'epoch': 0.4} + 40%|███▉ | 3494/8750 [1:48:43<8:35:12, 5.88s/it] {'loss': 0.4883, 'learning_rate': 1.3660792889268967e-05, 'epoch': 0.4} + 40%|███▉ | 3494/8750 [1:48:40<8:35:12, 5.88s/it] 40%|███▉ | 3495/8750 [1:48:46<8:36:38, 5.90s/it] 40%|███▉ | 3495/8750 [1:48:49<8:36:38, 5.90s/it] {'loss': 0.4671, 'learning_rate': 1.3657347941005204e-05, 'epoch': 0.4} + 40%|███▉ | 3495/8750 [1:48:49<8:36:38, 5.90s/it] {'loss': 0.4671, 
'learning_rate': 1.3657347941005204e-05, 'epoch': 0.4} + 40%|███▉ | 3495/8750 [1:48:46<8:36:38, 5.90s/it] 40%|███▉ | 3496/8750 [1:48:52<8:32:35, 5.85s/it] 40%|███▉ | 3496/8750 [1:48:55<8:32:35, 5.85s/it] {'loss': 0.464, 'learning_rate': 1.3653902491603117e-05, 'epoch': 0.4} + 40%|███▉ | 3496/8750 [1:48:55<8:32:35, 5.85s/it] {'loss': 0.464, 'learning_rate': 1.3653902491603117e-05, 'epoch': 0.4} + 40%|███▉ | 3496/8750 [1:48:52<8:32:35, 5.85s/it] 40%|███▉ | 3497/8750 [1:48:57<8:29:38, 5.82s/it] 40%|███▉ | 3497/8750 [1:49:00<8:29:38, 5.82s/it] {'loss': 0.4619, 'learning_rate': 1.3650456541534811e-05, 'epoch': 0.4} + 40%|███▉ | 3497/8750 [1:49:00<8:29:38, 5.82s/it] {'loss': 0.4619, 'learning_rate': 1.3650456541534811e-05, 'epoch': 0.4} + 40%|███▉ | 3497/8750 [1:48:57<8:29:38, 5.82s/it] 40%|███▉ | 3498/8750 [1:49:03<8:29:03, 5.82s/it] 40%|███▉ | 3498/8750 [1:49:06<8:29:03, 5.82s/it] {'loss': 0.4806, 'learning_rate': 1.3647010091272456e-05, 'epoch': 0.4} + 40%|███▉ | 3498/8750 [1:49:06<8:29:03, 5.82s/it] {'loss': 0.4806, 'learning_rate': 1.3647010091272456e-05, 'epoch': 0.4} + 40%|███▉ | 3498/8750 [1:49:03<8:29:03, 5.82s/it] 40%|███▉ | 3499/8750 [1:49:09<8:33:55, 5.87s/it] 40%|███▉ | 3499/8750 [1:49:12<8:33:55, 5.87s/it] {'loss': 0.4832, 'learning_rate': 1.3643563141288297e-05, 'epoch': 0.4} + 40%|███▉ | 3499/8750 [1:49:12<8:33:55, 5.87s/it] {'loss': 0.4832, 'learning_rate': 1.3643563141288297e-05, 'epoch': 0.4} + 40%|███▉ | 3499/8750 [1:49:09<8:33:55, 5.87s/it]010 AutoResumeHook: Checking whether to suspend... +93 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 6 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 40%|████ | 3500/8750 [1:49:15<8:28:49, 5.82s/it]8 11AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 
+ 40%|████ | 3500/8750 [1:49:18<8:28:49, 5.82s/it]13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.476, 'learning_rate': 1.364011569205464e-05, 'epoch': 0.4} + 40%|████ | 3500/8750 [1:49:18<8:28:49, 5.82s/it] {'loss': 0.476, 'learning_rate': 1.364011569205464e-05, 'epoch': 0.4} + 40%|████ | 3500/8750 [1:49:15<8:28:49, 5.82s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 40%|████ | 3501/8750 [1:49:40<16:44:57, 11.49s/it] 40%|████ | 3501/8750 [1:49:42<16:44:57, 11.49s/it] {'loss': 0.4768, 'learning_rate': 1.3636667744043864e-05, 'epoch': 0.4} + 40%|████ | 3501/8750 [1:49:42<16:44:57, 11.49s/it] {'loss': 0.4768, 'learning_rate': 1.3636667744043864e-05, 'epoch': 0.4} + 40%|████ | 3501/8750 [1:49:40<16:44:57, 11.49s/it] 40%|████ | 3502/8750 [1:49:45<14:14:13, 9.77s/it] 40%|████ | 3502/8750 [1:49:48<14:14:13, 9.77s/it] {'loss': 0.4722, 'learning_rate': 1.3633219297728415e-05, 'epoch': 0.4} + 40%|████ | 3502/8750 [1:49:48<14:14:13, 9.77s/it] {'loss': 0.4722, 'learning_rate': 1.3633219297728415e-05, 'epoch': 0.4} + 40%|████ | 3502/8750 [1:49:45<14:14:13, 9.77s/it] 40%|████ | 3503/8750 [1:49:51<12:25:47, 8.53s/it] 40%|████ | 3503/8750 [1:49:54<12:25:47, 8.53s/it] {'loss': 0.4721, 'learning_rate': 1.3629770353580804e-05, 'epoch': 0.4} + 40%|████ | 3503/8750 [1:49:54<12:25:47, 8.53s/it] {'loss': 0.4721, 'learning_rate': 1.3629770353580804e-05, 'epoch': 0.4} + 40%|████ | 3503/8750 [1:49:51<12:25:47, 8.53s/it] 40%|████ | 3504/8750 [1:50:00<11:11:16, 7.68s/it] 40%|████ | 3504/8750 [1:49:57<11:11:18, 7.68s/it] {'loss': 0.4715, 'learning_rate': 1.3626320912073616e-05, 'epoch': 0.4} + 40%|████ | 3504/8750 [1:50:00<11:11:16, 7.68s/it] {'loss': 0.4715, 'learning_rate': 1.3626320912073616e-05, 'epoch': 0.4} + 40%|████ | 3504/8750 [1:49:57<11:11:18, 7.68s/it] 40%|████ | 3505/8750 [1:50:03<10:24:08, 7.14s/it] 40%|████ | 3505/8750 [1:50:05<10:24:08, 7.14s/it] {'loss': 0.4711, 'learning_rate': 1.3622870973679503e-05, 'epoch': 0.4} + 40%|████ | 3505/8750 [1:50:05<10:24:08, 7.14s/it] {'loss': 0.4711, 'learning_rate': 1.3622870973679503e-05, 'epoch': 0.4} + 40%|████ | 3505/8750 [1:50:03<10:24:08, 7.14s/it] 40%|████ | 3506/8750 [1:50:08<9:44:52, 6.69s/it] 40%|████ | 3506/8750 [1:50:11<9:44:53, 6.69s/it] {'loss': 0.4722, 'learning_rate': 1.361942053887118e-05, 'epoch': 0.4} + {'loss': 0.4722, 'learning_rate': 1.361942053887118e-05, 'epoch': 
0.4} + 40%|████ | 3506/8750 [1:50:11<9:44:53, 6.69s/it] 40%|████ | 3506/8750 [1:50:08<9:44:52, 6.69s/it] 40%|████ | 3507/8750 [1:50:14<9:20:46, 6.42s/it] 40%|████ | 3507/8750 [1:50:17<9:20:46, 6.42s/it] {'loss': 0.4726, 'learning_rate': 1.3615969608121438e-05, 'epoch': 0.4} + 40%|████ | 3507/8750 [1:50:17<9:20:46, 6.42s/it] {'loss': 0.4726, 'learning_rate': 1.3615969608121438e-05, 'epoch': 0.4} + 40%|████ | 3507/8750 [1:50:14<9:20:46, 6.42s/it] 40%|████ | 3508/8750 [1:50:20<9:06:12, 6.25s/it] 40%|████ | 3508/8750 [1:50:23<9:06:12, 6.25s/it] {'loss': 0.465, 'learning_rate': 1.3612518181903127e-05, 'epoch': 0.4} + 40%|████ | 3508/8750 [1:50:23<9:06:12, 6.25s/it] {'loss': 0.465, 'learning_rate': 1.3612518181903127e-05, 'epoch': 0.4} + 40%|████ | 3508/8750 [1:50:20<9:06:12, 6.25s/it] 40%|████ | 3509/8750 [1:50:28<8:48:40, 6.05s/it] 40%|████ | 3509/8750 [1:50:25<8:48:40, 6.05s/it] {'loss': 0.4693, 'learning_rate': 1.360906626068917e-05, 'epoch': 0.4} + 40%|████ | 3509/8750 [1:50:28<8:48:40, 6.05s/it] {'loss': 0.4693, 'learning_rate': 1.360906626068917e-05, 'epoch': 0.4} + 40%|████ | 3509/8750 [1:50:25<8:48:40, 6.05s/it] 40%|████ | 3510/8750 [1:50:31<8:45:21, 6.02s/it] 40%|████ | 3510/8750 [1:50:34<8:45:22, 6.02s/it] {'loss': 0.4564, 'learning_rate': 1.3605613844952561e-05, 'epoch': 0.4} + 40%|████ | 3510/8750 [1:50:34<8:45:22, 6.02s/it] {'loss': 0.4564, 'learning_rate': 1.3605613844952561e-05, 'epoch': 0.4} + 40%|████ | 3510/8750 [1:50:31<8:45:21, 6.02s/it] 40%|████ | 3511/8750 [1:50:37<8:40:42, 5.96s/it] 40%|████ | 3511/8750 [1:50:40<8:40:41, 5.96s/it] {'loss': 0.4639, 'learning_rate': 1.3602160935166357e-05, 'epoch': 0.4} + 40%|████ | 3511/8750 [1:50:40<8:40:41, 5.96s/it] {'loss': 0.4639, 'learning_rate': 1.3602160935166357e-05, 'epoch': 0.4} + 40%|████ | 3511/8750 [1:50:37<8:40:42, 5.96s/it] 40%|████ | 3512/8750 [1:50:43<8:40:07, 5.96s/it] 40%|████ | 3512/8750 [1:50:46<8:40:07, 5.96s/it] {'loss': 0.4704, 'learning_rate': 1.359870753180368e-05, 'epoch': 0.4} + 
40%|████ | 3512/8750 [1:50:46<8:40:07, 5.96s/it] {'loss': 0.4704, 'learning_rate': 1.359870753180368e-05, 'epoch': 0.4} + 40%|████ | 3512/8750 [1:50:43<8:40:07, 5.96s/it] 40%|████ | 3513/8750 [1:50:49<8:33:25, 5.88s/it] 40%|████ | 3513/8750 [1:50:52<8:33:25, 5.88s/it] {'loss': 0.4511, 'learning_rate': 1.3595253635337724e-05, 'epoch': 0.4} + 40%|████ | 3513/8750 [1:50:52<8:33:25, 5.88s/it] {'loss': 0.4511, 'learning_rate': 1.3595253635337724e-05, 'epoch': 0.4} + 40%|████ | 3513/8750 [1:50:49<8:33:25, 5.88s/it] 40%|████ | 3514/8750 [1:50:55<8:34:52, 5.90s/it] 40%|████ | 3514/8750 [1:50:58<8:34:52, 5.90s/it] {'loss': 0.4738, 'learning_rate': 1.3591799246241753e-05, 'epoch': 0.4} + 40%|████ | 3514/8750 [1:50:58<8:34:52, 5.90s/it] {'loss': 0.4738, 'learning_rate': 1.3591799246241753e-05, 'epoch': 0.4} + 40%|████ | 3514/8750 [1:50:55<8:34:52, 5.90s/it] 40%|████ | 3515/8750 [1:51:01<8:34:43, 5.90s/it] 40%|████ | 3515/8750 [1:51:04<8:34:43, 5.90s/it] {'loss': 0.478, 'learning_rate': 1.3588344364989096e-05, 'epoch': 0.4} + 40%|████ | 3515/8750 [1:51:04<8:34:43, 5.90s/it] {'loss': 0.478, 'learning_rate': 1.3588344364989096e-05, 'epoch': 0.4} + 40%|████ | 3515/8750 [1:51:01<8:34:43, 5.90s/it] 40%|████ | 3516/8750 [1:51:06<8:31:06, 5.86s/it] 40%|████ | 3516/8750 [1:51:09<8:31:06, 5.86s/it] {'loss': 0.4549, 'learning_rate': 1.3584888992053146e-05, 'epoch': 0.4} + 40%|████ | 3516/8750 [1:51:09<8:31:06, 5.86s/it] {'loss': 0.4549, 'learning_rate': 1.3584888992053146e-05, 'epoch': 0.4} + 40%|████ | 3516/8750 [1:51:06<8:31:06, 5.86s/it] 40%|████ | 3517/8750 [1:51:15<8:23:23, 5.77s/it] 40%|████ | 3517/8750 [1:51:12<8:23:23, 5.77s/it] {'loss': 0.4767, 'learning_rate': 1.3581433127907366e-05, 'epoch': 0.4} + 40%|████ | 3517/8750 [1:51:15<8:23:23, 5.77s/it] {'loss': 0.4767, 'learning_rate': 1.3581433127907366e-05, 'epoch': 0.4} + 40%|████ | 3517/8750 [1:51:12<8:23:23, 5.77s/it] 40%|████ | 3518/8750 [1:51:18<8:19:22, 5.73s/it] 40%|████ | 3518/8750 [1:51:21<8:19:22, 5.73s/it] {'loss': 
0.4492, 'learning_rate': 1.357797677302529e-05, 'epoch': 0.4} + 40%|████ | 3518/8750 [1:51:21<8:19:22, 5.73s/it] {'loss': 0.4492, 'learning_rate': 1.357797677302529e-05, 'epoch': 0.4} + 40%|████ | 3518/8750 [1:51:18<8:19:22, 5.73s/it] 40%|████ | 3519/8750 [1:51:26<8:21:24, 5.75s/it] 40%|████ | 3519/8750 [1:51:23<8:21:24, 5.75s/it] {'loss': 0.4672, 'learning_rate': 1.3574519927880511e-05, 'epoch': 0.4} + 40%|████ | 3519/8750 [1:51:26<8:21:24, 5.75s/it] {'loss': 0.4672, 'learning_rate': 1.3574519927880511e-05, 'epoch': 0.4} + 40%|████ | 3519/8750 [1:51:23<8:21:24, 5.75s/it] 40%|████ | 3520/8750 [1:51:29<8:21:22, 5.75s/it] 40%|████ | 3520/8750 [1:51:32<8:21:22, 5.75s/it] {'loss': 0.473, 'learning_rate': 1.3571062592946703e-05, 'epoch': 0.4} + 40%|████ | 3520/8750 [1:51:32<8:21:22, 5.75s/it] {'loss': 0.473, 'learning_rate': 1.3571062592946703e-05, 'epoch': 0.4} + 40%|████ | 3520/8750 [1:51:29<8:21:22, 5.75s/it] 40%|████ | 3521/8750 [1:51:35<8:23:37, 5.78s/it] 40%|████ | 3521/8750 [1:51:38<8:23:37, 5.78s/it] {'loss': 0.4583, 'learning_rate': 1.3567604768697585e-05, 'epoch': 0.4} + 40%|████ | 3521/8750 [1:51:38<8:23:37, 5.78s/it] {'loss': 0.4583, 'learning_rate': 1.3567604768697585e-05, 'epoch': 0.4} + 40%|████ | 3521/8750 [1:51:35<8:23:37, 5.78s/it] 40%|████ | 3522/8750 [1:51:41<8:26:57, 5.82s/it] 40%|████ | 3522/8750 [1:51:44<8:26:57, 5.82s/it] {'loss': 0.4575, 'learning_rate': 1.3564146455606961e-05, 'epoch': 0.4} + 40%|████ | 3522/8750 [1:51:44<8:26:57, 5.82s/it] {'loss': 0.4575, 'learning_rate': 1.3564146455606961e-05, 'epoch': 0.4} + 40%|████ | 3522/8750 [1:51:41<8:26:57, 5.82s/it] 40%|████ | 3523/8750 [1:51:47<8:30:05, 5.86s/it] 40%|████ | 3523/8750 [1:51:50<8:30:05, 5.86s/it] {'loss': 0.4642, 'learning_rate': 1.3560687654148703e-05, 'epoch': 0.4} + 40%|████ | 3523/8750 [1:51:50<8:30:05, 5.86s/it] {'loss': 0.4642, 'learning_rate': 1.3560687654148703e-05, 'epoch': 0.4} + 40%|████ | 3523/8750 [1:51:47<8:30:05, 5.86s/it] 40%|████ | 3524/8750 [1:51:53<8:25:55, 
5.81s/it] 40%|████ | 3524/8750 [1:51:55<8:25:55, 5.81s/it] {'loss': 0.4688, 'learning_rate': 1.3557228364796742e-05, 'epoch': 0.4} + 40%|████ | 3524/8750 [1:51:55<8:25:55, 5.81s/it] {'loss': 0.4688, 'learning_rate': 1.3557228364796742e-05, 'epoch': 0.4} + 40%|████ | 3524/8750 [1:51:53<8:25:55, 5.81s/it] 40%|████ | 3525/8750 [1:51:58<8:27:45, 5.83s/it] 40%|████ | 3525/8750 [1:52:01<8:27:45, 5.83s/it] {'loss': 0.4582, 'learning_rate': 1.3553768588025073e-05, 'epoch': 0.4} + 40%|████ | 3525/8750 [1:52:01<8:27:45, 5.83s/it] {'loss': 0.4582, 'learning_rate': 1.3553768588025073e-05, 'epoch': 0.4} + 40%|████ | 3525/8750 [1:51:58<8:27:45, 5.83s/it] 40%|████ | 3526/8750 [1:52:04<8:27:00, 5.82s/it] 40%|████ | 3526/8750 [1:52:07<8:27:00, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.3550308324307767e-05, 'epoch': 0.4} + 40%|████ | 3526/8750 [1:52:07<8:27:00, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.3550308324307767e-05, 'epoch': 0.4} + 40%|████ | 3526/8750 [1:52:04<8:27:00, 5.82s/it] 40%|████ | 3527/8750 [1:52:10<8:24:26, 5.79s/it] 40%|████ | 3527/8750 [1:52:13<8:24:27, 5.80s/it] {'loss': 0.4618, 'learning_rate': 1.3546847574118951e-05, 'epoch': 0.4} + 40%|████ | 3527/8750 [1:52:13<8:24:27, 5.80s/it] {'loss': 0.4618, 'learning_rate': 1.3546847574118951e-05, 'epoch': 0.4} + 40%|████ | 3527/8750 [1:52:10<8:24:26, 5.79s/it] 40%|████ | 3528/8750 [1:52:16<8:23:48, 5.79s/it] 40%|████ | 3528/8750 [1:52:19<8:23:48, 5.79s/it] {'loss': 0.4574, 'learning_rate': 1.3543386337932834e-05, 'epoch': 0.4} + 40%|████ | 3528/8750 [1:52:19<8:23:48, 5.79s/it] {'loss': 0.4574, 'learning_rate': 1.3543386337932834e-05, 'epoch': 0.4} + 40%|████ | 3528/8750 [1:52:16<8:23:48, 5.79s/it] 40%|████ | 3529/8750 [1:52:22<8:25:25, 5.81s/it] 40%|████ | 3529/8750 [1:52:25<8:25:25, 5.81s/it] {'loss': 0.4567, 'learning_rate': 1.3539924616223679e-05, 'epoch': 0.4} + 40%|████ | 3529/8750 [1:52:25<8:25:25, 5.81s/it] {'loss': 0.4567, 'learning_rate': 1.3539924616223679e-05, 'epoch': 0.4} + 40%|████ | 3529/8750 
[1:52:22<8:25:25, 5.81s/it] 40%|████ | 3530/8750 [1:52:27<8:21:32, 5.76s/it] 40%|████ | 3530/8750 [1:52:30<8:21:32, 5.76s/it] {'loss': 0.4813, 'learning_rate': 1.3536462409465816e-05, 'epoch': 0.4} + 40%|████ | 3530/8750 [1:52:30<8:21:32, 5.76s/it] {'loss': 0.4813, 'learning_rate': 1.3536462409465816e-05, 'epoch': 0.4} + 40%|████ | 3530/8750 [1:52:27<8:21:32, 5.76s/it] 40%|████ | 3531/8750 [1:52:34<8:34:34, 5.92s/it] 40%|████ | 3531/8750 [1:52:36<8:34:35, 5.92s/it] {'loss': 0.4813, 'learning_rate': 1.3532999718133648e-05, 'epoch': 0.4} + 40%|████ | 3531/8750 [1:52:36<8:34:35, 5.92s/it] {'loss': 0.4813, 'learning_rate': 1.3532999718133648e-05, 'epoch': 0.4} + 40%|████ | 3531/8750 [1:52:34<8:34:34, 5.92s/it] 40%|████ | 3532/8750 [1:52:39<8:27:22, 5.83s/it] 40%|████ | 3532/8750 [1:52:42<8:27:21, 5.83s/it] {'loss': 0.4799, 'learning_rate': 1.3529536542701638e-05, 'epoch': 0.4} + 40%|████ | 3532/8750 [1:52:42<8:27:21, 5.83s/it] {'loss': 0.4799, 'learning_rate': 1.3529536542701638e-05, 'epoch': 0.4} + 40%|████ | 3532/8750 [1:52:39<8:27:22, 5.83s/it] 40%|████ | 3533/8750 [1:52:45<8:23:16, 5.79s/it] 40%|████ | 3533/8750 [1:52:48<8:23:16, 5.79s/it] {'loss': 0.4817, 'learning_rate': 1.3526072883644326e-05, 'epoch': 0.4} + 40%|████ | 3533/8750 [1:52:48<8:23:16, 5.79s/it] {'loss': 0.4817, 'learning_rate': 1.3526072883644326e-05, 'epoch': 0.4} + 40%|████ | 3533/8750 [1:52:45<8:23:16, 5.79s/it] 40%|████ | 3534/8750 [1:52:51<8:23:22, 5.79s/it] 40%|████ | 3534/8750 [1:52:54<8:23:22, 5.79s/it] {'loss': 0.4664, 'learning_rate': 1.3522608741436303e-05, 'epoch': 0.4} + 40%|████ | 3534/8750 [1:52:54<8:23:22, 5.79s/it] {'loss': 0.4664, 'learning_rate': 1.3522608741436303e-05, 'epoch': 0.4} + 40%|████ | 3534/8750 [1:52:51<8:23:22, 5.79s/it] 40%|████ | 3535/8750 [1:52:56<8:20:08, 5.75s/it] 40%|████ | 3535/8750 [1:52:59<8:20:08, 5.75s/it] {'loss': 0.4972, 'learning_rate': 1.3519144116552236e-05, 'epoch': 0.4} + 40%|████ | 3535/8750 [1:52:59<8:20:08, 5.75s/it] {'loss': 0.4972, 
'learning_rate': 1.3519144116552236e-05, 'epoch': 0.4} + 40%|████ | 3535/8750 [1:52:56<8:20:08, 5.75s/it] 40%|████ | 3536/8750 [1:53:02<8:24:39, 5.81s/it] 40%|████ | 3536/8750 [1:53:05<8:24:39, 5.81s/it] {'loss': 0.4505, 'learning_rate': 1.3515679009466856e-05, 'epoch': 0.4} + 40%|████ | 3536/8750 [1:53:05<8:24:39, 5.81s/it] {'loss': 0.4505, 'learning_rate': 1.3515679009466856e-05, 'epoch': 0.4} + 40%|████ | 3536/8750 [1:53:02<8:24:39, 5.81s/it] 40%|████ | 3537/8750 [1:53:08<8:25:15, 5.82s/it] 40%|████ | 3537/8750 [1:53:11<8:25:15, 5.82s/it] {'loss': 0.4666, 'learning_rate': 1.3512213420654959e-05, 'epoch': 0.4} + 40%|████ | 3537/8750 [1:53:11<8:25:15, 5.82s/it] {'loss': 0.4666, 'learning_rate': 1.3512213420654959e-05, 'epoch': 0.4} + 40%|████ | 3537/8750 [1:53:08<8:25:15, 5.82s/it] 40%|████ | 3538/8750 [1:53:14<8:24:35, 5.81s/it] 40%|████ | 3538/8750 [1:53:17<8:24:35, 5.81s/it] {'loss': 0.4746, 'learning_rate': 1.350874735059141e-05, 'epoch': 0.4} + 40%|████ | 3538/8750 [1:53:17<8:24:35, 5.81s/it] {'loss': 0.4746, 'learning_rate': 1.350874735059141e-05, 'epoch': 0.4} + 40%|████ | 3538/8750 [1:53:14<8:24:35, 5.81s/it] 40%|████ | 3539/8750 [1:53:20<8:21:58, 5.78s/it] 40%|████ | 3539/8750 [1:53:23<8:21:58, 5.78s/it] {'loss': 0.4741, 'learning_rate': 1.3505280799751134e-05, 'epoch': 0.4} + 40%|████ | 3539/8750 [1:53:23<8:21:58, 5.78s/it] {'loss': 0.4741, 'learning_rate': 1.3505280799751134e-05, 'epoch': 0.4} + 40%|████ | 3539/8750 [1:53:20<8:21:58, 5.78s/it] 40%|████ | 3540/8750 [1:53:26<8:26:37, 5.83s/it] 40%|████ | 3540/8750 [1:53:28<8:26:37, 5.83s/it] {'loss': 0.4802, 'learning_rate': 1.3501813768609134e-05, 'epoch': 0.4} + 40%|████ | 3540/8750 [1:53:28<8:26:37, 5.83s/it] {'loss': 0.4802, 'learning_rate': 1.3501813768609134e-05, 'epoch': 0.4} + 40%|████ | 3540/8750 [1:53:26<8:26:37, 5.83s/it] 40%|████ | 3541/8750 [1:53:31<8:23:35, 5.80s/it] 40%|████ | 3541/8750 [1:53:34<8:23:35, 5.80s/it] {'loss': 0.4697, 'learning_rate': 1.3498346257640461e-05, 'epoch': 0.4} + 
40%|████ | 3541/8750 [1:53:34<8:23:35, 5.80s/it] {'loss': 0.4697, 'learning_rate': 1.3498346257640461e-05, 'epoch': 0.4} + 40%|████ | 3541/8750 [1:53:31<8:23:35, 5.80s/it] 40%|████ | 3542/8750 [1:53:37<8:28:08, 5.85s/it] 40%|████ | 3542/8750 [1:53:40<8:28:08, 5.85s/it] {'loss': 0.4593, 'learning_rate': 1.349487826732025e-05, 'epoch': 0.4} + {'loss': 0.4593, 'learning_rate': 1.349487826732025e-05, 'epoch': 0.4} 40%|████ | 3542/8750 [1:53:40<8:28:08, 5.85s/it] + 40%|████ | 3542/8750 [1:53:37<8:28:08, 5.85s/it] 40%|████ | 3543/8750 [1:53:43<8:22:11, 5.79s/it] 40%|████ | 3543/8750 [1:53:46<8:22:10, 5.79s/it] {'loss': 0.4985, 'learning_rate': 1.3491409798123687e-05, 'epoch': 0.4} + 40%|████ | 3543/8750 [1:53:46<8:22:10, 5.79s/it] {'loss': 0.4985, 'learning_rate': 1.3491409798123687e-05, 'epoch': 0.4} + 40%|████ | 3543/8750 [1:53:43<8:22:11, 5.79s/it] 41%|████ | 3544/8750 [1:53:52<8:21:04, 5.77s/it] 41%|████ | 3544/8750 [1:53:49<8:21:04, 5.78s/it] {'loss': 0.4663, 'learning_rate': 1.3487940850526033e-05, 'epoch': 0.41} + 41%|████ | 3544/8750 [1:53:52<8:21:04, 5.77s/it] {'loss': 0.4663, 'learning_rate': 1.3487940850526033e-05, 'epoch': 0.41} + 41%|████ | 3544/8750 [1:53:49<8:21:04, 5.78s/it] 41%|████ | 3545/8750 [1:53:57<8:18:51, 5.75s/it] 41%|████ | 3545/8750 [1:53:54<8:18:51, 5.75s/it] {'loss': 0.4558, 'learning_rate': 1.348447142500261e-05, 'epoch': 0.41} + 41%|████ | 3545/8750 [1:53:57<8:18:51, 5.75s/it] {'loss': 0.4558, 'learning_rate': 1.348447142500261e-05, 'epoch': 0.41} + 41%|████ | 3545/8750 [1:53:54<8:18:51, 5.75s/it] 41%|████ | 3546/8750 [1:54:00<8:16:35, 5.73s/it] 41%|████ | 3546/8750 [1:54:03<8:16:35, 5.73s/it] {'loss': 0.4566, 'learning_rate': 1.3481001522028807e-05, 'epoch': 0.41} + 41%|████ | 3546/8750 [1:54:03<8:16:35, 5.73s/it] {'loss': 0.4566, 'learning_rate': 1.3481001522028807e-05, 'epoch': 0.41} + 41%|████ | 3546/8750 [1:54:00<8:16:35, 5.73s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (4399 > 
4096). Running this sequence through the model will result in indexing errors + 41%|████ | 3547/8750 [1:54:06<8:16:09, 5.72s/it] 41%|████ | 3547/8750 [1:54:09<8:16:09, 5.72s/it] {'loss': 0.4626, 'learning_rate': 1.3477531142080076e-05, 'epoch': 0.41} + 41%|████ | 3547/8750 [1:54:09<8:16:09, 5.72s/it] {'loss': 0.4626, 'learning_rate': 1.3477531142080076e-05, 'epoch': 0.41} + 41%|████ | 3547/8750 [1:54:06<8:16:09, 5.72s/it] 41%|████ | 3548/8750 [1:54:11<8:14:50, 5.71s/it] 41%|████ | 3548/8750 [1:54:14<8:14:50, 5.71s/it] {'loss': 0.487, 'learning_rate': 1.347406028563194e-05, 'epoch': 0.41} + {'loss': 0.487, 'learning_rate': 1.347406028563194e-05, 'epoch': 0.41} 41%|████ | 3548/8750 [1:54:14<8:14:50, 5.71s/it] + 41%|████ | 3548/8750 [1:54:11<8:14:50, 5.71s/it] 41%|████ | 3549/8750 [1:54:17<8:14:40, 5.71s/it] 41%|████ | 3549/8750 [1:54:20<8:14:40, 5.71s/it] {'loss': 0.4712, 'learning_rate': 1.3470588953159982e-05, 'epoch': 0.41} + 41%|████ | 3549/8750 [1:54:20<8:14:40, 5.71s/it] {'loss': 0.4712, 'learning_rate': 1.3470588953159982e-05, 'epoch': 0.41} + 41%|████ | 3549/8750 [1:54:17<8:14:40, 5.71s/it]10 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 41%|████ | 3550/8750 [1:54:26<8:20:33, 5.78s/it]01 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1112 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 41%|████ | 3550/8750 [1:54:23<8:20:33, 5.78s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... 
+13 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4427, 'learning_rate': 1.3467117145139854e-05, 'epoch': 0.41} + 41%|████ | 3550/8750 [1:54:26<8:20:33, 5.78s/it] {'loss': 0.4427, 'learning_rate': 1.3467117145139854e-05, 'epoch': 0.41} + 41%|████ | 3550/8750 [1:54:23<8:20:33, 5.78s/it] 41%|████ | 3551/8750 [1:54:32<8:20:08, 5.77s/it] 41%|████ | 3551/8750 [1:54:29<8:20:08, 5.77s/it] {'loss': 0.4693, 'learning_rate': 1.3463644862047267e-05, 'epoch': 0.41} + 41%|████ | 3551/8750 [1:54:32<8:20:08, 5.77s/it] {'loss': 0.4693, 'learning_rate': 1.3463644862047267e-05, 'epoch': 0.41} + 41%|████ | 3551/8750 [1:54:29<8:20:08, 5.77s/it] 41%|████ | 3552/8750 [1:54:35<8:19:51, 5.77s/it] 41%|████ | 3552/8750 [1:54:37<8:19:51, 5.77s/it] {'loss': 0.4581, 'learning_rate': 1.3460172104358007e-05, 'epoch': 0.41} + 41%|████ | 3552/8750 [1:54:37<8:19:51, 5.77s/it] {'loss': 0.4581, 'learning_rate': 1.3460172104358007e-05, 'epoch': 0.41} + 41%|████ | 3552/8750 [1:54:35<8:19:51, 5.77s/it] 41%|████ | 3553/8750 [1:54:43<8:18:16, 5.75s/it] 41%|████ | 3553/8750 [1:54:40<8:18:16, 5.75s/it] {'loss': 0.4775, 'learning_rate': 1.3456698872547915e-05, 'epoch': 0.41} + 41%|████ | 3553/8750 [1:54:43<8:18:16, 5.75s/it] {'loss': 0.4775, 'learning_rate': 1.3456698872547915e-05, 'epoch': 0.41} + 41%|████ | 3553/8750 [1:54:40<8:18:16, 5.75s/it] 41%|████ | 3554/8750 [1:54:49<8:19:41, 5.77s/it] 41%|████ | 3554/8750 [1:54:46<8:19:41, 5.77s/it] {'loss': 0.4602, 'learning_rate': 1.3453225167092902e-05, 'epoch': 0.41} + 41%|████ | 3554/8750 [1:54:49<8:19:41, 5.77s/it] {'loss': 0.4602, 'learning_rate': 1.3453225167092902e-05, 'epoch': 0.41} + 41%|████ | 3554/8750 [1:54:46<8:19:41, 5.77s/it] 41%|████ | 3555/8750 [1:54:55<8:16:58, 5.74s/it] 41%|████ | 3555/8750 [1:54:52<8:16:58, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.3449750988468943e-05, 'epoch': 0.41} + 41%|████ | 3555/8750 [1:54:55<8:16:58, 5.74s/it] {'loss': 0.4855, 'learning_rate': 1.3449750988468943e-05, 'epoch': 0.41} + 41%|████ | 
3555/8750 [1:54:52<8:16:58, 5.74s/it] 41%|████ | 3556/8750 [1:55:00<8:15:53, 5.73s/it] 41%|████ | 3556/8750 [1:54:57<8:15:53, 5.73s/it] {'loss': 0.4711, 'learning_rate': 1.344627633715208e-05, 'epoch': 0.41} + 41%|████ | 3556/8750 [1:55:00<8:15:53, 5.73s/it] {'loss': 0.4711, 'learning_rate': 1.344627633715208e-05, 'epoch': 0.41} + 41%|████ | 3556/8750 [1:54:57<8:15:53, 5.73s/it] 41%|████ | 3557/8750 [1:55:03<8:18:45, 5.76s/it] 41%|████ | 3557/8750 [1:55:06<8:18:45, 5.76s/it] {'loss': 0.4648, 'learning_rate': 1.3442801213618417e-05, 'epoch': 0.41} + 41%|████ | 3557/8750 [1:55:06<8:18:45, 5.76s/it] {'loss': 0.4648, 'learning_rate': 1.3442801213618417e-05, 'epoch': 0.41} + 41%|████ | 3557/8750 [1:55:03<8:18:45, 5.76s/it] 41%|████ | 3558/8750 [1:55:12<8:18:42, 5.76s/it] 41%|████ | 3558/8750 [1:55:09<8:18:42, 5.76s/it] {'loss': 0.4784, 'learning_rate': 1.3439325618344123e-05, 'epoch': 0.41} + 41%|████ | 3558/8750 [1:55:12<8:18:42, 5.76s/it] {'loss': 0.4784, 'learning_rate': 1.3439325618344123e-05, 'epoch': 0.41} + 41%|████ | 3558/8750 [1:55:09<8:18:42, 5.76s/it] 41%|████ | 3559/8750 [1:55:15<8:19:55, 5.78s/it] 41%|████ | 3559/8750 [1:55:18<8:19:56, 5.78s/it] {'loss': 0.4907, 'learning_rate': 1.3435849551805436e-05, 'epoch': 0.41} + 41%|████ | 3559/8750 [1:55:18<8:19:56, 5.78s/it] {'loss': 0.4907, 'learning_rate': 1.3435849551805436e-05, 'epoch': 0.41} + 41%|████ | 3559/8750 [1:55:15<8:19:55, 5.78s/it] 41%|████ | 3560/8750 [1:55:21<8:27:25, 5.87s/it] 41%|████ | 3560/8750 [1:55:24<8:27:25, 5.87s/it] {'loss': 0.4504, 'learning_rate': 1.3432373014478644e-05, 'epoch': 0.41} + 41%|████ | 3560/8750 [1:55:24<8:27:25, 5.87s/it] {'loss': 0.4504, 'learning_rate': 1.3432373014478644e-05, 'epoch': 0.41} + 41%|████ | 3560/8750 [1:55:21<8:27:25, 5.87s/it] 41%|████ | 3561/8750 [1:55:27<8:24:28, 5.83s/it] 41%|████ | 3561/8750 [1:55:30<8:24:29, 5.83s/it] {'loss': 0.4856, 'learning_rate': 1.3428896006840122e-05, 'epoch': 0.41} + 41%|████ | 3561/8750 [1:55:30<8:24:29, 5.83s/it] {'loss': 
0.4856, 'learning_rate': 1.3428896006840122e-05, 'epoch': 0.41} + 41%|████ | 3561/8750 [1:55:27<8:24:28, 5.83s/it] 41%|████ | 3562/8750 [1:55:33<8:26:36, 5.86s/it] 41%|████ | 3562/8750 [1:55:36<8:26:36, 5.86s/it] {'loss': 0.4712, 'learning_rate': 1.3425418529366293e-05, 'epoch': 0.41} + 41%|████ | 3562/8750 [1:55:36<8:26:36, 5.86s/it] {'loss': 0.4712, 'learning_rate': 1.3425418529366293e-05, 'epoch': 0.41} + 41%|████ | 3562/8750 [1:55:33<8:26:36, 5.86s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 41%|████ | 3563/8750 [1:55:39<8:33:36, 5.94s/it] 41%|████ | 3563/8750 [1:55:42<8:33:36, 5.94s/it] {'loss': 0.4771, 'learning_rate': 1.3421940582533645e-05, 'epoch': 0.41} + 41%|████ | 3563/8750 [1:55:42<8:33:36, 5.94s/it] {'loss': 0.4771, 'learning_rate': 1.3421940582533645e-05, 'epoch': 0.41} + 41%|████ | 3563/8750 [1:55:39<8:33:36, 5.94s/it] 41%|████ | 3564/8750 [1:55:44<8:25:56, 5.85s/it] 41%|████ | 3564/8750 [1:55:47<8:25:56, 5.85s/it] {'loss': 0.4686, 'learning_rate': 1.3418462166818743e-05, 'epoch': 0.41} + 41%|████ | 3564/8750 [1:55:47<8:25:56, 5.85s/it] {'loss': 0.4686, 'learning_rate': 1.3418462166818743e-05, 'epoch': 0.41} + 41%|████ | 3564/8750 [1:55:44<8:25:56, 5.85s/it] 41%|████ | 3565/8750 [1:55:53<8:25:25, 5.85s/it] 41%|████ | 3565/8750 [1:55:50<8:25:25, 5.85s/it] {'loss': 0.4561, 'learning_rate': 1.34149832826982e-05, 'epoch': 0.41} + 41%|████ | 3565/8750 [1:55:53<8:25:25, 5.85s/it] {'loss': 0.4561, 'learning_rate': 1.34149832826982e-05, 'epoch': 0.41} + 41%|████ | 3565/8750 [1:55:50<8:25:25, 5.85s/it] 41%|████ | 3566/8750 [1:55:59<8:27:54, 5.88s/it] 41%|████ | 3566/8750 [1:55:56<8:27:54, 5.88s/it] {'loss': 0.4762, 'learning_rate': 1.3411503930648704e-05, 'epoch': 0.41} + 41%|████ | 3566/8750 [1:55:59<8:27:54, 5.88s/it] {'loss': 0.4762, 'learning_rate': 1.3411503930648704e-05, 'epoch': 0.41} + 41%|████ | 3566/8750 [1:55:56<8:27:54, 
5.88s/it] 41%|████ | 3567/8750 [1:56:02<8:23:25, 5.83s/it] 41%|████ | 3567/8750 [1:56:05<8:23:25, 5.83s/it] {'loss': 0.477, 'learning_rate': 1.3408024111147004e-05, 'epoch': 0.41} + 41%|████ | 3567/8750 [1:56:05<8:23:25, 5.83s/it] {'loss': 0.477, 'learning_rate': 1.3408024111147004e-05, 'epoch': 0.41} + 41%|████ | 3567/8750 [1:56:02<8:23:25, 5.83s/it] 41%|████ | 3568/8750 [1:56:08<8:33:59, 5.95s/it] 41%|████ | 3568/8750 [1:56:11<8:33:59, 5.95s/it] {'loss': 0.4383, 'learning_rate': 1.3404543824669915e-05, 'epoch': 0.41} + 41%|████ | 3568/8750 [1:56:11<8:33:59, 5.95s/it] {'loss': 0.4383, 'learning_rate': 1.3404543824669915e-05, 'epoch': 0.41} + 41%|████ | 3568/8750 [1:56:08<8:33:59, 5.95s/it] 41%|████ | 3569/8750 [1:56:14<8:32:17, 5.93s/it] 41%|████ | 3569/8750 [1:56:17<8:32:17, 5.93s/it] {'loss': 0.4824, 'learning_rate': 1.3401063071694309e-05, 'epoch': 0.41} + 41%|████ | 3569/8750 [1:56:17<8:32:17, 5.93s/it] {'loss': 0.4824, 'learning_rate': 1.3401063071694309e-05, 'epoch': 0.41} + 41%|████ | 3569/8750 [1:56:14<8:32:17, 5.93s/it] 41%|████ | 3570/8750 [1:56:20<8:30:24, 5.91s/it] 41%|████ | 3570/8750 [1:56:23<8:30:24, 5.91s/it] {'loss': 0.4664, 'learning_rate': 1.3397581852697128e-05, 'epoch': 0.41} + 41%|████ | 3570/8750 [1:56:23<8:30:24, 5.91s/it] {'loss': 0.4664, 'learning_rate': 1.3397581852697128e-05, 'epoch': 0.41} + 41%|████ | 3570/8750 [1:56:20<8:30:24, 5.91s/it] 41%|████ | 3571/8750 [1:56:26<8:29:49, 5.91s/it] 41%|████ | 3571/8750 [1:56:29<8:29:49, 5.91s/it] {'loss': 0.4723, 'learning_rate': 1.3394100168155382e-05, 'epoch': 0.41} + 41%|████ | 3571/8750 [1:56:29<8:29:49, 5.91s/it] {'loss': 0.4723, 'learning_rate': 1.3394100168155382e-05, 'epoch': 0.41} + 41%|████ | 3571/8750 [1:56:26<8:29:49, 5.91s/it] 41%|████ | 3572/8750 [1:56:32<8:35:33, 5.97s/it] 41%|████ | 3572/8750 [1:56:35<8:35:33, 5.97s/it] {'loss': 0.4462, 'learning_rate': 1.3390618018546135e-05, 'epoch': 0.41} + 41%|████ | 3572/8750 [1:56:35<8:35:33, 5.97s/it] {'loss': 0.4462, 'learning_rate': 
1.3390618018546135e-05, 'epoch': 0.41} + 41%|████ | 3572/8750 [1:56:32<8:35:33, 5.97s/it] 41%|████ | 3573/8750 [1:56:38<8:30:54, 5.92s/it] 41%|████ | 3573/8750 [1:56:41<8:30:54, 5.92s/it] {'loss': 0.4555, 'learning_rate': 1.3387135404346519e-05, 'epoch': 0.41} + 41%|████ | 3573/8750 [1:56:41<8:30:54, 5.92s/it] {'loss': 0.4555, 'learning_rate': 1.3387135404346519e-05, 'epoch': 0.41} + 41%|████ | 3573/8750 [1:56:38<8:30:54, 5.92s/it] 41%|████ | 3574/8750 [1:56:43<8:25:12, 5.86s/it] 41%|████ | 3574/8750 [1:56:46<8:25:12, 5.86s/it] {'loss': 0.4666, 'learning_rate': 1.338365232603373e-05, 'epoch': 0.41} + 41%|████ | 3574/8750 [1:56:46<8:25:12, 5.86s/it] {'loss': 0.4666, 'learning_rate': 1.338365232603373e-05, 'epoch': 0.41} + 41%|████ | 3574/8750 [1:56:43<8:25:12, 5.86s/it] 41%|████ | 3575/8750 [1:56:49<8:22:51, 5.83s/it] 41%|████ | 3575/8750 [1:56:52<8:22:50, 5.83s/it] {'loss': 0.4895, 'learning_rate': 1.3380168784085028e-05, 'epoch': 0.41} + 41%|████ | 3575/8750 [1:56:52<8:22:50, 5.83s/it] {'loss': 0.4895, 'learning_rate': 1.3380168784085028e-05, 'epoch': 0.41} + 41%|████ | 3575/8750 [1:56:49<8:22:51, 5.83s/it] 41%|████ | 3576/8750 [1:56:55<8:18:26, 5.78s/it] 41%|████ | 3576/8750 [1:56:58<8:18:26, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.3376684778977738e-05, 'epoch': 0.41} + 41%|████ | 3576/8750 [1:56:58<8:18:26, 5.78s/it] {'loss': 0.4813, 'learning_rate': 1.3376684778977738e-05, 'epoch': 0.41} + 41%|████ | 3576/8750 [1:56:55<8:18:26, 5.78s/it] 41%|████ | 3577/8750 [1:57:01<8:22:01, 5.82s/it] 41%|████ | 3577/8750 [1:57:04<8:22:01, 5.82s/it] {'loss': 0.485, 'learning_rate': 1.3373200311189245e-05, 'epoch': 0.41} + 41%|████ | 3577/8750 [1:57:04<8:22:01, 5.82s/it] {'loss': 0.485, 'learning_rate': 1.3373200311189245e-05, 'epoch': 0.41} + 41%|████ | 3577/8750 [1:57:01<8:22:01, 5.82s/it] 41%|████ | 3578/8750 [1:57:07<8:20:29, 5.81s/it] 41%|████ | 3578/8750 [1:57:09<8:20:29, 5.81s/it] {'loss': 0.4799, 'learning_rate': 1.3369715381197e-05, 'epoch': 0.41} + 41%|████ | 
3578/8750 [1:57:09<8:20:29, 5.81s/it] {'loss': 0.4799, 'learning_rate': 1.3369715381197e-05, 'epoch': 0.41} + 41%|████ | 3578/8750 [1:57:07<8:20:29, 5.81s/it] 41%|████ | 3579/8750 [1:57:12<8:22:34, 5.83s/it] 41%|████ | 3579/8750 [1:57:15<8:22:35, 5.83s/it] {'loss': 0.4667, 'learning_rate': 1.336622998947851e-05, 'epoch': 0.41} + 41%|████ | 3579/8750 [1:57:15<8:22:35, 5.83s/it] {'loss': 0.4667, 'learning_rate': 1.336622998947851e-05, 'epoch': 0.41} + 41%|████ | 3579/8750 [1:57:12<8:22:34, 5.83s/it] 41%|████ | 3580/8750 [1:57:18<8:23:13, 5.84s/it] 41%|████ | 3580/8750 [1:57:21<8:23:13, 5.84s/it] {'loss': 0.4623, 'learning_rate': 1.336274413651136e-05, 'epoch': 0.41} + 41%|████ | 3580/8750 [1:57:21<8:23:13, 5.84s/it] {'loss': 0.4623, 'learning_rate': 1.336274413651136e-05, 'epoch': 0.41} + 41%|████ | 3580/8750 [1:57:18<8:23:13, 5.84s/it] 41%|████ | 3581/8750 [1:57:24<8:25:26, 5.87s/it] 41%|████ | 3581/8750 [1:57:27<8:25:26, 5.87s/it] {'loss': 0.4914, 'learning_rate': 1.3359257822773187e-05, 'epoch': 0.41} + 41%|████ | 3581/8750 [1:57:27<8:25:26, 5.87s/it] {'loss': 0.4914, 'learning_rate': 1.3359257822773187e-05, 'epoch': 0.41} + 41%|████ | 3581/8750 [1:57:24<8:25:26, 5.87s/it] 41%|████ | 3582/8750 [1:57:30<8:20:53, 5.82s/it] 41%|████ | 3582/8750 [1:57:33<8:20:53, 5.82s/it]{'loss': 0.4591, 'learning_rate': 1.3355771048741692e-05, 'epoch': 0.41} + {'loss': 0.4591, 'learning_rate': 1.3355771048741692e-05, 'epoch': 0.41} 41%|████ | 3582/8750 [1:57:33<8:20:53, 5.82s/it] + 41%|████ | 3582/8750 [1:57:30<8:20:53, 5.82s/it] 41%|████ | 3583/8750 [1:57:36<8:18:06, 5.78s/it] 41%|████ | 3583/8750 [1:57:39<8:18:05, 5.78s/it] {'loss': 0.4642, 'learning_rate': 1.335228381489464e-05, 'epoch': 0.41} + 41%|████ | 3583/8750 [1:57:39<8:18:05, 5.78s/it] {'loss': 0.4642, 'learning_rate': 1.335228381489464e-05, 'epoch': 0.41} + 41%|████ | 3583/8750 [1:57:36<8:18:06, 5.78s/it] 41%|████ | 3584/8750 [1:57:41<8:18:16, 5.79s/it] 41%|████ | 3584/8750 [1:57:44<8:18:16, 5.79s/it] {'loss': 0.4682, 
'learning_rate': 1.3348796121709862e-05, 'epoch': 0.41} + 41%|████ | 3584/8750 [1:57:44<8:18:16, 5.79s/it] {'loss': 0.4682, 'learning_rate': 1.3348796121709862e-05, 'epoch': 0.41} + 41%|████ | 3584/8750 [1:57:41<8:18:16, 5.79s/it] 41%|████ | 3585/8750 [1:57:47<8:19:25, 5.80s/it] 41%|████ | 3585/8750 [1:57:50<8:19:25, 5.80s/it] {'loss': 0.4726, 'learning_rate': 1.3345307969665252e-05, 'epoch': 0.41} + 41%|████ | 3585/8750 [1:57:50<8:19:25, 5.80s/it] {'loss': 0.4726, 'learning_rate': 1.3345307969665252e-05, 'epoch': 0.41} + 41%|████ | 3585/8750 [1:57:47<8:19:25, 5.80s/it] 41%|████ | 3586/8750 [1:57:53<8:16:21, 5.77s/it] 41%|████ | 3586/8750 [1:57:56<8:16:21, 5.77s/it] {'loss': 0.4532, 'learning_rate': 1.3341819359238762e-05, 'epoch': 0.41} + 41%|████ | 3586/8750 [1:57:56<8:16:21, 5.77s/it] {'loss': 0.4532, 'learning_rate': 1.3341819359238762e-05, 'epoch': 0.41} + 41%|████ | 3586/8750 [1:57:53<8:16:21, 5.77s/it] 41%|████ | 3587/8750 [1:57:59<8:23:39, 5.85s/it] 41%|████ | 3587/8750 [1:58:02<8:23:39, 5.85s/it] {'loss': 0.4763, 'learning_rate': 1.3338330290908408e-05, 'epoch': 0.41} + 41%|████ | 3587/8750 [1:58:02<8:23:39, 5.85s/it] {'loss': 0.4763, 'learning_rate': 1.3338330290908408e-05, 'epoch': 0.41} + 41%|████ | 3587/8750 [1:57:59<8:23:39, 5.85s/it] 41%|████ | 3588/8750 [1:58:05<8:21:38, 5.83s/it] 41%|████ | 3588/8750 [1:58:08<8:21:38, 5.83s/it] {'loss': 0.4579, 'learning_rate': 1.3334840765152272e-05, 'epoch': 0.41} + 41%|████ | 3588/8750 [1:58:08<8:21:38, 5.83s/it] {'loss': 0.4579, 'learning_rate': 1.3334840765152272e-05, 'epoch': 0.41} + 41%|████ | 3588/8750 [1:58:05<8:21:38, 5.83s/it] 41%|████ | 3589/8750 [1:58:11<8:25:10, 5.87s/it] 41%|████ | 3589/8750 [1:58:14<8:25:10, 5.87s/it] {'loss': 0.4719, 'learning_rate': 1.3331350782448495e-05, 'epoch': 0.41} + 41%|████ | 3589/8750 [1:58:14<8:25:10, 5.87s/it] {'loss': 0.4719, 'learning_rate': 1.3331350782448495e-05, 'epoch': 0.41} + 41%|████ | 3589/8750 [1:58:11<8:25:10, 5.87s/it] 41%|████ | 3590/8750 [1:58:17<8:22:36, 
5.84s/it] 41%|████ | 3590/8750 [1:58:19<8:22:36, 5.84s/it] {'loss': 0.4579, 'learning_rate': 1.332786034327529e-05, 'epoch': 0.41} + 41%|████ | 3590/8750 [1:58:19<8:22:36, 5.84s/it] {'loss': 0.4579, 'learning_rate': 1.332786034327529e-05, 'epoch': 0.41} + 41%|████ | 3590/8750 [1:58:17<8:22:36, 5.84s/it] 41%|████ | 3591/8750 [1:58:22<8:23:02, 5.85s/it] 41%|████ | 3591/8750 [1:58:25<8:23:03, 5.85s/it] {'loss': 0.4598, 'learning_rate': 1.3324369448110916e-05, 'epoch': 0.41} + 41%|████ | 3591/8750 [1:58:25<8:23:03, 5.85s/it] {'loss': 0.4598, 'learning_rate': 1.3324369448110916e-05, 'epoch': 0.41} + 41%|████ | 3591/8750 [1:58:22<8:23:02, 5.85s/it] 41%|████ | 3592/8750 [1:58:28<8:18:29, 5.80s/it] 41%|████ | 3592/8750 [1:58:31<8:18:29, 5.80s/it] {'loss': 0.4768, 'learning_rate': 1.3320878097433707e-05, 'epoch': 0.41} + 41%|████ | 3592/8750 [1:58:31<8:18:29, 5.80s/it] {'loss': 0.4768, 'learning_rate': 1.3320878097433707e-05, 'epoch': 0.41} + 41%|████ | 3592/8750 [1:58:28<8:18:29, 5.80s/it] 41%|████ | 3593/8750 [1:58:34<8:14:42, 5.76s/it] 41%|████ | 3593/8750 [1:58:37<8:14:42, 5.76s/it] {'loss': 0.4901, 'learning_rate': 1.331738629172206e-05, 'epoch': 0.41} + 41%|████ | 3593/8750 [1:58:37<8:14:42, 5.76s/it] {'loss': 0.4901, 'learning_rate': 1.331738629172206e-05, 'epoch': 0.41} + 41%|████ | 3593/8750 [1:58:34<8:14:42, 5.76s/it] 41%|████ | 3594/8750 [1:58:40<8:18:30, 5.80s/it] 41%|████ | 3594/8750 [1:58:43<8:18:30, 5.80s/it] {'loss': 0.4635, 'learning_rate': 1.3313894031454421e-05, 'epoch': 0.41} + 41%|████ | 3594/8750 [1:58:43<8:18:30, 5.80s/it] {'loss': 0.4635, 'learning_rate': 1.3313894031454421e-05, 'epoch': 0.41} + 41%|████ | 3594/8750 [1:58:40<8:18:30, 5.80s/it] 41%|████ | 3595/8750 [1:58:45<8:16:49, 5.78s/it] 41%|████ | 3595/8750 [1:58:48<8:16:49, 5.78s/it] {'loss': 0.4515, 'learning_rate': 1.3310401317109316e-05, 'epoch': 0.41} + 41%|████ | 3595/8750 [1:58:48<8:16:49, 5.78s/it] {'loss': 0.4515, 'learning_rate': 1.3310401317109316e-05, 'epoch': 0.41} + 41%|████ | 
3595/8750 [1:58:45<8:16:49, 5.78s/it] 41%|████ | 3596/8750 [1:58:51<8:19:55, 5.82s/it] 41%|████ | 3596/8750 [1:58:54<8:19:55, 5.82s/it] {'loss': 0.4682, 'learning_rate': 1.330690814916532e-05, 'epoch': 0.41} + 41%|████ | 3596/8750 [1:58:54<8:19:55, 5.82s/it] {'loss': 0.4682, 'learning_rate': 1.330690814916532e-05, 'epoch': 0.41} + 41%|████ | 3596/8750 [1:58:51<8:19:55, 5.82s/it] 41%|████ | 3597/8750 [1:58:57<8:16:37, 5.78s/it] 41%|████ | 3597/8750 [1:59:00<8:16:36, 5.78s/it] {'loss': 0.4734, 'learning_rate': 1.330341452810108e-05, 'epoch': 0.41} + 41%|████ | 3597/8750 [1:59:00<8:16:36, 5.78s/it] {'loss': 0.4734, 'learning_rate': 1.330341452810108e-05, 'epoch': 0.41} + 41%|████ | 3597/8750 [1:58:57<8:16:37, 5.78s/it] 41%|████ | 3598/8750 [1:59:03<8:14:35, 5.76s/it] 41%|████ | 3598/8750 [1:59:06<8:14:35, 5.76s/it] {'loss': 0.4524, 'learning_rate': 1.3299920454395296e-05, 'epoch': 0.41} + 41%|████ | 3598/8750 [1:59:06<8:14:35, 5.76s/it] {'loss': 0.4524, 'learning_rate': 1.3299920454395296e-05, 'epoch': 0.41} + 41%|████ | 3598/8750 [1:59:03<8:14:35, 5.76s/it] 41%|████ | 3599/8750 [1:59:09<8:21:21, 5.84s/it] 41%|████ | 3599/8750 [1:59:12<8:21:22, 5.84s/it] {'loss': 0.4722, 'learning_rate': 1.3296425928526735e-05, 'epoch': 0.41} + 41%|████ | 3599/8750 [1:59:12<8:21:22, 5.84s/it] {'loss': 0.4722, 'learning_rate': 1.3296425928526735e-05, 'epoch': 0.41} + 41%|████ | 3599/8750 [1:59:09<8:21:21, 5.84s/it]10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +148 9 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+ 41%|████ | 3600/8750 [1:59:14<8:17:34, 5.80s/it] 41%|████ | 3600/8750 [1:59:17<8:17:34, 5.80s/it]12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4714, 'learning_rate': 1.3292930950974223e-05, 'epoch': 0.41} + 41%|████ | 3600/8750 [1:59:17<8:17:34, 5.80s/it] {'loss': 0.4714, 'learning_rate': 1.3292930950974223e-05, 'epoch': 0.41} + 41%|████ | 3600/8750 [1:59:14<8:17:34, 5.80s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 41%|████ | 3601/8750 [1:59:40<15:35:34, 10.90s/it] 41%|████ | 3601/8750 [1:59:37<15:35:36, 10.90s/it] {'loss': 0.4808, 'learning_rate': 1.3289435522216657e-05, 'epoch': 0.41} + 41%|████ | 3601/8750 [1:59:40<15:35:34, 10.90s/it] {'loss': 0.4808, 'learning_rate': 1.3289435522216657e-05, 'epoch': 0.41} + 41%|████ | 3601/8750 [1:59:37<15:35:36, 10.90s/it] 41%|████ | 3602/8750 [1:59:46<13:26:26, 9.40s/it] 41%|████ | 3602/8750 [1:59:43<13:26:27, 9.40s/it] {'loss': 0.4508, 'learning_rate': 1.3285939642732979e-05, 'epoch': 0.41} + 41%|████ | 3602/8750 [1:59:46<13:26:26, 9.40s/it] {'loss': 0.4508, 'learning_rate': 1.3285939642732979e-05, 'epoch': 0.41} + 41%|████ | 3602/8750 [1:59:43<13:26:27, 9.40s/it] 41%|████ | 3603/8750 [1:59:52<11:48:29, 8.26s/it] 41%|████ | 3603/8750 [1:59:49<11:48:29, 8.26s/it] {'loss': 0.5019, 'learning_rate': 1.3282443313002209e-05, 'epoch': 0.41} + 41%|████ | 3603/8750 [1:59:52<11:48:29, 8.26s/it] {'loss': 0.5019, 'learning_rate': 1.3282443313002209e-05, 'epoch': 0.41} + 41%|████ | 3603/8750 [1:59:49<11:48:29, 8.26s/it] 41%|████ | 3604/8750 [1:59:58<10:51:39, 7.60s/it] 41%|████ | 3604/8750 [1:59:55<10:51:40, 7.60s/it] {'loss': 0.4552, 'learning_rate': 1.3278946533503422e-05, 'epoch': 0.41} + 41%|████ | 3604/8750 [1:59:58<10:51:39, 7.60s/it] {'loss': 0.4552, 'learning_rate': 1.3278946533503422e-05, 'epoch': 0.41} + 41%|████ | 3604/8750 [1:59:55<10:51:40, 7.60s/it] 41%|████ | 3605/8750 [2:00:04<10:08:37, 7.10s/it] 41%|████ | 3605/8750 [2:00:01<10:08:37, 7.10s/it] {'loss': 0.4557, 'learning_rate': 1.3275449304715753e-05, 'epoch': 0.41} + 41%|████ | 3605/8750 [2:00:04<10:08:37, 7.10s/it] {'loss': 0.4557, 'learning_rate': 1.3275449304715753e-05, 'epoch': 0.41} + 41%|████ | 3605/8750 [2:00:01<10:08:37, 7.10s/it] 41%|████ | 3606/8750 [2:00:09<9:37:06, 6.73s/it] 41%|████ | 3606/8750 [2:00:07<9:37:06, 6.73s/it] {'loss': 0.4747, 'learning_rate': 1.3271951627118402e-05, 'epoch': 0.41} + 41%|████ | 3606/8750 [2:00:10<9:37:06, 6.73s/it] 
{'loss': 0.4747, 'learning_rate': 1.3271951627118402e-05, 'epoch': 0.41} + 41%|████ | 3606/8750 [2:00:07<9:37:06, 6.73s/it] 41%|████ | 3607/8750 [2:00:15<9:17:58, 6.51s/it] 41%|████ | 3607/8750 [2:00:13<9:17:58, 6.51s/it] {'loss': 0.4785, 'learning_rate': 1.3268453501190628e-05, 'epoch': 0.41} + 41%|████ | 3607/8750 [2:00:15<9:17:58, 6.51s/it] {'loss': 0.4785, 'learning_rate': 1.3268453501190628e-05, 'epoch': 0.41} + 41%|████ | 3607/8750 [2:00:13<9:17:58, 6.51s/it] 41%|████ | 3608/8750 [2:00:21<8:59:04, 6.29s/it] 41%|████ | 3608/8750 [2:00:18<8:59:04, 6.29s/it] {'loss': 0.4874, 'learning_rate': 1.3264954927411751e-05, 'epoch': 0.41} + 41%|████ | 3608/8750 [2:00:21<8:59:04, 6.29s/it] {'loss': 0.4874, 'learning_rate': 1.3264954927411751e-05, 'epoch': 0.41} + 41%|████ | 3608/8750 [2:00:18<8:59:04, 6.29s/it] 41%|████ | 3609/8750 [2:00:27<8:47:12, 6.15s/it] 41%|████ | 3609/8750 [2:00:24<8:47:12, 6.15s/it] {'loss': 0.4692, 'learning_rate': 1.3261455906261154e-05, 'epoch': 0.41} + 41%|████ | 3609/8750 [2:00:27<8:47:12, 6.15s/it] {'loss': 0.4692, 'learning_rate': 1.3261455906261154e-05, 'epoch': 0.41} + 41%|████ | 3609/8750 [2:00:24<8:47:12, 6.15s/it] 41%|████▏ | 3610/8750 [2:00:33<8:36:41, 6.03s/it] 41%|████▏ | 3610/8750 [2:00:30<8:36:41, 6.03s/it] {'loss': 0.472, 'learning_rate': 1.3257956438218283e-05, 'epoch': 0.41} + 41%|████▏ | 3610/8750 [2:00:33<8:36:41, 6.03s/it] {'loss': 0.472, 'learning_rate': 1.3257956438218283e-05, 'epoch': 0.41} + 41%|████▏ | 3610/8750 [2:00:30<8:36:41, 6.03s/it] 41%|████▏ | 3611/8750 [2:00:39<8:28:19, 5.94s/it] 41%|████▏ | 3611/8750 [2:00:36<8:28:20, 5.94s/it] {'loss': 0.4603, 'learning_rate': 1.3254456523762643e-05, 'epoch': 0.41} + 41%|████▏ | 3611/8750 [2:00:39<8:28:19, 5.94s/it] {'loss': 0.4603, 'learning_rate': 1.3254456523762643e-05, 'epoch': 0.41} + 41%|████▏ | 3611/8750 [2:00:36<8:28:20, 5.94s/it] 41%|████▏ | 3612/8750 [2:00:44<8:22:38, 5.87s/it] 41%|████▏ | 3612/8750 [2:00:41<8:22:38, 5.87s/it] {'loss': 0.476, 'learning_rate': 
1.3250956163373801e-05, 'epoch': 0.41} + 41%|████▏ | 3612/8750 [2:00:44<8:22:38, 5.87s/it] {'loss': 0.476, 'learning_rate': 1.3250956163373801e-05, 'epoch': 0.41} + 41%|████▏ | 3612/8750 [2:00:41<8:22:38, 5.87s/it] 41%|████▏ | 3613/8750 [2:00:50<8:18:07, 5.82s/it] 41%|████▏ | 3613/8750 [2:00:47<8:18:07, 5.82s/it] {'loss': 0.469, 'learning_rate': 1.324745535753138e-05, 'epoch': 0.41} + 41%|████▏ | 3613/8750 [2:00:50<8:18:07, 5.82s/it] {'loss': 0.469, 'learning_rate': 1.324745535753138e-05, 'epoch': 0.41} + 41%|████▏ | 3613/8750 [2:00:47<8:18:07, 5.82s/it] 41%|████▏ | 3614/8750 [2:00:56<8:14:58, 5.78s/it] 41%|████▏ | 3614/8750 [2:00:53<8:14:58, 5.78s/it] {'loss': 0.4551, 'learning_rate': 1.3243954106715074e-05, 'epoch': 0.41} + 41%|████▏ | 3614/8750 [2:00:56<8:14:58, 5.78s/it] {'loss': 0.4551, 'learning_rate': 1.3243954106715074e-05, 'epoch': 0.41} + 41%|████▏ | 3614/8750 [2:00:53<8:14:58, 5.78s/it] 41%|████▏ | 3615/8750 [2:01:01<8:14:23, 5.78s/it] 41%|████▏ | 3615/8750 [2:00:59<8:14:23, 5.78s/it] {'loss': 0.4715, 'learning_rate': 1.3240452411404628e-05, 'epoch': 0.41} + 41%|████▏ | 3615/8750 [2:01:01<8:14:23, 5.78s/it] {'loss': 0.4715, 'learning_rate': 1.3240452411404628e-05, 'epoch': 0.41} + 41%|████▏ | 3615/8750 [2:00:59<8:14:23, 5.78s/it] 41%|████▏ | 3616/8750 [2:01:07<8:19:37, 5.84s/it] 41%|████▏ | 3616/8750 [2:01:05<8:19:37, 5.84s/it] {'loss': 0.4817, 'learning_rate': 1.3236950272079858e-05, 'epoch': 0.41} + 41%|████▏ | 3616/8750 [2:01:07<8:19:37, 5.84s/it] {'loss': 0.4817, 'learning_rate': 1.3236950272079858e-05, 'epoch': 0.41} + 41%|████▏ | 3616/8750 [2:01:05<8:19:37, 5.84s/it] 41%|████▏ | 3617/8750 [2:01:13<8:20:41, 5.85s/it] 41%|████▏ | 3617/8750 [2:01:10<8:20:41, 5.85s/it] {'loss': 0.4714, 'learning_rate': 1.3233447689220629e-05, 'epoch': 0.41} + 41%|████▏ | 3617/8750 [2:01:13<8:20:41, 5.85s/it] {'loss': 0.4714, 'learning_rate': 1.3233447689220629e-05, 'epoch': 0.41} + 41%|████▏ | 3617/8750 [2:01:10<8:20:41, 5.85s/it] 41%|████▏ | 3618/8750 
[2:01:19<8:20:52, 5.86s/it] 41%|████▏ | 3618/8750 [2:01:16<8:20:52, 5.86s/it] {'loss': 0.4745, 'learning_rate': 1.3229944663306877e-05, 'epoch': 0.41} + 41%|████▏ | 3618/8750 [2:01:19<8:20:52, 5.86s/it] {'loss': 0.4745, 'learning_rate': 1.3229944663306877e-05, 'epoch': 0.41} + 41%|████▏ | 3618/8750 [2:01:16<8:20:52, 5.86s/it] 41%|████▏ | 3619/8750 [2:01:25<8:17:39, 5.82s/it] 41%|████▏ | 3619/8750 [2:01:22<8:17:39, 5.82s/it] {'loss': 0.4591, 'learning_rate': 1.3226441194818596e-05, 'epoch': 0.41} + 41%|████▏ | 3619/8750 [2:01:25<8:17:39, 5.82s/it] {'loss': 0.4591, 'learning_rate': 1.3226441194818596e-05, 'epoch': 0.41} + 41%|████▏ | 3619/8750 [2:01:22<8:17:39, 5.82s/it] 41%|████▏ | 3620/8750 [2:01:31<8:15:25, 5.79s/it] 41%|████▏ | 3620/8750 [2:01:28<8:15:25, 5.79s/it] {'loss': 0.4703, 'learning_rate': 1.3222937284235835e-05, 'epoch': 0.41} + 41%|████▏ | 3620/8750 [2:01:31<8:15:25, 5.79s/it] {'loss': 0.4703, 'learning_rate': 1.3222937284235835e-05, 'epoch': 0.41} + 41%|████▏ | 3620/8750 [2:01:28<8:15:25, 5.79s/it] 41%|████▏ | 3621/8750 [2:01:37<8:18:20, 5.83s/it] 41%|████▏ | 3621/8750 [2:01:34<8:18:20, 5.83s/it] {'loss': 0.4754, 'learning_rate': 1.3219432932038712e-05, 'epoch': 0.41} + 41%|████▏ | 3621/8750 [2:01:37<8:18:20, 5.83s/it] {'loss': 0.4754, 'learning_rate': 1.3219432932038712e-05, 'epoch': 0.41} + 41%|████▏ | 3621/8750 [2:01:34<8:18:20, 5.83s/it] 41%|████▏ | 3622/8750 [2:01:42<8:19:29, 5.84s/it] 41%|████▏ | 3622/8750 [2:01:40<8:19:29, 5.84s/it] {'loss': 0.4626, 'learning_rate': 1.3215928138707396e-05, 'epoch': 0.41} + 41%|████▏ | 3622/8750 [2:01:42<8:19:29, 5.84s/it] {'loss': 0.4626, 'learning_rate': 1.3215928138707396e-05, 'epoch': 0.41} + 41%|████▏ | 3622/8750 [2:01:40<8:19:29, 5.84s/it] 41%|████▏ | 3623/8750 [2:01:48<8:21:55, 5.87s/it] 41%|████▏ | 3623/8750 [2:01:45<8:21:56, 5.87s/it] {'loss': 0.4779, 'learning_rate': 1.321242290472213e-05, 'epoch': 0.41} + 41%|████▏ | 3623/8750 [2:01:48<8:21:55, 5.87s/it] {'loss': 0.4779, 'learning_rate': 
1.321242290472213e-05, 'epoch': 0.41} + 41%|████▏ | 3623/8750 [2:01:45<8:21:56, 5.87s/it] 41%|████▏ | 3624/8750 [2:01:54<8:18:18, 5.83s/it] 41%|████▏ | 3624/8750 [2:01:51<8:18:18, 5.83s/it] {'loss': 0.4725, 'learning_rate': 1.3208917230563201e-05, 'epoch': 0.41} + 41%|████▏ | 3624/8750 [2:01:54<8:18:18, 5.83s/it] {'loss': 0.4725, 'learning_rate': 1.3208917230563201e-05, 'epoch': 0.41} + 41%|████▏ | 3624/8750 [2:01:51<8:18:18, 5.83s/it] 41%|████▏ | 3625/8750 [2:02:00<8:11:46, 5.76s/it] 41%|████▏ | 3625/8750 [2:01:57<8:11:46, 5.76s/it] {'loss': 0.4656, 'learning_rate': 1.3205411116710973e-05, 'epoch': 0.41} + 41%|████▏ | 3625/8750 [2:02:00<8:11:46, 5.76s/it] {'loss': 0.4656, 'learning_rate': 1.3205411116710973e-05, 'epoch': 0.41} + 41%|████▏ | 3625/8750 [2:01:57<8:11:46, 5.76s/it] 41%|████▏ | 3626/8750 [2:02:05<8:10:39, 5.75s/it] 41%|████▏ | 3626/8750 [2:02:02<8:10:39, 5.75s/it] {'loss': 0.486, 'learning_rate': 1.3201904563645853e-05, 'epoch': 0.41} + 41%|████▏ | 3626/8750 [2:02:05<8:10:39, 5.75s/it] {'loss': 0.486, 'learning_rate': 1.3201904563645853e-05, 'epoch': 0.41} + 41%|████▏ | 3626/8750 [2:02:02<8:10:39, 5.75s/it] 41%|████▏ | 3627/8750 [2:02:11<8:13:56, 5.79s/it] 41%|████▏ | 3627/8750 [2:02:08<8:13:56, 5.78s/it] {'loss': 0.4626, 'learning_rate': 1.3198397571848323e-05, 'epoch': 0.41} + 41%|████▏ | 3627/8750 [2:02:11<8:13:56, 5.79s/it] {'loss': 0.4626, 'learning_rate': 1.3198397571848323e-05, 'epoch': 0.41} + 41%|████▏ | 3627/8750 [2:02:08<8:13:56, 5.78s/it] 41%|████▏ | 3628/8750 [2:02:17<8:13:52, 5.79s/it] 41%|████▏ | 3628/8750 [2:02:14<8:13:52, 5.79s/it] {'loss': 0.4682, 'learning_rate': 1.319489014179892e-05, 'epoch': 0.41} + 41%|████▏ | 3628/8750 [2:02:17<8:13:52, 5.79s/it] {'loss': 0.4682, 'learning_rate': 1.319489014179892e-05, 'epoch': 0.41} + 41%|████▏ | 3628/8750 [2:02:14<8:13:52, 5.79s/it] 41%|████▏ | 3629/8750 [2:02:23<8:16:48, 5.82s/it] 41%|████▏ | 3629/8750 [2:02:20<8:16:48, 5.82s/it] {'loss': 0.4631, 'learning_rate': 1.3191382273978237e-05, 
'epoch': 0.41} + 41%|████▏ | 3629/8750 [2:02:23<8:16:48, 5.82s/it] {'loss': 0.4631, 'learning_rate': 1.3191382273978237e-05, 'epoch': 0.41} + 41%|████▏ | 3629/8750 [2:02:20<8:16:48, 5.82s/it] 41%|████▏ | 3630/8750 [2:02:29<8:15:50, 5.81s/it] 41%|████▏ | 3630/8750 [2:02:26<8:15:50, 5.81s/it] {'loss': 0.4509, 'learning_rate': 1.3187873968866928e-05, 'epoch': 0.41} + 41%|████▏ | 3630/8750 [2:02:29<8:15:50, 5.81s/it] {'loss': 0.4509, 'learning_rate': 1.3187873968866928e-05, 'epoch': 0.41} + 41%|████▏ | 3630/8750 [2:02:26<8:15:50, 5.81s/it] 41%|████▏ | 3631/8750 [2:02:35<8:18:32, 5.84s/it] 41%|████▏ | 3631/8750 [2:02:32<8:18:32, 5.84s/it] {'loss': 0.4576, 'learning_rate': 1.3184365226945715e-05, 'epoch': 0.41} + 41%|████▏ | 3631/8750 [2:02:35<8:18:32, 5.84s/it] {'loss': 0.4576, 'learning_rate': 1.3184365226945715e-05, 'epoch': 0.41} + 41%|████▏ | 3631/8750 [2:02:32<8:18:32, 5.84s/it] 42%|████▏ | 3632/8750 [2:02:40<8:17:15, 5.83s/it] 42%|████▏ | 3632/8750 [2:02:38<8:17:15, 5.83s/it] {'loss': 0.4649, 'learning_rate': 1.318085604869537e-05, 'epoch': 0.42} + 42%|████▏ | 3632/8750 [2:02:40<8:17:15, 5.83s/it] {'loss': 0.4649, 'learning_rate': 1.318085604869537e-05, 'epoch': 0.42} + 42%|████▏ | 3632/8750 [2:02:38<8:17:15, 5.83s/it] 42%|████▏ | 3633/8750 [2:02:46<8:13:36, 5.79s/it] 42%|████▏ | 3633/8750 [2:02:43<8:13:36, 5.79s/it] {'loss': 0.4934, 'learning_rate': 1.3177346434596734e-05, 'epoch': 0.42} + 42%|████▏ | 3633/8750 [2:02:46<8:13:36, 5.79s/it] {'loss': 0.4934, 'learning_rate': 1.3177346434596734e-05, 'epoch': 0.42} + 42%|████▏ | 3633/8750 [2:02:43<8:13:36, 5.79s/it] 42%|████▏ | 3634/8750 [2:02:52<8:16:01, 5.82s/it] 42%|████▏ | 3634/8750 [2:02:49<8:16:01, 5.82s/it] {'loss': 0.4483, 'learning_rate': 1.3173836385130693e-05, 'epoch': 0.42} + 42%|████▏ | 3634/8750 [2:02:52<8:16:01, 5.82s/it] {'loss': 0.4483, 'learning_rate': 1.3173836385130693e-05, 'epoch': 0.42} + 42%|████▏ | 3634/8750 [2:02:49<8:16:01, 5.82s/it] 42%|████▏ | 3635/8750 [2:02:58<8:17:22, 5.83s/it] 42%|████▏ 
| 3635/8750 [2:02:55<8:17:22, 5.83s/it] {'loss': 0.4704, 'learning_rate': 1.3170325900778211e-05, 'epoch': 0.42} + 42%|████▏ | 3635/8750 [2:02:58<8:17:22, 5.83s/it] {'loss': 0.4704, 'learning_rate': 1.3170325900778211e-05, 'epoch': 0.42} + 42%|████▏ | 3635/8750 [2:02:55<8:17:22, 5.83s/it] 42%|████▏ | 3636/8750 [2:03:04<8:14:47, 5.81s/it] 42%|████▏ | 3636/8750 [2:03:01<8:14:48, 5.81s/it] {'loss': 0.4706, 'learning_rate': 1.3166814982020298e-05, 'epoch': 0.42} + 42%|████▏ | 3636/8750 [2:03:04<8:14:47, 5.81s/it] {'loss': 0.4706, 'learning_rate': 1.3166814982020298e-05, 'epoch': 0.42} + 42%|████▏ | 3636/8750 [2:03:01<8:14:48, 5.81s/it] 42%|████▏ | 3637/8750 [2:03:09<8:11:06, 5.76s/it] 42%|████▏ | 3637/8750 [2:03:06<8:11:06, 5.76s/it] {'loss': 0.4671, 'learning_rate': 1.3163303629338029e-05, 'epoch': 0.42} + 42%|████▏ | 3637/8750 [2:03:09<8:11:06, 5.76s/it] {'loss': 0.4671, 'learning_rate': 1.3163303629338029e-05, 'epoch': 0.42} + 42%|████▏ | 3637/8750 [2:03:06<8:11:06, 5.76s/it] 42%|████▏ | 3638/8750 [2:03:12<8:12:51, 5.78s/it] 42%|████▏ | 3638/8750 [2:03:15<8:12:51, 5.78s/it] {'loss': 0.4487, 'learning_rate': 1.3159791843212542e-05, 'epoch': 0.42} + {'loss': 0.4487, 'learning_rate': 1.3159791843212542e-05, 'epoch': 0.42} 42%|████▏ | 3638/8750 [2:03:15<8:12:51, 5.78s/it] + 42%|████▏ | 3638/8750 [2:03:12<8:12:51, 5.78s/it] 42%|████▏ | 3639/8750 [2:03:21<8:13:41, 5.80s/it] 42%|████▏ | 3639/8750 [2:03:18<8:13:41, 5.80s/it] {'loss': 0.4742, 'learning_rate': 1.3156279624125023e-05, 'epoch': 0.42} + 42%|████▏ | 3639/8750 [2:03:21<8:13:41, 5.80s/it] {'loss': 0.4742, 'learning_rate': 1.3156279624125023e-05, 'epoch': 0.42} + 42%|████▏ | 3639/8750 [2:03:18<8:13:41, 5.80s/it] 42%|████▏ | 3640/8750 [2:03:24<8:12:01, 5.78s/it] 42%|████▏ | 3640/8750 [2:03:27<8:12:06, 5.78s/it] {'loss': 0.4657, 'learning_rate': 1.3152766972556727e-05, 'epoch': 0.42} + 42%|████▏ | 3640/8750 [2:03:27<8:12:06, 5.78s/it] {'loss': 0.4657, 'learning_rate': 1.3152766972556727e-05, 'epoch': 0.42} + 42%|████▏ 
| 3640/8750 [2:03:24<8:12:01, 5.78s/it] 42%|████▏ | 3641/8750 [2:03:32<8:08:12, 5.73s/it] 42%|████▏ | 3641/8750 [2:03:29<8:08:13, 5.73s/it] {'loss': 0.4887, 'learning_rate': 1.3149253888988967e-05, 'epoch': 0.42} + 42%|████▏ | 3641/8750 [2:03:32<8:08:12, 5.73s/it] {'loss': 0.4887, 'learning_rate': 1.3149253888988967e-05, 'epoch': 0.42} + 42%|████▏ | 3641/8750 [2:03:29<8:08:13, 5.73s/it] 42%|████▏ | 3642/8750 [2:03:38<8:08:14, 5.74s/it] 42%|████▏ | 3642/8750 [2:03:35<8:08:14, 5.73s/it] {'loss': 0.4618, 'learning_rate': 1.3145740373903118e-05, 'epoch': 0.42} + 42%|████▏ | 3642/8750 [2:03:38<8:08:14, 5.74s/it] {'loss': 0.4618, 'learning_rate': 1.3145740373903118e-05, 'epoch': 0.42} + 42%|████▏ | 3642/8750 [2:03:35<8:08:14, 5.73s/it] 42%|████▏ | 3643/8750 [2:03:44<8:12:41, 5.79s/it] 42%|████▏ | 3643/8750 [2:03:41<8:12:42, 5.79s/it] {'loss': 0.4543, 'learning_rate': 1.31422264277806e-05, 'epoch': 0.42} + 42%|████▏ | 3643/8750 [2:03:44<8:12:41, 5.79s/it] {'loss': 0.4543, 'learning_rate': 1.31422264277806e-05, 'epoch': 0.42} + 42%|████▏ | 3643/8750 [2:03:41<8:12:42, 5.79s/it] 42%|████▏ | 3644/8750 [2:03:50<8:10:57, 5.77s/it] 42%|████▏ | 3644/8750 [2:03:47<8:10:58, 5.77s/it] {'loss': 0.4653, 'learning_rate': 1.3138712051102908e-05, 'epoch': 0.42} + 42%|████▏ | 3644/8750 [2:03:50<8:10:57, 5.77s/it] {'loss': 0.4653, 'learning_rate': 1.3138712051102908e-05, 'epoch': 0.42} + 42%|████▏ | 3644/8750 [2:03:47<8:10:58, 5.77s/it] 42%|████▏ | 3645/8750 [2:03:55<8:09:12, 5.75s/it] 42%|████▏ | 3645/8750 [2:03:53<8:09:13, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.3135197244351595e-05, 'epoch': 0.42} + 42%|████▏ | 3645/8750 [2:03:55<8:09:12, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.3135197244351595e-05, 'epoch': 0.42} + 42%|████▏ | 3645/8750 [2:03:53<8:09:13, 5.75s/it] 42%|████▏ | 3646/8750 [2:04:02<8:22:28, 5.91s/it] 42%|████▏ | 3646/8750 [2:03:59<8:22:28, 5.91s/it] {'loss': 0.4705, 'learning_rate': 1.3131682008008255e-05, 'epoch': 0.42} + 42%|████▏ | 3646/8750 
[2:04:02<8:22:28, 5.91s/it] {'loss': 0.4705, 'learning_rate': 1.3131682008008255e-05, 'epoch': 0.42} + 42%|████▏ | 3646/8750 [2:03:59<8:22:28, 5.91s/it] 42%|████▏ | 3647/8750 [2:04:08<8:24:19, 5.93s/it] 42%|████▏ | 3647/8750 [2:04:05<8:24:19, 5.93s/it] {'loss': 0.4697, 'learning_rate': 1.3128166342554567e-05, 'epoch': 0.42} + 42%|████▏ | 3647/8750 [2:04:08<8:24:19, 5.93s/it] {'loss': 0.4697, 'learning_rate': 1.3128166342554567e-05, 'epoch': 0.42} + 42%|████▏ | 3647/8750 [2:04:05<8:24:19, 5.93s/it] 42%|████▏ | 3648/8750 [2:04:13<8:17:02, 5.85s/it] 42%|████▏ | 3648/8750 [2:04:10<8:17:02, 5.85s/it] {'loss': 0.4545, 'learning_rate': 1.3124650248472248e-05, 'epoch': 0.42} + 42%|████▏ | 3648/8750 [2:04:13<8:17:02, 5.85s/it] {'loss': 0.4545, 'learning_rate': 1.3124650248472248e-05, 'epoch': 0.42} + 42%|████▏ | 3648/8750 [2:04:10<8:17:02, 5.85s/it] 42%|████▏ | 3649/8750 [2:04:19<8:18:27, 5.86s/it] 42%|████▏ | 3649/8750 [2:04:16<8:18:27, 5.86s/it] {'loss': 0.4855, 'learning_rate': 1.3121133726243083e-05, 'epoch': 0.42} + 42%|████▏ | 3649/8750 [2:04:19<8:18:27, 5.86s/it] {'loss': 0.4855, 'learning_rate': 1.3121133726243083e-05, 'epoch': 0.42} + 42%|████▏ | 3649/8750 [2:04:16<8:18:27, 5.86s/it]1 AutoResumeHook: Checking whether to suspend... +1410 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 3650/8750 [2:04:25<8:16:55, 5.85s/it]13 4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +11AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... 
+02 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 3650/8750 [2:04:22<8:16:55, 5.85s/it] {'loss': 0.4607, 'learning_rate': 1.3117616776348915e-05, 'epoch': 0.42} + 42%|████▏ | 3650/8750 [2:04:25<8:16:55, 5.85s/it] {'loss': 0.4607, 'learning_rate': 1.3117616776348915e-05, 'epoch': 0.42} + 42%|████▏ | 3650/8750 [2:04:22<8:16:55, 5.85s/it] 42%|████▏ | 3651/8750 [2:04:31<8:21:10, 5.90s/it] 42%|████▏ | 3651/8750 [2:04:28<8:21:10, 5.90s/it] {'loss': 0.4721, 'learning_rate': 1.3114099399271646e-05, 'epoch': 0.42} + {'loss': 0.4721, 'learning_rate': 1.3114099399271646e-05, 'epoch': 0.42} + 42%|████▏ | 3651/8750 [2:04:31<8:21:10, 5.90s/it] 42%|████▏ | 3651/8750 [2:04:28<8:21:10, 5.90s/it] 42%|████▏ | 3652/8750 [2:04:37<8:12:55, 5.80s/it] 42%|████▏ | 3652/8750 [2:04:34<8:12:55, 5.80s/it] {'loss': 0.4476, 'learning_rate': 1.311058159549323e-05, 'epoch': 0.42} + 42%|████▏ | 3652/8750 [2:04:37<8:12:55, 5.80s/it] {'loss': 0.4476, 'learning_rate': 1.311058159549323e-05, 'epoch': 0.42} + 42%|████▏ | 3652/8750 [2:04:34<8:12:55, 5.80s/it] 42%|████▏ | 3653/8750 [2:04:42<8:08:17, 5.75s/it] 42%|████▏ | 3653/8750 [2:04:39<8:08:17, 5.75s/it] {'loss': 0.4616, 'learning_rate': 1.3107063365495692e-05, 'epoch': 0.42} + 42%|████▏ | 3653/8750 [2:04:42<8:08:17, 5.75s/it] {'loss': 0.4616, 'learning_rate': 1.3107063365495692e-05, 'epoch': 0.42} + 42%|████▏ | 3653/8750 [2:04:39<8:08:17, 5.75s/it] 42%|████▏ | 3654/8750 [2:04:48<8:08:16, 5.75s/it] 42%|████▏ | 3654/8750 [2:04:45<8:08:16, 5.75s/it] {'loss': 0.454, 'learning_rate': 1.31035447097611e-05, 'epoch': 0.42} + 42%|████▏ | 3654/8750 [2:04:48<8:08:16, 5.75s/it] {'loss': 0.454, 'learning_rate': 1.31035447097611e-05, 'epoch': 0.42} + 42%|████▏ | 3654/8750 [2:04:45<8:08:16, 5.75s/it] 42%|████▏ | 3655/8750 [2:04:54<8:05:25, 5.72s/it] 42%|████▏ | 3655/8750 [2:04:51<8:05:25, 5.72s/it] {'loss': 0.4719, 'learning_rate': 1.3100025628771595e-05, 'epoch': 0.42} + 42%|████▏ | 3655/8750 
[2:04:54<8:05:25, 5.72s/it] {'loss': 0.4719, 'learning_rate': 1.3100025628771595e-05, 'epoch': 0.42} + 42%|████▏ | 3655/8750 [2:04:51<8:05:25, 5.72s/it] 42%|████▏ | 3656/8750 [2:04:59<8:04:36, 5.71s/it] 42%|████▏ | 3656/8750 [2:04:56<8:04:35, 5.71s/it] {'loss': 0.4565, 'learning_rate': 1.3096506123009368e-05, 'epoch': 0.42} + 42%|████▏ | 3656/8750 [2:04:59<8:04:36, 5.71s/it] {'loss': 0.4565, 'learning_rate': 1.3096506123009368e-05, 'epoch': 0.42} + 42%|████▏ | 3656/8750 [2:04:56<8:04:35, 5.71s/it] 42%|████▏ | 3657/8750 [2:05:05<8:05:33, 5.72s/it] 42%|████▏ | 3657/8750 [2:05:02<8:05:33, 5.72s/it] {'loss': 0.478, 'learning_rate': 1.3092986192956665e-05, 'epoch': 0.42} + 42%|████▏ | 3657/8750 [2:05:05<8:05:33, 5.72s/it] {'loss': 0.478, 'learning_rate': 1.3092986192956665e-05, 'epoch': 0.42} + 42%|████▏ | 3657/8750 [2:05:02<8:05:33, 5.72s/it] 42%|████▏ | 3658/8750 [2:05:11<8:08:03, 5.75s/it] 42%|████▏ | 3658/8750 [2:05:08<8:08:03, 5.75s/it] {'loss': 0.4596, 'learning_rate': 1.3089465839095803e-05, 'epoch': 0.42} + 42%|████▏ | 3658/8750 [2:05:11<8:08:03, 5.75s/it] {'loss': 0.4596, 'learning_rate': 1.3089465839095803e-05, 'epoch': 0.42} + 42%|████▏ | 3658/8750 [2:05:08<8:08:03, 5.75s/it] 42%|████▏ | 3659/8750 [2:05:17<8:12:21, 5.80s/it] 42%|████▏ | 3659/8750 [2:05:14<8:12:23, 5.80s/it] {'loss': 0.4778, 'learning_rate': 1.3085945061909144e-05, 'epoch': 0.42} + 42%|████▏ | 3659/8750 [2:05:17<8:12:21, 5.80s/it] {'loss': 0.4778, 'learning_rate': 1.3085945061909144e-05, 'epoch': 0.42} + 42%|████▏ | 3659/8750 [2:05:14<8:12:23, 5.80s/it] 42%|████▏ | 3660/8750 [2:05:23<8:14:25, 5.83s/it] 42%|████▏ | 3660/8750 [2:05:20<8:14:24, 5.83s/it] {'loss': 0.4706, 'learning_rate': 1.3082423861879114e-05, 'epoch': 0.42} + 42%|████▏ | 3660/8750 [2:05:23<8:14:25, 5.83s/it] {'loss': 0.4706, 'learning_rate': 1.3082423861879114e-05, 'epoch': 0.42} + 42%|████▏ | 3660/8750 [2:05:20<8:14:24, 5.83s/it] 42%|████▏ | 3661/8750 [2:05:28<8:12:45, 5.81s/it] 42%|████▏ | 3661/8750 [2:05:26<8:12:44, 
5.81s/it] {'loss': 0.4497, 'learning_rate': 1.3078902239488196e-05, 'epoch': 0.42} + 42%|████▏ | 3661/8750 [2:05:29<8:12:45, 5.81s/it] {'loss': 0.4497, 'learning_rate': 1.3078902239488196e-05, 'epoch': 0.42} + 42%|████▏ | 3661/8750 [2:05:26<8:12:44, 5.81s/it] 42%|████▏ | 3662/8750 [2:05:34<8:09:45, 5.78s/it] 42%|████▏ | 3662/8750 [2:05:31<8:09:45, 5.78s/it] {'loss': 0.4668, 'learning_rate': 1.3075380195218931e-05, 'epoch': 0.42} + 42%|████▏ | 3662/8750 [2:05:34<8:09:45, 5.78s/it] {'loss': 0.4668, 'learning_rate': 1.3075380195218931e-05, 'epoch': 0.42} + 42%|████▏ | 3662/8750 [2:05:31<8:09:45, 5.78s/it] 42%|████▏ | 3663/8750 [2:05:40<8:07:08, 5.75s/it] 42%|████▏ | 3663/8750 [2:05:37<8:07:08, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.3071857729553918e-05, 'epoch': 0.42} + 42%|████▏ | 3663/8750 [2:05:40<8:07:08, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.3071857729553918e-05, 'epoch': 0.42} + 42%|████▏ | 3663/8750 [2:05:37<8:07:08, 5.75s/it] 42%|████▏ | 3664/8750 [2:05:46<8:07:59, 5.76s/it] 42%|████▏ | 3664/8750 [2:05:43<8:07:59, 5.76s/it] {'loss': 0.4776, 'learning_rate': 1.3068334842975813e-05, 'epoch': 0.42} + 42%|████▏ | 3664/8750 [2:05:46<8:07:59, 5.76s/it] {'loss': 0.4776, 'learning_rate': 1.3068334842975813e-05, 'epoch': 0.42} + 42%|████▏ | 3664/8750 [2:05:43<8:07:59, 5.76s/it] 42%|████▏ | 3665/8750 [2:05:52<8:10:14, 5.78s/it] 42%|████▏ | 3665/8750 [2:05:49<8:10:14, 5.78s/it] {'loss': 0.4726, 'learning_rate': 1.306481153596733e-05, 'epoch': 0.42} + 42%|████▏ | 3665/8750 [2:05:52<8:10:14, 5.78s/it] {'loss': 0.4726, 'learning_rate': 1.306481153596733e-05, 'epoch': 0.42} + 42%|████▏ | 3665/8750 [2:05:49<8:10:14, 5.78s/it] 42%|████▏ | 3666/8750 [2:05:57<8:07:39, 5.76s/it] 42%|████▏ | 3666/8750 [2:05:54<8:07:39, 5.76s/it] {'loss': 0.4661, 'learning_rate': 1.3061287809011243e-05, 'epoch': 0.42} + 42%|████▏ | 3666/8750 [2:05:57<8:07:39, 5.76s/it] {'loss': 0.4661, 'learning_rate': 1.3061287809011243e-05, 'epoch': 0.42} + 42%|████▏ | 3666/8750 [2:05:54<8:07:39, 
5.76s/it] 42%|████▏ | 3667/8750 [2:06:03<8:19:18, 5.89s/it] 42%|████▏ | 3667/8750 [2:06:00<8:19:18, 5.89s/it] {'loss': 0.4822, 'learning_rate': 1.3057763662590377e-05, 'epoch': 0.42} + 42%|████▏ | 3667/8750 [2:06:03<8:19:18, 5.89s/it] {'loss': 0.4822, 'learning_rate': 1.3057763662590377e-05, 'epoch': 0.42} + 42%|████▏ | 3667/8750 [2:06:00<8:19:18, 5.89s/it] 42%|████▏ | 3668/8750 [2:06:09<8:16:03, 5.86s/it] 42%|████▏ | 3668/8750 [2:06:06<8:16:03, 5.86s/it] {'loss': 0.4588, 'learning_rate': 1.3054239097187625e-05, 'epoch': 0.42} + 42%|████▏ | 3668/8750 [2:06:09<8:16:03, 5.86s/it] {'loss': 0.4588, 'learning_rate': 1.3054239097187625e-05, 'epoch': 0.42} + 42%|████▏ | 3668/8750 [2:06:06<8:16:03, 5.86s/it] 42%|████▏ | 3669/8750 [2:06:15<8:16:02, 5.86s/it] 42%|████▏ | 3669/8750 [2:06:12<8:16:02, 5.86s/it] {'loss': 0.4591, 'learning_rate': 1.3050714113285922e-05, 'epoch': 0.42} + 42%|████▏ | 3669/8750 [2:06:15<8:16:02, 5.86s/it] {'loss': 0.4591, 'learning_rate': 1.3050714113285922e-05, 'epoch': 0.42} + 42%|████▏ | 3669/8750 [2:06:12<8:16:02, 5.86s/it] 42%|████▏ | 3670/8750 [2:06:21<8:11:39, 5.81s/it] 42%|████▏ | 3670/8750 [2:06:18<8:11:39, 5.81s/it] {'loss': 0.4658, 'learning_rate': 1.3047188711368278e-05, 'epoch': 0.42} + 42%|████▏ | 3670/8750 [2:06:21<8:11:39, 5.81s/it] {'loss': 0.4658, 'learning_rate': 1.3047188711368278e-05, 'epoch': 0.42} + 42%|████▏ | 3670/8750 [2:06:18<8:11:39, 5.81s/it] 42%|████▏ | 3671/8750 [2:06:27<8:15:17, 5.85s/it] 42%|████▏ | 3671/8750 [2:06:24<8:15:17, 5.85s/it] {'loss': 0.4681, 'learning_rate': 1.3043662891917748e-05, 'epoch': 0.42} + 42%|████▏ | 3671/8750 [2:06:27<8:15:17, 5.85s/it] {'loss': 0.4681, 'learning_rate': 1.3043662891917748e-05, 'epoch': 0.42} + 42%|████▏ | 3671/8750 [2:06:24<8:15:17, 5.85s/it] 42%|████▏ | 3672/8750 [2:06:33<8:18:02, 5.88s/it] 42%|████▏ | 3672/8750 [2:06:30<8:18:02, 5.88s/it] {'loss': 0.4674, 'learning_rate': 1.3040136655417448e-05, 'epoch': 0.42} + 42%|████▏ | 3672/8750 [2:06:33<8:18:02, 5.88s/it] {'loss': 
0.4674, 'learning_rate': 1.3040136655417448e-05, 'epoch': 0.42} + 42%|████▏ | 3672/8750 [2:06:30<8:18:02, 5.88s/it] 42%|████▏ | 3673/8750 [2:06:38<8:13:00, 5.83s/it] 42%|████▏ | 3673/8750 [2:06:35<8:12:59, 5.83s/it] {'loss': 0.4597, 'learning_rate': 1.303661000235055e-05, 'epoch': 0.42} + 42%|████▏ | 3673/8750 [2:06:38<8:13:00, 5.83s/it] {'loss': 0.4597, 'learning_rate': 1.303661000235055e-05, 'epoch': 0.42} + 42%|████▏ | 3673/8750 [2:06:35<8:12:59, 5.83s/it] 42%|████▏ | 3674/8750 [2:06:44<8:16:18, 5.87s/it] 42%|████▏ | 3674/8750 [2:06:41<8:16:18, 5.87s/it] {'loss': 0.4537, 'learning_rate': 1.3033082933200287e-05, 'epoch': 0.42} + 42%|████▏ | 3674/8750 [2:06:44<8:16:18, 5.87s/it] {'loss': 0.4537, 'learning_rate': 1.3033082933200287e-05, 'epoch': 0.42} + 42%|████▏ | 3674/8750 [2:06:41<8:16:18, 5.87s/it] 42%|████▏ | 3675/8750 [2:06:50<8:16:24, 5.87s/it] 42%|████▏ | 3675/8750 [2:06:47<8:16:25, 5.87s/it] {'loss': 0.4705, 'learning_rate': 1.3029555448449947e-05, 'epoch': 0.42} + 42%|████▏ | 3675/8750 [2:06:50<8:16:24, 5.87s/it] {'loss': 0.4705, 'learning_rate': 1.3029555448449947e-05, 'epoch': 0.42} + 42%|████▏ | 3675/8750 [2:06:47<8:16:25, 5.87s/it] 42%|████▏ | 3676/8750 [2:06:56<8:19:31, 5.91s/it] 42%|████▏ | 3676/8750 [2:06:53<8:19:33, 5.91s/it] {'loss': 0.4701, 'learning_rate': 1.302602754858287e-05, 'epoch': 0.42} + 42%|████▏ | 3676/8750 [2:06:56<8:19:31, 5.91s/it] {'loss': 0.4701, 'learning_rate': 1.302602754858287e-05, 'epoch': 0.42} + 42%|████▏ | 3676/8750 [2:06:53<8:19:33, 5.91s/it] 42%|████▏ | 3677/8750 [2:07:02<8:13:19, 5.83s/it] 42%|████▏ | 3677/8750 [2:06:59<8:13:19, 5.83s/it] {'loss': 0.4851, 'learning_rate': 1.3022499234082463e-05, 'epoch': 0.42} + 42%|████▏ | 3677/8750 [2:07:02<8:13:19, 5.83s/it] {'loss': 0.4851, 'learning_rate': 1.3022499234082463e-05, 'epoch': 0.42} + 42%|████▏ | 3677/8750 [2:06:59<8:13:19, 5.83s/it] 42%|████▏ | 3678/8750 [2:07:08<8:16:00, 5.87s/it] 42%|████▏ | 3678/8750 [2:07:05<8:15:59, 5.87s/it] {'loss': 0.4751, 'learning_rate': 
1.3018970505432176e-05, 'epoch': 0.42} + 42%|████▏ | 3678/8750 [2:07:08<8:16:00, 5.87s/it] {'loss': 0.4751, 'learning_rate': 1.3018970505432176e-05, 'epoch': 0.42} + 42%|████▏ | 3678/8750 [2:07:05<8:15:59, 5.87s/it] 42%|████▏ | 3679/8750 [2:07:14<8:14:13, 5.85s/it] 42%|████▏ | 3679/8750 [2:07:11<8:14:13, 5.85s/it] {'loss': 0.4522, 'learning_rate': 1.3015441363115526e-05, 'epoch': 0.42} + 42%|████▏ | 3679/8750 [2:07:14<8:14:13, 5.85s/it] {'loss': 0.4522, 'learning_rate': 1.3015441363115526e-05, 'epoch': 0.42} + 42%|████▏ | 3679/8750 [2:07:11<8:14:13, 5.85s/it] 42%|████▏ | 3680/8750 [2:07:16<8:11:18, 5.81s/it] 42%|████▏ | 3680/8750 [2:07:19<8:11:22, 5.82s/it] {'loss': 0.4703, 'learning_rate': 1.3011911807616091e-05, 'epoch': 0.42} + 42%|████▏ | 3680/8750 [2:07:19<8:11:22, 5.82s/it] {'loss': 0.4703, 'learning_rate': 1.3011911807616091e-05, 'epoch': 0.42} + 42%|████▏ | 3680/8750 [2:07:16<8:11:18, 5.81s/it] 42%|████▏ | 3681/8750 [2:07:25<8:07:13, 5.77s/it] 42%|████▏ | 3681/8750 [2:07:22<8:07:14, 5.77s/it] {'loss': 0.4713, 'learning_rate': 1.3008381839417493e-05, 'epoch': 0.42} + 42%|████▏ | 3681/8750 [2:07:25<8:07:13, 5.77s/it] {'loss': 0.4713, 'learning_rate': 1.3008381839417493e-05, 'epoch': 0.42} + 42%|████▏ | 3681/8750 [2:07:22<8:07:14, 5.77s/it] 42%|████▏ | 3682/8750 [2:07:31<8:18:45, 5.90s/it] 42%|████▏ | 3682/8750 [2:07:28<8:18:47, 5.91s/it] {'loss': 0.4806, 'learning_rate': 1.3004851459003416e-05, 'epoch': 0.42} + 42%|████▏ | 3682/8750 [2:07:31<8:18:45, 5.90s/it] {'loss': 0.4806, 'learning_rate': 1.3004851459003416e-05, 'epoch': 0.42} + 42%|████▏ | 3682/8750 [2:07:28<8:18:47, 5.91s/it] 42%|████▏ | 3683/8750 [2:07:37<8:12:12, 5.83s/it] 42%|████▏ | 3683/8750 [2:07:34<8:12:13, 5.83s/it] {'loss': 0.4972, 'learning_rate': 1.30013206668576e-05, 'epoch': 0.42} + 42%|████▏ | 3683/8750 [2:07:37<8:12:12, 5.83s/it] {'loss': 0.4972, 'learning_rate': 1.30013206668576e-05, 'epoch': 0.42} + 42%|████▏ | 3683/8750 [2:07:34<8:12:13, 5.83s/it] 42%|████▏ | 3684/8750 
[2:07:43<8:11:15, 5.82s/it] 42%|████▏ | 3684/8750 [2:07:40<8:11:15, 5.82s/it] {'loss': 0.4535, 'learning_rate': 1.2997789463463848e-05, 'epoch': 0.42} + 42%|████▏ | 3684/8750 [2:07:43<8:11:15, 5.82s/it] {'loss': 0.4535, 'learning_rate': 1.2997789463463848e-05, 'epoch': 0.42} + 42%|████▏ | 3684/8750 [2:07:40<8:11:15, 5.82s/it] 42%|████▏ | 3685/8750 [2:07:49<8:15:49, 5.87s/it] 42%|████▏ | 3685/8750 [2:07:46<8:15:49, 5.87s/it] {'loss': 0.479, 'learning_rate': 1.2994257849306009e-05, 'epoch': 0.42} + 42%|████▏ | 3685/8750 [2:07:49<8:15:49, 5.87s/it] {'loss': 0.479, 'learning_rate': 1.2994257849306009e-05, 'epoch': 0.42} + 42%|████▏ | 3685/8750 [2:07:46<8:15:49, 5.87s/it] 42%|████▏ | 3686/8750 [2:07:52<8:14:40, 5.86s/it] 42%|████▏ | 3686/8750 [2:07:54<8:14:40, 5.86s/it] {'loss': 0.4595, 'learning_rate': 1.2990725824867995e-05, 'epoch': 0.42} + 42%|████▏ | 3686/8750 [2:07:54<8:14:40, 5.86s/it] {'loss': 0.4595, 'learning_rate': 1.2990725824867995e-05, 'epoch': 0.42} + 42%|████▏ | 3686/8750 [2:07:52<8:14:40, 5.86s/it] 42%|████▏ | 3687/8750 [2:08:00<8:17:22, 5.89s/it] 42%|████▏ | 3687/8750 [2:07:58<8:17:22, 5.89s/it] {'loss': 0.464, 'learning_rate': 1.2987193390633773e-05, 'epoch': 0.42} + 42%|████▏ | 3687/8750 [2:08:00<8:17:22, 5.89s/it] {'loss': 0.464, 'learning_rate': 1.2987193390633773e-05, 'epoch': 0.42} + 42%|████▏ | 3687/8750 [2:07:58<8:17:22, 5.89s/it] 42%|████▏ | 3688/8750 [2:08:06<8:18:05, 5.90s/it] 42%|████▏ | 3688/8750 [2:08:03<8:18:06, 5.90s/it] {'loss': 0.4592, 'learning_rate': 1.298366054708736e-05, 'epoch': 0.42} + 42%|████▏ | 3688/8750 [2:08:06<8:18:05, 5.90s/it] {'loss': 0.4592, 'learning_rate': 1.298366054708736e-05, 'epoch': 0.42} + 42%|████▏ | 3688/8750 [2:08:03<8:18:06, 5.90s/it] 42%|████▏ | 3689/8750 [2:08:12<8:16:09, 5.88s/it] 42%|████▏ | 3689/8750 [2:08:09<8:16:08, 5.88s/it] {'loss': 0.4798, 'learning_rate': 1.2980127294712839e-05, 'epoch': 0.42} + 42%|████▏ | 3689/8750 [2:08:12<8:16:09, 5.88s/it] {'loss': 0.4798, 'learning_rate': 
1.2980127294712839e-05, 'epoch': 0.42} + 42%|████▏ | 3689/8750 [2:08:09<8:16:08, 5.88s/it] 42%|████▏ | 3690/8750 [2:08:18<8:14:34, 5.86s/it] 42%|████▏ | 3690/8750 [2:08:15<8:14:34, 5.86s/it] {'loss': 0.4443, 'learning_rate': 1.2976593633994347e-05, 'epoch': 0.42} + 42%|████▏ | 3690/8750 [2:08:18<8:14:34, 5.86s/it] {'loss': 0.4443, 'learning_rate': 1.2976593633994347e-05, 'epoch': 0.42} + 42%|████▏ | 3690/8750 [2:08:15<8:14:34, 5.86s/it] 42%|████▏ | 3691/8750 [2:08:24<8:11:32, 5.83s/it] 42%|████▏ | 3691/8750 [2:08:21<8:11:31, 5.83s/it] {'loss': 0.4957, 'learning_rate': 1.297305956541607e-05, 'epoch': 0.42} + 42%|████▏ | 3691/8750 [2:08:24<8:11:32, 5.83s/it] {'loss': 0.4957, 'learning_rate': 1.297305956541607e-05, 'epoch': 0.42} + 42%|████▏ | 3691/8750 [2:08:21<8:11:31, 5.83s/it] 42%|████▏ | 3692/8750 [2:08:29<8:08:06, 5.79s/it] 42%|████▏ | 3692/8750 [2:08:27<8:08:07, 5.79s/it] {'loss': 0.4539, 'learning_rate': 1.2969525089462253e-05, 'epoch': 0.42} + 42%|████▏ | 3692/8750 [2:08:29<8:08:06, 5.79s/it] {'loss': 0.4539, 'learning_rate': 1.2969525089462253e-05, 'epoch': 0.42} + 42%|████▏ | 3692/8750 [2:08:27<8:08:07, 5.79s/it] 42%|████▏ | 3693/8750 [2:08:36<8:18:42, 5.92s/it] 42%|████▏ | 3693/8750 [2:08:33<8:18:42, 5.92s/it] {'loss': 0.4583, 'learning_rate': 1.2965990206617203e-05, 'epoch': 0.42} + 42%|████▏ | 3693/8750 [2:08:36<8:18:42, 5.92s/it] {'loss': 0.4583, 'learning_rate': 1.2965990206617203e-05, 'epoch': 0.42} + 42%|████▏ | 3693/8750 [2:08:33<8:18:42, 5.92s/it] 42%|████▏ | 3694/8750 [2:08:42<8:18:23, 5.91s/it] 42%|████▏ | 3694/8750 [2:08:39<8:18:23, 5.91s/it] {'loss': 0.4544, 'learning_rate': 1.2962454917365275e-05, 'epoch': 0.42} + 42%|████▏ | 3694/8750 [2:08:42<8:18:23, 5.91s/it] {'loss': 0.4544, 'learning_rate': 1.2962454917365275e-05, 'epoch': 0.42} + 42%|████▏ | 3694/8750 [2:08:39<8:18:23, 5.91s/it] 42%|████▏ | 3695/8750 [2:08:47<8:10:56, 5.83s/it] 42%|████▏ | 3695/8750 [2:08:44<8:10:56, 5.83s/it] {'loss': 0.4833, 'learning_rate': 1.2958919222190885e-05, 
'epoch': 0.42} + 42%|████▏ | 3695/8750 [2:08:47<8:10:56, 5.83s/it] {'loss': 0.4833, 'learning_rate': 1.2958919222190885e-05, 'epoch': 0.42} + 42%|████▏ | 3695/8750 [2:08:44<8:10:56, 5.83s/it] 42%|████▏ | 3696/8750 [2:08:53<8:13:17, 5.86s/it] 42%|████▏ | 3696/8750 [2:08:50<8:13:17, 5.86s/it] {'loss': 0.469, 'learning_rate': 1.2955383121578498e-05, 'epoch': 0.42} + 42%|████▏ | 3696/8750 [2:08:53<8:13:17, 5.86s/it] {'loss': 0.469, 'learning_rate': 1.2955383121578498e-05, 'epoch': 0.42} + 42%|████▏ | 3696/8750 [2:08:50<8:13:17, 5.86s/it] 42%|████▏ | 3697/8750 [2:08:59<8:09:44, 5.82s/it] 42%|████▏ | 3697/8750 [2:08:56<8:09:45, 5.82s/it] {'loss': 0.4562, 'learning_rate': 1.2951846616012642e-05, 'epoch': 0.42} + 42%|████▏ | 3697/8750 [2:08:59<8:09:44, 5.82s/it] {'loss': 0.4562, 'learning_rate': 1.2951846616012642e-05, 'epoch': 0.42} + 42%|████▏ | 3697/8750 [2:08:56<8:09:45, 5.82s/it] 42%|████▏ | 3698/8750 [2:09:05<8:09:51, 5.82s/it] 42%|████▏ | 3698/8750 [2:09:02<8:09:51, 5.82s/it] {'loss': 0.4555, 'learning_rate': 1.2948309705977893e-05, 'epoch': 0.42} + 42%|████▏ | 3698/8750 [2:09:05<8:09:51, 5.82s/it] {'loss': 0.4555, 'learning_rate': 1.2948309705977893e-05, 'epoch': 0.42} + 42%|████▏ | 3698/8750 [2:09:02<8:09:51, 5.82s/it] 42%|████▏ | 3699/8750 [2:09:11<8:14:28, 5.87s/it] 42%|████▏ | 3699/8750 [2:09:08<8:14:28, 5.87s/it] {'loss': 0.4657, 'learning_rate': 1.2944772391958896e-05, 'epoch': 0.42} + 42%|████▏ | 3699/8750 [2:09:11<8:14:28, 5.87s/it] {'loss': 0.4657, 'learning_rate': 1.2944772391958896e-05, 'epoch': 0.42} + 42%|████▏ | 3699/8750 [2:09:08<8:14:28, 5.87s/it]10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... 42%|████▏ | 3700/8750 [2:09:16<8:10:52, 5.83s/it] +11 AutoResumeHook: Checking whether to suspend... 
+6 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 42%|████▏ | 3700/8750 [2:09:14<8:10:53, 5.83s/it] {'loss': 0.4753, 'learning_rate': 1.294123467444033e-05, 'epoch': 0.42} + 42%|████▏ | 3700/8750 [2:09:16<8:10:52, 5.83s/it] {'loss': 0.4753, 'learning_rate': 1.294123467444033e-05, 'epoch': 0.42} + 42%|████▏ | 3700/8750 [2:09:14<8:10:53, 5.83s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 42%|████▏ | 3701/8750 [2:09:39<16:19:54, 11.64s/it] 42%|████▏ | 3701/8750 [2:09:42<16:19:57, 11.65s/it] {'loss': 0.4762, 'learning_rate': 1.2937696553906949e-05, 'epoch': 0.42} + {'loss': 0.4762, 'learning_rate': 1.2937696553906949e-05, 'epoch': 0.42} 42%|████▏ | 3701/8750 [2:09:42<16:19:57, 11.65s/it] + 42%|████▏ | 3701/8750 [2:09:39<16:19:54, 11.64s/it] 42%|████▏ | 3702/8750 [2:09:48<13:57:27, 9.95s/it] 42%|████▏ | 3702/8750 [2:09:45<13:57:27, 9.95s/it] {'loss': 0.45, 'learning_rate': 1.2934158030843554e-05, 'epoch': 0.42} + 42%|████▏ | 3702/8750 [2:09:48<13:57:27, 9.95s/it] {'loss': 0.45, 'learning_rate': 1.2934158030843554e-05, 'epoch': 0.42} + 42%|████▏ | 3702/8750 [2:09:45<13:57:27, 9.95s/it] 42%|████▏ | 3703/8750 [2:09:53<12:11:11, 8.69s/it] 42%|████▏ | 3703/8750 [2:09:50<12:11:11, 8.69s/it] {'loss': 0.4803, 'learning_rate': 1.2930619105734999e-05, 'epoch': 0.42} + 42%|████▏ | 3703/8750 [2:09:53<12:11:11, 8.69s/it] {'loss': 0.4803, 'learning_rate': 1.2930619105734999e-05, 'epoch': 0.42} + 42%|████▏ | 3703/8750 [2:09:50<12:11:11, 8.69s/it] 42%|████▏ | 3704/8750 [2:09:59<10:59:40, 7.84s/it] 42%|████▏ | 3704/8750 [2:09:56<10:59:41, 7.84s/it] {'loss': 0.4837, 'learning_rate': 1.2927079779066196e-05, 'epoch': 0.42} + 42%|████▏ | 3704/8750 [2:09:59<10:59:40, 7.84s/it] {'loss': 0.4837, 'learning_rate': 1.2927079779066196e-05, 'epoch': 0.42} + 42%|████▏ | 3704/8750 [2:09:56<10:59:41, 7.84s/it] 42%|████▏ | 3705/8750 [2:10:05<10:06:19, 7.21s/it] 42%|████▏ | 3705/8750 [2:10:02<10:06:19, 7.21s/it] {'loss': 0.4546, 'learning_rate': 1.2923540051322114e-05, 'epoch': 0.42} + 42%|████▏ | 3705/8750 [2:10:05<10:06:19, 7.21s/it] {'loss': 0.4546, 'learning_rate': 1.2923540051322114e-05, 'epoch': 0.42} + 42%|████▏ | 3705/8750 [2:10:02<10:06:19, 7.21s/it] 42%|████▏ | 3706/8750 [2:10:11<9:27:47, 6.75s/it] 42%|████▏ | 3706/8750 [2:10:08<9:27:47, 6.75s/it] {'loss': 0.4731, 'learning_rate': 1.2919999922987775e-05, 'epoch': 0.42} + 42%|████▏ | 3706/8750 
[2:10:11<9:27:47, 6.75s/it] {'loss': 0.4731, 'learning_rate': 1.2919999922987775e-05, 'epoch': 0.42} + 42%|████▏ | 3706/8750 [2:10:08<9:27:47, 6.75s/it] 42%|████▏ | 3707/8750 [2:10:17<9:04:37, 6.48s/it] 42%|████▏ | 3707/8750 [2:10:14<9:04:37, 6.48s/it] {'loss': 0.4707, 'learning_rate': 1.291645939454825e-05, 'epoch': 0.42} + 42%|████▏ | 3707/8750 [2:10:17<9:04:37, 6.48s/it] {'loss': 0.4707, 'learning_rate': 1.291645939454825e-05, 'epoch': 0.42} + 42%|████▏ | 3707/8750 [2:10:14<9:04:37, 6.48s/it] 42%|████▏ | 3708/8750 [2:10:22<8:50:52, 6.32s/it] 42%|████▏ | 3708/8750 [2:10:20<8:50:52, 6.32s/it] {'loss': 0.4518, 'learning_rate': 1.2912918466488678e-05, 'epoch': 0.42} + 42%|████▏ | 3708/8750 [2:10:22<8:50:52, 6.32s/it] {'loss': 0.4518, 'learning_rate': 1.2912918466488678e-05, 'epoch': 0.42} + 42%|████▏ | 3708/8750 [2:10:20<8:50:52, 6.32s/it] 42%|████▏ | 3709/8750 [2:10:28<8:35:54, 6.14s/it] 42%|████▏ | 3709/8750 [2:10:25<8:35:54, 6.14s/it] {'loss': 0.4774, 'learning_rate': 1.2909377139294242e-05, 'epoch': 0.42} + 42%|████▏ | 3709/8750 [2:10:28<8:35:54, 6.14s/it] {'loss': 0.4774, 'learning_rate': 1.2909377139294242e-05, 'epoch': 0.42} + 42%|████▏ | 3709/8750 [2:10:25<8:35:54, 6.14s/it] 42%|████▏ | 3710/8750 [2:10:34<8:24:06, 6.00s/it] 42%|████▏ | 3710/8750 [2:10:31<8:24:06, 6.00s/it] {'loss': 0.4678, 'learning_rate': 1.290583541345018e-05, 'epoch': 0.42} + 42%|████▏ | 3710/8750 [2:10:34<8:24:06, 6.00s/it] {'loss': 0.4678, 'learning_rate': 1.290583541345018e-05, 'epoch': 0.42} + 42%|████▏ | 3710/8750 [2:10:31<8:24:06, 6.00s/it] 42%|████▏ | 3711/8750 [2:10:39<8:12:47, 5.87s/it] 42%|████▏ | 3711/8750 [2:10:36<8:12:47, 5.87s/it] {'loss': 0.4693, 'learning_rate': 1.2902293289441791e-05, 'epoch': 0.42} + 42%|████▏ | 3711/8750 [2:10:39<8:12:47, 5.87s/it] {'loss': 0.4693, 'learning_rate': 1.2902293289441791e-05, 'epoch': 0.42} + 42%|████▏ | 3711/8750 [2:10:37<8:12:47, 5.87s/it] 42%|████▏ | 3712/8750 [2:10:45<8:13:29, 5.88s/it] 42%|████▏ | 3712/8750 [2:10:42<8:13:29, 5.88s/it] 
{'loss': 0.4519, 'learning_rate': 1.2898750767754427e-05, 'epoch': 0.42} + 42%|████▏ | 3712/8750 [2:10:45<8:13:29, 5.88s/it] {'loss': 0.4519, 'learning_rate': 1.2898750767754427e-05, 'epoch': 0.42} + 42%|████▏ | 3712/8750 [2:10:42<8:13:29, 5.88s/it] 42%|████▏ | 3713/8750 [2:10:51<8:12:46, 5.87s/it] 42%|████▏ | 3713/8750 [2:10:48<8:12:46, 5.87s/it] {'loss': 0.4656, 'learning_rate': 1.2895207848873488e-05, 'epoch': 0.42} + 42%|████▏ | 3713/8750 [2:10:51<8:12:46, 5.87s/it] {'loss': 0.4656, 'learning_rate': 1.2895207848873488e-05, 'epoch': 0.42} + 42%|████▏ | 3713/8750 [2:10:48<8:12:46, 5.87s/it] 42%|████▏ | 3714/8750 [2:10:57<8:08:28, 5.82s/it] 42%|████▏ | 3714/8750 [2:10:54<8:08:28, 5.82s/it] {'loss': 0.4719, 'learning_rate': 1.2891664533284434e-05, 'epoch': 0.42} + 42%|████▏ | 3714/8750 [2:10:57<8:08:28, 5.82s/it] {'loss': 0.4719, 'learning_rate': 1.2891664533284434e-05, 'epoch': 0.42} + 42%|████▏ | 3714/8750 [2:10:54<8:08:28, 5.82s/it] 42%|████▏ | 3715/8750 [2:11:03<8:14:50, 5.90s/it] 42%|████▏ | 3715/8750 [2:11:00<8:14:50, 5.90s/it] {'loss': 0.4629, 'learning_rate': 1.288812082147278e-05, 'epoch': 0.42} + 42%|████▏ | 3715/8750 [2:11:03<8:14:50, 5.90s/it] {'loss': 0.4629, 'learning_rate': 1.288812082147278e-05, 'epoch': 0.42} + 42%|████▏ | 3715/8750 [2:11:00<8:14:50, 5.90s/it] 42%|████▏ | 3716/8750 [2:11:09<8:14:44, 5.90s/it] 42%|████▏ | 3716/8750 [2:11:06<8:14:44, 5.90s/it] {'loss': 0.4535, 'learning_rate': 1.2884576713924093e-05, 'epoch': 0.42} + 42%|████▏ | 3716/8750 [2:11:09<8:14:44, 5.90s/it] {'loss': 0.4535, 'learning_rate': 1.2884576713924093e-05, 'epoch': 0.42} + 42%|████▏ | 3716/8750 [2:11:06<8:14:44, 5.90s/it] 42%|████▏ | 3717/8750 [2:11:14<8:07:56, 5.82s/it] 42%|████▏ | 3717/8750 [2:11:12<8:07:56, 5.82s/it] {'loss': 0.4501, 'learning_rate': 1.2881032211123994e-05, 'epoch': 0.42} + 42%|████▏ | 3717/8750 [2:11:14<8:07:56, 5.82s/it] {'loss': 0.4501, 'learning_rate': 1.2881032211123994e-05, 'epoch': 0.42} + 42%|████▏ | 3717/8750 [2:11:12<8:07:56, 5.82s/it] 
42%|████▏ | 3718/8750 [2:11:20<8:03:20, 5.76s/it] 42%|████▏ | 3718/8750 [2:11:17<8:03:20, 5.76s/it] {'loss': 0.4916, 'learning_rate': 1.2877487313558159e-05, 'epoch': 0.42} + 42%|████▏ | 3718/8750 [2:11:20<8:03:20, 5.76s/it] {'loss': 0.4916, 'learning_rate': 1.2877487313558159e-05, 'epoch': 0.42} + 42%|████▏ | 3718/8750 [2:11:17<8:03:20, 5.76s/it] 43%|████▎ | 3719/8750 [2:11:26<8:05:37, 5.79s/it] 43%|████▎ | 3719/8750 [2:11:23<8:05:37, 5.79s/it] {'loss': 0.4555, 'learning_rate': 1.287394202171232e-05, 'epoch': 0.43} + 43%|████▎ | 3719/8750 [2:11:26<8:05:37, 5.79s/it] {'loss': 0.4555, 'learning_rate': 1.287394202171232e-05, 'epoch': 0.43} + 43%|████▎ | 3719/8750 [2:11:23<8:05:37, 5.79s/it] 43%|████▎ | 3720/8750 [2:11:32<8:10:06, 5.85s/it] 43%|████▎ | 3720/8750 [2:11:29<8:10:06, 5.85s/it] {'loss': 0.4693, 'learning_rate': 1.287039633607226e-05, 'epoch': 0.43} + 43%|████▎ | 3720/8750 [2:11:32<8:10:06, 5.85s/it] {'loss': 0.4693, 'learning_rate': 1.287039633607226e-05, 'epoch': 0.43} + 43%|████▎ | 3720/8750 [2:11:29<8:10:06, 5.85s/it] 43%|████▎ | 3721/8750 [2:11:38<8:06:34, 5.81s/it] 43%|████▎ | 3721/8750 [2:11:35<8:06:34, 5.81s/it] {'loss': 0.4711, 'learning_rate': 1.2866850257123817e-05, 'epoch': 0.43} + 43%|████▎ | 3721/8750 [2:11:38<8:06:34, 5.81s/it] {'loss': 0.4711, 'learning_rate': 1.2866850257123817e-05, 'epoch': 0.43} + 43%|████▎ | 3721/8750 [2:11:35<8:06:34, 5.81s/it] 43%|████▎ | 3722/8750 [2:11:43<8:04:32, 5.78s/it] 43%|████▎ | 3722/8750 [2:11:40<8:04:32, 5.78s/it] {'loss': 0.4536, 'learning_rate': 1.2863303785352883e-05, 'epoch': 0.43} + 43%|████▎ | 3722/8750 [2:11:43<8:04:32, 5.78s/it] {'loss': 0.4536, 'learning_rate': 1.2863303785352883e-05, 'epoch': 0.43} + 43%|████▎ | 3722/8750 [2:11:40<8:04:32, 5.78s/it] 43%|████▎ | 3723/8750 [2:11:49<8:03:18, 5.77s/it] 43%|████▎ | 3723/8750 [2:11:46<8:03:18, 5.77s/it] {'loss': 0.4687, 'learning_rate': 1.2859756921245403e-05, 'epoch': 0.43} + 43%|████▎ | 3723/8750 [2:11:49<8:03:18, 5.77s/it] {'loss': 0.4687, 
'learning_rate': 1.2859756921245403e-05, 'epoch': 0.43} + 43%|████▎ | 3723/8750 [2:11:46<8:03:18, 5.77s/it] 43%|████▎ | 3724/8750 [2:11:55<8:03:55, 5.78s/it] 43%|████▎ | 3724/8750 [2:11:52<8:03:55, 5.78s/it] {'loss': 0.4715, 'learning_rate': 1.2856209665287378e-05, 'epoch': 0.43} + 43%|████▎ | 3724/8750 [2:11:55<8:03:55, 5.78s/it] {'loss': 0.4715, 'learning_rate': 1.2856209665287378e-05, 'epoch': 0.43} + 43%|████▎ | 3724/8750 [2:11:52<8:03:55, 5.78s/it] 43%|████▎ | 3725/8750 [2:12:01<8:04:07, 5.78s/it] 43%|████▎ | 3725/8750 [2:11:58<8:04:08, 5.78s/it] {'loss': 0.4529, 'learning_rate': 1.2852662017964863e-05, 'epoch': 0.43} + 43%|████▎ | 3725/8750 [2:12:01<8:04:07, 5.78s/it] {'loss': 0.4529, 'learning_rate': 1.2852662017964863e-05, 'epoch': 0.43} + 43%|████▎ | 3725/8750 [2:11:58<8:04:08, 5.78s/it] 43%|████▎ | 3726/8750 [2:12:06<8:01:41, 5.75s/it] 43%|████▎ | 3726/8750 [2:12:03<8:01:41, 5.75s/it] {'loss': 0.4851, 'learning_rate': 1.2849113979763956e-05, 'epoch': 0.43} + 43%|████▎ | 3726/8750 [2:12:06<8:01:41, 5.75s/it] {'loss': 0.4851, 'learning_rate': 1.2849113979763956e-05, 'epoch': 0.43} + 43%|████▎ | 3726/8750 [2:12:03<8:01:41, 5.75s/it] 43%|████▎ | 3727/8750 [2:12:12<7:57:52, 5.71s/it] 43%|████▎ | 3727/8750 [2:12:09<7:57:52, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.2845565551170829e-05, 'epoch': 0.43} + 43%|████▎ | 3727/8750 [2:12:12<7:57:52, 5.71s/it] {'loss': 0.4807, 'learning_rate': 1.2845565551170829e-05, 'epoch': 0.43} + 43%|████▎ | 3727/8750 [2:12:09<7:57:52, 5.71s/it] 43%|████▎ | 3728/8750 [2:12:18<7:59:39, 5.73s/it] 43%|████▎ | 3728/8750 [2:12:15<7:59:40, 5.73s/it] {'loss': 0.456, 'learning_rate': 1.2842016732671689e-05, 'epoch': 0.43} + 43%|████▎ | 3728/8750 [2:12:18<7:59:39, 5.73s/it] {'loss': 0.456, 'learning_rate': 1.2842016732671689e-05, 'epoch': 0.43} + 43%|████▎ | 3728/8750 [2:12:15<7:59:40, 5.73s/it] 43%|████▎ | 3729/8750 [2:12:23<7:59:03, 5.72s/it] 43%|████▎ | 3729/8750 [2:12:21<7:59:03, 5.72s/it] {'loss': 0.4591, 'learning_rate': 
1.2838467524752808e-05, 'epoch': 0.43} + 43%|████▎ | 3729/8750 [2:12:23<7:59:03, 5.72s/it] {'loss': 0.4591, 'learning_rate': 1.2838467524752808e-05, 'epoch': 0.43} + 43%|████▎ | 3729/8750 [2:12:21<7:59:03, 5.72s/it] 43%|████▎ | 3730/8750 [2:12:29<7:58:59, 5.73s/it] 43%|████▎ | 3730/8750 [2:12:26<7:58:59, 5.73s/it] {'loss': 0.4645, 'learning_rate': 1.2834917927900504e-05, 'epoch': 0.43} + 43%|████▎ | 3730/8750 [2:12:29<7:58:59, 5.73s/it] {'loss': 0.4645, 'learning_rate': 1.2834917927900504e-05, 'epoch': 0.43} + 43%|████▎ | 3730/8750 [2:12:26<7:58:59, 5.73s/it] 43%|████▎ | 3731/8750 [2:12:35<8:00:21, 5.74s/it] 43%|████▎ | 3731/8750 [2:12:32<8:00:20, 5.74s/it] {'loss': 0.4533, 'learning_rate': 1.2831367942601146e-05, 'epoch': 0.43} + 43%|████▎ | 3731/8750 [2:12:35<8:00:21, 5.74s/it] {'loss': 0.4533, 'learning_rate': 1.2831367942601146e-05, 'epoch': 0.43} + 43%|████▎ | 3731/8750 [2:12:32<8:00:20, 5.74s/it] 43%|████▎ | 3732/8750 [2:12:41<7:56:47, 5.70s/it] 43%|████▎ | 3732/8750 [2:12:38<7:56:47, 5.70s/it] {'loss': 0.4886, 'learning_rate': 1.2827817569341167e-05, 'epoch': 0.43} + 43%|████▎ | 3732/8750 [2:12:41<7:56:47, 5.70s/it] {'loss': 0.4886, 'learning_rate': 1.2827817569341167e-05, 'epoch': 0.43} + 43%|████▎ | 3732/8750 [2:12:38<7:56:47, 5.70s/it] 43%|████▎ | 3733/8750 [2:12:43<7:56:55, 5.70s/it] 43%|████▎ | 3733/8750 [2:12:46<7:56:56, 5.70s/it] {'loss': 0.4656, 'learning_rate': 1.282426680860705e-05, 'epoch': 0.43} + 43%|████▎ | 3733/8750 [2:12:46<7:56:56, 5.70s/it] {'loss': 0.4656, 'learning_rate': 1.282426680860705e-05, 'epoch': 0.43} + 43%|████▎ | 3733/8750 [2:12:43<7:56:55, 5.70s/it] 43%|████▎ | 3734/8750 [2:12:52<8:01:33, 5.76s/it] 43%|████▎ | 3734/8750 [2:12:49<8:01:33, 5.76s/it] {'loss': 0.4773, 'learning_rate': 1.2820715660885328e-05, 'epoch': 0.43} + 43%|████▎ | 3734/8750 [2:12:52<8:01:33, 5.76s/it] {'loss': 0.4773, 'learning_rate': 1.2820715660885328e-05, 'epoch': 0.43} + 43%|████▎ | 3734/8750 [2:12:49<8:01:33, 5.76s/it] 43%|████▎ | 3735/8750 
[2:12:58<8:05:35, 5.81s/it] 43%|████▎ | 3735/8750 [2:12:55<8:05:36, 5.81s/it] {'loss': 0.4702, 'learning_rate': 1.2817164126662581e-05, 'epoch': 0.43} + 43%|████▎ | 3735/8750 [2:12:58<8:05:35, 5.81s/it] {'loss': 0.4702, 'learning_rate': 1.2817164126662581e-05, 'epoch': 0.43} + 43%|████▎ | 3735/8750 [2:12:55<8:05:36, 5.81s/it] 43%|████▎ | 3736/8750 [2:13:04<7:59:53, 5.74s/it] 43%|████▎ | 3736/8750 [2:13:01<7:59:53, 5.74s/it] {'loss': 0.4791, 'learning_rate': 1.281361220642545e-05, 'epoch': 0.43} + 43%|████▎ | 3736/8750 [2:13:04<7:59:53, 5.74s/it] {'loss': 0.4791, 'learning_rate': 1.281361220642545e-05, 'epoch': 0.43} + 43%|████▎ | 3736/8750 [2:13:01<7:59:53, 5.74s/it] 43%|████▎ | 3737/8750 [2:13:10<8:03:50, 5.79s/it] 43%|████▎ | 3737/8750 [2:13:07<8:03:50, 5.79s/it] {'loss': 0.4713, 'learning_rate': 1.281005990066063e-05, 'epoch': 0.43} + 43%|████▎ | 3737/8750 [2:13:10<8:03:50, 5.79s/it] {'loss': 0.4713, 'learning_rate': 1.281005990066063e-05, 'epoch': 0.43} + 43%|████▎ | 3737/8750 [2:13:07<8:03:50, 5.79s/it] 43%|████▎ | 3738/8750 [2:13:15<8:02:38, 5.78s/it] 43%|████▎ | 3738/8750 [2:13:12<8:02:38, 5.78s/it] {'loss': 0.4736, 'learning_rate': 1.280650720985487e-05, 'epoch': 0.43} + 43%|████▎ | 3738/8750 [2:13:15<8:02:38, 5.78s/it] {'loss': 0.4736, 'learning_rate': 1.280650720985487e-05, 'epoch': 0.43} + 43%|████▎ | 3738/8750 [2:13:12<8:02:38, 5.78s/it] 43%|████▎ | 3739/8750 [2:13:21<8:06:31, 5.83s/it] 43%|████▎ | 3739/8750 [2:13:18<8:06:31, 5.83s/it] {'loss': 0.449, 'learning_rate': 1.2802954134494963e-05, 'epoch': 0.43} + 43%|████▎ | 3739/8750 [2:13:21<8:06:31, 5.83s/it] {'loss': 0.449, 'learning_rate': 1.2802954134494963e-05, 'epoch': 0.43} + 43%|████▎ | 3739/8750 [2:13:18<8:06:31, 5.83s/it] 43%|████▎ | 3740/8750 [2:13:27<8:05:16, 5.81s/it] 43%|████▎ | 3740/8750 [2:13:24<8:05:17, 5.81s/it] {'loss': 0.4865, 'learning_rate': 1.2799400675067754e-05, 'epoch': 0.43} + 43%|████▎ | 3740/8750 [2:13:27<8:05:16, 5.81s/it] {'loss': 0.4865, 'learning_rate': 
1.2799400675067754e-05, 'epoch': 0.43} + 43%|████▎ | 3740/8750 [2:13:24<8:05:17, 5.81s/it] 43%|████▎ | 3741/8750 [2:13:33<8:01:57, 5.77s/it] 43%|████▎ | 3741/8750 [2:13:30<8:01:57, 5.77s/it] {'loss': 0.4372, 'learning_rate': 1.2795846832060157e-05, 'epoch': 0.43} + 43%|████▎ | 3741/8750 [2:13:33<8:01:57, 5.77s/it] {'loss': 0.4372, 'learning_rate': 1.2795846832060157e-05, 'epoch': 0.43} + 43%|████▎ | 3741/8750 [2:13:30<8:01:57, 5.77s/it] 43%|████▎ | 3742/8750 [2:13:39<8:02:23, 5.78s/it] 43%|████▎ | 3742/8750 [2:13:36<8:02:23, 5.78s/it] {'loss': 0.4791, 'learning_rate': 1.2792292605959125e-05, 'epoch': 0.43} + 43%|████▎ | 3742/8750 [2:13:39<8:02:23, 5.78s/it] {'loss': 0.4791, 'learning_rate': 1.2792292605959125e-05, 'epoch': 0.43} + 43%|████▎ | 3742/8750 [2:13:36<8:02:23, 5.78s/it] 43%|████▎ | 3743/8750 [2:13:44<7:58:05, 5.73s/it] 43%|████▎ | 3743/8750 [2:13:41<7:58:04, 5.73s/it] {'loss': 0.4706, 'learning_rate': 1.2788737997251665e-05, 'epoch': 0.43} + 43%|████▎ | 3743/8750 [2:13:44<7:58:05, 5.73s/it] {'loss': 0.4706, 'learning_rate': 1.2788737997251665e-05, 'epoch': 0.43} + 43%|████▎ | 3743/8750 [2:13:41<7:58:04, 5.73s/it] 43%|████▎ | 3744/8750 [2:13:50<8:01:10, 5.77s/it] 43%|████▎ | 3744/8750 [2:13:47<8:01:10, 5.77s/it] {'loss': 0.4556, 'learning_rate': 1.2785183006424836e-05, 'epoch': 0.43} + 43%|████▎ | 3744/8750 [2:13:50<8:01:10, 5.77s/it] {'loss': 0.4556, 'learning_rate': 1.2785183006424836e-05, 'epoch': 0.43} + 43%|████▎ | 3744/8750 [2:13:47<8:01:10, 5.77s/it] 43%|████▎ | 3745/8750 [2:13:56<8:00:04, 5.76s/it] 43%|████▎ | 3745/8750 [2:13:53<8:00:03, 5.76s/it] {'loss': 0.4523, 'learning_rate': 1.278162763396575e-05, 'epoch': 0.43} + 43%|████▎ | 3745/8750 [2:13:56<8:00:04, 5.76s/it] {'loss': 0.4523, 'learning_rate': 1.278162763396575e-05, 'epoch': 0.43} + 43%|████▎ | 3745/8750 [2:13:53<8:00:03, 5.76s/it] 43%|████▎ | 3746/8750 [2:14:02<7:59:46, 5.75s/it] 43%|████▎ | 3746/8750 [2:13:59<7:59:47, 5.75s/it] {'loss': 0.4699, 'learning_rate': 1.2778071880361577e-05, 
'epoch': 0.43} + 43%|████▎ | 3746/8750 [2:14:02<7:59:46, 5.75s/it] {'loss': 0.4699, 'learning_rate': 1.2778071880361577e-05, 'epoch': 0.43} + 43%|████▎ | 3746/8750 [2:13:59<7:59:47, 5.75s/it] 43%|████▎ | 3747/8750 [2:14:07<8:00:22, 5.76s/it] 43%|████▎ | 3747/8750 [2:14:04<8:00:22, 5.76s/it] {'loss': 0.4654, 'learning_rate': 1.2774515746099536e-05, 'epoch': 0.43} + 43%|████▎ | 3747/8750 [2:14:07<8:00:22, 5.76s/it] {'loss': 0.4654, 'learning_rate': 1.2774515746099536e-05, 'epoch': 0.43} + 43%|████▎ | 3747/8750 [2:14:04<8:00:22, 5.76s/it] 43%|████▎ | 3748/8750 [2:14:10<8:05:53, 5.83s/it] 43%|████▎ | 3748/8750 [2:14:13<8:05:53, 5.83s/it] {'loss': 0.4471, 'learning_rate': 1.277095923166689e-05, 'epoch': 0.43} + 43%|████▎ | 3748/8750 [2:14:10<8:05:53, 5.83s/it] {'loss': 0.4471, 'learning_rate': 1.277095923166689e-05, 'epoch': 0.43} + 43%|████▎ | 3748/8750 [2:14:13<8:05:53, 5.83s/it] 43%|████▎ | 3749/8750 [2:14:19<8:03:15, 5.80s/it] 43%|████▎ | 3749/8750 [2:14:16<8:03:15, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.2767402337550966e-05, 'epoch': 0.43} + 43%|████▎ | 3749/8750 [2:14:19<8:03:15, 5.80s/it] {'loss': 0.4751, 'learning_rate': 1.2767402337550966e-05, 'epoch': 0.43} + 43%|████▎ | 3749/8750 [2:14:16<8:03:15, 5.80s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...1 AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... + 43%|████▎ | 3750/8750 [2:14:25<8:00:35, 5.77s/it]9 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 13 AutoResumeHook: Checking whether to suspend...2 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 43%|████▎ | 3750/8750 [2:14:22<8:00:35, 5.77s/it] + +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 
+15 AutoResumeHook: Checking whether to suspend...5 AutoResumeHook: Checking whether to suspend... + +12 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4884, 'learning_rate': 1.2763845064239134e-05, 'epoch': 0.43} + {'loss': 0.4884, 'learning_rate': 1.2763845064239134e-05, 'epoch': 0.43} + 43%|████▎ | 3750/8750 [2:14:25<8:00:35, 5.77s/it] 43%|████▎ | 3750/8750 [2:14:22<8:00:35, 5.77s/it] 43%|████▎ | 3751/8750 [2:14:31<8:03:15, 5.80s/it] 43%|████▎ | 3751/8750 [2:14:28<8:03:15, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2760287412218824e-05, 'epoch': 0.43} + 43%|████▎ | 3751/8750 [2:14:31<8:03:15, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2760287412218824e-05, 'epoch': 0.43} + 43%|████▎ | 3751/8750 [2:14:28<8:03:15, 5.80s/it] 43%|████▎ | 3752/8750 [2:14:33<8:01:02, 5.77s/it] 43%|████▎ | 3752/8750 [2:14:36<8:01:02, 5.77s/it] {'loss': 0.4848, 'learning_rate': 1.275672938197751e-05, 'epoch': 0.43} + {'loss': 0.4848, 'learning_rate': 1.275672938197751e-05, 'epoch': 0.43} + 43%|████▎ | 3752/8750 [2:14:36<8:01:02, 5.77s/it] 43%|████▎ | 3752/8750 [2:14:33<8:01:02, 5.77s/it] 43%|████▎ | 3753/8750 [2:14:42<8:01:42, 5.78s/it] 43%|████▎ | 3753/8750 [2:14:39<8:01:42, 5.78s/it] {'loss': 0.4542, 'learning_rate': 1.2753170974002727e-05, 'epoch': 0.43} + 43%|████▎ | 3753/8750 [2:14:42<8:01:42, 5.78s/it] {'loss': 0.4542, 'learning_rate': 1.2753170974002727e-05, 'epoch': 0.43} + 43%|████▎ | 3753/8750 [2:14:39<8:01:42, 5.78s/it] 43%|████▎ | 3754/8750 [2:14:48<8:06:44, 5.85s/it] 43%|████▎ | 3754/8750 [2:14:45<8:06:44, 5.85s/it] {'loss': 0.4643, 'learning_rate': 1.2749612188782048e-05, 'epoch': 0.43} + 43%|████▎ | 3754/8750 [2:14:48<8:06:44, 5.85s/it] {'loss': 0.4643, 'learning_rate': 1.2749612188782048e-05, 'epoch': 0.43} + 43%|████▎ | 3754/8750 [2:14:45<8:06:44, 5.85s/it] 43%|████▎ | 3755/8750 [2:14:54<8:02:18, 5.79s/it] 43%|████▎ | 3755/8750 [2:14:51<8:02:18, 5.79s/it] {'loss': 0.46, 'learning_rate': 1.2746053026803114e-05, 'epoch': 0.43} + 43%|████▎ | 3755/8750 
[2:14:54<8:02:18, 5.79s/it] {'loss': 0.46, 'learning_rate': 1.2746053026803114e-05, 'epoch': 0.43} + 43%|████▎ | 3755/8750 [2:14:51<8:02:18, 5.79s/it] 43%|████▎ | 3756/8750 [2:15:00<8:07:43, 5.86s/it] 43%|████▎ | 3756/8750 [2:14:57<8:07:43, 5.86s/it] {'loss': 0.4519, 'learning_rate': 1.2742493488553606e-05, 'epoch': 0.43} + 43%|████▎ | 3756/8750 [2:15:00<8:07:43, 5.86s/it] {'loss': 0.4519, 'learning_rate': 1.2742493488553606e-05, 'epoch': 0.43} + 43%|████▎ | 3756/8750 [2:14:57<8:07:43, 5.86s/it] 43%|████▎ | 3757/8750 [2:15:06<8:04:38, 5.82s/it] 43%|████▎ | 3757/8750 [2:15:03<8:04:38, 5.82s/it] {'loss': 0.4655, 'learning_rate': 1.2738933574521262e-05, 'epoch': 0.43} + 43%|████▎ | 3757/8750 [2:15:06<8:04:38, 5.82s/it] {'loss': 0.4655, 'learning_rate': 1.2738933574521262e-05, 'epoch': 0.43} + 43%|████▎ | 3757/8750 [2:15:03<8:04:38, 5.82s/it] 43%|████▎ | 3758/8750 [2:15:11<8:01:10, 5.78s/it] 43%|████▎ | 3758/8750 [2:15:08<8:01:10, 5.78s/it] {'loss': 0.4716, 'learning_rate': 1.2735373285193867e-05, 'epoch': 0.43} + 43%|████▎ | 3758/8750 [2:15:11<8:01:10, 5.78s/it] {'loss': 0.4716, 'learning_rate': 1.2735373285193867e-05, 'epoch': 0.43} + 43%|████▎ | 3758/8750 [2:15:08<8:01:10, 5.78s/it] 43%|████▎ | 3759/8750 [2:15:17<8:01:31, 5.79s/it] 43%|████▎ | 3759/8750 [2:15:14<8:01:31, 5.79s/it] {'loss': 0.469, 'learning_rate': 1.2731812621059262e-05, 'epoch': 0.43} + 43%|████▎ | 3759/8750 [2:15:17<8:01:31, 5.79s/it] {'loss': 0.469, 'learning_rate': 1.2731812621059262e-05, 'epoch': 0.43} + 43%|████▎ | 3759/8750 [2:15:14<8:01:31, 5.79s/it] 43%|████▎ | 3760/8750 [2:15:23<8:03:06, 5.81s/it] 43%|████▎ | 3760/8750 [2:15:20<8:03:07, 5.81s/it] {'loss': 0.4608, 'learning_rate': 1.2728251582605335e-05, 'epoch': 0.43} + 43%|████▎ | 3760/8750 [2:15:23<8:03:06, 5.81s/it] {'loss': 0.4608, 'learning_rate': 1.2728251582605335e-05, 'epoch': 0.43} + 43%|████▎ | 3760/8750 [2:15:20<8:03:07, 5.81s/it] 43%|████▎ | 3761/8750 [2:15:28<7:56:54, 5.74s/it] 43%|████▎ | 3761/8750 [2:15:25<7:56:54, 5.74s/it] 
{'loss': 0.4635, 'learning_rate': 1.2724690170320031e-05, 'epoch': 0.43} + 43%|████▎ | 3761/8750 [2:15:28<7:56:54, 5.74s/it] {'loss': 0.4635, 'learning_rate': 1.2724690170320031e-05, 'epoch': 0.43} + 43%|████▎ | 3761/8750 [2:15:26<7:56:54, 5.74s/it] 43%|████▎ | 3762/8750 [2:15:34<7:56:37, 5.73s/it] 43%|████▎ | 3762/8750 [2:15:31<7:56:37, 5.73s/it] {'loss': 0.4552, 'learning_rate': 1.2721128384691342e-05, 'epoch': 0.43} + 43%|████▎ | 3762/8750 [2:15:34<7:56:37, 5.73s/it] {'loss': 0.4552, 'learning_rate': 1.2721128384691342e-05, 'epoch': 0.43} + 43%|████▎ | 3762/8750 [2:15:31<7:56:37, 5.73s/it] 43%|████▎ | 3763/8750 [2:15:40<8:01:56, 5.80s/it] 43%|████▎ | 3763/8750 [2:15:37<8:01:56, 5.80s/it] {'loss': 0.4805, 'learning_rate': 1.2717566226207311e-05, 'epoch': 0.43} + 43%|████▎ | 3763/8750 [2:15:40<8:01:56, 5.80s/it] {'loss': 0.4805, 'learning_rate': 1.2717566226207311e-05, 'epoch': 0.43} + 43%|████▎ | 3763/8750 [2:15:37<8:01:56, 5.80s/it] 43%|████▎ | 3764/8750 [2:15:46<8:02:23, 5.80s/it] 43%|████▎ | 3764/8750 [2:15:43<8:02:22, 5.80s/it] {'loss': 0.4595, 'learning_rate': 1.2714003695356037e-05, 'epoch': 0.43} + 43%|████▎ | 3764/8750 [2:15:46<8:02:23, 5.80s/it] {'loss': 0.4595, 'learning_rate': 1.2714003695356037e-05, 'epoch': 0.43} + 43%|████▎ | 3764/8750 [2:15:43<8:02:22, 5.80s/it] 43%|████▎ | 3765/8750 [2:15:52<8:08:28, 5.88s/it] 43%|████▎ | 3765/8750 [2:15:49<8:08:28, 5.88s/it] {'loss': 0.4617, 'learning_rate': 1.2710440792625662e-05, 'epoch': 0.43} + 43%|████▎ | 3765/8750 [2:15:52<8:08:28, 5.88s/it] {'loss': 0.4617, 'learning_rate': 1.2710440792625662e-05, 'epoch': 0.43} + 43%|████▎ | 3765/8750 [2:15:49<8:08:28, 5.88s/it] 43%|████▎ | 3766/8750 [2:15:58<8:03:38, 5.82s/it] 43%|████▎ | 3766/8750 [2:15:55<8:03:39, 5.82s/it] {'loss': 0.485, 'learning_rate': 1.2706877518504384e-05, 'epoch': 0.43} + 43%|████▎ | 3766/8750 [2:15:58<8:03:38, 5.82s/it] {'loss': 0.485, 'learning_rate': 1.2706877518504384e-05, 'epoch': 0.43} + 43%|████▎ | 3766/8750 [2:15:55<8:03:39, 5.82s/it] 
43%|████▎ | 3767/8750 [2:16:04<8:04:57, 5.84s/it] 43%|████▎ | 3767/8750 [2:16:01<8:04:57, 5.84s/it] {'loss': 0.462, 'learning_rate': 1.2703313873480451e-05, 'epoch': 0.43} + 43%|████▎ | 3767/8750 [2:16:04<8:04:57, 5.84s/it] {'loss': 0.462, 'learning_rate': 1.2703313873480451e-05, 'epoch': 0.43} + 43%|████▎ | 3767/8750 [2:16:01<8:04:57, 5.84s/it] 43%|████▎ | 3768/8750 [2:16:06<8:03:55, 5.83s/it] 43%|████▎ | 3768/8750 [2:16:09<8:03:57, 5.83s/it] {'loss': 0.4743, 'learning_rate': 1.2699749858042164e-05, 'epoch': 0.43} + 43%|████▎ | 3768/8750 [2:16:09<8:03:57, 5.83s/it] {'loss': 0.4743, 'learning_rate': 1.2699749858042164e-05, 'epoch': 0.43} + 43%|████▎ | 3768/8750 [2:16:06<8:03:55, 5.83s/it] 43%|████▎ | 3769/8750 [2:16:15<8:04:27, 5.84s/it] 43%|████▎ | 3769/8750 [2:16:12<8:04:27, 5.84s/it] {'loss': 0.4731, 'learning_rate': 1.269618547267787e-05, 'epoch': 0.43} + 43%|████▎ | 3769/8750 [2:16:15<8:04:27, 5.84s/it] {'loss': 0.4731, 'learning_rate': 1.269618547267787e-05, 'epoch': 0.43} + 43%|████▎ | 3769/8750 [2:16:12<8:04:27, 5.84s/it] 43%|████▎ | 3770/8750 [2:16:18<8:00:28, 5.79s/it] 43%|████▎ | 3770/8750 [2:16:21<8:00:29, 5.79s/it] {'loss': 0.4591, 'learning_rate': 1.2692620717875972e-05, 'epoch': 0.43} + 43%|████▎ | 3770/8750 [2:16:21<8:00:29, 5.79s/it] {'loss': 0.4591, 'learning_rate': 1.2692620717875972e-05, 'epoch': 0.43} + 43%|████▎ | 3770/8750 [2:16:18<8:00:28, 5.79s/it] 43%|████▎ | 3771/8750 [2:16:27<7:58:48, 5.77s/it] 43%|████▎ | 3771/8750 [2:16:24<7:58:49, 5.77s/it] {'loss': 0.4616, 'learning_rate': 1.2689055594124919e-05, 'epoch': 0.43} + 43%|████▎ | 3771/8750 [2:16:27<7:58:48, 5.77s/it] {'loss': 0.4616, 'learning_rate': 1.2689055594124919e-05, 'epoch': 0.43} + 43%|████▎ | 3771/8750 [2:16:24<7:58:49, 5.77s/it] 43%|████▎ | 3772/8750 [2:16:33<8:03:38, 5.83s/it] 43%|████▎ | 3772/8750 [2:16:30<8:03:38, 5.83s/it] {'loss': 0.4946, 'learning_rate': 1.2685490101913214e-05, 'epoch': 0.43} + 43%|████▎ | 3772/8750 [2:16:33<8:03:38, 5.83s/it] {'loss': 0.4946, 
'learning_rate': 1.2685490101913214e-05, 'epoch': 0.43} + 43%|████▎ | 3772/8750 [2:16:30<8:03:38, 5.83s/it] 43%|████▎ | 3773/8750 [2:16:38<7:58:54, 5.77s/it] 43%|████▎ | 3773/8750 [2:16:35<7:58:54, 5.77s/it] {'loss': 0.4563, 'learning_rate': 1.2681924241729409e-05, 'epoch': 0.43} + 43%|████▎ | 3773/8750 [2:16:38<7:58:54, 5.77s/it] {'loss': 0.4563, 'learning_rate': 1.2681924241729409e-05, 'epoch': 0.43} + 43%|████▎ | 3773/8750 [2:16:35<7:58:54, 5.77s/it] 43%|████▎ | 3774/8750 [2:16:44<8:00:06, 5.79s/it] 43%|████▎ | 3774/8750 [2:16:41<8:00:06, 5.79s/it] {'loss': 0.4606, 'learning_rate': 1.2678358014062104e-05, 'epoch': 0.43} + 43%|████▎ | 3774/8750 [2:16:44<8:00:06, 5.79s/it] {'loss': 0.4606, 'learning_rate': 1.2678358014062104e-05, 'epoch': 0.43} + 43%|████▎ | 3774/8750 [2:16:41<8:00:06, 5.79s/it] 43%|████▎ | 3775/8750 [2:16:50<7:58:13, 5.77s/it] 43%|████▎ | 3775/8750 [2:16:47<7:58:13, 5.77s/it] {'loss': 0.4596, 'learning_rate': 1.2674791419399956e-05, 'epoch': 0.43} + 43%|████▎ | 3775/8750 [2:16:50<7:58:13, 5.77s/it] {'loss': 0.4596, 'learning_rate': 1.2674791419399956e-05, 'epoch': 0.43} + 43%|████▎ | 3775/8750 [2:16:47<7:58:13, 5.77s/it] 43%|████▎ | 3776/8750 [2:16:55<7:55:48, 5.74s/it] 43%|████▎ | 3776/8750 [2:16:53<7:55:48, 5.74s/it] {'loss': 0.5041, 'learning_rate': 1.2671224458231664e-05, 'epoch': 0.43} + 43%|████▎ | 3776/8750 [2:16:55<7:55:48, 5.74s/it] {'loss': 0.5041, 'learning_rate': 1.2671224458231664e-05, 'epoch': 0.43} + 43%|████▎ | 3776/8750 [2:16:53<7:55:48, 5.74s/it] 43%|████▎ | 3777/8750 [2:17:01<7:57:25, 5.76s/it] 43%|████▎ | 3777/8750 [2:16:58<7:57:25, 5.76s/it] {'loss': 0.4335, 'learning_rate': 1.2667657131045983e-05, 'epoch': 0.43} + 43%|████▎ | 3777/8750 [2:17:01<7:57:25, 5.76s/it] {'loss': 0.4335, 'learning_rate': 1.2667657131045983e-05, 'epoch': 0.43} + 43%|████▎ | 3777/8750 [2:16:58<7:57:25, 5.76s/it] 43%|████▎ | 3778/8750 [2:17:04<7:56:56, 5.76s/it] 43%|████▎ | 3778/8750 [2:17:07<7:56:57, 5.76s/it] {'loss': 0.4698, 'learning_rate': 
1.2664089438331716e-05, 'epoch': 0.43} + 43%|████▎ | 3778/8750 [2:17:07<7:56:57, 5.76s/it] {'loss': 0.4698, 'learning_rate': 1.2664089438331716e-05, 'epoch': 0.43} + 43%|████▎ | 3778/8750 [2:17:04<7:56:56, 5.76s/it] 43%|████▎ | 3779/8750 [2:17:13<7:57:21, 5.76s/it] 43%|████▎ | 3779/8750 [2:17:10<7:57:22, 5.76s/it] {'loss': 0.4479, 'learning_rate': 1.266052138057772e-05, 'epoch': 0.43} + 43%|████▎ | 3779/8750 [2:17:13<7:57:21, 5.76s/it] {'loss': 0.4479, 'learning_rate': 1.266052138057772e-05, 'epoch': 0.43} + 43%|████▎ | 3779/8750 [2:17:10<7:57:22, 5.76s/it] 43%|████▎ | 3780/8750 [2:17:19<7:59:53, 5.79s/it] 43%|████▎ | 3780/8750 [2:17:16<7:59:53, 5.79s/it] {'loss': 0.4608, 'learning_rate': 1.2656952958272893e-05, 'epoch': 0.43} + 43%|████▎ | 3780/8750 [2:17:19<7:59:53, 5.79s/it] {'loss': 0.4608, 'learning_rate': 1.2656952958272893e-05, 'epoch': 0.43} + 43%|████▎ | 3780/8750 [2:17:16<7:59:53, 5.79s/it] 43%|████▎ | 3781/8750 [2:17:24<7:54:27, 5.73s/it] 43%|████▎ | 3781/8750 [2:17:21<7:54:27, 5.73s/it] {'loss': 0.4745, 'learning_rate': 1.2653384171906192e-05, 'epoch': 0.43} + 43%|████▎ | 3781/8750 [2:17:24<7:54:27, 5.73s/it] {'loss': 0.4745, 'learning_rate': 1.2653384171906192e-05, 'epoch': 0.43} + 43%|████▎ | 3781/8750 [2:17:21<7:54:27, 5.73s/it] 43%|████▎ | 3782/8750 [2:17:30<7:59:39, 5.79s/it] 43%|████▎ | 3782/8750 [2:17:27<7:59:39, 5.79s/it] {'loss': 0.4606, 'learning_rate': 1.264981502196662e-05, 'epoch': 0.43} + {'loss': 0.4606, 'learning_rate': 1.264981502196662e-05, 'epoch': 0.43} 43%|████▎ | 3782/8750 [2:17:30<7:59:39, 5.79s/it] + 43%|████▎ | 3782/8750 [2:17:27<7:59:39, 5.79s/it] 43%|████▎ | 3783/8750 [2:17:36<8:01:11, 5.81s/it] 43%|████▎ | 3783/8750 [2:17:33<8:01:11, 5.81s/it] {'loss': 0.47, 'learning_rate': 1.2646245508943227e-05, 'epoch': 0.43} + 43%|████▎ | 3783/8750 [2:17:36<8:01:11, 5.81s/it] {'loss': 0.47, 'learning_rate': 1.2646245508943227e-05, 'epoch': 0.43} + 43%|████▎ | 3783/8750 [2:17:33<8:01:11, 5.81s/it] 43%|████▎ | 3784/8750 [2:17:42<7:56:35, 
5.76s/it] 43%|████▎ | 3784/8750 [2:17:39<7:56:35, 5.76s/it] {'loss': 0.4923, 'learning_rate': 1.2642675633325122e-05, 'epoch': 0.43} + 43%|████▎ | 3784/8750 [2:17:42<7:56:35, 5.76s/it] {'loss': 0.4923, 'learning_rate': 1.2642675633325122e-05, 'epoch': 0.43} + 43%|████▎ | 3784/8750 [2:17:39<7:56:35, 5.76s/it] 43%|████▎ | 3785/8750 [2:17:47<7:58:05, 5.78s/it] 43%|████▎ | 3785/8750 [2:17:45<7:58:05, 5.78s/it] {'loss': 0.4503, 'learning_rate': 1.2639105395601452e-05, 'epoch': 0.43} + 43%|████▎ | 3785/8750 [2:17:47<7:58:05, 5.78s/it] {'loss': 0.4503, 'learning_rate': 1.2639105395601452e-05, 'epoch': 0.43} + 43%|████▎ | 3785/8750 [2:17:45<7:58:05, 5.78s/it] 43%|████▎ | 3786/8750 [2:17:53<7:55:52, 5.75s/it] 43%|████▎ | 3786/8750 [2:17:50<7:55:52, 5.75s/it] {'loss': 0.4632, 'learning_rate': 1.2635534796261424e-05, 'epoch': 0.43} + 43%|████▎ | 3786/8750 [2:17:53<7:55:52, 5.75s/it] {'loss': 0.4632, 'learning_rate': 1.2635534796261424e-05, 'epoch': 0.43} + 43%|████▎ | 3786/8750 [2:17:50<7:55:52, 5.75s/it] 43%|████▎ | 3787/8750 [2:17:59<7:54:56, 5.74s/it] 43%|████▎ | 3787/8750 [2:17:56<7:54:56, 5.74s/it] {'loss': 0.4807, 'learning_rate': 1.2631963835794285e-05, 'epoch': 0.43} + 43%|████▎ | 3787/8750 [2:17:59<7:54:56, 5.74s/it] {'loss': 0.4807, 'learning_rate': 1.2631963835794285e-05, 'epoch': 0.43} + 43%|████▎ | 3787/8750 [2:17:56<7:54:56, 5.74s/it] 43%|████▎ | 3788/8750 [2:18:05<7:53:06, 5.72s/it] 43%|████▎ | 3788/8750 [2:18:02<7:53:06, 5.72s/it] {'loss': 0.4647, 'learning_rate': 1.2628392514689339e-05, 'epoch': 0.43} + 43%|████▎ | 3788/8750 [2:18:05<7:53:06, 5.72s/it] {'loss': 0.4647, 'learning_rate': 1.2628392514689339e-05, 'epoch': 0.43} + 43%|████▎ | 3788/8750 [2:18:02<7:53:06, 5.72s/it] 43%|████▎ | 3789/8750 [2:18:11<8:01:03, 5.82s/it] 43%|████▎ | 3789/8750 [2:18:08<8:01:02, 5.82s/it] {'loss': 0.4696, 'learning_rate': 1.2624820833435939e-05, 'epoch': 0.43} + 43%|████▎ | 3789/8750 [2:18:11<8:01:03, 5.82s/it] {'loss': 0.4696, 'learning_rate': 1.2624820833435939e-05, 
'epoch': 0.43} + 43%|████▎ | 3789/8750 [2:18:08<8:01:02, 5.82s/it] 43%|████▎ | 3790/8750 [2:18:16<8:00:01, 5.81s/it] 43%|████▎ | 3790/8750 [2:18:13<8:00:02, 5.81s/it] {'loss': 0.4604, 'learning_rate': 1.262124879252348e-05, 'epoch': 0.43} + 43%|████▎ | 3790/8750 [2:18:16<8:00:01, 5.81s/it] {'loss': 0.4604, 'learning_rate': 1.262124879252348e-05, 'epoch': 0.43} + 43%|████▎ | 3790/8750 [2:18:13<8:00:02, 5.81s/it] 43%|████▎ | 3791/8750 [2:18:22<7:59:21, 5.80s/it] 43%|████▎ | 3791/8750 [2:18:19<7:59:21, 5.80s/it] {'loss': 0.4466, 'learning_rate': 1.2617676392441419e-05, 'epoch': 0.43} + 43%|████▎ | 3791/8750 [2:18:22<7:59:21, 5.80s/it] {'loss': 0.4466, 'learning_rate': 1.2617676392441419e-05, 'epoch': 0.43} + 43%|████▎ | 3791/8750 [2:18:19<7:59:21, 5.80s/it] 43%|████▎ | 3792/8750 [2:18:28<7:58:24, 5.79s/it] 43%|████▎ | 3792/8750 [2:18:25<7:58:24, 5.79s/it] {'loss': 0.4844, 'learning_rate': 1.2614103633679244e-05, 'epoch': 0.43} + 43%|████▎ | 3792/8750 [2:18:28<7:58:24, 5.79s/it] {'loss': 0.4844, 'learning_rate': 1.2614103633679244e-05, 'epoch': 0.43} + 43%|████▎ | 3792/8750 [2:18:25<7:58:24, 5.79s/it] 43%|████▎ | 3793/8750 [2:18:34<7:55:28, 5.76s/it] 43%|████▎ | 3793/8750 [2:18:31<7:55:28, 5.76s/it] {'loss': 0.4597, 'learning_rate': 1.2610530516726506e-05, 'epoch': 0.43} + 43%|████▎ | 3793/8750 [2:18:34<7:55:28, 5.76s/it] {'loss': 0.4597, 'learning_rate': 1.2610530516726506e-05, 'epoch': 0.43} + 43%|████▎ | 3793/8750 [2:18:31<7:55:28, 5.76s/it] 43%|████▎ | 3794/8750 [2:18:39<7:52:19, 5.72s/it] 43%|████▎ | 3794/8750 [2:18:36<7:52:19, 5.72s/it] {'loss': 0.4664, 'learning_rate': 1.260695704207281e-05, 'epoch': 0.43} + 43%|████▎ | 3794/8750 [2:18:39<7:52:19, 5.72s/it] {'loss': 0.4664, 'learning_rate': 1.260695704207281e-05, 'epoch': 0.43} + 43%|████▎ | 3794/8750 [2:18:36<7:52:19, 5.72s/it] 43%|████▎ | 3795/8750 [2:18:45<7:51:07, 5.70s/it] 43%|████▎ | 3795/8750 [2:18:42<7:51:08, 5.70s/it] {'loss': 0.4714, 'learning_rate': 1.2603383210207796e-05, 'epoch': 0.43} + 43%|████▎ | 
3795/8750 [2:18:45<7:51:07, 5.70s/it] {'loss': 0.4714, 'learning_rate': 1.2603383210207796e-05, 'epoch': 0.43} + 43%|████▎ | 3795/8750 [2:18:42<7:51:08, 5.70s/it] 43%|████▎ | 3796/8750 [2:18:51<7:54:08, 5.74s/it] 43%|████▎ | 3796/8750 [2:18:48<7:54:07, 5.74s/it] {'loss': 0.4559, 'learning_rate': 1.2599809021621157e-05, 'epoch': 0.43} + 43%|████▎ | 3796/8750 [2:18:51<7:54:08, 5.74s/it] {'loss': 0.4559, 'learning_rate': 1.2599809021621157e-05, 'epoch': 0.43} + 43%|████▎ | 3796/8750 [2:18:48<7:54:07, 5.74s/it] 43%|████▎ | 3797/8750 [2:18:57<7:57:30, 5.78s/it] 43%|████▎ | 3797/8750 [2:18:54<7:57:30, 5.78s/it] {'loss': 0.4551, 'learning_rate': 1.2596234476802636e-05, 'epoch': 0.43} + 43%|████▎ | 3797/8750 [2:18:57<7:57:30, 5.78s/it] {'loss': 0.4551, 'learning_rate': 1.2596234476802636e-05, 'epoch': 0.43} + 43%|████▎ | 3797/8750 [2:18:54<7:57:30, 5.78s/it] 43%|████▎ | 3798/8750 [2:19:03<8:03:56, 5.86s/it] 43%|████▎ | 3798/8750 [2:19:00<8:03:56, 5.86s/it] {'loss': 0.4842, 'learning_rate': 1.2592659576242028e-05, 'epoch': 0.43} + 43%|████▎ | 3798/8750 [2:19:03<8:03:56, 5.86s/it] {'loss': 0.4842, 'learning_rate': 1.2592659576242028e-05, 'epoch': 0.43} + 43%|████▎ | 3798/8750 [2:19:00<8:03:56, 5.86s/it] 43%|████▎ | 3799/8750 [2:19:08<8:02:56, 5.85s/it] 43%|████▎ | 3799/8750 [2:19:06<8:02:56, 5.85s/it] {'loss': 0.4632, 'learning_rate': 1.2589084320429178e-05, 'epoch': 0.43} + 43%|████▎ | 3799/8750 [2:19:08<8:02:56, 5.85s/it] {'loss': 0.4632, 'learning_rate': 1.2589084320429178e-05, 'epoch': 0.43} + 43%|████▎ | 3799/8750 [2:19:06<8:02:56, 5.85s/it]10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +148 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...4 AutoResumeHook: Checking whether to suspend... +3 + 43%|████▎ | 3800/8750 [2:19:14<8:01:26, 5.84s/it] AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +5 9 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 43%|████▎ | 3800/8750 [2:19:11<8:01:27, 5.84s/it] {'loss': 0.477, 'learning_rate': 1.2585508709853971e-05, 'epoch': 0.43} + 43%|████▎ | 3800/8750 [2:19:14<8:01:26, 5.84s/it] {'loss': 0.477, 'learning_rate': 1.2585508709853971e-05, 'epoch': 0.43} + 43%|████▎ | 3800/8750 [2:19:11<8:01:27, 5.84s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 43%|████▎ | 3801/8750 [2:19:37<14:52:03, 10.82s/it] 43%|████▎ | 3801/8750 [2:19:34<14:52:02, 10.81s/it] {'loss': 0.4712, 'learning_rate': 1.2581932745006343e-05, 'epoch': 0.43} + 43%|████▎ | 3801/8750 [2:19:37<14:52:03, 10.82s/it] {'loss': 0.4712, 'learning_rate': 1.2581932745006343e-05, 'epoch': 0.43} + 43%|████▎ | 3801/8750 [2:19:34<14:52:02, 10.81s/it] 43%|████▎ | 3802/8750 [2:19:42<12:43:29, 9.26s/it] 43%|████▎ | 3802/8750 [2:19:39<12:43:29, 9.26s/it] {'loss': 0.4702, 'learning_rate': 1.2578356426376283e-05, 'epoch': 0.43} + 43%|████▎ | 3802/8750 [2:19:42<12:43:29, 9.26s/it] {'loss': 0.4702, 'learning_rate': 1.2578356426376283e-05, 'epoch': 0.43} + 43%|████▎ | 3802/8750 [2:19:39<12:43:29, 9.26s/it] 43%|████▎ | 3803/8750 [2:19:48<11:14:37, 8.18s/it] 43%|████▎ | 3803/8750 [2:19:45<11:14:36, 8.18s/it] {'loss': 0.4609, 'learning_rate': 1.2574779754453831e-05, 'epoch': 0.43} + 43%|████▎ | 3803/8750 [2:19:48<11:14:37, 8.18s/it] {'loss': 0.4609, 'learning_rate': 1.2574779754453831e-05, 'epoch': 0.43} + 43%|████▎ | 3803/8750 [2:19:45<11:14:36, 8.18s/it] 43%|████▎ | 3804/8750 [2:19:54<10:14:10, 7.45s/it] 43%|████▎ | 3804/8750 [2:19:51<10:14:10, 7.45s/it] {'loss': 0.4698, 'learning_rate': 1.257120272972907e-05, 'epoch': 0.43} + 43%|████▎ | 3804/8750 [2:19:54<10:14:10, 7.45s/it] {'loss': 0.4698, 'learning_rate': 1.257120272972907e-05, 'epoch': 0.43} + 43%|████▎ | 3804/8750 [2:19:51<10:14:10, 7.45s/it] 43%|████▎ | 3805/8750 [2:19:59<9:29:34, 6.91s/it] 43%|████▎ | 3805/8750 [2:19:56<9:29:34, 6.91s/it] {'loss': 0.4681, 'learning_rate': 1.2567625352692127e-05, 'epoch': 0.43} + 43%|████▎ | 3805/8750 [2:19:59<9:29:34, 6.91s/it] {'loss': 0.4681, 'learning_rate': 1.2567625352692127e-05, 'epoch': 0.43} + 43%|████▎ | 3805/8750 [2:19:56<9:29:34, 6.91s/it] 43%|████▎ | 3806/8750 [2:20:05<9:00:10, 6.56s/it] 43%|████▎ | 3806/8750 [2:20:02<9:00:10, 6.56s/it] {'loss': 0.4557, 'learning_rate': 1.2564047623833186e-05, 'epoch': 0.43} + 43%|████▎ | 3806/8750 
[2:20:05<9:00:10, 6.56s/it] {'loss': 0.4557, 'learning_rate': 1.2564047623833186e-05, 'epoch': 0.43} + 43%|████▎ | 3806/8750 [2:20:02<9:00:10, 6.56s/it] 44%|████▎ | 3807/8750 [2:20:11<8:38:54, 6.30s/it] 44%|████▎ | 3807/8750 [2:20:08<8:38:54, 6.30s/it] {'loss': 0.4728, 'learning_rate': 1.2560469543642472e-05, 'epoch': 0.44} + 44%|████▎ | 3807/8750 [2:20:11<8:38:54, 6.30s/it] {'loss': 0.4728, 'learning_rate': 1.2560469543642472e-05, 'epoch': 0.44} + 44%|████▎ | 3807/8750 [2:20:08<8:38:54, 6.30s/it] 44%|████▎ | 3808/8750 [2:20:17<8:29:19, 6.18s/it] 44%|████▎ | 3808/8750 [2:20:14<8:29:18, 6.18s/it] {'loss': 0.4641, 'learning_rate': 1.255689111261027e-05, 'epoch': 0.44} + 44%|████▎ | 3808/8750 [2:20:17<8:29:19, 6.18s/it] {'loss': 0.4641, 'learning_rate': 1.255689111261027e-05, 'epoch': 0.44} + 44%|████▎ | 3808/8750 [2:20:14<8:29:18, 6.18s/it] 44%|████▎ | 3809/8750 [2:20:22<8:16:11, 6.03s/it] 44%|████▎ | 3809/8750 [2:20:19<8:16:11, 6.03s/it] {'loss': 0.4671, 'learning_rate': 1.2553312331226896e-05, 'epoch': 0.44} + 44%|████▎ | 3809/8750 [2:20:22<8:16:11, 6.03s/it] {'loss': 0.4671, 'learning_rate': 1.2553312331226896e-05, 'epoch': 0.44} + 44%|████▎ | 3809/8750 [2:20:19<8:16:11, 6.03s/it] 44%|████▎ | 3810/8750 [2:20:28<8:14:42, 6.01s/it] 44%|████▎ | 3810/8750 [2:20:25<8:14:42, 6.01s/it] {'loss': 0.4723, 'learning_rate': 1.254973319998273e-05, 'epoch': 0.44} + 44%|████▎ | 3810/8750 [2:20:28<8:14:42, 6.01s/it] {'loss': 0.4723, 'learning_rate': 1.254973319998273e-05, 'epoch': 0.44} + 44%|████▎ | 3810/8750 [2:20:25<8:14:42, 6.01s/it] 44%|████▎ | 3811/8750 [2:20:34<8:09:54, 5.95s/it] 44%|████▎ | 3811/8750 [2:20:31<8:09:54, 5.95s/it] {'loss': 0.4556, 'learning_rate': 1.2546153719368189e-05, 'epoch': 0.44} + 44%|████▎ | 3811/8750 [2:20:34<8:09:54, 5.95s/it] {'loss': 0.4556, 'learning_rate': 1.2546153719368189e-05, 'epoch': 0.44} + 44%|████▎ | 3811/8750 [2:20:31<8:09:54, 5.95s/it] 44%|████▎ | 3812/8750 [2:20:40<8:06:37, 5.91s/it] 44%|████▎ | 3812/8750 [2:20:37<8:06:37, 5.91s/it] 
{'loss': 0.479, 'learning_rate': 1.2542573889873741e-05, 'epoch': 0.44} + 44%|████▎ | 3812/8750 [2:20:40<8:06:37, 5.91s/it] {'loss': 0.479, 'learning_rate': 1.2542573889873741e-05, 'epoch': 0.44} + 44%|████▎ | 3812/8750 [2:20:37<8:06:37, 5.91s/it] 44%|████▎ | 3813/8750 [2:20:46<7:59:20, 5.83s/it] 44%|████▎ | 3813/8750 [2:20:43<7:59:20, 5.83s/it] {'loss': 0.4741, 'learning_rate': 1.2538993711989906e-05, 'epoch': 0.44} + 44%|████▎ | 3813/8750 [2:20:46<7:59:20, 5.83s/it] {'loss': 0.4741, 'learning_rate': 1.2538993711989906e-05, 'epoch': 0.44} + 44%|████▎ | 3813/8750 [2:20:43<7:59:20, 5.83s/it] 44%|████▎ | 3814/8750 [2:20:52<8:05:03, 5.90s/it] 44%|████▎ | 3814/8750 [2:20:49<8:05:02, 5.90s/it] {'loss': 0.4584, 'learning_rate': 1.2535413186207247e-05, 'epoch': 0.44} + 44%|████▎ | 3814/8750 [2:20:52<8:05:03, 5.90s/it] {'loss': 0.4584, 'learning_rate': 1.2535413186207247e-05, 'epoch': 0.44} + 44%|████▎ | 3814/8750 [2:20:49<8:05:02, 5.90s/it] 44%|████▎ | 3815/8750 [2:20:58<8:05:48, 5.91s/it] 44%|████▎ | 3815/8750 [2:20:55<8:05:47, 5.91s/it] {'loss': 0.4645, 'learning_rate': 1.2531832313016374e-05, 'epoch': 0.44} + 44%|████▎ | 3815/8750 [2:20:58<8:05:48, 5.91s/it] {'loss': 0.4645, 'learning_rate': 1.2531832313016374e-05, 'epoch': 0.44} + 44%|████▎ | 3815/8750 [2:20:55<8:05:47, 5.91s/it] 44%|████▎ | 3816/8750 [2:21:04<8:05:07, 5.90s/it] 44%|████▎ | 3816/8750 [2:21:01<8:05:07, 5.90s/it] {'loss': 0.4779, 'learning_rate': 1.2528251092907948e-05, 'epoch': 0.44} + 44%|████▎ | 3816/8750 [2:21:04<8:05:07, 5.90s/it] {'loss': 0.4779, 'learning_rate': 1.2528251092907948e-05, 'epoch': 0.44} + 44%|████▎ | 3816/8750 [2:21:01<8:05:07, 5.90s/it] 44%|████▎ | 3817/8750 [2:21:07<8:05:31, 5.91s/it] 44%|████▎ | 3817/8750 [2:21:09<8:05:31, 5.91s/it] {'loss': 0.4673, 'learning_rate': 1.2524669526372674e-05, 'epoch': 0.44} + {'loss': 0.4673, 'learning_rate': 1.2524669526372674e-05, 'epoch': 0.44} + 44%|████▎ | 3817/8750 [2:21:09<8:05:31, 5.91s/it] 44%|████▎ | 3817/8750 [2:21:07<8:05:31, 5.91s/it] 
44%|████▎ | 3818/8750 [2:21:15<8:02:03, 5.86s/it] 44%|████▎ | 3818/8750 [2:21:12<8:02:03, 5.86s/it] {'loss': 0.4735, 'learning_rate': 1.2521087613901313e-05, 'epoch': 0.44} + 44%|████▎ | 3818/8750 [2:21:15<8:02:03, 5.86s/it] {'loss': 0.4735, 'learning_rate': 1.2521087613901313e-05, 'epoch': 0.44} + 44%|████▎ | 3818/8750 [2:21:12<8:02:03, 5.86s/it] 44%|████▎ | 3819/8750 [2:21:21<8:06:45, 5.92s/it] 44%|████▎ | 3819/8750 [2:21:18<8:06:45, 5.92s/it] {'loss': 0.4649, 'learning_rate': 1.251750535598466e-05, 'epoch': 0.44} + 44%|████▎ | 3819/8750 [2:21:21<8:06:45, 5.92s/it] {'loss': 0.4649, 'learning_rate': 1.251750535598466e-05, 'epoch': 0.44} + 44%|████▎ | 3819/8750 [2:21:18<8:06:45, 5.92s/it] 44%|████▎ | 3820/8750 [2:21:27<7:59:46, 5.84s/it] 44%|████▎ | 3820/8750 [2:21:24<7:59:46, 5.84s/it] {'loss': 0.4772, 'learning_rate': 1.2513922753113567e-05, 'epoch': 0.44} + 44%|████▎ | 3820/8750 [2:21:27<7:59:46, 5.84s/it] {'loss': 0.4772, 'learning_rate': 1.2513922753113567e-05, 'epoch': 0.44} + 44%|████▎ | 3820/8750 [2:21:24<7:59:46, 5.84s/it] 44%|████▎ | 3821/8750 [2:21:33<8:04:09, 5.89s/it] 44%|████▎ | 3821/8750 [2:21:30<8:04:09, 5.89s/it] {'loss': 0.4602, 'learning_rate': 1.2510339805778932e-05, 'epoch': 0.44} + 44%|████▎ | 3821/8750 [2:21:33<8:04:09, 5.89s/it] {'loss': 0.4602, 'learning_rate': 1.2510339805778932e-05, 'epoch': 0.44} + 44%|████▎ | 3821/8750 [2:21:30<8:04:09, 5.89s/it] 44%|████▎ | 3822/8750 [2:21:36<8:01:12, 5.86s/it] 44%|████▎ | 3822/8750 [2:21:39<8:01:12, 5.86s/it] {'loss': 0.4734, 'learning_rate': 1.2506756514471696e-05, 'epoch': 0.44} + 44%|████▎ | 3822/8750 [2:21:39<8:01:12, 5.86s/it] {'loss': 0.4734, 'learning_rate': 1.2506756514471696e-05, 'epoch': 0.44} + 44%|████▎ | 3822/8750 [2:21:36<8:01:12, 5.86s/it] 44%|████▎ | 3823/8750 [2:21:44<7:59:26, 5.84s/it] 44%|████▎ | 3823/8750 [2:21:42<7:59:26, 5.84s/it] {'loss': 0.4635, 'learning_rate': 1.2503172879682853e-05, 'epoch': 0.44} + 44%|████▎ | 3823/8750 [2:21:44<7:59:26, 5.84s/it] {'loss': 0.4635, 
'learning_rate': 1.2503172879682853e-05, 'epoch': 0.44} + 44%|████▎ | 3823/8750 [2:21:42<7:59:26, 5.84s/it] 44%|████▎ | 3824/8750 [2:21:50<8:00:40, 5.85s/it] 44%|████▎ | 3824/8750 [2:21:47<8:00:40, 5.85s/it] {'loss': 0.4662, 'learning_rate': 1.2499588901903437e-05, 'epoch': 0.44} + 44%|████▎ | 3824/8750 [2:21:50<8:00:40, 5.85s/it] {'loss': 0.4662, 'learning_rate': 1.2499588901903437e-05, 'epoch': 0.44} + 44%|████▎ | 3824/8750 [2:21:47<8:00:40, 5.85s/it] 44%|████▎ | 3825/8750 [2:21:56<7:59:51, 5.85s/it] 44%|████▎ | 3825/8750 [2:21:53<7:59:51, 5.85s/it] {'loss': 0.4722, 'learning_rate': 1.2496004581624538e-05, 'epoch': 0.44} + 44%|████▎ | 3825/8750 [2:21:56<7:59:51, 5.85s/it] {'loss': 0.4722, 'learning_rate': 1.2496004581624538e-05, 'epoch': 0.44} + 44%|████▎ | 3825/8750 [2:21:53<7:59:51, 5.85s/it] 44%|████▎ | 3826/8750 [2:22:02<7:57:56, 5.82s/it] 44%|████▎ | 3826/8750 [2:21:59<7:57:56, 5.82s/it] {'loss': 0.4521, 'learning_rate': 1.2492419919337282e-05, 'epoch': 0.44} + 44%|████▎ | 3826/8750 [2:22:02<7:57:56, 5.82s/it] {'loss': 0.4521, 'learning_rate': 1.2492419919337282e-05, 'epoch': 0.44} + 44%|████▎ | 3826/8750 [2:21:59<7:57:56, 5.82s/it] 44%|████▎ | 3827/8750 [2:22:08<7:57:49, 5.82s/it] 44%|████▎ | 3827/8750 [2:22:05<7:57:49, 5.82s/it] {'loss': 0.4681, 'learning_rate': 1.2488834915532852e-05, 'epoch': 0.44} + 44%|████▎ | 3827/8750 [2:22:08<7:57:49, 5.82s/it] {'loss': 0.4681, 'learning_rate': 1.2488834915532852e-05, 'epoch': 0.44} + 44%|████▎ | 3827/8750 [2:22:05<7:57:49, 5.82s/it] 44%|████▎ | 3828/8750 [2:22:13<7:52:42, 5.76s/it] 44%|████▎ | 3828/8750 [2:22:11<7:52:41, 5.76s/it] {'loss': 0.5047, 'learning_rate': 1.2485249570702471e-05, 'epoch': 0.44} + 44%|████▎ | 3828/8750 [2:22:13<7:52:42, 5.76s/it] {'loss': 0.5047, 'learning_rate': 1.2485249570702471e-05, 'epoch': 0.44} + 44%|████▎ | 3828/8750 [2:22:11<7:52:41, 5.76s/it] 44%|████▍ | 3829/8750 [2:22:19<7:50:39, 5.74s/it] 44%|████▍ | 3829/8750 [2:22:16<7:50:39, 5.74s/it] {'loss': 0.4595, 'learning_rate': 
1.2481663885337417e-05, 'epoch': 0.44} + 44%|████▍ | 3829/8750 [2:22:19<7:50:39, 5.74s/it] {'loss': 0.4595, 'learning_rate': 1.2481663885337417e-05, 'epoch': 0.44} + 44%|████▍ | 3829/8750 [2:22:16<7:50:39, 5.74s/it] 44%|████▍ | 3830/8750 [2:22:25<7:48:27, 5.71s/it] 44%|████▍ | 3830/8750 [2:22:22<7:48:26, 5.71s/it] {'loss': 0.4662, 'learning_rate': 1.2478077859929e-05, 'epoch': 0.44} + 44%|████▍ | 3830/8750 [2:22:25<7:48:27, 5.71s/it] {'loss': 0.4662, 'learning_rate': 1.2478077859929e-05, 'epoch': 0.44} + 44%|████▍ | 3830/8750 [2:22:22<7:48:26, 5.71s/it] 44%|████▍ | 3831/8750 [2:22:30<7:48:05, 5.71s/it] 44%|████▍ | 3831/8750 [2:22:28<7:48:05, 5.71s/it] {'loss': 0.4725, 'learning_rate': 1.2474491494968593e-05, 'epoch': 0.44} + 44%|████▍ | 3831/8750 [2:22:30<7:48:05, 5.71s/it] {'loss': 0.4725, 'learning_rate': 1.2474491494968593e-05, 'epoch': 0.44} + 44%|████▍ | 3831/8750 [2:22:28<7:48:05, 5.71s/it] 44%|████▍ | 3832/8750 [2:22:36<7:49:39, 5.73s/it] 44%|████▍ | 3832/8750 [2:22:33<7:49:39, 5.73s/it] {'loss': 0.4675, 'learning_rate': 1.2470904790947605e-05, 'epoch': 0.44} + 44%|████▍ | 3832/8750 [2:22:36<7:49:39, 5.73s/it] {'loss': 0.4675, 'learning_rate': 1.2470904790947605e-05, 'epoch': 0.44} + 44%|████▍ | 3832/8750 [2:22:33<7:49:39, 5.73s/it] 44%|████▍ | 3833/8750 [2:22:42<7:52:15, 5.76s/it] 44%|████▍ | 3833/8750 [2:22:39<7:52:15, 5.76s/it] {'loss': 0.4636, 'learning_rate': 1.2467317748357493e-05, 'epoch': 0.44} + 44%|████▍ | 3833/8750 [2:22:42<7:52:15, 5.76s/it] {'loss': 0.4636, 'learning_rate': 1.2467317748357493e-05, 'epoch': 0.44} + 44%|████▍ | 3833/8750 [2:22:39<7:52:15, 5.76s/it] 44%|████▍ | 3834/8750 [2:22:48<7:58:57, 5.85s/it] 44%|████▍ | 3834/8750 [2:22:45<7:58:57, 5.85s/it] {'loss': 0.4673, 'learning_rate': 1.2463730367689768e-05, 'epoch': 0.44} + 44%|████▍ | 3834/8750 [2:22:48<7:58:57, 5.85s/it] {'loss': 0.4673, 'learning_rate': 1.2463730367689768e-05, 'epoch': 0.44} + 44%|████▍ | 3834/8750 [2:22:45<7:58:57, 5.85s/it] 44%|████▍ | 3835/8750 [2:22:54<7:58:06, 
5.84s/it] 44%|████▍ | 3835/8750 [2:22:51<7:58:06, 5.84s/it] {'loss': 0.4526, 'learning_rate': 1.246014264943597e-05, 'epoch': 0.44} + 44%|████▍ | 3835/8750 [2:22:54<7:58:06, 5.84s/it] {'loss': 0.4526, 'learning_rate': 1.246014264943597e-05, 'epoch': 0.44} + 44%|████▍ | 3835/8750 [2:22:51<7:58:06, 5.84s/it] 44%|████▍ | 3836/8750 [2:23:00<7:57:09, 5.83s/it] 44%|████▍ | 3836/8750 [2:22:57<7:57:09, 5.83s/it] {'loss': 0.4888, 'learning_rate': 1.2456554594087709e-05, 'epoch': 0.44} + 44%|████▍ | 3836/8750 [2:23:00<7:57:09, 5.83s/it] {'loss': 0.4888, 'learning_rate': 1.2456554594087709e-05, 'epoch': 0.44} + 44%|████▍ | 3836/8750 [2:22:57<7:57:09, 5.83s/it] 44%|████▍ | 3837/8750 [2:23:05<7:54:55, 5.80s/it] 44%|████▍ | 3837/8750 [2:23:03<7:54:55, 5.80s/it] {'loss': 0.4601, 'learning_rate': 1.245296620213662e-05, 'epoch': 0.44} + 44%|████▍ | 3837/8750 [2:23:05<7:54:55, 5.80s/it] {'loss': 0.4601, 'learning_rate': 1.245296620213662e-05, 'epoch': 0.44} + 44%|████▍ | 3837/8750 [2:23:03<7:54:55, 5.80s/it] 44%|████▍ | 3838/8750 [2:23:11<7:53:45, 5.79s/it] 44%|████▍ | 3838/8750 [2:23:08<7:53:45, 5.79s/it] {'loss': 0.4894, 'learning_rate': 1.2449377474074398e-05, 'epoch': 0.44} + 44%|████▍ | 3838/8750 [2:23:11<7:53:45, 5.79s/it] {'loss': 0.4894, 'learning_rate': 1.2449377474074398e-05, 'epoch': 0.44} + 44%|████▍ | 3838/8750 [2:23:08<7:53:45, 5.79s/it] 44%|████▍ | 3839/8750 [2:23:17<7:52:28, 5.77s/it] 44%|████▍ | 3839/8750 [2:23:14<7:52:28, 5.77s/it] {'loss': 0.4568, 'learning_rate': 1.2445788410392778e-05, 'epoch': 0.44} + 44%|████▍ | 3839/8750 [2:23:17<7:52:28, 5.77s/it] {'loss': 0.4568, 'learning_rate': 1.2445788410392778e-05, 'epoch': 0.44} + 44%|████▍ | 3839/8750 [2:23:14<7:52:28, 5.77s/it] 44%|████▍ | 3840/8750 [2:23:23<7:59:31, 5.86s/it] 44%|████▍ | 3840/8750 [2:23:20<7:59:31, 5.86s/it] {'loss': 0.4489, 'learning_rate': 1.2442199011583538e-05, 'epoch': 0.44} + 44%|████▍ | 3840/8750 [2:23:23<7:59:31, 5.86s/it] {'loss': 0.4489, 'learning_rate': 1.2442199011583538e-05, 'epoch': 
0.44} + 44%|████▍ | 3840/8750 [2:23:20<7:59:31, 5.86s/it] 44%|████▍ | 3841/8750 [2:23:29<8:00:47, 5.88s/it] 44%|████▍ | 3841/8750 [2:23:26<8:00:48, 5.88s/it] {'loss': 0.4727, 'learning_rate': 1.2438609278138509e-05, 'epoch': 0.44} + 44%|████▍ | 3841/8750 [2:23:29<8:00:47, 5.88s/it] {'loss': 0.4727, 'learning_rate': 1.2438609278138509e-05, 'epoch': 0.44} + 44%|████▍ | 3841/8750 [2:23:26<8:00:48, 5.88s/it] 44%|████▍ | 3842/8750 [2:23:35<7:55:48, 5.82s/it] 44%|████▍ | 3842/8750 [2:23:32<7:55:47, 5.82s/it] {'loss': 0.4745, 'learning_rate': 1.2435019210549564e-05, 'epoch': 0.44} + 44%|████▍ | 3842/8750 [2:23:35<7:55:48, 5.82s/it] {'loss': 0.4745, 'learning_rate': 1.2435019210549564e-05, 'epoch': 0.44} + 44%|████▍ | 3842/8750 [2:23:32<7:55:47, 5.82s/it] 44%|████▍ | 3843/8750 [2:23:41<7:58:43, 5.85s/it] 44%|████▍ | 3843/8750 [2:23:38<7:58:43, 5.85s/it] {'loss': 0.4597, 'learning_rate': 1.2431428809308625e-05, 'epoch': 0.44} + 44%|████▍ | 3843/8750 [2:23:41<7:58:43, 5.85s/it] {'loss': 0.4597, 'learning_rate': 1.2431428809308625e-05, 'epoch': 0.44} + 44%|████▍ | 3843/8750 [2:23:38<7:58:43, 5.85s/it] 44%|████▍ | 3844/8750 [2:23:46<7:54:49, 5.81s/it] 44%|████▍ | 3844/8750 [2:23:43<7:54:48, 5.81s/it] {'loss': 0.4838, 'learning_rate': 1.2427838074907654e-05, 'epoch': 0.44} + 44%|████▍ | 3844/8750 [2:23:46<7:54:49, 5.81s/it] {'loss': 0.4838, 'learning_rate': 1.2427838074907654e-05, 'epoch': 0.44} + 44%|████▍ | 3844/8750 [2:23:43<7:54:48, 5.81s/it] 44%|████▍ | 3845/8750 [2:23:52<7:52:02, 5.77s/it] 44%|████▍ | 3845/8750 [2:23:49<7:52:02, 5.77s/it] {'loss': 0.4504, 'learning_rate': 1.2424247007838659e-05, 'epoch': 0.44} + 44%|████▍ | 3845/8750 [2:23:52<7:52:02, 5.77s/it] {'loss': 0.4504, 'learning_rate': 1.2424247007838659e-05, 'epoch': 0.44} + 44%|████▍ | 3845/8750 [2:23:49<7:52:02, 5.77s/it] 44%|████▍ | 3846/8750 [2:23:58<7:48:49, 5.74s/it] 44%|████▍ | 3846/8750 [2:23:55<7:48:49, 5.74s/it] {'loss': 0.4759, 'learning_rate': 1.2420655608593701e-05, 'epoch': 0.44} + 44%|████▍ | 
3846/8750 [2:23:58<7:48:49, 5.74s/it] {'loss': 0.4759, 'learning_rate': 1.2420655608593701e-05, 'epoch': 0.44} + 44%|████▍ | 3846/8750 [2:23:55<7:48:49, 5.74s/it] 44%|████▍ | 3847/8750 [2:24:03<7:51:20, 5.77s/it] 44%|████▍ | 3847/8750 [2:24:01<7:51:20, 5.77s/it] {'loss': 0.4513, 'learning_rate': 1.2417063877664883e-05, 'epoch': 0.44} + 44%|████▍ | 3847/8750 [2:24:03<7:51:20, 5.77s/it] {'loss': 0.4513, 'learning_rate': 1.2417063877664883e-05, 'epoch': 0.44} + 44%|████▍ | 3847/8750 [2:24:01<7:51:20, 5.77s/it] 44%|████▍ | 3848/8750 [2:24:09<7:53:08, 5.79s/it] 44%|████▍ | 3848/8750 [2:24:06<7:53:08, 5.79s/it] {'loss': 0.471, 'learning_rate': 1.241347181554435e-05, 'epoch': 0.44} + 44%|████▍ | 3848/8750 [2:24:09<7:53:08, 5.79s/it] {'loss': 0.471, 'learning_rate': 1.241347181554435e-05, 'epoch': 0.44} + 44%|████▍ | 3848/8750 [2:24:06<7:53:08, 5.79s/it] 44%|████▍ | 3849/8750 [2:24:15<7:50:15, 5.76s/it] 44%|████▍ | 3849/8750 [2:24:12<7:50:15, 5.76s/it] {'loss': 0.4847, 'learning_rate': 1.2409879422724293e-05, 'epoch': 0.44} + 44%|████▍ | 3849/8750 [2:24:15<7:50:15, 5.76s/it] {'loss': 0.4847, 'learning_rate': 1.2409879422724293e-05, 'epoch': 0.44} + 44%|████▍ | 3849/8750 [2:24:12<7:50:15, 5.76s/it]9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +1412 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 44%|████▍ | 3850/8750 [2:24:21<7:47:18, 5.72s/it]015 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+5 AutoResumeHook: Checking whether to suspend... + 44%|████▍ | 3850/8750 [2:24:18<7:47:18, 5.72s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4829, 'learning_rate': 1.240628669969695e-05, 'epoch': 0.44} + 44%|████▍ | 3850/8750 [2:24:21<7:47:18, 5.72s/it] {'loss': 0.4829, 'learning_rate': 1.240628669969695e-05, 'epoch': 0.44} + 44%|████▍ | 3850/8750 [2:24:18<7:47:18, 5.72s/it] 44%|████▍ | 3851/8750 [2:24:26<7:48:09, 5.73s/it] 44%|████▍ | 3851/8750 [2:24:23<7:48:09, 5.73s/it] {'loss': 0.4523, 'learning_rate': 1.2402693646954607e-05, 'epoch': 0.44} + 44%|████▍ | 3851/8750 [2:24:26<7:48:09, 5.73s/it] {'loss': 0.4523, 'learning_rate': 1.2402693646954607e-05, 'epoch': 0.44} + 44%|████▍ | 3851/8750 [2:24:23<7:48:09, 5.73s/it] 44%|████▍ | 3852/8750 [2:24:33<8:00:40, 5.89s/it] 44%|████▍ | 3852/8750 [2:24:30<8:00:40, 5.89s/it] {'loss': 0.4763, 'learning_rate': 1.2399100264989593e-05, 'epoch': 0.44} + 44%|████▍ | 3852/8750 [2:24:33<8:00:40, 5.89s/it] {'loss': 0.4763, 'learning_rate': 1.2399100264989593e-05, 'epoch': 0.44} + 44%|████▍ | 3852/8750 [2:24:30<8:00:40, 5.89s/it] 44%|████▍ | 3853/8750 [2:24:38<8:00:20, 5.89s/it] 44%|████▍ | 3853/8750 [2:24:36<8:00:20, 5.89s/it] {'loss': 0.4629, 'learning_rate': 1.2395506554294281e-05, 'epoch': 0.44} + 44%|████▍ | 3853/8750 [2:24:38<8:00:20, 5.89s/it] {'loss': 0.4629, 'learning_rate': 1.2395506554294281e-05, 'epoch': 0.44} + 44%|████▍ | 3853/8750 [2:24:36<8:00:20, 5.89s/it] 44%|████▍ | 3854/8750 [2:24:44<7:52:24, 5.79s/it] 44%|████▍ | 3854/8750 [2:24:41<7:52:24, 5.79s/it] {'loss': 0.4689, 'learning_rate': 1.2391912515361085e-05, 'epoch': 0.44} + 44%|████▍ | 3854/8750 [2:24:44<7:52:24, 5.79s/it] {'loss': 0.4689, 'learning_rate': 1.2391912515361085e-05, 'epoch': 0.44} + 44%|████▍ | 3854/8750 [2:24:41<7:52:24, 5.79s/it] 44%|████▍ | 3855/8750 [2:24:47<7:49:50, 5.76s/it] 44%|████▍ | 3855/8750 [2:24:50<7:49:50, 5.76s/it] {'loss': 0.459, 'learning_rate': 1.2388318148682474e-05, 'epoch': 0.44} + {'loss': 0.459, 
'learning_rate': 1.2388318148682474e-05, 'epoch': 0.44} + 44%|████▍ | 3855/8750 [2:24:50<7:49:50, 5.76s/it] 44%|████▍ | 3855/8750 [2:24:47<7:49:50, 5.76s/it] 44%|████▍ | 3856/8750 [2:24:56<7:50:25, 5.77s/it] 44%|████▍ | 3856/8750 [2:24:53<7:50:25, 5.77s/it] {'loss': 0.4638, 'learning_rate': 1.2384723454750957e-05, 'epoch': 0.44} + 44%|████▍ | 3856/8750 [2:24:56<7:50:25, 5.77s/it] {'loss': 0.4638, 'learning_rate': 1.2384723454750957e-05, 'epoch': 0.44} + 44%|████▍ | 3856/8750 [2:24:53<7:50:25, 5.77s/it] 44%|████▍ | 3857/8750 [2:25:01<7:52:10, 5.79s/it] 44%|████▍ | 3857/8750 [2:24:58<7:52:10, 5.79s/it] {'loss': 0.4496, 'learning_rate': 1.2381128434059082e-05, 'epoch': 0.44} + 44%|████▍ | 3857/8750 [2:25:01<7:52:10, 5.79s/it] {'loss': 0.4496, 'learning_rate': 1.2381128434059082e-05, 'epoch': 0.44} + 44%|████▍ | 3857/8750 [2:24:58<7:52:10, 5.79s/it] 44%|████▍ | 3858/8750 [2:25:07<7:49:16, 5.76s/it] 44%|████▍ | 3858/8750 [2:25:04<7:49:15, 5.76s/it] {'loss': 0.4754, 'learning_rate': 1.2377533087099451e-05, 'epoch': 0.44} + 44%|████▍ | 3858/8750 [2:25:07<7:49:16, 5.76s/it] {'loss': 0.4754, 'learning_rate': 1.2377533087099451e-05, 'epoch': 0.44} + 44%|████▍ | 3858/8750 [2:25:04<7:49:15, 5.76s/it] 44%|████▍ | 3859/8750 [2:25:13<7:48:57, 5.75s/it] 44%|████▍ | 3859/8750 [2:25:10<7:48:58, 5.75s/it] {'loss': 0.4502, 'learning_rate': 1.2373937414364703e-05, 'epoch': 0.44} + 44%|████▍ | 3859/8750 [2:25:13<7:48:57, 5.75s/it] {'loss': 0.4502, 'learning_rate': 1.2373937414364703e-05, 'epoch': 0.44} + 44%|████▍ | 3859/8750 [2:25:10<7:48:58, 5.75s/it] 44%|████▍ | 3860/8750 [2:25:19<7:50:27, 5.77s/it] 44%|████▍ | 3860/8750 [2:25:16<7:50:27, 5.77s/it] {'loss': 0.4675, 'learning_rate': 1.237034141634753e-05, 'epoch': 0.44} + 44%|████▍ | 3860/8750 [2:25:19<7:50:27, 5.77s/it] {'loss': 0.4675, 'learning_rate': 1.237034141634753e-05, 'epoch': 0.44} + 44%|████▍ | 3860/8750 [2:25:16<7:50:27, 5.77s/it] 44%|████▍ | 3861/8750 [2:25:24<7:48:37, 5.75s/it] 44%|████▍ | 3861/8750 [2:25:21<7:48:37, 
5.75s/it] {'loss': 0.4682, 'learning_rate': 1.2366745093540667e-05, 'epoch': 0.44} + 44%|████▍ | 3861/8750 [2:25:24<7:48:37, 5.75s/it] {'loss': 0.4682, 'learning_rate': 1.2366745093540667e-05, 'epoch': 0.44} + 44%|████▍ | 3861/8750 [2:25:21<7:48:37, 5.75s/it] 44%|████▍ | 3862/8750 [2:25:30<7:48:13, 5.75s/it] 44%|████▍ | 3862/8750 [2:25:27<7:48:13, 5.75s/it] {'loss': 0.4683, 'learning_rate': 1.2363148446436882e-05, 'epoch': 0.44} + 44%|████▍ | 3862/8750 [2:25:30<7:48:13, 5.75s/it] {'loss': 0.4683, 'learning_rate': 1.2363148446436882e-05, 'epoch': 0.44} + 44%|████▍ | 3862/8750 [2:25:27<7:48:13, 5.75s/it] 44%|████▍ | 3863/8750 [2:25:36<7:48:59, 5.76s/it] 44%|████▍ | 3863/8750 [2:25:33<7:48:58, 5.76s/it] {'loss': 0.4558, 'learning_rate': 1.2359551475529e-05, 'epoch': 0.44} + 44%|████▍ | 3863/8750 [2:25:36<7:48:59, 5.76s/it] {'loss': 0.4558, 'learning_rate': 1.2359551475529e-05, 'epoch': 0.44} + 44%|████▍ | 3863/8750 [2:25:33<7:48:58, 5.76s/it] 44%|████▍ | 3864/8750 [2:25:42<7:49:58, 5.77s/it] 44%|████▍ | 3864/8750 [2:25:39<7:49:58, 5.77s/it] {'loss': 0.4749, 'learning_rate': 1.2355954181309883e-05, 'epoch': 0.44} + 44%|████▍ | 3864/8750 [2:25:42<7:49:58, 5.77s/it] {'loss': 0.4749, 'learning_rate': 1.2355954181309883e-05, 'epoch': 0.44} + 44%|████▍ | 3864/8750 [2:25:39<7:49:58, 5.77s/it] 44%|████▍ | 3865/8750 [2:25:47<7:50:21, 5.78s/it] 44%|████▍ | 3865/8750 [2:25:45<7:50:22, 5.78s/it] {'loss': 0.4658, 'learning_rate': 1.235235656427245e-05, 'epoch': 0.44} + 44%|████▍ | 3865/8750 [2:25:47<7:50:21, 5.78s/it] {'loss': 0.4658, 'learning_rate': 1.235235656427245e-05, 'epoch': 0.44} + 44%|████▍ | 3865/8750 [2:25:45<7:50:22, 5.78s/it] 44%|████▍ | 3866/8750 [2:25:53<7:55:38, 5.84s/it] 44%|████▍ | 3866/8750 [2:25:51<7:55:38, 5.84s/it] {'loss': 0.4681, 'learning_rate': 1.2348758624909644e-05, 'epoch': 0.44} + 44%|████▍ | 3866/8750 [2:25:53<7:55:38, 5.84s/it] {'loss': 0.4681, 'learning_rate': 1.2348758624909644e-05, 'epoch': 0.44} + 44%|████▍ | 3866/8750 [2:25:51<7:55:38, 
5.84s/it] 44%|████▍ | 3867/8750 [2:25:59<7:49:46, 5.77s/it] 44%|████▍ | 3867/8750 [2:25:56<7:49:46, 5.77s/it] {'loss': 0.4694, 'learning_rate': 1.2345160363714471e-05, 'epoch': 0.44} + 44%|████▍ | 3867/8750 [2:25:59<7:49:46, 5.77s/it] {'loss': 0.4694, 'learning_rate': 1.2345160363714471e-05, 'epoch': 0.44} + 44%|████▍ | 3867/8750 [2:25:56<7:49:46, 5.77s/it] 44%|████▍ | 3868/8750 [2:26:05<7:47:42, 5.75s/it] 44%|████▍ | 3868/8750 [2:26:02<7:47:43, 5.75s/it] {'loss': 0.4718, 'learning_rate': 1.2341561781179965e-05, 'epoch': 0.44} + 44%|████▍ | 3868/8750 [2:26:05<7:47:42, 5.75s/it] {'loss': 0.4718, 'learning_rate': 1.2341561781179965e-05, 'epoch': 0.44} + 44%|████▍ | 3868/8750 [2:26:02<7:47:43, 5.75s/it] 44%|████▍ | 3869/8750 [2:26:10<7:45:20, 5.72s/it] 44%|████▍ | 3869/8750 [2:26:07<7:45:21, 5.72s/it] {'loss': 0.4808, 'learning_rate': 1.233796287779922e-05, 'epoch': 0.44} + 44%|████▍ | 3869/8750 [2:26:10<7:45:20, 5.72s/it] {'loss': 0.4808, 'learning_rate': 1.233796287779922e-05, 'epoch': 0.44} + 44%|████▍ | 3869/8750 [2:26:07<7:45:21, 5.72s/it] 44%|████▍ | 3870/8750 [2:26:16<7:48:57, 5.77s/it] 44%|████▍ | 3870/8750 [2:26:13<7:48:56, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.2334363654065363e-05, 'epoch': 0.44} + 44%|████▍ | 3870/8750 [2:26:16<7:48:57, 5.77s/it] {'loss': 0.4671, 'learning_rate': 1.2334363654065363e-05, 'epoch': 0.44} + 44%|████▍ | 3870/8750 [2:26:13<7:48:56, 5.77s/it] 44%|████▍ | 3871/8750 [2:26:22<7:47:45, 5.75s/it] 44%|████▍ | 3871/8750 [2:26:19<7:47:44, 5.75s/it] {'loss': 0.4458, 'learning_rate': 1.2330764110471567e-05, 'epoch': 0.44} + {'loss': 0.4458, 'learning_rate': 1.2330764110471567e-05, 'epoch': 0.44} 44%|████▍ | 3871/8750 [2:26:22<7:47:45, 5.75s/it] + 44%|████▍ | 3871/8750 [2:26:19<7:47:44, 5.75s/it] 44%|████▍ | 3872/8750 [2:26:28<7:47:18, 5.75s/it] 44%|████▍ | 3872/8750 [2:26:25<7:47:18, 5.75s/it] {'loss': 0.4809, 'learning_rate': 1.2327164247511051e-05, 'epoch': 0.44} + 44%|████▍ | 3872/8750 [2:26:28<7:47:18, 5.75s/it] {'loss': 0.4809, 
'learning_rate': 1.2327164247511051e-05, 'epoch': 0.44} + 44%|████▍ | 3872/8750 [2:26:25<7:47:18, 5.75s/it] 44%|████▍ | 3873/8750 [2:26:34<7:50:09, 5.78s/it] 44%|████▍ | 3873/8750 [2:26:31<7:50:08, 5.78s/it] {'loss': 0.4554, 'learning_rate': 1.2323564065677078e-05, 'epoch': 0.44} + 44%|████▍ | 3873/8750 [2:26:34<7:50:09, 5.78s/it] {'loss': 0.4554, 'learning_rate': 1.2323564065677078e-05, 'epoch': 0.44} + 44%|████▍ | 3873/8750 [2:26:31<7:50:08, 5.78s/it] 44%|████▍ | 3874/8750 [2:26:39<7:51:24, 5.80s/it] 44%|████▍ | 3874/8750 [2:26:37<7:51:24, 5.80s/it] {'loss': 0.4595, 'learning_rate': 1.2319963565462949e-05, 'epoch': 0.44} + 44%|████▍ | 3874/8750 [2:26:39<7:51:24, 5.80s/it] {'loss': 0.4595, 'learning_rate': 1.2319963565462949e-05, 'epoch': 0.44} + 44%|████▍ | 3874/8750 [2:26:37<7:51:24, 5.80s/it] 44%|████▍ | 3875/8750 [2:26:45<7:52:01, 5.81s/it] 44%|████▍ | 3875/8750 [2:26:42<7:52:01, 5.81s/it] {'loss': 0.4644, 'learning_rate': 1.2316362747362019e-05, 'epoch': 0.44} + 44%|████▍ | 3875/8750 [2:26:45<7:52:01, 5.81s/it] {'loss': 0.4644, 'learning_rate': 1.2316362747362019e-05, 'epoch': 0.44} + 44%|████▍ | 3875/8750 [2:26:42<7:52:01, 5.81s/it] 44%|████▍ | 3876/8750 [2:26:51<7:48:31, 5.77s/it] 44%|████▍ | 3876/8750 [2:26:48<7:48:30, 5.77s/it] {'loss': 0.463, 'learning_rate': 1.2312761611867673e-05, 'epoch': 0.44} + 44%|████▍ | 3876/8750 [2:26:51<7:48:31, 5.77s/it] {'loss': 0.463, 'learning_rate': 1.2312761611867673e-05, 'epoch': 0.44} + 44%|████▍ | 3876/8750 [2:26:48<7:48:30, 5.77s/it] 44%|████▍ | 3877/8750 [2:26:57<7:46:51, 5.75s/it] 44%|████▍ | 3877/8750 [2:26:54<7:46:51, 5.75s/it] {'loss': 0.4827, 'learning_rate': 1.2309160159473354e-05, 'epoch': 0.44} + 44%|████▍ | 3877/8750 [2:26:57<7:46:51, 5.75s/it] {'loss': 0.4827, 'learning_rate': 1.2309160159473354e-05, 'epoch': 0.44} + 44%|████▍ | 3877/8750 [2:26:54<7:46:51, 5.75s/it] 44%|████▍ | 3878/8750 [2:27:02<7:47:58, 5.76s/it] 44%|████▍ | 3878/8750 [2:27:00<7:47:58, 5.76s/it] {'loss': 0.4786, 'learning_rate': 
1.2305558390672539e-05, 'epoch': 0.44} + 44%|████▍ | 3878/8750 [2:27:02<7:47:58, 5.76s/it] {'loss': 0.4786, 'learning_rate': 1.2305558390672539e-05, 'epoch': 0.44} + 44%|████▍ | 3878/8750 [2:27:00<7:47:58, 5.76s/it] 44%|████▍ | 3879/8750 [2:27:08<7:50:21, 5.79s/it] 44%|████▍ | 3879/8750 [2:27:05<7:50:22, 5.79s/it] {'loss': 0.4691, 'learning_rate': 1.2301956305958746e-05, 'epoch': 0.44} + 44%|████▍ | 3879/8750 [2:27:08<7:50:21, 5.79s/it] {'loss': 0.4691, 'learning_rate': 1.2301956305958746e-05, 'epoch': 0.44} + 44%|████▍ | 3879/8750 [2:27:05<7:50:22, 5.79s/it] 44%|████▍ | 3880/8750 [2:27:14<7:45:49, 5.74s/it] 44%|████▍ | 3880/8750 [2:27:11<7:45:50, 5.74s/it] {'loss': 0.4597, 'learning_rate': 1.2298353905825549e-05, 'epoch': 0.44} + 44%|████▍ | 3880/8750 [2:27:14<7:45:49, 5.74s/it] {'loss': 0.4597, 'learning_rate': 1.2298353905825549e-05, 'epoch': 0.44} + 44%|████▍ | 3880/8750 [2:27:11<7:45:50, 5.74s/it] 44%|████▍ | 3881/8750 [2:27:20<7:45:45, 5.74s/it] 44%|████▍ | 3881/8750 [2:27:17<7:45:45, 5.74s/it] {'loss': 0.4655, 'learning_rate': 1.2294751190766552e-05, 'epoch': 0.44} + 44%|████▍ | 3881/8750 [2:27:20<7:45:45, 5.74s/it] {'loss': 0.4655, 'learning_rate': 1.2294751190766552e-05, 'epoch': 0.44} + 44%|████▍ | 3881/8750 [2:27:17<7:45:45, 5.74s/it] 44%|████▍ | 3882/8750 [2:27:25<7:45:43, 5.74s/it] 44%|████▍ | 3882/8750 [2:27:22<7:45:43, 5.74s/it] {'loss': 0.4744, 'learning_rate': 1.229114816127541e-05, 'epoch': 0.44} + 44%|████▍ | 3882/8750 [2:27:25<7:45:43, 5.74s/it] {'loss': 0.4744, 'learning_rate': 1.229114816127541e-05, 'epoch': 0.44} + 44%|████▍ | 3882/8750 [2:27:22<7:45:43, 5.74s/it] 44%|████▍ | 3883/8750 [2:27:31<7:51:54, 5.82s/it] 44%|████▍ | 3883/8750 [2:27:28<7:51:55, 5.82s/it] {'loss': 0.4498, 'learning_rate': 1.2287544817845817e-05, 'epoch': 0.44} + 44%|████▍ | 3883/8750 [2:27:31<7:51:54, 5.82s/it] {'loss': 0.4498, 'learning_rate': 1.2287544817845817e-05, 'epoch': 0.44} + 44%|████▍ | 3883/8750 [2:27:28<7:51:55, 5.82s/it] 44%|████▍ | 3884/8750 
[2:27:37<7:46:37, 5.75s/it] 44%|████▍ | 3884/8750 [2:27:34<7:46:35, 5.75s/it] {'loss': 0.4816, 'learning_rate': 1.2283941160971512e-05, 'epoch': 0.44} + 44%|████▍ | 3884/8750 [2:27:37<7:46:37, 5.75s/it] {'loss': 0.4816, 'learning_rate': 1.2283941160971512e-05, 'epoch': 0.44} + 44%|████▍ | 3884/8750 [2:27:34<7:46:35, 5.75s/it] 44%|████▍ | 3885/8750 [2:27:43<7:54:45, 5.86s/it] 44%|████▍ | 3885/8750 [2:27:40<7:54:45, 5.86s/it] {'loss': 0.4777, 'learning_rate': 1.2280337191146276e-05, 'epoch': 0.44} + 44%|████▍ | 3885/8750 [2:27:43<7:54:45, 5.86s/it] {'loss': 0.4777, 'learning_rate': 1.2280337191146276e-05, 'epoch': 0.44} + 44%|████▍ | 3885/8750 [2:27:40<7:54:45, 5.86s/it] 44%|████▍ | 3886/8750 [2:27:49<7:51:07, 5.81s/it] 44%|████▍ | 3886/8750 [2:27:46<7:51:08, 5.81s/it] {'loss': 0.4736, 'learning_rate': 1.2276732908863933e-05, 'epoch': 0.44} + 44%|████▍ | 3886/8750 [2:27:49<7:51:07, 5.81s/it] {'loss': 0.4736, 'learning_rate': 1.2276732908863933e-05, 'epoch': 0.44} + 44%|████▍ | 3886/8750 [2:27:46<7:51:08, 5.81s/it] 44%|████▍ | 3887/8750 [2:27:55<8:00:40, 5.93s/it] 44%|████▍ | 3887/8750 [2:27:52<8:00:40, 5.93s/it] {'loss': 0.462, 'learning_rate': 1.2273128314618353e-05, 'epoch': 0.44} + 44%|████▍ | 3887/8750 [2:27:55<8:00:40, 5.93s/it] {'loss': 0.462, 'learning_rate': 1.2273128314618353e-05, 'epoch': 0.44} + 44%|████▍ | 3887/8750 [2:27:52<8:00:40, 5.93s/it] 44%|████▍ | 3888/8750 [2:27:58<7:57:42, 5.90s/it] 44%|████▍ | 3888/8750 [2:28:01<7:57:42, 5.90s/it] {'loss': 0.4851, 'learning_rate': 1.226952340890344e-05, 'epoch': 0.44} + {'loss': 0.4851, 'learning_rate': 1.226952340890344e-05, 'epoch': 0.44} 44%|████▍ | 3888/8750 [2:28:01<7:57:42, 5.90s/it] + 44%|████▍ | 3888/8750 [2:27:58<7:57:42, 5.90s/it] 44%|████▍ | 3889/8750 [2:28:07<7:56:58, 5.89s/it] 44%|████▍ | 3889/8750 [2:28:04<7:56:58, 5.89s/it] {'loss': 0.4576, 'learning_rate': 1.2265918192213153e-05, 'epoch': 0.44} + 44%|████▍ | 3889/8750 [2:28:07<7:56:58, 5.89s/it] {'loss': 0.4576, 'learning_rate': 
1.2265918192213153e-05, 'epoch': 0.44} + 44%|████▍ | 3889/8750 [2:28:04<7:56:58, 5.89s/it] 44%|████▍ | 3890/8750 [2:28:12<7:49:48, 5.80s/it] 44%|████▍ | 3890/8750 [2:28:09<7:49:48, 5.80s/it] {'loss': 0.4893, 'learning_rate': 1.2262312665041482e-05, 'epoch': 0.44} + {'loss': 0.4893, 'learning_rate': 1.2262312665041482e-05, 'epoch': 0.44} 44%|████▍ | 3890/8750 [2:28:12<7:49:48, 5.80s/it] + 44%|████▍ | 3890/8750 [2:28:09<7:49:48, 5.80s/it] 44%|████▍ | 3891/8750 [2:28:18<7:50:26, 5.81s/it] 44%|████▍ | 3891/8750 [2:28:15<7:50:26, 5.81s/it] {'loss': 0.462, 'learning_rate': 1.2258706827882472e-05, 'epoch': 0.44} + 44%|████▍ | 3891/8750 [2:28:18<7:50:26, 5.81s/it] {'loss': 0.462, 'learning_rate': 1.2258706827882472e-05, 'epoch': 0.44} + 44%|████▍ | 3891/8750 [2:28:15<7:50:26, 5.81s/it] 44%|████▍ | 3892/8750 [2:28:24<7:47:28, 5.77s/it] 44%|████▍ | 3892/8750 [2:28:21<7:47:27, 5.77s/it] {'loss': 0.4632, 'learning_rate': 1.2255100681230192e-05, 'epoch': 0.44} + 44%|████▍ | 3892/8750 [2:28:24<7:47:28, 5.77s/it] {'loss': 0.4632, 'learning_rate': 1.2255100681230192e-05, 'epoch': 0.44} + 44%|████▍ | 3892/8750 [2:28:21<7:47:27, 5.77s/it] 44%|████▍ | 3893/8750 [2:28:30<7:48:59, 5.79s/it] 44%|████▍ | 3893/8750 [2:28:27<7:48:59, 5.79s/it] {'loss': 0.4544, 'learning_rate': 1.2251494225578775e-05, 'epoch': 0.44} + 44%|████▍ | 3893/8750 [2:28:30<7:48:59, 5.79s/it] {'loss': 0.4544, 'learning_rate': 1.2251494225578775e-05, 'epoch': 0.44} + 44%|████▍ | 3893/8750 [2:28:27<7:48:59, 5.79s/it] 45%|████▍ | 3894/8750 [2:28:32<7:46:29, 5.76s/it] 45%|████▍ | 3894/8750 [2:28:35<7:46:29, 5.76s/it]{'loss': 0.4882, 'learning_rate': 1.224788746142238e-05, 'epoch': 0.45} + {'loss': 0.4882, 'learning_rate': 1.224788746142238e-05, 'epoch': 0.45} + 45%|████▍ | 3894/8750 [2:28:35<7:46:29, 5.76s/it] 45%|████▍ | 3894/8750 [2:28:32<7:46:29, 5.76s/it] 45%|████▍ | 3895/8750 [2:28:41<7:47:31, 5.78s/it] 45%|████▍ | 3895/8750 [2:28:38<7:47:31, 5.78s/it] {'loss': 0.4637, 'learning_rate': 1.2244280389255218e-05, 
'epoch': 0.45} + 45%|████▍ | 3895/8750 [2:28:41<7:47:31, 5.78s/it] {'loss': 0.4637, 'learning_rate': 1.2244280389255218e-05, 'epoch': 0.45} + 45%|████▍ | 3895/8750 [2:28:38<7:47:31, 5.78s/it] 45%|████▍ | 3896/8750 [2:28:47<7:52:22, 5.84s/it] 45%|████▍ | 3896/8750 [2:28:44<7:52:22, 5.84s/it] {'loss': 0.459, 'learning_rate': 1.2240673009571536e-05, 'epoch': 0.45} + 45%|████▍ | 3896/8750 [2:28:47<7:52:22, 5.84s/it] {'loss': 0.459, 'learning_rate': 1.2240673009571536e-05, 'epoch': 0.45} + 45%|████▍ | 3896/8750 [2:28:44<7:52:22, 5.84s/it] 45%|████▍ | 3897/8750 [2:28:53<7:47:55, 5.79s/it] 45%|████▍ | 3897/8750 [2:28:50<7:47:53, 5.78s/it] {'loss': 0.4563, 'learning_rate': 1.2237065322865625e-05, 'epoch': 0.45} + 45%|████▍ | 3897/8750 [2:28:53<7:47:55, 5.79s/it] {'loss': 0.4563, 'learning_rate': 1.2237065322865625e-05, 'epoch': 0.45} + 45%|████▍ | 3897/8750 [2:28:50<7:47:53, 5.78s/it] 45%|████▍ | 3898/8750 [2:28:59<7:50:24, 5.82s/it] 45%|████▍ | 3898/8750 [2:28:56<7:50:25, 5.82s/it] {'loss': 0.4665, 'learning_rate': 1.223345732963182e-05, 'epoch': 0.45} + 45%|████▍ | 3898/8750 [2:28:59<7:50:24, 5.82s/it] {'loss': 0.4665, 'learning_rate': 1.223345732963182e-05, 'epoch': 0.45} + 45%|████▍ | 3898/8750 [2:28:56<7:50:25, 5.82s/it] 45%|████▍ | 3899/8750 [2:29:05<7:54:15, 5.87s/it] 45%|████▍ | 3899/8750 [2:29:02<7:54:15, 5.87s/it] {'loss': 0.4574, 'learning_rate': 1.2229849030364496e-05, 'epoch': 0.45} + 45%|████▍ | 3899/8750 [2:29:05<7:54:15, 5.87s/it] {'loss': 0.4574, 'learning_rate': 1.2229849030364496e-05, 'epoch': 0.45} + 45%|████▍ | 3899/8750 [2:29:02<7:54:15, 5.87s/it]10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 45%|████▍ | 3900/8750 [2:29:10<7:52:49, 5.85s/it]47 AutoResumeHook: Checking whether to suspend... 
+AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +013 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 45%|████▍ | 3900/8750 [2:29:08<7:52:49, 5.85s/it] {'loss': 0.4701, 'learning_rate': 1.2226240425558071e-05, 'epoch': 0.45} + 45%|████▍ | 3900/8750 [2:29:10<7:52:49, 5.85s/it] {'loss': 0.4701, 'learning_rate': 1.2226240425558071e-05, 'epoch': 0.45} + 45%|████▍ | 3900/8750 [2:29:08<7:52:49, 5.85s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-3900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 45%|████▍ | 3901/8750 [2:29:36<15:53:18, 11.80s/it] 45%|████▍ | 3901/8750 [2:29:33<15:53:18, 11.80s/it] {'loss': 0.4649, 'learning_rate': 1.2222631515707005e-05, 'epoch': 0.45} + 45%|████▍ | 3901/8750 [2:29:36<15:53:18, 11.80s/it] {'loss': 0.4649, 'learning_rate': 1.2222631515707005e-05, 'epoch': 0.45} + 45%|████▍ | 3901/8750 [2:29:33<15:53:18, 11.80s/it] 45%|████▍ | 3902/8750 [2:29:42<13:23:36, 9.95s/it] 45%|████▍ | 3902/8750 [2:29:39<13:23:36, 9.95s/it] {'loss': 0.4686, 'learning_rate': 1.2219022301305796e-05, 'epoch': 0.45} + 45%|████▍ | 3902/8750 [2:29:42<13:23:36, 9.95s/it] {'loss': 0.4686, 'learning_rate': 1.2219022301305796e-05, 'epoch': 0.45} + 45%|████▍ | 3902/8750 [2:29:39<13:23:36, 9.95s/it] 45%|████▍ | 3903/8750 [2:29:47<11:40:28, 8.67s/it] 45%|████▍ | 3903/8750 [2:29:45<11:40:28, 8.67s/it] {'loss': 0.4741, 'learning_rate': 1.2215412782848993e-05, 'epoch': 0.45} + 45%|████▍ | 3903/8750 [2:29:47<11:40:28, 8.67s/it] {'loss': 0.4741, 'learning_rate': 1.2215412782848993e-05, 'epoch': 0.45} + 45%|████▍ | 3903/8750 [2:29:45<11:40:28, 8.67s/it] 45%|████▍ | 3904/8750 [2:29:50<10:33:15, 7.84s/it] 45%|████▍ | 3904/8750 [2:29:53<10:33:16, 7.84s/it] {'loss': 0.4629, 'learning_rate': 1.2211802960831176e-05, 'epoch': 0.45} + {'loss': 0.4629, 'learning_rate': 1.2211802960831176e-05, 'epoch': 0.45} 45%|████▍ | 3904/8750 [2:29:53<10:33:16, 7.84s/it] + 45%|████▍ | 3904/8750 [2:29:50<10:33:15, 7.84s/it] 45%|████▍ | 3905/8750 [2:29:59<9:45:01, 7.24s/it] 45%|████▍ | 3905/8750 [2:29:56<9:45:02, 7.25s/it] {'loss': 0.4678, 'learning_rate': 1.2208192835746973e-05, 'epoch': 0.45} + 45%|████▍ | 3905/8750 [2:29:59<9:45:01, 7.24s/it] {'loss': 0.4678, 'learning_rate': 1.2208192835746973e-05, 'epoch': 0.45} + 45%|████▍ | 3905/8750 [2:29:56<9:45:02, 7.25s/it] 45%|████▍ | 3906/8750 [2:30:05<9:11:34, 6.83s/it] 45%|████▍ | 3906/8750 [2:30:02<9:11:33, 6.83s/it] {'loss': 0.4732, 'learning_rate': 1.2204582408091047e-05, 'epoch': 0.45} + 45%|████▍ | 3906/8750 
[2:30:05<9:11:34, 6.83s/it] {'loss': 0.4732, 'learning_rate': 1.2204582408091047e-05, 'epoch': 0.45} + 45%|████▍ | 3906/8750 [2:30:02<9:11:33, 6.83s/it] 45%|████▍ | 3907/8750 [2:30:11<8:53:40, 6.61s/it] 45%|████▍ | 3907/8750 [2:30:08<8:53:40, 6.61s/it] {'loss': 0.4798, 'learning_rate': 1.2200971678358113e-05, 'epoch': 0.45} + 45%|████▍ | 3907/8750 [2:30:11<8:53:40, 6.61s/it] {'loss': 0.4798, 'learning_rate': 1.2200971678358113e-05, 'epoch': 0.45} + 45%|████▍ | 3907/8750 [2:30:08<8:53:40, 6.61s/it] 45%|████▍ | 3908/8750 [2:30:17<8:36:24, 6.40s/it] 45%|████▍ | 3908/8750 [2:30:14<8:36:24, 6.40s/it] {'loss': 0.4674, 'learning_rate': 1.2197360647042922e-05, 'epoch': 0.45} + 45%|████▍ | 3908/8750 [2:30:17<8:36:24, 6.40s/it] {'loss': 0.4674, 'learning_rate': 1.2197360647042922e-05, 'epoch': 0.45} + 45%|████▍ | 3908/8750 [2:30:14<8:36:24, 6.40s/it] 45%|████▍ | 3909/8750 [2:30:23<8:25:36, 6.27s/it] 45%|████▍ | 3909/8750 [2:30:20<8:25:37, 6.27s/it] {'loss': 0.4708, 'learning_rate': 1.2193749314640264e-05, 'epoch': 0.45} + 45%|████▍ | 3909/8750 [2:30:23<8:25:36, 6.27s/it] {'loss': 0.4708, 'learning_rate': 1.2193749314640264e-05, 'epoch': 0.45} + 45%|████▍ | 3909/8750 [2:30:20<8:25:37, 6.27s/it] 45%|████▍ | 3910/8750 [2:30:29<8:11:48, 6.10s/it] 45%|████▍ | 3910/8750 [2:30:26<8:11:48, 6.10s/it] {'loss': 0.4884, 'learning_rate': 1.2190137681644968e-05, 'epoch': 0.45} + 45%|████▍ | 3910/8750 [2:30:29<8:11:48, 6.10s/it] {'loss': 0.4884, 'learning_rate': 1.2190137681644968e-05, 'epoch': 0.45} + 45%|████▍ | 3910/8750 [2:30:26<8:11:48, 6.10s/it] 45%|████▍ | 3911/8750 [2:30:34<7:59:28, 5.95s/it] 45%|████▍ | 3911/8750 [2:30:31<7:59:27, 5.95s/it] {'loss': 0.468, 'learning_rate': 1.2186525748551914e-05, 'epoch': 0.45} + 45%|████▍ | 3911/8750 [2:30:34<7:59:28, 5.95s/it] {'loss': 0.468, 'learning_rate': 1.2186525748551914e-05, 'epoch': 0.45} + 45%|████▍ | 3911/8750 [2:30:31<7:59:27, 5.95s/it] 45%|████▍ | 3912/8750 [2:30:40<7:54:15, 5.88s/it] 45%|████▍ | 3912/8750 [2:30:37<7:54:15, 
5.88s/it] {'loss': 0.4619, 'learning_rate': 1.2182913515856016e-05, 'epoch': 0.45} + 45%|████▍ | 3912/8750 [2:30:40<7:54:15, 5.88s/it] {'loss': 0.4619, 'learning_rate': 1.2182913515856016e-05, 'epoch': 0.45} + 45%|████▍ | 3912/8750 [2:30:37<7:54:15, 5.88s/it] 45%|████▍ | 3913/8750 [2:30:46<8:02:02, 5.98s/it] 45%|████▍ | 3913/8750 [2:30:43<8:02:01, 5.98s/it] {'loss': 0.4604, 'learning_rate': 1.2179300984052233e-05, 'epoch': 0.45} + 45%|████▍ | 3913/8750 [2:30:46<8:02:02, 5.98s/it] {'loss': 0.4604, 'learning_rate': 1.2179300984052233e-05, 'epoch': 0.45} + 45%|████▍ | 3913/8750 [2:30:43<8:02:01, 5.98s/it] 45%|████▍ | 3914/8750 [2:30:52<7:56:45, 5.92s/it] 45%|████▍ | 3914/8750 [2:30:49<7:56:46, 5.92s/it] {'loss': 0.4674, 'learning_rate': 1.217568815363556e-05, 'epoch': 0.45} + 45%|████▍ | 3914/8750 [2:30:52<7:56:45, 5.92s/it] {'loss': 0.4674, 'learning_rate': 1.217568815363556e-05, 'epoch': 0.45} + 45%|████▍ | 3914/8750 [2:30:49<7:56:46, 5.92s/it] 45%|████▍ | 3915/8750 [2:30:58<7:51:39, 5.85s/it] 45%|████▍ | 3915/8750 [2:30:55<7:51:39, 5.85s/it] {'loss': 0.4838, 'learning_rate': 1.2172075025101032e-05, 'epoch': 0.45} + 45%|████▍ | 3915/8750 [2:30:58<7:51:39, 5.85s/it] {'loss': 0.4838, 'learning_rate': 1.2172075025101032e-05, 'epoch': 0.45} + 45%|████▍ | 3915/8750 [2:30:55<7:51:39, 5.85s/it] 45%|████▍ | 3916/8750 [2:31:04<7:52:38, 5.87s/it] 45%|████▍ | 3916/8750 [2:31:01<7:52:39, 5.87s/it] {'loss': 0.4492, 'learning_rate': 1.2168461598943728e-05, 'epoch': 0.45} + 45%|████▍ | 3916/8750 [2:31:04<7:52:38, 5.87s/it] {'loss': 0.4492, 'learning_rate': 1.2168461598943728e-05, 'epoch': 0.45} + 45%|████▍ | 3916/8750 [2:31:01<7:52:39, 5.87s/it] 45%|████▍ | 3917/8750 [2:31:09<7:48:30, 5.82s/it] 45%|████▍ | 3917/8750 [2:31:06<7:48:30, 5.82s/it] {'loss': 0.4636, 'learning_rate': 1.2164847875658776e-05, 'epoch': 0.45} + 45%|████▍ | 3917/8750 [2:31:09<7:48:30, 5.82s/it] {'loss': 0.4636, 'learning_rate': 1.2164847875658776e-05, 'epoch': 0.45} + 45%|████▍ | 3917/8750 [2:31:06<7:48:30, 
5.82s/it] 45%|████▍ | 3918/8750 [2:31:15<7:46:42, 5.80s/it] 45%|████▍ | 3918/8750 [2:31:12<7:46:41, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2161233855741332e-05, 'epoch': 0.45} + 45%|████▍ | 3918/8750 [2:31:15<7:46:42, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2161233855741332e-05, 'epoch': 0.45} + 45%|████▍ | 3918/8750 [2:31:12<7:46:41, 5.80s/it] 45%|████▍ | 3919/8750 [2:31:21<7:51:48, 5.86s/it] 45%|████▍ | 3919/8750 [2:31:18<7:51:48, 5.86s/it] {'loss': 0.488, 'learning_rate': 1.2157619539686597e-05, 'epoch': 0.45} + 45%|████▍ | 3919/8750 [2:31:21<7:51:48, 5.86s/it] {'loss': 0.488, 'learning_rate': 1.2157619539686597e-05, 'epoch': 0.45} + 45%|████▍ | 3919/8750 [2:31:18<7:51:48, 5.86s/it] 45%|████▍ | 3920/8750 [2:31:27<7:47:08, 5.80s/it] 45%|████▍ | 3920/8750 [2:31:24<7:47:08, 5.80s/it] {'loss': 0.4618, 'learning_rate': 1.2154004927989815e-05, 'epoch': 0.45} + 45%|████▍ | 3920/8750 [2:31:27<7:47:08, 5.80s/it] {'loss': 0.4618, 'learning_rate': 1.2154004927989815e-05, 'epoch': 0.45} + 45%|████▍ | 3920/8750 [2:31:24<7:47:08, 5.80s/it] 45%|████▍ | 3921/8750 [2:31:33<7:58:00, 5.94s/it] 45%|████▍ | 3921/8750 [2:31:30<7:58:01, 5.94s/it] {'loss': 0.4698, 'learning_rate': 1.2150390021146263e-05, 'epoch': 0.45} + 45%|████▍ | 3921/8750 [2:31:33<7:58:00, 5.94s/it] {'loss': 0.4698, 'learning_rate': 1.2150390021146263e-05, 'epoch': 0.45} + 45%|████▍ | 3921/8750 [2:31:30<7:58:01, 5.94s/it] 45%|████▍ | 3922/8750 [2:31:39<7:53:34, 5.89s/it] 45%|████▍ | 3922/8750 [2:31:36<7:53:34, 5.89s/it] {'loss': 0.4638, 'learning_rate': 1.2146774819651275e-05, 'epoch': 0.45} + 45%|████▍ | 3922/8750 [2:31:39<7:53:34, 5.89s/it] {'loss': 0.4638, 'learning_rate': 1.2146774819651275e-05, 'epoch': 0.45} + 45%|████▍ | 3922/8750 [2:31:36<7:53:34, 5.89s/it] 45%|████▍ | 3923/8750 [2:31:45<7:49:53, 5.84s/it] 45%|████▍ | 3923/8750 [2:31:42<7:49:53, 5.84s/it] {'loss': 0.4549, 'learning_rate': 1.2143159324000204e-05, 'epoch': 0.45} + 45%|████▍ | 3923/8750 [2:31:45<7:49:53, 5.84s/it] {'loss': 0.4549, 
'learning_rate': 1.2143159324000204e-05, 'epoch': 0.45} + 45%|████▍ | 3923/8750 [2:31:42<7:49:53, 5.84s/it] 45%|████▍ | 3924/8750 [2:31:50<7:47:04, 5.81s/it] 45%|████▍ | 3924/8750 [2:31:47<7:47:05, 5.81s/it] {'loss': 0.4666, 'learning_rate': 1.2139543534688456e-05, 'epoch': 0.45} + 45%|████▍ | 3924/8750 [2:31:50<7:47:04, 5.81s/it] {'loss': 0.4666, 'learning_rate': 1.2139543534688456e-05, 'epoch': 0.45} + 45%|████▍ | 3924/8750 [2:31:47<7:47:05, 5.81s/it] 45%|████▍ | 3925/8750 [2:31:56<7:44:23, 5.77s/it] 45%|████▍ | 3925/8750 [2:31:53<7:44:23, 5.77s/it] {'loss': 0.4641, 'learning_rate': 1.2135927452211477e-05, 'epoch': 0.45} + 45%|████▍ | 3925/8750 [2:31:56<7:44:23, 5.77s/it] {'loss': 0.4641, 'learning_rate': 1.2135927452211477e-05, 'epoch': 0.45} + 45%|████▍ | 3925/8750 [2:31:53<7:44:23, 5.77s/it] 45%|████▍ | 3926/8750 [2:32:02<7:49:37, 5.84s/it] 45%|████▍ | 3926/8750 [2:31:59<7:49:36, 5.84s/it] {'loss': 0.4634, 'learning_rate': 1.2132311077064749e-05, 'epoch': 0.45} + 45%|████▍ | 3926/8750 [2:32:02<7:49:37, 5.84s/it] {'loss': 0.4634, 'learning_rate': 1.2132311077064749e-05, 'epoch': 0.45} + 45%|████▍ | 3926/8750 [2:31:59<7:49:36, 5.84s/it] 45%|████▍ | 3927/8750 [2:32:08<7:50:12, 5.85s/it] 45%|████▍ | 3927/8750 [2:32:05<7:50:12, 5.85s/it] {'loss': 0.4843, 'learning_rate': 1.2128694409743797e-05, 'epoch': 0.45} + 45%|████▍ | 3927/8750 [2:32:08<7:50:12, 5.85s/it] {'loss': 0.4843, 'learning_rate': 1.2128694409743797e-05, 'epoch': 0.45} + 45%|████▍ | 3927/8750 [2:32:05<7:50:12, 5.85s/it] 45%|████▍ | 3928/8750 [2:32:14<7:47:42, 5.82s/it] 45%|████▍ | 3928/8750 [2:32:11<7:47:41, 5.82s/it] {'loss': 0.4659, 'learning_rate': 1.2125077450744187e-05, 'epoch': 0.45} + 45%|████▍ | 3928/8750 [2:32:14<7:47:42, 5.82s/it] {'loss': 0.4659, 'learning_rate': 1.2125077450744187e-05, 'epoch': 0.45} + 45%|████▍ | 3928/8750 [2:32:11<7:47:41, 5.82s/it] 45%|████▍ | 3929/8750 [2:32:19<7:49:19, 5.84s/it] 45%|████▍ | 3929/8750 [2:32:17<7:49:19, 5.84s/it] {'loss': 0.4573, 'learning_rate': 
1.2121460200561521e-05, 'epoch': 0.45} + 45%|████▍ | 3929/8750 [2:32:19<7:49:19, 5.84s/it] {'loss': 0.4573, 'learning_rate': 1.2121460200561521e-05, 'epoch': 0.45} + 45%|████▍ | 3929/8750 [2:32:17<7:49:19, 5.84s/it] 45%|████▍ | 3930/8750 [2:32:25<7:44:58, 5.79s/it] 45%|████▍ | 3930/8750 [2:32:22<7:44:58, 5.79s/it] {'loss': 0.4587, 'learning_rate': 1.2117842659691444e-05, 'epoch': 0.45} + 45%|████▍ | 3930/8750 [2:32:25<7:44:58, 5.79s/it] {'loss': 0.4587, 'learning_rate': 1.2117842659691444e-05, 'epoch': 0.45} + 45%|████▍ | 3930/8750 [2:32:22<7:44:58, 5.79s/it] 45%|████▍ | 3931/8750 [2:32:31<7:52:12, 5.88s/it] 45%|████▍ | 3931/8750 [2:32:28<7:52:12, 5.88s/it] {'loss': 0.467, 'learning_rate': 1.2114224828629638e-05, 'epoch': 0.45} + 45%|████▍ | 3931/8750 [2:32:31<7:52:12, 5.88s/it] {'loss': 0.467, 'learning_rate': 1.2114224828629638e-05, 'epoch': 0.45} + 45%|████▍ | 3931/8750 [2:32:28<7:52:12, 5.88s/it] 45%|████▍ | 3932/8750 [2:32:37<7:54:54, 5.91s/it] 45%|████▍ | 3932/8750 [2:32:34<7:54:54, 5.91s/it] {'loss': 0.4531, 'learning_rate': 1.2110606707871828e-05, 'epoch': 0.45} + 45%|████▍ | 3932/8750 [2:32:37<7:54:54, 5.91s/it] {'loss': 0.4531, 'learning_rate': 1.2110606707871828e-05, 'epoch': 0.45} + 45%|████▍ | 3932/8750 [2:32:34<7:54:54, 5.91s/it] 45%|████▍ | 3933/8750 [2:32:43<7:51:51, 5.88s/it] 45%|████▍ | 3933/8750 [2:32:40<7:51:51, 5.88s/it] {'loss': 0.4764, 'learning_rate': 1.2106988297913778e-05, 'epoch': 0.45} + 45%|████▍ | 3933/8750 [2:32:43<7:51:51, 5.88s/it] {'loss': 0.4764, 'learning_rate': 1.2106988297913778e-05, 'epoch': 0.45} + 45%|████▍ | 3933/8750 [2:32:40<7:51:51, 5.88s/it] 45%|████▍ | 3934/8750 [2:32:49<7:47:39, 5.83s/it] 45%|████▍ | 3934/8750 [2:32:46<7:47:39, 5.83s/it] {'loss': 0.4769, 'learning_rate': 1.2103369599251289e-05, 'epoch': 0.45} + 45%|████▍ | 3934/8750 [2:32:49<7:47:39, 5.83s/it] {'loss': 0.4769, 'learning_rate': 1.2103369599251289e-05, 'epoch': 0.45} + 45%|████▍ | 3934/8750 [2:32:46<7:47:39, 5.83s/it] 45%|████▍ | 3935/8750 
[2:32:55<7:47:41, 5.83s/it] 45%|████▍ | 3935/8750 [2:32:52<7:47:41, 5.83s/it] {'loss': 0.4911, 'learning_rate': 1.2099750612380205e-05, 'epoch': 0.45} + 45%|████▍ | 3935/8750 [2:32:55<7:47:41, 5.83s/it] {'loss': 0.4911, 'learning_rate': 1.2099750612380205e-05, 'epoch': 0.45} + 45%|████▍ | 3935/8750 [2:32:52<7:47:41, 5.83s/it] 45%|████▍ | 3936/8750 [2:33:00<7:48:21, 5.84s/it] 45%|████▍ | 3936/8750 [2:32:57<7:48:21, 5.84s/it] {'loss': 0.4438, 'learning_rate': 1.2096131337796408e-05, 'epoch': 0.45} + 45%|████▍ | 3936/8750 [2:33:00<7:48:21, 5.84s/it] {'loss': 0.4438, 'learning_rate': 1.2096131337796408e-05, 'epoch': 0.45} + 45%|████▍ | 3936/8750 [2:32:57<7:48:21, 5.84s/it] 45%|████▍ | 3937/8750 [2:33:06<7:43:09, 5.77s/it] 45%|████▍ | 3937/8750 [2:33:03<7:43:09, 5.77s/it] {'loss': 0.4926, 'learning_rate': 1.2092511775995821e-05, 'epoch': 0.45} + 45%|████▍ | 3937/8750 [2:33:06<7:43:09, 5.77s/it] {'loss': 0.4926, 'learning_rate': 1.2092511775995821e-05, 'epoch': 0.45} + 45%|████▍ | 3937/8750 [2:33:03<7:43:09, 5.77s/it] 45%|████▌ | 3938/8750 [2:33:12<7:43:17, 5.78s/it] 45%|████▌ | 3938/8750 [2:33:09<7:43:16, 5.78s/it] {'loss': 0.4568, 'learning_rate': 1.20888919274744e-05, 'epoch': 0.45} + 45%|████▌ | 3938/8750 [2:33:12<7:43:17, 5.78s/it] {'loss': 0.4568, 'learning_rate': 1.20888919274744e-05, 'epoch': 0.45} + 45%|████▌ | 3938/8750 [2:33:09<7:43:16, 5.78s/it] 45%|████▌ | 3939/8750 [2:33:18<7:42:17, 5.77s/it] 45%|████▌ | 3939/8750 [2:33:15<7:42:17, 5.77s/it] {'loss': 0.4464, 'learning_rate': 1.208527179272815e-05, 'epoch': 0.45} + 45%|████▌ | 3939/8750 [2:33:18<7:42:17, 5.77s/it] {'loss': 0.4464, 'learning_rate': 1.208527179272815e-05, 'epoch': 0.45} + 45%|████▌ | 3939/8750 [2:33:15<7:42:17, 5.77s/it] 45%|████▌ | 3940/8750 [2:33:23<7:39:39, 5.73s/it] 45%|████▌ | 3940/8750 [2:33:20<7:39:39, 5.73s/it] {'loss': 0.4697, 'learning_rate': 1.2081651372253107e-05, 'epoch': 0.45} + 45%|████▌ | 3940/8750 [2:33:23<7:39:39, 5.73s/it] {'loss': 0.4697, 'learning_rate': 
1.2081651372253107e-05, 'epoch': 0.45} + 45%|████▌ | 3940/8750 [2:33:20<7:39:39, 5.73s/it] 45%|████▌ | 3941/8750 [2:33:29<7:49:38, 5.86s/it] 45%|████▌ | 3941/8750 [2:33:26<7:49:38, 5.86s/it] {'loss': 0.4728, 'learning_rate': 1.2078030666545351e-05, 'epoch': 0.45} + 45%|████▌ | 3941/8750 [2:33:29<7:49:38, 5.86s/it] {'loss': 0.4728, 'learning_rate': 1.2078030666545351e-05, 'epoch': 0.45} + 45%|████▌ | 3941/8750 [2:33:26<7:49:38, 5.86s/it] 45%|████▌ | 3942/8750 [2:33:35<7:46:34, 5.82s/it] 45%|████▌ | 3942/8750 [2:33:32<7:46:34, 5.82s/it] {'loss': 0.4593, 'learning_rate': 1.2074409676101e-05, 'epoch': 0.45} + 45%|████▌ | 3942/8750 [2:33:35<7:46:34, 5.82s/it] {'loss': 0.4593, 'learning_rate': 1.2074409676101e-05, 'epoch': 0.45} + 45%|████▌ | 3942/8750 [2:33:32<7:46:34, 5.82s/it] 45%|████▌ | 3943/8750 [2:33:41<7:44:50, 5.80s/it] 45%|████▌ | 3943/8750 [2:33:38<7:44:51, 5.80s/it] {'loss': 0.4709, 'learning_rate': 1.2070788401416209e-05, 'epoch': 0.45} + 45%|████▌ | 3943/8750 [2:33:41<7:44:50, 5.80s/it] {'loss': 0.4709, 'learning_rate': 1.2070788401416209e-05, 'epoch': 0.45} + 45%|████▌ | 3943/8750 [2:33:38<7:44:51, 5.80s/it] 45%|████▌ | 3944/8750 [2:33:47<7:42:37, 5.78s/it] 45%|████▌ | 3944/8750 [2:33:44<7:42:38, 5.78s/it] {'loss': 0.4667, 'learning_rate': 1.2067166842987175e-05, 'epoch': 0.45} + 45%|████▌ | 3944/8750 [2:33:47<7:42:37, 5.78s/it] {'loss': 0.4667, 'learning_rate': 1.2067166842987175e-05, 'epoch': 0.45} + 45%|████▌ | 3944/8750 [2:33:44<7:42:38, 5.78s/it] 45%|████▌ | 3945/8750 [2:33:52<7:41:26, 5.76s/it] 45%|████▌ | 3945/8750 [2:33:49<7:41:26, 5.76s/it] {'loss': 0.4728, 'learning_rate': 1.206354500131013e-05, 'epoch': 0.45} + 45%|████▌ | 3945/8750 [2:33:52<7:41:26, 5.76s/it] {'loss': 0.4728, 'learning_rate': 1.206354500131013e-05, 'epoch': 0.45} + 45%|████▌ | 3945/8750 [2:33:49<7:41:26, 5.76s/it] 45%|████▌ | 3946/8750 [2:33:58<7:38:58, 5.73s/it] 45%|████▌ | 3946/8750 [2:33:55<7:38:58, 5.73s/it] {'loss': 0.4962, 'learning_rate': 1.205992287688135e-05, 'epoch': 
0.45} + 45%|████▌ | 3946/8750 [2:33:58<7:38:58, 5.73s/it] {'loss': 0.4962, 'learning_rate': 1.205992287688135e-05, 'epoch': 0.45} + 45%|████▌ | 3946/8750 [2:33:55<7:38:58, 5.73s/it] 45%|████▌ | 3947/8750 [2:34:04<7:37:10, 5.71s/it] 45%|████▌ | 3947/8750 [2:34:01<7:37:10, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.2056300470197144e-05, 'epoch': 0.45} + 45%|████▌ | 3947/8750 [2:34:04<7:37:10, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.2056300470197144e-05, 'epoch': 0.45} + 45%|████▌ | 3947/8750 [2:34:01<7:37:10, 5.71s/it] 45%|████▌ | 3948/8750 [2:34:09<7:38:34, 5.73s/it] 45%|████▌ | 3948/8750 [2:34:06<7:38:34, 5.73s/it] {'loss': 0.4539, 'learning_rate': 1.2052677781753869e-05, 'epoch': 0.45} + 45%|████▌ | 3948/8750 [2:34:09<7:38:34, 5.73s/it] {'loss': 0.4539, 'learning_rate': 1.2052677781753869e-05, 'epoch': 0.45} + 45%|████▌ | 3948/8750 [2:34:07<7:38:34, 5.73s/it] 45%|████▌ | 3949/8750 [2:34:15<7:44:10, 5.80s/it] 45%|████▌ | 3949/8750 [2:34:12<7:44:10, 5.80s/it] {'loss': 0.4602, 'learning_rate': 1.2049054812047905e-05, 'epoch': 0.45} + 45%|████▌ | 3949/8750 [2:34:15<7:44:10, 5.80s/it] {'loss': 0.4602, 'learning_rate': 1.2049054812047905e-05, 'epoch': 0.45} + 45%|████▌ | 3949/8750 [2:34:12<7:44:10, 5.80s/it]10 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 45%|████▌ | 3950/8750 [2:34:21<7:41:02, 5.76s/it]3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +013 AutoResumeHook: Checking whether to suspend... 
+2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 45%|████▌ | 3950/8750 [2:34:18<7:41:03, 5.76s/it] {'loss': 0.4621, 'learning_rate': 1.2045431561575685e-05, 'epoch': 0.45} + 45%|████▌ | 3950/8750 [2:34:21<7:41:02, 5.76s/it] {'loss': 0.4621, 'learning_rate': 1.2045431561575685e-05, 'epoch': 0.45} + 45%|████▌ | 3950/8750 [2:34:18<7:41:03, 5.76s/it] 45%|████▌ | 3951/8750 [2:34:27<7:38:33, 5.73s/it] 45%|████▌ | 3951/8750 [2:34:24<7:38:34, 5.73s/it] {'loss': 0.4672, 'learning_rate': 1.2041808030833675e-05, 'epoch': 0.45} + 45%|████▌ | 3951/8750 [2:34:27<7:38:33, 5.73s/it] {'loss': 0.4672, 'learning_rate': 1.2041808030833675e-05, 'epoch': 0.45} + 45%|████▌ | 3951/8750 [2:34:24<7:38:34, 5.73s/it] 45%|████▌ | 3952/8750 [2:34:32<7:36:29, 5.71s/it] 45%|████▌ | 3952/8750 [2:34:29<7:36:29, 5.71s/it] {'loss': 0.4647, 'learning_rate': 1.2038184220318381e-05, 'epoch': 0.45} + {'loss': 0.4647, 'learning_rate': 1.2038184220318381e-05, 'epoch': 0.45} 45%|████▌ | 3952/8750 [2:34:32<7:36:29, 5.71s/it] + 45%|████▌ | 3952/8750 [2:34:29<7:36:29, 5.71s/it] 45%|████▌ | 3953/8750 [2:34:38<7:37:19, 5.72s/it] 45%|████▌ | 3953/8750 [2:34:35<7:37:19, 5.72s/it] {'loss': 0.4773, 'learning_rate': 1.2034560130526341e-05, 'epoch': 0.45} + 45%|████▌ | 3953/8750 [2:34:38<7:37:19, 5.72s/it] {'loss': 0.4773, 'learning_rate': 1.2034560130526341e-05, 'epoch': 0.45} + 45%|████▌ | 3953/8750 [2:34:35<7:37:19, 5.72s/it] 45%|████▌ | 3954/8750 [2:34:44<7:38:51, 5.74s/it] 45%|████▌ | 3954/8750 [2:34:41<7:38:51, 5.74s/it] {'loss': 0.4605, 'learning_rate': 1.2030935761954137e-05, 'epoch': 0.45} + 45%|████▌ | 3954/8750 [2:34:44<7:38:51, 5.74s/it] {'loss': 0.4605, 'learning_rate': 1.2030935761954137e-05, 'epoch': 0.45} + 45%|████▌ | 3954/8750 [2:34:41<7:38:51, 5.74s/it] 45%|████▌ | 3955/8750 [2:34:50<7:38:44, 5.74s/it] 45%|████▌ | 3955/8750 [2:34:47<7:38:44, 5.74s/it] {'loss': 0.4847, 'learning_rate': 1.2027311115098395e-05, 'epoch': 0.45} + {'loss': 0.4847, 
'learning_rate': 1.2027311115098395e-05, 'epoch': 0.45} 45%|████▌ | 3955/8750 [2:34:50<7:38:44, 5.74s/it] + 45%|████▌ | 3955/8750 [2:34:47<7:38:44, 5.74s/it] 45%|████▌ | 3956/8750 [2:34:55<7:36:21, 5.71s/it] 45%|████▌ | 3956/8750 [2:34:52<7:36:21, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.2023686190455766e-05, 'epoch': 0.45} + 45%|████▌ | 3956/8750 [2:34:55<7:36:21, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.2023686190455766e-05, 'epoch': 0.45} + 45%|████▌ | 3956/8750 [2:34:52<7:36:21, 5.71s/it] 45%|████▌ | 3957/8750 [2:35:01<7:42:18, 5.79s/it] 45%|████▌ | 3957/8750 [2:34:58<7:42:18, 5.79s/it] {'loss': 0.4723, 'learning_rate': 1.202006098852295e-05, 'epoch': 0.45} + 45%|████▌ | 3957/8750 [2:35:01<7:42:18, 5.79s/it] {'loss': 0.4723, 'learning_rate': 1.202006098852295e-05, 'epoch': 0.45} + 45%|████▌ | 3957/8750 [2:34:58<7:42:18, 5.79s/it] 45%|████▌ | 3958/8750 [2:35:07<7:44:03, 5.81s/it] 45%|████▌ | 3958/8750 [2:35:04<7:44:02, 5.81s/it] {'loss': 0.4552, 'learning_rate': 1.2016435509796677e-05, 'epoch': 0.45} + 45%|████▌ | 3958/8750 [2:35:07<7:44:03, 5.81s/it] {'loss': 0.4552, 'learning_rate': 1.2016435509796677e-05, 'epoch': 0.45} + 45%|████▌ | 3958/8750 [2:35:04<7:44:02, 5.81s/it] 45%|████▌ | 3959/8750 [2:35:13<7:44:35, 5.82s/it] 45%|████▌ | 3959/8750 [2:35:10<7:44:35, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.2012809754773718e-05, 'epoch': 0.45} + 45%|████▌ | 3959/8750 [2:35:13<7:44:35, 5.82s/it] {'loss': 0.4701, 'learning_rate': 1.2012809754773718e-05, 'epoch': 0.45} + 45%|████▌ | 3959/8750 [2:35:10<7:44:35, 5.82s/it] 45%|████▌ | 3960/8750 [2:35:19<7:49:17, 5.88s/it] 45%|████▌ | 3960/8750 [2:35:16<7:49:17, 5.88s/it] {'loss': 0.4679, 'learning_rate': 1.2009183723950886e-05, 'epoch': 0.45} + 45%|████▌ | 3960/8750 [2:35:19<7:49:17, 5.88s/it] {'loss': 0.4679, 'learning_rate': 1.2009183723950886e-05, 'epoch': 0.45} + 45%|████▌ | 3960/8750 [2:35:16<7:49:17, 5.88s/it] 45%|████▌ | 3961/8750 [2:35:25<7:45:45, 5.84s/it] 45%|████▌ | 3961/8750 [2:35:22<7:45:45, 
5.84s/it] {'loss': 0.4634, 'learning_rate': 1.2005557417825029e-05, 'epoch': 0.45} + 45%|████▌ | 3961/8750 [2:35:25<7:45:45, 5.84s/it] {'loss': 0.4634, 'learning_rate': 1.2005557417825029e-05, 'epoch': 0.45} + 45%|████▌ | 3961/8750 [2:35:22<7:45:45, 5.84s/it] 45%|████▌ | 3962/8750 [2:35:30<7:44:23, 5.82s/it] 45%|████▌ | 3962/8750 [2:35:28<7:44:23, 5.82s/it] {'loss': 0.4503, 'learning_rate': 1.2001930836893026e-05, 'epoch': 0.45} + 45%|████▌ | 3962/8750 [2:35:30<7:44:23, 5.82s/it] {'loss': 0.4503, 'learning_rate': 1.2001930836893026e-05, 'epoch': 0.45} + 45%|████▌ | 3962/8750 [2:35:28<7:44:23, 5.82s/it] 45%|████▌ | 3963/8750 [2:35:36<7:41:57, 5.79s/it] 45%|████▌ | 3963/8750 [2:35:33<7:41:57, 5.79s/it] {'loss': 0.4864, 'learning_rate': 1.1998303981651804e-05, 'epoch': 0.45} + {'loss': 0.4864, 'learning_rate': 1.1998303981651804e-05, 'epoch': 0.45} 45%|████▌ | 3963/8750 [2:35:36<7:41:57, 5.79s/it] + 45%|████▌ | 3963/8750 [2:35:33<7:41:57, 5.79s/it] 45%|████▌ | 3964/8750 [2:35:42<7:38:16, 5.75s/it] 45%|████▌ | 3964/8750 [2:35:39<7:38:16, 5.75s/it] {'loss': 0.4726, 'learning_rate': 1.199467685259832e-05, 'epoch': 0.45} + 45%|████▌ | 3964/8750 [2:35:42<7:38:16, 5.75s/it] {'loss': 0.4726, 'learning_rate': 1.199467685259832e-05, 'epoch': 0.45} + 45%|████▌ | 3964/8750 [2:35:39<7:38:16, 5.75s/it] 45%|████▌ | 3965/8750 [2:35:48<7:40:52, 5.78s/it] 45%|████▌ | 3965/8750 [2:35:45<7:40:52, 5.78s/it] {'loss': 0.4478, 'learning_rate': 1.1991049450229577e-05, 'epoch': 0.45} + 45%|████▌ | 3965/8750 [2:35:48<7:40:52, 5.78s/it] {'loss': 0.4478, 'learning_rate': 1.1991049450229577e-05, 'epoch': 0.45} + 45%|████▌ | 3965/8750 [2:35:45<7:40:52, 5.78s/it] 45%|████▌ | 3966/8750 [2:35:53<7:38:44, 5.75s/it] 45%|████▌ | 3966/8750 [2:35:50<7:38:44, 5.75s/it] {'loss': 0.4787, 'learning_rate': 1.1987421775042605e-05, 'epoch': 0.45} + 45%|████▌ | 3966/8750 [2:35:53<7:38:44, 5.75s/it] {'loss': 0.4787, 'learning_rate': 1.1987421775042605e-05, 'epoch': 0.45} + 45%|████▌ | 3966/8750 [2:35:50<7:38:44, 
5.75s/it] 45%|████▌ | 3967/8750 [2:35:59<7:40:36, 5.78s/it] 45%|████▌ | 3967/8750 [2:35:56<7:40:36, 5.78s/it] {'loss': 0.4795, 'learning_rate': 1.1983793827534477e-05, 'epoch': 0.45} + 45%|████▌ | 3967/8750 [2:35:59<7:40:36, 5.78s/it] {'loss': 0.4795, 'learning_rate': 1.1983793827534477e-05, 'epoch': 0.45} + 45%|████▌ | 3967/8750 [2:35:56<7:40:36, 5.78s/it] 45%|████▌ | 3968/8750 [2:36:05<7:41:27, 5.79s/it] 45%|████▌ | 3968/8750 [2:36:02<7:41:27, 5.79s/it] {'loss': 0.4697, 'learning_rate': 1.1980165608202303e-05, 'epoch': 0.45} + 45%|████▌ | 3968/8750 [2:36:05<7:41:27, 5.79s/it] {'loss': 0.4697, 'learning_rate': 1.1980165608202303e-05, 'epoch': 0.45} + 45%|████▌ | 3968/8750 [2:36:02<7:41:27, 5.79s/it] 45%|████▌ | 3969/8750 [2:36:11<7:46:10, 5.85s/it] 45%|████▌ | 3969/8750 [2:36:08<7:46:10, 5.85s/it] {'loss': 0.4527, 'learning_rate': 1.1976537117543234e-05, 'epoch': 0.45} + 45%|████▌ | 3969/8750 [2:36:11<7:46:10, 5.85s/it] {'loss': 0.4527, 'learning_rate': 1.1976537117543234e-05, 'epoch': 0.45} + 45%|████▌ | 3969/8750 [2:36:08<7:46:10, 5.85s/it] 45%|████▌ | 3970/8750 [2:36:17<7:44:18, 5.83s/it] 45%|████▌ | 3970/8750 [2:36:14<7:44:17, 5.83s/it] {'loss': 0.4764, 'learning_rate': 1.1972908356054455e-05, 'epoch': 0.45} + 45%|████▌ | 3970/8750 [2:36:17<7:44:18, 5.83s/it] {'loss': 0.4764, 'learning_rate': 1.1972908356054455e-05, 'epoch': 0.45} + 45%|████▌ | 3970/8750 [2:36:14<7:44:17, 5.83s/it] 45%|████▌ | 3971/8750 [2:36:23<7:42:32, 5.81s/it] 45%|████▌ | 3971/8750 [2:36:20<7:42:32, 5.81s/it] {'loss': 0.4666, 'learning_rate': 1.1969279324233179e-05, 'epoch': 0.45} + 45%|████▌ | 3971/8750 [2:36:23<7:42:32, 5.81s/it] {'loss': 0.4666, 'learning_rate': 1.1969279324233179e-05, 'epoch': 0.45} + 45%|████▌ | 3971/8750 [2:36:20<7:42:32, 5.81s/it] 45%|████▌ | 3972/8750 [2:36:28<7:39:39, 5.77s/it] 45%|████▌ | 3972/8750 [2:36:25<7:39:38, 5.77s/it] {'loss': 0.4536, 'learning_rate': 1.1965650022576672e-05, 'epoch': 0.45} + 45%|████▌ | 3972/8750 [2:36:28<7:39:39, 5.77s/it] {'loss': 
0.4536, 'learning_rate': 1.1965650022576672e-05, 'epoch': 0.45} + 45%|████▌ | 3972/8750 [2:36:25<7:39:38, 5.77s/it] 45%|████▌ | 3973/8750 [2:36:34<7:43:47, 5.83s/it] 45%|████▌ | 3973/8750 [2:36:31<7:43:47, 5.83s/it] {'loss': 0.4626, 'learning_rate': 1.196202045158222e-05, 'epoch': 0.45} + 45%|████▌ | 3973/8750 [2:36:34<7:43:47, 5.83s/it] {'loss': 0.4626, 'learning_rate': 1.196202045158222e-05, 'epoch': 0.45} + 45%|████▌ | 3973/8750 [2:36:31<7:43:47, 5.83s/it] 45%|████▌ | 3974/8750 [2:36:40<7:43:20, 5.82s/it] 45%|████▌ | 3974/8750 [2:36:37<7:43:20, 5.82s/it] {'loss': 0.471, 'learning_rate': 1.1958390611747167e-05, 'epoch': 0.45} + 45%|████▌ | 3974/8750 [2:36:40<7:43:20, 5.82s/it] {'loss': 0.471, 'learning_rate': 1.1958390611747167e-05, 'epoch': 0.45} + 45%|████▌ | 3974/8750 [2:36:37<7:43:20, 5.82s/it] 45%|████▌ | 3975/8750 [2:36:46<7:39:38, 5.78s/it] 45%|████▌ | 3975/8750 [2:36:43<7:39:39, 5.78s/it] {'loss': 0.4596, 'learning_rate': 1.1954760503568878e-05, 'epoch': 0.45} + 45%|████▌ | 3975/8750 [2:36:46<7:39:38, 5.78s/it] {'loss': 0.4596, 'learning_rate': 1.1954760503568878e-05, 'epoch': 0.45} + 45%|████▌ | 3975/8750 [2:36:43<7:39:39, 5.78s/it] 45%|████▌ | 3976/8750 [2:36:52<7:44:26, 5.84s/it] 45%|████▌ | 3976/8750 [2:36:49<7:44:26, 5.84s/it] {'loss': 0.4595, 'learning_rate': 1.1951130127544756e-05, 'epoch': 0.45} + 45%|████▌ | 3976/8750 [2:36:52<7:44:26, 5.84s/it] {'loss': 0.4595, 'learning_rate': 1.1951130127544756e-05, 'epoch': 0.45} + 45%|████▌ | 3976/8750 [2:36:49<7:44:26, 5.84s/it] 45%|████▌ | 3977/8750 [2:36:57<7:41:53, 5.81s/it] 45%|████▌ | 3977/8750 [2:36:54<7:41:53, 5.81s/it] {'loss': 0.4803, 'learning_rate': 1.1947499484172245e-05, 'epoch': 0.45} + 45%|████▌ | 3977/8750 [2:36:57<7:41:53, 5.81s/it] {'loss': 0.4803, 'learning_rate': 1.1947499484172245e-05, 'epoch': 0.45} + 45%|████▌ | 3977/8750 [2:36:54<7:41:53, 5.81s/it] 45%|████▌ | 3978/8750 [2:37:03<7:36:36, 5.74s/it] 45%|████▌ | 3978/8750 [2:37:00<7:36:37, 5.74s/it] {'loss': 0.4693, 'learning_rate': 
1.1943868573948825e-05, 'epoch': 0.45} + 45%|████▌ | 3978/8750 [2:37:03<7:36:36, 5.74s/it] {'loss': 0.4693, 'learning_rate': 1.1943868573948825e-05, 'epoch': 0.45} + 45%|████▌ | 3978/8750 [2:37:00<7:36:37, 5.74s/it] 45%|████▌ | 3979/8750 [2:37:09<7:37:27, 5.75s/it] 45%|████▌ | 3979/8750 [2:37:06<7:37:27, 5.75s/it] {'loss': 0.4819, 'learning_rate': 1.194023739737201e-05, 'epoch': 0.45} + 45%|████▌ | 3979/8750 [2:37:09<7:37:27, 5.75s/it] {'loss': 0.4819, 'learning_rate': 1.194023739737201e-05, 'epoch': 0.45} + 45%|████▌ | 3979/8750 [2:37:06<7:37:27, 5.75s/it] 45%|████▌ | 3980/8750 [2:37:15<7:41:42, 5.81s/it] 45%|████▌ | 3980/8750 [2:37:12<7:41:42, 5.81s/it] {'loss': 0.4549, 'learning_rate': 1.1936605954939355e-05, 'epoch': 0.45} + 45%|████▌ | 3980/8750 [2:37:15<7:41:42, 5.81s/it] {'loss': 0.4549, 'learning_rate': 1.1936605954939355e-05, 'epoch': 0.45} + 45%|████▌ | 3980/8750 [2:37:12<7:41:42, 5.81s/it] 45%|████▌ | 3981/8750 [2:37:20<7:37:32, 5.76s/it] 45%|████▌ | 3981/8750 [2:37:17<7:37:32, 5.76s/it] {'loss': 0.4593, 'learning_rate': 1.1932974247148445e-05, 'epoch': 0.45} + 45%|████▌ | 3981/8750 [2:37:20<7:37:32, 5.76s/it] {'loss': 0.4593, 'learning_rate': 1.1932974247148445e-05, 'epoch': 0.45} + 45%|████▌ | 3981/8750 [2:37:17<7:37:32, 5.76s/it] 46%|████▌ | 3982/8750 [2:37:26<7:35:58, 5.74s/it] 46%|████▌ | 3982/8750 [2:37:23<7:35:58, 5.74s/it] {'loss': 0.4699, 'learning_rate': 1.192934227449691e-05, 'epoch': 0.46} + 46%|████▌ | 3982/8750 [2:37:26<7:35:58, 5.74s/it] {'loss': 0.4699, 'learning_rate': 1.192934227449691e-05, 'epoch': 0.46} + 46%|████▌ | 3982/8750 [2:37:23<7:35:58, 5.74s/it] 46%|████▌ | 3983/8750 [2:37:32<7:45:29, 5.86s/it] 46%|████▌ | 3983/8750 [2:37:29<7:45:30, 5.86s/it] {'loss': 0.4489, 'learning_rate': 1.1925710037482405e-05, 'epoch': 0.46} + 46%|████▌ | 3983/8750 [2:37:32<7:45:29, 5.86s/it] {'loss': 0.4489, 'learning_rate': 1.1925710037482405e-05, 'epoch': 0.46} + 46%|████▌ | 3983/8750 [2:37:29<7:45:30, 5.86s/it] 46%|████▌ | 3984/8750 
[2:37:38<7:43:03, 5.83s/it] 46%|████▌ | 3984/8750 [2:37:35<7:43:03, 5.83s/it] {'loss': 0.4618, 'learning_rate': 1.1922077536602634e-05, 'epoch': 0.46} + 46%|████▌ | 3984/8750 [2:37:38<7:43:03, 5.83s/it] {'loss': 0.4618, 'learning_rate': 1.1922077536602634e-05, 'epoch': 0.46} + 46%|████▌ | 3984/8750 [2:37:35<7:43:03, 5.83s/it] 46%|████▌ | 3985/8750 [2:37:44<7:39:40, 5.79s/it] 46%|████▌ | 3985/8750 [2:37:41<7:39:40, 5.79s/it] {'loss': 0.4802, 'learning_rate': 1.1918444772355329e-05, 'epoch': 0.46} + 46%|████▌ | 3985/8750 [2:37:44<7:39:40, 5.79s/it] {'loss': 0.4802, 'learning_rate': 1.1918444772355329e-05, 'epoch': 0.46} + 46%|████▌ | 3985/8750 [2:37:41<7:39:40, 5.79s/it] 46%|████▌ | 3986/8750 [2:37:49<7:39:40, 5.79s/it] 46%|████▌ | 3986/8750 [2:37:47<7:39:40, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.1914811745238256e-05, 'epoch': 0.46} + 46%|████▌ | 3986/8750 [2:37:49<7:39:40, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.1914811745238256e-05, 'epoch': 0.46} + 46%|████▌ | 3986/8750 [2:37:47<7:39:40, 5.79s/it] 46%|████▌ | 3987/8750 [2:37:55<7:45:31, 5.86s/it] 46%|████▌ | 3987/8750 [2:37:53<7:45:31, 5.86s/it] {'loss': 0.4778, 'learning_rate': 1.1911178455749223e-05, 'epoch': 0.46} + 46%|████▌ | 3987/8750 [2:37:55<7:45:31, 5.86s/it] {'loss': 0.4778, 'learning_rate': 1.1911178455749223e-05, 'epoch': 0.46} + 46%|████▌ | 3987/8750 [2:37:53<7:45:31, 5.86s/it] 46%|████▌ | 3988/8750 [2:38:01<7:41:59, 5.82s/it] 46%|████▌ | 3988/8750 [2:37:58<7:42:00, 5.82s/it] {'loss': 0.4448, 'learning_rate': 1.1907544904386074e-05, 'epoch': 0.46} + 46%|████▌ | 3988/8750 [2:38:01<7:41:59, 5.82s/it] {'loss': 0.4448, 'learning_rate': 1.1907544904386074e-05, 'epoch': 0.46} + 46%|████▌ | 3988/8750 [2:37:58<7:42:00, 5.82s/it] 46%|████▌ | 3989/8750 [2:38:07<7:39:43, 5.79s/it] 46%|████▌ | 3989/8750 [2:38:04<7:39:43, 5.79s/it] {'loss': 0.4815, 'learning_rate': 1.1903911091646684e-05, 'epoch': 0.46} + 46%|████▌ | 3989/8750 [2:38:07<7:39:43, 5.79s/it] {'loss': 0.4815, 'learning_rate': 
1.1903911091646684e-05, 'epoch': 0.46} + 46%|████▌ | 3989/8750 [2:38:04<7:39:43, 5.79s/it] 46%|████▌ | 3990/8750 [2:38:13<7:42:57, 5.84s/it] 46%|████▌ | 3990/8750 [2:38:10<7:42:56, 5.84s/it] {'loss': 0.4605, 'learning_rate': 1.190027701802897e-05, 'epoch': 0.46} + 46%|████▌ | 3990/8750 [2:38:13<7:42:57, 5.84s/it] {'loss': 0.4605, 'learning_rate': 1.190027701802897e-05, 'epoch': 0.46} + 46%|████▌ | 3990/8750 [2:38:10<7:42:56, 5.84s/it] 46%|████▌ | 3991/8750 [2:38:19<7:53:48, 5.97s/it] 46%|████▌ | 3991/8750 [2:38:16<7:53:47, 5.97s/it] {'loss': 0.4692, 'learning_rate': 1.1896642684030874e-05, 'epoch': 0.46} + 46%|████▌ | 3991/8750 [2:38:19<7:53:48, 5.97s/it] {'loss': 0.4692, 'learning_rate': 1.1896642684030874e-05, 'epoch': 0.46} + 46%|████▌ | 3991/8750 [2:38:16<7:53:47, 5.97s/it] 46%|████▌ | 3992/8750 [2:38:25<7:47:27, 5.89s/it] 46%|████▌ | 3992/8750 [2:38:22<7:47:27, 5.89s/it] {'loss': 0.4576, 'learning_rate': 1.1893008090150389e-05, 'epoch': 0.46} + 46%|████▌ | 3992/8750 [2:38:25<7:47:27, 5.89s/it] {'loss': 0.4576, 'learning_rate': 1.1893008090150389e-05, 'epoch': 0.46} + 46%|████▌ | 3992/8750 [2:38:22<7:47:27, 5.89s/it] 46%|████▌ | 3993/8750 [2:38:31<7:46:32, 5.88s/it] 46%|████▌ | 3993/8750 [2:38:28<7:46:32, 5.88s/it] {'loss': 0.472, 'learning_rate': 1.1889373236885531e-05, 'epoch': 0.46} + 46%|████▌ | 3993/8750 [2:38:31<7:46:32, 5.88s/it] {'loss': 0.472, 'learning_rate': 1.1889373236885531e-05, 'epoch': 0.46} + 46%|████▌ | 3993/8750 [2:38:28<7:46:32, 5.88s/it] 46%|████▌ | 3994/8750 [2:38:37<7:44:02, 5.85s/it] 46%|████▌ | 3994/8750 [2:38:34<7:44:01, 5.85s/it] {'loss': 0.4549, 'learning_rate': 1.1885738124734359e-05, 'epoch': 0.46} + 46%|████▌ | 3994/8750 [2:38:37<7:44:02, 5.85s/it] {'loss': 0.4549, 'learning_rate': 1.1885738124734359e-05, 'epoch': 0.46} + 46%|████▌ | 3994/8750 [2:38:34<7:44:01, 5.85s/it] 46%|████▌ | 3995/8750 [2:38:42<7:40:57, 5.82s/it] 46%|████▌ | 3995/8750 [2:38:39<7:40:57, 5.82s/it] {'loss': 0.481, 'learning_rate': 1.188210275419496e-05, 
'epoch': 0.46} + 46%|████▌ | 3995/8750 [2:38:42<7:40:57, 5.82s/it] {'loss': 0.481, 'learning_rate': 1.188210275419496e-05, 'epoch': 0.46} + 46%|████▌ | 3995/8750 [2:38:39<7:40:57, 5.82s/it] 46%|████▌ | 3996/8750 [2:38:48<7:44:36, 5.86s/it] 46%|████▌ | 3996/8750 [2:38:45<7:44:36, 5.86s/it] {'loss': 0.449, 'learning_rate': 1.1878467125765464e-05, 'epoch': 0.46} + 46%|████▌ | 3996/8750 [2:38:48<7:44:36, 5.86s/it] {'loss': 0.449, 'learning_rate': 1.1878467125765464e-05, 'epoch': 0.46} + 46%|████▌ | 3996/8750 [2:38:45<7:44:36, 5.86s/it] 46%|████▌ | 3997/8750 [2:38:51<7:41:08, 5.82s/it] 46%|████▌ | 3997/8750 [2:38:54<7:41:07, 5.82s/it] {'loss': 0.4738, 'learning_rate': 1.1874831239944034e-05, 'epoch': 0.46} + 46%|████▌ | 3997/8750 [2:38:54<7:41:07, 5.82s/it] {'loss': 0.4738, 'learning_rate': 1.1874831239944034e-05, 'epoch': 0.46} + 46%|████▌ | 3997/8750 [2:38:51<7:41:08, 5.82s/it] 46%|████▌ | 3998/8750 [2:39:00<7:40:42, 5.82s/it] 46%|████▌ | 3998/8750 [2:38:57<7:40:42, 5.82s/it] {'loss': 0.4507, 'learning_rate': 1.1871195097228864e-05, 'epoch': 0.46} + 46%|████▌ | 3998/8750 [2:39:00<7:40:42, 5.82s/it] {'loss': 0.4507, 'learning_rate': 1.1871195097228864e-05, 'epoch': 0.46} + 46%|████▌ | 3998/8750 [2:38:57<7:40:42, 5.82s/it] 46%|████▌ | 3999/8750 [2:39:05<7:37:10, 5.77s/it] 46%|████▌ | 3999/8750 [2:39:02<7:37:10, 5.77s/it] {'loss': 0.4777, 'learning_rate': 1.1867558698118192e-05, 'epoch': 0.46} + 46%|████▌ | 3999/8750 [2:39:05<7:37:10, 5.77s/it] {'loss': 0.4777, 'learning_rate': 1.1867558698118192e-05, 'epoch': 0.46} + 46%|████▌ | 3999/8750 [2:39:02<7:37:10, 5.77s/it]10 AutoResumeHook: Checking whether to suspend... +43 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 46%|████▌ | 4000/8750 [2:39:11<7:37:28, 5.78s/it]6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... 
+15 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 46%|████▌ | 4000/8750 [2:39:08<7:37:29, 5.78s/it]1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4692, 'learning_rate': 1.1863922043110282e-05, 'epoch': 0.46} + 46%|████▌ | 4000/8750 [2:39:11<7:37:28, 5.78s/it] {'loss': 0.4692, 'learning_rate': 1.1863922043110282e-05, 'epoch': 0.46} + 46%|████▌ | 4000/8750 [2:39:08<7:37:29, 5.78s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 46%|████▌ | 4001/8750 [2:39:35<14:52:21, 11.27s/it] 46%|████▌ | 4001/8750 [2:39:32<14:52:22, 11.27s/it] {'loss': 0.474, 'learning_rate': 1.1860285132703435e-05, 'epoch': 0.46} + 46%|████▌ | 4001/8750 [2:39:35<14:52:21, 11.27s/it] {'loss': 0.474, 'learning_rate': 1.1860285132703435e-05, 'epoch': 0.46} + 46%|████▌ | 4001/8750 [2:39:32<14:52:22, 11.27s/it] 46%|████▌ | 4002/8750 [2:39:41<12:43:19, 9.65s/it] 46%|████▌ | 4002/8750 [2:39:38<12:43:20, 9.65s/it] {'loss': 0.462, 'learning_rate': 1.1856647967395995e-05, 'epoch': 0.46} + 46%|████▌ | 4002/8750 [2:39:41<12:43:19, 9.65s/it] {'loss': 0.462, 'learning_rate': 1.1856647967395995e-05, 'epoch': 0.46} + 46%|████▌ | 4002/8750 [2:39:38<12:43:20, 9.65s/it] 46%|████▌ | 4003/8750 [2:39:47<11:09:08, 8.46s/it] 46%|████▌ | 4003/8750 [2:39:44<11:09:08, 8.46s/it] {'loss': 0.4657, 'learning_rate': 1.185301054768633e-05, 'epoch': 0.46} + 46%|████▌ | 4003/8750 [2:39:47<11:09:08, 8.46s/it] {'loss': 0.4657, 'learning_rate': 1.185301054768633e-05, 'epoch': 0.46} + 46%|████▌ | 4003/8750 [2:39:44<11:09:08, 8.46s/it] 46%|████▌ | 4004/8750 [2:39:52<10:02:08, 7.61s/it] 46%|████▌ | 4004/8750 [2:39:50<10:02:08, 7.61s/it] {'loss': 0.4854, 'learning_rate': 1.1849372874072852e-05, 'epoch': 0.46} + 46%|████▌ | 4004/8750 [2:39:52<10:02:08, 7.61s/it] {'loss': 0.4854, 'learning_rate': 1.1849372874072852e-05, 'epoch': 0.46} + 46%|████▌ | 4004/8750 [2:39:50<10:02:08, 7.61s/it] 46%|████▌ | 4005/8750 [2:39:59<9:27:55, 7.18s/it] 46%|████▌ | 4005/8750 [2:39:56<9:27:54, 7.18s/it] {'loss': 0.4589, 'learning_rate': 1.1845734947054e-05, 'epoch': 0.46} + 46%|████▌ | 4005/8750 [2:39:59<9:27:55, 7.18s/it] {'loss': 0.4589, 'learning_rate': 1.1845734947054e-05, 'epoch': 0.46} + 46%|████▌ | 4005/8750 [2:39:56<9:27:54, 7.18s/it] 46%|████▌ | 4006/8750 [2:40:05<8:57:47, 6.80s/it] 46%|████▌ | 4006/8750 [2:40:02<8:57:47, 6.80s/it] {'loss': 0.4663, 'learning_rate': 1.1842096767128249e-05, 'epoch': 0.46} + 46%|████▌ | 4006/8750 [2:40:05<8:57:47, 
6.80s/it] {'loss': 0.4663, 'learning_rate': 1.1842096767128249e-05, 'epoch': 0.46} + 46%|████▌ | 4006/8750 [2:40:02<8:57:47, 6.80s/it] 46%|████▌ | 4007/8750 [2:40:10<8:31:40, 6.47s/it] 46%|████▌ | 4007/8750 [2:40:07<8:31:40, 6.47s/it] {'loss': 0.4764, 'learning_rate': 1.1838458334794116e-05, 'epoch': 0.46} + 46%|████▌ | 4007/8750 [2:40:10<8:31:40, 6.47s/it] {'loss': 0.4764, 'learning_rate': 1.1838458334794116e-05, 'epoch': 0.46} + 46%|████▌ | 4007/8750 [2:40:07<8:31:40, 6.47s/it] 46%|████▌ | 4008/8750 [2:40:13<8:18:48, 6.31s/it] 46%|████▌ | 4008/8750 [2:40:16<8:18:48, 6.31s/it] {'loss': 0.4485, 'learning_rate': 1.1834819650550144e-05, 'epoch': 0.46} + 46%|████▌ | 4008/8750 [2:40:16<8:18:48, 6.31s/it] {'loss': 0.4485, 'learning_rate': 1.1834819650550144e-05, 'epoch': 0.46} + 46%|████▌ | 4008/8750 [2:40:13<8:18:48, 6.31s/it] 46%|████▌ | 4009/8750 [2:40:22<8:09:18, 6.19s/it] 46%|████▌ | 4009/8750 [2:40:19<8:09:17, 6.19s/it] {'loss': 0.4674, 'learning_rate': 1.1831180714894918e-05, 'epoch': 0.46} + 46%|████▌ | 4009/8750 [2:40:22<8:09:18, 6.19s/it] {'loss': 0.4674, 'learning_rate': 1.1831180714894918e-05, 'epoch': 0.46} + 46%|████▌ | 4009/8750 [2:40:19<8:09:17, 6.19s/it] 46%|████▌ | 4010/8750 [2:40:28<7:58:47, 6.06s/it] 46%|████▌ | 4010/8750 [2:40:25<7:58:46, 6.06s/it] {'loss': 0.4603, 'learning_rate': 1.1827541528327052e-05, 'epoch': 0.46} + 46%|████▌ | 4010/8750 [2:40:28<7:58:47, 6.06s/it] {'loss': 0.4603, 'learning_rate': 1.1827541528327052e-05, 'epoch': 0.46} + 46%|████▌ | 4010/8750 [2:40:25<7:58:46, 6.06s/it] 46%|████▌ | 4011/8750 [2:40:33<7:46:43, 5.91s/it] 46%|████▌ | 4011/8750 [2:40:31<7:46:43, 5.91s/it] {'loss': 0.4521, 'learning_rate': 1.182390209134519e-05, 'epoch': 0.46} + 46%|████▌ | 4011/8750 [2:40:33<7:46:43, 5.91s/it] {'loss': 0.4521, 'learning_rate': 1.182390209134519e-05, 'epoch': 0.46} + 46%|████▌ | 4011/8750 [2:40:31<7:46:43, 5.91s/it] 46%|████▌ | 4012/8750 [2:40:36<7:41:21, 5.84s/it] 46%|████▌ | 4012/8750 [2:40:39<7:41:22, 5.84s/it] {'loss': 0.4839, 
'learning_rate': 1.1820262404448023e-05, 'epoch': 0.46} + 46%|████▌ | 4012/8750 [2:40:39<7:41:22, 5.84s/it] {'loss': 0.4839, 'learning_rate': 1.1820262404448023e-05, 'epoch': 0.46} + 46%|████▌ | 4012/8750 [2:40:36<7:41:21, 5.84s/it] 46%|████▌ | 4013/8750 [2:40:45<7:36:09, 5.78s/it] 46%|████▌ | 4013/8750 [2:40:42<7:36:10, 5.78s/it] {'loss': 0.4953, 'learning_rate': 1.181662246813427e-05, 'epoch': 0.46} + 46%|████▌ | 4013/8750 [2:40:45<7:36:09, 5.78s/it] {'loss': 0.4953, 'learning_rate': 1.181662246813427e-05, 'epoch': 0.46} + 46%|████▌ | 4013/8750 [2:40:42<7:36:10, 5.78s/it] 46%|████▌ | 4014/8750 [2:40:48<7:36:15, 5.78s/it] 46%|████▌ | 4014/8750 [2:40:51<7:36:16, 5.78s/it] {'loss': 0.4508, 'learning_rate': 1.1812982282902676e-05, 'epoch': 0.46} + 46%|████▌ | 4014/8750 [2:40:51<7:36:16, 5.78s/it] {'loss': 0.4508, 'learning_rate': 1.1812982282902676e-05, 'epoch': 0.46} + 46%|████▌ | 4014/8750 [2:40:48<7:36:15, 5.78s/it] 46%|████▌ | 4015/8750 [2:40:53<7:35:40, 5.77s/it] 46%|████▌ | 4015/8750 [2:40:56<7:35:40, 5.77s/it] {'loss': 0.4796, 'learning_rate': 1.1809341849252034e-05, 'epoch': 0.46} + 46%|████▌ | 4015/8750 [2:40:56<7:35:40, 5.77s/it] {'loss': 0.4796, 'learning_rate': 1.1809341849252034e-05, 'epoch': 0.46} + 46%|████▌ | 4015/8750 [2:40:53<7:35:40, 5.77s/it] 46%|████▌ | 4016/8750 [2:41:02<7:36:06, 5.78s/it] 46%|████▌ | 4016/8750 [2:40:59<7:36:06, 5.78s/it] {'loss': 0.4567, 'learning_rate': 1.180570116768116e-05, 'epoch': 0.46} + 46%|████▌ | 4016/8750 [2:41:02<7:36:06, 5.78s/it] {'loss': 0.4567, 'learning_rate': 1.180570116768116e-05, 'epoch': 0.46} + 46%|████▌ | 4016/8750 [2:40:59<7:36:06, 5.78s/it] 46%|████▌ | 4017/8750 [2:41:05<7:42:28, 5.86s/it] 46%|████▌ | 4017/8750 [2:41:08<7:42:29, 5.86s/it] {'loss': 0.4681, 'learning_rate': 1.1802060238688915e-05, 'epoch': 0.46} + 46%|████▌ | 4017/8750 [2:41:08<7:42:29, 5.86s/it] {'loss': 0.4681, 'learning_rate': 1.1802060238688915e-05, 'epoch': 0.46} + 46%|████▌ | 4017/8750 [2:41:05<7:42:28, 5.86s/it] 46%|████▌ | 
4018/8750 [2:41:11<7:36:37, 5.79s/it] 46%|████▌ | 4018/8750 [2:41:14<7:36:37, 5.79s/it] {'loss': 0.4736, 'learning_rate': 1.1798419062774185e-05, 'epoch': 0.46} + 46%|████▌ | 4018/8750 [2:41:14<7:36:37, 5.79s/it] {'loss': 0.4736, 'learning_rate': 1.1798419062774185e-05, 'epoch': 0.46} + 46%|████▌ | 4018/8750 [2:41:11<7:36:37, 5.79s/it] 46%|████▌ | 4019/8750 [2:41:20<7:39:13, 5.82s/it] 46%|████▌ | 4019/8750 [2:41:17<7:39:14, 5.82s/it] {'loss': 0.4584, 'learning_rate': 1.179477764043589e-05, 'epoch': 0.46} + 46%|████▌ | 4019/8750 [2:41:20<7:39:13, 5.82s/it] {'loss': 0.4584, 'learning_rate': 1.179477764043589e-05, 'epoch': 0.46} + 46%|████▌ | 4019/8750 [2:41:17<7:39:14, 5.82s/it] 46%|████▌ | 4020/8750 [2:41:22<7:36:23, 5.79s/it] 46%|████▌ | 4020/8750 [2:41:25<7:36:23, 5.79s/it] {'loss': 0.4717, 'learning_rate': 1.1791135972172989e-05, 'epoch': 0.46} + 46%|████▌ | 4020/8750 [2:41:25<7:36:23, 5.79s/it] {'loss': 0.4717, 'learning_rate': 1.1791135972172989e-05, 'epoch': 0.46} + 46%|████▌ | 4020/8750 [2:41:22<7:36:23, 5.79s/it] 46%|████▌ | 4021/8750 [2:41:31<7:36:42, 5.79s/it] 46%|████▌ | 4021/8750 [2:41:28<7:36:42, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.1787494058484468e-05, 'epoch': 0.46} + 46%|████▌ | 4021/8750 [2:41:31<7:36:42, 5.79s/it] {'loss': 0.4796, 'learning_rate': 1.1787494058484468e-05, 'epoch': 0.46} + 46%|████▌ | 4021/8750 [2:41:28<7:36:42, 5.79s/it] 46%|████▌ | 4022/8750 [2:41:37<7:38:02, 5.81s/it] 46%|████▌ | 4022/8750 [2:41:34<7:38:03, 5.81s/it] {'loss': 0.4622, 'learning_rate': 1.1783851899869357e-05, 'epoch': 0.46} + 46%|████▌ | 4022/8750 [2:41:37<7:38:02, 5.81s/it] {'loss': 0.4622, 'learning_rate': 1.1783851899869357e-05, 'epoch': 0.46} + 46%|████▌ | 4022/8750 [2:41:34<7:38:03, 5.81s/it] 46%|████▌ | 4023/8750 [2:41:40<7:36:16, 5.79s/it] 46%|████▌ | 4023/8750 [2:41:43<7:36:16, 5.79s/it] {'loss': 0.4661, 'learning_rate': 1.1780209496826707e-05, 'epoch': 0.46} + 46%|████▌ | 4023/8750 [2:41:43<7:36:16, 5.79s/it] {'loss': 0.4661, 'learning_rate': 
1.1780209496826707e-05, 'epoch': 0.46} + 46%|████▌ | 4023/8750 [2:41:40<7:36:16, 5.79s/it] 46%|████▌ | 4024/8750 [2:41:46<7:34:47, 5.77s/it] 46%|████▌ | 4024/8750 [2:41:49<7:34:48, 5.77s/it] {'loss': 0.4547, 'learning_rate': 1.177656684985561e-05, 'epoch': 0.46} + 46%|████▌ | 4024/8750 [2:41:49<7:34:48, 5.77s/it] {'loss': 0.4547, 'learning_rate': 1.177656684985561e-05, 'epoch': 0.46} + 46%|████▌ | 4024/8750 [2:41:46<7:34:47, 5.77s/it] 46%|████▌ | 4025/8750 [2:41:51<7:36:50, 5.80s/it] 46%|████▌ | 4025/8750 [2:41:54<7:36:50, 5.80s/it] {'loss': 0.4589, 'learning_rate': 1.1772923959455188e-05, 'epoch': 0.46} + 46%|████▌ | 4025/8750 [2:41:54<7:36:50, 5.80s/it] {'loss': 0.4589, 'learning_rate': 1.1772923959455188e-05, 'epoch': 0.46} + 46%|████▌ | 4025/8750 [2:41:51<7:36:50, 5.80s/it] 46%|████▌ | 4026/8750 [2:41:57<7:34:19, 5.77s/it] 46%|████▌ | 4026/8750 [2:42:00<7:34:19, 5.77s/it] {'loss': 0.4461, 'learning_rate': 1.1769280826124604e-05, 'epoch': 0.46} + 46%|████▌ | 4026/8750 [2:42:00<7:34:19, 5.77s/it] {'loss': 0.4461, 'learning_rate': 1.1769280826124604e-05, 'epoch': 0.46} + 46%|████▌ | 4026/8750 [2:41:57<7:34:19, 5.77s/it] 46%|████▌ | 4027/8750 [2:42:06<7:31:48, 5.74s/it] 46%|████▌ | 4027/8750 [2:42:03<7:31:48, 5.74s/it] {'loss': 0.479, 'learning_rate': 1.1765637450363048e-05, 'epoch': 0.46} + 46%|████▌ | 4027/8750 [2:42:06<7:31:48, 5.74s/it] {'loss': 0.479, 'learning_rate': 1.1765637450363048e-05, 'epoch': 0.46} + 46%|████▌ | 4027/8750 [2:42:03<7:31:48, 5.74s/it] 46%|████▌ | 4028/8750 [2:42:09<7:36:22, 5.80s/it] 46%|████▌ | 4028/8750 [2:42:12<7:36:22, 5.80s/it] {'loss': 0.475, 'learning_rate': 1.176199383266974e-05, 'epoch': 0.46} + 46%|████▌ | 4028/8750 [2:42:12<7:36:22, 5.80s/it] {'loss': 0.475, 'learning_rate': 1.176199383266974e-05, 'epoch': 0.46} + 46%|████▌ | 4028/8750 [2:42:09<7:36:22, 5.80s/it] 46%|████▌ | 4029/8750 [2:42:17<7:36:45, 5.80s/it] 46%|████▌ | 4029/8750 [2:42:15<7:36:46, 5.81s/it] {'loss': 0.485, 'learning_rate': 1.1758349973543936e-05, 'epoch': 
0.46} + 46%|████▌ | 4029/8750 [2:42:17<7:36:45, 5.80s/it] {'loss': 0.485, 'learning_rate': 1.1758349973543936e-05, 'epoch': 0.46} + 46%|████▌ | 4029/8750 [2:42:15<7:36:46, 5.81s/it] 46%|████▌ | 4030/8750 [2:42:20<7:34:50, 5.78s/it] 46%|████▌ | 4030/8750 [2:42:23<7:34:51, 5.78s/it] {'loss': 0.4537, 'learning_rate': 1.1754705873484929e-05, 'epoch': 0.46} + 46%|████▌ | 4030/8750 [2:42:23<7:34:51, 5.78s/it] {'loss': 0.4537, 'learning_rate': 1.1754705873484929e-05, 'epoch': 0.46} + 46%|████▌ | 4030/8750 [2:42:20<7:34:50, 5.78s/it] 46%|████▌ | 4031/8750 [2:42:26<7:37:01, 5.81s/it] 46%|████▌ | 4031/8750 [2:42:29<7:37:01, 5.81s/it] {'loss': 0.4889, 'learning_rate': 1.1751061532992045e-05, 'epoch': 0.46} + 46%|████▌ | 4031/8750 [2:42:29<7:37:01, 5.81s/it] {'loss': 0.4889, 'learning_rate': 1.1751061532992045e-05, 'epoch': 0.46} + 46%|████▌ | 4031/8750 [2:42:26<7:37:01, 5.81s/it] 46%|████▌ | 4032/8750 [2:42:32<7:44:00, 5.90s/it] 46%|████▌ | 4032/8750 [2:42:35<7:44:00, 5.90s/it] {'loss': 0.4695, 'learning_rate': 1.1747416952564632e-05, 'epoch': 0.46} + 46%|████▌ | 4032/8750 [2:42:35<7:44:00, 5.90s/it] {'loss': 0.4695, 'learning_rate': 1.1747416952564632e-05, 'epoch': 0.46} + 46%|████▌ | 4032/8750 [2:42:32<7:44:00, 5.90s/it] 46%|████▌ | 4033/8750 [2:42:41<7:37:46, 5.82s/it] 46%|████▌ | 4033/8750 [2:42:38<7:37:46, 5.82s/it] {'loss': 0.4752, 'learning_rate': 1.1743772132702086e-05, 'epoch': 0.46} + 46%|████▌ | 4033/8750 [2:42:41<7:37:46, 5.82s/it] {'loss': 0.4752, 'learning_rate': 1.1743772132702086e-05, 'epoch': 0.46} + 46%|████▌ | 4033/8750 [2:42:38<7:37:46, 5.82s/it] 46%|████▌ | 4034/8750 [2:42:44<7:39:36, 5.85s/it] 46%|████▌ | 4034/8750 [2:42:47<7:39:36, 5.85s/it] {'loss': 0.4519, 'learning_rate': 1.1740127073903826e-05, 'epoch': 0.46} + 46%|████▌ | 4034/8750 [2:42:47<7:39:36, 5.85s/it] {'loss': 0.4519, 'learning_rate': 1.1740127073903826e-05, 'epoch': 0.46} + 46%|████▌ | 4034/8750 [2:42:44<7:39:36, 5.85s/it] 46%|████▌ | 4035/8750 [2:42:49<7:34:54, 5.79s/it] 46%|████▌ | 
4035/8750 [2:42:52<7:34:54, 5.79s/it] {'loss': 0.4603, 'learning_rate': 1.1736481776669307e-05, 'epoch': 0.46} + 46%|████▌ | 4035/8750 [2:42:52<7:34:54, 5.79s/it] {'loss': 0.4603, 'learning_rate': 1.1736481776669307e-05, 'epoch': 0.46} + 46%|████▌ | 4035/8750 [2:42:49<7:34:54, 5.79s/it] 46%|████▌ | 4036/8750 [2:42:58<7:32:10, 5.76s/it] 46%|████▌ | 4036/8750 [2:42:55<7:32:11, 5.76s/it] {'loss': 0.4767, 'learning_rate': 1.1732836241498013e-05, 'epoch': 0.46} + 46%|████▌ | 4036/8750 [2:42:58<7:32:10, 5.76s/it] {'loss': 0.4767, 'learning_rate': 1.1732836241498013e-05, 'epoch': 0.46} + 46%|████▌ | 4036/8750 [2:42:55<7:32:11, 5.76s/it] 46%|████▌ | 4037/8750 [2:43:01<7:29:27, 5.72s/it] 46%|████▌ | 4037/8750 [2:43:04<7:29:27, 5.72s/it] {'loss': 0.4883, 'learning_rate': 1.1729190468889466e-05, 'epoch': 0.46} + 46%|████▌ | 4037/8750 [2:43:04<7:29:27, 5.72s/it] {'loss': 0.4883, 'learning_rate': 1.1729190468889466e-05, 'epoch': 0.46} + 46%|████▌ | 4037/8750 [2:43:01<7:29:27, 5.72s/it] 46%|████▌ | 4038/8750 [2:43:09<7:28:15, 5.71s/it] 46%|████▌ | 4038/8750 [2:43:06<7:28:16, 5.71s/it] {'loss': 0.4877, 'learning_rate': 1.1725544459343221e-05, 'epoch': 0.46} + 46%|████▌ | 4038/8750 [2:43:09<7:28:15, 5.71s/it] {'loss': 0.4877, 'learning_rate': 1.1725544459343221e-05, 'epoch': 0.46} + 46%|████▌ | 4038/8750 [2:43:06<7:28:16, 5.71s/it] 46%|████▌ | 4039/8750 [2:43:15<7:30:19, 5.74s/it] 46%|████▌ | 4039/8750 [2:43:12<7:30:19, 5.74s/it] {'loss': 0.4644, 'learning_rate': 1.172189821335886e-05, 'epoch': 0.46} + 46%|████▌ | 4039/8750 [2:43:15<7:30:19, 5.74s/it] {'loss': 0.4644, 'learning_rate': 1.172189821335886e-05, 'epoch': 0.46} + 46%|████▌ | 4039/8750 [2:43:12<7:30:19, 5.74s/it] 46%|████▌ | 4040/8750 [2:43:21<7:29:12, 5.72s/it] 46%|████▌ | 4040/8750 [2:43:18<7:29:12, 5.72s/it] {'loss': 0.4534, 'learning_rate': 1.1718251731436001e-05, 'epoch': 0.46} + 46%|████▌ | 4040/8750 [2:43:21<7:29:12, 5.72s/it] {'loss': 0.4534, 'learning_rate': 1.1718251731436001e-05, 'epoch': 0.46} + 46%|████▌ | 
4040/8750 [2:43:18<7:29:12, 5.72s/it] 46%|████▌ | 4041/8750 [2:43:24<7:31:41, 5.76s/it] 46%|████▌ | 4041/8750 [2:43:27<7:31:42, 5.76s/it] {'loss': 0.4697, 'learning_rate': 1.1714605014074291e-05, 'epoch': 0.46} + 46%|████▌ | 4041/8750 [2:43:27<7:31:42, 5.76s/it] {'loss': 0.4697, 'learning_rate': 1.1714605014074291e-05, 'epoch': 0.46} + 46%|████▌ | 4041/8750 [2:43:24<7:31:41, 5.76s/it] 46%|████▌ | 4042/8750 [2:43:32<7:29:19, 5.73s/it] 46%|████▌ | 4042/8750 [2:43:29<7:29:20, 5.73s/it] {'loss': 0.4632, 'learning_rate': 1.1710958061773413e-05, 'epoch': 0.46} + 46%|████▌ | 4042/8750 [2:43:32<7:29:19, 5.73s/it] {'loss': 0.4632, 'learning_rate': 1.1710958061773413e-05, 'epoch': 0.46} + 46%|████▌ | 4042/8750 [2:43:29<7:29:20, 5.73s/it] 46%|████▌ | 4043/8750 [2:43:38<7:28:19, 5.71s/it] 46%|████▌ | 4043/8750 [2:43:35<7:28:20, 5.71s/it] {'loss': 0.4561, 'learning_rate': 1.1707310875033085e-05, 'epoch': 0.46} + 46%|████▌ | 4043/8750 [2:43:38<7:28:19, 5.71s/it] {'loss': 0.4561, 'learning_rate': 1.1707310875033085e-05, 'epoch': 0.46} + 46%|████▌ | 4043/8750 [2:43:35<7:28:20, 5.71s/it] 46%|████▌ | 4044/8750 [2:43:44<7:28:54, 5.72s/it] 46%|████▌ | 4044/8750 [2:43:41<7:28:55, 5.72s/it] {'loss': 0.4752, 'learning_rate': 1.1703663454353045e-05, 'epoch': 0.46} + 46%|████▌ | 4044/8750 [2:43:44<7:28:54, 5.72s/it] {'loss': 0.4752, 'learning_rate': 1.1703663454353045e-05, 'epoch': 0.46} + 46%|████▌ | 4044/8750 [2:43:41<7:28:55, 5.72s/it] 46%|████▌ | 4045/8750 [2:43:47<7:33:31, 5.78s/it] 46%|████▌ | 4045/8750 [2:43:50<7:33:32, 5.78s/it] {'loss': 0.4879, 'learning_rate': 1.1700015800233078e-05, 'epoch': 0.46} + 46%|████▌ | 4045/8750 [2:43:50<7:33:32, 5.78s/it] {'loss': 0.4879, 'learning_rate': 1.1700015800233078e-05, 'epoch': 0.46} + 46%|████▌ | 4045/8750 [2:43:47<7:33:31, 5.78s/it] 46%|████▌ | 4046/8750 [2:43:55<7:31:43, 5.76s/it] 46%|████▌ | 4046/8750 [2:43:53<7:31:43, 5.76s/it] {'loss': 0.4552, 'learning_rate': 1.169636791317299e-05, 'epoch': 0.46} + 46%|████▌ | 4046/8750 
[2:43:55<7:31:43, 5.76s/it] {'loss': 0.4552, 'learning_rate': 1.169636791317299e-05, 'epoch': 0.46} + 46%|████▌ | 4046/8750 [2:43:53<7:31:43, 5.76s/it] 46%|████▋ | 4047/8750 [2:44:01<7:34:10, 5.79s/it] 46%|████▋ | 4047/8750 [2:43:58<7:34:11, 5.79s/it] {'loss': 0.4641, 'learning_rate': 1.1692719793672627e-05, 'epoch': 0.46} + 46%|████▋ | 4047/8750 [2:44:01<7:34:10, 5.79s/it] {'loss': 0.4641, 'learning_rate': 1.1692719793672627e-05, 'epoch': 0.46} + 46%|████▋ | 4047/8750 [2:43:58<7:34:11, 5.79s/it] 46%|████▋ | 4048/8750 [2:44:04<7:31:12, 5.76s/it] 46%|████▋ | 4048/8750 [2:44:07<7:31:13, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.1689071442231858e-05, 'epoch': 0.46} + 46%|████▋ | 4048/8750 [2:44:07<7:31:13, 5.76s/it] {'loss': 0.4615, 'learning_rate': 1.1689071442231858e-05, 'epoch': 0.46} + 46%|████▋ | 4048/8750 [2:44:04<7:31:12, 5.76s/it] 46%|████▋ | 4049/8750 [2:44:10<7:33:14, 5.78s/it] 46%|████▋ | 4049/8750 [2:44:13<7:33:16, 5.79s/it] {'loss': 0.4859, 'learning_rate': 1.1685422859350592e-05, 'epoch': 0.46} + 46%|████▋ | 4049/8750 [2:44:13<7:33:16, 5.79s/it] {'loss': 0.4859, 'learning_rate': 1.1685422859350592e-05, 'epoch': 0.46} + 46%|████▋ | 4049/8750 [2:44:10<7:33:14, 5.78s/it]4 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +11 12AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 46%|████▋ | 4050/8750 [2:44:19<7:39:07, 5.86s/it] 46%|████▋ | 4050/8750 [2:44:16<7:39:08, 5.86s/it]3 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... 
+2 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4727, 'learning_rate': 1.1681774045528764e-05, 'epoch': 0.46} + 46%|████▋ | 4050/8750 [2:44:19<7:39:07, 5.86s/it] {'loss': 0.4727, 'learning_rate': 1.1681774045528764e-05, 'epoch': 0.46} + 46%|████▋ | 4050/8750 [2:44:16<7:39:08, 5.86s/it] 46%|████▋ | 4051/8750 [2:44:25<7:37:09, 5.84s/it] 46%|████▋ | 4051/8750 [2:44:22<7:37:10, 5.84s/it] {'loss': 0.4682, 'learning_rate': 1.1678125001266347e-05, 'epoch': 0.46} + 46%|████▋ | 4051/8750 [2:44:25<7:37:09, 5.84s/it] {'loss': 0.4682, 'learning_rate': 1.1678125001266347e-05, 'epoch': 0.46} + 46%|████▋ | 4051/8750 [2:44:22<7:37:10, 5.84s/it] 46%|████▋ | 4052/8750 [2:44:30<7:35:11, 5.81s/it] 46%|████▋ | 4052/8750 [2:44:28<7:35:10, 5.81s/it] {'loss': 0.4597, 'learning_rate': 1.1674475727063337e-05, 'epoch': 0.46} + 46%|████▋ | 4052/8750 [2:44:30<7:35:11, 5.81s/it] {'loss': 0.4597, 'learning_rate': 1.1674475727063337e-05, 'epoch': 0.46} + 46%|████▋ | 4052/8750 [2:44:28<7:35:10, 5.81s/it] 46%|████▋ | 4053/8750 [2:44:36<7:31:32, 5.77s/it] 46%|████▋ | 4053/8750 [2:44:33<7:31:32, 5.77s/it] {'loss': 0.4775, 'learning_rate': 1.1670826223419766e-05, 'epoch': 0.46} + 46%|████▋ | 4053/8750 [2:44:36<7:31:32, 5.77s/it] {'loss': 0.4775, 'learning_rate': 1.1670826223419766e-05, 'epoch': 0.46} + 46%|████▋ | 4053/8750 [2:44:33<7:31:32, 5.77s/it] 46%|████▋ | 4054/8750 [2:44:42<7:30:38, 5.76s/it] 46%|████▋ | 4054/8750 [2:44:39<7:30:38, 5.76s/it] {'loss': 0.4741, 'learning_rate': 1.1667176490835701e-05, 'epoch': 0.46} + 46%|████▋ | 4054/8750 [2:44:42<7:30:38, 5.76s/it] {'loss': 0.4741, 'learning_rate': 1.1667176490835701e-05, 'epoch': 0.46} + 46%|████▋ | 4054/8750 [2:44:39<7:30:38, 5.76s/it] 46%|████▋ | 4055/8750 [2:44:48<7:36:05, 5.83s/it] 46%|████▋ | 4055/8750 [2:44:45<7:36:05, 5.83s/it] {'loss': 0.4678, 'learning_rate': 1.1663526529811235e-05, 'epoch': 0.46} + 46%|████▋ | 4055/8750 
[2:44:48<7:36:05, 5.83s/it] {'loss': 0.4678, 'learning_rate': 1.1663526529811235e-05, 'epoch': 0.46} + 46%|████▋ | 4055/8750 [2:44:45<7:36:05, 5.83s/it] 46%|████▋ | 4056/8750 [2:44:53<7:30:57, 5.76s/it] 46%|████▋ | 4056/8750 [2:44:51<7:30:57, 5.76s/it] {'loss': 0.4658, 'learning_rate': 1.1659876340846494e-05, 'epoch': 0.46} + 46%|████▋ | 4056/8750 [2:44:53<7:30:57, 5.76s/it] {'loss': 0.4658, 'learning_rate': 1.1659876340846494e-05, 'epoch': 0.46} + 46%|████▋ | 4056/8750 [2:44:51<7:30:57, 5.76s/it] 46%|████▋ | 4057/8750 [2:44:59<7:32:42, 5.79s/it] 46%|████▋ | 4057/8750 [2:44:56<7:32:43, 5.79s/it] {'loss': 0.4533, 'learning_rate': 1.165622592444164e-05, 'epoch': 0.46} + 46%|████▋ | 4057/8750 [2:44:59<7:32:42, 5.79s/it] {'loss': 0.4533, 'learning_rate': 1.165622592444164e-05, 'epoch': 0.46} + 46%|████▋ | 4057/8750 [2:44:56<7:32:43, 5.79s/it] 46%|████▋ | 4058/8750 [2:45:05<7:37:45, 5.85s/it] 46%|████▋ | 4058/8750 [2:45:02<7:37:45, 5.85s/it] {'loss': 0.4822, 'learning_rate': 1.165257528109685e-05, 'epoch': 0.46} + 46%|████▋ | 4058/8750 [2:45:05<7:37:45, 5.85s/it] {'loss': 0.4822, 'learning_rate': 1.165257528109685e-05, 'epoch': 0.46} + 46%|████▋ | 4058/8750 [2:45:02<7:37:45, 5.85s/it] 46%|████▋ | 4059/8750 [2:45:11<7:35:13, 5.82s/it] 46%|████▋ | 4059/8750 [2:45:08<7:35:12, 5.82s/it] {'loss': 0.4722, 'learning_rate': 1.1648924411312354e-05, 'epoch': 0.46} + 46%|████▋ | 4059/8750 [2:45:11<7:35:13, 5.82s/it] {'loss': 0.4722, 'learning_rate': 1.1648924411312354e-05, 'epoch': 0.46} + 46%|████▋ | 4059/8750 [2:45:08<7:35:12, 5.82s/it] 46%|████▋ | 4060/8750 [2:45:17<7:35:50, 5.83s/it] 46%|████▋ | 4060/8750 [2:45:14<7:35:50, 5.83s/it] {'loss': 0.4607, 'learning_rate': 1.1645273315588399e-05, 'epoch': 0.46} + 46%|████▋ | 4060/8750 [2:45:17<7:35:50, 5.83s/it] {'loss': 0.4607, 'learning_rate': 1.1645273315588399e-05, 'epoch': 0.46} + 46%|████▋ | 4060/8750 [2:45:14<7:35:50, 5.83s/it] 46%|████▋ | 4061/8750 [2:45:23<7:35:01, 5.82s/it] 46%|████▋ | 4061/8750 [2:45:20<7:35:02, 5.82s/it] 
{'loss': 0.4798, 'learning_rate': 1.164162199442527e-05, 'epoch': 0.46} + 46%|████▋ | 4061/8750 [2:45:23<7:35:01, 5.82s/it] {'loss': 0.4798, 'learning_rate': 1.164162199442527e-05, 'epoch': 0.46} + 46%|████▋ | 4061/8750 [2:45:20<7:35:02, 5.82s/it] 46%|████▋ | 4062/8750 [2:45:28<7:33:06, 5.80s/it] 46%|████▋ | 4062/8750 [2:45:26<7:33:06, 5.80s/it] {'loss': 0.4758, 'learning_rate': 1.1637970448323274e-05, 'epoch': 0.46} + 46%|████▋ | 4062/8750 [2:45:28<7:33:06, 5.80s/it] {'loss': 0.4758, 'learning_rate': 1.1637970448323274e-05, 'epoch': 0.46} + 46%|████▋ | 4062/8750 [2:45:26<7:33:06, 5.80s/it] 46%|████▋ | 4063/8750 [2:45:34<7:31:08, 5.78s/it] 46%|████▋ | 4063/8750 [2:45:31<7:31:09, 5.78s/it] {'loss': 0.4617, 'learning_rate': 1.1634318677782755e-05, 'epoch': 0.46} + 46%|████▋ | 4063/8750 [2:45:34<7:31:08, 5.78s/it] {'loss': 0.4617, 'learning_rate': 1.1634318677782755e-05, 'epoch': 0.46} + 46%|████▋ | 4063/8750 [2:45:31<7:31:09, 5.78s/it] 46%|████▋ | 4064/8750 [2:45:40<7:28:31, 5.74s/it] 46%|████▋ | 4064/8750 [2:45:37<7:28:32, 5.74s/it] {'loss': 0.467, 'learning_rate': 1.163066668330409e-05, 'epoch': 0.46} + 46%|████▋ | 4064/8750 [2:45:40<7:28:31, 5.74s/it] {'loss': 0.467, 'learning_rate': 1.163066668330409e-05, 'epoch': 0.46} + 46%|████▋ | 4064/8750 [2:45:37<7:28:32, 5.74s/it] 46%|████▋ | 4065/8750 [2:45:46<7:31:20, 5.78s/it] 46%|████▋ | 4065/8750 [2:45:43<7:31:20, 5.78s/it] {'loss': 0.4489, 'learning_rate': 1.1627014465387685e-05, 'epoch': 0.46} + 46%|████▋ | 4065/8750 [2:45:46<7:31:20, 5.78s/it] {'loss': 0.4489, 'learning_rate': 1.1627014465387685e-05, 'epoch': 0.46} + 46%|████▋ | 4065/8750 [2:45:43<7:31:20, 5.78s/it] 46%|████▋ | 4066/8750 [2:45:52<7:32:39, 5.80s/it] 46%|████▋ | 4066/8750 [2:45:49<7:32:40, 5.80s/it] {'loss': 0.4678, 'learning_rate': 1.1623362024533974e-05, 'epoch': 0.46} + 46%|████▋ | 4066/8750 [2:45:52<7:32:39, 5.80s/it] {'loss': 0.4678, 'learning_rate': 1.1623362024533974e-05, 'epoch': 0.46} + 46%|████▋ | 4066/8750 [2:45:49<7:32:40, 5.80s/it] 
46%|████▋ | 4067/8750 [2:45:54<7:28:19, 5.74s/it] 46%|████▋ | 4067/8750 [2:45:57<7:28:21, 5.74s/it] {'loss': 0.4925, 'learning_rate': 1.161970936124342e-05, 'epoch': 0.46} + 46%|████▋ | 4067/8750 [2:45:57<7:28:21, 5.74s/it] {'loss': 0.4925, 'learning_rate': 1.161970936124342e-05, 'epoch': 0.46} + 46%|████▋ | 4067/8750 [2:45:54<7:28:19, 5.74s/it] 46%|████▋ | 4068/8750 [2:46:03<7:28:01, 5.74s/it] 46%|████▋ | 4068/8750 [2:46:00<7:28:00, 5.74s/it] {'loss': 0.4605, 'learning_rate': 1.1616056476016521e-05, 'epoch': 0.46} + 46%|████▋ | 4068/8750 [2:46:03<7:28:01, 5.74s/it] {'loss': 0.4605, 'learning_rate': 1.1616056476016521e-05, 'epoch': 0.46} + 46%|████▋ | 4068/8750 [2:46:00<7:28:00, 5.74s/it] 47%|████▋ | 4069/8750 [2:46:09<7:29:21, 5.76s/it] 47%|████▋ | 4069/8750 [2:46:06<7:29:21, 5.76s/it] {'loss': 0.4645, 'learning_rate': 1.1612403369353806e-05, 'epoch': 0.47} + 47%|████▋ | 4069/8750 [2:46:09<7:29:21, 5.76s/it] {'loss': 0.4645, 'learning_rate': 1.1612403369353806e-05, 'epoch': 0.47} + 47%|████▋ | 4069/8750 [2:46:06<7:29:21, 5.76s/it] 47%|████▋ | 4070/8750 [2:46:15<7:31:34, 5.79s/it] 47%|████▋ | 4070/8750 [2:46:12<7:31:34, 5.79s/it] {'loss': 0.4682, 'learning_rate': 1.1608750041755832e-05, 'epoch': 0.47} + 47%|████▋ | 4070/8750 [2:46:15<7:31:34, 5.79s/it] {'loss': 0.4682, 'learning_rate': 1.1608750041755832e-05, 'epoch': 0.47} + 47%|████▋ | 4070/8750 [2:46:12<7:31:34, 5.79s/it] 47%|████▋ | 4071/8750 [2:46:20<7:31:57, 5.80s/it] 47%|████▋ | 4071/8750 [2:46:17<7:31:57, 5.80s/it] {'loss': 0.4891, 'learning_rate': 1.1605096493723183e-05, 'epoch': 0.47} + 47%|████▋ | 4071/8750 [2:46:20<7:31:57, 5.80s/it] {'loss': 0.4891, 'learning_rate': 1.1605096493723183e-05, 'epoch': 0.47} + 47%|████▋ | 4071/8750 [2:46:17<7:31:57, 5.80s/it] 47%|████▋ | 4072/8750 [2:46:23<7:31:43, 5.79s/it] 47%|████▋ | 4072/8750 [2:46:26<7:31:44, 5.79s/it] {'loss': 0.4667, 'learning_rate': 1.1601442725756478e-05, 'epoch': 0.47} + 47%|████▋ | 4072/8750 [2:46:26<7:31:44, 5.79s/it] {'loss': 0.4667, 
'learning_rate': 1.1601442725756478e-05, 'epoch': 0.47} + 47%|████▋ | 4072/8750 [2:46:23<7:31:43, 5.79s/it] 47%|████▋ | 4073/8750 [2:46:29<7:31:09, 5.79s/it] 47%|████▋ | 4073/8750 [2:46:32<7:31:09, 5.79s/it] {'loss': 0.4829, 'learning_rate': 1.1597788738356365e-05, 'epoch': 0.47} + 47%|████▋ | 4073/8750 [2:46:32<7:31:09, 5.79s/it] {'loss': 0.4829, 'learning_rate': 1.1597788738356365e-05, 'epoch': 0.47} + 47%|████▋ | 4073/8750 [2:46:29<7:31:09, 5.79s/it] 47%|████▋ | 4074/8750 [2:46:35<7:30:37, 5.78s/it] 47%|████▋ | 4074/8750 [2:46:38<7:30:37, 5.78s/it] {'loss': 0.4345, 'learning_rate': 1.1594134532023525e-05, 'epoch': 0.47} + 47%|████▋ | 4074/8750 [2:46:38<7:30:37, 5.78s/it] {'loss': 0.4345, 'learning_rate': 1.1594134532023525e-05, 'epoch': 0.47} + 47%|████▋ | 4074/8750 [2:46:35<7:30:37, 5.78s/it] 47%|████▋ | 4075/8750 [2:46:40<7:29:26, 5.77s/it] 47%|████▋ | 4075/8750 [2:46:43<7:29:26, 5.77s/it] {'loss': 0.4448, 'learning_rate': 1.1590480107258663e-05, 'epoch': 0.47} + 47%|████▋ | 4075/8750 [2:46:43<7:29:26, 5.77s/it] {'loss': 0.4448, 'learning_rate': 1.1590480107258663e-05, 'epoch': 0.47} + 47%|████▋ | 4075/8750 [2:46:40<7:29:26, 5.77s/it] 47%|████▋ | 4076/8750 [2:46:46<7:27:59, 5.75s/it] 47%|████▋ | 4076/8750 [2:46:49<7:27:59, 5.75s/it] {'loss': 0.4806, 'learning_rate': 1.1586825464562515e-05, 'epoch': 0.47} + 47%|████▋ | 4076/8750 [2:46:49<7:27:59, 5.75s/it] {'loss': 0.4806, 'learning_rate': 1.1586825464562515e-05, 'epoch': 0.47} + 47%|████▋ | 4076/8750 [2:46:46<7:27:59, 5.75s/it] 47%|████▋ | 4077/8750 [2:46:55<7:25:41, 5.72s/it] 47%|████▋ | 4077/8750 [2:46:52<7:25:42, 5.72s/it] {'loss': 0.4897, 'learning_rate': 1.1583170604435851e-05, 'epoch': 0.47} + 47%|████▋ | 4077/8750 [2:46:55<7:25:41, 5.72s/it] {'loss': 0.4897, 'learning_rate': 1.1583170604435851e-05, 'epoch': 0.47} + 47%|████▋ | 4077/8750 [2:46:52<7:25:42, 5.72s/it] 47%|████▋ | 4078/8750 [2:47:01<7:27:06, 5.74s/it] 47%|████▋ | 4078/8750 [2:46:58<7:27:07, 5.74s/it] {'loss': 0.4704, 'learning_rate': 
1.1579515527379468e-05, 'epoch': 0.47} + 47%|████▋ | 4078/8750 [2:47:01<7:27:06, 5.74s/it] {'loss': 0.4704, 'learning_rate': 1.1579515527379468e-05, 'epoch': 0.47} + 47%|████▋ | 4078/8750 [2:46:58<7:27:07, 5.74s/it] 47%|████▋ | 4079/8750 [2:47:04<7:29:40, 5.78s/it] 47%|████▋ | 4079/8750 [2:47:06<7:29:40, 5.78s/it] {'loss': 0.4678, 'learning_rate': 1.1575860233894195e-05, 'epoch': 0.47} + 47%|████▋ | 4079/8750 [2:47:06<7:29:40, 5.78s/it] {'loss': 0.4678, 'learning_rate': 1.1575860233894195e-05, 'epoch': 0.47} + 47%|████▋ | 4079/8750 [2:47:04<7:29:40, 5.78s/it] 47%|████▋ | 4080/8750 [2:47:09<7:28:03, 5.76s/it] 47%|████▋ | 4080/8750 [2:47:12<7:28:04, 5.76s/it] {'loss': 0.4891, 'learning_rate': 1.1572204724480887e-05, 'epoch': 0.47} + 47%|████▋ | 4080/8750 [2:47:12<7:28:04, 5.76s/it] {'loss': 0.4891, 'learning_rate': 1.1572204724480887e-05, 'epoch': 0.47} + 47%|████▋ | 4080/8750 [2:47:09<7:28:03, 5.76s/it] 47%|████▋ | 4081/8750 [2:47:15<7:33:20, 5.83s/it] 47%|████▋ | 4081/8750 [2:47:18<7:33:20, 5.83s/it] {'loss': 0.4619, 'learning_rate': 1.1568548999640428e-05, 'epoch': 0.47} + 47%|████▋ | 4081/8750 [2:47:18<7:33:20, 5.83s/it] {'loss': 0.4619, 'learning_rate': 1.1568548999640428e-05, 'epoch': 0.47} + 47%|████▋ | 4081/8750 [2:47:15<7:33:20, 5.83s/it] 47%|████▋ | 4082/8750 [2:47:21<7:32:01, 5.81s/it] 47%|████▋ | 4082/8750 [2:47:24<7:32:02, 5.81s/it] {'loss': 0.4855, 'learning_rate': 1.1564893059873734e-05, 'epoch': 0.47} + 47%|████▋ | 4082/8750 [2:47:24<7:32:02, 5.81s/it] {'loss': 0.4855, 'learning_rate': 1.1564893059873734e-05, 'epoch': 0.47} + 47%|████▋ | 4082/8750 [2:47:21<7:32:01, 5.81s/it] 47%|████▋ | 4083/8750 [2:47:27<7:32:24, 5.82s/it] 47%|████▋ | 4083/8750 [2:47:30<7:32:24, 5.82s/it] {'loss': 0.4489, 'learning_rate': 1.1561236905681761e-05, 'epoch': 0.47} + 47%|████▋ | 4083/8750 [2:47:30<7:32:24, 5.82s/it] {'loss': 0.4489, 'learning_rate': 1.1561236905681761e-05, 'epoch': 0.47} + 47%|████▋ | 4083/8750 [2:47:27<7:32:24, 5.82s/it] 47%|████▋ | 4084/8750 
[2:47:33<7:35:29, 5.86s/it] 47%|████▋ | 4084/8750 [2:47:36<7:35:29, 5.86s/it] {'loss': 0.4747, 'learning_rate': 1.155758053756547e-05, 'epoch': 0.47} + 47%|████▋ | 4084/8750 [2:47:36<7:35:29, 5.86s/it] {'loss': 0.4747, 'learning_rate': 1.155758053756547e-05, 'epoch': 0.47} + 47%|████▋ | 4084/8750 [2:47:33<7:35:29, 5.86s/it] 47%|████▋ | 4085/8750 [2:47:39<7:33:57, 5.84s/it] 47%|████▋ | 4085/8750 [2:47:41<7:33:58, 5.84s/it] {'loss': 0.4877, 'learning_rate': 1.1553923956025871e-05, 'epoch': 0.47} + 47%|████▋ | 4085/8750 [2:47:41<7:33:58, 5.84s/it] {'loss': 0.4877, 'learning_rate': 1.1553923956025871e-05, 'epoch': 0.47} + 47%|████▋ | 4085/8750 [2:47:39<7:33:57, 5.84s/it] 47%|████▋ | 4086/8750 [2:47:44<7:33:24, 5.83s/it] 47%|████▋ | 4086/8750 [2:47:47<7:33:24, 5.83s/it] {'loss': 0.4616, 'learning_rate': 1.1550267161563998e-05, 'epoch': 0.47} + {'loss': 0.4616, 'learning_rate': 1.1550267161563998e-05, 'epoch': 0.47} + 47%|████▋ | 4086/8750 [2:47:47<7:33:24, 5.83s/it] 47%|████▋ | 4086/8750 [2:47:44<7:33:24, 5.83s/it] 47%|████▋ | 4087/8750 [2:47:50<7:36:25, 5.87s/it] 47%|████▋ | 4087/8750 [2:47:53<7:36:24, 5.87s/it] {'loss': 0.4657, 'learning_rate': 1.1546610154680908e-05, 'epoch': 0.47} + 47%|████▋ | 4087/8750 [2:47:53<7:36:24, 5.87s/it] {'loss': 0.4657, 'learning_rate': 1.1546610154680908e-05, 'epoch': 0.47} + 47%|████▋ | 4087/8750 [2:47:50<7:36:25, 5.87s/it] 47%|████▋ | 4088/8750 [2:47:59<7:33:53, 5.84s/it] 47%|████▋ | 4088/8750 [2:47:56<7:33:54, 5.84s/it] {'loss': 0.4554, 'learning_rate': 1.1542952935877703e-05, 'epoch': 0.47} + 47%|████▋ | 4088/8750 [2:47:59<7:33:53, 5.84s/it] {'loss': 0.4554, 'learning_rate': 1.1542952935877703e-05, 'epoch': 0.47} + 47%|████▋ | 4088/8750 [2:47:56<7:33:54, 5.84s/it] 47%|████▋ | 4089/8750 [2:48:02<7:31:30, 5.81s/it] 47%|████▋ | 4089/8750 [2:48:05<7:31:30, 5.81s/it] {'loss': 0.4621, 'learning_rate': 1.1539295505655494e-05, 'epoch': 0.47} + 47%|████▋ | 4089/8750 [2:48:05<7:31:30, 5.81s/it] {'loss': 0.4621, 'learning_rate': 
1.1539295505655494e-05, 'epoch': 0.47} + 47%|████▋ | 4089/8750 [2:48:02<7:31:30, 5.81s/it] 47%|████▋ | 4090/8750 [2:48:08<7:29:22, 5.79s/it] 47%|████▋ | 4090/8750 [2:48:10<7:29:23, 5.79s/it] {'loss': 0.4696, 'learning_rate': 1.1535637864515436e-05, 'epoch': 0.47} + 47%|████▋ | 4090/8750 [2:48:10<7:29:23, 5.79s/it] {'loss': 0.4696, 'learning_rate': 1.1535637864515436e-05, 'epoch': 0.47} + 47%|████▋ | 4090/8750 [2:48:08<7:29:22, 5.79s/it] 47%|████▋ | 4091/8750 [2:48:13<7:28:29, 5.78s/it] 47%|████▋ | 4091/8750 [2:48:16<7:28:30, 5.78s/it] {'loss': 0.4821, 'learning_rate': 1.1531980012958706e-05, 'epoch': 0.47} + 47%|████▋ | 4091/8750 [2:48:16<7:28:30, 5.78s/it] {'loss': 0.4821, 'learning_rate': 1.1531980012958706e-05, 'epoch': 0.47} + 47%|████▋ | 4091/8750 [2:48:13<7:28:29, 5.78s/it] 47%|████▋ | 4092/8750 [2:48:19<7:26:55, 5.76s/it] 47%|████▋ | 4092/8750 [2:48:22<7:26:55, 5.76s/it] {'loss': 0.4452, 'learning_rate': 1.152832195148651e-05, 'epoch': 0.47} + 47%|████▋ | 4092/8750 [2:48:22<7:26:55, 5.76s/it] {'loss': 0.4452, 'learning_rate': 1.152832195148651e-05, 'epoch': 0.47} + 47%|████▋ | 4092/8750 [2:48:19<7:26:55, 5.76s/it] 47%|████▋ | 4093/8750 [2:48:25<7:26:30, 5.75s/it] 47%|████▋ | 4093/8750 [2:48:28<7:26:30, 5.75s/it] {'loss': 0.5015, 'learning_rate': 1.1524663680600083e-05, 'epoch': 0.47} + 47%|████▋ | 4093/8750 [2:48:28<7:26:30, 5.75s/it] {'loss': 0.5015, 'learning_rate': 1.1524663680600083e-05, 'epoch': 0.47} + 47%|████▋ | 4093/8750 [2:48:25<7:26:30, 5.75s/it] 47%|████▋ | 4094/8750 [2:48:31<7:31:28, 5.82s/it] 47%|████▋ | 4094/8750 [2:48:34<7:31:28, 5.82s/it] {'loss': 0.4502, 'learning_rate': 1.1521005200800694e-05, 'epoch': 0.47} + 47%|████▋ | 4094/8750 [2:48:34<7:31:28, 5.82s/it] {'loss': 0.4502, 'learning_rate': 1.1521005200800694e-05, 'epoch': 0.47} + 47%|████▋ | 4094/8750 [2:48:31<7:31:28, 5.82s/it] 47%|████▋ | 4095/8750 [2:48:36<7:27:25, 5.77s/it] 47%|████▋ | 4095/8750 [2:48:39<7:27:25, 5.77s/it] {'loss': 0.4993, 'learning_rate': 1.1517346512589635e-05, 
'epoch': 0.47} + 47%|████▋ | 4095/8750 [2:48:39<7:27:25, 5.77s/it] {'loss': 0.4993, 'learning_rate': 1.1517346512589635e-05, 'epoch': 0.47} + 47%|████▋ | 4095/8750 [2:48:36<7:27:25, 5.77s/it] 47%|████▋ | 4096/8750 [2:48:42<7:30:36, 5.81s/it] 47%|████▋ | 4096/8750 [2:48:45<7:30:36, 5.81s/it] {'loss': 0.4516, 'learning_rate': 1.1513687616468225e-05, 'epoch': 0.47} + 47%|████▋ | 4096/8750 [2:48:42<7:30:36, 5.81s/it]{'loss': 0.4516, 'learning_rate': 1.1513687616468225e-05, 'epoch': 0.47} + 47%|████▋ | 4096/8750 [2:48:45<7:30:36, 5.81s/it] 47%|████▋ | 4097/8750 [2:48:48<7:35:14, 5.87s/it] 47%|████▋ | 4097/8750 [2:48:51<7:35:15, 5.87s/it] {'loss': 0.4495, 'learning_rate': 1.1510028512937818e-05, 'epoch': 0.47} + 47%|████▋ | 4097/8750 [2:48:51<7:35:15, 5.87s/it] {'loss': 0.4495, 'learning_rate': 1.1510028512937818e-05, 'epoch': 0.47} + 47%|████▋ | 4097/8750 [2:48:48<7:35:14, 5.87s/it] 47%|████▋ | 4098/8750 [2:48:57<7:29:26, 5.80s/it] 47%|████▋ | 4098/8750 [2:48:54<7:29:27, 5.80s/it] {'loss': 0.4643, 'learning_rate': 1.1506369202499791e-05, 'epoch': 0.47} + 47%|████▋ | 4098/8750 [2:48:57<7:29:26, 5.80s/it] {'loss': 0.4643, 'learning_rate': 1.1506369202499791e-05, 'epoch': 0.47} + 47%|████▋ | 4098/8750 [2:48:54<7:29:27, 5.80s/it] 47%|████▋ | 4099/8750 [2:49:02<7:23:39, 5.72s/it] 47%|████▋ | 4099/8750 [2:49:00<7:23:39, 5.72s/it] {'loss': 0.474, 'learning_rate': 1.1502709685655553e-05, 'epoch': 0.47} + 47%|████▋ | 4099/8750 [2:49:02<7:23:39, 5.72s/it] {'loss': 0.474, 'learning_rate': 1.1502709685655553e-05, 'epoch': 0.47} + 47%|████▋ | 4099/8750 [2:49:00<7:23:39, 5.72s/it]14 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... 
+07 AutoResumeHook: Checking whether to suspend... + 47%|████▋ | 4100/8750 [2:49:08<7:24:56, 5.74s/it]12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 47%|████▋ | 4100/8750 [2:49:05<7:24:56, 5.74s/it]9 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend...13 AutoResumeHook: Checking whether to suspend... + +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.448, 'learning_rate': 1.149904996290654e-05, 'epoch': 0.47} + 47%|████▋ | 4100/8750 [2:49:08<7:24:56, 5.74s/it] {'loss': 0.448, 'learning_rate': 1.149904996290654e-05, 'epoch': 0.47} + 47%|████▋ | 4100/8750 [2:49:05<7:24:56, 5.74s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 47%|████▋ | 4101/8750 [2:49:25<12:57:26, 10.03s/it] 47%|████▋ | 4101/8750 [2:49:28<12:57:27, 10.03s/it] {'loss': 0.4667, 'learning_rate': 1.149539003475421e-05, 'epoch': 0.47} + 47%|████▋ | 4101/8750 [2:49:28<12:57:27, 10.03s/it] {'loss': 0.4667, 'learning_rate': 1.149539003475421e-05, 'epoch': 0.47} + 47%|████▋ | 4101/8750 [2:49:25<12:57:26, 10.03s/it] 47%|████▋ | 4102/8750 [2:49:31<11:19:21, 8.77s/it] 47%|████▋ | 4102/8750 [2:49:34<11:19:22, 8.77s/it] {'loss': 0.4557, 'learning_rate': 1.1491729901700062e-05, 'epoch': 0.47} + 47%|████▋ | 4102/8750 [2:49:34<11:19:22, 8.77s/it] {'loss': 0.4557, 'learning_rate': 1.1491729901700062e-05, 'epoch': 0.47} + 47%|████▋ | 4102/8750 [2:49:31<11:19:21, 8.77s/it] 47%|████▋ | 4103/8750 [2:49:37<10:11:01, 7.89s/it] 47%|████▋ | 4103/8750 [2:49:40<10:11:01, 7.89s/it] {'loss': 0.4826, 'learning_rate': 1.148806956424561e-05, 'epoch': 0.47} + 47%|████▋ | 4103/8750 [2:49:40<10:11:01, 7.89s/it] {'loss': 0.4826, 'learning_rate': 1.148806956424561e-05, 'epoch': 0.47} + 47%|████▋ | 4103/8750 [2:49:37<10:11:01, 7.89s/it] 47%|████▋ | 4104/8750 [2:49:46<9:21:36, 7.25s/it] 47%|████▋ | 4104/8750 [2:49:43<9:21:36, 7.25s/it] {'loss': 0.4517, 'learning_rate': 1.1484409022892406e-05, 'epoch': 0.47} + 47%|████▋ | 4104/8750 [2:49:46<9:21:36, 7.25s/it] {'loss': 0.4517, 'learning_rate': 1.1484409022892406e-05, 'epoch': 0.47} + 47%|████▋ | 4104/8750 [2:49:43<9:21:36, 7.25s/it] 47%|████▋ | 4105/8750 [2:49:48<8:43:48, 6.77s/it] 47%|████▋ | 4105/8750 [2:49:51<8:43:48, 6.77s/it] {'loss': 0.4756, 'learning_rate': 1.1480748278142025e-05, 'epoch': 0.47} + 47%|████▋ | 4105/8750 [2:49:48<8:43:48, 6.77s/it]{'loss': 0.4756, 'learning_rate': 1.1480748278142025e-05, 'epoch': 0.47} + 47%|████▋ | 4105/8750 [2:49:51<8:43:48, 6.77s/it] 47%|████▋ | 4106/8750 [2:49:54<8:22:04, 6.49s/it] 47%|████▋ | 4106/8750 [2:49:57<8:22:04, 6.49s/it] {'loss': 0.4744, 'learning_rate': 1.1477087330496071e-05, 'epoch': 0.47} + {'loss': 0.4744, 'learning_rate': 
1.1477087330496071e-05, 'epoch': 0.47} + 47%|████▋ | 4106/8750 [2:49:57<8:22:04, 6.49s/it] 47%|████▋ | 4106/8750 [2:49:54<8:22:04, 6.49s/it] 47%|████▋ | 4107/8750 [2:50:00<8:03:53, 6.25s/it] 47%|████▋ | 4107/8750 [2:50:03<8:03:53, 6.25s/it] {'loss': 0.4785, 'learning_rate': 1.1473426180456174e-05, 'epoch': 0.47} + 47%|████▋ | 4107/8750 [2:50:03<8:03:53, 6.25s/it] {'loss': 0.4785, 'learning_rate': 1.1473426180456174e-05, 'epoch': 0.47} + 47%|████▋ | 4107/8750 [2:50:00<8:03:53, 6.25s/it] 47%|████▋ | 4108/8750 [2:50:06<7:53:19, 6.12s/it] 47%|████▋ | 4108/8750 [2:50:09<7:53:19, 6.12s/it] {'loss': 0.4586, 'learning_rate': 1.1469764828523995e-05, 'epoch': 0.47} + 47%|████▋ | 4108/8750 [2:50:09<7:53:19, 6.12s/it] {'loss': 0.4586, 'learning_rate': 1.1469764828523995e-05, 'epoch': 0.47} + 47%|████▋ | 4108/8750 [2:50:06<7:53:19, 6.12s/it] 47%|████▋ | 4109/8750 [2:50:11<7:44:56, 6.01s/it] 47%|████▋ | 4109/8750 [2:50:14<7:44:57, 6.01s/it] {'loss': 0.4743, 'learning_rate': 1.146610327520122e-05, 'epoch': 0.47} + 47%|████▋ | 4109/8750 [2:50:14<7:44:57, 6.01s/it] {'loss': 0.4743, 'learning_rate': 1.146610327520122e-05, 'epoch': 0.47} + 47%|████▋ | 4109/8750 [2:50:11<7:44:56, 6.01s/it] 47%|████▋ | 4110/8750 [2:50:17<7:42:17, 5.98s/it] 47%|████▋ | 4110/8750 [2:50:20<7:42:17, 5.98s/it] {'loss': 0.4627, 'learning_rate': 1.1462441520989565e-05, 'epoch': 0.47} + 47%|████▋ | 4110/8750 [2:50:20<7:42:17, 5.98s/it] {'loss': 0.4627, 'learning_rate': 1.1462441520989565e-05, 'epoch': 0.47} + 47%|████▋ | 4110/8750 [2:50:17<7:42:17, 5.98s/it] 47%|████▋ | 4111/8750 [2:50:23<7:35:26, 5.89s/it] 47%|████▋ | 4111/8750 [2:50:26<7:35:25, 5.89s/it] {'loss': 0.4851, 'learning_rate': 1.1458779566390768e-05, 'epoch': 0.47} + 47%|████▋ | 4111/8750 [2:50:26<7:35:25, 5.89s/it] {'loss': 0.4851, 'learning_rate': 1.1458779566390768e-05, 'epoch': 0.47} + 47%|████▋ | 4111/8750 [2:50:23<7:35:26, 5.89s/it] 47%|████▋ | 4112/8750 [2:50:29<7:32:06, 5.85s/it] 47%|████▋ | 4112/8750 [2:50:32<7:32:06, 5.85s/it] {'loss': 
0.4588, 'learning_rate': 1.1455117411906604e-05, 'epoch': 0.47} + 47%|████▋ | 4112/8750 [2:50:32<7:32:06, 5.85s/it] {'loss': 0.4588, 'learning_rate': 1.1455117411906604e-05, 'epoch': 0.47} + 47%|████▋ | 4112/8750 [2:50:29<7:32:06, 5.85s/it] 47%|████▋ | 4113/8750 [2:50:35<7:29:08, 5.81s/it] 47%|████▋ | 4113/8750 [2:50:37<7:29:07, 5.81s/it] {'loss': 0.47, 'learning_rate': 1.1451455058038864e-05, 'epoch': 0.47} + 47%|████▋ | 4113/8750 [2:50:37<7:29:07, 5.81s/it] {'loss': 0.47, 'learning_rate': 1.1451455058038864e-05, 'epoch': 0.47} + 47%|████▋ | 4113/8750 [2:50:35<7:29:08, 5.81s/it] 47%|████▋ | 4114/8750 [2:50:40<7:29:21, 5.82s/it] 47%|████▋ | 4114/8750 [2:50:43<7:29:21, 5.82s/it] {'loss': 0.4498, 'learning_rate': 1.1447792505289384e-05, 'epoch': 0.47} + 47%|████▋ | 4114/8750 [2:50:43<7:29:21, 5.82s/it] {'loss': 0.4498, 'learning_rate': 1.1447792505289384e-05, 'epoch': 0.47} + 47%|████▋ | 4114/8750 [2:50:40<7:29:21, 5.82s/it] 47%|████▋ | 4115/8750 [2:50:46<7:29:00, 5.81s/it] 47%|████▋ | 4115/8750 [2:50:49<7:29:00, 5.81s/it] {'loss': 0.4372, 'learning_rate': 1.1444129754159998e-05, 'epoch': 0.47} + 47%|████▋ | 4115/8750 [2:50:49<7:29:00, 5.81s/it] {'loss': 0.4372, 'learning_rate': 1.1444129754159998e-05, 'epoch': 0.47} + 47%|████▋ | 4115/8750 [2:50:46<7:29:00, 5.81s/it] 47%|████▋ | 4116/8750 [2:50:52<7:25:47, 5.77s/it] 47%|████▋ | 4116/8750 [2:50:55<7:25:46, 5.77s/it] {'loss': 0.4844, 'learning_rate': 1.1440466805152596e-05, 'epoch': 0.47} + 47%|████▋ | 4116/8750 [2:50:55<7:25:46, 5.77s/it] {'loss': 0.4844, 'learning_rate': 1.1440466805152596e-05, 'epoch': 0.47} + 47%|████▋ | 4116/8750 [2:50:52<7:25:47, 5.77s/it] 47%|████▋ | 4117/8750 [2:50:58<7:28:43, 5.81s/it] 47%|████▋ | 4117/8750 [2:51:01<7:28:43, 5.81s/it] {'loss': 0.4632, 'learning_rate': 1.1436803658769082e-05, 'epoch': 0.47} + 47%|████▋ | 4117/8750 [2:51:01<7:28:43, 5.81s/it] {'loss': 0.4632, 'learning_rate': 1.1436803658769082e-05, 'epoch': 0.47} + 47%|████▋ | 4117/8750 [2:50:58<7:28:43, 5.81s/it] 47%|████▋ | 
4118/8750 [2:51:04<7:28:16, 5.81s/it] 47%|████▋ | 4118/8750 [2:51:06<7:28:16, 5.81s/it] {'loss': 0.4676, 'learning_rate': 1.1433140315511392e-05, 'epoch': 0.47} + 47%|████▋ | 4118/8750 [2:51:06<7:28:16, 5.81s/it] {'loss': 0.4676, 'learning_rate': 1.1433140315511392e-05, 'epoch': 0.47} + 47%|████▋ | 4118/8750 [2:51:04<7:28:16, 5.81s/it] 47%|████▋ | 4119/8750 [2:51:09<7:30:25, 5.84s/it] 47%|████▋ | 4119/8750 [2:51:12<7:30:24, 5.84s/it] {'loss': 0.4622, 'learning_rate': 1.142947677588148e-05, 'epoch': 0.47} + 47%|████▋ | 4119/8750 [2:51:12<7:30:24, 5.84s/it] {'loss': 0.4622, 'learning_rate': 1.142947677588148e-05, 'epoch': 0.47} + 47%|████▋ | 4119/8750 [2:51:09<7:30:25, 5.84s/it] 47%|████▋ | 4120/8750 [2:51:15<7:33:58, 5.88s/it] 47%|████▋ | 4120/8750 [2:51:18<7:33:58, 5.88s/it] {'loss': 0.4423, 'learning_rate': 1.1425813040381332e-05, 'epoch': 0.47} + 47%|████▋ | 4120/8750 [2:51:18<7:33:58, 5.88s/it] {'loss': 0.4423, 'learning_rate': 1.1425813040381332e-05, 'epoch': 0.47} + 47%|████▋ | 4120/8750 [2:51:15<7:33:58, 5.88s/it] 47%|████▋ | 4121/8750 [2:51:21<7:28:15, 5.81s/it] 47%|████▋ | 4121/8750 [2:51:24<7:28:15, 5.81s/it] {'loss': 0.4961, 'learning_rate': 1.1422149109512967e-05, 'epoch': 0.47} + 47%|████▋ | 4121/8750 [2:51:24<7:28:15, 5.81s/it] {'loss': 0.4961, 'learning_rate': 1.1422149109512967e-05, 'epoch': 0.47} + 47%|████▋ | 4121/8750 [2:51:21<7:28:15, 5.81s/it] 47%|████▋ | 4122/8750 [2:51:27<7:26:22, 5.79s/it] 47%|████▋ | 4122/8750 [2:51:30<7:26:22, 5.79s/it] {'loss': 0.4723, 'learning_rate': 1.1418484983778421e-05, 'epoch': 0.47} + 47%|████▋ | 4122/8750 [2:51:30<7:26:22, 5.79s/it] {'loss': 0.4723, 'learning_rate': 1.1418484983778421e-05, 'epoch': 0.47} + 47%|████▋ | 4122/8750 [2:51:27<7:26:22, 5.79s/it] 47%|████▋ | 4123/8750 [2:51:33<7:29:46, 5.83s/it] 47%|████▋ | 4123/8750 [2:51:36<7:29:46, 5.83s/it] {'loss': 0.486, 'learning_rate': 1.1414820663679768e-05, 'epoch': 0.47} + 47%|████▋ | 4123/8750 [2:51:36<7:29:46, 5.83s/it] {'loss': 0.486, 'learning_rate': 
1.1414820663679768e-05, 'epoch': 0.47} + 47%|████▋ | 4123/8750 [2:51:33<7:29:46, 5.83s/it] 47%|████▋ | 4124/8750 [2:51:38<7:26:22, 5.79s/it] 47%|████▋ | 4124/8750 [2:51:41<7:26:22, 5.79s/it] {'loss': 0.4641, 'learning_rate': 1.1411156149719094e-05, 'epoch': 0.47} + 47%|████▋ | 4124/8750 [2:51:41<7:26:22, 5.79s/it] {'loss': 0.4641, 'learning_rate': 1.1411156149719094e-05, 'epoch': 0.47} + 47%|████▋ | 4124/8750 [2:51:38<7:26:22, 5.79s/it] 47%|████▋ | 4125/8750 [2:51:47<7:24:31, 5.77s/it] 47%|████▋ | 4125/8750 [2:51:44<7:24:32, 5.77s/it] {'loss': 0.4745, 'learning_rate': 1.1407491442398518e-05, 'epoch': 0.47} + 47%|████▋ | 4125/8750 [2:51:47<7:24:31, 5.77s/it] {'loss': 0.4745, 'learning_rate': 1.1407491442398518e-05, 'epoch': 0.47} + 47%|████▋ | 4125/8750 [2:51:44<7:24:32, 5.77s/it] 47%|████▋ | 4126/8750 [2:51:50<7:26:45, 5.80s/it] 47%|████▋ | 4126/8750 [2:51:53<7:26:45, 5.80s/it] {'loss': 0.4447, 'learning_rate': 1.1403826542220193e-05, 'epoch': 0.47} + {'loss': 0.4447, 'learning_rate': 1.1403826542220193e-05, 'epoch': 0.47} 47%|████▋ | 4126/8750 [2:51:53<7:26:45, 5.80s/it] + 47%|████▋ | 4126/8750 [2:51:50<7:26:45, 5.80s/it] 47%|████▋ | 4127/8750 [2:51:59<7:31:44, 5.86s/it] 47%|████▋ | 4127/8750 [2:51:56<7:31:45, 5.86s/it] {'loss': 0.4544, 'learning_rate': 1.1400161449686293e-05, 'epoch': 0.47} + 47%|████▋ | 4127/8750 [2:51:59<7:31:44, 5.86s/it] {'loss': 0.4544, 'learning_rate': 1.1400161449686293e-05, 'epoch': 0.47} + 47%|████▋ | 4127/8750 [2:51:56<7:31:45, 5.86s/it] 47%|████▋ | 4128/8750 [2:52:02<7:29:42, 5.84s/it] 47%|████▋ | 4128/8750 [2:52:05<7:29:42, 5.84s/it] {'loss': 0.4771, 'learning_rate': 1.139649616529901e-05, 'epoch': 0.47} + 47%|████▋ | 4128/8750 [2:52:05<7:29:42, 5.84s/it] {'loss': 0.4771, 'learning_rate': 1.139649616529901e-05, 'epoch': 0.47} + 47%|████▋ | 4128/8750 [2:52:02<7:29:42, 5.84s/it] 47%|████▋ | 4129/8750 [2:52:07<7:23:22, 5.76s/it] 47%|████▋ | 4129/8750 [2:52:10<7:23:22, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.1392830689560577e-05, 
'epoch': 0.47} + 47%|████▋ | 4129/8750 [2:52:10<7:23:22, 5.76s/it] {'loss': 0.4845, 'learning_rate': 1.1392830689560577e-05, 'epoch': 0.47} + 47%|████▋ | 4129/8750 [2:52:07<7:23:22, 5.76s/it] 47%|████▋ | 4130/8750 [2:52:13<7:19:46, 5.71s/it] 47%|████▋ | 4130/8750 [2:52:16<7:19:48, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.1389165022973238e-05, 'epoch': 0.47} + 47%|████▋ | 4130/8750 [2:52:16<7:19:48, 5.71s/it] {'loss': 0.4613, 'learning_rate': 1.1389165022973238e-05, 'epoch': 0.47} + 47%|████▋ | 4130/8750 [2:52:13<7:19:46, 5.71s/it] 47%|████▋ | 4131/8750 [2:52:22<7:20:02, 5.72s/it] 47%|████▋ | 4131/8750 [2:52:19<7:20:03, 5.72s/it] {'loss': 0.4566, 'learning_rate': 1.1385499166039281e-05, 'epoch': 0.47} + 47%|████▋ | 4131/8750 [2:52:22<7:20:02, 5.72s/it] {'loss': 0.4566, 'learning_rate': 1.1385499166039281e-05, 'epoch': 0.47} + 47%|████▋ | 4131/8750 [2:52:19<7:20:03, 5.72s/it] 47%|████▋ | 4132/8750 [2:52:24<7:19:34, 5.71s/it] 47%|████▋ | 4132/8750 [2:52:27<7:19:34, 5.71s/it] {'loss': 0.4597, 'learning_rate': 1.1381833119261003e-05, 'epoch': 0.47} + 47%|████▋ | 4132/8750 [2:52:27<7:19:34, 5.71s/it] {'loss': 0.4597, 'learning_rate': 1.1381833119261003e-05, 'epoch': 0.47} + 47%|████▋ | 4132/8750 [2:52:24<7:19:34, 5.71s/it] 47%|████▋ | 4133/8750 [2:52:30<7:17:25, 5.68s/it] 47%|████▋ | 4133/8750 [2:52:33<7:17:25, 5.68s/it] {'loss': 0.4653, 'learning_rate': 1.1378166883140738e-05, 'epoch': 0.47} + 47%|████▋ | 4133/8750 [2:52:33<7:17:25, 5.68s/it] {'loss': 0.4653, 'learning_rate': 1.1378166883140738e-05, 'epoch': 0.47} + 47%|████▋ | 4133/8750 [2:52:30<7:17:25, 5.68s/it] 47%|████▋ | 4134/8750 [2:52:36<7:21:22, 5.74s/it] 47%|████▋ | 4134/8750 [2:52:39<7:21:22, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.1374500458180839e-05, 'epoch': 0.47} + 47%|████▋ | 4134/8750 [2:52:39<7:21:22, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.1374500458180839e-05, 'epoch': 0.47} + 47%|████▋ | 4134/8750 [2:52:36<7:21:22, 5.74s/it] 47%|████▋ | 4135/8750 [2:52:42<7:24:15, 5.78s/it] 
47%|████▋ | 4135/8750 [2:52:45<7:24:15, 5.78s/it] {'loss': 0.4467, 'learning_rate': 1.137083384488369e-05, 'epoch': 0.47} + 47%|████▋ | 4135/8750 [2:52:45<7:24:15, 5.78s/it] {'loss': 0.4467, 'learning_rate': 1.137083384488369e-05, 'epoch': 0.47} + 47%|████▋ | 4135/8750 [2:52:42<7:24:15, 5.78s/it] 47%|████▋ | 4136/8750 [2:52:51<7:28:44, 5.84s/it] 47%|████▋ | 4136/8750 [2:52:48<7:28:45, 5.84s/it] {'loss': 0.4668, 'learning_rate': 1.1367167043751701e-05, 'epoch': 0.47} + 47%|████▋ | 4136/8750 [2:52:51<7:28:44, 5.84s/it] {'loss': 0.4668, 'learning_rate': 1.1367167043751701e-05, 'epoch': 0.47} + 47%|████▋ | 4136/8750 [2:52:48<7:28:45, 5.84s/it] 47%|████▋ | 4137/8750 [2:52:54<7:26:32, 5.81s/it] 47%|████▋ | 4137/8750 [2:52:56<7:26:33, 5.81s/it] {'loss': 0.4543, 'learning_rate': 1.1363500055287301e-05, 'epoch': 0.47} + 47%|████▋ | 4137/8750 [2:52:56<7:26:33, 5.81s/it] {'loss': 0.4543, 'learning_rate': 1.1363500055287301e-05, 'epoch': 0.47} + 47%|████▋ | 4137/8750 [2:52:54<7:26:32, 5.81s/it] 47%|████▋ | 4138/8750 [2:52:59<7:26:34, 5.81s/it] 47%|████▋ | 4138/8750 [2:53:02<7:26:34, 5.81s/it] {'loss': 0.4653, 'learning_rate': 1.1359832879992956e-05, 'epoch': 0.47} + 47%|████▋ | 4138/8750 [2:53:02<7:26:34, 5.81s/it] {'loss': 0.4653, 'learning_rate': 1.1359832879992956e-05, 'epoch': 0.47} + 47%|████▋ | 4138/8750 [2:52:59<7:26:34, 5.81s/it] 47%|████▋ | 4139/8750 [2:53:08<7:26:29, 5.81s/it] 47%|████▋ | 4139/8750 [2:53:05<7:26:29, 5.81s/it] {'loss': 0.4749, 'learning_rate': 1.1356165518371142e-05, 'epoch': 0.47} + 47%|████▋ | 4139/8750 [2:53:08<7:26:29, 5.81s/it] {'loss': 0.4749, 'learning_rate': 1.1356165518371142e-05, 'epoch': 0.47} + 47%|████▋ | 4139/8750 [2:53:05<7:26:29, 5.81s/it] 47%|████▋ | 4140/8750 [2:53:11<7:22:31, 5.76s/it] 47%|████▋ | 4140/8750 [2:53:14<7:22:33, 5.76s/it] {'loss': 0.4748, 'learning_rate': 1.1352497970924376e-05, 'epoch': 0.47} + 47%|████▋ | 4140/8750 [2:53:14<7:22:33, 5.76s/it] {'loss': 0.4748, 'learning_rate': 1.1352497970924376e-05, 'epoch': 0.47} + 
47%|████▋ | 4140/8750 [2:53:11<7:22:31, 5.76s/it] 47%|████▋ | 4141/8750 [2:53:16<7:21:32, 5.75s/it] 47%|████▋ | 4141/8750 [2:53:19<7:21:32, 5.75s/it] {'loss': 0.4532, 'learning_rate': 1.1348830238155191e-05, 'epoch': 0.47} + {'loss': 0.4532, 'learning_rate': 1.1348830238155191e-05, 'epoch': 0.47} 47%|████▋ | 4141/8750 [2:53:19<7:21:32, 5.75s/it] + 47%|████▋ | 4141/8750 [2:53:16<7:21:32, 5.75s/it] 47%|████▋ | 4142/8750 [2:53:25<7:19:37, 5.72s/it] 47%|████▋ | 4142/8750 [2:53:22<7:19:37, 5.72s/it] {'loss': 0.4458, 'learning_rate': 1.134516232056615e-05, 'epoch': 0.47} + 47%|████▋ | 4142/8750 [2:53:25<7:19:37, 5.72s/it] {'loss': 0.4458, 'learning_rate': 1.134516232056615e-05, 'epoch': 0.47} + 47%|████▋ | 4142/8750 [2:53:22<7:19:37, 5.72s/it] 47%|████▋ | 4143/8750 [2:53:28<7:17:34, 5.70s/it] 47%|████▋ | 4143/8750 [2:53:31<7:17:34, 5.70s/it] {'loss': 0.4781, 'learning_rate': 1.134149421865984e-05, 'epoch': 0.47} + 47%|████▋ | 4143/8750 [2:53:31<7:17:34, 5.70s/it] {'loss': 0.4781, 'learning_rate': 1.134149421865984e-05, 'epoch': 0.47} + 47%|████▋ | 4143/8750 [2:53:28<7:17:34, 5.70s/it] 47%|████▋ | 4144/8750 [2:53:34<7:26:57, 5.82s/it] 47%|████▋ | 4144/8750 [2:53:37<7:26:57, 5.82s/it] {'loss': 0.4488, 'learning_rate': 1.1337825932938866e-05, 'epoch': 0.47} + 47%|████▋ | 4144/8750 [2:53:37<7:26:57, 5.82s/it] {'loss': 0.4488, 'learning_rate': 1.1337825932938866e-05, 'epoch': 0.47} + 47%|████▋ | 4144/8750 [2:53:34<7:26:57, 5.82s/it] 47%|████▋ | 4145/8750 [2:53:42<7:22:27, 5.76s/it] 47%|████▋ | 4145/8750 [2:53:40<7:22:27, 5.76s/it] {'loss': 0.4605, 'learning_rate': 1.1334157463905876e-05, 'epoch': 0.47} + 47%|████▋ | 4145/8750 [2:53:42<7:22:27, 5.76s/it] {'loss': 0.4605, 'learning_rate': 1.1334157463905876e-05, 'epoch': 0.47} + 47%|████▋ | 4145/8750 [2:53:40<7:22:27, 5.76s/it] 47%|████▋ | 4146/8750 [2:53:45<7:26:08, 5.81s/it] 47%|████▋ | 4146/8750 [2:53:48<7:26:08, 5.81s/it] {'loss': 0.4605, 'learning_rate': 1.1330488812063526e-05, 'epoch': 0.47} + 47%|████▋ | 4146/8750 
[2:53:48<7:26:08, 5.81s/it] {'loss': 0.4605, 'learning_rate': 1.1330488812063526e-05, 'epoch': 0.47} + 47%|████▋ | 4146/8750 [2:53:45<7:26:08, 5.81s/it] 47%|████▋ | 4147/8750 [2:53:54<7:24:17, 5.79s/it] 47%|████▋ | 4147/8750 [2:53:51<7:24:17, 5.79s/it] {'loss': 0.4966, 'learning_rate': 1.1326819977914503e-05, 'epoch': 0.47} + 47%|████▋ | 4147/8750 [2:53:54<7:24:17, 5.79s/it] {'loss': 0.4966, 'learning_rate': 1.1326819977914503e-05, 'epoch': 0.47} + 47%|████▋ | 4147/8750 [2:53:51<7:24:17, 5.79s/it] 47%|████▋ | 4148/8750 [2:54:00<7:24:28, 5.79s/it] 47%|████▋ | 4148/8750 [2:53:57<7:24:28, 5.79s/it] {'loss': 0.4677, 'learning_rate': 1.132315096196152e-05, 'epoch': 0.47} + 47%|████▋ | 4148/8750 [2:54:00<7:24:28, 5.79s/it] {'loss': 0.4677, 'learning_rate': 1.132315096196152e-05, 'epoch': 0.47} + 47%|████▋ | 4148/8750 [2:53:57<7:24:28, 5.79s/it] 47%|████▋ | 4149/8750 [2:54:06<7:23:20, 5.78s/it] 47%|████▋ | 4149/8750 [2:54:03<7:23:20, 5.78s/it] {'loss': 0.4461, 'learning_rate': 1.1319481764707313e-05, 'epoch': 0.47} + 47%|████▋ | 4149/8750 [2:54:06<7:23:20, 5.78s/it] {'loss': 0.4461, 'learning_rate': 1.1319481764707313e-05, 'epoch': 0.47} + 47%|████▋ | 4149/8750 [2:54:03<7:23:20, 5.78s/it]11 AutoResumeHook: Checking whether to suspend... +5 1AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend...14 + AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 13AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +03 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... 
+ 47%|████▋ | 4150/8750 [2:54:11<7:19:59, 5.74s/it] AutoResumeHook: Checking whether to suspend... + 47%|████▋ | 4150/8750 [2:54:08<7:19:59, 5.74s/it] {'loss': 0.4615, 'learning_rate': 1.131581238665465e-05, 'epoch': 0.47} + 47%|████▋ | 4150/8750 [2:54:11<7:19:59, 5.74s/it] {'loss': 0.4615, 'learning_rate': 1.131581238665465e-05, 'epoch': 0.47} + 47%|████▋ | 4150/8750 [2:54:08<7:19:59, 5.74s/it] 47%|████▋ | 4151/8750 [2:54:14<7:21:40, 5.76s/it] 47%|████▋ | 4151/8750 [2:54:17<7:21:41, 5.76s/it] {'loss': 0.4617, 'learning_rate': 1.1312142828306309e-05, 'epoch': 0.47} + 47%|████▋ | 4151/8750 [2:54:17<7:21:41, 5.76s/it] {'loss': 0.4617, 'learning_rate': 1.1312142828306309e-05, 'epoch': 0.47} + 47%|████▋ | 4151/8750 [2:54:14<7:21:40, 5.76s/it] 47%|████▋ | 4152/8750 [2:54:20<7:32:12, 5.90s/it] 47%|████▋ | 4152/8750 [2:54:23<7:32:12, 5.90s/it] {'loss': 0.4534, 'learning_rate': 1.1308473090165107e-05, 'epoch': 0.47} + 47%|████▋ | 4152/8750 [2:54:23<7:32:12, 5.90s/it] {'loss': 0.4534, 'learning_rate': 1.1308473090165107e-05, 'epoch': 0.47} + 47%|████▋ | 4152/8750 [2:54:20<7:32:12, 5.90s/it] 47%|████▋ | 4153/8750 [2:54:29<7:29:26, 5.87s/it] 47%|████▋ | 4153/8750 [2:54:26<7:29:26, 5.87s/it] {'loss': 0.4575, 'learning_rate': 1.1304803172733878e-05, 'epoch': 0.47} + 47%|████▋ | 4153/8750 [2:54:29<7:29:26, 5.87s/it] {'loss': 0.4575, 'learning_rate': 1.1304803172733878e-05, 'epoch': 0.47} + 47%|████▋ | 4153/8750 [2:54:26<7:29:26, 5.87s/it] 47%|████▋ | 4154/8750 [2:54:32<7:25:13, 5.81s/it] 47%|████▋ | 4154/8750 [2:54:35<7:25:13, 5.81s/it] {'loss': 0.463, 'learning_rate': 1.1301133076515482e-05, 'epoch': 0.47} + 47%|████▋ | 4154/8750 [2:54:35<7:25:13, 5.81s/it] {'loss': 0.463, 'learning_rate': 1.1301133076515482e-05, 'epoch': 0.47} + 47%|████▋ | 4154/8750 [2:54:32<7:25:13, 5.81s/it] 47%|████▋ | 4155/8750 [2:54:37<7:18:50, 5.73s/it] 47%|████▋ | 4155/8750 [2:54:40<7:18:50, 5.73s/it] {'loss': 0.4876, 'learning_rate': 1.1297462802012806e-05, 'epoch': 0.47} + 47%|████▋ | 4155/8750 
[2:54:40<7:18:50, 5.73s/it] {'loss': 0.4876, 'learning_rate': 1.1297462802012806e-05, 'epoch': 0.47} + 47%|████▋ | 4155/8750 [2:54:37<7:18:50, 5.73s/it] 47%|████▋ | 4156/8750 [2:54:43<7:16:58, 5.71s/it] 47%|████▋ | 4156/8750 [2:54:46<7:16:58, 5.71s/it] {'loss': 0.4654, 'learning_rate': 1.129379234972876e-05, 'epoch': 0.47} + 47%|████▋ | 4156/8750 [2:54:46<7:16:58, 5.71s/it] {'loss': 0.4654, 'learning_rate': 1.129379234972876e-05, 'epoch': 0.47} + 47%|████▋ | 4156/8750 [2:54:43<7:16:58, 5.71s/it] 48%|████▊ | 4157/8750 [2:54:49<7:23:25, 5.79s/it] 48%|████▊ | 4157/8750 [2:54:52<7:23:24, 5.79s/it] {'loss': 0.4713, 'learning_rate': 1.1290121720166277e-05, 'epoch': 0.48} + 48%|████▊ | 4157/8750 [2:54:52<7:23:24, 5.79s/it] {'loss': 0.4713, 'learning_rate': 1.1290121720166277e-05, 'epoch': 0.48} + 48%|████▊ | 4157/8750 [2:54:49<7:23:25, 5.79s/it] 48%|████▊ | 4158/8750 [2:54:55<7:29:56, 5.88s/it] 48%|████▊ | 4158/8750 [2:54:58<7:29:56, 5.88s/it] {'loss': 0.4479, 'learning_rate': 1.1286450913828313e-05, 'epoch': 0.48} + 48%|████▊ | 4158/8750 [2:54:58<7:29:56, 5.88s/it] {'loss': 0.4479, 'learning_rate': 1.1286450913828313e-05, 'epoch': 0.48} + 48%|████▊ | 4158/8750 [2:54:55<7:29:56, 5.88s/it] 48%|████▊ | 4159/8750 [2:55:04<7:26:57, 5.84s/it] 48%|████▊ | 4159/8750 [2:55:01<7:26:57, 5.84s/it] {'loss': 0.4562, 'learning_rate': 1.1282779931217852e-05, 'epoch': 0.48} + 48%|████▊ | 4159/8750 [2:55:04<7:26:57, 5.84s/it] {'loss': 0.4562, 'learning_rate': 1.1282779931217852e-05, 'epoch': 0.48} + 48%|████▊ | 4159/8750 [2:55:01<7:26:57, 5.84s/it] 48%|████▊ | 4160/8750 [2:55:07<7:24:28, 5.81s/it] 48%|████▊ | 4160/8750 [2:55:10<7:24:28, 5.81s/it] {'loss': 0.4596, 'learning_rate': 1.1279108772837901e-05, 'epoch': 0.48} + 48%|████▊ | 4160/8750 [2:55:10<7:24:28, 5.81s/it] {'loss': 0.4596, 'learning_rate': 1.1279108772837901e-05, 'epoch': 0.48} + 48%|████▊ | 4160/8750 [2:55:07<7:24:28, 5.81s/it] 48%|████▊ | 4161/8750 [2:55:15<7:26:24, 5.84s/it] 48%|████▊ | 4161/8750 [2:55:13<7:26:24, 
5.84s/it] {'loss': 0.469, 'learning_rate': 1.1275437439191493e-05, 'epoch': 0.48} + 48%|████▊ | 4161/8750 [2:55:15<7:26:24, 5.84s/it] {'loss': 0.469, 'learning_rate': 1.1275437439191493e-05, 'epoch': 0.48} + 48%|████▊ | 4161/8750 [2:55:13<7:26:24, 5.84s/it] 48%|████▊ | 4162/8750 [2:55:18<7:23:56, 5.81s/it] 48%|████▊ | 4162/8750 [2:55:21<7:23:56, 5.81s/it]{'loss': 0.4607, 'learning_rate': 1.1271765930781677e-05, 'epoch': 0.48} + 48%|████▊ | 4162/8750 [2:55:21<7:23:56, 5.81s/it] {'loss': 0.4607, 'learning_rate': 1.1271765930781677e-05, 'epoch': 0.48} + 48%|████▊ | 4162/8750 [2:55:18<7:23:56, 5.81s/it] 48%|████▊ | 4163/8750 [2:55:27<7:19:24, 5.75s/it] 48%|████▊ | 4163/8750 [2:55:24<7:19:24, 5.75s/it] {'loss': 0.471, 'learning_rate': 1.1268094248111536e-05, 'epoch': 0.48} + 48%|████▊ | 4163/8750 [2:55:27<7:19:24, 5.75s/it] {'loss': 0.471, 'learning_rate': 1.1268094248111536e-05, 'epoch': 0.48} + 48%|████▊ | 4163/8750 [2:55:24<7:19:24, 5.75s/it] 48%|████▊ | 4164/8750 [2:55:33<7:17:31, 5.72s/it] 48%|████▊ | 4164/8750 [2:55:30<7:17:31, 5.72s/it] {'loss': 0.4497, 'learning_rate': 1.1264422391684171e-05, 'epoch': 0.48} + 48%|████▊ | 4164/8750 [2:55:33<7:17:31, 5.72s/it] {'loss': 0.4497, 'learning_rate': 1.1264422391684171e-05, 'epoch': 0.48} + 48%|████▊ | 4164/8750 [2:55:30<7:17:31, 5.72s/it] 48%|████▊ | 4165/8750 [2:55:35<7:17:49, 5.73s/it] 48%|████▊ | 4165/8750 [2:55:38<7:17:49, 5.73s/it] {'loss': 0.4635, 'learning_rate': 1.126075036200271e-05, 'epoch': 0.48} + 48%|████▊ | 4165/8750 [2:55:38<7:17:49, 5.73s/it] {'loss': 0.4635, 'learning_rate': 1.126075036200271e-05, 'epoch': 0.48} + 48%|████▊ | 4165/8750 [2:55:35<7:17:49, 5.73s/it] 48%|████▊ | 4166/8750 [2:55:44<7:16:15, 5.71s/it] 48%|████▊ | 4166/8750 [2:55:41<7:16:15, 5.71s/it] {'loss': 0.4637, 'learning_rate': 1.1257078159570303e-05, 'epoch': 0.48} + 48%|████▊ | 4166/8750 [2:55:44<7:16:15, 5.71s/it] {'loss': 0.4637, 'learning_rate': 1.1257078159570303e-05, 'epoch': 0.48} + 48%|████▊ | 4166/8750 [2:55:41<7:16:15, 
5.71s/it] 48%|████▊ | 4167/8750 [2:55:47<7:15:37, 5.70s/it] 48%|████▊ | 4167/8750 [2:55:50<7:15:37, 5.70s/it] {'loss': 0.4544, 'learning_rate': 1.125340578489012e-05, 'epoch': 0.48} + 48%|████▊ | 4167/8750 [2:55:50<7:15:37, 5.70s/it] {'loss': 0.4544, 'learning_rate': 1.125340578489012e-05, 'epoch': 0.48} + 48%|████▊ | 4167/8750 [2:55:47<7:15:37, 5.70s/it] 48%|████▊ | 4168/8750 [2:55:53<7:19:34, 5.76s/it] 48%|████▊ | 4168/8750 [2:55:55<7:19:35, 5.76s/it] {'loss': 0.4707, 'learning_rate': 1.1249733238465359e-05, 'epoch': 0.48} + 48%|████▊ | 4168/8750 [2:55:55<7:19:35, 5.76s/it] {'loss': 0.4707, 'learning_rate': 1.1249733238465359e-05, 'epoch': 0.48} + 48%|████▊ | 4168/8750 [2:55:53<7:19:34, 5.76s/it] 48%|████▊ | 4169/8750 [2:56:01<7:20:22, 5.77s/it] 48%|████▊ | 4169/8750 [2:55:58<7:20:22, 5.77s/it] {'loss': 0.4489, 'learning_rate': 1.1246060520799244e-05, 'epoch': 0.48} + 48%|████▊ | 4169/8750 [2:56:01<7:20:22, 5.77s/it] {'loss': 0.4489, 'learning_rate': 1.1246060520799244e-05, 'epoch': 0.48} + 48%|████▊ | 4169/8750 [2:55:58<7:20:22, 5.77s/it] 48%|████▊ | 4170/8750 [2:56:04<7:20:08, 5.77s/it] 48%|████▊ | 4170/8750 [2:56:07<7:20:08, 5.77s/it] {'loss': 0.4794, 'learning_rate': 1.1242387632395019e-05, 'epoch': 0.48} + 48%|████▊ | 4170/8750 [2:56:07<7:20:08, 5.77s/it] {'loss': 0.4794, 'learning_rate': 1.1242387632395019e-05, 'epoch': 0.48} + 48%|████▊ | 4170/8750 [2:56:04<7:20:08, 5.77s/it] 48%|████▊ | 4171/8750 [2:56:10<7:18:04, 5.74s/it] 48%|████▊ | 4171/8750 [2:56:13<7:18:04, 5.74s/it] {'loss': 0.4467, 'learning_rate': 1.1238714573755954e-05, 'epoch': 0.48} + 48%|████▊ | 4171/8750 [2:56:13<7:18:04, 5.74s/it] {'loss': 0.4467, 'learning_rate': 1.1238714573755954e-05, 'epoch': 0.48} + 48%|████▊ | 4171/8750 [2:56:10<7:18:04, 5.74s/it] 48%|████▊ | 4172/8750 [2:56:15<7:15:26, 5.71s/it] 48%|████▊ | 4172/8750 [2:56:18<7:15:26, 5.71s/it] {'loss': 0.4618, 'learning_rate': 1.1235041345385328e-05, 'epoch': 0.48} + 48%|████▊ | 4172/8750 [2:56:18<7:15:26, 5.71s/it] {'loss': 0.4618, 
'learning_rate': 1.1235041345385328e-05, 'epoch': 0.48} + 48%|████▊ | 4172/8750 [2:56:15<7:15:26, 5.71s/it] 48%|████▊ | 4173/8750 [2:56:21<7:17:38, 5.74s/it] 48%|████▊ | 4173/8750 [2:56:24<7:17:38, 5.74s/it] {'loss': 0.4623, 'learning_rate': 1.123136794778647e-05, 'epoch': 0.48} + 48%|████▊ | 4173/8750 [2:56:24<7:17:38, 5.74s/it] {'loss': 0.4623, 'learning_rate': 1.123136794778647e-05, 'epoch': 0.48} + 48%|████▊ | 4173/8750 [2:56:21<7:17:38, 5.74s/it] 48%|████▊ | 4174/8750 [2:56:27<7:14:47, 5.70s/it] 48%|████▊ | 4174/8750 [2:56:30<7:14:47, 5.70s/it] {'loss': 0.4752, 'learning_rate': 1.122769438146271e-05, 'epoch': 0.48} + 48%|████▊ | 4174/8750 [2:56:30<7:14:47, 5.70s/it] {'loss': 0.4752, 'learning_rate': 1.122769438146271e-05, 'epoch': 0.48} + 48%|████▊ | 4174/8750 [2:56:27<7:14:47, 5.70s/it] 48%|████▊ | 4175/8750 [2:56:36<7:18:41, 5.75s/it] 48%|████▊ | 4175/8750 [2:56:33<7:18:41, 5.75s/it] {'loss': 0.4514, 'learning_rate': 1.1224020646917413e-05, 'epoch': 0.48} + 48%|████▊ | 4175/8750 [2:56:36<7:18:41, 5.75s/it] {'loss': 0.4514, 'learning_rate': 1.1224020646917413e-05, 'epoch': 0.48} + 48%|████▊ | 4175/8750 [2:56:33<7:18:41, 5.75s/it] 48%|████▊ | 4176/8750 [2:56:38<7:17:54, 5.74s/it] 48%|████▊ | 4176/8750 [2:56:41<7:17:55, 5.74s/it] {'loss': 0.4628, 'learning_rate': 1.1220346744653956e-05, 'epoch': 0.48} + 48%|████▊ | 4176/8750 [2:56:41<7:17:55, 5.74s/it] {'loss': 0.4628, 'learning_rate': 1.1220346744653956e-05, 'epoch': 0.48} + 48%|████▊ | 4176/8750 [2:56:38<7:17:54, 5.74s/it] 48%|████▊ | 4177/8750 [2:56:44<7:20:08, 5.77s/it] 48%|████▊ | 4177/8750 [2:56:47<7:20:08, 5.77s/it] {'loss': 0.4627, 'learning_rate': 1.1216672675175748e-05, 'epoch': 0.48} + 48%|████▊ | 4177/8750 [2:56:47<7:20:08, 5.77s/it] {'loss': 0.4627, 'learning_rate': 1.1216672675175748e-05, 'epoch': 0.48} + 48%|████▊ | 4177/8750 [2:56:44<7:20:08, 5.77s/it] 48%|████▊ | 4178/8750 [2:56:50<7:23:38, 5.82s/it] 48%|████▊ | 4178/8750 [2:56:53<7:23:38, 5.82s/it] {'loss': 0.4575, 'learning_rate': 
1.1212998438986223e-05, 'epoch': 0.48} + 48%|████▊ | 4178/8750 [2:56:53<7:23:38, 5.82s/it] {'loss': 0.4575, 'learning_rate': 1.1212998438986223e-05, 'epoch': 0.48} + 48%|████▊ | 4178/8750 [2:56:50<7:23:38, 5.82s/it] 48%|████▊ | 4179/8750 [2:56:56<7:23:26, 5.82s/it] 48%|████▊ | 4179/8750 [2:56:59<7:23:26, 5.82s/it] {'loss': 0.4789, 'learning_rate': 1.1209324036588828e-05, 'epoch': 0.48} + 48%|████▊ | 4179/8750 [2:56:59<7:23:26, 5.82s/it] {'loss': 0.4789, 'learning_rate': 1.1209324036588828e-05, 'epoch': 0.48} + 48%|████▊ | 4179/8750 [2:56:56<7:23:26, 5.82s/it] 48%|████▊ | 4180/8750 [2:57:02<7:22:45, 5.81s/it] 48%|████▊ | 4180/8750 [2:57:05<7:22:44, 5.81s/it] {'loss': 0.4914, 'learning_rate': 1.1205649468487042e-05, 'epoch': 0.48} + {'loss': 0.4914, 'learning_rate': 1.1205649468487042e-05, 'epoch': 0.48} + 48%|████▊ | 4180/8750 [2:57:05<7:22:44, 5.81s/it] 48%|████▊ | 4180/8750 [2:57:02<7:22:45, 5.81s/it] 48%|████▊ | 4181/8750 [2:57:08<7:21:00, 5.79s/it] 48%|████▊ | 4181/8750 [2:57:10<7:21:00, 5.79s/it] {'loss': 0.4602, 'learning_rate': 1.1201974735184362e-05, 'epoch': 0.48} + 48%|████▊ | 4181/8750 [2:57:11<7:21:00, 5.79s/it] {'loss': 0.4602, 'learning_rate': 1.1201974735184362e-05, 'epoch': 0.48} + 48%|████▊ | 4181/8750 [2:57:08<7:21:00, 5.79s/it] 48%|████▊ | 4182/8750 [2:57:16<7:19:29, 5.77s/it] 48%|████▊ | 4182/8750 [2:57:13<7:19:30, 5.77s/it] {'loss': 0.4697, 'learning_rate': 1.1198299837184305e-05, 'epoch': 0.48} + 48%|████▊ | 4182/8750 [2:57:16<7:19:29, 5.77s/it] {'loss': 0.4697, 'learning_rate': 1.1198299837184305e-05, 'epoch': 0.48} + 48%|████▊ | 4182/8750 [2:57:13<7:19:30, 5.77s/it] 48%|████▊ | 4183/8750 [2:57:22<7:17:38, 5.75s/it] 48%|████▊ | 4183/8750 [2:57:19<7:17:39, 5.75s/it] {'loss': 0.4737, 'learning_rate': 1.1194624774990418e-05, 'epoch': 0.48} + 48%|████▊ | 4183/8750 [2:57:22<7:17:38, 5.75s/it] {'loss': 0.4737, 'learning_rate': 1.1194624774990418e-05, 'epoch': 0.48} + 48%|████▊ | 4183/8750 [2:57:19<7:17:39, 5.75s/it] 48%|████▊ | 4184/8750 
[2:57:28<7:15:40, 5.73s/it] 48%|████▊ | 4184/8750 [2:57:25<7:15:40, 5.73s/it] {'loss': 0.4548, 'learning_rate': 1.119094954910627e-05, 'epoch': 0.48} + 48%|████▊ | 4184/8750 [2:57:28<7:15:40, 5.73s/it] {'loss': 0.4548, 'learning_rate': 1.119094954910627e-05, 'epoch': 0.48} + 48%|████▊ | 4184/8750 [2:57:25<7:15:40, 5.73s/it] 48%|████▊ | 4185/8750 [2:57:31<7:19:30, 5.78s/it] 48%|████▊ | 4185/8750 [2:57:33<7:19:30, 5.78s/it] {'loss': 0.4352, 'learning_rate': 1.118727416003544e-05, 'epoch': 0.48} + {'loss': 0.4352, 'learning_rate': 1.118727416003544e-05, 'epoch': 0.48} 48%|████▊ | 4185/8750 [2:57:33<7:19:30, 5.78s/it] + 48%|████▊ | 4185/8750 [2:57:31<7:19:30, 5.78s/it] 48%|████▊ | 4186/8750 [2:57:39<7:21:13, 5.80s/it] 48%|████▊ | 4186/8750 [2:57:36<7:21:13, 5.80s/it] {'loss': 0.4824, 'learning_rate': 1.1183598608281543e-05, 'epoch': 0.48} + 48%|████▊ | 4186/8750 [2:57:39<7:21:13, 5.80s/it] {'loss': 0.4824, 'learning_rate': 1.1183598608281543e-05, 'epoch': 0.48} + 48%|████▊ | 4186/8750 [2:57:36<7:21:13, 5.80s/it] 48%|████▊ | 4187/8750 [2:57:45<7:19:59, 5.79s/it] 48%|████▊ | 4187/8750 [2:57:42<7:19:59, 5.79s/it] {'loss': 0.4557, 'learning_rate': 1.1179922894348207e-05, 'epoch': 0.48} + 48%|████▊ | 4187/8750 [2:57:45<7:19:59, 5.79s/it] {'loss': 0.4557, 'learning_rate': 1.1179922894348207e-05, 'epoch': 0.48} + 48%|████▊ | 4187/8750 [2:57:42<7:19:59, 5.79s/it] 48%|████▊ | 4188/8750 [2:57:48<7:15:13, 5.72s/it] 48%|████▊ | 4188/8750 [2:57:51<7:15:14, 5.72s/it] {'loss': 0.4896, 'learning_rate': 1.11762470187391e-05, 'epoch': 0.48} + 48%|████▊ | 4188/8750 [2:57:51<7:15:14, 5.72s/it] {'loss': 0.4896, 'learning_rate': 1.11762470187391e-05, 'epoch': 0.48} + 48%|████▊ | 4188/8750 [2:57:48<7:15:13, 5.72s/it] 48%|████▊ | 4189/8750 [2:57:57<7:18:29, 5.77s/it] 48%|████▊ | 4189/8750 [2:57:54<7:18:31, 5.77s/it] {'loss': 0.4562, 'learning_rate': 1.1172570981957886e-05, 'epoch': 0.48} + 48%|████▊ | 4189/8750 [2:57:57<7:18:29, 5.77s/it] {'loss': 0.4562, 'learning_rate': 
1.1172570981957886e-05, 'epoch': 0.48} + 48%|████▊ | 4189/8750 [2:57:54<7:18:31, 5.77s/it] 48%|████▊ | 4190/8750 [2:57:59<7:17:13, 5.75s/it] 48%|████▊ | 4190/8750 [2:58:02<7:17:14, 5.75s/it] {'loss': 0.4579, 'learning_rate': 1.1168894784508268e-05, 'epoch': 0.48} + 48%|████▊ | 4190/8750 [2:58:02<7:17:14, 5.75s/it] {'loss': 0.4579, 'learning_rate': 1.1168894784508268e-05, 'epoch': 0.48} + 48%|████▊ | 4190/8750 [2:57:59<7:17:13, 5.75s/it] 48%|████▊ | 4191/8750 [2:58:05<7:14:10, 5.71s/it] 48%|████▊ | 4191/8750 [2:58:08<7:14:11, 5.71s/it] {'loss': 0.4793, 'learning_rate': 1.1165218426893969e-05, 'epoch': 0.48} + 48%|████▊ | 4191/8750 [2:58:08<7:14:11, 5.71s/it] {'loss': 0.4793, 'learning_rate': 1.1165218426893969e-05, 'epoch': 0.48} + 48%|████▊ | 4191/8750 [2:58:05<7:14:10, 5.71s/it] 48%|████▊ | 4192/8750 [2:58:11<7:17:07, 5.75s/it] 48%|████▊ | 4192/8750 [2:58:14<7:17:08, 5.75s/it] {'loss': 0.4601, 'learning_rate': 1.1161541909618728e-05, 'epoch': 0.48} + 48%|████▊ | 4192/8750 [2:58:14<7:17:08, 5.75s/it] {'loss': 0.4601, 'learning_rate': 1.1161541909618728e-05, 'epoch': 0.48} + 48%|████▊ | 4192/8750 [2:58:11<7:17:07, 5.75s/it] 48%|████▊ | 4193/8750 [2:58:17<7:20:07, 5.79s/it] 48%|████▊ | 4193/8750 [2:58:20<7:20:07, 5.79s/it] {'loss': 0.4474, 'learning_rate': 1.1157865233186315e-05, 'epoch': 0.48} + 48%|████▊ | 4193/8750 [2:58:20<7:20:07, 5.79s/it] {'loss': 0.4474, 'learning_rate': 1.1157865233186315e-05, 'epoch': 0.48} + 48%|████▊ | 4193/8750 [2:58:17<7:20:07, 5.79s/it] 48%|████▊ | 4194/8750 [2:58:23<7:20:02, 5.79s/it] 48%|████▊ | 4194/8750 [2:58:25<7:20:02, 5.80s/it] {'loss': 0.4726, 'learning_rate': 1.1154188398100516e-05, 'epoch': 0.48} + 48%|████▊ | 4194/8750 [2:58:25<7:20:02, 5.80s/it] {'loss': 0.4726, 'learning_rate': 1.1154188398100516e-05, 'epoch': 0.48} + 48%|████▊ | 4194/8750 [2:58:23<7:20:02, 5.79s/it] 48%|████▊ | 4195/8750 [2:58:28<7:17:07, 5.76s/it] 48%|████▊ | 4195/8750 [2:58:31<7:17:07, 5.76s/it] {'loss': 0.4498, 'learning_rate': 1.1150511404865136e-05, 
'epoch': 0.48} + 48%|████▊ | 4195/8750 [2:58:31<7:17:07, 5.76s/it] {'loss': 0.4498, 'learning_rate': 1.1150511404865136e-05, 'epoch': 0.48} + 48%|████▊ | 4195/8750 [2:58:28<7:17:07, 5.76s/it] 48%|████▊ | 4196/8750 [2:58:34<7:19:19, 5.79s/it] 48%|████▊ | 4196/8750 [2:58:37<7:19:19, 5.79s/it] {'loss': 0.473, 'learning_rate': 1.1146834253984008e-05, 'epoch': 0.48} + 48%|████▊ | 4196/8750 [2:58:37<7:19:19, 5.79s/it] {'loss': 0.473, 'learning_rate': 1.1146834253984008e-05, 'epoch': 0.48} + 48%|████▊ | 4196/8750 [2:58:34<7:19:19, 5.79s/it] 48%|████▊ | 4197/8750 [2:58:40<7:20:00, 5.80s/it] 48%|████▊ | 4197/8750 [2:58:43<7:20:00, 5.80s/it] {'loss': 0.4606, 'learning_rate': 1.114315694596098e-05, 'epoch': 0.48} + 48%|████▊ | 4197/8750 [2:58:43<7:20:00, 5.80s/it] {'loss': 0.4606, 'learning_rate': 1.114315694596098e-05, 'epoch': 0.48} + 48%|████▊ | 4197/8750 [2:58:40<7:20:00, 5.80s/it] 48%|████▊ | 4198/8750 [2:58:46<7:22:17, 5.83s/it] 48%|████▊ | 4198/8750 [2:58:49<7:22:17, 5.83s/it] {'loss': 0.4608, 'learning_rate': 1.1139479481299928e-05, 'epoch': 0.48} + 48%|████▊ | 4198/8750 [2:58:49<7:22:17, 5.83s/it] {'loss': 0.4608, 'learning_rate': 1.1139479481299928e-05, 'epoch': 0.48} + 48%|████▊ | 4198/8750 [2:58:46<7:22:17, 5.83s/it] 48%|████▊ | 4199/8750 [2:58:54<7:18:20, 5.78s/it] 48%|████▊ | 4199/8750 [2:58:51<7:18:20, 5.78s/it] {'loss': 0.4625, 'learning_rate': 1.113580186050475e-05, 'epoch': 0.48} + 48%|████▊ | 4199/8750 [2:58:54<7:18:20, 5.78s/it] {'loss': 0.4625, 'learning_rate': 1.113580186050475e-05, 'epoch': 0.48} + 48%|████▊ | 4199/8750 [2:58:51<7:18:20, 5.78s/it]15 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +08 AutoResumeHook: Checking whether to suspend... 
+ 10 AutoResumeHook: Checking whether to suspend... + 48%|████▊ | 4200/8750 [2:59:00<7:21:10, 5.82s/it]11 AutoResumeHook: Checking whether to suspend... +139 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +3 + AutoResumeHook: Checking whether to suspend... + 48%|████▊ | 4200/8750 [2:58:57<7:21:11, 5.82s/it] {'loss': 0.4688, 'learning_rate': 1.1132124084079359e-05, 'epoch': 0.48} + 48%|████▊ | 4200/8750 [2:59:00<7:21:10, 5.82s/it] {'loss': 0.4688, 'learning_rate': 1.1132124084079359e-05, 'epoch': 0.48} + 48%|████▊ | 4200/8750 [2:58:57<7:21:11, 5.82s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 48%|████▊ | 4201/8750 [2:59:19<12:19:21, 9.75s/it] 48%|████▊ | 4201/8750 [2:59:16<12:19:21, 9.75s/it] {'loss': 0.4852, 'learning_rate': 1.112844615252769e-05, 'epoch': 0.48} + 48%|████▊ | 4201/8750 [2:59:19<12:19:21, 9.75s/it] {'loss': 0.4852, 'learning_rate': 1.112844615252769e-05, 'epoch': 0.48} + 48%|████▊ | 4201/8750 [2:59:16<12:19:21, 9.75s/it] 48%|████▊ | 4202/8750 [2:59:22<10:54:52, 8.64s/it] 48%|████▊ | 4202/8750 [2:59:25<10:54:52, 8.64s/it] {'loss': 0.4653, 'learning_rate': 1.1124768066353705e-05, 'epoch': 0.48} + 48%|████▊ | 4202/8750 [2:59:25<10:54:52, 8.64s/it] {'loss': 0.4653, 'learning_rate': 1.1124768066353705e-05, 'epoch': 0.48} + 48%|████▊ | 4202/8750 [2:59:22<10:54:52, 8.64s/it] 48%|████▊ | 4203/8750 [2:59:28<9:47:20, 7.75s/it] 48%|████▊ | 4203/8750 [2:59:31<9:47:20, 7.75s/it] {'loss': 0.4611, 'learning_rate': 1.1121089826061385e-05, 'epoch': 0.48} + 48%|████▊ | 4203/8750 [2:59:31<9:47:20, 7.75s/it] {'loss': 0.4611, 'learning_rate': 1.1121089826061385e-05, 'epoch': 0.48} + 48%|████▊ | 4203/8750 [2:59:28<9:47:20, 7.75s/it] 48%|████▊ | 4204/8750 [2:59:37<8:59:36, 7.12s/it] 48%|████▊ | 4204/8750 [2:59:34<8:59:36, 7.12s/it] {'loss': 0.47, 'learning_rate': 1.1117411432154725e-05, 'epoch': 0.48} + 48%|████▊ | 4204/8750 [2:59:37<8:59:36, 7.12s/it] {'loss': 0.47, 'learning_rate': 1.1117411432154725e-05, 'epoch': 0.48} + 48%|████▊ | 4204/8750 [2:59:34<8:59:36, 7.12s/it] 48%|████▊ | 4205/8750 [2:59:42<8:25:42, 6.68s/it] 48%|████▊ | 4205/8750 [2:59:39<8:25:43, 6.68s/it]{'loss': 0.4632, 'learning_rate': 1.1113732885137755e-05, 'epoch': 0.48} + {'loss': 0.4632, 'learning_rate': 1.1113732885137755e-05, 'epoch': 0.48} + 48%|████▊ | 4205/8750 [2:59:42<8:25:42, 6.68s/it] 48%|████▊ | 4205/8750 [2:59:39<8:25:43, 6.68s/it] 48%|████▊ | 4206/8750 [2:59:48<8:02:57, 6.38s/it] 48%|████▊ | 4206/8750 [2:59:45<8:02:57, 6.38s/it] {'loss': 0.4561, 'learning_rate': 1.1110054185514513e-05, 'epoch': 0.48} + 48%|████▊ | 4206/8750 [2:59:48<8:02:57, 6.38s/it] 
{'loss': 0.4561, 'learning_rate': 1.1110054185514513e-05, 'epoch': 0.48} + 48%|████▊ | 4206/8750 [2:59:45<8:02:57, 6.38s/it] 48%|████▊ | 4207/8750 [2:59:54<7:47:54, 6.18s/it] 48%|████▊ | 4207/8750 [2:59:51<7:47:55, 6.18s/it] {'loss': 0.4782, 'learning_rate': 1.1106375333789065e-05, 'epoch': 0.48} + 48%|████▊ | 4207/8750 [2:59:54<7:47:54, 6.18s/it] {'loss': 0.4782, 'learning_rate': 1.1106375333789065e-05, 'epoch': 0.48} + 48%|████▊ | 4207/8750 [2:59:51<7:47:55, 6.18s/it] 48%|████▊ | 4208/8750 [2:59:59<7:39:45, 6.07s/it] 48%|████▊ | 4208/8750 [2:59:56<7:39:45, 6.07s/it] {'loss': 0.4728, 'learning_rate': 1.1102696330465495e-05, 'epoch': 0.48} + 48%|████▊ | 4208/8750 [2:59:59<7:39:45, 6.07s/it] {'loss': 0.4728, 'learning_rate': 1.1102696330465495e-05, 'epoch': 0.48} + 48%|████▊ | 4208/8750 [2:59:56<7:39:45, 6.07s/it] 48%|████▊ | 4209/8750 [3:00:05<7:29:25, 5.94s/it] 48%|████▊ | 4209/8750 [3:00:02<7:29:25, 5.94s/it] {'loss': 0.4594, 'learning_rate': 1.1099017176047909e-05, 'epoch': 0.48} + 48%|████▊ | 4209/8750 [3:00:05<7:29:25, 5.94s/it] {'loss': 0.4594, 'learning_rate': 1.1099017176047909e-05, 'epoch': 0.48} + 48%|████▊ | 4209/8750 [3:00:02<7:29:25, 5.94s/it] 48%|████▊ | 4210/8750 [3:00:08<7:39:15, 6.07s/it] 48%|████▊ | 4210/8750 [3:00:11<7:39:15, 6.07s/it] {'loss': 0.4676, 'learning_rate': 1.109533787104043e-05, 'epoch': 0.48} + 48%|████▊ | 4210/8750 [3:00:11<7:39:15, 6.07s/it] {'loss': 0.4676, 'learning_rate': 1.109533787104043e-05, 'epoch': 0.48} + 48%|████▊ | 4210/8750 [3:00:08<7:39:15, 6.07s/it] 48%|████▊ | 4211/8750 [3:00:14<7:33:20, 5.99s/it] 48%|████▊ | 4211/8750 [3:00:17<7:33:21, 5.99s/it]{'loss': 0.46, 'learning_rate': 1.109165841594721e-05, 'epoch': 0.48} + {'loss': 0.46, 'learning_rate': 1.109165841594721e-05, 'epoch': 0.48} + 48%|████▊ | 4211/8750 [3:00:17<7:33:21, 5.99s/it] 48%|████▊ | 4211/8750 [3:00:14<7:33:20, 5.99s/it] 48%|████▊ | 4212/8750 [3:00:20<7:26:09, 5.90s/it] 48%|████▊ | 4212/8750 [3:00:23<7:26:09, 5.90s/it] {'loss': 0.4804, 'learning_rate': 
1.1087978811272417e-05, 'epoch': 0.48} + 48%|████▊ | 4212/8750 [3:00:23<7:26:09, 5.90s/it] {'loss': 0.4804, 'learning_rate': 1.1087978811272417e-05, 'epoch': 0.48} + 48%|████▊ | 4212/8750 [3:00:20<7:26:09, 5.90s/it] 48%|████▊ | 4213/8750 [3:00:29<7:23:03, 5.86s/it] 48%|████▊ | 4213/8750 [3:00:26<7:23:03, 5.86s/it] {'loss': 0.4493, 'learning_rate': 1.1084299057520234e-05, 'epoch': 0.48} + 48%|████▊ | 4213/8750 [3:00:29<7:23:03, 5.86s/it] {'loss': 0.4493, 'learning_rate': 1.1084299057520234e-05, 'epoch': 0.48} + 48%|████▊ | 4213/8750 [3:00:26<7:23:03, 5.86s/it] 48%|████▊ | 4214/8750 [3:00:35<7:23:15, 5.86s/it] 48%|████▊ | 4214/8750 [3:00:32<7:23:16, 5.86s/it] {'loss': 0.4893, 'learning_rate': 1.1080619155194873e-05, 'epoch': 0.48} + 48%|████▊ | 4214/8750 [3:00:35<7:23:15, 5.86s/it] {'loss': 0.4893, 'learning_rate': 1.1080619155194873e-05, 'epoch': 0.48} + 48%|████▊ | 4214/8750 [3:00:32<7:23:16, 5.86s/it] 48%|████▊ | 4215/8750 [3:00:37<7:20:50, 5.83s/it] 48%|████▊ | 4215/8750 [3:00:40<7:20:50, 5.83s/it] {'loss': 0.452, 'learning_rate': 1.107693910480056e-05, 'epoch': 0.48} + {'loss': 0.452, 'learning_rate': 1.107693910480056e-05, 'epoch': 0.48} 48%|████▊ | 4215/8750 [3:00:40<7:20:50, 5.83s/it] + 48%|████▊ | 4215/8750 [3:00:37<7:20:50, 5.83s/it] 48%|████▊ | 4216/8750 [3:00:46<7:20:00, 5.82s/it] 48%|████▊ | 4216/8750 [3:00:43<7:20:00, 5.82s/it] {'loss': 0.4683, 'learning_rate': 1.1073258906841547e-05, 'epoch': 0.48} + 48%|████▊ | 4216/8750 [3:00:46<7:20:00, 5.82s/it] {'loss': 0.4683, 'learning_rate': 1.1073258906841547e-05, 'epoch': 0.48} + 48%|████▊ | 4216/8750 [3:00:43<7:20:00, 5.82s/it] 48%|████▊ | 4217/8750 [3:00:52<7:19:47, 5.82s/it] 48%|████▊ | 4217/8750 [3:00:49<7:19:47, 5.82s/it] {'loss': 0.4677, 'learning_rate': 1.10695785618221e-05, 'epoch': 0.48} + 48%|████▊ | 4217/8750 [3:00:52<7:19:47, 5.82s/it] {'loss': 0.4677, 'learning_rate': 1.10695785618221e-05, 'epoch': 0.48} + 48%|████▊ | 4217/8750 [3:00:49<7:19:47, 5.82s/it] 48%|████▊ | 4218/8750 [3:00:58<7:22:53, 
5.86s/it] 48%|████▊ | 4218/8750 [3:00:55<7:22:53, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.1065898070246512e-05, 'epoch': 0.48} + 48%|████▊ | 4218/8750 [3:00:58<7:22:53, 5.86s/it] {'loss': 0.4687, 'learning_rate': 1.1065898070246512e-05, 'epoch': 0.48} + 48%|████▊ | 4218/8750 [3:00:55<7:22:53, 5.86s/it] 48%|████▊ | 4219/8750 [3:01:04<7:22:46, 5.86s/it] 48%|████▊ | 4219/8750 [3:01:01<7:22:46, 5.86s/it] {'loss': 0.4808, 'learning_rate': 1.1062217432619095e-05, 'epoch': 0.48} + 48%|████▊ | 4219/8750 [3:01:04<7:22:46, 5.86s/it] {'loss': 0.4808, 'learning_rate': 1.1062217432619095e-05, 'epoch': 0.48} + 48%|████▊ | 4219/8750 [3:01:01<7:22:46, 5.86s/it] 48%|████▊ | 4220/8750 [3:01:10<7:26:49, 5.92s/it] 48%|████▊ | 4220/8750 [3:01:07<7:26:49, 5.92s/it] {'loss': 0.4601, 'learning_rate': 1.1058536649444167e-05, 'epoch': 0.48} + 48%|████▊ | 4220/8750 [3:01:10<7:26:49, 5.92s/it] {'loss': 0.4601, 'learning_rate': 1.1058536649444167e-05, 'epoch': 0.48} + 48%|████▊ | 4220/8750 [3:01:07<7:26:49, 5.92s/it] 48%|████▊ | 4221/8750 [3:01:13<7:24:23, 5.89s/it] 48%|████▊ | 4221/8750 [3:01:16<7:24:24, 5.89s/it] {'loss': 0.4564, 'learning_rate': 1.105485572122609e-05, 'epoch': 0.48} + 48%|████▊ | 4221/8750 [3:01:16<7:24:24, 5.89s/it] {'loss': 0.4564, 'learning_rate': 1.105485572122609e-05, 'epoch': 0.48} + 48%|████▊ | 4221/8750 [3:01:13<7:24:23, 5.89s/it] 48%|████▊ | 4222/8750 [3:01:21<7:18:36, 5.81s/it] 48%|████▊ | 4222/8750 [3:01:18<7:18:36, 5.81s/it] {'loss': 0.4677, 'learning_rate': 1.1051174648469225e-05, 'epoch': 0.48} + 48%|████▊ | 4222/8750 [3:01:21<7:18:36, 5.81s/it] {'loss': 0.4677, 'learning_rate': 1.1051174648469225e-05, 'epoch': 0.48} + 48%|████▊ | 4222/8750 [3:01:18<7:18:36, 5.81s/it] 48%|████▊ | 4223/8750 [3:01:27<7:16:11, 5.78s/it] 48%|████▊ | 4223/8750 [3:01:24<7:16:11, 5.78s/it] {'loss': 0.4712, 'learning_rate': 1.104749343167797e-05, 'epoch': 0.48} + 48%|████▊ | 4223/8750 [3:01:27<7:16:11, 5.78s/it] {'loss': 0.4712, 'learning_rate': 1.104749343167797e-05, 'epoch': 
0.48} + 48%|████▊ | 4223/8750 [3:01:24<7:16:11, 5.78s/it] 48%|████▊ | 4224/8750 [3:01:30<7:16:06, 5.78s/it] 48%|████▊ | 4224/8750 [3:01:33<7:16:07, 5.78s/it] {'loss': 0.475, 'learning_rate': 1.104381207135672e-05, 'epoch': 0.48} + 48%|████▊ | 4224/8750 [3:01:33<7:16:07, 5.78s/it] {'loss': 0.475, 'learning_rate': 1.104381207135672e-05, 'epoch': 0.48} + 48%|████▊ | 4224/8750 [3:01:30<7:16:06, 5.78s/it] 48%|████▊ | 4225/8750 [3:01:38<7:13:49, 5.75s/it] 48%|████▊ | 4225/8750 [3:01:36<7:13:49, 5.75s/it] {'loss': 0.4599, 'learning_rate': 1.104013056800992e-05, 'epoch': 0.48} + 48%|████▊ | 4225/8750 [3:01:38<7:13:49, 5.75s/it] {'loss': 0.4599, 'learning_rate': 1.104013056800992e-05, 'epoch': 0.48} + 48%|████▊ | 4225/8750 [3:01:36<7:13:49, 5.75s/it] 48%|████▊ | 4226/8750 [3:01:44<7:14:09, 5.76s/it] 48%|████▊ | 4226/8750 [3:01:41<7:14:09, 5.76s/it] {'loss': 0.4706, 'learning_rate': 1.1036448922142004e-05, 'epoch': 0.48} + 48%|████▊ | 4226/8750 [3:01:44<7:14:09, 5.76s/it] {'loss': 0.4706, 'learning_rate': 1.1036448922142004e-05, 'epoch': 0.48} + 48%|████▊ | 4226/8750 [3:01:41<7:14:09, 5.76s/it] 48%|████▊ | 4227/8750 [3:01:50<7:10:05, 5.71s/it] 48%|████▊ | 4227/8750 [3:01:47<7:10:05, 5.71s/it] {'loss': 0.4714, 'learning_rate': 1.1032767134257451e-05, 'epoch': 0.48} + 48%|████▊ | 4227/8750 [3:01:50<7:10:05, 5.71s/it] {'loss': 0.4714, 'learning_rate': 1.1032767134257451e-05, 'epoch': 0.48} + 48%|████▊ | 4227/8750 [3:01:47<7:10:05, 5.71s/it] 48%|████▊ | 4228/8750 [3:01:55<7:09:54, 5.70s/it] 48%|████▊ | 4228/8750 [3:01:53<7:09:54, 5.70s/it] {'loss': 0.4596, 'learning_rate': 1.102908520486074e-05, 'epoch': 0.48} + 48%|████▊ | 4228/8750 [3:01:55<7:09:54, 5.70s/it] {'loss': 0.4596, 'learning_rate': 1.102908520486074e-05, 'epoch': 0.48} + 48%|████▊ | 4228/8750 [3:01:53<7:09:54, 5.70s/it] 48%|████▊ | 4229/8750 [3:02:01<7:11:21, 5.72s/it] 48%|████▊ | 4229/8750 [3:01:58<7:11:21, 5.72s/it] {'loss': 0.4529, 'learning_rate': 1.1025403134456378e-05, 'epoch': 0.48} + 48%|████▊ | 4229/8750 
[3:02:01<7:11:21, 5.72s/it] {'loss': 0.4529, 'learning_rate': 1.1025403134456378e-05, 'epoch': 0.48} + 48%|████▊ | 4229/8750 [3:01:58<7:11:21, 5.72s/it] 48%|████▊ | 4230/8750 [3:02:07<7:10:08, 5.71s/it] 48%|████▊ | 4230/8750 [3:02:04<7:10:08, 5.71s/it] {'loss': 0.4729, 'learning_rate': 1.1021720923548897e-05, 'epoch': 0.48} + 48%|████▊ | 4230/8750 [3:02:07<7:10:08, 5.71s/it] {'loss': 0.4729, 'learning_rate': 1.1021720923548897e-05, 'epoch': 0.48} + 48%|████▊ | 4230/8750 [3:02:04<7:10:08, 5.71s/it] 48%|████▊ | 4231/8750 [3:02:13<7:11:43, 5.73s/it] 48%|████▊ | 4231/8750 [3:02:10<7:11:43, 5.73s/it] {'loss': 0.4506, 'learning_rate': 1.1018038572642837e-05, 'epoch': 0.48} + 48%|████▊ | 4231/8750 [3:02:13<7:11:43, 5.73s/it] {'loss': 0.4506, 'learning_rate': 1.1018038572642837e-05, 'epoch': 0.48} + 48%|████▊ | 4231/8750 [3:02:10<7:11:43, 5.73s/it] 48%|████▊ | 4232/8750 [3:02:18<7:12:18, 5.74s/it] 48%|████▊ | 4232/8750 [3:02:16<7:12:18, 5.74s/it] {'loss': 0.4713, 'learning_rate': 1.1014356082242766e-05, 'epoch': 0.48} + 48%|████▊ | 4232/8750 [3:02:18<7:12:18, 5.74s/it] {'loss': 0.4713, 'learning_rate': 1.1014356082242766e-05, 'epoch': 0.48} + 48%|████▊ | 4232/8750 [3:02:16<7:12:18, 5.74s/it] 48%|████▊ | 4233/8750 [3:02:24<7:17:02, 5.81s/it] 48%|████▊ | 4233/8750 [3:02:22<7:17:02, 5.81s/it] {'loss': 0.4613, 'learning_rate': 1.1010673452853262e-05, 'epoch': 0.48} + 48%|████▊ | 4233/8750 [3:02:24<7:17:02, 5.81s/it] {'loss': 0.4613, 'learning_rate': 1.1010673452853262e-05, 'epoch': 0.48} + 48%|████▊ | 4233/8750 [3:02:22<7:17:02, 5.81s/it] 48%|████▊ | 4234/8750 [3:02:30<7:18:41, 5.83s/it] 48%|████▊ | 4234/8750 [3:02:27<7:18:41, 5.83s/it] {'loss': 0.4614, 'learning_rate': 1.1006990684978928e-05, 'epoch': 0.48} + 48%|████▊ | 4234/8750 [3:02:30<7:18:41, 5.83s/it] {'loss': 0.4614, 'learning_rate': 1.1006990684978928e-05, 'epoch': 0.48} + 48%|████▊ | 4234/8750 [3:02:27<7:18:41, 5.83s/it] 48%|████▊ | 4235/8750 [3:02:36<7:21:55, 5.87s/it] 48%|████▊ | 4235/8750 [3:02:33<7:21:55, 
5.87s/it] {'loss': 0.4434, 'learning_rate': 1.1003307779124392e-05, 'epoch': 0.48} + 48%|████▊ | 4235/8750 [3:02:36<7:21:55, 5.87s/it] {'loss': 0.4434, 'learning_rate': 1.1003307779124392e-05, 'epoch': 0.48} + 48%|████▊ | 4235/8750 [3:02:33<7:21:55, 5.87s/it] 48%|████▊ | 4236/8750 [3:02:39<7:19:33, 5.84s/it] 48%|████▊ | 4236/8750 [3:02:42<7:19:34, 5.84s/it] {'loss': 0.4647, 'learning_rate': 1.0999624735794292e-05, 'epoch': 0.48} + 48%|████▊ | 4236/8750 [3:02:42<7:19:34, 5.84s/it] {'loss': 0.4647, 'learning_rate': 1.0999624735794292e-05, 'epoch': 0.48} + 48%|████▊ | 4236/8750 [3:02:39<7:19:33, 5.84s/it] 48%|████▊ | 4237/8750 [3:02:45<7:17:09, 5.81s/it] 48%|████▊ | 4237/8750 [3:02:48<7:17:09, 5.81s/it] {'loss': 0.4507, 'learning_rate': 1.0995941555493283e-05, 'epoch': 0.48} + 48%|████▊ | 4237/8750 [3:02:48<7:17:09, 5.81s/it] {'loss': 0.4507, 'learning_rate': 1.0995941555493283e-05, 'epoch': 0.48} + 48%|████▊ | 4237/8750 [3:02:45<7:17:09, 5.81s/it] 48%|████▊ | 4238/8750 [3:02:53<7:10:46, 5.73s/it] 48%|████▊ | 4238/8750 [3:02:50<7:10:46, 5.73s/it] {'loss': 0.4711, 'learning_rate': 1.0992258238726046e-05, 'epoch': 0.48} + 48%|████▊ | 4238/8750 [3:02:53<7:10:46, 5.73s/it] {'loss': 0.4711, 'learning_rate': 1.0992258238726046e-05, 'epoch': 0.48} + 48%|████▊ | 4238/8750 [3:02:50<7:10:46, 5.73s/it] 48%|████▊ | 4239/8750 [3:02:59<7:08:13, 5.70s/it] 48%|████▊ | 4239/8750 [3:02:56<7:08:13, 5.70s/it] {'loss': 0.4512, 'learning_rate': 1.0988574785997275e-05, 'epoch': 0.48} + 48%|████▊ | 4239/8750 [3:02:59<7:08:13, 5.70s/it] {'loss': 0.4512, 'learning_rate': 1.0988574785997275e-05, 'epoch': 0.48} + 48%|████▊ | 4239/8750 [3:02:56<7:08:13, 5.70s/it] 48%|████▊ | 4240/8750 [3:03:05<7:17:52, 5.83s/it] 48%|████▊ | 4240/8750 [3:03:02<7:17:52, 5.83s/it] {'loss': 0.4598, 'learning_rate': 1.0984891197811686e-05, 'epoch': 0.48} + 48%|████▊ | 4240/8750 [3:03:05<7:17:52, 5.83s/it] {'loss': 0.4598, 'learning_rate': 1.0984891197811686e-05, 'epoch': 0.48} + 48%|████▊ | 4240/8750 [3:03:02<7:17:52, 
5.83s/it] 48%|████▊ | 4241/8750 [3:03:11<7:17:20, 5.82s/it] 48%|████▊ | 4241/8750 [3:03:08<7:17:21, 5.82s/it] {'loss': 0.4618, 'learning_rate': 1.0981207474674021e-05, 'epoch': 0.48} + 48%|████▊ | 4241/8750 [3:03:11<7:17:20, 5.82s/it] {'loss': 0.4618, 'learning_rate': 1.0981207474674021e-05, 'epoch': 0.48} + 48%|████▊ | 4241/8750 [3:03:08<7:17:21, 5.82s/it] 48%|████▊ | 4242/8750 [3:03:17<7:14:07, 5.78s/it] 48%|████▊ | 4242/8750 [3:03:14<7:14:07, 5.78s/it] {'loss': 0.473, 'learning_rate': 1.0977523617089019e-05, 'epoch': 0.48} + 48%|████▊ | 4242/8750 [3:03:17<7:14:07, 5.78s/it] {'loss': 0.473, 'learning_rate': 1.0977523617089019e-05, 'epoch': 0.48} + 48%|████▊ | 4242/8750 [3:03:14<7:14:07, 5.78s/it] 48%|████▊ | 4243/8750 [3:03:22<7:13:03, 5.77s/it] 48%|████▊ | 4243/8750 [3:03:19<7:13:03, 5.77s/it] {'loss': 0.4364, 'learning_rate': 1.097383962556146e-05, 'epoch': 0.48} + 48%|████▊ | 4243/8750 [3:03:22<7:13:03, 5.77s/it] {'loss': 0.4364, 'learning_rate': 1.097383962556146e-05, 'epoch': 0.48} + 48%|████▊ | 4243/8750 [3:03:19<7:13:03, 5.77s/it] 49%|████▊ | 4244/8750 [3:03:28<7:13:45, 5.78s/it] 49%|████▊ | 4244/8750 [3:03:25<7:13:45, 5.78s/it] {'loss': 0.4559, 'learning_rate': 1.0970155500596127e-05, 'epoch': 0.49} + 49%|████▊ | 4244/8750 [3:03:28<7:13:45, 5.78s/it] {'loss': 0.4559, 'learning_rate': 1.0970155500596127e-05, 'epoch': 0.49} + 49%|████▊ | 4244/8750 [3:03:25<7:13:45, 5.78s/it] 49%|████▊ | 4245/8750 [3:03:34<7:19:36, 5.86s/it] 49%|████▊ | 4245/8750 [3:03:31<7:19:36, 5.86s/it] {'loss': 0.4674, 'learning_rate': 1.0966471242697834e-05, 'epoch': 0.49} + 49%|████▊ | 4245/8750 [3:03:34<7:19:36, 5.86s/it] {'loss': 0.4674, 'learning_rate': 1.0966471242697834e-05, 'epoch': 0.49} + 49%|████▊ | 4245/8750 [3:03:31<7:19:36, 5.86s/it] 49%|████▊ | 4246/8750 [3:03:40<7:14:53, 5.79s/it] 49%|████▊ | 4246/8750 [3:03:37<7:14:53, 5.79s/it] {'loss': 0.4773, 'learning_rate': 1.0962786852371402e-05, 'epoch': 0.49} + 49%|████▊ | 4246/8750 [3:03:40<7:14:53, 5.79s/it] {'loss': 0.4773, 
'learning_rate': 1.0962786852371402e-05, 'epoch': 0.49} + 49%|████▊ | 4246/8750 [3:03:37<7:14:53, 5.79s/it] 49%|████▊ | 4247/8750 [3:03:46<7:15:43, 5.81s/it] 49%|████▊ | 4247/8750 [3:03:43<7:15:44, 5.81s/it] {'loss': 0.4772, 'learning_rate': 1.0959102330121676e-05, 'epoch': 0.49} + 49%|████▊ | 4247/8750 [3:03:46<7:15:43, 5.81s/it] {'loss': 0.4772, 'learning_rate': 1.0959102330121676e-05, 'epoch': 0.49} + 49%|████▊ | 4247/8750 [3:03:43<7:15:44, 5.81s/it] 49%|████▊ | 4248/8750 [3:03:51<7:11:17, 5.75s/it] 49%|████▊ | 4248/8750 [3:03:48<7:11:17, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.0955417676453517e-05, 'epoch': 0.49} + {'loss': 0.4668, 'learning_rate': 1.0955417676453517e-05, 'epoch': 0.49} + 49%|████▊ | 4248/8750 [3:03:51<7:11:17, 5.75s/it] 49%|████▊ | 4248/8750 [3:03:48<7:11:17, 5.75s/it] 49%|████▊ | 4249/8750 [3:03:54<7:11:17, 5.75s/it] 49%|████▊ | 4249/8750 [3:03:57<7:11:17, 5.75s/it]{'loss': 0.4536, 'learning_rate': 1.0951732891871807e-05, 'epoch': 0.49} + {'loss': 0.4536, 'learning_rate': 1.0951732891871807e-05, 'epoch': 0.49} + 49%|████▊ | 4249/8750 [3:03:57<7:11:17, 5.75s/it] 49%|████▊ | 4249/8750 [3:03:54<7:11:17, 5.75s/it]1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 49%|████▊ | 4250/8750 [3:04:03<7:11:24, 5.75s/it]10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...5 AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... 
+ 49%|████▊ | 4250/8750 [3:04:00<7:11:25, 5.75s/it]12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4819, 'learning_rate': 1.0948047976881439e-05, 'epoch': 0.49} + 49%|████▊ | 4250/8750 [3:04:03<7:11:24, 5.75s/it] {'loss': 0.4819, 'learning_rate': 1.0948047976881439e-05, 'epoch': 0.49} + 49%|████▊ | 4250/8750 [3:04:00<7:11:25, 5.75s/it] 49%|████▊ | 4251/8750 [3:04:08<7:10:27, 5.74s/it] 49%|████▊ | 4251/8750 [3:04:06<7:10:26, 5.74s/it] {'loss': 0.4545, 'learning_rate': 1.0944362931987336e-05, 'epoch': 0.49} + 49%|████▊ | 4251/8750 [3:04:08<7:10:27, 5.74s/it] {'loss': 0.4545, 'learning_rate': 1.0944362931987336e-05, 'epoch': 0.49} + 49%|████▊ | 4251/8750 [3:04:06<7:10:26, 5.74s/it] 49%|████▊ | 4252/8750 [3:04:11<7:09:40, 5.73s/it] 49%|████▊ | 4252/8750 [3:04:14<7:09:43, 5.73s/it] {'loss': 0.4557, 'learning_rate': 1.0940677757694425e-05, 'epoch': 0.49} + 49%|████▊ | 4252/8750 [3:04:14<7:09:43, 5.73s/it] {'loss': 0.4557, 'learning_rate': 1.0940677757694425e-05, 'epoch': 0.49} + 49%|████▊ | 4252/8750 [3:04:11<7:09:40, 5.73s/it] 49%|████▊ | 4253/8750 [3:04:20<7:10:47, 5.75s/it] 49%|████▊ | 4253/8750 [3:04:17<7:10:48, 5.75s/it] {'loss': 0.4531, 'learning_rate': 1.093699245450766e-05, 'epoch': 0.49} + 49%|████▊ | 4253/8750 [3:04:20<7:10:47, 5.75s/it] {'loss': 0.4531, 'learning_rate': 1.093699245450766e-05, 'epoch': 0.49} + 49%|████▊ | 4253/8750 [3:04:17<7:10:48, 5.75s/it] 49%|████▊ | 4254/8750 [3:04:26<7:10:28, 5.74s/it] 49%|████▊ | 4254/8750 [3:04:23<7:10:29, 5.75s/it] {'loss': 0.4528, 'learning_rate': 1.093330702293201e-05, 'epoch': 0.49} + 49%|████▊ | 4254/8750 [3:04:26<7:10:28, 5.74s/it] {'loss': 0.4528, 'learning_rate': 1.093330702293201e-05, 'epoch': 0.49} + 49%|████▊ | 4254/8750 [3:04:23<7:10:29, 5.75s/it] 49%|████▊ | 4255/8750 [3:04:32<7:17:15, 5.84s/it] 49%|████▊ | 4255/8750 [3:04:29<7:17:16, 5.84s/it] {'loss': 0.4605, 'learning_rate': 1.092962146347246e-05, 'epoch': 0.49} + 49%|████▊ | 4255/8750 
[3:04:32<7:17:15, 5.84s/it] {'loss': 0.4605, 'learning_rate': 1.092962146347246e-05, 'epoch': 0.49} + 49%|████▊ | 4255/8750 [3:04:29<7:17:16, 5.84s/it] 49%|████▊ | 4256/8750 [3:04:38<7:19:53, 5.87s/it] 49%|████▊ | 4256/8750 [3:04:35<7:19:53, 5.87s/it] {'loss': 0.4651, 'learning_rate': 1.0925935776634014e-05, 'epoch': 0.49} + 49%|████▊ | 4256/8750 [3:04:38<7:19:53, 5.87s/it] {'loss': 0.4651, 'learning_rate': 1.0925935776634014e-05, 'epoch': 0.49} + 49%|████▊ | 4256/8750 [3:04:35<7:19:53, 5.87s/it] 49%|████▊ | 4257/8750 [3:04:43<7:13:10, 5.78s/it] 49%|████▊ | 4257/8750 [3:04:40<7:13:10, 5.78s/it] {'loss': 0.4555, 'learning_rate': 1.0922249962921694e-05, 'epoch': 0.49} + 49%|████▊ | 4257/8750 [3:04:43<7:13:10, 5.78s/it] {'loss': 0.4555, 'learning_rate': 1.0922249962921694e-05, 'epoch': 0.49} + 49%|████▊ | 4257/8750 [3:04:40<7:13:10, 5.78s/it] 49%|████▊ | 4258/8750 [3:04:49<7:12:15, 5.77s/it] 49%|████▊ | 4258/8750 [3:04:46<7:12:15, 5.77s/it] {'loss': 0.4513, 'learning_rate': 1.0918564022840539e-05, 'epoch': 0.49} + 49%|████▊ | 4258/8750 [3:04:49<7:12:15, 5.77s/it] {'loss': 0.4513, 'learning_rate': 1.0918564022840539e-05, 'epoch': 0.49} + 49%|████▊ | 4258/8750 [3:04:46<7:12:15, 5.77s/it] 49%|████▊ | 4259/8750 [3:04:55<7:09:07, 5.73s/it] 49%|████▊ | 4259/8750 [3:04:52<7:09:07, 5.73s/it] {'loss': 0.4656, 'learning_rate': 1.0914877956895604e-05, 'epoch': 0.49} + 49%|████▊ | 4259/8750 [3:04:55<7:09:07, 5.73s/it] {'loss': 0.4656, 'learning_rate': 1.0914877956895604e-05, 'epoch': 0.49} + 49%|████▊ | 4259/8750 [3:04:52<7:09:07, 5.73s/it] 49%|████▊ | 4260/8750 [3:05:01<7:11:22, 5.76s/it] 49%|████▊ | 4260/8750 [3:04:58<7:11:22, 5.76s/it] {'loss': 0.4609, 'learning_rate': 1.0911191765591966e-05, 'epoch': 0.49} + 49%|████▊ | 4260/8750 [3:05:01<7:11:22, 5.76s/it] {'loss': 0.4609, 'learning_rate': 1.0911191765591966e-05, 'epoch': 0.49} + 49%|████▊ | 4260/8750 [3:04:58<7:11:22, 5.76s/it] 49%|████▊ | 4261/8750 [3:05:06<7:14:21, 5.81s/it] 49%|████▊ | 4261/8750 [3:05:03<7:14:21, 
5.81s/it] {'loss': 0.4698, 'learning_rate': 1.090750544943471e-05, 'epoch': 0.49} + 49%|████▊ | 4261/8750 [3:05:06<7:14:21, 5.81s/it] {'loss': 0.4698, 'learning_rate': 1.090750544943471e-05, 'epoch': 0.49} + 49%|████▊ | 4261/8750 [3:05:03<7:14:21, 5.81s/it] 49%|████▊ | 4262/8750 [3:05:12<7:12:29, 5.78s/it] 49%|████▊ | 4262/8750 [3:05:09<7:12:29, 5.78s/it] {'loss': 0.4613, 'learning_rate': 1.0903819008928948e-05, 'epoch': 0.49} + 49%|████▊ | 4262/8750 [3:05:12<7:12:29, 5.78s/it] {'loss': 0.4613, 'learning_rate': 1.0903819008928948e-05, 'epoch': 0.49} + 49%|████▊ | 4262/8750 [3:05:09<7:12:29, 5.78s/it] 49%|████▊ | 4263/8750 [3:05:18<7:14:14, 5.81s/it] 49%|████▊ | 4263/8750 [3:05:15<7:14:14, 5.81s/it] {'loss': 0.4463, 'learning_rate': 1.0900132444579801e-05, 'epoch': 0.49} + 49%|████▊ | 4263/8750 [3:05:18<7:14:14, 5.81s/it] {'loss': 0.4463, 'learning_rate': 1.0900132444579801e-05, 'epoch': 0.49} + 49%|████▊ | 4263/8750 [3:05:15<7:14:14, 5.81s/it] 49%|████▊ | 4264/8750 [3:05:24<7:12:28, 5.78s/it] 49%|████▊ | 4264/8750 [3:05:21<7:12:29, 5.78s/it] {'loss': 0.4858, 'learning_rate': 1.0896445756892415e-05, 'epoch': 0.49} + 49%|████▊ | 4264/8750 [3:05:24<7:12:28, 5.78s/it] {'loss': 0.4858, 'learning_rate': 1.0896445756892415e-05, 'epoch': 0.49} + 49%|████▊ | 4264/8750 [3:05:21<7:12:29, 5.78s/it] 49%|████▊ | 4265/8750 [3:05:30<7:12:05, 5.78s/it] 49%|████▊ | 4265/8750 [3:05:27<7:12:06, 5.78s/it] {'loss': 0.4598, 'learning_rate': 1.0892758946371943e-05, 'epoch': 0.49} + 49%|████▊ | 4265/8750 [3:05:30<7:12:05, 5.78s/it] {'loss': 0.4598, 'learning_rate': 1.0892758946371943e-05, 'epoch': 0.49} + 49%|████▊ | 4265/8750 [3:05:27<7:12:06, 5.78s/it] 49%|████▉ | 4266/8750 [3:05:35<7:15:37, 5.83s/it] 49%|████▉ | 4266/8750 [3:05:33<7:15:37, 5.83s/it] {'loss': 0.4422, 'learning_rate': 1.0889072013523568e-05, 'epoch': 0.49} + 49%|████▉ | 4266/8750 [3:05:35<7:15:37, 5.83s/it] {'loss': 0.4422, 'learning_rate': 1.0889072013523568e-05, 'epoch': 0.49} + 49%|████▉ | 4266/8750 [3:05:33<7:15:37, 
5.83s/it] 49%|████▉ | 4267/8750 [3:05:41<7:12:08, 5.78s/it] 49%|████▉ | 4267/8750 [3:05:38<7:12:08, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.0885384958852474e-05, 'epoch': 0.49} + 49%|████▉ | 4267/8750 [3:05:41<7:12:08, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.0885384958852474e-05, 'epoch': 0.49} + 49%|████▉ | 4267/8750 [3:05:38<7:12:08, 5.78s/it] 49%|████▉ | 4268/8750 [3:05:47<7:09:12, 5.75s/it] 49%|████▉ | 4268/8750 [3:05:44<7:09:12, 5.75s/it] {'loss': 0.4665, 'learning_rate': 1.0881697782863874e-05, 'epoch': 0.49} + 49%|████▉ | 4268/8750 [3:05:47<7:09:12, 5.75s/it] {'loss': 0.4665, 'learning_rate': 1.0881697782863874e-05, 'epoch': 0.49} + 49%|████▉ | 4268/8750 [3:05:44<7:09:12, 5.75s/it] 49%|████▉ | 4269/8750 [3:05:53<7:08:33, 5.74s/it] 49%|████▉ | 4269/8750 [3:05:50<7:08:33, 5.74s/it] {'loss': 0.4516, 'learning_rate': 1.0878010486062993e-05, 'epoch': 0.49} + 49%|████▉ | 4269/8750 [3:05:53<7:08:33, 5.74s/it] {'loss': 0.4516, 'learning_rate': 1.0878010486062993e-05, 'epoch': 0.49} + 49%|████▉ | 4269/8750 [3:05:50<7:08:33, 5.74s/it] 49%|████▉ | 4270/8750 [3:05:58<7:12:05, 5.79s/it] 49%|████▉ | 4270/8750 [3:05:55<7:12:05, 5.79s/it] {'loss': 0.4574, 'learning_rate': 1.0874323068955073e-05, 'epoch': 0.49} + 49%|████▉ | 4270/8750 [3:05:58<7:12:05, 5.79s/it] {'loss': 0.4574, 'learning_rate': 1.0874323068955073e-05, 'epoch': 0.49} + 49%|████▉ | 4270/8750 [3:05:55<7:12:05, 5.79s/it] 49%|████▉ | 4271/8750 [3:06:04<7:11:29, 5.78s/it] 49%|████▉ | 4271/8750 [3:06:01<7:11:28, 5.78s/it] {'loss': 0.4604, 'learning_rate': 1.0870635532045375e-05, 'epoch': 0.49} + 49%|████▉ | 4271/8750 [3:06:04<7:11:29, 5.78s/it] {'loss': 0.4604, 'learning_rate': 1.0870635532045375e-05, 'epoch': 0.49} + 49%|████▉ | 4271/8750 [3:06:01<7:11:28, 5.78s/it] 49%|████▉ | 4272/8750 [3:06:10<7:11:12, 5.78s/it] 49%|████▉ | 4272/8750 [3:06:07<7:11:12, 5.78s/it] {'loss': 0.4557, 'learning_rate': 1.0866947875839167e-05, 'epoch': 0.49} + 49%|████▉ | 4272/8750 [3:06:10<7:11:12, 5.78s/it] {'loss': 
0.4557, 'learning_rate': 1.0866947875839167e-05, 'epoch': 0.49} + 49%|████▉ | 4272/8750 [3:06:07<7:11:12, 5.78s/it] 49%|████▉ | 4273/8750 [3:06:16<7:14:17, 5.82s/it] 49%|████▉ | 4273/8750 [3:06:13<7:14:17, 5.82s/it] {'loss': 0.4617, 'learning_rate': 1.0863260100841744e-05, 'epoch': 0.49} + 49%|████▉ | 4273/8750 [3:06:16<7:14:17, 5.82s/it] {'loss': 0.4617, 'learning_rate': 1.0863260100841744e-05, 'epoch': 0.49} + 49%|████▉ | 4273/8750 [3:06:13<7:14:17, 5.82s/it] 49%|████▉ | 4274/8750 [3:06:22<7:12:12, 5.79s/it] 49%|████▉ | 4274/8750 [3:06:19<7:12:12, 5.79s/it] {'loss': 0.4668, 'learning_rate': 1.0859572207558416e-05, 'epoch': 0.49} + 49%|████▉ | 4274/8750 [3:06:22<7:12:12, 5.79s/it] {'loss': 0.4668, 'learning_rate': 1.0859572207558416e-05, 'epoch': 0.49} + 49%|████▉ | 4274/8750 [3:06:19<7:12:12, 5.79s/it] 49%|████▉ | 4275/8750 [3:06:28<7:15:19, 5.84s/it] 49%|████▉ | 4275/8750 [3:06:25<7:15:19, 5.84s/it] {'loss': 0.451, 'learning_rate': 1.0855884196494507e-05, 'epoch': 0.49} + 49%|████▉ | 4275/8750 [3:06:28<7:15:19, 5.84s/it] {'loss': 0.451, 'learning_rate': 1.0855884196494507e-05, 'epoch': 0.49} + 49%|████▉ | 4275/8750 [3:06:25<7:15:19, 5.84s/it] 49%|████▉ | 4276/8750 [3:06:33<7:12:33, 5.80s/it] 49%|████▉ | 4276/8750 [3:06:30<7:12:33, 5.80s/it] {'loss': 0.4583, 'learning_rate': 1.0852196068155352e-05, 'epoch': 0.49} + 49%|████▉ | 4276/8750 [3:06:33<7:12:33, 5.80s/it] {'loss': 0.4583, 'learning_rate': 1.0852196068155352e-05, 'epoch': 0.49} + 49%|████▉ | 4276/8750 [3:06:30<7:12:33, 5.80s/it] 49%|████▉ | 4277/8750 [3:06:39<7:14:27, 5.83s/it] 49%|████▉ | 4277/8750 [3:06:36<7:14:27, 5.83s/it] {'loss': 0.4804, 'learning_rate': 1.0848507823046306e-05, 'epoch': 0.49} + 49%|████▉ | 4277/8750 [3:06:39<7:14:27, 5.83s/it] {'loss': 0.4804, 'learning_rate': 1.0848507823046306e-05, 'epoch': 0.49} + 49%|████▉ | 4277/8750 [3:06:36<7:14:27, 5.83s/it] 49%|████▉ | 4278/8750 [3:06:45<7:15:15, 5.84s/it] 49%|████▉ | 4278/8750 [3:06:42<7:15:15, 5.84s/it] {'loss': 0.4617, 'learning_rate': 
1.0844819461672748e-05, 'epoch': 0.49} + 49%|████▉ | 4278/8750 [3:06:45<7:15:15, 5.84s/it] {'loss': 0.4617, 'learning_rate': 1.0844819461672748e-05, 'epoch': 0.49} + 49%|████▉ | 4278/8750 [3:06:42<7:15:15, 5.84s/it] 49%|████▉ | 4279/8750 [3:06:51<7:12:37, 5.81s/it] 49%|████▉ | 4279/8750 [3:06:48<7:12:37, 5.81s/it] {'loss': 0.4703, 'learning_rate': 1.0841130984540063e-05, 'epoch': 0.49} + 49%|████▉ | 4279/8750 [3:06:51<7:12:37, 5.81s/it] {'loss': 0.4703, 'learning_rate': 1.0841130984540063e-05, 'epoch': 0.49} + 49%|████▉ | 4279/8750 [3:06:48<7:12:37, 5.81s/it] 49%|████▉ | 4280/8750 [3:06:54<7:11:14, 5.79s/it] 49%|████▉ | 4280/8750 [3:06:56<7:11:14, 5.79s/it] {'loss': 0.4611, 'learning_rate': 1.0837442392153651e-05, 'epoch': 0.49} + 49%|████▉ | 4280/8750 [3:06:56<7:11:14, 5.79s/it] {'loss': 0.4611, 'learning_rate': 1.0837442392153651e-05, 'epoch': 0.49} + 49%|████▉ | 4280/8750 [3:06:54<7:11:14, 5.79s/it] 49%|████▉ | 4281/8750 [3:07:02<7:09:52, 5.77s/it] 49%|████▉ | 4281/8750 [3:06:59<7:09:51, 5.77s/it] {'loss': 0.451, 'learning_rate': 1.0833753685018935e-05, 'epoch': 0.49} + 49%|████▉ | 4281/8750 [3:07:02<7:09:52, 5.77s/it] {'loss': 0.451, 'learning_rate': 1.0833753685018935e-05, 'epoch': 0.49} + 49%|████▉ | 4281/8750 [3:06:59<7:09:51, 5.77s/it] 49%|████▉ | 4282/8750 [3:07:05<7:12:49, 5.81s/it] 49%|████▉ | 4282/8750 [3:07:08<7:12:50, 5.81s/it] {'loss': 0.4829, 'learning_rate': 1.0830064863641352e-05, 'epoch': 0.49} + 49%|████▉ | 4282/8750 [3:07:08<7:12:50, 5.81s/it] {'loss': 0.4829, 'learning_rate': 1.0830064863641352e-05, 'epoch': 0.49} + 49%|████▉ | 4282/8750 [3:07:05<7:12:49, 5.81s/it] 49%|████▉ | 4283/8750 [3:07:11<7:07:57, 5.75s/it] 49%|████▉ | 4283/8750 [3:07:14<7:07:57, 5.75s/it] {'loss': 0.459, 'learning_rate': 1.082637592852635e-05, 'epoch': 0.49} + 49%|████▉ | 4283/8750 [3:07:11<7:07:57, 5.75s/it]{'loss': 0.459, 'learning_rate': 1.082637592852635e-05, 'epoch': 0.49} + 49%|████▉ | 4283/8750 [3:07:14<7:07:57, 5.75s/it] 49%|████▉ | 4284/8750 [3:07:19<7:04:54, 
5.71s/it] 49%|████▉ | 4284/8750 [3:07:16<7:04:54, 5.71s/it] {'loss': 0.468, 'learning_rate': 1.0822686880179395e-05, 'epoch': 0.49} + 49%|████▉ | 4284/8750 [3:07:19<7:04:54, 5.71s/it] {'loss': 0.468, 'learning_rate': 1.0822686880179395e-05, 'epoch': 0.49} + 49%|████▉ | 4284/8750 [3:07:16<7:04:54, 5.71s/it] 49%|████▉ | 4285/8750 [3:07:23<7:14:44, 5.84s/it] 49%|████▉ | 4285/8750 [3:07:25<7:14:44, 5.84s/it] {'loss': 0.4545, 'learning_rate': 1.081899771910597e-05, 'epoch': 0.49} + 49%|████▉ | 4285/8750 [3:07:25<7:14:44, 5.84s/it] {'loss': 0.4545, 'learning_rate': 1.081899771910597e-05, 'epoch': 0.49} + 49%|████▉ | 4285/8750 [3:07:23<7:14:44, 5.84s/it] 49%|████▉ | 4286/8750 [3:07:31<7:12:23, 5.81s/it] 49%|████▉ | 4286/8750 [3:07:28<7:12:23, 5.81s/it] {'loss': 0.4563, 'learning_rate': 1.081530844581157e-05, 'epoch': 0.49} + 49%|████▉ | 4286/8750 [3:07:31<7:12:23, 5.81s/it] {'loss': 0.4563, 'learning_rate': 1.081530844581157e-05, 'epoch': 0.49} + 49%|████▉ | 4286/8750 [3:07:28<7:12:23, 5.81s/it] 49%|████▉ | 4287/8750 [3:07:37<7:16:39, 5.87s/it] 49%|████▉ | 4287/8750 [3:07:34<7:16:39, 5.87s/it] {'loss': 0.4765, 'learning_rate': 1.0811619060801713e-05, 'epoch': 0.49} + 49%|████▉ | 4287/8750 [3:07:37<7:16:39, 5.87s/it] {'loss': 0.4765, 'learning_rate': 1.0811619060801713e-05, 'epoch': 0.49} + 49%|████▉ | 4287/8750 [3:07:34<7:16:39, 5.87s/it] 49%|████▉ | 4288/8750 [3:07:43<7:14:36, 5.84s/it] 49%|████▉ | 4288/8750 [3:07:40<7:14:37, 5.84s/it] {'loss': 0.4501, 'learning_rate': 1.0807929564581925e-05, 'epoch': 0.49} + 49%|████▉ | 4288/8750 [3:07:43<7:14:36, 5.84s/it] {'loss': 0.4501, 'learning_rate': 1.0807929564581925e-05, 'epoch': 0.49} + 49%|████▉ | 4288/8750 [3:07:40<7:14:37, 5.84s/it] 49%|████▉ | 4289/8750 [3:07:49<7:12:54, 5.82s/it] 49%|████▉ | 4289/8750 [3:07:46<7:12:54, 5.82s/it] {'loss': 0.4503, 'learning_rate': 1.080423995765775e-05, 'epoch': 0.49} + 49%|████▉ | 4289/8750 [3:07:49<7:12:54, 5.82s/it] {'loss': 0.4503, 'learning_rate': 1.080423995765775e-05, 'epoch': 0.49} 
+ 49%|████▉ | 4289/8750 [3:07:46<7:12:54, 5.82s/it] 49%|████▉ | 4290/8750 [3:07:54<7:09:11, 5.77s/it] 49%|████▉ | 4290/8750 [3:07:52<7:09:11, 5.77s/it] {'loss': 0.4785, 'learning_rate': 1.0800550240534742e-05, 'epoch': 0.49} + 49%|████▉ | 4290/8750 [3:07:54<7:09:11, 5.77s/it] {'loss': 0.4785, 'learning_rate': 1.0800550240534742e-05, 'epoch': 0.49} + 49%|████▉ | 4290/8750 [3:07:52<7:09:11, 5.77s/it] 49%|████▉ | 4291/8750 [3:08:00<7:07:41, 5.75s/it] 49%|████▉ | 4291/8750 [3:07:57<7:07:41, 5.75s/it] {'loss': 0.4585, 'learning_rate': 1.0796860413718475e-05, 'epoch': 0.49} + 49%|████▉ | 4291/8750 [3:08:00<7:07:41, 5.75s/it] {'loss': 0.4585, 'learning_rate': 1.0796860413718475e-05, 'epoch': 0.49} + 49%|████▉ | 4291/8750 [3:07:57<7:07:41, 5.75s/it] 49%|████▉ | 4292/8750 [3:08:06<7:13:19, 5.83s/it] 49%|████▉ | 4292/8750 [3:08:03<7:13:19, 5.83s/it] {'loss': 0.4625, 'learning_rate': 1.0793170477714546e-05, 'epoch': 0.49} + 49%|████▉ | 4292/8750 [3:08:06<7:13:19, 5.83s/it] {'loss': 0.4625, 'learning_rate': 1.0793170477714546e-05, 'epoch': 0.49} + 49%|████▉ | 4292/8750 [3:08:03<7:13:19, 5.83s/it] 49%|████▉ | 4293/8750 [3:08:09<7:16:58, 5.88s/it] 49%|████▉ | 4293/8750 [3:08:12<7:16:58, 5.88s/it]{'loss': 0.4596, 'learning_rate': 1.0789480433028551e-05, 'epoch': 0.49} + {'loss': 0.4596, 'learning_rate': 1.0789480433028551e-05, 'epoch': 0.49} + 49%|████▉ | 4293/8750 [3:08:12<7:16:58, 5.88s/it] 49%|████▉ | 4293/8750 [3:08:09<7:16:58, 5.88s/it] 49%|████▉ | 4294/8750 [3:08:18<7:12:40, 5.83s/it] 49%|████▉ | 4294/8750 [3:08:15<7:12:40, 5.83s/it] {'loss': 0.4536, 'learning_rate': 1.0785790280166114e-05, 'epoch': 0.49} + 49%|████▉ | 4294/8750 [3:08:18<7:12:40, 5.83s/it] {'loss': 0.4536, 'learning_rate': 1.0785790280166114e-05, 'epoch': 0.49} + 49%|████▉ | 4294/8750 [3:08:15<7:12:40, 5.83s/it] 49%|████▉ | 4295/8750 [3:08:24<7:17:48, 5.90s/it] 49%|████▉ | 4295/8750 [3:08:21<7:17:48, 5.90s/it] {'loss': 0.4592, 'learning_rate': 1.078210001963286e-05, 'epoch': 0.49} + 49%|████▉ | 4295/8750 
[3:08:24<7:17:48, 5.90s/it] {'loss': 0.4592, 'learning_rate': 1.078210001963286e-05, 'epoch': 0.49} + 49%|████▉ | 4295/8750 [3:08:21<7:17:48, 5.90s/it] 49%|████▉ | 4296/8750 [3:08:30<7:17:00, 5.89s/it] 49%|████▉ | 4296/8750 [3:08:27<7:17:00, 5.89s/it] {'loss': 0.4829, 'learning_rate': 1.0778409651934442e-05, 'epoch': 0.49} + 49%|████▉ | 4296/8750 [3:08:30<7:17:00, 5.89s/it] {'loss': 0.4829, 'learning_rate': 1.0778409651934442e-05, 'epoch': 0.49} + 49%|████▉ | 4296/8750 [3:08:27<7:17:00, 5.89s/it] 49%|████▉ | 4297/8750 [3:08:36<7:13:15, 5.84s/it] 49%|████▉ | 4297/8750 [3:08:33<7:13:15, 5.84s/it] {'loss': 0.4804, 'learning_rate': 1.0774719177576526e-05, 'epoch': 0.49} + 49%|████▉ | 4297/8750 [3:08:36<7:13:15, 5.84s/it] {'loss': 0.4804, 'learning_rate': 1.0774719177576526e-05, 'epoch': 0.49} + 49%|████▉ | 4297/8750 [3:08:33<7:13:15, 5.84s/it] 49%|████▉ | 4298/8750 [3:08:41<7:12:04, 5.82s/it] 49%|████▉ | 4298/8750 [3:08:38<7:12:04, 5.82s/it] {'loss': 0.4635, 'learning_rate': 1.0771028597064785e-05, 'epoch': 0.49} + 49%|████▉ | 4298/8750 [3:08:41<7:12:04, 5.82s/it] {'loss': 0.4635, 'learning_rate': 1.0771028597064785e-05, 'epoch': 0.49} + 49%|████▉ | 4298/8750 [3:08:38<7:12:04, 5.82s/it] 49%|████▉ | 4299/8750 [3:08:47<7:10:25, 5.80s/it] 49%|████▉ | 4299/8750 [3:08:44<7:10:24, 5.80s/it] {'loss': 0.458, 'learning_rate': 1.076733791090491e-05, 'epoch': 0.49} + 49%|████▉ | 4299/8750 [3:08:47<7:10:25, 5.80s/it] {'loss': 0.458, 'learning_rate': 1.076733791090491e-05, 'epoch': 0.49} + 49%|████▉ | 4299/8750 [3:08:44<7:10:24, 5.80s/it]15 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+ 49%|████▉ | 4300/8750 [3:08:53<7:13:52, 5.85s/it]34 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 49%|████▉ | 4300/8750 [3:08:50<7:13:52, 5.85s/it]2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4521, 'learning_rate': 1.0763647119602614e-05, 'epoch': 0.49} + 49%|████▉ | 4300/8750 [3:08:53<7:13:52, 5.85s/it] {'loss': 0.4521, 'learning_rate': 1.0763647119602614e-05, 'epoch': 0.49} + 49%|████▉ | 4300/8750 [3:08:50<7:13:52, 5.85s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 49%|████▉ | 4301/8750 [3:09:16<13:29:59, 10.92s/it] 49%|████▉ | 4301/8750 [3:09:13<13:29:59, 10.92s/it] {'loss': 0.4635, 'learning_rate': 1.0759956223663608e-05, 'epoch': 0.49} + 49%|████▉ | 4301/8750 [3:09:16<13:29:59, 10.92s/it] {'loss': 0.4635, 'learning_rate': 1.0759956223663608e-05, 'epoch': 0.49} + 49%|████▉ | 4301/8750 [3:09:13<13:29:59, 10.92s/it] 49%|████▉ | 4302/8750 [3:09:22<11:36:58, 9.40s/it] 49%|████▉ | 4302/8750 [3:09:19<11:36:59, 9.40s/it] {'loss': 0.4627, 'learning_rate': 1.0756265223593637e-05, 'epoch': 0.49} + 49%|████▉ | 4302/8750 [3:09:22<11:36:58, 9.40s/it] {'loss': 0.4627, 'learning_rate': 1.0756265223593637e-05, 'epoch': 0.49} + 49%|████▉ | 4302/8750 [3:09:19<11:36:59, 9.40s/it] 49%|████▉ | 4303/8750 [3:09:28<10:24:00, 8.42s/it] 49%|████▉ | 4303/8750 [3:09:25<10:24:00, 8.42s/it] {'loss': 0.4643, 'learning_rate': 1.0752574119898445e-05, 'epoch': 0.49} + 49%|████▉ | 4303/8750 [3:09:28<10:24:00, 8.42s/it] {'loss': 0.4643, 'learning_rate': 1.0752574119898445e-05, 'epoch': 0.49} + 49%|████▉ | 4303/8750 [3:09:25<10:24:00, 8.42s/it] 49%|████▉ | 4304/8750 [3:09:34<9:28:42, 7.67s/it] 49%|████▉ | 4304/8750 [3:09:31<9:28:42, 7.67s/it] {'loss': 0.4849, 'learning_rate': 1.0748882913083794e-05, 'epoch': 0.49} + 49%|████▉ | 4304/8750 [3:09:34<9:28:42, 7.67s/it] {'loss': 0.4849, 'learning_rate': 1.0748882913083794e-05, 'epoch': 0.49} + 49%|████▉ | 4304/8750 [3:09:31<9:28:42, 7.67s/it] 49%|████▉ | 4305/8750 [3:09:39<8:45:37, 7.10s/it] 49%|████▉ | 4305/8750 [3:09:37<8:45:37, 7.09s/it] {'loss': 0.4653, 'learning_rate': 1.0745191603655466e-05, 'epoch': 0.49} + 49%|████▉ | 4305/8750 [3:09:39<8:45:37, 7.10s/it] {'loss': 0.4653, 'learning_rate': 1.0745191603655466e-05, 'epoch': 0.49} + 49%|████▉ | 4305/8750 [3:09:37<8:45:37, 7.09s/it] 49%|████▉ | 4306/8750 [3:09:46<8:22:39, 6.79s/it] 49%|████▉ | 4306/8750 [3:09:43<8:22:39, 6.79s/it] {'loss': 0.48, 'learning_rate': 1.074150019211925e-05, 'epoch': 0.49} + 49%|████▉ | 4306/8750 [3:09:46<8:22:39, 
6.79s/it] {'loss': 0.48, 'learning_rate': 1.074150019211925e-05, 'epoch': 0.49} + 49%|████▉ | 4306/8750 [3:09:43<8:22:39, 6.79s/it] 49%|████▉ | 4307/8750 [3:09:51<8:04:49, 6.55s/it] 49%|████▉ | 4307/8750 [3:09:49<8:04:49, 6.55s/it] {'loss': 0.4599, 'learning_rate': 1.0737808678980954e-05, 'epoch': 0.49} + 49%|████▉ | 4307/8750 [3:09:52<8:04:49, 6.55s/it] {'loss': 0.4599, 'learning_rate': 1.0737808678980954e-05, 'epoch': 0.49} + 49%|████▉ | 4307/8750 [3:09:49<8:04:49, 6.55s/it] 49%|████▉ | 4308/8750 [3:09:57<7:44:37, 6.28s/it] 49%|████▉ | 4308/8750 [3:09:54<7:44:37, 6.28s/it] {'loss': 0.4787, 'learning_rate': 1.0734117064746395e-05, 'epoch': 0.49} + 49%|████▉ | 4308/8750 [3:09:57<7:44:37, 6.28s/it] {'loss': 0.4787, 'learning_rate': 1.0734117064746395e-05, 'epoch': 0.49} + 49%|████▉ | 4308/8750 [3:09:54<7:44:37, 6.28s/it] 49%|████▉ | 4309/8750 [3:10:03<7:32:42, 6.12s/it] 49%|████▉ | 4309/8750 [3:10:00<7:32:42, 6.12s/it] {'loss': 0.4467, 'learning_rate': 1.073042534992141e-05, 'epoch': 0.49} + 49%|████▉ | 4309/8750 [3:10:03<7:32:42, 6.12s/it] {'loss': 0.4467, 'learning_rate': 1.073042534992141e-05, 'epoch': 0.49} + 49%|████▉ | 4309/8750 [3:10:00<7:32:42, 6.12s/it] 49%|████▉ | 4310/8750 [3:10:09<7:21:55, 5.97s/it] 49%|████▉ | 4310/8750 [3:10:06<7:21:55, 5.97s/it] {'loss': 0.4669, 'learning_rate': 1.0726733535011844e-05, 'epoch': 0.49} + 49%|████▉ | 4310/8750 [3:10:09<7:21:55, 5.97s/it] {'loss': 0.4669, 'learning_rate': 1.0726733535011844e-05, 'epoch': 0.49} + 49%|████▉ | 4310/8750 [3:10:06<7:21:55, 5.97s/it] 49%|████▉ | 4311/8750 [3:10:14<7:20:54, 5.96s/it] 49%|████▉ | 4311/8750 [3:10:12<7:20:54, 5.96s/it] {'loss': 0.4679, 'learning_rate': 1.0723041620523558e-05, 'epoch': 0.49} + 49%|████▉ | 4311/8750 [3:10:14<7:20:54, 5.96s/it] {'loss': 0.4679, 'learning_rate': 1.0723041620523558e-05, 'epoch': 0.49} + 49%|████▉ | 4311/8750 [3:10:12<7:20:54, 5.96s/it] 49%|████▉ | 4312/8750 [3:10:20<7:18:23, 5.93s/it] 49%|████▉ | 4312/8750 [3:10:17<7:18:22, 5.93s/it] {'loss': 0.443, 
'learning_rate': 1.0719349606962426e-05, 'epoch': 0.49} + 49%|████▉ | 4312/8750 [3:10:20<7:18:23, 5.93s/it] {'loss': 0.443, 'learning_rate': 1.0719349606962426e-05, 'epoch': 0.49} + 49%|████▉ | 4312/8750 [3:10:17<7:18:22, 5.93s/it] 49%|████▉ | 4313/8750 [3:10:26<7:14:16, 5.87s/it] 49%|████▉ | 4313/8750 [3:10:23<7:14:16, 5.87s/it] {'loss': 0.4815, 'learning_rate': 1.071565749483434e-05, 'epoch': 0.49} + 49%|████▉ | 4313/8750 [3:10:26<7:14:16, 5.87s/it] {'loss': 0.4815, 'learning_rate': 1.071565749483434e-05, 'epoch': 0.49} + 49%|████▉ | 4313/8750 [3:10:23<7:14:16, 5.87s/it] 49%|████▉ | 4314/8750 [3:10:32<7:13:18, 5.86s/it] 49%|████▉ | 4314/8750 [3:10:29<7:13:18, 5.86s/it] {'loss': 0.4714, 'learning_rate': 1.0711965284645198e-05, 'epoch': 0.49} + 49%|████▉ | 4314/8750 [3:10:32<7:13:18, 5.86s/it] {'loss': 0.4714, 'learning_rate': 1.0711965284645198e-05, 'epoch': 0.49} + 49%|████▉ | 4314/8750 [3:10:29<7:13:18, 5.86s/it] 49%|████▉ | 4315/8750 [3:10:38<7:08:52, 5.80s/it] 49%|████▉ | 4315/8750 [3:10:35<7:08:51, 5.80s/it] {'loss': 0.45, 'learning_rate': 1.0708272976900915e-05, 'epoch': 0.49} + 49%|████▉ | 4315/8750 [3:10:38<7:08:52, 5.80s/it] {'loss': 0.45, 'learning_rate': 1.0708272976900915e-05, 'epoch': 0.49} + 49%|████▉ | 4315/8750 [3:10:35<7:08:51, 5.80s/it] 49%|████▉ | 4316/8750 [3:10:43<7:05:57, 5.76s/it] 49%|████▉ | 4316/8750 [3:10:40<7:05:57, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.0704580572107424e-05, 'epoch': 0.49} + 49%|████▉ | 4316/8750 [3:10:43<7:05:57, 5.76s/it] {'loss': 0.4824, 'learning_rate': 1.0704580572107424e-05, 'epoch': 0.49} + 49%|████▉ | 4316/8750 [3:10:40<7:05:57, 5.76s/it] 49%|████▉ | 4317/8750 [3:10:49<7:10:45, 5.83s/it] 49%|████▉ | 4317/8750 [3:10:46<7:10:45, 5.83s/it] {'loss': 0.4578, 'learning_rate': 1.0700888070770663e-05, 'epoch': 0.49} + 49%|████▉ | 4317/8750 [3:10:49<7:10:45, 5.83s/it] {'loss': 0.4578, 'learning_rate': 1.0700888070770663e-05, 'epoch': 0.49} + 49%|████▉ | 4317/8750 [3:10:46<7:10:45, 5.83s/it] 49%|████▉ | 4318/8750 
[3:10:55<7:07:43, 5.79s/it] 49%|████▉ | 4318/8750 [3:10:52<7:07:43, 5.79s/it] {'loss': 0.4675, 'learning_rate': 1.0697195473396587e-05, 'epoch': 0.49} + 49%|████▉ | 4318/8750 [3:10:55<7:07:43, 5.79s/it] {'loss': 0.4675, 'learning_rate': 1.0697195473396587e-05, 'epoch': 0.49} + 49%|████▉ | 4318/8750 [3:10:52<7:07:43, 5.79s/it] 49%|████▉ | 4319/8750 [3:11:01<7:09:54, 5.82s/it] 49%|████▉ | 4319/8750 [3:10:58<7:09:55, 5.82s/it] {'loss': 0.4679, 'learning_rate': 1.0693502780491168e-05, 'epoch': 0.49} + 49%|████▉ | 4319/8750 [3:11:01<7:09:54, 5.82s/it] {'loss': 0.4679, 'learning_rate': 1.0693502780491168e-05, 'epoch': 0.49} + 49%|████▉ | 4319/8750 [3:10:58<7:09:55, 5.82s/it] 49%|████▉ | 4320/8750 [3:11:07<7:09:22, 5.82s/it] 49%|████▉ | 4320/8750 [3:11:04<7:09:22, 5.82s/it] {'loss': 0.4471, 'learning_rate': 1.0689809992560382e-05, 'epoch': 0.49} + 49%|████▉ | 4320/8750 [3:11:07<7:09:22, 5.82s/it] {'loss': 0.4471, 'learning_rate': 1.0689809992560382e-05, 'epoch': 0.49} + 49%|████▉ | 4320/8750 [3:11:04<7:09:22, 5.82s/it] 49%|████▉ | 4321/8750 [3:11:09<7:05:45, 5.77s/it] 49%|████▉ | 4321/8750 [3:11:12<7:05:47, 5.77s/it] {'loss': 0.4587, 'learning_rate': 1.0686117110110228e-05, 'epoch': 0.49} + 49%|████▉ | 4321/8750 [3:11:12<7:05:47, 5.77s/it] {'loss': 0.4587, 'learning_rate': 1.0686117110110228e-05, 'epoch': 0.49} + 49%|████▉ | 4321/8750 [3:11:09<7:05:45, 5.77s/it] 49%|████▉ | 4322/8750 [3:11:15<7:06:04, 5.77s/it] 49%|████▉ | 4322/8750 [3:11:18<7:06:03, 5.77s/it]{'loss': 0.4886, 'learning_rate': 1.0682424133646712e-05, 'epoch': 0.49} + 49%|████▉ | 4322/8750 [3:11:18<7:06:03, 5.77s/it] {'loss': 0.4886, 'learning_rate': 1.0682424133646712e-05, 'epoch': 0.49} + 49%|████▉ | 4322/8750 [3:11:15<7:06:04, 5.77s/it] 49%|████▉ | 4323/8750 [3:11:24<7:04:40, 5.76s/it] 49%|████▉ | 4323/8750 [3:11:21<7:04:40, 5.76s/it] {'loss': 0.4718, 'learning_rate': 1.067873106367585e-05, 'epoch': 0.49} + 49%|████▉ | 4323/8750 [3:11:24<7:04:40, 5.76s/it] {'loss': 0.4718, 'learning_rate': 
1.067873106367585e-05, 'epoch': 0.49} + 49%|████▉ | 4323/8750 [3:11:21<7:04:40, 5.76s/it] 49%|████▉ | 4324/8750 [3:11:29<7:03:02, 5.73s/it] 49%|████▉ | 4324/8750 [3:11:27<7:03:02, 5.73s/it] {'loss': 0.4528, 'learning_rate': 1.0675037900703684e-05, 'epoch': 0.49} + 49%|████▉ | 4324/8750 [3:11:29<7:03:02, 5.73s/it] {'loss': 0.4528, 'learning_rate': 1.0675037900703684e-05, 'epoch': 0.49} + 49%|████▉ | 4324/8750 [3:11:27<7:03:02, 5.73s/it] 49%|████▉ | 4325/8750 [3:11:35<7:05:07, 5.76s/it] 49%|████▉ | 4325/8750 [3:11:32<7:05:08, 5.76s/it] {'loss': 0.4683, 'learning_rate': 1.0671344645236253e-05, 'epoch': 0.49} + 49%|████▉ | 4325/8750 [3:11:35<7:05:07, 5.76s/it] {'loss': 0.4683, 'learning_rate': 1.0671344645236253e-05, 'epoch': 0.49} + 49%|████▉ | 4325/8750 [3:11:32<7:05:08, 5.76s/it] 49%|████▉ | 4326/8750 [3:11:38<7:08:51, 5.82s/it] 49%|████▉ | 4326/8750 [3:11:41<7:08:51, 5.82s/it] {'loss': 0.473, 'learning_rate': 1.0667651297779615e-05, 'epoch': 0.49} + 49%|████▉ | 4326/8750 [3:11:41<7:08:51, 5.82s/it] {'loss': 0.473, 'learning_rate': 1.0667651297779615e-05, 'epoch': 0.49} + 49%|████▉ | 4326/8750 [3:11:38<7:08:51, 5.82s/it] 49%|████▉ | 4327/8750 [3:11:47<7:06:28, 5.79s/it] 49%|████▉ | 4327/8750 [3:11:44<7:06:31, 5.79s/it] {'loss': 0.4572, 'learning_rate': 1.0663957858839843e-05, 'epoch': 0.49} + 49%|████▉ | 4327/8750 [3:11:47<7:06:28, 5.79s/it] {'loss': 0.4572, 'learning_rate': 1.0663957858839843e-05, 'epoch': 0.49} + 49%|████▉ | 4327/8750 [3:11:44<7:06:31, 5.79s/it] 49%|████▉ | 4328/8750 [3:11:53<7:03:41, 5.75s/it] 49%|████▉ | 4328/8750 [3:11:50<7:03:40, 5.75s/it] {'loss': 0.4593, 'learning_rate': 1.0660264328923024e-05, 'epoch': 0.49} + 49%|████▉ | 4328/8750 [3:11:53<7:03:41, 5.75s/it] {'loss': 0.4593, 'learning_rate': 1.0660264328923024e-05, 'epoch': 0.49} + 49%|████▉ | 4328/8750 [3:11:50<7:03:40, 5.75s/it] 49%|████▉ | 4329/8750 [3:11:58<7:01:52, 5.73s/it] 49%|████▉ | 4329/8750 [3:11:55<7:01:51, 5.73s/it] {'loss': 0.4559, 'learning_rate': 1.0656570708535248e-05, 
'epoch': 0.49} + 49%|████▉ | 4329/8750 [3:11:58<7:01:52, 5.73s/it] {'loss': 0.4559, 'learning_rate': 1.0656570708535248e-05, 'epoch': 0.49} + 49%|████▉ | 4329/8750 [3:11:55<7:01:51, 5.73s/it] 49%|████▉ | 4330/8750 [3:12:04<7:02:30, 5.74s/it] 49%|████▉ | 4330/8750 [3:12:01<7:02:29, 5.74s/it] {'loss': 0.4716, 'learning_rate': 1.0652876998182626e-05, 'epoch': 0.49} + 49%|████▉ | 4330/8750 [3:12:04<7:02:30, 5.74s/it] {'loss': 0.4716, 'learning_rate': 1.0652876998182626e-05, 'epoch': 0.49} + 49%|████▉ | 4330/8750 [3:12:01<7:02:29, 5.74s/it] 49%|████▉ | 4331/8750 [3:12:10<6:59:57, 5.70s/it] 49%|████▉ | 4331/8750 [3:12:07<6:59:57, 5.70s/it] {'loss': 0.5014, 'learning_rate': 1.064918319837128e-05, 'epoch': 0.49} + 49%|████▉ | 4331/8750 [3:12:10<6:59:57, 5.70s/it] {'loss': 0.5014, 'learning_rate': 1.064918319837128e-05, 'epoch': 0.49} + 49%|████▉ | 4331/8750 [3:12:07<6:59:57, 5.70s/it] 50%|████▉ | 4332/8750 [3:12:13<7:01:57, 5.73s/it] 50%|████▉ | 4332/8750 [3:12:15<7:01:58, 5.73s/it] {'loss': 0.4458, 'learning_rate': 1.0645489309607346e-05, 'epoch': 0.5} + 50%|████▉ | 4332/8750 [3:12:15<7:01:58, 5.73s/it] {'loss': 0.4458, 'learning_rate': 1.0645489309607346e-05, 'epoch': 0.5} + 50%|████▉ | 4332/8750 [3:12:13<7:01:57, 5.73s/it] 50%|████▉ | 4333/8750 [3:12:21<7:05:03, 5.77s/it] 50%|████▉ | 4333/8750 [3:12:18<7:05:02, 5.77s/it] {'loss': 0.4546, 'learning_rate': 1.064179533239696e-05, 'epoch': 0.5} + 50%|████▉ | 4333/8750 [3:12:21<7:05:03, 5.77s/it] {'loss': 0.4546, 'learning_rate': 1.064179533239696e-05, 'epoch': 0.5} + 50%|████▉ | 4333/8750 [3:12:18<7:05:02, 5.77s/it] 50%|████▉ | 4334/8750 [3:12:27<7:00:07, 5.71s/it] 50%|████▉ | 4334/8750 [3:12:24<7:00:07, 5.71s/it] {'loss': 0.5054, 'learning_rate': 1.0638101267246283e-05, 'epoch': 0.5} + 50%|████▉ | 4334/8750 [3:12:27<7:00:07, 5.71s/it] {'loss': 0.5054, 'learning_rate': 1.0638101267246283e-05, 'epoch': 0.5} + 50%|████▉ | 4334/8750 [3:12:24<7:00:07, 5.71s/it] 50%|████▉ | 4335/8750 [3:12:33<7:02:07, 5.74s/it] 50%|████▉ | 
4335/8750 [3:12:30<7:02:07, 5.74s/it] {'loss': 0.4734, 'learning_rate': 1.0634407114661492e-05, 'epoch': 0.5} + 50%|████▉ | 4335/8750 [3:12:33<7:02:07, 5.74s/it] {'loss': 0.4734, 'learning_rate': 1.0634407114661492e-05, 'epoch': 0.5} + 50%|████▉ | 4335/8750 [3:12:30<7:02:07, 5.74s/it] 50%|████▉ | 4336/8750 [3:12:38<6:59:42, 5.71s/it] 50%|████▉ | 4336/8750 [3:12:35<6:59:42, 5.71s/it] {'loss': 0.4572, 'learning_rate': 1.0630712875148758e-05, 'epoch': 0.5} + 50%|████▉ | 4336/8750 [3:12:38<6:59:42, 5.71s/it] {'loss': 0.4572, 'learning_rate': 1.0630712875148758e-05, 'epoch': 0.5} + 50%|████▉ | 4336/8750 [3:12:35<6:59:42, 5.71s/it] 50%|████▉ | 4337/8750 [3:12:44<6:59:06, 5.70s/it] 50%|████▉ | 4337/8750 [3:12:41<6:59:06, 5.70s/it] {'loss': 0.4573, 'learning_rate': 1.0627018549214284e-05, 'epoch': 0.5} + 50%|████▉ | 4337/8750 [3:12:44<6:59:06, 5.70s/it] {'loss': 0.4573, 'learning_rate': 1.0627018549214284e-05, 'epoch': 0.5} + 50%|████▉ | 4337/8750 [3:12:41<6:59:06, 5.70s/it] 50%|████▉ | 4338/8750 [3:12:50<7:08:00, 5.82s/it] 50%|████▉ | 4338/8750 [3:12:47<7:08:00, 5.82s/it] {'loss': 0.4595, 'learning_rate': 1.062332413736426e-05, 'epoch': 0.5} + 50%|████▉ | 4338/8750 [3:12:50<7:08:00, 5.82s/it] {'loss': 0.4595, 'learning_rate': 1.062332413736426e-05, 'epoch': 0.5} + 50%|████▉ | 4338/8750 [3:12:47<7:08:00, 5.82s/it] 50%|████▉ | 4339/8750 [3:12:56<7:07:14, 5.81s/it] 50%|████▉ | 4339/8750 [3:12:53<7:07:14, 5.81s/it] {'loss': 0.4774, 'learning_rate': 1.0619629640104921e-05, 'epoch': 0.5} + 50%|████▉ | 4339/8750 [3:12:56<7:07:14, 5.81s/it] {'loss': 0.4774, 'learning_rate': 1.0619629640104921e-05, 'epoch': 0.5} + 50%|████▉ | 4339/8750 [3:12:53<7:07:14, 5.81s/it] 50%|████▉ | 4340/8750 [3:13:02<7:03:56, 5.77s/it] 50%|████▉ | 4340/8750 [3:12:59<7:03:56, 5.77s/it] {'loss': 0.4731, 'learning_rate': 1.0615935057942485e-05, 'epoch': 0.5} + 50%|████▉ | 4340/8750 [3:13:02<7:03:56, 5.77s/it] {'loss': 0.4731, 'learning_rate': 1.0615935057942485e-05, 'epoch': 0.5} + 50%|████▉ | 4340/8750 
[3:12:59<7:03:56, 5.77s/it] 50%|████▉ | 4341/8750 [3:13:07<7:06:20, 5.80s/it] 50%|████▉ | 4341/8750 [3:13:05<7:06:20, 5.80s/it] {'loss': 0.4404, 'learning_rate': 1.0612240391383197e-05, 'epoch': 0.5} + 50%|████▉ | 4341/8750 [3:13:07<7:06:20, 5.80s/it] {'loss': 0.4404, 'learning_rate': 1.0612240391383197e-05, 'epoch': 0.5} + 50%|████▉ | 4341/8750 [3:13:05<7:06:20, 5.80s/it] 50%|████▉ | 4342/8750 [3:13:10<7:06:59, 5.81s/it] 50%|████▉ | 4342/8750 [3:13:13<7:06:59, 5.81s/it] {'loss': 0.4807, 'learning_rate': 1.0608545640933304e-05, 'epoch': 0.5} + 50%|████▉ | 4342/8750 [3:13:13<7:06:59, 5.81s/it] {'loss': 0.4807, 'learning_rate': 1.0608545640933304e-05, 'epoch': 0.5} + 50%|████▉ | 4342/8750 [3:13:10<7:06:59, 5.81s/it] 50%|████▉ | 4343/8750 [3:13:19<7:07:22, 5.82s/it] 50%|████▉ | 4343/8750 [3:13:16<7:07:22, 5.82s/it] {'loss': 0.4475, 'learning_rate': 1.060485080709907e-05, 'epoch': 0.5} + 50%|████▉ | 4343/8750 [3:13:19<7:07:22, 5.82s/it] {'loss': 0.4475, 'learning_rate': 1.060485080709907e-05, 'epoch': 0.5} + 50%|████▉ | 4343/8750 [3:13:16<7:07:22, 5.82s/it] 50%|████▉ | 4344/8750 [3:13:25<7:14:19, 5.91s/it] 50%|████▉ | 4344/8750 [3:13:22<7:14:19, 5.91s/it] {'loss': 0.468, 'learning_rate': 1.0601155890386771e-05, 'epoch': 0.5} + 50%|████▉ | 4344/8750 [3:13:25<7:14:19, 5.91s/it] {'loss': 0.468, 'learning_rate': 1.0601155890386771e-05, 'epoch': 0.5} + 50%|████▉ | 4344/8750 [3:13:22<7:14:19, 5.91s/it] 50%|████▉ | 4345/8750 [3:13:31<7:13:42, 5.91s/it] 50%|████▉ | 4345/8750 [3:13:28<7:13:42, 5.91s/it] {'loss': 0.4684, 'learning_rate': 1.05974608913027e-05, 'epoch': 0.5} + 50%|████▉ | 4345/8750 [3:13:31<7:13:42, 5.91s/it] {'loss': 0.4684, 'learning_rate': 1.05974608913027e-05, 'epoch': 0.5} + 50%|████▉ | 4345/8750 [3:13:28<7:13:42, 5.91s/it] 50%|████▉ | 4346/8750 [3:13:37<7:09:41, 5.85s/it] 50%|████▉ | 4346/8750 [3:13:34<7:09:41, 5.85s/it] {'loss': 0.4471, 'learning_rate': 1.0593765810353142e-05, 'epoch': 0.5} + 50%|████▉ | 4346/8750 [3:13:37<7:09:41, 5.85s/it] {'loss': 
0.4471, 'learning_rate': 1.0593765810353142e-05, 'epoch': 0.5} + 50%|████▉ | 4346/8750 [3:13:34<7:09:41, 5.85s/it] 50%|████▉ | 4347/8750 [3:13:40<7:15:32, 5.94s/it] 50%|████▉ | 4347/8750 [3:13:43<7:15:35, 5.94s/it] {'loss': 0.465, 'learning_rate': 1.0590070648044415e-05, 'epoch': 0.5} + 50%|████▉ | 4347/8750 [3:13:43<7:15:35, 5.94s/it] {'loss': 0.465, 'learning_rate': 1.0590070648044415e-05, 'epoch': 0.5} + 50%|████▉ | 4347/8750 [3:13:40<7:15:32, 5.94s/it] 50%|████▉ | 4348/8750 [3:13:49<7:11:51, 5.89s/it] 50%|████▉ | 4348/8750 [3:13:46<7:11:51, 5.89s/it] {'loss': 0.4646, 'learning_rate': 1.0586375404882832e-05, 'epoch': 0.5} + 50%|████▉ | 4348/8750 [3:13:49<7:11:51, 5.89s/it] {'loss': 0.4646, 'learning_rate': 1.0586375404882832e-05, 'epoch': 0.5} + 50%|████▉ | 4348/8750 [3:13:46<7:11:51, 5.89s/it] 50%|████▉ | 4349/8750 [3:13:54<7:03:57, 5.78s/it] 50%|████▉ | 4349/8750 [3:13:51<7:03:57, 5.78s/it] {'loss': 0.4695, 'learning_rate': 1.0582680081374728e-05, 'epoch': 0.5} + 50%|████▉ | 4349/8750 [3:13:54<7:03:57, 5.78s/it] {'loss': 0.4695, 'learning_rate': 1.0582680081374728e-05, 'epoch': 0.5} + 50%|████▉ | 4349/8750 [3:13:51<7:03:57, 5.78s/it]8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 50%|████▉ | 4350/8750 [3:14:00<6:59:40, 5.72s/it]15 AutoResumeHook: Checking whether to suspend... +1014 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... + 50%|████▉ | 4350/8750 [3:13:57<6:59:40, 5.72s/it]5 AutoResumeHook: Checking whether to suspend... 
+2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4684, 'learning_rate': 1.0578984678026445e-05, 'epoch': 0.5} + 50%|████▉ | 4350/8750 [3:14:00<6:59:40, 5.72s/it] {'loss': 0.4684, 'learning_rate': 1.0578984678026445e-05, 'epoch': 0.5} + 50%|████▉ | 4350/8750 [3:13:57<6:59:40, 5.72s/it] 50%|████▉ | 4351/8750 [3:14:06<6:58:57, 5.71s/it] 50%|████▉ | 4351/8750 [3:14:03<6:58:57, 5.71s/it] {'loss': 0.454, 'learning_rate': 1.0575289195344334e-05, 'epoch': 0.5} + 50%|████▉ | 4351/8750 [3:14:06<6:58:57, 5.71s/it] {'loss': 0.454, 'learning_rate': 1.0575289195344334e-05, 'epoch': 0.5} + 50%|████▉ | 4351/8750 [3:14:03<6:58:57, 5.71s/it] 50%|████▉ | 4352/8750 [3:14:11<6:59:16, 5.72s/it] 50%|████▉ | 4352/8750 [3:14:08<6:59:16, 5.72s/it] {'loss': 0.477, 'learning_rate': 1.0571593633834758e-05, 'epoch': 0.5} + 50%|████▉ | 4352/8750 [3:14:11<6:59:16, 5.72s/it] {'loss': 0.477, 'learning_rate': 1.0571593633834758e-05, 'epoch': 0.5} + 50%|████▉ | 4352/8750 [3:14:08<6:59:16, 5.72s/it] 50%|████▉ | 4353/8750 [3:14:17<7:04:36, 5.79s/it] 50%|████▉ | 4353/8750 [3:14:14<7:04:36, 5.79s/it] {'loss': 0.4632, 'learning_rate': 1.0567897994004093e-05, 'epoch': 0.5} + 50%|████▉ | 4353/8750 [3:14:17<7:04:36, 5.79s/it] {'loss': 0.4632, 'learning_rate': 1.0567897994004093e-05, 'epoch': 0.5} + 50%|████▉ | 4353/8750 [3:14:14<7:04:36, 5.79s/it] 50%|████▉ | 4354/8750 [3:14:23<7:04:21, 5.79s/it] 50%|████▉ | 4354/8750 [3:14:20<7:04:21, 5.79s/it] {'loss': 0.4516, 'learning_rate': 1.0564202276358726e-05, 'epoch': 0.5} + 50%|████▉ | 4354/8750 [3:14:23<7:04:21, 5.79s/it] {'loss': 0.4516, 'learning_rate': 1.0564202276358726e-05, 'epoch': 0.5} + 50%|████▉ | 4354/8750 [3:14:20<7:04:21, 5.79s/it] 50%|████▉ | 4355/8750 [3:14:29<7:09:39, 5.87s/it] 50%|████▉ | 4355/8750 [3:14:26<7:09:39, 5.87s/it] {'loss': 0.4679, 'learning_rate': 1.0560506481405048e-05, 'epoch': 0.5} + 50%|████▉ | 4355/8750 [3:14:29<7:09:39, 5.87s/it] {'loss': 0.4679, 'learning_rate': 
1.0560506481405048e-05, 'epoch': 0.5} + 50%|████▉ | 4355/8750 [3:14:26<7:09:39, 5.87s/it] 50%|████▉ | 4356/8750 [3:14:35<7:06:44, 5.83s/it] 50%|████▉ | 4356/8750 [3:14:32<7:06:44, 5.83s/it] {'loss': 0.4686, 'learning_rate': 1.0556810609649471e-05, 'epoch': 0.5} + 50%|████▉ | 4356/8750 [3:14:35<7:06:44, 5.83s/it] {'loss': 0.4686, 'learning_rate': 1.0556810609649471e-05, 'epoch': 0.5} + 50%|████▉ | 4356/8750 [3:14:32<7:06:44, 5.83s/it] 50%|████▉ | 4357/8750 [3:14:38<7:02:53, 5.78s/it] 50%|████▉ | 4357/8750 [3:14:40<7:02:54, 5.78s/it] {'loss': 0.4695, 'learning_rate': 1.0553114661598406e-05, 'epoch': 0.5} + 50%|████▉ | 4357/8750 [3:14:38<7:02:53, 5.78s/it]{'loss': 0.4695, 'learning_rate': 1.0553114661598406e-05, 'epoch': 0.5} + 50%|████▉ | 4357/8750 [3:14:40<7:02:54, 5.78s/it] 50%|████▉ | 4358/8750 [3:14:46<7:02:50, 5.78s/it] 50%|████▉ | 4358/8750 [3:14:43<7:02:50, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.0549418637758284e-05, 'epoch': 0.5} + 50%|████▉ | 4358/8750 [3:14:46<7:02:50, 5.78s/it] {'loss': 0.4569, 'learning_rate': 1.0549418637758284e-05, 'epoch': 0.5} + 50%|████▉ | 4358/8750 [3:14:43<7:02:50, 5.78s/it] 50%|████▉ | 4359/8750 [3:14:52<7:01:17, 5.76s/it] 50%|████▉ | 4359/8750 [3:14:49<7:01:18, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.0545722538635544e-05, 'epoch': 0.5} + 50%|████▉ | 4359/8750 [3:14:52<7:01:17, 5.76s/it] {'loss': 0.4721, 'learning_rate': 1.0545722538635544e-05, 'epoch': 0.5} + 50%|████▉ | 4359/8750 [3:14:49<7:01:18, 5.76s/it] 50%|████▉ | 4360/8750 [3:14:58<6:59:12, 5.73s/it] 50%|████▉ | 4360/8750 [3:14:55<6:59:12, 5.73s/it] {'loss': 0.4599, 'learning_rate': 1.054202636473663e-05, 'epoch': 0.5} + {'loss': 0.4599, 'learning_rate': 1.054202636473663e-05, 'epoch': 0.5} 50%|████▉ | 4360/8750 [3:14:58<6:59:12, 5.73s/it] + 50%|████▉ | 4360/8750 [3:14:55<6:59:12, 5.73s/it] 50%|████▉ | 4361/8750 [3:15:04<7:05:39, 5.82s/it] 50%|████▉ | 4361/8750 [3:15:01<7:05:39, 5.82s/it] {'loss': 0.4584, 'learning_rate': 1.0538330116568006e-05, 'epoch': 0.5} 
+ 50%|████▉ | 4361/8750 [3:15:04<7:05:39, 5.82s/it] {'loss': 0.4584, 'learning_rate': 1.0538330116568006e-05, 'epoch': 0.5} + 50%|████▉ | 4361/8750 [3:15:01<7:05:39, 5.82s/it] 50%|████▉ | 4362/8750 [3:15:09<7:02:46, 5.78s/it] 50%|████▉ | 4362/8750 [3:15:06<7:02:46, 5.78s/it] {'loss': 0.469, 'learning_rate': 1.0534633794636134e-05, 'epoch': 0.5} + 50%|████▉ | 4362/8750 [3:15:09<7:02:46, 5.78s/it] {'loss': 0.469, 'learning_rate': 1.0534633794636134e-05, 'epoch': 0.5} + 50%|████▉ | 4362/8750 [3:15:06<7:02:46, 5.78s/it] 50%|████▉ | 4363/8750 [3:15:15<7:08:09, 5.86s/it] 50%|████▉ | 4363/8750 [3:15:12<7:08:09, 5.86s/it] {'loss': 0.4706, 'learning_rate': 1.0530937399447496e-05, 'epoch': 0.5} + 50%|████▉ | 4363/8750 [3:15:15<7:08:09, 5.86s/it] {'loss': 0.4706, 'learning_rate': 1.0530937399447496e-05, 'epoch': 0.5} + 50%|████▉ | 4363/8750 [3:15:12<7:08:09, 5.86s/it] 50%|████▉ | 4364/8750 [3:15:21<7:03:28, 5.79s/it] 50%|████▉ | 4364/8750 [3:15:18<7:03:28, 5.79s/it] {'loss': 0.4607, 'learning_rate': 1.0527240931508582e-05, 'epoch': 0.5} + 50%|████▉ | 4364/8750 [3:15:21<7:03:28, 5.79s/it] {'loss': 0.4607, 'learning_rate': 1.0527240931508582e-05, 'epoch': 0.5} + 50%|████▉ | 4364/8750 [3:15:18<7:03:28, 5.79s/it] 50%|████▉ | 4365/8750 [3:15:27<7:00:41, 5.76s/it] 50%|████▉ | 4365/8750 [3:15:24<7:00:41, 5.76s/it] {'loss': 0.4766, 'learning_rate': 1.0523544391325888e-05, 'epoch': 0.5} + 50%|████▉ | 4365/8750 [3:15:27<7:00:41, 5.76s/it] {'loss': 0.4766, 'learning_rate': 1.0523544391325888e-05, 'epoch': 0.5} + 50%|████▉ | 4365/8750 [3:15:24<7:00:41, 5.76s/it] 50%|████▉ | 4366/8750 [3:15:33<7:07:06, 5.85s/it] 50%|████▉ | 4366/8750 [3:15:30<7:07:06, 5.85s/it] {'loss': 0.4552, 'learning_rate': 1.0519847779405926e-05, 'epoch': 0.5} + 50%|████▉ | 4366/8750 [3:15:33<7:07:06, 5.85s/it] {'loss': 0.4552, 'learning_rate': 1.0519847779405926e-05, 'epoch': 0.5} + 50%|████▉ | 4366/8750 [3:15:30<7:07:06, 5.85s/it] 50%|████▉ | 4367/8750 [3:15:39<7:07:35, 5.85s/it] 50%|████▉ | 4367/8750 
[3:15:36<7:07:34, 5.85s/it] {'loss': 0.4917, 'learning_rate': 1.051615109625521e-05, 'epoch': 0.5} + 50%|████▉ | 4367/8750 [3:15:39<7:07:35, 5.85s/it] {'loss': 0.4917, 'learning_rate': 1.051615109625521e-05, 'epoch': 0.5} + 50%|████▉ | 4367/8750 [3:15:36<7:07:34, 5.85s/it] 50%|████▉ | 4368/8750 [3:15:44<7:06:32, 5.84s/it] 50%|████▉ | 4368/8750 [3:15:42<7:06:32, 5.84s/it] {'loss': 0.4614, 'learning_rate': 1.0512454342380269e-05, 'epoch': 0.5} + 50%|████▉ | 4368/8750 [3:15:44<7:06:32, 5.84s/it] {'loss': 0.4614, 'learning_rate': 1.0512454342380269e-05, 'epoch': 0.5} + 50%|████▉ | 4368/8750 [3:15:42<7:06:32, 5.84s/it] 50%|████▉ | 4369/8750 [3:15:50<7:08:27, 5.87s/it] 50%|████▉ | 4369/8750 [3:15:47<7:08:27, 5.87s/it] {'loss': 0.453, 'learning_rate': 1.0508757518287642e-05, 'epoch': 0.5} + 50%|████▉ | 4369/8750 [3:15:50<7:08:27, 5.87s/it] {'loss': 0.453, 'learning_rate': 1.0508757518287642e-05, 'epoch': 0.5} + 50%|████▉ | 4369/8750 [3:15:47<7:08:27, 5.87s/it] 50%|████▉ | 4370/8750 [3:15:56<7:07:31, 5.86s/it] 50%|████▉ | 4370/8750 [3:15:53<7:07:31, 5.86s/it] {'loss': 0.4705, 'learning_rate': 1.0505060624483878e-05, 'epoch': 0.5} + 50%|████▉ | 4370/8750 [3:15:56<7:07:31, 5.86s/it] {'loss': 0.4705, 'learning_rate': 1.0505060624483878e-05, 'epoch': 0.5} + 50%|████▉ | 4370/8750 [3:15:53<7:07:31, 5.86s/it] 50%|████▉ | 4371/8750 [3:16:02<7:07:53, 5.86s/it] 50%|████▉ | 4371/8750 [3:15:59<7:07:53, 5.86s/it] {'loss': 0.4748, 'learning_rate': 1.0501363661475533e-05, 'epoch': 0.5} + 50%|████▉ | 4371/8750 [3:16:02<7:07:53, 5.86s/it] {'loss': 0.4748, 'learning_rate': 1.0501363661475533e-05, 'epoch': 0.5} + 50%|████▉ | 4371/8750 [3:15:59<7:07:53, 5.86s/it] 50%|████▉ | 4372/8750 [3:16:08<7:09:50, 5.89s/it] 50%|████▉ | 4372/8750 [3:16:05<7:09:50, 5.89s/it] {'loss': 0.4521, 'learning_rate': 1.0497666629769172e-05, 'epoch': 0.5} + 50%|████▉ | 4372/8750 [3:16:08<7:09:50, 5.89s/it] {'loss': 0.4521, 'learning_rate': 1.0497666629769172e-05, 'epoch': 0.5} + 50%|████▉ | 4372/8750 
[3:16:05<7:09:50, 5.89s/it] 50%|████▉ | 4373/8750 [3:16:11<7:06:22, 5.84s/it] 50%|████▉ | 4373/8750 [3:16:14<7:06:22, 5.84s/it] {'loss': 0.463, 'learning_rate': 1.049396952987137e-05, 'epoch': 0.5} + {'loss': 0.463, 'learning_rate': 1.049396952987137e-05, 'epoch': 0.5} 50%|████▉ | 4373/8750 [3:16:14<7:06:22, 5.84s/it] + 50%|████▉ | 4373/8750 [3:16:11<7:06:22, 5.84s/it] 50%|████▉ | 4374/8750 [3:16:20<7:07:49, 5.87s/it] 50%|████▉ | 4374/8750 [3:16:17<7:07:49, 5.87s/it] {'loss': 0.4574, 'learning_rate': 1.0490272362288716e-05, 'epoch': 0.5} + 50%|████▉ | 4374/8750 [3:16:20<7:07:49, 5.87s/it] {'loss': 0.4574, 'learning_rate': 1.0490272362288716e-05, 'epoch': 0.5} + 50%|████▉ | 4374/8750 [3:16:17<7:07:49, 5.87s/it] 50%|█████ | 4375/8750 [3:16:25<6:58:22, 5.74s/it] 50%|█████ | 4375/8750 [3:16:22<6:58:22, 5.74s/it] {'loss': 0.5077, 'learning_rate': 1.0486575127527802e-05, 'epoch': 0.5} + 50%|█████ | 4375/8750 [3:16:25<6:58:22, 5.74s/it] {'loss': 0.5077, 'learning_rate': 1.0486575127527802e-05, 'epoch': 0.5} + 50%|█████ | 4375/8750 [3:16:22<6:58:22, 5.74s/it] 50%|█████ | 4376/8750 [3:16:31<6:58:58, 5.75s/it] 50%|█████ | 4376/8750 [3:16:28<6:58:58, 5.75s/it] {'loss': 0.4699, 'learning_rate': 1.0482877826095233e-05, 'epoch': 0.5} + 50%|█████ | 4376/8750 [3:16:31<6:58:58, 5.75s/it] {'loss': 0.4699, 'learning_rate': 1.0482877826095233e-05, 'epoch': 0.5} + 50%|█████ | 4376/8750 [3:16:28<6:58:58, 5.75s/it] 50%|█████ | 4377/8750 [3:16:37<6:58:33, 5.74s/it] 50%|█████ | 4377/8750 [3:16:34<6:58:33, 5.74s/it] {'loss': 0.4626, 'learning_rate': 1.047918045849762e-05, 'epoch': 0.5} + 50%|█████ | 4377/8750 [3:16:37<6:58:33, 5.74s/it] {'loss': 0.4626, 'learning_rate': 1.047918045849762e-05, 'epoch': 0.5} + 50%|█████ | 4377/8750 [3:16:34<6:58:33, 5.74s/it] 50%|█████ | 4378/8750 [3:16:39<6:56:02, 5.71s/it] 50%|█████ | 4378/8750 [3:16:42<6:56:03, 5.71s/it] {'loss': 0.4677, 'learning_rate': 1.0475483025241587e-05, 'epoch': 0.5} + 50%|█████ | 4378/8750 [3:16:42<6:56:03, 5.71s/it] {'loss': 
0.4677, 'learning_rate': 1.0475483025241587e-05, 'epoch': 0.5} + 50%|█████ | 4378/8750 [3:16:39<6:56:02, 5.71s/it] 50%|█████ | 4379/8750 [3:16:45<6:59:38, 5.76s/it] 50%|█████ | 4379/8750 [3:16:48<6:59:38, 5.76s/it] {'loss': 0.4496, 'learning_rate': 1.0471785526833762e-05, 'epoch': 0.5} + 50%|█████ | 4379/8750 [3:16:48<6:59:38, 5.76s/it] {'loss': 0.4496, 'learning_rate': 1.0471785526833762e-05, 'epoch': 0.5} + 50%|█████ | 4379/8750 [3:16:45<6:59:38, 5.76s/it] 50%|█████ | 4380/8750 [3:16:54<7:03:12, 5.81s/it] 50%|█████ | 4380/8750 [3:16:51<7:03:12, 5.81s/it] {'loss': 0.4818, 'learning_rate': 1.046808796378079e-05, 'epoch': 0.5} + 50%|█████ | 4380/8750 [3:16:54<7:03:12, 5.81s/it] {'loss': 0.4818, 'learning_rate': 1.046808796378079e-05, 'epoch': 0.5} + 50%|█████ | 4380/8750 [3:16:51<7:03:12, 5.81s/it] 50%|█████ | 4381/8750 [3:17:00<7:05:10, 5.84s/it] 50%|█████ | 4381/8750 [3:16:57<7:05:10, 5.84s/it] {'loss': 0.4648, 'learning_rate': 1.0464390336589311e-05, 'epoch': 0.5} + 50%|█████ | 4381/8750 [3:17:00<7:05:10, 5.84s/it] {'loss': 0.4648, 'learning_rate': 1.0464390336589311e-05, 'epoch': 0.5} + 50%|█████ | 4381/8750 [3:16:57<7:05:10, 5.84s/it] 50%|█████ | 4382/8750 [3:17:06<7:00:26, 5.78s/it] 50%|█████ | 4382/8750 [3:17:03<7:00:27, 5.78s/it] {'loss': 0.4639, 'learning_rate': 1.046069264576599e-05, 'epoch': 0.5} + 50%|█████ | 4382/8750 [3:17:06<7:00:26, 5.78s/it] {'loss': 0.4639, 'learning_rate': 1.046069264576599e-05, 'epoch': 0.5} + 50%|█████ | 4382/8750 [3:17:03<7:00:27, 5.78s/it] 50%|█████ | 4383/8750 [3:17:11<6:56:04, 5.72s/it] 50%|█████ | 4383/8750 [3:17:08<6:56:04, 5.72s/it] {'loss': 0.501, 'learning_rate': 1.0456994891817492e-05, 'epoch': 0.5} + 50%|█████ | 4383/8750 [3:17:11<6:56:04, 5.72s/it] {'loss': 0.501, 'learning_rate': 1.0456994891817492e-05, 'epoch': 0.5} + 50%|█████ | 4383/8750 [3:17:08<6:56:04, 5.72s/it] 50%|█████ | 4384/8750 [3:17:17<6:57:29, 5.74s/it] 50%|█████ | 4384/8750 [3:17:14<6:57:28, 5.74s/it] {'loss': 0.4538, 'learning_rate': 
1.045329707525049e-05, 'epoch': 0.5} + 50%|█████ | 4384/8750 [3:17:17<6:57:29, 5.74s/it] {'loss': 0.4538, 'learning_rate': 1.045329707525049e-05, 'epoch': 0.5} + 50%|█████ | 4384/8750 [3:17:14<6:57:28, 5.74s/it] 50%|█████ | 4385/8750 [3:17:23<6:56:13, 5.72s/it] 50%|█████ | 4385/8750 [3:17:20<6:56:13, 5.72s/it] {'loss': 0.4661, 'learning_rate': 1.0449599196571671e-05, 'epoch': 0.5} + 50%|█████ | 4385/8750 [3:17:23<6:56:13, 5.72s/it] {'loss': 0.4661, 'learning_rate': 1.0449599196571671e-05, 'epoch': 0.5} + 50%|█████ | 4385/8750 [3:17:20<6:56:13, 5.72s/it] 50%|█████ | 4386/8750 [3:17:28<6:58:41, 5.76s/it] 50%|█████ | 4386/8750 [3:17:26<6:58:41, 5.76s/it] {'loss': 0.4508, 'learning_rate': 1.044590125628772e-05, 'epoch': 0.5} + 50%|█████ | 4386/8750 [3:17:28<6:58:41, 5.76s/it] {'loss': 0.4508, 'learning_rate': 1.044590125628772e-05, 'epoch': 0.5} + 50%|█████ | 4386/8750 [3:17:26<6:58:41, 5.76s/it] 50%|█████ | 4387/8750 [3:17:34<7:03:36, 5.83s/it] 50%|█████ | 4387/8750 [3:17:32<7:03:36, 5.83s/it] {'loss': 0.4545, 'learning_rate': 1.0442203254905346e-05, 'epoch': 0.5} + 50%|█████ | 4387/8750 [3:17:34<7:03:36, 5.83s/it] {'loss': 0.4545, 'learning_rate': 1.0442203254905346e-05, 'epoch': 0.5} + 50%|█████ | 4387/8750 [3:17:32<7:03:36, 5.83s/it] 50%|█████ | 4388/8750 [3:17:40<7:03:53, 5.83s/it] 50%|█████ | 4388/8750 [3:17:37<7:03:53, 5.83s/it] {'loss': 0.4552, 'learning_rate': 1.043850519293125e-05, 'epoch': 0.5} + 50%|█████ | 4388/8750 [3:17:40<7:03:53, 5.83s/it] {'loss': 0.4552, 'learning_rate': 1.043850519293125e-05, 'epoch': 0.5} + 50%|█████ | 4388/8750 [3:17:37<7:03:53, 5.83s/it] 50%|█████ | 4389/8750 [3:17:46<7:00:34, 5.79s/it] 50%|█████ | 4389/8750 [3:17:43<7:00:34, 5.79s/it] {'loss': 0.4872, 'learning_rate': 1.0434807070872154e-05, 'epoch': 0.5} + 50%|█████ | 4389/8750 [3:17:46<7:00:34, 5.79s/it] {'loss': 0.4872, 'learning_rate': 1.0434807070872154e-05, 'epoch': 0.5} + 50%|█████ | 4389/8750 [3:17:43<7:00:34, 5.79s/it] 50%|█████ | 4390/8750 [3:17:52<7:01:40, 5.80s/it] 
50%|█████ | 4390/8750 [3:17:49<7:01:40, 5.80s/it] {'loss': 0.4596, 'learning_rate': 1.0431108889234783e-05, 'epoch': 0.5} + 50%|█████ | 4390/8750 [3:17:52<7:01:40, 5.80s/it] {'loss': 0.4596, 'learning_rate': 1.0431108889234783e-05, 'epoch': 0.5} + 50%|█████ | 4390/8750 [3:17:49<7:01:40, 5.80s/it] 50%|█████ | 4391/8750 [3:17:58<6:59:09, 5.77s/it] 50%|█████ | 4391/8750 [3:17:55<6:59:09, 5.77s/it] {'loss': 0.4589, 'learning_rate': 1.0427410648525863e-05, 'epoch': 0.5} + 50%|█████ | 4391/8750 [3:17:58<6:59:09, 5.77s/it] {'loss': 0.4589, 'learning_rate': 1.0427410648525863e-05, 'epoch': 0.5} + 50%|█████ | 4391/8750 [3:17:55<6:59:09, 5.77s/it] 50%|█████ | 4392/8750 [3:18:03<6:57:29, 5.75s/it] 50%|█████ | 4392/8750 [3:18:00<6:57:30, 5.75s/it] {'loss': 0.4557, 'learning_rate': 1.0423712349252148e-05, 'epoch': 0.5} + 50%|█████ | 4392/8750 [3:18:03<6:57:29, 5.75s/it] {'loss': 0.4557, 'learning_rate': 1.0423712349252148e-05, 'epoch': 0.5} + 50%|█████ | 4392/8750 [3:18:00<6:57:30, 5.75s/it] 50%|█████ | 4393/8750 [3:18:09<6:54:40, 5.71s/it] 50%|█████ | 4393/8750 [3:18:06<6:54:40, 5.71s/it] {'loss': 0.4582, 'learning_rate': 1.0420013991920382e-05, 'epoch': 0.5} + 50%|█████ | 4393/8750 [3:18:09<6:54:40, 5.71s/it] {'loss': 0.4582, 'learning_rate': 1.0420013991920382e-05, 'epoch': 0.5} + 50%|█████ | 4393/8750 [3:18:06<6:54:40, 5.71s/it] 50%|█████ | 4394/8750 [3:18:15<6:57:21, 5.75s/it] 50%|█████ | 4394/8750 [3:18:12<6:57:21, 5.75s/it] {'loss': 0.4466, 'learning_rate': 1.041631557703732e-05, 'epoch': 0.5} + 50%|█████ | 4394/8750 [3:18:15<6:57:21, 5.75s/it] {'loss': 0.4466, 'learning_rate': 1.041631557703732e-05, 'epoch': 0.5} + 50%|█████ | 4394/8750 [3:18:12<6:57:21, 5.75s/it] 50%|█████ | 4395/8750 [3:18:21<6:58:35, 5.77s/it] 50%|█████ | 4395/8750 [3:18:18<6:58:36, 5.77s/it] {'loss': 0.4584, 'learning_rate': 1.0412617105109725e-05, 'epoch': 0.5} + 50%|█████ | 4395/8750 [3:18:21<6:58:35, 5.77s/it] {'loss': 0.4584, 'learning_rate': 1.0412617105109725e-05, 'epoch': 0.5} + 50%|█████ | 
4395/8750 [3:18:18<6:58:36, 5.77s/it] 50%|█████ | 4396/8750 [3:18:26<6:57:44, 5.76s/it] 50%|█████ | 4396/8750 [3:18:23<6:57:44, 5.76s/it] {'loss': 0.4645, 'learning_rate': 1.0408918576644378e-05, 'epoch': 0.5} + 50%|█████ | 4396/8750 [3:18:26<6:57:44, 5.76s/it] {'loss': 0.4645, 'learning_rate': 1.0408918576644378e-05, 'epoch': 0.5} + 50%|█████ | 4396/8750 [3:18:23<6:57:44, 5.76s/it] 50%|█████ | 4397/8750 [3:18:32<6:55:55, 5.73s/it] 50%|█████ | 4397/8750 [3:18:29<6:55:55, 5.73s/it] {'loss': 0.4663, 'learning_rate': 1.0405219992148057e-05, 'epoch': 0.5} + 50%|█████ | 4397/8750 [3:18:32<6:55:55, 5.73s/it] {'loss': 0.4663, 'learning_rate': 1.0405219992148057e-05, 'epoch': 0.5} + 50%|█████ | 4397/8750 [3:18:29<6:55:55, 5.73s/it] 50%|█████ | 4398/8750 [3:18:38<6:59:56, 5.79s/it] 50%|█████ | 4398/8750 [3:18:35<6:59:56, 5.79s/it] {'loss': 0.4447, 'learning_rate': 1.040152135212755e-05, 'epoch': 0.5} + 50%|█████ | 4398/8750 [3:18:38<6:59:56, 5.79s/it] {'loss': 0.4447, 'learning_rate': 1.040152135212755e-05, 'epoch': 0.5} + 50%|█████ | 4398/8750 [3:18:35<6:59:56, 5.79s/it] 50%|█████ | 4399/8750 [3:18:44<6:57:40, 5.76s/it] 50%|█████ | 4399/8750 [3:18:41<6:57:40, 5.76s/it] {'loss': 0.4731, 'learning_rate': 1.0397822657089653e-05, 'epoch': 0.5} + 50%|█████ | 4399/8750 [3:18:44<6:57:40, 5.76s/it] {'loss': 0.4731, 'learning_rate': 1.0397822657089653e-05, 'epoch': 0.5} + 50%|█████ | 4399/8750 [3:18:41<6:57:40, 5.76s/it]8 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 50%|█████ | 4400/8750 [3:18:49<6:59:13, 5.78s/it]12 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +60 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend... 
+AutoResumeHook: Checking whether to suspend...13 AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... + 50%|█████ | 4400/8750 [3:18:46<6:59:14, 5.78s/it]15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4579, 'learning_rate': 1.039412390754117e-05, 'epoch': 0.5} + 50%|█████ | 4400/8750 [3:18:49<6:59:13, 5.78s/it] {'loss': 0.4579, 'learning_rate': 1.039412390754117e-05, 'epoch': 0.5} + 50%|█████ | 4400/8750 [3:18:46<6:59:14, 5.78s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 50%|█████ | 4401/8750 [3:19:09<12:04:38, 10.00s/it] 50%|█████ | 4401/8750 [3:19:06<12:04:38, 10.00s/it] {'loss': 0.4788, 'learning_rate': 1.039042510398891e-05, 'epoch': 0.5} + 50%|█████ | 4401/8750 [3:19:09<12:04:38, 10.00s/it] {'loss': 0.4788, 'learning_rate': 1.039042510398891e-05, 'epoch': 0.5} + 50%|█████ | 4401/8750 [3:19:06<12:04:38, 10.00s/it] 50%|█████ | 4402/8750 [3:19:15<10:32:59, 8.73s/it] 50%|█████ | 4402/8750 [3:19:12<10:32:59, 8.73s/it] {'loss': 0.473, 'learning_rate': 1.03867262469397e-05, 'epoch': 0.5} + 50%|█████ | 4402/8750 [3:19:15<10:32:59, 8.73s/it] {'loss': 0.473, 'learning_rate': 1.03867262469397e-05, 'epoch': 0.5} + 50%|█████ | 4402/8750 [3:19:12<10:32:59, 8.73s/it] 50%|█████ | 4403/8750 [3:19:21<9:30:02, 7.87s/it] 50%|█████ | 4403/8750 [3:19:18<9:30:03, 7.87s/it] {'loss': 0.4693, 'learning_rate': 1.0383027336900356e-05, 'epoch': 0.5} + 50%|█████ | 4403/8750 [3:19:21<9:30:02, 7.87s/it] {'loss': 0.4693, 'learning_rate': 1.0383027336900356e-05, 'epoch': 0.5} + 50%|█████ | 4403/8750 [3:19:18<9:30:03, 7.87s/it] 50%|█████ | 4404/8750 [3:19:27<8:46:23, 7.27s/it] 50%|█████ | 4404/8750 [3:19:24<8:46:24, 7.27s/it] {'loss': 0.4546, 'learning_rate': 1.0379328374377715e-05, 'epoch': 0.5} + 50%|█████ | 4404/8750 [3:19:27<8:46:23, 7.27s/it] {'loss': 0.4546, 'learning_rate': 1.0379328374377715e-05, 'epoch': 0.5} + 50%|█████ | 4404/8750 [3:19:24<8:46:24, 7.27s/it] 50%|█████ | 4405/8750 [3:19:32<8:13:22, 6.81s/it] 50%|█████ | 4405/8750 [3:19:30<8:13:22, 6.81s/it] {'loss': 0.4708, 'learning_rate': 1.0375629359878616e-05, 'epoch': 0.5} + 50%|█████ | 4405/8750 [3:19:32<8:13:22, 6.81s/it] {'loss': 0.4708, 'learning_rate': 1.0375629359878616e-05, 'epoch': 0.5} + 50%|█████ | 4405/8750 [3:19:30<8:13:22, 6.81s/it] 50%|█████ | 4406/8750 [3:19:38<7:50:56, 6.50s/it] 50%|█████ | 4406/8750 [3:19:35<7:50:56, 6.50s/it] {'loss': 0.4519, 'learning_rate': 1.0371930293909911e-05, 'epoch': 0.5} + 50%|█████ | 4406/8750 [3:19:38<7:50:56, 6.50s/it] {'loss': 
0.4519, 'learning_rate': 1.0371930293909911e-05, 'epoch': 0.5} + 50%|█████ | 4406/8750 [3:19:35<7:50:56, 6.50s/it] 50%|█████ | 4407/8750 [3:19:44<7:35:47, 6.30s/it] 50%|█████ | 4407/8750 [3:19:41<7:35:47, 6.30s/it] {'loss': 0.4842, 'learning_rate': 1.0368231176978454e-05, 'epoch': 0.5} + 50%|█████ | 4407/8750 [3:19:44<7:35:47, 6.30s/it] {'loss': 0.4842, 'learning_rate': 1.0368231176978454e-05, 'epoch': 0.5} + 50%|█████ | 4407/8750 [3:19:41<7:35:47, 6.30s/it] 50%|█████ | 4408/8750 [3:19:50<7:27:22, 6.18s/it] 50%|█████ | 4408/8750 [3:19:47<7:27:22, 6.18s/it] {'loss': 0.4637, 'learning_rate': 1.0364532009591101e-05, 'epoch': 0.5} + 50%|█████ | 4408/8750 [3:19:50<7:27:22, 6.18s/it] {'loss': 0.4637, 'learning_rate': 1.0364532009591101e-05, 'epoch': 0.5} + 50%|█████ | 4408/8750 [3:19:47<7:27:22, 6.18s/it] 50%|█████ | 4409/8750 [3:19:56<7:21:29, 6.10s/it] 50%|█████ | 4409/8750 [3:19:53<7:21:29, 6.10s/it] {'loss': 0.4791, 'learning_rate': 1.0360832792254727e-05, 'epoch': 0.5} + 50%|█████ | 4409/8750 [3:19:56<7:21:29, 6.10s/it] {'loss': 0.4791, 'learning_rate': 1.0360832792254727e-05, 'epoch': 0.5} + 50%|█████ | 4409/8750 [3:19:53<7:21:29, 6.10s/it] 50%|█████ | 4410/8750 [3:20:02<7:14:56, 6.01s/it] 50%|█████ | 4410/8750 [3:19:59<7:14:57, 6.01s/it] {'loss': 0.4545, 'learning_rate': 1.03571335254762e-05, 'epoch': 0.5} + 50%|█████ | 4410/8750 [3:20:02<7:14:56, 6.01s/it] {'loss': 0.4545, 'learning_rate': 1.03571335254762e-05, 'epoch': 0.5} + 50%|█████ | 4410/8750 [3:19:59<7:14:57, 6.01s/it] 50%|█████ | 4411/8750 [3:20:07<7:08:51, 5.93s/it] 50%|█████ | 4411/8750 [3:20:05<7:08:51, 5.93s/it] {'loss': 0.4852, 'learning_rate': 1.0353434209762412e-05, 'epoch': 0.5} + 50%|█████ | 4411/8750 [3:20:07<7:08:51, 5.93s/it] {'loss': 0.4852, 'learning_rate': 1.0353434209762412e-05, 'epoch': 0.5} + 50%|█████ | 4411/8750 [3:20:05<7:08:51, 5.93s/it] 50%|█████ | 4412/8750 [3:20:13<7:10:02, 5.95s/it] 50%|█████ | 4412/8750 [3:20:10<7:10:02, 5.95s/it] {'loss': 0.4396, 'learning_rate': 
1.0349734845620244e-05, 'epoch': 0.5} + 50%|█████ | 4412/8750 [3:20:13<7:10:02, 5.95s/it] {'loss': 0.4396, 'learning_rate': 1.0349734845620244e-05, 'epoch': 0.5} + 50%|█████ | 4412/8750 [3:20:10<7:10:02, 5.95s/it] 50%|█████ | 4413/8750 [3:20:19<7:12:26, 5.98s/it] 50%|█████ | 4413/8750 [3:20:17<7:12:26, 5.98s/it] {'loss': 0.4579, 'learning_rate': 1.0346035433556594e-05, 'epoch': 0.5} + 50%|█████ | 4413/8750 [3:20:19<7:12:26, 5.98s/it] {'loss': 0.4579, 'learning_rate': 1.0346035433556594e-05, 'epoch': 0.5} + 50%|█████ | 4413/8750 [3:20:17<7:12:26, 5.98s/it] 50%|█████ | 4414/8750 [3:20:25<7:05:43, 5.89s/it] 50%|█████ | 4414/8750 [3:20:22<7:05:43, 5.89s/it] {'loss': 0.4664, 'learning_rate': 1.0342335974078364e-05, 'epoch': 0.5} + 50%|█████ | 4414/8750 [3:20:25<7:05:43, 5.89s/it] {'loss': 0.4664, 'learning_rate': 1.0342335974078364e-05, 'epoch': 0.5} + 50%|█████ | 4414/8750 [3:20:22<7:05:43, 5.89s/it] 50%|█████ | 4415/8750 [3:20:31<6:58:06, 5.79s/it] 50%|█████ | 4415/8750 [3:20:28<6:58:07, 5.79s/it] {'loss': 0.4647, 'learning_rate': 1.0338636467692462e-05, 'epoch': 0.5} + 50%|█████ | 4415/8750 [3:20:31<6:58:06, 5.79s/it] {'loss': 0.4647, 'learning_rate': 1.0338636467692462e-05, 'epoch': 0.5} + 50%|█████ | 4415/8750 [3:20:28<6:58:07, 5.79s/it] 50%|█████ | 4416/8750 [3:20:36<6:54:25, 5.74s/it] 50%|█████ | 4416/8750 [3:20:33<6:54:25, 5.74s/it] {'loss': 0.4764, 'learning_rate': 1.0334936914905802e-05, 'epoch': 0.5} + 50%|█████ | 4416/8750 [3:20:36<6:54:25, 5.74s/it] {'loss': 0.4764, 'learning_rate': 1.0334936914905802e-05, 'epoch': 0.5} + 50%|█████ | 4416/8750 [3:20:33<6:54:25, 5.74s/it] 50%|█████ | 4417/8750 [3:20:42<6:54:43, 5.74s/it] 50%|█████ | 4417/8750 [3:20:39<6:54:43, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.0331237316225309e-05, 'epoch': 0.5} + 50%|█████ | 4417/8750 [3:20:42<6:54:43, 5.74s/it] {'loss': 0.4702, 'learning_rate': 1.0331237316225309e-05, 'epoch': 0.5} + 50%|█████ | 4417/8750 [3:20:39<6:54:43, 5.74s/it] 50%|█████ | 4418/8750 [3:20:48<7:02:15, 
5.85s/it] 50%|█████ | 4418/8750 [3:20:45<7:02:14, 5.85s/it] {'loss': 0.4681, 'learning_rate': 1.0327537672157908e-05, 'epoch': 0.5} + 50%|█████ | 4418/8750 [3:20:48<7:02:15, 5.85s/it] {'loss': 0.4681, 'learning_rate': 1.0327537672157908e-05, 'epoch': 0.5} + 50%|█████ | 4418/8750 [3:20:45<7:02:14, 5.85s/it] 51%|█████ | 4419/8750 [3:20:54<6:57:41, 5.79s/it] 51%|█████ | 4419/8750 [3:20:51<6:57:41, 5.79s/it] {'loss': 0.4651, 'learning_rate': 1.0323837983210535e-05, 'epoch': 0.51} + 51%|█████ | 4419/8750 [3:20:54<6:57:41, 5.79s/it] {'loss': 0.4651, 'learning_rate': 1.0323837983210535e-05, 'epoch': 0.51} + 51%|█████ | 4419/8750 [3:20:51<6:57:41, 5.79s/it] 51%|█████ | 4420/8750 [3:21:00<6:55:49, 5.76s/it] 51%|█████ | 4420/8750 [3:20:57<6:55:49, 5.76s/it] {'loss': 0.465, 'learning_rate': 1.0320138249890126e-05, 'epoch': 0.51} + 51%|█████ | 4420/8750 [3:21:00<6:55:49, 5.76s/it] {'loss': 0.465, 'learning_rate': 1.0320138249890126e-05, 'epoch': 0.51} + 51%|█████ | 4420/8750 [3:20:57<6:55:49, 5.76s/it] 51%|█████ | 4421/8750 [3:21:05<6:58:54, 5.81s/it] 51%|█████ | 4421/8750 [3:21:03<6:58:54, 5.81s/it] {'loss': 0.4518, 'learning_rate': 1.031643847270363e-05, 'epoch': 0.51} + 51%|█████ | 4421/8750 [3:21:05<6:58:54, 5.81s/it] {'loss': 0.4518, 'learning_rate': 1.031643847270363e-05, 'epoch': 0.51} + 51%|█████ | 4421/8750 [3:21:03<6:58:54, 5.81s/it] 51%|█████ | 4422/8750 [3:21:11<6:57:09, 5.78s/it] 51%|█████ | 4422/8750 [3:21:08<6:57:09, 5.78s/it] {'loss': 0.4564, 'learning_rate': 1.0312738652157996e-05, 'epoch': 0.51} + 51%|█████ | 4422/8750 [3:21:11<6:57:09, 5.78s/it] {'loss': 0.4564, 'learning_rate': 1.0312738652157996e-05, 'epoch': 0.51} + 51%|█████ | 4422/8750 [3:21:08<6:57:09, 5.78s/it] 51%|█████ | 4423/8750 [3:21:17<6:56:30, 5.78s/it] 51%|█████ | 4423/8750 [3:21:14<6:56:30, 5.78s/it] {'loss': 0.481, 'learning_rate': 1.0309038788760185e-05, 'epoch': 0.51} + 51%|█████ | 4423/8750 [3:21:17<6:56:30, 5.78s/it] {'loss': 0.481, 'learning_rate': 1.0309038788760185e-05, 'epoch': 0.51} 
+ 51%|█████ | 4423/8750 [3:21:14<6:56:30, 5.78s/it] 51%|█████ | 4424/8750 [3:21:23<6:52:23, 5.72s/it] 51%|█████ | 4424/8750 [3:21:20<6:52:23, 5.72s/it] {'loss': 0.4721, 'learning_rate': 1.0305338883017163e-05, 'epoch': 0.51} + 51%|█████ | 4424/8750 [3:21:23<6:52:23, 5.72s/it] {'loss': 0.4721, 'learning_rate': 1.0305338883017163e-05, 'epoch': 0.51} + 51%|█████ | 4424/8750 [3:21:20<6:52:23, 5.72s/it] 51%|█████ | 4425/8750 [3:21:28<6:52:36, 5.72s/it] 51%|█████ | 4425/8750 [3:21:25<6:52:36, 5.72s/it] {'loss': 0.469, 'learning_rate': 1.0301638935435896e-05, 'epoch': 0.51} + 51%|█████ | 4425/8750 [3:21:28<6:52:36, 5.72s/it] {'loss': 0.469, 'learning_rate': 1.0301638935435896e-05, 'epoch': 0.51} + 51%|█████ | 4425/8750 [3:21:25<6:52:36, 5.72s/it] 51%|█████ | 4426/8750 [3:21:34<6:50:07, 5.69s/it] 51%|█████ | 4426/8750 [3:21:31<6:50:07, 5.69s/it] {'loss': 0.4487, 'learning_rate': 1.0297938946523361e-05, 'epoch': 0.51} + 51%|█████ | 4426/8750 [3:21:34<6:50:07, 5.69s/it] {'loss': 0.4487, 'learning_rate': 1.0297938946523361e-05, 'epoch': 0.51} + 51%|█████ | 4426/8750 [3:21:31<6:50:07, 5.69s/it] 51%|█████ | 4427/8750 [3:21:40<6:50:37, 5.70s/it] 51%|█████ | 4427/8750 [3:21:37<6:50:37, 5.70s/it] {'loss': 0.4555, 'learning_rate': 1.0294238916786537e-05, 'epoch': 0.51} + 51%|█████ | 4427/8750 [3:21:40<6:50:37, 5.70s/it] {'loss': 0.4555, 'learning_rate': 1.0294238916786537e-05, 'epoch': 0.51} + 51%|█████ | 4427/8750 [3:21:37<6:50:37, 5.70s/it] 51%|█████ | 4428/8750 [3:21:45<6:53:18, 5.74s/it] 51%|█████ | 4428/8750 [3:21:42<6:53:18, 5.74s/it] {'loss': 0.453, 'learning_rate': 1.0290538846732415e-05, 'epoch': 0.51} + 51%|█████ | 4428/8750 [3:21:45<6:53:18, 5.74s/it] {'loss': 0.453, 'learning_rate': 1.0290538846732415e-05, 'epoch': 0.51} + 51%|█████ | 4428/8750 [3:21:42<6:53:18, 5.74s/it] 51%|█████ | 4429/8750 [3:21:51<6:54:45, 5.76s/it] 51%|█████ | 4429/8750 [3:21:48<6:54:44, 5.76s/it] {'loss': 0.4768, 'learning_rate': 1.0286838736867981e-05, 'epoch': 0.51} + 51%|█████ | 4429/8750 
[3:21:51<6:54:45, 5.76s/it] {'loss': 0.4768, 'learning_rate': 1.0286838736867981e-05, 'epoch': 0.51} + 51%|█████ | 4429/8750 [3:21:48<6:54:44, 5.76s/it] 51%|█████ | 4430/8750 [3:21:54<6:57:29, 5.80s/it] 51%|█████ | 4430/8750 [3:21:57<6:57:30, 5.80s/it] {'loss': 0.4549, 'learning_rate': 1.0283138587700236e-05, 'epoch': 0.51} + 51%|█████ | 4430/8750 [3:21:57<6:57:30, 5.80s/it] {'loss': 0.4549, 'learning_rate': 1.0283138587700236e-05, 'epoch': 0.51} + 51%|█████ | 4430/8750 [3:21:54<6:57:29, 5.80s/it] 51%|█████ | 4431/8750 [3:22:03<6:52:43, 5.73s/it] 51%|█████ | 4431/8750 [3:22:00<6:52:43, 5.73s/it] {'loss': 0.4679, 'learning_rate': 1.0279438399736185e-05, 'epoch': 0.51} + 51%|█████ | 4431/8750 [3:22:03<6:52:43, 5.73s/it] {'loss': 0.4679, 'learning_rate': 1.0279438399736185e-05, 'epoch': 0.51} + 51%|█████ | 4431/8750 [3:22:00<6:52:43, 5.73s/it] 51%|█████ | 4432/8750 [3:22:08<6:49:29, 5.69s/it] 51%|█████ | 4432/8750 [3:22:05<6:49:30, 5.69s/it] {'loss': 0.4489, 'learning_rate': 1.0275738173482835e-05, 'epoch': 0.51} + 51%|█████ | 4432/8750 [3:22:08<6:49:29, 5.69s/it] {'loss': 0.4489, 'learning_rate': 1.0275738173482835e-05, 'epoch': 0.51} + 51%|█████ | 4432/8750 [3:22:05<6:49:30, 5.69s/it] 51%|█████ | 4433/8750 [3:22:14<6:54:53, 5.77s/it] 51%|█████ | 4433/8750 [3:22:11<6:54:53, 5.77s/it] {'loss': 0.4635, 'learning_rate': 1.0272037909447197e-05, 'epoch': 0.51} + 51%|█████ | 4433/8750 [3:22:14<6:54:53, 5.77s/it] {'loss': 0.4635, 'learning_rate': 1.0272037909447197e-05, 'epoch': 0.51} + 51%|█████ | 4433/8750 [3:22:11<6:54:53, 5.77s/it] 51%|█████ | 4434/8750 [3:22:20<6:55:37, 5.78s/it] 51%|█████ | 4434/8750 [3:22:17<6:55:37, 5.78s/it] {'loss': 0.4474, 'learning_rate': 1.0268337608136292e-05, 'epoch': 0.51} + 51%|█████ | 4434/8750 [3:22:20<6:55:37, 5.78s/it] {'loss': 0.4474, 'learning_rate': 1.0268337608136292e-05, 'epoch': 0.51} + 51%|█████ | 4434/8750 [3:22:17<6:55:37, 5.78s/it] 51%|█████ | 4435/8750 [3:22:26<6:54:29, 5.76s/it] 51%|█████ | 4435/8750 [3:22:23<6:54:30, 
5.76s/it] {'loss': 0.4724, 'learning_rate': 1.0264637270057146e-05, 'epoch': 0.51} + 51%|█████ | 4435/8750 [3:22:26<6:54:29, 5.76s/it] {'loss': 0.4724, 'learning_rate': 1.0264637270057146e-05, 'epoch': 0.51} + 51%|█████ | 4435/8750 [3:22:23<6:54:30, 5.76s/it] 51%|█████ | 4436/8750 [3:22:32<6:56:28, 5.79s/it] 51%|█████ | 4436/8750 [3:22:29<6:56:28, 5.79s/it] {'loss': 0.458, 'learning_rate': 1.0260936895716781e-05, 'epoch': 0.51} + 51%|█████ | 4436/8750 [3:22:32<6:56:28, 5.79s/it] {'loss': 0.458, 'learning_rate': 1.0260936895716781e-05, 'epoch': 0.51} + 51%|█████ | 4436/8750 [3:22:29<6:56:28, 5.79s/it] 51%|█████ | 4437/8750 [3:22:37<6:57:01, 5.80s/it] 51%|█████ | 4437/8750 [3:22:35<6:57:01, 5.80s/it] {'loss': 0.4632, 'learning_rate': 1.0257236485622241e-05, 'epoch': 0.51} + 51%|█████ | 4437/8750 [3:22:37<6:57:01, 5.80s/it] {'loss': 0.4632, 'learning_rate': 1.0257236485622241e-05, 'epoch': 0.51} + 51%|█████ | 4437/8750 [3:22:35<6:57:01, 5.80s/it] 51%|█████ | 4438/8750 [3:22:43<6:57:28, 5.81s/it] 51%|█████ | 4438/8750 [3:22:40<6:57:28, 5.81s/it] {'loss': 0.4592, 'learning_rate': 1.0253536040280556e-05, 'epoch': 0.51} + 51%|█████ | 4438/8750 [3:22:43<6:57:28, 5.81s/it] {'loss': 0.4592, 'learning_rate': 1.0253536040280556e-05, 'epoch': 0.51} + 51%|█████ | 4438/8750 [3:22:40<6:57:28, 5.81s/it] 51%|█████ | 4439/8750 [3:22:49<6:56:04, 5.79s/it] 51%|█████ | 4439/8750 [3:22:46<6:56:04, 5.79s/it] {'loss': 0.4643, 'learning_rate': 1.0249835560198772e-05, 'epoch': 0.51} + 51%|█████ | 4439/8750 [3:22:49<6:56:04, 5.79s/it] {'loss': 0.4643, 'learning_rate': 1.0249835560198772e-05, 'epoch': 0.51} + 51%|█████ | 4439/8750 [3:22:46<6:56:04, 5.79s/it] 51%|█████ | 4440/8750 [3:22:55<6:59:07, 5.83s/it] 51%|█████ | 4440/8750 [3:22:52<6:59:07, 5.83s/it] {'loss': 0.4771, 'learning_rate': 1.0246135045883943e-05, 'epoch': 0.51} + 51%|█████ | 4440/8750 [3:22:55<6:59:07, 5.83s/it] {'loss': 0.4771, 'learning_rate': 1.0246135045883943e-05, 'epoch': 0.51} + 51%|█████ | 4440/8750 [3:22:52<6:59:07, 
5.83s/it] 51%|█████ | 4441/8750 [3:23:01<6:55:08, 5.78s/it] 51%|█████ | 4441/8750 [3:22:58<6:55:07, 5.78s/it] {'loss': 0.4872, 'learning_rate': 1.0242434497843117e-05, 'epoch': 0.51} + 51%|█████ | 4441/8750 [3:23:01<6:55:08, 5.78s/it] {'loss': 0.4872, 'learning_rate': 1.0242434497843117e-05, 'epoch': 0.51} + 51%|█████ | 4441/8750 [3:22:58<6:55:07, 5.78s/it] 51%|█████ | 4442/8750 [3:23:06<6:54:26, 5.77s/it] 51%|█████ | 4442/8750 [3:23:03<6:54:27, 5.77s/it] {'loss': 0.4517, 'learning_rate': 1.023873391658335e-05, 'epoch': 0.51} + 51%|█████ | 4442/8750 [3:23:06<6:54:26, 5.77s/it] {'loss': 0.4517, 'learning_rate': 1.023873391658335e-05, 'epoch': 0.51} + 51%|█████ | 4442/8750 [3:23:03<6:54:27, 5.77s/it] 51%|█████ | 4443/8750 [3:23:12<6:53:45, 5.76s/it] 51%|█████ | 4443/8750 [3:23:09<6:53:45, 5.76s/it] {'loss': 0.4768, 'learning_rate': 1.0235033302611704e-05, 'epoch': 0.51} + {'loss': 0.4768, 'learning_rate': 1.0235033302611704e-05, 'epoch': 0.51} 51%|█████ | 4443/8750 [3:23:12<6:53:45, 5.76s/it] + 51%|█████ | 4443/8750 [3:23:09<6:53:45, 5.76s/it] 51%|█████ | 4444/8750 [3:23:18<6:51:56, 5.74s/it] 51%|█████ | 4444/8750 [3:23:15<6:51:56, 5.74s/it] {'loss': 0.4565, 'learning_rate': 1.023133265643525e-05, 'epoch': 0.51} + 51%|█████ | 4444/8750 [3:23:18<6:51:56, 5.74s/it] {'loss': 0.4565, 'learning_rate': 1.023133265643525e-05, 'epoch': 0.51} + 51%|█████ | 4444/8750 [3:23:15<6:51:56, 5.74s/it] 51%|█████ | 4445/8750 [3:23:24<6:52:45, 5.75s/it] 51%|█████ | 4445/8750 [3:23:21<6:52:45, 5.75s/it] {'loss': 0.4665, 'learning_rate': 1.0227631978561057e-05, 'epoch': 0.51} + 51%|█████ | 4445/8750 [3:23:24<6:52:45, 5.75s/it] {'loss': 0.4665, 'learning_rate': 1.0227631978561057e-05, 'epoch': 0.51} + 51%|█████ | 4445/8750 [3:23:21<6:52:45, 5.75s/it] 51%|█████ | 4446/8750 [3:23:29<6:53:57, 5.77s/it] 51%|█████ | 4446/8750 [3:23:26<6:53:57, 5.77s/it] {'loss': 0.4572, 'learning_rate': 1.0223931269496204e-05, 'epoch': 0.51} + 51%|█████ | 4446/8750 [3:23:29<6:53:57, 5.77s/it] {'loss': 0.4572, 
'learning_rate': 1.0223931269496204e-05, 'epoch': 0.51} + 51%|█████ | 4446/8750 [3:23:26<6:53:57, 5.77s/it] 51%|█████ | 4447/8750 [3:23:35<6:57:41, 5.82s/it] 51%|█████ | 4447/8750 [3:23:32<6:57:40, 5.82s/it] {'loss': 0.4538, 'learning_rate': 1.0220230529747766e-05, 'epoch': 0.51} + 51%|█████ | 4447/8750 [3:23:35<6:57:41, 5.82s/it] {'loss': 0.4538, 'learning_rate': 1.0220230529747766e-05, 'epoch': 0.51} + 51%|█████ | 4447/8750 [3:23:32<6:57:40, 5.82s/it] 51%|█████ | 4448/8750 [3:23:41<6:54:43, 5.78s/it] 51%|█████ | 4448/8750 [3:23:38<6:54:43, 5.78s/it] {'loss': 0.4572, 'learning_rate': 1.0216529759822823e-05, 'epoch': 0.51} + 51%|█████ | 4448/8750 [3:23:41<6:54:43, 5.78s/it] {'loss': 0.4572, 'learning_rate': 1.0216529759822823e-05, 'epoch': 0.51} + 51%|█████ | 4448/8750 [3:23:38<6:54:43, 5.78s/it] 51%|█████ | 4449/8750 [3:23:47<6:53:46, 5.77s/it] 51%|█████ | 4449/8750 [3:23:44<6:53:47, 5.77s/it] {'loss': 0.4631, 'learning_rate': 1.0212828960228475e-05, 'epoch': 0.51} + 51%|█████ | 4449/8750 [3:23:47<6:53:46, 5.77s/it] {'loss': 0.4631, 'learning_rate': 1.0212828960228475e-05, 'epoch': 0.51} + 51%|█████ | 4449/8750 [3:23:44<6:53:47, 5.77s/it]11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 51%|█████ | 4450/8750 [3:23:53<6:55:56, 5.80s/it]9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +50 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... 
+ 51%|█████ | 4450/8750 [3:23:50<6:55:57, 5.80s/it]1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.454, 'learning_rate': 1.0209128131471809e-05, 'epoch': 0.51} + 51%|█████ | 4450/8750 [3:23:53<6:55:56, 5.80s/it] {'loss': 0.454, 'learning_rate': 1.0209128131471809e-05, 'epoch': 0.51} + 51%|█████ | 4450/8750 [3:23:50<6:55:57, 5.80s/it] 51%|█████ | 4451/8750 [3:23:58<6:55:51, 5.80s/it] 51%|█████ | 4451/8750 [3:23:56<6:55:51, 5.80s/it] {'loss': 0.4702, 'learning_rate': 1.0205427274059915e-05, 'epoch': 0.51} + 51%|█████ | 4451/8750 [3:23:58<6:55:51, 5.80s/it] {'loss': 0.4702, 'learning_rate': 1.0205427274059915e-05, 'epoch': 0.51} + 51%|█████ | 4451/8750 [3:23:56<6:55:51, 5.80s/it] 51%|█████ | 4452/8750 [3:24:04<6:57:55, 5.83s/it] 51%|█████ | 4452/8750 [3:24:01<6:57:55, 5.83s/it] {'loss': 0.4545, 'learning_rate': 1.02017263884999e-05, 'epoch': 0.51} + 51%|█████ | 4452/8750 [3:24:04<6:57:55, 5.83s/it] {'loss': 0.4545, 'learning_rate': 1.02017263884999e-05, 'epoch': 0.51} + 51%|█████ | 4452/8750 [3:24:01<6:57:55, 5.83s/it] 51%|█████ | 4453/8750 [3:24:10<6:53:50, 5.78s/it] 51%|█████ | 4453/8750 [3:24:07<6:53:50, 5.78s/it] {'loss': 0.4729, 'learning_rate': 1.0198025475298865e-05, 'epoch': 0.51} + 51%|█████ | 4453/8750 [3:24:10<6:53:50, 5.78s/it] {'loss': 0.4729, 'learning_rate': 1.0198025475298865e-05, 'epoch': 0.51} + 51%|█████ | 4453/8750 [3:24:07<6:53:50, 5.78s/it] 51%|█████ | 4454/8750 [3:24:16<6:53:33, 5.78s/it] 51%|█████ | 4454/8750 [3:24:13<6:53:33, 5.78s/it] {'loss': 0.4482, 'learning_rate': 1.019432453496392e-05, 'epoch': 0.51} + 51%|█████ | 4454/8750 [3:24:16<6:53:33, 5.78s/it] {'loss': 0.4482, 'learning_rate': 1.019432453496392e-05, 'epoch': 0.51} + 51%|█████ | 4454/8750 [3:24:13<6:53:33, 5.78s/it] 51%|█████ | 4455/8750 [3:24:19<6:57:01, 5.83s/it] 51%|█████ | 4455/8750 [3:24:22<6:57:03, 5.83s/it] {'loss': 0.446, 'learning_rate': 1.0190623568002178e-05, 'epoch': 0.51} + 51%|█████ | 4455/8750 [3:24:22<6:57:03, 5.83s/it] {'loss': 0.446, 
'learning_rate': 1.0190623568002178e-05, 'epoch': 0.51} + 51%|█████ | 4455/8750 [3:24:19<6:57:01, 5.83s/it] 51%|█████ | 4456/8750 [3:24:28<6:57:30, 5.83s/it] 51%|█████ | 4456/8750 [3:24:25<6:57:30, 5.83s/it] {'loss': 0.4636, 'learning_rate': 1.0186922574920747e-05, 'epoch': 0.51} + {'loss': 0.4636, 'learning_rate': 1.0186922574920747e-05, 'epoch': 0.51} 51%|█████ | 4456/8750 [3:24:28<6:57:30, 5.83s/it] + 51%|█████ | 4456/8750 [3:24:25<6:57:30, 5.83s/it] 51%|█████ | 4457/8750 [3:24:33<6:54:12, 5.79s/it] 51%|█████ | 4457/8750 [3:24:30<6:54:12, 5.79s/it] {'loss': 0.4636, 'learning_rate': 1.018322155622675e-05, 'epoch': 0.51} + 51%|█████ | 4457/8750 [3:24:33<6:54:12, 5.79s/it] {'loss': 0.4636, 'learning_rate': 1.018322155622675e-05, 'epoch': 0.51} + 51%|█████ | 4457/8750 [3:24:30<6:54:12, 5.79s/it] 51%|█████ | 4458/8750 [3:24:39<6:55:06, 5.80s/it] 51%|█████ | 4458/8750 [3:24:36<6:55:06, 5.80s/it] {'loss': 0.469, 'learning_rate': 1.017952051242731e-05, 'epoch': 0.51} + 51%|█████ | 4458/8750 [3:24:39<6:55:06, 5.80s/it] {'loss': 0.469, 'learning_rate': 1.017952051242731e-05, 'epoch': 0.51} + 51%|█████ | 4458/8750 [3:24:36<6:55:06, 5.80s/it] 51%|█████ | 4459/8750 [3:24:45<7:00:36, 5.88s/it] 51%|█████ | 4459/8750 [3:24:42<7:00:36, 5.88s/it] {'loss': 0.4491, 'learning_rate': 1.0175819444029555e-05, 'epoch': 0.51} + 51%|█████ | 4459/8750 [3:24:45<7:00:36, 5.88s/it] {'loss': 0.4491, 'learning_rate': 1.0175819444029555e-05, 'epoch': 0.51} + 51%|█████ | 4459/8750 [3:24:42<7:00:36, 5.88s/it] 51%|█████ | 4460/8750 [3:24:51<7:02:54, 5.91s/it] 51%|█████ | 4460/8750 [3:24:48<7:02:54, 5.91s/it] {'loss': 0.4544, 'learning_rate': 1.0172118351540608e-05, 'epoch': 0.51} + 51%|█████ | 4460/8750 [3:24:51<7:02:54, 5.91s/it] {'loss': 0.4544, 'learning_rate': 1.0172118351540608e-05, 'epoch': 0.51} + 51%|█████ | 4460/8750 [3:24:48<7:02:54, 5.91s/it] 51%|█████ | 4461/8750 [3:24:57<6:59:45, 5.87s/it] 51%|█████ | 4461/8750 [3:24:54<6:59:46, 5.87s/it] {'loss': 0.4653, 'learning_rate': 
1.0168417235467604e-05, 'epoch': 0.51} + 51%|█████ | 4461/8750 [3:24:57<6:59:45, 5.87s/it] {'loss': 0.4653, 'learning_rate': 1.0168417235467604e-05, 'epoch': 0.51} + 51%|█████ | 4461/8750 [3:24:54<6:59:46, 5.87s/it] 51%|█████ | 4462/8750 [3:25:03<6:57:50, 5.85s/it] 51%|█████ | 4462/8750 [3:25:00<6:57:50, 5.85s/it] {'loss': 0.4614, 'learning_rate': 1.0164716096317677e-05, 'epoch': 0.51} + 51%|█████ | 4462/8750 [3:25:03<6:57:50, 5.85s/it] {'loss': 0.4614, 'learning_rate': 1.0164716096317677e-05, 'epoch': 0.51} + 51%|█████ | 4462/8750 [3:25:00<6:57:50, 5.85s/it] 51%|█████ | 4463/8750 [3:25:09<7:05:44, 5.96s/it] 51%|█████ | 4463/8750 [3:25:06<7:05:44, 5.96s/it] {'loss': 0.4497, 'learning_rate': 1.016101493459797e-05, 'epoch': 0.51} + 51%|█████ | 4463/8750 [3:25:09<7:05:44, 5.96s/it] {'loss': 0.4497, 'learning_rate': 1.016101493459797e-05, 'epoch': 0.51} + 51%|█████ | 4463/8750 [3:25:06<7:05:44, 5.96s/it] 51%|█████ | 4464/8750 [3:25:15<6:59:44, 5.88s/it] 51%|█████ | 4464/8750 [3:25:12<6:59:44, 5.88s/it] {'loss': 0.4507, 'learning_rate': 1.0157313750815623e-05, 'epoch': 0.51} + 51%|█████ | 4464/8750 [3:25:15<6:59:44, 5.88s/it] {'loss': 0.4507, 'learning_rate': 1.0157313750815623e-05, 'epoch': 0.51} + 51%|█████ | 4464/8750 [3:25:12<6:59:44, 5.88s/it] 51%|█████ | 4465/8750 [3:25:21<7:00:32, 5.89s/it] 51%|█████ | 4465/8750 [3:25:18<7:00:31, 5.89s/it] {'loss': 0.4477, 'learning_rate': 1.0153612545477778e-05, 'epoch': 0.51} + 51%|█████ | 4465/8750 [3:25:21<7:00:32, 5.89s/it] {'loss': 0.4477, 'learning_rate': 1.0153612545477778e-05, 'epoch': 0.51} + 51%|█████ | 4465/8750 [3:25:18<7:00:31, 5.89s/it] 51%|█████ | 4466/8750 [3:25:26<6:56:15, 5.83s/it] 51%|█████ | 4466/8750 [3:25:23<6:56:15, 5.83s/it] {'loss': 0.4675, 'learning_rate': 1.0149911319091583e-05, 'epoch': 0.51} + 51%|█████ | 4466/8750 [3:25:26<6:56:15, 5.83s/it] {'loss': 0.4675, 'learning_rate': 1.0149911319091583e-05, 'epoch': 0.51} + 51%|█████ | 4466/8750 [3:25:23<6:56:15, 5.83s/it] 51%|█████ | 4467/8750 
[3:25:32<6:53:38, 5.79s/it] 51%|█████ | 4467/8750 [3:25:29<6:53:38, 5.79s/it] {'loss': 0.4669, 'learning_rate': 1.014621007216419e-05, 'epoch': 0.51} + 51%|█████ | 4467/8750 [3:25:32<6:53:38, 5.79s/it] {'loss': 0.4669, 'learning_rate': 1.014621007216419e-05, 'epoch': 0.51} + 51%|█████ | 4467/8750 [3:25:29<6:53:38, 5.79s/it] 51%|█████ | 4468/8750 [3:25:38<6:50:27, 5.75s/it] 51%|█████ | 4468/8750 [3:25:35<6:50:27, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.0142508805202757e-05, 'epoch': 0.51} + 51%|█████ | 4468/8750 [3:25:35<6:50:27, 5.75s/it]{'loss': 0.4668, 'learning_rate': 1.0142508805202757e-05, 'epoch': 0.51} + 51%|█████ | 4468/8750 [3:25:38<6:50:27, 5.75s/it] 51%|█████ | 4469/8750 [3:25:44<6:54:19, 5.81s/it] 51%|█████ | 4469/8750 [3:25:41<6:54:19, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.0138807518714435e-05, 'epoch': 0.51} + 51%|█████ | 4469/8750 [3:25:44<6:54:19, 5.81s/it] {'loss': 0.4754, 'learning_rate': 1.0138807518714435e-05, 'epoch': 0.51} + 51%|█████ | 4469/8750 [3:25:41<6:54:19, 5.81s/it] 51%|█████ | 4470/8750 [3:25:49<6:52:19, 5.78s/it] 51%|█████ | 4470/8750 [3:25:46<6:52:20, 5.78s/it] {'loss': 0.4581, 'learning_rate': 1.0135106213206382e-05, 'epoch': 0.51} + 51%|█████ | 4470/8750 [3:25:49<6:52:19, 5.78s/it] {'loss': 0.4581, 'learning_rate': 1.0135106213206382e-05, 'epoch': 0.51} + 51%|█████ | 4470/8750 [3:25:46<6:52:20, 5.78s/it] 51%|█████ | 4471/8750 [3:25:55<6:51:31, 5.77s/it] 51%|█████ | 4471/8750 [3:25:52<6:51:32, 5.77s/it] {'loss': 0.4621, 'learning_rate': 1.0131404889185762e-05, 'epoch': 0.51} + 51%|█████ | 4471/8750 [3:25:55<6:51:31, 5.77s/it] {'loss': 0.4621, 'learning_rate': 1.0131404889185762e-05, 'epoch': 0.51} + 51%|█████ | 4471/8750 [3:25:52<6:51:32, 5.77s/it] 51%|█████ | 4472/8750 [3:26:01<6:52:41, 5.79s/it] 51%|█████ | 4472/8750 [3:25:58<6:52:41, 5.79s/it] {'loss': 0.459, 'learning_rate': 1.012770354715974e-05, 'epoch': 0.51} + 51%|█████ | 4472/8750 [3:26:01<6:52:41, 5.79s/it] {'loss': 0.459, 'learning_rate': 
1.012770354715974e-05, 'epoch': 0.51} + 51%|█████ | 4472/8750 [3:25:58<6:52:41, 5.79s/it] 51%|█████ | 4473/8750 [3:26:07<6:53:13, 5.80s/it] 51%|█████ | 4473/8750 [3:26:04<6:53:13, 5.80s/it] {'loss': 0.4633, 'learning_rate': 1.012400218763548e-05, 'epoch': 0.51} + 51%|█████ | 4473/8750 [3:26:07<6:53:13, 5.80s/it] {'loss': 0.4633, 'learning_rate': 1.012400218763548e-05, 'epoch': 0.51} + 51%|█████ | 4473/8750 [3:26:04<6:53:13, 5.80s/it] 51%|█████ | 4474/8750 [3:26:12<6:49:44, 5.75s/it] 51%|█████ | 4474/8750 [3:26:09<6:49:44, 5.75s/it] {'loss': 0.4751, 'learning_rate': 1.012030081112015e-05, 'epoch': 0.51} + 51%|█████ | 4474/8750 [3:26:12<6:49:44, 5.75s/it] {'loss': 0.4751, 'learning_rate': 1.012030081112015e-05, 'epoch': 0.51} + 51%|█████ | 4474/8750 [3:26:09<6:49:44, 5.75s/it] 51%|█████ | 4475/8750 [3:26:18<6:50:29, 5.76s/it] 51%|█████ | 4475/8750 [3:26:15<6:50:29, 5.76s/it] {'loss': 0.4624, 'learning_rate': 1.0116599418120924e-05, 'epoch': 0.51} + 51%|█████ | 4475/8750 [3:26:18<6:50:29, 5.76s/it] {'loss': 0.4624, 'learning_rate': 1.0116599418120924e-05, 'epoch': 0.51} + 51%|█████ | 4475/8750 [3:26:15<6:50:29, 5.76s/it] 51%|█████ | 4476/8750 [3:26:24<6:50:46, 5.77s/it] 51%|█████ | 4476/8750 [3:26:21<6:50:45, 5.77s/it] {'loss': 0.4584, 'learning_rate': 1.0112898009144977e-05, 'epoch': 0.51} + 51%|█████ | 4476/8750 [3:26:24<6:50:46, 5.77s/it] {'loss': 0.4584, 'learning_rate': 1.0112898009144977e-05, 'epoch': 0.51} + 51%|█████ | 4476/8750 [3:26:21<6:50:45, 5.77s/it] 51%|█████ | 4477/8750 [3:26:30<6:55:10, 5.83s/it] 51%|█████ | 4477/8750 [3:26:27<6:55:10, 5.83s/it] {'loss': 0.4615, 'learning_rate': 1.0109196584699478e-05, 'epoch': 0.51} + 51%|█████ | 4477/8750 [3:26:30<6:55:10, 5.83s/it] {'loss': 0.4615, 'learning_rate': 1.0109196584699478e-05, 'epoch': 0.51} + 51%|█████ | 4477/8750 [3:26:27<6:55:10, 5.83s/it] 51%|█████ | 4478/8750 [3:26:36<6:57:51, 5.87s/it] 51%|█████ | 4478/8750 [3:26:33<6:57:51, 5.87s/it] {'loss': 0.4712, 'learning_rate': 1.0105495145291612e-05, 
'epoch': 0.51} + 51%|█████ | 4478/8750 [3:26:36<6:57:51, 5.87s/it] {'loss': 0.4712, 'learning_rate': 1.0105495145291612e-05, 'epoch': 0.51} + 51%|█████ | 4478/8750 [3:26:33<6:57:51, 5.87s/it] 51%|█████ | 4479/8750 [3:26:39<6:56:42, 5.85s/it] 51%|█████ | 4479/8750 [3:26:42<6:56:43, 5.85s/it]{'loss': 0.456, 'learning_rate': 1.0101793691428554e-05, 'epoch': 0.51} + {'loss': 0.456, 'learning_rate': 1.0101793691428554e-05, 'epoch': 0.51} + 51%|█████ | 4479/8750 [3:26:42<6:56:43, 5.85s/it] 51%|█████ | 4479/8750 [3:26:39<6:56:42, 5.85s/it] 51%|█████ | 4480/8750 [3:26:48<7:05:35, 5.98s/it] 51%|█████ | 4480/8750 [3:26:45<7:05:36, 5.98s/it] {'loss': 0.4551, 'learning_rate': 1.0098092223617488e-05, 'epoch': 0.51} + 51%|█████ | 4480/8750 [3:26:48<7:05:35, 5.98s/it] {'loss': 0.4551, 'learning_rate': 1.0098092223617488e-05, 'epoch': 0.51} + 51%|█████ | 4480/8750 [3:26:45<7:05:36, 5.98s/it] 51%|█████ | 4481/8750 [3:26:54<7:09:29, 6.04s/it] 51%|█████ | 4481/8750 [3:26:51<7:09:29, 6.04s/it] {'loss': 0.4553, 'learning_rate': 1.0094390742365598e-05, 'epoch': 0.51} + {'loss': 0.4553, 'learning_rate': 1.0094390742365598e-05, 'epoch': 0.51} 51%|█████ | 4481/8750 [3:26:54<7:09:29, 6.04s/it] + 51%|█████ | 4481/8750 [3:26:51<7:09:29, 6.04s/it] 51%|█████ | 4482/8750 [3:27:00<7:06:40, 6.00s/it] 51%|█████ | 4482/8750 [3:26:57<7:06:40, 6.00s/it] {'loss': 0.47, 'learning_rate': 1.009068924818007e-05, 'epoch': 0.51} + 51%|█████ | 4482/8750 [3:27:00<7:06:40, 6.00s/it] {'loss': 0.47, 'learning_rate': 1.009068924818007e-05, 'epoch': 0.51} + 51%|█████ | 4482/8750 [3:26:57<7:06:40, 6.00s/it] 51%|█████ | 4483/8750 [3:27:06<7:03:02, 5.95s/it] 51%|█████ | 4483/8750 [3:27:03<7:03:03, 5.95s/it] {'loss': 0.4609, 'learning_rate': 1.0086987741568089e-05, 'epoch': 0.51} + 51%|█████ | 4483/8750 [3:27:06<7:03:02, 5.95s/it] {'loss': 0.4609, 'learning_rate': 1.0086987741568089e-05, 'epoch': 0.51} + 51%|█████ | 4483/8750 [3:27:03<7:03:03, 5.95s/it] 51%|█████ | 4484/8750 [3:27:12<7:00:30, 5.91s/it] 51%|█████ | 
4484/8750 [3:27:09<7:00:31, 5.91s/it] {'loss': 0.4694, 'learning_rate': 1.0083286223036845e-05, 'epoch': 0.51} + 51%|█████ | 4484/8750 [3:27:12<7:00:30, 5.91s/it] {'loss': 0.4694, 'learning_rate': 1.0083286223036845e-05, 'epoch': 0.51} + 51%|█████ | 4484/8750 [3:27:09<7:00:31, 5.91s/it] 51%|█████▏ | 4485/8750 [3:27:17<6:58:10, 5.88s/it] 51%|█████▏ | 4485/8750 [3:27:14<6:58:10, 5.88s/it] {'loss': 0.4731, 'learning_rate': 1.0079584693093529e-05, 'epoch': 0.51} + 51%|█████▏ | 4485/8750 [3:27:17<6:58:10, 5.88s/it] {'loss': 0.4731, 'learning_rate': 1.0079584693093529e-05, 'epoch': 0.51} + 51%|█████▏ | 4485/8750 [3:27:14<6:58:10, 5.88s/it] 51%|█████▏ | 4486/8750 [3:27:23<6:56:00, 5.85s/it] 51%|█████▏ | 4486/8750 [3:27:20<6:56:00, 5.85s/it] {'loss': 0.4774, 'learning_rate': 1.0075883152245334e-05, 'epoch': 0.51} + 51%|█████▏ | 4486/8750 [3:27:23<6:56:00, 5.85s/it] {'loss': 0.4774, 'learning_rate': 1.0075883152245334e-05, 'epoch': 0.51} + 51%|█████▏ | 4486/8750 [3:27:20<6:56:00, 5.85s/it] 51%|█████▏ | 4487/8750 [3:27:29<6:53:41, 5.82s/it] 51%|█████▏ | 4487/8750 [3:27:26<6:53:41, 5.82s/it] {'loss': 0.4797, 'learning_rate': 1.007218160099945e-05, 'epoch': 0.51} + 51%|█████▏ | 4487/8750 [3:27:29<6:53:41, 5.82s/it] {'loss': 0.4797, 'learning_rate': 1.007218160099945e-05, 'epoch': 0.51} + 51%|█████▏ | 4487/8750 [3:27:26<6:53:41, 5.82s/it] 51%|█████▏ | 4488/8750 [3:27:32<6:51:03, 5.79s/it] 51%|█████▏ | 4488/8750 [3:27:35<6:51:04, 5.79s/it] {'loss': 0.4382, 'learning_rate': 1.006848003986308e-05, 'epoch': 0.51} + 51%|█████▏ | 4488/8750 [3:27:35<6:51:04, 5.79s/it] {'loss': 0.4382, 'learning_rate': 1.006848003986308e-05, 'epoch': 0.51} + 51%|█████▏ | 4488/8750 [3:27:32<6:51:03, 5.79s/it] 51%|█████▏ | 4489/8750 [3:27:38<6:50:47, 5.78s/it] 51%|█████▏ | 4489/8750 [3:27:40<6:50:48, 5.78s/it] {'loss': 0.4554, 'learning_rate': 1.0064778469343413e-05, 'epoch': 0.51} + 51%|█████▏ | 4489/8750 [3:27:40<6:50:48, 5.78s/it] {'loss': 0.4554, 'learning_rate': 1.0064778469343413e-05, 'epoch': 
0.51} + 51%|█████▏ | 4489/8750 [3:27:38<6:50:47, 5.78s/it] 51%|█████▏ | 4490/8750 [3:27:46<6:46:31, 5.73s/it] 51%|█████▏ | 4490/8750 [3:27:43<6:46:32, 5.73s/it] {'loss': 0.4602, 'learning_rate': 1.006107688994765e-05, 'epoch': 0.51} + 51%|█████▏ | 4490/8750 [3:27:46<6:46:31, 5.73s/it] {'loss': 0.4602, 'learning_rate': 1.006107688994765e-05, 'epoch': 0.51} + 51%|█████▏ | 4490/8750 [3:27:43<6:46:32, 5.73s/it] 51%|█████▏ | 4491/8750 [3:27:52<6:48:35, 5.76s/it] 51%|█████▏ | 4491/8750 [3:27:49<6:48:35, 5.76s/it] {'loss': 0.4634, 'learning_rate': 1.0057375302182988e-05, 'epoch': 0.51} + 51%|█████▏ | 4491/8750 [3:27:52<6:48:35, 5.76s/it] {'loss': 0.4634, 'learning_rate': 1.0057375302182988e-05, 'epoch': 0.51} + 51%|█████▏ | 4491/8750 [3:27:49<6:48:35, 5.76s/it] 51%|█████▏ | 4492/8750 [3:27:58<6:49:44, 5.77s/it] 51%|█████▏ | 4492/8750 [3:27:55<6:49:43, 5.77s/it] {'loss': 0.4633, 'learning_rate': 1.005367370655663e-05, 'epoch': 0.51} + {'loss': 0.4633, 'learning_rate': 1.005367370655663e-05, 'epoch': 0.51} 51%|█████▏ | 4492/8750 [3:27:58<6:49:44, 5.77s/it] + 51%|█████▏ | 4492/8750 [3:27:55<6:49:43, 5.77s/it] 51%|█████▏ | 4493/8750 [3:28:03<6:46:18, 5.73s/it] 51%|█████▏ | 4493/8750 [3:28:00<6:46:18, 5.73s/it] {'loss': 0.4775, 'learning_rate': 1.0049972103575775e-05, 'epoch': 0.51} + 51%|█████▏ | 4493/8750 [3:28:03<6:46:18, 5.73s/it] {'loss': 0.4775, 'learning_rate': 1.0049972103575775e-05, 'epoch': 0.51} + 51%|█████▏ | 4493/8750 [3:28:00<6:46:18, 5.73s/it] 51%|█████▏ | 4494/8750 [3:28:09<6:46:00, 5.72s/it] 51%|█████▏ | 4494/8750 [3:28:06<6:46:00, 5.72s/it] {'loss': 0.462, 'learning_rate': 1.004627049374763e-05, 'epoch': 0.51} + 51%|█████▏ | 4494/8750 [3:28:09<6:46:00, 5.72s/it] {'loss': 0.462, 'learning_rate': 1.004627049374763e-05, 'epoch': 0.51} + 51%|█████▏ | 4494/8750 [3:28:06<6:46:00, 5.72s/it] 51%|█████▏ | 4495/8750 [3:28:15<6:46:42, 5.73s/it] 51%|█████▏ | 4495/8750 [3:28:12<6:46:42, 5.74s/it] {'loss': 0.4605, 'learning_rate': 1.0042568877579388e-05, 'epoch': 0.51} + 
51%|█████▏ | 4495/8750 [3:28:15<6:46:42, 5.73s/it] {'loss': 0.4605, 'learning_rate': 1.0042568877579388e-05, 'epoch': 0.51} + 51%|█████▏ | 4495/8750 [3:28:12<6:46:42, 5.74s/it] 51%|█████▏ | 4496/8750 [3:28:21<6:51:23, 5.80s/it] 51%|█████▏ | 4496/8750 [3:28:18<6:51:22, 5.80s/it] {'loss': 0.45, 'learning_rate': 1.0038867255578261e-05, 'epoch': 0.51} + 51%|█████▏ | 4496/8750 [3:28:21<6:51:23, 5.80s/it] {'loss': 0.45, 'learning_rate': 1.0038867255578261e-05, 'epoch': 0.51} + 51%|█████▏ | 4496/8750 [3:28:18<6:51:22, 5.80s/it] 51%|█████▏ | 4497/8750 [3:28:26<6:49:43, 5.78s/it] 51%|█████▏ | 4497/8750 [3:28:24<6:49:43, 5.78s/it] {'loss': 0.4773, 'learning_rate': 1.0035165628251455e-05, 'epoch': 0.51} + 51%|█████▏ | 4497/8750 [3:28:26<6:49:43, 5.78s/it] {'loss': 0.4773, 'learning_rate': 1.0035165628251455e-05, 'epoch': 0.51} + 51%|█████▏ | 4497/8750 [3:28:24<6:49:43, 5.78s/it] 51%|█████▏ | 4498/8750 [3:28:32<6:48:57, 5.77s/it] 51%|█████▏ | 4498/8750 [3:28:29<6:48:58, 5.77s/it] {'loss': 0.462, 'learning_rate': 1.0031463996106175e-05, 'epoch': 0.51} + 51%|█████▏ | 4498/8750 [3:28:32<6:48:57, 5.77s/it] {'loss': 0.462, 'learning_rate': 1.0031463996106175e-05, 'epoch': 0.51} + 51%|█████▏ | 4498/8750 [3:28:29<6:48:58, 5.77s/it] 51%|█████▏ | 4499/8750 [3:28:38<6:43:24, 5.69s/it] 51%|█████▏ | 4499/8750 [3:28:35<6:43:23, 5.69s/it] {'loss': 0.4733, 'learning_rate': 1.002776235964962e-05, 'epoch': 0.51} + 51%|█████▏ | 4499/8750 [3:28:38<6:43:24, 5.69s/it] {'loss': 0.4733, 'learning_rate': 1.002776235964962e-05, 'epoch': 0.51} + 51%|█████▏ | 4499/8750 [3:28:35<6:43:23, 5.69s/it]11 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... 
+2 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 51%|█████▏ | 4500/8750 [3:28:43<6:44:03, 5.70s/it]10 AutoResumeHook: Checking whether to suspend... +03 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 51%|█████▏ | 4500/8750 [3:28:41<6:44:03, 5.70s/it]5 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4687, 'learning_rate': 1.0024060719389002e-05, 'epoch': 0.51} + 51%|█████▏ | 4500/8750 [3:28:43<6:44:03, 5.70s/it] {'loss': 0.4687, 'learning_rate': 1.0024060719389002e-05, 'epoch': 0.51} + 51%|█████▏ | 4500/8750 [3:28:41<6:44:03, 5.70s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 51%|█████▏ | 4501/8750 [3:29:03<12:37:23, 10.70s/it] 51%|█████▏ | 4501/8750 [3:29:06<12:37:25, 10.70s/it] {'loss': 0.4611, 'learning_rate': 1.002035907583153e-05, 'epoch': 0.51} + 51%|█████▏ | 4501/8750 [3:29:06<12:37:25, 10.70s/it] {'loss': 0.4611, 'learning_rate': 1.002035907583153e-05, 'epoch': 0.51} + 51%|█████▏ | 4501/8750 [3:29:03<12:37:23, 10.70s/it] 51%|█████▏ | 4502/8750 [3:29:09<10:53:14, 9.23s/it] 51%|█████▏ | 4502/8750 [3:29:12<10:53:14, 9.23s/it] {'loss': 0.4397, 'learning_rate': 1.001665742948441e-05, 'epoch': 0.51} + 51%|█████▏ | 4502/8750 [3:29:12<10:53:14, 9.23s/it] {'loss': 0.4397, 'learning_rate': 1.001665742948441e-05, 'epoch': 0.51} + 51%|█████▏ | 4502/8750 [3:29:09<10:53:14, 9.23s/it] 51%|█████▏ | 4503/8750 [3:29:18<9:43:29, 8.24s/it] 51%|█████▏ | 4503/8750 [3:29:15<9:43:30, 8.24s/it] {'loss': 0.4622, 'learning_rate': 1.0012955780854852e-05, 'epoch': 0.51} + 51%|█████▏ | 4503/8750 [3:29:18<9:43:29, 8.24s/it] {'loss': 0.4622, 'learning_rate': 1.0012955780854852e-05, 'epoch': 0.51} + 51%|█████▏ | 4503/8750 [3:29:15<9:43:30, 8.24s/it] 51%|█████▏ | 4504/8750 [3:29:24<8:55:47, 7.57s/it] 51%|█████▏ | 4504/8750 [3:29:21<8:55:47, 7.57s/it] {'loss': 0.4544, 'learning_rate': 1.0009254130450059e-05, 'epoch': 0.51} + 51%|█████▏ | 4504/8750 [3:29:24<8:55:47, 7.57s/it] {'loss': 0.4544, 'learning_rate': 1.0009254130450059e-05, 'epoch': 0.51} + 51%|█████▏ | 4504/8750 [3:29:21<8:55:47, 7.57s/it] 51%|█████▏ | 4505/8750 [3:29:29<8:15:02, 7.00s/it] 51%|█████▏ | 4505/8750 [3:29:26<8:15:02, 7.00s/it] {'loss': 0.4854, 'learning_rate': 1.0005552478777244e-05, 'epoch': 0.51} + 51%|█████▏ | 4505/8750 [3:29:29<8:15:02, 7.00s/it] {'loss': 0.4854, 'learning_rate': 1.0005552478777244e-05, 'epoch': 0.51} + 51%|█████▏ | 4505/8750 [3:29:26<8:15:02, 7.00s/it] 51%|█████▏ | 4506/8750 [3:29:35<7:51:26, 6.66s/it] 51%|█████▏ | 4506/8750 [3:29:32<7:51:26, 6.66s/it] {'loss': 0.458, 'learning_rate': 1.0001850826343615e-05, 'epoch': 0.51} + 51%|█████▏ | 4506/8750 
[3:29:35<7:51:26, 6.66s/it] {'loss': 0.458, 'learning_rate': 1.0001850826343615e-05, 'epoch': 0.51} + 51%|█████▏ | 4506/8750 [3:29:32<7:51:26, 6.66s/it] 52%|█████▏ | 4507/8750 [3:29:41<7:31:16, 6.38s/it] 52%|█████▏ | 4507/8750 [3:29:38<7:31:16, 6.38s/it] {'loss': 0.4704, 'learning_rate': 9.998149173656387e-06, 'epoch': 0.52} + 52%|█████▏ | 4507/8750 [3:29:41<7:31:16, 6.38s/it] {'loss': 0.4704, 'learning_rate': 9.998149173656387e-06, 'epoch': 0.52} + 52%|█████▏ | 4507/8750 [3:29:38<7:31:16, 6.38s/it] 52%|█████▏ | 4508/8750 [3:29:47<7:18:00, 6.20s/it] 52%|█████▏ | 4508/8750 [3:29:44<7:18:00, 6.20s/it] {'loss': 0.4649, 'learning_rate': 9.994447521222758e-06, 'epoch': 0.52} + 52%|█████▏ | 4508/8750 [3:29:47<7:18:00, 6.20s/it] {'loss': 0.4649, 'learning_rate': 9.994447521222758e-06, 'epoch': 0.52} + 52%|█████▏ | 4508/8750 [3:29:44<7:18:00, 6.20s/it] 52%|█████▏ | 4509/8750 [3:29:52<7:10:15, 6.09s/it] 52%|█████▏ | 4509/8750 [3:29:49<7:10:15, 6.09s/it] {'loss': 0.4603, 'learning_rate': 9.990745869549943e-06, 'epoch': 0.52} + 52%|█████▏ | 4509/8750 [3:29:52<7:10:15, 6.09s/it] {'loss': 0.4603, 'learning_rate': 9.990745869549943e-06, 'epoch': 0.52} + 52%|█████▏ | 4509/8750 [3:29:49<7:10:15, 6.09s/it] 52%|█████▏ | 4510/8750 [3:29:58<7:02:22, 5.98s/it] 52%|█████▏ | 4510/8750 [3:29:55<7:02:21, 5.98s/it] {'loss': 0.4715, 'learning_rate': 9.987044219145155e-06, 'epoch': 0.52} + 52%|█████▏ | 4510/8750 [3:29:58<7:02:22, 5.98s/it] {'loss': 0.4715, 'learning_rate': 9.987044219145155e-06, 'epoch': 0.52} + 52%|█████▏ | 4510/8750 [3:29:55<7:02:21, 5.98s/it] 52%|█████▏ | 4511/8750 [3:30:04<6:55:30, 5.88s/it] 52%|█████▏ | 4511/8750 [3:30:01<6:55:30, 5.88s/it] {'loss': 0.4502, 'learning_rate': 9.983342570515592e-06, 'epoch': 0.52} + 52%|█████▏ | 4511/8750 [3:30:04<6:55:30, 5.88s/it] {'loss': 0.4502, 'learning_rate': 9.983342570515592e-06, 'epoch': 0.52} + 52%|█████▏ | 4511/8750 [3:30:01<6:55:30, 5.88s/it] 52%|█████▏ | 4512/8750 [3:30:10<6:53:48, 5.86s/it] 52%|█████▏ | 4512/8750 
[3:30:07<6:53:48, 5.86s/it] {'loss': 0.4481, 'learning_rate': 9.979640924168475e-06, 'epoch': 0.52} + 52%|█████▏ | 4512/8750 [3:30:10<6:53:48, 5.86s/it] {'loss': 0.4481, 'learning_rate': 9.979640924168475e-06, 'epoch': 0.52} + 52%|█████▏ | 4512/8750 [3:30:07<6:53:48, 5.86s/it] 52%|█████▏ | 4513/8750 [3:30:12<6:48:53, 5.79s/it] 52%|█████▏ | 4513/8750 [3:30:15<6:48:54, 5.79s/it] {'loss': 0.4688, 'learning_rate': 9.975939280611e-06, 'epoch': 0.52} + 52%|█████▏ | 4513/8750 [3:30:15<6:48:54, 5.79s/it] {'loss': 0.4688, 'learning_rate': 9.975939280611e-06, 'epoch': 0.52} + 52%|█████▏ | 4513/8750 [3:30:12<6:48:53, 5.79s/it] 52%|█████▏ | 4514/8750 [3:30:21<6:50:44, 5.82s/it] 52%|█████▏ | 4514/8750 [3:30:18<6:50:44, 5.82s/it] {'loss': 0.4572, 'learning_rate': 9.972237640350383e-06, 'epoch': 0.52} + 52%|█████▏ | 4514/8750 [3:30:21<6:50:44, 5.82s/it] {'loss': 0.4572, 'learning_rate': 9.972237640350383e-06, 'epoch': 0.52} + 52%|█████▏ | 4514/8750 [3:30:18<6:50:44, 5.82s/it] 52%|█████▏ | 4515/8750 [3:30:27<6:50:26, 5.81s/it] 52%|█████▏ | 4515/8750 [3:30:24<6:50:26, 5.81s/it] {'loss': 0.4765, 'learning_rate': 9.968536003893832e-06, 'epoch': 0.52} + 52%|█████▏ | 4515/8750 [3:30:27<6:50:26, 5.81s/it] {'loss': 0.4765, 'learning_rate': 9.968536003893832e-06, 'epoch': 0.52} + 52%|█████▏ | 4515/8750 [3:30:24<6:50:26, 5.81s/it] 52%|█████▏ | 4516/8750 [3:30:33<6:50:34, 5.82s/it] 52%|█████▏ | 4516/8750 [3:30:30<6:50:34, 5.82s/it] {'loss': 0.4421, 'learning_rate': 9.964834371748547e-06, 'epoch': 0.52} + 52%|█████▏ | 4516/8750 [3:30:33<6:50:34, 5.82s/it] {'loss': 0.4421, 'learning_rate': 9.964834371748547e-06, 'epoch': 0.52} + 52%|█████▏ | 4516/8750 [3:30:30<6:50:34, 5.82s/it] 52%|█████▏ | 4517/8750 [3:30:36<6:48:50, 5.80s/it] 52%|█████▏ | 4517/8750 [3:30:38<6:48:50, 5.80s/it] {'loss': 0.4692, 'learning_rate': 9.96113274442174e-06, 'epoch': 0.52} + 52%|█████▏ | 4517/8750 [3:30:38<6:48:50, 5.80s/it] {'loss': 0.4692, 'learning_rate': 9.96113274442174e-06, 'epoch': 0.52} + 52%|█████▏ | 
4517/8750 [3:30:36<6:48:50, 5.80s/it] 52%|█████▏ | 4518/8750 [3:30:44<6:47:24, 5.78s/it] 52%|█████▏ | 4518/8750 [3:30:41<6:47:24, 5.78s/it] {'loss': 0.4652, 'learning_rate': 9.957431122420615e-06, 'epoch': 0.52} + 52%|█████▏ | 4518/8750 [3:30:44<6:47:24, 5.78s/it] {'loss': 0.4652, 'learning_rate': 9.957431122420615e-06, 'epoch': 0.52} + 52%|█████▏ | 4518/8750 [3:30:41<6:47:24, 5.78s/it] 52%|█████▏ | 4519/8750 [3:30:50<6:46:30, 5.76s/it] 52%|█████▏ | 4519/8750 [3:30:47<6:46:30, 5.76s/it] {'loss': 0.4713, 'learning_rate': 9.953729506252374e-06, 'epoch': 0.52} + 52%|█████▏ | 4519/8750 [3:30:50<6:46:30, 5.76s/it] {'loss': 0.4713, 'learning_rate': 9.953729506252374e-06, 'epoch': 0.52} + 52%|█████▏ | 4519/8750 [3:30:47<6:46:30, 5.76s/it] 52%|█████▏ | 4520/8750 [3:30:56<6:52:15, 5.85s/it] 52%|█████▏ | 4520/8750 [3:30:53<6:52:16, 5.85s/it] {'loss': 0.4544, 'learning_rate': 9.950027896424228e-06, 'epoch': 0.52} + 52%|█████▏ | 4520/8750 [3:30:56<6:52:15, 5.85s/it] {'loss': 0.4544, 'learning_rate': 9.950027896424228e-06, 'epoch': 0.52} + 52%|█████▏ | 4520/8750 [3:30:53<6:52:16, 5.85s/it] 52%|█████▏ | 4521/8750 [3:31:02<6:50:45, 5.83s/it] 52%|█████▏ | 4521/8750 [3:30:59<6:50:45, 5.83s/it] {'loss': 0.4721, 'learning_rate': 9.946326293443371e-06, 'epoch': 0.52} + 52%|█████▏ | 4521/8750 [3:31:02<6:50:45, 5.83s/it] {'loss': 0.4721, 'learning_rate': 9.946326293443371e-06, 'epoch': 0.52} + 52%|█████▏ | 4521/8750 [3:30:59<6:50:45, 5.83s/it] 52%|█████▏ | 4522/8750 [3:31:05<6:49:11, 5.81s/it] 52%|█████▏ | 4522/8750 [3:31:08<6:49:12, 5.81s/it] {'loss': 0.4527, 'learning_rate': 9.942624697817015e-06, 'epoch': 0.52} + 52%|█████▏ | 4522/8750 [3:31:08<6:49:12, 5.81s/it] {'loss': 0.4527, 'learning_rate': 9.942624697817015e-06, 'epoch': 0.52} + 52%|█████▏ | 4522/8750 [3:31:05<6:49:11, 5.81s/it] 52%|█████▏ | 4523/8750 [3:31:14<6:55:16, 5.89s/it] 52%|█████▏ | 4523/8750 [3:31:11<6:55:16, 5.89s/it] {'loss': 0.4667, 'learning_rate': 9.938923110052353e-06, 'epoch': 0.52} + 52%|█████▏ | 4523/8750 
[3:31:11<6:55:16, 5.89s/it]{'loss': 0.4667, 'learning_rate': 9.938923110052353e-06, 'epoch': 0.52} + 52%|█████▏ | 4523/8750 [3:31:14<6:55:16, 5.89s/it] 52%|█████▏ | 4524/8750 [3:31:16<6:53:06, 5.87s/it] 52%|█████▏ | 4524/8750 [3:31:19<6:53:07, 5.87s/it] {'loss': 0.4684, 'learning_rate': 9.935221530656589e-06, 'epoch': 0.52} + 52%|█████▏ | 4524/8750 [3:31:19<6:53:07, 5.87s/it] {'loss': 0.4684, 'learning_rate': 9.935221530656589e-06, 'epoch': 0.52} + 52%|█████▏ | 4524/8750 [3:31:16<6:53:06, 5.87s/it] 52%|█████▏ | 4525/8750 [3:31:22<6:50:59, 5.84s/it] 52%|█████▏ | 4525/8750 [3:31:25<6:51:00, 5.84s/it] {'loss': 0.469, 'learning_rate': 9.931519960136925e-06, 'epoch': 0.52} + 52%|█████▏ | 4525/8750 [3:31:25<6:51:00, 5.84s/it] {'loss': 0.469, 'learning_rate': 9.931519960136925e-06, 'epoch': 0.52} + 52%|█████▏ | 4525/8750 [3:31:22<6:50:59, 5.84s/it] 52%|█████▏ | 4526/8750 [3:31:31<6:47:21, 5.79s/it] 52%|█████▏ | 4526/8750 [3:31:28<6:47:22, 5.79s/it] {'loss': 0.4682, 'learning_rate': 9.92781839900055e-06, 'epoch': 0.52} + 52%|█████▏ | 4526/8750 [3:31:31<6:47:21, 5.79s/it] {'loss': 0.4682, 'learning_rate': 9.92781839900055e-06, 'epoch': 0.52} + 52%|█████▏ | 4526/8750 [3:31:28<6:47:22, 5.79s/it] 52%|█████▏ | 4527/8750 [3:31:37<6:46:24, 5.77s/it] 52%|█████▏ | 4527/8750 [3:31:34<6:46:24, 5.77s/it] {'loss': 0.4617, 'learning_rate': 9.92411684775467e-06, 'epoch': 0.52} + 52%|█████▏ | 4527/8750 [3:31:37<6:46:24, 5.77s/it] {'loss': 0.4617, 'learning_rate': 9.92411684775467e-06, 'epoch': 0.52} + 52%|█████▏ | 4527/8750 [3:31:34<6:46:24, 5.77s/it] 52%|█████▏ | 4528/8750 [3:31:40<6:48:43, 5.81s/it] 52%|█████▏ | 4528/8750 [3:31:42<6:48:43, 5.81s/it] {'loss': 0.4692, 'learning_rate': 9.920415306906475e-06, 'epoch': 0.52} + 52%|█████▏ | 4528/8750 [3:31:42<6:48:43, 5.81s/it] {'loss': 0.4692, 'learning_rate': 9.920415306906475e-06, 'epoch': 0.52} + 52%|█████▏ | 4528/8750 [3:31:40<6:48:43, 5.81s/it] 52%|█████▏ | 4529/8750 [3:31:48<6:48:22, 5.80s/it] 52%|█████▏ | 4529/8750 [3:31:45<6:48:22, 
5.80s/it] {'loss': 0.4503, 'learning_rate': 9.916713776963156e-06, 'epoch': 0.52} + 52%|█████▏ | 4529/8750 [3:31:48<6:48:22, 5.80s/it] {'loss': 0.4503, 'learning_rate': 9.916713776963156e-06, 'epoch': 0.52} + 52%|█████▏ | 4529/8750 [3:31:45<6:48:22, 5.80s/it] 52%|█████▏ | 4530/8750 [3:31:51<6:48:39, 5.81s/it] 52%|█████▏ | 4530/8750 [3:31:54<6:48:39, 5.81s/it] {'loss': 0.453, 'learning_rate': 9.913012258431915e-06, 'epoch': 0.52} + 52%|█████▏ | 4530/8750 [3:31:54<6:48:39, 5.81s/it] {'loss': 0.453, 'learning_rate': 9.913012258431915e-06, 'epoch': 0.52} + 52%|█████▏ | 4530/8750 [3:31:51<6:48:39, 5.81s/it] 52%|█████▏ | 4531/8750 [3:31:57<6:46:34, 5.78s/it] 52%|█████▏ | 4531/8750 [3:32:00<6:46:34, 5.78s/it] {'loss': 0.4832, 'learning_rate': 9.909310751819936e-06, 'epoch': 0.52} + 52%|█████▏ | 4531/8750 [3:32:00<6:46:34, 5.78s/it] {'loss': 0.4832, 'learning_rate': 9.909310751819936e-06, 'epoch': 0.52} + 52%|█████▏ | 4531/8750 [3:31:57<6:46:34, 5.78s/it] 52%|█████▏ | 4532/8750 [3:32:06<6:46:52, 5.79s/it] 52%|█████▏ | 4532/8750 [3:32:03<6:46:52, 5.79s/it] {'loss': 0.4704, 'learning_rate': 9.905609257634404e-06, 'epoch': 0.52} + 52%|█████▏ | 4532/8750 [3:32:06<6:46:52, 5.79s/it] {'loss': 0.4704, 'learning_rate': 9.905609257634404e-06, 'epoch': 0.52} + 52%|█████▏ | 4532/8750 [3:32:03<6:46:52, 5.79s/it] 52%|█████▏ | 4533/8750 [3:32:12<6:49:48, 5.83s/it] 52%|█████▏ | 4533/8750 [3:32:09<6:49:48, 5.83s/it] {'loss': 0.4666, 'learning_rate': 9.901907776382514e-06, 'epoch': 0.52} + 52%|█████▏ | 4533/8750 [3:32:12<6:49:48, 5.83s/it] {'loss': 0.4666, 'learning_rate': 9.901907776382514e-06, 'epoch': 0.52} + 52%|█████▏ | 4533/8750 [3:32:09<6:49:48, 5.83s/it] 52%|█████▏ | 4534/8750 [3:32:17<6:44:51, 5.76s/it] 52%|█████▏ | 4534/8750 [3:32:14<6:44:51, 5.76s/it] {'loss': 0.4546, 'learning_rate': 9.898206308571446e-06, 'epoch': 0.52} + 52%|█████▏ | 4534/8750 [3:32:17<6:44:51, 5.76s/it] {'loss': 0.4546, 'learning_rate': 9.898206308571446e-06, 'epoch': 0.52} + 52%|█████▏ | 4534/8750 
[3:32:14<6:44:51, 5.76s/it] 52%|█████▏ | 4535/8750 [3:32:20<6:44:39, 5.76s/it] 52%|█████▏ | 4535/8750 [3:32:23<6:44:39, 5.76s/it] {'loss': 0.4571, 'learning_rate': 9.894504854708391e-06, 'epoch': 0.52} + 52%|█████▏ | 4535/8750 [3:32:23<6:44:39, 5.76s/it] {'loss': 0.4571, 'learning_rate': 9.894504854708391e-06, 'epoch': 0.52} + 52%|█████▏ | 4535/8750 [3:32:20<6:44:39, 5.76s/it] 52%|█████▏ | 4536/8750 [3:32:29<6:44:45, 5.76s/it] 52%|█████▏ | 4536/8750 [3:32:26<6:44:45, 5.76s/it] {'loss': 0.475, 'learning_rate': 9.890803415300527e-06, 'epoch': 0.52} + 52%|█████▏ | 4536/8750 [3:32:29<6:44:45, 5.76s/it] {'loss': 0.475, 'learning_rate': 9.890803415300527e-06, 'epoch': 0.52} + 52%|█████▏ | 4536/8750 [3:32:26<6:44:45, 5.76s/it] 52%|█████▏ | 4537/8750 [3:32:32<6:45:54, 5.78s/it] 52%|█████▏ | 4537/8750 [3:32:34<6:45:54, 5.78s/it] {'loss': 0.4718, 'learning_rate': 9.887101990855027e-06, 'epoch': 0.52} + 52%|█████▏ | 4537/8750 [3:32:35<6:45:54, 5.78s/it] {'loss': 0.4718, 'learning_rate': 9.887101990855027e-06, 'epoch': 0.52} + 52%|█████▏ | 4537/8750 [3:32:32<6:45:54, 5.78s/it] 52%|█████▏ | 4538/8750 [3:32:40<6:46:06, 5.78s/it] 52%|█████▏ | 4538/8750 [3:32:37<6:46:06, 5.78s/it] {'loss': 0.4536, 'learning_rate': 9.883400581879077e-06, 'epoch': 0.52} + 52%|█████▏ | 4538/8750 [3:32:40<6:46:06, 5.78s/it] {'loss': 0.4536, 'learning_rate': 9.883400581879077e-06, 'epoch': 0.52} + 52%|█████▏ | 4538/8750 [3:32:37<6:46:06, 5.78s/it] 52%|█████▏ | 4539/8750 [3:32:46<6:43:43, 5.75s/it] 52%|█████▏ | 4539/8750 [3:32:43<6:43:43, 5.75s/it] {'loss': 0.4675, 'learning_rate': 9.87969918887985e-06, 'epoch': 0.52} + 52%|█████▏ | 4539/8750 [3:32:46<6:43:43, 5.75s/it] {'loss': 0.4675, 'learning_rate': 9.87969918887985e-06, 'epoch': 0.52} + 52%|█████▏ | 4539/8750 [3:32:43<6:43:43, 5.75s/it] 52%|█████▏ | 4540/8750 [3:32:52<6:42:37, 5.74s/it] 52%|█████▏ | 4540/8750 [3:32:49<6:42:37, 5.74s/it] {'loss': 0.451, 'learning_rate': 9.875997812364524e-06, 'epoch': 0.52} + 52%|█████▏ | 4540/8750 [3:32:52<6:42:37, 
5.74s/it] {'loss': 0.451, 'learning_rate': 9.875997812364524e-06, 'epoch': 0.52} + 52%|█████▏ | 4540/8750 [3:32:49<6:42:37, 5.74s/it] 52%|█████▏ | 4541/8750 [3:32:57<6:41:46, 5.73s/it] 52%|█████▏ | 4541/8750 [3:32:54<6:41:47, 5.73s/it] {'loss': 0.4575, 'learning_rate': 9.872296452840266e-06, 'epoch': 0.52} + 52%|█████▏ | 4541/8750 [3:32:57<6:41:46, 5.73s/it] {'loss': 0.4575, 'learning_rate': 9.872296452840266e-06, 'epoch': 0.52} + 52%|█████▏ | 4541/8750 [3:32:54<6:41:47, 5.73s/it] 52%|█████▏ | 4542/8750 [3:33:03<6:41:41, 5.73s/it] 52%|█████▏ | 4542/8750 [3:33:00<6:41:41, 5.73s/it] {'loss': 0.4571, 'learning_rate': 9.86859511081424e-06, 'epoch': 0.52} + 52%|█████▏ | 4542/8750 [3:33:03<6:41:41, 5.73s/it] {'loss': 0.4571, 'learning_rate': 9.86859511081424e-06, 'epoch': 0.52} + 52%|█████▏ | 4542/8750 [3:33:00<6:41:41, 5.73s/it] 52%|█████▏ | 4543/8750 [3:33:06<6:44:48, 5.77s/it] 52%|█████▏ | 4543/8750 [3:33:09<6:44:48, 5.77s/it] {'loss': 0.4678, 'learning_rate': 9.86489378679362e-06, 'epoch': 0.52} + {'loss': 0.4678, 'learning_rate': 9.86489378679362e-06, 'epoch': 0.52} + 52%|█████▏ | 4543/8750 [3:33:09<6:44:48, 5.77s/it] 52%|█████▏ | 4543/8750 [3:33:06<6:44:48, 5.77s/it] 52%|█████▏ | 4544/8750 [3:33:15<6:44:47, 5.77s/it] 52%|█████▏ | 4544/8750 [3:33:12<6:44:47, 5.77s/it] {'loss': 0.4673, 'learning_rate': 9.86119248128557e-06, 'epoch': 0.52} + 52%|█████▏ | 4544/8750 [3:33:15<6:44:47, 5.77s/it] {'loss': 0.4673, 'learning_rate': 9.86119248128557e-06, 'epoch': 0.52} + 52%|█████▏ | 4544/8750 [3:33:12<6:44:47, 5.77s/it] 52%|█████▏ | 4545/8750 [3:33:18<6:43:58, 5.76s/it] 52%|█████▏ | 4545/8750 [3:33:21<6:43:59, 5.76s/it] {'loss': 0.4733, 'learning_rate': 9.857491194797244e-06, 'epoch': 0.52} + 52%|█████▏ | 4545/8750 [3:33:21<6:43:59, 5.76s/it] {'loss': 0.4733, 'learning_rate': 9.857491194797244e-06, 'epoch': 0.52} + 52%|█████▏ | 4545/8750 [3:33:18<6:43:58, 5.76s/it] 52%|█████▏ | 4546/8750 [3:33:23<6:45:13, 5.78s/it] 52%|█████▏ | 4546/8750 [3:33:26<6:45:13, 5.78s/it] {'loss': 
0.4682, 'learning_rate': 9.853789927835811e-06, 'epoch': 0.52} + 52%|█████▏ | 4546/8750 [3:33:26<6:45:13, 5.78s/it] {'loss': 0.4682, 'learning_rate': 9.853789927835811e-06, 'epoch': 0.52} + 52%|█████▏ | 4546/8750 [3:33:23<6:45:13, 5.78s/it] 52%|█████▏ | 4547/8750 [3:33:32<6:42:54, 5.75s/it] 52%|█████▏ | 4547/8750 [3:33:29<6:42:54, 5.75s/it] {'loss': 0.4519, 'learning_rate': 9.85008868090842e-06, 'epoch': 0.52} + 52%|█████▏ | 4547/8750 [3:33:32<6:42:54, 5.75s/it] {'loss': 0.4519, 'learning_rate': 9.85008868090842e-06, 'epoch': 0.52} + 52%|█████▏ | 4547/8750 [3:33:29<6:42:54, 5.75s/it] 52%|█████▏ | 4548/8750 [3:33:35<6:44:01, 5.77s/it] 52%|█████▏ | 4548/8750 [3:33:38<6:44:01, 5.77s/it] {'loss': 0.4542, 'learning_rate': 9.846387454522225e-06, 'epoch': 0.52} + 52%|█████▏ | 4548/8750 [3:33:38<6:44:01, 5.77s/it] {'loss': 0.4542, 'learning_rate': 9.846387454522225e-06, 'epoch': 0.52} + 52%|█████▏ | 4548/8750 [3:33:35<6:44:01, 5.77s/it] 52%|█████▏ | 4549/8750 [3:33:43<6:41:30, 5.73s/it] 52%|█████▏ | 4549/8750 [3:33:41<6:41:30, 5.73s/it] {'loss': 0.4461, 'learning_rate': 9.842686249184384e-06, 'epoch': 0.52} + 52%|█████▏ | 4549/8750 [3:33:43<6:41:30, 5.73s/it] {'loss': 0.4461, 'learning_rate': 9.842686249184384e-06, 'epoch': 0.52} + 52%|█████▏ | 4549/8750 [3:33:41<6:41:30, 5.73s/it]9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 52%|█████▏ | 4550/8750 [3:33:49<6:42:35, 5.75s/it]10 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ 52%|█████▏ | 4550/8750 [3:33:46<6:42:35, 5.75s/it]5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4728, 'learning_rate': 9.838985065402032e-06, 'epoch': 0.52} + 52%|█████▏ | 4550/8750 [3:33:49<6:42:35, 5.75s/it] {'loss': 0.4728, 'learning_rate': 9.838985065402032e-06, 'epoch': 0.52} + 52%|█████▏ | 4550/8750 [3:33:46<6:42:35, 5.75s/it] 52%|█████▏ | 4551/8750 [3:33:55<6:39:38, 5.71s/it] 52%|█████▏ | 4551/8750 [3:33:52<6:39:38, 5.71s/it] {'loss': 0.4747, 'learning_rate': 9.835283903682327e-06, 'epoch': 0.52} + 52%|█████▏ | 4551/8750 [3:33:55<6:39:38, 5.71s/it] {'loss': 0.4747, 'learning_rate': 9.835283903682327e-06, 'epoch': 0.52} + 52%|█████▏ | 4551/8750 [3:33:52<6:39:38, 5.71s/it] 52%|█████▏ | 4552/8750 [3:33:58<6:47:26, 5.82s/it] 52%|█████▏ | 4552/8750 [3:34:01<6:47:26, 5.82s/it] {'loss': 0.4419, 'learning_rate': 9.831582764532399e-06, 'epoch': 0.52} + 52%|█████▏ | 4552/8750 [3:34:01<6:47:26, 5.82s/it] {'loss': 0.4419, 'learning_rate': 9.831582764532399e-06, 'epoch': 0.52} + 52%|█████▏ | 4552/8750 [3:33:58<6:47:26, 5.82s/it] 52%|█████▏ | 4553/8750 [3:34:07<6:45:22, 5.80s/it] 52%|█████▏ | 4553/8750 [3:34:04<6:45:22, 5.80s/it] {'loss': 0.4559, 'learning_rate': 9.827881648459396e-06, 'epoch': 0.52} + 52%|█████▏ | 4553/8750 [3:34:07<6:45:22, 5.80s/it] {'loss': 0.4559, 'learning_rate': 9.827881648459396e-06, 'epoch': 0.52} + 52%|█████▏ | 4553/8750 [3:34:04<6:45:22, 5.80s/it] 52%|█████▏ | 4554/8750 [3:34:10<6:44:21, 5.78s/it] 52%|█████▏ | 4554/8750 [3:34:12<6:44:22, 5.78s/it] {'loss': 0.4902, 'learning_rate': 9.824180555970451e-06, 'epoch': 0.52} + {'loss': 0.4902, 'learning_rate': 9.824180555970451e-06, 'epoch': 0.52} 52%|█████▏ | 4554/8750 [3:34:12<6:44:22, 5.78s/it] + 52%|█████▏ | 4554/8750 [3:34:10<6:44:21, 5.78s/it] 52%|█████▏ | 4555/8750 [3:34:16<6:48:27, 5.84s/it] 52%|█████▏ | 4555/8750 
[3:34:18<6:48:27, 5.84s/it] {'loss': 0.443, 'learning_rate': 9.820479487572691e-06, 'epoch': 0.52} + 52%|█████▏ | 4555/8750 [3:34:18<6:48:27, 5.84s/it] {'loss': 0.443, 'learning_rate': 9.820479487572691e-06, 'epoch': 0.52} + 52%|█████▏ | 4555/8750 [3:34:16<6:48:27, 5.84s/it] 52%|█████▏ | 4556/8750 [3:34:24<6:48:34, 5.85s/it] 52%|█████▏ | 4556/8750 [3:34:21<6:48:35, 5.85s/it] {'loss': 0.4473, 'learning_rate': 9.816778443773253e-06, 'epoch': 0.52} + 52%|█████▏ | 4556/8750 [3:34:24<6:48:34, 5.85s/it] {'loss': 0.4473, 'learning_rate': 9.816778443773253e-06, 'epoch': 0.52} + 52%|█████▏ | 4556/8750 [3:34:21<6:48:35, 5.85s/it] 52%|█████▏ | 4557/8750 [3:34:30<6:44:14, 5.78s/it] 52%|█████▏ | 4557/8750 [3:34:27<6:44:14, 5.78s/it] {'loss': 0.4676, 'learning_rate': 9.813077425079258e-06, 'epoch': 0.52} + 52%|█████▏ | 4557/8750 [3:34:30<6:44:14, 5.78s/it] {'loss': 0.4676, 'learning_rate': 9.813077425079258e-06, 'epoch': 0.52} + 52%|█████▏ | 4557/8750 [3:34:27<6:44:14, 5.78s/it] 52%|█████▏ | 4558/8750 [3:34:36<6:41:52, 5.75s/it] 52%|█████▏ | 4558/8750 [3:34:33<6:41:52, 5.75s/it] {'loss': 0.446, 'learning_rate': 9.809376431997825e-06, 'epoch': 0.52} + 52%|█████▏ | 4558/8750 [3:34:36<6:41:52, 5.75s/it] {'loss': 0.446, 'learning_rate': 9.809376431997825e-06, 'epoch': 0.52} + 52%|█████▏ | 4558/8750 [3:34:33<6:41:52, 5.75s/it] 52%|█████▏ | 4559/8750 [3:34:41<6:41:24, 5.75s/it] 52%|█████▏ | 4559/8750 [3:34:38<6:41:25, 5.75s/it] {'loss': 0.4786, 'learning_rate': 9.805675465036084e-06, 'epoch': 0.52} + 52%|█████▏ | 4559/8750 [3:34:41<6:41:24, 5.75s/it] {'loss': 0.4786, 'learning_rate': 9.805675465036084e-06, 'epoch': 0.52} + 52%|█████▏ | 4559/8750 [3:34:38<6:41:25, 5.75s/it] 52%|█████▏ | 4560/8750 [3:34:44<6:42:54, 5.77s/it] 52%|█████▏ | 4560/8750 [3:34:47<6:42:54, 5.77s/it] {'loss': 0.4651, 'learning_rate': 9.801974524701135e-06, 'epoch': 0.52} + 52%|█████▏ | 4560/8750 [3:34:47<6:42:54, 5.77s/it] {'loss': 0.4651, 'learning_rate': 9.801974524701135e-06, 'epoch': 0.52} + 52%|█████▏ | 
4560/8750 [3:34:44<6:42:54, 5.77s/it] 52%|█████▏ | 4561/8750 [3:34:53<6:43:46, 5.78s/it] 52%|█████▏ | 4561/8750 [3:34:50<6:43:46, 5.78s/it] {'loss': 0.4478, 'learning_rate': 9.798273611500103e-06, 'epoch': 0.52} + 52%|█████▏ | 4561/8750 [3:34:53<6:43:46, 5.78s/it] {'loss': 0.4478, 'learning_rate': 9.798273611500103e-06, 'epoch': 0.52} + 52%|█████▏ | 4561/8750 [3:34:50<6:43:46, 5.78s/it] 52%|█████▏ | 4562/8750 [3:34:56<6:45:46, 5.81s/it] 52%|█████▏ | 4562/8750 [3:34:59<6:45:46, 5.81s/it] {'loss': 0.4633, 'learning_rate': 9.794572725940088e-06, 'epoch': 0.52} + 52%|█████▏ | 4562/8750 [3:34:59<6:45:46, 5.81s/it] {'loss': 0.4633, 'learning_rate': 9.794572725940088e-06, 'epoch': 0.52} + 52%|█████▏ | 4562/8750 [3:34:56<6:45:46, 5.81s/it] 52%|█████▏ | 4563/8750 [3:35:02<6:48:29, 5.85s/it] 52%|█████▏ | 4563/8750 [3:35:05<6:48:29, 5.85s/it] {'loss': 0.4631, 'learning_rate': 9.790871868528194e-06, 'epoch': 0.52} + 52%|█████▏ | 4563/8750 [3:35:05<6:48:29, 5.85s/it] {'loss': 0.4631, 'learning_rate': 9.790871868528194e-06, 'epoch': 0.52} + 52%|█████▏ | 4563/8750 [3:35:02<6:48:29, 5.85s/it] 52%|█████▏ | 4564/8750 [3:35:11<6:53:13, 5.92s/it] 52%|█████▏ | 4564/8750 [3:35:08<6:53:13, 5.92s/it] {'loss': 0.458, 'learning_rate': 9.787171039771528e-06, 'epoch': 0.52} + 52%|█████▏ | 4564/8750 [3:35:11<6:53:13, 5.92s/it] {'loss': 0.458, 'learning_rate': 9.787171039771528e-06, 'epoch': 0.52} + 52%|█████▏ | 4564/8750 [3:35:08<6:53:13, 5.92s/it] 52%|█████▏ | 4565/8750 [3:35:17<6:48:17, 5.85s/it] 52%|█████▏ | 4565/8750 [3:35:14<6:48:17, 5.85s/it] {'loss': 0.4644, 'learning_rate': 9.783470240177175e-06, 'epoch': 0.52} + 52%|█████▏ | 4565/8750 [3:35:17<6:48:17, 5.85s/it] {'loss': 0.4644, 'learning_rate': 9.783470240177175e-06, 'epoch': 0.52} + 52%|█████▏ | 4565/8750 [3:35:14<6:48:17, 5.85s/it] 52%|█████▏ | 4566/8750 [3:35:22<6:45:31, 5.82s/it] 52%|█████▏ | 4566/8750 [3:35:19<6:45:31, 5.82s/it] {'loss': 0.4525, 'learning_rate': 9.779769470252237e-06, 'epoch': 0.52} + 52%|█████▏ | 4566/8750 
[3:35:22<6:45:31, 5.82s/it] {'loss': 0.4525, 'learning_rate': 9.779769470252237e-06, 'epoch': 0.52} + 52%|█████▏ | 4566/8750 [3:35:19<6:45:31, 5.82s/it] 52%|█████▏ | 4567/8750 [3:35:25<6:42:58, 5.78s/it] 52%|█████▏ | 4567/8750 [3:35:28<6:42:58, 5.78s/it] {'loss': 0.48, 'learning_rate': 9.776068730503801e-06, 'epoch': 0.52} + 52%|█████▏ | 4567/8750 [3:35:28<6:42:58, 5.78s/it] {'loss': 0.48, 'learning_rate': 9.776068730503801e-06, 'epoch': 0.52} + 52%|█████▏ | 4567/8750 [3:35:25<6:42:58, 5.78s/it] 52%|█████▏ | 4568/8750 [3:35:34<6:42:12, 5.77s/it] 52%|█████▏ | 4568/8750 [3:35:31<6:42:12, 5.77s/it] {'loss': 0.4676, 'learning_rate': 9.772368021438943e-06, 'epoch': 0.52} + 52%|█████▏ | 4568/8750 [3:35:34<6:42:12, 5.77s/it] {'loss': 0.4676, 'learning_rate': 9.772368021438943e-06, 'epoch': 0.52} + 52%|█████▏ | 4568/8750 [3:35:31<6:42:12, 5.77s/it] 52%|█████▏ | 4569/8750 [3:35:37<6:42:20, 5.77s/it] 52%|█████▏ | 4569/8750 [3:35:40<6:42:20, 5.77s/it] {'loss': 0.4516, 'learning_rate': 9.768667343564752e-06, 'epoch': 0.52} + 52%|█████▏ | 4569/8750 [3:35:40<6:42:20, 5.77s/it] {'loss': 0.4516, 'learning_rate': 9.768667343564752e-06, 'epoch': 0.52} + 52%|█████▏ | 4569/8750 [3:35:37<6:42:20, 5.77s/it] 52%|█████▏ | 4570/8750 [3:35:42<6:41:58, 5.77s/it] 52%|█████▏ | 4570/8750 [3:35:45<6:41:58, 5.77s/it] {'loss': 0.4608, 'learning_rate': 9.7649666973883e-06, 'epoch': 0.52} + 52%|█████▏ | 4570/8750 [3:35:45<6:41:58, 5.77s/it] {'loss': 0.4608, 'learning_rate': 9.7649666973883e-06, 'epoch': 0.52} + 52%|█████▏ | 4570/8750 [3:35:42<6:41:58, 5.77s/it] 52%|█████▏ | 4571/8750 [3:35:51<6:40:01, 5.74s/it] 52%|█████▏ | 4571/8750 [3:35:48<6:40:01, 5.74s/it] {'loss': 0.4624, 'learning_rate': 9.761266083416655e-06, 'epoch': 0.52} + 52%|█████▏ | 4571/8750 [3:35:51<6:40:01, 5.74s/it] {'loss': 0.4624, 'learning_rate': 9.761266083416655e-06, 'epoch': 0.52} + 52%|█████▏ | 4571/8750 [3:35:48<6:40:01, 5.74s/it] 52%|█████▏ | 4572/8750 [3:35:57<6:43:26, 5.79s/it] 52%|█████▏ | 4572/8750 [3:35:54<6:43:26, 
5.79s/it] {'loss': 0.4609, 'learning_rate': 9.75756550215689e-06, 'epoch': 0.52} + 52%|█████▏ | 4572/8750 [3:35:57<6:43:26, 5.79s/it] {'loss': 0.4609, 'learning_rate': 9.75756550215689e-06, 'epoch': 0.52} + 52%|█████▏ | 4572/8750 [3:35:54<6:43:26, 5.79s/it] 52%|█████▏ | 4573/8750 [3:36:00<6:41:49, 5.77s/it] 52%|█████▏ | 4573/8750 [3:36:03<6:41:49, 5.77s/it] {'loss': 0.4632, 'learning_rate': 9.753864954116058e-06, 'epoch': 0.52} + 52%|█████▏ | 4573/8750 [3:36:03<6:41:49, 5.77s/it] {'loss': 0.4632, 'learning_rate': 9.753864954116058e-06, 'epoch': 0.52} + 52%|█████▏ | 4573/8750 [3:36:00<6:41:49, 5.77s/it] 52%|█████▏ | 4574/8750 [3:36:08<6:41:15, 5.77s/it] 52%|█████▏ | 4574/8750 [3:36:05<6:41:15, 5.77s/it] {'loss': 0.4696, 'learning_rate': 9.75016443980123e-06, 'epoch': 0.52} + 52%|█████▏ | 4574/8750 [3:36:08<6:41:15, 5.77s/it] {'loss': 0.4696, 'learning_rate': 9.75016443980123e-06, 'epoch': 0.52} + 52%|█████▏ | 4574/8750 [3:36:05<6:41:15, 5.77s/it] 52%|█████▏ | 4575/8750 [3:36:11<6:39:28, 5.74s/it] 52%|█████▏ | 4575/8750 [3:36:14<6:39:28, 5.74s/it] {'loss': 0.4639, 'learning_rate': 9.746463959719447e-06, 'epoch': 0.52} + 52%|█████▏ | 4575/8750 [3:36:14<6:39:28, 5.74s/it] {'loss': 0.4639, 'learning_rate': 9.746463959719447e-06, 'epoch': 0.52} + 52%|█████▏ | 4575/8750 [3:36:11<6:39:28, 5.74s/it] 52%|█████▏ | 4576/8750 [3:36:20<6:43:26, 5.80s/it] 52%|█████▏ | 4576/8750 [3:36:17<6:43:26, 5.80s/it] {'loss': 0.4705, 'learning_rate': 9.74276351437776e-06, 'epoch': 0.52} + 52%|█████▏ | 4576/8750 [3:36:20<6:43:26, 5.80s/it] {'loss': 0.4705, 'learning_rate': 9.74276351437776e-06, 'epoch': 0.52} + 52%|█████▏ | 4576/8750 [3:36:17<6:43:26, 5.80s/it] 52%|█████▏ | 4577/8750 [3:36:26<6:42:03, 5.78s/it] 52%|█████▏ | 4577/8750 [3:36:23<6:42:03, 5.78s/it] {'loss': 0.4716, 'learning_rate': 9.73906310428322e-06, 'epoch': 0.52} + 52%|█████▏ | 4577/8750 [3:36:26<6:42:03, 5.78s/it] {'loss': 0.4716, 'learning_rate': 9.73906310428322e-06, 'epoch': 0.52} + 52%|█████▏ | 4577/8750 
[3:36:23<6:42:03, 5.78s/it] 52%|█████▏ | 4578/8750 [3:36:31<6:39:25, 5.74s/it] 52%|█████▏ | 4578/8750 [3:36:28<6:39:25, 5.74s/it] {'loss': 0.4548, 'learning_rate': 9.735362729942856e-06, 'epoch': 0.52} + 52%|█████▏ | 4578/8750 [3:36:31<6:39:25, 5.74s/it] {'loss': 0.4548, 'learning_rate': 9.735362729942856e-06, 'epoch': 0.52} + 52%|█████▏ | 4578/8750 [3:36:28<6:39:25, 5.74s/it] 52%|█████▏ | 4579/8750 [3:36:37<6:39:18, 5.74s/it] 52%|█████▏ | 4579/8750 [3:36:34<6:39:18, 5.74s/it] {'loss': 0.453, 'learning_rate': 9.731662391863711e-06, 'epoch': 0.52} + 52%|█████▏ | 4579/8750 [3:36:37<6:39:18, 5.74s/it] {'loss': 0.453, 'learning_rate': 9.731662391863711e-06, 'epoch': 0.52} + 52%|█████▏ | 4579/8750 [3:36:34<6:39:18, 5.74s/it] 52%|█████▏ | 4580/8750 [3:36:43<6:39:18, 5.75s/it] 52%|█████▏ | 4580/8750 [3:36:40<6:39:19, 5.75s/it] {'loss': 0.4619, 'learning_rate': 9.727962090552808e-06, 'epoch': 0.52} + 52%|█████▏ | 4580/8750 [3:36:43<6:39:18, 5.75s/it] {'loss': 0.4619, 'learning_rate': 9.727962090552808e-06, 'epoch': 0.52} + 52%|█████▏ | 4580/8750 [3:36:40<6:39:19, 5.75s/it] 52%|█████▏ | 4581/8750 [3:36:49<6:40:39, 5.77s/it] 52%|█████▏ | 4581/8750 [3:36:46<6:40:39, 5.77s/it] {'loss': 0.4502, 'learning_rate': 9.724261826517167e-06, 'epoch': 0.52} + 52%|█████▏ | 4581/8750 [3:36:49<6:40:39, 5.77s/it] {'loss': 0.4502, 'learning_rate': 9.724261826517167e-06, 'epoch': 0.52} + 52%|█████▏ | 4581/8750 [3:36:46<6:40:39, 5.77s/it] 52%|█████▏ | 4582/8750 [3:36:54<6:37:07, 5.72s/it] 52%|█████▏ | 4582/8750 [3:36:51<6:37:07, 5.72s/it] {'loss': 0.4543, 'learning_rate': 9.720561600263818e-06, 'epoch': 0.52} + 52%|█████▏ | 4582/8750 [3:36:54<6:37:07, 5.72s/it] {'loss': 0.4543, 'learning_rate': 9.720561600263818e-06, 'epoch': 0.52} + 52%|█████▏ | 4582/8750 [3:36:51<6:37:07, 5.72s/it] 52%|█████▏ | 4583/8750 [3:37:00<6:35:56, 5.70s/it] 52%|█████▏ | 4583/8750 [3:36:57<6:35:56, 5.70s/it] {'loss': 0.4512, 'learning_rate': 9.716861412299769e-06, 'epoch': 0.52} + 52%|█████▏ | 4583/8750 
[3:37:00<6:35:56, 5.70s/it] {'loss': 0.4512, 'learning_rate': 9.716861412299769e-06, 'epoch': 0.52} + 52%|█████▏ | 4583/8750 [3:36:57<6:35:56, 5.70s/it] 52%|█████▏ | 4584/8750 [3:37:03<6:35:52, 5.70s/it] 52%|█████▏ | 4584/8750 [3:37:06<6:35:53, 5.70s/it] {'loss': 0.4865, 'learning_rate': 9.713161263132022e-06, 'epoch': 0.52} + 52%|█████▏ | 4584/8750 [3:37:06<6:35:53, 5.70s/it] {'loss': 0.4865, 'learning_rate': 9.713161263132022e-06, 'epoch': 0.52} + 52%|█████▏ | 4584/8750 [3:37:03<6:35:52, 5.70s/it] 52%|█████▏ | 4585/8750 [3:37:11<6:36:14, 5.71s/it] 52%|█████▏ | 4585/8750 [3:37:08<6:36:14, 5.71s/it] {'loss': 0.4704, 'learning_rate': 9.70946115326759e-06, 'epoch': 0.52} + 52%|█████▏ | 4585/8750 [3:37:11<6:36:14, 5.71s/it] {'loss': 0.4704, 'learning_rate': 9.70946115326759e-06, 'epoch': 0.52} + 52%|█████▏ | 4585/8750 [3:37:08<6:36:14, 5.71s/it] 52%|█████▏ | 4586/8750 [3:37:17<6:38:09, 5.74s/it] 52%|█████▏ | 4586/8750 [3:37:14<6:38:09, 5.74s/it] {'loss': 0.4619, 'learning_rate': 9.705761083213463e-06, 'epoch': 0.52} + 52%|█████▏ | 4586/8750 [3:37:17<6:38:09, 5.74s/it] {'loss': 0.4619, 'learning_rate': 9.705761083213463e-06, 'epoch': 0.52} + 52%|█████▏ | 4586/8750 [3:37:14<6:38:09, 5.74s/it] 52%|█████▏ | 4587/8750 [3:37:23<6:43:36, 5.82s/it] 52%|█████▏ | 4587/8750 [3:37:20<6:43:36, 5.82s/it] {'loss': 0.4452, 'learning_rate': 9.702061053476642e-06, 'epoch': 0.52} + 52%|█████▏ | 4587/8750 [3:37:23<6:43:36, 5.82s/it] {'loss': 0.4452, 'learning_rate': 9.702061053476642e-06, 'epoch': 0.52} + 52%|█████▏ | 4587/8750 [3:37:20<6:43:36, 5.82s/it] 52%|█████▏ | 4588/8750 [3:37:26<6:45:49, 5.85s/it] 52%|█████▏ | 4588/8750 [3:37:29<6:45:49, 5.85s/it] {'loss': 0.4892, 'learning_rate': 9.698361064564107e-06, 'epoch': 0.52} + 52%|█████▏ | 4588/8750 [3:37:29<6:45:49, 5.85s/it] {'loss': 0.4892, 'learning_rate': 9.698361064564107e-06, 'epoch': 0.52} + 52%|█████▏ | 4588/8750 [3:37:26<6:45:49, 5.85s/it] 52%|█████▏ | 4589/8750 [3:37:35<6:47:20, 5.87s/it] 52%|█████▏ | 4589/8750 
[3:37:32<6:47:19, 5.87s/it] {'loss': 0.437, 'learning_rate': 9.694661116982838e-06, 'epoch': 0.52} + 52%|█████▏ | 4589/8750 [3:37:35<6:47:20, 5.87s/it] {'loss': 0.437, 'learning_rate': 9.694661116982838e-06, 'epoch': 0.52} + 52%|█████▏ | 4589/8750 [3:37:32<6:47:19, 5.87s/it] 52%|█████▏ | 4590/8750 [3:37:41<6:43:26, 5.82s/it] 52%|█████▏ | 4590/8750 [3:37:38<6:43:26, 5.82s/it] {'loss': 0.4578, 'learning_rate': 9.690961211239816e-06, 'epoch': 0.52} + 52%|█████▏ | 4590/8750 [3:37:41<6:43:26, 5.82s/it] {'loss': 0.4578, 'learning_rate': 9.690961211239816e-06, 'epoch': 0.52} + 52%|█████▏ | 4590/8750 [3:37:38<6:43:26, 5.82s/it] 52%|█████▏ | 4591/8750 [3:37:46<6:40:05, 5.77s/it] 52%|█████▏ | 4591/8750 [3:37:43<6:40:05, 5.77s/it] {'loss': 0.4736, 'learning_rate': 9.687261347842004e-06, 'epoch': 0.52} + 52%|█████▏ | 4591/8750 [3:37:46<6:40:05, 5.77s/it] {'loss': 0.4736, 'learning_rate': 9.687261347842004e-06, 'epoch': 0.52} + 52%|█████▏ | 4591/8750 [3:37:43<6:40:05, 5.77s/it] 52%|█████▏ | 4592/8750 [3:37:52<6:38:28, 5.75s/it] 52%|█████▏ | 4592/8750 [3:37:49<6:38:28, 5.75s/it] {'loss': 0.4749, 'learning_rate': 9.683561527296375e-06, 'epoch': 0.52} + 52%|█████▏ | 4592/8750 [3:37:52<6:38:28, 5.75s/it] {'loss': 0.4749, 'learning_rate': 9.683561527296375e-06, 'epoch': 0.52} + 52%|█████▏ | 4592/8750 [3:37:49<6:38:28, 5.75s/it] 52%|█████▏ | 4593/8750 [3:37:58<6:40:37, 5.78s/it] 52%|█████▏ | 4593/8750 [3:37:55<6:40:37, 5.78s/it] {'loss': 0.4598, 'learning_rate': 9.67986175010988e-06, 'epoch': 0.52} + 52%|█████▏ | 4593/8750 [3:37:58<6:40:37, 5.78s/it] {'loss': 0.4598, 'learning_rate': 9.67986175010988e-06, 'epoch': 0.52} + 52%|█████▏ | 4593/8750 [3:37:55<6:40:37, 5.78s/it] 53%|█████▎ | 4594/8750 [3:38:01<6:37:25, 5.74s/it] 53%|█████▎ | 4594/8750 [3:38:04<6:37:26, 5.74s/it] {'loss': 0.4787, 'learning_rate': 9.676162016789469e-06, 'epoch': 0.53} + 53%|█████▎ | 4594/8750 [3:38:04<6:37:26, 5.74s/it] {'loss': 0.4787, 'learning_rate': 9.676162016789469e-06, 'epoch': 0.53} + 53%|█████▎ | 
4594/8750 [3:38:01<6:37:25, 5.74s/it] 53%|█████▎ | 4595/8750 [3:38:06<6:35:09, 5.71s/it] 53%|█████▎ | 4595/8750 [3:38:09<6:35:09, 5.71s/it] {'loss': 0.4639, 'learning_rate': 9.672462327842095e-06, 'epoch': 0.53} + 53%|█████▎ | 4595/8750 [3:38:09<6:35:09, 5.71s/it] {'loss': 0.4639, 'learning_rate': 9.672462327842095e-06, 'epoch': 0.53} + 53%|█████▎ | 4595/8750 [3:38:06<6:35:09, 5.71s/it] 53%|█████▎ | 4596/8750 [3:38:12<6:38:19, 5.75s/it] 53%|█████▎ | 4596/8750 [3:38:15<6:38:19, 5.75s/it] {'loss': 0.4549, 'learning_rate': 9.668762683774691e-06, 'epoch': 0.53} + 53%|█████▎ | 4596/8750 [3:38:15<6:38:19, 5.75s/it] {'loss': 0.4549, 'learning_rate': 9.668762683774691e-06, 'epoch': 0.53} + 53%|█████▎ | 4596/8750 [3:38:12<6:38:19, 5.75s/it] 53%|█████▎ | 4597/8750 [3:38:21<6:42:49, 5.82s/it] 53%|█████▎ | 4597/8750 [3:38:18<6:42:50, 5.82s/it] {'loss': 0.4702, 'learning_rate': 9.6650630850942e-06, 'epoch': 0.53} + 53%|█████▎ | 4597/8750 [3:38:21<6:42:49, 5.82s/it] {'loss': 0.4702, 'learning_rate': 9.6650630850942e-06, 'epoch': 0.53} + 53%|█████▎ | 4597/8750 [3:38:18<6:42:50, 5.82s/it] 53%|█████▎ | 4598/8750 [3:38:27<6:43:23, 5.83s/it] 53%|█████▎ | 4598/8750 [3:38:24<6:43:24, 5.83s/it] {'loss': 0.4616, 'learning_rate': 9.661363532307543e-06, 'epoch': 0.53} + 53%|█████▎ | 4598/8750 [3:38:27<6:43:23, 5.83s/it] {'loss': 0.4616, 'learning_rate': 9.661363532307543e-06, 'epoch': 0.53} + 53%|█████▎ | 4598/8750 [3:38:24<6:43:24, 5.83s/it] 53%|█████▎ | 4599/8750 [3:38:33<6:42:00, 5.81s/it] 53%|█████▎ | 4599/8750 [3:38:30<6:42:00, 5.81s/it] {'loss': 0.4599, 'learning_rate': 9.65766402592164e-06, 'epoch': 0.53} + 53%|█████▎ | 4599/8750 [3:38:33<6:42:00, 5.81s/it] {'loss': 0.4599, 'learning_rate': 9.65766402592164e-06, 'epoch': 0.53} + 53%|█████▎ | 4599/8750 [3:38:30<6:42:00, 5.81s/it]6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +128 AutoResumeHook: Checking whether to suspend... 
+AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 4600/8750 [3:38:38<6:39:37, 5.78s/it]11 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +01513 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 4600/8750 [3:38:35<6:39:37, 5.78s/it]10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4743, 'learning_rate': 9.65396456644341e-06, 'epoch': 0.53} + 53%|█████▎ | 4600/8750 [3:38:38<6:39:37, 5.78s/it] {'loss': 0.4743, 'learning_rate': 9.65396456644341e-06, 'epoch': 0.53} + 53%|█████▎ | 4600/8750 [3:38:35<6:39:37, 5.78s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 53%|█████▎ | 4601/8750 [3:39:00<12:05:17, 10.49s/it] 53%|█████▎ | 4601/8750 [3:38:57<12:05:17, 10.49s/it] {'loss': 0.4605, 'learning_rate': 9.650265154379761e-06, 'epoch': 0.53} + 53%|█████▎ | 4601/8750 [3:39:00<12:05:17, 10.49s/it] {'loss': 0.4605, 'learning_rate': 9.650265154379761e-06, 'epoch': 0.53} + 53%|█████▎ | 4601/8750 [3:38:57<12:05:17, 10.49s/it] 53%|█████▎ | 4602/8750 [3:39:03<10:28:46, 9.10s/it] 53%|█████▎ | 4602/8750 [3:39:06<10:28:47, 9.10s/it] {'loss': 0.4574, 'learning_rate': 9.64656579023759e-06, 'epoch': 0.53} + 53%|█████▎ | 4602/8750 [3:39:06<10:28:47, 9.10s/it] {'loss': 0.4574, 'learning_rate': 9.64656579023759e-06, 'epoch': 0.53} + 53%|█████▎ | 4602/8750 [3:39:03<10:28:46, 9.10s/it] 53%|█████▎ | 4603/8750 [3:39:08<9:18:03, 8.07s/it] 53%|█████▎ | 4603/8750 [3:39:11<9:18:03, 8.07s/it] {'loss': 0.458, 'learning_rate': 9.642866474523802e-06, 'epoch': 0.53} + 53%|█████▎ | 4603/8750 [3:39:11<9:18:03, 8.07s/it] {'loss': 0.458, 'learning_rate': 9.642866474523802e-06, 'epoch': 0.53} + 53%|█████▎ | 4603/8750 [3:39:08<9:18:03, 8.07s/it] 53%|█████▎ | 4604/8750 [3:39:17<8:34:26, 7.44s/it] 53%|█████▎ | 4604/8750 [3:39:14<8:34:27, 7.45s/it] {'loss': 0.448, 'learning_rate': 9.639167207745276e-06, 'epoch': 0.53} + 53%|█████▎ | 4604/8750 [3:39:17<8:34:26, 7.44s/it] {'loss': 0.448, 'learning_rate': 9.639167207745276e-06, 'epoch': 0.53} + 53%|█████▎ | 4604/8750 [3:39:14<8:34:27, 7.45s/it] 53%|█████▎ | 4605/8750 [3:39:23<8:05:47, 7.03s/it] 53%|█████▎ | 4605/8750 [3:39:21<8:05:47, 7.03s/it] {'loss': 0.4661, 'learning_rate': 9.6354679904089e-06, 'epoch': 0.53} + 53%|█████▎ | 4605/8750 [3:39:23<8:05:47, 7.03s/it] {'loss': 0.4661, 'learning_rate': 9.6354679904089e-06, 'epoch': 0.53} + 53%|█████▎ | 4605/8750 [3:39:21<8:05:47, 7.03s/it] 53%|█████▎ | 4606/8750 [3:39:29<7:40:12, 6.66s/it] 53%|█████▎ | 4606/8750 [3:39:26<7:40:13, 6.66s/it] {'loss': 0.4667, 'learning_rate': 9.631768823021551e-06, 'epoch': 0.53} + 53%|█████▎ | 4606/8750 [3:39:29<7:40:12, 
6.66s/it] {'loss': 0.4667, 'learning_rate': 9.631768823021551e-06, 'epoch': 0.53} + 53%|█████▎ | 4606/8750 [3:39:26<7:40:13, 6.66s/it] 53%|█████▎ | 4607/8750 [3:39:35<7:28:21, 6.49s/it] 53%|█████▎ | 4607/8750 [3:39:32<7:28:21, 6.49s/it] {'loss': 0.4673, 'learning_rate': 9.628069706090089e-06, 'epoch': 0.53} + 53%|█████▎ | 4607/8750 [3:39:35<7:28:21, 6.49s/it] {'loss': 0.4673, 'learning_rate': 9.628069706090089e-06, 'epoch': 0.53} + 53%|█████▎ | 4607/8750 [3:39:32<7:28:21, 6.49s/it] 53%|█████▎ | 4608/8750 [3:39:38<7:18:11, 6.35s/it] 53%|█████▎ | 4608/8750 [3:39:41<7:18:11, 6.35s/it] {'loss': 0.4743, 'learning_rate': 9.624370640121387e-06, 'epoch': 0.53} + 53%|█████▎ | 4608/8750 [3:39:41<7:18:11, 6.35s/it] {'loss': 0.4743, 'learning_rate': 9.624370640121387e-06, 'epoch': 0.53} + 53%|█████▎ | 4608/8750 [3:39:38<7:18:11, 6.35s/it] 53%|█████▎ | 4609/8750 [3:39:47<7:05:05, 6.16s/it] 53%|█████▎ | 4609/8750 [3:39:44<7:05:05, 6.16s/it] {'loss': 0.4708, 'learning_rate': 9.620671625622287e-06, 'epoch': 0.53} + 53%|█████▎ | 4609/8750 [3:39:47<7:05:05, 6.16s/it] {'loss': 0.4708, 'learning_rate': 9.620671625622287e-06, 'epoch': 0.53} + 53%|█████▎ | 4609/8750 [3:39:44<7:05:05, 6.16s/it] 53%|█████▎ | 4610/8750 [3:39:50<6:54:25, 6.01s/it] 53%|█████▎ | 4610/8750 [3:39:53<6:54:25, 6.01s/it] {'loss': 0.4519, 'learning_rate': 9.616972663099648e-06, 'epoch': 0.53} + 53%|█████▎ | 4610/8750 [3:39:53<6:54:25, 6.01s/it] {'loss': 0.4519, 'learning_rate': 9.616972663099648e-06, 'epoch': 0.53} + 53%|█████▎ | 4610/8750 [3:39:50<6:54:25, 6.01s/it] 53%|█████▎ | 4611/8750 [3:39:58<6:48:50, 5.93s/it] 53%|█████▎ | 4611/8750 [3:39:56<6:48:50, 5.93s/it] {'loss': 0.4693, 'learning_rate': 9.613273753060306e-06, 'epoch': 0.53} + 53%|█████▎ | 4611/8750 [3:39:58<6:48:50, 5.93s/it] {'loss': 0.4693, 'learning_rate': 9.613273753060306e-06, 'epoch': 0.53} + 53%|█████▎ | 4611/8750 [3:39:56<6:48:50, 5.93s/it] 53%|█████▎ | 4612/8750 [3:40:04<6:45:40, 5.88s/it] 53%|█████▎ | 4612/8750 [3:40:01<6:45:40, 5.88s/it] 
{'loss': 0.4497, 'learning_rate': 9.60957489601109e-06, 'epoch': 0.53} + 53%|█████▎ | 4612/8750 [3:40:04<6:45:40, 5.88s/it] {'loss': 0.4497, 'learning_rate': 9.60957489601109e-06, 'epoch': 0.53} + 53%|█████▎ | 4612/8750 [3:40:01<6:45:40, 5.88s/it] 53%|█████▎ | 4613/8750 [3:40:10<6:46:01, 5.89s/it] 53%|█████▎ | 4613/8750 [3:40:07<6:46:01, 5.89s/it] {'loss': 0.4547, 'learning_rate': 9.605876092458835e-06, 'epoch': 0.53} + 53%|█████▎ | 4613/8750 [3:40:10<6:46:01, 5.89s/it] {'loss': 0.4547, 'learning_rate': 9.605876092458835e-06, 'epoch': 0.53} + 53%|█████▎ | 4613/8750 [3:40:07<6:46:01, 5.89s/it] 53%|█████▎ | 4614/8750 [3:40:13<6:43:07, 5.85s/it] 53%|█████▎ | 4614/8750 [3:40:16<6:43:07, 5.85s/it] {'loss': 0.4704, 'learning_rate': 9.60217734291035e-06, 'epoch': 0.53} + 53%|█████▎ | 4614/8750 [3:40:16<6:43:07, 5.85s/it] {'loss': 0.4704, 'learning_rate': 9.60217734291035e-06, 'epoch': 0.53} + 53%|█████▎ | 4614/8750 [3:40:13<6:43:07, 5.85s/it] 53%|█████▎ | 4615/8750 [3:40:22<6:40:34, 5.81s/it] 53%|█████▎ | 4615/8750 [3:40:19<6:40:34, 5.81s/it] {'loss': 0.4674, 'learning_rate': 9.598478647872451e-06, 'epoch': 0.53} + 53%|█████▎ | 4615/8750 [3:40:22<6:40:34, 5.81s/it] {'loss': 0.4674, 'learning_rate': 9.598478647872451e-06, 'epoch': 0.53} + 53%|█████▎ | 4615/8750 [3:40:19<6:40:34, 5.81s/it] 53%|█████▎ | 4616/8750 [3:40:28<6:47:35, 5.92s/it] 53%|█████▎ | 4616/8750 [3:40:25<6:47:35, 5.92s/it] {'loss': 0.4687, 'learning_rate': 9.594780007851947e-06, 'epoch': 0.53} + 53%|█████▎ | 4616/8750 [3:40:28<6:47:35, 5.92s/it] {'loss': 0.4687, 'learning_rate': 9.594780007851947e-06, 'epoch': 0.53} + 53%|█████▎ | 4616/8750 [3:40:25<6:47:35, 5.92s/it] 53%|█████▎ | 4617/8750 [3:40:34<6:46:54, 5.91s/it] 53%|█████▎ | 4617/8750 [3:40:31<6:46:54, 5.91s/it] {'loss': 0.4746, 'learning_rate': 9.591081423355622e-06, 'epoch': 0.53} + 53%|█████▎ | 4617/8750 [3:40:34<6:46:54, 5.91s/it] {'loss': 0.4746, 'learning_rate': 9.591081423355622e-06, 'epoch': 0.53} + 53%|█████▎ | 4617/8750 [3:40:31<6:46:54, 
5.91s/it] 53%|█████▎ | 4618/8750 [3:40:40<6:45:25, 5.89s/it] 53%|█████▎ | 4618/8750 [3:40:37<6:45:24, 5.89s/it] {'loss': 0.4641, 'learning_rate': 9.587382894890276e-06, 'epoch': 0.53} + 53%|█████▎ | 4618/8750 [3:40:37<6:45:24, 5.89s/it]{'loss': 0.4641, 'learning_rate': 9.587382894890276e-06, 'epoch': 0.53} + 53%|█████▎ | 4618/8750 [3:40:40<6:45:25, 5.89s/it] 53%|█████▎ | 4619/8750 [3:40:45<6:40:44, 5.82s/it] 53%|█████▎ | 4619/8750 [3:40:42<6:40:44, 5.82s/it] {'loss': 0.4568, 'learning_rate': 9.583684422962686e-06, 'epoch': 0.53} + 53%|█████▎ | 4619/8750 [3:40:45<6:40:44, 5.82s/it] {'loss': 0.4568, 'learning_rate': 9.583684422962686e-06, 'epoch': 0.53} + 53%|█████▎ | 4619/8750 [3:40:42<6:40:44, 5.82s/it] 53%|█████▎ | 4620/8750 [3:40:51<6:37:11, 5.77s/it] 53%|█████▎ | 4620/8750 [3:40:48<6:37:11, 5.77s/it] {'loss': 0.4585, 'learning_rate': 9.57998600807962e-06, 'epoch': 0.53} + 53%|█████▎ | 4620/8750 [3:40:51<6:37:11, 5.77s/it] {'loss': 0.4585, 'learning_rate': 9.57998600807962e-06, 'epoch': 0.53} + 53%|█████▎ | 4620/8750 [3:40:48<6:37:11, 5.77s/it] 53%|█████▎ | 4621/8750 [3:40:57<6:42:04, 5.84s/it] 53%|█████▎ | 4621/8750 [3:40:54<6:42:04, 5.84s/it] {'loss': 0.455, 'learning_rate': 9.576287650747854e-06, 'epoch': 0.53} + 53%|█████▎ | 4621/8750 [3:40:57<6:42:04, 5.84s/it] {'loss': 0.455, 'learning_rate': 9.576287650747854e-06, 'epoch': 0.53} + 53%|█████▎ | 4621/8750 [3:40:54<6:42:04, 5.84s/it] 53%|█████▎ | 4622/8750 [3:41:02<6:37:38, 5.78s/it] 53%|█████▎ | 4622/8750 [3:41:00<6:37:38, 5.78s/it] {'loss': 0.4596, 'learning_rate': 9.572589351474135e-06, 'epoch': 0.53} + 53%|█████▎ | 4622/8750 [3:41:02<6:37:38, 5.78s/it] {'loss': 0.4596, 'learning_rate': 9.572589351474135e-06, 'epoch': 0.53} + 53%|█████▎ | 4622/8750 [3:41:00<6:37:38, 5.78s/it] 53%|█████▎ | 4623/8750 [3:41:08<6:37:05, 5.77s/it] 53%|█████▎ | 4623/8750 [3:41:05<6:37:05, 5.77s/it] {'loss': 0.4498, 'learning_rate': 9.568891110765219e-06, 'epoch': 0.53} + 53%|█████▎ | 4623/8750 [3:41:08<6:37:05, 5.77s/it] 
{'loss': 0.4498, 'learning_rate': 9.568891110765219e-06, 'epoch': 0.53} + 53%|█████▎ | 4623/8750 [3:41:05<6:37:05, 5.77s/it] 53%|█████▎ | 4624/8750 [3:41:11<6:35:32, 5.75s/it] 53%|█████▎ | 4624/8750 [3:41:14<6:35:32, 5.75s/it] {'loss': 0.4718, 'learning_rate': 9.565192929127849e-06, 'epoch': 0.53} + 53%|█████▎ | 4624/8750 [3:41:14<6:35:32, 5.75s/it] {'loss': 0.4718, 'learning_rate': 9.565192929127849e-06, 'epoch': 0.53} + 53%|█████▎ | 4624/8750 [3:41:11<6:35:32, 5.75s/it] 53%|█████▎ | 4625/8750 [3:41:17<6:39:49, 5.82s/it] 53%|█████▎ | 4625/8750 [3:41:20<6:39:49, 5.82s/it] {'loss': 0.459, 'learning_rate': 9.56149480706875e-06, 'epoch': 0.53} + 53%|█████▎ | 4625/8750 [3:41:20<6:39:49, 5.82s/it] {'loss': 0.459, 'learning_rate': 9.56149480706875e-06, 'epoch': 0.53} + 53%|█████▎ | 4625/8750 [3:41:17<6:39:49, 5.82s/it] 53%|█████▎ | 4626/8750 [3:41:26<6:38:03, 5.79s/it] 53%|█████▎ | 4626/8750 [3:41:23<6:38:03, 5.79s/it] {'loss': 0.4699, 'learning_rate': 9.557796745094659e-06, 'epoch': 0.53} + 53%|█████▎ | 4626/8750 [3:41:26<6:38:03, 5.79s/it] {'loss': 0.4699, 'learning_rate': 9.557796745094659e-06, 'epoch': 0.53} + 53%|█████▎ | 4626/8750 [3:41:23<6:38:03, 5.79s/it] 53%|█████▎ | 4627/8750 [3:41:31<6:34:27, 5.74s/it] 53%|█████▎ | 4627/8750 [3:41:28<6:34:27, 5.74s/it] {'loss': 0.4617, 'learning_rate': 9.554098743712282e-06, 'epoch': 0.53} + 53%|█████▎ | 4627/8750 [3:41:31<6:34:27, 5.74s/it] {'loss': 0.4617, 'learning_rate': 9.554098743712282e-06, 'epoch': 0.53} + 53%|█████▎ | 4627/8750 [3:41:28<6:34:27, 5.74s/it] 53%|█████▎ | 4628/8750 [3:41:37<6:35:04, 5.75s/it] 53%|█████▎ | 4628/8750 [3:41:34<6:35:04, 5.75s/it] {'loss': 0.4584, 'learning_rate': 9.55040080342833e-06, 'epoch': 0.53} + 53%|█████▎ | 4628/8750 [3:41:37<6:35:04, 5.75s/it] {'loss': 0.4584, 'learning_rate': 9.55040080342833e-06, 'epoch': 0.53} + 53%|█████▎ | 4628/8750 [3:41:34<6:35:04, 5.75s/it] 53%|█████▎ | 4629/8750 [3:41:40<6:33:07, 5.72s/it] 53%|█████▎ | 4629/8750 [3:41:43<6:33:07, 5.72s/it] {'loss': 0.4683, 
'learning_rate': 9.546702924749513e-06, 'epoch': 0.53} + 53%|█████▎ | 4629/8750 [3:41:43<6:33:07, 5.72s/it] {'loss': 0.4683, 'learning_rate': 9.546702924749513e-06, 'epoch': 0.53} + 53%|█████▎ | 4629/8750 [3:41:40<6:33:07, 5.72s/it] 53%|█████▎ | 4630/8750 [3:41:46<6:35:32, 5.76s/it] 53%|█████▎ | 4630/8750 [3:41:49<6:35:32, 5.76s/it] {'loss': 0.4565, 'learning_rate': 9.543005108182508e-06, 'epoch': 0.53} + 53%|█████▎ | 4630/8750 [3:41:49<6:35:32, 5.76s/it] {'loss': 0.4565, 'learning_rate': 9.543005108182508e-06, 'epoch': 0.53} + 53%|█████▎ | 4630/8750 [3:41:46<6:35:32, 5.76s/it] 53%|█████▎ | 4631/8750 [3:41:54<6:38:03, 5.80s/it] 53%|█████▎ | 4631/8750 [3:41:51<6:38:03, 5.80s/it] {'loss': 0.4596, 'learning_rate': 9.539307354234013e-06, 'epoch': 0.53} + 53%|█████▎ | 4631/8750 [3:41:54<6:38:03, 5.80s/it] {'loss': 0.4596, 'learning_rate': 9.539307354234013e-06, 'epoch': 0.53} + 53%|█████▎ | 4631/8750 [3:41:51<6:38:03, 5.80s/it] 53%|█████▎ | 4632/8750 [3:41:58<6:43:13, 5.88s/it] 53%|█████▎ | 4632/8750 [3:42:00<6:43:13, 5.88s/it] {'loss': 0.4669, 'learning_rate': 9.535609663410692e-06, 'epoch': 0.53} + 53%|█████▎ | 4632/8750 [3:42:00<6:43:13, 5.88s/it] {'loss': 0.4669, 'learning_rate': 9.535609663410692e-06, 'epoch': 0.53} + 53%|█████▎ | 4632/8750 [3:41:58<6:43:13, 5.88s/it]WARNING: tokenization mismatch: 1 vs. 64. 
[[{'from': 'human', 'value': '\nWhat vitamin is this vegetable associated with?\nAnswer the question using a single word or phrase.'}, {'from': 'gpt', 'value': ''}]] (ignored) + 53%|█████▎ | 4633/8750 [3:42:03<6:42:17, 5.86s/it] 53%|█████▎ | 4633/8750 [3:42:06<6:42:17, 5.86s/it] {'loss': 0.4542, 'learning_rate': 9.531912036219214e-06, 'epoch': 0.53} + {'loss': 0.4542, 'learning_rate': 9.531912036219214e-06, 'epoch': 0.53} + 53%|█████▎ | 4633/8750 [3:42:06<6:42:17, 5.86s/it] 53%|█████▎ | 4633/8750 [3:42:03<6:42:17, 5.86s/it] 53%|█████▎ | 4634/8750 [3:42:12<6:45:14, 5.91s/it] 53%|█████▎ | 4634/8750 [3:42:09<6:45:14, 5.91s/it] {'loss': 0.4628, 'learning_rate': 9.528214473166241e-06, 'epoch': 0.53} + 53%|█████▎ | 4634/8750 [3:42:12<6:45:14, 5.91s/it] {'loss': 0.4628, 'learning_rate': 9.528214473166241e-06, 'epoch': 0.53} + 53%|█████▎ | 4634/8750 [3:42:09<6:45:14, 5.91s/it] 53%|█████▎ | 4635/8750 [3:42:15<6:41:58, 5.86s/it] 53%|█████▎ | 4635/8750 [3:42:18<6:41:58, 5.86s/it] {'loss': 0.4588, 'learning_rate': 9.524516974758415e-06, 'epoch': 0.53} + 53%|█████▎ | 4635/8750 [3:42:18<6:41:58, 5.86s/it] {'loss': 0.4588, 'learning_rate': 9.524516974758415e-06, 'epoch': 0.53} + 53%|█████▎ | 4635/8750 [3:42:15<6:41:58, 5.86s/it] 53%|█████▎ | 4636/8750 [3:42:24<6:43:49, 5.89s/it] 53%|█████▎ | 4636/8750 [3:42:21<6:43:49, 5.89s/it] {'loss': 0.4724, 'learning_rate': 9.520819541502384e-06, 'epoch': 0.53} + 53%|█████▎ | 4636/8750 [3:42:24<6:43:49, 5.89s/it] {'loss': 0.4724, 'learning_rate': 9.520819541502384e-06, 'epoch': 0.53} + 53%|█████▎ | 4636/8750 [3:42:21<6:43:49, 5.89s/it] 53%|█████▎ | 4637/8750 [3:42:30<6:40:30, 5.84s/it] 53%|█████▎ | 4637/8750 [3:42:27<6:40:30, 5.84s/it] {'loss': 0.4556, 'learning_rate': 9.51712217390477e-06, 'epoch': 0.53} + 53%|█████▎ | 4637/8750 [3:42:30<6:40:30, 5.84s/it] {'loss': 0.4556, 'learning_rate': 9.51712217390477e-06, 'epoch': 0.53} + 53%|█████▎ | 4637/8750 [3:42:27<6:40:30, 5.84s/it] 53%|█████▎ | 4638/8750 [3:42:36<6:42:32, 5.87s/it] 53%|█████▎ | 
4638/8750 [3:42:33<6:42:32, 5.87s/it] {'loss': 0.4625, 'learning_rate': 9.5134248724722e-06, 'epoch': 0.53} + 53%|█████▎ | 4638/8750 [3:42:36<6:42:32, 5.87s/it] {'loss': 0.4625, 'learning_rate': 9.5134248724722e-06, 'epoch': 0.53} + 53%|█████▎ | 4638/8750 [3:42:33<6:42:32, 5.87s/it] 53%|█████▎ | 4639/8750 [3:42:41<6:40:19, 5.84s/it] 53%|█████▎ | 4639/8750 [3:42:39<6:40:19, 5.84s/it] {'loss': 0.4548, 'learning_rate': 9.509727637711287e-06, 'epoch': 0.53} + 53%|█████▎ | 4639/8750 [3:42:41<6:40:19, 5.84s/it] {'loss': 0.4548, 'learning_rate': 9.509727637711287e-06, 'epoch': 0.53} + 53%|█████▎ | 4639/8750 [3:42:39<6:40:19, 5.84s/it] 53%|█████▎ | 4640/8750 [3:42:47<6:38:25, 5.82s/it] 53%|█████▎ | 4640/8750 [3:42:44<6:38:25, 5.82s/it] {'loss': 0.4666, 'learning_rate': 9.506030470128635e-06, 'epoch': 0.53} + 53%|█████▎ | 4640/8750 [3:42:47<6:38:25, 5.82s/it] {'loss': 0.4666, 'learning_rate': 9.506030470128635e-06, 'epoch': 0.53} + 53%|█████▎ | 4640/8750 [3:42:44<6:38:25, 5.82s/it] 53%|█████▎ | 4641/8750 [3:42:53<6:41:30, 5.86s/it] 53%|█████▎ | 4641/8750 [3:42:50<6:41:30, 5.86s/it] {'loss': 0.4542, 'learning_rate': 9.502333370230831e-06, 'epoch': 0.53} + 53%|█████▎ | 4641/8750 [3:42:53<6:41:30, 5.86s/it] {'loss': 0.4542, 'learning_rate': 9.502333370230831e-06, 'epoch': 0.53} + 53%|█████▎ | 4641/8750 [3:42:50<6:41:30, 5.86s/it] 53%|█████▎ | 4642/8750 [3:42:59<6:35:48, 5.78s/it] 53%|█████▎ | 4642/8750 [3:42:56<6:35:48, 5.78s/it] {'loss': 0.4654, 'learning_rate': 9.49863633852447e-06, 'epoch': 0.53} + 53%|█████▎ | 4642/8750 [3:42:59<6:35:48, 5.78s/it] {'loss': 0.4654, 'learning_rate': 9.49863633852447e-06, 'epoch': 0.53} + 53%|█████▎ | 4642/8750 [3:42:56<6:35:48, 5.78s/it] 53%|█████▎ | 4643/8750 [3:43:04<6:33:16, 5.75s/it] 53%|█████▎ | 4643/8750 [3:43:02<6:33:17, 5.75s/it] {'loss': 0.4402, 'learning_rate': 9.494939375516122e-06, 'epoch': 0.53} + 53%|█████▎ | 4643/8750 [3:43:04<6:33:16, 5.75s/it] {'loss': 0.4402, 'learning_rate': 9.494939375516122e-06, 'epoch': 0.53} + 
53%|█████▎ | 4643/8750 [3:43:02<6:33:17, 5.75s/it] 53%|█████▎ | 4644/8750 [3:43:08<6:40:57, 5.86s/it] 53%|█████▎ | 4644/8750 [3:43:11<6:40:58, 5.86s/it] {'loss': 0.459, 'learning_rate': 9.49124248171236e-06, 'epoch': 0.53} + 53%|█████▎ | 4644/8750 [3:43:11<6:40:58, 5.86s/it] {'loss': 0.459, 'learning_rate': 9.49124248171236e-06, 'epoch': 0.53} + 53%|█████▎ | 4644/8750 [3:43:08<6:40:57, 5.86s/it] 53%|█████▎ | 4645/8750 [3:43:13<6:39:01, 5.83s/it] 53%|█████▎ | 4645/8750 [3:43:16<6:39:01, 5.83s/it] {'loss': 0.4613, 'learning_rate': 9.487545657619736e-06, 'epoch': 0.53} + 53%|█████▎ | 4645/8750 [3:43:16<6:39:01, 5.83s/it] {'loss': 0.4613, 'learning_rate': 9.487545657619736e-06, 'epoch': 0.53} + 53%|█████▎ | 4645/8750 [3:43:13<6:39:01, 5.83s/it] 53%|█████▎ | 4646/8750 [3:43:22<6:39:33, 5.84s/it] 53%|█████▎ | 4646/8750 [3:43:19<6:39:33, 5.84s/it] {'loss': 0.4691, 'learning_rate': 9.483848903744795e-06, 'epoch': 0.53} + 53%|█████▎ | 4646/8750 [3:43:22<6:39:33, 5.84s/it] {'loss': 0.4691, 'learning_rate': 9.483848903744795e-06, 'epoch': 0.53} + 53%|█████▎ | 4646/8750 [3:43:19<6:39:33, 5.84s/it] 53%|█████▎ | 4647/8750 [3:43:25<6:36:30, 5.80s/it] 53%|█████▎ | 4647/8750 [3:43:28<6:36:30, 5.80s/it] {'loss': 0.451, 'learning_rate': 9.48015222059408e-06, 'epoch': 0.53} + 53%|█████▎ | 4647/8750 [3:43:28<6:36:30, 5.80s/it] {'loss': 0.451, 'learning_rate': 9.48015222059408e-06, 'epoch': 0.53} + 53%|█████▎ | 4647/8750 [3:43:25<6:36:30, 5.80s/it] 53%|█████▎ | 4648/8750 [3:43:31<6:32:40, 5.74s/it] 53%|█████▎ | 4648/8750 [3:43:34<6:32:40, 5.74s/it] {'loss': 0.4657, 'learning_rate': 9.476455608674112e-06, 'epoch': 0.53} + 53%|█████▎ | 4648/8750 [3:43:34<6:32:40, 5.74s/it] {'loss': 0.4657, 'learning_rate': 9.476455608674112e-06, 'epoch': 0.53} + 53%|█████▎ | 4648/8750 [3:43:31<6:32:40, 5.74s/it] 53%|█████▎ | 4649/8750 [3:43:39<6:33:06, 5.75s/it] 53%|█████▎ | 4649/8750 [3:43:36<6:33:06, 5.75s/it] {'loss': 0.4631, 'learning_rate': 9.472759068491421e-06, 'epoch': 0.53} + 53%|█████▎ | 
4649/8750 [3:43:39<6:33:06, 5.75s/it] {'loss': 0.4631, 'learning_rate': 9.472759068491421e-06, 'epoch': 0.53} + 53%|█████▎ | 4649/8750 [3:43:36<6:33:06, 5.75s/it]14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 4650/8750 [3:43:45<6:33:10, 5.75s/it]10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 53%|█████▎ | 4650/8750 [3:43:42<6:33:11, 5.75s/it]2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4553, 'learning_rate': 9.469062600552509e-06, 'epoch': 0.53} + 53%|█████▎ | 4650/8750 [3:43:45<6:33:10, 5.75s/it] {'loss': 0.4553, 'learning_rate': 9.469062600552509e-06, 'epoch': 0.53} + 53%|█████▎ | 4650/8750 [3:43:42<6:33:11, 5.75s/it] 53%|█████▎ | 4651/8750 [3:43:51<6:36:28, 5.80s/it] 53%|█████▎ | 4651/8750 [3:43:48<6:36:28, 5.80s/it] {'loss': 0.4589, 'learning_rate': 9.46536620536387e-06, 'epoch': 0.53} + 53%|█████▎ | 4651/8750 [3:43:51<6:36:28, 5.80s/it] {'loss': 0.4589, 'learning_rate': 9.46536620536387e-06, 'epoch': 0.53} + 53%|█████▎ | 4651/8750 [3:43:48<6:36:28, 5.80s/it] 53%|█████▎ | 4652/8750 [3:43:54<6:40:54, 5.87s/it] 53%|█████▎ | 4652/8750 [3:43:57<6:40:54, 5.87s/it] {'loss': 0.4644, 'learning_rate': 9.461669883431997e-06, 'epoch': 0.53} + {'loss': 0.4644, 'learning_rate': 9.461669883431997e-06, 'epoch': 0.53} 53%|█████▎ | 4652/8750 [3:43:57<6:40:54, 5.87s/it] + 53%|█████▎ | 4652/8750 [3:43:54<6:40:54, 5.87s/it] 53%|█████▎ | 4653/8750 [3:44:03<6:37:44, 5.82s/it] 53%|█████▎ | 4653/8750 [3:44:00<6:37:44, 5.82s/it] {'loss': 0.4886, 'learning_rate': 9.457973635263375e-06, 'epoch': 0.53} + 53%|█████▎ | 4653/8750 [3:44:03<6:37:44, 5.82s/it] {'loss': 0.4886, 'learning_rate': 9.457973635263375e-06, 'epoch': 0.53} + 53%|█████▎ | 4653/8750 [3:44:00<6:37:44, 5.82s/it] 53%|█████▎ | 4654/8750 [3:44:05<6:34:05, 5.77s/it] 53%|█████▎ | 4654/8750 [3:44:08<6:34:05, 5.77s/it] {'loss': 0.4691, 'learning_rate': 9.45427746136446e-06, 'epoch': 0.53} + 53%|█████▎ | 4654/8750 [3:44:08<6:34:05, 5.77s/it] {'loss': 0.4691, 'learning_rate': 9.45427746136446e-06, 'epoch': 0.53} + 53%|█████▎ | 4654/8750 [3:44:05<6:34:05, 5.77s/it] 53%|█████▎ | 4655/8750 [3:44:11<6:36:47, 5.81s/it] 53%|█████▎ | 4655/8750 [3:44:14<6:36:47, 5.81s/it] {'loss': 0.4444, 'learning_rate': 9.45058136224172e-06, 'epoch': 0.53} + 53%|█████▎ | 4655/8750 [3:44:11<6:36:47, 5.81s/it]{'loss': 0.4444, 'learning_rate': 9.45058136224172e-06, 'epoch': 0.53} + 53%|█████▎ | 4655/8750 [3:44:14<6:36:47, 
5.81s/it] 53%|█████▎ | 4656/8750 [3:44:17<6:35:48, 5.80s/it] 53%|█████▎ | 4656/8750 [3:44:20<6:35:48, 5.80s/it] {'loss': 0.4715, 'learning_rate': 9.446885338401597e-06, 'epoch': 0.53} + 53%|█████▎ | 4656/8750 [3:44:20<6:35:48, 5.80s/it] {'loss': 0.4715, 'learning_rate': 9.446885338401597e-06, 'epoch': 0.53} + 53%|█████▎ | 4656/8750 [3:44:17<6:35:48, 5.80s/it] 53%|█████▎ | 4657/8750 [3:44:26<6:38:35, 5.84s/it] 53%|█████▎ | 4657/8750 [3:44:23<6:38:36, 5.84s/it] {'loss': 0.4667, 'learning_rate': 9.443189390350534e-06, 'epoch': 0.53} + 53%|█████▎ | 4657/8750 [3:44:26<6:38:35, 5.84s/it] {'loss': 0.4667, 'learning_rate': 9.443189390350534e-06, 'epoch': 0.53} + 53%|█████▎ | 4657/8750 [3:44:23<6:38:36, 5.84s/it] 53%|█████▎ | 4658/8750 [3:44:29<6:36:05, 5.81s/it] 53%|█████▎ | 4658/8750 [3:44:32<6:36:05, 5.81s/it] {'loss': 0.4679, 'learning_rate': 9.439493518594957e-06, 'epoch': 0.53} + 53%|█████▎ | 4658/8750 [3:44:32<6:36:05, 5.81s/it] {'loss': 0.4679, 'learning_rate': 9.439493518594957e-06, 'epoch': 0.53} + 53%|█████▎ | 4658/8750 [3:44:29<6:36:05, 5.81s/it] 53%|█████▎ | 4659/8750 [3:44:35<6:36:27, 5.81s/it] 53%|█████▎ | 4659/8750 [3:44:38<6:36:27, 5.81s/it] {'loss': 0.4479, 'learning_rate': 9.435797723641277e-06, 'epoch': 0.53} + 53%|█████▎ | 4659/8750 [3:44:38<6:36:27, 5.81s/it] {'loss': 0.4479, 'learning_rate': 9.435797723641277e-06, 'epoch': 0.53} + 53%|█████▎ | 4659/8750 [3:44:35<6:36:27, 5.81s/it] 53%|█████▎ | 4660/8750 [3:44:43<6:34:14, 5.78s/it] 53%|█████▎ | 4660/8750 [3:44:40<6:34:15, 5.78s/it] {'loss': 0.4777, 'learning_rate': 9.432102005995912e-06, 'epoch': 0.53} + 53%|█████▎ | 4660/8750 [3:44:43<6:34:14, 5.78s/it] {'loss': 0.4777, 'learning_rate': 9.432102005995912e-06, 'epoch': 0.53} + 53%|█████▎ | 4660/8750 [3:44:40<6:34:15, 5.78s/it] 53%|█████▎ | 4661/8750 [3:44:46<6:33:27, 5.77s/it] 53%|█████▎ | 4661/8750 [3:44:49<6:33:27, 5.77s/it] {'loss': 0.4656, 'learning_rate': 9.428406366165244e-06, 'epoch': 0.53} + 53%|█████▎ | 4661/8750 [3:44:49<6:33:27, 5.77s/it] 
{'loss': 0.4656, 'learning_rate': 9.428406366165244e-06, 'epoch': 0.53} + 53%|█████▎ | 4661/8750 [3:44:46<6:33:27, 5.77s/it] 53%|█████▎ | 4662/8750 [3:44:52<6:44:22, 5.93s/it] 53%|█████▎ | 4662/8750 [3:44:55<6:44:22, 5.93s/it]{'loss': 0.4465, 'learning_rate': 9.424710804655669e-06, 'epoch': 0.53} + {'loss': 0.4465, 'learning_rate': 9.424710804655669e-06, 'epoch': 0.53} + 53%|█████▎ | 4662/8750 [3:44:55<6:44:22, 5.93s/it] 53%|█████▎ | 4662/8750 [3:44:52<6:44:22, 5.93s/it] 53%|█████▎ | 4663/8750 [3:45:01<6:36:46, 5.82s/it] 53%|█████▎ | 4663/8750 [3:44:58<6:36:46, 5.82s/it] {'loss': 0.4584, 'learning_rate': 9.42101532197356e-06, 'epoch': 0.53} + 53%|█████▎ | 4663/8750 [3:45:01<6:36:46, 5.82s/it] {'loss': 0.4584, 'learning_rate': 9.42101532197356e-06, 'epoch': 0.53} + 53%|█████▎ | 4663/8750 [3:44:58<6:36:46, 5.82s/it] 53%|█████▎ | 4664/8750 [3:45:04<6:36:28, 5.82s/it] 53%|█████▎ | 4664/8750 [3:45:07<6:36:28, 5.82s/it] {'loss': 0.4644, 'learning_rate': 9.417319918625274e-06, 'epoch': 0.53} + 53%|█████▎ | 4664/8750 [3:45:07<6:36:28, 5.82s/it] {'loss': 0.4644, 'learning_rate': 9.417319918625274e-06, 'epoch': 0.53} + 53%|█████▎ | 4664/8750 [3:45:04<6:36:28, 5.82s/it] 53%|█████▎ | 4665/8750 [3:45:10<6:34:51, 5.80s/it] 53%|█████▎ | 4665/8750 [3:45:12<6:34:51, 5.80s/it] {'loss': 0.4449, 'learning_rate': 9.413624595117173e-06, 'epoch': 0.53} + {'loss': 0.4449, 'learning_rate': 9.413624595117173e-06, 'epoch': 0.53} 53%|█████▎ | 4665/8750 [3:45:12<6:34:51, 5.80s/it] + 53%|█████▎ | 4665/8750 [3:45:10<6:34:51, 5.80s/it] 53%|█████▎ | 4666/8750 [3:45:15<6:35:21, 5.81s/it] 53%|█████▎ | 4666/8750 [3:45:18<6:35:21, 5.81s/it] {'loss': 0.4884, 'learning_rate': 9.409929351955592e-06, 'epoch': 0.53} + {'loss': 0.4884, 'learning_rate': 9.409929351955592e-06, 'epoch': 0.53} + 53%|█████▎ | 4666/8750 [3:45:18<6:35:21, 5.81s/it] 53%|█████▎ | 4666/8750 [3:45:15<6:35:21, 5.81s/it] 53%|█████▎ | 4667/8750 [3:45:24<6:40:12, 5.88s/it] 53%|█████▎ | 4667/8750 [3:45:21<6:40:13, 5.88s/it] {'loss': 
0.4483, 'learning_rate': 9.40623418964686e-06, 'epoch': 0.53} + 53%|█████▎ | 4667/8750 [3:45:24<6:40:12, 5.88s/it] {'loss': 0.4483, 'learning_rate': 9.40623418964686e-06, 'epoch': 0.53} + 53%|█████▎ | 4667/8750 [3:45:21<6:40:13, 5.88s/it] 53%|█████▎ | 4668/8750 [3:45:30<6:37:09, 5.84s/it] 53%|█████▎ | 4668/8750 [3:45:27<6:37:09, 5.84s/it] {'loss': 0.4915, 'learning_rate': 9.402539108697306e-06, 'epoch': 0.53} + 53%|█████▎ | 4668/8750 [3:45:30<6:37:09, 5.84s/it] {'loss': 0.4915, 'learning_rate': 9.402539108697306e-06, 'epoch': 0.53} + 53%|█████▎ | 4668/8750 [3:45:27<6:37:09, 5.84s/it] 53%|█████▎ | 4669/8750 [3:45:36<6:34:26, 5.80s/it] 53%|█████▎ | 4669/8750 [3:45:33<6:34:26, 5.80s/it] {'loss': 0.4573, 'learning_rate': 9.398844109613228e-06, 'epoch': 0.53} + 53%|█████▎ | 4669/8750 [3:45:36<6:34:26, 5.80s/it] {'loss': 0.4573, 'learning_rate': 9.398844109613228e-06, 'epoch': 0.53} + 53%|█████▎ | 4669/8750 [3:45:33<6:34:26, 5.80s/it] 53%|█████▎ | 4670/8750 [3:45:39<6:31:17, 5.75s/it] 53%|█████▎ | 4670/8750 [3:45:41<6:31:17, 5.75s/it] {'loss': 0.4712, 'learning_rate': 9.395149192900934e-06, 'epoch': 0.53} + 53%|█████▎ | 4670/8750 [3:45:41<6:31:17, 5.75s/it] {'loss': 0.4712, 'learning_rate': 9.395149192900934e-06, 'epoch': 0.53} + 53%|█████▎ | 4670/8750 [3:45:39<6:31:17, 5.75s/it] 53%|█████▎ | 4671/8750 [3:45:44<6:30:46, 5.75s/it] 53%|█████▎ | 4671/8750 [3:45:47<6:30:46, 5.75s/it] {'loss': 0.4664, 'learning_rate': 9.391454359066701e-06, 'epoch': 0.53} + 53%|█████▎ | 4671/8750 [3:45:47<6:30:46, 5.75s/it] {'loss': 0.4664, 'learning_rate': 9.391454359066701e-06, 'epoch': 0.53} + 53%|█████▎ | 4671/8750 [3:45:44<6:30:46, 5.75s/it] 53%|█████▎ | 4672/8750 [3:45:53<6:31:27, 5.76s/it] 53%|█████▎ | 4672/8750 [3:45:50<6:31:27, 5.76s/it] {'loss': 0.4573, 'learning_rate': 9.387759608616806e-06, 'epoch': 0.53} + 53%|█████▎ | 4672/8750 [3:45:53<6:31:27, 5.76s/it] {'loss': 0.4573, 'learning_rate': 9.387759608616806e-06, 'epoch': 0.53} + 53%|█████▎ | 4672/8750 [3:45:50<6:31:27, 5.76s/it] 
53%|█████▎ | 4673/8750 [3:45:59<6:33:40, 5.79s/it] 53%|█████▎ | 4673/8750 [3:45:56<6:33:40, 5.79s/it] {'loss': 0.4352, 'learning_rate': 9.384064942057518e-06, 'epoch': 0.53} + 53%|█████▎ | 4673/8750 [3:45:59<6:33:40, 5.79s/it] {'loss': 0.4352, 'learning_rate': 9.384064942057518e-06, 'epoch': 0.53} + 53%|█████▎ | 4673/8750 [3:45:56<6:33:40, 5.79s/it] 53%|█████▎ | 4674/8750 [3:46:05<6:33:23, 5.79s/it] 53%|█████▎ | 4674/8750 [3:46:02<6:33:23, 5.79s/it] {'loss': 0.4715, 'learning_rate': 9.380370359895079e-06, 'epoch': 0.53} + 53%|█████▎ | 4674/8750 [3:46:05<6:33:23, 5.79s/it] {'loss': 0.4715, 'learning_rate': 9.380370359895079e-06, 'epoch': 0.53} + 53%|█████▎ | 4674/8750 [3:46:02<6:33:23, 5.79s/it] 53%|█████▎ | 4675/8750 [3:46:11<6:36:27, 5.84s/it] 53%|█████▎ | 4675/8750 [3:46:08<6:36:27, 5.84s/it] {'loss': 0.4777, 'learning_rate': 9.37667586263574e-06, 'epoch': 0.53} + 53%|█████▎ | 4675/8750 [3:46:11<6:36:27, 5.84s/it] {'loss': 0.4777, 'learning_rate': 9.37667586263574e-06, 'epoch': 0.53} + 53%|█████▎ | 4675/8750 [3:46:08<6:36:27, 5.84s/it] 53%|█████▎ | 4676/8750 [3:46:16<6:35:47, 5.83s/it] 53%|█████▎ | 4676/8750 [3:46:13<6:35:47, 5.83s/it] {'loss': 0.4662, 'learning_rate': 9.372981450785723e-06, 'epoch': 0.53} + 53%|█████▎ | 4676/8750 [3:46:16<6:35:47, 5.83s/it] {'loss': 0.4662, 'learning_rate': 9.372981450785723e-06, 'epoch': 0.53} + 53%|█████▎ | 4676/8750 [3:46:13<6:35:47, 5.83s/it] 53%|█████▎ | 4677/8750 [3:46:19<6:33:02, 5.79s/it] 53%|█████▎ | 4677/8750 [3:46:22<6:33:02, 5.79s/it] {'loss': 0.4557, 'learning_rate': 9.369287124851243e-06, 'epoch': 0.53} + 53%|█████▎ | 4677/8750 [3:46:22<6:33:02, 5.79s/it] {'loss': 0.4557, 'learning_rate': 9.369287124851243e-06, 'epoch': 0.53} + 53%|█████▎ | 4677/8750 [3:46:19<6:33:02, 5.79s/it] 53%|█████▎ | 4678/8750 [3:46:25<6:32:12, 5.78s/it] 53%|█████▎ | 4678/8750 [3:46:28<6:32:12, 5.78s/it] {'loss': 0.457, 'learning_rate': 9.365592885338512e-06, 'epoch': 0.53} + 53%|█████▎ | 4678/8750 [3:46:28<6:32:12, 5.78s/it] {'loss': 0.457, 
'learning_rate': 9.365592885338512e-06, 'epoch': 0.53} + 53%|█████▎ | 4678/8750 [3:46:25<6:32:12, 5.78s/it] 53%|█████▎ | 4679/8750 [3:46:34<6:32:30, 5.79s/it] 53%|█████▎ | 4679/8750 [3:46:31<6:32:30, 5.78s/it] {'loss': 0.4632, 'learning_rate': 9.361898732753715e-06, 'epoch': 0.53} + 53%|█████▎ | 4679/8750 [3:46:34<6:32:30, 5.79s/it] {'loss': 0.4632, 'learning_rate': 9.361898732753715e-06, 'epoch': 0.53} + 53%|█████▎ | 4679/8750 [3:46:31<6:32:30, 5.78s/it] 53%|█████▎ | 4680/8750 [3:46:36<6:29:10, 5.74s/it] 53%|█████▎ | 4680/8750 [3:46:39<6:29:11, 5.74s/it]{'loss': 0.4692, 'learning_rate': 9.358204667603043e-06, 'epoch': 0.53} + {'loss': 0.4692, 'learning_rate': 9.358204667603043e-06, 'epoch': 0.53} + 53%|█████▎ | 4680/8750 [3:46:39<6:29:11, 5.74s/it] 53%|█████▎ | 4680/8750 [3:46:36<6:29:10, 5.74s/it] 53%|█████▎ | 4681/8750 [3:46:45<6:31:03, 5.77s/it] 53%|█████▎ | 4681/8750 [3:46:42<6:31:03, 5.77s/it] {'loss': 0.4565, 'learning_rate': 9.35451069039266e-06, 'epoch': 0.53} + 53%|█████▎ | 4681/8750 [3:46:45<6:31:03, 5.77s/it] {'loss': 0.4565, 'learning_rate': 9.35451069039266e-06, 'epoch': 0.53} + 53%|█████▎ | 4681/8750 [3:46:42<6:31:03, 5.77s/it] 54%|█████▎ | 4682/8750 [3:46:48<6:33:56, 5.81s/it] 54%|█████▎ | 4682/8750 [3:46:51<6:33:56, 5.81s/it] {'loss': 0.4649, 'learning_rate': 9.35081680162872e-06, 'epoch': 0.54} + {'loss': 0.4649, 'learning_rate': 9.35081680162872e-06, 'epoch': 0.54} 54%|█████▎ | 4682/8750 [3:46:51<6:33:56, 5.81s/it] + 54%|█████▎ | 4682/8750 [3:46:48<6:33:56, 5.81s/it] 54%|█████▎ | 4683/8750 [3:46:57<6:33:46, 5.81s/it] 54%|█████▎ | 4683/8750 [3:46:54<6:33:46, 5.81s/it] {'loss': 0.4416, 'learning_rate': 9.347123001817376e-06, 'epoch': 0.54} + 54%|█████▎ | 4683/8750 [3:46:57<6:33:46, 5.81s/it] {'loss': 0.4416, 'learning_rate': 9.347123001817376e-06, 'epoch': 0.54} + 54%|█████▎ | 4683/8750 [3:46:54<6:33:46, 5.81s/it] 54%|█████▎ | 4684/8750 [3:47:02<6:30:32, 5.76s/it] 54%|█████▎ | 4684/8750 [3:47:00<6:30:32, 5.76s/it] {'loss': 0.485, 'learning_rate': 
9.343429291464756e-06, 'epoch': 0.54} + 54%|█████▎ | 4684/8750 [3:47:02<6:30:32, 5.76s/it] {'loss': 0.485, 'learning_rate': 9.343429291464756e-06, 'epoch': 0.54} + 54%|█████▎ | 4684/8750 [3:47:00<6:30:32, 5.76s/it] 54%|█████▎ | 4685/8750 [3:47:05<6:27:03, 5.71s/it] 54%|█████▎ | 4685/8750 [3:47:08<6:27:03, 5.71s/it] {'loss': 0.4466, 'learning_rate': 9.339735671076978e-06, 'epoch': 0.54} + {'loss': 0.4466, 'learning_rate': 9.339735671076978e-06, 'epoch': 0.54} + 54%|█████▎ | 4685/8750 [3:47:08<6:27:03, 5.71s/it] 54%|█████▎ | 4685/8750 [3:47:05<6:27:03, 5.71s/it] 54%|█████▎ | 4686/8750 [3:47:14<6:24:43, 5.68s/it] 54%|█████▎ | 4686/8750 [3:47:11<6:24:42, 5.68s/it] {'loss': 0.4855, 'learning_rate': 9.336042141160158e-06, 'epoch': 0.54} + 54%|█████▎ | 4686/8750 [3:47:14<6:24:43, 5.68s/it] {'loss': 0.4855, 'learning_rate': 9.336042141160158e-06, 'epoch': 0.54} + 54%|█████▎ | 4686/8750 [3:47:11<6:24:42, 5.68s/it] 54%|█████▎ | 4687/8750 [3:47:20<6:28:21, 5.73s/it] 54%|█████▎ | 4687/8750 [3:47:17<6:28:21, 5.73s/it] {'loss': 0.4635, 'learning_rate': 9.332348702220386e-06, 'epoch': 0.54} + 54%|█████▎ | 4687/8750 [3:47:20<6:28:21, 5.73s/it] {'loss': 0.4635, 'learning_rate': 9.332348702220386e-06, 'epoch': 0.54} + 54%|█████▎ | 4687/8750 [3:47:17<6:28:21, 5.73s/it] 54%|█████▎ | 4688/8750 [3:47:25<6:28:31, 5.74s/it] 54%|█████▎ | 4688/8750 [3:47:22<6:28:31, 5.74s/it] {'loss': 0.4589, 'learning_rate': 9.32865535476375e-06, 'epoch': 0.54} + 54%|█████▎ | 4688/8750 [3:47:25<6:28:31, 5.74s/it] {'loss': 0.4589, 'learning_rate': 9.32865535476375e-06, 'epoch': 0.54} + 54%|█████▎ | 4688/8750 [3:47:22<6:28:31, 5.74s/it] 54%|█████▎ | 4689/8750 [3:47:28<6:25:51, 5.70s/it] 54%|█████▎ | 4689/8750 [3:47:31<6:25:52, 5.70s/it] {'loss': 0.4634, 'learning_rate': 9.32496209929632e-06, 'epoch': 0.54} + {'loss': 0.4634, 'learning_rate': 9.32496209929632e-06, 'epoch': 0.54} 54%|█████▎ | 4689/8750 [3:47:31<6:25:52, 5.70s/it] + 54%|█████▎ | 4689/8750 [3:47:28<6:25:51, 5.70s/it] 54%|█████▎ | 4690/8750 
[3:47:34<6:25:53, 5.70s/it] 54%|█████▎ | 4690/8750 [3:47:37<6:25:53, 5.70s/it] {'loss': 0.4645, 'learning_rate': 9.32126893632415e-06, 'epoch': 0.54} + {'loss': 0.4645, 'learning_rate': 9.32126893632415e-06, 'epoch': 0.54} + 54%|█████▎ | 4690/8750 [3:47:37<6:25:53, 5.70s/it] 54%|█████▎ | 4690/8750 [3:47:34<6:25:53, 5.70s/it] 54%|█████▎ | 4691/8750 [3:47:42<6:26:36, 5.71s/it] 54%|█████▎ | 4691/8750 [3:47:39<6:26:36, 5.71s/it] {'loss': 0.442, 'learning_rate': 9.317575866353293e-06, 'epoch': 0.54} + 54%|█████▎ | 4691/8750 [3:47:42<6:26:36, 5.71s/it] {'loss': 0.442, 'learning_rate': 9.317575866353293e-06, 'epoch': 0.54} + 54%|█████▎ | 4691/8750 [3:47:39<6:26:36, 5.71s/it] 54%|█████▎ | 4692/8750 [3:47:48<6:24:56, 5.69s/it] 54%|█████▎ | 4692/8750 [3:47:45<6:24:56, 5.69s/it] {'loss': 0.4607, 'learning_rate': 9.313882889889773e-06, 'epoch': 0.54} + 54%|█████▎ | 4692/8750 [3:47:48<6:24:56, 5.69s/it] {'loss': 0.4607, 'learning_rate': 9.313882889889773e-06, 'epoch': 0.54} + 54%|█████▎ | 4692/8750 [3:47:45<6:24:56, 5.69s/it] 54%|█████▎ | 4693/8750 [3:47:54<6:27:36, 5.73s/it] 54%|█████▎ | 4693/8750 [3:47:51<6:27:36, 5.73s/it]{'loss': 0.4629, 'learning_rate': 9.31019000743962e-06, 'epoch': 0.54} + {'loss': 0.4629, 'learning_rate': 9.31019000743962e-06, 'epoch': 0.54} + 54%|█████▎ | 4693/8750 [3:47:54<6:27:36, 5.73s/it] 54%|█████▎ | 4693/8750 [3:47:51<6:27:36, 5.73s/it] 54%|█████▎ | 4694/8750 [3:47:59<6:25:04, 5.70s/it] 54%|█████▎ | 4694/8750 [3:47:56<6:25:04, 5.70s/it] {'loss': 0.4754, 'learning_rate': 9.306497219508835e-06, 'epoch': 0.54} + 54%|█████▎ | 4694/8750 [3:47:59<6:25:04, 5.70s/it] {'loss': 0.4754, 'learning_rate': 9.306497219508835e-06, 'epoch': 0.54} + 54%|█████▎ | 4694/8750 [3:47:56<6:25:04, 5.70s/it] 54%|█████▎ | 4695/8750 [3:48:05<6:26:32, 5.72s/it] 54%|█████▎ | 4695/8750 [3:48:02<6:26:32, 5.72s/it] {'loss': 0.4514, 'learning_rate': 9.302804526603413e-06, 'epoch': 0.54} + 54%|█████▎ | 4695/8750 [3:48:05<6:26:32, 5.72s/it] {'loss': 0.4514, 'learning_rate': 
9.302804526603413e-06, 'epoch': 0.54} + 54%|█████▎ | 4695/8750 [3:48:02<6:26:32, 5.72s/it] 54%|█████▎ | 4696/8750 [3:48:08<6:31:28, 5.79s/it] 54%|█████▎ | 4696/8750 [3:48:11<6:31:28, 5.79s/it] {'loss': 0.4604, 'learning_rate': 9.29911192922934e-06, 'epoch': 0.54} + 54%|█████▎ | 4696/8750 [3:48:11<6:31:28, 5.79s/it] {'loss': 0.4604, 'learning_rate': 9.29911192922934e-06, 'epoch': 0.54} + 54%|█████▎ | 4696/8750 [3:48:08<6:31:28, 5.79s/it] 54%|█████▎ | 4697/8750 [3:48:17<6:27:40, 5.74s/it] 54%|█████▎ | 4697/8750 [3:48:14<6:27:40, 5.74s/it] {'loss': 0.4665, 'learning_rate': 9.29541942789258e-06, 'epoch': 0.54} + {'loss': 0.4665, 'learning_rate': 9.29541942789258e-06, 'epoch': 0.54} 54%|█████▎ | 4697/8750 [3:48:17<6:27:40, 5.74s/it] + 54%|█████▎ | 4697/8750 [3:48:14<6:27:40, 5.74s/it] 54%|█████▎ | 4698/8750 [3:48:23<6:34:17, 5.84s/it] 54%|█████▎ | 4698/8750 [3:48:20<6:34:17, 5.84s/it] {'loss': 0.4833, 'learning_rate': 9.291727023099087e-06, 'epoch': 0.54} + {'loss': 0.4833, 'learning_rate': 9.291727023099087e-06, 'epoch': 0.54} 54%|█████▎ | 4698/8750 [3:48:23<6:34:17, 5.84s/it] + 54%|█████▎ | 4698/8750 [3:48:20<6:34:17, 5.84s/it] 54%|█████▎ | 4699/8750 [3:48:29<6:44:19, 5.99s/it] 54%|█████▎ | 4699/8750 [3:48:26<6:44:19, 5.99s/it] {'loss': 0.4474, 'learning_rate': 9.288034715354806e-06, 'epoch': 0.54} + 54%|█████▎ | 4699/8750 [3:48:29<6:44:19, 5.99s/it] {'loss': 0.4474, 'learning_rate': 9.288034715354806e-06, 'epoch': 0.54} + 54%|█████▎ | 4699/8750 [3:48:26<6:44:19, 5.99s/it]14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend...9 + AutoResumeHook: Checking whether to suspend... + 54%|█████▎ | 4700/8750 [3:48:35<6:39:53, 5.92s/it]12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +045 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+10 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 54%|█████▎ | 4700/8750 [3:48:32<6:39:53, 5.92s/it]11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4553, 'learning_rate': 9.28434250516566e-06, 'epoch': 0.54} + 54%|█████▎ | 4700/8750 [3:48:35<6:39:53, 5.92s/it] {'loss': 0.4553, 'learning_rate': 9.28434250516566e-06, 'epoch': 0.54} + 54%|█████▎ | 4700/8750 [3:48:32<6:39:53, 5.92s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-4700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 54%|█████▎ | 4701/8750 [3:48:58<12:33:56, 11.17s/it] 54%|█████▎ | 4701/8750 [3:48:55<12:33:56, 11.17s/it] {'loss': 0.4573, 'learning_rate': 9.280650393037578e-06, 'epoch': 0.54} + 54%|█████▎ | 4701/8750 [3:48:58<12:33:56, 11.17s/it] {'loss': 0.4573, 'learning_rate': 9.280650393037578e-06, 'epoch': 0.54} + 54%|█████▎ | 4701/8750 [3:48:55<12:33:56, 11.17s/it] 54%|█████▎ | 4702/8750 [3:49:04<10:42:44, 9.53s/it] 54%|█████▎ | 4702/8750 [3:49:01<10:42:45, 9.53s/it] {'loss': 0.499, 'learning_rate': 9.276958379476449e-06, 'epoch': 0.54} + 54%|█████▎ | 4702/8750 [3:49:04<10:42:44, 9.53s/it] {'loss': 0.499, 'learning_rate': 9.276958379476449e-06, 'epoch': 0.54} + 54%|█████▎ | 4702/8750 [3:49:01<10:42:45, 9.53s/it] 54%|█████▎ | 4703/8750 [3:49:10<9:29:01, 8.44s/it] 54%|█████▎ | 4703/8750 [3:49:07<9:29:00, 8.44s/it] {'loss': 0.4457, 'learning_rate': 9.27326646498816e-06, 'epoch': 0.54} + 54%|█████▎ | 4703/8750 [3:49:10<9:29:01, 8.44s/it] {'loss': 0.4457, 'learning_rate': 9.27326646498816e-06, 'epoch': 0.54} + 54%|█████▎ | 4703/8750 [3:49:07<9:29:00, 8.44s/it] 54%|█████▍ | 4704/8750 [3:49:16<8:36:44, 7.66s/it] 54%|█████▍ | 4704/8750 [3:49:13<8:36:44, 7.66s/it] {'loss': 0.4709, 'learning_rate': 9.269574650078594e-06, 'epoch': 0.54} + 54%|█████▍ | 4704/8750 [3:49:16<8:36:44, 7.66s/it] {'loss': 0.4709, 'learning_rate': 9.269574650078594e-06, 'epoch': 0.54} + 54%|█████▍ | 4704/8750 [3:49:13<8:36:44, 7.66s/it] 54%|█████▍ | 4705/8750 [3:49:22<8:00:46, 7.13s/it] 54%|█████▍ | 4705/8750 [3:49:19<8:00:47, 7.13s/it] {'loss': 0.4526, 'learning_rate': 9.265882935253605e-06, 'epoch': 0.54} + 54%|█████▍ | 4705/8750 [3:49:22<8:00:46, 7.13s/it] {'loss': 0.4526, 'learning_rate': 9.265882935253605e-06, 'epoch': 0.54} + 54%|█████▍ | 4705/8750 [3:49:19<8:00:47, 7.13s/it] 54%|█████▍ | 4706/8750 [3:49:27<7:31:30, 6.70s/it] 54%|█████▍ | 4706/8750 [3:49:24<7:31:29, 6.70s/it] {'loss': 0.473, 'learning_rate': 9.262191321019049e-06, 'epoch': 0.54} + 54%|█████▍ | 4706/8750 
[3:49:27<7:31:30, 6.70s/it] {'loss': 0.473, 'learning_rate': 9.262191321019049e-06, 'epoch': 0.54} + 54%|█████▍ | 4706/8750 [3:49:24<7:31:29, 6.70s/it] 54%|█████▍ | 4707/8750 [3:49:33<7:19:24, 6.52s/it] 54%|█████▍ | 4707/8750 [3:49:31<7:19:24, 6.52s/it] {'loss': 0.4639, 'learning_rate': 9.258499807880755e-06, 'epoch': 0.54} + 54%|█████▍ | 4707/8750 [3:49:33<7:19:24, 6.52s/it] {'loss': 0.4639, 'learning_rate': 9.258499807880755e-06, 'epoch': 0.54} + 54%|█████▍ | 4707/8750 [3:49:31<7:19:24, 6.52s/it] 54%|█████▍ | 4708/8750 [3:49:39<7:04:16, 6.30s/it] 54%|█████▍ | 4708/8750 [3:49:36<7:04:16, 6.30s/it] {'loss': 0.4748, 'learning_rate': 9.254808396344536e-06, 'epoch': 0.54} + 54%|█████▍ | 4708/8750 [3:49:39<7:04:16, 6.30s/it] {'loss': 0.4748, 'learning_rate': 9.254808396344536e-06, 'epoch': 0.54} + 54%|█████▍ | 4708/8750 [3:49:36<7:04:16, 6.30s/it] 54%|█████▍ | 4709/8750 [3:49:45<6:54:16, 6.15s/it] 54%|█████▍ | 4709/8750 [3:49:42<6:54:17, 6.15s/it] {'loss': 0.4479, 'learning_rate': 9.251117086916209e-06, 'epoch': 0.54} + 54%|█████▍ | 4709/8750 [3:49:45<6:54:16, 6.15s/it] {'loss': 0.4479, 'learning_rate': 9.251117086916209e-06, 'epoch': 0.54} + 54%|█████▍ | 4709/8750 [3:49:42<6:54:17, 6.15s/it] 54%|█████▍ | 4710/8750 [3:49:51<6:44:03, 6.00s/it] 54%|█████▍ | 4710/8750 [3:49:48<6:44:03, 6.00s/it] {'loss': 0.4813, 'learning_rate': 9.247425880101561e-06, 'epoch': 0.54} + 54%|█████▍ | 4710/8750 [3:49:51<6:44:03, 6.00s/it] {'loss': 0.4813, 'learning_rate': 9.247425880101561e-06, 'epoch': 0.54} + 54%|█████▍ | 4710/8750 [3:49:48<6:44:03, 6.00s/it] 54%|█████▍ | 4711/8750 [3:49:54<6:43:27, 5.99s/it] 54%|█████▍ | 4711/8750 [3:49:57<6:43:28, 5.99s/it] {'loss': 0.4594, 'learning_rate': 9.243734776406365e-06, 'epoch': 0.54} + 54%|█████▍ | 4711/8750 [3:49:57<6:43:28, 5.99s/it] {'loss': 0.4594, 'learning_rate': 9.243734776406365e-06, 'epoch': 0.54} + 54%|█████▍ | 4711/8750 [3:49:54<6:43:27, 5.99s/it] 54%|█████▍ | 4712/8750 [3:50:02<6:37:37, 5.91s/it] 54%|█████▍ | 4712/8750 
[3:49:59<6:37:37, 5.91s/it] {'loss': 0.4825, 'learning_rate': 9.240043776336397e-06, 'epoch': 0.54} + 54%|█████▍ | 4712/8750 [3:50:02<6:37:37, 5.91s/it] {'loss': 0.4825, 'learning_rate': 9.240043776336397e-06, 'epoch': 0.54} + 54%|█████▍ | 4712/8750 [3:49:59<6:37:37, 5.91s/it] 54%|█████▍ | 4713/8750 [3:50:08<6:35:48, 5.88s/it] 54%|█████▍ | 4713/8750 [3:50:05<6:35:49, 5.88s/it] {'loss': 0.4511, 'learning_rate': 9.23635288039739e-06, 'epoch': 0.54} + 54%|█████▍ | 4713/8750 [3:50:08<6:35:48, 5.88s/it] {'loss': 0.4511, 'learning_rate': 9.23635288039739e-06, 'epoch': 0.54} + 54%|█████▍ | 4713/8750 [3:50:05<6:35:49, 5.88s/it] 54%|█████▍ | 4714/8750 [3:50:14<6:32:41, 5.84s/it] 54%|█████▍ | 4714/8750 [3:50:11<6:32:40, 5.84s/it] {'loss': 0.4457, 'learning_rate': 9.232662089095091e-06, 'epoch': 0.54} + 54%|█████▍ | 4714/8750 [3:50:14<6:32:41, 5.84s/it] {'loss': 0.4457, 'learning_rate': 9.232662089095091e-06, 'epoch': 0.54} + 54%|█████▍ | 4714/8750 [3:50:11<6:32:40, 5.84s/it] 54%|█████▍ | 4715/8750 [3:50:20<6:32:26, 5.84s/it] 54%|█████▍ | 4715/8750 [3:50:17<6:32:26, 5.84s/it] {'loss': 0.4835, 'learning_rate': 9.22897140293522e-06, 'epoch': 0.54} + 54%|█████▍ | 4715/8750 [3:50:20<6:32:26, 5.84s/it] {'loss': 0.4835, 'learning_rate': 9.22897140293522e-06, 'epoch': 0.54} + 54%|█████▍ | 4715/8750 [3:50:17<6:32:26, 5.84s/it] 54%|█████▍ | 4716/8750 [3:50:26<6:32:28, 5.84s/it] 54%|█████▍ | 4716/8750 [3:50:23<6:32:29, 5.84s/it] {'loss': 0.448, 'learning_rate': 9.225280822423477e-06, 'epoch': 0.54} + 54%|█████▍ | 4716/8750 [3:50:26<6:32:28, 5.84s/it] {'loss': 0.448, 'learning_rate': 9.225280822423477e-06, 'epoch': 0.54} + 54%|█████▍ | 4716/8750 [3:50:23<6:32:29, 5.84s/it] 54%|█████▍ | 4717/8750 [3:50:31<6:30:45, 5.81s/it] 54%|█████▍ | 4717/8750 [3:50:28<6:30:45, 5.81s/it] {'loss': 0.4666, 'learning_rate': 9.221590348065561e-06, 'epoch': 0.54} + 54%|█████▍ | 4717/8750 [3:50:31<6:30:45, 5.81s/it] {'loss': 0.4666, 'learning_rate': 9.221590348065561e-06, 'epoch': 0.54} + 54%|█████▍ | 
4717/8750 [3:50:28<6:30:45, 5.81s/it] 54%|█████▍ | 4718/8750 [3:50:34<6:28:08, 5.78s/it] 54%|█████▍ | 4718/8750 [3:50:37<6:28:08, 5.78s/it]{'loss': 0.4612, 'learning_rate': 9.217899980367142e-06, 'epoch': 0.54} + {'loss': 0.4612, 'learning_rate': 9.217899980367142e-06, 'epoch': 0.54} + 54%|█████▍ | 4718/8750 [3:50:37<6:28:08, 5.78s/it] 54%|█████▍ | 4718/8750 [3:50:34<6:28:08, 5.78s/it] 54%|█████▍ | 4719/8750 [3:50:43<6:30:27, 5.81s/it] 54%|█████▍ | 4719/8750 [3:50:40<6:30:27, 5.81s/it] {'loss': 0.4589, 'learning_rate': 9.214209719833891e-06, 'epoch': 0.54} + 54%|█████▍ | 4719/8750 [3:50:43<6:30:27, 5.81s/it] {'loss': 0.4589, 'learning_rate': 9.214209719833891e-06, 'epoch': 0.54} + 54%|█████▍ | 4719/8750 [3:50:40<6:30:27, 5.81s/it] 54%|█████▍ | 4720/8750 [3:50:49<6:30:00, 5.81s/it] 54%|█████▍ | 4720/8750 [3:50:46<6:30:00, 5.81s/it] {'loss': 0.4755, 'learning_rate': 9.210519566971452e-06, 'epoch': 0.54} + 54%|█████▍ | 4720/8750 [3:50:49<6:30:00, 5.81s/it] {'loss': 0.4755, 'learning_rate': 9.210519566971452e-06, 'epoch': 0.54} + 54%|█████▍ | 4720/8750 [3:50:46<6:30:00, 5.81s/it] 54%|█████▍ | 4721/8750 [3:50:52<6:30:01, 5.81s/it] 54%|█████▍ | 4721/8750 [3:50:55<6:30:01, 5.81s/it] {'loss': 0.4646, 'learning_rate': 9.206829522285456e-06, 'epoch': 0.54} + {'loss': 0.4646, 'learning_rate': 9.206829522285456e-06, 'epoch': 0.54} + 54%|█████▍ | 4721/8750 [3:50:55<6:30:01, 5.81s/it] 54%|█████▍ | 4721/8750 [3:50:52<6:30:01, 5.81s/it] 54%|█████▍ | 4722/8750 [3:51:00<6:29:07, 5.80s/it] 54%|█████▍ | 4722/8750 [3:50:57<6:29:07, 5.80s/it] {'loss': 0.4539, 'learning_rate': 9.203139586281527e-06, 'epoch': 0.54} + 54%|█████▍ | 4722/8750 [3:51:00<6:29:07, 5.80s/it] {'loss': 0.4539, 'learning_rate': 9.203139586281527e-06, 'epoch': 0.54} + 54%|█████▍ | 4722/8750 [3:50:57<6:29:07, 5.80s/it] 54%|█████▍ | 4723/8750 [3:51:06<6:29:14, 5.80s/it] 54%|█████▍ | 4723/8750 [3:51:03<6:29:14, 5.80s/it] {'loss': 0.4549, 'learning_rate': 9.199449759465263e-06, 'epoch': 0.54} + 54%|█████▍ | 4723/8750 
[3:51:06<6:29:14, 5.80s/it] {'loss': 0.4549, 'learning_rate': 9.199449759465263e-06, 'epoch': 0.54} + 54%|█████▍ | 4723/8750 [3:51:03<6:29:14, 5.80s/it] 54%|█████▍ | 4724/8750 [3:51:12<6:30:06, 5.81s/it] 54%|█████▍ | 4724/8750 [3:51:09<6:30:06, 5.81s/it] {'loss': 0.4756, 'learning_rate': 9.195760042342254e-06, 'epoch': 0.54} + 54%|█████▍ | 4724/8750 [3:51:12<6:30:06, 5.81s/it] {'loss': 0.4756, 'learning_rate': 9.195760042342254e-06, 'epoch': 0.54} + 54%|█████▍ | 4724/8750 [3:51:09<6:30:06, 5.81s/it] 54%|█████▍ | 4725/8750 [3:51:18<6:29:50, 5.81s/it] 54%|█████▍ | 4725/8750 [3:51:15<6:29:50, 5.81s/it] {'loss': 0.4545, 'learning_rate': 9.192070435418079e-06, 'epoch': 0.54} + 54%|█████▍ | 4725/8750 [3:51:18<6:29:50, 5.81s/it] {'loss': 0.4545, 'learning_rate': 9.192070435418079e-06, 'epoch': 0.54} + 54%|█████▍ | 4725/8750 [3:51:15<6:29:50, 5.81s/it] 54%|█████▍ | 4726/8750 [3:51:21<6:27:30, 5.78s/it] 54%|█████▍ | 4726/8750 [3:51:23<6:27:30, 5.78s/it]{'loss': 0.4611, 'learning_rate': 9.188380939198287e-06, 'epoch': 0.54} + {'loss': 0.4611, 'learning_rate': 9.188380939198287e-06, 'epoch': 0.54} + 54%|█████▍ | 4726/8750 [3:51:23<6:27:30, 5.78s/it] 54%|█████▍ | 4726/8750 [3:51:21<6:27:30, 5.78s/it] 54%|█████▍ | 4727/8750 [3:51:29<6:27:02, 5.77s/it] 54%|█████▍ | 4727/8750 [3:51:26<6:27:02, 5.77s/it] {'loss': 0.4593, 'learning_rate': 9.184691554188432e-06, 'epoch': 0.54} + 54%|█████▍ | 4727/8750 [3:51:29<6:27:02, 5.77s/it] {'loss': 0.4593, 'learning_rate': 9.184691554188432e-06, 'epoch': 0.54} + 54%|█████▍ | 4727/8750 [3:51:26<6:27:02, 5.77s/it] 54%|█████▍ | 4728/8750 [3:51:35<6:25:09, 5.75s/it] 54%|█████▍ | 4728/8750 [3:51:32<6:25:09, 5.75s/it] {'loss': 0.4526, 'learning_rate': 9.181002280894034e-06, 'epoch': 0.54} + 54%|█████▍ | 4728/8750 [3:51:35<6:25:09, 5.75s/it] {'loss': 0.4526, 'learning_rate': 9.181002280894034e-06, 'epoch': 0.54} + 54%|█████▍ | 4728/8750 [3:51:32<6:25:09, 5.75s/it] 54%|█████▍ | 4729/8750 [3:51:41<6:25:55, 5.76s/it] 54%|█████▍ | 4729/8750 
[3:51:38<6:25:55, 5.76s/it] {'loss': 0.4463, 'learning_rate': 9.177313119820608e-06, 'epoch': 0.54} + 54%|█████▍ | 4729/8750 [3:51:41<6:25:55, 5.76s/it] {'loss': 0.4463, 'learning_rate': 9.177313119820608e-06, 'epoch': 0.54} + 54%|█████▍ | 4729/8750 [3:51:38<6:25:55, 5.76s/it] 54%|█████▍ | 4730/8750 [3:51:44<6:26:27, 5.77s/it] 54%|█████▍ | 4730/8750 [3:51:47<6:26:28, 5.77s/it] {'loss': 0.4694, 'learning_rate': 9.173624071473655e-06, 'epoch': 0.54} + {'loss': 0.4694, 'learning_rate': 9.173624071473655e-06, 'epoch': 0.54} + 54%|█████▍ | 4730/8750 [3:51:47<6:26:28, 5.77s/it] 54%|█████▍ | 4730/8750 [3:51:44<6:26:27, 5.77s/it] 54%|█████▍ | 4731/8750 [3:51:52<6:29:24, 5.81s/it] 54%|█████▍ | 4731/8750 [3:51:50<6:29:24, 5.81s/it] {'loss': 0.4631, 'learning_rate': 9.16993513635865e-06, 'epoch': 0.54} + 54%|█████▍ | 4731/8750 [3:51:52<6:29:24, 5.81s/it] {'loss': 0.4631, 'learning_rate': 9.16993513635865e-06, 'epoch': 0.54} + 54%|█████▍ | 4731/8750 [3:51:50<6:29:24, 5.81s/it] 54%|█████▍ | 4732/8750 [3:51:58<6:29:05, 5.81s/it] 54%|█████▍ | 4732/8750 [3:51:55<6:29:05, 5.81s/it] {'loss': 0.4379, 'learning_rate': 9.166246314981066e-06, 'epoch': 0.54} + {'loss': 0.4379, 'learning_rate': 9.166246314981066e-06, 'epoch': 0.54} + 54%|█████▍ | 4732/8750 [3:51:58<6:29:05, 5.81s/it] 54%|█████▍ | 4732/8750 [3:51:55<6:29:05, 5.81s/it] 54%|█████▍ | 4733/8750 [3:52:04<6:31:33, 5.85s/it] 54%|█████▍ | 4733/8750 [3:52:01<6:31:33, 5.85s/it] {'loss': 0.4644, 'learning_rate': 9.162557607846352e-06, 'epoch': 0.54} + 54%|█████▍ | 4733/8750 [3:52:04<6:31:33, 5.85s/it] {'loss': 0.4644, 'learning_rate': 9.162557607846352e-06, 'epoch': 0.54} + 54%|█████▍ | 4733/8750 [3:52:01<6:31:33, 5.85s/it] 54%|█████▍ | 4734/8750 [3:52:10<6:27:22, 5.79s/it] 54%|█████▍ | 4734/8750 [3:52:07<6:27:22, 5.79s/it] {'loss': 0.4767, 'learning_rate': 9.158869015459939e-06, 'epoch': 0.54} + 54%|█████▍ | 4734/8750 [3:52:10<6:27:22, 5.79s/it] {'loss': 0.4767, 'learning_rate': 9.158869015459939e-06, 'epoch': 0.54} + 54%|█████▍ | 
4734/8750 [3:52:07<6:27:22, 5.79s/it] 54%|█████▍ | 4735/8750 [3:52:13<6:26:10, 5.77s/it] 54%|█████▍ | 4735/8750 [3:52:16<6:26:11, 5.77s/it] {'loss': 0.4582, 'learning_rate': 9.155180538327255e-06, 'epoch': 0.54} + 54%|█████▍ | 4735/8750 [3:52:16<6:26:11, 5.77s/it] {'loss': 0.4582, 'learning_rate': 9.155180538327255e-06, 'epoch': 0.54} + 54%|█████▍ | 4735/8750 [3:52:13<6:26:10, 5.77s/it] 54%|█████▍ | 4736/8750 [3:52:21<6:27:07, 5.79s/it] 54%|█████▍ | 4736/8750 [3:52:18<6:27:07, 5.79s/it] {'loss': 0.4598, 'learning_rate': 9.151492176953697e-06, 'epoch': 0.54} + 54%|█████▍ | 4736/8750 [3:52:21<6:27:07, 5.79s/it] {'loss': 0.4598, 'learning_rate': 9.151492176953697e-06, 'epoch': 0.54} + 54%|█████▍ | 4736/8750 [3:52:18<6:27:07, 5.79s/it] 54%|█████▍ | 4737/8750 [3:52:27<6:26:05, 5.77s/it] 54%|█████▍ | 4737/8750 [3:52:24<6:26:05, 5.77s/it] {'loss': 0.4647, 'learning_rate': 9.147803931844651e-06, 'epoch': 0.54} + 54%|█████▍ | 4737/8750 [3:52:27<6:26:05, 5.77s/it] {'loss': 0.4647, 'learning_rate': 9.147803931844651e-06, 'epoch': 0.54} + 54%|█████▍ | 4737/8750 [3:52:24<6:26:05, 5.77s/it] 54%|█████▍ | 4738/8750 [3:52:30<6:24:24, 5.75s/it] 54%|█████▍ | 4738/8750 [3:52:33<6:24:26, 5.75s/it] {'loss': 0.4929, 'learning_rate': 9.144115803505498e-06, 'epoch': 0.54} + 54%|█████▍ | 4738/8750 [3:52:33<6:24:26, 5.75s/it] {'loss': 0.4929, 'learning_rate': 9.144115803505498e-06, 'epoch': 0.54} + 54%|█████▍ | 4738/8750 [3:52:30<6:24:24, 5.75s/it] 54%|█████▍ | 4739/8750 [3:52:36<6:24:48, 5.76s/it] 54%|█████▍ | 4739/8750 [3:52:39<6:24:49, 5.76s/it] {'loss': 0.4567, 'learning_rate': 9.140427792441584e-06, 'epoch': 0.54} + 54%|█████▍ | 4739/8750 [3:52:39<6:24:49, 5.76s/it] {'loss': 0.4567, 'learning_rate': 9.140427792441584e-06, 'epoch': 0.54} + 54%|█████▍ | 4739/8750 [3:52:36<6:24:48, 5.76s/it] 54%|█████▍ | 4740/8750 [3:52:44<6:27:25, 5.80s/it] 54%|█████▍ | 4740/8750 [3:52:42<6:27:25, 5.80s/it] {'loss': 0.4656, 'learning_rate': 9.136739899158257e-06, 'epoch': 0.54} + 54%|█████▍ | 4740/8750 
[3:52:44<6:27:25, 5.80s/it] {'loss': 0.4656, 'learning_rate': 9.136739899158257e-06, 'epoch': 0.54} + 54%|█████▍ | 4740/8750 [3:52:42<6:27:25, 5.80s/it] 54%|█████▍ | 4741/8750 [3:52:50<6:26:24, 5.78s/it] 54%|█████▍ | 4741/8750 [3:52:47<6:26:25, 5.78s/it] {'loss': 0.4568, 'learning_rate': 9.133052124160837e-06, 'epoch': 0.54} + 54%|█████▍ | 4741/8750 [3:52:50<6:26:24, 5.78s/it] {'loss': 0.4568, 'learning_rate': 9.133052124160837e-06, 'epoch': 0.54} + 54%|█████▍ | 4741/8750 [3:52:47<6:26:25, 5.78s/it] 54%|█████▍ | 4742/8750 [3:52:56<6:24:08, 5.75s/it] 54%|█████▍ | 4742/8750 [3:52:53<6:24:09, 5.75s/it] {'loss': 0.4599, 'learning_rate': 9.129364467954628e-06, 'epoch': 0.54} + 54%|█████▍ | 4742/8750 [3:52:56<6:24:08, 5.75s/it] {'loss': 0.4599, 'learning_rate': 9.129364467954628e-06, 'epoch': 0.54} + 54%|█████▍ | 4742/8750 [3:52:53<6:24:09, 5.75s/it] 54%|█████▍ | 4743/8750 [3:53:02<6:21:51, 5.72s/it] 54%|█████▍ | 4743/8750 [3:52:59<6:21:51, 5.72s/it] {'loss': 0.4576, 'learning_rate': 9.125676931044928e-06, 'epoch': 0.54} + 54%|█████▍ | 4743/8750 [3:53:02<6:21:51, 5.72s/it] {'loss': 0.4576, 'learning_rate': 9.125676931044928e-06, 'epoch': 0.54} + 54%|█████▍ | 4743/8750 [3:52:59<6:21:51, 5.72s/it] 54%|█████▍ | 4744/8750 [3:53:07<6:21:08, 5.71s/it] 54%|█████▍ | 4744/8750 [3:53:04<6:21:08, 5.71s/it] {'loss': 0.456, 'learning_rate': 9.121989513937007e-06, 'epoch': 0.54} + 54%|█████▍ | 4744/8750 [3:53:07<6:21:08, 5.71s/it] {'loss': 0.456, 'learning_rate': 9.121989513937007e-06, 'epoch': 0.54} + 54%|█████▍ | 4744/8750 [3:53:04<6:21:08, 5.71s/it] 54%|█████▍ | 4745/8750 [3:53:13<6:22:23, 5.73s/it] 54%|█████▍ | 4745/8750 [3:53:10<6:22:23, 5.73s/it] {'loss': 0.4509, 'learning_rate': 9.11830221713613e-06, 'epoch': 0.54} + 54%|█████▍ | 4745/8750 [3:53:13<6:22:23, 5.73s/it] {'loss': 0.4509, 'learning_rate': 9.11830221713613e-06, 'epoch': 0.54} + 54%|█████▍ | 4745/8750 [3:53:10<6:22:23, 5.73s/it] 54%|█████▍ | 4746/8750 [3:53:19<6:19:15, 5.68s/it] 54%|█████▍ | 4746/8750 
[3:53:16<6:19:14, 5.68s/it] {'loss': 0.4747, 'learning_rate': 9.11461504114753e-06, 'epoch': 0.54} + 54%|█████▍ | 4746/8750 [3:53:19<6:19:15, 5.68s/it] {'loss': 0.4747, 'learning_rate': 9.11461504114753e-06, 'epoch': 0.54} + 54%|█████▍ | 4746/8750 [3:53:16<6:19:14, 5.68s/it] 54%|█████▍ | 4747/8750 [3:53:24<6:21:02, 5.71s/it] 54%|█████▍ | 4747/8750 [3:53:21<6:21:02, 5.71s/it] {'loss': 0.4661, 'learning_rate': 9.110927986476434e-06, 'epoch': 0.54} + 54%|█████▍ | 4747/8750 [3:53:24<6:21:02, 5.71s/it] {'loss': 0.4661, 'learning_rate': 9.110927986476434e-06, 'epoch': 0.54} + 54%|█████▍ | 4747/8750 [3:53:21<6:21:02, 5.71s/it] 54%|█████▍ | 4748/8750 [3:53:27<6:22:04, 5.73s/it] 54%|█████▍ | 4748/8750 [3:53:30<6:22:04, 5.73s/it] {'loss': 0.4478, 'learning_rate': 9.107241053628058e-06, 'epoch': 0.54} + {'loss': 0.4478, 'learning_rate': 9.107241053628058e-06, 'epoch': 0.54} + 54%|█████▍ | 4748/8750 [3:53:30<6:22:04, 5.73s/it] 54%|█████▍ | 4748/8750 [3:53:27<6:22:04, 5.73s/it] 54%|█████▍ | 4749/8750 [3:53:36<6:24:01, 5.76s/it] 54%|█████▍ | 4749/8750 [3:53:33<6:24:01, 5.76s/it] {'loss': 0.4496, 'learning_rate': 9.103554243107592e-06, 'epoch': 0.54} + 54%|█████▍ | 4749/8750 [3:53:36<6:24:01, 5.76s/it] {'loss': 0.4496, 'learning_rate': 9.103554243107592e-06, 'epoch': 0.54} + 54%|█████▍ | 4749/8750 [3:53:33<6:24:01, 5.76s/it]6 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + 54%|█████▍ | 4750/8750 [3:53:42<6:22:49, 5.74s/it]11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... 
+3 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 54%|█████▍ | 4750/8750 [3:53:39<6:22:50, 5.74s/it] {'loss': 0.4703, 'learning_rate': 9.0998675554202e-06, 'epoch': 0.54} + 54%|█████▍ | 4750/8750 [3:53:42<6:22:49, 5.74s/it] {'loss': 0.4703, 'learning_rate': 9.0998675554202e-06, 'epoch': 0.54} + 54%|█████▍ | 4750/8750 [3:53:39<6:22:50, 5.74s/it] 54%|█████▍ | 4751/8750 [3:53:47<6:22:42, 5.74s/it] 54%|█████▍ | 4751/8750 [3:53:44<6:22:41, 5.74s/it] {'loss': 0.4598, 'learning_rate': 9.096180991071055e-06, 'epoch': 0.54} + 54%|█████▍ | 4751/8750 [3:53:47<6:22:42, 5.74s/it] {'loss': 0.4598, 'learning_rate': 9.096180991071055e-06, 'epoch': 0.54} + 54%|█████▍ | 4751/8750 [3:53:44<6:22:41, 5.74s/it] 54%|█████▍ | 4752/8750 [3:53:50<6:26:47, 5.80s/it] 54%|█████▍ | 4752/8750 [3:53:53<6:26:48, 5.80s/it] {'loss': 0.461, 'learning_rate': 9.09249455056529e-06, 'epoch': 0.54} + 54%|█████▍ | 4752/8750 [3:53:53<6:26:48, 5.80s/it] {'loss': 0.461, 'learning_rate': 9.09249455056529e-06, 'epoch': 0.54} + 54%|█████▍ | 4752/8750 [3:53:50<6:26:47, 5.80s/it] 54%|█████▍ | 4753/8750 [3:54:00<6:34:37, 5.92s/it] 54%|█████▍ | 4753/8750 [3:53:57<6:34:37, 5.92s/it] {'loss': 0.4479, 'learning_rate': 9.088808234408037e-06, 'epoch': 0.54} + 54%|█████▍ | 4753/8750 [3:54:00<6:34:37, 5.92s/it] {'loss': 0.4479, 'learning_rate': 9.088808234408037e-06, 'epoch': 0.54} + 54%|█████▍ | 4753/8750 [3:53:57<6:34:37, 5.92s/it] 54%|█████▍ | 4754/8750 [3:54:05<6:29:11, 5.84s/it] 54%|█████▍ | 4754/8750 [3:54:02<6:29:10, 5.84s/it] {'loss': 0.4583, 'learning_rate': 9.0851220431044e-06, 'epoch': 0.54} + 54%|█████▍ | 4754/8750 [3:54:05<6:29:11, 5.84s/it] {'loss': 0.4583, 'learning_rate': 9.0851220431044e-06, 'epoch': 0.54} + 54%|█████▍ | 4754/8750 [3:54:02<6:29:10, 5.84s/it] 54%|█████▍ | 4755/8750 [3:54:11<6:26:32, 5.81s/it] 54%|█████▍ | 4755/8750 [3:54:08<6:26:32, 
5.81s/it] {'loss': 0.4393, 'learning_rate': 9.081435977159464e-06, 'epoch': 0.54} + 54%|█████▍ | 4755/8750 [3:54:11<6:26:32, 5.81s/it] {'loss': 0.4393, 'learning_rate': 9.081435977159464e-06, 'epoch': 0.54} + 54%|█████▍ | 4755/8750 [3:54:08<6:26:32, 5.81s/it] 54%|█████▍ | 4756/8750 [3:54:17<6:25:21, 5.79s/it] 54%|█████▍ | 4756/8750 [3:54:14<6:25:22, 5.79s/it] {'loss': 0.4853, 'learning_rate': 9.07775003707831e-06, 'epoch': 0.54} + 54%|█████▍ | 4756/8750 [3:54:17<6:25:21, 5.79s/it] {'loss': 0.4853, 'learning_rate': 9.07775003707831e-06, 'epoch': 0.54} + 54%|█████▍ | 4756/8750 [3:54:14<6:25:22, 5.79s/it] 54%|█████▍ | 4757/8750 [3:54:22<6:24:55, 5.78s/it] 54%|█████▍ | 4757/8750 [3:54:20<6:24:55, 5.78s/it] {'loss': 0.4505, 'learning_rate': 9.074064223365986e-06, 'epoch': 0.54} + 54%|█████▍ | 4757/8750 [3:54:22<6:24:55, 5.78s/it] {'loss': 0.4505, 'learning_rate': 9.074064223365986e-06, 'epoch': 0.54} + 54%|█████▍ | 4757/8750 [3:54:20<6:24:55, 5.78s/it] 54%|█████▍ | 4758/8750 [3:54:28<6:28:23, 5.84s/it] 54%|█████▍ | 4758/8750 [3:54:25<6:28:23, 5.84s/it] {'loss': 0.4418, 'learning_rate': 9.070378536527544e-06, 'epoch': 0.54} + {'loss': 0.4418, 'learning_rate': 9.070378536527544e-06, 'epoch': 0.54} 54%|█████▍ | 4758/8750 [3:54:28<6:28:23, 5.84s/it] + 54%|█████▍ | 4758/8750 [3:54:26<6:28:23, 5.84s/it] 54%|█████▍ | 4759/8750 [3:54:34<6:26:04, 5.80s/it] 54%|█████▍ | 4759/8750 [3:54:31<6:26:04, 5.80s/it] {'loss': 0.4643, 'learning_rate': 9.066692977067996e-06, 'epoch': 0.54} + 54%|█████▍ | 4759/8750 [3:54:34<6:26:04, 5.80s/it] {'loss': 0.4643, 'learning_rate': 9.066692977067996e-06, 'epoch': 0.54} + 54%|█████▍ | 4759/8750 [3:54:31<6:26:04, 5.80s/it] 54%|█████▍ | 4760/8750 [3:54:40<6:21:02, 5.73s/it] 54%|█████▍ | 4760/8750 [3:54:37<6:21:02, 5.73s/it] {'loss': 0.4698, 'learning_rate': 9.063007545492342e-06, 'epoch': 0.54} + 54%|█████▍ | 4760/8750 [3:54:40<6:21:02, 5.73s/it] {'loss': 0.4698, 'learning_rate': 9.063007545492342e-06, 'epoch': 0.54} + 54%|█████▍ | 4760/8750 
[3:54:37<6:21:02, 5.73s/it] 54%|█████▍ | 4761/8750 [3:54:45<6:21:56, 5.75s/it] 54%|█████▍ | 4761/8750 [3:54:43<6:21:57, 5.75s/it] {'loss': 0.4585, 'learning_rate': 9.059322242305579e-06, 'epoch': 0.54} + 54%|█████▍ | 4761/8750 [3:54:45<6:21:56, 5.75s/it] {'loss': 0.4585, 'learning_rate': 9.059322242305579e-06, 'epoch': 0.54} + 54%|█████▍ | 4761/8750 [3:54:43<6:21:57, 5.75s/it] 54%|█████▍ | 4762/8750 [3:54:51<6:20:12, 5.72s/it] 54%|█████▍ | 4762/8750 [3:54:48<6:20:11, 5.72s/it] {'loss': 0.4436, 'learning_rate': 9.055637068012664e-06, 'epoch': 0.54} + 54%|█████▍ | 4762/8750 [3:54:51<6:20:12, 5.72s/it] {'loss': 0.4436, 'learning_rate': 9.055637068012664e-06, 'epoch': 0.54} + 54%|█████▍ | 4762/8750 [3:54:48<6:20:11, 5.72s/it] 54%|█████▍ | 4763/8750 [3:54:57<6:20:48, 5.73s/it] 54%|█████▍ | 4763/8750 [3:54:54<6:20:47, 5.73s/it] {'loss': 0.4627, 'learning_rate': 9.051952023118563e-06, 'epoch': 0.54} + 54%|█████▍ | 4763/8750 [3:54:57<6:20:48, 5.73s/it] {'loss': 0.4627, 'learning_rate': 9.051952023118563e-06, 'epoch': 0.54} + 54%|█████▍ | 4763/8750 [3:54:54<6:20:47, 5.73s/it] 54%|█████▍ | 4764/8750 [3:55:03<6:18:47, 5.70s/it] 54%|█████▍ | 4764/8750 [3:55:00<6:18:46, 5.70s/it] {'loss': 0.4787, 'learning_rate': 9.048267108128198e-06, 'epoch': 0.54} + 54%|█████▍ | 4764/8750 [3:55:03<6:18:47, 5.70s/it] {'loss': 0.4787, 'learning_rate': 9.048267108128198e-06, 'epoch': 0.54} + 54%|█████▍ | 4764/8750 [3:55:00<6:18:46, 5.70s/it] 54%|█████▍ | 4765/8750 [3:55:08<6:22:04, 5.75s/it] 54%|█████▍ | 4765/8750 [3:55:05<6:22:03, 5.75s/it] {'loss': 0.457, 'learning_rate': 9.044582323546486e-06, 'epoch': 0.54} + {'loss': 0.457, 'learning_rate': 9.044582323546486e-06, 'epoch': 0.54} 54%|█████▍ | 4765/8750 [3:55:08<6:22:04, 5.75s/it] + 54%|█████▍ | 4765/8750 [3:55:05<6:22:03, 5.75s/it] 54%|█████▍ | 4766/8750 [3:55:14<6:20:53, 5.74s/it] 54%|█████▍ | 4766/8750 [3:55:11<6:20:53, 5.74s/it] {'loss': 0.4668, 'learning_rate': 9.040897669878327e-06, 'epoch': 0.54} + 54%|█████▍ | 4766/8750 
[3:55:14<6:20:53, 5.74s/it] {'loss': 0.4668, 'learning_rate': 9.040897669878327e-06, 'epoch': 0.54} + 54%|█████▍ | 4766/8750 [3:55:11<6:20:53, 5.74s/it] 54%|█████▍ | 4767/8750 [3:55:20<6:24:16, 5.79s/it] 54%|█████▍ | 4767/8750 [3:55:17<6:24:16, 5.79s/it] {'loss': 0.4772, 'learning_rate': 9.037213147628603e-06, 'epoch': 0.54} + 54%|█████▍ | 4767/8750 [3:55:20<6:24:16, 5.79s/it] {'loss': 0.4772, 'learning_rate': 9.037213147628603e-06, 'epoch': 0.54} + 54%|█████▍ | 4767/8750 [3:55:17<6:24:16, 5.79s/it] 54%|█████▍ | 4768/8750 [3:55:26<6:26:15, 5.82s/it] 54%|█████▍ | 4768/8750 [3:55:23<6:26:16, 5.82s/it] {'loss': 0.4538, 'learning_rate': 9.033528757302167e-06, 'epoch': 0.54} + 54%|█████▍ | 4768/8750 [3:55:26<6:26:15, 5.82s/it] {'loss': 0.4538, 'learning_rate': 9.033528757302167e-06, 'epoch': 0.54} + 54%|█████▍ | 4768/8750 [3:55:23<6:26:16, 5.82s/it] 55%|█████▍ | 4769/8750 [3:55:32<6:27:15, 5.84s/it] 55%|█████▍ | 4769/8750 [3:55:29<6:27:15, 5.84s/it] {'loss': 0.4749, 'learning_rate': 9.029844499403876e-06, 'epoch': 0.55} + 55%|█████▍ | 4769/8750 [3:55:32<6:27:15, 5.84s/it] {'loss': 0.4749, 'learning_rate': 9.029844499403876e-06, 'epoch': 0.55} + 55%|█████▍ | 4769/8750 [3:55:29<6:27:15, 5.84s/it] 55%|█████▍ | 4770/8750 [3:55:37<6:23:01, 5.77s/it] 55%|█████▍ | 4770/8750 [3:55:34<6:23:00, 5.77s/it] {'loss': 0.4524, 'learning_rate': 9.026160374438543e-06, 'epoch': 0.55} + 55%|█████▍ | 4770/8750 [3:55:37<6:23:01, 5.77s/it] {'loss': 0.4524, 'learning_rate': 9.026160374438543e-06, 'epoch': 0.55} + 55%|█████▍ | 4770/8750 [3:55:34<6:23:00, 5.77s/it] 55%|█████▍ | 4771/8750 [3:55:43<6:21:12, 5.75s/it] 55%|█████▍ | 4771/8750 [3:55:40<6:21:12, 5.75s/it] {'loss': 0.4618, 'learning_rate': 9.022476382910983e-06, 'epoch': 0.55} + 55%|█████▍ | 4771/8750 [3:55:43<6:21:12, 5.75s/it] {'loss': 0.4618, 'learning_rate': 9.022476382910983e-06, 'epoch': 0.55} + 55%|█████▍ | 4771/8750 [3:55:40<6:21:12, 5.75s/it] 55%|█████▍ | 4772/8750 [3:55:49<6:16:28, 5.68s/it] 55%|█████▍ | 4772/8750 
[3:55:46<6:16:28, 5.68s/it] {'loss': 0.4579, 'learning_rate': 9.018792525325986e-06, 'epoch': 0.55} + 55%|█████▍ | 4772/8750 [3:55:49<6:16:28, 5.68s/it] {'loss': 0.4579, 'learning_rate': 9.018792525325986e-06, 'epoch': 0.55} + 55%|█████▍ | 4772/8750 [3:55:46<6:16:28, 5.68s/it] 55%|█████▍ | 4773/8750 [3:55:51<6:16:45, 5.68s/it] 55%|█████▍ | 4773/8750 [3:55:54<6:16:45, 5.68s/it] {'loss': 0.4505, 'learning_rate': 9.015108802188314e-06, 'epoch': 0.55} + {'loss': 0.4505, 'learning_rate': 9.015108802188314e-06, 'epoch': 0.55} + 55%|█████▍ | 4773/8750 [3:55:54<6:16:45, 5.68s/it] 55%|█████▍ | 4773/8750 [3:55:51<6:16:45, 5.68s/it] 55%|█████▍ | 4774/8750 [3:56:00<6:17:58, 5.70s/it] 55%|█████▍ | 4774/8750 [3:55:57<6:17:58, 5.70s/it] {'loss': 0.4616, 'learning_rate': 9.01142521400273e-06, 'epoch': 0.55} + 55%|█████▍ | 4774/8750 [3:56:00<6:17:58, 5.70s/it] {'loss': 0.4616, 'learning_rate': 9.01142521400273e-06, 'epoch': 0.55} + 55%|█████▍ | 4774/8750 [3:55:57<6:17:58, 5.70s/it] 55%|█████▍ | 4775/8750 [3:56:06<6:19:09, 5.72s/it] 55%|█████▍ | 4775/8750 [3:56:03<6:19:09, 5.72s/it] {'loss': 0.447, 'learning_rate': 9.007741761273957e-06, 'epoch': 0.55} + 55%|█████▍ | 4775/8750 [3:56:06<6:19:09, 5.72s/it] {'loss': 0.447, 'learning_rate': 9.007741761273957e-06, 'epoch': 0.55} + 55%|█████▍ | 4775/8750 [3:56:03<6:19:09, 5.72s/it] 55%|█████▍ | 4776/8750 [3:56:12<6:20:51, 5.75s/it] 55%|█████▍ | 4776/8750 [3:56:09<6:20:52, 5.75s/it] {'loss': 0.4495, 'learning_rate': 9.004058444506718e-06, 'epoch': 0.55} + 55%|█████▍ | 4776/8750 [3:56:12<6:20:51, 5.75s/it] {'loss': 0.4495, 'learning_rate': 9.004058444506718e-06, 'epoch': 0.55} + 55%|█████▍ | 4776/8750 [3:56:09<6:20:52, 5.75s/it] 55%|█████▍ | 4777/8750 [3:56:17<6:18:31, 5.72s/it] 55%|█████▍ | 4777/8750 [3:56:14<6:18:31, 5.72s/it] {'loss': 0.4651, 'learning_rate': 9.000375264205713e-06, 'epoch': 0.55} + 55%|█████▍ | 4777/8750 [3:56:17<6:18:31, 5.72s/it] {'loss': 0.4651, 'learning_rate': 9.000375264205713e-06, 'epoch': 0.55} + 55%|█████▍ | 
4777/8750 [3:56:14<6:18:31, 5.72s/it] 55%|█████▍ | 4778/8750 [3:56:23<6:22:07, 5.77s/it] 55%|█████▍ | 4778/8750 [3:56:20<6:22:07, 5.77s/it] {'loss': 0.4482, 'learning_rate': 8.996692220875608e-06, 'epoch': 0.55} + 55%|█████▍ | 4778/8750 [3:56:23<6:22:07, 5.77s/it] {'loss': 0.4482, 'learning_rate': 8.996692220875608e-06, 'epoch': 0.55} + 55%|█████▍ | 4778/8750 [3:56:20<6:22:07, 5.77s/it] 55%|█████▍ | 4779/8750 [3:56:29<6:25:41, 5.83s/it] 55%|█████▍ | 4779/8750 [3:56:26<6:25:40, 5.83s/it] {'loss': 0.467, 'learning_rate': 8.993009315021073e-06, 'epoch': 0.55} + 55%|█████▍ | 4779/8750 [3:56:29<6:25:41, 5.83s/it] {'loss': 0.467, 'learning_rate': 8.993009315021073e-06, 'epoch': 0.55} + 55%|█████▍ | 4779/8750 [3:56:26<6:25:40, 5.83s/it] 55%|█████▍ | 4780/8750 [3:56:35<6:27:32, 5.86s/it] 55%|█████▍ | 4780/8750 [3:56:32<6:27:32, 5.86s/it] {'loss': 0.4633, 'learning_rate': 8.989326547146743e-06, 'epoch': 0.55} + 55%|█████▍ | 4780/8750 [3:56:35<6:27:32, 5.86s/it] {'loss': 0.4633, 'learning_rate': 8.989326547146743e-06, 'epoch': 0.55} + 55%|█████▍ | 4780/8750 [3:56:32<6:27:32, 5.86s/it] 55%|█████▍ | 4781/8750 [3:56:41<6:24:18, 5.81s/it] 55%|█████▍ | 4781/8750 [3:56:38<6:24:19, 5.81s/it] {'loss': 0.4625, 'learning_rate': 8.985643917757237e-06, 'epoch': 0.55} + 55%|█████▍ | 4781/8750 [3:56:41<6:24:18, 5.81s/it] {'loss': 0.4625, 'learning_rate': 8.985643917757237e-06, 'epoch': 0.55} + 55%|█████▍ | 4781/8750 [3:56:38<6:24:19, 5.81s/it] 55%|█████▍ | 4782/8750 [3:56:47<6:23:50, 5.80s/it] 55%|█████▍ | 4782/8750 [3:56:44<6:23:51, 5.80s/it] {'loss': 0.4593, 'learning_rate': 8.981961427357166e-06, 'epoch': 0.55} + 55%|█████▍ | 4782/8750 [3:56:47<6:23:50, 5.80s/it] {'loss': 0.4593, 'learning_rate': 8.981961427357166e-06, 'epoch': 0.55} + 55%|█████▍ | 4782/8750 [3:56:44<6:23:51, 5.80s/it] 55%|█████▍ | 4783/8750 [3:56:49<6:24:39, 5.82s/it] 55%|█████▍ | 4783/8750 [3:56:52<6:24:39, 5.82s/it] {'loss': 0.4615, 'learning_rate': 8.978279076451104e-06, 'epoch': 0.55} + 55%|█████▍ | 4783/8750 
[3:56:52<6:24:39, 5.82s/it] {'loss': 0.4615, 'learning_rate': 8.978279076451104e-06, 'epoch': 0.55} + 55%|█████▍ | 4783/8750 [3:56:49<6:24:39, 5.82s/it] 55%|█████▍ | 4784/8750 [3:56:59<6:34:50, 5.97s/it] 55%|█████▍ | 4784/8750 [3:56:56<6:34:49, 5.97s/it] {'loss': 0.4491, 'learning_rate': 8.974596865543624e-06, 'epoch': 0.55} + 55%|█████▍ | 4784/8750 [3:56:59<6:34:50, 5.97s/it] {'loss': 0.4491, 'learning_rate': 8.974596865543624e-06, 'epoch': 0.55} + 55%|█████▍ | 4784/8750 [3:56:56<6:34:49, 5.97s/it] 55%|█████▍ | 4785/8750 [3:57:04<6:26:57, 5.86s/it] 55%|█████▍ | 4785/8750 [3:57:01<6:26:58, 5.86s/it] {'loss': 0.478, 'learning_rate': 8.970914795139264e-06, 'epoch': 0.55} + 55%|█████▍ | 4785/8750 [3:57:04<6:26:57, 5.86s/it] {'loss': 0.478, 'learning_rate': 8.970914795139264e-06, 'epoch': 0.55} + 55%|█████▍ | 4785/8750 [3:57:01<6:26:58, 5.86s/it] 55%|█████▍ | 4786/8750 [3:57:10<6:28:03, 5.87s/it] 55%|█████▍ | 4786/8750 [3:57:07<6:28:02, 5.87s/it] {'loss': 0.4484, 'learning_rate': 8.967232865742552e-06, 'epoch': 0.55} + 55%|█████▍ | 4786/8750 [3:57:10<6:28:03, 5.87s/it] {'loss': 0.4484, 'learning_rate': 8.967232865742552e-06, 'epoch': 0.55} + 55%|█████▍ | 4786/8750 [3:57:07<6:28:02, 5.87s/it]Apr 10 14:09:24.552054 999102 slurmstepd 0x155550ab8700: error: *** STEP 6710093.0 ON batch-block1-0082 CANCELLED AT 2025-04-10T14:09:24 DUE TO TIME LIMIT *** +srun: Job step aborted: Waiting up to 122 seconds for job step to finish. 
+ 55%|█████▍ | 4787/8750 [3:57:16<6:26:51, 5.86s/it] 55%|█████▍ | 4787/8750 [3:57:13<6:26:51, 5.86s/it] {'loss': 0.4514, 'learning_rate': 8.963551077857999e-06, 'epoch': 0.55} + 55%|█████▍ | 4787/8750 [3:57:16<6:26:51, 5.86s/it] {'loss': 0.4514, 'learning_rate': 8.963551077857999e-06, 'epoch': 0.55} + 55%|█████▍ | 4787/8750 [3:57:13<6:26:51, 5.86s/it]srun: error: batch-block1-10017: task 1: Terminated +srun: Terminating StepId=6710093.0 +srun: error: batch-block1-0082: task 0: Terminated +srun: job 6724028 queued and waiting for resources +srun: job 6724028 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2097 +JobID: 6724028 | Full list: batch-block1-2097 batch-block1-10017 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-2097 +JobID: 6724028 | Full list: batch-block1-2097 batch-block1-10017 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! 
+Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 14:11:41,578] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,579] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 14:11:41,686] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,697] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,711] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,721] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,724] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,742] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:41,928] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:41,928] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:41,928] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:41,928] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,036] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,036] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,038] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,038] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,046] [WARNING] 
[comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,046] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,047] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,047] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,053] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,053] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,066] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,066] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,424] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,425] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,432] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,432] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,440] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,440] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,447] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,447] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 14:11:42,728] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,728] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,728] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in 
DeepSpeed not yet implemented +[2025-04-10 14:11:42,728] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 14:11:42,728] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,738] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,738] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,738] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,738] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,739] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,739] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,739] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,739] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,751] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,751] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 14:11:42,759] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 14:11:42,759] [INFO] [comm.py:594:init_distributed] cdb=None +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. 
Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-04-10 14:11:53,089] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 2.70B parameters + Loading checkpoint shards: 0%| | 0/2 [00:00 4096). Running this sequence through the model will result in indexing errors + 73%|███████▎ | 6366/8750 [2:45:20<3:49:42, 5.78s/it] 73%|███████▎ | 6366/8750 [2:45:18<3:49:42, 5.78s/it] {'loss': 0.4569, 'learning_rate': 3.6475761612188177e-06, 'epoch': 0.73} + 73%|███████▎ | 6366/8750 [2:45:20<3:49:42, 5.78s/it] {'loss': 0.4569, 'learning_rate': 3.6475761612188177e-06, 'epoch': 0.73} + 73%|███████▎ | 6366/8750 [2:45:18<3:49:42, 5.78s/it] 73%|███████▎ | 6367/8750 [2:45:25<3:48:51, 5.76s/it] 73%|███████▎ | 6367/8750 [2:45:23<3:48:51, 5.76s/it] {'loss': 0.4629, 'learning_rate': 3.644717762601978e-06, 'epoch': 0.73} + 73%|███████▎ | 6367/8750 [2:45:25<3:48:51, 5.76s/it] {'loss': 0.4629, 'learning_rate': 3.644717762601978e-06, 'epoch': 0.73} + 73%|███████▎ | 6367/8750 [2:45:23<3:48:51, 5.76s/it] 73%|███████▎ | 6368/8750 [2:45:31<3:46:41, 5.71s/it] 73%|███████▎ | 6368/8750 [2:45:29<3:46:41, 5.71s/it] {'loss': 0.4604, 'learning_rate': 3.6418602348006903e-06, 'epoch': 0.73} + 73%|███████▎ | 6368/8750 [2:45:31<3:46:41, 5.71s/it] {'loss': 0.4604, 'learning_rate': 3.6418602348006903e-06, 'epoch': 0.73} + 73%|███████▎ | 6368/8750 [2:45:29<3:46:41, 5.71s/it] 73%|███████▎ | 6369/8750 [2:45:37<3:48:14, 5.75s/it] 73%|███████▎ | 6369/8750 [2:45:35<3:48:14, 5.75s/it] {'loss': 0.4635, 'learning_rate': 3.639003578206508e-06, 'epoch': 0.73} + 73%|███████▎ | 6369/8750 [2:45:37<3:48:14, 5.75s/it] {'loss': 0.4635, 'learning_rate': 3.639003578206508e-06, 'epoch': 0.73} + 73%|███████▎ | 6369/8750 [2:45:35<3:48:14, 5.75s/it] 73%|███████▎ | 6370/8750 [2:45:43<3:48:33, 5.76s/it] 73%|███████▎ | 6370/8750 [2:45:40<3:48:33, 5.76s/it] {'loss': 0.4692, 
'learning_rate': 3.6361477932108513e-06, 'epoch': 0.73} + 73%|███████▎ | 6370/8750 [2:45:43<3:48:33, 5.76s/it] {'loss': 0.4692, 'learning_rate': 3.6361477932108513e-06, 'epoch': 0.73} + 73%|███████▎ | 6370/8750 [2:45:40<3:48:33, 5.76s/it] 73%|███████▎ | 6371/8750 [2:45:46<3:48:11, 5.76s/it] 73%|███████▎ | 6371/8750 [2:45:48<3:48:11, 5.76s/it] {'loss': 0.4452, 'learning_rate': 3.633292880205024e-06, 'epoch': 0.73} + 73%|███████▎ | 6371/8750 [2:45:48<3:48:11, 5.76s/it] {'loss': 0.4452, 'learning_rate': 3.633292880205024e-06, 'epoch': 0.73} + 73%|███████▎ | 6371/8750 [2:45:46<3:48:11, 5.76s/it] 73%|███████▎ | 6372/8750 [2:45:54<3:47:11, 5.73s/it] 73%|███████▎ | 6372/8750 [2:45:52<3:47:12, 5.73s/it] {'loss': 0.4524, 'learning_rate': 3.630438839580217e-06, 'epoch': 0.73} + 73%|███████▎ | 6372/8750 [2:45:54<3:47:11, 5.73s/it] {'loss': 0.4524, 'learning_rate': 3.630438839580217e-06, 'epoch': 0.73} + 73%|███████▎ | 6372/8750 [2:45:52<3:47:12, 5.73s/it] 73%|███████▎ | 6373/8750 [2:46:00<3:48:00, 5.76s/it] 73%|███████▎ | 6373/8750 [2:45:58<3:48:00, 5.76s/it] {'loss': 0.4447, 'learning_rate': 3.627585671727496e-06, 'epoch': 0.73} + 73%|███████▎ | 6373/8750 [2:46:00<3:48:00, 5.76s/it] {'loss': 0.4447, 'learning_rate': 3.627585671727496e-06, 'epoch': 0.73} + 73%|███████▎ | 6373/8750 [2:45:58<3:48:00, 5.76s/it] 73%|███████▎ | 6374/8750 [2:46:06<3:47:18, 5.74s/it] 73%|███████▎ | 6374/8750 [2:46:03<3:47:18, 5.74s/it] {'loss': 0.4762, 'learning_rate': 3.6247333770378133e-06, 'epoch': 0.73} + 73%|███████▎ | 6374/8750 [2:46:06<3:47:18, 5.74s/it] {'loss': 0.4762, 'learning_rate': 3.6247333770378133e-06, 'epoch': 0.73} + 73%|███████▎ | 6374/8750 [2:46:03<3:47:18, 5.74s/it] 73%|███████▎ | 6375/8750 [2:46:09<3:46:29, 5.72s/it] 73%|███████▎ | 6375/8750 [2:46:11<3:46:29, 5.72s/it] {'loss': 0.4542, 'learning_rate': 3.6218819559019934e-06, 'epoch': 0.73} + 73%|███████▎ | 6375/8750 [2:46:11<3:46:29, 5.72s/it] {'loss': 0.4542, 'learning_rate': 3.6218819559019934e-06, 'epoch': 0.73} + 
73%|███████▎ | 6375/8750 [2:46:09<3:46:29, 5.72s/it] 73%|███████▎ | 6376/8750 [2:46:17<3:44:50, 5.68s/it] 73%|███████▎ | 6376/8750 [2:46:15<3:44:50, 5.68s/it] {'loss': 0.4638, 'learning_rate': 3.6190314087107415e-06, 'epoch': 0.73} + 73%|███████▎ | 6376/8750 [2:46:17<3:44:50, 5.68s/it] {'loss': 0.4638, 'learning_rate': 3.6190314087107415e-06, 'epoch': 0.73} + 73%|███████▎ | 6376/8750 [2:46:15<3:44:50, 5.68s/it] 73%|███████▎ | 6377/8750 [2:46:23<3:51:14, 5.85s/it] 73%|███████▎ | 6377/8750 [2:46:21<3:51:15, 5.85s/it] {'loss': 0.4357, 'learning_rate': 3.6161817358546513e-06, 'epoch': 0.73} + 73%|███████▎ | 6377/8750 [2:46:23<3:51:14, 5.85s/it] {'loss': 0.4357, 'learning_rate': 3.6161817358546513e-06, 'epoch': 0.73} + 73%|███████▎ | 6377/8750 [2:46:21<3:51:15, 5.85s/it] 73%|███████▎ | 6378/8750 [2:46:29<3:48:27, 5.78s/it] {'loss': 0.4745, 'learning_rate': 3.6133329377241866e-06, 'epoch': 0.73} + 73%|███████▎ | 6378/8750 [2:46:29<3:48:27, 5.78s/it] 73%|███████▎ | 6378/8750 [2:46:26<3:48:27, 5.78s/it] {'loss': 0.4745, 'learning_rate': 3.6133329377241866e-06, 'epoch': 0.73} + 73%|███████▎ | 6378/8750 [2:46:27<3:48:27, 5.78s/it] 73%|███████▎ | 6379/8750 [2:46:35<3:49:14, 5.80s/it] 73%|███████▎ | 6379/8750 [2:46:32<3:49:14, 5.80s/it] {'loss': 0.4524, 'learning_rate': 3.6104850147097035e-06, 'epoch': 0.73} + 73%|███████▎ | 6379/8750 [2:46:35<3:49:14, 5.80s/it] {'loss': 0.4524, 'learning_rate': 3.6104850147097035e-06, 'epoch': 0.73} + 73%|███████▎ | 6379/8750 [2:46:32<3:49:14, 5.80s/it] 73%|███████▎ | 6380/8750 [2:46:41<3:50:32, 5.84s/it] 73%|███████▎ | 6380/8750 [2:46:38<3:50:31, 5.84s/it] {'loss': 0.4682, 'learning_rate': 3.6076379672014263e-06, 'epoch': 0.73} + 73%|███████▎ | 6380/8750 [2:46:41<3:50:32, 5.84s/it] {'loss': 0.4682, 'learning_rate': 3.6076379672014263e-06, 'epoch': 0.73} + 73%|███████▎ | 6380/8750 [2:46:38<3:50:31, 5.84s/it] 73%|███████▎ | 6381/8750 [2:46:46<3:48:43, 5.79s/it] 73%|███████▎ | 6381/8750 [2:46:44<3:48:43, 5.79s/it] {'loss': 0.4454, 
'learning_rate': 3.6047917955894606e-06, 'epoch': 0.73} + 73%|███████▎ | 6381/8750 [2:46:46<3:48:43, 5.79s/it] {'loss': 0.4454, 'learning_rate': 3.6047917955894606e-06, 'epoch': 0.73} + 73%|███████▎ | 6381/8750 [2:46:44<3:48:43, 5.79s/it] 73%|███████▎ | 6382/8750 [2:46:52<3:48:05, 5.78s/it] 73%|███████▎ | 6382/8750 [2:46:50<3:48:05, 5.78s/it] {'loss': 0.4475, 'learning_rate': 3.6019465002638e-06, 'epoch': 0.73} + 73%|███████▎ | 6382/8750 [2:46:52<3:48:05, 5.78s/it] {'loss': 0.4475, 'learning_rate': 3.6019465002638e-06, 'epoch': 0.73} + 73%|███████▎ | 6382/8750 [2:46:50<3:48:05, 5.78s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") +[2025-04-10 17:00:50,696] [WARNING] [stage3.py:1850:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. 
If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time + 73%|███████▎ | 6383/8750 [2:46:58<3:56:30, 6.00s/it] 73%|███████▎ | 6383/8750 [2:46:56<3:56:30, 6.00s/it] {'loss': 0.452, 'learning_rate': 3.5991020816143164e-06, 'epoch': 0.73} + 73%|███████▎ | 6383/8750 [2:46:58<3:56:30, 6.00s/it] {'loss': 0.452, 'learning_rate': 3.5991020816143164e-06, 'epoch': 0.73} + 73%|███████▎ | 6383/8750 [2:46:56<3:56:30, 6.00s/it] 73%|███████▎ | 6384/8750 [2:47:04<3:51:11, 5.86s/it] 73%|███████▎ | 6384/8750 [2:47:02<3:51:11, 5.86s/it] {'loss': 0.4758, 'learning_rate': 3.5962585400307504e-06, 'epoch': 0.73} + 73%|███████▎ | 6384/8750 [2:47:04<3:51:11, 5.86s/it] {'loss': 0.4758, 'learning_rate': 3.5962585400307504e-06, 'epoch': 0.73} + 73%|███████▎ | 6384/8750 [2:47:02<3:51:11, 5.86s/it] 73%|███████▎ | 6385/8750 [2:47:10<3:50:19, 5.84s/it] 73%|███████▎ | 6385/8750 [2:47:08<3:50:19, 5.84s/it] {'loss': 0.4591, 'learning_rate': 3.5934158759027405e-06, 'epoch': 0.73} + 73%|███████▎ | 6385/8750 [2:47:10<3:50:19, 5.84s/it] {'loss': 0.4591, 'learning_rate': 3.5934158759027405e-06, 'epoch': 0.73} + 73%|███████▎ | 6385/8750 [2:47:08<3:50:19, 5.84s/it] 73%|███████▎ | 6386/8750 [2:47:16<3:48:34, 5.80s/it] 73%|███████▎ | 6386/8750 [2:47:13<3:48:34, 5.80s/it] {'loss': 0.4632, 'learning_rate': 3.590574089619786e-06, 'epoch': 0.73} + 73%|███████▎ | 6386/8750 [2:47:13<3:48:34, 5.80s/it] {'loss': 0.4632, 'learning_rate': 3.590574089619786e-06, 'epoch': 0.73} + 73%|███████▎ | 6386/8750 [2:47:16<3:48:34, 5.80s/it] 73%|███████▎ | 6387/8750 [2:47:21<3:47:12, 5.77s/it] 73%|███████▎ | 6387/8750 [2:47:19<3:47:12, 5.77s/it] {'loss': 0.4452, 'learning_rate': 3.587733181571282e-06, 'epoch': 0.73} + 73%|███████▎ | 6387/8750 [2:47:21<3:47:12, 5.77s/it] {'loss': 0.4452, 'learning_rate': 3.587733181571282e-06, 'epoch': 0.73} + 73%|███████▎ | 6387/8750 [2:47:19<3:47:12, 5.77s/it] 
73%|███████▎ | 6388/8750 [2:47:27<3:45:11, 5.72s/it] 73%|███████▎ | 6388/8750 [2:47:25<3:45:11, 5.72s/it] {'loss': 0.4504, 'learning_rate': 3.5848931521464947e-06, 'epoch': 0.73} + 73%|███████▎ | 6388/8750 [2:47:27<3:45:11, 5.72s/it] {'loss': 0.4504, 'learning_rate': 3.5848931521464947e-06, 'epoch': 0.73} + 73%|███████▎ | 6388/8750 [2:47:25<3:45:11, 5.72s/it] 73%|███████▎ | 6389/8750 [2:47:33<3:45:11, 5.72s/it] 73%|███████▎ | 6389/8750 [2:47:30<3:45:11, 5.72s/it] {'loss': 0.4659, 'learning_rate': 3.5820540017345663e-06, 'epoch': 0.73} + 73%|███████▎ | 6389/8750 [2:47:33<3:45:11, 5.72s/it] {'loss': 0.4659, 'learning_rate': 3.5820540017345663e-06, 'epoch': 0.73} + 73%|███████▎ | 6389/8750 [2:47:30<3:45:11, 5.72s/it] 73%|███████▎ | 6390/8750 [2:47:36<3:46:34, 5.76s/it] 73%|███████▎ | 6390/8750 [2:47:38<3:46:34, 5.76s/it] {'loss': 0.4513, 'learning_rate': 3.5792157307245313e-06, 'epoch': 0.73} + 73%|███████▎ | 6390/8750 [2:47:36<3:46:34, 5.76s/it] {'loss': 0.4513, 'learning_rate': 3.5792157307245313e-06, 'epoch': 0.73} + 73%|███████▎ | 6390/8750 [2:47:38<3:46:34, 5.76s/it] 73%|███████▎ | 6391/8750 [2:47:44<3:45:45, 5.74s/it] 73%|███████▎ | 6391/8750 [2:47:42<3:45:45, 5.74s/it] {'loss': 0.4511, 'learning_rate': 3.5763783395052887e-06, 'epoch': 0.73} + 73%|███████▎ | 6391/8750 [2:47:44<3:45:45, 5.74s/it] {'loss': 0.4511, 'learning_rate': 3.5763783395052887e-06, 'epoch': 0.73} + 73%|███████▎ | 6391/8750 [2:47:42<3:45:45, 5.74s/it] 73%|███████▎ | 6392/8750 [2:47:50<3:46:35, 5.77s/it] 73%|███████▎ | 6392/8750 [2:47:48<3:46:35, 5.77s/it] {'loss': 0.4506, 'learning_rate': 3.5735418284656287e-06, 'epoch': 0.73} + 73%|███████▎ | 6392/8750 [2:47:50<3:46:35, 5.77s/it] {'loss': 0.4506, 'learning_rate': 3.5735418284656287e-06, 'epoch': 0.73} + 73%|███████▎ | 6392/8750 [2:47:48<3:46:35, 5.77s/it] 73%|███████▎ | 6393/8750 [2:47:55<3:43:35, 5.69s/it] 73%|███████▎ | 6393/8750 [2:47:53<3:43:35, 5.69s/it] {'loss': 0.4609, 'learning_rate': 3.5707061979942205e-06, 'epoch': 0.73} + 
73%|███████▎ | 6393/8750 [2:47:55<3:43:35, 5.69s/it] {'loss': 0.4609, 'learning_rate': 3.5707061979942205e-06, 'epoch': 0.73} + 73%|███████▎ | 6393/8750 [2:47:53<3:43:35, 5.69s/it] 73%|███████▎ | 6394/8750 [2:48:01<3:43:02, 5.68s/it] 73%|███████▎ | 6394/8750 [2:47:59<3:43:02, 5.68s/it] {'loss': 0.4443, 'learning_rate': 3.5678714484796006e-06, 'epoch': 0.73} + 73%|███████▎ | 6394/8750 [2:48:01<3:43:02, 5.68s/it] {'loss': 0.4443, 'learning_rate': 3.5678714484796006e-06, 'epoch': 0.73} + 73%|███████▎ | 6394/8750 [2:47:59<3:43:02, 5.68s/it] 73%|███████▎ | 6395/8750 [2:48:07<3:42:47, 5.68s/it] 73%|███████▎ | 6395/8750 [2:48:05<3:42:47, 5.68s/it] {'loss': 0.4451, 'learning_rate': 3.565037580310201e-06, 'epoch': 0.73} + 73%|███████▎ | 6395/8750 [2:48:07<3:42:47, 5.68s/it] {'loss': 0.4451, 'learning_rate': 3.565037580310201e-06, 'epoch': 0.73} + 73%|███████▎ | 6395/8750 [2:48:05<3:42:47, 5.68s/it] 73%|███████▎ | 6396/8750 [2:48:12<3:41:54, 5.66s/it] 73%|███████▎ | 6396/8750 [2:48:10<3:41:54, 5.66s/it] {'loss': 0.4531, 'learning_rate': 3.56220459387432e-06, 'epoch': 0.73} + 73%|███████▎ | 6396/8750 [2:48:12<3:41:54, 5.66s/it] {'loss': 0.4531, 'learning_rate': 3.56220459387432e-06, 'epoch': 0.73} + 73%|███████▎ | 6396/8750 [2:48:10<3:41:54, 5.66s/it] 73%|███████▎ | 6397/8750 [2:48:18<3:43:15, 5.69s/it] 73%|███████▎ | 6397/8750 [2:48:16<3:43:15, 5.69s/it] {'loss': 0.4487, 'learning_rate': 3.559372489560139e-06, 'epoch': 0.73} + 73%|███████▎ | 6397/8750 [2:48:18<3:43:15, 5.69s/it] {'loss': 0.4487, 'learning_rate': 3.559372489560139e-06, 'epoch': 0.73} + 73%|███████▎ | 6397/8750 [2:48:16<3:43:15, 5.69s/it] 73%|███████▎ | 6398/8750 [2:48:24<3:42:42, 5.68s/it] 73%|███████▎ | 6398/8750 [2:48:22<3:42:42, 5.68s/it] {'loss': 0.4554, 'learning_rate': 3.5565412677557233e-06, 'epoch': 0.73} + 73%|███████▎ | 6398/8750 [2:48:24<3:42:42, 5.68s/it] {'loss': 0.4554, 'learning_rate': 3.5565412677557233e-06, 'epoch': 0.73} + 73%|███████▎ | 6398/8750 [2:48:22<3:42:42, 5.68s/it] 73%|███████▎ | 
6399/8750 [2:48:30<3:43:12, 5.70s/it] 73%|███████▎ | 6399/8750 [2:48:27<3:43:11, 5.70s/it] {'loss': 0.4365, 'learning_rate': 3.553710928849009e-06, 'epoch': 0.73} + 73%|███████▎ | 6399/8750 [2:48:30<3:43:12, 5.70s/it] {'loss': 0.4365, 'learning_rate': 3.553710928849009e-06, 'epoch': 0.73} + 73%|███████▎ | 6399/8750 [2:48:27<3:43:11, 5.70s/it]9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +0 73%|███████▎ | 6400/8750 [2:48:35<3:42:15, 5.67s/it]13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 73%|███████▎ | 6400/8750 [2:48:33<3:42:15, 5.67s/it]15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4724, 'learning_rate': 3.550881473227822e-06, 'epoch': 0.73} + 73%|███████▎ | 6400/8750 [2:48:35<3:42:15, 5.67s/it] {'loss': 0.4724, 'learning_rate': 3.550881473227822e-06, 'epoch': 0.73} + 73%|███████▎ | 6400/8750 [2:48:33<3:42:15, 5.67s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 73%|███████▎ | 6401/8750 [2:48:57<6:56:24, 10.64s/it] 73%|███████▎ | 6401/8750 [2:48:55<6:56:24, 10.64s/it] {'loss': 0.4517, 'learning_rate': 3.548052901279854e-06, 'epoch': 0.73} + 73%|███████▎ | 6401/8750 [2:48:57<6:56:24, 10.64s/it] {'loss': 0.4517, 'learning_rate': 3.548052901279854e-06, 'epoch': 0.73} + 73%|███████▎ | 6401/8750 [2:48:55<6:56:24, 10.64s/it] 73%|███████▎ | 6402/8750 [2:49:03<5:59:19, 9.18s/it] 73%|███████▎ | 6402/8750 [2:49:01<5:59:19, 9.18s/it] {'loss': 0.4618, 'learning_rate': 3.5452252133926856e-06, 'epoch': 0.73} + 73%|███████▎ | 6402/8750 [2:49:03<5:59:19, 9.18s/it] {'loss': 0.4618, 'learning_rate': 3.5452252133926856e-06, 'epoch': 0.73} + 73%|███████▎ | 6402/8750 [2:49:01<5:59:19, 9.18s/it] 73%|███████▎ | 6403/8750 [2:49:09<5:18:51, 8.15s/it] 73%|███████▎ | 6403/8750 [2:49:07<5:18:51, 8.15s/it] {'loss': 0.4458, 'learning_rate': 3.5423984099537754e-06, 'epoch': 0.73} + 73%|███████▎ | 6403/8750 [2:49:09<5:18:51, 8.15s/it] {'loss': 0.4458, 'learning_rate': 3.5423984099537754e-06, 'epoch': 0.73} + 73%|███████▎ | 6403/8750 [2:49:07<5:18:51, 8.15s/it] 73%|███████▎ | 6404/8750 [2:49:15<4:48:49, 7.39s/it] 73%|███████▎ | 6404/8750 [2:49:12<4:48:49, 7.39s/it] {'loss': 0.4444, 'learning_rate': 3.5395724913504546e-06, 'epoch': 0.73} + 73%|███████▎ | 6404/8750 [2:49:15<4:48:49, 7.39s/it] {'loss': 0.4444, 'learning_rate': 3.5395724913504546e-06, 'epoch': 0.73} + 73%|███████▎ | 6404/8750 [2:49:12<4:48:49, 7.39s/it] 73%|███████▎ | 6405/8750 [2:49:20<4:29:32, 6.90s/it] 73%|███████▎ | 6405/8750 [2:49:18<4:29:32, 6.90s/it] {'loss': 0.4485, 'learning_rate': 3.536747457969942e-06, 'epoch': 0.73} + 73%|███████▎ | 6405/8750 [2:49:20<4:29:32, 6.90s/it] {'loss': 0.4485, 'learning_rate': 3.536747457969942e-06, 'epoch': 0.73} + 73%|███████▎ | 6405/8750 [2:49:18<4:29:32, 6.90s/it] 73%|███████▎ | 6406/8750 [2:49:26<4:14:18, 6.51s/it] 73%|███████▎ | 6406/8750 [2:49:24<4:14:18, 6.51s/it] {'loss': 0.4616, 'learning_rate': 3.5339233101993287e-06, 
'epoch': 0.73} + 73%|███████▎ | 6406/8750 [2:49:26<4:14:18, 6.51s/it] {'loss': 0.4616, 'learning_rate': 3.5339233101993287e-06, 'epoch': 0.73} + 73%|███████▎ | 6406/8750 [2:49:24<4:14:18, 6.51s/it] 73%|███████▎ | 6407/8750 [2:49:32<4:04:44, 6.27s/it] 73%|███████▎ | 6407/8750 [2:49:29<4:04:44, 6.27s/it] {'loss': 0.4301, 'learning_rate': 3.5311000484255796e-06, 'epoch': 0.73} + 73%|███████▎ | 6407/8750 [2:49:32<4:04:44, 6.27s/it] {'loss': 0.4301, 'learning_rate': 3.5311000484255796e-06, 'epoch': 0.73} + 73%|███████▎ | 6407/8750 [2:49:29<4:04:44, 6.27s/it] 73%|███████▎ | 6408/8750 [2:49:37<3:57:03, 6.07s/it] 73%|███████▎ | 6408/8750 [2:49:35<3:57:03, 6.07s/it] {'loss': 0.4606, 'learning_rate': 3.5282776730355537e-06, 'epoch': 0.73} + 73%|███████▎ | 6408/8750 [2:49:37<3:57:03, 6.07s/it] {'loss': 0.4606, 'learning_rate': 3.5282776730355537e-06, 'epoch': 0.73} + 73%|███████▎ | 6408/8750 [2:49:35<3:57:03, 6.07s/it] 73%|███████▎ | 6409/8750 [2:49:43<3:53:53, 5.99s/it] 73%|███████▎ | 6409/8750 [2:49:41<3:53:52, 5.99s/it] {'loss': 0.4518, 'learning_rate': 3.5254561844159718e-06, 'epoch': 0.73} + 73%|███████▎ | 6409/8750 [2:49:43<3:53:53, 5.99s/it] {'loss': 0.4518, 'learning_rate': 3.5254561844159718e-06, 'epoch': 0.73} + 73%|███████▎ | 6409/8750 [2:49:41<3:53:52, 5.99s/it] 73%|███████▎ | 6410/8750 [2:49:49<3:52:18, 5.96s/it] 73%|███████▎ | 6410/8750 [2:49:47<3:52:18, 5.96s/it] {'loss': 0.4358, 'learning_rate': 3.5226355829534475e-06, 'epoch': 0.73} + 73%|███████▎ | 6410/8750 [2:49:49<3:52:18, 5.96s/it] {'loss': 0.4358, 'learning_rate': 3.5226355829534475e-06, 'epoch': 0.73} + 73%|███████▎ | 6410/8750 [2:49:47<3:52:18, 5.96s/it] 73%|███████▎ | 6411/8750 [2:49:55<3:48:58, 5.87s/it] 73%|███████▎ | 6411/8750 [2:49:52<3:48:58, 5.87s/it] {'loss': 0.4838, 'learning_rate': 3.51981586903446e-06, 'epoch': 0.73} + 73%|███████▎ | 6411/8750 [2:49:55<3:48:58, 5.87s/it] {'loss': 0.4838, 'learning_rate': 3.51981586903446e-06, 'epoch': 0.73} + 73%|███████▎ | 6411/8750 [2:49:52<3:48:58, 
5.87s/it] 73%|███████▎ | 6412/8750 [2:50:00<3:46:56, 5.82s/it] 73%|███████▎ | 6412/8750 [2:49:58<3:46:56, 5.82s/it] {'loss': 0.4628, 'learning_rate': 3.516997043045376e-06, 'epoch': 0.73} + 73%|███████▎ | 6412/8750 [2:50:00<3:46:56, 5.82s/it] {'loss': 0.4628, 'learning_rate': 3.516997043045376e-06, 'epoch': 0.73} + 73%|███████▎ | 6412/8750 [2:49:58<3:46:56, 5.82s/it] 73%|███████▎ | 6413/8750 [2:50:06<3:47:24, 5.84s/it] 73%|███████▎ | 6413/8750 [2:50:04<3:47:24, 5.84s/it]{'loss': 0.4449, 'learning_rate': 3.5141791053724405e-06, 'epoch': 0.73} + 73%|███████▎ | 6413/8750 [2:50:06<3:47:24, 5.84s/it] {'loss': 0.4449, 'learning_rate': 3.5141791053724405e-06, 'epoch': 0.73} + 73%|███████▎ | 6413/8750 [2:50:04<3:47:24, 5.84s/it] 73%|███████▎ | 6414/8750 [2:50:12<3:48:29, 5.87s/it] 73%|███████▎ | 6414/8750 [2:50:10<3:48:29, 5.87s/it] {'loss': 0.4448, 'learning_rate': 3.5113620564017727e-06, 'epoch': 0.73} + 73%|███████▎ | 6414/8750 [2:50:12<3:48:29, 5.87s/it] {'loss': 0.4448, 'learning_rate': 3.5113620564017727e-06, 'epoch': 0.73} + 73%|███████▎ | 6414/8750 [2:50:10<3:48:29, 5.87s/it] 73%|███████▎ | 6415/8750 [2:50:16<3:47:15, 5.84s/it] 73%|███████▎ | 6415/8750 [2:50:18<3:47:15, 5.84s/it] {'loss': 0.4661, 'learning_rate': 3.5085458965193654e-06, 'epoch': 0.73} + 73%|███████▎ | 6415/8750 [2:50:18<3:47:15, 5.84s/it] {'loss': 0.4661, 'learning_rate': 3.5085458965193654e-06, 'epoch': 0.73} + 73%|███████▎ | 6415/8750 [2:50:16<3:47:15, 5.84s/it] 73%|███████▎ | 6416/8750 [2:50:21<3:46:15, 5.82s/it] 73%|███████▎ | 6416/8750 [2:50:24<3:46:15, 5.82s/it] {'loss': 0.4502, 'learning_rate': 3.5057306261111024e-06, 'epoch': 0.73} + 73%|███████▎ | 6416/8750 [2:50:24<3:46:15, 5.82s/it] {'loss': 0.4502, 'learning_rate': 3.5057306261111024e-06, 'epoch': 0.73} + 73%|███████▎ | 6416/8750 [2:50:21<3:46:15, 5.82s/it] 73%|███████▎ | 6417/8750 [2:50:29<3:44:58, 5.79s/it] 73%|███████▎ | 6417/8750 [2:50:27<3:44:58, 5.79s/it] {'loss': 0.4547, 'learning_rate': 3.502916245562733e-06, 'epoch': 0.73} + 
73%|███████▎ | 6417/8750 [2:50:29<3:44:58, 5.79s/it] {'loss': 0.4547, 'learning_rate': 3.502916245562733e-06, 'epoch': 0.73} + 73%|███████▎ | 6417/8750 [2:50:27<3:44:58, 5.79s/it] 73%|███████▎ | 6418/8750 [2:50:35<3:44:30, 5.78s/it] 73%|███████▎ | 6418/8750 [2:50:33<3:44:30, 5.78s/it]{'loss': 0.4459, 'learning_rate': 3.5001027552598952e-06, 'epoch': 0.73} + {'loss': 0.4459, 'learning_rate': 3.5001027552598952e-06, 'epoch': 0.73} + 73%|███████▎ | 6418/8750 [2:50:35<3:44:30, 5.78s/it] 73%|███████▎ | 6418/8750 [2:50:33<3:44:30, 5.78s/it] 73%|███████▎ | 6419/8750 [2:50:41<3:43:18, 5.75s/it] 73%|███████▎ | 6419/8750 [2:50:38<3:43:18, 5.75s/it] {'loss': 0.4551, 'learning_rate': 3.4972901555880957e-06, 'epoch': 0.73} + 73%|███████▎ | 6419/8750 [2:50:41<3:43:18, 5.75s/it] {'loss': 0.4551, 'learning_rate': 3.4972901555880957e-06, 'epoch': 0.73} + 73%|███████▎ | 6419/8750 [2:50:38<3:43:18, 5.75s/it] 73%|███████▎ | 6420/8750 [2:50:44<3:43:03, 5.74s/it] 73%|███████▎ | 6420/8750 [2:50:46<3:43:03, 5.74s/it] {'loss': 0.4592, 'learning_rate': 3.4944784469327253e-06, 'epoch': 0.73} + 73%|███████▎ | 6420/8750 [2:50:46<3:43:03, 5.74s/it] {'loss': 0.4592, 'learning_rate': 3.4944784469327253e-06, 'epoch': 0.73} + 73%|███████▎ | 6420/8750 [2:50:44<3:43:03, 5.74s/it] 73%|███████▎ | 6421/8750 [2:50:52<3:42:25, 5.73s/it] 73%|███████▎ | 6421/8750 [2:50:50<3:42:25, 5.73s/it] {'loss': 0.4369, 'learning_rate': 3.491667629679054e-06, 'epoch': 0.73} + 73%|███████▎ | 6421/8750 [2:50:50<3:42:25, 5.73s/it]{'loss': 0.4369, 'learning_rate': 3.491667629679054e-06, 'epoch': 0.73} + 73%|███████▎ | 6421/8750 [2:50:52<3:42:25, 5.73s/it] 73%|███████▎ | 6422/8750 [2:50:58<3:41:26, 5.71s/it] 73%|███████▎ | 6422/8750 [2:50:56<3:41:27, 5.71s/it] {'loss': 0.4437, 'learning_rate': 3.488857704212224e-06, 'epoch': 0.73} + 73%|███████▎ | 6422/8750 [2:50:58<3:41:26, 5.71s/it] {'loss': 0.4437, 'learning_rate': 3.488857704212224e-06, 'epoch': 0.73} + 73%|███████▎ | 6422/8750 [2:50:56<3:41:27, 5.71s/it] 73%|███████▎ | 
6423/8750 [2:51:04<3:40:58, 5.70s/it] 73%|███████▎ | 6423/8750 [2:51:01<3:40:58, 5.70s/it] {'loss': 0.4401, 'learning_rate': 3.4860486709172536e-06, 'epoch': 0.73} + 73%|███████▎ | 6423/8750 [2:51:04<3:40:58, 5.70s/it] {'loss': 0.4401, 'learning_rate': 3.4860486709172536e-06, 'epoch': 0.73} + 73%|███████▎ | 6423/8750 [2:51:01<3:40:58, 5.70s/it] 73%|███████▎ | 6424/8750 [2:51:09<3:39:25, 5.66s/it] 73%|███████▎ | 6424/8750 [2:51:07<3:39:25, 5.66s/it] {'loss': 0.4533, 'learning_rate': 3.4832405301790507e-06, 'epoch': 0.73} + 73%|███████▎ | 6424/8750 [2:51:09<3:39:25, 5.66s/it] {'loss': 0.4533, 'learning_rate': 3.4832405301790507e-06, 'epoch': 0.73} + 73%|███████▎ | 6424/8750 [2:51:07<3:39:25, 5.66s/it] 73%|███████▎ | 6425/8750 [2:51:15<3:40:19, 5.69s/it] 73%|███████▎ | 6425/8750 [2:51:13<3:40:19, 5.69s/it] {'loss': 0.4487, 'learning_rate': 3.4804332823823862e-06, 'epoch': 0.73} + 73%|███████▎ | 6425/8750 [2:51:15<3:40:19, 5.69s/it] {'loss': 0.4487, 'learning_rate': 3.4804332823823862e-06, 'epoch': 0.73} + 73%|███████▎ | 6425/8750 [2:51:13<3:40:19, 5.69s/it] 73%|███████▎ | 6426/8750 [2:51:18<3:39:27, 5.67s/it] 73%|███████▎ | 6426/8750 [2:51:20<3:39:27, 5.67s/it] {'loss': 0.4677, 'learning_rate': 3.477626927911921e-06, 'epoch': 0.73} + {'loss': 0.4677, 'learning_rate': 3.477626927911921e-06, 'epoch': 0.73} + 73%|███████▎ | 6426/8750 [2:51:20<3:39:27, 5.67s/it] 73%|███████▎ | 6426/8750 [2:51:18<3:39:27, 5.67s/it] 73%|███████▎ | 6427/8750 [2:51:26<3:40:26, 5.69s/it] 73%|███████▎ | 6427/8750 [2:51:24<3:40:26, 5.69s/it] {'loss': 0.4409, 'learning_rate': 3.4748214671521875e-06, 'epoch': 0.73} + 73%|███████▎ | 6427/8750 [2:51:26<3:40:26, 5.69s/it] {'loss': 0.4409, 'learning_rate': 3.4748214671521875e-06, 'epoch': 0.73} + 73%|███████▎ | 6427/8750 [2:51:24<3:40:26, 5.69s/it] 73%|███████▎ | 6428/8750 [2:51:30<3:41:04, 5.71s/it] 73%|███████▎ | 6428/8750 [2:51:32<3:41:04, 5.71s/it] {'loss': 0.4481, 'learning_rate': 3.4720169004875914e-06, 'epoch': 0.73} + 73%|███████▎ | 6428/8750 
[2:51:32<3:41:04, 5.71s/it] {'loss': 0.4481, 'learning_rate': 3.4720169004875914e-06, 'epoch': 0.73} + 73%|███████▎ | 6428/8750 [2:51:30<3:41:04, 5.71s/it] 73%|███████▎ | 6429/8750 [2:51:35<3:41:46, 5.73s/it] 73%|███████▎ | 6429/8750 [2:51:38<3:41:46, 5.73s/it] {'loss': 0.4797, 'learning_rate': 3.469213228302425e-06, 'epoch': 0.73} + 73%|███████▎ | 6429/8750 [2:51:38<3:41:46, 5.73s/it] {'loss': 0.4797, 'learning_rate': 3.469213228302425e-06, 'epoch': 0.73} + 73%|███████▎ | 6429/8750 [2:51:35<3:41:46, 5.73s/it] 73%|███████▎ | 6430/8750 [2:51:41<3:42:11, 5.75s/it] 73%|███████▎ | 6430/8750 [2:51:44<3:42:11, 5.75s/it] {'loss': 0.4376, 'learning_rate': 3.466410450980854e-06, 'epoch': 0.73} + 73%|███████▎ | 6430/8750 [2:51:44<3:42:11, 5.75s/it] {'loss': 0.4376, 'learning_rate': 3.466410450980854e-06, 'epoch': 0.73} + 73%|███████▎ | 6430/8750 [2:51:41<3:42:11, 5.75s/it] 73%|███████▎ | 6431/8750 [2:51:49<3:41:17, 5.73s/it] {'loss': 0.4519, 'learning_rate': 3.4636085689069244e-06, 'epoch': 0.73} + 73%|███████▎ | 6431/8750 [2:51:49<3:41:17, 5.73s/it] 73%|███████▎ | 6431/8750 [2:51:47<3:41:17, 5.73s/it] {'loss': 0.4519, 'learning_rate': 3.4636085689069244e-06, 'epoch': 0.73} + 73%|███████▎ | 6431/8750 [2:51:47<3:41:17, 5.73s/it] 74%|███████▎ | 6432/8750 [2:51:55<3:44:04, 5.80s/it] 74%|███████▎ | 6432/8750 [2:51:53<3:44:04, 5.80s/it] {'loss': 0.4359, 'learning_rate': 3.4608075824645524e-06, 'epoch': 0.74} + 74%|███████▎ | 6432/8750 [2:51:55<3:44:04, 5.80s/it] {'loss': 0.4359, 'learning_rate': 3.4608075824645524e-06, 'epoch': 0.74} + 74%|███████▎ | 6432/8750 [2:51:53<3:44:04, 5.80s/it] 74%|███████▎ | 6433/8750 [2:52:01<3:42:22, 5.76s/it] 74%|███████▎ | 6433/8750 [2:51:59<3:42:22, 5.76s/it] {'loss': 0.4585, 'learning_rate': 3.4580074920375352e-06, 'epoch': 0.74} + 74%|███████▎ | 6433/8750 [2:52:01<3:42:22, 5.76s/it] {'loss': 0.4585, 'learning_rate': 3.4580074920375352e-06, 'epoch': 0.74} + 74%|███████▎ | 6433/8750 [2:51:59<3:42:22, 5.76s/it] 74%|███████▎ | 6434/8750 
[2:52:06<3:40:16, 5.71s/it] 74%|███████▎ | 6434/8750 [2:52:04<3:40:16, 5.71s/it] {'loss': 0.4479, 'learning_rate': 3.4552082980095514e-06, 'epoch': 0.74} + 74%|███████▎ | 6434/8750 [2:52:06<3:40:16, 5.71s/it] {'loss': 0.4479, 'learning_rate': 3.4552082980095514e-06, 'epoch': 0.74} + 74%|███████▎ | 6434/8750 [2:52:04<3:40:16, 5.71s/it] 74%|███████▎ | 6435/8750 [2:52:12<3:39:17, 5.68s/it] 74%|███████▎ | 6435/8750 [2:52:10<3:39:17, 5.68s/it] {'loss': 0.4467, 'learning_rate': 3.4524100007641517e-06, 'epoch': 0.74} + {'loss': 0.4467, 'learning_rate': 3.4524100007641517e-06, 'epoch': 0.74} 74%|███████▎ | 6435/8750 [2:52:12<3:39:17, 5.68s/it] + 74%|███████▎ | 6435/8750 [2:52:10<3:39:17, 5.68s/it] 74%|███████▎ | 6436/8750 [2:52:18<3:39:05, 5.68s/it] 74%|███████▎ | 6436/8750 [2:52:15<3:39:05, 5.68s/it] {'loss': 0.4351, 'learning_rate': 3.4496126006847596e-06, 'epoch': 0.74} + 74%|███████▎ | 6436/8750 [2:52:18<3:39:05, 5.68s/it] {'loss': 0.4351, 'learning_rate': 3.4496126006847596e-06, 'epoch': 0.74} + 74%|███████▎ | 6436/8750 [2:52:15<3:39:05, 5.68s/it] 74%|███████▎ | 6437/8750 [2:52:23<3:38:26, 5.67s/it] 74%|███████▎ | 6437/8750 [2:52:21<3:38:26, 5.67s/it] {'loss': 0.4652, 'learning_rate': 3.446816098154692e-06, 'epoch': 0.74} + 74%|███████▎ | 6437/8750 [2:52:23<3:38:26, 5.67s/it] {'loss': 0.4652, 'learning_rate': 3.446816098154692e-06, 'epoch': 0.74} + 74%|███████▎ | 6437/8750 [2:52:21<3:38:26, 5.67s/it] 74%|███████▎ | 6438/8750 [2:52:29<3:41:16, 5.74s/it] 74%|███████▎ | 6438/8750 [2:52:27<3:41:16, 5.74s/it] {'loss': 0.443, 'learning_rate': 3.4440204935571208e-06, 'epoch': 0.74} + 74%|███████▎ | 6438/8750 [2:52:27<3:41:16, 5.74s/it]{'loss': 0.443, 'learning_rate': 3.4440204935571208e-06, 'epoch': 0.74} + 74%|███████▎ | 6438/8750 [2:52:29<3:41:16, 5.74s/it] 74%|███████▎ | 6439/8750 [2:52:35<3:39:44, 5.71s/it] 74%|███████▎ | 6439/8750 [2:52:33<3:39:44, 5.71s/it] {'loss': 0.4518, 'learning_rate': 3.441225787275113e-06, 'epoch': 0.74} + 74%|███████▎ | 6439/8750 
[2:52:35<3:39:44, 5.71s/it] {'loss': 0.4518, 'learning_rate': 3.441225787275113e-06, 'epoch': 0.74} + 74%|███████▎ | 6439/8750 [2:52:33<3:39:44, 5.71s/it] 74%|███████▎ | 6440/8750 [2:52:41<3:39:42, 5.71s/it] 74%|███████▎ | 6440/8750 [2:52:38<3:39:42, 5.71s/it] {'loss': 0.4392, 'learning_rate': 3.4384319796916075e-06, 'epoch': 0.74} + 74%|███████▎ | 6440/8750 [2:52:38<3:39:42, 5.71s/it] {'loss': 0.4392, 'learning_rate': 3.4384319796916075e-06, 'epoch': 0.74} + 74%|███████▎ | 6440/8750 [2:52:41<3:39:42, 5.71s/it] 74%|███████▎ | 6441/8750 [2:52:46<3:41:36, 5.76s/it] 74%|███████▎ | 6441/8750 [2:52:44<3:41:36, 5.76s/it] {'loss': 0.4458, 'learning_rate': 3.435639071189413e-06, 'epoch': 0.74} + 74%|███████▎ | 6441/8750 [2:52:46<3:41:36, 5.76s/it] {'loss': 0.4458, 'learning_rate': 3.435639071189413e-06, 'epoch': 0.74} + 74%|███████▎ | 6441/8750 [2:52:44<3:41:36, 5.76s/it] 74%|███████▎ | 6442/8750 [2:52:52<3:41:02, 5.75s/it] 74%|███████▎ | 6442/8750 [2:52:50<3:41:02, 5.75s/it] {'loss': 0.462, 'learning_rate': 3.4328470621512257e-06, 'epoch': 0.74} + 74%|███████▎ | 6442/8750 [2:52:52<3:41:02, 5.75s/it] {'loss': 0.462, 'learning_rate': 3.4328470621512257e-06, 'epoch': 0.74} + 74%|███████▎ | 6442/8750 [2:52:50<3:41:02, 5.75s/it] 74%|███████▎ | 6443/8750 [2:52:58<3:39:50, 5.72s/it] 74%|███████▎ | 6443/8750 [2:52:56<3:39:50, 5.72s/it] {'loss': 0.4441, 'learning_rate': 3.430055952959607e-06, 'epoch': 0.74} + 74%|███████▎ | 6443/8750 [2:52:58<3:39:50, 5.72s/it] {'loss': 0.4441, 'learning_rate': 3.430055952959607e-06, 'epoch': 0.74} + 74%|███████▎ | 6443/8750 [2:52:56<3:39:50, 5.72s/it] 74%|███████▎ | 6444/8750 [2:53:03<3:38:43, 5.69s/it] 74%|███████▎ | 6444/8750 [2:53:01<3:38:43, 5.69s/it] {'loss': 0.4649, 'learning_rate': 3.427265743997007e-06, 'epoch': 0.74} + 74%|███████▎ | 6444/8750 [2:53:03<3:38:43, 5.69s/it] {'loss': 0.4649, 'learning_rate': 3.427265743997007e-06, 'epoch': 0.74} + 74%|███████▎ | 6444/8750 [2:53:01<3:38:43, 5.69s/it] 74%|███████▎ | 6445/8750 [2:53:09<3:38:35, 
5.69s/it] 74%|███████▎ | 6445/8750 [2:53:07<3:38:35, 5.69s/it] {'loss': 0.4703, 'learning_rate': 3.4244764356457438e-06, 'epoch': 0.74} + 74%|███████▎ | 6445/8750 [2:53:09<3:38:35, 5.69s/it] {'loss': 0.4703, 'learning_rate': 3.4244764356457438e-06, 'epoch': 0.74} + 74%|███████▎ | 6445/8750 [2:53:07<3:38:35, 5.69s/it] 74%|███████▎ | 6446/8750 [2:53:15<3:40:02, 5.73s/it] 74%|███████▎ | 6446/8750 [2:53:13<3:40:02, 5.73s/it] {'loss': 0.4514, 'learning_rate': 3.4216880282880128e-06, 'epoch': 0.74} + 74%|███████▎ | 6446/8750 [2:53:15<3:40:02, 5.73s/it] {'loss': 0.4514, 'learning_rate': 3.4216880282880128e-06, 'epoch': 0.74} + 74%|███████▎ | 6446/8750 [2:53:13<3:40:02, 5.73s/it] 74%|███████▎ | 6447/8750 [2:53:21<3:39:22, 5.72s/it] 74%|███████▎ | 6447/8750 [2:53:18<3:39:22, 5.72s/it] {'loss': 0.4628, 'learning_rate': 3.4189005223058937e-06, 'epoch': 0.74} + 74%|███████▎ | 6447/8750 [2:53:21<3:39:22, 5.72s/it] {'loss': 0.4628, 'learning_rate': 3.4189005223058937e-06, 'epoch': 0.74} + 74%|███████▎ | 6447/8750 [2:53:18<3:39:22, 5.72s/it] 74%|███████▎ | 6448/8750 [2:53:26<3:39:07, 5.71s/it] 74%|███████▎ | 6448/8750 [2:53:24<3:39:07, 5.71s/it] {'loss': 0.4485, 'learning_rate': 3.416113918081331e-06, 'epoch': 0.74} + 74%|███████▎ | 6448/8750 [2:53:24<3:39:07, 5.71s/it] {'loss': 0.4485, 'learning_rate': 3.416113918081331e-06, 'epoch': 0.74} + 74%|███████▎ | 6448/8750 [2:53:26<3:39:07, 5.71s/it] 74%|███████▎ | 6449/8750 [2:53:32<3:39:02, 5.71s/it] 74%|███████▎ | 6449/8750 [2:53:30<3:39:02, 5.71s/it] {'loss': 0.4489, 'learning_rate': 3.4133282159961535e-06, 'epoch': 0.74} + 74%|███████▎ | 6449/8750 [2:53:32<3:39:02, 5.71s/it] {'loss': 0.4489, 'learning_rate': 3.4133282159961535e-06, 'epoch': 0.74} + 74%|███████▎ | 6449/8750 [2:53:30<3:39:02, 5.71s/it]9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 74%|███████▎ | 6450/8750 [2:53:38<3:40:00, 5.74s/it]013 AutoResumeHook: Checking whether to suspend... 
+1512 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 74%|███████▎ | 6450/8750 [2:53:36<3:40:00, 5.74s/it]10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4353, 'learning_rate': 3.4105434164320695e-06, 'epoch': 0.74} + 74%|███████▎ | 6450/8750 [2:53:38<3:40:00, 5.74s/it] {'loss': 0.4353, 'learning_rate': 3.4105434164320695e-06, 'epoch': 0.74} + 74%|███████▎ | 6450/8750 [2:53:36<3:40:00, 5.74s/it] 74%|███████▎ | 6451/8750 [2:53:44<3:39:57, 5.74s/it] 74%|███████▎ | 6451/8750 [2:53:41<3:39:57, 5.74s/it] {'loss': 0.4739, 'learning_rate': 3.4077595197706505e-06, 'epoch': 0.74} + 74%|███████▎ | 6451/8750 [2:53:44<3:39:57, 5.74s/it] {'loss': 0.4739, 'learning_rate': 3.4077595197706505e-06, 'epoch': 0.74} + 74%|███████▎ | 6451/8750 [2:53:41<3:39:57, 5.74s/it] 74%|███████▎ | 6452/8750 [2:53:49<3:38:31, 5.71s/it] 74%|███████▎ | 6452/8750 [2:53:47<3:38:31, 5.71s/it] {'loss': 0.4615, 'learning_rate': 3.404976526393361e-06, 'epoch': 0.74} + 74%|███████▎ | 6452/8750 [2:53:49<3:38:31, 5.71s/it] {'loss': 0.4615, 'learning_rate': 3.404976526393361e-06, 'epoch': 0.74} + 74%|███████▎ | 6452/8750 [2:53:47<3:38:31, 5.71s/it] 74%|███████▎ | 6453/8750 [2:53:55<3:40:00, 5.75s/it] 74%|███████▎ | 6453/8750 [2:53:53<3:40:00, 5.75s/it] {'loss': 0.4412, 'learning_rate': 3.4021944366815286e-06, 'epoch': 0.74} + 74%|███████▎ | 6453/8750 [2:53:55<3:40:00, 5.75s/it] {'loss': 0.4412, 'learning_rate': 3.4021944366815286e-06, 'epoch': 0.74} + 
74%|███████▎ | 6453/8750 [2:53:53<3:40:00, 5.75s/it] 74%|███████▍ | 6454/8750 [2:54:01<3:39:01, 5.72s/it] 74%|███████▍ | 6454/8750 [2:53:59<3:39:01, 5.72s/it] {'loss': 0.4565, 'learning_rate': 3.399413251016359e-06, 'epoch': 0.74} + 74%|███████▍ | 6454/8750 [2:54:01<3:39:01, 5.72s/it] {'loss': 0.4565, 'learning_rate': 3.399413251016359e-06, 'epoch': 0.74} + 74%|███████▍ | 6454/8750 [2:53:59<3:39:01, 5.72s/it] 74%|███████▍ | 6455/8750 [2:54:07<3:39:17, 5.73s/it] 74%|███████▍ | 6455/8750 [2:54:04<3:39:17, 5.73s/it] {'loss': 0.4467, 'learning_rate': 3.3966329697789424e-06, 'epoch': 0.74} + 74%|███████▍ | 6455/8750 [2:54:07<3:39:17, 5.73s/it] {'loss': 0.4467, 'learning_rate': 3.3966329697789424e-06, 'epoch': 0.74} + 74%|███████▍ | 6455/8750 [2:54:04<3:39:17, 5.73s/it] 74%|███████▍ | 6456/8750 [2:54:12<3:40:33, 5.77s/it] 74%|███████▍ | 6456/8750 [2:54:10<3:40:32, 5.77s/it] {'loss': 0.4398, 'learning_rate': 3.393853593350235e-06, 'epoch': 0.74} + 74%|███████▍ | 6456/8750 [2:54:12<3:40:33, 5.77s/it] {'loss': 0.4398, 'learning_rate': 3.393853593350235e-06, 'epoch': 0.74} + 74%|███████▍ | 6456/8750 [2:54:10<3:40:32, 5.77s/it] 74%|███████▍ | 6457/8750 [2:54:19<3:45:04, 5.89s/it] 74%|███████▍ | 6457/8750 [2:54:16<3:45:04, 5.89s/it] {'loss': 0.4441, 'learning_rate': 3.391075122111077e-06, 'epoch': 0.74} + 74%|███████▍ | 6457/8750 [2:54:19<3:45:04, 5.89s/it] {'loss': 0.4441, 'learning_rate': 3.391075122111077e-06, 'epoch': 0.74} + 74%|███████▍ | 6457/8750 [2:54:16<3:45:04, 5.89s/it] 74%|███████▍ | 6458/8750 [2:54:24<3:42:27, 5.82s/it] 74%|███████▍ | 6458/8750 [2:54:22<3:42:26, 5.82s/it] {'loss': 0.4504, 'learning_rate': 3.3882975564421773e-06, 'epoch': 0.74} + 74%|███████▍ | 6458/8750 [2:54:24<3:42:27, 5.82s/it] {'loss': 0.4504, 'learning_rate': 3.3882975564421773e-06, 'epoch': 0.74} + 74%|███████▍ | 6458/8750 [2:54:22<3:42:26, 5.82s/it] 74%|███████▍ | 6459/8750 [2:54:30<3:42:04, 5.82s/it] 74%|███████▍ | 6459/8750 [2:54:28<3:42:04, 5.82s/it] {'loss': 0.46, 'learning_rate': 
3.3855208967241247e-06, 'epoch': 0.74} + 74%|███████▍ | 6459/8750 [2:54:28<3:42:04, 5.82s/it]{'loss': 0.46, 'learning_rate': 3.3855208967241247e-06, 'epoch': 0.74} + 74%|███████▍ | 6459/8750 [2:54:30<3:42:04, 5.82s/it] 74%|███████▍ | 6460/8750 [2:54:36<3:39:35, 5.75s/it] 74%|███████▍ | 6460/8750 [2:54:33<3:39:35, 5.75s/it] {'loss': 0.4562, 'learning_rate': 3.3827451433373904e-06, 'epoch': 0.74} + 74%|███████▍ | 6460/8750 [2:54:36<3:39:35, 5.75s/it] {'loss': 0.4562, 'learning_rate': 3.3827451433373904e-06, 'epoch': 0.74} + 74%|███████▍ | 6460/8750 [2:54:33<3:39:35, 5.75s/it] 74%|███████▍ | 6461/8750 [2:54:41<3:37:11, 5.69s/it] 74%|███████▍ | 6461/8750 [2:54:39<3:37:11, 5.69s/it] {'loss': 0.468, 'learning_rate': 3.379970296662305e-06, 'epoch': 0.74} + 74%|███████▍ | 6461/8750 [2:54:41<3:37:11, 5.69s/it] {'loss': 0.468, 'learning_rate': 3.379970296662305e-06, 'epoch': 0.74} + 74%|███████▍ | 6461/8750 [2:54:39<3:37:11, 5.69s/it] 74%|███████▍ | 6462/8750 [2:54:47<3:39:42, 5.76s/it] 74%|███████▍ | 6462/8750 [2:54:45<3:39:41, 5.76s/it] {'loss': 0.4444, 'learning_rate': 3.3771963570790924e-06, 'epoch': 0.74} + 74%|███████▍ | 6462/8750 [2:54:45<3:39:41, 5.76s/it]{'loss': 0.4444, 'learning_rate': 3.3771963570790924e-06, 'epoch': 0.74} + 74%|███████▍ | 6462/8750 [2:54:47<3:39:42, 5.76s/it] 74%|███████▍ | 6463/8750 [2:54:53<3:36:37, 5.68s/it] 74%|███████▍ | 6463/8750 [2:54:50<3:36:37, 5.68s/it] {'loss': 0.4594, 'learning_rate': 3.3744233249678403e-06, 'epoch': 0.74} + 74%|███████▍ | 6463/8750 [2:54:53<3:36:37, 5.68s/it] {'loss': 0.4594, 'learning_rate': 3.3744233249678403e-06, 'epoch': 0.74} + 74%|███████▍ | 6463/8750 [2:54:50<3:36:37, 5.68s/it] 74%|███████▍ | 6464/8750 [2:54:58<3:36:19, 5.68s/it] 74%|███████▍ | 6464/8750 [2:54:56<3:36:19, 5.68s/it] {'loss': 0.4506, 'learning_rate': 3.3716512007085133e-06, 'epoch': 0.74} + 74%|███████▍ | 6464/8750 [2:54:58<3:36:19, 5.68s/it] {'loss': 0.4506, 'learning_rate': 3.3716512007085133e-06, 'epoch': 0.74} + 74%|███████▍ | 6464/8750 
[2:54:56<3:36:19, 5.68s/it] 74%|███████▍ | 6465/8750 [2:55:04<3:40:58, 5.80s/it] 74%|███████▍ | 6465/8750 [2:55:02<3:40:58, 5.80s/it] {'loss': 0.4545, 'learning_rate': 3.368879984680962e-06, 'epoch': 0.74} + 74%|███████▍ | 6465/8750 [2:55:04<3:40:58, 5.80s/it] {'loss': 0.4545, 'learning_rate': 3.368879984680962e-06, 'epoch': 0.74} + 74%|███████▍ | 6465/8750 [2:55:02<3:40:58, 5.80s/it] 74%|███████▍ | 6466/8750 [2:55:10<3:39:45, 5.77s/it] 74%|███████▍ | 6466/8750 [2:55:08<3:39:45, 5.77s/it] {'loss': 0.4571, 'learning_rate': 3.366109677264895e-06, 'epoch': 0.74} + 74%|███████▍ | 6466/8750 [2:55:10<3:39:45, 5.77s/it] {'loss': 0.4571, 'learning_rate': 3.366109677264895e-06, 'epoch': 0.74} + 74%|███████▍ | 6466/8750 [2:55:08<3:39:45, 5.77s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (5014 > 4096). Running this sequence through the model will result in indexing errors + 74%|███████▍ | 6467/8750 [2:55:16<3:37:22, 5.71s/it] 74%|███████▍ | 6467/8750 [2:55:13<3:37:23, 5.71s/it] {'loss': 0.4337, 'learning_rate': 3.363340278839916e-06, 'epoch': 0.74} + 74%|███████▍ | 6467/8750 [2:55:16<3:37:22, 5.71s/it] {'loss': 0.4337, 'learning_rate': 3.363340278839916e-06, 'epoch': 0.74} + 74%|███████▍ | 6467/8750 [2:55:13<3:37:23, 5.71s/it] 74%|███████▍ | 6468/8750 [2:55:22<3:40:37, 5.80s/it] 74%|███████▍ | 6468/8750 [2:55:19<3:40:37, 5.80s/it] {'loss': 0.4496, 'learning_rate': 3.3605717897854872e-06, 'epoch': 0.74} + 74%|███████▍ | 6468/8750 [2:55:22<3:40:37, 5.80s/it] {'loss': 0.4496, 'learning_rate': 3.3605717897854872e-06, 'epoch': 0.74} + 74%|███████▍ | 6468/8750 [2:55:19<3:40:37, 5.80s/it] 74%|███████▍ | 6469/8750 [2:55:27<3:40:41, 5.81s/it] 74%|███████▍ | 6469/8750 [2:55:25<3:40:41, 5.81s/it] {'loss': 0.4677, 'learning_rate': 3.357804210480955e-06, 'epoch': 0.74} + 74%|███████▍ | 6469/8750 [2:55:25<3:40:41, 5.81s/it] {'loss': 0.4677, 'learning_rate': 3.357804210480955e-06, 'epoch': 0.74} + 74%|███████▍ | 6469/8750 
[2:55:27<3:40:41, 5.81s/it] 74%|███████▍ | 6470/8750 [2:55:33<3:37:59, 5.74s/it] 74%|███████▍ | 6470/8750 [2:55:31<3:37:59, 5.74s/it] {'loss': 0.441, 'learning_rate': 3.355037541305545e-06, 'epoch': 0.74} + 74%|███████▍ | 6470/8750 [2:55:31<3:37:59, 5.74s/it]{'loss': 0.441, 'learning_rate': 3.355037541305545e-06, 'epoch': 0.74} + 74%|███████▍ | 6470/8750 [2:55:33<3:37:59, 5.74s/it] 74%|███████▍ | 6471/8750 [2:55:39<3:38:06, 5.74s/it] 74%|███████▍ | 6471/8750 [2:55:37<3:38:06, 5.74s/it] {'loss': 0.4474, 'learning_rate': 3.3522717826383477e-06, 'epoch': 0.74} + 74%|███████▍ | 6471/8750 [2:55:39<3:38:06, 5.74s/it] {'loss': 0.4474, 'learning_rate': 3.3522717826383477e-06, 'epoch': 0.74} + 74%|███████▍ | 6471/8750 [2:55:37<3:38:06, 5.74s/it] 74%|███████▍ | 6472/8750 [2:55:44<3:36:18, 5.70s/it] 74%|███████▍ | 6472/8750 [2:55:42<3:36:18, 5.70s/it] {'loss': 0.4609, 'learning_rate': 3.349506934858331e-06, 'epoch': 0.74} + 74%|███████▍ | 6472/8750 [2:55:44<3:36:18, 5.70s/it] {'loss': 0.4609, 'learning_rate': 3.349506934858331e-06, 'epoch': 0.74} + 74%|███████▍ | 6472/8750 [2:55:42<3:36:18, 5.70s/it] 74%|███████▍ | 6473/8750 [2:55:51<3:41:04, 5.83s/it] 74%|███████▍ | 6473/8750 [2:55:48<3:41:04, 5.83s/it] {'loss': 0.4565, 'learning_rate': 3.3467429983443477e-06, 'epoch': 0.74} + 74%|███████▍ | 6473/8750 [2:55:51<3:41:04, 5.83s/it] {'loss': 0.4565, 'learning_rate': 3.3467429983443477e-06, 'epoch': 0.74} + 74%|███████▍ | 6473/8750 [2:55:48<3:41:04, 5.83s/it] 74%|███████▍ | 6474/8750 [2:55:56<3:40:42, 5.82s/it] 74%|███████▍ | 6474/8750 [2:55:54<3:40:42, 5.82s/it] {'loss': 0.4461, 'learning_rate': 3.3439799734751132e-06, 'epoch': 0.74} + 74%|███████▍ | 6474/8750 [2:55:56<3:40:42, 5.82s/it] {'loss': 0.4461, 'learning_rate': 3.3439799734751132e-06, 'epoch': 0.74} + 74%|███████▍ | 6474/8750 [2:55:54<3:40:42, 5.82s/it] 74%|███████▍ | 6475/8750 [2:56:02<3:38:16, 5.76s/it] 74%|███████▍ | 6475/8750 [2:56:00<3:38:16, 5.76s/it] {'loss': 0.465, 'learning_rate': 3.3412178606292276e-06, 
'epoch': 0.74} + 74%|███████▍ | 6475/8750 [2:56:00<3:38:16, 5.76s/it] {'loss': 0.465, 'learning_rate': 3.3412178606292276e-06, 'epoch': 0.74} + 74%|███████▍ | 6475/8750 [2:56:02<3:38:16, 5.76s/it] 74%|███████▍ | 6476/8750 [2:56:08<3:37:25, 5.74s/it] 74%|███████▍ | 6476/8750 [2:56:05<3:37:25, 5.74s/it] {'loss': 0.4509, 'learning_rate': 3.3384566601851574e-06, 'epoch': 0.74} + {'loss': 0.4509, 'learning_rate': 3.3384566601851574e-06, 'epoch': 0.74} + 74%|███████▍ | 6476/8750 [2:56:08<3:37:25, 5.74s/it] 74%|███████▍ | 6476/8750 [2:56:05<3:37:25, 5.74s/it] 74%|███████▍ | 6477/8750 [2:56:13<3:37:21, 5.74s/it] 74%|███████▍ | 6477/8750 [2:56:11<3:37:21, 5.74s/it] {'loss': 0.4677, 'learning_rate': 3.3356963725212523e-06, 'epoch': 0.74} + 74%|███████▍ | 6477/8750 [2:56:13<3:37:21, 5.74s/it] {'loss': 0.4677, 'learning_rate': 3.3356963725212523e-06, 'epoch': 0.74} + 74%|███████▍ | 6477/8750 [2:56:11<3:37:21, 5.74s/it] 74%|███████▍ | 6478/8750 [2:56:19<3:37:13, 5.74s/it] 74%|███████▍ | 6478/8750 [2:56:17<3:37:13, 5.74s/it] {'loss': 0.4562, 'learning_rate': 3.3329369980157345e-06, 'epoch': 0.74} + 74%|███████▍ | 6478/8750 [2:56:19<3:37:13, 5.74s/it] {'loss': 0.4562, 'learning_rate': 3.3329369980157345e-06, 'epoch': 0.74} + 74%|███████▍ | 6478/8750 [2:56:17<3:37:13, 5.74s/it] 74%|███████▍ | 6479/8750 [2:56:25<3:39:59, 5.81s/it] 74%|███████▍ | 6479/8750 [2:56:23<3:39:59, 5.81s/it] {'loss': 0.4441, 'learning_rate': 3.330178537046699e-06, 'epoch': 0.74} + 74%|███████▍ | 6479/8750 [2:56:25<3:39:59, 5.81s/it] {'loss': 0.4441, 'learning_rate': 3.330178537046699e-06, 'epoch': 0.74} + 74%|███████▍ | 6479/8750 [2:56:23<3:39:59, 5.81s/it] 74%|███████▍ | 6480/8750 [2:56:31<3:40:59, 5.84s/it] 74%|███████▍ | 6480/8750 [2:56:29<3:40:59, 5.84s/it] {'loss': 0.4541, 'learning_rate': 3.327420989992112e-06, 'epoch': 0.74} + 74%|███████▍ | 6480/8750 [2:56:31<3:40:59, 5.84s/it] {'loss': 0.4541, 'learning_rate': 3.327420989992112e-06, 'epoch': 0.74} + 74%|███████▍ | 6480/8750 [2:56:29<3:40:59, 
5.84s/it] 74%|███████▍ | 6481/8750 [2:56:37<3:38:33, 5.78s/it] 74%|███████▍ | 6481/8750 [2:56:34<3:38:33, 5.78s/it] {'loss': 0.466, 'learning_rate': 3.3246643572298253e-06, 'epoch': 0.74} + 74%|███████▍ | 6481/8750 [2:56:37<3:38:33, 5.78s/it] {'loss': 0.466, 'learning_rate': 3.3246643572298253e-06, 'epoch': 0.74} + 74%|███████▍ | 6481/8750 [2:56:34<3:38:33, 5.78s/it] 74%|███████▍ | 6482/8750 [2:56:42<3:37:40, 5.76s/it] 74%|███████▍ | 6482/8750 [2:56:40<3:37:40, 5.76s/it] {'loss': 0.4409, 'learning_rate': 3.321908639137553e-06, 'epoch': 0.74} + 74%|███████▍ | 6482/8750 [2:56:42<3:37:40, 5.76s/it] {'loss': 0.4409, 'learning_rate': 3.321908639137553e-06, 'epoch': 0.74} + 74%|███████▍ | 6482/8750 [2:56:40<3:37:40, 5.76s/it] 74%|███████▍ | 6483/8750 [2:56:49<3:42:15, 5.88s/it] 74%|███████▍ | 6483/8750 [2:56:46<3:42:15, 5.88s/it] {'loss': 0.447, 'learning_rate': 3.3191538360928977e-06, 'epoch': 0.74} + {'loss': 0.447, 'learning_rate': 3.3191538360928977e-06, 'epoch': 0.74} 74%|███████▍ | 6483/8750 [2:56:49<3:42:15, 5.88s/it] + 74%|███████▍ | 6483/8750 [2:56:46<3:42:15, 5.88s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! 
+ warnings.warn("Inputs truncated!") + 74%|███████▍ | 6484/8750 [2:56:55<3:44:54, 5.96s/it] 74%|███████▍ | 6484/8750 [2:56:52<3:44:53, 5.95s/it] {'loss': 0.4702, 'learning_rate': 3.3163999484733232e-06, 'epoch': 0.74} + 74%|███████▍ | 6484/8750 [2:56:52<3:44:53, 5.95s/it] {'loss': 0.4702, 'learning_rate': 3.3163999484733232e-06, 'epoch': 0.74} + 74%|███████▍ | 6484/8750 [2:56:55<3:44:54, 5.96s/it] 74%|███████▍ | 6485/8750 [2:57:00<3:42:21, 5.89s/it] 74%|███████▍ | 6485/8750 [2:56:58<3:42:22, 5.89s/it] {'loss': 0.4823, 'learning_rate': 3.313646976656172e-06, 'epoch': 0.74} + 74%|███████▍ | 6485/8750 [2:57:00<3:42:21, 5.89s/it] {'loss': 0.4823, 'learning_rate': 3.313646976656172e-06, 'epoch': 0.74} + 74%|███████▍ | 6485/8750 [2:56:58<3:42:22, 5.89s/it] 74%|███████▍ | 6486/8750 [2:57:06<3:40:39, 5.85s/it] 74%|███████▍ | 6486/8750 [2:57:04<3:40:40, 5.85s/it] {'loss': 0.4589, 'learning_rate': 3.3108949210186657e-06, 'epoch': 0.74} + 74%|███████▍ | 6486/8750 [2:57:06<3:40:39, 5.85s/it] {'loss': 0.4589, 'learning_rate': 3.3108949210186657e-06, 'epoch': 0.74} + 74%|███████▍ | 6486/8750 [2:57:04<3:40:40, 5.85s/it] 74%|███████▍ | 6487/8750 [2:57:12<3:43:57, 5.94s/it] 74%|███████▍ | 6487/8750 [2:57:10<3:43:57, 5.94s/it] {'loss': 0.46, 'learning_rate': 3.308143781937898e-06, 'epoch': 0.74} + 74%|███████▍ | 6487/8750 [2:57:12<3:43:57, 5.94s/it] {'loss': 0.46, 'learning_rate': 3.308143781937898e-06, 'epoch': 0.74} + 74%|███████▍ | 6487/8750 [2:57:10<3:43:57, 5.94s/it] 74%|███████▍ | 6488/8750 [2:57:18<3:46:23, 6.01s/it] 74%|███████▍ | 6488/8750 [2:57:16<3:46:23, 6.01s/it] {'loss': 0.4426, 'learning_rate': 3.305393559790838e-06, 'epoch': 0.74} + 74%|███████▍ | 6488/8750 [2:57:18<3:46:23, 6.01s/it] {'loss': 0.4426, 'learning_rate': 3.305393559790838e-06, 'epoch': 0.74} + 74%|███████▍ | 6488/8750 [2:57:16<3:46:23, 6.01s/it] 74%|███████▍ | 6489/8750 [2:57:24<3:42:53, 5.91s/it] 74%|███████▍ | 6489/8750 [2:57:22<3:42:53, 5.91s/it] {'loss': 0.4549, 'learning_rate': 
3.302644254954326e-06, 'epoch': 0.74} + 74%|███████▍ | 6489/8750 [2:57:24<3:42:53, 5.91s/it] {'loss': 0.4549, 'learning_rate': 3.302644254954326e-06, 'epoch': 0.74} + 74%|███████▍ | 6489/8750 [2:57:22<3:42:53, 5.91s/it] 74%|███████▍ | 6490/8750 [2:57:30<3:42:27, 5.91s/it] 74%|███████▍ | 6490/8750 [2:57:28<3:42:27, 5.91s/it] {'loss': 0.4496, 'learning_rate': 3.299895867805074e-06, 'epoch': 0.74} + 74%|███████▍ | 6490/8750 [2:57:30<3:42:27, 5.91s/it] {'loss': 0.4496, 'learning_rate': 3.299895867805074e-06, 'epoch': 0.74} + 74%|███████▍ | 6490/8750 [2:57:28<3:42:27, 5.91s/it] 74%|███████▍ | 6491/8750 [2:57:36<3:40:03, 5.84s/it] 74%|███████▍ | 6491/8750 [2:57:33<3:40:03, 5.84s/it] {'loss': 0.4512, 'learning_rate': 3.2971483987196783e-06, 'epoch': 0.74} + 74%|███████▍ | 6491/8750 [2:57:36<3:40:03, 5.84s/it] {'loss': 0.4512, 'learning_rate': 3.2971483987196783e-06, 'epoch': 0.74} + 74%|███████▍ | 6491/8750 [2:57:33<3:40:03, 5.84s/it] 74%|███████▍ | 6492/8750 [2:57:42<3:41:53, 5.90s/it] 74%|███████▍ | 6492/8750 [2:57:39<3:41:53, 5.90s/it] {'loss': 0.4596, 'learning_rate': 3.294401848074602e-06, 'epoch': 0.74} + 74%|███████▍ | 6492/8750 [2:57:42<3:41:53, 5.90s/it] {'loss': 0.4596, 'learning_rate': 3.294401848074602e-06, 'epoch': 0.74} + 74%|███████▍ | 6492/8750 [2:57:39<3:41:53, 5.90s/it] 74%|███████▍ | 6493/8750 [2:57:47<3:39:25, 5.83s/it] 74%|███████▍ | 6493/8750 [2:57:45<3:39:25, 5.83s/it] {'loss': 0.4452, 'learning_rate': 3.2916562162461784e-06, 'epoch': 0.74} + 74%|███████▍ | 6493/8750 [2:57:47<3:39:25, 5.83s/it] {'loss': 0.4452, 'learning_rate': 3.2916562162461784e-06, 'epoch': 0.74} + 74%|███████▍ | 6493/8750 [2:57:45<3:39:25, 5.83s/it] 74%|███████▍ | 6494/8750 [2:57:53<3:40:12, 5.86s/it] 74%|███████▍ | 6494/8750 [2:57:51<3:40:12, 5.86s/it] {'loss': 0.4427, 'learning_rate': 3.288911503610629e-06, 'epoch': 0.74} + 74%|███████▍ | 6494/8750 [2:57:53<3:40:12, 5.86s/it] {'loss': 0.4427, 'learning_rate': 3.288911503610629e-06, 'epoch': 0.74} + 74%|███████▍ | 6494/8750 
[2:57:51<3:40:12, 5.86s/it] 74%|███████▍ | 6495/8750 [2:57:59<3:37:33, 5.79s/it] 74%|███████▍ | 6495/8750 [2:57:57<3:37:33, 5.79s/it] {'loss': 0.4639, 'learning_rate': 3.2861677105440335e-06, 'epoch': 0.74} + 74%|███████▍ | 6495/8750 [2:57:59<3:37:33, 5.79s/it] {'loss': 0.4639, 'learning_rate': 3.2861677105440335e-06, 'epoch': 0.74} + 74%|███████▍ | 6495/8750 [2:57:57<3:37:33, 5.79s/it] 74%|███████▍ | 6496/8750 [2:58:02<3:36:04, 5.75s/it] 74%|███████▍ | 6496/8750 [2:58:05<3:36:04, 5.75s/it]{'loss': 0.4535, 'learning_rate': 3.2834248374223556e-06, 'epoch': 0.74} + 74%|███████▍ | 6496/8750 [2:58:02<3:36:04, 5.75s/it] {'loss': 0.4535, 'learning_rate': 3.2834248374223556e-06, 'epoch': 0.74} + 74%|███████▍ | 6496/8750 [2:58:05<3:36:04, 5.75s/it] 74%|███████▍ | 6497/8750 [2:58:11<3:39:12, 5.84s/it] 74%|███████▍ | 6497/8750 [2:58:08<3:39:12, 5.84s/it] {'loss': 0.4548, 'learning_rate': 3.2806828846214324e-06, 'epoch': 0.74} + 74%|███████▍ | 6497/8750 [2:58:11<3:39:12, 5.84s/it] {'loss': 0.4548, 'learning_rate': 3.2806828846214324e-06, 'epoch': 0.74} + 74%|███████▍ | 6497/8750 [2:58:08<3:39:12, 5.84s/it] 74%|███████▍ | 6498/8750 [2:58:16<3:38:38, 5.83s/it] 74%|███████▍ | 6498/8750 [2:58:14<3:38:38, 5.83s/it] {'loss': 0.4639, 'learning_rate': 3.277941852516968e-06, 'epoch': 0.74} + 74%|███████▍ | 6498/8750 [2:58:16<3:38:38, 5.83s/it] {'loss': 0.4639, 'learning_rate': 3.277941852516968e-06, 'epoch': 0.74} + 74%|███████▍ | 6498/8750 [2:58:14<3:38:38, 5.83s/it] 74%|███████▍ | 6499/8750 [2:58:23<3:41:21, 5.90s/it] 74%|███████▍ | 6499/8750 [2:58:20<3:41:21, 5.90s/it] {'loss': 0.4343, 'learning_rate': 3.2752017414845514e-06, 'epoch': 0.74} + 74%|███████▍ | 6499/8750 [2:58:23<3:41:21, 5.90s/it] {'loss': 0.4343, 'learning_rate': 3.2752017414845514e-06, 'epoch': 0.74} + 74%|███████▍ | 6499/8750 [2:58:20<3:41:21, 5.90s/it]9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... 
+ 74%|███████▍ | 6500/8750 [2:58:28<3:38:25, 5.82s/it]13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +010 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 74%|███████▍ | 6500/8750 [2:58:26<3:38:25, 5.82s/it]3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + {'loss': 0.449, 'learning_rate': 3.2724625518996322e-06, 'epoch': 0.74} + 74%|███████▍ | 6500/8750 [2:58:28<3:38:25, 5.82s/it] {'loss': 0.449, 'learning_rate': 3.2724625518996322e-06, 'epoch': 0.74} + 74%|███████▍ | 6500/8750 [2:58:26<3:38:25, 5.82s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 74%|███████▍ | 6501/8750 [2:58:49<6:26:31, 10.31s/it] 74%|███████▍ | 6501/8750 [2:58:47<6:26:31, 10.31s/it] {'loss': 0.4611, 'learning_rate': 3.2697242841375452e-06, 'epoch': 0.74} + {'loss': 0.4611, 'learning_rate': 3.2697242841375452e-06, 'epoch': 0.74} 74%|███████▍ | 6501/8750 [2:58:49<6:26:31, 10.31s/it] + 74%|███████▍ | 6501/8750 [2:58:47<6:26:31, 10.31s/it] 74%|███████▍ | 6502/8750 [2:58:55<5:33:04, 8.89s/it] 74%|███████▍ | 6502/8750 [2:58:52<5:33:04, 8.89s/it] {'loss': 0.4584, 'learning_rate': 3.2669869385734943e-06, 'epoch': 0.74} + 74%|███████▍ | 6502/8750 [2:58:55<5:33:04, 8.89s/it] {'loss': 0.4584, 'learning_rate': 3.2669869385734943e-06, 'epoch': 0.74} + 74%|███████▍ | 6502/8750 [2:58:52<5:33:04, 8.89s/it] 74%|███████▍ | 6503/8750 [2:59:00<4:56:10, 7.91s/it] 74%|███████▍ | 6503/8750 [2:58:58<4:56:10, 7.91s/it] {'loss': 0.4496, 'learning_rate': 3.264250515582551e-06, 'epoch': 0.74} + 74%|███████▍ | 6503/8750 [2:59:00<4:56:10, 7.91s/it] {'loss': 0.4496, 'learning_rate': 3.264250515582551e-06, 'epoch': 0.74} + 74%|███████▍ | 6503/8750 [2:58:58<4:56:10, 7.91s/it] 74%|███████▍ | 6504/8750 [2:59:06<4:31:07, 7.24s/it] 74%|███████▍ | 6504/8750 [2:59:04<4:31:06, 7.24s/it] {'loss': 0.4493, 'learning_rate': 3.2615150155396747e-06, 'epoch': 0.74} + 74%|███████▍ | 6504/8750 [2:59:06<4:31:07, 7.24s/it] {'loss': 0.4493, 'learning_rate': 3.2615150155396747e-06, 'epoch': 0.74} + 74%|███████▍ | 6504/8750 [2:59:04<4:31:06, 7.24s/it] 74%|███████▍ | 6505/8750 [2:59:12<4:13:30, 6.78s/it] 74%|███████▍ | 6505/8750 [2:59:09<4:13:30, 6.78s/it] {'loss': 0.4551, 'learning_rate': 3.258780438819681e-06, 'epoch': 0.74} + 74%|███████▍ | 6505/8750 [2:59:12<4:13:30, 6.78s/it] {'loss': 0.4551, 'learning_rate': 3.258780438819681e-06, 'epoch': 0.74} + 74%|███████▍ | 6505/8750 [2:59:09<4:13:30, 6.78s/it] 74%|███████▍ | 6506/8750 [2:59:17<4:01:37, 6.46s/it] 74%|███████▍ | 6506/8750 [2:59:15<4:01:37, 6.46s/it] {'loss': 0.4449, 'learning_rate': 3.2560467857972744e-06, 
'epoch': 0.74} + {'loss': 0.4449, 'learning_rate': 3.2560467857972744e-06, 'epoch': 0.74} + 74%|███████▍ | 6506/8750 [2:59:17<4:01:37, 6.46s/it] 74%|███████▍ | 6506/8750 [2:59:15<4:01:37, 6.46s/it] 74%|███████▍ | 6507/8750 [2:59:23<3:52:58, 6.23s/it] 74%|███████▍ | 6507/8750 [2:59:21<3:52:58, 6.23s/it] {'loss': 0.4602, 'learning_rate': 3.2533140568470266e-06, 'epoch': 0.74} + 74%|███████▍ | 6507/8750 [2:59:23<3:52:58, 6.23s/it] {'loss': 0.4602, 'learning_rate': 3.2533140568470266e-06, 'epoch': 0.74} + 74%|███████▍ | 6507/8750 [2:59:21<3:52:58, 6.23s/it] 74%|███████▍ | 6508/8750 [2:59:29<3:47:08, 6.08s/it] 74%|███████▍ | 6508/8750 [2:59:26<3:47:08, 6.08s/it] {'loss': 0.4392, 'learning_rate': 3.2505822523433785e-06, 'epoch': 0.74} +{'loss': 0.4392, 'learning_rate': 3.2505822523433785e-06, 'epoch': 0.74} + 74%|███████▍ | 6508/8750 [2:59:29<3:47:08, 6.08s/it] 74%|███████▍ | 6508/8750 [2:59:26<3:47:08, 6.08s/it] 74%|███████▍ | 6509/8750 [2:59:35<3:45:17, 6.03s/it] 74%|███████▍ | 6509/8750 [2:59:32<3:45:17, 6.03s/it] {'loss': 0.4686, 'learning_rate': 3.247851372660653e-06, 'epoch': 0.74} + 74%|███████▍ | 6509/8750 [2:59:32<3:45:17, 6.03s/it]{'loss': 0.4686, 'learning_rate': 3.247851372660653e-06, 'epoch': 0.74} + 74%|███████▍ | 6509/8750 [2:59:35<3:45:17, 6.03s/it] 74%|███████▍ | 6510/8750 [2:59:40<3:40:52, 5.92s/it] 74%|███████▍ | 6510/8750 [2:59:38<3:40:52, 5.92s/it] {'loss': 0.4433, 'learning_rate': 3.2451214181730396e-06, 'epoch': 0.74} + 74%|███████▍ | 6510/8750 [2:59:40<3:40:52, 5.92s/it] {'loss': 0.4433, 'learning_rate': 3.2451214181730396e-06, 'epoch': 0.74} + 74%|███████▍ | 6510/8750 [2:59:38<3:40:52, 5.92s/it] 74%|███████▍ | 6511/8750 [2:59:46<3:36:37, 5.81s/it] 74%|███████▍ | 6511/8750 [2:59:44<3:36:37, 5.81s/it] {'loss': 0.461, 'learning_rate': 3.2423923892545994e-06, 'epoch': 0.74} + 74%|███████▍ | 6511/8750 [2:59:46<3:36:37, 5.81s/it] {'loss': 0.461, 'learning_rate': 3.2423923892545994e-06, 'epoch': 0.74} + 74%|███████▍ | 6511/8750 [2:59:44<3:36:37, 
5.81s/it] 74%|███████▍ | 6512/8750 [2:59:52<3:37:19, 5.83s/it] 74%|███████▍ | 6512/8750 [2:59:49<3:37:20, 5.83s/it] {'loss': 0.4404, 'learning_rate': 3.239664286279276e-06, 'epoch': 0.74} + 74%|███████▍ | 6512/8750 [2:59:52<3:37:19, 5.83s/it] {'loss': 0.4404, 'learning_rate': 3.239664286279276e-06, 'epoch': 0.74} + 74%|███████▍ | 6512/8750 [2:59:49<3:37:20, 5.83s/it] 74%|███████▍ | 6513/8750 [2:59:57<3:36:48, 5.82s/it] 74%|███████▍ | 6513/8750 [2:59:55<3:36:48, 5.82s/it] {'loss': 0.4545, 'learning_rate': 3.2369371096208744e-06, 'epoch': 0.74} + {'loss': 0.4545, 'learning_rate': 3.2369371096208744e-06, 'epoch': 0.74} + 74%|███████▍ | 6513/8750 [2:59:57<3:36:48, 5.82s/it] 74%|███████▍ | 6513/8750 [2:59:55<3:36:48, 5.82s/it] 74%|███████▍ | 6514/8750 [3:00:03<3:37:17, 5.83s/it] 74%|███████▍ | 6514/8750 [3:00:01<3:37:17, 5.83s/it] {'loss': 0.44, 'learning_rate': 3.2342108596530865e-06, 'epoch': 0.74} + 74%|███████▍ | 6514/8750 [3:00:03<3:37:17, 5.83s/it] {'loss': 0.44, 'learning_rate': 3.2342108596530865e-06, 'epoch': 0.74} + 74%|███████▍ | 6514/8750 [3:00:01<3:37:17, 5.83s/it] 74%|███████▍ | 6515/8750 [3:00:09<3:36:03, 5.80s/it] 74%|███████▍ | 6515/8750 [3:00:07<3:36:03, 5.80s/it] {'loss': 0.4454, 'learning_rate': 3.23148553674946e-06, 'epoch': 0.74} + 74%|███████▍ | 6515/8750 [3:00:09<3:36:03, 5.80s/it] {'loss': 0.4454, 'learning_rate': 3.23148553674946e-06, 'epoch': 0.74} + 74%|███████▍ | 6515/8750 [3:00:07<3:36:03, 5.80s/it] 74%|███████▍ | 6516/8750 [3:00:15<3:37:41, 5.85s/it] 74%|███████▍ | 6516/8750 [3:00:13<3:37:41, 5.85s/it] {'loss': 0.4478, 'learning_rate': 3.2287611412834306e-06, 'epoch': 0.74} + 74%|███████▍ | 6516/8750 [3:00:15<3:37:41, 5.85s/it] {'loss': 0.4478, 'learning_rate': 3.2287611412834306e-06, 'epoch': 0.74} + 74%|███████▍ | 6516/8750 [3:00:13<3:37:41, 5.85s/it] 74%|███████▍ | 6517/8750 [3:00:21<3:39:20, 5.89s/it] 74%|███████▍ | 6517/8750 [3:00:19<3:39:20, 5.89s/it] {'loss': 0.4523, 'learning_rate': 3.226037673628305e-06, 'epoch': 0.74} + 
74%|███████▍ | 6517/8750 [3:00:21<3:39:20, 5.89s/it] {'loss': 0.4523, 'learning_rate': 3.226037673628305e-06, 'epoch': 0.74} + 74%|███████▍ | 6517/8750 [3:00:19<3:39:20, 5.89s/it] 74%|███████▍ | 6518/8750 [3:00:27<3:39:38, 5.90s/it] 74%|███████▍ | 6518/8750 [3:00:25<3:39:38, 5.90s/it] {'loss': 0.4633, 'learning_rate': 3.223315134157253e-06, 'epoch': 0.74} + 74%|███████▍ | 6518/8750 [3:00:27<3:39:38, 5.90s/it] {'loss': 0.4633, 'learning_rate': 3.223315134157253e-06, 'epoch': 0.74} + 74%|███████▍ | 6518/8750 [3:00:25<3:39:38, 5.90s/it] 75%|███████▍ | 6519/8750 [3:00:32<3:35:03, 5.78s/it] 75%|███████▍ | 6519/8750 [3:00:30<3:35:03, 5.78s/it] {'loss': 0.4656, 'learning_rate': 3.220593523243324e-06, 'epoch': 0.75} + 75%|███████▍ | 6519/8750 [3:00:32<3:35:03, 5.78s/it] {'loss': 0.4656, 'learning_rate': 3.220593523243324e-06, 'epoch': 0.75} + 75%|███████▍ | 6519/8750 [3:00:30<3:35:03, 5.78s/it] 75%|███████▍ | 6520/8750 [3:00:38<3:35:21, 5.79s/it] 75%|███████▍ | 6520/8750 [3:00:36<3:35:21, 5.79s/it] {'loss': 0.4449, 'learning_rate': 3.2178728412594417e-06, 'epoch': 0.75} + 75%|███████▍ | 6520/8750 [3:00:38<3:35:21, 5.79s/it] {'loss': 0.4449, 'learning_rate': 3.2178728412594417e-06, 'epoch': 0.75} + 75%|███████▍ | 6520/8750 [3:00:36<3:35:21, 5.79s/it] 75%|███████▍ | 6521/8750 [3:00:44<3:32:32, 5.72s/it] 75%|███████▍ | 6521/8750 [3:00:42<3:32:32, 5.72s/it] {'loss': 0.4725, 'learning_rate': 3.2151530885783967e-06, 'epoch': 0.75} + 75%|███████▍ | 6521/8750 [3:00:44<3:32:32, 5.72s/it] {'loss': 0.4725, 'learning_rate': 3.2151530885783967e-06, 'epoch': 0.75} + 75%|███████▍ | 6521/8750 [3:00:42<3:32:32, 5.72s/it] 75%|███████▍ | 6522/8750 [3:00:49<3:31:24, 5.69s/it] 75%|███████▍ | 6522/8750 [3:00:47<3:31:24, 5.69s/it] {'loss': 0.4602, 'learning_rate': 3.212434265572861e-06, 'epoch': 0.75} + 75%|███████▍ | 6522/8750 [3:00:49<3:31:24, 5.69s/it] {'loss': 0.4602, 'learning_rate': 3.212434265572861e-06, 'epoch': 0.75} + 75%|███████▍ | 6522/8750 [3:00:47<3:31:24, 5.69s/it] 75%|███████▍ | 
6523/8750 [3:00:56<3:35:35, 5.81s/it] 75%|███████▍ | 6523/8750 [3:00:53<3:35:36, 5.81s/it] {'loss': 0.4637, 'learning_rate': 3.209716372615369e-06, 'epoch': 0.75} + 75%|███████▍ | 6523/8750 [3:00:56<3:35:35, 5.81s/it] {'loss': 0.4637, 'learning_rate': 3.209716372615369e-06, 'epoch': 0.75} + 75%|███████▍ | 6523/8750 [3:00:53<3:35:36, 5.81s/it] 75%|███████▍ | 6524/8750 [3:01:01<3:32:50, 5.74s/it] 75%|███████▍ | 6524/8750 [3:00:59<3:32:50, 5.74s/it] {'loss': 0.4622, 'learning_rate': 3.2069994100783376e-06, 'epoch': 0.75} + 75%|███████▍ | 6524/8750 [3:01:01<3:32:50, 5.74s/it] {'loss': 0.4622, 'learning_rate': 3.2069994100783376e-06, 'epoch': 0.75} + 75%|███████▍ | 6524/8750 [3:00:59<3:32:50, 5.74s/it] 75%|███████▍ | 6525/8750 [3:01:07<3:32:08, 5.72s/it] 75%|███████▍ | 6525/8750 [3:01:05<3:32:08, 5.72s/it] {'loss': 0.4492, 'learning_rate': 3.2042833783340453e-06, 'epoch': 0.75} + 75%|███████▍ | 6525/8750 [3:01:07<3:32:08, 5.72s/it] {'loss': 0.4492, 'learning_rate': 3.2042833783340453e-06, 'epoch': 0.75} + 75%|███████▍ | 6525/8750 [3:01:05<3:32:08, 5.72s/it] 75%|███████▍ | 6526/8750 [3:01:13<3:32:49, 5.74s/it] 75%|███████▍ | 6526/8750 [3:01:10<3:32:50, 5.74s/it] {'loss': 0.4493, 'learning_rate': 3.201568277754652e-06, 'epoch': 0.75} + 75%|███████▍ | 6526/8750 [3:01:13<3:32:49, 5.74s/it] {'loss': 0.4493, 'learning_rate': 3.201568277754652e-06, 'epoch': 0.75} + 75%|███████▍ | 6526/8750 [3:01:10<3:32:50, 5.74s/it] 75%|███████▍ | 6527/8750 [3:01:18<3:31:12, 5.70s/it] 75%|███████▍ | 6527/8750 [3:01:16<3:31:12, 5.70s/it] {'loss': 0.4665, 'learning_rate': 3.1988541087121916e-06, 'epoch': 0.75} + 75%|███████▍ | 6527/8750 [3:01:18<3:31:12, 5.70s/it] {'loss': 0.4665, 'learning_rate': 3.1988541087121916e-06, 'epoch': 0.75} + 75%|███████▍ | 6527/8750 [3:01:16<3:31:12, 5.70s/it] 75%|███████▍ | 6528/8750 [3:01:24<3:31:26, 5.71s/it] 75%|███████▍ | 6528/8750 [3:01:22<3:31:26, 5.71s/it] {'loss': 0.435, 'learning_rate': 3.1961408715785615e-06, 'epoch': 0.75} + 75%|███████▍ | 6528/8750 
[3:01:24<3:31:26, 5.71s/it]{'loss': 0.435, 'learning_rate': 3.1961408715785615e-06, 'epoch': 0.75} + 75%|███████▍ | 6528/8750 [3:01:22<3:31:26, 5.71s/it] 75%|███████▍ | 6529/8750 [3:01:30<3:30:33, 5.69s/it] 75%|███████▍ | 6529/8750 [3:01:27<3:30:33, 5.69s/it] {'loss': 0.4442, 'learning_rate': 3.193428566725534e-06, 'epoch': 0.75} + 75%|███████▍ | 6529/8750 [3:01:30<3:30:33, 5.69s/it] {'loss': 0.4442, 'learning_rate': 3.193428566725534e-06, 'epoch': 0.75} + 75%|███████▍ | 6529/8750 [3:01:27<3:30:33, 5.69s/it] 75%|███████▍ | 6530/8750 [3:01:35<3:29:50, 5.67s/it] 75%|███████▍ | 6530/8750 [3:01:33<3:29:50, 5.67s/it] {'loss': 0.4554, 'learning_rate': 3.1907171945247595e-06, 'epoch': 0.75} + 75%|███████▍ | 6530/8750 [3:01:35<3:29:50, 5.67s/it] {'loss': 0.4554, 'learning_rate': 3.1907171945247595e-06, 'epoch': 0.75} + 75%|███████▍ | 6530/8750 [3:01:33<3:29:50, 5.67s/it] 75%|███████▍ | 6531/8750 [3:01:41<3:29:17, 5.66s/it] 75%|███████▍ | 6531/8750 [3:01:39<3:29:17, 5.66s/it] {'loss': 0.4353, 'learning_rate': 3.1880067553477513e-06, 'epoch': 0.75} + 75%|███████▍ | 6531/8750 [3:01:41<3:29:17, 5.66s/it] {'loss': 0.4353, 'learning_rate': 3.1880067553477513e-06, 'epoch': 0.75} + 75%|███████▍ | 6531/8750 [3:01:39<3:29:17, 5.66s/it] 75%|███████▍ | 6532/8750 [3:01:46<3:28:49, 5.65s/it] 75%|███████▍ | 6532/8750 [3:01:44<3:28:49, 5.65s/it] {'loss': 0.463, 'learning_rate': 3.1852972495659064e-06, 'epoch': 0.75} + 75%|███████▍ | 6532/8750 [3:01:44<3:28:49, 5.65s/it] {'loss': 0.463, 'learning_rate': 3.1852972495659064e-06, 'epoch': 0.75} + 75%|███████▍ | 6532/8750 [3:01:46<3:28:49, 5.65s/it] 75%|███████▍ | 6533/8750 [3:01:52<3:30:08, 5.69s/it] 75%|███████▍ | 6533/8750 [3:01:50<3:30:07, 5.69s/it] {'loss': 0.4648, 'learning_rate': 3.182588677550482e-06, 'epoch': 0.75} + 75%|███████▍ | 6533/8750 [3:01:52<3:30:08, 5.69s/it] {'loss': 0.4648, 'learning_rate': 3.182588677550482e-06, 'epoch': 0.75} + 75%|███████▍ | 6533/8750 [3:01:50<3:30:07, 5.69s/it] 75%|███████▍ | 6534/8750 
[3:01:58<3:29:34, 5.67s/it] 75%|███████▍ | 6534/8750 [3:01:56<3:29:33, 5.67s/it] {'loss': 0.4438, 'learning_rate': 3.179881039672619e-06, 'epoch': 0.75} + 75%|███████▍ | 6534/8750 [3:01:58<3:29:34, 5.67s/it] {'loss': 0.4438, 'learning_rate': 3.179881039672619e-06, 'epoch': 0.75} + 75%|███████▍ | 6534/8750 [3:01:56<3:29:33, 5.67s/it] 75%|███████▍ | 6535/8750 [3:02:04<3:31:11, 5.72s/it] 75%|███████▍ | 6535/8750 [3:02:01<3:31:13, 5.72s/it] {'loss': 0.4627, 'learning_rate': 3.1771743363033156e-06, 'epoch': 0.75} + 75%|███████▍ | 6535/8750 [3:02:04<3:31:11, 5.72s/it] {'loss': 0.4627, 'learning_rate': 3.1771743363033156e-06, 'epoch': 0.75} + 75%|███████▍ | 6535/8750 [3:02:01<3:31:13, 5.72s/it] 75%|███████▍ | 6536/8750 [3:02:09<3:29:32, 5.68s/it] 75%|███████▍ | 6536/8750 [3:02:07<3:29:32, 5.68s/it] {'loss': 0.4732, 'learning_rate': 3.174468567813461e-06, 'epoch': 0.75} + 75%|███████▍ | 6536/8750 [3:02:09<3:29:32, 5.68s/it] {'loss': 0.4732, 'learning_rate': 3.174468567813461e-06, 'epoch': 0.75} + 75%|███████▍ | 6536/8750 [3:02:07<3:29:32, 5.68s/it] 75%|███████▍ | 6537/8750 [3:02:15<3:28:43, 5.66s/it] 75%|███████▍ | 6537/8750 [3:02:13<3:28:43, 5.66s/it] {'loss': 0.434, 'learning_rate': 3.171763734573796e-06, 'epoch': 0.75} + 75%|███████▍ | 6537/8750 [3:02:15<3:28:43, 5.66s/it] {'loss': 0.434, 'learning_rate': 3.171763734573796e-06, 'epoch': 0.75} + 75%|███████▍ | 6537/8750 [3:02:13<3:28:43, 5.66s/it] 75%|███████▍ | 6538/8750 [3:02:20<3:27:37, 5.63s/it] 75%|███████▍ | 6538/8750 [3:02:18<3:27:37, 5.63s/it] {'loss': 0.4621, 'learning_rate': 3.169059836954952e-06, 'epoch': 0.75} + 75%|███████▍ | 6538/8750 [3:02:20<3:27:37, 5.63s/it] {'loss': 0.4621, 'learning_rate': 3.169059836954952e-06, 'epoch': 0.75} + 75%|███████▍ | 6538/8750 [3:02:18<3:27:37, 5.63s/it] 75%|███████▍ | 6539/8750 [3:02:26<3:28:20, 5.65s/it] {'loss': 0.4477, 'learning_rate': 3.1663568753274153e-06, 'epoch': 0.75} + 75%|███████▍ | 6539/8750 [3:02:26<3:28:20, 5.65s/it] 75%|███████▍ | 6539/8750 [3:02:24<3:28:20, 
5.65s/it] {'loss': 0.4477, 'learning_rate': 3.1663568753274153e-06, 'epoch': 0.75} + 75%|███████▍ | 6539/8750 [3:02:24<3:28:20, 5.65s/it] 75%|███████▍ | 6540/8750 [3:02:32<3:29:22, 5.68s/it] 75%|███████▍ | 6540/8750 [3:02:30<3:29:22, 5.68s/it] {'loss': 0.4271, 'learning_rate': 3.1636548500615583e-06, 'epoch': 0.75} + 75%|███████▍ | 6540/8750 [3:02:32<3:29:22, 5.68s/it] {'loss': 0.4271, 'learning_rate': 3.1636548500615583e-06, 'epoch': 0.75} + 75%|███████▍ | 6540/8750 [3:02:30<3:29:22, 5.68s/it] 75%|███████▍ | 6541/8750 [3:02:38<3:29:03, 5.68s/it] 75%|███████▍ | 6541/8750 [3:02:35<3:29:03, 5.68s/it] {'loss': 0.4784, 'learning_rate': 3.1609537615276174e-06, 'epoch': 0.75} + 75%|███████▍ | 6541/8750 [3:02:38<3:29:03, 5.68s/it] {'loss': 0.4784, 'learning_rate': 3.1609537615276174e-06, 'epoch': 0.75} + 75%|███████▍ | 6541/8750 [3:02:35<3:29:03, 5.68s/it] 75%|███████▍ | 6542/8750 [3:02:43<3:28:08, 5.66s/it] 75%|███████▍ | 6542/8750 [3:02:41<3:28:08, 5.66s/it] {'loss': 0.4413, 'learning_rate': 3.1582536100956973e-06, 'epoch': 0.75} + 75%|███████▍ | 6542/8750 [3:02:43<3:28:08, 5.66s/it] {'loss': 0.4413, 'learning_rate': 3.1582536100956973e-06, 'epoch': 0.75} + 75%|███████▍ | 6542/8750 [3:02:41<3:28:08, 5.66s/it] 75%|███████▍ | 6543/8750 [3:02:49<3:26:55, 5.63s/it] 75%|███████▍ | 6543/8750 [3:02:46<3:26:55, 5.63s/it] {'loss': 0.4723, 'learning_rate': 3.1555543961357824e-06, 'epoch': 0.75} + 75%|███████▍ | 6543/8750 [3:02:49<3:26:55, 5.63s/it] {'loss': 0.4723, 'learning_rate': 3.1555543961357824e-06, 'epoch': 0.75} + 75%|███████▍ | 6543/8750 [3:02:46<3:26:55, 5.63s/it] 75%|███████▍ | 6544/8750 [3:02:55<3:28:39, 5.68s/it] 75%|███████▍ | 6544/8750 [3:02:52<3:28:39, 5.68s/it] {'loss': 0.4543, 'learning_rate': 3.1528561200177254e-06, 'epoch': 0.75} + 75%|███████▍ | 6544/8750 [3:02:55<3:28:39, 5.68s/it] {'loss': 0.4543, 'learning_rate': 3.1528561200177254e-06, 'epoch': 0.75} + 75%|███████▍ | 6544/8750 [3:02:52<3:28:39, 5.68s/it] 75%|███████▍ | 6545/8750 [3:03:00<3:31:01, 
5.74s/it] 75%|███████▍ | 6545/8750 [3:02:58<3:31:01, 5.74s/it] {'loss': 0.4522, 'learning_rate': 3.1501587821112532e-06, 'epoch': 0.75} + 75%|███████▍ | 6545/8750 [3:03:00<3:31:01, 5.74s/it] {'loss': 0.4522, 'learning_rate': 3.1501587821112532e-06, 'epoch': 0.75} + 75%|███████▍ | 6545/8750 [3:02:58<3:31:01, 5.74s/it] 75%|███████▍ | 6546/8750 [3:03:06<3:32:28, 5.78s/it] 75%|███████▍ | 6546/8750 [3:03:04<3:32:29, 5.78s/it] {'loss': 0.4649, 'learning_rate': 3.14746238278596e-06, 'epoch': 0.75} + 75%|███████▍ | 6546/8750 [3:03:06<3:32:28, 5.78s/it] {'loss': 0.4649, 'learning_rate': 3.14746238278596e-06, 'epoch': 0.75} + 75%|███████▍ | 6546/8750 [3:03:04<3:32:29, 5.78s/it] 75%|███████▍ | 6547/8750 [3:03:12<3:32:17, 5.78s/it] 75%|███████▍ | 6547/8750 [3:03:10<3:32:17, 5.78s/it] {'loss': 0.4431, 'learning_rate': 3.1447669224113074e-06, 'epoch': 0.75} + 75%|███████▍ | 6547/8750 [3:03:12<3:32:17, 5.78s/it] {'loss': 0.4431, 'learning_rate': 3.1447669224113074e-06, 'epoch': 0.75} + 75%|███████▍ | 6547/8750 [3:03:10<3:32:17, 5.78s/it] 75%|███████▍ | 6548/8750 [3:03:15<3:31:07, 5.75s/it] 75%|███████▍ | 6548/8750 [3:03:18<3:31:07, 5.75s/it] {'loss': 0.4451, 'learning_rate': 3.1420724013566408e-06, 'epoch': 0.75} + 75%|███████▍ | 6548/8750 [3:03:15<3:31:07, 5.75s/it] {'loss': 0.4451, 'learning_rate': 3.1420724013566408e-06, 'epoch': 0.75} + 75%|███████▍ | 6548/8750 [3:03:18<3:31:07, 5.75s/it] 75%|███████▍ | 6549/8750 [3:03:23<3:30:34, 5.74s/it] 75%|███████▍ | 6549/8750 [3:03:21<3:30:34, 5.74s/it] {'loss': 0.4428, 'learning_rate': 3.1393788199911657e-06, 'epoch': 0.75} + 75%|███████▍ | 6549/8750 [3:03:23<3:30:34, 5.74s/it] {'loss': 0.4428, 'learning_rate': 3.1393788199911657e-06, 'epoch': 0.75} + 75%|███████▍ | 6549/8750 [3:03:21<3:30:34, 5.74s/it]9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... 
+1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +108 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 75%|███████▍ | 6550/8750 [3:03:29<3:30:29, 5.74s/it]76 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +11 AutoResumeHook: Checking whether to suspend... +05 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 75%|███████▍ | 6550/8750 [3:03:27<3:30:30, 5.74s/it]3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.468, 'learning_rate': 3.136686178683961e-06, 'epoch': 0.75} + 75%|███████▍ | 6550/8750 [3:03:29<3:30:29, 5.74s/it] {'loss': 0.468, 'learning_rate': 3.136686178683961e-06, 'epoch': 0.75} + 75%|███████▍ | 6550/8750 [3:03:27<3:30:30, 5.74s/it] 75%|███████▍ | 6551/8750 [3:03:35<3:29:38, 5.72s/it] 75%|███████▍ | 6551/8750 [3:03:33<3:29:38, 5.72s/it] {'loss': 0.4625, 'learning_rate': 3.1339944778039844e-06, 'epoch': 0.75} + 75%|███████▍ | 6551/8750 [3:03:35<3:29:38, 5.72s/it] {'loss': 0.4625, 'learning_rate': 3.1339944778039844e-06, 'epoch': 0.75} + 75%|███████▍ | 6551/8750 [3:03:33<3:29:38, 5.72s/it] 75%|███████▍ | 6552/8750 [3:03:41<3:29:40, 5.72s/it] 75%|███████▍ | 6552/8750 [3:03:38<3:29:40, 5.72s/it] {'loss': 0.4479, 'learning_rate': 3.131303717720053e-06, 'epoch': 0.75} + 75%|███████▍ | 6552/8750 [3:03:41<3:29:40, 5.72s/it] {'loss': 0.4479, 'learning_rate': 3.131303717720053e-06, 'epoch': 0.75} + 75%|███████▍ | 6552/8750 [3:03:38<3:29:40, 5.72s/it] 75%|███████▍ | 6553/8750 [3:03:47<3:32:04, 5.79s/it] 75%|███████▍ | 6553/8750 [3:03:44<3:32:03, 5.79s/it] {'loss': 0.4522, 'learning_rate': 3.128613898800864e-06, 'epoch': 0.75} + 75%|███████▍ | 6553/8750 [3:03:47<3:32:04, 5.79s/it]{'loss': 0.4522, 'learning_rate': 3.128613898800864e-06, 'epoch': 0.75} + 
75%|███████▍ | 6553/8750 [3:03:44<3:32:03, 5.79s/it] 75%|███████▍ | 6554/8750 [3:03:52<3:30:24, 5.75s/it] 75%|███████▍ | 6554/8750 [3:03:50<3:30:23, 5.75s/it] {'loss': 0.4741, 'learning_rate': 3.125925021414985e-06, 'epoch': 0.75} + {'loss': 0.4741, 'learning_rate': 3.125925021414985e-06, 'epoch': 0.75} 75%|███████▍ | 6554/8750 [3:03:52<3:30:24, 5.75s/it] + 75%|███████▍ | 6554/8750 [3:03:50<3:30:23, 5.75s/it] 75%|███████▍ | 6555/8750 [3:03:58<3:28:54, 5.71s/it] 75%|███████▍ | 6555/8750 [3:03:56<3:28:54, 5.71s/it] {'loss': 0.4563, 'learning_rate': 3.123237085930847e-06, 'epoch': 0.75} + 75%|███████▍ | 6555/8750 [3:03:58<3:28:54, 5.71s/it] {'loss': 0.4563, 'learning_rate': 3.123237085930847e-06, 'epoch': 0.75} + 75%|███████▍ | 6555/8750 [3:03:56<3:28:54, 5.71s/it] 75%|███████▍ | 6556/8750 [3:04:04<3:28:21, 5.70s/it] 75%|███████▍ | 6556/8750 [3:04:01<3:28:22, 5.70s/it] {'loss': 0.4503, 'learning_rate': 3.1205500927167644e-06, 'epoch': 0.75} + 75%|███████▍ | 6556/8750 [3:04:04<3:28:21, 5.70s/it] {'loss': 0.4503, 'learning_rate': 3.1205500927167644e-06, 'epoch': 0.75} + 75%|███████▍ | 6556/8750 [3:04:01<3:28:22, 5.70s/it] 75%|███████▍ | 6557/8750 [3:04:09<3:29:20, 5.73s/it] {'loss': 0.4515, 'learning_rate': 3.1178640421409057e-06, 'epoch': 0.75} + 75%|███████▍ | 6557/8750 [3:04:09<3:29:20, 5.73s/it] 75%|███████▍ | 6557/8750 [3:04:07<3:29:19, 5.73s/it] {'loss': 0.4515, 'learning_rate': 3.1178640421409057e-06, 'epoch': 0.75} + 75%|███████▍ | 6557/8750 [3:04:07<3:29:19, 5.73s/it] 75%|███████▍ | 6558/8750 [3:04:15<3:30:58, 5.77s/it] 75%|███████▍ | 6558/8750 [3:04:13<3:30:58, 5.78s/it] {'loss': 0.4671, 'learning_rate': 3.11517893457133e-06, 'epoch': 0.75} + 75%|███████▍ | 6558/8750 [3:04:15<3:30:58, 5.77s/it] {'loss': 0.4671, 'learning_rate': 3.11517893457133e-06, 'epoch': 0.75} + 75%|███████▍ | 6558/8750 [3:04:13<3:30:58, 5.78s/it] 75%|███████▍ | 6559/8750 [3:04:21<3:29:47, 5.75s/it] 75%|███████▍ | 6559/8750 [3:04:19<3:29:47, 5.75s/it] {'loss': 0.4659, 'learning_rate': 
3.112494770375951e-06, 'epoch': 0.75} + 75%|███████▍ | 6559/8750 [3:04:21<3:29:47, 5.75s/it] {'loss': 0.4659, 'learning_rate': 3.112494770375951e-06, 'epoch': 0.75} + 75%|███████▍ | 6559/8750 [3:04:19<3:29:47, 5.75s/it] 75%|███████▍ | 6560/8750 [3:04:27<3:28:53, 5.72s/it] 75%|███████▍ | 6560/8750 [3:04:24<3:28:53, 5.72s/it] {'loss': 0.4397, 'learning_rate': 3.1098115499225567e-06, 'epoch': 0.75} + 75%|███████▍ | 6560/8750 [3:04:27<3:28:53, 5.72s/it] {'loss': 0.4397, 'learning_rate': 3.1098115499225567e-06, 'epoch': 0.75} + 75%|███████▍ | 6560/8750 [3:04:24<3:28:53, 5.72s/it] 75%|███████▍ | 6561/8750 [3:04:32<3:28:29, 5.71s/it] 75%|███████▍ | 6561/8750 [3:04:30<3:28:29, 5.71s/it] {'loss': 0.4652, 'learning_rate': 3.107129273578815e-06, 'epoch': 0.75} + 75%|███████▍ | 6561/8750 [3:04:32<3:28:29, 5.71s/it] {'loss': 0.4652, 'learning_rate': 3.107129273578815e-06, 'epoch': 0.75} + 75%|███████▍ | 6561/8750 [3:04:30<3:28:29, 5.71s/it] 75%|███████▍ | 6562/8750 [3:04:38<3:27:10, 5.68s/it] 75%|███████▍ | 6562/8750 [3:04:36<3:27:10, 5.68s/it] {'loss': 0.4627, 'learning_rate': 3.104447941712251e-06, 'epoch': 0.75} + 75%|███████▍ | 6562/8750 [3:04:38<3:27:10, 5.68s/it] {'loss': 0.4627, 'learning_rate': 3.104447941712251e-06, 'epoch': 0.75} + 75%|███████▍ | 6562/8750 [3:04:36<3:27:10, 5.68s/it] 75%|███████▌ | 6563/8750 [3:04:43<3:26:25, 5.66s/it] {'loss': 0.5041, 'learning_rate': 3.1017675546902704e-06, 'epoch': 0.75} + 75%|███████▌ | 6563/8750 [3:04:41<3:26:25, 5.66s/it] 75%|███████▌ | 6563/8750 [3:04:43<3:26:25, 5.66s/it] {'loss': 0.5041, 'learning_rate': 3.1017675546902704e-06, 'epoch': 0.75} + 75%|███████▌ | 6563/8750 [3:04:41<3:26:25, 5.66s/it] 75%|███████▌ | 6564/8750 [3:04:49<3:26:47, 5.68s/it] 75%|███████▌ | 6564/8750 [3:04:47<3:26:47, 5.68s/it] {'loss': 0.4534, 'learning_rate': 3.0990881128801487e-06, 'epoch': 0.75} + 75%|███████▌ | 6564/8750 [3:04:49<3:26:47, 5.68s/it] {'loss': 0.4534, 'learning_rate': 3.0990881128801487e-06, 'epoch': 0.75} + 75%|███████▌ | 6564/8750 
[3:04:47<3:26:47, 5.68s/it] 75%|███████▌ | 6565/8750 [3:04:55<3:26:40, 5.68s/it] 75%|███████▌ | 6565/8750 [3:04:53<3:26:40, 5.68s/it] {'loss': 0.4516, 'learning_rate': 3.096409616649023e-06, 'epoch': 0.75} + 75%|███████▌ | 6565/8750 [3:04:55<3:26:40, 5.68s/it] {'loss': 0.4516, 'learning_rate': 3.096409616649023e-06, 'epoch': 0.75} + 75%|███████▌ | 6565/8750 [3:04:53<3:26:40, 5.68s/it] 75%|███████▌ | 6566/8750 [3:05:01<3:26:53, 5.68s/it] 75%|███████▌ | 6566/8750 [3:04:58<3:26:53, 5.68s/it] {'loss': 0.4326, 'learning_rate': 3.0937320663639148e-06, 'epoch': 0.75} + 75%|███████▌ | 6566/8750 [3:05:01<3:26:53, 5.68s/it] {'loss': 0.4326, 'learning_rate': 3.0937320663639148e-06, 'epoch': 0.75} + 75%|███████▌ | 6566/8750 [3:04:58<3:26:53, 5.68s/it] 75%|███████▌ | 6567/8750 [3:05:06<3:28:00, 5.72s/it] 75%|███████▌ | 6567/8750 [3:05:04<3:28:00, 5.72s/it] {'loss': 0.4685, 'learning_rate': 3.091055462391703e-06, 'epoch': 0.75} + 75%|███████▌ | 6567/8750 [3:05:06<3:28:00, 5.72s/it] {'loss': 0.4685, 'learning_rate': 3.091055462391703e-06, 'epoch': 0.75} + 75%|███████▌ | 6567/8750 [3:05:04<3:28:00, 5.72s/it] 75%|███████▌ | 6568/8750 [3:05:12<3:29:19, 5.76s/it] 75%|███████▌ | 6568/8750 [3:05:10<3:29:19, 5.76s/it] {'loss': 0.4364, 'learning_rate': 3.088379805099141e-06, 'epoch': 0.75} + 75%|███████▌ | 6568/8750 [3:05:12<3:29:19, 5.76s/it] {'loss': 0.4364, 'learning_rate': 3.088379805099141e-06, 'epoch': 0.75} + 75%|███████▌ | 6568/8750 [3:05:10<3:29:19, 5.76s/it] 75%|███████▌ | 6569/8750 [3:05:18<3:29:15, 5.76s/it] 75%|███████▌ | 6569/8750 [3:05:16<3:29:15, 5.76s/it] {'loss': 0.4616, 'learning_rate': 3.0857050948528576e-06, 'epoch': 0.75} + 75%|███████▌ | 6569/8750 [3:05:18<3:29:15, 5.76s/it] {'loss': 0.4616, 'learning_rate': 3.0857050948528576e-06, 'epoch': 0.75} + 75%|███████▌ | 6569/8750 [3:05:16<3:29:15, 5.76s/it] 75%|███████▌ | 6570/8750 [3:05:24<3:31:59, 5.83s/it] 75%|███████▌ | 6570/8750 [3:05:22<3:31:59, 5.83s/it] {'loss': 0.4496, 'learning_rate': 3.083031332019344e-06, 
'epoch': 0.75} + 75%|███████▌ | 6570/8750 [3:05:24<3:31:59, 5.83s/it] {'loss': 0.4496, 'learning_rate': 3.083031332019344e-06, 'epoch': 0.75} + 75%|███████▌ | 6570/8750 [3:05:22<3:31:59, 5.83s/it] 75%|███████▌ | 6571/8750 [3:05:30<3:29:50, 5.78s/it] 75%|███████▌ | 6571/8750 [3:05:27<3:29:50, 5.78s/it] {'loss': 0.4441, 'learning_rate': 3.0803585169649696e-06, 'epoch': 0.75} + 75%|███████▌ | 6571/8750 [3:05:30<3:29:50, 5.78s/it] {'loss': 0.4441, 'learning_rate': 3.0803585169649696e-06, 'epoch': 0.75} + 75%|███████▌ | 6571/8750 [3:05:27<3:29:50, 5.78s/it] 75%|███████▌ | 6572/8750 [3:05:35<3:28:25, 5.74s/it] 75%|███████▌ | 6572/8750 [3:05:33<3:28:25, 5.74s/it] {'loss': 0.4711, 'learning_rate': 3.0776866500559654e-06, 'epoch': 0.75} + 75%|███████▌ | 6572/8750 [3:05:35<3:28:25, 5.74s/it] {'loss': 0.4711, 'learning_rate': 3.0776866500559654e-06, 'epoch': 0.75} + 75%|███████▌ | 6572/8750 [3:05:33<3:28:25, 5.74s/it] 75%|███████▌ | 6573/8750 [3:05:41<3:28:13, 5.74s/it] 75%|███████▌ | 6573/8750 [3:05:39<3:28:13, 5.74s/it] {'loss': 0.4634, 'learning_rate': 3.0750157316584375e-06, 'epoch': 0.75} + 75%|███████▌ | 6573/8750 [3:05:41<3:28:13, 5.74s/it] {'loss': 0.4634, 'learning_rate': 3.0750157316584375e-06, 'epoch': 0.75} + 75%|███████▌ | 6573/8750 [3:05:39<3:28:13, 5.74s/it] 75%|███████▌ | 6574/8750 [3:05:47<3:30:02, 5.79s/it] 75%|███████▌ | 6574/8750 [3:05:45<3:30:02, 5.79s/it] {'loss': 0.4329, 'learning_rate': 3.0723457621383666e-06, 'epoch': 0.75} + {'loss': 0.4329, 'learning_rate': 3.0723457621383666e-06, 'epoch': 0.75} 75%|███████▌ | 6574/8750 [3:05:47<3:30:02, 5.79s/it] + 75%|███████▌ | 6574/8750 [3:05:45<3:30:02, 5.79s/it] 75%|███████▌ | 6575/8750 [3:05:53<3:30:01, 5.79s/it] 75%|███████▌ | 6575/8750 [3:05:50<3:30:01, 5.79s/it] {'loss': 0.4494, 'learning_rate': 3.0696767418615945e-06, 'epoch': 0.75} + 75%|███████▌ | 6575/8750 [3:05:53<3:30:01, 5.79s/it] {'loss': 0.4494, 'learning_rate': 3.0696767418615945e-06, 'epoch': 0.75} + 75%|███████▌ | 6575/8750 [3:05:50<3:30:01, 
5.79s/it] 75%|███████▌ | 6576/8750 [3:05:58<3:29:57, 5.79s/it] 75%|███████▌ | 6576/8750 [3:05:56<3:29:57, 5.79s/it] {'loss': 0.4572, 'learning_rate': 3.067008671193833e-06, 'epoch': 0.75} + {'loss': 0.4572, 'learning_rate': 3.067008671193833e-06, 'epoch': 0.75} + 75%|███████▌ | 6576/8750 [3:05:58<3:29:57, 5.79s/it] 75%|███████▌ | 6576/8750 [3:05:56<3:29:57, 5.79s/it] 75%|███████▌ | 6577/8750 [3:06:04<3:28:07, 5.75s/it] 75%|███████▌ | 6577/8750 [3:06:02<3:28:07, 5.75s/it] {'loss': 0.4618, 'learning_rate': 3.0643415505006733e-06, 'epoch': 0.75} + 75%|███████▌ | 6577/8750 [3:06:04<3:28:07, 5.75s/it] {'loss': 0.4618, 'learning_rate': 3.0643415505006733e-06, 'epoch': 0.75} + 75%|███████▌ | 6577/8750 [3:06:02<3:28:07, 5.75s/it] 75%|███████▌ | 6578/8750 [3:06:10<3:26:17, 5.70s/it] 75%|███████▌ | 6578/8750 [3:06:07<3:26:17, 5.70s/it] {'loss': 0.4343, 'learning_rate': 3.0616753801475653e-06, 'epoch': 0.75} + 75%|███████▌ | 6578/8750 [3:06:10<3:26:17, 5.70s/it] {'loss': 0.4343, 'learning_rate': 3.0616753801475653e-06, 'epoch': 0.75} + 75%|███████▌ | 6578/8750 [3:06:07<3:26:17, 5.70s/it] 75%|███████▌ | 6579/8750 [3:06:16<3:27:38, 5.74s/it] 75%|███████▌ | 6579/8750 [3:06:13<3:27:38, 5.74s/it] {'loss': 0.4473, 'learning_rate': 3.059010160499839e-06, 'epoch': 0.75} + 75%|███████▌ | 6579/8750 [3:06:16<3:27:38, 5.74s/it] {'loss': 0.4473, 'learning_rate': 3.059010160499839e-06, 'epoch': 0.75} + 75%|███████▌ | 6579/8750 [3:06:13<3:27:38, 5.74s/it] 75%|███████▌ | 6580/8750 [3:06:21<3:26:14, 5.70s/it] 75%|███████▌ | 6580/8750 [3:06:19<3:26:14, 5.70s/it] {'loss': 0.4555, 'learning_rate': 3.056345891922684e-06, 'epoch': 0.75} + 75%|███████▌ | 6580/8750 [3:06:21<3:26:14, 5.70s/it] {'loss': 0.4555, 'learning_rate': 3.056345891922684e-06, 'epoch': 0.75} + 75%|███████▌ | 6580/8750 [3:06:19<3:26:14, 5.70s/it] 75%|███████▌ | 6581/8750 [3:06:27<3:28:35, 5.77s/it] 75%|███████▌ | 6581/8750 [3:06:25<3:28:35, 5.77s/it] {'loss': 0.4598, 'learning_rate': 3.0536825747811695e-06, 'epoch': 0.75} + 
75%|███████▌ | 6581/8750 [3:06:27<3:28:35, 5.77s/it] {'loss': 0.4598, 'learning_rate': 3.0536825747811695e-06, 'epoch': 0.75} + 75%|███████▌ | 6581/8750 [3:06:25<3:28:35, 5.77s/it] 75%|███████▌ | 6582/8750 [3:06:33<3:29:12, 5.79s/it] 75%|███████▌ | 6582/8750 [3:06:31<3:29:12, 5.79s/it]{'loss': 0.4601, 'learning_rate': 3.0510202094402242e-06, 'epoch': 0.75} + 75%|███████▌ | 6582/8750 [3:06:33<3:29:12, 5.79s/it] {'loss': 0.4601, 'learning_rate': 3.0510202094402242e-06, 'epoch': 0.75} + 75%|███████▌ | 6582/8750 [3:06:31<3:29:12, 5.79s/it] 75%|███████▌ | 6583/8750 [3:06:39<3:29:03, 5.79s/it] 75%|███████▌ | 6583/8750 [3:06:36<3:29:03, 5.79s/it] {'loss': 0.4466, 'learning_rate': 3.0483587962646545e-06, 'epoch': 0.75} + 75%|███████▌ | 6583/8750 [3:06:39<3:29:03, 5.79s/it] {'loss': 0.4466, 'learning_rate': 3.0483587962646545e-06, 'epoch': 0.75} + 75%|███████▌ | 6583/8750 [3:06:36<3:29:03, 5.79s/it] 75%|███████▌ | 6584/8750 [3:06:45<3:30:14, 5.82s/it] 75%|███████▌ | 6584/8750 [3:06:42<3:30:14, 5.82s/it] {'loss': 0.4614, 'learning_rate': 3.045698335619135e-06, 'epoch': 0.75} + 75%|███████▌ | 6584/8750 [3:06:45<3:30:14, 5.82s/it] {'loss': 0.4614, 'learning_rate': 3.045698335619135e-06, 'epoch': 0.75} + 75%|███████▌ | 6584/8750 [3:06:42<3:30:14, 5.82s/it] 75%|███████▌ | 6585/8750 [3:06:50<3:27:10, 5.74s/it] 75%|███████▌ | 6585/8750 [3:06:48<3:27:10, 5.74s/it] {'loss': 0.4739, 'learning_rate': 3.0430388278682078e-06, 'epoch': 0.75} + 75%|███████▌ | 6585/8750 [3:06:50<3:27:10, 5.74s/it] {'loss': 0.4739, 'learning_rate': 3.0430388278682078e-06, 'epoch': 0.75} + 75%|███████▌ | 6585/8750 [3:06:48<3:27:10, 5.74s/it] 75%|███████▌ | 6586/8750 [3:06:56<3:25:43, 5.70s/it] 75%|███████▌ | 6586/8750 [3:06:54<3:25:43, 5.70s/it] {'loss': 0.4521, 'learning_rate': 3.04038027337628e-06, 'epoch': 0.75} + 75%|███████▌ | 6586/8750 [3:06:56<3:25:43, 5.70s/it] {'loss': 0.4521, 'learning_rate': 3.04038027337628e-06, 'epoch': 0.75} + 75%|███████▌ | 6586/8750 [3:06:54<3:25:43, 5.70s/it] 75%|███████▌ | 
6587/8750 [3:07:01<3:24:26, 5.67s/it] 75%|███████▌ | 6587/8750 [3:06:59<3:24:25, 5.67s/it] {'loss': 0.4688, 'learning_rate': 3.0377226725076394e-06, 'epoch': 0.75} + 75%|███████▌ | 6587/8750 [3:07:01<3:24:26, 5.67s/it] {'loss': 0.4688, 'learning_rate': 3.0377226725076394e-06, 'epoch': 0.75} + 75%|███████▌ | 6587/8750 [3:06:59<3:24:25, 5.67s/it] 75%|███████▌ | 6588/8750 [3:07:07<3:23:51, 5.66s/it] 75%|███████▌ | 6588/8750 [3:07:05<3:23:51, 5.66s/it] {'loss': 0.4473, 'learning_rate': 3.035066025626434e-06, 'epoch': 0.75} + 75%|███████▌ | 6588/8750 [3:07:07<3:23:51, 5.66s/it] {'loss': 0.4473, 'learning_rate': 3.035066025626434e-06, 'epoch': 0.75} + 75%|███████▌ | 6588/8750 [3:07:05<3:23:51, 5.66s/it] 75%|███████▌ | 6589/8750 [3:07:13<3:23:28, 5.65s/it] 75%|███████▌ | 6589/8750 [3:07:10<3:23:28, 5.65s/it] {'loss': 0.4427, 'learning_rate': 3.0324103330966804e-06, 'epoch': 0.75} + {'loss': 0.4427, 'learning_rate': 3.0324103330966804e-06, 'epoch': 0.75} 75%|███████▌ | 6589/8750 [3:07:13<3:23:28, 5.65s/it] + 75%|███████▌ | 6589/8750 [3:07:10<3:23:28, 5.65s/it] 75%|███████▌ | 6590/8750 [3:07:19<3:26:00, 5.72s/it] 75%|███████▌ | 6590/8750 [3:07:16<3:26:00, 5.72s/it] {'loss': 0.4678, 'learning_rate': 3.02975559528227e-06, 'epoch': 0.75} + 75%|███████▌ | 6590/8750 [3:07:19<3:26:00, 5.72s/it] {'loss': 0.4678, 'learning_rate': 3.02975559528227e-06, 'epoch': 0.75} + 75%|███████▌ | 6590/8750 [3:07:16<3:26:00, 5.72s/it] 75%|███████▌ | 6591/8750 [3:07:24<3:24:27, 5.68s/it] 75%|███████▌ | 6591/8750 [3:07:22<3:24:27, 5.68s/it] {'loss': 0.4627, 'learning_rate': 3.027101812546965e-06, 'epoch': 0.75} + 75%|███████▌ | 6591/8750 [3:07:24<3:24:27, 5.68s/it] {'loss': 0.4627, 'learning_rate': 3.027101812546965e-06, 'epoch': 0.75} + 75%|███████▌ | 6591/8750 [3:07:22<3:24:27, 5.68s/it] 75%|███████▌ | 6592/8750 [3:07:30<3:25:49, 5.72s/it] 75%|███████▌ | 6592/8750 [3:07:28<3:25:49, 5.72s/it] {'loss': 0.4567, 'learning_rate': 3.024448985254387e-06, 'epoch': 0.75} + 75%|███████▌ | 6592/8750 
[3:07:30<3:25:49, 5.72s/it] {'loss': 0.4567, 'learning_rate': 3.024448985254387e-06, 'epoch': 0.75} + 75%|███████▌ | 6592/8750 [3:07:28<3:25:49, 5.72s/it] 75%|███████▌ | 6593/8750 [3:07:36<3:24:46, 5.70s/it] 75%|███████▌ | 6593/8750 [3:07:33<3:24:46, 5.70s/it] {'loss': 0.4516, 'learning_rate': 3.021797113768039e-06, 'epoch': 0.75} + 75%|███████▌ | 6593/8750 [3:07:36<3:24:46, 5.70s/it] {'loss': 0.4516, 'learning_rate': 3.021797113768039e-06, 'epoch': 0.75} + 75%|███████▌ | 6593/8750 [3:07:33<3:24:46, 5.70s/it] 75%|███████▌ | 6594/8750 [3:07:42<3:27:37, 5.78s/it] 75%|███████▌ | 6594/8750 [3:07:39<3:27:37, 5.78s/it] {'loss': 0.4481, 'learning_rate': 3.0191461984512794e-06, 'epoch': 0.75} + 75%|███████▌ | 6594/8750 [3:07:42<3:27:37, 5.78s/it] {'loss': 0.4481, 'learning_rate': 3.0191461984512794e-06, 'epoch': 0.75} + 75%|███████▌ | 6594/8750 [3:07:39<3:27:37, 5.78s/it] 75%|███████▌ | 6595/8750 [3:07:47<3:25:46, 5.73s/it] 75%|███████▌ | 6595/8750 [3:07:45<3:25:46, 5.73s/it] {'loss': 0.4717, 'learning_rate': 3.016496239667349e-06, 'epoch': 0.75} + {'loss': 0.4717, 'learning_rate': 3.016496239667349e-06, 'epoch': 0.75} + 75%|███████▌ | 6595/8750 [3:07:47<3:25:46, 5.73s/it] 75%|███████▌ | 6595/8750 [3:07:45<3:25:46, 5.73s/it] 75%|███████▌ | 6596/8750 [3:07:53<3:25:33, 5.73s/it] 75%|███████▌ | 6596/8750 [3:07:51<3:25:33, 5.73s/it] {'loss': 0.4614, 'learning_rate': 3.013847237779346e-06, 'epoch': 0.75} + 75%|███████▌ | 6596/8750 [3:07:51<3:25:33, 5.73s/it] {'loss': 0.4614, 'learning_rate': 3.013847237779346e-06, 'epoch': 0.75} + 75%|███████▌ | 6596/8750 [3:07:53<3:25:33, 5.73s/it] 75%|███████▌ | 6597/8750 [3:07:59<3:26:22, 5.75s/it] 75%|███████▌ | 6597/8750 [3:07:56<3:26:22, 5.75s/it] {'loss': 0.4335, 'learning_rate': 3.0111991931502484e-06, 'epoch': 0.75} + 75%|███████▌ | 6597/8750 [3:07:59<3:26:22, 5.75s/it] {'loss': 0.4335, 'learning_rate': 3.0111991931502484e-06, 'epoch': 0.75} + 75%|███████▌ | 6597/8750 [3:07:56<3:26:22, 5.75s/it] 75%|███████▌ | 6598/8750 
[3:08:04<3:26:14, 5.75s/it] 75%|███████▌ | 6598/8750 [3:08:02<3:26:15, 5.75s/it] {'loss': 0.4489, 'learning_rate': 3.0085521061428945e-06, 'epoch': 0.75} + 75%|███████▌ | 6598/8750 [3:08:04<3:26:14, 5.75s/it] {'loss': 0.4489, 'learning_rate': 3.0085521061428945e-06, 'epoch': 0.75} + 75%|███████▌ | 6598/8750 [3:08:02<3:26:15, 5.75s/it] 75%|███████▌ | 6599/8750 [3:08:10<3:25:16, 5.73s/it] 75%|███████▌ | 6599/8750 [3:08:08<3:25:16, 5.73s/it] {'loss': 0.4609, 'learning_rate': 3.005905977119992e-06, 'epoch': 0.75} + 75%|███████▌ | 6599/8750 [3:08:08<3:25:16, 5.73s/it]{'loss': 0.4609, 'learning_rate': 3.005905977119992e-06, 'epoch': 0.75} + 75%|███████▌ | 6599/8750 [3:08:10<3:25:16, 5.73s/it]914 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +6 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +158 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend...7 AutoResumeHook: Checking whether to suspend... + + 75%|███████▌ | 6600/8750 [3:08:16<3:28:54, 5.83s/it]1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + 75%|███████▌ | 6600/8750 [3:08:14<3:28:54, 5.83s/it]3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4218, 'learning_rate': 3.0032608064441206e-06, 'epoch': 0.75} + 75%|███████▌ | 6600/8750 [3:08:16<3:28:54, 5.83s/it] {'loss': 0.4218, 'learning_rate': 3.0032608064441206e-06, 'epoch': 0.75} + 75%|███████▌ | 6600/8750 [3:08:14<3:28:54, 5.83s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 75%|███████▌ | 6601/8750 [3:08:39<6:31:31, 10.93s/it] 75%|███████▌ | 6601/8750 [3:08:37<6:31:31, 10.93s/it] {'loss': 0.4688, 'learning_rate': 3.0006165944777333e-06, 'epoch': 0.75} + 75%|███████▌ | 6601/8750 [3:08:39<6:31:31, 10.93s/it] {'loss': 0.4688, 'learning_rate': 3.0006165944777333e-06, 'epoch': 0.75} + 75%|███████▌ | 6601/8750 [3:08:37<6:31:31, 10.93s/it] 75%|███████▌ | 6602/8750 [3:08:45<5:35:51, 9.38s/it] 75%|███████▌ | 6602/8750 [3:08:43<5:35:51, 9.38s/it] {'loss': 0.4433, 'learning_rate': 2.997973341583138e-06, 'epoch': 0.75} + 75%|███████▌ | 6602/8750 [3:08:45<5:35:51, 9.38s/it] {'loss': 0.4433, 'learning_rate': 2.997973341583138e-06, 'epoch': 0.75} + 75%|███████▌ | 6602/8750 [3:08:43<5:35:51, 9.38s/it] 75%|███████▌ | 6603/8750 [3:08:50<4:55:24, 8.26s/it] 75%|███████▌ | 6603/8750 [3:08:48<4:55:24, 8.26s/it] {'loss': 0.4752, 'learning_rate': 2.9953310481225275e-06, 'epoch': 0.75} + 75%|███████▌ | 6603/8750 [3:08:50<4:55:24, 8.26s/it] {'loss': 0.4752, 'learning_rate': 2.9953310481225275e-06, 'epoch': 0.75} + 75%|███████▌ | 6603/8750 [3:08:48<4:55:24, 8.26s/it] 75%|███████▌ | 6604/8750 [3:08:56<4:26:28, 7.45s/it] 75%|███████▌ | 6604/8750 [3:08:54<4:26:28, 7.45s/it] {'loss': 0.4381, 'learning_rate': 2.992689714457947e-06, 'epoch': 0.75} + 75%|███████▌ | 6604/8750 [3:08:56<4:26:28, 7.45s/it] {'loss': 0.4381, 'learning_rate': 2.992689714457947e-06, 'epoch': 0.75} + 75%|███████▌ | 6604/8750 [3:08:54<4:26:28, 7.45s/it] 75%|███████▌ | 6605/8750 [3:09:02<4:07:39, 6.93s/it] 75%|███████▌ | 6605/8750 [3:08:59<4:07:40, 6.93s/it] {'loss': 0.459, 'learning_rate': 2.9900493409513256e-06, 'epoch': 0.75} + 75%|███████▌ | 6605/8750 [3:09:02<4:07:39, 6.93s/it] {'loss': 0.459, 'learning_rate': 2.9900493409513256e-06, 'epoch': 0.75} + 75%|███████▌ | 6605/8750 [3:08:59<4:07:40, 6.93s/it] 75%|███████▌ | 6606/8750 [3:09:08<3:55:56, 6.60s/it] 75%|███████▌ | 6606/8750 [3:09:05<3:55:56, 6.60s/it] {'loss': 0.4376, 'learning_rate': 2.9874099279644487e-06, 'epoch': 
0.75} + 75%|███████▌ | 6606/8750 [3:09:08<3:55:56, 6.60s/it] {'loss': 0.4376, 'learning_rate': 2.9874099279644487e-06, 'epoch': 0.75} + 75%|███████▌ | 6606/8750 [3:09:05<3:55:56, 6.60s/it] 76%|███████▌ | 6607/8750 [3:09:13<3:45:25, 6.31s/it] 76%|███████▌ | 6607/8750 [3:09:11<3:45:25, 6.31s/it] {'loss': 0.4624, 'learning_rate': 2.984771475858973e-06, 'epoch': 0.76} + 76%|███████▌ | 6607/8750 [3:09:13<3:45:25, 6.31s/it] {'loss': 0.4624, 'learning_rate': 2.984771475858973e-06, 'epoch': 0.76} + 76%|███████▌ | 6607/8750 [3:09:11<3:45:25, 6.31s/it] 76%|███████▌ | 6608/8750 [3:09:19<3:39:51, 6.16s/it] 76%|███████▌ | 6608/8750 [3:09:17<3:39:50, 6.16s/it] {'loss': 0.4495, 'learning_rate': 2.9821339849964324e-06, 'epoch': 0.76} + 76%|███████▌ | 6608/8750 [3:09:19<3:39:51, 6.16s/it] {'loss': 0.4495, 'learning_rate': 2.9821339849964324e-06, 'epoch': 0.76} + 76%|███████▌ | 6608/8750 [3:09:17<3:39:50, 6.16s/it] 76%|███████▌ | 6609/8750 [3:09:25<3:36:06, 6.06s/it] 76%|███████▌ | 6609/8750 [3:09:23<3:36:06, 6.06s/it] {'loss': 0.4727, 'learning_rate': 2.979497455738214e-06, 'epoch': 0.76} + 76%|███████▌ | 6609/8750 [3:09:25<3:36:06, 6.06s/it] {'loss': 0.4727, 'learning_rate': 2.979497455738214e-06, 'epoch': 0.76} + 76%|███████▌ | 6609/8750 [3:09:23<3:36:06, 6.06s/it] 76%|███████▌ | 6610/8750 [3:09:31<3:37:30, 6.10s/it] 76%|███████▌ | 6610/8750 [3:09:29<3:37:30, 6.10s/it] {'loss': 0.4405, 'learning_rate': 2.976861888445586e-06, 'epoch': 0.76} + 76%|███████▌ | 6610/8750 [3:09:31<3:37:30, 6.10s/it] {'loss': 0.4405, 'learning_rate': 2.976861888445586e-06, 'epoch': 0.76} + 76%|███████▌ | 6610/8750 [3:09:29<3:37:30, 6.10s/it] 76%|███████▌ | 6611/8750 [3:09:36<3:30:20, 5.90s/it] 76%|███████▌ | 6611/8750 [3:09:34<3:30:20, 5.90s/it] {'loss': 0.4884, 'learning_rate': 2.9742272834796813e-06, 'epoch': 0.76} + 76%|███████▌ | 6611/8750 [3:09:36<3:30:20, 5.90s/it] {'loss': 0.4884, 'learning_rate': 2.9742272834796813e-06, 'epoch': 0.76} + 76%|███████▌ | 6611/8750 [3:09:34<3:30:20, 5.90s/it] 
76%|███████▌ | 6612/8750 [3:09:42<3:29:15, 5.87s/it] 76%|███████▌ | 6612/8750 [3:09:40<3:29:15, 5.87s/it] {'loss': 0.4645, 'learning_rate': 2.9715936412014945e-06, 'epoch': 0.76} + 76%|███████▌ | 6612/8750 [3:09:42<3:29:15, 5.87s/it] {'loss': 0.4645, 'learning_rate': 2.9715936412014945e-06, 'epoch': 0.76} + 76%|███████▌ | 6612/8750 [3:09:40<3:29:15, 5.87s/it] 76%|███████▌ | 6613/8750 [3:09:48<3:28:24, 5.85s/it] 76%|███████▌ | 6613/8750 [3:09:46<3:28:24, 5.85s/it]{'loss': 0.4535, 'learning_rate': 2.9689609619718996e-06, 'epoch': 0.76} + {'loss': 0.4535, 'learning_rate': 2.9689609619718996e-06, 'epoch': 0.76} + 76%|███████▌ | 6613/8750 [3:09:48<3:28:24, 5.85s/it] 76%|███████▌ | 6613/8750 [3:09:46<3:28:24, 5.85s/it] 76%|███████▌ | 6614/8750 [3:09:54<3:26:18, 5.80s/it] 76%|███████▌ | 6614/8750 [3:09:51<3:26:19, 5.80s/it] {'loss': 0.4459, 'learning_rate': 2.966329246151626e-06, 'epoch': 0.76} + 76%|███████▌ | 6614/8750 [3:09:54<3:26:18, 5.80s/it] {'loss': 0.4459, 'learning_rate': 2.966329246151626e-06, 'epoch': 0.76} + 76%|███████▌ | 6614/8750 [3:09:51<3:26:19, 5.80s/it] 76%|███████▌ | 6615/8750 [3:09:59<3:24:38, 5.75s/it] 76%|███████▌ | 6615/8750 [3:09:57<3:24:38, 5.75s/it] {'loss': 0.4543, 'learning_rate': 2.9636984941012835e-06, 'epoch': 0.76} + 76%|███████▌ | 6615/8750 [3:09:59<3:24:38, 5.75s/it] {'loss': 0.4543, 'learning_rate': 2.9636984941012835e-06, 'epoch': 0.76} + 76%|███████▌ | 6615/8750 [3:09:57<3:24:38, 5.75s/it] 76%|███████▌ | 6616/8750 [3:10:05<3:27:28, 5.83s/it] 76%|███████▌ | 6616/8750 [3:10:03<3:27:27, 5.83s/it] {'loss': 0.4395, 'learning_rate': 2.9610687061813405e-06, 'epoch': 0.76} + 76%|███████▌ | 6616/8750 [3:10:05<3:27:28, 5.83s/it] {'loss': 0.4395, 'learning_rate': 2.9610687061813405e-06, 'epoch': 0.76} + 76%|███████▌ | 6616/8750 [3:10:03<3:27:27, 5.83s/it] 76%|███████▌ | 6617/8750 [3:10:11<3:28:30, 5.87s/it] 76%|███████▌ | 6617/8750 [3:10:09<3:28:30, 5.87s/it] {'loss': 0.4394, 'learning_rate': 2.9584398827521343e-06, 'epoch': 0.76} + 
76%|███████▌ | 6617/8750 [3:10:11<3:28:30, 5.87s/it] {'loss': 0.4394, 'learning_rate': 2.9584398827521343e-06, 'epoch': 0.76} + 76%|███████▌ | 6617/8750 [3:10:09<3:28:30, 5.87s/it] 76%|███████▌ | 6618/8750 [3:10:15<3:27:31, 5.84s/it] 76%|███████▌ | 6618/8750 [3:10:17<3:27:31, 5.84s/it] {'loss': 0.451, 'learning_rate': 2.9558120241738786e-06, 'epoch': 0.76} + 76%|███████▌ | 6618/8750 [3:10:15<3:27:31, 5.84s/it]{'loss': 0.451, 'learning_rate': 2.9558120241738786e-06, 'epoch': 0.76} + 76%|███████▌ | 6618/8750 [3:10:17<3:27:31, 5.84s/it] 76%|███████▌ | 6619/8750 [3:10:23<3:25:36, 5.79s/it] 76%|███████▌ | 6619/8750 [3:10:20<3:25:36, 5.79s/it] {'loss': 0.468, 'learning_rate': 2.9531851308066426e-06, 'epoch': 0.76} + 76%|███████▌ | 6619/8750 [3:10:23<3:25:36, 5.79s/it] {'loss': 0.468, 'learning_rate': 2.9531851308066426e-06, 'epoch': 0.76} + 76%|███████▌ | 6619/8750 [3:10:20<3:25:36, 5.79s/it] 76%|███████▌ | 6620/8750 [3:10:28<3:23:58, 5.75s/it] 76%|███████▌ | 6620/8750 [3:10:26<3:23:57, 5.75s/it] {'loss': 0.4386, 'learning_rate': 2.950559203010371e-06, 'epoch': 0.76} + 76%|███████▌ | 6620/8750 [3:10:28<3:23:58, 5.75s/it] {'loss': 0.4386, 'learning_rate': 2.950559203010371e-06, 'epoch': 0.76} + 76%|███████▌ | 6620/8750 [3:10:26<3:23:57, 5.75s/it] 76%|███████▌ | 6621/8750 [3:10:34<3:22:37, 5.71s/it] 76%|███████▌ | 6621/8750 [3:10:32<3:22:37, 5.71s/it] {'loss': 0.4552, 'learning_rate': 2.9479342411448797e-06, 'epoch': 0.76} + 76%|███████▌ | 6621/8750 [3:10:34<3:22:37, 5.71s/it] {'loss': 0.4552, 'learning_rate': 2.9479342411448797e-06, 'epoch': 0.76} + 76%|███████▌ | 6621/8750 [3:10:32<3:22:37, 5.71s/it] 76%|███████▌ | 6622/8750 [3:10:40<3:21:06, 5.67s/it] 76%|███████▌ | 6622/8750 [3:10:37<3:21:06, 5.67s/it] {'loss': 0.4542, 'learning_rate': 2.945310245569839e-06, 'epoch': 0.76} + 76%|███████▌ | 6622/8750 [3:10:40<3:21:06, 5.67s/it] {'loss': 0.4542, 'learning_rate': 2.945310245569839e-06, 'epoch': 0.76} + 76%|███████▌ | 6622/8750 [3:10:37<3:21:06, 5.67s/it] 76%|███████▌ | 
6623/8750 [3:10:45<3:22:07, 5.70s/it] 76%|███████▌ | 6623/8750 [3:10:43<3:22:07, 5.70s/it] {'loss': 0.4395, 'learning_rate': 2.942687216644803e-06, 'epoch': 0.76} + 76%|███████▌ | 6623/8750 [3:10:45<3:22:07, 5.70s/it] {'loss': 0.4395, 'learning_rate': 2.942687216644803e-06, 'epoch': 0.76} + 76%|███████▌ | 6623/8750 [3:10:43<3:22:07, 5.70s/it] 76%|███████▌ | 6624/8750 [3:10:51<3:21:25, 5.68s/it] 76%|███████▌ | 6624/8750 [3:10:49<3:21:25, 5.68s/it] {'loss': 0.4551, 'learning_rate': 2.94006515472918e-06, 'epoch': 0.76} + 76%|███████▌ | 6624/8750 [3:10:51<3:21:25, 5.68s/it] {'loss': 0.4551, 'learning_rate': 2.94006515472918e-06, 'epoch': 0.76} + 76%|███████▌ | 6624/8750 [3:10:49<3:21:25, 5.68s/it] 76%|███████▌ | 6625/8750 [3:10:57<3:21:21, 5.69s/it] 76%|███████▌ | 6625/8750 [3:10:54<3:21:21, 5.69s/it] {'loss': 0.4472, 'learning_rate': 2.9374440601822495e-06, 'epoch': 0.76} + 76%|███████▌ | 6625/8750 [3:10:57<3:21:21, 5.69s/it] {'loss': 0.4472, 'learning_rate': 2.9374440601822495e-06, 'epoch': 0.76} + 76%|███████▌ | 6625/8750 [3:10:54<3:21:21, 5.69s/it] 76%|███████▌ | 6626/8750 [3:11:03<3:25:41, 5.81s/it] 76%|███████▌ | 6626/8750 [3:11:01<3:25:41, 5.81s/it] {'loss': 0.4362, 'learning_rate': 2.9348239333631655e-06, 'epoch': 0.76} + 76%|███████▌ | 6626/8750 [3:11:03<3:25:41, 5.81s/it]{'loss': 0.4362, 'learning_rate': 2.9348239333631655e-06, 'epoch': 0.76} + 76%|███████▌ | 6626/8750 [3:11:01<3:25:41, 5.81s/it] 76%|███████▌ | 6627/8750 [3:11:09<3:24:53, 5.79s/it] 76%|███████▌ | 6627/8750 [3:11:06<3:24:53, 5.79s/it] {'loss': 0.4542, 'learning_rate': 2.9322047746309377e-06, 'epoch': 0.76} + 76%|███████▌ | 6627/8750 [3:11:09<3:24:53, 5.79s/it] {'loss': 0.4542, 'learning_rate': 2.9322047746309377e-06, 'epoch': 0.76} + 76%|███████▌ | 6627/8750 [3:11:06<3:24:53, 5.79s/it] 76%|███████▌ | 6628/8750 [3:11:14<3:23:24, 5.75s/it] 76%|███████▌ | 6628/8750 [3:11:12<3:23:24, 5.75s/it] {'loss': 0.4466, 'learning_rate': 2.929586584344456e-06, 'epoch': 0.76} + 76%|███████▌ | 6628/8750 
[3:11:14<3:23:24, 5.75s/it] {'loss': 0.4466, 'learning_rate': 2.929586584344456e-06, 'epoch': 0.76} + 76%|███████▌ | 6628/8750 [3:11:12<3:23:24, 5.75s/it] 76%|███████▌ | 6629/8750 [3:11:20<3:22:41, 5.73s/it] 76%|███████▌ | 6629/8750 [3:11:18<3:22:41, 5.73s/it] {'loss': 0.4727, 'learning_rate': 2.926969362862465e-06, 'epoch': 0.76} + 76%|███████▌ | 6629/8750 [3:11:20<3:22:41, 5.73s/it] {'loss': 0.4727, 'learning_rate': 2.926969362862465e-06, 'epoch': 0.76} + 76%|███████▌ | 6629/8750 [3:11:18<3:22:41, 5.73s/it] 76%|███████▌ | 6630/8750 [3:11:26<3:23:03, 5.75s/it] 76%|███████▌ | 6630/8750 [3:11:23<3:23:03, 5.75s/it] {'loss': 0.4393, 'learning_rate': 2.924353110543584e-06, 'epoch': 0.76} + 76%|███████▌ | 6630/8750 [3:11:26<3:23:03, 5.75s/it] {'loss': 0.4393, 'learning_rate': 2.924353110543584e-06, 'epoch': 0.76} + 76%|███████▌ | 6630/8750 [3:11:23<3:23:03, 5.75s/it] 76%|███████▌ | 6631/8750 [3:11:32<3:23:47, 5.77s/it] 76%|███████▌ | 6631/8750 [3:11:29<3:23:47, 5.77s/it] {'loss': 0.4731, 'learning_rate': 2.9217378277463025e-06, 'epoch': 0.76} + 76%|███████▌ | 6631/8750 [3:11:32<3:23:47, 5.77s/it] {'loss': 0.4731, 'learning_rate': 2.9217378277463025e-06, 'epoch': 0.76} + 76%|███████▌ | 6631/8750 [3:11:29<3:23:47, 5.77s/it] 76%|███████▌ | 6632/8750 [3:11:37<3:23:38, 5.77s/it] 76%|███████▌ | 6632/8750 [3:11:35<3:23:38, 5.77s/it] {'loss': 0.4454, 'learning_rate': 2.919123514828969e-06, 'epoch': 0.76} + 76%|███████▌ | 6632/8750 [3:11:37<3:23:38, 5.77s/it] {'loss': 0.4454, 'learning_rate': 2.919123514828969e-06, 'epoch': 0.76} + 76%|███████▌ | 6632/8750 [3:11:35<3:23:38, 5.77s/it] 76%|███████▌ | 6633/8750 [3:11:43<3:23:51, 5.78s/it] 76%|███████▌ | 6633/8750 [3:11:41<3:23:51, 5.78s/it] {'loss': 0.4309, 'learning_rate': 2.916510172149799e-06, 'epoch': 0.76} + 76%|███████▌ | 6633/8750 [3:11:43<3:23:51, 5.78s/it] {'loss': 0.4309, 'learning_rate': 2.916510172149799e-06, 'epoch': 0.76} + 76%|███████▌ | 6633/8750 [3:11:41<3:23:51, 5.78s/it] 76%|███████▌ | 6634/8750 [3:11:49<3:25:44, 
5.83s/it] 76%|███████▌ | 6634/8750 [3:11:47<3:25:44, 5.83s/it] {'loss': 0.4529, 'learning_rate': 2.913897800066887e-06, 'epoch': 0.76} + 76%|███████▌ | 6634/8750 [3:11:49<3:25:44, 5.83s/it] {'loss': 0.4529, 'learning_rate': 2.913897800066887e-06, 'epoch': 0.76} + 76%|███████▌ | 6634/8750 [3:11:47<3:25:44, 5.83s/it] 76%|███████▌ | 6635/8750 [3:11:55<3:24:04, 5.79s/it] 76%|███████▌ | 6635/8750 [3:11:52<3:24:04, 5.79s/it] {'loss': 0.4504, 'learning_rate': 2.911286398938178e-06, 'epoch': 0.76} + 76%|███████▌ | 6635/8750 [3:11:55<3:24:04, 5.79s/it] {'loss': 0.4504, 'learning_rate': 2.911286398938178e-06, 'epoch': 0.76} + 76%|███████▌ | 6635/8750 [3:11:52<3:24:04, 5.79s/it] 76%|███████▌ | 6636/8750 [3:12:00<3:22:53, 5.76s/it] 76%|███████▌ | 6636/8750 [3:11:58<3:22:54, 5.76s/it] {'loss': 0.4542, 'learning_rate': 2.9086759691214994e-06, 'epoch': 0.76} + 76%|███████▌ | 6636/8750 [3:12:00<3:22:53, 5.76s/it] {'loss': 0.4542, 'learning_rate': 2.9086759691214994e-06, 'epoch': 0.76} + 76%|███████▌ | 6636/8750 [3:11:58<3:22:54, 5.76s/it] 76%|███████▌ | 6637/8750 [3:12:06<3:21:19, 5.72s/it] 76%|███████▌ | 6637/8750 [3:12:04<3:21:19, 5.72s/it] {'loss': 0.4558, 'learning_rate': 2.9060665109745324e-06, 'epoch': 0.76} + 76%|███████▌ | 6637/8750 [3:12:06<3:21:19, 5.72s/it] {'loss': 0.4558, 'learning_rate': 2.9060665109745324e-06, 'epoch': 0.76} + 76%|███████▌ | 6637/8750 [3:12:04<3:21:19, 5.72s/it] 76%|███████▌ | 6638/8750 [3:12:10<3:24:48, 5.82s/it] 76%|███████▌ | 6638/8750 [3:12:12<3:24:48, 5.82s/it] {'loss': 0.458, 'learning_rate': 2.9034580248548363e-06, 'epoch': 0.76} + 76%|███████▌ | 6638/8750 [3:12:12<3:24:48, 5.82s/it] {'loss': 0.458, 'learning_rate': 2.9034580248548363e-06, 'epoch': 0.76} + 76%|███████▌ | 6638/8750 [3:12:10<3:24:48, 5.82s/it] 76%|███████▌ | 6639/8750 [3:12:16<3:23:46, 5.79s/it] 76%|███████▌ | 6639/8750 [3:12:18<3:23:46, 5.79s/it] {'loss': 0.4515, 'learning_rate': 2.900850511119826e-06, 'epoch': 0.76} + 76%|███████▌ | 6639/8750 [3:12:18<3:23:46, 5.79s/it] 
{'loss': 0.4515, 'learning_rate': 2.900850511119826e-06, 'epoch': 0.76} + 76%|███████▌ | 6639/8750 [3:12:16<3:23:46, 5.79s/it] 76%|███████▌ | 6640/8750 [3:12:23<3:22:20, 5.75s/it] 76%|███████▌ | 6640/8750 [3:12:21<3:22:20, 5.75s/it] {'loss': 0.4648, 'learning_rate': 2.898243970126793e-06, 'epoch': 0.76} + 76%|███████▌ | 6640/8750 [3:12:23<3:22:20, 5.75s/it] {'loss': 0.4648, 'learning_rate': 2.898243970126793e-06, 'epoch': 0.76} + 76%|███████▌ | 6640/8750 [3:12:21<3:22:20, 5.75s/it] 76%|███████▌ | 6641/8750 [3:12:30<3:28:33, 5.93s/it] 76%|███████▌ | 6641/8750 [3:12:28<3:28:33, 5.93s/it] {'loss': 0.4455, 'learning_rate': 2.8956384022328943e-06, 'epoch': 0.76} + 76%|███████▌ | 6641/8750 [3:12:30<3:28:33, 5.93s/it] {'loss': 0.4455, 'learning_rate': 2.8956384022328943e-06, 'epoch': 0.76} + 76%|███████▌ | 6641/8750 [3:12:28<3:28:33, 5.93s/it] 76%|███████▌ | 6642/8750 [3:12:33<3:24:50, 5.83s/it] 76%|███████▌ | 6642/8750 [3:12:35<3:24:50, 5.83s/it] {'loss': 0.4698, 'learning_rate': 2.893033807795147e-06, 'epoch': 0.76} + 76%|███████▌ | 6642/8750 [3:12:35<3:24:50, 5.83s/it] {'loss': 0.4698, 'learning_rate': 2.893033807795147e-06, 'epoch': 0.76} + 76%|███████▌ | 6642/8750 [3:12:33<3:24:50, 5.83s/it] 76%|███████▌ | 6643/8750 [3:12:41<3:23:12, 5.79s/it] 76%|███████▌ | 6643/8750 [3:12:39<3:23:11, 5.79s/it] {'loss': 0.4463, 'learning_rate': 2.8904301871704377e-06, 'epoch': 0.76} + 76%|███████▌ | 6643/8750 [3:12:41<3:23:12, 5.79s/it] {'loss': 0.4463, 'learning_rate': 2.8904301871704377e-06, 'epoch': 0.76} + 76%|███████▌ | 6643/8750 [3:12:39<3:23:11, 5.79s/it] 76%|███████▌ | 6644/8750 [3:12:47<3:21:13, 5.73s/it] 76%|███████▌ | 6644/8750 [3:12:44<3:21:13, 5.73s/it] {'loss': 0.4796, 'learning_rate': 2.8878275407155244e-06, 'epoch': 0.76} + {'loss': 0.4796, 'learning_rate': 2.8878275407155244e-06, 'epoch': 0.76} + 76%|███████▌ | 6644/8750 [3:12:47<3:21:13, 5.73s/it] 76%|███████▌ | 6644/8750 [3:12:44<3:21:13, 5.73s/it] 76%|███████▌ | 6645/8750 [3:12:52<3:20:41, 5.72s/it] 76%|███████▌ 
| 6645/8750 [3:12:50<3:20:40, 5.72s/it] {'loss': 0.4443, 'learning_rate': 2.885225868787025e-06, 'epoch': 0.76} + 76%|███████▌ | 6645/8750 [3:12:52<3:20:41, 5.72s/it] {'loss': 0.4443, 'learning_rate': 2.885225868787025e-06, 'epoch': 0.76} + 76%|███████▌ | 6645/8750 [3:12:50<3:20:40, 5.72s/it] 76%|███████▌ | 6646/8750 [3:12:56<3:22:36, 5.78s/it] 76%|███████▌ | 6646/8750 [3:12:58<3:22:35, 5.78s/it] {'loss': 0.458, 'learning_rate': 2.8826251717414245e-06, 'epoch': 0.76} + 76%|███████▌ | 6646/8750 [3:12:56<3:22:36, 5.78s/it]{'loss': 0.458, 'learning_rate': 2.8826251717414245e-06, 'epoch': 0.76} + 76%|███████▌ | 6646/8750 [3:12:58<3:22:35, 5.78s/it] 76%|███████▌ | 6647/8750 [3:13:04<3:20:36, 5.72s/it] 76%|███████▌ | 6647/8750 [3:13:02<3:20:36, 5.72s/it] {'loss': 0.4368, 'learning_rate': 2.8800254499350797e-06, 'epoch': 0.76} + 76%|███████▌ | 6647/8750 [3:13:02<3:20:36, 5.72s/it]{'loss': 0.4368, 'learning_rate': 2.8800254499350797e-06, 'epoch': 0.76} + 76%|███████▌ | 6647/8750 [3:13:04<3:20:36, 5.72s/it] 76%|███████▌ | 6648/8750 [3:13:10<3:20:23, 5.72s/it] 76%|███████▌ | 6648/8750 [3:13:07<3:20:23, 5.72s/it] {'loss': 0.4671, 'learning_rate': 2.8774267037242133e-06, 'epoch': 0.76} + 76%|███████▌ | 6648/8750 [3:13:10<3:20:23, 5.72s/it] {'loss': 0.4671, 'learning_rate': 2.8774267037242133e-06, 'epoch': 0.76} + 76%|███████▌ | 6648/8750 [3:13:07<3:20:23, 5.72s/it] 76%|███████▌ | 6649/8750 [3:13:16<3:21:56, 5.77s/it] 76%|███████▌ | 6649/8750 [3:13:13<3:21:56, 5.77s/it] {'loss': 0.4454, 'learning_rate': 2.8748289334649036e-06, 'epoch': 0.76} + 76%|███████▌ | 6649/8750 [3:13:16<3:21:56, 5.77s/it] {'loss': 0.4454, 'learning_rate': 2.8748289334649036e-06, 'epoch': 0.76} + 76%|███████▌ | 6649/8750 [3:13:13<3:21:56, 5.77s/it]8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 76%|███████▌ | 6650/8750 [3:13:21<3:21:05, 5.75s/it]13 AutoResumeHook: Checking whether to suspend... 
+15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend...10 + AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +0 2 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 76%|███████▌ | 6650/8750 [3:13:19<3:21:05, 5.75s/it]7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + {'loss': 0.433, 'learning_rate': 2.8722321395131127e-06, 'epoch': 0.76} + 76%|███████▌ | 6650/8750 [3:13:21<3:21:05, 5.75s/it] {'loss': 0.433, 'learning_rate': 2.8722321395131127e-06, 'epoch': 0.76} + 76%|███████▌ | 6650/8750 [3:13:19<3:21:05, 5.75s/it] 76%|███████▌ | 6651/8750 [3:13:27<3:22:20, 5.78s/it] 76%|███████▌ | 6651/8750 [3:13:25<3:22:20, 5.78s/it] {'loss': 0.4398, 'learning_rate': 2.8696363222246504e-06, 'epoch': 0.76} + 76%|███████▌ | 6651/8750 [3:13:27<3:22:20, 5.78s/it] {'loss': 0.4398, 'learning_rate': 2.8696363222246504e-06, 'epoch': 0.76} + 76%|███████▌ | 6651/8750 [3:13:25<3:22:20, 5.78s/it] 76%|███████▌ | 6652/8750 [3:13:33<3:20:28, 5.73s/it] 76%|███████▌ | 6652/8750 [3:13:30<3:20:28, 5.73s/it] {'loss': 0.4618, 'learning_rate': 2.8670414819552082e-06, 'epoch': 0.76} + 76%|███████▌ | 6652/8750 [3:13:33<3:20:28, 5.73s/it] {'loss': 0.4618, 'learning_rate': 2.8670414819552082e-06, 'epoch': 0.76} + 76%|███████▌ | 6652/8750 [3:13:30<3:20:28, 5.73s/it] 76%|███████▌ | 6653/8750 [3:13:38<3:20:32, 5.74s/it] 76%|███████▌ | 6653/8750 [3:13:36<3:20:32, 5.74s/it] {'loss': 0.4626, 'learning_rate': 2.864447619060333e-06, 'epoch': 0.76} + 76%|███████▌ | 6653/8750 [3:13:38<3:20:32, 5.74s/it] {'loss': 0.4626, 'learning_rate': 2.864447619060333e-06, 'epoch': 0.76} + 76%|███████▌ | 6653/8750 [3:13:36<3:20:32, 5.74s/it] 
76%|███████▌ | 6654/8750 [3:13:44<3:20:04, 5.73s/it] 76%|███████▌ | 6654/8750 [3:13:42<3:20:04, 5.73s/it] {'loss': 0.4605, 'learning_rate': 2.861854733895446e-06, 'epoch': 0.76} + 76%|███████▌ | 6654/8750 [3:13:44<3:20:04, 5.73s/it] {'loss': 0.4605, 'learning_rate': 2.861854733895446e-06, 'epoch': 0.76} + 76%|███████▌ | 6654/8750 [3:13:42<3:20:04, 5.73s/it] 76%|███████▌ | 6655/8750 [3:13:50<3:19:03, 5.70s/it] 76%|███████▌ | 6655/8750 [3:13:48<3:19:03, 5.70s/it] {'loss': 0.4402, 'learning_rate': 2.8592628268158273e-06, 'epoch': 0.76} + 76%|███████▌ | 6655/8750 [3:13:50<3:19:03, 5.70s/it] {'loss': 0.4402, 'learning_rate': 2.8592628268158273e-06, 'epoch': 0.76} + 76%|███████▌ | 6655/8750 [3:13:48<3:19:03, 5.70s/it] 76%|███████▌ | 6656/8750 [3:13:56<3:20:55, 5.76s/it] 76%|███████▌ | 6656/8750 [3:13:53<3:20:55, 5.76s/it] {'loss': 0.4339, 'learning_rate': 2.8566718981766238e-06, 'epoch': 0.76} + 76%|███████▌ | 6656/8750 [3:13:56<3:20:55, 5.76s/it] {'loss': 0.4339, 'learning_rate': 2.8566718981766238e-06, 'epoch': 0.76} + 76%|███████▌ | 6656/8750 [3:13:53<3:20:55, 5.76s/it] 76%|███████▌ | 6657/8750 [3:14:01<3:18:41, 5.70s/it] 76%|███████▌ | 6657/8750 [3:13:59<3:18:41, 5.70s/it] {'loss': 0.4739, 'learning_rate': 2.854081948332854e-06, 'epoch': 0.76} + 76%|███████▌ | 6657/8750 [3:13:59<3:18:41, 5.70s/it] {'loss': 0.4739, 'learning_rate': 2.854081948332854e-06, 'epoch': 0.76} + 76%|███████▌ | 6657/8750 [3:14:01<3:18:41, 5.70s/it] 76%|███████▌ | 6658/8750 [3:14:07<3:19:33, 5.72s/it] 76%|███████▌ | 6658/8750 [3:14:05<3:19:33, 5.72s/it] {'loss': 0.4559, 'learning_rate': 2.8514929776394006e-06, 'epoch': 0.76} + 76%|███████▌ | 6658/8750 [3:14:07<3:19:33, 5.72s/it] {'loss': 0.4559, 'learning_rate': 2.8514929776394006e-06, 'epoch': 0.76} + 76%|███████▌ | 6658/8750 [3:14:05<3:19:33, 5.72s/it] 76%|███████▌ | 6659/8750 [3:14:13<3:20:36, 5.76s/it] 76%|███████▌ | 6659/8750 [3:14:11<3:20:36, 5.76s/it] {'loss': 0.4355, 'learning_rate': 2.8489049864510053e-06, 'epoch': 0.76} + 76%|███████▌ 
| 6659/8750 [3:14:13<3:20:36, 5.76s/it] {'loss': 0.4355, 'learning_rate': 2.8489049864510053e-06, 'epoch': 0.76} + 76%|███████▌ | 6659/8750 [3:14:11<3:20:36, 5.76s/it] 76%|███████▌ | 6660/8750 [3:14:19<3:21:43, 5.79s/it] 76%|███████▌ | 6660/8750 [3:14:16<3:21:43, 5.79s/it] {'loss': 0.4431, 'learning_rate': 2.846317975122287e-06, 'epoch': 0.76} + 76%|███████▌ | 6660/8750 [3:14:19<3:21:43, 5.79s/it] {'loss': 0.4431, 'learning_rate': 2.846317975122287e-06, 'epoch': 0.76} + 76%|███████▌ | 6660/8750 [3:14:16<3:21:43, 5.79s/it] 76%|███████▌ | 6661/8750 [3:14:24<3:19:05, 5.72s/it] 76%|███████▌ | 6661/8750 [3:14:22<3:19:05, 5.72s/it] {'loss': 0.4666, 'learning_rate': 2.843731944007717e-06, 'epoch': 0.76} + 76%|███████▌ | 6661/8750 [3:14:24<3:19:05, 5.72s/it] {'loss': 0.4666, 'learning_rate': 2.843731944007717e-06, 'epoch': 0.76} + 76%|███████▌ | 6661/8750 [3:14:22<3:19:05, 5.72s/it] 76%|███████▌ | 6662/8750 [3:14:30<3:18:52, 5.71s/it] 76%|███████▌ | 6662/8750 [3:14:28<3:18:53, 5.72s/it] {'loss': 0.4639, 'learning_rate': 2.841146893461646e-06, 'epoch': 0.76} + 76%|███████▌ | 6662/8750 [3:14:30<3:18:52, 5.71s/it] {'loss': 0.4639, 'learning_rate': 2.841146893461646e-06, 'epoch': 0.76} + 76%|███████▌ | 6662/8750 [3:14:28<3:18:53, 5.72s/it] 76%|███████▌ | 6663/8750 [3:14:36<3:18:29, 5.71s/it] 76%|███████▌ | 6663/8750 [3:14:33<3:18:29, 5.71s/it] {'loss': 0.4551, 'learning_rate': 2.8385628238382803e-06, 'epoch': 0.76} + {'loss': 0.4551, 'learning_rate': 2.8385628238382803e-06, 'epoch': 0.76} + 76%|███████▌ | 6663/8750 [3:14:33<3:18:29, 5.71s/it] 76%|███████▌ | 6663/8750 [3:14:36<3:18:29, 5.71s/it] 76%|███████▌ | 6664/8750 [3:14:41<3:19:19, 5.73s/it] 76%|███████▌ | 6664/8750 [3:14:39<3:19:19, 5.73s/it] {'loss': 0.4547, 'learning_rate': 2.8359797354916907e-06, 'epoch': 0.76} + 76%|███████▌ | 6664/8750 [3:14:39<3:19:19, 5.73s/it]{'loss': 0.4547, 'learning_rate': 2.8359797354916907e-06, 'epoch': 0.76} + 76%|███████▌ | 6664/8750 [3:14:41<3:19:19, 5.73s/it] 76%|███████▌ | 6665/8750 
[3:14:47<3:19:32, 5.74s/it] 76%|███████▌ | 6665/8750 [3:14:45<3:19:32, 5.74s/it] {'loss': 0.4457, 'learning_rate': 2.833397628775827e-06, 'epoch': 0.76} + 76%|███████▌ | 6665/8750 [3:14:47<3:19:32, 5.74s/it] {'loss': 0.4457, 'learning_rate': 2.833397628775827e-06, 'epoch': 0.76} + 76%|███████▌ | 6665/8750 [3:14:45<3:19:32, 5.74s/it] 76%|███████▌ | 6666/8750 [3:14:53<3:19:52, 5.75s/it] 76%|███████▌ | 6666/8750 [3:14:51<3:19:52, 5.75s/it] {'loss': 0.4398, 'learning_rate': 2.830816504044488e-06, 'epoch': 0.76} + 76%|███████▌ | 6666/8750 [3:14:53<3:19:52, 5.75s/it] {'loss': 0.4398, 'learning_rate': 2.830816504044488e-06, 'epoch': 0.76} + 76%|███████▌ | 6666/8750 [3:14:51<3:19:52, 5.75s/it] 76%|███████▌ | 6667/8750 [3:14:59<3:22:04, 5.82s/it] 76%|███████▌ | 6667/8750 [3:14:57<3:22:04, 5.82s/it] {'loss': 0.4445, 'learning_rate': 2.8282363616513475e-06, 'epoch': 0.76} + 76%|███████▌ | 6667/8750 [3:14:59<3:22:04, 5.82s/it] {'loss': 0.4445, 'learning_rate': 2.8282363616513475e-06, 'epoch': 0.76} + 76%|███████▌ | 6667/8750 [3:14:57<3:22:04, 5.82s/it] 76%|███████▌ | 6668/8750 [3:15:05<3:20:24, 5.78s/it] 76%|███████▌ | 6668/8750 [3:15:02<3:20:23, 5.78s/it] {'loss': 0.4753, 'learning_rate': 2.8256572019499474e-06, 'epoch': 0.76} + 76%|███████▌ | 6668/8750 [3:15:05<3:20:24, 5.78s/it] {'loss': 0.4753, 'learning_rate': 2.8256572019499474e-06, 'epoch': 0.76} + 76%|███████▌ | 6668/8750 [3:15:02<3:20:23, 5.78s/it] 76%|███████▌ | 6669/8750 [3:15:11<3:21:34, 5.81s/it] 76%|███████▌ | 6669/8750 [3:15:08<3:21:34, 5.81s/it]{'loss': 0.4523, 'learning_rate': 2.8230790252936826e-06, 'epoch': 0.76} + {'loss': 0.4523, 'learning_rate': 2.8230790252936826e-06, 'epoch': 0.76} + 76%|███████▌ | 6669/8750 [3:15:11<3:21:34, 5.81s/it] 76%|███████▌ | 6669/8750 [3:15:08<3:21:34, 5.81s/it] 76%|███████▌ | 6670/8750 [3:15:16<3:20:53, 5.80s/it] 76%|███████▌ | 6670/8750 [3:15:14<3:20:54, 5.80s/it] {'loss': 0.4613, 'learning_rate': 2.8205018320358268e-06, 'epoch': 0.76} + 76%|███████▌ | 6670/8750 
[3:15:16<3:20:53, 5.80s/it] {'loss': 0.4613, 'learning_rate': 2.8205018320358268e-06, 'epoch': 0.76} + 76%|███████▌ | 6670/8750 [3:15:14<3:20:54, 5.80s/it] 76%|███████▌ | 6671/8750 [3:15:20<3:21:26, 5.81s/it] 76%|███████▌ | 6671/8750 [3:15:22<3:21:26, 5.81s/it] {'loss': 0.467, 'learning_rate': 2.8179256225295114e-06, 'epoch': 0.76} + 76%|███████▌ | 6671/8750 [3:15:20<3:21:26, 5.81s/it] {'loss': 0.467, 'learning_rate': 2.8179256225295114e-06, 'epoch': 0.76} + 76%|███████▌ | 6671/8750 [3:15:22<3:21:26, 5.81s/it] 76%|███████▋ | 6672/8750 [3:15:26<3:23:30, 5.88s/it] 76%|███████▋ | 6672/8750 [3:15:28<3:23:30, 5.88s/it] {'loss': 0.469, 'learning_rate': 2.815350397127732e-06, 'epoch': 0.76} + 76%|███████▋ | 6672/8750 [3:15:28<3:23:30, 5.88s/it] {'loss': 0.469, 'learning_rate': 2.815350397127732e-06, 'epoch': 0.76} + 76%|███████▋ | 6672/8750 [3:15:26<3:23:30, 5.88s/it] 76%|███████▋ | 6673/8750 [3:15:34<3:22:50, 5.86s/it] 76%|███████▋ | 6673/8750 [3:15:32<3:22:50, 5.86s/it] {'loss': 0.4359, 'learning_rate': 2.8127761561833554e-06, 'epoch': 0.76} + 76%|███████▋ | 6673/8750 [3:15:34<3:22:50, 5.86s/it] {'loss': 0.4359, 'learning_rate': 2.8127761561833554e-06, 'epoch': 0.76} + 76%|███████▋ | 6673/8750 [3:15:32<3:22:50, 5.86s/it] 76%|███████▋ | 6674/8750 [3:15:40<3:22:03, 5.84s/it] 76%|███████▋ | 6674/8750 [3:15:38<3:22:03, 5.84s/it] {'loss': 0.4473, 'learning_rate': 2.810202900049106e-06, 'epoch': 0.76} + 76%|███████▋ | 6674/8750 [3:15:40<3:22:03, 5.84s/it] {'loss': 0.4473, 'learning_rate': 2.810202900049106e-06, 'epoch': 0.76} + 76%|███████▋ | 6674/8750 [3:15:38<3:22:03, 5.84s/it] 76%|███████▋ | 6675/8750 [3:15:45<3:19:42, 5.77s/it] 76%|███████▋ | 6675/8750 [3:15:43<3:19:43, 5.78s/it] {'loss': 0.4625, 'learning_rate': 2.8076306290775823e-06, 'epoch': 0.76} + 76%|███████▋ | 6675/8750 [3:15:45<3:19:42, 5.77s/it] {'loss': 0.4625, 'learning_rate': 2.8076306290775823e-06, 'epoch': 0.76} + 76%|███████▋ | 6675/8750 [3:15:43<3:19:43, 5.78s/it] 76%|███████▋ | 6676/8750 
[3:15:51<3:19:38, 5.78s/it] 76%|███████▋ | 6676/8750 [3:15:49<3:19:38, 5.78s/it] {'loss': 0.451, 'learning_rate': 2.8050593436212394e-06, 'epoch': 0.76} + 76%|███████▋ | 6676/8750 [3:15:51<3:19:38, 5.78s/it] {'loss': 0.451, 'learning_rate': 2.8050593436212394e-06, 'epoch': 0.76} + 76%|███████▋ | 6676/8750 [3:15:49<3:19:38, 5.78s/it] 76%|███████▋ | 6677/8750 [3:15:57<3:18:41, 5.75s/it] 76%|███████▋ | 6677/8750 [3:15:55<3:18:40, 5.75s/it] {'loss': 0.4503, 'learning_rate': 2.8024890440324e-06, 'epoch': 0.76} + 76%|███████▋ | 6677/8750 [3:15:57<3:18:41, 5.75s/it] {'loss': 0.4503, 'learning_rate': 2.8024890440324e-06, 'epoch': 0.76} + 76%|███████▋ | 6677/8750 [3:15:55<3:18:40, 5.75s/it] 76%|███████▋ | 6678/8750 [3:16:03<3:17:23, 5.72s/it] 76%|███████▋ | 6678/8750 [3:16:00<3:17:23, 5.72s/it] {'loss': 0.4582, 'learning_rate': 2.7999197306632576e-06, 'epoch': 0.76} + 76%|███████▋ | 6678/8750 [3:16:03<3:17:23, 5.72s/it] {'loss': 0.4582, 'learning_rate': 2.7999197306632576e-06, 'epoch': 0.76} + 76%|███████▋ | 6678/8750 [3:16:00<3:17:23, 5.72s/it] 76%|███████▋ | 6679/8750 [3:16:08<3:17:06, 5.71s/it] 76%|███████▋ | 6679/8750 [3:16:06<3:17:06, 5.71s/it] {'loss': 0.4593, 'learning_rate': 2.79735140386586e-06, 'epoch': 0.76} + 76%|███████▋ | 6679/8750 [3:16:08<3:17:06, 5.71s/it] {'loss': 0.4593, 'learning_rate': 2.79735140386586e-06, 'epoch': 0.76} + 76%|███████▋ | 6679/8750 [3:16:06<3:17:06, 5.71s/it] 76%|███████▋ | 6680/8750 [3:16:14<3:20:13, 5.80s/it] 76%|███████▋ | 6680/8750 [3:16:12<3:20:13, 5.80s/it] {'loss': 0.4524, 'learning_rate': 2.7947840639921308e-06, 'epoch': 0.76} + 76%|███████▋ | 6680/8750 [3:16:14<3:20:13, 5.80s/it] {'loss': 0.4524, 'learning_rate': 2.7947840639921308e-06, 'epoch': 0.76} + 76%|███████▋ | 6680/8750 [3:16:12<3:20:13, 5.80s/it] 76%|███████▋ | 6681/8750 [3:16:20<3:18:27, 5.76s/it] 76%|███████▋ | 6681/8750 [3:16:18<3:18:27, 5.76s/it]{'loss': 0.4527, 'learning_rate': 2.792217711393849e-06, 'epoch': 0.76} + {'loss': 0.4527, 'learning_rate': 
2.792217711393849e-06, 'epoch': 0.76} + 76%|███████▋ | 6681/8750 [3:16:20<3:18:27, 5.76s/it] 76%|███████▋ | 6681/8750 [3:16:18<3:18:27, 5.76s/it] 76%|███████▋ | 6682/8750 [3:16:26<3:18:51, 5.77s/it] 76%|███████▋ | 6682/8750 [3:16:23<3:18:51, 5.77s/it] {'loss': 0.452, 'learning_rate': 2.78965234642266e-06, 'epoch': 0.76} + 76%|███████▋ | 6682/8750 [3:16:26<3:18:51, 5.77s/it] {'loss': 0.452, 'learning_rate': 2.78965234642266e-06, 'epoch': 0.76} + 76%|███████▋ | 6682/8750 [3:16:23<3:18:51, 5.77s/it] 76%|███████▋ | 6683/8750 [3:16:32<3:23:11, 5.90s/it] 76%|███████▋ | 6683/8750 [3:16:30<3:23:11, 5.90s/it] {'loss': 0.4344, 'learning_rate': 2.7870879694300825e-06, 'epoch': 0.76} + 76%|███████▋ | 6683/8750 [3:16:32<3:23:11, 5.90s/it] {'loss': 0.4344, 'learning_rate': 2.7870879694300825e-06, 'epoch': 0.76} + 76%|███████▋ | 6683/8750 [3:16:30<3:23:11, 5.90s/it] 76%|███████▋ | 6684/8750 [3:16:37<3:20:07, 5.81s/it] 76%|███████▋ | 6684/8750 [3:16:35<3:20:08, 5.81s/it] {'loss': 0.4756, 'learning_rate': 2.7845245807674893e-06, 'epoch': 0.76} + 76%|███████▋ | 6684/8750 [3:16:37<3:20:07, 5.81s/it] {'loss': 0.4756, 'learning_rate': 2.7845245807674893e-06, 'epoch': 0.76} + 76%|███████▋ | 6684/8750 [3:16:35<3:20:08, 5.81s/it] 76%|███████▋ | 6685/8750 [3:16:43<3:19:40, 5.80s/it] 76%|███████▋ | 6685/8750 [3:16:41<3:19:40, 5.80s/it] {'loss': 0.4306, 'learning_rate': 2.7819621807861197e-06, 'epoch': 0.76} + 76%|███████▋ | 6685/8750 [3:16:43<3:19:40, 5.80s/it] {'loss': 0.4306, 'learning_rate': 2.7819621807861197e-06, 'epoch': 0.76} + 76%|███████▋ | 6685/8750 [3:16:41<3:19:40, 5.80s/it] 76%|███████▋ | 6686/8750 [3:16:49<3:18:39, 5.78s/it] 76%|███████▋ | 6686/8750 [3:16:47<3:18:39, 5.77s/it] {'loss': 0.4731, 'learning_rate': 2.779400769837083e-06, 'epoch': 0.76} + 76%|███████▋ | 6686/8750 [3:16:49<3:18:39, 5.78s/it] {'loss': 0.4731, 'learning_rate': 2.779400769837083e-06, 'epoch': 0.76} + 76%|███████▋ | 6686/8750 [3:16:47<3:18:39, 5.77s/it] 76%|███████▋ | 6687/8750 [3:16:55<3:16:09, 
5.71s/it] 76%|███████▋ | 6687/8750 [3:16:52<3:16:09, 5.71s/it] {'loss': 0.4586, 'learning_rate': 2.776840348271348e-06, 'epoch': 0.76} + 76%|███████▋ | 6687/8750 [3:16:55<3:16:09, 5.71s/it] {'loss': 0.4586, 'learning_rate': 2.776840348271348e-06, 'epoch': 0.76} + 76%|███████▋ | 6687/8750 [3:16:52<3:16:09, 5.71s/it] 76%|███████▋ | 6688/8750 [3:17:00<3:15:44, 5.70s/it] 76%|███████▋ | 6688/8750 [3:16:58<3:15:44, 5.70s/it] {'loss': 0.4586, 'learning_rate': 2.7742809164397546e-06, 'epoch': 0.76} + 76%|███████▋ | 6688/8750 [3:17:00<3:15:44, 5.70s/it] {'loss': 0.4586, 'learning_rate': 2.7742809164397546e-06, 'epoch': 0.76} + 76%|███████▋ | 6688/8750 [3:16:58<3:15:44, 5.70s/it] 76%|███████▋ | 6689/8750 [3:17:06<3:16:17, 5.71s/it] 76%|███████▋ | 6689/8750 [3:17:04<3:16:17, 5.71s/it] {'loss': 0.4355, 'learning_rate': 2.7717224746929984e-06, 'epoch': 0.76} + 76%|███████▋ | 6689/8750 [3:17:06<3:16:17, 5.71s/it] {'loss': 0.4355, 'learning_rate': 2.7717224746929984e-06, 'epoch': 0.76} + 76%|███████▋ | 6689/8750 [3:17:04<3:16:17, 5.71s/it] 76%|███████▋ | 6690/8750 [3:17:12<3:15:53, 5.71s/it] 76%|███████▋ | 6690/8750 [3:17:09<3:15:53, 5.71s/it] {'loss': 0.453, 'learning_rate': 2.769165023381639e-06, 'epoch': 0.76} + 76%|███████▋ | 6690/8750 [3:17:12<3:15:53, 5.71s/it] {'loss': 0.453, 'learning_rate': 2.769165023381639e-06, 'epoch': 0.76} + 76%|███████▋ | 6690/8750 [3:17:09<3:15:53, 5.71s/it] 76%|███████▋ | 6691/8750 [3:17:17<3:16:23, 5.72s/it] 76%|███████▋ | 6691/8750 [3:17:15<3:16:23, 5.72s/it] {'loss': 0.4595, 'learning_rate': 2.7666085628561126e-06, 'epoch': 0.76} + 76%|███████▋ | 6691/8750 [3:17:17<3:16:23, 5.72s/it] {'loss': 0.4595, 'learning_rate': 2.7666085628561126e-06, 'epoch': 0.76} + 76%|███████▋ | 6691/8750 [3:17:15<3:16:23, 5.72s/it] 76%|███████▋ | 6692/8750 [3:17:23<3:14:56, 5.68s/it] 76%|███████▋ | 6692/8750 [3:17:21<3:14:56, 5.68s/it] {'loss': 0.4609, 'learning_rate': 2.764053093466702e-06, 'epoch': 0.76} + 76%|███████▋ | 6692/8750 [3:17:23<3:14:56, 5.68s/it] 
{'loss': 0.4609, 'learning_rate': 2.764053093466702e-06, 'epoch': 0.76} + 76%|███████▋ | 6692/8750 [3:17:21<3:14:56, 5.68s/it] 76%|███████▋ | 6693/8750 [3:17:29<3:15:07, 5.69s/it] 76%|███████▋ | 6693/8750 [3:17:26<3:15:07, 5.69s/it] {'loss': 0.4531, 'learning_rate': 2.7614986155635737e-06, 'epoch': 0.76} + 76%|███████▋ | 6693/8750 [3:17:29<3:15:07, 5.69s/it] {'loss': 0.4531, 'learning_rate': 2.7614986155635737e-06, 'epoch': 0.76} + 76%|███████▋ | 6693/8750 [3:17:26<3:15:07, 5.69s/it] 77%|███████▋ | 6694/8750 [3:17:34<3:15:05, 5.69s/it] 77%|███████▋ | 6694/8750 [3:17:32<3:15:05, 5.69s/it] {'loss': 0.4557, 'learning_rate': 2.7589451294967383e-06, 'epoch': 0.77} + 77%|███████▋ | 6694/8750 [3:17:34<3:15:05, 5.69s/it] {'loss': 0.4557, 'learning_rate': 2.7589451294967383e-06, 'epoch': 0.77} + 77%|███████▋ | 6694/8750 [3:17:32<3:15:05, 5.69s/it] 77%|███████▋ | 6695/8750 [3:17:40<3:14:23, 5.68s/it] 77%|███████▋ | 6695/8750 [3:17:38<3:14:23, 5.68s/it] {'loss': 0.4522, 'learning_rate': 2.75639263561609e-06, 'epoch': 0.77} + 77%|███████▋ | 6695/8750 [3:17:40<3:14:23, 5.68s/it] {'loss': 0.4522, 'learning_rate': 2.75639263561609e-06, 'epoch': 0.77} + 77%|███████▋ | 6695/8750 [3:17:38<3:14:23, 5.68s/it] 77%|███████▋ | 6696/8750 [3:17:46<3:14:40, 5.69s/it] 77%|███████▋ | 6696/8750 [3:17:43<3:14:40, 5.69s/it] {'loss': 0.468, 'learning_rate': 2.753841134271368e-06, 'epoch': 0.77} + 77%|███████▋ | 6696/8750 [3:17:43<3:14:40, 5.69s/it] {'loss': 0.468, 'learning_rate': 2.753841134271368e-06, 'epoch': 0.77} + 77%|███████▋ | 6696/8750 [3:17:46<3:14:40, 5.69s/it] 77%|███████▋ | 6697/8750 [3:17:51<3:14:16, 5.68s/it] 77%|███████▋ | 6697/8750 [3:17:49<3:14:15, 5.68s/it] {'loss': 0.4466, 'learning_rate': 2.7512906258121907e-06, 'epoch': 0.77} + 77%|███████▋ | 6697/8750 [3:17:51<3:14:16, 5.68s/it] {'loss': 0.4466, 'learning_rate': 2.7512906258121907e-06, 'epoch': 0.77} + 77%|███████▋ | 6697/8750 [3:17:49<3:14:15, 5.68s/it] 77%|███████▋ | 6698/8750 [3:17:57<3:13:30, 5.66s/it] 77%|███████▋ | 
6698/8750 [3:17:55<3:13:30, 5.66s/it] {'loss': 0.454, 'learning_rate': 2.7487411105880356e-06, 'epoch': 0.77} + 77%|███████▋ | 6698/8750 [3:17:57<3:13:30, 5.66s/it] {'loss': 0.454, 'learning_rate': 2.7487411105880356e-06, 'epoch': 0.77} + 77%|███████▋ | 6698/8750 [3:17:55<3:13:30, 5.66s/it] 77%|███████▋ | 6699/8750 [3:18:03<3:15:04, 5.71s/it] 77%|███████▋ | 6699/8750 [3:18:01<3:15:04, 5.71s/it] {'loss': 0.4294, 'learning_rate': 2.7461925889482422e-06, 'epoch': 0.77} + 77%|███████▋ | 6699/8750 [3:18:03<3:15:04, 5.71s/it] {'loss': 0.4294, 'learning_rate': 2.7461925889482422e-06, 'epoch': 0.77} + 77%|███████▋ | 6699/8750 [3:18:01<3:15:04, 5.71s/it]9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 77%|███████▋ | 6700/8750 [3:18:09<3:15:00, 5.71s/it]12 AutoResumeHook: Checking whether to suspend... +1310 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 77%|███████▋ | 6700/8750 [3:18:06<3:15:01, 5.71s/it]3 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4556, 'learning_rate': 2.7436450612420098e-06, 'epoch': 0.77} + 77%|███████▋ | 6700/8750 [3:18:09<3:15:00, 5.71s/it] {'loss': 0.4556, 'learning_rate': 2.7436450612420098e-06, 'epoch': 0.77} + 77%|███████▋ | 6700/8750 [3:18:06<3:15:01, 5.71s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 77%|███████▋ | 6701/8750 [3:18:30<5:51:51, 10.30s/it] 77%|███████▋ | 6701/8750 [3:18:27<5:51:51, 10.30s/it] {'loss': 0.4682, 'learning_rate': 2.7410985278184144e-06, 'epoch': 0.77} + 77%|███████▋ | 6701/8750 [3:18:30<5:51:51, 10.30s/it] {'loss': 0.4682, 'learning_rate': 2.7410985278184144e-06, 'epoch': 0.77} + 77%|███████▋ | 6701/8750 [3:18:27<5:51:51, 10.30s/it] 77%|███████▋ | 6702/8750 [3:18:35<5:04:44, 8.93s/it] 77%|███████▋ | 6702/8750 [3:18:33<5:04:44, 8.93s/it] {'loss': 0.4551, 'learning_rate': 2.738552989026384e-06, 'epoch': 0.77} + 77%|███████▋ | 6702/8750 [3:18:35<5:04:44, 8.93s/it] {'loss': 0.4551, 'learning_rate': 2.738552989026384e-06, 'epoch': 0.77} + 77%|███████▋ | 6702/8750 [3:18:33<5:04:44, 8.93s/it] 77%|███████▋ | 6703/8750 [3:18:41<4:31:20, 7.95s/it] 77%|███████▋ | 6703/8750 [3:18:39<4:31:20, 7.95s/it] {'loss': 0.4589, 'learning_rate': 2.7360084452147108e-06, 'epoch': 0.77} + 77%|███████▋ | 6703/8750 [3:18:41<4:31:20, 7.95s/it] {'loss': 0.4589, 'learning_rate': 2.7360084452147108e-06, 'epoch': 0.77} + 77%|███████▋ | 6703/8750 [3:18:39<4:31:20, 7.95s/it] 77%|███████▋ | 6704/8750 [3:18:47<4:08:40, 7.29s/it] 77%|███████▋ | 6704/8750 [3:18:44<4:08:40, 7.29s/it] {'loss': 0.4527, 'learning_rate': 2.7334648967320587e-06, 'epoch': 0.77} + 77%|███████▋ | 6704/8750 [3:18:47<4:08:40, 7.29s/it] {'loss': 0.4527, 'learning_rate': 2.7334648967320587e-06, 'epoch': 0.77} + 77%|███████▋ | 6704/8750 [3:18:44<4:08:40, 7.29s/it] 77%|███████▋ | 6705/8750 [3:18:52<3:52:47, 6.83s/it] 77%|███████▋ | 6705/8750 [3:18:50<3:52:47, 6.83s/it] {'loss': 0.4516, 'learning_rate': 2.7309223439269516e-06, 'epoch': 0.77} + 77%|███████▋ | 6705/8750 [3:18:52<3:52:47, 6.83s/it] {'loss': 0.4516, 'learning_rate': 2.7309223439269516e-06, 'epoch': 0.77} + 77%|███████▋ | 6705/8750 [3:18:50<3:52:47, 6.83s/it] 77%|███████▋ | 6706/8750 [3:18:58<3:41:33, 6.50s/it] 77%|███████▋ | 6706/8750 [3:18:56<3:41:33, 6.50s/it] {'loss': 0.4418, 'learning_rate': 2.728380787147772e-06, 
'epoch': 0.77} + 77%|███████▋ | 6706/8750 [3:18:58<3:41:33, 6.50s/it] {'loss': 0.4418, 'learning_rate': 2.728380787147772e-06, 'epoch': 0.77} + 77%|███████▋ | 6706/8750 [3:18:56<3:41:33, 6.50s/it] 77%|███████▋ | 6707/8750 [3:19:04<3:33:07, 6.26s/it] 77%|███████▋ | 6707/8750 [3:19:02<3:33:07, 6.26s/it] {'loss': 0.4448, 'learning_rate': 2.725840226742774e-06, 'epoch': 0.77} + 77%|███████▋ | 6707/8750 [3:19:04<3:33:07, 6.26s/it]{'loss': 0.4448, 'learning_rate': 2.725840226742774e-06, 'epoch': 0.77} + 77%|███████▋ | 6707/8750 [3:19:02<3:33:07, 6.26s/it] 77%|███████▋ | 6708/8750 [3:19:10<3:26:57, 6.08s/it] 77%|███████▋ | 6708/8750 [3:19:07<3:26:57, 6.08s/it] {'loss': 0.4588, 'learning_rate': 2.7233006630600667e-06, 'epoch': 0.77} + 77%|███████▋ | 6708/8750 [3:19:10<3:26:57, 6.08s/it] {'loss': 0.4588, 'learning_rate': 2.7233006630600667e-06, 'epoch': 0.77} + 77%|███████▋ | 6708/8750 [3:19:07<3:26:57, 6.08s/it] 77%|███████▋ | 6709/8750 [3:19:15<3:23:52, 5.99s/it] 77%|███████▋ | 6709/8750 [3:19:13<3:23:51, 5.99s/it] {'loss': 0.4534, 'learning_rate': 2.7207620964476323e-06, 'epoch': 0.77} + 77%|███████▋ | 6709/8750 [3:19:13<3:23:51, 5.99s/it]{'loss': 0.4534, 'learning_rate': 2.7207620964476323e-06, 'epoch': 0.77} + 77%|███████▋ | 6709/8750 [3:19:15<3:23:52, 5.99s/it] 77%|███████▋ | 6710/8750 [3:19:21<3:21:14, 5.92s/it] 77%|███████▋ | 6710/8750 [3:19:19<3:21:14, 5.92s/it] {'loss': 0.4508, 'learning_rate': 2.7182245272533046e-06, 'epoch': 0.77} + 77%|███████▋ | 6710/8750 [3:19:21<3:21:14, 5.92s/it] {'loss': 0.4508, 'learning_rate': 2.7182245272533046e-06, 'epoch': 0.77} + 77%|███████▋ | 6710/8750 [3:19:19<3:21:14, 5.92s/it] 77%|███████▋ | 6711/8750 [3:19:27<3:21:06, 5.92s/it] 77%|███████▋ | 6711/8750 [3:19:25<3:21:06, 5.92s/it] {'loss': 0.4542, 'learning_rate': 2.715687955824795e-06, 'epoch': 0.77} + 77%|███████▋ | 6711/8750 [3:19:25<3:21:06, 5.92s/it]{'loss': 0.4542, 'learning_rate': 2.715687955824795e-06, 'epoch': 0.77} + 77%|███████▋ | 6711/8750 [3:19:27<3:21:06, 5.92s/it] 
77%|███████▋ | 6712/8750 [3:19:33<3:17:03, 5.80s/it] 77%|███████▋ | 6712/8750 [3:19:30<3:17:03, 5.80s/it] {'loss': 0.451, 'learning_rate': 2.713152382509665e-06, 'epoch': 0.77} + 77%|███████▋ | 6712/8750 [3:19:33<3:17:03, 5.80s/it] {'loss': 0.451, 'learning_rate': 2.713152382509665e-06, 'epoch': 0.77} + 77%|███████▋ | 6712/8750 [3:19:30<3:17:03, 5.80s/it] 77%|███████▋ | 6713/8750 [3:19:38<3:14:37, 5.73s/it] 77%|███████▋ | 6713/8750 [3:19:36<3:14:37, 5.73s/it] {'loss': 0.4442, 'learning_rate': 2.7106178076553446e-06, 'epoch': 0.77} + 77%|███████▋ | 6713/8750 [3:19:38<3:14:37, 5.73s/it] {'loss': 0.4442, 'learning_rate': 2.7106178076553446e-06, 'epoch': 0.77} + 77%|███████▋ | 6713/8750 [3:19:36<3:14:37, 5.73s/it] 77%|███████▋ | 6714/8750 [3:19:44<3:15:17, 5.75s/it] 77%|███████▋ | 6714/8750 [3:19:42<3:15:17, 5.76s/it] {'loss': 0.4464, 'learning_rate': 2.7080842316091273e-06, 'epoch': 0.77} + 77%|███████▋ | 6714/8750 [3:19:44<3:15:17, 5.75s/it] {'loss': 0.4464, 'learning_rate': 2.7080842316091273e-06, 'epoch': 0.77} + 77%|███████▋ | 6714/8750 [3:19:42<3:15:17, 5.76s/it] 77%|███████▋ | 6715/8750 [3:19:50<3:18:00, 5.84s/it] 77%|███████▋ | 6715/8750 [3:19:48<3:18:00, 5.84s/it] {'loss': 0.4548, 'learning_rate': 2.7055516547181736e-06, 'epoch': 0.77} + 77%|███████▋ | 6715/8750 [3:19:50<3:18:00, 5.84s/it] {'loss': 0.4548, 'learning_rate': 2.7055516547181736e-06, 'epoch': 0.77} + 77%|███████▋ | 6715/8750 [3:19:48<3:18:00, 5.84s/it] 77%|███████▋ | 6716/8750 [3:19:56<3:16:45, 5.80s/it] 77%|███████▋ | 6716/8750 [3:19:53<3:16:45, 5.80s/it] {'loss': 0.4454, 'learning_rate': 2.703020077329498e-06, 'epoch': 0.77} + 77%|███████▋ | 6716/8750 [3:19:53<3:16:45, 5.80s/it]{'loss': 0.4454, 'learning_rate': 2.703020077329498e-06, 'epoch': 0.77} + 77%|███████▋ | 6716/8750 [3:19:56<3:16:45, 5.80s/it] 77%|███████▋ | 6717/8750 [3:20:01<3:15:33, 5.77s/it] 77%|███████▋ | 6717/8750 [3:19:59<3:15:33, 5.77s/it] {'loss': 0.438, 'learning_rate': 2.7004894997899878e-06, 'epoch': 0.77} + 77%|███████▋ | 
6717/8750 [3:20:01<3:15:33, 5.77s/it] {'loss': 0.438, 'learning_rate': 2.7004894997899878e-06, 'epoch': 0.77} + 77%|███████▋ | 6717/8750 [3:19:59<3:15:33, 5.77s/it] 77%|███████▋ | 6718/8750 [3:20:07<3:15:37, 5.78s/it] 77%|███████▋ | 6718/8750 [3:20:05<3:15:37, 5.78s/it] {'loss': 0.4595, 'learning_rate': 2.6979599224463838e-06, 'epoch': 0.77} + 77%|███████▋ | 6718/8750 [3:20:07<3:15:37, 5.78s/it] {'loss': 0.4595, 'learning_rate': 2.6979599224463838e-06, 'epoch': 0.77} + 77%|███████▋ | 6718/8750 [3:20:05<3:15:37, 5.78s/it] 77%|███████▋ | 6719/8750 [3:20:13<3:14:24, 5.74s/it] 77%|███████▋ | 6719/8750 [3:20:11<3:14:24, 5.74s/it] {'loss': 0.4586, 'learning_rate': 2.6954313456452995e-06, 'epoch': 0.77} + 77%|███████▋ | 6719/8750 [3:20:13<3:14:24, 5.74s/it] {'loss': 0.4586, 'learning_rate': 2.6954313456452995e-06, 'epoch': 0.77} + 77%|███████▋ | 6719/8750 [3:20:11<3:14:24, 5.74s/it] 77%|███████▋ | 6720/8750 [3:20:19<3:15:46, 5.79s/it] 77%|███████▋ | 6720/8750 [3:20:16<3:15:46, 5.79s/it] {'loss': 0.4458, 'learning_rate': 2.6929037697332037e-06, 'epoch': 0.77} + 77%|███████▋ | 6720/8750 [3:20:19<3:15:46, 5.79s/it] {'loss': 0.4458, 'learning_rate': 2.6929037697332037e-06, 'epoch': 0.77} + 77%|███████▋ | 6720/8750 [3:20:16<3:15:46, 5.79s/it] 77%|███████▋ | 6721/8750 [3:20:24<3:14:18, 5.75s/it] 77%|███████▋ | 6721/8750 [3:20:22<3:14:17, 5.75s/it] {'loss': 0.4398, 'learning_rate': 2.6903771950564294e-06, 'epoch': 0.77} + 77%|███████▋ | 6721/8750 [3:20:24<3:14:18, 5.75s/it] {'loss': 0.4398, 'learning_rate': 2.6903771950564294e-06, 'epoch': 0.77} + 77%|███████▋ | 6721/8750 [3:20:22<3:14:17, 5.75s/it] 77%|███████▋ | 6722/8750 [3:20:30<3:12:58, 5.71s/it] 77%|███████▋ | 6722/8750 [3:20:28<3:12:58, 5.71s/it] {'loss': 0.4544, 'learning_rate': 2.6878516219611773e-06, 'epoch': 0.77} + 77%|███████▋ | 6722/8750 [3:20:30<3:12:58, 5.71s/it] {'loss': 0.4544, 'learning_rate': 2.6878516219611773e-06, 'epoch': 0.77} + 77%|███████▋ | 6722/8750 [3:20:28<3:12:58, 5.71s/it] 77%|███████▋ | 6723/8750 
[3:20:36<3:13:18, 5.72s/it] 77%|███████▋ | 6723/8750 [3:20:33<3:13:18, 5.72s/it] {'loss': 0.4588, 'learning_rate': 2.6853270507935013e-06, 'epoch': 0.77} + 77%|███████▋ | 6723/8750 [3:20:36<3:13:18, 5.72s/it] {'loss': 0.4588, 'learning_rate': 2.6853270507935013e-06, 'epoch': 0.77} + 77%|███████▋ | 6723/8750 [3:20:33<3:13:18, 5.72s/it] 77%|███████▋ | 6724/8750 [3:20:41<3:11:49, 5.68s/it] 77%|███████▋ | 6724/8750 [3:20:39<3:11:49, 5.68s/it] {'loss': 0.4461, 'learning_rate': 2.6828034818993285e-06, 'epoch': 0.77} + 77%|███████▋ | 6724/8750 [3:20:41<3:11:49, 5.68s/it] {'loss': 0.4461, 'learning_rate': 2.6828034818993285e-06, 'epoch': 0.77} + 77%|███████▋ | 6724/8750 [3:20:39<3:11:49, 5.68s/it] 77%|███████▋ | 6725/8750 [3:20:47<3:10:38, 5.65s/it] 77%|███████▋ | 6725/8750 [3:20:45<3:10:38, 5.65s/it] {'loss': 0.4515, 'learning_rate': 2.680280915624448e-06, 'epoch': 0.77} + 77%|███████▋ | 6725/8750 [3:20:47<3:10:38, 5.65s/it] {'loss': 0.4515, 'learning_rate': 2.680280915624448e-06, 'epoch': 0.77} + 77%|███████▋ | 6725/8750 [3:20:45<3:10:38, 5.65s/it] 77%|███████▋ | 6726/8750 [3:20:53<3:10:51, 5.66s/it] 77%|███████▋ | 6726/8750 [3:20:50<3:10:51, 5.66s/it] {'loss': 0.4567, 'learning_rate': 2.6777593523144986e-06, 'epoch': 0.77} + 77%|███████▋ | 6726/8750 [3:20:53<3:10:51, 5.66s/it] {'loss': 0.4567, 'learning_rate': 2.6777593523144986e-06, 'epoch': 0.77} + 77%|███████▋ | 6726/8750 [3:20:50<3:10:51, 5.66s/it] 77%|███████▋ | 6727/8750 [3:20:58<3:11:56, 5.69s/it] 77%|███████▋ | 6727/8750 [3:20:56<3:11:55, 5.69s/it] {'loss': 0.4453, 'learning_rate': 2.675238792314999e-06, 'epoch': 0.77} + 77%|███████▋ | 6727/8750 [3:20:58<3:11:56, 5.69s/it] {'loss': 0.4453, 'learning_rate': 2.675238792314999e-06, 'epoch': 0.77} + 77%|███████▋ | 6727/8750 [3:20:56<3:11:55, 5.69s/it] 77%|███████▋ | 6728/8750 [3:21:04<3:14:07, 5.76s/it] 77%|███████▋ | 6728/8750 [3:21:02<3:14:07, 5.76s/it] {'loss': 0.4547, 'learning_rate': 2.6727192359713196e-06, 'epoch': 0.77} + 77%|███████▋ | 6728/8750 
[3:21:04<3:14:07, 5.76s/it] {'loss': 0.4547, 'learning_rate': 2.6727192359713196e-06, 'epoch': 0.77} + 77%|███████▋ | 6728/8750 [3:21:02<3:14:07, 5.76s/it] 77%|███████▋ | 6729/8750 [3:21:10<3:12:56, 5.73s/it] 77%|███████▋ | 6729/8750 [3:21:08<3:12:56, 5.73s/it] {'loss': 0.4584, 'learning_rate': 2.670200683628691e-06, 'epoch': 0.77} + 77%|███████▋ | 6729/8750 [3:21:10<3:12:56, 5.73s/it] {'loss': 0.4584, 'learning_rate': 2.670200683628691e-06, 'epoch': 0.77} + 77%|███████▋ | 6729/8750 [3:21:08<3:12:56, 5.73s/it] 77%|███████▋ | 6730/8750 [3:21:16<3:13:26, 5.75s/it] 77%|███████▋ | 6730/8750 [3:21:13<3:13:26, 5.75s/it] {'loss': 0.4569, 'learning_rate': 2.6676831356322184e-06, 'epoch': 0.77} + 77%|███████▋ | 6730/8750 [3:21:16<3:13:26, 5.75s/it] {'loss': 0.4569, 'learning_rate': 2.6676831356322184e-06, 'epoch': 0.77} + 77%|███████▋ | 6730/8750 [3:21:13<3:13:26, 5.75s/it] 77%|███████▋ | 6731/8750 [3:21:21<3:12:25, 5.72s/it] 77%|███████▋ | 6731/8750 [3:21:19<3:12:24, 5.72s/it] {'loss': 0.4601, 'learning_rate': 2.6651665923268555e-06, 'epoch': 0.77} + 77%|███████▋ | 6731/8750 [3:21:21<3:12:25, 5.72s/it] {'loss': 0.4601, 'learning_rate': 2.6651665923268555e-06, 'epoch': 0.77} + 77%|███████▋ | 6731/8750 [3:21:19<3:12:24, 5.72s/it] 77%|███████▋ | 6732/8750 [3:21:27<3:14:33, 5.78s/it] 77%|███████▋ | 6732/8750 [3:21:25<3:14:33, 5.78s/it] {'loss': 0.4423, 'learning_rate': 2.6626510540574314e-06, 'epoch': 0.77} + 77%|███████▋ | 6732/8750 [3:21:27<3:14:33, 5.78s/it] {'loss': 0.4423, 'learning_rate': 2.6626510540574314e-06, 'epoch': 0.77} + 77%|███████▋ | 6732/8750 [3:21:25<3:14:33, 5.78s/it] 77%|███████▋ | 6733/8750 [3:21:33<3:12:36, 5.73s/it] 77%|███████▋ | 6733/8750 [3:21:31<3:12:36, 5.73s/it] {'loss': 0.459, 'learning_rate': 2.6601365211686248e-06, 'epoch': 0.77} + 77%|███████▋ | 6733/8750 [3:21:33<3:12:36, 5.73s/it] {'loss': 0.459, 'learning_rate': 2.6601365211686248e-06, 'epoch': 0.77} + 77%|███████▋ | 6733/8750 [3:21:31<3:12:36, 5.73s/it] 77%|███████▋ | 6734/8750 
[3:21:38<3:11:00, 5.68s/it] 77%|███████▋ | 6734/8750 [3:21:36<3:11:00, 5.68s/it] {'loss': 0.4568, 'learning_rate': 2.657622994004986e-06, 'epoch': 0.77} + 77%|███████▋ | 6734/8750 [3:21:38<3:11:00, 5.68s/it] {'loss': 0.4568, 'learning_rate': 2.657622994004986e-06, 'epoch': 0.77} + 77%|███████▋ | 6734/8750 [3:21:36<3:11:00, 5.68s/it] 77%|███████▋ | 6735/8750 [3:21:44<3:10:11, 5.66s/it] 77%|███████▋ | 6735/8750 [3:21:42<3:10:11, 5.66s/it] {'loss': 0.4484, 'learning_rate': 2.655110472910927e-06, 'epoch': 0.77} + 77%|███████▋ | 6735/8750 [3:21:44<3:10:11, 5.66s/it] {'loss': 0.4484, 'learning_rate': 2.655110472910927e-06, 'epoch': 0.77} + 77%|███████▋ | 6735/8750 [3:21:42<3:10:11, 5.66s/it] 77%|███████▋ | 6736/8750 [3:21:50<3:10:59, 5.69s/it] 77%|███████▋ | 6736/8750 [3:21:48<3:10:58, 5.69s/it] {'loss': 0.4592, 'learning_rate': 2.652598958230713e-06, 'epoch': 0.77} + 77%|███████▋ | 6736/8750 [3:21:50<3:10:59, 5.69s/it] {'loss': 0.4592, 'learning_rate': 2.652598958230713e-06, 'epoch': 0.77} + 77%|███████▋ | 6736/8750 [3:21:48<3:10:58, 5.69s/it] 77%|███████▋ | 6737/8750 [3:21:56<3:11:35, 5.71s/it] 77%|███████▋ | 6737/8750 [3:21:53<3:11:35, 5.71s/it] {'loss': 0.4461, 'learning_rate': 2.6500884503084857e-06, 'epoch': 0.77} + 77%|███████▋ | 6737/8750 [3:21:56<3:11:35, 5.71s/it] {'loss': 0.4461, 'learning_rate': 2.6500884503084857e-06, 'epoch': 0.77} + 77%|███████▋ | 6737/8750 [3:21:53<3:11:35, 5.71s/it] 77%|███████▋ | 6738/8750 [3:22:01<3:10:19, 5.68s/it] 77%|███████▋ | 6738/8750 [3:21:59<3:10:19, 5.68s/it] {'loss': 0.4984, 'learning_rate': 2.647578949488234e-06, 'epoch': 0.77} + 77%|███████▋ | 6738/8750 [3:22:01<3:10:19, 5.68s/it] {'loss': 0.4984, 'learning_rate': 2.647578949488234e-06, 'epoch': 0.77} + 77%|███████▋ | 6738/8750 [3:21:59<3:10:19, 5.68s/it] 77%|███████▋ | 6739/8750 [3:22:07<3:09:07, 5.64s/it] 77%|███████▋ | 6739/8750 [3:22:05<3:09:07, 5.64s/it] {'loss': 0.4566, 'learning_rate': 2.645070456113816e-06, 'epoch': 0.77} + 77%|███████▋ | 6739/8750 [3:22:07<3:09:07, 
5.64s/it] {'loss': 0.4566, 'learning_rate': 2.645070456113816e-06, 'epoch': 0.77} + 77%|███████▋ | 6739/8750 [3:22:05<3:09:07, 5.64s/it] 77%|███████▋ | 6740/8750 [3:22:13<3:13:41, 5.78s/it] 77%|███████▋ | 6740/8750 [3:22:11<3:13:42, 5.78s/it] {'loss': 0.4459, 'learning_rate': 2.6425629705289556e-06, 'epoch': 0.77} + 77%|███████▋ | 6740/8750 [3:22:13<3:13:41, 5.78s/it] {'loss': 0.4459, 'learning_rate': 2.6425629705289556e-06, 'epoch': 0.77} + 77%|███████▋ | 6740/8750 [3:22:11<3:13:42, 5.78s/it] 77%|███████▋ | 6741/8750 [3:22:19<3:13:36, 5.78s/it] 77%|███████▋ | 6741/8750 [3:22:16<3:13:36, 5.78s/it] {'loss': 0.4461, 'learning_rate': 2.640056493077231e-06, 'epoch': 0.77} + 77%|███████▋ | 6741/8750 [3:22:19<3:13:36, 5.78s/it] {'loss': 0.4461, 'learning_rate': 2.640056493077231e-06, 'epoch': 0.77} + 77%|███████▋ | 6741/8750 [3:22:16<3:13:36, 5.78s/it] 77%|███████▋ | 6742/8750 [3:22:24<3:11:27, 5.72s/it] 77%|███████▋ | 6742/8750 [3:22:22<3:11:28, 5.72s/it] {'loss': 0.4478, 'learning_rate': 2.637551024102084e-06, 'epoch': 0.77} + {'loss': 0.4478, 'learning_rate': 2.637551024102084e-06, 'epoch': 0.77} + 77%|███████▋ | 6742/8750 [3:22:24<3:11:27, 5.72s/it] 77%|███████▋ | 6742/8750 [3:22:22<3:11:28, 5.72s/it] 77%|███████▋ | 6743/8750 [3:22:30<3:11:54, 5.74s/it] 77%|███████▋ | 6743/8750 [3:22:28<3:11:54, 5.74s/it] {'loss': 0.44, 'learning_rate': 2.6350465639468213e-06, 'epoch': 0.77} + 77%|███████▋ | 6743/8750 [3:22:30<3:11:54, 5.74s/it] {'loss': 0.44, 'learning_rate': 2.6350465639468213e-06, 'epoch': 0.77} + 77%|███████▋ | 6743/8750 [3:22:28<3:11:54, 5.74s/it] 77%|███████▋ | 6744/8750 [3:22:36<3:11:38, 5.73s/it] 77%|███████▋ | 6744/8750 [3:22:33<3:11:38, 5.73s/it] {'loss': 0.4568, 'learning_rate': 2.6325431129546107e-06, 'epoch': 0.77} + {'loss': 0.4568, 'learning_rate': 2.6325431129546107e-06, 'epoch': 0.77} 77%|███████▋ | 6744/8750 [3:22:36<3:11:38, 5.73s/it] + 77%|███████▋ | 6744/8750 [3:22:33<3:11:38, 5.73s/it] 77%|███████▋ | 6745/8750 [3:22:41<3:11:52, 5.74s/it] 
77%|███████▋ | 6745/8750 [3:22:39<3:11:52, 5.74s/it] {'loss': 0.4455, 'learning_rate': 2.6300406714684834e-06, 'epoch': 0.77} + 77%|███████▋ | 6745/8750 [3:22:42<3:11:52, 5.74s/it] {'loss': 0.4455, 'learning_rate': 2.6300406714684834e-06, 'epoch': 0.77} + 77%|███████▋ | 6745/8750 [3:22:39<3:11:52, 5.74s/it] 77%|███████▋ | 6746/8750 [3:22:47<3:11:07, 5.72s/it] 77%|███████▋ | 6746/8750 [3:22:45<3:11:08, 5.72s/it] {'loss': 0.4659, 'learning_rate': 2.627539239831328e-06, 'epoch': 0.77} + 77%|███████▋ | 6746/8750 [3:22:47<3:11:07, 5.72s/it] {'loss': 0.4659, 'learning_rate': 2.627539239831328e-06, 'epoch': 0.77} + 77%|███████▋ | 6746/8750 [3:22:45<3:11:08, 5.72s/it] 77%|███████▋ | 6747/8750 [3:22:53<3:12:13, 5.76s/it] 77%|███████▋ | 6747/8750 [3:22:51<3:12:13, 5.76s/it] {'loss': 0.4441, 'learning_rate': 2.625038818385892e-06, 'epoch': 0.77} + 77%|███████▋ | 6747/8750 [3:22:53<3:12:13, 5.76s/it] {'loss': 0.4441, 'learning_rate': 2.625038818385892e-06, 'epoch': 0.77} + 77%|███████▋ | 6747/8750 [3:22:51<3:12:13, 5.76s/it] 77%|███████▋ | 6748/8750 [3:22:59<3:11:46, 5.75s/it] 77%|███████▋ | 6748/8750 [3:22:56<3:11:47, 5.75s/it] {'loss': 0.4484, 'learning_rate': 2.6225394074747956e-06, 'epoch': 0.77} + 77%|███████▋ | 6748/8750 [3:22:59<3:11:46, 5.75s/it] {'loss': 0.4484, 'learning_rate': 2.6225394074747956e-06, 'epoch': 0.77} + 77%|███████▋ | 6748/8750 [3:22:56<3:11:47, 5.75s/it] 77%|███████▋ | 6749/8750 [3:23:04<3:10:06, 5.70s/it] 77%|███████▋ | 6749/8750 [3:23:02<3:10:06, 5.70s/it] {'loss': 0.4494, 'learning_rate': 2.620041007440508e-06, 'epoch': 0.77} + 77%|███████▋ | 6749/8750 [3:23:04<3:10:06, 5.70s/it] {'loss': 0.4494, 'learning_rate': 2.620041007440508e-06, 'epoch': 0.77} + 77%|███████▋ | 6749/8750 [3:23:02<3:10:06, 5.70s/it]9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+ 77%|███████▋ | 6750/8750 [3:23:10<3:10:16, 5.71s/it]1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 77%|███████▋ | 6750/8750 [3:23:08<3:10:16, 5.71s/it]63 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + {'loss': 0.4613, 'learning_rate': 2.617543618625371e-06, 'epoch': 0.77} + 77%|███████▋ | 6750/8750 [3:23:10<3:10:16, 5.71s/it] {'loss': 0.4613, 'learning_rate': 2.617543618625371e-06, 'epoch': 0.77} + 77%|███████▋ | 6750/8750 [3:23:08<3:10:16, 5.71s/it] 77%|███████▋ | 6751/8750 [3:23:16<3:11:59, 5.76s/it] 77%|███████▋ | 6751/8750 [3:23:14<3:11:59, 5.76s/it] {'loss': 0.4473, 'learning_rate': 2.615047241371581e-06, 'epoch': 0.77} + 77%|███████▋ | 6751/8750 [3:23:16<3:11:59, 5.76s/it] {'loss': 0.4473, 'learning_rate': 2.615047241371581e-06, 'epoch': 0.77} + 77%|███████▋ | 6751/8750 [3:23:14<3:11:59, 5.76s/it] 77%|███████▋ | 6752/8750 [3:23:22<3:11:01, 5.74s/it] 77%|███████▋ | 6752/8750 [3:23:19<3:11:01, 5.74s/it] {'loss': 0.459, 'learning_rate': 2.6125518760211933e-06, 'epoch': 0.77} + 77%|███████▋ | 6752/8750 [3:23:22<3:11:01, 5.74s/it] {'loss': 0.459, 'learning_rate': 2.6125518760211933e-06, 'epoch': 0.77} + 77%|███████▋ | 6752/8750 [3:23:19<3:11:01, 5.74s/it] 77%|███████▋ | 6753/8750 [3:23:27<3:10:27, 5.72s/it] 77%|███████▋ | 6753/8750 [3:23:25<3:10:27, 5.72s/it] {'loss': 0.456, 'learning_rate': 2.610057522916132e-06, 'epoch': 0.77} + 77%|███████▋ | 6753/8750 [3:23:27<3:10:27, 5.72s/it] {'loss': 0.456, 'learning_rate': 2.610057522916132e-06, 'epoch': 0.77} + 
77%|███████▋ | 6753/8750 [3:23:25<3:10:27, 5.72s/it] 77%|███████▋ | 6754/8750 [3:23:33<3:09:50, 5.71s/it] 77%|███████▋ | 6754/8750 [3:23:31<3:09:50, 5.71s/it] {'loss': 0.4547, 'learning_rate': 2.6075641823981814e-06, 'epoch': 0.77} + 77%|███████▋ | 6754/8750 [3:23:33<3:09:50, 5.71s/it] {'loss': 0.4547, 'learning_rate': 2.6075641823981814e-06, 'epoch': 0.77} + 77%|███████▋ | 6754/8750 [3:23:31<3:09:50, 5.71s/it] 77%|███████▋ | 6755/8750 [3:23:39<3:09:32, 5.70s/it] 77%|███████▋ | 6755/8750 [3:23:36<3:09:32, 5.70s/it] {'loss': 0.4559, 'learning_rate': 2.6050718548089804e-06, 'epoch': 0.77} + {'loss': 0.4559, 'learning_rate': 2.6050718548089804e-06, 'epoch': 0.77} + 77%|███████▋ | 6755/8750 [3:23:39<3:09:32, 5.70s/it] 77%|███████▋ | 6755/8750 [3:23:36<3:09:32, 5.70s/it] 77%|███████▋ | 6756/8750 [3:23:44<3:08:34, 5.67s/it] 77%|███████▋ | 6756/8750 [3:23:42<3:08:34, 5.67s/it] {'loss': 0.463, 'learning_rate': 2.602580540490038e-06, 'epoch': 0.77} + 77%|███████▋ | 6756/8750 [3:23:44<3:08:34, 5.67s/it] {'loss': 0.463, 'learning_rate': 2.602580540490038e-06, 'epoch': 0.77} + 77%|███████▋ | 6756/8750 [3:23:42<3:08:34, 5.67s/it] 77%|███████▋ | 6757/8750 [3:23:50<3:10:27, 5.73s/it] 77%|███████▋ | 6757/8750 [3:23:48<3:10:27, 5.73s/it] {'loss': 0.4474, 'learning_rate': 2.6000902397827154e-06, 'epoch': 0.77} + 77%|███████▋ | 6757/8750 [3:23:50<3:10:27, 5.73s/it] {'loss': 0.4474, 'learning_rate': 2.6000902397827154e-06, 'epoch': 0.77} + 77%|███████▋ | 6757/8750 [3:23:48<3:10:27, 5.73s/it] 77%|███████▋ | 6758/8750 [3:23:56<3:09:23, 5.70s/it] 77%|███████▋ | 6758/8750 [3:23:54<3:09:23, 5.70s/it] {'loss': 0.4649, 'learning_rate': 2.5976009530282455e-06, 'epoch': 0.77} + 77%|███████▋ | 6758/8750 [3:23:56<3:09:23, 5.70s/it] {'loss': 0.4649, 'learning_rate': 2.5976009530282455e-06, 'epoch': 0.77} + 77%|███████▋ | 6758/8750 [3:23:54<3:09:23, 5.70s/it] 77%|███████▋ | 6759/8750 [3:24:02<3:10:56, 5.75s/it] {'loss': 0.4483, 'learning_rate': 2.595112680567711e-06, 'epoch': 0.77} + 77%|███████▋ 
| 6759/8750 [3:23:59<3:10:56, 5.75s/it] 77%|███████▋ | 6759/8750 [3:24:02<3:10:56, 5.75s/it] {'loss': 0.4483, 'learning_rate': 2.595112680567711e-06, 'epoch': 0.77} + 77%|███████▋ | 6759/8750 [3:23:59<3:10:56, 5.75s/it] 77%|███████▋ | 6760/8750 [3:24:07<3:11:06, 5.76s/it] 77%|███████▋ | 6760/8750 [3:24:05<3:11:05, 5.76s/it] {'loss': 0.4602, 'learning_rate': 2.592625422742059e-06, 'epoch': 0.77} + 77%|███████▋ | 6760/8750 [3:24:07<3:11:06, 5.76s/it] {'loss': 0.4602, 'learning_rate': 2.592625422742059e-06, 'epoch': 0.77} + 77%|███████▋ | 6760/8750 [3:24:05<3:11:05, 5.76s/it] 77%|███████▋ | 6761/8750 [3:24:13<3:09:55, 5.73s/it] 77%|███████▋ | 6761/8750 [3:24:11<3:09:55, 5.73s/it]{'loss': 0.4647, 'learning_rate': 2.5901391798921018e-06, 'epoch': 0.77} + {'loss': 0.4647, 'learning_rate': 2.5901391798921018e-06, 'epoch': 0.77} + 77%|███████▋ | 6761/8750 [3:24:13<3:09:55, 5.73s/it] 77%|███████▋ | 6761/8750 [3:24:11<3:09:55, 5.73s/it] 77%|███████▋ | 6762/8750 [3:24:19<3:09:57, 5.73s/it] 77%|███████▋ | 6762/8750 [3:24:17<3:09:57, 5.73s/it] {'loss': 0.4619, 'learning_rate': 2.5876539523585144e-06, 'epoch': 0.77} + 77%|███████▋ | 6762/8750 [3:24:19<3:09:57, 5.73s/it] {'loss': 0.4619, 'learning_rate': 2.5876539523585144e-06, 'epoch': 0.77} + 77%|███████▋ | 6762/8750 [3:24:17<3:09:57, 5.73s/it] 77%|███████▋ | 6763/8750 [3:24:24<3:08:33, 5.69s/it] 77%|███████▋ | 6763/8750 [3:24:22<3:08:33, 5.69s/it] {'loss': 0.4544, 'learning_rate': 2.585169740481822e-06, 'epoch': 0.77} + 77%|███████▋ | 6763/8750 [3:24:24<3:08:33, 5.69s/it] {'loss': 0.4544, 'learning_rate': 2.585169740481822e-06, 'epoch': 0.77} + 77%|███████▋ | 6763/8750 [3:24:22<3:08:33, 5.69s/it] 77%|███████▋ | 6764/8750 [3:24:30<3:08:53, 5.71s/it] 77%|███████▋ | 6764/8750 [3:24:28<3:08:53, 5.71s/it] {'loss': 0.4573, 'learning_rate': 2.582686544602423e-06, 'epoch': 0.77} + 77%|███████▋ | 6764/8750 [3:24:30<3:08:53, 5.71s/it] {'loss': 0.4573, 'learning_rate': 2.582686544602423e-06, 'epoch': 0.77} + 77%|███████▋ | 6764/8750 
[3:24:28<3:08:53, 5.71s/it] 77%|███████▋ | 6765/8750 [3:24:36<3:11:57, 5.80s/it] 77%|███████▋ | 6765/8750 [3:24:34<3:11:57, 5.80s/it] {'loss': 0.4429, 'learning_rate': 2.5802043650605645e-06, 'epoch': 0.77} + 77%|███████▋ | 6765/8750 [3:24:36<3:11:57, 5.80s/it] {'loss': 0.4429, 'learning_rate': 2.5802043650605645e-06, 'epoch': 0.77} + 77%|███████▋ | 6765/8750 [3:24:34<3:11:57, 5.80s/it] 77%|███████▋ | 6766/8750 [3:24:42<3:11:13, 5.78s/it] 77%|███████▋ | 6766/8750 [3:24:40<3:11:13, 5.78s/it] {'loss': 0.4496, 'learning_rate': 2.577723202196366e-06, 'epoch': 0.77} + 77%|███████▋ | 6766/8750 [3:24:42<3:11:13, 5.78s/it] {'loss': 0.4496, 'learning_rate': 2.577723202196366e-06, 'epoch': 0.77} + 77%|███████▋ | 6766/8750 [3:24:40<3:11:13, 5.78s/it] 77%|███████▋ | 6767/8750 [3:24:45<3:09:25, 5.73s/it] 77%|███████▋ | 6767/8750 [3:24:48<3:09:26, 5.73s/it] {'loss': 0.4589, 'learning_rate': 2.575243056349801e-06, 'epoch': 0.77} + 77%|███████▋ | 6767/8750 [3:24:45<3:09:25, 5.73s/it] {'loss': 0.4589, 'learning_rate': 2.575243056349801e-06, 'epoch': 0.77} + 77%|███████▋ | 6767/8750 [3:24:48<3:09:26, 5.73s/it] 77%|███████▋ | 6768/8750 [3:24:54<3:12:39, 5.83s/it] 77%|███████▋ | 6768/8750 [3:24:51<3:12:39, 5.83s/it] {'loss': 0.4451, 'learning_rate': 2.5727639278606997e-06, 'epoch': 0.77} + 77%|███████▋ | 6768/8750 [3:24:54<3:12:39, 5.83s/it] {'loss': 0.4451, 'learning_rate': 2.5727639278606997e-06, 'epoch': 0.77} + 77%|███████▋ | 6768/8750 [3:24:51<3:12:39, 5.83s/it] 77%|███████▋ | 6769/8750 [3:24:59<3:11:25, 5.80s/it] 77%|███████▋ | 6769/8750 [3:24:57<3:11:25, 5.80s/it] {'loss': 0.4611, 'learning_rate': 2.570285817068765e-06, 'epoch': 0.77} + 77%|███████▋ | 6769/8750 [3:24:59<3:11:25, 5.80s/it] {'loss': 0.4611, 'learning_rate': 2.570285817068765e-06, 'epoch': 0.77} + 77%|███████▋ | 6769/8750 [3:24:57<3:11:25, 5.80s/it] 77%|███████▋ | 6770/8750 [3:25:05<3:09:22, 5.74s/it] 77%|███████▋ | 6770/8750 [3:25:03<3:09:22, 5.74s/it] {'loss': 0.4572, 'learning_rate': 2.5678087243135476e-06, 
'epoch': 0.77} + 77%|███████▋ | 6770/8750 [3:25:05<3:09:22, 5.74s/it] {'loss': 0.4572, 'learning_rate': 2.5678087243135476e-06, 'epoch': 0.77} + 77%|███████▋ | 6770/8750 [3:25:03<3:09:22, 5.74s/it] 77%|███████▋ | 6771/8750 [3:25:11<3:08:48, 5.72s/it] 77%|███████▋ | 6771/8750 [3:25:08<3:08:48, 5.72s/it] {'loss': 0.4633, 'learning_rate': 2.565332649934468e-06, 'epoch': 0.77} + 77%|███████▋ | 6771/8750 [3:25:11<3:08:48, 5.72s/it] {'loss': 0.4633, 'learning_rate': 2.565332649934468e-06, 'epoch': 0.77} + 77%|███████▋ | 6771/8750 [3:25:08<3:08:48, 5.72s/it] 77%|███████▋ | 6772/8750 [3:25:16<3:09:54, 5.76s/it] 77%|███████▋ | 6772/8750 [3:25:14<3:09:54, 5.76s/it] {'loss': 0.4609, 'learning_rate': 2.5628575942708047e-06, 'epoch': 0.77} + 77%|███████▋ | 6772/8750 [3:25:16<3:09:54, 5.76s/it] {'loss': 0.4609, 'learning_rate': 2.5628575942708047e-06, 'epoch': 0.77} + 77%|███████▋ | 6772/8750 [3:25:14<3:09:54, 5.76s/it] 77%|███████▋ | 6773/8750 [3:25:22<3:09:34, 5.75s/it] 77%|███████▋ | 6773/8750 [3:25:20<3:09:34, 5.75s/it] {'loss': 0.454, 'learning_rate': 2.560383557661692e-06, 'epoch': 0.77} + 77%|███████▋ | 6773/8750 [3:25:22<3:09:34, 5.75s/it] {'loss': 0.454, 'learning_rate': 2.560383557661692e-06, 'epoch': 0.77} + 77%|███████▋ | 6773/8750 [3:25:20<3:09:34, 5.75s/it] 77%|███████▋ | 6774/8750 [3:25:28<3:07:33, 5.70s/it] 77%|███████▋ | 6774/8750 [3:25:26<3:07:33, 5.70s/it] {'loss': 0.4556, 'learning_rate': 2.5579105404461325e-06, 'epoch': 0.77} + 77%|███████▋ | 6774/8750 [3:25:28<3:07:33, 5.70s/it] {'loss': 0.4556, 'learning_rate': 2.5579105404461325e-06, 'epoch': 0.77} + 77%|███████▋ | 6774/8750 [3:25:26<3:07:33, 5.70s/it] 77%|███████▋ | 6775/8750 [3:25:33<3:07:18, 5.69s/it] 77%|███████▋ | 6775/8750 [3:25:31<3:07:17, 5.69s/it] {'loss': 0.4432, 'learning_rate': 2.555438542962979e-06, 'epoch': 0.77} + 77%|███████▋ | 6775/8750 [3:25:33<3:07:18, 5.69s/it] {'loss': 0.4432, 'learning_rate': 2.555438542962979e-06, 'epoch': 0.77} + 77%|███████▋ | 6775/8750 [3:25:31<3:07:17, 5.69s/it] 
77%|███████▋ | 6776/8750 [3:25:39<3:10:10, 5.78s/it] {'loss': 0.4549, 'learning_rate': 2.5529675655509567e-06, 'epoch': 0.77} + 77%|███████▋ | 6776/8750 [3:25:37<3:10:10, 5.78s/it] {'loss': 0.4549, 'learning_rate': 2.5529675655509567e-06, 'epoch': 0.77} 77%|███████▋ | 6776/8750 [3:25:39<3:10:10, 5.78s/it] + 77%|███████▋ | 6776/8750 [3:25:37<3:10:10, 5.78s/it] 77%|███████▋ | 6777/8750 [3:25:45<3:12:28, 5.85s/it] 77%|███████▋ | 6777/8750 [3:25:43<3:12:28, 5.85s/it] {'loss': 0.4414, 'learning_rate': 2.550497608548642e-06, 'epoch': 0.77} + 77%|███████▋ | 6777/8750 [3:25:45<3:12:28, 5.85s/it] {'loss': 0.4414, 'learning_rate': 2.550497608548642e-06, 'epoch': 0.77} + 77%|███████▋ | 6777/8750 [3:25:43<3:12:28, 5.85s/it] 77%|███████▋ | 6778/8750 [3:25:51<3:11:02, 5.81s/it] 77%|███████▋ | 6778/8750 [3:25:49<3:11:02, 5.81s/it] {'loss': 0.4531, 'learning_rate': 2.5480286722944712e-06, 'epoch': 0.77} + 77%|███████▋ | 6778/8750 [3:25:51<3:11:02, 5.81s/it] {'loss': 0.4531, 'learning_rate': 2.5480286722944712e-06, 'epoch': 0.77} + 77%|███████▋ | 6778/8750 [3:25:49<3:11:02, 5.81s/it] 77%|███████▋ | 6779/8750 [3:25:57<3:09:11, 5.76s/it] 77%|███████▋ | 6779/8750 [3:25:55<3:09:11, 5.76s/it] {'loss': 0.4699, 'learning_rate': 2.5455607571267484e-06, 'epoch': 0.77} + 77%|███████▋ | 6779/8750 [3:25:57<3:09:11, 5.76s/it] {'loss': 0.4699, 'learning_rate': 2.5455607571267484e-06, 'epoch': 0.77} + 77%|███████▋ | 6779/8750 [3:25:55<3:09:11, 5.76s/it] 77%|███████▋ | 6780/8750 [3:26:02<3:07:23, 5.71s/it] 77%|███████▋ | 6780/8750 [3:26:00<3:07:23, 5.71s/it] {'loss': 0.4576, 'learning_rate': 2.543093863383629e-06, 'epoch': 0.77} + 77%|███████▋ | 6780/8750 [3:26:02<3:07:23, 5.71s/it] {'loss': 0.4576, 'learning_rate': 2.543093863383629e-06, 'epoch': 0.77} + 77%|███████▋ | 6780/8750 [3:26:00<3:07:23, 5.71s/it] 77%|███████▋ | 6781/8750 [3:26:08<3:08:24, 5.74s/it] 77%|███████▋ | 6781/8750 [3:26:06<3:08:24, 5.74s/it] {'loss': 0.4535, 'learning_rate': 2.540627991403134e-06, 'epoch': 0.77} + 77%|███████▋ 
| 6781/8750 [3:26:08<3:08:24, 5.74s/it] {'loss': 0.4535, 'learning_rate': 2.540627991403134e-06, 'epoch': 0.77} + 77%|███████▋ | 6781/8750 [3:26:06<3:08:24, 5.74s/it] 78%|███████▊ | 6782/8750 [3:26:14<3:08:39, 5.75s/it] 78%|███████▊ | 6782/8750 [3:26:12<3:08:39, 5.75s/it] {'loss': 0.479, 'learning_rate': 2.5381631415231455e-06, 'epoch': 0.78} + 78%|███████▊ | 6782/8750 [3:26:14<3:08:39, 5.75s/it] {'loss': 0.479, 'learning_rate': 2.5381631415231455e-06, 'epoch': 0.78} + 78%|███████▊ | 6782/8750 [3:26:12<3:08:39, 5.75s/it] 78%|███████▊ | 6783/8750 [3:26:20<3:08:51, 5.76s/it] 78%|███████▊ | 6783/8750 [3:26:18<3:08:51, 5.76s/it] {'loss': 0.4589, 'learning_rate': 2.535699314081399e-06, 'epoch': 0.78} + 78%|███████▊ | 6783/8750 [3:26:20<3:08:51, 5.76s/it] {'loss': 0.4589, 'learning_rate': 2.535699314081399e-06, 'epoch': 0.78} + 78%|███████▊ | 6783/8750 [3:26:18<3:08:51, 5.76s/it] 78%|███████▊ | 6784/8750 [3:26:25<3:07:29, 5.72s/it] 78%|███████▊ | 6784/8750 [3:26:23<3:07:28, 5.72s/it] {'loss': 0.4563, 'learning_rate': 2.5332365094154975e-06, 'epoch': 0.78} + 78%|███████▊ | 6784/8750 [3:26:25<3:07:29, 5.72s/it] {'loss': 0.4563, 'learning_rate': 2.5332365094154975e-06, 'epoch': 0.78} + 78%|███████▊ | 6784/8750 [3:26:23<3:07:28, 5.72s/it] 78%|███████▊ | 6785/8750 [3:26:31<3:10:54, 5.83s/it] 78%|███████▊ | 6785/8750 [3:26:29<3:10:54, 5.83s/it] {'loss': 0.4539, 'learning_rate': 2.530774727862899e-06, 'epoch': 0.78} + 78%|███████▊ | 6785/8750 [3:26:31<3:10:54, 5.83s/it] {'loss': 0.4539, 'learning_rate': 2.530774727862899e-06, 'epoch': 0.78} + 78%|███████▊ | 6785/8750 [3:26:29<3:10:54, 5.83s/it] 78%|███████▊ | 6786/8750 [3:26:37<3:08:19, 5.75s/it] 78%|███████▊ | 6786/8750 [3:26:35<3:08:19, 5.75s/it] {'loss': 0.4446, 'learning_rate': 2.5283139697609195e-06, 'epoch': 0.78} + 78%|███████▊ | 6786/8750 [3:26:37<3:08:19, 5.75s/it] {'loss': 0.4446, 'learning_rate': 2.5283139697609195e-06, 'epoch': 0.78} + 78%|███████▊ | 6786/8750 [3:26:35<3:08:19, 5.75s/it] 78%|███████▊ | 6787/8750 
[3:26:43<3:07:37, 5.73s/it] 78%|███████▊ | 6787/8750 [3:26:40<3:07:37, 5.74s/it] {'loss': 0.4636, 'learning_rate': 2.525854235446743e-06, 'epoch': 0.78} + 78%|███████▊ | 6787/8750 [3:26:43<3:07:37, 5.73s/it] {'loss': 0.4636, 'learning_rate': 2.525854235446743e-06, 'epoch': 0.78} + 78%|███████▊ | 6787/8750 [3:26:40<3:07:37, 5.74s/it] 78%|███████▊ | 6788/8750 [3:26:48<3:06:53, 5.72s/it] 78%|███████▊ | 6788/8750 [3:26:46<3:06:52, 5.72s/it] {'loss': 0.4485, 'learning_rate': 2.5233955252574027e-06, 'epoch': 0.78} + 78%|███████▊ | 6788/8750 [3:26:48<3:06:53, 5.72s/it] {'loss': 0.4485, 'learning_rate': 2.5233955252574027e-06, 'epoch': 0.78} + 78%|███████▊ | 6788/8750 [3:26:46<3:06:52, 5.72s/it] 78%|███████▊ | 6789/8750 [3:26:54<3:06:45, 5.71s/it] 78%|███████▊ | 6789/8750 [3:26:52<3:06:45, 5.71s/it] {'loss': 0.4408, 'learning_rate': 2.5209378395298013e-06, 'epoch': 0.78} + 78%|███████▊ | 6789/8750 [3:26:54<3:06:45, 5.71s/it] {'loss': 0.4408, 'learning_rate': 2.5209378395298013e-06, 'epoch': 0.78} + 78%|███████▊ | 6789/8750 [3:26:52<3:06:45, 5.71s/it] 78%|███████▊ | 6790/8750 [3:27:00<3:05:35, 5.68s/it] 78%|███████▊ | 6790/8750 [3:26:57<3:05:35, 5.68s/it] {'loss': 0.4678, 'learning_rate': 2.5184811786006923e-06, 'epoch': 0.78} + 78%|███████▊ | 6790/8750 [3:27:00<3:05:35, 5.68s/it] {'loss': 0.4678, 'learning_rate': 2.5184811786006923e-06, 'epoch': 0.78} + 78%|███████▊ | 6790/8750 [3:26:57<3:05:35, 5.68s/it] 78%|███████▊ | 6791/8750 [3:27:05<3:05:33, 5.68s/it] 78%|███████▊ | 6791/8750 [3:27:03<3:05:32, 5.68s/it] {'loss': 0.4418, 'learning_rate': 2.516025542806696e-06, 'epoch': 0.78} + 78%|███████▊ | 6791/8750 [3:27:05<3:05:33, 5.68s/it] {'loss': 0.4418, 'learning_rate': 2.516025542806696e-06, 'epoch': 0.78} + 78%|███████▊ | 6791/8750 [3:27:03<3:05:32, 5.68s/it] 78%|███████▊ | 6792/8750 [3:27:11<3:06:53, 5.73s/it] 78%|███████▊ | 6792/8750 [3:27:09<3:06:53, 5.73s/it] {'loss': 0.4579, 'learning_rate': 2.5135709324842906e-06, 'epoch': 0.78} + 78%|███████▊ | 6792/8750 
[3:27:09<3:06:53, 5.73s/it]{'loss': 0.4579, 'learning_rate': 2.5135709324842906e-06, 'epoch': 0.78} + 78%|███████▊ | 6792/8750 [3:27:11<3:06:53, 5.73s/it] 78%|███████▊ | 6793/8750 [3:27:17<3:06:53, 5.73s/it] 78%|███████▊ | 6793/8750 [3:27:15<3:06:53, 5.73s/it] {'loss': 0.453, 'learning_rate': 2.511117347969809e-06, 'epoch': 0.78} + 78%|███████▊ | 6793/8750 [3:27:17<3:06:53, 5.73s/it] {'loss': 0.453, 'learning_rate': 2.511117347969809e-06, 'epoch': 0.78} + 78%|███████▊ | 6793/8750 [3:27:15<3:06:53, 5.73s/it] 78%|███████▊ | 6794/8750 [3:27:23<3:09:54, 5.83s/it] 78%|███████▊ | 6794/8750 [3:27:21<3:09:54, 5.83s/it] {'loss': 0.4449, 'learning_rate': 2.508664789599451e-06, 'epoch': 0.78} + 78%|███████▊ | 6794/8750 [3:27:23<3:09:54, 5.83s/it] {'loss': 0.4449, 'learning_rate': 2.508664789599451e-06, 'epoch': 0.78} + 78%|███████▊ | 6794/8750 [3:27:21<3:09:54, 5.83s/it] 78%|███████▊ | 6795/8750 [3:27:29<3:10:46, 5.86s/it] 78%|███████▊ | 6795/8750 [3:27:27<3:10:46, 5.86s/it] {'loss': 0.4601, 'learning_rate': 2.50621325770927e-06, 'epoch': 0.78} + {'loss': 0.4601, 'learning_rate': 2.50621325770927e-06, 'epoch': 0.78} + 78%|███████▊ | 6795/8750 [3:27:29<3:10:46, 5.86s/it] 78%|███████▊ | 6795/8750 [3:27:27<3:10:46, 5.86s/it] 78%|███████▊ | 6796/8750 [3:27:35<3:08:43, 5.80s/it] 78%|███████▊ | 6796/8750 [3:27:32<3:08:43, 5.80s/it] {'loss': 0.4701, 'learning_rate': 2.503762752635177e-06, 'epoch': 0.78} + 78%|███████▊ | 6796/8750 [3:27:35<3:08:43, 5.80s/it] {'loss': 0.4701, 'learning_rate': 2.503762752635177e-06, 'epoch': 0.78} + 78%|███████▊ | 6796/8750 [3:27:32<3:08:43, 5.80s/it] 78%|███████▊ | 6797/8750 [3:27:40<3:08:20, 5.79s/it] 78%|███████▊ | 6797/8750 [3:27:38<3:08:20, 5.79s/it] {'loss': 0.4432, 'learning_rate': 2.501313274712952e-06, 'epoch': 0.78} + 78%|███████▊ | 6797/8750 [3:27:40<3:08:20, 5.79s/it] {'loss': 0.4432, 'learning_rate': 2.501313274712952e-06, 'epoch': 0.78} + 78%|███████▊ | 6797/8750 [3:27:38<3:08:20, 5.79s/it] 78%|███████▊ | 6798/8750 [3:27:46<3:06:35, 
5.74s/it] 78%|███████▊ | 6798/8750 [3:27:44<3:06:35, 5.74s/it] {'loss': 0.4596, 'learning_rate': 2.4988648242782255e-06, 'epoch': 0.78} + 78%|███████▊ | 6798/8750 [3:27:46<3:06:35, 5.74s/it] {'loss': 0.4596, 'learning_rate': 2.4988648242782255e-06, 'epoch': 0.78} + 78%|███████▊ | 6798/8750 [3:27:44<3:06:35, 5.74s/it] 78%|███████▊ | 6799/8750 [3:27:52<3:05:51, 5.72s/it] 78%|███████▊ | 6799/8750 [3:27:49<3:05:51, 5.72s/it] {'loss': 0.4429, 'learning_rate': 2.4964174016664865e-06, 'epoch': 0.78} + 78%|███████▊ | 6799/8750 [3:27:52<3:05:51, 5.72s/it] {'loss': 0.4429, 'learning_rate': 2.4964174016664865e-06, 'epoch': 0.78} + 78%|███████▊ | 6799/8750 [3:27:49<3:05:51, 5.72s/it]9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +81441 AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...15 + + 78%|███████▊ | 6800/8750 [3:27:57<3:06:31, 5.74s/it] AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 78%|███████▊ | 6800/8750 [3:27:55<3:06:32, 5.74s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4535, 'learning_rate': 2.4939710072130895e-06, 'epoch': 0.78} + 78%|███████▊ | 6800/8750 [3:27:57<3:06:31, 5.74s/it] {'loss': 0.4535, 'learning_rate': 2.4939710072130895e-06, 'epoch': 0.78} + 78%|███████▊ | 6800/8750 [3:27:55<3:06:32, 5.74s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6800/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6800/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6800/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 78%|███████▊ | 6801/8750 [3:28:19<5:44:24, 10.60s/it] 78%|███████▊ | 6801/8750 [3:28:17<5:44:24, 10.60s/it] {'loss': 0.4357, 'learning_rate': 2.4915256412532463e-06, 'epoch': 0.78} + 78%|███████▊ | 6801/8750 [3:28:19<5:44:24, 10.60s/it] {'loss': 0.4357, 'learning_rate': 2.4915256412532463e-06, 'epoch': 0.78} + 78%|███████▊ | 6801/8750 [3:28:17<5:44:24, 10.60s/it] 78%|███████▊ | 6802/8750 [3:28:25<4:56:03, 9.12s/it] 78%|███████▊ | 6802/8750 [3:28:23<4:56:04, 9.12s/it] {'loss': 0.4536, 'learning_rate': 2.4890813041220286e-06, 'epoch': 0.78} + 78%|███████▊ | 6802/8750 [3:28:25<4:56:03, 9.12s/it] {'loss': 0.4536, 'learning_rate': 2.4890813041220286e-06, 'epoch': 0.78} + 78%|███████▊ | 6802/8750 [3:28:23<4:56:04, 9.12s/it] 78%|███████▊ | 6803/8750 [3:28:31<4:22:25, 8.09s/it] 78%|███████▊ | 6803/8750 [3:28:28<4:22:26, 8.09s/it] {'loss': 0.442, 'learning_rate': 2.486637996154362e-06, 'epoch': 0.78} + {'loss': 0.442, 'learning_rate': 2.486637996154362e-06, 'epoch': 0.78} 78%|███████▊ | 6803/8750 [3:28:31<4:22:25, 8.09s/it] + 78%|███████▊ | 6803/8750 [3:28:28<4:22:26, 8.09s/it] 78%|███████▊ | 6804/8750 [3:28:36<3:59:27, 7.38s/it] 78%|███████▊ | 6804/8750 [3:28:34<3:59:27, 7.38s/it] {'loss': 0.4543, 'learning_rate': 2.4841957176850306e-06, 'epoch': 0.78} + 78%|███████▊ | 6804/8750 [3:28:37<3:59:27, 7.38s/it] {'loss': 0.4543, 'learning_rate': 2.4841957176850306e-06, 'epoch': 0.78} + 78%|███████▊ | 6804/8750 [3:28:34<3:59:27, 7.38s/it] 78%|███████▊ | 6805/8750 [3:28:42<3:41:45, 6.84s/it] 78%|███████▊ | 6805/8750 [3:28:40<3:41:45, 6.84s/it]{'loss': 0.4535, 'learning_rate': 2.4817544690486896e-06, 'epoch': 0.78} + {'loss': 0.4535, 'learning_rate': 2.4817544690486896e-06, 'epoch': 0.78} + 78%|███████▊ | 6805/8750 [3:28:42<3:41:45, 6.84s/it] 78%|███████▊ | 6805/8750 [3:28:40<3:41:45, 6.84s/it] 78%|███████▊ | 6806/8750 [3:28:48<3:29:50, 6.48s/it] 78%|███████▊ | 6806/8750 [3:28:45<3:29:49, 6.48s/it] {'loss': 0.4411, 'learning_rate': 2.4793142505798363e-06, 
'epoch': 0.78} + 78%|███████▊ | 6806/8750 [3:28:48<3:29:50, 6.48s/it] {'loss': 0.4411, 'learning_rate': 2.4793142505798363e-06, 'epoch': 0.78} + 78%|███████▊ | 6806/8750 [3:28:45<3:29:49, 6.48s/it] 78%|███████▊ | 6807/8750 [3:28:53<3:21:49, 6.23s/it] 78%|███████▊ | 6807/8750 [3:28:51<3:21:49, 6.23s/it] {'loss': 0.4497, 'learning_rate': 2.4768750626128414e-06, 'epoch': 0.78} + 78%|███████▊ | 6807/8750 [3:28:53<3:21:49, 6.23s/it] {'loss': 0.4497, 'learning_rate': 2.4768750626128414e-06, 'epoch': 0.78} + 78%|███████▊ | 6807/8750 [3:28:51<3:21:49, 6.23s/it] 78%|███████▊ | 6808/8750 [3:28:59<3:16:58, 6.09s/it] 78%|███████▊ | 6808/8750 [3:28:57<3:16:58, 6.09s/it] {'loss': 0.4533, 'learning_rate': 2.4744369054819252e-06, 'epoch': 0.78} + 78%|███████▊ | 6808/8750 [3:28:59<3:16:58, 6.09s/it] {'loss': 0.4533, 'learning_rate': 2.4744369054819252e-06, 'epoch': 0.78} + 78%|███████▊ | 6808/8750 [3:28:57<3:16:58, 6.09s/it] 78%|███████▊ | 6809/8750 [3:29:05<3:17:30, 6.11s/it] 78%|███████▊ | 6809/8750 [3:29:03<3:17:30, 6.11s/it] {'loss': 0.4417, 'learning_rate': 2.4719997795211683e-06, 'epoch': 0.78} + 78%|███████▊ | 6809/8750 [3:29:05<3:17:30, 6.11s/it] {'loss': 0.4417, 'learning_rate': 2.4719997795211683e-06, 'epoch': 0.78} + 78%|███████▊ | 6809/8750 [3:29:03<3:17:30, 6.11s/it] 78%|███████▊ | 6810/8750 [3:29:11<3:13:45, 5.99s/it] 78%|███████▊ | 6810/8750 [3:29:09<3:13:45, 5.99s/it] {'loss': 0.4485, 'learning_rate': 2.4695636850645112e-06, 'epoch': 0.78} + 78%|███████▊ | 6810/8750 [3:29:09<3:13:45, 5.99s/it]{'loss': 0.4485, 'learning_rate': 2.4695636850645112e-06, 'epoch': 0.78} + 78%|███████▊ | 6810/8750 [3:29:11<3:13:45, 5.99s/it] 78%|███████▊ | 6811/8750 [3:29:17<3:10:19, 5.89s/it] 78%|███████▊ | 6811/8750 [3:29:14<3:10:19, 5.89s/it] {'loss': 0.4413, 'learning_rate': 2.467128622445759e-06, 'epoch': 0.78} + 78%|███████▊ | 6811/8750 [3:29:17<3:10:19, 5.89s/it] {'loss': 0.4413, 'learning_rate': 2.467128622445759e-06, 'epoch': 0.78} + 78%|███████▊ | 6811/8750 [3:29:14<3:10:19, 
5.89s/it] 78%|███████▊ | 6812/8750 [3:29:23<3:10:23, 5.89s/it] 78%|███████▊ | 6812/8750 [3:29:20<3:10:23, 5.89s/it] {'loss': 0.4554, 'learning_rate': 2.464694591998563e-06, 'epoch': 0.78} + 78%|███████▊ | 6812/8750 [3:29:23<3:10:23, 5.89s/it] {'loss': 0.4554, 'learning_rate': 2.464694591998563e-06, 'epoch': 0.78} + 78%|███████▊ | 6812/8750 [3:29:20<3:10:23, 5.89s/it] 78%|███████▊ | 6813/8750 [3:29:28<3:08:16, 5.83s/it] 78%|███████▊ | 6813/8750 [3:29:26<3:08:16, 5.83s/it] {'loss': 0.4551, 'learning_rate': 2.4622615940564454e-06, 'epoch': 0.78}{'loss': 0.4551, 'learning_rate': 2.4622615940564454e-06, 'epoch': 0.78} + + 78%|███████▊ | 6813/8750 [3:29:28<3:08:16, 5.83s/it] 78%|███████▊ | 6813/8750 [3:29:26<3:08:16, 5.83s/it] 78%|███████▊ | 6814/8750 [3:29:34<3:09:24, 5.87s/it] 78%|███████▊ | 6814/8750 [3:29:32<3:09:24, 5.87s/it] {'loss': 0.4501, 'learning_rate': 2.4598296289527745e-06, 'epoch': 0.78} + {'loss': 0.4501, 'learning_rate': 2.4598296289527745e-06, 'epoch': 0.78} 78%|███████▊ | 6814/8750 [3:29:34<3:09:24, 5.87s/it] + 78%|███████▊ | 6814/8750 [3:29:32<3:09:24, 5.87s/it] 78%|███████▊ | 6815/8750 [3:29:40<3:06:40, 5.79s/it] 78%|███████▊ | 6815/8750 [3:29:38<3:06:40, 5.79s/it] {'loss': 0.4521, 'learning_rate': 2.4573986970207906e-06, 'epoch': 0.78} + 78%|███████▊ | 6815/8750 [3:29:40<3:06:40, 5.79s/it] {'loss': 0.4521, 'learning_rate': 2.4573986970207906e-06, 'epoch': 0.78} + 78%|███████▊ | 6815/8750 [3:29:38<3:06:40, 5.79s/it] 78%|███████▊ | 6816/8750 [3:29:45<3:05:06, 5.74s/it] 78%|███████▊ | 6816/8750 [3:29:43<3:05:06, 5.74s/it]{'loss': 0.4545, 'learning_rate': 2.4549687985935832e-06, 'epoch': 0.78} + {'loss': 0.4545, 'learning_rate': 2.4549687985935832e-06, 'epoch': 0.78} + 78%|███████▊ | 6816/8750 [3:29:45<3:05:06, 5.74s/it] 78%|███████▊ | 6816/8750 [3:29:43<3:05:06, 5.74s/it] 78%|███████▊ | 6817/8750 [3:29:51<3:04:28, 5.73s/it] 78%|███████▊ | 6817/8750 [3:29:49<3:04:28, 5.73s/it] {'loss': 0.436, 'learning_rate': 2.452539934004099e-06, 'epoch': 0.78} + 
78%|███████▊ | 6817/8750 [3:29:51<3:04:28, 5.73s/it] {'loss': 0.436, 'learning_rate': 2.452539934004099e-06, 'epoch': 0.78} + 78%|███████▊ | 6817/8750 [3:29:49<3:04:28, 5.73s/it] 78%|███████▊ | 6818/8750 [3:29:57<3:03:49, 5.71s/it] 78%|███████▊ | 6818/8750 [3:29:55<3:03:49, 5.71s/it] {'loss': 0.4512, 'learning_rate': 2.4501121035851494e-06, 'epoch': 0.78} + 78%|███████▊ | 6818/8750 [3:29:57<3:03:49, 5.71s/it] {'loss': 0.4512, 'learning_rate': 2.4501121035851494e-06, 'epoch': 0.78} + 78%|███████▊ | 6818/8750 [3:29:55<3:03:49, 5.71s/it] 78%|███████▊ | 6819/8750 [3:30:03<3:05:13, 5.76s/it] 78%|███████▊ | 6819/8750 [3:30:00<3:05:13, 5.76s/it] {'loss': 0.4505, 'learning_rate': 2.447685307669405e-06, 'epoch': 0.78} + 78%|███████▊ | 6819/8750 [3:30:03<3:05:13, 5.76s/it] {'loss': 0.4505, 'learning_rate': 2.447685307669405e-06, 'epoch': 0.78} + 78%|███████▊ | 6819/8750 [3:30:00<3:05:13, 5.76s/it] 78%|███████▊ | 6820/8750 [3:30:08<3:05:03, 5.75s/it] 78%|███████▊ | 6820/8750 [3:30:06<3:05:03, 5.75s/it] {'loss': 0.4493, 'learning_rate': 2.4452595465893847e-06, 'epoch': 0.78} + 78%|███████▊ | 6820/8750 [3:30:08<3:05:03, 5.75s/it] {'loss': 0.4493, 'learning_rate': 2.4452595465893847e-06, 'epoch': 0.78} + 78%|███████▊ | 6820/8750 [3:30:06<3:05:03, 5.75s/it] 78%|███████▊ | 6821/8750 [3:30:14<3:06:08, 5.79s/it] 78%|███████▊ | 6821/8750 [3:30:12<3:06:10, 5.79s/it] {'loss': 0.4526, 'learning_rate': 2.4428348206774775e-06, 'epoch': 0.78} + 78%|███████▊ | 6821/8750 [3:30:14<3:06:08, 5.79s/it] {'loss': 0.4526, 'learning_rate': 2.4428348206774775e-06, 'epoch': 0.78} + 78%|███████▊ | 6821/8750 [3:30:12<3:06:10, 5.79s/it] 78%|███████▊ | 6822/8750 [3:30:20<3:07:00, 5.82s/it] 78%|███████▊ | 6822/8750 [3:30:18<3:06:59, 5.82s/it] {'loss': 0.4747, 'learning_rate': 2.4404111302659195e-06, 'epoch': 0.78} + 78%|███████▊ | 6822/8750 [3:30:20<3:07:00, 5.82s/it] {'loss': 0.4747, 'learning_rate': 2.4404111302659195e-06, 'epoch': 0.78} + 78%|███████▊ | 6822/8750 [3:30:18<3:06:59, 5.82s/it] 78%|███████▊ 
| 6823/8750 [3:30:26<3:06:36, 5.81s/it] 78%|███████▊ | 6823/8750 [3:30:24<3:06:35, 5.81s/it] {'loss': 0.4277, 'learning_rate': 2.4379884756868167e-06, 'epoch': 0.78} + 78%|███████▊ | 6823/8750 [3:30:26<3:06:36, 5.81s/it] {'loss': 0.4277, 'learning_rate': 2.4379884756868167e-06, 'epoch': 0.78} + 78%|███████▊ | 6823/8750 [3:30:24<3:06:35, 5.81s/it] 78%|███████▊ | 6824/8750 [3:30:32<3:04:57, 5.76s/it] 78%|███████▊ | 6824/8750 [3:30:29<3:04:56, 5.76s/it] {'loss': 0.4578, 'learning_rate': 2.4355668572721224e-06, 'epoch': 0.78} + 78%|███████▊ | 6824/8750 [3:30:32<3:04:57, 5.76s/it] {'loss': 0.4578, 'learning_rate': 2.4355668572721224e-06, 'epoch': 0.78} + 78%|███████▊ | 6824/8750 [3:30:29<3:04:56, 5.76s/it] 78%|███████▊ | 6825/8750 [3:30:37<3:05:31, 5.78s/it] 78%|███████▊ | 6825/8750 [3:30:35<3:05:30, 5.78s/it] {'loss': 0.4408, 'learning_rate': 2.433146275353652e-06, 'epoch': 0.78} + 78%|███████▊ | 6825/8750 [3:30:37<3:05:31, 5.78s/it] {'loss': 0.4408, 'learning_rate': 2.433146275353652e-06, 'epoch': 0.78} + 78%|███████▊ | 6825/8750 [3:30:35<3:05:30, 5.78s/it] 78%|███████▊ | 6826/8750 [3:30:43<3:03:01, 5.71s/it] 78%|███████▊ | 6826/8750 [3:30:41<3:03:01, 5.71s/it] {'loss': 0.4679, 'learning_rate': 2.4307267302630834e-06, 'epoch': 0.78} + 78%|███████▊ | 6826/8750 [3:30:43<3:03:01, 5.71s/it] {'loss': 0.4679, 'learning_rate': 2.4307267302630834e-06, 'epoch': 0.78} + 78%|███████▊ | 6826/8750 [3:30:41<3:03:01, 5.71s/it] 78%|███████▊ | 6827/8750 [3:30:49<3:02:40, 5.70s/it] 78%|███████▊ | 6827/8750 [3:30:46<3:02:40, 5.70s/it] {'loss': 0.4573, 'learning_rate': 2.428308222331942e-06, 'epoch': 0.78} + 78%|███████▊ | 6827/8750 [3:30:49<3:02:40, 5.70s/it] {'loss': 0.4573, 'learning_rate': 2.428308222331942e-06, 'epoch': 0.78} + 78%|███████▊ | 6827/8750 [3:30:46<3:02:40, 5.70s/it] 78%|███████▊ | 6828/8750 [3:30:55<3:06:40, 5.83s/it] 78%|███████▊ | 6828/8750 [3:30:53<3:06:40, 5.83s/it] {'loss': 0.4575, 'learning_rate': 2.4258907518916207e-06, 'epoch': 0.78} + 78%|███████▊ | 6828/8750 
[3:30:53<3:06:40, 5.83s/it]{'loss': 0.4575, 'learning_rate': 2.4258907518916207e-06, 'epoch': 0.78} + 78%|███████▊ | 6828/8750 [3:30:55<3:06:40, 5.83s/it] 78%|███████▊ | 6829/8750 [3:31:01<3:05:56, 5.81s/it] 78%|███████▊ | 6829/8750 [3:30:58<3:05:56, 5.81s/it] {'loss': 0.4426, 'learning_rate': 2.4234743192733713e-06, 'epoch': 0.78} + 78%|███████▊ | 6829/8750 [3:31:01<3:05:56, 5.81s/it] {'loss': 0.4426, 'learning_rate': 2.4234743192733713e-06, 'epoch': 0.78} + 78%|███████▊ | 6829/8750 [3:30:58<3:05:56, 5.81s/it] 78%|███████▊ | 6830/8750 [3:31:06<3:06:48, 5.84s/it] 78%|███████▊ | 6830/8750 [3:31:04<3:06:48, 5.84s/it] {'loss': 0.4606, 'learning_rate': 2.4210589248082914e-06, 'epoch': 0.78} + 78%|███████▊ | 6830/8750 [3:31:06<3:06:48, 5.84s/it] {'loss': 0.4606, 'learning_rate': 2.4210589248082914e-06, 'epoch': 0.78} + 78%|███████▊ | 6830/8750 [3:31:04<3:06:48, 5.84s/it] 78%|███████▊ | 6831/8750 [3:31:12<3:05:13, 5.79s/it] 78%|███████▊ | 6831/8750 [3:31:10<3:05:13, 5.79s/it] {'loss': 0.4585, 'learning_rate': 2.4186445688273508e-06, 'epoch': 0.78} + 78%|███████▊ | 6831/8750 [3:31:12<3:05:13, 5.79s/it] {'loss': 0.4585, 'learning_rate': 2.4186445688273508e-06, 'epoch': 0.78} + 78%|███████▊ | 6831/8750 [3:31:10<3:05:13, 5.79s/it] 78%|███████▊ | 6832/8750 [3:31:18<3:04:17, 5.77s/it] 78%|███████▊ | 6832/8750 [3:31:16<3:04:17, 5.77s/it] {'loss': 0.4363, 'learning_rate': 2.416231251661364e-06, 'epoch': 0.78} + 78%|███████▊ | 6832/8750 [3:31:18<3:04:17, 5.77s/it] {'loss': 0.4363, 'learning_rate': 2.416231251661364e-06, 'epoch': 0.78} + 78%|███████▊ | 6832/8750 [3:31:16<3:04:17, 5.77s/it] 78%|███████▊ | 6833/8750 [3:31:24<3:04:55, 5.79s/it] 78%|███████▊ | 6833/8750 [3:31:21<3:04:55, 5.79s/it] {'loss': 0.4513, 'learning_rate': 2.4138189736410144e-06, 'epoch': 0.78} + 78%|███████▊ | 6833/8750 [3:31:24<3:04:55, 5.79s/it] {'loss': 0.4513, 'learning_rate': 2.4138189736410144e-06, 'epoch': 0.78} + 78%|███████▊ | 6833/8750 [3:31:21<3:04:55, 5.79s/it] 78%|███████▊ | 6834/8750 
[3:31:29<3:03:53, 5.76s/it] 78%|███████▊ | 6834/8750 [3:31:27<3:03:53, 5.76s/it] {'loss': 0.4403, 'learning_rate': 2.411407735096836e-06, 'epoch': 0.78} + 78%|███████▊ | 6834/8750 [3:31:29<3:03:53, 5.76s/it] {'loss': 0.4403, 'learning_rate': 2.411407735096836e-06, 'epoch': 0.78} + 78%|███████▊ | 6834/8750 [3:31:27<3:03:53, 5.76s/it] 78%|███████▊ | 6835/8750 [3:31:35<3:03:06, 5.74s/it] 78%|███████▊ | 6835/8750 [3:31:33<3:03:06, 5.74s/it] {'loss': 0.435, 'learning_rate': 2.40899753635922e-06, 'epoch': 0.78} + 78%|███████▊ | 6835/8750 [3:31:35<3:03:06, 5.74s/it] {'loss': 0.435, 'learning_rate': 2.40899753635922e-06, 'epoch': 0.78} + 78%|███████▊ | 6835/8750 [3:31:33<3:03:06, 5.74s/it] 78%|███████▊ | 6836/8750 [3:31:41<3:04:06, 5.77s/it] 78%|███████▊ | 6836/8750 [3:31:39<3:04:06, 5.77s/it] {'loss': 0.4783, 'learning_rate': 2.406588377758421e-06, 'epoch': 0.78} + 78%|███████▊ | 6836/8750 [3:31:41<3:04:06, 5.77s/it] {'loss': 0.4783, 'learning_rate': 2.406588377758421e-06, 'epoch': 0.78} + 78%|███████▊ | 6836/8750 [3:31:39<3:04:06, 5.77s/it] 78%|███████▊ | 6837/8750 [3:31:47<3:03:34, 5.76s/it] 78%|███████▊ | 6837/8750 [3:31:44<3:03:34, 5.76s/it] {'loss': 0.4533, 'learning_rate': 2.4041802596245444e-06, 'epoch': 0.78} + 78%|███████▊ | 6837/8750 [3:31:47<3:03:34, 5.76s/it] {'loss': 0.4533, 'learning_rate': 2.4041802596245444e-06, 'epoch': 0.78} + 78%|███████▊ | 6837/8750 [3:31:44<3:03:34, 5.76s/it] 78%|███████▊ | 6838/8750 [3:31:53<3:05:42, 5.83s/it] 78%|███████▊ | 6838/8750 [3:31:50<3:05:42, 5.83s/it] {'loss': 0.4545, 'learning_rate': 2.4017731822875566e-06, 'epoch': 0.78} + 78%|███████▊ | 6838/8750 [3:31:53<3:05:42, 5.83s/it] {'loss': 0.4545, 'learning_rate': 2.4017731822875566e-06, 'epoch': 0.78} + 78%|███████▊ | 6838/8750 [3:31:50<3:05:42, 5.83s/it] 78%|███████▊ | 6839/8750 [3:31:58<3:03:15, 5.75s/it] 78%|███████▊ | 6839/8750 [3:31:56<3:03:15, 5.75s/it] {'loss': 0.4624, 'learning_rate': 2.399367146077286e-06, 'epoch': 0.78} + 78%|███████▊ | 6839/8750 [3:31:58<3:03:15, 
5.75s/it] {'loss': 0.4624, 'learning_rate': 2.399367146077286e-06, 'epoch': 0.78} + 78%|███████▊ | 6839/8750 [3:31:56<3:03:15, 5.75s/it] 78%|███████▊ | 6840/8750 [3:32:04<3:03:03, 5.75s/it] 78%|███████▊ | 6840/8750 [3:32:02<3:03:03, 5.75s/it] {'loss': 0.46, 'learning_rate': 2.3969621513234066e-06, 'epoch': 0.78} + 78%|███████▊ | 6840/8750 [3:32:04<3:03:03, 5.75s/it] {'loss': 0.46, 'learning_rate': 2.3969621513234066e-06, 'epoch': 0.78} + 78%|███████▊ | 6840/8750 [3:32:02<3:03:03, 5.75s/it] 78%|███████▊ | 6841/8750 [3:32:10<3:01:35, 5.71s/it] 78%|███████▊ | 6841/8750 [3:32:07<3:01:35, 5.71s/it] {'loss': 0.4667, 'learning_rate': 2.394558198355462e-06, 'epoch': 0.78} + 78%|███████▊ | 6841/8750 [3:32:10<3:01:35, 5.71s/it] {'loss': 0.4667, 'learning_rate': 2.394558198355462e-06, 'epoch': 0.78} + 78%|███████▊ | 6841/8750 [3:32:07<3:01:35, 5.71s/it] 78%|███████▊ | 6842/8750 [3:32:15<3:01:15, 5.70s/it] 78%|███████▊ | 6842/8750 [3:32:13<3:01:14, 5.70s/it] {'loss': 0.4611, 'learning_rate': 2.3921552875028443e-06, 'epoch': 0.78} + 78%|███████▊ | 6842/8750 [3:32:15<3:01:15, 5.70s/it] {'loss': 0.4611, 'learning_rate': 2.3921552875028443e-06, 'epoch': 0.78} + 78%|███████▊ | 6842/8750 [3:32:13<3:01:14, 5.70s/it] 78%|███████▊ | 6843/8750 [3:32:21<3:04:18, 5.80s/it] 78%|███████▊ | 6843/8750 [3:32:19<3:04:18, 5.80s/it] {'loss': 0.4417, 'learning_rate': 2.3897534190948034e-06, 'epoch': 0.78} + {'loss': 0.4417, 'learning_rate': 2.3897534190948034e-06, 'epoch': 0.78} 78%|███████▊ | 6843/8750 [3:32:21<3:04:18, 5.80s/it] + 78%|███████▊ | 6843/8750 [3:32:19<3:04:18, 5.80s/it] 78%|███████▊ | 6844/8750 [3:32:27<3:05:44, 5.85s/it] 78%|███████▊ | 6844/8750 [3:32:25<3:05:44, 5.85s/it] {'loss': 0.4708, 'learning_rate': 2.387352593460455e-06, 'epoch': 0.78} + 78%|███████▊ | 6844/8750 [3:32:27<3:05:44, 5.85s/it] {'loss': 0.4708, 'learning_rate': 2.387352593460455e-06, 'epoch': 0.78} + 78%|███████▊ | 6844/8750 [3:32:25<3:05:44, 5.85s/it] 78%|███████▊ | 6845/8750 [3:32:33<3:06:04, 5.86s/it] 
78%|███████▊ | 6845/8750 [3:32:31<3:06:04, 5.86s/it] {'loss': 0.456, 'learning_rate': 2.384952810928759e-06, 'epoch': 0.78} + 78%|███████▊ | 6845/8750 [3:32:33<3:06:04, 5.86s/it] {'loss': 0.456, 'learning_rate': 2.384952810928759e-06, 'epoch': 0.78} + 78%|███████▊ | 6845/8750 [3:32:31<3:06:04, 5.86s/it] 78%|███████▊ | 6846/8750 [3:32:39<3:05:57, 5.86s/it] 78%|███████▊ | 6846/8750 [3:32:37<3:05:57, 5.86s/it] {'loss': 0.4431, 'learning_rate': 2.3825540718285454e-06, 'epoch': 0.78} + 78%|███████▊ | 6846/8750 [3:32:39<3:05:57, 5.86s/it] {'loss': 0.4431, 'learning_rate': 2.3825540718285454e-06, 'epoch': 0.78} + 78%|███████▊ | 6846/8750 [3:32:37<3:05:57, 5.86s/it] 78%|███████▊ | 6847/8750 [3:32:45<3:06:39, 5.89s/it] 78%|███████▊ | 6847/8750 [3:32:43<3:06:39, 5.89s/it] {'loss': 0.4557, 'learning_rate': 2.3801563764884905e-06, 'epoch': 0.78} + 78%|███████▊ | 6847/8750 [3:32:45<3:06:39, 5.89s/it] {'loss': 0.4557, 'learning_rate': 2.3801563764884905e-06, 'epoch': 0.78} + 78%|███████▊ | 6847/8750 [3:32:43<3:06:39, 5.89s/it] 78%|███████▊ | 6848/8750 [3:32:51<3:03:51, 5.80s/it] 78%|███████▊ | 6848/8750 [3:32:48<3:03:51, 5.80s/it] {'loss': 0.4571, 'learning_rate': 2.377759725237133e-06, 'epoch': 0.78} + 78%|███████▊ | 6848/8750 [3:32:51<3:03:51, 5.80s/it] {'loss': 0.4571, 'learning_rate': 2.377759725237133e-06, 'epoch': 0.78} + 78%|███████▊ | 6848/8750 [3:32:48<3:03:51, 5.80s/it] 78%|███████▊ | 6849/8750 [3:32:56<3:02:00, 5.74s/it] {'loss': 0.4388, 'learning_rate': 2.375364118402872e-06, 'epoch': 0.78} + 78%|███████▊ | 6849/8750 [3:32:56<3:02:00, 5.74s/it] 78%|███████▊ | 6849/8750 [3:32:54<3:02:00, 5.74s/it] {'loss': 0.4388, 'learning_rate': 2.375364118402872e-06, 'epoch': 0.78} + 78%|███████▊ | 6849/8750 [3:32:54<3:02:00, 5.74s/it]9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... 
+ 78%|███████▊ | 6850/8750 [3:33:02<3:02:18, 5.76s/it]13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend...0 + AutoResumeHook: Checking whether to suspend... + 78%|███████▊ | 6850/8750 [3:33:00<3:02:18, 5.76s/it]6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4392, 'learning_rate': 2.3729695563139554e-06, 'epoch': 0.78} + 78%|███████▊ | 6850/8750 [3:33:02<3:02:18, 5.76s/it] {'loss': 0.4392, 'learning_rate': 2.3729695563139554e-06, 'epoch': 0.78} + 78%|███████▊ | 6850/8750 [3:33:00<3:02:18, 5.76s/it] 78%|███████▊ | 6851/8750 [3:33:08<3:02:19, 5.76s/it] 78%|███████▊ | 6851/8750 [3:33:05<3:02:19, 5.76s/it] {'loss': 0.4436, 'learning_rate': 2.3705760392984887e-06, 'epoch': 0.78} + 78%|███████▊ | 6851/8750 [3:33:08<3:02:19, 5.76s/it] {'loss': 0.4436, 'learning_rate': 2.3705760392984887e-06, 'epoch': 0.78} + 78%|███████▊ | 6851/8750 [3:33:05<3:02:19, 5.76s/it] 78%|███████▊ | 6852/8750 [3:33:13<3:01:10, 5.73s/it] 78%|███████▊ | 6852/8750 [3:33:11<3:01:10, 5.73s/it] {'loss': 0.4396, 'learning_rate': 2.3681835676844444e-06, 'epoch': 0.78} + 78%|███████▊ | 6852/8750 [3:33:13<3:01:10, 5.73s/it] {'loss': 0.4396, 'learning_rate': 2.3681835676844444e-06, 'epoch': 0.78} + 78%|███████▊ | 6852/8750 [3:33:11<3:01:10, 5.73s/it] 78%|███████▊ | 6853/8750 [3:33:19<3:02:26, 5.77s/it] 78%|███████▊ | 6853/8750 [3:33:17<3:02:27, 5.77s/it] {'loss': 0.4496, 'learning_rate': 2.3657921417996364e-06, 'epoch': 0.78} + 78%|███████▊ | 6853/8750 [3:33:19<3:02:26, 5.77s/it] {'loss': 0.4496, 
'learning_rate': 2.3657921417996364e-06, 'epoch': 0.78} + 78%|███████▊ | 6853/8750 [3:33:17<3:02:27, 5.77s/it] 78%|███████▊ | 6854/8750 [3:33:25<3:04:24, 5.84s/it] 78%|███████▊ | 6854/8750 [3:33:23<3:04:23, 5.84s/it] {'loss': 0.4712, 'learning_rate': 2.363401761971752e-06, 'epoch': 0.78} + 78%|███████▊ | 6854/8750 [3:33:23<3:04:23, 5.84s/it] {'loss': 0.4712, 'learning_rate': 2.363401761971752e-06, 'epoch': 0.78} + 78%|███████▊ | 6854/8750 [3:33:25<3:04:24, 5.84s/it] 78%|███████▊ | 6855/8750 [3:33:31<3:02:33, 5.78s/it] 78%|███████▊ | 6855/8750 [3:33:29<3:02:33, 5.78s/it] {'loss': 0.4381, 'learning_rate': 2.361012428528321e-06, 'epoch': 0.78} + 78%|███████▊ | 6855/8750 [3:33:31<3:02:33, 5.78s/it] {'loss': 0.4381, 'learning_rate': 2.361012428528321e-06, 'epoch': 0.78} + 78%|███████▊ | 6855/8750 [3:33:29<3:02:33, 5.78s/it] 78%|███████▊ | 6856/8750 [3:33:36<3:00:53, 5.73s/it] 78%|███████▊ | 6856/8750 [3:33:34<3:00:52, 5.73s/it] {'loss': 0.4399, 'learning_rate': 2.3586241417967336e-06, 'epoch': 0.78} + 78%|███████▊ | 6856/8750 [3:33:36<3:00:53, 5.73s/it] {'loss': 0.4399, 'learning_rate': 2.3586241417967336e-06, 'epoch': 0.78} + 78%|███████▊ | 6856/8750 [3:33:34<3:00:52, 5.73s/it] 78%|███████▊ | 6857/8750 [3:33:42<2:59:42, 5.70s/it] 78%|███████▊ | 6857/8750 [3:33:40<2:59:42, 5.70s/it] {'loss': 0.4752, 'learning_rate': 2.356236902104242e-06, 'epoch': 0.78} + 78%|███████▊ | 6857/8750 [3:33:42<2:59:42, 5.70s/it] {'loss': 0.4752, 'learning_rate': 2.356236902104242e-06, 'epoch': 0.78} + 78%|███████▊ | 6857/8750 [3:33:40<2:59:42, 5.70s/it] 78%|███████▊ | 6858/8750 [3:33:48<2:58:23, 5.66s/it] 78%|███████▊ | 6858/8750 [3:33:45<2:58:24, 5.66s/it] {'loss': 0.4426, 'learning_rate': 2.3538507097779505e-06, 'epoch': 0.78} + 78%|███████▊ | 6858/8750 [3:33:45<2:58:24, 5.66s/it]{'loss': 0.4426, 'learning_rate': 2.3538507097779505e-06, 'epoch': 0.78} + 78%|███████▊ | 6858/8750 [3:33:48<2:58:23, 5.66s/it] 78%|███████▊ | 6859/8750 [3:33:53<2:59:52, 5.71s/it] 78%|███████▊ | 6859/8750 
[3:33:51<2:59:52, 5.71s/it] {'loss': 0.4458, 'learning_rate': 2.351465565144825e-06, 'epoch': 0.78} + 78%|███████▊ | 6859/8750 [3:33:53<2:59:52, 5.71s/it] {'loss': 0.4458, 'learning_rate': 2.351465565144825e-06, 'epoch': 0.78} + 78%|███████▊ | 6859/8750 [3:33:51<2:59:52, 5.71s/it] 78%|███████▊ | 6860/8750 [3:33:59<2:59:41, 5.70s/it] 78%|███████▊ | 6860/8750 [3:33:57<2:59:41, 5.70s/it] {'loss': 0.4235, 'learning_rate': 2.3490814685316777e-06, 'epoch': 0.78} + 78%|███████▊ | 6860/8750 [3:33:59<2:59:41, 5.70s/it] {'loss': 0.4235, 'learning_rate': 2.3490814685316777e-06, 'epoch': 0.78} + 78%|███████▊ | 6860/8750 [3:33:57<2:59:41, 5.70s/it] 78%|███████▊ | 6861/8750 [3:34:05<2:59:56, 5.72s/it] 78%|███████▊ | 6861/8750 [3:34:03<2:59:56, 5.72s/it] {'loss': 0.4463, 'learning_rate': 2.3466984202651833e-06, 'epoch': 0.78} + 78%|███████▊ | 6861/8750 [3:34:05<2:59:56, 5.72s/it] {'loss': 0.4463, 'learning_rate': 2.3466984202651833e-06, 'epoch': 0.78} + 78%|███████▊ | 6861/8750 [3:34:03<2:59:56, 5.72s/it] 78%|███████▊ | 6862/8750 [3:34:10<2:57:45, 5.65s/it] 78%|███████▊ | 6862/8750 [3:34:08<2:57:45, 5.65s/it] {'loss': 0.4494, 'learning_rate': 2.344316420671876e-06, 'epoch': 0.78} + 78%|███████▊ | 6862/8750 [3:34:10<2:57:45, 5.65s/it] {'loss': 0.4494, 'learning_rate': 2.344316420671876e-06, 'epoch': 0.78} + 78%|███████▊ | 6862/8750 [3:34:08<2:57:45, 5.65s/it] 78%|███████▊ | 6863/8750 [3:34:16<2:57:27, 5.64s/it] 78%|███████▊ | 6863/8750 [3:34:14<2:57:27, 5.64s/it] {'loss': 0.4436, 'learning_rate': 2.3419354700781393e-06, 'epoch': 0.78} + 78%|███████▊ | 6863/8750 [3:34:16<2:57:27, 5.64s/it] {'loss': 0.4436, 'learning_rate': 2.3419354700781393e-06, 'epoch': 0.78} + 78%|███████▊ | 6863/8750 [3:34:14<2:57:27, 5.64s/it] 78%|███████▊ | 6864/8750 [3:34:22<2:58:05, 5.67s/it] 78%|███████▊ | 6864/8750 [3:34:19<2:58:04, 5.67s/it] {'loss': 0.4534, 'learning_rate': 2.339555568810221e-06, 'epoch': 0.78} + 78%|███████▊ | 6864/8750 [3:34:22<2:58:05, 5.67s/it] {'loss': 0.4534, 'learning_rate': 
2.339555568810221e-06, 'epoch': 0.78} + 78%|███████▊ | 6864/8750 [3:34:19<2:58:04, 5.67s/it] 78%|███████▊ | 6865/8750 [3:34:28<3:00:03, 5.73s/it] 78%|███████▊ | 6865/8750 [3:34:25<3:00:03, 5.73s/it] {'loss': 0.4484, 'learning_rate': 2.3371767171942183e-06, 'epoch': 0.78} + 78%|███████▊ | 6865/8750 [3:34:28<3:00:03, 5.73s/it] {'loss': 0.4484, 'learning_rate': 2.3371767171942183e-06, 'epoch': 0.78} + 78%|███████▊ | 6865/8750 [3:34:25<3:00:03, 5.73s/it] 78%|███████▊ | 6866/8750 [3:34:33<2:59:22, 5.71s/it] 78%|███████▊ | 6866/8750 [3:34:31<2:59:22, 5.71s/it] {'loss': 0.4561, 'learning_rate': 2.3347989155560835e-06, 'epoch': 0.78} + 78%|███████▊ | 6866/8750 [3:34:33<2:59:22, 5.71s/it] {'loss': 0.4561, 'learning_rate': 2.3347989155560835e-06, 'epoch': 0.78} + 78%|███████▊ | 6866/8750 [3:34:31<2:59:22, 5.71s/it] 78%|███████▊ | 6867/8750 [3:34:39<3:01:37, 5.79s/it] 78%|███████▊ | 6867/8750 [3:34:37<3:01:37, 5.79s/it] {'loss': 0.457, 'learning_rate': 2.3324221642216328e-06, 'epoch': 0.78} + 78%|███████▊ | 6867/8750 [3:34:39<3:01:37, 5.79s/it] {'loss': 0.457, 'learning_rate': 2.3324221642216328e-06, 'epoch': 0.78} + 78%|███████▊ | 6867/8750 [3:34:37<3:01:37, 5.79s/it] 78%|███████▊ | 6868/8750 [3:34:45<2:59:28, 5.72s/it] {'loss': 0.4517, 'learning_rate': 2.3300464635165353e-06, 'epoch': 0.78} + 78%|███████▊ | 6868/8750 [3:34:45<2:59:28, 5.72s/it] 78%|███████▊ | 6868/8750 [3:34:43<2:59:29, 5.72s/it] {'loss': 0.4517, 'learning_rate': 2.3300464635165353e-06, 'epoch': 0.78} + 78%|███████▊ | 6868/8750 [3:34:43<2:59:29, 5.72s/it] 79%|███████▊ | 6869/8750 [3:34:51<3:01:50, 5.80s/it] 79%|███████▊ | 6869/8750 [3:34:49<3:01:50, 5.80s/it] {'loss': 0.4567, 'learning_rate': 2.32767181376631e-06, 'epoch': 0.79} + 79%|███████▊ | 6869/8750 [3:34:51<3:01:50, 5.80s/it] {'loss': 0.4567, 'learning_rate': 2.32767181376631e-06, 'epoch': 0.79} + 79%|███████▊ | 6869/8750 [3:34:49<3:01:50, 5.80s/it] 79%|███████▊ | 6870/8750 [3:34:57<3:02:08, 5.81s/it] 79%|███████▊ | 6870/8750 [3:34:54<3:02:08, 
5.81s/it] {'loss': 0.4546, 'learning_rate': 2.3252982152963434e-06, 'epoch': 0.79} + 79%|███████▊ | 6870/8750 [3:34:57<3:02:08, 5.81s/it] {'loss': 0.4546, 'learning_rate': 2.3252982152963434e-06, 'epoch': 0.79} + 79%|███████▊ | 6870/8750 [3:34:54<3:02:08, 5.81s/it] 79%|███████▊ | 6871/8750 [3:35:03<3:03:08, 5.85s/it] 79%|███████▊ | 6871/8750 [3:35:00<3:03:08, 5.85s/it] {'loss': 0.4425, 'learning_rate': 2.3229256684318646e-06, 'epoch': 0.79} + 79%|███████▊ | 6871/8750 [3:35:03<3:03:08, 5.85s/it] {'loss': 0.4425, 'learning_rate': 2.3229256684318646e-06, 'epoch': 0.79} + 79%|███████▊ | 6871/8750 [3:35:00<3:03:08, 5.85s/it] 79%|███████▊ | 6872/8750 [3:35:08<3:01:04, 5.78s/it] 79%|███████▊ | 6872/8750 [3:35:06<3:01:04, 5.78s/it] {'loss': 0.4547, 'learning_rate': 2.320554173497972e-06, 'epoch': 0.79} + 79%|███████▊ | 6872/8750 [3:35:08<3:01:04, 5.78s/it] {'loss': 0.4547, 'learning_rate': 2.320554173497972e-06, 'epoch': 0.79} + 79%|███████▊ | 6872/8750 [3:35:06<3:01:04, 5.78s/it] 79%|███████▊ | 6873/8750 [3:35:14<3:00:02, 5.75s/it] 79%|███████▊ | 6873/8750 [3:35:12<3:00:01, 5.75s/it] {'loss': 0.4633, 'learning_rate': 2.31818373081961e-06, 'epoch': 0.79} + 79%|███████▊ | 6873/8750 [3:35:14<3:00:02, 5.75s/it] {'loss': 0.4633, 'learning_rate': 2.31818373081961e-06, 'epoch': 0.79} + 79%|███████▊ | 6873/8750 [3:35:12<3:00:01, 5.75s/it] 79%|███████▊ | 6874/8750 [3:35:19<2:58:28, 5.71s/it] 79%|███████▊ | 6874/8750 [3:35:17<2:58:28, 5.71s/it] {'loss': 0.4514, 'learning_rate': 2.3158143407215796e-06, 'epoch': 0.79} + 79%|███████▊ | 6874/8750 [3:35:19<2:58:28, 5.71s/it] {'loss': 0.4514, 'learning_rate': 2.3158143407215796e-06, 'epoch': 0.79} + 79%|███████▊ | 6874/8750 [3:35:17<2:58:28, 5.71s/it] 79%|███████▊ | 6875/8750 [3:35:25<2:57:45, 5.69s/it] 79%|███████▊ | 6875/8750 [3:35:23<2:57:45, 5.69s/it] {'loss': 0.4556, 'learning_rate': 2.3134460035285433e-06, 'epoch': 0.79} + 79%|███████▊ | 6875/8750 [3:35:25<2:57:45, 5.69s/it] {'loss': 0.4556, 'learning_rate': 2.3134460035285433e-06, 
'epoch': 0.79} + 79%|███████▊ | 6875/8750 [3:35:23<2:57:45, 5.69s/it] 79%|███████▊ | 6876/8750 [3:35:31<2:58:37, 5.72s/it] 79%|███████▊ | 6876/8750 [3:35:29<2:58:37, 5.72s/it] {'loss': 0.4472, 'learning_rate': 2.3110787195650173e-06, 'epoch': 0.79} + 79%|███████▊ | 6876/8750 [3:35:31<2:58:37, 5.72s/it] {'loss': 0.4472, 'learning_rate': 2.3110787195650173e-06, 'epoch': 0.79} + 79%|███████▊ | 6876/8750 [3:35:29<2:58:37, 5.72s/it] 79%|███████▊ | 6877/8750 [3:35:37<3:00:27, 5.78s/it] 79%|███████▊ | 6877/8750 [3:35:35<3:00:27, 5.78s/it] {'loss': 0.4514, 'learning_rate': 2.3087124891553703e-06, 'epoch': 0.79} + 79%|███████▊ | 6877/8750 [3:35:37<3:00:27, 5.78s/it] {'loss': 0.4514, 'learning_rate': 2.3087124891553703e-06, 'epoch': 0.79} + 79%|███████▊ | 6877/8750 [3:35:35<3:00:27, 5.78s/it] 79%|███████▊ | 6878/8750 [3:35:43<2:59:30, 5.75s/it] 79%|███████▊ | 6878/8750 [3:35:40<2:59:30, 5.75s/it] {'loss': 0.4429, 'learning_rate': 2.30634731262383e-06, 'epoch': 0.79} + 79%|███████▊ | 6878/8750 [3:35:43<2:59:30, 5.75s/it] {'loss': 0.4429, 'learning_rate': 2.30634731262383e-06, 'epoch': 0.79} + 79%|███████▊ | 6878/8750 [3:35:40<2:59:30, 5.75s/it] 79%|███████▊ | 6879/8750 [3:35:48<3:00:31, 5.79s/it] 79%|███████▊ | 6879/8750 [3:35:46<3:00:31, 5.79s/it] {'loss': 0.4412, 'learning_rate': 2.3039831902944766e-06, 'epoch': 0.79} + 79%|███████▊ | 6879/8750 [3:35:48<3:00:31, 5.79s/it] {'loss': 0.4412, 'learning_rate': 2.3039831902944766e-06, 'epoch': 0.79} + 79%|███████▊ | 6879/8750 [3:35:46<3:00:31, 5.79s/it] 79%|███████▊ | 6880/8750 [3:35:54<2:59:44, 5.77s/it] {'loss': 0.4455, 'learning_rate': 2.3016201224912504e-06, 'epoch': 0.79} + 79%|███████▊ | 6880/8750 [3:35:54<2:59:44, 5.77s/it] 79%|███████▊ | 6880/8750 [3:35:52<2:59:44, 5.77s/it] {'loss': 0.4455, 'learning_rate': 2.3016201224912504e-06, 'epoch': 0.79} + 79%|███████▊ | 6880/8750 [3:35:52<2:59:44, 5.77s/it] 79%|███████▊ | 6881/8750 [3:36:00<2:56:50, 5.68s/it] 79%|███████▊ | 6881/8750 [3:35:57<2:56:50, 5.68s/it] {'loss': 0.4689, 
'learning_rate': 2.299258109537943e-06, 'epoch': 0.79} + 79%|███████▊ | 6881/8750 [3:36:00<2:56:50, 5.68s/it] {'loss': 0.4689, 'learning_rate': 2.299258109537943e-06, 'epoch': 0.79} + 79%|███████▊ | 6881/8750 [3:35:57<2:56:50, 5.68s/it] 79%|███████▊ | 6882/8750 [3:36:06<2:59:21, 5.76s/it] 79%|███████▊ | 6882/8750 [3:36:03<2:59:21, 5.76s/it] {'loss': 0.4426, 'learning_rate': 2.2968971517581994e-06, 'epoch': 0.79} + 79%|███████▊ | 6882/8750 [3:36:06<2:59:21, 5.76s/it] {'loss': 0.4426, 'learning_rate': 2.2968971517581994e-06, 'epoch': 0.79} + 79%|███████▊ | 6882/8750 [3:36:03<2:59:21, 5.76s/it] 79%|███████▊ | 6883/8750 [3:36:11<2:58:56, 5.75s/it] 79%|███████▊ | 6883/8750 [3:36:09<2:58:56, 5.75s/it] {'loss': 0.4538, 'learning_rate': 2.2945372494755304e-06, 'epoch': 0.79} + 79%|███████▊ | 6883/8750 [3:36:11<2:58:56, 5.75s/it] {'loss': 0.4538, 'learning_rate': 2.2945372494755304e-06, 'epoch': 0.79} + 79%|███████▊ | 6883/8750 [3:36:09<2:58:56, 5.75s/it] 79%|███████▊ | 6884/8750 [3:36:17<2:58:32, 5.74s/it] 79%|███████▊ | 6884/8750 [3:36:15<2:58:32, 5.74s/it] {'loss': 0.4454, 'learning_rate': 2.2921784030132886e-06, 'epoch': 0.79} + 79%|███████▊ | 6884/8750 [3:36:17<2:58:32, 5.74s/it] {'loss': 0.4454, 'learning_rate': 2.2921784030132886e-06, 'epoch': 0.79} + 79%|███████▊ | 6884/8750 [3:36:15<2:58:32, 5.74s/it] 79%|███████▊ | 6885/8750 [3:36:23<2:57:20, 5.71s/it] 79%|███████▊ | 6885/8750 [3:36:20<2:57:20, 5.71s/it] {'loss': 0.4694, 'learning_rate': 2.289820612694692e-06, 'epoch': 0.79} + 79%|███████▊ | 6885/8750 [3:36:23<2:57:20, 5.71s/it] {'loss': 0.4694, 'learning_rate': 2.289820612694692e-06, 'epoch': 0.79} + 79%|███████▊ | 6885/8750 [3:36:20<2:57:20, 5.71s/it] 79%|███████▊ | 6886/8750 [3:36:29<2:58:53, 5.76s/it] 79%|███████▊ | 6886/8750 [3:36:26<2:58:53, 5.76s/it] {'loss': 0.4444, 'learning_rate': 2.2874638788428128e-06, 'epoch': 0.79} + 79%|███████▊ | 6886/8750 [3:36:29<2:58:53, 5.76s/it] {'loss': 0.4444, 'learning_rate': 2.2874638788428128e-06, 'epoch': 0.79} + 
79%|███████▊ | 6886/8750 [3:36:26<2:58:53, 5.76s/it] 79%|███████▊ | 6887/8750 [3:36:34<2:58:51, 5.76s/it] 79%|███████▊ | 6887/8750 [3:36:32<2:58:51, 5.76s/it] {'loss': 0.4471, 'learning_rate': 2.2851082017805704e-06, 'epoch': 0.79} + 79%|███████▊ | 6887/8750 [3:36:32<2:58:51, 5.76s/it] {'loss': 0.4471, 'learning_rate': 2.2851082017805704e-06, 'epoch': 0.79} + 79%|███████▊ | 6887/8750 [3:36:34<2:58:51, 5.76s/it] 79%|███████▊ | 6888/8750 [3:36:40<2:58:24, 5.75s/it] 79%|███████▊ | 6888/8750 [3:36:38<2:58:24, 5.75s/it] {'loss': 0.4536, 'learning_rate': 2.2827535818307513e-06, 'epoch': 0.79} + 79%|███████▊ | 6888/8750 [3:36:40<2:58:24, 5.75s/it] {'loss': 0.4536, 'learning_rate': 2.2827535818307513e-06, 'epoch': 0.79} + 79%|███████▊ | 6888/8750 [3:36:38<2:58:24, 5.75s/it] 79%|███████▊ | 6889/8750 [3:36:46<2:56:46, 5.70s/it] 79%|███████▊ | 6889/8750 [3:36:43<2:56:45, 5.70s/it] {'loss': 0.453, 'learning_rate': 2.2804000193159848e-06, 'epoch': 0.79} + 79%|███████▊ | 6889/8750 [3:36:46<2:56:46, 5.70s/it] {'loss': 0.453, 'learning_rate': 2.2804000193159848e-06, 'epoch': 0.79} + 79%|███████▊ | 6889/8750 [3:36:43<2:56:45, 5.70s/it] 79%|███████▊ | 6890/8750 [3:36:51<2:57:12, 5.72s/it] 79%|███████▊ | 6890/8750 [3:36:49<2:57:12, 5.72s/it] {'loss': 0.47, 'learning_rate': 2.278047514558769e-06, 'epoch': 0.79} + 79%|███████▊ | 6890/8750 [3:36:51<2:57:12, 5.72s/it] {'loss': 0.47, 'learning_rate': 2.278047514558769e-06, 'epoch': 0.79} + 79%|███████▊ | 6890/8750 [3:36:49<2:57:12, 5.72s/it] 79%|███████▉ | 6891/8750 [3:36:57<2:55:39, 5.67s/it] 79%|███████▉ | 6891/8750 [3:36:55<2:55:39, 5.67s/it] {'loss': 0.4648, 'learning_rate': 2.2756960678814444e-06, 'epoch': 0.79} + 79%|███████▉ | 6891/8750 [3:36:57<2:55:39, 5.67s/it] {'loss': 0.4648, 'learning_rate': 2.2756960678814444e-06, 'epoch': 0.79} + 79%|███████▉ | 6891/8750 [3:36:55<2:55:39, 5.67s/it] 79%|███████▉ | 6892/8750 [3:37:03<2:55:54, 5.68s/it] 79%|███████▉ | 6892/8750 [3:37:00<2:55:55, 5.68s/it] {'loss': 0.4424, 'learning_rate': 
2.2733456796062093e-06, 'epoch': 0.79} + {'loss': 0.4424, 'learning_rate': 2.2733456796062093e-06, 'epoch': 0.79} + 79%|███████▉ | 6892/8750 [3:37:03<2:55:54, 5.68s/it] 79%|███████▉ | 6892/8750 [3:37:00<2:55:55, 5.68s/it] 79%|███████▉ | 6893/8750 [3:37:09<2:58:11, 5.76s/it] 79%|███████▉ | 6893/8750 [3:37:06<2:58:11, 5.76s/it] {'loss': 0.4472, 'learning_rate': 2.270996350055126e-06, 'epoch': 0.79} + 79%|███████▉ | 6893/8750 [3:37:09<2:58:11, 5.76s/it] {'loss': 0.4472, 'learning_rate': 2.270996350055126e-06, 'epoch': 0.79} + 79%|███████▉ | 6893/8750 [3:37:06<2:58:11, 5.76s/it] 79%|███████▉ | 6894/8750 [3:37:14<2:57:17, 5.73s/it] 79%|███████▉ | 6894/8750 [3:37:12<2:57:17, 5.73s/it] {'loss': 0.4621, 'learning_rate': 2.2686480795500986e-06, 'epoch': 0.79} + 79%|███████▉ | 6894/8750 [3:37:14<2:57:17, 5.73s/it] {'loss': 0.4621, 'learning_rate': 2.2686480795500986e-06, 'epoch': 0.79} + 79%|███████▉ | 6894/8750 [3:37:12<2:57:17, 5.73s/it] 79%|███████▉ | 6895/8750 [3:37:20<2:59:14, 5.80s/it] 79%|███████▉ | 6895/8750 [3:37:18<2:59:14, 5.80s/it] {'loss': 0.4299, 'learning_rate': 2.2663008684128964e-06, 'epoch': 0.79} + 79%|███████▉ | 6895/8750 [3:37:20<2:59:14, 5.80s/it] {'loss': 0.4299, 'learning_rate': 2.2663008684128964e-06, 'epoch': 0.79} + 79%|███████▉ | 6895/8750 [3:37:18<2:59:14, 5.80s/it] 79%|███████▉ | 6896/8750 [3:37:26<2:58:24, 5.77s/it] 79%|███████▉ | 6896/8750 [3:37:24<2:58:24, 5.77s/it] {'loss': 0.453, 'learning_rate': 2.2639547169651423e-06, 'epoch': 0.79} + 79%|███████▉ | 6896/8750 [3:37:26<2:58:24, 5.77s/it] {'loss': 0.453, 'learning_rate': 2.2639547169651423e-06, 'epoch': 0.79} + 79%|███████▉ | 6896/8750 [3:37:24<2:58:24, 5.77s/it] 79%|███████▉ | 6897/8750 [3:37:31<2:56:45, 5.72s/it] 79%|███████▉ | 6897/8750 [3:37:29<2:56:45, 5.72s/it] {'loss': 0.4602, 'learning_rate': 2.2616096255283048e-06, 'epoch': 0.79} + 79%|███████▉ | 6897/8750 [3:37:31<2:56:45, 5.72s/it] {'loss': 0.4602, 'learning_rate': 2.2616096255283048e-06, 'epoch': 0.79} + 79%|███████▉ | 6897/8750 
[3:37:29<2:56:45, 5.72s/it] 79%|███████▉ | 6898/8750 [3:37:37<2:56:17, 5.71s/it] 79%|███████▉ | 6898/8750 [3:37:35<2:56:18, 5.71s/it] {'loss': 0.447, 'learning_rate': 2.25926559442372e-06, 'epoch': 0.79} + 79%|███████▉ | 6898/8750 [3:37:37<2:56:17, 5.71s/it] {'loss': 0.447, 'learning_rate': 2.25926559442372e-06, 'epoch': 0.79} + 79%|███████▉ | 6898/8750 [3:37:35<2:56:18, 5.71s/it] 79%|███████▉ | 6899/8750 [3:37:43<2:54:55, 5.67s/it] 79%|███████▉ | 6899/8750 [3:37:40<2:54:55, 5.67s/it] {'loss': 0.4674, 'learning_rate': 2.2569226239725695e-06, 'epoch': 0.79} + 79%|███████▉ | 6899/8750 [3:37:43<2:54:55, 5.67s/it] {'loss': 0.4674, 'learning_rate': 2.2569226239725695e-06, 'epoch': 0.79} + 79%|███████▉ | 6899/8750 [3:37:40<2:54:55, 5.67s/it]14 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 79%|███████▉ | 6900/8750 [3:37:48<2:54:47, 5.67s/it]2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +12 0AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + 79%|███████▉ | 6900/8750 [3:37:46<2:54:47, 5.67s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4247, 'learning_rate': 2.2545807144958896e-06, 'epoch': 0.79} + 79%|███████▉ | 6900/8750 [3:37:48<2:54:47, 5.67s/it] {'loss': 0.4247, 'learning_rate': 2.2545807144958896e-06, 'epoch': 0.79} + 79%|███████▉ | 6900/8750 [3:37:46<2:54:47, 5.67s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-6900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 79%|███████▉ | 6901/8750 [3:38:09<5:13:02, 10.16s/it] 79%|███████▉ | 6901/8750 [3:38:07<5:13:02, 10.16s/it] {'loss': 0.4581, 'learning_rate': 2.252239866314582e-06, 'epoch': 0.79} + 79%|███████▉ | 6901/8750 [3:38:09<5:13:02, 10.16s/it] {'loss': 0.4581, 'learning_rate': 2.252239866314582e-06, 'epoch': 0.79} + 79%|███████▉ | 6901/8750 [3:38:07<5:13:02, 10.16s/it] 79%|███████▉ | 6902/8750 [3:38:15<4:31:53, 8.83s/it] 79%|███████▉ | 6902/8750 [3:38:12<4:31:53, 8.83s/it] {'loss': 0.4323, 'learning_rate': 2.249900079749385e-06, 'epoch': 0.79} + 79%|███████▉ | 6902/8750 [3:38:15<4:31:53, 8.83s/it] {'loss': 0.4323, 'learning_rate': 2.249900079749385e-06, 'epoch': 0.79} + 79%|███████▉ | 6902/8750 [3:38:13<4:31:53, 8.83s/it] 79%|███████▉ | 6903/8750 [3:38:20<4:02:23, 7.87s/it] 79%|███████▉ | 6903/8750 [3:38:18<4:02:23, 7.87s/it] {'loss': 0.442, 'learning_rate': 2.247561355120912e-06, 'epoch': 0.79} + 79%|███████▉ | 6903/8750 [3:38:20<4:02:23, 7.87s/it] {'loss': 0.442, 'learning_rate': 2.247561355120912e-06, 'epoch': 0.79} + 79%|███████▉ | 6903/8750 [3:38:18<4:02:23, 7.87s/it] 79%|███████▉ | 6904/8750 [3:38:26<3:42:16, 7.22s/it] 79%|███████▉ | 6904/8750 [3:38:24<3:42:16, 7.22s/it] {'loss': 0.4487, 'learning_rate': 2.245223692749612e-06, 'epoch': 0.79} + 79%|███████▉ | 6904/8750 [3:38:26<3:42:16, 7.22s/it] {'loss': 0.4487, 'learning_rate': 2.245223692749612e-06, 'epoch': 0.79} + 79%|███████▉ | 6904/8750 [3:38:24<3:42:16, 7.22s/it] 79%|███████▉ | 6905/8750 [3:38:32<3:27:01, 6.73s/it] 79%|███████▉ | 6905/8750 [3:38:29<3:27:02, 6.73s/it] {'loss': 0.4701, 'learning_rate': 2.2428870929558012e-06, 'epoch': 0.79} + 79%|███████▉ | 6905/8750 [3:38:32<3:27:01, 6.73s/it] {'loss': 0.4701, 'learning_rate': 2.2428870929558012e-06, 'epoch': 0.79} + 79%|███████▉ | 6905/8750 [3:38:29<3:27:02, 6.73s/it] 79%|███████▉ | 6906/8750 [3:38:35<3:18:27, 6.46s/it] 79%|███████▉ | 6906/8750 [3:38:38<3:18:27, 6.46s/it] {'loss': 0.4617, 'learning_rate': 2.240551556059647e-06, 'epoch': 
0.79} + 79%|███████▉ | 6906/8750 [3:38:35<3:18:27, 6.46s/it] {'loss': 0.4617, 'learning_rate': 2.240551556059647e-06, 'epoch': 0.79} + 79%|███████▉ | 6906/8750 [3:38:38<3:18:27, 6.46s/it] 79%|███████▉ | 6907/8750 [3:38:43<3:12:44, 6.27s/it] 79%|███████▉ | 6907/8750 [3:38:41<3:12:44, 6.27s/it] {'loss': 0.4767, 'learning_rate': 2.238217082381169e-06, 'epoch': 0.79} + 79%|███████▉ | 6907/8750 [3:38:43<3:12:44, 6.27s/it] {'loss': 0.4767, 'learning_rate': 2.238217082381169e-06, 'epoch': 0.79} + 79%|███████▉ | 6907/8750 [3:38:41<3:12:44, 6.27s/it] 79%|███████▉ | 6908/8750 [3:38:49<3:10:09, 6.19s/it] 79%|███████▉ | 6908/8750 [3:38:47<3:10:09, 6.19s/it] {'loss': 0.45, 'learning_rate': 2.235883672240239e-06, 'epoch': 0.79} + 79%|███████▉ | 6908/8750 [3:38:49<3:10:09, 6.19s/it] {'loss': 0.45, 'learning_rate': 2.235883672240239e-06, 'epoch': 0.79} + 79%|███████▉ | 6908/8750 [3:38:47<3:10:09, 6.19s/it] 79%|███████▉ | 6909/8750 [3:38:55<3:06:11, 6.07s/it] {'loss': 0.4683, 'learning_rate': 2.233551325956591e-06, 'epoch': 0.79} + 79%|███████▉ | 6909/8750 [3:38:55<3:06:11, 6.07s/it] 79%|███████▉ | 6909/8750 [3:38:53<3:06:11, 6.07s/it] {'loss': 0.4683, 'learning_rate': 2.233551325956591e-06, 'epoch': 0.79} + 79%|███████▉ | 6909/8750 [3:38:53<3:06:11, 6.07s/it] 79%|███████▉ | 6910/8750 [3:39:01<3:02:39, 5.96s/it] 79%|███████▉ | 6910/8750 [3:38:59<3:02:39, 5.96s/it] {'loss': 0.4498, 'learning_rate': 2.2312200438498043e-06, 'epoch': 0.79} + 79%|███████▉ | 6910/8750 [3:39:01<3:02:39, 5.96s/it] {'loss': 0.4498, 'learning_rate': 2.2312200438498043e-06, 'epoch': 0.79} + 79%|███████▉ | 6910/8750 [3:38:59<3:02:39, 5.96s/it] 79%|███████▉ | 6911/8750 [3:39:07<3:01:52, 5.93s/it] 79%|███████▉ | 6911/8750 [3:39:04<3:01:52, 5.93s/it] {'loss': 0.452, 'learning_rate': 2.2288898262393212e-06, 'epoch': 0.79} + 79%|███████▉ | 6911/8750 [3:39:07<3:01:52, 5.93s/it] {'loss': 0.452, 'learning_rate': 2.2288898262393212e-06, 'epoch': 0.79} + 79%|███████▉ | 6911/8750 [3:39:04<3:01:52, 5.93s/it] 79%|███████▉ 
| 6912/8750 [3:39:12<2:58:59, 5.84s/it] 79%|███████▉ | 6912/8750 [3:39:10<2:58:59, 5.84s/it] {'loss': 0.4564, 'learning_rate': 2.2265606734444314e-06, 'epoch': 0.79} + 79%|███████▉ | 6912/8750 [3:39:12<2:58:59, 5.84s/it] {'loss': 0.4564, 'learning_rate': 2.2265606734444314e-06, 'epoch': 0.79} + 79%|███████▉ | 6912/8750 [3:39:10<2:58:59, 5.84s/it] 79%|███████▉ | 6913/8750 [3:39:18<2:59:49, 5.87s/it] 79%|███████▉ | 6913/8750 [3:39:16<2:59:49, 5.87s/it] {'loss': 0.4566, 'learning_rate': 2.2242325857842773e-06, 'epoch': 0.79} + 79%|███████▉ | 6913/8750 [3:39:18<2:59:49, 5.87s/it] {'loss': 0.4566, 'learning_rate': 2.2242325857842773e-06, 'epoch': 0.79} + 79%|███████▉ | 6913/8750 [3:39:16<2:59:49, 5.87s/it] 79%|███████▉ | 6914/8750 [3:39:24<2:57:40, 5.81s/it] 79%|███████▉ | 6914/8750 [3:39:22<2:57:41, 5.81s/it] {'loss': 0.4514, 'learning_rate': 2.2219055635778618e-06, 'epoch': 0.79} + 79%|███████▉ | 6914/8750 [3:39:24<2:57:40, 5.81s/it] {'loss': 0.4514, 'learning_rate': 2.2219055635778618e-06, 'epoch': 0.79} + 79%|███████▉ | 6914/8750 [3:39:22<2:57:41, 5.81s/it] 79%|███████▉ | 6915/8750 [3:39:30<2:58:19, 5.83s/it] 79%|███████▉ | 6915/8750 [3:39:28<2:58:19, 5.83s/it] {'loss': 0.4527, 'learning_rate': 2.219579607144039e-06, 'epoch': 0.79} + 79%|███████▉ | 6915/8750 [3:39:30<2:58:19, 5.83s/it] {'loss': 0.4527, 'learning_rate': 2.219579607144039e-06, 'epoch': 0.79} + 79%|███████▉ | 6915/8750 [3:39:28<2:58:19, 5.83s/it] 79%|███████▉ | 6916/8750 [3:39:35<2:55:51, 5.75s/it] 79%|███████▉ | 6916/8750 [3:39:33<2:55:51, 5.75s/it] {'loss': 0.4504, 'learning_rate': 2.21725471680152e-06, 'epoch': 0.79} + 79%|███████▉ | 6916/8750 [3:39:35<2:55:51, 5.75s/it] {'loss': 0.4504, 'learning_rate': 2.21725471680152e-06, 'epoch': 0.79} + 79%|███████▉ | 6916/8750 [3:39:33<2:55:51, 5.75s/it] 79%|███████▉ | 6917/8750 [3:39:41<2:55:01, 5.73s/it] 79%|███████▉ | 6917/8750 [3:39:39<2:55:01, 5.73s/it] {'loss': 0.4526, 'learning_rate': 2.214930892868864e-06, 'epoch': 0.79} + 79%|███████▉ | 6917/8750 
[3:39:41<2:55:01, 5.73s/it] {'loss': 0.4526, 'learning_rate': 2.214930892868864e-06, 'epoch': 0.79} + 79%|███████▉ | 6917/8750 [3:39:39<2:55:01, 5.73s/it] 79%|███████▉ | 6918/8750 [3:39:47<2:58:29, 5.85s/it] 79%|███████▉ | 6918/8750 [3:39:45<2:58:29, 5.85s/it] {'loss': 0.4445, 'learning_rate': 2.2126081356644836e-06, 'epoch': 0.79} + 79%|███████▉ | 6918/8750 [3:39:47<2:58:29, 5.85s/it] {'loss': 0.4445, 'learning_rate': 2.2126081356644836e-06, 'epoch': 0.79} + 79%|███████▉ | 6918/8750 [3:39:45<2:58:29, 5.85s/it] 79%|███████▉ | 6919/8750 [3:39:53<2:59:12, 5.87s/it] 79%|███████▉ | 6919/8750 [3:39:51<2:59:12, 5.87s/it] {'loss': 0.4556, 'learning_rate': 2.210286445506654e-06, 'epoch': 0.79} + 79%|███████▉ | 6919/8750 [3:39:53<2:59:12, 5.87s/it] {'loss': 0.4556, 'learning_rate': 2.210286445506654e-06, 'epoch': 0.79} + 79%|███████▉ | 6919/8750 [3:39:51<2:59:12, 5.87s/it] 79%|███████▉ | 6920/8750 [3:39:59<2:57:41, 5.83s/it] 79%|███████▉ | 6920/8750 [3:39:57<2:57:42, 5.83s/it] {'loss': 0.4578, 'learning_rate': 2.207965822713496e-06, 'epoch': 0.79} + 79%|███████▉ | 6920/8750 [3:39:59<2:57:41, 5.83s/it] {'loss': 0.4578, 'learning_rate': 2.207965822713496e-06, 'epoch': 0.79} + 79%|███████▉ | 6920/8750 [3:39:57<2:57:42, 5.83s/it] 79%|███████▉ | 6921/8750 [3:40:05<3:00:48, 5.93s/it] 79%|███████▉ | 6921/8750 [3:40:03<3:00:48, 5.93s/it] {'loss': 0.4627, 'learning_rate': 2.205646267602983e-06, 'epoch': 0.79} + 79%|███████▉ | 6921/8750 [3:40:05<3:00:48, 5.93s/it] {'loss': 0.4627, 'learning_rate': 2.205646267602983e-06, 'epoch': 0.79} + 79%|███████▉ | 6921/8750 [3:40:03<3:00:48, 5.93s/it] 79%|███████▉ | 6922/8750 [3:40:11<2:57:49, 5.84s/it] 79%|███████▉ | 6922/8750 [3:40:08<2:57:49, 5.84s/it] {'loss': 0.454, 'learning_rate': 2.203327780492953e-06, 'epoch': 0.79} + 79%|███████▉ | 6922/8750 [3:40:11<2:57:49, 5.84s/it] {'loss': 0.454, 'learning_rate': 2.203327780492953e-06, 'epoch': 0.79} + 79%|███████▉ | 6922/8750 [3:40:08<2:57:49, 5.84s/it] 79%|███████▉ | 6923/8750 [3:40:16<2:54:56, 
5.75s/it] {'loss': 0.4572, 'learning_rate': 2.2010103617010836e-06, 'epoch': 0.79} + 79%|███████▉ | 6923/8750 [3:40:16<2:54:56, 5.75s/it] 79%|███████▉ | 6923/8750 [3:40:14<2:54:56, 5.74s/it] {'loss': 0.4572, 'learning_rate': 2.2010103617010836e-06, 'epoch': 0.79} + 79%|███████▉ | 6923/8750 [3:40:14<2:54:56, 5.74s/it] 79%|███████▉ | 6924/8750 [3:40:22<2:54:29, 5.73s/it] 79%|███████▉ | 6924/8750 [3:40:20<2:54:29, 5.73s/it] {'loss': 0.4375, 'learning_rate': 2.1986940115449173e-06, 'epoch': 0.79} + 79%|███████▉ | 6924/8750 [3:40:22<2:54:29, 5.73s/it] {'loss': 0.4375, 'learning_rate': 2.1986940115449173e-06, 'epoch': 0.79} + 79%|███████▉ | 6924/8750 [3:40:20<2:54:29, 5.73s/it] 79%|███████▉ | 6925/8750 [3:40:28<2:53:52, 5.72s/it] 79%|███████▉ | 6925/8750 [3:40:25<2:53:52, 5.72s/it] {'loss': 0.4548, 'learning_rate': 2.196378730341846e-06, 'epoch': 0.79} + 79%|███████▉ | 6925/8750 [3:40:28<2:53:52, 5.72s/it] {'loss': 0.4548, 'learning_rate': 2.196378730341846e-06, 'epoch': 0.79} + 79%|███████▉ | 6925/8750 [3:40:25<2:53:52, 5.72s/it] 79%|███████▉ | 6926/8750 [3:40:33<2:53:53, 5.72s/it] 79%|███████▉ | 6926/8750 [3:40:31<2:53:53, 5.72s/it] {'loss': 0.4455, 'learning_rate': 2.1940645184091115e-06, 'epoch': 0.79} + 79%|███████▉ | 6926/8750 [3:40:33<2:53:53, 5.72s/it] {'loss': 0.4455, 'learning_rate': 2.1940645184091115e-06, 'epoch': 0.79} + 79%|███████▉ | 6926/8750 [3:40:31<2:53:53, 5.72s/it] 79%|███████▉ | 6927/8750 [3:40:39<2:56:46, 5.82s/it] 79%|███████▉ | 6927/8750 [3:40:37<2:56:46, 5.82s/it] {'loss': 0.4483, 'learning_rate': 2.1917513760638177e-06, 'epoch': 0.79} + 79%|███████▉ | 6927/8750 [3:40:39<2:56:46, 5.82s/it] {'loss': 0.4483, 'learning_rate': 2.1917513760638177e-06, 'epoch': 0.79} + 79%|███████▉ | 6927/8750 [3:40:37<2:56:46, 5.82s/it] 79%|███████▉ | 6928/8750 [3:40:45<2:57:37, 5.85s/it] 79%|███████▉ | 6928/8750 [3:40:43<2:57:37, 5.85s/it] {'loss': 0.4536, 'learning_rate': 2.18943930362291e-06, 'epoch': 0.79} + 79%|███████▉ | 6928/8750 [3:40:45<2:57:37, 5.85s/it] 
{'loss': 0.4536, 'learning_rate': 2.18943930362291e-06, 'epoch': 0.79} + 79%|███████▉ | 6928/8750 [3:40:43<2:57:37, 5.85s/it] 79%|███████▉ | 6929/8750 [3:40:51<2:58:02, 5.87s/it] 79%|███████▉ | 6929/8750 [3:40:49<2:58:01, 5.87s/it] {'loss': 0.4454, 'learning_rate': 2.1871283014032007e-06, 'epoch': 0.79} + 79%|███████▉ | 6929/8750 [3:40:51<2:58:02, 5.87s/it] {'loss': 0.4454, 'learning_rate': 2.1871283014032007e-06, 'epoch': 0.79} + 79%|███████▉ | 6929/8750 [3:40:49<2:58:01, 5.87s/it] 79%|███████▉ | 6930/8750 [3:40:57<2:56:36, 5.82s/it] 79%|███████▉ | 6930/8750 [3:40:55<2:56:36, 5.82s/it] {'loss': 0.4693, 'learning_rate': 2.1848183697213467e-06, 'epoch': 0.79} + 79%|███████▉ | 6930/8750 [3:40:57<2:56:36, 5.82s/it] {'loss': 0.4693, 'learning_rate': 2.1848183697213467e-06, 'epoch': 0.79} + 79%|███████▉ | 6930/8750 [3:40:55<2:56:36, 5.82s/it] 79%|███████▉ | 6931/8750 [3:41:02<2:53:13, 5.71s/it] 79%|███████▉ | 6931/8750 [3:41:00<2:53:13, 5.71s/it] {'loss': 0.4515, 'learning_rate': 2.1825095088938553e-06, 'epoch': 0.79} + 79%|███████▉ | 6931/8750 [3:41:02<2:53:13, 5.71s/it] {'loss': 0.4515, 'learning_rate': 2.1825095088938553e-06, 'epoch': 0.79} + 79%|███████▉ | 6931/8750 [3:41:00<2:53:13, 5.71s/it] 79%|███████▉ | 6932/8750 [3:41:08<2:52:16, 5.69s/it] 79%|███████▉ | 6932/8750 [3:41:06<2:52:16, 5.69s/it] {'loss': 0.451, 'learning_rate': 2.1802017192370963e-06, 'epoch': 0.79} + 79%|███████▉ | 6932/8750 [3:41:08<2:52:16, 5.69s/it] {'loss': 0.451, 'learning_rate': 2.1802017192370963e-06, 'epoch': 0.79} + 79%|███████▉ | 6932/8750 [3:41:06<2:52:16, 5.69s/it] 79%|███████▉ | 6933/8750 [3:41:14<2:52:26, 5.69s/it] 79%|███████▉ | 6933/8750 [3:41:11<2:52:26, 5.69s/it] {'loss': 0.4543, 'learning_rate': 2.1778950010672895e-06, 'epoch': 0.79} + 79%|███████▉ | 6933/8750 [3:41:14<2:52:26, 5.69s/it] {'loss': 0.4543, 'learning_rate': 2.1778950010672895e-06, 'epoch': 0.79} + 79%|███████▉ | 6933/8750 [3:41:11<2:52:26, 5.69s/it] 79%|███████▉ | 6934/8750 [3:41:20<2:54:03, 5.75s/it] 79%|███████▉ 
| 6934/8750 [3:41:17<2:54:03, 5.75s/it] {'loss': 0.4561, 'learning_rate': 2.1755893547005036e-06, 'epoch': 0.79} + 79%|███████▉ | 6934/8750 [3:41:20<2:54:03, 5.75s/it] {'loss': 0.4561, 'learning_rate': 2.1755893547005036e-06, 'epoch': 0.79} + 79%|███████▉ | 6934/8750 [3:41:17<2:54:03, 5.75s/it] 79%|███████▉ | 6935/8750 [3:41:25<2:55:15, 5.79s/it] 79%|███████▉ | 6935/8750 [3:41:23<2:55:16, 5.79s/it] {'loss': 0.4421, 'learning_rate': 2.173284780452667e-06, 'epoch': 0.79} + 79%|███████▉ | 6935/8750 [3:41:25<2:55:15, 5.79s/it] {'loss': 0.4421, 'learning_rate': 2.173284780452667e-06, 'epoch': 0.79} + 79%|███████▉ | 6935/8750 [3:41:23<2:55:16, 5.79s/it] 79%|███████▉ | 6936/8750 [3:41:31<2:55:55, 5.82s/it] 79%|███████▉ | 6936/8750 [3:41:29<2:55:55, 5.82s/it] {'loss': 0.4498, 'learning_rate': 2.1709812786395545e-06, 'epoch': 0.79} + 79%|███████▉ | 6936/8750 [3:41:31<2:55:55, 5.82s/it] {'loss': 0.4498, 'learning_rate': 2.1709812786395545e-06, 'epoch': 0.79} + 79%|███████▉ | 6936/8750 [3:41:29<2:55:55, 5.82s/it] 79%|███████▉ | 6937/8750 [3:41:37<2:57:54, 5.89s/it] 79%|███████▉ | 6937/8750 [3:41:35<2:57:54, 5.89s/it] {'loss': 0.4632, 'learning_rate': 2.1686788495768006e-06, 'epoch': 0.79} + 79%|███████▉ | 6937/8750 [3:41:37<2:57:54, 5.89s/it] {'loss': 0.4632, 'learning_rate': 2.1686788495768006e-06, 'epoch': 0.79} + 79%|███████▉ | 6937/8750 [3:41:35<2:57:54, 5.89s/it] 79%|███████▉ | 6938/8750 [3:41:43<2:55:00, 5.79s/it] 79%|███████▉ | 6938/8750 [3:41:41<2:55:00, 5.79s/it] {'loss': 0.4392, 'learning_rate': 2.1663774935798886e-06, 'epoch': 0.79} + 79%|███████▉ | 6938/8750 [3:41:43<2:55:00, 5.79s/it] {'loss': 0.4392, 'learning_rate': 2.1663774935798886e-06, 'epoch': 0.79} + 79%|███████▉ | 6938/8750 [3:41:41<2:55:00, 5.79s/it] 79%|███████▉ | 6939/8750 [3:41:49<2:55:34, 5.82s/it] 79%|███████▉ | 6939/8750 [3:41:47<2:55:34, 5.82s/it] {'loss': 0.4639, 'learning_rate': 2.1640772109641504e-06, 'epoch': 0.79} + 79%|███████▉ | 6939/8750 [3:41:49<2:55:34, 5.82s/it] {'loss': 0.4639, 
'learning_rate': 2.1640772109641504e-06, 'epoch': 0.79} + 79%|███████▉ | 6939/8750 [3:41:47<2:55:34, 5.82s/it] 79%|███████▉ | 6940/8750 [3:41:54<2:54:08, 5.77s/it] 79%|███████▉ | 6940/8750 [3:41:52<2:54:08, 5.77s/it] {'loss': 0.4477, 'learning_rate': 2.1617780020447854e-06, 'epoch': 0.79} + 79%|███████▉ | 6940/8750 [3:41:55<2:54:08, 5.77s/it] {'loss': 0.4477, 'learning_rate': 2.1617780020447854e-06, 'epoch': 0.79} + 79%|███████▉ | 6940/8750 [3:41:52<2:54:08, 5.77s/it] 79%|███████▉ | 6941/8750 [3:42:00<2:53:33, 5.76s/it] 79%|███████▉ | 6941/8750 [3:41:58<2:53:33, 5.76s/it] {'loss': 0.4378, 'learning_rate': 2.1594798671368265e-06, 'epoch': 0.79} + 79%|███████▉ | 6941/8750 [3:42:00<2:53:33, 5.76s/it] {'loss': 0.4378, 'learning_rate': 2.1594798671368265e-06, 'epoch': 0.79} + 79%|███████▉ | 6941/8750 [3:41:58<2:53:33, 5.76s/it] 79%|███████▉ | 6942/8750 [3:42:06<2:52:50, 5.74s/it] 79%|███████▉ | 6942/8750 [3:42:04<2:52:50, 5.74s/it] {'loss': 0.4584, 'learning_rate': 2.157182806555177e-06, 'epoch': 0.79} + 79%|███████▉ | 6942/8750 [3:42:06<2:52:50, 5.74s/it] {'loss': 0.4584, 'learning_rate': 2.157182806555177e-06, 'epoch': 0.79} + 79%|███████▉ | 6942/8750 [3:42:04<2:52:50, 5.74s/it] 79%|███████▉ | 6943/8750 [3:42:12<2:52:36, 5.73s/it] 79%|███████▉ | 6943/8750 [3:42:09<2:52:36, 5.73s/it] {'loss': 0.4768, 'learning_rate': 2.1548868206145846e-06, 'epoch': 0.79} + 79%|███████▉ | 6943/8750 [3:42:12<2:52:36, 5.73s/it] {'loss': 0.4768, 'learning_rate': 2.1548868206145846e-06, 'epoch': 0.79} + 79%|███████▉ | 6943/8750 [3:42:09<2:52:36, 5.73s/it] 79%|███████▉ | 6944/8750 [3:42:17<2:52:59, 5.75s/it] 79%|███████▉ | 6944/8750 [3:42:15<2:52:59, 5.75s/it] {'loss': 0.4334, 'learning_rate': 2.1525919096296455e-06, 'epoch': 0.79} + 79%|███████▉ | 6944/8750 [3:42:17<2:52:59, 5.75s/it] {'loss': 0.4334, 'learning_rate': 2.1525919096296455e-06, 'epoch': 0.79} + 79%|███████▉ | 6944/8750 [3:42:15<2:52:59, 5.75s/it] 79%|███████▉ | 6945/8750 [3:42:23<2:55:44, 5.84s/it] 79%|███████▉ | 6945/8750 
[3:42:21<2:55:44, 5.84s/it] {'loss': 0.4477, 'learning_rate': 2.1502980739148215e-06, 'epoch': 0.79} + 79%|███████▉ | 6945/8750 [3:42:23<2:55:44, 5.84s/it] {'loss': 0.4477, 'learning_rate': 2.1502980739148215e-06, 'epoch': 0.79} + 79%|███████▉ | 6945/8750 [3:42:21<2:55:44, 5.84s/it] 79%|███████▉ | 6946/8750 [3:42:29<2:53:42, 5.78s/it] 79%|███████▉ | 6946/8750 [3:42:27<2:53:42, 5.78s/it] {'loss': 0.4488, 'learning_rate': 2.1480053137844115e-06, 'epoch': 0.79} + 79%|███████▉ | 6946/8750 [3:42:29<2:53:42, 5.78s/it] {'loss': 0.4488, 'learning_rate': 2.1480053137844115e-06, 'epoch': 0.79} + 79%|███████▉ | 6946/8750 [3:42:27<2:53:42, 5.78s/it] 79%|███████▉ | 6947/8750 [3:42:35<2:54:59, 5.82s/it] 79%|███████▉ | 6947/8750 [3:42:33<2:54:59, 5.82s/it] {'loss': 0.4608, 'learning_rate': 2.1457136295525817e-06, 'epoch': 0.79} + 79%|███████▉ | 6947/8750 [3:42:35<2:54:59, 5.82s/it] {'loss': 0.4608, 'learning_rate': 2.1457136295525817e-06, 'epoch': 0.79} + 79%|███████▉ | 6947/8750 [3:42:33<2:54:59, 5.82s/it] 79%|███████▉ | 6948/8750 [3:42:41<2:52:22, 5.74s/it] 79%|███████▉ | 6948/8750 [3:42:38<2:52:22, 5.74s/it] {'loss': 0.4605, 'learning_rate': 2.1434230215333407e-06, 'epoch': 0.79} + 79%|███████▉ | 6948/8750 [3:42:41<2:52:22, 5.74s/it] {'loss': 0.4605, 'learning_rate': 2.1434230215333407e-06, 'epoch': 0.79} + 79%|███████▉ | 6948/8750 [3:42:38<2:52:22, 5.74s/it] 79%|███████▉ | 6949/8750 [3:42:46<2:52:31, 5.75s/it] {'loss': 0.4562, 'learning_rate': 2.14113349004055e-06, 'epoch': 0.79} + 79%|███████▉ | 6949/8750 [3:42:44<2:52:31, 5.75s/it] 79%|███████▉ | 6949/8750 [3:42:46<2:52:31, 5.75s/it] {'loss': 0.4562, 'learning_rate': 2.14113349004055e-06, 'epoch': 0.79} + 79%|███████▉ | 6949/8750 [3:42:44<2:52:31, 5.75s/it]9 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... 
+ 79%|███████▉ | 6950/8750 [3:42:52<2:51:55, 5.73s/it]5 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend...0 +15 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 79%|███████▉ | 6950/8750 [3:42:50<2:51:55, 5.73s/it]6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4482, 'learning_rate': 2.138845035387932e-06, 'epoch': 0.79} + 79%|███████▉ | 6950/8750 [3:42:52<2:51:55, 5.73s/it] {'loss': 0.4482, 'learning_rate': 2.138845035387932e-06, 'epoch': 0.79} + 79%|███████▉ | 6950/8750 [3:42:50<2:51:55, 5.73s/it] 79%|███████▉ | 6951/8750 [3:42:58<2:52:00, 5.74s/it] 79%|███████▉ | 6951/8750 [3:42:56<2:52:00, 5.74s/it] {'loss': 0.4535, 'learning_rate': 2.1365576578890513e-06, 'epoch': 0.79} + 79%|███████▉ | 6951/8750 [3:42:58<2:52:00, 5.74s/it] {'loss': 0.4535, 'learning_rate': 2.1365576578890513e-06, 'epoch': 0.79} + 79%|███████▉ | 6951/8750 [3:42:56<2:52:00, 5.74s/it] 79%|███████▉ | 6952/8750 [3:43:03<2:51:42, 5.73s/it] 79%|███████▉ | 6952/8750 [3:43:01<2:51:42, 5.73s/it] {'loss': 0.44, 'learning_rate': 2.1342713578573327e-06, 'epoch': 0.79} + 79%|███████▉ | 6952/8750 [3:43:03<2:51:42, 5.73s/it] {'loss': 0.44, 'learning_rate': 2.1342713578573327e-06, 'epoch': 0.79} + 79%|███████▉ | 6952/8750 [3:43:01<2:51:42, 5.73s/it] 79%|███████▉ | 6953/8750 [3:43:09<2:50:51, 5.70s/it] 79%|███████▉ | 6953/8750 [3:43:07<2:50:51, 5.70s/it] {'loss': 0.447, 'learning_rate': 2.131986135606051e-06, 'epoch': 0.79} + 79%|███████▉ | 6953/8750 [3:43:09<2:50:51, 5.70s/it] {'loss': 0.447, 'learning_rate': 2.131986135606051e-06, 'epoch': 0.79} + 79%|███████▉ | 6953/8750 [3:43:07<2:50:51, 5.70s/it] 
79%|███████▉ | 6954/8750 [3:43:15<2:50:10, 5.68s/it] 79%|███████▉ | 6954/8750 [3:43:13<2:50:09, 5.68s/it] {'loss': 0.4267, 'learning_rate': 2.1297019914483297e-06, 'epoch': 0.79} + 79%|███████▉ | 6954/8750 [3:43:15<2:50:10, 5.68s/it] {'loss': 0.4267, 'learning_rate': 2.1297019914483297e-06, 'epoch': 0.79} + 79%|███████▉ | 6954/8750 [3:43:13<2:50:09, 5.68s/it] 79%|███████▉ | 6955/8750 [3:43:18<2:50:41, 5.71s/it] 79%|███████▉ | 6955/8750 [3:43:21<2:50:42, 5.71s/it] {'loss': 0.442, 'learning_rate': 2.1274189256971523e-06, 'epoch': 0.79} + 79%|███████▉ | 6955/8750 [3:43:21<2:50:42, 5.71s/it] {'loss': 0.442, 'learning_rate': 2.1274189256971523e-06, 'epoch': 0.79} + 79%|███████▉ | 6955/8750 [3:43:18<2:50:41, 5.71s/it] 79%|███████▉ | 6956/8750 [3:43:26<2:49:51, 5.68s/it] 79%|███████▉ | 6956/8750 [3:43:24<2:49:51, 5.68s/it] {'loss': 0.4605, 'learning_rate': 2.1251369386653454e-06, 'epoch': 0.79} + 79%|███████▉ | 6956/8750 [3:43:26<2:49:51, 5.68s/it] {'loss': 0.4605, 'learning_rate': 2.1251369386653454e-06, 'epoch': 0.79} + 79%|███████▉ | 6956/8750 [3:43:24<2:49:51, 5.68s/it] 80%|███████▉ | 6957/8750 [3:43:30<2:51:05, 5.73s/it] 80%|███████▉ | 6957/8750 [3:43:32<2:51:07, 5.73s/it] {'loss': 0.4559, 'learning_rate': 2.122856030665591e-06, 'epoch': 0.8} + 80%|███████▉ | 6957/8750 [3:43:32<2:51:07, 5.73s/it] {'loss': 0.4559, 'learning_rate': 2.122856030665591e-06, 'epoch': 0.8} + 80%|███████▉ | 6957/8750 [3:43:30<2:51:05, 5.73s/it] 80%|███████▉ | 6958/8750 [3:43:38<2:50:35, 5.71s/it] 80%|███████▉ | 6958/8750 [3:43:35<2:50:35, 5.71s/it] {'loss': 0.4631, 'learning_rate': 2.1205762020104303e-06, 'epoch': 0.8} + 80%|███████▉ | 6958/8750 [3:43:38<2:50:35, 5.71s/it] {'loss': 0.4631, 'learning_rate': 2.1205762020104303e-06, 'epoch': 0.8} + 80%|███████▉ | 6958/8750 [3:43:35<2:50:35, 5.71s/it] 80%|███████▉ | 6959/8750 [3:43:43<2:50:55, 5.73s/it] 80%|███████▉ | 6959/8750 [3:43:41<2:50:55, 5.73s/it] {'loss': 0.4636, 'learning_rate': 2.1182974530122435e-06, 'epoch': 0.8} + 80%|███████▉ | 
6959/8750 [3:43:43<2:50:55, 5.73s/it] {'loss': 0.4636, 'learning_rate': 2.1182974530122435e-06, 'epoch': 0.8} + 80%|███████▉ | 6959/8750 [3:43:41<2:50:55, 5.73s/it] 80%|███████▉ | 6960/8750 [3:43:47<2:52:42, 5.79s/it] 80%|███████▉ | 6960/8750 [3:43:49<2:52:43, 5.79s/it] {'loss': 0.4414, 'learning_rate': 2.1160197839832774e-06, 'epoch': 0.8} + 80%|███████▉ | 6960/8750 [3:43:49<2:52:43, 5.79s/it] {'loss': 0.4414, 'learning_rate': 2.1160197839832774e-06, 'epoch': 0.8} + 80%|███████▉ | 6960/8750 [3:43:47<2:52:42, 5.79s/it] 80%|███████▉ | 6961/8750 [3:43:53<2:51:35, 5.76s/it] 80%|███████▉ | 6961/8750 [3:43:55<2:51:36, 5.76s/it] {'loss': 0.4593, 'learning_rate': 2.113743195235617e-06, 'epoch': 0.8} + 80%|███████▉ | 6961/8750 [3:43:55<2:51:36, 5.76s/it] {'loss': 0.4593, 'learning_rate': 2.113743195235617e-06, 'epoch': 0.8} + 80%|███████▉ | 6961/8750 [3:43:53<2:51:35, 5.76s/it] 80%|███████▉ | 6962/8750 [3:44:01<2:50:32, 5.72s/it] 80%|███████▉ | 6962/8750 [3:43:58<2:50:33, 5.72s/it]{'loss': 0.4448, 'learning_rate': 2.111467687081209e-06, 'epoch': 0.8} + {'loss': 0.4448, 'learning_rate': 2.111467687081209e-06, 'epoch': 0.8} + 80%|███████▉ | 6962/8750 [3:44:01<2:50:32, 5.72s/it] 80%|███████▉ | 6962/8750 [3:43:58<2:50:33, 5.72s/it] 80%|███████▉ | 6963/8750 [3:44:04<2:51:11, 5.75s/it] 80%|███████▉ | 6963/8750 [3:44:06<2:51:12, 5.75s/it] {'loss': 0.4518, 'learning_rate': 2.109193259831851e-06, 'epoch': 0.8} + 80%|███████▉ | 6963/8750 [3:44:06<2:51:12, 5.75s/it] {'loss': 0.4518, 'learning_rate': 2.109193259831851e-06, 'epoch': 0.8} + 80%|███████▉ | 6963/8750 [3:44:04<2:51:11, 5.75s/it] 80%|███████▉ | 6964/8750 [3:44:12<2:50:13, 5.72s/it] 80%|███████▉ | 6964/8750 [3:44:10<2:50:14, 5.72s/it] {'loss': 0.4534, 'learning_rate': 2.106919913799188e-06, 'epoch': 0.8} + 80%|███████▉ | 6964/8750 [3:44:12<2:50:13, 5.72s/it] {'loss': 0.4534, 'learning_rate': 2.106919913799188e-06, 'epoch': 0.8} + 80%|███████▉ | 6964/8750 [3:44:10<2:50:14, 5.72s/it] 80%|███████▉ | 6965/8750 [3:44:18<2:50:09, 
5.72s/it] 80%|███████▉ | 6965/8750 [3:44:16<2:50:09, 5.72s/it] {'loss': 0.4483, 'learning_rate': 2.1046476492947155e-06, 'epoch': 0.8} + 80%|███████▉ | 6965/8750 [3:44:18<2:50:09, 5.72s/it] {'loss': 0.4483, 'learning_rate': 2.1046476492947155e-06, 'epoch': 0.8} + 80%|███████▉ | 6965/8750 [3:44:16<2:50:09, 5.72s/it] 80%|███████▉ | 6966/8750 [3:44:23<2:49:13, 5.69s/it] 80%|███████▉ | 6966/8750 [3:44:21<2:49:13, 5.69s/it] {'loss': 0.4542, 'learning_rate': 2.102376466629792e-06, 'epoch': 0.8} + 80%|███████▉ | 6966/8750 [3:44:23<2:49:13, 5.69s/it] {'loss': 0.4542, 'learning_rate': 2.102376466629792e-06, 'epoch': 0.8} + 80%|███████▉ | 6966/8750 [3:44:21<2:49:13, 5.69s/it] 80%|███████▉ | 6967/8750 [3:44:29<2:49:57, 5.72s/it] 80%|███████▉ | 6967/8750 [3:44:27<2:49:58, 5.72s/it] {'loss': 0.4583, 'learning_rate': 2.100106366115613e-06, 'epoch': 0.8} + {'loss': 0.4583, 'learning_rate': 2.100106366115613e-06, 'epoch': 0.8} + 80%|███████▉ | 6967/8750 [3:44:29<2:49:57, 5.72s/it] 80%|███████▉ | 6967/8750 [3:44:27<2:49:58, 5.72s/it] 80%|███████▉ | 6968/8750 [3:44:35<2:52:03, 5.79s/it] 80%|███████▉ | 6968/8750 [3:44:33<2:52:03, 5.79s/it] {'loss': 0.4349, 'learning_rate': 2.0978373480632386e-06, 'epoch': 0.8} + 80%|███████▉ | 6968/8750 [3:44:35<2:52:03, 5.79s/it] {'loss': 0.4349, 'learning_rate': 2.0978373480632386e-06, 'epoch': 0.8} + 80%|███████▉ | 6968/8750 [3:44:33<2:52:03, 5.79s/it] 80%|███████▉ | 6969/8750 [3:44:41<2:51:23, 5.77s/it] 80%|███████▉ | 6969/8750 [3:44:39<2:51:23, 5.77s/it] {'loss': 0.447, 'learning_rate': 2.0955694127835736e-06, 'epoch': 0.8} + 80%|███████▉ | 6969/8750 [3:44:41<2:51:23, 5.77s/it] {'loss': 0.447, 'learning_rate': 2.0955694127835736e-06, 'epoch': 0.8} + 80%|███████▉ | 6969/8750 [3:44:39<2:51:23, 5.77s/it] 80%|███████▉ | 6970/8750 [3:44:47<2:52:25, 5.81s/it] 80%|███████▉ | 6970/8750 [3:44:45<2:52:26, 5.81s/it] {'loss': 0.4611, 'learning_rate': 2.0933025605873702e-06, 'epoch': 0.8} + 80%|███████▉ | 6970/8750 [3:44:45<2:52:26, 5.81s/it]{'loss': 0.4611, 
'learning_rate': 2.0933025605873702e-06, 'epoch': 0.8} + 80%|███████▉ | 6970/8750 [3:44:47<2:52:25, 5.81s/it] 80%|███████▉ | 6971/8750 [3:44:53<2:51:43, 5.79s/it] {'loss': 0.4435, 'learning_rate': 2.0910367917852437e-06, 'epoch': 0.8} + 80%|███████▉ | 6971/8750 [3:44:53<2:51:43, 5.79s/it] 80%|███████▉ | 6971/8750 [3:44:50<2:51:43, 5.79s/it] {'loss': 0.4435, 'learning_rate': 2.0910367917852437e-06, 'epoch': 0.8} + 80%|███████▉ | 6971/8750 [3:44:50<2:51:43, 5.79s/it] 80%|███████▉ | 6972/8750 [3:44:59<2:53:01, 5.84s/it] 80%|███████▉ | 6972/8750 [3:44:56<2:53:02, 5.84s/it] {'loss': 0.4551, 'learning_rate': 2.088772106687653e-06, 'epoch': 0.8} + 80%|███████▉ | 6972/8750 [3:44:59<2:53:01, 5.84s/it] {'loss': 0.4551, 'learning_rate': 2.088772106687653e-06, 'epoch': 0.8} + 80%|███████▉ | 6972/8750 [3:44:56<2:53:02, 5.84s/it] 80%|███████▉ | 6973/8750 [3:45:04<2:51:36, 5.79s/it] 80%|███████▉ | 6973/8750 [3:45:02<2:51:36, 5.79s/it] {'loss': 0.4698, 'learning_rate': 2.0865085056049138e-06, 'epoch': 0.8} + 80%|███████▉ | 6973/8750 [3:45:02<2:51:36, 5.79s/it]{'loss': 0.4698, 'learning_rate': 2.0865085056049138e-06, 'epoch': 0.8} + 80%|███████▉ | 6973/8750 [3:45:04<2:51:36, 5.79s/it] 80%|███████▉ | 6974/8750 [3:45:08<2:50:30, 5.76s/it] 80%|███████▉ | 6974/8750 [3:45:10<2:50:31, 5.76s/it] {'loss': 0.4405, 'learning_rate': 2.084245988847188e-06, 'epoch': 0.8} + 80%|███████▉ | 6974/8750 [3:45:10<2:50:31, 5.76s/it] {'loss': 0.4405, 'learning_rate': 2.084245988847188e-06, 'epoch': 0.8} + 80%|███████▉ | 6974/8750 [3:45:08<2:50:30, 5.76s/it] 80%|███████▉ | 6975/8750 [3:45:16<2:52:04, 5.82s/it] 80%|███████▉ | 6975/8750 [3:45:14<2:52:05, 5.82s/it] {'loss': 0.4661, 'learning_rate': 2.0819845567244868e-06, 'epoch': 0.8} + 80%|███████▉ | 6975/8750 [3:45:16<2:52:04, 5.82s/it] {'loss': 0.4661, 'learning_rate': 2.0819845567244868e-06, 'epoch': 0.8} + 80%|███████▉ | 6975/8750 [3:45:14<2:52:05, 5.82s/it] 80%|███████▉ | 6976/8750 [3:45:19<2:51:06, 5.79s/it] 80%|███████▉ | 6976/8750 
[3:45:22<2:51:07, 5.79s/it] {'loss': 0.4539, 'learning_rate': 2.079724209546683e-06, 'epoch': 0.8} + 80%|███████▉ | 6976/8750 [3:45:22<2:51:07, 5.79s/it] {'loss': 0.4539, 'learning_rate': 2.079724209546683e-06, 'epoch': 0.8} + 80%|███████▉ | 6976/8750 [3:45:19<2:51:06, 5.79s/it] 80%|███████▉ | 6977/8750 [3:45:25<2:50:27, 5.77s/it] 80%|███████▉ | 6977/8750 [3:45:27<2:50:27, 5.77s/it] {'loss': 0.4373, 'learning_rate': 2.077464947623492e-06, 'epoch': 0.8} + 80%|███████▉ | 6977/8750 [3:45:27<2:50:27, 5.77s/it] {'loss': 0.4373, 'learning_rate': 2.077464947623492e-06, 'epoch': 0.8} + 80%|███████▉ | 6977/8750 [3:45:25<2:50:27, 5.77s/it] 80%|███████▉ | 6978/8750 [3:45:33<2:50:12, 5.76s/it] 80%|███████▉ | 6978/8750 [3:45:31<2:50:13, 5.76s/it] {'loss': 0.4411, 'learning_rate': 2.0752067712644807e-06, 'epoch': 0.8} + 80%|███████▉ | 6978/8750 [3:45:33<2:50:12, 5.76s/it] {'loss': 0.4411, 'learning_rate': 2.0752067712644807e-06, 'epoch': 0.8} + 80%|███████▉ | 6978/8750 [3:45:31<2:50:13, 5.76s/it] 80%|███████▉ | 6979/8750 [3:45:39<2:50:23, 5.77s/it] 80%|███████▉ | 6979/8750 [3:45:37<2:50:23, 5.77s/it] {'loss': 0.4557, 'learning_rate': 2.0729496807790737e-06, 'epoch': 0.8} + 80%|███████▉ | 6979/8750 [3:45:39<2:50:23, 5.77s/it] {'loss': 0.4557, 'learning_rate': 2.0729496807790737e-06, 'epoch': 0.8} + 80%|███████▉ | 6979/8750 [3:45:37<2:50:23, 5.77s/it] 80%|███████▉ | 6980/8750 [3:45:44<2:48:23, 5.71s/it] 80%|███████▉ | 6980/8750 [3:45:42<2:48:23, 5.71s/it] {'loss': 0.4516, 'learning_rate': 2.0706936764765393e-06, 'epoch': 0.8} + 80%|███████▉ | 6980/8750 [3:45:44<2:48:23, 5.71s/it] {'loss': 0.4516, 'learning_rate': 2.0706936764765393e-06, 'epoch': 0.8} + 80%|███████▉ | 6980/8750 [3:45:42<2:48:23, 5.71s/it] 80%|███████▉ | 6981/8750 [3:45:50<2:48:25, 5.71s/it] 80%|███████▉ | 6981/8750 [3:45:48<2:48:26, 5.71s/it] {'loss': 0.4538, 'learning_rate': 2.0684387586660027e-06, 'epoch': 0.8} + 80%|███████▉ | 6981/8750 [3:45:50<2:48:25, 5.71s/it] {'loss': 0.4538, 'learning_rate': 
2.0684387586660027e-06, 'epoch': 0.8} + 80%|███████▉ | 6981/8750 [3:45:48<2:48:26, 5.71s/it] 80%|███████▉ | 6982/8750 [3:45:56<2:47:19, 5.68s/it] 80%|███████▉ | 6982/8750 [3:45:53<2:47:20, 5.68s/it] {'loss': 0.4527, 'learning_rate': 2.0661849276564394e-06, 'epoch': 0.8} + 80%|███████▉ | 6982/8750 [3:45:56<2:47:19, 5.68s/it] {'loss': 0.4527, 'learning_rate': 2.0661849276564394e-06, 'epoch': 0.8} + 80%|███████▉ | 6982/8750 [3:45:53<2:47:20, 5.68s/it] 80%|███████▉ | 6983/8750 [3:45:59<2:47:00, 5.67s/it] 80%|███████▉ | 6983/8750 [3:46:01<2:47:01, 5.67s/it] {'loss': 0.4482, 'learning_rate': 2.0639321837566696e-06, 'epoch': 0.8} + 80%|███████▉ | 6983/8750 [3:46:01<2:47:01, 5.67s/it] {'loss': 0.4482, 'learning_rate': 2.0639321837566696e-06, 'epoch': 0.8} + 80%|███████▉ | 6983/8750 [3:45:59<2:47:00, 5.67s/it] 80%|███████▉ | 6984/8750 [3:46:05<2:46:33, 5.66s/it] 80%|███████▉ | 6984/8750 [3:46:07<2:46:34, 5.66s/it] {'loss': 0.4652, 'learning_rate': 2.0616805272753758e-06, 'epoch': 0.8} + 80%|███████▉ | 6984/8750 [3:46:07<2:46:34, 5.66s/it] {'loss': 0.4652, 'learning_rate': 2.0616805272753758e-06, 'epoch': 0.8} + 80%|███████▉ | 6984/8750 [3:46:05<2:46:33, 5.66s/it] 80%|███████▉ | 6985/8750 [3:46:13<2:49:07, 5.75s/it] 80%|███████▉ | 6985/8750 [3:46:11<2:49:07, 5.75s/it] {'loss': 0.4302, 'learning_rate': 2.0594299585210796e-06, 'epoch': 0.8} + 80%|███████▉ | 6985/8750 [3:46:13<2:49:07, 5.75s/it] {'loss': 0.4302, 'learning_rate': 2.0594299585210796e-06, 'epoch': 0.8} + 80%|███████▉ | 6985/8750 [3:46:11<2:49:07, 5.75s/it] 80%|███████▉ | 6986/8750 [3:46:19<2:47:45, 5.71s/it] 80%|███████▉ | 6986/8750 [3:46:16<2:47:45, 5.71s/it] {'loss': 0.4535, 'learning_rate': 2.057180477802164e-06, 'epoch': 0.8} + 80%|███████▉ | 6986/8750 [3:46:19<2:47:45, 5.71s/it] {'loss': 0.4535, 'learning_rate': 2.057180477802164e-06, 'epoch': 0.8} + 80%|███████▉ | 6986/8750 [3:46:16<2:47:45, 5.71s/it] 80%|███████▉ | 6987/8750 [3:46:22<2:47:19, 5.69s/it] 80%|███████▉ | 6987/8750 [3:46:24<2:47:19, 5.69s/it] 
{'loss': 0.4474, 'learning_rate': 2.054932085426856e-06, 'epoch': 0.8} + {'loss': 0.4474, 'learning_rate': 2.054932085426856e-06, 'epoch': 0.8} 80%|███████▉ | 6987/8750 [3:46:24<2:47:19, 5.69s/it] + 80%|███████▉ | 6987/8750 [3:46:22<2:47:19, 5.69s/it] 80%|███████▉ | 6988/8750 [3:46:30<2:47:58, 5.72s/it] 80%|███████▉ | 6988/8750 [3:46:28<2:47:58, 5.72s/it] {'loss': 0.4389, 'learning_rate': 2.0526847817032326e-06, 'epoch': 0.8} + 80%|███████▉ | 6988/8750 [3:46:30<2:47:58, 5.72s/it] {'loss': 0.4389, 'learning_rate': 2.0526847817032326e-06, 'epoch': 0.8} + 80%|███████▉ | 6988/8750 [3:46:28<2:47:58, 5.72s/it] 80%|███████▉ | 6989/8750 [3:46:36<2:47:16, 5.70s/it] 80%|███████▉ | 6989/8750 [3:46:33<2:47:17, 5.70s/it] {'loss': 0.4523, 'learning_rate': 2.0504385669392268e-06, 'epoch': 0.8} + 80%|███████▉ | 6989/8750 [3:46:36<2:47:16, 5.70s/it] {'loss': 0.4523, 'learning_rate': 2.0504385669392268e-06, 'epoch': 0.8} + 80%|███████▉ | 6989/8750 [3:46:33<2:47:17, 5.70s/it] 80%|███████▉ | 6990/8750 [3:46:41<2:46:55, 5.69s/it] 80%|███████▉ | 6990/8750 [3:46:39<2:46:55, 5.69s/it] {'loss': 0.4341, 'learning_rate': 2.048193441442623e-06, 'epoch': 0.8} + 80%|███████▉ | 6990/8750 [3:46:41<2:46:55, 5.69s/it] {'loss': 0.4341, 'learning_rate': 2.048193441442623e-06, 'epoch': 0.8} + 80%|███████▉ | 6990/8750 [3:46:39<2:46:55, 5.69s/it] 80%|███████▉ | 6991/8750 [3:46:47<2:46:23, 5.68s/it] 80%|███████▉ | 6991/8750 [3:46:45<2:46:23, 5.68s/it] {'loss': 0.462, 'learning_rate': 2.0459494055210495e-06, 'epoch': 0.8} + 80%|███████▉ | 6991/8750 [3:46:47<2:46:23, 5.68s/it] {'loss': 0.462, 'learning_rate': 2.0459494055210495e-06, 'epoch': 0.8} + 80%|███████▉ | 6991/8750 [3:46:45<2:46:23, 5.68s/it] 80%|███████▉ | 6992/8750 [3:46:53<2:45:17, 5.64s/it] 80%|███████▉ | 6992/8750 [3:46:50<2:45:17, 5.64s/it] {'loss': 0.4544, 'learning_rate': 2.043706459481992e-06, 'epoch': 0.8} + 80%|███████▉ | 6992/8750 [3:46:53<2:45:17, 5.64s/it] {'loss': 0.4544, 'learning_rate': 2.043706459481992e-06, 'epoch': 0.8} + 
80%|███████▉ | 6992/8750 [3:46:50<2:45:17, 5.64s/it] 80%|███████▉ | 6993/8750 [3:46:58<2:45:15, 5.64s/it] 80%|███████▉ | 6993/8750 [3:46:56<2:45:15, 5.64s/it] {'loss': 0.4574, 'learning_rate': 2.0414646036327813e-06, 'epoch': 0.8} + 80%|███████▉ | 6993/8750 [3:46:58<2:45:15, 5.64s/it] {'loss': 0.4574, 'learning_rate': 2.0414646036327813e-06, 'epoch': 0.8} + 80%|███████▉ | 6993/8750 [3:46:56<2:45:15, 5.64s/it] 80%|███████▉ | 6994/8750 [3:47:04<2:47:37, 5.73s/it] 80%|███████▉ | 6994/8750 [3:47:02<2:47:37, 5.73s/it] {'loss': 0.4469, 'learning_rate': 2.039223838280606e-06, 'epoch': 0.8} + {'loss': 0.4469, 'learning_rate': 2.039223838280606e-06, 'epoch': 0.8} 80%|███████▉ | 6994/8750 [3:47:04<2:47:37, 5.73s/it] + 80%|███████▉ | 6994/8750 [3:47:02<2:47:37, 5.73s/it] 80%|███████▉ | 6995/8750 [3:47:10<2:46:50, 5.70s/it] 80%|███████▉ | 6995/8750 [3:47:08<2:46:49, 5.70s/it] {'loss': 0.4479, 'learning_rate': 2.0369841637324992e-06, 'epoch': 0.8} + 80%|███████▉ | 6995/8750 [3:47:10<2:46:50, 5.70s/it] {'loss': 0.4479, 'learning_rate': 2.0369841637324992e-06, 'epoch': 0.8} + 80%|███████▉ | 6995/8750 [3:47:08<2:46:49, 5.70s/it] 80%|███████▉ | 6996/8750 [3:47:16<2:47:14, 5.72s/it] 80%|███████▉ | 6996/8750 [3:47:13<2:47:14, 5.72s/it] {'loss': 0.4353, 'learning_rate': 2.034745580295342e-06, 'epoch': 0.8} + 80%|███████▉ | 6996/8750 [3:47:16<2:47:14, 5.72s/it] {'loss': 0.4353, 'learning_rate': 2.034745580295342e-06, 'epoch': 0.8} + 80%|███████▉ | 6996/8750 [3:47:13<2:47:14, 5.72s/it] 80%|███████▉ | 6997/8750 [3:47:21<2:46:07, 5.69s/it] 80%|███████▉ | 6997/8750 [3:47:19<2:46:07, 5.69s/it] {'loss': 0.4502, 'learning_rate': 2.0325080882758775e-06, 'epoch': 0.8} + 80%|███████▉ | 6997/8750 [3:47:21<2:46:07, 5.69s/it] {'loss': 0.4502, 'learning_rate': 2.0325080882758775e-06, 'epoch': 0.8} + 80%|███████▉ | 6997/8750 [3:47:19<2:46:07, 5.69s/it] 80%|███████▉ | 6998/8750 [3:47:27<2:46:23, 5.70s/it] 80%|███████▉ | 6998/8750 [3:47:25<2:46:23, 5.70s/it] {'loss': 0.4471, 'learning_rate': 
2.030271687980685e-06, 'epoch': 0.8} + 80%|███████▉ | 6998/8750 [3:47:27<2:46:23, 5.70s/it] {'loss': 0.4471, 'learning_rate': 2.030271687980685e-06, 'epoch': 0.8} + 80%|███████▉ | 6998/8750 [3:47:25<2:46:23, 5.70s/it] 80%|███████▉ | 6999/8750 [3:47:33<2:45:41, 5.68s/it] 80%|███████▉ | 6999/8750 [3:47:30<2:45:41, 5.68s/it] {'loss': 0.457, 'learning_rate': 2.028036379716205e-06, 'epoch': 0.8} + 80%|███████▉ | 6999/8750 [3:47:33<2:45:41, 5.68s/it] {'loss': 0.457, 'learning_rate': 2.028036379716205e-06, 'epoch': 0.8} + 80%|███████▉ | 6999/8750 [3:47:30<2:45:41, 5.68s/it]9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... + 80%|████████ | 7000/8750 [3:47:38<2:47:48, 5.75s/it]2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 80%|████████ | 7000/8750 [3:47:36<2:47:48, 5.75s/it]3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4307, 'learning_rate': 2.025802163788727e-06, 'epoch': 0.8} + 80%|████████ | 7000/8750 [3:47:36<2:47:48, 5.75s/it] {'loss': 0.4307, 'learning_rate': 2.025802163788727e-06, 'epoch': 0.8} + 80%|████████ | 7000/8750 [3:47:38<2:47:48, 5.75s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 80%|████████ | 7001/8750 [3:48:00<5:07:44, 10.56s/it] 80%|████████ | 7001/8750 [3:47:58<5:07:44, 10.56s/it] {'loss': 0.4597, 'learning_rate': 2.023569040504384e-06, 'epoch': 0.8} + 80%|████████ | 7001/8750 [3:48:00<5:07:44, 10.56s/it] {'loss': 0.4597, 'learning_rate': 2.023569040504384e-06, 'epoch': 0.8} + 80%|████████ | 7001/8750 [3:47:58<5:07:44, 10.56s/it] 80%|████████ | 7002/8750 [3:48:06<4:25:48, 9.12s/it] {'loss': 0.4453, 'learning_rate': 2.0213370101691675e-06, 'epoch': 0.8} + 80%|████████ | 7002/8750 [3:48:06<4:25:48, 9.12s/it] 80%|████████ | 7002/8750 [3:48:04<4:25:48, 9.12s/it] {'loss': 0.4453, 'learning_rate': 2.0213370101691675e-06, 'epoch': 0.8} + 80%|████████ | 7002/8750 [3:48:04<4:25:48, 9.12s/it] 80%|████████ | 7003/8750 [3:48:12<3:56:49, 8.13s/it] 80%|████████ | 7003/8750 [3:48:10<3:56:49, 8.13s/it] {'loss': 0.4399, 'learning_rate': 2.0191060730889132e-06, 'epoch': 0.8} + 80%|████████ | 7003/8750 [3:48:12<3:56:49, 8.13s/it] {'loss': 0.4399, 'learning_rate': 2.0191060730889132e-06, 'epoch': 0.8} + 80%|████████ | 7003/8750 [3:48:10<3:56:49, 8.13s/it] 80%|████████ | 7004/8750 [3:48:15<3:36:20, 7.43s/it] 80%|████████ | 7004/8750 [3:48:18<3:36:20, 7.43s/it] {'loss': 0.4464, 'learning_rate': 2.016876229569308e-06, 'epoch': 0.8} + 80%|████████ | 7004/8750 [3:48:18<3:36:20, 7.43s/it] {'loss': 0.4464, 'learning_rate': 2.016876229569308e-06, 'epoch': 0.8} + 80%|████████ | 7004/8750 [3:48:15<3:36:20, 7.43s/it] 80%|████████ | 7005/8750 [3:48:23<3:20:48, 6.90s/it] 80%|████████ | 7005/8750 [3:48:21<3:20:48, 6.90s/it] {'loss': 0.4651, 'learning_rate': 2.0146474799158935e-06, 'epoch': 0.8} + 80%|████████ | 7005/8750 [3:48:23<3:20:48, 6.90s/it] {'loss': 0.4651, 'learning_rate': 2.0146474799158935e-06, 'epoch': 0.8} + 80%|████████ | 7005/8750 [3:48:21<3:20:48, 6.90s/it] 80%|████████ | 7006/8750 [3:48:29<3:09:41, 6.53s/it] 80%|████████ | 7006/8750 [3:48:27<3:09:41, 6.53s/it] {'loss': 0.4465, 'learning_rate': 2.0124198244340543e-06, 'epoch': 0.8} + 
80%|████████ | 7006/8750 [3:48:29<3:09:41, 6.53s/it] {'loss': 0.4465, 'learning_rate': 2.0124198244340543e-06, 'epoch': 0.8} + 80%|████████ | 7006/8750 [3:48:27<3:09:41, 6.53s/it] 80%|████████ | 7007/8750 [3:48:34<3:00:44, 6.22s/it] 80%|████████ | 7007/8750 [3:48:32<3:00:44, 6.22s/it] {'loss': 0.4695, 'learning_rate': 2.0101932634290345e-06, 'epoch': 0.8} + 80%|████████ | 7007/8750 [3:48:34<3:00:44, 6.22s/it] {'loss': 0.4695, 'learning_rate': 2.0101932634290345e-06, 'epoch': 0.8} + 80%|████████ | 7007/8750 [3:48:32<3:00:44, 6.22s/it] 80%|████████ | 7008/8750 [3:48:40<2:56:21, 6.07s/it] 80%|████████ | 7008/8750 [3:48:38<2:56:21, 6.07s/it]{'loss': 0.4512, 'learning_rate': 2.0079677972059163e-06, 'epoch': 0.8} + 80%|████████ | 7008/8750 [3:48:40<2:56:21, 6.07s/it] {'loss': 0.4512, 'learning_rate': 2.0079677972059163e-06, 'epoch': 0.8} + 80%|████████ | 7008/8750 [3:48:38<2:56:21, 6.07s/it] 80%|████████ | 7009/8750 [3:48:46<2:53:27, 5.98s/it] 80%|████████ | 7009/8750 [3:48:44<2:53:27, 5.98s/it] {'loss': 0.4398, 'learning_rate': 2.005743426069641e-06, 'epoch': 0.8} + 80%|████████ | 7009/8750 [3:48:46<2:53:27, 5.98s/it] {'loss': 0.4398, 'learning_rate': 2.005743426069641e-06, 'epoch': 0.8} + 80%|████████ | 7009/8750 [3:48:44<2:53:27, 5.98s/it] 80%|████████ | 7010/8750 [3:48:49<2:50:01, 5.86s/it] 80%|████████ | 7010/8750 [3:48:52<2:50:01, 5.86s/it] {'loss': 0.4535, 'learning_rate': 2.003520150325e-06, 'epoch': 0.8} + 80%|████████ | 7010/8750 [3:48:52<2:50:01, 5.86s/it] {'loss': 0.4535, 'learning_rate': 2.003520150325e-06, 'epoch': 0.8} + 80%|████████ | 7010/8750 [3:48:49<2:50:01, 5.86s/it] 80%|████████ | 7011/8750 [3:48:57<2:48:13, 5.80s/it] 80%|████████ | 7011/8750 [3:48:55<2:48:14, 5.80s/it] {'loss': 0.449, 'learning_rate': 2.0012979702766277e-06, 'epoch': 0.8} + 80%|████████ | 7011/8750 [3:48:57<2:48:13, 5.80s/it] {'loss': 0.449, 'learning_rate': 2.0012979702766277e-06, 'epoch': 0.8} + 80%|████████ | 7011/8750 [3:48:55<2:48:14, 5.80s/it] 80%|████████ | 7012/8750 
[3:49:03<2:47:12, 5.77s/it] 80%|████████ | 7012/8750 [3:49:01<2:47:11, 5.77s/it] {'loss': 0.4226, 'learning_rate': 1.9990768862290155e-06, 'epoch': 0.8} + 80%|████████ | 7012/8750 [3:49:03<2:47:12, 5.77s/it] {'loss': 0.4226, 'learning_rate': 1.9990768862290155e-06, 'epoch': 0.8} + 80%|████████ | 7012/8750 [3:49:01<2:47:11, 5.77s/it] 80%|████████ | 7013/8750 [3:49:08<2:44:21, 5.68s/it] 80%|████████ | 7013/8750 [3:49:06<2:44:21, 5.68s/it] {'loss': 0.4787, 'learning_rate': 1.9968568984865e-06, 'epoch': 0.8} + 80%|████████ | 7013/8750 [3:49:08<2:44:21, 5.68s/it] {'loss': 0.4787, 'learning_rate': 1.9968568984865e-06, 'epoch': 0.8} + 80%|████████ | 7013/8750 [3:49:06<2:44:21, 5.68s/it] 80%|████████ | 7014/8750 [3:49:14<2:44:19, 5.68s/it] 80%|████████ | 7014/8750 [3:49:12<2:44:19, 5.68s/it] {'loss': 0.4357, 'learning_rate': 1.9946380073532668e-06, 'epoch': 0.8} + 80%|████████ | 7014/8750 [3:49:14<2:44:19, 5.68s/it] {'loss': 0.4357, 'learning_rate': 1.9946380073532668e-06, 'epoch': 0.8} + 80%|████████ | 7014/8750 [3:49:12<2:44:19, 5.68s/it] 80%|████████ | 7015/8750 [3:49:20<2:43:59, 5.67s/it] 80%|████████ | 7015/8750 [3:49:17<2:43:59, 5.67s/it] {'loss': 0.4803, 'learning_rate': 1.992420213133357e-06, 'epoch': 0.8} + 80%|████████ | 7015/8750 [3:49:17<2:43:59, 5.67s/it]{'loss': 0.4803, 'learning_rate': 1.992420213133357e-06, 'epoch': 0.8} + 80%|████████ | 7015/8750 [3:49:20<2:43:59, 5.67s/it] 80%|████████ | 7016/8750 [3:49:26<2:46:29, 5.76s/it] 80%|████████ | 7016/8750 [3:49:23<2:46:29, 5.76s/it] {'loss': 0.4418, 'learning_rate': 1.9902035161306574e-06, 'epoch': 0.8} + 80%|████████ | 7016/8750 [3:49:26<2:46:29, 5.76s/it] {'loss': 0.4418, 'learning_rate': 1.9902035161306574e-06, 'epoch': 0.8} + 80%|████████ | 7016/8750 [3:49:23<2:46:29, 5.76s/it] 80%|████████ | 7017/8750 [3:49:31<2:45:07, 5.72s/it] 80%|████████ | 7017/8750 [3:49:29<2:45:07, 5.72s/it] {'loss': 0.4524, 'learning_rate': 1.9879879166489023e-06, 'epoch': 0.8} + {'loss': 0.4524, 'learning_rate': 
1.9879879166489023e-06, 'epoch': 0.8} 80%|████████ | 7017/8750 [3:49:31<2:45:07, 5.72s/it] + 80%|████████ | 7017/8750 [3:49:29<2:45:07, 5.72s/it] 80%|████████ | 7018/8750 [3:49:37<2:44:27, 5.70s/it] 80%|████████ | 7018/8750 [3:49:35<2:44:27, 5.70s/it] {'loss': 0.4358, 'learning_rate': 1.9857734149916787e-06, 'epoch': 0.8} + 80%|████████ | 7018/8750 [3:49:37<2:44:27, 5.70s/it] {'loss': 0.4358, 'learning_rate': 1.9857734149916787e-06, 'epoch': 0.8} + 80%|████████ | 7018/8750 [3:49:35<2:44:27, 5.70s/it] 80%|████████ | 7019/8750 [3:49:43<2:43:40, 5.67s/it] 80%|████████ | 7019/8750 [3:49:40<2:43:40, 5.67s/it] {'loss': 0.4723, 'learning_rate': 1.983560011462425e-06, 'epoch': 0.8} + 80%|████████ | 7019/8750 [3:49:43<2:43:40, 5.67s/it] {'loss': 0.4723, 'learning_rate': 1.983560011462425e-06, 'epoch': 0.8} + 80%|████████ | 7019/8750 [3:49:40<2:43:40, 5.67s/it] 80%|████████ | 7020/8750 [3:49:49<2:47:35, 5.81s/it] 80%|████████ | 7020/8750 [3:49:46<2:47:35, 5.81s/it] {'loss': 0.4411, 'learning_rate': 1.981347706364429e-06, 'epoch': 0.8} + 80%|████████ | 7020/8750 [3:49:49<2:47:35, 5.81s/it] {'loss': 0.4411, 'learning_rate': 1.981347706364429e-06, 'epoch': 0.8} + 80%|████████ | 7020/8750 [3:49:46<2:47:35, 5.81s/it] 80%|████████ | 7021/8750 [3:49:54<2:46:37, 5.78s/it] 80%|████████ | 7021/8750 [3:49:52<2:46:37, 5.78s/it] {'loss': 0.4501, 'learning_rate': 1.979136500000822e-06, 'epoch': 0.8} + 80%|████████ | 7021/8750 [3:49:54<2:46:37, 5.78s/it] {'loss': 0.4501, 'learning_rate': 1.979136500000822e-06, 'epoch': 0.8} + 80%|████████ | 7021/8750 [3:49:52<2:46:37, 5.78s/it] 80%|████████ | 7022/8750 [3:50:00<2:45:24, 5.74s/it] 80%|████████ | 7022/8750 [3:49:58<2:45:24, 5.74s/it] {'loss': 0.4283, 'learning_rate': 1.9769263926745886e-06, 'epoch': 0.8} + 80%|████████ | 7022/8750 [3:50:00<2:45:24, 5.74s/it] {'loss': 0.4283, 'learning_rate': 1.9769263926745886e-06, 'epoch': 0.8} + 80%|████████ | 7022/8750 [3:49:58<2:45:24, 5.74s/it] 80%|████████ | 7023/8750 [3:50:06<2:47:06, 5.81s/it] 
{'loss': 0.4435, 'learning_rate': 1.974717384688566e-06, 'epoch': 0.8} + 80%|████████ | 7023/8750 [3:50:06<2:47:06, 5.81s/it] 80%|████████ | 7023/8750 [3:50:04<2:47:06, 5.81s/it] {'loss': 0.4435, 'learning_rate': 1.974717384688566e-06, 'epoch': 0.8} + 80%|████████ | 7023/8750 [3:50:04<2:47:06, 5.81s/it] 80%|████████ | 7024/8750 [3:50:12<2:47:44, 5.83s/it] 80%|████████ | 7024/8750 [3:50:10<2:47:44, 5.83s/it] {'loss': 0.4534, 'learning_rate': 1.972509476345432e-06, 'epoch': 0.8} + 80%|████████ | 7024/8750 [3:50:12<2:47:44, 5.83s/it] {'loss': 0.4534, 'learning_rate': 1.972509476345432e-06, 'epoch': 0.8} + 80%|████████ | 7024/8750 [3:50:10<2:47:44, 5.83s/it] 80%|████████ | 7025/8750 [3:50:17<2:45:35, 5.76s/it] 80%|████████ | 7025/8750 [3:50:15<2:45:35, 5.76s/it] {'loss': 0.4596, 'learning_rate': 1.9703026679477253e-06, 'epoch': 0.8} + 80%|████████ | 7025/8750 [3:50:17<2:45:35, 5.76s/it] {'loss': 0.4596, 'learning_rate': 1.9703026679477253e-06, 'epoch': 0.8} + 80%|████████ | 7025/8750 [3:50:15<2:45:35, 5.76s/it] 80%|████████ | 7026/8750 [3:50:23<2:46:04, 5.78s/it] 80%|████████ | 7026/8750 [3:50:21<2:46:04, 5.78s/it] {'loss': 0.4401, 'learning_rate': 1.968096959797827e-06, 'epoch': 0.8} + 80%|████████ | 7026/8750 [3:50:23<2:46:04, 5.78s/it] {'loss': 0.4401, 'learning_rate': 1.968096959797827e-06, 'epoch': 0.8} + 80%|████████ | 7026/8750 [3:50:21<2:46:04, 5.78s/it] 80%|████████ | 7027/8750 [3:50:29<2:47:51, 5.85s/it] 80%|████████ | 7027/8750 [3:50:27<2:47:51, 5.85s/it]{'loss': 0.4563, 'learning_rate': 1.9658923521979633e-06, 'epoch': 0.8} + {'loss': 0.4563, 'learning_rate': 1.9658923521979633e-06, 'epoch': 0.8} + 80%|████████ | 7027/8750 [3:50:29<2:47:51, 5.85s/it] 80%|████████ | 7027/8750 [3:50:27<2:47:51, 5.85s/it] 80%|████████ | 7028/8750 [3:50:35<2:45:58, 5.78s/it] 80%|████████ | 7028/8750 [3:50:33<2:45:58, 5.78s/it] {'loss': 0.4607, 'learning_rate': 1.963688845450218e-06, 'epoch': 0.8} + 80%|████████ | 7028/8750 [3:50:35<2:45:58, 5.78s/it] {'loss': 0.4607, 
'learning_rate': 1.963688845450218e-06, 'epoch': 0.8} + 80%|████████ | 7028/8750 [3:50:33<2:45:58, 5.78s/it] 80%|████████ | 7029/8750 [3:50:41<2:45:15, 5.76s/it] 80%|████████ | 7029/8750 [3:50:38<2:45:15, 5.76s/it] {'loss': 0.434, 'learning_rate': 1.9614864398565212e-06, 'epoch': 0.8} + 80%|████████ | 7029/8750 [3:50:38<2:45:15, 5.76s/it]{'loss': 0.434, 'learning_rate': 1.9614864398565212e-06, 'epoch': 0.8} + 80%|████████ | 7029/8750 [3:50:41<2:45:15, 5.76s/it] 80%|████████ | 7030/8750 [3:50:46<2:44:42, 5.75s/it] 80%|████████ | 7030/8750 [3:50:44<2:44:42, 5.75s/it] {'loss': 0.4564, 'learning_rate': 1.9592851357186537e-06, 'epoch': 0.8} + 80%|████████ | 7030/8750 [3:50:46<2:44:42, 5.75s/it] {'loss': 0.4564, 'learning_rate': 1.9592851357186537e-06, 'epoch': 0.8} + 80%|████████ | 7030/8750 [3:50:44<2:44:42, 5.75s/it] 80%|████████ | 7031/8750 [3:50:52<2:43:55, 5.72s/it] 80%|████████ | 7031/8750 [3:50:50<2:43:54, 5.72s/it] {'loss': 0.4644, 'learning_rate': 1.957084933338241e-06, 'epoch': 0.8} + 80%|████████ | 7031/8750 [3:50:52<2:43:55, 5.72s/it] {'loss': 0.4644, 'learning_rate': 1.957084933338241e-06, 'epoch': 0.8} + 80%|████████ | 7031/8750 [3:50:50<2:43:54, 5.72s/it] 80%|████████ | 7032/8750 [3:50:58<2:42:37, 5.68s/it] 80%|████████ | 7032/8750 [3:50:55<2:42:37, 5.68s/it] {'loss': 0.4615, 'learning_rate': 1.9548858330167584e-06, 'epoch': 0.8} + 80%|████████ | 7032/8750 [3:50:58<2:42:37, 5.68s/it] {'loss': 0.4615, 'learning_rate': 1.9548858330167584e-06, 'epoch': 0.8} + 80%|████████ | 7032/8750 [3:50:55<2:42:37, 5.68s/it] 80%|████████ | 7033/8750 [3:51:03<2:41:48, 5.65s/it] 80%|████████ | 7033/8750 [3:51:01<2:41:47, 5.65s/it] {'loss': 0.4764, 'learning_rate': 1.9526878350555344e-06, 'epoch': 0.8} + 80%|████████ | 7033/8750 [3:51:03<2:41:48, 5.65s/it] {'loss': 0.4764, 'learning_rate': 1.9526878350555344e-06, 'epoch': 0.8} + 80%|████████ | 7033/8750 [3:51:01<2:41:47, 5.65s/it] 80%|████████ | 7034/8750 [3:51:09<2:42:21, 5.68s/it] 80%|████████ | 7034/8750 [3:51:07<2:42:20, 
5.68s/it] {'loss': 0.4279, 'learning_rate': 1.9504909397557436e-06, 'epoch': 0.8} + 80%|████████ | 7034/8750 [3:51:07<2:42:20, 5.68s/it] {'loss': 0.4279, 'learning_rate': 1.9504909397557436e-06, 'epoch': 0.8} + 80%|████████ | 7034/8750 [3:51:09<2:42:21, 5.68s/it] 80%|████████ | 7035/8750 [3:51:15<2:41:31, 5.65s/it] 80%|████████ | 7035/8750 [3:51:12<2:41:31, 5.65s/it] {'loss': 0.4497, 'learning_rate': 1.9482951474184054e-06, 'epoch': 0.8} + 80%|████████ | 7035/8750 [3:51:15<2:41:31, 5.65s/it] {'loss': 0.4497, 'learning_rate': 1.9482951474184054e-06, 'epoch': 0.8} + 80%|████████ | 7035/8750 [3:51:12<2:41:31, 5.65s/it] 80%|████████ | 7036/8750 [3:51:20<2:41:39, 5.66s/it] 80%|████████ | 7036/8750 [3:51:18<2:41:39, 5.66s/it] {'loss': 0.4471, 'learning_rate': 1.9461004583443986e-06, 'epoch': 0.8} + 80%|████████ | 7036/8750 [3:51:20<2:41:39, 5.66s/it] {'loss': 0.4471, 'learning_rate': 1.9461004583443986e-06, 'epoch': 0.8} + 80%|████████ | 7036/8750 [3:51:18<2:41:39, 5.66s/it] 80%|████████ | 7037/8750 [3:51:26<2:42:02, 5.68s/it] 80%|████████ | 7037/8750 [3:51:24<2:42:02, 5.68s/it] {'loss': 0.4558, 'learning_rate': 1.94390687283444e-06, 'epoch': 0.8} + 80%|████████ | 7037/8750 [3:51:26<2:42:02, 5.68s/it] {'loss': 0.4558, 'learning_rate': 1.94390687283444e-06, 'epoch': 0.8} + 80%|████████ | 7037/8750 [3:51:24<2:42:02, 5.68s/it] 80%|████████ | 7038/8750 [3:51:32<2:42:30, 5.70s/it] 80%|████████ | 7038/8750 [3:51:29<2:42:31, 5.70s/it] {'loss': 0.4435, 'learning_rate': 1.9417143911891003e-06, 'epoch': 0.8} + 80%|████████ | 7038/8750 [3:51:32<2:42:30, 5.70s/it] {'loss': 0.4435, 'learning_rate': 1.9417143911891003e-06, 'epoch': 0.8} + 80%|████████ | 7038/8750 [3:51:29<2:42:31, 5.70s/it] 80%|████████ | 7039/8750 [3:51:37<2:42:25, 5.70s/it] 80%|████████ | 7039/8750 [3:51:35<2:42:25, 5.70s/it] {'loss': 0.4533, 'learning_rate': 1.939523013708803e-06, 'epoch': 0.8} + 80%|████████ | 7039/8750 [3:51:37<2:42:25, 5.70s/it] {'loss': 0.4533, 'learning_rate': 1.939523013708803e-06, 'epoch': 
0.8} + 80%|████████ | 7039/8750 [3:51:35<2:42:25, 5.70s/it] 80%|████████ | 7040/8750 [3:51:43<2:41:58, 5.68s/it] 80%|████████ | 7040/8750 [3:51:41<2:41:58, 5.68s/it] {'loss': 0.4468, 'learning_rate': 1.937332740693809e-06, 'epoch': 0.8} + 80%|████████ | 7040/8750 [3:51:43<2:41:58, 5.68s/it] {'loss': 0.4468, 'learning_rate': 1.937332740693809e-06, 'epoch': 0.8} + 80%|████████ | 7040/8750 [3:51:41<2:41:58, 5.68s/it] 80%|████████ | 7041/8750 [3:51:49<2:41:58, 5.69s/it] 80%|████████ | 7041/8750 [3:51:46<2:41:58, 5.69s/it] {'loss': 0.4671, 'learning_rate': 1.9351435724442412e-06, 'epoch': 0.8} + 80%|████████ | 7041/8750 [3:51:49<2:41:58, 5.69s/it] {'loss': 0.4671, 'learning_rate': 1.9351435724442412e-06, 'epoch': 0.8} + 80%|████████ | 7041/8750 [3:51:46<2:41:58, 5.69s/it] 80%|████████ | 7042/8750 [3:51:54<2:42:27, 5.71s/it] 80%|████████ | 7042/8750 [3:51:52<2:42:27, 5.71s/it] {'loss': 0.4538, 'learning_rate': 1.9329555092600593e-06, 'epoch': 0.8} + 80%|████████ | 7042/8750 [3:51:54<2:42:27, 5.71s/it] {'loss': 0.4538, 'learning_rate': 1.9329555092600593e-06, 'epoch': 0.8} + 80%|████████ | 7042/8750 [3:51:52<2:42:27, 5.71s/it] 80%|████████ | 7043/8750 [3:52:00<2:41:59, 5.69s/it] 80%|████████ | 7043/8750 [3:51:58<2:41:59, 5.69s/it] {'loss': 0.4458, 'learning_rate': 1.9307685514410803e-06, 'epoch': 0.8} + 80%|████████ | 7043/8750 [3:52:00<2:41:59, 5.69s/it] {'loss': 0.4458, 'learning_rate': 1.9307685514410803e-06, 'epoch': 0.8} + 80%|████████ | 7043/8750 [3:51:58<2:41:59, 5.69s/it] 81%|████████ | 7044/8750 [3:52:06<2:45:54, 5.84s/it] 81%|████████ | 7044/8750 [3:52:04<2:45:54, 5.84s/it] {'loss': 0.4467, 'learning_rate': 1.928582699286965e-06, 'epoch': 0.81} + 81%|████████ | 7044/8750 [3:52:06<2:45:54, 5.84s/it] {'loss': 0.4467, 'learning_rate': 1.928582699286965e-06, 'epoch': 0.81} + 81%|████████ | 7044/8750 [3:52:04<2:45:54, 5.84s/it] 81%|████████ | 7045/8750 [3:52:12<2:44:05, 5.77s/it] 81%|████████ | 7045/8750 [3:52:10<2:44:05, 5.77s/it] {'loss': 0.4473, 'learning_rate': 
1.926397953097222e-06, 'epoch': 0.81} + 81%|████████ | 7045/8750 [3:52:12<2:44:05, 5.77s/it] {'loss': 0.4473, 'learning_rate': 1.926397953097222e-06, 'epoch': 0.81} + 81%|████████ | 7045/8750 [3:52:10<2:44:05, 5.77s/it] 81%|████████ | 7046/8750 [3:52:18<2:44:41, 5.80s/it] 81%|████████ | 7046/8750 [3:52:16<2:44:41, 5.80s/it] {'loss': 0.4304, 'learning_rate': 1.924214313171211e-06, 'epoch': 0.81} + 81%|████████ | 7046/8750 [3:52:18<2:44:41, 5.80s/it] {'loss': 0.4304, 'learning_rate': 1.924214313171211e-06, 'epoch': 0.81} + 81%|████████ | 7046/8750 [3:52:16<2:44:41, 5.80s/it] 81%|████████ | 7047/8750 [3:52:23<2:43:34, 5.76s/it] 81%|████████ | 7047/8750 [3:52:21<2:43:33, 5.76s/it] {'loss': 0.4559, 'learning_rate': 1.9220317798081433e-06, 'epoch': 0.81} + 81%|████████ | 7047/8750 [3:52:23<2:43:34, 5.76s/it] {'loss': 0.4559, 'learning_rate': 1.9220317798081433e-06, 'epoch': 0.81} + 81%|████████ | 7047/8750 [3:52:21<2:43:33, 5.76s/it] 81%|████████ | 7048/8750 [3:52:29<2:43:01, 5.75s/it] 81%|████████ | 7048/8750 [3:52:27<2:43:00, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.9198503533070688e-06, 'epoch': 0.81} + 81%|████████ | 7048/8750 [3:52:29<2:43:01, 5.75s/it] {'loss': 0.4668, 'learning_rate': 1.9198503533070688e-06, 'epoch': 0.81} + 81%|████████ | 7048/8750 [3:52:27<2:43:00, 5.75s/it] 81%|████████ | 7049/8750 [3:52:35<2:41:41, 5.70s/it] 81%|████████ | 7049/8750 [3:52:32<2:41:40, 5.70s/it] {'loss': 0.4588, 'learning_rate': 1.9176700339668986e-06, 'epoch': 0.81} + 81%|████████ | 7049/8750 [3:52:35<2:41:41, 5.70s/it] {'loss': 0.4588, 'learning_rate': 1.9176700339668986e-06, 'epoch': 0.81} + 81%|████████ | 7049/8750 [3:52:32<2:41:40, 5.70s/it]89 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 81%|████████ | 7050/8750 [3:52:41<2:42:01, 5.72s/it]14 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+15 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +011 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 81%|████████ | 7050/8750 [3:52:38<2:42:02, 5.72s/it] {'loss': 0.447, 'learning_rate': 1.9154908220863775e-06, 'epoch': 0.81} + 81%|████████ | 7050/8750 [3:52:41<2:42:01, 5.72s/it] {'loss': 0.447, 'learning_rate': 1.9154908220863775e-06, 'epoch': 0.81} + 81%|████████ | 7050/8750 [3:52:38<2:42:02, 5.72s/it] 81%|████████ | 7051/8750 [3:52:46<2:40:51, 5.68s/it] 81%|████████ | 7051/8750 [3:52:44<2:40:51, 5.68s/it] {'loss': 0.4615, 'learning_rate': 1.913312717964113e-06, 'epoch': 0.81} + {'loss': 0.4615, 'learning_rate': 1.913312717964113e-06, 'epoch': 0.81} 81%|████████ | 7051/8750 [3:52:46<2:40:51, 5.68s/it] + 81%|████████ | 7051/8750 [3:52:44<2:40:51, 5.68s/it] 81%|████████ | 7052/8750 [3:52:52<2:40:06, 5.66s/it] 81%|████████ | 7052/8750 [3:52:49<2:40:06, 5.66s/it] {'loss': 0.4483, 'learning_rate': 1.9111357218985504e-06, 'epoch': 0.81} + 81%|████████ | 7052/8750 [3:52:52<2:40:06, 5.66s/it] {'loss': 0.4483, 'learning_rate': 1.9111357218985504e-06, 'epoch': 0.81} + 81%|████████ | 7052/8750 [3:52:49<2:40:06, 5.66s/it] 81%|████████ | 7053/8750 [3:52:57<2:40:47, 5.69s/it] 81%|████████ | 7053/8750 [3:52:55<2:40:47, 5.69s/it] {'loss': 0.4496, 'learning_rate': 1.9089598341879855e-06, 'epoch': 0.81} + 81%|████████ | 7053/8750 [3:52:57<2:40:47, 5.69s/it] {'loss': 0.4496, 'learning_rate': 1.9089598341879855e-06, 'epoch': 0.81} + 81%|████████ | 7053/8750 [3:52:55<2:40:47, 5.69s/it] 81%|████████ | 7054/8750 [3:53:03<2:41:41, 5.72s/it] 
81%|████████ | 7054/8750 [3:53:01<2:41:41, 5.72s/it] {'loss': 0.4503, 'learning_rate': 1.9067850551305678e-06, 'epoch': 0.81} + 81%|████████ | 7054/8750 [3:53:03<2:41:41, 5.72s/it] {'loss': 0.4503, 'learning_rate': 1.9067850551305678e-06, 'epoch': 0.81} + 81%|████████ | 7054/8750 [3:53:01<2:41:41, 5.72s/it] 81%|████████ | 7055/8750 [3:53:09<2:41:05, 5.70s/it] 81%|████████ | 7055/8750 [3:53:07<2:41:05, 5.70s/it] {'loss': 0.45, 'learning_rate': 1.9046113850242843e-06, 'epoch': 0.81} + 81%|████████ | 7055/8750 [3:53:09<2:41:05, 5.70s/it] {'loss': 0.45, 'learning_rate': 1.9046113850242843e-06, 'epoch': 0.81} + 81%|████████ | 7055/8750 [3:53:07<2:41:05, 5.70s/it] 81%|████████ | 7056/8750 [3:53:15<2:42:48, 5.77s/it] 81%|████████ | 7056/8750 [3:53:13<2:42:48, 5.77s/it] {'loss': 0.4505, 'learning_rate': 1.9024388241669811e-06, 'epoch': 0.81} + 81%|████████ | 7056/8750 [3:53:15<2:42:48, 5.77s/it] {'loss': 0.4505, 'learning_rate': 1.9024388241669811e-06, 'epoch': 0.81} + 81%|████████ | 7056/8750 [3:53:13<2:42:48, 5.77s/it] 81%|████████ | 7057/8750 [3:53:21<2:42:08, 5.75s/it] 81%|████████ | 7057/8750 [3:53:18<2:42:08, 5.75s/it] {'loss': 0.4639, 'learning_rate': 1.900267372856348e-06, 'epoch': 0.81} + 81%|████████ | 7057/8750 [3:53:21<2:42:08, 5.75s/it] {'loss': 0.4639, 'learning_rate': 1.900267372856348e-06, 'epoch': 0.81} + 81%|████████ | 7057/8750 [3:53:18<2:42:08, 5.75s/it] 81%|████████ | 7058/8750 [3:53:27<2:43:58, 5.81s/it] 81%|████████ | 7058/8750 [3:53:24<2:43:58, 5.81s/it] {'loss': 0.4497, 'learning_rate': 1.8980970313899193e-06, 'epoch': 0.81} + 81%|████████ | 7058/8750 [3:53:27<2:43:58, 5.81s/it] {'loss': 0.4497, 'learning_rate': 1.8980970313899193e-06, 'epoch': 0.81} + 81%|████████ | 7058/8750 [3:53:24<2:43:58, 5.81s/it] 81%|████████ | 7059/8750 [3:53:32<2:42:15, 5.76s/it] 81%|████████ | 7059/8750 [3:53:30<2:42:15, 5.76s/it] {'loss': 0.449, 'learning_rate': 1.8959278000650839e-06, 'epoch': 0.81} + 81%|████████ | 7059/8750 [3:53:32<2:42:15, 5.76s/it] {'loss': 0.449, 
'learning_rate': 1.8959278000650839e-06, 'epoch': 0.81} + 81%|████████ | 7059/8750 [3:53:30<2:42:15, 5.76s/it] 81%|████████ | 7060/8750 [3:53:38<2:42:04, 5.75s/it] 81%|████████ | 7060/8750 [3:53:36<2:42:04, 5.75s/it] {'loss': 0.4499, 'learning_rate': 1.8937596791790735e-06, 'epoch': 0.81} + 81%|████████ | 7060/8750 [3:53:38<2:42:04, 5.75s/it] {'loss': 0.4499, 'learning_rate': 1.8937596791790735e-06, 'epoch': 0.81} + 81%|████████ | 7060/8750 [3:53:36<2:42:04, 5.75s/it] 81%|████████ | 7061/8750 [3:53:44<2:44:21, 5.84s/it] 81%|████████ | 7061/8750 [3:53:42<2:44:20, 5.84s/it] {'loss': 0.4336, 'learning_rate': 1.8915926690289643e-06, 'epoch': 0.81} + 81%|████████ | 7061/8750 [3:53:44<2:44:21, 5.84s/it] {'loss': 0.4336, 'learning_rate': 1.8915926690289643e-06, 'epoch': 0.81} + 81%|████████ | 7061/8750 [3:53:42<2:44:20, 5.84s/it] 81%|████████ | 7062/8750 [3:53:50<2:42:53, 5.79s/it] 81%|████████ | 7062/8750 [3:53:47<2:42:53, 5.79s/it] {'loss': 0.4468, 'learning_rate': 1.889426769911693e-06, 'epoch': 0.81} + 81%|████████ | 7062/8750 [3:53:50<2:42:53, 5.79s/it] {'loss': 0.4468, 'learning_rate': 1.889426769911693e-06, 'epoch': 0.81} + 81%|████████ | 7062/8750 [3:53:47<2:42:53, 5.79s/it] 81%|████████ | 7063/8750 [3:53:55<2:41:26, 5.74s/it] 81%|████████ | 7063/8750 [3:53:53<2:41:26, 5.74s/it] {'loss': 0.439, 'learning_rate': 1.887261982124029e-06, 'epoch': 0.81} + 81%|████████ | 7063/8750 [3:53:55<2:41:26, 5.74s/it] {'loss': 0.439, 'learning_rate': 1.887261982124029e-06, 'epoch': 0.81} + 81%|████████ | 7063/8750 [3:53:53<2:41:26, 5.74s/it] 81%|████████ | 7064/8750 [3:54:01<2:40:32, 5.71s/it] 81%|████████ | 7064/8750 [3:53:59<2:40:32, 5.71s/it] {'loss': 0.4477, 'learning_rate': 1.8850983059626026e-06, 'epoch': 0.81} + 81%|████████ | 7064/8750 [3:54:01<2:40:32, 5.71s/it] {'loss': 0.4477, 'learning_rate': 1.8850983059626026e-06, 'epoch': 0.81} + 81%|████████ | 7064/8750 [3:53:59<2:40:32, 5.71s/it] 81%|████████ | 7065/8750 [3:54:06<2:38:47, 5.65s/it] 81%|████████ | 7065/8750 
[3:54:04<2:38:47, 5.65s/it] {'loss': 0.4556, 'learning_rate': 1.8829357417238802e-06, 'epoch': 0.81} + 81%|████████ | 7065/8750 [3:54:06<2:38:47, 5.65s/it] {'loss': 0.4556, 'learning_rate': 1.8829357417238802e-06, 'epoch': 0.81} + 81%|████████ | 7065/8750 [3:54:04<2:38:47, 5.65s/it] 81%|████████ | 7066/8750 [3:54:12<2:39:49, 5.69s/it] 81%|████████ | 7066/8750 [3:54:10<2:39:49, 5.69s/it] {'loss': 0.4657, 'learning_rate': 1.8807742897041847e-06, 'epoch': 0.81} + 81%|████████ | 7066/8750 [3:54:12<2:39:49, 5.69s/it] {'loss': 0.4657, 'learning_rate': 1.8807742897041847e-06, 'epoch': 0.81} + 81%|████████ | 7066/8750 [3:54:10<2:39:49, 5.69s/it] 81%|████████ | 7067/8750 [3:54:19<2:45:28, 5.90s/it] {'loss': 0.4486, 'learning_rate': 1.8786139501996847e-06, 'epoch': 0.81} + 81%|████████ | 7067/8750 [3:54:19<2:45:28, 5.90s/it] 81%|████████ | 7067/8750 [3:54:16<2:45:28, 5.90s/it] {'loss': 0.4486, 'learning_rate': 1.8786139501996847e-06, 'epoch': 0.81} + 81%|████████ | 7067/8750 [3:54:16<2:45:28, 5.90s/it] 81%|████████ | 7068/8750 [3:54:24<2:43:03, 5.82s/it] 81%|████████ | 7068/8750 [3:54:22<2:43:03, 5.82s/it] {'loss': 0.4762, 'learning_rate': 1.8764547235063912e-06, 'epoch': 0.81} + 81%|████████ | 7068/8750 [3:54:24<2:43:03, 5.82s/it] {'loss': 0.4762, 'learning_rate': 1.8764547235063912e-06, 'epoch': 0.81} + 81%|████████ | 7068/8750 [3:54:22<2:43:03, 5.82s/it] 81%|████████ | 7069/8750 [3:54:30<2:40:41, 5.74s/it] 81%|████████ | 7069/8750 [3:54:27<2:40:41, 5.74s/it] {'loss': 0.4661, 'learning_rate': 1.8742966099201699e-06, 'epoch': 0.81} + 81%|████████ | 7069/8750 [3:54:30<2:40:41, 5.74s/it] {'loss': 0.4661, 'learning_rate': 1.8742966099201699e-06, 'epoch': 0.81} + 81%|████████ | 7069/8750 [3:54:28<2:40:41, 5.74s/it] 81%|████████ | 7070/8750 [3:54:36<2:42:26, 5.80s/it] 81%|████████ | 7070/8750 [3:54:33<2:42:26, 5.80s/it] {'loss': 0.4482, 'learning_rate': 1.8721396097367294e-06, 'epoch': 0.81} + 81%|████████ | 7070/8750 [3:54:36<2:42:26, 5.80s/it] {'loss': 0.4482, 'learning_rate': 
1.8721396097367294e-06, 'epoch': 0.81} + 81%|████████ | 7070/8750 [3:54:33<2:42:26, 5.80s/it] 81%|████████ | 7071/8750 [3:54:41<2:41:57, 5.79s/it] 81%|████████ | 7071/8750 [3:54:39<2:41:57, 5.79s/it] {'loss': 0.4428, 'learning_rate': 1.8699837232516226e-06, 'epoch': 0.81} + 81%|████████ | 7071/8750 [3:54:41<2:41:57, 5.79s/it] {'loss': 0.4428, 'learning_rate': 1.8699837232516226e-06, 'epoch': 0.81} + 81%|████████ | 7071/8750 [3:54:39<2:41:57, 5.79s/it] 81%|████████ | 7072/8750 [3:54:47<2:41:36, 5.78s/it] 81%|████████ | 7072/8750 [3:54:45<2:41:37, 5.78s/it] {'loss': 0.4457, 'learning_rate': 1.867828950760262e-06, 'epoch': 0.81} + 81%|████████ | 7072/8750 [3:54:47<2:41:36, 5.78s/it] {'loss': 0.4457, 'learning_rate': 1.867828950760262e-06, 'epoch': 0.81} + 81%|████████ | 7072/8750 [3:54:45<2:41:37, 5.78s/it] 81%|████████ | 7073/8750 [3:54:53<2:43:27, 5.85s/it] 81%|████████ | 7073/8750 [3:54:51<2:43:27, 5.85s/it] {'loss': 0.4608, 'learning_rate': 1.8656752925578948e-06, 'epoch': 0.81} + 81%|████████ | 7073/8750 [3:54:53<2:43:27, 5.85s/it] {'loss': 0.4608, 'learning_rate': 1.8656752925578948e-06, 'epoch': 0.81} + 81%|████████ | 7073/8750 [3:54:51<2:43:27, 5.85s/it] 81%|████████ | 7074/8750 [3:54:57<2:41:37, 5.79s/it] {'loss': 0.458, 'learning_rate': 1.8635227489396178e-06, 'epoch': 0.81} + 81%|████████ | 7074/8750 [3:54:57<2:41:37, 5.79s/it] 81%|████████ | 7074/8750 [3:54:59<2:41:37, 5.79s/it] {'loss': 0.458, 'learning_rate': 1.8635227489396178e-06, 'epoch': 0.81} + 81%|████████ | 7074/8750 [3:54:59<2:41:37, 5.79s/it] 81%|████████ | 7075/8750 [3:55:05<2:41:16, 5.78s/it] 81%|████████ | 7075/8750 [3:55:02<2:41:15, 5.78s/it] {'loss': 0.4516, 'learning_rate': 1.8613713202003813e-06, 'epoch': 0.81} + 81%|████████ | 7075/8750 [3:55:05<2:41:16, 5.78s/it] {'loss': 0.4516, 'learning_rate': 1.8613713202003813e-06, 'epoch': 0.81} + 81%|████████ | 7075/8750 [3:55:02<2:41:15, 5.78s/it] 81%|████████ | 7076/8750 [3:55:10<2:41:30, 5.79s/it] 81%|████████ | 7076/8750 [3:55:08<2:41:30, 
5.79s/it] {'loss': 0.4511, 'learning_rate': 1.8592210066349781e-06, 'epoch': 0.81} + 81%|████████ | 7076/8750 [3:55:10<2:41:30, 5.79s/it] {'loss': 0.4511, 'learning_rate': 1.8592210066349781e-06, 'epoch': 0.81} + 81%|████████ | 7076/8750 [3:55:08<2:41:30, 5.79s/it] 81%|████████ | 7077/8750 [3:55:16<2:40:37, 5.76s/it] 81%|████████ | 7077/8750 [3:55:14<2:40:37, 5.76s/it] {'loss': 0.4566, 'learning_rate': 1.8570718085380512e-06, 'epoch': 0.81} + 81%|████████ | 7077/8750 [3:55:16<2:40:37, 5.76s/it] {'loss': 0.4566, 'learning_rate': 1.8570718085380512e-06, 'epoch': 0.81} + 81%|████████ | 7077/8750 [3:55:14<2:40:37, 5.76s/it] 81%|████████ | 7078/8750 [3:55:22<2:41:35, 5.80s/it] 81%|████████ | 7078/8750 [3:55:20<2:41:35, 5.80s/it] {'loss': 0.441, 'learning_rate': 1.8549237262040876e-06, 'epoch': 0.81} + 81%|████████ | 7078/8750 [3:55:20<2:41:35, 5.80s/it]{'loss': 0.441, 'learning_rate': 1.8549237262040876e-06, 'epoch': 0.81} + 81%|████████ | 7078/8750 [3:55:22<2:41:35, 5.80s/it] 81%|████████ | 7079/8750 [3:55:28<2:40:44, 5.77s/it] 81%|████████ | 7079/8750 [3:55:25<2:40:44, 5.77s/it] {'loss': 0.4488, 'learning_rate': 1.8527767599274193e-06, 'epoch': 0.81} + 81%|████████ | 7079/8750 [3:55:28<2:40:44, 5.77s/it] {'loss': 0.4488, 'learning_rate': 1.8527767599274193e-06, 'epoch': 0.81} + 81%|████████ | 7079/8750 [3:55:25<2:40:44, 5.77s/it] 81%|████████ | 7080/8750 [3:55:34<2:42:07, 5.82s/it] 81%|████████ | 7080/8750 [3:55:31<2:42:07, 5.82s/it] {'loss': 0.4426, 'learning_rate': 1.8506309100022334e-06, 'epoch': 0.81} + 81%|████████ | 7080/8750 [3:55:34<2:42:07, 5.82s/it] {'loss': 0.4426, 'learning_rate': 1.8506309100022334e-06, 'epoch': 0.81} + 81%|████████ | 7080/8750 [3:55:31<2:42:07, 5.82s/it] 81%|████████ | 7081/8750 [3:55:40<2:43:40, 5.88s/it] 81%|████████ | 7081/8750 [3:55:37<2:43:40, 5.88s/it] {'loss': 0.445, 'learning_rate': 1.8484861767225549e-06, 'epoch': 0.81} + 81%|████████ | 7081/8750 [3:55:40<2:43:40, 5.88s/it] {'loss': 0.445, 'learning_rate': 
1.8484861767225549e-06, 'epoch': 0.81} + 81%|████████ | 7081/8750 [3:55:37<2:43:40, 5.88s/it] 81%|████████ | 7082/8750 [3:55:45<2:40:57, 5.79s/it] 81%|████████ | 7082/8750 [3:55:43<2:40:57, 5.79s/it] {'loss': 0.4534, 'learning_rate': 1.846342560382265e-06, 'epoch': 0.81} + 81%|████████ | 7082/8750 [3:55:45<2:40:57, 5.79s/it] {'loss': 0.4534, 'learning_rate': 1.846342560382265e-06, 'epoch': 0.81} + 81%|████████ | 7082/8750 [3:55:43<2:40:57, 5.79s/it] 81%|████████ | 7083/8750 [3:55:51<2:38:45, 5.71s/it] 81%|████████ | 7083/8750 [3:55:49<2:38:45, 5.71s/it] {'loss': 0.4501, 'learning_rate': 1.8442000612750832e-06, 'epoch': 0.81} + 81%|████████ | 7083/8750 [3:55:51<2:38:45, 5.71s/it] {'loss': 0.4501, 'learning_rate': 1.8442000612750832e-06, 'epoch': 0.81} + 81%|████████ | 7083/8750 [3:55:49<2:38:45, 5.71s/it] 81%|████████ | 7084/8750 [3:55:57<2:38:51, 5.72s/it] 81%|████████ | 7084/8750 [3:55:54<2:38:50, 5.72s/it] {'loss': 0.4448, 'learning_rate': 1.8420586796945793e-06, 'epoch': 0.81} + 81%|████████ | 7084/8750 [3:55:57<2:38:51, 5.72s/it] {'loss': 0.4448, 'learning_rate': 1.8420586796945793e-06, 'epoch': 0.81} + 81%|████████ | 7084/8750 [3:55:54<2:38:50, 5.72s/it] 81%|████████ | 7085/8750 [3:56:02<2:37:02, 5.66s/it] 81%|████████ | 7085/8750 [3:56:00<2:37:02, 5.66s/it] {'loss': 0.4544, 'learning_rate': 1.839918415934171e-06, 'epoch': 0.81} + 81%|████████ | 7085/8750 [3:56:02<2:37:02, 5.66s/it] {'loss': 0.4544, 'learning_rate': 1.839918415934171e-06, 'epoch': 0.81} + 81%|████████ | 7085/8750 [3:56:00<2:37:02, 5.66s/it] 81%|████████ | 7086/8750 [3:56:08<2:36:35, 5.65s/it] 81%|████████ | 7086/8750 [3:56:05<2:36:34, 5.65s/it] {'loss': 0.4682, 'learning_rate': 1.8377792702871266e-06, 'epoch': 0.81} + 81%|████████ | 7086/8750 [3:56:08<2:36:35, 5.65s/it] {'loss': 0.4682, 'learning_rate': 1.8377792702871266e-06, 'epoch': 0.81} + 81%|████████ | 7086/8750 [3:56:05<2:36:34, 5.65s/it] 81%|████████ | 7087/8750 [3:56:14<2:40:27, 5.79s/it] 81%|████████ | 7087/8750 [3:56:12<2:40:27, 
5.79s/it]{'loss': 0.4346, 'learning_rate': 1.8356412430465498e-06, 'epoch': 0.81} + 81%|████████ | 7087/8750 [3:56:14<2:40:27, 5.79s/it] {'loss': 0.4346, 'learning_rate': 1.8356412430465498e-06, 'epoch': 0.81} + 81%|████████ | 7087/8750 [3:56:12<2:40:27, 5.79s/it] 81%|████████ | 7088/8750 [3:56:19<2:38:58, 5.74s/it] 81%|████████ | 7088/8750 [3:56:17<2:38:58, 5.74s/it] {'loss': 0.4719, 'learning_rate': 1.8335043345054048e-06, 'epoch': 0.81} + 81%|████████ | 7088/8750 [3:56:19<2:38:58, 5.74s/it] {'loss': 0.4719, 'learning_rate': 1.8335043345054048e-06, 'epoch': 0.81} + 81%|████████ | 7088/8750 [3:56:17<2:38:58, 5.74s/it] 81%|████████ | 7089/8750 [3:56:25<2:40:44, 5.81s/it] 81%|████████ | 7089/8750 [3:56:23<2:40:44, 5.81s/it] {'loss': 0.4325, 'learning_rate': 1.83136854495649e-06, 'epoch': 0.81} + 81%|████████ | 7089/8750 [3:56:25<2:40:44, 5.81s/it] {'loss': 0.4325, 'learning_rate': 1.83136854495649e-06, 'epoch': 0.81} + 81%|████████ | 7089/8750 [3:56:23<2:40:44, 5.81s/it] 81%|████████ | 7090/8750 [3:56:31<2:39:43, 5.77s/it] 81%|████████ | 7090/8750 [3:56:29<2:39:43, 5.77s/it] {'loss': 0.4662, 'learning_rate': 1.829233874692461e-06, 'epoch': 0.81} + 81%|████████ | 7090/8750 [3:56:31<2:39:43, 5.77s/it] {'loss': 0.4662, 'learning_rate': 1.829233874692461e-06, 'epoch': 0.81} + 81%|████████ | 7090/8750 [3:56:29<2:39:43, 5.77s/it] 81%|████████ | 7091/8750 [3:56:35<2:39:12, 5.76s/it] 81%|████████ | 7091/8750 [3:56:37<2:39:12, 5.76s/it] {'loss': 0.4618, 'learning_rate': 1.8271003240058127e-06, 'epoch': 0.81} + 81%|████████ | 7091/8750 [3:56:35<2:39:12, 5.76s/it] {'loss': 0.4618, 'learning_rate': 1.8271003240058127e-06, 'epoch': 0.81} + 81%|████████ | 7091/8750 [3:56:37<2:39:12, 5.76s/it] 81%|████████ | 7092/8750 [3:56:42<2:38:03, 5.72s/it] 81%|████████ | 7092/8750 [3:56:40<2:38:03, 5.72s/it] {'loss': 0.4364, 'learning_rate': 1.8249678931888881e-06, 'epoch': 0.81} + 81%|████████ | 7092/8750 [3:56:42<2:38:03, 5.72s/it] {'loss': 0.4364, 'learning_rate': 1.8249678931888881e-06, 
'epoch': 0.81} + 81%|████████ | 7092/8750 [3:56:40<2:38:03, 5.72s/it] 81%|████████ | 7093/8750 [3:56:48<2:37:02, 5.69s/it] 81%|████████ | 7093/8750 [3:56:46<2:37:02, 5.69s/it] {'loss': 0.4451, 'learning_rate': 1.8228365825338811e-06, 'epoch': 0.81} + 81%|████████ | 7093/8750 [3:56:48<2:37:02, 5.69s/it] {'loss': 0.4451, 'learning_rate': 1.8228365825338811e-06, 'epoch': 0.81} + 81%|████████ | 7093/8750 [3:56:46<2:37:02, 5.69s/it] 81%|████████ | 7094/8750 [3:56:54<2:39:20, 5.77s/it] 81%|████████ | 7094/8750 [3:56:52<2:39:20, 5.77s/it] {'loss': 0.4629, 'learning_rate': 1.820706392332824e-06, 'epoch': 0.81} + 81%|████████ | 7094/8750 [3:56:54<2:39:20, 5.77s/it]{'loss': 0.4629, 'learning_rate': 1.820706392332824e-06, 'epoch': 0.81} + 81%|████████ | 7094/8750 [3:56:52<2:39:20, 5.77s/it] 81%|████████ | 7095/8750 [3:57:00<2:38:19, 5.74s/it] 81%|████████ | 7095/8750 [3:56:57<2:38:19, 5.74s/it] {'loss': 0.4462, 'learning_rate': 1.8185773228776038e-06, 'epoch': 0.81} + 81%|████████ | 7095/8750 [3:57:00<2:38:19, 5.74s/it] {'loss': 0.4462, 'learning_rate': 1.8185773228776038e-06, 'epoch': 0.81} + 81%|████████ | 7095/8750 [3:56:57<2:38:19, 5.74s/it] 81%|████████ | 7096/8750 [3:57:05<2:38:14, 5.74s/it] 81%|████████ | 7096/8750 [3:57:03<2:38:14, 5.74s/it] {'loss': 0.4569, 'learning_rate': 1.8164493744599531e-06, 'epoch': 0.81} + 81%|████████ | 7096/8750 [3:57:05<2:38:14, 5.74s/it] {'loss': 0.4569, 'learning_rate': 1.8164493744599531e-06, 'epoch': 0.81} + 81%|████████ | 7096/8750 [3:57:03<2:38:14, 5.74s/it] 81%|████████ | 7097/8750 [3:57:11<2:39:14, 5.78s/it] 81%|████████ | 7097/8750 [3:57:09<2:39:14, 5.78s/it] {'loss': 0.4359, 'learning_rate': 1.814322547371443e-06, 'epoch': 0.81} + 81%|████████ | 7097/8750 [3:57:11<2:39:14, 5.78s/it] {'loss': 0.4359, 'learning_rate': 1.814322547371443e-06, 'epoch': 0.81} + 81%|████████ | 7097/8750 [3:57:09<2:39:14, 5.78s/it] 81%|████████ | 7098/8750 [3:57:17<2:39:07, 5.78s/it] 81%|████████ | 7098/8750 [3:57:15<2:39:07, 5.78s/it] {'loss': 0.4566, 
'learning_rate': 1.8121968419035007e-06, 'epoch': 0.81} + 81%|████████ | 7098/8750 [3:57:17<2:39:07, 5.78s/it] {'loss': 0.4566, 'learning_rate': 1.8121968419035007e-06, 'epoch': 0.81} + 81%|████████ | 7098/8750 [3:57:15<2:39:07, 5.78s/it] 81%|████████ | 7099/8750 [3:57:23<2:39:04, 5.78s/it] 81%|████████ | 7099/8750 [3:57:21<2:39:04, 5.78s/it] {'loss': 0.4507, 'learning_rate': 1.810072258347394e-06, 'epoch': 0.81} + 81%|████████ | 7099/8750 [3:57:23<2:39:04, 5.78s/it] {'loss': 0.4507, 'learning_rate': 1.810072258347394e-06, 'epoch': 0.81} + 81%|████████ | 7099/8750 [3:57:21<2:39:04, 5.78s/it]5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 81%|████████ | 7100/8750 [3:57:29<2:38:52, 5.78s/it]1415 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +02 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 7 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 81%|████████ | 7100/8750 [3:57:26<2:38:52, 5.78s/it]6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4589, 'learning_rate': 1.8079487969942344e-06, 'epoch': 0.81} + 81%|████████ | 7100/8750 [3:57:29<2:38:52, 5.78s/it] {'loss': 0.4589, 'learning_rate': 1.8079487969942344e-06, 'epoch': 0.81} + 81%|████████ | 7100/8750 [3:57:26<2:38:52, 5.78s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + warnings.warn( +Apr 10 18:11:30.219711 3605270 slurmstepd 0x155550aed700: error: *** STEP 6724028.0 ON batch-block1-2097 CANCELLED AT 2025-04-10T18:11:30 DUE TO TIME LIMIT *** +srun: Job step aborted: Waiting up to 122 seconds for job step to finish. +srun: error: batch-block1-2097: task 0: Terminated +srun: Terminating StepId=6724028.0 +srun: error: batch-block1-10017: task 1: Terminated +srun: job 6726214 queued and waiting for resources +srun: job 6726214 has been allocated resources +wandb: Currently logged in as: memmelma. 
Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-10017 +JobID: 6726214 | Full list: batch-block1-10017 batch-block1-2008 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-10017 +JobID: 6726214 | Full list: batch-block1-10017 batch-block1-2008 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! 
+[2025-04-10 18:15:39,887] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,896] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,907] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,936] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,952] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,952] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,962] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:39,991] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:40,138] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,138] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,138] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,139] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,153] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,153] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,184] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,184] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,192] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,192] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,194] [WARNING] 
[comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,194] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,202] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,202] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:40,233] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:40,233] [INFO] [comm.py:594:init_distributed] cdb=None +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +[2025-04-10 18:15:51,280] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,280] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,280] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,280] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:51,281] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented 
+[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 18:15:52,454] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 18:15:52,454] [INFO] [comm.py:594:init_distributed] cdb=None +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. 
+You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. 
Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. +[2025-04-10 18:16:01,075] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 2.70B parameters + Loading checkpoint shards: 0%| | 0/2 [00:00 4096). Running this sequence through the model will result in indexing errors + 90%|████████▉ | 7845/8750 [1:22:56<1:26:35, 5.74s/it] 90%|████████▉ | 7845/8750 [1:22:58<1:26:35, 5.74s/it] {'loss': 0.4489, 'learning_rate': 5.558954903190483e-07, 'epoch': 0.9} + 90%|████████▉ | 7845/8750 [1:22:58<1:26:35, 5.74s/it] {'loss': 0.4489, 'learning_rate': 5.558954903190483e-07, 'epoch': 0.9} + 90%|████████▉ | 7845/8750 [1:22:56<1:26:35, 5.74s/it] 90%|████████▉ | 7846/8750 [1:23:04<1:27:54, 5.83s/it] 90%|████████▉ | 7846/8750 [1:23:02<1:27:54, 5.83s/it] {'loss': 0.4557, 'learning_rate': 5.546791495881887e-07, 'epoch': 0.9} + 90%|████████▉ | 7846/8750 [1:23:04<1:27:54, 5.83s/it] {'loss': 0.4557, 'learning_rate': 5.546791495881887e-07, 'epoch': 0.9} + 90%|████████▉ | 7846/8750 [1:23:02<1:27:54, 5.83s/it] 90%|████████▉ | 7847/8750 [1:23:10<1:28:05, 5.85s/it] 90%|████████▉ | 7847/8750 [1:23:08<1:28:05, 5.85s/it] {'loss': 0.4598, 'learning_rate': 5.534641030771615e-07, 'epoch': 0.9} + 90%|████████▉ | 7847/8750 [1:23:10<1:28:05, 5.85s/it] {'loss': 0.4598, 'learning_rate': 5.534641030771615e-07, 'epoch': 0.9} + 90%|████████▉ | 7847/8750 [1:23:08<1:28:05, 5.85s/it] 90%|████████▉ | 7848/8750 [1:23:16<1:26:46, 5.77s/it] 90%|████████▉ | 7848/8750 [1:23:14<1:26:46, 5.77s/it] {'loss': 0.4401, 'learning_rate': 5.522503509524591e-07, 'epoch': 0.9} + 90%|████████▉ | 7848/8750 [1:23:16<1:26:46, 5.77s/it] {'loss': 0.4401, 'learning_rate': 5.522503509524591e-07, 'epoch': 0.9} + 90%|████████▉ | 7848/8750 [1:23:14<1:26:46, 5.77s/it] 90%|████████▉ | 7849/8750 
[1:23:19<1:25:42, 5.71s/it] 90%|████████▉ | 7849/8750 [1:23:21<1:25:42, 5.71s/it] {'loss': 0.4515, 'learning_rate': 5.510378933803895e-07, 'epoch': 0.9} + 90%|████████▉ | 7849/8750 [1:23:21<1:25:42, 5.71s/it] {'loss': 0.4515, 'learning_rate': 5.510378933803895e-07, 'epoch': 0.9} + 90%|████████▉ | 7849/8750 [1:23:19<1:25:42, 5.71s/it]1 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1311 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +0 90%|████████▉ | 7850/8750 [1:23:28<1:27:50, 5.86s/it]3 AutoResumeHook: Checking whether to suspend... + 4 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + 90%|████████▉ | 7850/8750 [1:23:26<1:27:50, 5.86s/it]2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4423, 'learning_rate': 5.498267305270888e-07, 'epoch': 0.9} + 90%|████████▉ | 7850/8750 [1:23:28<1:27:50, 5.86s/it] {'loss': 0.4423, 'learning_rate': 5.498267305270888e-07, 'epoch': 0.9} + 90%|████████▉ | 7850/8750 [1:23:26<1:27:50, 5.86s/it] 90%|████████▉ | 7851/8750 [1:23:31<1:26:32, 5.78s/it] 90%|████████▉ | 7851/8750 [1:23:33<1:26:32, 5.78s/it] {'loss': 0.4442, 'learning_rate': 5.48616862558512e-07, 'epoch': 0.9} + 90%|████████▉ | 7851/8750 [1:23:33<1:26:32, 5.78s/it] {'loss': 0.4442, 'learning_rate': 5.48616862558512e-07, 'epoch': 0.9} + 90%|████████▉ | 7851/8750 [1:23:31<1:26:32, 5.78s/it] 90%|████████▉ | 7852/8750 [1:23:39<1:25:31, 5.71s/it] 90%|████████▉ | 7852/8750 [1:23:37<1:25:31, 5.71s/it] {'loss': 0.4525, 'learning_rate': 5.474082896404365e-07, 'epoch': 0.9} + 90%|████████▉ | 7852/8750 [1:23:39<1:25:31, 5.71s/it] {'loss': 0.4525, 'learning_rate': 5.474082896404365e-07, 'epoch': 0.9} + 90%|████████▉ | 7852/8750 [1:23:37<1:25:31, 5.71s/it] 90%|████████▉ | 7853/8750 [1:23:43<1:25:28, 5.72s/it] 90%|████████▉ | 7853/8750 [1:23:44<1:25:28, 5.72s/it]{'loss': 0.4448, 'learning_rate': 5.462010119384665e-07, 'epoch': 0.9} + {'loss': 0.4448, 'learning_rate': 5.462010119384665e-07, 'epoch': 0.9} + 90%|████████▉ | 7853/8750 [1:23:44<1:25:28, 5.72s/it] 90%|████████▉ | 7853/8750 [1:23:43<1:25:28, 5.72s/it] 90%|████████▉ | 7854/8750 [1:23:50<1:25:34, 5.73s/it] 90%|████████▉ | 7854/8750 [1:23:48<1:25:34, 5.73s/it] {'loss': 0.4257, 'learning_rate': 5.44995029618024e-07, 'epoch': 0.9} + 90%|████████▉ | 7854/8750 [1:23:50<1:25:34, 5.73s/it] {'loss': 0.4257, 'learning_rate': 5.44995029618024e-07, 'epoch': 0.9} + 90%|████████▉ | 7854/8750 [1:23:48<1:25:34, 5.73s/it] 90%|████████▉ | 7855/8750 [1:23:56<1:24:51, 5.69s/it] 90%|████████▉ | 7855/8750 [1:23:54<1:24:51, 5.69s/it] {'loss': 0.4738, 'learning_rate': 5.43790342844358e-07, 'epoch': 0.9} + 90%|████████▉ | 7855/8750 [1:23:56<1:24:51, 5.69s/it] {'loss': 0.4738, 'learning_rate': 5.43790342844358e-07, 'epoch': 
0.9} + 90%|████████▉ | 7855/8750 [1:23:54<1:24:51, 5.69s/it] 90%|████████▉ | 7856/8750 [1:24:02<1:26:12, 5.79s/it] 90%|████████▉ | 7856/8750 [1:24:00<1:26:12, 5.79s/it] {'loss': 0.4501, 'learning_rate': 5.425869517825366e-07, 'epoch': 0.9} + 90%|████████▉ | 7856/8750 [1:24:02<1:26:12, 5.79s/it] {'loss': 0.4501, 'learning_rate': 5.425869517825366e-07, 'epoch': 0.9} + 90%|████████▉ | 7856/8750 [1:24:00<1:26:12, 5.79s/it] 90%|████████▉ | 7857/8750 [1:24:08<1:25:40, 5.76s/it] 90%|████████▉ | 7857/8750 [1:24:06<1:25:40, 5.76s/it] {'loss': 0.4749, 'learning_rate': 5.413848565974489e-07, 'epoch': 0.9} + 90%|████████▉ | 7857/8750 [1:24:08<1:25:40, 5.76s/it] {'loss': 0.4749, 'learning_rate': 5.413848565974489e-07, 'epoch': 0.9} + 90%|████████▉ | 7857/8750 [1:24:06<1:25:40, 5.76s/it] 90%|████████▉ | 7858/8750 [1:24:11<1:25:33, 5.76s/it] 90%|████████▉ | 7858/8750 [1:24:13<1:25:33, 5.76s/it] {'loss': 0.4564, 'learning_rate': 5.401840574538108e-07, 'epoch': 0.9} + {'loss': 0.4564, 'learning_rate': 5.401840574538108e-07, 'epoch': 0.9} 90%|████████▉ | 7858/8750 [1:24:13<1:25:33, 5.76s/it] + 90%|████████▉ | 7858/8750 [1:24:11<1:25:33, 5.76s/it] 90%|████████▉ | 7859/8750 [1:24:17<1:24:17, 5.68s/it] 90%|████████▉ | 7859/8750 [1:24:19<1:24:17, 5.68s/it] {'loss': 0.4435, 'learning_rate': 5.389845545161598e-07, 'epoch': 0.9} + {'loss': 0.4435, 'learning_rate': 5.389845545161598e-07, 'epoch': 0.9} 90%|████████▉ | 7859/8750 [1:24:19<1:24:17, 5.68s/it] + 90%|████████▉ | 7859/8750 [1:24:17<1:24:17, 5.68s/it] 90%|████████▉ | 7860/8750 [1:24:23<1:26:09, 5.81s/it] 90%|████████▉ | 7860/8750 [1:24:25<1:26:09, 5.81s/it] {'loss': 0.4443, 'learning_rate': 5.37786347948851e-07, 'epoch': 0.9} + {'loss': 0.4443, 'learning_rate': 5.37786347948851e-07, 'epoch': 0.9} 90%|████████▉ | 7860/8750 [1:24:25<1:26:09, 5.81s/it] + 90%|████████▉ | 7860/8750 [1:24:23<1:26:09, 5.81s/it] 90%|████████▉ | 7861/8750 [1:24:30<1:24:58, 5.74s/it] 90%|████████▉ | 7861/8750 [1:24:29<1:24:58, 5.74s/it] {'loss': 0.4699, 
'learning_rate': 5.365894379160686e-07, 'epoch': 0.9} + 90%|████████▉ | 7861/8750 [1:24:30<1:24:58, 5.74s/it] {'loss': 0.4699, 'learning_rate': 5.365894379160686e-07, 'epoch': 0.9} + 90%|████████▉ | 7861/8750 [1:24:29<1:24:58, 5.74s/it]/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/llava/model/llava_arch.py:397: UserWarning: Inputs truncated! + warnings.warn("Inputs truncated!") + 90%|████████▉ | 7862/8750 [1:24:37<1:26:28, 5.84s/it] 90%|████████▉ | 7862/8750 [1:24:35<1:26:28, 5.84s/it] {'loss': 0.4474, 'learning_rate': 5.353938245818147e-07, 'epoch': 0.9} + 90%|████████▉ | 7862/8750 [1:24:37<1:26:28, 5.84s/it] {'loss': 0.4474, 'learning_rate': 5.353938245818147e-07, 'epoch': 0.9} + 90%|████████▉ | 7862/8750 [1:24:35<1:26:28, 5.84s/it] 90%|████████▉ | 7863/8750 [1:24:42<1:26:19, 5.84s/it] 90%|████████▉ | 7863/8750 [1:24:40<1:26:19, 5.84s/it] {'loss': 0.4651, 'learning_rate': 5.341995081099139e-07, 'epoch': 0.9} + 90%|████████▉ | 7863/8750 [1:24:42<1:26:19, 5.84s/it] {'loss': 0.4651, 'learning_rate': 5.341995081099139e-07, 'epoch': 0.9} + 90%|████████▉ | 7863/8750 [1:24:40<1:26:19, 5.84s/it] 90%|████████▉ | 7864/8750 [1:24:48<1:26:14, 5.84s/it] 90%|████████▉ | 7864/8750 [1:24:46<1:26:14, 5.84s/it] {'loss': 0.4474, 'learning_rate': 5.330064886640173e-07, 'epoch': 0.9} + 90%|████████▉ | 7864/8750 [1:24:48<1:26:14, 5.84s/it] {'loss': 0.4474, 'learning_rate': 5.330064886640173e-07, 'epoch': 0.9} + 90%|████████▉ | 7864/8750 [1:24:46<1:26:14, 5.84s/it] 90%|████████▉ | 7865/8750 [1:24:54<1:25:12, 5.78s/it] 90%|████████▉ | 7865/8750 [1:24:52<1:25:12, 5.78s/it] {'loss': 0.4703, 'learning_rate': 5.318147664075923e-07, 'epoch': 0.9} + 90%|████████▉ | 7865/8750 [1:24:54<1:25:12, 5.78s/it] {'loss': 0.4703, 'learning_rate': 5.318147664075923e-07, 'epoch': 0.9} + 90%|████████▉ | 7865/8750 [1:24:52<1:25:12, 5.78s/it] 90%|████████▉ | 7866/8750 [1:24:57<1:24:00, 5.70s/it] 90%|████████▉ | 7866/8750 [1:24:59<1:24:00, 5.70s/it] {'loss': 0.452, 'learning_rate': 
5.306243415039336e-07, 'epoch': 0.9} + {'loss': 0.452, 'learning_rate': 5.306243415039336e-07, 'epoch': 0.9} + 90%|████████▉ | 7866/8750 [1:24:59<1:24:00, 5.70s/it] 90%|████████▉ | 7866/8750 [1:24:57<1:24:00, 5.70s/it] 90%|████████▉ | 7867/8750 [1:25:03<1:23:26, 5.67s/it] 90%|████████▉ | 7867/8750 [1:25:05<1:23:26, 5.67s/it] {'loss': 0.4484, 'learning_rate': 5.294352141161541e-07, 'epoch': 0.9} + 90%|████████▉ | 7867/8750 [1:25:05<1:23:26, 5.67s/it] {'loss': 0.4484, 'learning_rate': 5.294352141161541e-07, 'epoch': 0.9} + 90%|████████▉ | 7867/8750 [1:25:03<1:23:26, 5.67s/it] 90%|████████▉ | 7868/8750 [1:25:11<1:23:15, 5.66s/it] 90%|████████▉ | 7868/8750 [1:25:09<1:23:15, 5.66s/it] {'loss': 0.4377, 'learning_rate': 5.282473844071933e-07, 'epoch': 0.9} + 90%|████████▉ | 7868/8750 [1:25:11<1:23:15, 5.66s/it] {'loss': 0.4377, 'learning_rate': 5.282473844071933e-07, 'epoch': 0.9} + 90%|████████▉ | 7868/8750 [1:25:09<1:23:15, 5.66s/it] 90%|████████▉ | 7869/8750 [1:25:16<1:23:10, 5.66s/it] 90%|████████▉ | 7869/8750 [1:25:14<1:23:10, 5.67s/it] {'loss': 0.4763, 'learning_rate': 5.27060852539808e-07, 'epoch': 0.9} + 90%|████████▉ | 7869/8750 [1:25:16<1:23:10, 5.66s/it] {'loss': 0.4763, 'learning_rate': 5.27060852539808e-07, 'epoch': 0.9} + 90%|████████▉ | 7869/8750 [1:25:14<1:23:10, 5.67s/it] 90%|████████▉ | 7870/8750 [1:25:22<1:23:33, 5.70s/it] 90%|████████▉ | 7870/8750 [1:25:20<1:23:33, 5.70s/it] {'loss': 0.4567, 'learning_rate': 5.258756186765801e-07, 'epoch': 0.9} + {'loss': 0.4567, 'learning_rate': 5.258756186765801e-07, 'epoch': 0.9} 90%|████████▉ | 7870/8750 [1:25:22<1:23:33, 5.70s/it] + 90%|████████▉ | 7870/8750 [1:25:20<1:23:33, 5.70s/it] 90%|████████▉ | 7871/8750 [1:25:28<1:25:05, 5.81s/it] 90%|████████▉ | 7871/8750 [1:25:26<1:25:05, 5.81s/it] {'loss': 0.4356, 'learning_rate': 5.246916829799132e-07, 'epoch': 0.9} + 90%|████████▉ | 7871/8750 [1:25:28<1:25:05, 5.81s/it] {'loss': 0.4356, 'learning_rate': 5.246916829799132e-07, 'epoch': 0.9} + 90%|████████▉ | 7871/8750 
[1:25:26<1:25:05, 5.81s/it] 90%|████████▉ | 7872/8750 [1:25:34<1:25:25, 5.84s/it] 90%|████████▉ | 7872/8750 [1:25:32<1:25:25, 5.84s/it] {'loss': 0.4509, 'learning_rate': 5.235090456120329e-07, 'epoch': 0.9} + 90%|████████▉ | 7872/8750 [1:25:34<1:25:25, 5.84s/it] {'loss': 0.4509, 'learning_rate': 5.235090456120329e-07, 'epoch': 0.9} + 90%|████████▉ | 7872/8750 [1:25:32<1:25:25, 5.84s/it] 90%|████████▉ | 7873/8750 [1:25:40<1:24:58, 5.81s/it] 90%|████████▉ | 7873/8750 [1:25:38<1:24:58, 5.81s/it] {'loss': 0.4649, 'learning_rate': 5.223277067349864e-07, 'epoch': 0.9} + 90%|████████▉ | 7873/8750 [1:25:40<1:24:58, 5.81s/it] {'loss': 0.4649, 'learning_rate': 5.223277067349864e-07, 'epoch': 0.9} + 90%|████████▉ | 7873/8750 [1:25:38<1:24:58, 5.81s/it] 90%|████████▉ | 7874/8750 [1:25:44<1:25:35, 5.86s/it] 90%|████████▉ | 7874/8750 [1:25:46<1:25:36, 5.86s/it] {'loss': 0.4489, 'learning_rate': 5.211476665106463e-07, 'epoch': 0.9} + 90%|████████▉ | 7874/8750 [1:25:46<1:25:36, 5.86s/it] {'loss': 0.4489, 'learning_rate': 5.211476665106463e-07, 'epoch': 0.9} + 90%|████████▉ | 7874/8750 [1:25:44<1:25:35, 5.86s/it] 90%|█████████ | 7875/8750 [1:25:51<1:24:46, 5.81s/it] 90%|█████████ | 7875/8750 [1:25:50<1:24:46, 5.81s/it] {'loss': 0.4553, 'learning_rate': 5.199689251007001e-07, 'epoch': 0.9} + 90%|█████████ | 7875/8750 [1:25:51<1:24:46, 5.81s/it] {'loss': 0.4553, 'learning_rate': 5.199689251007001e-07, 'epoch': 0.9} + 90%|█████████ | 7875/8750 [1:25:50<1:24:46, 5.81s/it] 90%|█████████ | 7876/8750 [1:25:57<1:24:50, 5.82s/it] 90%|█████████ | 7876/8750 [1:25:55<1:24:50, 5.82s/it] {'loss': 0.4444, 'learning_rate': 5.187914826666662e-07, 'epoch': 0.9} + 90%|█████████ | 7876/8750 [1:25:57<1:24:50, 5.82s/it] {'loss': 0.4444, 'learning_rate': 5.187914826666662e-07, 'epoch': 0.9} + 90%|█████████ | 7876/8750 [1:25:55<1:24:50, 5.82s/it] 90%|█████████ | 7877/8750 [1:26:01<1:23:47, 5.76s/it] 90%|█████████ | 7877/8750 [1:26:03<1:23:47, 5.76s/it] {'loss': 0.4496, 'learning_rate': 
5.17615339369878e-07, 'epoch': 0.9} + {'loss': 0.4496, 'learning_rate': 5.17615339369878e-07, 'epoch': 0.9} 90%|█████████ | 7877/8750 [1:26:03<1:23:47, 5.76s/it] + 90%|█████████ | 7877/8750 [1:26:01<1:23:47, 5.76s/it] 90%|█████████ | 7878/8750 [1:26:09<1:23:52, 5.77s/it] 90%|█████████ | 7878/8750 [1:26:07<1:23:52, 5.77s/it] {'loss': 0.4492, 'learning_rate': 5.164404953714919e-07, 'epoch': 0.9} + 90%|█████████ | 7878/8750 [1:26:09<1:23:52, 5.77s/it] {'loss': 0.4492, 'learning_rate': 5.164404953714919e-07, 'epoch': 0.9} + 90%|█████████ | 7878/8750 [1:26:07<1:23:52, 5.77s/it] 90%|█████████ | 7879/8750 [1:26:14<1:23:17, 5.74s/it] 90%|█████████ | 7879/8750 [1:26:12<1:23:17, 5.74s/it] {'loss': 0.4534, 'learning_rate': 5.152669508324904e-07, 'epoch': 0.9} + 90%|█████████ | 7879/8750 [1:26:14<1:23:17, 5.74s/it] {'loss': 0.4534, 'learning_rate': 5.152669508324904e-07, 'epoch': 0.9} + 90%|█████████ | 7879/8750 [1:26:12<1:23:17, 5.74s/it] 90%|█████████ | 7880/8750 [1:26:20<1:23:56, 5.79s/it] 90%|█████████ | 7880/8750 [1:26:18<1:23:56, 5.79s/it] {'loss': 0.4503, 'learning_rate': 5.140947059136736e-07, 'epoch': 0.9} + 90%|█████████ | 7880/8750 [1:26:20<1:23:56, 5.79s/it] {'loss': 0.4503, 'learning_rate': 5.140947059136736e-07, 'epoch': 0.9} + 90%|█████████ | 7880/8750 [1:26:18<1:23:56, 5.79s/it] 90%|█████████ | 7881/8750 [1:26:24<1:23:32, 5.77s/it] 90%|█████████ | 7881/8750 [1:26:26<1:23:32, 5.77s/it] {'loss': 0.4529, 'learning_rate': 5.129237607756677e-07, 'epoch': 0.9} + {'loss': 0.4529, 'learning_rate': 5.129237607756677e-07, 'epoch': 0.9} 90%|█████████ | 7881/8750 [1:26:26<1:23:32, 5.77s/it] + 90%|█████████ | 7881/8750 [1:26:24<1:23:32, 5.77s/it] 90%|█████████ | 7882/8750 [1:26:32<1:23:32, 5.77s/it] 90%|█████████ | 7882/8750 [1:26:30<1:23:32, 5.77s/it] {'loss': 0.4364, 'learning_rate': 5.117541155789141e-07, 'epoch': 0.9} + 90%|█████████ | 7882/8750 [1:26:32<1:23:32, 5.77s/it] {'loss': 0.4364, 'learning_rate': 5.117541155789141e-07, 'epoch': 0.9} + 90%|█████████ | 7882/8750 
[1:26:30<1:23:32, 5.77s/it] 90%|█████████ | 7883/8750 [1:26:38<1:23:19, 5.77s/it] 90%|█████████ | 7883/8750 [1:26:36<1:23:19, 5.77s/it] {'loss': 0.4399, 'learning_rate': 5.105857704836836e-07, 'epoch': 0.9} + 90%|█████████ | 7883/8750 [1:26:38<1:23:19, 5.77s/it] {'loss': 0.4399, 'learning_rate': 5.105857704836836e-07, 'epoch': 0.9} + 90%|█████████ | 7883/8750 [1:26:36<1:23:19, 5.77s/it] 90%|█████████ | 7884/8750 [1:26:43<1:22:22, 5.71s/it] 90%|█████████ | 7884/8750 [1:26:41<1:22:22, 5.71s/it] {'loss': 0.4533, 'learning_rate': 5.094187256500671e-07, 'epoch': 0.9} + 90%|█████████ | 7884/8750 [1:26:43<1:22:22, 5.71s/it] {'loss': 0.4533, 'learning_rate': 5.094187256500671e-07, 'epoch': 0.9} + 90%|█████████ | 7884/8750 [1:26:41<1:22:22, 5.71s/it] 90%|█████████ | 7885/8750 [1:26:49<1:22:28, 5.72s/it] 90%|█████████ | 7885/8750 [1:26:47<1:22:28, 5.72s/it] {'loss': 0.4392, 'learning_rate': 5.08252981237971e-07, 'epoch': 0.9} + 90%|█████████ | 7885/8750 [1:26:49<1:22:28, 5.72s/it] {'loss': 0.4392, 'learning_rate': 5.08252981237971e-07, 'epoch': 0.9} + 90%|█████████ | 7885/8750 [1:26:47<1:22:28, 5.72s/it] 90%|█████████ | 7886/8750 [1:26:55<1:23:18, 5.79s/it] 90%|█████████ | 7886/8750 [1:26:53<1:23:18, 5.79s/it] {'loss': 0.4464, 'learning_rate': 5.070885374071321e-07, 'epoch': 0.9} + 90%|█████████ | 7886/8750 [1:26:55<1:23:18, 5.79s/it] {'loss': 0.4464, 'learning_rate': 5.070885374071321e-07, 'epoch': 0.9} + 90%|█████████ | 7886/8750 [1:26:53<1:23:18, 5.79s/it] 90%|█████████ | 7887/8750 [1:26:59<1:22:35, 5.74s/it] 90%|█████████ | 7887/8750 [1:27:00<1:22:35, 5.74s/it] {'loss': 0.4763, 'learning_rate': 5.05925394317105e-07, 'epoch': 0.9} + 90%|█████████ | 7887/8750 [1:27:00<1:22:35, 5.74s/it] {'loss': 0.4763, 'learning_rate': 5.05925394317105e-07, 'epoch': 0.9} + 90%|█████████ | 7887/8750 [1:26:59<1:22:35, 5.74s/it] 90%|█████████ | 7888/8750 [1:27:04<1:22:52, 5.77s/it] 90%|█████████ | 7888/8750 [1:27:06<1:22:52, 5.77s/it] {'loss': 0.4511, 'learning_rate': 5.047635521272631e-07, 
'epoch': 0.9} + {'loss': 0.4511, 'learning_rate': 5.047635521272631e-07, 'epoch': 0.9} 90%|█████████ | 7888/8750 [1:27:06<1:22:52, 5.77s/it] + 90%|█████████ | 7888/8750 [1:27:04<1:22:52, 5.77s/it] 90%|█████████ | 7889/8750 [1:27:12<1:22:48, 5.77s/it] 90%|█████████ | 7889/8750 [1:27:10<1:22:48, 5.77s/it] {'loss': 0.456, 'learning_rate': 5.036030109968082e-07, 'epoch': 0.9} + 90%|█████████ | 7889/8750 [1:27:12<1:22:48, 5.77s/it] {'loss': 0.456, 'learning_rate': 5.036030109968082e-07, 'epoch': 0.9} + 90%|█████████ | 7889/8750 [1:27:10<1:22:48, 5.77s/it] 90%|█████████ | 7890/8750 [1:27:16<1:21:57, 5.72s/it] 90%|█████████ | 7890/8750 [1:27:18<1:21:57, 5.72s/it] {'loss': 0.4665, 'learning_rate': 5.024437710847574e-07, 'epoch': 0.9} + 90%|█████████ | 7890/8750 [1:27:18<1:21:57, 5.72s/it] {'loss': 0.4665, 'learning_rate': 5.024437710847574e-07, 'epoch': 0.9} + 90%|█████████ | 7890/8750 [1:27:16<1:21:57, 5.72s/it] 90%|█████████ | 7891/8750 [1:27:23<1:22:15, 5.75s/it] 90%|█████████ | 7891/8750 [1:27:22<1:22:15, 5.75s/it] {'loss': 0.4364, 'learning_rate': 5.012858325499559e-07, 'epoch': 0.9} + 90%|█████████ | 7891/8750 [1:27:23<1:22:15, 5.75s/it] {'loss': 0.4364, 'learning_rate': 5.012858325499559e-07, 'epoch': 0.9} + 90%|█████████ | 7891/8750 [1:27:22<1:22:15, 5.75s/it] 90%|█████████ | 7892/8750 [1:27:29<1:21:47, 5.72s/it] 90%|█████████ | 7892/8750 [1:27:27<1:21:47, 5.72s/it] {'loss': 0.4645, 'learning_rate': 5.001291955510634e-07, 'epoch': 0.9} + 90%|█████████ | 7892/8750 [1:27:29<1:21:47, 5.72s/it] {'loss': 0.4645, 'learning_rate': 5.001291955510634e-07, 'epoch': 0.9} + 90%|█████████ | 7892/8750 [1:27:27<1:21:47, 5.72s/it] 90%|█████████ | 7893/8750 [1:27:35<1:20:57, 5.67s/it] 90%|█████████ | 7893/8750 [1:27:33<1:20:57, 5.67s/it] {'loss': 0.4316, 'learning_rate': 4.989738602465666e-07, 'epoch': 0.9} + 90%|█████████ | 7893/8750 [1:27:35<1:20:57, 5.67s/it] {'loss': 0.4316, 'learning_rate': 4.989738602465666e-07, 'epoch': 0.9} + 90%|█████████ | 7893/8750 [1:27:33<1:20:57, 
5.67s/it] 90%|█████████ | 7894/8750 [1:27:41<1:21:47, 5.73s/it] 90%|█████████ | 7894/8750 [1:27:39<1:21:47, 5.73s/it] {'loss': 0.4545, 'learning_rate': 4.978198267947742e-07, 'epoch': 0.9} + 90%|█████████ | 7894/8750 [1:27:41<1:21:47, 5.73s/it] {'loss': 0.4545, 'learning_rate': 4.978198267947742e-07, 'epoch': 0.9} + 90%|█████████ | 7894/8750 [1:27:39<1:21:47, 5.73s/it] 90%|█████████ | 7895/8750 [1:27:46<1:21:28, 5.72s/it] 90%|█████████ | 7895/8750 [1:27:44<1:21:28, 5.72s/it] {'loss': 0.4539, 'learning_rate': 4.966670953538133e-07, 'epoch': 0.9} + 90%|█████████ | 7895/8750 [1:27:46<1:21:28, 5.72s/it] {'loss': 0.4539, 'learning_rate': 4.966670953538133e-07, 'epoch': 0.9} + 90%|█████████ | 7895/8750 [1:27:44<1:21:28, 5.72s/it] 90%|█████████ | 7896/8750 [1:27:52<1:21:19, 5.71s/it] 90%|█████████ | 7896/8750 [1:27:50<1:21:19, 5.71s/it] {'loss': 0.454, 'learning_rate': 4.955156660816307e-07, 'epoch': 0.9} + 90%|█████████ | 7896/8750 [1:27:52<1:21:19, 5.71s/it] {'loss': 0.454, 'learning_rate': 4.955156660816307e-07, 'epoch': 0.9} + 90%|█████████ | 7896/8750 [1:27:50<1:21:19, 5.71s/it] 90%|█████████ | 7897/8750 [1:27:58<1:21:21, 5.72s/it] 90%|█████████ | 7897/8750 [1:27:56<1:21:21, 5.72s/it] {'loss': 0.4422, 'learning_rate': 4.943655391360025e-07, 'epoch': 0.9} + 90%|█████████ | 7897/8750 [1:27:58<1:21:21, 5.72s/it] {'loss': 0.4422, 'learning_rate': 4.943655391360025e-07, 'epoch': 0.9} + 90%|█████████ | 7897/8750 [1:27:56<1:21:21, 5.72s/it] 90%|█████████ | 7898/8750 [1:28:01<1:21:07, 5.71s/it] 90%|█████████ | 7898/8750 [1:28:03<1:21:07, 5.71s/it] {'loss': 0.4376, 'learning_rate': 4.932167146745193e-07, 'epoch': 0.9} + {'loss': 0.4376, 'learning_rate': 4.932167146745193e-07, 'epoch': 0.9} + 90%|█████████ | 7898/8750 [1:28:03<1:21:07, 5.71s/it] 90%|█████████ | 7898/8750 [1:28:01<1:21:07, 5.71s/it] 90%|█████████ | 7899/8750 [1:28:07<1:21:16, 5.73s/it] 90%|█████████ | 7899/8750 [1:28:09<1:21:16, 5.73s/it] {'loss': 0.456, 'learning_rate': 4.920691928545973e-07, 'epoch': 0.9} + 
90%|█████████ | 7899/8750 [1:28:09<1:21:16, 5.73s/it] {'loss': 0.456, 'learning_rate': 4.920691928545973e-07, 'epoch': 0.9} + 90%|█████████ | 7899/8750 [1:28:07<1:21:16, 5.73s/it]1 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +12013 AutoResumeHook: Checking whether to suspend...3 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + +9 AutoResumeHook: Checking whether to suspend... +118 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... 90%|█████████ | 7900/8750 [1:28:15<1:21:55, 5.78s/it] + + 90%|█████████ | 7900/8750 [1:28:13<1:21:55, 5.78s/it]6 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4591, 'learning_rate': 4.909229738334698e-07, 'epoch': 0.9} + 90%|█████████ | 7900/8750 [1:28:15<1:21:55, 5.78s/it] {'loss': 0.4591, 'learning_rate': 4.909229738334698e-07, 'epoch': 0.9} + 90%|█████████ | 7900/8750 [1:28:13<1:21:55, 5.78s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7900/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7900/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-7900/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 90%|█████████ | 7901/8750 [1:28:36<2:25:44, 10.30s/it] 90%|█████████ | 7901/8750 [1:28:34<2:25:44, 10.30s/it] {'loss': 0.4428, 'learning_rate': 4.897780577681954e-07, 'epoch': 0.9} + 90%|█████████ | 7901/8750 [1:28:36<2:25:44, 10.30s/it] {'loss': 0.4428, 'learning_rate': 4.897780577681954e-07, 'epoch': 0.9} + 90%|█████████ | 7901/8750 [1:28:34<2:25:44, 10.30s/it] 90%|█████████ | 7902/8750 [1:28:40<2:06:06, 8.92s/it] 90%|█████████ | 7902/8750 [1:28:42<2:06:06, 8.92s/it] {'loss': 0.4412, 'learning_rate': 4.886344448156566e-07, 'epoch': 0.9} + 90%|█████████ | 7902/8750 [1:28:42<2:06:06, 8.92s/it] {'loss': 0.4412, 'learning_rate': 4.886344448156566e-07, 'epoch': 0.9} + 90%|█████████ | 7902/8750 [1:28:40<2:06:06, 8.92s/it] 90%|█████████ | 7903/8750 [1:28:45<1:52:15, 7.95s/it] 90%|█████████ | 7903/8750 [1:28:47<1:52:15, 7.95s/it] {'loss': 0.4327, 'learning_rate': 4.874921351325512e-07, 'epoch': 0.9} + 90%|█████████ | 7903/8750 [1:28:47<1:52:15, 7.95s/it] {'loss': 0.4327, 'learning_rate': 4.874921351325512e-07, 'epoch': 0.9} + 90%|█████████ | 7903/8750 [1:28:45<1:52:15, 7.95s/it] 90%|█████████ | 7904/8750 [1:28:53<1:43:26, 7.34s/it] 90%|█████████ | 7904/8750 [1:28:51<1:43:26, 7.34s/it] {'loss': 0.4401, 'learning_rate': 4.863511288753986e-07, 'epoch': 0.9} + 90%|█████████ | 7904/8750 [1:28:53<1:43:26, 7.34s/it] {'loss': 0.4401, 'learning_rate': 4.863511288753986e-07, 'epoch': 0.9} + 90%|█████████ | 7904/8750 [1:28:51<1:43:26, 7.34s/it] 90%|█████████ | 7905/8750 [1:28:57<1:36:40, 6.86s/it] 90%|█████████ | 7905/8750 [1:28:59<1:36:40, 6.86s/it] {'loss': 0.4561, 'learning_rate': 4.85211426200547e-07, 'epoch': 0.9} + {'loss': 0.4561, 'learning_rate': 4.85211426200547e-07, 'epoch': 0.9} 90%|█████████ | 7905/8750 [1:28:59<1:36:40, 6.86s/it] + 90%|█████████ | 7905/8750 [1:28:57<1:36:40, 6.86s/it] 90%|█████████ | 7906/8750 [1:29:03<1:31:37, 6.51s/it] 90%|█████████ | 7906/8750 [1:29:05<1:31:37, 6.51s/it] {'loss': 0.4616, 'learning_rate': 4.840730272641569e-07, 
'epoch': 0.9} + 90%|█████████ | 7906/8750 [1:29:05<1:31:37, 6.51s/it] {'loss': 0.4616, 'learning_rate': 4.840730272641569e-07, 'epoch': 0.9} + 90%|█████████ | 7906/8750 [1:29:03<1:31:37, 6.51s/it] 90%|█████████ | 7907/8750 [1:29:08<1:27:55, 6.26s/it] 90%|█████████ | 7907/8750 [1:29:10<1:27:55, 6.26s/it] {'loss': 0.4657, 'learning_rate': 4.829359322222182e-07, 'epoch': 0.9} + 90%|█████████ | 7907/8750 [1:29:10<1:27:55, 6.26s/it] {'loss': 0.4657, 'learning_rate': 4.829359322222182e-07, 'epoch': 0.9} + 90%|█████████ | 7907/8750 [1:29:08<1:27:55, 6.26s/it] 90%|█████████ | 7908/8750 [1:29:14<1:26:02, 6.13s/it] 90%|█████████ | 7908/8750 [1:29:16<1:26:02, 6.13s/it]{'loss': 0.4471, 'learning_rate': 4.818001412305362e-07, 'epoch': 0.9} + 90%|█████████ | 7908/8750 [1:29:16<1:26:02, 6.13s/it]{'loss': 0.4471, 'learning_rate': 4.818001412305362e-07, 'epoch': 0.9} + 90%|█████████ | 7908/8750 [1:29:14<1:26:02, 6.13s/it] 90%|█████████ | 7909/8750 [1:29:22<1:24:18, 6.01s/it] 90%|█████████ | 7909/8750 [1:29:20<1:24:18, 6.01s/it] {'loss': 0.4503, 'learning_rate': 4.806656544447374e-07, 'epoch': 0.9} + 90%|█████████ | 7909/8750 [1:29:22<1:24:18, 6.01s/it] {'loss': 0.4503, 'learning_rate': 4.806656544447374e-07, 'epoch': 0.9} + 90%|█████████ | 7909/8750 [1:29:20<1:24:18, 6.01s/it] 90%|█████████ | 7910/8750 [1:29:27<1:22:26, 5.89s/it] 90%|█████████ | 7910/8750 [1:29:26<1:22:26, 5.89s/it] {'loss': 0.4521, 'learning_rate': 4.795324720202754e-07, 'epoch': 0.9} + 90%|█████████ | 7910/8750 [1:29:27<1:22:26, 5.89s/it] {'loss': 0.4521, 'learning_rate': 4.795324720202754e-07, 'epoch': 0.9} + 90%|█████████ | 7910/8750 [1:29:26<1:22:26, 5.89s/it] 90%|█████████ | 7911/8750 [1:29:31<1:21:22, 5.82s/it] 90%|█████████ | 7911/8750 [1:29:33<1:21:22, 5.82s/it] {'loss': 0.4485, 'learning_rate': 4.784005941124203e-07, 'epoch': 0.9} + 90%|█████████ | 7911/8750 [1:29:33<1:21:22, 5.82s/it] {'loss': 0.4485, 'learning_rate': 4.784005941124203e-07, 'epoch': 0.9} + 90%|█████████ | 7911/8750 [1:29:31<1:21:22, 
5.82s/it] 90%|█████████ | 7912/8750 [1:29:39<1:23:11, 5.96s/it] 90%|█████████ | 7912/8750 [1:29:37<1:23:11, 5.96s/it] {'loss': 0.4421, 'learning_rate': 4.772700208762659e-07, 'epoch': 0.9} + 90%|█████████ | 7912/8750 [1:29:39<1:23:11, 5.96s/it] {'loss': 0.4421, 'learning_rate': 4.772700208762659e-07, 'epoch': 0.9} + 90%|█████████ | 7912/8750 [1:29:37<1:23:11, 5.96s/it] 90%|█████████ | 7913/8750 [1:29:43<1:21:35, 5.85s/it] 90%|█████████ | 7913/8750 [1:29:45<1:21:35, 5.85s/it] {'loss': 0.4397, 'learning_rate': 4.761407524667239e-07, 'epoch': 0.9} + 90%|█████████ | 7913/8750 [1:29:45<1:21:35, 5.85s/it] {'loss': 0.4397, 'learning_rate': 4.761407524667239e-07, 'epoch': 0.9} + 90%|█████████ | 7913/8750 [1:29:43<1:21:35, 5.85s/it] 90%|█████████ | 7914/8750 [1:29:51<1:20:22, 5.77s/it] 90%|█████████ | 7914/8750 [1:29:49<1:20:22, 5.77s/it] {'loss': 0.4493, 'learning_rate': 4.750127890385292e-07, 'epoch': 0.9} + 90%|█████████ | 7914/8750 [1:29:51<1:20:22, 5.77s/it] {'loss': 0.4493, 'learning_rate': 4.750127890385292e-07, 'epoch': 0.9} + 90%|█████████ | 7914/8750 [1:29:49<1:20:22, 5.77s/it] 90%|█████████ | 7915/8750 [1:29:54<1:20:03, 5.75s/it] 90%|█████████ | 7915/8750 [1:29:56<1:20:03, 5.75s/it] {'loss': 0.461, 'learning_rate': 4.738861307462406e-07, 'epoch': 0.9} + {'loss': 0.461, 'learning_rate': 4.738861307462406e-07, 'epoch': 0.9} 90%|█████████ | 7915/8750 [1:29:56<1:20:03, 5.75s/it] + 90%|█████████ | 7915/8750 [1:29:54<1:20:03, 5.75s/it] 90%|█████████ | 7916/8750 [1:30:00<1:19:55, 5.75s/it] 90%|█████████ | 7916/8750 [1:30:02<1:19:55, 5.75s/it] {'loss': 0.4571, 'learning_rate': 4.7276077774423334e-07, 'epoch': 0.9} + {'loss': 0.4571, 'learning_rate': 4.7276077774423334e-07, 'epoch': 0.9} + 90%|█████████ | 7916/8750 [1:30:02<1:19:55, 5.75s/it] 90%|█████████ | 7916/8750 [1:30:00<1:19:55, 5.75s/it] 90%|█████████ | 7917/8750 [1:30:06<1:19:52, 5.75s/it] 90%|█████████ | 7917/8750 [1:30:08<1:19:52, 5.75s/it] {'loss': 0.4409, 'learning_rate': 4.716367301867053e-07, 'epoch': 0.9} 
+ 90%|█████████ | 7917/8750 [1:30:08<1:19:52, 5.75s/it] {'loss': 0.4409, 'learning_rate': 4.716367301867053e-07, 'epoch': 0.9} + 90%|█████████ | 7917/8750 [1:30:06<1:19:52, 5.75s/it] 90%|█████████ | 7918/8750 [1:30:12<1:19:12, 5.71s/it] 90%|█████████ | 7918/8750 [1:30:13<1:19:12, 5.71s/it] {'loss': 0.4698, 'learning_rate': 4.705139882276788e-07, 'epoch': 0.9} + 90%|█████████ | 7918/8750 [1:30:13<1:19:12, 5.71s/it] {'loss': 0.4698, 'learning_rate': 4.705139882276788e-07, 'epoch': 0.9} + 90%|█████████ | 7918/8750 [1:30:12<1:19:12, 5.71s/it] 91%|█████████ | 7919/8750 [1:30:17<1:19:22, 5.73s/it] 91%|█████████ | 7919/8750 [1:30:19<1:19:22, 5.73s/it] {'loss': 0.4495, 'learning_rate': 4.693925520209908e-07, 'epoch': 0.91} + {'loss': 0.4495, 'learning_rate': 4.693925520209908e-07, 'epoch': 0.91} 91%|█████████ | 7919/8750 [1:30:19<1:19:22, 5.73s/it] + 91%|█████████ | 7919/8750 [1:30:17<1:19:22, 5.73s/it] 91%|█████████ | 7920/8750 [1:30:25<1:19:12, 5.73s/it] 91%|█████████ | 7920/8750 [1:30:23<1:19:12, 5.73s/it] {'loss': 0.4517, 'learning_rate': 4.6827242172030495e-07, 'epoch': 0.91} + 91%|█████████ | 7920/8750 [1:30:25<1:19:12, 5.73s/it] {'loss': 0.4517, 'learning_rate': 4.6827242172030495e-07, 'epoch': 0.91} + 91%|█████████ | 7920/8750 [1:30:23<1:19:12, 5.73s/it] 91%|█████████ | 7921/8750 [1:30:31<1:18:54, 5.71s/it] 91%|█████████ | 7921/8750 [1:30:29<1:18:55, 5.71s/it] {'loss': 0.4549, 'learning_rate': 4.6715359747910526e-07, 'epoch': 0.91} + 91%|█████████ | 7921/8750 [1:30:31<1:18:54, 5.71s/it] {'loss': 0.4549, 'learning_rate': 4.6715359747910526e-07, 'epoch': 0.91} + 91%|█████████ | 7921/8750 [1:30:29<1:18:55, 5.71s/it] 91%|█████████ | 7922/8750 [1:30:34<1:18:18, 5.67s/it] 91%|█████████ | 7922/8750 [1:30:36<1:18:18, 5.67s/it] {'loss': 0.4327, 'learning_rate': 4.660360794506946e-07, 'epoch': 0.91} + {'loss': 0.4327, 'learning_rate': 4.660360794506946e-07, 'epoch': 0.91} + 91%|█████████ | 7922/8750 [1:30:36<1:18:18, 5.67s/it] 91%|█████████ | 7922/8750 [1:30:34<1:18:18, 
5.67s/it] 91%|█████████ | 7923/8750 [1:30:40<1:18:10, 5.67s/it] 91%|█████████ | 7923/8750 [1:30:42<1:18:10, 5.67s/it] {'loss': 0.4543, 'learning_rate': 4.649198677881983e-07, 'epoch': 0.91} + 91%|█████████ | 7923/8750 [1:30:42<1:18:10, 5.67s/it] {'loss': 0.4543, 'learning_rate': 4.649198677881983e-07, 'epoch': 0.91} + 91%|█████████ | 7923/8750 [1:30:40<1:18:10, 5.67s/it] 91%|█████████ | 7924/8750 [1:30:47<1:17:41, 5.64s/it] 91%|█████████ | 7924/8750 [1:30:46<1:17:41, 5.64s/it] {'loss': 0.4555, 'learning_rate': 4.6380496264456064e-07, 'epoch': 0.91} + 91%|█████████ | 7924/8750 [1:30:47<1:17:41, 5.64s/it] {'loss': 0.4555, 'learning_rate': 4.6380496264456064e-07, 'epoch': 0.91} + 91%|█████████ | 7924/8750 [1:30:46<1:17:41, 5.64s/it] 91%|█████████ | 7925/8750 [1:30:51<1:17:51, 5.66s/it] 91%|█████████ | 7925/8750 [1:30:53<1:17:51, 5.66s/it] {'loss': 0.433, 'learning_rate': 4.6269136417255167e-07, 'epoch': 0.91} + 91%|█████████ | 7925/8750 [1:30:53<1:17:51, 5.66s/it] {'loss': 0.433, 'learning_rate': 4.6269136417255167e-07, 'epoch': 0.91} + 91%|█████████ | 7925/8750 [1:30:51<1:17:51, 5.66s/it] 91%|█████████ | 7926/8750 [1:30:57<1:18:21, 5.71s/it] 91%|█████████ | 7926/8750 [1:30:59<1:18:21, 5.71s/it] {'loss': 0.4415, 'learning_rate': 4.615790725247571e-07, 'epoch': 0.91} + {'loss': 0.4415, 'learning_rate': 4.615790725247571e-07, 'epoch': 0.91} + 91%|█████████ | 7926/8750 [1:30:59<1:18:21, 5.71s/it] 91%|█████████ | 7926/8750 [1:30:57<1:18:21, 5.71s/it] 91%|█████████ | 7927/8750 [1:31:03<1:17:59, 5.69s/it] 91%|█████████ | 7927/8750 [1:31:05<1:17:59, 5.69s/it] {'loss': 0.44, 'learning_rate': 4.60468087853585e-07, 'epoch': 0.91} + {'loss': 0.44, 'learning_rate': 4.60468087853585e-07, 'epoch': 0.91} 91%|█████████ | 7927/8750 [1:31:05<1:17:59, 5.69s/it] + 91%|█████████ | 7927/8750 [1:31:03<1:17:59, 5.69s/it] 91%|█████████ | 7928/8750 [1:31:10<1:17:04, 5.63s/it] 91%|█████████ | 7928/8750 [1:31:08<1:17:04, 5.63s/it] {'loss': 0.4495, 'learning_rate': 4.5935841031126693e-07, 
'epoch': 0.91} + 91%|█████████ | 7928/8750 [1:31:10<1:17:04, 5.63s/it] {'loss': 0.4495, 'learning_rate': 4.5935841031126693e-07, 'epoch': 0.91} + 91%|█████████ | 7928/8750 [1:31:08<1:17:04, 5.63s/it] 91%|█████████ | 7929/8750 [1:31:16<1:17:49, 5.69s/it] 91%|█████████ | 7929/8750 [1:31:14<1:17:50, 5.69s/it] {'loss': 0.4418, 'learning_rate': 4.582500400498513e-07, 'epoch': 0.91} + {'loss': 0.4418, 'learning_rate': 4.582500400498513e-07, 'epoch': 0.91} 91%|█████████ | 7929/8750 [1:31:16<1:17:49, 5.69s/it] + 91%|█████████ | 7929/8750 [1:31:14<1:17:50, 5.69s/it] 91%|█████████ | 7930/8750 [1:31:22<1:17:27, 5.67s/it] 91%|█████████ | 7930/8750 [1:31:20<1:17:27, 5.67s/it] {'loss': 0.458, 'learning_rate': 4.5714297722121105e-07, 'epoch': 0.91} + 91%|█████████ | 7930/8750 [1:31:22<1:17:27, 5.67s/it] {'loss': 0.458, 'learning_rate': 4.5714297722121105e-07, 'epoch': 0.91} + 91%|█████████ | 7930/8750 [1:31:20<1:17:27, 5.67s/it] 91%|█████████ | 7931/8750 [1:31:27<1:17:19, 5.66s/it] 91%|█████████ | 7931/8750 [1:31:25<1:17:19, 5.66s/it] {'loss': 0.4415, 'learning_rate': 4.5603722197703925e-07, 'epoch': 0.91} + 91%|█████████ | 7931/8750 [1:31:27<1:17:19, 5.66s/it] {'loss': 0.4415, 'learning_rate': 4.5603722197703925e-07, 'epoch': 0.91} + 91%|█████████ | 7931/8750 [1:31:25<1:17:19, 5.66s/it] 91%|█████████ | 7932/8750 [1:31:33<1:17:39, 5.70s/it] 91%|█████████ | 7932/8750 [1:31:31<1:17:39, 5.70s/it] {'loss': 0.4602, 'learning_rate': 4.54932774468847e-07, 'epoch': 0.91} + 91%|█████████ | 7932/8750 [1:31:33<1:17:39, 5.70s/it] {'loss': 0.4602, 'learning_rate': 4.54932774468847e-07, 'epoch': 0.91} + 91%|█████████ | 7932/8750 [1:31:31<1:17:39, 5.70s/it] 91%|█████████ | 7933/8750 [1:31:38<1:16:57, 5.65s/it] 91%|█████████ | 7933/8750 [1:31:37<1:16:57, 5.65s/it] {'loss': 0.4578, 'learning_rate': 4.5382963484797096e-07, 'epoch': 0.91} + 91%|█████████ | 7933/8750 [1:31:38<1:16:57, 5.65s/it] {'loss': 0.4578, 'learning_rate': 4.5382963484797096e-07, 'epoch': 0.91} + 91%|█████████ | 7933/8750 
[1:31:37<1:16:57, 5.65s/it] 91%|█████████ | 7934/8750 [1:31:44<1:16:44, 5.64s/it] 91%|█████████ | 7934/8750 [1:31:42<1:16:44, 5.64s/it] {'loss': 0.469, 'learning_rate': 4.5272780326556466e-07, 'epoch': 0.91} + 91%|█████████ | 7934/8750 [1:31:44<1:16:44, 5.64s/it] {'loss': 0.469, 'learning_rate': 4.5272780326556466e-07, 'epoch': 0.91} + 91%|█████████ | 7934/8750 [1:31:42<1:16:44, 5.64s/it] 91%|█████████ | 7935/8750 [1:31:50<1:16:57, 5.67s/it] 91%|█████████ | 7935/8750 [1:31:48<1:16:57, 5.67s/it] {'loss': 0.4415, 'learning_rate': 4.516272798726018e-07, 'epoch': 0.91} + 91%|█████████ | 7935/8750 [1:31:50<1:16:57, 5.67s/it] {'loss': 0.4415, 'learning_rate': 4.516272798726018e-07, 'epoch': 0.91} + 91%|█████████ | 7935/8750 [1:31:48<1:16:57, 5.67s/it] 91%|█████████ | 7936/8750 [1:31:55<1:16:44, 5.66s/it] 91%|█████████ | 7936/8750 [1:31:54<1:16:44, 5.66s/it] {'loss': 0.4508, 'learning_rate': 4.5052806481988175e-07, 'epoch': 0.91} + 91%|█████████ | 7936/8750 [1:31:55<1:16:44, 5.66s/it] {'loss': 0.4508, 'learning_rate': 4.5052806481988175e-07, 'epoch': 0.91} + 91%|█████████ | 7936/8750 [1:31:54<1:16:44, 5.66s/it] 91%|█████████ | 7937/8750 [1:31:59<1:17:08, 5.69s/it] 91%|█████████ | 7937/8750 [1:32:01<1:17:08, 5.69s/it] {'loss': 0.4495, 'learning_rate': 4.494301582580185e-07, 'epoch': 0.91} + {'loss': 0.4495, 'learning_rate': 4.494301582580185e-07, 'epoch': 0.91} + 91%|█████████ | 7937/8750 [1:32:01<1:17:08, 5.69s/it] 91%|█████████ | 7937/8750 [1:31:59<1:17:08, 5.69s/it] 91%|█████████ | 7938/8750 [1:32:05<1:18:06, 5.77s/it] 91%|█████████ | 7938/8750 [1:32:07<1:18:06, 5.77s/it] {'loss': 0.4321, 'learning_rate': 4.4833356033745167e-07, 'epoch': 0.91} + 91%|█████████ | 7938/8750 [1:32:07<1:18:06, 5.77s/it] {'loss': 0.4321, 'learning_rate': 4.4833356033745167e-07, 'epoch': 0.91} + 91%|█████████ | 7938/8750 [1:32:05<1:18:06, 5.77s/it] 91%|█████████ | 7939/8750 [1:32:13<1:17:07, 5.71s/it] 91%|█████████ | 7939/8750 [1:32:11<1:17:07, 5.71s/it] {'loss': 0.4402, 'learning_rate': 
4.472382712084389e-07, 'epoch': 0.91} + 91%|█████████ | 7939/8750 [1:32:13<1:17:07, 5.71s/it] {'loss': 0.4402, 'learning_rate': 4.472382712084389e-07, 'epoch': 0.91} + 91%|█████████ | 7939/8750 [1:32:11<1:17:07, 5.71s/it] 91%|█████████ | 7940/8750 [1:32:18<1:16:56, 5.70s/it] 91%|█████████ | 7940/8750 [1:32:17<1:16:56, 5.70s/it] {'loss': 0.441, 'learning_rate': 4.4614429102105893e-07, 'epoch': 0.91} + {'loss': 0.441, 'learning_rate': 4.4614429102105893e-07, 'epoch': 0.91} + 91%|█████████ | 7940/8750 [1:32:18<1:16:56, 5.70s/it] 91%|█████████ | 7940/8750 [1:32:17<1:16:56, 5.70s/it] 91%|█████████ | 7941/8750 [1:32:24<1:16:25, 5.67s/it] 91%|█████████ | 7941/8750 [1:32:22<1:16:25, 5.67s/it] {'loss': 0.4518, 'learning_rate': 4.4505161992521417e-07, 'epoch': 0.91} + 91%|█████████ | 7941/8750 [1:32:24<1:16:25, 5.67s/it] {'loss': 0.4518, 'learning_rate': 4.4505161992521417e-07, 'epoch': 0.91} + 91%|█████████ | 7941/8750 [1:32:22<1:16:25, 5.67s/it] 91%|█████████ | 7942/8750 [1:32:30<1:16:13, 5.66s/it] 91%|█████████ | 7942/8750 [1:32:28<1:16:13, 5.66s/it] {'loss': 0.4709, 'learning_rate': 4.439602580706226e-07, 'epoch': 0.91} + 91%|█████████ | 7942/8750 [1:32:30<1:16:13, 5.66s/it] {'loss': 0.4709, 'learning_rate': 4.439602580706226e-07, 'epoch': 0.91} + 91%|█████████ | 7942/8750 [1:32:28<1:16:13, 5.66s/it] 91%|█████████ | 7943/8750 [1:32:35<1:16:34, 5.69s/it] 91%|█████████ | 7943/8750 [1:32:34<1:16:34, 5.69s/it] {'loss': 0.4147, 'learning_rate': 4.4287020560682345e-07, 'epoch': 0.91} + 91%|█████████ | 7943/8750 [1:32:35<1:16:34, 5.69s/it] {'loss': 0.4147, 'learning_rate': 4.4287020560682345e-07, 'epoch': 0.91} + 91%|█████████ | 7943/8750 [1:32:34<1:16:34, 5.69s/it] 91%|█████████ | 7944/8750 [1:32:39<1:16:05, 5.66s/it] 91%|█████████ | 7944/8750 [1:32:41<1:16:05, 5.66s/it]{'loss': 0.4517, 'learning_rate': 4.4178146268318177e-07, 'epoch': 0.91} + {'loss': 0.4517, 'learning_rate': 4.4178146268318177e-07, 'epoch': 0.91} + 91%|█████████ | 7944/8750 [1:32:41<1:16:05, 5.66s/it] 
91%|█████████ | 7944/8750 [1:32:39<1:16:05, 5.66s/it] 91%|█████████ | 7945/8750 [1:32:47<1:16:51, 5.73s/it] 91%|█████████ | 7945/8750 [1:32:45<1:16:51, 5.73s/it] {'loss': 0.4611, 'learning_rate': 4.406940294488771e-07, 'epoch': 0.91} + 91%|█████████ | 7945/8750 [1:32:47<1:16:51, 5.73s/it] {'loss': 0.4611, 'learning_rate': 4.406940294488771e-07, 'epoch': 0.91} + 91%|█████████ | 7945/8750 [1:32:45<1:16:51, 5.73s/it] 91%|█████████ | 7946/8750 [1:32:53<1:18:02, 5.82s/it] 91%|█████████ | 7946/8750 [1:32:51<1:18:02, 5.82s/it] {'loss': 0.4488, 'learning_rate': 4.396079060529146e-07, 'epoch': 0.91} + 91%|█████████ | 7946/8750 [1:32:53<1:18:02, 5.82s/it] {'loss': 0.4488, 'learning_rate': 4.396079060529146e-07, 'epoch': 0.91} + 91%|█████████ | 7946/8750 [1:32:51<1:18:02, 5.82s/it] 91%|█████████ | 7947/8750 [1:32:59<1:17:14, 5.77s/it] 91%|█████████ | 7947/8750 [1:32:57<1:17:14, 5.77s/it] {'loss': 0.434, 'learning_rate': 4.3852309264411417e-07, 'epoch': 0.91} + 91%|█████████ | 7947/8750 [1:32:59<1:17:14, 5.77s/it] {'loss': 0.434, 'learning_rate': 4.3852309264411417e-07, 'epoch': 0.91} + 91%|█████████ | 7947/8750 [1:32:57<1:17:14, 5.77s/it] 91%|█████████ | 7948/8750 [1:33:04<1:17:13, 5.78s/it] 91%|█████████ | 7948/8750 [1:33:02<1:17:13, 5.78s/it] {'loss': 0.4538, 'learning_rate': 4.3743958937112253e-07, 'epoch': 0.91} + 91%|█████████ | 7948/8750 [1:33:04<1:17:13, 5.78s/it] {'loss': 0.4538, 'learning_rate': 4.3743958937112253e-07, 'epoch': 0.91} + 91%|█████████ | 7948/8750 [1:33:02<1:17:13, 5.78s/it] 91%|█████████ | 7949/8750 [1:33:10<1:16:19, 5.72s/it] 91%|█████████ | 7949/8750 [1:33:08<1:16:19, 5.72s/it] {'loss': 0.4536, 'learning_rate': 4.363573963824008e-07, 'epoch': 0.91} + 91%|█████████ | 7949/8750 [1:33:10<1:16:19, 5.72s/it] {'loss': 0.4536, 'learning_rate': 4.363573963824008e-07, 'epoch': 0.91} + 91%|█████████ | 7949/8750 [1:33:08<1:16:19, 5.72s/it]8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... 
+13 AutoResumeHook: Checking whether to suspend... + 91%|█████████ | 7950/8750 [1:33:16<1:16:11, 5.71s/it]12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +45 3AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +0 AutoResumeHook: Checking whether to suspend...15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... + + 91%|█████████ | 7950/8750 [1:33:14<1:16:11, 5.71s/it]9 10AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4758, 'learning_rate': 4.3527651382623603e-07, 'epoch': 0.91} + 91%|█████████ | 7950/8750 [1:33:16<1:16:11, 5.71s/it] {'loss': 0.4758, 'learning_rate': 4.3527651382623603e-07, 'epoch': 0.91} + 91%|█████████ | 7950/8750 [1:33:14<1:16:11, 5.71s/it] 91%|█████████ | 7951/8750 [1:33:21<1:15:18, 5.66s/it] 91%|█████████ | 7951/8750 [1:33:19<1:15:18, 5.66s/it] {'loss': 0.449, 'learning_rate': 4.3419694185073303e-07, 'epoch': 0.91} + 91%|█████████ | 7951/8750 [1:33:21<1:15:18, 5.66s/it] {'loss': 0.449, 'learning_rate': 4.3419694185073303e-07, 'epoch': 0.91} + 91%|█████████ | 7951/8750 [1:33:19<1:15:18, 5.66s/it] 91%|█████████ | 7952/8750 [1:33:27<1:15:24, 5.67s/it] 91%|█████████ | 7952/8750 [1:33:25<1:15:24, 5.67s/it] {'loss': 0.4611, 'learning_rate': 4.331186806038179e-07, 'epoch': 0.91} + 91%|█████████ | 7952/8750 [1:33:27<1:15:24, 5.67s/it] {'loss': 0.4611, 'learning_rate': 4.331186806038179e-07, 'epoch': 0.91} + 91%|█████████ | 7952/8750 [1:33:25<1:15:24, 5.67s/it] 91%|█████████ | 7953/8750 [1:33:33<1:15:36, 5.69s/it] 91%|█████████ | 7953/8750 [1:33:31<1:15:36, 5.69s/it] {'loss': 0.4398, 'learning_rate': 4.320417302332325e-07, 'epoch': 0.91} + 91%|█████████ | 
7953/8750 [1:33:33<1:15:36, 5.69s/it] {'loss': 0.4398, 'learning_rate': 4.320417302332325e-07, 'epoch': 0.91} + 91%|█████████ | 7953/8750 [1:33:31<1:15:36, 5.69s/it] 91%|█████████ | 7954/8750 [1:33:38<1:15:07, 5.66s/it] 91%|█████████ | 7954/8750 [1:33:36<1:15:07, 5.66s/it] {'loss': 0.4411, 'learning_rate': 4.3096609088654873e-07, 'epoch': 0.91} + 91%|█████████ | 7954/8750 [1:33:38<1:15:07, 5.66s/it] {'loss': 0.4411, 'learning_rate': 4.3096609088654873e-07, 'epoch': 0.91} + 91%|█████████ | 7954/8750 [1:33:36<1:15:07, 5.66s/it] 91%|█████████ | 7955/8750 [1:33:44<1:15:10, 5.67s/it] 91%|█████████ | 7955/8750 [1:33:42<1:15:10, 5.67s/it] {'loss': 0.4262, 'learning_rate': 4.298917627111476e-07, 'epoch': 0.91} + 91%|█████████ | 7955/8750 [1:33:44<1:15:10, 5.67s/it] {'loss': 0.4262, 'learning_rate': 4.298917627111476e-07, 'epoch': 0.91} + 91%|█████████ | 7955/8750 [1:33:42<1:15:10, 5.67s/it] 91%|█████████ | 7956/8750 [1:33:48<1:15:27, 5.70s/it] 91%|█████████ | 7956/8750 [1:33:50<1:15:27, 5.70s/it] {'loss': 0.4694, 'learning_rate': 4.2881874585424146e-07, 'epoch': 0.91} + 91%|█████████ | 7956/8750 [1:33:50<1:15:27, 5.70s/it] {'loss': 0.4694, 'learning_rate': 4.2881874585424146e-07, 'epoch': 0.91} + 91%|█████████ | 7956/8750 [1:33:48<1:15:27, 5.70s/it] 91%|█████████ | 7957/8750 [1:33:55<1:15:06, 5.68s/it] 91%|█████████ | 7957/8750 [1:33:53<1:15:06, 5.68s/it] {'loss': 0.4418, 'learning_rate': 4.2774704046285254e-07, 'epoch': 0.91} + 91%|█████████ | 7957/8750 [1:33:55<1:15:06, 5.68s/it] {'loss': 0.4418, 'learning_rate': 4.2774704046285254e-07, 'epoch': 0.91} + 91%|█████████ | 7957/8750 [1:33:53<1:15:06, 5.68s/it] 91%|█████████ | 7958/8750 [1:34:01<1:14:37, 5.65s/it] 91%|█████████ | 7958/8750 [1:33:59<1:14:37, 5.65s/it] {'loss': 0.4588, 'learning_rate': 4.266766466838335e-07, 'epoch': 0.91} + 91%|█████████ | 7958/8750 [1:34:01<1:14:37, 5.65s/it] {'loss': 0.4588, 'learning_rate': 4.266766466838335e-07, 'epoch': 0.91} + 91%|█████████ | 7958/8750 [1:33:59<1:14:37, 5.65s/it] 
91%|█████████ | 7959/8750 [1:34:07<1:14:28, 5.65s/it] 91%|█████████ | 7959/8750 [1:34:05<1:14:28, 5.65s/it] {'loss': 0.442, 'learning_rate': 4.256075646638469e-07, 'epoch': 0.91} + 91%|█████████ | 7959/8750 [1:34:07<1:14:28, 5.65s/it] {'loss': 0.442, 'learning_rate': 4.256075646638469e-07, 'epoch': 0.91} + 91%|█████████ | 7959/8750 [1:34:05<1:14:28, 5.65s/it] 91%|█████████ | 7960/8750 [1:34:12<1:14:52, 5.69s/it] 91%|█████████ | 7960/8750 [1:34:10<1:14:52, 5.69s/it] {'loss': 0.4577, 'learning_rate': 4.2453979454938563e-07, 'epoch': 0.91} + 91%|█████████ | 7960/8750 [1:34:12<1:14:52, 5.69s/it] {'loss': 0.4577, 'learning_rate': 4.2453979454938563e-07, 'epoch': 0.91} + 91%|█████████ | 7960/8750 [1:34:10<1:14:52, 5.69s/it] 91%|█████████ | 7961/8750 [1:34:18<1:14:49, 5.69s/it] 91%|█████████ | 7961/8750 [1:34:16<1:14:49, 5.69s/it] {'loss': 0.4395, 'learning_rate': 4.2347333648675383e-07, 'epoch': 0.91} + 91%|█████████ | 7961/8750 [1:34:18<1:14:49, 5.69s/it] {'loss': 0.4395, 'learning_rate': 4.2347333648675383e-07, 'epoch': 0.91} + 91%|█████████ | 7961/8750 [1:34:16<1:14:49, 5.69s/it] 91%|█████████ | 7962/8750 [1:34:24<1:15:26, 5.74s/it] 91%|█████████ | 7962/8750 [1:34:22<1:15:26, 5.74s/it] {'loss': 0.4491, 'learning_rate': 4.2240819062208337e-07, 'epoch': 0.91} + 91%|█████████ | 7962/8750 [1:34:24<1:15:26, 5.74s/it] {'loss': 0.4491, 'learning_rate': 4.2240819062208337e-07, 'epoch': 0.91} + 91%|█████████ | 7962/8750 [1:34:22<1:15:26, 5.74s/it] 91%|█████████ | 7963/8750 [1:34:28<1:15:19, 5.74s/it] 91%|█████████ | 7963/8750 [1:34:30<1:15:19, 5.74s/it] {'loss': 0.453, 'learning_rate': 4.2134435710132093e-07, 'epoch': 0.91} + 91%|█████████ | 7963/8750 [1:34:30<1:15:19, 5.74s/it] {'loss': 0.453, 'learning_rate': 4.2134435710132093e-07, 'epoch': 0.91} + 91%|█████████ | 7963/8750 [1:34:28<1:15:19, 5.74s/it] 91%|█████████ | 7964/8750 [1:34:36<1:15:39, 5.77s/it] 91%|█████████ | 7964/8750 [1:34:34<1:15:39, 5.77s/it] {'loss': 0.4556, 'learning_rate': 4.2028183607023766e-07, 'epoch': 
0.91} + 91%|█████████ | 7964/8750 [1:34:36<1:15:39, 5.77s/it] {'loss': 0.4556, 'learning_rate': 4.2028183607023766e-07, 'epoch': 0.91} + 91%|█████████ | 7964/8750 [1:34:34<1:15:39, 5.77s/it] 91%|█████████ | 7965/8750 [1:34:39<1:15:25, 5.77s/it] 91%|█████████ | 7965/8750 [1:34:41<1:15:25, 5.77s/it] {'loss': 0.4387, 'learning_rate': 4.192206276744204e-07, 'epoch': 0.91} + 91%|█████████ | 7965/8750 [1:34:41<1:15:25, 5.77s/it] {'loss': 0.4387, 'learning_rate': 4.192206276744204e-07, 'epoch': 0.91} + 91%|█████████ | 7965/8750 [1:34:39<1:15:25, 5.77s/it] 91%|█████████ | 7966/8750 [1:34:47<1:14:21, 5.69s/it] 91%|█████████ | 7966/8750 [1:34:45<1:14:21, 5.69s/it] {'loss': 0.4476, 'learning_rate': 4.181607320592784e-07, 'epoch': 0.91} + 91%|█████████ | 7966/8750 [1:34:47<1:14:21, 5.69s/it] {'loss': 0.4476, 'learning_rate': 4.181607320592784e-07, 'epoch': 0.91} + 91%|█████████ | 7966/8750 [1:34:45<1:14:21, 5.69s/it] 91%|█████████ | 7967/8750 [1:34:52<1:14:04, 5.68s/it] 91%|█████████ | 7967/8750 [1:34:50<1:14:04, 5.68s/it] {'loss': 0.4507, 'learning_rate': 4.1710214937004223e-07, 'epoch': 0.91} + 91%|█████████ | 7967/8750 [1:34:52<1:14:04, 5.68s/it] {'loss': 0.4507, 'learning_rate': 4.1710214937004223e-07, 'epoch': 0.91} + 91%|█████████ | 7967/8750 [1:34:50<1:14:04, 5.68s/it] 91%|█████████ | 7968/8750 [1:34:58<1:13:46, 5.66s/it] 91%|█████████ | 7968/8750 [1:34:56<1:13:46, 5.66s/it] {'loss': 0.4773, 'learning_rate': 4.1604487975176136e-07, 'epoch': 0.91} + 91%|█████████ | 7968/8750 [1:34:58<1:13:46, 5.66s/it] {'loss': 0.4773, 'learning_rate': 4.1604487975176136e-07, 'epoch': 0.91} + 91%|█████████ | 7968/8750 [1:34:56<1:13:46, 5.66s/it] 91%|█████████ | 7969/8750 [1:35:04<1:13:25, 5.64s/it] 91%|█████████ | 7969/8750 [1:35:02<1:13:25, 5.64s/it] {'loss': 0.4412, 'learning_rate': 4.149889233493054e-07, 'epoch': 0.91} + 91%|█████████ | 7969/8750 [1:35:04<1:13:25, 5.64s/it] {'loss': 0.4412, 'learning_rate': 4.149889233493054e-07, 'epoch': 0.91} + 91%|█████████ | 7969/8750 
[1:35:02<1:13:25, 5.64s/it] 91%|█████████ | 7970/8750 [1:35:10<1:14:27, 5.73s/it] 91%|█████████ | 7970/8750 [1:35:08<1:14:27, 5.73s/it] {'loss': 0.4464, 'learning_rate': 4.139342803073632e-07, 'epoch': 0.91} + 91%|█████████ | 7970/8750 [1:35:10<1:14:27, 5.73s/it] {'loss': 0.4464, 'learning_rate': 4.139342803073632e-07, 'epoch': 0.91} + 91%|█████████ | 7970/8750 [1:35:08<1:14:27, 5.73s/it] 91%|█████████ | 7971/8750 [1:35:13<1:14:08, 5.71s/it] 91%|█████████ | 7971/8750 [1:35:15<1:14:08, 5.71s/it] {'loss': 0.4332, 'learning_rate': 4.128809507704445e-07, 'epoch': 0.91} + 91%|█████████ | 7971/8750 [1:35:15<1:14:08, 5.71s/it] {'loss': 0.4332, 'learning_rate': 4.128809507704445e-07, 'epoch': 0.91} + 91%|█████████ | 7971/8750 [1:35:13<1:14:08, 5.71s/it] 91%|█████████ | 7972/8750 [1:35:19<1:14:28, 5.74s/it] 91%|█████████ | 7972/8750 [1:35:21<1:14:28, 5.74s/it] {'loss': 0.451, 'learning_rate': 4.1182893488287965e-07, 'epoch': 0.91} + {'loss': 0.451, 'learning_rate': 4.1182893488287965e-07, 'epoch': 0.91} 91%|█████████ | 7972/8750 [1:35:21<1:14:28, 5.74s/it] + 91%|█████████ | 7972/8750 [1:35:19<1:14:28, 5.74s/it] 91%|█████████ | 7973/8750 [1:35:27<1:13:57, 5.71s/it] 91%|█████████ | 7973/8750 [1:35:25<1:13:57, 5.71s/it] {'loss': 0.4384, 'learning_rate': 4.1077823278881767e-07, 'epoch': 0.91} + 91%|█████████ | 7973/8750 [1:35:27<1:13:57, 5.71s/it] {'loss': 0.4384, 'learning_rate': 4.1077823278881767e-07, 'epoch': 0.91} + 91%|█████████ | 7973/8750 [1:35:25<1:13:57, 5.71s/it] 91%|█████████ | 7974/8750 [1:35:32<1:13:24, 5.68s/it] 91%|█████████ | 7974/8750 [1:35:30<1:13:24, 5.68s/it] {'loss': 0.4486, 'learning_rate': 4.097288446322278e-07, 'epoch': 0.91} + 91%|█████████ | 7974/8750 [1:35:32<1:13:24, 5.68s/it] {'loss': 0.4486, 'learning_rate': 4.097288446322278e-07, 'epoch': 0.91} + 91%|█████████ | 7974/8750 [1:35:30<1:13:24, 5.68s/it] 91%|█████████ | 7975/8750 [1:35:38<1:12:53, 5.64s/it] 91%|█████████ | 7975/8750 [1:35:36<1:12:53, 5.64s/it] {'loss': 0.4793, 'learning_rate': 
4.086807705569018e-07, 'epoch': 0.91} + 91%|█████████ | 7975/8750 [1:35:38<1:12:53, 5.64s/it] {'loss': 0.4793, 'learning_rate': 4.086807705569018e-07, 'epoch': 0.91} + 91%|█████████ | 7975/8750 [1:35:36<1:12:53, 5.64s/it] 91%|█████████ | 7976/8750 [1:35:44<1:14:17, 5.76s/it] 91%|█████████ | 7976/8750 [1:35:42<1:14:17, 5.76s/it] {'loss': 0.4487, 'learning_rate': 4.076340107064458e-07, 'epoch': 0.91} + 91%|█████████ | 7976/8750 [1:35:44<1:14:17, 5.76s/it] {'loss': 0.4487, 'learning_rate': 4.076340107064458e-07, 'epoch': 0.91} + 91%|█████████ | 7976/8750 [1:35:42<1:14:17, 5.76s/it] 91%|█████████ | 7977/8750 [1:35:50<1:13:57, 5.74s/it] 91%|█████████ | 7977/8750 [1:35:48<1:13:57, 5.74s/it] {'loss': 0.4378, 'learning_rate': 4.065885652242907e-07, 'epoch': 0.91} + {'loss': 0.4378, 'learning_rate': 4.065885652242907e-07, 'epoch': 0.91} 91%|█████████ | 7977/8750 [1:35:50<1:13:57, 5.74s/it] + 91%|█████████ | 7977/8750 [1:35:48<1:13:57, 5.74s/it] 91%|█████████ | 7978/8750 [1:35:55<1:13:26, 5.71s/it] 91%|█████████ | 7978/8750 [1:35:53<1:13:26, 5.71s/it] {'loss': 0.4562, 'learning_rate': 4.055444342536885e-07, 'epoch': 0.91} + 91%|█████████ | 7978/8750 [1:35:55<1:13:26, 5.71s/it] {'loss': 0.4562, 'learning_rate': 4.055444342536885e-07, 'epoch': 0.91} + 91%|█████████ | 7978/8750 [1:35:53<1:13:26, 5.71s/it] 91%|█████████ | 7979/8750 [1:35:59<1:13:44, 5.74s/it] 91%|█████████ | 7979/8750 [1:36:01<1:13:45, 5.74s/it] {'loss': 0.4411, 'learning_rate': 4.045016179377048e-07, 'epoch': 0.91} + 91%|█████████ | 7979/8750 [1:36:01<1:13:45, 5.74s/it] {'loss': 0.4411, 'learning_rate': 4.045016179377048e-07, 'epoch': 0.91} + 91%|█████████ | 7979/8750 [1:35:59<1:13:44, 5.74s/it] 91%|█████████ | 7980/8750 [1:36:05<1:13:47, 5.75s/it] 91%|█████████ | 7980/8750 [1:36:07<1:13:47, 5.75s/it] {'loss': 0.4305, 'learning_rate': 4.034601164192309e-07, 'epoch': 0.91} + {'loss': 0.4305, 'learning_rate': 4.034601164192309e-07, 'epoch': 0.91} 91%|█████████ | 7980/8750 [1:36:07<1:13:47, 5.75s/it] + 
91%|█████████ | 7980/8750 [1:36:05<1:13:47, 5.75s/it] 91%|█████████ | 7981/8750 [1:36:12<1:13:12, 5.71s/it] 91%|█████████ | 7981/8750 [1:36:10<1:13:12, 5.71s/it] {'loss': 0.4558, 'learning_rate': 4.024199298409737e-07, 'epoch': 0.91} + 91%|█████████ | 7981/8750 [1:36:12<1:13:12, 5.71s/it] {'loss': 0.4558, 'learning_rate': 4.024199298409737e-07, 'epoch': 0.91} + 91%|█████████ | 7981/8750 [1:36:10<1:13:12, 5.71s/it] 91%|█████████ | 7982/8750 [1:36:18<1:12:43, 5.68s/it] 91%|█████████ | 7982/8750 [1:36:16<1:12:43, 5.68s/it] {'loss': 0.4528, 'learning_rate': 4.013810583454647e-07, 'epoch': 0.91} + 91%|█████████ | 7982/8750 [1:36:18<1:12:43, 5.68s/it] {'loss': 0.4528, 'learning_rate': 4.013810583454647e-07, 'epoch': 0.91} + 91%|█████████ | 7982/8750 [1:36:16<1:12:43, 5.68s/it] 91%|█████████ | 7983/8750 [1:36:24<1:12:54, 5.70s/it] 91%|█████████ | 7983/8750 [1:36:22<1:12:54, 5.70s/it] {'loss': 0.4647, 'learning_rate': 4.0034350207505124e-07, 'epoch': 0.91} + 91%|█████████ | 7983/8750 [1:36:24<1:12:54, 5.70s/it] {'loss': 0.4647, 'learning_rate': 4.0034350207505124e-07, 'epoch': 0.91} + 91%|█████████ | 7983/8750 [1:36:22<1:12:54, 5.70s/it] 91%|█████████ | 7984/8750 [1:36:28<1:13:03, 5.72s/it] 91%|█████████ | 7984/8750 [1:36:30<1:13:03, 5.72s/it]{'loss': 0.4603, 'learning_rate': 3.9930726117190064e-07, 'epoch': 0.91} + {'loss': 0.4603, 'learning_rate': 3.9930726117190064e-07, 'epoch': 0.91} + 91%|█████████ | 7984/8750 [1:36:30<1:13:03, 5.72s/it] 91%|█████████ | 7984/8750 [1:36:28<1:13:03, 5.72s/it] 91%|█████████▏| 7985/8750 [1:36:35<1:13:43, 5.78s/it] 91%|█████████▏| 7985/8750 [1:36:34<1:13:43, 5.78s/it] {'loss': 0.4369, 'learning_rate': 3.982723357780027e-07, 'epoch': 0.91} + 91%|█████████▏| 7985/8750 [1:36:35<1:13:43, 5.78s/it] {'loss': 0.4369, 'learning_rate': 3.982723357780027e-07, 'epoch': 0.91} + 91%|█████████▏| 7985/8750 [1:36:34<1:13:43, 5.78s/it] 91%|█████████▏| 7986/8750 [1:36:39<1:13:32, 5.77s/it] 91%|█████████▏| 7986/8750 [1:36:41<1:13:32, 5.77s/it] {'loss': 
0.4506, 'learning_rate': 3.97238726035164e-07, 'epoch': 0.91} + {'loss': 0.4506, 'learning_rate': 3.97238726035164e-07, 'epoch': 0.91} + 91%|█████████▏| 7986/8750 [1:36:41<1:13:32, 5.77s/it] 91%|█████████▏| 7986/8750 [1:36:39<1:13:32, 5.77s/it] 91%|█████████▏| 7987/8750 [1:36:45<1:13:46, 5.80s/it] 91%|█████████▏| 7987/8750 [1:36:47<1:13:46, 5.80s/it] {'loss': 0.4454, 'learning_rate': 3.962064320850112e-07, 'epoch': 0.91} + 91%|█████████▏| 7987/8750 [1:36:47<1:13:46, 5.80s/it] {'loss': 0.4454, 'learning_rate': 3.962064320850112e-07, 'epoch': 0.91} + 91%|█████████▏| 7987/8750 [1:36:45<1:13:46, 5.80s/it] 91%|█████████▏| 7988/8750 [1:36:51<1:13:04, 5.75s/it] 91%|█████████▏| 7988/8750 [1:36:53<1:13:04, 5.75s/it] {'loss': 0.4581, 'learning_rate': 3.951754540689956e-07, 'epoch': 0.91} + 91%|█████████▏| 7988/8750 [1:36:53<1:13:04, 5.75s/it] {'loss': 0.4581, 'learning_rate': 3.951754540689956e-07, 'epoch': 0.91} + 91%|█████████▏| 7988/8750 [1:36:51<1:13:04, 5.75s/it] 91%|█████████▏| 7989/8750 [1:36:58<1:12:58, 5.75s/it] 91%|█████████▏| 7989/8750 [1:36:57<1:12:58, 5.75s/it] {'loss': 0.4423, 'learning_rate': 3.9414579212838087e-07, 'epoch': 0.91} + 91%|█████████▏| 7989/8750 [1:36:58<1:12:58, 5.75s/it] {'loss': 0.4423, 'learning_rate': 3.9414579212838087e-07, 'epoch': 0.91} + 91%|█████████▏| 7989/8750 [1:36:57<1:12:58, 5.75s/it] 91%|█████████▏| 7990/8750 [1:37:04<1:12:48, 5.75s/it] 91%|█████████▏| 7990/8750 [1:37:02<1:12:48, 5.75s/it] {'loss': 0.4595, 'learning_rate': 3.931174464042542e-07, 'epoch': 0.91} + 91%|█████████▏| 7990/8750 [1:37:04<1:12:48, 5.75s/it] {'loss': 0.4595, 'learning_rate': 3.931174464042542e-07, 'epoch': 0.91} + 91%|█████████▏| 7990/8750 [1:37:02<1:12:48, 5.75s/it] 91%|█████████▏| 7991/8750 [1:37:10<1:11:32, 5.65s/it] 91%|█████████▏| 7991/8750 [1:37:08<1:11:32, 5.65s/it] {'loss': 0.4608, 'learning_rate': 3.920904170375239e-07, 'epoch': 0.91} + 91%|█████████▏| 7991/8750 [1:37:10<1:11:32, 5.65s/it] {'loss': 0.4608, 'learning_rate': 3.920904170375239e-07, 
'epoch': 0.91} + 91%|█████████▏| 7991/8750 [1:37:08<1:11:32, 5.65s/it] 91%|█████████▏| 7992/8750 [1:37:15<1:11:43, 5.68s/it] 91%|█████████▏| 7992/8750 [1:37:13<1:11:43, 5.68s/it] {'loss': 0.4527, 'learning_rate': 3.9106470416891195e-07, 'epoch': 0.91} + 91%|█████████▏| 7992/8750 [1:37:15<1:11:43, 5.68s/it] {'loss': 0.4527, 'learning_rate': 3.9106470416891195e-07, 'epoch': 0.91} + 91%|█████████▏| 7992/8750 [1:37:13<1:11:43, 5.68s/it] 91%|█████████▏| 7993/8750 [1:37:21<1:11:57, 5.70s/it] 91%|█████████▏| 7993/8750 [1:37:19<1:11:57, 5.70s/it] {'loss': 0.4451, 'learning_rate': 3.9004030793896807e-07, 'epoch': 0.91} + 91%|█████████▏| 7993/8750 [1:37:21<1:11:57, 5.70s/it] {'loss': 0.4451, 'learning_rate': 3.9004030793896807e-07, 'epoch': 0.91} + 91%|█████████▏| 7993/8750 [1:37:19<1:11:57, 5.70s/it] 91%|█████████▏| 7994/8750 [1:37:27<1:11:48, 5.70s/it] 91%|█████████▏| 7994/8750 [1:37:25<1:11:48, 5.70s/it] {'loss': 0.4519, 'learning_rate': 3.8901722848805443e-07, 'epoch': 0.91} + 91%|█████████▏| 7994/8750 [1:37:27<1:11:48, 5.70s/it] {'loss': 0.4519, 'learning_rate': 3.8901722848805443e-07, 'epoch': 0.91} + 91%|█████████▏| 7994/8750 [1:37:25<1:11:48, 5.70s/it] 91%|█████████▏| 7995/8750 [1:37:31<1:11:34, 5.69s/it] 91%|█████████▏| 7995/8750 [1:37:32<1:11:34, 5.69s/it] {'loss': 0.4403, 'learning_rate': 3.8799546595635784e-07, 'epoch': 0.91} + 91%|█████████▏| 7995/8750 [1:37:33<1:11:34, 5.69s/it] {'loss': 0.4403, 'learning_rate': 3.8799546595635784e-07, 'epoch': 0.91} + 91%|█████████▏| 7995/8750 [1:37:31<1:11:34, 5.69s/it] 91%|█████████▏| 7996/8750 [1:37:37<1:12:27, 5.77s/it] 91%|█████████▏| 7996/8750 [1:37:38<1:12:27, 5.77s/it] {'loss': 0.4511, 'learning_rate': 3.8697502048387956e-07, 'epoch': 0.91} + 91%|█████████▏| 7996/8750 [1:37:38<1:12:27, 5.77s/it] {'loss': 0.4511, 'learning_rate': 3.8697502048387956e-07, 'epoch': 0.91} + 91%|█████████▏| 7996/8750 [1:37:37<1:12:27, 5.77s/it] 91%|█████████▏| 7997/8750 [1:37:44<1:12:58, 5.81s/it] 91%|█████████▏| 7997/8750 [1:37:42<1:12:58, 
5.81s/it] {'loss': 0.4411, 'learning_rate': 3.8595589221044674e-07, 'epoch': 0.91} + 91%|█████████▏| 7997/8750 [1:37:44<1:12:58, 5.81s/it] {'loss': 0.4411, 'learning_rate': 3.8595589221044674e-07, 'epoch': 0.91} + 91%|█████████▏| 7997/8750 [1:37:42<1:12:58, 5.81s/it] 91%|█████████▏| 7998/8750 [1:37:50<1:12:11, 5.76s/it] 91%|█████████▏| 7998/8750 [1:37:48<1:12:11, 5.76s/it] {'loss': 0.4515, 'learning_rate': 3.84938081275702e-07, 'epoch': 0.91} + 91%|█████████▏| 7998/8750 [1:37:50<1:12:11, 5.76s/it] {'loss': 0.4515, 'learning_rate': 3.84938081275702e-07, 'epoch': 0.91} + 91%|█████████▏| 7998/8750 [1:37:48<1:12:11, 5.76s/it] 91%|█████████▏| 7999/8750 [1:37:56<1:11:48, 5.74s/it] 91%|█████████▏| 7999/8750 [1:37:54<1:11:48, 5.74s/it] {'loss': 0.4685, 'learning_rate': 3.839215878191083e-07, 'epoch': 0.91} + 91%|█████████▏| 7999/8750 [1:37:56<1:11:48, 5.74s/it] {'loss': 0.4685, 'learning_rate': 3.839215878191083e-07, 'epoch': 0.91} + 91%|█████████▏| 7999/8750 [1:37:54<1:11:48, 5.74s/it]1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +1312 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 91%|█████████▏| 8000/8750 [1:38:01<1:11:34, 5.73s/it]6 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... + 15 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend...11 AutoResumeHook: Checking whether to suspend... +4 + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... 91%|█████████▏| 8000/8750 [1:37:59<1:11:34, 5.73s/it]7 AutoResumeHook: Checking whether to suspend... 
+ + {'loss': 0.4395, 'learning_rate': 3.8290641197994526e-07, 'epoch': 0.91} + {'loss': 0.4395, 'learning_rate': 3.8290641197994526e-07, 'epoch': 0.91} + 91%|█████████▏| 8000/8750 [1:38:01<1:11:34, 5.73s/it] 91%|█████████▏| 8000/8750 [1:37:59<1:11:34, 5.73s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8000/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8000/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8000/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 91%|█████████▏| 8001/8750 [1:38:18<1:59:38, 9.58s/it] 91%|█████████▏| 8001/8750 [1:38:20<1:59:38, 9.58s/it] {'loss': 0.4388, 'learning_rate': 3.8189255389731837e-07, 'epoch': 0.91} + 91%|█████████▏| 8001/8750 [1:38:20<1:59:38, 9.58s/it] {'loss': 0.4388, 'learning_rate': 3.8189255389731837e-07, 'epoch': 0.91} + 91%|█████████▏| 8001/8750 [1:38:18<1:59:38, 9.58s/it] 91%|█████████▏| 8002/8750 [1:38:24<1:45:03, 8.43s/it] 91%|█████████▏| 8002/8750 [1:38:26<1:45:03, 8.43s/it]{'loss': 0.4422, 'learning_rate': 3.808800137101465e-07, 'epoch': 0.91} + {'loss': 0.4422, 'learning_rate': 3.808800137101465e-07, 'epoch': 0.91} + 91%|█████████▏| 8002/8750 [1:38:26<1:45:03, 8.43s/it] 91%|█████████▏| 8002/8750 [1:38:24<1:45:03, 8.43s/it] 91%|█████████▏| 8003/8750 [1:38:32<1:35:09, 7.64s/it] 91%|█████████▏| 8003/8750 [1:38:30<1:35:09, 7.64s/it] {'loss': 0.4631, 'learning_rate': 3.7986879155717084e-07, 'epoch': 0.91} + {'loss': 0.4631, 'learning_rate': 3.7986879155717084e-07, 'epoch': 0.91} 91%|█████████▏| 8003/8750 [1:38:32<1:35:09, 7.64s/it] + 91%|█████████▏| 8003/8750 [1:38:30<1:35:09, 7.64s/it] 91%|█████████▏| 8004/8750 [1:38:35<1:27:56, 7.07s/it] 91%|█████████▏| 8004/8750 [1:38:37<1:27:57, 7.07s/it] {'loss': 0.4384, 'learning_rate': 3.7885888757695054e-07, 'epoch': 0.91} + 91%|█████████▏| 8004/8750 [1:38:37<1:27:57, 7.07s/it] {'loss': 0.4384, 'learning_rate': 3.7885888757695054e-07, 'epoch': 0.91} + 91%|█████████▏| 8004/8750 [1:38:35<1:27:56, 7.07s/it] 91%|█████████▏| 8005/8750 [1:38:41<1:23:24, 6.72s/it] 91%|█████████▏| 8005/8750 [1:38:43<1:23:24, 6.72s/it] {'loss': 0.4312, 'learning_rate': 3.778503019078672e-07, 'epoch': 0.91} + 91%|█████████▏| 8005/8750 [1:38:43<1:23:24, 6.72s/it] {'loss': 0.4312, 'learning_rate': 3.778503019078672e-07, 'epoch': 0.91} + 91%|█████████▏| 8005/8750 [1:38:41<1:23:24, 6.72s/it] 91%|█████████▏| 8006/8750 [1:38:47<1:19:57, 6.45s/it] 91%|█████████▏| 8006/8750 [1:38:49<1:19:57, 6.45s/it] {'loss': 0.4415, 'learning_rate': 
3.768430346881169e-07, 'epoch': 0.91} + 91%|█████████▏| 8006/8750 [1:38:49<1:19:57, 6.45s/it] {'loss': 0.4415, 'learning_rate': 3.768430346881169e-07, 'epoch': 0.91} + 91%|█████████▏| 8006/8750 [1:38:47<1:19:57, 6.45s/it] 92%|█████████▏| 8007/8750 [1:38:55<1:16:56, 6.21s/it] 92%|█████████▏| 8007/8750 [1:38:53<1:16:56, 6.21s/it] {'loss': 0.4421, 'learning_rate': 3.7583708605571923e-07, 'epoch': 0.92} + 92%|█████████▏| 8007/8750 [1:38:55<1:16:56, 6.21s/it] {'loss': 0.4421, 'learning_rate': 3.7583708605571923e-07, 'epoch': 0.92} + 92%|█████████▏| 8007/8750 [1:38:53<1:16:56, 6.21s/it] 92%|█████████▏| 8008/8750 [1:39:00<1:14:27, 6.02s/it] 92%|█████████▏| 8008/8750 [1:38:58<1:14:27, 6.02s/it] {'loss': 0.4674, 'learning_rate': 3.748324561485128e-07, 'epoch': 0.92} + 92%|█████████▏| 8008/8750 [1:39:00<1:14:27, 6.02s/it] {'loss': 0.4674, 'learning_rate': 3.748324561485128e-07, 'epoch': 0.92} + 92%|█████████▏| 8008/8750 [1:38:58<1:14:27, 6.02s/it] 92%|█████████▏| 8009/8750 [1:39:06<1:13:39, 5.96s/it] 92%|█████████▏| 8009/8750 [1:39:04<1:13:39, 5.96s/it] {'loss': 0.4332, 'learning_rate': 3.7382914510415316e-07, 'epoch': 0.92} + 92%|█████████▏| 8009/8750 [1:39:06<1:13:39, 5.96s/it] {'loss': 0.4332, 'learning_rate': 3.7382914510415316e-07, 'epoch': 0.92} + 92%|█████████▏| 8009/8750 [1:39:04<1:13:39, 5.96s/it] 92%|█████████▏| 8010/8750 [1:39:12<1:12:27, 5.88s/it] 92%|█████████▏| 8010/8750 [1:39:10<1:12:27, 5.88s/it] {'loss': 0.4402, 'learning_rate': 3.7282715306011465e-07, 'epoch': 0.92} + 92%|█████████▏| 8010/8750 [1:39:12<1:12:27, 5.88s/it] {'loss': 0.4402, 'learning_rate': 3.7282715306011465e-07, 'epoch': 0.92} + 92%|█████████▏| 8010/8750 [1:39:10<1:12:27, 5.88s/it] 92%|█████████▏| 8011/8750 [1:39:15<1:11:19, 5.79s/it] 92%|█████████▏| 8011/8750 [1:39:17<1:11:19, 5.79s/it] {'loss': 0.4535, 'learning_rate': 3.7182648015369524e-07, 'epoch': 0.92} + {'loss': 0.4535, 'learning_rate': 3.7182648015369524e-07, 'epoch': 0.92} 92%|█████████▏| 8011/8750 [1:39:17<1:11:19, 5.79s/it] + 
92%|█████████▏| 8011/8750 [1:39:15<1:11:19, 5.79s/it] 92%|█████████▏| 8012/8750 [1:39:23<1:12:04, 5.86s/it] 92%|█████████▏| 8012/8750 [1:39:21<1:12:04, 5.86s/it] {'loss': 0.4524, 'learning_rate': 3.708271265220087e-07, 'epoch': 0.92} + 92%|█████████▏| 8012/8750 [1:39:23<1:12:04, 5.86s/it] {'loss': 0.4524, 'learning_rate': 3.708271265220087e-07, 'epoch': 0.92} + 92%|█████████▏| 8012/8750 [1:39:21<1:12:04, 5.86s/it] 92%|█████████▏| 8013/8750 [1:39:29<1:11:44, 5.84s/it] 92%|█████████▏| 8013/8750 [1:39:27<1:11:44, 5.84s/it] {'loss': 0.4364, 'learning_rate': 3.698290923019865e-07, 'epoch': 0.92} + 92%|█████████▏| 8013/8750 [1:39:29<1:11:44, 5.84s/it] {'loss': 0.4364, 'learning_rate': 3.698290923019865e-07, 'epoch': 0.92} + 92%|█████████▏| 8013/8750 [1:39:27<1:11:44, 5.84s/it] 92%|█████████▏| 8014/8750 [1:39:35<1:12:21, 5.90s/it] 92%|█████████▏| 8014/8750 [1:39:33<1:12:21, 5.90s/it] {'loss': 0.4509, 'learning_rate': 3.688323776303837e-07, 'epoch': 0.92} + 92%|█████████▏| 8014/8750 [1:39:35<1:12:21, 5.90s/it] {'loss': 0.4509, 'learning_rate': 3.688323776303837e-07, 'epoch': 0.92} + 92%|█████████▏| 8014/8750 [1:39:33<1:12:21, 5.90s/it] 92%|█████████▏| 8015/8750 [1:39:41<1:11:08, 5.81s/it] 92%|█████████▏| 8015/8750 [1:39:39<1:11:08, 5.81s/it] {'loss': 0.4523, 'learning_rate': 3.678369826437733e-07, 'epoch': 0.92} + 92%|█████████▏| 8015/8750 [1:39:41<1:11:08, 5.81s/it] {'loss': 0.4523, 'learning_rate': 3.678369826437733e-07, 'epoch': 0.92} + 92%|█████████▏| 8015/8750 [1:39:39<1:11:08, 5.81s/it] 92%|█████████▏| 8016/8750 [1:39:44<1:10:20, 5.75s/it] 92%|█████████▏| 8016/8750 [1:39:46<1:10:20, 5.75s/it] {'loss': 0.4636, 'learning_rate': 3.668429074785451e-07, 'epoch': 0.92} + 92%|█████████▏| 8016/8750 [1:39:46<1:10:20, 5.75s/it] {'loss': 0.4636, 'learning_rate': 3.668429074785451e-07, 'epoch': 0.92} + 92%|█████████▏| 8016/8750 [1:39:44<1:10:20, 5.75s/it] 92%|█████████▏| 8017/8750 [1:39:52<1:09:33, 5.69s/it] 92%|█████████▏| 8017/8750 [1:39:50<1:09:33, 5.69s/it] {'loss': 0.4432, 
'learning_rate': 3.6585015227091013e-07, 'epoch': 0.92} + 92%|█████████▏| 8017/8750 [1:39:52<1:09:33, 5.69s/it] {'loss': 0.4432, 'learning_rate': 3.6585015227091013e-07, 'epoch': 0.92} + 92%|█████████▏| 8017/8750 [1:39:50<1:09:33, 5.69s/it] 92%|█████████▏| 8018/8750 [1:39:58<1:10:05, 5.75s/it] 92%|█████████▏| 8018/8750 [1:39:56<1:10:05, 5.75s/it] {'loss': 0.4289, 'learning_rate': 3.6485871715689735e-07, 'epoch': 0.92} + 92%|█████████▏| 8018/8750 [1:39:58<1:10:05, 5.75s/it] {'loss': 0.4289, 'learning_rate': 3.6485871715689735e-07, 'epoch': 0.92} + 92%|█████████▏| 8018/8750 [1:39:56<1:10:05, 5.75s/it] 92%|█████████▏| 8019/8750 [1:40:04<1:10:53, 5.82s/it] 92%|█████████▏| 8019/8750 [1:40:02<1:10:53, 5.82s/it] {'loss': 0.4573, 'learning_rate': 3.63868602272357e-07, 'epoch': 0.92} + 92%|█████████▏| 8019/8750 [1:40:04<1:10:53, 5.82s/it] {'loss': 0.4573, 'learning_rate': 3.63868602272357e-07, 'epoch': 0.92} + 92%|█████████▏| 8019/8750 [1:40:02<1:10:53, 5.82s/it] 92%|█████████▏| 8020/8750 [1:40:09<1:10:24, 5.79s/it] 92%|█████████▏| 8020/8750 [1:40:08<1:10:24, 5.79s/it] {'loss': 0.4517, 'learning_rate': 3.6287980775295603e-07, 'epoch': 0.92} + 92%|█████████▏| 8020/8750 [1:40:09<1:10:24, 5.79s/it] {'loss': 0.4517, 'learning_rate': 3.6287980775295603e-07, 'epoch': 0.92} + 92%|█████████▏| 8020/8750 [1:40:08<1:10:24, 5.79s/it] 92%|█████████▏| 8021/8750 [1:40:13<1:09:57, 5.76s/it] 92%|█████████▏| 8021/8750 [1:40:15<1:09:57, 5.76s/it] {'loss': 0.4437, 'learning_rate': 3.6189233373418064e-07, 'epoch': 0.92} + 92%|█████████▏| 8021/8750 [1:40:15<1:09:57, 5.76s/it] {'loss': 0.4437, 'learning_rate': 3.6189233373418064e-07, 'epoch': 0.92} + 92%|█████████▏| 8021/8750 [1:40:13<1:09:57, 5.76s/it] 92%|█████████▏| 8022/8750 [1:40:19<1:09:39, 5.74s/it] 92%|█████████▏| 8022/8750 [1:40:21<1:09:39, 5.74s/it]{'loss': 0.4425, 'learning_rate': 3.609061803513392e-07, 'epoch': 0.92} + {'loss': 0.4425, 'learning_rate': 3.609061803513392e-07, 'epoch': 0.92} + 92%|█████████▏| 8022/8750 [1:40:21<1:09:39, 
5.74s/it] 92%|█████████▏| 8022/8750 [1:40:19<1:09:39, 5.74s/it] 92%|█████████▏| 8023/8750 [1:40:27<1:09:26, 5.73s/it] 92%|█████████▏| 8023/8750 [1:40:25<1:09:26, 5.73s/it] {'loss': 0.4543, 'learning_rate': 3.5992134773955354e-07, 'epoch': 0.92} + 92%|█████████▏| 8023/8750 [1:40:27<1:09:26, 5.73s/it] {'loss': 0.4543, 'learning_rate': 3.5992134773955354e-07, 'epoch': 0.92} + 92%|█████████▏| 8023/8750 [1:40:25<1:09:26, 5.73s/it]WARNING: tokenization mismatch: 1 vs. 1440. [[{'from': 'human', 'value': 'ASSISTANT: Du bist ein Personalmarketing-Experte mit den folgenden Kenntnissen und Eigenschaften.\n\nKENNTNISSE: Personalmarketing, Social Media Marketing, Facebook-Marketing, Kampagnenentwicklung, Zielgruppenanalyse, Content-Strategie, Werbeanzeigen, Performance-Messung, Personalgewinnung, Steuerfachangestellte.\n\nEIGENSCHAFTEN: Kreativ, strategisch, analytisch, kommunikativ, zielorientiert, detailverliebt, teamorientiert.\n\nAUFGABE: Entwickle eine erfolgreiche Personalmarketing-Strategie für die Gewinnung von Steuerfachangestellten über Facebook. Erstelle eine Tabelle mit allen Maßnahmen und weise jeder Maßnahme den zuständigen Spezialisten in einer Marketingagentur zu. Achte dabei auf Zielgruppenanalyse, Content-Strategie, Werbeanzeigen und Performance-Messung.\n\nOUTPUT: Eine Tabelle mit allen Maßnahmen für eine erfolgreiche Personalmarketing-Kampagne zur Gewinnung von Steuerfachangestellten auf Facebook, die die Verantwortlichkeiten der Spezialisten in einer Marketingagentur aufzeigt, einschließlich Zielgruppenanalyse, Content-Strategie, Werbeanzeigen und Performance-Messung.'}, {'from': 'gpt', 'value': '| Maßnahme | Beschreibung | Zuständiger Spezialist |\n| --- | --- | --- |\n| Zielgruppenanalyse | Analyse der demografischen Daten, Interessen und Verhaltensweisen der potenziellen Bewerber. | Marktanalyst |\n| Erstellung von Personas | Entwicklung von fiktiven, repräsentativen Bewerberprofilen zur besseren Zielgruppenansprache. 
| Personalmarketing-Experte |\n| Content-Strategie | Planung von relevantem, ansprechendem und zielgruppenspezifischem Content. | Content-Stratege |\n| Redaktionsplan | Erstellung eines Zeitplans für die Veröffentlichung von Inhalten auf Facebook. | Social Media Manager |\n| Erstellung von Werbeanzeigen | Design und Texten von ansprechenden, zielgruppenspezifischen Facebook-Werbeanzeigen. | Art Director & Copywriter |\n| Auswahl von Targeting-Optionen | Festlegung der Zielgruppenparameter für die Facebook-Werbekampagnen. | Facebook-Marketing-Experte |\n| Budgetierung und Zeitplanung | Festlegung des Budgets und der Laufzeit für die Facebook-Werbekampagnen. | Media Planner |\n| Erstellung von Landing Pages | Gestaltung und Implementierung von spezifischen Landing Pages für die Bewerbung. | Web-Designer |\n| Performance-Messung | Überwachung und Analyse der Kampagnenleistung, einschließlich KPIs wie Klicks und Bewerbungen. | Data Analyst |\n| A/B-Testing | Durchführung von A/B-Tests, um die Effektivität von Anzeigen und Landing Pages zu optimieren. | Conversion-Optimierungsexperte |\n| Community-Management | Interaktion mit der Zielgruppe auf Facebook, Beantwortung von Fragen und Moderation von Diskussionen. | Community Manager |\n| Reporting und Optimierung | Regelmäßige Berichterstattung über die Kampagnenergebnisse und Anpassung der Strategie. | Performance Marketing-Manager |'}, {'from': 'human', 'value': 'Weiter.'}, {'from': 'gpt', 'value': 'Hier sind einige zusätzliche Maßnahmen, die Sie in Ihre Personalmarketing-Strategie für die Gewinnung von Steuerfachangestellten über Facebook einbeziehen können:\n\n| Maßnahme | Beschreibung | Zuständiger Spezialist |\n| --- | --- | --- |\n| Video-Content | Erstellung von Video-Content, wie Interviews mit aktuellen Mitarbeitern oder Berufsalltagsvideos. | Videoproduzent |\n| Influencer-Marketing | Zusammenarbeit mit Influencern aus der Branche, um Ihre Arbeitgebermarke zu stärken. 
| Influencer-Marketing-Experte |\n| Anpassung von Stellenanzeigen | Anpassung der Stellenanzeigen auf Facebook, um die Zielgruppe besser anzusprechen. | Personalmarketing-Experte |\n| Mitarbeiter-Testimonials | Teilen von Erfahrungsberichten und Zitaten der Mitarbeiter auf Facebook, um Authentizität zu schaffen. | Content-Stratege |\n| Virtuelle Karriere-Events | Durchführung von virtuellen Karriere-Events oder Informationsveranstaltungen auf Facebook. | Event-Manager |\n| Interaktive Umfragen und Quiz | Nutzung von interaktiven Umfragen und Quiz auf Facebook, um das Engagement zu erhöhen. | Social Media Manager |\n| Remarketing-Kampagnen | Einsatz von Remarketing-Anzeigen, um die Personen erneut anzusprechen, die bereits interessiert waren. | Facebook-Marketing-Experte |\n| Monitoring von Wettbewerbern | Beobachtung der Personalmarketing-Aktivitäten der Wettbewerber auf Facebook und Ableitung von Optimierungspotenzialen. | Marktanalyst |\n| Integration von Chatbots | Implementierung von Chatbots auf der Facebook-Seite und der Landing Page zur Automatisierung des Bewerbungsprozesses. | Chatbot-Entwickler |\n| Employer Branding-Kampagne | Entwicklung einer Employer Branding-Kampagne, die die Unternehmenskultur und -werte hervorhebt. 
| Employer Branding-Experte |\n\nMit diesen erweiterten Maßnahmen können Sie eine noch umfassendere Personalmarketing-Strategie entwickeln, die die Chancen erhöht, qualifizierte Steuerfachangestellte über Facebook zu gewinnen.'}]] (ignored) + 92%|█████████▏| 8024/8750 [1:40:30<1:08:51, 5.69s/it] 92%|█████████▏| 8024/8750 [1:40:32<1:08:52, 5.69s/it] {'loss': 0.4701, 'learning_rate': 3.589378360337692e-07, 'epoch': 0.92} + 92%|█████████▏| 8024/8750 [1:40:32<1:08:52, 5.69s/it] {'loss': 0.4701, 'learning_rate': 3.589378360337692e-07, 'epoch': 0.92} + 92%|█████████▏| 8024/8750 [1:40:30<1:08:51, 5.69s/it] 92%|█████████▏| 8025/8750 [1:40:38<1:09:02, 5.71s/it] 92%|█████████▏| 8025/8750 [1:40:36<1:09:02, 5.71s/it] {'loss': 0.4521, 'learning_rate': 3.579556453687494e-07, 'epoch': 0.92} + 92%|█████████▏| 8025/8750 [1:40:38<1:09:02, 5.71s/it] {'loss': 0.4521, 'learning_rate': 3.579556453687494e-07, 'epoch': 0.92} + 92%|█████████▏| 8025/8750 [1:40:36<1:09:02, 5.71s/it] 92%|█████████▏| 8026/8750 [1:40:42<1:08:57, 5.72s/it] 92%|█████████▏| 8026/8750 [1:40:44<1:08:57, 5.72s/it] {'loss': 0.4548, 'learning_rate': 3.569747758790765e-07, 'epoch': 0.92} + 92%|█████████▏| 8026/8750 [1:40:44<1:08:57, 5.72s/it] {'loss': 0.4548, 'learning_rate': 3.569747758790765e-07, 'epoch': 0.92} + 92%|█████████▏| 8026/8750 [1:40:42<1:08:57, 5.72s/it] 92%|█████████▏| 8027/8750 [1:40:49<1:08:17, 5.67s/it] 92%|█████████▏| 8027/8750 [1:40:47<1:08:17, 5.67s/it] {'loss': 0.4471, 'learning_rate': 3.5599522769915074e-07, 'epoch': 0.92} + 92%|█████████▏| 8027/8750 [1:40:49<1:08:17, 5.67s/it] {'loss': 0.4471, 'learning_rate': 3.5599522769915074e-07, 'epoch': 0.92} + 92%|█████████▏| 8027/8750 [1:40:47<1:08:17, 5.67s/it] 92%|█████████▏| 8028/8750 [1:40:55<1:07:41, 5.63s/it] 92%|█████████▏| 8028/8750 [1:40:53<1:07:41, 5.63s/it] {'loss': 0.443, 'learning_rate': 3.550170009631926e-07, 'epoch': 0.92} + 92%|█████████▏| 8028/8750 [1:40:55<1:07:41, 5.63s/it] {'loss': 0.443, 'learning_rate': 3.550170009631926e-07, 
'epoch': 0.92} + 92%|█████████▏| 8028/8750 [1:40:53<1:07:41, 5.63s/it] 92%|█████████▏| 8029/8750 [1:40:58<1:07:37, 5.63s/it] 92%|█████████▏| 8029/8750 [1:41:00<1:07:37, 5.63s/it] {'loss': 0.4381, 'learning_rate': 3.5404009580524144e-07, 'epoch': 0.92} + 92%|█████████▏| 8029/8750 [1:41:00<1:07:37, 5.63s/it] {'loss': 0.4381, 'learning_rate': 3.5404009580524144e-07, 'epoch': 0.92} + 92%|█████████▏| 8029/8750 [1:40:58<1:07:37, 5.63s/it] 92%|█████████▏| 8030/8750 [1:41:06<1:08:04, 5.67s/it] 92%|█████████▏| 8030/8750 [1:41:04<1:08:04, 5.67s/it] {'loss': 0.4476, 'learning_rate': 3.5306451235915475e-07, 'epoch': 0.92} + 92%|█████████▏| 8030/8750 [1:41:06<1:08:04, 5.67s/it] {'loss': 0.4476, 'learning_rate': 3.5306451235915475e-07, 'epoch': 0.92} + 92%|█████████▏| 8030/8750 [1:41:04<1:08:04, 5.67s/it] 92%|█████████▏| 8031/8750 [1:41:10<1:08:13, 5.69s/it] 92%|█████████▏| 8031/8750 [1:41:12<1:08:13, 5.69s/it] {'loss': 0.4408, 'learning_rate': 3.520902507586077e-07, 'epoch': 0.92} + 92%|█████████▏| 8031/8750 [1:41:12<1:08:13, 5.69s/it] {'loss': 0.4408, 'learning_rate': 3.520902507586077e-07, 'epoch': 0.92} + 92%|█████████▏| 8031/8750 [1:41:10<1:08:13, 5.69s/it] 92%|█████████▏| 8032/8750 [1:41:16<1:07:35, 5.65s/it] 92%|█████████▏| 8032/8750 [1:41:17<1:07:35, 5.65s/it] {'loss': 0.4631, 'learning_rate': 3.51117311137098e-07, 'epoch': 0.92} + 92%|█████████▏| 8032/8750 [1:41:17<1:07:35, 5.65s/it] {'loss': 0.4631, 'learning_rate': 3.51117311137098e-07, 'epoch': 0.92} + 92%|█████████▏| 8032/8750 [1:41:16<1:07:35, 5.65s/it] 92%|█████████▏| 8033/8750 [1:41:23<1:07:05, 5.61s/it] 92%|█████████▏| 8033/8750 [1:41:21<1:07:05, 5.61s/it] {'loss': 0.4327, 'learning_rate': 3.50145693627939e-07, 'epoch': 0.92} + 92%|█████████▏| 8033/8750 [1:41:23<1:07:05, 5.61s/it] {'loss': 0.4327, 'learning_rate': 3.50145693627939e-07, 'epoch': 0.92} + 92%|█████████▏| 8033/8750 [1:41:21<1:07:05, 5.61s/it] 92%|█████████▏| 8034/8750 [1:41:27<1:07:58, 5.70s/it] 92%|█████████▏| 8034/8750 [1:41:29<1:07:58, 5.70s/it] 
{'loss': 0.4601, 'learning_rate': 3.4917539836426317e-07, 'epoch': 0.92} + {'loss': 0.4601, 'learning_rate': 3.4917539836426317e-07, 'epoch': 0.92} 92%|█████████▏| 8034/8750 [1:41:29<1:07:58, 5.70s/it] + 92%|█████████▏| 8034/8750 [1:41:27<1:07:58, 5.70s/it] 92%|█████████▏| 8035/8750 [1:41:35<1:07:58, 5.70s/it] 92%|█████████▏| 8035/8750 [1:41:33<1:07:58, 5.70s/it] {'loss': 0.4565, 'learning_rate': 3.4820642547902516e-07, 'epoch': 0.92} + 92%|█████████▏| 8035/8750 [1:41:35<1:07:58, 5.70s/it] {'loss': 0.4565, 'learning_rate': 3.4820642547902516e-07, 'epoch': 0.92} + 92%|█████████▏| 8035/8750 [1:41:33<1:07:58, 5.70s/it] 92%|█████████▏| 8036/8750 [1:41:40<1:07:31, 5.67s/it] 92%|█████████▏| 8036/8750 [1:41:38<1:07:31, 5.67s/it] {'loss': 0.4424, 'learning_rate': 3.472387751049944e-07, 'epoch': 0.92} + 92%|█████████▏| 8036/8750 [1:41:40<1:07:31, 5.67s/it] {'loss': 0.4424, 'learning_rate': 3.472387751049944e-07, 'epoch': 0.92} + 92%|█████████▏| 8036/8750 [1:41:38<1:07:31, 5.67s/it] 92%|█████████▏| 8037/8750 [1:41:46<1:06:47, 5.62s/it] 92%|█████████▏| 8037/8750 [1:41:44<1:06:47, 5.62s/it] {'loss': 0.459, 'learning_rate': 3.462724473747603e-07, 'epoch': 0.92} + 92%|█████████▏| 8037/8750 [1:41:46<1:06:47, 5.62s/it] {'loss': 0.459, 'learning_rate': 3.462724473747603e-07, 'epoch': 0.92} + 92%|█████████▏| 8037/8750 [1:41:44<1:06:47, 5.62s/it] 92%|█████████▏| 8038/8750 [1:41:51<1:06:44, 5.62s/it] 92%|█████████▏| 8038/8750 [1:41:49<1:06:45, 5.63s/it] {'loss': 0.4567, 'learning_rate': 3.4530744242073143e-07, 'epoch': 0.92} + 92%|█████████▏| 8038/8750 [1:41:51<1:06:44, 5.62s/it] {'loss': 0.4567, 'learning_rate': 3.4530744242073143e-07, 'epoch': 0.92} + 92%|█████████▏| 8038/8750 [1:41:49<1:06:45, 5.63s/it] 92%|█████████▏| 8039/8750 [1:41:57<1:07:47, 5.72s/it] 92%|█████████▏| 8039/8750 [1:41:55<1:07:47, 5.72s/it] {'loss': 0.4382, 'learning_rate': 3.443437603751354e-07, 'epoch': 0.92} + 92%|█████████▏| 8039/8750 [1:41:57<1:07:47, 5.72s/it] {'loss': 0.4382, 'learning_rate': 
3.443437603751354e-07, 'epoch': 0.92} + 92%|█████████▏| 8039/8750 [1:41:55<1:07:47, 5.72s/it] 92%|█████████▏| 8040/8750 [1:42:01<1:07:55, 5.74s/it] 92%|█████████▏| 8040/8750 [1:42:03<1:07:56, 5.74s/it] {'loss': 0.4704, 'learning_rate': 3.433814013700187e-07, 'epoch': 0.92} + 92%|█████████▏| 8040/8750 [1:42:03<1:07:56, 5.74s/it] {'loss': 0.4704, 'learning_rate': 3.433814013700187e-07, 'epoch': 0.92} + 92%|█████████▏| 8040/8750 [1:42:01<1:07:55, 5.74s/it] 92%|█████████▏| 8041/8750 [1:42:07<1:07:24, 5.70s/it] 92%|█████████▏| 8041/8750 [1:42:09<1:07:24, 5.70s/it]{'loss': 0.4377, 'learning_rate': 3.424203655372438e-07, 'epoch': 0.92} + {'loss': 0.4377, 'learning_rate': 3.424203655372438e-07, 'epoch': 0.92} + 92%|█████████▏| 8041/8750 [1:42:09<1:07:24, 5.70s/it] 92%|█████████▏| 8041/8750 [1:42:07<1:07:24, 5.70s/it] 92%|█████████▏| 8042/8750 [1:42:14<1:07:26, 5.72s/it] 92%|█████████▏| 8042/8750 [1:42:13<1:07:26, 5.72s/it] {'loss': 0.4581, 'learning_rate': 3.414606530084974e-07, 'epoch': 0.92} + 92%|█████████▏| 8042/8750 [1:42:14<1:07:26, 5.72s/it] {'loss': 0.4581, 'learning_rate': 3.414606530084974e-07, 'epoch': 0.92} + 92%|█████████▏| 8042/8750 [1:42:13<1:07:26, 5.72s/it] 92%|█████████▏| 8043/8750 [1:42:20<1:08:16, 5.79s/it] 92%|█████████▏| 8043/8750 [1:42:18<1:08:16, 5.79s/it] {'loss': 0.4452, 'learning_rate': 3.405022639152777e-07, 'epoch': 0.92} + 92%|█████████▏| 8043/8750 [1:42:20<1:08:16, 5.79s/it] {'loss': 0.4452, 'learning_rate': 3.405022639152777e-07, 'epoch': 0.92} + 92%|█████████▏| 8043/8750 [1:42:18<1:08:16, 5.79s/it] 92%|█████████▏| 8044/8750 [1:42:26<1:07:52, 5.77s/it] 92%|█████████▏| 8044/8750 [1:42:24<1:07:52, 5.77s/it] {'loss': 0.4596, 'learning_rate': 3.3954519838890866e-07, 'epoch': 0.92} + 92%|█████████▏| 8044/8750 [1:42:26<1:07:52, 5.77s/it] {'loss': 0.4596, 'learning_rate': 3.3954519838890866e-07, 'epoch': 0.92} + 92%|█████████▏| 8044/8750 [1:42:24<1:07:52, 5.77s/it] 92%|█████████▏| 8045/8750 [1:42:32<1:06:45, 5.68s/it] 92%|█████████▏| 8045/8750 
[1:42:30<1:06:45, 5.68s/it] {'loss': 0.4302, 'learning_rate': 3.3858945656052855e-07, 'epoch': 0.92} + 92%|█████████▏| 8045/8750 [1:42:32<1:06:45, 5.68s/it] {'loss': 0.4302, 'learning_rate': 3.3858945656052855e-07, 'epoch': 0.92} + 92%|█████████▏| 8045/8750 [1:42:30<1:06:45, 5.68s/it] 92%|█████████▏| 8046/8750 [1:42:38<1:08:01, 5.80s/it] 92%|█████████▏| 8046/8750 [1:42:36<1:08:01, 5.80s/it] {'loss': 0.4521, 'learning_rate': 3.376350385610938e-07, 'epoch': 0.92} + 92%|█████████▏| 8046/8750 [1:42:38<1:08:01, 5.80s/it] {'loss': 0.4521, 'learning_rate': 3.376350385610938e-07, 'epoch': 0.92} + 92%|█████████▏| 8046/8750 [1:42:36<1:08:01, 5.80s/it] 92%|█████████▏| 8047/8750 [1:42:43<1:07:27, 5.76s/it] 92%|█████████▏| 8047/8750 [1:42:41<1:07:27, 5.76s/it] {'loss': 0.4579, 'learning_rate': 3.3668194452138423e-07, 'epoch': 0.92} + 92%|█████████▏| 8047/8750 [1:42:43<1:07:27, 5.76s/it] {'loss': 0.4579, 'learning_rate': 3.3668194452138423e-07, 'epoch': 0.92} + 92%|█████████▏| 8047/8750 [1:42:41<1:07:27, 5.76s/it] 92%|█████████▏| 8048/8750 [1:42:49<1:07:06, 5.74s/it] 92%|█████████▏| 8048/8750 [1:42:47<1:07:06, 5.74s/it] {'loss': 0.4506, 'learning_rate': 3.357301745719932e-07, 'epoch': 0.92} + 92%|█████████▏| 8048/8750 [1:42:49<1:07:06, 5.74s/it] {'loss': 0.4506, 'learning_rate': 3.357301745719932e-07, 'epoch': 0.92} + 92%|█████████▏| 8048/8750 [1:42:47<1:07:06, 5.74s/it] 92%|█████████▏| 8049/8750 [1:42:53<1:06:37, 5.70s/it] 92%|█████████▏| 8049/8750 [1:42:55<1:06:37, 5.70s/it] {'loss': 0.4377, 'learning_rate': 3.34779728843333e-07, 'epoch': 0.92} + 92%|█████████▏| 8049/8750 [1:42:55<1:06:37, 5.70s/it] {'loss': 0.4377, 'learning_rate': 3.34779728843333e-07, 'epoch': 0.92} + 92%|█████████▏| 8049/8750 [1:42:53<1:06:37, 5.70s/it]1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... 
+ 92%|█████████▏| 8050/8750 [1:43:00<1:06:26, 5.69s/it]12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +119 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +15 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 92%|█████████▏| 8050/8750 [1:42:58<1:06:26, 5.69s/it]2 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4593, 'learning_rate': 3.3383060746563836e-07, 'epoch': 0.92} + 92%|█████████▏| 8050/8750 [1:43:00<1:06:26, 5.69s/it] {'loss': 0.4593, 'learning_rate': 3.3383060746563836e-07, 'epoch': 0.92} + 92%|█████████▏| 8050/8750 [1:42:58<1:06:26, 5.69s/it] 92%|█████████▏| 8051/8750 [1:43:06<1:05:57, 5.66s/it] 92%|█████████▏| 8051/8750 [1:43:04<1:05:57, 5.66s/it] {'loss': 0.4456, 'learning_rate': 3.3288281056895746e-07, 'epoch': 0.92} + 92%|█████████▏| 8051/8750 [1:43:06<1:05:57, 5.66s/it] {'loss': 0.4456, 'learning_rate': 3.3288281056895746e-07, 'epoch': 0.92} + 92%|█████████▏| 8051/8750 [1:43:04<1:05:57, 5.66s/it] 92%|█████████▏| 8052/8750 [1:43:11<1:05:24, 5.62s/it] 92%|█████████▏| 8052/8750 [1:43:10<1:05:24, 5.62s/it] {'loss': 0.4628, 'learning_rate': 3.3193633828316306e-07, 'epoch': 0.92} + 92%|█████████▏| 8052/8750 [1:43:11<1:05:24, 5.62s/it] {'loss': 0.4628, 'learning_rate': 3.3193633828316306e-07, 'epoch': 0.92} + 92%|█████████▏| 8052/8750 [1:43:10<1:05:24, 5.62s/it] 92%|█████████▏| 8053/8750 [1:43:17<1:05:30, 5.64s/it] 92%|█████████▏| 8053/8750 [1:43:15<1:05:30, 5.64s/it] {'loss': 0.4529, 'learning_rate': 3.309911907379393e-07, 'epoch': 0.92} + 92%|█████████▏| 8053/8750 [1:43:17<1:05:30, 5.64s/it] {'loss': 0.4529, 'learning_rate': 3.309911907379393e-07, 'epoch': 0.92} + 92%|█████████▏| 8053/8750 
[1:43:15<1:05:30, 5.64s/it] 92%|█████████▏| 8054/8750 [1:43:23<1:05:19, 5.63s/it] 92%|█████████▏| 8054/8750 [1:43:21<1:05:19, 5.63s/it] {'loss': 0.4455, 'learning_rate': 3.300473680627947e-07, 'epoch': 0.92} + 92%|█████████▏| 8054/8750 [1:43:23<1:05:19, 5.63s/it] {'loss': 0.4455, 'learning_rate': 3.300473680627947e-07, 'epoch': 0.92} + 92%|█████████▏| 8054/8750 [1:43:21<1:05:19, 5.63s/it] 92%|█████████▏| 8055/8750 [1:43:28<1:05:25, 5.65s/it] 92%|█████████▏| 8055/8750 [1:43:26<1:05:25, 5.65s/it] {'loss': 0.4394, 'learning_rate': 3.2910487038705476e-07, 'epoch': 0.92} + 92%|█████████▏| 8055/8750 [1:43:28<1:05:25, 5.65s/it] {'loss': 0.4394, 'learning_rate': 3.2910487038705476e-07, 'epoch': 0.92} + 92%|█████████▏| 8055/8750 [1:43:26<1:05:25, 5.65s/it] 92%|█████████▏| 8056/8750 [1:43:34<1:05:26, 5.66s/it] 92%|█████████▏| 8056/8750 [1:43:32<1:05:26, 5.66s/it] {'loss': 0.463, 'learning_rate': 3.2816369783986166e-07, 'epoch': 0.92} + 92%|█████████▏| 8056/8750 [1:43:34<1:05:26, 5.66s/it] {'loss': 0.463, 'learning_rate': 3.2816369783986166e-07, 'epoch': 0.92} + 92%|█████████▏| 8056/8750 [1:43:32<1:05:26, 5.66s/it] 92%|█████████▏| 8057/8750 [1:43:38<1:05:07, 5.64s/it] 92%|█████████▏| 8057/8750 [1:43:40<1:05:07, 5.64s/it] {'loss': 0.4431, 'learning_rate': 3.2722385055017567e-07, 'epoch': 0.92} + {'loss': 0.4431, 'learning_rate': 3.2722385055017567e-07, 'epoch': 0.92} + 92%|█████████▏| 8057/8750 [1:43:40<1:05:07, 5.64s/it] 92%|█████████▏| 8057/8750 [1:43:38<1:05:07, 5.64s/it] 92%|█████████▏| 8058/8750 [1:43:43<1:05:04, 5.64s/it] 92%|█████████▏| 8058/8750 [1:43:45<1:05:04, 5.64s/it] {'loss': 0.4564, 'learning_rate': 3.262853286467804e-07, 'epoch': 0.92} + 92%|█████████▏| 8058/8750 [1:43:45<1:05:04, 5.64s/it] {'loss': 0.4564, 'learning_rate': 3.262853286467804e-07, 'epoch': 0.92} + 92%|█████████▏| 8058/8750 [1:43:43<1:05:04, 5.64s/it] 92%|█████████▏| 8059/8750 [1:43:51<1:05:03, 5.65s/it] 92%|█████████▏| 8059/8750 [1:43:49<1:05:03, 5.65s/it] {'loss': 0.4397, 'learning_rate': 
3.2534813225826965e-07, 'epoch': 0.92} + 92%|█████████▏| 8059/8750 [1:43:51<1:05:03, 5.65s/it] {'loss': 0.4397, 'learning_rate': 3.2534813225826965e-07, 'epoch': 0.92} + 92%|█████████▏| 8059/8750 [1:43:49<1:05:03, 5.65s/it] 92%|█████████▏| 8060/8750 [1:43:55<1:05:13, 5.67s/it] 92%|█████████▏| 8060/8750 [1:43:57<1:05:13, 5.67s/it] {'loss': 0.4575, 'learning_rate': 3.2441226151306403e-07, 'epoch': 0.92} + 92%|█████████▏| 8060/8750 [1:43:57<1:05:13, 5.67s/it] {'loss': 0.4575, 'learning_rate': 3.2441226151306403e-07, 'epoch': 0.92} + 92%|█████████▏| 8060/8750 [1:43:55<1:05:13, 5.67s/it] 92%|█████████▏| 8061/8750 [1:44:02<1:04:49, 5.65s/it] 92%|█████████▏| 8061/8750 [1:44:00<1:04:49, 5.65s/it] {'loss': 0.4362, 'learning_rate': 3.234777165393965e-07, 'epoch': 0.92} + 92%|█████████▏| 8061/8750 [1:44:02<1:04:49, 5.65s/it] {'loss': 0.4362, 'learning_rate': 3.234777165393965e-07, 'epoch': 0.92} + 92%|█████████▏| 8061/8750 [1:44:00<1:04:49, 5.65s/it] 92%|█████████▏| 8062/8750 [1:44:06<1:05:06, 5.68s/it] 92%|█████████▏| 8062/8750 [1:44:08<1:05:06, 5.68s/it] {'loss': 0.4318, 'learning_rate': 3.2254449746532246e-07, 'epoch': 0.92} + 92%|█████████▏| 8062/8750 [1:44:08<1:05:06, 5.68s/it] {'loss': 0.4318, 'learning_rate': 3.2254449746532246e-07, 'epoch': 0.92} + 92%|█████████▏| 8062/8750 [1:44:06<1:05:06, 5.68s/it] 92%|█████████▏| 8063/8750 [1:44:12<1:05:12, 5.69s/it] 92%|█████████▏| 8063/8750 [1:44:14<1:05:12, 5.69s/it] {'loss': 0.4545, 'learning_rate': 3.216126044187118e-07, 'epoch': 0.92} + 92%|█████████▏| 8063/8750 [1:44:14<1:05:12, 5.69s/it] {'loss': 0.4545, 'learning_rate': 3.216126044187118e-07, 'epoch': 0.92} + 92%|█████████▏| 8063/8750 [1:44:12<1:05:12, 5.69s/it] 92%|█████████▏| 8064/8750 [1:44:19<1:05:06, 5.69s/it] 92%|█████████▏| 8064/8750 [1:44:18<1:05:06, 5.69s/it] {'loss': 0.462, 'learning_rate': 3.206820375272557e-07, 'epoch': 0.92} + 92%|█████████▏| 8064/8750 [1:44:19<1:05:06, 5.69s/it] {'loss': 0.462, 'learning_rate': 3.206820375272557e-07, 'epoch': 0.92} + 
92%|█████████▏| 8064/8750 [1:44:18<1:05:06, 5.69s/it] 92%|█████████▏| 8065/8750 [1:44:24<1:07:01, 5.87s/it] 92%|█████████▏| 8065/8750 [1:44:26<1:07:01, 5.87s/it] {'loss': 0.4365, 'learning_rate': 3.1975279691846437e-07, 'epoch': 0.92} + 92%|█████████▏| 8065/8750 [1:44:26<1:07:01, 5.87s/it] {'loss': 0.4365, 'learning_rate': 3.1975279691846437e-07, 'epoch': 0.92} + 92%|█████████▏| 8065/8750 [1:44:24<1:07:01, 5.87s/it] 92%|█████████▏| 8066/8750 [1:44:31<1:06:06, 5.80s/it] 92%|█████████▏| 8066/8750 [1:44:29<1:06:06, 5.80s/it] {'loss': 0.4711, 'learning_rate': 3.188248827196616e-07, 'epoch': 0.92} + 92%|█████████▏| 8066/8750 [1:44:31<1:06:06, 5.80s/it] {'loss': 0.4711, 'learning_rate': 3.188248827196616e-07, 'epoch': 0.92} + 92%|█████████▏| 8066/8750 [1:44:29<1:06:06, 5.80s/it] 92%|█████████▏| 8067/8750 [1:44:35<1:05:33, 5.76s/it] 92%|█████████▏| 8067/8750 [1:44:37<1:05:33, 5.76s/it] {'loss': 0.4475, 'learning_rate': 3.178982950579923e-07, 'epoch': 0.92} + 92%|█████████▏| 8067/8750 [1:44:37<1:05:33, 5.76s/it] {'loss': 0.4475, 'learning_rate': 3.178982950579923e-07, 'epoch': 0.92} + 92%|█████████▏| 8067/8750 [1:44:35<1:05:33, 5.76s/it] 92%|█████████▏| 8068/8750 [1:44:43<1:05:53, 5.80s/it] 92%|█████████▏| 8068/8750 [1:44:41<1:05:53, 5.80s/it] {'loss': 0.4419, 'learning_rate': 3.169730340604227e-07, 'epoch': 0.92} + {'loss': 0.4419, 'learning_rate': 3.169730340604227e-07, 'epoch': 0.92} 92%|█████████▏| 8068/8750 [1:44:43<1:05:53, 5.80s/it] + 92%|█████████▏| 8068/8750 [1:44:41<1:05:53, 5.80s/it] 92%|█████████▏| 8069/8750 [1:44:49<1:05:08, 5.74s/it] 92%|█████████▏| 8069/8750 [1:44:47<1:05:08, 5.74s/it] {'loss': 0.4292, 'learning_rate': 3.160490998537313e-07, 'epoch': 0.92} + 92%|█████████▏| 8069/8750 [1:44:49<1:05:08, 5.74s/it] {'loss': 0.4292, 'learning_rate': 3.160490998537313e-07, 'epoch': 0.92} + 92%|█████████▏| 8069/8750 [1:44:47<1:05:08, 5.74s/it] 92%|█████████▏| 8070/8750 [1:44:54<1:04:52, 5.72s/it] 92%|█████████▏| 8070/8750 [1:44:52<1:04:52, 5.72s/it] {'loss': 
0.4678, 'learning_rate': 3.151264925645192e-07, 'epoch': 0.92} + 92%|█████████▏| 8070/8750 [1:44:54<1:04:52, 5.72s/it] {'loss': 0.4678, 'learning_rate': 3.151264925645192e-07, 'epoch': 0.92} + 92%|█████████▏| 8070/8750 [1:44:52<1:04:52, 5.72s/it] 92%|█████████▏| 8071/8750 [1:45:00<1:05:15, 5.77s/it] 92%|█████████▏| 8071/8750 [1:44:58<1:05:15, 5.77s/it] {'loss': 0.4485, 'learning_rate': 3.142052123192019e-07, 'epoch': 0.92} + 92%|█████████▏| 8071/8750 [1:45:00<1:05:15, 5.77s/it] {'loss': 0.4485, 'learning_rate': 3.142052123192019e-07, 'epoch': 0.92} + 92%|█████████▏| 8071/8750 [1:44:58<1:05:15, 5.77s/it] 92%|█████████▏| 8072/8750 [1:45:04<1:04:49, 5.74s/it] 92%|█████████▏| 8072/8750 [1:45:06<1:04:49, 5.74s/it] {'loss': 0.4481, 'learning_rate': 3.132852592440194e-07, 'epoch': 0.92} + 92%|█████████▏| 8072/8750 [1:45:06<1:04:49, 5.74s/it] {'loss': 0.4481, 'learning_rate': 3.132852592440194e-07, 'epoch': 0.92} + 92%|█████████▏| 8072/8750 [1:45:04<1:04:49, 5.74s/it] 92%|█████████▏| 8073/8750 [1:45:12<1:05:10, 5.78s/it] 92%|█████████▏| 8073/8750 [1:45:10<1:05:10, 5.78s/it] {'loss': 0.4427, 'learning_rate': 3.1236663346502215e-07, 'epoch': 0.92} + 92%|█████████▏| 8073/8750 [1:45:12<1:05:10, 5.78s/it] {'loss': 0.4427, 'learning_rate': 3.1236663346502215e-07, 'epoch': 0.92} + 92%|█████████▏| 8073/8750 [1:45:10<1:05:10, 5.78s/it] 92%|█████████▏| 8074/8750 [1:45:17<1:05:08, 5.78s/it] 92%|█████████▏| 8074/8750 [1:45:16<1:05:08, 5.78s/it] {'loss': 0.4701, 'learning_rate': 3.11449335108085e-07, 'epoch': 0.92} + 92%|█████████▏| 8074/8750 [1:45:17<1:05:08, 5.78s/it] {'loss': 0.4701, 'learning_rate': 3.11449335108085e-07, 'epoch': 0.92} + 92%|█████████▏| 8074/8750 [1:45:16<1:05:08, 5.78s/it] 92%|█████████▏| 8075/8750 [1:45:21<1:05:04, 5.78s/it] 92%|█████████▏| 8075/8750 [1:45:23<1:05:04, 5.78s/it] {'loss': 0.4609, 'learning_rate': 3.1053336429889616e-07, 'epoch': 0.92} + 92%|█████████▏| 8075/8750 [1:45:23<1:05:04, 5.78s/it] {'loss': 0.4609, 'learning_rate': 3.1053336429889616e-07, 
'epoch': 0.92} + 92%|█████████▏| 8075/8750 [1:45:21<1:05:04, 5.78s/it] 92%|█████████▏| 8076/8750 [1:45:27<1:04:24, 5.73s/it] 92%|█████████▏| 8076/8750 [1:45:29<1:04:24, 5.73s/it] {'loss': 0.4481, 'learning_rate': 3.0961872116296645e-07, 'epoch': 0.92} + 92%|█████████▏| 8076/8750 [1:45:29<1:04:24, 5.73s/it] {'loss': 0.4481, 'learning_rate': 3.0961872116296645e-07, 'epoch': 0.92} + 92%|█████████▏| 8076/8750 [1:45:27<1:04:24, 5.73s/it] 92%|█████████▏| 8077/8750 [1:45:33<1:04:24, 5.74s/it] 92%|█████████▏| 8077/8750 [1:45:35<1:04:24, 5.74s/it] {'loss': 0.4279, 'learning_rate': 3.0870540582562003e-07, 'epoch': 0.92} + 92%|█████████▏| 8077/8750 [1:45:35<1:04:24, 5.74s/it] {'loss': 0.4279, 'learning_rate': 3.0870540582562003e-07, 'epoch': 0.92} + 92%|█████████▏| 8077/8750 [1:45:33<1:04:24, 5.74s/it] 92%|█████████▏| 8078/8750 [1:45:39<1:04:33, 5.76s/it] 92%|█████████▏| 8078/8750 [1:45:40<1:04:33, 5.76s/it] {'loss': 0.4586, 'learning_rate': 3.077934184120035e-07, 'epoch': 0.92} + 92%|█████████▏| 8078/8750 [1:45:40<1:04:33, 5.76s/it] {'loss': 0.4586, 'learning_rate': 3.077934184120035e-07, 'epoch': 0.92} + 92%|█████████▏| 8078/8750 [1:45:39<1:04:33, 5.76s/it] 92%|█████████▏| 8079/8750 [1:45:44<1:03:58, 5.72s/it] 92%|█████████▏| 8079/8750 [1:45:46<1:03:58, 5.72s/it] {'loss': 0.4453, 'learning_rate': 3.06882759047078e-07, 'epoch': 0.92} {'loss': 0.4453, 'learning_rate': 3.06882759047078e-07, 'epoch': 0.92} + 92%|█████████▏| 8079/8750 [1:45:46<1:03:58, 5.72s/it] + 92%|█████████▏| 8079/8750 [1:45:44<1:03:58, 5.72s/it] 92%|█████████▏| 8080/8750 [1:45:50<1:04:24, 5.77s/it] 92%|█████████▏| 8080/8750 [1:45:52<1:04:24, 5.77s/it] {'loss': 0.4442, 'learning_rate': 3.059734278556237e-07, 'epoch': 0.92} + 92%|█████████▏| 8080/8750 [1:45:52<1:04:24, 5.77s/it] {'loss': 0.4442, 'learning_rate': 3.059734278556237e-07, 'epoch': 0.92} + 92%|█████████▏| 8080/8750 [1:45:50<1:04:24, 5.77s/it] 92%|█████████▏| 8081/8750 [1:45:58<1:03:47, 5.72s/it] 92%|█████████▏| 8081/8750 [1:45:56<1:03:47, 
5.72s/it] {'loss': 0.4565, 'learning_rate': 3.050654249622398e-07, 'epoch': 0.92} + 92%|█████████▏| 8081/8750 [1:45:58<1:03:47, 5.72s/it] {'loss': 0.4565, 'learning_rate': 3.050654249622398e-07, 'epoch': 0.92} + 92%|█████████▏| 8081/8750 [1:45:56<1:03:47, 5.72s/it] 92%|█████████▏| 8082/8750 [1:46:03<1:03:19, 5.69s/it] 92%|█████████▏| 8082/8750 [1:46:01<1:03:19, 5.69s/it] {'loss': 0.4458, 'learning_rate': 3.0415875049134566e-07, 'epoch': 0.92} + 92%|█████████▏| 8082/8750 [1:46:03<1:03:19, 5.69s/it] {'loss': 0.4458, 'learning_rate': 3.0415875049134566e-07, 'epoch': 0.92} + 92%|█████████▏| 8082/8750 [1:46:01<1:03:19, 5.69s/it] 92%|█████████▏| 8083/8750 [1:46:07<1:03:51, 5.74s/it] 92%|█████████▏| 8083/8750 [1:46:09<1:03:51, 5.74s/it] {'loss': 0.4353, 'learning_rate': 3.03253404567172e-07, 'epoch': 0.92} + {'loss': 0.4353, 'learning_rate': 3.03253404567172e-07, 'epoch': 0.92} 92%|█████████▏| 8083/8750 [1:46:09<1:03:51, 5.74s/it] + 92%|█████████▏| 8083/8750 [1:46:07<1:03:51, 5.74s/it] 92%|█████████▏| 8084/8750 [1:46:15<1:03:26, 5.71s/it] 92%|█████████▏| 8084/8750 [1:46:13<1:03:26, 5.71s/it] {'loss': 0.4717, 'learning_rate': 3.0234938731377394e-07, 'epoch': 0.92} + 92%|█████████▏| 8084/8750 [1:46:15<1:03:26, 5.71s/it] {'loss': 0.4717, 'learning_rate': 3.0234938731377394e-07, 'epoch': 0.92} + 92%|█████████▏| 8084/8750 [1:46:13<1:03:26, 5.71s/it] 92%|█████████▏| 8085/8750 [1:46:18<1:02:47, 5.67s/it] 92%|█████████▏| 8085/8750 [1:46:20<1:02:47, 5.67s/it] {'loss': 0.4587, 'learning_rate': 3.014466988550202e-07, 'epoch': 0.92} + 92%|█████████▏| 8085/8750 [1:46:20<1:02:47, 5.67s/it] {'loss': 0.4587, 'learning_rate': 3.014466988550202e-07, 'epoch': 0.92} + 92%|█████████▏| 8085/8750 [1:46:18<1:02:47, 5.67s/it] 92%|█████████▏| 8086/8750 [1:46:24<1:02:25, 5.64s/it] 92%|█████████▏| 8086/8750 [1:46:26<1:02:25, 5.64s/it] {'loss': 0.4576, 'learning_rate': 3.0054533931460186e-07, 'epoch': 0.92} + {'loss': 0.4576, 'learning_rate': 3.0054533931460186e-07, 'epoch': 0.92} + 92%|█████████▏| 
8086/8750 [1:46:26<1:02:25, 5.64s/it] 92%|█████████▏| 8086/8750 [1:46:24<1:02:25, 5.64s/it] 92%|█████████▏| 8087/8750 [1:46:30<1:02:18, 5.64s/it] 92%|█████████▏| 8087/8750 [1:46:31<1:02:18, 5.64s/it] {'loss': 0.4396, 'learning_rate': 2.996453088160234e-07, 'epoch': 0.92} + 92%|█████████▏| 8087/8750 [1:46:31<1:02:18, 5.64s/it] {'loss': 0.4396, 'learning_rate': 2.996453088160234e-07, 'epoch': 0.92} + 92%|█████████▏| 8087/8750 [1:46:30<1:02:18, 5.64s/it] 92%|█████████▏| 8088/8750 [1:46:35<1:02:23, 5.65s/it] 92%|█████████▏| 8088/8750 [1:46:37<1:02:23, 5.65s/it] {'loss': 0.443, 'learning_rate': 2.9874660748260843e-07, 'epoch': 0.92} + {'loss': 0.443, 'learning_rate': 2.9874660748260843e-07, 'epoch': 0.92} 92%|█████████▏| 8088/8750 [1:46:37<1:02:23, 5.65s/it] + 92%|█████████▏| 8088/8750 [1:46:35<1:02:23, 5.65s/it] 92%|█████████▏| 8089/8750 [1:46:41<1:02:27, 5.67s/it] 92%|█████████▏| 8089/8750 [1:46:43<1:02:27, 5.67s/it] {'loss': 0.4552, 'learning_rate': 2.978492354375007e-07, 'epoch': 0.92} + 92%|█████████▏| 8089/8750 [1:46:43<1:02:27, 5.67s/it] {'loss': 0.4552, 'learning_rate': 2.978492354375007e-07, 'epoch': 0.92} + 92%|█████████▏| 8089/8750 [1:46:41<1:02:27, 5.67s/it] 92%|█████████▏| 8090/8750 [1:46:48<1:02:07, 5.65s/it] 92%|█████████▏| 8090/8750 [1:46:47<1:02:07, 5.65s/it] {'loss': 0.4378, 'learning_rate': 2.969531928036595e-07, 'epoch': 0.92} + 92%|█████████▏| 8090/8750 [1:46:48<1:02:07, 5.65s/it] {'loss': 0.4378, 'learning_rate': 2.969531928036595e-07, 'epoch': 0.92} + 92%|█████████▏| 8090/8750 [1:46:47<1:02:07, 5.65s/it] 92%|█████████▏| 8091/8750 [1:46:52<1:02:07, 5.66s/it] 92%|█████████▏| 8091/8750 [1:46:54<1:02:07, 5.66s/it] {'loss': 0.4272, 'learning_rate': 2.9605847970386125e-07, 'epoch': 0.92} + 92%|█████████▏| 8091/8750 [1:46:54<1:02:07, 5.66s/it] {'loss': 0.4272, 'learning_rate': 2.9605847970386125e-07, 'epoch': 0.92} + 92%|█████████▏| 8091/8750 [1:46:52<1:02:07, 5.66s/it] 92%|█████████▏| 8092/8750 [1:47:00<1:02:39, 5.71s/it] 92%|█████████▏| 8092/8750 
[1:46:58<1:02:39, 5.71s/it] {'loss': 0.4468, 'learning_rate': 2.9516509626070553e-07, 'epoch': 0.92} + 92%|█████████▏| 8092/8750 [1:46:58<1:02:39, 5.71s/it]{'loss': 0.4468, 'learning_rate': 2.9516509626070553e-07, 'epoch': 0.92} + 92%|█████████▏| 8092/8750 [1:47:00<1:02:39, 5.71s/it] 92%|█████████▏| 8093/8750 [1:47:04<1:02:00, 5.66s/it] 92%|█████████▏| 8093/8750 [1:47:05<1:02:00, 5.66s/it] {'loss': 0.4672, 'learning_rate': 2.9427304259660117e-07, 'epoch': 0.92} + {'loss': 0.4672, 'learning_rate': 2.9427304259660117e-07, 'epoch': 0.92} 92%|█████████▏| 8093/8750 [1:47:05<1:02:00, 5.66s/it] + 92%|█████████▏| 8093/8750 [1:47:04<1:02:00, 5.66s/it] 93%|█████████▎| 8094/8750 [1:47:11<1:02:25, 5.71s/it] 93%|█████████▎| 8094/8750 [1:47:09<1:02:25, 5.71s/it] {'loss': 0.4169, 'learning_rate': 2.9338231883378365e-07, 'epoch': 0.93} + 93%|█████████▎| 8094/8750 [1:47:11<1:02:25, 5.71s/it] {'loss': 0.4169, 'learning_rate': 2.9338231883378365e-07, 'epoch': 0.93} + 93%|█████████▎| 8094/8750 [1:47:09<1:02:25, 5.71s/it] 93%|█████████▎| 8095/8750 [1:47:17<1:02:11, 5.70s/it] 93%|█████████▎| 8095/8750 [1:47:15<1:02:11, 5.70s/it] {'loss': 0.4546, 'learning_rate': 2.924929250942998e-07, 'epoch': 0.93} + 93%|█████████▎| 8095/8750 [1:47:17<1:02:11, 5.70s/it] {'loss': 0.4546, 'learning_rate': 2.924929250942998e-07, 'epoch': 0.93} + 93%|█████████▎| 8095/8750 [1:47:15<1:02:11, 5.70s/it] 93%|█████████▎| 8096/8750 [1:47:23<1:01:49, 5.67s/it] 93%|█████████▎| 8096/8750 [1:47:21<1:01:49, 5.67s/it] {'loss': 0.4413, 'learning_rate': 2.9160486150001556e-07, 'epoch': 0.93} + 93%|█████████▎| 8096/8750 [1:47:23<1:01:49, 5.67s/it] {'loss': 0.4413, 'learning_rate': 2.9160486150001556e-07, 'epoch': 0.93} + 93%|█████████▎| 8096/8750 [1:47:21<1:01:49, 5.67s/it] 93%|█████████▎| 8097/8750 [1:47:29<1:02:39, 5.76s/it] 93%|█████████▎| 8097/8750 [1:47:27<1:02:39, 5.76s/it] {'loss': 0.4463, 'learning_rate': 2.907181281726179e-07, 'epoch': 0.93} + 93%|█████████▎| 8097/8750 [1:47:29<1:02:39, 5.76s/it] {'loss': 0.4463, 
'learning_rate': 2.907181281726179e-07, 'epoch': 0.93} + 93%|█████████▎| 8097/8750 [1:47:27<1:02:39, 5.76s/it] 93%|█████████▎| 8098/8750 [1:47:32<1:02:12, 5.72s/it] 93%|█████████▎| 8098/8750 [1:47:34<1:02:11, 5.72s/it]{'loss': 0.4412, 'learning_rate': 2.8983272523360637e-07, 'epoch': 0.93} + {'loss': 0.4412, 'learning_rate': 2.8983272523360637e-07, 'epoch': 0.93} + 93%|█████████▎| 8098/8750 [1:47:34<1:02:11, 5.72s/it] 93%|█████████▎| 8098/8750 [1:47:32<1:02:12, 5.72s/it] 93%|█████████▎| 8099/8750 [1:47:40<1:01:57, 5.71s/it] 93%|█████████▎| 8099/8750 [1:47:38<1:01:57, 5.71s/it] {'loss': 0.4416, 'learning_rate': 2.889486528043028e-07, 'epoch': 0.93} + 93%|█████████▎| 8099/8750 [1:47:40<1:01:57, 5.71s/it] {'loss': 0.4416, 'learning_rate': 2.889486528043028e-07, 'epoch': 0.93} + 93%|█████████▎| 8099/8750 [1:47:38<1:01:57, 5.71s/it]1 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +118 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... + 93%|█████████▎| 8100/8750 [1:47:45<1:01:29, 5.68s/it]15 AutoResumeHook: Checking whether to suspend... +07 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 93%|█████████▎| 8100/8750 [1:47:44<1:01:29, 5.68s/it]10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4381, 'learning_rate': 2.880659110058448e-07, 'epoch': 0.93} + 93%|█████████▎| 8100/8750 [1:47:45<1:01:29, 5.68s/it] {'loss': 0.4381, 'learning_rate': 2.880659110058448e-07, 'epoch': 0.93} + 93%|█████████▎| 8100/8750 [1:47:44<1:01:29, 5.68s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8100/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8100/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8100/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 93%|█████████▎| 8101/8750 [1:48:06<1:49:04, 10.08s/it] 93%|█████████▎| 8101/8750 [1:48:04<1:49:04, 10.08s/it] {'loss': 0.4511, 'learning_rate': 2.8718449995918553e-07, 'epoch': 0.93} + 93%|█████████▎| 8101/8750 [1:48:06<1:49:04, 10.08s/it] {'loss': 0.4511, 'learning_rate': 2.8718449995918553e-07, 'epoch': 0.93} + 93%|█████████▎| 8101/8750 [1:48:04<1:49:04, 10.08s/it] 93%|█████████▎| 8102/8750 [1:48:11<1:34:09, 8.72s/it] 93%|█████████▎| 8102/8750 [1:48:09<1:34:09, 8.72s/it] {'loss': 0.463, 'learning_rate': 2.863044197851017e-07, 'epoch': 0.93} + 93%|█████████▎| 8102/8750 [1:48:11<1:34:09, 8.72s/it] {'loss': 0.463, 'learning_rate': 2.863044197851017e-07, 'epoch': 0.93} + 93%|█████████▎| 8102/8750 [1:48:09<1:34:09, 8.72s/it] 93%|█████████▎| 8103/8750 [1:48:15<1:24:51, 7.87s/it] 93%|█████████▎| 8103/8750 [1:48:17<1:24:51, 7.87s/it] {'loss': 0.4676, 'learning_rate': 2.8542567060418135e-07, 'epoch': 0.93} + {'loss': 0.4676, 'learning_rate': 2.8542567060418135e-07, 'epoch': 0.93} 93%|█████████▎| 8103/8750 [1:48:17<1:24:51, 7.87s/it] + 93%|█████████▎| 8103/8750 [1:48:15<1:24:51, 7.87s/it] 93%|█████████▎| 8104/8750 [1:48:23<1:17:37, 7.21s/it] 93%|█████████▎| 8104/8750 [1:48:21<1:17:37, 7.21s/it] {'loss': 0.43, 'learning_rate': 2.845482525368337e-07, 'epoch': 0.93} + 93%|█████████▎| 8104/8750 [1:48:23<1:17:37, 7.21s/it] {'loss': 0.43, 'learning_rate': 2.845482525368337e-07, 'epoch': 0.93} + 93%|█████████▎| 8104/8750 [1:48:21<1:17:37, 7.21s/it] 93%|█████████▎| 8105/8750 [1:48:29<1:12:15, 6.72s/it] 93%|█████████▎| 8105/8750 [1:48:27<1:12:15, 6.72s/it] {'loss': 0.4403, 'learning_rate': 2.836721657032848e-07, 'epoch': 0.93} + 93%|█████████▎| 8105/8750 [1:48:29<1:12:15, 6.72s/it] {'loss': 0.4403, 'learning_rate': 2.836721657032848e-07, 'epoch': 0.93} + 93%|█████████▎| 8105/8750 [1:48:27<1:12:15, 6.72s/it] 93%|█████████▎| 8106/8750 [1:48:34<1:08:48, 6.41s/it] 93%|█████████▎| 8106/8750 [1:48:32<1:08:48, 6.41s/it] {'loss': 0.444, 'learning_rate': 
2.8279741022357535e-07, 'epoch': 0.93} + 93%|█████████▎| 8106/8750 [1:48:34<1:08:48, 6.41s/it] {'loss': 0.444, 'learning_rate': 2.8279741022357535e-07, 'epoch': 0.93} + 93%|█████████▎| 8106/8750 [1:48:32<1:08:48, 6.41s/it] 93%|█████████▎| 8107/8750 [1:48:38<1:06:20, 6.19s/it] 93%|█████████▎| 8107/8750 [1:48:40<1:06:20, 6.19s/it] {'loss': 0.4691, 'learning_rate': 2.8192398621757156e-07, 'epoch': 0.93} + 93%|█████████▎| 8107/8750 [1:48:40<1:06:20, 6.19s/it] {'loss': 0.4691, 'learning_rate': 2.8192398621757156e-07, 'epoch': 0.93} + 93%|█████████▎| 8107/8750 [1:48:38<1:06:20, 6.19s/it] 93%|█████████▎| 8108/8750 [1:48:44<1:04:17, 6.01s/it] 93%|█████████▎| 8108/8750 [1:48:45<1:04:17, 6.01s/it] {'loss': 0.4507, 'learning_rate': 2.810518938049478e-07, 'epoch': 0.93} + 93%|█████████▎| 8108/8750 [1:48:45<1:04:17, 6.01s/it] {'loss': 0.4507, 'learning_rate': 2.810518938049478e-07, 'epoch': 0.93} + 93%|█████████▎| 8108/8750 [1:48:44<1:04:17, 6.01s/it] 93%|█████████▎| 8109/8750 [1:48:51<1:02:56, 5.89s/it] 93%|█████████▎| 8109/8750 [1:48:49<1:02:56, 5.89s/it] {'loss': 0.4691, 'learning_rate': 2.801811331052007e-07, 'epoch': 0.93} + 93%|█████████▎| 8109/8750 [1:48:51<1:02:56, 5.89s/it] {'loss': 0.4691, 'learning_rate': 2.801811331052007e-07, 'epoch': 0.93} + 93%|█████████▎| 8109/8750 [1:48:49<1:02:56, 5.89s/it] 93%|█████████▎| 8110/8750 [1:48:57<1:01:44, 5.79s/it] 93%|█████████▎| 8110/8750 [1:48:55<1:01:44, 5.79s/it] {'loss': 0.4466, 'learning_rate': 2.7931170423764363e-07, 'epoch': 0.93} + 93%|█████████▎| 8110/8750 [1:48:57<1:01:44, 5.79s/it] {'loss': 0.4466, 'learning_rate': 2.7931170423764363e-07, 'epoch': 0.93} + 93%|█████████▎| 8110/8750 [1:48:55<1:01:44, 5.79s/it] 93%|█████████▎| 8111/8750 [1:49:01<1:01:45, 5.80s/it] 93%|█████████▎| 8111/8750 [1:49:02<1:01:45, 5.80s/it] {'loss': 0.4667, 'learning_rate': 2.784436073214103e-07, 'epoch': 0.93} + {'loss': 0.4667, 'learning_rate': 2.784436073214103e-07, 'epoch': 0.93} + 93%|█████████▎| 8111/8750 [1:49:02<1:01:45, 5.80s/it] 
93%|█████████▎| 8111/8750 [1:49:01<1:01:45, 5.80s/it] 93%|█████████▎| 8112/8750 [1:49:06<1:01:41, 5.80s/it] 93%|█████████▎| 8112/8750 [1:49:08<1:01:41, 5.80s/it] {'loss': 0.443, 'learning_rate': 2.775768424754488e-07, 'epoch': 0.93} + 93%|█████████▎| 8112/8750 [1:49:08<1:01:41, 5.80s/it] {'loss': 0.443, 'learning_rate': 2.775768424754488e-07, 'epoch': 0.93} + 93%|█████████▎| 8112/8750 [1:49:06<1:01:41, 5.80s/it] 93%|█████████▎| 8113/8750 [1:49:12<1:01:10, 5.76s/it] 93%|█████████▎| 8113/8750 [1:49:14<1:01:10, 5.76s/it] {'loss': 0.4504, 'learning_rate': 2.7671140981852306e-07, 'epoch': 0.93} + 93%|█████████▎| 8113/8750 [1:49:14<1:01:10, 5.76s/it] {'loss': 0.4504, 'learning_rate': 2.7671140981852306e-07, 'epoch': 0.93} + 93%|█████████▎| 8113/8750 [1:49:12<1:01:10, 5.76s/it] 93%|█████████▎| 8114/8750 [1:49:18<1:01:42, 5.82s/it] 93%|█████████▎| 8114/8750 [1:49:20<1:01:42, 5.82s/it] {'loss': 0.442, 'learning_rate': 2.7584730946921825e-07, 'epoch': 0.93} + {'loss': 0.442, 'learning_rate': 2.7584730946921825e-07, 'epoch': 0.93} 93%|█████████▎| 8114/8750 [1:49:20<1:01:42, 5.82s/it] + 93%|█████████▎| 8114/8750 [1:49:18<1:01:42, 5.82s/it] 93%|█████████▎| 8115/8750 [1:49:24<1:00:53, 5.75s/it] 93%|█████████▎| 8115/8750 [1:49:25<1:00:53, 5.75s/it] {'loss': 0.4698, 'learning_rate': 2.7498454154593624e-07, 'epoch': 0.93} + 93%|█████████▎| 8115/8750 [1:49:25<1:00:53, 5.75s/it] {'loss': 0.4698, 'learning_rate': 2.7498454154593624e-07, 'epoch': 0.93} + 93%|█████████▎| 8115/8750 [1:49:24<1:00:53, 5.75s/it] 93%|█████████▎| 8116/8750 [1:49:31<1:00:55, 5.77s/it] 93%|█████████▎| 8116/8750 [1:49:29<1:00:55, 5.77s/it] {'loss': 0.4184, 'learning_rate': 2.741231061668925e-07, 'epoch': 0.93} + 93%|█████████▎| 8116/8750 [1:49:31<1:00:55, 5.77s/it] {'loss': 0.4184, 'learning_rate': 2.741231061668925e-07, 'epoch': 0.93} + 93%|█████████▎| 8116/8750 [1:49:29<1:00:55, 5.77s/it] 93%|█████████▎| 8117/8750 [1:49:37<1:00:22, 5.72s/it] 93%|█████████▎| 8117/8750 [1:49:35<1:00:22, 5.72s/it] {'loss': 
0.4488, 'learning_rate': 2.73263003450126e-07, 'epoch': 0.93} + 93%|█████████▎| 8117/8750 [1:49:37<1:00:22, 5.72s/it] {'loss': 0.4488, 'learning_rate': 2.73263003450126e-07, 'epoch': 0.93} + 93%|█████████▎| 8117/8750 [1:49:35<1:00:22, 5.72s/it] 93%|█████████▎| 8118/8750 [1:49:41<1:00:10, 5.71s/it] 93%|█████████▎| 8118/8750 [1:49:43<1:00:10, 5.71s/it] {'loss': 0.4538, 'learning_rate': 2.72404233513488e-07, 'epoch': 0.93} + {'loss': 0.4538, 'learning_rate': 2.72404233513488e-07, 'epoch': 0.93} + 93%|█████████▎| 8118/8750 [1:49:43<1:00:10, 5.71s/it] 93%|█████████▎| 8118/8750 [1:49:41<1:00:10, 5.71s/it] 93%|█████████▎| 8119/8750 [1:49:48<1:00:02, 5.71s/it] 93%|█████████▎| 8119/8750 [1:49:46<1:00:02, 5.71s/it] {'loss': 0.4525, 'learning_rate': 2.71546796474651e-07, 'epoch': 0.93} + 93%|█████████▎| 8119/8750 [1:49:48<1:00:02, 5.71s/it] {'loss': 0.4525, 'learning_rate': 2.71546796474651e-07, 'epoch': 0.93} + 93%|█████████▎| 8119/8750 [1:49:46<1:00:02, 5.71s/it] 93%|█████████▎| 8120/8750 [1:49:52<59:50, 5.70s/it] 93%|█████████▎| 8120/8750 [1:49:54<59:50, 5.70s/it] {'loss': 0.4407, 'learning_rate': 2.70690692451101e-07, 'epoch': 0.93} + 93%|█████████▎| 8120/8750 [1:49:54<59:50, 5.70s/it] {'loss': 0.4407, 'learning_rate': 2.70690692451101e-07, 'epoch': 0.93} + 93%|█████████▎| 8120/8750 [1:49:52<59:50, 5.70s/it] 93%|█████████▎| 8121/8750 [1:49:59<59:06, 5.64s/it] 93%|█████████▎| 8121/8750 [1:49:58<59:06, 5.64s/it] {'loss': 0.4438, 'learning_rate': 2.698359215601443e-07, 'epoch': 0.93} + 93%|█████████▎| 8121/8750 [1:49:59<59:06, 5.64s/it] {'loss': 0.4438, 'learning_rate': 2.698359215601443e-07, 'epoch': 0.93} + 93%|█████████▎| 8121/8750 [1:49:58<59:06, 5.64s/it] 93%|█████████▎| 8122/8750 [1:50:03<59:00, 5.64s/it] 93%|█████████▎| 8122/8750 [1:50:05<59:00, 5.64s/it] {'loss': 0.4545, 'learning_rate': 2.689824839189037e-07, 'epoch': 0.93} + 93%|█████████▎| 8122/8750 [1:50:05<59:00, 5.64s/it] {'loss': 0.4545, 'learning_rate': 2.689824839189037e-07, 'epoch': 0.93} + 93%|█████████▎| 
8122/8750 [1:50:03<59:00, 5.64s/it] 93%|█████████▎| 8123/8750 [1:50:09<1:00:45, 5.81s/it] 93%|█████████▎| 8123/8750 [1:50:11<1:00:45, 5.81s/it] {'loss': 0.4361, 'learning_rate': 2.681303796443202e-07, 'epoch': 0.93} + 93%|█████████▎| 8123/8750 [1:50:11<1:00:45, 5.81s/it] {'loss': 0.4361, 'learning_rate': 2.681303796443202e-07, 'epoch': 0.93} + 93%|█████████▎| 8123/8750 [1:50:09<1:00:45, 5.81s/it] 93%|█████████▎| 8124/8750 [1:50:17<1:00:27, 5.79s/it] 93%|█████████▎| 8124/8750 [1:50:15<1:00:27, 5.79s/it] {'loss': 0.4593, 'learning_rate': 2.672796088531493e-07, 'epoch': 0.93} + 93%|█████████▎| 8124/8750 [1:50:17<1:00:27, 5.79s/it] {'loss': 0.4593, 'learning_rate': 2.672796088531493e-07, 'epoch': 0.93} + 93%|█████████▎| 8124/8750 [1:50:15<1:00:27, 5.79s/it] 93%|█████████▎| 8125/8750 [1:50:23<1:00:27, 5.80s/it] 93%|█████████▎| 8125/8750 [1:50:21<1:00:27, 5.80s/it] {'loss': 0.4915, 'learning_rate': 2.664301716619666e-07, 'epoch': 0.93} + 93%|█████████▎| 8125/8750 [1:50:23<1:00:27, 5.80s/it] {'loss': 0.4915, 'learning_rate': 2.664301716619666e-07, 'epoch': 0.93} + 93%|█████████▎| 8125/8750 [1:50:21<1:00:27, 5.80s/it] 93%|█████████▎| 8126/8750 [1:50:29<1:00:00, 5.77s/it] 93%|█████████▎| 8126/8750 [1:50:27<1:00:00, 5.77s/it] {'loss': 0.4487, 'learning_rate': 2.655820681871635e-07, 'epoch': 0.93} + 93%|█████████▎| 8126/8750 [1:50:29<1:00:00, 5.77s/it] {'loss': 0.4487, 'learning_rate': 2.655820681871635e-07, 'epoch': 0.93} + 93%|█████████▎| 8126/8750 [1:50:27<1:00:00, 5.77s/it] 93%|█████████▎| 8127/8750 [1:50:34<59:35, 5.74s/it] 93%|█████████▎| 8127/8750 [1:50:32<59:35, 5.74s/it] {'loss': 0.4443, 'learning_rate': 2.6473529854494915e-07, 'epoch': 0.93} + 93%|█████████▎| 8127/8750 [1:50:34<59:35, 5.74s/it] {'loss': 0.4443, 'learning_rate': 2.6473529854494915e-07, 'epoch': 0.93} + 93%|█████████▎| 8127/8750 [1:50:32<59:35, 5.74s/it] 93%|█████████▎| 8128/8750 [1:50:40<59:36, 5.75s/it] 93%|█████████▎| 8128/8750 [1:50:38<59:36, 5.75s/it] {'loss': 0.443, 'learning_rate': 
2.638898628513498e-07, 'epoch': 0.93} + 93%|█████████▎| 8128/8750 [1:50:40<59:36, 5.75s/it] {'loss': 0.443, 'learning_rate': 2.638898628513498e-07, 'epoch': 0.93} + 93%|█████████▎| 8128/8750 [1:50:38<59:36, 5.75s/it] 93%|█████████▎| 8129/8750 [1:50:46<58:48, 5.68s/it] 93%|█████████▎| 8129/8750 [1:50:44<58:48, 5.68s/it] {'loss': 0.4475, 'learning_rate': 2.6304576122221035e-07, 'epoch': 0.93} + 93%|█████████▎| 8129/8750 [1:50:46<58:48, 5.68s/it] {'loss': 0.4475, 'learning_rate': 2.6304576122221035e-07, 'epoch': 0.93} + 93%|█████████▎| 8129/8750 [1:50:44<58:48, 5.68s/it] 93%|█████████▎| 8130/8750 [1:50:51<58:55, 5.70s/it] 93%|█████████▎| 8130/8750 [1:50:49<58:55, 5.70s/it] {'loss': 0.4359, 'learning_rate': 2.6220299377318847e-07, 'epoch': 0.93} + 93%|█████████▎| 8130/8750 [1:50:51<58:55, 5.70s/it] {'loss': 0.4359, 'learning_rate': 2.6220299377318847e-07, 'epoch': 0.93} + 93%|█████████▎| 8130/8750 [1:50:49<58:55, 5.70s/it] 93%|█████████▎| 8131/8750 [1:50:57<59:38, 5.78s/it] 93%|█████████▎| 8131/8750 [1:50:55<59:38, 5.78s/it] {'loss': 0.444, 'learning_rate': 2.613615606197661e-07, 'epoch': 0.93} + 93%|█████████▎| 8131/8750 [1:50:57<59:38, 5.78s/it] {'loss': 0.444, 'learning_rate': 2.613615606197661e-07, 'epoch': 0.93} + 93%|█████████▎| 8131/8750 [1:50:55<59:38, 5.78s/it] 93%|█████████▎| 8132/8750 [1:51:03<58:59, 5.73s/it] 93%|█████████▎| 8132/8750 [1:51:01<58:59, 5.73s/it] {'loss': 0.4468, 'learning_rate': 2.605214618772356e-07, 'epoch': 0.93} + 93%|█████████▎| 8132/8750 [1:51:03<58:59, 5.73s/it] {'loss': 0.4468, 'learning_rate': 2.605214618772356e-07, 'epoch': 0.93} + 93%|█████████▎| 8132/8750 [1:51:01<58:59, 5.73s/it] 93%|█████████▎| 8133/8750 [1:51:09<59:05, 5.75s/it] 93%|█████████▎| 8133/8750 [1:51:07<59:05, 5.75s/it] {'loss': 0.4594, 'learning_rate': 2.596826976607114e-07, 'epoch': 0.93} + 93%|█████████▎| 8133/8750 [1:51:09<59:05, 5.75s/it] {'loss': 0.4594, 'learning_rate': 2.596826976607114e-07, 'epoch': 0.93} + 93%|█████████▎| 8133/8750 [1:51:07<59:05, 5.75s/it] 
93%|█████████▎| 8134/8750 [1:51:12<58:34, 5.71s/it] 93%|█████████▎| 8134/8750 [1:51:14<58:34, 5.71s/it] {'loss': 0.4441, 'learning_rate': 2.5884526808511946e-07, 'epoch': 0.93} + 93%|█████████▎| 8134/8750 [1:51:14<58:34, 5.71s/it] {'loss': 0.4441, 'learning_rate': 2.5884526808511946e-07, 'epoch': 0.93} + 93%|█████████▎| 8134/8750 [1:51:12<58:34, 5.71s/it] 93%|█████████▎| 8135/8750 [1:51:18<58:34, 5.71s/it] 93%|█████████▎| 8135/8750 [1:51:20<58:34, 5.71s/it] {'loss': 0.4514, 'learning_rate': 2.5800917326521013e-07, 'epoch': 0.93} + 93%|█████████▎| 8135/8750 [1:51:20<58:34, 5.71s/it] {'loss': 0.4514, 'learning_rate': 2.5800917326521013e-07, 'epoch': 0.93} + 93%|█████████▎| 8135/8750 [1:51:18<58:34, 5.71s/it] 93%|█████████▎| 8136/8750 [1:51:24<58:23, 5.71s/it] 93%|█████████▎| 8136/8750 [1:51:26<58:23, 5.71s/it] {'loss': 0.4474, 'learning_rate': 2.5717441331554517e-07, 'epoch': 0.93} + 93%|█████████▎| 8136/8750 [1:51:26<58:23, 5.71s/it] {'loss': 0.4474, 'learning_rate': 2.5717441331554517e-07, 'epoch': 0.93} + 93%|█████████▎| 8136/8750 [1:51:24<58:23, 5.71s/it] 93%|█████████▎| 8137/8750 [1:51:31<58:27, 5.72s/it] 93%|█████████▎| 8137/8750 [1:51:30<58:27, 5.72s/it] {'loss': 0.439, 'learning_rate': 2.5634098835050415e-07, 'epoch': 0.93} + 93%|█████████▎| 8137/8750 [1:51:31<58:27, 5.72s/it] {'loss': 0.439, 'learning_rate': 2.5634098835050415e-07, 'epoch': 0.93} + 93%|█████████▎| 8137/8750 [1:51:30<58:27, 5.72s/it] 93%|█████████▎| 8138/8750 [1:51:37<58:30, 5.74s/it] 93%|█████████▎| 8138/8750 [1:51:35<58:30, 5.74s/it] {'loss': 0.4606, 'learning_rate': 2.555088984842868e-07, 'epoch': 0.93} + 93%|█████████▎| 8138/8750 [1:51:37<58:30, 5.74s/it] {'loss': 0.4606, 'learning_rate': 2.555088984842868e-07, 'epoch': 0.93} + 93%|█████████▎| 8138/8750 [1:51:35<58:30, 5.74s/it] 93%|█████████▎| 8139/8750 [1:51:43<58:30, 5.74s/it] 93%|█████████▎| 8139/8750 [1:51:41<58:30, 5.74s/it] {'loss': 0.4344, 'learning_rate': 2.546781438309087e-07, 'epoch': 0.93} + 93%|█████████▎| 8139/8750 
[1:51:43<58:30, 5.74s/it] {'loss': 0.4344, 'learning_rate': 2.546781438309087e-07, 'epoch': 0.93} + 93%|█████████▎| 8139/8750 [1:51:41<58:30, 5.74s/it] 93%|█████████▎| 8140/8750 [1:51:47<58:56, 5.80s/it] 93%|█████████▎| 8140/8750 [1:51:49<58:56, 5.80s/it] {'loss': 0.4299, 'learning_rate': 2.5384872450419985e-07, 'epoch': 0.93} + 93%|█████████▎| 8140/8750 [1:51:49<58:56, 5.80s/it] {'loss': 0.4299, 'learning_rate': 2.5384872450419985e-07, 'epoch': 0.93} + 93%|█████████▎| 8140/8750 [1:51:47<58:56, 5.80s/it] 93%|█████████▎| 8141/8750 [1:51:55<58:19, 5.75s/it] 93%|█████████▎| 8141/8750 [1:51:53<58:19, 5.75s/it] {'loss': 0.4699, 'learning_rate': 2.530206406178104e-07, 'epoch': 0.93} + {'loss': 0.4699, 'learning_rate': 2.530206406178104e-07, 'epoch': 0.93} 93%|█████████▎| 8141/8750 [1:51:55<58:19, 5.75s/it] + 93%|█████████▎| 8141/8750 [1:51:53<58:19, 5.75s/it] 93%|█████████▎| 8142/8750 [1:52:00<58:05, 5.73s/it] 93%|█████████▎| 8142/8750 [1:51:58<58:05, 5.73s/it] {'loss': 0.452, 'learning_rate': 2.5219389228520517e-07, 'epoch': 0.93} + 93%|█████████▎| 8142/8750 [1:52:00<58:05, 5.73s/it] {'loss': 0.452, 'learning_rate': 2.5219389228520517e-07, 'epoch': 0.93} + 93%|█████████▎| 8142/8750 [1:51:58<58:05, 5.73s/it] 93%|█████████▎| 8143/8750 [1:52:06<58:18, 5.76s/it] 93%|█████████▎| 8143/8750 [1:52:04<58:18, 5.76s/it] {'loss': 0.4446, 'learning_rate': 2.51368479619668e-07, 'epoch': 0.93} + 93%|█████████▎| 8143/8750 [1:52:06<58:18, 5.76s/it] {'loss': 0.4446, 'learning_rate': 2.51368479619668e-07, 'epoch': 0.93} + 93%|█████████▎| 8143/8750 [1:52:04<58:18, 5.76s/it] 93%|█████████▎| 8144/8750 [1:52:12<57:59, 5.74s/it] 93%|█████████▎| 8144/8750 [1:52:10<57:59, 5.74s/it] {'loss': 0.4496, 'learning_rate': 2.505444027342996e-07, 'epoch': 0.93} + 93%|█████████▎| 8144/8750 [1:52:12<57:59, 5.74s/it] {'loss': 0.4496, 'learning_rate': 2.505444027342996e-07, 'epoch': 0.93} + 93%|█████████▎| 8144/8750 [1:52:10<57:59, 5.74s/it] 93%|█████████▎| 8145/8750 [1:52:17<57:34, 5.71s/it] 93%|█████████▎| 
8145/8750 [1:52:15<57:34, 5.71s/it] {'loss': 0.4444, 'learning_rate': 2.497216617420151e-07, 'epoch': 0.93} + 93%|█████████▎| 8145/8750 [1:52:17<57:34, 5.71s/it] {'loss': 0.4444, 'learning_rate': 2.497216617420151e-07, 'epoch': 0.93} + 93%|█████████▎| 8145/8750 [1:52:15<57:34, 5.71s/it] 93%|█████████▎| 8146/8750 [1:52:23<57:44, 5.74s/it] 93%|█████████▎| 8146/8750 [1:52:21<57:44, 5.74s/it] {'loss': 0.4439, 'learning_rate': 2.4890025675554983e-07, 'epoch': 0.93} + 93%|█████████▎| 8146/8750 [1:52:23<57:44, 5.74s/it] {'loss': 0.4439, 'learning_rate': 2.4890025675554983e-07, 'epoch': 0.93} + 93%|█████████▎| 8146/8750 [1:52:21<57:44, 5.74s/it] 93%|█████████▎| 8147/8750 [1:52:29<57:32, 5.73s/it] 93%|█████████▎| 8147/8750 [1:52:27<57:32, 5.73s/it] {'loss': 0.4742, 'learning_rate': 2.480801878874528e-07, 'epoch': 0.93} + 93%|█████████▎| 8147/8750 [1:52:29<57:32, 5.73s/it] {'loss': 0.4742, 'learning_rate': 2.480801878874528e-07, 'epoch': 0.93} + 93%|█████████▎| 8147/8750 [1:52:27<57:32, 5.73s/it] 93%|█████████▎| 8148/8750 [1:52:35<58:03, 5.79s/it] 93%|█████████▎| 8148/8750 [1:52:33<58:03, 5.79s/it] {'loss': 0.4355, 'learning_rate': 2.4726145525009404e-07, 'epoch': 0.93} + 93%|█████████▎| 8148/8750 [1:52:35<58:03, 5.79s/it] {'loss': 0.4355, 'learning_rate': 2.4726145525009404e-07, 'epoch': 0.93} + 93%|█████████▎| 8148/8750 [1:52:33<58:03, 5.79s/it] 93%|█████████▎| 8149/8750 [1:52:39<58:12, 5.81s/it] 93%|█████████▎| 8149/8750 [1:52:41<58:12, 5.81s/it] {'loss': 0.4577, 'learning_rate': 2.4644405895565717e-07, 'epoch': 0.93} + {'loss': 0.4577, 'learning_rate': 2.4644405895565717e-07, 'epoch': 0.93} + 93%|█████████▎| 8149/8750 [1:52:41<58:12, 5.81s/it] 93%|█████████▎| 8149/8750 [1:52:39<58:12, 5.81s/it]8 AutoResumeHook: Checking whether to suspend... +16 5AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 93%|█████████▎| 8150/8750 [1:52:46<57:53, 5.79s/it]12 AutoResumeHook: Checking whether to suspend... 
+11 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + +04 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 13 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + 93%|█████████▎| 8150/8750 [1:52:45<57:53, 5.79s/it]9 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4551, 'learning_rate': 2.456279991161437e-07, 'epoch': 0.93} + 93%|█████████▎| 8150/8750 [1:52:46<57:53, 5.79s/it] {'loss': 0.4551, 'learning_rate': 2.456279991161437e-07, 'epoch': 0.93} + 93%|█████████▎| 8150/8750 [1:52:45<57:53, 5.79s/it] 93%|█████████▎| 8151/8750 [1:52:52<57:10, 5.73s/it] 93%|█████████▎| 8151/8750 [1:52:50<57:10, 5.73s/it] {'loss': 0.4834, 'learning_rate': 2.448132758433719e-07, 'epoch': 0.93} + 93%|█████████▎| 8151/8750 [1:52:52<57:10, 5.73s/it] {'loss': 0.4834, 'learning_rate': 2.448132758433719e-07, 'epoch': 0.93} + 93%|█████████▎| 8151/8750 [1:52:50<57:10, 5.73s/it] 93%|█████████▎| 8152/8750 [1:52:56<57:02, 5.72s/it] 93%|█████████▎| 8152/8750 [1:52:58<57:02, 5.72s/it] {'loss': 0.4243, 'learning_rate': 2.439998892489781e-07, 'epoch': 0.93} + 93%|█████████▎| 8152/8750 [1:52:58<57:02, 5.72s/it] {'loss': 0.4243, 'learning_rate': 2.439998892489781e-07, 'epoch': 0.93} + 93%|█████████▎| 8152/8750 [1:52:56<57:02, 5.72s/it] 93%|█████████▎| 8153/8750 [1:53:03<56:55, 5.72s/it] 93%|█████████▎| 8153/8750 [1:53:02<56:55, 5.72s/it] {'loss': 0.458, 'learning_rate': 2.4318783944441314e-07, 'epoch': 0.93} + 93%|█████████▎| 8153/8750 [1:53:03<56:55, 5.72s/it] {'loss': 0.458, 'learning_rate': 2.4318783944441314e-07, 'epoch': 0.93} + 93%|█████████▎| 8153/8750 [1:53:02<56:55, 5.72s/it] 93%|█████████▎| 
8154/8750 [1:53:09<56:51, 5.72s/it] 93%|█████████▎| 8154/8750 [1:53:07<56:51, 5.72s/it] {'loss': 0.4451, 'learning_rate': 2.4237712654094693e-07, 'epoch': 0.93} + 93%|█████████▎| 8154/8750 [1:53:09<56:51, 5.72s/it] {'loss': 0.4451, 'learning_rate': 2.4237712654094693e-07, 'epoch': 0.93} + 93%|█████████▎| 8154/8750 [1:53:07<56:51, 5.72s/it] 93%|█████████▎| 8155/8750 [1:53:15<56:26, 5.69s/it] 93%|█████████▎| 8155/8750 [1:53:13<56:26, 5.69s/it] {'loss': 0.4431, 'learning_rate': 2.4156775064966273e-07, 'epoch': 0.93} + 93%|█████████▎| 8155/8750 [1:53:15<56:26, 5.69s/it] {'loss': 0.4431, 'learning_rate': 2.4156775064966273e-07, 'epoch': 0.93} + 93%|█████████▎| 8155/8750 [1:53:13<56:26, 5.69s/it] 93%|█████████▎| 8156/8750 [1:53:21<56:50, 5.74s/it] 93%|█████████▎| 8156/8750 [1:53:19<56:50, 5.74s/it] {'loss': 0.4606, 'learning_rate': 2.4075971188146754e-07, 'epoch': 0.93} + 93%|█████████▎| 8156/8750 [1:53:21<56:50, 5.74s/it] {'loss': 0.4606, 'learning_rate': 2.4075971188146754e-07, 'epoch': 0.93} + 93%|█████████▎| 8156/8750 [1:53:19<56:50, 5.74s/it] 93%|█████████▎| 8157/8750 [1:53:26<56:53, 5.76s/it] 93%|█████████▎| 8157/8750 [1:53:25<56:53, 5.76s/it] {'loss': 0.451, 'learning_rate': 2.3995301034707597e-07, 'epoch': 0.93} + 93%|█████████▎| 8157/8750 [1:53:26<56:53, 5.76s/it] {'loss': 0.451, 'learning_rate': 2.3995301034707597e-07, 'epoch': 0.93} + 93%|█████████▎| 8157/8750 [1:53:25<56:53, 5.76s/it] 93%|█████████▎| 8158/8750 [1:53:32<56:50, 5.76s/it] 93%|█████████▎| 8158/8750 [1:53:30<56:50, 5.76s/it] {'loss': 0.4384, 'learning_rate': 2.3914764615702747e-07, 'epoch': 0.93} + 93%|█████████▎| 8158/8750 [1:53:32<56:50, 5.76s/it] {'loss': 0.4384, 'learning_rate': 2.3914764615702747e-07, 'epoch': 0.93} + 93%|█████████▎| 8158/8750 [1:53:30<56:50, 5.76s/it] 93%|█████████▎| 8159/8750 [1:53:38<56:13, 5.71s/it] 93%|█████████▎| 8159/8750 [1:53:36<56:13, 5.71s/it] {'loss': 0.4686, 'learning_rate': 2.3834361942167484e-07, 'epoch': 0.93} + 93%|█████████▎| 8159/8750 [1:53:38<56:13, 
5.71s/it] {'loss': 0.4686, 'learning_rate': 2.3834361942167484e-07, 'epoch': 0.93} + 93%|█████████▎| 8159/8750 [1:53:36<56:13, 5.71s/it] 93%|█████████▎| 8160/8750 [1:53:44<56:50, 5.78s/it] 93%|█████████▎| 8160/8750 [1:53:42<56:50, 5.78s/it] {'loss': 0.4473, 'learning_rate': 2.375409302511855e-07, 'epoch': 0.93} + 93%|█████████▎| 8160/8750 [1:53:44<56:50, 5.78s/it] {'loss': 0.4473, 'learning_rate': 2.375409302511855e-07, 'epoch': 0.93} + 93%|█████████▎| 8160/8750 [1:53:42<56:50, 5.78s/it] 93%|█████████▎| 8161/8750 [1:53:50<56:49, 5.79s/it] 93%|█████████▎| 8161/8750 [1:53:48<56:49, 5.79s/it] {'loss': 0.4443, 'learning_rate': 2.367395787555482e-07, 'epoch': 0.93} + 93%|█████████▎| 8161/8750 [1:53:50<56:49, 5.79s/it] {'loss': 0.4443, 'learning_rate': 2.367395787555482e-07, 'epoch': 0.93} + 93%|█████████▎| 8161/8750 [1:53:48<56:49, 5.79s/it] 93%|█████████▎| 8162/8750 [1:53:53<56:00, 5.71s/it] 93%|█████████▎| 8162/8750 [1:53:55<56:00, 5.71s/it] {'loss': 0.4512, 'learning_rate': 2.3593956504456396e-07, 'epoch': 0.93} + {'loss': 0.4512, 'learning_rate': 2.3593956504456396e-07, 'epoch': 0.93} 93%|█████████▎| 8162/8750 [1:53:55<56:00, 5.71s/it] + 93%|█████████▎| 8162/8750 [1:53:53<56:00, 5.71s/it] 93%|█████████▎| 8163/8750 [1:53:59<55:29, 5.67s/it] 93%|█████████▎| 8163/8750 [1:54:01<55:29, 5.67s/it] {'loss': 0.4504, 'learning_rate': 2.3514088922785284e-07, 'epoch': 0.93} + 93%|█████████▎| 8163/8750 [1:54:01<55:29, 5.67s/it] {'loss': 0.4504, 'learning_rate': 2.3514088922785284e-07, 'epoch': 0.93} + 93%|█████████▎| 8163/8750 [1:53:59<55:29, 5.67s/it] 93%|█████████▎| 8164/8750 [1:54:06<55:03, 5.64s/it] 93%|█████████▎| 8164/8750 [1:54:04<55:03, 5.64s/it] {'loss': 0.4591, 'learning_rate': 2.3434355141485287e-07, 'epoch': 0.93} + 93%|█████████▎| 8164/8750 [1:54:06<55:03, 5.64s/it] {'loss': 0.4591, 'learning_rate': 2.3434355141485287e-07, 'epoch': 0.93} + 93%|█████████▎| 8164/8750 [1:54:04<55:03, 5.64s/it] 93%|█████████▎| 8165/8750 [1:54:12<54:58, 5.64s/it] 93%|█████████▎| 
8165/8750 [1:54:10<54:58, 5.64s/it] {'loss': 0.4477, 'learning_rate': 2.335475517148167e-07, 'epoch': 0.93} + 93%|█████████▎| 8165/8750 [1:54:12<54:58, 5.64s/it] {'loss': 0.4477, 'learning_rate': 2.335475517148167e-07, 'epoch': 0.93} + 93%|█████████▎| 8165/8750 [1:54:10<54:58, 5.64s/it] 93%|█████████▎| 8166/8750 [1:54:18<55:49, 5.73s/it] 93%|█████████▎| 8166/8750 [1:54:16<55:49, 5.73s/it] {'loss': 0.4334, 'learning_rate': 2.3275289023681148e-07, 'epoch': 0.93} + 93%|█████████▎| 8166/8750 [1:54:18<55:49, 5.73s/it] {'loss': 0.4334, 'learning_rate': 2.3275289023681148e-07, 'epoch': 0.93} + 93%|█████████▎| 8166/8750 [1:54:16<55:49, 5.73s/it] 93%|█████████▎| 8167/8750 [1:54:23<55:27, 5.71s/it] 93%|█████████▎| 8167/8750 [1:54:22<55:27, 5.71s/it] {'loss': 0.462, 'learning_rate': 2.3195956708972566e-07, 'epoch': 0.93} + 93%|█████████▎| 8167/8750 [1:54:23<55:27, 5.71s/it] {'loss': 0.462, 'learning_rate': 2.3195956708972566e-07, 'epoch': 0.93} + 93%|█████████▎| 8167/8750 [1:54:22<55:27, 5.71s/it] 93%|█████████▎| 8168/8750 [1:54:29<54:50, 5.65s/it] 93%|█████████▎| 8168/8750 [1:54:27<54:50, 5.65s/it] {'loss': 0.4558, 'learning_rate': 2.3116758238226233e-07, 'epoch': 0.93} + 93%|█████████▎| 8168/8750 [1:54:29<54:50, 5.65s/it] {'loss': 0.4558, 'learning_rate': 2.3116758238226233e-07, 'epoch': 0.93} + 93%|█████████▎| 8168/8750 [1:54:27<54:50, 5.65s/it] 93%|█████████▎| 8169/8750 [1:54:35<55:22, 5.72s/it] 93%|█████████▎| 8169/8750 [1:54:33<55:22, 5.72s/it] {'loss': 0.4568, 'learning_rate': 2.3037693622294244e-07, 'epoch': 0.93} + 93%|█████████▎| 8169/8750 [1:54:35<55:22, 5.72s/it] {'loss': 0.4568, 'learning_rate': 2.3037693622294244e-07, 'epoch': 0.93} + 93%|█████████▎| 8169/8750 [1:54:33<55:22, 5.72s/it] 93%|█████████▎| 8170/8750 [1:54:39<54:56, 5.68s/it] 93%|█████████▎| 8170/8750 [1:54:40<54:56, 5.68s/it] {'loss': 0.4401, 'learning_rate': 2.2958762872009932e-07, 'epoch': 0.93} + 93%|█████████▎| 8170/8750 [1:54:40<54:56, 5.68s/it] {'loss': 0.4401, 'learning_rate': 
2.2958762872009932e-07, 'epoch': 0.93} + 93%|█████████▎| 8170/8750 [1:54:39<54:56, 5.68s/it] 93%|█████████▎| 8171/8750 [1:54:44<54:57, 5.69s/it] 93%|█████████▎| 8171/8750 [1:54:46<54:57, 5.69s/it] {'loss': 0.4245, 'learning_rate': 2.2879965998188646e-07, 'epoch': 0.93} + {'loss': 0.4245, 'learning_rate': 2.2879965998188646e-07, 'epoch': 0.93} 93%|█████████▎| 8171/8750 [1:54:46<54:57, 5.69s/it] + 93%|█████████▎| 8171/8750 [1:54:44<54:57, 5.69s/it] 93%|█████████▎| 8172/8750 [1:54:52<54:51, 5.69s/it] 93%|█████████▎| 8172/8750 [1:54:50<54:51, 5.69s/it] {'loss': 0.4457, 'learning_rate': 2.280130301162742e-07, 'epoch': 0.93} + 93%|█████████▎| 8172/8750 [1:54:52<54:51, 5.69s/it] {'loss': 0.4457, 'learning_rate': 2.280130301162742e-07, 'epoch': 0.93} + 93%|█████████▎| 8172/8750 [1:54:50<54:51, 5.69s/it] 93%|█████████▎| 8173/8750 [1:54:56<54:55, 5.71s/it] 93%|█████████▎| 8173/8750 [1:54:58<54:56, 5.71s/it] {'loss': 0.4725, 'learning_rate': 2.2722773923104736e-07, 'epoch': 0.93} + 93%|█████████▎| 8173/8750 [1:54:58<54:56, 5.71s/it] {'loss': 0.4725, 'learning_rate': 2.2722773923104736e-07, 'epoch': 0.93} + 93%|█████████▎| 8173/8750 [1:54:56<54:55, 5.71s/it] 93%|█████████▎| 8174/8750 [1:55:02<55:26, 5.78s/it] 93%|█████████▎| 8174/8750 [1:55:04<55:26, 5.78s/it] {'loss': 0.4427, 'learning_rate': 2.264437874338099e-07, 'epoch': 0.93} + {'loss': 0.4427, 'learning_rate': 2.264437874338099e-07, 'epoch': 0.93} + 93%|█████████▎| 8174/8750 [1:55:04<55:26, 5.78s/it] 93%|█████████▎| 8174/8750 [1:55:02<55:26, 5.78s/it] 93%|█████████▎| 8175/8750 [1:55:07<55:12, 5.76s/it] 93%|█████████▎| 8175/8750 [1:55:09<55:12, 5.76s/it] {'loss': 0.4492, 'learning_rate': 2.2566117483197923e-07, 'epoch': 0.93} + 93%|█████████▎| 8175/8750 [1:55:09<55:12, 5.76s/it] {'loss': 0.4492, 'learning_rate': 2.2566117483197923e-07, 'epoch': 0.93} + 93%|█████████▎| 8175/8750 [1:55:07<55:12, 5.76s/it] 93%|█████████▎| 8176/8750 [1:55:13<54:34, 5.70s/it] 93%|█████████▎| 8176/8750 [1:55:15<54:34, 5.70s/it] {'loss': 0.4409, 
'learning_rate': 2.248799015327907e-07, 'epoch': 0.93} + 93%|█████████▎| 8176/8750 [1:55:15<54:34, 5.70s/it] {'loss': 0.4409, 'learning_rate': 2.248799015327907e-07, 'epoch': 0.93} + 93%|█████████▎| 8176/8750 [1:55:13<54:34, 5.70s/it] 93%|█████████▎| 8177/8750 [1:55:19<54:35, 5.72s/it] 93%|█████████▎| 8177/8750 [1:55:21<54:35, 5.72s/it] {'loss': 0.431, 'learning_rate': 2.2409996764329644e-07, 'epoch': 0.93} + 93%|█████████▎| 8177/8750 [1:55:21<54:35, 5.72s/it] {'loss': 0.431, 'learning_rate': 2.2409996764329644e-07, 'epoch': 0.93} + 93%|█████████▎| 8177/8750 [1:55:19<54:35, 5.72s/it] 93%|█████████▎| 8178/8750 [1:55:24<54:16, 5.69s/it] 93%|█████████▎| 8178/8750 [1:55:26<54:16, 5.69s/it] {'loss': 0.4616, 'learning_rate': 2.233213732703665e-07, 'epoch': 0.93} + 93%|█████████▎| 8178/8750 [1:55:26<54:16, 5.69s/it] {'loss': 0.4616, 'learning_rate': 2.233213732703665e-07, 'epoch': 0.93} + 93%|█████████▎| 8178/8750 [1:55:24<54:16, 5.69s/it] 93%|█████████▎| 8179/8750 [1:55:30<54:17, 5.70s/it] 93%|█████████▎| 8179/8750 [1:55:32<54:17, 5.70s/it] {'loss': 0.442, 'learning_rate': 2.2254411852068226e-07, 'epoch': 0.93} + 93%|█████████▎| 8179/8750 [1:55:32<54:17, 5.70s/it] {'loss': 0.442, 'learning_rate': 2.2254411852068226e-07, 'epoch': 0.93} + 93%|█████████▎| 8179/8750 [1:55:30<54:17, 5.70s/it] 93%|█████████▎| 8180/8750 [1:55:36<53:49, 5.67s/it] 93%|█████████▎| 8180/8750 [1:55:38<53:49, 5.67s/it] {'loss': 0.4572, 'learning_rate': 2.2176820350074846e-07, 'epoch': 0.93} + 93%|█████████▎| 8180/8750 [1:55:38<53:49, 5.67s/it] {'loss': 0.4572, 'learning_rate': 2.2176820350074846e-07, 'epoch': 0.93} + 93%|█████████▎| 8180/8750 [1:55:36<53:49, 5.67s/it] 93%|█████████▎| 8181/8750 [1:55:41<53:56, 5.69s/it] 93%|█████████▎| 8181/8750 [1:55:43<53:56, 5.69s/it] {'loss': 0.4326, 'learning_rate': 2.2099362831688008e-07, 'epoch': 0.93} + 93%|█████████▎| 8181/8750 [1:55:43<53:56, 5.69s/it] {'loss': 0.4326, 'learning_rate': 2.2099362831688008e-07, 'epoch': 0.93} + 93%|█████████▎| 8181/8750 
[1:55:41<53:56, 5.69s/it] 94%|█████████▎| 8182/8750 [1:55:47<53:49, 5.69s/it] 94%|█████████▎| 8182/8750 [1:55:49<53:49, 5.69s/it] {'loss': 0.4631, 'learning_rate': 2.2022039307521337e-07, 'epoch': 0.94} + {'loss': 0.4631, 'learning_rate': 2.2022039307521337e-07, 'epoch': 0.94} 94%|█████████▎| 8182/8750 [1:55:49<53:49, 5.69s/it] + 94%|█████████▎| 8182/8750 [1:55:47<53:49, 5.69s/it] 94%|█████████▎| 8183/8750 [1:55:53<53:37, 5.67s/it] 94%|█████████▎| 8183/8750 [1:55:55<53:37, 5.67s/it] {'loss': 0.4388, 'learning_rate': 2.1944849788169798e-07, 'epoch': 0.94} + 94%|█████████▎| 8183/8750 [1:55:55<53:37, 5.67s/it] {'loss': 0.4388, 'learning_rate': 2.1944849788169798e-07, 'epoch': 0.94} + 94%|█████████▎| 8183/8750 [1:55:53<53:37, 5.67s/it] 94%|█████████▎| 8184/8750 [1:55:58<53:30, 5.67s/it] 94%|█████████▎| 8184/8750 [1:56:00<53:30, 5.67s/it] {'loss': 0.4536, 'learning_rate': 2.1867794284209932e-07, 'epoch': 0.94} + 94%|█████████▎| 8184/8750 [1:56:00<53:30, 5.67s/it] {'loss': 0.4536, 'learning_rate': 2.1867794284209932e-07, 'epoch': 0.94} + 94%|█████████▎| 8184/8750 [1:55:58<53:30, 5.67s/it] 94%|█████████▎| 8185/8750 [1:56:04<52:56, 5.62s/it] 94%|█████████▎| 8185/8750 [1:56:06<52:56, 5.62s/it] {'loss': 0.4582, 'learning_rate': 2.179087280620018e-07, 'epoch': 0.94} + 94%|█████████▎| 8185/8750 [1:56:06<52:56, 5.62s/it] {'loss': 0.4582, 'learning_rate': 2.179087280620018e-07, 'epoch': 0.94} + 94%|█████████▎| 8185/8750 [1:56:04<52:56, 5.62s/it] 94%|█████████▎| 8186/8750 [1:56:10<53:02, 5.64s/it] 94%|█████████▎| 8186/8750 [1:56:11<53:02, 5.64s/it] {'loss': 0.4496, 'learning_rate': 2.1714085364680671e-07, 'epoch': 0.94} + 94%|█████████▎| 8186/8750 [1:56:11<53:02, 5.64s/it] {'loss': 0.4496, 'learning_rate': 2.1714085364680671e-07, 'epoch': 0.94} + 94%|█████████▎| 8186/8750 [1:56:10<53:02, 5.64s/it] 94%|█████████▎| 8187/8750 [1:56:15<53:00, 5.65s/it] 94%|█████████▎| 8187/8750 [1:56:17<53:00, 5.65s/it] {'loss': 0.4677, 'learning_rate': 2.163743197017265e-07, 'epoch': 0.94} + 
{'loss': 0.4677, 'learning_rate': 2.163743197017265e-07, 'epoch': 0.94} 94%|█████████▎| 8187/8750 [1:56:17<53:00, 5.65s/it] + 94%|█████████▎| 8187/8750 [1:56:15<53:00, 5.65s/it] 94%|█████████▎| 8188/8750 [1:56:21<52:49, 5.64s/it] 94%|█████████▎| 8188/8750 [1:56:23<52:49, 5.64s/it] {'loss': 0.4457, 'learning_rate': 2.156091263317972e-07, 'epoch': 0.94} + 94%|█████████▎| 8188/8750 [1:56:23<52:49, 5.64s/it] {'loss': 0.4457, 'learning_rate': 2.156091263317972e-07, 'epoch': 0.94} + 94%|█████████▎| 8188/8750 [1:56:21<52:49, 5.64s/it] 94%|█████████▎| 8189/8750 [1:56:26<52:44, 5.64s/it] 94%|█████████▎| 8189/8750 [1:56:28<52:44, 5.64s/it] {'loss': 0.4335, 'learning_rate': 2.1484527364186492e-07, 'epoch': 0.94} + 94%|█████████▎| 8189/8750 [1:56:28<52:44, 5.64s/it] {'loss': 0.4335, 'learning_rate': 2.1484527364186492e-07, 'epoch': 0.94} + 94%|█████████▎| 8189/8750 [1:56:26<52:44, 5.64s/it] 94%|█████████▎| 8190/8750 [1:56:32<52:59, 5.68s/it] 94%|█████████▎| 8190/8750 [1:56:34<52:59, 5.68s/it] {'loss': 0.4598, 'learning_rate': 2.140827617365948e-07, 'epoch': 0.94} + 94%|█████████▎| 8190/8750 [1:56:34<52:59, 5.68s/it] {'loss': 0.4598, 'learning_rate': 2.140827617365948e-07, 'epoch': 0.94} + 94%|█████████▎| 8190/8750 [1:56:32<52:59, 5.68s/it] 94%|█████████▎| 8191/8750 [1:56:38<53:03, 5.69s/it] 94%|█████████▎| 8191/8750 [1:56:40<53:03, 5.69s/it] {'loss': 0.4615, 'learning_rate': 2.1332159072046887e-07, 'epoch': 0.94} + {'loss': 0.4615, 'learning_rate': 2.1332159072046887e-07, 'epoch': 0.94} + 94%|█████████▎| 8191/8750 [1:56:40<53:03, 5.69s/it] 94%|█████████▎| 8191/8750 [1:56:38<53:03, 5.69s/it] 94%|█████████▎| 8192/8750 [1:56:44<52:54, 5.69s/it] 94%|█████████▎| 8192/8750 [1:56:46<52:54, 5.69s/it] {'loss': 0.4394, 'learning_rate': 2.1256176069778367e-07, 'epoch': 0.94} + 94%|█████████▎| 8192/8750 [1:56:46<52:54, 5.69s/it] {'loss': 0.4394, 'learning_rate': 2.1256176069778367e-07, 'epoch': 0.94} + 94%|█████████▎| 8192/8750 [1:56:44<52:54, 5.69s/it] 94%|█████████▎| 8193/8750 
[1:56:49<52:28, 5.65s/it] 94%|█████████▎| 8193/8750 [1:56:51<52:28, 5.65s/it] {'loss': 0.4442, 'learning_rate': 2.118032717726537e-07, 'epoch': 0.94} + 94%|█████████▎| 8193/8750 [1:56:51<52:28, 5.65s/it] {'loss': 0.4442, 'learning_rate': 2.118032717726537e-07, 'epoch': 0.94} + 94%|█████████▎| 8193/8750 [1:56:49<52:28, 5.65s/it] 94%|█████████▎| 8194/8750 [1:56:55<52:26, 5.66s/it] 94%|█████████▎| 8194/8750 [1:56:57<52:26, 5.66s/it] {'loss': 0.4259, 'learning_rate': 2.1104612404900805e-07, 'epoch': 0.94} + 94%|█████████▎| 8194/8750 [1:56:57<52:26, 5.66s/it] {'loss': 0.4259, 'learning_rate': 2.1104612404900805e-07, 'epoch': 0.94} + 94%|█████████▎| 8194/8750 [1:56:55<52:26, 5.66s/it] 94%|█████████▎| 8195/8750 [1:57:00<51:56, 5.62s/it] 94%|█████████▎| 8195/8750 [1:57:02<51:56, 5.62s/it] {'loss': 0.461, 'learning_rate': 2.102903176305926e-07, 'epoch': 0.94} + 94%|█████████▎| 8195/8750 [1:57:02<51:56, 5.62s/it] {'loss': 0.461, 'learning_rate': 2.102903176305926e-07, 'epoch': 0.94} + 94%|█████████▎| 8195/8750 [1:57:00<51:56, 5.62s/it] 94%|█████████▎| 8196/8750 [1:57:06<51:57, 5.63s/it] 94%|█████████▎| 8196/8750 [1:57:08<51:57, 5.63s/it] {'loss': 0.451, 'learning_rate': 2.0953585262097232e-07, 'epoch': 0.94} + 94%|█████████▎| 8196/8750 [1:57:08<51:57, 5.63s/it] {'loss': 0.451, 'learning_rate': 2.0953585262097232e-07, 'epoch': 0.94} + 94%|█████████▎| 8196/8750 [1:57:06<51:57, 5.63s/it] 94%|█████████▎| 8197/8750 [1:57:12<52:13, 5.67s/it] 94%|█████████▎| 8197/8750 [1:57:14<52:13, 5.67s/it] {'loss': 0.455, 'learning_rate': 2.0878272912352117e-07, 'epoch': 0.94} + 94%|█████████▎| 8197/8750 [1:57:14<52:13, 5.67s/it] {'loss': 0.455, 'learning_rate': 2.0878272912352117e-07, 'epoch': 0.94} + 94%|█████████▎| 8197/8750 [1:57:12<52:13, 5.67s/it] 94%|█████████▎| 8198/8750 [1:57:18<53:06, 5.77s/it] 94%|█████████▎| 8198/8750 [1:57:20<53:06, 5.77s/it] {'loss': 0.4479, 'learning_rate': 2.0803094724143879e-07, 'epoch': 0.94} + 94%|█████████▎| 8198/8750 [1:57:20<53:06, 5.77s/it] {'loss': 
0.4479, 'learning_rate': 2.0803094724143879e-07, 'epoch': 0.94} + 94%|█████████▎| 8198/8750 [1:57:18<53:06, 5.77s/it] 94%|█████████▎| 8199/8750 [1:57:24<52:48, 5.75s/it] 94%|█████████▎| 8199/8750 [1:57:25<52:48, 5.75s/it] {'loss': 0.4658, 'learning_rate': 2.0728050707773285e-07, 'epoch': 0.94} + {'loss': 0.4658, 'learning_rate': 2.0728050707773285e-07, 'epoch': 0.94} + 94%|█████████▎| 8199/8750 [1:57:25<52:48, 5.75s/it] 94%|█████████▎| 8199/8750 [1:57:24<52:48, 5.75s/it]1 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... + 94%|█████████▎| 8200/8750 [1:57:32<53:31, 5.84s/it]011 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 94%|█████████▎| 8200/8750 [1:57:30<53:31, 5.84s/it]2 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4647, 'learning_rate': 2.0653140873523104e-07, 'epoch': 0.94} + 94%|█████████▎| 8200/8750 [1:57:32<53:31, 5.84s/it] {'loss': 0.4647, 'learning_rate': 2.0653140873523104e-07, 'epoch': 0.94} + 94%|█████████▎| 8200/8750 [1:57:30<53:31, 5.84s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8200/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8200/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8200/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 94%|█████████▎| 8201/8750 [1:57:57<1:46:41, 11.66s/it] 94%|█████████▎| 8201/8750 [1:57:55<1:46:41, 11.66s/it] {'loss': 0.4496, 'learning_rate': 2.0578365231657792e-07, 'epoch': 0.94} + 94%|█████████▎| 8201/8750 [1:57:57<1:46:41, 11.66s/it] {'loss': 0.4496, 'learning_rate': 2.0578365231657792e-07, 'epoch': 0.94} + 94%|█████████▎| 8201/8750 [1:57:55<1:46:41, 11.66s/it] 94%|█████████▎| 8202/8750 [1:58:02<1:29:57, 9.85s/it] 94%|█████████▎| 8202/8750 [1:58:00<1:29:58, 9.85s/it] {'loss': 0.4439, 'learning_rate': 2.0503723792423047e-07, 'epoch': 0.94} + 94%|█████████▎| 8202/8750 [1:58:02<1:29:57, 9.85s/it] {'loss': 0.4439, 'learning_rate': 2.0503723792423047e-07, 'epoch': 0.94} + 94%|█████████▎| 8202/8750 [1:58:00<1:29:58, 9.85s/it] 94%|█████████▎| 8203/8750 [1:58:06<1:17:49, 8.54s/it] 94%|█████████▎| 8203/8750 [1:58:08<1:17:49, 8.54s/it] {'loss': 0.4974, 'learning_rate': 2.0429216566046682e-07, 'epoch': 0.94} + 94%|█████████▎| 8203/8750 [1:58:08<1:17:49, 8.54s/it] {'loss': 0.4974, 'learning_rate': 2.0429216566046682e-07, 'epoch': 0.94} + 94%|█████████▎| 8203/8750 [1:58:06<1:17:49, 8.54s/it] 94%|█████████▍| 8204/8750 [1:58:14<1:09:51, 7.68s/it] 94%|█████████▍| 8204/8750 [1:58:12<1:09:51, 7.68s/it] {'loss': 0.4585, 'learning_rate': 2.0354843562737537e-07, 'epoch': 0.94} + 94%|█████████▍| 8204/8750 [1:58:14<1:09:51, 7.68s/it] {'loss': 0.4585, 'learning_rate': 2.0354843562737537e-07, 'epoch': 0.94} + 94%|█████████▍| 8204/8750 [1:58:12<1:09:51, 7.68s/it] 94%|█████████▍| 8205/8750 [1:58:19<1:04:50, 7.14s/it] 94%|█████████▍| 8205/8750 [1:58:17<1:04:50, 7.14s/it] {'loss': 0.4521, 'learning_rate': 2.0280604792686676e-07, 'epoch': 0.94} + 94%|█████████▍| 8205/8750 [1:58:19<1:04:50, 7.14s/it] {'loss': 0.4521, 'learning_rate': 2.0280604792686676e-07, 'epoch': 0.94} + 94%|█████████▍| 8205/8750 [1:58:17<1:04:50, 7.14s/it] 94%|█████████▍| 8206/8750 [1:58:25<1:00:54, 6.72s/it] 94%|█████████▍| 8206/8750 [1:58:23<1:00:54, 6.72s/it] {'loss': 0.4531, 'learning_rate': 
2.0206500266066297e-07, 'epoch': 0.94} + 94%|█████████▍| 8206/8750 [1:58:25<1:00:54, 6.72s/it] {'loss': 0.4531, 'learning_rate': 2.0206500266066297e-07, 'epoch': 0.94} + 94%|█████████▍| 8206/8750 [1:58:23<1:00:54, 6.72s/it] 94%|█████████▍| 8207/8750 [1:58:31<58:03, 6.41s/it] 94%|█████████▍| 8207/8750 [1:58:29<58:03, 6.41s/it] {'loss': 0.4476, 'learning_rate': 2.0132529993030392e-07, 'epoch': 0.94} + 94%|█████████▍| 8207/8750 [1:58:31<58:03, 6.41s/it] {'loss': 0.4476, 'learning_rate': 2.0132529993030392e-07, 'epoch': 0.94} + 94%|█████████▍| 8207/8750 [1:58:29<58:03, 6.41s/it] 94%|█████████▍| 8208/8750 [1:58:35<55:59, 6.20s/it] 94%|█████████▍| 8208/8750 [1:58:37<55:59, 6.20s/it] {'loss': 0.4422, 'learning_rate': 2.0058693983714628e-07, 'epoch': 0.94} + {'loss': 0.4422, 'learning_rate': 2.0058693983714628e-07, 'epoch': 0.94} + 94%|█████████▍| 8208/8750 [1:58:37<55:59, 6.20s/it] 94%|█████████▍| 8208/8750 [1:58:35<55:59, 6.20s/it] 94%|█████████▍| 8209/8750 [1:58:42<54:50, 6.08s/it] 94%|█████████▍| 8209/8750 [1:58:40<54:50, 6.08s/it] {'loss': 0.4532, 'learning_rate': 1.9984992248236135e-07, 'epoch': 0.94} + 94%|█████████▍| 8209/8750 [1:58:42<54:50, 6.08s/it] {'loss': 0.4532, 'learning_rate': 1.9984992248236135e-07, 'epoch': 0.94} + 94%|█████████▍| 8209/8750 [1:58:40<54:50, 6.08s/it] 94%|█████████▍| 8210/8750 [1:58:48<53:46, 5.97s/it] 94%|█████████▍| 8210/8750 [1:58:46<53:46, 5.97s/it] {'loss': 0.4243, 'learning_rate': 1.9911424796693611e-07, 'epoch': 0.94} + 94%|█████████▍| 8210/8750 [1:58:48<53:46, 5.97s/it] {'loss': 0.4243, 'learning_rate': 1.9911424796693611e-07, 'epoch': 0.94} + 94%|█████████▍| 8210/8750 [1:58:46<53:46, 5.97s/it] 94%|█████████▍| 8211/8750 [1:58:52<52:40, 5.86s/it] 94%|█████████▍| 8211/8750 [1:58:54<52:40, 5.86s/it] {'loss': 0.4565, 'learning_rate': 1.9837991639167552e-07, 'epoch': 0.94} + {'loss': 0.4565, 'learning_rate': 1.9837991639167552e-07, 'epoch': 0.94} + 94%|█████████▍| 8211/8750 [1:58:54<52:40, 5.86s/it] 94%|█████████▍| 8211/8750 
[1:58:52<52:40, 5.86s/it] 94%|█████████▍| 8212/8750 [1:58:59<52:00, 5.80s/it] 94%|█████████▍| 8212/8750 [1:58:57<52:00, 5.80s/it] {'loss': 0.426, 'learning_rate': 1.9764692785719909e-07, 'epoch': 0.94} + 94%|█████████▍| 8212/8750 [1:58:59<52:00, 5.80s/it] {'loss': 0.426, 'learning_rate': 1.9764692785719909e-07, 'epoch': 0.94} + 94%|█████████▍| 8212/8750 [1:58:57<52:00, 5.80s/it] 94%|█████████▍| 8213/8750 [1:59:03<51:20, 5.74s/it] 94%|█████████▍| 8213/8750 [1:59:05<51:20, 5.74s/it] {'loss': 0.4706, 'learning_rate': 1.9691528246394197e-07, 'epoch': 0.94} + 94%|█████████▍| 8213/8750 [1:59:05<51:20, 5.74s/it] {'loss': 0.4706, 'learning_rate': 1.9691528246394197e-07, 'epoch': 0.94} + 94%|█████████▍| 8213/8750 [1:59:03<51:20, 5.74s/it] 94%|█████████▍| 8214/8750 [1:59:11<51:10, 5.73s/it] 94%|█████████▍| 8214/8750 [1:59:09<51:10, 5.73s/it] {'loss': 0.4329, 'learning_rate': 1.9618498031215738e-07, 'epoch': 0.94} + 94%|█████████▍| 8214/8750 [1:59:11<51:10, 5.73s/it] {'loss': 0.4329, 'learning_rate': 1.9618498031215738e-07, 'epoch': 0.94} + 94%|█████████▍| 8214/8750 [1:59:09<51:10, 5.73s/it] 94%|█████████▍| 8215/8750 [1:59:15<51:24, 5.77s/it] 94%|█████████▍| 8215/8750 [1:59:16<51:24, 5.77s/it] {'loss': 0.4393, 'learning_rate': 1.954560215019108e-07, 'epoch': 0.94} + {'loss': 0.4393, 'learning_rate': 1.954560215019108e-07, 'epoch': 0.94} 94%|█████████▍| 8215/8750 [1:59:16<51:24, 5.77s/it] + 94%|█████████▍| 8215/8750 [1:59:15<51:24, 5.77s/it] 94%|█████████▍| 8216/8750 [1:59:22<51:00, 5.73s/it] 94%|█████████▍| 8216/8750 [1:59:20<51:00, 5.73s/it] {'loss': 0.4472, 'learning_rate': 1.9472840613308787e-07, 'epoch': 0.94} + 94%|█████████▍| 8216/8750 [1:59:22<51:00, 5.73s/it] {'loss': 0.4472, 'learning_rate': 1.9472840613308787e-07, 'epoch': 0.94} + 94%|█████████▍| 8216/8750 [1:59:20<51:00, 5.73s/it] 94%|█████████▍| 8217/8750 [1:59:28<50:42, 5.71s/it] 94%|█████████▍| 8217/8750 [1:59:26<50:42, 5.71s/it] {'loss': 0.4587, 'learning_rate': 1.9400213430538773e-07, 'epoch': 0.94} + 
94%|█████████▍| 8217/8750 [1:59:28<50:42, 5.71s/it] {'loss': 0.4587, 'learning_rate': 1.9400213430538773e-07, 'epoch': 0.94} + 94%|█████████▍| 8217/8750 [1:59:26<50:42, 5.71s/it] 94%|█████████▍| 8218/8750 [1:59:33<50:15, 5.67s/it] 94%|█████████▍| 8218/8750 [1:59:31<50:16, 5.67s/it] {'loss': 0.4529, 'learning_rate': 1.9327720611832523e-07, 'epoch': 0.94} + 94%|█████████▍| 8218/8750 [1:59:33<50:15, 5.67s/it] {'loss': 0.4529, 'learning_rate': 1.9327720611832523e-07, 'epoch': 0.94} + 94%|█████████▍| 8218/8750 [1:59:31<50:16, 5.67s/it] 94%|█████████▍| 8219/8750 [1:59:39<50:04, 5.66s/it] 94%|█████████▍| 8219/8750 [1:59:37<50:04, 5.66s/it] {'loss': 0.4603, 'learning_rate': 1.9255362167123316e-07, 'epoch': 0.94} + 94%|█████████▍| 8219/8750 [1:59:39<50:04, 5.66s/it] {'loss': 0.4603, 'learning_rate': 1.9255362167123316e-07, 'epoch': 0.94} + 94%|█████████▍| 8219/8750 [1:59:37<50:04, 5.66s/it] 94%|█████████▍| 8220/8750 [1:59:43<50:12, 5.68s/it] 94%|█████████▍| 8220/8750 [1:59:45<50:12, 5.68s/it] {'loss': 0.4376, 'learning_rate': 1.918313810632566e-07, 'epoch': 0.94} + {'loss': 0.4376, 'learning_rate': 1.918313810632566e-07, 'epoch': 0.94} + 94%|█████████▍| 8220/8750 [1:59:45<50:12, 5.68s/it] 94%|█████████▍| 8220/8750 [1:59:43<50:12, 5.68s/it] 94%|█████████▍| 8221/8750 [1:59:51<50:20, 5.71s/it] 94%|█████████▍| 8221/8750 [1:59:49<50:20, 5.71s/it] {'loss': 0.4576, 'learning_rate': 1.9111048439335978e-07, 'epoch': 0.94} + 94%|█████████▍| 8221/8750 [1:59:51<50:20, 5.71s/it] {'loss': 0.4576, 'learning_rate': 1.9111048439335978e-07, 'epoch': 0.94} + 94%|█████████▍| 8221/8750 [1:59:49<50:20, 5.71s/it] 94%|█████████▍| 8222/8750 [1:59:56<50:02, 5.69s/it] 94%|█████████▍| 8222/8750 [1:59:54<50:02, 5.69s/it] {'loss': 0.4322, 'learning_rate': 1.903909317603214e-07, 'epoch': 0.94} + 94%|█████████▍| 8222/8750 [1:59:56<50:02, 5.69s/it] {'loss': 0.4322, 'learning_rate': 1.903909317603214e-07, 'epoch': 0.94} + 94%|█████████▍| 8222/8750 [1:59:54<50:02, 5.69s/it] 94%|█████████▍| 8223/8750 
[2:00:02<50:14, 5.72s/it] 94%|█████████▍| 8223/8750 [2:00:00<50:14, 5.72s/it] {'loss': 0.4532, 'learning_rate': 1.89672723262736e-07, 'epoch': 0.94} + 94%|█████████▍| 8223/8750 [2:00:02<50:14, 5.72s/it] {'loss': 0.4532, 'learning_rate': 1.89672723262736e-07, 'epoch': 0.94} + 94%|█████████▍| 8223/8750 [2:00:00<50:14, 5.72s/it] 94%|█████████▍| 8224/8750 [2:00:08<50:00, 5.70s/it] 94%|█████████▍| 8224/8750 [2:00:06<50:00, 5.70s/it] {'loss': 0.459, 'learning_rate': 1.889558589990148e-07, 'epoch': 0.94} + 94%|█████████▍| 8224/8750 [2:00:08<50:00, 5.70s/it] {'loss': 0.459, 'learning_rate': 1.889558589990148e-07, 'epoch': 0.94} + 94%|█████████▍| 8224/8750 [2:00:06<50:00, 5.70s/it] 94%|█████████▍| 8225/8750 [2:00:13<49:58, 5.71s/it] 94%|█████████▍| 8225/8750 [2:00:11<49:58, 5.71s/it] {'loss': 0.4548, 'learning_rate': 1.882403390673837e-07, 'epoch': 0.94} + 94%|█████████▍| 8225/8750 [2:00:13<49:58, 5.71s/it] {'loss': 0.4548, 'learning_rate': 1.882403390673837e-07, 'epoch': 0.94} + 94%|█████████▍| 8225/8750 [2:00:11<49:58, 5.71s/it] 94%|█████████▍| 8226/8750 [2:00:17<49:52, 5.71s/it] 94%|█████████▍| 8226/8750 [2:00:19<49:52, 5.71s/it]{'loss': 0.4389, 'learning_rate': 1.8752616356588648e-07, 'epoch': 0.94} + {'loss': 0.4389, 'learning_rate': 1.8752616356588648e-07, 'epoch': 0.94} + 94%|█████████▍| 8226/8750 [2:00:19<49:52, 5.71s/it] 94%|█████████▍| 8226/8750 [2:00:17<49:52, 5.71s/it] 94%|█████████▍| 8227/8750 [2:00:25<50:06, 5.75s/it] 94%|█████████▍| 8227/8750 [2:00:23<50:06, 5.75s/it] {'loss': 0.4535, 'learning_rate': 1.8681333259237933e-07, 'epoch': 0.94} + 94%|█████████▍| 8227/8750 [2:00:25<50:06, 5.75s/it] {'loss': 0.4535, 'learning_rate': 1.8681333259237933e-07, 'epoch': 0.94} + 94%|█████████▍| 8227/8750 [2:00:23<50:06, 5.75s/it] 94%|█████████▍| 8228/8750 [2:00:31<49:57, 5.74s/it] 94%|█████████▍| 8228/8750 [2:00:29<49:57, 5.74s/it] {'loss': 0.4543, 'learning_rate': 1.861018462445352e-07, 'epoch': 0.94} + 94%|█████████▍| 8228/8750 [2:00:31<49:57, 5.74s/it] {'loss': 0.4543, 
'learning_rate': 1.861018462445352e-07, 'epoch': 0.94} + 94%|█████████▍| 8228/8750 [2:00:29<49:57, 5.74s/it] 94%|█████████▍| 8229/8750 [2:00:36<49:40, 5.72s/it] 94%|█████████▍| 8229/8750 [2:00:34<49:40, 5.72s/it] {'loss': 0.4711, 'learning_rate': 1.8539170461984612e-07, 'epoch': 0.94} + 94%|█████████▍| 8229/8750 [2:00:36<49:40, 5.72s/it] {'loss': 0.4711, 'learning_rate': 1.8539170461984612e-07, 'epoch': 0.94} + 94%|█████████▍| 8229/8750 [2:00:34<49:40, 5.72s/it] 94%|█████████▍| 8230/8750 [2:00:40<49:28, 5.71s/it] 94%|█████████▍| 8230/8750 [2:00:42<49:28, 5.71s/it] {'loss': 0.4632, 'learning_rate': 1.8468290781561538e-07, 'epoch': 0.94} + 94%|█████████▍| 8230/8750 [2:00:42<49:28, 5.71s/it] {'loss': 0.4632, 'learning_rate': 1.8468290781561538e-07, 'epoch': 0.94} + 94%|█████████▍| 8230/8750 [2:00:40<49:28, 5.71s/it] 94%|█████████▍| 8231/8750 [2:00:48<49:16, 5.70s/it] 94%|█████████▍| 8231/8750 [2:00:46<49:16, 5.70s/it] {'loss': 0.4474, 'learning_rate': 1.8397545592896527e-07, 'epoch': 0.94} + 94%|█████████▍| 8231/8750 [2:00:48<49:16, 5.70s/it] {'loss': 0.4474, 'learning_rate': 1.8397545592896527e-07, 'epoch': 0.94} + 94%|█████████▍| 8231/8750 [2:00:46<49:16, 5.70s/it] 94%|█████████▍| 8232/8750 [2:00:53<49:20, 5.72s/it] 94%|█████████▍| 8232/8750 [2:00:51<49:20, 5.72s/it] {'loss': 0.4422, 'learning_rate': 1.832693490568327e-07, 'epoch': 0.94} + 94%|█████████▍| 8232/8750 [2:00:53<49:20, 5.72s/it] {'loss': 0.4422, 'learning_rate': 1.832693490568327e-07, 'epoch': 0.94} + 94%|█████████▍| 8232/8750 [2:00:51<49:20, 5.72s/it] 94%|█████████▍| 8233/8750 [2:00:59<48:52, 5.67s/it] 94%|█████████▍| 8233/8750 [2:00:57<48:52, 5.67s/it] {'loss': 0.4676, 'learning_rate': 1.8256458729596692e-07, 'epoch': 0.94} + 94%|█████████▍| 8233/8750 [2:00:59<48:52, 5.67s/it] {'loss': 0.4676, 'learning_rate': 1.8256458729596692e-07, 'epoch': 0.94} + 94%|█████████▍| 8233/8750 [2:00:57<48:52, 5.67s/it] 94%|█████████▍| 8234/8750 [2:01:03<49:01, 5.70s/it] 94%|█████████▍| 8234/8750 [2:01:05<49:01, 
5.70s/it] {'loss': 0.4515, 'learning_rate': 1.8186117074293964e-07, 'epoch': 0.94} + 94%|█████████▍| 8234/8750 [2:01:05<49:01, 5.70s/it] {'loss': 0.4515, 'learning_rate': 1.8186117074293964e-07, 'epoch': 0.94} + 94%|█████████▍| 8234/8750 [2:01:03<49:01, 5.70s/it] 94%|█████████▍| 8235/8750 [2:01:10<48:34, 5.66s/it] 94%|█████████▍| 8235/8750 [2:01:08<48:34, 5.66s/it] {'loss': 0.4537, 'learning_rate': 1.811590994941337e-07, 'epoch': 0.94} + 94%|█████████▍| 8235/8750 [2:01:10<48:34, 5.66s/it] {'loss': 0.4537, 'learning_rate': 1.811590994941337e-07, 'epoch': 0.94} + 94%|█████████▍| 8235/8750 [2:01:08<48:34, 5.66s/it] 94%|█████████▍| 8236/8750 [2:01:16<48:41, 5.68s/it] 94%|█████████▍| 8236/8750 [2:01:14<48:41, 5.68s/it] {'loss': 0.437, 'learning_rate': 1.804583736457477e-07, 'epoch': 0.94} + 94%|█████████▍| 8236/8750 [2:01:16<48:41, 5.68s/it] {'loss': 0.437, 'learning_rate': 1.804583736457477e-07, 'epoch': 0.94} + 94%|█████████▍| 8236/8750 [2:01:14<48:41, 5.68s/it] 94%|█████████▍| 8237/8750 [2:01:20<48:46, 5.70s/it] 94%|█████████▍| 8237/8750 [2:01:22<48:46, 5.70s/it] {'loss': 0.4531, 'learning_rate': 1.797589932937982e-07, 'epoch': 0.94} + 94%|█████████▍| 8237/8750 [2:01:22<48:46, 5.70s/it] {'loss': 0.4531, 'learning_rate': 1.797589932937982e-07, 'epoch': 0.94} + 94%|█████████▍| 8237/8750 [2:01:20<48:46, 5.70s/it] 94%|█████████▍| 8238/8750 [2:01:28<48:48, 5.72s/it] 94%|█████████▍| 8238/8750 [2:01:26<48:48, 5.72s/it] {'loss': 0.4455, 'learning_rate': 1.790609585341141e-07, 'epoch': 0.94} + 94%|█████████▍| 8238/8750 [2:01:28<48:48, 5.72s/it] {'loss': 0.4455, 'learning_rate': 1.790609585341141e-07, 'epoch': 0.94} + 94%|█████████▍| 8238/8750 [2:01:26<48:48, 5.72s/it] 94%|█████████▍| 8239/8750 [2:01:33<48:58, 5.75s/it] 94%|█████████▍| 8239/8750 [2:01:31<48:58, 5.75s/it] {'loss': 0.4547, 'learning_rate': 1.7836426946234332e-07, 'epoch': 0.94} + 94%|█████████▍| 8239/8750 [2:01:33<48:58, 5.75s/it] {'loss': 0.4547, 'learning_rate': 1.7836426946234332e-07, 'epoch': 0.94} + 
94%|█████████▍| 8239/8750 [2:01:31<48:58, 5.75s/it] 94%|█████████▍| 8240/8750 [2:01:37<48:54, 5.75s/it] 94%|█████████▍| 8240/8750 [2:01:39<48:54, 5.75s/it] {'loss': 0.4408, 'learning_rate': 1.7766892617394727e-07, 'epoch': 0.94} + {'loss': 0.4408, 'learning_rate': 1.7766892617394727e-07, 'epoch': 0.94} 94%|█████████▍| 8240/8750 [2:01:39<48:54, 5.75s/it] + 94%|█████████▍| 8240/8750 [2:01:37<48:54, 5.75s/it] 94%|█████████▍| 8241/8750 [2:01:43<49:28, 5.83s/it] 94%|█████████▍| 8241/8750 [2:01:45<49:28, 5.83s/it]{'loss': 0.4442, 'learning_rate': 1.7697492876420198e-07, 'epoch': 0.94} + {'loss': 0.4442, 'learning_rate': 1.7697492876420198e-07, 'epoch': 0.94} + 94%|█████████▍| 8241/8750 [2:01:45<49:28, 5.83s/it] 94%|█████████▍| 8241/8750 [2:01:43<49:28, 5.83s/it] 94%|█████████▍| 8242/8750 [2:01:51<48:53, 5.77s/it] 94%|█████████▍| 8242/8750 [2:01:49<48:53, 5.77s/it] {'loss': 0.4326, 'learning_rate': 1.7628227732820247e-07, 'epoch': 0.94} + 94%|█████████▍| 8242/8750 [2:01:51<48:53, 5.77s/it] {'loss': 0.4326, 'learning_rate': 1.7628227732820247e-07, 'epoch': 0.94} + 94%|█████████▍| 8242/8750 [2:01:49<48:53, 5.77s/it] 94%|█████████▍| 8243/8750 [2:01:57<48:46, 5.77s/it] 94%|█████████▍| 8243/8750 [2:01:55<48:46, 5.77s/it] {'loss': 0.4699, 'learning_rate': 1.755909719608573e-07, 'epoch': 0.94} + 94%|█████████▍| 8243/8750 [2:01:57<48:46, 5.77s/it] {'loss': 0.4699, 'learning_rate': 1.755909719608573e-07, 'epoch': 0.94} + 94%|█████████▍| 8243/8750 [2:01:55<48:46, 5.77s/it] 94%|█████████▍| 8244/8750 [2:02:02<48:10, 5.71s/it] 94%|█████████▍| 8244/8750 [2:02:00<48:10, 5.71s/it] {'loss': 0.459, 'learning_rate': 1.7490101275689064e-07, 'epoch': 0.94} + 94%|█████████▍| 8244/8750 [2:02:02<48:10, 5.71s/it] {'loss': 0.459, 'learning_rate': 1.7490101275689064e-07, 'epoch': 0.94} + 94%|█████████▍| 8244/8750 [2:02:00<48:10, 5.71s/it] 94%|█████████▍| 8245/8750 [2:02:06<47:48, 5.68s/it] 94%|█████████▍| 8245/8750 [2:02:08<47:48, 5.68s/it] {'loss': 0.4532, 'learning_rate': 1.7421239981084136e-07, 
'epoch': 0.94} + 94%|█████████▍| 8245/8750 [2:02:08<47:48, 5.68s/it] {'loss': 0.4532, 'learning_rate': 1.7421239981084136e-07, 'epoch': 0.94} + 94%|█████████▍| 8245/8750 [2:02:06<47:48, 5.68s/it] 94%|█████████▍| 8246/8750 [2:02:13<47:57, 5.71s/it] 94%|█████████▍| 8246/8750 [2:02:12<47:57, 5.71s/it] {'loss': 0.4455, 'learning_rate': 1.7352513321706621e-07, 'epoch': 0.94} + 94%|█████████▍| 8246/8750 [2:02:14<47:57, 5.71s/it] {'loss': 0.4455, 'learning_rate': 1.7352513321706621e-07, 'epoch': 0.94} + 94%|█████████▍| 8246/8750 [2:02:12<47:57, 5.71s/it] 94%|█████████▍| 8247/8750 [2:02:19<47:53, 5.71s/it] 94%|█████████▍| 8247/8750 [2:02:17<47:53, 5.71s/it] {'loss': 0.4646, 'learning_rate': 1.7283921306973538e-07, 'epoch': 0.94} + 94%|█████████▍| 8247/8750 [2:02:19<47:53, 5.71s/it] {'loss': 0.4646, 'learning_rate': 1.7283921306973538e-07, 'epoch': 0.94} + 94%|█████████▍| 8247/8750 [2:02:17<47:53, 5.71s/it] 94%|█████████▍| 8248/8750 [2:02:25<47:49, 5.72s/it] 94%|█████████▍| 8248/8750 [2:02:23<47:49, 5.72s/it] {'loss': 0.4467, 'learning_rate': 1.7215463946283483e-07, 'epoch': 0.94} + 94%|█████████▍| 8248/8750 [2:02:25<47:49, 5.72s/it] {'loss': 0.4467, 'learning_rate': 1.7215463946283483e-07, 'epoch': 0.94} + 94%|█████████▍| 8248/8750 [2:02:23<47:49, 5.72s/it] 94%|█████████▍| 8249/8750 [2:02:31<47:57, 5.74s/it] 94%|█████████▍| 8249/8750 [2:02:29<47:57, 5.74s/it] {'loss': 0.4495, 'learning_rate': 1.714714124901662e-07, 'epoch': 0.94} + 94%|█████████▍| 8249/8750 [2:02:31<47:57, 5.74s/it] {'loss': 0.4495, 'learning_rate': 1.714714124901662e-07, 'epoch': 0.94} + 94%|█████████▍| 8249/8750 [2:02:29<47:57, 5.74s/it]14 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +5 12 AutoResumeHook: Checking whether to suspend... +811 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +9 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... 
+1510 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + + 94%|█████████▍| 8250/8750 [2:02:36<47:20, 5.68s/it]AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 94%|█████████▍| 8250/8750 [2:02:34<47:20, 5.68s/it]7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4299, 'learning_rate': 1.70789532245349e-07, 'epoch': 0.94} + 94%|█████████▍| 8250/8750 [2:02:36<47:20, 5.68s/it] {'loss': 0.4299, 'learning_rate': 1.70789532245349e-07, 'epoch': 0.94} + 94%|█████████▍| 8250/8750 [2:02:34<47:20, 5.68s/it] 94%|█████████▍| 8251/8750 [2:02:40<47:08, 5.67s/it] 94%|█████████▍| 8251/8750 [2:02:42<47:08, 5.67s/it] {'loss': 0.456, 'learning_rate': 1.70108998821813e-07, 'epoch': 0.94} + 94%|█████████▍| 8251/8750 [2:02:42<47:08, 5.67s/it] {'loss': 0.456, 'learning_rate': 1.70108998821813e-07, 'epoch': 0.94} + 94%|█████████▍| 8251/8750 [2:02:40<47:08, 5.67s/it] 94%|█████████▍| 8252/8750 [2:02:48<46:53, 5.65s/it] 94%|█████████▍| 8252/8750 [2:02:46<46:53, 5.65s/it] {'loss': 0.4508, 'learning_rate': 1.6942981231280798e-07, 'epoch': 0.94} + 94%|█████████▍| 8252/8750 [2:02:48<46:53, 5.65s/it] {'loss': 0.4508, 'learning_rate': 1.6942981231280798e-07, 'epoch': 0.94} + 94%|█████████▍| 8252/8750 [2:02:46<46:53, 5.65s/it] 94%|█████████▍| 8253/8750 [2:02:51<46:57, 5.67s/it] 94%|█████████▍| 8253/8750 [2:02:53<46:57, 5.67s/it] {'loss': 0.4679, 'learning_rate': 1.6875197281139844e-07, 'epoch': 0.94} + {'loss': 0.4679, 'learning_rate': 1.6875197281139844e-07, 'epoch': 0.94} 94%|█████████▍| 8253/8750 [2:02:53<46:57, 5.67s/it] + 94%|█████████▍| 8253/8750 [2:02:51<46:57, 5.67s/it] 94%|█████████▍| 8254/8750 [2:02:59<47:04, 5.69s/it] 94%|█████████▍| 8254/8750 [2:02:57<47:04, 5.69s/it] {'loss': 0.4539, 
'learning_rate': 1.680754804104623e-07, 'epoch': 0.94} + 94%|█████████▍| 8254/8750 [2:02:59<47:04, 5.69s/it] {'loss': 0.4539, 'learning_rate': 1.680754804104623e-07, 'epoch': 0.94} + 94%|█████████▍| 8254/8750 [2:02:57<47:04, 5.69s/it] 94%|█████████▍| 8255/8750 [2:03:03<46:33, 5.64s/it] 94%|█████████▍| 8255/8750 [2:03:05<46:33, 5.64s/it] {'loss': 0.458, 'learning_rate': 1.6740033520269538e-07, 'epoch': 0.94} + 94%|█████████▍| 8255/8750 [2:03:05<46:33, 5.64s/it] {'loss': 0.458, 'learning_rate': 1.6740033520269538e-07, 'epoch': 0.94} + 94%|█████████▍| 8255/8750 [2:03:03<46:33, 5.64s/it] 94%|█████████▍| 8256/8750 [2:03:08<46:14, 5.62s/it] 94%|█████████▍| 8256/8750 [2:03:10<46:14, 5.62s/it] {'loss': 0.4426, 'learning_rate': 1.6672653728060594e-07, 'epoch': 0.94} + 94%|█████████▍| 8256/8750 [2:03:10<46:14, 5.62s/it] {'loss': 0.4426, 'learning_rate': 1.6672653728060594e-07, 'epoch': 0.94} + 94%|█████████▍| 8256/8750 [2:03:08<46:14, 5.62s/it] 94%|█████████▍| 8257/8750 [2:03:16<46:32, 5.66s/it] 94%|█████████▍| 8257/8750 [2:03:14<46:32, 5.66s/it] {'loss': 0.4507, 'learning_rate': 1.6605408673652012e-07, 'epoch': 0.94} + 94%|█████████▍| 8257/8750 [2:03:16<46:32, 5.66s/it] {'loss': 0.4507, 'learning_rate': 1.6605408673652012e-07, 'epoch': 0.94} + 94%|█████████▍| 8257/8750 [2:03:14<46:32, 5.66s/it] 94%|█████████▍| 8258/8750 [2:03:20<47:51, 5.84s/it] 94%|█████████▍| 8258/8750 [2:03:22<47:50, 5.84s/it] {'loss': 0.4408, 'learning_rate': 1.6538298366257975e-07, 'epoch': 0.94} + {'loss': 0.4408, 'learning_rate': 1.6538298366257975e-07, 'epoch': 0.94} 94%|█████████▍| 8258/8750 [2:03:22<47:50, 5.84s/it] + 94%|█████████▍| 8258/8750 [2:03:20<47:51, 5.84s/it] 94%|█████████▍| 8259/8750 [2:03:28<46:54, 5.73s/it] 94%|█████████▍| 8259/8750 [2:03:26<46:54, 5.73s/it] {'loss': 0.4554, 'learning_rate': 1.647132281507391e-07, 'epoch': 0.94} + 94%|█████████▍| 8259/8750 [2:03:28<46:54, 5.73s/it] {'loss': 0.4554, 'learning_rate': 1.647132281507391e-07, 'epoch': 0.94} + 94%|█████████▍| 8259/8750 
[2:03:26<46:54, 5.73s/it] 94%|█████████▍| 8260/8750 [2:03:33<46:26, 5.69s/it] 94%|█████████▍| 8260/8750 [2:03:31<46:26, 5.69s/it] {'loss': 0.4514, 'learning_rate': 1.6404482029277023e-07, 'epoch': 0.94} + 94%|█████████▍| 8260/8750 [2:03:33<46:26, 5.69s/it] {'loss': 0.4514, 'learning_rate': 1.6404482029277023e-07, 'epoch': 0.94} + 94%|█████████▍| 8260/8750 [2:03:31<46:26, 5.69s/it] 94%|█████████▍| 8261/8750 [2:03:39<45:59, 5.64s/it] 94%|█████████▍| 8261/8750 [2:03:37<45:59, 5.64s/it] {'loss': 0.449, 'learning_rate': 1.6337776018026108e-07, 'epoch': 0.94} + 94%|█████████▍| 8261/8750 [2:03:39<45:59, 5.64s/it] {'loss': 0.449, 'learning_rate': 1.6337776018026108e-07, 'epoch': 0.94} + 94%|█████████▍| 8261/8750 [2:03:37<45:59, 5.64s/it] 94%|█████████▍| 8262/8750 [2:03:45<46:21, 5.70s/it] 94%|█████████▍| 8262/8750 [2:03:43<46:21, 5.70s/it] {'loss': 0.4437, 'learning_rate': 1.627120479046118e-07, 'epoch': 0.94} + 94%|█████████▍| 8262/8750 [2:03:45<46:21, 5.70s/it] {'loss': 0.4437, 'learning_rate': 1.627120479046118e-07, 'epoch': 0.94} + 94%|█████████▍| 8262/8750 [2:03:43<46:21, 5.70s/it] 94%|█████████▍| 8263/8750 [2:03:48<46:02, 5.67s/it] 94%|█████████▍| 8263/8750 [2:03:50<46:02, 5.67s/it] {'loss': 0.4701, 'learning_rate': 1.620476835570417e-07, 'epoch': 0.94} + {'loss': 0.4701, 'learning_rate': 1.620476835570417e-07, 'epoch': 0.94} 94%|█████████▍| 8263/8750 [2:03:50<46:02, 5.67s/it] + 94%|█████████▍| 8263/8750 [2:03:48<46:02, 5.67s/it] 94%|█████████▍| 8264/8750 [2:03:54<46:10, 5.70s/it] 94%|█████████▍| 8264/8750 [2:03:56<46:10, 5.70s/it] {'loss': 0.4428, 'learning_rate': 1.6138466722858237e-07, 'epoch': 0.94} + 94%|█████████▍| 8264/8750 [2:03:56<46:10, 5.70s/it] {'loss': 0.4428, 'learning_rate': 1.6138466722858237e-07, 'epoch': 0.94} + 94%|█████████▍| 8264/8750 [2:03:54<46:10, 5.70s/it] 94%|█████████▍| 8265/8750 [2:04:01<45:40, 5.65s/it] 94%|█████████▍| 8265/8750 [2:04:00<45:40, 5.65s/it] {'loss': 0.4623, 'learning_rate': 1.6072299901008226e-07, 'epoch': 0.94} + 
94%|█████████▍| 8265/8750 [2:04:01<45:40, 5.65s/it] {'loss': 0.4623, 'learning_rate': 1.6072299901008226e-07, 'epoch': 0.94} + 94%|█████████▍| 8265/8750 [2:04:00<45:40, 5.65s/it] 94%|█████████▍| 8266/8750 [2:04:07<45:53, 5.69s/it] 94%|█████████▍| 8266/8750 [2:04:05<45:53, 5.69s/it] {'loss': 0.4378, 'learning_rate': 1.6006267899220552e-07, 'epoch': 0.94} + 94%|█████████▍| 8266/8750 [2:04:07<45:53, 5.69s/it] {'loss': 0.4378, 'learning_rate': 1.6006267899220552e-07, 'epoch': 0.94} + 94%|█████████▍| 8266/8750 [2:04:05<45:53, 5.69s/it] 94%|█████████▍| 8267/8750 [2:04:13<45:53, 5.70s/it] 94%|█████████▍| 8267/8750 [2:04:11<45:53, 5.70s/it] {'loss': 0.4599, 'learning_rate': 1.5940370726542864e-07, 'epoch': 0.94} + 94%|█████████▍| 8267/8750 [2:04:13<45:53, 5.70s/it] {'loss': 0.4599, 'learning_rate': 1.5940370726542864e-07, 'epoch': 0.94} + 94%|█████████▍| 8267/8750 [2:04:11<45:53, 5.70s/it] 94%|█████████▍| 8268/8750 [2:04:19<45:48, 5.70s/it] 94%|█████████▍| 8268/8750 [2:04:17<45:48, 5.70s/it] {'loss': 0.4498, 'learning_rate': 1.587460839200472e-07, 'epoch': 0.94} + 94%|█████████▍| 8268/8750 [2:04:19<45:48, 5.70s/it] {'loss': 0.4498, 'learning_rate': 1.587460839200472e-07, 'epoch': 0.94} + 94%|█████████▍| 8268/8750 [2:04:17<45:48, 5.70s/it] 95%|█████████▍| 8269/8750 [2:04:24<45:28, 5.67s/it] 95%|█████████▍| 8269/8750 [2:04:22<45:28, 5.67s/it] {'loss': 0.4708, 'learning_rate': 1.580898090461691e-07, 'epoch': 0.95} + 95%|█████████▍| 8269/8750 [2:04:24<45:28, 5.67s/it] {'loss': 0.4708, 'learning_rate': 1.580898090461691e-07, 'epoch': 0.95} + 95%|█████████▍| 8269/8750 [2:04:22<45:28, 5.67s/it] 95%|█████████▍| 8270/8750 [2:04:31<46:45, 5.84s/it] 95%|█████████▍| 8270/8750 [2:04:29<46:45, 5.84s/it] {'loss': 0.4397, 'learning_rate': 1.5743488273372133e-07, 'epoch': 0.95} + 95%|█████████▍| 8270/8750 [2:04:31<46:45, 5.84s/it] {'loss': 0.4397, 'learning_rate': 1.5743488273372133e-07, 'epoch': 0.95} + 95%|█████████▍| 8270/8750 [2:04:29<46:45, 5.84s/it] 95%|█████████▍| 8271/8750 
[2:04:37<47:05, 5.90s/it] 95%|█████████▍| 8271/8750 [2:04:35<47:05, 5.90s/it] {'loss': 0.4513, 'learning_rate': 1.567813050724387e-07, 'epoch': 0.95} + 95%|█████████▍| 8271/8750 [2:04:37<47:05, 5.90s/it] {'loss': 0.4513, 'learning_rate': 1.567813050724387e-07, 'epoch': 0.95} + 95%|█████████▍| 8271/8750 [2:04:35<47:05, 5.90s/it] 95%|█████████▍| 8272/8750 [2:04:42<46:21, 5.82s/it] 95%|█████████▍| 8272/8750 [2:04:40<46:21, 5.82s/it] {'loss': 0.4539, 'learning_rate': 1.5612907615187967e-07, 'epoch': 0.95} + 95%|█████████▍| 8272/8750 [2:04:42<46:21, 5.82s/it] {'loss': 0.4539, 'learning_rate': 1.5612907615187967e-07, 'epoch': 0.95} + 95%|█████████▍| 8272/8750 [2:04:40<46:21, 5.82s/it] 95%|█████████▍| 8273/8750 [2:04:48<46:37, 5.87s/it] 95%|█████████▍| 8273/8750 [2:04:46<46:37, 5.86s/it] {'loss': 0.467, 'learning_rate': 1.554781960614138e-07, 'epoch': 0.95} + 95%|█████████▍| 8273/8750 [2:04:48<46:37, 5.87s/it] {'loss': 0.467, 'learning_rate': 1.554781960614138e-07, 'epoch': 0.95} + 95%|█████████▍| 8273/8750 [2:04:46<46:37, 5.86s/it] 95%|█████████▍| 8274/8750 [2:04:54<46:47, 5.90s/it] 95%|█████████▍| 8274/8750 [2:04:52<46:47, 5.90s/it] {'loss': 0.4331, 'learning_rate': 1.548286648902253e-07, 'epoch': 0.95} + 95%|█████████▍| 8274/8750 [2:04:54<46:47, 5.90s/it] {'loss': 0.4331, 'learning_rate': 1.548286648902253e-07, 'epoch': 0.95} + 95%|█████████▍| 8274/8750 [2:04:52<46:47, 5.90s/it] 95%|█████████▍| 8275/8750 [2:05:00<46:03, 5.82s/it] 95%|█████████▍| 8275/8750 [2:04:58<46:03, 5.82s/it] {'loss': 0.438, 'learning_rate': 1.5418048272731413e-07, 'epoch': 0.95} + 95%|█████████▍| 8275/8750 [2:05:00<46:03, 5.82s/it] {'loss': 0.438, 'learning_rate': 1.5418048272731413e-07, 'epoch': 0.95} + 95%|█████████▍| 8275/8750 [2:04:58<46:03, 5.82s/it] 95%|█████████▍| 8276/8750 [2:05:05<45:23, 5.75s/it] 95%|█████████▍| 8276/8750 [2:05:03<45:23, 5.75s/it] {'loss': 0.4538, 'learning_rate': 1.5353364966149697e-07, 'epoch': 0.95} + 95%|█████████▍| 8276/8750 [2:05:05<45:23, 5.75s/it] {'loss': 
0.4538, 'learning_rate': 1.5353364966149697e-07, 'epoch': 0.95} + 95%|█████████▍| 8276/8750 [2:05:03<45:23, 5.75s/it] 95%|█████████▍| 8277/8750 [2:05:09<44:58, 5.71s/it] 95%|█████████▍| 8277/8750 [2:05:11<44:58, 5.71s/it] {'loss': 0.4652, 'learning_rate': 1.5288816578140298e-07, 'epoch': 0.95} + 95%|█████████▍| 8277/8750 [2:05:11<44:58, 5.71s/it] {'loss': 0.4652, 'learning_rate': 1.5288816578140298e-07, 'epoch': 0.95} + 95%|█████████▍| 8277/8750 [2:05:09<44:58, 5.71s/it] 95%|█████████▍| 8278/8750 [2:05:15<45:09, 5.74s/it] 95%|█████████▍| 8278/8750 [2:05:17<45:09, 5.74s/it] {'loss': 0.4538, 'learning_rate': 1.5224403117547916e-07, 'epoch': 0.95} + 95%|█████████▍| 8278/8750 [2:05:17<45:09, 5.74s/it] {'loss': 0.4538, 'learning_rate': 1.5224403117547916e-07, 'epoch': 0.95} + 95%|█████████▍| 8278/8750 [2:05:15<45:09, 5.74s/it] 95%|█████████▍| 8279/8750 [2:05:22<44:58, 5.73s/it] 95%|█████████▍| 8279/8750 [2:05:21<44:58, 5.73s/it] {'loss': 0.4449, 'learning_rate': 1.51601245931986e-07, 'epoch': 0.95} + 95%|█████████▍| 8279/8750 [2:05:22<44:58, 5.73s/it] {'loss': 0.4449, 'learning_rate': 1.51601245931986e-07, 'epoch': 0.95} + 95%|█████████▍| 8279/8750 [2:05:21<44:58, 5.73s/it] 95%|█████████▍| 8280/8750 [2:05:28<45:06, 5.76s/it] 95%|█████████▍| 8280/8750 [2:05:26<45:06, 5.76s/it] {'loss': 0.435, 'learning_rate': 1.5095981013899863e-07, 'epoch': 0.95} + {'loss': 0.435, 'learning_rate': 1.5095981013899863e-07, 'epoch': 0.95} + 95%|█████████▍| 8280/8750 [2:05:28<45:06, 5.76s/it] 95%|█████████▍| 8280/8750 [2:05:26<45:06, 5.76s/it] 95%|█████████▍| 8281/8750 [2:05:34<44:44, 5.72s/it] 95%|█████████▍| 8281/8750 [2:05:32<44:44, 5.72s/it] {'loss': 0.4511, 'learning_rate': 1.5031972388440787e-07, 'epoch': 0.95} + 95%|█████████▍| 8281/8750 [2:05:34<44:44, 5.72s/it] {'loss': 0.4511, 'learning_rate': 1.5031972388440787e-07, 'epoch': 0.95} + 95%|█████████▍| 8281/8750 [2:05:32<44:44, 5.72s/it] 95%|█████████▍| 8282/8750 [2:05:40<45:18, 5.81s/it] 95%|█████████▍| 8282/8750 [2:05:38<45:18, 
5.81s/it] {'loss': 0.445, 'learning_rate': 1.4968098725592127e-07, 'epoch': 0.95} + 95%|█████████▍| 8282/8750 [2:05:40<45:18, 5.81s/it] {'loss': 0.445, 'learning_rate': 1.4968098725592127e-07, 'epoch': 0.95} + 95%|█████████▍| 8282/8750 [2:05:38<45:18, 5.81s/it] 95%|█████████▍| 8283/8750 [2:05:45<44:37, 5.73s/it] 95%|█████████▍| 8283/8750 [2:05:44<44:37, 5.73s/it] {'loss': 0.4407, 'learning_rate': 1.4904360034106e-07, 'epoch': 0.95} + 95%|█████████▍| 8283/8750 [2:05:45<44:37, 5.73s/it] {'loss': 0.4407, 'learning_rate': 1.4904360034106e-07, 'epoch': 0.95} + 95%|█████████▍| 8283/8750 [2:05:44<44:37, 5.73s/it] 95%|█████████▍| 8284/8750 [2:05:51<44:08, 5.68s/it] 95%|█████████▍| 8284/8750 [2:05:49<44:08, 5.68s/it] {'loss': 0.4414, 'learning_rate': 1.4840756322715866e-07, 'epoch': 0.95} + 95%|█████████▍| 8284/8750 [2:05:51<44:08, 5.68s/it] {'loss': 0.4414, 'learning_rate': 1.4840756322715866e-07, 'epoch': 0.95} + 95%|█████████▍| 8284/8750 [2:05:49<44:08, 5.68s/it] 95%|█████████▍| 8285/8750 [2:05:57<43:43, 5.64s/it] 95%|█████████▍| 8285/8750 [2:05:55<43:43, 5.64s/it] {'loss': 0.4752, 'learning_rate': 1.477728760013697e-07, 'epoch': 0.95} + 95%|█████████▍| 8285/8750 [2:05:57<43:43, 5.64s/it] {'loss': 0.4752, 'learning_rate': 1.477728760013697e-07, 'epoch': 0.95} + 95%|█████████▍| 8285/8750 [2:05:55<43:43, 5.64s/it] 95%|█████████▍| 8286/8750 [2:06:02<43:42, 5.65s/it] 95%|█████████▍| 8286/8750 [2:06:00<43:42, 5.65s/it] {'loss': 0.4692, 'learning_rate': 1.4713953875065912e-07, 'epoch': 0.95} + 95%|█████████▍| 8286/8750 [2:06:02<43:42, 5.65s/it] {'loss': 0.4692, 'learning_rate': 1.4713953875065912e-07, 'epoch': 0.95} + 95%|█████████▍| 8286/8750 [2:06:00<43:42, 5.65s/it] 95%|█████████▍| 8287/8750 [2:06:08<44:01, 5.70s/it] 95%|█████████▍| 8287/8750 [2:06:06<44:01, 5.70s/it] {'loss': 0.4513, 'learning_rate': 1.4650755156180973e-07, 'epoch': 0.95} + 95%|█████████▍| 8287/8750 [2:06:08<44:01, 5.70s/it] {'loss': 0.4513, 'learning_rate': 1.4650755156180973e-07, 'epoch': 0.95} + 
95%|█████████▍| 8287/8750 [2:06:06<44:01, 5.70s/it] 95%|█████████▍| 8288/8750 [2:06:14<44:10, 5.74s/it] 95%|█████████▍| 8288/8750 [2:06:12<44:10, 5.74s/it] {'loss': 0.4425, 'learning_rate': 1.458769145214145e-07, 'epoch': 0.95} + 95%|█████████▍| 8288/8750 [2:06:14<44:10, 5.74s/it] {'loss': 0.4425, 'learning_rate': 1.458769145214145e-07, 'epoch': 0.95} + 95%|█████████▍| 8288/8750 [2:06:12<44:10, 5.74s/it] 95%|█████████▍| 8289/8750 [2:06:19<43:39, 5.68s/it] 95%|█████████▍| 8289/8750 [2:06:18<43:39, 5.68s/it] {'loss': 0.4558, 'learning_rate': 1.4524762771588763e-07, 'epoch': 0.95} + 95%|█████████▍| 8289/8750 [2:06:19<43:39, 5.68s/it] {'loss': 0.4558, 'learning_rate': 1.4524762771588763e-07, 'epoch': 0.95} + 95%|█████████▍| 8289/8750 [2:06:18<43:39, 5.68s/it] 95%|█████████▍| 8290/8750 [2:06:25<43:48, 5.72s/it] 95%|█████████▍| 8290/8750 [2:06:23<43:48, 5.71s/it] {'loss': 0.4535, 'learning_rate': 1.4461969123145458e-07, 'epoch': 0.95} + 95%|█████████▍| 8290/8750 [2:06:25<43:48, 5.72s/it] {'loss': 0.4535, 'learning_rate': 1.4461969123145458e-07, 'epoch': 0.95} + 95%|█████████▍| 8290/8750 [2:06:23<43:48, 5.71s/it] 95%|█████████▍| 8291/8750 [2:06:31<44:00, 5.75s/it] 95%|█████████▍| 8291/8750 [2:06:29<43:59, 5.75s/it] {'loss': 0.4321, 'learning_rate': 1.4399310515415655e-07, 'epoch': 0.95} + 95%|█████████▍| 8291/8750 [2:06:31<44:00, 5.75s/it] {'loss': 0.4321, 'learning_rate': 1.4399310515415655e-07, 'epoch': 0.95} + 95%|█████████▍| 8291/8750 [2:06:29<43:59, 5.75s/it] 95%|█████████▍| 8292/8750 [2:06:37<44:02, 5.77s/it] 95%|█████████▍| 8292/8750 [2:06:35<44:02, 5.77s/it] {'loss': 0.4441, 'learning_rate': 1.4336786956985038e-07, 'epoch': 0.95} + 95%|█████████▍| 8292/8750 [2:06:37<44:02, 5.77s/it] {'loss': 0.4441, 'learning_rate': 1.4336786956985038e-07, 'epoch': 0.95} + 95%|█████████▍| 8292/8750 [2:06:35<44:02, 5.77s/it] 95%|█████████▍| 8293/8750 [2:06:41<44:40, 5.87s/it] 95%|█████████▍| 8293/8750 [2:06:43<44:40, 5.87s/it]{'loss': 0.4445, 'learning_rate': 
1.4274398456420647e-07, 'epoch': 0.95} {'loss': 0.4445, 'learning_rate': 1.4274398456420647e-07, 'epoch': 0.95} + + 95%|█████████▍| 8293/8750 [2:06:43<44:40, 5.87s/it] 95%|█████████▍| 8293/8750 [2:06:41<44:40, 5.87s/it] 95%|█████████▍| 8294/8750 [2:06:47<44:14, 5.82s/it] 95%|█████████▍| 8294/8750 [2:06:49<44:14, 5.82s/it] {'loss': 0.4637, 'learning_rate': 1.4212145022271196e-07, 'epoch': 0.95} + 95%|█████████▍| 8294/8750 [2:06:49<44:14, 5.82s/it] {'loss': 0.4637, 'learning_rate': 1.4212145022271196e-07, 'epoch': 0.95} + 95%|█████████▍| 8294/8750 [2:06:47<44:14, 5.82s/it] 95%|█████████▍| 8295/8750 [2:06:52<43:41, 5.76s/it] 95%|█████████▍| 8295/8750 [2:06:54<43:41, 5.76s/it] {'loss': 0.4635, 'learning_rate': 1.415002666306664e-07, 'epoch': 0.95} + 95%|█████████▍| 8295/8750 [2:06:54<43:41, 5.76s/it] {'loss': 0.4635, 'learning_rate': 1.415002666306664e-07, 'epoch': 0.95} + 95%|█████████▍| 8295/8750 [2:06:52<43:41, 5.76s/it] 95%|█████████▍| 8296/8750 [2:06:58<43:24, 5.74s/it] 95%|█████████▍| 8296/8750 [2:07:00<43:24, 5.74s/it] {'loss': 0.4472, 'learning_rate': 1.4088043387318838e-07, 'epoch': 0.95} + 95%|█████████▍| 8296/8750 [2:07:00<43:24, 5.74s/it] {'loss': 0.4472, 'learning_rate': 1.4088043387318838e-07, 'epoch': 0.95} + 95%|█████████▍| 8296/8750 [2:06:58<43:24, 5.74s/it] 95%|█████████▍| 8297/8750 [2:07:04<42:54, 5.68s/it] 95%|█████████▍| 8297/8750 [2:07:06<42:54, 5.68s/it] {'loss': 0.4377, 'learning_rate': 1.4026195203520666e-07, 'epoch': 0.95} + 95%|█████████▍| 8297/8750 [2:07:06<42:54, 5.68s/it] {'loss': 0.4377, 'learning_rate': 1.4026195203520666e-07, 'epoch': 0.95} + 95%|█████████▍| 8297/8750 [2:07:04<42:54, 5.68s/it] 95%|█████████▍| 8298/8750 [2:07:09<42:57, 5.70s/it] 95%|█████████▍| 8298/8750 [2:07:11<42:56, 5.70s/it] {'loss': 0.4463, 'learning_rate': 1.3964482120146672e-07, 'epoch': 0.95} + 95%|█████████▍| 8298/8750 [2:07:11<42:56, 5.70s/it] {'loss': 0.4463, 'learning_rate': 1.3964482120146672e-07, 'epoch': 0.95} + 95%|█████████▍| 8298/8750 [2:07:09<42:57, 
5.70s/it] 95%|█████████▍| 8299/8750 [2:07:17<42:58, 5.72s/it] 95%|█████████▍| 8299/8750 [2:07:15<42:58, 5.72s/it] {'loss': 0.4532, 'learning_rate': 1.3902904145653094e-07, 'epoch': 0.95} + 95%|█████████▍| 8299/8750 [2:07:17<42:58, 5.72s/it] {'loss': 0.4532, 'learning_rate': 1.3902904145653094e-07, 'epoch': 0.95} + 95%|█████████▍| 8299/8750 [2:07:15<42:58, 5.72s/it]8 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 95%|█████████▍| 8300/8750 [2:07:23<42:51, 5.72s/it]9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +06 AutoResumeHook: Checking whether to suspend... +1512 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...3 AutoResumeHook: Checking whether to suspend... + +10 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... 
+ 95%|█████████▍| 8300/8750 [2:07:21<42:51, 5.72s/it] {'loss': 0.4427, 'learning_rate': 1.384146128847741e-07, 'epoch': 0.95} + 95%|█████████▍| 8300/8750 [2:07:23<42:51, 5.72s/it] {'loss': 0.4427, 'learning_rate': 1.384146128847741e-07, 'epoch': 0.95} + 95%|█████████▍| 8300/8750 [2:07:21<42:51, 5.72s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8300/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8300/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8300/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 95%|█████████▍| 8301/8750 [2:07:43<1:16:13, 10.19s/it] 95%|█████████▍| 8301/8750 [2:07:41<1:16:13, 10.19s/it] {'loss': 0.4533, 'learning_rate': 1.3780153557038655e-07, 'epoch': 0.95} + 95%|█████████▍| 8301/8750 [2:07:43<1:16:13, 10.19s/it] {'loss': 0.4533, 'learning_rate': 1.3780153557038655e-07, 'epoch': 0.95} + 95%|█████████▍| 8301/8750 [2:07:41<1:16:13, 10.19s/it] 95%|█████████▍| 8302/8750 [2:07:49<1:06:19, 8.88s/it] 95%|█████████▍| 8302/8750 [2:07:47<1:06:19, 8.88s/it] {'loss': 0.4675, 'learning_rate': 1.3718980959737448e-07, 'epoch': 0.95} + 95%|█████████▍| 8302/8750 [2:07:49<1:06:19, 8.88s/it] {'loss': 0.4675, 'learning_rate': 1.3718980959737448e-07, 'epoch': 0.95} + 95%|█████████▍| 8302/8750 [2:07:47<1:06:19, 8.88s/it] 95%|█████████▍| 8303/8750 [2:07:55<59:37, 8.00s/it] 95%|█████████▍| 8303/8750 [2:07:53<59:37, 8.00s/it] {'loss': 0.4572, 'learning_rate': 1.365794350495564e-07, 'epoch': 0.95} + 95%|█████████▍| 8303/8750 [2:07:55<59:37, 8.00s/it] {'loss': 0.4572, 'learning_rate': 1.365794350495564e-07, 'epoch': 0.95} + 95%|█████████▍| 8303/8750 [2:07:53<59:37, 8.00s/it] 95%|█████████▍| 8304/8750 [2:08:01<54:15, 7.30s/it] 95%|█████████▍| 8304/8750 [2:07:59<54:15, 7.30s/it] {'loss': 0.4524, 'learning_rate': 1.359704120105687e-07, 'epoch': 0.95} + 95%|█████████▍| 8304/8750 [2:08:01<54:15, 7.30s/it] {'loss': 0.4524, 'learning_rate': 1.359704120105687e-07, 'epoch': 0.95} + 95%|█████████▍| 8304/8750 [2:07:59<54:15, 7.30s/it] 95%|█████████▍| 8305/8750 [2:08:05<50:28, 6.81s/it] 95%|█████████▍| 8305/8750 [2:08:07<50:28, 6.81s/it] {'loss': 0.4236, 'learning_rate': 1.3536274056386134e-07, 'epoch': 0.95} + 95%|█████████▍| 8305/8750 [2:08:07<50:28, 6.81s/it] {'loss': 0.4236, 'learning_rate': 1.3536274056386134e-07, 'epoch': 0.95} + 95%|█████████▍| 8305/8750 [2:08:05<50:28, 6.81s/it] 95%|█████████▍| 8306/8750 [2:08:11<48:23, 6.54s/it] 95%|█████████▍| 8306/8750 [2:08:12<48:23, 6.54s/it] {'loss': 0.4386, 'learning_rate': 1.3475642079269659e-07, 'epoch': 
0.95} + {'loss': 0.4386, 'learning_rate': 1.3475642079269659e-07, 'epoch': 0.95} 95%|█████████▍| 8306/8750 [2:08:12<48:23, 6.54s/it] + 95%|█████████▍| 8306/8750 [2:08:11<48:23, 6.54s/it] 95%|█████████▍| 8307/8750 [2:08:18<46:30, 6.30s/it] 95%|█████████▍| 8307/8750 [2:08:16<46:30, 6.30s/it] {'loss': 0.4413, 'learning_rate': 1.3415145278015575e-07, 'epoch': 0.95} + 95%|█████████▍| 8307/8750 [2:08:18<46:30, 6.30s/it] {'loss': 0.4413, 'learning_rate': 1.3415145278015575e-07, 'epoch': 0.95} + 95%|█████████▍| 8307/8750 [2:08:16<46:30, 6.30s/it] 95%|█████████▍| 8308/8750 [2:08:24<45:03, 6.12s/it] 95%|█████████▍| 8308/8750 [2:08:22<45:03, 6.12s/it] {'loss': 0.4468, 'learning_rate': 1.335478366091325e-07, 'epoch': 0.95} + {'loss': 0.4468, 'learning_rate': 1.335478366091325e-07, 'epoch': 0.95} + 95%|█████████▍| 8308/8750 [2:08:24<45:03, 6.12s/it] 95%|█████████▍| 8308/8750 [2:08:22<45:03, 6.12s/it] 95%|█████████▍| 8309/8750 [2:08:29<43:48, 5.96s/it] 95%|█████████▍| 8309/8750 [2:08:28<43:49, 5.96s/it] {'loss': 0.4596, 'learning_rate': 1.329455723623352e-07, 'epoch': 0.95} + 95%|█████████▍| 8309/8750 [2:08:29<43:48, 5.96s/it] {'loss': 0.4596, 'learning_rate': 1.329455723623352e-07, 'epoch': 0.95} + 95%|█████████▍| 8309/8750 [2:08:28<43:49, 5.96s/it] 95%|█████████▍| 8310/8750 [2:08:35<43:06, 5.88s/it] 95%|█████████▍| 8310/8750 [2:08:33<43:06, 5.88s/it] {'loss': 0.467, 'learning_rate': 1.3234466012228887e-07, 'epoch': 0.95} + 95%|█████████▍| 8310/8750 [2:08:35<43:06, 5.88s/it] {'loss': 0.467, 'learning_rate': 1.3234466012228887e-07, 'epoch': 0.95} + 95%|█████████▍| 8310/8750 [2:08:33<43:06, 5.88s/it] 95%|█████████▍| 8311/8750 [2:08:41<42:36, 5.82s/it] 95%|█████████▍| 8311/8750 [2:08:39<42:36, 5.82s/it] {'loss': 0.4309, 'learning_rate': 1.31745099971331e-07, 'epoch': 0.95} + 95%|█████████▍| 8311/8750 [2:08:41<42:36, 5.82s/it] {'loss': 0.4309, 'learning_rate': 1.31745099971331e-07, 'epoch': 0.95} + 95%|█████████▍| 8311/8750 [2:08:39<42:36, 5.82s/it] 95%|█████████▍| 8312/8750 
[2:08:46<42:07, 5.77s/it] 95%|█████████▍| 8312/8750 [2:08:45<42:07, 5.77s/it] {'loss': 0.4875, 'learning_rate': 1.3114689199161478e-07, 'epoch': 0.95} + 95%|█████████▍| 8312/8750 [2:08:46<42:07, 5.77s/it] {'loss': 0.4875, 'learning_rate': 1.3114689199161478e-07, 'epoch': 0.95} + 95%|█████████▍| 8312/8750 [2:08:45<42:07, 5.77s/it] 95%|█████████▌| 8313/8750 [2:08:52<41:56, 5.76s/it] 95%|█████████▌| 8313/8750 [2:08:50<41:56, 5.76s/it] {'loss': 0.4334, 'learning_rate': 1.3055003626510687e-07, 'epoch': 0.95} + 95%|█████████▌| 8313/8750 [2:08:52<41:56, 5.76s/it] {'loss': 0.4334, 'learning_rate': 1.3055003626510687e-07, 'epoch': 0.95} + 95%|█████████▌| 8313/8750 [2:08:50<41:56, 5.76s/it] 95%|█████████▌| 8314/8750 [2:08:58<42:02, 5.78s/it] 95%|█████████▌| 8314/8750 [2:08:56<42:02, 5.78s/it] {'loss': 0.4388, 'learning_rate': 1.2995453287359293e-07, 'epoch': 0.95} + 95%|█████████▌| 8314/8750 [2:08:58<42:02, 5.78s/it] {'loss': 0.4388, 'learning_rate': 1.2995453287359293e-07, 'epoch': 0.95} + 95%|█████████▌| 8314/8750 [2:08:56<42:02, 5.78s/it] 95%|█████████▌| 8315/8750 [2:09:04<42:02, 5.80s/it] 95%|█████████▌| 8315/8750 [2:09:02<42:02, 5.80s/it] {'loss': 0.4582, 'learning_rate': 1.2936038189866773e-07, 'epoch': 0.95} + 95%|█████████▌| 8315/8750 [2:09:04<42:02, 5.80s/it] {'loss': 0.4582, 'learning_rate': 1.2936038189866773e-07, 'epoch': 0.95} + 95%|█████████▌| 8315/8750 [2:09:02<42:02, 5.80s/it] 95%|█████████▌| 8316/8750 [2:09:08<41:42, 5.77s/it] 95%|█████████▌| 8316/8750 [2:09:10<41:42, 5.77s/it] {'loss': 0.4398, 'learning_rate': 1.287675834217428e-07, 'epoch': 0.95} + 95%|█████████▌| 8316/8750 [2:09:10<41:42, 5.77s/it] {'loss': 0.4398, 'learning_rate': 1.287675834217428e-07, 'epoch': 0.95} + 95%|█████████▌| 8316/8750 [2:09:08<41:42, 5.77s/it] 95%|█████████▌| 8317/8750 [2:09:15<41:45, 5.79s/it] 95%|█████████▌| 8317/8750 [2:09:13<41:45, 5.79s/it] {'loss': 0.4413, 'learning_rate': 1.2817613752404646e-07, 'epoch': 0.95} + 95%|█████████▌| 8317/8750 [2:09:15<41:45, 
5.79s/it]{'loss': 0.4413, 'learning_rate': 1.2817613752404646e-07, 'epoch': 0.95} + 95%|█████████▌| 8317/8750 [2:09:14<41:45, 5.79s/it] 95%|█████████▌| 8318/8750 [2:09:21<41:25, 5.75s/it] 95%|█████████▌| 8318/8750 [2:09:19<41:25, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.2758604428661836e-07, 'epoch': 0.95} + 95%|█████████▌| 8318/8750 [2:09:21<41:25, 5.75s/it] {'loss': 0.469, 'learning_rate': 1.2758604428661836e-07, 'epoch': 0.95} + 95%|█████████▌| 8318/8750 [2:09:19<41:25, 5.75s/it] 95%|█████████▌| 8319/8750 [2:09:27<41:27, 5.77s/it] 95%|█████████▌| 8319/8750 [2:09:25<41:27, 5.77s/it] {'loss': 0.4525, 'learning_rate': 1.2699730379031604e-07, 'epoch': 0.95} + 95%|█████████▌| 8319/8750 [2:09:27<41:27, 5.77s/it] {'loss': 0.4525, 'learning_rate': 1.2699730379031604e-07, 'epoch': 0.95} + 95%|█████████▌| 8319/8750 [2:09:25<41:27, 5.77s/it] 95%|█████████▌| 8320/8750 [2:09:32<40:53, 5.71s/it] 95%|█████████▌| 8320/8750 [2:09:31<40:53, 5.71s/it] {'loss': 0.4553, 'learning_rate': 1.2640991611580943e-07, 'epoch': 0.95} + 95%|█████████▌| 8320/8750 [2:09:32<40:53, 5.71s/it] {'loss': 0.4553, 'learning_rate': 1.2640991611580943e-07, 'epoch': 0.95} + 95%|█████████▌| 8320/8750 [2:09:31<40:53, 5.71s/it] 95%|█████████▌| 8321/8750 [2:09:38<40:44, 5.70s/it] 95%|█████████▌| 8321/8750 [2:09:36<40:44, 5.70s/it] {'loss': 0.469, 'learning_rate': 1.2582388134358414e-07, 'epoch': 0.95} + 95%|█████████▌| 8321/8750 [2:09:38<40:44, 5.70s/it] {'loss': 0.469, 'learning_rate': 1.2582388134358414e-07, 'epoch': 0.95} + 95%|█████████▌| 8321/8750 [2:09:36<40:44, 5.70s/it] 95%|█████████▌| 8322/8750 [2:09:44<40:32, 5.68s/it] 95%|█████████▌| 8322/8750 [2:09:42<40:32, 5.68s/it] {'loss': 0.4566, 'learning_rate': 1.2523919955393925e-07, 'epoch': 0.95} + 95%|█████████▌| 8322/8750 [2:09:44<40:32, 5.68s/it] {'loss': 0.4566, 'learning_rate': 1.2523919955393925e-07, 'epoch': 0.95} + 95%|█████████▌| 8322/8750 [2:09:42<40:32, 5.68s/it] 95%|█████████▌| 8323/8750 [2:09:50<41:16, 5.80s/it] 95%|█████████▌| 8323/8750 
[2:09:48<41:16, 5.80s/it] {'loss': 0.4418, 'learning_rate': 1.246558708269896e-07, 'epoch': 0.95} + 95%|█████████▌| 8323/8750 [2:09:50<41:16, 5.80s/it] {'loss': 0.4418, 'learning_rate': 1.246558708269896e-07, 'epoch': 0.95} + 95%|█████████▌| 8323/8750 [2:09:48<41:16, 5.80s/it] 95%|█████████▌| 8324/8750 [2:09:56<41:06, 5.79s/it] 95%|█████████▌| 8324/8750 [2:09:54<41:06, 5.79s/it] {'loss': 0.4348, 'learning_rate': 1.2407389524266456e-07, 'epoch': 0.95} + 95%|█████████▌| 8324/8750 [2:09:56<41:06, 5.79s/it] {'loss': 0.4348, 'learning_rate': 1.2407389524266456e-07, 'epoch': 0.95} + 95%|█████████▌| 8324/8750 [2:09:54<41:06, 5.79s/it] 95%|█████████▌| 8325/8750 [2:10:02<41:15, 5.83s/it] 95%|█████████▌| 8325/8750 [2:10:00<41:15, 5.83s/it] {'loss': 0.4491, 'learning_rate': 1.23493272880707e-07, 'epoch': 0.95} + 95%|█████████▌| 8325/8750 [2:10:02<41:15, 5.83s/it] {'loss': 0.4491, 'learning_rate': 1.23493272880707e-07, 'epoch': 0.95} + 95%|█████████▌| 8325/8750 [2:10:00<41:15, 5.83s/it] 95%|█████████▌| 8326/8750 [2:10:05<40:57, 5.79s/it] 95%|█████████▌| 8326/8750 [2:10:07<40:57, 5.80s/it] {'loss': 0.4521, 'learning_rate': 1.2291400382067553e-07, 'epoch': 0.95} + 95%|█████████▌| 8326/8750 [2:10:07<40:57, 5.80s/it] {'loss': 0.4521, 'learning_rate': 1.2291400382067553e-07, 'epoch': 0.95} + 95%|█████████▌| 8326/8750 [2:10:05<40:57, 5.79s/it] 95%|█████████▌| 8327/8750 [2:10:13<40:41, 5.77s/it] 95%|█████████▌| 8327/8750 [2:10:11<40:41, 5.77s/it] {'loss': 0.4571, 'learning_rate': 1.223360881419433e-07, 'epoch': 0.95} + 95%|█████████▌| 8327/8750 [2:10:13<40:41, 5.77s/it] {'loss': 0.4571, 'learning_rate': 1.223360881419433e-07, 'epoch': 0.95} + 95%|█████████▌| 8327/8750 [2:10:11<40:41, 5.77s/it] 95%|█████████▌| 8328/8750 [2:10:19<40:13, 5.72s/it] 95%|█████████▌| 8328/8750 [2:10:17<40:13, 5.72s/it] {'loss': 0.4485, 'learning_rate': 1.21759525923697e-07, 'epoch': 0.95} + 95%|█████████▌| 8328/8750 [2:10:19<40:13, 5.72s/it] {'loss': 0.4485, 'learning_rate': 1.21759525923697e-07, 'epoch': 
0.95} + 95%|█████████▌| 8328/8750 [2:10:17<40:13, 5.72s/it] 95%|█████████▌| 8329/8750 [2:10:24<40:27, 5.77s/it] 95%|█████████▌| 8329/8750 [2:10:23<40:27, 5.77s/it] {'loss': 0.4499, 'learning_rate': 1.2118431724493895e-07, 'epoch': 0.95} + 95%|█████████▌| 8329/8750 [2:10:24<40:27, 5.77s/it] {'loss': 0.4499, 'learning_rate': 1.2118431724493895e-07, 'epoch': 0.95} + 95%|█████████▌| 8329/8750 [2:10:23<40:27, 5.77s/it] 95%|█████████▌| 8330/8750 [2:10:30<40:20, 5.76s/it] 95%|█████████▌| 8330/8750 [2:10:28<40:20, 5.76s/it] {'loss': 0.4544, 'learning_rate': 1.2061046218448724e-07, 'epoch': 0.95} + 95%|█████████▌| 8330/8750 [2:10:30<40:20, 5.76s/it] {'loss': 0.4544, 'learning_rate': 1.2061046218448724e-07, 'epoch': 0.95} + 95%|█████████▌| 8330/8750 [2:10:28<40:20, 5.76s/it] 95%|█████████▌| 8331/8750 [2:10:34<40:07, 5.75s/it] 95%|█████████▌| 8331/8750 [2:10:36<40:07, 5.75s/it] {'loss': 0.4409, 'learning_rate': 1.2003796082097008e-07, 'epoch': 0.95} + 95%|█████████▌| 8331/8750 [2:10:36<40:07, 5.75s/it] {'loss': 0.4409, 'learning_rate': 1.2003796082097008e-07, 'epoch': 0.95} + 95%|█████████▌| 8331/8750 [2:10:34<40:07, 5.75s/it] 95%|█████████▌| 8332/8750 [2:10:40<39:50, 5.72s/it] 95%|█████████▌| 8332/8750 [2:10:42<39:50, 5.72s/it] {'loss': 0.4329, 'learning_rate': 1.194668132328325e-07, 'epoch': 0.95} + 95%|█████████▌| 8332/8750 [2:10:42<39:50, 5.72s/it] {'loss': 0.4329, 'learning_rate': 1.194668132328325e-07, 'epoch': 0.95} + 95%|█████████▌| 8332/8750 [2:10:40<39:50, 5.72s/it] 95%|█████████▌| 8333/8750 [2:10:45<39:54, 5.74s/it] 95%|█████████▌| 8333/8750 [2:10:47<39:54, 5.74s/it] {'loss': 0.4414, 'learning_rate': 1.1889701949833743e-07, 'epoch': 0.95} + 95%|█████████▌| 8333/8750 [2:10:47<39:54, 5.74s/it] {'loss': 0.4414, 'learning_rate': 1.1889701949833743e-07, 'epoch': 0.95} + 95%|█████████▌| 8333/8750 [2:10:45<39:54, 5.74s/it] 95%|█████████▌| 8334/8750 [2:10:53<39:46, 5.74s/it] 95%|█████████▌| 8334/8750 [2:10:51<39:46, 5.74s/it] {'loss': 0.448, 'learning_rate': 
1.18328579695558e-07, 'epoch': 0.95} + 95%|█████████▌| 8334/8750 [2:10:53<39:46, 5.74s/it] {'loss': 0.448, 'learning_rate': 1.18328579695558e-07, 'epoch': 0.95} + 95%|█████████▌| 8334/8750 [2:10:51<39:46, 5.74s/it] 95%|█████████▌| 8335/8750 [2:10:57<40:01, 5.79s/it] 95%|█████████▌| 8335/8750 [2:10:59<40:02, 5.79s/it] {'loss': 0.4538, 'learning_rate': 1.1776149390238301e-07, 'epoch': 0.95} + 95%|█████████▌| 8335/8750 [2:10:59<40:02, 5.79s/it] {'loss': 0.4538, 'learning_rate': 1.1776149390238301e-07, 'epoch': 0.95} + 95%|█████████▌| 8335/8750 [2:10:57<40:01, 5.79s/it] 95%|█████████▌| 8336/8750 [2:11:05<39:46, 5.76s/it] 95%|█████████▌| 8336/8750 [2:11:03<39:46, 5.76s/it] {'loss': 0.4408, 'learning_rate': 1.1719576219651585e-07, 'epoch': 0.95} + {'loss': 0.4408, 'learning_rate': 1.1719576219651585e-07, 'epoch': 0.95} 95%|█████████▌| 8336/8750 [2:11:05<39:46, 5.76s/it] + 95%|█████████▌| 8336/8750 [2:11:03<39:46, 5.76s/it] 95%|█████████▌| 8337/8750 [2:11:10<39:27, 5.73s/it] 95%|█████████▌| 8337/8750 [2:11:08<39:27, 5.73s/it] {'loss': 0.4368, 'learning_rate': 1.1663138465547341e-07, 'epoch': 0.95} + 95%|█████████▌| 8337/8750 [2:11:10<39:27, 5.73s/it] {'loss': 0.4368, 'learning_rate': 1.1663138465547341e-07, 'epoch': 0.95} + 95%|█████████▌| 8337/8750 [2:11:08<39:27, 5.73s/it] 95%|█████████▌| 8338/8750 [2:11:16<39:34, 5.76s/it] 95%|█████████▌| 8338/8750 [2:11:14<39:34, 5.76s/it] {'loss': 0.453, 'learning_rate': 1.1606836135658939e-07, 'epoch': 0.95} + 95%|█████████▌| 8338/8750 [2:11:16<39:34, 5.76s/it] {'loss': 0.453, 'learning_rate': 1.1606836135658939e-07, 'epoch': 0.95} + 95%|█████████▌| 8338/8750 [2:11:14<39:34, 5.76s/it] 95%|█████████▌| 8339/8750 [2:11:20<39:09, 5.72s/it] 95%|█████████▌| 8339/8750 [2:11:22<39:09, 5.72s/it] {'loss': 0.4487, 'learning_rate': 1.1550669237700985e-07, 'epoch': 0.95} + {'loss': 0.4487, 'learning_rate': 1.1550669237700985e-07, 'epoch': 0.95} + 95%|█████████▌| 8339/8750 [2:11:22<39:09, 5.72s/it] 95%|█████████▌| 8339/8750 [2:11:20<39:09, 
5.72s/it] 95%|█████████▌| 8340/8750 [2:11:26<39:38, 5.80s/it] 95%|█████████▌| 8340/8750 [2:11:28<39:38, 5.80s/it] {'loss': 0.429, 'learning_rate': 1.1494637779369766e-07, 'epoch': 0.95} + 95%|█████████▌| 8340/8750 [2:11:28<39:38, 5.80s/it] {'loss': 0.429, 'learning_rate': 1.1494637779369766e-07, 'epoch': 0.95} + 95%|█████████▌| 8340/8750 [2:11:26<39:38, 5.80s/it] 95%|█████████▌| 8341/8750 [2:11:33<39:11, 5.75s/it] 95%|█████████▌| 8341/8750 [2:11:32<39:11, 5.75s/it] {'loss': 0.4466, 'learning_rate': 1.1438741768342587e-07, 'epoch': 0.95} + 95%|█████████▌| 8341/8750 [2:11:33<39:11, 5.75s/it] {'loss': 0.4466, 'learning_rate': 1.1438741768342587e-07, 'epoch': 0.95} + 95%|█████████▌| 8341/8750 [2:11:32<39:11, 5.75s/it] 95%|█████████▌| 8342/8750 [2:11:39<39:08, 5.75s/it] 95%|█████████▌| 8342/8750 [2:11:37<39:08, 5.76s/it] {'loss': 0.4571, 'learning_rate': 1.1382981212278655e-07, 'epoch': 0.95} + 95%|█████████▌| 8342/8750 [2:11:39<39:08, 5.75s/it] {'loss': 0.4571, 'learning_rate': 1.1382981212278655e-07, 'epoch': 0.95} + 95%|█████████▌| 8342/8750 [2:11:37<39:08, 5.76s/it] 95%|█████████▌| 8343/8750 [2:11:45<39:21, 5.80s/it] 95%|█████████▌| 8343/8750 [2:11:43<39:21, 5.80s/it] {'loss': 0.4546, 'learning_rate': 1.13273561188183e-07, 'epoch': 0.95} + 95%|█████████▌| 8343/8750 [2:11:45<39:21, 5.80s/it] {'loss': 0.4546, 'learning_rate': 1.13273561188183e-07, 'epoch': 0.95} + 95%|█████████▌| 8343/8750 [2:11:43<39:21, 5.80s/it] 95%|█████████▌| 8344/8750 [2:11:51<39:22, 5.82s/it] 95%|█████████▌| 8344/8750 [2:11:49<39:22, 5.82s/it] {'loss': 0.4403, 'learning_rate': 1.1271866495583428e-07, 'epoch': 0.95} + 95%|█████████▌| 8344/8750 [2:11:51<39:22, 5.82s/it] {'loss': 0.4403, 'learning_rate': 1.1271866495583428e-07, 'epoch': 0.95} + 95%|█████████▌| 8344/8750 [2:11:49<39:22, 5.82s/it] 95%|█████████▌| 8345/8750 [2:11:57<38:49, 5.75s/it] 95%|█████████▌| 8345/8750 [2:11:55<38:49, 5.75s/it] {'loss': 0.4579, 'learning_rate': 1.12165123501774e-07, 'epoch': 0.95} + 95%|█████████▌| 8345/8750 
[2:11:57<38:49, 5.75s/it] {'loss': 0.4579, 'learning_rate': 1.12165123501774e-07, 'epoch': 0.95} + 95%|█████████▌| 8345/8750 [2:11:55<38:49, 5.75s/it] 95%|█████████▌| 8346/8750 [2:12:00<38:29, 5.72s/it] 95%|█████████▌| 8346/8750 [2:12:02<38:29, 5.72s/it] {'loss': 0.4458, 'learning_rate': 1.1161293690184927e-07, 'epoch': 0.95} + {'loss': 0.4458, 'learning_rate': 1.1161293690184927e-07, 'epoch': 0.95} 95%|█████████▌| 8346/8750 [2:12:02<38:29, 5.72s/it] + 95%|█████████▌| 8346/8750 [2:12:00<38:29, 5.72s/it] 95%|█████████▌| 8347/8750 [2:12:08<38:28, 5.73s/it] 95%|█████████▌| 8347/8750 [2:12:06<38:28, 5.73s/it] {'loss': 0.4447, 'learning_rate': 1.1106210523172068e-07, 'epoch': 0.95} + 95%|█████████▌| 8347/8750 [2:12:08<38:28, 5.73s/it] {'loss': 0.4447, 'learning_rate': 1.1106210523172068e-07, 'epoch': 0.95} + 95%|█████████▌| 8347/8750 [2:12:06<38:28, 5.73s/it] 95%|█████████▌| 8348/8750 [2:12:12<38:32, 5.75s/it] 95%|█████████▌| 8348/8750 [2:12:14<38:32, 5.75s/it] {'loss': 0.4585, 'learning_rate': 1.1051262856686673e-07, 'epoch': 0.95} + 95%|█████████▌| 8348/8750 [2:12:14<38:32, 5.75s/it] {'loss': 0.4585, 'learning_rate': 1.1051262856686673e-07, 'epoch': 0.95} + 95%|█████████▌| 8348/8750 [2:12:12<38:32, 5.75s/it] 95%|█████████▌| 8349/8750 [2:12:19<37:59, 5.69s/it] 95%|█████████▌| 8349/8750 [2:12:17<37:59, 5.69s/it] {'loss': 0.4428, 'learning_rate': 1.0996450698257721e-07, 'epoch': 0.95} + 95%|█████████▌| 8349/8750 [2:12:17<37:59, 5.69s/it]{'loss': 0.4428, 'learning_rate': 1.0996450698257721e-07, 'epoch': 0.95} + 95%|█████████▌| 8349/8750 [2:12:19<37:59, 5.69s/it]6 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +148 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend...3AutoResumeHook: Checking whether to suspend... AutoResumeHook: Checking whether to suspend... 
+ 95%|█████████▌| 8350/8750 [2:12:25<37:51, 5.68s/it] + + 95%|█████████▌| 8350/8750 [2:12:23<37:51, 5.68s/it] +9 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +2710 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +13 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4559, 'learning_rate': 1.0941774055395538e-07, 'epoch': 0.95} + 95%|█████████▌| 8350/8750 [2:12:25<37:51, 5.68s/it] {'loss': 0.4559, 'learning_rate': 1.0941774055395538e-07, 'epoch': 0.95} + 95%|█████████▌| 8350/8750 [2:12:23<37:51, 5.68s/it] 95%|█████████▌| 8351/8750 [2:12:31<37:56, 5.70s/it] 95%|█████████▌| 8351/8750 [2:12:29<37:56, 5.70s/it] {'loss': 0.4442, 'learning_rate': 1.0887232935592351e-07, 'epoch': 0.95} + 95%|█████████▌| 8351/8750 [2:12:31<37:56, 5.70s/it] {'loss': 0.4442, 'learning_rate': 1.0887232935592351e-07, 'epoch': 0.95} + 95%|█████████▌| 8351/8750 [2:12:29<37:56, 5.70s/it] 95%|█████████▌| 8352/8750 [2:12:35<37:57, 5.72s/it] 95%|█████████▌| 8352/8750 [2:12:36<37:57, 5.72s/it] {'loss': 0.4561, 'learning_rate': 1.0832827346321295e-07, 'epoch': 0.95} + {'loss': 0.4561, 'learning_rate': 1.0832827346321295e-07, 'epoch': 0.95} + 95%|█████████▌| 8352/8750 [2:12:36<37:57, 5.72s/it] 95%|█████████▌| 8352/8750 [2:12:35<37:57, 5.72s/it] 95%|█████████▌| 8353/8750 [2:12:42<38:18, 5.79s/it] 95%|█████████▌| 8353/8750 [2:12:41<38:18, 5.79s/it] {'loss': 0.4558, 'learning_rate': 1.0778557295037296e-07, 'epoch': 0.95} + 95%|█████████▌| 8353/8750 [2:12:42<38:18, 5.79s/it] {'loss': 0.4558, 'learning_rate': 1.0778557295037296e-07, 'epoch': 0.95} + 95%|█████████▌| 8353/8750 [2:12:41<38:18, 5.79s/it] 95%|█████████▌| 8354/8750 [2:12:48<37:53, 5.74s/it] 95%|█████████▌| 8354/8750 [2:12:46<37:53, 5.74s/it] 
{'loss': 0.4638, 'learning_rate': 1.0724422789176404e-07, 'epoch': 0.95} + 95%|█████████▌| 8354/8750 [2:12:48<37:53, 5.74s/it] {'loss': 0.4638, 'learning_rate': 1.0724422789176404e-07, 'epoch': 0.95} + 95%|█████████▌| 8354/8750 [2:12:46<37:53, 5.74s/it] 95%|█████████▌| 8355/8750 [2:12:52<37:45, 5.74s/it] 95%|█████████▌| 8355/8750 [2:12:54<37:45, 5.74s/it] {'loss': 0.439, 'learning_rate': 1.0670423836156241e-07, 'epoch': 0.95} + 95%|█████████▌| 8355/8750 [2:12:54<37:45, 5.74s/it] {'loss': 0.439, 'learning_rate': 1.0670423836156241e-07, 'epoch': 0.95} + 95%|█████████▌| 8355/8750 [2:12:52<37:45, 5.74s/it] 95%|█████████▌| 8356/8750 [2:12:58<37:42, 5.74s/it] 95%|█████████▌| 8356/8750 [2:13:00<37:42, 5.74s/it] {'loss': 0.4519, 'learning_rate': 1.0616560443376e-07, 'epoch': 0.95} + 95%|█████████▌| 8356/8750 [2:13:00<37:42, 5.74s/it] {'loss': 0.4519, 'learning_rate': 1.0616560443376e-07, 'epoch': 0.95} + 95%|█████████▌| 8356/8750 [2:12:58<37:42, 5.74s/it] 96%|█████████▌| 8357/8750 [2:13:03<37:25, 5.71s/it] 96%|█████████▌| 8357/8750 [2:13:05<37:25, 5.71s/it]{'loss': 0.4447, 'learning_rate': 1.0562832618216223e-07, 'epoch': 0.96} + {'loss': 0.4447, 'learning_rate': 1.0562832618216223e-07, 'epoch': 0.96} + 96%|█████████▌| 8357/8750 [2:13:05<37:25, 5.71s/it] 96%|█████████▌| 8357/8750 [2:13:03<37:25, 5.71s/it] 96%|█████████▌| 8358/8750 [2:13:09<37:24, 5.73s/it] 96%|█████████▌| 8358/8750 [2:13:11<37:24, 5.73s/it] {'loss': 0.4416, 'learning_rate': 1.0509240368038576e-07, 'epoch': 0.96} + 96%|█████████▌| 8358/8750 [2:13:11<37:24, 5.73s/it] {'loss': 0.4416, 'learning_rate': 1.0509240368038576e-07, 'epoch': 0.96} + 96%|█████████▌| 8358/8750 [2:13:09<37:24, 5.73s/it] 96%|█████████▌| 8359/8750 [2:13:15<37:10, 5.70s/it] 96%|█████████▌| 8359/8750 [2:13:17<37:10, 5.70s/it] {'loss': 0.4505, 'learning_rate': 1.0455783700186628e-07, 'epoch': 0.96} + 96%|█████████▌| 8359/8750 [2:13:17<37:10, 5.70s/it] {'loss': 0.4505, 'learning_rate': 1.0455783700186628e-07, 'epoch': 0.96} + 96%|█████████▌| 
8359/8750 [2:13:15<37:10, 5.70s/it] 96%|█████████▌| 8360/8750 [2:13:22<37:03, 5.70s/it] 96%|█████████▌| 8360/8750 [2:13:20<37:03, 5.70s/it] {'loss': 0.4415, 'learning_rate': 1.0402462621984965e-07, 'epoch': 0.96} + 96%|█████████▌| 8360/8750 [2:13:22<37:03, 5.70s/it] {'loss': 0.4415, 'learning_rate': 1.0402462621984965e-07, 'epoch': 0.96} + 96%|█████████▌| 8360/8750 [2:13:20<37:03, 5.70s/it] 96%|█████████▌| 8361/8750 [2:13:28<36:56, 5.70s/it] 96%|█████████▌| 8361/8750 [2:13:26<36:56, 5.70s/it] {'loss': 0.4615, 'learning_rate': 1.0349277140739966e-07, 'epoch': 0.96} + 96%|█████████▌| 8361/8750 [2:13:28<36:56, 5.70s/it] {'loss': 0.4615, 'learning_rate': 1.0349277140739966e-07, 'epoch': 0.96} + 96%|█████████▌| 8361/8750 [2:13:26<36:56, 5.70s/it] 96%|█████████▌| 8362/8750 [2:13:32<36:43, 5.68s/it] 96%|█████████▌| 8362/8750 [2:13:34<36:43, 5.68s/it]{'loss': 0.4467, 'learning_rate': 1.0296227263739023e-07, 'epoch': 0.96} + {'loss': 0.4467, 'learning_rate': 1.0296227263739023e-07, 'epoch': 0.96} + 96%|█████████▌| 8362/8750 [2:13:34<36:43, 5.68s/it] 96%|█████████▌| 8362/8750 [2:13:32<36:43, 5.68s/it] 96%|█████████▌| 8363/8750 [2:13:37<36:44, 5.70s/it] 96%|█████████▌| 8363/8750 [2:13:39<36:44, 5.70s/it] {'loss': 0.4314, 'learning_rate': 1.0243312998251209e-07, 'epoch': 0.96} + 96%|█████████▌| 8363/8750 [2:13:39<36:44, 5.70s/it] {'loss': 0.4314, 'learning_rate': 1.0243312998251209e-07, 'epoch': 0.96} + 96%|█████████▌| 8363/8750 [2:13:37<36:44, 5.70s/it] 96%|█████████▌| 8364/8750 [2:13:43<36:26, 5.66s/it] 96%|█████████▌| 8364/8750 [2:13:45<36:26, 5.66s/it] {'loss': 0.4581, 'learning_rate': 1.0190534351527059e-07, 'epoch': 0.96} {'loss': 0.4581, 'learning_rate': 1.0190534351527059e-07, 'epoch': 0.96} + 96%|█████████▌| 8364/8750 [2:13:45<36:26, 5.66s/it] + 96%|█████████▌| 8364/8750 [2:13:43<36:26, 5.66s/it] 96%|█████████▌| 8365/8750 [2:13:49<36:20, 5.66s/it] 96%|█████████▌| 8365/8750 [2:13:51<36:20, 5.66s/it] {'loss': 0.4285, 'learning_rate': 1.0137891330798344e-07, 'epoch': 
0.96} + 96%|█████████▌| 8365/8750 [2:13:51<36:20, 5.66s/it] {'loss': 0.4285, 'learning_rate': 1.0137891330798344e-07, 'epoch': 0.96} + 96%|█████████▌| 8365/8750 [2:13:49<36:20, 5.66s/it] 96%|█████████▌| 8366/8750 [2:13:54<36:25, 5.69s/it] 96%|█████████▌| 8366/8750 [2:13:56<36:25, 5.69s/it] {'loss': 0.4495, 'learning_rate': 1.0085383943278293e-07, 'epoch': 0.96} + 96%|█████████▌| 8366/8750 [2:13:56<36:25, 5.69s/it] {'loss': 0.4495, 'learning_rate': 1.0085383943278293e-07, 'epoch': 0.96} + 96%|█████████▌| 8366/8750 [2:13:54<36:25, 5.69s/it] 96%|█████████▌| 8367/8750 [2:14:00<36:07, 5.66s/it] 96%|█████████▌| 8367/8750 [2:14:02<36:07, 5.66s/it] {'loss': 0.4665, 'learning_rate': 1.0033012196161706e-07, 'epoch': 0.96} + 96%|█████████▌| 8367/8750 [2:14:02<36:07, 5.66s/it] {'loss': 0.4665, 'learning_rate': 1.0033012196161706e-07, 'epoch': 0.96} + 96%|█████████▌| 8367/8750 [2:14:00<36:07, 5.66s/it] 96%|█████████▌| 8368/8750 [2:14:06<36:02, 5.66s/it] 96%|█████████▌| 8368/8750 [2:14:08<36:02, 5.66s/it] {'loss': 0.4569, 'learning_rate': 9.980776096624511e-08, 'epoch': 0.96} + 96%|█████████▌| 8368/8750 [2:14:08<36:02, 5.66s/it] {'loss': 0.4569, 'learning_rate': 9.980776096624511e-08, 'epoch': 0.96} + 96%|█████████▌| 8368/8750 [2:14:06<36:02, 5.66s/it] 96%|█████████▌| 8369/8750 [2:14:11<36:08, 5.69s/it] 96%|█████████▌| 8369/8750 [2:14:13<36:08, 5.69s/it] {'loss': 0.4427, 'learning_rate': 9.928675651824427e-08, 'epoch': 0.96} + 96%|█████████▌| 8369/8750 [2:14:13<36:08, 5.69s/it] {'loss': 0.4427, 'learning_rate': 9.928675651824427e-08, 'epoch': 0.96} + 96%|█████████▌| 8369/8750 [2:14:11<36:08, 5.69s/it] 96%|█████████▌| 8370/8750 [2:14:17<35:33, 5.62s/it] 96%|█████████▌| 8370/8750 [2:14:19<35:33, 5.62s/it] {'loss': 0.4595, 'learning_rate': 9.876710868900297e-08, 'epoch': 0.96} + 96%|█████████▌| 8370/8750 [2:14:19<35:33, 5.62s/it] {'loss': 0.4595, 'learning_rate': 9.876710868900297e-08, 'epoch': 0.96} + 96%|█████████▌| 8370/8750 [2:14:17<35:33, 5.62s/it] 96%|█████████▌| 8371/8750 
[2:14:23<35:32, 5.63s/it] 96%|█████████▌| 8371/8750 [2:14:24<35:32, 5.63s/it] {'loss': 0.4592, 'learning_rate': 9.824881754972426e-08, 'epoch': 0.96} + 96%|█████████▌| 8371/8750 [2:14:24<35:32, 5.63s/it] {'loss': 0.4592, 'learning_rate': 9.824881754972426e-08, 'epoch': 0.96} + 96%|█████████▌| 8371/8750 [2:14:23<35:32, 5.63s/it] 96%|█████████▌| 8372/8750 [2:14:28<35:21, 5.61s/it] 96%|█████████▌| 8372/8750 [2:14:30<35:21, 5.61s/it] {'loss': 0.4459, 'learning_rate': 9.773188317142579e-08, 'epoch': 0.96} + 96%|█████████▌| 8372/8750 [2:14:30<35:21, 5.61s/it] {'loss': 0.4459, 'learning_rate': 9.773188317142579e-08, 'epoch': 0.96} + 96%|█████████▌| 8372/8750 [2:14:28<35:21, 5.61s/it] 96%|█████████▌| 8373/8750 [2:14:36<35:59, 5.73s/it] 96%|█████████▌| 8373/8750 [2:14:34<35:59, 5.73s/it] {'loss': 0.4578, 'learning_rate': 9.721630562493867e-08, 'epoch': 0.96} + 96%|█████████▌| 8373/8750 [2:14:36<35:59, 5.73s/it] {'loss': 0.4578, 'learning_rate': 9.721630562493867e-08, 'epoch': 0.96} + 96%|█████████▌| 8373/8750 [2:14:34<35:59, 5.73s/it] 96%|█████████▌| 8374/8750 [2:14:42<35:36, 5.68s/it] 96%|█████████▌| 8374/8750 [2:14:40<35:36, 5.68s/it] {'loss': 0.4662, 'learning_rate': 9.670208498090861e-08, 'epoch': 0.96} + 96%|█████████▌| 8374/8750 [2:14:42<35:36, 5.68s/it] {'loss': 0.4662, 'learning_rate': 9.670208498090861e-08, 'epoch': 0.96} + 96%|█████████▌| 8374/8750 [2:14:40<35:36, 5.68s/it] 96%|█████████▌| 8375/8750 [2:14:47<35:49, 5.73s/it] 96%|█████████▌| 8375/8750 [2:14:46<35:49, 5.73s/it] {'loss': 0.4425, 'learning_rate': 9.61892213097959e-08, 'epoch': 0.96} + 96%|█████████▌| 8375/8750 [2:14:47<35:49, 5.73s/it] {'loss': 0.4425, 'learning_rate': 9.61892213097959e-08, 'epoch': 0.96} + 96%|█████████▌| 8375/8750 [2:14:46<35:49, 5.73s/it] 96%|█████████▌| 8376/8750 [2:14:53<35:47, 5.74s/it] 96%|█████████▌| 8376/8750 [2:14:51<35:47, 5.74s/it] {'loss': 0.4341, 'learning_rate': 9.567771468187326e-08, 'epoch': 0.96} + 96%|█████████▌| 8376/8750 [2:14:53<35:47, 5.74s/it] {'loss': 0.4341, 
'learning_rate': 9.567771468187326e-08, 'epoch': 0.96} + 96%|█████████▌| 8376/8750 [2:14:51<35:47, 5.74s/it] 96%|█████████▌| 8377/8750 [2:14:59<35:34, 5.72s/it] 96%|█████████▌| 8377/8750 [2:14:57<35:34, 5.72s/it] {'loss': 0.4402, 'learning_rate': 9.516756516723124e-08, 'epoch': 0.96} + 96%|█████████▌| 8377/8750 [2:14:59<35:34, 5.72s/it] {'loss': 0.4402, 'learning_rate': 9.516756516723124e-08, 'epoch': 0.96} + 96%|█████████▌| 8377/8750 [2:14:57<35:34, 5.72s/it] 96%|█████████▌| 8378/8750 [2:15:03<35:12, 5.68s/it] 96%|█████████▌| 8378/8750 [2:15:04<35:12, 5.68s/it] {'loss': 0.4484, 'learning_rate': 9.46587728357673e-08, 'epoch': 0.96} + {'loss': 0.4484, 'learning_rate': 9.46587728357673e-08, 'epoch': 0.96} + 96%|█████████▌| 8378/8750 [2:15:04<35:12, 5.68s/it] 96%|█████████▌| 8378/8750 [2:15:03<35:12, 5.68s/it] 96%|█████████▌| 8379/8750 [2:15:08<35:10, 5.69s/it] 96%|█████████▌| 8379/8750 [2:15:10<35:10, 5.69s/it] {'loss': 0.477, 'learning_rate': 9.415133775720231e-08, 'epoch': 0.96} + 96%|█████████▌| 8379/8750 [2:15:10<35:10, 5.69s/it] {'loss': 0.477, 'learning_rate': 9.415133775720231e-08, 'epoch': 0.96} + 96%|█████████▌| 8379/8750 [2:15:08<35:10, 5.69s/it] 96%|█████████▌| 8380/8750 [2:15:14<35:33, 5.77s/it] 96%|█████████▌| 8380/8750 [2:15:16<35:33, 5.77s/it] {'loss': 0.4292, 'learning_rate': 9.364526000106289e-08, 'epoch': 0.96} + 96%|█████████▌| 8380/8750 [2:15:16<35:33, 5.77s/it] {'loss': 0.4292, 'learning_rate': 9.364526000106289e-08, 'epoch': 0.96} + 96%|█████████▌| 8380/8750 [2:15:14<35:33, 5.77s/it] 96%|█████████▌| 8381/8750 [2:15:20<35:51, 5.83s/it] 96%|█████████▌| 8381/8750 [2:15:22<35:51, 5.83s/it] {'loss': 0.4351, 'learning_rate': 9.314053963669245e-08, 'epoch': 0.96} + 96%|█████████▌| 8381/8750 [2:15:22<35:51, 5.83s/it] {'loss': 0.4351, 'learning_rate': 9.314053963669245e-08, 'epoch': 0.96} + 96%|█████████▌| 8381/8750 [2:15:20<35:51, 5.83s/it] 96%|█████████▌| 8382/8750 [2:15:26<35:15, 5.75s/it] 96%|█████████▌| 8382/8750 [2:15:28<35:15, 5.75s/it] {'loss': 
0.4642, 'learning_rate': 9.263717673325124e-08, 'epoch': 0.96} + 96%|█████████▌| 8382/8750 [2:15:28<35:15, 5.75s/it] {'loss': 0.4642, 'learning_rate': 9.263717673325124e-08, 'epoch': 0.96} + 96%|█████████▌| 8382/8750 [2:15:26<35:15, 5.75s/it] 96%|█████████▌| 8383/8750 [2:15:34<35:22, 5.78s/it] 96%|█████████▌| 8383/8750 [2:15:32<35:22, 5.78s/it] {'loss': 0.4251, 'learning_rate': 9.213517135971073e-08, 'epoch': 0.96} + 96%|█████████▌| 8383/8750 [2:15:34<35:22, 5.78s/it] {'loss': 0.4251, 'learning_rate': 9.213517135971073e-08, 'epoch': 0.96} + 96%|█████████▌| 8383/8750 [2:15:32<35:22, 5.78s/it] 96%|█████████▌| 8384/8750 [2:15:39<35:35, 5.83s/it] 96%|█████████▌| 8384/8750 [2:15:38<35:35, 5.83s/it] {'loss': 0.4471, 'learning_rate': 9.163452358485591e-08, 'epoch': 0.96} + 96%|█████████▌| 8384/8750 [2:15:39<35:35, 5.83s/it] {'loss': 0.4471, 'learning_rate': 9.163452358485591e-08, 'epoch': 0.96} + 96%|█████████▌| 8384/8750 [2:15:38<35:35, 5.83s/it] 96%|█████████▌| 8385/8750 [2:15:45<35:23, 5.82s/it] 96%|█████████▌| 8385/8750 [2:15:43<35:23, 5.82s/it] {'loss': 0.4405, 'learning_rate': 9.113523347728748e-08, 'epoch': 0.96} + 96%|█████████▌| 8385/8750 [2:15:45<35:23, 5.82s/it] {'loss': 0.4405, 'learning_rate': 9.113523347728748e-08, 'epoch': 0.96} + 96%|█████████▌| 8385/8750 [2:15:43<35:23, 5.82s/it] 96%|█████████▌| 8386/8750 [2:15:49<35:10, 5.80s/it] 96%|█████████▌| 8386/8750 [2:15:51<35:10, 5.80s/it]{'loss': 0.4519, 'learning_rate': 9.063730110541846e-08, 'epoch': 0.96} + {'loss': 0.4519, 'learning_rate': 9.063730110541846e-08, 'epoch': 0.96} + 96%|█████████▌| 8386/8750 [2:15:51<35:10, 5.80s/it] 96%|█████████▌| 8386/8750 [2:15:49<35:10, 5.80s/it] 96%|█████████▌| 8387/8750 [2:15:57<34:50, 5.76s/it] 96%|█████████▌| 8387/8750 [2:15:55<34:50, 5.76s/it] {'loss': 0.4561, 'learning_rate': 9.014072653747763e-08, 'epoch': 0.96} + 96%|█████████▌| 8387/8750 [2:15:57<34:50, 5.76s/it] {'loss': 0.4561, 'learning_rate': 9.014072653747763e-08, 'epoch': 0.96} + 96%|█████████▌| 8387/8750 
[2:15:55<34:50, 5.76s/it] 96%|█████████▌| 8388/8750 [2:16:03<34:58, 5.80s/it] 96%|█████████▌| 8388/8750 [2:16:01<34:58, 5.80s/it] {'loss': 0.4654, 'learning_rate': 8.964550984150611e-08, 'epoch': 0.96} + 96%|█████████▌| 8388/8750 [2:16:03<34:58, 5.80s/it] {'loss': 0.4654, 'learning_rate': 8.964550984150611e-08, 'epoch': 0.96} + 96%|█████████▌| 8388/8750 [2:16:01<34:58, 5.80s/it] 96%|█████████▌| 8389/8750 [2:16:08<34:54, 5.80s/it] 96%|█████████▌| 8389/8750 [2:16:06<34:54, 5.80s/it] {'loss': 0.4423, 'learning_rate': 8.915165108536072e-08, 'epoch': 0.96} + 96%|█████████▌| 8389/8750 [2:16:08<34:54, 5.80s/it] {'loss': 0.4423, 'learning_rate': 8.915165108536072e-08, 'epoch': 0.96} + 96%|█████████▌| 8389/8750 [2:16:06<34:54, 5.80s/it] 96%|█████████▌| 8390/8750 [2:16:14<34:40, 5.78s/it] 96%|█████████▌| 8390/8750 [2:16:12<34:40, 5.78s/it] {'loss': 0.4749, 'learning_rate': 8.865915033671069e-08, 'epoch': 0.96} + 96%|█████████▌| 8390/8750 [2:16:14<34:40, 5.78s/it] {'loss': 0.4749, 'learning_rate': 8.865915033671069e-08, 'epoch': 0.96} + 96%|█████████▌| 8390/8750 [2:16:12<34:40, 5.78s/it] 96%|█████████▌| 8391/8750 [2:16:20<34:22, 5.74s/it] 96%|█████████▌| 8391/8750 [2:16:18<34:22, 5.74s/it] {'loss': 0.4455, 'learning_rate': 8.816800766303756e-08, 'epoch': 0.96} + 96%|█████████▌| 8391/8750 [2:16:20<34:22, 5.74s/it] {'loss': 0.4455, 'learning_rate': 8.816800766303756e-08, 'epoch': 0.96} + 96%|█████████▌| 8391/8750 [2:16:18<34:22, 5.74s/it] 96%|█████████▌| 8392/8750 [2:16:26<34:32, 5.79s/it] 96%|█████████▌| 8392/8750 [2:16:24<34:32, 5.79s/it] {'loss': 0.4368, 'learning_rate': 8.767822313164198e-08, 'epoch': 0.96} + 96%|█████████▌| 8392/8750 [2:16:26<34:32, 5.79s/it] {'loss': 0.4368, 'learning_rate': 8.767822313164198e-08, 'epoch': 0.96} + 96%|█████████▌| 8392/8750 [2:16:24<34:32, 5.79s/it] 96%|█████████▌| 8393/8750 [2:16:31<34:05, 5.73s/it] 96%|█████████▌| 8393/8750 [2:16:29<34:05, 5.73s/it] {'loss': 0.4495, 'learning_rate': 8.718979680963469e-08, 'epoch': 0.96} + 96%|█████████▌| 
8393/8750 [2:16:31<34:05, 5.73s/it] {'loss': 0.4495, 'learning_rate': 8.718979680963469e-08, 'epoch': 0.96} + 96%|█████████▌| 8393/8750 [2:16:29<34:05, 5.73s/it] 96%|█████████▌| 8394/8750 [2:16:35<34:02, 5.74s/it] 96%|█████████▌| 8394/8750 [2:16:37<34:02, 5.74s/it] {'loss': 0.4468, 'learning_rate': 8.670272876393881e-08, 'epoch': 0.96} + 96%|█████████▌| 8394/8750 [2:16:37<34:02, 5.74s/it] {'loss': 0.4468, 'learning_rate': 8.670272876393881e-08, 'epoch': 0.96} + 96%|█████████▌| 8394/8750 [2:16:35<34:02, 5.74s/it] 96%|█████████▌| 8395/8750 [2:16:43<33:48, 5.71s/it] 96%|█████████▌| 8395/8750 [2:16:41<33:48, 5.71s/it] {'loss': 0.4538, 'learning_rate': 8.621701906129542e-08, 'epoch': 0.96} + 96%|█████████▌| 8395/8750 [2:16:43<33:48, 5.71s/it] {'loss': 0.4538, 'learning_rate': 8.621701906129542e-08, 'epoch': 0.96} + 96%|█████████▌| 8395/8750 [2:16:41<33:48, 5.71s/it] 96%|█████████▌| 8396/8750 [2:16:49<33:56, 5.75s/it] 96%|█████████▌| 8396/8750 [2:16:47<33:56, 5.75s/it] {'loss': 0.4725, 'learning_rate': 8.573266776825683e-08, 'epoch': 0.96} + {'loss': 0.4725, 'learning_rate': 8.573266776825683e-08, 'epoch': 0.96} 96%|█████████▌| 8396/8750 [2:16:49<33:56, 5.75s/it] + 96%|█████████▌| 8396/8750 [2:16:47<33:56, 5.75s/it] 96%|█████████▌| 8397/8750 [2:16:54<33:45, 5.74s/it] 96%|█████████▌| 8397/8750 [2:16:52<33:45, 5.74s/it] {'loss': 0.4482, 'learning_rate': 8.524967495119107e-08, 'epoch': 0.96} + 96%|█████████▌| 8397/8750 [2:16:54<33:45, 5.74s/it] {'loss': 0.4482, 'learning_rate': 8.524967495119107e-08, 'epoch': 0.96} + 96%|█████████▌| 8397/8750 [2:16:52<33:45, 5.74s/it] 96%|█████████▌| 8398/8750 [2:17:00<33:33, 5.72s/it] 96%|█████████▌| 8398/8750 [2:16:58<33:33, 5.72s/it] {'loss': 0.4601, 'learning_rate': 8.476804067627852e-08, 'epoch': 0.96} + 96%|█████████▌| 8398/8750 [2:17:00<33:33, 5.72s/it] {'loss': 0.4601, 'learning_rate': 8.476804067627852e-08, 'epoch': 0.96} + 96%|█████████▌| 8398/8750 [2:16:58<33:33, 5.72s/it] 96%|█████████▌| 8399/8750 [2:17:04<33:27, 5.72s/it] 
96%|█████████▌| 8399/8750 [2:17:06<33:27, 5.72s/it] {'loss': 0.4497, 'learning_rate': 8.428776500951308e-08, 'epoch': 0.96} + 96%|█████████▌| 8399/8750 [2:17:06<33:27, 5.72s/it] {'loss': 0.4497, 'learning_rate': 8.428776500951308e-08, 'epoch': 0.96} + 96%|█████████▌| 8399/8750 [2:17:04<33:27, 5.72s/it]5 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +4 0AutoResumeHook: Checking whether to suspend...1 + AutoResumeHook: Checking whether to suspend... + 14 AutoResumeHook: Checking whether to suspend... +63 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... 96%|█████████▌| 8400/8750 [2:17:11<33:22, 5.72s/it]11 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... + + 96%|█████████▌| 8400/8750 [2:17:09<33:22, 5.72s/it]9 AutoResumeHook: Checking whether to suspend... + +12 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend...2 AutoResumeHook: Checking whether to suspend... + +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4364, 'learning_rate': 8.380884801670431e-08, 'epoch': 0.96} + 96%|█████████▌| 8400/8750 [2:17:11<33:22, 5.72s/it] {'loss': 0.4364, 'learning_rate': 8.380884801670431e-08, 'epoch': 0.96} + 96%|█████████▌| 8400/8750 [2:17:09<33:22, 5.72s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8400/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8400/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8400/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 96%|█████████▌| 8401/8750 [2:17:31<57:54, 9.95s/it] 96%|█████████▌| 8401/8750 [2:17:29<57:54, 9.95s/it] {'loss': 0.4429, 'learning_rate': 8.333128976347305e-08, 'epoch': 0.96} + 96%|█████████▌| 8401/8750 [2:17:31<57:54, 9.95s/it] {'loss': 0.4429, 'learning_rate': 8.333128976347305e-08, 'epoch': 0.96} + 96%|█████████▌| 8401/8750 [2:17:29<57:54, 9.95s/it] 96%|█████████▌| 8402/8750 [2:17:35<50:17, 8.67s/it] 96%|█████████▌| 8402/8750 [2:17:37<50:17, 8.67s/it] {'loss': 0.461, 'learning_rate': 8.285509031525696e-08, 'epoch': 0.96} + 96%|█████████▌| 8402/8750 [2:17:37<50:17, 8.67s/it] {'loss': 0.461, 'learning_rate': 8.285509031525696e-08, 'epoch': 0.96} + 96%|█████████▌| 8402/8750 [2:17:35<50:17, 8.67s/it] 96%|█████████▌| 8403/8750 [2:17:43<44:54, 7.77s/it] 96%|█████████▌| 8403/8750 [2:17:41<44:54, 7.77s/it] {'loss': 0.4412, 'learning_rate': 8.238024973730497e-08, 'epoch': 0.96} + 96%|█████████▌| 8403/8750 [2:17:43<44:54, 7.77s/it] {'loss': 0.4412, 'learning_rate': 8.238024973730497e-08, 'epoch': 0.96} + 96%|█████████▌| 8403/8750 [2:17:41<44:54, 7.77s/it] 96%|█████████▌| 8404/8750 [2:17:48<41:01, 7.11s/it] 96%|█████████▌| 8404/8750 [2:17:46<41:01, 7.11s/it] {'loss': 0.4573, 'learning_rate': 8.190676809468056e-08, 'epoch': 0.96} + 96%|█████████▌| 8404/8750 [2:17:48<41:01, 7.11s/it] {'loss': 0.4573, 'learning_rate': 8.190676809468056e-08, 'epoch': 0.96} + 96%|█████████▌| 8404/8750 [2:17:46<41:01, 7.11s/it] 96%|█████████▌| 8405/8750 [2:17:52<38:21, 6.67s/it] 96%|█████████▌| 8405/8750 [2:17:54<38:21, 6.67s/it] {'loss': 0.4421, 'learning_rate': 8.143464545226298e-08, 'epoch': 0.96} + 96%|█████████▌| 8405/8750 [2:17:54<38:21, 6.67s/it] {'loss': 0.4421, 'learning_rate': 8.143464545226298e-08, 'epoch': 0.96} + 96%|█████████▌| 8405/8750 [2:17:52<38:21, 6.67s/it] 96%|█████████▌| 8406/8750 [2:17:59<36:15, 6.32s/it] 96%|█████████▌| 8406/8750 [2:17:57<36:15, 6.32s/it] {'loss': 0.4717, 'learning_rate': 8.096388187474269e-08, 'epoch': 0.96} + 96%|█████████▌| 
8406/8750 [2:17:59<36:15, 6.32s/it] {'loss': 0.4717, 'learning_rate': 8.096388187474269e-08, 'epoch': 0.96} + 96%|█████████▌| 8406/8750 [2:17:57<36:15, 6.32s/it] 96%|█████████▌| 8407/8750 [2:18:03<35:09, 6.15s/it] 96%|█████████▌| 8407/8750 [2:18:05<35:09, 6.15s/it] {'loss': 0.4514, 'learning_rate': 8.049447742662364e-08, 'epoch': 0.96} + 96%|█████████▌| 8407/8750 [2:18:05<35:09, 6.15s/it] {'loss': 0.4514, 'learning_rate': 8.049447742662364e-08, 'epoch': 0.96} + 96%|█████████▌| 8407/8750 [2:18:03<35:09, 6.15s/it] 96%|█████████▌| 8408/8750 [2:18:09<33:55, 5.95s/it] 96%|█████████▌| 8408/8750 [2:18:10<33:55, 5.95s/it] {'loss': 0.4507, 'learning_rate': 8.002643217222661e-08, 'epoch': 0.96} + 96%|█████████▌| 8408/8750 [2:18:10<33:55, 5.95s/it] {'loss': 0.4507, 'learning_rate': 8.002643217222661e-08, 'epoch': 0.96} + 96%|█████████▌| 8408/8750 [2:18:09<33:55, 5.95s/it] 96%|█████████▌| 8409/8750 [2:18:14<33:19, 5.86s/it] 96%|█████████▌| 8409/8750 [2:18:16<33:19, 5.86s/it] {'loss': 0.4262, 'learning_rate': 7.955974617568252e-08, 'epoch': 0.96} + 96%|█████████▌| 8409/8750 [2:18:16<33:19, 5.86s/it] {'loss': 0.4262, 'learning_rate': 7.955974617568252e-08, 'epoch': 0.96} + 96%|█████████▌| 8409/8750 [2:18:14<33:19, 5.86s/it] 96%|█████████▌| 8410/8750 [2:18:20<32:39, 5.76s/it] 96%|█████████▌| 8410/8750 [2:18:22<32:39, 5.76s/it] {'loss': 0.4479, 'learning_rate': 7.90944195009391e-08, 'epoch': 0.96} + 96%|█████████▌| 8410/8750 [2:18:22<32:39, 5.76s/it] {'loss': 0.4479, 'learning_rate': 7.90944195009391e-08, 'epoch': 0.96} + 96%|█████████▌| 8410/8750 [2:18:20<32:39, 5.76s/it] 96%|█████████▌| 8411/8750 [2:18:28<32:41, 5.79s/it] 96%|█████████▌| 8411/8750 [2:18:26<32:41, 5.79s/it] {'loss': 0.4506, 'learning_rate': 7.863045221175647e-08, 'epoch': 0.96} + 96%|█████████▌| 8411/8750 [2:18:28<32:41, 5.79s/it] {'loss': 0.4506, 'learning_rate': 7.863045221175647e-08, 'epoch': 0.96} + 96%|█████████▌| 8411/8750 [2:18:26<32:41, 5.79s/it] 96%|█████████▌| 8412/8750 [2:18:33<32:12, 5.72s/it] 
96%|█████████▌| 8412/8750 [2:18:31<32:12, 5.72s/it] {'loss': 0.4619, 'learning_rate': 7.81678443717071e-08, 'epoch': 0.96} + 96%|█████████▌| 8412/8750 [2:18:33<32:12, 5.72s/it] {'loss': 0.4619, 'learning_rate': 7.81678443717071e-08, 'epoch': 0.96} + 96%|█████████▌| 8412/8750 [2:18:31<32:12, 5.72s/it] 96%|█████████▌| 8413/8750 [2:18:37<32:03, 5.71s/it] 96%|█████████▌| 8413/8750 [2:18:39<32:03, 5.71s/it] {'loss': 0.4708, 'learning_rate': 7.77065960441803e-08, 'epoch': 0.96} + 96%|█████████▌| 8413/8750 [2:18:39<32:03, 5.71s/it] {'loss': 0.4708, 'learning_rate': 7.77065960441803e-08, 'epoch': 0.96} + 96%|█████████▌| 8413/8750 [2:18:37<32:03, 5.71s/it] 96%|█████████▌| 8414/8750 [2:18:44<31:46, 5.67s/it] 96%|█████████▌| 8414/8750 [2:18:42<31:46, 5.67s/it] {'loss': 0.4405, 'learning_rate': 7.72467072923766e-08, 'epoch': 0.96} + 96%|█████████▌| 8414/8750 [2:18:44<31:46, 5.67s/it] {'loss': 0.4405, 'learning_rate': 7.72467072923766e-08, 'epoch': 0.96} + 96%|█████████▌| 8414/8750 [2:18:42<31:46, 5.67s/it] 96%|█████████▌| 8415/8750 [2:18:48<31:42, 5.68s/it] 96%|█████████▌| 8415/8750 [2:18:50<31:42, 5.68s/it] {'loss': 0.4463, 'learning_rate': 7.678817817931006e-08, 'epoch': 0.96} + 96%|█████████▌| 8415/8750 [2:18:50<31:42, 5.68s/it] {'loss': 0.4463, 'learning_rate': 7.678817817931006e-08, 'epoch': 0.96} + 96%|█████████▌| 8415/8750 [2:18:48<31:42, 5.68s/it] 96%|█████████▌| 8416/8750 [2:18:56<31:35, 5.68s/it] 96%|█████████▌| 8416/8750 [2:18:54<31:35, 5.68s/it] {'loss': 0.48, 'learning_rate': 7.633100876781152e-08, 'epoch': 0.96} + 96%|█████████▌| 8416/8750 [2:18:56<31:35, 5.68s/it] {'loss': 0.48, 'learning_rate': 7.633100876781152e-08, 'epoch': 0.96} + 96%|█████████▌| 8416/8750 [2:18:54<31:35, 5.68s/it] 96%|█████████▌| 8417/8750 [2:19:02<31:47, 5.73s/it] 96%|█████████▌| 8417/8750 [2:19:00<31:47, 5.73s/it] {'loss': 0.4355, 'learning_rate': 7.587519912052199e-08, 'epoch': 0.96} + 96%|█████████▌| 8417/8750 [2:19:02<31:47, 5.73s/it] {'loss': 0.4355, 'learning_rate': 
7.587519912052199e-08, 'epoch': 0.96} + 96%|█████████▌| 8417/8750 [2:19:00<31:47, 5.73s/it] 96%|█████████▌| 8418/8750 [2:19:05<31:36, 5.71s/it] 96%|█████████▌| 8418/8750 [2:19:07<31:36, 5.71s/it] {'loss': 0.4531, 'learning_rate': 7.542074929989818e-08, 'epoch': 0.96} + 96%|█████████▌| 8418/8750 [2:19:07<31:36, 5.71s/it] {'loss': 0.4531, 'learning_rate': 7.542074929989818e-08, 'epoch': 0.96} + 96%|█████████▌| 8418/8750 [2:19:05<31:36, 5.71s/it] 96%|█████████▌| 8419/8750 [2:19:11<31:40, 5.74s/it] 96%|█████████▌| 8419/8750 [2:19:13<31:40, 5.74s/it] {'loss': 0.4639, 'learning_rate': 7.496765936821027e-08, 'epoch': 0.96} + 96%|█████████▌| 8419/8750 [2:19:13<31:40, 5.74s/it] {'loss': 0.4639, 'learning_rate': 7.496765936821027e-08, 'epoch': 0.96} + 96%|█████████▌| 8419/8750 [2:19:11<31:40, 5.74s/it] 96%|█████████▌| 8420/8750 [2:19:17<31:38, 5.75s/it] 96%|█████████▌| 8420/8750 [2:19:19<31:38, 5.75s/it] {'loss': 0.4652, 'learning_rate': 7.451592938753971e-08, 'epoch': 0.96} + 96%|█████████▌| 8420/8750 [2:19:19<31:38, 5.75s/it] {'loss': 0.4652, 'learning_rate': 7.451592938753971e-08, 'epoch': 0.96} + 96%|█████████▌| 8420/8750 [2:19:17<31:38, 5.75s/it] 96%|█████████▌| 8421/8750 [2:19:23<31:31, 5.75s/it] 96%|█████████▌| 8421/8750 [2:19:25<31:31, 5.75s/it] {'loss': 0.4412, 'learning_rate': 7.406555941978478e-08, 'epoch': 0.96} + 96%|█████████▌| 8421/8750 [2:19:25<31:31, 5.75s/it] {'loss': 0.4412, 'learning_rate': 7.406555941978478e-08, 'epoch': 0.96} + 96%|█████████▌| 8421/8750 [2:19:23<31:31, 5.75s/it] 96%|█████████▋| 8422/8750 [2:19:28<31:13, 5.71s/it] 96%|█████████▋| 8422/8750 [2:19:30<31:13, 5.71s/it] {'loss': 0.4694, 'learning_rate': 7.361654952665608e-08, 'epoch': 0.96} + 96%|█████████▋| 8422/8750 [2:19:30<31:13, 5.71s/it] {'loss': 0.4694, 'learning_rate': 7.361654952665608e-08, 'epoch': 0.96} + 96%|█████████▋| 8422/8750 [2:19:28<31:13, 5.71s/it] 96%|█████████▋| 8423/8750 [2:19:34<30:57, 5.68s/it] 96%|█████████▋| 8423/8750 [2:19:36<30:57, 5.68s/it] {'loss': 0.4387, 
'learning_rate': 7.31688997696789e-08, 'epoch': 0.96} + 96%|█████████▋| 8423/8750 [2:19:36<30:57, 5.68s/it] {'loss': 0.4387, 'learning_rate': 7.31688997696789e-08, 'epoch': 0.96} + 96%|█████████▋| 8423/8750 [2:19:34<30:57, 5.68s/it] 96%|█████████▋| 8424/8750 [2:19:40<30:45, 5.66s/it] 96%|█████████▋| 8424/8750 [2:19:41<30:45, 5.66s/it] {'loss': 0.4594, 'learning_rate': 7.272261021019079e-08, 'epoch': 0.96} + {'loss': 0.4594, 'learning_rate': 7.272261021019079e-08, 'epoch': 0.96} 96%|█████████▋| 8424/8750 [2:19:41<30:45, 5.66s/it] + 96%|█████████▋| 8424/8750 [2:19:40<30:45, 5.66s/it] 96%|█████████▋| 8425/8750 [2:19:45<30:59, 5.72s/it] 96%|█████████▋| 8425/8750 [2:19:47<30:59, 5.72s/it] {'loss': 0.4461, 'learning_rate': 7.227768090934285e-08, 'epoch': 0.96} + 96%|█████████▋| 8425/8750 [2:19:47<30:59, 5.72s/it] {'loss': 0.4461, 'learning_rate': 7.227768090934285e-08, 'epoch': 0.96} + 96%|█████████▋| 8425/8750 [2:19:45<30:59, 5.72s/it] 96%|█████████▋| 8426/8750 [2:19:51<30:39, 5.68s/it] 96%|█████████▋| 8426/8750 [2:19:53<30:39, 5.68s/it] {'loss': 0.4421, 'learning_rate': 7.183411192810075e-08, 'epoch': 0.96} + 96%|█████████▋| 8426/8750 [2:19:53<30:39, 5.68s/it] {'loss': 0.4421, 'learning_rate': 7.183411192810075e-08, 'epoch': 0.96} + 96%|█████████▋| 8426/8750 [2:19:51<30:39, 5.68s/it] 96%|█████████▋| 8427/8750 [2:19:57<30:39, 5.69s/it] 96%|█████████▋| 8427/8750 [2:19:59<30:39, 5.69s/it] {'loss': 0.4374, 'learning_rate': 7.139190332724255e-08, 'epoch': 0.96} + 96%|█████████▋| 8427/8750 [2:19:59<30:39, 5.69s/it] {'loss': 0.4374, 'learning_rate': 7.139190332724255e-08, 'epoch': 0.96} + 96%|█████████▋| 8427/8750 [2:19:57<30:39, 5.69s/it] 96%|█████████▋| 8428/8750 [2:20:02<30:22, 5.66s/it] 96%|█████████▋| 8428/8750 [2:20:04<30:22, 5.66s/it] {'loss': 0.4646, 'learning_rate': 7.095105516736201e-08, 'epoch': 0.96} + 96%|█████████▋| 8428/8750 [2:20:04<30:22, 5.66s/it] {'loss': 0.4646, 'learning_rate': 7.095105516736201e-08, 'epoch': 0.96} + 96%|█████████▋| 8428/8750 
[2:20:02<30:22, 5.66s/it] 96%|█████████▋| 8429/8750 [2:20:08<30:24, 5.68s/it] 96%|█████████▋| 8429/8750 [2:20:10<30:24, 5.68s/it] {'loss': 0.451, 'learning_rate': 7.051156750886523e-08, 'epoch': 0.96} + 96%|█████████▋| 8429/8750 [2:20:10<30:24, 5.68s/it] {'loss': 0.451, 'learning_rate': 7.051156750886523e-08, 'epoch': 0.96} + 96%|█████████▋| 8429/8750 [2:20:08<30:24, 5.68s/it] 96%|█████████▋| 8430/8750 [2:20:16<30:44, 5.76s/it] 96%|█████████▋| 8430/8750 [2:20:14<30:44, 5.76s/it] {'loss': 0.4344, 'learning_rate': 7.007344041196962e-08, 'epoch': 0.96} + {'loss': 0.4344, 'learning_rate': 7.007344041196962e-08, 'epoch': 0.96} 96%|█████████▋| 8430/8750 [2:20:16<30:44, 5.76s/it] + 96%|█████████▋| 8430/8750 [2:20:14<30:44, 5.76s/it] 96%|█████████▋| 8431/8750 [2:20:21<30:23, 5.72s/it] 96%|█████████▋| 8431/8750 [2:20:20<30:23, 5.72s/it] {'loss': 0.4456, 'learning_rate': 6.963667393671048e-08, 'epoch': 0.96} + 96%|█████████▋| 8431/8750 [2:20:21<30:23, 5.72s/it] {'loss': 0.4456, 'learning_rate': 6.963667393671048e-08, 'epoch': 0.96} + 96%|█████████▋| 8431/8750 [2:20:20<30:23, 5.72s/it] 96%|█████████▋| 8432/8750 [2:20:27<30:26, 5.74s/it] 96%|█████████▋| 8432/8750 [2:20:25<30:26, 5.74s/it] {'loss': 0.4396, 'learning_rate': 6.920126814293438e-08, 'epoch': 0.96} + 96%|█████████▋| 8432/8750 [2:20:27<30:26, 5.74s/it] {'loss': 0.4396, 'learning_rate': 6.920126814293438e-08, 'epoch': 0.96} + 96%|█████████▋| 8432/8750 [2:20:25<30:26, 5.74s/it] 96%|█████████▋| 8433/8750 [2:20:33<30:23, 5.75s/it] 96%|█████████▋| 8433/8750 [2:20:31<30:23, 5.75s/it] {'loss': 0.4522, 'learning_rate': 6.876722309030026e-08, 'epoch': 0.96} + 96%|█████████▋| 8433/8750 [2:20:33<30:23, 5.75s/it] {'loss': 0.4522, 'learning_rate': 6.876722309030026e-08, 'epoch': 0.96} + 96%|█████████▋| 8433/8750 [2:20:31<30:23, 5.75s/it] 96%|█████████▋| 8434/8750 [2:20:39<30:11, 5.73s/it] 96%|█████████▋| 8434/8750 [2:20:37<30:11, 5.73s/it] {'loss': 0.449, 'learning_rate': 6.833453883828389e-08, 'epoch': 0.96} + 96%|█████████▋| 
8434/8750 [2:20:39<30:11, 5.73s/it] {'loss': 0.449, 'learning_rate': 6.833453883828389e-08, 'epoch': 0.96} + 96%|█████████▋| 8434/8750 [2:20:37<30:11, 5.73s/it] 96%|█████████▋| 8435/8750 [2:20:43<30:29, 5.81s/it] 96%|█████████▋| 8435/8750 [2:20:45<30:29, 5.81s/it] {'loss': 0.4438, 'learning_rate': 6.790321544617117e-08, 'epoch': 0.96} + {'loss': 0.4438, 'learning_rate': 6.790321544617117e-08, 'epoch': 0.96} 96%|█████████▋| 8435/8750 [2:20:45<30:29, 5.81s/it] + 96%|█████████▋| 8435/8750 [2:20:43<30:29, 5.81s/it] 96%|█████████▋| 8436/8750 [2:20:50<30:12, 5.77s/it] 96%|█████████▋| 8436/8750 [2:20:48<30:12, 5.77s/it] {'loss': 0.4538, 'learning_rate': 6.747325297306484e-08, 'epoch': 0.96} + 96%|█████████▋| 8436/8750 [2:20:50<30:12, 5.77s/it] {'loss': 0.4538, 'learning_rate': 6.747325297306484e-08, 'epoch': 0.96} + 96%|█████████▋| 8436/8750 [2:20:48<30:12, 5.77s/it] 96%|█████████▋| 8437/8750 [2:20:54<29:37, 5.68s/it] 96%|█████████▋| 8437/8750 [2:20:56<29:37, 5.68s/it] {'loss': 0.4676, 'learning_rate': 6.704465147787665e-08, 'epoch': 0.96} + 96%|█████████▋| 8437/8750 [2:20:56<29:37, 5.68s/it] {'loss': 0.4676, 'learning_rate': 6.704465147787665e-08, 'epoch': 0.96} + 96%|█████████▋| 8437/8750 [2:20:54<29:37, 5.68s/it] 96%|█████████▋| 8438/8750 [2:21:00<29:33, 5.68s/it] 96%|█████████▋| 8438/8750 [2:21:02<29:33, 5.68s/it] {'loss': 0.4309, 'learning_rate': 6.661741101933628e-08, 'epoch': 0.96} + {'loss': 0.4309, 'learning_rate': 6.661741101933628e-08, 'epoch': 0.96} 96%|█████████▋| 8438/8750 [2:21:02<29:33, 5.68s/it] + 96%|█████████▋| 8438/8750 [2:21:00<29:33, 5.68s/it] 96%|█████████▋| 8439/8750 [2:21:05<29:17, 5.65s/it] 96%|█████████▋| 8439/8750 [2:21:07<29:17, 5.65s/it] {'loss': 0.4555, 'learning_rate': 6.61915316559858e-08, 'epoch': 0.96} + 96%|█████████▋| 8439/8750 [2:21:07<29:17, 5.65s/it] {'loss': 0.4555, 'learning_rate': 6.61915316559858e-08, 'epoch': 0.96} + 96%|█████████▋| 8439/8750 [2:21:05<29:17, 5.65s/it] 96%|█████████▋| 8440/8750 [2:21:11<29:04, 5.63s/it] 
96%|█████████▋| 8440/8750 [2:21:13<29:04, 5.63s/it] {'loss': 0.4451, 'learning_rate': 6.576701344617964e-08, 'epoch': 0.96} + {'loss': 0.4451, 'learning_rate': 6.576701344617964e-08, 'epoch': 0.96} 96%|█████████▋| 8440/8750 [2:21:13<29:04, 5.63s/it] + 96%|█████████▋| 8440/8750 [2:21:11<29:04, 5.63s/it] 96%|█████████▋| 8441/8750 [2:21:16<28:47, 5.59s/it] 96%|█████████▋| 8441/8750 [2:21:18<28:47, 5.59s/it] {'loss': 0.4492, 'learning_rate': 6.534385644808461e-08, 'epoch': 0.96} + 96%|█████████▋| 8441/8750 [2:21:18<28:47, 5.59s/it] {'loss': 0.4492, 'learning_rate': 6.534385644808461e-08, 'epoch': 0.96} + 96%|█████████▋| 8441/8750 [2:21:16<28:47, 5.59s/it] 96%|█████████▋| 8442/8750 [2:21:24<28:38, 5.58s/it] 96%|█████████▋| 8442/8750 [2:21:22<28:38, 5.58s/it] {'loss': 0.4569, 'learning_rate': 6.492206071968432e-08, 'epoch': 0.96} + 96%|█████████▋| 8442/8750 [2:21:24<28:38, 5.58s/it] {'loss': 0.4569, 'learning_rate': 6.492206071968432e-08, 'epoch': 0.96} + 96%|█████████▋| 8442/8750 [2:21:22<28:38, 5.58s/it] 96%|█████████▋| 8443/8750 [2:21:29<28:40, 5.61s/it] 96%|█████████▋| 8443/8750 [2:21:28<28:40, 5.61s/it] {'loss': 0.4277, 'learning_rate': 6.450162631877366e-08, 'epoch': 0.96} + 96%|█████████▋| 8443/8750 [2:21:29<28:40, 5.61s/it] {'loss': 0.4277, 'learning_rate': 6.450162631877366e-08, 'epoch': 0.96} + 96%|█████████▋| 8443/8750 [2:21:28<28:40, 5.61s/it] 97%|█████████▋| 8444/8750 [2:21:33<28:42, 5.63s/it] 97%|█████████▋| 8444/8750 [2:21:35<28:42, 5.63s/it] {'loss': 0.4393, 'learning_rate': 6.40825533029632e-08, 'epoch': 0.97} + {'loss': 0.4393, 'learning_rate': 6.40825533029632e-08, 'epoch': 0.97} 97%|█████████▋| 8444/8750 [2:21:35<28:42, 5.63s/it] + 97%|█████████▋| 8444/8750 [2:21:33<28:42, 5.63s/it] 97%|█████████▋| 8445/8750 [2:21:39<28:37, 5.63s/it] 97%|█████████▋| 8445/8750 [2:21:41<28:37, 5.63s/it] {'loss': 0.4463, 'learning_rate': 6.366484172967369e-08, 'epoch': 0.97} + 97%|█████████▋| 8445/8750 [2:21:41<28:37, 5.63s/it] {'loss': 0.4463, 'learning_rate': 
6.366484172967369e-08, 'epoch': 0.97} + 97%|█████████▋| 8445/8750 [2:21:39<28:37, 5.63s/it] 97%|█████████▋| 8446/8750 [2:21:45<28:44, 5.67s/it] 97%|█████████▋| 8446/8750 [2:21:47<28:44, 5.67s/it] {'loss': 0.4572, 'learning_rate': 6.324849165614045e-08, 'epoch': 0.97} + 97%|█████████▋| 8446/8750 [2:21:47<28:44, 5.67s/it] {'loss': 0.4572, 'learning_rate': 6.324849165614045e-08, 'epoch': 0.97} + 97%|█████████▋| 8446/8750 [2:21:45<28:44, 5.67s/it] 97%|█████████▋| 8447/8750 [2:21:50<28:47, 5.70s/it] 97%|█████████▋| 8447/8750 [2:21:52<28:47, 5.70s/it] {'loss': 0.4336, 'learning_rate': 6.28335031394134e-08, 'epoch': 0.97} + 97%|█████████▋| 8447/8750 [2:21:52<28:47, 5.70s/it] {'loss': 0.4336, 'learning_rate': 6.28335031394134e-08, 'epoch': 0.97} + 97%|█████████▋| 8447/8750 [2:21:50<28:47, 5.70s/it] 97%|█████████▋| 8448/8750 [2:21:56<28:37, 5.69s/it] 97%|█████████▋| 8448/8750 [2:21:58<28:37, 5.69s/it] {'loss': 0.4617, 'learning_rate': 6.241987623635482e-08, 'epoch': 0.97} + 97%|█████████▋| 8448/8750 [2:21:58<28:37, 5.69s/it] {'loss': 0.4617, 'learning_rate': 6.241987623635482e-08, 'epoch': 0.97} + 97%|█████████▋| 8448/8750 [2:21:56<28:37, 5.69s/it] 97%|█████████▋| 8449/8750 [2:22:02<28:42, 5.72s/it] 97%|█████████▋| 8449/8750 [2:22:04<28:42, 5.72s/it]{'loss': 0.4252, 'learning_rate': 6.200761100364272e-08, 'epoch': 0.97} + {'loss': 0.4252, 'learning_rate': 6.200761100364272e-08, 'epoch': 0.97} + 97%|█████████▋| 8449/8750 [2:22:04<28:42, 5.72s/it] 97%|█████████▋| 8449/8750 [2:22:02<28:42, 5.72s/it]06 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 158 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... + 97%|█████████▋| 8450/8750 [2:22:10<28:43, 5.74s/it]1011 + AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+5 AutoResumeHook: Checking whether to suspend... + 97%|█████████▋| 8450/8750 [2:22:08<28:43, 5.74s/it]4 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4396, 'learning_rate': 6.159670749776414e-08, 'epoch': 0.97} + 97%|█████████▋| 8450/8750 [2:22:10<28:43, 5.74s/it] {'loss': 0.4396, 'learning_rate': 6.159670749776414e-08, 'epoch': 0.97} + 97%|█████████▋| 8450/8750 [2:22:08<28:43, 5.74s/it] 97%|█████████▋| 8451/8750 [2:22:13<28:37, 5.75s/it] 97%|█████████▋| 8451/8750 [2:22:15<28:37, 5.75s/it] {'loss': 0.4562, 'learning_rate': 6.118716577502404e-08, 'epoch': 0.97} + 97%|█████████▋| 8451/8750 [2:22:15<28:37, 5.75s/it] {'loss': 0.4562, 'learning_rate': 6.118716577502404e-08, 'epoch': 0.97} + 97%|█████████▋| 8451/8750 [2:22:13<28:37, 5.75s/it] 97%|█████████▋| 8452/8750 [2:22:19<28:25, 5.72s/it] 97%|█████████▋| 8452/8750 [2:22:21<28:25, 5.72s/it] {'loss': 0.4598, 'learning_rate': 6.077898589153642e-08, 'epoch': 0.97} + 97%|█████████▋| 8452/8750 [2:22:21<28:25, 5.72s/it] {'loss': 0.4598, 'learning_rate': 6.077898589153642e-08, 'epoch': 0.97} + 97%|█████████▋| 8452/8750 [2:22:19<28:25, 5.72s/it] 97%|█████████▋| 8453/8750 [2:22:25<28:04, 5.67s/it] 97%|█████████▋| 8453/8750 [2:22:27<28:04, 5.67s/it] {'loss': 0.4458, 'learning_rate': 6.037216790323319e-08, 'epoch': 0.97} + 97%|█████████▋| 8453/8750 [2:22:27<28:04, 5.67s/it] {'loss': 0.4458, 'learning_rate': 6.037216790323319e-08, 'epoch': 0.97} + 97%|█████████▋| 8453/8750 [2:22:25<28:04, 5.67s/it] 97%|█████████▋| 8454/8750 [2:22:33<28:30, 5.78s/it] 97%|█████████▋| 8454/8750 [2:22:31<28:30, 5.78s/it] {'loss': 0.4534, 'learning_rate': 5.996671186585756e-08, 'epoch': 0.97} + 97%|█████████▋| 8454/8750 [2:22:33<28:30, 
5.78s/it] {'loss': 0.4534, 'learning_rate': 5.996671186585756e-08, 'epoch': 0.97} + 97%|█████████▋| 8454/8750 [2:22:31<28:30, 5.78s/it] 97%|█████████▋| 8455/8750 [2:22:36<28:17, 5.76s/it] 97%|█████████▋| 8455/8750 [2:22:38<28:17, 5.76s/it] {'loss': 0.4514, 'learning_rate': 5.9562617834963974e-08, 'epoch': 0.97} + 97%|█████████▋| 8455/8750 [2:22:38<28:17, 5.76s/it] {'loss': 0.4514, 'learning_rate': 5.9562617834963974e-08, 'epoch': 0.97} + 97%|█████████▋| 8455/8750 [2:22:36<28:17, 5.76s/it] 97%|█████████▋| 8456/8750 [2:22:42<28:09, 5.75s/it] 97%|█████████▋| 8456/8750 [2:22:44<28:09, 5.75s/it] {'loss': 0.4507, 'learning_rate': 5.915988586592481e-08, 'epoch': 0.97} + 97%|█████████▋| 8456/8750 [2:22:44<28:09, 5.75s/it] {'loss': 0.4507, 'learning_rate': 5.915988586592481e-08, 'epoch': 0.97} + 97%|█████████▋| 8456/8750 [2:22:42<28:09, 5.75s/it] 97%|█████████▋| 8457/8750 [2:22:48<28:18, 5.80s/it] 97%|█████████▋| 8457/8750 [2:22:50<28:18, 5.80s/it] {'loss': 0.4562, 'learning_rate': 5.8758516013921464e-08, 'epoch': 0.97} + 97%|█████████▋| 8457/8750 [2:22:50<28:18, 5.80s/it] {'loss': 0.4562, 'learning_rate': 5.8758516013921464e-08, 'epoch': 0.97} + 97%|█████████▋| 8457/8750 [2:22:48<28:18, 5.80s/it] 97%|█████████▋| 8458/8750 [2:22:56<28:00, 5.76s/it] 97%|█████████▋| 8458/8750 [2:22:54<28:00, 5.76s/it] {'loss': 0.4379, 'learning_rate': 5.8358508333951066e-08, 'epoch': 0.97} + 97%|█████████▋| 8458/8750 [2:22:56<28:00, 5.76s/it] {'loss': 0.4379, 'learning_rate': 5.8358508333951066e-08, 'epoch': 0.97} + 97%|█████████▋| 8458/8750 [2:22:54<28:00, 5.76s/it] 97%|█████████▋| 8459/8750 [2:23:00<28:13, 5.82s/it] 97%|█████████▋| 8459/8750 [2:23:02<28:13, 5.82s/it] {'loss': 0.438, 'learning_rate': 5.795986288082422e-08, 'epoch': 0.97} + 97%|█████████▋| 8459/8750 [2:23:02<28:13, 5.82s/it] {'loss': 0.438, 'learning_rate': 5.795986288082422e-08, 'epoch': 0.97} + 97%|█████████▋| 8459/8750 [2:23:00<28:13, 5.82s/it] 97%|█████████▋| 8460/8750 [2:23:05<28:06, 5.82s/it] 97%|█████████▋| 8460/8750 
[2:23:07<28:06, 5.82s/it] {'loss': 0.4665, 'learning_rate': 5.75625797091639e-08, 'epoch': 0.97} + 97%|█████████▋| 8460/8750 [2:23:07<28:06, 5.82s/it] {'loss': 0.4665, 'learning_rate': 5.75625797091639e-08, 'epoch': 0.97} + 97%|█████████▋| 8460/8750 [2:23:05<28:06, 5.82s/it] 97%|█████████▋| 8461/8750 [2:23:13<28:03, 5.82s/it] 97%|█████████▋| 8461/8750 [2:23:11<28:03, 5.82s/it] {'loss': 0.4447, 'learning_rate': 5.716665887340656e-08, 'epoch': 0.97} + 97%|█████████▋| 8461/8750 [2:23:13<28:03, 5.82s/it] {'loss': 0.4447, 'learning_rate': 5.716665887340656e-08, 'epoch': 0.97} + 97%|█████████▋| 8461/8750 [2:23:11<28:03, 5.82s/it] 97%|█████████▋| 8462/8750 [2:23:17<27:46, 5.79s/it] 97%|█████████▋| 8462/8750 [2:23:19<27:46, 5.79s/it] {'loss': 0.4512, 'learning_rate': 5.677210042780212e-08, 'epoch': 0.97} + 97%|█████████▋| 8462/8750 [2:23:19<27:46, 5.79s/it] {'loss': 0.4512, 'learning_rate': 5.677210042780212e-08, 'epoch': 0.97} + 97%|█████████▋| 8462/8750 [2:23:17<27:46, 5.79s/it] 97%|█████████▋| 8463/8750 [2:23:23<27:52, 5.83s/it] 97%|█████████▋| 8463/8750 [2:23:25<27:52, 5.83s/it] {'loss': 0.4368, 'learning_rate': 5.637890442641403e-08, 'epoch': 0.97} + 97%|█████████▋| 8463/8750 [2:23:25<27:52, 5.83s/it] {'loss': 0.4368, 'learning_rate': 5.637890442641403e-08, 'epoch': 0.97} + 97%|█████████▋| 8463/8750 [2:23:23<27:52, 5.83s/it] 97%|█████████▋| 8464/8750 [2:23:29<27:30, 5.77s/it] 97%|█████████▋| 8464/8750 [2:23:30<27:30, 5.77s/it] {'loss': 0.4483, 'learning_rate': 5.598707092311917e-08, 'epoch': 0.97} + {'loss': 0.4483, 'learning_rate': 5.598707092311917e-08, 'epoch': 0.97} + 97%|█████████▋| 8464/8750 [2:23:30<27:30, 5.77s/it] 97%|█████████▋| 8464/8750 [2:23:29<27:30, 5.77s/it] 97%|█████████▋| 8465/8750 [2:23:34<27:13, 5.73s/it] 97%|█████████▋| 8465/8750 [2:23:36<27:13, 5.73s/it] {'loss': 0.4491, 'learning_rate': 5.5596599971606823e-08, 'epoch': 0.97} + 97%|█████████▋| 8465/8750 [2:23:36<27:13, 5.73s/it] {'loss': 0.4491, 'learning_rate': 5.5596599971606823e-08, 'epoch': 
0.97} + 97%|█████████▋| 8465/8750 [2:23:34<27:13, 5.73s/it] 97%|█████████▋| 8466/8750 [2:23:40<27:27, 5.80s/it] 97%|█████████▋| 8466/8750 [2:23:42<27:27, 5.80s/it] {'loss': 0.4589, 'learning_rate': 5.520749162538197e-08, 'epoch': 0.97} + 97%|█████████▋| 8466/8750 [2:23:42<27:27, 5.80s/it] {'loss': 0.4589, 'learning_rate': 5.520749162538197e-08, 'epoch': 0.97} + 97%|█████████▋| 8466/8750 [2:23:40<27:27, 5.80s/it] 97%|█████████▋| 8467/8750 [2:23:46<27:32, 5.84s/it] 97%|█████████▋| 8467/8750 [2:23:48<27:32, 5.84s/it] {'loss': 0.4372, 'learning_rate': 5.4819745937758625e-08, 'epoch': 0.97} + 97%|█████████▋| 8467/8750 [2:23:48<27:32, 5.84s/it] {'loss': 0.4372, 'learning_rate': 5.4819745937758625e-08, 'epoch': 0.97} + 97%|█████████▋| 8467/8750 [2:23:46<27:32, 5.84s/it] 97%|█████████▋| 8468/8750 [2:23:52<27:09, 5.78s/it] 97%|█████████▋| 8468/8750 [2:23:54<27:09, 5.78s/it] {'loss': 0.4507, 'learning_rate': 5.443336296186874e-08, 'epoch': 0.97} + 97%|█████████▋| 8468/8750 [2:23:54<27:09, 5.78s/it] {'loss': 0.4507, 'learning_rate': 5.443336296186874e-08, 'epoch': 0.97} + 97%|█████████▋| 8468/8750 [2:23:52<27:09, 5.78s/it] 97%|█████████▋| 8469/8750 [2:23:59<27:00, 5.77s/it] 97%|█████████▋| 8469/8750 [2:23:57<27:00, 5.77s/it] {'loss': 0.4463, 'learning_rate': 5.40483427506544e-08, 'epoch': 0.97} + 97%|█████████▋| 8469/8750 [2:23:57<27:00, 5.77s/it]{'loss': 0.4463, 'learning_rate': 5.40483427506544e-08, 'epoch': 0.97} + 97%|█████████▋| 8469/8750 [2:23:59<27:00, 5.77s/it] 97%|█████████▋| 8470/8750 [2:24:03<26:45, 5.73s/it] 97%|█████████▋| 8470/8750 [2:24:05<26:45, 5.73s/it] {'loss': 0.4787, 'learning_rate': 5.3664685356871193e-08, 'epoch': 0.97} + 97%|█████████▋| 8470/8750 [2:24:05<26:45, 5.73s/it] {'loss': 0.4787, 'learning_rate': 5.3664685356871193e-08, 'epoch': 0.97} + 97%|█████████▋| 8470/8750 [2:24:03<26:45, 5.73s/it] 97%|█████████▋| 8471/8750 [2:24:11<26:23, 5.68s/it] 97%|█████████▋| 8471/8750 [2:24:09<26:23, 5.68s/it] {'loss': 0.4549, 'learning_rate': 
5.3282390833090393e-08, 'epoch': 0.97} + 97%|█████████▋| 8471/8750 [2:24:11<26:23, 5.68s/it] {'loss': 0.4549, 'learning_rate': 5.3282390833090393e-08, 'epoch': 0.97} + 97%|█████████▋| 8471/8750 [2:24:09<26:23, 5.68s/it] 97%|█████████▋| 8472/8750 [2:24:16<26:18, 5.68s/it] 97%|█████████▋| 8472/8750 [2:24:14<26:18, 5.68s/it] {'loss': 0.4251, 'learning_rate': 5.290145923169343e-08, 'epoch': 0.97} + 97%|█████████▋| 8472/8750 [2:24:16<26:18, 5.68s/it] {'loss': 0.4251, 'learning_rate': 5.290145923169343e-08, 'epoch': 0.97} + 97%|█████████▋| 8472/8750 [2:24:14<26:18, 5.68s/it] 97%|█████████▋| 8473/8750 [2:24:22<25:59, 5.63s/it] 97%|█████████▋| 8473/8750 [2:24:20<25:59, 5.63s/it] {'loss': 0.4497, 'learning_rate': 5.252189060487855e-08, 'epoch': 0.97} + 97%|█████████▋| 8473/8750 [2:24:22<25:59, 5.63s/it] {'loss': 0.4497, 'learning_rate': 5.252189060487855e-08, 'epoch': 0.97} + 97%|█████████▋| 8473/8750 [2:24:20<25:59, 5.63s/it] 97%|█████████▋| 8474/8750 [2:24:27<25:44, 5.60s/it] 97%|█████████▋| 8474/8750 [2:24:25<25:44, 5.60s/it] {'loss': 0.4614, 'learning_rate': 5.214368500465305e-08, 'epoch': 0.97} + 97%|█████████▋| 8474/8750 [2:24:27<25:44, 5.60s/it] {'loss': 0.4614, 'learning_rate': 5.214368500465305e-08, 'epoch': 0.97} + 97%|█████████▋| 8474/8750 [2:24:25<25:44, 5.60s/it] 97%|█████████▋| 8475/8750 [2:24:33<25:56, 5.66s/it] 97%|█████████▋| 8475/8750 [2:24:31<25:56, 5.66s/it] {'loss': 0.4422, 'learning_rate': 5.176684248283992e-08, 'epoch': 0.97} +{'loss': 0.4422, 'learning_rate': 5.176684248283992e-08, 'epoch': 0.97} + 97%|█████████▋| 8475/8750 [2:24:31<25:56, 5.66s/it] 97%|█████████▋| 8475/8750 [2:24:33<25:56, 5.66s/it] 97%|█████████▋| 8476/8750 [2:24:37<25:47, 5.65s/it] 97%|█████████▋| 8476/8750 [2:24:39<25:47, 5.65s/it] {'loss': 0.4454, 'learning_rate': 5.1391363091075616e-08, 'epoch': 0.97} + 97%|█████████▋| 8476/8750 [2:24:39<25:47, 5.65s/it] {'loss': 0.4454, 'learning_rate': 5.1391363091075616e-08, 'epoch': 0.97} + 97%|█████████▋| 8476/8750 [2:24:37<25:47, 
5.65s/it] 97%|█████████▋| 8477/8750 [2:24:43<25:50, 5.68s/it] 97%|█████████▋| 8477/8750 [2:24:44<25:50, 5.68s/it] {'loss': 0.4402, 'learning_rate': 5.1017246880809e-08, 'epoch': 0.97} + 97%|█████████▋| 8477/8750 [2:24:44<25:50, 5.68s/it] {'loss': 0.4402, 'learning_rate': 5.1017246880809e-08, 'epoch': 0.97} + 97%|█████████▋| 8477/8750 [2:24:43<25:50, 5.68s/it] 97%|█████████▋| 8478/8750 [2:24:50<25:37, 5.65s/it] 97%|█████████▋| 8478/8750 [2:24:48<25:37, 5.65s/it] {'loss': 0.4599, 'learning_rate': 5.064449390330239e-08, 'epoch': 0.97} + 97%|█████████▋| 8478/8750 [2:24:50<25:37, 5.65s/it] {'loss': 0.4599, 'learning_rate': 5.064449390330239e-08, 'epoch': 0.97} + 97%|█████████▋| 8478/8750 [2:24:48<25:37, 5.65s/it] 97%|█████████▋| 8479/8750 [2:24:54<25:35, 5.67s/it] 97%|█████████▋| 8479/8750 [2:24:56<25:35, 5.67s/it] {'loss': 0.4456, 'learning_rate': 5.02731042096305e-08, 'epoch': 0.97} + 97%|█████████▋| 8479/8750 [2:24:56<25:35, 5.67s/it] {'loss': 0.4456, 'learning_rate': 5.02731042096305e-08, 'epoch': 0.97} + 97%|█████████▋| 8479/8750 [2:24:54<25:35, 5.67s/it] 97%|█████████▋| 8480/8750 [2:24:59<25:27, 5.66s/it] 97%|█████████▋| 8480/8750 [2:25:01<25:27, 5.66s/it] {'loss': 0.4534, 'learning_rate': 4.99030778506826e-08, 'epoch': 0.97} + 97%|█████████▋| 8480/8750 [2:25:01<25:27, 5.66s/it] {'loss': 0.4534, 'learning_rate': 4.99030778506826e-08, 'epoch': 0.97} + 97%|█████████▋| 8480/8750 [2:24:59<25:27, 5.66s/it] 97%|█████████▋| 8481/8750 [2:25:07<25:15, 5.63s/it] 97%|█████████▋| 8481/8750 [2:25:05<25:15, 5.63s/it] {'loss': 0.4505, 'learning_rate': 4.953441487716037e-08, 'epoch': 0.97} + 97%|█████████▋| 8481/8750 [2:25:07<25:15, 5.63s/it] {'loss': 0.4505, 'learning_rate': 4.953441487716037e-08, 'epoch': 0.97} + 97%|█████████▋| 8481/8750 [2:25:05<25:15, 5.63s/it] 97%|█████████▋| 8482/8750 [2:25:11<25:01, 5.60s/it] 97%|█████████▋| 8482/8750 [2:25:12<25:02, 5.60s/it] {'loss': 0.461, 'learning_rate': 4.9167115339580074e-08, 'epoch': 0.97} + 97%|█████████▋| 8482/8750 
[2:25:12<25:02, 5.60s/it] {'loss': 0.461, 'learning_rate': 4.9167115339580074e-08, 'epoch': 0.97} + 97%|█████████▋| 8482/8750 [2:25:11<25:01, 5.60s/it] 97%|█████████▋| 8483/8750 [2:25:16<25:07, 5.65s/it] 97%|█████████▋| 8483/8750 [2:25:18<25:07, 5.65s/it] {'loss': 0.4449, 'learning_rate': 4.8801179288268105e-08, 'epoch': 0.97} + 97%|█████████▋| 8483/8750 [2:25:18<25:07, 5.65s/it] {'loss': 0.4449, 'learning_rate': 4.8801179288268105e-08, 'epoch': 0.97} + 97%|█████████▋| 8483/8750 [2:25:16<25:07, 5.65s/it] 97%|█████████▋| 8484/8750 [2:25:22<24:58, 5.63s/it] 97%|█████████▋| 8484/8750 [2:25:24<24:58, 5.63s/it] {'loss': 0.451, 'learning_rate': 4.84366067733677e-08, 'epoch': 0.97} + 97%|█████████▋| 8484/8750 [2:25:24<24:58, 5.63s/it] {'loss': 0.451, 'learning_rate': 4.84366067733677e-08, 'epoch': 0.97} + 97%|█████████▋| 8484/8750 [2:25:22<24:58, 5.63s/it] 97%|█████████▋| 8485/8750 [2:25:28<25:17, 5.72s/it] 97%|█████████▋| 8485/8750 [2:25:30<25:17, 5.72s/it] {'loss': 0.4375, 'learning_rate': 4.807339784483112e-08, 'epoch': 0.97} + 97%|█████████▋| 8485/8750 [2:25:30<25:17, 5.72s/it] {'loss': 0.4375, 'learning_rate': 4.807339784483112e-08, 'epoch': 0.97} + 97%|█████████▋| 8485/8750 [2:25:28<25:17, 5.72s/it] 97%|█████████▋| 8486/8750 [2:25:33<25:02, 5.69s/it] 97%|█████████▋| 8486/8750 [2:25:35<25:02, 5.69s/it] {'loss': 0.4622, 'learning_rate': 4.771155255242854e-08, 'epoch': 0.97} + 97%|█████████▋| 8486/8750 [2:25:35<25:02, 5.69s/it] {'loss': 0.4622, 'learning_rate': 4.771155255242854e-08, 'epoch': 0.97} + 97%|█████████▋| 8486/8750 [2:25:33<25:02, 5.69s/it] 97%|█████████▋| 8487/8750 [2:25:39<25:16, 5.76s/it] 97%|█████████▋| 8487/8750 [2:25:41<25:16, 5.76s/it] {'loss': 0.4462, 'learning_rate': 4.7351070945739206e-08, 'epoch': 0.97} + 97%|█████████▋| 8487/8750 [2:25:41<25:16, 5.76s/it] {'loss': 0.4462, 'learning_rate': 4.7351070945739206e-08, 'epoch': 0.97} + 97%|█████████▋| 8487/8750 [2:25:39<25:16, 5.76s/it] 97%|█████████▋| 8488/8750 [2:25:45<24:56, 5.71s/it] 97%|█████████▋| 
8488/8750 [2:25:47<24:56, 5.71s/it] {'loss': 0.4648, 'learning_rate': 4.699195307415805e-08, 'epoch': 0.97} + 97%|█████████▋| 8488/8750 [2:25:47<24:56, 5.71s/it] {'loss': 0.4648, 'learning_rate': 4.699195307415805e-08, 'epoch': 0.97} + 97%|█████████▋| 8488/8750 [2:25:45<24:56, 5.71s/it] 97%|█████████▋| 8489/8750 [2:25:51<24:48, 5.70s/it] 97%|█████████▋| 8489/8750 [2:25:53<24:48, 5.70s/it] {'loss': 0.4283, 'learning_rate': 4.663419898689125e-08, 'epoch': 0.97} + 97%|█████████▋| 8489/8750 [2:25:53<24:48, 5.70s/it] {'loss': 0.4283, 'learning_rate': 4.663419898689125e-08, 'epoch': 0.97} + 97%|█████████▋| 8489/8750 [2:25:51<24:48, 5.70s/it] 97%|█████████▋| 8490/8750 [2:25:56<24:47, 5.72s/it] 97%|█████████▋| 8490/8750 [2:25:58<24:47, 5.72s/it] {'loss': 0.4329, 'learning_rate': 4.6277808732959616e-08, 'epoch': 0.97} + 97%|█████████▋| 8490/8750 [2:25:58<24:47, 5.72s/it] {'loss': 0.4329, 'learning_rate': 4.6277808732959616e-08, 'epoch': 0.97} + 97%|█████████▋| 8490/8750 [2:25:56<24:47, 5.72s/it] 97%|█████████▋| 8491/8750 [2:26:04<24:43, 5.73s/it] 97%|█████████▋| 8491/8750 [2:26:02<24:43, 5.73s/it] {'loss': 0.4742, 'learning_rate': 4.5922782361197405e-08, 'epoch': 0.97} + 97%|█████████▋| 8491/8750 [2:26:04<24:43, 5.73s/it] {'loss': 0.4742, 'learning_rate': 4.5922782361197405e-08, 'epoch': 0.97} + 97%|█████████▋| 8491/8750 [2:26:02<24:43, 5.73s/it] 97%|█████████▋| 8492/8750 [2:26:08<24:36, 5.72s/it] 97%|█████████▋| 8492/8750 [2:26:10<24:36, 5.72s/it] {'loss': 0.4469, 'learning_rate': 4.556911992025015e-08, 'epoch': 0.97} + 97%|█████████▋| 8492/8750 [2:26:10<24:36, 5.72s/it] {'loss': 0.4469, 'learning_rate': 4.556911992025015e-08, 'epoch': 0.97} + 97%|█████████▋| 8492/8750 [2:26:08<24:36, 5.72s/it] 97%|█████████▋| 8493/8750 [2:26:14<24:31, 5.73s/it] 97%|█████████▋| 8493/8750 [2:26:16<24:31, 5.73s/it] {'loss': 0.4465, 'learning_rate': 4.521682145857797e-08, 'epoch': 0.97} + 97%|█████████▋| 8493/8750 [2:26:16<24:31, 5.73s/it] {'loss': 0.4465, 'learning_rate': 
4.521682145857797e-08, 'epoch': 0.97} + 97%|█████████▋| 8493/8750 [2:26:14<24:31, 5.73s/it] 97%|█████████▋| 8494/8750 [2:26:19<24:24, 5.72s/it] 97%|█████████▋| 8494/8750 [2:26:21<24:24, 5.72s/it] {'loss': 0.4458, 'learning_rate': 4.486588702445338e-08, 'epoch': 0.97} + 97%|█████████▋| 8494/8750 [2:26:21<24:24, 5.72s/it] {'loss': 0.4458, 'learning_rate': 4.486588702445338e-08, 'epoch': 0.97} + 97%|█████████▋| 8494/8750 [2:26:19<24:24, 5.72s/it] 97%|█████████▋| 8495/8750 [2:26:25<24:19, 5.72s/it] 97%|█████████▋| 8495/8750 [2:26:27<24:19, 5.72s/it] {'loss': 0.4394, 'learning_rate': 4.451631666596123e-08, 'epoch': 0.97} + 97%|█████████▋| 8495/8750 [2:26:27<24:19, 5.72s/it] {'loss': 0.4394, 'learning_rate': 4.451631666596123e-08, 'epoch': 0.97} + 97%|█████████▋| 8495/8750 [2:26:25<24:19, 5.72s/it] 97%|█████████▋| 8496/8750 [2:26:31<23:58, 5.66s/it] 97%|█████████▋| 8496/8750 [2:26:32<23:58, 5.66s/it] {'loss': 0.4785, 'learning_rate': 4.416811043100322e-08, 'epoch': 0.97} + 97%|█████████▋| 8496/8750 [2:26:32<23:58, 5.66s/it] {'loss': 0.4785, 'learning_rate': 4.416811043100322e-08, 'epoch': 0.97} + 97%|█████████▋| 8496/8750 [2:26:31<23:58, 5.66s/it] 97%|█████████▋| 8497/8750 [2:26:36<24:01, 5.70s/it] 97%|█████████▋| 8497/8750 [2:26:38<24:01, 5.70s/it] {'loss': 0.4551, 'learning_rate': 4.382126836728895e-08, 'epoch': 0.97} + {'loss': 0.4551, 'learning_rate': 4.382126836728895e-08, 'epoch': 0.97} 97%|█████████▋| 8497/8750 [2:26:38<24:01, 5.70s/it] + 97%|█████████▋| 8497/8750 [2:26:36<24:01, 5.70s/it] 97%|█████████▋| 8498/8750 [2:26:44<23:53, 5.69s/it] 97%|█████████▋| 8498/8750 [2:26:42<23:53, 5.69s/it] {'loss': 0.4671, 'learning_rate': 4.347579052234374e-08, 'epoch': 0.97} + 97%|█████████▋| 8498/8750 [2:26:44<23:53, 5.69s/it] {'loss': 0.4671, 'learning_rate': 4.347579052234374e-08, 'epoch': 0.97} + 97%|█████████▋| 8498/8750 [2:26:42<23:53, 5.69s/it] 97%|█████████▋| 8499/8750 [2:26:50<23:53, 5.71s/it] 97%|█████████▋| 8499/8750 [2:26:48<23:53, 5.71s/it] {'loss': 0.4613, 
'learning_rate': 4.3131676943506395e-08, 'epoch': 0.97} + 97%|█████████▋| 8499/8750 [2:26:50<23:53, 5.71s/it] {'loss': 0.4613, 'learning_rate': 4.3131676943506395e-08, 'epoch': 0.97} + 97%|█████████▋| 8499/8750 [2:26:48<23:53, 5.71s/it]12 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +61 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +159 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... + 97%|█████████▋| 8500/8750 [2:26:55<23:45, 5.70s/it]1013 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... + 97%|█████████▋| 8500/8750 [2:26:53<23:45, 5.70s/it]7 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.463, 'learning_rate': 4.278892767792808e-08, 'epoch': 0.97} + 97%|█████████▋| 8500/8750 [2:26:55<23:45, 5.70s/it]{'loss': 0.463, 'learning_rate': 4.278892767792808e-08, 'epoch': 0.97} + 97%|█████████▋| 8500/8750 [2:26:53<23:45, 5.70s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8500/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8500/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8500/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 97%|█████████▋| 8501/8750 [2:27:18<47:16, 11.39s/it] 97%|█████████▋| 8501/8750 [2:27:20<47:16, 11.39s/it]{'loss': 0.4322, 'learning_rate': 4.244754277257346e-08, 'epoch': 0.97} + {'loss': 0.4322, 'learning_rate': 4.244754277257346e-08, 'epoch': 0.97} + 97%|█████████▋| 8501/8750 [2:27:20<47:16, 11.39s/it] 97%|█████████▋| 8501/8750 [2:27:18<47:16, 11.39s/it] 97%|█████████▋| 8502/8750 [2:27:24<40:02, 9.69s/it] 97%|█████████▋| 8502/8750 [2:27:26<40:02, 9.69s/it] {'loss': 0.4353, 'learning_rate': 4.210752227421955e-08, 'epoch': 0.97} + 97%|█████████▋| 8502/8750 [2:27:26<40:02, 9.69s/it] {'loss': 0.4353, 'learning_rate': 4.210752227421955e-08, 'epoch': 0.97} + 97%|█████████▋| 8502/8750 [2:27:24<40:02, 9.69s/it] 97%|█████████▋| 8503/8750 [2:27:32<35:08, 8.54s/it] 97%|█████████▋| 8503/8750 [2:27:30<35:08, 8.54s/it] {'loss': 0.4607, 'learning_rate': 4.176886622945575e-08, 'epoch': 0.97} + 97%|█████████▋| 8503/8750 [2:27:32<35:08, 8.54s/it] {'loss': 0.4607, 'learning_rate': 4.176886622945575e-08, 'epoch': 0.97} + 97%|█████████▋| 8503/8750 [2:27:30<35:08, 8.54s/it] 97%|█████████▋| 8504/8750 [2:27:35<31:32, 7.69s/it] 97%|█████████▋| 8504/8750 [2:27:37<31:32, 7.69s/it] {'loss': 0.4652, 'learning_rate': 4.143157468468717e-08, 'epoch': 0.97} + 97%|█████████▋| 8504/8750 [2:27:37<31:32, 7.69s/it] {'loss': 0.4652, 'learning_rate': 4.143157468468717e-08, 'epoch': 0.97} + 97%|█████████▋| 8504/8750 [2:27:35<31:32, 7.69s/it] 97%|█████████▋| 8505/8750 [2:27:41<28:51, 7.07s/it] 97%|█████████▋| 8505/8750 [2:27:43<28:51, 7.07s/it] {'loss': 0.4465, 'learning_rate': 4.109564768613017e-08, 'epoch': 0.97} + 97%|█████████▋| 8505/8750 [2:27:43<28:51, 7.07s/it] {'loss': 0.4465, 'learning_rate': 4.109564768613017e-08, 'epoch': 0.97} + 97%|█████████▋| 8505/8750 [2:27:41<28:51, 7.07s/it] 97%|█████████▋| 8506/8750 [2:27:49<26:59, 6.64s/it] 97%|█████████▋| 8506/8750 [2:27:47<26:59, 6.64s/it] {'loss': 0.4399, 'learning_rate': 4.076108527981237e-08, 'epoch': 0.97} + 97%|█████████▋| 
8506/8750 [2:27:49<26:59, 6.64s/it] {'loss': 0.4399, 'learning_rate': 4.076108527981237e-08, 'epoch': 0.97} + 97%|█████████▋| 8506/8750 [2:27:47<26:59, 6.64s/it] 97%|█████████▋| 8507/8750 [2:27:54<25:53, 6.39s/it] 97%|█████████▋| 8507/8750 [2:27:52<25:53, 6.39s/it] {'loss': 0.4457, 'learning_rate': 4.0427887511578224e-08, 'epoch': 0.97} + 97%|█████████▋| 8507/8750 [2:27:54<25:53, 6.39s/it] {'loss': 0.4457, 'learning_rate': 4.0427887511578224e-08, 'epoch': 0.97} + 97%|█████████▋| 8507/8750 [2:27:52<25:53, 6.39s/it] 97%|█████████▋| 8508/8750 [2:28:00<24:59, 6.19s/it] 97%|█████████▋| 8508/8750 [2:27:58<24:59, 6.19s/it]{'loss': 0.4501, 'learning_rate': 4.009605442708231e-08, 'epoch': 0.97} + {'loss': 0.4501, 'learning_rate': 4.009605442708231e-08, 'epoch': 0.97} + 97%|█████████▋| 8508/8750 [2:28:00<24:59, 6.19s/it] 97%|█████████▋| 8508/8750 [2:27:58<24:59, 6.19s/it] 97%|█████████▋| 8509/8750 [2:28:06<24:22, 6.07s/it] 97%|█████████▋| 8509/8750 [2:28:04<24:22, 6.07s/it] {'loss': 0.4637, 'learning_rate': 3.976558607179382e-08, 'epoch': 0.97} + 97%|█████████▋| 8509/8750 [2:28:06<24:22, 6.07s/it] {'loss': 0.4637, 'learning_rate': 3.976558607179382e-08, 'epoch': 0.97} + 97%|█████████▋| 8509/8750 [2:28:04<24:22, 6.07s/it] 97%|█████████▋| 8510/8750 [2:28:12<23:47, 5.95s/it] 97%|█████████▋| 8510/8750 [2:28:10<23:47, 5.95s/it] {'loss': 0.4323, 'learning_rate': 3.943648249099319e-08, 'epoch': 0.97} + 97%|█████████▋| 8510/8750 [2:28:12<23:47, 5.95s/it] {'loss': 0.4323, 'learning_rate': 3.943648249099319e-08, 'epoch': 0.97} + 97%|█████████▋| 8510/8750 [2:28:10<23:47, 5.95s/it] 97%|█████████▋| 8511/8750 [2:28:17<23:24, 5.88s/it] 97%|█████████▋| 8511/8750 [2:28:15<23:24, 5.88s/it] {'loss': 0.4528, 'learning_rate': 3.910874372977658e-08, 'epoch': 0.97} + 97%|█████████▋| 8511/8750 [2:28:17<23:24, 5.88s/it] {'loss': 0.4528, 'learning_rate': 3.910874372977658e-08, 'epoch': 0.97} + 97%|█████████▋| 8511/8750 [2:28:15<23:24, 5.88s/it] 97%|█████████▋| 8512/8750 [2:28:23<22:57, 5.79s/it] 
97%|█████████▋| 8512/8750 [2:28:21<22:57, 5.79s/it] {'loss': 0.4539, 'learning_rate': 3.8782369833050284e-08, 'epoch': 0.97} + 97%|█████████▋| 8512/8750 [2:28:23<22:57, 5.79s/it] {'loss': 0.4539, 'learning_rate': 3.8782369833050284e-08, 'epoch': 0.97} + 97%|█████████▋| 8512/8750 [2:28:21<22:57, 5.79s/it] 97%|█████████▋| 8513/8750 [2:28:29<22:53, 5.80s/it] 97%|█████████▋| 8513/8750 [2:28:27<22:53, 5.80s/it] {'loss': 0.4405, 'learning_rate': 3.845736084553408e-08, 'epoch': 0.97} + 97%|█████████▋| 8513/8750 [2:28:29<22:53, 5.80s/it] {'loss': 0.4405, 'learning_rate': 3.845736084553408e-08, 'epoch': 0.97} + 97%|█████████▋| 8513/8750 [2:28:27<22:53, 5.80s/it] 97%|█████████▋| 8514/8750 [2:28:34<22:37, 5.75s/it] 97%|█████████▋| 8514/8750 [2:28:32<22:37, 5.75s/it] {'loss': 0.4558, 'learning_rate': 3.813371681176348e-08, 'epoch': 0.97} + 97%|█████████▋| 8514/8750 [2:28:34<22:37, 5.75s/it] {'loss': 0.4558, 'learning_rate': 3.813371681176348e-08, 'epoch': 0.97} + 97%|█████████▋| 8514/8750 [2:28:32<22:37, 5.75s/it] 97%|█████████▋| 8515/8750 [2:28:40<22:20, 5.70s/it] 97%|█████████▋| 8515/8750 [2:28:38<22:20, 5.70s/it] {'loss': 0.4546, 'learning_rate': 3.7811437776084095e-08, 'epoch': 0.97} + 97%|█████████▋| 8515/8750 [2:28:40<22:20, 5.70s/it] {'loss': 0.4546, 'learning_rate': 3.7811437776084095e-08, 'epoch': 0.97} + 97%|█████████▋| 8515/8750 [2:28:38<22:20, 5.70s/it] 97%|█████████▋| 8516/8750 [2:28:46<22:20, 5.73s/it] 97%|█████████▋| 8516/8750 [2:28:44<22:20, 5.73s/it] {'loss': 0.4363, 'learning_rate': 3.749052378265505e-08, 'epoch': 0.97} + 97%|█████████▋| 8516/8750 [2:28:46<22:20, 5.73s/it] {'loss': 0.4363, 'learning_rate': 3.749052378265505e-08, 'epoch': 0.97} + 97%|█████████▋| 8516/8750 [2:28:44<22:20, 5.73s/it] 97%|█████████▋| 8517/8750 [2:28:51<22:13, 5.72s/it] 97%|█████████▋| 8517/8750 [2:28:50<22:13, 5.73s/it] {'loss': 0.4546, 'learning_rate': 3.717097487545007e-08, 'epoch': 0.97} + 97%|█████████▋| 8517/8750 [2:28:51<22:13, 5.72s/it] {'loss': 0.4546, 'learning_rate': 
3.717097487545007e-08, 'epoch': 0.97} + 97%|█████████▋| 8517/8750 [2:28:50<22:13, 5.73s/it] 97%|█████████▋| 8518/8750 [2:28:57<22:04, 5.71s/it] 97%|█████████▋| 8518/8750 [2:28:55<22:04, 5.71s/it] {'loss': 0.4673, 'learning_rate': 3.6852791098251906e-08, 'epoch': 0.97} + 97%|█████████▋| 8518/8750 [2:28:57<22:04, 5.71s/it] {'loss': 0.4673, 'learning_rate': 3.6852791098251906e-08, 'epoch': 0.97} + 97%|█████████▋| 8518/8750 [2:28:55<22:04, 5.71s/it] 97%|█████████▋| 8519/8750 [2:29:03<22:19, 5.80s/it] 97%|█████████▋| 8519/8750 [2:29:01<22:19, 5.80s/it] {'loss': 0.4412, 'learning_rate': 3.653597249466012e-08, 'epoch': 0.97} + 97%|█████████▋| 8519/8750 [2:29:03<22:19, 5.80s/it] {'loss': 0.4412, 'learning_rate': 3.653597249466012e-08, 'epoch': 0.97} + 97%|█████████▋| 8519/8750 [2:29:01<22:19, 5.80s/it] 97%|█████████▋| 8520/8750 [2:29:09<22:10, 5.78s/it] 97%|█████████▋| 8520/8750 [2:29:07<22:10, 5.78s/it] {'loss': 0.4481, 'learning_rate': 3.622051910808666e-08, 'epoch': 0.97} + {'loss': 0.4481, 'learning_rate': 3.622051910808666e-08, 'epoch': 0.97} 97%|█████████▋| 8520/8750 [2:29:09<22:10, 5.78s/it] + 97%|█████████▋| 8520/8750 [2:29:07<22:10, 5.78s/it] 97%|█████████▋| 8521/8750 [2:29:14<21:53, 5.74s/it] 97%|█████████▋| 8521/8750 [2:29:13<21:53, 5.74s/it] {'loss': 0.4613, 'learning_rate': 3.5906430981754724e-08, 'epoch': 0.97} + 97%|█████████▋| 8521/8750 [2:29:14<21:53, 5.74s/it] {'loss': 0.4613, 'learning_rate': 3.5906430981754724e-08, 'epoch': 0.97} + 97%|█████████▋| 8521/8750 [2:29:13<21:53, 5.74s/it] 97%|█████████▋| 8522/8750 [2:29:18<21:52, 5.75s/it] 97%|█████████▋| 8522/8750 [2:29:20<21:52, 5.75s/it] {'loss': 0.4789, 'learning_rate': 3.559370815870211e-08, 'epoch': 0.97} + 97%|█████████▋| 8522/8750 [2:29:20<21:52, 5.75s/it] {'loss': 0.4789, 'learning_rate': 3.559370815870211e-08, 'epoch': 0.97} + 97%|█████████▋| 8522/8750 [2:29:18<21:52, 5.75s/it] 97%|█████████▋| 8523/8750 [2:29:26<21:57, 5.80s/it] 97%|█████████▋| 8523/8750 [2:29:24<21:57, 5.80s/it] {'loss': 0.4422, 
'learning_rate': 3.528235068177899e-08, 'epoch': 0.97} + 97%|█████████▋| 8523/8750 [2:29:26<21:57, 5.80s/it] {'loss': 0.4422, 'learning_rate': 3.528235068177899e-08, 'epoch': 0.97} + 97%|█████████▋| 8523/8750 [2:29:24<21:57, 5.80s/it] 97%|█████████▋| 8524/8750 [2:29:32<21:50, 5.80s/it] 97%|█████████▋| 8524/8750 [2:29:30<21:50, 5.80s/it] {'loss': 0.4468, 'learning_rate': 3.4972358593646785e-08, 'epoch': 0.97} + 97%|█████████▋| 8524/8750 [2:29:32<21:50, 5.80s/it] {'loss': 0.4468, 'learning_rate': 3.4972358593646785e-08, 'epoch': 0.97} + 97%|█████████▋| 8524/8750 [2:29:30<21:50, 5.80s/it] 97%|█████████▋| 8525/8750 [2:29:38<21:46, 5.81s/it] 97%|█████████▋| 8525/8750 [2:29:36<21:46, 5.81s/it] {'loss': 0.4509, 'learning_rate': 3.466373193678263e-08, 'epoch': 0.97} + 97%|█████████▋| 8525/8750 [2:29:38<21:46, 5.81s/it] {'loss': 0.4509, 'learning_rate': 3.466373193678263e-08, 'epoch': 0.97} + 97%|█████████▋| 8525/8750 [2:29:36<21:46, 5.81s/it] 97%|█████████▋| 8526/8750 [2:29:44<21:36, 5.79s/it] 97%|█████████▋| 8526/8750 [2:29:42<21:36, 5.79s/it] {'loss': 0.4454, 'learning_rate': 3.4356470753474927e-08, 'epoch': 0.97} + 97%|█████████▋| 8526/8750 [2:29:44<21:36, 5.79s/it] {'loss': 0.4454, 'learning_rate': 3.4356470753474927e-08, 'epoch': 0.97} + 97%|█████████▋| 8526/8750 [2:29:42<21:36, 5.79s/it] 97%|█████████▋| 8527/8750 [2:29:50<22:03, 5.93s/it] 97%|█████████▋| 8527/8750 [2:29:48<22:03, 5.93s/it] {'loss': 0.4475, 'learning_rate': 3.4050575085825546e-08, 'epoch': 0.97} + 97%|█████████▋| 8527/8750 [2:29:50<22:03, 5.93s/it] {'loss': 0.4475, 'learning_rate': 3.4050575085825546e-08, 'epoch': 0.97} + 97%|█████████▋| 8527/8750 [2:29:48<22:03, 5.93s/it] 97%|█████████▋| 8528/8750 [2:29:55<21:36, 5.84s/it] 97%|█████████▋| 8528/8750 [2:29:54<21:36, 5.84s/it] {'loss': 0.4434, 'learning_rate': 3.3746044975749845e-08, 'epoch': 0.97} + 97%|█████████▋| 8528/8750 [2:29:55<21:36, 5.84s/it] {'loss': 0.4434, 'learning_rate': 3.3746044975749845e-08, 'epoch': 0.97} + 97%|█████████▋| 8528/8750 
[2:29:54<21:36, 5.84s/it] 97%|█████████▋| 8529/8750 [2:29:59<21:18, 5.78s/it] 97%|█████████▋| 8529/8750 [2:30:01<21:18, 5.78s/it] {'loss': 0.4462, 'learning_rate': 3.3442880464972237e-08, 'epoch': 0.97} + {'loss': 0.4462, 'learning_rate': 3.3442880464972237e-08, 'epoch': 0.97} + 97%|█████████▋| 8529/8750 [2:30:01<21:18, 5.78s/it] 97%|█████████▋| 8529/8750 [2:29:59<21:18, 5.78s/it] 97%|█████████▋| 8530/8750 [2:30:07<20:53, 5.70s/it] 97%|█████████▋| 8530/8750 [2:30:05<20:53, 5.70s/it] {'loss': 0.4664, 'learning_rate': 3.314108159503726e-08, 'epoch': 0.97} + 97%|█████████▋| 8530/8750 [2:30:07<20:53, 5.70s/it] {'loss': 0.4664, 'learning_rate': 3.314108159503726e-08, 'epoch': 0.97} + 97%|█████████▋| 8530/8750 [2:30:05<20:53, 5.70s/it] 97%|█████████▋| 8531/8750 [2:30:12<20:46, 5.69s/it] 97%|█████████▋| 8531/8750 [2:30:10<20:46, 5.69s/it] {'loss': 0.453, 'learning_rate': 3.284064840729406e-08, 'epoch': 0.97} + 97%|█████████▋| 8531/8750 [2:30:12<20:46, 5.69s/it] {'loss': 0.453, 'learning_rate': 3.284064840729406e-08, 'epoch': 0.97} + 97%|█████████▋| 8531/8750 [2:30:10<20:46, 5.69s/it] 98%|█████████▊| 8532/8750 [2:30:18<20:36, 5.67s/it] 98%|█████████▊| 8532/8750 [2:30:16<20:36, 5.67s/it] {'loss': 0.4415, 'learning_rate': 3.2541580942911935e-08, 'epoch': 0.98} + 98%|█████████▊| 8532/8750 [2:30:18<20:36, 5.67s/it] {'loss': 0.4415, 'learning_rate': 3.2541580942911935e-08, 'epoch': 0.98} + 98%|█████████▊| 8532/8750 [2:30:16<20:36, 5.67s/it] 98%|█████████▊| 8533/8750 [2:30:24<20:59, 5.81s/it] 98%|█████████▊| 8533/8750 [2:30:22<20:59, 5.81s/it] {'loss': 0.4437, 'learning_rate': 3.224387924286698e-08, 'epoch': 0.98} + 98%|█████████▊| 8533/8750 [2:30:24<20:59, 5.81s/it] {'loss': 0.4437, 'learning_rate': 3.224387924286698e-08, 'epoch': 0.98} + 98%|█████████▊| 8533/8750 [2:30:22<20:59, 5.81s/it] 98%|█████████▊| 8534/8750 [2:30:30<20:44, 5.76s/it] 98%|█████████▊| 8534/8750 [2:30:28<20:44, 5.76s/it] {'loss': 0.4495, 'learning_rate': 3.1947543347953246e-08, 'epoch': 0.98} + 
98%|█████████▊| 8534/8750 [2:30:30<20:44, 5.76s/it] {'loss': 0.4495, 'learning_rate': 3.1947543347953246e-08, 'epoch': 0.98} + 98%|█████████▊| 8534/8750 [2:30:28<20:44, 5.76s/it] 98%|█████████▊| 8535/8750 [2:30:36<21:00, 5.86s/it] 98%|█████████▊| 8535/8750 [2:30:34<21:00, 5.86s/it] {'loss': 0.4552, 'learning_rate': 3.1652573298774916e-08, 'epoch': 0.98} + 98%|█████████▊| 8535/8750 [2:30:36<21:00, 5.86s/it] {'loss': 0.4552, 'learning_rate': 3.1652573298774916e-08, 'epoch': 0.98} + 98%|█████████▊| 8535/8750 [2:30:34<21:00, 5.86s/it] 98%|█████████▊| 8536/8750 [2:30:41<20:40, 5.80s/it] 98%|█████████▊| 8536/8750 [2:30:39<20:40, 5.80s/it] {'loss': 0.4514, 'learning_rate': 3.135896913574743e-08, 'epoch': 0.98} + {'loss': 0.4514, 'learning_rate': 3.135896913574743e-08, 'epoch': 0.98} 98%|█████████▊| 8536/8750 [2:30:41<20:40, 5.80s/it] + 98%|█████████▊| 8536/8750 [2:30:39<20:40, 5.80s/it] 98%|█████████▊| 8537/8750 [2:30:47<20:23, 5.74s/it] 98%|█████████▊| 8537/8750 [2:30:45<20:23, 5.74s/it] {'loss': 0.4368, 'learning_rate': 3.106673089910417e-08, 'epoch': 0.98} + 98%|█████████▊| 8537/8750 [2:30:47<20:23, 5.74s/it] {'loss': 0.4368, 'learning_rate': 3.106673089910417e-08, 'epoch': 0.98} + 98%|█████████▊| 8537/8750 [2:30:45<20:23, 5.74s/it] 98%|█████████▊| 8538/8750 [2:30:53<20:03, 5.68s/it] 98%|█████████▊| 8538/8750 [2:30:51<20:03, 5.68s/it] {'loss': 0.4588, 'learning_rate': 3.077585862888643e-08, 'epoch': 0.98} + 98%|█████████▊| 8538/8750 [2:30:53<20:03, 5.68s/it] {'loss': 0.4588, 'learning_rate': 3.077585862888643e-08, 'epoch': 0.98} + 98%|█████████▊| 8538/8750 [2:30:51<20:03, 5.68s/it] 98%|█████████▊| 8539/8750 [2:30:58<20:02, 5.70s/it] 98%|█████████▊| 8539/8750 [2:30:56<20:02, 5.70s/it] {'loss': 0.446, 'learning_rate': 3.048635236495012e-08, 'epoch': 0.98} + 98%|█████████▊| 8539/8750 [2:30:58<20:02, 5.70s/it] {'loss': 0.446, 'learning_rate': 3.048635236495012e-08, 'epoch': 0.98} + 98%|█████████▊| 8539/8750 [2:30:56<20:02, 5.70s/it] 98%|█████████▊| 8540/8750 
[2:31:04<19:57, 5.70s/it] 98%|█████████▊| 8540/8750 [2:31:02<19:57, 5.70s/it] {'loss': 0.4568, 'learning_rate': 3.019821214696572e-08, 'epoch': 0.98} + 98%|█████████▊| 8540/8750 [2:31:04<19:57, 5.70s/it] {'loss': 0.4568, 'learning_rate': 3.019821214696572e-08, 'epoch': 0.98} + 98%|█████████▊| 8540/8750 [2:31:02<19:57, 5.70s/it] 98%|█████████▊| 8541/8750 [2:31:10<19:45, 5.67s/it] 98%|█████████▊| 8541/8750 [2:31:08<19:45, 5.67s/it] {'loss': 0.4559, 'learning_rate': 2.9911438014412765e-08, 'epoch': 0.98} + 98%|█████████▊| 8541/8750 [2:31:10<19:45, 5.67s/it] {'loss': 0.4559, 'learning_rate': 2.9911438014412765e-08, 'epoch': 0.98} + 98%|█████████▊| 8541/8750 [2:31:08<19:45, 5.67s/it] 98%|█████████▊| 8542/8750 [2:31:13<19:45, 5.70s/it] 98%|█████████▊| 8542/8750 [2:31:15<19:45, 5.70s/it] {'loss': 0.4478, 'learning_rate': 2.962603000658648e-08, 'epoch': 0.98} + 98%|█████████▊| 8542/8750 [2:31:15<19:45, 5.70s/it] {'loss': 0.4478, 'learning_rate': 2.962603000658648e-08, 'epoch': 0.98} + 98%|█████████▊| 8542/8750 [2:31:13<19:45, 5.70s/it] 98%|█████████▊| 8543/8750 [2:31:19<19:37, 5.69s/it] 98%|█████████▊| 8543/8750 [2:31:21<19:37, 5.69s/it] {'loss': 0.458, 'learning_rate': 2.9341988162595593e-08, 'epoch': 0.98} + {'loss': 0.458, 'learning_rate': 2.9341988162595593e-08, 'epoch': 0.98} 98%|█████████▊| 8543/8750 [2:31:21<19:37, 5.69s/it] + 98%|█████████▊| 8543/8750 [2:31:19<19:37, 5.69s/it] 98%|█████████▊| 8544/8750 [2:31:27<19:43, 5.75s/it] 98%|█████████▊| 8544/8750 [2:31:25<19:43, 5.75s/it] {'loss': 0.4289, 'learning_rate': 2.905931252135785e-08, 'epoch': 0.98} + 98%|█████████▊| 8544/8750 [2:31:27<19:43, 5.75s/it] {'loss': 0.4289, 'learning_rate': 2.905931252135785e-08, 'epoch': 0.98} + 98%|█████████▊| 8544/8750 [2:31:25<19:43, 5.75s/it] 98%|█████████▊| 8545/8750 [2:31:33<19:36, 5.74s/it] 98%|█████████▊| 8545/8750 [2:31:31<19:36, 5.74s/it] {'loss': 0.4586, 'learning_rate': 2.8778003121607834e-08, 'epoch': 0.98} + 98%|█████████▊| 8545/8750 [2:31:33<19:36, 5.74s/it] {'loss': 
0.4586, 'learning_rate': 2.8778003121607834e-08, 'epoch': 0.98} + 98%|█████████▊| 8545/8750 [2:31:31<19:36, 5.74s/it] 98%|█████████▊| 8546/8750 [2:31:38<19:28, 5.73s/it] 98%|█████████▊| 8546/8750 [2:31:36<19:28, 5.73s/it] {'loss': 0.4475, 'learning_rate': 2.849806000189026e-08, 'epoch': 0.98} + 98%|█████████▊| 8546/8750 [2:31:38<19:28, 5.73s/it] {'loss': 0.4475, 'learning_rate': 2.849806000189026e-08, 'epoch': 0.98} + 98%|█████████▊| 8546/8750 [2:31:36<19:28, 5.73s/it] 98%|█████████▊| 8547/8750 [2:31:44<19:16, 5.70s/it] 98%|█████████▊| 8547/8750 [2:31:42<19:16, 5.70s/it] {'loss': 0.4482, 'learning_rate': 2.8219483200563334e-08, 'epoch': 0.98} + 98%|█████████▊| 8547/8750 [2:31:44<19:16, 5.70s/it] {'loss': 0.4482, 'learning_rate': 2.8219483200563334e-08, 'epoch': 0.98} + 98%|█████████▊| 8547/8750 [2:31:42<19:16, 5.70s/it] 98%|█████████▊| 8548/8750 [2:31:50<19:20, 5.75s/it] 98%|█████████▊| 8548/8750 [2:31:48<19:20, 5.74s/it] {'loss': 0.4307, 'learning_rate': 2.794227275579986e-08, 'epoch': 0.98} + 98%|█████████▊| 8548/8750 [2:31:50<19:20, 5.75s/it] {'loss': 0.4307, 'learning_rate': 2.794227275579986e-08, 'epoch': 0.98} + 98%|█████████▊| 8548/8750 [2:31:48<19:20, 5.74s/it] 98%|█████████▊| 8549/8750 [2:31:56<19:13, 5.74s/it] 98%|█████████▊| 8549/8750 [2:31:54<19:13, 5.74s/it] {'loss': 0.4794, 'learning_rate': 2.766642870558278e-08, 'epoch': 0.98} + 98%|█████████▊| 8549/8750 [2:31:56<19:13, 5.74s/it] {'loss': 0.4794, 'learning_rate': 2.766642870558278e-08, 'epoch': 0.98} + 98%|█████████▊| 8549/8750 [2:31:54<19:13, 5.74s/it]5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +89 AutoResumeHook: Checking whether to suspend... +AutoResumeHook: Checking whether to suspend... 
+ 98%|█████████▊| 8550/8750 [2:32:01<19:08, 5.74s/it]10 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +014 AutoResumeHook: Checking whether to suspend... + 13 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +7 AutoResumeHook: Checking whether to suspend... + 98%|█████████▊| 8550/8750 [2:31:59<19:08, 5.74s/it]3 AutoResumeHook: Checking whether to suspend... + {'loss': 0.4402, 'learning_rate': 2.7391951087708534e-08, 'epoch': 0.98} + 98%|█████████▊| 8550/8750 [2:32:01<19:08, 5.74s/it] {'loss': 0.4402, 'learning_rate': 2.7391951087708534e-08, 'epoch': 0.98} + 98%|█████████▊| 8550/8750 [2:31:59<19:08, 5.74s/it] 98%|█████████▊| 8551/8750 [2:32:07<19:09, 5.78s/it] 98%|█████████▊| 8551/8750 [2:32:05<19:09, 5.78s/it] {'loss': 0.4402, 'learning_rate': 2.7118839939787033e-08, 'epoch': 0.98} + 98%|█████████▊| 8551/8750 [2:32:07<19:09, 5.78s/it] {'loss': 0.4402, 'learning_rate': 2.7118839939787033e-08, 'epoch': 0.98} + 98%|█████████▊| 8551/8750 [2:32:05<19:09, 5.78s/it] 98%|█████████▊| 8552/8750 [2:32:13<18:56, 5.74s/it] 98%|█████████▊| 8552/8750 [2:32:11<18:56, 5.74s/it] {'loss': 0.4382, 'learning_rate': 2.6847095299241678e-08, 'epoch': 0.98} + 98%|█████████▊| 8552/8750 [2:32:13<18:56, 5.74s/it] {'loss': 0.4382, 'learning_rate': 2.6847095299241678e-08, 'epoch': 0.98} + 98%|█████████▊| 8552/8750 [2:32:11<18:56, 5.74s/it] 98%|█████████▊| 8553/8750 [2:32:17<18:45, 5.71s/it] 98%|█████████▊| 8553/8750 [2:32:18<18:45, 5.71s/it] {'loss': 0.4644, 'learning_rate': 2.6576717203304904e-08, 'epoch': 0.98} + {'loss': 0.4644, 'learning_rate': 2.6576717203304904e-08, 'epoch': 0.98} + 98%|█████████▊| 8553/8750 [2:32:18<18:45, 5.71s/it] 98%|█████████▊| 8553/8750 [2:32:17<18:45, 5.71s/it] 98%|█████████▊| 8554/8750 [2:32:24<18:30, 5.67s/it] 98%|█████████▊| 8554/8750 [2:32:22<18:30, 5.67s/it] {'loss': 0.4639, 'learning_rate': 2.6307705689028184e-08, 
'epoch': 0.98} + 98%|█████████▊| 8554/8750 [2:32:24<18:30, 5.67s/it] {'loss': 0.4639, 'learning_rate': 2.6307705689028184e-08, 'epoch': 0.98} + 98%|█████████▊| 8554/8750 [2:32:22<18:30, 5.67s/it] 98%|█████████▊| 8555/8750 [2:32:30<18:25, 5.67s/it] 98%|█████████▊| 8555/8750 [2:32:28<18:25, 5.67s/it] {'loss': 0.4479, 'learning_rate': 2.6040060793268705e-08, 'epoch': 0.98} + 98%|█████████▊| 8555/8750 [2:32:30<18:25, 5.67s/it] {'loss': 0.4479, 'learning_rate': 2.6040060793268705e-08, 'epoch': 0.98} + 98%|█████████▊| 8555/8750 [2:32:28<18:25, 5.67s/it] 98%|█████████▊| 8556/8750 [2:32:35<18:19, 5.67s/it] 98%|█████████▊| 8556/8750 [2:32:33<18:19, 5.67s/it] {'loss': 0.4348, 'learning_rate': 2.5773782552701578e-08, 'epoch': 0.98} + 98%|█████████▊| 8556/8750 [2:32:35<18:19, 5.67s/it] {'loss': 0.4348, 'learning_rate': 2.5773782552701578e-08, 'epoch': 0.98} + 98%|█████████▊| 8556/8750 [2:32:33<18:19, 5.67s/it] 98%|█████████▊| 8557/8750 [2:32:39<18:10, 5.65s/it] 98%|█████████▊| 8557/8750 [2:32:41<18:10, 5.65s/it] {'loss': 0.436, 'learning_rate': 2.550887100381205e-08, 'epoch': 0.98} + {'loss': 0.436, 'learning_rate': 2.550887100381205e-08, 'epoch': 0.98} + 98%|█████████▊| 8557/8750 [2:32:41<18:10, 5.65s/it] 98%|█████████▊| 8557/8750 [2:32:39<18:10, 5.65s/it] 98%|█████████▊| 8558/8750 [2:32:47<18:03, 5.64s/it] 98%|█████████▊| 8558/8750 [2:32:45<18:03, 5.64s/it] {'loss': 0.4596, 'learning_rate': 2.5245326182899987e-08, 'epoch': 0.98} + 98%|█████████▊| 8558/8750 [2:32:47<18:03, 5.64s/it] {'loss': 0.4596, 'learning_rate': 2.5245326182899987e-08, 'epoch': 0.98} + 98%|█████████▊| 8558/8750 [2:32:45<18:03, 5.64s/it] 98%|█████████▊| 8559/8750 [2:32:52<18:00, 5.65s/it] 98%|█████████▊| 8559/8750 [2:32:50<18:00, 5.66s/it] {'loss': 0.4529, 'learning_rate': 2.4983148126076494e-08, 'epoch': 0.98} + 98%|█████████▊| 8559/8750 [2:32:52<18:00, 5.65s/it] {'loss': 0.4529, 'learning_rate': 2.4983148126076494e-08, 'epoch': 0.98} + 98%|█████████▊| 8559/8750 [2:32:50<18:00, 5.66s/it] 98%|█████████▊| 
8560/8750 [2:32:58<18:05, 5.71s/it] 98%|█████████▊| 8560/8750 [2:32:56<18:05, 5.71s/it] {'loss': 0.4305, 'learning_rate': 2.4722336869265063e-08, 'epoch': 0.98} + 98%|█████████▊| 8560/8750 [2:32:58<18:05, 5.71s/it] {'loss': 0.4305, 'learning_rate': 2.4722336869265063e-08, 'epoch': 0.98} + 98%|█████████▊| 8560/8750 [2:32:56<18:05, 5.71s/it] 98%|█████████▊| 8561/8750 [2:33:02<17:52, 5.67s/it] 98%|█████████▊| 8561/8750 [2:33:04<17:52, 5.67s/it] {'loss': 0.4697, 'learning_rate': 2.4462892448202657e-08, 'epoch': 0.98} + {'loss': 0.4697, 'learning_rate': 2.4462892448202657e-08, 'epoch': 0.98} + 98%|█████████▊| 8561/8750 [2:33:04<17:52, 5.67s/it] 98%|█████████▊| 8561/8750 [2:33:02<17:52, 5.67s/it] 98%|█████████▊| 8562/8750 [2:33:09<17:47, 5.68s/it] 98%|█████████▊| 8562/8750 [2:33:07<17:47, 5.68s/it] {'loss': 0.4521, 'learning_rate': 2.4204814898440844e-08, 'epoch': 0.98} + 98%|█████████▊| 8562/8750 [2:33:09<17:47, 5.68s/it] {'loss': 0.4521, 'learning_rate': 2.4204814898440844e-08, 'epoch': 0.98} + 98%|█████████▊| 8562/8750 [2:33:07<17:47, 5.68s/it] 98%|█████████▊| 8563/8750 [2:33:15<17:39, 5.67s/it] 98%|█████████▊| 8563/8750 [2:33:13<17:39, 5.67s/it] {'loss': 0.4691, 'learning_rate': 2.394810425534022e-08, 'epoch': 0.98} + 98%|█████████▊| 8563/8750 [2:33:15<17:39, 5.67s/it] {'loss': 0.4691, 'learning_rate': 2.394810425534022e-08, 'epoch': 0.98} + 98%|█████████▊| 8563/8750 [2:33:13<17:39, 5.67s/it] 98%|█████████▊| 8564/8750 [2:33:19<17:45, 5.73s/it] 98%|█████████▊| 8564/8750 [2:33:21<17:45, 5.73s/it] {'loss': 0.4614, 'learning_rate': 2.369276055407599e-08, 'epoch': 0.98} + 98%|█████████▊| 8564/8750 [2:33:21<17:45, 5.73s/it] {'loss': 0.4614, 'learning_rate': 2.369276055407599e-08, 'epoch': 0.98} + 98%|█████████▊| 8564/8750 [2:33:19<17:45, 5.73s/it] 98%|█████████▊| 8565/8750 [2:33:26<17:31, 5.69s/it] 98%|█████████▊| 8565/8750 [2:33:25<17:31, 5.69s/it] {'loss': 0.4483, 'learning_rate': 2.3438783829635714e-08, 'epoch': 0.98} + 98%|█████████▊| 8565/8750 [2:33:26<17:31, 
5.69s/it] {'loss': 0.4483, 'learning_rate': 2.3438783829635714e-08, 'epoch': 0.98} + 98%|█████████▊| 8565/8750 [2:33:25<17:31, 5.69s/it] 98%|█████████▊| 8566/8750 [2:33:32<17:24, 5.68s/it] 98%|█████████▊| 8566/8750 [2:33:30<17:24, 5.68s/it] {'loss': 0.4659, 'learning_rate': 2.318617411682156e-08, 'epoch': 0.98} + 98%|█████████▊| 8566/8750 [2:33:32<17:24, 5.68s/it] {'loss': 0.4659, 'learning_rate': 2.318617411682156e-08, 'epoch': 0.98} + 98%|█████████▊| 8566/8750 [2:33:30<17:24, 5.68s/it] 98%|█████████▊| 8567/8750 [2:33:36<17:20, 5.68s/it] 98%|█████████▊| 8567/8750 [2:33:38<17:20, 5.68s/it] {'loss': 0.4533, 'learning_rate': 2.2934931450245833e-08, 'epoch': 0.98} + 98%|█████████▊| 8567/8750 [2:33:38<17:20, 5.68s/it] {'loss': 0.4533, 'learning_rate': 2.2934931450245833e-08, 'epoch': 0.98} + 98%|█████████▊| 8567/8750 [2:33:36<17:20, 5.68s/it] 98%|█████████▊| 8568/8750 [2:33:42<17:52, 5.89s/it] 98%|█████████▊| 8568/8750 [2:33:44<17:52, 5.89s/it]{'loss': 0.4534, 'learning_rate': 2.2685055864333227e-08, 'epoch': 0.98} + {'loss': 0.4534, 'learning_rate': 2.2685055864333227e-08, 'epoch': 0.98} + 98%|█████████▊| 8568/8750 [2:33:44<17:52, 5.89s/it] 98%|█████████▊| 8568/8750 [2:33:42<17:52, 5.89s/it] 98%|█████████▊| 8569/8750 [2:33:50<17:29, 5.80s/it] 98%|█████████▊| 8569/8750 [2:33:48<17:29, 5.80s/it] {'loss': 0.4475, 'learning_rate': 2.2436547393323017e-08, 'epoch': 0.98} + 98%|█████████▊| 8569/8750 [2:33:50<17:29, 5.80s/it] {'loss': 0.4475, 'learning_rate': 2.2436547393323017e-08, 'epoch': 0.98} + 98%|█████████▊| 8569/8750 [2:33:48<17:29, 5.80s/it] 98%|█████████▊| 8570/8750 [2:33:56<17:20, 5.78s/it] 98%|█████████▊| 8570/8750 [2:33:54<17:20, 5.78s/it] {'loss': 0.4378, 'learning_rate': 2.218940607126685e-08, 'epoch': 0.98} + 98%|█████████▊| 8570/8750 [2:33:56<17:20, 5.78s/it] {'loss': 0.4378, 'learning_rate': 2.218940607126685e-08, 'epoch': 0.98} + 98%|█████████▊| 8570/8750 [2:33:54<17:20, 5.78s/it] 98%|█████████▊| 8571/8750 [2:34:01<17:11, 5.76s/it] 98%|█████████▊| 8571/8750 
[2:33:59<17:11, 5.76s/it] {'loss': 0.4555, 'learning_rate': 2.1943631932028752e-08, 'epoch': 0.98} + 98%|█████████▊| 8571/8750 [2:34:01<17:11, 5.76s/it] {'loss': 0.4555, 'learning_rate': 2.1943631932028752e-08, 'epoch': 0.98} + 98%|█████████▊| 8571/8750 [2:33:59<17:11, 5.76s/it] 98%|█████████▊| 8572/8750 [2:34:05<17:06, 5.77s/it] 98%|█████████▊| 8572/8750 [2:34:07<17:06, 5.77s/it] {'loss': 0.4596, 'learning_rate': 2.169922500928512e-08, 'epoch': 0.98} + 98%|█████████▊| 8572/8750 [2:34:07<17:06, 5.77s/it] {'loss': 0.4596, 'learning_rate': 2.169922500928512e-08, 'epoch': 0.98} + 98%|█████████▊| 8572/8750 [2:34:05<17:06, 5.77s/it] 98%|█████████▊| 8573/8750 [2:34:13<16:54, 5.73s/it] 98%|█████████▊| 8573/8750 [2:34:11<16:54, 5.73s/it] {'loss': 0.4347, 'learning_rate': 2.1456185336524714e-08, 'epoch': 0.98} + 98%|█████████▊| 8573/8750 [2:34:13<16:54, 5.73s/it] {'loss': 0.4347, 'learning_rate': 2.1456185336524714e-08, 'epoch': 0.98} + 98%|█████████▊| 8573/8750 [2:34:11<16:54, 5.73s/it] 98%|█████████▊| 8574/8750 [2:34:18<16:37, 5.67s/it] 98%|█████████▊| 8574/8750 [2:34:16<16:37, 5.67s/it] {'loss': 0.4469, 'learning_rate': 2.1214512947048684e-08, 'epoch': 0.98} + 98%|█████████▊| 8574/8750 [2:34:18<16:37, 5.67s/it] {'loss': 0.4469, 'learning_rate': 2.1214512947048684e-08, 'epoch': 0.98} + 98%|█████████▊| 8574/8750 [2:34:16<16:37, 5.67s/it] 98%|█████████▊| 8575/8750 [2:34:24<16:30, 5.66s/it] 98%|█████████▊| 8575/8750 [2:34:22<16:30, 5.66s/it] {'loss': 0.4515, 'learning_rate': 2.097420787397275e-08, 'epoch': 0.98} + 98%|█████████▊| 8575/8750 [2:34:24<16:30, 5.66s/it] {'loss': 0.4515, 'learning_rate': 2.097420787397275e-08, 'epoch': 0.98} + 98%|█████████▊| 8575/8750 [2:34:22<16:30, 5.66s/it] 98%|█████████▊| 8576/8750 [2:34:30<16:30, 5.70s/it] 98%|█████████▊| 8576/8750 [2:34:28<16:30, 5.70s/it] {'loss': 0.4495, 'learning_rate': 2.0735270150223917e-08, 'epoch': 0.98} + 98%|█████████▊| 8576/8750 [2:34:30<16:30, 5.70s/it] {'loss': 0.4495, 'learning_rate': 2.0735270150223917e-08, 
'epoch': 0.98} + 98%|█████████▊| 8576/8750 [2:34:28<16:30, 5.70s/it] 98%|█████████▊| 8577/8750 [2:34:35<16:26, 5.70s/it] 98%|█████████▊| 8577/8750 [2:34:33<16:26, 5.70s/it] {'loss': 0.4537, 'learning_rate': 2.0497699808542658e-08, 'epoch': 0.98} + 98%|█████████▊| 8577/8750 [2:34:35<16:26, 5.70s/it] {'loss': 0.4537, 'learning_rate': 2.0497699808542658e-08, 'epoch': 0.98} + 98%|█████████▊| 8577/8750 [2:34:33<16:26, 5.70s/it] 98%|█████████▊| 8578/8750 [2:34:41<16:14, 5.67s/it] 98%|█████████▊| 8578/8750 [2:34:39<16:14, 5.67s/it] {'loss': 0.4443, 'learning_rate': 2.0261496881479605e-08, 'epoch': 0.98} + 98%|█████████▊| 8578/8750 [2:34:41<16:14, 5.67s/it] {'loss': 0.4443, 'learning_rate': 2.0261496881479605e-08, 'epoch': 0.98} + 98%|█████████▊| 8578/8750 [2:34:39<16:14, 5.67s/it] 98%|█████████▊| 8579/8750 [2:34:47<16:09, 5.67s/it] 98%|█████████▊| 8579/8750 [2:34:45<16:09, 5.67s/it] {'loss': 0.4546, 'learning_rate': 2.002666140140108e-08, 'epoch': 0.98} + 98%|█████████▊| 8579/8750 [2:34:47<16:09, 5.67s/it] {'loss': 0.4546, 'learning_rate': 2.002666140140108e-08, 'epoch': 0.98} + 98%|█████████▊| 8579/8750 [2:34:45<16:09, 5.67s/it] 98%|█████████▊| 8580/8750 [2:34:52<15:54, 5.61s/it] 98%|█████████▊| 8580/8750 [2:34:50<15:54, 5.61s/it] {'loss': 0.4571, 'learning_rate': 1.979319340048469e-08, 'epoch': 0.98} + 98%|█████████▊| 8580/8750 [2:34:52<15:54, 5.61s/it] {'loss': 0.4571, 'learning_rate': 1.979319340048469e-08, 'epoch': 0.98} + 98%|█████████▊| 8580/8750 [2:34:50<15:54, 5.61s/it] 98%|█████████▊| 8581/8750 [2:34:58<16:00, 5.68s/it] 98%|█████████▊| 8581/8750 [2:34:56<16:00, 5.68s/it] {'loss': 0.4424, 'learning_rate': 1.956109291072039e-08, 'epoch': 0.98} + 98%|█████████▊| 8581/8750 [2:34:58<16:00, 5.68s/it] {'loss': 0.4424, 'learning_rate': 1.956109291072039e-08, 'epoch': 0.98} + 98%|█████████▊| 8581/8750 [2:34:56<16:00, 5.68s/it] 98%|█████████▊| 8582/8750 [2:35:03<15:42, 5.61s/it] 98%|█████████▊| 8582/8750 [2:35:01<15:42, 5.61s/it] {'loss': 0.4511, 'learning_rate': 
1.9330359963910527e-08, 'epoch': 0.98} + 98%|█████████▊| 8582/8750 [2:35:03<15:42, 5.61s/it] {'loss': 0.4511, 'learning_rate': 1.9330359963910527e-08, 'epoch': 0.98} + 98%|█████████▊| 8582/8750 [2:35:01<15:42, 5.61s/it] 98%|█████████▊| 8583/8750 [2:35:09<15:45, 5.66s/it] 98%|█████████▊| 8583/8750 [2:35:07<15:45, 5.66s/it] {'loss': 0.4563, 'learning_rate': 1.910099459167314e-08, 'epoch': 0.98} + 98%|█████████▊| 8583/8750 [2:35:09<15:45, 5.66s/it] {'loss': 0.4563, 'learning_rate': 1.910099459167314e-08, 'epoch': 0.98} + 98%|█████████▊| 8583/8750 [2:35:07<15:45, 5.66s/it] 98%|█████████▊| 8584/8750 [2:35:13<15:44, 5.69s/it] 98%|█████████▊| 8584/8750 [2:35:15<15:44, 5.69s/it] {'loss': 0.4414, 'learning_rate': 1.8872996825433086e-08, 'epoch': 0.98} + 98%|█████████▊| 8584/8750 [2:35:15<15:44, 5.69s/it] {'loss': 0.4414, 'learning_rate': 1.8872996825433086e-08, 'epoch': 0.98} + 98%|█████████▊| 8584/8750 [2:35:13<15:44, 5.69s/it] 98%|█████████▊| 8585/8750 [2:35:21<15:35, 5.67s/it] 98%|█████████▊| 8585/8750 [2:35:19<15:35, 5.67s/it] {'loss': 0.4554, 'learning_rate': 1.864636669643427e-08, 'epoch': 0.98} + 98%|█████████▊| 8585/8750 [2:35:21<15:35, 5.67s/it] {'loss': 0.4554, 'learning_rate': 1.864636669643427e-08, 'epoch': 0.98} + 98%|█████████▊| 8585/8750 [2:35:19<15:35, 5.67s/it] 98%|█████████▊| 8586/8750 [2:35:26<15:42, 5.74s/it] 98%|█████████▊| 8586/8750 [2:35:25<15:42, 5.74s/it] {'loss': 0.4389, 'learning_rate': 1.8421104235727406e-08, 'epoch': 0.98} + 98%|█████████▊| 8586/8750 [2:35:26<15:42, 5.74s/it] {'loss': 0.4389, 'learning_rate': 1.8421104235727406e-08, 'epoch': 0.98} + 98%|█████████▊| 8586/8750 [2:35:25<15:42, 5.74s/it] 98%|█████████▊| 8587/8750 [2:35:32<15:36, 5.75s/it] 98%|█████████▊| 8587/8750 [2:35:30<15:36, 5.75s/it] {'loss': 0.4647, 'learning_rate': 1.8197209474180023e-08, 'epoch': 0.98} + 98%|█████████▊| 8587/8750 [2:35:32<15:36, 5.75s/it] {'loss': 0.4647, 'learning_rate': 1.8197209474180023e-08, 'epoch': 0.98} + 98%|█████████▊| 8587/8750 [2:35:30<15:36, 
5.75s/it] 98%|█████████▊| 8588/8750 [2:35:38<15:26, 5.72s/it] 98%|█████████▊| 8588/8750 [2:35:36<15:26, 5.72s/it] {'loss': 0.4403, 'learning_rate': 1.7974682442470915e-08, 'epoch': 0.98} + 98%|█████████▊| 8588/8750 [2:35:38<15:26, 5.72s/it] {'loss': 0.4403, 'learning_rate': 1.7974682442470915e-08, 'epoch': 0.98} + 98%|█████████▊| 8588/8750 [2:35:36<15:26, 5.72s/it] 98%|█████████▊| 8589/8750 [2:35:43<15:15, 5.69s/it] 98%|█████████▊| 8589/8750 [2:35:42<15:15, 5.69s/it] {'loss': 0.4513, 'learning_rate': 1.775352317109014e-08, 'epoch': 0.98} + 98%|█████████▊| 8589/8750 [2:35:43<15:15, 5.69s/it] {'loss': 0.4513, 'learning_rate': 1.775352317109014e-08, 'epoch': 0.98} + 98%|█████████▊| 8589/8750 [2:35:42<15:15, 5.69s/it] 98%|█████████▊| 8590/8750 [2:35:49<15:09, 5.68s/it] 98%|█████████▊| 8590/8750 [2:35:47<15:09, 5.68s/it] {'loss': 0.4451, 'learning_rate': 1.7533731690342338e-08, 'epoch': 0.98} + 98%|█████████▊| 8590/8750 [2:35:49<15:09, 5.68s/it] {'loss': 0.4451, 'learning_rate': 1.7533731690342338e-08, 'epoch': 0.98} + 98%|█████████▊| 8590/8750 [2:35:47<15:09, 5.68s/it] 98%|█████████▊| 8591/8750 [2:35:55<15:01, 5.67s/it] 98%|█████████▊| 8591/8750 [2:35:53<15:01, 5.67s/it] {'loss': 0.4486, 'learning_rate': 1.7315308030342314e-08, 'epoch': 0.98} + 98%|█████████▊| 8591/8750 [2:35:55<15:01, 5.67s/it] {'loss': 0.4486, 'learning_rate': 1.7315308030342314e-08, 'epoch': 0.98} + 98%|█████████▊| 8591/8750 [2:35:53<15:01, 5.67s/it] 98%|█████████▊| 8592/8750 [2:36:00<14:56, 5.67s/it] 98%|█████████▊| 8592/8750 [2:35:59<14:56, 5.67s/it] {'loss': 0.4594, 'learning_rate': 1.7098252221021683e-08, 'epoch': 0.98} + 98%|█████████▊| 8592/8750 [2:36:00<14:56, 5.67s/it] {'loss': 0.4594, 'learning_rate': 1.7098252221021683e-08, 'epoch': 0.98} + 98%|█████████▊| 8592/8750 [2:35:59<14:56, 5.67s/it] 98%|█████████▊| 8593/8750 [2:36:06<14:59, 5.73s/it] 98%|█████████▊| 8593/8750 [2:36:04<14:59, 5.73s/it] {'loss': 0.4419, 'learning_rate': 1.6882564292119984e-08, 'epoch': 0.98} + 98%|█████████▊| 
8593/8750 [2:36:06<14:59, 5.73s/it] {'loss': 0.4419, 'learning_rate': 1.6882564292119984e-08, 'epoch': 0.98} + 98%|█████████▊| 8593/8750 [2:36:04<14:59, 5.73s/it] 98%|█████████▊| 8594/8750 [2:36:12<14:47, 5.69s/it] 98%|█████████▊| 8594/8750 [2:36:10<14:47, 5.69s/it] {'loss': 0.4521, 'learning_rate': 1.666824427319136e-08, 'epoch': 0.98} + 98%|█████████▊| 8594/8750 [2:36:12<14:47, 5.69s/it] {'loss': 0.4521, 'learning_rate': 1.666824427319136e-08, 'epoch': 0.98} + 98%|█████████▊| 8594/8750 [2:36:10<14:47, 5.69s/it] 98%|█████████▊| 8595/8750 [2:36:18<14:40, 5.68s/it] 98%|█████████▊| 8595/8750 [2:36:16<14:40, 5.68s/it] {'loss': 0.4343, 'learning_rate': 1.6455292193603424e-08, 'epoch': 0.98} + 98%|█████████▊| 8595/8750 [2:36:18<14:40, 5.68s/it] {'loss': 0.4343, 'learning_rate': 1.6455292193603424e-08, 'epoch': 0.98} + 98%|█████████▊| 8595/8750 [2:36:16<14:40, 5.68s/it] 98%|█████████▊| 8596/8750 [2:36:23<14:36, 5.69s/it] 98%|█████████▊| 8596/8750 [2:36:21<14:36, 5.69s/it] {'loss': 0.4542, 'learning_rate': 1.624370808253506e-08, 'epoch': 0.98} + {'loss': 0.4542, 'learning_rate': 1.624370808253506e-08, 'epoch': 0.98} 98%|█████████▊| 8596/8750 [2:36:23<14:36, 5.69s/it] + 98%|█████████▊| 8596/8750 [2:36:21<14:36, 5.69s/it] 98%|█████████▊| 8597/8750 [2:36:29<14:33, 5.71s/it] 98%|█████████▊| 8597/8750 [2:36:27<14:33, 5.71s/it] {'loss': 0.4468, 'learning_rate': 1.6033491968976412e-08, 'epoch': 0.98} + 98%|█████████▊| 8597/8750 [2:36:29<14:33, 5.71s/it] {'loss': 0.4468, 'learning_rate': 1.6033491968976412e-08, 'epoch': 0.98} + 98%|█████████▊| 8597/8750 [2:36:27<14:33, 5.71s/it] 98%|█████████▊| 8598/8750 [2:36:35<14:31, 5.73s/it] 98%|█████████▊| 8598/8750 [2:36:33<14:31, 5.73s/it] {'loss': 0.4459, 'learning_rate': 1.5824643881734438e-08, 'epoch': 0.98} + 98%|█████████▊| 8598/8750 [2:36:35<14:31, 5.73s/it] {'loss': 0.4459, 'learning_rate': 1.5824643881734438e-08, 'epoch': 0.98} + 98%|█████████▊| 8598/8750 [2:36:33<14:31, 5.73s/it] 98%|█████████▊| 8599/8750 [2:36:39<14:22, 
5.71s/it] 98%|█████████▊| 8599/8750 [2:36:41<14:22, 5.71s/it] {'loss': 0.4632, 'learning_rate': 1.561716384942402e-08, 'epoch': 0.98} + 98%|█████████▊| 8599/8750 [2:36:41<14:22, 5.71s/it] {'loss': 0.4632, 'learning_rate': 1.561716384942402e-08, 'epoch': 0.98} + 98%|█████████▊| 8599/8750 [2:36:39<14:22, 5.71s/it]5 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +11 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +012 AutoResumeHook: Checking whether to suspend... + 98%|█████████▊| 8600/8750 [2:36:46<14:12, 5.68s/it]14 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 98%|█████████▊| 8600/8750 [2:36:44<14:12, 5.68s/it]3 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4459, 'learning_rate': 1.541105190047465e-08, 'epoch': 0.98} + 98%|█████████▊| 8600/8750 [2:36:46<14:12, 5.68s/it] {'loss': 0.4459, 'learning_rate': 1.541105190047465e-08, 'epoch': 0.98} + 98%|█████████▊| 8600/8750 [2:36:44<14:12, 5.68s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8600/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8600/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8600/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 98%|█████████▊| 8601/8750 [2:37:07<25:33, 10.29s/it] 98%|█████████▊| 8601/8750 [2:37:05<25:33, 10.29s/it] {'loss': 0.4326, 'learning_rate': 1.5206308063129282e-08, 'epoch': 0.98} + 98%|█████████▊| 8601/8750 [2:37:07<25:33, 10.29s/it] {'loss': 0.4326, 'learning_rate': 1.5206308063129282e-08, 'epoch': 0.98} + 98%|█████████▊| 8601/8750 [2:37:05<25:33, 10.29s/it] 98%|█████████▊| 8602/8750 [2:37:13<21:52, 8.87s/it] 98%|█████████▊| 8602/8750 [2:37:11<21:52, 8.87s/it] {'loss': 0.4464, 'learning_rate': 1.5002932365442148e-08, 'epoch': 0.98} + 98%|█████████▊| 8602/8750 [2:37:13<21:52, 8.87s/it] {'loss': 0.4464, 'learning_rate': 1.5002932365442148e-08, 'epoch': 0.98} + 98%|█████████▊| 8602/8750 [2:37:11<21:52, 8.87s/it] 98%|█████████▊| 8603/8750 [2:37:18<19:27, 7.94s/it] 98%|█████████▊| 8603/8750 [2:37:17<19:27, 7.94s/it] {'loss': 0.4586, 'learning_rate': 1.480092483527984e-08, 'epoch': 0.98} + 98%|█████████▊| 8603/8750 [2:37:18<19:27, 7.94s/it] {'loss': 0.4586, 'learning_rate': 1.480092483527984e-08, 'epoch': 0.98} + 98%|█████████▊| 8603/8750 [2:37:17<19:27, 7.94s/it] 98%|█████████▊| 8604/8750 [2:37:24<17:42, 7.28s/it] 98%|█████████▊| 8604/8750 [2:37:22<17:42, 7.28s/it] {'loss': 0.4658, 'learning_rate': 1.4600285500322442e-08, 'epoch': 0.98} + 98%|█████████▊| 8604/8750 [2:37:24<17:42, 7.28s/it] {'loss': 0.4658, 'learning_rate': 1.4600285500322442e-08, 'epoch': 0.98} + 98%|█████████▊| 8604/8750 [2:37:22<17:42, 7.28s/it] 98%|█████████▊| 8605/8750 [2:37:30<16:22, 6.78s/it] 98%|█████████▊| 8605/8750 [2:37:28<16:22, 6.78s/it] {'loss': 0.483, 'learning_rate': 1.4401014388061296e-08, 'epoch': 0.98} + 98%|█████████▊| 8605/8750 [2:37:30<16:22, 6.78s/it] {'loss': 0.483, 'learning_rate': 1.4401014388061296e-08, 'epoch': 0.98} + 98%|█████████▊| 8605/8750 [2:37:28<16:22, 6.78s/it] 98%|█████████▊| 8606/8750 [2:37:36<15:29, 6.45s/it] 98%|█████████▊| 8606/8750 [2:37:34<15:29, 6.45s/it] {'loss': 0.4419, 'learning_rate': 1.4203111525801228e-08, 'epoch': 0.98} + 
98%|█████████▊| 8606/8750 [2:37:36<15:29, 6.45s/it] {'loss': 0.4419, 'learning_rate': 1.4203111525801228e-08, 'epoch': 0.98} + 98%|█████████▊| 8606/8750 [2:37:34<15:29, 6.45s/it] 98%|█████████▊| 8607/8750 [2:37:41<14:59, 6.29s/it] 98%|█████████▊| 8607/8750 [2:37:40<14:59, 6.29s/it] {'loss': 0.4547, 'learning_rate': 1.4006576940659433e-08, 'epoch': 0.98} + 98%|█████████▊| 8607/8750 [2:37:41<14:59, 6.29s/it] {'loss': 0.4547, 'learning_rate': 1.4006576940659433e-08, 'epoch': 0.98} + 98%|█████████▊| 8607/8750 [2:37:40<14:59, 6.29s/it] 98%|█████████▊| 8608/8750 [2:37:47<14:27, 6.11s/it] 98%|█████████▊| 8608/8750 [2:37:45<14:27, 6.11s/it] {'loss': 0.4322, 'learning_rate': 1.3811410659565483e-08, 'epoch': 0.98} + 98%|█████████▊| 8608/8750 [2:37:47<14:27, 6.11s/it] {'loss': 0.4322, 'learning_rate': 1.3811410659565483e-08, 'epoch': 0.98} + 98%|█████████▊| 8608/8750 [2:37:45<14:27, 6.11s/it] 98%|█████████▊| 8609/8750 [2:37:53<14:08, 6.02s/it] 98%|█████████▊| 8609/8750 [2:37:51<14:08, 6.02s/it] {'loss': 0.4459, 'learning_rate': 1.3617612709262428e-08, 'epoch': 0.98} + 98%|█████████▊| 8609/8750 [2:37:53<14:08, 6.02s/it] {'loss': 0.4459, 'learning_rate': 1.3617612709262428e-08, 'epoch': 0.98} + 98%|█████████▊| 8609/8750 [2:37:51<14:08, 6.02s/it] 98%|█████████▊| 8610/8750 [2:37:59<13:52, 5.95s/it] 98%|█████████▊| 8610/8750 [2:37:57<13:52, 5.95s/it] {'loss': 0.4408, 'learning_rate': 1.3425183116303475e-08, 'epoch': 0.98} + 98%|█████████▊| 8610/8750 [2:37:59<13:52, 5.95s/it] {'loss': 0.4408, 'learning_rate': 1.3425183116303475e-08, 'epoch': 0.98} + 98%|█████████▊| 8610/8750 [2:37:57<13:52, 5.95s/it] 98%|█████████▊| 8611/8750 [2:38:04<13:37, 5.88s/it] 98%|█████████▊| 8611/8750 [2:38:03<13:37, 5.88s/it] {'loss': 0.4458, 'learning_rate': 1.3234121907056418e-08, 'epoch': 0.98} + 98%|█████████▊| 8611/8750 [2:38:04<13:37, 5.88s/it] {'loss': 0.4458, 'learning_rate': 1.3234121907056418e-08, 'epoch': 0.98} + 98%|█████████▊| 8611/8750 [2:38:03<13:37, 5.88s/it] 98%|█████████▊| 8612/8750 
[2:38:10<13:34, 5.90s/it] 98%|█████████▊| 8612/8750 [2:38:08<13:34, 5.90s/it] {'loss': 0.4581, 'learning_rate': 1.3044429107700319e-08, 'epoch': 0.98} + 98%|█████████▊| 8612/8750 [2:38:10<13:34, 5.90s/it] {'loss': 0.4581, 'learning_rate': 1.3044429107700319e-08, 'epoch': 0.98} + 98%|█████████▊| 8612/8750 [2:38:08<13:34, 5.90s/it] 98%|█████████▊| 8613/8750 [2:38:16<13:20, 5.85s/it] 98%|█████████▊| 8613/8750 [2:38:14<13:20, 5.85s/it] {'loss': 0.4792, 'learning_rate': 1.2856104744228826e-08, 'epoch': 0.98} + 98%|█████████▊| 8613/8750 [2:38:16<13:20, 5.85s/it] {'loss': 0.4792, 'learning_rate': 1.2856104744228826e-08, 'epoch': 0.98} + 98%|█████████▊| 8613/8750 [2:38:14<13:20, 5.85s/it] 98%|█████████▊| 8614/8750 [2:38:22<13:19, 5.88s/it] 98%|█████████▊| 8614/8750 [2:38:20<13:19, 5.88s/it] {'loss': 0.4454, 'learning_rate': 1.2669148842444634e-08, 'epoch': 0.98} + 98%|█████████▊| 8614/8750 [2:38:22<13:19, 5.88s/it] {'loss': 0.4454, 'learning_rate': 1.2669148842444634e-08, 'epoch': 0.98} + 98%|█████████▊| 8614/8750 [2:38:20<13:19, 5.88s/it] 98%|█████████▊| 8615/8750 [2:38:28<13:03, 5.80s/it] 98%|█████████▊| 8615/8750 [2:38:26<13:03, 5.80s/it] {'loss': 0.4506, 'learning_rate': 1.248356142796725e-08, 'epoch': 0.98} + 98%|█████████▊| 8615/8750 [2:38:28<13:03, 5.80s/it] {'loss': 0.4506, 'learning_rate': 1.248356142796725e-08, 'epoch': 0.98} + 98%|█████████▊| 8615/8750 [2:38:26<13:03, 5.80s/it] 98%|█████████▊| 8616/8750 [2:38:33<12:57, 5.80s/it] 98%|█████████▊| 8616/8750 [2:38:32<12:57, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2299342526224112e-08, 'epoch': 0.98} + 98%|█████████▊| 8616/8750 [2:38:33<12:57, 5.80s/it] {'loss': 0.4594, 'learning_rate': 1.2299342526224112e-08, 'epoch': 0.98} + 98%|█████████▊| 8616/8750 [2:38:32<12:57, 5.80s/it] 98%|█████████▊| 8617/8750 [2:38:39<12:47, 5.77s/it] 98%|█████████▊| 8617/8750 [2:38:37<12:47, 5.77s/it] {'loss': 0.4516, 'learning_rate': 1.211649216245836e-08, 'epoch': 0.98} + 98%|█████████▊| 8617/8750 [2:38:39<12:47, 5.77s/it] 
{'loss': 0.4516, 'learning_rate': 1.211649216245836e-08, 'epoch': 0.98} + 98%|█████████▊| 8617/8750 [2:38:37<12:47, 5.77s/it] 98%|█████████▊| 8618/8750 [2:38:43<12:37, 5.74s/it] 98%|█████████▊| 8618/8750 [2:38:45<12:37, 5.74s/it] {'loss': 0.4324, 'learning_rate': 1.1935010361724397e-08, 'epoch': 0.98} + {'loss': 0.4324, 'learning_rate': 1.1935010361724397e-08, 'epoch': 0.98} + 98%|█████████▊| 8618/8750 [2:38:45<12:37, 5.74s/it] 98%|█████████▊| 8618/8750 [2:38:43<12:37, 5.74s/it] 99%|█████████▊| 8619/8750 [2:38:51<12:32, 5.74s/it] 99%|█████████▊| 8619/8750 [2:38:49<12:32, 5.74s/it] {'loss': 0.4501, 'learning_rate': 1.1754897148889e-08, 'epoch': 0.99} + 99%|█████████▊| 8619/8750 [2:38:51<12:32, 5.74s/it] {'loss': 0.4501, 'learning_rate': 1.1754897148889e-08, 'epoch': 0.99} + 99%|█████████▊| 8619/8750 [2:38:49<12:32, 5.74s/it] 99%|█████████▊| 8620/8750 [2:38:56<12:26, 5.74s/it] 99%|█████████▊| 8620/8750 [2:38:54<12:26, 5.74s/it] {'loss': 0.45, 'learning_rate': 1.1576152548631314e-08, 'epoch': 0.99} + 99%|█████████▊| 8620/8750 [2:38:56<12:26, 5.74s/it] {'loss': 0.45, 'learning_rate': 1.1576152548631314e-08, 'epoch': 0.99} + 99%|█████████▊| 8620/8750 [2:38:54<12:26, 5.74s/it] 99%|█████████▊| 8621/8750 [2:39:02<12:22, 5.75s/it] 99%|█████████▊| 8621/8750 [2:39:00<12:22, 5.75s/it] {'loss': 0.4689, 'learning_rate': 1.1398776585445082e-08, 'epoch': 0.99} + 99%|█████████▊| 8621/8750 [2:39:02<12:22, 5.75s/it] {'loss': 0.4689, 'learning_rate': 1.1398776585445082e-08, 'epoch': 0.99} + 99%|█████████▊| 8621/8750 [2:39:00<12:22, 5.75s/it] 99%|█████████▊| 8622/8750 [2:39:08<12:13, 5.73s/it] 99%|█████████▊| 8622/8750 [2:39:06<12:13, 5.73s/it] {'loss': 0.4379, 'learning_rate': 1.1222769283633083e-08, 'epoch': 0.99} + 99%|█████████▊| 8622/8750 [2:39:08<12:13, 5.73s/it] {'loss': 0.4379, 'learning_rate': 1.1222769283633083e-08, 'epoch': 0.99} + 99%|█████████▊| 8622/8750 [2:39:06<12:13, 5.73s/it] 99%|█████████▊| 8623/8750 [2:39:13<12:01, 5.68s/it] 99%|█████████▊| 8623/8750 [2:39:11<12:01, 
5.68s/it] {'loss': 0.449, 'learning_rate': 1.1048130667312695e-08, 'epoch': 0.99} + 99%|█████████▊| 8623/8750 [2:39:13<12:01, 5.68s/it] {'loss': 0.449, 'learning_rate': 1.1048130667312695e-08, 'epoch': 0.99} + 99%|█████████▊| 8623/8750 [2:39:11<12:01, 5.68s/it] 99%|█████████▊| 8624/8750 [2:39:19<12:03, 5.74s/it] 99%|█████████▊| 8624/8750 [2:39:17<12:03, 5.74s/it] {'loss': 0.4387, 'learning_rate': 1.0874860760413664e-08, 'epoch': 0.99} + 99%|█████████▊| 8624/8750 [2:39:19<12:03, 5.74s/it] {'loss': 0.4387, 'learning_rate': 1.0874860760413664e-08, 'epoch': 0.99} + 99%|█████████▊| 8624/8750 [2:39:17<12:03, 5.74s/it] 99%|█████████▊| 8625/8750 [2:39:25<11:50, 5.68s/it] 99%|█████████▊| 8625/8750 [2:39:23<11:50, 5.68s/it] {'loss': 0.4623, 'learning_rate': 1.0702959586678108e-08, 'epoch': 0.99} + 99%|█████████▊| 8625/8750 [2:39:25<11:50, 5.68s/it] {'loss': 0.4623, 'learning_rate': 1.0702959586678108e-08, 'epoch': 0.99} + 99%|█████████▊| 8625/8750 [2:39:23<11:50, 5.68s/it] 99%|█████████▊| 8626/8750 [2:39:31<11:47, 5.70s/it] 99%|█████████▊| 8626/8750 [2:39:29<11:47, 5.70s/it] {'loss': 0.4342, 'learning_rate': 1.0532427169659409e-08, 'epoch': 0.99} + 99%|█████████▊| 8626/8750 [2:39:31<11:47, 5.70s/it] {'loss': 0.4342, 'learning_rate': 1.0532427169659409e-08, 'epoch': 0.99} + 99%|█████████▊| 8626/8750 [2:39:29<11:47, 5.70s/it] 99%|█████████▊| 8627/8750 [2:39:36<11:40, 5.69s/it] 99%|█████████▊| 8627/8750 [2:39:34<11:40, 5.69s/it] {'loss': 0.4405, 'learning_rate': 1.0363263532724433e-08, 'epoch': 0.99} + 99%|█████████▊| 8627/8750 [2:39:36<11:40, 5.69s/it] {'loss': 0.4405, 'learning_rate': 1.0363263532724433e-08, 'epoch': 0.99} + 99%|█████████▊| 8627/8750 [2:39:34<11:40, 5.69s/it] 99%|█████████▊| 8628/8750 [2:39:42<11:37, 5.72s/it] 99%|█████████▊| 8628/8750 [2:39:40<11:37, 5.72s/it] {'loss': 0.457, 'learning_rate': 1.0195468699052413e-08, 'epoch': 0.99} + 99%|█████████▊| 8628/8750 [2:39:42<11:37, 5.72s/it] {'loss': 0.457, 'learning_rate': 1.0195468699052413e-08, 'epoch': 0.99} + 
99%|█████████▊| 8628/8750 [2:39:40<11:37, 5.72s/it] 99%|█████████▊| 8629/8750 [2:39:46<11:29, 5.70s/it] 99%|█████████▊| 8629/8750 [2:39:48<11:29, 5.70s/it] {'loss': 0.4239, 'learning_rate': 1.0029042691636071e-08, 'epoch': 0.99} + {'loss': 0.4239, 'learning_rate': 1.0029042691636071e-08, 'epoch': 0.99} + 99%|█████████▊| 8629/8750 [2:39:48<11:29, 5.70s/it] 99%|█████████▊| 8629/8750 [2:39:46<11:29, 5.70s/it] 99%|█████████▊| 8630/8750 [2:39:53<11:23, 5.70s/it] 99%|█████████▊| 8630/8750 [2:39:51<11:23, 5.70s/it] {'loss': 0.4434, 'learning_rate': 9.863985533278275e-09, 'epoch': 0.99} + 99%|█████████▊| 8630/8750 [2:39:53<11:23, 5.70s/it] {'loss': 0.4434, 'learning_rate': 9.863985533278275e-09, 'epoch': 0.99} + 99%|█████████▊| 8630/8750 [2:39:51<11:23, 5.70s/it] 99%|█████████▊| 8631/8750 [2:39:59<11:24, 5.75s/it] 99%|█████████▊| 8631/8750 [2:39:57<11:24, 5.75s/it] {'loss': 0.461, 'learning_rate': 9.700297246596491e-09, 'epoch': 0.99} + 99%|█████████▊| 8631/8750 [2:39:59<11:24, 5.75s/it] {'loss': 0.461, 'learning_rate': 9.700297246596491e-09, 'epoch': 0.99} + 99%|█████████▊| 8631/8750 [2:39:57<11:24, 5.75s/it] 99%|█████████▊| 8632/8750 [2:40:05<11:21, 5.78s/it] 99%|█████████▊| 8632/8750 [2:40:03<11:21, 5.78s/it] {'loss': 0.4584, 'learning_rate': 9.537977854018332e-09, 'epoch': 0.99} + 99%|█████████▊| 8632/8750 [2:40:05<11:21, 5.78s/it] {'loss': 0.4584, 'learning_rate': 9.537977854018332e-09, 'epoch': 0.99} + 99%|█████████▊| 8632/8750 [2:40:03<11:21, 5.78s/it] 99%|█████████▊| 8633/8750 [2:40:11<11:21, 5.82s/it] 99%|█████████▊| 8633/8750 [2:40:09<11:21, 5.82s/it] {'loss': 0.4492, 'learning_rate': 9.377027377786007e-09, 'epoch': 0.99} + 99%|█████████▊| 8633/8750 [2:40:11<11:21, 5.82s/it] {'loss': 0.4492, 'learning_rate': 9.377027377786007e-09, 'epoch': 0.99} + 99%|█████████▊| 8633/8750 [2:40:09<11:21, 5.82s/it] 99%|█████████▊| 8634/8750 [2:40:17<11:08, 5.76s/it] 99%|█████████▊| 8634/8750 [2:40:15<11:08, 5.76s/it] {'loss': 0.4512, 'learning_rate': 9.217445839952988e-09, 
'epoch': 0.99} + 99%|█████████▊| 8634/8750 [2:40:17<11:08, 5.76s/it] {'loss': 0.4512, 'learning_rate': 9.217445839952988e-09, 'epoch': 0.99} + 99%|█████████▊| 8634/8750 [2:40:15<11:08, 5.76s/it] 99%|█████████▊| 8635/8750 [2:40:22<10:59, 5.73s/it] 99%|█████████▊| 8635/8750 [2:40:20<10:59, 5.73s/it] {'loss': 0.4474, 'learning_rate': 9.059233262386225e-09, 'epoch': 0.99} + 99%|█████████▊| 8635/8750 [2:40:22<10:59, 5.73s/it] {'loss': 0.4474, 'learning_rate': 9.059233262386225e-09, 'epoch': 0.99} + 99%|█████████▊| 8635/8750 [2:40:20<10:59, 5.73s/it] 99%|█████████▊| 8636/8750 [2:40:28<10:51, 5.72s/it] 99%|█████████▊| 8636/8750 [2:40:26<10:51, 5.72s/it] {'loss': 0.4534, 'learning_rate': 8.902389666765044e-09, 'epoch': 0.99} + 99%|█████████▊| 8636/8750 [2:40:28<10:51, 5.72s/it] {'loss': 0.4534, 'learning_rate': 8.902389666765044e-09, 'epoch': 0.99} + 99%|█████████▊| 8636/8750 [2:40:26<10:51, 5.72s/it] 99%|█████████▊| 8637/8750 [2:40:32<10:43, 5.70s/it] 99%|█████████▊| 8637/8750 [2:40:34<10:43, 5.70s/it] {'loss': 0.4459, 'learning_rate': 8.746915074577811e-09, 'epoch': 0.99} + 99%|█████████▊| 8637/8750 [2:40:34<10:43, 5.70s/it] {'loss': 0.4459, 'learning_rate': 8.746915074577811e-09, 'epoch': 0.99} + 99%|█████████▊| 8637/8750 [2:40:32<10:43, 5.70s/it] 99%|█████████▊| 8638/8750 [2:40:40<10:48, 5.79s/it] 99%|█████████▊| 8638/8750 [2:40:38<10:48, 5.79s/it] {'loss': 0.4423, 'learning_rate': 8.592809507129706e-09, 'epoch': 0.99} + 99%|█████████▊| 8638/8750 [2:40:40<10:48, 5.79s/it] {'loss': 0.4423, 'learning_rate': 8.592809507129706e-09, 'epoch': 0.99} + 99%|█████████▊| 8638/8750 [2:40:38<10:48, 5.79s/it] 99%|█████████▊| 8639/8750 [2:40:45<10:34, 5.72s/it] 99%|█████████▊| 8639/8750 [2:40:43<10:34, 5.72s/it] {'loss': 0.4627, 'learning_rate': 8.440072985537174e-09, 'epoch': 0.99} + 99%|█████████▊| 8639/8750 [2:40:45<10:34, 5.72s/it] {'loss': 0.4627, 'learning_rate': 8.440072985537174e-09, 'epoch': 0.99} + 99%|█████████▊| 8639/8750 [2:40:43<10:34, 5.72s/it] 99%|█████████▊| 
8640/8750 [2:40:51<10:21, 5.65s/it] 99%|█████████▊| 8640/8750 [2:40:49<10:21, 5.65s/it] {'loss': 0.4505, 'learning_rate': 8.288705530727915e-09, 'epoch': 0.99} + 99%|█████████▊| 8640/8750 [2:40:51<10:21, 5.65s/it] {'loss': 0.4505, 'learning_rate': 8.288705530727915e-09, 'epoch': 0.99} + 99%|█████████▊| 8640/8750 [2:40:49<10:21, 5.65s/it] 99%|█████████▉| 8641/8750 [2:40:56<10:18, 5.67s/it] 99%|█████████▉| 8641/8750 [2:40:54<10:18, 5.67s/it] {'loss': 0.4333, 'learning_rate': 8.138707163442005e-09, 'epoch': 0.99} + 99%|█████████▉| 8641/8750 [2:40:56<10:18, 5.67s/it] {'loss': 0.4333, 'learning_rate': 8.138707163442005e-09, 'epoch': 0.99} + 99%|█████████▉| 8641/8750 [2:40:54<10:18, 5.67s/it] 99%|█████████▉| 8642/8750 [2:41:03<10:30, 5.84s/it] 99%|█████████▉| 8642/8750 [2:41:01<10:30, 5.84s/it] {'loss': 0.4511, 'learning_rate': 7.990077904234117e-09, 'epoch': 0.99} + 99%|█████████▉| 8642/8750 [2:41:03<10:30, 5.84s/it] {'loss': 0.4511, 'learning_rate': 7.990077904234117e-09, 'epoch': 0.99} + 99%|█████████▉| 8642/8750 [2:41:01<10:30, 5.84s/it] 99%|█████████▉| 8643/8750 [2:41:08<10:15, 5.75s/it] 99%|█████████▉| 8643/8750 [2:41:06<10:15, 5.75s/it] {'loss': 0.457, 'learning_rate': 7.84281777346796e-09, 'epoch': 0.99} + 99%|█████████▉| 8643/8750 [2:41:08<10:15, 5.75s/it] {'loss': 0.457, 'learning_rate': 7.84281777346796e-09, 'epoch': 0.99} + 99%|█████████▉| 8643/8750 [2:41:06<10:15, 5.75s/it] 99%|█████████▉| 8644/8750 [2:41:14<10:21, 5.86s/it] 99%|█████████▉| 8644/8750 [2:41:12<10:21, 5.86s/it] {'loss': 0.4234, 'learning_rate': 7.696926791322946e-09, 'epoch': 0.99} + {'loss': 0.4234, 'learning_rate': 7.696926791322946e-09, 'epoch': 0.99} 99%|█████████▉| 8644/8750 [2:41:14<10:21, 5.86s/it] + 99%|█████████▉| 8644/8750 [2:41:12<10:21, 5.86s/it] 99%|█████████▉| 8645/8750 [2:41:20<10:13, 5.84s/it] 99%|█████████▉| 8645/8750 [2:41:18<10:13, 5.84s/it] {'loss': 0.4557, 'learning_rate': 7.552404977788641e-09, 'epoch': 0.99} + 99%|█████████▉| 8645/8750 [2:41:20<10:13, 5.84s/it] {'loss': 
0.4557, 'learning_rate': 7.552404977788641e-09, 'epoch': 0.99} + 99%|█████████▉| 8645/8750 [2:41:18<10:13, 5.84s/it] 99%|█████████▉| 8646/8750 [2:41:26<10:00, 5.78s/it] 99%|█████████▉| 8646/8750 [2:41:24<10:00, 5.78s/it] {'loss': 0.444, 'learning_rate': 7.409252352668095e-09, 'epoch': 0.99} + 99%|█████████▉| 8646/8750 [2:41:26<10:00, 5.78s/it] {'loss': 0.444, 'learning_rate': 7.409252352668095e-09, 'epoch': 0.99} + 99%|█████████▉| 8646/8750 [2:41:24<10:00, 5.78s/it] 99%|█████████▉| 8647/8750 [2:41:29<09:53, 5.76s/it] 99%|█████████▉| 8647/8750 [2:41:31<09:53, 5.76s/it]{'loss': 0.455, 'learning_rate': 7.267468935575617e-09, 'epoch': 0.99} + {'loss': 0.455, 'learning_rate': 7.267468935575617e-09, 'epoch': 0.99} + 99%|█████████▉| 8647/8750 [2:41:31<09:53, 5.76s/it] 99%|█████████▉| 8647/8750 [2:41:29<09:53, 5.76s/it] 99%|█████████▉| 8648/8750 [2:41:37<09:41, 5.71s/it] 99%|█████████▉| 8648/8750 [2:41:35<09:41, 5.71s/it] {'loss': 0.4692, 'learning_rate': 7.12705474594011e-09, 'epoch': 0.99} + 99%|█████████▉| 8648/8750 [2:41:37<09:41, 5.71s/it] {'loss': 0.4692, 'learning_rate': 7.12705474594011e-09, 'epoch': 0.99} + 99%|█████████▉| 8648/8750 [2:41:35<09:41, 5.71s/it] 99%|█████████▉| 8649/8750 [2:41:43<09:34, 5.68s/it] 99%|█████████▉| 8649/8750 [2:41:41<09:33, 5.68s/it] {'loss': 0.4765, 'learning_rate': 6.988009803000628e-09, 'epoch': 0.99} + 99%|█████████▉| 8649/8750 [2:41:43<09:34, 5.68s/it] {'loss': 0.4765, 'learning_rate': 6.988009803000628e-09, 'epoch': 0.99} + 99%|█████████▉| 8649/8750 [2:41:41<09:33, 5.68s/it]11 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +8 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 8650/8750 [2:41:48<09:32, 5.72s/it]12 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +1 AutoResumeHook: Checking whether to suspend... 
+5 AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +7 AutoResumeHook: Checking whether to suspend... +10 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... +0 AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 8650/8750 [2:41:46<09:32, 5.72s/it] {'loss': 0.4445, 'learning_rate': 6.8503341258086e-09, 'epoch': 0.99} + 99%|█████████▉| 8650/8750 [2:41:48<09:32, 5.72s/it] {'loss': 0.4445, 'learning_rate': 6.8503341258086e-09, 'epoch': 0.99} + 99%|█████████▉| 8650/8750 [2:41:46<09:32, 5.72s/it] 99%|█████████▉| 8651/8750 [2:41:54<09:22, 5.68s/it] 99%|█████████▉| 8651/8750 [2:41:52<09:22, 5.68s/it] {'loss': 0.4319, 'learning_rate': 6.714027733230044e-09, 'epoch': 0.99} + 99%|█████████▉| 8651/8750 [2:41:54<09:22, 5.68s/it] {'loss': 0.4319, 'learning_rate': 6.714027733230044e-09, 'epoch': 0.99} + 99%|█████████▉| 8651/8750 [2:41:52<09:22, 5.68s/it] 99%|█████████▉| 8652/8750 [2:42:00<09:20, 5.72s/it] 99%|█████████▉| 8652/8750 [2:41:58<09:20, 5.72s/it] {'loss': 0.4481, 'learning_rate': 6.579090643942243e-09, 'epoch': 0.99} + 99%|█████████▉| 8652/8750 [2:42:00<09:20, 5.72s/it] {'loss': 0.4481, 'learning_rate': 6.579090643942243e-09, 'epoch': 0.99} + 99%|█████████▉| 8652/8750 [2:41:58<09:20, 5.72s/it] 99%|█████████▉| 8653/8750 [2:42:05<09:11, 5.69s/it] 99%|█████████▉| 8653/8750 [2:42:04<09:11, 5.69s/it] {'loss': 0.4501, 'learning_rate': 6.4455228764326305e-09, 'epoch': 0.99} + 99%|█████████▉| 8653/8750 [2:42:05<09:11, 5.69s/it] {'loss': 0.4501, 'learning_rate': 6.4455228764326305e-09, 'epoch': 0.99} + 99%|█████████▉| 8653/8750 [2:42:04<09:11, 5.69s/it] 99%|█████████▉| 8654/8750 [2:42:11<09:00, 5.63s/it] 99%|█████████▉| 8654/8750 [2:42:09<09:00, 5.63s/it] {'loss': 0.4475, 'learning_rate': 6.3133244490043434e-09, 'epoch': 0.99} + 99%|█████████▉| 8654/8750 [2:42:11<09:00, 
5.63s/it] {'loss': 0.4475, 'learning_rate': 6.3133244490043434e-09, 'epoch': 0.99} + 99%|█████████▉| 8654/8750 [2:42:09<09:00, 5.63s/it] 99%|█████████▉| 8655/8750 [2:42:17<09:00, 5.69s/it] 99%|█████████▉| 8655/8750 [2:42:15<09:00, 5.69s/it] {'loss': 0.4532, 'learning_rate': 6.18249537977178e-09, 'epoch': 0.99} + 99%|█████████▉| 8655/8750 [2:42:17<09:00, 5.69s/it] {'loss': 0.4532, 'learning_rate': 6.18249537977178e-09, 'epoch': 0.99} + 99%|█████████▉| 8655/8750 [2:42:15<09:00, 5.69s/it] 99%|█████████▉| 8656/8750 [2:42:23<09:01, 5.76s/it] 99%|█████████▉| 8656/8750 [2:42:21<09:01, 5.76s/it] {'loss': 0.4401, 'learning_rate': 6.053035686661712e-09, 'epoch': 0.99} + 99%|█████████▉| 8656/8750 [2:42:23<09:01, 5.76s/it] {'loss': 0.4401, 'learning_rate': 6.053035686661712e-09, 'epoch': 0.99} + 99%|█████████▉| 8656/8750 [2:42:21<09:01, 5.76s/it] 99%|█████████▉| 8657/8750 [2:42:28<08:51, 5.72s/it] 99%|█████████▉| 8657/8750 [2:42:26<08:51, 5.72s/it] {'loss': 0.4532, 'learning_rate': 5.924945387411063e-09, 'epoch': 0.99} + 99%|█████████▉| 8657/8750 [2:42:28<08:51, 5.72s/it] {'loss': 0.4532, 'learning_rate': 5.924945387411063e-09, 'epoch': 0.99} + 99%|█████████▉| 8657/8750 [2:42:26<08:51, 5.72s/it] 99%|█████████▉| 8658/8750 [2:42:32<08:43, 5.69s/it] 99%|█████████▉| 8658/8750 [2:42:34<08:43, 5.69s/it] {'loss': 0.4334, 'learning_rate': 5.798224499572458e-09, 'epoch': 0.99} + 99%|█████████▉| 8658/8750 [2:42:34<08:43, 5.69s/it] {'loss': 0.4334, 'learning_rate': 5.798224499572458e-09, 'epoch': 0.99} + 99%|█████████▉| 8658/8750 [2:42:32<08:43, 5.69s/it] 99%|█████████▉| 8659/8750 [2:42:38<08:36, 5.67s/it] 99%|█████████▉| 8659/8750 [2:42:40<08:36, 5.67s/it] {'loss': 0.4682, 'learning_rate': 5.672873040509786e-09, 'epoch': 0.99} + {'loss': 0.4682, 'learning_rate': 5.672873040509786e-09, 'epoch': 0.99} 99%|█████████▉| 8659/8750 [2:42:40<08:36, 5.67s/it] + 99%|█████████▉| 8659/8750 [2:42:38<08:36, 5.67s/it] 99%|█████████▉| 8660/8750 [2:42:45<08:27, 5.64s/it] 99%|█████████▉| 8660/8750 
[2:42:43<08:27, 5.64s/it] {'loss': 0.4225, 'learning_rate': 5.548891027398195e-09, 'epoch': 0.99} + 99%|█████████▉| 8660/8750 [2:42:45<08:27, 5.64s/it] {'loss': 0.4225, 'learning_rate': 5.548891027398195e-09, 'epoch': 0.99} + 99%|█████████▉| 8660/8750 [2:42:43<08:27, 5.64s/it] 99%|█████████▉| 8661/8750 [2:42:51<08:33, 5.77s/it] 99%|█████████▉| 8661/8750 [2:42:49<08:33, 5.77s/it] {'loss': 0.4527, 'learning_rate': 5.426278477226321e-09, 'epoch': 0.99} + 99%|█████████▉| 8661/8750 [2:42:51<08:33, 5.77s/it] {'loss': 0.4527, 'learning_rate': 5.426278477226321e-09, 'epoch': 0.99} + 99%|█████████▉| 8661/8750 [2:42:49<08:33, 5.77s/it] 99%|█████████▉| 8662/8750 [2:42:57<08:24, 5.73s/it] 99%|█████████▉| 8662/8750 [2:42:55<08:24, 5.73s/it] {'loss': 0.4467, 'learning_rate': 5.305035406795167e-09, 'epoch': 0.99} + 99%|█████████▉| 8662/8750 [2:42:57<08:24, 5.73s/it] {'loss': 0.4467, 'learning_rate': 5.305035406795167e-09, 'epoch': 0.99} + 99%|█████████▉| 8662/8750 [2:42:55<08:24, 5.73s/it] 99%|█████████▉| 8663/8750 [2:43:02<08:10, 5.64s/it] 99%|█████████▉| 8663/8750 [2:43:00<08:10, 5.64s/it] {'loss': 0.4284, 'learning_rate': 5.185161832718111e-09, 'epoch': 0.99} + 99%|█████████▉| 8663/8750 [2:43:02<08:10, 5.64s/it] {'loss': 0.4284, 'learning_rate': 5.185161832718111e-09, 'epoch': 0.99} + 99%|█████████▉| 8663/8750 [2:43:00<08:10, 5.64s/it] 99%|█████████▉| 8664/8750 [2:43:08<08:03, 5.62s/it] 99%|█████████▉| 8664/8750 [2:43:06<08:03, 5.62s/it] {'loss': 0.4373, 'learning_rate': 5.0666577714186815e-09, 'epoch': 0.99} + 99%|█████████▉| 8664/8750 [2:43:08<08:03, 5.62s/it] {'loss': 0.4373, 'learning_rate': 5.0666577714186815e-09, 'epoch': 0.99} + 99%|█████████▉| 8664/8750 [2:43:06<08:03, 5.62s/it] 99%|█████████▉| 8665/8750 [2:43:13<07:54, 5.59s/it] 99%|█████████▉| 8665/8750 [2:43:11<07:54, 5.59s/it] {'loss': 0.4694, 'learning_rate': 4.949523239136112e-09, 'epoch': 0.99} + 99%|█████████▉| 8665/8750 [2:43:13<07:54, 5.59s/it] {'loss': 0.4694, 'learning_rate': 4.949523239136112e-09, 'epoch': 
0.99} + 99%|█████████▉| 8665/8750 [2:43:11<07:54, 5.59s/it] 99%|█████████▉| 8666/8750 [2:43:19<07:54, 5.65s/it] 99%|█████████▉| 8666/8750 [2:43:17<07:54, 5.65s/it] {'loss': 0.4355, 'learning_rate': 4.833758251919785e-09, 'epoch': 0.99} + 99%|█████████▉| 8666/8750 [2:43:19<07:54, 5.65s/it] {'loss': 0.4355, 'learning_rate': 4.833758251919785e-09, 'epoch': 0.99} + 99%|█████████▉| 8666/8750 [2:43:17<07:54, 5.65s/it] 99%|█████████▉| 8667/8750 [2:43:23<08:02, 5.82s/it] 99%|█████████▉| 8667/8750 [2:43:25<08:02, 5.82s/it] {'loss': 0.4348, 'learning_rate': 4.7193628256325676e-09, 'epoch': 0.99} + 99%|█████████▉| 8667/8750 [2:43:25<08:02, 5.82s/it] {'loss': 0.4348, 'learning_rate': 4.7193628256325676e-09, 'epoch': 0.99} + 99%|█████████▉| 8667/8750 [2:43:23<08:02, 5.82s/it] 99%|█████████▉| 8668/8750 [2:43:31<07:51, 5.75s/it] 99%|█████████▉| 8668/8750 [2:43:29<07:51, 5.75s/it] {'loss': 0.4469, 'learning_rate': 4.606336975948589e-09, 'epoch': 0.99} + 99%|█████████▉| 8668/8750 [2:43:31<07:51, 5.75s/it] {'loss': 0.4469, 'learning_rate': 4.606336975948589e-09, 'epoch': 0.99} + 99%|█████████▉| 8668/8750 [2:43:29<07:51, 5.75s/it] 99%|█████████▉| 8669/8750 [2:43:37<07:58, 5.90s/it] 99%|█████████▉| 8669/8750 [2:43:35<07:58, 5.90s/it] {'loss': 0.4447, 'learning_rate': 4.494680718355459e-09, 'epoch': 0.99} + 99%|█████████▉| 8669/8750 [2:43:37<07:58, 5.90s/it] {'loss': 0.4447, 'learning_rate': 4.494680718355459e-09, 'epoch': 0.99} + 99%|█████████▉| 8669/8750 [2:43:35<07:58, 5.90s/it] 99%|█████████▉| 8670/8750 [2:43:41<07:48, 5.85s/it] 99%|█████████▉| 8670/8750 [2:43:43<07:48, 5.85s/it] {'loss': 0.4434, 'learning_rate': 4.384394068153164e-09, 'epoch': 0.99} + {'loss': 0.4434, 'learning_rate': 4.384394068153164e-09, 'epoch': 0.99} + 99%|█████████▉| 8670/8750 [2:43:43<07:48, 5.85s/it] 99%|█████████▉| 8670/8750 [2:43:41<07:48, 5.85s/it] 99%|█████████▉| 8671/8750 [2:43:49<07:36, 5.78s/it] 99%|█████████▉| 8671/8750 [2:43:47<07:36, 5.78s/it] {'loss': 0.453, 'learning_rate': 
4.275477040451836e-09, 'epoch': 0.99} + 99%|█████████▉| 8671/8750 [2:43:49<07:36, 5.78s/it] {'loss': 0.453, 'learning_rate': 4.275477040451836e-09, 'epoch': 0.99} + 99%|█████████▉| 8671/8750 [2:43:47<07:36, 5.78s/it] 99%|█████████▉| 8672/8750 [2:43:54<07:24, 5.69s/it] 99%|█████████▉| 8672/8750 [2:43:52<07:24, 5.69s/it] {'loss': 0.4588, 'learning_rate': 4.167929650176206e-09, 'epoch': 0.99} + 99%|█████████▉| 8672/8750 [2:43:54<07:24, 5.69s/it] {'loss': 0.4588, 'learning_rate': 4.167929650176206e-09, 'epoch': 0.99} + 99%|█████████▉| 8672/8750 [2:43:52<07:24, 5.69s/it] 99%|█████████▉| 8673/8750 [2:43:58<07:17, 5.68s/it] 99%|█████████▉| 8673/8750 [2:44:00<07:17, 5.68s/it] {'loss': 0.4509, 'learning_rate': 4.061751912063372e-09, 'epoch': 0.99} + {'loss': 0.4509, 'learning_rate': 4.061751912063372e-09, 'epoch': 0.99} + 99%|█████████▉| 8673/8750 [2:44:00<07:17, 5.68s/it] 99%|█████████▉| 8673/8750 [2:43:58<07:17, 5.68s/it] 99%|█████████▉| 8674/8750 [2:44:05<07:11, 5.68s/it] 99%|█████████▉| 8674/8750 [2:44:03<07:11, 5.68s/it] {'loss': 0.4543, 'learning_rate': 3.956943840661698e-09, 'epoch': 0.99} + 99%|█████████▉| 8674/8750 [2:44:05<07:11, 5.68s/it] {'loss': 0.4543, 'learning_rate': 3.956943840661698e-09, 'epoch': 0.99} + 99%|█████████▉| 8674/8750 [2:44:03<07:11, 5.68s/it] 99%|█████████▉| 8675/8750 [2:44:11<07:04, 5.66s/it] 99%|█████████▉| 8675/8750 [2:44:09<07:04, 5.66s/it] {'loss': 0.4426, 'learning_rate': 3.853505450331918e-09, 'epoch': 0.99} + 99%|█████████▉| 8675/8750 [2:44:11<07:04, 5.66s/it] {'loss': 0.4426, 'learning_rate': 3.853505450331918e-09, 'epoch': 0.99} + 99%|█████████▉| 8675/8750 [2:44:09<07:04, 5.66s/it] 99%|█████████▉| 8676/8750 [2:44:17<06:59, 5.67s/it] 99%|█████████▉| 8676/8750 [2:44:15<06:59, 5.67s/it] {'loss': 0.4451, 'learning_rate': 3.751436755247139e-09, 'epoch': 0.99} + 99%|█████████▉| 8676/8750 [2:44:17<06:59, 5.67s/it] {'loss': 0.4451, 'learning_rate': 3.751436755247139e-09, 'epoch': 0.99} + 99%|█████████▉| 8676/8750 [2:44:15<06:59, 5.67s/it] 
99%|█████████▉| 8677/8750 [2:44:22<06:54, 5.68s/it] 99%|█████████▉| 8677/8750 [2:44:20<06:54, 5.68s/it] {'loss': 0.4378, 'learning_rate': 3.650737769393953e-09, 'epoch': 0.99} + 99%|█████████▉| 8677/8750 [2:44:22<06:54, 5.68s/it] {'loss': 0.4378, 'learning_rate': 3.650737769393953e-09, 'epoch': 0.99} + 99%|█████████▉| 8677/8750 [2:44:20<06:54, 5.68s/it] 99%|█████████▉| 8678/8750 [2:44:28<06:47, 5.66s/it] 99%|█████████▉| 8678/8750 [2:44:26<06:47, 5.66s/it] {'loss': 0.4355, 'learning_rate': 3.5514085065690984e-09, 'epoch': 0.99} + 99%|█████████▉| 8678/8750 [2:44:28<06:47, 5.66s/it] {'loss': 0.4355, 'learning_rate': 3.5514085065690984e-09, 'epoch': 0.99} + 99%|█████████▉| 8678/8750 [2:44:26<06:47, 5.66s/it] 99%|█████████▉| 8679/8750 [2:44:34<06:41, 5.66s/it] 99%|█████████▉| 8679/8750 [2:44:32<06:41, 5.66s/it] {'loss': 0.4543, 'learning_rate': 3.4534489803850215e-09, 'epoch': 0.99} + 99%|█████████▉| 8679/8750 [2:44:34<06:41, 5.66s/it] {'loss': 0.4543, 'learning_rate': 3.4534489803850215e-09, 'epoch': 0.99} + 99%|█████████▉| 8679/8750 [2:44:32<06:41, 5.66s/it] 99%|█████████▉| 8680/8750 [2:44:39<06:36, 5.67s/it] 99%|█████████▉| 8680/8750 [2:44:37<06:36, 5.67s/it] {'loss': 0.4409, 'learning_rate': 3.3568592042620974e-09, 'epoch': 0.99} + 99%|█████████▉| 8680/8750 [2:44:39<06:36, 5.67s/it] {'loss': 0.4409, 'learning_rate': 3.3568592042620974e-09, 'epoch': 0.99} + 99%|█████████▉| 8680/8750 [2:44:37<06:36, 5.67s/it] 99%|█████████▉| 8681/8750 [2:44:45<06:26, 5.61s/it] 99%|█████████▉| 8681/8750 [2:44:43<06:26, 5.61s/it] {'loss': 0.4628, 'learning_rate': 3.2616391914364056e-09, 'epoch': 0.99} + 99%|█████████▉| 8681/8750 [2:44:45<06:26, 5.61s/it] {'loss': 0.4628, 'learning_rate': 3.2616391914364056e-09, 'epoch': 0.99} + 99%|█████████▉| 8681/8750 [2:44:43<06:26, 5.61s/it] 99%|█████████▉| 8682/8750 [2:44:51<06:26, 5.68s/it] 99%|█████████▉| 8682/8750 [2:44:49<06:26, 5.68s/it] {'loss': 0.4445, 'learning_rate': 3.167788954954176e-09, 'epoch': 0.99} + 99%|█████████▉| 8682/8750 
[2:44:51<06:26, 5.68s/it] {'loss': 0.4445, 'learning_rate': 3.167788954954176e-09, 'epoch': 0.99} + 99%|█████████▉| 8682/8750 [2:44:49<06:26, 5.68s/it] 99%|█████████▉| 8683/8750 [2:44:56<06:18, 5.66s/it] 99%|█████████▉| 8683/8750 [2:44:54<06:18, 5.66s/it] {'loss': 0.4646, 'learning_rate': 3.075308507677344e-09, 'epoch': 0.99} + 99%|█████████▉| 8683/8750 [2:44:56<06:18, 5.66s/it] {'loss': 0.4646, 'learning_rate': 3.075308507677344e-09, 'epoch': 0.99} + 99%|█████████▉| 8683/8750 [2:44:54<06:18, 5.66s/it] 99%|█████████▉| 8684/8750 [2:45:02<06:15, 5.69s/it] 99%|█████████▉| 8684/8750 [2:45:00<06:15, 5.69s/it] {'loss': 0.4466, 'learning_rate': 2.9841978622746624e-09, 'epoch': 0.99} + 99%|█████████▉| 8684/8750 [2:45:02<06:15, 5.69s/it] {'loss': 0.4466, 'learning_rate': 2.9841978622746624e-09, 'epoch': 0.99} + 99%|█████████▉| 8684/8750 [2:45:00<06:15, 5.69s/it] 99%|█████████▉| 8685/8750 [2:45:06<06:10, 5.70s/it] 99%|█████████▉| 8685/8750 [2:45:08<06:10, 5.70s/it] {'loss': 0.4682, 'learning_rate': 2.894457031232811e-09, 'epoch': 0.99} + 99%|█████████▉| 8685/8750 [2:45:08<06:10, 5.70s/it] {'loss': 0.4682, 'learning_rate': 2.894457031232811e-09, 'epoch': 0.99} + 99%|█████████▉| 8685/8750 [2:45:06<06:10, 5.70s/it] 99%|█████████▉| 8686/8750 [2:45:14<06:10, 5.79s/it] 99%|█████████▉| 8686/8750 [2:45:12<06:10, 5.79s/it] {'loss': 0.4413, 'learning_rate': 2.8060860268475097e-09, 'epoch': 0.99} + 99%|█████████▉| 8686/8750 [2:45:14<06:10, 5.79s/it] {'loss': 0.4413, 'learning_rate': 2.8060860268475097e-09, 'epoch': 0.99} + 99%|█████████▉| 8686/8750 [2:45:12<06:10, 5.79s/it] 99%|█████████▉| 8687/8750 [2:45:20<06:09, 5.86s/it] 99%|█████████▉| 8687/8750 [2:45:18<06:09, 5.86s/it] {'loss': 0.4405, 'learning_rate': 2.7190848612279606e-09, 'epoch': 0.99} + 99%|█████████▉| 8687/8750 [2:45:20<06:09, 5.86s/it] {'loss': 0.4405, 'learning_rate': 2.7190848612279606e-09, 'epoch': 0.99} + 99%|█████████▉| 8687/8750 [2:45:18<06:09, 5.86s/it] 99%|█████████▉| 8688/8750 [2:45:26<06:04, 5.87s/it] 
99%|█████████▉| 8688/8750 [2:45:24<06:04, 5.87s/it] {'loss': 0.46, 'learning_rate': 2.6334535462935184e-09, 'epoch': 0.99} + 99%|█████████▉| 8688/8750 [2:45:26<06:04, 5.87s/it] {'loss': 0.46, 'learning_rate': 2.6334535462935184e-09, 'epoch': 0.99} + 99%|█████████▉| 8688/8750 [2:45:24<06:04, 5.87s/it] 99%|█████████▉| 8689/8750 [2:45:31<05:52, 5.79s/it] 99%|█████████▉| 8689/8750 [2:45:29<05:52, 5.79s/it] {'loss': 0.454, 'learning_rate': 2.54919209377924e-09, 'epoch': 0.99} + 99%|█████████▉| 8689/8750 [2:45:31<05:52, 5.79s/it] {'loss': 0.454, 'learning_rate': 2.54919209377924e-09, 'epoch': 0.99} + 99%|█████████▉| 8689/8750 [2:45:29<05:52, 5.79s/it] 99%|█████████▉| 8690/8750 [2:45:37<05:46, 5.78s/it] 99%|█████████▉| 8690/8750 [2:45:35<05:46, 5.78s/it] {'loss': 0.4464, 'learning_rate': 2.4663005152314455e-09, 'epoch': 0.99} + 99%|█████████▉| 8690/8750 [2:45:37<05:46, 5.78s/it] {'loss': 0.4464, 'learning_rate': 2.4663005152314455e-09, 'epoch': 0.99} + 99%|█████████▉| 8690/8750 [2:45:35<05:46, 5.78s/it] 99%|█████████▉| 8691/8750 [2:45:43<05:42, 5.80s/it] 99%|█████████▉| 8691/8750 [2:45:41<05:42, 5.80s/it] {'loss': 0.4624, 'learning_rate': 2.384778822006606e-09, 'epoch': 0.99} + 99%|█████████▉| 8691/8750 [2:45:43<05:42, 5.80s/it] {'loss': 0.4624, 'learning_rate': 2.384778822006606e-09, 'epoch': 0.99} + 99%|█████████▉| 8691/8750 [2:45:41<05:42, 5.80s/it] 99%|█████████▉| 8692/8750 [2:45:49<05:34, 5.76s/it] 99%|█████████▉| 8692/8750 [2:45:47<05:34, 5.76s/it] {'loss': 0.4351, 'learning_rate': 2.304627025274675e-09, 'epoch': 0.99} + 99%|█████████▉| 8692/8750 [2:45:49<05:34, 5.76s/it] {'loss': 0.4351, 'learning_rate': 2.304627025274675e-09, 'epoch': 0.99} + 99%|█████████▉| 8692/8750 [2:45:47<05:34, 5.76s/it] 99%|█████████▉| 8693/8750 [2:45:52<05:25, 5.71s/it] 99%|█████████▉| 8693/8750 [2:45:54<05:25, 5.71s/it] {'loss': 0.4529, 'learning_rate': 2.225845136019089e-09, 'epoch': 0.99} + {'loss': 0.4529, 'learning_rate': 2.225845136019089e-09, 'epoch': 0.99} 99%|█████████▉| 8693/8750 
[2:45:54<05:25, 5.71s/it] + 99%|█████████▉| 8693/8750 [2:45:52<05:25, 5.71s/it] 99%|█████████▉| 8694/8750 [2:46:00<05:21, 5.75s/it] 99%|█████████▉| 8694/8750 [2:45:58<05:21, 5.75s/it] {'loss': 0.4624, 'learning_rate': 2.148433165035657e-09, 'epoch': 0.99} + 99%|█████████▉| 8694/8750 [2:46:00<05:21, 5.75s/it] {'loss': 0.4624, 'learning_rate': 2.148433165035657e-09, 'epoch': 0.99} + 99%|█████████▉| 8694/8750 [2:45:58<05:21, 5.75s/it] 99%|█████████▉| 8695/8750 [2:46:06<05:15, 5.74s/it] 99%|█████████▉| 8695/8750 [2:46:04<05:15, 5.74s/it] {'loss': 0.4256, 'learning_rate': 2.0723911229303396e-09, 'epoch': 0.99} + 99%|█████████▉| 8695/8750 [2:46:06<05:15, 5.74s/it] {'loss': 0.4256, 'learning_rate': 2.0723911229303396e-09, 'epoch': 0.99} + 99%|█████████▉| 8695/8750 [2:46:04<05:15, 5.74s/it] 99%|█████████▉| 8696/8750 [2:46:11<05:07, 5.69s/it] 99%|█████████▉| 8696/8750 [2:46:09<05:07, 5.69s/it] {'loss': 0.4375, 'learning_rate': 1.9977190201225793e-09, 'epoch': 0.99} + 99%|█████████▉| 8696/8750 [2:46:11<05:07, 5.69s/it] {'loss': 0.4375, 'learning_rate': 1.9977190201225793e-09, 'epoch': 0.99} + 99%|█████████▉| 8696/8750 [2:46:09<05:07, 5.69s/it] 99%|█████████▉| 8697/8750 [2:46:17<05:00, 5.66s/it] 99%|█████████▉| 8697/8750 [2:46:15<05:00, 5.66s/it] {'loss': 0.4619, 'learning_rate': 1.924416866844192e-09, 'epoch': 0.99} + 99%|█████████▉| 8697/8750 [2:46:17<05:00, 5.66s/it] {'loss': 0.4619, 'learning_rate': 1.924416866844192e-09, 'epoch': 0.99} + 99%|█████████▉| 8697/8750 [2:46:15<05:00, 5.66s/it] 99%|█████████▉| 8698/8750 [2:46:21<04:55, 5.67s/it] 99%|█████████▉| 8698/8750 [2:46:23<04:55, 5.67s/it] {'loss': 0.4525, 'learning_rate': 1.8524846731404755e-09, 'epoch': 0.99} + {'loss': 0.4525, 'learning_rate': 1.8524846731404755e-09, 'epoch': 0.99} 99%|█████████▉| 8698/8750 [2:46:23<04:55, 5.67s/it] + 99%|█████████▉| 8698/8750 [2:46:21<04:55, 5.67s/it] 99%|█████████▉| 8699/8750 [2:46:28<04:52, 5.74s/it] 99%|█████████▉| 8699/8750 [2:46:27<04:52, 5.74s/it] {'loss': 0.4452, 
'learning_rate': 1.7819224488657695e-09, 'epoch': 0.99} + 99%|█████████▉| 8699/8750 [2:46:28<04:52, 5.74s/it] {'loss': 0.4452, 'learning_rate': 1.7819224488657695e-09, 'epoch': 0.99} + 99%|█████████▉| 8699/8750 [2:46:27<04:52, 5.74s/it]1 AutoResumeHook: Checking whether to suspend... +9 AutoResumeHook: Checking whether to suspend... +5 AutoResumeHook: Checking whether to suspend... +15 AutoResumeHook: Checking whether to suspend... +12 AutoResumeHook: Checking whether to suspend... +8 11AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 8700/8750 [2:46:34<04:44, 5.70s/it]10 AutoResumeHook: Checking whether to suspend... +6 AutoResumeHook: Checking whether to suspend... +04 AutoResumeHook: Checking whether to suspend... + AutoResumeHook: Checking whether to suspend... +14 AutoResumeHook: Checking whether to suspend... + 99%|█████████▉| 8700/8750 [2:46:32<04:44, 5.70s/it]7 AutoResumeHook: Checking whether to suspend... +13 AutoResumeHook: Checking whether to suspend... +3 AutoResumeHook: Checking whether to suspend... +2 AutoResumeHook: Checking whether to suspend... 
+ {'loss': 0.4516, 'learning_rate': 1.7127302036901162e-09, 'epoch': 0.99} + 99%|█████████▉| 8700/8750 [2:46:34<04:44, 5.70s/it] {'loss': 0.4516, 'learning_rate': 1.7127302036901162e-09, 'epoch': 0.99} + 99%|█████████▉| 8700/8750 [2:46:32<04:44, 5.70s/it]saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/tmp-checkpoint-8700/mm_projector +/lustre/fs12/portfolios/nvr/users/mmemmel/miniforge3/envs/vila/lib/python3.10/site-packages/torch/nn/modules/module.py:1802: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. 
+ warnings.warn( + 99%|█████████▉| 8701/8750 [2:46:57<08:52, 10.86s/it] 99%|█████████▉| 8701/8750 [2:46:55<08:52, 10.86s/it] {'loss': 0.4446, 'learning_rate': 1.6449079470937103e-09, 'epoch': 0.99} + 99%|█████████▉| 8701/8750 [2:46:57<08:52, 10.86s/it] {'loss': 0.4446, 'learning_rate': 1.6449079470937103e-09, 'epoch': 0.99} + 99%|█████████▉| 8701/8750 [2:46:55<08:52, 10.86s/it] 99%|█████████▉| 8702/8750 [2:47:03<07:24, 9.27s/it] 99%|█████████▉| 8702/8750 [2:47:01<07:24, 9.27s/it] {'loss': 0.4504, 'learning_rate': 1.5784556883691183e-09, 'epoch': 0.99} + 99%|█████████▉| 8702/8750 [2:47:03<07:24, 9.27s/it] {'loss': 0.4504, 'learning_rate': 1.5784556883691183e-09, 'epoch': 0.99} + 99%|█████████▉| 8702/8750 [2:47:01<07:24, 9.27s/it] 99%|█████████▉| 8703/8750 [2:47:08<06:26, 8.23s/it] 99%|█████████▉| 8703/8750 [2:47:06<06:26, 8.23s/it] {'loss': 0.458, 'learning_rate': 1.5133734366234998e-09, 'epoch': 0.99} + 99%|█████████▉| 8703/8750 [2:47:08<06:26, 8.23s/it] {'loss': 0.458, 'learning_rate': 1.5133734366234998e-09, 'epoch': 0.99} + 99%|█████████▉| 8703/8750 [2:47:06<06:26, 8.23s/it] 99%|█████████▉| 8704/8750 [2:47:14<05:42, 7.44s/it] 99%|█████████▉| 8704/8750 [2:47:12<05:42, 7.44s/it] {'loss': 0.4442, 'learning_rate': 1.449661200773056e-09, 'epoch': 0.99} + 99%|█████████▉| 8704/8750 [2:47:14<05:42, 7.44s/it] {'loss': 0.4442, 'learning_rate': 1.449661200773056e-09, 'epoch': 0.99} + 99%|█████████▉| 8704/8750 [2:47:12<05:42, 7.44s/it] 99%|█████████▉| 8705/8750 [2:47:20<05:11, 6.92s/it] 99%|█████████▉| 8705/8750 [2:47:18<05:11, 6.92s/it] {'loss': 0.4447, 'learning_rate': 1.3873189895485806e-09, 'epoch': 0.99} + 99%|█████████▉| 8705/8750 [2:47:20<05:11, 6.92s/it] {'loss': 0.4447, 'learning_rate': 1.3873189895485806e-09, 'epoch': 0.99} + 99%|█████████▉| 8705/8750 [2:47:18<05:11, 6.92s/it] 99%|█████████▉| 8706/8750 [2:47:25<04:47, 6.53s/it] 99%|█████████▉| 8706/8750 [2:47:23<04:47, 6.53s/it] {'loss': 0.4641, 'learning_rate': 1.3263468114921295e-09, 'epoch': 0.99} + 
99%|█████████▉| 8706/8750 [2:47:25<04:47, 6.53s/it] {'loss': 0.4641, 'learning_rate': 1.3263468114921295e-09, 'epoch': 0.99} + 99%|█████████▉| 8706/8750 [2:47:23<04:47, 6.53s/it] 100%|█████████▉| 8707/8750 [2:47:31<04:33, 6.35s/it] 100%|█████████▉| 8707/8750 [2:47:29<04:33, 6.35s/it] {'loss': 0.4302, 'learning_rate': 1.2667446749581314e-09, 'epoch': 1.0} + 100%|█████████▉| 8707/8750 [2:47:31<04:33, 6.35s/it] {'loss': 0.4302, 'learning_rate': 1.2667446749581314e-09, 'epoch': 1.0} + 100%|█████████▉| 8707/8750 [2:47:29<04:33, 6.35s/it] 100%|█████████▉| 8708/8750 [2:47:37<04:21, 6.22s/it] 100%|█████████▉| 8708/8750 [2:47:35<04:21, 6.22s/it] {'loss': 0.4294, 'learning_rate': 1.2085125881133863e-09, 'epoch': 1.0} + 100%|█████████▉| 8708/8750 [2:47:37<04:21, 6.22s/it] {'loss': 0.4294, 'learning_rate': 1.2085125881133863e-09, 'epoch': 1.0} + 100%|█████████▉| 8708/8750 [2:47:35<04:21, 6.22s/it] 100%|█████████▉| 8709/8750 [2:47:41<04:05, 5.99s/it] 100%|█████████▉| 8709/8750 [2:47:43<04:05, 6.00s/it] {'loss': 0.4844, 'learning_rate': 1.1516505589381777e-09, 'epoch': 1.0} + 100%|█████████▉| 8709/8750 [2:47:43<04:05, 6.00s/it] {'loss': 0.4844, 'learning_rate': 1.1516505589381777e-09, 'epoch': 1.0} + 100%|█████████▉| 8709/8750 [2:47:41<04:05, 5.99s/it] 100%|█████████▉| 8710/8750 [2:47:48<03:56, 5.92s/it] 100%|█████████▉| 8710/8750 [2:47:46<03:56, 5.92s/it] {'loss': 0.4651, 'learning_rate': 1.0961585952218301e-09, 'epoch': 1.0} + 100%|█████████▉| 8710/8750 [2:47:48<03:56, 5.92s/it] {'loss': 0.4651, 'learning_rate': 1.0961585952218301e-09, 'epoch': 1.0} + 100%|█████████▉| 8710/8750 [2:47:46<03:56, 5.92s/it] 100%|█████████▉| 8711/8750 [2:47:54<03:47, 5.82s/it] 100%|█████████▉| 8711/8750 [2:47:52<03:47, 5.82s/it] {'loss': 0.4544, 'learning_rate': 1.042036704568261e-09, 'epoch': 1.0} + 100%|█████████▉| 8711/8750 [2:47:54<03:47, 5.82s/it] {'loss': 0.4544, 'learning_rate': 1.042036704568261e-09, 'epoch': 1.0} + 100%|█████████▉| 8711/8750 [2:47:52<03:47, 5.82s/it] 100%|█████████▉| 
8712/8750 [2:48:00<03:39, 5.78s/it] 100%|█████████▉| 8712/8750 [2:47:58<03:39, 5.78s/it] {'loss': 0.4279, 'learning_rate': 9.89284894395981e-10, 'epoch': 1.0} + 100%|█████████▉| 8712/8750 [2:48:00<03:39, 5.78s/it] {'loss': 0.4279, 'learning_rate': 9.89284894395981e-10, 'epoch': 1.0} + 100%|█████████▉| 8712/8750 [2:47:58<03:39, 5.78s/it] 100%|█████████▉| 8713/8750 [2:48:06<03:36, 5.84s/it] 100%|█████████▉| 8713/8750 [2:48:04<03:36, 5.84s/it] {'loss': 0.4377, 'learning_rate': 9.37903171929211e-10, 'epoch': 1.0} + 100%|█████████▉| 8713/8750 [2:48:06<03:36, 5.84s/it] {'loss': 0.4377, 'learning_rate': 9.37903171929211e-10, 'epoch': 1.0} + 100%|█████████▉| 8713/8750 [2:48:04<03:36, 5.84s/it] 100%|█████████▉| 8714/8750 [2:48:11<03:28, 5.79s/it] 100%|█████████▉| 8714/8750 [2:48:09<03:28, 5.79s/it] {'loss': 0.4588, 'learning_rate': 8.878915442123159e-10, 'epoch': 1.0} + 100%|█████████▉| 8714/8750 [2:48:11<03:28, 5.79s/it] {'loss': 0.4588, 'learning_rate': 8.878915442123159e-10, 'epoch': 1.0} + 100%|█████████▉| 8714/8750 [2:48:09<03:28, 5.79s/it] 100%|█████████▉| 8715/8750 [2:48:17<03:21, 5.76s/it] 100%|█████████▉| 8715/8750 [2:48:15<03:21, 5.76s/it] {'loss': 0.4642, 'learning_rate': 8.392500180953722e-10, 'epoch': 1.0} + 100%|█████████▉| 8715/8750 [2:48:17<03:21, 5.76s/it] {'loss': 0.4642, 'learning_rate': 8.392500180953722e-10, 'epoch': 1.0} + 100%|█████████▉| 8715/8750 [2:48:15<03:21, 5.76s/it] 100%|█████████▉| 8716/8750 [2:48:21<03:16, 5.79s/it] 100%|█████████▉| 8716/8750 [2:48:23<03:16, 5.79s/it] {'loss': 0.4456, 'learning_rate': 7.919786002441588e-10, 'epoch': 1.0} + 100%|█████████▉| 8716/8750 [2:48:23<03:16, 5.79s/it] {'loss': 0.4456, 'learning_rate': 7.919786002441588e-10, 'epoch': 1.0} + 100%|█████████▉| 8716/8750 [2:48:21<03:16, 5.79s/it] 100%|█████████▉| 8717/8750 [2:48:28<03:09, 5.73s/it] 100%|█████████▉| 8717/8750 [2:48:26<03:09, 5.73s/it] {'loss': 0.4498, 'learning_rate': 7.460772971357167e-10, 'epoch': 1.0} + 100%|█████████▉| 8717/8750 [2:48:28<03:09, 
5.73s/it] {'loss': 0.4498, 'learning_rate': 7.460772971357167e-10, 'epoch': 1.0} + 100%|█████████▉| 8717/8750 [2:48:26<03:09, 5.73s/it] 100%|█████████▉| 8718/8750 [2:48:34<03:05, 5.79s/it] 100%|█████████▉| 8718/8750 [2:48:32<03:05, 5.79s/it] {'loss': 0.4332, 'learning_rate': 7.015461150594594e-10, 'epoch': 1.0} + 100%|█████████▉| 8718/8750 [2:48:34<03:05, 5.79s/it] {'loss': 0.4332, 'learning_rate': 7.015461150594594e-10, 'epoch': 1.0} + 100%|█████████▉| 8718/8750 [2:48:32<03:05, 5.79s/it] 100%|█████████▉| 8719/8750 [2:48:38<02:59, 5.79s/it] 100%|█████████▉| 8719/8750 [2:48:40<02:59, 5.80s/it] {'loss': 0.4588, 'learning_rate': 6.583850601182829e-10, 'epoch': 1.0} + 100%|█████████▉| 8719/8750 [2:48:40<02:59, 5.80s/it] {'loss': 0.4588, 'learning_rate': 6.583850601182829e-10, 'epoch': 1.0} + 100%|█████████▉| 8719/8750 [2:48:38<02:59, 5.79s/it] 100%|█████████▉| 8720/8750 [2:48:46<02:54, 5.82s/it] 100%|█████████▉| 8720/8750 [2:48:44<02:54, 5.82s/it] {'loss': 0.449, 'learning_rate': 6.165941382241248e-10, 'epoch': 1.0} + 100%|█████████▉| 8720/8750 [2:48:46<02:54, 5.82s/it] {'loss': 0.449, 'learning_rate': 6.165941382241248e-10, 'epoch': 1.0} + 100%|█████████▉| 8720/8750 [2:48:44<02:54, 5.82s/it] 100%|█████████▉| 8721/8750 [2:48:52<02:49, 5.83s/it] 100%|█████████▉| 8721/8750 [2:48:50<02:49, 5.83s/it] {'loss': 0.4273, 'learning_rate': 5.761733551057357e-10, 'epoch': 1.0} + 100%|█████████▉| 8721/8750 [2:48:52<02:49, 5.83s/it] {'loss': 0.4273, 'learning_rate': 5.761733551057357e-10, 'epoch': 1.0} + 100%|█████████▉| 8721/8750 [2:48:50<02:49, 5.83s/it] 100%|█████████▉| 8722/8750 [2:48:58<02:41, 5.78s/it] 100%|█████████▉| 8722/8750 [2:48:56<02:41, 5.78s/it]{'loss': 0.4443, 'learning_rate': 5.371227162997983e-10, 'epoch': 1.0} + {'loss': 0.4443, 'learning_rate': 5.371227162997983e-10, 'epoch': 1.0} + 100%|█████████▉| 8722/8750 [2:48:58<02:41, 5.78s/it] 100%|█████████▉| 8722/8750 [2:48:56<02:41, 5.78s/it] 100%|█████████▉| 8723/8750 [2:49:03<02:34, 5.71s/it] 100%|█████████▉| 
8723/8750 [2:49:01<02:34, 5.71s/it] {'loss': 0.4435, 'learning_rate': 4.994422271575872e-10, 'epoch': 1.0} + 100%|█████████▉| 8723/8750 [2:49:03<02:34, 5.71s/it] {'loss': 0.4435, 'learning_rate': 4.994422271575872e-10, 'epoch': 1.0} + 100%|█████████▉| 8723/8750 [2:49:01<02:34, 5.71s/it] 100%|█████████▉| 8724/8750 [2:49:09<02:27, 5.69s/it] 100%|█████████▉| 8724/8750 [2:49:07<02:27, 5.69s/it] {'loss': 0.4719, 'learning_rate': 4.631318928427497e-10, 'epoch': 1.0} + 100%|█████████▉| 8724/8750 [2:49:09<02:27, 5.69s/it] {'loss': 0.4719, 'learning_rate': 4.631318928427497e-10, 'epoch': 1.0} + 100%|█████████▉| 8724/8750 [2:49:07<02:27, 5.69s/it] 100%|█████████▉| 8725/8750 [2:49:14<02:21, 5.67s/it] 100%|█████████▉| 8725/8750 [2:49:12<02:21, 5.67s/it] {'loss': 0.4333, 'learning_rate': 4.2819171833019537e-10, 'epoch': 1.0} + 100%|█████████▉| 8725/8750 [2:49:14<02:21, 5.67s/it] {'loss': 0.4333, 'learning_rate': 4.2819171833019537e-10, 'epoch': 1.0} + 100%|█████████▉| 8725/8750 [2:49:12<02:21, 5.67s/it] 100%|█████████▉| 8726/8750 [2:49:20<02:15, 5.66s/it] 100%|█████████▉| 8726/8750 [2:49:18<02:15, 5.66s/it] {'loss': 0.4628, 'learning_rate': 3.946217084072057e-10, 'epoch': 1.0} + 100%|█████████▉| 8726/8750 [2:49:20<02:15, 5.66s/it] {'loss': 0.4628, 'learning_rate': 3.946217084072057e-10, 'epoch': 1.0} + 100%|█████████▉| 8726/8750 [2:49:18<02:15, 5.66s/it] 100%|█████████▉| 8727/8750 [2:49:26<02:12, 5.74s/it] 100%|█████████▉| 8727/8750 [2:49:24<02:12, 5.74s/it] {'loss': 0.4442, 'learning_rate': 3.624218676734348e-10, 'epoch': 1.0} + 100%|█████████▉| 8727/8750 [2:49:26<02:12, 5.74s/it] {'loss': 0.4442, 'learning_rate': 3.624218676734348e-10, 'epoch': 1.0} + 100%|█████████▉| 8727/8750 [2:49:24<02:12, 5.74s/it] 100%|█████████▉| 8728/8750 [2:49:32<02:06, 5.74s/it] 100%|█████████▉| 8728/8750 [2:49:30<02:06, 5.74s/it] {'loss': 0.467, 'learning_rate': 3.315922005420191e-10, 'epoch': 1.0} + {'loss': 0.467, 'learning_rate': 3.315922005420191e-10, 'epoch': 1.0} 100%|█████████▉| 8728/8750 
[2:49:32<02:06, 5.74s/it] + 100%|█████████▉| 8728/8750 [2:49:30<02:06, 5.74s/it] 100%|█████████▉| 8729/8750 [2:49:37<02:00, 5.75s/it] 100%|█████████▉| 8729/8750 [2:49:35<02:00, 5.75s/it] {'loss': 0.4376, 'learning_rate': 3.0213271123735735e-10, 'epoch': 1.0} + 100%|█████████▉| 8729/8750 [2:49:37<02:00, 5.75s/it] {'loss': 0.4376, 'learning_rate': 3.0213271123735735e-10, 'epoch': 1.0} + 100%|█████████▉| 8729/8750 [2:49:35<02:00, 5.75s/it] 100%|█████████▉| 8730/8750 [2:49:43<01:54, 5.73s/it] 100%|█████████▉| 8730/8750 [2:49:41<01:54, 5.73s/it] {'loss': 0.4528, 'learning_rate': 2.740434037951101e-10, 'epoch': 1.0} + 100%|█████████▉| 8730/8750 [2:49:43<01:54, 5.73s/it] {'loss': 0.4528, 'learning_rate': 2.740434037951101e-10, 'epoch': 1.0} + 100%|█████████▉| 8730/8750 [2:49:41<01:54, 5.73s/it] 100%|█████████▉| 8731/8750 [2:49:49<01:48, 5.71s/it] 100%|█████████▉| 8731/8750 [2:49:47<01:48, 5.71s/it] {'loss': 0.4384, 'learning_rate': 2.4732428206442063e-10, 'epoch': 1.0} + 100%|█████████▉| 8731/8750 [2:49:49<01:48, 5.71s/it] {'loss': 0.4384, 'learning_rate': 2.4732428206442063e-10, 'epoch': 1.0} + 100%|█████████▉| 8731/8750 [2:49:47<01:48, 5.71s/it] 100%|█████████▉| 8732/8750 [2:49:54<01:42, 5.70s/it] 100%|█████████▉| 8732/8750 [2:49:53<01:42, 5.70s/it] {'loss': 0.4699, 'learning_rate': 2.2197534970569424e-10, 'epoch': 1.0} + 100%|█████████▉| 8732/8750 [2:49:54<01:42, 5.70s/it] {'loss': 0.4699, 'learning_rate': 2.2197534970569424e-10, 'epoch': 1.0} + 100%|█████████▉| 8732/8750 [2:49:53<01:42, 5.70s/it] 100%|█████████▉| 8733/8750 [2:49:58<01:36, 5.68s/it] 100%|█████████▉| 8733/8750 [2:50:00<01:36, 5.68s/it] {'loss': 0.4377, 'learning_rate': 1.9799661019392901e-10, 'epoch': 1.0} + 100%|█████████▉| 8733/8750 [2:50:00<01:36, 5.68s/it] {'loss': 0.4377, 'learning_rate': 1.9799661019392901e-10, 'epoch': 1.0} + 100%|█████████▉| 8733/8750 [2:49:58<01:36, 5.68s/it] 100%|█████████▉| 8734/8750 [2:50:06<01:31, 5.74s/it] 100%|█████████▉| 8734/8750 [2:50:04<01:31, 5.74s/it] {'loss': 
0.4585, 'learning_rate': 1.7538806681316467e-10, 'epoch': 1.0} + 100%|█████████▉| 8734/8750 [2:50:06<01:31, 5.74s/it] {'loss': 0.4585, 'learning_rate': 1.7538806681316467e-10, 'epoch': 1.0} + 100%|█████████▉| 8734/8750 [2:50:04<01:31, 5.74s/it] 100%|█████████▉| 8735/8750 [2:50:12<01:25, 5.68s/it] 100%|█████████▉| 8735/8750 [2:50:10<01:25, 5.68s/it] {'loss': 0.45, 'learning_rate': 1.5414972266314389e-10, 'epoch': 1.0} + 100%|█████████▉| 8735/8750 [2:50:12<01:25, 5.68s/it] {'loss': 0.45, 'learning_rate': 1.5414972266314389e-10, 'epoch': 1.0} + 100%|█████████▉| 8735/8750 [2:50:10<01:25, 5.68s/it] 100%|█████████▉| 8736/8750 [2:50:15<01:19, 5.70s/it] 100%|█████████▉| 8736/8750 [2:50:17<01:19, 5.70s/it] {'loss': 0.433, 'learning_rate': 1.3428158065154073e-10, 'epoch': 1.0} + 100%|█████████▉| 8736/8750 [2:50:17<01:19, 5.70s/it] {'loss': 0.433, 'learning_rate': 1.3428158065154073e-10, 'epoch': 1.0} + 100%|█████████▉| 8736/8750 [2:50:15<01:19, 5.70s/it] 100%|█████████▉| 8737/8750 [2:50:23<01:14, 5.69s/it] 100%|█████████▉| 8737/8750 [2:50:21<01:14, 5.69s/it] {'loss': 0.4574, 'learning_rate': 1.1578364350284254e-10, 'epoch': 1.0} + 100%|█████████▉| 8737/8750 [2:50:23<01:14, 5.69s/it] {'loss': 0.4574, 'learning_rate': 1.1578364350284254e-10, 'epoch': 1.0} + 100%|█████████▉| 8737/8750 [2:50:21<01:14, 5.69s/it] 100%|█████████▉| 8738/8750 [2:50:29<01:08, 5.73s/it] 100%|█████████▉| 8738/8750 [2:50:27<01:08, 5.73s/it] {'loss': 0.4435, 'learning_rate': 9.865591375168848e-11, 'epoch': 1.0} + 100%|█████████▉| 8738/8750 [2:50:29<01:08, 5.73s/it] {'loss': 0.4435, 'learning_rate': 9.865591375168848e-11, 'epoch': 1.0} + 100%|█████████▉| 8738/8750 [2:50:27<01:08, 5.73s/it] 100%|█████████▉| 8739/8750 [2:50:34<01:02, 5.72s/it] 100%|█████████▉| 8739/8750 [2:50:33<01:02, 5.72s/it] {'loss': 0.4497, 'learning_rate': 8.289839374286956e-11, 'epoch': 1.0} + 100%|█████████▉| 8739/8750 [2:50:34<01:02, 5.72s/it] {'loss': 0.4497, 'learning_rate': 8.289839374286956e-11, 'epoch': 1.0} + 100%|█████████▉| 
8739/8750 [2:50:33<01:02, 5.72s/it] 100%|█████████▉| 8740/8750 [2:50:40<00:56, 5.65s/it] 100%|█████████▉| 8740/8750 [2:50:38<00:56, 5.66s/it] {'loss': 0.4621, 'learning_rate': 6.851108563687981e-11, 'epoch': 1.0} + 100%|█████████▉| 8740/8750 [2:50:40<00:56, 5.65s/it] {'loss': 0.4621, 'learning_rate': 6.851108563687981e-11, 'epoch': 1.0} + 100%|█████████▉| 8740/8750 [2:50:38<00:56, 5.66s/it] 100%|█████████▉| 8741/8750 [2:50:46<00:50, 5.66s/it] 100%|█████████▉| 8741/8750 [2:50:44<00:50, 5.66s/it] {'loss': 0.4419, 'learning_rate': 5.549399140547529e-11, 'epoch': 1.0} + 100%|█████████▉| 8741/8750 [2:50:46<00:50, 5.66s/it] {'loss': 0.4419, 'learning_rate': 5.549399140547529e-11, 'epoch': 1.0} + 100%|█████████▉| 8741/8750 [2:50:44<00:50, 5.66s/it] 100%|█████████▉| 8742/8750 [2:50:49<00:45, 5.66s/it] 100%|█████████▉| 8742/8750 [2:50:51<00:45, 5.66s/it] {'loss': 0.4573, 'learning_rate': 4.3847112831674196e-11, 'epoch': 1.0} + 100%|█████████▉| 8742/8750 [2:50:51<00:45, 5.66s/it] {'loss': 0.4573, 'learning_rate': 4.3847112831674196e-11, 'epoch': 1.0} + 100%|█████████▉| 8742/8750 [2:50:49<00:45, 5.66s/it] 100%|█████████▉| 8743/8750 [2:50:57<00:39, 5.65s/it] 100%|█████████▉| 8743/8750 [2:50:55<00:39, 5.65s/it] {'loss': 0.4466, 'learning_rate': 3.3570451511977245e-11, 'epoch': 1.0} + 100%|█████████▉| 8743/8750 [2:50:57<00:39, 5.65s/it] {'loss': 0.4466, 'learning_rate': 3.3570451511977245e-11, 'epoch': 1.0} + 100%|█████████▉| 8743/8750 [2:50:55<00:39, 5.65s/it] 100%|█████████▉| 8744/8750 [2:51:03<00:33, 5.66s/it] 100%|█████████▉| 8744/8750 [2:51:01<00:33, 5.66s/it] {'loss': 0.4322, 'learning_rate': 2.466400885303699e-11, 'epoch': 1.0} + 100%|█████████▉| 8744/8750 [2:51:03<00:33, 5.66s/it] {'loss': 0.4322, 'learning_rate': 2.466400885303699e-11, 'epoch': 1.0} + 100%|█████████▉| 8744/8750 [2:51:01<00:33, 5.66s/it] 100%|█████████▉| 8745/8750 [2:51:08<00:28, 5.63s/it] 100%|█████████▉| 8745/8750 [2:51:06<00:28, 5.63s/it] {'loss': 0.4459, 'learning_rate': 1.7127786077208998e-11, 
'epoch': 1.0} + 100%|█████████▉| 8745/8750 [2:51:08<00:28, 5.63s/it] {'loss': 0.4459, 'learning_rate': 1.7127786077208998e-11, 'epoch': 1.0} + 100%|█████████▉| 8745/8750 [2:51:06<00:28, 5.63s/it] 100%|█████████▉| 8746/8750 [2:51:14<00:22, 5.68s/it] 100%|█████████▉| 8746/8750 [2:51:12<00:22, 5.68s/it] {'loss': 0.4587, 'learning_rate': 1.0961784215890448e-11, 'epoch': 1.0} + 100%|█████████▉| 8746/8750 [2:51:14<00:22, 5.68s/it] {'loss': 0.4587, 'learning_rate': 1.0961784215890448e-11, 'epoch': 1.0} + 100%|█████████▉| 8746/8750 [2:51:12<00:22, 5.68s/it] 100%|█████████▉| 8747/8750 [2:51:20<00:17, 5.67s/it] 100%|█████████▉| 8747/8750 [2:51:18<00:17, 5.67s/it] {'loss': 0.4332, 'learning_rate': 6.166004113961066e-12, 'epoch': 1.0} + 100%|█████████▉| 8747/8750 [2:51:20<00:17, 5.67s/it] {'loss': 0.4332, 'learning_rate': 6.166004113961066e-12, 'epoch': 1.0} + 100%|█████████▉| 8747/8750 [2:51:18<00:17, 5.67s/it] 100%|█████████▉| 8748/8750 [2:51:25<00:11, 5.68s/it] 100%|█████████▉| 8748/8750 [2:51:23<00:11, 5.68s/it] {'loss': 0.452, 'learning_rate': 2.7404464297831057e-12, 'epoch': 1.0} + 100%|█████████▉| 8748/8750 [2:51:25<00:11, 5.68s/it] {'loss': 0.452, 'learning_rate': 2.7404464297831057e-12, 'epoch': 1.0} + 100%|█████████▉| 8748/8750 [2:51:23<00:11, 5.68s/it] 100%|█████████▉| 8749/8750 [2:51:31<00:05, 5.71s/it] 100%|█████████▉| 8749/8750 [2:51:29<00:05, 5.71s/it] {'loss': 0.4351, 'learning_rate': 6.8511163076046e-13, 'epoch': 1.0} + 100%|█████████▉| 8749/8750 [2:51:31<00:05, 5.71s/it] {'loss': 0.4351, 'learning_rate': 6.8511163076046e-13, 'epoch': 1.0} + 100%|█████████▉| 8749/8750 [2:51:29<00:05, 5.71s/it]6 AutoResumeHook: Checking whether to suspend... +158 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... + +1 AutoResumeHook: Checking whether to suspend... + 100%|██████████| 8750/8750 [2:51:38<00:00, 5.93s/it]11 AutoResumeHook: Checking whether to suspend... 
+02 AutoResumeHook: Checking whether to suspend...10 AutoResumeHook: Checking whether to suspend... + +14 AutoResumeHook: Checking whether to suspend... + 100%|██████████| 8750/8750 [2:51:36<00:00, 5.93s/it] 13 AutoResumeHook: Checking whether to suspend...AutoResumeHook: Checking whether to suspend... 100%|██████████| 8750/8750 [2:51:38<00:00, 5.93s/it] + + 9 AutoResumeHook: Checking whether to suspend... + 12 AutoResumeHook: Checking whether to suspend... + 100%|██████████| 8750/8750 [2:51:38<00:00, 5.93s/it]5 AutoResumeHook: Checking whether to suspend... +4 AutoResumeHook: Checking whether to suspend... + 100%|██████████| 8750/8750 [2:51:36<00:00, 5.93s/it]{'loss': 0.4951, 'learning_rate': 0.0, 'epoch': 1.0} + 7 AutoResumeHook: Checking whether to suspend... + 3 AutoResumeHook: Checking whether to suspend... +{'train_runtime': 10298.032, 'train_samples_per_second': 438.849, 'train_steps_per_second': 0.85, 'train_loss': 0.09011299923147474, 'epoch': 1.0} +{'loss': 0.4951, 'learning_rate': 0.0, 'epoch': 1.0} 100%|██████████| 8750/8750 [2:51:36<00:00, 5.93s/it] +{'train_runtime': 10298.0352, 'train_samples_per_second': 438.849, 'train_steps_per_second': 0.85, 'train_loss': 0.09011299923147474, 'epoch': 1.0} + 100%|██████████| 8750/8750 [2:51:36<00:00, 1.18s/it] 100%|██████████| 8750/8750 [2:51:38<00:00, 1.18s/it] + +saving llm to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/llm +saving vision_tower to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/vision_tower +saving mm_projector to /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/mm_projector +wandb: 🚀 View run vila_3b_oxe_sim_jack_o at: https://wandb.ai/memmelma/VILA/runs/7u2jkjeh +wandb: Find logs at: ../../../../../../../../fs12/portfolios/nvr/users/mmemmel/projects/vila/VILA/wandb/run-20250410_181754-7u2jkjeh/logs +srun: job 6729323 
queued and waiting for resources +srun: job 6729323 has been allocated resources +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-10017 +JobID: 6729323 | Full list: batch-block1-10017 batch-block1-2120 +NETWORK=Efficient-Large-Model/VILA1.5-3b +wandb: Currently logged in as: memmelma. Use `wandb login --relogin` to force relogin +MASTER_ADDR=batch-block1-10017 +JobID: 6729323 | Full list: batch-block1-10017 batch-block1-2120 +NETWORK=Efficient-Large-Model/VILA1.5-3b +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! +Did not find AutoResume SDK! 
+[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,575] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:57,708] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-04-10 21:13:59,042] [WARNING] 
[comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,042] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,042] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,042] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,042] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,042] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,042] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,042] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,043] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,043] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,043] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [WARNING] 
[comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +[2025-04-10 21:13:59,056] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2025-04-10 21:13:59,056] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2025-04-10 21:13:59,056] [INFO] [comm.py:594:init_distributed] cdb=None +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp trainingModels has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training + +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. 
Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. Skipp training +Models has been ready under /lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o. 
Skipp training diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf47e1f9ed7a2d34753ba7968d73bef788c8ab72 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,52530 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.604562737642586e-08, + "loss": 0.8388, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.5209125475285173e-07, + "loss": 0.8142, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 2.281368821292776e-07, + "loss": 0.8402, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 3.0418250950570346e-07, + "loss": 0.808, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 3.802281368821293e-07, + "loss": 0.8206, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 4.562737642585552e-07, + "loss": 0.8594, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 5.32319391634981e-07, + "loss": 0.8293, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 6.083650190114069e-07, + "loss": 0.8594, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 6.844106463878328e-07, + "loss": 0.8089, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 7.604562737642586e-07, + "loss": 0.8156, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 8.365019011406844e-07, + "loss": 0.8078, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 9.125475285171104e-07, + "loss": 0.7722, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 9.885931558935361e-07, + "loss": 0.799, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 1.064638783269962e-06, + "loss": 0.8171, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.140684410646388e-06, + "loss": 0.7761, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 1.2167300380228138e-06, + 
"loss": 0.7554, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 1.2927756653992395e-06, + "loss": 0.7353, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 1.3688212927756656e-06, + "loss": 0.7562, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 1.4448669201520913e-06, + "loss": 0.7364, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 1.5209125475285172e-06, + "loss": 0.7192, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.5969581749049431e-06, + "loss": 0.7345, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 1.6730038022813688e-06, + "loss": 0.704, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 1.7490494296577947e-06, + "loss": 0.6885, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 1.8250950570342208e-06, + "loss": 0.6952, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 1.9011406844106463e-06, + "loss": 0.679, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 1.9771863117870722e-06, + "loss": 0.6861, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 2.053231939163498e-06, + "loss": 0.6799, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 2.129277566539924e-06, + "loss": 0.669, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 2.20532319391635e-06, + "loss": 0.6709, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 2.281368821292776e-06, + "loss": 0.6729, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 2.3574144486692017e-06, + "loss": 0.6522, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 2.4334600760456276e-06, + "loss": 0.6696, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 2.509505703422053e-06, + "loss": 0.6507, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 2.585551330798479e-06, + "loss": 0.6338, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 2.6615969581749054e-06, + "loss": 0.6667, + "step": 35 + }, + { + "epoch": 0.0, + "learning_rate": 2.7376425855513313e-06, + "loss": 0.6551, + "step": 36 
+ }, + { + "epoch": 0.0, + "learning_rate": 2.813688212927757e-06, + "loss": 0.6158, + "step": 37 + }, + { + "epoch": 0.0, + "learning_rate": 2.8897338403041826e-06, + "loss": 0.6222, + "step": 38 + }, + { + "epoch": 0.0, + "learning_rate": 2.9657794676806085e-06, + "loss": 0.628, + "step": 39 + }, + { + "epoch": 0.0, + "learning_rate": 3.0418250950570345e-06, + "loss": 0.6507, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 3.1178707224334604e-06, + "loss": 0.6518, + "step": 41 + }, + { + "epoch": 0.0, + "learning_rate": 3.1939163498098863e-06, + "loss": 0.6297, + "step": 42 + }, + { + "epoch": 0.0, + "learning_rate": 3.269961977186312e-06, + "loss": 0.6228, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 3.3460076045627376e-06, + "loss": 0.6278, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 3.4220532319391635e-06, + "loss": 0.6226, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 3.4980988593155894e-06, + "loss": 0.6057, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 3.5741444866920154e-06, + "loss": 0.6189, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 3.6501901140684417e-06, + "loss": 0.6293, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 3.7262357414448676e-06, + "loss": 0.6231, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 3.8022813688212926e-06, + "loss": 0.629, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.8783269961977185e-06, + "loss": 0.6011, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 3.9543726235741444e-06, + "loss": 0.6084, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 4.03041825095057e-06, + "loss": 0.5854, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 4.106463878326996e-06, + "loss": 0.6004, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 4.182509505703423e-06, + "loss": 0.5955, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 4.258555133079848e-06, + "loss": 0.6117, + "step": 56 + }, + { + 
"epoch": 0.01, + "learning_rate": 4.334600760456274e-06, + "loss": 0.5993, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 4.4106463878327e-06, + "loss": 0.6063, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 4.486692015209126e-06, + "loss": 0.5858, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 4.562737642585552e-06, + "loss": 0.6101, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.638783269961978e-06, + "loss": 0.5843, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 4.7148288973384035e-06, + "loss": 0.5875, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 4.790874524714829e-06, + "loss": 0.5833, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 4.866920152091255e-06, + "loss": 0.5843, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 4.942965779467681e-06, + "loss": 0.5836, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 5.019011406844106e-06, + "loss": 0.6352, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 5.095057034220533e-06, + "loss": 0.5787, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 5.171102661596958e-06, + "loss": 0.59, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 5.247148288973385e-06, + "loss": 0.5917, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 5.323193916349811e-06, + "loss": 0.5731, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.399239543726236e-06, + "loss": 0.5789, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 5.4752851711026625e-06, + "loss": 0.581, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 5.5513307984790876e-06, + "loss": 0.5857, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 5.627376425855514e-06, + "loss": 0.5863, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 5.703422053231939e-06, + "loss": 0.5802, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 5.779467680608365e-06, + "loss": 0.5812, + "step": 76 + }, + { + "epoch": 0.01, + 
"learning_rate": 5.855513307984791e-06, + "loss": 0.5775, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 5.931558935361217e-06, + "loss": 0.5732, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 6.007604562737643e-06, + "loss": 0.5573, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 6.083650190114069e-06, + "loss": 0.5563, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 6.159695817490496e-06, + "loss": 0.5797, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 6.235741444866921e-06, + "loss": 0.5945, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 6.311787072243346e-06, + "loss": 0.5757, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 6.3878326996197725e-06, + "loss": 0.569, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 6.4638783269961976e-06, + "loss": 0.5691, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 6.539923954372624e-06, + "loss": 0.5818, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 6.61596958174905e-06, + "loss": 0.5869, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 6.692015209125475e-06, + "loss": 0.5624, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 6.768060836501902e-06, + "loss": 0.561, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 6.844106463878327e-06, + "loss": 0.5586, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.920152091254754e-06, + "loss": 0.585, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 6.996197718631179e-06, + "loss": 0.575, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 7.072243346007606e-06, + "loss": 0.5674, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 7.148288973384031e-06, + "loss": 0.5803, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 7.224334600760457e-06, + "loss": 0.5745, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 7.300380228136883e-06, + "loss": 0.5678, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 
7.376425855513308e-06, + "loss": 0.5495, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 7.452471482889735e-06, + "loss": 0.5583, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 7.52851711026616e-06, + "loss": 0.5921, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 7.604562737642585e-06, + "loss": 0.564, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.680608365019012e-06, + "loss": 0.5552, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 7.756653992395437e-06, + "loss": 0.5649, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 7.832699619771864e-06, + "loss": 0.5691, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 7.908745247148289e-06, + "loss": 0.5722, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 7.984790874524716e-06, + "loss": 0.5691, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 8.06083650190114e-06, + "loss": 0.5728, + "step": 106 + }, + { + "epoch": 0.01, + "learning_rate": 8.136882129277567e-06, + "loss": 0.5799, + "step": 107 + }, + { + "epoch": 0.01, + "learning_rate": 8.212927756653993e-06, + "loss": 0.5636, + "step": 108 + }, + { + "epoch": 0.01, + "learning_rate": 8.28897338403042e-06, + "loss": 0.5666, + "step": 109 + }, + { + "epoch": 0.01, + "learning_rate": 8.365019011406846e-06, + "loss": 0.5627, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 8.441064638783271e-06, + "loss": 0.5494, + "step": 111 + }, + { + "epoch": 0.01, + "learning_rate": 8.517110266159696e-06, + "loss": 0.5716, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 8.593155893536123e-06, + "loss": 0.5442, + "step": 113 + }, + { + "epoch": 0.01, + "learning_rate": 8.669201520912548e-06, + "loss": 0.5698, + "step": 114 + }, + { + "epoch": 0.01, + "learning_rate": 8.745247148288975e-06, + "loss": 0.5592, + "step": 115 + }, + { + "epoch": 0.01, + "learning_rate": 8.8212927756654e-06, + "loss": 0.5573, + "step": 116 + }, + { + "epoch": 0.01, + "learning_rate": 
8.897338403041825e-06, + "loss": 0.5634, + "step": 117 + }, + { + "epoch": 0.01, + "learning_rate": 8.973384030418252e-06, + "loss": 0.5451, + "step": 118 + }, + { + "epoch": 0.01, + "learning_rate": 9.049429657794677e-06, + "loss": 0.5629, + "step": 119 + }, + { + "epoch": 0.01, + "learning_rate": 9.125475285171103e-06, + "loss": 0.5551, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 9.201520912547528e-06, + "loss": 0.547, + "step": 121 + }, + { + "epoch": 0.01, + "learning_rate": 9.277566539923955e-06, + "loss": 0.5649, + "step": 122 + }, + { + "epoch": 0.01, + "learning_rate": 9.35361216730038e-06, + "loss": 0.5516, + "step": 123 + }, + { + "epoch": 0.01, + "learning_rate": 9.429657794676807e-06, + "loss": 0.5661, + "step": 124 + }, + { + "epoch": 0.01, + "learning_rate": 9.505703422053234e-06, + "loss": 0.5619, + "step": 125 + }, + { + "epoch": 0.01, + "learning_rate": 9.581749049429659e-06, + "loss": 0.5714, + "step": 126 + }, + { + "epoch": 0.01, + "learning_rate": 9.657794676806086e-06, + "loss": 0.5372, + "step": 127 + }, + { + "epoch": 0.01, + "learning_rate": 9.73384030418251e-06, + "loss": 0.5492, + "step": 128 + }, + { + "epoch": 0.01, + "learning_rate": 9.809885931558936e-06, + "loss": 0.5402, + "step": 129 + }, + { + "epoch": 0.01, + "learning_rate": 9.885931558935362e-06, + "loss": 0.5687, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 9.961977186311787e-06, + "loss": 0.5433, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 1.0038022813688212e-05, + "loss": 0.5584, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 1.011406844106464e-05, + "loss": 0.5561, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 1.0190114068441066e-05, + "loss": 0.5563, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 1.0266159695817491e-05, + "loss": 0.5626, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 1.0342205323193916e-05, + "loss": 0.5659, + "step": 136 + }, + { + "epoch": 0.02, + 
"learning_rate": 1.0418250950570343e-05, + "loss": 0.516, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 1.049429657794677e-05, + "loss": 0.5602, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 1.0570342205323195e-05, + "loss": 0.553, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 1.0646387832699621e-05, + "loss": 0.5562, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 1.0722433460076046e-05, + "loss": 0.5302, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 1.0798479087452472e-05, + "loss": 0.561, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 1.0874524714828898e-05, + "loss": 0.5463, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 1.0950570342205325e-05, + "loss": 0.5567, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 1.1026615969581752e-05, + "loss": 0.5301, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 1.1102661596958175e-05, + "loss": 0.5499, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 1.1178707224334602e-05, + "loss": 0.5371, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 1.1254752851711029e-05, + "loss": 0.5385, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 1.1330798479087452e-05, + "loss": 0.5438, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 1.1406844106463879e-05, + "loss": 0.5507, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.1482889733840306e-05, + "loss": 0.5496, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 1.155893536121673e-05, + "loss": 0.5411, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 1.1634980988593156e-05, + "loss": 0.5492, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 1.1711026615969582e-05, + "loss": 0.5384, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 1.1787072243346009e-05, + "loss": 0.5209, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 1.1863117870722434e-05, + "loss": 0.5404, + "step": 156 + }, + { + 
"epoch": 0.02, + "learning_rate": 1.1939163498098861e-05, + "loss": 0.5511, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 1.2015209125475286e-05, + "loss": 0.53, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 1.2091254752851711e-05, + "loss": 0.5369, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 1.2167300380228138e-05, + "loss": 0.5446, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.2243346007604565e-05, + "loss": 0.5408, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 1.2319391634980991e-05, + "loss": 0.5411, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 1.2395437262357415e-05, + "loss": 0.5167, + "step": 163 + }, + { + "epoch": 0.02, + "learning_rate": 1.2471482889733841e-05, + "loss": 0.5447, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 1.2547528517110268e-05, + "loss": 0.5221, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 1.2623574144486692e-05, + "loss": 0.536, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 1.2699619771863118e-05, + "loss": 0.5453, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 1.2775665399239545e-05, + "loss": 0.531, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 1.2851711026615972e-05, + "loss": 0.5511, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 1.2927756653992395e-05, + "loss": 0.5444, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.3003802281368822e-05, + "loss": 0.5562, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 1.3079847908745249e-05, + "loss": 0.5343, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 1.3155893536121674e-05, + "loss": 0.5298, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 1.32319391634981e-05, + "loss": 0.5133, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 1.3307984790874526e-05, + "loss": 0.5407, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 1.338403041825095e-05, + "loss": 0.5446, + "step": 
176 + }, + { + "epoch": 0.02, + "learning_rate": 1.3460076045627377e-05, + "loss": 0.5178, + "step": 177 + }, + { + "epoch": 0.02, + "learning_rate": 1.3536121673003804e-05, + "loss": 0.5481, + "step": 178 + }, + { + "epoch": 0.02, + "learning_rate": 1.361216730038023e-05, + "loss": 0.5537, + "step": 179 + }, + { + "epoch": 0.02, + "learning_rate": 1.3688212927756654e-05, + "loss": 0.5121, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.3764258555133081e-05, + "loss": 0.5132, + "step": 181 + }, + { + "epoch": 0.02, + "learning_rate": 1.3840304182509508e-05, + "loss": 0.5489, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 1.3916349809885931e-05, + "loss": 0.5299, + "step": 183 + }, + { + "epoch": 0.02, + "learning_rate": 1.3992395437262358e-05, + "loss": 0.5523, + "step": 184 + }, + { + "epoch": 0.02, + "learning_rate": 1.4068441064638785e-05, + "loss": 0.5257, + "step": 185 + }, + { + "epoch": 0.02, + "learning_rate": 1.4144486692015211e-05, + "loss": 0.5232, + "step": 186 + }, + { + "epoch": 0.02, + "learning_rate": 1.4220532319391636e-05, + "loss": 0.5423, + "step": 187 + }, + { + "epoch": 0.02, + "learning_rate": 1.4296577946768061e-05, + "loss": 0.528, + "step": 188 + }, + { + "epoch": 0.02, + "learning_rate": 1.4372623574144488e-05, + "loss": 0.527, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 1.4448669201520913e-05, + "loss": 0.5425, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 1.452471482889734e-05, + "loss": 0.5182, + "step": 191 + }, + { + "epoch": 0.02, + "learning_rate": 1.4600760456273767e-05, + "loss": 0.5552, + "step": 192 + }, + { + "epoch": 0.02, + "learning_rate": 1.467680608365019e-05, + "loss": 0.5517, + "step": 193 + }, + { + "epoch": 0.02, + "learning_rate": 1.4752851711026617e-05, + "loss": 0.5179, + "step": 194 + }, + { + "epoch": 0.02, + "learning_rate": 1.4828897338403044e-05, + "loss": 0.5453, + "step": 195 + }, + { + "epoch": 0.02, + "learning_rate": 1.490494296577947e-05, + "loss": 
0.5325, + "step": 196 + }, + { + "epoch": 0.02, + "learning_rate": 1.4980988593155894e-05, + "loss": 0.5305, + "step": 197 + }, + { + "epoch": 0.02, + "learning_rate": 1.505703422053232e-05, + "loss": 0.5312, + "step": 198 + }, + { + "epoch": 0.02, + "learning_rate": 1.5133079847908747e-05, + "loss": 0.5371, + "step": 199 + }, + { + "epoch": 0.02, + "learning_rate": 1.520912547528517e-05, + "loss": 0.5374, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 1.5285171102661597e-05, + "loss": 0.537, + "step": 201 + }, + { + "epoch": 0.02, + "learning_rate": 1.5361216730038024e-05, + "loss": 0.5428, + "step": 202 + }, + { + "epoch": 0.02, + "learning_rate": 1.543726235741445e-05, + "loss": 0.5202, + "step": 203 + }, + { + "epoch": 0.02, + "learning_rate": 1.5513307984790874e-05, + "loss": 0.5168, + "step": 204 + }, + { + "epoch": 0.02, + "learning_rate": 1.55893536121673e-05, + "loss": 0.5479, + "step": 205 + }, + { + "epoch": 0.02, + "learning_rate": 1.5665399239543728e-05, + "loss": 0.5294, + "step": 206 + }, + { + "epoch": 0.02, + "learning_rate": 1.574144486692015e-05, + "loss": 0.535, + "step": 207 + }, + { + "epoch": 0.02, + "learning_rate": 1.5817490494296578e-05, + "loss": 0.5417, + "step": 208 + }, + { + "epoch": 0.02, + "learning_rate": 1.5893536121673005e-05, + "loss": 0.5564, + "step": 209 + }, + { + "epoch": 0.02, + "learning_rate": 1.596958174904943e-05, + "loss": 0.5247, + "step": 210 + }, + { + "epoch": 0.02, + "learning_rate": 1.6045627376425855e-05, + "loss": 0.5157, + "step": 211 + }, + { + "epoch": 0.02, + "learning_rate": 1.612167300380228e-05, + "loss": 0.5206, + "step": 212 + }, + { + "epoch": 0.02, + "learning_rate": 1.6197718631178708e-05, + "loss": 0.5466, + "step": 213 + }, + { + "epoch": 0.02, + "learning_rate": 1.6273764258555135e-05, + "loss": 0.5188, + "step": 214 + }, + { + "epoch": 0.02, + "learning_rate": 1.634980988593156e-05, + "loss": 0.5301, + "step": 215 + }, + { + "epoch": 0.02, + "learning_rate": 1.6425855513307985e-05, 
+ "loss": 0.5417, + "step": 216 + }, + { + "epoch": 0.02, + "learning_rate": 1.6501901140684412e-05, + "loss": 0.5318, + "step": 217 + }, + { + "epoch": 0.02, + "learning_rate": 1.657794676806084e-05, + "loss": 0.5161, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 1.6653992395437265e-05, + "loss": 0.5379, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 1.6730038022813692e-05, + "loss": 0.5216, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.6806083650190115e-05, + "loss": 0.5264, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 1.6882129277566542e-05, + "loss": 0.5262, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 1.695817490494297e-05, + "loss": 0.5251, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 1.7034220532319392e-05, + "loss": 0.5259, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 1.711026615969582e-05, + "loss": 0.5346, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 1.7186311787072246e-05, + "loss": 0.5362, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 1.7262357414448672e-05, + "loss": 0.5164, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 1.7338403041825096e-05, + "loss": 0.5233, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 1.7414448669201523e-05, + "loss": 0.5171, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 1.749049429657795e-05, + "loss": 0.521, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.7566539923954373e-05, + "loss": 0.5227, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 1.76425855513308e-05, + "loss": 0.518, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 1.7718631178707226e-05, + "loss": 0.5416, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 1.779467680608365e-05, + "loss": 0.521, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 1.7870722433460076e-05, + "loss": 0.5199, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 
1.7946768060836503e-05, + "loss": 0.5327, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 1.802281368821293e-05, + "loss": 0.52, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 1.8098859315589353e-05, + "loss": 0.519, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 1.817490494296578e-05, + "loss": 0.5309, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 1.8250950570342207e-05, + "loss": 0.5213, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.832699619771863e-05, + "loss": 0.5349, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 1.8403041825095057e-05, + "loss": 0.5415, + "step": 242 + }, + { + "epoch": 0.03, + "learning_rate": 1.8479087452471484e-05, + "loss": 0.5163, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 1.855513307984791e-05, + "loss": 0.5141, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 1.8631178707224337e-05, + "loss": 0.5208, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 1.870722433460076e-05, + "loss": 0.5091, + "step": 246 + }, + { + "epoch": 0.03, + "learning_rate": 1.8783269961977187e-05, + "loss": 0.5496, + "step": 247 + }, + { + "epoch": 0.03, + "learning_rate": 1.8859315589353614e-05, + "loss": 0.518, + "step": 248 + }, + { + "epoch": 0.03, + "learning_rate": 1.893536121673004e-05, + "loss": 0.5337, + "step": 249 + }, + { + "epoch": 0.03, + "learning_rate": 1.9011406844106467e-05, + "loss": 0.5099, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.908745247148289e-05, + "loss": 0.5258, + "step": 251 + }, + { + "epoch": 0.03, + "learning_rate": 1.9163498098859318e-05, + "loss": 0.5225, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 1.9239543726235744e-05, + "loss": 0.5185, + "step": 253 + }, + { + "epoch": 0.03, + "learning_rate": 1.931558935361217e-05, + "loss": 0.527, + "step": 254 + }, + { + "epoch": 0.03, + "learning_rate": 1.9391634980988594e-05, + "loss": 0.5155, + "step": 255 + }, + { + "epoch": 0.03, + 
"learning_rate": 1.946768060836502e-05, + "loss": 0.5138, + "step": 256 + }, + { + "epoch": 0.03, + "learning_rate": 1.9543726235741448e-05, + "loss": 0.5309, + "step": 257 + }, + { + "epoch": 0.03, + "learning_rate": 1.961977186311787e-05, + "loss": 0.527, + "step": 258 + }, + { + "epoch": 0.03, + "learning_rate": 1.9695817490494298e-05, + "loss": 0.5223, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 1.9771863117870725e-05, + "loss": 0.5452, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 1.984790874524715e-05, + "loss": 0.527, + "step": 261 + }, + { + "epoch": 0.03, + "learning_rate": 1.9923954372623575e-05, + "loss": 0.535, + "step": 262 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.5283, + "step": 263 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999999314888373e-05, + "loss": 0.5108, + "step": 264 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999997259553572e-05, + "loss": 0.5212, + "step": 265 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999993833995886e-05, + "loss": 0.5039, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999989038215787e-05, + "loss": 0.5497, + "step": 267 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999982872213925e-05, + "loss": 0.5265, + "step": 268 + }, + { + "epoch": 0.03, + "learning_rate": 1.999997533599115e-05, + "loss": 0.5201, + "step": 269 + }, + { + "epoch": 0.03, + "learning_rate": 1.999996642954849e-05, + "loss": 0.5279, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 1.999995615288717e-05, + "loss": 0.5313, + "step": 271 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999944506008594e-05, + "loss": 0.5108, + "step": 272 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999931488914366e-05, + "loss": 0.5274, + "step": 273 + }, + { + "epoch": 0.03, + "learning_rate": 1.999991710160626e-05, + "loss": 0.5071, + "step": 274 + }, + { + "epoch": 0.03, + "learning_rate": 1.999990134408625e-05, + "loss": 0.5226, + "step": 275 + }, + { + "epoch": 0.03, + 
"learning_rate": 1.99998842163565e-05, + "loss": 0.5201, + "step": 276 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999865718419352e-05, + "loss": 0.5188, + "step": 277 + }, + { + "epoch": 0.03, + "learning_rate": 1.999984585027734e-05, + "loss": 0.5578, + "step": 278 + }, + { + "epoch": 0.03, + "learning_rate": 1.999982461193319e-05, + "loss": 0.5245, + "step": 279 + }, + { + "epoch": 0.03, + "learning_rate": 1.999980200338981e-05, + "loss": 0.5092, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999778024650296e-05, + "loss": 0.5174, + "step": 281 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999752675717938e-05, + "loss": 0.524, + "step": 282 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999725956596204e-05, + "loss": 0.5193, + "step": 283 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999697867288764e-05, + "loss": 0.5026, + "step": 284 + }, + { + "epoch": 0.03, + "learning_rate": 1.999966840779946e-05, + "loss": 0.527, + "step": 285 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999637578132328e-05, + "loss": 0.5295, + "step": 286 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999605378291593e-05, + "loss": 0.5041, + "step": 287 + }, + { + "epoch": 0.03, + "learning_rate": 1.999957180828167e-05, + "loss": 0.5232, + "step": 288 + }, + { + "epoch": 0.03, + "learning_rate": 1.999953686810716e-05, + "loss": 0.5181, + "step": 289 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999500557772843e-05, + "loss": 0.5092, + "step": 290 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999462877283702e-05, + "loss": 0.5373, + "step": 291 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999423826644895e-05, + "loss": 0.5129, + "step": 292 + }, + { + "epoch": 0.03, + "learning_rate": 1.999938340586178e-05, + "loss": 0.5435, + "step": 293 + }, + { + "epoch": 0.03, + "learning_rate": 1.999934161493988e-05, + "loss": 0.5212, + "step": 294 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999298453884944e-05, + "loss": 0.5404, + "step": 295 + }, + { + "epoch": 
0.03, + "learning_rate": 1.9999253922702868e-05, + "loss": 0.5253, + "step": 296 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999208021399757e-05, + "loss": 0.5073, + "step": 297 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999160749981908e-05, + "loss": 0.507, + "step": 298 + }, + { + "epoch": 0.03, + "learning_rate": 1.999911210845579e-05, + "loss": 0.5219, + "step": 299 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999062096828072e-05, + "loss": 0.5168, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999010715105608e-05, + "loss": 0.5167, + "step": 301 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998957963295434e-05, + "loss": 0.5081, + "step": 302 + }, + { + "epoch": 0.03, + "learning_rate": 1.999890384140478e-05, + "loss": 0.5134, + "step": 303 + }, + { + "epoch": 0.03, + "learning_rate": 1.999884834944106e-05, + "loss": 0.5205, + "step": 304 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998791487411887e-05, + "loss": 0.5281, + "step": 305 + }, + { + "epoch": 0.03, + "learning_rate": 1.9998733255325043e-05, + "loss": 0.5184, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 1.999867365318851e-05, + "loss": 0.509, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998612681010452e-05, + "loss": 0.5131, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 1.999855033879923e-05, + "loss": 0.5204, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998486626563376e-05, + "loss": 0.5085, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 1.999842154431163e-05, + "loss": 0.5362, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998355092052906e-05, + "loss": 0.5136, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998287269796313e-05, + "loss": 0.5057, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998218077551135e-05, + "loss": 0.5129, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998147515326862e-05, + "loss": 0.5301, + "step": 315 + }, + 
{ + "epoch": 0.04, + "learning_rate": 1.9998075583133157e-05, + "loss": 0.4895, + "step": 316 + }, + { + "epoch": 0.04, + "learning_rate": 1.999800228097988e-05, + "loss": 0.5259, + "step": 317 + }, + { + "epoch": 0.04, + "learning_rate": 1.999792760887707e-05, + "loss": 0.5194, + "step": 318 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997851566834966e-05, + "loss": 0.5316, + "step": 319 + }, + { + "epoch": 0.04, + "learning_rate": 1.999777415486398e-05, + "loss": 0.517, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997695372974725e-05, + "loss": 0.5055, + "step": 321 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997615221177996e-05, + "loss": 0.543, + "step": 322 + }, + { + "epoch": 0.04, + "learning_rate": 1.999753369948477e-05, + "loss": 0.5334, + "step": 323 + }, + { + "epoch": 0.04, + "learning_rate": 1.999745080790622e-05, + "loss": 0.5019, + "step": 324 + }, + { + "epoch": 0.04, + "learning_rate": 1.999736654645371e-05, + "loss": 0.5284, + "step": 325 + }, + { + "epoch": 0.04, + "learning_rate": 1.999728091513877e-05, + "loss": 0.5158, + "step": 326 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997193913973154e-05, + "loss": 0.5483, + "step": 327 + }, + { + "epoch": 0.04, + "learning_rate": 1.999710554296877e-05, + "loss": 0.5111, + "step": 328 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997015802137727e-05, + "loss": 0.5069, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996924691492325e-05, + "loss": 0.4987, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996832211045048e-05, + "loss": 0.542, + "step": 331 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996738360808566e-05, + "loss": 0.5133, + "step": 332 + }, + { + "epoch": 0.04, + "learning_rate": 1.999664314079574e-05, + "loss": 0.5189, + "step": 333 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996546551019618e-05, + "loss": 0.4968, + "step": 334 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996448591493433e-05, + "loss": 0.5309, + "step": 
335 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996349262230607e-05, + "loss": 0.5144, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996248563244755e-05, + "loss": 0.5217, + "step": 337 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996146494549672e-05, + "loss": 0.4983, + "step": 338 + }, + { + "epoch": 0.04, + "learning_rate": 1.999604305615934e-05, + "loss": 0.4968, + "step": 339 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995938248087937e-05, + "loss": 0.5185, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995832070349827e-05, + "loss": 0.4945, + "step": 341 + }, + { + "epoch": 0.04, + "learning_rate": 1.999572452295955e-05, + "loss": 0.5324, + "step": 342 + }, + { + "epoch": 0.04, + "learning_rate": 1.999561560593185e-05, + "loss": 0.5058, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995505319281645e-05, + "loss": 0.5145, + "step": 344 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995393663024054e-05, + "loss": 0.5248, + "step": 345 + }, + { + "epoch": 0.04, + "learning_rate": 1.999528063717437e-05, + "loss": 0.515, + "step": 346 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995166241748084e-05, + "loss": 0.5016, + "step": 347 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995050476760864e-05, + "loss": 0.5052, + "step": 348 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994933342228583e-05, + "loss": 0.5221, + "step": 349 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994814838167286e-05, + "loss": 0.5006, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994694964593206e-05, + "loss": 0.513, + "step": 351 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994573721522776e-05, + "loss": 0.5107, + "step": 352 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994451108972604e-05, + "loss": 0.5247, + "step": 353 + }, + { + "epoch": 0.04, + "learning_rate": 1.999432712695949e-05, + "loss": 0.5054, + "step": 354 + }, + { + "epoch": 0.04, + "learning_rate": 1.999420177550043e-05, + "loss": 
0.5129, + "step": 355 + }, + { + "epoch": 0.04, + "learning_rate": 1.999407505461259e-05, + "loss": 0.4971, + "step": 356 + }, + { + "epoch": 0.04, + "learning_rate": 1.999394696431334e-05, + "loss": 0.4876, + "step": 357 + }, + { + "epoch": 0.04, + "learning_rate": 1.999381750462023e-05, + "loss": 0.528, + "step": 358 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993686675550998e-05, + "loss": 0.5143, + "step": 359 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993554477123568e-05, + "loss": 0.5101, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993420909356058e-05, + "loss": 0.5107, + "step": 361 + }, + { + "epoch": 0.04, + "learning_rate": 1.999328597226677e-05, + "loss": 0.5321, + "step": 362 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993149665874193e-05, + "loss": 0.5402, + "step": 363 + }, + { + "epoch": 0.04, + "learning_rate": 1.9993011990197e-05, + "loss": 0.5014, + "step": 364 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992872945254064e-05, + "loss": 0.4866, + "step": 365 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992732531064427e-05, + "loss": 0.5061, + "step": 366 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992590747647334e-05, + "loss": 0.5053, + "step": 367 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992447595022214e-05, + "loss": 0.5216, + "step": 368 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992303073208678e-05, + "loss": 0.5027, + "step": 369 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992157182226535e-05, + "loss": 0.4829, + "step": 370 + }, + { + "epoch": 0.04, + "learning_rate": 1.9992009922095766e-05, + "loss": 0.5256, + "step": 371 + }, + { + "epoch": 0.04, + "learning_rate": 1.999186129283656e-05, + "loss": 0.5018, + "step": 372 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991711294469273e-05, + "loss": 0.509, + "step": 373 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991559927014465e-05, + "loss": 0.5046, + "step": 374 + }, + { + "epoch": 0.04, + "learning_rate": 1.999140719049287e-05, 
+ "loss": 0.5319, + "step": 375 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991253084925425e-05, + "loss": 0.52, + "step": 376 + }, + { + "epoch": 0.04, + "learning_rate": 1.999109761033324e-05, + "loss": 0.5033, + "step": 377 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990940766737617e-05, + "loss": 0.4969, + "step": 378 + }, + { + "epoch": 0.04, + "learning_rate": 1.999078255416005e-05, + "loss": 0.5246, + "step": 379 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990622972622216e-05, + "loss": 0.4919, + "step": 380 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990462022145985e-05, + "loss": 0.5271, + "step": 381 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990299702753405e-05, + "loss": 0.5046, + "step": 382 + }, + { + "epoch": 0.04, + "learning_rate": 1.9990136014466722e-05, + "loss": 0.5027, + "step": 383 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989970957308364e-05, + "loss": 0.5148, + "step": 384 + }, + { + "epoch": 0.04, + "learning_rate": 1.998980453130095e-05, + "loss": 0.529, + "step": 385 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989636736467278e-05, + "loss": 0.5077, + "step": 386 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989467572830342e-05, + "loss": 0.5123, + "step": 387 + }, + { + "epoch": 0.04, + "learning_rate": 1.9989297040413325e-05, + "loss": 0.5002, + "step": 388 + }, + { + "epoch": 0.04, + "learning_rate": 1.998912513923959e-05, + "loss": 0.5295, + "step": 389 + }, + { + "epoch": 0.04, + "learning_rate": 1.998895186933269e-05, + "loss": 0.4975, + "step": 390 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988777230716367e-05, + "loss": 0.5053, + "step": 391 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988601223414555e-05, + "loss": 0.5067, + "step": 392 + }, + { + "epoch": 0.04, + "learning_rate": 1.998842384745137e-05, + "loss": 0.5156, + "step": 393 + }, + { + "epoch": 0.05, + "learning_rate": 1.998824510285111e-05, + "loss": 0.5218, + "step": 394 + }, + { + "epoch": 0.05, + "learning_rate": 
1.998806498963828e-05, + "loss": 0.5057, + "step": 395 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987883507837545e-05, + "loss": 0.5167, + "step": 396 + }, + { + "epoch": 0.05, + "learning_rate": 1.998770065747378e-05, + "loss": 0.5119, + "step": 397 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987516438572035e-05, + "loss": 0.5095, + "step": 398 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987330851157557e-05, + "loss": 0.526, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 1.9987143895255774e-05, + "loss": 0.5022, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986955570892302e-05, + "loss": 0.5204, + "step": 401 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986765878092945e-05, + "loss": 0.5211, + "step": 402 + }, + { + "epoch": 0.05, + "learning_rate": 1.99865748168837e-05, + "loss": 0.5107, + "step": 403 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986382387290738e-05, + "loss": 0.5242, + "step": 404 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986188589340435e-05, + "loss": 0.5001, + "step": 405 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985993423059342e-05, + "loss": 0.4907, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 1.99857968884742e-05, + "loss": 0.5123, + "step": 407 + }, + { + "epoch": 0.05, + "learning_rate": 1.998559898561194e-05, + "loss": 0.508, + "step": 408 + }, + { + "epoch": 0.05, + "learning_rate": 1.9985399714499678e-05, + "loss": 0.4923, + "step": 409 + }, + { + "epoch": 0.05, + "learning_rate": 1.998519907516472e-05, + "loss": 0.5473, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 1.998499706763456e-05, + "loss": 0.5052, + "step": 411 + }, + { + "epoch": 0.05, + "learning_rate": 1.998479369193687e-05, + "loss": 0.5304, + "step": 412 + }, + { + "epoch": 0.05, + "learning_rate": 1.9984588948099528e-05, + "loss": 0.4969, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 1.998438283615058e-05, + "loss": 0.5129, + "step": 414 + }, + { + "epoch": 0.05, + 
"learning_rate": 1.9984175356118268e-05, + "loss": 0.4953, + "step": 415 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983966508031026e-05, + "loss": 0.5145, + "step": 416 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983756291917467e-05, + "loss": 0.5019, + "step": 417 + }, + { + "epoch": 0.05, + "learning_rate": 1.99835447078064e-05, + "loss": 0.5282, + "step": 418 + }, + { + "epoch": 0.05, + "learning_rate": 1.998333175572681e-05, + "loss": 0.4842, + "step": 419 + }, + { + "epoch": 0.05, + "learning_rate": 1.998311743570788e-05, + "loss": 0.5177, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 1.998290174777898e-05, + "loss": 0.5039, + "step": 421 + }, + { + "epoch": 0.05, + "learning_rate": 1.998268469196966e-05, + "loss": 0.5227, + "step": 422 + }, + { + "epoch": 0.05, + "learning_rate": 1.998246626830966e-05, + "loss": 0.4926, + "step": 423 + }, + { + "epoch": 0.05, + "learning_rate": 1.998224647682891e-05, + "loss": 0.5044, + "step": 424 + }, + { + "epoch": 0.05, + "learning_rate": 1.998202531755753e-05, + "loss": 0.5244, + "step": 425 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981802790525822e-05, + "loss": 0.4932, + "step": 426 + }, + { + "epoch": 0.05, + "learning_rate": 1.9981578895764272e-05, + "loss": 0.5055, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 1.998135363330357e-05, + "loss": 0.537, + "step": 428 + }, + { + "epoch": 0.05, + "learning_rate": 1.998112700317457e-05, + "loss": 0.4919, + "step": 429 + }, + { + "epoch": 0.05, + "learning_rate": 1.998089900540833e-05, + "loss": 0.5127, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 1.998066964003609e-05, + "loss": 0.5092, + "step": 431 + }, + { + "epoch": 0.05, + "learning_rate": 1.998043890708928e-05, + "loss": 0.5153, + "step": 432 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980206806599516e-05, + "loss": 0.508, + "step": 433 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979973338598603e-05, + "loss": 0.5059, + "step": 434 + }, + { + "epoch": 
0.05, + "learning_rate": 1.997973850311852e-05, + "loss": 0.4842, + "step": 435 + }, + { + "epoch": 0.05, + "learning_rate": 1.997950230019146e-05, + "loss": 0.5241, + "step": 436 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979264729849776e-05, + "loss": 0.5035, + "step": 437 + }, + { + "epoch": 0.05, + "learning_rate": 1.9979025792126027e-05, + "loss": 0.4893, + "step": 438 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978785487052952e-05, + "loss": 0.5123, + "step": 439 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978543814663478e-05, + "loss": 0.5177, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978300774990716e-05, + "loss": 0.4992, + "step": 441 + }, + { + "epoch": 0.05, + "learning_rate": 1.9978056368067973e-05, + "loss": 0.5116, + "step": 442 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977810593928736e-05, + "loss": 0.5017, + "step": 443 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977563452606677e-05, + "loss": 0.5114, + "step": 444 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977314944135667e-05, + "loss": 0.4902, + "step": 445 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977065068549756e-05, + "loss": 0.5134, + "step": 446 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976813825883182e-05, + "loss": 0.4954, + "step": 447 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976561216170368e-05, + "loss": 0.5045, + "step": 448 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976307239445924e-05, + "loss": 0.4949, + "step": 449 + }, + { + "epoch": 0.05, + "learning_rate": 1.9976051895744663e-05, + "loss": 0.5228, + "step": 450 + }, + { + "epoch": 0.05, + "learning_rate": 1.997579518510156e-05, + "loss": 0.5035, + "step": 451 + }, + { + "epoch": 0.05, + "learning_rate": 1.99755371075518e-05, + "loss": 0.5046, + "step": 452 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975277663130736e-05, + "loss": 0.5041, + "step": 453 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975016851873925e-05, + "loss": 0.5142, + "step": 454 + }, 
+ { + "epoch": 0.05, + "learning_rate": 1.99747546738171e-05, + "loss": 0.4906, + "step": 455 + }, + { + "epoch": 0.05, + "learning_rate": 1.997449112899619e-05, + "loss": 0.5018, + "step": 456 + }, + { + "epoch": 0.05, + "learning_rate": 1.99742262174473e-05, + "loss": 0.5064, + "step": 457 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973959939206734e-05, + "loss": 0.4996, + "step": 458 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973692294310972e-05, + "loss": 0.5149, + "step": 459 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973423282796695e-05, + "loss": 0.4956, + "step": 460 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973152904700762e-05, + "loss": 0.5125, + "step": 461 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972881160060216e-05, + "loss": 0.5385, + "step": 462 + }, + { + "epoch": 0.05, + "learning_rate": 1.997260804891229e-05, + "loss": 0.485, + "step": 463 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972333571294418e-05, + "loss": 0.5028, + "step": 464 + }, + { + "epoch": 0.05, + "learning_rate": 1.9972057727244203e-05, + "loss": 0.4887, + "step": 465 + }, + { + "epoch": 0.05, + "learning_rate": 1.997178051679944e-05, + "loss": 0.5229, + "step": 466 + }, + { + "epoch": 0.05, + "learning_rate": 1.997150193999811e-05, + "loss": 0.4953, + "step": 467 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971221996878395e-05, + "loss": 0.5123, + "step": 468 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970940687478643e-05, + "loss": 0.5256, + "step": 469 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970658011837404e-05, + "loss": 0.501, + "step": 470 + }, + { + "epoch": 0.05, + "learning_rate": 1.9970373969993414e-05, + "loss": 0.5173, + "step": 471 + }, + { + "epoch": 0.05, + "learning_rate": 1.997008856198559e-05, + "loss": 0.5047, + "step": 472 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969801787853035e-05, + "loss": 0.5107, + "step": 473 + }, + { + "epoch": 0.05, + "learning_rate": 1.996951364763505e-05, + "loss": 0.5104, + "step": 
474 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969224141371114e-05, + "loss": 0.5097, + "step": 475 + }, + { + "epoch": 0.05, + "learning_rate": 1.99689332691009e-05, + "loss": 0.5093, + "step": 476 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968641030864254e-05, + "loss": 0.4957, + "step": 477 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968347426701228e-05, + "loss": 0.5105, + "step": 478 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968052456652048e-05, + "loss": 0.504, + "step": 479 + }, + { + "epoch": 0.05, + "learning_rate": 1.9967756120757132e-05, + "loss": 0.5008, + "step": 480 + }, + { + "epoch": 0.05, + "learning_rate": 1.9967458419057092e-05, + "loss": 0.5101, + "step": 481 + }, + { + "epoch": 0.06, + "learning_rate": 1.9967159351592706e-05, + "loss": 0.5092, + "step": 482 + }, + { + "epoch": 0.06, + "learning_rate": 1.9966858918404965e-05, + "loss": 0.5023, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 1.996655711953503e-05, + "loss": 0.5103, + "step": 484 + }, + { + "epoch": 0.06, + "learning_rate": 1.996625395502425e-05, + "loss": 0.5325, + "step": 485 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965949424914175e-05, + "loss": 0.5126, + "step": 486 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965643529246526e-05, + "loss": 0.5017, + "step": 487 + }, + { + "epoch": 0.06, + "learning_rate": 1.996533626806322e-05, + "loss": 0.5367, + "step": 488 + }, + { + "epoch": 0.06, + "learning_rate": 1.9965027641406355e-05, + "loss": 0.4925, + "step": 489 + }, + { + "epoch": 0.06, + "learning_rate": 1.996471764931822e-05, + "loss": 0.5153, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 1.99644062918413e-05, + "loss": 0.5196, + "step": 491 + }, + { + "epoch": 0.06, + "learning_rate": 1.9964093569018247e-05, + "loss": 0.4936, + "step": 492 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963779480891917e-05, + "loss": 0.5085, + "step": 493 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963464027505343e-05, + "loss": 
0.5153, + "step": 494 + }, + { + "epoch": 0.06, + "learning_rate": 1.996314720890175e-05, + "loss": 0.5228, + "step": 495 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962829025124553e-05, + "loss": 0.499, + "step": 496 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962509476217348e-05, + "loss": 0.512, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 1.9962188562223916e-05, + "loss": 0.4839, + "step": 498 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961866283188237e-05, + "loss": 0.4876, + "step": 499 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961542639154467e-05, + "loss": 0.5057, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961217630166954e-05, + "loss": 0.5144, + "step": 501 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960891256270224e-05, + "loss": 0.5042, + "step": 502 + }, + { + "epoch": 0.06, + "learning_rate": 1.9960563517509008e-05, + "loss": 0.5188, + "step": 503 + }, + { + "epoch": 0.06, + "learning_rate": 1.996023441392821e-05, + "loss": 0.4886, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959903945572918e-05, + "loss": 0.4883, + "step": 505 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959572112488423e-05, + "loss": 0.5606, + "step": 506 + }, + { + "epoch": 0.06, + "learning_rate": 1.9959238914720188e-05, + "loss": 0.492, + "step": 507 + }, + { + "epoch": 0.06, + "learning_rate": 1.995890435231387e-05, + "loss": 0.4934, + "step": 508 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958568425315316e-05, + "loss": 0.5088, + "step": 509 + }, + { + "epoch": 0.06, + "learning_rate": 1.9958231133770548e-05, + "loss": 0.5085, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 1.995789247772578e-05, + "loss": 0.5067, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957552457227428e-05, + "loss": 0.5029, + "step": 512 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957211072322075e-05, + "loss": 0.4727, + "step": 513 + }, + { + "epoch": 0.06, + "learning_rate": 
1.9956868323056494e-05, + "loss": 0.5176, + "step": 514 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956524209477658e-05, + "loss": 0.5033, + "step": 515 + }, + { + "epoch": 0.06, + "learning_rate": 1.9956178731632715e-05, + "loss": 0.513, + "step": 516 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955831889568998e-05, + "loss": 0.5041, + "step": 517 + }, + { + "epoch": 0.06, + "learning_rate": 1.995548368333404e-05, + "loss": 0.4975, + "step": 518 + }, + { + "epoch": 0.06, + "learning_rate": 1.9955134112975548e-05, + "loss": 0.5129, + "step": 519 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954783178541424e-05, + "loss": 0.505, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 1.995443088007975e-05, + "loss": 0.5008, + "step": 521 + }, + { + "epoch": 0.06, + "learning_rate": 1.9954077217638807e-05, + "loss": 0.5171, + "step": 522 + }, + { + "epoch": 0.06, + "learning_rate": 1.995372219126704e-05, + "loss": 0.4896, + "step": 523 + }, + { + "epoch": 0.06, + "learning_rate": 1.995336580101311e-05, + "loss": 0.521, + "step": 524 + }, + { + "epoch": 0.06, + "learning_rate": 1.9953008046925844e-05, + "loss": 0.5038, + "step": 525 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952648929054262e-05, + "loss": 0.5123, + "step": 526 + }, + { + "epoch": 0.06, + "learning_rate": 1.9952288447447573e-05, + "loss": 0.5118, + "step": 527 + }, + { + "epoch": 0.06, + "learning_rate": 1.995192660215517e-05, + "loss": 0.5144, + "step": 528 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951563393226632e-05, + "loss": 0.5075, + "step": 529 + }, + { + "epoch": 0.06, + "learning_rate": 1.9951198820711735e-05, + "loss": 0.497, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 1.995083288466042e-05, + "loss": 0.4836, + "step": 531 + }, + { + "epoch": 0.06, + "learning_rate": 1.995046558512284e-05, + "loss": 0.5177, + "step": 532 + }, + { + "epoch": 0.06, + "learning_rate": 1.995009692214932e-05, + "loss": 0.4862, + "step": 533 + }, + { + "epoch": 0.06, + 
"learning_rate": 1.994972689579037e-05, + "loss": 0.5107, + "step": 534 + }, + { + "epoch": 0.06, + "learning_rate": 1.99493555060967e-05, + "loss": 0.4924, + "step": 535 + }, + { + "epoch": 0.06, + "learning_rate": 1.994898275311919e-05, + "loss": 0.4966, + "step": 536 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948608636908928e-05, + "loss": 0.5062, + "step": 537 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948233157517164e-05, + "loss": 0.5026, + "step": 538 + }, + { + "epoch": 0.06, + "learning_rate": 1.994785631499535e-05, + "loss": 0.5129, + "step": 539 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947478109395123e-05, + "loss": 0.5277, + "step": 540 + }, + { + "epoch": 0.06, + "learning_rate": 1.9947098540768306e-05, + "loss": 0.5, + "step": 541 + }, + { + "epoch": 0.06, + "learning_rate": 1.994671760916691e-05, + "loss": 0.5134, + "step": 542 + }, + { + "epoch": 0.06, + "learning_rate": 1.994633531464313e-05, + "loss": 0.5056, + "step": 543 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945951657249348e-05, + "loss": 0.5002, + "step": 544 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945566637038133e-05, + "loss": 0.509, + "step": 545 + }, + { + "epoch": 0.06, + "learning_rate": 1.9945180254062242e-05, + "loss": 0.5243, + "step": 546 + }, + { + "epoch": 0.06, + "learning_rate": 1.994479250837462e-05, + "loss": 0.5495, + "step": 547 + }, + { + "epoch": 0.06, + "learning_rate": 1.9944403400028392e-05, + "loss": 0.4995, + "step": 548 + }, + { + "epoch": 0.06, + "learning_rate": 1.9944012929076884e-05, + "loss": 0.4847, + "step": 549 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943621095573588e-05, + "loss": 0.5106, + "step": 550 + }, + { + "epoch": 0.06, + "learning_rate": 1.9943227899572198e-05, + "loss": 0.5006, + "step": 551 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942833341126597e-05, + "loss": 0.5108, + "step": 552 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942437420290835e-05, + "loss": 0.4873, + "step": 553 + }, + { + "epoch": 
0.06, + "learning_rate": 1.994204013711918e-05, + "loss": 0.5005, + "step": 554 + }, + { + "epoch": 0.06, + "learning_rate": 1.9941641491666052e-05, + "loss": 0.5119, + "step": 555 + }, + { + "epoch": 0.06, + "learning_rate": 1.994124148398608e-05, + "loss": 0.5057, + "step": 556 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940840114134078e-05, + "loss": 0.4932, + "step": 557 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940437382165038e-05, + "loss": 0.505, + "step": 558 + }, + { + "epoch": 0.06, + "learning_rate": 1.9940033288134143e-05, + "loss": 0.487, + "step": 559 + }, + { + "epoch": 0.06, + "learning_rate": 1.993962783209677e-05, + "loss": 0.5201, + "step": 560 + }, + { + "epoch": 0.06, + "learning_rate": 1.9939221014108467e-05, + "loss": 0.5063, + "step": 561 + }, + { + "epoch": 0.06, + "learning_rate": 1.9938812834224978e-05, + "loss": 0.5005, + "step": 562 + }, + { + "epoch": 0.06, + "learning_rate": 1.993840329250224e-05, + "loss": 0.5017, + "step": 563 + }, + { + "epoch": 0.06, + "learning_rate": 1.993799238899636e-05, + "loss": 0.519, + "step": 564 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937580123763645e-05, + "loss": 0.4934, + "step": 565 + }, + { + "epoch": 0.06, + "learning_rate": 1.993716649686059e-05, + "loss": 0.5086, + "step": 566 + }, + { + "epoch": 0.06, + "learning_rate": 1.993675150834386e-05, + "loss": 0.4863, + "step": 567 + }, + { + "epoch": 0.06, + "learning_rate": 1.993633515827033e-05, + "loss": 0.5089, + "step": 568 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935917446697038e-05, + "loss": 0.5077, + "step": 569 + }, + { + "epoch": 0.07, + "learning_rate": 1.993549837368123e-05, + "loss": 0.4964, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 1.9935077939280316e-05, + "loss": 0.5055, + "step": 571 + }, + { + "epoch": 0.07, + "learning_rate": 1.993465614355192e-05, + "loss": 0.503, + "step": 572 + }, + { + "epoch": 0.07, + "learning_rate": 1.9934232986553823e-05, + "loss": 0.5179, + "step": 573 + }, + { + 
"epoch": 0.07, + "learning_rate": 1.9933808468344016e-05, + "loss": 0.4953, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 1.9933382588980665e-05, + "loss": 0.4912, + "step": 575 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932955348522125e-05, + "loss": 0.4973, + "step": 576 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932526747026936e-05, + "loss": 0.5004, + "step": 577 + }, + { + "epoch": 0.07, + "learning_rate": 1.993209678455383e-05, + "loss": 0.5175, + "step": 578 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931665461161716e-05, + "loss": 0.5021, + "step": 579 + }, + { + "epoch": 0.07, + "learning_rate": 1.9931232776909703e-05, + "loss": 0.5096, + "step": 580 + }, + { + "epoch": 0.07, + "learning_rate": 1.993079873185707e-05, + "loss": 0.4768, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 1.993036332606329e-05, + "loss": 0.5109, + "step": 582 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929926559588032e-05, + "loss": 0.4972, + "step": 583 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929488432491137e-05, + "loss": 0.4919, + "step": 584 + }, + { + "epoch": 0.07, + "learning_rate": 1.9929048944832638e-05, + "loss": 0.4959, + "step": 585 + }, + { + "epoch": 0.07, + "learning_rate": 1.9928608096672757e-05, + "loss": 0.5132, + "step": 586 + }, + { + "epoch": 0.07, + "learning_rate": 1.99281658880719e-05, + "loss": 0.512, + "step": 587 + }, + { + "epoch": 0.07, + "learning_rate": 1.992772231909066e-05, + "loss": 0.4972, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 1.9927277389789812e-05, + "loss": 0.5027, + "step": 589 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926831100230322e-05, + "loss": 0.4921, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926383450473344e-05, + "loss": 0.5223, + "step": 591 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925934440580218e-05, + "loss": 0.496, + "step": 592 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925484070612465e-05, + "loss": 0.5087, + "step": 
593 + }, + { + "epoch": 0.07, + "learning_rate": 1.9925032340631793e-05, + "loss": 0.5022, + "step": 594 + }, + { + "epoch": 0.07, + "learning_rate": 1.9924579250700104e-05, + "loss": 0.511, + "step": 595 + }, + { + "epoch": 0.07, + "learning_rate": 1.992412480087948e-05, + "loss": 0.509, + "step": 596 + }, + { + "epoch": 0.07, + "learning_rate": 1.992366899123219e-05, + "loss": 0.513, + "step": 597 + }, + { + "epoch": 0.07, + "learning_rate": 1.9923211821820692e-05, + "loss": 0.5128, + "step": 598 + }, + { + "epoch": 0.07, + "learning_rate": 1.9922753292707627e-05, + "loss": 0.5197, + "step": 599 + }, + { + "epoch": 0.07, + "learning_rate": 1.992229340395582e-05, + "loss": 0.4962, + "step": 600 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921832155628295e-05, + "loss": 0.4956, + "step": 601 + }, + { + "epoch": 0.07, + "learning_rate": 1.9921369547788246e-05, + "loss": 0.4905, + "step": 602 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920905580499063e-05, + "loss": 0.5012, + "step": 603 + }, + { + "epoch": 0.07, + "learning_rate": 1.9920440253824318e-05, + "loss": 0.4991, + "step": 604 + }, + { + "epoch": 0.07, + "learning_rate": 1.9919973567827776e-05, + "loss": 0.5092, + "step": 605 + }, + { + "epoch": 0.07, + "learning_rate": 1.991950552257338e-05, + "loss": 0.4813, + "step": 606 + }, + { + "epoch": 0.07, + "learning_rate": 1.991903611812526e-05, + "loss": 0.4992, + "step": 607 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918565354547738e-05, + "loss": 0.4908, + "step": 608 + }, + { + "epoch": 0.07, + "learning_rate": 1.991809323190532e-05, + "loss": 0.505, + "step": 609 + }, + { + "epoch": 0.07, + "learning_rate": 1.99176197502627e-05, + "loss": 0.4865, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 1.9917144909684745e-05, + "loss": 0.4982, + "step": 611 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916668710236528e-05, + "loss": 0.5175, + "step": 612 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916191151983297e-05, + "loss": 0.529, + 
"step": 613 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915712234990486e-05, + "loss": 0.4916, + "step": 614 + }, + { + "epoch": 0.07, + "learning_rate": 1.9915231959323722e-05, + "loss": 0.5154, + "step": 615 + }, + { + "epoch": 0.07, + "learning_rate": 1.991475032504881e-05, + "loss": 0.4903, + "step": 616 + }, + { + "epoch": 0.07, + "learning_rate": 1.9914267332231746e-05, + "loss": 0.4984, + "step": 617 + }, + { + "epoch": 0.07, + "learning_rate": 1.991378298093871e-05, + "loss": 0.5179, + "step": 618 + }, + { + "epoch": 0.07, + "learning_rate": 1.9913297271236063e-05, + "loss": 0.4865, + "step": 619 + }, + { + "epoch": 0.07, + "learning_rate": 1.9912810203190367e-05, + "loss": 0.5074, + "step": 620 + }, + { + "epoch": 0.07, + "learning_rate": 1.991232177686836e-05, + "loss": 0.5076, + "step": 621 + }, + { + "epoch": 0.07, + "learning_rate": 1.9911831992336963e-05, + "loss": 0.5042, + "step": 622 + }, + { + "epoch": 0.07, + "learning_rate": 1.9911340849663293e-05, + "loss": 0.5021, + "step": 623 + }, + { + "epoch": 0.07, + "learning_rate": 1.991084834891464e-05, + "loss": 0.5062, + "step": 624 + }, + { + "epoch": 0.07, + "learning_rate": 1.9910354490158498e-05, + "loss": 0.4975, + "step": 625 + }, + { + "epoch": 0.07, + "learning_rate": 1.9909859273462525e-05, + "loss": 0.4892, + "step": 626 + }, + { + "epoch": 0.07, + "learning_rate": 1.9909362698894585e-05, + "loss": 0.4962, + "step": 627 + }, + { + "epoch": 0.07, + "learning_rate": 1.9908864766522716e-05, + "loss": 0.5167, + "step": 628 + }, + { + "epoch": 0.07, + "learning_rate": 1.9908365476415146e-05, + "loss": 0.5168, + "step": 629 + }, + { + "epoch": 0.07, + "learning_rate": 1.9907864828640292e-05, + "loss": 0.502, + "step": 630 + }, + { + "epoch": 0.07, + "learning_rate": 1.9907362823266752e-05, + "loss": 0.5143, + "step": 631 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906859460363307e-05, + "loss": 0.5045, + "step": 632 + }, + { + "epoch": 0.07, + "learning_rate": 1.9906354739998937e-05, + 
"loss": 0.5051, + "step": 633 + }, + { + "epoch": 0.07, + "learning_rate": 1.99058486622428e-05, + "loss": 0.4971, + "step": 634 + }, + { + "epoch": 0.07, + "learning_rate": 1.990534122716423e-05, + "loss": 0.4839, + "step": 635 + }, + { + "epoch": 0.07, + "learning_rate": 1.990483243483277e-05, + "loss": 0.5059, + "step": 636 + }, + { + "epoch": 0.07, + "learning_rate": 1.990432228531813e-05, + "loss": 0.504, + "step": 637 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903810778690204e-05, + "loss": 0.5081, + "step": 638 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903297915019093e-05, + "loss": 0.5, + "step": 639 + }, + { + "epoch": 0.07, + "learning_rate": 1.9902783694375064e-05, + "loss": 0.5103, + "step": 640 + }, + { + "epoch": 0.07, + "learning_rate": 1.9902268116828578e-05, + "loss": 0.5111, + "step": 641 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901751182450276e-05, + "loss": 0.4893, + "step": 642 + }, + { + "epoch": 0.07, + "learning_rate": 1.9901232891310998e-05, + "loss": 0.4861, + "step": 643 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900713243481758e-05, + "loss": 0.5005, + "step": 644 + }, + { + "epoch": 0.07, + "learning_rate": 1.990019223903376e-05, + "loss": 0.4887, + "step": 645 + }, + { + "epoch": 0.07, + "learning_rate": 1.9899669878038382e-05, + "loss": 0.5158, + "step": 646 + }, + { + "epoch": 0.07, + "learning_rate": 1.989914616056722e-05, + "loss": 0.4871, + "step": 647 + }, + { + "epoch": 0.07, + "learning_rate": 1.9898621086692017e-05, + "loss": 0.5133, + "step": 648 + }, + { + "epoch": 0.07, + "learning_rate": 1.989809465648473e-05, + "loss": 0.4984, + "step": 649 + }, + { + "epoch": 0.07, + "learning_rate": 1.989756687001749e-05, + "loss": 0.4881, + "step": 650 + }, + { + "epoch": 0.07, + "learning_rate": 1.9897037727362612e-05, + "loss": 0.4802, + "step": 651 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896507228592604e-05, + "loss": 0.5036, + "step": 652 + }, + { + "epoch": 0.07, + "learning_rate": 
1.989597537378015e-05, + "loss": 0.4942, + "step": 653 + }, + { + "epoch": 0.07, + "learning_rate": 1.9895442162998136e-05, + "loss": 0.5228, + "step": 654 + }, + { + "epoch": 0.07, + "learning_rate": 1.9894907596319615e-05, + "loss": 0.4976, + "step": 655 + }, + { + "epoch": 0.07, + "learning_rate": 1.989437167381784e-05, + "loss": 0.5151, + "step": 656 + }, + { + "epoch": 0.08, + "learning_rate": 1.9893834395566242e-05, + "loss": 0.5039, + "step": 657 + }, + { + "epoch": 0.08, + "learning_rate": 1.989329576163844e-05, + "loss": 0.4843, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 1.989275577210824e-05, + "loss": 0.5066, + "step": 659 + }, + { + "epoch": 0.08, + "learning_rate": 1.989221442704963e-05, + "loss": 0.497, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891671726536787e-05, + "loss": 0.4965, + "step": 661 + }, + { + "epoch": 0.08, + "learning_rate": 1.9891127670644076e-05, + "loss": 0.4853, + "step": 662 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890582259446046e-05, + "loss": 0.5132, + "step": 663 + }, + { + "epoch": 0.08, + "learning_rate": 1.9890035493017424e-05, + "loss": 0.4881, + "step": 664 + }, + { + "epoch": 0.08, + "learning_rate": 1.9889487371433134e-05, + "loss": 0.5049, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 1.988893789476828e-05, + "loss": 0.4886, + "step": 666 + }, + { + "epoch": 0.08, + "learning_rate": 1.9888387063098153e-05, + "loss": 0.5109, + "step": 667 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887834876498228e-05, + "loss": 0.4744, + "step": 668 + }, + { + "epoch": 0.08, + "learning_rate": 1.9887281335044167e-05, + "loss": 0.4952, + "step": 669 + }, + { + "epoch": 0.08, + "learning_rate": 1.988672643881182e-05, + "loss": 0.4774, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 1.9886170187877214e-05, + "loss": 0.509, + "step": 671 + }, + { + "epoch": 0.08, + "learning_rate": 1.9885612582316575e-05, + "loss": 0.4924, + "step": 672 + }, + { + "epoch": 0.08, + 
"learning_rate": 1.9885053622206305e-05, + "loss": 0.4924, + "step": 673 + }, + { + "epoch": 0.08, + "learning_rate": 1.9884493307622993e-05, + "loss": 0.5126, + "step": 674 + }, + { + "epoch": 0.08, + "learning_rate": 1.988393163864341e-05, + "loss": 0.4876, + "step": 675 + }, + { + "epoch": 0.08, + "learning_rate": 1.9883368615344526e-05, + "loss": 0.4895, + "step": 676 + }, + { + "epoch": 0.08, + "learning_rate": 1.9882804237803487e-05, + "loss": 0.5074, + "step": 677 + }, + { + "epoch": 0.08, + "learning_rate": 1.988223850609762e-05, + "loss": 0.4826, + "step": 678 + }, + { + "epoch": 0.08, + "learning_rate": 1.9881671420304444e-05, + "loss": 0.5268, + "step": 679 + }, + { + "epoch": 0.08, + "learning_rate": 1.9881102980501664e-05, + "loss": 0.4995, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 1.988053318676717e-05, + "loss": 0.5015, + "step": 681 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879962039179033e-05, + "loss": 0.5134, + "step": 682 + }, + { + "epoch": 0.08, + "learning_rate": 1.9879389537815514e-05, + "loss": 0.4966, + "step": 683 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878815682755062e-05, + "loss": 0.4792, + "step": 684 + }, + { + "epoch": 0.08, + "learning_rate": 1.9878240474076306e-05, + "loss": 0.523, + "step": 685 + }, + { + "epoch": 0.08, + "learning_rate": 1.987766391185806e-05, + "loss": 0.492, + "step": 686 + }, + { + "epoch": 0.08, + "learning_rate": 1.9877085996179327e-05, + "loss": 0.5097, + "step": 687 + }, + { + "epoch": 0.08, + "learning_rate": 1.9876506727119294e-05, + "loss": 0.4948, + "step": 688 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875926104757337e-05, + "loss": 0.5193, + "step": 689 + }, + { + "epoch": 0.08, + "learning_rate": 1.9875344129173012e-05, + "loss": 0.5, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874760800446063e-05, + "loss": 0.4983, + "step": 691 + }, + { + "epoch": 0.08, + "learning_rate": 1.9874176118656415e-05, + "loss": 0.4759, + "step": 692 + }, + { + 
"epoch": 0.08, + "learning_rate": 1.9873590083884192e-05, + "loss": 0.5069, + "step": 693 + }, + { + "epoch": 0.08, + "learning_rate": 1.9873002696209688e-05, + "loss": 0.5129, + "step": 694 + }, + { + "epoch": 0.08, + "learning_rate": 1.9872413955713382e-05, + "loss": 0.508, + "step": 695 + }, + { + "epoch": 0.08, + "learning_rate": 1.9871823862475955e-05, + "loss": 0.4963, + "step": 696 + }, + { + "epoch": 0.08, + "learning_rate": 1.987123241657826e-05, + "loss": 0.5074, + "step": 697 + }, + { + "epoch": 0.08, + "learning_rate": 1.9870639618101333e-05, + "loss": 0.5238, + "step": 698 + }, + { + "epoch": 0.08, + "learning_rate": 1.987004546712641e-05, + "loss": 0.4916, + "step": 699 + }, + { + "epoch": 0.08, + "learning_rate": 1.9869449963734894e-05, + "loss": 0.4913, + "step": 700 + }, + { + "epoch": 0.08, + "learning_rate": 1.9868853108008387e-05, + "loss": 0.4755, + "step": 701 + }, + { + "epoch": 0.08, + "learning_rate": 1.986825490002867e-05, + "loss": 0.4836, + "step": 702 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867655339877713e-05, + "loss": 0.4985, + "step": 703 + }, + { + "epoch": 0.08, + "learning_rate": 1.9867054427637667e-05, + "loss": 0.4972, + "step": 704 + }, + { + "epoch": 0.08, + "learning_rate": 1.986645216339087e-05, + "loss": 0.5101, + "step": 705 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865848547219845e-05, + "loss": 0.4929, + "step": 706 + }, + { + "epoch": 0.08, + "learning_rate": 1.9865243579207304e-05, + "loss": 0.4964, + "step": 707 + }, + { + "epoch": 0.08, + "learning_rate": 1.986463725943614e-05, + "loss": 0.5103, + "step": 708 + }, + { + "epoch": 0.08, + "learning_rate": 1.9864029587989432e-05, + "loss": 0.481, + "step": 709 + }, + { + "epoch": 0.08, + "learning_rate": 1.9863420564950445e-05, + "loss": 0.4843, + "step": 710 + }, + { + "epoch": 0.08, + "learning_rate": 1.986281019040263e-05, + "loss": 0.5253, + "step": 711 + }, + { + "epoch": 0.08, + "learning_rate": 1.9862198464429614e-05, + "loss": 0.4945, + "step": 
712 + }, + { + "epoch": 0.08, + "learning_rate": 1.9861585387115228e-05, + "loss": 0.4945, + "step": 713 + }, + { + "epoch": 0.08, + "learning_rate": 1.986097095854347e-05, + "loss": 0.4998, + "step": 714 + }, + { + "epoch": 0.08, + "learning_rate": 1.9860355178798536e-05, + "loss": 0.4981, + "step": 715 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859738047964795e-05, + "loss": 0.5039, + "step": 716 + }, + { + "epoch": 0.08, + "learning_rate": 1.9859119566126813e-05, + "loss": 0.4968, + "step": 717 + }, + { + "epoch": 0.08, + "learning_rate": 1.9858499733369335e-05, + "loss": 0.4974, + "step": 718 + }, + { + "epoch": 0.08, + "learning_rate": 1.985787854977729e-05, + "loss": 0.4996, + "step": 719 + }, + { + "epoch": 0.08, + "learning_rate": 1.9857256015435797e-05, + "loss": 0.4793, + "step": 720 + }, + { + "epoch": 0.08, + "learning_rate": 1.985663213043015e-05, + "loss": 0.4923, + "step": 721 + }, + { + "epoch": 0.08, + "learning_rate": 1.9856006894845844e-05, + "loss": 0.4878, + "step": 722 + }, + { + "epoch": 0.08, + "learning_rate": 1.9855380308768546e-05, + "loss": 0.5218, + "step": 723 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854752372284113e-05, + "loss": 0.4872, + "step": 724 + }, + { + "epoch": 0.08, + "learning_rate": 1.9854123085478587e-05, + "loss": 0.4902, + "step": 725 + }, + { + "epoch": 0.08, + "learning_rate": 1.9853492448438192e-05, + "loss": 0.5086, + "step": 726 + }, + { + "epoch": 0.08, + "learning_rate": 1.985286046124934e-05, + "loss": 0.4679, + "step": 727 + }, + { + "epoch": 0.08, + "learning_rate": 1.985222712399863e-05, + "loss": 0.4948, + "step": 728 + }, + { + "epoch": 0.08, + "learning_rate": 1.985159243677284e-05, + "loss": 0.4955, + "step": 729 + }, + { + "epoch": 0.08, + "learning_rate": 1.985095639965894e-05, + "loss": 0.4996, + "step": 730 + }, + { + "epoch": 0.08, + "learning_rate": 1.985031901274408e-05, + "loss": 0.512, + "step": 731 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849680276115593e-05, + "loss": 0.492, 
+ "step": 732 + }, + { + "epoch": 0.08, + "learning_rate": 1.9849040189861004e-05, + "loss": 0.4928, + "step": 733 + }, + { + "epoch": 0.08, + "learning_rate": 1.9848398754068018e-05, + "loss": 0.5268, + "step": 734 + }, + { + "epoch": 0.08, + "learning_rate": 1.984775596882452e-05, + "loss": 0.4822, + "step": 735 + }, + { + "epoch": 0.08, + "learning_rate": 1.98471118342186e-05, + "loss": 0.487, + "step": 736 + }, + { + "epoch": 0.08, + "learning_rate": 1.9846466350338506e-05, + "loss": 0.5087, + "step": 737 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845819517272688e-05, + "loss": 0.4785, + "step": 738 + }, + { + "epoch": 0.08, + "learning_rate": 1.9845171335109776e-05, + "loss": 0.523, + "step": 739 + }, + { + "epoch": 0.08, + "learning_rate": 1.9844521803938588e-05, + "loss": 0.4755, + "step": 740 + }, + { + "epoch": 0.08, + "learning_rate": 1.9843870923848122e-05, + "loss": 0.4949, + "step": 741 + }, + { + "epoch": 0.08, + "learning_rate": 1.984321869492756e-05, + "loss": 0.5024, + "step": 742 + }, + { + "epoch": 0.08, + "learning_rate": 1.984256511726628e-05, + "loss": 0.502, + "step": 743 + }, + { + "epoch": 0.09, + "learning_rate": 1.984191019095383e-05, + "loss": 0.4899, + "step": 744 + }, + { + "epoch": 0.09, + "learning_rate": 1.9841253916079953e-05, + "loss": 0.5067, + "step": 745 + }, + { + "epoch": 0.09, + "learning_rate": 1.9840596292734573e-05, + "loss": 0.4877, + "step": 746 + }, + { + "epoch": 0.09, + "learning_rate": 1.9839937321007795e-05, + "loss": 0.5142, + "step": 747 + }, + { + "epoch": 0.09, + "learning_rate": 1.983927700098992e-05, + "loss": 0.488, + "step": 748 + }, + { + "epoch": 0.09, + "learning_rate": 1.983861533277142e-05, + "loss": 0.501, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 1.983795231644296e-05, + "loss": 0.4959, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 1.983728795209539e-05, + "loss": 0.5074, + "step": 751 + }, + { + "epoch": 0.09, + "learning_rate": 1.9836622239819743e-05, + "loss": 
0.4955, + "step": 752 + }, + { + "epoch": 0.09, + "learning_rate": 1.983595517970723e-05, + "loss": 0.4876, + "step": 753 + }, + { + "epoch": 0.09, + "learning_rate": 1.9835286771849264e-05, + "loss": 0.4906, + "step": 754 + }, + { + "epoch": 0.09, + "learning_rate": 1.9834617016337424e-05, + "loss": 0.5096, + "step": 755 + }, + { + "epoch": 0.09, + "learning_rate": 1.9833945913263483e-05, + "loss": 0.513, + "step": 756 + }, + { + "epoch": 0.09, + "learning_rate": 1.9833273462719396e-05, + "loss": 0.5042, + "step": 757 + }, + { + "epoch": 0.09, + "learning_rate": 1.9832599664797306e-05, + "loss": 0.4622, + "step": 758 + }, + { + "epoch": 0.09, + "learning_rate": 1.9831924519589537e-05, + "loss": 0.4877, + "step": 759 + }, + { + "epoch": 0.09, + "learning_rate": 1.9831248027188604e-05, + "loss": 0.5123, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 1.983057018768719e-05, + "loss": 0.4812, + "step": 761 + }, + { + "epoch": 0.09, + "learning_rate": 1.982989100117819e-05, + "loss": 0.4839, + "step": 762 + }, + { + "epoch": 0.09, + "learning_rate": 1.9829210467754654e-05, + "loss": 0.5185, + "step": 763 + }, + { + "epoch": 0.09, + "learning_rate": 1.9828528587509836e-05, + "loss": 0.4878, + "step": 764 + }, + { + "epoch": 0.09, + "learning_rate": 1.982784536053717e-05, + "loss": 0.4959, + "step": 765 + }, + { + "epoch": 0.09, + "learning_rate": 1.9827160786930267e-05, + "loss": 0.5101, + "step": 766 + }, + { + "epoch": 0.09, + "learning_rate": 1.9826474866782933e-05, + "loss": 0.4955, + "step": 767 + }, + { + "epoch": 0.09, + "learning_rate": 1.9825787600189163e-05, + "loss": 0.5152, + "step": 768 + }, + { + "epoch": 0.09, + "learning_rate": 1.982509898724311e-05, + "loss": 0.4909, + "step": 769 + }, + { + "epoch": 0.09, + "learning_rate": 1.9824409028039143e-05, + "loss": 0.5013, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 1.9823717722671798e-05, + "loss": 0.4928, + "step": 771 + }, + { + "epoch": 0.09, + "learning_rate": 
1.98230250712358e-05, + "loss": 0.4885, + "step": 772 + }, + { + "epoch": 0.09, + "learning_rate": 1.9822331073826056e-05, + "loss": 0.5048, + "step": 773 + }, + { + "epoch": 0.09, + "learning_rate": 1.982163573053766e-05, + "loss": 0.4921, + "step": 774 + }, + { + "epoch": 0.09, + "learning_rate": 1.9820939041465887e-05, + "loss": 0.4884, + "step": 775 + }, + { + "epoch": 0.09, + "learning_rate": 1.9820241006706203e-05, + "loss": 0.4746, + "step": 776 + }, + { + "epoch": 0.09, + "learning_rate": 1.9819541626354252e-05, + "loss": 0.4968, + "step": 777 + }, + { + "epoch": 0.09, + "learning_rate": 1.9818840900505866e-05, + "loss": 0.4767, + "step": 778 + }, + { + "epoch": 0.09, + "learning_rate": 1.9818138829257063e-05, + "loss": 0.5091, + "step": 779 + }, + { + "epoch": 0.09, + "learning_rate": 1.9817435412704037e-05, + "loss": 0.5013, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 1.981673065094317e-05, + "loss": 0.525, + "step": 781 + }, + { + "epoch": 0.09, + "learning_rate": 1.9816024544071038e-05, + "loss": 0.4728, + "step": 782 + }, + { + "epoch": 0.09, + "learning_rate": 1.9815317092184388e-05, + "loss": 0.505, + "step": 783 + }, + { + "epoch": 0.09, + "learning_rate": 1.9814608295380155e-05, + "loss": 0.4836, + "step": 784 + }, + { + "epoch": 0.09, + "learning_rate": 1.9813898153755465e-05, + "loss": 0.4904, + "step": 785 + }, + { + "epoch": 0.09, + "learning_rate": 1.9813186667407624e-05, + "loss": 0.4916, + "step": 786 + }, + { + "epoch": 0.09, + "learning_rate": 1.9812473836434115e-05, + "loss": 0.4835, + "step": 787 + }, + { + "epoch": 0.09, + "learning_rate": 1.981175966093262e-05, + "loss": 0.5016, + "step": 788 + }, + { + "epoch": 0.09, + "learning_rate": 1.9811044141000985e-05, + "loss": 0.507, + "step": 789 + }, + { + "epoch": 0.09, + "learning_rate": 1.9810327276737268e-05, + "loss": 0.4745, + "step": 790 + }, + { + "epoch": 0.09, + "learning_rate": 1.980960906823968e-05, + "loss": 0.5002, + "step": 791 + }, + { + "epoch": 0.09, + 
"learning_rate": 1.9808889515606644e-05, + "loss": 0.4881, + "step": 792 + }, + { + "epoch": 0.09, + "learning_rate": 1.9808168618936746e-05, + "loss": 0.4806, + "step": 793 + }, + { + "epoch": 0.09, + "learning_rate": 1.980744637832877e-05, + "loss": 0.4801, + "step": 794 + }, + { + "epoch": 0.09, + "learning_rate": 1.9806722793881675e-05, + "loss": 0.4944, + "step": 795 + }, + { + "epoch": 0.09, + "learning_rate": 1.9805997865694616e-05, + "loss": 0.499, + "step": 796 + }, + { + "epoch": 0.09, + "learning_rate": 1.9805271593866914e-05, + "loss": 0.5009, + "step": 797 + }, + { + "epoch": 0.09, + "learning_rate": 1.9804543978498093e-05, + "loss": 0.4867, + "step": 798 + }, + { + "epoch": 0.09, + "learning_rate": 1.9803815019687844e-05, + "loss": 0.5012, + "step": 799 + }, + { + "epoch": 0.09, + "learning_rate": 1.980308471753606e-05, + "loss": 0.5014, + "step": 800 + }, + { + "epoch": 0.09, + "learning_rate": 1.9802353072142802e-05, + "loss": 0.5037, + "step": 801 + }, + { + "epoch": 0.09, + "learning_rate": 1.9801620083608327e-05, + "loss": 0.4848, + "step": 802 + }, + { + "epoch": 0.09, + "learning_rate": 1.9800885752033067e-05, + "loss": 0.4847, + "step": 803 + }, + { + "epoch": 0.09, + "learning_rate": 1.980015007751764e-05, + "loss": 0.4958, + "step": 804 + }, + { + "epoch": 0.09, + "learning_rate": 1.9799413060162854e-05, + "loss": 0.4871, + "step": 805 + }, + { + "epoch": 0.09, + "learning_rate": 1.9798674700069698e-05, + "loss": 0.4962, + "step": 806 + }, + { + "epoch": 0.09, + "learning_rate": 1.979793499733934e-05, + "loss": 0.5009, + "step": 807 + }, + { + "epoch": 0.09, + "learning_rate": 1.9797193952073135e-05, + "loss": 0.4764, + "step": 808 + }, + { + "epoch": 0.09, + "learning_rate": 1.9796451564372624e-05, + "loss": 0.5013, + "step": 809 + }, + { + "epoch": 0.09, + "learning_rate": 1.979570783433954e-05, + "loss": 0.5022, + "step": 810 + }, + { + "epoch": 0.09, + "learning_rate": 1.9794962762075772e-05, + "loss": 0.4912, + "step": 811 + }, + { + 
"epoch": 0.09, + "learning_rate": 1.9794216347683425e-05, + "loss": 0.5052, + "step": 812 + }, + { + "epoch": 0.09, + "learning_rate": 1.979346859126477e-05, + "loss": 0.4961, + "step": 813 + }, + { + "epoch": 0.09, + "learning_rate": 1.979271949292227e-05, + "loss": 0.5019, + "step": 814 + }, + { + "epoch": 0.09, + "learning_rate": 1.9791969052758563e-05, + "loss": 0.497, + "step": 815 + }, + { + "epoch": 0.09, + "learning_rate": 1.979121727087648e-05, + "loss": 0.4944, + "step": 816 + }, + { + "epoch": 0.09, + "learning_rate": 1.979046414737903e-05, + "loss": 0.4989, + "step": 817 + }, + { + "epoch": 0.09, + "learning_rate": 1.978970968236941e-05, + "loss": 0.4955, + "step": 818 + }, + { + "epoch": 0.09, + "learning_rate": 1.9788953875950992e-05, + "loss": 0.4843, + "step": 819 + }, + { + "epoch": 0.09, + "learning_rate": 1.9788196728227348e-05, + "loss": 0.4918, + "step": 820 + }, + { + "epoch": 0.09, + "learning_rate": 1.9787438239302217e-05, + "loss": 0.4815, + "step": 821 + }, + { + "epoch": 0.09, + "learning_rate": 1.9786678409279535e-05, + "loss": 0.4935, + "step": 822 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785917238263405e-05, + "loss": 0.4966, + "step": 823 + }, + { + "epoch": 0.09, + "learning_rate": 1.9785154726358134e-05, + "loss": 0.514, + "step": 824 + }, + { + "epoch": 0.09, + "learning_rate": 1.9784390873668206e-05, + "loss": 0.517, + "step": 825 + }, + { + "epoch": 0.09, + "learning_rate": 1.9783625680298276e-05, + "loss": 0.4887, + "step": 826 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782859146353196e-05, + "loss": 0.4793, + "step": 827 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782091271938e-05, + "loss": 0.4941, + "step": 828 + }, + { + "epoch": 0.09, + "learning_rate": 1.9781322057157902e-05, + "loss": 0.5018, + "step": 829 + }, + { + "epoch": 0.09, + "learning_rate": 1.9780551502118306e-05, + "loss": 0.4994, + "step": 830 + }, + { + "epoch": 0.09, + "learning_rate": 1.9779779606924788e-05, + "loss": 0.491, + "step": 831 + 
}, + { + "epoch": 0.1, + "learning_rate": 1.977900637168312e-05, + "loss": 0.4961, + "step": 832 + }, + { + "epoch": 0.1, + "learning_rate": 1.9778231796499254e-05, + "loss": 0.4925, + "step": 833 + }, + { + "epoch": 0.1, + "learning_rate": 1.977745588147932e-05, + "loss": 0.4914, + "step": 834 + }, + { + "epoch": 0.1, + "learning_rate": 1.977667862672964e-05, + "loss": 0.4989, + "step": 835 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775900032356704e-05, + "loss": 0.4993, + "step": 836 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775120098467212e-05, + "loss": 0.4878, + "step": 837 + }, + { + "epoch": 0.1, + "learning_rate": 1.9774338825168024e-05, + "loss": 0.5092, + "step": 838 + }, + { + "epoch": 0.1, + "learning_rate": 1.977355621256619e-05, + "loss": 0.4944, + "step": 839 + }, + { + "epoch": 0.1, + "learning_rate": 1.9772772260768954e-05, + "loss": 0.5022, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771986969883727e-05, + "loss": 0.5086, + "step": 841 + }, + { + "epoch": 0.1, + "learning_rate": 1.9771200340018115e-05, + "loss": 0.4895, + "step": 842 + }, + { + "epoch": 0.1, + "learning_rate": 1.97704123712799e-05, + "loss": 0.4911, + "step": 843 + }, + { + "epoch": 0.1, + "learning_rate": 1.976962306377706e-05, + "loss": 0.5018, + "step": 844 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768832417617737e-05, + "loss": 0.4837, + "step": 845 + }, + { + "epoch": 0.1, + "learning_rate": 1.9768040432910276e-05, + "loss": 0.5251, + "step": 846 + }, + { + "epoch": 0.1, + "learning_rate": 1.976724710976319e-05, + "loss": 0.5045, + "step": 847 + }, + { + "epoch": 0.1, + "learning_rate": 1.9766452448285184e-05, + "loss": 0.5115, + "step": 848 + }, + { + "epoch": 0.1, + "learning_rate": 1.9765656448585148e-05, + "loss": 0.4769, + "step": 849 + }, + { + "epoch": 0.1, + "learning_rate": 1.976485911077215e-05, + "loss": 0.4936, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 1.9764060434955437e-05, + "loss": 0.4867, + "step": 851 + }, + { + 
"epoch": 0.1, + "learning_rate": 1.9763260421244455e-05, + "loss": 0.4924, + "step": 852 + }, + { + "epoch": 0.1, + "learning_rate": 1.9762459069748817e-05, + "loss": 0.4903, + "step": 853 + }, + { + "epoch": 0.1, + "learning_rate": 1.9761656380578328e-05, + "loss": 0.4812, + "step": 854 + }, + { + "epoch": 0.1, + "learning_rate": 1.9760852353842973e-05, + "loss": 0.4974, + "step": 855 + }, + { + "epoch": 0.1, + "learning_rate": 1.9760046989652926e-05, + "loss": 0.5047, + "step": 856 + }, + { + "epoch": 0.1, + "learning_rate": 1.9759240288118536e-05, + "loss": 0.4837, + "step": 857 + }, + { + "epoch": 0.1, + "learning_rate": 1.975843224935034e-05, + "loss": 0.4966, + "step": 858 + }, + { + "epoch": 0.1, + "learning_rate": 1.9757622873459056e-05, + "loss": 0.505, + "step": 859 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756812160555586e-05, + "loss": 0.4818, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 1.9756000110751023e-05, + "loss": 0.4965, + "step": 861 + }, + { + "epoch": 0.1, + "learning_rate": 1.975518672415663e-05, + "loss": 0.4987, + "step": 862 + }, + { + "epoch": 0.1, + "learning_rate": 1.975437200088386e-05, + "loss": 0.4718, + "step": 863 + }, + { + "epoch": 0.1, + "learning_rate": 1.9753555941044345e-05, + "loss": 0.5024, + "step": 864 + }, + { + "epoch": 0.1, + "learning_rate": 1.9752738544749906e-05, + "loss": 0.4937, + "step": 865 + }, + { + "epoch": 0.1, + "learning_rate": 1.975191981211255e-05, + "loss": 0.4968, + "step": 866 + }, + { + "epoch": 0.1, + "learning_rate": 1.9751099743244454e-05, + "loss": 0.4785, + "step": 867 + }, + { + "epoch": 0.1, + "learning_rate": 1.9750278338257985e-05, + "loss": 0.5002, + "step": 868 + }, + { + "epoch": 0.1, + "learning_rate": 1.9749455597265704e-05, + "loss": 0.4948, + "step": 869 + }, + { + "epoch": 0.1, + "learning_rate": 1.9748631520380333e-05, + "loss": 0.4887, + "step": 870 + }, + { + "epoch": 0.1, + "learning_rate": 1.97478061077148e-05, + "loss": 0.4751, + "step": 871 + }, + { + 
"epoch": 0.1, + "learning_rate": 1.9746979359382193e-05, + "loss": 0.4976, + "step": 872 + }, + { + "epoch": 0.1, + "learning_rate": 1.9746151275495803e-05, + "loss": 0.5071, + "step": 873 + }, + { + "epoch": 0.1, + "learning_rate": 1.974532185616909e-05, + "loss": 0.5251, + "step": 874 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744491101515715e-05, + "loss": 0.484, + "step": 875 + }, + { + "epoch": 0.1, + "learning_rate": 1.9743659011649495e-05, + "loss": 0.4894, + "step": 876 + }, + { + "epoch": 0.1, + "learning_rate": 1.9742825586684457e-05, + "loss": 0.4815, + "step": 877 + }, + { + "epoch": 0.1, + "learning_rate": 1.9741990826734793e-05, + "loss": 0.5014, + "step": 878 + }, + { + "epoch": 0.1, + "learning_rate": 1.9741154731914882e-05, + "loss": 0.4918, + "step": 879 + }, + { + "epoch": 0.1, + "learning_rate": 1.974031730233929e-05, + "loss": 0.4868, + "step": 880 + }, + { + "epoch": 0.1, + "learning_rate": 1.9739478538122765e-05, + "loss": 0.4782, + "step": 881 + }, + { + "epoch": 0.1, + "learning_rate": 1.9738638439380237e-05, + "loss": 0.5087, + "step": 882 + }, + { + "epoch": 0.1, + "learning_rate": 1.9737797006226815e-05, + "loss": 0.4789, + "step": 883 + }, + { + "epoch": 0.1, + "learning_rate": 1.9736954238777793e-05, + "loss": 0.4782, + "step": 884 + }, + { + "epoch": 0.1, + "learning_rate": 1.973611013714865e-05, + "loss": 0.4946, + "step": 885 + }, + { + "epoch": 0.1, + "learning_rate": 1.9735264701455054e-05, + "loss": 0.5054, + "step": 886 + }, + { + "epoch": 0.1, + "learning_rate": 1.973441793181284e-05, + "loss": 0.4952, + "step": 887 + }, + { + "epoch": 0.1, + "learning_rate": 1.9733569828338038e-05, + "loss": 0.4816, + "step": 888 + }, + { + "epoch": 0.1, + "learning_rate": 1.9732720391146852e-05, + "loss": 0.5189, + "step": 889 + }, + { + "epoch": 0.1, + "learning_rate": 1.973186962035568e-05, + "loss": 0.4958, + "step": 890 + }, + { + "epoch": 0.1, + "learning_rate": 1.97310175160811e-05, + "loss": 0.5128, + "step": 891 + }, + { + 
"epoch": 0.1, + "learning_rate": 1.9730164078439857e-05, + "loss": 0.5123, + "step": 892 + }, + { + "epoch": 0.1, + "learning_rate": 1.97293093075489e-05, + "loss": 0.4817, + "step": 893 + }, + { + "epoch": 0.1, + "learning_rate": 1.9728453203525352e-05, + "loss": 0.5027, + "step": 894 + }, + { + "epoch": 0.1, + "learning_rate": 1.9727595766486514e-05, + "loss": 0.5033, + "step": 895 + }, + { + "epoch": 0.1, + "learning_rate": 1.972673699654988e-05, + "loss": 0.4904, + "step": 896 + }, + { + "epoch": 0.1, + "learning_rate": 1.9725876893833108e-05, + "loss": 0.4848, + "step": 897 + }, + { + "epoch": 0.1, + "learning_rate": 1.9725015458454068e-05, + "loss": 0.4898, + "step": 898 + }, + { + "epoch": 0.1, + "learning_rate": 1.9724152690530785e-05, + "loss": 0.5056, + "step": 899 + }, + { + "epoch": 0.1, + "learning_rate": 1.972328859018148e-05, + "loss": 0.4908, + "step": 900 + }, + { + "epoch": 0.1, + "learning_rate": 1.9722423157524553e-05, + "loss": 0.4829, + "step": 901 + }, + { + "epoch": 0.1, + "learning_rate": 1.972155639267859e-05, + "loss": 0.4885, + "step": 902 + }, + { + "epoch": 0.1, + "learning_rate": 1.972068829576236e-05, + "loss": 0.4938, + "step": 903 + }, + { + "epoch": 0.1, + "learning_rate": 1.9719818866894802e-05, + "loss": 0.4802, + "step": 904 + }, + { + "epoch": 0.1, + "learning_rate": 1.9718948106195055e-05, + "loss": 0.5079, + "step": 905 + }, + { + "epoch": 0.1, + "learning_rate": 1.971807601378243e-05, + "loss": 0.4982, + "step": 906 + }, + { + "epoch": 0.1, + "learning_rate": 1.9717202589776424e-05, + "loss": 0.4993, + "step": 907 + }, + { + "epoch": 0.1, + "learning_rate": 1.971632783429672e-05, + "loss": 0.5026, + "step": 908 + }, + { + "epoch": 0.1, + "learning_rate": 1.9715451747463168e-05, + "loss": 0.4881, + "step": 909 + }, + { + "epoch": 0.1, + "learning_rate": 1.971457432939582e-05, + "loss": 0.5166, + "step": 910 + }, + { + "epoch": 0.1, + "learning_rate": 1.97136955802149e-05, + "loss": 0.4888, + "step": 911 + }, + { + "epoch": 
0.1, + "learning_rate": 1.9712815500040815e-05, + "loss": 0.4803, + "step": 912 + }, + { + "epoch": 0.1, + "learning_rate": 1.9711934088994157e-05, + "loss": 0.4939, + "step": 913 + }, + { + "epoch": 0.1, + "learning_rate": 1.97110513471957e-05, + "loss": 0.4948, + "step": 914 + }, + { + "epoch": 0.1, + "learning_rate": 1.9710167274766395e-05, + "loss": 0.4926, + "step": 915 + }, + { + "epoch": 0.1, + "learning_rate": 1.9709281871827386e-05, + "loss": 0.4969, + "step": 916 + }, + { + "epoch": 0.1, + "learning_rate": 1.9708395138499986e-05, + "loss": 0.4836, + "step": 917 + }, + { + "epoch": 0.1, + "learning_rate": 1.97075070749057e-05, + "loss": 0.5263, + "step": 918 + }, + { + "epoch": 0.11, + "learning_rate": 1.970661768116622e-05, + "loss": 0.4922, + "step": 919 + }, + { + "epoch": 0.11, + "learning_rate": 1.9705726957403398e-05, + "loss": 0.4912, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 1.9704834903739297e-05, + "loss": 0.4587, + "step": 921 + }, + { + "epoch": 0.11, + "learning_rate": 1.970394152029614e-05, + "loss": 0.5059, + "step": 922 + }, + { + "epoch": 0.11, + "learning_rate": 1.970304680719634e-05, + "loss": 0.4826, + "step": 923 + }, + { + "epoch": 0.11, + "learning_rate": 1.9702150764562498e-05, + "loss": 0.5044, + "step": 924 + }, + { + "epoch": 0.11, + "learning_rate": 1.970125339251739e-05, + "loss": 0.4838, + "step": 925 + }, + { + "epoch": 0.11, + "learning_rate": 1.9700354691183977e-05, + "loss": 0.5082, + "step": 926 + }, + { + "epoch": 0.11, + "learning_rate": 1.9699454660685398e-05, + "loss": 0.4833, + "step": 927 + }, + { + "epoch": 0.11, + "learning_rate": 1.969855330114498e-05, + "loss": 0.4837, + "step": 928 + }, + { + "epoch": 0.11, + "learning_rate": 1.9697650612686228e-05, + "loss": 0.4915, + "step": 929 + }, + { + "epoch": 0.11, + "learning_rate": 1.9696746595432828e-05, + "loss": 0.4941, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 1.9695841249508656e-05, + "loss": 0.5001, + "step": 931 + }, + { + 
"epoch": 0.11, + "learning_rate": 1.9694934575037762e-05, + "loss": 0.514, + "step": 932 + }, + { + "epoch": 0.11, + "learning_rate": 1.969402657214438e-05, + "loss": 0.4917, + "step": 933 + }, + { + "epoch": 0.11, + "learning_rate": 1.9693117240952928e-05, + "loss": 0.4981, + "step": 934 + }, + { + "epoch": 0.11, + "learning_rate": 1.9692206581588e-05, + "loss": 0.4806, + "step": 935 + }, + { + "epoch": 0.11, + "learning_rate": 1.969129459417438e-05, + "loss": 0.496, + "step": 936 + }, + { + "epoch": 0.11, + "learning_rate": 1.9690381278837038e-05, + "loss": 0.4817, + "step": 937 + }, + { + "epoch": 0.11, + "learning_rate": 1.9689466635701106e-05, + "loss": 0.5036, + "step": 938 + }, + { + "epoch": 0.11, + "learning_rate": 1.9688550664891915e-05, + "loss": 0.5118, + "step": 939 + }, + { + "epoch": 0.11, + "learning_rate": 1.968763336653498e-05, + "loss": 0.5007, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 1.968671474075598e-05, + "loss": 0.4755, + "step": 941 + }, + { + "epoch": 0.11, + "learning_rate": 1.96857947876808e-05, + "loss": 0.51, + "step": 942 + }, + { + "epoch": 0.11, + "learning_rate": 1.968487350743548e-05, + "loss": 0.4993, + "step": 943 + }, + { + "epoch": 0.11, + "learning_rate": 1.968395090014627e-05, + "loss": 0.5162, + "step": 944 + }, + { + "epoch": 0.11, + "learning_rate": 1.968302696593958e-05, + "loss": 0.4912, + "step": 945 + }, + { + "epoch": 0.11, + "learning_rate": 1.968210170494201e-05, + "loss": 0.4905, + "step": 946 + }, + { + "epoch": 0.11, + "learning_rate": 1.9681175117280343e-05, + "loss": 0.4988, + "step": 947 + }, + { + "epoch": 0.11, + "learning_rate": 1.9680247203081537e-05, + "loss": 0.4947, + "step": 948 + }, + { + "epoch": 0.11, + "learning_rate": 1.9679317962472746e-05, + "loss": 0.4997, + "step": 949 + }, + { + "epoch": 0.11, + "learning_rate": 1.9678387395581292e-05, + "loss": 0.4728, + "step": 950 + }, + { + "epoch": 0.11, + "learning_rate": 1.967745550253468e-05, + "loss": 0.4971, + "step": 951 + }, + { 
+ "epoch": 0.11, + "learning_rate": 1.9676522283460606e-05, + "loss": 0.488, + "step": 952 + }, + { + "epoch": 0.11, + "learning_rate": 1.9675587738486935e-05, + "loss": 0.4897, + "step": 953 + }, + { + "epoch": 0.11, + "learning_rate": 1.9674651867741733e-05, + "loss": 0.4924, + "step": 954 + }, + { + "epoch": 0.11, + "learning_rate": 1.967371467135322e-05, + "loss": 0.489, + "step": 955 + }, + { + "epoch": 0.11, + "learning_rate": 1.9672776149449826e-05, + "loss": 0.4761, + "step": 956 + }, + { + "epoch": 0.11, + "learning_rate": 1.967183630216014e-05, + "loss": 0.513, + "step": 957 + }, + { + "epoch": 0.11, + "learning_rate": 1.9670895129612946e-05, + "loss": 0.4968, + "step": 958 + }, + { + "epoch": 0.11, + "learning_rate": 1.9669952631937206e-05, + "loss": 0.4754, + "step": 959 + }, + { + "epoch": 0.11, + "learning_rate": 1.9669008809262064e-05, + "loss": 0.4952, + "step": 960 + }, + { + "epoch": 0.11, + "learning_rate": 1.9668063661716837e-05, + "loss": 0.4881, + "step": 961 + }, + { + "epoch": 0.11, + "learning_rate": 1.9667117189431045e-05, + "loss": 0.4988, + "step": 962 + }, + { + "epoch": 0.11, + "learning_rate": 1.9666169392534363e-05, + "loss": 0.4809, + "step": 963 + }, + { + "epoch": 0.11, + "learning_rate": 1.966522027115667e-05, + "loss": 0.4772, + "step": 964 + }, + { + "epoch": 0.11, + "learning_rate": 1.966426982542801e-05, + "loss": 0.4936, + "step": 965 + }, + { + "epoch": 0.11, + "learning_rate": 1.9663318055478616e-05, + "loss": 0.4872, + "step": 966 + }, + { + "epoch": 0.11, + "learning_rate": 1.9662364961438907e-05, + "loss": 0.5017, + "step": 967 + }, + { + "epoch": 0.11, + "learning_rate": 1.966141054343947e-05, + "loss": 0.4938, + "step": 968 + }, + { + "epoch": 0.11, + "learning_rate": 1.9660454801611094e-05, + "loss": 0.4742, + "step": 969 + }, + { + "epoch": 0.11, + "learning_rate": 1.9659497736084722e-05, + "loss": 0.4818, + "step": 970 + }, + { + "epoch": 0.11, + "learning_rate": 1.9658539346991504e-05, + "loss": 0.4862, + "step": 
971 + }, + { + "epoch": 0.11, + "learning_rate": 1.9657579634462757e-05, + "loss": 0.4894, + "step": 972 + }, + { + "epoch": 0.11, + "learning_rate": 1.9656618598629985e-05, + "loss": 0.4805, + "step": 973 + }, + { + "epoch": 0.11, + "learning_rate": 1.9655656239624864e-05, + "loss": 0.4846, + "step": 974 + }, + { + "epoch": 0.11, + "learning_rate": 1.965469255757927e-05, + "loss": 0.5027, + "step": 975 + }, + { + "epoch": 0.11, + "learning_rate": 1.9653727552625242e-05, + "loss": 0.4863, + "step": 976 + }, + { + "epoch": 0.11, + "learning_rate": 1.9652761224895006e-05, + "loss": 0.4852, + "step": 977 + }, + { + "epoch": 0.11, + "learning_rate": 1.9651793574520975e-05, + "loss": 0.4887, + "step": 978 + }, + { + "epoch": 0.11, + "learning_rate": 1.965082460163574e-05, + "loss": 0.4766, + "step": 979 + }, + { + "epoch": 0.11, + "learning_rate": 1.9649854306372065e-05, + "loss": 0.5133, + "step": 980 + }, + { + "epoch": 0.11, + "learning_rate": 1.9648882688862905e-05, + "loss": 0.4669, + "step": 981 + }, + { + "epoch": 0.11, + "learning_rate": 1.9647909749241394e-05, + "loss": 0.4821, + "step": 982 + }, + { + "epoch": 0.11, + "learning_rate": 1.9646935487640848e-05, + "loss": 0.4946, + "step": 983 + }, + { + "epoch": 0.11, + "learning_rate": 1.964595990419476e-05, + "loss": 0.5043, + "step": 984 + }, + { + "epoch": 0.11, + "learning_rate": 1.964498299903681e-05, + "loss": 0.4751, + "step": 985 + }, + { + "epoch": 0.11, + "learning_rate": 1.964400477230085e-05, + "loss": 0.5033, + "step": 986 + }, + { + "epoch": 0.11, + "learning_rate": 1.9643025224120923e-05, + "loss": 0.4757, + "step": 987 + }, + { + "epoch": 0.11, + "learning_rate": 1.9642044354631255e-05, + "loss": 0.4983, + "step": 988 + }, + { + "epoch": 0.11, + "learning_rate": 1.9641062163966232e-05, + "loss": 0.49, + "step": 989 + }, + { + "epoch": 0.11, + "learning_rate": 1.9640078652260447e-05, + "loss": 0.4855, + "step": 990 + }, + { + "epoch": 0.11, + "learning_rate": 1.9639093819648664e-05, + "loss": 
0.5073, + "step": 991 + }, + { + "epoch": 0.11, + "learning_rate": 1.963810766626582e-05, + "loss": 0.5177, + "step": 992 + }, + { + "epoch": 0.11, + "learning_rate": 1.9637120192247046e-05, + "loss": 0.4823, + "step": 993 + }, + { + "epoch": 0.11, + "learning_rate": 1.9636131397727646e-05, + "loss": 0.4998, + "step": 994 + }, + { + "epoch": 0.11, + "learning_rate": 1.9635141282843105e-05, + "loss": 0.4702, + "step": 995 + }, + { + "epoch": 0.11, + "learning_rate": 1.9634149847729093e-05, + "loss": 0.4997, + "step": 996 + }, + { + "epoch": 0.11, + "learning_rate": 1.963315709252146e-05, + "loss": 0.4942, + "step": 997 + }, + { + "epoch": 0.11, + "learning_rate": 1.963216301735623e-05, + "loss": 0.5002, + "step": 998 + }, + { + "epoch": 0.11, + "learning_rate": 1.9631167622369617e-05, + "loss": 0.5039, + "step": 999 + }, + { + "epoch": 0.11, + "learning_rate": 1.9630170907698015e-05, + "loss": 0.497, + "step": 1000 + }, + { + "epoch": 0.11, + "learning_rate": 1.9629172873477995e-05, + "loss": 0.4884, + "step": 1001 + }, + { + "epoch": 0.11, + "learning_rate": 1.9628173519846308e-05, + "loss": 0.4728, + "step": 1002 + }, + { + "epoch": 0.11, + "learning_rate": 1.9627172846939886e-05, + "loss": 0.5004, + "step": 1003 + }, + { + "epoch": 0.11, + "learning_rate": 1.962617085489585e-05, + "loss": 0.4989, + "step": 1004 + }, + { + "epoch": 0.11, + "learning_rate": 1.962516754385149e-05, + "loss": 0.4877, + "step": 1005 + }, + { + "epoch": 0.11, + "learning_rate": 1.962416291394428e-05, + "loss": 0.4992, + "step": 1006 + }, + { + "epoch": 0.12, + "learning_rate": 1.9623156965311884e-05, + "loss": 0.4895, + "step": 1007 + }, + { + "epoch": 0.12, + "learning_rate": 1.9622149698092135e-05, + "loss": 0.4922, + "step": 1008 + }, + { + "epoch": 0.12, + "learning_rate": 1.962114111242305e-05, + "loss": 0.5085, + "step": 1009 + }, + { + "epoch": 0.12, + "learning_rate": 1.962013120844283e-05, + "loss": 0.473, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 
1.9619119986289855e-05, + "loss": 0.4942, + "step": 1011 + }, + { + "epoch": 0.12, + "learning_rate": 1.9618107446102682e-05, + "loss": 0.4962, + "step": 1012 + }, + { + "epoch": 0.12, + "learning_rate": 1.9617093588020057e-05, + "loss": 0.4859, + "step": 1013 + }, + { + "epoch": 0.12, + "learning_rate": 1.9616078412180896e-05, + "loss": 0.4819, + "step": 1014 + }, + { + "epoch": 0.12, + "learning_rate": 1.96150619187243e-05, + "loss": 0.4826, + "step": 1015 + }, + { + "epoch": 0.12, + "learning_rate": 1.9614044107789553e-05, + "loss": 0.5166, + "step": 1016 + }, + { + "epoch": 0.12, + "learning_rate": 1.9613024979516123e-05, + "loss": 0.4963, + "step": 1017 + }, + { + "epoch": 0.12, + "learning_rate": 1.9612004534043644e-05, + "loss": 0.4796, + "step": 1018 + }, + { + "epoch": 0.12, + "learning_rate": 1.9610982771511947e-05, + "loss": 0.4787, + "step": 1019 + }, + { + "epoch": 0.12, + "learning_rate": 1.9609959692061037e-05, + "loss": 0.4938, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 1.9608935295831092e-05, + "loss": 0.48, + "step": 1021 + }, + { + "epoch": 0.12, + "learning_rate": 1.9607909582962478e-05, + "loss": 0.4807, + "step": 1022 + }, + { + "epoch": 0.12, + "learning_rate": 1.9606882553595748e-05, + "loss": 0.4893, + "step": 1023 + }, + { + "epoch": 0.12, + "learning_rate": 1.960585420787162e-05, + "loss": 0.4854, + "step": 1024 + }, + { + "epoch": 0.12, + "learning_rate": 1.9604824545931005e-05, + "loss": 0.5115, + "step": 1025 + }, + { + "epoch": 0.12, + "learning_rate": 1.960379356791499e-05, + "loss": 0.4804, + "step": 1026 + }, + { + "epoch": 0.12, + "learning_rate": 1.960276127396484e-05, + "loss": 0.4954, + "step": 1027 + }, + { + "epoch": 0.12, + "learning_rate": 1.9601727664222e-05, + "loss": 0.4761, + "step": 1028 + }, + { + "epoch": 0.12, + "learning_rate": 1.96006927388281e-05, + "loss": 0.5093, + "step": 1029 + }, + { + "epoch": 0.12, + "learning_rate": 1.959965649792495e-05, + "loss": 0.4766, + "step": 1030 + }, + { + 
"epoch": 0.12, + "learning_rate": 1.9598618941654535e-05, + "loss": 0.4788, + "step": 1031 + }, + { + "epoch": 0.12, + "learning_rate": 1.9597580070159026e-05, + "loss": 0.4934, + "step": 1032 + }, + { + "epoch": 0.12, + "learning_rate": 1.9596539883580773e-05, + "loss": 0.4993, + "step": 1033 + }, + { + "epoch": 0.12, + "learning_rate": 1.9595498382062295e-05, + "loss": 0.4967, + "step": 1034 + }, + { + "epoch": 0.12, + "learning_rate": 1.9594455565746313e-05, + "loss": 0.5108, + "step": 1035 + }, + { + "epoch": 0.12, + "learning_rate": 1.959341143477571e-05, + "loss": 0.482, + "step": 1036 + }, + { + "epoch": 0.12, + "learning_rate": 1.9592365989293557e-05, + "loss": 0.4744, + "step": 1037 + }, + { + "epoch": 0.12, + "learning_rate": 1.95913192294431e-05, + "loss": 0.5033, + "step": 1038 + }, + { + "epoch": 0.12, + "learning_rate": 1.9590271155367776e-05, + "loss": 0.4749, + "step": 1039 + }, + { + "epoch": 0.12, + "learning_rate": 1.9589221767211188e-05, + "loss": 0.489, + "step": 1040 + }, + { + "epoch": 0.12, + "learning_rate": 1.9588171065117122e-05, + "loss": 0.4956, + "step": 1041 + }, + { + "epoch": 0.12, + "learning_rate": 1.9587119049229558e-05, + "loss": 0.4975, + "step": 1042 + }, + { + "epoch": 0.12, + "learning_rate": 1.9586065719692636e-05, + "loss": 0.5006, + "step": 1043 + }, + { + "epoch": 0.12, + "learning_rate": 1.9585011076650695e-05, + "loss": 0.5086, + "step": 1044 + }, + { + "epoch": 0.12, + "learning_rate": 1.958395512024824e-05, + "loss": 0.4902, + "step": 1045 + }, + { + "epoch": 0.12, + "learning_rate": 1.9582897850629958e-05, + "loss": 0.4795, + "step": 1046 + }, + { + "epoch": 0.12, + "learning_rate": 1.9581839267940722e-05, + "loss": 0.4852, + "step": 1047 + }, + { + "epoch": 0.12, + "learning_rate": 1.9580779372325583e-05, + "loss": 0.4886, + "step": 1048 + }, + { + "epoch": 0.12, + "learning_rate": 1.9579718163929767e-05, + "loss": 0.4913, + "step": 1049 + }, + { + "epoch": 0.12, + "learning_rate": 1.957865564289868e-05, + "loss": 
0.5032, + "step": 1050 + }, + { + "epoch": 0.12, + "learning_rate": 1.9577591809377917e-05, + "loss": 0.4787, + "step": 1051 + }, + { + "epoch": 0.12, + "learning_rate": 1.957652666351325e-05, + "loss": 0.5013, + "step": 1052 + }, + { + "epoch": 0.12, + "learning_rate": 1.9575460205450616e-05, + "loss": 0.5499, + "step": 1053 + }, + { + "epoch": 0.12, + "learning_rate": 1.9574392435336156e-05, + "loss": 0.4768, + "step": 1054 + }, + { + "epoch": 0.12, + "learning_rate": 1.957332335331617e-05, + "loss": 0.4963, + "step": 1055 + }, + { + "epoch": 0.12, + "learning_rate": 1.957225295953715e-05, + "loss": 0.4937, + "step": 1056 + }, + { + "epoch": 0.12, + "learning_rate": 1.9571181254145762e-05, + "loss": 0.4775, + "step": 1057 + }, + { + "epoch": 0.12, + "learning_rate": 1.9570108237288853e-05, + "loss": 0.5146, + "step": 1058 + }, + { + "epoch": 0.12, + "learning_rate": 1.9569033909113454e-05, + "loss": 0.4953, + "step": 1059 + }, + { + "epoch": 0.12, + "learning_rate": 1.9567958269766768e-05, + "loss": 0.4758, + "step": 1060 + }, + { + "epoch": 0.12, + "learning_rate": 1.9566881319396184e-05, + "loss": 0.5177, + "step": 1061 + }, + { + "epoch": 0.12, + "learning_rate": 1.956580305814927e-05, + "loss": 0.4715, + "step": 1062 + }, + { + "epoch": 0.12, + "learning_rate": 1.9564723486173766e-05, + "loss": 0.4987, + "step": 1063 + }, + { + "epoch": 0.12, + "learning_rate": 1.95636426036176e-05, + "loss": 0.4819, + "step": 1064 + }, + { + "epoch": 0.12, + "learning_rate": 1.9562560410628883e-05, + "loss": 0.5034, + "step": 1065 + }, + { + "epoch": 0.12, + "learning_rate": 1.9561476907355886e-05, + "loss": 0.5026, + "step": 1066 + }, + { + "epoch": 0.12, + "learning_rate": 1.956039209394709e-05, + "loss": 0.5056, + "step": 1067 + }, + { + "epoch": 0.12, + "learning_rate": 1.9559305970551125e-05, + "loss": 0.4825, + "step": 1068 + }, + { + "epoch": 0.12, + "learning_rate": 1.955821853731682e-05, + "loss": 0.515, + "step": 1069 + }, + { + "epoch": 0.12, + "learning_rate": 
1.955712979439318e-05, + "loss": 0.4992, + "step": 1070 + }, + { + "epoch": 0.12, + "learning_rate": 1.955603974192938e-05, + "loss": 0.483, + "step": 1071 + }, + { + "epoch": 0.12, + "learning_rate": 1.955494838007479e-05, + "loss": 0.493, + "step": 1072 + }, + { + "epoch": 0.12, + "learning_rate": 1.9553855708978943e-05, + "loss": 0.4969, + "step": 1073 + }, + { + "epoch": 0.12, + "learning_rate": 1.9552761728791563e-05, + "loss": 0.506, + "step": 1074 + }, + { + "epoch": 0.12, + "learning_rate": 1.955166643966255e-05, + "loss": 0.4959, + "step": 1075 + }, + { + "epoch": 0.12, + "learning_rate": 1.9550569841741984e-05, + "loss": 0.4879, + "step": 1076 + }, + { + "epoch": 0.12, + "learning_rate": 1.9549471935180123e-05, + "loss": 0.4908, + "step": 1077 + }, + { + "epoch": 0.12, + "learning_rate": 1.95483727201274e-05, + "loss": 0.4847, + "step": 1078 + }, + { + "epoch": 0.12, + "learning_rate": 1.9547272196734436e-05, + "loss": 0.5054, + "step": 1079 + }, + { + "epoch": 0.12, + "learning_rate": 1.954617036515203e-05, + "loss": 0.4997, + "step": 1080 + }, + { + "epoch": 0.12, + "learning_rate": 1.9545067225531155e-05, + "loss": 0.5026, + "step": 1081 + }, + { + "epoch": 0.12, + "learning_rate": 1.954396277802296e-05, + "loss": 0.5015, + "step": 1082 + }, + { + "epoch": 0.12, + "learning_rate": 1.954285702277879e-05, + "loss": 0.4918, + "step": 1083 + }, + { + "epoch": 0.12, + "learning_rate": 1.954174995995015e-05, + "loss": 0.4966, + "step": 1084 + }, + { + "epoch": 0.12, + "learning_rate": 1.9540641589688735e-05, + "loss": 0.4972, + "step": 1085 + }, + { + "epoch": 0.12, + "learning_rate": 1.953953191214642e-05, + "loss": 0.4849, + "step": 1086 + }, + { + "epoch": 0.12, + "learning_rate": 1.9538420927475247e-05, + "loss": 0.5057, + "step": 1087 + }, + { + "epoch": 0.12, + "learning_rate": 1.953730863582745e-05, + "loss": 0.4687, + "step": 1088 + }, + { + "epoch": 0.12, + "learning_rate": 1.9536195037355438e-05, + "loss": 0.4987, + "step": 1089 + }, + { + "epoch": 
0.12, + "learning_rate": 1.9535080132211805e-05, + "loss": 0.4879, + "step": 1090 + }, + { + "epoch": 0.12, + "learning_rate": 1.9533963920549307e-05, + "loss": 0.4896, + "step": 1091 + }, + { + "epoch": 0.12, + "learning_rate": 1.9532846402520898e-05, + "loss": 0.4914, + "step": 1092 + }, + { + "epoch": 0.12, + "learning_rate": 1.95317275782797e-05, + "loss": 0.4981, + "step": 1093 + }, + { + "epoch": 0.13, + "learning_rate": 1.953060744797901e-05, + "loss": 0.5114, + "step": 1094 + }, + { + "epoch": 0.13, + "learning_rate": 1.9529486011772326e-05, + "loss": 0.4893, + "step": 1095 + }, + { + "epoch": 0.13, + "learning_rate": 1.95283632698133e-05, + "loss": 0.4894, + "step": 1096 + }, + { + "epoch": 0.13, + "learning_rate": 1.952723922225577e-05, + "loss": 0.4872, + "step": 1097 + }, + { + "epoch": 0.13, + "learning_rate": 1.952611386925376e-05, + "loss": 0.4907, + "step": 1098 + }, + { + "epoch": 0.13, + "learning_rate": 1.952498721096147e-05, + "loss": 0.4816, + "step": 1099 + }, + { + "epoch": 0.13, + "learning_rate": 1.952385924753328e-05, + "loss": 0.4909, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 1.9522729979123735e-05, + "loss": 0.493, + "step": 1101 + }, + { + "epoch": 0.13, + "learning_rate": 1.952159940588758e-05, + "loss": 0.5024, + "step": 1102 + }, + { + "epoch": 0.13, + "learning_rate": 1.9520467527979726e-05, + "loss": 0.4942, + "step": 1103 + }, + { + "epoch": 0.13, + "learning_rate": 1.9519334345555264e-05, + "loss": 0.484, + "step": 1104 + }, + { + "epoch": 0.13, + "learning_rate": 1.9518199858769466e-05, + "loss": 0.4942, + "step": 1105 + }, + { + "epoch": 0.13, + "learning_rate": 1.9517064067777786e-05, + "loss": 0.469, + "step": 1106 + }, + { + "epoch": 0.13, + "learning_rate": 1.9515926972735847e-05, + "loss": 0.4857, + "step": 1107 + }, + { + "epoch": 0.13, + "learning_rate": 1.9514788573799457e-05, + "loss": 0.4872, + "step": 1108 + }, + { + "epoch": 0.13, + "learning_rate": 1.9513648871124604e-05, + "loss": 0.5115, + 
"step": 1109 + }, + { + "epoch": 0.13, + "learning_rate": 1.9512507864867452e-05, + "loss": 0.4947, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 1.9511365555184345e-05, + "loss": 0.4913, + "step": 1111 + }, + { + "epoch": 0.13, + "learning_rate": 1.9510221942231803e-05, + "loss": 0.4762, + "step": 1112 + }, + { + "epoch": 0.13, + "learning_rate": 1.950907702616653e-05, + "loss": 0.4975, + "step": 1113 + }, + { + "epoch": 0.13, + "learning_rate": 1.9507930807145406e-05, + "loss": 0.4734, + "step": 1114 + }, + { + "epoch": 0.13, + "learning_rate": 1.9506783285325482e-05, + "loss": 0.4966, + "step": 1115 + }, + { + "epoch": 0.13, + "learning_rate": 1.9505634460863997e-05, + "loss": 0.4885, + "step": 1116 + }, + { + "epoch": 0.13, + "learning_rate": 1.950448433391837e-05, + "loss": 0.4667, + "step": 1117 + }, + { + "epoch": 0.13, + "learning_rate": 1.9503332904646188e-05, + "loss": 0.5057, + "step": 1118 + }, + { + "epoch": 0.13, + "learning_rate": 1.9502180173205227e-05, + "loss": 0.5029, + "step": 1119 + }, + { + "epoch": 0.13, + "learning_rate": 1.9501026139753433e-05, + "loss": 0.4856, + "step": 1120 + }, + { + "epoch": 0.13, + "learning_rate": 1.9499870804448936e-05, + "loss": 0.4836, + "step": 1121 + }, + { + "epoch": 0.13, + "learning_rate": 1.9498714167450047e-05, + "loss": 0.4915, + "step": 1122 + }, + { + "epoch": 0.13, + "learning_rate": 1.9497556228915246e-05, + "loss": 0.4904, + "step": 1123 + }, + { + "epoch": 0.13, + "learning_rate": 1.9496396989003195e-05, + "loss": 0.4843, + "step": 1124 + }, + { + "epoch": 0.13, + "learning_rate": 1.9495236447872738e-05, + "loss": 0.481, + "step": 1125 + }, + { + "epoch": 0.13, + "learning_rate": 1.94940746056829e-05, + "loss": 0.5034, + "step": 1126 + }, + { + "epoch": 0.13, + "learning_rate": 1.949291146259287e-05, + "loss": 0.4913, + "step": 1127 + }, + { + "epoch": 0.13, + "learning_rate": 1.949174701876203e-05, + "loss": 0.4805, + "step": 1128 + }, + { + "epoch": 0.13, + "learning_rate": 
1.9490581274349934e-05, + "loss": 0.4944, + "step": 1129 + }, + { + "epoch": 0.13, + "learning_rate": 1.9489414229516318e-05, + "loss": 0.4868, + "step": 1130 + }, + { + "epoch": 0.13, + "learning_rate": 1.9488245884421087e-05, + "loss": 0.4768, + "step": 1131 + }, + { + "epoch": 0.13, + "learning_rate": 1.9487076239224337e-05, + "loss": 0.4823, + "step": 1132 + }, + { + "epoch": 0.13, + "learning_rate": 1.948590529408633e-05, + "loss": 0.4853, + "step": 1133 + }, + { + "epoch": 0.13, + "learning_rate": 1.948473304916751e-05, + "loss": 0.4846, + "step": 1134 + }, + { + "epoch": 0.13, + "learning_rate": 1.948355950462851e-05, + "loss": 0.5139, + "step": 1135 + }, + { + "epoch": 0.13, + "learning_rate": 1.9482384660630125e-05, + "loss": 0.4903, + "step": 1136 + }, + { + "epoch": 0.13, + "learning_rate": 1.9481208517333336e-05, + "loss": 0.5032, + "step": 1137 + }, + { + "epoch": 0.13, + "learning_rate": 1.9480031074899303e-05, + "loss": 0.4933, + "step": 1138 + }, + { + "epoch": 0.13, + "learning_rate": 1.9478852333489356e-05, + "loss": 0.4865, + "step": 1139 + }, + { + "epoch": 0.13, + "learning_rate": 1.9477672293265014e-05, + "loss": 0.4838, + "step": 1140 + }, + { + "epoch": 0.13, + "learning_rate": 1.9476490954387968e-05, + "loss": 0.4925, + "step": 1141 + }, + { + "epoch": 0.13, + "learning_rate": 1.947530831702009e-05, + "loss": 0.5201, + "step": 1142 + }, + { + "epoch": 0.13, + "learning_rate": 1.9474124381323424e-05, + "loss": 0.4918, + "step": 1143 + }, + { + "epoch": 0.13, + "learning_rate": 1.9472939147460194e-05, + "loss": 0.4993, + "step": 1144 + }, + { + "epoch": 0.13, + "learning_rate": 1.947175261559281e-05, + "loss": 0.4926, + "step": 1145 + }, + { + "epoch": 0.13, + "learning_rate": 1.9470564785883848e-05, + "loss": 0.5156, + "step": 1146 + }, + { + "epoch": 0.13, + "learning_rate": 1.9469375658496066e-05, + "loss": 0.4835, + "step": 1147 + }, + { + "epoch": 0.13, + "learning_rate": 1.946818523359241e-05, + "loss": 0.4778, + "step": 1148 + }, + { + 
"epoch": 0.13, + "learning_rate": 1.9466993511335985e-05, + "loss": 0.4853, + "step": 1149 + }, + { + "epoch": 0.13, + "learning_rate": 1.9465800491890087e-05, + "loss": 0.4952, + "step": 1150 + }, + { + "epoch": 0.13, + "learning_rate": 1.946460617541819e-05, + "loss": 0.4911, + "step": 1151 + }, + { + "epoch": 0.13, + "learning_rate": 1.9463410562083937e-05, + "loss": 0.4908, + "step": 1152 + }, + { + "epoch": 0.13, + "learning_rate": 1.946221365205115e-05, + "loss": 0.4897, + "step": 1153 + }, + { + "epoch": 0.13, + "learning_rate": 1.9461015445483843e-05, + "loss": 0.4868, + "step": 1154 + }, + { + "epoch": 0.13, + "learning_rate": 1.9459815942546192e-05, + "loss": 0.497, + "step": 1155 + }, + { + "epoch": 0.13, + "learning_rate": 1.9458615143402554e-05, + "loss": 0.4966, + "step": 1156 + }, + { + "epoch": 0.13, + "learning_rate": 1.9457413048217466e-05, + "loss": 0.4687, + "step": 1157 + }, + { + "epoch": 0.13, + "learning_rate": 1.9456209657155645e-05, + "loss": 0.4876, + "step": 1158 + }, + { + "epoch": 0.13, + "learning_rate": 1.9455004970381978e-05, + "loss": 0.5257, + "step": 1159 + }, + { + "epoch": 0.13, + "learning_rate": 1.9453798988061535e-05, + "loss": 0.4873, + "step": 1160 + }, + { + "epoch": 0.13, + "learning_rate": 1.9452591710359566e-05, + "loss": 0.4734, + "step": 1161 + }, + { + "epoch": 0.13, + "learning_rate": 1.9451383137441492e-05, + "loss": 0.485, + "step": 1162 + }, + { + "epoch": 0.13, + "learning_rate": 1.9450173269472915e-05, + "loss": 0.4731, + "step": 1163 + }, + { + "epoch": 0.13, + "learning_rate": 1.9448962106619614e-05, + "loss": 0.4943, + "step": 1164 + }, + { + "epoch": 0.13, + "learning_rate": 1.944774964904754e-05, + "loss": 0.4884, + "step": 1165 + }, + { + "epoch": 0.13, + "learning_rate": 1.944653589692284e-05, + "loss": 0.48, + "step": 1166 + }, + { + "epoch": 0.13, + "learning_rate": 1.9445320850411816e-05, + "loss": 0.5034, + "step": 1167 + }, + { + "epoch": 0.13, + "learning_rate": 1.9444104509680954e-05, + "loss": 
0.5059, + "step": 1168 + }, + { + "epoch": 0.13, + "learning_rate": 1.9442886874896924e-05, + "loss": 0.4864, + "step": 1169 + }, + { + "epoch": 0.13, + "learning_rate": 1.944166794622657e-05, + "loss": 0.491, + "step": 1170 + }, + { + "epoch": 0.13, + "learning_rate": 1.9440447723836914e-05, + "loss": 0.4786, + "step": 1171 + }, + { + "epoch": 0.13, + "learning_rate": 1.9439226207895143e-05, + "loss": 0.4786, + "step": 1172 + }, + { + "epoch": 0.13, + "learning_rate": 1.9438003398568647e-05, + "loss": 0.4986, + "step": 1173 + }, + { + "epoch": 0.13, + "learning_rate": 1.9436779296024967e-05, + "loss": 0.4635, + "step": 1174 + }, + { + "epoch": 0.13, + "learning_rate": 1.9435553900431838e-05, + "loss": 0.5228, + "step": 1175 + }, + { + "epoch": 0.13, + "learning_rate": 1.9434327211957166e-05, + "loss": 0.4745, + "step": 1176 + }, + { + "epoch": 0.13, + "learning_rate": 1.943309923076903e-05, + "loss": 0.4875, + "step": 1177 + }, + { + "epoch": 0.13, + "learning_rate": 1.9431869957035698e-05, + "loss": 0.479, + "step": 1178 + }, + { + "epoch": 0.13, + "learning_rate": 1.9430639390925604e-05, + "loss": 0.4843, + "step": 1179 + }, + { + "epoch": 0.13, + "learning_rate": 1.942940753260736e-05, + "loss": 0.4825, + "step": 1180 + }, + { + "epoch": 0.13, + "learning_rate": 1.9428174382249764e-05, + "loss": 0.4885, + "step": 1181 + }, + { + "epoch": 0.14, + "learning_rate": 1.942693994002178e-05, + "loss": 0.4608, + "step": 1182 + }, + { + "epoch": 0.14, + "learning_rate": 1.9425704206092562e-05, + "loss": 0.4976, + "step": 1183 + }, + { + "epoch": 0.14, + "learning_rate": 1.9424467180631422e-05, + "loss": 0.4948, + "step": 1184 + }, + { + "epoch": 0.14, + "learning_rate": 1.942322886380787e-05, + "loss": 0.5119, + "step": 1185 + }, + { + "epoch": 0.14, + "learning_rate": 1.942198925579158e-05, + "loss": 0.4797, + "step": 1186 + }, + { + "epoch": 0.14, + "learning_rate": 1.9420748356752405e-05, + "loss": 0.4915, + "step": 1187 + }, + { + "epoch": 0.14, + "learning_rate": 
1.9419506166860374e-05, + "loss": 0.4795, + "step": 1188 + }, + { + "epoch": 0.14, + "learning_rate": 1.9418262686285697e-05, + "loss": 0.4857, + "step": 1189 + }, + { + "epoch": 0.14, + "learning_rate": 1.9417017915198758e-05, + "loss": 0.4809, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 1.9415771853770117e-05, + "loss": 0.4958, + "step": 1191 + }, + { + "epoch": 0.14, + "learning_rate": 1.9414524502170514e-05, + "loss": 0.49, + "step": 1192 + }, + { + "epoch": 0.14, + "learning_rate": 1.941327586057087e-05, + "loss": 0.5236, + "step": 1193 + }, + { + "epoch": 0.14, + "learning_rate": 1.9412025929142263e-05, + "loss": 0.4767, + "step": 1194 + }, + { + "epoch": 0.14, + "learning_rate": 1.9410774708055972e-05, + "loss": 0.5111, + "step": 1195 + }, + { + "epoch": 0.14, + "learning_rate": 1.940952219748344e-05, + "loss": 0.4883, + "step": 1196 + }, + { + "epoch": 0.14, + "learning_rate": 1.9408268397596287e-05, + "loss": 0.4899, + "step": 1197 + }, + { + "epoch": 0.14, + "learning_rate": 1.9407013308566315e-05, + "loss": 0.4783, + "step": 1198 + }, + { + "epoch": 0.14, + "learning_rate": 1.9405756930565496e-05, + "loss": 0.5071, + "step": 1199 + }, + { + "epoch": 0.14, + "learning_rate": 1.9404499263765983e-05, + "loss": 0.4763, + "step": 1200 + }, + { + "epoch": 0.14, + "learning_rate": 1.9403240308340105e-05, + "loss": 0.4966, + "step": 1201 + }, + { + "epoch": 0.14, + "learning_rate": 1.940198006446037e-05, + "loss": 0.4701, + "step": 1202 + }, + { + "epoch": 0.14, + "learning_rate": 1.940071853229945e-05, + "loss": 0.5047, + "step": 1203 + }, + { + "epoch": 0.14, + "learning_rate": 1.939945571203021e-05, + "loss": 0.4907, + "step": 1204 + }, + { + "epoch": 0.14, + "learning_rate": 1.9398191603825687e-05, + "loss": 0.4777, + "step": 1205 + }, + { + "epoch": 0.14, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4816, + "step": 1206 + }, + { + "epoch": 0.14, + "learning_rate": 1.9395659524303795e-05, + "loss": 0.492, + "step": 1207 + }, + { + 
"epoch": 0.14, + "learning_rate": 1.9394391553333384e-05, + "loss": 0.4973, + "step": 1208 + }, + { + "epoch": 0.14, + "learning_rate": 1.939312229512159e-05, + "loss": 0.501, + "step": 1209 + }, + { + "epoch": 0.14, + "learning_rate": 1.9391851749842326e-05, + "loss": 0.4835, + "step": 1210 + }, + { + "epoch": 0.14, + "learning_rate": 1.939057991766969e-05, + "loss": 0.4839, + "step": 1211 + }, + { + "epoch": 0.14, + "learning_rate": 1.938930679877795e-05, + "loss": 0.4796, + "step": 1212 + }, + { + "epoch": 0.14, + "learning_rate": 1.938803239334155e-05, + "loss": 0.4854, + "step": 1213 + }, + { + "epoch": 0.14, + "learning_rate": 1.9386756701535115e-05, + "loss": 0.5023, + "step": 1214 + }, + { + "epoch": 0.14, + "learning_rate": 1.938547972353344e-05, + "loss": 0.476, + "step": 1215 + }, + { + "epoch": 0.14, + "learning_rate": 1.93842014595115e-05, + "loss": 0.4784, + "step": 1216 + }, + { + "epoch": 0.14, + "learning_rate": 1.9382921909644448e-05, + "loss": 0.5077, + "step": 1217 + }, + { + "epoch": 0.14, + "learning_rate": 1.938164107410761e-05, + "loss": 0.5086, + "step": 1218 + }, + { + "epoch": 0.14, + "learning_rate": 1.938035895307649e-05, + "loss": 0.4828, + "step": 1219 + }, + { + "epoch": 0.14, + "learning_rate": 1.9379075546726764e-05, + "loss": 0.4738, + "step": 1220 + }, + { + "epoch": 0.14, + "learning_rate": 1.9377790855234288e-05, + "loss": 0.484, + "step": 1221 + }, + { + "epoch": 0.14, + "learning_rate": 1.9376504878775098e-05, + "loss": 0.4836, + "step": 1222 + }, + { + "epoch": 0.14, + "learning_rate": 1.9375217617525396e-05, + "loss": 0.4859, + "step": 1223 + }, + { + "epoch": 0.14, + "learning_rate": 1.937392907166157e-05, + "loss": 0.4851, + "step": 1224 + }, + { + "epoch": 0.14, + "learning_rate": 1.9372639241360173e-05, + "loss": 0.4709, + "step": 1225 + }, + { + "epoch": 0.14, + "learning_rate": 1.937134812679795e-05, + "loss": 0.5075, + "step": 1226 + }, + { + "epoch": 0.14, + "learning_rate": 1.9370055728151805e-05, + "loss": 0.5, + 
"step": 1227 + }, + { + "epoch": 0.14, + "learning_rate": 1.936876204559883e-05, + "loss": 0.4768, + "step": 1228 + }, + { + "epoch": 0.14, + "learning_rate": 1.936746707931628e-05, + "loss": 0.5111, + "step": 1229 + }, + { + "epoch": 0.14, + "learning_rate": 1.9366170829481607e-05, + "loss": 0.4642, + "step": 1230 + }, + { + "epoch": 0.14, + "learning_rate": 1.9364873296272414e-05, + "loss": 0.4755, + "step": 1231 + }, + { + "epoch": 0.14, + "learning_rate": 1.9363574479866504e-05, + "loss": 0.4973, + "step": 1232 + }, + { + "epoch": 0.14, + "learning_rate": 1.936227438044183e-05, + "loss": 0.4738, + "step": 1233 + }, + { + "epoch": 0.14, + "learning_rate": 1.9360972998176547e-05, + "loss": 0.4883, + "step": 1234 + }, + { + "epoch": 0.14, + "learning_rate": 1.9359670333248967e-05, + "loss": 0.4855, + "step": 1235 + }, + { + "epoch": 0.14, + "learning_rate": 1.935836638583759e-05, + "loss": 0.503, + "step": 1236 + }, + { + "epoch": 0.14, + "learning_rate": 1.935706115612108e-05, + "loss": 0.4941, + "step": 1237 + }, + { + "epoch": 0.14, + "learning_rate": 1.935575464427828e-05, + "loss": 0.4675, + "step": 1238 + }, + { + "epoch": 0.14, + "learning_rate": 1.9354446850488216e-05, + "loss": 0.468, + "step": 1239 + }, + { + "epoch": 0.14, + "learning_rate": 1.9353137774930085e-05, + "loss": 0.5015, + "step": 1240 + }, + { + "epoch": 0.14, + "learning_rate": 1.935182741778326e-05, + "loss": 0.4915, + "step": 1241 + }, + { + "epoch": 0.14, + "learning_rate": 1.9350515779227294e-05, + "loss": 0.4781, + "step": 1242 + }, + { + "epoch": 0.14, + "learning_rate": 1.93492028594419e-05, + "loss": 0.4798, + "step": 1243 + }, + { + "epoch": 0.14, + "learning_rate": 1.934788865860698e-05, + "loss": 0.4966, + "step": 1244 + }, + { + "epoch": 0.14, + "learning_rate": 1.9346573176902616e-05, + "loss": 0.4916, + "step": 1245 + }, + { + "epoch": 0.14, + "learning_rate": 1.934525641450905e-05, + "loss": 0.4928, + "step": 1246 + }, + { + "epoch": 0.14, + "learning_rate": 
1.9343938371606714e-05, + "loss": 0.4975, + "step": 1247 + }, + { + "epoch": 0.14, + "learning_rate": 1.9342619048376202e-05, + "loss": 0.4795, + "step": 1248 + }, + { + "epoch": 0.14, + "learning_rate": 1.93412984449983e-05, + "loss": 0.4847, + "step": 1249 + }, + { + "epoch": 0.14, + "learning_rate": 1.9339976561653956e-05, + "loss": 0.4977, + "step": 1250 + }, + { + "epoch": 0.14, + "learning_rate": 1.9338653398524295e-05, + "loss": 0.4751, + "step": 1251 + }, + { + "epoch": 0.14, + "learning_rate": 1.933732895579062e-05, + "loss": 0.4929, + "step": 1252 + }, + { + "epoch": 0.14, + "learning_rate": 1.933600323363442e-05, + "loss": 0.4971, + "step": 1253 + }, + { + "epoch": 0.14, + "learning_rate": 1.933467623223733e-05, + "loss": 0.4927, + "step": 1254 + }, + { + "epoch": 0.14, + "learning_rate": 1.9333347951781194e-05, + "loss": 0.4969, + "step": 1255 + }, + { + "epoch": 0.14, + "learning_rate": 1.933201839244801e-05, + "loss": 0.4833, + "step": 1256 + }, + { + "epoch": 0.14, + "learning_rate": 1.9330687554419956e-05, + "loss": 0.4656, + "step": 1257 + }, + { + "epoch": 0.14, + "learning_rate": 1.932935543787939e-05, + "loss": 0.496, + "step": 1258 + }, + { + "epoch": 0.14, + "learning_rate": 1.9328022043008842e-05, + "loss": 0.4697, + "step": 1259 + }, + { + "epoch": 0.14, + "learning_rate": 1.9326687369991012e-05, + "loss": 0.4901, + "step": 1260 + }, + { + "epoch": 0.14, + "learning_rate": 1.9325351419008783e-05, + "loss": 0.4935, + "step": 1261 + }, + { + "epoch": 0.14, + "learning_rate": 1.932401419024521e-05, + "loss": 0.5023, + "step": 1262 + }, + { + "epoch": 0.14, + "learning_rate": 1.9322675683883528e-05, + "loss": 0.4864, + "step": 1263 + }, + { + "epoch": 0.14, + "learning_rate": 1.9321335900107134e-05, + "loss": 0.4949, + "step": 1264 + }, + { + "epoch": 0.14, + "learning_rate": 1.931999483909961e-05, + "loss": 0.4694, + "step": 1265 + }, + { + "epoch": 0.14, + "learning_rate": 1.9318652501044715e-05, + "loss": 0.4915, + "step": 1266 + }, + { + 
"epoch": 0.14, + "learning_rate": 1.931730888612638e-05, + "loss": 0.492, + "step": 1267 + }, + { + "epoch": 0.14, + "learning_rate": 1.9315963994528707e-05, + "loss": 0.4826, + "step": 1268 + }, + { + "epoch": 0.15, + "learning_rate": 1.931461782643598e-05, + "loss": 0.4878, + "step": 1269 + }, + { + "epoch": 0.15, + "learning_rate": 1.9313270382032644e-05, + "loss": 0.5038, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 1.9311921661503338e-05, + "loss": 0.4938, + "step": 1271 + }, + { + "epoch": 0.15, + "learning_rate": 1.9310571665032867e-05, + "loss": 0.487, + "step": 1272 + }, + { + "epoch": 0.15, + "learning_rate": 1.9309220392806206e-05, + "loss": 0.4885, + "step": 1273 + }, + { + "epoch": 0.15, + "learning_rate": 1.9307867845008513e-05, + "loss": 0.4796, + "step": 1274 + }, + { + "epoch": 0.15, + "learning_rate": 1.930651402182512e-05, + "loss": 0.4803, + "step": 1275 + }, + { + "epoch": 0.15, + "learning_rate": 1.9305158923441524e-05, + "loss": 0.4871, + "step": 1276 + }, + { + "epoch": 0.15, + "learning_rate": 1.9303802550043404e-05, + "loss": 0.4888, + "step": 1277 + }, + { + "epoch": 0.15, + "learning_rate": 1.930244490181662e-05, + "loss": 0.4926, + "step": 1278 + }, + { + "epoch": 0.15, + "learning_rate": 1.9301085978947195e-05, + "loss": 0.4892, + "step": 1279 + }, + { + "epoch": 0.15, + "learning_rate": 1.9299725781621335e-05, + "loss": 0.4975, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 1.9298364310025412e-05, + "loss": 0.4898, + "step": 1281 + }, + { + "epoch": 0.15, + "learning_rate": 1.929700156434599e-05, + "loss": 0.472, + "step": 1282 + }, + { + "epoch": 0.15, + "learning_rate": 1.929563754476978e-05, + "loss": 0.4749, + "step": 1283 + }, + { + "epoch": 0.15, + "learning_rate": 1.929427225148369e-05, + "loss": 0.4967, + "step": 1284 + }, + { + "epoch": 0.15, + "learning_rate": 1.92929056846748e-05, + "loss": 0.4912, + "step": 1285 + }, + { + "epoch": 0.15, + "learning_rate": 1.9291537844530352e-05, + "loss": 
0.4963, + "step": 1286 + }, + { + "epoch": 0.15, + "learning_rate": 1.9290168731237776e-05, + "loss": 0.5038, + "step": 1287 + }, + { + "epoch": 0.15, + "learning_rate": 1.9288798344984673e-05, + "loss": 0.4706, + "step": 1288 + }, + { + "epoch": 0.15, + "learning_rate": 1.928742668595881e-05, + "loss": 0.498, + "step": 1289 + }, + { + "epoch": 0.15, + "learning_rate": 1.9286053754348142e-05, + "loss": 0.488, + "step": 1290 + }, + { + "epoch": 0.15, + "learning_rate": 1.9284679550340783e-05, + "loss": 0.4734, + "step": 1291 + }, + { + "epoch": 0.15, + "learning_rate": 1.928330407412504e-05, + "loss": 0.4883, + "step": 1292 + }, + { + "epoch": 0.15, + "learning_rate": 1.9281927325889373e-05, + "loss": 0.4742, + "step": 1293 + }, + { + "epoch": 0.15, + "learning_rate": 1.9280549305822435e-05, + "loss": 0.499, + "step": 1294 + }, + { + "epoch": 0.15, + "learning_rate": 1.927917001411304e-05, + "loss": 0.4847, + "step": 1295 + }, + { + "epoch": 0.15, + "learning_rate": 1.9277789450950187e-05, + "loss": 0.4661, + "step": 1296 + }, + { + "epoch": 0.15, + "learning_rate": 1.9276407616523044e-05, + "loss": 0.4792, + "step": 1297 + }, + { + "epoch": 0.15, + "learning_rate": 1.927502451102095e-05, + "loss": 0.4986, + "step": 1298 + }, + { + "epoch": 0.15, + "learning_rate": 1.927364013463342e-05, + "loss": 0.4912, + "step": 1299 + }, + { + "epoch": 0.15, + "learning_rate": 1.9272254487550144e-05, + "loss": 0.4729, + "step": 1300 + }, + { + "epoch": 0.15, + "learning_rate": 1.9270867569960994e-05, + "loss": 0.4785, + "step": 1301 + }, + { + "epoch": 0.15, + "learning_rate": 1.9269479382056008e-05, + "loss": 0.5064, + "step": 1302 + }, + { + "epoch": 0.15, + "learning_rate": 1.926808992402539e-05, + "loss": 0.4841, + "step": 1303 + }, + { + "epoch": 0.15, + "learning_rate": 1.926669919605953e-05, + "loss": 0.4717, + "step": 1304 + }, + { + "epoch": 0.15, + "learning_rate": 1.926530719834899e-05, + "loss": 0.4853, + "step": 1305 + }, + { + "epoch": 0.15, + "learning_rate": 
1.9263913931084507e-05, + "loss": 0.4704, + "step": 1306 + }, + { + "epoch": 0.15, + "learning_rate": 1.9262519394456985e-05, + "loss": 0.5001, + "step": 1307 + }, + { + "epoch": 0.15, + "learning_rate": 1.9261123588657514e-05, + "loss": 0.4755, + "step": 1308 + }, + { + "epoch": 0.15, + "learning_rate": 1.925972651387734e-05, + "loss": 0.4841, + "step": 1309 + }, + { + "epoch": 0.15, + "learning_rate": 1.9258328170307905e-05, + "loss": 0.5065, + "step": 1310 + }, + { + "epoch": 0.15, + "learning_rate": 1.9256928558140806e-05, + "loss": 0.4867, + "step": 1311 + }, + { + "epoch": 0.15, + "learning_rate": 1.925552767756782e-05, + "loss": 0.4879, + "step": 1312 + }, + { + "epoch": 0.15, + "learning_rate": 1.9254125528780908e-05, + "loss": 0.4927, + "step": 1313 + }, + { + "epoch": 0.15, + "learning_rate": 1.9252722111972182e-05, + "loss": 0.5016, + "step": 1314 + }, + { + "epoch": 0.15, + "learning_rate": 1.9251317427333953e-05, + "loss": 0.5064, + "step": 1315 + }, + { + "epoch": 0.15, + "learning_rate": 1.924991147505869e-05, + "loss": 0.4779, + "step": 1316 + }, + { + "epoch": 0.15, + "learning_rate": 1.924850425533904e-05, + "loss": 0.4678, + "step": 1317 + }, + { + "epoch": 0.15, + "learning_rate": 1.9247095768367822e-05, + "loss": 0.4932, + "step": 1318 + }, + { + "epoch": 0.15, + "learning_rate": 1.924568601433803e-05, + "loss": 0.4863, + "step": 1319 + }, + { + "epoch": 0.15, + "learning_rate": 1.9244274993442836e-05, + "loss": 0.484, + "step": 1320 + }, + { + "epoch": 0.15, + "learning_rate": 1.924286270587558e-05, + "loss": 0.4773, + "step": 1321 + }, + { + "epoch": 0.15, + "learning_rate": 1.924144915182977e-05, + "loss": 0.4771, + "step": 1322 + }, + { + "epoch": 0.15, + "learning_rate": 1.9240034331499105e-05, + "loss": 0.4836, + "step": 1323 + }, + { + "epoch": 0.15, + "learning_rate": 1.923861824507744e-05, + "loss": 0.4933, + "step": 1324 + }, + { + "epoch": 0.15, + "learning_rate": 1.9237200892758814e-05, + "loss": 0.4814, + "step": 1325 + }, + { + 
"epoch": 0.15, + "learning_rate": 1.923578227473743e-05, + "loss": 0.477, + "step": 1326 + }, + { + "epoch": 0.15, + "learning_rate": 1.923436239120768e-05, + "loss": 0.4851, + "step": 1327 + }, + { + "epoch": 0.15, + "learning_rate": 1.9232941242364114e-05, + "loss": 0.4971, + "step": 1328 + }, + { + "epoch": 0.15, + "learning_rate": 1.9231518828401458e-05, + "loss": 0.488, + "step": 1329 + }, + { + "epoch": 0.15, + "learning_rate": 1.923009514951462e-05, + "loss": 0.4949, + "step": 1330 + }, + { + "epoch": 0.15, + "learning_rate": 1.9228670205898675e-05, + "loss": 0.4887, + "step": 1331 + }, + { + "epoch": 0.15, + "learning_rate": 1.922724399774887e-05, + "loss": 0.4823, + "step": 1332 + }, + { + "epoch": 0.15, + "learning_rate": 1.9225816525260626e-05, + "loss": 0.4923, + "step": 1333 + }, + { + "epoch": 0.15, + "learning_rate": 1.9224387788629547e-05, + "loss": 0.4817, + "step": 1334 + }, + { + "epoch": 0.15, + "learning_rate": 1.922295778805139e-05, + "loss": 0.4834, + "step": 1335 + }, + { + "epoch": 0.15, + "learning_rate": 1.9221526523722104e-05, + "loss": 0.5067, + "step": 1336 + }, + { + "epoch": 0.15, + "learning_rate": 1.9220093995837805e-05, + "loss": 0.4721, + "step": 1337 + }, + { + "epoch": 0.15, + "learning_rate": 1.9218660204594778e-05, + "loss": 0.5119, + "step": 1338 + }, + { + "epoch": 0.15, + "learning_rate": 1.9217225150189483e-05, + "loss": 0.469, + "step": 1339 + }, + { + "epoch": 0.15, + "learning_rate": 1.921578883281856e-05, + "loss": 0.4769, + "step": 1340 + }, + { + "epoch": 0.15, + "learning_rate": 1.9214351252678815e-05, + "loss": 0.4706, + "step": 1341 + }, + { + "epoch": 0.15, + "learning_rate": 1.9212912409967223e-05, + "loss": 0.4812, + "step": 1342 + }, + { + "epoch": 0.15, + "learning_rate": 1.9211472304880945e-05, + "loss": 0.4874, + "step": 1343 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210030937617303e-05, + "loss": 0.5001, + "step": 1344 + }, + { + "epoch": 0.15, + "learning_rate": 1.9208588308373798e-05, + "loss": 
0.477, + "step": 1345 + }, + { + "epoch": 0.15, + "learning_rate": 1.9207144417348103e-05, + "loss": 0.5098, + "step": 1346 + }, + { + "epoch": 0.15, + "learning_rate": 1.9205699264738063e-05, + "loss": 0.4901, + "step": 1347 + }, + { + "epoch": 0.15, + "learning_rate": 1.9204252850741695e-05, + "loss": 0.4818, + "step": 1348 + }, + { + "epoch": 0.15, + "learning_rate": 1.920280517555719e-05, + "loss": 0.4806, + "step": 1349 + }, + { + "epoch": 0.15, + "learning_rate": 1.9201356239382914e-05, + "loss": 0.4831, + "step": 1350 + }, + { + "epoch": 0.15, + "learning_rate": 1.9199906042417403e-05, + "loss": 0.4894, + "step": 1351 + }, + { + "epoch": 0.15, + "learning_rate": 1.919845458485936e-05, + "loss": 0.4902, + "step": 1352 + }, + { + "epoch": 0.15, + "learning_rate": 1.9197001866907676e-05, + "loss": 0.4933, + "step": 1353 + }, + { + "epoch": 0.15, + "learning_rate": 1.9195547888761403e-05, + "loss": 0.4747, + "step": 1354 + }, + { + "epoch": 0.15, + "learning_rate": 1.9194092650619767e-05, + "loss": 0.4754, + "step": 1355 + }, + { + "epoch": 0.15, + "learning_rate": 1.9192636152682173e-05, + "loss": 0.5062, + "step": 1356 + }, + { + "epoch": 0.16, + "learning_rate": 1.9191178395148188e-05, + "loss": 0.4714, + "step": 1357 + }, + { + "epoch": 0.16, + "learning_rate": 1.9189719378217554e-05, + "loss": 0.4884, + "step": 1358 + }, + { + "epoch": 0.16, + "learning_rate": 1.91882591020902e-05, + "loss": 0.4867, + "step": 1359 + }, + { + "epoch": 0.16, + "learning_rate": 1.9186797566966205e-05, + "loss": 0.5001, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 1.918533477304584e-05, + "loss": 0.4858, + "step": 1361 + }, + { + "epoch": 0.16, + "learning_rate": 1.918387072052954e-05, + "loss": 0.4954, + "step": 1362 + }, + { + "epoch": 0.16, + "learning_rate": 1.918240540961791e-05, + "loss": 0.4623, + "step": 1363 + }, + { + "epoch": 0.16, + "learning_rate": 1.9180938840511727e-05, + "loss": 0.5021, + "step": 1364 + }, + { + "epoch": 0.16, + "learning_rate": 
1.917947101341195e-05, + "loss": 0.4912, + "step": 1365 + }, + { + "epoch": 0.16, + "learning_rate": 1.9178001928519703e-05, + "loss": 0.4741, + "step": 1366 + }, + { + "epoch": 0.16, + "learning_rate": 1.9176531586036282e-05, + "loss": 0.4754, + "step": 1367 + }, + { + "epoch": 0.16, + "learning_rate": 1.9175059986163157e-05, + "loss": 0.4776, + "step": 1368 + }, + { + "epoch": 0.16, + "learning_rate": 1.9173587129101967e-05, + "loss": 0.4847, + "step": 1369 + }, + { + "epoch": 0.16, + "learning_rate": 1.917211301505453e-05, + "loss": 0.4818, + "step": 1370 + }, + { + "epoch": 0.16, + "learning_rate": 1.9170637644222835e-05, + "loss": 0.465, + "step": 1371 + }, + { + "epoch": 0.16, + "learning_rate": 1.9169161016809036e-05, + "loss": 0.516, + "step": 1372 + }, + { + "epoch": 0.16, + "learning_rate": 1.9167683133015465e-05, + "loss": 0.4855, + "step": 1373 + }, + { + "epoch": 0.16, + "learning_rate": 1.9166203993044627e-05, + "loss": 0.4876, + "step": 1374 + }, + { + "epoch": 0.16, + "learning_rate": 1.9164723597099192e-05, + "loss": 0.4911, + "step": 1375 + }, + { + "epoch": 0.16, + "learning_rate": 1.9163241945382012e-05, + "loss": 0.4916, + "step": 1376 + }, + { + "epoch": 0.16, + "learning_rate": 1.9161759038096108e-05, + "loss": 0.4721, + "step": 1377 + }, + { + "epoch": 0.16, + "learning_rate": 1.9160274875444668e-05, + "loss": 0.4814, + "step": 1378 + }, + { + "epoch": 0.16, + "learning_rate": 1.9158789457631054e-05, + "loss": 0.4883, + "step": 1379 + }, + { + "epoch": 0.16, + "learning_rate": 1.9157302784858807e-05, + "loss": 0.4865, + "step": 1380 + }, + { + "epoch": 0.16, + "learning_rate": 1.915581485733163e-05, + "loss": 0.4839, + "step": 1381 + }, + { + "epoch": 0.16, + "learning_rate": 1.91543256752534e-05, + "loss": 0.4959, + "step": 1382 + }, + { + "epoch": 0.16, + "learning_rate": 1.915283523882818e-05, + "loss": 0.4822, + "step": 1383 + }, + { + "epoch": 0.16, + "learning_rate": 1.9151343548260176e-05, + "loss": 0.4907, + "step": 1384 + }, + { + 
"epoch": 0.16, + "learning_rate": 1.9149850603753793e-05, + "loss": 0.4936, + "step": 1385 + }, + { + "epoch": 0.16, + "learning_rate": 1.91483564055136e-05, + "loss": 0.4744, + "step": 1386 + }, + { + "epoch": 0.16, + "learning_rate": 1.9146860953744325e-05, + "loss": 0.4955, + "step": 1387 + }, + { + "epoch": 0.16, + "learning_rate": 1.9145364248650892e-05, + "loss": 0.4918, + "step": 1388 + }, + { + "epoch": 0.16, + "learning_rate": 1.914386629043837e-05, + "loss": 0.4582, + "step": 1389 + }, + { + "epoch": 0.16, + "learning_rate": 1.9142367079312023e-05, + "loss": 0.4987, + "step": 1390 + }, + { + "epoch": 0.16, + "learning_rate": 1.9140866615477272e-05, + "loss": 0.4701, + "step": 1391 + }, + { + "epoch": 0.16, + "learning_rate": 1.913936489913971e-05, + "loss": 0.4822, + "step": 1392 + }, + { + "epoch": 0.16, + "learning_rate": 1.9137861930505112e-05, + "loss": 0.4738, + "step": 1393 + }, + { + "epoch": 0.16, + "learning_rate": 1.9136357709779418e-05, + "loss": 0.4974, + "step": 1394 + }, + { + "epoch": 0.16, + "learning_rate": 1.9134852237168738e-05, + "loss": 0.4939, + "step": 1395 + }, + { + "epoch": 0.16, + "learning_rate": 1.9133345512879353e-05, + "loss": 0.4838, + "step": 1396 + }, + { + "epoch": 0.16, + "learning_rate": 1.9131837537117724e-05, + "loss": 0.4822, + "step": 1397 + }, + { + "epoch": 0.16, + "learning_rate": 1.913032831009047e-05, + "loss": 0.5028, + "step": 1398 + }, + { + "epoch": 0.16, + "learning_rate": 1.9128817832004393e-05, + "loss": 0.4745, + "step": 1399 + }, + { + "epoch": 0.16, + "learning_rate": 1.912730610306646e-05, + "loss": 0.4764, + "step": 1400 + }, + { + "epoch": 0.16, + "learning_rate": 1.9125793123483814e-05, + "loss": 0.4963, + "step": 1401 + }, + { + "epoch": 0.16, + "learning_rate": 1.912427889346377e-05, + "loss": 0.4879, + "step": 1402 + }, + { + "epoch": 0.16, + "learning_rate": 1.91227634132138e-05, + "loss": 0.4866, + "step": 1403 + }, + { + "epoch": 0.16, + "learning_rate": 1.912124668294157e-05, + "loss": 
0.4828, + "step": 1404 + }, + { + "epoch": 0.16, + "learning_rate": 1.91197287028549e-05, + "loss": 0.5033, + "step": 1405 + }, + { + "epoch": 0.16, + "learning_rate": 1.9118209473161794e-05, + "loss": 0.4922, + "step": 1406 + }, + { + "epoch": 0.16, + "learning_rate": 1.9116688994070413e-05, + "loss": 0.4738, + "step": 1407 + }, + { + "epoch": 0.16, + "learning_rate": 1.9115167265789096e-05, + "loss": 0.4901, + "step": 1408 + }, + { + "epoch": 0.16, + "learning_rate": 1.911364428852636e-05, + "loss": 0.4672, + "step": 1409 + }, + { + "epoch": 0.16, + "learning_rate": 1.9112120062490883e-05, + "loss": 0.4893, + "step": 1410 + }, + { + "epoch": 0.16, + "learning_rate": 1.911059458789152e-05, + "loss": 0.4717, + "step": 1411 + }, + { + "epoch": 0.16, + "learning_rate": 1.9109067864937292e-05, + "loss": 0.4992, + "step": 1412 + }, + { + "epoch": 0.16, + "learning_rate": 1.9107539893837396e-05, + "loss": 0.4814, + "step": 1413 + }, + { + "epoch": 0.16, + "learning_rate": 1.91060106748012e-05, + "loss": 0.4832, + "step": 1414 + }, + { + "epoch": 0.16, + "learning_rate": 1.9104480208038236e-05, + "loss": 0.4766, + "step": 1415 + }, + { + "epoch": 0.16, + "learning_rate": 1.9102948493758217e-05, + "loss": 0.4931, + "step": 1416 + }, + { + "epoch": 0.16, + "learning_rate": 1.9101415532171018e-05, + "loss": 0.4702, + "step": 1417 + }, + { + "epoch": 0.16, + "learning_rate": 1.90998813234867e-05, + "loss": 0.4695, + "step": 1418 + }, + { + "epoch": 0.16, + "learning_rate": 1.9098345867915467e-05, + "loss": 0.5036, + "step": 1419 + }, + { + "epoch": 0.16, + "learning_rate": 1.909680916566772e-05, + "loss": 0.5027, + "step": 1420 + }, + { + "epoch": 0.16, + "learning_rate": 1.9095271216954022e-05, + "loss": 0.4953, + "step": 1421 + }, + { + "epoch": 0.16, + "learning_rate": 1.9093732021985103e-05, + "loss": 0.4851, + "step": 1422 + }, + { + "epoch": 0.16, + "learning_rate": 1.909219158097187e-05, + "loss": 0.4972, + "step": 1423 + }, + { + "epoch": 0.16, + "learning_rate": 
1.9090649894125395e-05, + "loss": 0.4746, + "step": 1424 + }, + { + "epoch": 0.16, + "learning_rate": 1.908910696165693e-05, + "loss": 0.4911, + "step": 1425 + }, + { + "epoch": 0.16, + "learning_rate": 1.908756278377788e-05, + "loss": 0.4701, + "step": 1426 + }, + { + "epoch": 0.16, + "learning_rate": 1.9086017360699843e-05, + "loss": 0.4793, + "step": 1427 + }, + { + "epoch": 0.16, + "learning_rate": 1.9084470692634567e-05, + "loss": 0.4962, + "step": 1428 + }, + { + "epoch": 0.16, + "learning_rate": 1.9082922779793988e-05, + "loss": 0.4917, + "step": 1429 + }, + { + "epoch": 0.16, + "learning_rate": 1.9081373622390204e-05, + "loss": 0.5028, + "step": 1430 + }, + { + "epoch": 0.16, + "learning_rate": 1.9079823220635477e-05, + "loss": 0.471, + "step": 1431 + }, + { + "epoch": 0.16, + "learning_rate": 1.907827157474225e-05, + "loss": 0.4818, + "step": 1432 + }, + { + "epoch": 0.16, + "learning_rate": 1.9076718684923136e-05, + "loss": 0.4884, + "step": 1433 + }, + { + "epoch": 0.16, + "learning_rate": 1.9075164551390918e-05, + "loss": 0.4989, + "step": 1434 + }, + { + "epoch": 0.16, + "learning_rate": 1.9073609174358535e-05, + "loss": 0.4663, + "step": 1435 + }, + { + "epoch": 0.16, + "learning_rate": 1.9072052554039123e-05, + "loss": 0.5005, + "step": 1436 + }, + { + "epoch": 0.16, + "learning_rate": 1.9070494690645966e-05, + "loss": 0.4952, + "step": 1437 + }, + { + "epoch": 0.16, + "learning_rate": 1.9068935584392522e-05, + "loss": 0.4959, + "step": 1438 + }, + { + "epoch": 0.16, + "learning_rate": 1.906737523549243e-05, + "loss": 0.4891, + "step": 1439 + }, + { + "epoch": 0.16, + "learning_rate": 1.9065813644159495e-05, + "loss": 0.4844, + "step": 1440 + }, + { + "epoch": 0.16, + "learning_rate": 1.906425081060768e-05, + "loss": 0.4942, + "step": 1441 + }, + { + "epoch": 0.16, + "learning_rate": 1.906268673505114e-05, + "loss": 0.4822, + "step": 1442 + }, + { + "epoch": 0.16, + "learning_rate": 1.906112141770418e-05, + "loss": 0.4636, + "step": 1443 + }, + { + 
"epoch": 0.17, + "learning_rate": 1.9059554858781285e-05, + "loss": 0.488, + "step": 1444 + }, + { + "epoch": 0.17, + "learning_rate": 1.9057987058497106e-05, + "loss": 0.5003, + "step": 1445 + }, + { + "epoch": 0.17, + "learning_rate": 1.9056418017066476e-05, + "loss": 0.503, + "step": 1446 + }, + { + "epoch": 0.17, + "learning_rate": 1.905484773470438e-05, + "loss": 0.4837, + "step": 1447 + }, + { + "epoch": 0.17, + "learning_rate": 1.905327621162598e-05, + "loss": 0.4655, + "step": 1448 + }, + { + "epoch": 0.17, + "learning_rate": 1.905170344804662e-05, + "loss": 0.4768, + "step": 1449 + }, + { + "epoch": 0.17, + "learning_rate": 1.90501294441818e-05, + "loss": 0.4872, + "step": 1450 + }, + { + "epoch": 0.17, + "learning_rate": 1.9048554200247184e-05, + "loss": 0.4736, + "step": 1451 + }, + { + "epoch": 0.17, + "learning_rate": 1.9046977716458627e-05, + "loss": 0.4808, + "step": 1452 + }, + { + "epoch": 0.17, + "learning_rate": 1.904539999303214e-05, + "loss": 0.4925, + "step": 1453 + }, + { + "epoch": 0.17, + "learning_rate": 1.90438210301839e-05, + "loss": 0.4836, + "step": 1454 + }, + { + "epoch": 0.17, + "learning_rate": 1.9042240828130267e-05, + "loss": 0.4678, + "step": 1455 + }, + { + "epoch": 0.17, + "learning_rate": 1.9040659387087762e-05, + "loss": 0.4835, + "step": 1456 + }, + { + "epoch": 0.17, + "learning_rate": 1.903907670727308e-05, + "loss": 0.4949, + "step": 1457 + }, + { + "epoch": 0.17, + "learning_rate": 1.903749278890308e-05, + "loss": 0.4879, + "step": 1458 + }, + { + "epoch": 0.17, + "learning_rate": 1.903590763219479e-05, + "loss": 0.5004, + "step": 1459 + }, + { + "epoch": 0.17, + "learning_rate": 1.9034321237365424e-05, + "loss": 0.4882, + "step": 1460 + }, + { + "epoch": 0.17, + "learning_rate": 1.9032733604632347e-05, + "loss": 0.4818, + "step": 1461 + }, + { + "epoch": 0.17, + "learning_rate": 1.9031144734213097e-05, + "loss": 0.5016, + "step": 1462 + }, + { + "epoch": 0.17, + "learning_rate": 1.9029554626325386e-05, + "loss": 
0.5071, + "step": 1463 + }, + { + "epoch": 0.17, + "learning_rate": 1.90279632811871e-05, + "loss": 0.4721, + "step": 1464 + }, + { + "epoch": 0.17, + "learning_rate": 1.902637069901628e-05, + "loss": 0.4908, + "step": 1465 + }, + { + "epoch": 0.17, + "learning_rate": 1.9024776880031154e-05, + "loss": 0.4762, + "step": 1466 + }, + { + "epoch": 0.17, + "learning_rate": 1.9023181824450106e-05, + "loss": 0.4817, + "step": 1467 + }, + { + "epoch": 0.17, + "learning_rate": 1.9021585532491694e-05, + "loss": 0.4806, + "step": 1468 + }, + { + "epoch": 0.17, + "learning_rate": 1.9019988004374645e-05, + "loss": 0.4877, + "step": 1469 + }, + { + "epoch": 0.17, + "learning_rate": 1.901838924031786e-05, + "loss": 0.4813, + "step": 1470 + }, + { + "epoch": 0.17, + "learning_rate": 1.90167892405404e-05, + "loss": 0.4945, + "step": 1471 + }, + { + "epoch": 0.17, + "learning_rate": 1.9015188005261505e-05, + "loss": 0.4863, + "step": 1472 + }, + { + "epoch": 0.17, + "learning_rate": 1.9013585534700582e-05, + "loss": 0.4841, + "step": 1473 + }, + { + "epoch": 0.17, + "learning_rate": 1.90119818290772e-05, + "loss": 0.4733, + "step": 1474 + }, + { + "epoch": 0.17, + "learning_rate": 1.9010376888611106e-05, + "loss": 0.4872, + "step": 1475 + }, + { + "epoch": 0.17, + "learning_rate": 1.9008770713522206e-05, + "loss": 0.4819, + "step": 1476 + }, + { + "epoch": 0.17, + "learning_rate": 1.9007163304030593e-05, + "loss": 0.4832, + "step": 1477 + }, + { + "epoch": 0.17, + "learning_rate": 1.9005554660356505e-05, + "loss": 0.478, + "step": 1478 + }, + { + "epoch": 0.17, + "learning_rate": 1.9003944782720375e-05, + "loss": 0.4873, + "step": 1479 + }, + { + "epoch": 0.17, + "learning_rate": 1.9002333671342782e-05, + "loss": 0.4852, + "step": 1480 + }, + { + "epoch": 0.17, + "learning_rate": 1.9000721326444492e-05, + "loss": 0.4965, + "step": 1481 + }, + { + "epoch": 0.17, + "learning_rate": 1.8999107748246427e-05, + "loss": 0.4757, + "step": 1482 + }, + { + "epoch": 0.17, + "learning_rate": 
1.8997492936969686e-05, + "loss": 0.4775, + "step": 1483 + }, + { + "epoch": 0.17, + "learning_rate": 1.899587689283553e-05, + "loss": 0.4777, + "step": 1484 + }, + { + "epoch": 0.17, + "learning_rate": 1.89942596160654e-05, + "loss": 0.4887, + "step": 1485 + }, + { + "epoch": 0.17, + "learning_rate": 1.899264110688089e-05, + "loss": 0.4752, + "step": 1486 + }, + { + "epoch": 0.17, + "learning_rate": 1.8991021365503782e-05, + "loss": 0.4975, + "step": 1487 + }, + { + "epoch": 0.17, + "learning_rate": 1.8989400392156012e-05, + "loss": 0.4667, + "step": 1488 + }, + { + "epoch": 0.17, + "learning_rate": 1.898777818705969e-05, + "loss": 0.5035, + "step": 1489 + }, + { + "epoch": 0.17, + "learning_rate": 1.898615475043709e-05, + "loss": 0.4994, + "step": 1490 + }, + { + "epoch": 0.17, + "learning_rate": 1.8984530082510665e-05, + "loss": 0.4835, + "step": 1491 + }, + { + "epoch": 0.17, + "learning_rate": 1.898290418350303e-05, + "loss": 0.4679, + "step": 1492 + }, + { + "epoch": 0.17, + "learning_rate": 1.8981277053636963e-05, + "loss": 0.4847, + "step": 1493 + }, + { + "epoch": 0.17, + "learning_rate": 1.8979648693135428e-05, + "loss": 0.4753, + "step": 1494 + }, + { + "epoch": 0.17, + "learning_rate": 1.8978019102221538e-05, + "loss": 0.5043, + "step": 1495 + }, + { + "epoch": 0.17, + "learning_rate": 1.8976388281118584e-05, + "loss": 0.4708, + "step": 1496 + }, + { + "epoch": 0.17, + "learning_rate": 1.8974756230050028e-05, + "loss": 0.4852, + "step": 1497 + }, + { + "epoch": 0.17, + "learning_rate": 1.8973122949239497e-05, + "loss": 0.4888, + "step": 1498 + }, + { + "epoch": 0.17, + "learning_rate": 1.897148843891079e-05, + "loss": 0.4918, + "step": 1499 + }, + { + "epoch": 0.17, + "learning_rate": 1.8969852699287865e-05, + "loss": 0.4865, + "step": 1500 + }, + { + "epoch": 0.17, + "learning_rate": 1.896821573059486e-05, + "loss": 0.4742, + "step": 1501 + }, + { + "epoch": 0.17, + "learning_rate": 1.896657753305607e-05, + "loss": 0.4891, + "step": 1502 + }, + { + 
"epoch": 0.17, + "learning_rate": 1.896493810689597e-05, + "loss": 0.5085, + "step": 1503 + }, + { + "epoch": 0.17, + "learning_rate": 1.89632974523392e-05, + "loss": 0.5098, + "step": 1504 + }, + { + "epoch": 0.17, + "learning_rate": 1.8961655569610557e-05, + "loss": 0.4616, + "step": 1505 + }, + { + "epoch": 0.17, + "learning_rate": 1.8960012458935025e-05, + "loss": 0.4735, + "step": 1506 + }, + { + "epoch": 0.17, + "learning_rate": 1.8958368120537746e-05, + "loss": 0.5024, + "step": 1507 + }, + { + "epoch": 0.17, + "learning_rate": 1.8956722554644026e-05, + "loss": 0.4795, + "step": 1508 + }, + { + "epoch": 0.17, + "learning_rate": 1.8955075761479342e-05, + "loss": 0.4802, + "step": 1509 + }, + { + "epoch": 0.17, + "learning_rate": 1.895342774126935e-05, + "loss": 0.4744, + "step": 1510 + }, + { + "epoch": 0.17, + "learning_rate": 1.8951778494239862e-05, + "loss": 0.4734, + "step": 1511 + }, + { + "epoch": 0.17, + "learning_rate": 1.8950128020616863e-05, + "loss": 0.4954, + "step": 1512 + }, + { + "epoch": 0.17, + "learning_rate": 1.89484763206265e-05, + "loss": 0.4863, + "step": 1513 + }, + { + "epoch": 0.17, + "learning_rate": 1.89468233944951e-05, + "loss": 0.4804, + "step": 1514 + }, + { + "epoch": 0.17, + "learning_rate": 1.8945169242449145e-05, + "loss": 0.5027, + "step": 1515 + }, + { + "epoch": 0.17, + "learning_rate": 1.894351386471529e-05, + "loss": 0.4762, + "step": 1516 + }, + { + "epoch": 0.17, + "learning_rate": 1.8941857261520363e-05, + "loss": 0.4789, + "step": 1517 + }, + { + "epoch": 0.17, + "learning_rate": 1.8940199433091354e-05, + "loss": 0.4867, + "step": 1518 + }, + { + "epoch": 0.17, + "learning_rate": 1.893854037965542e-05, + "loss": 0.4938, + "step": 1519 + }, + { + "epoch": 0.17, + "learning_rate": 1.8936880101439893e-05, + "loss": 0.4817, + "step": 1520 + }, + { + "epoch": 0.17, + "learning_rate": 1.8935218598672266e-05, + "loss": 0.5014, + "step": 1521 + }, + { + "epoch": 0.17, + "learning_rate": 1.8933555871580204e-05, + "loss": 
0.4909, + "step": 1522 + }, + { + "epoch": 0.17, + "learning_rate": 1.8931891920391533e-05, + "loss": 0.5019, + "step": 1523 + }, + { + "epoch": 0.17, + "learning_rate": 1.893022674533425e-05, + "loss": 0.482, + "step": 1524 + }, + { + "epoch": 0.17, + "learning_rate": 1.8928560346636532e-05, + "loss": 0.507, + "step": 1525 + }, + { + "epoch": 0.17, + "learning_rate": 1.89268927245267e-05, + "loss": 0.4968, + "step": 1526 + }, + { + "epoch": 0.17, + "learning_rate": 1.8925223879233267e-05, + "loss": 0.4785, + "step": 1527 + }, + { + "epoch": 0.17, + "learning_rate": 1.8923553810984893e-05, + "loss": 0.4749, + "step": 1528 + }, + { + "epoch": 0.17, + "learning_rate": 1.8921882520010416e-05, + "loss": 0.4744, + "step": 1529 + }, + { + "epoch": 0.17, + "learning_rate": 1.8920210006538843e-05, + "loss": 0.5001, + "step": 1530 + }, + { + "epoch": 0.17, + "learning_rate": 1.891853627079935e-05, + "loss": 0.4816, + "step": 1531 + }, + { + "epoch": 0.18, + "learning_rate": 1.8916861313021268e-05, + "loss": 0.4662, + "step": 1532 + }, + { + "epoch": 0.18, + "learning_rate": 1.8915185133434107e-05, + "loss": 0.4976, + "step": 1533 + }, + { + "epoch": 0.18, + "learning_rate": 1.891350773226754e-05, + "loss": 0.48, + "step": 1534 + }, + { + "epoch": 0.18, + "learning_rate": 1.891182910975141e-05, + "loss": 0.4784, + "step": 1535 + }, + { + "epoch": 0.18, + "learning_rate": 1.8910149266115724e-05, + "loss": 0.4776, + "step": 1536 + }, + { + "epoch": 0.18, + "learning_rate": 1.890846820159066e-05, + "loss": 0.4834, + "step": 1537 + }, + { + "epoch": 0.18, + "learning_rate": 1.890678591640656e-05, + "loss": 0.4873, + "step": 1538 + }, + { + "epoch": 0.18, + "learning_rate": 1.8905102410793936e-05, + "loss": 0.5074, + "step": 1539 + }, + { + "epoch": 0.18, + "learning_rate": 1.8903417684983465e-05, + "loss": 0.478, + "step": 1540 + }, + { + "epoch": 0.18, + "learning_rate": 1.8901731739205992e-05, + "loss": 0.483, + "step": 1541 + }, + { + "epoch": 0.18, + "learning_rate": 
1.8900044573692527e-05, + "loss": 0.4688, + "step": 1542 + }, + { + "epoch": 0.18, + "learning_rate": 1.8898356188674253e-05, + "loss": 0.487, + "step": 1543 + }, + { + "epoch": 0.18, + "learning_rate": 1.8896666584382516e-05, + "loss": 0.489, + "step": 1544 + }, + { + "epoch": 0.18, + "learning_rate": 1.8894975761048826e-05, + "loss": 0.4841, + "step": 1545 + }, + { + "epoch": 0.18, + "learning_rate": 1.8893283718904866e-05, + "loss": 0.4876, + "step": 1546 + }, + { + "epoch": 0.18, + "learning_rate": 1.8891590458182486e-05, + "loss": 0.4965, + "step": 1547 + }, + { + "epoch": 0.18, + "learning_rate": 1.8889895979113698e-05, + "loss": 0.4915, + "step": 1548 + }, + { + "epoch": 0.18, + "learning_rate": 1.888820028193068e-05, + "loss": 0.4695, + "step": 1549 + }, + { + "epoch": 0.18, + "learning_rate": 1.8886503366865786e-05, + "loss": 0.4902, + "step": 1550 + }, + { + "epoch": 0.18, + "learning_rate": 1.888480523415153e-05, + "loss": 0.4967, + "step": 1551 + }, + { + "epoch": 0.18, + "learning_rate": 1.8883105884020595e-05, + "loss": 0.4639, + "step": 1552 + }, + { + "epoch": 0.18, + "learning_rate": 1.8881405316705824e-05, + "loss": 0.4954, + "step": 1553 + }, + { + "epoch": 0.18, + "learning_rate": 1.887970353244024e-05, + "loss": 0.4777, + "step": 1554 + }, + { + "epoch": 0.18, + "learning_rate": 1.887800053145702e-05, + "loss": 0.4929, + "step": 1555 + }, + { + "epoch": 0.18, + "learning_rate": 1.8876296313989516e-05, + "loss": 0.5071, + "step": 1556 + }, + { + "epoch": 0.18, + "learning_rate": 1.8874590880271245e-05, + "loss": 0.4704, + "step": 1557 + }, + { + "epoch": 0.18, + "learning_rate": 1.8872884230535886e-05, + "loss": 0.468, + "step": 1558 + }, + { + "epoch": 0.18, + "learning_rate": 1.8871176365017293e-05, + "loss": 0.4986, + "step": 1559 + }, + { + "epoch": 0.18, + "learning_rate": 1.8869467283949475e-05, + "loss": 0.4718, + "step": 1560 + }, + { + "epoch": 0.18, + "learning_rate": 1.8867756987566615e-05, + "loss": 0.4777, + "step": 1561 + }, + { + 
"epoch": 0.18, + "learning_rate": 1.8866045476103073e-05, + "loss": 0.492, + "step": 1562 + }, + { + "epoch": 0.18, + "learning_rate": 1.886433274979335e-05, + "loss": 0.5308, + "step": 1563 + }, + { + "epoch": 0.18, + "learning_rate": 1.8862618808872138e-05, + "loss": 0.4872, + "step": 1564 + }, + { + "epoch": 0.18, + "learning_rate": 1.8860903653574277e-05, + "loss": 0.4714, + "step": 1565 + }, + { + "epoch": 0.18, + "learning_rate": 1.8859187284134785e-05, + "loss": 0.4829, + "step": 1566 + }, + { + "epoch": 0.18, + "learning_rate": 1.8857469700788845e-05, + "loss": 0.4899, + "step": 1567 + }, + { + "epoch": 0.18, + "learning_rate": 1.8855750903771805e-05, + "loss": 0.4854, + "step": 1568 + }, + { + "epoch": 0.18, + "learning_rate": 1.8854030893319173e-05, + "loss": 0.4738, + "step": 1569 + }, + { + "epoch": 0.18, + "learning_rate": 1.8852309669666634e-05, + "loss": 0.4797, + "step": 1570 + }, + { + "epoch": 0.18, + "learning_rate": 1.885058723305003e-05, + "loss": 0.4985, + "step": 1571 + }, + { + "epoch": 0.18, + "learning_rate": 1.8848863583705373e-05, + "loss": 0.4893, + "step": 1572 + }, + { + "epoch": 0.18, + "learning_rate": 1.884713872186885e-05, + "loss": 0.5035, + "step": 1573 + }, + { + "epoch": 0.18, + "learning_rate": 1.8845412647776795e-05, + "loss": 0.4932, + "step": 1574 + }, + { + "epoch": 0.18, + "learning_rate": 1.8843685361665724e-05, + "loss": 0.4753, + "step": 1575 + }, + { + "epoch": 0.18, + "learning_rate": 1.8841956863772314e-05, + "loss": 0.4796, + "step": 1576 + }, + { + "epoch": 0.18, + "learning_rate": 1.8840227154333405e-05, + "loss": 0.4888, + "step": 1577 + }, + { + "epoch": 0.18, + "learning_rate": 1.883849623358601e-05, + "loss": 0.4732, + "step": 1578 + }, + { + "epoch": 0.18, + "learning_rate": 1.88367641017673e-05, + "loss": 0.4919, + "step": 1579 + }, + { + "epoch": 0.18, + "learning_rate": 1.8835030759114617e-05, + "loss": 0.4883, + "step": 1580 + }, + { + "epoch": 0.18, + "learning_rate": 1.8833296205865466e-05, + "loss": 
0.4923, + "step": 1581 + }, + { + "epoch": 0.18, + "learning_rate": 1.8831560442257523e-05, + "loss": 0.477, + "step": 1582 + }, + { + "epoch": 0.18, + "learning_rate": 1.8829823468528624e-05, + "loss": 0.4749, + "step": 1583 + }, + { + "epoch": 0.18, + "learning_rate": 1.8828085284916777e-05, + "loss": 0.4813, + "step": 1584 + }, + { + "epoch": 0.18, + "learning_rate": 1.882634589166014e-05, + "loss": 0.5162, + "step": 1585 + }, + { + "epoch": 0.18, + "learning_rate": 1.8824605288997064e-05, + "loss": 0.4578, + "step": 1586 + }, + { + "epoch": 0.18, + "learning_rate": 1.882286347716604e-05, + "loss": 0.5004, + "step": 1587 + }, + { + "epoch": 0.18, + "learning_rate": 1.8821120456405743e-05, + "loss": 0.4933, + "step": 1588 + }, + { + "epoch": 0.18, + "learning_rate": 1.8819376226955e-05, + "loss": 0.5036, + "step": 1589 + }, + { + "epoch": 0.18, + "learning_rate": 1.8817630789052813e-05, + "loss": 0.4774, + "step": 1590 + }, + { + "epoch": 0.18, + "learning_rate": 1.881588414293834e-05, + "loss": 0.4817, + "step": 1591 + }, + { + "epoch": 0.18, + "learning_rate": 1.881413628885092e-05, + "loss": 0.4606, + "step": 1592 + }, + { + "epoch": 0.18, + "learning_rate": 1.8812387227030035e-05, + "loss": 0.4866, + "step": 1593 + }, + { + "epoch": 0.18, + "learning_rate": 1.8810636957715357e-05, + "loss": 0.4622, + "step": 1594 + }, + { + "epoch": 0.18, + "learning_rate": 1.880888548114671e-05, + "loss": 0.4884, + "step": 1595 + }, + { + "epoch": 0.18, + "learning_rate": 1.880713279756408e-05, + "loss": 0.479, + "step": 1596 + }, + { + "epoch": 0.18, + "learning_rate": 1.880537890720763e-05, + "loss": 0.4996, + "step": 1597 + }, + { + "epoch": 0.18, + "learning_rate": 1.8803623810317678e-05, + "loss": 0.4693, + "step": 1598 + }, + { + "epoch": 0.18, + "learning_rate": 1.8801867507134712e-05, + "loss": 0.4771, + "step": 1599 + }, + { + "epoch": 0.18, + "learning_rate": 1.8800109997899386e-05, + "loss": 0.4914, + "step": 1600 + }, + { + "epoch": 0.18, + "learning_rate": 
1.879835128285252e-05, + "loss": 0.4782, + "step": 1601 + }, + { + "epoch": 0.18, + "learning_rate": 1.879659136223509e-05, + "loss": 0.4953, + "step": 1602 + }, + { + "epoch": 0.18, + "learning_rate": 1.8794830236288254e-05, + "loss": 0.4904, + "step": 1603 + }, + { + "epoch": 0.18, + "learning_rate": 1.8793067905253318e-05, + "loss": 0.4721, + "step": 1604 + }, + { + "epoch": 0.18, + "learning_rate": 1.8791304369371765e-05, + "loss": 0.5035, + "step": 1605 + }, + { + "epoch": 0.18, + "learning_rate": 1.8789539628885233e-05, + "loss": 0.4752, + "step": 1606 + }, + { + "epoch": 0.18, + "learning_rate": 1.878777368403554e-05, + "loss": 0.4852, + "step": 1607 + }, + { + "epoch": 0.18, + "learning_rate": 1.8786006535064654e-05, + "loss": 0.496, + "step": 1608 + }, + { + "epoch": 0.18, + "learning_rate": 1.8784238182214713e-05, + "loss": 0.4785, + "step": 1609 + }, + { + "epoch": 0.18, + "learning_rate": 1.8782468625728027e-05, + "loss": 0.4813, + "step": 1610 + }, + { + "epoch": 0.18, + "learning_rate": 1.8780697865847056e-05, + "loss": 0.5088, + "step": 1611 + }, + { + "epoch": 0.18, + "learning_rate": 1.877892590281444e-05, + "loss": 0.4805, + "step": 1612 + }, + { + "epoch": 0.18, + "learning_rate": 1.877715273687297e-05, + "loss": 0.5001, + "step": 1613 + }, + { + "epoch": 0.18, + "learning_rate": 1.8775378368265622e-05, + "loss": 0.4999, + "step": 1614 + }, + { + "epoch": 0.18, + "learning_rate": 1.8773602797235516e-05, + "loss": 0.4834, + "step": 1615 + }, + { + "epoch": 0.18, + "learning_rate": 1.8771826024025944e-05, + "loss": 0.4797, + "step": 1616 + }, + { + "epoch": 0.18, + "learning_rate": 1.8770048048880367e-05, + "loss": 0.488, + "step": 1617 + }, + { + "epoch": 0.18, + "learning_rate": 1.8768268872042402e-05, + "loss": 0.4786, + "step": 1618 + }, + { + "epoch": 0.19, + "learning_rate": 1.8766488493755845e-05, + "loss": 0.4776, + "step": 1619 + }, + { + "epoch": 0.19, + "learning_rate": 1.8764706914264636e-05, + "loss": 0.4899, + "step": 1620 + }, + { + 
"epoch": 0.19, + "learning_rate": 1.8762924133812905e-05, + "loss": 0.4933, + "step": 1621 + }, + { + "epoch": 0.19, + "learning_rate": 1.876114015264492e-05, + "loss": 0.4851, + "step": 1622 + }, + { + "epoch": 0.19, + "learning_rate": 1.8759354971005133e-05, + "loss": 0.4766, + "step": 1623 + }, + { + "epoch": 0.19, + "learning_rate": 1.875756858913815e-05, + "loss": 0.5006, + "step": 1624 + }, + { + "epoch": 0.19, + "learning_rate": 1.875578100728875e-05, + "loss": 0.4958, + "step": 1625 + }, + { + "epoch": 0.19, + "learning_rate": 1.8753992225701868e-05, + "loss": 0.482, + "step": 1626 + }, + { + "epoch": 0.19, + "learning_rate": 1.875220224462261e-05, + "loss": 0.4894, + "step": 1627 + }, + { + "epoch": 0.19, + "learning_rate": 1.8750411064296237e-05, + "loss": 0.474, + "step": 1628 + }, + { + "epoch": 0.19, + "learning_rate": 1.8748618684968187e-05, + "loss": 0.507, + "step": 1629 + }, + { + "epoch": 0.19, + "learning_rate": 1.8746825106884055e-05, + "loss": 0.4657, + "step": 1630 + }, + { + "epoch": 0.19, + "learning_rate": 1.87450303302896e-05, + "loss": 0.484, + "step": 1631 + }, + { + "epoch": 0.19, + "learning_rate": 1.8743234355430746e-05, + "loss": 0.479, + "step": 1632 + }, + { + "epoch": 0.19, + "learning_rate": 1.8741437182553582e-05, + "loss": 0.4876, + "step": 1633 + }, + { + "epoch": 0.19, + "learning_rate": 1.8739638811904363e-05, + "loss": 0.5082, + "step": 1634 + }, + { + "epoch": 0.19, + "learning_rate": 1.8737839243729504e-05, + "loss": 0.4617, + "step": 1635 + }, + { + "epoch": 0.19, + "learning_rate": 1.8736038478275584e-05, + "loss": 0.4841, + "step": 1636 + }, + { + "epoch": 0.19, + "learning_rate": 1.873423651578935e-05, + "loss": 0.4825, + "step": 1637 + }, + { + "epoch": 0.19, + "learning_rate": 1.8732433356517713e-05, + "loss": 0.4856, + "step": 1638 + }, + { + "epoch": 0.19, + "learning_rate": 1.8730629000707746e-05, + "loss": 0.4894, + "step": 1639 + }, + { + "epoch": 0.19, + "learning_rate": 1.872882344860668e-05, + "loss": 
0.4878, + "step": 1640 + }, + { + "epoch": 0.19, + "learning_rate": 1.872701670046192e-05, + "loss": 0.5066, + "step": 1641 + }, + { + "epoch": 0.19, + "learning_rate": 1.8725208756521036e-05, + "loss": 0.4937, + "step": 1642 + }, + { + "epoch": 0.19, + "learning_rate": 1.8723399617031754e-05, + "loss": 0.4747, + "step": 1643 + }, + { + "epoch": 0.19, + "learning_rate": 1.8721589282241956e-05, + "loss": 0.4769, + "step": 1644 + }, + { + "epoch": 0.19, + "learning_rate": 1.8719777752399713e-05, + "loss": 0.4826, + "step": 1645 + }, + { + "epoch": 0.19, + "learning_rate": 1.8717965027753235e-05, + "loss": 0.4781, + "step": 1646 + }, + { + "epoch": 0.19, + "learning_rate": 1.8716151108550912e-05, + "loss": 0.4992, + "step": 1647 + }, + { + "epoch": 0.19, + "learning_rate": 1.871433599504129e-05, + "loss": 0.4906, + "step": 1648 + }, + { + "epoch": 0.19, + "learning_rate": 1.8712519687473075e-05, + "loss": 0.4832, + "step": 1649 + }, + { + "epoch": 0.19, + "learning_rate": 1.8710702186095147e-05, + "loss": 0.4871, + "step": 1650 + }, + { + "epoch": 0.19, + "learning_rate": 1.8708883491156544e-05, + "loss": 0.4787, + "step": 1651 + }, + { + "epoch": 0.19, + "learning_rate": 1.8707063602906466e-05, + "loss": 0.4772, + "step": 1652 + }, + { + "epoch": 0.19, + "learning_rate": 1.8705242521594276e-05, + "loss": 0.4791, + "step": 1653 + }, + { + "epoch": 0.19, + "learning_rate": 1.870342024746951e-05, + "loss": 0.4847, + "step": 1654 + }, + { + "epoch": 0.19, + "learning_rate": 1.8701596780781855e-05, + "loss": 0.4919, + "step": 1655 + }, + { + "epoch": 0.19, + "learning_rate": 1.869977212178117e-05, + "loss": 0.4897, + "step": 1656 + }, + { + "epoch": 0.19, + "learning_rate": 1.8697946270717468e-05, + "loss": 0.4652, + "step": 1657 + }, + { + "epoch": 0.19, + "learning_rate": 1.8696119227840937e-05, + "loss": 0.493, + "step": 1658 + }, + { + "epoch": 0.19, + "learning_rate": 1.869429099340192e-05, + "loss": 0.46, + "step": 1659 + }, + { + "epoch": 0.19, + "learning_rate": 
1.8692461567650925e-05, + "loss": 0.5069, + "step": 1660 + }, + { + "epoch": 0.19, + "learning_rate": 1.869063095083863e-05, + "loss": 0.4855, + "step": 1661 + }, + { + "epoch": 0.19, + "learning_rate": 1.8688799143215863e-05, + "loss": 0.4678, + "step": 1662 + }, + { + "epoch": 0.19, + "learning_rate": 1.8686966145033626e-05, + "loss": 0.4733, + "step": 1663 + }, + { + "epoch": 0.19, + "learning_rate": 1.8685131956543082e-05, + "loss": 0.4652, + "step": 1664 + }, + { + "epoch": 0.19, + "learning_rate": 1.8683296577995554e-05, + "loss": 0.4923, + "step": 1665 + }, + { + "epoch": 0.19, + "learning_rate": 1.8681460009642533e-05, + "loss": 0.4963, + "step": 1666 + }, + { + "epoch": 0.19, + "learning_rate": 1.867962225173566e-05, + "loss": 0.4617, + "step": 1667 + }, + { + "epoch": 0.19, + "learning_rate": 1.867778330452676e-05, + "loss": 0.4967, + "step": 1668 + }, + { + "epoch": 0.19, + "learning_rate": 1.8675943168267804e-05, + "loss": 0.4924, + "step": 1669 + }, + { + "epoch": 0.19, + "learning_rate": 1.8674101843210935e-05, + "loss": 0.4953, + "step": 1670 + }, + { + "epoch": 0.19, + "learning_rate": 1.8672259329608457e-05, + "loss": 0.4848, + "step": 1671 + }, + { + "epoch": 0.19, + "learning_rate": 1.8670415627712825e-05, + "loss": 0.4905, + "step": 1672 + }, + { + "epoch": 0.19, + "learning_rate": 1.866857073777668e-05, + "loss": 0.4961, + "step": 1673 + }, + { + "epoch": 0.19, + "learning_rate": 1.8666724660052807e-05, + "loss": 0.4816, + "step": 1674 + }, + { + "epoch": 0.19, + "learning_rate": 1.8664877394794158e-05, + "loss": 0.4707, + "step": 1675 + }, + { + "epoch": 0.19, + "learning_rate": 1.8663028942253854e-05, + "loss": 0.4883, + "step": 1676 + }, + { + "epoch": 0.19, + "learning_rate": 1.8661179302685177e-05, + "loss": 0.4905, + "step": 1677 + }, + { + "epoch": 0.19, + "learning_rate": 1.8659328476341557e-05, + "loss": 0.4828, + "step": 1678 + }, + { + "epoch": 0.19, + "learning_rate": 1.865747646347661e-05, + "loss": 0.4787, + "step": 1679 + }, + { 
+ "epoch": 0.19, + "learning_rate": 1.8655623264344103e-05, + "loss": 0.478, + "step": 1680 + }, + { + "epoch": 0.19, + "learning_rate": 1.8653768879197956e-05, + "loss": 0.5098, + "step": 1681 + }, + { + "epoch": 0.19, + "learning_rate": 1.865191330829227e-05, + "loss": 0.4905, + "step": 1682 + }, + { + "epoch": 0.19, + "learning_rate": 1.8650056551881297e-05, + "loss": 0.4881, + "step": 1683 + }, + { + "epoch": 0.19, + "learning_rate": 1.8648198610219452e-05, + "loss": 0.4741, + "step": 1684 + }, + { + "epoch": 0.19, + "learning_rate": 1.864633948356132e-05, + "loss": 0.4758, + "step": 1685 + }, + { + "epoch": 0.19, + "learning_rate": 1.8644479172161635e-05, + "loss": 0.4836, + "step": 1686 + }, + { + "epoch": 0.19, + "learning_rate": 1.8642617676275306e-05, + "loss": 0.4902, + "step": 1687 + }, + { + "epoch": 0.19, + "learning_rate": 1.8640754996157397e-05, + "loss": 0.4673, + "step": 1688 + }, + { + "epoch": 0.19, + "learning_rate": 1.863889113206314e-05, + "loss": 0.5032, + "step": 1689 + }, + { + "epoch": 0.19, + "learning_rate": 1.863702608424793e-05, + "loss": 0.4906, + "step": 1690 + }, + { + "epoch": 0.19, + "learning_rate": 1.863515985296731e-05, + "loss": 0.4808, + "step": 1691 + }, + { + "epoch": 0.19, + "learning_rate": 1.8633292438476998e-05, + "loss": 0.4903, + "step": 1692 + }, + { + "epoch": 0.19, + "learning_rate": 1.8631423841032876e-05, + "loss": 0.4751, + "step": 1693 + }, + { + "epoch": 0.19, + "learning_rate": 1.8629554060890982e-05, + "loss": 0.4874, + "step": 1694 + }, + { + "epoch": 0.19, + "learning_rate": 1.8627683098307516e-05, + "loss": 0.4805, + "step": 1695 + }, + { + "epoch": 0.19, + "learning_rate": 1.862581095353884e-05, + "loss": 0.4862, + "step": 1696 + }, + { + "epoch": 0.19, + "learning_rate": 1.8623937626841485e-05, + "loss": 0.4728, + "step": 1697 + }, + { + "epoch": 0.19, + "learning_rate": 1.8622063118472135e-05, + "loss": 0.4992, + "step": 1698 + }, + { + "epoch": 0.19, + "learning_rate": 1.8620187428687643e-05, + 
"loss": 0.4798, + "step": 1699 + }, + { + "epoch": 0.19, + "learning_rate": 1.861831055774501e-05, + "loss": 0.4886, + "step": 1700 + }, + { + "epoch": 0.19, + "learning_rate": 1.8616432505901427e-05, + "loss": 0.4727, + "step": 1701 + }, + { + "epoch": 0.19, + "learning_rate": 1.861455327341421e-05, + "loss": 0.4862, + "step": 1702 + }, + { + "epoch": 0.19, + "learning_rate": 1.8612672860540865e-05, + "loss": 0.4721, + "step": 1703 + }, + { + "epoch": 0.19, + "learning_rate": 1.8610791267539053e-05, + "loss": 0.4631, + "step": 1704 + }, + { + "epoch": 0.19, + "learning_rate": 1.8608908494666593e-05, + "loss": 0.4935, + "step": 1705 + }, + { + "epoch": 0.19, + "learning_rate": 1.8607024542181465e-05, + "loss": 0.4937, + "step": 1706 + }, + { + "epoch": 0.2, + "learning_rate": 1.860513941034181e-05, + "loss": 0.474, + "step": 1707 + }, + { + "epoch": 0.2, + "learning_rate": 1.8603253099405937e-05, + "loss": 0.4716, + "step": 1708 + }, + { + "epoch": 0.2, + "learning_rate": 1.8601365609632315e-05, + "loss": 0.5024, + "step": 1709 + }, + { + "epoch": 0.2, + "learning_rate": 1.859947694127956e-05, + "loss": 0.4774, + "step": 1710 + }, + { + "epoch": 0.2, + "learning_rate": 1.859758709460648e-05, + "loss": 0.4771, + "step": 1711 + }, + { + "epoch": 0.2, + "learning_rate": 1.8595696069872013e-05, + "loss": 0.4811, + "step": 1712 + }, + { + "epoch": 0.2, + "learning_rate": 1.8593803867335276e-05, + "loss": 0.476, + "step": 1713 + }, + { + "epoch": 0.2, + "learning_rate": 1.859191048725554e-05, + "loss": 0.4997, + "step": 1714 + }, + { + "epoch": 0.2, + "learning_rate": 1.8590015929892245e-05, + "loss": 0.4814, + "step": 1715 + }, + { + "epoch": 0.2, + "learning_rate": 1.858812019550499e-05, + "loss": 0.4644, + "step": 1716 + }, + { + "epoch": 0.2, + "learning_rate": 1.8586223284353522e-05, + "loss": 0.4897, + "step": 1717 + }, + { + "epoch": 0.2, + "learning_rate": 1.8584325196697767e-05, + "loss": 0.4907, + "step": 1718 + }, + { + "epoch": 0.2, + "learning_rate": 
1.8582425932797807e-05, + "loss": 0.475, + "step": 1719 + }, + { + "epoch": 0.2, + "learning_rate": 1.8580525492913884e-05, + "loss": 0.4799, + "step": 1720 + }, + { + "epoch": 0.2, + "learning_rate": 1.8578623877306394e-05, + "loss": 0.4759, + "step": 1721 + }, + { + "epoch": 0.2, + "learning_rate": 1.8576721086235908e-05, + "loss": 0.509, + "step": 1722 + }, + { + "epoch": 0.2, + "learning_rate": 1.8574817119963145e-05, + "loss": 0.4782, + "step": 1723 + }, + { + "epoch": 0.2, + "learning_rate": 1.8572911978748993e-05, + "loss": 0.4763, + "step": 1724 + }, + { + "epoch": 0.2, + "learning_rate": 1.8571005662854502e-05, + "loss": 0.5017, + "step": 1725 + }, + { + "epoch": 0.2, + "learning_rate": 1.8569098172540875e-05, + "loss": 0.4736, + "step": 1726 + }, + { + "epoch": 0.2, + "learning_rate": 1.856718950806949e-05, + "loss": 0.4701, + "step": 1727 + }, + { + "epoch": 0.2, + "learning_rate": 1.8565279669701862e-05, + "loss": 0.4726, + "step": 1728 + }, + { + "epoch": 0.2, + "learning_rate": 1.8563368657699693e-05, + "loss": 0.4578, + "step": 1729 + }, + { + "epoch": 0.2, + "learning_rate": 1.856145647232483e-05, + "loss": 0.4809, + "step": 1730 + }, + { + "epoch": 0.2, + "learning_rate": 1.8559543113839285e-05, + "loss": 0.4705, + "step": 1731 + }, + { + "epoch": 0.2, + "learning_rate": 1.8557628582505235e-05, + "loss": 0.5124, + "step": 1732 + }, + { + "epoch": 0.2, + "learning_rate": 1.8555712878585005e-05, + "loss": 0.473, + "step": 1733 + }, + { + "epoch": 0.2, + "learning_rate": 1.8553796002341098e-05, + "loss": 0.4715, + "step": 1734 + }, + { + "epoch": 0.2, + "learning_rate": 1.8551877954036165e-05, + "loss": 0.4905, + "step": 1735 + }, + { + "epoch": 0.2, + "learning_rate": 1.854995873393302e-05, + "loss": 0.4924, + "step": 1736 + }, + { + "epoch": 0.2, + "learning_rate": 1.854803834229464e-05, + "loss": 0.4684, + "step": 1737 + }, + { + "epoch": 0.2, + "learning_rate": 1.8546116779384165e-05, + "loss": 0.4869, + "step": 1738 + }, + { + "epoch": 0.2, + 
"learning_rate": 1.8544194045464888e-05, + "loss": 0.47, + "step": 1739 + }, + { + "epoch": 0.2, + "learning_rate": 1.8542270140800266e-05, + "loss": 0.4872, + "step": 1740 + }, + { + "epoch": 0.2, + "learning_rate": 1.854034506565392e-05, + "loss": 0.4664, + "step": 1741 + }, + { + "epoch": 0.2, + "learning_rate": 1.8538418820289628e-05, + "loss": 0.4862, + "step": 1742 + }, + { + "epoch": 0.2, + "learning_rate": 1.8536491404971327e-05, + "loss": 0.466, + "step": 1743 + }, + { + "epoch": 0.2, + "learning_rate": 1.8534562819963112e-05, + "loss": 0.4869, + "step": 1744 + }, + { + "epoch": 0.2, + "learning_rate": 1.853263306552925e-05, + "loss": 0.477, + "step": 1745 + }, + { + "epoch": 0.2, + "learning_rate": 1.8530702141934157e-05, + "loss": 0.4889, + "step": 1746 + }, + { + "epoch": 0.2, + "learning_rate": 1.8528770049442413e-05, + "loss": 0.4812, + "step": 1747 + }, + { + "epoch": 0.2, + "learning_rate": 1.852683678831876e-05, + "loss": 0.5022, + "step": 1748 + }, + { + "epoch": 0.2, + "learning_rate": 1.852490235882809e-05, + "loss": 0.477, + "step": 1749 + }, + { + "epoch": 0.2, + "learning_rate": 1.852296676123547e-05, + "loss": 0.4985, + "step": 1750 + }, + { + "epoch": 0.2, + "learning_rate": 1.8521029995806123e-05, + "loss": 0.489, + "step": 1751 + }, + { + "epoch": 0.2, + "learning_rate": 1.851909206280542e-05, + "loss": 0.4941, + "step": 1752 + }, + { + "epoch": 0.2, + "learning_rate": 1.8517152962498908e-05, + "loss": 0.4833, + "step": 1753 + }, + { + "epoch": 0.2, + "learning_rate": 1.8515212695152284e-05, + "loss": 0.4874, + "step": 1754 + }, + { + "epoch": 0.2, + "learning_rate": 1.8513271261031406e-05, + "loss": 0.4818, + "step": 1755 + }, + { + "epoch": 0.2, + "learning_rate": 1.8511328660402302e-05, + "loss": 0.4875, + "step": 1756 + }, + { + "epoch": 0.2, + "learning_rate": 1.850938489353114e-05, + "loss": 0.4796, + "step": 1757 + }, + { + "epoch": 0.2, + "learning_rate": 1.850743996068427e-05, + "loss": 0.4783, + "step": 1758 + }, + { + "epoch": 
0.2, + "learning_rate": 1.8505493862128187e-05, + "loss": 0.4777, + "step": 1759 + }, + { + "epoch": 0.2, + "learning_rate": 1.8503546598129547e-05, + "loss": 0.478, + "step": 1760 + }, + { + "epoch": 0.2, + "learning_rate": 1.8501598168955172e-05, + "loss": 0.4774, + "step": 1761 + }, + { + "epoch": 0.2, + "learning_rate": 1.8499648574872042e-05, + "loss": 0.4938, + "step": 1762 + }, + { + "epoch": 0.2, + "learning_rate": 1.849769781614729e-05, + "loss": 0.4838, + "step": 1763 + }, + { + "epoch": 0.2, + "learning_rate": 1.849574589304822e-05, + "loss": 0.4955, + "step": 1764 + }, + { + "epoch": 0.2, + "learning_rate": 1.8493792805842278e-05, + "loss": 0.4886, + "step": 1765 + }, + { + "epoch": 0.2, + "learning_rate": 1.8491838554797096e-05, + "loss": 0.4945, + "step": 1766 + }, + { + "epoch": 0.2, + "learning_rate": 1.8489883140180437e-05, + "loss": 0.4612, + "step": 1767 + }, + { + "epoch": 0.2, + "learning_rate": 1.848792656226024e-05, + "loss": 0.492, + "step": 1768 + }, + { + "epoch": 0.2, + "learning_rate": 1.8485968821304604e-05, + "loss": 0.4717, + "step": 1769 + }, + { + "epoch": 0.2, + "learning_rate": 1.848400991758178e-05, + "loss": 0.4814, + "step": 1770 + }, + { + "epoch": 0.2, + "learning_rate": 1.8482049851360182e-05, + "loss": 0.48, + "step": 1771 + }, + { + "epoch": 0.2, + "learning_rate": 1.8480088622908382e-05, + "loss": 0.4792, + "step": 1772 + }, + { + "epoch": 0.2, + "learning_rate": 1.8478126232495114e-05, + "loss": 0.4839, + "step": 1773 + }, + { + "epoch": 0.2, + "learning_rate": 1.8476162680389268e-05, + "loss": 0.4825, + "step": 1774 + }, + { + "epoch": 0.2, + "learning_rate": 1.847419796685989e-05, + "loss": 0.4821, + "step": 1775 + }, + { + "epoch": 0.2, + "learning_rate": 1.84722320921762e-05, + "loss": 0.5115, + "step": 1776 + }, + { + "epoch": 0.2, + "learning_rate": 1.8470265056607557e-05, + "loss": 0.46, + "step": 1777 + }, + { + "epoch": 0.2, + "learning_rate": 1.8468296860423494e-05, + "loss": 0.4905, + "step": 1778 + }, + { + 
"epoch": 0.2, + "learning_rate": 1.8466327503893697e-05, + "loss": 0.4867, + "step": 1779 + }, + { + "epoch": 0.2, + "learning_rate": 1.8464356987288012e-05, + "loss": 0.4755, + "step": 1780 + }, + { + "epoch": 0.2, + "learning_rate": 1.8462385310876444e-05, + "loss": 0.4933, + "step": 1781 + }, + { + "epoch": 0.2, + "learning_rate": 1.8460412474929154e-05, + "loss": 0.4886, + "step": 1782 + }, + { + "epoch": 0.2, + "learning_rate": 1.8458438479716466e-05, + "loss": 0.4768, + "step": 1783 + }, + { + "epoch": 0.2, + "learning_rate": 1.845646332550886e-05, + "loss": 0.4779, + "step": 1784 + }, + { + "epoch": 0.2, + "learning_rate": 1.845448701257698e-05, + "loss": 0.4866, + "step": 1785 + }, + { + "epoch": 0.2, + "learning_rate": 1.8452509541191625e-05, + "loss": 0.4603, + "step": 1786 + }, + { + "epoch": 0.2, + "learning_rate": 1.8450530911623747e-05, + "loss": 0.5003, + "step": 1787 + }, + { + "epoch": 0.2, + "learning_rate": 1.8448551124144467e-05, + "loss": 0.4798, + "step": 1788 + }, + { + "epoch": 0.2, + "learning_rate": 1.844657017902506e-05, + "loss": 0.4767, + "step": 1789 + }, + { + "epoch": 0.2, + "learning_rate": 1.844458807653696e-05, + "loss": 0.4962, + "step": 1790 + }, + { + "epoch": 0.2, + "learning_rate": 1.8442604816951757e-05, + "loss": 0.4689, + "step": 1791 + }, + { + "epoch": 0.2, + "learning_rate": 1.8440620400541202e-05, + "loss": 0.5121, + "step": 1792 + }, + { + "epoch": 0.2, + "learning_rate": 1.843863482757721e-05, + "loss": 0.4729, + "step": 1793 + }, + { + "epoch": 0.21, + "learning_rate": 1.8436648098331838e-05, + "loss": 0.4883, + "step": 1794 + }, + { + "epoch": 0.21, + "learning_rate": 1.843466021307732e-05, + "loss": 0.4687, + "step": 1795 + }, + { + "epoch": 0.21, + "learning_rate": 1.8432671172086044e-05, + "loss": 0.4604, + "step": 1796 + }, + { + "epoch": 0.21, + "learning_rate": 1.8430680975630545e-05, + "loss": 0.4798, + "step": 1797 + }, + { + "epoch": 0.21, + "learning_rate": 1.8428689623983526e-05, + "loss": 0.468, + 
"step": 1798 + }, + { + "epoch": 0.21, + "learning_rate": 1.8426697117417848e-05, + "loss": 0.4735, + "step": 1799 + }, + { + "epoch": 0.21, + "learning_rate": 1.8424703456206533e-05, + "loss": 0.4889, + "step": 1800 + }, + { + "epoch": 0.21, + "learning_rate": 1.842270864062275e-05, + "loss": 0.4724, + "step": 1801 + }, + { + "epoch": 0.21, + "learning_rate": 1.8420712670939837e-05, + "loss": 0.4837, + "step": 1802 + }, + { + "epoch": 0.21, + "learning_rate": 1.8418715547431283e-05, + "loss": 0.4789, + "step": 1803 + }, + { + "epoch": 0.21, + "learning_rate": 1.8416717270370744e-05, + "loss": 0.4703, + "step": 1804 + }, + { + "epoch": 0.21, + "learning_rate": 1.841471784003203e-05, + "loss": 0.4746, + "step": 1805 + }, + { + "epoch": 0.21, + "learning_rate": 1.84127172566891e-05, + "loss": 0.4867, + "step": 1806 + }, + { + "epoch": 0.21, + "learning_rate": 1.841071552061608e-05, + "loss": 0.4663, + "step": 1807 + }, + { + "epoch": 0.21, + "learning_rate": 1.8408712632087256e-05, + "loss": 0.5056, + "step": 1808 + }, + { + "epoch": 0.21, + "learning_rate": 1.840670859137707e-05, + "loss": 0.4712, + "step": 1809 + }, + { + "epoch": 0.21, + "learning_rate": 1.840470339876011e-05, + "loss": 0.4915, + "step": 1810 + }, + { + "epoch": 0.21, + "learning_rate": 1.8402697054511145e-05, + "loss": 0.4752, + "step": 1811 + }, + { + "epoch": 0.21, + "learning_rate": 1.8400689558905083e-05, + "loss": 0.4876, + "step": 1812 + }, + { + "epoch": 0.21, + "learning_rate": 1.8398680912216997e-05, + "loss": 0.4676, + "step": 1813 + }, + { + "epoch": 0.21, + "learning_rate": 1.8396671114722112e-05, + "loss": 0.497, + "step": 1814 + }, + { + "epoch": 0.21, + "learning_rate": 1.8394660166695822e-05, + "loss": 0.4645, + "step": 1815 + }, + { + "epoch": 0.21, + "learning_rate": 1.8392648068413667e-05, + "loss": 0.5071, + "step": 1816 + }, + { + "epoch": 0.21, + "learning_rate": 1.8390634820151353e-05, + "loss": 0.4908, + "step": 1817 + }, + { + "epoch": 0.21, + "learning_rate": 
1.8388620422184738e-05, + "loss": 0.4663, + "step": 1818 + }, + { + "epoch": 0.21, + "learning_rate": 1.8386604874789836e-05, + "loss": 0.4705, + "step": 1819 + }, + { + "epoch": 0.21, + "learning_rate": 1.8384588178242828e-05, + "loss": 0.4809, + "step": 1820 + }, + { + "epoch": 0.21, + "learning_rate": 1.8382570332820045e-05, + "loss": 0.5041, + "step": 1821 + }, + { + "epoch": 0.21, + "learning_rate": 1.8380551338797974e-05, + "loss": 0.472, + "step": 1822 + }, + { + "epoch": 0.21, + "learning_rate": 1.8378531196453265e-05, + "loss": 0.4672, + "step": 1823 + }, + { + "epoch": 0.21, + "learning_rate": 1.837650990606272e-05, + "loss": 0.498, + "step": 1824 + }, + { + "epoch": 0.21, + "learning_rate": 1.8374487467903303e-05, + "loss": 0.4822, + "step": 1825 + }, + { + "epoch": 0.21, + "learning_rate": 1.8372463882252133e-05, + "loss": 0.5021, + "step": 1826 + }, + { + "epoch": 0.21, + "learning_rate": 1.8370439149386484e-05, + "loss": 0.4562, + "step": 1827 + }, + { + "epoch": 0.21, + "learning_rate": 1.8368413269583795e-05, + "loss": 0.4936, + "step": 1828 + }, + { + "epoch": 0.21, + "learning_rate": 1.8366386243121654e-05, + "loss": 0.4606, + "step": 1829 + }, + { + "epoch": 0.21, + "learning_rate": 1.8364358070277807e-05, + "loss": 0.4959, + "step": 1830 + }, + { + "epoch": 0.21, + "learning_rate": 1.836232875133016e-05, + "loss": 0.4866, + "step": 1831 + }, + { + "epoch": 0.21, + "learning_rate": 1.8360298286556774e-05, + "loss": 0.4869, + "step": 1832 + }, + { + "epoch": 0.21, + "learning_rate": 1.8358266676235872e-05, + "loss": 0.4695, + "step": 1833 + }, + { + "epoch": 0.21, + "learning_rate": 1.8356233920645822e-05, + "loss": 0.5119, + "step": 1834 + }, + { + "epoch": 0.21, + "learning_rate": 1.8354200020065168e-05, + "loss": 0.4823, + "step": 1835 + }, + { + "epoch": 0.21, + "learning_rate": 1.8352164974772592e-05, + "loss": 0.4872, + "step": 1836 + }, + { + "epoch": 0.21, + "learning_rate": 1.8350128785046943e-05, + "loss": 0.4604, + "step": 1837 + }, + { 
+ "epoch": 0.21, + "learning_rate": 1.8348091451167224e-05, + "loss": 0.4571, + "step": 1838 + }, + { + "epoch": 0.21, + "learning_rate": 1.8346052973412593e-05, + "loss": 0.5112, + "step": 1839 + }, + { + "epoch": 0.21, + "learning_rate": 1.834401335206237e-05, + "loss": 0.4873, + "step": 1840 + }, + { + "epoch": 0.21, + "learning_rate": 1.8341972587396032e-05, + "loss": 0.4778, + "step": 1841 + }, + { + "epoch": 0.21, + "learning_rate": 1.8339930679693202e-05, + "loss": 0.475, + "step": 1842 + }, + { + "epoch": 0.21, + "learning_rate": 1.8337887629233672e-05, + "loss": 0.4649, + "step": 1843 + }, + { + "epoch": 0.21, + "learning_rate": 1.833584343629738e-05, + "loss": 0.5096, + "step": 1844 + }, + { + "epoch": 0.21, + "learning_rate": 1.8333798101164433e-05, + "loss": 0.4945, + "step": 1845 + }, + { + "epoch": 0.21, + "learning_rate": 1.833175162411508e-05, + "loss": 0.4645, + "step": 1846 + }, + { + "epoch": 0.21, + "learning_rate": 1.8329704005429745e-05, + "loss": 0.4822, + "step": 1847 + }, + { + "epoch": 0.21, + "learning_rate": 1.8327655245388986e-05, + "loss": 0.4826, + "step": 1848 + }, + { + "epoch": 0.21, + "learning_rate": 1.8325605344273536e-05, + "loss": 0.4994, + "step": 1849 + }, + { + "epoch": 0.21, + "learning_rate": 1.8323554302364273e-05, + "loss": 0.4873, + "step": 1850 + }, + { + "epoch": 0.21, + "learning_rate": 1.832150211994224e-05, + "loss": 0.4714, + "step": 1851 + }, + { + "epoch": 0.21, + "learning_rate": 1.8319448797288628e-05, + "loss": 0.4985, + "step": 1852 + }, + { + "epoch": 0.21, + "learning_rate": 1.831739433468479e-05, + "loss": 0.4863, + "step": 1853 + }, + { + "epoch": 0.21, + "learning_rate": 1.831533873241223e-05, + "loss": 0.4833, + "step": 1854 + }, + { + "epoch": 0.21, + "learning_rate": 1.831328199075262e-05, + "loss": 0.4667, + "step": 1855 + }, + { + "epoch": 0.21, + "learning_rate": 1.8311224109987768e-05, + "loss": 0.476, + "step": 1856 + }, + { + "epoch": 0.21, + "learning_rate": 1.8309165090399657e-05, + "loss": 
0.516, + "step": 1857 + }, + { + "epoch": 0.21, + "learning_rate": 1.8307104932270415e-05, + "loss": 0.4699, + "step": 1858 + }, + { + "epoch": 0.21, + "learning_rate": 1.8305043635882334e-05, + "loss": 0.4787, + "step": 1859 + }, + { + "epoch": 0.21, + "learning_rate": 1.830298120151785e-05, + "loss": 0.4944, + "step": 1860 + }, + { + "epoch": 0.21, + "learning_rate": 1.8300917629459575e-05, + "loss": 0.494, + "step": 1861 + }, + { + "epoch": 0.21, + "learning_rate": 1.8298852919990254e-05, + "loss": 0.4809, + "step": 1862 + }, + { + "epoch": 0.21, + "learning_rate": 1.82967870733928e-05, + "loss": 0.4735, + "step": 1863 + }, + { + "epoch": 0.21, + "learning_rate": 1.8294720089950282e-05, + "loss": 0.4649, + "step": 1864 + }, + { + "epoch": 0.21, + "learning_rate": 1.8292651969945923e-05, + "loss": 0.491, + "step": 1865 + }, + { + "epoch": 0.21, + "learning_rate": 1.82905827136631e-05, + "loss": 0.4947, + "step": 1866 + }, + { + "epoch": 0.21, + "learning_rate": 1.828851232138535e-05, + "loss": 0.4686, + "step": 1867 + }, + { + "epoch": 0.21, + "learning_rate": 1.828644079339636e-05, + "loss": 0.4651, + "step": 1868 + }, + { + "epoch": 0.21, + "learning_rate": 1.828436812997998e-05, + "loss": 0.4662, + "step": 1869 + }, + { + "epoch": 0.21, + "learning_rate": 1.8282294331420204e-05, + "loss": 0.4872, + "step": 1870 + }, + { + "epoch": 0.21, + "learning_rate": 1.8280219398001192e-05, + "loss": 0.4907, + "step": 1871 + }, + { + "epoch": 0.21, + "learning_rate": 1.827814333000726e-05, + "loss": 0.4671, + "step": 1872 + }, + { + "epoch": 0.21, + "learning_rate": 1.827606612772287e-05, + "loss": 0.474, + "step": 1873 + }, + { + "epoch": 0.21, + "learning_rate": 1.827398779143265e-05, + "loss": 0.4831, + "step": 1874 + }, + { + "epoch": 0.21, + "learning_rate": 1.8271908321421376e-05, + "loss": 0.4973, + "step": 1875 + }, + { + "epoch": 0.21, + "learning_rate": 1.8269827717973982e-05, + "loss": 0.4786, + "step": 1876 + }, + { + "epoch": 0.21, + "learning_rate": 
1.8267745981375555e-05, + "loss": 0.4745, + "step": 1877 + }, + { + "epoch": 0.21, + "learning_rate": 1.8265663111911344e-05, + "loss": 0.5025, + "step": 1878 + }, + { + "epoch": 0.21, + "learning_rate": 1.8263579109866745e-05, + "loss": 0.4768, + "step": 1879 + }, + { + "epoch": 0.21, + "learning_rate": 1.8261493975527312e-05, + "loss": 0.4807, + "step": 1880 + }, + { + "epoch": 0.21, + "learning_rate": 1.8259407709178758e-05, + "loss": 0.4922, + "step": 1881 + }, + { + "epoch": 0.22, + "learning_rate": 1.8257320311106948e-05, + "loss": 0.4806, + "step": 1882 + }, + { + "epoch": 0.22, + "learning_rate": 1.82552317815979e-05, + "loss": 0.491, + "step": 1883 + }, + { + "epoch": 0.22, + "learning_rate": 1.825314212093779e-05, + "loss": 0.4727, + "step": 1884 + }, + { + "epoch": 0.22, + "learning_rate": 1.825105132941295e-05, + "loss": 0.4835, + "step": 1885 + }, + { + "epoch": 0.22, + "learning_rate": 1.8248959407309862e-05, + "loss": 0.4773, + "step": 1886 + }, + { + "epoch": 0.22, + "learning_rate": 1.824686635491517e-05, + "loss": 0.4654, + "step": 1887 + }, + { + "epoch": 0.22, + "learning_rate": 1.824477217251566e-05, + "loss": 0.4921, + "step": 1888 + }, + { + "epoch": 0.22, + "learning_rate": 1.8242676860398295e-05, + "loss": 0.5063, + "step": 1889 + }, + { + "epoch": 0.22, + "learning_rate": 1.824058041885017e-05, + "loss": 0.4779, + "step": 1890 + }, + { + "epoch": 0.22, + "learning_rate": 1.8238482848158548e-05, + "loss": 0.4864, + "step": 1891 + }, + { + "epoch": 0.22, + "learning_rate": 1.8236384148610843e-05, + "loss": 0.4714, + "step": 1892 + }, + { + "epoch": 0.22, + "learning_rate": 1.823428432049462e-05, + "loss": 0.4911, + "step": 1893 + }, + { + "epoch": 0.22, + "learning_rate": 1.8232183364097605e-05, + "loss": 0.4711, + "step": 1894 + }, + { + "epoch": 0.22, + "learning_rate": 1.8230081279707675e-05, + "loss": 0.4749, + "step": 1895 + }, + { + "epoch": 0.22, + "learning_rate": 1.822797806761287e-05, + "loss": 0.4695, + "step": 1896 + }, + { + 
"epoch": 0.22, + "learning_rate": 1.8225873728101367e-05, + "loss": 0.4946, + "step": 1897 + }, + { + "epoch": 0.22, + "learning_rate": 1.822376826146151e-05, + "loss": 0.4869, + "step": 1898 + }, + { + "epoch": 0.22, + "learning_rate": 1.8221661667981795e-05, + "loss": 0.4795, + "step": 1899 + }, + { + "epoch": 0.22, + "learning_rate": 1.8219553947950874e-05, + "loss": 0.4721, + "step": 1900 + }, + { + "epoch": 0.22, + "learning_rate": 1.8217445101657553e-05, + "loss": 0.4663, + "step": 1901 + }, + { + "epoch": 0.22, + "learning_rate": 1.8215335129390785e-05, + "loss": 0.4833, + "step": 1902 + }, + { + "epoch": 0.22, + "learning_rate": 1.821322403143969e-05, + "loss": 0.481, + "step": 1903 + }, + { + "epoch": 0.22, + "learning_rate": 1.8211111808093534e-05, + "loss": 0.473, + "step": 1904 + }, + { + "epoch": 0.22, + "learning_rate": 1.8208998459641737e-05, + "loss": 0.4881, + "step": 1905 + }, + { + "epoch": 0.22, + "learning_rate": 1.8206883986373872e-05, + "loss": 0.4802, + "step": 1906 + }, + { + "epoch": 0.22, + "learning_rate": 1.820476838857968e-05, + "loss": 0.4852, + "step": 1907 + }, + { + "epoch": 0.22, + "learning_rate": 1.820265166654903e-05, + "loss": 0.4831, + "step": 1908 + }, + { + "epoch": 0.22, + "learning_rate": 1.8200533820571973e-05, + "loss": 0.497, + "step": 1909 + }, + { + "epoch": 0.22, + "learning_rate": 1.8198414850938694e-05, + "loss": 0.4615, + "step": 1910 + }, + { + "epoch": 0.22, + "learning_rate": 1.8196294757939543e-05, + "loss": 0.4956, + "step": 1911 + }, + { + "epoch": 0.22, + "learning_rate": 1.8194173541865014e-05, + "loss": 0.4716, + "step": 1912 + }, + { + "epoch": 0.22, + "learning_rate": 1.8192051203005768e-05, + "loss": 0.4638, + "step": 1913 + }, + { + "epoch": 0.22, + "learning_rate": 1.818992774165261e-05, + "loss": 0.4879, + "step": 1914 + }, + { + "epoch": 0.22, + "learning_rate": 1.81878031580965e-05, + "loss": 0.4875, + "step": 1915 + }, + { + "epoch": 0.22, + "learning_rate": 1.8185677452628557e-05, + "loss": 
0.4882, + "step": 1916 + }, + { + "epoch": 0.22, + "learning_rate": 1.818355062554005e-05, + "loss": 0.4997, + "step": 1917 + }, + { + "epoch": 0.22, + "learning_rate": 1.81814226771224e-05, + "loss": 0.4726, + "step": 1918 + }, + { + "epoch": 0.22, + "learning_rate": 1.8179293607667177e-05, + "loss": 0.4946, + "step": 1919 + }, + { + "epoch": 0.22, + "learning_rate": 1.8177163417466122e-05, + "loss": 0.4876, + "step": 1920 + }, + { + "epoch": 0.22, + "learning_rate": 1.8175032106811114e-05, + "loss": 0.4709, + "step": 1921 + }, + { + "epoch": 0.22, + "learning_rate": 1.817289967599419e-05, + "loss": 0.4999, + "step": 1922 + }, + { + "epoch": 0.22, + "learning_rate": 1.8170766125307543e-05, + "loss": 0.4864, + "step": 1923 + }, + { + "epoch": 0.22, + "learning_rate": 1.816863145504351e-05, + "loss": 0.4812, + "step": 1924 + }, + { + "epoch": 0.22, + "learning_rate": 1.81664956654946e-05, + "loss": 0.4748, + "step": 1925 + }, + { + "epoch": 0.22, + "learning_rate": 1.816435875695345e-05, + "loss": 0.4671, + "step": 1926 + }, + { + "epoch": 0.22, + "learning_rate": 1.8162220729712875e-05, + "loss": 0.4808, + "step": 1927 + }, + { + "epoch": 0.22, + "learning_rate": 1.8160081584065833e-05, + "loss": 0.4736, + "step": 1928 + }, + { + "epoch": 0.22, + "learning_rate": 1.8157941320305424e-05, + "loss": 0.4868, + "step": 1929 + }, + { + "epoch": 0.22, + "learning_rate": 1.815579993872492e-05, + "loss": 0.4725, + "step": 1930 + }, + { + "epoch": 0.22, + "learning_rate": 1.8153657439617738e-05, + "loss": 0.491, + "step": 1931 + }, + { + "epoch": 0.22, + "learning_rate": 1.8151513823277447e-05, + "loss": 0.4761, + "step": 1932 + }, + { + "epoch": 0.22, + "learning_rate": 1.8149369089997767e-05, + "loss": 0.5125, + "step": 1933 + }, + { + "epoch": 0.22, + "learning_rate": 1.814722324007258e-05, + "loss": 0.4676, + "step": 1934 + }, + { + "epoch": 0.22, + "learning_rate": 1.8145076273795914e-05, + "loss": 0.4984, + "step": 1935 + }, + { + "epoch": 0.22, + "learning_rate": 
1.814292819146195e-05, + "loss": 0.47, + "step": 1936 + }, + { + "epoch": 0.22, + "learning_rate": 1.814077899336502e-05, + "loss": 0.4997, + "step": 1937 + }, + { + "epoch": 0.22, + "learning_rate": 1.813862867979962e-05, + "loss": 0.4623, + "step": 1938 + }, + { + "epoch": 0.22, + "learning_rate": 1.8136477251060385e-05, + "loss": 0.463, + "step": 1939 + }, + { + "epoch": 0.22, + "learning_rate": 1.813432470744211e-05, + "loss": 0.4888, + "step": 1940 + }, + { + "epoch": 0.22, + "learning_rate": 1.813217104923974e-05, + "loss": 0.4848, + "step": 1941 + }, + { + "epoch": 0.22, + "learning_rate": 1.813001627674838e-05, + "loss": 0.5043, + "step": 1942 + }, + { + "epoch": 0.22, + "learning_rate": 1.8127860390263275e-05, + "loss": 0.4762, + "step": 1943 + }, + { + "epoch": 0.22, + "learning_rate": 1.812570339007983e-05, + "loss": 0.4681, + "step": 1944 + }, + { + "epoch": 0.22, + "learning_rate": 1.8123545276493607e-05, + "loss": 0.4824, + "step": 1945 + }, + { + "epoch": 0.22, + "learning_rate": 1.8121386049800317e-05, + "loss": 0.4819, + "step": 1946 + }, + { + "epoch": 0.22, + "learning_rate": 1.8119225710295815e-05, + "loss": 0.4734, + "step": 1947 + }, + { + "epoch": 0.22, + "learning_rate": 1.811706425827612e-05, + "loss": 0.4641, + "step": 1948 + }, + { + "epoch": 0.22, + "learning_rate": 1.8114901694037402e-05, + "loss": 0.4786, + "step": 1949 + }, + { + "epoch": 0.22, + "learning_rate": 1.8112738017875974e-05, + "loss": 0.484, + "step": 1950 + }, + { + "epoch": 0.22, + "learning_rate": 1.811057323008831e-05, + "loss": 0.5028, + "step": 1951 + }, + { + "epoch": 0.22, + "learning_rate": 1.810840733097104e-05, + "loss": 0.481, + "step": 1952 + }, + { + "epoch": 0.22, + "learning_rate": 1.8106240320820928e-05, + "loss": 0.4853, + "step": 1953 + }, + { + "epoch": 0.22, + "learning_rate": 1.8104072199934916e-05, + "loss": 0.468, + "step": 1954 + }, + { + "epoch": 0.22, + "learning_rate": 1.8101902968610082e-05, + "loss": 0.4989, + "step": 1955 + }, + { + "epoch": 
0.22, + "learning_rate": 1.8099732627143655e-05, + "loss": 0.4905, + "step": 1956 + }, + { + "epoch": 0.22, + "learning_rate": 1.809756117583302e-05, + "loss": 0.464, + "step": 1957 + }, + { + "epoch": 0.22, + "learning_rate": 1.809538861497572e-05, + "loss": 0.4789, + "step": 1958 + }, + { + "epoch": 0.22, + "learning_rate": 1.8093214944869437e-05, + "loss": 0.4867, + "step": 1959 + }, + { + "epoch": 0.22, + "learning_rate": 1.8091040165812018e-05, + "loss": 0.4939, + "step": 1960 + }, + { + "epoch": 0.22, + "learning_rate": 1.8088864278101452e-05, + "loss": 0.4874, + "step": 1961 + }, + { + "epoch": 0.22, + "learning_rate": 1.808668728203589e-05, + "loss": 0.4741, + "step": 1962 + }, + { + "epoch": 0.22, + "learning_rate": 1.8084509177913623e-05, + "loss": 0.4968, + "step": 1963 + }, + { + "epoch": 0.22, + "learning_rate": 1.8082329966033105e-05, + "loss": 0.4904, + "step": 1964 + }, + { + "epoch": 0.22, + "learning_rate": 1.8080149646692932e-05, + "loss": 0.4782, + "step": 1965 + }, + { + "epoch": 0.22, + "learning_rate": 1.807796822019186e-05, + "loss": 0.4839, + "step": 1966 + }, + { + "epoch": 0.22, + "learning_rate": 1.807578568682879e-05, + "loss": 0.4976, + "step": 1967 + }, + { + "epoch": 0.22, + "learning_rate": 1.8073602046902784e-05, + "loss": 0.4823, + "step": 1968 + }, + { + "epoch": 0.23, + "learning_rate": 1.8071417300713038e-05, + "loss": 0.4928, + "step": 1969 + }, + { + "epoch": 0.23, + "learning_rate": 1.8069231448558923e-05, + "loss": 0.4787, + "step": 1970 + }, + { + "epoch": 0.23, + "learning_rate": 1.806704449073994e-05, + "loss": 0.4659, + "step": 1971 + }, + { + "epoch": 0.23, + "learning_rate": 1.806485642755576e-05, + "loss": 0.477, + "step": 1972 + }, + { + "epoch": 0.23, + "learning_rate": 1.8062667259306193e-05, + "loss": 0.4825, + "step": 1973 + }, + { + "epoch": 0.23, + "learning_rate": 1.80604769862912e-05, + "loss": 0.4756, + "step": 1974 + }, + { + "epoch": 0.23, + "learning_rate": 1.8058285608810903e-05, + "loss": 0.4778, + 
"step": 1975 + }, + { + "epoch": 0.23, + "learning_rate": 1.8056093127165564e-05, + "loss": 0.5072, + "step": 1976 + }, + { + "epoch": 0.23, + "learning_rate": 1.8053899541655605e-05, + "loss": 0.4728, + "step": 1977 + }, + { + "epoch": 0.23, + "learning_rate": 1.8051704852581595e-05, + "loss": 0.4835, + "step": 1978 + }, + { + "epoch": 0.23, + "learning_rate": 1.804950906024426e-05, + "loss": 0.4829, + "step": 1979 + }, + { + "epoch": 0.23, + "learning_rate": 1.804731216494447e-05, + "loss": 0.4627, + "step": 1980 + }, + { + "epoch": 0.23, + "learning_rate": 1.804511416698324e-05, + "loss": 0.4922, + "step": 1981 + }, + { + "epoch": 0.23, + "learning_rate": 1.804291506666176e-05, + "loss": 0.4687, + "step": 1982 + }, + { + "epoch": 0.23, + "learning_rate": 1.8040714864281347e-05, + "loss": 0.5074, + "step": 1983 + }, + { + "epoch": 0.23, + "learning_rate": 1.8038513560143477e-05, + "loss": 0.479, + "step": 1984 + }, + { + "epoch": 0.23, + "learning_rate": 1.8036311154549783e-05, + "loss": 0.4841, + "step": 1985 + }, + { + "epoch": 0.23, + "learning_rate": 1.803410764780204e-05, + "loss": 0.4791, + "step": 1986 + }, + { + "epoch": 0.23, + "learning_rate": 1.803190304020218e-05, + "loss": 0.5005, + "step": 1987 + }, + { + "epoch": 0.23, + "learning_rate": 1.8029697332052277e-05, + "loss": 0.4771, + "step": 1988 + }, + { + "epoch": 0.23, + "learning_rate": 1.8027490523654568e-05, + "loss": 0.4959, + "step": 1989 + }, + { + "epoch": 0.23, + "learning_rate": 1.8025282615311437e-05, + "loss": 0.4576, + "step": 1990 + }, + { + "epoch": 0.23, + "learning_rate": 1.802307360732541e-05, + "loss": 0.4692, + "step": 1991 + }, + { + "epoch": 0.23, + "learning_rate": 1.8020863499999182e-05, + "loss": 0.4766, + "step": 1992 + }, + { + "epoch": 0.23, + "learning_rate": 1.801865229363557e-05, + "loss": 0.4822, + "step": 1993 + }, + { + "epoch": 0.23, + "learning_rate": 1.8016439988537576e-05, + "loss": 0.4648, + "step": 1994 + }, + { + "epoch": 0.23, + "learning_rate": 
1.8014226585008322e-05, + "loss": 0.505, + "step": 1995 + }, + { + "epoch": 0.23, + "learning_rate": 1.80120120833511e-05, + "loss": 0.4659, + "step": 1996 + }, + { + "epoch": 0.23, + "learning_rate": 1.8009796483869347e-05, + "loss": 0.485, + "step": 1997 + }, + { + "epoch": 0.23, + "learning_rate": 1.8007579786866648e-05, + "loss": 0.4654, + "step": 1998 + }, + { + "epoch": 0.23, + "learning_rate": 1.8005361992646736e-05, + "loss": 0.4911, + "step": 1999 + }, + { + "epoch": 0.23, + "learning_rate": 1.8003143101513502e-05, + "loss": 0.5044, + "step": 2000 + }, + { + "epoch": 0.23, + "learning_rate": 1.8000923113770987e-05, + "loss": 0.4838, + "step": 2001 + }, + { + "epoch": 0.23, + "learning_rate": 1.7998702029723372e-05, + "loss": 0.454, + "step": 2002 + }, + { + "epoch": 0.23, + "learning_rate": 1.7996479849675e-05, + "loss": 0.4945, + "step": 2003 + }, + { + "epoch": 0.23, + "learning_rate": 1.799425657393036e-05, + "loss": 0.4773, + "step": 2004 + }, + { + "epoch": 0.23, + "learning_rate": 1.7992032202794084e-05, + "loss": 0.4931, + "step": 2005 + }, + { + "epoch": 0.23, + "learning_rate": 1.798980673657097e-05, + "loss": 0.4624, + "step": 2006 + }, + { + "epoch": 0.23, + "learning_rate": 1.7987580175565948e-05, + "loss": 0.481, + "step": 2007 + }, + { + "epoch": 0.23, + "learning_rate": 1.798535252008411e-05, + "loss": 0.4772, + "step": 2008 + }, + { + "epoch": 0.23, + "learning_rate": 1.7983123770430696e-05, + "loss": 0.4951, + "step": 2009 + }, + { + "epoch": 0.23, + "learning_rate": 1.7980893926911092e-05, + "loss": 0.4809, + "step": 2010 + }, + { + "epoch": 0.23, + "learning_rate": 1.7978662989830834e-05, + "loss": 0.4861, + "step": 2011 + }, + { + "epoch": 0.23, + "learning_rate": 1.7976430959495617e-05, + "loss": 0.4623, + "step": 2012 + }, + { + "epoch": 0.23, + "learning_rate": 1.7974197836211275e-05, + "loss": 0.4759, + "step": 2013 + }, + { + "epoch": 0.23, + "learning_rate": 1.7971963620283795e-05, + "loss": 0.4842, + "step": 2014 + }, + { + 
"epoch": 0.23, + "learning_rate": 1.7969728312019316e-05, + "loss": 0.4847, + "step": 2015 + }, + { + "epoch": 0.23, + "learning_rate": 1.7967491911724125e-05, + "loss": 0.484, + "step": 2016 + }, + { + "epoch": 0.23, + "learning_rate": 1.796525441970466e-05, + "loss": 0.4841, + "step": 2017 + }, + { + "epoch": 0.23, + "learning_rate": 1.7963015836267502e-05, + "loss": 0.4862, + "step": 2018 + }, + { + "epoch": 0.23, + "learning_rate": 1.7960776161719396e-05, + "loss": 0.4713, + "step": 2019 + }, + { + "epoch": 0.23, + "learning_rate": 1.7958535396367218e-05, + "loss": 0.4855, + "step": 2020 + }, + { + "epoch": 0.23, + "learning_rate": 1.795629354051801e-05, + "loss": 0.4947, + "step": 2021 + }, + { + "epoch": 0.23, + "learning_rate": 1.7954050594478952e-05, + "loss": 0.4709, + "step": 2022 + }, + { + "epoch": 0.23, + "learning_rate": 1.795180655855738e-05, + "loss": 0.4753, + "step": 2023 + }, + { + "epoch": 0.23, + "learning_rate": 1.7949561433060775e-05, + "loss": 0.4592, + "step": 2024 + }, + { + "epoch": 0.23, + "learning_rate": 1.794731521829677e-05, + "loss": 0.479, + "step": 2025 + }, + { + "epoch": 0.23, + "learning_rate": 1.7945067914573147e-05, + "loss": 0.4769, + "step": 2026 + }, + { + "epoch": 0.23, + "learning_rate": 1.7942819522197837e-05, + "loss": 0.4883, + "step": 2027 + }, + { + "epoch": 0.23, + "learning_rate": 1.794057004147892e-05, + "loss": 0.4726, + "step": 2028 + }, + { + "epoch": 0.23, + "learning_rate": 1.793831947272463e-05, + "loss": 0.4931, + "step": 2029 + }, + { + "epoch": 0.23, + "learning_rate": 1.793606781624333e-05, + "loss": 0.4699, + "step": 2030 + }, + { + "epoch": 0.23, + "learning_rate": 1.7933815072343565e-05, + "loss": 0.4965, + "step": 2031 + }, + { + "epoch": 0.23, + "learning_rate": 1.7931561241333998e-05, + "loss": 0.4694, + "step": 2032 + }, + { + "epoch": 0.23, + "learning_rate": 1.7929306323523463e-05, + "loss": 0.4896, + "step": 2033 + }, + { + "epoch": 0.23, + "learning_rate": 1.792705031922093e-05, + "loss": 
0.4737, + "step": 2034 + }, + { + "epoch": 0.23, + "learning_rate": 1.792479322873552e-05, + "loss": 0.4702, + "step": 2035 + }, + { + "epoch": 0.23, + "learning_rate": 1.792253505237651e-05, + "loss": 0.5163, + "step": 2036 + }, + { + "epoch": 0.23, + "learning_rate": 1.7920275790453318e-05, + "loss": 0.4697, + "step": 2037 + }, + { + "epoch": 0.23, + "learning_rate": 1.7918015443275517e-05, + "loss": 0.4757, + "step": 2038 + }, + { + "epoch": 0.23, + "learning_rate": 1.7915754011152815e-05, + "loss": 0.4824, + "step": 2039 + }, + { + "epoch": 0.23, + "learning_rate": 1.791349149439509e-05, + "loss": 0.4852, + "step": 2040 + }, + { + "epoch": 0.23, + "learning_rate": 1.7911227893312347e-05, + "loss": 0.4749, + "step": 2041 + }, + { + "epoch": 0.23, + "learning_rate": 1.790896320821476e-05, + "loss": 0.485, + "step": 2042 + }, + { + "epoch": 0.23, + "learning_rate": 1.7906697439412634e-05, + "loss": 0.4604, + "step": 2043 + }, + { + "epoch": 0.23, + "learning_rate": 1.790443058721643e-05, + "loss": 0.4646, + "step": 2044 + }, + { + "epoch": 0.23, + "learning_rate": 1.7902162651936766e-05, + "loss": 0.4981, + "step": 2045 + }, + { + "epoch": 0.23, + "learning_rate": 1.789989363388439e-05, + "loss": 0.4722, + "step": 2046 + }, + { + "epoch": 0.23, + "learning_rate": 1.7897623533370212e-05, + "loss": 0.4707, + "step": 2047 + }, + { + "epoch": 0.23, + "learning_rate": 1.7895352350705288e-05, + "loss": 0.4786, + "step": 2048 + }, + { + "epoch": 0.23, + "learning_rate": 1.7893080086200817e-05, + "loss": 0.4885, + "step": 2049 + }, + { + "epoch": 0.23, + "learning_rate": 1.789080674016815e-05, + "loss": 0.4913, + "step": 2050 + }, + { + "epoch": 0.23, + "learning_rate": 1.7888532312918793e-05, + "loss": 0.498, + "step": 2051 + }, + { + "epoch": 0.23, + "learning_rate": 1.7886256804764385e-05, + "loss": 0.4651, + "step": 2052 + }, + { + "epoch": 0.23, + "learning_rate": 1.7883980216016724e-05, + "loss": 0.4745, + "step": 2053 + }, + { + "epoch": 0.23, + "learning_rate": 
1.788170254698776e-05, + "loss": 0.4867, + "step": 2054 + }, + { + "epoch": 0.23, + "learning_rate": 1.7879423797989573e-05, + "loss": 0.455, + "step": 2055 + }, + { + "epoch": 0.23, + "learning_rate": 1.787714396933441e-05, + "loss": 0.4727, + "step": 2056 + }, + { + "epoch": 0.24, + "learning_rate": 1.7874863061334658e-05, + "loss": 0.4833, + "step": 2057 + }, + { + "epoch": 0.24, + "learning_rate": 1.7872581074302852e-05, + "loss": 0.494, + "step": 2058 + }, + { + "epoch": 0.24, + "learning_rate": 1.7870298008551674e-05, + "loss": 0.4881, + "step": 2059 + }, + { + "epoch": 0.24, + "learning_rate": 1.786801386439395e-05, + "loss": 0.4843, + "step": 2060 + }, + { + "epoch": 0.24, + "learning_rate": 1.7865728642142668e-05, + "loss": 0.4787, + "step": 2061 + }, + { + "epoch": 0.24, + "learning_rate": 1.786344234211095e-05, + "loss": 0.4876, + "step": 2062 + }, + { + "epoch": 0.24, + "learning_rate": 1.786115496461207e-05, + "loss": 0.4639, + "step": 2063 + }, + { + "epoch": 0.24, + "learning_rate": 1.7858866509959455e-05, + "loss": 0.4657, + "step": 2064 + }, + { + "epoch": 0.24, + "learning_rate": 1.7856576978466666e-05, + "loss": 0.4768, + "step": 2065 + }, + { + "epoch": 0.24, + "learning_rate": 1.785428637044742e-05, + "loss": 0.4837, + "step": 2066 + }, + { + "epoch": 0.24, + "learning_rate": 1.7851994686215592e-05, + "loss": 0.498, + "step": 2067 + }, + { + "epoch": 0.24, + "learning_rate": 1.7849701926085183e-05, + "loss": 0.4789, + "step": 2068 + }, + { + "epoch": 0.24, + "learning_rate": 1.7847408090370355e-05, + "loss": 0.47, + "step": 2069 + }, + { + "epoch": 0.24, + "learning_rate": 1.784511317938542e-05, + "loss": 0.4868, + "step": 2070 + }, + { + "epoch": 0.24, + "learning_rate": 1.7842817193444823e-05, + "loss": 0.4803, + "step": 2071 + }, + { + "epoch": 0.24, + "learning_rate": 1.7840520132863173e-05, + "loss": 0.4863, + "step": 2072 + }, + { + "epoch": 0.24, + "learning_rate": 1.783822199795522e-05, + "loss": 0.4667, + "step": 2073 + }, + { + 
"epoch": 0.24, + "learning_rate": 1.7835922789035853e-05, + "loss": 0.4738, + "step": 2074 + }, + { + "epoch": 0.24, + "learning_rate": 1.7833622506420116e-05, + "loss": 0.454, + "step": 2075 + }, + { + "epoch": 0.24, + "learning_rate": 1.7831321150423203e-05, + "loss": 0.5017, + "step": 2076 + }, + { + "epoch": 0.24, + "learning_rate": 1.782901872136045e-05, + "loss": 0.4968, + "step": 2077 + }, + { + "epoch": 0.24, + "learning_rate": 1.7826715219547336e-05, + "loss": 0.4852, + "step": 2078 + }, + { + "epoch": 0.24, + "learning_rate": 1.78244106452995e-05, + "loss": 0.4827, + "step": 2079 + }, + { + "epoch": 0.24, + "learning_rate": 1.7822104998932715e-05, + "loss": 0.4838, + "step": 2080 + }, + { + "epoch": 0.24, + "learning_rate": 1.7819798280762907e-05, + "loss": 0.4729, + "step": 2081 + }, + { + "epoch": 0.24, + "learning_rate": 1.7817490491106148e-05, + "loss": 0.4647, + "step": 2082 + }, + { + "epoch": 0.24, + "learning_rate": 1.7815181630278656e-05, + "loss": 0.4783, + "step": 2083 + }, + { + "epoch": 0.24, + "learning_rate": 1.78128716985968e-05, + "loss": 0.4797, + "step": 2084 + }, + { + "epoch": 0.24, + "learning_rate": 1.781056069637709e-05, + "loss": 0.5009, + "step": 2085 + }, + { + "epoch": 0.24, + "learning_rate": 1.7808248623936183e-05, + "loss": 0.5092, + "step": 2086 + }, + { + "epoch": 0.24, + "learning_rate": 1.780593548159089e-05, + "loss": 0.4692, + "step": 2087 + }, + { + "epoch": 0.24, + "learning_rate": 1.7803621269658154e-05, + "loss": 0.4904, + "step": 2088 + }, + { + "epoch": 0.24, + "learning_rate": 1.7801305988455085e-05, + "loss": 0.4693, + "step": 2089 + }, + { + "epoch": 0.24, + "learning_rate": 1.779898963829892e-05, + "loss": 0.4656, + "step": 2090 + }, + { + "epoch": 0.24, + "learning_rate": 1.779667221950705e-05, + "loss": 0.4907, + "step": 2091 + }, + { + "epoch": 0.24, + "learning_rate": 1.7794353732397018e-05, + "loss": 0.47, + "step": 2092 + }, + { + "epoch": 0.24, + "learning_rate": 1.7792034177286508e-05, + "loss": 
0.4755, + "step": 2093 + }, + { + "epoch": 0.24, + "learning_rate": 1.778971355449335e-05, + "loss": 0.4875, + "step": 2094 + }, + { + "epoch": 0.24, + "learning_rate": 1.7787391864335517e-05, + "loss": 0.4787, + "step": 2095 + }, + { + "epoch": 0.24, + "learning_rate": 1.778506910713114e-05, + "loss": 0.4721, + "step": 2096 + }, + { + "epoch": 0.24, + "learning_rate": 1.778274528319848e-05, + "loss": 0.4827, + "step": 2097 + }, + { + "epoch": 0.24, + "learning_rate": 1.778042039285596e-05, + "loss": 0.462, + "step": 2098 + }, + { + "epoch": 0.24, + "learning_rate": 1.777809443642214e-05, + "loss": 0.4773, + "step": 2099 + }, + { + "epoch": 0.24, + "learning_rate": 1.7775767414215726e-05, + "loss": 0.4847, + "step": 2100 + }, + { + "epoch": 0.24, + "learning_rate": 1.7773439326555574e-05, + "loss": 0.4716, + "step": 2101 + }, + { + "epoch": 0.24, + "learning_rate": 1.777111017376068e-05, + "loss": 0.4797, + "step": 2102 + }, + { + "epoch": 0.24, + "learning_rate": 1.7768779956150196e-05, + "loss": 0.4778, + "step": 2103 + }, + { + "epoch": 0.24, + "learning_rate": 1.776644867404341e-05, + "loss": 0.4625, + "step": 2104 + }, + { + "epoch": 0.24, + "learning_rate": 1.776411632775976e-05, + "loss": 0.46, + "step": 2105 + }, + { + "epoch": 0.24, + "learning_rate": 1.7761782917618836e-05, + "loss": 0.4934, + "step": 2106 + }, + { + "epoch": 0.24, + "learning_rate": 1.7759448443940355e-05, + "loss": 0.459, + "step": 2107 + }, + { + "epoch": 0.24, + "learning_rate": 1.77571129070442e-05, + "loss": 0.4785, + "step": 2108 + }, + { + "epoch": 0.24, + "learning_rate": 1.775477630725039e-05, + "loss": 0.4648, + "step": 2109 + }, + { + "epoch": 0.24, + "learning_rate": 1.7752438644879092e-05, + "loss": 0.5045, + "step": 2110 + }, + { + "epoch": 0.24, + "learning_rate": 1.7750099920250616e-05, + "loss": 0.4848, + "step": 2111 + }, + { + "epoch": 0.24, + "learning_rate": 1.774776013368542e-05, + "loss": 0.4724, + "step": 2112 + }, + { + "epoch": 0.24, + "learning_rate": 
1.774541928550411e-05, + "loss": 0.4749, + "step": 2113 + }, + { + "epoch": 0.24, + "learning_rate": 1.7743077376027433e-05, + "loss": 0.4872, + "step": 2114 + }, + { + "epoch": 0.24, + "learning_rate": 1.7740734405576283e-05, + "loss": 0.4778, + "step": 2115 + }, + { + "epoch": 0.24, + "learning_rate": 1.7738390374471696e-05, + "loss": 0.4904, + "step": 2116 + }, + { + "epoch": 0.24, + "learning_rate": 1.773604528303486e-05, + "loss": 0.4892, + "step": 2117 + }, + { + "epoch": 0.24, + "learning_rate": 1.7733699131587104e-05, + "loss": 0.4918, + "step": 2118 + }, + { + "epoch": 0.24, + "learning_rate": 1.77313519204499e-05, + "loss": 0.4776, + "step": 2119 + }, + { + "epoch": 0.24, + "learning_rate": 1.7729003649944878e-05, + "loss": 0.4777, + "step": 2120 + }, + { + "epoch": 0.24, + "learning_rate": 1.7726654320393795e-05, + "loss": 0.4935, + "step": 2121 + }, + { + "epoch": 0.24, + "learning_rate": 1.772430393211856e-05, + "loss": 0.482, + "step": 2122 + }, + { + "epoch": 0.24, + "learning_rate": 1.7721952485441232e-05, + "loss": 0.4718, + "step": 2123 + }, + { + "epoch": 0.24, + "learning_rate": 1.7719599980684016e-05, + "loss": 0.4591, + "step": 2124 + }, + { + "epoch": 0.24, + "learning_rate": 1.7717246418169252e-05, + "loss": 0.481, + "step": 2125 + }, + { + "epoch": 0.24, + "learning_rate": 1.7714891798219432e-05, + "loss": 0.4851, + "step": 2126 + }, + { + "epoch": 0.24, + "learning_rate": 1.771253612115719e-05, + "loss": 0.5005, + "step": 2127 + }, + { + "epoch": 0.24, + "learning_rate": 1.7710179387305308e-05, + "loss": 0.4734, + "step": 2128 + }, + { + "epoch": 0.24, + "learning_rate": 1.7707821596986715e-05, + "loss": 0.4805, + "step": 2129 + }, + { + "epoch": 0.24, + "learning_rate": 1.7705462750524474e-05, + "loss": 0.4809, + "step": 2130 + }, + { + "epoch": 0.24, + "learning_rate": 1.77031028482418e-05, + "loss": 0.4728, + "step": 2131 + }, + { + "epoch": 0.24, + "learning_rate": 1.770074189046206e-05, + "loss": 0.4759, + "step": 2132 + }, + { + 
"epoch": 0.24, + "learning_rate": 1.7698379877508755e-05, + "loss": 0.4709, + "step": 2133 + }, + { + "epoch": 0.24, + "learning_rate": 1.7696016809705525e-05, + "loss": 0.4993, + "step": 2134 + }, + { + "epoch": 0.24, + "learning_rate": 1.7693652687376173e-05, + "loss": 0.4765, + "step": 2135 + }, + { + "epoch": 0.24, + "learning_rate": 1.769128751084463e-05, + "loss": 0.4845, + "step": 2136 + }, + { + "epoch": 0.24, + "learning_rate": 1.7688921280434984e-05, + "loss": 0.4776, + "step": 2137 + }, + { + "epoch": 0.24, + "learning_rate": 1.768655399647146e-05, + "loss": 0.4779, + "step": 2138 + }, + { + "epoch": 0.24, + "learning_rate": 1.7684185659278423e-05, + "loss": 0.4532, + "step": 2139 + }, + { + "epoch": 0.24, + "learning_rate": 1.7681816269180394e-05, + "loss": 0.4901, + "step": 2140 + }, + { + "epoch": 0.24, + "learning_rate": 1.7679445826502033e-05, + "loss": 0.4582, + "step": 2141 + }, + { + "epoch": 0.24, + "learning_rate": 1.767707433156814e-05, + "loss": 0.4821, + "step": 2142 + }, + { + "epoch": 0.24, + "learning_rate": 1.767470178470366e-05, + "loss": 0.4743, + "step": 2143 + }, + { + "epoch": 0.25, + "learning_rate": 1.7672328186233692e-05, + "loss": 0.5039, + "step": 2144 + }, + { + "epoch": 0.25, + "learning_rate": 1.766995353648347e-05, + "loss": 0.4517, + "step": 2145 + }, + { + "epoch": 0.25, + "learning_rate": 1.766757783577837e-05, + "loss": 0.4928, + "step": 2146 + }, + { + "epoch": 0.25, + "learning_rate": 1.766520108444392e-05, + "loss": 0.4828, + "step": 2147 + }, + { + "epoch": 0.25, + "learning_rate": 1.7662823282805788e-05, + "loss": 0.4716, + "step": 2148 + }, + { + "epoch": 0.25, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4756, + "step": 2149 + }, + { + "epoch": 0.25, + "learning_rate": 1.765806452992186e-05, + "loss": 0.4674, + "step": 2150 + }, + { + "epoch": 0.25, + "learning_rate": 1.7655683579328128e-05, + "loss": 0.4787, + "step": 2151 + }, + { + "epoch": 0.25, + "learning_rate": 1.765330157973482e-05, + "loss": 
0.5245, + "step": 2152 + }, + { + "epoch": 0.25, + "learning_rate": 1.7650918531468326e-05, + "loss": 0.4759, + "step": 2153 + }, + { + "epoch": 0.25, + "learning_rate": 1.7648534434855177e-05, + "loss": 0.4719, + "step": 2154 + }, + { + "epoch": 0.25, + "learning_rate": 1.764614929022205e-05, + "loss": 0.4807, + "step": 2155 + }, + { + "epoch": 0.25, + "learning_rate": 1.764376309789576e-05, + "loss": 0.4773, + "step": 2156 + }, + { + "epoch": 0.25, + "learning_rate": 1.764137585820327e-05, + "loss": 0.4787, + "step": 2157 + }, + { + "epoch": 0.25, + "learning_rate": 1.7638987571471685e-05, + "loss": 0.4781, + "step": 2158 + }, + { + "epoch": 0.25, + "learning_rate": 1.7636598238028253e-05, + "loss": 0.4679, + "step": 2159 + }, + { + "epoch": 0.25, + "learning_rate": 1.7634207858200366e-05, + "loss": 0.4922, + "step": 2160 + }, + { + "epoch": 0.25, + "learning_rate": 1.763181643231556e-05, + "loss": 0.491, + "step": 2161 + }, + { + "epoch": 0.25, + "learning_rate": 1.7629423960701513e-05, + "loss": 0.4677, + "step": 2162 + }, + { + "epoch": 0.25, + "learning_rate": 1.7627030443686047e-05, + "loss": 0.4873, + "step": 2163 + }, + { + "epoch": 0.25, + "learning_rate": 1.762463588159713e-05, + "loss": 0.4677, + "step": 2164 + }, + { + "epoch": 0.25, + "learning_rate": 1.762224027476287e-05, + "loss": 0.4581, + "step": 2165 + }, + { + "epoch": 0.25, + "learning_rate": 1.761984362351151e-05, + "loss": 0.5085, + "step": 2166 + }, + { + "epoch": 0.25, + "learning_rate": 1.7617445928171458e-05, + "loss": 0.4582, + "step": 2167 + }, + { + "epoch": 0.25, + "learning_rate": 1.761504718907124e-05, + "loss": 0.499, + "step": 2168 + }, + { + "epoch": 0.25, + "learning_rate": 1.7612647406539548e-05, + "loss": 0.4789, + "step": 2169 + }, + { + "epoch": 0.25, + "learning_rate": 1.76102465809052e-05, + "loss": 0.4757, + "step": 2170 + }, + { + "epoch": 0.25, + "learning_rate": 1.760784471249716e-05, + "loss": 0.4804, + "step": 2171 + }, + { + "epoch": 0.25, + "learning_rate": 
1.760544180164454e-05, + "loss": 0.4877, + "step": 2172 + }, + { + "epoch": 0.25, + "learning_rate": 1.7603037848676593e-05, + "loss": 0.4686, + "step": 2173 + }, + { + "epoch": 0.25, + "learning_rate": 1.7600632853922713e-05, + "loss": 0.4784, + "step": 2174 + }, + { + "epoch": 0.25, + "learning_rate": 1.7598226817712442e-05, + "loss": 0.4851, + "step": 2175 + }, + { + "epoch": 0.25, + "learning_rate": 1.7595819740375457e-05, + "loss": 0.4848, + "step": 2176 + }, + { + "epoch": 0.25, + "learning_rate": 1.7593411622241584e-05, + "loss": 0.4683, + "step": 2177 + }, + { + "epoch": 0.25, + "learning_rate": 1.7591002463640784e-05, + "loss": 0.4815, + "step": 2178 + }, + { + "epoch": 0.25, + "learning_rate": 1.758859226490317e-05, + "loss": 0.5039, + "step": 2179 + }, + { + "epoch": 0.25, + "learning_rate": 1.7586181026358987e-05, + "loss": 0.4845, + "step": 2180 + }, + { + "epoch": 0.25, + "learning_rate": 1.758376874833864e-05, + "loss": 0.4929, + "step": 2181 + }, + { + "epoch": 0.25, + "learning_rate": 1.7581355431172653e-05, + "loss": 0.4848, + "step": 2182 + }, + { + "epoch": 0.25, + "learning_rate": 1.757894107519171e-05, + "loss": 0.4796, + "step": 2183 + }, + { + "epoch": 0.25, + "learning_rate": 1.757652568072663e-05, + "loss": 0.4889, + "step": 2184 + }, + { + "epoch": 0.25, + "learning_rate": 1.757410924810838e-05, + "loss": 0.4622, + "step": 2185 + }, + { + "epoch": 0.25, + "learning_rate": 1.757169177766806e-05, + "loss": 0.4766, + "step": 2186 + }, + { + "epoch": 0.25, + "learning_rate": 1.7569273269736918e-05, + "loss": 0.4708, + "step": 2187 + }, + { + "epoch": 0.25, + "learning_rate": 1.756685372464635e-05, + "loss": 0.537, + "step": 2188 + }, + { + "epoch": 0.25, + "learning_rate": 1.7564433142727882e-05, + "loss": 0.4926, + "step": 2189 + }, + { + "epoch": 0.25, + "learning_rate": 1.7562011524313187e-05, + "loss": 0.4633, + "step": 2190 + }, + { + "epoch": 0.25, + "learning_rate": 1.755958886973408e-05, + "loss": 0.4688, + "step": 2191 + }, + { + 
"epoch": 0.25, + "learning_rate": 1.7557165179322522e-05, + "loss": 0.4958, + "step": 2192 + }, + { + "epoch": 0.25, + "learning_rate": 1.7554740453410617e-05, + "loss": 0.4528, + "step": 2193 + }, + { + "epoch": 0.25, + "learning_rate": 1.75523146923306e-05, + "loss": 0.4919, + "step": 2194 + }, + { + "epoch": 0.25, + "learning_rate": 1.7549887896414853e-05, + "loss": 0.4762, + "step": 2195 + }, + { + "epoch": 0.25, + "learning_rate": 1.7547460065995903e-05, + "loss": 0.4834, + "step": 2196 + }, + { + "epoch": 0.25, + "learning_rate": 1.754503120140642e-05, + "loss": 0.5091, + "step": 2197 + }, + { + "epoch": 0.25, + "learning_rate": 1.7542601302979213e-05, + "loss": 0.4773, + "step": 2198 + }, + { + "epoch": 0.25, + "learning_rate": 1.7540170371047228e-05, + "loss": 0.4704, + "step": 2199 + }, + { + "epoch": 0.25, + "learning_rate": 1.753773840594356e-05, + "loss": 0.4855, + "step": 2200 + }, + { + "epoch": 0.25, + "learning_rate": 1.753530540800144e-05, + "loss": 0.48, + "step": 2201 + }, + { + "epoch": 0.25, + "learning_rate": 1.7532871377554243e-05, + "loss": 0.4906, + "step": 2202 + }, + { + "epoch": 0.25, + "learning_rate": 1.7530436314935492e-05, + "loss": 0.4709, + "step": 2203 + }, + { + "epoch": 0.25, + "learning_rate": 1.7528000220478836e-05, + "loss": 0.4907, + "step": 2204 + }, + { + "epoch": 0.25, + "learning_rate": 1.7525563094518078e-05, + "loss": 0.4629, + "step": 2205 + }, + { + "epoch": 0.25, + "learning_rate": 1.7523124937387162e-05, + "loss": 0.4857, + "step": 2206 + }, + { + "epoch": 0.25, + "learning_rate": 1.7520685749420164e-05, + "loss": 0.4732, + "step": 2207 + }, + { + "epoch": 0.25, + "learning_rate": 1.7518245530951315e-05, + "loss": 0.4853, + "step": 2208 + }, + { + "epoch": 0.25, + "learning_rate": 1.7515804282314974e-05, + "loss": 0.4757, + "step": 2209 + }, + { + "epoch": 0.25, + "learning_rate": 1.751336200384564e-05, + "loss": 0.48, + "step": 2210 + }, + { + "epoch": 0.25, + "learning_rate": 1.751091869587797e-05, + "loss": 
0.4729, + "step": 2211 + }, + { + "epoch": 0.25, + "learning_rate": 1.7508474358746753e-05, + "loss": 0.4996, + "step": 2212 + }, + { + "epoch": 0.25, + "learning_rate": 1.7506028992786912e-05, + "loss": 0.4631, + "step": 2213 + }, + { + "epoch": 0.25, + "learning_rate": 1.7503582598333517e-05, + "loss": 0.4687, + "step": 2214 + }, + { + "epoch": 0.25, + "learning_rate": 1.750113517572178e-05, + "loss": 0.4873, + "step": 2215 + }, + { + "epoch": 0.25, + "learning_rate": 1.749868672528705e-05, + "loss": 0.474, + "step": 2216 + }, + { + "epoch": 0.25, + "learning_rate": 1.7496237247364827e-05, + "loss": 0.4696, + "step": 2217 + }, + { + "epoch": 0.25, + "learning_rate": 1.7493786742290734e-05, + "loss": 0.4869, + "step": 2218 + }, + { + "epoch": 0.25, + "learning_rate": 1.7491335210400554e-05, + "loss": 0.4751, + "step": 2219 + }, + { + "epoch": 0.25, + "learning_rate": 1.7488882652030193e-05, + "loss": 0.5086, + "step": 2220 + }, + { + "epoch": 0.25, + "learning_rate": 1.748642906751571e-05, + "loss": 0.4697, + "step": 2221 + }, + { + "epoch": 0.25, + "learning_rate": 1.7483974457193307e-05, + "loss": 0.4575, + "step": 2222 + }, + { + "epoch": 0.25, + "learning_rate": 1.748151882139931e-05, + "loss": 0.482, + "step": 2223 + }, + { + "epoch": 0.25, + "learning_rate": 1.7479062160470205e-05, + "loss": 0.4892, + "step": 2224 + }, + { + "epoch": 0.25, + "learning_rate": 1.74766044747426e-05, + "loss": 0.4507, + "step": 2225 + }, + { + "epoch": 0.25, + "learning_rate": 1.7474145764553262e-05, + "loss": 0.4824, + "step": 2226 + }, + { + "epoch": 0.25, + "learning_rate": 1.7471686030239082e-05, + "loss": 0.4685, + "step": 2227 + }, + { + "epoch": 0.25, + "learning_rate": 1.7469225272137104e-05, + "loss": 0.5125, + "step": 2228 + }, + { + "epoch": 0.25, + "learning_rate": 1.7466763490584504e-05, + "loss": 0.4813, + "step": 2229 + }, + { + "epoch": 0.25, + "learning_rate": 1.7464300685918602e-05, + "loss": 0.4821, + "step": 2230 + }, + { + "epoch": 0.25, + "learning_rate": 
1.7461836858476858e-05, + "loss": 0.4607, + "step": 2231 + }, + { + "epoch": 0.26, + "learning_rate": 1.745937200859687e-05, + "loss": 0.4742, + "step": 2232 + }, + { + "epoch": 0.26, + "learning_rate": 1.7456906136616374e-05, + "loss": 0.47, + "step": 2233 + }, + { + "epoch": 0.26, + "learning_rate": 1.7454439242873257e-05, + "loss": 0.4881, + "step": 2234 + }, + { + "epoch": 0.26, + "learning_rate": 1.745197132770553e-05, + "loss": 0.4716, + "step": 2235 + }, + { + "epoch": 0.26, + "learning_rate": 1.7449502391451362e-05, + "loss": 0.5046, + "step": 2236 + }, + { + "epoch": 0.26, + "learning_rate": 1.7447032434449045e-05, + "loss": 0.48, + "step": 2237 + }, + { + "epoch": 0.26, + "learning_rate": 1.7444561457037022e-05, + "loss": 0.4854, + "step": 2238 + }, + { + "epoch": 0.26, + "learning_rate": 1.744208945955387e-05, + "loss": 0.4625, + "step": 2239 + }, + { + "epoch": 0.26, + "learning_rate": 1.743961644233831e-05, + "loss": 0.4795, + "step": 2240 + }, + { + "epoch": 0.26, + "learning_rate": 1.7437142405729196e-05, + "loss": 0.4831, + "step": 2241 + }, + { + "epoch": 0.26, + "learning_rate": 1.743466735006553e-05, + "loss": 0.4779, + "step": 2242 + }, + { + "epoch": 0.26, + "learning_rate": 1.7432191275686454e-05, + "loss": 0.4721, + "step": 2243 + }, + { + "epoch": 0.26, + "learning_rate": 1.7429714182931238e-05, + "loss": 0.4879, + "step": 2244 + }, + { + "epoch": 0.26, + "learning_rate": 1.7427236072139306e-05, + "loss": 0.4855, + "step": 2245 + }, + { + "epoch": 0.26, + "learning_rate": 1.7424756943650203e-05, + "loss": 0.4936, + "step": 2246 + }, + { + "epoch": 0.26, + "learning_rate": 1.7422276797803638e-05, + "loss": 0.4762, + "step": 2247 + }, + { + "epoch": 0.26, + "learning_rate": 1.741979563493944e-05, + "loss": 0.4638, + "step": 2248 + }, + { + "epoch": 0.26, + "learning_rate": 1.741731345539758e-05, + "loss": 0.4827, + "step": 2249 + }, + { + "epoch": 0.26, + "learning_rate": 1.741483025951818e-05, + "loss": 0.4847, + "step": 2250 + }, + { + 
"epoch": 0.26, + "learning_rate": 1.7412346047641485e-05, + "loss": 0.4682, + "step": 2251 + }, + { + "epoch": 0.26, + "learning_rate": 1.74098608201079e-05, + "loss": 0.4909, + "step": 2252 + }, + { + "epoch": 0.26, + "learning_rate": 1.7407374577257945e-05, + "loss": 0.479, + "step": 2253 + }, + { + "epoch": 0.26, + "learning_rate": 1.7404887319432294e-05, + "loss": 0.5104, + "step": 2254 + }, + { + "epoch": 0.26, + "learning_rate": 1.740239904697176e-05, + "loss": 0.4657, + "step": 2255 + }, + { + "epoch": 0.26, + "learning_rate": 1.7399909760217285e-05, + "loss": 0.4881, + "step": 2256 + }, + { + "epoch": 0.26, + "learning_rate": 1.7397419459509962e-05, + "loss": 0.4684, + "step": 2257 + }, + { + "epoch": 0.26, + "learning_rate": 1.739492814519102e-05, + "loss": 0.4574, + "step": 2258 + }, + { + "epoch": 0.26, + "learning_rate": 1.739243581760182e-05, + "loss": 0.4794, + "step": 2259 + }, + { + "epoch": 0.26, + "learning_rate": 1.738994247708387e-05, + "loss": 0.4778, + "step": 2260 + }, + { + "epoch": 0.26, + "learning_rate": 1.7387448123978813e-05, + "loss": 0.489, + "step": 2261 + }, + { + "epoch": 0.26, + "learning_rate": 1.7384952758628423e-05, + "loss": 0.4871, + "step": 2262 + }, + { + "epoch": 0.26, + "learning_rate": 1.738245638137463e-05, + "loss": 0.4855, + "step": 2263 + }, + { + "epoch": 0.26, + "learning_rate": 1.7379958992559494e-05, + "loss": 0.471, + "step": 2264 + }, + { + "epoch": 0.26, + "learning_rate": 1.737746059252521e-05, + "loss": 0.4735, + "step": 2265 + }, + { + "epoch": 0.26, + "learning_rate": 1.737496118161411e-05, + "loss": 0.4721, + "step": 2266 + }, + { + "epoch": 0.26, + "learning_rate": 1.7372460760168676e-05, + "loss": 0.492, + "step": 2267 + }, + { + "epoch": 0.26, + "learning_rate": 1.736995932853152e-05, + "loss": 0.4756, + "step": 2268 + }, + { + "epoch": 0.26, + "learning_rate": 1.736745688704539e-05, + "loss": 0.5099, + "step": 2269 + }, + { + "epoch": 0.26, + "learning_rate": 1.736495343605318e-05, + "loss": 0.476, + 
"step": 2270 + }, + { + "epoch": 0.26, + "learning_rate": 1.736244897589792e-05, + "loss": 0.4761, + "step": 2271 + }, + { + "epoch": 0.26, + "learning_rate": 1.7359943506922775e-05, + "loss": 0.4725, + "step": 2272 + }, + { + "epoch": 0.26, + "learning_rate": 1.735743702947105e-05, + "loss": 0.477, + "step": 2273 + }, + { + "epoch": 0.26, + "learning_rate": 1.7354929543886186e-05, + "loss": 0.4682, + "step": 2274 + }, + { + "epoch": 0.26, + "learning_rate": 1.7352421050511767e-05, + "loss": 0.4591, + "step": 2275 + }, + { + "epoch": 0.26, + "learning_rate": 1.734991154969152e-05, + "loss": 0.4755, + "step": 2276 + }, + { + "epoch": 0.26, + "learning_rate": 1.7347401041769284e-05, + "loss": 0.4902, + "step": 2277 + }, + { + "epoch": 0.26, + "learning_rate": 1.7344889527089074e-05, + "loss": 0.4777, + "step": 2278 + }, + { + "epoch": 0.26, + "learning_rate": 1.7342377005995014e-05, + "loss": 0.4873, + "step": 2279 + }, + { + "epoch": 0.26, + "learning_rate": 1.733986347883138e-05, + "loss": 0.4777, + "step": 2280 + }, + { + "epoch": 0.26, + "learning_rate": 1.7337348945942572e-05, + "loss": 0.4675, + "step": 2281 + }, + { + "epoch": 0.26, + "learning_rate": 1.7334833407673145e-05, + "loss": 0.4943, + "step": 2282 + }, + { + "epoch": 0.26, + "learning_rate": 1.7332316864367785e-05, + "loss": 0.4708, + "step": 2283 + }, + { + "epoch": 0.26, + "learning_rate": 1.732979931637131e-05, + "loss": 0.4695, + "step": 2284 + }, + { + "epoch": 0.26, + "learning_rate": 1.7327280764028683e-05, + "loss": 0.488, + "step": 2285 + }, + { + "epoch": 0.26, + "learning_rate": 1.7324761207685005e-05, + "loss": 0.5035, + "step": 2286 + }, + { + "epoch": 0.26, + "learning_rate": 1.7322240647685503e-05, + "loss": 0.4726, + "step": 2287 + }, + { + "epoch": 0.26, + "learning_rate": 1.7319719084375556e-05, + "loss": 0.4609, + "step": 2288 + }, + { + "epoch": 0.26, + "learning_rate": 1.7317196518100672e-05, + "loss": 0.488, + "step": 2289 + }, + { + "epoch": 0.26, + "learning_rate": 
1.7314672949206502e-05, + "loss": 0.474, + "step": 2290 + }, + { + "epoch": 0.26, + "learning_rate": 1.731214837803883e-05, + "loss": 0.4646, + "step": 2291 + }, + { + "epoch": 0.26, + "learning_rate": 1.7309622804943573e-05, + "loss": 0.4767, + "step": 2292 + }, + { + "epoch": 0.26, + "learning_rate": 1.73070962302668e-05, + "loss": 0.5126, + "step": 2293 + }, + { + "epoch": 0.26, + "learning_rate": 1.7304568654354703e-05, + "loss": 0.4779, + "step": 2294 + }, + { + "epoch": 0.26, + "learning_rate": 1.7302040077553616e-05, + "loss": 0.4769, + "step": 2295 + }, + { + "epoch": 0.26, + "learning_rate": 1.7299510500210015e-05, + "loss": 0.4764, + "step": 2296 + }, + { + "epoch": 0.26, + "learning_rate": 1.7296979922670502e-05, + "loss": 0.4786, + "step": 2297 + }, + { + "epoch": 0.26, + "learning_rate": 1.729444834528183e-05, + "loss": 0.4735, + "step": 2298 + }, + { + "epoch": 0.26, + "learning_rate": 1.7291915768390875e-05, + "loss": 0.4773, + "step": 2299 + }, + { + "epoch": 0.26, + "learning_rate": 1.728938219234466e-05, + "loss": 0.4762, + "step": 2300 + }, + { + "epoch": 0.26, + "learning_rate": 1.728684761749034e-05, + "loss": 0.4629, + "step": 2301 + }, + { + "epoch": 0.26, + "learning_rate": 1.728431204417521e-05, + "loss": 0.4624, + "step": 2302 + }, + { + "epoch": 0.26, + "learning_rate": 1.7281775472746695e-05, + "loss": 0.4793, + "step": 2303 + }, + { + "epoch": 0.26, + "learning_rate": 1.727923790355237e-05, + "loss": 0.4707, + "step": 2304 + }, + { + "epoch": 0.26, + "learning_rate": 1.7276699336939936e-05, + "loss": 0.4638, + "step": 2305 + }, + { + "epoch": 0.26, + "learning_rate": 1.7274159773257227e-05, + "loss": 0.492, + "step": 2306 + }, + { + "epoch": 0.26, + "learning_rate": 1.7271619212852232e-05, + "loss": 0.4632, + "step": 2307 + }, + { + "epoch": 0.26, + "learning_rate": 1.726907765607305e-05, + "loss": 0.479, + "step": 2308 + }, + { + "epoch": 0.26, + "learning_rate": 1.7266535103267943e-05, + "loss": 0.4955, + "step": 2309 + }, + { + 
"epoch": 0.26, + "learning_rate": 1.726399155478529e-05, + "loss": 0.471, + "step": 2310 + }, + { + "epoch": 0.26, + "learning_rate": 1.7261447010973623e-05, + "loss": 0.4869, + "step": 2311 + }, + { + "epoch": 0.26, + "learning_rate": 1.7258901472181587e-05, + "loss": 0.4765, + "step": 2312 + }, + { + "epoch": 0.26, + "learning_rate": 1.725635493875799e-05, + "loss": 0.4795, + "step": 2313 + }, + { + "epoch": 0.26, + "learning_rate": 1.725380741105176e-05, + "loss": 0.4612, + "step": 2314 + }, + { + "epoch": 0.26, + "learning_rate": 1.7251258889411964e-05, + "loss": 0.48, + "step": 2315 + }, + { + "epoch": 0.26, + "learning_rate": 1.724870937418781e-05, + "loss": 0.483, + "step": 2316 + }, + { + "epoch": 0.26, + "learning_rate": 1.7246158865728634e-05, + "loss": 0.4708, + "step": 2317 + }, + { + "epoch": 0.26, + "learning_rate": 1.7243607364383916e-05, + "loss": 0.4752, + "step": 2318 + }, + { + "epoch": 0.27, + "learning_rate": 1.7241054870503262e-05, + "loss": 0.4801, + "step": 2319 + }, + { + "epoch": 0.27, + "learning_rate": 1.723850138443643e-05, + "loss": 0.4739, + "step": 2320 + }, + { + "epoch": 0.27, + "learning_rate": 1.72359469065333e-05, + "loss": 0.4771, + "step": 2321 + }, + { + "epoch": 0.27, + "learning_rate": 1.723339143714389e-05, + "loss": 0.4811, + "step": 2322 + }, + { + "epoch": 0.27, + "learning_rate": 1.7230834976618364e-05, + "loss": 0.5013, + "step": 2323 + }, + { + "epoch": 0.27, + "learning_rate": 1.7228277525307007e-05, + "loss": 0.4576, + "step": 2324 + }, + { + "epoch": 0.27, + "learning_rate": 1.7225719083560246e-05, + "loss": 0.4736, + "step": 2325 + }, + { + "epoch": 0.27, + "learning_rate": 1.7223159651728653e-05, + "loss": 0.4784, + "step": 2326 + }, + { + "epoch": 0.27, + "learning_rate": 1.7220599230162917e-05, + "loss": 0.4742, + "step": 2327 + }, + { + "epoch": 0.27, + "learning_rate": 1.7218037819213883e-05, + "loss": 0.4799, + "step": 2328 + }, + { + "epoch": 0.27, + "learning_rate": 1.7215475419232516e-05, + "loss": 
0.4987, + "step": 2329 + }, + { + "epoch": 0.27, + "learning_rate": 1.7212912030569923e-05, + "loss": 0.4768, + "step": 2330 + }, + { + "epoch": 0.27, + "learning_rate": 1.7210347653577343e-05, + "loss": 0.4819, + "step": 2331 + }, + { + "epoch": 0.27, + "learning_rate": 1.7207782288606154e-05, + "loss": 0.4671, + "step": 2332 + }, + { + "epoch": 0.27, + "learning_rate": 1.720521593600787e-05, + "loss": 0.4661, + "step": 2333 + }, + { + "epoch": 0.27, + "learning_rate": 1.7202648596134143e-05, + "loss": 0.4803, + "step": 2334 + }, + { + "epoch": 0.27, + "learning_rate": 1.7200080269336745e-05, + "loss": 0.4671, + "step": 2335 + }, + { + "epoch": 0.27, + "learning_rate": 1.71975109559676e-05, + "loss": 0.4705, + "step": 2336 + }, + { + "epoch": 0.27, + "learning_rate": 1.7194940656378763e-05, + "loss": 0.4898, + "step": 2337 + }, + { + "epoch": 0.27, + "learning_rate": 1.7192369370922423e-05, + "loss": 0.4762, + "step": 2338 + }, + { + "epoch": 0.27, + "learning_rate": 1.7189797099950895e-05, + "loss": 0.4991, + "step": 2339 + }, + { + "epoch": 0.27, + "learning_rate": 1.7187223843816648e-05, + "loss": 0.4784, + "step": 2340 + }, + { + "epoch": 0.27, + "learning_rate": 1.7184649602872274e-05, + "loss": 0.4749, + "step": 2341 + }, + { + "epoch": 0.27, + "learning_rate": 1.7182074377470494e-05, + "loss": 0.4777, + "step": 2342 + }, + { + "epoch": 0.27, + "learning_rate": 1.717949816796418e-05, + "loss": 0.4733, + "step": 2343 + }, + { + "epoch": 0.27, + "learning_rate": 1.7176920974706318e-05, + "loss": 0.4914, + "step": 2344 + }, + { + "epoch": 0.27, + "learning_rate": 1.7174342798050056e-05, + "loss": 0.4806, + "step": 2345 + }, + { + "epoch": 0.27, + "learning_rate": 1.7171763638348653e-05, + "loss": 0.4793, + "step": 2346 + }, + { + "epoch": 0.27, + "learning_rate": 1.7169183495955516e-05, + "loss": 0.4827, + "step": 2347 + }, + { + "epoch": 0.27, + "learning_rate": 1.7166602371224178e-05, + "loss": 0.4725, + "step": 2348 + }, + { + "epoch": 0.27, + 
"learning_rate": 1.716402026450831e-05, + "loss": 0.4927, + "step": 2349 + }, + { + "epoch": 0.27, + "learning_rate": 1.7161437176161727e-05, + "loss": 0.4618, + "step": 2350 + }, + { + "epoch": 0.27, + "learning_rate": 1.7158853106538358e-05, + "loss": 0.4854, + "step": 2351 + }, + { + "epoch": 0.27, + "learning_rate": 1.7156268055992286e-05, + "loss": 0.4853, + "step": 2352 + }, + { + "epoch": 0.27, + "learning_rate": 1.7153682024877716e-05, + "loss": 0.4795, + "step": 2353 + }, + { + "epoch": 0.27, + "learning_rate": 1.7151095013548996e-05, + "loss": 0.4838, + "step": 2354 + }, + { + "epoch": 0.27, + "learning_rate": 1.7148507022360602e-05, + "loss": 0.4743, + "step": 2355 + }, + { + "epoch": 0.27, + "learning_rate": 1.7145918051667147e-05, + "loss": 0.4669, + "step": 2356 + }, + { + "epoch": 0.27, + "learning_rate": 1.714332810182338e-05, + "loss": 0.4712, + "step": 2357 + }, + { + "epoch": 0.27, + "learning_rate": 1.7140737173184178e-05, + "loss": 0.4703, + "step": 2358 + }, + { + "epoch": 0.27, + "learning_rate": 1.713814526610456e-05, + "loss": 0.48, + "step": 2359 + }, + { + "epoch": 0.27, + "learning_rate": 1.713555238093967e-05, + "loss": 0.4717, + "step": 2360 + }, + { + "epoch": 0.27, + "learning_rate": 1.7132958518044797e-05, + "loss": 0.4669, + "step": 2361 + }, + { + "epoch": 0.27, + "learning_rate": 1.713036367777535e-05, + "loss": 0.4762, + "step": 2362 + }, + { + "epoch": 0.27, + "learning_rate": 1.7127767860486892e-05, + "loss": 0.4835, + "step": 2363 + }, + { + "epoch": 0.27, + "learning_rate": 1.71251710665351e-05, + "loss": 0.4753, + "step": 2364 + }, + { + "epoch": 0.27, + "learning_rate": 1.7122573296275788e-05, + "loss": 0.4621, + "step": 2365 + }, + { + "epoch": 0.27, + "learning_rate": 1.711997455006492e-05, + "loss": 0.4649, + "step": 2366 + }, + { + "epoch": 0.27, + "learning_rate": 1.711737482825858e-05, + "loss": 0.4877, + "step": 2367 + }, + { + "epoch": 0.27, + "learning_rate": 1.7114774131212983e-05, + "loss": 0.4709, + "step": 
2368 + }, + { + "epoch": 0.27, + "learning_rate": 1.7112172459284478e-05, + "loss": 0.4921, + "step": 2369 + }, + { + "epoch": 0.27, + "learning_rate": 1.7109569812829565e-05, + "loss": 0.4739, + "step": 2370 + }, + { + "epoch": 0.27, + "learning_rate": 1.710696619220486e-05, + "loss": 0.4971, + "step": 2371 + }, + { + "epoch": 0.27, + "learning_rate": 1.7104361597767107e-05, + "loss": 0.4782, + "step": 2372 + }, + { + "epoch": 0.27, + "learning_rate": 1.7101756029873208e-05, + "loss": 0.4725, + "step": 2373 + }, + { + "epoch": 0.27, + "learning_rate": 1.7099149488880174e-05, + "loss": 0.4709, + "step": 2374 + }, + { + "epoch": 0.27, + "learning_rate": 1.709654197514517e-05, + "loss": 0.4594, + "step": 2375 + }, + { + "epoch": 0.27, + "learning_rate": 1.709393348902547e-05, + "loss": 0.4697, + "step": 2376 + }, + { + "epoch": 0.27, + "learning_rate": 1.7091324030878504e-05, + "loss": 0.4889, + "step": 2377 + }, + { + "epoch": 0.27, + "learning_rate": 1.7088713601061823e-05, + "loss": 0.4615, + "step": 2378 + }, + { + "epoch": 0.27, + "learning_rate": 1.7086102199933116e-05, + "loss": 0.4891, + "step": 2379 + }, + { + "epoch": 0.27, + "learning_rate": 1.7083489827850202e-05, + "loss": 0.4881, + "step": 2380 + }, + { + "epoch": 0.27, + "learning_rate": 1.7080876485171035e-05, + "loss": 0.4834, + "step": 2381 + }, + { + "epoch": 0.27, + "learning_rate": 1.70782621722537e-05, + "loss": 0.4731, + "step": 2382 + }, + { + "epoch": 0.27, + "learning_rate": 1.7075646889456415e-05, + "loss": 0.5067, + "step": 2383 + }, + { + "epoch": 0.27, + "learning_rate": 1.7073030637137535e-05, + "loss": 0.4691, + "step": 2384 + }, + { + "epoch": 0.27, + "learning_rate": 1.7070413415655548e-05, + "loss": 0.4868, + "step": 2385 + }, + { + "epoch": 0.27, + "learning_rate": 1.7067795225369063e-05, + "loss": 0.4821, + "step": 2386 + }, + { + "epoch": 0.27, + "learning_rate": 1.7065176066636836e-05, + "loss": 0.4938, + "step": 2387 + }, + { + "epoch": 0.27, + "learning_rate": 
1.706255593981775e-05, + "loss": 0.4777, + "step": 2388 + }, + { + "epoch": 0.27, + "learning_rate": 1.7059934845270826e-05, + "loss": 0.5058, + "step": 2389 + }, + { + "epoch": 0.27, + "learning_rate": 1.70573127833552e-05, + "loss": 0.4644, + "step": 2390 + }, + { + "epoch": 0.27, + "learning_rate": 1.705468975443016e-05, + "loss": 0.4899, + "step": 2391 + }, + { + "epoch": 0.27, + "learning_rate": 1.7052065758855123e-05, + "loss": 0.4905, + "step": 2392 + }, + { + "epoch": 0.27, + "learning_rate": 1.704944079698963e-05, + "loss": 0.4652, + "step": 2393 + }, + { + "epoch": 0.27, + "learning_rate": 1.704681486919336e-05, + "loss": 0.4839, + "step": 2394 + }, + { + "epoch": 0.27, + "learning_rate": 1.7044187975826126e-05, + "loss": 0.4743, + "step": 2395 + }, + { + "epoch": 0.27, + "learning_rate": 1.704156011724787e-05, + "loss": 0.4723, + "step": 2396 + }, + { + "epoch": 0.27, + "learning_rate": 1.7038931293818665e-05, + "loss": 0.4885, + "step": 2397 + }, + { + "epoch": 0.27, + "learning_rate": 1.703630150589872e-05, + "loss": 0.471, + "step": 2398 + }, + { + "epoch": 0.27, + "learning_rate": 1.7033670753848373e-05, + "loss": 0.4791, + "step": 2399 + }, + { + "epoch": 0.27, + "learning_rate": 1.7031039038028103e-05, + "loss": 0.4562, + "step": 2400 + }, + { + "epoch": 0.27, + "learning_rate": 1.7028406358798505e-05, + "loss": 0.4891, + "step": 2401 + }, + { + "epoch": 0.27, + "learning_rate": 1.7025772716520324e-05, + "loss": 0.4637, + "step": 2402 + }, + { + "epoch": 0.27, + "learning_rate": 1.7023138111554412e-05, + "loss": 0.4753, + "step": 2403 + }, + { + "epoch": 0.27, + "learning_rate": 1.702050254426179e-05, + "loss": 0.4727, + "step": 2404 + }, + { + "epoch": 0.27, + "learning_rate": 1.701786601500357e-05, + "loss": 0.4795, + "step": 2405 + }, + { + "epoch": 0.27, + "learning_rate": 1.701522852414103e-05, + "loss": 0.4705, + "step": 2406 + }, + { + "epoch": 0.28, + "learning_rate": 1.7012590072035554e-05, + "loss": 0.5054, + "step": 2407 + }, + { + 
"epoch": 0.28, + "learning_rate": 1.7009950659048677e-05, + "loss": 0.4825, + "step": 2408 + }, + { + "epoch": 0.28, + "learning_rate": 1.7007310285542057e-05, + "loss": 0.4799, + "step": 2409 + }, + { + "epoch": 0.28, + "learning_rate": 1.7004668951877475e-05, + "loss": 0.4526, + "step": 2410 + }, + { + "epoch": 0.28, + "learning_rate": 1.7002026658416862e-05, + "loss": 0.4672, + "step": 2411 + }, + { + "epoch": 0.28, + "learning_rate": 1.699938340552227e-05, + "loss": 0.4956, + "step": 2412 + }, + { + "epoch": 0.28, + "learning_rate": 1.699673919355588e-05, + "loss": 0.4727, + "step": 2413 + }, + { + "epoch": 0.28, + "learning_rate": 1.699409402288001e-05, + "loss": 0.4746, + "step": 2414 + }, + { + "epoch": 0.28, + "learning_rate": 1.699144789385711e-05, + "loss": 0.4814, + "step": 2415 + }, + { + "epoch": 0.28, + "learning_rate": 1.6988800806849754e-05, + "loss": 0.4645, + "step": 2416 + }, + { + "epoch": 0.28, + "learning_rate": 1.6986152762220655e-05, + "loss": 0.4808, + "step": 2417 + }, + { + "epoch": 0.28, + "learning_rate": 1.6983503760332653e-05, + "loss": 0.4715, + "step": 2418 + }, + { + "epoch": 0.28, + "learning_rate": 1.698085380154872e-05, + "loss": 0.4588, + "step": 2419 + }, + { + "epoch": 0.28, + "learning_rate": 1.6978202886231963e-05, + "loss": 0.4597, + "step": 2420 + }, + { + "epoch": 0.28, + "learning_rate": 1.6975551014745614e-05, + "loss": 0.5044, + "step": 2421 + }, + { + "epoch": 0.28, + "learning_rate": 1.697289818745304e-05, + "loss": 0.463, + "step": 2422 + }, + { + "epoch": 0.28, + "learning_rate": 1.6970244404717732e-05, + "loss": 0.481, + "step": 2423 + }, + { + "epoch": 0.28, + "learning_rate": 1.6967589666903324e-05, + "loss": 0.4816, + "step": 2424 + }, + { + "epoch": 0.28, + "learning_rate": 1.696493397437357e-05, + "loss": 0.4928, + "step": 2425 + }, + { + "epoch": 0.28, + "learning_rate": 1.6962277327492366e-05, + "loss": 0.4689, + "step": 2426 + }, + { + "epoch": 0.28, + "learning_rate": 1.6959619726623722e-05, + "loss": 
0.4803, + "step": 2427 + }, + { + "epoch": 0.28, + "learning_rate": 1.6956961172131796e-05, + "loss": 0.4684, + "step": 2428 + }, + { + "epoch": 0.28, + "learning_rate": 1.6954301664380867e-05, + "loss": 0.4796, + "step": 2429 + }, + { + "epoch": 0.28, + "learning_rate": 1.6951641203735345e-05, + "loss": 0.4839, + "step": 2430 + }, + { + "epoch": 0.28, + "learning_rate": 1.694897979055978e-05, + "loss": 0.4781, + "step": 2431 + }, + { + "epoch": 0.28, + "learning_rate": 1.6946317425218834e-05, + "loss": 0.4846, + "step": 2432 + }, + { + "epoch": 0.28, + "learning_rate": 1.6943654108077317e-05, + "loss": 0.4773, + "step": 2433 + }, + { + "epoch": 0.28, + "learning_rate": 1.6940989839500167e-05, + "loss": 0.4712, + "step": 2434 + }, + { + "epoch": 0.28, + "learning_rate": 1.6938324619852435e-05, + "loss": 0.4936, + "step": 2435 + }, + { + "epoch": 0.28, + "learning_rate": 1.693565844949933e-05, + "loss": 0.4735, + "step": 2436 + }, + { + "epoch": 0.28, + "learning_rate": 1.693299132880617e-05, + "loss": 0.4663, + "step": 2437 + }, + { + "epoch": 0.28, + "learning_rate": 1.693032325813841e-05, + "loss": 0.488, + "step": 2438 + }, + { + "epoch": 0.28, + "learning_rate": 1.6927654237861635e-05, + "loss": 0.4665, + "step": 2439 + }, + { + "epoch": 0.28, + "learning_rate": 1.6924984268341563e-05, + "loss": 0.4767, + "step": 2440 + }, + { + "epoch": 0.28, + "learning_rate": 1.6922313349944037e-05, + "loss": 0.4821, + "step": 2441 + }, + { + "epoch": 0.28, + "learning_rate": 1.6919641483035035e-05, + "loss": 0.4513, + "step": 2442 + }, + { + "epoch": 0.28, + "learning_rate": 1.6916968667980658e-05, + "loss": 0.4734, + "step": 2443 + }, + { + "epoch": 0.28, + "learning_rate": 1.6914294905147144e-05, + "loss": 0.4782, + "step": 2444 + }, + { + "epoch": 0.28, + "learning_rate": 1.6911620194900862e-05, + "loss": 0.4684, + "step": 2445 + }, + { + "epoch": 0.28, + "learning_rate": 1.6908944537608302e-05, + "loss": 0.4643, + "step": 2446 + }, + { + "epoch": 0.28, + 
"learning_rate": 1.6906267933636087e-05, + "loss": 0.4868, + "step": 2447 + }, + { + "epoch": 0.28, + "learning_rate": 1.6903590383350975e-05, + "loss": 0.4797, + "step": 2448 + }, + { + "epoch": 0.28, + "learning_rate": 1.690091188711985e-05, + "loss": 0.4894, + "step": 2449 + }, + { + "epoch": 0.28, + "learning_rate": 1.689823244530973e-05, + "loss": 0.4657, + "step": 2450 + }, + { + "epoch": 0.28, + "learning_rate": 1.6895552058287752e-05, + "loss": 0.4817, + "step": 2451 + }, + { + "epoch": 0.28, + "learning_rate": 1.689287072642119e-05, + "loss": 0.4785, + "step": 2452 + }, + { + "epoch": 0.28, + "learning_rate": 1.6890188450077445e-05, + "loss": 0.4668, + "step": 2453 + }, + { + "epoch": 0.28, + "learning_rate": 1.6887505229624053e-05, + "loss": 0.4684, + "step": 2454 + }, + { + "epoch": 0.28, + "learning_rate": 1.6884821065428673e-05, + "loss": 0.4807, + "step": 2455 + }, + { + "epoch": 0.28, + "learning_rate": 1.6882135957859095e-05, + "loss": 0.4894, + "step": 2456 + }, + { + "epoch": 0.28, + "learning_rate": 1.6879449907283238e-05, + "loss": 0.4591, + "step": 2457 + }, + { + "epoch": 0.28, + "learning_rate": 1.6876762914069154e-05, + "loss": 0.4734, + "step": 2458 + }, + { + "epoch": 0.28, + "learning_rate": 1.6874074978585018e-05, + "loss": 0.4596, + "step": 2459 + }, + { + "epoch": 0.28, + "learning_rate": 1.687138610119914e-05, + "loss": 0.4737, + "step": 2460 + }, + { + "epoch": 0.28, + "learning_rate": 1.686869628227995e-05, + "loss": 0.5029, + "step": 2461 + }, + { + "epoch": 0.28, + "learning_rate": 1.686600552219602e-05, + "loss": 0.4675, + "step": 2462 + }, + { + "epoch": 0.28, + "learning_rate": 1.686331382131604e-05, + "loss": 0.4601, + "step": 2463 + }, + { + "epoch": 0.28, + "learning_rate": 1.686062118000884e-05, + "loss": 0.4806, + "step": 2464 + }, + { + "epoch": 0.28, + "learning_rate": 1.6857927598643362e-05, + "loss": 0.4758, + "step": 2465 + }, + { + "epoch": 0.28, + "learning_rate": 1.6855233077588697e-05, + "loss": 0.503, + "step": 
2466 + }, + { + "epoch": 0.28, + "learning_rate": 1.6852537617214043e-05, + "loss": 0.4894, + "step": 2467 + }, + { + "epoch": 0.28, + "learning_rate": 1.6849841217888748e-05, + "loss": 0.4551, + "step": 2468 + }, + { + "epoch": 0.28, + "learning_rate": 1.6847143879982276e-05, + "loss": 0.4697, + "step": 2469 + }, + { + "epoch": 0.28, + "learning_rate": 1.684444560386422e-05, + "loss": 0.4702, + "step": 2470 + }, + { + "epoch": 0.28, + "learning_rate": 1.6841746389904306e-05, + "loss": 0.4724, + "step": 2471 + }, + { + "epoch": 0.28, + "learning_rate": 1.6839046238472387e-05, + "loss": 0.49, + "step": 2472 + }, + { + "epoch": 0.28, + "learning_rate": 1.6836345149938445e-05, + "loss": 0.4787, + "step": 2473 + }, + { + "epoch": 0.28, + "learning_rate": 1.6833643124672586e-05, + "loss": 0.4715, + "step": 2474 + }, + { + "epoch": 0.28, + "learning_rate": 1.683094016304505e-05, + "loss": 0.4781, + "step": 2475 + }, + { + "epoch": 0.28, + "learning_rate": 1.6828236265426205e-05, + "loss": 0.4622, + "step": 2476 + }, + { + "epoch": 0.28, + "learning_rate": 1.6825531432186545e-05, + "loss": 0.4672, + "step": 2477 + }, + { + "epoch": 0.28, + "learning_rate": 1.6822825663696683e-05, + "loss": 0.4731, + "step": 2478 + }, + { + "epoch": 0.28, + "learning_rate": 1.6820118960327386e-05, + "loss": 0.4835, + "step": 2479 + }, + { + "epoch": 0.28, + "learning_rate": 1.681741132244952e-05, + "loss": 0.4742, + "step": 2480 + }, + { + "epoch": 0.28, + "learning_rate": 1.6814702750434097e-05, + "loss": 0.4779, + "step": 2481 + }, + { + "epoch": 0.28, + "learning_rate": 1.6811993244652248e-05, + "loss": 0.4848, + "step": 2482 + }, + { + "epoch": 0.28, + "learning_rate": 1.6809282805475243e-05, + "loss": 0.4667, + "step": 2483 + }, + { + "epoch": 0.28, + "learning_rate": 1.680657143327447e-05, + "loss": 0.479, + "step": 2484 + }, + { + "epoch": 0.28, + "learning_rate": 1.680385912842144e-05, + "loss": 0.466, + "step": 2485 + }, + { + "epoch": 0.28, + "learning_rate": 
1.680114589128781e-05, + "loss": 0.4768, + "step": 2486 + }, + { + "epoch": 0.28, + "learning_rate": 1.6798431722245346e-05, + "loss": 0.4632, + "step": 2487 + }, + { + "epoch": 0.28, + "learning_rate": 1.6795716621665957e-05, + "loss": 0.5026, + "step": 2488 + }, + { + "epoch": 0.28, + "learning_rate": 1.6793000589921666e-05, + "loss": 0.4801, + "step": 2489 + }, + { + "epoch": 0.28, + "learning_rate": 1.6790283627384633e-05, + "loss": 0.4888, + "step": 2490 + }, + { + "epoch": 0.28, + "learning_rate": 1.6787565734427143e-05, + "loss": 0.4666, + "step": 2491 + }, + { + "epoch": 0.28, + "learning_rate": 1.6784846911421605e-05, + "loss": 0.4721, + "step": 2492 + }, + { + "epoch": 0.28, + "learning_rate": 1.678212715874056e-05, + "loss": 0.4713, + "step": 2493 + }, + { + "epoch": 0.29, + "learning_rate": 1.677940647675668e-05, + "loss": 0.4702, + "step": 2494 + }, + { + "epoch": 0.29, + "learning_rate": 1.6776684865842748e-05, + "loss": 0.4716, + "step": 2495 + }, + { + "epoch": 0.29, + "learning_rate": 1.6773962326371696e-05, + "loss": 0.4623, + "step": 2496 + }, + { + "epoch": 0.29, + "learning_rate": 1.677123885871657e-05, + "loss": 0.4892, + "step": 2497 + }, + { + "epoch": 0.29, + "learning_rate": 1.6768514463250544e-05, + "loss": 0.4916, + "step": 2498 + }, + { + "epoch": 0.29, + "learning_rate": 1.6765789140346916e-05, + "loss": 0.4808, + "step": 2499 + }, + { + "epoch": 0.29, + "learning_rate": 1.6763062890379128e-05, + "loss": 0.4719, + "step": 2500 + }, + { + "epoch": 0.29, + "learning_rate": 1.6760335713720727e-05, + "loss": 0.4711, + "step": 2501 + }, + { + "epoch": 0.29, + "learning_rate": 1.6757607610745405e-05, + "loss": 0.4837, + "step": 2502 + }, + { + "epoch": 0.29, + "learning_rate": 1.6754878581826966e-05, + "loss": 0.4707, + "step": 2503 + }, + { + "epoch": 0.29, + "learning_rate": 1.675214862733935e-05, + "loss": 0.4648, + "step": 2504 + }, + { + "epoch": 0.29, + "learning_rate": 1.674941774765662e-05, + "loss": 0.4742, + "step": 2505 + }, + { + 
"epoch": 0.29, + "learning_rate": 1.6746685943152975e-05, + "loss": 0.4896, + "step": 2506 + }, + { + "epoch": 0.29, + "learning_rate": 1.674395321420273e-05, + "loss": 0.4759, + "step": 2507 + }, + { + "epoch": 0.29, + "learning_rate": 1.674121956118032e-05, + "loss": 0.4774, + "step": 2508 + }, + { + "epoch": 0.29, + "learning_rate": 1.673848498446033e-05, + "loss": 0.4727, + "step": 2509 + }, + { + "epoch": 0.29, + "learning_rate": 1.6735749484417452e-05, + "loss": 0.4816, + "step": 2510 + }, + { + "epoch": 0.29, + "learning_rate": 1.673301306142651e-05, + "loss": 0.475, + "step": 2511 + }, + { + "epoch": 0.29, + "learning_rate": 1.6730275715862455e-05, + "loss": 0.4677, + "step": 2512 + }, + { + "epoch": 0.29, + "learning_rate": 1.672753744810037e-05, + "loss": 0.4701, + "step": 2513 + }, + { + "epoch": 0.29, + "learning_rate": 1.6724798258515452e-05, + "loss": 0.4852, + "step": 2514 + }, + { + "epoch": 0.29, + "learning_rate": 1.6722058147483034e-05, + "loss": 0.49, + "step": 2515 + }, + { + "epoch": 0.29, + "learning_rate": 1.671931711537857e-05, + "loss": 0.4704, + "step": 2516 + }, + { + "epoch": 0.29, + "learning_rate": 1.6716575162577647e-05, + "loss": 0.4733, + "step": 2517 + }, + { + "epoch": 0.29, + "learning_rate": 1.671383228945597e-05, + "loss": 0.4668, + "step": 2518 + }, + { + "epoch": 0.29, + "learning_rate": 1.6711088496389375e-05, + "loss": 0.4812, + "step": 2519 + }, + { + "epoch": 0.29, + "learning_rate": 1.6708343783753824e-05, + "loss": 0.4573, + "step": 2520 + }, + { + "epoch": 0.29, + "learning_rate": 1.67055981519254e-05, + "loss": 0.485, + "step": 2521 + }, + { + "epoch": 0.29, + "learning_rate": 1.6702851601280322e-05, + "loss": 0.4801, + "step": 2522 + }, + { + "epoch": 0.29, + "learning_rate": 1.6700104132194925e-05, + "loss": 0.4627, + "step": 2523 + }, + { + "epoch": 0.29, + "learning_rate": 1.6697355745045678e-05, + "loss": 0.479, + "step": 2524 + }, + { + "epoch": 0.29, + "learning_rate": 1.6694606440209163e-05, + "loss": 0.4802, 
+ "step": 2525 + }, + { + "epoch": 0.29, + "learning_rate": 1.6691856218062105e-05, + "loss": 0.4808, + "step": 2526 + }, + { + "epoch": 0.29, + "learning_rate": 1.6689105078981333e-05, + "loss": 0.4646, + "step": 2527 + }, + { + "epoch": 0.29, + "learning_rate": 1.668635302334383e-05, + "loss": 0.48, + "step": 2528 + }, + { + "epoch": 0.29, + "learning_rate": 1.6683600051526682e-05, + "loss": 0.4655, + "step": 2529 + }, + { + "epoch": 0.29, + "learning_rate": 1.6680846163907107e-05, + "loss": 0.4952, + "step": 2530 + }, + { + "epoch": 0.29, + "learning_rate": 1.6678091360862447e-05, + "loss": 0.4787, + "step": 2531 + }, + { + "epoch": 0.29, + "learning_rate": 1.6675335642770178e-05, + "loss": 0.4895, + "step": 2532 + }, + { + "epoch": 0.29, + "learning_rate": 1.667257901000789e-05, + "loss": 0.4627, + "step": 2533 + }, + { + "epoch": 0.29, + "learning_rate": 1.6669821462953303e-05, + "loss": 0.4718, + "step": 2534 + }, + { + "epoch": 0.29, + "learning_rate": 1.6667063001984267e-05, + "loss": 0.4801, + "step": 2535 + }, + { + "epoch": 0.29, + "learning_rate": 1.6664303627478745e-05, + "loss": 0.4728, + "step": 2536 + }, + { + "epoch": 0.29, + "learning_rate": 1.6661543339814847e-05, + "loss": 0.4682, + "step": 2537 + }, + { + "epoch": 0.29, + "learning_rate": 1.6658782139370775e-05, + "loss": 0.4591, + "step": 2538 + }, + { + "epoch": 0.29, + "learning_rate": 1.6656020026524887e-05, + "loss": 0.4752, + "step": 2539 + }, + { + "epoch": 0.29, + "learning_rate": 1.6653257001655652e-05, + "loss": 0.4631, + "step": 2540 + }, + { + "epoch": 0.29, + "learning_rate": 1.6650493065141672e-05, + "loss": 0.4763, + "step": 2541 + }, + { + "epoch": 0.29, + "learning_rate": 1.6647728217361658e-05, + "loss": 0.4713, + "step": 2542 + }, + { + "epoch": 0.29, + "learning_rate": 1.6644962458694457e-05, + "loss": 0.4599, + "step": 2543 + }, + { + "epoch": 0.29, + "learning_rate": 1.6642195789519045e-05, + "loss": 0.4702, + "step": 2544 + }, + { + "epoch": 0.29, + "learning_rate": 
1.6639428210214514e-05, + "loss": 0.4738, + "step": 2545 + }, + { + "epoch": 0.29, + "learning_rate": 1.6636659721160088e-05, + "loss": 0.4827, + "step": 2546 + }, + { + "epoch": 0.29, + "learning_rate": 1.6633890322735107e-05, + "loss": 0.4857, + "step": 2547 + }, + { + "epoch": 0.29, + "learning_rate": 1.6631120015319044e-05, + "loss": 0.4765, + "step": 2548 + }, + { + "epoch": 0.29, + "learning_rate": 1.662834879929149e-05, + "loss": 0.4676, + "step": 2549 + }, + { + "epoch": 0.29, + "learning_rate": 1.6625576675032163e-05, + "loss": 0.4788, + "step": 2550 + }, + { + "epoch": 0.29, + "learning_rate": 1.6622803642920912e-05, + "loss": 0.4827, + "step": 2551 + }, + { + "epoch": 0.29, + "learning_rate": 1.6620029703337697e-05, + "loss": 0.469, + "step": 2552 + }, + { + "epoch": 0.29, + "learning_rate": 1.6617254856662613e-05, + "loss": 0.4874, + "step": 2553 + }, + { + "epoch": 0.29, + "learning_rate": 1.6614479103275875e-05, + "loss": 0.4735, + "step": 2554 + }, + { + "epoch": 0.29, + "learning_rate": 1.6611702443557826e-05, + "loss": 0.4696, + "step": 2555 + }, + { + "epoch": 0.29, + "learning_rate": 1.6608924877888926e-05, + "loss": 0.4801, + "step": 2556 + }, + { + "epoch": 0.29, + "learning_rate": 1.6606146406649767e-05, + "loss": 0.471, + "step": 2557 + }, + { + "epoch": 0.29, + "learning_rate": 1.660336703022106e-05, + "loss": 0.4862, + "step": 2558 + }, + { + "epoch": 0.29, + "learning_rate": 1.6600586748983642e-05, + "loss": 0.4797, + "step": 2559 + }, + { + "epoch": 0.29, + "learning_rate": 1.6597805563318475e-05, + "loss": 0.4754, + "step": 2560 + }, + { + "epoch": 0.29, + "learning_rate": 1.659502347360664e-05, + "loss": 0.4828, + "step": 2561 + }, + { + "epoch": 0.29, + "learning_rate": 1.659224048022935e-05, + "loss": 0.4729, + "step": 2562 + }, + { + "epoch": 0.29, + "learning_rate": 1.6589456583567934e-05, + "loss": 0.4708, + "step": 2563 + }, + { + "epoch": 0.29, + "learning_rate": 1.6586671784003846e-05, + "loss": 0.4831, + "step": 2564 + }, + { + 
"epoch": 0.29, + "learning_rate": 1.658388608191867e-05, + "loss": 0.4759, + "step": 2565 + }, + { + "epoch": 0.29, + "learning_rate": 1.658109947769411e-05, + "loss": 0.4712, + "step": 2566 + }, + { + "epoch": 0.29, + "learning_rate": 1.657831197171199e-05, + "loss": 0.4785, + "step": 2567 + }, + { + "epoch": 0.29, + "learning_rate": 1.657552356435426e-05, + "loss": 0.4917, + "step": 2568 + }, + { + "epoch": 0.29, + "learning_rate": 1.6572734256002997e-05, + "loss": 0.4723, + "step": 2569 + }, + { + "epoch": 0.29, + "learning_rate": 1.6569944047040394e-05, + "loss": 0.4645, + "step": 2570 + }, + { + "epoch": 0.29, + "learning_rate": 1.6567152937848776e-05, + "loss": 0.483, + "step": 2571 + }, + { + "epoch": 0.29, + "learning_rate": 1.6564360928810588e-05, + "loss": 0.4732, + "step": 2572 + }, + { + "epoch": 0.29, + "learning_rate": 1.6561568020308397e-05, + "loss": 0.48, + "step": 2573 + }, + { + "epoch": 0.29, + "learning_rate": 1.6558774212724888e-05, + "loss": 0.48, + "step": 2574 + }, + { + "epoch": 0.29, + "learning_rate": 1.655597950644288e-05, + "loss": 0.4795, + "step": 2575 + }, + { + "epoch": 0.29, + "learning_rate": 1.6553183901845313e-05, + "loss": 0.475, + "step": 2576 + }, + { + "epoch": 0.29, + "learning_rate": 1.6550387399315246e-05, + "loss": 0.4733, + "step": 2577 + }, + { + "epoch": 0.29, + "learning_rate": 1.6547589999235854e-05, + "loss": 0.4748, + "step": 2578 + }, + { + "epoch": 0.29, + "learning_rate": 1.654479170199045e-05, + "loss": 0.467, + "step": 2579 + }, + { + "epoch": 0.29, + "learning_rate": 1.6541992507962467e-05, + "loss": 0.4737, + "step": 2580 + }, + { + "epoch": 0.29, + "learning_rate": 1.653919241753545e-05, + "loss": 0.4803, + "step": 2581 + }, + { + "epoch": 0.3, + "learning_rate": 1.6536391431093077e-05, + "loss": 0.4576, + "step": 2582 + }, + { + "epoch": 0.3, + "learning_rate": 1.6533589549019147e-05, + "loss": 0.4855, + "step": 2583 + }, + { + "epoch": 0.3, + "learning_rate": 1.6530786771697575e-05, + "loss": 0.4809, + 
"step": 2584 + }, + { + "epoch": 0.3, + "learning_rate": 1.6527983099512414e-05, + "loss": 0.4582, + "step": 2585 + }, + { + "epoch": 0.3, + "learning_rate": 1.6525178532847816e-05, + "loss": 0.4754, + "step": 2586 + }, + { + "epoch": 0.3, + "learning_rate": 1.6522373072088083e-05, + "loss": 0.4623, + "step": 2587 + }, + { + "epoch": 0.3, + "learning_rate": 1.6519566717617616e-05, + "loss": 0.4678, + "step": 2588 + }, + { + "epoch": 0.3, + "learning_rate": 1.6516759469820955e-05, + "loss": 0.4837, + "step": 2589 + }, + { + "epoch": 0.3, + "learning_rate": 1.6513951329082746e-05, + "loss": 0.4858, + "step": 2590 + }, + { + "epoch": 0.3, + "learning_rate": 1.651114229578778e-05, + "loss": 0.4732, + "step": 2591 + }, + { + "epoch": 0.3, + "learning_rate": 1.6508332370320948e-05, + "loss": 0.4903, + "step": 2592 + }, + { + "epoch": 0.3, + "learning_rate": 1.6505521553067273e-05, + "loss": 0.4576, + "step": 2593 + }, + { + "epoch": 0.3, + "learning_rate": 1.6502709844411907e-05, + "loss": 0.4749, + "step": 2594 + }, + { + "epoch": 0.3, + "learning_rate": 1.6499897244740107e-05, + "loss": 0.4708, + "step": 2595 + }, + { + "epoch": 0.3, + "learning_rate": 1.649708375443727e-05, + "loss": 0.4769, + "step": 2596 + }, + { + "epoch": 0.3, + "learning_rate": 1.6494269373888902e-05, + "loss": 0.4642, + "step": 2597 + }, + { + "epoch": 0.3, + "learning_rate": 1.6491454103480637e-05, + "loss": 0.4851, + "step": 2598 + }, + { + "epoch": 0.3, + "learning_rate": 1.6488637943598235e-05, + "loss": 0.4549, + "step": 2599 + }, + { + "epoch": 0.3, + "learning_rate": 1.648582089462756e-05, + "loss": 0.5042, + "step": 2600 + }, + { + "epoch": 0.3, + "learning_rate": 1.6483002956954622e-05, + "loss": 0.4748, + "step": 2601 + }, + { + "epoch": 0.3, + "learning_rate": 1.6480184130965542e-05, + "loss": 0.4497, + "step": 2602 + }, + { + "epoch": 0.3, + "learning_rate": 1.647736441704656e-05, + "loss": 0.4645, + "step": 2603 + }, + { + "epoch": 0.3, + "learning_rate": 1.647454381558403e-05, + 
"loss": 0.4807, + "step": 2604 + }, + { + "epoch": 0.3, + "learning_rate": 1.647172232696445e-05, + "loss": 0.4787, + "step": 2605 + }, + { + "epoch": 0.3, + "learning_rate": 1.6468899951574423e-05, + "loss": 0.4748, + "step": 2606 + }, + { + "epoch": 0.3, + "learning_rate": 1.6466076689800677e-05, + "loss": 0.4747, + "step": 2607 + }, + { + "epoch": 0.3, + "learning_rate": 1.6463252542030058e-05, + "loss": 0.4748, + "step": 2608 + }, + { + "epoch": 0.3, + "learning_rate": 1.6460427508649546e-05, + "loss": 0.4958, + "step": 2609 + }, + { + "epoch": 0.3, + "learning_rate": 1.6457601590046227e-05, + "loss": 0.454, + "step": 2610 + }, + { + "epoch": 0.3, + "learning_rate": 1.6454774786607317e-05, + "loss": 0.4737, + "step": 2611 + }, + { + "epoch": 0.3, + "learning_rate": 1.6451947098720148e-05, + "loss": 0.4823, + "step": 2612 + }, + { + "epoch": 0.3, + "learning_rate": 1.6449118526772183e-05, + "loss": 0.4601, + "step": 2613 + }, + { + "epoch": 0.3, + "learning_rate": 1.6446289071150993e-05, + "loss": 0.4766, + "step": 2614 + }, + { + "epoch": 0.3, + "learning_rate": 1.644345873224428e-05, + "loss": 0.4621, + "step": 2615 + }, + { + "epoch": 0.3, + "learning_rate": 1.6440627510439862e-05, + "loss": 0.4784, + "step": 2616 + }, + { + "epoch": 0.3, + "learning_rate": 1.6437795406125684e-05, + "loss": 0.4711, + "step": 2617 + }, + { + "epoch": 0.3, + "learning_rate": 1.6434962419689803e-05, + "loss": 0.4825, + "step": 2618 + }, + { + "epoch": 0.3, + "learning_rate": 1.64321285515204e-05, + "loss": 0.4636, + "step": 2619 + }, + { + "epoch": 0.3, + "learning_rate": 1.6429293802005783e-05, + "loss": 0.4749, + "step": 2620 + }, + { + "epoch": 0.3, + "learning_rate": 1.642645817153437e-05, + "loss": 0.4664, + "step": 2621 + }, + { + "epoch": 0.3, + "learning_rate": 1.6423621660494714e-05, + "loss": 0.4854, + "step": 2622 + }, + { + "epoch": 0.3, + "learning_rate": 1.6420784269275474e-05, + "loss": 0.4695, + "step": 2623 + }, + { + "epoch": 0.3, + "learning_rate": 
1.6417945998265436e-05, + "loss": 0.4813, + "step": 2624 + }, + { + "epoch": 0.3, + "learning_rate": 1.641510684785351e-05, + "loss": 0.464, + "step": 2625 + }, + { + "epoch": 0.3, + "learning_rate": 1.641226681842872e-05, + "loss": 0.4837, + "step": 2626 + }, + { + "epoch": 0.3, + "learning_rate": 1.6409425910380215e-05, + "loss": 0.4592, + "step": 2627 + }, + { + "epoch": 0.3, + "learning_rate": 1.640658412409726e-05, + "loss": 0.4575, + "step": 2628 + }, + { + "epoch": 0.3, + "learning_rate": 1.640374145996925e-05, + "loss": 0.4661, + "step": 2629 + }, + { + "epoch": 0.3, + "learning_rate": 1.6400897918385687e-05, + "loss": 0.501, + "step": 2630 + }, + { + "epoch": 0.3, + "learning_rate": 1.63980534997362e-05, + "loss": 0.4592, + "step": 2631 + }, + { + "epoch": 0.3, + "learning_rate": 1.639520820441054e-05, + "loss": 0.4907, + "step": 2632 + }, + { + "epoch": 0.3, + "learning_rate": 1.6392362032798578e-05, + "loss": 0.4751, + "step": 2633 + }, + { + "epoch": 0.3, + "learning_rate": 1.63895149852903e-05, + "loss": 0.4735, + "step": 2634 + }, + { + "epoch": 0.3, + "learning_rate": 1.6386667062275817e-05, + "loss": 0.4791, + "step": 2635 + }, + { + "epoch": 0.3, + "learning_rate": 1.638381826414535e-05, + "loss": 0.476, + "step": 2636 + }, + { + "epoch": 0.3, + "learning_rate": 1.638096859128926e-05, + "loss": 0.453, + "step": 2637 + }, + { + "epoch": 0.3, + "learning_rate": 1.637811804409801e-05, + "loss": 0.4987, + "step": 2638 + }, + { + "epoch": 0.3, + "learning_rate": 1.6375266622962188e-05, + "loss": 0.4523, + "step": 2639 + }, + { + "epoch": 0.3, + "learning_rate": 1.6372414328272502e-05, + "loss": 0.4917, + "step": 2640 + }, + { + "epoch": 0.3, + "learning_rate": 1.6369561160419783e-05, + "loss": 0.4654, + "step": 2641 + }, + { + "epoch": 0.3, + "learning_rate": 1.6366707119794978e-05, + "loss": 0.4869, + "step": 2642 + }, + { + "epoch": 0.3, + "learning_rate": 1.6363852206789155e-05, + "loss": 0.4597, + "step": 2643 + }, + { + "epoch": 0.3, + 
"learning_rate": 1.6360996421793497e-05, + "loss": 0.4904, + "step": 2644 + }, + { + "epoch": 0.3, + "learning_rate": 1.635813976519931e-05, + "loss": 0.4572, + "step": 2645 + }, + { + "epoch": 0.3, + "learning_rate": 1.6355282237398026e-05, + "loss": 0.4793, + "step": 2646 + }, + { + "epoch": 0.3, + "learning_rate": 1.6352423838781185e-05, + "loss": 0.4746, + "step": 2647 + }, + { + "epoch": 0.3, + "learning_rate": 1.6349564569740454e-05, + "loss": 0.4728, + "step": 2648 + }, + { + "epoch": 0.3, + "learning_rate": 1.6346704430667612e-05, + "loss": 0.477, + "step": 2649 + }, + { + "epoch": 0.3, + "learning_rate": 1.6343843421954567e-05, + "loss": 0.4799, + "step": 2650 + }, + { + "epoch": 0.3, + "learning_rate": 1.634098154399334e-05, + "loss": 0.4626, + "step": 2651 + }, + { + "epoch": 0.3, + "learning_rate": 1.6338118797176074e-05, + "loss": 0.4803, + "step": 2652 + }, + { + "epoch": 0.3, + "learning_rate": 1.6335255181895026e-05, + "loss": 0.482, + "step": 2653 + }, + { + "epoch": 0.3, + "learning_rate": 1.633239069854257e-05, + "loss": 0.4549, + "step": 2654 + }, + { + "epoch": 0.3, + "learning_rate": 1.632952534751122e-05, + "loss": 0.4724, + "step": 2655 + }, + { + "epoch": 0.3, + "learning_rate": 1.6326659129193577e-05, + "loss": 0.4907, + "step": 2656 + }, + { + "epoch": 0.3, + "learning_rate": 1.632379204398238e-05, + "loss": 0.4783, + "step": 2657 + }, + { + "epoch": 0.3, + "learning_rate": 1.6320924092270494e-05, + "loss": 0.5081, + "step": 2658 + }, + { + "epoch": 0.3, + "learning_rate": 1.6318055274450885e-05, + "loss": 0.4605, + "step": 2659 + }, + { + "epoch": 0.3, + "learning_rate": 1.6315185590916644e-05, + "loss": 0.4626, + "step": 2660 + }, + { + "epoch": 0.3, + "learning_rate": 1.6312315042060984e-05, + "loss": 0.4659, + "step": 2661 + }, + { + "epoch": 0.3, + "learning_rate": 1.630944362827723e-05, + "loss": 0.4902, + "step": 2662 + }, + { + "epoch": 0.3, + "learning_rate": 1.6306571349958833e-05, + "loss": 0.4671, + "step": 2663 + }, + { + 
"epoch": 0.3, + "learning_rate": 1.6303698207499364e-05, + "loss": 0.4744, + "step": 2664 + }, + { + "epoch": 0.3, + "learning_rate": 1.63008242012925e-05, + "loss": 0.4611, + "step": 2665 + }, + { + "epoch": 0.3, + "learning_rate": 1.6297949331732047e-05, + "loss": 0.5038, + "step": 2666 + }, + { + "epoch": 0.3, + "learning_rate": 1.629507359921193e-05, + "loss": 0.4835, + "step": 2667 + }, + { + "epoch": 0.3, + "learning_rate": 1.6292197004126184e-05, + "loss": 0.4742, + "step": 2668 + }, + { + "epoch": 0.31, + "learning_rate": 1.6289319546868966e-05, + "loss": 0.4685, + "step": 2669 + }, + { + "epoch": 0.31, + "learning_rate": 1.6286441227834552e-05, + "loss": 0.4954, + "step": 2670 + }, + { + "epoch": 0.31, + "learning_rate": 1.6283562047417342e-05, + "loss": 0.4682, + "step": 2671 + }, + { + "epoch": 0.31, + "learning_rate": 1.628068200601184e-05, + "loss": 0.4653, + "step": 2672 + }, + { + "epoch": 0.31, + "learning_rate": 1.627780110401268e-05, + "loss": 0.4799, + "step": 2673 + }, + { + "epoch": 0.31, + "learning_rate": 1.6274919341814607e-05, + "loss": 0.4749, + "step": 2674 + }, + { + "epoch": 0.31, + "learning_rate": 1.6272036719812496e-05, + "loss": 0.4631, + "step": 2675 + }, + { + "epoch": 0.31, + "learning_rate": 1.6269153238401317e-05, + "loss": 0.4921, + "step": 2676 + }, + { + "epoch": 0.31, + "learning_rate": 1.626626889797618e-05, + "loss": 0.4678, + "step": 2677 + }, + { + "epoch": 0.31, + "learning_rate": 1.6263383698932307e-05, + "loss": 0.4558, + "step": 2678 + }, + { + "epoch": 0.31, + "learning_rate": 1.6260497641665028e-05, + "loss": 0.4778, + "step": 2679 + }, + { + "epoch": 0.31, + "learning_rate": 1.6257610726569798e-05, + "loss": 0.4876, + "step": 2680 + }, + { + "epoch": 0.31, + "learning_rate": 1.625472295404219e-05, + "loss": 0.4687, + "step": 2681 + }, + { + "epoch": 0.31, + "learning_rate": 1.625183432447789e-05, + "loss": 0.4772, + "step": 2682 + }, + { + "epoch": 0.31, + "learning_rate": 1.6248944838272712e-05, + "loss": 0.46, 
+ "step": 2683 + }, + { + "epoch": 0.31, + "learning_rate": 1.6246054495822575e-05, + "loss": 0.4925, + "step": 2684 + }, + { + "epoch": 0.31, + "learning_rate": 1.6243163297523524e-05, + "loss": 0.4656, + "step": 2685 + }, + { + "epoch": 0.31, + "learning_rate": 1.6240271243771713e-05, + "loss": 0.4525, + "step": 2686 + }, + { + "epoch": 0.31, + "learning_rate": 1.6237378334963422e-05, + "loss": 0.4818, + "step": 2687 + }, + { + "epoch": 0.31, + "learning_rate": 1.623448457149504e-05, + "loss": 0.4646, + "step": 2688 + }, + { + "epoch": 0.31, + "learning_rate": 1.623158995376308e-05, + "loss": 0.4739, + "step": 2689 + }, + { + "epoch": 0.31, + "learning_rate": 1.6228694482164167e-05, + "loss": 0.4613, + "step": 2690 + }, + { + "epoch": 0.31, + "learning_rate": 1.622579815709505e-05, + "loss": 0.5051, + "step": 2691 + }, + { + "epoch": 0.31, + "learning_rate": 1.6222900978952586e-05, + "loss": 0.4472, + "step": 2692 + }, + { + "epoch": 0.31, + "learning_rate": 1.6220002948133756e-05, + "loss": 0.5041, + "step": 2693 + }, + { + "epoch": 0.31, + "learning_rate": 1.6217104065035652e-05, + "loss": 0.4845, + "step": 2694 + }, + { + "epoch": 0.31, + "learning_rate": 1.6214204330055484e-05, + "loss": 0.4754, + "step": 2695 + }, + { + "epoch": 0.31, + "learning_rate": 1.621130374359059e-05, + "loss": 0.4589, + "step": 2696 + }, + { + "epoch": 0.31, + "learning_rate": 1.6208402306038406e-05, + "loss": 0.4649, + "step": 2697 + }, + { + "epoch": 0.31, + "learning_rate": 1.620550001779649e-05, + "loss": 0.4751, + "step": 2698 + }, + { + "epoch": 0.31, + "learning_rate": 1.6202596879262536e-05, + "loss": 0.4805, + "step": 2699 + }, + { + "epoch": 0.31, + "learning_rate": 1.6199692890834324e-05, + "loss": 0.46, + "step": 2700 + }, + { + "epoch": 0.31, + "learning_rate": 1.6196788052909772e-05, + "loss": 0.477, + "step": 2701 + }, + { + "epoch": 0.31, + "learning_rate": 1.6193882365886905e-05, + "loss": 0.4757, + "step": 2702 + }, + { + "epoch": 0.31, + "learning_rate": 
1.6190975830163872e-05, + "loss": 0.4712, + "step": 2703 + }, + { + "epoch": 0.31, + "learning_rate": 1.6188068446138925e-05, + "loss": 0.4701, + "step": 2704 + }, + { + "epoch": 0.31, + "learning_rate": 1.6185160214210447e-05, + "loss": 0.4706, + "step": 2705 + }, + { + "epoch": 0.31, + "learning_rate": 1.6182251134776927e-05, + "loss": 0.4659, + "step": 2706 + }, + { + "epoch": 0.31, + "learning_rate": 1.6179341208236977e-05, + "loss": 0.4882, + "step": 2707 + }, + { + "epoch": 0.31, + "learning_rate": 1.617643043498932e-05, + "loss": 0.4824, + "step": 2708 + }, + { + "epoch": 0.31, + "learning_rate": 1.6173518815432797e-05, + "loss": 0.4679, + "step": 2709 + }, + { + "epoch": 0.31, + "learning_rate": 1.6170606349966367e-05, + "loss": 0.4932, + "step": 2710 + }, + { + "epoch": 0.31, + "learning_rate": 1.6167693038989098e-05, + "loss": 0.485, + "step": 2711 + }, + { + "epoch": 0.31, + "learning_rate": 1.6164778882900186e-05, + "loss": 0.4555, + "step": 2712 + }, + { + "epoch": 0.31, + "learning_rate": 1.6161863882098926e-05, + "loss": 0.5035, + "step": 2713 + }, + { + "epoch": 0.31, + "learning_rate": 1.615894803698475e-05, + "loss": 0.4557, + "step": 2714 + }, + { + "epoch": 0.31, + "learning_rate": 1.615603134795718e-05, + "loss": 0.4986, + "step": 2715 + }, + { + "epoch": 0.31, + "learning_rate": 1.615311381541588e-05, + "loss": 0.4746, + "step": 2716 + }, + { + "epoch": 0.31, + "learning_rate": 1.615019543976061e-05, + "loss": 0.4601, + "step": 2717 + }, + { + "epoch": 0.31, + "learning_rate": 1.6147276221391256e-05, + "loss": 0.4762, + "step": 2718 + }, + { + "epoch": 0.31, + "learning_rate": 1.614435616070781e-05, + "loss": 0.483, + "step": 2719 + }, + { + "epoch": 0.31, + "learning_rate": 1.6141435258110397e-05, + "loss": 0.4747, + "step": 2720 + }, + { + "epoch": 0.31, + "learning_rate": 1.6138513513999234e-05, + "loss": 0.4668, + "step": 2721 + }, + { + "epoch": 0.31, + "learning_rate": 1.613559092877467e-05, + "loss": 0.4704, + "step": 2722 + }, + { + 
"epoch": 0.31, + "learning_rate": 1.6132667502837164e-05, + "loss": 0.4809, + "step": 2723 + }, + { + "epoch": 0.31, + "learning_rate": 1.6129743236587293e-05, + "loss": 0.4661, + "step": 2724 + }, + { + "epoch": 0.31, + "learning_rate": 1.6126818130425746e-05, + "loss": 0.48, + "step": 2725 + }, + { + "epoch": 0.31, + "learning_rate": 1.6123892184753324e-05, + "loss": 0.4916, + "step": 2726 + }, + { + "epoch": 0.31, + "learning_rate": 1.612096539997095e-05, + "loss": 0.4791, + "step": 2727 + }, + { + "epoch": 0.31, + "learning_rate": 1.611803777647966e-05, + "loss": 0.4851, + "step": 2728 + }, + { + "epoch": 0.31, + "learning_rate": 1.6115109314680603e-05, + "loss": 0.4755, + "step": 2729 + }, + { + "epoch": 0.31, + "learning_rate": 1.611218001497504e-05, + "loss": 0.4662, + "step": 2730 + }, + { + "epoch": 0.31, + "learning_rate": 1.610924987776436e-05, + "loss": 0.4743, + "step": 2731 + }, + { + "epoch": 0.31, + "learning_rate": 1.6106318903450042e-05, + "loss": 0.446, + "step": 2732 + }, + { + "epoch": 0.31, + "learning_rate": 1.6103387092433704e-05, + "loss": 0.48, + "step": 2733 + }, + { + "epoch": 0.31, + "learning_rate": 1.6100454445117074e-05, + "loss": 0.4846, + "step": 2734 + }, + { + "epoch": 0.31, + "learning_rate": 1.6097520961901983e-05, + "loss": 0.4946, + "step": 2735 + }, + { + "epoch": 0.31, + "learning_rate": 1.6094586643190388e-05, + "loss": 0.4798, + "step": 2736 + }, + { + "epoch": 0.31, + "learning_rate": 1.609165148938435e-05, + "loss": 0.4737, + "step": 2737 + }, + { + "epoch": 0.31, + "learning_rate": 1.608871550088606e-05, + "loss": 0.4634, + "step": 2738 + }, + { + "epoch": 0.31, + "learning_rate": 1.6085778678097804e-05, + "loss": 0.4716, + "step": 2739 + }, + { + "epoch": 0.31, + "learning_rate": 1.6082841021422e-05, + "loss": 0.4621, + "step": 2740 + }, + { + "epoch": 0.31, + "learning_rate": 1.607990253126117e-05, + "loss": 0.4844, + "step": 2741 + }, + { + "epoch": 0.31, + "learning_rate": 1.607696320801795e-05, + "loss": 0.479, + 
"step": 2742 + }, + { + "epoch": 0.31, + "learning_rate": 1.6074023052095096e-05, + "loss": 0.4718, + "step": 2743 + }, + { + "epoch": 0.31, + "learning_rate": 1.6071082063895476e-05, + "loss": 0.4843, + "step": 2744 + }, + { + "epoch": 0.31, + "learning_rate": 1.6068140243822065e-05, + "loss": 0.4677, + "step": 2745 + }, + { + "epoch": 0.31, + "learning_rate": 1.6065197592277965e-05, + "loss": 0.4666, + "step": 2746 + }, + { + "epoch": 0.31, + "learning_rate": 1.6062254109666383e-05, + "loss": 0.4853, + "step": 2747 + }, + { + "epoch": 0.31, + "learning_rate": 1.6059309796390638e-05, + "loss": 0.4735, + "step": 2748 + }, + { + "epoch": 0.31, + "learning_rate": 1.6056364652854174e-05, + "loss": 0.4768, + "step": 2749 + }, + { + "epoch": 0.31, + "learning_rate": 1.6053418679460534e-05, + "loss": 0.4528, + "step": 2750 + }, + { + "epoch": 0.31, + "learning_rate": 1.6050471876613386e-05, + "loss": 0.4702, + "step": 2751 + }, + { + "epoch": 0.31, + "learning_rate": 1.6047524244716506e-05, + "loss": 0.4713, + "step": 2752 + }, + { + "epoch": 0.31, + "learning_rate": 1.604457578417379e-05, + "loss": 0.4758, + "step": 2753 + }, + { + "epoch": 0.31, + "learning_rate": 1.6041626495389235e-05, + "loss": 0.4742, + "step": 2754 + }, + { + "epoch": 0.31, + "learning_rate": 1.6038676378766968e-05, + "loss": 0.4639, + "step": 2755 + }, + { + "epoch": 0.31, + "learning_rate": 1.603572543471121e-05, + "loss": 0.4743, + "step": 2756 + }, + { + "epoch": 0.32, + "learning_rate": 1.603277366362632e-05, + "loss": 0.4833, + "step": 2757 + }, + { + "epoch": 0.32, + "learning_rate": 1.6029821065916745e-05, + "loss": 0.4669, + "step": 2758 + }, + { + "epoch": 0.32, + "learning_rate": 1.602686764198706e-05, + "loss": 0.4883, + "step": 2759 + }, + { + "epoch": 0.32, + "learning_rate": 1.602391339224196e-05, + "loss": 0.4739, + "step": 2760 + }, + { + "epoch": 0.32, + "learning_rate": 1.6020958317086224e-05, + "loss": 0.466, + "step": 2761 + }, + { + "epoch": 0.32, + "learning_rate": 
1.601800241692478e-05, + "loss": 0.4624, + "step": 2762 + }, + { + "epoch": 0.32, + "learning_rate": 1.6015045692162644e-05, + "loss": 0.4698, + "step": 2763 + }, + { + "epoch": 0.32, + "learning_rate": 1.6012088143204953e-05, + "loss": 0.4567, + "step": 2764 + }, + { + "epoch": 0.32, + "learning_rate": 1.6009129770456962e-05, + "loss": 0.4719, + "step": 2765 + }, + { + "epoch": 0.32, + "learning_rate": 1.6006170574324033e-05, + "loss": 0.4681, + "step": 2766 + }, + { + "epoch": 0.32, + "learning_rate": 1.6003210555211635e-05, + "loss": 0.4753, + "step": 2767 + }, + { + "epoch": 0.32, + "learning_rate": 1.6000249713525366e-05, + "loss": 0.4674, + "step": 2768 + }, + { + "epoch": 0.32, + "learning_rate": 1.5997288049670924e-05, + "loss": 0.4563, + "step": 2769 + }, + { + "epoch": 0.32, + "learning_rate": 1.5994325564054122e-05, + "loss": 0.5064, + "step": 2770 + }, + { + "epoch": 0.32, + "learning_rate": 1.599136225708089e-05, + "loss": 0.4761, + "step": 2771 + }, + { + "epoch": 0.32, + "learning_rate": 1.598839812915726e-05, + "loss": 0.4571, + "step": 2772 + }, + { + "epoch": 0.32, + "learning_rate": 1.598543318068939e-05, + "loss": 0.4861, + "step": 2773 + }, + { + "epoch": 0.32, + "learning_rate": 1.5982467412083543e-05, + "loss": 0.4579, + "step": 2774 + }, + { + "epoch": 0.32, + "learning_rate": 1.5979500823746096e-05, + "loss": 0.4937, + "step": 2775 + }, + { + "epoch": 0.32, + "learning_rate": 1.5976533416083535e-05, + "loss": 0.4646, + "step": 2776 + }, + { + "epoch": 0.32, + "learning_rate": 1.5973565189502463e-05, + "loss": 0.4636, + "step": 2777 + }, + { + "epoch": 0.32, + "learning_rate": 1.5970596144409595e-05, + "loss": 0.4741, + "step": 2778 + }, + { + "epoch": 0.32, + "learning_rate": 1.5967626281211754e-05, + "loss": 0.4716, + "step": 2779 + }, + { + "epoch": 0.32, + "learning_rate": 1.596465560031588e-05, + "loss": 0.4568, + "step": 2780 + }, + { + "epoch": 0.32, + "learning_rate": 1.5961684102129015e-05, + "loss": 0.4772, + "step": 2781 + }, + { 
+ "epoch": 0.32, + "learning_rate": 1.5958711787058332e-05, + "loss": 0.4642, + "step": 2782 + }, + { + "epoch": 0.32, + "learning_rate": 1.5955738655511094e-05, + "loss": 0.4938, + "step": 2783 + }, + { + "epoch": 0.32, + "learning_rate": 1.5952764707894696e-05, + "loss": 0.4634, + "step": 2784 + }, + { + "epoch": 0.32, + "learning_rate": 1.594978994461663e-05, + "loss": 0.4767, + "step": 2785 + }, + { + "epoch": 0.32, + "learning_rate": 1.5946814366084505e-05, + "loss": 0.4734, + "step": 2786 + }, + { + "epoch": 0.32, + "learning_rate": 1.594383797270604e-05, + "loss": 0.4918, + "step": 2787 + }, + { + "epoch": 0.32, + "learning_rate": 1.5940860764889073e-05, + "loss": 0.4859, + "step": 2788 + }, + { + "epoch": 0.32, + "learning_rate": 1.5937882743041543e-05, + "loss": 0.462, + "step": 2789 + }, + { + "epoch": 0.32, + "learning_rate": 1.5934903907571507e-05, + "loss": 0.4686, + "step": 2790 + }, + { + "epoch": 0.32, + "learning_rate": 1.593192425888713e-05, + "loss": 0.4752, + "step": 2791 + }, + { + "epoch": 0.32, + "learning_rate": 1.5928943797396695e-05, + "loss": 0.4988, + "step": 2792 + }, + { + "epoch": 0.32, + "learning_rate": 1.592596252350859e-05, + "loss": 0.4771, + "step": 2793 + }, + { + "epoch": 0.32, + "learning_rate": 1.5922980437631314e-05, + "loss": 0.4763, + "step": 2794 + }, + { + "epoch": 0.32, + "learning_rate": 1.591999754017348e-05, + "loss": 0.4701, + "step": 2795 + }, + { + "epoch": 0.32, + "learning_rate": 1.5917013831543814e-05, + "loss": 0.4927, + "step": 2796 + }, + { + "epoch": 0.32, + "learning_rate": 1.5914029312151146e-05, + "loss": 0.4832, + "step": 2797 + }, + { + "epoch": 0.32, + "learning_rate": 1.5911043982404426e-05, + "loss": 0.475, + "step": 2798 + }, + { + "epoch": 0.32, + "learning_rate": 1.590805784271271e-05, + "loss": 0.4782, + "step": 2799 + }, + { + "epoch": 0.32, + "learning_rate": 1.5905070893485165e-05, + "loss": 0.4783, + "step": 2800 + }, + { + "epoch": 0.32, + "learning_rate": 1.5902083135131067e-05, + "loss": 
0.514, + "step": 2801 + }, + { + "epoch": 0.32, + "learning_rate": 1.5899094568059812e-05, + "loss": 0.4526, + "step": 2802 + }, + { + "epoch": 0.32, + "learning_rate": 1.58961051926809e-05, + "loss": 0.4837, + "step": 2803 + }, + { + "epoch": 0.32, + "learning_rate": 1.5893115009403932e-05, + "loss": 0.4742, + "step": 2804 + }, + { + "epoch": 0.32, + "learning_rate": 1.589012401863864e-05, + "loss": 0.4731, + "step": 2805 + }, + { + "epoch": 0.32, + "learning_rate": 1.5887132220794855e-05, + "loss": 0.4697, + "step": 2806 + }, + { + "epoch": 0.32, + "learning_rate": 1.5884139616282517e-05, + "loss": 0.4692, + "step": 2807 + }, + { + "epoch": 0.32, + "learning_rate": 1.5881146205511683e-05, + "loss": 0.4793, + "step": 2808 + }, + { + "epoch": 0.32, + "learning_rate": 1.5878151988892513e-05, + "loss": 0.4826, + "step": 2809 + }, + { + "epoch": 0.32, + "learning_rate": 1.5875156966835285e-05, + "loss": 0.4683, + "step": 2810 + }, + { + "epoch": 0.32, + "learning_rate": 1.5872161139750384e-05, + "loss": 0.4728, + "step": 2811 + }, + { + "epoch": 0.32, + "learning_rate": 1.5869164508048304e-05, + "loss": 0.4519, + "step": 2812 + }, + { + "epoch": 0.32, + "learning_rate": 1.5866167072139645e-05, + "loss": 0.4566, + "step": 2813 + }, + { + "epoch": 0.32, + "learning_rate": 1.5863168832435137e-05, + "loss": 0.4898, + "step": 2814 + }, + { + "epoch": 0.32, + "learning_rate": 1.5860169789345592e-05, + "loss": 0.472, + "step": 2815 + }, + { + "epoch": 0.32, + "learning_rate": 1.5857169943281948e-05, + "loss": 0.4816, + "step": 2816 + }, + { + "epoch": 0.32, + "learning_rate": 1.585416929465526e-05, + "loss": 0.4718, + "step": 2817 + }, + { + "epoch": 0.32, + "learning_rate": 1.585116784387667e-05, + "loss": 0.4443, + "step": 2818 + }, + { + "epoch": 0.32, + "learning_rate": 1.5848165591357458e-05, + "loss": 0.4654, + "step": 2819 + }, + { + "epoch": 0.32, + "learning_rate": 1.584516253750899e-05, + "loss": 0.4793, + "step": 2820 + }, + { + "epoch": 0.32, + "learning_rate": 
1.5842158682742756e-05, + "loss": 0.4703, + "step": 2821 + }, + { + "epoch": 0.32, + "learning_rate": 1.5839154027470346e-05, + "loss": 0.4631, + "step": 2822 + }, + { + "epoch": 0.32, + "learning_rate": 1.583614857210347e-05, + "loss": 0.4858, + "step": 2823 + }, + { + "epoch": 0.32, + "learning_rate": 1.5833142317053943e-05, + "loss": 0.4655, + "step": 2824 + }, + { + "epoch": 0.32, + "learning_rate": 1.5830135262733684e-05, + "loss": 0.4708, + "step": 2825 + }, + { + "epoch": 0.32, + "learning_rate": 1.582712740955473e-05, + "loss": 0.477, + "step": 2826 + }, + { + "epoch": 0.32, + "learning_rate": 1.5824118757929224e-05, + "loss": 0.4869, + "step": 2827 + }, + { + "epoch": 0.32, + "learning_rate": 1.5821109308269416e-05, + "loss": 0.4708, + "step": 2828 + }, + { + "epoch": 0.32, + "learning_rate": 1.581809906098767e-05, + "loss": 0.4834, + "step": 2829 + }, + { + "epoch": 0.32, + "learning_rate": 1.581508801649646e-05, + "loss": 0.4669, + "step": 2830 + }, + { + "epoch": 0.32, + "learning_rate": 1.581207617520836e-05, + "loss": 0.4761, + "step": 2831 + }, + { + "epoch": 0.32, + "learning_rate": 1.5809063537536066e-05, + "loss": 0.4765, + "step": 2832 + }, + { + "epoch": 0.32, + "learning_rate": 1.580605010389237e-05, + "loss": 0.4775, + "step": 2833 + }, + { + "epoch": 0.32, + "learning_rate": 1.5803035874690186e-05, + "loss": 0.4697, + "step": 2834 + }, + { + "epoch": 0.32, + "learning_rate": 1.5800020850342524e-05, + "loss": 0.4574, + "step": 2835 + }, + { + "epoch": 0.32, + "learning_rate": 1.5797005031262514e-05, + "loss": 0.4732, + "step": 2836 + }, + { + "epoch": 0.32, + "learning_rate": 1.579398841786339e-05, + "loss": 0.492, + "step": 2837 + }, + { + "epoch": 0.32, + "learning_rate": 1.57909710105585e-05, + "loss": 0.4496, + "step": 2838 + }, + { + "epoch": 0.32, + "learning_rate": 1.5787952809761286e-05, + "loss": 0.478, + "step": 2839 + }, + { + "epoch": 0.32, + "learning_rate": 1.5784933815885315e-05, + "loss": 0.4775, + "step": 2840 + }, + { + 
"epoch": 0.32, + "learning_rate": 1.5781914029344254e-05, + "loss": 0.4648, + "step": 2841 + }, + { + "epoch": 0.32, + "learning_rate": 1.5778893450551888e-05, + "loss": 0.4642, + "step": 2842 + }, + { + "epoch": 0.32, + "learning_rate": 1.5775872079922098e-05, + "loss": 0.4863, + "step": 2843 + }, + { + "epoch": 0.33, + "learning_rate": 1.5772849917868876e-05, + "loss": 0.4598, + "step": 2844 + }, + { + "epoch": 0.33, + "learning_rate": 1.576982696480633e-05, + "loss": 0.478, + "step": 2845 + }, + { + "epoch": 0.33, + "learning_rate": 1.5766803221148676e-05, + "loss": 0.4735, + "step": 2846 + }, + { + "epoch": 0.33, + "learning_rate": 1.5763778687310224e-05, + "loss": 0.4723, + "step": 2847 + }, + { + "epoch": 0.33, + "learning_rate": 1.5760753363705412e-05, + "loss": 0.4484, + "step": 2848 + }, + { + "epoch": 0.33, + "learning_rate": 1.5757727250748773e-05, + "loss": 0.4742, + "step": 2849 + }, + { + "epoch": 0.33, + "learning_rate": 1.5754700348854955e-05, + "loss": 0.4912, + "step": 2850 + }, + { + "epoch": 0.33, + "learning_rate": 1.5751672658438707e-05, + "loss": 0.4788, + "step": 2851 + }, + { + "epoch": 0.33, + "learning_rate": 1.574864417991489e-05, + "loss": 0.4717, + "step": 2852 + }, + { + "epoch": 0.33, + "learning_rate": 1.5745614913698478e-05, + "loss": 0.5, + "step": 2853 + }, + { + "epoch": 0.33, + "learning_rate": 1.5742584860204547e-05, + "loss": 0.4665, + "step": 2854 + }, + { + "epoch": 0.33, + "learning_rate": 1.5739554019848274e-05, + "loss": 0.471, + "step": 2855 + }, + { + "epoch": 0.33, + "learning_rate": 1.5736522393044962e-05, + "loss": 0.4684, + "step": 2856 + }, + { + "epoch": 0.33, + "learning_rate": 1.5733489980210007e-05, + "loss": 0.4837, + "step": 2857 + }, + { + "epoch": 0.33, + "learning_rate": 1.573045678175892e-05, + "loss": 0.4908, + "step": 2858 + }, + { + "epoch": 0.33, + "learning_rate": 1.5727422798107313e-05, + "loss": 0.4824, + "step": 2859 + }, + { + "epoch": 0.33, + "learning_rate": 1.5724388029670912e-05, + "loss": 
0.4636, + "step": 2860 + }, + { + "epoch": 0.33, + "learning_rate": 1.5721352476865546e-05, + "loss": 0.4617, + "step": 2861 + }, + { + "epoch": 0.33, + "learning_rate": 1.5718316140107156e-05, + "loss": 0.4884, + "step": 2862 + }, + { + "epoch": 0.33, + "learning_rate": 1.5715279019811783e-05, + "loss": 0.4766, + "step": 2863 + }, + { + "epoch": 0.33, + "learning_rate": 1.571224111639559e-05, + "loss": 0.4708, + "step": 2864 + }, + { + "epoch": 0.33, + "learning_rate": 1.570920243027483e-05, + "loss": 0.4695, + "step": 2865 + }, + { + "epoch": 0.33, + "learning_rate": 1.5706162961865866e-05, + "loss": 0.4738, + "step": 2866 + }, + { + "epoch": 0.33, + "learning_rate": 1.570312271158519e-05, + "loss": 0.4878, + "step": 2867 + }, + { + "epoch": 0.33, + "learning_rate": 1.5700081679849362e-05, + "loss": 0.4698, + "step": 2868 + }, + { + "epoch": 0.33, + "learning_rate": 1.569703986707509e-05, + "loss": 0.4732, + "step": 2869 + }, + { + "epoch": 0.33, + "learning_rate": 1.5693997273679165e-05, + "loss": 0.5003, + "step": 2870 + }, + { + "epoch": 0.33, + "learning_rate": 1.5690953900078485e-05, + "loss": 0.4552, + "step": 2871 + }, + { + "epoch": 0.33, + "learning_rate": 1.5687909746690064e-05, + "loss": 0.472, + "step": 2872 + }, + { + "epoch": 0.33, + "learning_rate": 1.568486481393102e-05, + "loss": 0.4709, + "step": 2873 + }, + { + "epoch": 0.33, + "learning_rate": 1.5681819102218572e-05, + "loss": 0.4626, + "step": 2874 + }, + { + "epoch": 0.33, + "learning_rate": 1.5678772611970056e-05, + "loss": 0.4704, + "step": 2875 + }, + { + "epoch": 0.33, + "learning_rate": 1.5675725343602904e-05, + "loss": 0.4631, + "step": 2876 + }, + { + "epoch": 0.33, + "learning_rate": 1.5672677297534665e-05, + "loss": 0.4735, + "step": 2877 + }, + { + "epoch": 0.33, + "learning_rate": 1.566962847418299e-05, + "loss": 0.4643, + "step": 2878 + }, + { + "epoch": 0.33, + "learning_rate": 1.5666578873965627e-05, + "loss": 0.4823, + "step": 2879 + }, + { + "epoch": 0.33, + "learning_rate": 
1.566352849730045e-05, + "loss": 0.4623, + "step": 2880 + }, + { + "epoch": 0.33, + "learning_rate": 1.566047734460542e-05, + "loss": 0.4846, + "step": 2881 + }, + { + "epoch": 0.33, + "learning_rate": 1.5657425416298623e-05, + "loss": 0.4666, + "step": 2882 + }, + { + "epoch": 0.33, + "learning_rate": 1.565437271279823e-05, + "loss": 0.4601, + "step": 2883 + }, + { + "epoch": 0.33, + "learning_rate": 1.5651319234522538e-05, + "loss": 0.4834, + "step": 2884 + }, + { + "epoch": 0.33, + "learning_rate": 1.5648264981889936e-05, + "loss": 0.4699, + "step": 2885 + }, + { + "epoch": 0.33, + "learning_rate": 1.564520995531893e-05, + "loss": 0.4772, + "step": 2886 + }, + { + "epoch": 0.33, + "learning_rate": 1.5642154155228124e-05, + "loss": 0.4867, + "step": 2887 + }, + { + "epoch": 0.33, + "learning_rate": 1.5639097582036226e-05, + "loss": 0.4585, + "step": 2888 + }, + { + "epoch": 0.33, + "learning_rate": 1.5636040236162066e-05, + "loss": 0.4807, + "step": 2889 + }, + { + "epoch": 0.33, + "learning_rate": 1.5632982118024556e-05, + "loss": 0.4746, + "step": 2890 + }, + { + "epoch": 0.33, + "learning_rate": 1.562992322804274e-05, + "loss": 0.4685, + "step": 2891 + }, + { + "epoch": 0.33, + "learning_rate": 1.5626863566635744e-05, + "loss": 0.4727, + "step": 2892 + }, + { + "epoch": 0.33, + "learning_rate": 1.5623803134222812e-05, + "loss": 0.4873, + "step": 2893 + }, + { + "epoch": 0.33, + "learning_rate": 1.5620741931223292e-05, + "loss": 0.4863, + "step": 2894 + }, + { + "epoch": 0.33, + "learning_rate": 1.5617679958056643e-05, + "loss": 0.4903, + "step": 2895 + }, + { + "epoch": 0.33, + "learning_rate": 1.5614617215142412e-05, + "loss": 0.4736, + "step": 2896 + }, + { + "epoch": 0.33, + "learning_rate": 1.5611553702900275e-05, + "loss": 0.4637, + "step": 2897 + }, + { + "epoch": 0.33, + "learning_rate": 1.5608489421749995e-05, + "loss": 0.4758, + "step": 2898 + }, + { + "epoch": 0.33, + "learning_rate": 1.5605424372111447e-05, + "loss": 0.4812, + "step": 2899 + }, + { 
+ "epoch": 0.33, + "learning_rate": 1.5602358554404613e-05, + "loss": 0.4662, + "step": 2900 + }, + { + "epoch": 0.33, + "learning_rate": 1.5599291969049575e-05, + "loss": 0.4708, + "step": 2901 + }, + { + "epoch": 0.33, + "learning_rate": 1.5596224616466527e-05, + "loss": 0.469, + "step": 2902 + }, + { + "epoch": 0.33, + "learning_rate": 1.5593156497075767e-05, + "loss": 0.4772, + "step": 2903 + }, + { + "epoch": 0.33, + "learning_rate": 1.5590087611297694e-05, + "loss": 0.484, + "step": 2904 + }, + { + "epoch": 0.33, + "learning_rate": 1.558701795955281e-05, + "loss": 0.468, + "step": 2905 + }, + { + "epoch": 0.33, + "learning_rate": 1.558394754226173e-05, + "loss": 0.4638, + "step": 2906 + }, + { + "epoch": 0.33, + "learning_rate": 1.5580876359845166e-05, + "loss": 0.4748, + "step": 2907 + }, + { + "epoch": 0.33, + "learning_rate": 1.557780441272395e-05, + "loss": 0.4646, + "step": 2908 + }, + { + "epoch": 0.33, + "learning_rate": 1.5574731701318987e-05, + "loss": 0.4657, + "step": 2909 + }, + { + "epoch": 0.33, + "learning_rate": 1.5571658226051325e-05, + "loss": 0.5036, + "step": 2910 + }, + { + "epoch": 0.33, + "learning_rate": 1.556858398734209e-05, + "loss": 0.478, + "step": 2911 + }, + { + "epoch": 0.33, + "learning_rate": 1.5565508985612525e-05, + "loss": 0.4641, + "step": 2912 + }, + { + "epoch": 0.33, + "learning_rate": 1.556243322128397e-05, + "loss": 0.4988, + "step": 2913 + }, + { + "epoch": 0.33, + "learning_rate": 1.5559356694777882e-05, + "loss": 0.4682, + "step": 2914 + }, + { + "epoch": 0.33, + "learning_rate": 1.5556279406515802e-05, + "loss": 0.4605, + "step": 2915 + }, + { + "epoch": 0.33, + "learning_rate": 1.5553201356919394e-05, + "loss": 0.4572, + "step": 2916 + }, + { + "epoch": 0.33, + "learning_rate": 1.555012254641042e-05, + "loss": 0.4649, + "step": 2917 + }, + { + "epoch": 0.33, + "learning_rate": 1.554704297541074e-05, + "loss": 0.4938, + "step": 2918 + }, + { + "epoch": 0.33, + "learning_rate": 1.5543962644342335e-05, + "loss": 
0.4751, + "step": 2919 + }, + { + "epoch": 0.33, + "learning_rate": 1.5540881553627264e-05, + "loss": 0.4628, + "step": 2920 + }, + { + "epoch": 0.33, + "learning_rate": 1.553779970368772e-05, + "loss": 0.4738, + "step": 2921 + }, + { + "epoch": 0.33, + "learning_rate": 1.553471709494598e-05, + "loss": 0.4807, + "step": 2922 + }, + { + "epoch": 0.33, + "learning_rate": 1.5531633727824423e-05, + "loss": 0.456, + "step": 2923 + }, + { + "epoch": 0.33, + "learning_rate": 1.5528549602745545e-05, + "loss": 0.4865, + "step": 2924 + }, + { + "epoch": 0.33, + "learning_rate": 1.5525464720131945e-05, + "loss": 0.4608, + "step": 2925 + }, + { + "epoch": 0.33, + "learning_rate": 1.5522379080406315e-05, + "loss": 0.4985, + "step": 2926 + }, + { + "epoch": 0.33, + "learning_rate": 1.5519292683991455e-05, + "loss": 0.4733, + "step": 2927 + }, + { + "epoch": 0.33, + "learning_rate": 1.5516205531310272e-05, + "loss": 0.4852, + "step": 2928 + }, + { + "epoch": 0.33, + "learning_rate": 1.5513117622785778e-05, + "loss": 0.4641, + "step": 2929 + }, + { + "epoch": 0.33, + "learning_rate": 1.5510028958841085e-05, + "loss": 0.4682, + "step": 2930 + }, + { + "epoch": 0.33, + "learning_rate": 1.5506939539899403e-05, + "loss": 0.4544, + "step": 2931 + }, + { + "epoch": 0.34, + "learning_rate": 1.5503849366384053e-05, + "loss": 0.4609, + "step": 2932 + }, + { + "epoch": 0.34, + "learning_rate": 1.5500758438718463e-05, + "loss": 0.4947, + "step": 2933 + }, + { + "epoch": 0.34, + "learning_rate": 1.5497666757326157e-05, + "loss": 0.4783, + "step": 2934 + }, + { + "epoch": 0.34, + "learning_rate": 1.5494574322630765e-05, + "loss": 0.4702, + "step": 2935 + }, + { + "epoch": 0.34, + "learning_rate": 1.5491481135056012e-05, + "loss": 0.5003, + "step": 2936 + }, + { + "epoch": 0.34, + "learning_rate": 1.5488387195025745e-05, + "loss": 0.4549, + "step": 2937 + }, + { + "epoch": 0.34, + "learning_rate": 1.5485292502963892e-05, + "loss": 0.4735, + "step": 2938 + }, + { + "epoch": 0.34, + 
"learning_rate": 1.548219705929451e-05, + "loss": 0.4817, + "step": 2939 + }, + { + "epoch": 0.34, + "learning_rate": 1.5479100864441726e-05, + "loss": 0.4679, + "step": 2940 + }, + { + "epoch": 0.34, + "learning_rate": 1.54760039188298e-05, + "loss": 0.4699, + "step": 2941 + }, + { + "epoch": 0.34, + "learning_rate": 1.5472906222883075e-05, + "loss": 0.4791, + "step": 2942 + }, + { + "epoch": 0.34, + "learning_rate": 1.5469807777026014e-05, + "loss": 0.4749, + "step": 2943 + }, + { + "epoch": 0.34, + "learning_rate": 1.5466708581683164e-05, + "loss": 0.4845, + "step": 2944 + }, + { + "epoch": 0.34, + "learning_rate": 1.546360863727919e-05, + "loss": 0.4682, + "step": 2945 + }, + { + "epoch": 0.34, + "learning_rate": 1.546050794423885e-05, + "loss": 0.4739, + "step": 2946 + }, + { + "epoch": 0.34, + "learning_rate": 1.5457406502987007e-05, + "loss": 0.4703, + "step": 2947 + }, + { + "epoch": 0.34, + "learning_rate": 1.5454304313948635e-05, + "loss": 0.486, + "step": 2948 + }, + { + "epoch": 0.34, + "learning_rate": 1.5451201377548793e-05, + "loss": 0.4633, + "step": 2949 + }, + { + "epoch": 0.34, + "learning_rate": 1.5448097694212663e-05, + "loss": 0.4788, + "step": 2950 + }, + { + "epoch": 0.34, + "learning_rate": 1.544499326436551e-05, + "loss": 0.4686, + "step": 2951 + }, + { + "epoch": 0.34, + "learning_rate": 1.5441888088432716e-05, + "loss": 0.482, + "step": 2952 + }, + { + "epoch": 0.34, + "learning_rate": 1.5438782166839757e-05, + "loss": 0.4666, + "step": 2953 + }, + { + "epoch": 0.34, + "learning_rate": 1.5435675500012212e-05, + "loss": 0.4929, + "step": 2954 + }, + { + "epoch": 0.34, + "learning_rate": 1.5432568088375766e-05, + "loss": 0.4673, + "step": 2955 + }, + { + "epoch": 0.34, + "learning_rate": 1.542945993235621e-05, + "loss": 0.4798, + "step": 2956 + }, + { + "epoch": 0.34, + "learning_rate": 1.5426351032379418e-05, + "loss": 0.4764, + "step": 2957 + }, + { + "epoch": 0.34, + "learning_rate": 1.5423241388871383e-05, + "loss": 0.4605, + "step": 
2958 + }, + { + "epoch": 0.34, + "learning_rate": 1.54201310022582e-05, + "loss": 0.4917, + "step": 2959 + }, + { + "epoch": 0.34, + "learning_rate": 1.541701987296606e-05, + "loss": 0.468, + "step": 2960 + }, + { + "epoch": 0.34, + "learning_rate": 1.5413908001421257e-05, + "loss": 0.4596, + "step": 2961 + }, + { + "epoch": 0.34, + "learning_rate": 1.5410795388050182e-05, + "loss": 0.488, + "step": 2962 + }, + { + "epoch": 0.34, + "learning_rate": 1.540768203327934e-05, + "loss": 0.4573, + "step": 2963 + }, + { + "epoch": 0.34, + "learning_rate": 1.5404567937535326e-05, + "loss": 0.4672, + "step": 2964 + }, + { + "epoch": 0.34, + "learning_rate": 1.540145310124484e-05, + "loss": 0.4718, + "step": 2965 + }, + { + "epoch": 0.34, + "learning_rate": 1.5398337524834688e-05, + "loss": 0.4789, + "step": 2966 + }, + { + "epoch": 0.34, + "learning_rate": 1.5395221208731766e-05, + "loss": 0.4671, + "step": 2967 + }, + { + "epoch": 0.34, + "learning_rate": 1.5392104153363086e-05, + "loss": 0.4992, + "step": 2968 + }, + { + "epoch": 0.34, + "learning_rate": 1.538898635915576e-05, + "loss": 0.4666, + "step": 2969 + }, + { + "epoch": 0.34, + "learning_rate": 1.5385867826536977e-05, + "loss": 0.486, + "step": 2970 + }, + { + "epoch": 0.34, + "learning_rate": 1.5382748555934058e-05, + "loss": 0.4593, + "step": 2971 + }, + { + "epoch": 0.34, + "learning_rate": 1.5379628547774412e-05, + "loss": 0.4736, + "step": 2972 + }, + { + "epoch": 0.34, + "learning_rate": 1.5376507802485547e-05, + "loss": 0.4685, + "step": 2973 + }, + { + "epoch": 0.34, + "learning_rate": 1.537338632049508e-05, + "loss": 0.4572, + "step": 2974 + }, + { + "epoch": 0.34, + "learning_rate": 1.5370264102230716e-05, + "loss": 0.4599, + "step": 2975 + }, + { + "epoch": 0.34, + "learning_rate": 1.5367141148120275e-05, + "loss": 0.4735, + "step": 2976 + }, + { + "epoch": 0.34, + "learning_rate": 1.5364017458591668e-05, + "loss": 0.476, + "step": 2977 + }, + { + "epoch": 0.34, + "learning_rate": 1.536089303407291e-05, 
+ "loss": 0.4663, + "step": 2978 + }, + { + "epoch": 0.34, + "learning_rate": 1.535776787499212e-05, + "loss": 0.4701, + "step": 2979 + }, + { + "epoch": 0.34, + "learning_rate": 1.5354641981777514e-05, + "loss": 0.483, + "step": 2980 + }, + { + "epoch": 0.34, + "learning_rate": 1.5351515354857404e-05, + "loss": 0.4585, + "step": 2981 + }, + { + "epoch": 0.34, + "learning_rate": 1.5348387994660214e-05, + "loss": 0.4679, + "step": 2982 + }, + { + "epoch": 0.34, + "learning_rate": 1.534525990161446e-05, + "loss": 0.4726, + "step": 2983 + }, + { + "epoch": 0.34, + "learning_rate": 1.534213107614876e-05, + "loss": 0.4763, + "step": 2984 + }, + { + "epoch": 0.34, + "learning_rate": 1.5339001518691833e-05, + "loss": 0.4865, + "step": 2985 + }, + { + "epoch": 0.34, + "learning_rate": 1.5335871229672496e-05, + "loss": 0.4598, + "step": 2986 + }, + { + "epoch": 0.34, + "learning_rate": 1.5332740209519674e-05, + "loss": 0.4754, + "step": 2987 + }, + { + "epoch": 0.34, + "learning_rate": 1.5329608458662383e-05, + "loss": 0.4915, + "step": 2988 + }, + { + "epoch": 0.34, + "learning_rate": 1.5326475977529745e-05, + "loss": 0.4633, + "step": 2989 + }, + { + "epoch": 0.34, + "learning_rate": 1.5323342766550978e-05, + "loss": 0.4808, + "step": 2990 + }, + { + "epoch": 0.34, + "learning_rate": 1.53202088261554e-05, + "loss": 0.4753, + "step": 2991 + }, + { + "epoch": 0.34, + "learning_rate": 1.5317074156772434e-05, + "loss": 0.4628, + "step": 2992 + }, + { + "epoch": 0.34, + "learning_rate": 1.5313938758831596e-05, + "loss": 0.4698, + "step": 2993 + }, + { + "epoch": 0.34, + "learning_rate": 1.531080263276251e-05, + "loss": 0.4773, + "step": 2994 + }, + { + "epoch": 0.34, + "learning_rate": 1.5307665778994897e-05, + "loss": 0.4576, + "step": 2995 + }, + { + "epoch": 0.34, + "learning_rate": 1.5304528197958565e-05, + "loss": 0.4724, + "step": 2996 + }, + { + "epoch": 0.34, + "learning_rate": 1.5301389890083446e-05, + "loss": 0.4608, + "step": 2997 + }, + { + "epoch": 0.34, + 
"learning_rate": 1.529825085579955e-05, + "loss": 0.4734, + "step": 2998 + }, + { + "epoch": 0.34, + "learning_rate": 1.5295111095536997e-05, + "loss": 0.4648, + "step": 2999 + }, + { + "epoch": 0.34, + "learning_rate": 1.5291970609726008e-05, + "loss": 0.454, + "step": 3000 + }, + { + "epoch": 0.34, + "learning_rate": 1.5288829398796892e-05, + "loss": 0.4805, + "step": 3001 + }, + { + "epoch": 0.34, + "learning_rate": 1.528568746318007e-05, + "loss": 0.4806, + "step": 3002 + }, + { + "epoch": 0.34, + "learning_rate": 1.5282544803306056e-05, + "loss": 0.4626, + "step": 3003 + }, + { + "epoch": 0.34, + "learning_rate": 1.5279401419605466e-05, + "loss": 0.4611, + "step": 3004 + }, + { + "epoch": 0.34, + "learning_rate": 1.527625731250901e-05, + "loss": 0.4811, + "step": 3005 + }, + { + "epoch": 0.34, + "learning_rate": 1.527311248244751e-05, + "loss": 0.4894, + "step": 3006 + }, + { + "epoch": 0.34, + "learning_rate": 1.5269966929851866e-05, + "loss": 0.4652, + "step": 3007 + }, + { + "epoch": 0.34, + "learning_rate": 1.52668206551531e-05, + "loss": 0.4701, + "step": 3008 + }, + { + "epoch": 0.34, + "learning_rate": 1.526367365878231e-05, + "loss": 0.4649, + "step": 3009 + }, + { + "epoch": 0.34, + "learning_rate": 1.526052594117071e-05, + "loss": 0.4541, + "step": 3010 + }, + { + "epoch": 0.34, + "learning_rate": 1.5257377502749614e-05, + "loss": 0.4898, + "step": 3011 + }, + { + "epoch": 0.34, + "learning_rate": 1.525422834395042e-05, + "loss": 0.4587, + "step": 3012 + }, + { + "epoch": 0.34, + "learning_rate": 1.525107846520464e-05, + "loss": 0.4891, + "step": 3013 + }, + { + "epoch": 0.34, + "learning_rate": 1.5247927866943869e-05, + "loss": 0.4719, + "step": 3014 + }, + { + "epoch": 0.34, + "learning_rate": 1.5244776549599816e-05, + "loss": 0.4638, + "step": 3015 + }, + { + "epoch": 0.34, + "learning_rate": 1.5241624513604281e-05, + "loss": 0.4802, + "step": 3016 + }, + { + "epoch": 0.34, + "learning_rate": 1.523847175938916e-05, + "loss": 0.4526, + "step": 3017 
+ }, + { + "epoch": 0.34, + "learning_rate": 1.5235318287386455e-05, + "loss": 0.4785, + "step": 3018 + }, + { + "epoch": 0.35, + "learning_rate": 1.5232164098028257e-05, + "loss": 0.4786, + "step": 3019 + }, + { + "epoch": 0.35, + "learning_rate": 1.5229009191746769e-05, + "loss": 0.4906, + "step": 3020 + }, + { + "epoch": 0.35, + "learning_rate": 1.5225853568974271e-05, + "loss": 0.4532, + "step": 3021 + }, + { + "epoch": 0.35, + "learning_rate": 1.5222697230143166e-05, + "loss": 0.4748, + "step": 3022 + }, + { + "epoch": 0.35, + "learning_rate": 1.5219540175685938e-05, + "loss": 0.4803, + "step": 3023 + }, + { + "epoch": 0.35, + "learning_rate": 1.521638240603517e-05, + "loss": 0.4751, + "step": 3024 + }, + { + "epoch": 0.35, + "learning_rate": 1.5213223921623553e-05, + "loss": 0.4679, + "step": 3025 + }, + { + "epoch": 0.35, + "learning_rate": 1.5210064722883865e-05, + "loss": 0.4836, + "step": 3026 + }, + { + "epoch": 0.35, + "learning_rate": 1.5206904810248992e-05, + "loss": 0.4656, + "step": 3027 + }, + { + "epoch": 0.35, + "learning_rate": 1.5203744184151907e-05, + "loss": 0.4757, + "step": 3028 + }, + { + "epoch": 0.35, + "learning_rate": 1.5200582845025688e-05, + "loss": 0.4992, + "step": 3029 + }, + { + "epoch": 0.35, + "learning_rate": 1.5197420793303514e-05, + "loss": 0.4701, + "step": 3030 + }, + { + "epoch": 0.35, + "learning_rate": 1.5194258029418657e-05, + "loss": 0.4565, + "step": 3031 + }, + { + "epoch": 0.35, + "learning_rate": 1.5191094553804476e-05, + "loss": 0.4628, + "step": 3032 + }, + { + "epoch": 0.35, + "learning_rate": 1.5187930366894442e-05, + "loss": 0.4777, + "step": 3033 + }, + { + "epoch": 0.35, + "learning_rate": 1.5184765469122122e-05, + "loss": 0.4835, + "step": 3034 + }, + { + "epoch": 0.35, + "learning_rate": 1.5181599860921182e-05, + "loss": 0.4564, + "step": 3035 + }, + { + "epoch": 0.35, + "learning_rate": 1.517843354272537e-05, + "loss": 0.4759, + "step": 3036 + }, + { + "epoch": 0.35, + "learning_rate": 
1.517526651496855e-05, + "loss": 0.4583, + "step": 3037 + }, + { + "epoch": 0.35, + "learning_rate": 1.5172098778084672e-05, + "loss": 0.4962, + "step": 3038 + }, + { + "epoch": 0.35, + "learning_rate": 1.5168930332507791e-05, + "loss": 0.4479, + "step": 3039 + }, + { + "epoch": 0.35, + "learning_rate": 1.5165761178672052e-05, + "loss": 0.4767, + "step": 3040 + }, + { + "epoch": 0.35, + "learning_rate": 1.51625913170117e-05, + "loss": 0.4743, + "step": 3041 + }, + { + "epoch": 0.35, + "learning_rate": 1.5159420747961076e-05, + "loss": 0.4861, + "step": 3042 + }, + { + "epoch": 0.35, + "learning_rate": 1.5156249471954617e-05, + "loss": 0.46, + "step": 3043 + }, + { + "epoch": 0.35, + "learning_rate": 1.5153077489426865e-05, + "loss": 0.468, + "step": 3044 + }, + { + "epoch": 0.35, + "learning_rate": 1.5149904800812448e-05, + "loss": 0.4918, + "step": 3045 + }, + { + "epoch": 0.35, + "learning_rate": 1.514673140654609e-05, + "loss": 0.4897, + "step": 3046 + }, + { + "epoch": 0.35, + "learning_rate": 1.514355730706263e-05, + "loss": 0.4717, + "step": 3047 + }, + { + "epoch": 0.35, + "learning_rate": 1.5140382502796978e-05, + "loss": 0.4652, + "step": 3048 + }, + { + "epoch": 0.35, + "learning_rate": 1.5137206994184159e-05, + "loss": 0.4705, + "step": 3049 + }, + { + "epoch": 0.35, + "learning_rate": 1.5134030781659288e-05, + "loss": 0.4599, + "step": 3050 + }, + { + "epoch": 0.35, + "learning_rate": 1.513085386565758e-05, + "loss": 0.4557, + "step": 3051 + }, + { + "epoch": 0.35, + "learning_rate": 1.5127676246614336e-05, + "loss": 0.4788, + "step": 3052 + }, + { + "epoch": 0.35, + "learning_rate": 1.5124497924964966e-05, + "loss": 0.4772, + "step": 3053 + }, + { + "epoch": 0.35, + "learning_rate": 1.512131890114497e-05, + "loss": 0.473, + "step": 3054 + }, + { + "epoch": 0.35, + "learning_rate": 1.5118139175589944e-05, + "loss": 0.4603, + "step": 3055 + }, + { + "epoch": 0.35, + "learning_rate": 1.5114958748735584e-05, + "loss": 0.4807, + "step": 3056 + }, + { + 
"epoch": 0.35, + "learning_rate": 1.5111777621017677e-05, + "loss": 0.4671, + "step": 3057 + }, + { + "epoch": 0.35, + "learning_rate": 1.5108595792872112e-05, + "loss": 0.4546, + "step": 3058 + }, + { + "epoch": 0.35, + "learning_rate": 1.5105413264734866e-05, + "loss": 0.4701, + "step": 3059 + }, + { + "epoch": 0.35, + "learning_rate": 1.5102230037042018e-05, + "loss": 0.4811, + "step": 3060 + }, + { + "epoch": 0.35, + "learning_rate": 1.5099046110229742e-05, + "loss": 0.4597, + "step": 3061 + }, + { + "epoch": 0.35, + "learning_rate": 1.5095861484734307e-05, + "loss": 0.4749, + "step": 3062 + }, + { + "epoch": 0.35, + "learning_rate": 1.5092676160992077e-05, + "loss": 0.4709, + "step": 3063 + }, + { + "epoch": 0.35, + "learning_rate": 1.5089490139439514e-05, + "loss": 0.4601, + "step": 3064 + }, + { + "epoch": 0.35, + "learning_rate": 1.508630342051317e-05, + "loss": 0.4482, + "step": 3065 + }, + { + "epoch": 0.35, + "learning_rate": 1.5083116004649703e-05, + "loss": 0.4873, + "step": 3066 + }, + { + "epoch": 0.35, + "learning_rate": 1.5079927892285855e-05, + "loss": 0.4526, + "step": 3067 + }, + { + "epoch": 0.35, + "learning_rate": 1.5076739083858472e-05, + "loss": 0.4646, + "step": 3068 + }, + { + "epoch": 0.35, + "learning_rate": 1.5073549579804493e-05, + "loss": 0.4813, + "step": 3069 + }, + { + "epoch": 0.35, + "learning_rate": 1.5070359380560944e-05, + "loss": 0.4693, + "step": 3070 + }, + { + "epoch": 0.35, + "learning_rate": 1.5067168486564959e-05, + "loss": 0.4967, + "step": 3071 + }, + { + "epoch": 0.35, + "learning_rate": 1.5063976898253763e-05, + "loss": 0.4671, + "step": 3072 + }, + { + "epoch": 0.35, + "learning_rate": 1.506078461606467e-05, + "loss": 0.4815, + "step": 3073 + }, + { + "epoch": 0.35, + "learning_rate": 1.5057591640435098e-05, + "loss": 0.471, + "step": 3074 + }, + { + "epoch": 0.35, + "learning_rate": 1.5054397971802557e-05, + "loss": 0.4657, + "step": 3075 + }, + { + "epoch": 0.35, + "learning_rate": 1.5051203610604643e-05, + 
"loss": 0.4786, + "step": 3076 + }, + { + "epoch": 0.35, + "learning_rate": 1.5048008557279064e-05, + "loss": 0.4743, + "step": 3077 + }, + { + "epoch": 0.35, + "learning_rate": 1.504481281226361e-05, + "loss": 0.4958, + "step": 3078 + }, + { + "epoch": 0.35, + "learning_rate": 1.504161637599617e-05, + "loss": 0.4849, + "step": 3079 + }, + { + "epoch": 0.35, + "learning_rate": 1.5038419248914725e-05, + "loss": 0.4544, + "step": 3080 + }, + { + "epoch": 0.35, + "learning_rate": 1.5035221431457352e-05, + "loss": 0.478, + "step": 3081 + }, + { + "epoch": 0.35, + "learning_rate": 1.5032022924062228e-05, + "loss": 0.4758, + "step": 3082 + }, + { + "epoch": 0.35, + "learning_rate": 1.5028823727167621e-05, + "loss": 0.4713, + "step": 3083 + }, + { + "epoch": 0.35, + "learning_rate": 1.5025623841211885e-05, + "loss": 0.4591, + "step": 3084 + }, + { + "epoch": 0.35, + "learning_rate": 1.502242326663348e-05, + "loss": 0.457, + "step": 3085 + }, + { + "epoch": 0.35, + "learning_rate": 1.5019222003870954e-05, + "loss": 0.4923, + "step": 3086 + }, + { + "epoch": 0.35, + "learning_rate": 1.501602005336296e-05, + "loss": 0.4548, + "step": 3087 + }, + { + "epoch": 0.35, + "learning_rate": 1.5012817415548226e-05, + "loss": 0.4693, + "step": 3088 + }, + { + "epoch": 0.35, + "learning_rate": 1.500961409086559e-05, + "loss": 0.4745, + "step": 3089 + }, + { + "epoch": 0.35, + "learning_rate": 1.5006410079753974e-05, + "loss": 0.4642, + "step": 3090 + }, + { + "epoch": 0.35, + "learning_rate": 1.5003205382652409e-05, + "loss": 0.4622, + "step": 3091 + }, + { + "epoch": 0.35, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4943, + "step": 3092 + }, + { + "epoch": 0.35, + "learning_rate": 1.4996793932235965e-05, + "loss": 0.4657, + "step": 3093 + }, + { + "epoch": 0.35, + "learning_rate": 1.4993587179799598e-05, + "loss": 0.4849, + "step": 3094 + }, + { + "epoch": 0.35, + "learning_rate": 1.49903797431303e-05, + "loss": 0.4615, + "step": 3095 + }, + { + "epoch": 0.35, + 
"learning_rate": 1.4987171622667562e-05, + "loss": 0.4694, + "step": 3096 + }, + { + "epoch": 0.35, + "learning_rate": 1.4983962818850967e-05, + "loss": 0.4771, + "step": 3097 + }, + { + "epoch": 0.35, + "learning_rate": 1.4980753332120193e-05, + "loss": 0.4754, + "step": 3098 + }, + { + "epoch": 0.35, + "learning_rate": 1.4977543162915011e-05, + "loss": 0.4576, + "step": 3099 + }, + { + "epoch": 0.35, + "learning_rate": 1.4974332311675286e-05, + "loss": 0.4701, + "step": 3100 + }, + { + "epoch": 0.35, + "learning_rate": 1.497112077884098e-05, + "loss": 0.4787, + "step": 3101 + }, + { + "epoch": 0.35, + "learning_rate": 1.4967908564852137e-05, + "loss": 0.5031, + "step": 3102 + }, + { + "epoch": 0.35, + "learning_rate": 1.4964695670148907e-05, + "loss": 0.47, + "step": 3103 + }, + { + "epoch": 0.35, + "learning_rate": 1.4961482095171529e-05, + "loss": 0.4716, + "step": 3104 + }, + { + "epoch": 0.35, + "learning_rate": 1.4958267840360332e-05, + "loss": 0.4739, + "step": 3105 + }, + { + "epoch": 0.35, + "learning_rate": 1.495505290615574e-05, + "loss": 0.4693, + "step": 3106 + }, + { + "epoch": 0.36, + "learning_rate": 1.4951837292998277e-05, + "loss": 0.4659, + "step": 3107 + }, + { + "epoch": 0.36, + "learning_rate": 1.4948621001328544e-05, + "loss": 0.4686, + "step": 3108 + }, + { + "epoch": 0.36, + "learning_rate": 1.4945404031587255e-05, + "loss": 0.4645, + "step": 3109 + }, + { + "epoch": 0.36, + "learning_rate": 1.4942186384215198e-05, + "loss": 0.4806, + "step": 3110 + }, + { + "epoch": 0.36, + "learning_rate": 1.4938968059653269e-05, + "loss": 0.4747, + "step": 3111 + }, + { + "epoch": 0.36, + "learning_rate": 1.4935749058342446e-05, + "loss": 0.4749, + "step": 3112 + }, + { + "epoch": 0.36, + "learning_rate": 1.4932529380723806e-05, + "loss": 0.4757, + "step": 3113 + }, + { + "epoch": 0.36, + "learning_rate": 1.4929309027238517e-05, + "loss": 0.4718, + "step": 3114 + }, + { + "epoch": 0.36, + "learning_rate": 1.4926087998327838e-05, + "loss": 0.4873, + 
"step": 3115 + }, + { + "epoch": 0.36, + "learning_rate": 1.4922866294433122e-05, + "loss": 0.458, + "step": 3116 + }, + { + "epoch": 0.36, + "learning_rate": 1.4919643915995816e-05, + "loss": 0.4518, + "step": 3117 + }, + { + "epoch": 0.36, + "learning_rate": 1.4916420863457456e-05, + "loss": 0.484, + "step": 3118 + }, + { + "epoch": 0.36, + "learning_rate": 1.4913197137259675e-05, + "loss": 0.4704, + "step": 3119 + }, + { + "epoch": 0.36, + "learning_rate": 1.490997273784419e-05, + "loss": 0.4703, + "step": 3120 + }, + { + "epoch": 0.36, + "learning_rate": 1.4906747665652821e-05, + "loss": 0.472, + "step": 3121 + }, + { + "epoch": 0.36, + "learning_rate": 1.4903521921127472e-05, + "loss": 0.4628, + "step": 3122 + }, + { + "epoch": 0.36, + "learning_rate": 1.4900295504710143e-05, + "loss": 0.4817, + "step": 3123 + }, + { + "epoch": 0.36, + "learning_rate": 1.4897068416842926e-05, + "loss": 0.464, + "step": 3124 + }, + { + "epoch": 0.36, + "learning_rate": 1.4893840657968001e-05, + "loss": 0.4822, + "step": 3125 + }, + { + "epoch": 0.36, + "learning_rate": 1.4890612228527648e-05, + "loss": 0.4618, + "step": 3126 + }, + { + "epoch": 0.36, + "learning_rate": 1.4887383128964232e-05, + "loss": 0.4776, + "step": 3127 + }, + { + "epoch": 0.36, + "learning_rate": 1.4884153359720205e-05, + "loss": 0.4775, + "step": 3128 + }, + { + "epoch": 0.36, + "learning_rate": 1.4880922921238128e-05, + "loss": 0.4653, + "step": 3129 + }, + { + "epoch": 0.36, + "learning_rate": 1.4877691813960638e-05, + "loss": 0.4933, + "step": 3130 + }, + { + "epoch": 0.36, + "learning_rate": 1.4874460038330469e-05, + "loss": 0.465, + "step": 3131 + }, + { + "epoch": 0.36, + "learning_rate": 1.4871227594790447e-05, + "loss": 0.4592, + "step": 3132 + }, + { + "epoch": 0.36, + "learning_rate": 1.4867994483783485e-05, + "loss": 0.4704, + "step": 3133 + }, + { + "epoch": 0.36, + "learning_rate": 1.48647607057526e-05, + "loss": 0.4597, + "step": 3134 + }, + { + "epoch": 0.36, + "learning_rate": 
1.4861526261140886e-05, + "loss": 0.4649, + "step": 3135 + }, + { + "epoch": 0.36, + "learning_rate": 1.4858291150391533e-05, + "loss": 0.4881, + "step": 3136 + }, + { + "epoch": 0.36, + "learning_rate": 1.4855055373947829e-05, + "loss": 0.4686, + "step": 3137 + }, + { + "epoch": 0.36, + "learning_rate": 1.4851818932253137e-05, + "loss": 0.4904, + "step": 3138 + }, + { + "epoch": 0.36, + "learning_rate": 1.4848581825750935e-05, + "loss": 0.4775, + "step": 3139 + }, + { + "epoch": 0.36, + "learning_rate": 1.4845344054884772e-05, + "loss": 0.456, + "step": 3140 + }, + { + "epoch": 0.36, + "learning_rate": 1.4842105620098292e-05, + "loss": 0.4727, + "step": 3141 + }, + { + "epoch": 0.36, + "learning_rate": 1.4838866521835238e-05, + "loss": 0.4698, + "step": 3142 + }, + { + "epoch": 0.36, + "learning_rate": 1.4835626760539437e-05, + "loss": 0.4501, + "step": 3143 + }, + { + "epoch": 0.36, + "learning_rate": 1.483238633665481e-05, + "loss": 0.4737, + "step": 3144 + }, + { + "epoch": 0.36, + "learning_rate": 1.4829145250625368e-05, + "loss": 0.476, + "step": 3145 + }, + { + "epoch": 0.36, + "learning_rate": 1.4825903502895207e-05, + "loss": 0.4562, + "step": 3146 + }, + { + "epoch": 0.36, + "learning_rate": 1.4822661093908521e-05, + "loss": 0.4805, + "step": 3147 + }, + { + "epoch": 0.36, + "learning_rate": 1.4819418024109595e-05, + "loss": 0.4816, + "step": 3148 + }, + { + "epoch": 0.36, + "learning_rate": 1.4816174293942804e-05, + "loss": 0.4696, + "step": 3149 + }, + { + "epoch": 0.36, + "learning_rate": 1.4812929903852606e-05, + "loss": 0.4819, + "step": 3150 + }, + { + "epoch": 0.36, + "learning_rate": 1.4809684854283557e-05, + "loss": 0.4685, + "step": 3151 + }, + { + "epoch": 0.36, + "learning_rate": 1.4806439145680298e-05, + "loss": 0.4632, + "step": 3152 + }, + { + "epoch": 0.36, + "learning_rate": 1.4803192778487569e-05, + "loss": 0.4622, + "step": 3153 + }, + { + "epoch": 0.36, + "learning_rate": 1.4799945753150194e-05, + "loss": 0.508, + "step": 3154 + }, + { 
+ "epoch": 0.36, + "learning_rate": 1.4796698070113084e-05, + "loss": 0.4752, + "step": 3155 + }, + { + "epoch": 0.36, + "learning_rate": 1.4793449729821248e-05, + "loss": 0.4801, + "step": 3156 + }, + { + "epoch": 0.36, + "learning_rate": 1.4790200732719779e-05, + "loss": 0.4513, + "step": 3157 + }, + { + "epoch": 0.36, + "learning_rate": 1.4786951079253861e-05, + "loss": 0.4687, + "step": 3158 + }, + { + "epoch": 0.36, + "learning_rate": 1.4783700769868775e-05, + "loss": 0.5047, + "step": 3159 + }, + { + "epoch": 0.36, + "learning_rate": 1.4780449805009878e-05, + "loss": 0.4439, + "step": 3160 + }, + { + "epoch": 0.36, + "learning_rate": 1.477719818512263e-05, + "loss": 0.481, + "step": 3161 + }, + { + "epoch": 0.36, + "learning_rate": 1.4773945910652576e-05, + "loss": 0.4716, + "step": 3162 + }, + { + "epoch": 0.36, + "learning_rate": 1.4770692982045344e-05, + "loss": 0.4807, + "step": 3163 + }, + { + "epoch": 0.36, + "learning_rate": 1.4767439399746666e-05, + "loss": 0.476, + "step": 3164 + }, + { + "epoch": 0.36, + "learning_rate": 1.4764185164202349e-05, + "loss": 0.4745, + "step": 3165 + }, + { + "epoch": 0.36, + "learning_rate": 1.47609302758583e-05, + "loss": 0.4644, + "step": 3166 + }, + { + "epoch": 0.36, + "learning_rate": 1.4757674735160512e-05, + "loss": 0.4823, + "step": 3167 + }, + { + "epoch": 0.36, + "learning_rate": 1.475441854255506e-05, + "loss": 0.464, + "step": 3168 + }, + { + "epoch": 0.36, + "learning_rate": 1.4751161698488124e-05, + "loss": 0.4667, + "step": 3169 + }, + { + "epoch": 0.36, + "learning_rate": 1.4747904203405959e-05, + "loss": 0.4758, + "step": 3170 + }, + { + "epoch": 0.36, + "learning_rate": 1.4744646057754913e-05, + "loss": 0.4825, + "step": 3171 + }, + { + "epoch": 0.36, + "learning_rate": 1.4741387261981428e-05, + "loss": 0.447, + "step": 3172 + }, + { + "epoch": 0.36, + "learning_rate": 1.4738127816532034e-05, + "loss": 0.4674, + "step": 3173 + }, + { + "epoch": 0.36, + "learning_rate": 1.4734867721853341e-05, + "loss": 
0.4779, + "step": 3174 + }, + { + "epoch": 0.36, + "learning_rate": 1.4731606978392061e-05, + "loss": 0.4519, + "step": 3175 + }, + { + "epoch": 0.36, + "learning_rate": 1.4728345586594986e-05, + "loss": 0.4539, + "step": 3176 + }, + { + "epoch": 0.36, + "learning_rate": 1.4725083546909e-05, + "loss": 0.474, + "step": 3177 + }, + { + "epoch": 0.36, + "learning_rate": 1.4721820859781076e-05, + "loss": 0.479, + "step": 3178 + }, + { + "epoch": 0.36, + "learning_rate": 1.4718557525658272e-05, + "loss": 0.477, + "step": 3179 + }, + { + "epoch": 0.36, + "learning_rate": 1.471529354498774e-05, + "loss": 0.4798, + "step": 3180 + }, + { + "epoch": 0.36, + "learning_rate": 1.471202891821672e-05, + "loss": 0.4852, + "step": 3181 + }, + { + "epoch": 0.36, + "learning_rate": 1.4708763645792531e-05, + "loss": 0.4692, + "step": 3182 + }, + { + "epoch": 0.36, + "learning_rate": 1.4705497728162602e-05, + "loss": 0.4714, + "step": 3183 + }, + { + "epoch": 0.36, + "learning_rate": 1.4702231165774423e-05, + "loss": 0.4565, + "step": 3184 + }, + { + "epoch": 0.36, + "learning_rate": 1.4698963959075592e-05, + "loss": 0.494, + "step": 3185 + }, + { + "epoch": 0.36, + "learning_rate": 1.469569610851379e-05, + "loss": 0.458, + "step": 3186 + }, + { + "epoch": 0.36, + "learning_rate": 1.4692427614536783e-05, + "loss": 0.4748, + "step": 3187 + }, + { + "epoch": 0.36, + "learning_rate": 1.4689158477592433e-05, + "loss": 0.4661, + "step": 3188 + }, + { + "epoch": 0.36, + "learning_rate": 1.4685888698128677e-05, + "loss": 0.4881, + "step": 3189 + }, + { + "epoch": 0.36, + "learning_rate": 1.468261827659355e-05, + "loss": 0.4749, + "step": 3190 + }, + { + "epoch": 0.36, + "learning_rate": 1.4679347213435176e-05, + "loss": 0.4825, + "step": 3191 + }, + { + "epoch": 0.36, + "learning_rate": 1.4676075509101763e-05, + "loss": 0.4861, + "step": 3192 + }, + { + "epoch": 0.36, + "learning_rate": 1.4672803164041604e-05, + "loss": 0.4816, + "step": 3193 + }, + { + "epoch": 0.37, + "learning_rate": 
1.4669530178703089e-05, + "loss": 0.4642, + "step": 3194 + }, + { + "epoch": 0.37, + "learning_rate": 1.4666256553534681e-05, + "loss": 0.4829, + "step": 3195 + }, + { + "epoch": 0.37, + "learning_rate": 1.466298228898495e-05, + "loss": 0.4793, + "step": 3196 + }, + { + "epoch": 0.37, + "learning_rate": 1.465970738550254e-05, + "loss": 0.4768, + "step": 3197 + }, + { + "epoch": 0.37, + "learning_rate": 1.4656431843536182e-05, + "loss": 0.4439, + "step": 3198 + }, + { + "epoch": 0.37, + "learning_rate": 1.4653155663534702e-05, + "loss": 0.4805, + "step": 3199 + }, + { + "epoch": 0.37, + "learning_rate": 1.464987884594701e-05, + "loss": 0.4833, + "step": 3200 + }, + { + "epoch": 0.37, + "learning_rate": 1.4646601391222102e-05, + "loss": 0.4571, + "step": 3201 + }, + { + "epoch": 0.37, + "learning_rate": 1.464332329980906e-05, + "loss": 0.4729, + "step": 3202 + }, + { + "epoch": 0.37, + "learning_rate": 1.4640044572157062e-05, + "loss": 0.4818, + "step": 3203 + }, + { + "epoch": 0.37, + "learning_rate": 1.4636765208715358e-05, + "loss": 0.4877, + "step": 3204 + }, + { + "epoch": 0.37, + "learning_rate": 1.4633485209933305e-05, + "loss": 0.4683, + "step": 3205 + }, + { + "epoch": 0.37, + "learning_rate": 1.4630204576260328e-05, + "loss": 0.4616, + "step": 3206 + }, + { + "epoch": 0.37, + "learning_rate": 1.4626923308145948e-05, + "loss": 0.4905, + "step": 3207 + }, + { + "epoch": 0.37, + "learning_rate": 1.4623641406039776e-05, + "loss": 0.4609, + "step": 3208 + }, + { + "epoch": 0.37, + "learning_rate": 1.46203588703915e-05, + "loss": 0.4601, + "step": 3209 + }, + { + "epoch": 0.37, + "learning_rate": 1.4617075701650907e-05, + "loss": 0.4575, + "step": 3210 + }, + { + "epoch": 0.37, + "learning_rate": 1.461379190026786e-05, + "loss": 0.4594, + "step": 3211 + }, + { + "epoch": 0.37, + "learning_rate": 1.4610507466692312e-05, + "loss": 0.4681, + "step": 3212 + }, + { + "epoch": 0.37, + "learning_rate": 1.460722240137431e-05, + "loss": 0.48, + "step": 3213 + }, + { + 
"epoch": 0.37, + "learning_rate": 1.4603936704763975e-05, + "loss": 0.4612, + "step": 3214 + }, + { + "epoch": 0.37, + "learning_rate": 1.4600650377311523e-05, + "loss": 0.4754, + "step": 3215 + }, + { + "epoch": 0.37, + "learning_rate": 1.4597363419467257e-05, + "loss": 0.4533, + "step": 3216 + }, + { + "epoch": 0.37, + "learning_rate": 1.4594075831681557e-05, + "loss": 0.4847, + "step": 3217 + }, + { + "epoch": 0.37, + "learning_rate": 1.4590787614404902e-05, + "loss": 0.4619, + "step": 3218 + }, + { + "epoch": 0.37, + "learning_rate": 1.4587498768087849e-05, + "loss": 0.4724, + "step": 3219 + }, + { + "epoch": 0.37, + "learning_rate": 1.4584209293181044e-05, + "loss": 0.4591, + "step": 3220 + }, + { + "epoch": 0.37, + "learning_rate": 1.4580919190135219e-05, + "loss": 0.4992, + "step": 3221 + }, + { + "epoch": 0.37, + "learning_rate": 1.4577628459401188e-05, + "loss": 0.4668, + "step": 3222 + }, + { + "epoch": 0.37, + "learning_rate": 1.457433710142986e-05, + "loss": 0.4642, + "step": 3223 + }, + { + "epoch": 0.37, + "learning_rate": 1.4571045116672219e-05, + "loss": 0.4759, + "step": 3224 + }, + { + "epoch": 0.37, + "learning_rate": 1.4567752505579345e-05, + "loss": 0.4752, + "step": 3225 + }, + { + "epoch": 0.37, + "learning_rate": 1.4564459268602396e-05, + "loss": 0.4603, + "step": 3226 + }, + { + "epoch": 0.37, + "learning_rate": 1.4561165406192622e-05, + "loss": 0.4835, + "step": 3227 + }, + { + "epoch": 0.37, + "learning_rate": 1.455787091880135e-05, + "loss": 0.466, + "step": 3228 + }, + { + "epoch": 0.37, + "learning_rate": 1.4554575806880005e-05, + "loss": 0.4776, + "step": 3229 + }, + { + "epoch": 0.37, + "learning_rate": 1.4551280070880089e-05, + "loss": 0.4615, + "step": 3230 + }, + { + "epoch": 0.37, + "learning_rate": 1.454798371125319e-05, + "loss": 0.4812, + "step": 3231 + }, + { + "epoch": 0.37, + "learning_rate": 1.4544686728450982e-05, + "loss": 0.4703, + "step": 3232 + }, + { + "epoch": 0.37, + "learning_rate": 1.4541389122925229e-05, + 
"loss": 0.4607, + "step": 3233 + }, + { + "epoch": 0.37, + "learning_rate": 1.4538090895127774e-05, + "loss": 0.466, + "step": 3234 + }, + { + "epoch": 0.37, + "learning_rate": 1.4534792045510548e-05, + "loss": 0.4682, + "step": 3235 + }, + { + "epoch": 0.37, + "learning_rate": 1.453149257452557e-05, + "loss": 0.4742, + "step": 3236 + }, + { + "epoch": 0.37, + "learning_rate": 1.4528192482624932e-05, + "loss": 0.4683, + "step": 3237 + }, + { + "epoch": 0.37, + "learning_rate": 1.4524891770260831e-05, + "loss": 0.4715, + "step": 3238 + }, + { + "epoch": 0.37, + "learning_rate": 1.4521590437885533e-05, + "loss": 0.4758, + "step": 3239 + }, + { + "epoch": 0.37, + "learning_rate": 1.4518288485951398e-05, + "loss": 0.4706, + "step": 3240 + }, + { + "epoch": 0.37, + "learning_rate": 1.4514985914910862e-05, + "loss": 0.4904, + "step": 3241 + }, + { + "epoch": 0.37, + "learning_rate": 1.451168272521645e-05, + "loss": 0.4547, + "step": 3242 + }, + { + "epoch": 0.37, + "learning_rate": 1.450837891732078e-05, + "loss": 0.4668, + "step": 3243 + }, + { + "epoch": 0.37, + "learning_rate": 1.4505074491676542e-05, + "loss": 0.457, + "step": 3244 + }, + { + "epoch": 0.37, + "learning_rate": 1.450176944873652e-05, + "loss": 0.4586, + "step": 3245 + }, + { + "epoch": 0.37, + "learning_rate": 1.4498463788953574e-05, + "loss": 0.4766, + "step": 3246 + }, + { + "epoch": 0.37, + "learning_rate": 1.4495157512780655e-05, + "loss": 0.4627, + "step": 3247 + }, + { + "epoch": 0.37, + "learning_rate": 1.4491850620670798e-05, + "loss": 0.487, + "step": 3248 + }, + { + "epoch": 0.37, + "learning_rate": 1.4488543113077121e-05, + "loss": 0.4664, + "step": 3249 + }, + { + "epoch": 0.37, + "learning_rate": 1.4485234990452826e-05, + "loss": 0.4712, + "step": 3250 + }, + { + "epoch": 0.37, + "learning_rate": 1.4481926253251197e-05, + "loss": 0.4645, + "step": 3251 + }, + { + "epoch": 0.37, + "learning_rate": 1.4478616901925606e-05, + "loss": 0.4795, + "step": 3252 + }, + { + "epoch": 0.37, + 
"learning_rate": 1.4475306936929513e-05, + "loss": 0.4754, + "step": 3253 + }, + { + "epoch": 0.37, + "learning_rate": 1.4471996358716451e-05, + "loss": 0.4654, + "step": 3254 + }, + { + "epoch": 0.37, + "learning_rate": 1.4468685167740044e-05, + "loss": 0.4913, + "step": 3255 + }, + { + "epoch": 0.37, + "learning_rate": 1.4465373364454001e-05, + "loss": 0.4487, + "step": 3256 + }, + { + "epoch": 0.37, + "learning_rate": 1.4462060949312114e-05, + "loss": 0.4955, + "step": 3257 + }, + { + "epoch": 0.37, + "learning_rate": 1.4458747922768256e-05, + "loss": 0.4717, + "step": 3258 + }, + { + "epoch": 0.37, + "learning_rate": 1.4455434285276385e-05, + "loss": 0.4566, + "step": 3259 + }, + { + "epoch": 0.37, + "learning_rate": 1.4452120037290547e-05, + "loss": 0.4784, + "step": 3260 + }, + { + "epoch": 0.37, + "learning_rate": 1.444880517926486e-05, + "loss": 0.4602, + "step": 3261 + }, + { + "epoch": 0.37, + "learning_rate": 1.4445489711653542e-05, + "loss": 0.4926, + "step": 3262 + }, + { + "epoch": 0.37, + "learning_rate": 1.4442173634910881e-05, + "loss": 0.4705, + "step": 3263 + }, + { + "epoch": 0.37, + "learning_rate": 1.4438856949491258e-05, + "loss": 0.4573, + "step": 3264 + }, + { + "epoch": 0.37, + "learning_rate": 1.4435539655849126e-05, + "loss": 0.4668, + "step": 3265 + }, + { + "epoch": 0.37, + "learning_rate": 1.4432221754439037e-05, + "loss": 0.4646, + "step": 3266 + }, + { + "epoch": 0.37, + "learning_rate": 1.4428903245715611e-05, + "loss": 0.4654, + "step": 3267 + }, + { + "epoch": 0.37, + "learning_rate": 1.442558413013356e-05, + "loss": 0.4748, + "step": 3268 + }, + { + "epoch": 0.37, + "learning_rate": 1.4422264408147676e-05, + "loss": 0.4817, + "step": 3269 + }, + { + "epoch": 0.37, + "learning_rate": 1.4418944080212838e-05, + "loss": 0.4665, + "step": 3270 + }, + { + "epoch": 0.37, + "learning_rate": 1.4415623146784e-05, + "loss": 0.4809, + "step": 3271 + }, + { + "epoch": 0.37, + "learning_rate": 1.441230160831621e-05, + "loss": 0.4672, + 
"step": 3272 + }, + { + "epoch": 0.37, + "learning_rate": 1.4408979465264588e-05, + "loss": 0.4713, + "step": 3273 + }, + { + "epoch": 0.37, + "learning_rate": 1.4405656718084344e-05, + "loss": 0.4615, + "step": 3274 + }, + { + "epoch": 0.37, + "learning_rate": 1.440233336723077e-05, + "loss": 0.5039, + "step": 3275 + }, + { + "epoch": 0.37, + "learning_rate": 1.4399009413159234e-05, + "loss": 0.4667, + "step": 3276 + }, + { + "epoch": 0.37, + "learning_rate": 1.4395684856325198e-05, + "loss": 0.4888, + "step": 3277 + }, + { + "epoch": 0.37, + "learning_rate": 1.4392359697184197e-05, + "loss": 0.4571, + "step": 3278 + }, + { + "epoch": 0.37, + "learning_rate": 1.4389033936191851e-05, + "loss": 0.4598, + "step": 3279 + }, + { + "epoch": 0.37, + "learning_rate": 1.4385707573803869e-05, + "loss": 0.4715, + "step": 3280 + }, + { + "epoch": 0.37, + "learning_rate": 1.4382380610476032e-05, + "loss": 0.5006, + "step": 3281 + }, + { + "epoch": 0.38, + "learning_rate": 1.4379053046664208e-05, + "loss": 0.4744, + "step": 3282 + }, + { + "epoch": 0.38, + "learning_rate": 1.437572488282435e-05, + "loss": 0.4742, + "step": 3283 + }, + { + "epoch": 0.38, + "learning_rate": 1.4372396119412493e-05, + "loss": 0.4635, + "step": 3284 + }, + { + "epoch": 0.38, + "learning_rate": 1.4369066756884745e-05, + "loss": 0.4539, + "step": 3285 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365736795697306e-05, + "loss": 0.4807, + "step": 3286 + }, + { + "epoch": 0.38, + "learning_rate": 1.436240623630646e-05, + "loss": 0.486, + "step": 3287 + }, + { + "epoch": 0.38, + "learning_rate": 1.4359075079168562e-05, + "loss": 0.4795, + "step": 3288 + }, + { + "epoch": 0.38, + "learning_rate": 1.4355743324740055e-05, + "loss": 0.4836, + "step": 3289 + }, + { + "epoch": 0.38, + "learning_rate": 1.4352410973477466e-05, + "loss": 0.4509, + "step": 3290 + }, + { + "epoch": 0.38, + "learning_rate": 1.4349078025837401e-05, + "loss": 0.4845, + "step": 3291 + }, + { + "epoch": 0.38, + "learning_rate": 
1.4345744482276551e-05, + "loss": 0.4521, + "step": 3292 + }, + { + "epoch": 0.38, + "learning_rate": 1.4342410343251683e-05, + "loss": 0.461, + "step": 3293 + }, + { + "epoch": 0.38, + "learning_rate": 1.4339075609219645e-05, + "loss": 0.4714, + "step": 3294 + }, + { + "epoch": 0.38, + "learning_rate": 1.4335740280637374e-05, + "loss": 0.4697, + "step": 3295 + }, + { + "epoch": 0.38, + "learning_rate": 1.4332404357961884e-05, + "loss": 0.4775, + "step": 3296 + }, + { + "epoch": 0.38, + "learning_rate": 1.4329067841650274e-05, + "loss": 0.4835, + "step": 3297 + }, + { + "epoch": 0.38, + "learning_rate": 1.4325730732159717e-05, + "loss": 0.4689, + "step": 3298 + }, + { + "epoch": 0.38, + "learning_rate": 1.432239302994747e-05, + "loss": 0.4673, + "step": 3299 + }, + { + "epoch": 0.38, + "learning_rate": 1.4319054735470879e-05, + "loss": 0.462, + "step": 3300 + }, + { + "epoch": 0.38, + "learning_rate": 1.4315715849187362e-05, + "loss": 0.4648, + "step": 3301 + }, + { + "epoch": 0.38, + "learning_rate": 1.4312376371554417e-05, + "loss": 0.4644, + "step": 3302 + }, + { + "epoch": 0.38, + "learning_rate": 1.4309036303029632e-05, + "loss": 0.4579, + "step": 3303 + }, + { + "epoch": 0.38, + "learning_rate": 1.4305695644070665e-05, + "loss": 0.4781, + "step": 3304 + }, + { + "epoch": 0.38, + "learning_rate": 1.4302354395135269e-05, + "loss": 0.4719, + "step": 3305 + }, + { + "epoch": 0.38, + "learning_rate": 1.4299012556681269e-05, + "loss": 0.4776, + "step": 3306 + }, + { + "epoch": 0.38, + "learning_rate": 1.4295670129166564e-05, + "loss": 0.4732, + "step": 3307 + }, + { + "epoch": 0.38, + "learning_rate": 1.4292327113049145e-05, + "loss": 0.4742, + "step": 3308 + }, + { + "epoch": 0.38, + "learning_rate": 1.428898350878708e-05, + "loss": 0.4832, + "step": 3309 + }, + { + "epoch": 0.38, + "learning_rate": 1.428563931683852e-05, + "loss": 0.4557, + "step": 3310 + }, + { + "epoch": 0.38, + "learning_rate": 1.4282294537661692e-05, + "loss": 0.4736, + "step": 3311 + }, + { 
+ "epoch": 0.38, + "learning_rate": 1.4278949171714904e-05, + "loss": 0.4591, + "step": 3312 + }, + { + "epoch": 0.38, + "learning_rate": 1.4275603219456544e-05, + "loss": 0.4673, + "step": 3313 + }, + { + "epoch": 0.38, + "learning_rate": 1.4272256681345087e-05, + "loss": 0.4759, + "step": 3314 + }, + { + "epoch": 0.38, + "learning_rate": 1.4268909557839085e-05, + "loss": 0.4635, + "step": 3315 + }, + { + "epoch": 0.38, + "learning_rate": 1.4265561849397163e-05, + "loss": 0.4533, + "step": 3316 + }, + { + "epoch": 0.38, + "learning_rate": 1.4262213556478033e-05, + "loss": 0.4715, + "step": 3317 + }, + { + "epoch": 0.38, + "learning_rate": 1.4258864679540488e-05, + "loss": 0.4616, + "step": 3318 + }, + { + "epoch": 0.38, + "learning_rate": 1.4255515219043398e-05, + "loss": 0.4528, + "step": 3319 + }, + { + "epoch": 0.38, + "learning_rate": 1.425216517544571e-05, + "loss": 0.4803, + "step": 3320 + }, + { + "epoch": 0.38, + "learning_rate": 1.4248814549206464e-05, + "loss": 0.462, + "step": 3321 + }, + { + "epoch": 0.38, + "learning_rate": 1.4245463340784761e-05, + "loss": 0.4962, + "step": 3322 + }, + { + "epoch": 0.38, + "learning_rate": 1.4242111550639797e-05, + "loss": 0.4631, + "step": 3323 + }, + { + "epoch": 0.38, + "learning_rate": 1.4238759179230841e-05, + "loss": 0.4858, + "step": 3324 + }, + { + "epoch": 0.38, + "learning_rate": 1.4235406227017241e-05, + "loss": 0.4673, + "step": 3325 + }, + { + "epoch": 0.38, + "learning_rate": 1.423205269445843e-05, + "loss": 0.4831, + "step": 3326 + }, + { + "epoch": 0.38, + "learning_rate": 1.4228698582013908e-05, + "loss": 0.4592, + "step": 3327 + }, + { + "epoch": 0.38, + "learning_rate": 1.4225343890143275e-05, + "loss": 0.4626, + "step": 3328 + }, + { + "epoch": 0.38, + "learning_rate": 1.4221988619306192e-05, + "loss": 0.4664, + "step": 3329 + }, + { + "epoch": 0.38, + "learning_rate": 1.4218632769962408e-05, + "loss": 0.4782, + "step": 3330 + }, + { + "epoch": 0.38, + "learning_rate": 1.4215276342571749e-05, + 
"loss": 0.4562, + "step": 3331 + }, + { + "epoch": 0.38, + "learning_rate": 1.4211919337594118e-05, + "loss": 0.4621, + "step": 3332 + }, + { + "epoch": 0.38, + "learning_rate": 1.4208561755489502e-05, + "loss": 0.4751, + "step": 3333 + }, + { + "epoch": 0.38, + "learning_rate": 1.4205203596717966e-05, + "loss": 0.4712, + "step": 3334 + }, + { + "epoch": 0.38, + "learning_rate": 1.420184486173965e-05, + "loss": 0.4627, + "step": 3335 + }, + { + "epoch": 0.38, + "learning_rate": 1.4198485551014778e-05, + "loss": 0.4534, + "step": 3336 + }, + { + "epoch": 0.38, + "learning_rate": 1.4195125665003648e-05, + "loss": 0.4545, + "step": 3337 + }, + { + "epoch": 0.38, + "learning_rate": 1.4191765204166643e-05, + "loss": 0.4793, + "step": 3338 + }, + { + "epoch": 0.38, + "learning_rate": 1.4188404168964219e-05, + "loss": 0.4492, + "step": 3339 + }, + { + "epoch": 0.38, + "learning_rate": 1.418504255985691e-05, + "loss": 0.4563, + "step": 3340 + }, + { + "epoch": 0.38, + "learning_rate": 1.4181680377305336e-05, + "loss": 0.4673, + "step": 3341 + }, + { + "epoch": 0.38, + "learning_rate": 1.4178317621770187e-05, + "loss": 0.4813, + "step": 3342 + }, + { + "epoch": 0.38, + "learning_rate": 1.4174954293712242e-05, + "loss": 0.4651, + "step": 3343 + }, + { + "epoch": 0.38, + "learning_rate": 1.4171590393592346e-05, + "loss": 0.469, + "step": 3344 + }, + { + "epoch": 0.38, + "learning_rate": 1.4168225921871433e-05, + "loss": 0.4549, + "step": 3345 + }, + { + "epoch": 0.38, + "learning_rate": 1.4164860879010502e-05, + "loss": 0.4827, + "step": 3346 + }, + { + "epoch": 0.38, + "learning_rate": 1.4161495265470649e-05, + "loss": 0.4757, + "step": 3347 + }, + { + "epoch": 0.38, + "learning_rate": 1.4158129081713035e-05, + "loss": 0.4788, + "step": 3348 + }, + { + "epoch": 0.38, + "learning_rate": 1.41547623281989e-05, + "loss": 0.4634, + "step": 3349 + }, + { + "epoch": 0.38, + "learning_rate": 1.415139500538957e-05, + "loss": 0.4736, + "step": 3350 + }, + { + "epoch": 0.38, + 
"learning_rate": 1.4148027113746435e-05, + "loss": 0.4737, + "step": 3351 + }, + { + "epoch": 0.38, + "learning_rate": 1.4144658653730976e-05, + "loss": 0.4693, + "step": 3352 + }, + { + "epoch": 0.38, + "learning_rate": 1.4141289625804748e-05, + "loss": 0.4686, + "step": 3353 + }, + { + "epoch": 0.38, + "learning_rate": 1.4137920030429386e-05, + "loss": 0.4651, + "step": 3354 + }, + { + "epoch": 0.38, + "learning_rate": 1.4134549868066594e-05, + "loss": 0.4646, + "step": 3355 + }, + { + "epoch": 0.38, + "learning_rate": 1.4131179139178157e-05, + "loss": 0.4861, + "step": 3356 + }, + { + "epoch": 0.38, + "learning_rate": 1.4127807844225947e-05, + "loss": 0.4695, + "step": 3357 + }, + { + "epoch": 0.38, + "learning_rate": 1.4124435983671907e-05, + "loss": 0.4706, + "step": 3358 + }, + { + "epoch": 0.38, + "learning_rate": 1.4121063557978051e-05, + "loss": 0.4766, + "step": 3359 + }, + { + "epoch": 0.38, + "learning_rate": 1.4117690567606483e-05, + "loss": 0.4701, + "step": 3360 + }, + { + "epoch": 0.38, + "learning_rate": 1.411431701301937e-05, + "loss": 0.4642, + "step": 3361 + }, + { + "epoch": 0.38, + "learning_rate": 1.4110942894678971e-05, + "loss": 0.4676, + "step": 3362 + }, + { + "epoch": 0.38, + "learning_rate": 1.410756821304762e-05, + "loss": 0.4786, + "step": 3363 + }, + { + "epoch": 0.38, + "learning_rate": 1.410419296858771e-05, + "loss": 0.47, + "step": 3364 + }, + { + "epoch": 0.38, + "learning_rate": 1.4100817161761738e-05, + "loss": 0.4691, + "step": 3365 + }, + { + "epoch": 0.38, + "learning_rate": 1.4097440793032253e-05, + "loss": 0.4569, + "step": 3366 + }, + { + "epoch": 0.38, + "learning_rate": 1.4094063862861904e-05, + "loss": 0.4553, + "step": 3367 + }, + { + "epoch": 0.38, + "learning_rate": 1.4090686371713403e-05, + "loss": 0.4536, + "step": 3368 + }, + { + "epoch": 0.39, + "learning_rate": 1.4087308320049536e-05, + "loss": 0.452, + "step": 3369 + }, + { + "epoch": 0.39, + "learning_rate": 1.4083929708333173e-05, + "loss": 0.4892, + 
"step": 3370 + }, + { + "epoch": 0.39, + "learning_rate": 1.4080550537027264e-05, + "loss": 0.4583, + "step": 3371 + }, + { + "epoch": 0.39, + "learning_rate": 1.4077170806594831e-05, + "loss": 0.4813, + "step": 3372 + }, + { + "epoch": 0.39, + "learning_rate": 1.4073790517498967e-05, + "loss": 0.4754, + "step": 3373 + }, + { + "epoch": 0.39, + "learning_rate": 1.4070409670202849e-05, + "loss": 0.4719, + "step": 3374 + }, + { + "epoch": 0.39, + "learning_rate": 1.4067028265169728e-05, + "loss": 0.4677, + "step": 3375 + }, + { + "epoch": 0.39, + "learning_rate": 1.4063646302862938e-05, + "loss": 0.4727, + "step": 3376 + }, + { + "epoch": 0.39, + "learning_rate": 1.406026378374588e-05, + "loss": 0.4658, + "step": 3377 + }, + { + "epoch": 0.39, + "learning_rate": 1.405688070828203e-05, + "loss": 0.4719, + "step": 3378 + }, + { + "epoch": 0.39, + "learning_rate": 1.4053497076934948e-05, + "loss": 0.4711, + "step": 3379 + }, + { + "epoch": 0.39, + "learning_rate": 1.405011289016827e-05, + "loss": 0.4865, + "step": 3380 + }, + { + "epoch": 0.39, + "learning_rate": 1.4046728148445701e-05, + "loss": 0.4585, + "step": 3381 + }, + { + "epoch": 0.39, + "learning_rate": 1.4043342852231027e-05, + "loss": 0.4705, + "step": 3382 + }, + { + "epoch": 0.39, + "learning_rate": 1.4039957001988112e-05, + "loss": 0.4612, + "step": 3383 + }, + { + "epoch": 0.39, + "learning_rate": 1.4036570598180888e-05, + "loss": 0.4656, + "step": 3384 + }, + { + "epoch": 0.39, + "learning_rate": 1.4033183641273374e-05, + "loss": 0.4502, + "step": 3385 + }, + { + "epoch": 0.39, + "learning_rate": 1.4029796131729652e-05, + "loss": 0.4727, + "step": 3386 + }, + { + "epoch": 0.39, + "learning_rate": 1.4026408070013892e-05, + "loss": 0.4752, + "step": 3387 + }, + { + "epoch": 0.39, + "learning_rate": 1.4023019456590335e-05, + "loss": 0.4736, + "step": 3388 + }, + { + "epoch": 0.39, + "learning_rate": 1.4019630291923289e-05, + "loss": 0.4678, + "step": 3389 + }, + { + "epoch": 0.39, + "learning_rate": 
1.4016240576477152e-05, + "loss": 0.483, + "step": 3390 + }, + { + "epoch": 0.39, + "learning_rate": 1.401285031071639e-05, + "loss": 0.4638, + "step": 3391 + }, + { + "epoch": 0.39, + "learning_rate": 1.4009459495105542e-05, + "loss": 0.4708, + "step": 3392 + }, + { + "epoch": 0.39, + "learning_rate": 1.400606813010923e-05, + "loss": 0.4631, + "step": 3393 + }, + { + "epoch": 0.39, + "learning_rate": 1.4002676216192141e-05, + "loss": 0.4596, + "step": 3394 + }, + { + "epoch": 0.39, + "learning_rate": 1.3999283753819047e-05, + "loss": 0.4453, + "step": 3395 + }, + { + "epoch": 0.39, + "learning_rate": 1.3995890743454789e-05, + "loss": 0.4751, + "step": 3396 + }, + { + "epoch": 0.39, + "learning_rate": 1.3992497185564289e-05, + "loss": 0.4807, + "step": 3397 + }, + { + "epoch": 0.39, + "learning_rate": 1.3989103080612533e-05, + "loss": 0.4586, + "step": 3398 + }, + { + "epoch": 0.39, + "learning_rate": 1.3985708429064598e-05, + "loss": 0.4707, + "step": 3399 + }, + { + "epoch": 0.39, + "learning_rate": 1.3982313231385622e-05, + "loss": 0.4871, + "step": 3400 + }, + { + "epoch": 0.39, + "learning_rate": 1.3978917488040822e-05, + "loss": 0.4575, + "step": 3401 + }, + { + "epoch": 0.39, + "learning_rate": 1.3975521199495495e-05, + "loss": 0.446, + "step": 3402 + }, + { + "epoch": 0.39, + "learning_rate": 1.3972124366215002e-05, + "loss": 0.4673, + "step": 3403 + }, + { + "epoch": 0.39, + "learning_rate": 1.3968726988664788e-05, + "loss": 0.4686, + "step": 3404 + }, + { + "epoch": 0.39, + "learning_rate": 1.3965329067310372e-05, + "loss": 0.4776, + "step": 3405 + }, + { + "epoch": 0.39, + "learning_rate": 1.3961930602617345e-05, + "loss": 0.4751, + "step": 3406 + }, + { + "epoch": 0.39, + "learning_rate": 1.3958531595051367e-05, + "loss": 0.4603, + "step": 3407 + }, + { + "epoch": 0.39, + "learning_rate": 1.395513204507818e-05, + "loss": 0.4653, + "step": 3408 + }, + { + "epoch": 0.39, + "learning_rate": 1.3951731953163606e-05, + "loss": 0.4896, + "step": 3409 + }, + { 
+ "epoch": 0.39, + "learning_rate": 1.3948331319773525e-05, + "loss": 0.4479, + "step": 3410 + }, + { + "epoch": 0.39, + "learning_rate": 1.3944930145373903e-05, + "loss": 0.47, + "step": 3411 + }, + { + "epoch": 0.39, + "learning_rate": 1.3941528430430773e-05, + "loss": 0.4531, + "step": 3412 + }, + { + "epoch": 0.39, + "learning_rate": 1.393812617541025e-05, + "loss": 0.4815, + "step": 3413 + }, + { + "epoch": 0.39, + "learning_rate": 1.3934723380778517e-05, + "loss": 0.4752, + "step": 3414 + }, + { + "epoch": 0.39, + "learning_rate": 1.3931320047001838e-05, + "loss": 0.4631, + "step": 3415 + }, + { + "epoch": 0.39, + "learning_rate": 1.3927916174546536e-05, + "loss": 0.4979, + "step": 3416 + }, + { + "epoch": 0.39, + "learning_rate": 1.3924511763879025e-05, + "loss": 0.4603, + "step": 3417 + }, + { + "epoch": 0.39, + "learning_rate": 1.3921106815465782e-05, + "loss": 0.4488, + "step": 3418 + }, + { + "epoch": 0.39, + "learning_rate": 1.3917701329773364e-05, + "loss": 0.4639, + "step": 3419 + }, + { + "epoch": 0.39, + "learning_rate": 1.3914295307268396e-05, + "loss": 0.49, + "step": 3420 + }, + { + "epoch": 0.39, + "learning_rate": 1.3910888748417577e-05, + "loss": 0.4681, + "step": 3421 + }, + { + "epoch": 0.39, + "learning_rate": 1.3907481653687687e-05, + "loss": 0.4684, + "step": 3422 + }, + { + "epoch": 0.39, + "learning_rate": 1.3904074023545566e-05, + "loss": 0.4867, + "step": 3423 + }, + { + "epoch": 0.39, + "learning_rate": 1.390066585845815e-05, + "loss": 0.4704, + "step": 3424 + }, + { + "epoch": 0.39, + "learning_rate": 1.389725715889242e-05, + "loss": 0.4487, + "step": 3425 + }, + { + "epoch": 0.39, + "learning_rate": 1.3893847925315447e-05, + "loss": 0.4687, + "step": 3426 + }, + { + "epoch": 0.39, + "learning_rate": 1.3890438158194374e-05, + "loss": 0.4856, + "step": 3427 + }, + { + "epoch": 0.39, + "learning_rate": 1.3887027857996416e-05, + "loss": 0.4726, + "step": 3428 + }, + { + "epoch": 0.39, + "learning_rate": 1.3883617025188858e-05, + 
"loss": 0.4639, + "step": 3429 + }, + { + "epoch": 0.39, + "learning_rate": 1.3880205660239062e-05, + "loss": 0.4716, + "step": 3430 + }, + { + "epoch": 0.39, + "learning_rate": 1.387679376361446e-05, + "loss": 0.4622, + "step": 3431 + }, + { + "epoch": 0.39, + "learning_rate": 1.3873381335782559e-05, + "loss": 0.4813, + "step": 3432 + }, + { + "epoch": 0.39, + "learning_rate": 1.3869968377210936e-05, + "loss": 0.4577, + "step": 3433 + }, + { + "epoch": 0.39, + "learning_rate": 1.3866554888367243e-05, + "loss": 0.4814, + "step": 3434 + }, + { + "epoch": 0.39, + "learning_rate": 1.3863140869719207e-05, + "loss": 0.4519, + "step": 3435 + }, + { + "epoch": 0.39, + "learning_rate": 1.3859726321734623e-05, + "loss": 0.4714, + "step": 3436 + }, + { + "epoch": 0.39, + "learning_rate": 1.385631124488136e-05, + "loss": 0.4725, + "step": 3437 + }, + { + "epoch": 0.39, + "learning_rate": 1.3852895639627357e-05, + "loss": 0.476, + "step": 3438 + }, + { + "epoch": 0.39, + "learning_rate": 1.3849479506440633e-05, + "loss": 0.4709, + "step": 3439 + }, + { + "epoch": 0.39, + "learning_rate": 1.3846062845789275e-05, + "loss": 0.4698, + "step": 3440 + }, + { + "epoch": 0.39, + "learning_rate": 1.3842645658141436e-05, + "loss": 0.4537, + "step": 3441 + }, + { + "epoch": 0.39, + "learning_rate": 1.383922794396535e-05, + "loss": 0.4755, + "step": 3442 + }, + { + "epoch": 0.39, + "learning_rate": 1.3835809703729322e-05, + "loss": 0.4526, + "step": 3443 + }, + { + "epoch": 0.39, + "learning_rate": 1.3832390937901723e-05, + "loss": 0.4573, + "step": 3444 + }, + { + "epoch": 0.39, + "learning_rate": 1.3828971646951005e-05, + "loss": 0.4754, + "step": 3445 + }, + { + "epoch": 0.39, + "learning_rate": 1.3825551831345685e-05, + "loss": 0.4687, + "step": 3446 + }, + { + "epoch": 0.39, + "learning_rate": 1.3822131491554355e-05, + "loss": 0.4574, + "step": 3447 + }, + { + "epoch": 0.39, + "learning_rate": 1.3818710628045677e-05, + "loss": 0.4768, + "step": 3448 + }, + { + "epoch": 0.39, + 
"learning_rate": 1.3815289241288383e-05, + "loss": 0.473, + "step": 3449 + }, + { + "epoch": 0.39, + "learning_rate": 1.3811867331751286e-05, + "loss": 0.4808, + "step": 3450 + }, + { + "epoch": 0.39, + "learning_rate": 1.380844489990326e-05, + "loss": 0.4514, + "step": 3451 + }, + { + "epoch": 0.39, + "learning_rate": 1.3805021946213251e-05, + "loss": 0.4766, + "step": 3452 + }, + { + "epoch": 0.39, + "learning_rate": 1.3801598471150286e-05, + "loss": 0.4523, + "step": 3453 + }, + { + "epoch": 0.39, + "learning_rate": 1.3798174475183457e-05, + "loss": 0.4715, + "step": 3454 + }, + { + "epoch": 0.39, + "learning_rate": 1.3794749958781924e-05, + "loss": 0.4447, + "step": 3455 + }, + { + "epoch": 0.39, + "learning_rate": 1.3791324922414924e-05, + "loss": 0.471, + "step": 3456 + }, + { + "epoch": 0.4, + "learning_rate": 1.3787899366551764e-05, + "loss": 0.4725, + "step": 3457 + }, + { + "epoch": 0.4, + "learning_rate": 1.3784473291661824e-05, + "loss": 0.4766, + "step": 3458 + }, + { + "epoch": 0.4, + "learning_rate": 1.3781046698214549e-05, + "loss": 0.4553, + "step": 3459 + }, + { + "epoch": 0.4, + "learning_rate": 1.3777619586679458e-05, + "loss": 0.4858, + "step": 3460 + }, + { + "epoch": 0.4, + "learning_rate": 1.3774191957526144e-05, + "loss": 0.4454, + "step": 3461 + }, + { + "epoch": 0.4, + "learning_rate": 1.3770763811224273e-05, + "loss": 0.4684, + "step": 3462 + }, + { + "epoch": 0.4, + "learning_rate": 1.376733514824357e-05, + "loss": 0.4524, + "step": 3463 + }, + { + "epoch": 0.4, + "learning_rate": 1.3763905969053841e-05, + "loss": 0.4756, + "step": 3464 + }, + { + "epoch": 0.4, + "learning_rate": 1.376047627412496e-05, + "loss": 0.4817, + "step": 3465 + }, + { + "epoch": 0.4, + "learning_rate": 1.3757046063926876e-05, + "loss": 0.4717, + "step": 3466 + }, + { + "epoch": 0.4, + "learning_rate": 1.3753615338929598e-05, + "loss": 0.4687, + "step": 3467 + }, + { + "epoch": 0.4, + "learning_rate": 1.3750184099603216e-05, + "loss": 0.4765, + "step": 3468 + }, 
+ { + "epoch": 0.4, + "learning_rate": 1.3746752346417884e-05, + "loss": 0.4669, + "step": 3469 + }, + { + "epoch": 0.4, + "learning_rate": 1.3743320079843828e-05, + "loss": 0.4586, + "step": 3470 + }, + { + "epoch": 0.4, + "learning_rate": 1.3739887300351349e-05, + "loss": 0.4725, + "step": 3471 + }, + { + "epoch": 0.4, + "learning_rate": 1.3736454008410816e-05, + "loss": 0.4761, + "step": 3472 + }, + { + "epoch": 0.4, + "learning_rate": 1.373302020449266e-05, + "loss": 0.4666, + "step": 3473 + }, + { + "epoch": 0.4, + "learning_rate": 1.3729585889067391e-05, + "loss": 0.4902, + "step": 3474 + }, + { + "epoch": 0.4, + "learning_rate": 1.3726151062605588e-05, + "loss": 0.461, + "step": 3475 + }, + { + "epoch": 0.4, + "learning_rate": 1.3722715725577902e-05, + "loss": 0.4729, + "step": 3476 + }, + { + "epoch": 0.4, + "learning_rate": 1.3719279878455046e-05, + "loss": 0.4725, + "step": 3477 + }, + { + "epoch": 0.4, + "learning_rate": 1.3715843521707805e-05, + "loss": 0.4572, + "step": 3478 + }, + { + "epoch": 0.4, + "learning_rate": 1.3712406655807047e-05, + "loss": 0.4482, + "step": 3479 + }, + { + "epoch": 0.4, + "learning_rate": 1.3708969281223687e-05, + "loss": 0.4834, + "step": 3480 + }, + { + "epoch": 0.4, + "learning_rate": 1.3705531398428736e-05, + "loss": 0.4743, + "step": 3481 + }, + { + "epoch": 0.4, + "learning_rate": 1.3702093007893249e-05, + "loss": 0.4896, + "step": 3482 + }, + { + "epoch": 0.4, + "learning_rate": 1.3698654110088365e-05, + "loss": 0.4706, + "step": 3483 + }, + { + "epoch": 0.4, + "learning_rate": 1.3695214705485294e-05, + "loss": 0.4695, + "step": 3484 + }, + { + "epoch": 0.4, + "learning_rate": 1.3691774794555306e-05, + "loss": 0.4616, + "step": 3485 + }, + { + "epoch": 0.4, + "learning_rate": 1.368833437776975e-05, + "loss": 0.4823, + "step": 3486 + }, + { + "epoch": 0.4, + "learning_rate": 1.3684893455600036e-05, + "loss": 0.4539, + "step": 3487 + }, + { + "epoch": 0.4, + "learning_rate": 1.368145202851765e-05, + "loss": 0.4737, + 
"step": 3488 + }, + { + "epoch": 0.4, + "learning_rate": 1.3678010096994143e-05, + "loss": 0.4613, + "step": 3489 + }, + { + "epoch": 0.4, + "learning_rate": 1.3674567661501138e-05, + "loss": 0.485, + "step": 3490 + }, + { + "epoch": 0.4, + "learning_rate": 1.3671124722510325e-05, + "loss": 0.4766, + "step": 3491 + }, + { + "epoch": 0.4, + "learning_rate": 1.366768128049346e-05, + "loss": 0.4763, + "step": 3492 + }, + { + "epoch": 0.4, + "learning_rate": 1.3664237335922377e-05, + "loss": 0.4588, + "step": 3493 + }, + { + "epoch": 0.4, + "learning_rate": 1.3660792889268967e-05, + "loss": 0.4883, + "step": 3494 + }, + { + "epoch": 0.4, + "learning_rate": 1.3657347941005204e-05, + "loss": 0.4671, + "step": 3495 + }, + { + "epoch": 0.4, + "learning_rate": 1.3653902491603117e-05, + "loss": 0.464, + "step": 3496 + }, + { + "epoch": 0.4, + "learning_rate": 1.3650456541534811e-05, + "loss": 0.4619, + "step": 3497 + }, + { + "epoch": 0.4, + "learning_rate": 1.3647010091272456e-05, + "loss": 0.4806, + "step": 3498 + }, + { + "epoch": 0.4, + "learning_rate": 1.3643563141288297e-05, + "loss": 0.4832, + "step": 3499 + }, + { + "epoch": 0.4, + "learning_rate": 1.364011569205464e-05, + "loss": 0.476, + "step": 3500 + }, + { + "epoch": 0.4, + "learning_rate": 1.3636667744043864e-05, + "loss": 0.4768, + "step": 3501 + }, + { + "epoch": 0.4, + "learning_rate": 1.3633219297728415e-05, + "loss": 0.4722, + "step": 3502 + }, + { + "epoch": 0.4, + "learning_rate": 1.3629770353580804e-05, + "loss": 0.4721, + "step": 3503 + }, + { + "epoch": 0.4, + "learning_rate": 1.3626320912073616e-05, + "loss": 0.4715, + "step": 3504 + }, + { + "epoch": 0.4, + "learning_rate": 1.3622870973679503e-05, + "loss": 0.4711, + "step": 3505 + }, + { + "epoch": 0.4, + "learning_rate": 1.361942053887118e-05, + "loss": 0.4722, + "step": 3506 + }, + { + "epoch": 0.4, + "learning_rate": 1.3615969608121438e-05, + "loss": 0.4726, + "step": 3507 + }, + { + "epoch": 0.4, + "learning_rate": 1.3612518181903127e-05, + 
"loss": 0.465, + "step": 3508 + }, + { + "epoch": 0.4, + "learning_rate": 1.360906626068917e-05, + "loss": 0.4693, + "step": 3509 + }, + { + "epoch": 0.4, + "learning_rate": 1.3605613844952561e-05, + "loss": 0.4564, + "step": 3510 + }, + { + "epoch": 0.4, + "learning_rate": 1.3602160935166357e-05, + "loss": 0.4639, + "step": 3511 + }, + { + "epoch": 0.4, + "learning_rate": 1.359870753180368e-05, + "loss": 0.4704, + "step": 3512 + }, + { + "epoch": 0.4, + "learning_rate": 1.3595253635337724e-05, + "loss": 0.4511, + "step": 3513 + }, + { + "epoch": 0.4, + "learning_rate": 1.3591799246241753e-05, + "loss": 0.4738, + "step": 3514 + }, + { + "epoch": 0.4, + "learning_rate": 1.3588344364989096e-05, + "loss": 0.478, + "step": 3515 + }, + { + "epoch": 0.4, + "learning_rate": 1.3584888992053146e-05, + "loss": 0.4549, + "step": 3516 + }, + { + "epoch": 0.4, + "learning_rate": 1.3581433127907366e-05, + "loss": 0.4767, + "step": 3517 + }, + { + "epoch": 0.4, + "learning_rate": 1.357797677302529e-05, + "loss": 0.4492, + "step": 3518 + }, + { + "epoch": 0.4, + "learning_rate": 1.3574519927880511e-05, + "loss": 0.4672, + "step": 3519 + }, + { + "epoch": 0.4, + "learning_rate": 1.3571062592946703e-05, + "loss": 0.473, + "step": 3520 + }, + { + "epoch": 0.4, + "learning_rate": 1.3567604768697585e-05, + "loss": 0.4583, + "step": 3521 + }, + { + "epoch": 0.4, + "learning_rate": 1.3564146455606961e-05, + "loss": 0.4575, + "step": 3522 + }, + { + "epoch": 0.4, + "learning_rate": 1.3560687654148703e-05, + "loss": 0.4642, + "step": 3523 + }, + { + "epoch": 0.4, + "learning_rate": 1.3557228364796742e-05, + "loss": 0.4688, + "step": 3524 + }, + { + "epoch": 0.4, + "learning_rate": 1.3553768588025073e-05, + "loss": 0.4582, + "step": 3525 + }, + { + "epoch": 0.4, + "learning_rate": 1.3550308324307767e-05, + "loss": 0.4701, + "step": 3526 + }, + { + "epoch": 0.4, + "learning_rate": 1.3546847574118951e-05, + "loss": 0.4618, + "step": 3527 + }, + { + "epoch": 0.4, + "learning_rate": 
1.3543386337932834e-05, + "loss": 0.4574, + "step": 3528 + }, + { + "epoch": 0.4, + "learning_rate": 1.3539924616223679e-05, + "loss": 0.4567, + "step": 3529 + }, + { + "epoch": 0.4, + "learning_rate": 1.3536462409465816e-05, + "loss": 0.4813, + "step": 3530 + }, + { + "epoch": 0.4, + "learning_rate": 1.3532999718133648e-05, + "loss": 0.4813, + "step": 3531 + }, + { + "epoch": 0.4, + "learning_rate": 1.3529536542701638e-05, + "loss": 0.4799, + "step": 3532 + }, + { + "epoch": 0.4, + "learning_rate": 1.3526072883644326e-05, + "loss": 0.4817, + "step": 3533 + }, + { + "epoch": 0.4, + "learning_rate": 1.3522608741436303e-05, + "loss": 0.4664, + "step": 3534 + }, + { + "epoch": 0.4, + "learning_rate": 1.3519144116552236e-05, + "loss": 0.4972, + "step": 3535 + }, + { + "epoch": 0.4, + "learning_rate": 1.3515679009466856e-05, + "loss": 0.4505, + "step": 3536 + }, + { + "epoch": 0.4, + "learning_rate": 1.3512213420654959e-05, + "loss": 0.4666, + "step": 3537 + }, + { + "epoch": 0.4, + "learning_rate": 1.350874735059141e-05, + "loss": 0.4746, + "step": 3538 + }, + { + "epoch": 0.4, + "learning_rate": 1.3505280799751134e-05, + "loss": 0.4741, + "step": 3539 + }, + { + "epoch": 0.4, + "learning_rate": 1.3501813768609134e-05, + "loss": 0.4802, + "step": 3540 + }, + { + "epoch": 0.4, + "learning_rate": 1.3498346257640461e-05, + "loss": 0.4697, + "step": 3541 + }, + { + "epoch": 0.4, + "learning_rate": 1.349487826732025e-05, + "loss": 0.4593, + "step": 3542 + }, + { + "epoch": 0.4, + "learning_rate": 1.3491409798123687e-05, + "loss": 0.4985, + "step": 3543 + }, + { + "epoch": 0.41, + "learning_rate": 1.3487940850526033e-05, + "loss": 0.4663, + "step": 3544 + }, + { + "epoch": 0.41, + "learning_rate": 1.348447142500261e-05, + "loss": 0.4558, + "step": 3545 + }, + { + "epoch": 0.41, + "learning_rate": 1.3481001522028807e-05, + "loss": 0.4566, + "step": 3546 + }, + { + "epoch": 0.41, + "learning_rate": 1.3477531142080076e-05, + "loss": 0.4626, + "step": 3547 + }, + { + "epoch": 
0.41, + "learning_rate": 1.347406028563194e-05, + "loss": 0.487, + "step": 3548 + }, + { + "epoch": 0.41, + "learning_rate": 1.3470588953159982e-05, + "loss": 0.4712, + "step": 3549 + }, + { + "epoch": 0.41, + "learning_rate": 1.3467117145139854e-05, + "loss": 0.4427, + "step": 3550 + }, + { + "epoch": 0.41, + "learning_rate": 1.3463644862047267e-05, + "loss": 0.4693, + "step": 3551 + }, + { + "epoch": 0.41, + "learning_rate": 1.3460172104358007e-05, + "loss": 0.4581, + "step": 3552 + }, + { + "epoch": 0.41, + "learning_rate": 1.3456698872547915e-05, + "loss": 0.4775, + "step": 3553 + }, + { + "epoch": 0.41, + "learning_rate": 1.3453225167092902e-05, + "loss": 0.4602, + "step": 3554 + }, + { + "epoch": 0.41, + "learning_rate": 1.3449750988468943e-05, + "loss": 0.4855, + "step": 3555 + }, + { + "epoch": 0.41, + "learning_rate": 1.344627633715208e-05, + "loss": 0.4711, + "step": 3556 + }, + { + "epoch": 0.41, + "learning_rate": 1.3442801213618417e-05, + "loss": 0.4648, + "step": 3557 + }, + { + "epoch": 0.41, + "learning_rate": 1.3439325618344123e-05, + "loss": 0.4784, + "step": 3558 + }, + { + "epoch": 0.41, + "learning_rate": 1.3435849551805436e-05, + "loss": 0.4907, + "step": 3559 + }, + { + "epoch": 0.41, + "learning_rate": 1.3432373014478644e-05, + "loss": 0.4504, + "step": 3560 + }, + { + "epoch": 0.41, + "learning_rate": 1.3428896006840122e-05, + "loss": 0.4856, + "step": 3561 + }, + { + "epoch": 0.41, + "learning_rate": 1.3425418529366293e-05, + "loss": 0.4712, + "step": 3562 + }, + { + "epoch": 0.41, + "learning_rate": 1.3421940582533645e-05, + "loss": 0.4771, + "step": 3563 + }, + { + "epoch": 0.41, + "learning_rate": 1.3418462166818743e-05, + "loss": 0.4686, + "step": 3564 + }, + { + "epoch": 0.41, + "learning_rate": 1.34149832826982e-05, + "loss": 0.4561, + "step": 3565 + }, + { + "epoch": 0.41, + "learning_rate": 1.3411503930648704e-05, + "loss": 0.4762, + "step": 3566 + }, + { + "epoch": 0.41, + "learning_rate": 1.3408024111147004e-05, + "loss": 0.477, 
+ "step": 3567 + }, + { + "epoch": 0.41, + "learning_rate": 1.3404543824669915e-05, + "loss": 0.4383, + "step": 3568 + }, + { + "epoch": 0.41, + "learning_rate": 1.3401063071694309e-05, + "loss": 0.4824, + "step": 3569 + }, + { + "epoch": 0.41, + "learning_rate": 1.3397581852697128e-05, + "loss": 0.4664, + "step": 3570 + }, + { + "epoch": 0.41, + "learning_rate": 1.3394100168155382e-05, + "loss": 0.4723, + "step": 3571 + }, + { + "epoch": 0.41, + "learning_rate": 1.3390618018546135e-05, + "loss": 0.4462, + "step": 3572 + }, + { + "epoch": 0.41, + "learning_rate": 1.3387135404346519e-05, + "loss": 0.4555, + "step": 3573 + }, + { + "epoch": 0.41, + "learning_rate": 1.338365232603373e-05, + "loss": 0.4666, + "step": 3574 + }, + { + "epoch": 0.41, + "learning_rate": 1.3380168784085028e-05, + "loss": 0.4895, + "step": 3575 + }, + { + "epoch": 0.41, + "learning_rate": 1.3376684778977738e-05, + "loss": 0.4813, + "step": 3576 + }, + { + "epoch": 0.41, + "learning_rate": 1.3373200311189245e-05, + "loss": 0.485, + "step": 3577 + }, + { + "epoch": 0.41, + "learning_rate": 1.3369715381197e-05, + "loss": 0.4799, + "step": 3578 + }, + { + "epoch": 0.41, + "learning_rate": 1.336622998947851e-05, + "loss": 0.4667, + "step": 3579 + }, + { + "epoch": 0.41, + "learning_rate": 1.336274413651136e-05, + "loss": 0.4623, + "step": 3580 + }, + { + "epoch": 0.41, + "learning_rate": 1.3359257822773187e-05, + "loss": 0.4914, + "step": 3581 + }, + { + "epoch": 0.41, + "learning_rate": 1.3355771048741692e-05, + "loss": 0.4591, + "step": 3582 + }, + { + "epoch": 0.41, + "learning_rate": 1.335228381489464e-05, + "loss": 0.4642, + "step": 3583 + }, + { + "epoch": 0.41, + "learning_rate": 1.3348796121709862e-05, + "loss": 0.4682, + "step": 3584 + }, + { + "epoch": 0.41, + "learning_rate": 1.3345307969665252e-05, + "loss": 0.4726, + "step": 3585 + }, + { + "epoch": 0.41, + "learning_rate": 1.3341819359238762e-05, + "loss": 0.4532, + "step": 3586 + }, + { + "epoch": 0.41, + "learning_rate": 
1.3338330290908408e-05, + "loss": 0.4763, + "step": 3587 + }, + { + "epoch": 0.41, + "learning_rate": 1.3334840765152272e-05, + "loss": 0.4579, + "step": 3588 + }, + { + "epoch": 0.41, + "learning_rate": 1.3331350782448495e-05, + "loss": 0.4719, + "step": 3589 + }, + { + "epoch": 0.41, + "learning_rate": 1.332786034327529e-05, + "loss": 0.4579, + "step": 3590 + }, + { + "epoch": 0.41, + "learning_rate": 1.3324369448110916e-05, + "loss": 0.4598, + "step": 3591 + }, + { + "epoch": 0.41, + "learning_rate": 1.3320878097433707e-05, + "loss": 0.4768, + "step": 3592 + }, + { + "epoch": 0.41, + "learning_rate": 1.331738629172206e-05, + "loss": 0.4901, + "step": 3593 + }, + { + "epoch": 0.41, + "learning_rate": 1.3313894031454421e-05, + "loss": 0.4635, + "step": 3594 + }, + { + "epoch": 0.41, + "learning_rate": 1.3310401317109316e-05, + "loss": 0.4515, + "step": 3595 + }, + { + "epoch": 0.41, + "learning_rate": 1.330690814916532e-05, + "loss": 0.4682, + "step": 3596 + }, + { + "epoch": 0.41, + "learning_rate": 1.330341452810108e-05, + "loss": 0.4734, + "step": 3597 + }, + { + "epoch": 0.41, + "learning_rate": 1.3299920454395296e-05, + "loss": 0.4524, + "step": 3598 + }, + { + "epoch": 0.41, + "learning_rate": 1.3296425928526735e-05, + "loss": 0.4722, + "step": 3599 + }, + { + "epoch": 0.41, + "learning_rate": 1.3292930950974223e-05, + "loss": 0.4714, + "step": 3600 + }, + { + "epoch": 0.41, + "learning_rate": 1.3289435522216657e-05, + "loss": 0.4808, + "step": 3601 + }, + { + "epoch": 0.41, + "learning_rate": 1.3285939642732979e-05, + "loss": 0.4508, + "step": 3602 + }, + { + "epoch": 0.41, + "learning_rate": 1.3282443313002209e-05, + "loss": 0.5019, + "step": 3603 + }, + { + "epoch": 0.41, + "learning_rate": 1.3278946533503422e-05, + "loss": 0.4552, + "step": 3604 + }, + { + "epoch": 0.41, + "learning_rate": 1.3275449304715753e-05, + "loss": 0.4557, + "step": 3605 + }, + { + "epoch": 0.41, + "learning_rate": 1.3271951627118402e-05, + "loss": 0.4747, + "step": 3606 + }, + { 
+ "epoch": 0.41, + "learning_rate": 1.3268453501190628e-05, + "loss": 0.4785, + "step": 3607 + }, + { + "epoch": 0.41, + "learning_rate": 1.3264954927411751e-05, + "loss": 0.4874, + "step": 3608 + }, + { + "epoch": 0.41, + "learning_rate": 1.3261455906261154e-05, + "loss": 0.4692, + "step": 3609 + }, + { + "epoch": 0.41, + "learning_rate": 1.3257956438218283e-05, + "loss": 0.472, + "step": 3610 + }, + { + "epoch": 0.41, + "learning_rate": 1.3254456523762643e-05, + "loss": 0.4603, + "step": 3611 + }, + { + "epoch": 0.41, + "learning_rate": 1.3250956163373801e-05, + "loss": 0.476, + "step": 3612 + }, + { + "epoch": 0.41, + "learning_rate": 1.324745535753138e-05, + "loss": 0.469, + "step": 3613 + }, + { + "epoch": 0.41, + "learning_rate": 1.3243954106715074e-05, + "loss": 0.4551, + "step": 3614 + }, + { + "epoch": 0.41, + "learning_rate": 1.3240452411404628e-05, + "loss": 0.4715, + "step": 3615 + }, + { + "epoch": 0.41, + "learning_rate": 1.3236950272079858e-05, + "loss": 0.4817, + "step": 3616 + }, + { + "epoch": 0.41, + "learning_rate": 1.3233447689220629e-05, + "loss": 0.4714, + "step": 3617 + }, + { + "epoch": 0.41, + "learning_rate": 1.3229944663306877e-05, + "loss": 0.4745, + "step": 3618 + }, + { + "epoch": 0.41, + "learning_rate": 1.3226441194818596e-05, + "loss": 0.4591, + "step": 3619 + }, + { + "epoch": 0.41, + "learning_rate": 1.3222937284235835e-05, + "loss": 0.4703, + "step": 3620 + }, + { + "epoch": 0.41, + "learning_rate": 1.3219432932038712e-05, + "loss": 0.4754, + "step": 3621 + }, + { + "epoch": 0.41, + "learning_rate": 1.3215928138707396e-05, + "loss": 0.4626, + "step": 3622 + }, + { + "epoch": 0.41, + "learning_rate": 1.321242290472213e-05, + "loss": 0.4779, + "step": 3623 + }, + { + "epoch": 0.41, + "learning_rate": 1.3208917230563201e-05, + "loss": 0.4725, + "step": 3624 + }, + { + "epoch": 0.41, + "learning_rate": 1.3205411116710973e-05, + "loss": 0.4656, + "step": 3625 + }, + { + "epoch": 0.41, + "learning_rate": 1.3201904563645853e-05, + 
"loss": 0.486, + "step": 3626 + }, + { + "epoch": 0.41, + "learning_rate": 1.3198397571848323e-05, + "loss": 0.4626, + "step": 3627 + }, + { + "epoch": 0.41, + "learning_rate": 1.319489014179892e-05, + "loss": 0.4682, + "step": 3628 + }, + { + "epoch": 0.41, + "learning_rate": 1.3191382273978237e-05, + "loss": 0.4631, + "step": 3629 + }, + { + "epoch": 0.41, + "learning_rate": 1.3187873968866928e-05, + "loss": 0.4509, + "step": 3630 + }, + { + "epoch": 0.41, + "learning_rate": 1.3184365226945715e-05, + "loss": 0.4576, + "step": 3631 + }, + { + "epoch": 0.42, + "learning_rate": 1.318085604869537e-05, + "loss": 0.4649, + "step": 3632 + }, + { + "epoch": 0.42, + "learning_rate": 1.3177346434596734e-05, + "loss": 0.4934, + "step": 3633 + }, + { + "epoch": 0.42, + "learning_rate": 1.3173836385130693e-05, + "loss": 0.4483, + "step": 3634 + }, + { + "epoch": 0.42, + "learning_rate": 1.3170325900778211e-05, + "loss": 0.4704, + "step": 3635 + }, + { + "epoch": 0.42, + "learning_rate": 1.3166814982020298e-05, + "loss": 0.4706, + "step": 3636 + }, + { + "epoch": 0.42, + "learning_rate": 1.3163303629338029e-05, + "loss": 0.4671, + "step": 3637 + }, + { + "epoch": 0.42, + "learning_rate": 1.3159791843212542e-05, + "loss": 0.4487, + "step": 3638 + }, + { + "epoch": 0.42, + "learning_rate": 1.3156279624125023e-05, + "loss": 0.4742, + "step": 3639 + }, + { + "epoch": 0.42, + "learning_rate": 1.3152766972556727e-05, + "loss": 0.4657, + "step": 3640 + }, + { + "epoch": 0.42, + "learning_rate": 1.3149253888988967e-05, + "loss": 0.4887, + "step": 3641 + }, + { + "epoch": 0.42, + "learning_rate": 1.3145740373903118e-05, + "loss": 0.4618, + "step": 3642 + }, + { + "epoch": 0.42, + "learning_rate": 1.31422264277806e-05, + "loss": 0.4543, + "step": 3643 + }, + { + "epoch": 0.42, + "learning_rate": 1.3138712051102908e-05, + "loss": 0.4653, + "step": 3644 + }, + { + "epoch": 0.42, + "learning_rate": 1.3135197244351595e-05, + "loss": 0.4668, + "step": 3645 + }, + { + "epoch": 0.42, + 
"learning_rate": 1.3131682008008255e-05, + "loss": 0.4705, + "step": 3646 + }, + { + "epoch": 0.42, + "learning_rate": 1.3128166342554567e-05, + "loss": 0.4697, + "step": 3647 + }, + { + "epoch": 0.42, + "learning_rate": 1.3124650248472248e-05, + "loss": 0.4545, + "step": 3648 + }, + { + "epoch": 0.42, + "learning_rate": 1.3121133726243083e-05, + "loss": 0.4855, + "step": 3649 + }, + { + "epoch": 0.42, + "learning_rate": 1.3117616776348915e-05, + "loss": 0.4607, + "step": 3650 + }, + { + "epoch": 0.42, + "learning_rate": 1.3114099399271646e-05, + "loss": 0.4721, + "step": 3651 + }, + { + "epoch": 0.42, + "learning_rate": 1.311058159549323e-05, + "loss": 0.4476, + "step": 3652 + }, + { + "epoch": 0.42, + "learning_rate": 1.3107063365495692e-05, + "loss": 0.4616, + "step": 3653 + }, + { + "epoch": 0.42, + "learning_rate": 1.31035447097611e-05, + "loss": 0.454, + "step": 3654 + }, + { + "epoch": 0.42, + "learning_rate": 1.3100025628771595e-05, + "loss": 0.4719, + "step": 3655 + }, + { + "epoch": 0.42, + "learning_rate": 1.3096506123009368e-05, + "loss": 0.4565, + "step": 3656 + }, + { + "epoch": 0.42, + "learning_rate": 1.3092986192956665e-05, + "loss": 0.478, + "step": 3657 + }, + { + "epoch": 0.42, + "learning_rate": 1.3089465839095803e-05, + "loss": 0.4596, + "step": 3658 + }, + { + "epoch": 0.42, + "learning_rate": 1.3085945061909144e-05, + "loss": 0.4778, + "step": 3659 + }, + { + "epoch": 0.42, + "learning_rate": 1.3082423861879114e-05, + "loss": 0.4706, + "step": 3660 + }, + { + "epoch": 0.42, + "learning_rate": 1.3078902239488196e-05, + "loss": 0.4497, + "step": 3661 + }, + { + "epoch": 0.42, + "learning_rate": 1.3075380195218931e-05, + "loss": 0.4668, + "step": 3662 + }, + { + "epoch": 0.42, + "learning_rate": 1.3071857729553918e-05, + "loss": 0.469, + "step": 3663 + }, + { + "epoch": 0.42, + "learning_rate": 1.3068334842975813e-05, + "loss": 0.4776, + "step": 3664 + }, + { + "epoch": 0.42, + "learning_rate": 1.306481153596733e-05, + "loss": 0.4726, + "step": 
3665 + }, + { + "epoch": 0.42, + "learning_rate": 1.3061287809011243e-05, + "loss": 0.4661, + "step": 3666 + }, + { + "epoch": 0.42, + "learning_rate": 1.3057763662590377e-05, + "loss": 0.4822, + "step": 3667 + }, + { + "epoch": 0.42, + "learning_rate": 1.3054239097187625e-05, + "loss": 0.4588, + "step": 3668 + }, + { + "epoch": 0.42, + "learning_rate": 1.3050714113285922e-05, + "loss": 0.4591, + "step": 3669 + }, + { + "epoch": 0.42, + "learning_rate": 1.3047188711368278e-05, + "loss": 0.4658, + "step": 3670 + }, + { + "epoch": 0.42, + "learning_rate": 1.3043662891917748e-05, + "loss": 0.4681, + "step": 3671 + }, + { + "epoch": 0.42, + "learning_rate": 1.3040136655417448e-05, + "loss": 0.4674, + "step": 3672 + }, + { + "epoch": 0.42, + "learning_rate": 1.303661000235055e-05, + "loss": 0.4597, + "step": 3673 + }, + { + "epoch": 0.42, + "learning_rate": 1.3033082933200287e-05, + "loss": 0.4537, + "step": 3674 + }, + { + "epoch": 0.42, + "learning_rate": 1.3029555448449947e-05, + "loss": 0.4705, + "step": 3675 + }, + { + "epoch": 0.42, + "learning_rate": 1.302602754858287e-05, + "loss": 0.4701, + "step": 3676 + }, + { + "epoch": 0.42, + "learning_rate": 1.3022499234082463e-05, + "loss": 0.4851, + "step": 3677 + }, + { + "epoch": 0.42, + "learning_rate": 1.3018970505432176e-05, + "loss": 0.4751, + "step": 3678 + }, + { + "epoch": 0.42, + "learning_rate": 1.3015441363115526e-05, + "loss": 0.4522, + "step": 3679 + }, + { + "epoch": 0.42, + "learning_rate": 1.3011911807616091e-05, + "loss": 0.4703, + "step": 3680 + }, + { + "epoch": 0.42, + "learning_rate": 1.3008381839417493e-05, + "loss": 0.4713, + "step": 3681 + }, + { + "epoch": 0.42, + "learning_rate": 1.3004851459003416e-05, + "loss": 0.4806, + "step": 3682 + }, + { + "epoch": 0.42, + "learning_rate": 1.30013206668576e-05, + "loss": 0.4972, + "step": 3683 + }, + { + "epoch": 0.42, + "learning_rate": 1.2997789463463848e-05, + "loss": 0.4535, + "step": 3684 + }, + { + "epoch": 0.42, + "learning_rate": 
1.2994257849306009e-05, + "loss": 0.479, + "step": 3685 + }, + { + "epoch": 0.42, + "learning_rate": 1.2990725824867995e-05, + "loss": 0.4595, + "step": 3686 + }, + { + "epoch": 0.42, + "learning_rate": 1.2987193390633773e-05, + "loss": 0.464, + "step": 3687 + }, + { + "epoch": 0.42, + "learning_rate": 1.298366054708736e-05, + "loss": 0.4592, + "step": 3688 + }, + { + "epoch": 0.42, + "learning_rate": 1.2980127294712839e-05, + "loss": 0.4798, + "step": 3689 + }, + { + "epoch": 0.42, + "learning_rate": 1.2976593633994347e-05, + "loss": 0.4443, + "step": 3690 + }, + { + "epoch": 0.42, + "learning_rate": 1.297305956541607e-05, + "loss": 0.4957, + "step": 3691 + }, + { + "epoch": 0.42, + "learning_rate": 1.2969525089462253e-05, + "loss": 0.4539, + "step": 3692 + }, + { + "epoch": 0.42, + "learning_rate": 1.2965990206617203e-05, + "loss": 0.4583, + "step": 3693 + }, + { + "epoch": 0.42, + "learning_rate": 1.2962454917365275e-05, + "loss": 0.4544, + "step": 3694 + }, + { + "epoch": 0.42, + "learning_rate": 1.2958919222190885e-05, + "loss": 0.4833, + "step": 3695 + }, + { + "epoch": 0.42, + "learning_rate": 1.2955383121578498e-05, + "loss": 0.469, + "step": 3696 + }, + { + "epoch": 0.42, + "learning_rate": 1.2951846616012642e-05, + "loss": 0.4562, + "step": 3697 + }, + { + "epoch": 0.42, + "learning_rate": 1.2948309705977893e-05, + "loss": 0.4555, + "step": 3698 + }, + { + "epoch": 0.42, + "learning_rate": 1.2944772391958896e-05, + "loss": 0.4657, + "step": 3699 + }, + { + "epoch": 0.42, + "learning_rate": 1.294123467444033e-05, + "loss": 0.4753, + "step": 3700 + }, + { + "epoch": 0.42, + "learning_rate": 1.2937696553906949e-05, + "loss": 0.4762, + "step": 3701 + }, + { + "epoch": 0.42, + "learning_rate": 1.2934158030843554e-05, + "loss": 0.45, + "step": 3702 + }, + { + "epoch": 0.42, + "learning_rate": 1.2930619105734999e-05, + "loss": 0.4803, + "step": 3703 + }, + { + "epoch": 0.42, + "learning_rate": 1.2927079779066196e-05, + "loss": 0.4837, + "step": 3704 + }, + { + 
"epoch": 0.42, + "learning_rate": 1.2923540051322114e-05, + "loss": 0.4546, + "step": 3705 + }, + { + "epoch": 0.42, + "learning_rate": 1.2919999922987775e-05, + "loss": 0.4731, + "step": 3706 + }, + { + "epoch": 0.42, + "learning_rate": 1.291645939454825e-05, + "loss": 0.4707, + "step": 3707 + }, + { + "epoch": 0.42, + "learning_rate": 1.2912918466488678e-05, + "loss": 0.4518, + "step": 3708 + }, + { + "epoch": 0.42, + "learning_rate": 1.2909377139294242e-05, + "loss": 0.4774, + "step": 3709 + }, + { + "epoch": 0.42, + "learning_rate": 1.290583541345018e-05, + "loss": 0.4678, + "step": 3710 + }, + { + "epoch": 0.42, + "learning_rate": 1.2902293289441791e-05, + "loss": 0.4693, + "step": 3711 + }, + { + "epoch": 0.42, + "learning_rate": 1.2898750767754427e-05, + "loss": 0.4519, + "step": 3712 + }, + { + "epoch": 0.42, + "learning_rate": 1.2895207848873488e-05, + "loss": 0.4656, + "step": 3713 + }, + { + "epoch": 0.42, + "learning_rate": 1.2891664533284434e-05, + "loss": 0.4719, + "step": 3714 + }, + { + "epoch": 0.42, + "learning_rate": 1.288812082147278e-05, + "loss": 0.4629, + "step": 3715 + }, + { + "epoch": 0.42, + "learning_rate": 1.2884576713924093e-05, + "loss": 0.4535, + "step": 3716 + }, + { + "epoch": 0.42, + "learning_rate": 1.2881032211123994e-05, + "loss": 0.4501, + "step": 3717 + }, + { + "epoch": 0.42, + "learning_rate": 1.2877487313558159e-05, + "loss": 0.4916, + "step": 3718 + }, + { + "epoch": 0.43, + "learning_rate": 1.287394202171232e-05, + "loss": 0.4555, + "step": 3719 + }, + { + "epoch": 0.43, + "learning_rate": 1.287039633607226e-05, + "loss": 0.4693, + "step": 3720 + }, + { + "epoch": 0.43, + "learning_rate": 1.2866850257123817e-05, + "loss": 0.4711, + "step": 3721 + }, + { + "epoch": 0.43, + "learning_rate": 1.2863303785352883e-05, + "loss": 0.4536, + "step": 3722 + }, + { + "epoch": 0.43, + "learning_rate": 1.2859756921245403e-05, + "loss": 0.4687, + "step": 3723 + }, + { + "epoch": 0.43, + "learning_rate": 1.2856209665287378e-05, + 
"loss": 0.4715, + "step": 3724 + }, + { + "epoch": 0.43, + "learning_rate": 1.2852662017964863e-05, + "loss": 0.4529, + "step": 3725 + }, + { + "epoch": 0.43, + "learning_rate": 1.2849113979763956e-05, + "loss": 0.4851, + "step": 3726 + }, + { + "epoch": 0.43, + "learning_rate": 1.2845565551170829e-05, + "loss": 0.4807, + "step": 3727 + }, + { + "epoch": 0.43, + "learning_rate": 1.2842016732671689e-05, + "loss": 0.456, + "step": 3728 + }, + { + "epoch": 0.43, + "learning_rate": 1.2838467524752808e-05, + "loss": 0.4591, + "step": 3729 + }, + { + "epoch": 0.43, + "learning_rate": 1.2834917927900504e-05, + "loss": 0.4645, + "step": 3730 + }, + { + "epoch": 0.43, + "learning_rate": 1.2831367942601146e-05, + "loss": 0.4533, + "step": 3731 + }, + { + "epoch": 0.43, + "learning_rate": 1.2827817569341167e-05, + "loss": 0.4886, + "step": 3732 + }, + { + "epoch": 0.43, + "learning_rate": 1.282426680860705e-05, + "loss": 0.4656, + "step": 3733 + }, + { + "epoch": 0.43, + "learning_rate": 1.2820715660885328e-05, + "loss": 0.4773, + "step": 3734 + }, + { + "epoch": 0.43, + "learning_rate": 1.2817164126662581e-05, + "loss": 0.4702, + "step": 3735 + }, + { + "epoch": 0.43, + "learning_rate": 1.281361220642545e-05, + "loss": 0.4791, + "step": 3736 + }, + { + "epoch": 0.43, + "learning_rate": 1.281005990066063e-05, + "loss": 0.4713, + "step": 3737 + }, + { + "epoch": 0.43, + "learning_rate": 1.280650720985487e-05, + "loss": 0.4736, + "step": 3738 + }, + { + "epoch": 0.43, + "learning_rate": 1.2802954134494963e-05, + "loss": 0.449, + "step": 3739 + }, + { + "epoch": 0.43, + "learning_rate": 1.2799400675067754e-05, + "loss": 0.4865, + "step": 3740 + }, + { + "epoch": 0.43, + "learning_rate": 1.2795846832060157e-05, + "loss": 0.4372, + "step": 3741 + }, + { + "epoch": 0.43, + "learning_rate": 1.2792292605959125e-05, + "loss": 0.4791, + "step": 3742 + }, + { + "epoch": 0.43, + "learning_rate": 1.2788737997251665e-05, + "loss": 0.4706, + "step": 3743 + }, + { + "epoch": 0.43, + 
"learning_rate": 1.2785183006424836e-05, + "loss": 0.4556, + "step": 3744 + }, + { + "epoch": 0.43, + "learning_rate": 1.278162763396575e-05, + "loss": 0.4523, + "step": 3745 + }, + { + "epoch": 0.43, + "learning_rate": 1.2778071880361577e-05, + "loss": 0.4699, + "step": 3746 + }, + { + "epoch": 0.43, + "learning_rate": 1.2774515746099536e-05, + "loss": 0.4654, + "step": 3747 + }, + { + "epoch": 0.43, + "learning_rate": 1.277095923166689e-05, + "loss": 0.4471, + "step": 3748 + }, + { + "epoch": 0.43, + "learning_rate": 1.2767402337550966e-05, + "loss": 0.4751, + "step": 3749 + }, + { + "epoch": 0.43, + "learning_rate": 1.2763845064239134e-05, + "loss": 0.4884, + "step": 3750 + }, + { + "epoch": 0.43, + "learning_rate": 1.2760287412218824e-05, + "loss": 0.4594, + "step": 3751 + }, + { + "epoch": 0.43, + "learning_rate": 1.275672938197751e-05, + "loss": 0.4848, + "step": 3752 + }, + { + "epoch": 0.43, + "learning_rate": 1.2753170974002727e-05, + "loss": 0.4542, + "step": 3753 + }, + { + "epoch": 0.43, + "learning_rate": 1.2749612188782048e-05, + "loss": 0.4643, + "step": 3754 + }, + { + "epoch": 0.43, + "learning_rate": 1.2746053026803114e-05, + "loss": 0.46, + "step": 3755 + }, + { + "epoch": 0.43, + "learning_rate": 1.2742493488553606e-05, + "loss": 0.4519, + "step": 3756 + }, + { + "epoch": 0.43, + "learning_rate": 1.2738933574521262e-05, + "loss": 0.4655, + "step": 3757 + }, + { + "epoch": 0.43, + "learning_rate": 1.2735373285193867e-05, + "loss": 0.4716, + "step": 3758 + }, + { + "epoch": 0.43, + "learning_rate": 1.2731812621059262e-05, + "loss": 0.469, + "step": 3759 + }, + { + "epoch": 0.43, + "learning_rate": 1.2728251582605335e-05, + "loss": 0.4608, + "step": 3760 + }, + { + "epoch": 0.43, + "learning_rate": 1.2724690170320031e-05, + "loss": 0.4635, + "step": 3761 + }, + { + "epoch": 0.43, + "learning_rate": 1.2721128384691342e-05, + "loss": 0.4552, + "step": 3762 + }, + { + "epoch": 0.43, + "learning_rate": 1.2717566226207311e-05, + "loss": 0.4805, + 
"step": 3763 + }, + { + "epoch": 0.43, + "learning_rate": 1.2714003695356037e-05, + "loss": 0.4595, + "step": 3764 + }, + { + "epoch": 0.43, + "learning_rate": 1.2710440792625662e-05, + "loss": 0.4617, + "step": 3765 + }, + { + "epoch": 0.43, + "learning_rate": 1.2706877518504384e-05, + "loss": 0.485, + "step": 3766 + }, + { + "epoch": 0.43, + "learning_rate": 1.2703313873480451e-05, + "loss": 0.462, + "step": 3767 + }, + { + "epoch": 0.43, + "learning_rate": 1.2699749858042164e-05, + "loss": 0.4743, + "step": 3768 + }, + { + "epoch": 0.43, + "learning_rate": 1.269618547267787e-05, + "loss": 0.4731, + "step": 3769 + }, + { + "epoch": 0.43, + "learning_rate": 1.2692620717875972e-05, + "loss": 0.4591, + "step": 3770 + }, + { + "epoch": 0.43, + "learning_rate": 1.2689055594124919e-05, + "loss": 0.4616, + "step": 3771 + }, + { + "epoch": 0.43, + "learning_rate": 1.2685490101913214e-05, + "loss": 0.4946, + "step": 3772 + }, + { + "epoch": 0.43, + "learning_rate": 1.2681924241729409e-05, + "loss": 0.4563, + "step": 3773 + }, + { + "epoch": 0.43, + "learning_rate": 1.2678358014062104e-05, + "loss": 0.4606, + "step": 3774 + }, + { + "epoch": 0.43, + "learning_rate": 1.2674791419399956e-05, + "loss": 0.4596, + "step": 3775 + }, + { + "epoch": 0.43, + "learning_rate": 1.2671224458231664e-05, + "loss": 0.5041, + "step": 3776 + }, + { + "epoch": 0.43, + "learning_rate": 1.2667657131045983e-05, + "loss": 0.4335, + "step": 3777 + }, + { + "epoch": 0.43, + "learning_rate": 1.2664089438331716e-05, + "loss": 0.4698, + "step": 3778 + }, + { + "epoch": 0.43, + "learning_rate": 1.266052138057772e-05, + "loss": 0.4479, + "step": 3779 + }, + { + "epoch": 0.43, + "learning_rate": 1.2656952958272893e-05, + "loss": 0.4608, + "step": 3780 + }, + { + "epoch": 0.43, + "learning_rate": 1.2653384171906192e-05, + "loss": 0.4745, + "step": 3781 + }, + { + "epoch": 0.43, + "learning_rate": 1.264981502196662e-05, + "loss": 0.4606, + "step": 3782 + }, + { + "epoch": 0.43, + "learning_rate": 
1.2646245508943227e-05, + "loss": 0.47, + "step": 3783 + }, + { + "epoch": 0.43, + "learning_rate": 1.2642675633325122e-05, + "loss": 0.4923, + "step": 3784 + }, + { + "epoch": 0.43, + "learning_rate": 1.2639105395601452e-05, + "loss": 0.4503, + "step": 3785 + }, + { + "epoch": 0.43, + "learning_rate": 1.2635534796261424e-05, + "loss": 0.4632, + "step": 3786 + }, + { + "epoch": 0.43, + "learning_rate": 1.2631963835794285e-05, + "loss": 0.4807, + "step": 3787 + }, + { + "epoch": 0.43, + "learning_rate": 1.2628392514689339e-05, + "loss": 0.4647, + "step": 3788 + }, + { + "epoch": 0.43, + "learning_rate": 1.2624820833435939e-05, + "loss": 0.4696, + "step": 3789 + }, + { + "epoch": 0.43, + "learning_rate": 1.262124879252348e-05, + "loss": 0.4604, + "step": 3790 + }, + { + "epoch": 0.43, + "learning_rate": 1.2617676392441419e-05, + "loss": 0.4466, + "step": 3791 + }, + { + "epoch": 0.43, + "learning_rate": 1.2614103633679244e-05, + "loss": 0.4844, + "step": 3792 + }, + { + "epoch": 0.43, + "learning_rate": 1.2610530516726506e-05, + "loss": 0.4597, + "step": 3793 + }, + { + "epoch": 0.43, + "learning_rate": 1.260695704207281e-05, + "loss": 0.4664, + "step": 3794 + }, + { + "epoch": 0.43, + "learning_rate": 1.2603383210207796e-05, + "loss": 0.4714, + "step": 3795 + }, + { + "epoch": 0.43, + "learning_rate": 1.2599809021621157e-05, + "loss": 0.4559, + "step": 3796 + }, + { + "epoch": 0.43, + "learning_rate": 1.2596234476802636e-05, + "loss": 0.4551, + "step": 3797 + }, + { + "epoch": 0.43, + "learning_rate": 1.2592659576242028e-05, + "loss": 0.4842, + "step": 3798 + }, + { + "epoch": 0.43, + "learning_rate": 1.2589084320429178e-05, + "loss": 0.4632, + "step": 3799 + }, + { + "epoch": 0.43, + "learning_rate": 1.2585508709853971e-05, + "loss": 0.477, + "step": 3800 + }, + { + "epoch": 0.43, + "learning_rate": 1.2581932745006343e-05, + "loss": 0.4712, + "step": 3801 + }, + { + "epoch": 0.43, + "learning_rate": 1.2578356426376283e-05, + "loss": 0.4702, + "step": 3802 + }, + { 
+ "epoch": 0.43, + "learning_rate": 1.2574779754453831e-05, + "loss": 0.4609, + "step": 3803 + }, + { + "epoch": 0.43, + "learning_rate": 1.257120272972907e-05, + "loss": 0.4698, + "step": 3804 + }, + { + "epoch": 0.43, + "learning_rate": 1.2567625352692127e-05, + "loss": 0.4681, + "step": 3805 + }, + { + "epoch": 0.43, + "learning_rate": 1.2564047623833186e-05, + "loss": 0.4557, + "step": 3806 + }, + { + "epoch": 0.44, + "learning_rate": 1.2560469543642472e-05, + "loss": 0.4728, + "step": 3807 + }, + { + "epoch": 0.44, + "learning_rate": 1.255689111261027e-05, + "loss": 0.4641, + "step": 3808 + }, + { + "epoch": 0.44, + "learning_rate": 1.2553312331226896e-05, + "loss": 0.4671, + "step": 3809 + }, + { + "epoch": 0.44, + "learning_rate": 1.254973319998273e-05, + "loss": 0.4723, + "step": 3810 + }, + { + "epoch": 0.44, + "learning_rate": 1.2546153719368189e-05, + "loss": 0.4556, + "step": 3811 + }, + { + "epoch": 0.44, + "learning_rate": 1.2542573889873741e-05, + "loss": 0.479, + "step": 3812 + }, + { + "epoch": 0.44, + "learning_rate": 1.2538993711989906e-05, + "loss": 0.4741, + "step": 3813 + }, + { + "epoch": 0.44, + "learning_rate": 1.2535413186207247e-05, + "loss": 0.4584, + "step": 3814 + }, + { + "epoch": 0.44, + "learning_rate": 1.2531832313016374e-05, + "loss": 0.4645, + "step": 3815 + }, + { + "epoch": 0.44, + "learning_rate": 1.2528251092907948e-05, + "loss": 0.4779, + "step": 3816 + }, + { + "epoch": 0.44, + "learning_rate": 1.2524669526372674e-05, + "loss": 0.4673, + "step": 3817 + }, + { + "epoch": 0.44, + "learning_rate": 1.2521087613901313e-05, + "loss": 0.4735, + "step": 3818 + }, + { + "epoch": 0.44, + "learning_rate": 1.251750535598466e-05, + "loss": 0.4649, + "step": 3819 + }, + { + "epoch": 0.44, + "learning_rate": 1.2513922753113567e-05, + "loss": 0.4772, + "step": 3820 + }, + { + "epoch": 0.44, + "learning_rate": 1.2510339805778932e-05, + "loss": 0.4602, + "step": 3821 + }, + { + "epoch": 0.44, + "learning_rate": 1.2506756514471696e-05, + 
"loss": 0.4734, + "step": 3822 + }, + { + "epoch": 0.44, + "learning_rate": 1.2503172879682853e-05, + "loss": 0.4635, + "step": 3823 + }, + { + "epoch": 0.44, + "learning_rate": 1.2499588901903437e-05, + "loss": 0.4662, + "step": 3824 + }, + { + "epoch": 0.44, + "learning_rate": 1.2496004581624538e-05, + "loss": 0.4722, + "step": 3825 + }, + { + "epoch": 0.44, + "learning_rate": 1.2492419919337282e-05, + "loss": 0.4521, + "step": 3826 + }, + { + "epoch": 0.44, + "learning_rate": 1.2488834915532852e-05, + "loss": 0.4681, + "step": 3827 + }, + { + "epoch": 0.44, + "learning_rate": 1.2485249570702471e-05, + "loss": 0.5047, + "step": 3828 + }, + { + "epoch": 0.44, + "learning_rate": 1.2481663885337417e-05, + "loss": 0.4595, + "step": 3829 + }, + { + "epoch": 0.44, + "learning_rate": 1.2478077859929e-05, + "loss": 0.4662, + "step": 3830 + }, + { + "epoch": 0.44, + "learning_rate": 1.2474491494968593e-05, + "loss": 0.4725, + "step": 3831 + }, + { + "epoch": 0.44, + "learning_rate": 1.2470904790947605e-05, + "loss": 0.4675, + "step": 3832 + }, + { + "epoch": 0.44, + "learning_rate": 1.2467317748357493e-05, + "loss": 0.4636, + "step": 3833 + }, + { + "epoch": 0.44, + "learning_rate": 1.2463730367689768e-05, + "loss": 0.4673, + "step": 3834 + }, + { + "epoch": 0.44, + "learning_rate": 1.246014264943597e-05, + "loss": 0.4526, + "step": 3835 + }, + { + "epoch": 0.44, + "learning_rate": 1.2456554594087709e-05, + "loss": 0.4888, + "step": 3836 + }, + { + "epoch": 0.44, + "learning_rate": 1.245296620213662e-05, + "loss": 0.4601, + "step": 3837 + }, + { + "epoch": 0.44, + "learning_rate": 1.2449377474074398e-05, + "loss": 0.4894, + "step": 3838 + }, + { + "epoch": 0.44, + "learning_rate": 1.2445788410392778e-05, + "loss": 0.4568, + "step": 3839 + }, + { + "epoch": 0.44, + "learning_rate": 1.2442199011583538e-05, + "loss": 0.4489, + "step": 3840 + }, + { + "epoch": 0.44, + "learning_rate": 1.2438609278138509e-05, + "loss": 0.4727, + "step": 3841 + }, + { + "epoch": 0.44, + 
"learning_rate": 1.2435019210549564e-05, + "loss": 0.4745, + "step": 3842 + }, + { + "epoch": 0.44, + "learning_rate": 1.2431428809308625e-05, + "loss": 0.4597, + "step": 3843 + }, + { + "epoch": 0.44, + "learning_rate": 1.2427838074907654e-05, + "loss": 0.4838, + "step": 3844 + }, + { + "epoch": 0.44, + "learning_rate": 1.2424247007838659e-05, + "loss": 0.4504, + "step": 3845 + }, + { + "epoch": 0.44, + "learning_rate": 1.2420655608593701e-05, + "loss": 0.4759, + "step": 3846 + }, + { + "epoch": 0.44, + "learning_rate": 1.2417063877664883e-05, + "loss": 0.4513, + "step": 3847 + }, + { + "epoch": 0.44, + "learning_rate": 1.241347181554435e-05, + "loss": 0.471, + "step": 3848 + }, + { + "epoch": 0.44, + "learning_rate": 1.2409879422724293e-05, + "loss": 0.4847, + "step": 3849 + }, + { + "epoch": 0.44, + "learning_rate": 1.240628669969695e-05, + "loss": 0.4829, + "step": 3850 + }, + { + "epoch": 0.44, + "learning_rate": 1.2402693646954607e-05, + "loss": 0.4523, + "step": 3851 + }, + { + "epoch": 0.44, + "learning_rate": 1.2399100264989593e-05, + "loss": 0.4763, + "step": 3852 + }, + { + "epoch": 0.44, + "learning_rate": 1.2395506554294281e-05, + "loss": 0.4629, + "step": 3853 + }, + { + "epoch": 0.44, + "learning_rate": 1.2391912515361085e-05, + "loss": 0.4689, + "step": 3854 + }, + { + "epoch": 0.44, + "learning_rate": 1.2388318148682474e-05, + "loss": 0.459, + "step": 3855 + }, + { + "epoch": 0.44, + "learning_rate": 1.2384723454750957e-05, + "loss": 0.4638, + "step": 3856 + }, + { + "epoch": 0.44, + "learning_rate": 1.2381128434059082e-05, + "loss": 0.4496, + "step": 3857 + }, + { + "epoch": 0.44, + "learning_rate": 1.2377533087099451e-05, + "loss": 0.4754, + "step": 3858 + }, + { + "epoch": 0.44, + "learning_rate": 1.2373937414364703e-05, + "loss": 0.4502, + "step": 3859 + }, + { + "epoch": 0.44, + "learning_rate": 1.237034141634753e-05, + "loss": 0.4675, + "step": 3860 + }, + { + "epoch": 0.44, + "learning_rate": 1.2366745093540667e-05, + "loss": 0.4682, + 
"step": 3861 + }, + { + "epoch": 0.44, + "learning_rate": 1.2363148446436882e-05, + "loss": 0.4683, + "step": 3862 + }, + { + "epoch": 0.44, + "learning_rate": 1.2359551475529e-05, + "loss": 0.4558, + "step": 3863 + }, + { + "epoch": 0.44, + "learning_rate": 1.2355954181309883e-05, + "loss": 0.4749, + "step": 3864 + }, + { + "epoch": 0.44, + "learning_rate": 1.235235656427245e-05, + "loss": 0.4658, + "step": 3865 + }, + { + "epoch": 0.44, + "learning_rate": 1.2348758624909644e-05, + "loss": 0.4681, + "step": 3866 + }, + { + "epoch": 0.44, + "learning_rate": 1.2345160363714471e-05, + "loss": 0.4694, + "step": 3867 + }, + { + "epoch": 0.44, + "learning_rate": 1.2341561781179965e-05, + "loss": 0.4718, + "step": 3868 + }, + { + "epoch": 0.44, + "learning_rate": 1.233796287779922e-05, + "loss": 0.4808, + "step": 3869 + }, + { + "epoch": 0.44, + "learning_rate": 1.2334363654065363e-05, + "loss": 0.4671, + "step": 3870 + }, + { + "epoch": 0.44, + "learning_rate": 1.2330764110471567e-05, + "loss": 0.4458, + "step": 3871 + }, + { + "epoch": 0.44, + "learning_rate": 1.2327164247511051e-05, + "loss": 0.4809, + "step": 3872 + }, + { + "epoch": 0.44, + "learning_rate": 1.2323564065677078e-05, + "loss": 0.4554, + "step": 3873 + }, + { + "epoch": 0.44, + "learning_rate": 1.2319963565462949e-05, + "loss": 0.4595, + "step": 3874 + }, + { + "epoch": 0.44, + "learning_rate": 1.2316362747362019e-05, + "loss": 0.4644, + "step": 3875 + }, + { + "epoch": 0.44, + "learning_rate": 1.2312761611867673e-05, + "loss": 0.463, + "step": 3876 + }, + { + "epoch": 0.44, + "learning_rate": 1.2309160159473354e-05, + "loss": 0.4827, + "step": 3877 + }, + { + "epoch": 0.44, + "learning_rate": 1.2305558390672539e-05, + "loss": 0.4786, + "step": 3878 + }, + { + "epoch": 0.44, + "learning_rate": 1.2301956305958746e-05, + "loss": 0.4691, + "step": 3879 + }, + { + "epoch": 0.44, + "learning_rate": 1.2298353905825549e-05, + "loss": 0.4597, + "step": 3880 + }, + { + "epoch": 0.44, + "learning_rate": 
1.2294751190766552e-05, + "loss": 0.4655, + "step": 3881 + }, + { + "epoch": 0.44, + "learning_rate": 1.229114816127541e-05, + "loss": 0.4744, + "step": 3882 + }, + { + "epoch": 0.44, + "learning_rate": 1.2287544817845817e-05, + "loss": 0.4498, + "step": 3883 + }, + { + "epoch": 0.44, + "learning_rate": 1.2283941160971512e-05, + "loss": 0.4816, + "step": 3884 + }, + { + "epoch": 0.44, + "learning_rate": 1.2280337191146276e-05, + "loss": 0.4777, + "step": 3885 + }, + { + "epoch": 0.44, + "learning_rate": 1.2276732908863933e-05, + "loss": 0.4736, + "step": 3886 + }, + { + "epoch": 0.44, + "learning_rate": 1.2273128314618353e-05, + "loss": 0.462, + "step": 3887 + }, + { + "epoch": 0.44, + "learning_rate": 1.226952340890344e-05, + "loss": 0.4851, + "step": 3888 + }, + { + "epoch": 0.44, + "learning_rate": 1.2265918192213153e-05, + "loss": 0.4576, + "step": 3889 + }, + { + "epoch": 0.44, + "learning_rate": 1.2262312665041482e-05, + "loss": 0.4893, + "step": 3890 + }, + { + "epoch": 0.44, + "learning_rate": 1.2258706827882472e-05, + "loss": 0.462, + "step": 3891 + }, + { + "epoch": 0.44, + "learning_rate": 1.2255100681230192e-05, + "loss": 0.4632, + "step": 3892 + }, + { + "epoch": 0.44, + "learning_rate": 1.2251494225578775e-05, + "loss": 0.4544, + "step": 3893 + }, + { + "epoch": 0.45, + "learning_rate": 1.224788746142238e-05, + "loss": 0.4882, + "step": 3894 + }, + { + "epoch": 0.45, + "learning_rate": 1.2244280389255218e-05, + "loss": 0.4637, + "step": 3895 + }, + { + "epoch": 0.45, + "learning_rate": 1.2240673009571536e-05, + "loss": 0.459, + "step": 3896 + }, + { + "epoch": 0.45, + "learning_rate": 1.2237065322865625e-05, + "loss": 0.4563, + "step": 3897 + }, + { + "epoch": 0.45, + "learning_rate": 1.223345732963182e-05, + "loss": 0.4665, + "step": 3898 + }, + { + "epoch": 0.45, + "learning_rate": 1.2229849030364496e-05, + "loss": 0.4574, + "step": 3899 + }, + { + "epoch": 0.45, + "learning_rate": 1.2226240425558071e-05, + "loss": 0.4701, + "step": 3900 + }, + { + 
"epoch": 0.45, + "learning_rate": 1.2222631515707005e-05, + "loss": 0.4649, + "step": 3901 + }, + { + "epoch": 0.45, + "learning_rate": 1.2219022301305796e-05, + "loss": 0.4686, + "step": 3902 + }, + { + "epoch": 0.45, + "learning_rate": 1.2215412782848993e-05, + "loss": 0.4741, + "step": 3903 + }, + { + "epoch": 0.45, + "learning_rate": 1.2211802960831176e-05, + "loss": 0.4629, + "step": 3904 + }, + { + "epoch": 0.45, + "learning_rate": 1.2208192835746973e-05, + "loss": 0.4678, + "step": 3905 + }, + { + "epoch": 0.45, + "learning_rate": 1.2204582408091047e-05, + "loss": 0.4732, + "step": 3906 + }, + { + "epoch": 0.45, + "learning_rate": 1.2200971678358113e-05, + "loss": 0.4798, + "step": 3907 + }, + { + "epoch": 0.45, + "learning_rate": 1.2197360647042922e-05, + "loss": 0.4674, + "step": 3908 + }, + { + "epoch": 0.45, + "learning_rate": 1.2193749314640264e-05, + "loss": 0.4708, + "step": 3909 + }, + { + "epoch": 0.45, + "learning_rate": 1.2190137681644968e-05, + "loss": 0.4884, + "step": 3910 + }, + { + "epoch": 0.45, + "learning_rate": 1.2186525748551914e-05, + "loss": 0.468, + "step": 3911 + }, + { + "epoch": 0.45, + "learning_rate": 1.2182913515856016e-05, + "loss": 0.4619, + "step": 3912 + }, + { + "epoch": 0.45, + "learning_rate": 1.2179300984052233e-05, + "loss": 0.4604, + "step": 3913 + }, + { + "epoch": 0.45, + "learning_rate": 1.217568815363556e-05, + "loss": 0.4674, + "step": 3914 + }, + { + "epoch": 0.45, + "learning_rate": 1.2172075025101032e-05, + "loss": 0.4838, + "step": 3915 + }, + { + "epoch": 0.45, + "learning_rate": 1.2168461598943728e-05, + "loss": 0.4492, + "step": 3916 + }, + { + "epoch": 0.45, + "learning_rate": 1.2164847875658776e-05, + "loss": 0.4636, + "step": 3917 + }, + { + "epoch": 0.45, + "learning_rate": 1.2161233855741332e-05, + "loss": 0.4594, + "step": 3918 + }, + { + "epoch": 0.45, + "learning_rate": 1.2157619539686597e-05, + "loss": 0.488, + "step": 3919 + }, + { + "epoch": 0.45, + "learning_rate": 1.2154004927989815e-05, + 
"loss": 0.4618, + "step": 3920 + }, + { + "epoch": 0.45, + "learning_rate": 1.2150390021146263e-05, + "loss": 0.4698, + "step": 3921 + }, + { + "epoch": 0.45, + "learning_rate": 1.2146774819651275e-05, + "loss": 0.4638, + "step": 3922 + }, + { + "epoch": 0.45, + "learning_rate": 1.2143159324000204e-05, + "loss": 0.4549, + "step": 3923 + }, + { + "epoch": 0.45, + "learning_rate": 1.2139543534688456e-05, + "loss": 0.4666, + "step": 3924 + }, + { + "epoch": 0.45, + "learning_rate": 1.2135927452211477e-05, + "loss": 0.4641, + "step": 3925 + }, + { + "epoch": 0.45, + "learning_rate": 1.2132311077064749e-05, + "loss": 0.4634, + "step": 3926 + }, + { + "epoch": 0.45, + "learning_rate": 1.2128694409743797e-05, + "loss": 0.4843, + "step": 3927 + }, + { + "epoch": 0.45, + "learning_rate": 1.2125077450744187e-05, + "loss": 0.4659, + "step": 3928 + }, + { + "epoch": 0.45, + "learning_rate": 1.2121460200561521e-05, + "loss": 0.4573, + "step": 3929 + }, + { + "epoch": 0.45, + "learning_rate": 1.2117842659691444e-05, + "loss": 0.4587, + "step": 3930 + }, + { + "epoch": 0.45, + "learning_rate": 1.2114224828629638e-05, + "loss": 0.467, + "step": 3931 + }, + { + "epoch": 0.45, + "learning_rate": 1.2110606707871828e-05, + "loss": 0.4531, + "step": 3932 + }, + { + "epoch": 0.45, + "learning_rate": 1.2106988297913778e-05, + "loss": 0.4764, + "step": 3933 + }, + { + "epoch": 0.45, + "learning_rate": 1.2103369599251289e-05, + "loss": 0.4769, + "step": 3934 + }, + { + "epoch": 0.45, + "learning_rate": 1.2099750612380205e-05, + "loss": 0.4911, + "step": 3935 + }, + { + "epoch": 0.45, + "learning_rate": 1.2096131337796408e-05, + "loss": 0.4438, + "step": 3936 + }, + { + "epoch": 0.45, + "learning_rate": 1.2092511775995821e-05, + "loss": 0.4926, + "step": 3937 + }, + { + "epoch": 0.45, + "learning_rate": 1.20888919274744e-05, + "loss": 0.4568, + "step": 3938 + }, + { + "epoch": 0.45, + "learning_rate": 1.208527179272815e-05, + "loss": 0.4464, + "step": 3939 + }, + { + "epoch": 0.45, + 
"learning_rate": 1.2081651372253107e-05, + "loss": 0.4697, + "step": 3940 + }, + { + "epoch": 0.45, + "learning_rate": 1.2078030666545351e-05, + "loss": 0.4728, + "step": 3941 + }, + { + "epoch": 0.45, + "learning_rate": 1.2074409676101e-05, + "loss": 0.4593, + "step": 3942 + }, + { + "epoch": 0.45, + "learning_rate": 1.2070788401416209e-05, + "loss": 0.4709, + "step": 3943 + }, + { + "epoch": 0.45, + "learning_rate": 1.2067166842987175e-05, + "loss": 0.4667, + "step": 3944 + }, + { + "epoch": 0.45, + "learning_rate": 1.206354500131013e-05, + "loss": 0.4728, + "step": 3945 + }, + { + "epoch": 0.45, + "learning_rate": 1.205992287688135e-05, + "loss": 0.4962, + "step": 3946 + }, + { + "epoch": 0.45, + "learning_rate": 1.2056300470197144e-05, + "loss": 0.4613, + "step": 3947 + }, + { + "epoch": 0.45, + "learning_rate": 1.2052677781753869e-05, + "loss": 0.4539, + "step": 3948 + }, + { + "epoch": 0.45, + "learning_rate": 1.2049054812047905e-05, + "loss": 0.4602, + "step": 3949 + }, + { + "epoch": 0.45, + "learning_rate": 1.2045431561575685e-05, + "loss": 0.4621, + "step": 3950 + }, + { + "epoch": 0.45, + "learning_rate": 1.2041808030833675e-05, + "loss": 0.4672, + "step": 3951 + }, + { + "epoch": 0.45, + "learning_rate": 1.2038184220318381e-05, + "loss": 0.4647, + "step": 3952 + }, + { + "epoch": 0.45, + "learning_rate": 1.2034560130526341e-05, + "loss": 0.4773, + "step": 3953 + }, + { + "epoch": 0.45, + "learning_rate": 1.2030935761954137e-05, + "loss": 0.4605, + "step": 3954 + }, + { + "epoch": 0.45, + "learning_rate": 1.2027311115098395e-05, + "loss": 0.4847, + "step": 3955 + }, + { + "epoch": 0.45, + "learning_rate": 1.2023686190455766e-05, + "loss": 0.4613, + "step": 3956 + }, + { + "epoch": 0.45, + "learning_rate": 1.202006098852295e-05, + "loss": 0.4723, + "step": 3957 + }, + { + "epoch": 0.45, + "learning_rate": 1.2016435509796677e-05, + "loss": 0.4552, + "step": 3958 + }, + { + "epoch": 0.45, + "learning_rate": 1.2012809754773718e-05, + "loss": 0.4701, + 
"step": 3959 + }, + { + "epoch": 0.45, + "learning_rate": 1.2009183723950886e-05, + "loss": 0.4679, + "step": 3960 + }, + { + "epoch": 0.45, + "learning_rate": 1.2005557417825029e-05, + "loss": 0.4634, + "step": 3961 + }, + { + "epoch": 0.45, + "learning_rate": 1.2001930836893026e-05, + "loss": 0.4503, + "step": 3962 + }, + { + "epoch": 0.45, + "learning_rate": 1.1998303981651804e-05, + "loss": 0.4864, + "step": 3963 + }, + { + "epoch": 0.45, + "learning_rate": 1.199467685259832e-05, + "loss": 0.4726, + "step": 3964 + }, + { + "epoch": 0.45, + "learning_rate": 1.1991049450229577e-05, + "loss": 0.4478, + "step": 3965 + }, + { + "epoch": 0.45, + "learning_rate": 1.1987421775042605e-05, + "loss": 0.4787, + "step": 3966 + }, + { + "epoch": 0.45, + "learning_rate": 1.1983793827534477e-05, + "loss": 0.4795, + "step": 3967 + }, + { + "epoch": 0.45, + "learning_rate": 1.1980165608202303e-05, + "loss": 0.4697, + "step": 3968 + }, + { + "epoch": 0.45, + "learning_rate": 1.1976537117543234e-05, + "loss": 0.4527, + "step": 3969 + }, + { + "epoch": 0.45, + "learning_rate": 1.1972908356054455e-05, + "loss": 0.4764, + "step": 3970 + }, + { + "epoch": 0.45, + "learning_rate": 1.1969279324233179e-05, + "loss": 0.4666, + "step": 3971 + }, + { + "epoch": 0.45, + "learning_rate": 1.1965650022576672e-05, + "loss": 0.4536, + "step": 3972 + }, + { + "epoch": 0.45, + "learning_rate": 1.196202045158222e-05, + "loss": 0.4626, + "step": 3973 + }, + { + "epoch": 0.45, + "learning_rate": 1.1958390611747167e-05, + "loss": 0.471, + "step": 3974 + }, + { + "epoch": 0.45, + "learning_rate": 1.1954760503568878e-05, + "loss": 0.4596, + "step": 3975 + }, + { + "epoch": 0.45, + "learning_rate": 1.1951130127544756e-05, + "loss": 0.4595, + "step": 3976 + }, + { + "epoch": 0.45, + "learning_rate": 1.1947499484172245e-05, + "loss": 0.4803, + "step": 3977 + }, + { + "epoch": 0.45, + "learning_rate": 1.1943868573948825e-05, + "loss": 0.4693, + "step": 3978 + }, + { + "epoch": 0.45, + "learning_rate": 
1.194023739737201e-05, + "loss": 0.4819, + "step": 3979 + }, + { + "epoch": 0.45, + "learning_rate": 1.1936605954939355e-05, + "loss": 0.4549, + "step": 3980 + }, + { + "epoch": 0.45, + "learning_rate": 1.1932974247148445e-05, + "loss": 0.4593, + "step": 3981 + }, + { + "epoch": 0.46, + "learning_rate": 1.192934227449691e-05, + "loss": 0.4699, + "step": 3982 + }, + { + "epoch": 0.46, + "learning_rate": 1.1925710037482405e-05, + "loss": 0.4489, + "step": 3983 + }, + { + "epoch": 0.46, + "learning_rate": 1.1922077536602634e-05, + "loss": 0.4618, + "step": 3984 + }, + { + "epoch": 0.46, + "learning_rate": 1.1918444772355329e-05, + "loss": 0.4802, + "step": 3985 + }, + { + "epoch": 0.46, + "learning_rate": 1.1914811745238256e-05, + "loss": 0.4796, + "step": 3986 + }, + { + "epoch": 0.46, + "learning_rate": 1.1911178455749223e-05, + "loss": 0.4778, + "step": 3987 + }, + { + "epoch": 0.46, + "learning_rate": 1.1907544904386074e-05, + "loss": 0.4448, + "step": 3988 + }, + { + "epoch": 0.46, + "learning_rate": 1.1903911091646684e-05, + "loss": 0.4815, + "step": 3989 + }, + { + "epoch": 0.46, + "learning_rate": 1.190027701802897e-05, + "loss": 0.4605, + "step": 3990 + }, + { + "epoch": 0.46, + "learning_rate": 1.1896642684030874e-05, + "loss": 0.4692, + "step": 3991 + }, + { + "epoch": 0.46, + "learning_rate": 1.1893008090150389e-05, + "loss": 0.4576, + "step": 3992 + }, + { + "epoch": 0.46, + "learning_rate": 1.1889373236885531e-05, + "loss": 0.472, + "step": 3993 + }, + { + "epoch": 0.46, + "learning_rate": 1.1885738124734359e-05, + "loss": 0.4549, + "step": 3994 + }, + { + "epoch": 0.46, + "learning_rate": 1.188210275419496e-05, + "loss": 0.481, + "step": 3995 + }, + { + "epoch": 0.46, + "learning_rate": 1.1878467125765464e-05, + "loss": 0.449, + "step": 3996 + }, + { + "epoch": 0.46, + "learning_rate": 1.1874831239944034e-05, + "loss": 0.4738, + "step": 3997 + }, + { + "epoch": 0.46, + "learning_rate": 1.1871195097228864e-05, + "loss": 0.4507, + "step": 3998 + }, + { + 
"epoch": 0.46, + "learning_rate": 1.1867558698118192e-05, + "loss": 0.4777, + "step": 3999 + }, + { + "epoch": 0.46, + "learning_rate": 1.1863922043110282e-05, + "loss": 0.4692, + "step": 4000 + }, + { + "epoch": 0.46, + "learning_rate": 1.1860285132703435e-05, + "loss": 0.474, + "step": 4001 + }, + { + "epoch": 0.46, + "learning_rate": 1.1856647967395995e-05, + "loss": 0.462, + "step": 4002 + }, + { + "epoch": 0.46, + "learning_rate": 1.185301054768633e-05, + "loss": 0.4657, + "step": 4003 + }, + { + "epoch": 0.46, + "learning_rate": 1.1849372874072852e-05, + "loss": 0.4854, + "step": 4004 + }, + { + "epoch": 0.46, + "learning_rate": 1.1845734947054e-05, + "loss": 0.4589, + "step": 4005 + }, + { + "epoch": 0.46, + "learning_rate": 1.1842096767128249e-05, + "loss": 0.4663, + "step": 4006 + }, + { + "epoch": 0.46, + "learning_rate": 1.1838458334794116e-05, + "loss": 0.4764, + "step": 4007 + }, + { + "epoch": 0.46, + "learning_rate": 1.1834819650550144e-05, + "loss": 0.4485, + "step": 4008 + }, + { + "epoch": 0.46, + "learning_rate": 1.1831180714894918e-05, + "loss": 0.4674, + "step": 4009 + }, + { + "epoch": 0.46, + "learning_rate": 1.1827541528327052e-05, + "loss": 0.4603, + "step": 4010 + }, + { + "epoch": 0.46, + "learning_rate": 1.182390209134519e-05, + "loss": 0.4521, + "step": 4011 + }, + { + "epoch": 0.46, + "learning_rate": 1.1820262404448023e-05, + "loss": 0.4839, + "step": 4012 + }, + { + "epoch": 0.46, + "learning_rate": 1.181662246813427e-05, + "loss": 0.4953, + "step": 4013 + }, + { + "epoch": 0.46, + "learning_rate": 1.1812982282902676e-05, + "loss": 0.4508, + "step": 4014 + }, + { + "epoch": 0.46, + "learning_rate": 1.1809341849252034e-05, + "loss": 0.4796, + "step": 4015 + }, + { + "epoch": 0.46, + "learning_rate": 1.180570116768116e-05, + "loss": 0.4567, + "step": 4016 + }, + { + "epoch": 0.46, + "learning_rate": 1.1802060238688915e-05, + "loss": 0.4681, + "step": 4017 + }, + { + "epoch": 0.46, + "learning_rate": 1.1798419062774185e-05, + "loss": 
0.4736, + "step": 4018 + }, + { + "epoch": 0.46, + "learning_rate": 1.179477764043589e-05, + "loss": 0.4584, + "step": 4019 + }, + { + "epoch": 0.46, + "learning_rate": 1.1791135972172989e-05, + "loss": 0.4717, + "step": 4020 + }, + { + "epoch": 0.46, + "learning_rate": 1.1787494058484468e-05, + "loss": 0.4796, + "step": 4021 + }, + { + "epoch": 0.46, + "learning_rate": 1.1783851899869357e-05, + "loss": 0.4622, + "step": 4022 + }, + { + "epoch": 0.46, + "learning_rate": 1.1780209496826707e-05, + "loss": 0.4661, + "step": 4023 + }, + { + "epoch": 0.46, + "learning_rate": 1.177656684985561e-05, + "loss": 0.4547, + "step": 4024 + }, + { + "epoch": 0.46, + "learning_rate": 1.1772923959455188e-05, + "loss": 0.4589, + "step": 4025 + }, + { + "epoch": 0.46, + "learning_rate": 1.1769280826124604e-05, + "loss": 0.4461, + "step": 4026 + }, + { + "epoch": 0.46, + "learning_rate": 1.1765637450363048e-05, + "loss": 0.479, + "step": 4027 + }, + { + "epoch": 0.46, + "learning_rate": 1.176199383266974e-05, + "loss": 0.475, + "step": 4028 + }, + { + "epoch": 0.46, + "learning_rate": 1.1758349973543936e-05, + "loss": 0.485, + "step": 4029 + }, + { + "epoch": 0.46, + "learning_rate": 1.1754705873484929e-05, + "loss": 0.4537, + "step": 4030 + }, + { + "epoch": 0.46, + "learning_rate": 1.1751061532992045e-05, + "loss": 0.4889, + "step": 4031 + }, + { + "epoch": 0.46, + "learning_rate": 1.1747416952564632e-05, + "loss": 0.4695, + "step": 4032 + }, + { + "epoch": 0.46, + "learning_rate": 1.1743772132702086e-05, + "loss": 0.4752, + "step": 4033 + }, + { + "epoch": 0.46, + "learning_rate": 1.1740127073903826e-05, + "loss": 0.4519, + "step": 4034 + }, + { + "epoch": 0.46, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.4603, + "step": 4035 + }, + { + "epoch": 0.46, + "learning_rate": 1.1732836241498013e-05, + "loss": 0.4767, + "step": 4036 + }, + { + "epoch": 0.46, + "learning_rate": 1.1729190468889466e-05, + "loss": 0.4883, + "step": 4037 + }, + { + "epoch": 0.46, + "learning_rate": 
1.1725544459343221e-05, + "loss": 0.4877, + "step": 4038 + }, + { + "epoch": 0.46, + "learning_rate": 1.172189821335886e-05, + "loss": 0.4644, + "step": 4039 + }, + { + "epoch": 0.46, + "learning_rate": 1.1718251731436001e-05, + "loss": 0.4534, + "step": 4040 + }, + { + "epoch": 0.46, + "learning_rate": 1.1714605014074291e-05, + "loss": 0.4697, + "step": 4041 + }, + { + "epoch": 0.46, + "learning_rate": 1.1710958061773413e-05, + "loss": 0.4632, + "step": 4042 + }, + { + "epoch": 0.46, + "learning_rate": 1.1707310875033085e-05, + "loss": 0.4561, + "step": 4043 + }, + { + "epoch": 0.46, + "learning_rate": 1.1703663454353045e-05, + "loss": 0.4752, + "step": 4044 + }, + { + "epoch": 0.46, + "learning_rate": 1.1700015800233078e-05, + "loss": 0.4879, + "step": 4045 + }, + { + "epoch": 0.46, + "learning_rate": 1.169636791317299e-05, + "loss": 0.4552, + "step": 4046 + }, + { + "epoch": 0.46, + "learning_rate": 1.1692719793672627e-05, + "loss": 0.4641, + "step": 4047 + }, + { + "epoch": 0.46, + "learning_rate": 1.1689071442231858e-05, + "loss": 0.4615, + "step": 4048 + }, + { + "epoch": 0.46, + "learning_rate": 1.1685422859350592e-05, + "loss": 0.4859, + "step": 4049 + }, + { + "epoch": 0.46, + "learning_rate": 1.1681774045528764e-05, + "loss": 0.4727, + "step": 4050 + }, + { + "epoch": 0.46, + "learning_rate": 1.1678125001266347e-05, + "loss": 0.4682, + "step": 4051 + }, + { + "epoch": 0.46, + "learning_rate": 1.1674475727063337e-05, + "loss": 0.4597, + "step": 4052 + }, + { + "epoch": 0.46, + "learning_rate": 1.1670826223419766e-05, + "loss": 0.4775, + "step": 4053 + }, + { + "epoch": 0.46, + "learning_rate": 1.1667176490835701e-05, + "loss": 0.4741, + "step": 4054 + }, + { + "epoch": 0.46, + "learning_rate": 1.1663526529811235e-05, + "loss": 0.4678, + "step": 4055 + }, + { + "epoch": 0.46, + "learning_rate": 1.1659876340846494e-05, + "loss": 0.4658, + "step": 4056 + }, + { + "epoch": 0.46, + "learning_rate": 1.165622592444164e-05, + "loss": 0.4533, + "step": 4057 + }, + 
{ + "epoch": 0.46, + "learning_rate": 1.165257528109685e-05, + "loss": 0.4822, + "step": 4058 + }, + { + "epoch": 0.46, + "learning_rate": 1.1648924411312354e-05, + "loss": 0.4722, + "step": 4059 + }, + { + "epoch": 0.46, + "learning_rate": 1.1645273315588399e-05, + "loss": 0.4607, + "step": 4060 + }, + { + "epoch": 0.46, + "learning_rate": 1.164162199442527e-05, + "loss": 0.4798, + "step": 4061 + }, + { + "epoch": 0.46, + "learning_rate": 1.1637970448323274e-05, + "loss": 0.4758, + "step": 4062 + }, + { + "epoch": 0.46, + "learning_rate": 1.1634318677782755e-05, + "loss": 0.4617, + "step": 4063 + }, + { + "epoch": 0.46, + "learning_rate": 1.163066668330409e-05, + "loss": 0.467, + "step": 4064 + }, + { + "epoch": 0.46, + "learning_rate": 1.1627014465387685e-05, + "loss": 0.4489, + "step": 4065 + }, + { + "epoch": 0.46, + "learning_rate": 1.1623362024533974e-05, + "loss": 0.4678, + "step": 4066 + }, + { + "epoch": 0.46, + "learning_rate": 1.161970936124342e-05, + "loss": 0.4925, + "step": 4067 + }, + { + "epoch": 0.46, + "learning_rate": 1.1616056476016521e-05, + "loss": 0.4605, + "step": 4068 + }, + { + "epoch": 0.47, + "learning_rate": 1.1612403369353806e-05, + "loss": 0.4645, + "step": 4069 + }, + { + "epoch": 0.47, + "learning_rate": 1.1608750041755832e-05, + "loss": 0.4682, + "step": 4070 + }, + { + "epoch": 0.47, + "learning_rate": 1.1605096493723183e-05, + "loss": 0.4891, + "step": 4071 + }, + { + "epoch": 0.47, + "learning_rate": 1.1601442725756478e-05, + "loss": 0.4667, + "step": 4072 + }, + { + "epoch": 0.47, + "learning_rate": 1.1597788738356365e-05, + "loss": 0.4829, + "step": 4073 + }, + { + "epoch": 0.47, + "learning_rate": 1.1594134532023525e-05, + "loss": 0.4345, + "step": 4074 + }, + { + "epoch": 0.47, + "learning_rate": 1.1590480107258663e-05, + "loss": 0.4448, + "step": 4075 + }, + { + "epoch": 0.47, + "learning_rate": 1.1586825464562515e-05, + "loss": 0.4806, + "step": 4076 + }, + { + "epoch": 0.47, + "learning_rate": 1.1583170604435851e-05, + 
"loss": 0.4897, + "step": 4077 + }, + { + "epoch": 0.47, + "learning_rate": 1.1579515527379468e-05, + "loss": 0.4704, + "step": 4078 + }, + { + "epoch": 0.47, + "learning_rate": 1.1575860233894195e-05, + "loss": 0.4678, + "step": 4079 + }, + { + "epoch": 0.47, + "learning_rate": 1.1572204724480887e-05, + "loss": 0.4891, + "step": 4080 + }, + { + "epoch": 0.47, + "learning_rate": 1.1568548999640428e-05, + "loss": 0.4619, + "step": 4081 + }, + { + "epoch": 0.47, + "learning_rate": 1.1564893059873734e-05, + "loss": 0.4855, + "step": 4082 + }, + { + "epoch": 0.47, + "learning_rate": 1.1561236905681761e-05, + "loss": 0.4489, + "step": 4083 + }, + { + "epoch": 0.47, + "learning_rate": 1.155758053756547e-05, + "loss": 0.4747, + "step": 4084 + }, + { + "epoch": 0.47, + "learning_rate": 1.1553923956025871e-05, + "loss": 0.4877, + "step": 4085 + }, + { + "epoch": 0.47, + "learning_rate": 1.1550267161563998e-05, + "loss": 0.4616, + "step": 4086 + }, + { + "epoch": 0.47, + "learning_rate": 1.1546610154680908e-05, + "loss": 0.4657, + "step": 4087 + }, + { + "epoch": 0.47, + "learning_rate": 1.1542952935877703e-05, + "loss": 0.4554, + "step": 4088 + }, + { + "epoch": 0.47, + "learning_rate": 1.1539295505655494e-05, + "loss": 0.4621, + "step": 4089 + }, + { + "epoch": 0.47, + "learning_rate": 1.1535637864515436e-05, + "loss": 0.4696, + "step": 4090 + }, + { + "epoch": 0.47, + "learning_rate": 1.1531980012958706e-05, + "loss": 0.4821, + "step": 4091 + }, + { + "epoch": 0.47, + "learning_rate": 1.152832195148651e-05, + "loss": 0.4452, + "step": 4092 + }, + { + "epoch": 0.47, + "learning_rate": 1.1524663680600083e-05, + "loss": 0.5015, + "step": 4093 + }, + { + "epoch": 0.47, + "learning_rate": 1.1521005200800694e-05, + "loss": 0.4502, + "step": 4094 + }, + { + "epoch": 0.47, + "learning_rate": 1.1517346512589635e-05, + "loss": 0.4993, + "step": 4095 + }, + { + "epoch": 0.47, + "learning_rate": 1.1513687616468225e-05, + "loss": 0.4516, + "step": 4096 + }, + { + "epoch": 0.47, + 
"learning_rate": 1.1510028512937818e-05, + "loss": 0.4495, + "step": 4097 + }, + { + "epoch": 0.47, + "learning_rate": 1.1506369202499791e-05, + "loss": 0.4643, + "step": 4098 + }, + { + "epoch": 0.47, + "learning_rate": 1.1502709685655553e-05, + "loss": 0.474, + "step": 4099 + }, + { + "epoch": 0.47, + "learning_rate": 1.149904996290654e-05, + "loss": 0.448, + "step": 4100 + }, + { + "epoch": 0.47, + "learning_rate": 1.149539003475421e-05, + "loss": 0.4667, + "step": 4101 + }, + { + "epoch": 0.47, + "learning_rate": 1.1491729901700062e-05, + "loss": 0.4557, + "step": 4102 + }, + { + "epoch": 0.47, + "learning_rate": 1.148806956424561e-05, + "loss": 0.4826, + "step": 4103 + }, + { + "epoch": 0.47, + "learning_rate": 1.1484409022892406e-05, + "loss": 0.4517, + "step": 4104 + }, + { + "epoch": 0.47, + "learning_rate": 1.1480748278142025e-05, + "loss": 0.4756, + "step": 4105 + }, + { + "epoch": 0.47, + "learning_rate": 1.1477087330496071e-05, + "loss": 0.4744, + "step": 4106 + }, + { + "epoch": 0.47, + "learning_rate": 1.1473426180456174e-05, + "loss": 0.4785, + "step": 4107 + }, + { + "epoch": 0.47, + "learning_rate": 1.1469764828523995e-05, + "loss": 0.4586, + "step": 4108 + }, + { + "epoch": 0.47, + "learning_rate": 1.146610327520122e-05, + "loss": 0.4743, + "step": 4109 + }, + { + "epoch": 0.47, + "learning_rate": 1.1462441520989565e-05, + "loss": 0.4627, + "step": 4110 + }, + { + "epoch": 0.47, + "learning_rate": 1.1458779566390768e-05, + "loss": 0.4851, + "step": 4111 + }, + { + "epoch": 0.47, + "learning_rate": 1.1455117411906604e-05, + "loss": 0.4588, + "step": 4112 + }, + { + "epoch": 0.47, + "learning_rate": 1.1451455058038864e-05, + "loss": 0.47, + "step": 4113 + }, + { + "epoch": 0.47, + "learning_rate": 1.1447792505289384e-05, + "loss": 0.4498, + "step": 4114 + }, + { + "epoch": 0.47, + "learning_rate": 1.1444129754159998e-05, + "loss": 0.4372, + "step": 4115 + }, + { + "epoch": 0.47, + "learning_rate": 1.1440466805152596e-05, + "loss": 0.4844, + "step": 
4116 + }, + { + "epoch": 0.47, + "learning_rate": 1.1436803658769082e-05, + "loss": 0.4632, + "step": 4117 + }, + { + "epoch": 0.47, + "learning_rate": 1.1433140315511392e-05, + "loss": 0.4676, + "step": 4118 + }, + { + "epoch": 0.47, + "learning_rate": 1.142947677588148e-05, + "loss": 0.4622, + "step": 4119 + }, + { + "epoch": 0.47, + "learning_rate": 1.1425813040381332e-05, + "loss": 0.4423, + "step": 4120 + }, + { + "epoch": 0.47, + "learning_rate": 1.1422149109512967e-05, + "loss": 0.4961, + "step": 4121 + }, + { + "epoch": 0.47, + "learning_rate": 1.1418484983778421e-05, + "loss": 0.4723, + "step": 4122 + }, + { + "epoch": 0.47, + "learning_rate": 1.1414820663679768e-05, + "loss": 0.486, + "step": 4123 + }, + { + "epoch": 0.47, + "learning_rate": 1.1411156149719094e-05, + "loss": 0.4641, + "step": 4124 + }, + { + "epoch": 0.47, + "learning_rate": 1.1407491442398518e-05, + "loss": 0.4745, + "step": 4125 + }, + { + "epoch": 0.47, + "learning_rate": 1.1403826542220193e-05, + "loss": 0.4447, + "step": 4126 + }, + { + "epoch": 0.47, + "learning_rate": 1.1400161449686293e-05, + "loss": 0.4544, + "step": 4127 + }, + { + "epoch": 0.47, + "learning_rate": 1.139649616529901e-05, + "loss": 0.4771, + "step": 4128 + }, + { + "epoch": 0.47, + "learning_rate": 1.1392830689560577e-05, + "loss": 0.4845, + "step": 4129 + }, + { + "epoch": 0.47, + "learning_rate": 1.1389165022973238e-05, + "loss": 0.4613, + "step": 4130 + }, + { + "epoch": 0.47, + "learning_rate": 1.1385499166039281e-05, + "loss": 0.4566, + "step": 4131 + }, + { + "epoch": 0.47, + "learning_rate": 1.1381833119261003e-05, + "loss": 0.4597, + "step": 4132 + }, + { + "epoch": 0.47, + "learning_rate": 1.1378166883140738e-05, + "loss": 0.4653, + "step": 4133 + }, + { + "epoch": 0.47, + "learning_rate": 1.1374500458180839e-05, + "loss": 0.4702, + "step": 4134 + }, + { + "epoch": 0.47, + "learning_rate": 1.137083384488369e-05, + "loss": 0.4467, + "step": 4135 + }, + { + "epoch": 0.47, + "learning_rate": 
1.1367167043751701e-05, + "loss": 0.4668, + "step": 4136 + }, + { + "epoch": 0.47, + "learning_rate": 1.1363500055287301e-05, + "loss": 0.4543, + "step": 4137 + }, + { + "epoch": 0.47, + "learning_rate": 1.1359832879992956e-05, + "loss": 0.4653, + "step": 4138 + }, + { + "epoch": 0.47, + "learning_rate": 1.1356165518371142e-05, + "loss": 0.4749, + "step": 4139 + }, + { + "epoch": 0.47, + "learning_rate": 1.1352497970924376e-05, + "loss": 0.4748, + "step": 4140 + }, + { + "epoch": 0.47, + "learning_rate": 1.1348830238155191e-05, + "loss": 0.4532, + "step": 4141 + }, + { + "epoch": 0.47, + "learning_rate": 1.134516232056615e-05, + "loss": 0.4458, + "step": 4142 + }, + { + "epoch": 0.47, + "learning_rate": 1.134149421865984e-05, + "loss": 0.4781, + "step": 4143 + }, + { + "epoch": 0.47, + "learning_rate": 1.1337825932938866e-05, + "loss": 0.4488, + "step": 4144 + }, + { + "epoch": 0.47, + "learning_rate": 1.1334157463905876e-05, + "loss": 0.4605, + "step": 4145 + }, + { + "epoch": 0.47, + "learning_rate": 1.1330488812063526e-05, + "loss": 0.4605, + "step": 4146 + }, + { + "epoch": 0.47, + "learning_rate": 1.1326819977914503e-05, + "loss": 0.4966, + "step": 4147 + }, + { + "epoch": 0.47, + "learning_rate": 1.132315096196152e-05, + "loss": 0.4677, + "step": 4148 + }, + { + "epoch": 0.47, + "learning_rate": 1.1319481764707313e-05, + "loss": 0.4461, + "step": 4149 + }, + { + "epoch": 0.47, + "learning_rate": 1.131581238665465e-05, + "loss": 0.4615, + "step": 4150 + }, + { + "epoch": 0.47, + "learning_rate": 1.1312142828306309e-05, + "loss": 0.4617, + "step": 4151 + }, + { + "epoch": 0.47, + "learning_rate": 1.1308473090165107e-05, + "loss": 0.4534, + "step": 4152 + }, + { + "epoch": 0.47, + "learning_rate": 1.1304803172733878e-05, + "loss": 0.4575, + "step": 4153 + }, + { + "epoch": 0.47, + "learning_rate": 1.1301133076515482e-05, + "loss": 0.463, + "step": 4154 + }, + { + "epoch": 0.47, + "learning_rate": 1.1297462802012806e-05, + "loss": 0.4876, + "step": 4155 + }, + { 
+ "epoch": 0.47, + "learning_rate": 1.129379234972876e-05, + "loss": 0.4654, + "step": 4156 + }, + { + "epoch": 0.48, + "learning_rate": 1.1290121720166277e-05, + "loss": 0.4713, + "step": 4157 + }, + { + "epoch": 0.48, + "learning_rate": 1.1286450913828313e-05, + "loss": 0.4479, + "step": 4158 + }, + { + "epoch": 0.48, + "learning_rate": 1.1282779931217852e-05, + "loss": 0.4562, + "step": 4159 + }, + { + "epoch": 0.48, + "learning_rate": 1.1279108772837901e-05, + "loss": 0.4596, + "step": 4160 + }, + { + "epoch": 0.48, + "learning_rate": 1.1275437439191493e-05, + "loss": 0.469, + "step": 4161 + }, + { + "epoch": 0.48, + "learning_rate": 1.1271765930781677e-05, + "loss": 0.4607, + "step": 4162 + }, + { + "epoch": 0.48, + "learning_rate": 1.1268094248111536e-05, + "loss": 0.471, + "step": 4163 + }, + { + "epoch": 0.48, + "learning_rate": 1.1264422391684171e-05, + "loss": 0.4497, + "step": 4164 + }, + { + "epoch": 0.48, + "learning_rate": 1.126075036200271e-05, + "loss": 0.4635, + "step": 4165 + }, + { + "epoch": 0.48, + "learning_rate": 1.1257078159570303e-05, + "loss": 0.4637, + "step": 4166 + }, + { + "epoch": 0.48, + "learning_rate": 1.125340578489012e-05, + "loss": 0.4544, + "step": 4167 + }, + { + "epoch": 0.48, + "learning_rate": 1.1249733238465359e-05, + "loss": 0.4707, + "step": 4168 + }, + { + "epoch": 0.48, + "learning_rate": 1.1246060520799244e-05, + "loss": 0.4489, + "step": 4169 + }, + { + "epoch": 0.48, + "learning_rate": 1.1242387632395019e-05, + "loss": 0.4794, + "step": 4170 + }, + { + "epoch": 0.48, + "learning_rate": 1.1238714573755954e-05, + "loss": 0.4467, + "step": 4171 + }, + { + "epoch": 0.48, + "learning_rate": 1.1235041345385328e-05, + "loss": 0.4618, + "step": 4172 + }, + { + "epoch": 0.48, + "learning_rate": 1.123136794778647e-05, + "loss": 0.4623, + "step": 4173 + }, + { + "epoch": 0.48, + "learning_rate": 1.122769438146271e-05, + "loss": 0.4752, + "step": 4174 + }, + { + "epoch": 0.48, + "learning_rate": 1.1224020646917413e-05, + 
"loss": 0.4514, + "step": 4175 + }, + { + "epoch": 0.48, + "learning_rate": 1.1220346744653956e-05, + "loss": 0.4628, + "step": 4176 + }, + { + "epoch": 0.48, + "learning_rate": 1.1216672675175748e-05, + "loss": 0.4627, + "step": 4177 + }, + { + "epoch": 0.48, + "learning_rate": 1.1212998438986223e-05, + "loss": 0.4575, + "step": 4178 + }, + { + "epoch": 0.48, + "learning_rate": 1.1209324036588828e-05, + "loss": 0.4789, + "step": 4179 + }, + { + "epoch": 0.48, + "learning_rate": 1.1205649468487042e-05, + "loss": 0.4914, + "step": 4180 + }, + { + "epoch": 0.48, + "learning_rate": 1.1201974735184362e-05, + "loss": 0.4602, + "step": 4181 + }, + { + "epoch": 0.48, + "learning_rate": 1.1198299837184305e-05, + "loss": 0.4697, + "step": 4182 + }, + { + "epoch": 0.48, + "learning_rate": 1.1194624774990418e-05, + "loss": 0.4737, + "step": 4183 + }, + { + "epoch": 0.48, + "learning_rate": 1.119094954910627e-05, + "loss": 0.4548, + "step": 4184 + }, + { + "epoch": 0.48, + "learning_rate": 1.118727416003544e-05, + "loss": 0.4352, + "step": 4185 + }, + { + "epoch": 0.48, + "learning_rate": 1.1183598608281543e-05, + "loss": 0.4824, + "step": 4186 + }, + { + "epoch": 0.48, + "learning_rate": 1.1179922894348207e-05, + "loss": 0.4557, + "step": 4187 + }, + { + "epoch": 0.48, + "learning_rate": 1.11762470187391e-05, + "loss": 0.4896, + "step": 4188 + }, + { + "epoch": 0.48, + "learning_rate": 1.1172570981957886e-05, + "loss": 0.4562, + "step": 4189 + }, + { + "epoch": 0.48, + "learning_rate": 1.1168894784508268e-05, + "loss": 0.4579, + "step": 4190 + }, + { + "epoch": 0.48, + "learning_rate": 1.1165218426893969e-05, + "loss": 0.4793, + "step": 4191 + }, + { + "epoch": 0.48, + "learning_rate": 1.1161541909618728e-05, + "loss": 0.4601, + "step": 4192 + }, + { + "epoch": 0.48, + "learning_rate": 1.1157865233186315e-05, + "loss": 0.4474, + "step": 4193 + }, + { + "epoch": 0.48, + "learning_rate": 1.1154188398100516e-05, + "loss": 0.4726, + "step": 4194 + }, + { + "epoch": 0.48, + 
"learning_rate": 1.1150511404865136e-05, + "loss": 0.4498, + "step": 4195 + }, + { + "epoch": 0.48, + "learning_rate": 1.1146834253984008e-05, + "loss": 0.473, + "step": 4196 + }, + { + "epoch": 0.48, + "learning_rate": 1.114315694596098e-05, + "loss": 0.4606, + "step": 4197 + }, + { + "epoch": 0.48, + "learning_rate": 1.1139479481299928e-05, + "loss": 0.4608, + "step": 4198 + }, + { + "epoch": 0.48, + "learning_rate": 1.113580186050475e-05, + "loss": 0.4625, + "step": 4199 + }, + { + "epoch": 0.48, + "learning_rate": 1.1132124084079359e-05, + "loss": 0.4688, + "step": 4200 + }, + { + "epoch": 0.48, + "learning_rate": 1.112844615252769e-05, + "loss": 0.4852, + "step": 4201 + }, + { + "epoch": 0.48, + "learning_rate": 1.1124768066353705e-05, + "loss": 0.4653, + "step": 4202 + }, + { + "epoch": 0.48, + "learning_rate": 1.1121089826061385e-05, + "loss": 0.4611, + "step": 4203 + }, + { + "epoch": 0.48, + "learning_rate": 1.1117411432154725e-05, + "loss": 0.47, + "step": 4204 + }, + { + "epoch": 0.48, + "learning_rate": 1.1113732885137755e-05, + "loss": 0.4632, + "step": 4205 + }, + { + "epoch": 0.48, + "learning_rate": 1.1110054185514513e-05, + "loss": 0.4561, + "step": 4206 + }, + { + "epoch": 0.48, + "learning_rate": 1.1106375333789065e-05, + "loss": 0.4782, + "step": 4207 + }, + { + "epoch": 0.48, + "learning_rate": 1.1102696330465495e-05, + "loss": 0.4728, + "step": 4208 + }, + { + "epoch": 0.48, + "learning_rate": 1.1099017176047909e-05, + "loss": 0.4594, + "step": 4209 + }, + { + "epoch": 0.48, + "learning_rate": 1.109533787104043e-05, + "loss": 0.4676, + "step": 4210 + }, + { + "epoch": 0.48, + "learning_rate": 1.109165841594721e-05, + "loss": 0.46, + "step": 4211 + }, + { + "epoch": 0.48, + "learning_rate": 1.1087978811272417e-05, + "loss": 0.4804, + "step": 4212 + }, + { + "epoch": 0.48, + "learning_rate": 1.1084299057520234e-05, + "loss": 0.4493, + "step": 4213 + }, + { + "epoch": 0.48, + "learning_rate": 1.1080619155194873e-05, + "loss": 0.4893, + "step": 
4214 + }, + { + "epoch": 0.48, + "learning_rate": 1.107693910480056e-05, + "loss": 0.452, + "step": 4215 + }, + { + "epoch": 0.48, + "learning_rate": 1.1073258906841547e-05, + "loss": 0.4683, + "step": 4216 + }, + { + "epoch": 0.48, + "learning_rate": 1.10695785618221e-05, + "loss": 0.4677, + "step": 4217 + }, + { + "epoch": 0.48, + "learning_rate": 1.1065898070246512e-05, + "loss": 0.4687, + "step": 4218 + }, + { + "epoch": 0.48, + "learning_rate": 1.1062217432619095e-05, + "loss": 0.4808, + "step": 4219 + }, + { + "epoch": 0.48, + "learning_rate": 1.1058536649444167e-05, + "loss": 0.4601, + "step": 4220 + }, + { + "epoch": 0.48, + "learning_rate": 1.105485572122609e-05, + "loss": 0.4564, + "step": 4221 + }, + { + "epoch": 0.48, + "learning_rate": 1.1051174648469225e-05, + "loss": 0.4677, + "step": 4222 + }, + { + "epoch": 0.48, + "learning_rate": 1.104749343167797e-05, + "loss": 0.4712, + "step": 4223 + }, + { + "epoch": 0.48, + "learning_rate": 1.104381207135672e-05, + "loss": 0.475, + "step": 4224 + }, + { + "epoch": 0.48, + "learning_rate": 1.104013056800992e-05, + "loss": 0.4599, + "step": 4225 + }, + { + "epoch": 0.48, + "learning_rate": 1.1036448922142004e-05, + "loss": 0.4706, + "step": 4226 + }, + { + "epoch": 0.48, + "learning_rate": 1.1032767134257451e-05, + "loss": 0.4714, + "step": 4227 + }, + { + "epoch": 0.48, + "learning_rate": 1.102908520486074e-05, + "loss": 0.4596, + "step": 4228 + }, + { + "epoch": 0.48, + "learning_rate": 1.1025403134456378e-05, + "loss": 0.4529, + "step": 4229 + }, + { + "epoch": 0.48, + "learning_rate": 1.1021720923548897e-05, + "loss": 0.4729, + "step": 4230 + }, + { + "epoch": 0.48, + "learning_rate": 1.1018038572642837e-05, + "loss": 0.4506, + "step": 4231 + }, + { + "epoch": 0.48, + "learning_rate": 1.1014356082242766e-05, + "loss": 0.4713, + "step": 4232 + }, + { + "epoch": 0.48, + "learning_rate": 1.1010673452853262e-05, + "loss": 0.4613, + "step": 4233 + }, + { + "epoch": 0.48, + "learning_rate": 
1.1006990684978928e-05, + "loss": 0.4614, + "step": 4234 + }, + { + "epoch": 0.48, + "learning_rate": 1.1003307779124392e-05, + "loss": 0.4434, + "step": 4235 + }, + { + "epoch": 0.48, + "learning_rate": 1.0999624735794292e-05, + "loss": 0.4647, + "step": 4236 + }, + { + "epoch": 0.48, + "learning_rate": 1.0995941555493283e-05, + "loss": 0.4507, + "step": 4237 + }, + { + "epoch": 0.48, + "learning_rate": 1.0992258238726046e-05, + "loss": 0.4711, + "step": 4238 + }, + { + "epoch": 0.48, + "learning_rate": 1.0988574785997275e-05, + "loss": 0.4512, + "step": 4239 + }, + { + "epoch": 0.48, + "learning_rate": 1.0984891197811686e-05, + "loss": 0.4598, + "step": 4240 + }, + { + "epoch": 0.48, + "learning_rate": 1.0981207474674021e-05, + "loss": 0.4618, + "step": 4241 + }, + { + "epoch": 0.48, + "learning_rate": 1.0977523617089019e-05, + "loss": 0.473, + "step": 4242 + }, + { + "epoch": 0.48, + "learning_rate": 1.097383962556146e-05, + "loss": 0.4364, + "step": 4243 + }, + { + "epoch": 0.49, + "learning_rate": 1.0970155500596127e-05, + "loss": 0.4559, + "step": 4244 + }, + { + "epoch": 0.49, + "learning_rate": 1.0966471242697834e-05, + "loss": 0.4674, + "step": 4245 + }, + { + "epoch": 0.49, + "learning_rate": 1.0962786852371402e-05, + "loss": 0.4773, + "step": 4246 + }, + { + "epoch": 0.49, + "learning_rate": 1.0959102330121676e-05, + "loss": 0.4772, + "step": 4247 + }, + { + "epoch": 0.49, + "learning_rate": 1.0955417676453517e-05, + "loss": 0.4668, + "step": 4248 + }, + { + "epoch": 0.49, + "learning_rate": 1.0951732891871807e-05, + "loss": 0.4536, + "step": 4249 + }, + { + "epoch": 0.49, + "learning_rate": 1.0948047976881439e-05, + "loss": 0.4819, + "step": 4250 + }, + { + "epoch": 0.49, + "learning_rate": 1.0944362931987336e-05, + "loss": 0.4545, + "step": 4251 + }, + { + "epoch": 0.49, + "learning_rate": 1.0940677757694425e-05, + "loss": 0.4557, + "step": 4252 + }, + { + "epoch": 0.49, + "learning_rate": 1.093699245450766e-05, + "loss": 0.4531, + "step": 4253 + }, + 
{ + "epoch": 0.49, + "learning_rate": 1.093330702293201e-05, + "loss": 0.4528, + "step": 4254 + }, + { + "epoch": 0.49, + "learning_rate": 1.092962146347246e-05, + "loss": 0.4605, + "step": 4255 + }, + { + "epoch": 0.49, + "learning_rate": 1.0925935776634014e-05, + "loss": 0.4651, + "step": 4256 + }, + { + "epoch": 0.49, + "learning_rate": 1.0922249962921694e-05, + "loss": 0.4555, + "step": 4257 + }, + { + "epoch": 0.49, + "learning_rate": 1.0918564022840539e-05, + "loss": 0.4513, + "step": 4258 + }, + { + "epoch": 0.49, + "learning_rate": 1.0914877956895604e-05, + "loss": 0.4656, + "step": 4259 + }, + { + "epoch": 0.49, + "learning_rate": 1.0911191765591966e-05, + "loss": 0.4609, + "step": 4260 + }, + { + "epoch": 0.49, + "learning_rate": 1.090750544943471e-05, + "loss": 0.4698, + "step": 4261 + }, + { + "epoch": 0.49, + "learning_rate": 1.0903819008928948e-05, + "loss": 0.4613, + "step": 4262 + }, + { + "epoch": 0.49, + "learning_rate": 1.0900132444579801e-05, + "loss": 0.4463, + "step": 4263 + }, + { + "epoch": 0.49, + "learning_rate": 1.0896445756892415e-05, + "loss": 0.4858, + "step": 4264 + }, + { + "epoch": 0.49, + "learning_rate": 1.0892758946371943e-05, + "loss": 0.4598, + "step": 4265 + }, + { + "epoch": 0.49, + "learning_rate": 1.0889072013523568e-05, + "loss": 0.4422, + "step": 4266 + }, + { + "epoch": 0.49, + "learning_rate": 1.0885384958852474e-05, + "loss": 0.4569, + "step": 4267 + }, + { + "epoch": 0.49, + "learning_rate": 1.0881697782863874e-05, + "loss": 0.4665, + "step": 4268 + }, + { + "epoch": 0.49, + "learning_rate": 1.0878010486062993e-05, + "loss": 0.4516, + "step": 4269 + }, + { + "epoch": 0.49, + "learning_rate": 1.0874323068955073e-05, + "loss": 0.4574, + "step": 4270 + }, + { + "epoch": 0.49, + "learning_rate": 1.0870635532045375e-05, + "loss": 0.4604, + "step": 4271 + }, + { + "epoch": 0.49, + "learning_rate": 1.0866947875839167e-05, + "loss": 0.4557, + "step": 4272 + }, + { + "epoch": 0.49, + "learning_rate": 1.0863260100841744e-05, + 
"loss": 0.4617, + "step": 4273 + }, + { + "epoch": 0.49, + "learning_rate": 1.0859572207558416e-05, + "loss": 0.4668, + "step": 4274 + }, + { + "epoch": 0.49, + "learning_rate": 1.0855884196494507e-05, + "loss": 0.451, + "step": 4275 + }, + { + "epoch": 0.49, + "learning_rate": 1.0852196068155352e-05, + "loss": 0.4583, + "step": 4276 + }, + { + "epoch": 0.49, + "learning_rate": 1.0848507823046306e-05, + "loss": 0.4804, + "step": 4277 + }, + { + "epoch": 0.49, + "learning_rate": 1.0844819461672748e-05, + "loss": 0.4617, + "step": 4278 + }, + { + "epoch": 0.49, + "learning_rate": 1.0841130984540063e-05, + "loss": 0.4703, + "step": 4279 + }, + { + "epoch": 0.49, + "learning_rate": 1.0837442392153651e-05, + "loss": 0.4611, + "step": 4280 + }, + { + "epoch": 0.49, + "learning_rate": 1.0833753685018935e-05, + "loss": 0.451, + "step": 4281 + }, + { + "epoch": 0.49, + "learning_rate": 1.0830064863641352e-05, + "loss": 0.4829, + "step": 4282 + }, + { + "epoch": 0.49, + "learning_rate": 1.082637592852635e-05, + "loss": 0.459, + "step": 4283 + }, + { + "epoch": 0.49, + "learning_rate": 1.0822686880179395e-05, + "loss": 0.468, + "step": 4284 + }, + { + "epoch": 0.49, + "learning_rate": 1.081899771910597e-05, + "loss": 0.4545, + "step": 4285 + }, + { + "epoch": 0.49, + "learning_rate": 1.081530844581157e-05, + "loss": 0.4563, + "step": 4286 + }, + { + "epoch": 0.49, + "learning_rate": 1.0811619060801713e-05, + "loss": 0.4765, + "step": 4287 + }, + { + "epoch": 0.49, + "learning_rate": 1.0807929564581925e-05, + "loss": 0.4501, + "step": 4288 + }, + { + "epoch": 0.49, + "learning_rate": 1.080423995765775e-05, + "loss": 0.4503, + "step": 4289 + }, + { + "epoch": 0.49, + "learning_rate": 1.0800550240534742e-05, + "loss": 0.4785, + "step": 4290 + }, + { + "epoch": 0.49, + "learning_rate": 1.0796860413718475e-05, + "loss": 0.4585, + "step": 4291 + }, + { + "epoch": 0.49, + "learning_rate": 1.0793170477714546e-05, + "loss": 0.4625, + "step": 4292 + }, + { + "epoch": 0.49, + 
"learning_rate": 1.0789480433028551e-05, + "loss": 0.4596, + "step": 4293 + }, + { + "epoch": 0.49, + "learning_rate": 1.0785790280166114e-05, + "loss": 0.4536, + "step": 4294 + }, + { + "epoch": 0.49, + "learning_rate": 1.078210001963286e-05, + "loss": 0.4592, + "step": 4295 + }, + { + "epoch": 0.49, + "learning_rate": 1.0778409651934442e-05, + "loss": 0.4829, + "step": 4296 + }, + { + "epoch": 0.49, + "learning_rate": 1.0774719177576526e-05, + "loss": 0.4804, + "step": 4297 + }, + { + "epoch": 0.49, + "learning_rate": 1.0771028597064785e-05, + "loss": 0.4635, + "step": 4298 + }, + { + "epoch": 0.49, + "learning_rate": 1.076733791090491e-05, + "loss": 0.458, + "step": 4299 + }, + { + "epoch": 0.49, + "learning_rate": 1.0763647119602614e-05, + "loss": 0.4521, + "step": 4300 + }, + { + "epoch": 0.49, + "learning_rate": 1.0759956223663608e-05, + "loss": 0.4635, + "step": 4301 + }, + { + "epoch": 0.49, + "learning_rate": 1.0756265223593637e-05, + "loss": 0.4627, + "step": 4302 + }, + { + "epoch": 0.49, + "learning_rate": 1.0752574119898445e-05, + "loss": 0.4643, + "step": 4303 + }, + { + "epoch": 0.49, + "learning_rate": 1.0748882913083794e-05, + "loss": 0.4849, + "step": 4304 + }, + { + "epoch": 0.49, + "learning_rate": 1.0745191603655466e-05, + "loss": 0.4653, + "step": 4305 + }, + { + "epoch": 0.49, + "learning_rate": 1.074150019211925e-05, + "loss": 0.48, + "step": 4306 + }, + { + "epoch": 0.49, + "learning_rate": 1.0737808678980954e-05, + "loss": 0.4599, + "step": 4307 + }, + { + "epoch": 0.49, + "learning_rate": 1.0734117064746395e-05, + "loss": 0.4787, + "step": 4308 + }, + { + "epoch": 0.49, + "learning_rate": 1.073042534992141e-05, + "loss": 0.4467, + "step": 4309 + }, + { + "epoch": 0.49, + "learning_rate": 1.0726733535011844e-05, + "loss": 0.4669, + "step": 4310 + }, + { + "epoch": 0.49, + "learning_rate": 1.0723041620523558e-05, + "loss": 0.4679, + "step": 4311 + }, + { + "epoch": 0.49, + "learning_rate": 1.0719349606962426e-05, + "loss": 0.443, + "step": 
4312 + }, + { + "epoch": 0.49, + "learning_rate": 1.071565749483434e-05, + "loss": 0.4815, + "step": 4313 + }, + { + "epoch": 0.49, + "learning_rate": 1.0711965284645198e-05, + "loss": 0.4714, + "step": 4314 + }, + { + "epoch": 0.49, + "learning_rate": 1.0708272976900915e-05, + "loss": 0.45, + "step": 4315 + }, + { + "epoch": 0.49, + "learning_rate": 1.0704580572107424e-05, + "loss": 0.4824, + "step": 4316 + }, + { + "epoch": 0.49, + "learning_rate": 1.0700888070770663e-05, + "loss": 0.4578, + "step": 4317 + }, + { + "epoch": 0.49, + "learning_rate": 1.0697195473396587e-05, + "loss": 0.4675, + "step": 4318 + }, + { + "epoch": 0.49, + "learning_rate": 1.0693502780491168e-05, + "loss": 0.4679, + "step": 4319 + }, + { + "epoch": 0.49, + "learning_rate": 1.0689809992560382e-05, + "loss": 0.4471, + "step": 4320 + }, + { + "epoch": 0.49, + "learning_rate": 1.0686117110110228e-05, + "loss": 0.4587, + "step": 4321 + }, + { + "epoch": 0.49, + "learning_rate": 1.0682424133646712e-05, + "loss": 0.4886, + "step": 4322 + }, + { + "epoch": 0.49, + "learning_rate": 1.067873106367585e-05, + "loss": 0.4718, + "step": 4323 + }, + { + "epoch": 0.49, + "learning_rate": 1.0675037900703684e-05, + "loss": 0.4528, + "step": 4324 + }, + { + "epoch": 0.49, + "learning_rate": 1.0671344645236253e-05, + "loss": 0.4683, + "step": 4325 + }, + { + "epoch": 0.49, + "learning_rate": 1.0667651297779615e-05, + "loss": 0.473, + "step": 4326 + }, + { + "epoch": 0.49, + "learning_rate": 1.0663957858839843e-05, + "loss": 0.4572, + "step": 4327 + }, + { + "epoch": 0.49, + "learning_rate": 1.0660264328923024e-05, + "loss": 0.4593, + "step": 4328 + }, + { + "epoch": 0.49, + "learning_rate": 1.0656570708535248e-05, + "loss": 0.4559, + "step": 4329 + }, + { + "epoch": 0.49, + "learning_rate": 1.0652876998182626e-05, + "loss": 0.4716, + "step": 4330 + }, + { + "epoch": 0.49, + "learning_rate": 1.064918319837128e-05, + "loss": 0.5014, + "step": 4331 + }, + { + "epoch": 0.5, + "learning_rate": 
1.0645489309607346e-05, + "loss": 0.4458, + "step": 4332 + }, + { + "epoch": 0.5, + "learning_rate": 1.064179533239696e-05, + "loss": 0.4546, + "step": 4333 + }, + { + "epoch": 0.5, + "learning_rate": 1.0638101267246283e-05, + "loss": 0.5054, + "step": 4334 + }, + { + "epoch": 0.5, + "learning_rate": 1.0634407114661492e-05, + "loss": 0.4734, + "step": 4335 + }, + { + "epoch": 0.5, + "learning_rate": 1.0630712875148758e-05, + "loss": 0.4572, + "step": 4336 + }, + { + "epoch": 0.5, + "learning_rate": 1.0627018549214284e-05, + "loss": 0.4573, + "step": 4337 + }, + { + "epoch": 0.5, + "learning_rate": 1.062332413736426e-05, + "loss": 0.4595, + "step": 4338 + }, + { + "epoch": 0.5, + "learning_rate": 1.0619629640104921e-05, + "loss": 0.4774, + "step": 4339 + }, + { + "epoch": 0.5, + "learning_rate": 1.0615935057942485e-05, + "loss": 0.4731, + "step": 4340 + }, + { + "epoch": 0.5, + "learning_rate": 1.0612240391383197e-05, + "loss": 0.4404, + "step": 4341 + }, + { + "epoch": 0.5, + "learning_rate": 1.0608545640933304e-05, + "loss": 0.4807, + "step": 4342 + }, + { + "epoch": 0.5, + "learning_rate": 1.060485080709907e-05, + "loss": 0.4475, + "step": 4343 + }, + { + "epoch": 0.5, + "learning_rate": 1.0601155890386771e-05, + "loss": 0.468, + "step": 4344 + }, + { + "epoch": 0.5, + "learning_rate": 1.05974608913027e-05, + "loss": 0.4684, + "step": 4345 + }, + { + "epoch": 0.5, + "learning_rate": 1.0593765810353142e-05, + "loss": 0.4471, + "step": 4346 + }, + { + "epoch": 0.5, + "learning_rate": 1.0590070648044415e-05, + "loss": 0.465, + "step": 4347 + }, + { + "epoch": 0.5, + "learning_rate": 1.0586375404882832e-05, + "loss": 0.4646, + "step": 4348 + }, + { + "epoch": 0.5, + "learning_rate": 1.0582680081374728e-05, + "loss": 0.4695, + "step": 4349 + }, + { + "epoch": 0.5, + "learning_rate": 1.0578984678026445e-05, + "loss": 0.4684, + "step": 4350 + }, + { + "epoch": 0.5, + "learning_rate": 1.0575289195344334e-05, + "loss": 0.454, + "step": 4351 + }, + { + "epoch": 0.5, + 
"learning_rate": 1.0571593633834758e-05, + "loss": 0.477, + "step": 4352 + }, + { + "epoch": 0.5, + "learning_rate": 1.0567897994004093e-05, + "loss": 0.4632, + "step": 4353 + }, + { + "epoch": 0.5, + "learning_rate": 1.0564202276358726e-05, + "loss": 0.4516, + "step": 4354 + }, + { + "epoch": 0.5, + "learning_rate": 1.0560506481405048e-05, + "loss": 0.4679, + "step": 4355 + }, + { + "epoch": 0.5, + "learning_rate": 1.0556810609649471e-05, + "loss": 0.4686, + "step": 4356 + }, + { + "epoch": 0.5, + "learning_rate": 1.0553114661598406e-05, + "loss": 0.4695, + "step": 4357 + }, + { + "epoch": 0.5, + "learning_rate": 1.0549418637758284e-05, + "loss": 0.4569, + "step": 4358 + }, + { + "epoch": 0.5, + "learning_rate": 1.0545722538635544e-05, + "loss": 0.4721, + "step": 4359 + }, + { + "epoch": 0.5, + "learning_rate": 1.054202636473663e-05, + "loss": 0.4599, + "step": 4360 + }, + { + "epoch": 0.5, + "learning_rate": 1.0538330116568006e-05, + "loss": 0.4584, + "step": 4361 + }, + { + "epoch": 0.5, + "learning_rate": 1.0534633794636134e-05, + "loss": 0.469, + "step": 4362 + }, + { + "epoch": 0.5, + "learning_rate": 1.0530937399447496e-05, + "loss": 0.4706, + "step": 4363 + }, + { + "epoch": 0.5, + "learning_rate": 1.0527240931508582e-05, + "loss": 0.4607, + "step": 4364 + }, + { + "epoch": 0.5, + "learning_rate": 1.0523544391325888e-05, + "loss": 0.4766, + "step": 4365 + }, + { + "epoch": 0.5, + "learning_rate": 1.0519847779405926e-05, + "loss": 0.4552, + "step": 4366 + }, + { + "epoch": 0.5, + "learning_rate": 1.051615109625521e-05, + "loss": 0.4917, + "step": 4367 + }, + { + "epoch": 0.5, + "learning_rate": 1.0512454342380269e-05, + "loss": 0.4614, + "step": 4368 + }, + { + "epoch": 0.5, + "learning_rate": 1.0508757518287642e-05, + "loss": 0.453, + "step": 4369 + }, + { + "epoch": 0.5, + "learning_rate": 1.0505060624483878e-05, + "loss": 0.4705, + "step": 4370 + }, + { + "epoch": 0.5, + "learning_rate": 1.0501363661475533e-05, + "loss": 0.4748, + "step": 4371 + }, + { + 
"epoch": 0.5, + "learning_rate": 1.0497666629769172e-05, + "loss": 0.4521, + "step": 4372 + }, + { + "epoch": 0.5, + "learning_rate": 1.049396952987137e-05, + "loss": 0.463, + "step": 4373 + }, + { + "epoch": 0.5, + "learning_rate": 1.0490272362288716e-05, + "loss": 0.4574, + "step": 4374 + }, + { + "epoch": 0.5, + "learning_rate": 1.0486575127527802e-05, + "loss": 0.5077, + "step": 4375 + }, + { + "epoch": 0.5, + "learning_rate": 1.0482877826095233e-05, + "loss": 0.4699, + "step": 4376 + }, + { + "epoch": 0.5, + "learning_rate": 1.047918045849762e-05, + "loss": 0.4626, + "step": 4377 + }, + { + "epoch": 0.5, + "learning_rate": 1.0475483025241587e-05, + "loss": 0.4677, + "step": 4378 + }, + { + "epoch": 0.5, + "learning_rate": 1.0471785526833762e-05, + "loss": 0.4496, + "step": 4379 + }, + { + "epoch": 0.5, + "learning_rate": 1.046808796378079e-05, + "loss": 0.4818, + "step": 4380 + }, + { + "epoch": 0.5, + "learning_rate": 1.0464390336589311e-05, + "loss": 0.4648, + "step": 4381 + }, + { + "epoch": 0.5, + "learning_rate": 1.046069264576599e-05, + "loss": 0.4639, + "step": 4382 + }, + { + "epoch": 0.5, + "learning_rate": 1.0456994891817492e-05, + "loss": 0.501, + "step": 4383 + }, + { + "epoch": 0.5, + "learning_rate": 1.045329707525049e-05, + "loss": 0.4538, + "step": 4384 + }, + { + "epoch": 0.5, + "learning_rate": 1.0449599196571671e-05, + "loss": 0.4661, + "step": 4385 + }, + { + "epoch": 0.5, + "learning_rate": 1.044590125628772e-05, + "loss": 0.4508, + "step": 4386 + }, + { + "epoch": 0.5, + "learning_rate": 1.0442203254905346e-05, + "loss": 0.4545, + "step": 4387 + }, + { + "epoch": 0.5, + "learning_rate": 1.043850519293125e-05, + "loss": 0.4552, + "step": 4388 + }, + { + "epoch": 0.5, + "learning_rate": 1.0434807070872154e-05, + "loss": 0.4872, + "step": 4389 + }, + { + "epoch": 0.5, + "learning_rate": 1.0431108889234783e-05, + "loss": 0.4596, + "step": 4390 + }, + { + "epoch": 0.5, + "learning_rate": 1.0427410648525863e-05, + "loss": 0.4589, + "step": 4391 
+ }, + { + "epoch": 0.5, + "learning_rate": 1.0423712349252148e-05, + "loss": 0.4557, + "step": 4392 + }, + { + "epoch": 0.5, + "learning_rate": 1.0420013991920382e-05, + "loss": 0.4582, + "step": 4393 + }, + { + "epoch": 0.5, + "learning_rate": 1.041631557703732e-05, + "loss": 0.4466, + "step": 4394 + }, + { + "epoch": 0.5, + "learning_rate": 1.0412617105109725e-05, + "loss": 0.4584, + "step": 4395 + }, + { + "epoch": 0.5, + "learning_rate": 1.0408918576644378e-05, + "loss": 0.4645, + "step": 4396 + }, + { + "epoch": 0.5, + "learning_rate": 1.0405219992148057e-05, + "loss": 0.4663, + "step": 4397 + }, + { + "epoch": 0.5, + "learning_rate": 1.040152135212755e-05, + "loss": 0.4447, + "step": 4398 + }, + { + "epoch": 0.5, + "learning_rate": 1.0397822657089653e-05, + "loss": 0.4731, + "step": 4399 + }, + { + "epoch": 0.5, + "learning_rate": 1.039412390754117e-05, + "loss": 0.4579, + "step": 4400 + }, + { + "epoch": 0.5, + "learning_rate": 1.039042510398891e-05, + "loss": 0.4788, + "step": 4401 + }, + { + "epoch": 0.5, + "learning_rate": 1.03867262469397e-05, + "loss": 0.473, + "step": 4402 + }, + { + "epoch": 0.5, + "learning_rate": 1.0383027336900356e-05, + "loss": 0.4693, + "step": 4403 + }, + { + "epoch": 0.5, + "learning_rate": 1.0379328374377715e-05, + "loss": 0.4546, + "step": 4404 + }, + { + "epoch": 0.5, + "learning_rate": 1.0375629359878616e-05, + "loss": 0.4708, + "step": 4405 + }, + { + "epoch": 0.5, + "learning_rate": 1.0371930293909911e-05, + "loss": 0.4519, + "step": 4406 + }, + { + "epoch": 0.5, + "learning_rate": 1.0368231176978454e-05, + "loss": 0.4842, + "step": 4407 + }, + { + "epoch": 0.5, + "learning_rate": 1.0364532009591101e-05, + "loss": 0.4637, + "step": 4408 + }, + { + "epoch": 0.5, + "learning_rate": 1.0360832792254727e-05, + "loss": 0.4791, + "step": 4409 + }, + { + "epoch": 0.5, + "learning_rate": 1.03571335254762e-05, + "loss": 0.4545, + "step": 4410 + }, + { + "epoch": 0.5, + "learning_rate": 1.0353434209762412e-05, + "loss": 0.4852, + 
"step": 4411 + }, + { + "epoch": 0.5, + "learning_rate": 1.0349734845620244e-05, + "loss": 0.4396, + "step": 4412 + }, + { + "epoch": 0.5, + "learning_rate": 1.0346035433556594e-05, + "loss": 0.4579, + "step": 4413 + }, + { + "epoch": 0.5, + "learning_rate": 1.0342335974078364e-05, + "loss": 0.4664, + "step": 4414 + }, + { + "epoch": 0.5, + "learning_rate": 1.0338636467692462e-05, + "loss": 0.4647, + "step": 4415 + }, + { + "epoch": 0.5, + "learning_rate": 1.0334936914905802e-05, + "loss": 0.4764, + "step": 4416 + }, + { + "epoch": 0.5, + "learning_rate": 1.0331237316225309e-05, + "loss": 0.4702, + "step": 4417 + }, + { + "epoch": 0.5, + "learning_rate": 1.0327537672157908e-05, + "loss": 0.4681, + "step": 4418 + }, + { + "epoch": 0.51, + "learning_rate": 1.0323837983210535e-05, + "loss": 0.4651, + "step": 4419 + }, + { + "epoch": 0.51, + "learning_rate": 1.0320138249890126e-05, + "loss": 0.465, + "step": 4420 + }, + { + "epoch": 0.51, + "learning_rate": 1.031643847270363e-05, + "loss": 0.4518, + "step": 4421 + }, + { + "epoch": 0.51, + "learning_rate": 1.0312738652157996e-05, + "loss": 0.4564, + "step": 4422 + }, + { + "epoch": 0.51, + "learning_rate": 1.0309038788760185e-05, + "loss": 0.481, + "step": 4423 + }, + { + "epoch": 0.51, + "learning_rate": 1.0305338883017163e-05, + "loss": 0.4721, + "step": 4424 + }, + { + "epoch": 0.51, + "learning_rate": 1.0301638935435896e-05, + "loss": 0.469, + "step": 4425 + }, + { + "epoch": 0.51, + "learning_rate": 1.0297938946523361e-05, + "loss": 0.4487, + "step": 4426 + }, + { + "epoch": 0.51, + "learning_rate": 1.0294238916786537e-05, + "loss": 0.4555, + "step": 4427 + }, + { + "epoch": 0.51, + "learning_rate": 1.0290538846732415e-05, + "loss": 0.453, + "step": 4428 + }, + { + "epoch": 0.51, + "learning_rate": 1.0286838736867981e-05, + "loss": 0.4768, + "step": 4429 + }, + { + "epoch": 0.51, + "learning_rate": 1.0283138587700236e-05, + "loss": 0.4549, + "step": 4430 + }, + { + "epoch": 0.51, + "learning_rate": 
1.0279438399736185e-05, + "loss": 0.4679, + "step": 4431 + }, + { + "epoch": 0.51, + "learning_rate": 1.0275738173482835e-05, + "loss": 0.4489, + "step": 4432 + }, + { + "epoch": 0.51, + "learning_rate": 1.0272037909447197e-05, + "loss": 0.4635, + "step": 4433 + }, + { + "epoch": 0.51, + "learning_rate": 1.0268337608136292e-05, + "loss": 0.4474, + "step": 4434 + }, + { + "epoch": 0.51, + "learning_rate": 1.0264637270057146e-05, + "loss": 0.4724, + "step": 4435 + }, + { + "epoch": 0.51, + "learning_rate": 1.0260936895716781e-05, + "loss": 0.458, + "step": 4436 + }, + { + "epoch": 0.51, + "learning_rate": 1.0257236485622241e-05, + "loss": 0.4632, + "step": 4437 + }, + { + "epoch": 0.51, + "learning_rate": 1.0253536040280556e-05, + "loss": 0.4592, + "step": 4438 + }, + { + "epoch": 0.51, + "learning_rate": 1.0249835560198772e-05, + "loss": 0.4643, + "step": 4439 + }, + { + "epoch": 0.51, + "learning_rate": 1.0246135045883943e-05, + "loss": 0.4771, + "step": 4440 + }, + { + "epoch": 0.51, + "learning_rate": 1.0242434497843117e-05, + "loss": 0.4872, + "step": 4441 + }, + { + "epoch": 0.51, + "learning_rate": 1.023873391658335e-05, + "loss": 0.4517, + "step": 4442 + }, + { + "epoch": 0.51, + "learning_rate": 1.0235033302611704e-05, + "loss": 0.4768, + "step": 4443 + }, + { + "epoch": 0.51, + "learning_rate": 1.023133265643525e-05, + "loss": 0.4565, + "step": 4444 + }, + { + "epoch": 0.51, + "learning_rate": 1.0227631978561057e-05, + "loss": 0.4665, + "step": 4445 + }, + { + "epoch": 0.51, + "learning_rate": 1.0223931269496204e-05, + "loss": 0.4572, + "step": 4446 + }, + { + "epoch": 0.51, + "learning_rate": 1.0220230529747766e-05, + "loss": 0.4538, + "step": 4447 + }, + { + "epoch": 0.51, + "learning_rate": 1.0216529759822823e-05, + "loss": 0.4572, + "step": 4448 + }, + { + "epoch": 0.51, + "learning_rate": 1.0212828960228475e-05, + "loss": 0.4631, + "step": 4449 + }, + { + "epoch": 0.51, + "learning_rate": 1.0209128131471809e-05, + "loss": 0.454, + "step": 4450 + }, + { 
+ "epoch": 0.51, + "learning_rate": 1.0205427274059915e-05, + "loss": 0.4702, + "step": 4451 + }, + { + "epoch": 0.51, + "learning_rate": 1.02017263884999e-05, + "loss": 0.4545, + "step": 4452 + }, + { + "epoch": 0.51, + "learning_rate": 1.0198025475298865e-05, + "loss": 0.4729, + "step": 4453 + }, + { + "epoch": 0.51, + "learning_rate": 1.019432453496392e-05, + "loss": 0.4482, + "step": 4454 + }, + { + "epoch": 0.51, + "learning_rate": 1.0190623568002178e-05, + "loss": 0.446, + "step": 4455 + }, + { + "epoch": 0.51, + "learning_rate": 1.0186922574920747e-05, + "loss": 0.4636, + "step": 4456 + }, + { + "epoch": 0.51, + "learning_rate": 1.018322155622675e-05, + "loss": 0.4636, + "step": 4457 + }, + { + "epoch": 0.51, + "learning_rate": 1.017952051242731e-05, + "loss": 0.469, + "step": 4458 + }, + { + "epoch": 0.51, + "learning_rate": 1.0175819444029555e-05, + "loss": 0.4491, + "step": 4459 + }, + { + "epoch": 0.51, + "learning_rate": 1.0172118351540608e-05, + "loss": 0.4544, + "step": 4460 + }, + { + "epoch": 0.51, + "learning_rate": 1.0168417235467604e-05, + "loss": 0.4653, + "step": 4461 + }, + { + "epoch": 0.51, + "learning_rate": 1.0164716096317677e-05, + "loss": 0.4614, + "step": 4462 + }, + { + "epoch": 0.51, + "learning_rate": 1.016101493459797e-05, + "loss": 0.4497, + "step": 4463 + }, + { + "epoch": 0.51, + "learning_rate": 1.0157313750815623e-05, + "loss": 0.4507, + "step": 4464 + }, + { + "epoch": 0.51, + "learning_rate": 1.0153612545477778e-05, + "loss": 0.4477, + "step": 4465 + }, + { + "epoch": 0.51, + "learning_rate": 1.0149911319091583e-05, + "loss": 0.4675, + "step": 4466 + }, + { + "epoch": 0.51, + "learning_rate": 1.014621007216419e-05, + "loss": 0.4669, + "step": 4467 + }, + { + "epoch": 0.51, + "learning_rate": 1.0142508805202757e-05, + "loss": 0.4668, + "step": 4468 + }, + { + "epoch": 0.51, + "learning_rate": 1.0138807518714435e-05, + "loss": 0.4754, + "step": 4469 + }, + { + "epoch": 0.51, + "learning_rate": 1.0135106213206382e-05, + "loss": 
0.4581, + "step": 4470 + }, + { + "epoch": 0.51, + "learning_rate": 1.0131404889185762e-05, + "loss": 0.4621, + "step": 4471 + }, + { + "epoch": 0.51, + "learning_rate": 1.012770354715974e-05, + "loss": 0.459, + "step": 4472 + }, + { + "epoch": 0.51, + "learning_rate": 1.012400218763548e-05, + "loss": 0.4633, + "step": 4473 + }, + { + "epoch": 0.51, + "learning_rate": 1.012030081112015e-05, + "loss": 0.4751, + "step": 4474 + }, + { + "epoch": 0.51, + "learning_rate": 1.0116599418120924e-05, + "loss": 0.4624, + "step": 4475 + }, + { + "epoch": 0.51, + "learning_rate": 1.0112898009144977e-05, + "loss": 0.4584, + "step": 4476 + }, + { + "epoch": 0.51, + "learning_rate": 1.0109196584699478e-05, + "loss": 0.4615, + "step": 4477 + }, + { + "epoch": 0.51, + "learning_rate": 1.0105495145291612e-05, + "loss": 0.4712, + "step": 4478 + }, + { + "epoch": 0.51, + "learning_rate": 1.0101793691428554e-05, + "loss": 0.456, + "step": 4479 + }, + { + "epoch": 0.51, + "learning_rate": 1.0098092223617488e-05, + "loss": 0.4551, + "step": 4480 + }, + { + "epoch": 0.51, + "learning_rate": 1.0094390742365598e-05, + "loss": 0.4553, + "step": 4481 + }, + { + "epoch": 0.51, + "learning_rate": 1.009068924818007e-05, + "loss": 0.47, + "step": 4482 + }, + { + "epoch": 0.51, + "learning_rate": 1.0086987741568089e-05, + "loss": 0.4609, + "step": 4483 + }, + { + "epoch": 0.51, + "learning_rate": 1.0083286223036845e-05, + "loss": 0.4694, + "step": 4484 + }, + { + "epoch": 0.51, + "learning_rate": 1.0079584693093529e-05, + "loss": 0.4731, + "step": 4485 + }, + { + "epoch": 0.51, + "learning_rate": 1.0075883152245334e-05, + "loss": 0.4774, + "step": 4486 + }, + { + "epoch": 0.51, + "learning_rate": 1.007218160099945e-05, + "loss": 0.4797, + "step": 4487 + }, + { + "epoch": 0.51, + "learning_rate": 1.006848003986308e-05, + "loss": 0.4382, + "step": 4488 + }, + { + "epoch": 0.51, + "learning_rate": 1.0064778469343413e-05, + "loss": 0.4554, + "step": 4489 + }, + { + "epoch": 0.51, + "learning_rate": 
1.006107688994765e-05, + "loss": 0.4602, + "step": 4490 + }, + { + "epoch": 0.51, + "learning_rate": 1.0057375302182988e-05, + "loss": 0.4634, + "step": 4491 + }, + { + "epoch": 0.51, + "learning_rate": 1.005367370655663e-05, + "loss": 0.4633, + "step": 4492 + }, + { + "epoch": 0.51, + "learning_rate": 1.0049972103575775e-05, + "loss": 0.4775, + "step": 4493 + }, + { + "epoch": 0.51, + "learning_rate": 1.004627049374763e-05, + "loss": 0.462, + "step": 4494 + }, + { + "epoch": 0.51, + "learning_rate": 1.0042568877579388e-05, + "loss": 0.4605, + "step": 4495 + }, + { + "epoch": 0.51, + "learning_rate": 1.0038867255578261e-05, + "loss": 0.45, + "step": 4496 + }, + { + "epoch": 0.51, + "learning_rate": 1.0035165628251455e-05, + "loss": 0.4773, + "step": 4497 + }, + { + "epoch": 0.51, + "learning_rate": 1.0031463996106175e-05, + "loss": 0.462, + "step": 4498 + }, + { + "epoch": 0.51, + "learning_rate": 1.002776235964962e-05, + "loss": 0.4733, + "step": 4499 + }, + { + "epoch": 0.51, + "learning_rate": 1.0024060719389002e-05, + "loss": 0.4687, + "step": 4500 + }, + { + "epoch": 0.51, + "learning_rate": 1.002035907583153e-05, + "loss": 0.4611, + "step": 4501 + }, + { + "epoch": 0.51, + "learning_rate": 1.001665742948441e-05, + "loss": 0.4397, + "step": 4502 + }, + { + "epoch": 0.51, + "learning_rate": 1.0012955780854852e-05, + "loss": 0.4622, + "step": 4503 + }, + { + "epoch": 0.51, + "learning_rate": 1.0009254130450059e-05, + "loss": 0.4544, + "step": 4504 + }, + { + "epoch": 0.51, + "learning_rate": 1.0005552478777244e-05, + "loss": 0.4854, + "step": 4505 + }, + { + "epoch": 0.51, + "learning_rate": 1.0001850826343615e-05, + "loss": 0.458, + "step": 4506 + }, + { + "epoch": 0.52, + "learning_rate": 9.998149173656387e-06, + "loss": 0.4704, + "step": 4507 + }, + { + "epoch": 0.52, + "learning_rate": 9.994447521222758e-06, + "loss": 0.4649, + "step": 4508 + }, + { + "epoch": 0.52, + "learning_rate": 9.990745869549943e-06, + "loss": 0.4603, + "step": 4509 + }, + { + 
"epoch": 0.52, + "learning_rate": 9.987044219145155e-06, + "loss": 0.4715, + "step": 4510 + }, + { + "epoch": 0.52, + "learning_rate": 9.983342570515592e-06, + "loss": 0.4502, + "step": 4511 + }, + { + "epoch": 0.52, + "learning_rate": 9.979640924168475e-06, + "loss": 0.4481, + "step": 4512 + }, + { + "epoch": 0.52, + "learning_rate": 9.975939280611e-06, + "loss": 0.4688, + "step": 4513 + }, + { + "epoch": 0.52, + "learning_rate": 9.972237640350383e-06, + "loss": 0.4572, + "step": 4514 + }, + { + "epoch": 0.52, + "learning_rate": 9.968536003893832e-06, + "loss": 0.4765, + "step": 4515 + }, + { + "epoch": 0.52, + "learning_rate": 9.964834371748547e-06, + "loss": 0.4421, + "step": 4516 + }, + { + "epoch": 0.52, + "learning_rate": 9.96113274442174e-06, + "loss": 0.4692, + "step": 4517 + }, + { + "epoch": 0.52, + "learning_rate": 9.957431122420615e-06, + "loss": 0.4652, + "step": 4518 + }, + { + "epoch": 0.52, + "learning_rate": 9.953729506252374e-06, + "loss": 0.4713, + "step": 4519 + }, + { + "epoch": 0.52, + "learning_rate": 9.950027896424228e-06, + "loss": 0.4544, + "step": 4520 + }, + { + "epoch": 0.52, + "learning_rate": 9.946326293443371e-06, + "loss": 0.4721, + "step": 4521 + }, + { + "epoch": 0.52, + "learning_rate": 9.942624697817015e-06, + "loss": 0.4527, + "step": 4522 + }, + { + "epoch": 0.52, + "learning_rate": 9.938923110052353e-06, + "loss": 0.4667, + "step": 4523 + }, + { + "epoch": 0.52, + "learning_rate": 9.935221530656589e-06, + "loss": 0.4684, + "step": 4524 + }, + { + "epoch": 0.52, + "learning_rate": 9.931519960136925e-06, + "loss": 0.469, + "step": 4525 + }, + { + "epoch": 0.52, + "learning_rate": 9.92781839900055e-06, + "loss": 0.4682, + "step": 4526 + }, + { + "epoch": 0.52, + "learning_rate": 9.92411684775467e-06, + "loss": 0.4617, + "step": 4527 + }, + { + "epoch": 0.52, + "learning_rate": 9.920415306906475e-06, + "loss": 0.4692, + "step": 4528 + }, + { + "epoch": 0.52, + "learning_rate": 9.916713776963156e-06, + "loss": 0.4503, + "step": 
4529 + }, + { + "epoch": 0.52, + "learning_rate": 9.913012258431915e-06, + "loss": 0.453, + "step": 4530 + }, + { + "epoch": 0.52, + "learning_rate": 9.909310751819936e-06, + "loss": 0.4832, + "step": 4531 + }, + { + "epoch": 0.52, + "learning_rate": 9.905609257634404e-06, + "loss": 0.4704, + "step": 4532 + }, + { + "epoch": 0.52, + "learning_rate": 9.901907776382514e-06, + "loss": 0.4666, + "step": 4533 + }, + { + "epoch": 0.52, + "learning_rate": 9.898206308571446e-06, + "loss": 0.4546, + "step": 4534 + }, + { + "epoch": 0.52, + "learning_rate": 9.894504854708391e-06, + "loss": 0.4571, + "step": 4535 + }, + { + "epoch": 0.52, + "learning_rate": 9.890803415300527e-06, + "loss": 0.475, + "step": 4536 + }, + { + "epoch": 0.52, + "learning_rate": 9.887101990855027e-06, + "loss": 0.4718, + "step": 4537 + }, + { + "epoch": 0.52, + "learning_rate": 9.883400581879077e-06, + "loss": 0.4536, + "step": 4538 + }, + { + "epoch": 0.52, + "learning_rate": 9.87969918887985e-06, + "loss": 0.4675, + "step": 4539 + }, + { + "epoch": 0.52, + "learning_rate": 9.875997812364524e-06, + "loss": 0.451, + "step": 4540 + }, + { + "epoch": 0.52, + "learning_rate": 9.872296452840266e-06, + "loss": 0.4575, + "step": 4541 + }, + { + "epoch": 0.52, + "learning_rate": 9.86859511081424e-06, + "loss": 0.4571, + "step": 4542 + }, + { + "epoch": 0.52, + "learning_rate": 9.86489378679362e-06, + "loss": 0.4678, + "step": 4543 + }, + { + "epoch": 0.52, + "learning_rate": 9.86119248128557e-06, + "loss": 0.4673, + "step": 4544 + }, + { + "epoch": 0.52, + "learning_rate": 9.857491194797244e-06, + "loss": 0.4733, + "step": 4545 + }, + { + "epoch": 0.52, + "learning_rate": 9.853789927835811e-06, + "loss": 0.4682, + "step": 4546 + }, + { + "epoch": 0.52, + "learning_rate": 9.85008868090842e-06, + "loss": 0.4519, + "step": 4547 + }, + { + "epoch": 0.52, + "learning_rate": 9.846387454522225e-06, + "loss": 0.4542, + "step": 4548 + }, + { + "epoch": 0.52, + "learning_rate": 9.842686249184384e-06, + "loss": 
0.4461, + "step": 4549 + }, + { + "epoch": 0.52, + "learning_rate": 9.838985065402032e-06, + "loss": 0.4728, + "step": 4550 + }, + { + "epoch": 0.52, + "learning_rate": 9.835283903682327e-06, + "loss": 0.4747, + "step": 4551 + }, + { + "epoch": 0.52, + "learning_rate": 9.831582764532399e-06, + "loss": 0.4419, + "step": 4552 + }, + { + "epoch": 0.52, + "learning_rate": 9.827881648459396e-06, + "loss": 0.4559, + "step": 4553 + }, + { + "epoch": 0.52, + "learning_rate": 9.824180555970451e-06, + "loss": 0.4902, + "step": 4554 + }, + { + "epoch": 0.52, + "learning_rate": 9.820479487572691e-06, + "loss": 0.443, + "step": 4555 + }, + { + "epoch": 0.52, + "learning_rate": 9.816778443773253e-06, + "loss": 0.4473, + "step": 4556 + }, + { + "epoch": 0.52, + "learning_rate": 9.813077425079258e-06, + "loss": 0.4676, + "step": 4557 + }, + { + "epoch": 0.52, + "learning_rate": 9.809376431997825e-06, + "loss": 0.446, + "step": 4558 + }, + { + "epoch": 0.52, + "learning_rate": 9.805675465036084e-06, + "loss": 0.4786, + "step": 4559 + }, + { + "epoch": 0.52, + "learning_rate": 9.801974524701135e-06, + "loss": 0.4651, + "step": 4560 + }, + { + "epoch": 0.52, + "learning_rate": 9.798273611500103e-06, + "loss": 0.4478, + "step": 4561 + }, + { + "epoch": 0.52, + "learning_rate": 9.794572725940088e-06, + "loss": 0.4633, + "step": 4562 + }, + { + "epoch": 0.52, + "learning_rate": 9.790871868528194e-06, + "loss": 0.4631, + "step": 4563 + }, + { + "epoch": 0.52, + "learning_rate": 9.787171039771528e-06, + "loss": 0.458, + "step": 4564 + }, + { + "epoch": 0.52, + "learning_rate": 9.783470240177175e-06, + "loss": 0.4644, + "step": 4565 + }, + { + "epoch": 0.52, + "learning_rate": 9.779769470252237e-06, + "loss": 0.4525, + "step": 4566 + }, + { + "epoch": 0.52, + "learning_rate": 9.776068730503801e-06, + "loss": 0.48, + "step": 4567 + }, + { + "epoch": 0.52, + "learning_rate": 9.772368021438943e-06, + "loss": 0.4676, + "step": 4568 + }, + { + "epoch": 0.52, + "learning_rate": 
9.768667343564752e-06, + "loss": 0.4516, + "step": 4569 + }, + { + "epoch": 0.52, + "learning_rate": 9.7649666973883e-06, + "loss": 0.4608, + "step": 4570 + }, + { + "epoch": 0.52, + "learning_rate": 9.761266083416655e-06, + "loss": 0.4624, + "step": 4571 + }, + { + "epoch": 0.52, + "learning_rate": 9.75756550215689e-06, + "loss": 0.4609, + "step": 4572 + }, + { + "epoch": 0.52, + "learning_rate": 9.753864954116058e-06, + "loss": 0.4632, + "step": 4573 + }, + { + "epoch": 0.52, + "learning_rate": 9.75016443980123e-06, + "loss": 0.4696, + "step": 4574 + }, + { + "epoch": 0.52, + "learning_rate": 9.746463959719447e-06, + "loss": 0.4639, + "step": 4575 + }, + { + "epoch": 0.52, + "learning_rate": 9.74276351437776e-06, + "loss": 0.4705, + "step": 4576 + }, + { + "epoch": 0.52, + "learning_rate": 9.73906310428322e-06, + "loss": 0.4716, + "step": 4577 + }, + { + "epoch": 0.52, + "learning_rate": 9.735362729942856e-06, + "loss": 0.4548, + "step": 4578 + }, + { + "epoch": 0.52, + "learning_rate": 9.731662391863711e-06, + "loss": 0.453, + "step": 4579 + }, + { + "epoch": 0.52, + "learning_rate": 9.727962090552808e-06, + "loss": 0.4619, + "step": 4580 + }, + { + "epoch": 0.52, + "learning_rate": 9.724261826517167e-06, + "loss": 0.4502, + "step": 4581 + }, + { + "epoch": 0.52, + "learning_rate": 9.720561600263818e-06, + "loss": 0.4543, + "step": 4582 + }, + { + "epoch": 0.52, + "learning_rate": 9.716861412299769e-06, + "loss": 0.4512, + "step": 4583 + }, + { + "epoch": 0.52, + "learning_rate": 9.713161263132022e-06, + "loss": 0.4865, + "step": 4584 + }, + { + "epoch": 0.52, + "learning_rate": 9.70946115326759e-06, + "loss": 0.4704, + "step": 4585 + }, + { + "epoch": 0.52, + "learning_rate": 9.705761083213463e-06, + "loss": 0.4619, + "step": 4586 + }, + { + "epoch": 0.52, + "learning_rate": 9.702061053476642e-06, + "loss": 0.4452, + "step": 4587 + }, + { + "epoch": 0.52, + "learning_rate": 9.698361064564107e-06, + "loss": 0.4892, + "step": 4588 + }, + { + "epoch": 0.52, + 
"learning_rate": 9.694661116982838e-06, + "loss": 0.437, + "step": 4589 + }, + { + "epoch": 0.52, + "learning_rate": 9.690961211239816e-06, + "loss": 0.4578, + "step": 4590 + }, + { + "epoch": 0.52, + "learning_rate": 9.687261347842004e-06, + "loss": 0.4736, + "step": 4591 + }, + { + "epoch": 0.52, + "learning_rate": 9.683561527296375e-06, + "loss": 0.4749, + "step": 4592 + }, + { + "epoch": 0.52, + "learning_rate": 9.67986175010988e-06, + "loss": 0.4598, + "step": 4593 + }, + { + "epoch": 0.53, + "learning_rate": 9.676162016789469e-06, + "loss": 0.4787, + "step": 4594 + }, + { + "epoch": 0.53, + "learning_rate": 9.672462327842095e-06, + "loss": 0.4639, + "step": 4595 + }, + { + "epoch": 0.53, + "learning_rate": 9.668762683774691e-06, + "loss": 0.4549, + "step": 4596 + }, + { + "epoch": 0.53, + "learning_rate": 9.6650630850942e-06, + "loss": 0.4702, + "step": 4597 + }, + { + "epoch": 0.53, + "learning_rate": 9.661363532307543e-06, + "loss": 0.4616, + "step": 4598 + }, + { + "epoch": 0.53, + "learning_rate": 9.65766402592164e-06, + "loss": 0.4599, + "step": 4599 + }, + { + "epoch": 0.53, + "learning_rate": 9.65396456644341e-06, + "loss": 0.4743, + "step": 4600 + }, + { + "epoch": 0.53, + "learning_rate": 9.650265154379761e-06, + "loss": 0.4605, + "step": 4601 + }, + { + "epoch": 0.53, + "learning_rate": 9.64656579023759e-06, + "loss": 0.4574, + "step": 4602 + }, + { + "epoch": 0.53, + "learning_rate": 9.642866474523802e-06, + "loss": 0.458, + "step": 4603 + }, + { + "epoch": 0.53, + "learning_rate": 9.639167207745276e-06, + "loss": 0.448, + "step": 4604 + }, + { + "epoch": 0.53, + "learning_rate": 9.6354679904089e-06, + "loss": 0.4661, + "step": 4605 + }, + { + "epoch": 0.53, + "learning_rate": 9.631768823021551e-06, + "loss": 0.4667, + "step": 4606 + }, + { + "epoch": 0.53, + "learning_rate": 9.628069706090089e-06, + "loss": 0.4673, + "step": 4607 + }, + { + "epoch": 0.53, + "learning_rate": 9.624370640121387e-06, + "loss": 0.4743, + "step": 4608 + }, + { + 
"epoch": 0.53, + "learning_rate": 9.620671625622287e-06, + "loss": 0.4708, + "step": 4609 + }, + { + "epoch": 0.53, + "learning_rate": 9.616972663099648e-06, + "loss": 0.4519, + "step": 4610 + }, + { + "epoch": 0.53, + "learning_rate": 9.613273753060306e-06, + "loss": 0.4693, + "step": 4611 + }, + { + "epoch": 0.53, + "learning_rate": 9.60957489601109e-06, + "loss": 0.4497, + "step": 4612 + }, + { + "epoch": 0.53, + "learning_rate": 9.605876092458835e-06, + "loss": 0.4547, + "step": 4613 + }, + { + "epoch": 0.53, + "learning_rate": 9.60217734291035e-06, + "loss": 0.4704, + "step": 4614 + }, + { + "epoch": 0.53, + "learning_rate": 9.598478647872451e-06, + "loss": 0.4674, + "step": 4615 + }, + { + "epoch": 0.53, + "learning_rate": 9.594780007851947e-06, + "loss": 0.4687, + "step": 4616 + }, + { + "epoch": 0.53, + "learning_rate": 9.591081423355622e-06, + "loss": 0.4746, + "step": 4617 + }, + { + "epoch": 0.53, + "learning_rate": 9.587382894890276e-06, + "loss": 0.4641, + "step": 4618 + }, + { + "epoch": 0.53, + "learning_rate": 9.583684422962686e-06, + "loss": 0.4568, + "step": 4619 + }, + { + "epoch": 0.53, + "learning_rate": 9.57998600807962e-06, + "loss": 0.4585, + "step": 4620 + }, + { + "epoch": 0.53, + "learning_rate": 9.576287650747854e-06, + "loss": 0.455, + "step": 4621 + }, + { + "epoch": 0.53, + "learning_rate": 9.572589351474135e-06, + "loss": 0.4596, + "step": 4622 + }, + { + "epoch": 0.53, + "learning_rate": 9.568891110765219e-06, + "loss": 0.4498, + "step": 4623 + }, + { + "epoch": 0.53, + "learning_rate": 9.565192929127849e-06, + "loss": 0.4718, + "step": 4624 + }, + { + "epoch": 0.53, + "learning_rate": 9.56149480706875e-06, + "loss": 0.459, + "step": 4625 + }, + { + "epoch": 0.53, + "learning_rate": 9.557796745094659e-06, + "loss": 0.4699, + "step": 4626 + }, + { + "epoch": 0.53, + "learning_rate": 9.554098743712282e-06, + "loss": 0.4617, + "step": 4627 + }, + { + "epoch": 0.53, + "learning_rate": 9.55040080342833e-06, + "loss": 0.4584, + "step": 
4628 + }, + { + "epoch": 0.53, + "learning_rate": 9.546702924749513e-06, + "loss": 0.4683, + "step": 4629 + }, + { + "epoch": 0.53, + "learning_rate": 9.543005108182508e-06, + "loss": 0.4565, + "step": 4630 + }, + { + "epoch": 0.53, + "learning_rate": 9.539307354234013e-06, + "loss": 0.4596, + "step": 4631 + }, + { + "epoch": 0.53, + "learning_rate": 9.535609663410692e-06, + "loss": 0.4669, + "step": 4632 + }, + { + "epoch": 0.53, + "learning_rate": 9.531912036219214e-06, + "loss": 0.4542, + "step": 4633 + }, + { + "epoch": 0.53, + "learning_rate": 9.528214473166241e-06, + "loss": 0.4628, + "step": 4634 + }, + { + "epoch": 0.53, + "learning_rate": 9.524516974758415e-06, + "loss": 0.4588, + "step": 4635 + }, + { + "epoch": 0.53, + "learning_rate": 9.520819541502384e-06, + "loss": 0.4724, + "step": 4636 + }, + { + "epoch": 0.53, + "learning_rate": 9.51712217390477e-06, + "loss": 0.4556, + "step": 4637 + }, + { + "epoch": 0.53, + "learning_rate": 9.5134248724722e-06, + "loss": 0.4625, + "step": 4638 + }, + { + "epoch": 0.53, + "learning_rate": 9.509727637711287e-06, + "loss": 0.4548, + "step": 4639 + }, + { + "epoch": 0.53, + "learning_rate": 9.506030470128635e-06, + "loss": 0.4666, + "step": 4640 + }, + { + "epoch": 0.53, + "learning_rate": 9.502333370230831e-06, + "loss": 0.4542, + "step": 4641 + }, + { + "epoch": 0.53, + "learning_rate": 9.49863633852447e-06, + "loss": 0.4654, + "step": 4642 + }, + { + "epoch": 0.53, + "learning_rate": 9.494939375516122e-06, + "loss": 0.4402, + "step": 4643 + }, + { + "epoch": 0.53, + "learning_rate": 9.49124248171236e-06, + "loss": 0.459, + "step": 4644 + }, + { + "epoch": 0.53, + "learning_rate": 9.487545657619736e-06, + "loss": 0.4613, + "step": 4645 + }, + { + "epoch": 0.53, + "learning_rate": 9.483848903744795e-06, + "loss": 0.4691, + "step": 4646 + }, + { + "epoch": 0.53, + "learning_rate": 9.48015222059408e-06, + "loss": 0.451, + "step": 4647 + }, + { + "epoch": 0.53, + "learning_rate": 9.476455608674112e-06, + "loss": 
0.4657, + "step": 4648 + }, + { + "epoch": 0.53, + "learning_rate": 9.472759068491421e-06, + "loss": 0.4631, + "step": 4649 + }, + { + "epoch": 0.53, + "learning_rate": 9.469062600552509e-06, + "loss": 0.4553, + "step": 4650 + }, + { + "epoch": 0.53, + "learning_rate": 9.46536620536387e-06, + "loss": 0.4589, + "step": 4651 + }, + { + "epoch": 0.53, + "learning_rate": 9.461669883431997e-06, + "loss": 0.4644, + "step": 4652 + }, + { + "epoch": 0.53, + "learning_rate": 9.457973635263375e-06, + "loss": 0.4886, + "step": 4653 + }, + { + "epoch": 0.53, + "learning_rate": 9.45427746136446e-06, + "loss": 0.4691, + "step": 4654 + }, + { + "epoch": 0.53, + "learning_rate": 9.45058136224172e-06, + "loss": 0.4444, + "step": 4655 + }, + { + "epoch": 0.53, + "learning_rate": 9.446885338401597e-06, + "loss": 0.4715, + "step": 4656 + }, + { + "epoch": 0.53, + "learning_rate": 9.443189390350534e-06, + "loss": 0.4667, + "step": 4657 + }, + { + "epoch": 0.53, + "learning_rate": 9.439493518594957e-06, + "loss": 0.4679, + "step": 4658 + }, + { + "epoch": 0.53, + "learning_rate": 9.435797723641277e-06, + "loss": 0.4479, + "step": 4659 + }, + { + "epoch": 0.53, + "learning_rate": 9.432102005995912e-06, + "loss": 0.4777, + "step": 4660 + }, + { + "epoch": 0.53, + "learning_rate": 9.428406366165244e-06, + "loss": 0.4656, + "step": 4661 + }, + { + "epoch": 0.53, + "learning_rate": 9.424710804655669e-06, + "loss": 0.4465, + "step": 4662 + }, + { + "epoch": 0.53, + "learning_rate": 9.42101532197356e-06, + "loss": 0.4584, + "step": 4663 + }, + { + "epoch": 0.53, + "learning_rate": 9.417319918625274e-06, + "loss": 0.4644, + "step": 4664 + }, + { + "epoch": 0.53, + "learning_rate": 9.413624595117173e-06, + "loss": 0.4449, + "step": 4665 + }, + { + "epoch": 0.53, + "learning_rate": 9.409929351955592e-06, + "loss": 0.4884, + "step": 4666 + }, + { + "epoch": 0.53, + "learning_rate": 9.40623418964686e-06, + "loss": 0.4483, + "step": 4667 + }, + { + "epoch": 0.53, + "learning_rate": 
9.402539108697306e-06, + "loss": 0.4915, + "step": 4668 + }, + { + "epoch": 0.53, + "learning_rate": 9.398844109613228e-06, + "loss": 0.4573, + "step": 4669 + }, + { + "epoch": 0.53, + "learning_rate": 9.395149192900934e-06, + "loss": 0.4712, + "step": 4670 + }, + { + "epoch": 0.53, + "learning_rate": 9.391454359066701e-06, + "loss": 0.4664, + "step": 4671 + }, + { + "epoch": 0.53, + "learning_rate": 9.387759608616806e-06, + "loss": 0.4573, + "step": 4672 + }, + { + "epoch": 0.53, + "learning_rate": 9.384064942057518e-06, + "loss": 0.4352, + "step": 4673 + }, + { + "epoch": 0.53, + "learning_rate": 9.380370359895079e-06, + "loss": 0.4715, + "step": 4674 + }, + { + "epoch": 0.53, + "learning_rate": 9.37667586263574e-06, + "loss": 0.4777, + "step": 4675 + }, + { + "epoch": 0.53, + "learning_rate": 9.372981450785723e-06, + "loss": 0.4662, + "step": 4676 + }, + { + "epoch": 0.53, + "learning_rate": 9.369287124851243e-06, + "loss": 0.4557, + "step": 4677 + }, + { + "epoch": 0.53, + "learning_rate": 9.365592885338512e-06, + "loss": 0.457, + "step": 4678 + }, + { + "epoch": 0.53, + "learning_rate": 9.361898732753715e-06, + "loss": 0.4632, + "step": 4679 + }, + { + "epoch": 0.53, + "learning_rate": 9.358204667603043e-06, + "loss": 0.4692, + "step": 4680 + }, + { + "epoch": 0.53, + "learning_rate": 9.35451069039266e-06, + "loss": 0.4565, + "step": 4681 + }, + { + "epoch": 0.54, + "learning_rate": 9.35081680162872e-06, + "loss": 0.4649, + "step": 4682 + }, + { + "epoch": 0.54, + "learning_rate": 9.347123001817376e-06, + "loss": 0.4416, + "step": 4683 + }, + { + "epoch": 0.54, + "learning_rate": 9.343429291464756e-06, + "loss": 0.485, + "step": 4684 + }, + { + "epoch": 0.54, + "learning_rate": 9.339735671076978e-06, + "loss": 0.4466, + "step": 4685 + }, + { + "epoch": 0.54, + "learning_rate": 9.336042141160158e-06, + "loss": 0.4855, + "step": 4686 + }, + { + "epoch": 0.54, + "learning_rate": 9.332348702220386e-06, + "loss": 0.4635, + "step": 4687 + }, + { + "epoch": 0.54, + 
"learning_rate": 9.32865535476375e-06, + "loss": 0.4589, + "step": 4688 + }, + { + "epoch": 0.54, + "learning_rate": 9.32496209929632e-06, + "loss": 0.4634, + "step": 4689 + }, + { + "epoch": 0.54, + "learning_rate": 9.32126893632415e-06, + "loss": 0.4645, + "step": 4690 + }, + { + "epoch": 0.54, + "learning_rate": 9.317575866353293e-06, + "loss": 0.442, + "step": 4691 + }, + { + "epoch": 0.54, + "learning_rate": 9.313882889889773e-06, + "loss": 0.4607, + "step": 4692 + }, + { + "epoch": 0.54, + "learning_rate": 9.31019000743962e-06, + "loss": 0.4629, + "step": 4693 + }, + { + "epoch": 0.54, + "learning_rate": 9.306497219508835e-06, + "loss": 0.4754, + "step": 4694 + }, + { + "epoch": 0.54, + "learning_rate": 9.302804526603413e-06, + "loss": 0.4514, + "step": 4695 + }, + { + "epoch": 0.54, + "learning_rate": 9.29911192922934e-06, + "loss": 0.4604, + "step": 4696 + }, + { + "epoch": 0.54, + "learning_rate": 9.29541942789258e-06, + "loss": 0.4665, + "step": 4697 + }, + { + "epoch": 0.54, + "learning_rate": 9.291727023099087e-06, + "loss": 0.4833, + "step": 4698 + }, + { + "epoch": 0.54, + "learning_rate": 9.288034715354806e-06, + "loss": 0.4474, + "step": 4699 + }, + { + "epoch": 0.54, + "learning_rate": 9.28434250516566e-06, + "loss": 0.4553, + "step": 4700 + }, + { + "epoch": 0.54, + "learning_rate": 9.280650393037578e-06, + "loss": 0.4573, + "step": 4701 + }, + { + "epoch": 0.54, + "learning_rate": 9.276958379476449e-06, + "loss": 0.4991, + "step": 4702 + }, + { + "epoch": 0.54, + "learning_rate": 9.27326646498816e-06, + "loss": 0.4458, + "step": 4703 + }, + { + "epoch": 0.54, + "learning_rate": 9.269574650078594e-06, + "loss": 0.471, + "step": 4704 + }, + { + "epoch": 0.54, + "learning_rate": 9.265882935253605e-06, + "loss": 0.4525, + "step": 4705 + }, + { + "epoch": 0.54, + "learning_rate": 9.262191321019049e-06, + "loss": 0.473, + "step": 4706 + }, + { + "epoch": 0.54, + "learning_rate": 9.258499807880755e-06, + "loss": 0.4637, + "step": 4707 + }, + { + 
"epoch": 0.54, + "learning_rate": 9.254808396344536e-06, + "loss": 0.4749, + "step": 4708 + }, + { + "epoch": 0.54, + "learning_rate": 9.251117086916209e-06, + "loss": 0.4477, + "step": 4709 + }, + { + "epoch": 0.54, + "learning_rate": 9.247425880101561e-06, + "loss": 0.4813, + "step": 4710 + }, + { + "epoch": 0.54, + "learning_rate": 9.243734776406365e-06, + "loss": 0.4593, + "step": 4711 + }, + { + "epoch": 0.54, + "learning_rate": 9.240043776336397e-06, + "loss": 0.4823, + "step": 4712 + }, + { + "epoch": 0.54, + "learning_rate": 9.23635288039739e-06, + "loss": 0.4511, + "step": 4713 + }, + { + "epoch": 0.54, + "learning_rate": 9.232662089095091e-06, + "loss": 0.4458, + "step": 4714 + }, + { + "epoch": 0.54, + "learning_rate": 9.22897140293522e-06, + "loss": 0.4836, + "step": 4715 + }, + { + "epoch": 0.54, + "learning_rate": 9.225280822423477e-06, + "loss": 0.4481, + "step": 4716 + }, + { + "epoch": 0.54, + "learning_rate": 9.221590348065561e-06, + "loss": 0.4664, + "step": 4717 + }, + { + "epoch": 0.54, + "learning_rate": 9.217899980367142e-06, + "loss": 0.4613, + "step": 4718 + }, + { + "epoch": 0.54, + "learning_rate": 9.214209719833891e-06, + "loss": 0.4589, + "step": 4719 + }, + { + "epoch": 0.54, + "learning_rate": 9.210519566971452e-06, + "loss": 0.4755, + "step": 4720 + }, + { + "epoch": 0.54, + "learning_rate": 9.206829522285456e-06, + "loss": 0.4648, + "step": 4721 + }, + { + "epoch": 0.54, + "learning_rate": 9.203139586281527e-06, + "loss": 0.4539, + "step": 4722 + }, + { + "epoch": 0.54, + "learning_rate": 9.199449759465263e-06, + "loss": 0.455, + "step": 4723 + }, + { + "epoch": 0.54, + "learning_rate": 9.195760042342254e-06, + "loss": 0.4757, + "step": 4724 + }, + { + "epoch": 0.54, + "learning_rate": 9.192070435418079e-06, + "loss": 0.4543, + "step": 4725 + }, + { + "epoch": 0.54, + "learning_rate": 9.188380939198287e-06, + "loss": 0.4611, + "step": 4726 + }, + { + "epoch": 0.54, + "learning_rate": 9.184691554188432e-06, + "loss": 0.4593, + 
"step": 4727 + }, + { + "epoch": 0.54, + "learning_rate": 9.181002280894034e-06, + "loss": 0.4526, + "step": 4728 + }, + { + "epoch": 0.54, + "learning_rate": 9.177313119820608e-06, + "loss": 0.4462, + "step": 4729 + }, + { + "epoch": 0.54, + "learning_rate": 9.173624071473655e-06, + "loss": 0.4694, + "step": 4730 + }, + { + "epoch": 0.54, + "learning_rate": 9.16993513635865e-06, + "loss": 0.4631, + "step": 4731 + }, + { + "epoch": 0.54, + "learning_rate": 9.166246314981066e-06, + "loss": 0.4379, + "step": 4732 + }, + { + "epoch": 0.54, + "learning_rate": 9.162557607846352e-06, + "loss": 0.4645, + "step": 4733 + }, + { + "epoch": 0.54, + "learning_rate": 9.158869015459939e-06, + "loss": 0.4768, + "step": 4734 + }, + { + "epoch": 0.54, + "learning_rate": 9.155180538327255e-06, + "loss": 0.4581, + "step": 4735 + }, + { + "epoch": 0.54, + "learning_rate": 9.151492176953697e-06, + "loss": 0.4598, + "step": 4736 + }, + { + "epoch": 0.54, + "learning_rate": 9.147803931844651e-06, + "loss": 0.4646, + "step": 4737 + }, + { + "epoch": 0.54, + "learning_rate": 9.144115803505498e-06, + "loss": 0.493, + "step": 4738 + }, + { + "epoch": 0.54, + "learning_rate": 9.140427792441584e-06, + "loss": 0.4566, + "step": 4739 + }, + { + "epoch": 0.54, + "learning_rate": 9.136739899158257e-06, + "loss": 0.4656, + "step": 4740 + }, + { + "epoch": 0.54, + "learning_rate": 9.133052124160837e-06, + "loss": 0.4568, + "step": 4741 + }, + { + "epoch": 0.54, + "learning_rate": 9.129364467954628e-06, + "loss": 0.4596, + "step": 4742 + }, + { + "epoch": 0.54, + "learning_rate": 9.125676931044928e-06, + "loss": 0.4573, + "step": 4743 + }, + { + "epoch": 0.54, + "learning_rate": 9.121989513937007e-06, + "loss": 0.456, + "step": 4744 + }, + { + "epoch": 0.54, + "learning_rate": 9.11830221713613e-06, + "loss": 0.4509, + "step": 4745 + }, + { + "epoch": 0.54, + "learning_rate": 9.11461504114753e-06, + "loss": 0.4743, + "step": 4746 + }, + { + "epoch": 0.54, + "learning_rate": 9.110927986476434e-06, + 
"loss": 0.4662, + "step": 4747 + }, + { + "epoch": 0.54, + "learning_rate": 9.107241053628058e-06, + "loss": 0.4476, + "step": 4748 + }, + { + "epoch": 0.54, + "learning_rate": 9.103554243107592e-06, + "loss": 0.4494, + "step": 4749 + }, + { + "epoch": 0.54, + "learning_rate": 9.0998675554202e-06, + "loss": 0.4702, + "step": 4750 + }, + { + "epoch": 0.54, + "learning_rate": 9.096180991071055e-06, + "loss": 0.4599, + "step": 4751 + }, + { + "epoch": 0.54, + "learning_rate": 9.09249455056529e-06, + "loss": 0.4609, + "step": 4752 + }, + { + "epoch": 0.54, + "learning_rate": 9.088808234408037e-06, + "loss": 0.4478, + "step": 4753 + }, + { + "epoch": 0.54, + "learning_rate": 9.0851220431044e-06, + "loss": 0.4582, + "step": 4754 + }, + { + "epoch": 0.54, + "learning_rate": 9.081435977159464e-06, + "loss": 0.4391, + "step": 4755 + }, + { + "epoch": 0.54, + "learning_rate": 9.07775003707831e-06, + "loss": 0.4849, + "step": 4756 + }, + { + "epoch": 0.54, + "learning_rate": 9.074064223365986e-06, + "loss": 0.4503, + "step": 4757 + }, + { + "epoch": 0.54, + "learning_rate": 9.070378536527544e-06, + "loss": 0.4415, + "step": 4758 + }, + { + "epoch": 0.54, + "learning_rate": 9.066692977067996e-06, + "loss": 0.4643, + "step": 4759 + }, + { + "epoch": 0.54, + "learning_rate": 9.063007545492342e-06, + "loss": 0.4695, + "step": 4760 + }, + { + "epoch": 0.54, + "learning_rate": 9.059322242305579e-06, + "loss": 0.4582, + "step": 4761 + }, + { + "epoch": 0.54, + "learning_rate": 9.055637068012664e-06, + "loss": 0.4431, + "step": 4762 + }, + { + "epoch": 0.54, + "learning_rate": 9.051952023118563e-06, + "loss": 0.4624, + "step": 4763 + }, + { + "epoch": 0.54, + "learning_rate": 9.048267108128198e-06, + "loss": 0.4788, + "step": 4764 + }, + { + "epoch": 0.54, + "learning_rate": 9.044582323546486e-06, + "loss": 0.457, + "step": 4765 + }, + { + "epoch": 0.54, + "learning_rate": 9.040897669878327e-06, + "loss": 0.4669, + "step": 4766 + }, + { + "epoch": 0.54, + "learning_rate": 
9.037213147628603e-06, + "loss": 0.4772, + "step": 4767 + }, + { + "epoch": 0.54, + "learning_rate": 9.033528757302167e-06, + "loss": 0.4535, + "step": 4768 + }, + { + "epoch": 0.55, + "learning_rate": 9.029844499403876e-06, + "loss": 0.4751, + "step": 4769 + }, + { + "epoch": 0.55, + "learning_rate": 9.026160374438543e-06, + "loss": 0.4523, + "step": 4770 + }, + { + "epoch": 0.55, + "learning_rate": 9.022476382910983e-06, + "loss": 0.462, + "step": 4771 + }, + { + "epoch": 0.55, + "learning_rate": 9.018792525325986e-06, + "loss": 0.458, + "step": 4772 + }, + { + "epoch": 0.55, + "learning_rate": 9.015108802188314e-06, + "loss": 0.4503, + "step": 4773 + }, + { + "epoch": 0.55, + "learning_rate": 9.01142521400273e-06, + "loss": 0.4615, + "step": 4774 + }, + { + "epoch": 0.55, + "learning_rate": 9.007741761273957e-06, + "loss": 0.447, + "step": 4775 + }, + { + "epoch": 0.55, + "learning_rate": 9.004058444506718e-06, + "loss": 0.4495, + "step": 4776 + }, + { + "epoch": 0.55, + "learning_rate": 9.000375264205713e-06, + "loss": 0.4651, + "step": 4777 + }, + { + "epoch": 0.55, + "learning_rate": 8.996692220875608e-06, + "loss": 0.448, + "step": 4778 + }, + { + "epoch": 0.55, + "learning_rate": 8.993009315021073e-06, + "loss": 0.467, + "step": 4779 + }, + { + "epoch": 0.55, + "learning_rate": 8.989326547146743e-06, + "loss": 0.4631, + "step": 4780 + }, + { + "epoch": 0.55, + "learning_rate": 8.985643917757237e-06, + "loss": 0.4623, + "step": 4781 + }, + { + "epoch": 0.55, + "learning_rate": 8.981961427357166e-06, + "loss": 0.459, + "step": 4782 + }, + { + "epoch": 0.55, + "learning_rate": 8.978279076451104e-06, + "loss": 0.4612, + "step": 4783 + }, + { + "epoch": 0.55, + "learning_rate": 8.974596865543624e-06, + "loss": 0.449, + "step": 4784 + }, + { + "epoch": 0.55, + "learning_rate": 8.970914795139264e-06, + "loss": 0.4779, + "step": 4785 + }, + { + "epoch": 0.55, + "learning_rate": 8.967232865742552e-06, + "loss": 0.4482, + "step": 4786 + }, + { + "epoch": 0.55, + 
"learning_rate": 8.963551077857999e-06, + "loss": 0.4515, + "step": 4787 + }, + { + "epoch": 0.55, + "learning_rate": 8.959869431990082e-06, + "loss": 0.4554, + "step": 4788 + }, + { + "epoch": 0.55, + "learning_rate": 8.956187928643281e-06, + "loss": 0.474, + "step": 4789 + }, + { + "epoch": 0.55, + "learning_rate": 8.952506568322036e-06, + "loss": 0.4525, + "step": 4790 + }, + { + "epoch": 0.55, + "learning_rate": 8.948825351530774e-06, + "loss": 0.4463, + "step": 4791 + }, + { + "epoch": 0.55, + "learning_rate": 8.945144278773914e-06, + "loss": 0.4477, + "step": 4792 + }, + { + "epoch": 0.55, + "learning_rate": 8.941463350555835e-06, + "loss": 0.4675, + "step": 4793 + }, + { + "epoch": 0.55, + "learning_rate": 8.937782567380908e-06, + "loss": 0.4434, + "step": 4794 + }, + { + "epoch": 0.55, + "learning_rate": 8.93410192975349e-06, + "loss": 0.4687, + "step": 4795 + }, + { + "epoch": 0.55, + "learning_rate": 8.9304214381779e-06, + "loss": 0.4757, + "step": 4796 + }, + { + "epoch": 0.55, + "learning_rate": 8.926741093158456e-06, + "loss": 0.4694, + "step": 4797 + }, + { + "epoch": 0.55, + "learning_rate": 8.923060895199442e-06, + "loss": 0.4357, + "step": 4798 + }, + { + "epoch": 0.55, + "learning_rate": 8.919380844805129e-06, + "loss": 0.4529, + "step": 4799 + }, + { + "epoch": 0.55, + "learning_rate": 8.915700942479769e-06, + "loss": 0.4772, + "step": 4800 + }, + { + "epoch": 0.55, + "learning_rate": 8.912021188727585e-06, + "loss": 0.4556, + "step": 4801 + }, + { + "epoch": 0.55, + "learning_rate": 8.908341584052791e-06, + "loss": 0.4584, + "step": 4802 + }, + { + "epoch": 0.55, + "learning_rate": 8.904662128959571e-06, + "loss": 0.4709, + "step": 4803 + }, + { + "epoch": 0.55, + "learning_rate": 8.900982823952093e-06, + "loss": 0.4442, + "step": 4804 + }, + { + "epoch": 0.55, + "learning_rate": 8.897303669534508e-06, + "loss": 0.4682, + "step": 4805 + }, + { + "epoch": 0.55, + "learning_rate": 8.89362466621094e-06, + "loss": 0.4557, + "step": 4806 + }, + { + 
"epoch": 0.55, + "learning_rate": 8.88994581448549e-06, + "loss": 0.4719, + "step": 4807 + }, + { + "epoch": 0.55, + "learning_rate": 8.886267114862248e-06, + "loss": 0.4619, + "step": 4808 + }, + { + "epoch": 0.55, + "learning_rate": 8.882588567845275e-06, + "loss": 0.4531, + "step": 4809 + }, + { + "epoch": 0.55, + "learning_rate": 8.87891017393862e-06, + "loss": 0.4544, + "step": 4810 + }, + { + "epoch": 0.55, + "learning_rate": 8.8752319336463e-06, + "loss": 0.4691, + "step": 4811 + }, + { + "epoch": 0.55, + "learning_rate": 8.871553847472313e-06, + "loss": 0.4549, + "step": 4812 + }, + { + "epoch": 0.55, + "learning_rate": 8.867875915920645e-06, + "loss": 0.4411, + "step": 4813 + }, + { + "epoch": 0.55, + "learning_rate": 8.86419813949525e-06, + "loss": 0.4696, + "step": 4814 + }, + { + "epoch": 0.55, + "learning_rate": 8.860520518700074e-06, + "loss": 0.4568, + "step": 4815 + }, + { + "epoch": 0.55, + "learning_rate": 8.856843054039026e-06, + "loss": 0.4446, + "step": 4816 + }, + { + "epoch": 0.55, + "learning_rate": 8.853165746015997e-06, + "loss": 0.4509, + "step": 4817 + }, + { + "epoch": 0.55, + "learning_rate": 8.849488595134867e-06, + "loss": 0.4493, + "step": 4818 + }, + { + "epoch": 0.55, + "learning_rate": 8.84581160189949e-06, + "loss": 0.4747, + "step": 4819 + }, + { + "epoch": 0.55, + "learning_rate": 8.842134766813687e-06, + "loss": 0.4503, + "step": 4820 + }, + { + "epoch": 0.55, + "learning_rate": 8.838458090381274e-06, + "loss": 0.4668, + "step": 4821 + }, + { + "epoch": 0.55, + "learning_rate": 8.834781573106035e-06, + "loss": 0.4503, + "step": 4822 + }, + { + "epoch": 0.55, + "learning_rate": 8.831105215491734e-06, + "loss": 0.4464, + "step": 4823 + }, + { + "epoch": 0.55, + "learning_rate": 8.827429018042119e-06, + "loss": 0.467, + "step": 4824 + }, + { + "epoch": 0.55, + "learning_rate": 8.823752981260904e-06, + "loss": 0.4819, + "step": 4825 + }, + { + "epoch": 0.55, + "learning_rate": 8.820077105651794e-06, + "loss": 0.4618, + "step": 
4826 + }, + { + "epoch": 0.55, + "learning_rate": 8.81640139171846e-06, + "loss": 0.478, + "step": 4827 + }, + { + "epoch": 0.55, + "learning_rate": 8.812725839964564e-06, + "loss": 0.4558, + "step": 4828 + }, + { + "epoch": 0.55, + "learning_rate": 8.809050450893737e-06, + "loss": 0.4699, + "step": 4829 + }, + { + "epoch": 0.55, + "learning_rate": 8.805375225009583e-06, + "loss": 0.4374, + "step": 4830 + }, + { + "epoch": 0.55, + "learning_rate": 8.801700162815698e-06, + "loss": 0.465, + "step": 4831 + }, + { + "epoch": 0.55, + "learning_rate": 8.798025264815643e-06, + "loss": 0.4757, + "step": 4832 + }, + { + "epoch": 0.55, + "learning_rate": 8.79435053151296e-06, + "loss": 0.4561, + "step": 4833 + }, + { + "epoch": 0.55, + "learning_rate": 8.790675963411175e-06, + "loss": 0.4736, + "step": 4834 + }, + { + "epoch": 0.55, + "learning_rate": 8.787001561013779e-06, + "loss": 0.4534, + "step": 4835 + }, + { + "epoch": 0.55, + "learning_rate": 8.783327324824255e-06, + "loss": 0.4696, + "step": 4836 + }, + { + "epoch": 0.55, + "learning_rate": 8.779653255346049e-06, + "loss": 0.4809, + "step": 4837 + }, + { + "epoch": 0.55, + "learning_rate": 8.77597935308259e-06, + "loss": 0.4455, + "step": 4838 + }, + { + "epoch": 0.55, + "learning_rate": 8.772305618537293e-06, + "loss": 0.4506, + "step": 4839 + }, + { + "epoch": 0.55, + "learning_rate": 8.768632052213532e-06, + "loss": 0.4514, + "step": 4840 + }, + { + "epoch": 0.55, + "learning_rate": 8.764958654614673e-06, + "loss": 0.4674, + "step": 4841 + }, + { + "epoch": 0.55, + "learning_rate": 8.761285426244053e-06, + "loss": 0.45, + "step": 4842 + }, + { + "epoch": 0.55, + "learning_rate": 8.757612367604983e-06, + "loss": 0.4517, + "step": 4843 + }, + { + "epoch": 0.55, + "learning_rate": 8.753939479200758e-06, + "loss": 0.4691, + "step": 4844 + }, + { + "epoch": 0.55, + "learning_rate": 8.750266761534641e-06, + "loss": 0.4638, + "step": 4845 + }, + { + "epoch": 0.55, + "learning_rate": 8.746594215109884e-06, + "loss": 
0.4645, + "step": 4846 + }, + { + "epoch": 0.55, + "learning_rate": 8.742921840429702e-06, + "loss": 0.4585, + "step": 4847 + }, + { + "epoch": 0.55, + "learning_rate": 8.73924963799729e-06, + "loss": 0.4481, + "step": 4848 + }, + { + "epoch": 0.55, + "learning_rate": 8.73557760831583e-06, + "loss": 0.4553, + "step": 4849 + }, + { + "epoch": 0.55, + "learning_rate": 8.731905751888466e-06, + "loss": 0.4728, + "step": 4850 + }, + { + "epoch": 0.55, + "learning_rate": 8.728234069218325e-06, + "loss": 0.459, + "step": 4851 + }, + { + "epoch": 0.55, + "learning_rate": 8.724562560808512e-06, + "loss": 0.4681, + "step": 4852 + }, + { + "epoch": 0.55, + "learning_rate": 8.720891227162099e-06, + "loss": 0.4699, + "step": 4853 + }, + { + "epoch": 0.55, + "learning_rate": 8.71722006878215e-06, + "loss": 0.4628, + "step": 4854 + }, + { + "epoch": 0.55, + "learning_rate": 8.71354908617169e-06, + "loss": 0.4618, + "step": 4855 + }, + { + "epoch": 0.55, + "learning_rate": 8.709878279833725e-06, + "loss": 0.4481, + "step": 4856 + }, + { + "epoch": 0.56, + "learning_rate": 8.706207650271243e-06, + "loss": 0.4791, + "step": 4857 + }, + { + "epoch": 0.56, + "learning_rate": 8.702537197987193e-06, + "loss": 0.4634, + "step": 4858 + }, + { + "epoch": 0.56, + "learning_rate": 8.698866923484521e-06, + "loss": 0.4584, + "step": 4859 + }, + { + "epoch": 0.56, + "learning_rate": 8.695196827266126e-06, + "loss": 0.4717, + "step": 4860 + }, + { + "epoch": 0.56, + "learning_rate": 8.691526909834895e-06, + "loss": 0.4573, + "step": 4861 + }, + { + "epoch": 0.56, + "learning_rate": 8.687857171693693e-06, + "loss": 0.4662, + "step": 4862 + }, + { + "epoch": 0.56, + "learning_rate": 8.684187613345356e-06, + "loss": 0.4881, + "step": 4863 + }, + { + "epoch": 0.56, + "learning_rate": 8.680518235292688e-06, + "loss": 0.4418, + "step": 4864 + }, + { + "epoch": 0.56, + "learning_rate": 8.676849038038483e-06, + "loss": 0.4666, + "step": 4865 + }, + { + "epoch": 0.56, + "learning_rate": 
8.673180022085499e-06, + "loss": 0.4722, + "step": 4866 + }, + { + "epoch": 0.56, + "learning_rate": 8.669511187936478e-06, + "loss": 0.4519, + "step": 4867 + }, + { + "epoch": 0.56, + "learning_rate": 8.66584253609413e-06, + "loss": 0.4627, + "step": 4868 + }, + { + "epoch": 0.56, + "learning_rate": 8.662174067061135e-06, + "loss": 0.4508, + "step": 4869 + }, + { + "epoch": 0.56, + "learning_rate": 8.658505781340166e-06, + "loss": 0.4772, + "step": 4870 + }, + { + "epoch": 0.56, + "learning_rate": 8.654837679433852e-06, + "loss": 0.4594, + "step": 4871 + }, + { + "epoch": 0.56, + "learning_rate": 8.651169761844812e-06, + "loss": 0.4637, + "step": 4872 + }, + { + "epoch": 0.56, + "learning_rate": 8.64750202907563e-06, + "loss": 0.4445, + "step": 4873 + }, + { + "epoch": 0.56, + "learning_rate": 8.643834481628861e-06, + "loss": 0.4444, + "step": 4874 + }, + { + "epoch": 0.56, + "learning_rate": 8.640167120007047e-06, + "loss": 0.4511, + "step": 4875 + }, + { + "epoch": 0.56, + "learning_rate": 8.636499944712702e-06, + "loss": 0.4727, + "step": 4876 + }, + { + "epoch": 0.56, + "learning_rate": 8.6328329562483e-06, + "loss": 0.4674, + "step": 4877 + }, + { + "epoch": 0.56, + "learning_rate": 8.629166155116312e-06, + "loss": 0.48, + "step": 4878 + }, + { + "epoch": 0.56, + "learning_rate": 8.625499541819163e-06, + "loss": 0.4411, + "step": 4879 + }, + { + "epoch": 0.56, + "learning_rate": 8.621833116859264e-06, + "loss": 0.4496, + "step": 4880 + }, + { + "epoch": 0.56, + "learning_rate": 8.618166880739e-06, + "loss": 0.5047, + "step": 4881 + }, + { + "epoch": 0.56, + "learning_rate": 8.614500833960722e-06, + "loss": 0.4631, + "step": 4882 + }, + { + "epoch": 0.56, + "learning_rate": 8.610834977026765e-06, + "loss": 0.4548, + "step": 4883 + }, + { + "epoch": 0.56, + "learning_rate": 8.607169310439427e-06, + "loss": 0.4439, + "step": 4884 + }, + { + "epoch": 0.56, + "learning_rate": 8.603503834700993e-06, + "loss": 0.4659, + "step": 4885 + }, + { + "epoch": 0.56, + 
"learning_rate": 8.599838550313714e-06, + "loss": 0.4751, + "step": 4886 + }, + { + "epoch": 0.56, + "learning_rate": 8.596173457779807e-06, + "loss": 0.4699, + "step": 4887 + }, + { + "epoch": 0.56, + "learning_rate": 8.592508557601484e-06, + "loss": 0.4542, + "step": 4888 + }, + { + "epoch": 0.56, + "learning_rate": 8.588843850280911e-06, + "loss": 0.4825, + "step": 4889 + }, + { + "epoch": 0.56, + "learning_rate": 8.585179336320235e-06, + "loss": 0.4437, + "step": 4890 + }, + { + "epoch": 0.56, + "learning_rate": 8.58151501622158e-06, + "loss": 0.455, + "step": 4891 + }, + { + "epoch": 0.56, + "learning_rate": 8.577850890487035e-06, + "loss": 0.4626, + "step": 4892 + }, + { + "epoch": 0.56, + "learning_rate": 8.574186959618671e-06, + "loss": 0.4551, + "step": 4893 + }, + { + "epoch": 0.56, + "learning_rate": 8.570523224118526e-06, + "loss": 0.472, + "step": 4894 + }, + { + "epoch": 0.56, + "learning_rate": 8.566859684488611e-06, + "loss": 0.4657, + "step": 4895 + }, + { + "epoch": 0.56, + "learning_rate": 8.56319634123092e-06, + "loss": 0.4572, + "step": 4896 + }, + { + "epoch": 0.56, + "learning_rate": 8.559533194847406e-06, + "loss": 0.4532, + "step": 4897 + }, + { + "epoch": 0.56, + "learning_rate": 8.555870245840005e-06, + "loss": 0.4627, + "step": 4898 + }, + { + "epoch": 0.56, + "learning_rate": 8.552207494710623e-06, + "loss": 0.4717, + "step": 4899 + }, + { + "epoch": 0.56, + "learning_rate": 8.548544941961134e-06, + "loss": 0.458, + "step": 4900 + }, + { + "epoch": 0.56, + "learning_rate": 8.544882588093399e-06, + "loss": 0.4661, + "step": 4901 + }, + { + "epoch": 0.56, + "learning_rate": 8.541220433609234e-06, + "loss": 0.4533, + "step": 4902 + }, + { + "epoch": 0.56, + "learning_rate": 8.53755847901044e-06, + "loss": 0.4695, + "step": 4903 + }, + { + "epoch": 0.56, + "learning_rate": 8.533896724798784e-06, + "loss": 0.4595, + "step": 4904 + }, + { + "epoch": 0.56, + "learning_rate": 8.530235171476005e-06, + "loss": 0.4565, + "step": 4905 + }, + { + 
"epoch": 0.56, + "learning_rate": 8.526573819543828e-06, + "loss": 0.4569, + "step": 4906 + }, + { + "epoch": 0.56, + "learning_rate": 8.522912669503932e-06, + "loss": 0.4691, + "step": 4907 + }, + { + "epoch": 0.56, + "learning_rate": 8.519251721857977e-06, + "loss": 0.4539, + "step": 4908 + }, + { + "epoch": 0.56, + "learning_rate": 8.515590977107597e-06, + "loss": 0.4729, + "step": 4909 + }, + { + "epoch": 0.56, + "learning_rate": 8.511930435754391e-06, + "loss": 0.4549, + "step": 4910 + }, + { + "epoch": 0.56, + "learning_rate": 8.508270098299943e-06, + "loss": 0.4571, + "step": 4911 + }, + { + "epoch": 0.56, + "learning_rate": 8.504609965245793e-06, + "loss": 0.4656, + "step": 4912 + }, + { + "epoch": 0.56, + "learning_rate": 8.500950037093462e-06, + "loss": 0.4603, + "step": 4913 + }, + { + "epoch": 0.56, + "learning_rate": 8.49729031434445e-06, + "loss": 0.4627, + "step": 4914 + }, + { + "epoch": 0.56, + "learning_rate": 8.493630797500214e-06, + "loss": 0.4876, + "step": 4915 + }, + { + "epoch": 0.56, + "learning_rate": 8.489971487062184e-06, + "loss": 0.4382, + "step": 4916 + }, + { + "epoch": 0.56, + "learning_rate": 8.486312383531777e-06, + "loss": 0.4776, + "step": 4917 + }, + { + "epoch": 0.56, + "learning_rate": 8.482653487410367e-06, + "loss": 0.4457, + "step": 4918 + }, + { + "epoch": 0.56, + "learning_rate": 8.478994799199308e-06, + "loss": 0.4529, + "step": 4919 + }, + { + "epoch": 0.56, + "learning_rate": 8.47533631939992e-06, + "loss": 0.4611, + "step": 4920 + }, + { + "epoch": 0.56, + "learning_rate": 8.471678048513494e-06, + "loss": 0.4637, + "step": 4921 + }, + { + "epoch": 0.56, + "learning_rate": 8.468019987041298e-06, + "loss": 0.4798, + "step": 4922 + }, + { + "epoch": 0.56, + "learning_rate": 8.464362135484564e-06, + "loss": 0.4612, + "step": 4923 + }, + { + "epoch": 0.56, + "learning_rate": 8.460704494344508e-06, + "loss": 0.45, + "step": 4924 + }, + { + "epoch": 0.56, + "learning_rate": 8.4570470641223e-06, + "loss": 0.4651, + "step": 
4925 + }, + { + "epoch": 0.56, + "learning_rate": 8.453389845319092e-06, + "loss": 0.4471, + "step": 4926 + }, + { + "epoch": 0.56, + "learning_rate": 8.449732838436006e-06, + "loss": 0.4679, + "step": 4927 + }, + { + "epoch": 0.56, + "learning_rate": 8.44607604397413e-06, + "loss": 0.4558, + "step": 4928 + }, + { + "epoch": 0.56, + "learning_rate": 8.442419462434533e-06, + "loss": 0.4744, + "step": 4929 + }, + { + "epoch": 0.56, + "learning_rate": 8.438763094318245e-06, + "loss": 0.4627, + "step": 4930 + }, + { + "epoch": 0.56, + "learning_rate": 8.435106940126266e-06, + "loss": 0.4728, + "step": 4931 + }, + { + "epoch": 0.56, + "learning_rate": 8.431451000359575e-06, + "loss": 0.4689, + "step": 4932 + }, + { + "epoch": 0.56, + "learning_rate": 8.42779527551912e-06, + "loss": 0.4442, + "step": 4933 + }, + { + "epoch": 0.56, + "learning_rate": 8.424139766105808e-06, + "loss": 0.4379, + "step": 4934 + }, + { + "epoch": 0.56, + "learning_rate": 8.420484472620535e-06, + "loss": 0.4609, + "step": 4935 + }, + { + "epoch": 0.56, + "learning_rate": 8.41682939556415e-06, + "loss": 0.4641, + "step": 4936 + }, + { + "epoch": 0.56, + "learning_rate": 8.413174535437486e-06, + "loss": 0.4572, + "step": 4937 + }, + { + "epoch": 0.56, + "learning_rate": 8.409519892741342e-06, + "loss": 0.4448, + "step": 4938 + }, + { + "epoch": 0.56, + "learning_rate": 8.405865467976477e-06, + "loss": 0.4714, + "step": 4939 + }, + { + "epoch": 0.56, + "learning_rate": 8.402211261643638e-06, + "loss": 0.4533, + "step": 4940 + }, + { + "epoch": 0.56, + "learning_rate": 8.398557274243524e-06, + "loss": 0.4597, + "step": 4941 + }, + { + "epoch": 0.56, + "learning_rate": 8.39490350627682e-06, + "loss": 0.4445, + "step": 4942 + }, + { + "epoch": 0.56, + "learning_rate": 8.391249958244173e-06, + "loss": 0.4642, + "step": 4943 + }, + { + "epoch": 0.57, + "learning_rate": 8.387596630646195e-06, + "loss": 0.4629, + "step": 4944 + }, + { + "epoch": 0.57, + "learning_rate": 8.383943523983482e-06, + "loss": 
0.445, + "step": 4945 + }, + { + "epoch": 0.57, + "learning_rate": 8.380290638756584e-06, + "loss": 0.4638, + "step": 4946 + }, + { + "epoch": 0.57, + "learning_rate": 8.376637975466029e-06, + "loss": 0.4669, + "step": 4947 + }, + { + "epoch": 0.57, + "learning_rate": 8.372985534612317e-06, + "loss": 0.4508, + "step": 4948 + }, + { + "epoch": 0.57, + "learning_rate": 8.369333316695909e-06, + "loss": 0.4773, + "step": 4949 + }, + { + "epoch": 0.57, + "learning_rate": 8.365681322217247e-06, + "loss": 0.4547, + "step": 4950 + }, + { + "epoch": 0.57, + "learning_rate": 8.362029551676731e-06, + "loss": 0.4548, + "step": 4951 + }, + { + "epoch": 0.57, + "learning_rate": 8.358378005574731e-06, + "loss": 0.4562, + "step": 4952 + }, + { + "epoch": 0.57, + "learning_rate": 8.354726684411604e-06, + "loss": 0.4575, + "step": 4953 + }, + { + "epoch": 0.57, + "learning_rate": 8.351075588687648e-06, + "loss": 0.4677, + "step": 4954 + }, + { + "epoch": 0.57, + "learning_rate": 8.347424718903152e-06, + "loss": 0.4593, + "step": 4955 + }, + { + "epoch": 0.57, + "learning_rate": 8.343774075558366e-06, + "loss": 0.4511, + "step": 4956 + }, + { + "epoch": 0.57, + "learning_rate": 8.340123659153506e-06, + "loss": 0.4596, + "step": 4957 + }, + { + "epoch": 0.57, + "learning_rate": 8.336473470188767e-06, + "loss": 0.4628, + "step": 4958 + }, + { + "epoch": 0.57, + "learning_rate": 8.3328235091643e-06, + "loss": 0.4524, + "step": 4959 + }, + { + "epoch": 0.57, + "learning_rate": 8.329173776580236e-06, + "loss": 0.4604, + "step": 4960 + }, + { + "epoch": 0.57, + "learning_rate": 8.325524272936668e-06, + "loss": 0.4824, + "step": 4961 + }, + { + "epoch": 0.57, + "learning_rate": 8.321874998733654e-06, + "loss": 0.4744, + "step": 4962 + }, + { + "epoch": 0.57, + "learning_rate": 8.318225954471238e-06, + "loss": 0.4642, + "step": 4963 + }, + { + "epoch": 0.57, + "learning_rate": 8.31457714064941e-06, + "loss": 0.4616, + "step": 4964 + }, + { + "epoch": 0.57, + "learning_rate": 
8.310928557768145e-06, + "loss": 0.4421, + "step": 4965 + }, + { + "epoch": 0.57, + "learning_rate": 8.307280206327376e-06, + "loss": 0.4755, + "step": 4966 + }, + { + "epoch": 0.57, + "learning_rate": 8.30363208682701e-06, + "loss": 0.4527, + "step": 4967 + }, + { + "epoch": 0.57, + "learning_rate": 8.299984199766925e-06, + "loss": 0.4444, + "step": 4968 + }, + { + "epoch": 0.57, + "learning_rate": 8.296336545646957e-06, + "loss": 0.4497, + "step": 4969 + }, + { + "epoch": 0.57, + "learning_rate": 8.292689124966917e-06, + "loss": 0.4672, + "step": 4970 + }, + { + "epoch": 0.57, + "learning_rate": 8.28904193822659e-06, + "loss": 0.4503, + "step": 4971 + }, + { + "epoch": 0.57, + "learning_rate": 8.285394985925714e-06, + "loss": 0.4621, + "step": 4972 + }, + { + "epoch": 0.57, + "learning_rate": 8.281748268564002e-06, + "loss": 0.459, + "step": 4973 + }, + { + "epoch": 0.57, + "learning_rate": 8.278101786641142e-06, + "loss": 0.4655, + "step": 4974 + }, + { + "epoch": 0.57, + "learning_rate": 8.27445554065678e-06, + "loss": 0.4612, + "step": 4975 + }, + { + "epoch": 0.57, + "learning_rate": 8.270809531110536e-06, + "loss": 0.4627, + "step": 4976 + }, + { + "epoch": 0.57, + "learning_rate": 8.267163758501992e-06, + "loss": 0.4448, + "step": 4977 + }, + { + "epoch": 0.57, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4608, + "step": 4978 + }, + { + "epoch": 0.57, + "learning_rate": 8.259872926096177e-06, + "loss": 0.4677, + "step": 4979 + }, + { + "epoch": 0.57, + "learning_rate": 8.256227867297915e-06, + "loss": 0.47, + "step": 4980 + }, + { + "epoch": 0.57, + "learning_rate": 8.25258304743537e-06, + "loss": 0.4541, + "step": 4981 + }, + { + "epoch": 0.57, + "learning_rate": 8.24893846700796e-06, + "loss": 0.4523, + "step": 4982 + }, + { + "epoch": 0.57, + "learning_rate": 8.245294126515073e-06, + "loss": 0.4618, + "step": 4983 + }, + { + "epoch": 0.57, + "learning_rate": 8.241650026456065e-06, + "loss": 0.4756, + "step": 4984 + }, + { + "epoch": 0.57, + 
"learning_rate": 8.238006167330266e-06, + "loss": 0.44, + "step": 4985 + }, + { + "epoch": 0.57, + "learning_rate": 8.234362549636953e-06, + "loss": 0.4509, + "step": 4986 + }, + { + "epoch": 0.57, + "learning_rate": 8.230719173875399e-06, + "loss": 0.4718, + "step": 4987 + }, + { + "epoch": 0.57, + "learning_rate": 8.227076040544813e-06, + "loss": 0.4867, + "step": 4988 + }, + { + "epoch": 0.57, + "learning_rate": 8.223433150144393e-06, + "loss": 0.4502, + "step": 4989 + }, + { + "epoch": 0.57, + "learning_rate": 8.2197905031733e-06, + "loss": 0.4644, + "step": 4990 + }, + { + "epoch": 0.57, + "learning_rate": 8.216148100130647e-06, + "loss": 0.4476, + "step": 4991 + }, + { + "epoch": 0.57, + "learning_rate": 8.212505941515536e-06, + "loss": 0.4573, + "step": 4992 + }, + { + "epoch": 0.57, + "learning_rate": 8.208864027827015e-06, + "loss": 0.474, + "step": 4993 + }, + { + "epoch": 0.57, + "learning_rate": 8.205222359564113e-06, + "loss": 0.4377, + "step": 4994 + }, + { + "epoch": 0.57, + "learning_rate": 8.20158093722582e-06, + "loss": 0.462, + "step": 4995 + }, + { + "epoch": 0.57, + "learning_rate": 8.197939761311087e-06, + "loss": 0.4753, + "step": 4996 + }, + { + "epoch": 0.57, + "learning_rate": 8.194298832318843e-06, + "loss": 0.4643, + "step": 4997 + }, + { + "epoch": 0.57, + "learning_rate": 8.190658150747973e-06, + "loss": 0.4496, + "step": 4998 + }, + { + "epoch": 0.57, + "learning_rate": 8.187017717097327e-06, + "loss": 0.4611, + "step": 4999 + }, + { + "epoch": 0.57, + "learning_rate": 8.183377531865737e-06, + "loss": 0.4553, + "step": 5000 + }, + { + "epoch": 0.57, + "learning_rate": 8.179737595551979e-06, + "loss": 0.4481, + "step": 5001 + }, + { + "epoch": 0.57, + "learning_rate": 8.176097908654814e-06, + "loss": 0.4679, + "step": 5002 + }, + { + "epoch": 0.57, + "learning_rate": 8.172458471672953e-06, + "loss": 0.4676, + "step": 5003 + }, + { + "epoch": 0.57, + "learning_rate": 8.168819285105084e-06, + "loss": 0.4493, + "step": 5004 + }, + { + 
"epoch": 0.57, + "learning_rate": 8.165180349449857e-06, + "loss": 0.4536, + "step": 5005 + }, + { + "epoch": 0.57, + "learning_rate": 8.161541665205885e-06, + "loss": 0.4662, + "step": 5006 + }, + { + "epoch": 0.57, + "learning_rate": 8.157903232871755e-06, + "loss": 0.4608, + "step": 5007 + }, + { + "epoch": 0.57, + "learning_rate": 8.154265052946005e-06, + "loss": 0.4607, + "step": 5008 + }, + { + "epoch": 0.57, + "learning_rate": 8.15062712592715e-06, + "loss": 0.4508, + "step": 5009 + }, + { + "epoch": 0.57, + "learning_rate": 8.146989452313671e-06, + "loss": 0.4459, + "step": 5010 + }, + { + "epoch": 0.57, + "learning_rate": 8.143352032604007e-06, + "loss": 0.4568, + "step": 5011 + }, + { + "epoch": 0.57, + "learning_rate": 8.139714867296567e-06, + "loss": 0.4647, + "step": 5012 + }, + { + "epoch": 0.57, + "learning_rate": 8.136077956889723e-06, + "loss": 0.4756, + "step": 5013 + }, + { + "epoch": 0.57, + "learning_rate": 8.13244130188181e-06, + "loss": 0.4675, + "step": 5014 + }, + { + "epoch": 0.57, + "learning_rate": 8.128804902771137e-06, + "loss": 0.4786, + "step": 5015 + }, + { + "epoch": 0.57, + "learning_rate": 8.125168760055971e-06, + "loss": 0.471, + "step": 5016 + }, + { + "epoch": 0.57, + "learning_rate": 8.121532874234539e-06, + "loss": 0.4604, + "step": 5017 + }, + { + "epoch": 0.57, + "learning_rate": 8.117897245805044e-06, + "loss": 0.4471, + "step": 5018 + }, + { + "epoch": 0.57, + "learning_rate": 8.114261875265643e-06, + "loss": 0.4433, + "step": 5019 + }, + { + "epoch": 0.57, + "learning_rate": 8.11062676311447e-06, + "loss": 0.4612, + "step": 5020 + }, + { + "epoch": 0.57, + "learning_rate": 8.106991909849613e-06, + "loss": 0.4797, + "step": 5021 + }, + { + "epoch": 0.57, + "learning_rate": 8.103357315969127e-06, + "loss": 0.4456, + "step": 5022 + }, + { + "epoch": 0.57, + "learning_rate": 8.099722981971035e-06, + "loss": 0.4634, + "step": 5023 + }, + { + "epoch": 0.57, + "learning_rate": 8.096088908353316e-06, + "loss": 0.4674, + "step": 
5024 + }, + { + "epoch": 0.57, + "learning_rate": 8.092455095613927e-06, + "loss": 0.4568, + "step": 5025 + }, + { + "epoch": 0.57, + "learning_rate": 8.088821544250778e-06, + "loss": 0.4572, + "step": 5026 + }, + { + "epoch": 0.57, + "learning_rate": 8.085188254761744e-06, + "loss": 0.4588, + "step": 5027 + }, + { + "epoch": 0.57, + "learning_rate": 8.081555227644674e-06, + "loss": 0.4563, + "step": 5028 + }, + { + "epoch": 0.57, + "learning_rate": 8.077922463397371e-06, + "loss": 0.4857, + "step": 5029 + }, + { + "epoch": 0.57, + "learning_rate": 8.074289962517597e-06, + "loss": 0.4454, + "step": 5030 + }, + { + "epoch": 0.57, + "learning_rate": 8.070657725503094e-06, + "loss": 0.4636, + "step": 5031 + }, + { + "epoch": 0.58, + "learning_rate": 8.067025752851555e-06, + "loss": 0.4605, + "step": 5032 + }, + { + "epoch": 0.58, + "learning_rate": 8.063394045060648e-06, + "loss": 0.4575, + "step": 5033 + }, + { + "epoch": 0.58, + "learning_rate": 8.059762602627995e-06, + "loss": 0.4647, + "step": 5034 + }, + { + "epoch": 0.58, + "learning_rate": 8.05613142605118e-06, + "loss": 0.4465, + "step": 5035 + }, + { + "epoch": 0.58, + "learning_rate": 8.052500515827759e-06, + "loss": 0.4594, + "step": 5036 + }, + { + "epoch": 0.58, + "learning_rate": 8.048869872455246e-06, + "loss": 0.4623, + "step": 5037 + }, + { + "epoch": 0.58, + "learning_rate": 8.045239496431125e-06, + "loss": 0.4749, + "step": 5038 + }, + { + "epoch": 0.58, + "learning_rate": 8.041609388252836e-06, + "loss": 0.4546, + "step": 5039 + }, + { + "epoch": 0.58, + "learning_rate": 8.037979548417781e-06, + "loss": 0.4709, + "step": 5040 + }, + { + "epoch": 0.58, + "learning_rate": 8.034349977423332e-06, + "loss": 0.4572, + "step": 5041 + }, + { + "epoch": 0.58, + "learning_rate": 8.030720675766825e-06, + "loss": 0.4705, + "step": 5042 + }, + { + "epoch": 0.58, + "learning_rate": 8.027091643945548e-06, + "loss": 0.4479, + "step": 5043 + }, + { + "epoch": 0.58, + "learning_rate": 8.023462882456768e-06, + 
"loss": 0.4596, + "step": 5044 + }, + { + "epoch": 0.58, + "learning_rate": 8.019834391797696e-06, + "loss": 0.4481, + "step": 5045 + }, + { + "epoch": 0.58, + "learning_rate": 8.016206172465525e-06, + "loss": 0.4726, + "step": 5046 + }, + { + "epoch": 0.58, + "learning_rate": 8.0125782249574e-06, + "loss": 0.4526, + "step": 5047 + }, + { + "epoch": 0.58, + "learning_rate": 8.008950549770426e-06, + "loss": 0.4478, + "step": 5048 + }, + { + "epoch": 0.58, + "learning_rate": 8.005323147401684e-06, + "loss": 0.4693, + "step": 5049 + }, + { + "epoch": 0.58, + "learning_rate": 8.0016960183482e-06, + "loss": 0.4515, + "step": 5050 + }, + { + "epoch": 0.58, + "learning_rate": 7.998069163106977e-06, + "loss": 0.4497, + "step": 5051 + }, + { + "epoch": 0.58, + "learning_rate": 7.994442582174976e-06, + "loss": 0.4524, + "step": 5052 + }, + { + "epoch": 0.58, + "learning_rate": 7.990816276049115e-06, + "loss": 0.4489, + "step": 5053 + }, + { + "epoch": 0.58, + "learning_rate": 7.987190245226285e-06, + "loss": 0.4759, + "step": 5054 + }, + { + "epoch": 0.58, + "learning_rate": 7.983564490203328e-06, + "loss": 0.4643, + "step": 5055 + }, + { + "epoch": 0.58, + "learning_rate": 7.979939011477052e-06, + "loss": 0.4524, + "step": 5056 + }, + { + "epoch": 0.58, + "learning_rate": 7.976313809544237e-06, + "loss": 0.4649, + "step": 5057 + }, + { + "epoch": 0.58, + "learning_rate": 7.972688884901606e-06, + "loss": 0.439, + "step": 5058 + }, + { + "epoch": 0.58, + "learning_rate": 7.969064238045865e-06, + "loss": 0.4411, + "step": 5059 + }, + { + "epoch": 0.58, + "learning_rate": 7.965439869473664e-06, + "loss": 0.4819, + "step": 5060 + }, + { + "epoch": 0.58, + "learning_rate": 7.961815779681624e-06, + "loss": 0.4461, + "step": 5061 + }, + { + "epoch": 0.58, + "learning_rate": 7.95819196916633e-06, + "loss": 0.4635, + "step": 5062 + }, + { + "epoch": 0.58, + "learning_rate": 7.954568438424315e-06, + "loss": 0.44, + "step": 5063 + }, + { + "epoch": 0.58, + "learning_rate": 
7.9509451879521e-06, + "loss": 0.4765, + "step": 5064 + }, + { + "epoch": 0.58, + "learning_rate": 7.947322218246136e-06, + "loss": 0.4726, + "step": 5065 + }, + { + "epoch": 0.58, + "learning_rate": 7.943699529802854e-06, + "loss": 0.4714, + "step": 5066 + }, + { + "epoch": 0.58, + "learning_rate": 7.940077123118654e-06, + "loss": 0.4558, + "step": 5067 + }, + { + "epoch": 0.58, + "learning_rate": 7.936454998689875e-06, + "loss": 0.4609, + "step": 5068 + }, + { + "epoch": 0.58, + "learning_rate": 7.932833157012829e-06, + "loss": 0.4656, + "step": 5069 + }, + { + "epoch": 0.58, + "learning_rate": 7.929211598583795e-06, + "loss": 0.4652, + "step": 5070 + }, + { + "epoch": 0.58, + "learning_rate": 7.925590323899002e-06, + "loss": 0.4624, + "step": 5071 + }, + { + "epoch": 0.58, + "learning_rate": 7.921969333454652e-06, + "loss": 0.46, + "step": 5072 + }, + { + "epoch": 0.58, + "learning_rate": 7.918348627746896e-06, + "loss": 0.4763, + "step": 5073 + }, + { + "epoch": 0.58, + "learning_rate": 7.914728207271853e-06, + "loss": 0.4567, + "step": 5074 + }, + { + "epoch": 0.58, + "learning_rate": 7.911108072525603e-06, + "loss": 0.4655, + "step": 5075 + }, + { + "epoch": 0.58, + "learning_rate": 7.90748822400418e-06, + "loss": 0.4465, + "step": 5076 + }, + { + "epoch": 0.58, + "learning_rate": 7.903868662203594e-06, + "loss": 0.4486, + "step": 5077 + }, + { + "epoch": 0.58, + "learning_rate": 7.900249387619797e-06, + "loss": 0.4767, + "step": 5078 + }, + { + "epoch": 0.58, + "learning_rate": 7.896630400748715e-06, + "loss": 0.4576, + "step": 5079 + }, + { + "epoch": 0.58, + "learning_rate": 7.893011702086225e-06, + "loss": 0.4671, + "step": 5080 + }, + { + "epoch": 0.58, + "learning_rate": 7.889393292128177e-06, + "loss": 0.4425, + "step": 5081 + }, + { + "epoch": 0.58, + "learning_rate": 7.885775171370364e-06, + "loss": 0.4573, + "step": 5082 + }, + { + "epoch": 0.58, + "learning_rate": 7.88215734030856e-06, + "loss": 0.4664, + "step": 5083 + }, + { + "epoch": 0.58, + 
"learning_rate": 7.878539799438479e-06, + "loss": 0.464, + "step": 5084 + }, + { + "epoch": 0.58, + "learning_rate": 7.874922549255814e-06, + "loss": 0.444, + "step": 5085 + }, + { + "epoch": 0.58, + "learning_rate": 7.871305590256206e-06, + "loss": 0.4777, + "step": 5086 + }, + { + "epoch": 0.58, + "learning_rate": 7.867688922935253e-06, + "loss": 0.4662, + "step": 5087 + }, + { + "epoch": 0.58, + "learning_rate": 7.864072547788526e-06, + "loss": 0.4694, + "step": 5088 + }, + { + "epoch": 0.58, + "learning_rate": 7.860456465311544e-06, + "loss": 0.4534, + "step": 5089 + }, + { + "epoch": 0.58, + "learning_rate": 7.856840675999799e-06, + "loss": 0.4482, + "step": 5090 + }, + { + "epoch": 0.58, + "learning_rate": 7.85322518034873e-06, + "loss": 0.4837, + "step": 5091 + }, + { + "epoch": 0.58, + "learning_rate": 7.849609978853735e-06, + "loss": 0.4548, + "step": 5092 + }, + { + "epoch": 0.58, + "learning_rate": 7.845995072010188e-06, + "loss": 0.4736, + "step": 5093 + }, + { + "epoch": 0.58, + "learning_rate": 7.842380460313408e-06, + "loss": 0.4498, + "step": 5094 + }, + { + "epoch": 0.58, + "learning_rate": 7.83876614425867e-06, + "loss": 0.4545, + "step": 5095 + }, + { + "epoch": 0.58, + "learning_rate": 7.835152124341228e-06, + "loss": 0.459, + "step": 5096 + }, + { + "epoch": 0.58, + "learning_rate": 7.831538401056274e-06, + "loss": 0.4741, + "step": 5097 + }, + { + "epoch": 0.58, + "learning_rate": 7.827924974898973e-06, + "loss": 0.4798, + "step": 5098 + }, + { + "epoch": 0.58, + "learning_rate": 7.824311846364448e-06, + "loss": 0.4477, + "step": 5099 + }, + { + "epoch": 0.58, + "learning_rate": 7.82069901594777e-06, + "loss": 0.4804, + "step": 5100 + }, + { + "epoch": 0.58, + "learning_rate": 7.817086484143987e-06, + "loss": 0.4712, + "step": 5101 + }, + { + "epoch": 0.58, + "learning_rate": 7.813474251448086e-06, + "loss": 0.44, + "step": 5102 + }, + { + "epoch": 0.58, + "learning_rate": 7.809862318355033e-06, + "loss": 0.4624, + "step": 5103 + }, + { + 
"epoch": 0.58, + "learning_rate": 7.806250685359742e-06, + "loss": 0.468, + "step": 5104 + }, + { + "epoch": 0.58, + "learning_rate": 7.80263935295708e-06, + "loss": 0.4681, + "step": 5105 + }, + { + "epoch": 0.58, + "learning_rate": 7.799028321641889e-06, + "loss": 0.457, + "step": 5106 + }, + { + "epoch": 0.58, + "learning_rate": 7.795417591908954e-06, + "loss": 0.4619, + "step": 5107 + }, + { + "epoch": 0.58, + "learning_rate": 7.79180716425303e-06, + "loss": 0.4449, + "step": 5108 + }, + { + "epoch": 0.58, + "learning_rate": 7.788197039168829e-06, + "loss": 0.4714, + "step": 5109 + }, + { + "epoch": 0.58, + "learning_rate": 7.784587217151009e-06, + "loss": 0.4515, + "step": 5110 + }, + { + "epoch": 0.58, + "learning_rate": 7.780977698694206e-06, + "loss": 0.4551, + "step": 5111 + }, + { + "epoch": 0.58, + "learning_rate": 7.777368484293e-06, + "loss": 0.4647, + "step": 5112 + }, + { + "epoch": 0.58, + "learning_rate": 7.77375957444193e-06, + "loss": 0.4572, + "step": 5113 + }, + { + "epoch": 0.58, + "learning_rate": 7.770150969635509e-06, + "loss": 0.4862, + "step": 5114 + }, + { + "epoch": 0.58, + "learning_rate": 7.766542670368182e-06, + "loss": 0.4475, + "step": 5115 + }, + { + "epoch": 0.58, + "learning_rate": 7.76293467713438e-06, + "loss": 0.4581, + "step": 5116 + }, + { + "epoch": 0.58, + "learning_rate": 7.759326990428468e-06, + "loss": 0.4624, + "step": 5117 + }, + { + "epoch": 0.58, + "learning_rate": 7.755719610744783e-06, + "loss": 0.4533, + "step": 5118 + }, + { + "epoch": 0.59, + "learning_rate": 7.752112538577623e-06, + "loss": 0.458, + "step": 5119 + }, + { + "epoch": 0.59, + "learning_rate": 7.748505774421227e-06, + "loss": 0.4647, + "step": 5120 + }, + { + "epoch": 0.59, + "learning_rate": 7.744899318769811e-06, + "loss": 0.4381, + "step": 5121 + }, + { + "epoch": 0.59, + "learning_rate": 7.741293172117533e-06, + "loss": 0.486, + "step": 5122 + }, + { + "epoch": 0.59, + "learning_rate": 7.737687334958518e-06, + "loss": 0.4649, + "step": 5123 + 
}, + { + "epoch": 0.59, + "learning_rate": 7.73408180778685e-06, + "loss": 0.4676, + "step": 5124 + }, + { + "epoch": 0.59, + "learning_rate": 7.730476591096565e-06, + "loss": 0.464, + "step": 5125 + }, + { + "epoch": 0.59, + "learning_rate": 7.726871685381652e-06, + "loss": 0.4425, + "step": 5126 + }, + { + "epoch": 0.59, + "learning_rate": 7.72326709113607e-06, + "loss": 0.4543, + "step": 5127 + }, + { + "epoch": 0.59, + "learning_rate": 7.719662808853726e-06, + "loss": 0.454, + "step": 5128 + }, + { + "epoch": 0.59, + "learning_rate": 7.71605883902849e-06, + "loss": 0.4574, + "step": 5129 + }, + { + "epoch": 0.59, + "learning_rate": 7.712455182154186e-06, + "loss": 0.477, + "step": 5130 + }, + { + "epoch": 0.59, + "learning_rate": 7.708851838724592e-06, + "loss": 0.4669, + "step": 5131 + }, + { + "epoch": 0.59, + "learning_rate": 7.70524880923345e-06, + "loss": 0.4791, + "step": 5132 + }, + { + "epoch": 0.59, + "learning_rate": 7.701646094174451e-06, + "loss": 0.4468, + "step": 5133 + }, + { + "epoch": 0.59, + "learning_rate": 7.698043694041256e-06, + "loss": 0.4403, + "step": 5134 + }, + { + "epoch": 0.59, + "learning_rate": 7.694441609327465e-06, + "loss": 0.4693, + "step": 5135 + }, + { + "epoch": 0.59, + "learning_rate": 7.69083984052665e-06, + "loss": 0.4503, + "step": 5136 + }, + { + "epoch": 0.59, + "learning_rate": 7.68723838813233e-06, + "loss": 0.4546, + "step": 5137 + }, + { + "epoch": 0.59, + "learning_rate": 7.683637252637988e-06, + "loss": 0.474, + "step": 5138 + }, + { + "epoch": 0.59, + "learning_rate": 7.680036434537054e-06, + "loss": 0.4519, + "step": 5139 + }, + { + "epoch": 0.59, + "learning_rate": 7.676435934322925e-06, + "loss": 0.4492, + "step": 5140 + }, + { + "epoch": 0.59, + "learning_rate": 7.67283575248895e-06, + "loss": 0.4679, + "step": 5141 + }, + { + "epoch": 0.59, + "learning_rate": 7.669235889528436e-06, + "loss": 0.4641, + "step": 5142 + }, + { + "epoch": 0.59, + "learning_rate": 7.665636345934642e-06, + "loss": 0.4657, + 
"step": 5143 + }, + { + "epoch": 0.59, + "learning_rate": 7.662037122200783e-06, + "loss": 0.4692, + "step": 5144 + }, + { + "epoch": 0.59, + "learning_rate": 7.658438218820037e-06, + "loss": 0.4639, + "step": 5145 + }, + { + "epoch": 0.59, + "learning_rate": 7.65483963628553e-06, + "loss": 0.4528, + "step": 5146 + }, + { + "epoch": 0.59, + "learning_rate": 7.651241375090358e-06, + "loss": 0.4656, + "step": 5147 + }, + { + "epoch": 0.59, + "learning_rate": 7.647643435727555e-06, + "loss": 0.4515, + "step": 5148 + }, + { + "epoch": 0.59, + "learning_rate": 7.644045818690118e-06, + "loss": 0.4597, + "step": 5149 + }, + { + "epoch": 0.59, + "learning_rate": 7.640448524471002e-06, + "loss": 0.4585, + "step": 5150 + }, + { + "epoch": 0.59, + "learning_rate": 7.636851553563123e-06, + "loss": 0.4527, + "step": 5151 + }, + { + "epoch": 0.59, + "learning_rate": 7.633254906459335e-06, + "loss": 0.4651, + "step": 5152 + }, + { + "epoch": 0.59, + "learning_rate": 7.629658583652471e-06, + "loss": 0.4537, + "step": 5153 + }, + { + "epoch": 0.59, + "learning_rate": 7.6260625856352966e-06, + "loss": 0.4566, + "step": 5154 + }, + { + "epoch": 0.59, + "learning_rate": 7.6224669129005516e-06, + "loss": 0.4639, + "step": 5155 + }, + { + "epoch": 0.59, + "learning_rate": 7.6188715659409216e-06, + "loss": 0.4979, + "step": 5156 + }, + { + "epoch": 0.59, + "learning_rate": 7.615276545249046e-06, + "loss": 0.4343, + "step": 5157 + }, + { + "epoch": 0.59, + "learning_rate": 7.611681851317529e-06, + "loss": 0.4736, + "step": 5158 + }, + { + "epoch": 0.59, + "learning_rate": 7.608087484638915e-06, + "loss": 0.4522, + "step": 5159 + }, + { + "epoch": 0.59, + "learning_rate": 7.604493445705722e-06, + "loss": 0.4479, + "step": 5160 + }, + { + "epoch": 0.59, + "learning_rate": 7.60089973501041e-06, + "loss": 0.4702, + "step": 5161 + }, + { + "epoch": 0.59, + "learning_rate": 7.597306353045393e-06, + "loss": 0.4507, + "step": 5162 + }, + { + "epoch": 0.59, + "learning_rate": 
7.593713300303054e-06, + "loss": 0.4657, + "step": 5163 + }, + { + "epoch": 0.59, + "learning_rate": 7.590120577275713e-06, + "loss": 0.4633, + "step": 5164 + }, + { + "epoch": 0.59, + "learning_rate": 7.586528184455653e-06, + "loss": 0.4432, + "step": 5165 + }, + { + "epoch": 0.59, + "learning_rate": 7.58293612233512e-06, + "loss": 0.4541, + "step": 5166 + }, + { + "epoch": 0.59, + "learning_rate": 7.579344391406299e-06, + "loss": 0.4552, + "step": 5167 + }, + { + "epoch": 0.59, + "learning_rate": 7.575752992161345e-06, + "loss": 0.4491, + "step": 5168 + }, + { + "epoch": 0.59, + "learning_rate": 7.572161925092352e-06, + "loss": 0.4613, + "step": 5169 + }, + { + "epoch": 0.59, + "learning_rate": 7.568571190691378e-06, + "loss": 0.4477, + "step": 5170 + }, + { + "epoch": 0.59, + "learning_rate": 7.564980789450438e-06, + "loss": 0.4663, + "step": 5171 + }, + { + "epoch": 0.59, + "learning_rate": 7.5613907218614925e-06, + "loss": 0.4692, + "step": 5172 + }, + { + "epoch": 0.59, + "learning_rate": 7.5578009884164646e-06, + "loss": 0.4534, + "step": 5173 + }, + { + "epoch": 0.59, + "learning_rate": 7.554211589607227e-06, + "loss": 0.469, + "step": 5174 + }, + { + "epoch": 0.59, + "learning_rate": 7.5506225259256025e-06, + "loss": 0.4627, + "step": 5175 + }, + { + "epoch": 0.59, + "learning_rate": 7.547033797863382e-06, + "loss": 0.4678, + "step": 5176 + }, + { + "epoch": 0.59, + "learning_rate": 7.543445405912298e-06, + "loss": 0.4545, + "step": 5177 + }, + { + "epoch": 0.59, + "learning_rate": 7.5398573505640326e-06, + "loss": 0.4504, + "step": 5178 + }, + { + "epoch": 0.59, + "learning_rate": 7.536269632310238e-06, + "loss": 0.4522, + "step": 5179 + }, + { + "epoch": 0.59, + "learning_rate": 7.532682251642508e-06, + "loss": 0.4594, + "step": 5180 + }, + { + "epoch": 0.59, + "learning_rate": 7.5290952090523995e-06, + "loss": 0.4656, + "step": 5181 + }, + { + "epoch": 0.59, + "learning_rate": 7.525508505031412e-06, + "loss": 0.4779, + "step": 5182 + }, + { + "epoch": 
0.59, + "learning_rate": 7.521922140071003e-06, + "loss": 0.4321, + "step": 5183 + }, + { + "epoch": 0.59, + "learning_rate": 7.518336114662588e-06, + "loss": 0.4792, + "step": 5184 + }, + { + "epoch": 0.59, + "learning_rate": 7.514750429297528e-06, + "loss": 0.4704, + "step": 5185 + }, + { + "epoch": 0.59, + "learning_rate": 7.5111650844671515e-06, + "loss": 0.4456, + "step": 5186 + }, + { + "epoch": 0.59, + "learning_rate": 7.507580080662722e-06, + "loss": 0.4904, + "step": 5187 + }, + { + "epoch": 0.59, + "learning_rate": 7.5039954183754674e-06, + "loss": 0.4394, + "step": 5188 + }, + { + "epoch": 0.59, + "learning_rate": 7.5004110980965664e-06, + "loss": 0.467, + "step": 5189 + }, + { + "epoch": 0.59, + "learning_rate": 7.496827120317149e-06, + "loss": 0.4668, + "step": 5190 + }, + { + "epoch": 0.59, + "learning_rate": 7.493243485528306e-06, + "loss": 0.4549, + "step": 5191 + }, + { + "epoch": 0.59, + "learning_rate": 7.489660194221071e-06, + "loss": 0.4694, + "step": 5192 + }, + { + "epoch": 0.59, + "learning_rate": 7.486077246886435e-06, + "loss": 0.465, + "step": 5193 + }, + { + "epoch": 0.59, + "learning_rate": 7.4824946440153436e-06, + "loss": 0.4509, + "step": 5194 + }, + { + "epoch": 0.59, + "learning_rate": 7.478912386098692e-06, + "loss": 0.4735, + "step": 5195 + }, + { + "epoch": 0.59, + "learning_rate": 7.475330473627327e-06, + "loss": 0.4617, + "step": 5196 + }, + { + "epoch": 0.59, + "learning_rate": 7.471748907092056e-06, + "loss": 0.4542, + "step": 5197 + }, + { + "epoch": 0.59, + "learning_rate": 7.468167686983627e-06, + "loss": 0.4588, + "step": 5198 + }, + { + "epoch": 0.59, + "learning_rate": 7.464586813792757e-06, + "loss": 0.4637, + "step": 5199 + }, + { + "epoch": 0.59, + "learning_rate": 7.4610062880100985e-06, + "loss": 0.4593, + "step": 5200 + }, + { + "epoch": 0.59, + "learning_rate": 7.4574261101262604e-06, + "loss": 0.4585, + "step": 5201 + }, + { + "epoch": 0.59, + "learning_rate": 7.453846280631814e-06, + "loss": 0.457, + "step": 
5202 + }, + { + "epoch": 0.59, + "learning_rate": 7.45026680001727e-06, + "loss": 0.4588, + "step": 5203 + }, + { + "epoch": 0.59, + "learning_rate": 7.446687668773105e-06, + "loss": 0.4484, + "step": 5204 + }, + { + "epoch": 0.59, + "learning_rate": 7.443108887389735e-06, + "loss": 0.4708, + "step": 5205 + }, + { + "epoch": 0.59, + "learning_rate": 7.439530456357528e-06, + "loss": 0.452, + "step": 5206 + }, + { + "epoch": 0.6, + "learning_rate": 7.435952376166818e-06, + "loss": 0.4659, + "step": 5207 + }, + { + "epoch": 0.6, + "learning_rate": 7.432374647307878e-06, + "loss": 0.4689, + "step": 5208 + }, + { + "epoch": 0.6, + "learning_rate": 7.428797270270933e-06, + "loss": 0.4541, + "step": 5209 + }, + { + "epoch": 0.6, + "learning_rate": 7.425220245546172e-06, + "loss": 0.4608, + "step": 5210 + }, + { + "epoch": 0.6, + "learning_rate": 7.421643573623717e-06, + "loss": 0.4559, + "step": 5211 + }, + { + "epoch": 0.6, + "learning_rate": 7.41806725499366e-06, + "loss": 0.4525, + "step": 5212 + }, + { + "epoch": 0.6, + "learning_rate": 7.4144912901460355e-06, + "loss": 0.4628, + "step": 5213 + }, + { + "epoch": 0.6, + "learning_rate": 7.410915679570825e-06, + "loss": 0.4476, + "step": 5214 + }, + { + "epoch": 0.6, + "learning_rate": 7.407340423757974e-06, + "loss": 0.4521, + "step": 5215 + }, + { + "epoch": 0.6, + "learning_rate": 7.403765523197365e-06, + "loss": 0.4727, + "step": 5216 + }, + { + "epoch": 0.6, + "learning_rate": 7.4001909783788465e-06, + "loss": 0.4596, + "step": 5217 + }, + { + "epoch": 0.6, + "learning_rate": 7.396616789792208e-06, + "loss": 0.4606, + "step": 5218 + }, + { + "epoch": 0.6, + "learning_rate": 7.39304295792719e-06, + "loss": 0.485, + "step": 5219 + }, + { + "epoch": 0.6, + "learning_rate": 7.389469483273495e-06, + "loss": 0.4432, + "step": 5220 + }, + { + "epoch": 0.6, + "learning_rate": 7.385896366320761e-06, + "loss": 0.4672, + "step": 5221 + }, + { + "epoch": 0.6, + "learning_rate": 7.382323607558585e-06, + "loss": 0.4679, + 
"step": 5222 + }, + { + "epoch": 0.6, + "learning_rate": 7.378751207476522e-06, + "loss": 0.4712, + "step": 5223 + }, + { + "epoch": 0.6, + "learning_rate": 7.375179166564062e-06, + "loss": 0.4411, + "step": 5224 + }, + { + "epoch": 0.6, + "learning_rate": 7.3716074853106635e-06, + "loss": 0.4791, + "step": 5225 + }, + { + "epoch": 0.6, + "learning_rate": 7.368036164205719e-06, + "loss": 0.4627, + "step": 5226 + }, + { + "epoch": 0.6, + "learning_rate": 7.3644652037385785e-06, + "loss": 0.4696, + "step": 5227 + }, + { + "epoch": 0.6, + "learning_rate": 7.3608946043985515e-06, + "loss": 0.4502, + "step": 5228 + }, + { + "epoch": 0.6, + "learning_rate": 7.357324366674881e-06, + "loss": 0.4456, + "step": 5229 + }, + { + "epoch": 0.6, + "learning_rate": 7.353754491056776e-06, + "loss": 0.4576, + "step": 5230 + }, + { + "epoch": 0.6, + "learning_rate": 7.350184978033386e-06, + "loss": 0.4659, + "step": 5231 + }, + { + "epoch": 0.6, + "learning_rate": 7.34661582809381e-06, + "loss": 0.4441, + "step": 5232 + }, + { + "epoch": 0.6, + "learning_rate": 7.3430470417271106e-06, + "loss": 0.4653, + "step": 5233 + }, + { + "epoch": 0.6, + "learning_rate": 7.339478619422287e-06, + "loss": 0.4634, + "step": 5234 + }, + { + "epoch": 0.6, + "learning_rate": 7.335910561668286e-06, + "loss": 0.4618, + "step": 5235 + }, + { + "epoch": 0.6, + "learning_rate": 7.3323428689540184e-06, + "loss": 0.4568, + "step": 5236 + }, + { + "epoch": 0.6, + "learning_rate": 7.328775541768336e-06, + "loss": 0.4478, + "step": 5237 + }, + { + "epoch": 0.6, + "learning_rate": 7.3252085806000474e-06, + "loss": 0.4481, + "step": 5238 + }, + { + "epoch": 0.6, + "learning_rate": 7.3216419859379e-06, + "loss": 0.4719, + "step": 5239 + }, + { + "epoch": 0.6, + "learning_rate": 7.318075758270593e-06, + "loss": 0.4603, + "step": 5240 + }, + { + "epoch": 0.6, + "learning_rate": 7.314509898086788e-06, + "loss": 0.4593, + "step": 5241 + }, + { + "epoch": 0.6, + "learning_rate": 7.31094440587508e-06, + "loss": 0.4439, 
+ "step": 5242 + }, + { + "epoch": 0.6, + "learning_rate": 7.30737928212403e-06, + "loss": 0.4547, + "step": 5243 + }, + { + "epoch": 0.6, + "learning_rate": 7.303814527322132e-06, + "loss": 0.4527, + "step": 5244 + }, + { + "epoch": 0.6, + "learning_rate": 7.300250141957839e-06, + "loss": 0.4529, + "step": 5245 + }, + { + "epoch": 0.6, + "learning_rate": 7.296686126519552e-06, + "loss": 0.4366, + "step": 5246 + }, + { + "epoch": 0.6, + "learning_rate": 7.293122481495623e-06, + "loss": 0.4555, + "step": 5247 + }, + { + "epoch": 0.6, + "learning_rate": 7.2895592073743415e-06, + "loss": 0.4766, + "step": 5248 + }, + { + "epoch": 0.6, + "learning_rate": 7.2859963046439665e-06, + "loss": 0.4804, + "step": 5249 + }, + { + "epoch": 0.6, + "learning_rate": 7.282433773792689e-06, + "loss": 0.4415, + "step": 5250 + }, + { + "epoch": 0.6, + "learning_rate": 7.2788716153086604e-06, + "loss": 0.4475, + "step": 5251 + }, + { + "epoch": 0.6, + "learning_rate": 7.275309829679973e-06, + "loss": 0.479, + "step": 5252 + }, + { + "epoch": 0.6, + "learning_rate": 7.271748417394668e-06, + "loss": 0.4674, + "step": 5253 + }, + { + "epoch": 0.6, + "learning_rate": 7.2681873789407435e-06, + "loss": 0.4474, + "step": 5254 + }, + { + "epoch": 0.6, + "learning_rate": 7.264626714806135e-06, + "loss": 0.4562, + "step": 5255 + }, + { + "epoch": 0.6, + "learning_rate": 7.2610664254787425e-06, + "loss": 0.4435, + "step": 5256 + }, + { + "epoch": 0.6, + "learning_rate": 7.257506511446398e-06, + "loss": 0.4784, + "step": 5257 + }, + { + "epoch": 0.6, + "learning_rate": 7.253946973196888e-06, + "loss": 0.4483, + "step": 5258 + }, + { + "epoch": 0.6, + "learning_rate": 7.2503878112179534e-06, + "loss": 0.4493, + "step": 5259 + }, + { + "epoch": 0.6, + "learning_rate": 7.246829025997279e-06, + "loss": 0.4603, + "step": 5260 + }, + { + "epoch": 0.6, + "learning_rate": 7.243270618022492e-06, + "loss": 0.4749, + "step": 5261 + }, + { + "epoch": 0.6, + "learning_rate": 7.2397125877811816e-06, + "loss": 
0.4541, + "step": 5262 + }, + { + "epoch": 0.6, + "learning_rate": 7.2361549357608685e-06, + "loss": 0.4485, + "step": 5263 + }, + { + "epoch": 0.6, + "learning_rate": 7.232597662449038e-06, + "loss": 0.4716, + "step": 5264 + }, + { + "epoch": 0.6, + "learning_rate": 7.2290407683331154e-06, + "loss": 0.4592, + "step": 5265 + }, + { + "epoch": 0.6, + "learning_rate": 7.225484253900468e-06, + "loss": 0.4722, + "step": 5266 + }, + { + "epoch": 0.6, + "learning_rate": 7.221928119638426e-06, + "loss": 0.4733, + "step": 5267 + }, + { + "epoch": 0.6, + "learning_rate": 7.218372366034252e-06, + "loss": 0.4663, + "step": 5268 + }, + { + "epoch": 0.6, + "learning_rate": 7.214816993575168e-06, + "loss": 0.4417, + "step": 5269 + }, + { + "epoch": 0.6, + "learning_rate": 7.211262002748341e-06, + "loss": 0.4623, + "step": 5270 + }, + { + "epoch": 0.6, + "learning_rate": 7.207707394040877e-06, + "loss": 0.4508, + "step": 5271 + }, + { + "epoch": 0.6, + "learning_rate": 7.2041531679398445e-06, + "loss": 0.4486, + "step": 5272 + }, + { + "epoch": 0.6, + "learning_rate": 7.200599324932246e-06, + "loss": 0.4616, + "step": 5273 + }, + { + "epoch": 0.6, + "learning_rate": 7.197045865505041e-06, + "loss": 0.464, + "step": 5274 + }, + { + "epoch": 0.6, + "learning_rate": 7.193492790145135e-06, + "loss": 0.4784, + "step": 5275 + }, + { + "epoch": 0.6, + "learning_rate": 7.18994009933937e-06, + "loss": 0.441, + "step": 5276 + }, + { + "epoch": 0.6, + "learning_rate": 7.186387793574554e-06, + "loss": 0.4565, + "step": 5277 + }, + { + "epoch": 0.6, + "learning_rate": 7.182835873337425e-06, + "loss": 0.4541, + "step": 5278 + }, + { + "epoch": 0.6, + "learning_rate": 7.179284339114676e-06, + "loss": 0.4627, + "step": 5279 + }, + { + "epoch": 0.6, + "learning_rate": 7.175733191392952e-06, + "loss": 0.4565, + "step": 5280 + }, + { + "epoch": 0.6, + "learning_rate": 7.172182430658832e-06, + "loss": 0.4737, + "step": 5281 + }, + { + "epoch": 0.6, + "learning_rate": 7.168632057398857e-06, + "loss": 
0.4672, + "step": 5282 + }, + { + "epoch": 0.6, + "learning_rate": 7.165082072099503e-06, + "loss": 0.453, + "step": 5283 + }, + { + "epoch": 0.6, + "learning_rate": 7.161532475247195e-06, + "loss": 0.4718, + "step": 5284 + }, + { + "epoch": 0.6, + "learning_rate": 7.157983267328314e-06, + "loss": 0.4627, + "step": 5285 + }, + { + "epoch": 0.6, + "learning_rate": 7.1544344488291725e-06, + "loss": 0.4551, + "step": 5286 + }, + { + "epoch": 0.6, + "learning_rate": 7.150886020236045e-06, + "loss": 0.4413, + "step": 5287 + }, + { + "epoch": 0.6, + "learning_rate": 7.147337982035143e-06, + "loss": 0.4505, + "step": 5288 + }, + { + "epoch": 0.6, + "learning_rate": 7.143790334712623e-06, + "loss": 0.4672, + "step": 5289 + }, + { + "epoch": 0.6, + "learning_rate": 7.140243078754601e-06, + "loss": 0.4728, + "step": 5290 + }, + { + "epoch": 0.6, + "learning_rate": 7.136696214647123e-06, + "loss": 0.4598, + "step": 5291 + }, + { + "epoch": 0.6, + "learning_rate": 7.133149742876187e-06, + "loss": 0.4481, + "step": 5292 + }, + { + "epoch": 0.6, + "learning_rate": 7.129603663927742e-06, + "loss": 0.4895, + "step": 5293 + }, + { + "epoch": 0.61, + "learning_rate": 7.12605797828768e-06, + "loss": 0.4599, + "step": 5294 + }, + { + "epoch": 0.61, + "learning_rate": 7.1225126864418425e-06, + "loss": 0.4663, + "step": 5295 + }, + { + "epoch": 0.61, + "learning_rate": 7.118967788876011e-06, + "loss": 0.4485, + "step": 5296 + }, + { + "epoch": 0.61, + "learning_rate": 7.11542328607591e-06, + "loss": 0.4594, + "step": 5297 + }, + { + "epoch": 0.61, + "learning_rate": 7.111879178527223e-06, + "loss": 0.4633, + "step": 5298 + }, + { + "epoch": 0.61, + "learning_rate": 7.108335466715566e-06, + "loss": 0.46, + "step": 5299 + }, + { + "epoch": 0.61, + "learning_rate": 7.104792151126515e-06, + "loss": 0.4562, + "step": 5300 + }, + { + "epoch": 0.61, + "learning_rate": 7.101249232245576e-06, + "loss": 0.4664, + "step": 5301 + }, + { + "epoch": 0.61, + "learning_rate": 7.09770671055821e-06, + 
"loss": 0.4507, + "step": 5302 + }, + { + "epoch": 0.61, + "learning_rate": 7.094164586549821e-06, + "loss": 0.4531, + "step": 5303 + }, + { + "epoch": 0.61, + "learning_rate": 7.090622860705764e-06, + "loss": 0.4541, + "step": 5304 + }, + { + "epoch": 0.61, + "learning_rate": 7.087081533511324e-06, + "loss": 0.4608, + "step": 5305 + }, + { + "epoch": 0.61, + "learning_rate": 7.0835406054517505e-06, + "loss": 0.4726, + "step": 5306 + }, + { + "epoch": 0.61, + "learning_rate": 7.080000077012229e-06, + "loss": 0.4511, + "step": 5307 + }, + { + "epoch": 0.61, + "learning_rate": 7.076459948677889e-06, + "loss": 0.4427, + "step": 5308 + }, + { + "epoch": 0.61, + "learning_rate": 7.072920220933808e-06, + "loss": 0.4556, + "step": 5309 + }, + { + "epoch": 0.61, + "learning_rate": 7.069380894265004e-06, + "loss": 0.4427, + "step": 5310 + }, + { + "epoch": 0.61, + "learning_rate": 7.06584196915645e-06, + "loss": 0.477, + "step": 5311 + }, + { + "epoch": 0.61, + "learning_rate": 7.062303446093051e-06, + "loss": 0.4334, + "step": 5312 + }, + { + "epoch": 0.61, + "learning_rate": 7.058765325559673e-06, + "loss": 0.4567, + "step": 5313 + }, + { + "epoch": 0.61, + "learning_rate": 7.055227608041111e-06, + "loss": 0.4657, + "step": 5314 + }, + { + "epoch": 0.61, + "learning_rate": 7.051690294022108e-06, + "loss": 0.4766, + "step": 5315 + }, + { + "epoch": 0.61, + "learning_rate": 7.0481533839873614e-06, + "loss": 0.4642, + "step": 5316 + }, + { + "epoch": 0.61, + "learning_rate": 7.044616878421506e-06, + "loss": 0.484, + "step": 5317 + }, + { + "epoch": 0.61, + "learning_rate": 7.041080777809118e-06, + "loss": 0.4496, + "step": 5318 + }, + { + "epoch": 0.61, + "learning_rate": 7.037545082634729e-06, + "loss": 0.456, + "step": 5319 + }, + { + "epoch": 0.61, + "learning_rate": 7.034009793382799e-06, + "loss": 0.4493, + "step": 5320 + }, + { + "epoch": 0.61, + "learning_rate": 7.030474910537748e-06, + "loss": 0.4484, + "step": 5321 + }, + { + "epoch": 0.61, + "learning_rate": 
7.0269404345839356e-06, + "loss": 0.4352, + "step": 5322 + }, + { + "epoch": 0.61, + "learning_rate": 7.023406366005655e-06, + "loss": 0.4687, + "step": 5323 + }, + { + "epoch": 0.61, + "learning_rate": 7.019872705287163e-06, + "loss": 0.468, + "step": 5324 + }, + { + "epoch": 0.61, + "learning_rate": 7.016339452912642e-06, + "loss": 0.4487, + "step": 5325 + }, + { + "epoch": 0.61, + "learning_rate": 7.012806609366231e-06, + "loss": 0.4607, + "step": 5326 + }, + { + "epoch": 0.61, + "learning_rate": 7.009274175132009e-06, + "loss": 0.453, + "step": 5327 + }, + { + "epoch": 0.61, + "learning_rate": 7.005742150693993e-06, + "loss": 0.4402, + "step": 5328 + }, + { + "epoch": 0.61, + "learning_rate": 7.0022105365361555e-06, + "loss": 0.4583, + "step": 5329 + }, + { + "epoch": 0.61, + "learning_rate": 6.998679333142403e-06, + "loss": 0.4477, + "step": 5330 + }, + { + "epoch": 0.61, + "learning_rate": 6.995148540996588e-06, + "loss": 0.467, + "step": 5331 + }, + { + "epoch": 0.61, + "learning_rate": 6.991618160582512e-06, + "loss": 0.4758, + "step": 5332 + }, + { + "epoch": 0.61, + "learning_rate": 6.9880881923839105e-06, + "loss": 0.4547, + "step": 5333 + }, + { + "epoch": 0.61, + "learning_rate": 6.9845586368844755e-06, + "loss": 0.4476, + "step": 5334 + }, + { + "epoch": 0.61, + "learning_rate": 6.981029494567828e-06, + "loss": 0.4614, + "step": 5335 + }, + { + "epoch": 0.61, + "learning_rate": 6.97750076591754e-06, + "loss": 0.4657, + "step": 5336 + }, + { + "epoch": 0.61, + "learning_rate": 6.973972451417132e-06, + "loss": 0.4678, + "step": 5337 + }, + { + "epoch": 0.61, + "learning_rate": 6.9704445515500544e-06, + "loss": 0.4482, + "step": 5338 + }, + { + "epoch": 0.61, + "learning_rate": 6.966917066799714e-06, + "loss": 0.4502, + "step": 5339 + }, + { + "epoch": 0.61, + "learning_rate": 6.9633899976494525e-06, + "loss": 0.4655, + "step": 5340 + }, + { + "epoch": 0.61, + "learning_rate": 6.959863344582554e-06, + "loss": 0.4492, + "step": 5341 + }, + { + "epoch": 
0.61, + "learning_rate": 6.956337108082256e-06, + "loss": 0.4732, + "step": 5342 + }, + { + "epoch": 0.61, + "learning_rate": 6.952811288631728e-06, + "loss": 0.4797, + "step": 5343 + }, + { + "epoch": 0.61, + "learning_rate": 6.949285886714081e-06, + "loss": 0.4374, + "step": 5344 + }, + { + "epoch": 0.61, + "learning_rate": 6.9457609028123795e-06, + "loss": 0.4326, + "step": 5345 + }, + { + "epoch": 0.61, + "learning_rate": 6.942236337409623e-06, + "loss": 0.4609, + "step": 5346 + }, + { + "epoch": 0.61, + "learning_rate": 6.93871219098876e-06, + "loss": 0.4855, + "step": 5347 + }, + { + "epoch": 0.61, + "learning_rate": 6.935188464032674e-06, + "loss": 0.4507, + "step": 5348 + }, + { + "epoch": 0.61, + "learning_rate": 6.93166515702419e-06, + "loss": 0.4745, + "step": 5349 + }, + { + "epoch": 0.61, + "learning_rate": 6.928142270446086e-06, + "loss": 0.4519, + "step": 5350 + }, + { + "epoch": 0.61, + "learning_rate": 6.924619804781069e-06, + "loss": 0.457, + "step": 5351 + }, + { + "epoch": 0.61, + "learning_rate": 6.921097760511807e-06, + "loss": 0.442, + "step": 5352 + }, + { + "epoch": 0.61, + "learning_rate": 6.917576138120892e-06, + "loss": 0.4732, + "step": 5353 + }, + { + "epoch": 0.61, + "learning_rate": 6.91405493809086e-06, + "loss": 0.4478, + "step": 5354 + }, + { + "epoch": 0.61, + "learning_rate": 6.9105341609042e-06, + "loss": 0.4681, + "step": 5355 + }, + { + "epoch": 0.61, + "learning_rate": 6.907013807043335e-06, + "loss": 0.4417, + "step": 5356 + }, + { + "epoch": 0.61, + "learning_rate": 6.903493876990637e-06, + "loss": 0.4551, + "step": 5357 + }, + { + "epoch": 0.61, + "learning_rate": 6.899974371228409e-06, + "loss": 0.4597, + "step": 5358 + }, + { + "epoch": 0.61, + "learning_rate": 6.896455290238902e-06, + "loss": 0.459, + "step": 5359 + }, + { + "epoch": 0.61, + "learning_rate": 6.892936634504313e-06, + "loss": 0.4587, + "step": 5360 + }, + { + "epoch": 0.61, + "learning_rate": 6.889418404506774e-06, + "loss": 0.467, + "step": 5361 + }, + 
{ + "epoch": 0.61, + "learning_rate": 6.885900600728358e-06, + "loss": 0.4496, + "step": 5362 + }, + { + "epoch": 0.61, + "learning_rate": 6.882383223651088e-06, + "loss": 0.4505, + "step": 5363 + }, + { + "epoch": 0.61, + "learning_rate": 6.878866273756919e-06, + "loss": 0.4636, + "step": 5364 + }, + { + "epoch": 0.61, + "learning_rate": 6.8753497515277555e-06, + "loss": 0.4444, + "step": 5365 + }, + { + "epoch": 0.61, + "learning_rate": 6.871833657445438e-06, + "loss": 0.4537, + "step": 5366 + }, + { + "epoch": 0.61, + "learning_rate": 6.8683179919917465e-06, + "loss": 0.4657, + "step": 5367 + }, + { + "epoch": 0.61, + "learning_rate": 6.8648027556484095e-06, + "loss": 0.4502, + "step": 5368 + }, + { + "epoch": 0.61, + "learning_rate": 6.861287948897091e-06, + "loss": 0.4813, + "step": 5369 + }, + { + "epoch": 0.61, + "learning_rate": 6.857773572219402e-06, + "loss": 0.4466, + "step": 5370 + }, + { + "epoch": 0.61, + "learning_rate": 6.854259626096888e-06, + "loss": 0.4636, + "step": 5371 + }, + { + "epoch": 0.61, + "learning_rate": 6.850746111011034e-06, + "loss": 0.4505, + "step": 5372 + }, + { + "epoch": 0.61, + "learning_rate": 6.847233027443274e-06, + "loss": 0.4592, + "step": 5373 + }, + { + "epoch": 0.61, + "learning_rate": 6.843720375874983e-06, + "loss": 0.4563, + "step": 5374 + }, + { + "epoch": 0.61, + "learning_rate": 6.8402081567874625e-06, + "loss": 0.4667, + "step": 5375 + }, + { + "epoch": 0.61, + "learning_rate": 6.836696370661975e-06, + "loss": 0.4653, + "step": 5376 + }, + { + "epoch": 0.61, + "learning_rate": 6.833185017979704e-06, + "loss": 0.4613, + "step": 5377 + }, + { + "epoch": 0.61, + "learning_rate": 6.8296740992217915e-06, + "loss": 0.4702, + "step": 5378 + }, + { + "epoch": 0.61, + "learning_rate": 6.82616361486931e-06, + "loss": 0.4558, + "step": 5379 + }, + { + "epoch": 0.61, + "learning_rate": 6.82265356540327e-06, + "loss": 0.4441, + "step": 5380 + }, + { + "epoch": 0.61, + "learning_rate": 6.819143951304632e-06, + "loss": 
0.4734, + "step": 5381 + }, + { + "epoch": 0.62, + "learning_rate": 6.815634773054286e-06, + "loss": 0.4551, + "step": 5382 + }, + { + "epoch": 0.62, + "learning_rate": 6.812126031133073e-06, + "loss": 0.4691, + "step": 5383 + }, + { + "epoch": 0.62, + "learning_rate": 6.8086177260217675e-06, + "loss": 0.4692, + "step": 5384 + }, + { + "epoch": 0.62, + "learning_rate": 6.8051098582010825e-06, + "loss": 0.4457, + "step": 5385 + }, + { + "epoch": 0.62, + "learning_rate": 6.801602428151679e-06, + "loss": 0.4683, + "step": 5386 + }, + { + "epoch": 0.62, + "learning_rate": 6.7980954363541506e-06, + "loss": 0.4593, + "step": 5387 + }, + { + "epoch": 0.62, + "learning_rate": 6.79458888328903e-06, + "loss": 0.4525, + "step": 5388 + }, + { + "epoch": 0.62, + "learning_rate": 6.791082769436801e-06, + "loss": 0.4436, + "step": 5389 + }, + { + "epoch": 0.62, + "learning_rate": 6.787577095277873e-06, + "loss": 0.4504, + "step": 5390 + }, + { + "epoch": 0.62, + "learning_rate": 6.784071861292607e-06, + "loss": 0.4928, + "step": 5391 + }, + { + "epoch": 0.62, + "learning_rate": 6.780567067961293e-06, + "loss": 0.4695, + "step": 5392 + }, + { + "epoch": 0.62, + "learning_rate": 6.777062715764166e-06, + "loss": 0.4518, + "step": 5393 + }, + { + "epoch": 0.62, + "learning_rate": 6.773558805181408e-06, + "loss": 0.4454, + "step": 5394 + }, + { + "epoch": 0.62, + "learning_rate": 6.770055336693123e-06, + "loss": 0.4695, + "step": 5395 + }, + { + "epoch": 0.62, + "learning_rate": 6.766552310779374e-06, + "loss": 0.4315, + "step": 5396 + }, + { + "epoch": 0.62, + "learning_rate": 6.763049727920145e-06, + "loss": 0.454, + "step": 5397 + }, + { + "epoch": 0.62, + "learning_rate": 6.759547588595372e-06, + "loss": 0.4592, + "step": 5398 + }, + { + "epoch": 0.62, + "learning_rate": 6.7560458932849306e-06, + "loss": 0.457, + "step": 5399 + }, + { + "epoch": 0.62, + "learning_rate": 6.752544642468626e-06, + "loss": 0.4682, + "step": 5400 + }, + { + "epoch": 0.62, + "learning_rate": 
6.749043836626203e-06, + "loss": 0.4561, + "step": 5401 + }, + { + "epoch": 0.62, + "learning_rate": 6.74554347623736e-06, + "loss": 0.4611, + "step": 5402 + }, + { + "epoch": 0.62, + "learning_rate": 6.742043561781717e-06, + "loss": 0.4398, + "step": 5403 + }, + { + "epoch": 0.62, + "learning_rate": 6.738544093738848e-06, + "loss": 0.4694, + "step": 5404 + }, + { + "epoch": 0.62, + "learning_rate": 6.735045072588256e-06, + "loss": 0.4432, + "step": 5405 + }, + { + "epoch": 0.62, + "learning_rate": 6.731546498809376e-06, + "loss": 0.4421, + "step": 5406 + }, + { + "epoch": 0.62, + "learning_rate": 6.7280483728816016e-06, + "loss": 0.4487, + "step": 5407 + }, + { + "epoch": 0.62, + "learning_rate": 6.724550695284247e-06, + "loss": 0.4678, + "step": 5408 + }, + { + "epoch": 0.62, + "learning_rate": 6.72105346649658e-06, + "loss": 0.4643, + "step": 5409 + }, + { + "epoch": 0.62, + "learning_rate": 6.717556686997795e-06, + "loss": 0.4759, + "step": 5410 + }, + { + "epoch": 0.62, + "learning_rate": 6.714060357267023e-06, + "loss": 0.4702, + "step": 5411 + }, + { + "epoch": 0.62, + "learning_rate": 6.7105644777833475e-06, + "loss": 0.4544, + "step": 5412 + }, + { + "epoch": 0.62, + "learning_rate": 6.707069049025781e-06, + "loss": 0.4695, + "step": 5413 + }, + { + "epoch": 0.62, + "learning_rate": 6.703574071473269e-06, + "loss": 0.4528, + "step": 5414 + }, + { + "epoch": 0.62, + "learning_rate": 6.700079545604707e-06, + "loss": 0.4569, + "step": 5415 + }, + { + "epoch": 0.62, + "learning_rate": 6.696585471898922e-06, + "loss": 0.4543, + "step": 5416 + }, + { + "epoch": 0.62, + "learning_rate": 6.693091850834681e-06, + "loss": 0.4681, + "step": 5417 + }, + { + "epoch": 0.62, + "learning_rate": 6.6895986828906886e-06, + "loss": 0.4582, + "step": 5418 + }, + { + "epoch": 0.62, + "learning_rate": 6.686105968545582e-06, + "loss": 0.4586, + "step": 5419 + }, + { + "epoch": 0.62, + "learning_rate": 6.682613708277945e-06, + "loss": 0.4444, + "step": 5420 + }, + { + "epoch": 
0.62, + "learning_rate": 6.679121902566294e-06, + "loss": 0.4541, + "step": 5421 + }, + { + "epoch": 0.62, + "learning_rate": 6.675630551889088e-06, + "loss": 0.4553, + "step": 5422 + }, + { + "epoch": 0.62, + "learning_rate": 6.672139656724715e-06, + "loss": 0.467, + "step": 5423 + }, + { + "epoch": 0.62, + "learning_rate": 6.668649217551505e-06, + "loss": 0.4665, + "step": 5424 + }, + { + "epoch": 0.62, + "learning_rate": 6.665159234847731e-06, + "loss": 0.4776, + "step": 5425 + }, + { + "epoch": 0.62, + "learning_rate": 6.6616697090915975e-06, + "loss": 0.4475, + "step": 5426 + }, + { + "epoch": 0.62, + "learning_rate": 6.658180640761241e-06, + "loss": 0.4668, + "step": 5427 + }, + { + "epoch": 0.62, + "learning_rate": 6.654692030334753e-06, + "loss": 0.4912, + "step": 5428 + }, + { + "epoch": 0.62, + "learning_rate": 6.651203878290139e-06, + "loss": 0.4346, + "step": 5429 + }, + { + "epoch": 0.62, + "learning_rate": 6.647716185105362e-06, + "loss": 0.4658, + "step": 5430 + }, + { + "epoch": 0.62, + "learning_rate": 6.644228951258313e-06, + "loss": 0.4585, + "step": 5431 + }, + { + "epoch": 0.62, + "learning_rate": 6.640742177226816e-06, + "loss": 0.4507, + "step": 5432 + }, + { + "epoch": 0.62, + "learning_rate": 6.637255863488643e-06, + "loss": 0.4771, + "step": 5433 + }, + { + "epoch": 0.62, + "learning_rate": 6.63377001052149e-06, + "loss": 0.4594, + "step": 5434 + }, + { + "epoch": 0.62, + "learning_rate": 6.630284618803003e-06, + "loss": 0.4394, + "step": 5435 + }, + { + "epoch": 0.62, + "learning_rate": 6.626799688810759e-06, + "loss": 0.4817, + "step": 5436 + }, + { + "epoch": 0.62, + "learning_rate": 6.623315221022263e-06, + "loss": 0.4329, + "step": 5437 + }, + { + "epoch": 0.62, + "learning_rate": 6.619831215914974e-06, + "loss": 0.4537, + "step": 5438 + }, + { + "epoch": 0.62, + "learning_rate": 6.6163476739662724e-06, + "loss": 0.449, + "step": 5439 + }, + { + "epoch": 0.62, + "learning_rate": 6.612864595653483e-06, + "loss": 0.4501, + "step": 5440 
+ }, + { + "epoch": 0.62, + "learning_rate": 6.609381981453869e-06, + "loss": 0.4529, + "step": 5441 + }, + { + "epoch": 0.62, + "learning_rate": 6.60589983184462e-06, + "loss": 0.4821, + "step": 5442 + }, + { + "epoch": 0.62, + "learning_rate": 6.602418147302874e-06, + "loss": 0.443, + "step": 5443 + }, + { + "epoch": 0.62, + "learning_rate": 6.598936928305695e-06, + "loss": 0.4841, + "step": 5444 + }, + { + "epoch": 0.62, + "learning_rate": 6.5954561753300885e-06, + "loss": 0.4656, + "step": 5445 + }, + { + "epoch": 0.62, + "learning_rate": 6.591975888852998e-06, + "loss": 0.4388, + "step": 5446 + }, + { + "epoch": 0.62, + "learning_rate": 6.5884960693512965e-06, + "loss": 0.463, + "step": 5447 + }, + { + "epoch": 0.62, + "learning_rate": 6.585016717301805e-06, + "loss": 0.4715, + "step": 5448 + }, + { + "epoch": 0.62, + "learning_rate": 6.581537833181262e-06, + "loss": 0.4748, + "step": 5449 + }, + { + "epoch": 0.62, + "learning_rate": 6.578059417466356e-06, + "loss": 0.45, + "step": 5450 + }, + { + "epoch": 0.62, + "learning_rate": 6.5745814706337115e-06, + "loss": 0.4486, + "step": 5451 + }, + { + "epoch": 0.62, + "learning_rate": 6.57110399315988e-06, + "loss": 0.4499, + "step": 5452 + }, + { + "epoch": 0.62, + "learning_rate": 6.5676269855213585e-06, + "loss": 0.4558, + "step": 5453 + }, + { + "epoch": 0.62, + "learning_rate": 6.56415044819457e-06, + "loss": 0.4761, + "step": 5454 + }, + { + "epoch": 0.62, + "learning_rate": 6.560674381655876e-06, + "loss": 0.4435, + "step": 5455 + }, + { + "epoch": 0.62, + "learning_rate": 6.557198786381584e-06, + "loss": 0.4555, + "step": 5456 + }, + { + "epoch": 0.62, + "learning_rate": 6.553723662847924e-06, + "loss": 0.4568, + "step": 5457 + }, + { + "epoch": 0.62, + "learning_rate": 6.550249011531058e-06, + "loss": 0.4636, + "step": 5458 + }, + { + "epoch": 0.62, + "learning_rate": 6.546774832907101e-06, + "loss": 0.461, + "step": 5459 + }, + { + "epoch": 0.62, + "learning_rate": 6.543301127452086e-06, + "loss": 
0.4682, + "step": 5460 + }, + { + "epoch": 0.62, + "learning_rate": 6.539827895641997e-06, + "loss": 0.4469, + "step": 5461 + }, + { + "epoch": 0.62, + "learning_rate": 6.536355137952737e-06, + "loss": 0.495, + "step": 5462 + }, + { + "epoch": 0.62, + "learning_rate": 6.53288285486015e-06, + "loss": 0.4454, + "step": 5463 + }, + { + "epoch": 0.62, + "learning_rate": 6.52941104684002e-06, + "loss": 0.4627, + "step": 5464 + }, + { + "epoch": 0.62, + "learning_rate": 6.52593971436806e-06, + "loss": 0.4606, + "step": 5465 + }, + { + "epoch": 0.62, + "learning_rate": 6.522468857919926e-06, + "loss": 0.46, + "step": 5466 + }, + { + "epoch": 0.62, + "learning_rate": 6.518998477971199e-06, + "loss": 0.4615, + "step": 5467 + }, + { + "epoch": 0.62, + "learning_rate": 6.515528574997394e-06, + "loss": 0.4869, + "step": 5468 + }, + { + "epoch": 0.63, + "learning_rate": 6.512059149473971e-06, + "loss": 0.4803, + "step": 5469 + }, + { + "epoch": 0.63, + "learning_rate": 6.508590201876317e-06, + "loss": 0.4653, + "step": 5470 + }, + { + "epoch": 0.63, + "learning_rate": 6.5051217326797535e-06, + "loss": 0.4486, + "step": 5471 + }, + { + "epoch": 0.63, + "learning_rate": 6.501653742359539e-06, + "loss": 0.4593, + "step": 5472 + }, + { + "epoch": 0.63, + "learning_rate": 6.49818623139087e-06, + "loss": 0.4658, + "step": 5473 + }, + { + "epoch": 0.63, + "learning_rate": 6.494719200248867e-06, + "loss": 0.4547, + "step": 5474 + }, + { + "epoch": 0.63, + "learning_rate": 6.491252649408596e-06, + "loss": 0.4492, + "step": 5475 + }, + { + "epoch": 0.63, + "learning_rate": 6.4877865793450445e-06, + "loss": 0.4601, + "step": 5476 + }, + { + "epoch": 0.63, + "learning_rate": 6.484320990533148e-06, + "loss": 0.4599, + "step": 5477 + }, + { + "epoch": 0.63, + "learning_rate": 6.480855883447767e-06, + "loss": 0.4706, + "step": 5478 + }, + { + "epoch": 0.63, + "learning_rate": 6.4773912585637e-06, + "loss": 0.4422, + "step": 5479 + }, + { + "epoch": 0.63, + "learning_rate": 
6.473927116355678e-06, + "loss": 0.4583, + "step": 5480 + }, + { + "epoch": 0.63, + "learning_rate": 6.4704634572983615e-06, + "loss": 0.442, + "step": 5481 + }, + { + "epoch": 0.63, + "learning_rate": 6.4670002818663535e-06, + "loss": 0.4692, + "step": 5482 + }, + { + "epoch": 0.63, + "learning_rate": 6.463537590534188e-06, + "loss": 0.4508, + "step": 5483 + }, + { + "epoch": 0.63, + "learning_rate": 6.4600753837763255e-06, + "loss": 0.4677, + "step": 5484 + }, + { + "epoch": 0.63, + "learning_rate": 6.4566136620671705e-06, + "loss": 0.4529, + "step": 5485 + }, + { + "epoch": 0.63, + "learning_rate": 6.453152425881051e-06, + "loss": 0.4515, + "step": 5486 + }, + { + "epoch": 0.63, + "learning_rate": 6.4496916756922375e-06, + "loss": 0.4291, + "step": 5487 + }, + { + "epoch": 0.63, + "learning_rate": 6.4462314119749315e-06, + "loss": 0.464, + "step": 5488 + }, + { + "epoch": 0.63, + "learning_rate": 6.44277163520326e-06, + "loss": 0.4463, + "step": 5489 + }, + { + "epoch": 0.63, + "learning_rate": 6.439312345851298e-06, + "loss": 0.4595, + "step": 5490 + }, + { + "epoch": 0.63, + "learning_rate": 6.435853544393038e-06, + "loss": 0.4468, + "step": 5491 + }, + { + "epoch": 0.63, + "learning_rate": 6.432395231302418e-06, + "loss": 0.451, + "step": 5492 + }, + { + "epoch": 0.63, + "learning_rate": 6.428937407053304e-06, + "loss": 0.4634, + "step": 5493 + }, + { + "epoch": 0.63, + "learning_rate": 6.425480072119488e-06, + "loss": 0.459, + "step": 5494 + }, + { + "epoch": 0.63, + "learning_rate": 6.422023226974713e-06, + "loss": 0.4458, + "step": 5495 + }, + { + "epoch": 0.63, + "learning_rate": 6.4185668720926365e-06, + "loss": 0.455, + "step": 5496 + }, + { + "epoch": 0.63, + "learning_rate": 6.4151110079468545e-06, + "loss": 0.4607, + "step": 5497 + }, + { + "epoch": 0.63, + "learning_rate": 6.411655635010907e-06, + "loss": 0.4528, + "step": 5498 + }, + { + "epoch": 0.63, + "learning_rate": 6.4082007537582465e-06, + "loss": 0.4468, + "step": 5499 + }, + { + "epoch": 
0.63, + "learning_rate": 6.40474636466228e-06, + "loss": 0.458, + "step": 5500 + }, + { + "epoch": 0.63, + "learning_rate": 6.4012924681963255e-06, + "loss": 0.4821, + "step": 5501 + }, + { + "epoch": 0.63, + "learning_rate": 6.397839064833647e-06, + "loss": 0.458, + "step": 5502 + }, + { + "epoch": 0.63, + "learning_rate": 6.394386155047443e-06, + "loss": 0.4543, + "step": 5503 + }, + { + "epoch": 0.63, + "learning_rate": 6.39093373931083e-06, + "loss": 0.4515, + "step": 5504 + }, + { + "epoch": 0.63, + "learning_rate": 6.387481818096877e-06, + "loss": 0.4509, + "step": 5505 + }, + { + "epoch": 0.63, + "learning_rate": 6.384030391878566e-06, + "loss": 0.4444, + "step": 5506 + }, + { + "epoch": 0.63, + "learning_rate": 6.38057946112882e-06, + "loss": 0.4718, + "step": 5507 + }, + { + "epoch": 0.63, + "learning_rate": 6.3771290263205e-06, + "loss": 0.4506, + "step": 5508 + }, + { + "epoch": 0.63, + "learning_rate": 6.373679087926388e-06, + "loss": 0.4686, + "step": 5509 + }, + { + "epoch": 0.63, + "learning_rate": 6.370229646419199e-06, + "loss": 0.4743, + "step": 5510 + }, + { + "epoch": 0.63, + "learning_rate": 6.366780702271589e-06, + "loss": 0.458, + "step": 5511 + }, + { + "epoch": 0.63, + "learning_rate": 6.363332255956136e-06, + "loss": 0.4525, + "step": 5512 + }, + { + "epoch": 0.63, + "learning_rate": 6.359884307945363e-06, + "loss": 0.4658, + "step": 5513 + }, + { + "epoch": 0.63, + "learning_rate": 6.356436858711708e-06, + "loss": 0.4587, + "step": 5514 + }, + { + "epoch": 0.63, + "learning_rate": 6.352989908727546e-06, + "loss": 0.4463, + "step": 5515 + }, + { + "epoch": 0.63, + "learning_rate": 6.349543458465193e-06, + "loss": 0.4761, + "step": 5516 + }, + { + "epoch": 0.63, + "learning_rate": 6.346097508396885e-06, + "loss": 0.4726, + "step": 5517 + }, + { + "epoch": 0.63, + "learning_rate": 6.3426520589947985e-06, + "loss": 0.4668, + "step": 5518 + }, + { + "epoch": 0.63, + "learning_rate": 6.339207110731036e-06, + "loss": 0.4606, + "step": 5519 + }, 
+ { + "epoch": 0.63, + "learning_rate": 6.335762664077627e-06, + "loss": 0.454, + "step": 5520 + }, + { + "epoch": 0.63, + "learning_rate": 6.332318719506543e-06, + "loss": 0.469, + "step": 5521 + }, + { + "epoch": 0.63, + "learning_rate": 6.328875277489677e-06, + "loss": 0.4462, + "step": 5522 + }, + { + "epoch": 0.63, + "learning_rate": 6.325432338498865e-06, + "loss": 0.4572, + "step": 5523 + }, + { + "epoch": 0.63, + "learning_rate": 6.321989903005861e-06, + "loss": 0.4478, + "step": 5524 + }, + { + "epoch": 0.63, + "learning_rate": 6.318547971482352e-06, + "loss": 0.4604, + "step": 5525 + }, + { + "epoch": 0.63, + "learning_rate": 6.315106544399966e-06, + "loss": 0.4631, + "step": 5526 + }, + { + "epoch": 0.63, + "learning_rate": 6.311665622230254e-06, + "loss": 0.4559, + "step": 5527 + }, + { + "epoch": 0.63, + "learning_rate": 6.3082252054446955e-06, + "loss": 0.455, + "step": 5528 + }, + { + "epoch": 0.63, + "learning_rate": 6.304785294514709e-06, + "loss": 0.4549, + "step": 5529 + }, + { + "epoch": 0.63, + "learning_rate": 6.301345889911636e-06, + "loss": 0.4664, + "step": 5530 + }, + { + "epoch": 0.63, + "learning_rate": 6.297906992106755e-06, + "loss": 0.4549, + "step": 5531 + }, + { + "epoch": 0.63, + "learning_rate": 6.29446860157127e-06, + "loss": 0.4561, + "step": 5532 + }, + { + "epoch": 0.63, + "learning_rate": 6.291030718776313e-06, + "loss": 0.4534, + "step": 5533 + }, + { + "epoch": 0.63, + "learning_rate": 6.287593344192957e-06, + "loss": 0.4905, + "step": 5534 + }, + { + "epoch": 0.63, + "learning_rate": 6.284156478292196e-06, + "loss": 0.4596, + "step": 5535 + }, + { + "epoch": 0.63, + "learning_rate": 6.2807201215449584e-06, + "loss": 0.4457, + "step": 5536 + }, + { + "epoch": 0.63, + "learning_rate": 6.277284274422104e-06, + "loss": 0.4593, + "step": 5537 + }, + { + "epoch": 0.63, + "learning_rate": 6.273848937394413e-06, + "loss": 0.463, + "step": 5538 + }, + { + "epoch": 0.63, + "learning_rate": 6.270414110932611e-06, + "loss": 0.4587, + 
"step": 5539 + }, + { + "epoch": 0.63, + "learning_rate": 6.266979795507346e-06, + "loss": 0.4452, + "step": 5540 + }, + { + "epoch": 0.63, + "learning_rate": 6.2635459915891876e-06, + "loss": 0.4585, + "step": 5541 + }, + { + "epoch": 0.63, + "learning_rate": 6.260112699648653e-06, + "loss": 0.4637, + "step": 5542 + }, + { + "epoch": 0.63, + "learning_rate": 6.256679920156172e-06, + "loss": 0.4702, + "step": 5543 + }, + { + "epoch": 0.63, + "learning_rate": 6.253247653582119e-06, + "loss": 0.4526, + "step": 5544 + }, + { + "epoch": 0.63, + "learning_rate": 6.2498159003967896e-06, + "loss": 0.4575, + "step": 5545 + }, + { + "epoch": 0.63, + "learning_rate": 6.246384661070404e-06, + "loss": 0.4568, + "step": 5546 + }, + { + "epoch": 0.63, + "learning_rate": 6.2429539360731286e-06, + "loss": 0.4498, + "step": 5547 + }, + { + "epoch": 0.63, + "learning_rate": 6.239523725875041e-06, + "loss": 0.4679, + "step": 5548 + }, + { + "epoch": 0.63, + "learning_rate": 6.23609403094616e-06, + "loss": 0.4461, + "step": 5549 + }, + { + "epoch": 0.63, + "learning_rate": 6.232664851756434e-06, + "loss": 0.4776, + "step": 5550 + }, + { + "epoch": 0.63, + "learning_rate": 6.229236188775729e-06, + "loss": 0.4537, + "step": 5551 + }, + { + "epoch": 0.63, + "learning_rate": 6.225808042473857e-06, + "loss": 0.4517, + "step": 5552 + }, + { + "epoch": 0.63, + "learning_rate": 6.222380413320546e-06, + "loss": 0.4589, + "step": 5553 + }, + { + "epoch": 0.63, + "learning_rate": 6.218953301785453e-06, + "loss": 0.462, + "step": 5554 + }, + { + "epoch": 0.63, + "learning_rate": 6.2155267083381795e-06, + "loss": 0.4469, + "step": 5555 + }, + { + "epoch": 0.63, + "learning_rate": 6.212100633448237e-06, + "loss": 0.4665, + "step": 5556 + }, + { + "epoch": 0.64, + "learning_rate": 6.208675077585079e-06, + "loss": 0.4465, + "step": 5557 + }, + { + "epoch": 0.64, + "learning_rate": 6.2052500412180805e-06, + "loss": 0.4778, + "step": 5558 + }, + { + "epoch": 0.64, + "learning_rate": 
6.201825524816545e-06, + "loss": 0.4497, + "step": 5559 + }, + { + "epoch": 0.64, + "learning_rate": 6.198401528849717e-06, + "loss": 0.4788, + "step": 5560 + }, + { + "epoch": 0.64, + "learning_rate": 6.194978053786749e-06, + "loss": 0.4548, + "step": 5561 + }, + { + "epoch": 0.64, + "learning_rate": 6.191555100096744e-06, + "loss": 0.4663, + "step": 5562 + }, + { + "epoch": 0.64, + "learning_rate": 6.188132668248716e-06, + "loss": 0.4589, + "step": 5563 + }, + { + "epoch": 0.64, + "learning_rate": 6.184710758711616e-06, + "loss": 0.4489, + "step": 5564 + }, + { + "epoch": 0.64, + "learning_rate": 6.181289371954327e-06, + "loss": 0.4458, + "step": 5565 + }, + { + "epoch": 0.64, + "learning_rate": 6.177868508445651e-06, + "loss": 0.4718, + "step": 5566 + }, + { + "epoch": 0.64, + "learning_rate": 6.174448168654317e-06, + "loss": 0.4501, + "step": 5567 + }, + { + "epoch": 0.64, + "learning_rate": 6.171028353048996e-06, + "loss": 0.4906, + "step": 5568 + }, + { + "epoch": 0.64, + "learning_rate": 6.167609062098276e-06, + "loss": 0.4411, + "step": 5569 + }, + { + "epoch": 0.64, + "learning_rate": 6.164190296270683e-06, + "loss": 0.4616, + "step": 5570 + }, + { + "epoch": 0.64, + "learning_rate": 6.160772056034655e-06, + "loss": 0.4449, + "step": 5571 + }, + { + "epoch": 0.64, + "learning_rate": 6.157354341858568e-06, + "loss": 0.4601, + "step": 5572 + }, + { + "epoch": 0.64, + "learning_rate": 6.1539371542107295e-06, + "loss": 0.4472, + "step": 5573 + }, + { + "epoch": 0.64, + "learning_rate": 6.1505204935593665e-06, + "loss": 0.4845, + "step": 5574 + }, + { + "epoch": 0.64, + "learning_rate": 6.147104360372644e-06, + "loss": 0.4442, + "step": 5575 + }, + { + "epoch": 0.64, + "learning_rate": 6.1436887551186466e-06, + "loss": 0.4733, + "step": 5576 + }, + { + "epoch": 0.64, + "learning_rate": 6.14027367826538e-06, + "loss": 0.4424, + "step": 5577 + }, + { + "epoch": 0.64, + "learning_rate": 6.136859130280794e-06, + "loss": 0.4735, + "step": 5578 + }, + { + "epoch": 
0.64, + "learning_rate": 6.133445111632761e-06, + "loss": 0.4515, + "step": 5579 + }, + { + "epoch": 0.64, + "learning_rate": 6.130031622789067e-06, + "loss": 0.4605, + "step": 5580 + }, + { + "epoch": 0.64, + "learning_rate": 6.126618664217448e-06, + "loss": 0.4499, + "step": 5581 + }, + { + "epoch": 0.64, + "learning_rate": 6.123206236385543e-06, + "loss": 0.4527, + "step": 5582 + }, + { + "epoch": 0.64, + "learning_rate": 6.119794339760941e-06, + "loss": 0.4556, + "step": 5583 + }, + { + "epoch": 0.64, + "learning_rate": 6.1163829748111466e-06, + "loss": 0.4661, + "step": 5584 + }, + { + "epoch": 0.64, + "learning_rate": 6.112972142003587e-06, + "loss": 0.45, + "step": 5585 + }, + { + "epoch": 0.64, + "learning_rate": 6.109561841805629e-06, + "loss": 0.4588, + "step": 5586 + }, + { + "epoch": 0.64, + "learning_rate": 6.106152074684556e-06, + "loss": 0.4475, + "step": 5587 + }, + { + "epoch": 0.64, + "learning_rate": 6.102742841107585e-06, + "loss": 0.4708, + "step": 5588 + }, + { + "epoch": 0.64, + "learning_rate": 6.099334141541856e-06, + "loss": 0.4573, + "step": 5589 + }, + { + "epoch": 0.64, + "learning_rate": 6.095925976454433e-06, + "loss": 0.4513, + "step": 5590 + }, + { + "epoch": 0.64, + "learning_rate": 6.092518346312317e-06, + "loss": 0.4378, + "step": 5591 + }, + { + "epoch": 0.64, + "learning_rate": 6.089111251582427e-06, + "loss": 0.4679, + "step": 5592 + }, + { + "epoch": 0.64, + "learning_rate": 6.085704692731609e-06, + "loss": 0.4659, + "step": 5593 + }, + { + "epoch": 0.64, + "learning_rate": 6.082298670226642e-06, + "loss": 0.4478, + "step": 5594 + }, + { + "epoch": 0.64, + "learning_rate": 6.0788931845342205e-06, + "loss": 0.452, + "step": 5595 + }, + { + "epoch": 0.64, + "learning_rate": 6.075488236120978e-06, + "loss": 0.4487, + "step": 5596 + }, + { + "epoch": 0.64, + "learning_rate": 6.0720838254534675e-06, + "loss": 0.4547, + "step": 5597 + }, + { + "epoch": 0.64, + "learning_rate": 6.068679952998167e-06, + "loss": 0.4457, + "step": 5598 
+ }, + { + "epoch": 0.64, + "learning_rate": 6.065276619221485e-06, + "loss": 0.4448, + "step": 5599 + }, + { + "epoch": 0.64, + "learning_rate": 6.061873824589751e-06, + "loss": 0.4378, + "step": 5600 + }, + { + "epoch": 0.64, + "learning_rate": 6.058471569569228e-06, + "loss": 0.462, + "step": 5601 + }, + { + "epoch": 0.64, + "learning_rate": 6.055069854626102e-06, + "loss": 0.4632, + "step": 5602 + }, + { + "epoch": 0.64, + "learning_rate": 6.051668680226477e-06, + "loss": 0.4506, + "step": 5603 + }, + { + "epoch": 0.64, + "learning_rate": 6.0482680468363964e-06, + "loss": 0.4816, + "step": 5604 + }, + { + "epoch": 0.64, + "learning_rate": 6.044867954921818e-06, + "loss": 0.4275, + "step": 5605 + }, + { + "epoch": 0.64, + "learning_rate": 6.0414684049486335e-06, + "loss": 0.4369, + "step": 5606 + }, + { + "epoch": 0.64, + "learning_rate": 6.0380693973826595e-06, + "loss": 0.4646, + "step": 5607 + }, + { + "epoch": 0.64, + "learning_rate": 6.034670932689629e-06, + "loss": 0.4391, + "step": 5608 + }, + { + "epoch": 0.64, + "learning_rate": 6.031273011335215e-06, + "loss": 0.4563, + "step": 5609 + }, + { + "epoch": 0.64, + "learning_rate": 6.027875633785003e-06, + "loss": 0.4534, + "step": 5610 + }, + { + "epoch": 0.64, + "learning_rate": 6.024478800504509e-06, + "loss": 0.4575, + "step": 5611 + }, + { + "epoch": 0.64, + "learning_rate": 6.0210825119591806e-06, + "loss": 0.4483, + "step": 5612 + }, + { + "epoch": 0.64, + "learning_rate": 6.0176867686143795e-06, + "loss": 0.446, + "step": 5613 + }, + { + "epoch": 0.64, + "learning_rate": 6.014291570935405e-06, + "loss": 0.448, + "step": 5614 + }, + { + "epoch": 0.64, + "learning_rate": 6.0108969193874675e-06, + "loss": 0.4926, + "step": 5615 + }, + { + "epoch": 0.64, + "learning_rate": 6.007502814435713e-06, + "loss": 0.4475, + "step": 5616 + }, + { + "epoch": 0.64, + "learning_rate": 6.0041092565452135e-06, + "loss": 0.4339, + "step": 5617 + }, + { + "epoch": 0.64, + "learning_rate": 6.000716246180953e-06, + 
"loss": 0.4616, + "step": 5618 + }, + { + "epoch": 0.64, + "learning_rate": 5.9973237838078625e-06, + "loss": 0.474, + "step": 5619 + }, + { + "epoch": 0.64, + "learning_rate": 5.993931869890774e-06, + "loss": 0.4726, + "step": 5620 + }, + { + "epoch": 0.64, + "learning_rate": 5.9905405048944575e-06, + "loss": 0.464, + "step": 5621 + }, + { + "epoch": 0.64, + "learning_rate": 5.987149689283614e-06, + "loss": 0.4486, + "step": 5622 + }, + { + "epoch": 0.64, + "learning_rate": 5.983759423522852e-06, + "loss": 0.4448, + "step": 5623 + }, + { + "epoch": 0.64, + "learning_rate": 5.980369708076713e-06, + "loss": 0.4598, + "step": 5624 + }, + { + "epoch": 0.64, + "learning_rate": 5.976980543409669e-06, + "loss": 0.4511, + "step": 5625 + }, + { + "epoch": 0.64, + "learning_rate": 5.973591929986108e-06, + "loss": 0.4508, + "step": 5626 + }, + { + "epoch": 0.64, + "learning_rate": 5.97020386827035e-06, + "loss": 0.4723, + "step": 5627 + }, + { + "epoch": 0.64, + "learning_rate": 5.966816358726633e-06, + "loss": 0.4761, + "step": 5628 + }, + { + "epoch": 0.64, + "learning_rate": 5.9634294018191145e-06, + "loss": 0.4594, + "step": 5629 + }, + { + "epoch": 0.64, + "learning_rate": 5.960042998011892e-06, + "loss": 0.4682, + "step": 5630 + }, + { + "epoch": 0.64, + "learning_rate": 5.9566571477689735e-06, + "loss": 0.4535, + "step": 5631 + }, + { + "epoch": 0.64, + "learning_rate": 5.953271851554303e-06, + "loss": 0.4446, + "step": 5632 + }, + { + "epoch": 0.64, + "learning_rate": 5.949887109831736e-06, + "loss": 0.4797, + "step": 5633 + }, + { + "epoch": 0.64, + "learning_rate": 5.946502923065054e-06, + "loss": 0.4445, + "step": 5634 + }, + { + "epoch": 0.64, + "learning_rate": 5.943119291717974e-06, + "loss": 0.4538, + "step": 5635 + }, + { + "epoch": 0.64, + "learning_rate": 5.939736216254126e-06, + "loss": 0.4487, + "step": 5636 + }, + { + "epoch": 0.64, + "learning_rate": 5.936353697137063e-06, + "loss": 0.4681, + "step": 5637 + }, + { + "epoch": 0.64, + "learning_rate": 
5.932971734830273e-06, + "loss": 0.4663, + "step": 5638 + }, + { + "epoch": 0.64, + "learning_rate": 5.929590329797154e-06, + "loss": 0.4624, + "step": 5639 + }, + { + "epoch": 0.64, + "learning_rate": 5.926209482501037e-06, + "loss": 0.4462, + "step": 5640 + }, + { + "epoch": 0.64, + "learning_rate": 5.9228291934051754e-06, + "loss": 0.4564, + "step": 5641 + }, + { + "epoch": 0.64, + "learning_rate": 5.919449462972737e-06, + "loss": 0.4501, + "step": 5642 + }, + { + "epoch": 0.64, + "learning_rate": 5.916070291666831e-06, + "loss": 0.4547, + "step": 5643 + }, + { + "epoch": 0.65, + "learning_rate": 5.9126916799504685e-06, + "loss": 0.4662, + "step": 5644 + }, + { + "epoch": 0.65, + "learning_rate": 5.9093136282866014e-06, + "loss": 0.4753, + "step": 5645 + }, + { + "epoch": 0.65, + "learning_rate": 5.9059361371381e-06, + "loss": 0.4517, + "step": 5646 + }, + { + "epoch": 0.65, + "learning_rate": 5.9025592069677475e-06, + "loss": 0.4549, + "step": 5647 + }, + { + "epoch": 0.65, + "learning_rate": 5.899182838238265e-06, + "loss": 0.4504, + "step": 5648 + }, + { + "epoch": 0.65, + "learning_rate": 5.895807031412293e-06, + "loss": 0.4433, + "step": 5649 + }, + { + "epoch": 0.65, + "learning_rate": 5.892431786952384e-06, + "loss": 0.453, + "step": 5650 + }, + { + "epoch": 0.65, + "learning_rate": 5.8890571053210295e-06, + "loss": 0.4681, + "step": 5651 + }, + { + "epoch": 0.65, + "learning_rate": 5.88568298698063e-06, + "loss": 0.4705, + "step": 5652 + }, + { + "epoch": 0.65, + "learning_rate": 5.88230943239352e-06, + "loss": 0.4575, + "step": 5653 + }, + { + "epoch": 0.65, + "learning_rate": 5.878936442021952e-06, + "loss": 0.4592, + "step": 5654 + }, + { + "epoch": 0.65, + "learning_rate": 5.875564016328096e-06, + "loss": 0.457, + "step": 5655 + }, + { + "epoch": 0.65, + "learning_rate": 5.872192155774056e-06, + "loss": 0.4668, + "step": 5656 + }, + { + "epoch": 0.65, + "learning_rate": 5.868820860821844e-06, + "loss": 0.4524, + "step": 5657 + }, + { + "epoch": 0.65, 
+ "learning_rate": 5.8654501319334105e-06, + "loss": 0.4498, + "step": 5658 + }, + { + "epoch": 0.65, + "learning_rate": 5.862079969570619e-06, + "loss": 0.4785, + "step": 5659 + }, + { + "epoch": 0.65, + "learning_rate": 5.858710374195251e-06, + "loss": 0.4567, + "step": 5660 + }, + { + "epoch": 0.65, + "learning_rate": 5.855341346269026e-06, + "loss": 0.4633, + "step": 5661 + }, + { + "epoch": 0.65, + "learning_rate": 5.851972886253569e-06, + "loss": 0.4638, + "step": 5662 + }, + { + "epoch": 0.65, + "learning_rate": 5.848604994610434e-06, + "loss": 0.441, + "step": 5663 + }, + { + "epoch": 0.65, + "learning_rate": 5.845237671801103e-06, + "loss": 0.4802, + "step": 5664 + }, + { + "epoch": 0.65, + "learning_rate": 5.841870918286967e-06, + "loss": 0.4347, + "step": 5665 + }, + { + "epoch": 0.65, + "learning_rate": 5.838504734529353e-06, + "loss": 0.4517, + "step": 5666 + }, + { + "epoch": 0.65, + "learning_rate": 5.835139120989503e-06, + "loss": 0.4683, + "step": 5667 + }, + { + "epoch": 0.65, + "learning_rate": 5.831774078128574e-06, + "loss": 0.4603, + "step": 5668 + }, + { + "epoch": 0.65, + "learning_rate": 5.828409606407659e-06, + "loss": 0.48, + "step": 5669 + }, + { + "epoch": 0.65, + "learning_rate": 5.825045706287762e-06, + "loss": 0.4576, + "step": 5670 + }, + { + "epoch": 0.65, + "learning_rate": 5.821682378229813e-06, + "loss": 0.4416, + "step": 5671 + }, + { + "epoch": 0.65, + "learning_rate": 5.818319622694668e-06, + "loss": 0.4558, + "step": 5672 + }, + { + "epoch": 0.65, + "learning_rate": 5.814957440143092e-06, + "loss": 0.4724, + "step": 5673 + }, + { + "epoch": 0.65, + "learning_rate": 5.811595831035786e-06, + "loss": 0.4637, + "step": 5674 + }, + { + "epoch": 0.65, + "learning_rate": 5.8082347958333625e-06, + "loss": 0.4548, + "step": 5675 + }, + { + "epoch": 0.65, + "learning_rate": 5.804874334996353e-06, + "loss": 0.4525, + "step": 5676 + }, + { + "epoch": 0.65, + "learning_rate": 5.801514448985226e-06, + "loss": 0.4845, + "step": 5677 + }, + 
{ + "epoch": 0.65, + "learning_rate": 5.798155138260352e-06, + "loss": 0.4575, + "step": 5678 + }, + { + "epoch": 0.65, + "learning_rate": 5.794796403282035e-06, + "loss": 0.4511, + "step": 5679 + }, + { + "epoch": 0.65, + "learning_rate": 5.791438244510499e-06, + "loss": 0.4552, + "step": 5680 + }, + { + "epoch": 0.65, + "learning_rate": 5.788080662405881e-06, + "loss": 0.4433, + "step": 5681 + }, + { + "epoch": 0.65, + "learning_rate": 5.784723657428255e-06, + "loss": 0.4881, + "step": 5682 + }, + { + "epoch": 0.65, + "learning_rate": 5.781367230037592e-06, + "loss": 0.4417, + "step": 5683 + }, + { + "epoch": 0.65, + "learning_rate": 5.7780113806938095e-06, + "loss": 0.4435, + "step": 5684 + }, + { + "epoch": 0.65, + "learning_rate": 5.774656109856729e-06, + "loss": 0.4837, + "step": 5685 + }, + { + "epoch": 0.65, + "learning_rate": 5.7713014179860925e-06, + "loss": 0.4599, + "step": 5686 + }, + { + "epoch": 0.65, + "learning_rate": 5.767947305541577e-06, + "loss": 0.459, + "step": 5687 + }, + { + "epoch": 0.65, + "learning_rate": 5.764593772982762e-06, + "loss": 0.4506, + "step": 5688 + }, + { + "epoch": 0.65, + "learning_rate": 5.76124082076916e-06, + "loss": 0.4723, + "step": 5689 + }, + { + "epoch": 0.65, + "learning_rate": 5.757888449360205e-06, + "loss": 0.4578, + "step": 5690 + }, + { + "epoch": 0.65, + "learning_rate": 5.754536659215239e-06, + "loss": 0.4623, + "step": 5691 + }, + { + "epoch": 0.65, + "learning_rate": 5.751185450793539e-06, + "loss": 0.4391, + "step": 5692 + }, + { + "epoch": 0.65, + "learning_rate": 5.747834824554293e-06, + "loss": 0.458, + "step": 5693 + }, + { + "epoch": 0.65, + "learning_rate": 5.744484780956605e-06, + "loss": 0.4721, + "step": 5694 + }, + { + "epoch": 0.65, + "learning_rate": 5.741135320459516e-06, + "loss": 0.4749, + "step": 5695 + }, + { + "epoch": 0.65, + "learning_rate": 5.737786443521968e-06, + "loss": 0.4534, + "step": 5696 + }, + { + "epoch": 0.65, + "learning_rate": 5.734438150602841e-06, + "loss": 0.449, + 
"step": 5697 + }, + { + "epoch": 0.65, + "learning_rate": 5.731090442160917e-06, + "loss": 0.4614, + "step": 5698 + }, + { + "epoch": 0.65, + "learning_rate": 5.727743318654911e-06, + "loss": 0.4712, + "step": 5699 + }, + { + "epoch": 0.65, + "learning_rate": 5.724396780543457e-06, + "loss": 0.4552, + "step": 5700 + }, + { + "epoch": 0.65, + "learning_rate": 5.721050828285097e-06, + "loss": 0.4427, + "step": 5701 + }, + { + "epoch": 0.65, + "learning_rate": 5.717705462338311e-06, + "loss": 0.4601, + "step": 5702 + }, + { + "epoch": 0.65, + "learning_rate": 5.714360683161484e-06, + "loss": 0.4697, + "step": 5703 + }, + { + "epoch": 0.65, + "learning_rate": 5.711016491212922e-06, + "loss": 0.4579, + "step": 5704 + }, + { + "epoch": 0.65, + "learning_rate": 5.707672886950859e-06, + "loss": 0.4585, + "step": 5705 + }, + { + "epoch": 0.65, + "learning_rate": 5.704329870833443e-06, + "loss": 0.4581, + "step": 5706 + }, + { + "epoch": 0.65, + "learning_rate": 5.700987443318737e-06, + "loss": 0.4484, + "step": 5707 + }, + { + "epoch": 0.65, + "learning_rate": 5.697645604864732e-06, + "loss": 0.4493, + "step": 5708 + }, + { + "epoch": 0.65, + "learning_rate": 5.694304355929333e-06, + "loss": 0.4515, + "step": 5709 + }, + { + "epoch": 0.65, + "learning_rate": 5.690963696970371e-06, + "loss": 0.4578, + "step": 5710 + }, + { + "epoch": 0.65, + "learning_rate": 5.687623628445588e-06, + "loss": 0.4852, + "step": 5711 + }, + { + "epoch": 0.65, + "learning_rate": 5.684284150812642e-06, + "loss": 0.4439, + "step": 5712 + }, + { + "epoch": 0.65, + "learning_rate": 5.680945264529125e-06, + "loss": 0.4691, + "step": 5713 + }, + { + "epoch": 0.65, + "learning_rate": 5.67760697005253e-06, + "loss": 0.4385, + "step": 5714 + }, + { + "epoch": 0.65, + "learning_rate": 5.674269267840287e-06, + "loss": 0.4582, + "step": 5715 + }, + { + "epoch": 0.65, + "learning_rate": 5.670932158349732e-06, + "loss": 0.4456, + "step": 5716 + }, + { + "epoch": 0.65, + "learning_rate": 5.667595642038117e-06, 
+ "loss": 0.4779, + "step": 5717 + }, + { + "epoch": 0.65, + "learning_rate": 5.664259719362627e-06, + "loss": 0.4467, + "step": 5718 + }, + { + "epoch": 0.65, + "learning_rate": 5.660924390780359e-06, + "loss": 0.4604, + "step": 5719 + }, + { + "epoch": 0.65, + "learning_rate": 5.657589656748321e-06, + "loss": 0.4391, + "step": 5720 + }, + { + "epoch": 0.65, + "learning_rate": 5.654255517723452e-06, + "loss": 0.4577, + "step": 5721 + }, + { + "epoch": 0.65, + "learning_rate": 5.650921974162598e-06, + "loss": 0.45, + "step": 5722 + }, + { + "epoch": 0.65, + "learning_rate": 5.647589026522535e-06, + "loss": 0.4597, + "step": 5723 + }, + { + "epoch": 0.65, + "learning_rate": 5.644256675259949e-06, + "loss": 0.4548, + "step": 5724 + }, + { + "epoch": 0.65, + "learning_rate": 5.640924920831441e-06, + "loss": 0.4683, + "step": 5725 + }, + { + "epoch": 0.65, + "learning_rate": 5.637593763693545e-06, + "loss": 0.4619, + "step": 5726 + }, + { + "epoch": 0.65, + "learning_rate": 5.634263204302694e-06, + "loss": 0.453, + "step": 5727 + }, + { + "epoch": 0.65, + "learning_rate": 5.630933243115255e-06, + "loss": 0.4557, + "step": 5728 + }, + { + "epoch": 0.65, + "learning_rate": 5.627603880587511e-06, + "loss": 0.4494, + "step": 5729 + }, + { + "epoch": 0.65, + "learning_rate": 5.624275117175649e-06, + "loss": 0.4481, + "step": 5730 + }, + { + "epoch": 0.65, + "learning_rate": 5.620946953335793e-06, + "loss": 0.4744, + "step": 5731 + }, + { + "epoch": 0.66, + "learning_rate": 5.617619389523973e-06, + "loss": 0.4303, + "step": 5732 + }, + { + "epoch": 0.66, + "learning_rate": 5.614292426196133e-06, + "loss": 0.4716, + "step": 5733 + }, + { + "epoch": 0.66, + "learning_rate": 5.610966063808152e-06, + "loss": 0.4592, + "step": 5734 + }, + { + "epoch": 0.66, + "learning_rate": 5.607640302815806e-06, + "loss": 0.4709, + "step": 5735 + }, + { + "epoch": 0.66, + "learning_rate": 5.6043151436748035e-06, + "loss": 0.4491, + "step": 5736 + }, + { + "epoch": 0.66, + "learning_rate": 
5.600990586840768e-06, + "loss": 0.4589, + "step": 5737 + }, + { + "epoch": 0.66, + "learning_rate": 5.597666632769232e-06, + "loss": 0.4456, + "step": 5738 + }, + { + "epoch": 0.66, + "learning_rate": 5.594343281915658e-06, + "loss": 0.4658, + "step": 5739 + }, + { + "epoch": 0.66, + "learning_rate": 5.5910205347354114e-06, + "loss": 0.4628, + "step": 5740 + }, + { + "epoch": 0.66, + "learning_rate": 5.587698391683792e-06, + "loss": 0.433, + "step": 5741 + }, + { + "epoch": 0.66, + "learning_rate": 5.584376853216003e-06, + "loss": 0.4459, + "step": 5742 + }, + { + "epoch": 0.66, + "learning_rate": 5.581055919787165e-06, + "loss": 0.4464, + "step": 5743 + }, + { + "epoch": 0.66, + "learning_rate": 5.577735591852327e-06, + "loss": 0.4552, + "step": 5744 + }, + { + "epoch": 0.66, + "learning_rate": 5.574415869866443e-06, + "loss": 0.4547, + "step": 5745 + }, + { + "epoch": 0.66, + "learning_rate": 5.571096754284389e-06, + "loss": 0.4586, + "step": 5746 + }, + { + "epoch": 0.66, + "learning_rate": 5.567778245560966e-06, + "loss": 0.5003, + "step": 5747 + }, + { + "epoch": 0.66, + "learning_rate": 5.564460344150873e-06, + "loss": 0.4509, + "step": 5748 + }, + { + "epoch": 0.66, + "learning_rate": 5.561143050508746e-06, + "loss": 0.4781, + "step": 5749 + }, + { + "epoch": 0.66, + "learning_rate": 5.5578263650891225e-06, + "loss": 0.4499, + "step": 5750 + }, + { + "epoch": 0.66, + "learning_rate": 5.554510288346459e-06, + "loss": 0.4647, + "step": 5751 + }, + { + "epoch": 0.66, + "learning_rate": 5.551194820735144e-06, + "loss": 0.4586, + "step": 5752 + }, + { + "epoch": 0.66, + "learning_rate": 5.547879962709457e-06, + "loss": 0.4501, + "step": 5753 + }, + { + "epoch": 0.66, + "learning_rate": 5.544565714723619e-06, + "loss": 0.4588, + "step": 5754 + }, + { + "epoch": 0.66, + "learning_rate": 5.541252077231746e-06, + "loss": 0.4566, + "step": 5755 + }, + { + "epoch": 0.66, + "learning_rate": 5.537939050687886e-06, + "loss": 0.4355, + "step": 5756 + }, + { + "epoch": 
0.66, + "learning_rate": 5.534626635546e-06, + "loss": 0.4584, + "step": 5757 + }, + { + "epoch": 0.66, + "learning_rate": 5.53131483225996e-06, + "loss": 0.458, + "step": 5758 + }, + { + "epoch": 0.66, + "learning_rate": 5.528003641283552e-06, + "loss": 0.451, + "step": 5759 + }, + { + "epoch": 0.66, + "learning_rate": 5.524693063070492e-06, + "loss": 0.4604, + "step": 5760 + }, + { + "epoch": 0.66, + "learning_rate": 5.521383098074395e-06, + "loss": 0.4517, + "step": 5761 + }, + { + "epoch": 0.66, + "learning_rate": 5.5180737467488085e-06, + "loss": 0.4649, + "step": 5762 + }, + { + "epoch": 0.66, + "learning_rate": 5.514765009547181e-06, + "loss": 0.4876, + "step": 5763 + }, + { + "epoch": 0.66, + "learning_rate": 5.511456886922883e-06, + "loss": 0.4452, + "step": 5764 + }, + { + "epoch": 0.66, + "learning_rate": 5.508149379329204e-06, + "loss": 0.4783, + "step": 5765 + }, + { + "epoch": 0.66, + "learning_rate": 5.504842487219344e-06, + "loss": 0.4413, + "step": 5766 + }, + { + "epoch": 0.66, + "learning_rate": 5.5015362110464275e-06, + "loss": 0.4431, + "step": 5767 + }, + { + "epoch": 0.66, + "learning_rate": 5.4982305512634845e-06, + "loss": 0.4764, + "step": 5768 + }, + { + "epoch": 0.66, + "learning_rate": 5.4949255083234585e-06, + "loss": 0.4596, + "step": 5769 + }, + { + "epoch": 0.66, + "learning_rate": 5.491621082679224e-06, + "loss": 0.4597, + "step": 5770 + }, + { + "epoch": 0.66, + "learning_rate": 5.48831727478355e-06, + "loss": 0.4672, + "step": 5771 + }, + { + "epoch": 0.66, + "learning_rate": 5.4850140850891445e-06, + "loss": 0.4401, + "step": 5772 + }, + { + "epoch": 0.66, + "learning_rate": 5.481711514048609e-06, + "loss": 0.4626, + "step": 5773 + }, + { + "epoch": 0.66, + "learning_rate": 5.478409562114469e-06, + "loss": 0.444, + "step": 5774 + }, + { + "epoch": 0.66, + "learning_rate": 5.47510822973917e-06, + "loss": 0.4472, + "step": 5775 + }, + { + "epoch": 0.66, + "learning_rate": 5.4718075173750695e-06, + "loss": 0.4574, + "step": 5776 + 
}, + { + "epoch": 0.66, + "learning_rate": 5.4685074254744346e-06, + "loss": 0.4639, + "step": 5777 + }, + { + "epoch": 0.66, + "learning_rate": 5.465207954489454e-06, + "loss": 0.4587, + "step": 5778 + }, + { + "epoch": 0.66, + "learning_rate": 5.461909104872226e-06, + "loss": 0.4665, + "step": 5779 + }, + { + "epoch": 0.66, + "learning_rate": 5.458610877074773e-06, + "loss": 0.4562, + "step": 5780 + }, + { + "epoch": 0.66, + "learning_rate": 5.455313271549021e-06, + "loss": 0.4441, + "step": 5781 + }, + { + "epoch": 0.66, + "learning_rate": 5.452016288746813e-06, + "loss": 0.4498, + "step": 5782 + }, + { + "epoch": 0.66, + "learning_rate": 5.448719929119916e-06, + "loss": 0.4597, + "step": 5783 + }, + { + "epoch": 0.66, + "learning_rate": 5.445424193119997e-06, + "loss": 0.4465, + "step": 5784 + }, + { + "epoch": 0.66, + "learning_rate": 5.44212908119865e-06, + "loss": 0.442, + "step": 5785 + }, + { + "epoch": 0.66, + "learning_rate": 5.4388345938073824e-06, + "loss": 0.441, + "step": 5786 + }, + { + "epoch": 0.66, + "learning_rate": 5.435540731397606e-06, + "loss": 0.4598, + "step": 5787 + }, + { + "epoch": 0.66, + "learning_rate": 5.432247494420659e-06, + "loss": 0.4435, + "step": 5788 + }, + { + "epoch": 0.66, + "learning_rate": 5.4289548833277865e-06, + "loss": 0.475, + "step": 5789 + }, + { + "epoch": 0.66, + "learning_rate": 5.425662898570144e-06, + "loss": 0.453, + "step": 5790 + }, + { + "epoch": 0.66, + "learning_rate": 5.422371540598816e-06, + "loss": 0.4684, + "step": 5791 + }, + { + "epoch": 0.66, + "learning_rate": 5.419080809864785e-06, + "loss": 0.4355, + "step": 5792 + }, + { + "epoch": 0.66, + "learning_rate": 5.415790706818958e-06, + "loss": 0.4451, + "step": 5793 + }, + { + "epoch": 0.66, + "learning_rate": 5.412501231912153e-06, + "loss": 0.472, + "step": 5794 + }, + { + "epoch": 0.66, + "learning_rate": 5.409212385595098e-06, + "loss": 0.4648, + "step": 5795 + }, + { + "epoch": 0.66, + "learning_rate": 5.405924168318446e-06, + "loss": 0.4761, 
+ "step": 5796 + }, + { + "epoch": 0.66, + "learning_rate": 5.4026365805327455e-06, + "loss": 0.4499, + "step": 5797 + }, + { + "epoch": 0.66, + "learning_rate": 5.399349622688479e-06, + "loss": 0.4524, + "step": 5798 + }, + { + "epoch": 0.66, + "learning_rate": 5.39606329523603e-06, + "loss": 0.4547, + "step": 5799 + }, + { + "epoch": 0.66, + "learning_rate": 5.392777598625694e-06, + "loss": 0.4504, + "step": 5800 + }, + { + "epoch": 0.66, + "learning_rate": 5.389492533307692e-06, + "loss": 0.4597, + "step": 5801 + }, + { + "epoch": 0.66, + "learning_rate": 5.386208099732144e-06, + "loss": 0.461, + "step": 5802 + }, + { + "epoch": 0.66, + "learning_rate": 5.382924298349095e-06, + "loss": 0.4603, + "step": 5803 + }, + { + "epoch": 0.66, + "learning_rate": 5.379641129608501e-06, + "loss": 0.474, + "step": 5804 + }, + { + "epoch": 0.66, + "learning_rate": 5.3763585939602244e-06, + "loss": 0.4792, + "step": 5805 + }, + { + "epoch": 0.66, + "learning_rate": 5.373076691854054e-06, + "loss": 0.448, + "step": 5806 + }, + { + "epoch": 0.66, + "learning_rate": 5.3697954237396764e-06, + "loss": 0.4556, + "step": 5807 + }, + { + "epoch": 0.66, + "learning_rate": 5.366514790066697e-06, + "loss": 0.4481, + "step": 5808 + }, + { + "epoch": 0.66, + "learning_rate": 5.363234791284644e-06, + "loss": 0.4715, + "step": 5809 + }, + { + "epoch": 0.66, + "learning_rate": 5.3599554278429415e-06, + "loss": 0.4523, + "step": 5810 + }, + { + "epoch": 0.66, + "learning_rate": 5.356676700190944e-06, + "loss": 0.483, + "step": 5811 + }, + { + "epoch": 0.66, + "learning_rate": 5.353398608777901e-06, + "loss": 0.4732, + "step": 5812 + }, + { + "epoch": 0.66, + "learning_rate": 5.35012115405299e-06, + "loss": 0.4547, + "step": 5813 + }, + { + "epoch": 0.66, + "learning_rate": 5.3468443364653e-06, + "loss": 0.4648, + "step": 5814 + }, + { + "epoch": 0.66, + "learning_rate": 5.343568156463821e-06, + "loss": 0.4508, + "step": 5815 + }, + { + "epoch": 0.66, + "learning_rate": 5.3402926144974625e-06, 
+ "loss": 0.4693, + "step": 5816 + }, + { + "epoch": 0.66, + "learning_rate": 5.337017711015052e-06, + "loss": 0.4573, + "step": 5817 + }, + { + "epoch": 0.66, + "learning_rate": 5.333743446465318e-06, + "loss": 0.4506, + "step": 5818 + }, + { + "epoch": 0.67, + "learning_rate": 5.330469821296916e-06, + "loss": 0.4474, + "step": 5819 + }, + { + "epoch": 0.67, + "learning_rate": 5.327196835958402e-06, + "loss": 0.4784, + "step": 5820 + }, + { + "epoch": 0.67, + "learning_rate": 5.323924490898242e-06, + "loss": 0.4562, + "step": 5821 + }, + { + "epoch": 0.67, + "learning_rate": 5.320652786564826e-06, + "loss": 0.4497, + "step": 5822 + }, + { + "epoch": 0.67, + "learning_rate": 5.31738172340645e-06, + "loss": 0.4454, + "step": 5823 + }, + { + "epoch": 0.67, + "learning_rate": 5.314111301871325e-06, + "loss": 0.4383, + "step": 5824 + }, + { + "epoch": 0.67, + "learning_rate": 5.3108415224075725e-06, + "loss": 0.4667, + "step": 5825 + }, + { + "epoch": 0.67, + "learning_rate": 5.307572385463218e-06, + "loss": 0.4501, + "step": 5826 + }, + { + "epoch": 0.67, + "learning_rate": 5.304303891486213e-06, + "loss": 0.4514, + "step": 5827 + }, + { + "epoch": 0.67, + "learning_rate": 5.301036040924412e-06, + "loss": 0.4677, + "step": 5828 + }, + { + "epoch": 0.67, + "learning_rate": 5.297768834225581e-06, + "loss": 0.4647, + "step": 5829 + }, + { + "epoch": 0.67, + "learning_rate": 5.294502271837405e-06, + "loss": 0.4455, + "step": 5830 + }, + { + "epoch": 0.67, + "learning_rate": 5.2912363542074695e-06, + "loss": 0.4584, + "step": 5831 + }, + { + "epoch": 0.67, + "learning_rate": 5.287971081783283e-06, + "loss": 0.4672, + "step": 5832 + }, + { + "epoch": 0.67, + "learning_rate": 5.284706455012263e-06, + "loss": 0.4464, + "step": 5833 + }, + { + "epoch": 0.67, + "learning_rate": 5.281442474341729e-06, + "loss": 0.4605, + "step": 5834 + }, + { + "epoch": 0.67, + "learning_rate": 5.278179140218928e-06, + "loss": 0.4518, + "step": 5835 + }, + { + "epoch": 0.67, + "learning_rate": 
5.274916453091001e-06, + "loss": 0.4436, + "step": 5836 + }, + { + "epoch": 0.67, + "learning_rate": 5.271654413405016e-06, + "loss": 0.465, + "step": 5837 + }, + { + "epoch": 0.67, + "learning_rate": 5.268393021607944e-06, + "loss": 0.4715, + "step": 5838 + }, + { + "epoch": 0.67, + "learning_rate": 5.2651322781466606e-06, + "loss": 0.445, + "step": 5839 + }, + { + "epoch": 0.67, + "learning_rate": 5.261872183467972e-06, + "loss": 0.468, + "step": 5840 + }, + { + "epoch": 0.67, + "learning_rate": 5.258612738018574e-06, + "loss": 0.4424, + "step": 5841 + }, + { + "epoch": 0.67, + "learning_rate": 5.255353942245089e-06, + "loss": 0.4553, + "step": 5842 + }, + { + "epoch": 0.67, + "learning_rate": 5.252095796594046e-06, + "loss": 0.4456, + "step": 5843 + }, + { + "epoch": 0.67, + "learning_rate": 5.2488383015118785e-06, + "loss": 0.4466, + "step": 5844 + }, + { + "epoch": 0.67, + "learning_rate": 5.2455814574449415e-06, + "loss": 0.456, + "step": 5845 + }, + { + "epoch": 0.67, + "learning_rate": 5.242325264839494e-06, + "loss": 0.4612, + "step": 5846 + }, + { + "epoch": 0.67, + "learning_rate": 5.239069724141701e-06, + "loss": 0.4419, + "step": 5847 + }, + { + "epoch": 0.67, + "learning_rate": 5.235814835797655e-06, + "loss": 0.4759, + "step": 5848 + }, + { + "epoch": 0.67, + "learning_rate": 5.232560600253336e-06, + "loss": 0.4619, + "step": 5849 + }, + { + "epoch": 0.67, + "learning_rate": 5.229307017954655e-06, + "loss": 0.4547, + "step": 5850 + }, + { + "epoch": 0.67, + "learning_rate": 5.226054089347428e-06, + "loss": 0.4444, + "step": 5851 + }, + { + "epoch": 0.67, + "learning_rate": 5.22280181487737e-06, + "loss": 0.4596, + "step": 5852 + }, + { + "epoch": 0.67, + "learning_rate": 5.219550194990124e-06, + "loss": 0.4476, + "step": 5853 + }, + { + "epoch": 0.67, + "learning_rate": 5.216299230131227e-06, + "loss": 0.4702, + "step": 5854 + }, + { + "epoch": 0.67, + "learning_rate": 5.21304892074614e-06, + "loss": 0.4289, + "step": 5855 + }, + { + "epoch": 0.67, + 
"learning_rate": 5.209799267280225e-06, + "loss": 0.4721, + "step": 5856 + }, + { + "epoch": 0.67, + "learning_rate": 5.206550270178754e-06, + "loss": 0.4441, + "step": 5857 + }, + { + "epoch": 0.67, + "learning_rate": 5.20330192988692e-06, + "loss": 0.4478, + "step": 5858 + }, + { + "epoch": 0.67, + "learning_rate": 5.2000542468498085e-06, + "loss": 0.4388, + "step": 5859 + }, + { + "epoch": 0.67, + "learning_rate": 5.19680722151243e-06, + "loss": 0.456, + "step": 5860 + }, + { + "epoch": 0.67, + "learning_rate": 5.1935608543197035e-06, + "loss": 0.4623, + "step": 5861 + }, + { + "epoch": 0.67, + "learning_rate": 5.1903151457164445e-06, + "loss": 0.4772, + "step": 5862 + }, + { + "epoch": 0.67, + "learning_rate": 5.187070096147397e-06, + "loss": 0.4432, + "step": 5863 + }, + { + "epoch": 0.67, + "learning_rate": 5.183825706057199e-06, + "loss": 0.4853, + "step": 5864 + }, + { + "epoch": 0.67, + "learning_rate": 5.180581975890404e-06, + "loss": 0.4542, + "step": 5865 + }, + { + "epoch": 0.67, + "learning_rate": 5.177338906091481e-06, + "loss": 0.4507, + "step": 5866 + }, + { + "epoch": 0.67, + "learning_rate": 5.1740964971047945e-06, + "loss": 0.4507, + "step": 5867 + }, + { + "epoch": 0.67, + "learning_rate": 5.1708547493746376e-06, + "loss": 0.4551, + "step": 5868 + }, + { + "epoch": 0.67, + "learning_rate": 5.16761366334519e-06, + "loss": 0.4536, + "step": 5869 + }, + { + "epoch": 0.67, + "learning_rate": 5.164373239460561e-06, + "loss": 0.4742, + "step": 5870 + }, + { + "epoch": 0.67, + "learning_rate": 5.161133478164764e-06, + "loss": 0.4438, + "step": 5871 + }, + { + "epoch": 0.67, + "learning_rate": 5.157894379901711e-06, + "loss": 0.4696, + "step": 5872 + }, + { + "epoch": 0.67, + "learning_rate": 5.154655945115233e-06, + "loss": 0.4426, + "step": 5873 + }, + { + "epoch": 0.67, + "learning_rate": 5.15141817424907e-06, + "loss": 0.447, + "step": 5874 + }, + { + "epoch": 0.67, + "learning_rate": 5.148181067746862e-06, + "loss": 0.4607, + "step": 5875 + }, + { 
+ "epoch": 0.67, + "learning_rate": 5.144944626052178e-06, + "loss": 0.4548, + "step": 5876 + }, + { + "epoch": 0.67, + "learning_rate": 5.141708849608473e-06, + "loss": 0.4623, + "step": 5877 + }, + { + "epoch": 0.67, + "learning_rate": 5.138473738859118e-06, + "loss": 0.4585, + "step": 5878 + }, + { + "epoch": 0.67, + "learning_rate": 5.1352392942474005e-06, + "loss": 0.4618, + "step": 5879 + }, + { + "epoch": 0.67, + "learning_rate": 5.132005516216512e-06, + "loss": 0.4381, + "step": 5880 + }, + { + "epoch": 0.67, + "learning_rate": 5.128772405209556e-06, + "loss": 0.4473, + "step": 5881 + }, + { + "epoch": 0.67, + "learning_rate": 5.1255399616695345e-06, + "loss": 0.4759, + "step": 5882 + }, + { + "epoch": 0.67, + "learning_rate": 5.122308186039364e-06, + "loss": 0.4472, + "step": 5883 + }, + { + "epoch": 0.67, + "learning_rate": 5.119077078761875e-06, + "loss": 0.453, + "step": 5884 + }, + { + "epoch": 0.67, + "learning_rate": 5.115846640279798e-06, + "loss": 0.4527, + "step": 5885 + }, + { + "epoch": 0.67, + "learning_rate": 5.1126168710357735e-06, + "loss": 0.4394, + "step": 5886 + }, + { + "epoch": 0.67, + "learning_rate": 5.109387771472356e-06, + "loss": 0.4605, + "step": 5887 + }, + { + "epoch": 0.67, + "learning_rate": 5.106159342032e-06, + "loss": 0.4662, + "step": 5888 + }, + { + "epoch": 0.67, + "learning_rate": 5.102931583157074e-06, + "loss": 0.4503, + "step": 5889 + }, + { + "epoch": 0.67, + "learning_rate": 5.099704495289859e-06, + "loss": 0.4692, + "step": 5890 + }, + { + "epoch": 0.67, + "learning_rate": 5.096478078872528e-06, + "loss": 0.4399, + "step": 5891 + }, + { + "epoch": 0.67, + "learning_rate": 5.093252334347183e-06, + "loss": 0.4462, + "step": 5892 + }, + { + "epoch": 0.67, + "learning_rate": 5.09002726215581e-06, + "loss": 0.4574, + "step": 5893 + }, + { + "epoch": 0.67, + "learning_rate": 5.08680286274033e-06, + "loss": 0.4606, + "step": 5894 + }, + { + "epoch": 0.67, + "learning_rate": 5.083579136542548e-06, + "loss": 0.4525, + 
"step": 5895 + }, + { + "epoch": 0.67, + "learning_rate": 5.080356084004187e-06, + "loss": 0.4595, + "step": 5896 + }, + { + "epoch": 0.67, + "learning_rate": 5.0771337055668826e-06, + "loss": 0.4541, + "step": 5897 + }, + { + "epoch": 0.67, + "learning_rate": 5.073912001672165e-06, + "loss": 0.4641, + "step": 5898 + }, + { + "epoch": 0.67, + "learning_rate": 5.070690972761484e-06, + "loss": 0.4504, + "step": 5899 + }, + { + "epoch": 0.67, + "learning_rate": 5.067470619276196e-06, + "loss": 0.4616, + "step": 5900 + }, + { + "epoch": 0.67, + "learning_rate": 5.064250941657555e-06, + "loss": 0.4527, + "step": 5901 + }, + { + "epoch": 0.67, + "learning_rate": 5.061031940346734e-06, + "loss": 0.4591, + "step": 5902 + }, + { + "epoch": 0.67, + "learning_rate": 5.057813615784806e-06, + "loss": 0.4393, + "step": 5903 + }, + { + "epoch": 0.67, + "learning_rate": 5.054595968412748e-06, + "loss": 0.4637, + "step": 5904 + }, + { + "epoch": 0.67, + "learning_rate": 5.051378998671459e-06, + "loss": 0.4544, + "step": 5905 + }, + { + "epoch": 0.67, + "learning_rate": 5.048162707001727e-06, + "loss": 0.4612, + "step": 5906 + }, + { + "epoch": 0.68, + "learning_rate": 5.044947093844259e-06, + "loss": 0.4416, + "step": 5907 + }, + { + "epoch": 0.68, + "learning_rate": 5.0417321596396715e-06, + "loss": 0.4594, + "step": 5908 + }, + { + "epoch": 0.68, + "learning_rate": 5.038517904828473e-06, + "loss": 0.4519, + "step": 5909 + }, + { + "epoch": 0.68, + "learning_rate": 5.035304329851096e-06, + "loss": 0.4565, + "step": 5910 + }, + { + "epoch": 0.68, + "learning_rate": 5.032091435147867e-06, + "loss": 0.4487, + "step": 5911 + }, + { + "epoch": 0.68, + "learning_rate": 5.028879221159025e-06, + "loss": 0.4597, + "step": 5912 + }, + { + "epoch": 0.68, + "learning_rate": 5.025667688324718e-06, + "loss": 0.4687, + "step": 5913 + }, + { + "epoch": 0.68, + "learning_rate": 5.02245683708499e-06, + "loss": 0.4769, + "step": 5914 + }, + { + "epoch": 0.68, + "learning_rate": 
5.0192466678798116e-06, + "loss": 0.4546, + "step": 5915 + }, + { + "epoch": 0.68, + "learning_rate": 5.016037181149036e-06, + "loss": 0.4698, + "step": 5916 + }, + { + "epoch": 0.68, + "learning_rate": 5.012828377332438e-06, + "loss": 0.447, + "step": 5917 + }, + { + "epoch": 0.68, + "learning_rate": 5.009620256869703e-06, + "loss": 0.4466, + "step": 5918 + }, + { + "epoch": 0.68, + "learning_rate": 5.0064128202004025e-06, + "loss": 0.4612, + "step": 5919 + }, + { + "epoch": 0.68, + "learning_rate": 5.003206067764039e-06, + "loss": 0.4515, + "step": 5920 + }, + { + "epoch": 0.68, + "learning_rate": 5.000000000000003e-06, + "loss": 0.4684, + "step": 5921 + }, + { + "epoch": 0.68, + "learning_rate": 4.996794617347593e-06, + "loss": 0.4616, + "step": 5922 + }, + { + "epoch": 0.68, + "learning_rate": 4.993589920246028e-06, + "loss": 0.4691, + "step": 5923 + }, + { + "epoch": 0.68, + "learning_rate": 4.9903859091344175e-06, + "loss": 0.46, + "step": 5924 + }, + { + "epoch": 0.68, + "learning_rate": 4.987182584451778e-06, + "loss": 0.4516, + "step": 5925 + }, + { + "epoch": 0.68, + "learning_rate": 4.983979946637043e-06, + "loss": 0.4487, + "step": 5926 + }, + { + "epoch": 0.68, + "learning_rate": 4.980777996129043e-06, + "loss": 0.4312, + "step": 5927 + }, + { + "epoch": 0.68, + "learning_rate": 4.977576733366521e-06, + "loss": 0.4613, + "step": 5928 + }, + { + "epoch": 0.68, + "learning_rate": 4.974376158788119e-06, + "loss": 0.4487, + "step": 5929 + }, + { + "epoch": 0.68, + "learning_rate": 4.971176272832382e-06, + "loss": 0.4573, + "step": 5930 + }, + { + "epoch": 0.68, + "learning_rate": 4.967977075937774e-06, + "loss": 0.4519, + "step": 5931 + }, + { + "epoch": 0.68, + "learning_rate": 4.964778568542649e-06, + "loss": 0.4419, + "step": 5932 + }, + { + "epoch": 0.68, + "learning_rate": 4.9615807510852795e-06, + "loss": 0.448, + "step": 5933 + }, + { + "epoch": 0.68, + "learning_rate": 4.958383624003836e-06, + "loss": 0.4687, + "step": 5934 + }, + { + "epoch": 
0.68, + "learning_rate": 4.955187187736393e-06, + "loss": 0.4446, + "step": 5935 + }, + { + "epoch": 0.68, + "learning_rate": 4.951991442720937e-06, + "loss": 0.4657, + "step": 5936 + }, + { + "epoch": 0.68, + "learning_rate": 4.948796389395355e-06, + "loss": 0.4441, + "step": 5937 + }, + { + "epoch": 0.68, + "learning_rate": 4.945602028197447e-06, + "loss": 0.4799, + "step": 5938 + }, + { + "epoch": 0.68, + "learning_rate": 4.942408359564906e-06, + "loss": 0.4605, + "step": 5939 + }, + { + "epoch": 0.68, + "learning_rate": 4.939215383935331e-06, + "loss": 0.4591, + "step": 5940 + }, + { + "epoch": 0.68, + "learning_rate": 4.936023101746242e-06, + "loss": 0.4457, + "step": 5941 + }, + { + "epoch": 0.68, + "learning_rate": 4.932831513435045e-06, + "loss": 0.4516, + "step": 5942 + }, + { + "epoch": 0.68, + "learning_rate": 4.929640619439059e-06, + "loss": 0.4394, + "step": 5943 + }, + { + "epoch": 0.68, + "learning_rate": 4.926450420195513e-06, + "loss": 0.4495, + "step": 5944 + }, + { + "epoch": 0.68, + "learning_rate": 4.92326091614153e-06, + "loss": 0.4513, + "step": 5945 + }, + { + "epoch": 0.68, + "learning_rate": 4.920072107714145e-06, + "loss": 0.4674, + "step": 5946 + }, + { + "epoch": 0.68, + "learning_rate": 4.916883995350299e-06, + "loss": 0.4625, + "step": 5947 + }, + { + "epoch": 0.68, + "learning_rate": 4.913696579486829e-06, + "loss": 0.4644, + "step": 5948 + }, + { + "epoch": 0.68, + "learning_rate": 4.91050986056049e-06, + "loss": 0.4591, + "step": 5949 + }, + { + "epoch": 0.68, + "learning_rate": 4.907323839007925e-06, + "loss": 0.4544, + "step": 5950 + }, + { + "epoch": 0.68, + "learning_rate": 4.904138515265696e-06, + "loss": 0.4634, + "step": 5951 + }, + { + "epoch": 0.68, + "learning_rate": 4.900953889770264e-06, + "loss": 0.4425, + "step": 5952 + }, + { + "epoch": 0.68, + "learning_rate": 4.897769962957986e-06, + "loss": 0.4416, + "step": 5953 + }, + { + "epoch": 0.68, + "learning_rate": 4.89458673526514e-06, + "loss": 0.4747, + "step": 5954 + 
}, + { + "epoch": 0.68, + "learning_rate": 4.891404207127892e-06, + "loss": 0.4554, + "step": 5955 + }, + { + "epoch": 0.68, + "learning_rate": 4.888222378982323e-06, + "loss": 0.4564, + "step": 5956 + }, + { + "epoch": 0.68, + "learning_rate": 4.885041251264419e-06, + "loss": 0.4421, + "step": 5957 + }, + { + "epoch": 0.68, + "learning_rate": 4.881860824410056e-06, + "loss": 0.4374, + "step": 5958 + }, + { + "epoch": 0.68, + "learning_rate": 4.8786810988550326e-06, + "loss": 0.4528, + "step": 5959 + }, + { + "epoch": 0.68, + "learning_rate": 4.875502075035039e-06, + "loss": 0.474, + "step": 5960 + }, + { + "epoch": 0.68, + "learning_rate": 4.872323753385667e-06, + "loss": 0.4414, + "step": 5961 + }, + { + "epoch": 0.68, + "learning_rate": 4.869146134342426e-06, + "loss": 0.4653, + "step": 5962 + }, + { + "epoch": 0.68, + "learning_rate": 4.8659692183407135e-06, + "loss": 0.4524, + "step": 5963 + }, + { + "epoch": 0.68, + "learning_rate": 4.862793005815841e-06, + "loss": 0.4981, + "step": 5964 + }, + { + "epoch": 0.68, + "learning_rate": 4.859617497203024e-06, + "loss": 0.4381, + "step": 5965 + }, + { + "epoch": 0.68, + "learning_rate": 4.856442692937372e-06, + "loss": 0.4612, + "step": 5966 + }, + { + "epoch": 0.68, + "learning_rate": 4.85326859345391e-06, + "loss": 0.4269, + "step": 5967 + }, + { + "epoch": 0.68, + "learning_rate": 4.850095199187559e-06, + "loss": 0.4602, + "step": 5968 + }, + { + "epoch": 0.68, + "learning_rate": 4.846922510573139e-06, + "loss": 0.4351, + "step": 5969 + }, + { + "epoch": 0.68, + "learning_rate": 4.843750528045387e-06, + "loss": 0.4577, + "step": 5970 + }, + { + "epoch": 0.68, + "learning_rate": 4.8405792520389275e-06, + "loss": 0.4465, + "step": 5971 + }, + { + "epoch": 0.68, + "learning_rate": 4.837408682988305e-06, + "loss": 0.4765, + "step": 5972 + }, + { + "epoch": 0.68, + "learning_rate": 4.83423882132795e-06, + "loss": 0.4499, + "step": 5973 + }, + { + "epoch": 0.68, + "learning_rate": 4.831069667492209e-06, + "loss": 
0.4669, + "step": 5974 + }, + { + "epoch": 0.68, + "learning_rate": 4.8279012219153284e-06, + "loss": 0.479, + "step": 5975 + }, + { + "epoch": 0.68, + "learning_rate": 4.8247334850314495e-06, + "loss": 0.4517, + "step": 5976 + }, + { + "epoch": 0.68, + "learning_rate": 4.821566457274632e-06, + "loss": 0.4491, + "step": 5977 + }, + { + "epoch": 0.68, + "learning_rate": 4.818400139078824e-06, + "loss": 0.4552, + "step": 5978 + }, + { + "epoch": 0.68, + "learning_rate": 4.815234530877879e-06, + "loss": 0.4508, + "step": 5979 + }, + { + "epoch": 0.68, + "learning_rate": 4.812069633105563e-06, + "loss": 0.4679, + "step": 5980 + }, + { + "epoch": 0.68, + "learning_rate": 4.808905446195532e-06, + "loss": 0.4566, + "step": 5981 + }, + { + "epoch": 0.68, + "learning_rate": 4.80574197058135e-06, + "loss": 0.4555, + "step": 5982 + }, + { + "epoch": 0.68, + "learning_rate": 4.802579206696486e-06, + "loss": 0.4574, + "step": 5983 + }, + { + "epoch": 0.68, + "learning_rate": 4.7994171549743085e-06, + "loss": 0.4367, + "step": 5984 + }, + { + "epoch": 0.68, + "learning_rate": 4.796255815848094e-06, + "loss": 0.4542, + "step": 5985 + }, + { + "epoch": 0.68, + "learning_rate": 4.7930951897510126e-06, + "loss": 0.4746, + "step": 5986 + }, + { + "epoch": 0.68, + "learning_rate": 4.7899352771161355e-06, + "loss": 0.4405, + "step": 5987 + }, + { + "epoch": 0.68, + "learning_rate": 4.786776078376451e-06, + "loss": 0.4694, + "step": 5988 + }, + { + "epoch": 0.68, + "learning_rate": 4.783617593964831e-06, + "loss": 0.466, + "step": 5989 + }, + { + "epoch": 0.68, + "learning_rate": 4.7804598243140664e-06, + "loss": 0.4495, + "step": 5990 + }, + { + "epoch": 0.68, + "learning_rate": 4.777302769856838e-06, + "loss": 0.4656, + "step": 5991 + }, + { + "epoch": 0.68, + "learning_rate": 4.7741464310257305e-06, + "loss": 0.4563, + "step": 5992 + }, + { + "epoch": 0.68, + "learning_rate": 4.770990808253234e-06, + "loss": 0.4475, + "step": 5993 + }, + { + "epoch": 0.69, + "learning_rate": 
4.767835901971745e-06, + "loss": 0.4664, + "step": 5994 + }, + { + "epoch": 0.69, + "learning_rate": 4.764681712613547e-06, + "loss": 0.4534, + "step": 5995 + }, + { + "epoch": 0.69, + "learning_rate": 4.761528240610842e-06, + "loss": 0.4654, + "step": 5996 + }, + { + "epoch": 0.69, + "learning_rate": 4.758375486395721e-06, + "loss": 0.4525, + "step": 5997 + }, + { + "epoch": 0.69, + "learning_rate": 4.755223450400186e-06, + "loss": 0.4443, + "step": 5998 + }, + { + "epoch": 0.69, + "learning_rate": 4.752072133056135e-06, + "loss": 0.456, + "step": 5999 + }, + { + "epoch": 0.69, + "learning_rate": 4.748921534795365e-06, + "loss": 0.459, + "step": 6000 + }, + { + "epoch": 0.69, + "learning_rate": 4.745771656049584e-06, + "loss": 0.4575, + "step": 6001 + }, + { + "epoch": 0.69, + "learning_rate": 4.742622497250389e-06, + "loss": 0.4503, + "step": 6002 + }, + { + "epoch": 0.69, + "learning_rate": 4.739474058829288e-06, + "loss": 0.451, + "step": 6003 + }, + { + "epoch": 0.69, + "learning_rate": 4.736326341217694e-06, + "loss": 0.4712, + "step": 6004 + }, + { + "epoch": 0.69, + "learning_rate": 4.7331793448469045e-06, + "loss": 0.4605, + "step": 6005 + }, + { + "epoch": 0.69, + "learning_rate": 4.730033070148135e-06, + "loss": 0.4639, + "step": 6006 + }, + { + "epoch": 0.69, + "learning_rate": 4.726887517552495e-06, + "loss": 0.4432, + "step": 6007 + }, + { + "epoch": 0.69, + "learning_rate": 4.723742687490988e-06, + "loss": 0.4674, + "step": 6008 + }, + { + "epoch": 0.69, + "learning_rate": 4.7205985803945375e-06, + "loss": 0.4835, + "step": 6009 + }, + { + "epoch": 0.69, + "learning_rate": 4.717455196693945e-06, + "loss": 0.4496, + "step": 6010 + }, + { + "epoch": 0.69, + "learning_rate": 4.7143125368199335e-06, + "loss": 0.4478, + "step": 6011 + }, + { + "epoch": 0.69, + "learning_rate": 4.71117060120311e-06, + "loss": 0.4502, + "step": 6012 + }, + { + "epoch": 0.69, + "learning_rate": 4.708029390273994e-06, + "loss": 0.4507, + "step": 6013 + }, + { + "epoch": 0.69, 
+ "learning_rate": 4.704888904463003e-06, + "loss": 0.4554, + "step": 6014 + }, + { + "epoch": 0.69, + "learning_rate": 4.701749144200449e-06, + "loss": 0.4624, + "step": 6015 + }, + { + "epoch": 0.69, + "learning_rate": 4.698610109916556e-06, + "loss": 0.4803, + "step": 6016 + }, + { + "epoch": 0.69, + "learning_rate": 4.695471802041437e-06, + "loss": 0.4823, + "step": 6017 + }, + { + "epoch": 0.69, + "learning_rate": 4.692334221005108e-06, + "loss": 0.4486, + "step": 6018 + }, + { + "epoch": 0.69, + "learning_rate": 4.689197367237494e-06, + "loss": 0.4389, + "step": 6019 + }, + { + "epoch": 0.69, + "learning_rate": 4.686061241168406e-06, + "loss": 0.4453, + "step": 6020 + }, + { + "epoch": 0.69, + "learning_rate": 4.6829258432275685e-06, + "loss": 0.4557, + "step": 6021 + }, + { + "epoch": 0.69, + "learning_rate": 4.679791173844604e-06, + "loss": 0.4706, + "step": 6022 + }, + { + "epoch": 0.69, + "learning_rate": 4.676657233449025e-06, + "loss": 0.4546, + "step": 6023 + }, + { + "epoch": 0.69, + "learning_rate": 4.673524022470259e-06, + "loss": 0.4494, + "step": 6024 + }, + { + "epoch": 0.69, + "learning_rate": 4.67039154133762e-06, + "loss": 0.4828, + "step": 6025 + }, + { + "epoch": 0.69, + "learning_rate": 4.667259790480327e-06, + "loss": 0.459, + "step": 6026 + }, + { + "epoch": 0.69, + "learning_rate": 4.664128770327506e-06, + "loss": 0.4288, + "step": 6027 + }, + { + "epoch": 0.69, + "learning_rate": 4.66099848130817e-06, + "loss": 0.4452, + "step": 6028 + }, + { + "epoch": 0.69, + "learning_rate": 4.657868923851244e-06, + "loss": 0.4451, + "step": 6029 + }, + { + "epoch": 0.69, + "learning_rate": 4.6547400983855415e-06, + "loss": 0.4561, + "step": 6030 + }, + { + "epoch": 0.69, + "learning_rate": 4.651612005339786e-06, + "loss": 0.461, + "step": 6031 + }, + { + "epoch": 0.69, + "learning_rate": 4.648484645142597e-06, + "loss": 0.4506, + "step": 6032 + }, + { + "epoch": 0.69, + "learning_rate": 4.645358018222486e-06, + "loss": 0.4559, + "step": 6033 + }, + 
{ + "epoch": 0.69, + "learning_rate": 4.642232125007881e-06, + "loss": 0.4586, + "step": 6034 + }, + { + "epoch": 0.69, + "learning_rate": 4.639106965927093e-06, + "loss": 0.4874, + "step": 6035 + }, + { + "epoch": 0.69, + "learning_rate": 4.635982541408334e-06, + "loss": 0.454, + "step": 6036 + }, + { + "epoch": 0.69, + "learning_rate": 4.632858851879729e-06, + "loss": 0.4698, + "step": 6037 + }, + { + "epoch": 0.69, + "learning_rate": 4.629735897769289e-06, + "loss": 0.4706, + "step": 6038 + }, + { + "epoch": 0.69, + "learning_rate": 4.626613679504924e-06, + "loss": 0.4483, + "step": 6039 + }, + { + "epoch": 0.69, + "learning_rate": 4.623492197514453e-06, + "loss": 0.476, + "step": 6040 + }, + { + "epoch": 0.69, + "learning_rate": 4.620371452225587e-06, + "loss": 0.464, + "step": 6041 + }, + { + "epoch": 0.69, + "learning_rate": 4.6172514440659435e-06, + "loss": 0.4384, + "step": 6042 + }, + { + "epoch": 0.69, + "learning_rate": 4.614132173463027e-06, + "loss": 0.4674, + "step": 6043 + }, + { + "epoch": 0.69, + "learning_rate": 4.611013640844245e-06, + "loss": 0.4489, + "step": 6044 + }, + { + "epoch": 0.69, + "learning_rate": 4.607895846636914e-06, + "loss": 0.4501, + "step": 6045 + }, + { + "epoch": 0.69, + "learning_rate": 4.604778791268233e-06, + "loss": 0.4437, + "step": 6046 + }, + { + "epoch": 0.69, + "learning_rate": 4.601662475165316e-06, + "loss": 0.4566, + "step": 6047 + }, + { + "epoch": 0.69, + "learning_rate": 4.598546898755164e-06, + "loss": 0.4739, + "step": 6048 + }, + { + "epoch": 0.69, + "learning_rate": 4.595432062464678e-06, + "loss": 0.454, + "step": 6049 + }, + { + "epoch": 0.69, + "learning_rate": 4.592317966720661e-06, + "loss": 0.432, + "step": 6050 + }, + { + "epoch": 0.69, + "learning_rate": 4.589204611949819e-06, + "loss": 0.4652, + "step": 6051 + }, + { + "epoch": 0.69, + "learning_rate": 4.5860919985787454e-06, + "loss": 0.4672, + "step": 6052 + }, + { + "epoch": 0.69, + "learning_rate": 4.582980127033943e-06, + "loss": 0.4432, + 
"step": 6053 + }, + { + "epoch": 0.69, + "learning_rate": 4.5798689977418e-06, + "loss": 0.451, + "step": 6054 + }, + { + "epoch": 0.69, + "learning_rate": 4.576758611128619e-06, + "loss": 0.4519, + "step": 6055 + }, + { + "epoch": 0.69, + "learning_rate": 4.573648967620589e-06, + "loss": 0.4571, + "step": 6056 + }, + { + "epoch": 0.69, + "learning_rate": 4.570540067643796e-06, + "loss": 0.4493, + "step": 6057 + }, + { + "epoch": 0.69, + "learning_rate": 4.567431911624236e-06, + "loss": 0.481, + "step": 6058 + }, + { + "epoch": 0.69, + "learning_rate": 4.56432449998779e-06, + "loss": 0.4614, + "step": 6059 + }, + { + "epoch": 0.69, + "learning_rate": 4.5612178331602445e-06, + "loss": 0.4561, + "step": 6060 + }, + { + "epoch": 0.69, + "learning_rate": 4.558111911567287e-06, + "loss": 0.4479, + "step": 6061 + }, + { + "epoch": 0.69, + "learning_rate": 4.55500673563449e-06, + "loss": 0.4753, + "step": 6062 + }, + { + "epoch": 0.69, + "learning_rate": 4.55190230578734e-06, + "loss": 0.4457, + "step": 6063 + }, + { + "epoch": 0.69, + "learning_rate": 4.54879862245121e-06, + "loss": 0.4726, + "step": 6064 + }, + { + "epoch": 0.69, + "learning_rate": 4.545695686051369e-06, + "loss": 0.4457, + "step": 6065 + }, + { + "epoch": 0.69, + "learning_rate": 4.542593497012996e-06, + "loss": 0.4729, + "step": 6066 + }, + { + "epoch": 0.69, + "learning_rate": 4.539492055761153e-06, + "loss": 0.4398, + "step": 6067 + }, + { + "epoch": 0.69, + "learning_rate": 4.536391362720816e-06, + "loss": 0.4609, + "step": 6068 + }, + { + "epoch": 0.69, + "learning_rate": 4.533291418316837e-06, + "loss": 0.4608, + "step": 6069 + }, + { + "epoch": 0.69, + "learning_rate": 4.530192222973987e-06, + "loss": 0.443, + "step": 6070 + }, + { + "epoch": 0.69, + "learning_rate": 4.527093777116925e-06, + "loss": 0.4677, + "step": 6071 + }, + { + "epoch": 0.69, + "learning_rate": 4.523996081170201e-06, + "loss": 0.4473, + "step": 6072 + }, + { + "epoch": 0.69, + "learning_rate": 4.520899135558276e-06, + 
"loss": 0.455, + "step": 6073 + }, + { + "epoch": 0.69, + "learning_rate": 4.5178029407054965e-06, + "loss": 0.4453, + "step": 6074 + }, + { + "epoch": 0.69, + "learning_rate": 4.514707497036107e-06, + "loss": 0.4705, + "step": 6075 + }, + { + "epoch": 0.69, + "learning_rate": 4.511612804974259e-06, + "loss": 0.4396, + "step": 6076 + }, + { + "epoch": 0.69, + "learning_rate": 4.508518864943989e-06, + "loss": 0.4703, + "step": 6077 + }, + { + "epoch": 0.69, + "learning_rate": 4.505425677369238e-06, + "loss": 0.4372, + "step": 6078 + }, + { + "epoch": 0.69, + "learning_rate": 4.5023332426738445e-06, + "loss": 0.4476, + "step": 6079 + }, + { + "epoch": 0.69, + "learning_rate": 4.4992415612815355e-06, + "loss": 0.4527, + "step": 6080 + }, + { + "epoch": 0.69, + "learning_rate": 4.496150633615947e-06, + "loss": 0.4683, + "step": 6081 + }, + { + "epoch": 0.7, + "learning_rate": 4.4930604601006025e-06, + "loss": 0.4637, + "step": 6082 + }, + { + "epoch": 0.7, + "learning_rate": 4.489971041158919e-06, + "loss": 0.4505, + "step": 6083 + }, + { + "epoch": 0.7, + "learning_rate": 4.486882377214226e-06, + "loss": 0.4505, + "step": 6084 + }, + { + "epoch": 0.7, + "learning_rate": 4.483794468689728e-06, + "loss": 0.4463, + "step": 6085 + }, + { + "epoch": 0.7, + "learning_rate": 4.480707316008549e-06, + "loss": 0.4404, + "step": 6086 + }, + { + "epoch": 0.7, + "learning_rate": 4.477620919593688e-06, + "loss": 0.4671, + "step": 6087 + }, + { + "epoch": 0.7, + "learning_rate": 4.474535279868055e-06, + "loss": 0.449, + "step": 6088 + }, + { + "epoch": 0.7, + "learning_rate": 4.4714503972544545e-06, + "loss": 0.4668, + "step": 6089 + }, + { + "epoch": 0.7, + "learning_rate": 4.4683662721755805e-06, + "loss": 0.4563, + "step": 6090 + }, + { + "epoch": 0.7, + "learning_rate": 4.465282905054025e-06, + "loss": 0.4432, + "step": 6091 + }, + { + "epoch": 0.7, + "learning_rate": 4.462200296312284e-06, + "loss": 0.4485, + "step": 6092 + }, + { + "epoch": 0.7, + "learning_rate": 
4.459118446372736e-06, + "loss": 0.4552, + "step": 6093 + }, + { + "epoch": 0.7, + "learning_rate": 4.45603735565767e-06, + "loss": 0.4486, + "step": 6094 + }, + { + "epoch": 0.7, + "learning_rate": 4.4529570245892625e-06, + "loss": 0.4531, + "step": 6095 + }, + { + "epoch": 0.7, + "learning_rate": 4.449877453589584e-06, + "loss": 0.4359, + "step": 6096 + }, + { + "epoch": 0.7, + "learning_rate": 4.446798643080608e-06, + "loss": 0.4864, + "step": 6097 + }, + { + "epoch": 0.7, + "learning_rate": 4.443720593484198e-06, + "loss": 0.4502, + "step": 6098 + }, + { + "epoch": 0.7, + "learning_rate": 4.440643305222121e-06, + "loss": 0.4621, + "step": 6099 + }, + { + "epoch": 0.7, + "learning_rate": 4.43756677871603e-06, + "loss": 0.4472, + "step": 6100 + }, + { + "epoch": 0.7, + "learning_rate": 4.4344910143874755e-06, + "loss": 0.4595, + "step": 6101 + }, + { + "epoch": 0.7, + "learning_rate": 4.431416012657912e-06, + "loss": 0.4382, + "step": 6102 + }, + { + "epoch": 0.7, + "learning_rate": 4.42834177394868e-06, + "loss": 0.4416, + "step": 6103 + }, + { + "epoch": 0.7, + "learning_rate": 4.425268298681015e-06, + "loss": 0.4432, + "step": 6104 + }, + { + "epoch": 0.7, + "learning_rate": 4.422195587276058e-06, + "loss": 0.4588, + "step": 6105 + }, + { + "epoch": 0.7, + "learning_rate": 4.419123640154834e-06, + "loss": 0.4484, + "step": 6106 + }, + { + "epoch": 0.7, + "learning_rate": 4.416052457738271e-06, + "loss": 0.4875, + "step": 6107 + }, + { + "epoch": 0.7, + "learning_rate": 4.412982040447193e-06, + "loss": 0.4492, + "step": 6108 + }, + { + "epoch": 0.7, + "learning_rate": 4.409912388702308e-06, + "loss": 0.4476, + "step": 6109 + }, + { + "epoch": 0.7, + "learning_rate": 4.406843502924235e-06, + "loss": 0.4601, + "step": 6110 + }, + { + "epoch": 0.7, + "learning_rate": 4.403775383533472e-06, + "loss": 0.4618, + "step": 6111 + }, + { + "epoch": 0.7, + "learning_rate": 4.400708030950428e-06, + "loss": 0.4439, + "step": 6112 + }, + { + "epoch": 0.7, + "learning_rate": 
4.397641445595393e-06, + "loss": 0.4651, + "step": 6113 + }, + { + "epoch": 0.7, + "learning_rate": 4.394575627888558e-06, + "loss": 0.4394, + "step": 6114 + }, + { + "epoch": 0.7, + "learning_rate": 4.391510578250011e-06, + "loss": 0.4715, + "step": 6115 + }, + { + "epoch": 0.7, + "learning_rate": 4.388446297099728e-06, + "loss": 0.4585, + "step": 6116 + }, + { + "epoch": 0.7, + "learning_rate": 4.385382784857587e-06, + "loss": 0.4418, + "step": 6117 + }, + { + "epoch": 0.7, + "learning_rate": 4.382320041943361e-06, + "loss": 0.4464, + "step": 6118 + }, + { + "epoch": 0.7, + "learning_rate": 4.379258068776706e-06, + "loss": 0.4421, + "step": 6119 + }, + { + "epoch": 0.7, + "learning_rate": 4.3761968657771905e-06, + "loss": 0.4487, + "step": 6120 + }, + { + "epoch": 0.7, + "learning_rate": 4.3731364333642615e-06, + "loss": 0.4634, + "step": 6121 + }, + { + "epoch": 0.7, + "learning_rate": 4.370076771957264e-06, + "loss": 0.4363, + "step": 6122 + }, + { + "epoch": 0.7, + "learning_rate": 4.367017881975446e-06, + "loss": 0.4684, + "step": 6123 + }, + { + "epoch": 0.7, + "learning_rate": 4.363959763837938e-06, + "loss": 0.4628, + "step": 6124 + }, + { + "epoch": 0.7, + "learning_rate": 4.360902417963777e-06, + "loss": 0.4726, + "step": 6125 + }, + { + "epoch": 0.7, + "learning_rate": 4.357845844771881e-06, + "loss": 0.4525, + "step": 6126 + }, + { + "epoch": 0.7, + "learning_rate": 4.354790044681072e-06, + "loss": 0.4496, + "step": 6127 + }, + { + "epoch": 0.7, + "learning_rate": 4.351735018110066e-06, + "loss": 0.4513, + "step": 6128 + }, + { + "epoch": 0.7, + "learning_rate": 4.348680765477463e-06, + "loss": 0.4627, + "step": 6129 + }, + { + "epoch": 0.7, + "learning_rate": 4.3456272872017725e-06, + "loss": 0.4446, + "step": 6130 + }, + { + "epoch": 0.7, + "learning_rate": 4.342574583701382e-06, + "loss": 0.4567, + "step": 6131 + }, + { + "epoch": 0.7, + "learning_rate": 4.33952265539458e-06, + "loss": 0.4456, + "step": 6132 + }, + { + "epoch": 0.7, + 
"learning_rate": 4.336471502699554e-06, + "loss": 0.4527, + "step": 6133 + }, + { + "epoch": 0.7, + "learning_rate": 4.333421126034374e-06, + "loss": 0.4668, + "step": 6134 + }, + { + "epoch": 0.7, + "learning_rate": 4.330371525817012e-06, + "loss": 0.4575, + "step": 6135 + }, + { + "epoch": 0.7, + "learning_rate": 4.327322702465335e-06, + "loss": 0.4421, + "step": 6136 + }, + { + "epoch": 0.7, + "learning_rate": 4.324274656397095e-06, + "loss": 0.4728, + "step": 6137 + }, + { + "epoch": 0.7, + "learning_rate": 4.321227388029947e-06, + "loss": 0.4593, + "step": 6138 + }, + { + "epoch": 0.7, + "learning_rate": 4.318180897781432e-06, + "loss": 0.4651, + "step": 6139 + }, + { + "epoch": 0.7, + "learning_rate": 4.315135186068984e-06, + "loss": 0.4454, + "step": 6140 + }, + { + "epoch": 0.7, + "learning_rate": 4.312090253309941e-06, + "loss": 0.4575, + "step": 6141 + }, + { + "epoch": 0.7, + "learning_rate": 4.309046099921518e-06, + "loss": 0.4456, + "step": 6142 + }, + { + "epoch": 0.7, + "learning_rate": 4.306002726320839e-06, + "loss": 0.4633, + "step": 6143 + }, + { + "epoch": 0.7, + "learning_rate": 4.302960132924909e-06, + "loss": 0.4532, + "step": 6144 + }, + { + "epoch": 0.7, + "learning_rate": 4.299918320150634e-06, + "loss": 0.4513, + "step": 6145 + }, + { + "epoch": 0.7, + "learning_rate": 4.296877288414815e-06, + "loss": 0.4538, + "step": 6146 + }, + { + "epoch": 0.7, + "learning_rate": 4.2938370381341355e-06, + "loss": 0.445, + "step": 6147 + }, + { + "epoch": 0.7, + "learning_rate": 4.290797569725175e-06, + "loss": 0.4534, + "step": 6148 + }, + { + "epoch": 0.7, + "learning_rate": 4.287758883604415e-06, + "loss": 0.4542, + "step": 6149 + }, + { + "epoch": 0.7, + "learning_rate": 4.284720980188216e-06, + "loss": 0.4479, + "step": 6150 + }, + { + "epoch": 0.7, + "learning_rate": 4.281683859892849e-06, + "loss": 0.4849, + "step": 6151 + }, + { + "epoch": 0.7, + "learning_rate": 4.278647523134459e-06, + "loss": 0.4355, + "step": 6152 + }, + { + "epoch": 0.7, + 
"learning_rate": 4.275611970329092e-06, + "loss": 0.4748, + "step": 6153 + }, + { + "epoch": 0.7, + "learning_rate": 4.272577201892688e-06, + "loss": 0.4573, + "step": 6154 + }, + { + "epoch": 0.7, + "learning_rate": 4.269543218241079e-06, + "loss": 0.4502, + "step": 6155 + }, + { + "epoch": 0.7, + "learning_rate": 4.266510019789993e-06, + "loss": 0.464, + "step": 6156 + }, + { + "epoch": 0.7, + "learning_rate": 4.26347760695504e-06, + "loss": 0.4589, + "step": 6157 + }, + { + "epoch": 0.7, + "learning_rate": 4.260445980151725e-06, + "loss": 0.4541, + "step": 6158 + }, + { + "epoch": 0.7, + "learning_rate": 4.257415139795458e-06, + "loss": 0.4498, + "step": 6159 + }, + { + "epoch": 0.7, + "learning_rate": 4.2543850863015266e-06, + "loss": 0.451, + "step": 6160 + }, + { + "epoch": 0.7, + "learning_rate": 4.2513558200851115e-06, + "loss": 0.4428, + "step": 6161 + }, + { + "epoch": 0.7, + "learning_rate": 4.248327341561298e-06, + "loss": 0.4627, + "step": 6162 + }, + { + "epoch": 0.7, + "learning_rate": 4.245299651145048e-06, + "loss": 0.4586, + "step": 6163 + }, + { + "epoch": 0.7, + "learning_rate": 4.242272749251228e-06, + "loss": 0.4486, + "step": 6164 + }, + { + "epoch": 0.7, + "learning_rate": 4.239246636294591e-06, + "loss": 0.4699, + "step": 6165 + }, + { + "epoch": 0.7, + "learning_rate": 4.236221312689777e-06, + "loss": 0.4538, + "step": 6166 + }, + { + "epoch": 0.7, + "learning_rate": 4.2331967788513295e-06, + "loss": 0.4742, + "step": 6167 + }, + { + "epoch": 0.7, + "learning_rate": 4.230173035193671e-06, + "loss": 0.4524, + "step": 6168 + }, + { + "epoch": 0.71, + "learning_rate": 4.227150082131128e-06, + "loss": 0.4548, + "step": 6169 + }, + { + "epoch": 0.71, + "learning_rate": 4.2241279200779105e-06, + "loss": 0.4454, + "step": 6170 + }, + { + "epoch": 0.71, + "learning_rate": 4.221106549448116e-06, + "loss": 0.4473, + "step": 6171 + }, + { + "epoch": 0.71, + "learning_rate": 4.21808597065575e-06, + "loss": 0.4565, + "step": 6172 + }, + { + "epoch": 
0.71, + "learning_rate": 4.215066184114689e-06, + "loss": 0.4502, + "step": 6173 + }, + { + "epoch": 0.71, + "learning_rate": 4.212047190238716e-06, + "loss": 0.4569, + "step": 6174 + }, + { + "epoch": 0.71, + "learning_rate": 4.209028989441505e-06, + "loss": 0.4568, + "step": 6175 + }, + { + "epoch": 0.71, + "learning_rate": 4.2060115821366085e-06, + "loss": 0.4514, + "step": 6176 + }, + { + "epoch": 0.71, + "learning_rate": 4.202994968737487e-06, + "loss": 0.4507, + "step": 6177 + }, + { + "epoch": 0.71, + "learning_rate": 4.199979149657481e-06, + "loss": 0.4546, + "step": 6178 + }, + { + "epoch": 0.71, + "learning_rate": 4.196964125309818e-06, + "loss": 0.4484, + "step": 6179 + }, + { + "epoch": 0.71, + "learning_rate": 4.1939498961076345e-06, + "loss": 0.451, + "step": 6180 + }, + { + "epoch": 0.71, + "learning_rate": 4.190936462463937e-06, + "loss": 0.4628, + "step": 6181 + }, + { + "epoch": 0.71, + "learning_rate": 4.187923824791642e-06, + "loss": 0.4572, + "step": 6182 + }, + { + "epoch": 0.71, + "learning_rate": 4.184911983503541e-06, + "loss": 0.483, + "step": 6183 + }, + { + "epoch": 0.71, + "learning_rate": 4.1819009390123276e-06, + "loss": 0.4448, + "step": 6184 + }, + { + "epoch": 0.71, + "learning_rate": 4.178890691730585e-06, + "loss": 0.4624, + "step": 6185 + }, + { + "epoch": 0.71, + "learning_rate": 4.17588124207078e-06, + "loss": 0.4336, + "step": 6186 + }, + { + "epoch": 0.71, + "learning_rate": 4.172872590445273e-06, + "loss": 0.4823, + "step": 6187 + }, + { + "epoch": 0.71, + "learning_rate": 4.169864737266321e-06, + "loss": 0.4495, + "step": 6188 + }, + { + "epoch": 0.71, + "learning_rate": 4.166857682946061e-06, + "loss": 0.4592, + "step": 6189 + }, + { + "epoch": 0.71, + "learning_rate": 4.163851427896534e-06, + "loss": 0.4319, + "step": 6190 + }, + { + "epoch": 0.71, + "learning_rate": 4.160845972529656e-06, + "loss": 0.4667, + "step": 6191 + }, + { + "epoch": 0.71, + "learning_rate": 4.157841317257245e-06, + "loss": 0.4676, + "step": 6192 
+ }, + { + "epoch": 0.71, + "learning_rate": 4.154837462491012e-06, + "loss": 0.4332, + "step": 6193 + }, + { + "epoch": 0.71, + "learning_rate": 4.151834408642542e-06, + "loss": 0.4331, + "step": 6194 + }, + { + "epoch": 0.71, + "learning_rate": 4.148832156123329e-06, + "loss": 0.4461, + "step": 6195 + }, + { + "epoch": 0.71, + "learning_rate": 4.145830705344746e-06, + "loss": 0.4527, + "step": 6196 + }, + { + "epoch": 0.71, + "learning_rate": 4.142830056718052e-06, + "loss": 0.4627, + "step": 6197 + }, + { + "epoch": 0.71, + "learning_rate": 4.139830210654413e-06, + "loss": 0.4456, + "step": 6198 + }, + { + "epoch": 0.71, + "learning_rate": 4.136831167564867e-06, + "loss": 0.4483, + "step": 6199 + }, + { + "epoch": 0.71, + "learning_rate": 4.133832927860356e-06, + "loss": 0.4507, + "step": 6200 + }, + { + "epoch": 0.71, + "learning_rate": 4.130835491951699e-06, + "loss": 0.4712, + "step": 6201 + }, + { + "epoch": 0.71, + "learning_rate": 4.127838860249617e-06, + "loss": 0.4428, + "step": 6202 + }, + { + "epoch": 0.71, + "learning_rate": 4.124843033164716e-06, + "loss": 0.4576, + "step": 6203 + }, + { + "epoch": 0.71, + "learning_rate": 4.12184801110749e-06, + "loss": 0.4357, + "step": 6204 + }, + { + "epoch": 0.71, + "learning_rate": 4.11885379448832e-06, + "loss": 0.4641, + "step": 6205 + }, + { + "epoch": 0.71, + "learning_rate": 4.115860383717486e-06, + "loss": 0.4702, + "step": 6206 + }, + { + "epoch": 0.71, + "learning_rate": 4.1128677792051465e-06, + "loss": 0.4636, + "step": 6207 + }, + { + "epoch": 0.71, + "learning_rate": 4.109875981361363e-06, + "loss": 0.4425, + "step": 6208 + }, + { + "epoch": 0.71, + "learning_rate": 4.106884990596073e-06, + "loss": 0.4679, + "step": 6209 + }, + { + "epoch": 0.71, + "learning_rate": 4.103894807319106e-06, + "loss": 0.4445, + "step": 6210 + }, + { + "epoch": 0.71, + "learning_rate": 4.100905431940189e-06, + "loss": 0.4616, + "step": 6211 + }, + { + "epoch": 0.71, + "learning_rate": 4.097916864868932e-06, + "loss": 
0.4389, + "step": 6212 + }, + { + "epoch": 0.71, + "learning_rate": 4.0949291065148375e-06, + "loss": 0.4391, + "step": 6213 + }, + { + "epoch": 0.71, + "learning_rate": 4.091942157287294e-06, + "loss": 0.4661, + "step": 6214 + }, + { + "epoch": 0.71, + "learning_rate": 4.088956017595575e-06, + "loss": 0.453, + "step": 6215 + }, + { + "epoch": 0.71, + "learning_rate": 4.085970687848857e-06, + "loss": 0.4618, + "step": 6216 + }, + { + "epoch": 0.71, + "learning_rate": 4.082986168456192e-06, + "loss": 0.4677, + "step": 6217 + }, + { + "epoch": 0.71, + "learning_rate": 4.080002459826523e-06, + "loss": 0.439, + "step": 6218 + }, + { + "epoch": 0.71, + "learning_rate": 4.077019562368691e-06, + "loss": 0.4778, + "step": 6219 + }, + { + "epoch": 0.71, + "learning_rate": 4.074037476491414e-06, + "loss": 0.4595, + "step": 6220 + }, + { + "epoch": 0.71, + "learning_rate": 4.071056202603305e-06, + "loss": 0.446, + "step": 6221 + }, + { + "epoch": 0.71, + "learning_rate": 4.0680757411128714e-06, + "loss": 0.4495, + "step": 6222 + }, + { + "epoch": 0.71, + "learning_rate": 4.0650960924284945e-06, + "loss": 0.4578, + "step": 6223 + }, + { + "epoch": 0.71, + "learning_rate": 4.06211725695846e-06, + "loss": 0.4525, + "step": 6224 + }, + { + "epoch": 0.71, + "learning_rate": 4.059139235110928e-06, + "loss": 0.4576, + "step": 6225 + }, + { + "epoch": 0.71, + "learning_rate": 4.056162027293962e-06, + "loss": 0.4491, + "step": 6226 + }, + { + "epoch": 0.71, + "learning_rate": 4.053185633915501e-06, + "loss": 0.4703, + "step": 6227 + }, + { + "epoch": 0.71, + "learning_rate": 4.050210055383373e-06, + "loss": 0.447, + "step": 6228 + }, + { + "epoch": 0.71, + "learning_rate": 4.047235292105308e-06, + "loss": 0.4667, + "step": 6229 + }, + { + "epoch": 0.71, + "learning_rate": 4.0442613444889065e-06, + "loss": 0.4357, + "step": 6230 + }, + { + "epoch": 0.71, + "learning_rate": 4.04128821294167e-06, + "loss": 0.455, + "step": 6231 + }, + { + "epoch": 0.71, + "learning_rate": 
4.0383158978709865e-06, + "loss": 0.4835, + "step": 6232 + }, + { + "epoch": 0.71, + "learning_rate": 4.035344399684124e-06, + "loss": 0.4562, + "step": 6233 + }, + { + "epoch": 0.71, + "learning_rate": 4.032373718788248e-06, + "loss": 0.4429, + "step": 6234 + }, + { + "epoch": 0.71, + "learning_rate": 4.029403855590409e-06, + "loss": 0.4534, + "step": 6235 + }, + { + "epoch": 0.71, + "learning_rate": 4.026434810497538e-06, + "loss": 0.4701, + "step": 6236 + }, + { + "epoch": 0.71, + "learning_rate": 4.023466583916469e-06, + "loss": 0.4494, + "step": 6237 + }, + { + "epoch": 0.71, + "learning_rate": 4.020499176253907e-06, + "loss": 0.4445, + "step": 6238 + }, + { + "epoch": 0.71, + "learning_rate": 4.017532587916461e-06, + "loss": 0.4381, + "step": 6239 + }, + { + "epoch": 0.71, + "learning_rate": 4.014566819310612e-06, + "loss": 0.4701, + "step": 6240 + }, + { + "epoch": 0.71, + "learning_rate": 4.011601870842739e-06, + "loss": 0.4726, + "step": 6241 + }, + { + "epoch": 0.71, + "learning_rate": 4.008637742919114e-06, + "loss": 0.4449, + "step": 6242 + }, + { + "epoch": 0.71, + "learning_rate": 4.005674435945881e-06, + "loss": 0.461, + "step": 6243 + }, + { + "epoch": 0.71, + "learning_rate": 4.0027119503290776e-06, + "loss": 0.4431, + "step": 6244 + }, + { + "epoch": 0.71, + "learning_rate": 3.999750286474637e-06, + "loss": 0.4566, + "step": 6245 + }, + { + "epoch": 0.71, + "learning_rate": 3.996789444788366e-06, + "loss": 0.4659, + "step": 6246 + }, + { + "epoch": 0.71, + "learning_rate": 3.993829425675974e-06, + "loss": 0.4216, + "step": 6247 + }, + { + "epoch": 0.71, + "learning_rate": 3.99087022954304e-06, + "loss": 0.4622, + "step": 6248 + }, + { + "epoch": 0.71, + "learning_rate": 3.987911856795047e-06, + "loss": 0.4577, + "step": 6249 + }, + { + "epoch": 0.71, + "learning_rate": 3.98495430783736e-06, + "loss": 0.4687, + "step": 6250 + }, + { + "epoch": 0.71, + "learning_rate": 3.981997583075222e-06, + "loss": 0.4482, + "step": 6251 + }, + { + "epoch": 0.71, 
+ "learning_rate": 3.979041682913777e-06, + "loss": 0.4611, + "step": 6252 + }, + { + "epoch": 0.71, + "learning_rate": 3.976086607758047e-06, + "loss": 0.4587, + "step": 6253 + }, + { + "epoch": 0.71, + "learning_rate": 3.973132358012939e-06, + "loss": 0.4507, + "step": 6254 + }, + { + "epoch": 0.71, + "learning_rate": 3.970178934083259e-06, + "loss": 0.4569, + "step": 6255 + }, + { + "epoch": 0.71, + "learning_rate": 3.967226336373686e-06, + "loss": 0.4404, + "step": 6256 + }, + { + "epoch": 0.72, + "learning_rate": 3.964274565288792e-06, + "loss": 0.4412, + "step": 6257 + }, + { + "epoch": 0.72, + "learning_rate": 3.961323621233036e-06, + "loss": 0.465, + "step": 6258 + }, + { + "epoch": 0.72, + "learning_rate": 3.9583735046107655e-06, + "loss": 0.4492, + "step": 6259 + }, + { + "epoch": 0.72, + "learning_rate": 3.9554242158262134e-06, + "loss": 0.4508, + "step": 6260 + }, + { + "epoch": 0.72, + "learning_rate": 3.952475755283497e-06, + "loss": 0.4388, + "step": 6261 + }, + { + "epoch": 0.72, + "learning_rate": 3.949528123386617e-06, + "loss": 0.4523, + "step": 6262 + }, + { + "epoch": 0.72, + "learning_rate": 3.94658132053947e-06, + "loss": 0.4702, + "step": 6263 + }, + { + "epoch": 0.72, + "learning_rate": 3.943635347145829e-06, + "loss": 0.4598, + "step": 6264 + }, + { + "epoch": 0.72, + "learning_rate": 3.940690203609364e-06, + "loss": 0.4392, + "step": 6265 + }, + { + "epoch": 0.72, + "learning_rate": 3.937745890333623e-06, + "loss": 0.4792, + "step": 6266 + }, + { + "epoch": 0.72, + "learning_rate": 3.934802407722038e-06, + "loss": 0.4307, + "step": 6267 + }, + { + "epoch": 0.72, + "learning_rate": 3.931859756177936e-06, + "loss": 0.4622, + "step": 6268 + }, + { + "epoch": 0.72, + "learning_rate": 3.928917936104529e-06, + "loss": 0.4523, + "step": 6269 + }, + { + "epoch": 0.72, + "learning_rate": 3.925976947904906e-06, + "loss": 0.4647, + "step": 6270 + }, + { + "epoch": 0.72, + "learning_rate": 3.923036791982053e-06, + "loss": 0.4302, + "step": 6271 + }, 
+ { + "epoch": 0.72, + "learning_rate": 3.920097468738833e-06, + "loss": 0.4587, + "step": 6272 + }, + { + "epoch": 0.72, + "learning_rate": 3.917158978578003e-06, + "loss": 0.4541, + "step": 6273 + }, + { + "epoch": 0.72, + "learning_rate": 3.914221321902199e-06, + "loss": 0.4458, + "step": 6274 + }, + { + "epoch": 0.72, + "learning_rate": 3.911284499113943e-06, + "loss": 0.4601, + "step": 6275 + }, + { + "epoch": 0.72, + "learning_rate": 3.908348510615653e-06, + "loss": 0.4432, + "step": 6276 + }, + { + "epoch": 0.72, + "learning_rate": 3.905413356809615e-06, + "loss": 0.4498, + "step": 6277 + }, + { + "epoch": 0.72, + "learning_rate": 3.902479038098017e-06, + "loss": 0.4558, + "step": 6278 + }, + { + "epoch": 0.72, + "learning_rate": 3.899545554882927e-06, + "loss": 0.4669, + "step": 6279 + }, + { + "epoch": 0.72, + "learning_rate": 3.896612907566294e-06, + "loss": 0.4479, + "step": 6280 + }, + { + "epoch": 0.72, + "learning_rate": 3.893681096549961e-06, + "loss": 0.4509, + "step": 6281 + }, + { + "epoch": 0.72, + "learning_rate": 3.890750122235645e-06, + "loss": 0.4552, + "step": 6282 + }, + { + "epoch": 0.72, + "learning_rate": 3.887819985024962e-06, + "loss": 0.4569, + "step": 6283 + }, + { + "epoch": 0.72, + "learning_rate": 3.884890685319402e-06, + "loss": 0.4695, + "step": 6284 + }, + { + "epoch": 0.72, + "learning_rate": 3.881962223520343e-06, + "loss": 0.4291, + "step": 6285 + }, + { + "epoch": 0.72, + "learning_rate": 3.879034600029054e-06, + "loss": 0.4662, + "step": 6286 + }, + { + "epoch": 0.72, + "learning_rate": 3.876107815246678e-06, + "loss": 0.4642, + "step": 6287 + }, + { + "epoch": 0.72, + "learning_rate": 3.873181869574256e-06, + "loss": 0.4396, + "step": 6288 + }, + { + "epoch": 0.72, + "learning_rate": 3.87025676341271e-06, + "loss": 0.4598, + "step": 6289 + }, + { + "epoch": 0.72, + "learning_rate": 3.867332497162836e-06, + "loss": 0.4541, + "step": 6290 + }, + { + "epoch": 0.72, + "learning_rate": 3.864409071225334e-06, + "loss": 0.4548, 
+ "step": 6291 + }, + { + "epoch": 0.72, + "learning_rate": 3.861486486000771e-06, + "loss": 0.4845, + "step": 6292 + }, + { + "epoch": 0.72, + "learning_rate": 3.858564741889608e-06, + "loss": 0.451, + "step": 6293 + }, + { + "epoch": 0.72, + "learning_rate": 3.855643839292193e-06, + "loss": 0.4693, + "step": 6294 + }, + { + "epoch": 0.72, + "learning_rate": 3.852723778608748e-06, + "loss": 0.4537, + "step": 6295 + }, + { + "epoch": 0.72, + "learning_rate": 3.849804560239394e-06, + "loss": 0.4532, + "step": 6296 + }, + { + "epoch": 0.72, + "learning_rate": 3.846886184584122e-06, + "loss": 0.4356, + "step": 6297 + }, + { + "epoch": 0.72, + "learning_rate": 3.8439686520428185e-06, + "loss": 0.4561, + "step": 6298 + }, + { + "epoch": 0.72, + "learning_rate": 3.841051963015254e-06, + "loss": 0.4448, + "step": 6299 + }, + { + "epoch": 0.72, + "learning_rate": 3.8381361179010755e-06, + "loss": 0.4528, + "step": 6300 + }, + { + "epoch": 0.72, + "learning_rate": 3.8352211170998165e-06, + "loss": 0.4597, + "step": 6301 + }, + { + "epoch": 0.72, + "learning_rate": 3.8323069610109046e-06, + "loss": 0.4527, + "step": 6302 + }, + { + "epoch": 0.72, + "learning_rate": 3.829393650033635e-06, + "loss": 0.4397, + "step": 6303 + }, + { + "epoch": 0.72, + "learning_rate": 3.8264811845672055e-06, + "loss": 0.4605, + "step": 6304 + }, + { + "epoch": 0.72, + "learning_rate": 3.823569565010682e-06, + "loss": 0.4474, + "step": 6305 + }, + { + "epoch": 0.72, + "learning_rate": 3.820658791763023e-06, + "loss": 0.4566, + "step": 6306 + }, + { + "epoch": 0.72, + "learning_rate": 3.817748865223075e-06, + "loss": 0.4645, + "step": 6307 + }, + { + "epoch": 0.72, + "learning_rate": 3.814839785789555e-06, + "loss": 0.4646, + "step": 6308 + }, + { + "epoch": 0.72, + "learning_rate": 3.811931553861078e-06, + "loss": 0.4435, + "step": 6309 + }, + { + "epoch": 0.72, + "learning_rate": 3.809024169836134e-06, + "loss": 0.4757, + "step": 6310 + }, + { + "epoch": 0.72, + "learning_rate": 
3.8061176341130955e-06, + "loss": 0.4353, + "step": 6311 + }, + { + "epoch": 0.72, + "learning_rate": 3.803211947090232e-06, + "loss": 0.4753, + "step": 6312 + }, + { + "epoch": 0.72, + "learning_rate": 3.8003071091656806e-06, + "loss": 0.4374, + "step": 6313 + }, + { + "epoch": 0.72, + "learning_rate": 3.7974031207374685e-06, + "loss": 0.4449, + "step": 6314 + }, + { + "epoch": 0.72, + "learning_rate": 3.7944999822035077e-06, + "loss": 0.4608, + "step": 6315 + }, + { + "epoch": 0.72, + "learning_rate": 3.791597693961596e-06, + "loss": 0.4467, + "step": 6316 + }, + { + "epoch": 0.72, + "learning_rate": 3.788696256409412e-06, + "loss": 0.4641, + "step": 6317 + }, + { + "epoch": 0.72, + "learning_rate": 3.7857956699445164e-06, + "loss": 0.467, + "step": 6318 + }, + { + "epoch": 0.72, + "learning_rate": 3.78289593496435e-06, + "loss": 0.448, + "step": 6319 + }, + { + "epoch": 0.72, + "learning_rate": 3.7799970518662477e-06, + "loss": 0.4555, + "step": 6320 + }, + { + "epoch": 0.72, + "learning_rate": 3.7770990210474147e-06, + "loss": 0.4534, + "step": 6321 + }, + { + "epoch": 0.72, + "learning_rate": 3.774201842904953e-06, + "loss": 0.4596, + "step": 6322 + }, + { + "epoch": 0.72, + "learning_rate": 3.771305517835837e-06, + "loss": 0.4588, + "step": 6323 + }, + { + "epoch": 0.72, + "learning_rate": 3.7684100462369244e-06, + "loss": 0.466, + "step": 6324 + }, + { + "epoch": 0.72, + "learning_rate": 3.765515428504963e-06, + "loss": 0.4445, + "step": 6325 + }, + { + "epoch": 0.72, + "learning_rate": 3.7626216650365833e-06, + "loss": 0.4649, + "step": 6326 + }, + { + "epoch": 0.72, + "learning_rate": 3.7597287562282892e-06, + "loss": 0.4526, + "step": 6327 + }, + { + "epoch": 0.72, + "learning_rate": 3.7568367024764794e-06, + "loss": 0.4537, + "step": 6328 + }, + { + "epoch": 0.72, + "learning_rate": 3.7539455041774255e-06, + "loss": 0.4432, + "step": 6329 + }, + { + "epoch": 0.72, + "learning_rate": 3.7510551617272907e-06, + "loss": 0.4636, + "step": 6330 + }, + { + 
"epoch": 0.72, + "learning_rate": 3.748165675522113e-06, + "loss": 0.4407, + "step": 6331 + }, + { + "epoch": 0.72, + "learning_rate": 3.7452770459578134e-06, + "loss": 0.4403, + "step": 6332 + }, + { + "epoch": 0.72, + "learning_rate": 3.742389273430208e-06, + "loss": 0.4471, + "step": 6333 + }, + { + "epoch": 0.72, + "learning_rate": 3.7395023583349755e-06, + "loss": 0.4537, + "step": 6334 + }, + { + "epoch": 0.72, + "learning_rate": 3.7366163010676937e-06, + "loss": 0.4709, + "step": 6335 + }, + { + "epoch": 0.72, + "learning_rate": 3.733731102023819e-06, + "loss": 0.4529, + "step": 6336 + }, + { + "epoch": 0.72, + "learning_rate": 3.730846761598682e-06, + "loss": 0.4427, + "step": 6337 + }, + { + "epoch": 0.72, + "learning_rate": 3.7279632801875076e-06, + "loss": 0.4683, + "step": 6338 + }, + { + "epoch": 0.72, + "learning_rate": 3.725080658185395e-06, + "loss": 0.4596, + "step": 6339 + }, + { + "epoch": 0.72, + "learning_rate": 3.7221988959873232e-06, + "loss": 0.4385, + "step": 6340 + }, + { + "epoch": 0.72, + "learning_rate": 3.7193179939881665e-06, + "loss": 0.4397, + "step": 6341 + }, + { + "epoch": 0.72, + "learning_rate": 3.716437952582663e-06, + "loss": 0.4779, + "step": 6342 + }, + { + "epoch": 0.72, + "learning_rate": 3.7135587721654533e-06, + "loss": 0.4569, + "step": 6343 + }, + { + "epoch": 0.73, + "learning_rate": 3.710680453131039e-06, + "loss": 0.4611, + "step": 6344 + }, + { + "epoch": 0.73, + "learning_rate": 3.7078029958738194e-06, + "loss": 0.4375, + "step": 6345 + }, + { + "epoch": 0.73, + "learning_rate": 3.7049264007880735e-06, + "loss": 0.4601, + "step": 6346 + }, + { + "epoch": 0.73, + "learning_rate": 3.7020506682679524e-06, + "loss": 0.4459, + "step": 6347 + }, + { + "epoch": 0.73, + "learning_rate": 3.699175798707502e-06, + "loss": 0.4453, + "step": 6348 + }, + { + "epoch": 0.73, + "learning_rate": 3.6963017925006407e-06, + "loss": 0.4557, + "step": 6349 + }, + { + "epoch": 0.73, + "learning_rate": 3.6934286500411675e-06, + "loss": 
0.4617, + "step": 6350 + }, + { + "epoch": 0.73, + "learning_rate": 3.690556371722774e-06, + "loss": 0.46, + "step": 6351 + }, + { + "epoch": 0.73, + "learning_rate": 3.6876849579390205e-06, + "loss": 0.4603, + "step": 6352 + }, + { + "epoch": 0.73, + "learning_rate": 3.6848144090833602e-06, + "loss": 0.4403, + "step": 6353 + }, + { + "epoch": 0.73, + "learning_rate": 3.681944725549117e-06, + "loss": 0.4655, + "step": 6354 + }, + { + "epoch": 0.73, + "learning_rate": 3.6790759077295046e-06, + "loss": 0.4291, + "step": 6355 + }, + { + "epoch": 0.73, + "learning_rate": 3.676207956017618e-06, + "loss": 0.4618, + "step": 6356 + }, + { + "epoch": 0.73, + "learning_rate": 3.6733408708064265e-06, + "loss": 0.4499, + "step": 6357 + }, + { + "epoch": 0.73, + "learning_rate": 3.6704746524887835e-06, + "loss": 0.5017, + "step": 6358 + }, + { + "epoch": 0.73, + "learning_rate": 3.667609301457431e-06, + "loss": 0.4465, + "step": 6359 + }, + { + "epoch": 0.73, + "learning_rate": 3.664744818104977e-06, + "loss": 0.4627, + "step": 6360 + }, + { + "epoch": 0.73, + "learning_rate": 3.6618812028239304e-06, + "loss": 0.4456, + "step": 6361 + }, + { + "epoch": 0.73, + "learning_rate": 3.65901845600666e-06, + "loss": 0.465, + "step": 6362 + }, + { + "epoch": 0.73, + "learning_rate": 3.656156578045431e-06, + "loss": 0.4447, + "step": 6363 + }, + { + "epoch": 0.73, + "learning_rate": 3.653295569332389e-06, + "loss": 0.4692, + "step": 6364 + }, + { + "epoch": 0.73, + "learning_rate": 3.650435430259548e-06, + "loss": 0.4425, + "step": 6365 + }, + { + "epoch": 0.73, + "learning_rate": 3.6475761612188177e-06, + "loss": 0.4569, + "step": 6366 + }, + { + "epoch": 0.73, + "learning_rate": 3.644717762601978e-06, + "loss": 0.4629, + "step": 6367 + }, + { + "epoch": 0.73, + "learning_rate": 3.6418602348006903e-06, + "loss": 0.4604, + "step": 6368 + }, + { + "epoch": 0.73, + "learning_rate": 3.639003578206508e-06, + "loss": 0.4635, + "step": 6369 + }, + { + "epoch": 0.73, + "learning_rate": 
3.6361477932108513e-06, + "loss": 0.4692, + "step": 6370 + }, + { + "epoch": 0.73, + "learning_rate": 3.633292880205024e-06, + "loss": 0.4452, + "step": 6371 + }, + { + "epoch": 0.73, + "learning_rate": 3.630438839580217e-06, + "loss": 0.4524, + "step": 6372 + }, + { + "epoch": 0.73, + "learning_rate": 3.627585671727496e-06, + "loss": 0.4447, + "step": 6373 + }, + { + "epoch": 0.73, + "learning_rate": 3.6247333770378133e-06, + "loss": 0.4762, + "step": 6374 + }, + { + "epoch": 0.73, + "learning_rate": 3.6218819559019934e-06, + "loss": 0.4542, + "step": 6375 + }, + { + "epoch": 0.73, + "learning_rate": 3.6190314087107415e-06, + "loss": 0.4638, + "step": 6376 + }, + { + "epoch": 0.73, + "learning_rate": 3.6161817358546513e-06, + "loss": 0.4357, + "step": 6377 + }, + { + "epoch": 0.73, + "learning_rate": 3.6133329377241866e-06, + "loss": 0.4745, + "step": 6378 + }, + { + "epoch": 0.73, + "learning_rate": 3.6104850147097035e-06, + "loss": 0.4524, + "step": 6379 + }, + { + "epoch": 0.73, + "learning_rate": 3.6076379672014263e-06, + "loss": 0.4682, + "step": 6380 + }, + { + "epoch": 0.73, + "learning_rate": 3.6047917955894606e-06, + "loss": 0.4454, + "step": 6381 + }, + { + "epoch": 0.73, + "learning_rate": 3.6019465002638e-06, + "loss": 0.4475, + "step": 6382 + }, + { + "epoch": 0.73, + "learning_rate": 3.5991020816143164e-06, + "loss": 0.452, + "step": 6383 + }, + { + "epoch": 0.73, + "learning_rate": 3.5962585400307504e-06, + "loss": 0.4758, + "step": 6384 + }, + { + "epoch": 0.73, + "learning_rate": 3.5934158759027405e-06, + "loss": 0.4591, + "step": 6385 + }, + { + "epoch": 0.73, + "learning_rate": 3.590574089619786e-06, + "loss": 0.4632, + "step": 6386 + }, + { + "epoch": 0.73, + "learning_rate": 3.587733181571282e-06, + "loss": 0.4452, + "step": 6387 + }, + { + "epoch": 0.73, + "learning_rate": 3.5848931521464947e-06, + "loss": 0.4504, + "step": 6388 + }, + { + "epoch": 0.73, + "learning_rate": 3.5820540017345663e-06, + "loss": 0.4659, + "step": 6389 + }, + { + 
"epoch": 0.73, + "learning_rate": 3.5792157307245313e-06, + "loss": 0.4513, + "step": 6390 + }, + { + "epoch": 0.73, + "learning_rate": 3.5763783395052887e-06, + "loss": 0.4511, + "step": 6391 + }, + { + "epoch": 0.73, + "learning_rate": 3.5735418284656287e-06, + "loss": 0.4506, + "step": 6392 + }, + { + "epoch": 0.73, + "learning_rate": 3.5707061979942205e-06, + "loss": 0.4609, + "step": 6393 + }, + { + "epoch": 0.73, + "learning_rate": 3.5678714484796006e-06, + "loss": 0.4443, + "step": 6394 + }, + { + "epoch": 0.73, + "learning_rate": 3.565037580310201e-06, + "loss": 0.4451, + "step": 6395 + }, + { + "epoch": 0.73, + "learning_rate": 3.56220459387432e-06, + "loss": 0.4531, + "step": 6396 + }, + { + "epoch": 0.73, + "learning_rate": 3.559372489560139e-06, + "loss": 0.4487, + "step": 6397 + }, + { + "epoch": 0.73, + "learning_rate": 3.5565412677557233e-06, + "loss": 0.4554, + "step": 6398 + }, + { + "epoch": 0.73, + "learning_rate": 3.553710928849009e-06, + "loss": 0.4365, + "step": 6399 + }, + { + "epoch": 0.73, + "learning_rate": 3.550881473227822e-06, + "loss": 0.4724, + "step": 6400 + }, + { + "epoch": 0.73, + "learning_rate": 3.548052901279854e-06, + "loss": 0.4517, + "step": 6401 + }, + { + "epoch": 0.73, + "learning_rate": 3.5452252133926856e-06, + "loss": 0.4618, + "step": 6402 + }, + { + "epoch": 0.73, + "learning_rate": 3.5423984099537754e-06, + "loss": 0.4458, + "step": 6403 + }, + { + "epoch": 0.73, + "learning_rate": 3.5395724913504546e-06, + "loss": 0.4444, + "step": 6404 + }, + { + "epoch": 0.73, + "learning_rate": 3.536747457969942e-06, + "loss": 0.4485, + "step": 6405 + }, + { + "epoch": 0.73, + "learning_rate": 3.5339233101993287e-06, + "loss": 0.4616, + "step": 6406 + }, + { + "epoch": 0.73, + "learning_rate": 3.5311000484255796e-06, + "loss": 0.4301, + "step": 6407 + }, + { + "epoch": 0.73, + "learning_rate": 3.5282776730355537e-06, + "loss": 0.4606, + "step": 6408 + }, + { + "epoch": 0.73, + "learning_rate": 3.5254561844159718e-06, + "loss": 
0.4518, + "step": 6409 + }, + { + "epoch": 0.73, + "learning_rate": 3.5226355829534475e-06, + "loss": 0.4358, + "step": 6410 + }, + { + "epoch": 0.73, + "learning_rate": 3.51981586903446e-06, + "loss": 0.4838, + "step": 6411 + }, + { + "epoch": 0.73, + "learning_rate": 3.516997043045376e-06, + "loss": 0.4628, + "step": 6412 + }, + { + "epoch": 0.73, + "learning_rate": 3.5141791053724405e-06, + "loss": 0.4449, + "step": 6413 + }, + { + "epoch": 0.73, + "learning_rate": 3.5113620564017727e-06, + "loss": 0.4448, + "step": 6414 + }, + { + "epoch": 0.73, + "learning_rate": 3.5085458965193654e-06, + "loss": 0.4661, + "step": 6415 + }, + { + "epoch": 0.73, + "learning_rate": 3.5057306261111024e-06, + "loss": 0.4502, + "step": 6416 + }, + { + "epoch": 0.73, + "learning_rate": 3.502916245562733e-06, + "loss": 0.4547, + "step": 6417 + }, + { + "epoch": 0.73, + "learning_rate": 3.5001027552598952e-06, + "loss": 0.4459, + "step": 6418 + }, + { + "epoch": 0.73, + "learning_rate": 3.4972901555880957e-06, + "loss": 0.4551, + "step": 6419 + }, + { + "epoch": 0.73, + "learning_rate": 3.4944784469327253e-06, + "loss": 0.4592, + "step": 6420 + }, + { + "epoch": 0.73, + "learning_rate": 3.491667629679054e-06, + "loss": 0.4369, + "step": 6421 + }, + { + "epoch": 0.73, + "learning_rate": 3.488857704212224e-06, + "loss": 0.4437, + "step": 6422 + }, + { + "epoch": 0.73, + "learning_rate": 3.4860486709172536e-06, + "loss": 0.4401, + "step": 6423 + }, + { + "epoch": 0.73, + "learning_rate": 3.4832405301790507e-06, + "loss": 0.4533, + "step": 6424 + }, + { + "epoch": 0.73, + "learning_rate": 3.4804332823823862e-06, + "loss": 0.4487, + "step": 6425 + }, + { + "epoch": 0.73, + "learning_rate": 3.477626927911921e-06, + "loss": 0.4677, + "step": 6426 + }, + { + "epoch": 0.73, + "learning_rate": 3.4748214671521875e-06, + "loss": 0.4409, + "step": 6427 + }, + { + "epoch": 0.73, + "learning_rate": 3.4720169004875914e-06, + "loss": 0.4481, + "step": 6428 + }, + { + "epoch": 0.73, + "learning_rate": 
3.469213228302425e-06, + "loss": 0.4797, + "step": 6429 + }, + { + "epoch": 0.73, + "learning_rate": 3.466410450980854e-06, + "loss": 0.4376, + "step": 6430 + }, + { + "epoch": 0.73, + "learning_rate": 3.4636085689069244e-06, + "loss": 0.4519, + "step": 6431 + }, + { + "epoch": 0.74, + "learning_rate": 3.4608075824645524e-06, + "loss": 0.4359, + "step": 6432 + }, + { + "epoch": 0.74, + "learning_rate": 3.4580074920375352e-06, + "loss": 0.4585, + "step": 6433 + }, + { + "epoch": 0.74, + "learning_rate": 3.4552082980095514e-06, + "loss": 0.4479, + "step": 6434 + }, + { + "epoch": 0.74, + "learning_rate": 3.4524100007641517e-06, + "loss": 0.4467, + "step": 6435 + }, + { + "epoch": 0.74, + "learning_rate": 3.4496126006847596e-06, + "loss": 0.4351, + "step": 6436 + }, + { + "epoch": 0.74, + "learning_rate": 3.446816098154692e-06, + "loss": 0.4652, + "step": 6437 + }, + { + "epoch": 0.74, + "learning_rate": 3.4440204935571208e-06, + "loss": 0.443, + "step": 6438 + }, + { + "epoch": 0.74, + "learning_rate": 3.441225787275113e-06, + "loss": 0.4518, + "step": 6439 + }, + { + "epoch": 0.74, + "learning_rate": 3.4384319796916075e-06, + "loss": 0.4392, + "step": 6440 + }, + { + "epoch": 0.74, + "learning_rate": 3.435639071189413e-06, + "loss": 0.4458, + "step": 6441 + }, + { + "epoch": 0.74, + "learning_rate": 3.4328470621512257e-06, + "loss": 0.462, + "step": 6442 + }, + { + "epoch": 0.74, + "learning_rate": 3.430055952959607e-06, + "loss": 0.4441, + "step": 6443 + }, + { + "epoch": 0.74, + "learning_rate": 3.427265743997007e-06, + "loss": 0.4649, + "step": 6444 + }, + { + "epoch": 0.74, + "learning_rate": 3.4244764356457438e-06, + "loss": 0.4703, + "step": 6445 + }, + { + "epoch": 0.74, + "learning_rate": 3.4216880282880128e-06, + "loss": 0.4514, + "step": 6446 + }, + { + "epoch": 0.74, + "learning_rate": 3.4189005223058937e-06, + "loss": 0.4628, + "step": 6447 + }, + { + "epoch": 0.74, + "learning_rate": 3.416113918081331e-06, + "loss": 0.4485, + "step": 6448 + }, + { + 
"epoch": 0.74, + "learning_rate": 3.4133282159961535e-06, + "loss": 0.4489, + "step": 6449 + }, + { + "epoch": 0.74, + "learning_rate": 3.4105434164320695e-06, + "loss": 0.4353, + "step": 6450 + }, + { + "epoch": 0.74, + "learning_rate": 3.4077595197706505e-06, + "loss": 0.4739, + "step": 6451 + }, + { + "epoch": 0.74, + "learning_rate": 3.404976526393361e-06, + "loss": 0.4615, + "step": 6452 + }, + { + "epoch": 0.74, + "learning_rate": 3.4021944366815286e-06, + "loss": 0.4412, + "step": 6453 + }, + { + "epoch": 0.74, + "learning_rate": 3.399413251016359e-06, + "loss": 0.4565, + "step": 6454 + }, + { + "epoch": 0.74, + "learning_rate": 3.3966329697789424e-06, + "loss": 0.4467, + "step": 6455 + }, + { + "epoch": 0.74, + "learning_rate": 3.393853593350235e-06, + "loss": 0.4398, + "step": 6456 + }, + { + "epoch": 0.74, + "learning_rate": 3.391075122111077e-06, + "loss": 0.4441, + "step": 6457 + }, + { + "epoch": 0.74, + "learning_rate": 3.3882975564421773e-06, + "loss": 0.4504, + "step": 6458 + }, + { + "epoch": 0.74, + "learning_rate": 3.3855208967241247e-06, + "loss": 0.46, + "step": 6459 + }, + { + "epoch": 0.74, + "learning_rate": 3.3827451433373904e-06, + "loss": 0.4562, + "step": 6460 + }, + { + "epoch": 0.74, + "learning_rate": 3.379970296662305e-06, + "loss": 0.468, + "step": 6461 + }, + { + "epoch": 0.74, + "learning_rate": 3.3771963570790924e-06, + "loss": 0.4444, + "step": 6462 + }, + { + "epoch": 0.74, + "learning_rate": 3.3744233249678403e-06, + "loss": 0.4594, + "step": 6463 + }, + { + "epoch": 0.74, + "learning_rate": 3.3716512007085133e-06, + "loss": 0.4506, + "step": 6464 + }, + { + "epoch": 0.74, + "learning_rate": 3.368879984680962e-06, + "loss": 0.4545, + "step": 6465 + }, + { + "epoch": 0.74, + "learning_rate": 3.366109677264895e-06, + "loss": 0.4571, + "step": 6466 + }, + { + "epoch": 0.74, + "learning_rate": 3.363340278839916e-06, + "loss": 0.4337, + "step": 6467 + }, + { + "epoch": 0.74, + "learning_rate": 3.3605717897854872e-06, + "loss": 
0.4496, + "step": 6468 + }, + { + "epoch": 0.74, + "learning_rate": 3.357804210480955e-06, + "loss": 0.4677, + "step": 6469 + }, + { + "epoch": 0.74, + "learning_rate": 3.355037541305545e-06, + "loss": 0.441, + "step": 6470 + }, + { + "epoch": 0.74, + "learning_rate": 3.3522717826383477e-06, + "loss": 0.4474, + "step": 6471 + }, + { + "epoch": 0.74, + "learning_rate": 3.349506934858331e-06, + "loss": 0.4609, + "step": 6472 + }, + { + "epoch": 0.74, + "learning_rate": 3.3467429983443477e-06, + "loss": 0.4565, + "step": 6473 + }, + { + "epoch": 0.74, + "learning_rate": 3.3439799734751132e-06, + "loss": 0.4461, + "step": 6474 + }, + { + "epoch": 0.74, + "learning_rate": 3.3412178606292276e-06, + "loss": 0.465, + "step": 6475 + }, + { + "epoch": 0.74, + "learning_rate": 3.3384566601851574e-06, + "loss": 0.4509, + "step": 6476 + }, + { + "epoch": 0.74, + "learning_rate": 3.3356963725212523e-06, + "loss": 0.4677, + "step": 6477 + }, + { + "epoch": 0.74, + "learning_rate": 3.3329369980157345e-06, + "loss": 0.4562, + "step": 6478 + }, + { + "epoch": 0.74, + "learning_rate": 3.330178537046699e-06, + "loss": 0.4441, + "step": 6479 + }, + { + "epoch": 0.74, + "learning_rate": 3.327420989992112e-06, + "loss": 0.4541, + "step": 6480 + }, + { + "epoch": 0.74, + "learning_rate": 3.3246643572298253e-06, + "loss": 0.466, + "step": 6481 + }, + { + "epoch": 0.74, + "learning_rate": 3.321908639137553e-06, + "loss": 0.4409, + "step": 6482 + }, + { + "epoch": 0.74, + "learning_rate": 3.3191538360928977e-06, + "loss": 0.447, + "step": 6483 + }, + { + "epoch": 0.74, + "learning_rate": 3.3163999484733232e-06, + "loss": 0.4702, + "step": 6484 + }, + { + "epoch": 0.74, + "learning_rate": 3.313646976656172e-06, + "loss": 0.4823, + "step": 6485 + }, + { + "epoch": 0.74, + "learning_rate": 3.3108949210186657e-06, + "loss": 0.4589, + "step": 6486 + }, + { + "epoch": 0.74, + "learning_rate": 3.308143781937898e-06, + "loss": 0.46, + "step": 6487 + }, + { + "epoch": 0.74, + "learning_rate": 
3.305393559790838e-06, + "loss": 0.4426, + "step": 6488 + }, + { + "epoch": 0.74, + "learning_rate": 3.302644254954326e-06, + "loss": 0.4549, + "step": 6489 + }, + { + "epoch": 0.74, + "learning_rate": 3.299895867805074e-06, + "loss": 0.4496, + "step": 6490 + }, + { + "epoch": 0.74, + "learning_rate": 3.2971483987196783e-06, + "loss": 0.4512, + "step": 6491 + }, + { + "epoch": 0.74, + "learning_rate": 3.294401848074602e-06, + "loss": 0.4596, + "step": 6492 + }, + { + "epoch": 0.74, + "learning_rate": 3.2916562162461784e-06, + "loss": 0.4452, + "step": 6493 + }, + { + "epoch": 0.74, + "learning_rate": 3.288911503610629e-06, + "loss": 0.4427, + "step": 6494 + }, + { + "epoch": 0.74, + "learning_rate": 3.2861677105440335e-06, + "loss": 0.4639, + "step": 6495 + }, + { + "epoch": 0.74, + "learning_rate": 3.2834248374223556e-06, + "loss": 0.4535, + "step": 6496 + }, + { + "epoch": 0.74, + "learning_rate": 3.2806828846214324e-06, + "loss": 0.4548, + "step": 6497 + }, + { + "epoch": 0.74, + "learning_rate": 3.277941852516968e-06, + "loss": 0.4639, + "step": 6498 + }, + { + "epoch": 0.74, + "learning_rate": 3.2752017414845514e-06, + "loss": 0.4343, + "step": 6499 + }, + { + "epoch": 0.74, + "learning_rate": 3.2724625518996322e-06, + "loss": 0.449, + "step": 6500 + }, + { + "epoch": 0.74, + "learning_rate": 3.2697242841375452e-06, + "loss": 0.4611, + "step": 6501 + }, + { + "epoch": 0.74, + "learning_rate": 3.2669869385734943e-06, + "loss": 0.4584, + "step": 6502 + }, + { + "epoch": 0.74, + "learning_rate": 3.264250515582551e-06, + "loss": 0.4496, + "step": 6503 + }, + { + "epoch": 0.74, + "learning_rate": 3.2615150155396747e-06, + "loss": 0.4493, + "step": 6504 + }, + { + "epoch": 0.74, + "learning_rate": 3.258780438819681e-06, + "loss": 0.4551, + "step": 6505 + }, + { + "epoch": 0.74, + "learning_rate": 3.2560467857972744e-06, + "loss": 0.4449, + "step": 6506 + }, + { + "epoch": 0.74, + "learning_rate": 3.2533140568470266e-06, + "loss": 0.4602, + "step": 6507 + }, + { + 
"epoch": 0.74, + "learning_rate": 3.2505822523433785e-06, + "loss": 0.4392, + "step": 6508 + }, + { + "epoch": 0.74, + "learning_rate": 3.247851372660653e-06, + "loss": 0.4686, + "step": 6509 + }, + { + "epoch": 0.74, + "learning_rate": 3.2451214181730396e-06, + "loss": 0.4433, + "step": 6510 + }, + { + "epoch": 0.74, + "learning_rate": 3.2423923892545994e-06, + "loss": 0.461, + "step": 6511 + }, + { + "epoch": 0.74, + "learning_rate": 3.239664286279276e-06, + "loss": 0.4404, + "step": 6512 + }, + { + "epoch": 0.74, + "learning_rate": 3.2369371096208744e-06, + "loss": 0.4545, + "step": 6513 + }, + { + "epoch": 0.74, + "learning_rate": 3.2342108596530865e-06, + "loss": 0.44, + "step": 6514 + }, + { + "epoch": 0.74, + "learning_rate": 3.23148553674946e-06, + "loss": 0.4454, + "step": 6515 + }, + { + "epoch": 0.74, + "learning_rate": 3.2287611412834306e-06, + "loss": 0.4478, + "step": 6516 + }, + { + "epoch": 0.74, + "learning_rate": 3.226037673628305e-06, + "loss": 0.4523, + "step": 6517 + }, + { + "epoch": 0.74, + "learning_rate": 3.223315134157253e-06, + "loss": 0.4633, + "step": 6518 + }, + { + "epoch": 0.75, + "learning_rate": 3.220593523243324e-06, + "loss": 0.4656, + "step": 6519 + }, + { + "epoch": 0.75, + "learning_rate": 3.2178728412594417e-06, + "loss": 0.4449, + "step": 6520 + }, + { + "epoch": 0.75, + "learning_rate": 3.2151530885783967e-06, + "loss": 0.4725, + "step": 6521 + }, + { + "epoch": 0.75, + "learning_rate": 3.212434265572861e-06, + "loss": 0.4602, + "step": 6522 + }, + { + "epoch": 0.75, + "learning_rate": 3.209716372615369e-06, + "loss": 0.4637, + "step": 6523 + }, + { + "epoch": 0.75, + "learning_rate": 3.2069994100783376e-06, + "loss": 0.4622, + "step": 6524 + }, + { + "epoch": 0.75, + "learning_rate": 3.2042833783340453e-06, + "loss": 0.4492, + "step": 6525 + }, + { + "epoch": 0.75, + "learning_rate": 3.201568277754652e-06, + "loss": 0.4493, + "step": 6526 + }, + { + "epoch": 0.75, + "learning_rate": 3.1988541087121916e-06, + "loss": 
0.4665, + "step": 6527 + }, + { + "epoch": 0.75, + "learning_rate": 3.1961408715785615e-06, + "loss": 0.435, + "step": 6528 + }, + { + "epoch": 0.75, + "learning_rate": 3.193428566725534e-06, + "loss": 0.4442, + "step": 6529 + }, + { + "epoch": 0.75, + "learning_rate": 3.1907171945247595e-06, + "loss": 0.4554, + "step": 6530 + }, + { + "epoch": 0.75, + "learning_rate": 3.1880067553477513e-06, + "loss": 0.4353, + "step": 6531 + }, + { + "epoch": 0.75, + "learning_rate": 3.1852972495659064e-06, + "loss": 0.463, + "step": 6532 + }, + { + "epoch": 0.75, + "learning_rate": 3.182588677550482e-06, + "loss": 0.4648, + "step": 6533 + }, + { + "epoch": 0.75, + "learning_rate": 3.179881039672619e-06, + "loss": 0.4438, + "step": 6534 + }, + { + "epoch": 0.75, + "learning_rate": 3.1771743363033156e-06, + "loss": 0.4627, + "step": 6535 + }, + { + "epoch": 0.75, + "learning_rate": 3.174468567813461e-06, + "loss": 0.4732, + "step": 6536 + }, + { + "epoch": 0.75, + "learning_rate": 3.171763734573796e-06, + "loss": 0.434, + "step": 6537 + }, + { + "epoch": 0.75, + "learning_rate": 3.169059836954952e-06, + "loss": 0.4621, + "step": 6538 + }, + { + "epoch": 0.75, + "learning_rate": 3.1663568753274153e-06, + "loss": 0.4477, + "step": 6539 + }, + { + "epoch": 0.75, + "learning_rate": 3.1636548500615583e-06, + "loss": 0.4271, + "step": 6540 + }, + { + "epoch": 0.75, + "learning_rate": 3.1609537615276174e-06, + "loss": 0.4784, + "step": 6541 + }, + { + "epoch": 0.75, + "learning_rate": 3.1582536100956973e-06, + "loss": 0.4413, + "step": 6542 + }, + { + "epoch": 0.75, + "learning_rate": 3.1555543961357824e-06, + "loss": 0.4723, + "step": 6543 + }, + { + "epoch": 0.75, + "learning_rate": 3.1528561200177254e-06, + "loss": 0.4543, + "step": 6544 + }, + { + "epoch": 0.75, + "learning_rate": 3.1501587821112532e-06, + "loss": 0.4522, + "step": 6545 + }, + { + "epoch": 0.75, + "learning_rate": 3.14746238278596e-06, + "loss": 0.4649, + "step": 6546 + }, + { + "epoch": 0.75, + "learning_rate": 
3.1447669224113074e-06, + "loss": 0.4431, + "step": 6547 + }, + { + "epoch": 0.75, + "learning_rate": 3.1420724013566408e-06, + "loss": 0.4451, + "step": 6548 + }, + { + "epoch": 0.75, + "learning_rate": 3.1393788199911657e-06, + "loss": 0.4428, + "step": 6549 + }, + { + "epoch": 0.75, + "learning_rate": 3.136686178683961e-06, + "loss": 0.468, + "step": 6550 + }, + { + "epoch": 0.75, + "learning_rate": 3.1339944778039844e-06, + "loss": 0.4625, + "step": 6551 + }, + { + "epoch": 0.75, + "learning_rate": 3.131303717720053e-06, + "loss": 0.4479, + "step": 6552 + }, + { + "epoch": 0.75, + "learning_rate": 3.128613898800864e-06, + "loss": 0.4522, + "step": 6553 + }, + { + "epoch": 0.75, + "learning_rate": 3.125925021414985e-06, + "loss": 0.4741, + "step": 6554 + }, + { + "epoch": 0.75, + "learning_rate": 3.123237085930847e-06, + "loss": 0.4563, + "step": 6555 + }, + { + "epoch": 0.75, + "learning_rate": 3.1205500927167644e-06, + "loss": 0.4503, + "step": 6556 + }, + { + "epoch": 0.75, + "learning_rate": 3.1178640421409057e-06, + "loss": 0.4515, + "step": 6557 + }, + { + "epoch": 0.75, + "learning_rate": 3.11517893457133e-06, + "loss": 0.4671, + "step": 6558 + }, + { + "epoch": 0.75, + "learning_rate": 3.112494770375951e-06, + "loss": 0.4659, + "step": 6559 + }, + { + "epoch": 0.75, + "learning_rate": 3.1098115499225567e-06, + "loss": 0.4397, + "step": 6560 + }, + { + "epoch": 0.75, + "learning_rate": 3.107129273578815e-06, + "loss": 0.4652, + "step": 6561 + }, + { + "epoch": 0.75, + "learning_rate": 3.104447941712251e-06, + "loss": 0.4627, + "step": 6562 + }, + { + "epoch": 0.75, + "learning_rate": 3.1017675546902704e-06, + "loss": 0.5041, + "step": 6563 + }, + { + "epoch": 0.75, + "learning_rate": 3.0990881128801487e-06, + "loss": 0.4534, + "step": 6564 + }, + { + "epoch": 0.75, + "learning_rate": 3.096409616649023e-06, + "loss": 0.4516, + "step": 6565 + }, + { + "epoch": 0.75, + "learning_rate": 3.0937320663639148e-06, + "loss": 0.4326, + "step": 6566 + }, + { + 
"epoch": 0.75, + "learning_rate": 3.091055462391703e-06, + "loss": 0.4685, + "step": 6567 + }, + { + "epoch": 0.75, + "learning_rate": 3.088379805099141e-06, + "loss": 0.4364, + "step": 6568 + }, + { + "epoch": 0.75, + "learning_rate": 3.0857050948528576e-06, + "loss": 0.4616, + "step": 6569 + }, + { + "epoch": 0.75, + "learning_rate": 3.083031332019344e-06, + "loss": 0.4496, + "step": 6570 + }, + { + "epoch": 0.75, + "learning_rate": 3.0803585169649696e-06, + "loss": 0.4441, + "step": 6571 + }, + { + "epoch": 0.75, + "learning_rate": 3.0776866500559654e-06, + "loss": 0.4711, + "step": 6572 + }, + { + "epoch": 0.75, + "learning_rate": 3.0750157316584375e-06, + "loss": 0.4634, + "step": 6573 + }, + { + "epoch": 0.75, + "learning_rate": 3.0723457621383666e-06, + "loss": 0.4329, + "step": 6574 + }, + { + "epoch": 0.75, + "learning_rate": 3.0696767418615945e-06, + "loss": 0.4494, + "step": 6575 + }, + { + "epoch": 0.75, + "learning_rate": 3.067008671193833e-06, + "loss": 0.4572, + "step": 6576 + }, + { + "epoch": 0.75, + "learning_rate": 3.0643415505006733e-06, + "loss": 0.4618, + "step": 6577 + }, + { + "epoch": 0.75, + "learning_rate": 3.0616753801475653e-06, + "loss": 0.4343, + "step": 6578 + }, + { + "epoch": 0.75, + "learning_rate": 3.059010160499839e-06, + "loss": 0.4473, + "step": 6579 + }, + { + "epoch": 0.75, + "learning_rate": 3.056345891922684e-06, + "loss": 0.4555, + "step": 6580 + }, + { + "epoch": 0.75, + "learning_rate": 3.0536825747811695e-06, + "loss": 0.4598, + "step": 6581 + }, + { + "epoch": 0.75, + "learning_rate": 3.0510202094402242e-06, + "loss": 0.4601, + "step": 6582 + }, + { + "epoch": 0.75, + "learning_rate": 3.0483587962646545e-06, + "loss": 0.4466, + "step": 6583 + }, + { + "epoch": 0.75, + "learning_rate": 3.045698335619135e-06, + "loss": 0.4614, + "step": 6584 + }, + { + "epoch": 0.75, + "learning_rate": 3.0430388278682078e-06, + "loss": 0.4739, + "step": 6585 + }, + { + "epoch": 0.75, + "learning_rate": 3.04038027337628e-06, + "loss": 
0.4521, + "step": 6586 + }, + { + "epoch": 0.75, + "learning_rate": 3.0377226725076394e-06, + "loss": 0.4688, + "step": 6587 + }, + { + "epoch": 0.75, + "learning_rate": 3.035066025626434e-06, + "loss": 0.4473, + "step": 6588 + }, + { + "epoch": 0.75, + "learning_rate": 3.0324103330966804e-06, + "loss": 0.4427, + "step": 6589 + }, + { + "epoch": 0.75, + "learning_rate": 3.02975559528227e-06, + "loss": 0.4678, + "step": 6590 + }, + { + "epoch": 0.75, + "learning_rate": 3.027101812546965e-06, + "loss": 0.4627, + "step": 6591 + }, + { + "epoch": 0.75, + "learning_rate": 3.024448985254387e-06, + "loss": 0.4567, + "step": 6592 + }, + { + "epoch": 0.75, + "learning_rate": 3.021797113768039e-06, + "loss": 0.4516, + "step": 6593 + }, + { + "epoch": 0.75, + "learning_rate": 3.0191461984512794e-06, + "loss": 0.4481, + "step": 6594 + }, + { + "epoch": 0.75, + "learning_rate": 3.016496239667349e-06, + "loss": 0.4717, + "step": 6595 + }, + { + "epoch": 0.75, + "learning_rate": 3.013847237779346e-06, + "loss": 0.4614, + "step": 6596 + }, + { + "epoch": 0.75, + "learning_rate": 3.0111991931502484e-06, + "loss": 0.4335, + "step": 6597 + }, + { + "epoch": 0.75, + "learning_rate": 3.0085521061428945e-06, + "loss": 0.4489, + "step": 6598 + }, + { + "epoch": 0.75, + "learning_rate": 3.005905977119992e-06, + "loss": 0.4609, + "step": 6599 + }, + { + "epoch": 0.75, + "learning_rate": 3.0032608064441206e-06, + "loss": 0.4218, + "step": 6600 + }, + { + "epoch": 0.75, + "learning_rate": 3.0006165944777333e-06, + "loss": 0.4688, + "step": 6601 + }, + { + "epoch": 0.75, + "learning_rate": 2.997973341583138e-06, + "loss": 0.4433, + "step": 6602 + }, + { + "epoch": 0.75, + "learning_rate": 2.9953310481225275e-06, + "loss": 0.4752, + "step": 6603 + }, + { + "epoch": 0.75, + "learning_rate": 2.992689714457947e-06, + "loss": 0.4381, + "step": 6604 + }, + { + "epoch": 0.75, + "learning_rate": 2.9900493409513256e-06, + "loss": 0.459, + "step": 6605 + }, + { + "epoch": 0.75, + "learning_rate": 
2.9874099279644487e-06, + "loss": 0.4376, + "step": 6606 + }, + { + "epoch": 0.76, + "learning_rate": 2.984771475858973e-06, + "loss": 0.4624, + "step": 6607 + }, + { + "epoch": 0.76, + "learning_rate": 2.9821339849964324e-06, + "loss": 0.4495, + "step": 6608 + }, + { + "epoch": 0.76, + "learning_rate": 2.979497455738214e-06, + "loss": 0.4727, + "step": 6609 + }, + { + "epoch": 0.76, + "learning_rate": 2.976861888445586e-06, + "loss": 0.4405, + "step": 6610 + }, + { + "epoch": 0.76, + "learning_rate": 2.9742272834796813e-06, + "loss": 0.4884, + "step": 6611 + }, + { + "epoch": 0.76, + "learning_rate": 2.9715936412014945e-06, + "loss": 0.4645, + "step": 6612 + }, + { + "epoch": 0.76, + "learning_rate": 2.9689609619718996e-06, + "loss": 0.4535, + "step": 6613 + }, + { + "epoch": 0.76, + "learning_rate": 2.966329246151626e-06, + "loss": 0.4459, + "step": 6614 + }, + { + "epoch": 0.76, + "learning_rate": 2.9636984941012835e-06, + "loss": 0.4543, + "step": 6615 + }, + { + "epoch": 0.76, + "learning_rate": 2.9610687061813405e-06, + "loss": 0.4395, + "step": 6616 + }, + { + "epoch": 0.76, + "learning_rate": 2.9584398827521343e-06, + "loss": 0.4394, + "step": 6617 + }, + { + "epoch": 0.76, + "learning_rate": 2.9558120241738786e-06, + "loss": 0.451, + "step": 6618 + }, + { + "epoch": 0.76, + "learning_rate": 2.9531851308066426e-06, + "loss": 0.468, + "step": 6619 + }, + { + "epoch": 0.76, + "learning_rate": 2.950559203010371e-06, + "loss": 0.4386, + "step": 6620 + }, + { + "epoch": 0.76, + "learning_rate": 2.9479342411448797e-06, + "loss": 0.4552, + "step": 6621 + }, + { + "epoch": 0.76, + "learning_rate": 2.945310245569839e-06, + "loss": 0.4542, + "step": 6622 + }, + { + "epoch": 0.76, + "learning_rate": 2.942687216644803e-06, + "loss": 0.4395, + "step": 6623 + }, + { + "epoch": 0.76, + "learning_rate": 2.94006515472918e-06, + "loss": 0.4551, + "step": 6624 + }, + { + "epoch": 0.76, + "learning_rate": 2.9374440601822495e-06, + "loss": 0.4472, + "step": 6625 + }, + { + 
"epoch": 0.76, + "learning_rate": 2.9348239333631655e-06, + "loss": 0.4362, + "step": 6626 + }, + { + "epoch": 0.76, + "learning_rate": 2.9322047746309377e-06, + "loss": 0.4542, + "step": 6627 + }, + { + "epoch": 0.76, + "learning_rate": 2.929586584344456e-06, + "loss": 0.4466, + "step": 6628 + }, + { + "epoch": 0.76, + "learning_rate": 2.926969362862465e-06, + "loss": 0.4727, + "step": 6629 + }, + { + "epoch": 0.76, + "learning_rate": 2.924353110543584e-06, + "loss": 0.4393, + "step": 6630 + }, + { + "epoch": 0.76, + "learning_rate": 2.9217378277463025e-06, + "loss": 0.4731, + "step": 6631 + }, + { + "epoch": 0.76, + "learning_rate": 2.919123514828969e-06, + "loss": 0.4454, + "step": 6632 + }, + { + "epoch": 0.76, + "learning_rate": 2.916510172149799e-06, + "loss": 0.4309, + "step": 6633 + }, + { + "epoch": 0.76, + "learning_rate": 2.913897800066887e-06, + "loss": 0.4529, + "step": 6634 + }, + { + "epoch": 0.76, + "learning_rate": 2.911286398938178e-06, + "loss": 0.4504, + "step": 6635 + }, + { + "epoch": 0.76, + "learning_rate": 2.9086759691214994e-06, + "loss": 0.4542, + "step": 6636 + }, + { + "epoch": 0.76, + "learning_rate": 2.9060665109745324e-06, + "loss": 0.4558, + "step": 6637 + }, + { + "epoch": 0.76, + "learning_rate": 2.9034580248548363e-06, + "loss": 0.458, + "step": 6638 + }, + { + "epoch": 0.76, + "learning_rate": 2.900850511119826e-06, + "loss": 0.4515, + "step": 6639 + }, + { + "epoch": 0.76, + "learning_rate": 2.898243970126793e-06, + "loss": 0.4648, + "step": 6640 + }, + { + "epoch": 0.76, + "learning_rate": 2.8956384022328943e-06, + "loss": 0.4455, + "step": 6641 + }, + { + "epoch": 0.76, + "learning_rate": 2.893033807795147e-06, + "loss": 0.4698, + "step": 6642 + }, + { + "epoch": 0.76, + "learning_rate": 2.8904301871704377e-06, + "loss": 0.4463, + "step": 6643 + }, + { + "epoch": 0.76, + "learning_rate": 2.8878275407155244e-06, + "loss": 0.4796, + "step": 6644 + }, + { + "epoch": 0.76, + "learning_rate": 2.885225868787025e-06, + "loss": 
0.4443, + "step": 6645 + }, + { + "epoch": 0.76, + "learning_rate": 2.8826251717414245e-06, + "loss": 0.458, + "step": 6646 + }, + { + "epoch": 0.76, + "learning_rate": 2.8800254499350797e-06, + "loss": 0.4368, + "step": 6647 + }, + { + "epoch": 0.76, + "learning_rate": 2.8774267037242133e-06, + "loss": 0.4671, + "step": 6648 + }, + { + "epoch": 0.76, + "learning_rate": 2.8748289334649036e-06, + "loss": 0.4454, + "step": 6649 + }, + { + "epoch": 0.76, + "learning_rate": 2.8722321395131127e-06, + "loss": 0.433, + "step": 6650 + }, + { + "epoch": 0.76, + "learning_rate": 2.8696363222246504e-06, + "loss": 0.4398, + "step": 6651 + }, + { + "epoch": 0.76, + "learning_rate": 2.8670414819552082e-06, + "loss": 0.4618, + "step": 6652 + }, + { + "epoch": 0.76, + "learning_rate": 2.864447619060333e-06, + "loss": 0.4626, + "step": 6653 + }, + { + "epoch": 0.76, + "learning_rate": 2.861854733895446e-06, + "loss": 0.4605, + "step": 6654 + }, + { + "epoch": 0.76, + "learning_rate": 2.8592628268158273e-06, + "loss": 0.4402, + "step": 6655 + }, + { + "epoch": 0.76, + "learning_rate": 2.8566718981766238e-06, + "loss": 0.4339, + "step": 6656 + }, + { + "epoch": 0.76, + "learning_rate": 2.854081948332854e-06, + "loss": 0.4739, + "step": 6657 + }, + { + "epoch": 0.76, + "learning_rate": 2.8514929776394006e-06, + "loss": 0.4559, + "step": 6658 + }, + { + "epoch": 0.76, + "learning_rate": 2.8489049864510053e-06, + "loss": 0.4355, + "step": 6659 + }, + { + "epoch": 0.76, + "learning_rate": 2.846317975122287e-06, + "loss": 0.4431, + "step": 6660 + }, + { + "epoch": 0.76, + "learning_rate": 2.843731944007717e-06, + "loss": 0.4666, + "step": 6661 + }, + { + "epoch": 0.76, + "learning_rate": 2.841146893461646e-06, + "loss": 0.4639, + "step": 6662 + }, + { + "epoch": 0.76, + "learning_rate": 2.8385628238382803e-06, + "loss": 0.4551, + "step": 6663 + }, + { + "epoch": 0.76, + "learning_rate": 2.8359797354916907e-06, + "loss": 0.4547, + "step": 6664 + }, + { + "epoch": 0.76, + "learning_rate": 
2.833397628775827e-06, + "loss": 0.4457, + "step": 6665 + }, + { + "epoch": 0.76, + "learning_rate": 2.830816504044488e-06, + "loss": 0.4398, + "step": 6666 + }, + { + "epoch": 0.76, + "learning_rate": 2.8282363616513475e-06, + "loss": 0.4445, + "step": 6667 + }, + { + "epoch": 0.76, + "learning_rate": 2.8256572019499474e-06, + "loss": 0.4753, + "step": 6668 + }, + { + "epoch": 0.76, + "learning_rate": 2.8230790252936826e-06, + "loss": 0.4523, + "step": 6669 + }, + { + "epoch": 0.76, + "learning_rate": 2.8205018320358268e-06, + "loss": 0.4613, + "step": 6670 + }, + { + "epoch": 0.76, + "learning_rate": 2.8179256225295114e-06, + "loss": 0.467, + "step": 6671 + }, + { + "epoch": 0.76, + "learning_rate": 2.815350397127732e-06, + "loss": 0.469, + "step": 6672 + }, + { + "epoch": 0.76, + "learning_rate": 2.8127761561833554e-06, + "loss": 0.4359, + "step": 6673 + }, + { + "epoch": 0.76, + "learning_rate": 2.810202900049106e-06, + "loss": 0.4473, + "step": 6674 + }, + { + "epoch": 0.76, + "learning_rate": 2.8076306290775823e-06, + "loss": 0.4625, + "step": 6675 + }, + { + "epoch": 0.76, + "learning_rate": 2.8050593436212394e-06, + "loss": 0.451, + "step": 6676 + }, + { + "epoch": 0.76, + "learning_rate": 2.8024890440324e-06, + "loss": 0.4503, + "step": 6677 + }, + { + "epoch": 0.76, + "learning_rate": 2.7999197306632576e-06, + "loss": 0.4582, + "step": 6678 + }, + { + "epoch": 0.76, + "learning_rate": 2.79735140386586e-06, + "loss": 0.4593, + "step": 6679 + }, + { + "epoch": 0.76, + "learning_rate": 2.7947840639921308e-06, + "loss": 0.4524, + "step": 6680 + }, + { + "epoch": 0.76, + "learning_rate": 2.792217711393849e-06, + "loss": 0.4527, + "step": 6681 + }, + { + "epoch": 0.76, + "learning_rate": 2.78965234642266e-06, + "loss": 0.452, + "step": 6682 + }, + { + "epoch": 0.76, + "learning_rate": 2.7870879694300825e-06, + "loss": 0.4344, + "step": 6683 + }, + { + "epoch": 0.76, + "learning_rate": 2.7845245807674893e-06, + "loss": 0.4756, + "step": 6684 + }, + { + "epoch": 
0.76, + "learning_rate": 2.7819621807861197e-06, + "loss": 0.4306, + "step": 6685 + }, + { + "epoch": 0.76, + "learning_rate": 2.779400769837083e-06, + "loss": 0.4731, + "step": 6686 + }, + { + "epoch": 0.76, + "learning_rate": 2.776840348271348e-06, + "loss": 0.4586, + "step": 6687 + }, + { + "epoch": 0.76, + "learning_rate": 2.7742809164397546e-06, + "loss": 0.4586, + "step": 6688 + }, + { + "epoch": 0.76, + "learning_rate": 2.7717224746929984e-06, + "loss": 0.4355, + "step": 6689 + }, + { + "epoch": 0.76, + "learning_rate": 2.769165023381639e-06, + "loss": 0.453, + "step": 6690 + }, + { + "epoch": 0.76, + "learning_rate": 2.7666085628561126e-06, + "loss": 0.4595, + "step": 6691 + }, + { + "epoch": 0.76, + "learning_rate": 2.764053093466702e-06, + "loss": 0.4609, + "step": 6692 + }, + { + "epoch": 0.76, + "learning_rate": 2.7614986155635737e-06, + "loss": 0.4531, + "step": 6693 + }, + { + "epoch": 0.77, + "learning_rate": 2.7589451294967383e-06, + "loss": 0.4557, + "step": 6694 + }, + { + "epoch": 0.77, + "learning_rate": 2.75639263561609e-06, + "loss": 0.4522, + "step": 6695 + }, + { + "epoch": 0.77, + "learning_rate": 2.753841134271368e-06, + "loss": 0.468, + "step": 6696 + }, + { + "epoch": 0.77, + "learning_rate": 2.7512906258121907e-06, + "loss": 0.4466, + "step": 6697 + }, + { + "epoch": 0.77, + "learning_rate": 2.7487411105880356e-06, + "loss": 0.454, + "step": 6698 + }, + { + "epoch": 0.77, + "learning_rate": 2.7461925889482422e-06, + "loss": 0.4294, + "step": 6699 + }, + { + "epoch": 0.77, + "learning_rate": 2.7436450612420098e-06, + "loss": 0.4556, + "step": 6700 + }, + { + "epoch": 0.77, + "learning_rate": 2.7410985278184144e-06, + "loss": 0.4682, + "step": 6701 + }, + { + "epoch": 0.77, + "learning_rate": 2.738552989026384e-06, + "loss": 0.4551, + "step": 6702 + }, + { + "epoch": 0.77, + "learning_rate": 2.7360084452147108e-06, + "loss": 0.4589, + "step": 6703 + }, + { + "epoch": 0.77, + "learning_rate": 2.7334648967320587e-06, + "loss": 0.4527, + 
"step": 6704 + }, + { + "epoch": 0.77, + "learning_rate": 2.7309223439269516e-06, + "loss": 0.4516, + "step": 6705 + }, + { + "epoch": 0.77, + "learning_rate": 2.728380787147772e-06, + "loss": 0.4418, + "step": 6706 + }, + { + "epoch": 0.77, + "learning_rate": 2.725840226742774e-06, + "loss": 0.4448, + "step": 6707 + }, + { + "epoch": 0.77, + "learning_rate": 2.7233006630600667e-06, + "loss": 0.4588, + "step": 6708 + }, + { + "epoch": 0.77, + "learning_rate": 2.7207620964476323e-06, + "loss": 0.4534, + "step": 6709 + }, + { + "epoch": 0.77, + "learning_rate": 2.7182245272533046e-06, + "loss": 0.4508, + "step": 6710 + }, + { + "epoch": 0.77, + "learning_rate": 2.715687955824795e-06, + "loss": 0.4542, + "step": 6711 + }, + { + "epoch": 0.77, + "learning_rate": 2.713152382509665e-06, + "loss": 0.451, + "step": 6712 + }, + { + "epoch": 0.77, + "learning_rate": 2.7106178076553446e-06, + "loss": 0.4442, + "step": 6713 + }, + { + "epoch": 0.77, + "learning_rate": 2.7080842316091273e-06, + "loss": 0.4464, + "step": 6714 + }, + { + "epoch": 0.77, + "learning_rate": 2.7055516547181736e-06, + "loss": 0.4548, + "step": 6715 + }, + { + "epoch": 0.77, + "learning_rate": 2.703020077329498e-06, + "loss": 0.4454, + "step": 6716 + }, + { + "epoch": 0.77, + "learning_rate": 2.7004894997899878e-06, + "loss": 0.438, + "step": 6717 + }, + { + "epoch": 0.77, + "learning_rate": 2.6979599224463838e-06, + "loss": 0.4595, + "step": 6718 + }, + { + "epoch": 0.77, + "learning_rate": 2.6954313456452995e-06, + "loss": 0.4586, + "step": 6719 + }, + { + "epoch": 0.77, + "learning_rate": 2.6929037697332037e-06, + "loss": 0.4458, + "step": 6720 + }, + { + "epoch": 0.77, + "learning_rate": 2.6903771950564294e-06, + "loss": 0.4398, + "step": 6721 + }, + { + "epoch": 0.77, + "learning_rate": 2.6878516219611773e-06, + "loss": 0.4544, + "step": 6722 + }, + { + "epoch": 0.77, + "learning_rate": 2.6853270507935013e-06, + "loss": 0.4588, + "step": 6723 + }, + { + "epoch": 0.77, + "learning_rate": 
2.6828034818993285e-06, + "loss": 0.4461, + "step": 6724 + }, + { + "epoch": 0.77, + "learning_rate": 2.680280915624448e-06, + "loss": 0.4515, + "step": 6725 + }, + { + "epoch": 0.77, + "learning_rate": 2.6777593523144986e-06, + "loss": 0.4567, + "step": 6726 + }, + { + "epoch": 0.77, + "learning_rate": 2.675238792314999e-06, + "loss": 0.4453, + "step": 6727 + }, + { + "epoch": 0.77, + "learning_rate": 2.6727192359713196e-06, + "loss": 0.4547, + "step": 6728 + }, + { + "epoch": 0.77, + "learning_rate": 2.670200683628691e-06, + "loss": 0.4584, + "step": 6729 + }, + { + "epoch": 0.77, + "learning_rate": 2.6676831356322184e-06, + "loss": 0.4569, + "step": 6730 + }, + { + "epoch": 0.77, + "learning_rate": 2.6651665923268555e-06, + "loss": 0.4601, + "step": 6731 + }, + { + "epoch": 0.77, + "learning_rate": 2.6626510540574314e-06, + "loss": 0.4423, + "step": 6732 + }, + { + "epoch": 0.77, + "learning_rate": 2.6601365211686248e-06, + "loss": 0.459, + "step": 6733 + }, + { + "epoch": 0.77, + "learning_rate": 2.657622994004986e-06, + "loss": 0.4568, + "step": 6734 + }, + { + "epoch": 0.77, + "learning_rate": 2.655110472910927e-06, + "loss": 0.4484, + "step": 6735 + }, + { + "epoch": 0.77, + "learning_rate": 2.652598958230713e-06, + "loss": 0.4592, + "step": 6736 + }, + { + "epoch": 0.77, + "learning_rate": 2.6500884503084857e-06, + "loss": 0.4461, + "step": 6737 + }, + { + "epoch": 0.77, + "learning_rate": 2.647578949488234e-06, + "loss": 0.4984, + "step": 6738 + }, + { + "epoch": 0.77, + "learning_rate": 2.645070456113816e-06, + "loss": 0.4566, + "step": 6739 + }, + { + "epoch": 0.77, + "learning_rate": 2.6425629705289556e-06, + "loss": 0.4459, + "step": 6740 + }, + { + "epoch": 0.77, + "learning_rate": 2.640056493077231e-06, + "loss": 0.4461, + "step": 6741 + }, + { + "epoch": 0.77, + "learning_rate": 2.637551024102084e-06, + "loss": 0.4478, + "step": 6742 + }, + { + "epoch": 0.77, + "learning_rate": 2.6350465639468213e-06, + "loss": 0.44, + "step": 6743 + }, + { + 
"epoch": 0.77, + "learning_rate": 2.6325431129546107e-06, + "loss": 0.4568, + "step": 6744 + }, + { + "epoch": 0.77, + "learning_rate": 2.6300406714684834e-06, + "loss": 0.4455, + "step": 6745 + }, + { + "epoch": 0.77, + "learning_rate": 2.627539239831328e-06, + "loss": 0.4659, + "step": 6746 + }, + { + "epoch": 0.77, + "learning_rate": 2.625038818385892e-06, + "loss": 0.4441, + "step": 6747 + }, + { + "epoch": 0.77, + "learning_rate": 2.6225394074747956e-06, + "loss": 0.4484, + "step": 6748 + }, + { + "epoch": 0.77, + "learning_rate": 2.620041007440508e-06, + "loss": 0.4494, + "step": 6749 + }, + { + "epoch": 0.77, + "learning_rate": 2.617543618625371e-06, + "loss": 0.4613, + "step": 6750 + }, + { + "epoch": 0.77, + "learning_rate": 2.615047241371581e-06, + "loss": 0.4473, + "step": 6751 + }, + { + "epoch": 0.77, + "learning_rate": 2.6125518760211933e-06, + "loss": 0.459, + "step": 6752 + }, + { + "epoch": 0.77, + "learning_rate": 2.610057522916132e-06, + "loss": 0.456, + "step": 6753 + }, + { + "epoch": 0.77, + "learning_rate": 2.6075641823981814e-06, + "loss": 0.4547, + "step": 6754 + }, + { + "epoch": 0.77, + "learning_rate": 2.6050718548089804e-06, + "loss": 0.4559, + "step": 6755 + }, + { + "epoch": 0.77, + "learning_rate": 2.602580540490038e-06, + "loss": 0.463, + "step": 6756 + }, + { + "epoch": 0.77, + "learning_rate": 2.6000902397827154e-06, + "loss": 0.4474, + "step": 6757 + }, + { + "epoch": 0.77, + "learning_rate": 2.5976009530282455e-06, + "loss": 0.4649, + "step": 6758 + }, + { + "epoch": 0.77, + "learning_rate": 2.595112680567711e-06, + "loss": 0.4483, + "step": 6759 + }, + { + "epoch": 0.77, + "learning_rate": 2.592625422742059e-06, + "loss": 0.4602, + "step": 6760 + }, + { + "epoch": 0.77, + "learning_rate": 2.5901391798921018e-06, + "loss": 0.4647, + "step": 6761 + }, + { + "epoch": 0.77, + "learning_rate": 2.5876539523585144e-06, + "loss": 0.4619, + "step": 6762 + }, + { + "epoch": 0.77, + "learning_rate": 2.585169740481822e-06, + "loss": 
0.4544, + "step": 6763 + }, + { + "epoch": 0.77, + "learning_rate": 2.582686544602423e-06, + "loss": 0.4573, + "step": 6764 + }, + { + "epoch": 0.77, + "learning_rate": 2.5802043650605645e-06, + "loss": 0.4429, + "step": 6765 + }, + { + "epoch": 0.77, + "learning_rate": 2.577723202196366e-06, + "loss": 0.4496, + "step": 6766 + }, + { + "epoch": 0.77, + "learning_rate": 2.575243056349801e-06, + "loss": 0.4589, + "step": 6767 + }, + { + "epoch": 0.77, + "learning_rate": 2.5727639278606997e-06, + "loss": 0.4451, + "step": 6768 + }, + { + "epoch": 0.77, + "learning_rate": 2.570285817068765e-06, + "loss": 0.4611, + "step": 6769 + }, + { + "epoch": 0.77, + "learning_rate": 2.5678087243135476e-06, + "loss": 0.4572, + "step": 6770 + }, + { + "epoch": 0.77, + "learning_rate": 2.565332649934468e-06, + "loss": 0.4633, + "step": 6771 + }, + { + "epoch": 0.77, + "learning_rate": 2.5628575942708047e-06, + "loss": 0.4609, + "step": 6772 + }, + { + "epoch": 0.77, + "learning_rate": 2.560383557661692e-06, + "loss": 0.454, + "step": 6773 + }, + { + "epoch": 0.77, + "learning_rate": 2.5579105404461325e-06, + "loss": 0.4556, + "step": 6774 + }, + { + "epoch": 0.77, + "learning_rate": 2.555438542962979e-06, + "loss": 0.4432, + "step": 6775 + }, + { + "epoch": 0.77, + "learning_rate": 2.5529675655509567e-06, + "loss": 0.4549, + "step": 6776 + }, + { + "epoch": 0.77, + "learning_rate": 2.550497608548642e-06, + "loss": 0.4414, + "step": 6777 + }, + { + "epoch": 0.77, + "learning_rate": 2.5480286722944712e-06, + "loss": 0.4531, + "step": 6778 + }, + { + "epoch": 0.77, + "learning_rate": 2.5455607571267484e-06, + "loss": 0.4699, + "step": 6779 + }, + { + "epoch": 0.77, + "learning_rate": 2.543093863383629e-06, + "loss": 0.4576, + "step": 6780 + }, + { + "epoch": 0.77, + "learning_rate": 2.540627991403134e-06, + "loss": 0.4535, + "step": 6781 + }, + { + "epoch": 0.78, + "learning_rate": 2.5381631415231455e-06, + "loss": 0.479, + "step": 6782 + }, + { + "epoch": 0.78, + "learning_rate": 
2.535699314081399e-06, + "loss": 0.4589, + "step": 6783 + }, + { + "epoch": 0.78, + "learning_rate": 2.5332365094154975e-06, + "loss": 0.4563, + "step": 6784 + }, + { + "epoch": 0.78, + "learning_rate": 2.530774727862899e-06, + "loss": 0.4539, + "step": 6785 + }, + { + "epoch": 0.78, + "learning_rate": 2.5283139697609195e-06, + "loss": 0.4446, + "step": 6786 + }, + { + "epoch": 0.78, + "learning_rate": 2.525854235446743e-06, + "loss": 0.4636, + "step": 6787 + }, + { + "epoch": 0.78, + "learning_rate": 2.5233955252574027e-06, + "loss": 0.4485, + "step": 6788 + }, + { + "epoch": 0.78, + "learning_rate": 2.5209378395298013e-06, + "loss": 0.4408, + "step": 6789 + }, + { + "epoch": 0.78, + "learning_rate": 2.5184811786006923e-06, + "loss": 0.4678, + "step": 6790 + }, + { + "epoch": 0.78, + "learning_rate": 2.516025542806696e-06, + "loss": 0.4418, + "step": 6791 + }, + { + "epoch": 0.78, + "learning_rate": 2.5135709324842906e-06, + "loss": 0.4579, + "step": 6792 + }, + { + "epoch": 0.78, + "learning_rate": 2.511117347969809e-06, + "loss": 0.453, + "step": 6793 + }, + { + "epoch": 0.78, + "learning_rate": 2.508664789599451e-06, + "loss": 0.4449, + "step": 6794 + }, + { + "epoch": 0.78, + "learning_rate": 2.50621325770927e-06, + "loss": 0.4601, + "step": 6795 + }, + { + "epoch": 0.78, + "learning_rate": 2.503762752635177e-06, + "loss": 0.4701, + "step": 6796 + }, + { + "epoch": 0.78, + "learning_rate": 2.501313274712952e-06, + "loss": 0.4432, + "step": 6797 + }, + { + "epoch": 0.78, + "learning_rate": 2.4988648242782255e-06, + "loss": 0.4596, + "step": 6798 + }, + { + "epoch": 0.78, + "learning_rate": 2.4964174016664865e-06, + "loss": 0.4429, + "step": 6799 + }, + { + "epoch": 0.78, + "learning_rate": 2.4939710072130895e-06, + "loss": 0.4535, + "step": 6800 + }, + { + "epoch": 0.78, + "learning_rate": 2.4915256412532463e-06, + "loss": 0.4357, + "step": 6801 + }, + { + "epoch": 0.78, + "learning_rate": 2.4890813041220286e-06, + "loss": 0.4536, + "step": 6802 + }, + { + 
"epoch": 0.78, + "learning_rate": 2.486637996154362e-06, + "loss": 0.442, + "step": 6803 + }, + { + "epoch": 0.78, + "learning_rate": 2.4841957176850306e-06, + "loss": 0.4543, + "step": 6804 + }, + { + "epoch": 0.78, + "learning_rate": 2.4817544690486896e-06, + "loss": 0.4535, + "step": 6805 + }, + { + "epoch": 0.78, + "learning_rate": 2.4793142505798363e-06, + "loss": 0.4411, + "step": 6806 + }, + { + "epoch": 0.78, + "learning_rate": 2.4768750626128414e-06, + "loss": 0.4497, + "step": 6807 + }, + { + "epoch": 0.78, + "learning_rate": 2.4744369054819252e-06, + "loss": 0.4533, + "step": 6808 + }, + { + "epoch": 0.78, + "learning_rate": 2.4719997795211683e-06, + "loss": 0.4417, + "step": 6809 + }, + { + "epoch": 0.78, + "learning_rate": 2.4695636850645112e-06, + "loss": 0.4485, + "step": 6810 + }, + { + "epoch": 0.78, + "learning_rate": 2.467128622445759e-06, + "loss": 0.4413, + "step": 6811 + }, + { + "epoch": 0.78, + "learning_rate": 2.464694591998563e-06, + "loss": 0.4554, + "step": 6812 + }, + { + "epoch": 0.78, + "learning_rate": 2.4622615940564454e-06, + "loss": 0.4551, + "step": 6813 + }, + { + "epoch": 0.78, + "learning_rate": 2.4598296289527745e-06, + "loss": 0.4501, + "step": 6814 + }, + { + "epoch": 0.78, + "learning_rate": 2.4573986970207906e-06, + "loss": 0.4521, + "step": 6815 + }, + { + "epoch": 0.78, + "learning_rate": 2.4549687985935832e-06, + "loss": 0.4545, + "step": 6816 + }, + { + "epoch": 0.78, + "learning_rate": 2.452539934004099e-06, + "loss": 0.436, + "step": 6817 + }, + { + "epoch": 0.78, + "learning_rate": 2.4501121035851494e-06, + "loss": 0.4512, + "step": 6818 + }, + { + "epoch": 0.78, + "learning_rate": 2.447685307669405e-06, + "loss": 0.4505, + "step": 6819 + }, + { + "epoch": 0.78, + "learning_rate": 2.4452595465893847e-06, + "loss": 0.4493, + "step": 6820 + }, + { + "epoch": 0.78, + "learning_rate": 2.4428348206774775e-06, + "loss": 0.4526, + "step": 6821 + }, + { + "epoch": 0.78, + "learning_rate": 2.4404111302659195e-06, + "loss": 
0.4747, + "step": 6822 + }, + { + "epoch": 0.78, + "learning_rate": 2.4379884756868167e-06, + "loss": 0.4277, + "step": 6823 + }, + { + "epoch": 0.78, + "learning_rate": 2.4355668572721224e-06, + "loss": 0.4578, + "step": 6824 + }, + { + "epoch": 0.78, + "learning_rate": 2.433146275353652e-06, + "loss": 0.4408, + "step": 6825 + }, + { + "epoch": 0.78, + "learning_rate": 2.4307267302630834e-06, + "loss": 0.4679, + "step": 6826 + }, + { + "epoch": 0.78, + "learning_rate": 2.428308222331942e-06, + "loss": 0.4573, + "step": 6827 + }, + { + "epoch": 0.78, + "learning_rate": 2.4258907518916207e-06, + "loss": 0.4575, + "step": 6828 + }, + { + "epoch": 0.78, + "learning_rate": 2.4234743192733713e-06, + "loss": 0.4426, + "step": 6829 + }, + { + "epoch": 0.78, + "learning_rate": 2.4210589248082914e-06, + "loss": 0.4606, + "step": 6830 + }, + { + "epoch": 0.78, + "learning_rate": 2.4186445688273508e-06, + "loss": 0.4585, + "step": 6831 + }, + { + "epoch": 0.78, + "learning_rate": 2.416231251661364e-06, + "loss": 0.4363, + "step": 6832 + }, + { + "epoch": 0.78, + "learning_rate": 2.4138189736410144e-06, + "loss": 0.4513, + "step": 6833 + }, + { + "epoch": 0.78, + "learning_rate": 2.411407735096836e-06, + "loss": 0.4403, + "step": 6834 + }, + { + "epoch": 0.78, + "learning_rate": 2.40899753635922e-06, + "loss": 0.435, + "step": 6835 + }, + { + "epoch": 0.78, + "learning_rate": 2.406588377758421e-06, + "loss": 0.4783, + "step": 6836 + }, + { + "epoch": 0.78, + "learning_rate": 2.4041802596245444e-06, + "loss": 0.4533, + "step": 6837 + }, + { + "epoch": 0.78, + "learning_rate": 2.4017731822875566e-06, + "loss": 0.4545, + "step": 6838 + }, + { + "epoch": 0.78, + "learning_rate": 2.399367146077286e-06, + "loss": 0.4624, + "step": 6839 + }, + { + "epoch": 0.78, + "learning_rate": 2.3969621513234066e-06, + "loss": 0.46, + "step": 6840 + }, + { + "epoch": 0.78, + "learning_rate": 2.394558198355462e-06, + "loss": 0.4667, + "step": 6841 + }, + { + "epoch": 0.78, + "learning_rate": 
2.3921552875028443e-06, + "loss": 0.4611, + "step": 6842 + }, + { + "epoch": 0.78, + "learning_rate": 2.3897534190948034e-06, + "loss": 0.4417, + "step": 6843 + }, + { + "epoch": 0.78, + "learning_rate": 2.387352593460455e-06, + "loss": 0.4708, + "step": 6844 + }, + { + "epoch": 0.78, + "learning_rate": 2.384952810928759e-06, + "loss": 0.456, + "step": 6845 + }, + { + "epoch": 0.78, + "learning_rate": 2.3825540718285454e-06, + "loss": 0.4431, + "step": 6846 + }, + { + "epoch": 0.78, + "learning_rate": 2.3801563764884905e-06, + "loss": 0.4557, + "step": 6847 + }, + { + "epoch": 0.78, + "learning_rate": 2.377759725237133e-06, + "loss": 0.4571, + "step": 6848 + }, + { + "epoch": 0.78, + "learning_rate": 2.375364118402872e-06, + "loss": 0.4388, + "step": 6849 + }, + { + "epoch": 0.78, + "learning_rate": 2.3729695563139554e-06, + "loss": 0.4392, + "step": 6850 + }, + { + "epoch": 0.78, + "learning_rate": 2.3705760392984887e-06, + "loss": 0.4436, + "step": 6851 + }, + { + "epoch": 0.78, + "learning_rate": 2.3681835676844444e-06, + "loss": 0.4396, + "step": 6852 + }, + { + "epoch": 0.78, + "learning_rate": 2.3657921417996364e-06, + "loss": 0.4496, + "step": 6853 + }, + { + "epoch": 0.78, + "learning_rate": 2.363401761971752e-06, + "loss": 0.4712, + "step": 6854 + }, + { + "epoch": 0.78, + "learning_rate": 2.361012428528321e-06, + "loss": 0.4381, + "step": 6855 + }, + { + "epoch": 0.78, + "learning_rate": 2.3586241417967336e-06, + "loss": 0.4399, + "step": 6856 + }, + { + "epoch": 0.78, + "learning_rate": 2.356236902104242e-06, + "loss": 0.4752, + "step": 6857 + }, + { + "epoch": 0.78, + "learning_rate": 2.3538507097779505e-06, + "loss": 0.4426, + "step": 6858 + }, + { + "epoch": 0.78, + "learning_rate": 2.351465565144825e-06, + "loss": 0.4458, + "step": 6859 + }, + { + "epoch": 0.78, + "learning_rate": 2.3490814685316777e-06, + "loss": 0.4235, + "step": 6860 + }, + { + "epoch": 0.78, + "learning_rate": 2.3466984202651833e-06, + "loss": 0.4463, + "step": 6861 + }, + { + 
"epoch": 0.78, + "learning_rate": 2.344316420671876e-06, + "loss": 0.4494, + "step": 6862 + }, + { + "epoch": 0.78, + "learning_rate": 2.3419354700781393e-06, + "loss": 0.4436, + "step": 6863 + }, + { + "epoch": 0.78, + "learning_rate": 2.339555568810221e-06, + "loss": 0.4534, + "step": 6864 + }, + { + "epoch": 0.78, + "learning_rate": 2.3371767171942183e-06, + "loss": 0.4484, + "step": 6865 + }, + { + "epoch": 0.78, + "learning_rate": 2.3347989155560835e-06, + "loss": 0.4561, + "step": 6866 + }, + { + "epoch": 0.78, + "learning_rate": 2.3324221642216328e-06, + "loss": 0.457, + "step": 6867 + }, + { + "epoch": 0.78, + "learning_rate": 2.3300464635165353e-06, + "loss": 0.4517, + "step": 6868 + }, + { + "epoch": 0.79, + "learning_rate": 2.32767181376631e-06, + "loss": 0.4567, + "step": 6869 + }, + { + "epoch": 0.79, + "learning_rate": 2.3252982152963434e-06, + "loss": 0.4546, + "step": 6870 + }, + { + "epoch": 0.79, + "learning_rate": 2.3229256684318646e-06, + "loss": 0.4425, + "step": 6871 + }, + { + "epoch": 0.79, + "learning_rate": 2.320554173497972e-06, + "loss": 0.4547, + "step": 6872 + }, + { + "epoch": 0.79, + "learning_rate": 2.31818373081961e-06, + "loss": 0.4633, + "step": 6873 + }, + { + "epoch": 0.79, + "learning_rate": 2.3158143407215796e-06, + "loss": 0.4514, + "step": 6874 + }, + { + "epoch": 0.79, + "learning_rate": 2.3134460035285433e-06, + "loss": 0.4556, + "step": 6875 + }, + { + "epoch": 0.79, + "learning_rate": 2.3110787195650173e-06, + "loss": 0.4472, + "step": 6876 + }, + { + "epoch": 0.79, + "learning_rate": 2.3087124891553703e-06, + "loss": 0.4514, + "step": 6877 + }, + { + "epoch": 0.79, + "learning_rate": 2.30634731262383e-06, + "loss": 0.4429, + "step": 6878 + }, + { + "epoch": 0.79, + "learning_rate": 2.3039831902944766e-06, + "loss": 0.4412, + "step": 6879 + }, + { + "epoch": 0.79, + "learning_rate": 2.3016201224912504e-06, + "loss": 0.4455, + "step": 6880 + }, + { + "epoch": 0.79, + "learning_rate": 2.299258109537943e-06, + "loss": 
0.4689, + "step": 6881 + }, + { + "epoch": 0.79, + "learning_rate": 2.2968971517581994e-06, + "loss": 0.4426, + "step": 6882 + }, + { + "epoch": 0.79, + "learning_rate": 2.2945372494755304e-06, + "loss": 0.4538, + "step": 6883 + }, + { + "epoch": 0.79, + "learning_rate": 2.2921784030132886e-06, + "loss": 0.4454, + "step": 6884 + }, + { + "epoch": 0.79, + "learning_rate": 2.289820612694692e-06, + "loss": 0.4694, + "step": 6885 + }, + { + "epoch": 0.79, + "learning_rate": 2.2874638788428128e-06, + "loss": 0.4444, + "step": 6886 + }, + { + "epoch": 0.79, + "learning_rate": 2.2851082017805704e-06, + "loss": 0.4471, + "step": 6887 + }, + { + "epoch": 0.79, + "learning_rate": 2.2827535818307513e-06, + "loss": 0.4536, + "step": 6888 + }, + { + "epoch": 0.79, + "learning_rate": 2.2804000193159848e-06, + "loss": 0.453, + "step": 6889 + }, + { + "epoch": 0.79, + "learning_rate": 2.278047514558769e-06, + "loss": 0.47, + "step": 6890 + }, + { + "epoch": 0.79, + "learning_rate": 2.2756960678814444e-06, + "loss": 0.4648, + "step": 6891 + }, + { + "epoch": 0.79, + "learning_rate": 2.2733456796062093e-06, + "loss": 0.4424, + "step": 6892 + }, + { + "epoch": 0.79, + "learning_rate": 2.270996350055126e-06, + "loss": 0.4472, + "step": 6893 + }, + { + "epoch": 0.79, + "learning_rate": 2.2686480795500986e-06, + "loss": 0.4621, + "step": 6894 + }, + { + "epoch": 0.79, + "learning_rate": 2.2663008684128964e-06, + "loss": 0.4299, + "step": 6895 + }, + { + "epoch": 0.79, + "learning_rate": 2.2639547169651423e-06, + "loss": 0.453, + "step": 6896 + }, + { + "epoch": 0.79, + "learning_rate": 2.2616096255283048e-06, + "loss": 0.4602, + "step": 6897 + }, + { + "epoch": 0.79, + "learning_rate": 2.25926559442372e-06, + "loss": 0.447, + "step": 6898 + }, + { + "epoch": 0.79, + "learning_rate": 2.2569226239725695e-06, + "loss": 0.4674, + "step": 6899 + }, + { + "epoch": 0.79, + "learning_rate": 2.2545807144958896e-06, + "loss": 0.4247, + "step": 6900 + }, + { + "epoch": 0.79, + "learning_rate": 
2.252239866314582e-06, + "loss": 0.4581, + "step": 6901 + }, + { + "epoch": 0.79, + "learning_rate": 2.249900079749385e-06, + "loss": 0.4323, + "step": 6902 + }, + { + "epoch": 0.79, + "learning_rate": 2.247561355120912e-06, + "loss": 0.442, + "step": 6903 + }, + { + "epoch": 0.79, + "learning_rate": 2.245223692749612e-06, + "loss": 0.4487, + "step": 6904 + }, + { + "epoch": 0.79, + "learning_rate": 2.2428870929558012e-06, + "loss": 0.4701, + "step": 6905 + }, + { + "epoch": 0.79, + "learning_rate": 2.240551556059647e-06, + "loss": 0.4617, + "step": 6906 + }, + { + "epoch": 0.79, + "learning_rate": 2.238217082381169e-06, + "loss": 0.4767, + "step": 6907 + }, + { + "epoch": 0.79, + "learning_rate": 2.235883672240239e-06, + "loss": 0.45, + "step": 6908 + }, + { + "epoch": 0.79, + "learning_rate": 2.233551325956591e-06, + "loss": 0.4683, + "step": 6909 + }, + { + "epoch": 0.79, + "learning_rate": 2.2312200438498043e-06, + "loss": 0.4498, + "step": 6910 + }, + { + "epoch": 0.79, + "learning_rate": 2.2288898262393212e-06, + "loss": 0.452, + "step": 6911 + }, + { + "epoch": 0.79, + "learning_rate": 2.2265606734444314e-06, + "loss": 0.4564, + "step": 6912 + }, + { + "epoch": 0.79, + "learning_rate": 2.2242325857842773e-06, + "loss": 0.4566, + "step": 6913 + }, + { + "epoch": 0.79, + "learning_rate": 2.2219055635778618e-06, + "loss": 0.4514, + "step": 6914 + }, + { + "epoch": 0.79, + "learning_rate": 2.219579607144039e-06, + "loss": 0.4527, + "step": 6915 + }, + { + "epoch": 0.79, + "learning_rate": 2.21725471680152e-06, + "loss": 0.4504, + "step": 6916 + }, + { + "epoch": 0.79, + "learning_rate": 2.214930892868864e-06, + "loss": 0.4526, + "step": 6917 + }, + { + "epoch": 0.79, + "learning_rate": 2.2126081356644836e-06, + "loss": 0.4445, + "step": 6918 + }, + { + "epoch": 0.79, + "learning_rate": 2.210286445506654e-06, + "loss": 0.4556, + "step": 6919 + }, + { + "epoch": 0.79, + "learning_rate": 2.207965822713496e-06, + "loss": 0.4578, + "step": 6920 + }, + { + "epoch": 
0.79, + "learning_rate": 2.205646267602983e-06, + "loss": 0.4627, + "step": 6921 + }, + { + "epoch": 0.79, + "learning_rate": 2.203327780492953e-06, + "loss": 0.454, + "step": 6922 + }, + { + "epoch": 0.79, + "learning_rate": 2.2010103617010836e-06, + "loss": 0.4572, + "step": 6923 + }, + { + "epoch": 0.79, + "learning_rate": 2.1986940115449173e-06, + "loss": 0.4375, + "step": 6924 + }, + { + "epoch": 0.79, + "learning_rate": 2.196378730341846e-06, + "loss": 0.4548, + "step": 6925 + }, + { + "epoch": 0.79, + "learning_rate": 2.1940645184091115e-06, + "loss": 0.4455, + "step": 6926 + }, + { + "epoch": 0.79, + "learning_rate": 2.1917513760638177e-06, + "loss": 0.4483, + "step": 6927 + }, + { + "epoch": 0.79, + "learning_rate": 2.18943930362291e-06, + "loss": 0.4536, + "step": 6928 + }, + { + "epoch": 0.79, + "learning_rate": 2.1871283014032007e-06, + "loss": 0.4454, + "step": 6929 + }, + { + "epoch": 0.79, + "learning_rate": 2.1848183697213467e-06, + "loss": 0.4693, + "step": 6930 + }, + { + "epoch": 0.79, + "learning_rate": 2.1825095088938553e-06, + "loss": 0.4515, + "step": 6931 + }, + { + "epoch": 0.79, + "learning_rate": 2.1802017192370963e-06, + "loss": 0.451, + "step": 6932 + }, + { + "epoch": 0.79, + "learning_rate": 2.1778950010672895e-06, + "loss": 0.4543, + "step": 6933 + }, + { + "epoch": 0.79, + "learning_rate": 2.1755893547005036e-06, + "loss": 0.4561, + "step": 6934 + }, + { + "epoch": 0.79, + "learning_rate": 2.173284780452667e-06, + "loss": 0.4421, + "step": 6935 + }, + { + "epoch": 0.79, + "learning_rate": 2.1709812786395545e-06, + "loss": 0.4498, + "step": 6936 + }, + { + "epoch": 0.79, + "learning_rate": 2.1686788495768006e-06, + "loss": 0.4632, + "step": 6937 + }, + { + "epoch": 0.79, + "learning_rate": 2.1663774935798886e-06, + "loss": 0.4392, + "step": 6938 + }, + { + "epoch": 0.79, + "learning_rate": 2.1640772109641504e-06, + "loss": 0.4639, + "step": 6939 + }, + { + "epoch": 0.79, + "learning_rate": 2.1617780020447854e-06, + "loss": 0.4477, + 
"step": 6940 + }, + { + "epoch": 0.79, + "learning_rate": 2.1594798671368265e-06, + "loss": 0.4378, + "step": 6941 + }, + { + "epoch": 0.79, + "learning_rate": 2.157182806555177e-06, + "loss": 0.4584, + "step": 6942 + }, + { + "epoch": 0.79, + "learning_rate": 2.1548868206145846e-06, + "loss": 0.4768, + "step": 6943 + }, + { + "epoch": 0.79, + "learning_rate": 2.1525919096296455e-06, + "loss": 0.4334, + "step": 6944 + }, + { + "epoch": 0.79, + "learning_rate": 2.1502980739148215e-06, + "loss": 0.4477, + "step": 6945 + }, + { + "epoch": 0.79, + "learning_rate": 2.1480053137844115e-06, + "loss": 0.4488, + "step": 6946 + }, + { + "epoch": 0.79, + "learning_rate": 2.1457136295525817e-06, + "loss": 0.4608, + "step": 6947 + }, + { + "epoch": 0.79, + "learning_rate": 2.1434230215333407e-06, + "loss": 0.4605, + "step": 6948 + }, + { + "epoch": 0.79, + "learning_rate": 2.14113349004055e-06, + "loss": 0.4562, + "step": 6949 + }, + { + "epoch": 0.79, + "learning_rate": 2.138845035387932e-06, + "loss": 0.4482, + "step": 6950 + }, + { + "epoch": 0.79, + "learning_rate": 2.1365576578890513e-06, + "loss": 0.4535, + "step": 6951 + }, + { + "epoch": 0.79, + "learning_rate": 2.1342713578573327e-06, + "loss": 0.44, + "step": 6952 + }, + { + "epoch": 0.79, + "learning_rate": 2.131986135606051e-06, + "loss": 0.447, + "step": 6953 + }, + { + "epoch": 0.79, + "learning_rate": 2.1297019914483297e-06, + "loss": 0.4267, + "step": 6954 + }, + { + "epoch": 0.79, + "learning_rate": 2.1274189256971523e-06, + "loss": 0.442, + "step": 6955 + }, + { + "epoch": 0.79, + "learning_rate": 2.1251369386653454e-06, + "loss": 0.4605, + "step": 6956 + }, + { + "epoch": 0.8, + "learning_rate": 2.122856030665591e-06, + "loss": 0.4559, + "step": 6957 + }, + { + "epoch": 0.8, + "learning_rate": 2.1205762020104303e-06, + "loss": 0.4631, + "step": 6958 + }, + { + "epoch": 0.8, + "learning_rate": 2.1182974530122435e-06, + "loss": 0.4636, + "step": 6959 + }, + { + "epoch": 0.8, + "learning_rate": 
2.1160197839832774e-06, + "loss": 0.4414, + "step": 6960 + }, + { + "epoch": 0.8, + "learning_rate": 2.113743195235617e-06, + "loss": 0.4593, + "step": 6961 + }, + { + "epoch": 0.8, + "learning_rate": 2.111467687081209e-06, + "loss": 0.4448, + "step": 6962 + }, + { + "epoch": 0.8, + "learning_rate": 2.109193259831851e-06, + "loss": 0.4518, + "step": 6963 + }, + { + "epoch": 0.8, + "learning_rate": 2.106919913799188e-06, + "loss": 0.4534, + "step": 6964 + }, + { + "epoch": 0.8, + "learning_rate": 2.1046476492947155e-06, + "loss": 0.4483, + "step": 6965 + }, + { + "epoch": 0.8, + "learning_rate": 2.102376466629792e-06, + "loss": 0.4542, + "step": 6966 + }, + { + "epoch": 0.8, + "learning_rate": 2.100106366115613e-06, + "loss": 0.4583, + "step": 6967 + }, + { + "epoch": 0.8, + "learning_rate": 2.0978373480632386e-06, + "loss": 0.4349, + "step": 6968 + }, + { + "epoch": 0.8, + "learning_rate": 2.0955694127835736e-06, + "loss": 0.447, + "step": 6969 + }, + { + "epoch": 0.8, + "learning_rate": 2.0933025605873702e-06, + "loss": 0.4611, + "step": 6970 + }, + { + "epoch": 0.8, + "learning_rate": 2.0910367917852437e-06, + "loss": 0.4435, + "step": 6971 + }, + { + "epoch": 0.8, + "learning_rate": 2.088772106687653e-06, + "loss": 0.4551, + "step": 6972 + }, + { + "epoch": 0.8, + "learning_rate": 2.0865085056049138e-06, + "loss": 0.4698, + "step": 6973 + }, + { + "epoch": 0.8, + "learning_rate": 2.084245988847188e-06, + "loss": 0.4405, + "step": 6974 + }, + { + "epoch": 0.8, + "learning_rate": 2.0819845567244868e-06, + "loss": 0.4661, + "step": 6975 + }, + { + "epoch": 0.8, + "learning_rate": 2.079724209546683e-06, + "loss": 0.4539, + "step": 6976 + }, + { + "epoch": 0.8, + "learning_rate": 2.077464947623492e-06, + "loss": 0.4373, + "step": 6977 + }, + { + "epoch": 0.8, + "learning_rate": 2.0752067712644807e-06, + "loss": 0.4411, + "step": 6978 + }, + { + "epoch": 0.8, + "learning_rate": 2.0729496807790737e-06, + "loss": 0.4557, + "step": 6979 + }, + { + "epoch": 0.8, + 
"learning_rate": 2.0706936764765393e-06, + "loss": 0.4516, + "step": 6980 + }, + { + "epoch": 0.8, + "learning_rate": 2.0684387586660027e-06, + "loss": 0.4538, + "step": 6981 + }, + { + "epoch": 0.8, + "learning_rate": 2.0661849276564394e-06, + "loss": 0.4527, + "step": 6982 + }, + { + "epoch": 0.8, + "learning_rate": 2.0639321837566696e-06, + "loss": 0.4482, + "step": 6983 + }, + { + "epoch": 0.8, + "learning_rate": 2.0616805272753758e-06, + "loss": 0.4652, + "step": 6984 + }, + { + "epoch": 0.8, + "learning_rate": 2.0594299585210796e-06, + "loss": 0.4302, + "step": 6985 + }, + { + "epoch": 0.8, + "learning_rate": 2.057180477802164e-06, + "loss": 0.4535, + "step": 6986 + }, + { + "epoch": 0.8, + "learning_rate": 2.054932085426856e-06, + "loss": 0.4474, + "step": 6987 + }, + { + "epoch": 0.8, + "learning_rate": 2.0526847817032326e-06, + "loss": 0.4389, + "step": 6988 + }, + { + "epoch": 0.8, + "learning_rate": 2.0504385669392268e-06, + "loss": 0.4523, + "step": 6989 + }, + { + "epoch": 0.8, + "learning_rate": 2.048193441442623e-06, + "loss": 0.4341, + "step": 6990 + }, + { + "epoch": 0.8, + "learning_rate": 2.0459494055210495e-06, + "loss": 0.462, + "step": 6991 + }, + { + "epoch": 0.8, + "learning_rate": 2.043706459481992e-06, + "loss": 0.4544, + "step": 6992 + }, + { + "epoch": 0.8, + "learning_rate": 2.0414646036327813e-06, + "loss": 0.4574, + "step": 6993 + }, + { + "epoch": 0.8, + "learning_rate": 2.039223838280606e-06, + "loss": 0.4469, + "step": 6994 + }, + { + "epoch": 0.8, + "learning_rate": 2.0369841637324992e-06, + "loss": 0.4479, + "step": 6995 + }, + { + "epoch": 0.8, + "learning_rate": 2.034745580295342e-06, + "loss": 0.4353, + "step": 6996 + }, + { + "epoch": 0.8, + "learning_rate": 2.0325080882758775e-06, + "loss": 0.4502, + "step": 6997 + }, + { + "epoch": 0.8, + "learning_rate": 2.030271687980685e-06, + "loss": 0.4471, + "step": 6998 + }, + { + "epoch": 0.8, + "learning_rate": 2.028036379716205e-06, + "loss": 0.457, + "step": 6999 + }, + { + 
"epoch": 0.8, + "learning_rate": 2.025802163788727e-06, + "loss": 0.4307, + "step": 7000 + }, + { + "epoch": 0.8, + "learning_rate": 2.023569040504384e-06, + "loss": 0.4597, + "step": 7001 + }, + { + "epoch": 0.8, + "learning_rate": 2.0213370101691675e-06, + "loss": 0.4454, + "step": 7002 + }, + { + "epoch": 0.8, + "learning_rate": 2.0191060730889132e-06, + "loss": 0.44, + "step": 7003 + }, + { + "epoch": 0.8, + "learning_rate": 2.016876229569308e-06, + "loss": 0.4463, + "step": 7004 + }, + { + "epoch": 0.8, + "learning_rate": 2.0146474799158935e-06, + "loss": 0.4651, + "step": 7005 + }, + { + "epoch": 0.8, + "learning_rate": 2.0124198244340543e-06, + "loss": 0.4465, + "step": 7006 + }, + { + "epoch": 0.8, + "learning_rate": 2.0101932634290345e-06, + "loss": 0.4694, + "step": 7007 + }, + { + "epoch": 0.8, + "learning_rate": 2.0079677972059163e-06, + "loss": 0.4513, + "step": 7008 + }, + { + "epoch": 0.8, + "learning_rate": 2.005743426069641e-06, + "loss": 0.4398, + "step": 7009 + }, + { + "epoch": 0.8, + "learning_rate": 2.003520150325e-06, + "loss": 0.4537, + "step": 7010 + }, + { + "epoch": 0.8, + "learning_rate": 2.0012979702766277e-06, + "loss": 0.4491, + "step": 7011 + }, + { + "epoch": 0.8, + "learning_rate": 1.9990768862290155e-06, + "loss": 0.4225, + "step": 7012 + }, + { + "epoch": 0.8, + "learning_rate": 1.9968568984865e-06, + "loss": 0.4786, + "step": 7013 + }, + { + "epoch": 0.8, + "learning_rate": 1.9946380073532668e-06, + "loss": 0.4358, + "step": 7014 + }, + { + "epoch": 0.8, + "learning_rate": 1.992420213133357e-06, + "loss": 0.4804, + "step": 7015 + }, + { + "epoch": 0.8, + "learning_rate": 1.9902035161306574e-06, + "loss": 0.4417, + "step": 7016 + }, + { + "epoch": 0.8, + "learning_rate": 1.9879879166489023e-06, + "loss": 0.4523, + "step": 7017 + }, + { + "epoch": 0.8, + "learning_rate": 1.9857734149916787e-06, + "loss": 0.4359, + "step": 7018 + }, + { + "epoch": 0.8, + "learning_rate": 1.983560011462425e-06, + "loss": 0.4723, + "step": 7019 + }, 
+ { + "epoch": 0.8, + "learning_rate": 1.981347706364429e-06, + "loss": 0.441, + "step": 7020 + }, + { + "epoch": 0.8, + "learning_rate": 1.979136500000822e-06, + "loss": 0.4501, + "step": 7021 + }, + { + "epoch": 0.8, + "learning_rate": 1.9769263926745886e-06, + "loss": 0.4283, + "step": 7022 + }, + { + "epoch": 0.8, + "learning_rate": 1.974717384688566e-06, + "loss": 0.4435, + "step": 7023 + }, + { + "epoch": 0.8, + "learning_rate": 1.972509476345432e-06, + "loss": 0.4534, + "step": 7024 + }, + { + "epoch": 0.8, + "learning_rate": 1.9703026679477253e-06, + "loss": 0.4595, + "step": 7025 + }, + { + "epoch": 0.8, + "learning_rate": 1.968096959797827e-06, + "loss": 0.4401, + "step": 7026 + }, + { + "epoch": 0.8, + "learning_rate": 1.9658923521979633e-06, + "loss": 0.4562, + "step": 7027 + }, + { + "epoch": 0.8, + "learning_rate": 1.963688845450218e-06, + "loss": 0.4607, + "step": 7028 + }, + { + "epoch": 0.8, + "learning_rate": 1.9614864398565212e-06, + "loss": 0.4341, + "step": 7029 + }, + { + "epoch": 0.8, + "learning_rate": 1.9592851357186537e-06, + "loss": 0.4564, + "step": 7030 + }, + { + "epoch": 0.8, + "learning_rate": 1.957084933338241e-06, + "loss": 0.4644, + "step": 7031 + }, + { + "epoch": 0.8, + "learning_rate": 1.9548858330167584e-06, + "loss": 0.4615, + "step": 7032 + }, + { + "epoch": 0.8, + "learning_rate": 1.9526878350555344e-06, + "loss": 0.4764, + "step": 7033 + }, + { + "epoch": 0.8, + "learning_rate": 1.9504909397557436e-06, + "loss": 0.4278, + "step": 7034 + }, + { + "epoch": 0.8, + "learning_rate": 1.9482951474184054e-06, + "loss": 0.4495, + "step": 7035 + }, + { + "epoch": 0.8, + "learning_rate": 1.9461004583443986e-06, + "loss": 0.447, + "step": 7036 + }, + { + "epoch": 0.8, + "learning_rate": 1.94390687283444e-06, + "loss": 0.4558, + "step": 7037 + }, + { + "epoch": 0.8, + "learning_rate": 1.9417143911891003e-06, + "loss": 0.4436, + "step": 7038 + }, + { + "epoch": 0.8, + "learning_rate": 1.939523013708803e-06, + "loss": 0.4531, + "step": 
7039 + }, + { + "epoch": 0.8, + "learning_rate": 1.937332740693809e-06, + "loss": 0.4468, + "step": 7040 + }, + { + "epoch": 0.8, + "learning_rate": 1.9351435724442412e-06, + "loss": 0.4673, + "step": 7041 + }, + { + "epoch": 0.8, + "learning_rate": 1.9329555092600593e-06, + "loss": 0.4542, + "step": 7042 + }, + { + "epoch": 0.8, + "learning_rate": 1.9307685514410803e-06, + "loss": 0.4458, + "step": 7043 + }, + { + "epoch": 0.81, + "learning_rate": 1.928582699286965e-06, + "loss": 0.4467, + "step": 7044 + }, + { + "epoch": 0.81, + "learning_rate": 1.926397953097222e-06, + "loss": 0.4474, + "step": 7045 + }, + { + "epoch": 0.81, + "learning_rate": 1.924214313171211e-06, + "loss": 0.4305, + "step": 7046 + }, + { + "epoch": 0.81, + "learning_rate": 1.9220317798081433e-06, + "loss": 0.456, + "step": 7047 + }, + { + "epoch": 0.81, + "learning_rate": 1.9198503533070688e-06, + "loss": 0.467, + "step": 7048 + }, + { + "epoch": 0.81, + "learning_rate": 1.9176700339668986e-06, + "loss": 0.4586, + "step": 7049 + }, + { + "epoch": 0.81, + "learning_rate": 1.9154908220863775e-06, + "loss": 0.447, + "step": 7050 + }, + { + "epoch": 0.81, + "learning_rate": 1.913312717964113e-06, + "loss": 0.4616, + "step": 7051 + }, + { + "epoch": 0.81, + "learning_rate": 1.9111357218985504e-06, + "loss": 0.4483, + "step": 7052 + }, + { + "epoch": 0.81, + "learning_rate": 1.9089598341879855e-06, + "loss": 0.4495, + "step": 7053 + }, + { + "epoch": 0.81, + "learning_rate": 1.9067850551305678e-06, + "loss": 0.4501, + "step": 7054 + }, + { + "epoch": 0.81, + "learning_rate": 1.9046113850242843e-06, + "loss": 0.4501, + "step": 7055 + }, + { + "epoch": 0.81, + "learning_rate": 1.9024388241669811e-06, + "loss": 0.4504, + "step": 7056 + }, + { + "epoch": 0.81, + "learning_rate": 1.900267372856348e-06, + "loss": 0.4638, + "step": 7057 + }, + { + "epoch": 0.81, + "learning_rate": 1.8980970313899193e-06, + "loss": 0.4497, + "step": 7058 + }, + { + "epoch": 0.81, + "learning_rate": 1.8959278000650839e-06, 
+ "loss": 0.449, + "step": 7059 + }, + { + "epoch": 0.81, + "learning_rate": 1.8937596791790735e-06, + "loss": 0.45, + "step": 7060 + }, + { + "epoch": 0.81, + "learning_rate": 1.8915926690289643e-06, + "loss": 0.4337, + "step": 7061 + }, + { + "epoch": 0.81, + "learning_rate": 1.889426769911693e-06, + "loss": 0.4467, + "step": 7062 + }, + { + "epoch": 0.81, + "learning_rate": 1.887261982124029e-06, + "loss": 0.4389, + "step": 7063 + }, + { + "epoch": 0.81, + "learning_rate": 1.8850983059626026e-06, + "loss": 0.4479, + "step": 7064 + }, + { + "epoch": 0.81, + "learning_rate": 1.8829357417238802e-06, + "loss": 0.4554, + "step": 7065 + }, + { + "epoch": 0.81, + "learning_rate": 1.8807742897041847e-06, + "loss": 0.4657, + "step": 7066 + }, + { + "epoch": 0.81, + "learning_rate": 1.8786139501996847e-06, + "loss": 0.4486, + "step": 7067 + }, + { + "epoch": 0.81, + "learning_rate": 1.8764547235063912e-06, + "loss": 0.4762, + "step": 7068 + }, + { + "epoch": 0.81, + "learning_rate": 1.8742966099201699e-06, + "loss": 0.4658, + "step": 7069 + }, + { + "epoch": 0.81, + "learning_rate": 1.8721396097367294e-06, + "loss": 0.4481, + "step": 7070 + }, + { + "epoch": 0.81, + "learning_rate": 1.8699837232516226e-06, + "loss": 0.4427, + "step": 7071 + }, + { + "epoch": 0.81, + "learning_rate": 1.867828950760262e-06, + "loss": 0.4456, + "step": 7072 + }, + { + "epoch": 0.81, + "learning_rate": 1.8656752925578948e-06, + "loss": 0.4609, + "step": 7073 + }, + { + "epoch": 0.81, + "learning_rate": 1.8635227489396178e-06, + "loss": 0.4578, + "step": 7074 + }, + { + "epoch": 0.81, + "learning_rate": 1.8613713202003813e-06, + "loss": 0.4515, + "step": 7075 + }, + { + "epoch": 0.81, + "learning_rate": 1.8592210066349781e-06, + "loss": 0.4513, + "step": 7076 + }, + { + "epoch": 0.81, + "learning_rate": 1.8570718085380512e-06, + "loss": 0.4564, + "step": 7077 + }, + { + "epoch": 0.81, + "learning_rate": 1.8549237262040876e-06, + "loss": 0.4408, + "step": 7078 + }, + { + "epoch": 0.81, + 
"learning_rate": 1.8527767599274193e-06, + "loss": 0.4488, + "step": 7079 + }, + { + "epoch": 0.81, + "learning_rate": 1.8506309100022334e-06, + "loss": 0.4426, + "step": 7080 + }, + { + "epoch": 0.81, + "learning_rate": 1.8484861767225549e-06, + "loss": 0.4452, + "step": 7081 + }, + { + "epoch": 0.81, + "learning_rate": 1.846342560382265e-06, + "loss": 0.4534, + "step": 7082 + }, + { + "epoch": 0.81, + "learning_rate": 1.8442000612750832e-06, + "loss": 0.4499, + "step": 7083 + }, + { + "epoch": 0.81, + "learning_rate": 1.8420586796945793e-06, + "loss": 0.4449, + "step": 7084 + }, + { + "epoch": 0.81, + "learning_rate": 1.839918415934171e-06, + "loss": 0.4541, + "step": 7085 + }, + { + "epoch": 0.81, + "learning_rate": 1.8377792702871266e-06, + "loss": 0.4681, + "step": 7086 + }, + { + "epoch": 0.81, + "learning_rate": 1.8356412430465498e-06, + "loss": 0.4346, + "step": 7087 + }, + { + "epoch": 0.81, + "learning_rate": 1.8335043345054048e-06, + "loss": 0.4719, + "step": 7088 + }, + { + "epoch": 0.81, + "learning_rate": 1.83136854495649e-06, + "loss": 0.4324, + "step": 7089 + }, + { + "epoch": 0.81, + "learning_rate": 1.829233874692461e-06, + "loss": 0.4662, + "step": 7090 + }, + { + "epoch": 0.81, + "learning_rate": 1.8271003240058127e-06, + "loss": 0.4618, + "step": 7091 + }, + { + "epoch": 0.81, + "learning_rate": 1.8249678931888881e-06, + "loss": 0.4364, + "step": 7092 + }, + { + "epoch": 0.81, + "learning_rate": 1.8228365825338811e-06, + "loss": 0.4453, + "step": 7093 + }, + { + "epoch": 0.81, + "learning_rate": 1.820706392332824e-06, + "loss": 0.4628, + "step": 7094 + }, + { + "epoch": 0.81, + "learning_rate": 1.8185773228776038e-06, + "loss": 0.4462, + "step": 7095 + }, + { + "epoch": 0.81, + "learning_rate": 1.8164493744599531e-06, + "loss": 0.4569, + "step": 7096 + }, + { + "epoch": 0.81, + "learning_rate": 1.814322547371443e-06, + "loss": 0.4359, + "step": 7097 + }, + { + "epoch": 0.81, + "learning_rate": 1.8121968419035007e-06, + "loss": 0.4567, + "step": 
7098 + }, + { + "epoch": 0.81, + "learning_rate": 1.810072258347394e-06, + "loss": 0.4508, + "step": 7099 + }, + { + "epoch": 0.81, + "learning_rate": 1.8079487969942344e-06, + "loss": 0.4589, + "step": 7100 + }, + { + "epoch": 0.81, + "learning_rate": 1.8058264581349893e-06, + "loss": 0.463, + "step": 7101 + }, + { + "epoch": 0.81, + "learning_rate": 1.8037052420604618e-06, + "loss": 0.4545, + "step": 7102 + }, + { + "epoch": 0.81, + "learning_rate": 1.8015851490613079e-06, + "loss": 0.4578, + "step": 7103 + }, + { + "epoch": 0.81, + "learning_rate": 1.799466179428031e-06, + "loss": 0.4468, + "step": 7104 + }, + { + "epoch": 0.81, + "learning_rate": 1.7973483334509701e-06, + "loss": 0.4457, + "step": 7105 + }, + { + "epoch": 0.81, + "learning_rate": 1.795231611420325e-06, + "loss": 0.4449, + "step": 7106 + }, + { + "epoch": 0.81, + "learning_rate": 1.7931160136261272e-06, + "loss": 0.4342, + "step": 7107 + }, + { + "epoch": 0.81, + "learning_rate": 1.7910015403582659e-06, + "loss": 0.4544, + "step": 7108 + }, + { + "epoch": 0.81, + "learning_rate": 1.7888881919064694e-06, + "loss": 0.4629, + "step": 7109 + }, + { + "epoch": 0.81, + "learning_rate": 1.7867759685603115e-06, + "loss": 0.4911, + "step": 7110 + }, + { + "epoch": 0.81, + "learning_rate": 1.7846648706092173e-06, + "loss": 0.444, + "step": 7111 + }, + { + "epoch": 0.81, + "learning_rate": 1.78255489834245e-06, + "loss": 0.4455, + "step": 7112 + }, + { + "epoch": 0.81, + "learning_rate": 1.7804460520491263e-06, + "loss": 0.4514, + "step": 7113 + }, + { + "epoch": 0.81, + "learning_rate": 1.7783383320182069e-06, + "loss": 0.4652, + "step": 7114 + }, + { + "epoch": 0.81, + "learning_rate": 1.776231738538492e-06, + "loss": 0.4435, + "step": 7115 + }, + { + "epoch": 0.81, + "learning_rate": 1.7741262718986363e-06, + "loss": 0.4645, + "step": 7116 + }, + { + "epoch": 0.81, + "learning_rate": 1.7720219323871346e-06, + "loss": 0.4633, + "step": 7117 + }, + { + "epoch": 0.81, + "learning_rate": 
1.7699187202923241e-06, + "loss": 0.437, + "step": 7118 + }, + { + "epoch": 0.81, + "learning_rate": 1.7678166359023973e-06, + "loss": 0.4594, + "step": 7119 + }, + { + "epoch": 0.81, + "learning_rate": 1.7657156795053821e-06, + "loss": 0.4571, + "step": 7120 + }, + { + "epoch": 0.81, + "learning_rate": 1.763615851389161e-06, + "loss": 0.4344, + "step": 7121 + }, + { + "epoch": 0.81, + "learning_rate": 1.7615171518414542e-06, + "loss": 0.4568, + "step": 7122 + }, + { + "epoch": 0.81, + "learning_rate": 1.7594195811498294e-06, + "loss": 0.4552, + "step": 7123 + }, + { + "epoch": 0.81, + "learning_rate": 1.7573231396017064e-06, + "loss": 0.4622, + "step": 7124 + }, + { + "epoch": 0.81, + "learning_rate": 1.755227827484338e-06, + "loss": 0.4444, + "step": 7125 + }, + { + "epoch": 0.81, + "learning_rate": 1.7531336450848335e-06, + "loss": 0.4418, + "step": 7126 + }, + { + "epoch": 0.81, + "learning_rate": 1.7510405926901408e-06, + "loss": 0.4377, + "step": 7127 + }, + { + "epoch": 0.81, + "learning_rate": 1.7489486705870517e-06, + "loss": 0.448, + "step": 7128 + }, + { + "epoch": 0.81, + "learning_rate": 1.7468578790622126e-06, + "loss": 0.4435, + "step": 7129 + }, + { + "epoch": 0.81, + "learning_rate": 1.7447682184021042e-06, + "loss": 0.4557, + "step": 7130 + }, + { + "epoch": 0.81, + "learning_rate": 1.7426796888930553e-06, + "loss": 0.4505, + "step": 7131 + }, + { + "epoch": 0.82, + "learning_rate": 1.7405922908212436e-06, + "loss": 0.4546, + "step": 7132 + }, + { + "epoch": 0.82, + "learning_rate": 1.7385060244726882e-06, + "loss": 0.44, + "step": 7133 + }, + { + "epoch": 0.82, + "learning_rate": 1.736420890133258e-06, + "loss": 0.4504, + "step": 7134 + }, + { + "epoch": 0.82, + "learning_rate": 1.7343368880886603e-06, + "loss": 0.4574, + "step": 7135 + }, + { + "epoch": 0.82, + "learning_rate": 1.7322540186244462e-06, + "loss": 0.4503, + "step": 7136 + }, + { + "epoch": 0.82, + "learning_rate": 1.7301722820260226e-06, + "loss": 0.4587, + "step": 7137 + }, + { + 
"epoch": 0.82, + "learning_rate": 1.7280916785786261e-06, + "loss": 0.4356, + "step": 7138 + }, + { + "epoch": 0.82, + "learning_rate": 1.7260122085673525e-06, + "loss": 0.4409, + "step": 7139 + }, + { + "epoch": 0.82, + "learning_rate": 1.7239338722771326e-06, + "loss": 0.4578, + "step": 7140 + }, + { + "epoch": 0.82, + "learning_rate": 1.721856669992743e-06, + "loss": 0.447, + "step": 7141 + }, + { + "epoch": 0.82, + "learning_rate": 1.7197806019988084e-06, + "loss": 0.4626, + "step": 7142 + }, + { + "epoch": 0.82, + "learning_rate": 1.7177056685797988e-06, + "loss": 0.4323, + "step": 7143 + }, + { + "epoch": 0.82, + "learning_rate": 1.7156318700200236e-06, + "loss": 0.4271, + "step": 7144 + }, + { + "epoch": 0.82, + "learning_rate": 1.713559206603642e-06, + "loss": 0.4705, + "step": 7145 + }, + { + "epoch": 0.82, + "learning_rate": 1.7114876786146505e-06, + "loss": 0.4579, + "step": 7146 + }, + { + "epoch": 0.82, + "learning_rate": 1.7094172863369007e-06, + "loss": 0.4371, + "step": 7147 + }, + { + "epoch": 0.82, + "learning_rate": 1.7073480300540802e-06, + "loss": 0.4559, + "step": 7148 + }, + { + "epoch": 0.82, + "learning_rate": 1.7052799100497197e-06, + "loss": 0.4504, + "step": 7149 + }, + { + "epoch": 0.82, + "learning_rate": 1.703212926607204e-06, + "loss": 0.4783, + "step": 7150 + }, + { + "epoch": 0.82, + "learning_rate": 1.7011470800097496e-06, + "loss": 0.4489, + "step": 7151 + }, + { + "epoch": 0.82, + "learning_rate": 1.6990823705404269e-06, + "loss": 0.4556, + "step": 7152 + }, + { + "epoch": 0.82, + "learning_rate": 1.6970187984821496e-06, + "loss": 0.4528, + "step": 7153 + }, + { + "epoch": 0.82, + "learning_rate": 1.694956364117668e-06, + "loss": 0.4559, + "step": 7154 + }, + { + "epoch": 0.82, + "learning_rate": 1.6928950677295875e-06, + "loss": 0.4405, + "step": 7155 + }, + { + "epoch": 0.82, + "learning_rate": 1.6908349096003484e-06, + "loss": 0.4654, + "step": 7156 + }, + { + "epoch": 0.82, + "learning_rate": 1.6887758900122352e-06, + 
"loss": 0.453, + "step": 7157 + }, + { + "epoch": 0.82, + "learning_rate": 1.6867180092473866e-06, + "loss": 0.495, + "step": 7158 + }, + { + "epoch": 0.82, + "learning_rate": 1.6846612675877716e-06, + "loss": 0.4487, + "step": 7159 + }, + { + "epoch": 0.82, + "learning_rate": 1.6826056653152122e-06, + "loss": 0.46, + "step": 7160 + }, + { + "epoch": 0.82, + "learning_rate": 1.6805512027113745e-06, + "loss": 0.4511, + "step": 7161 + }, + { + "epoch": 0.82, + "learning_rate": 1.6784978800577611e-06, + "loss": 0.4701, + "step": 7162 + }, + { + "epoch": 0.82, + "learning_rate": 1.6764456976357279e-06, + "loss": 0.4382, + "step": 7163 + }, + { + "epoch": 0.82, + "learning_rate": 1.6743946557264656e-06, + "loss": 0.4417, + "step": 7164 + }, + { + "epoch": 0.82, + "learning_rate": 1.672344754611016e-06, + "loss": 0.4622, + "step": 7165 + }, + { + "epoch": 0.82, + "learning_rate": 1.67029599457026e-06, + "loss": 0.4403, + "step": 7166 + }, + { + "epoch": 0.82, + "learning_rate": 1.6682483758849199e-06, + "loss": 0.4704, + "step": 7167 + }, + { + "epoch": 0.82, + "learning_rate": 1.666201898835572e-06, + "loss": 0.4515, + "step": 7168 + }, + { + "epoch": 0.82, + "learning_rate": 1.6641565637026225e-06, + "loss": 0.4407, + "step": 7169 + }, + { + "epoch": 0.82, + "learning_rate": 1.6621123707663312e-06, + "loss": 0.448, + "step": 7170 + }, + { + "epoch": 0.82, + "learning_rate": 1.6600693203068007e-06, + "loss": 0.4588, + "step": 7171 + }, + { + "epoch": 0.82, + "learning_rate": 1.6580274126039698e-06, + "loss": 0.4502, + "step": 7172 + }, + { + "epoch": 0.82, + "learning_rate": 1.6559866479376297e-06, + "loss": 0.4401, + "step": 7173 + }, + { + "epoch": 0.82, + "learning_rate": 1.6539470265874092e-06, + "loss": 0.4351, + "step": 7174 + }, + { + "epoch": 0.82, + "learning_rate": 1.651908548832779e-06, + "loss": 0.4603, + "step": 7175 + }, + { + "epoch": 0.82, + "learning_rate": 1.6498712149530606e-06, + "loss": 0.4821, + "step": 7176 + }, + { + "epoch": 0.82, + 
"learning_rate": 1.64783502522741e-06, + "loss": 0.4347, + "step": 7177 + }, + { + "epoch": 0.82, + "learning_rate": 1.6457999799348345e-06, + "loss": 0.4536, + "step": 7178 + }, + { + "epoch": 0.82, + "learning_rate": 1.6437660793541776e-06, + "loss": 0.4413, + "step": 7179 + }, + { + "epoch": 0.82, + "learning_rate": 1.6417333237641298e-06, + "loss": 0.446, + "step": 7180 + }, + { + "epoch": 0.82, + "learning_rate": 1.6397017134432281e-06, + "loss": 0.455, + "step": 7181 + }, + { + "epoch": 0.82, + "learning_rate": 1.6376712486698443e-06, + "loss": 0.4394, + "step": 7182 + }, + { + "epoch": 0.82, + "learning_rate": 1.635641929722196e-06, + "loss": 0.4533, + "step": 7183 + }, + { + "epoch": 0.82, + "learning_rate": 1.6336137568783495e-06, + "loss": 0.4685, + "step": 7184 + }, + { + "epoch": 0.82, + "learning_rate": 1.6315867304162058e-06, + "loss": 0.4504, + "step": 7185 + }, + { + "epoch": 0.82, + "learning_rate": 1.6295608506135162e-06, + "loss": 0.4686, + "step": 7186 + }, + { + "epoch": 0.82, + "learning_rate": 1.627536117747871e-06, + "loss": 0.4506, + "step": 7187 + }, + { + "epoch": 0.82, + "learning_rate": 1.625512532096699e-06, + "loss": 0.4543, + "step": 7188 + }, + { + "epoch": 0.82, + "learning_rate": 1.623490093937281e-06, + "loss": 0.4635, + "step": 7189 + }, + { + "epoch": 0.82, + "learning_rate": 1.6214688035467363e-06, + "loss": 0.4493, + "step": 7190 + }, + { + "epoch": 0.82, + "learning_rate": 1.6194486612020277e-06, + "loss": 0.4523, + "step": 7191 + }, + { + "epoch": 0.82, + "learning_rate": 1.6174296671799571e-06, + "loss": 0.4677, + "step": 7192 + }, + { + "epoch": 0.82, + "learning_rate": 1.6154118217571723e-06, + "loss": 0.4354, + "step": 7193 + }, + { + "epoch": 0.82, + "learning_rate": 1.6133951252101642e-06, + "loss": 0.4581, + "step": 7194 + }, + { + "epoch": 0.82, + "learning_rate": 1.6113795778152663e-06, + "loss": 0.4396, + "step": 7195 + }, + { + "epoch": 0.82, + "learning_rate": 1.6093651798486487e-06, + "loss": 0.4427, + "step": 
7196 + }, + { + "epoch": 0.82, + "learning_rate": 1.6073519315863351e-06, + "loss": 0.4417, + "step": 7197 + }, + { + "epoch": 0.82, + "learning_rate": 1.6053398333041791e-06, + "loss": 0.4423, + "step": 7198 + }, + { + "epoch": 0.82, + "learning_rate": 1.6033288852778882e-06, + "loss": 0.4448, + "step": 7199 + }, + { + "epoch": 0.82, + "learning_rate": 1.6013190877830065e-06, + "loss": 0.4561, + "step": 7200 + }, + { + "epoch": 0.82, + "learning_rate": 1.5993104410949189e-06, + "loss": 0.4564, + "step": 7201 + }, + { + "epoch": 0.82, + "learning_rate": 1.5973029454888578e-06, + "loss": 0.4518, + "step": 7202 + }, + { + "epoch": 0.82, + "learning_rate": 1.5952966012398908e-06, + "loss": 0.4463, + "step": 7203 + }, + { + "epoch": 0.82, + "learning_rate": 1.5932914086229366e-06, + "loss": 0.4785, + "step": 7204 + }, + { + "epoch": 0.82, + "learning_rate": 1.5912873679127495e-06, + "loss": 0.4446, + "step": 7205 + }, + { + "epoch": 0.82, + "learning_rate": 1.5892844793839235e-06, + "loss": 0.4422, + "step": 7206 + }, + { + "epoch": 0.82, + "learning_rate": 1.5872827433109073e-06, + "loss": 0.4585, + "step": 7207 + }, + { + "epoch": 0.82, + "learning_rate": 1.5852821599679747e-06, + "loss": 0.4655, + "step": 7208 + }, + { + "epoch": 0.82, + "learning_rate": 1.5832827296292564e-06, + "loss": 0.4546, + "step": 7209 + }, + { + "epoch": 0.82, + "learning_rate": 1.5812844525687188e-06, + "loss": 0.4432, + "step": 7210 + }, + { + "epoch": 0.82, + "learning_rate": 1.5792873290601662e-06, + "loss": 0.4541, + "step": 7211 + }, + { + "epoch": 0.82, + "learning_rate": 1.5772913593772543e-06, + "loss": 0.4737, + "step": 7212 + }, + { + "epoch": 0.82, + "learning_rate": 1.575296543793473e-06, + "loss": 0.4297, + "step": 7213 + }, + { + "epoch": 0.82, + "learning_rate": 1.573302882582154e-06, + "loss": 0.4452, + "step": 7214 + }, + { + "epoch": 0.82, + "learning_rate": 1.5713103760164782e-06, + "loss": 0.4724, + "step": 7215 + }, + { + "epoch": 0.82, + "learning_rate": 
1.56931902436946e-06, + "loss": 0.4592, + "step": 7216 + }, + { + "epoch": 0.82, + "learning_rate": 1.5673288279139586e-06, + "loss": 0.4475, + "step": 7217 + }, + { + "epoch": 0.82, + "learning_rate": 1.5653397869226806e-06, + "loss": 0.4772, + "step": 7218 + }, + { + "epoch": 0.83, + "learning_rate": 1.5633519016681631e-06, + "loss": 0.4337, + "step": 7219 + }, + { + "epoch": 0.83, + "learning_rate": 1.561365172422795e-06, + "loss": 0.4538, + "step": 7220 + }, + { + "epoch": 0.83, + "learning_rate": 1.559379599458798e-06, + "loss": 0.4443, + "step": 7221 + }, + { + "epoch": 0.83, + "learning_rate": 1.5573951830482458e-06, + "loss": 0.4493, + "step": 7222 + }, + { + "epoch": 0.83, + "learning_rate": 1.5554119234630438e-06, + "loss": 0.4358, + "step": 7223 + }, + { + "epoch": 0.83, + "learning_rate": 1.553429820974941e-06, + "loss": 0.4569, + "step": 7224 + }, + { + "epoch": 0.83, + "learning_rate": 1.5514488758555357e-06, + "loss": 0.4639, + "step": 7225 + }, + { + "epoch": 0.83, + "learning_rate": 1.5494690883762553e-06, + "loss": 0.4464, + "step": 7226 + }, + { + "epoch": 0.83, + "learning_rate": 1.5474904588083772e-06, + "loss": 0.4525, + "step": 7227 + }, + { + "epoch": 0.83, + "learning_rate": 1.5455129874230212e-06, + "loss": 0.4562, + "step": 7228 + }, + { + "epoch": 0.83, + "learning_rate": 1.5435366744911406e-06, + "loss": 0.4502, + "step": 7229 + }, + { + "epoch": 0.83, + "learning_rate": 1.5415615202835377e-06, + "loss": 0.4496, + "step": 7230 + }, + { + "epoch": 0.83, + "learning_rate": 1.5395875250708513e-06, + "loss": 0.4492, + "step": 7231 + }, + { + "epoch": 0.83, + "learning_rate": 1.53761468912356e-06, + "loss": 0.4481, + "step": 7232 + }, + { + "epoch": 0.83, + "learning_rate": 1.5356430127119915e-06, + "loss": 0.4517, + "step": 7233 + }, + { + "epoch": 0.83, + "learning_rate": 1.5336724961063043e-06, + "loss": 0.4612, + "step": 7234 + }, + { + "epoch": 0.83, + "learning_rate": 1.5317031395765081e-06, + "loss": 0.447, + "step": 7235 + }, + { + 
"epoch": 0.83, + "learning_rate": 1.5297349433924435e-06, + "loss": 0.4387, + "step": 7236 + }, + { + "epoch": 0.83, + "learning_rate": 1.5277679078238018e-06, + "loss": 0.4682, + "step": 7237 + }, + { + "epoch": 0.83, + "learning_rate": 1.5258020331401102e-06, + "loss": 0.4466, + "step": 7238 + }, + { + "epoch": 0.83, + "learning_rate": 1.523837319610737e-06, + "loss": 0.4367, + "step": 7239 + }, + { + "epoch": 0.83, + "learning_rate": 1.5218737675048888e-06, + "loss": 0.452, + "step": 7240 + }, + { + "epoch": 0.83, + "learning_rate": 1.5199113770916207e-06, + "loss": 0.4353, + "step": 7241 + }, + { + "epoch": 0.83, + "learning_rate": 1.5179501486398196e-06, + "loss": 0.4702, + "step": 7242 + }, + { + "epoch": 0.83, + "learning_rate": 1.5159900824182227e-06, + "loss": 0.4602, + "step": 7243 + }, + { + "epoch": 0.83, + "learning_rate": 1.5140311786953986e-06, + "loss": 0.4643, + "step": 7244 + }, + { + "epoch": 0.83, + "learning_rate": 1.5120734377397617e-06, + "loss": 0.4605, + "step": 7245 + }, + { + "epoch": 0.83, + "learning_rate": 1.5101168598195647e-06, + "loss": 0.4429, + "step": 7246 + }, + { + "epoch": 0.83, + "learning_rate": 1.508161445202906e-06, + "loss": 0.4756, + "step": 7247 + }, + { + "epoch": 0.83, + "learning_rate": 1.5062071941577217e-06, + "loss": 0.4581, + "step": 7248 + }, + { + "epoch": 0.83, + "learning_rate": 1.5042541069517846e-06, + "loss": 0.4531, + "step": 7249 + }, + { + "epoch": 0.83, + "learning_rate": 1.5023021838527108e-06, + "loss": 0.4378, + "step": 7250 + }, + { + "epoch": 0.83, + "learning_rate": 1.5003514251279616e-06, + "loss": 0.4601, + "step": 7251 + }, + { + "epoch": 0.83, + "learning_rate": 1.4984018310448312e-06, + "loss": 0.466, + "step": 7252 + }, + { + "epoch": 0.83, + "learning_rate": 1.4964534018704558e-06, + "loss": 0.4375, + "step": 7253 + }, + { + "epoch": 0.83, + "learning_rate": 1.4945061378718184e-06, + "loss": 0.459, + "step": 7254 + }, + { + "epoch": 0.83, + "learning_rate": 1.4925600393157325e-06, + 
"loss": 0.4477, + "step": 7255 + }, + { + "epoch": 0.83, + "learning_rate": 1.4906151064688602e-06, + "loss": 0.4788, + "step": 7256 + }, + { + "epoch": 0.83, + "learning_rate": 1.4886713395977015e-06, + "loss": 0.4355, + "step": 7257 + }, + { + "epoch": 0.83, + "learning_rate": 1.4867287389685936e-06, + "loss": 0.4485, + "step": 7258 + }, + { + "epoch": 0.83, + "learning_rate": 1.4847873048477191e-06, + "loss": 0.4611, + "step": 7259 + }, + { + "epoch": 0.83, + "learning_rate": 1.482847037501094e-06, + "loss": 0.4529, + "step": 7260 + }, + { + "epoch": 0.83, + "learning_rate": 1.4809079371945823e-06, + "loss": 0.4453, + "step": 7261 + }, + { + "epoch": 0.83, + "learning_rate": 1.4789700041938816e-06, + "loss": 0.4605, + "step": 7262 + }, + { + "epoch": 0.83, + "learning_rate": 1.4770332387645293e-06, + "loss": 0.4372, + "step": 7263 + }, + { + "epoch": 0.83, + "learning_rate": 1.475097641171912e-06, + "loss": 0.4592, + "step": 7264 + }, + { + "epoch": 0.83, + "learning_rate": 1.4731632116812434e-06, + "loss": 0.4514, + "step": 7265 + }, + { + "epoch": 0.83, + "learning_rate": 1.4712299505575868e-06, + "loss": 0.4276, + "step": 7266 + }, + { + "epoch": 0.83, + "learning_rate": 1.4692978580658434e-06, + "loss": 0.4622, + "step": 7267 + }, + { + "epoch": 0.83, + "learning_rate": 1.4673669344707498e-06, + "loss": 0.4572, + "step": 7268 + }, + { + "epoch": 0.83, + "learning_rate": 1.4654371800368882e-06, + "loss": 0.4503, + "step": 7269 + }, + { + "epoch": 0.83, + "learning_rate": 1.4635085950286776e-06, + "loss": 0.4591, + "step": 7270 + }, + { + "epoch": 0.83, + "learning_rate": 1.4615811797103751e-06, + "loss": 0.4537, + "step": 7271 + }, + { + "epoch": 0.83, + "learning_rate": 1.459654934346083e-06, + "loss": 0.4507, + "step": 7272 + }, + { + "epoch": 0.83, + "learning_rate": 1.4577298591997357e-06, + "loss": 0.4602, + "step": 7273 + }, + { + "epoch": 0.83, + "learning_rate": 1.4558059545351144e-06, + "loss": 0.4625, + "step": 7274 + }, + { + "epoch": 0.83, + 
"learning_rate": 1.4538832206158381e-06, + "loss": 0.4445, + "step": 7275 + }, + { + "epoch": 0.83, + "learning_rate": 1.4519616577053597e-06, + "loss": 0.4464, + "step": 7276 + }, + { + "epoch": 0.83, + "learning_rate": 1.4500412660669828e-06, + "loss": 0.4605, + "step": 7277 + }, + { + "epoch": 0.83, + "learning_rate": 1.448122045963839e-06, + "loss": 0.4558, + "step": 7278 + }, + { + "epoch": 0.83, + "learning_rate": 1.4462039976589048e-06, + "loss": 0.4509, + "step": 7279 + }, + { + "epoch": 0.83, + "learning_rate": 1.444287121414998e-06, + "loss": 0.4439, + "step": 7280 + }, + { + "epoch": 0.83, + "learning_rate": 1.442371417494769e-06, + "loss": 0.4491, + "step": 7281 + }, + { + "epoch": 0.83, + "learning_rate": 1.4404568861607172e-06, + "loss": 0.4543, + "step": 7282 + }, + { + "epoch": 0.83, + "learning_rate": 1.4385435276751724e-06, + "loss": 0.4388, + "step": 7283 + }, + { + "epoch": 0.83, + "learning_rate": 1.4366313423003087e-06, + "loss": 0.4459, + "step": 7284 + }, + { + "epoch": 0.83, + "learning_rate": 1.4347203302981393e-06, + "loss": 0.4641, + "step": 7285 + }, + { + "epoch": 0.83, + "learning_rate": 1.432810491930514e-06, + "loss": 0.4506, + "step": 7286 + }, + { + "epoch": 0.83, + "learning_rate": 1.4309018274591246e-06, + "loss": 0.4384, + "step": 7287 + }, + { + "epoch": 0.83, + "learning_rate": 1.4289943371455007e-06, + "loss": 0.4767, + "step": 7288 + }, + { + "epoch": 0.83, + "learning_rate": 1.4270880212510086e-06, + "loss": 0.4374, + "step": 7289 + }, + { + "epoch": 0.83, + "learning_rate": 1.4251828800368594e-06, + "loss": 0.4517, + "step": 7290 + }, + { + "epoch": 0.83, + "learning_rate": 1.4232789137640968e-06, + "loss": 0.4359, + "step": 7291 + }, + { + "epoch": 0.83, + "learning_rate": 1.4213761226936095e-06, + "loss": 0.4627, + "step": 7292 + }, + { + "epoch": 0.83, + "learning_rate": 1.4194745070861194e-06, + "loss": 0.4673, + "step": 7293 + }, + { + "epoch": 0.83, + "learning_rate": 1.417574067202192e-06, + "loss": 0.4501, + 
"step": 7294 + }, + { + "epoch": 0.83, + "learning_rate": 1.4156748033022328e-06, + "loss": 0.443, + "step": 7295 + }, + { + "epoch": 0.83, + "learning_rate": 1.413776715646481e-06, + "loss": 0.4604, + "step": 7296 + }, + { + "epoch": 0.83, + "learning_rate": 1.4118798044950132e-06, + "loss": 0.4462, + "step": 7297 + }, + { + "epoch": 0.83, + "learning_rate": 1.409984070107755e-06, + "loss": 0.4305, + "step": 7298 + }, + { + "epoch": 0.83, + "learning_rate": 1.4080895127444594e-06, + "loss": 0.4609, + "step": 7299 + }, + { + "epoch": 0.83, + "learning_rate": 1.4061961326647266e-06, + "loss": 0.4414, + "step": 7300 + }, + { + "epoch": 0.83, + "learning_rate": 1.4043039301279904e-06, + "loss": 0.476, + "step": 7301 + }, + { + "epoch": 0.83, + "learning_rate": 1.402412905393523e-06, + "loss": 0.4646, + "step": 7302 + }, + { + "epoch": 0.83, + "learning_rate": 1.4005230587204388e-06, + "loss": 0.4492, + "step": 7303 + }, + { + "epoch": 0.83, + "learning_rate": 1.398634390367688e-06, + "loss": 0.4339, + "step": 7304 + }, + { + "epoch": 0.83, + "learning_rate": 1.3967469005940638e-06, + "loss": 0.448, + "step": 7305 + }, + { + "epoch": 0.83, + "learning_rate": 1.3948605896581923e-06, + "loss": 0.4402, + "step": 7306 + }, + { + "epoch": 0.84, + "learning_rate": 1.3929754578185373e-06, + "loss": 0.4364, + "step": 7307 + }, + { + "epoch": 0.84, + "learning_rate": 1.3910915053334094e-06, + "loss": 0.463, + "step": 7308 + }, + { + "epoch": 0.84, + "learning_rate": 1.3892087324609482e-06, + "loss": 0.4454, + "step": 7309 + }, + { + "epoch": 0.84, + "learning_rate": 1.3873271394591348e-06, + "loss": 0.4623, + "step": 7310 + }, + { + "epoch": 0.84, + "learning_rate": 1.385446726585794e-06, + "loss": 0.4608, + "step": 7311 + }, + { + "epoch": 0.84, + "learning_rate": 1.3835674940985788e-06, + "loss": 0.4374, + "step": 7312 + }, + { + "epoch": 0.84, + "learning_rate": 1.3816894422549888e-06, + "loss": 0.4594, + "step": 7313 + }, + { + "epoch": 0.84, + "learning_rate": 
1.379812571312361e-06, + "loss": 0.4467, + "step": 7314 + }, + { + "epoch": 0.84, + "learning_rate": 1.3779368815278648e-06, + "loss": 0.4594, + "step": 7315 + }, + { + "epoch": 0.84, + "learning_rate": 1.3760623731585165e-06, + "loss": 0.4557, + "step": 7316 + }, + { + "epoch": 0.84, + "learning_rate": 1.3741890464611597e-06, + "loss": 0.446, + "step": 7317 + }, + { + "epoch": 0.84, + "learning_rate": 1.3723169016924865e-06, + "loss": 0.4579, + "step": 7318 + }, + { + "epoch": 0.84, + "learning_rate": 1.370445939109022e-06, + "loss": 0.4598, + "step": 7319 + }, + { + "epoch": 0.84, + "learning_rate": 1.3685761589671253e-06, + "loss": 0.4376, + "step": 7320 + }, + { + "epoch": 0.84, + "learning_rate": 1.366707561523004e-06, + "loss": 0.4252, + "step": 7321 + }, + { + "epoch": 0.84, + "learning_rate": 1.3648401470326932e-06, + "loss": 0.4451, + "step": 7322 + }, + { + "epoch": 0.84, + "learning_rate": 1.3629739157520728e-06, + "loss": 0.4647, + "step": 7323 + }, + { + "epoch": 0.84, + "learning_rate": 1.361108867936859e-06, + "loss": 0.4317, + "step": 7324 + }, + { + "epoch": 0.84, + "learning_rate": 1.359245003842602e-06, + "loss": 0.4451, + "step": 7325 + }, + { + "epoch": 0.84, + "learning_rate": 1.3573823237246965e-06, + "loss": 0.465, + "step": 7326 + }, + { + "epoch": 0.84, + "learning_rate": 1.3555208278383691e-06, + "loss": 0.4469, + "step": 7327 + }, + { + "epoch": 0.84, + "learning_rate": 1.353660516438684e-06, + "loss": 0.4517, + "step": 7328 + }, + { + "epoch": 0.84, + "learning_rate": 1.3518013897805504e-06, + "loss": 0.4704, + "step": 7329 + }, + { + "epoch": 0.84, + "learning_rate": 1.3499434481187045e-06, + "loss": 0.4471, + "step": 7330 + }, + { + "epoch": 0.84, + "learning_rate": 1.3480866917077294e-06, + "loss": 0.4489, + "step": 7331 + }, + { + "epoch": 0.84, + "learning_rate": 1.346231120802044e-06, + "loss": 0.4499, + "step": 7332 + }, + { + "epoch": 0.84, + "learning_rate": 1.3443767356558989e-06, + "loss": 0.4474, + "step": 7333 + }, + { + 
"epoch": 0.84, + "learning_rate": 1.3425235365233892e-06, + "loss": 0.4623, + "step": 7334 + }, + { + "epoch": 0.84, + "learning_rate": 1.3406715236584433e-06, + "loss": 0.4493, + "step": 7335 + }, + { + "epoch": 0.84, + "learning_rate": 1.3388206973148265e-06, + "loss": 0.4323, + "step": 7336 + }, + { + "epoch": 0.84, + "learning_rate": 1.336971057746147e-06, + "loss": 0.4577, + "step": 7337 + }, + { + "epoch": 0.84, + "learning_rate": 1.335122605205843e-06, + "loss": 0.436, + "step": 7338 + }, + { + "epoch": 0.84, + "learning_rate": 1.3332753399471976e-06, + "loss": 0.4512, + "step": 7339 + }, + { + "epoch": 0.84, + "learning_rate": 1.3314292622233227e-06, + "loss": 0.4495, + "step": 7340 + }, + { + "epoch": 0.84, + "learning_rate": 1.329584372287176e-06, + "loss": 0.4417, + "step": 7341 + }, + { + "epoch": 0.84, + "learning_rate": 1.3277406703915485e-06, + "loss": 0.4506, + "step": 7342 + }, + { + "epoch": 0.84, + "learning_rate": 1.325898156789066e-06, + "loss": 0.4717, + "step": 7343 + }, + { + "epoch": 0.84, + "learning_rate": 1.3240568317321966e-06, + "loss": 0.4456, + "step": 7344 + }, + { + "epoch": 0.84, + "learning_rate": 1.322216695473243e-06, + "loss": 0.4531, + "step": 7345 + }, + { + "epoch": 0.84, + "learning_rate": 1.320377748264341e-06, + "loss": 0.4392, + "step": 7346 + }, + { + "epoch": 0.84, + "learning_rate": 1.3185399903574724e-06, + "loss": 0.4464, + "step": 7347 + }, + { + "epoch": 0.84, + "learning_rate": 1.3167034220044494e-06, + "loss": 0.4205, + "step": 7348 + }, + { + "epoch": 0.84, + "learning_rate": 1.3148680434569206e-06, + "loss": 0.4495, + "step": 7349 + }, + { + "epoch": 0.84, + "learning_rate": 1.3130338549663745e-06, + "loss": 0.4453, + "step": 7350 + }, + { + "epoch": 0.84, + "learning_rate": 1.3112008567841371e-06, + "loss": 0.4435, + "step": 7351 + }, + { + "epoch": 0.84, + "learning_rate": 1.309369049161372e-06, + "loss": 0.4557, + "step": 7352 + }, + { + "epoch": 0.84, + "learning_rate": 1.3075384323490759e-06, + "loss": 
0.4497, + "step": 7353 + }, + { + "epoch": 0.84, + "learning_rate": 1.3057090065980816e-06, + "loss": 0.4427, + "step": 7354 + }, + { + "epoch": 0.84, + "learning_rate": 1.3038807721590663e-06, + "loss": 0.4577, + "step": 7355 + }, + { + "epoch": 0.84, + "learning_rate": 1.302053729282533e-06, + "loss": 0.4432, + "step": 7356 + }, + { + "epoch": 0.84, + "learning_rate": 1.3002278782188337e-06, + "loss": 0.4521, + "step": 7357 + }, + { + "epoch": 0.84, + "learning_rate": 1.2984032192181473e-06, + "loss": 0.4507, + "step": 7358 + }, + { + "epoch": 0.84, + "learning_rate": 1.2965797525304913e-06, + "loss": 0.447, + "step": 7359 + }, + { + "epoch": 0.84, + "learning_rate": 1.2947574784057237e-06, + "loss": 0.4633, + "step": 7360 + }, + { + "epoch": 0.84, + "learning_rate": 1.2929363970935371e-06, + "loss": 0.4423, + "step": 7361 + }, + { + "epoch": 0.84, + "learning_rate": 1.2911165088434584e-06, + "loss": 0.4446, + "step": 7362 + }, + { + "epoch": 0.84, + "learning_rate": 1.2892978139048562e-06, + "loss": 0.459, + "step": 7363 + }, + { + "epoch": 0.84, + "learning_rate": 1.2874803125269274e-06, + "loss": 0.4648, + "step": 7364 + }, + { + "epoch": 0.84, + "learning_rate": 1.2856640049587154e-06, + "loss": 0.4511, + "step": 7365 + }, + { + "epoch": 0.84, + "learning_rate": 1.283848891449092e-06, + "loss": 0.4607, + "step": 7366 + }, + { + "epoch": 0.84, + "learning_rate": 1.2820349722467663e-06, + "loss": 0.4432, + "step": 7367 + }, + { + "epoch": 0.84, + "learning_rate": 1.2802222476002911e-06, + "loss": 0.4673, + "step": 7368 + }, + { + "epoch": 0.84, + "learning_rate": 1.278410717758045e-06, + "loss": 0.449, + "step": 7369 + }, + { + "epoch": 0.84, + "learning_rate": 1.2766003829682504e-06, + "loss": 0.4386, + "step": 7370 + }, + { + "epoch": 0.84, + "learning_rate": 1.2747912434789655e-06, + "loss": 0.4515, + "step": 7371 + }, + { + "epoch": 0.84, + "learning_rate": 1.2729832995380775e-06, + "loss": 0.4452, + "step": 7372 + }, + { + "epoch": 0.84, + "learning_rate": 
1.2711765513933216e-06, + "loss": 0.4626, + "step": 7373 + }, + { + "epoch": 0.84, + "learning_rate": 1.2693709992922575e-06, + "loss": 0.4452, + "step": 7374 + }, + { + "epoch": 0.84, + "learning_rate": 1.2675666434822887e-06, + "loss": 0.443, + "step": 7375 + }, + { + "epoch": 0.84, + "learning_rate": 1.2657634842106526e-06, + "loss": 0.4632, + "step": 7376 + }, + { + "epoch": 0.84, + "learning_rate": 1.2639615217244194e-06, + "loss": 0.459, + "step": 7377 + }, + { + "epoch": 0.84, + "learning_rate": 1.2621607562705018e-06, + "loss": 0.4533, + "step": 7378 + }, + { + "epoch": 0.84, + "learning_rate": 1.26036118809564e-06, + "loss": 0.4352, + "step": 7379 + }, + { + "epoch": 0.84, + "learning_rate": 1.2585628174464192e-06, + "loss": 0.4407, + "step": 7380 + }, + { + "epoch": 0.84, + "learning_rate": 1.2567656445692566e-06, + "loss": 0.4571, + "step": 7381 + }, + { + "epoch": 0.84, + "learning_rate": 1.254969669710402e-06, + "loss": 0.4504, + "step": 7382 + }, + { + "epoch": 0.84, + "learning_rate": 1.2531748931159472e-06, + "loss": 0.4709, + "step": 7383 + }, + { + "epoch": 0.84, + "learning_rate": 1.2513813150318155e-06, + "loss": 0.4495, + "step": 7384 + }, + { + "epoch": 0.84, + "learning_rate": 1.249588935703765e-06, + "loss": 0.4497, + "step": 7385 + }, + { + "epoch": 0.84, + "learning_rate": 1.2477977553773957e-06, + "loss": 0.4771, + "step": 7386 + }, + { + "epoch": 0.84, + "learning_rate": 1.2460077742981347e-06, + "loss": 0.4575, + "step": 7387 + }, + { + "epoch": 0.84, + "learning_rate": 1.2442189927112514e-06, + "loss": 0.4703, + "step": 7388 + }, + { + "epoch": 0.84, + "learning_rate": 1.2424314108618507e-06, + "loss": 0.4338, + "step": 7389 + }, + { + "epoch": 0.84, + "learning_rate": 1.240645028994869e-06, + "loss": 0.44, + "step": 7390 + }, + { + "epoch": 0.84, + "learning_rate": 1.2388598473550828e-06, + "loss": 0.4633, + "step": 7391 + }, + { + "epoch": 0.84, + "learning_rate": 1.2370758661870997e-06, + "loss": 0.4367, + "step": 7392 + }, + { + 
"epoch": 0.84, + "learning_rate": 1.235293085735364e-06, + "loss": 0.4532, + "step": 7393 + }, + { + "epoch": 0.85, + "learning_rate": 1.2335115062441593e-06, + "loss": 0.4382, + "step": 7394 + }, + { + "epoch": 0.85, + "learning_rate": 1.2317311279575982e-06, + "loss": 0.4713, + "step": 7395 + }, + { + "epoch": 0.85, + "learning_rate": 1.2299519511196368e-06, + "loss": 0.4241, + "step": 7396 + }, + { + "epoch": 0.85, + "learning_rate": 1.2281739759740575e-06, + "loss": 0.4588, + "step": 7397 + }, + { + "epoch": 0.85, + "learning_rate": 1.2263972027644854e-06, + "loss": 0.4597, + "step": 7398 + }, + { + "epoch": 0.85, + "learning_rate": 1.2246216317343796e-06, + "loss": 0.4638, + "step": 7399 + }, + { + "epoch": 0.85, + "learning_rate": 1.2228472631270272e-06, + "loss": 0.4634, + "step": 7400 + }, + { + "epoch": 0.85, + "learning_rate": 1.221074097185564e-06, + "loss": 0.4513, + "step": 7401 + }, + { + "epoch": 0.85, + "learning_rate": 1.2193021341529477e-06, + "loss": 0.4579, + "step": 7402 + }, + { + "epoch": 0.85, + "learning_rate": 1.2175313742719775e-06, + "loss": 0.4613, + "step": 7403 + }, + { + "epoch": 0.85, + "learning_rate": 1.2157618177852893e-06, + "loss": 0.466, + "step": 7404 + }, + { + "epoch": 0.85, + "learning_rate": 1.2139934649353503e-06, + "loss": 0.4523, + "step": 7405 + }, + { + "epoch": 0.85, + "learning_rate": 1.212226315964462e-06, + "loss": 0.4477, + "step": 7406 + }, + { + "epoch": 0.85, + "learning_rate": 1.2104603711147666e-06, + "loss": 0.4393, + "step": 7407 + }, + { + "epoch": 0.85, + "learning_rate": 1.2086956306282371e-06, + "loss": 0.4522, + "step": 7408 + }, + { + "epoch": 0.85, + "learning_rate": 1.2069320947466845e-06, + "loss": 0.4699, + "step": 7409 + }, + { + "epoch": 0.85, + "learning_rate": 1.20516976371175e-06, + "loss": 0.4423, + "step": 7410 + }, + { + "epoch": 0.85, + "learning_rate": 1.2034086377649102e-06, + "loss": 0.4703, + "step": 7411 + }, + { + "epoch": 0.85, + "learning_rate": 1.2016487171474844e-06, + "loss": 
0.4311, + "step": 7412 + }, + { + "epoch": 0.85, + "learning_rate": 1.1998900021006155e-06, + "loss": 0.4456, + "step": 7413 + }, + { + "epoch": 0.85, + "learning_rate": 1.1981324928652905e-06, + "loss": 0.4382, + "step": 7414 + }, + { + "epoch": 0.85, + "learning_rate": 1.1963761896823255e-06, + "loss": 0.4528, + "step": 7415 + }, + { + "epoch": 0.85, + "learning_rate": 1.1946210927923729e-06, + "loss": 0.4467, + "step": 7416 + }, + { + "epoch": 0.85, + "learning_rate": 1.1928672024359211e-06, + "loss": 0.448, + "step": 7417 + }, + { + "epoch": 0.85, + "learning_rate": 1.1911145188532936e-06, + "loss": 0.4427, + "step": 7418 + }, + { + "epoch": 0.85, + "learning_rate": 1.1893630422846437e-06, + "loss": 0.4495, + "step": 7419 + }, + { + "epoch": 0.85, + "learning_rate": 1.187612772969966e-06, + "loss": 0.4557, + "step": 7420 + }, + { + "epoch": 0.85, + "learning_rate": 1.1858637111490845e-06, + "loss": 0.4693, + "step": 7421 + }, + { + "epoch": 0.85, + "learning_rate": 1.1841158570616617e-06, + "loss": 0.453, + "step": 7422 + }, + { + "epoch": 0.85, + "learning_rate": 1.1823692109471919e-06, + "loss": 0.4517, + "step": 7423 + }, + { + "epoch": 0.85, + "learning_rate": 1.1806237730450009e-06, + "loss": 0.4596, + "step": 7424 + }, + { + "epoch": 0.85, + "learning_rate": 1.1788795435942591e-06, + "loss": 0.4496, + "step": 7425 + }, + { + "epoch": 0.85, + "learning_rate": 1.1771365228339593e-06, + "loss": 0.4406, + "step": 7426 + }, + { + "epoch": 0.85, + "learning_rate": 1.1753947110029373e-06, + "loss": 0.4555, + "step": 7427 + }, + { + "epoch": 0.85, + "learning_rate": 1.17365410833986e-06, + "loss": 0.4693, + "step": 7428 + }, + { + "epoch": 0.85, + "learning_rate": 1.1719147150832278e-06, + "loss": 0.4451, + "step": 7429 + }, + { + "epoch": 0.85, + "learning_rate": 1.1701765314713786e-06, + "loss": 0.4423, + "step": 7430 + }, + { + "epoch": 0.85, + "learning_rate": 1.16843955774248e-06, + "loss": 0.4563, + "step": 7431 + }, + { + "epoch": 0.85, + "learning_rate": 
1.1667037941345361e-06, + "loss": 0.4354, + "step": 7432 + }, + { + "epoch": 0.85, + "learning_rate": 1.1649692408853875e-06, + "loss": 0.4411, + "step": 7433 + }, + { + "epoch": 0.85, + "learning_rate": 1.163235898232703e-06, + "loss": 0.4415, + "step": 7434 + }, + { + "epoch": 0.85, + "learning_rate": 1.1615037664139928e-06, + "loss": 0.4729, + "step": 7435 + }, + { + "epoch": 0.85, + "learning_rate": 1.1597728456665958e-06, + "loss": 0.4422, + "step": 7436 + }, + { + "epoch": 0.85, + "learning_rate": 1.1580431362276866e-06, + "loss": 0.47, + "step": 7437 + }, + { + "epoch": 0.85, + "learning_rate": 1.156314638334277e-06, + "loss": 0.4466, + "step": 7438 + }, + { + "epoch": 0.85, + "learning_rate": 1.1545873522232055e-06, + "loss": 0.4476, + "step": 7439 + }, + { + "epoch": 0.85, + "learning_rate": 1.1528612781311532e-06, + "loss": 0.4512, + "step": 7440 + }, + { + "epoch": 0.85, + "learning_rate": 1.1511364162946282e-06, + "loss": 0.4626, + "step": 7441 + }, + { + "epoch": 0.85, + "learning_rate": 1.1494127669499732e-06, + "loss": 0.4321, + "step": 7442 + }, + { + "epoch": 0.85, + "learning_rate": 1.147690330333371e-06, + "loss": 0.4471, + "step": 7443 + }, + { + "epoch": 0.85, + "learning_rate": 1.14596910668083e-06, + "loss": 0.463, + "step": 7444 + }, + { + "epoch": 0.85, + "learning_rate": 1.1442490962281983e-06, + "loss": 0.449, + "step": 7445 + }, + { + "epoch": 0.85, + "learning_rate": 1.1425302992111564e-06, + "loss": 0.4584, + "step": 7446 + }, + { + "epoch": 0.85, + "learning_rate": 1.140812715865215e-06, + "loss": 0.4398, + "step": 7447 + }, + { + "epoch": 0.85, + "learning_rate": 1.1390963464257254e-06, + "loss": 0.4492, + "step": 7448 + }, + { + "epoch": 0.85, + "learning_rate": 1.1373811911278666e-06, + "loss": 0.4535, + "step": 7449 + }, + { + "epoch": 0.85, + "learning_rate": 1.1356672502066512e-06, + "loss": 0.4298, + "step": 7450 + }, + { + "epoch": 0.85, + "learning_rate": 1.1339545238969308e-06, + "loss": 0.4531, + "step": 7451 + }, + { + 
"epoch": 0.85, + "learning_rate": 1.1322430124333839e-06, + "loss": 0.4477, + "step": 7452 + }, + { + "epoch": 0.85, + "learning_rate": 1.1305327160505286e-06, + "loss": 0.4755, + "step": 7453 + }, + { + "epoch": 0.85, + "learning_rate": 1.1288236349827108e-06, + "loss": 0.4572, + "step": 7454 + }, + { + "epoch": 0.85, + "learning_rate": 1.1271157694641144e-06, + "loss": 0.433, + "step": 7455 + }, + { + "epoch": 0.85, + "learning_rate": 1.1254091197287564e-06, + "loss": 0.4565, + "step": 7456 + }, + { + "epoch": 0.85, + "learning_rate": 1.1237036860104833e-06, + "loss": 0.4589, + "step": 7457 + }, + { + "epoch": 0.85, + "learning_rate": 1.1219994685429814e-06, + "loss": 0.4504, + "step": 7458 + }, + { + "epoch": 0.85, + "learning_rate": 1.1202964675597627e-06, + "loss": 0.4553, + "step": 7459 + }, + { + "epoch": 0.85, + "learning_rate": 1.1185946832941774e-06, + "loss": 0.4452, + "step": 7460 + }, + { + "epoch": 0.85, + "learning_rate": 1.116894115979409e-06, + "loss": 0.4826, + "step": 7461 + }, + { + "epoch": 0.85, + "learning_rate": 1.115194765848473e-06, + "loss": 0.4367, + "step": 7462 + }, + { + "epoch": 0.85, + "learning_rate": 1.1134966331342157e-06, + "loss": 0.4409, + "step": 7463 + }, + { + "epoch": 0.85, + "learning_rate": 1.1117997180693207e-06, + "loss": 0.474, + "step": 7464 + }, + { + "epoch": 0.85, + "learning_rate": 1.1101040208863035e-06, + "loss": 0.4283, + "step": 7465 + }, + { + "epoch": 0.85, + "learning_rate": 1.1084095418175156e-06, + "loss": 0.4431, + "step": 7466 + }, + { + "epoch": 0.85, + "learning_rate": 1.106716281095136e-06, + "loss": 0.4705, + "step": 7467 + }, + { + "epoch": 0.85, + "learning_rate": 1.1050242389511757e-06, + "loss": 0.4547, + "step": 7468 + }, + { + "epoch": 0.85, + "learning_rate": 1.103333415617488e-06, + "loss": 0.4496, + "step": 7469 + }, + { + "epoch": 0.85, + "learning_rate": 1.1016438113257487e-06, + "loss": 0.4439, + "step": 7470 + }, + { + "epoch": 0.85, + "learning_rate": 1.0999554263074752e-06, + "loss": 
0.4491, + "step": 7471 + }, + { + "epoch": 0.85, + "learning_rate": 1.0982682607940131e-06, + "loss": 0.4593, + "step": 7472 + }, + { + "epoch": 0.85, + "learning_rate": 1.0965823150165378e-06, + "loss": 0.4651, + "step": 7473 + }, + { + "epoch": 0.85, + "learning_rate": 1.0948975892060655e-06, + "loss": 0.4513, + "step": 7474 + }, + { + "epoch": 0.85, + "learning_rate": 1.0932140835934414e-06, + "loss": 0.4345, + "step": 7475 + }, + { + "epoch": 0.85, + "learning_rate": 1.091531798409341e-06, + "loss": 0.4719, + "step": 7476 + }, + { + "epoch": 0.85, + "learning_rate": 1.0898507338842779e-06, + "loss": 0.4596, + "step": 7477 + }, + { + "epoch": 0.85, + "learning_rate": 1.088170890248591e-06, + "loss": 0.4358, + "step": 7478 + }, + { + "epoch": 0.85, + "learning_rate": 1.086492267732462e-06, + "loss": 0.452, + "step": 7479 + }, + { + "epoch": 0.85, + "learning_rate": 1.0848148665658975e-06, + "loss": 0.4601, + "step": 7480 + }, + { + "epoch": 0.85, + "learning_rate": 1.0831386869787353e-06, + "loss": 0.4573, + "step": 7481 + }, + { + "epoch": 0.86, + "learning_rate": 1.0814637292006536e-06, + "loss": 0.4399, + "step": 7482 + }, + { + "epoch": 0.86, + "learning_rate": 1.0797899934611567e-06, + "loss": 0.4623, + "step": 7483 + }, + { + "epoch": 0.86, + "learning_rate": 1.0781174799895844e-06, + "loss": 0.4499, + "step": 7484 + }, + { + "epoch": 0.86, + "learning_rate": 1.0764461890151112e-06, + "loss": 0.4421, + "step": 7485 + }, + { + "epoch": 0.86, + "learning_rate": 1.0747761207667372e-06, + "loss": 0.4446, + "step": 7486 + }, + { + "epoch": 0.86, + "learning_rate": 1.0731072754733019e-06, + "loss": 0.4585, + "step": 7487 + }, + { + "epoch": 0.86, + "learning_rate": 1.071439653363473e-06, + "loss": 0.4381, + "step": 7488 + }, + { + "epoch": 0.86, + "learning_rate": 1.0697732546657512e-06, + "loss": 0.4405, + "step": 7489 + }, + { + "epoch": 0.86, + "learning_rate": 1.068108079608473e-06, + "loss": 0.4639, + "step": 7490 + }, + { + "epoch": 0.86, + "learning_rate": 
1.0664441284198002e-06, + "loss": 0.4362, + "step": 7491 + }, + { + "epoch": 0.86, + "learning_rate": 1.0647814013277358e-06, + "loss": 0.4415, + "step": 7492 + }, + { + "epoch": 0.86, + "learning_rate": 1.0631198985601077e-06, + "loss": 0.4541, + "step": 7493 + }, + { + "epoch": 0.86, + "learning_rate": 1.0614596203445793e-06, + "loss": 0.4463, + "step": 7494 + }, + { + "epoch": 0.86, + "learning_rate": 1.0598005669086475e-06, + "loss": 0.4557, + "step": 7495 + }, + { + "epoch": 0.86, + "learning_rate": 1.0581427384796372e-06, + "loss": 0.4597, + "step": 7496 + }, + { + "epoch": 0.86, + "learning_rate": 1.056486135284711e-06, + "loss": 0.4463, + "step": 7497 + }, + { + "epoch": 0.86, + "learning_rate": 1.0548307575508587e-06, + "loss": 0.4581, + "step": 7498 + }, + { + "epoch": 0.86, + "learning_rate": 1.053176605504902e-06, + "loss": 0.439, + "step": 7499 + }, + { + "epoch": 0.86, + "learning_rate": 1.0515236793735007e-06, + "loss": 0.4394, + "step": 7500 + }, + { + "epoch": 0.86, + "learning_rate": 1.049871979383138e-06, + "loss": 0.4437, + "step": 7501 + }, + { + "epoch": 0.86, + "learning_rate": 1.0482215057601364e-06, + "loss": 0.4593, + "step": 7502 + }, + { + "epoch": 0.86, + "learning_rate": 1.0465722587306494e-06, + "loss": 0.4444, + "step": 7503 + }, + { + "epoch": 0.86, + "learning_rate": 1.044924238520657e-06, + "loss": 0.4475, + "step": 7504 + }, + { + "epoch": 0.86, + "learning_rate": 1.043277445355978e-06, + "loss": 0.4565, + "step": 7505 + }, + { + "epoch": 0.86, + "learning_rate": 1.0416318794622594e-06, + "loss": 0.4562, + "step": 7506 + }, + { + "epoch": 0.86, + "learning_rate": 1.0399875410649763e-06, + "loss": 0.441, + "step": 7507 + }, + { + "epoch": 0.86, + "learning_rate": 1.0383444303894453e-06, + "loss": 0.4679, + "step": 7508 + }, + { + "epoch": 0.86, + "learning_rate": 1.0367025476608038e-06, + "loss": 0.4631, + "step": 7509 + }, + { + "epoch": 0.86, + "learning_rate": 1.0350618931040324e-06, + "loss": 0.4433, + "step": 7510 + }, + { + 
"epoch": 0.86, + "learning_rate": 1.033422466943933e-06, + "loss": 0.4734, + "step": 7511 + }, + { + "epoch": 0.86, + "learning_rate": 1.031784269405144e-06, + "loss": 0.4519, + "step": 7512 + }, + { + "epoch": 0.86, + "learning_rate": 1.0301473007121376e-06, + "loss": 0.4499, + "step": 7513 + }, + { + "epoch": 0.86, + "learning_rate": 1.0285115610892138e-06, + "loss": 0.4637, + "step": 7514 + }, + { + "epoch": 0.86, + "learning_rate": 1.026877050760503e-06, + "loss": 0.4271, + "step": 7515 + }, + { + "epoch": 0.86, + "learning_rate": 1.025243769949974e-06, + "loss": 0.4543, + "step": 7516 + }, + { + "epoch": 0.86, + "learning_rate": 1.0236117188814187e-06, + "loss": 0.4584, + "step": 7517 + }, + { + "epoch": 0.86, + "learning_rate": 1.0219808977784673e-06, + "loss": 0.4422, + "step": 7518 + }, + { + "epoch": 0.86, + "learning_rate": 1.0203513068645788e-06, + "loss": 0.4595, + "step": 7519 + }, + { + "epoch": 0.86, + "learning_rate": 1.01872294636304e-06, + "loss": 0.4512, + "step": 7520 + }, + { + "epoch": 0.86, + "learning_rate": 1.0170958164969746e-06, + "loss": 0.438, + "step": 7521 + }, + { + "epoch": 0.86, + "learning_rate": 1.0154699174893367e-06, + "loss": 0.4515, + "step": 7522 + }, + { + "epoch": 0.86, + "learning_rate": 1.0138452495629125e-06, + "loss": 0.4465, + "step": 7523 + }, + { + "epoch": 0.86, + "learning_rate": 1.012221812940315e-06, + "loss": 0.4385, + "step": 7524 + }, + { + "epoch": 0.86, + "learning_rate": 1.0105996078439894e-06, + "loss": 0.4496, + "step": 7525 + }, + { + "epoch": 0.86, + "learning_rate": 1.0089786344962194e-06, + "loss": 0.4472, + "step": 7526 + }, + { + "epoch": 0.86, + "learning_rate": 1.0073588931191104e-06, + "loss": 0.4485, + "step": 7527 + }, + { + "epoch": 0.86, + "learning_rate": 1.0057403839346037e-06, + "loss": 0.4544, + "step": 7528 + }, + { + "epoch": 0.86, + "learning_rate": 1.004123107164472e-06, + "loss": 0.4726, + "step": 7529 + }, + { + "epoch": 0.86, + "learning_rate": 1.0025070630303168e-06, + "loss": 
0.4521, + "step": 7530 + }, + { + "epoch": 0.86, + "learning_rate": 1.0008922517535747e-06, + "loss": 0.4609, + "step": 7531 + }, + { + "epoch": 0.86, + "learning_rate": 9.992786735555104e-07, + "loss": 0.4313, + "step": 7532 + }, + { + "epoch": 0.86, + "learning_rate": 9.976663286572176e-07, + "loss": 0.4458, + "step": 7533 + }, + { + "epoch": 0.86, + "learning_rate": 9.960552172796278e-07, + "loss": 0.4657, + "step": 7534 + }, + { + "epoch": 0.86, + "learning_rate": 9.94445339643495e-07, + "loss": 0.435, + "step": 7535 + }, + { + "epoch": 0.86, + "learning_rate": 9.928366959694113e-07, + "loss": 0.4605, + "step": 7536 + }, + { + "epoch": 0.86, + "learning_rate": 9.912292864777961e-07, + "loss": 0.4526, + "step": 7537 + }, + { + "epoch": 0.86, + "learning_rate": 9.896231113888988e-07, + "loss": 0.468, + "step": 7538 + }, + { + "epoch": 0.86, + "learning_rate": 9.880181709228032e-07, + "loss": 0.4484, + "step": 7539 + }, + { + "epoch": 0.86, + "learning_rate": 9.8641446529942e-07, + "loss": 0.4639, + "step": 7540 + }, + { + "epoch": 0.86, + "learning_rate": 9.848119947384937e-07, + "loss": 0.4492, + "step": 7541 + }, + { + "epoch": 0.86, + "learning_rate": 9.832107594596008e-07, + "loss": 0.4683, + "step": 7542 + }, + { + "epoch": 0.86, + "learning_rate": 9.81610759682141e-07, + "loss": 0.4317, + "step": 7543 + }, + { + "epoch": 0.86, + "learning_rate": 9.800119956253574e-07, + "loss": 0.4468, + "step": 7544 + }, + { + "epoch": 0.86, + "learning_rate": 9.784144675083107e-07, + "loss": 0.4447, + "step": 7545 + }, + { + "epoch": 0.86, + "learning_rate": 9.768181755498973e-07, + "loss": 0.4756, + "step": 7546 + }, + { + "epoch": 0.86, + "learning_rate": 9.7522311996885e-07, + "loss": 0.4309, + "step": 7547 + }, + { + "epoch": 0.86, + "learning_rate": 9.736293009837206e-07, + "loss": 0.446, + "step": 7548 + }, + { + "epoch": 0.86, + "learning_rate": 9.720367188129043e-07, + "loss": 0.452, + "step": 7549 + }, + { + "epoch": 0.86, + "learning_rate": 
9.704453736746156e-07, + "loss": 0.4351, + "step": 7550 + }, + { + "epoch": 0.86, + "learning_rate": 9.688552657869055e-07, + "loss": 0.4406, + "step": 7551 + }, + { + "epoch": 0.86, + "learning_rate": 9.672663953676563e-07, + "loss": 0.467, + "step": 7552 + }, + { + "epoch": 0.86, + "learning_rate": 9.656787626345765e-07, + "loss": 0.4444, + "step": 7553 + }, + { + "epoch": 0.86, + "learning_rate": 9.640923678052094e-07, + "loss": 0.4607, + "step": 7554 + }, + { + "epoch": 0.86, + "learning_rate": 9.625072110969246e-07, + "loss": 0.4649, + "step": 7555 + }, + { + "epoch": 0.86, + "learning_rate": 9.60923292726923e-07, + "loss": 0.4632, + "step": 7556 + }, + { + "epoch": 0.86, + "learning_rate": 9.593406129122397e-07, + "loss": 0.4489, + "step": 7557 + }, + { + "epoch": 0.86, + "learning_rate": 9.577591718697343e-07, + "loss": 0.4486, + "step": 7558 + }, + { + "epoch": 0.86, + "learning_rate": 9.561789698161007e-07, + "loss": 0.4312, + "step": 7559 + }, + { + "epoch": 0.86, + "learning_rate": 9.54600006967864e-07, + "loss": 0.4647, + "step": 7560 + }, + { + "epoch": 0.86, + "learning_rate": 9.530222835413739e-07, + "loss": 0.451, + "step": 7561 + }, + { + "epoch": 0.86, + "learning_rate": 9.514457997528171e-07, + "loss": 0.4704, + "step": 7562 + }, + { + "epoch": 0.86, + "learning_rate": 9.498705558182053e-07, + "loss": 0.4483, + "step": 7563 + }, + { + "epoch": 0.86, + "learning_rate": 9.482965519533804e-07, + "loss": 0.4656, + "step": 7564 + }, + { + "epoch": 0.86, + "learning_rate": 9.467237883740199e-07, + "loss": 0.4515, + "step": 7565 + }, + { + "epoch": 0.86, + "learning_rate": 9.451522652956225e-07, + "loss": 0.4385, + "step": 7566 + }, + { + "epoch": 0.86, + "learning_rate": 9.435819829335269e-07, + "loss": 0.4314, + "step": 7567 + }, + { + "epoch": 0.86, + "learning_rate": 9.420129415028934e-07, + "loss": 0.4629, + "step": 7568 + }, + { + "epoch": 0.87, + "learning_rate": 9.404451412187166e-07, + "loss": 0.4392, + "step": 7569 + }, + { + "epoch": 0.87, + 
"learning_rate": 9.388785822958224e-07, + "loss": 0.4575, + "step": 7570 + }, + { + "epoch": 0.87, + "learning_rate": 9.373132649488636e-07, + "loss": 0.4626, + "step": 7571 + }, + { + "epoch": 0.87, + "learning_rate": 9.357491893923198e-07, + "loss": 0.4594, + "step": 7572 + }, + { + "epoch": 0.87, + "learning_rate": 9.341863558405084e-07, + "loss": 0.4318, + "step": 7573 + }, + { + "epoch": 0.87, + "learning_rate": 9.326247645075703e-07, + "loss": 0.4612, + "step": 7574 + }, + { + "epoch": 0.87, + "learning_rate": 9.310644156074811e-07, + "loss": 0.4626, + "step": 7575 + }, + { + "epoch": 0.87, + "learning_rate": 9.295053093540408e-07, + "loss": 0.43, + "step": 7576 + }, + { + "epoch": 0.87, + "learning_rate": 9.279474459608806e-07, + "loss": 0.4374, + "step": 7577 + }, + { + "epoch": 0.87, + "learning_rate": 9.263908256414656e-07, + "loss": 0.4794, + "step": 7578 + }, + { + "epoch": 0.87, + "learning_rate": 9.24835448609085e-07, + "loss": 0.4464, + "step": 7579 + }, + { + "epoch": 0.87, + "learning_rate": 9.23281315076865e-07, + "loss": 0.4559, + "step": 7580 + }, + { + "epoch": 0.87, + "learning_rate": 9.217284252577519e-07, + "loss": 0.4274, + "step": 7581 + }, + { + "epoch": 0.87, + "learning_rate": 9.201767793645255e-07, + "loss": 0.4647, + "step": 7582 + }, + { + "epoch": 0.87, + "learning_rate": 9.186263776098014e-07, + "loss": 0.4514, + "step": 7583 + }, + { + "epoch": 0.87, + "learning_rate": 9.170772202060141e-07, + "loss": 0.4447, + "step": 7584 + }, + { + "epoch": 0.87, + "learning_rate": 9.155293073654337e-07, + "loss": 0.4388, + "step": 7585 + }, + { + "epoch": 0.87, + "learning_rate": 9.139826393001617e-07, + "loss": 0.452, + "step": 7586 + }, + { + "epoch": 0.87, + "learning_rate": 9.124372162221217e-07, + "loss": 0.4597, + "step": 7587 + }, + { + "epoch": 0.87, + "learning_rate": 9.108930383430736e-07, + "loss": 0.4609, + "step": 7588 + }, + { + "epoch": 0.87, + "learning_rate": 9.093501058746057e-07, + "loss": 0.4423, + "step": 7589 + }, + { + 
"epoch": 0.87, + "learning_rate": 9.078084190281311e-07, + "loss": 0.4459, + "step": 7590 + }, + { + "epoch": 0.87, + "learning_rate": 9.062679780148987e-07, + "loss": 0.4487, + "step": 7591 + }, + { + "epoch": 0.87, + "learning_rate": 9.047287830459806e-07, + "loss": 0.4524, + "step": 7592 + }, + { + "epoch": 0.87, + "learning_rate": 9.031908343322826e-07, + "loss": 0.449, + "step": 7593 + }, + { + "epoch": 0.87, + "learning_rate": 9.016541320845373e-07, + "loss": 0.4447, + "step": 7594 + }, + { + "epoch": 0.87, + "learning_rate": 9.001186765133052e-07, + "loss": 0.4528, + "step": 7595 + }, + { + "epoch": 0.87, + "learning_rate": 8.985844678289823e-07, + "loss": 0.4641, + "step": 7596 + }, + { + "epoch": 0.87, + "learning_rate": 8.97051506241785e-07, + "loss": 0.4672, + "step": 7597 + }, + { + "epoch": 0.87, + "learning_rate": 8.955197919617653e-07, + "loss": 0.4495, + "step": 7598 + }, + { + "epoch": 0.87, + "learning_rate": 8.93989325198803e-07, + "loss": 0.4568, + "step": 7599 + }, + { + "epoch": 0.87, + "learning_rate": 8.924601061626049e-07, + "loss": 0.4654, + "step": 7600 + }, + { + "epoch": 0.87, + "learning_rate": 8.909321350627109e-07, + "loss": 0.4634, + "step": 7601 + }, + { + "epoch": 0.87, + "learning_rate": 8.894054121084839e-07, + "loss": 0.4489, + "step": 7602 + }, + { + "epoch": 0.87, + "learning_rate": 8.878799375091185e-07, + "loss": 0.444, + "step": 7603 + }, + { + "epoch": 0.87, + "learning_rate": 8.863557114736432e-07, + "loss": 0.4585, + "step": 7604 + }, + { + "epoch": 0.87, + "learning_rate": 8.848327342109053e-07, + "loss": 0.4396, + "step": 7605 + }, + { + "epoch": 0.87, + "learning_rate": 8.833110059295913e-07, + "loss": 0.4655, + "step": 7606 + }, + { + "epoch": 0.87, + "learning_rate": 8.817905268382088e-07, + "loss": 0.4619, + "step": 7607 + }, + { + "epoch": 0.87, + "learning_rate": 8.802712971450989e-07, + "loss": 0.4306, + "step": 7608 + }, + { + "epoch": 0.87, + "learning_rate": 8.787533170584317e-07, + "loss": 0.4527, + "step": 
7609 + }, + { + "epoch": 0.87, + "learning_rate": 8.772365867862021e-07, + "loss": 0.4511, + "step": 7610 + }, + { + "epoch": 0.87, + "learning_rate": 8.757211065362359e-07, + "loss": 0.4495, + "step": 7611 + }, + { + "epoch": 0.87, + "learning_rate": 8.742068765161893e-07, + "loss": 0.4779, + "step": 7612 + }, + { + "epoch": 0.87, + "learning_rate": 8.726938969335419e-07, + "loss": 0.4548, + "step": 7613 + }, + { + "epoch": 0.87, + "learning_rate": 8.711821679956111e-07, + "loss": 0.4472, + "step": 7614 + }, + { + "epoch": 0.87, + "learning_rate": 8.696716899095336e-07, + "loss": 0.4485, + "step": 7615 + }, + { + "epoch": 0.87, + "learning_rate": 8.681624628822794e-07, + "loss": 0.4791, + "step": 7616 + }, + { + "epoch": 0.87, + "learning_rate": 8.666544871206484e-07, + "loss": 0.4501, + "step": 7617 + }, + { + "epoch": 0.87, + "learning_rate": 8.651477628312632e-07, + "loss": 0.4501, + "step": 7618 + }, + { + "epoch": 0.87, + "learning_rate": 8.636422902205821e-07, + "loss": 0.4504, + "step": 7619 + }, + { + "epoch": 0.87, + "learning_rate": 8.621380694948878e-07, + "loss": 0.4417, + "step": 7620 + }, + { + "epoch": 0.87, + "learning_rate": 8.606351008602898e-07, + "loss": 0.4476, + "step": 7621 + }, + { + "epoch": 0.87, + "learning_rate": 8.591333845227312e-07, + "loss": 0.4568, + "step": 7622 + }, + { + "epoch": 0.87, + "learning_rate": 8.576329206879785e-07, + "loss": 0.4544, + "step": 7623 + }, + { + "epoch": 0.87, + "learning_rate": 8.561337095616306e-07, + "loss": 0.4428, + "step": 7624 + }, + { + "epoch": 0.87, + "learning_rate": 8.54635751349111e-07, + "loss": 0.4396, + "step": 7625 + }, + { + "epoch": 0.87, + "learning_rate": 8.531390462556744e-07, + "loss": 0.4423, + "step": 7626 + }, + { + "epoch": 0.87, + "learning_rate": 8.516435944864043e-07, + "loss": 0.4446, + "step": 7627 + }, + { + "epoch": 0.87, + "learning_rate": 8.501493962462092e-07, + "loss": 0.4541, + "step": 7628 + }, + { + "epoch": 0.87, + "learning_rate": 8.486564517398265e-07, + 
"loss": 0.4383, + "step": 7629 + }, + { + "epoch": 0.87, + "learning_rate": 8.471647611718259e-07, + "loss": 0.4603, + "step": 7630 + }, + { + "epoch": 0.87, + "learning_rate": 8.456743247465992e-07, + "loss": 0.4471, + "step": 7631 + }, + { + "epoch": 0.87, + "learning_rate": 8.441851426683723e-07, + "loss": 0.4791, + "step": 7632 + }, + { + "epoch": 0.87, + "learning_rate": 8.426972151411961e-07, + "loss": 0.4562, + "step": 7633 + }, + { + "epoch": 0.87, + "learning_rate": 8.412105423689465e-07, + "loss": 0.4443, + "step": 7634 + }, + { + "epoch": 0.87, + "learning_rate": 8.397251245553339e-07, + "loss": 0.4659, + "step": 7635 + }, + { + "epoch": 0.87, + "learning_rate": 8.382409619038923e-07, + "loss": 0.4401, + "step": 7636 + }, + { + "epoch": 0.87, + "learning_rate": 8.367580546179877e-07, + "loss": 0.4638, + "step": 7637 + }, + { + "epoch": 0.87, + "learning_rate": 8.352764029008098e-07, + "loss": 0.4554, + "step": 7638 + }, + { + "epoch": 0.87, + "learning_rate": 8.337960069553763e-07, + "loss": 0.4306, + "step": 7639 + }, + { + "epoch": 0.87, + "learning_rate": 8.323168669845383e-07, + "loss": 0.4672, + "step": 7640 + }, + { + "epoch": 0.87, + "learning_rate": 8.30838983190968e-07, + "loss": 0.453, + "step": 7641 + }, + { + "epoch": 0.87, + "learning_rate": 8.29362355777168e-07, + "loss": 0.4381, + "step": 7642 + }, + { + "epoch": 0.87, + "learning_rate": 8.278869849454718e-07, + "loss": 0.4571, + "step": 7643 + }, + { + "epoch": 0.87, + "learning_rate": 8.264128708980345e-07, + "loss": 0.4619, + "step": 7644 + }, + { + "epoch": 0.87, + "learning_rate": 8.249400138368457e-07, + "loss": 0.4437, + "step": 7645 + }, + { + "epoch": 0.87, + "learning_rate": 8.234684139637205e-07, + "loss": 0.4636, + "step": 7646 + }, + { + "epoch": 0.87, + "learning_rate": 8.219980714802978e-07, + "loss": 0.4432, + "step": 7647 + }, + { + "epoch": 0.87, + "learning_rate": 8.205289865880505e-07, + "loss": 0.4722, + "step": 7648 + }, + { + "epoch": 0.87, + "learning_rate": 
8.190611594882736e-07, + "loss": 0.4393, + "step": 7649 + }, + { + "epoch": 0.87, + "learning_rate": 8.175945903820937e-07, + "loss": 0.4602, + "step": 7650 + }, + { + "epoch": 0.87, + "learning_rate": 8.161292794704634e-07, + "loss": 0.4411, + "step": 7651 + }, + { + "epoch": 0.87, + "learning_rate": 8.146652269541599e-07, + "loss": 0.458, + "step": 7652 + }, + { + "epoch": 0.87, + "learning_rate": 8.132024330337962e-07, + "loss": 0.4397, + "step": 7653 + }, + { + "epoch": 0.87, + "learning_rate": 8.11740897909803e-07, + "loss": 0.4447, + "step": 7654 + }, + { + "epoch": 0.87, + "learning_rate": 8.102806217824455e-07, + "loss": 0.4601, + "step": 7655 + }, + { + "epoch": 0.87, + "learning_rate": 8.08821604851816e-07, + "loss": 0.4818, + "step": 7656 + }, + { + "epoch": 0.88, + "learning_rate": 8.073638473178291e-07, + "loss": 0.4526, + "step": 7657 + }, + { + "epoch": 0.88, + "learning_rate": 8.059073493802327e-07, + "loss": 0.4646, + "step": 7658 + }, + { + "epoch": 0.88, + "learning_rate": 8.044521112385983e-07, + "loss": 0.4406, + "step": 7659 + }, + { + "epoch": 0.88, + "learning_rate": 8.029981330923242e-07, + "loss": 0.4413, + "step": 7660 + }, + { + "epoch": 0.88, + "learning_rate": 8.01545415140641e-07, + "loss": 0.4463, + "step": 7661 + }, + { + "epoch": 0.88, + "learning_rate": 8.000939575826016e-07, + "loss": 0.4578, + "step": 7662 + }, + { + "epoch": 0.88, + "learning_rate": 7.986437606170893e-07, + "loss": 0.4601, + "step": 7663 + }, + { + "epoch": 0.88, + "learning_rate": 7.971948244428118e-07, + "loss": 0.4446, + "step": 7664 + }, + { + "epoch": 0.88, + "learning_rate": 7.957471492583068e-07, + "loss": 0.4322, + "step": 7665 + }, + { + "epoch": 0.88, + "learning_rate": 7.943007352619392e-07, + "loss": 0.4489, + "step": 7666 + }, + { + "epoch": 0.88, + "learning_rate": 7.928555826518991e-07, + "loss": 0.4336, + "step": 7667 + }, + { + "epoch": 0.88, + "learning_rate": 7.914116916262027e-07, + "loss": 0.4407, + "step": 7668 + }, + { + "epoch": 0.88, + 
"learning_rate": 7.899690623826983e-07, + "loss": 0.4515, + "step": 7669 + }, + { + "epoch": 0.88, + "learning_rate": 7.885276951190568e-07, + "loss": 0.4516, + "step": 7670 + }, + { + "epoch": 0.88, + "learning_rate": 7.870875900327779e-07, + "loss": 0.4426, + "step": 7671 + }, + { + "epoch": 0.88, + "learning_rate": 7.856487473211871e-07, + "loss": 0.4682, + "step": 7672 + }, + { + "epoch": 0.88, + "learning_rate": 7.842111671814401e-07, + "loss": 0.4245, + "step": 7673 + }, + { + "epoch": 0.88, + "learning_rate": 7.82774849810517e-07, + "loss": 0.4563, + "step": 7674 + }, + { + "epoch": 0.88, + "learning_rate": 7.813397954052237e-07, + "loss": 0.4313, + "step": 7675 + }, + { + "epoch": 0.88, + "learning_rate": 7.799060041621975e-07, + "loss": 0.449, + "step": 7676 + }, + { + "epoch": 0.88, + "learning_rate": 7.784734762778978e-07, + "loss": 0.4433, + "step": 7677 + }, + { + "epoch": 0.88, + "learning_rate": 7.77042211948611e-07, + "loss": 0.438, + "step": 7678 + }, + { + "epoch": 0.88, + "learning_rate": 7.756122113704567e-07, + "loss": 0.4465, + "step": 7679 + }, + { + "epoch": 0.88, + "learning_rate": 7.741834747393751e-07, + "loss": 0.4503, + "step": 7680 + }, + { + "epoch": 0.88, + "learning_rate": 7.727560022511327e-07, + "loss": 0.4502, + "step": 7681 + }, + { + "epoch": 0.88, + "learning_rate": 7.713297941013264e-07, + "loss": 0.455, + "step": 7682 + }, + { + "epoch": 0.88, + "learning_rate": 7.69904850485379e-07, + "loss": 0.4359, + "step": 7683 + }, + { + "epoch": 0.88, + "learning_rate": 7.684811715985429e-07, + "loss": 0.4722, + "step": 7684 + }, + { + "epoch": 0.88, + "learning_rate": 7.670587576358889e-07, + "loss": 0.4377, + "step": 7685 + }, + { + "epoch": 0.88, + "learning_rate": 7.656376087923212e-07, + "loss": 0.4555, + "step": 7686 + }, + { + "epoch": 0.88, + "learning_rate": 7.642177252625704e-07, + "loss": 0.4569, + "step": 7687 + }, + { + "epoch": 0.88, + "learning_rate": 7.627991072411889e-07, + "loss": 0.4471, + "step": 7688 + }, + { + 
"epoch": 0.88, + "learning_rate": 7.613817549225621e-07, + "loss": 0.4646, + "step": 7689 + }, + { + "epoch": 0.88, + "learning_rate": 7.599656685008982e-07, + "loss": 0.4708, + "step": 7690 + }, + { + "epoch": 0.88, + "learning_rate": 7.585508481702308e-07, + "loss": 0.4315, + "step": 7691 + }, + { + "epoch": 0.88, + "learning_rate": 7.571372941244237e-07, + "loss": 0.4523, + "step": 7692 + }, + { + "epoch": 0.88, + "learning_rate": 7.557250065571664e-07, + "loss": 0.435, + "step": 7693 + }, + { + "epoch": 0.88, + "learning_rate": 7.543139856619708e-07, + "loss": 0.4433, + "step": 7694 + }, + { + "epoch": 0.88, + "learning_rate": 7.52904231632181e-07, + "loss": 0.4802, + "step": 7695 + }, + { + "epoch": 0.88, + "learning_rate": 7.514957446609627e-07, + "loss": 0.4475, + "step": 7696 + }, + { + "epoch": 0.88, + "learning_rate": 7.500885249413126e-07, + "loss": 0.469, + "step": 7697 + }, + { + "epoch": 0.88, + "learning_rate": 7.486825726660496e-07, + "loss": 0.4423, + "step": 7698 + }, + { + "epoch": 0.88, + "learning_rate": 7.472778880278197e-07, + "loss": 0.4658, + "step": 7699 + }, + { + "epoch": 0.88, + "learning_rate": 7.45874471219098e-07, + "loss": 0.4586, + "step": 7700 + }, + { + "epoch": 0.88, + "learning_rate": 7.444723224321804e-07, + "loss": 0.4538, + "step": 7701 + }, + { + "epoch": 0.88, + "learning_rate": 7.430714418591966e-07, + "loss": 0.4298, + "step": 7702 + }, + { + "epoch": 0.88, + "learning_rate": 7.416718296920977e-07, + "loss": 0.4549, + "step": 7703 + }, + { + "epoch": 0.88, + "learning_rate": 7.40273486122659e-07, + "loss": 0.438, + "step": 7704 + }, + { + "epoch": 0.88, + "learning_rate": 7.388764113424895e-07, + "loss": 0.4753, + "step": 7705 + }, + { + "epoch": 0.88, + "learning_rate": 7.37480605543015e-07, + "loss": 0.4512, + "step": 7706 + }, + { + "epoch": 0.88, + "learning_rate": 7.360860689154969e-07, + "loss": 0.4467, + "step": 7707 + }, + { + "epoch": 0.88, + "learning_rate": 7.346928016510135e-07, + "loss": 0.438, + "step": 
7708 + }, + { + "epoch": 0.88, + "learning_rate": 7.333008039404743e-07, + "loss": 0.4606, + "step": 7709 + }, + { + "epoch": 0.88, + "learning_rate": 7.319100759746167e-07, + "loss": 0.4459, + "step": 7710 + }, + { + "epoch": 0.88, + "learning_rate": 7.305206179439972e-07, + "loss": 0.4351, + "step": 7711 + }, + { + "epoch": 0.88, + "learning_rate": 7.291324300390057e-07, + "loss": 0.46, + "step": 7712 + }, + { + "epoch": 0.88, + "learning_rate": 7.277455124498545e-07, + "loss": 0.4704, + "step": 7713 + }, + { + "epoch": 0.88, + "learning_rate": 7.263598653665815e-07, + "loss": 0.4559, + "step": 7714 + }, + { + "epoch": 0.88, + "learning_rate": 7.249754889790539e-07, + "loss": 0.4432, + "step": 7715 + }, + { + "epoch": 0.88, + "learning_rate": 7.235923834769599e-07, + "loss": 0.4376, + "step": 7716 + }, + { + "epoch": 0.88, + "learning_rate": 7.222105490498133e-07, + "loss": 0.4602, + "step": 7717 + }, + { + "epoch": 0.88, + "learning_rate": 7.208299858869616e-07, + "loss": 0.449, + "step": 7718 + }, + { + "epoch": 0.88, + "learning_rate": 7.194506941775681e-07, + "loss": 0.4414, + "step": 7719 + }, + { + "epoch": 0.88, + "learning_rate": 7.180726741106303e-07, + "loss": 0.4416, + "step": 7720 + }, + { + "epoch": 0.88, + "learning_rate": 7.16695925874964e-07, + "loss": 0.4513, + "step": 7721 + }, + { + "epoch": 0.88, + "learning_rate": 7.15320449659217e-07, + "loss": 0.4673, + "step": 7722 + }, + { + "epoch": 0.88, + "learning_rate": 7.139462456518619e-07, + "loss": 0.4606, + "step": 7723 + }, + { + "epoch": 0.88, + "learning_rate": 7.125733140411928e-07, + "loss": 0.4422, + "step": 7724 + }, + { + "epoch": 0.88, + "learning_rate": 7.1120165501533e-07, + "loss": 0.45, + "step": 7725 + }, + { + "epoch": 0.88, + "learning_rate": 7.098312687622256e-07, + "loss": 0.4541, + "step": 7726 + }, + { + "epoch": 0.88, + "learning_rate": 7.084621554696502e-07, + "loss": 0.4516, + "step": 7727 + }, + { + "epoch": 0.88, + "learning_rate": 7.070943153252053e-07, + "loss": 
0.4362, + "step": 7728 + }, + { + "epoch": 0.88, + "learning_rate": 7.057277485163116e-07, + "loss": 0.4724, + "step": 7729 + }, + { + "epoch": 0.88, + "learning_rate": 7.043624552302231e-07, + "loss": 0.4389, + "step": 7730 + }, + { + "epoch": 0.88, + "learning_rate": 7.029984356540153e-07, + "loss": 0.4602, + "step": 7731 + }, + { + "epoch": 0.88, + "learning_rate": 7.016356899745869e-07, + "loss": 0.4498, + "step": 7732 + }, + { + "epoch": 0.88, + "learning_rate": 7.002742183786671e-07, + "loss": 0.4505, + "step": 7733 + }, + { + "epoch": 0.88, + "learning_rate": 6.989140210528067e-07, + "loss": 0.4436, + "step": 7734 + }, + { + "epoch": 0.88, + "learning_rate": 6.975550981833823e-07, + "loss": 0.4532, + "step": 7735 + }, + { + "epoch": 0.88, + "learning_rate": 6.961974499565982e-07, + "loss": 0.4534, + "step": 7736 + }, + { + "epoch": 0.88, + "learning_rate": 6.948410765584813e-07, + "loss": 0.4557, + "step": 7737 + }, + { + "epoch": 0.88, + "learning_rate": 6.934859781748848e-07, + "loss": 0.4727, + "step": 7738 + }, + { + "epoch": 0.88, + "learning_rate": 6.921321549914872e-07, + "loss": 0.4457, + "step": 7739 + }, + { + "epoch": 0.88, + "learning_rate": 6.907796071937944e-07, + "loss": 0.4459, + "step": 7740 + }, + { + "epoch": 0.88, + "learning_rate": 6.894283349671349e-07, + "loss": 0.4443, + "step": 7741 + }, + { + "epoch": 0.88, + "learning_rate": 6.880783384966638e-07, + "loss": 0.4602, + "step": 7742 + }, + { + "epoch": 0.88, + "learning_rate": 6.867296179673588e-07, + "loss": 0.4401, + "step": 7743 + }, + { + "epoch": 0.89, + "learning_rate": 6.853821735640265e-07, + "loss": 0.438, + "step": 7744 + }, + { + "epoch": 0.89, + "learning_rate": 6.840360054712946e-07, + "loss": 0.4418, + "step": 7745 + }, + { + "epoch": 0.89, + "learning_rate": 6.826911138736214e-07, + "loss": 0.4423, + "step": 7746 + }, + { + "epoch": 0.89, + "learning_rate": 6.81347498955286e-07, + "loss": 0.461, + "step": 7747 + }, + { + "epoch": 0.89, + "learning_rate": 
6.800051609003911e-07, + "loss": 0.4588, + "step": 7748 + }, + { + "epoch": 0.89, + "learning_rate": 6.786640998928684e-07, + "loss": 0.457, + "step": 7749 + }, + { + "epoch": 0.89, + "learning_rate": 6.773243161164756e-07, + "loss": 0.4321, + "step": 7750 + }, + { + "epoch": 0.89, + "learning_rate": 6.7598580975479e-07, + "loss": 0.4422, + "step": 7751 + }, + { + "epoch": 0.89, + "learning_rate": 6.746485809912184e-07, + "loss": 0.4472, + "step": 7752 + }, + { + "epoch": 0.89, + "learning_rate": 6.733126300089898e-07, + "loss": 0.4487, + "step": 7753 + }, + { + "epoch": 0.89, + "learning_rate": 6.719779569911622e-07, + "loss": 0.4533, + "step": 7754 + }, + { + "epoch": 0.89, + "learning_rate": 6.706445621206126e-07, + "loss": 0.4623, + "step": 7755 + }, + { + "epoch": 0.89, + "learning_rate": 6.69312445580046e-07, + "loss": 0.4483, + "step": 7756 + }, + { + "epoch": 0.89, + "learning_rate": 6.67981607551994e-07, + "loss": 0.4525, + "step": 7757 + }, + { + "epoch": 0.89, + "learning_rate": 6.666520482188087e-07, + "loss": 0.4394, + "step": 7758 + }, + { + "epoch": 0.89, + "learning_rate": 6.653237677626701e-07, + "loss": 0.458, + "step": 7759 + }, + { + "epoch": 0.89, + "learning_rate": 6.639967663655844e-07, + "loss": 0.4302, + "step": 7760 + }, + { + "epoch": 0.89, + "learning_rate": 6.626710442093776e-07, + "loss": 0.4385, + "step": 7761 + }, + { + "epoch": 0.89, + "learning_rate": 6.613466014757064e-07, + "loss": 0.4548, + "step": 7762 + }, + { + "epoch": 0.89, + "learning_rate": 6.600234383460469e-07, + "loss": 0.4378, + "step": 7763 + }, + { + "epoch": 0.89, + "learning_rate": 6.587015550017006e-07, + "loss": 0.431, + "step": 7764 + }, + { + "epoch": 0.89, + "learning_rate": 6.573809516237984e-07, + "loss": 0.4637, + "step": 7765 + }, + { + "epoch": 0.89, + "learning_rate": 6.560616283932897e-07, + "loss": 0.4629, + "step": 7766 + }, + { + "epoch": 0.89, + "learning_rate": 6.547435854909534e-07, + "loss": 0.4502, + "step": 7767 + }, + { + "epoch": 0.89, + 
"learning_rate": 6.534268230973873e-07, + "loss": 0.4431, + "step": 7768 + }, + { + "epoch": 0.89, + "learning_rate": 6.521113413930202e-07, + "loss": 0.452, + "step": 7769 + }, + { + "epoch": 0.89, + "learning_rate": 6.507971405581037e-07, + "loss": 0.4402, + "step": 7770 + }, + { + "epoch": 0.89, + "learning_rate": 6.494842207727092e-07, + "loss": 0.4589, + "step": 7771 + }, + { + "epoch": 0.89, + "learning_rate": 6.481725822167384e-07, + "loss": 0.446, + "step": 7772 + }, + { + "epoch": 0.89, + "learning_rate": 6.468622250699152e-07, + "loss": 0.4478, + "step": 7773 + }, + { + "epoch": 0.89, + "learning_rate": 6.45553149511785e-07, + "loss": 0.4533, + "step": 7774 + }, + { + "epoch": 0.89, + "learning_rate": 6.442453557217243e-07, + "loss": 0.4651, + "step": 7775 + }, + { + "epoch": 0.89, + "learning_rate": 6.429388438789252e-07, + "loss": 0.4491, + "step": 7776 + }, + { + "epoch": 0.89, + "learning_rate": 6.416336141624146e-07, + "loss": 0.4507, + "step": 7777 + }, + { + "epoch": 0.89, + "learning_rate": 6.403296667510339e-07, + "loss": 0.4598, + "step": 7778 + }, + { + "epoch": 0.89, + "learning_rate": 6.390270018234534e-07, + "loss": 0.4574, + "step": 7779 + }, + { + "epoch": 0.89, + "learning_rate": 6.377256195581705e-07, + "loss": 0.4618, + "step": 7780 + }, + { + "epoch": 0.89, + "learning_rate": 6.364255201335013e-07, + "loss": 0.4581, + "step": 7781 + }, + { + "epoch": 0.89, + "learning_rate": 6.351267037275877e-07, + "loss": 0.4482, + "step": 7782 + }, + { + "epoch": 0.89, + "learning_rate": 6.338291705183986e-07, + "loss": 0.4528, + "step": 7783 + }, + { + "epoch": 0.89, + "learning_rate": 6.325329206837217e-07, + "loss": 0.463, + "step": 7784 + }, + { + "epoch": 0.89, + "learning_rate": 6.31237954401176e-07, + "loss": 0.4425, + "step": 7785 + }, + { + "epoch": 0.89, + "learning_rate": 6.299442718481974e-07, + "loss": 0.4418, + "step": 7786 + }, + { + "epoch": 0.89, + "learning_rate": 6.286518732020519e-07, + "loss": 0.4509, + "step": 7787 + }, + { + 
"epoch": 0.89, + "learning_rate": 6.273607586398267e-07, + "loss": 0.4507, + "step": 7788 + }, + { + "epoch": 0.89, + "learning_rate": 6.260709283384326e-07, + "loss": 0.4678, + "step": 7789 + }, + { + "epoch": 0.89, + "learning_rate": 6.247823824746058e-07, + "loss": 0.4374, + "step": 7790 + }, + { + "epoch": 0.89, + "learning_rate": 6.234951212249052e-07, + "loss": 0.4768, + "step": 7791 + }, + { + "epoch": 0.89, + "learning_rate": 6.222091447657119e-07, + "loss": 0.4333, + "step": 7792 + }, + { + "epoch": 0.89, + "learning_rate": 6.209244532732394e-07, + "loss": 0.4462, + "step": 7793 + }, + { + "epoch": 0.89, + "learning_rate": 6.196410469235148e-07, + "loss": 0.4515, + "step": 7794 + }, + { + "epoch": 0.89, + "learning_rate": 6.183589258923928e-07, + "loss": 0.4577, + "step": 7795 + }, + { + "epoch": 0.89, + "learning_rate": 6.170780903555529e-07, + "loss": 0.4478, + "step": 7796 + }, + { + "epoch": 0.89, + "learning_rate": 6.157985404885003e-07, + "loss": 0.4433, + "step": 7797 + }, + { + "epoch": 0.89, + "learning_rate": 6.145202764665626e-07, + "loss": 0.4634, + "step": 7798 + }, + { + "epoch": 0.89, + "learning_rate": 6.132432984648895e-07, + "loss": 0.4328, + "step": 7799 + }, + { + "epoch": 0.89, + "learning_rate": 6.119676066584523e-07, + "loss": 0.4386, + "step": 7800 + }, + { + "epoch": 0.89, + "learning_rate": 6.106932012220534e-07, + "loss": 0.4653, + "step": 7801 + }, + { + "epoch": 0.89, + "learning_rate": 6.09420082330312e-07, + "loss": 0.4458, + "step": 7802 + }, + { + "epoch": 0.89, + "learning_rate": 6.081482501576763e-07, + "loss": 0.4492, + "step": 7803 + }, + { + "epoch": 0.89, + "learning_rate": 6.068777048784136e-07, + "loss": 0.4465, + "step": 7804 + }, + { + "epoch": 0.89, + "learning_rate": 6.056084466666167e-07, + "loss": 0.4539, + "step": 7805 + }, + { + "epoch": 0.89, + "learning_rate": 6.043404756962046e-07, + "loss": 0.4583, + "step": 7806 + }, + { + "epoch": 0.89, + "learning_rate": 6.030737921409169e-07, + "loss": 0.4634, + 
"step": 7807 + }, + { + "epoch": 0.89, + "learning_rate": 6.01808396174316e-07, + "loss": 0.4564, + "step": 7808 + }, + { + "epoch": 0.89, + "learning_rate": 6.005442879697909e-07, + "loss": 0.4556, + "step": 7809 + }, + { + "epoch": 0.89, + "learning_rate": 5.992814677005521e-07, + "loss": 0.4375, + "step": 7810 + }, + { + "epoch": 0.89, + "learning_rate": 5.980199355396343e-07, + "loss": 0.4505, + "step": 7811 + }, + { + "epoch": 0.89, + "learning_rate": 5.967596916598961e-07, + "loss": 0.4443, + "step": 7812 + }, + { + "epoch": 0.89, + "learning_rate": 5.955007362340171e-07, + "loss": 0.4439, + "step": 7813 + }, + { + "epoch": 0.89, + "learning_rate": 5.942430694345058e-07, + "loss": 0.4427, + "step": 7814 + }, + { + "epoch": 0.89, + "learning_rate": 5.929866914336857e-07, + "loss": 0.4509, + "step": 7815 + }, + { + "epoch": 0.89, + "learning_rate": 5.917316024037123e-07, + "loss": 0.4467, + "step": 7816 + }, + { + "epoch": 0.89, + "learning_rate": 5.904778025165614e-07, + "loss": 0.4594, + "step": 7817 + }, + { + "epoch": 0.89, + "learning_rate": 5.892252919440289e-07, + "loss": 0.4512, + "step": 7818 + }, + { + "epoch": 0.89, + "learning_rate": 5.879740708577386e-07, + "loss": 0.4463, + "step": 7819 + }, + { + "epoch": 0.89, + "learning_rate": 5.867241394291356e-07, + "loss": 0.4634, + "step": 7820 + }, + { + "epoch": 0.89, + "learning_rate": 5.854754978294863e-07, + "loss": 0.4553, + "step": 7821 + }, + { + "epoch": 0.89, + "learning_rate": 5.84228146229886e-07, + "loss": 0.444, + "step": 7822 + }, + { + "epoch": 0.89, + "learning_rate": 5.829820848012457e-07, + "loss": 0.4574, + "step": 7823 + }, + { + "epoch": 0.89, + "learning_rate": 5.817373137143079e-07, + "loss": 0.4532, + "step": 7824 + }, + { + "epoch": 0.89, + "learning_rate": 5.804938331396292e-07, + "loss": 0.4523, + "step": 7825 + }, + { + "epoch": 0.89, + "learning_rate": 5.79251643247598e-07, + "loss": 0.4373, + "step": 7826 + }, + { + "epoch": 0.89, + "learning_rate": 5.780107442084215e-07, + 
"loss": 0.4631, + "step": 7827 + }, + { + "epoch": 0.89, + "learning_rate": 5.767711361921291e-07, + "loss": 0.4318, + "step": 7828 + }, + { + "epoch": 0.89, + "learning_rate": 5.755328193685772e-07, + "loss": 0.4593, + "step": 7829 + }, + { + "epoch": 0.89, + "learning_rate": 5.742957939074412e-07, + "loss": 0.4494, + "step": 7830 + }, + { + "epoch": 0.89, + "learning_rate": 5.730600599782188e-07, + "loss": 0.4487, + "step": 7831 + }, + { + "epoch": 0.9, + "learning_rate": 5.718256177502379e-07, + "loss": 0.4477, + "step": 7832 + }, + { + "epoch": 0.9, + "learning_rate": 5.70592467392641e-07, + "loss": 0.4551, + "step": 7833 + }, + { + "epoch": 0.9, + "learning_rate": 5.693606090744008e-07, + "loss": 0.4259, + "step": 7834 + }, + { + "epoch": 0.9, + "learning_rate": 5.681300429643044e-07, + "loss": 0.4667, + "step": 7835 + }, + { + "epoch": 0.9, + "learning_rate": 5.669007692309703e-07, + "loss": 0.4529, + "step": 7836 + }, + { + "epoch": 0.9, + "learning_rate": 5.65672788042837e-07, + "loss": 0.4611, + "step": 7837 + }, + { + "epoch": 0.9, + "learning_rate": 5.644460995681644e-07, + "loss": 0.4351, + "step": 7838 + }, + { + "epoch": 0.9, + "learning_rate": 5.632207039750348e-07, + "loss": 0.4465, + "step": 7839 + }, + { + "epoch": 0.9, + "learning_rate": 5.61996601431356e-07, + "loss": 0.463, + "step": 7840 + }, + { + "epoch": 0.9, + "learning_rate": 5.607737921048573e-07, + "loss": 0.4737, + "step": 7841 + }, + { + "epoch": 0.9, + "learning_rate": 5.595522761630911e-07, + "loss": 0.4388, + "step": 7842 + }, + { + "epoch": 0.9, + "learning_rate": 5.583320537734315e-07, + "loss": 0.455, + "step": 7843 + }, + { + "epoch": 0.9, + "learning_rate": 5.57113125103077e-07, + "loss": 0.4427, + "step": 7844 + }, + { + "epoch": 0.9, + "learning_rate": 5.558954903190483e-07, + "loss": 0.4489, + "step": 7845 + }, + { + "epoch": 0.9, + "learning_rate": 5.546791495881887e-07, + "loss": 0.4557, + "step": 7846 + }, + { + "epoch": 0.9, + "learning_rate": 5.534641030771615e-07, + 
"loss": 0.4598, + "step": 7847 + }, + { + "epoch": 0.9, + "learning_rate": 5.522503509524591e-07, + "loss": 0.4401, + "step": 7848 + }, + { + "epoch": 0.9, + "learning_rate": 5.510378933803895e-07, + "loss": 0.4515, + "step": 7849 + }, + { + "epoch": 0.9, + "learning_rate": 5.498267305270888e-07, + "loss": 0.4423, + "step": 7850 + }, + { + "epoch": 0.9, + "learning_rate": 5.48616862558512e-07, + "loss": 0.4442, + "step": 7851 + }, + { + "epoch": 0.9, + "learning_rate": 5.474082896404365e-07, + "loss": 0.4525, + "step": 7852 + }, + { + "epoch": 0.9, + "learning_rate": 5.462010119384665e-07, + "loss": 0.4448, + "step": 7853 + }, + { + "epoch": 0.9, + "learning_rate": 5.44995029618024e-07, + "loss": 0.4257, + "step": 7854 + }, + { + "epoch": 0.9, + "learning_rate": 5.43790342844358e-07, + "loss": 0.4738, + "step": 7855 + }, + { + "epoch": 0.9, + "learning_rate": 5.425869517825366e-07, + "loss": 0.4501, + "step": 7856 + }, + { + "epoch": 0.9, + "learning_rate": 5.413848565974489e-07, + "loss": 0.4749, + "step": 7857 + }, + { + "epoch": 0.9, + "learning_rate": 5.401840574538108e-07, + "loss": 0.4564, + "step": 7858 + }, + { + "epoch": 0.9, + "learning_rate": 5.389845545161598e-07, + "loss": 0.4435, + "step": 7859 + }, + { + "epoch": 0.9, + "learning_rate": 5.37786347948851e-07, + "loss": 0.4443, + "step": 7860 + }, + { + "epoch": 0.9, + "learning_rate": 5.365894379160686e-07, + "loss": 0.4699, + "step": 7861 + }, + { + "epoch": 0.9, + "learning_rate": 5.353938245818147e-07, + "loss": 0.4474, + "step": 7862 + }, + { + "epoch": 0.9, + "learning_rate": 5.341995081099139e-07, + "loss": 0.4651, + "step": 7863 + }, + { + "epoch": 0.9, + "learning_rate": 5.330064886640173e-07, + "loss": 0.4474, + "step": 7864 + }, + { + "epoch": 0.9, + "learning_rate": 5.318147664075923e-07, + "loss": 0.4703, + "step": 7865 + }, + { + "epoch": 0.9, + "learning_rate": 5.306243415039336e-07, + "loss": 0.452, + "step": 7866 + }, + { + "epoch": 0.9, + "learning_rate": 5.294352141161541e-07, + 
"loss": 0.4484, + "step": 7867 + }, + { + "epoch": 0.9, + "learning_rate": 5.282473844071933e-07, + "loss": 0.4377, + "step": 7868 + }, + { + "epoch": 0.9, + "learning_rate": 5.27060852539808e-07, + "loss": 0.4763, + "step": 7869 + }, + { + "epoch": 0.9, + "learning_rate": 5.258756186765801e-07, + "loss": 0.4567, + "step": 7870 + }, + { + "epoch": 0.9, + "learning_rate": 5.246916829799132e-07, + "loss": 0.4356, + "step": 7871 + }, + { + "epoch": 0.9, + "learning_rate": 5.235090456120329e-07, + "loss": 0.4509, + "step": 7872 + }, + { + "epoch": 0.9, + "learning_rate": 5.223277067349864e-07, + "loss": 0.4649, + "step": 7873 + }, + { + "epoch": 0.9, + "learning_rate": 5.211476665106463e-07, + "loss": 0.4489, + "step": 7874 + }, + { + "epoch": 0.9, + "learning_rate": 5.199689251007001e-07, + "loss": 0.4553, + "step": 7875 + }, + { + "epoch": 0.9, + "learning_rate": 5.187914826666662e-07, + "loss": 0.4444, + "step": 7876 + }, + { + "epoch": 0.9, + "learning_rate": 5.17615339369878e-07, + "loss": 0.4496, + "step": 7877 + }, + { + "epoch": 0.9, + "learning_rate": 5.164404953714919e-07, + "loss": 0.4492, + "step": 7878 + }, + { + "epoch": 0.9, + "learning_rate": 5.152669508324904e-07, + "loss": 0.4534, + "step": 7879 + }, + { + "epoch": 0.9, + "learning_rate": 5.140947059136736e-07, + "loss": 0.4503, + "step": 7880 + }, + { + "epoch": 0.9, + "learning_rate": 5.129237607756677e-07, + "loss": 0.4529, + "step": 7881 + }, + { + "epoch": 0.9, + "learning_rate": 5.117541155789141e-07, + "loss": 0.4364, + "step": 7882 + }, + { + "epoch": 0.9, + "learning_rate": 5.105857704836836e-07, + "loss": 0.4399, + "step": 7883 + }, + { + "epoch": 0.9, + "learning_rate": 5.094187256500671e-07, + "loss": 0.4533, + "step": 7884 + }, + { + "epoch": 0.9, + "learning_rate": 5.08252981237971e-07, + "loss": 0.4392, + "step": 7885 + }, + { + "epoch": 0.9, + "learning_rate": 5.070885374071321e-07, + "loss": 0.4464, + "step": 7886 + }, + { + "epoch": 0.9, + "learning_rate": 5.05925394317105e-07, + 
"loss": 0.4763, + "step": 7887 + }, + { + "epoch": 0.9, + "learning_rate": 5.047635521272631e-07, + "loss": 0.4511, + "step": 7888 + }, + { + "epoch": 0.9, + "learning_rate": 5.036030109968082e-07, + "loss": 0.456, + "step": 7889 + }, + { + "epoch": 0.9, + "learning_rate": 5.024437710847574e-07, + "loss": 0.4665, + "step": 7890 + }, + { + "epoch": 0.9, + "learning_rate": 5.012858325499559e-07, + "loss": 0.4364, + "step": 7891 + }, + { + "epoch": 0.9, + "learning_rate": 5.001291955510634e-07, + "loss": 0.4645, + "step": 7892 + }, + { + "epoch": 0.9, + "learning_rate": 4.989738602465666e-07, + "loss": 0.4316, + "step": 7893 + }, + { + "epoch": 0.9, + "learning_rate": 4.978198267947742e-07, + "loss": 0.4545, + "step": 7894 + }, + { + "epoch": 0.9, + "learning_rate": 4.966670953538133e-07, + "loss": 0.4539, + "step": 7895 + }, + { + "epoch": 0.9, + "learning_rate": 4.955156660816307e-07, + "loss": 0.454, + "step": 7896 + }, + { + "epoch": 0.9, + "learning_rate": 4.943655391360025e-07, + "loss": 0.4422, + "step": 7897 + }, + { + "epoch": 0.9, + "learning_rate": 4.932167146745193e-07, + "loss": 0.4376, + "step": 7898 + }, + { + "epoch": 0.9, + "learning_rate": 4.920691928545973e-07, + "loss": 0.456, + "step": 7899 + }, + { + "epoch": 0.9, + "learning_rate": 4.909229738334698e-07, + "loss": 0.4591, + "step": 7900 + }, + { + "epoch": 0.9, + "learning_rate": 4.897780577681954e-07, + "loss": 0.4428, + "step": 7901 + }, + { + "epoch": 0.9, + "learning_rate": 4.886344448156566e-07, + "loss": 0.4412, + "step": 7902 + }, + { + "epoch": 0.9, + "learning_rate": 4.874921351325512e-07, + "loss": 0.4327, + "step": 7903 + }, + { + "epoch": 0.9, + "learning_rate": 4.863511288753986e-07, + "loss": 0.4401, + "step": 7904 + }, + { + "epoch": 0.9, + "learning_rate": 4.85211426200547e-07, + "loss": 0.4561, + "step": 7905 + }, + { + "epoch": 0.9, + "learning_rate": 4.840730272641569e-07, + "loss": 0.4616, + "step": 7906 + }, + { + "epoch": 0.9, + "learning_rate": 4.829359322222182e-07, + 
"loss": 0.4657, + "step": 7907 + }, + { + "epoch": 0.9, + "learning_rate": 4.818001412305362e-07, + "loss": 0.4471, + "step": 7908 + }, + { + "epoch": 0.9, + "learning_rate": 4.806656544447374e-07, + "loss": 0.4503, + "step": 7909 + }, + { + "epoch": 0.9, + "learning_rate": 4.795324720202754e-07, + "loss": 0.4521, + "step": 7910 + }, + { + "epoch": 0.9, + "learning_rate": 4.784005941124203e-07, + "loss": 0.4485, + "step": 7911 + }, + { + "epoch": 0.9, + "learning_rate": 4.772700208762659e-07, + "loss": 0.4421, + "step": 7912 + }, + { + "epoch": 0.9, + "learning_rate": 4.761407524667239e-07, + "loss": 0.4397, + "step": 7913 + }, + { + "epoch": 0.9, + "learning_rate": 4.750127890385292e-07, + "loss": 0.4493, + "step": 7914 + }, + { + "epoch": 0.9, + "learning_rate": 4.738861307462406e-07, + "loss": 0.461, + "step": 7915 + }, + { + "epoch": 0.9, + "learning_rate": 4.7276077774423334e-07, + "loss": 0.4571, + "step": 7916 + }, + { + "epoch": 0.9, + "learning_rate": 4.716367301867053e-07, + "loss": 0.4409, + "step": 7917 + }, + { + "epoch": 0.9, + "learning_rate": 4.705139882276788e-07, + "loss": 0.4698, + "step": 7918 + }, + { + "epoch": 0.91, + "learning_rate": 4.693925520209908e-07, + "loss": 0.4495, + "step": 7919 + }, + { + "epoch": 0.91, + "learning_rate": 4.6827242172030495e-07, + "loss": 0.4517, + "step": 7920 + }, + { + "epoch": 0.91, + "learning_rate": 4.6715359747910526e-07, + "loss": 0.4549, + "step": 7921 + }, + { + "epoch": 0.91, + "learning_rate": 4.660360794506946e-07, + "loss": 0.4327, + "step": 7922 + }, + { + "epoch": 0.91, + "learning_rate": 4.649198677881983e-07, + "loss": 0.4543, + "step": 7923 + }, + { + "epoch": 0.91, + "learning_rate": 4.6380496264456064e-07, + "loss": 0.4555, + "step": 7924 + }, + { + "epoch": 0.91, + "learning_rate": 4.6269136417255167e-07, + "loss": 0.433, + "step": 7925 + }, + { + "epoch": 0.91, + "learning_rate": 4.615790725247571e-07, + "loss": 0.4415, + "step": 7926 + }, + { + "epoch": 0.91, + "learning_rate": 
4.60468087853585e-07, + "loss": 0.44, + "step": 7927 + }, + { + "epoch": 0.91, + "learning_rate": 4.5935841031126693e-07, + "loss": 0.4495, + "step": 7928 + }, + { + "epoch": 0.91, + "learning_rate": 4.582500400498513e-07, + "loss": 0.4418, + "step": 7929 + }, + { + "epoch": 0.91, + "learning_rate": 4.5714297722121105e-07, + "loss": 0.458, + "step": 7930 + }, + { + "epoch": 0.91, + "learning_rate": 4.5603722197703925e-07, + "loss": 0.4415, + "step": 7931 + }, + { + "epoch": 0.91, + "learning_rate": 4.54932774468847e-07, + "loss": 0.4602, + "step": 7932 + }, + { + "epoch": 0.91, + "learning_rate": 4.5382963484797096e-07, + "loss": 0.4578, + "step": 7933 + }, + { + "epoch": 0.91, + "learning_rate": 4.5272780326556466e-07, + "loss": 0.469, + "step": 7934 + }, + { + "epoch": 0.91, + "learning_rate": 4.516272798726018e-07, + "loss": 0.4415, + "step": 7935 + }, + { + "epoch": 0.91, + "learning_rate": 4.5052806481988175e-07, + "loss": 0.4508, + "step": 7936 + }, + { + "epoch": 0.91, + "learning_rate": 4.494301582580185e-07, + "loss": 0.4495, + "step": 7937 + }, + { + "epoch": 0.91, + "learning_rate": 4.4833356033745167e-07, + "loss": 0.4321, + "step": 7938 + }, + { + "epoch": 0.91, + "learning_rate": 4.472382712084389e-07, + "loss": 0.4402, + "step": 7939 + }, + { + "epoch": 0.91, + "learning_rate": 4.4614429102105893e-07, + "loss": 0.441, + "step": 7940 + }, + { + "epoch": 0.91, + "learning_rate": 4.4505161992521417e-07, + "loss": 0.4518, + "step": 7941 + }, + { + "epoch": 0.91, + "learning_rate": 4.439602580706226e-07, + "loss": 0.4709, + "step": 7942 + }, + { + "epoch": 0.91, + "learning_rate": 4.4287020560682345e-07, + "loss": 0.4147, + "step": 7943 + }, + { + "epoch": 0.91, + "learning_rate": 4.4178146268318177e-07, + "loss": 0.4517, + "step": 7944 + }, + { + "epoch": 0.91, + "learning_rate": 4.406940294488771e-07, + "loss": 0.4611, + "step": 7945 + }, + { + "epoch": 0.91, + "learning_rate": 4.396079060529146e-07, + "loss": 0.4488, + "step": 7946 + }, + { + "epoch": 
0.91, + "learning_rate": 4.3852309264411417e-07, + "loss": 0.434, + "step": 7947 + }, + { + "epoch": 0.91, + "learning_rate": 4.3743958937112253e-07, + "loss": 0.4538, + "step": 7948 + }, + { + "epoch": 0.91, + "learning_rate": 4.363573963824008e-07, + "loss": 0.4536, + "step": 7949 + }, + { + "epoch": 0.91, + "learning_rate": 4.3527651382623603e-07, + "loss": 0.4758, + "step": 7950 + }, + { + "epoch": 0.91, + "learning_rate": 4.3419694185073303e-07, + "loss": 0.449, + "step": 7951 + }, + { + "epoch": 0.91, + "learning_rate": 4.331186806038179e-07, + "loss": 0.4611, + "step": 7952 + }, + { + "epoch": 0.91, + "learning_rate": 4.320417302332325e-07, + "loss": 0.4398, + "step": 7953 + }, + { + "epoch": 0.91, + "learning_rate": 4.3096609088654873e-07, + "loss": 0.4411, + "step": 7954 + }, + { + "epoch": 0.91, + "learning_rate": 4.298917627111476e-07, + "loss": 0.4262, + "step": 7955 + }, + { + "epoch": 0.91, + "learning_rate": 4.2881874585424146e-07, + "loss": 0.4694, + "step": 7956 + }, + { + "epoch": 0.91, + "learning_rate": 4.2774704046285254e-07, + "loss": 0.4418, + "step": 7957 + }, + { + "epoch": 0.91, + "learning_rate": 4.266766466838335e-07, + "loss": 0.4588, + "step": 7958 + }, + { + "epoch": 0.91, + "learning_rate": 4.256075646638469e-07, + "loss": 0.442, + "step": 7959 + }, + { + "epoch": 0.91, + "learning_rate": 4.2453979454938563e-07, + "loss": 0.4577, + "step": 7960 + }, + { + "epoch": 0.91, + "learning_rate": 4.2347333648675383e-07, + "loss": 0.4395, + "step": 7961 + }, + { + "epoch": 0.91, + "learning_rate": 4.2240819062208337e-07, + "loss": 0.4491, + "step": 7962 + }, + { + "epoch": 0.91, + "learning_rate": 4.2134435710132093e-07, + "loss": 0.453, + "step": 7963 + }, + { + "epoch": 0.91, + "learning_rate": 4.2028183607023766e-07, + "loss": 0.4556, + "step": 7964 + }, + { + "epoch": 0.91, + "learning_rate": 4.192206276744204e-07, + "loss": 0.4387, + "step": 7965 + }, + { + "epoch": 0.91, + "learning_rate": 4.181607320592784e-07, + "loss": 0.4476, + 
"step": 7966 + }, + { + "epoch": 0.91, + "learning_rate": 4.1710214937004223e-07, + "loss": 0.4507, + "step": 7967 + }, + { + "epoch": 0.91, + "learning_rate": 4.1604487975176136e-07, + "loss": 0.4773, + "step": 7968 + }, + { + "epoch": 0.91, + "learning_rate": 4.149889233493054e-07, + "loss": 0.4412, + "step": 7969 + }, + { + "epoch": 0.91, + "learning_rate": 4.139342803073632e-07, + "loss": 0.4464, + "step": 7970 + }, + { + "epoch": 0.91, + "learning_rate": 4.128809507704445e-07, + "loss": 0.4332, + "step": 7971 + }, + { + "epoch": 0.91, + "learning_rate": 4.1182893488287965e-07, + "loss": 0.451, + "step": 7972 + }, + { + "epoch": 0.91, + "learning_rate": 4.1077823278881767e-07, + "loss": 0.4384, + "step": 7973 + }, + { + "epoch": 0.91, + "learning_rate": 4.097288446322278e-07, + "loss": 0.4486, + "step": 7974 + }, + { + "epoch": 0.91, + "learning_rate": 4.086807705569018e-07, + "loss": 0.4793, + "step": 7975 + }, + { + "epoch": 0.91, + "learning_rate": 4.076340107064458e-07, + "loss": 0.4487, + "step": 7976 + }, + { + "epoch": 0.91, + "learning_rate": 4.065885652242907e-07, + "loss": 0.4378, + "step": 7977 + }, + { + "epoch": 0.91, + "learning_rate": 4.055444342536885e-07, + "loss": 0.4562, + "step": 7978 + }, + { + "epoch": 0.91, + "learning_rate": 4.045016179377048e-07, + "loss": 0.4411, + "step": 7979 + }, + { + "epoch": 0.91, + "learning_rate": 4.034601164192309e-07, + "loss": 0.4305, + "step": 7980 + }, + { + "epoch": 0.91, + "learning_rate": 4.024199298409737e-07, + "loss": 0.4558, + "step": 7981 + }, + { + "epoch": 0.91, + "learning_rate": 4.013810583454647e-07, + "loss": 0.4528, + "step": 7982 + }, + { + "epoch": 0.91, + "learning_rate": 4.0034350207505124e-07, + "loss": 0.4647, + "step": 7983 + }, + { + "epoch": 0.91, + "learning_rate": 3.9930726117190064e-07, + "loss": 0.4603, + "step": 7984 + }, + { + "epoch": 0.91, + "learning_rate": 3.982723357780027e-07, + "loss": 0.4369, + "step": 7985 + }, + { + "epoch": 0.91, + "learning_rate": 
3.97238726035164e-07, + "loss": 0.4506, + "step": 7986 + }, + { + "epoch": 0.91, + "learning_rate": 3.962064320850112e-07, + "loss": 0.4454, + "step": 7987 + }, + { + "epoch": 0.91, + "learning_rate": 3.951754540689956e-07, + "loss": 0.4581, + "step": 7988 + }, + { + "epoch": 0.91, + "learning_rate": 3.9414579212838087e-07, + "loss": 0.4423, + "step": 7989 + }, + { + "epoch": 0.91, + "learning_rate": 3.931174464042542e-07, + "loss": 0.4595, + "step": 7990 + }, + { + "epoch": 0.91, + "learning_rate": 3.920904170375239e-07, + "loss": 0.4608, + "step": 7991 + }, + { + "epoch": 0.91, + "learning_rate": 3.9106470416891195e-07, + "loss": 0.4527, + "step": 7992 + }, + { + "epoch": 0.91, + "learning_rate": 3.9004030793896807e-07, + "loss": 0.4451, + "step": 7993 + }, + { + "epoch": 0.91, + "learning_rate": 3.8901722848805443e-07, + "loss": 0.4519, + "step": 7994 + }, + { + "epoch": 0.91, + "learning_rate": 3.8799546595635784e-07, + "loss": 0.4403, + "step": 7995 + }, + { + "epoch": 0.91, + "learning_rate": 3.8697502048387956e-07, + "loss": 0.4511, + "step": 7996 + }, + { + "epoch": 0.91, + "learning_rate": 3.8595589221044674e-07, + "loss": 0.4411, + "step": 7997 + }, + { + "epoch": 0.91, + "learning_rate": 3.84938081275702e-07, + "loss": 0.4515, + "step": 7998 + }, + { + "epoch": 0.91, + "learning_rate": 3.839215878191083e-07, + "loss": 0.4685, + "step": 7999 + }, + { + "epoch": 0.91, + "learning_rate": 3.8290641197994526e-07, + "loss": 0.4395, + "step": 8000 + }, + { + "epoch": 0.91, + "learning_rate": 3.8189255389731837e-07, + "loss": 0.4388, + "step": 8001 + }, + { + "epoch": 0.91, + "learning_rate": 3.808800137101465e-07, + "loss": 0.4422, + "step": 8002 + }, + { + "epoch": 0.91, + "learning_rate": 3.7986879155717084e-07, + "loss": 0.4631, + "step": 8003 + }, + { + "epoch": 0.91, + "learning_rate": 3.7885888757695054e-07, + "loss": 0.4384, + "step": 8004 + }, + { + "epoch": 0.91, + "learning_rate": 3.778503019078672e-07, + "loss": 0.4312, + "step": 8005 + }, + { + 
"epoch": 0.91, + "learning_rate": 3.768430346881169e-07, + "loss": 0.4415, + "step": 8006 + }, + { + "epoch": 0.92, + "learning_rate": 3.7583708605571923e-07, + "loss": 0.4421, + "step": 8007 + }, + { + "epoch": 0.92, + "learning_rate": 3.748324561485128e-07, + "loss": 0.4674, + "step": 8008 + }, + { + "epoch": 0.92, + "learning_rate": 3.7382914510415316e-07, + "loss": 0.4332, + "step": 8009 + }, + { + "epoch": 0.92, + "learning_rate": 3.7282715306011465e-07, + "loss": 0.4402, + "step": 8010 + }, + { + "epoch": 0.92, + "learning_rate": 3.7182648015369524e-07, + "loss": 0.4535, + "step": 8011 + }, + { + "epoch": 0.92, + "learning_rate": 3.708271265220087e-07, + "loss": 0.4524, + "step": 8012 + }, + { + "epoch": 0.92, + "learning_rate": 3.698290923019865e-07, + "loss": 0.4364, + "step": 8013 + }, + { + "epoch": 0.92, + "learning_rate": 3.688323776303837e-07, + "loss": 0.4509, + "step": 8014 + }, + { + "epoch": 0.92, + "learning_rate": 3.678369826437733e-07, + "loss": 0.4523, + "step": 8015 + }, + { + "epoch": 0.92, + "learning_rate": 3.668429074785451e-07, + "loss": 0.4636, + "step": 8016 + }, + { + "epoch": 0.92, + "learning_rate": 3.6585015227091013e-07, + "loss": 0.4432, + "step": 8017 + }, + { + "epoch": 0.92, + "learning_rate": 3.6485871715689735e-07, + "loss": 0.4289, + "step": 8018 + }, + { + "epoch": 0.92, + "learning_rate": 3.63868602272357e-07, + "loss": 0.4573, + "step": 8019 + }, + { + "epoch": 0.92, + "learning_rate": 3.6287980775295603e-07, + "loss": 0.4517, + "step": 8020 + }, + { + "epoch": 0.92, + "learning_rate": 3.6189233373418064e-07, + "loss": 0.4437, + "step": 8021 + }, + { + "epoch": 0.92, + "learning_rate": 3.609061803513392e-07, + "loss": 0.4425, + "step": 8022 + }, + { + "epoch": 0.92, + "learning_rate": 3.5992134773955354e-07, + "loss": 0.4543, + "step": 8023 + }, + { + "epoch": 0.92, + "learning_rate": 3.589378360337692e-07, + "loss": 0.4701, + "step": 8024 + }, + { + "epoch": 0.92, + "learning_rate": 3.579556453687494e-07, + "loss": 
0.4521, + "step": 8025 + }, + { + "epoch": 0.92, + "learning_rate": 3.569747758790765e-07, + "loss": 0.4548, + "step": 8026 + }, + { + "epoch": 0.92, + "learning_rate": 3.5599522769915074e-07, + "loss": 0.4471, + "step": 8027 + }, + { + "epoch": 0.92, + "learning_rate": 3.550170009631926e-07, + "loss": 0.443, + "step": 8028 + }, + { + "epoch": 0.92, + "learning_rate": 3.5404009580524144e-07, + "loss": 0.4381, + "step": 8029 + }, + { + "epoch": 0.92, + "learning_rate": 3.5306451235915475e-07, + "loss": 0.4476, + "step": 8030 + }, + { + "epoch": 0.92, + "learning_rate": 3.520902507586077e-07, + "loss": 0.4408, + "step": 8031 + }, + { + "epoch": 0.92, + "learning_rate": 3.51117311137098e-07, + "loss": 0.4631, + "step": 8032 + }, + { + "epoch": 0.92, + "learning_rate": 3.50145693627939e-07, + "loss": 0.4327, + "step": 8033 + }, + { + "epoch": 0.92, + "learning_rate": 3.4917539836426317e-07, + "loss": 0.4601, + "step": 8034 + }, + { + "epoch": 0.92, + "learning_rate": 3.4820642547902516e-07, + "loss": 0.4565, + "step": 8035 + }, + { + "epoch": 0.92, + "learning_rate": 3.472387751049944e-07, + "loss": 0.4424, + "step": 8036 + }, + { + "epoch": 0.92, + "learning_rate": 3.462724473747603e-07, + "loss": 0.459, + "step": 8037 + }, + { + "epoch": 0.92, + "learning_rate": 3.4530744242073143e-07, + "loss": 0.4567, + "step": 8038 + }, + { + "epoch": 0.92, + "learning_rate": 3.443437603751354e-07, + "loss": 0.4382, + "step": 8039 + }, + { + "epoch": 0.92, + "learning_rate": 3.433814013700187e-07, + "loss": 0.4704, + "step": 8040 + }, + { + "epoch": 0.92, + "learning_rate": 3.424203655372438e-07, + "loss": 0.4377, + "step": 8041 + }, + { + "epoch": 0.92, + "learning_rate": 3.414606530084974e-07, + "loss": 0.4581, + "step": 8042 + }, + { + "epoch": 0.92, + "learning_rate": 3.405022639152777e-07, + "loss": 0.4452, + "step": 8043 + }, + { + "epoch": 0.92, + "learning_rate": 3.3954519838890866e-07, + "loss": 0.4596, + "step": 8044 + }, + { + "epoch": 0.92, + "learning_rate": 
3.3858945656052855e-07, + "loss": 0.4302, + "step": 8045 + }, + { + "epoch": 0.92, + "learning_rate": 3.376350385610938e-07, + "loss": 0.4521, + "step": 8046 + }, + { + "epoch": 0.92, + "learning_rate": 3.3668194452138423e-07, + "loss": 0.4579, + "step": 8047 + }, + { + "epoch": 0.92, + "learning_rate": 3.357301745719932e-07, + "loss": 0.4506, + "step": 8048 + }, + { + "epoch": 0.92, + "learning_rate": 3.34779728843333e-07, + "loss": 0.4377, + "step": 8049 + }, + { + "epoch": 0.92, + "learning_rate": 3.3383060746563836e-07, + "loss": 0.4593, + "step": 8050 + }, + { + "epoch": 0.92, + "learning_rate": 3.3288281056895746e-07, + "loss": 0.4456, + "step": 8051 + }, + { + "epoch": 0.92, + "learning_rate": 3.3193633828316306e-07, + "loss": 0.4628, + "step": 8052 + }, + { + "epoch": 0.92, + "learning_rate": 3.309911907379393e-07, + "loss": 0.4529, + "step": 8053 + }, + { + "epoch": 0.92, + "learning_rate": 3.300473680627947e-07, + "loss": 0.4455, + "step": 8054 + }, + { + "epoch": 0.92, + "learning_rate": 3.2910487038705476e-07, + "loss": 0.4394, + "step": 8055 + }, + { + "epoch": 0.92, + "learning_rate": 3.2816369783986166e-07, + "loss": 0.463, + "step": 8056 + }, + { + "epoch": 0.92, + "learning_rate": 3.2722385055017567e-07, + "loss": 0.4431, + "step": 8057 + }, + { + "epoch": 0.92, + "learning_rate": 3.262853286467804e-07, + "loss": 0.4564, + "step": 8058 + }, + { + "epoch": 0.92, + "learning_rate": 3.2534813225826965e-07, + "loss": 0.4397, + "step": 8059 + }, + { + "epoch": 0.92, + "learning_rate": 3.2441226151306403e-07, + "loss": 0.4575, + "step": 8060 + }, + { + "epoch": 0.92, + "learning_rate": 3.234777165393965e-07, + "loss": 0.4362, + "step": 8061 + }, + { + "epoch": 0.92, + "learning_rate": 3.2254449746532246e-07, + "loss": 0.4318, + "step": 8062 + }, + { + "epoch": 0.92, + "learning_rate": 3.216126044187118e-07, + "loss": 0.4545, + "step": 8063 + }, + { + "epoch": 0.92, + "learning_rate": 3.206820375272557e-07, + "loss": 0.462, + "step": 8064 + }, + { + 
"epoch": 0.92, + "learning_rate": 3.1975279691846437e-07, + "loss": 0.4365, + "step": 8065 + }, + { + "epoch": 0.92, + "learning_rate": 3.188248827196616e-07, + "loss": 0.4711, + "step": 8066 + }, + { + "epoch": 0.92, + "learning_rate": 3.178982950579923e-07, + "loss": 0.4475, + "step": 8067 + }, + { + "epoch": 0.92, + "learning_rate": 3.169730340604227e-07, + "loss": 0.4419, + "step": 8068 + }, + { + "epoch": 0.92, + "learning_rate": 3.160490998537313e-07, + "loss": 0.4292, + "step": 8069 + }, + { + "epoch": 0.92, + "learning_rate": 3.151264925645192e-07, + "loss": 0.4678, + "step": 8070 + }, + { + "epoch": 0.92, + "learning_rate": 3.142052123192019e-07, + "loss": 0.4485, + "step": 8071 + }, + { + "epoch": 0.92, + "learning_rate": 3.132852592440194e-07, + "loss": 0.4481, + "step": 8072 + }, + { + "epoch": 0.92, + "learning_rate": 3.1236663346502215e-07, + "loss": 0.4427, + "step": 8073 + }, + { + "epoch": 0.92, + "learning_rate": 3.11449335108085e-07, + "loss": 0.4701, + "step": 8074 + }, + { + "epoch": 0.92, + "learning_rate": 3.1053336429889616e-07, + "loss": 0.4609, + "step": 8075 + }, + { + "epoch": 0.92, + "learning_rate": 3.0961872116296645e-07, + "loss": 0.4481, + "step": 8076 + }, + { + "epoch": 0.92, + "learning_rate": 3.0870540582562003e-07, + "loss": 0.4279, + "step": 8077 + }, + { + "epoch": 0.92, + "learning_rate": 3.077934184120035e-07, + "loss": 0.4586, + "step": 8078 + }, + { + "epoch": 0.92, + "learning_rate": 3.06882759047078e-07, + "loss": 0.4453, + "step": 8079 + }, + { + "epoch": 0.92, + "learning_rate": 3.059734278556237e-07, + "loss": 0.4442, + "step": 8080 + }, + { + "epoch": 0.92, + "learning_rate": 3.050654249622398e-07, + "loss": 0.4565, + "step": 8081 + }, + { + "epoch": 0.92, + "learning_rate": 3.0415875049134566e-07, + "loss": 0.4458, + "step": 8082 + }, + { + "epoch": 0.92, + "learning_rate": 3.03253404567172e-07, + "loss": 0.4353, + "step": 8083 + }, + { + "epoch": 0.92, + "learning_rate": 3.0234938731377394e-07, + "loss": 0.4717, + 
"step": 8084 + }, + { + "epoch": 0.92, + "learning_rate": 3.014466988550202e-07, + "loss": 0.4587, + "step": 8085 + }, + { + "epoch": 0.92, + "learning_rate": 3.0054533931460186e-07, + "loss": 0.4576, + "step": 8086 + }, + { + "epoch": 0.92, + "learning_rate": 2.996453088160234e-07, + "loss": 0.4396, + "step": 8087 + }, + { + "epoch": 0.92, + "learning_rate": 2.9874660748260843e-07, + "loss": 0.443, + "step": 8088 + }, + { + "epoch": 0.92, + "learning_rate": 2.978492354375007e-07, + "loss": 0.4552, + "step": 8089 + }, + { + "epoch": 0.92, + "learning_rate": 2.969531928036595e-07, + "loss": 0.4378, + "step": 8090 + }, + { + "epoch": 0.92, + "learning_rate": 2.9605847970386125e-07, + "loss": 0.4272, + "step": 8091 + }, + { + "epoch": 0.92, + "learning_rate": 2.9516509626070553e-07, + "loss": 0.4468, + "step": 8092 + }, + { + "epoch": 0.92, + "learning_rate": 2.9427304259660117e-07, + "loss": 0.4672, + "step": 8093 + }, + { + "epoch": 0.93, + "learning_rate": 2.9338231883378365e-07, + "loss": 0.4169, + "step": 8094 + }, + { + "epoch": 0.93, + "learning_rate": 2.924929250942998e-07, + "loss": 0.4546, + "step": 8095 + }, + { + "epoch": 0.93, + "learning_rate": 2.9160486150001556e-07, + "loss": 0.4413, + "step": 8096 + }, + { + "epoch": 0.93, + "learning_rate": 2.907181281726179e-07, + "loss": 0.4463, + "step": 8097 + }, + { + "epoch": 0.93, + "learning_rate": 2.8983272523360637e-07, + "loss": 0.4412, + "step": 8098 + }, + { + "epoch": 0.93, + "learning_rate": 2.889486528043028e-07, + "loss": 0.4416, + "step": 8099 + }, + { + "epoch": 0.93, + "learning_rate": 2.880659110058448e-07, + "loss": 0.4381, + "step": 8100 + }, + { + "epoch": 0.93, + "learning_rate": 2.8718449995918553e-07, + "loss": 0.4511, + "step": 8101 + }, + { + "epoch": 0.93, + "learning_rate": 2.863044197851017e-07, + "loss": 0.463, + "step": 8102 + }, + { + "epoch": 0.93, + "learning_rate": 2.8542567060418135e-07, + "loss": 0.4676, + "step": 8103 + }, + { + "epoch": 0.93, + "learning_rate": 
2.845482525368337e-07, + "loss": 0.43, + "step": 8104 + }, + { + "epoch": 0.93, + "learning_rate": 2.836721657032848e-07, + "loss": 0.4403, + "step": 8105 + }, + { + "epoch": 0.93, + "learning_rate": 2.8279741022357535e-07, + "loss": 0.444, + "step": 8106 + }, + { + "epoch": 0.93, + "learning_rate": 2.8192398621757156e-07, + "loss": 0.4691, + "step": 8107 + }, + { + "epoch": 0.93, + "learning_rate": 2.810518938049478e-07, + "loss": 0.4507, + "step": 8108 + }, + { + "epoch": 0.93, + "learning_rate": 2.801811331052007e-07, + "loss": 0.4691, + "step": 8109 + }, + { + "epoch": 0.93, + "learning_rate": 2.7931170423764363e-07, + "loss": 0.4466, + "step": 8110 + }, + { + "epoch": 0.93, + "learning_rate": 2.784436073214103e-07, + "loss": 0.4667, + "step": 8111 + }, + { + "epoch": 0.93, + "learning_rate": 2.775768424754488e-07, + "loss": 0.443, + "step": 8112 + }, + { + "epoch": 0.93, + "learning_rate": 2.7671140981852306e-07, + "loss": 0.4504, + "step": 8113 + }, + { + "epoch": 0.93, + "learning_rate": 2.7584730946921825e-07, + "loss": 0.442, + "step": 8114 + }, + { + "epoch": 0.93, + "learning_rate": 2.7498454154593624e-07, + "loss": 0.4698, + "step": 8115 + }, + { + "epoch": 0.93, + "learning_rate": 2.741231061668925e-07, + "loss": 0.4184, + "step": 8116 + }, + { + "epoch": 0.93, + "learning_rate": 2.73263003450126e-07, + "loss": 0.4488, + "step": 8117 + }, + { + "epoch": 0.93, + "learning_rate": 2.72404233513488e-07, + "loss": 0.4538, + "step": 8118 + }, + { + "epoch": 0.93, + "learning_rate": 2.71546796474651e-07, + "loss": 0.4525, + "step": 8119 + }, + { + "epoch": 0.93, + "learning_rate": 2.70690692451101e-07, + "loss": 0.4407, + "step": 8120 + }, + { + "epoch": 0.93, + "learning_rate": 2.698359215601443e-07, + "loss": 0.4438, + "step": 8121 + }, + { + "epoch": 0.93, + "learning_rate": 2.689824839189037e-07, + "loss": 0.4545, + "step": 8122 + }, + { + "epoch": 0.93, + "learning_rate": 2.681303796443202e-07, + "loss": 0.4361, + "step": 8123 + }, + { + "epoch": 0.93, + 
"learning_rate": 2.672796088531493e-07, + "loss": 0.4593, + "step": 8124 + }, + { + "epoch": 0.93, + "learning_rate": 2.664301716619666e-07, + "loss": 0.4915, + "step": 8125 + }, + { + "epoch": 0.93, + "learning_rate": 2.655820681871635e-07, + "loss": 0.4487, + "step": 8126 + }, + { + "epoch": 0.93, + "learning_rate": 2.6473529854494915e-07, + "loss": 0.4443, + "step": 8127 + }, + { + "epoch": 0.93, + "learning_rate": 2.638898628513498e-07, + "loss": 0.443, + "step": 8128 + }, + { + "epoch": 0.93, + "learning_rate": 2.6304576122221035e-07, + "loss": 0.4475, + "step": 8129 + }, + { + "epoch": 0.93, + "learning_rate": 2.6220299377318847e-07, + "loss": 0.4359, + "step": 8130 + }, + { + "epoch": 0.93, + "learning_rate": 2.613615606197661e-07, + "loss": 0.444, + "step": 8131 + }, + { + "epoch": 0.93, + "learning_rate": 2.605214618772356e-07, + "loss": 0.4468, + "step": 8132 + }, + { + "epoch": 0.93, + "learning_rate": 2.596826976607114e-07, + "loss": 0.4594, + "step": 8133 + }, + { + "epoch": 0.93, + "learning_rate": 2.5884526808511946e-07, + "loss": 0.4441, + "step": 8134 + }, + { + "epoch": 0.93, + "learning_rate": 2.5800917326521013e-07, + "loss": 0.4514, + "step": 8135 + }, + { + "epoch": 0.93, + "learning_rate": 2.5717441331554517e-07, + "loss": 0.4474, + "step": 8136 + }, + { + "epoch": 0.93, + "learning_rate": 2.5634098835050415e-07, + "loss": 0.439, + "step": 8137 + }, + { + "epoch": 0.93, + "learning_rate": 2.555088984842868e-07, + "loss": 0.4606, + "step": 8138 + }, + { + "epoch": 0.93, + "learning_rate": 2.546781438309087e-07, + "loss": 0.4344, + "step": 8139 + }, + { + "epoch": 0.93, + "learning_rate": 2.5384872450419985e-07, + "loss": 0.4299, + "step": 8140 + }, + { + "epoch": 0.93, + "learning_rate": 2.530206406178104e-07, + "loss": 0.4699, + "step": 8141 + }, + { + "epoch": 0.93, + "learning_rate": 2.5219389228520517e-07, + "loss": 0.452, + "step": 8142 + }, + { + "epoch": 0.93, + "learning_rate": 2.51368479619668e-07, + "loss": 0.4446, + "step": 8143 + 
}, + { + "epoch": 0.93, + "learning_rate": 2.505444027342996e-07, + "loss": 0.4496, + "step": 8144 + }, + { + "epoch": 0.93, + "learning_rate": 2.497216617420151e-07, + "loss": 0.4444, + "step": 8145 + }, + { + "epoch": 0.93, + "learning_rate": 2.4890025675554983e-07, + "loss": 0.4439, + "step": 8146 + }, + { + "epoch": 0.93, + "learning_rate": 2.480801878874528e-07, + "loss": 0.4742, + "step": 8147 + }, + { + "epoch": 0.93, + "learning_rate": 2.4726145525009404e-07, + "loss": 0.4355, + "step": 8148 + }, + { + "epoch": 0.93, + "learning_rate": 2.4644405895565717e-07, + "loss": 0.4577, + "step": 8149 + }, + { + "epoch": 0.93, + "learning_rate": 2.456279991161437e-07, + "loss": 0.4551, + "step": 8150 + }, + { + "epoch": 0.93, + "learning_rate": 2.448132758433719e-07, + "loss": 0.4834, + "step": 8151 + }, + { + "epoch": 0.93, + "learning_rate": 2.439998892489781e-07, + "loss": 0.4243, + "step": 8152 + }, + { + "epoch": 0.93, + "learning_rate": 2.4318783944441314e-07, + "loss": 0.458, + "step": 8153 + }, + { + "epoch": 0.93, + "learning_rate": 2.4237712654094693e-07, + "loss": 0.4451, + "step": 8154 + }, + { + "epoch": 0.93, + "learning_rate": 2.4156775064966273e-07, + "loss": 0.4431, + "step": 8155 + }, + { + "epoch": 0.93, + "learning_rate": 2.4075971188146754e-07, + "loss": 0.4606, + "step": 8156 + }, + { + "epoch": 0.93, + "learning_rate": 2.3995301034707597e-07, + "loss": 0.451, + "step": 8157 + }, + { + "epoch": 0.93, + "learning_rate": 2.3914764615702747e-07, + "loss": 0.4384, + "step": 8158 + }, + { + "epoch": 0.93, + "learning_rate": 2.3834361942167484e-07, + "loss": 0.4686, + "step": 8159 + }, + { + "epoch": 0.93, + "learning_rate": 2.375409302511855e-07, + "loss": 0.4473, + "step": 8160 + }, + { + "epoch": 0.93, + "learning_rate": 2.367395787555482e-07, + "loss": 0.4443, + "step": 8161 + }, + { + "epoch": 0.93, + "learning_rate": 2.3593956504456396e-07, + "loss": 0.4512, + "step": 8162 + }, + { + "epoch": 0.93, + "learning_rate": 2.3514088922785284e-07, + 
"loss": 0.4504, + "step": 8163 + }, + { + "epoch": 0.93, + "learning_rate": 2.3434355141485287e-07, + "loss": 0.4591, + "step": 8164 + }, + { + "epoch": 0.93, + "learning_rate": 2.335475517148167e-07, + "loss": 0.4477, + "step": 8165 + }, + { + "epoch": 0.93, + "learning_rate": 2.3275289023681148e-07, + "loss": 0.4334, + "step": 8166 + }, + { + "epoch": 0.93, + "learning_rate": 2.3195956708972566e-07, + "loss": 0.462, + "step": 8167 + }, + { + "epoch": 0.93, + "learning_rate": 2.3116758238226233e-07, + "loss": 0.4558, + "step": 8168 + }, + { + "epoch": 0.93, + "learning_rate": 2.3037693622294244e-07, + "loss": 0.4568, + "step": 8169 + }, + { + "epoch": 0.93, + "learning_rate": 2.2958762872009932e-07, + "loss": 0.4401, + "step": 8170 + }, + { + "epoch": 0.93, + "learning_rate": 2.2879965998188646e-07, + "loss": 0.4245, + "step": 8171 + }, + { + "epoch": 0.93, + "learning_rate": 2.280130301162742e-07, + "loss": 0.4457, + "step": 8172 + }, + { + "epoch": 0.93, + "learning_rate": 2.2722773923104736e-07, + "loss": 0.4725, + "step": 8173 + }, + { + "epoch": 0.93, + "learning_rate": 2.264437874338099e-07, + "loss": 0.4427, + "step": 8174 + }, + { + "epoch": 0.93, + "learning_rate": 2.2566117483197923e-07, + "loss": 0.4492, + "step": 8175 + }, + { + "epoch": 0.93, + "learning_rate": 2.248799015327907e-07, + "loss": 0.4409, + "step": 8176 + }, + { + "epoch": 0.93, + "learning_rate": 2.2409996764329644e-07, + "loss": 0.431, + "step": 8177 + }, + { + "epoch": 0.93, + "learning_rate": 2.233213732703665e-07, + "loss": 0.4616, + "step": 8178 + }, + { + "epoch": 0.93, + "learning_rate": 2.2254411852068226e-07, + "loss": 0.442, + "step": 8179 + }, + { + "epoch": 0.93, + "learning_rate": 2.2176820350074846e-07, + "loss": 0.4572, + "step": 8180 + }, + { + "epoch": 0.93, + "learning_rate": 2.2099362831688008e-07, + "loss": 0.4326, + "step": 8181 + }, + { + "epoch": 0.94, + "learning_rate": 2.2022039307521337e-07, + "loss": 0.4631, + "step": 8182 + }, + { + "epoch": 0.94, + 
"learning_rate": 2.1944849788169798e-07, + "loss": 0.4388, + "step": 8183 + }, + { + "epoch": 0.94, + "learning_rate": 2.1867794284209932e-07, + "loss": 0.4536, + "step": 8184 + }, + { + "epoch": 0.94, + "learning_rate": 2.179087280620018e-07, + "loss": 0.4582, + "step": 8185 + }, + { + "epoch": 0.94, + "learning_rate": 2.1714085364680671e-07, + "loss": 0.4496, + "step": 8186 + }, + { + "epoch": 0.94, + "learning_rate": 2.163743197017265e-07, + "loss": 0.4677, + "step": 8187 + }, + { + "epoch": 0.94, + "learning_rate": 2.156091263317972e-07, + "loss": 0.4457, + "step": 8188 + }, + { + "epoch": 0.94, + "learning_rate": 2.1484527364186492e-07, + "loss": 0.4335, + "step": 8189 + }, + { + "epoch": 0.94, + "learning_rate": 2.140827617365948e-07, + "loss": 0.4598, + "step": 8190 + }, + { + "epoch": 0.94, + "learning_rate": 2.1332159072046887e-07, + "loss": 0.4615, + "step": 8191 + }, + { + "epoch": 0.94, + "learning_rate": 2.1256176069778367e-07, + "loss": 0.4394, + "step": 8192 + }, + { + "epoch": 0.94, + "learning_rate": 2.118032717726537e-07, + "loss": 0.4442, + "step": 8193 + }, + { + "epoch": 0.94, + "learning_rate": 2.1104612404900805e-07, + "loss": 0.4259, + "step": 8194 + }, + { + "epoch": 0.94, + "learning_rate": 2.102903176305926e-07, + "loss": 0.461, + "step": 8195 + }, + { + "epoch": 0.94, + "learning_rate": 2.0953585262097232e-07, + "loss": 0.451, + "step": 8196 + }, + { + "epoch": 0.94, + "learning_rate": 2.0878272912352117e-07, + "loss": 0.455, + "step": 8197 + }, + { + "epoch": 0.94, + "learning_rate": 2.0803094724143879e-07, + "loss": 0.4479, + "step": 8198 + }, + { + "epoch": 0.94, + "learning_rate": 2.0728050707773285e-07, + "loss": 0.4658, + "step": 8199 + }, + { + "epoch": 0.94, + "learning_rate": 2.0653140873523104e-07, + "loss": 0.4647, + "step": 8200 + }, + { + "epoch": 0.94, + "learning_rate": 2.0578365231657792e-07, + "loss": 0.4496, + "step": 8201 + }, + { + "epoch": 0.94, + "learning_rate": 2.0503723792423047e-07, + "loss": 0.4439, + "step": 
8202 + }, + { + "epoch": 0.94, + "learning_rate": 2.0429216566046682e-07, + "loss": 0.4974, + "step": 8203 + }, + { + "epoch": 0.94, + "learning_rate": 2.0354843562737537e-07, + "loss": 0.4585, + "step": 8204 + }, + { + "epoch": 0.94, + "learning_rate": 2.0280604792686676e-07, + "loss": 0.4521, + "step": 8205 + }, + { + "epoch": 0.94, + "learning_rate": 2.0206500266066297e-07, + "loss": 0.4531, + "step": 8206 + }, + { + "epoch": 0.94, + "learning_rate": 2.0132529993030392e-07, + "loss": 0.4476, + "step": 8207 + }, + { + "epoch": 0.94, + "learning_rate": 2.0058693983714628e-07, + "loss": 0.4422, + "step": 8208 + }, + { + "epoch": 0.94, + "learning_rate": 1.9984992248236135e-07, + "loss": 0.4532, + "step": 8209 + }, + { + "epoch": 0.94, + "learning_rate": 1.9911424796693611e-07, + "loss": 0.4243, + "step": 8210 + }, + { + "epoch": 0.94, + "learning_rate": 1.9837991639167552e-07, + "loss": 0.4565, + "step": 8211 + }, + { + "epoch": 0.94, + "learning_rate": 1.9764692785719909e-07, + "loss": 0.426, + "step": 8212 + }, + { + "epoch": 0.94, + "learning_rate": 1.9691528246394197e-07, + "loss": 0.4706, + "step": 8213 + }, + { + "epoch": 0.94, + "learning_rate": 1.9618498031215738e-07, + "loss": 0.4329, + "step": 8214 + }, + { + "epoch": 0.94, + "learning_rate": 1.954560215019108e-07, + "loss": 0.4393, + "step": 8215 + }, + { + "epoch": 0.94, + "learning_rate": 1.9472840613308787e-07, + "loss": 0.4472, + "step": 8216 + }, + { + "epoch": 0.94, + "learning_rate": 1.9400213430538773e-07, + "loss": 0.4587, + "step": 8217 + }, + { + "epoch": 0.94, + "learning_rate": 1.9327720611832523e-07, + "loss": 0.4529, + "step": 8218 + }, + { + "epoch": 0.94, + "learning_rate": 1.9255362167123316e-07, + "loss": 0.4603, + "step": 8219 + }, + { + "epoch": 0.94, + "learning_rate": 1.918313810632566e-07, + "loss": 0.4376, + "step": 8220 + }, + { + "epoch": 0.94, + "learning_rate": 1.9111048439335978e-07, + "loss": 0.4576, + "step": 8221 + }, + { + "epoch": 0.94, + "learning_rate": 
1.903909317603214e-07, + "loss": 0.4322, + "step": 8222 + }, + { + "epoch": 0.94, + "learning_rate": 1.89672723262736e-07, + "loss": 0.4532, + "step": 8223 + }, + { + "epoch": 0.94, + "learning_rate": 1.889558589990148e-07, + "loss": 0.459, + "step": 8224 + }, + { + "epoch": 0.94, + "learning_rate": 1.882403390673837e-07, + "loss": 0.4548, + "step": 8225 + }, + { + "epoch": 0.94, + "learning_rate": 1.8752616356588648e-07, + "loss": 0.4389, + "step": 8226 + }, + { + "epoch": 0.94, + "learning_rate": 1.8681333259237933e-07, + "loss": 0.4535, + "step": 8227 + }, + { + "epoch": 0.94, + "learning_rate": 1.861018462445352e-07, + "loss": 0.4543, + "step": 8228 + }, + { + "epoch": 0.94, + "learning_rate": 1.8539170461984612e-07, + "loss": 0.4711, + "step": 8229 + }, + { + "epoch": 0.94, + "learning_rate": 1.8468290781561538e-07, + "loss": 0.4632, + "step": 8230 + }, + { + "epoch": 0.94, + "learning_rate": 1.8397545592896527e-07, + "loss": 0.4474, + "step": 8231 + }, + { + "epoch": 0.94, + "learning_rate": 1.832693490568327e-07, + "loss": 0.4422, + "step": 8232 + }, + { + "epoch": 0.94, + "learning_rate": 1.8256458729596692e-07, + "loss": 0.4676, + "step": 8233 + }, + { + "epoch": 0.94, + "learning_rate": 1.8186117074293964e-07, + "loss": 0.4515, + "step": 8234 + }, + { + "epoch": 0.94, + "learning_rate": 1.811590994941337e-07, + "loss": 0.4537, + "step": 8235 + }, + { + "epoch": 0.94, + "learning_rate": 1.804583736457477e-07, + "loss": 0.437, + "step": 8236 + }, + { + "epoch": 0.94, + "learning_rate": 1.797589932937982e-07, + "loss": 0.4531, + "step": 8237 + }, + { + "epoch": 0.94, + "learning_rate": 1.790609585341141e-07, + "loss": 0.4455, + "step": 8238 + }, + { + "epoch": 0.94, + "learning_rate": 1.7836426946234332e-07, + "loss": 0.4547, + "step": 8239 + }, + { + "epoch": 0.94, + "learning_rate": 1.7766892617394727e-07, + "loss": 0.4408, + "step": 8240 + }, + { + "epoch": 0.94, + "learning_rate": 1.7697492876420198e-07, + "loss": 0.4442, + "step": 8241 + }, + { + 
"epoch": 0.94, + "learning_rate": 1.7628227732820247e-07, + "loss": 0.4326, + "step": 8242 + }, + { + "epoch": 0.94, + "learning_rate": 1.755909719608573e-07, + "loss": 0.4699, + "step": 8243 + }, + { + "epoch": 0.94, + "learning_rate": 1.7490101275689064e-07, + "loss": 0.459, + "step": 8244 + }, + { + "epoch": 0.94, + "learning_rate": 1.7421239981084136e-07, + "loss": 0.4532, + "step": 8245 + }, + { + "epoch": 0.94, + "learning_rate": 1.7352513321706621e-07, + "loss": 0.4455, + "step": 8246 + }, + { + "epoch": 0.94, + "learning_rate": 1.7283921306973538e-07, + "loss": 0.4646, + "step": 8247 + }, + { + "epoch": 0.94, + "learning_rate": 1.7215463946283483e-07, + "loss": 0.4467, + "step": 8248 + }, + { + "epoch": 0.94, + "learning_rate": 1.714714124901662e-07, + "loss": 0.4495, + "step": 8249 + }, + { + "epoch": 0.94, + "learning_rate": 1.70789532245349e-07, + "loss": 0.4299, + "step": 8250 + }, + { + "epoch": 0.94, + "learning_rate": 1.70108998821813e-07, + "loss": 0.456, + "step": 8251 + }, + { + "epoch": 0.94, + "learning_rate": 1.6942981231280798e-07, + "loss": 0.4508, + "step": 8252 + }, + { + "epoch": 0.94, + "learning_rate": 1.6875197281139844e-07, + "loss": 0.4679, + "step": 8253 + }, + { + "epoch": 0.94, + "learning_rate": 1.680754804104623e-07, + "loss": 0.4539, + "step": 8254 + }, + { + "epoch": 0.94, + "learning_rate": 1.6740033520269538e-07, + "loss": 0.458, + "step": 8255 + }, + { + "epoch": 0.94, + "learning_rate": 1.6672653728060594e-07, + "loss": 0.4426, + "step": 8256 + }, + { + "epoch": 0.94, + "learning_rate": 1.6605408673652012e-07, + "loss": 0.4507, + "step": 8257 + }, + { + "epoch": 0.94, + "learning_rate": 1.6538298366257975e-07, + "loss": 0.4408, + "step": 8258 + }, + { + "epoch": 0.94, + "learning_rate": 1.647132281507391e-07, + "loss": 0.4554, + "step": 8259 + }, + { + "epoch": 0.94, + "learning_rate": 1.6404482029277023e-07, + "loss": 0.4514, + "step": 8260 + }, + { + "epoch": 0.94, + "learning_rate": 1.6337776018026108e-07, + "loss": 
0.449, + "step": 8261 + }, + { + "epoch": 0.94, + "learning_rate": 1.627120479046118e-07, + "loss": 0.4437, + "step": 8262 + }, + { + "epoch": 0.94, + "learning_rate": 1.620476835570417e-07, + "loss": 0.4701, + "step": 8263 + }, + { + "epoch": 0.94, + "learning_rate": 1.6138466722858237e-07, + "loss": 0.4428, + "step": 8264 + }, + { + "epoch": 0.94, + "learning_rate": 1.6072299901008226e-07, + "loss": 0.4623, + "step": 8265 + }, + { + "epoch": 0.94, + "learning_rate": 1.6006267899220552e-07, + "loss": 0.4378, + "step": 8266 + }, + { + "epoch": 0.94, + "learning_rate": 1.5940370726542864e-07, + "loss": 0.4599, + "step": 8267 + }, + { + "epoch": 0.94, + "learning_rate": 1.587460839200472e-07, + "loss": 0.4498, + "step": 8268 + }, + { + "epoch": 0.95, + "learning_rate": 1.580898090461691e-07, + "loss": 0.4708, + "step": 8269 + }, + { + "epoch": 0.95, + "learning_rate": 1.5743488273372133e-07, + "loss": 0.4397, + "step": 8270 + }, + { + "epoch": 0.95, + "learning_rate": 1.567813050724387e-07, + "loss": 0.4513, + "step": 8271 + }, + { + "epoch": 0.95, + "learning_rate": 1.5612907615187967e-07, + "loss": 0.4539, + "step": 8272 + }, + { + "epoch": 0.95, + "learning_rate": 1.554781960614138e-07, + "loss": 0.467, + "step": 8273 + }, + { + "epoch": 0.95, + "learning_rate": 1.548286648902253e-07, + "loss": 0.4331, + "step": 8274 + }, + { + "epoch": 0.95, + "learning_rate": 1.5418048272731413e-07, + "loss": 0.438, + "step": 8275 + }, + { + "epoch": 0.95, + "learning_rate": 1.5353364966149697e-07, + "loss": 0.4538, + "step": 8276 + }, + { + "epoch": 0.95, + "learning_rate": 1.5288816578140298e-07, + "loss": 0.4652, + "step": 8277 + }, + { + "epoch": 0.95, + "learning_rate": 1.5224403117547916e-07, + "loss": 0.4538, + "step": 8278 + }, + { + "epoch": 0.95, + "learning_rate": 1.51601245931986e-07, + "loss": 0.4449, + "step": 8279 + }, + { + "epoch": 0.95, + "learning_rate": 1.5095981013899863e-07, + "loss": 0.435, + "step": 8280 + }, + { + "epoch": 0.95, + "learning_rate": 
1.5031972388440787e-07, + "loss": 0.4511, + "step": 8281 + }, + { + "epoch": 0.95, + "learning_rate": 1.4968098725592127e-07, + "loss": 0.445, + "step": 8282 + }, + { + "epoch": 0.95, + "learning_rate": 1.4904360034106e-07, + "loss": 0.4407, + "step": 8283 + }, + { + "epoch": 0.95, + "learning_rate": 1.4840756322715866e-07, + "loss": 0.4414, + "step": 8284 + }, + { + "epoch": 0.95, + "learning_rate": 1.477728760013697e-07, + "loss": 0.4752, + "step": 8285 + }, + { + "epoch": 0.95, + "learning_rate": 1.4713953875065912e-07, + "loss": 0.4692, + "step": 8286 + }, + { + "epoch": 0.95, + "learning_rate": 1.4650755156180973e-07, + "loss": 0.4513, + "step": 8287 + }, + { + "epoch": 0.95, + "learning_rate": 1.458769145214145e-07, + "loss": 0.4425, + "step": 8288 + }, + { + "epoch": 0.95, + "learning_rate": 1.4524762771588763e-07, + "loss": 0.4558, + "step": 8289 + }, + { + "epoch": 0.95, + "learning_rate": 1.4461969123145458e-07, + "loss": 0.4535, + "step": 8290 + }, + { + "epoch": 0.95, + "learning_rate": 1.4399310515415655e-07, + "loss": 0.4321, + "step": 8291 + }, + { + "epoch": 0.95, + "learning_rate": 1.4336786956985038e-07, + "loss": 0.4441, + "step": 8292 + }, + { + "epoch": 0.95, + "learning_rate": 1.4274398456420647e-07, + "loss": 0.4445, + "step": 8293 + }, + { + "epoch": 0.95, + "learning_rate": 1.4212145022271196e-07, + "loss": 0.4637, + "step": 8294 + }, + { + "epoch": 0.95, + "learning_rate": 1.415002666306664e-07, + "loss": 0.4635, + "step": 8295 + }, + { + "epoch": 0.95, + "learning_rate": 1.4088043387318838e-07, + "loss": 0.4472, + "step": 8296 + }, + { + "epoch": 0.95, + "learning_rate": 1.4026195203520666e-07, + "loss": 0.4377, + "step": 8297 + }, + { + "epoch": 0.95, + "learning_rate": 1.3964482120146672e-07, + "loss": 0.4463, + "step": 8298 + }, + { + "epoch": 0.95, + "learning_rate": 1.3902904145653094e-07, + "loss": 0.4532, + "step": 8299 + }, + { + "epoch": 0.95, + "learning_rate": 1.384146128847741e-07, + "loss": 0.4427, + "step": 8300 + }, + { + 
"epoch": 0.95, + "learning_rate": 1.3780153557038655e-07, + "loss": 0.4533, + "step": 8301 + }, + { + "epoch": 0.95, + "learning_rate": 1.3718980959737448e-07, + "loss": 0.4675, + "step": 8302 + }, + { + "epoch": 0.95, + "learning_rate": 1.365794350495564e-07, + "loss": 0.4572, + "step": 8303 + }, + { + "epoch": 0.95, + "learning_rate": 1.359704120105687e-07, + "loss": 0.4524, + "step": 8304 + }, + { + "epoch": 0.95, + "learning_rate": 1.3536274056386134e-07, + "loss": 0.4236, + "step": 8305 + }, + { + "epoch": 0.95, + "learning_rate": 1.3475642079269659e-07, + "loss": 0.4386, + "step": 8306 + }, + { + "epoch": 0.95, + "learning_rate": 1.3415145278015575e-07, + "loss": 0.4413, + "step": 8307 + }, + { + "epoch": 0.95, + "learning_rate": 1.335478366091325e-07, + "loss": 0.4468, + "step": 8308 + }, + { + "epoch": 0.95, + "learning_rate": 1.329455723623352e-07, + "loss": 0.4596, + "step": 8309 + }, + { + "epoch": 0.95, + "learning_rate": 1.3234466012228887e-07, + "loss": 0.467, + "step": 8310 + }, + { + "epoch": 0.95, + "learning_rate": 1.31745099971331e-07, + "loss": 0.4309, + "step": 8311 + }, + { + "epoch": 0.95, + "learning_rate": 1.3114689199161478e-07, + "loss": 0.4875, + "step": 8312 + }, + { + "epoch": 0.95, + "learning_rate": 1.3055003626510687e-07, + "loss": 0.4334, + "step": 8313 + }, + { + "epoch": 0.95, + "learning_rate": 1.2995453287359293e-07, + "loss": 0.4388, + "step": 8314 + }, + { + "epoch": 0.95, + "learning_rate": 1.2936038189866773e-07, + "loss": 0.4582, + "step": 8315 + }, + { + "epoch": 0.95, + "learning_rate": 1.287675834217428e-07, + "loss": 0.4398, + "step": 8316 + }, + { + "epoch": 0.95, + "learning_rate": 1.2817613752404646e-07, + "loss": 0.4413, + "step": 8317 + }, + { + "epoch": 0.95, + "learning_rate": 1.2758604428661836e-07, + "loss": 0.469, + "step": 8318 + }, + { + "epoch": 0.95, + "learning_rate": 1.2699730379031604e-07, + "loss": 0.4525, + "step": 8319 + }, + { + "epoch": 0.95, + "learning_rate": 1.2640991611580943e-07, + "loss": 
0.4553, + "step": 8320 + }, + { + "epoch": 0.95, + "learning_rate": 1.2582388134358414e-07, + "loss": 0.469, + "step": 8321 + }, + { + "epoch": 0.95, + "learning_rate": 1.2523919955393925e-07, + "loss": 0.4566, + "step": 8322 + }, + { + "epoch": 0.95, + "learning_rate": 1.246558708269896e-07, + "loss": 0.4418, + "step": 8323 + }, + { + "epoch": 0.95, + "learning_rate": 1.2407389524266456e-07, + "loss": 0.4348, + "step": 8324 + }, + { + "epoch": 0.95, + "learning_rate": 1.23493272880707e-07, + "loss": 0.4491, + "step": 8325 + }, + { + "epoch": 0.95, + "learning_rate": 1.2291400382067553e-07, + "loss": 0.4521, + "step": 8326 + }, + { + "epoch": 0.95, + "learning_rate": 1.223360881419433e-07, + "loss": 0.4571, + "step": 8327 + }, + { + "epoch": 0.95, + "learning_rate": 1.21759525923697e-07, + "loss": 0.4485, + "step": 8328 + }, + { + "epoch": 0.95, + "learning_rate": 1.2118431724493895e-07, + "loss": 0.4499, + "step": 8329 + }, + { + "epoch": 0.95, + "learning_rate": 1.2061046218448724e-07, + "loss": 0.4544, + "step": 8330 + }, + { + "epoch": 0.95, + "learning_rate": 1.2003796082097008e-07, + "loss": 0.4409, + "step": 8331 + }, + { + "epoch": 0.95, + "learning_rate": 1.194668132328325e-07, + "loss": 0.4329, + "step": 8332 + }, + { + "epoch": 0.95, + "learning_rate": 1.1889701949833743e-07, + "loss": 0.4414, + "step": 8333 + }, + { + "epoch": 0.95, + "learning_rate": 1.18328579695558e-07, + "loss": 0.448, + "step": 8334 + }, + { + "epoch": 0.95, + "learning_rate": 1.1776149390238301e-07, + "loss": 0.4538, + "step": 8335 + }, + { + "epoch": 0.95, + "learning_rate": 1.1719576219651585e-07, + "loss": 0.4408, + "step": 8336 + }, + { + "epoch": 0.95, + "learning_rate": 1.1663138465547341e-07, + "loss": 0.4368, + "step": 8337 + }, + { + "epoch": 0.95, + "learning_rate": 1.1606836135658939e-07, + "loss": 0.453, + "step": 8338 + }, + { + "epoch": 0.95, + "learning_rate": 1.1550669237700985e-07, + "loss": 0.4487, + "step": 8339 + }, + { + "epoch": 0.95, + "learning_rate": 
1.1494637779369766e-07, + "loss": 0.429, + "step": 8340 + }, + { + "epoch": 0.95, + "learning_rate": 1.1438741768342587e-07, + "loss": 0.4466, + "step": 8341 + }, + { + "epoch": 0.95, + "learning_rate": 1.1382981212278655e-07, + "loss": 0.4571, + "step": 8342 + }, + { + "epoch": 0.95, + "learning_rate": 1.13273561188183e-07, + "loss": 0.4546, + "step": 8343 + }, + { + "epoch": 0.95, + "learning_rate": 1.1271866495583428e-07, + "loss": 0.4403, + "step": 8344 + }, + { + "epoch": 0.95, + "learning_rate": 1.12165123501774e-07, + "loss": 0.4579, + "step": 8345 + }, + { + "epoch": 0.95, + "learning_rate": 1.1161293690184927e-07, + "loss": 0.4458, + "step": 8346 + }, + { + "epoch": 0.95, + "learning_rate": 1.1106210523172068e-07, + "loss": 0.4447, + "step": 8347 + }, + { + "epoch": 0.95, + "learning_rate": 1.1051262856686673e-07, + "loss": 0.4585, + "step": 8348 + }, + { + "epoch": 0.95, + "learning_rate": 1.0996450698257721e-07, + "loss": 0.4428, + "step": 8349 + }, + { + "epoch": 0.95, + "learning_rate": 1.0941774055395538e-07, + "loss": 0.4559, + "step": 8350 + }, + { + "epoch": 0.95, + "learning_rate": 1.0887232935592351e-07, + "loss": 0.4442, + "step": 8351 + }, + { + "epoch": 0.95, + "learning_rate": 1.0832827346321295e-07, + "loss": 0.4561, + "step": 8352 + }, + { + "epoch": 0.95, + "learning_rate": 1.0778557295037296e-07, + "loss": 0.4558, + "step": 8353 + }, + { + "epoch": 0.95, + "learning_rate": 1.0724422789176404e-07, + "loss": 0.4638, + "step": 8354 + }, + { + "epoch": 0.95, + "learning_rate": 1.0670423836156241e-07, + "loss": 0.439, + "step": 8355 + }, + { + "epoch": 0.95, + "learning_rate": 1.0616560443376e-07, + "loss": 0.4519, + "step": 8356 + }, + { + "epoch": 0.96, + "learning_rate": 1.0562832618216223e-07, + "loss": 0.4447, + "step": 8357 + }, + { + "epoch": 0.96, + "learning_rate": 1.0509240368038576e-07, + "loss": 0.4416, + "step": 8358 + }, + { + "epoch": 0.96, + "learning_rate": 1.0455783700186628e-07, + "loss": 0.4505, + "step": 8359 + }, + { + 
"epoch": 0.96, + "learning_rate": 1.0402462621984965e-07, + "loss": 0.4415, + "step": 8360 + }, + { + "epoch": 0.96, + "learning_rate": 1.0349277140739966e-07, + "loss": 0.4615, + "step": 8361 + }, + { + "epoch": 0.96, + "learning_rate": 1.0296227263739023e-07, + "loss": 0.4467, + "step": 8362 + }, + { + "epoch": 0.96, + "learning_rate": 1.0243312998251209e-07, + "loss": 0.4314, + "step": 8363 + }, + { + "epoch": 0.96, + "learning_rate": 1.0190534351527059e-07, + "loss": 0.4581, + "step": 8364 + }, + { + "epoch": 0.96, + "learning_rate": 1.0137891330798344e-07, + "loss": 0.4285, + "step": 8365 + }, + { + "epoch": 0.96, + "learning_rate": 1.0085383943278293e-07, + "loss": 0.4495, + "step": 8366 + }, + { + "epoch": 0.96, + "learning_rate": 1.0033012196161706e-07, + "loss": 0.4665, + "step": 8367 + }, + { + "epoch": 0.96, + "learning_rate": 9.980776096624511e-08, + "loss": 0.4569, + "step": 8368 + }, + { + "epoch": 0.96, + "learning_rate": 9.928675651824427e-08, + "loss": 0.4427, + "step": 8369 + }, + { + "epoch": 0.96, + "learning_rate": 9.876710868900297e-08, + "loss": 0.4595, + "step": 8370 + }, + { + "epoch": 0.96, + "learning_rate": 9.824881754972426e-08, + "loss": 0.4592, + "step": 8371 + }, + { + "epoch": 0.96, + "learning_rate": 9.773188317142579e-08, + "loss": 0.4459, + "step": 8372 + }, + { + "epoch": 0.96, + "learning_rate": 9.721630562493867e-08, + "loss": 0.4578, + "step": 8373 + }, + { + "epoch": 0.96, + "learning_rate": 9.670208498090861e-08, + "loss": 0.4662, + "step": 8374 + }, + { + "epoch": 0.96, + "learning_rate": 9.61892213097959e-08, + "loss": 0.4425, + "step": 8375 + }, + { + "epoch": 0.96, + "learning_rate": 9.567771468187326e-08, + "loss": 0.4341, + "step": 8376 + }, + { + "epoch": 0.96, + "learning_rate": 9.516756516723124e-08, + "loss": 0.4402, + "step": 8377 + }, + { + "epoch": 0.96, + "learning_rate": 9.46587728357673e-08, + "loss": 0.4484, + "step": 8378 + }, + { + "epoch": 0.96, + "learning_rate": 9.415133775720231e-08, + "loss": 0.477, 
+ "step": 8379 + }, + { + "epoch": 0.96, + "learning_rate": 9.364526000106289e-08, + "loss": 0.4292, + "step": 8380 + }, + { + "epoch": 0.96, + "learning_rate": 9.314053963669245e-08, + "loss": 0.4351, + "step": 8381 + }, + { + "epoch": 0.96, + "learning_rate": 9.263717673325124e-08, + "loss": 0.4642, + "step": 8382 + }, + { + "epoch": 0.96, + "learning_rate": 9.213517135971073e-08, + "loss": 0.4251, + "step": 8383 + }, + { + "epoch": 0.96, + "learning_rate": 9.163452358485591e-08, + "loss": 0.4471, + "step": 8384 + }, + { + "epoch": 0.96, + "learning_rate": 9.113523347728748e-08, + "loss": 0.4405, + "step": 8385 + }, + { + "epoch": 0.96, + "learning_rate": 9.063730110541846e-08, + "loss": 0.4519, + "step": 8386 + }, + { + "epoch": 0.96, + "learning_rate": 9.014072653747763e-08, + "loss": 0.4561, + "step": 8387 + }, + { + "epoch": 0.96, + "learning_rate": 8.964550984150611e-08, + "loss": 0.4654, + "step": 8388 + }, + { + "epoch": 0.96, + "learning_rate": 8.915165108536072e-08, + "loss": 0.4423, + "step": 8389 + }, + { + "epoch": 0.96, + "learning_rate": 8.865915033671069e-08, + "loss": 0.4749, + "step": 8390 + }, + { + "epoch": 0.96, + "learning_rate": 8.816800766303756e-08, + "loss": 0.4455, + "step": 8391 + }, + { + "epoch": 0.96, + "learning_rate": 8.767822313164198e-08, + "loss": 0.4368, + "step": 8392 + }, + { + "epoch": 0.96, + "learning_rate": 8.718979680963469e-08, + "loss": 0.4495, + "step": 8393 + }, + { + "epoch": 0.96, + "learning_rate": 8.670272876393881e-08, + "loss": 0.4468, + "step": 8394 + }, + { + "epoch": 0.96, + "learning_rate": 8.621701906129542e-08, + "loss": 0.4538, + "step": 8395 + }, + { + "epoch": 0.96, + "learning_rate": 8.573266776825683e-08, + "loss": 0.4725, + "step": 8396 + }, + { + "epoch": 0.96, + "learning_rate": 8.524967495119107e-08, + "loss": 0.4482, + "step": 8397 + }, + { + "epoch": 0.96, + "learning_rate": 8.476804067627852e-08, + "loss": 0.4601, + "step": 8398 + }, + { + "epoch": 0.96, + "learning_rate": 
8.428776500951308e-08, + "loss": 0.4497, + "step": 8399 + }, + { + "epoch": 0.96, + "learning_rate": 8.380884801670431e-08, + "loss": 0.4364, + "step": 8400 + }, + { + "epoch": 0.96, + "learning_rate": 8.333128976347305e-08, + "loss": 0.4429, + "step": 8401 + }, + { + "epoch": 0.96, + "learning_rate": 8.285509031525696e-08, + "loss": 0.461, + "step": 8402 + }, + { + "epoch": 0.96, + "learning_rate": 8.238024973730497e-08, + "loss": 0.4412, + "step": 8403 + }, + { + "epoch": 0.96, + "learning_rate": 8.190676809468056e-08, + "loss": 0.4573, + "step": 8404 + }, + { + "epoch": 0.96, + "learning_rate": 8.143464545226298e-08, + "loss": 0.4421, + "step": 8405 + }, + { + "epoch": 0.96, + "learning_rate": 8.096388187474269e-08, + "loss": 0.4717, + "step": 8406 + }, + { + "epoch": 0.96, + "learning_rate": 8.049447742662364e-08, + "loss": 0.4514, + "step": 8407 + }, + { + "epoch": 0.96, + "learning_rate": 8.002643217222661e-08, + "loss": 0.4507, + "step": 8408 + }, + { + "epoch": 0.96, + "learning_rate": 7.955974617568252e-08, + "loss": 0.4262, + "step": 8409 + }, + { + "epoch": 0.96, + "learning_rate": 7.90944195009391e-08, + "loss": 0.4479, + "step": 8410 + }, + { + "epoch": 0.96, + "learning_rate": 7.863045221175647e-08, + "loss": 0.4506, + "step": 8411 + }, + { + "epoch": 0.96, + "learning_rate": 7.81678443717071e-08, + "loss": 0.4619, + "step": 8412 + }, + { + "epoch": 0.96, + "learning_rate": 7.77065960441803e-08, + "loss": 0.4708, + "step": 8413 + }, + { + "epoch": 0.96, + "learning_rate": 7.72467072923766e-08, + "loss": 0.4405, + "step": 8414 + }, + { + "epoch": 0.96, + "learning_rate": 7.678817817931006e-08, + "loss": 0.4463, + "step": 8415 + }, + { + "epoch": 0.96, + "learning_rate": 7.633100876781152e-08, + "loss": 0.48, + "step": 8416 + }, + { + "epoch": 0.96, + "learning_rate": 7.587519912052199e-08, + "loss": 0.4355, + "step": 8417 + }, + { + "epoch": 0.96, + "learning_rate": 7.542074929989818e-08, + "loss": 0.4531, + "step": 8418 + }, + { + "epoch": 0.96, + 
"learning_rate": 7.496765936821027e-08, + "loss": 0.4639, + "step": 8419 + }, + { + "epoch": 0.96, + "learning_rate": 7.451592938753971e-08, + "loss": 0.4652, + "step": 8420 + }, + { + "epoch": 0.96, + "learning_rate": 7.406555941978478e-08, + "loss": 0.4412, + "step": 8421 + }, + { + "epoch": 0.96, + "learning_rate": 7.361654952665608e-08, + "loss": 0.4694, + "step": 8422 + }, + { + "epoch": 0.96, + "learning_rate": 7.31688997696789e-08, + "loss": 0.4387, + "step": 8423 + }, + { + "epoch": 0.96, + "learning_rate": 7.272261021019079e-08, + "loss": 0.4594, + "step": 8424 + }, + { + "epoch": 0.96, + "learning_rate": 7.227768090934285e-08, + "loss": 0.4461, + "step": 8425 + }, + { + "epoch": 0.96, + "learning_rate": 7.183411192810075e-08, + "loss": 0.4421, + "step": 8426 + }, + { + "epoch": 0.96, + "learning_rate": 7.139190332724255e-08, + "loss": 0.4374, + "step": 8427 + }, + { + "epoch": 0.96, + "learning_rate": 7.095105516736201e-08, + "loss": 0.4646, + "step": 8428 + }, + { + "epoch": 0.96, + "learning_rate": 7.051156750886523e-08, + "loss": 0.451, + "step": 8429 + }, + { + "epoch": 0.96, + "learning_rate": 7.007344041196962e-08, + "loss": 0.4344, + "step": 8430 + }, + { + "epoch": 0.96, + "learning_rate": 6.963667393671048e-08, + "loss": 0.4456, + "step": 8431 + }, + { + "epoch": 0.96, + "learning_rate": 6.920126814293438e-08, + "loss": 0.4396, + "step": 8432 + }, + { + "epoch": 0.96, + "learning_rate": 6.876722309030026e-08, + "loss": 0.4522, + "step": 8433 + }, + { + "epoch": 0.96, + "learning_rate": 6.833453883828389e-08, + "loss": 0.449, + "step": 8434 + }, + { + "epoch": 0.96, + "learning_rate": 6.790321544617117e-08, + "loss": 0.4438, + "step": 8435 + }, + { + "epoch": 0.96, + "learning_rate": 6.747325297306484e-08, + "loss": 0.4538, + "step": 8436 + }, + { + "epoch": 0.96, + "learning_rate": 6.704465147787665e-08, + "loss": 0.4676, + "step": 8437 + }, + { + "epoch": 0.96, + "learning_rate": 6.661741101933628e-08, + "loss": 0.4309, + "step": 8438 + }, + { + 
"epoch": 0.96, + "learning_rate": 6.61915316559858e-08, + "loss": 0.4555, + "step": 8439 + }, + { + "epoch": 0.96, + "learning_rate": 6.576701344617964e-08, + "loss": 0.4451, + "step": 8440 + }, + { + "epoch": 0.96, + "learning_rate": 6.534385644808461e-08, + "loss": 0.4492, + "step": 8441 + }, + { + "epoch": 0.96, + "learning_rate": 6.492206071968432e-08, + "loss": 0.4569, + "step": 8442 + }, + { + "epoch": 0.96, + "learning_rate": 6.450162631877366e-08, + "loss": 0.4277, + "step": 8443 + }, + { + "epoch": 0.97, + "learning_rate": 6.40825533029632e-08, + "loss": 0.4393, + "step": 8444 + }, + { + "epoch": 0.97, + "learning_rate": 6.366484172967369e-08, + "loss": 0.4463, + "step": 8445 + }, + { + "epoch": 0.97, + "learning_rate": 6.324849165614045e-08, + "loss": 0.4572, + "step": 8446 + }, + { + "epoch": 0.97, + "learning_rate": 6.28335031394134e-08, + "loss": 0.4336, + "step": 8447 + }, + { + "epoch": 0.97, + "learning_rate": 6.241987623635482e-08, + "loss": 0.4617, + "step": 8448 + }, + { + "epoch": 0.97, + "learning_rate": 6.200761100364272e-08, + "loss": 0.4252, + "step": 8449 + }, + { + "epoch": 0.97, + "learning_rate": 6.159670749776414e-08, + "loss": 0.4396, + "step": 8450 + }, + { + "epoch": 0.97, + "learning_rate": 6.118716577502404e-08, + "loss": 0.4562, + "step": 8451 + }, + { + "epoch": 0.97, + "learning_rate": 6.077898589153642e-08, + "loss": 0.4598, + "step": 8452 + }, + { + "epoch": 0.97, + "learning_rate": 6.037216790323319e-08, + "loss": 0.4458, + "step": 8453 + }, + { + "epoch": 0.97, + "learning_rate": 5.996671186585756e-08, + "loss": 0.4534, + "step": 8454 + }, + { + "epoch": 0.97, + "learning_rate": 5.9562617834963974e-08, + "loss": 0.4514, + "step": 8455 + }, + { + "epoch": 0.97, + "learning_rate": 5.915988586592481e-08, + "loss": 0.4507, + "step": 8456 + }, + { + "epoch": 0.97, + "learning_rate": 5.8758516013921464e-08, + "loss": 0.4562, + "step": 8457 + }, + { + "epoch": 0.97, + "learning_rate": 5.8358508333951066e-08, + "loss": 0.4379, + 
"step": 8458 + }, + { + "epoch": 0.97, + "learning_rate": 5.795986288082422e-08, + "loss": 0.438, + "step": 8459 + }, + { + "epoch": 0.97, + "learning_rate": 5.75625797091639e-08, + "loss": 0.4665, + "step": 8460 + }, + { + "epoch": 0.97, + "learning_rate": 5.716665887340656e-08, + "loss": 0.4447, + "step": 8461 + }, + { + "epoch": 0.97, + "learning_rate": 5.677210042780212e-08, + "loss": 0.4512, + "step": 8462 + }, + { + "epoch": 0.97, + "learning_rate": 5.637890442641403e-08, + "loss": 0.4368, + "step": 8463 + }, + { + "epoch": 0.97, + "learning_rate": 5.598707092311917e-08, + "loss": 0.4483, + "step": 8464 + }, + { + "epoch": 0.97, + "learning_rate": 5.5596599971606823e-08, + "loss": 0.4491, + "step": 8465 + }, + { + "epoch": 0.97, + "learning_rate": 5.520749162538197e-08, + "loss": 0.4589, + "step": 8466 + }, + { + "epoch": 0.97, + "learning_rate": 5.4819745937758625e-08, + "loss": 0.4372, + "step": 8467 + }, + { + "epoch": 0.97, + "learning_rate": 5.443336296186874e-08, + "loss": 0.4507, + "step": 8468 + }, + { + "epoch": 0.97, + "learning_rate": 5.40483427506544e-08, + "loss": 0.4463, + "step": 8469 + }, + { + "epoch": 0.97, + "learning_rate": 5.3664685356871193e-08, + "loss": 0.4787, + "step": 8470 + }, + { + "epoch": 0.97, + "learning_rate": 5.3282390833090393e-08, + "loss": 0.4549, + "step": 8471 + }, + { + "epoch": 0.97, + "learning_rate": 5.290145923169343e-08, + "loss": 0.4251, + "step": 8472 + }, + { + "epoch": 0.97, + "learning_rate": 5.252189060487855e-08, + "loss": 0.4497, + "step": 8473 + }, + { + "epoch": 0.97, + "learning_rate": 5.214368500465305e-08, + "loss": 0.4614, + "step": 8474 + }, + { + "epoch": 0.97, + "learning_rate": 5.176684248283992e-08, + "loss": 0.4422, + "step": 8475 + }, + { + "epoch": 0.97, + "learning_rate": 5.1391363091075616e-08, + "loss": 0.4454, + "step": 8476 + }, + { + "epoch": 0.97, + "learning_rate": 5.1017246880809e-08, + "loss": 0.4402, + "step": 8477 + }, + { + "epoch": 0.97, + "learning_rate": 5.064449390330239e-08, 
+ "loss": 0.4599, + "step": 8478 + }, + { + "epoch": 0.97, + "learning_rate": 5.02731042096305e-08, + "loss": 0.4456, + "step": 8479 + }, + { + "epoch": 0.97, + "learning_rate": 4.99030778506826e-08, + "loss": 0.4534, + "step": 8480 + }, + { + "epoch": 0.97, + "learning_rate": 4.953441487716037e-08, + "loss": 0.4505, + "step": 8481 + }, + { + "epoch": 0.97, + "learning_rate": 4.9167115339580074e-08, + "loss": 0.461, + "step": 8482 + }, + { + "epoch": 0.97, + "learning_rate": 4.8801179288268105e-08, + "loss": 0.4449, + "step": 8483 + }, + { + "epoch": 0.97, + "learning_rate": 4.84366067733677e-08, + "loss": 0.451, + "step": 8484 + }, + { + "epoch": 0.97, + "learning_rate": 4.807339784483112e-08, + "loss": 0.4375, + "step": 8485 + }, + { + "epoch": 0.97, + "learning_rate": 4.771155255242854e-08, + "loss": 0.4622, + "step": 8486 + }, + { + "epoch": 0.97, + "learning_rate": 4.7351070945739206e-08, + "loss": 0.4462, + "step": 8487 + }, + { + "epoch": 0.97, + "learning_rate": 4.699195307415805e-08, + "loss": 0.4648, + "step": 8488 + }, + { + "epoch": 0.97, + "learning_rate": 4.663419898689125e-08, + "loss": 0.4283, + "step": 8489 + }, + { + "epoch": 0.97, + "learning_rate": 4.6277808732959616e-08, + "loss": 0.4329, + "step": 8490 + }, + { + "epoch": 0.97, + "learning_rate": 4.5922782361197405e-08, + "loss": 0.4742, + "step": 8491 + }, + { + "epoch": 0.97, + "learning_rate": 4.556911992025015e-08, + "loss": 0.4469, + "step": 8492 + }, + { + "epoch": 0.97, + "learning_rate": 4.521682145857797e-08, + "loss": 0.4465, + "step": 8493 + }, + { + "epoch": 0.97, + "learning_rate": 4.486588702445338e-08, + "loss": 0.4458, + "step": 8494 + }, + { + "epoch": 0.97, + "learning_rate": 4.451631666596123e-08, + "loss": 0.4394, + "step": 8495 + }, + { + "epoch": 0.97, + "learning_rate": 4.416811043100322e-08, + "loss": 0.4785, + "step": 8496 + }, + { + "epoch": 0.97, + "learning_rate": 4.382126836728895e-08, + "loss": 0.4551, + "step": 8497 + }, + { + "epoch": 0.97, + "learning_rate": 
4.347579052234374e-08, + "loss": 0.4671, + "step": 8498 + }, + { + "epoch": 0.97, + "learning_rate": 4.3131676943506395e-08, + "loss": 0.4613, + "step": 8499 + }, + { + "epoch": 0.97, + "learning_rate": 4.278892767792808e-08, + "loss": 0.463, + "step": 8500 + }, + { + "epoch": 0.97, + "learning_rate": 4.244754277257346e-08, + "loss": 0.4322, + "step": 8501 + }, + { + "epoch": 0.97, + "learning_rate": 4.210752227421955e-08, + "loss": 0.4353, + "step": 8502 + }, + { + "epoch": 0.97, + "learning_rate": 4.176886622945575e-08, + "loss": 0.4607, + "step": 8503 + }, + { + "epoch": 0.97, + "learning_rate": 4.143157468468717e-08, + "loss": 0.4652, + "step": 8504 + }, + { + "epoch": 0.97, + "learning_rate": 4.109564768613017e-08, + "loss": 0.4465, + "step": 8505 + }, + { + "epoch": 0.97, + "learning_rate": 4.076108527981237e-08, + "loss": 0.4399, + "step": 8506 + }, + { + "epoch": 0.97, + "learning_rate": 4.0427887511578224e-08, + "loss": 0.4457, + "step": 8507 + }, + { + "epoch": 0.97, + "learning_rate": 4.009605442708231e-08, + "loss": 0.4501, + "step": 8508 + }, + { + "epoch": 0.97, + "learning_rate": 3.976558607179382e-08, + "loss": 0.4637, + "step": 8509 + }, + { + "epoch": 0.97, + "learning_rate": 3.943648249099319e-08, + "loss": 0.4323, + "step": 8510 + }, + { + "epoch": 0.97, + "learning_rate": 3.910874372977658e-08, + "loss": 0.4528, + "step": 8511 + }, + { + "epoch": 0.97, + "learning_rate": 3.8782369833050284e-08, + "loss": 0.4539, + "step": 8512 + }, + { + "epoch": 0.97, + "learning_rate": 3.845736084553408e-08, + "loss": 0.4405, + "step": 8513 + }, + { + "epoch": 0.97, + "learning_rate": 3.813371681176348e-08, + "loss": 0.4558, + "step": 8514 + }, + { + "epoch": 0.97, + "learning_rate": 3.7811437776084095e-08, + "loss": 0.4546, + "step": 8515 + }, + { + "epoch": 0.97, + "learning_rate": 3.749052378265505e-08, + "loss": 0.4363, + "step": 8516 + }, + { + "epoch": 0.97, + "learning_rate": 3.717097487545007e-08, + "loss": 0.4546, + "step": 8517 + }, + { + "epoch": 
0.97, + "learning_rate": 3.6852791098251906e-08, + "loss": 0.4673, + "step": 8518 + }, + { + "epoch": 0.97, + "learning_rate": 3.653597249466012e-08, + "loss": 0.4412, + "step": 8519 + }, + { + "epoch": 0.97, + "learning_rate": 3.622051910808666e-08, + "loss": 0.4481, + "step": 8520 + }, + { + "epoch": 0.97, + "learning_rate": 3.5906430981754724e-08, + "loss": 0.4613, + "step": 8521 + }, + { + "epoch": 0.97, + "learning_rate": 3.559370815870211e-08, + "loss": 0.4789, + "step": 8522 + }, + { + "epoch": 0.97, + "learning_rate": 3.528235068177899e-08, + "loss": 0.4422, + "step": 8523 + }, + { + "epoch": 0.97, + "learning_rate": 3.4972358593646785e-08, + "loss": 0.4468, + "step": 8524 + }, + { + "epoch": 0.97, + "learning_rate": 3.466373193678263e-08, + "loss": 0.4509, + "step": 8525 + }, + { + "epoch": 0.97, + "learning_rate": 3.4356470753474927e-08, + "loss": 0.4454, + "step": 8526 + }, + { + "epoch": 0.97, + "learning_rate": 3.4050575085825546e-08, + "loss": 0.4475, + "step": 8527 + }, + { + "epoch": 0.97, + "learning_rate": 3.3746044975749845e-08, + "loss": 0.4434, + "step": 8528 + }, + { + "epoch": 0.97, + "learning_rate": 3.3442880464972237e-08, + "loss": 0.4462, + "step": 8529 + }, + { + "epoch": 0.97, + "learning_rate": 3.314108159503726e-08, + "loss": 0.4664, + "step": 8530 + }, + { + "epoch": 0.97, + "learning_rate": 3.284064840729406e-08, + "loss": 0.453, + "step": 8531 + }, + { + "epoch": 0.98, + "learning_rate": 3.2541580942911935e-08, + "loss": 0.4415, + "step": 8532 + }, + { + "epoch": 0.98, + "learning_rate": 3.224387924286698e-08, + "loss": 0.4437, + "step": 8533 + }, + { + "epoch": 0.98, + "learning_rate": 3.1947543347953246e-08, + "loss": 0.4495, + "step": 8534 + }, + { + "epoch": 0.98, + "learning_rate": 3.1652573298774916e-08, + "loss": 0.4552, + "step": 8535 + }, + { + "epoch": 0.98, + "learning_rate": 3.135896913574743e-08, + "loss": 0.4514, + "step": 8536 + }, + { + "epoch": 0.98, + "learning_rate": 3.106673089910417e-08, + "loss": 0.4368, + 
"step": 8537 + }, + { + "epoch": 0.98, + "learning_rate": 3.077585862888643e-08, + "loss": 0.4588, + "step": 8538 + }, + { + "epoch": 0.98, + "learning_rate": 3.048635236495012e-08, + "loss": 0.446, + "step": 8539 + }, + { + "epoch": 0.98, + "learning_rate": 3.019821214696572e-08, + "loss": 0.4568, + "step": 8540 + }, + { + "epoch": 0.98, + "learning_rate": 2.9911438014412765e-08, + "loss": 0.4559, + "step": 8541 + }, + { + "epoch": 0.98, + "learning_rate": 2.962603000658648e-08, + "loss": 0.4478, + "step": 8542 + }, + { + "epoch": 0.98, + "learning_rate": 2.9341988162595593e-08, + "loss": 0.458, + "step": 8543 + }, + { + "epoch": 0.98, + "learning_rate": 2.905931252135785e-08, + "loss": 0.4289, + "step": 8544 + }, + { + "epoch": 0.98, + "learning_rate": 2.8778003121607834e-08, + "loss": 0.4586, + "step": 8545 + }, + { + "epoch": 0.98, + "learning_rate": 2.849806000189026e-08, + "loss": 0.4475, + "step": 8546 + }, + { + "epoch": 0.98, + "learning_rate": 2.8219483200563334e-08, + "loss": 0.4482, + "step": 8547 + }, + { + "epoch": 0.98, + "learning_rate": 2.794227275579986e-08, + "loss": 0.4307, + "step": 8548 + }, + { + "epoch": 0.98, + "learning_rate": 2.766642870558278e-08, + "loss": 0.4794, + "step": 8549 + }, + { + "epoch": 0.98, + "learning_rate": 2.7391951087708534e-08, + "loss": 0.4402, + "step": 8550 + }, + { + "epoch": 0.98, + "learning_rate": 2.7118839939787033e-08, + "loss": 0.4402, + "step": 8551 + }, + { + "epoch": 0.98, + "learning_rate": 2.6847095299241678e-08, + "loss": 0.4382, + "step": 8552 + }, + { + "epoch": 0.98, + "learning_rate": 2.6576717203304904e-08, + "loss": 0.4644, + "step": 8553 + }, + { + "epoch": 0.98, + "learning_rate": 2.6307705689028184e-08, + "loss": 0.4639, + "step": 8554 + }, + { + "epoch": 0.98, + "learning_rate": 2.6040060793268705e-08, + "loss": 0.4479, + "step": 8555 + }, + { + "epoch": 0.98, + "learning_rate": 2.5773782552701578e-08, + "loss": 0.4348, + "step": 8556 + }, + { + "epoch": 0.98, + "learning_rate": 
2.550887100381205e-08, + "loss": 0.436, + "step": 8557 + }, + { + "epoch": 0.98, + "learning_rate": 2.5245326182899987e-08, + "loss": 0.4596, + "step": 8558 + }, + { + "epoch": 0.98, + "learning_rate": 2.4983148126076494e-08, + "loss": 0.4529, + "step": 8559 + }, + { + "epoch": 0.98, + "learning_rate": 2.4722336869265063e-08, + "loss": 0.4305, + "step": 8560 + }, + { + "epoch": 0.98, + "learning_rate": 2.4462892448202657e-08, + "loss": 0.4697, + "step": 8561 + }, + { + "epoch": 0.98, + "learning_rate": 2.4204814898440844e-08, + "loss": 0.4521, + "step": 8562 + }, + { + "epoch": 0.98, + "learning_rate": 2.394810425534022e-08, + "loss": 0.4691, + "step": 8563 + }, + { + "epoch": 0.98, + "learning_rate": 2.369276055407599e-08, + "loss": 0.4614, + "step": 8564 + }, + { + "epoch": 0.98, + "learning_rate": 2.3438783829635714e-08, + "loss": 0.4483, + "step": 8565 + }, + { + "epoch": 0.98, + "learning_rate": 2.318617411682156e-08, + "loss": 0.4659, + "step": 8566 + }, + { + "epoch": 0.98, + "learning_rate": 2.2934931450245833e-08, + "loss": 0.4533, + "step": 8567 + }, + { + "epoch": 0.98, + "learning_rate": 2.2685055864333227e-08, + "loss": 0.4534, + "step": 8568 + }, + { + "epoch": 0.98, + "learning_rate": 2.2436547393323017e-08, + "loss": 0.4475, + "step": 8569 + }, + { + "epoch": 0.98, + "learning_rate": 2.218940607126685e-08, + "loss": 0.4378, + "step": 8570 + }, + { + "epoch": 0.98, + "learning_rate": 2.1943631932028752e-08, + "loss": 0.4555, + "step": 8571 + }, + { + "epoch": 0.98, + "learning_rate": 2.169922500928512e-08, + "loss": 0.4596, + "step": 8572 + }, + { + "epoch": 0.98, + "learning_rate": 2.1456185336524714e-08, + "loss": 0.4347, + "step": 8573 + }, + { + "epoch": 0.98, + "learning_rate": 2.1214512947048684e-08, + "loss": 0.4469, + "step": 8574 + }, + { + "epoch": 0.98, + "learning_rate": 2.097420787397275e-08, + "loss": 0.4515, + "step": 8575 + }, + { + "epoch": 0.98, + "learning_rate": 2.0735270150223917e-08, + "loss": 0.4495, + "step": 8576 + }, + { + 
"epoch": 0.98, + "learning_rate": 2.0497699808542658e-08, + "loss": 0.4537, + "step": 8577 + }, + { + "epoch": 0.98, + "learning_rate": 2.0261496881479605e-08, + "loss": 0.4443, + "step": 8578 + }, + { + "epoch": 0.98, + "learning_rate": 2.002666140140108e-08, + "loss": 0.4546, + "step": 8579 + }, + { + "epoch": 0.98, + "learning_rate": 1.979319340048469e-08, + "loss": 0.4571, + "step": 8580 + }, + { + "epoch": 0.98, + "learning_rate": 1.956109291072039e-08, + "loss": 0.4424, + "step": 8581 + }, + { + "epoch": 0.98, + "learning_rate": 1.9330359963910527e-08, + "loss": 0.4511, + "step": 8582 + }, + { + "epoch": 0.98, + "learning_rate": 1.910099459167314e-08, + "loss": 0.4563, + "step": 8583 + }, + { + "epoch": 0.98, + "learning_rate": 1.8872996825433086e-08, + "loss": 0.4414, + "step": 8584 + }, + { + "epoch": 0.98, + "learning_rate": 1.864636669643427e-08, + "loss": 0.4554, + "step": 8585 + }, + { + "epoch": 0.98, + "learning_rate": 1.8421104235727406e-08, + "loss": 0.4389, + "step": 8586 + }, + { + "epoch": 0.98, + "learning_rate": 1.8197209474180023e-08, + "loss": 0.4647, + "step": 8587 + }, + { + "epoch": 0.98, + "learning_rate": 1.7974682442470915e-08, + "loss": 0.4403, + "step": 8588 + }, + { + "epoch": 0.98, + "learning_rate": 1.775352317109014e-08, + "loss": 0.4513, + "step": 8589 + }, + { + "epoch": 0.98, + "learning_rate": 1.7533731690342338e-08, + "loss": 0.4451, + "step": 8590 + }, + { + "epoch": 0.98, + "learning_rate": 1.7315308030342314e-08, + "loss": 0.4486, + "step": 8591 + }, + { + "epoch": 0.98, + "learning_rate": 1.7098252221021683e-08, + "loss": 0.4594, + "step": 8592 + }, + { + "epoch": 0.98, + "learning_rate": 1.6882564292119984e-08, + "loss": 0.4419, + "step": 8593 + }, + { + "epoch": 0.98, + "learning_rate": 1.666824427319136e-08, + "loss": 0.4521, + "step": 8594 + }, + { + "epoch": 0.98, + "learning_rate": 1.6455292193603424e-08, + "loss": 0.4343, + "step": 8595 + }, + { + "epoch": 0.98, + "learning_rate": 1.624370808253506e-08, + "loss": 
0.4542, + "step": 8596 + }, + { + "epoch": 0.98, + "learning_rate": 1.6033491968976412e-08, + "loss": 0.4468, + "step": 8597 + }, + { + "epoch": 0.98, + "learning_rate": 1.5824643881734438e-08, + "loss": 0.4459, + "step": 8598 + }, + { + "epoch": 0.98, + "learning_rate": 1.561716384942402e-08, + "loss": 0.4632, + "step": 8599 + }, + { + "epoch": 0.98, + "learning_rate": 1.541105190047465e-08, + "loss": 0.4459, + "step": 8600 + }, + { + "epoch": 0.98, + "learning_rate": 1.5206308063129282e-08, + "loss": 0.4326, + "step": 8601 + }, + { + "epoch": 0.98, + "learning_rate": 1.5002932365442148e-08, + "loss": 0.4464, + "step": 8602 + }, + { + "epoch": 0.98, + "learning_rate": 1.480092483527984e-08, + "loss": 0.4586, + "step": 8603 + }, + { + "epoch": 0.98, + "learning_rate": 1.4600285500322442e-08, + "loss": 0.4658, + "step": 8604 + }, + { + "epoch": 0.98, + "learning_rate": 1.4401014388061296e-08, + "loss": 0.483, + "step": 8605 + }, + { + "epoch": 0.98, + "learning_rate": 1.4203111525801228e-08, + "loss": 0.4419, + "step": 8606 + }, + { + "epoch": 0.98, + "learning_rate": 1.4006576940659433e-08, + "loss": 0.4547, + "step": 8607 + }, + { + "epoch": 0.98, + "learning_rate": 1.3811410659565483e-08, + "loss": 0.4322, + "step": 8608 + }, + { + "epoch": 0.98, + "learning_rate": 1.3617612709262428e-08, + "loss": 0.4459, + "step": 8609 + }, + { + "epoch": 0.98, + "learning_rate": 1.3425183116303475e-08, + "loss": 0.4408, + "step": 8610 + }, + { + "epoch": 0.98, + "learning_rate": 1.3234121907056418e-08, + "loss": 0.4458, + "step": 8611 + }, + { + "epoch": 0.98, + "learning_rate": 1.3044429107700319e-08, + "loss": 0.4581, + "step": 8612 + }, + { + "epoch": 0.98, + "learning_rate": 1.2856104744228826e-08, + "loss": 0.4792, + "step": 8613 + }, + { + "epoch": 0.98, + "learning_rate": 1.2669148842444634e-08, + "loss": 0.4454, + "step": 8614 + }, + { + "epoch": 0.98, + "learning_rate": 1.248356142796725e-08, + "loss": 0.4506, + "step": 8615 + }, + { + "epoch": 0.98, + 
"learning_rate": 1.2299342526224112e-08, + "loss": 0.4594, + "step": 8616 + }, + { + "epoch": 0.98, + "learning_rate": 1.211649216245836e-08, + "loss": 0.4516, + "step": 8617 + }, + { + "epoch": 0.98, + "learning_rate": 1.1935010361724397e-08, + "loss": 0.4324, + "step": 8618 + }, + { + "epoch": 0.99, + "learning_rate": 1.1754897148889e-08, + "loss": 0.4501, + "step": 8619 + }, + { + "epoch": 0.99, + "learning_rate": 1.1576152548631314e-08, + "loss": 0.45, + "step": 8620 + }, + { + "epoch": 0.99, + "learning_rate": 1.1398776585445082e-08, + "loss": 0.4689, + "step": 8621 + }, + { + "epoch": 0.99, + "learning_rate": 1.1222769283633083e-08, + "loss": 0.4379, + "step": 8622 + }, + { + "epoch": 0.99, + "learning_rate": 1.1048130667312695e-08, + "loss": 0.449, + "step": 8623 + }, + { + "epoch": 0.99, + "learning_rate": 1.0874860760413664e-08, + "loss": 0.4387, + "step": 8624 + }, + { + "epoch": 0.99, + "learning_rate": 1.0702959586678108e-08, + "loss": 0.4623, + "step": 8625 + }, + { + "epoch": 0.99, + "learning_rate": 1.0532427169659409e-08, + "loss": 0.4342, + "step": 8626 + }, + { + "epoch": 0.99, + "learning_rate": 1.0363263532724433e-08, + "loss": 0.4405, + "step": 8627 + }, + { + "epoch": 0.99, + "learning_rate": 1.0195468699052413e-08, + "loss": 0.457, + "step": 8628 + }, + { + "epoch": 0.99, + "learning_rate": 1.0029042691636071e-08, + "loss": 0.4239, + "step": 8629 + }, + { + "epoch": 0.99, + "learning_rate": 9.863985533278275e-09, + "loss": 0.4434, + "step": 8630 + }, + { + "epoch": 0.99, + "learning_rate": 9.700297246596491e-09, + "loss": 0.461, + "step": 8631 + }, + { + "epoch": 0.99, + "learning_rate": 9.537977854018332e-09, + "loss": 0.4584, + "step": 8632 + }, + { + "epoch": 0.99, + "learning_rate": 9.377027377786007e-09, + "loss": 0.4492, + "step": 8633 + }, + { + "epoch": 0.99, + "learning_rate": 9.217445839952988e-09, + "loss": 0.4512, + "step": 8634 + }, + { + "epoch": 0.99, + "learning_rate": 9.059233262386225e-09, + "loss": 0.4474, + "step": 8635 + 
}, + { + "epoch": 0.99, + "learning_rate": 8.902389666765044e-09, + "loss": 0.4534, + "step": 8636 + }, + { + "epoch": 0.99, + "learning_rate": 8.746915074577811e-09, + "loss": 0.4459, + "step": 8637 + }, + { + "epoch": 0.99, + "learning_rate": 8.592809507129706e-09, + "loss": 0.4423, + "step": 8638 + }, + { + "epoch": 0.99, + "learning_rate": 8.440072985537174e-09, + "loss": 0.4627, + "step": 8639 + }, + { + "epoch": 0.99, + "learning_rate": 8.288705530727915e-09, + "loss": 0.4505, + "step": 8640 + }, + { + "epoch": 0.99, + "learning_rate": 8.138707163442005e-09, + "loss": 0.4333, + "step": 8641 + }, + { + "epoch": 0.99, + "learning_rate": 7.990077904234117e-09, + "loss": 0.4511, + "step": 8642 + }, + { + "epoch": 0.99, + "learning_rate": 7.84281777346796e-09, + "loss": 0.457, + "step": 8643 + }, + { + "epoch": 0.99, + "learning_rate": 7.696926791322946e-09, + "loss": 0.4234, + "step": 8644 + }, + { + "epoch": 0.99, + "learning_rate": 7.552404977788641e-09, + "loss": 0.4557, + "step": 8645 + }, + { + "epoch": 0.99, + "learning_rate": 7.409252352668095e-09, + "loss": 0.444, + "step": 8646 + }, + { + "epoch": 0.99, + "learning_rate": 7.267468935575617e-09, + "loss": 0.455, + "step": 8647 + }, + { + "epoch": 0.99, + "learning_rate": 7.12705474594011e-09, + "loss": 0.4692, + "step": 8648 + }, + { + "epoch": 0.99, + "learning_rate": 6.988009803000628e-09, + "loss": 0.4765, + "step": 8649 + }, + { + "epoch": 0.99, + "learning_rate": 6.8503341258086e-09, + "loss": 0.4445, + "step": 8650 + }, + { + "epoch": 0.99, + "learning_rate": 6.714027733230044e-09, + "loss": 0.4319, + "step": 8651 + }, + { + "epoch": 0.99, + "learning_rate": 6.579090643942243e-09, + "loss": 0.4481, + "step": 8652 + }, + { + "epoch": 0.99, + "learning_rate": 6.4455228764326305e-09, + "loss": 0.4501, + "step": 8653 + }, + { + "epoch": 0.99, + "learning_rate": 6.3133244490043434e-09, + "loss": 0.4475, + "step": 8654 + }, + { + "epoch": 0.99, + "learning_rate": 6.18249537977178e-09, + "loss": 0.4532, + 
"step": 8655 + }, + { + "epoch": 0.99, + "learning_rate": 6.053035686661712e-09, + "loss": 0.4401, + "step": 8656 + }, + { + "epoch": 0.99, + "learning_rate": 5.924945387411063e-09, + "loss": 0.4532, + "step": 8657 + }, + { + "epoch": 0.99, + "learning_rate": 5.798224499572458e-09, + "loss": 0.4334, + "step": 8658 + }, + { + "epoch": 0.99, + "learning_rate": 5.672873040509786e-09, + "loss": 0.4682, + "step": 8659 + }, + { + "epoch": 0.99, + "learning_rate": 5.548891027398195e-09, + "loss": 0.4225, + "step": 8660 + }, + { + "epoch": 0.99, + "learning_rate": 5.426278477226321e-09, + "loss": 0.4527, + "step": 8661 + }, + { + "epoch": 0.99, + "learning_rate": 5.305035406795167e-09, + "loss": 0.4467, + "step": 8662 + }, + { + "epoch": 0.99, + "learning_rate": 5.185161832718111e-09, + "loss": 0.4284, + "step": 8663 + }, + { + "epoch": 0.99, + "learning_rate": 5.0666577714186815e-09, + "loss": 0.4373, + "step": 8664 + }, + { + "epoch": 0.99, + "learning_rate": 4.949523239136112e-09, + "loss": 0.4694, + "step": 8665 + }, + { + "epoch": 0.99, + "learning_rate": 4.833758251919785e-09, + "loss": 0.4355, + "step": 8666 + }, + { + "epoch": 0.99, + "learning_rate": 4.7193628256325676e-09, + "loss": 0.4348, + "step": 8667 + }, + { + "epoch": 0.99, + "learning_rate": 4.606336975948589e-09, + "loss": 0.4469, + "step": 8668 + }, + { + "epoch": 0.99, + "learning_rate": 4.494680718355459e-09, + "loss": 0.4447, + "step": 8669 + }, + { + "epoch": 0.99, + "learning_rate": 4.384394068153164e-09, + "loss": 0.4434, + "step": 8670 + }, + { + "epoch": 0.99, + "learning_rate": 4.275477040451836e-09, + "loss": 0.453, + "step": 8671 + }, + { + "epoch": 0.99, + "learning_rate": 4.167929650176206e-09, + "loss": 0.4588, + "step": 8672 + }, + { + "epoch": 0.99, + "learning_rate": 4.061751912063372e-09, + "loss": 0.4509, + "step": 8673 + }, + { + "epoch": 0.99, + "learning_rate": 3.956943840661698e-09, + "loss": 0.4543, + "step": 8674 + }, + { + "epoch": 0.99, + "learning_rate": 
3.853505450331918e-09, + "loss": 0.4426, + "step": 8675 + }, + { + "epoch": 0.99, + "learning_rate": 3.751436755247139e-09, + "loss": 0.4451, + "step": 8676 + }, + { + "epoch": 0.99, + "learning_rate": 3.650737769393953e-09, + "loss": 0.4378, + "step": 8677 + }, + { + "epoch": 0.99, + "learning_rate": 3.5514085065690984e-09, + "loss": 0.4355, + "step": 8678 + }, + { + "epoch": 0.99, + "learning_rate": 3.4534489803850215e-09, + "loss": 0.4543, + "step": 8679 + }, + { + "epoch": 0.99, + "learning_rate": 3.3568592042620974e-09, + "loss": 0.4409, + "step": 8680 + }, + { + "epoch": 0.99, + "learning_rate": 3.2616391914364056e-09, + "loss": 0.4628, + "step": 8681 + }, + { + "epoch": 0.99, + "learning_rate": 3.167788954954176e-09, + "loss": 0.4445, + "step": 8682 + }, + { + "epoch": 0.99, + "learning_rate": 3.075308507677344e-09, + "loss": 0.4646, + "step": 8683 + }, + { + "epoch": 0.99, + "learning_rate": 2.9841978622746624e-09, + "loss": 0.4466, + "step": 8684 + }, + { + "epoch": 0.99, + "learning_rate": 2.894457031232811e-09, + "loss": 0.4682, + "step": 8685 + }, + { + "epoch": 0.99, + "learning_rate": 2.8060860268475097e-09, + "loss": 0.4413, + "step": 8686 + }, + { + "epoch": 0.99, + "learning_rate": 2.7190848612279606e-09, + "loss": 0.4405, + "step": 8687 + }, + { + "epoch": 0.99, + "learning_rate": 2.6334535462935184e-09, + "loss": 0.46, + "step": 8688 + }, + { + "epoch": 0.99, + "learning_rate": 2.54919209377924e-09, + "loss": 0.454, + "step": 8689 + }, + { + "epoch": 0.99, + "learning_rate": 2.4663005152314455e-09, + "loss": 0.4464, + "step": 8690 + }, + { + "epoch": 0.99, + "learning_rate": 2.384778822006606e-09, + "loss": 0.4624, + "step": 8691 + }, + { + "epoch": 0.99, + "learning_rate": 2.304627025274675e-09, + "loss": 0.4351, + "step": 8692 + }, + { + "epoch": 0.99, + "learning_rate": 2.225845136019089e-09, + "loss": 0.4529, + "step": 8693 + }, + { + "epoch": 0.99, + "learning_rate": 2.148433165035657e-09, + "loss": 0.4624, + "step": 8694 + }, + { + "epoch": 
0.99, + "learning_rate": 2.0723911229303396e-09, + "loss": 0.4256, + "step": 8695 + }, + { + "epoch": 0.99, + "learning_rate": 1.9977190201225793e-09, + "loss": 0.4375, + "step": 8696 + }, + { + "epoch": 0.99, + "learning_rate": 1.924416866844192e-09, + "loss": 0.4619, + "step": 8697 + }, + { + "epoch": 0.99, + "learning_rate": 1.8524846731404755e-09, + "loss": 0.4525, + "step": 8698 + }, + { + "epoch": 0.99, + "learning_rate": 1.7819224488657695e-09, + "loss": 0.4452, + "step": 8699 + }, + { + "epoch": 0.99, + "learning_rate": 1.7127302036901162e-09, + "loss": 0.4516, + "step": 8700 + }, + { + "epoch": 0.99, + "learning_rate": 1.6449079470937103e-09, + "loss": 0.4446, + "step": 8701 + }, + { + "epoch": 0.99, + "learning_rate": 1.5784556883691183e-09, + "loss": 0.4504, + "step": 8702 + }, + { + "epoch": 0.99, + "learning_rate": 1.5133734366234998e-09, + "loss": 0.458, + "step": 8703 + }, + { + "epoch": 0.99, + "learning_rate": 1.449661200773056e-09, + "loss": 0.4442, + "step": 8704 + }, + { + "epoch": 0.99, + "learning_rate": 1.3873189895485806e-09, + "loss": 0.4447, + "step": 8705 + }, + { + "epoch": 0.99, + "learning_rate": 1.3263468114921295e-09, + "loss": 0.4641, + "step": 8706 + }, + { + "epoch": 1.0, + "learning_rate": 1.2667446749581314e-09, + "loss": 0.4302, + "step": 8707 + }, + { + "epoch": 1.0, + "learning_rate": 1.2085125881133863e-09, + "loss": 0.4294, + "step": 8708 + }, + { + "epoch": 1.0, + "learning_rate": 1.1516505589381777e-09, + "loss": 0.4844, + "step": 8709 + }, + { + "epoch": 1.0, + "learning_rate": 1.0961585952218301e-09, + "loss": 0.4651, + "step": 8710 + }, + { + "epoch": 1.0, + "learning_rate": 1.042036704568261e-09, + "loss": 0.4544, + "step": 8711 + }, + { + "epoch": 1.0, + "learning_rate": 9.89284894395981e-10, + "loss": 0.4279, + "step": 8712 + }, + { + "epoch": 1.0, + "learning_rate": 9.37903171929211e-10, + "loss": 0.4377, + "step": 8713 + }, + { + "epoch": 1.0, + "learning_rate": 8.878915442123159e-10, + "loss": 0.4588, + "step": 
8714 + }, + { + "epoch": 1.0, + "learning_rate": 8.392500180953722e-10, + "loss": 0.4642, + "step": 8715 + }, + { + "epoch": 1.0, + "learning_rate": 7.919786002441588e-10, + "loss": 0.4456, + "step": 8716 + }, + { + "epoch": 1.0, + "learning_rate": 7.460772971357167e-10, + "loss": 0.4498, + "step": 8717 + }, + { + "epoch": 1.0, + "learning_rate": 7.015461150594594e-10, + "loss": 0.4332, + "step": 8718 + }, + { + "epoch": 1.0, + "learning_rate": 6.583850601182829e-10, + "loss": 0.4588, + "step": 8719 + }, + { + "epoch": 1.0, + "learning_rate": 6.165941382241248e-10, + "loss": 0.449, + "step": 8720 + }, + { + "epoch": 1.0, + "learning_rate": 5.761733551057357e-10, + "loss": 0.4273, + "step": 8721 + }, + { + "epoch": 1.0, + "learning_rate": 5.371227162997983e-10, + "loss": 0.4443, + "step": 8722 + }, + { + "epoch": 1.0, + "learning_rate": 4.994422271575872e-10, + "loss": 0.4435, + "step": 8723 + }, + { + "epoch": 1.0, + "learning_rate": 4.631318928427497e-10, + "loss": 0.4719, + "step": 8724 + }, + { + "epoch": 1.0, + "learning_rate": 4.2819171833019537e-10, + "loss": 0.4333, + "step": 8725 + }, + { + "epoch": 1.0, + "learning_rate": 3.946217084072057e-10, + "loss": 0.4628, + "step": 8726 + }, + { + "epoch": 1.0, + "learning_rate": 3.624218676734348e-10, + "loss": 0.4442, + "step": 8727 + }, + { + "epoch": 1.0, + "learning_rate": 3.315922005420191e-10, + "loss": 0.467, + "step": 8728 + }, + { + "epoch": 1.0, + "learning_rate": 3.0213271123735735e-10, + "loss": 0.4376, + "step": 8729 + }, + { + "epoch": 1.0, + "learning_rate": 2.740434037951101e-10, + "loss": 0.4528, + "step": 8730 + }, + { + "epoch": 1.0, + "learning_rate": 2.4732428206442063e-10, + "loss": 0.4384, + "step": 8731 + }, + { + "epoch": 1.0, + "learning_rate": 2.2197534970569424e-10, + "loss": 0.4699, + "step": 8732 + }, + { + "epoch": 1.0, + "learning_rate": 1.9799661019392901e-10, + "loss": 0.4377, + "step": 8733 + }, + { + "epoch": 1.0, + "learning_rate": 1.7538806681316467e-10, + "loss": 0.4585, + 
"step": 8734 + }, + { + "epoch": 1.0, + "learning_rate": 1.5414972266314389e-10, + "loss": 0.45, + "step": 8735 + }, + { + "epoch": 1.0, + "learning_rate": 1.3428158065154073e-10, + "loss": 0.433, + "step": 8736 + }, + { + "epoch": 1.0, + "learning_rate": 1.1578364350284254e-10, + "loss": 0.4574, + "step": 8737 + }, + { + "epoch": 1.0, + "learning_rate": 9.865591375168848e-11, + "loss": 0.4435, + "step": 8738 + }, + { + "epoch": 1.0, + "learning_rate": 8.289839374286956e-11, + "loss": 0.4497, + "step": 8739 + }, + { + "epoch": 1.0, + "learning_rate": 6.851108563687981e-11, + "loss": 0.4621, + "step": 8740 + }, + { + "epoch": 1.0, + "learning_rate": 5.549399140547529e-11, + "loss": 0.4419, + "step": 8741 + }, + { + "epoch": 1.0, + "learning_rate": 4.3847112831674196e-11, + "loss": 0.4573, + "step": 8742 + }, + { + "epoch": 1.0, + "learning_rate": 3.3570451511977245e-11, + "loss": 0.4466, + "step": 8743 + }, + { + "epoch": 1.0, + "learning_rate": 2.466400885303699e-11, + "loss": 0.4322, + "step": 8744 + }, + { + "epoch": 1.0, + "learning_rate": 1.7127786077208998e-11, + "loss": 0.4459, + "step": 8745 + }, + { + "epoch": 1.0, + "learning_rate": 1.0961784215890448e-11, + "loss": 0.4587, + "step": 8746 + }, + { + "epoch": 1.0, + "learning_rate": 6.166004113961066e-12, + "loss": 0.4332, + "step": 8747 + }, + { + "epoch": 1.0, + "learning_rate": 2.7404464297831057e-12, + "loss": 0.452, + "step": 8748 + }, + { + "epoch": 1.0, + "learning_rate": 6.8511163076046e-13, + "loss": 0.4351, + "step": 8749 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "loss": 0.4951, + "step": 8750 + }, + { + "epoch": 1.0, + "step": 8750, + "total_flos": 0.0, + "train_loss": 0.09011299923147474, + "train_runtime": 10298.0352, + "train_samples_per_second": 438.849, + "train_steps_per_second": 0.85 + } + ], + "logging_steps": 1.0, + "max_steps": 8750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + 
"trial_params": null +} diff --git a/vision_tower/config.json b/vision_tower/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a907af9dfe1c6a7aac932c02ff2f5fc856a9d453 --- /dev/null +++ b/vision_tower/config.json @@ -0,0 +1,19 @@ +{ + "_name_or_path": "/lustre/fs12/portfolios/nvr/users/mmemmel/projects/vila/checkpoints/finetuned/vila/vila_3b_oxe_sim_jack_o/vision_tower", + "architectures": [ + "SiglipVisionModel" + ], + "attention_dropout": 0.0, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "patch_size": 14, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2" +} diff --git a/vision_tower/model.safetensors b/vision_tower/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f555ed04111f75dd66e8fd03c493312dafc6ac2 --- /dev/null +++ b/vision_tower/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546005524d4391030a370ac71ea629549254613296ab5d7118e637ee142066bc +size 856506120 diff --git a/vision_tower/preprocessor_config.json b/vision_tower/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f13134ed29056f82f3ab7e0246f0ab973e7ecf3 --- /dev/null +++ b/vision_tower/preprocessor_config.json @@ -0,0 +1,24 @@ +{ + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "SiglipImageProcessor", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "SiglipProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } +}